From 3359c1e5ddd18de6ace98aacdd0e38550999275e Mon Sep 17 00:00:00 2001 From: James Zern Date: Thu, 30 May 2019 15:58:12 -0700 Subject: [PATCH 0001/1000] vp8: restrict 1st pass cpu_used range < 4 isn't meaningful in the first pass; additional analysis will be done, but thrown out, unnecessarily increasing the runtime. Change-Id: Ic3de77e3eaa7a8a3371f76f84693e9655c60fdba --- vp8/vp8_cx_iface.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c index d65bf9652e..bda109e7a3 100644 --- a/vp8/vp8_cx_iface.c +++ b/vp8/vp8_cx_iface.c @@ -370,6 +370,9 @@ static vpx_codec_err_t set_vp8e_config(VP8_CONFIG *oxcf, #endif oxcf->cpu_used = vp8_cfg.cpu_used; + if (cfg.g_pass == VPX_RC_FIRST_PASS) { + oxcf->cpu_used = VPXMAX(4, oxcf->cpu_used); + } oxcf->encode_breakout = vp8_cfg.static_thresh; oxcf->play_alternate = vp8_cfg.enable_auto_alt_ref; oxcf->noise_sensitivity = vp8_cfg.noise_sensitivity; From d1ed2f0d7a8414a07e3283d54dff97d7d2ed411e Mon Sep 17 00:00:00 2001 From: angiebird Date: Fri, 29 May 2020 16:26:57 -0700 Subject: [PATCH 0002/1000] Refactor simple_encode_test.cc 1) Avoid using global variables. 2) Add comments to EncodeConsistencyTest. 3) Check frame_type and show_idx in EncodeConsistencyTest. Change-Id: I2261a0bd65189beb70432d62c077ef618a2712ab --- test/simple_encode_test.cc | 177 ++++++++++++++++++++++--------------- 1 file changed, 104 insertions(+), 73 deletions(-) diff --git a/test/simple_encode_test.cc b/test/simple_encode_test.cc index eddd1c59b6..a85675133b 100644 --- a/test/simple_encode_test.cc +++ b/test/simple_encode_test.cc @@ -10,6 +10,7 @@ #include #include +#include #include #include "third_party/googletest/src/include/gtest/gtest.h" #include "vp9/simple_encode.h" @@ -17,15 +18,6 @@ namespace vp9 { namespace { -// TODO(angirbid): Find a better way to construct encode info -const int w = 352; -const int h = 288; -const int frame_rate_num = 30; -const int frame_rate_den = 1; -const int target_bitrate = 1000; -const int num_frames = 17; -const char infile_path[] = "bus_352x288_420_f20_b8.yuv"; - double GetBitrateInKbps(size_t bit_size, int num_frames, int frame_rate_num, int frame_rate_den) { return static_cast(bit_size) / num_frames * frame_rate_num / @@ -36,14 +28,26 @@ double GetBitrateInKbps(size_t bit_size, int num_frames, int frame_rate_num, // For example, if size is 7, return 2. int GetNumUnit4x4(int size) { return (size + 3) >> 2; } -TEST(SimpleEncode, ComputeFirstPassStats) { - SimpleEncode simple_encode(w, h, frame_rate_num, frame_rate_den, - target_bitrate, num_frames, infile_path); +class SimpleEncodeTest : public ::testing::Test { + protected: + const int width_ = 352; + const int height_ = 288; + const int frame_rate_num_ = 30; + const int frame_rate_den_ = 1; + const int target_bitrate_ = 1000; + const int num_frames_ = 17; + const std::string in_file_path_str_ = "bus_352x288_420_f20_b8.yuv"; +}; + +TEST_F(SimpleEncodeTest, ComputeFirstPassStats) { + SimpleEncode simple_encode(width_, height_, frame_rate_num_, frame_rate_den_, + target_bitrate_, num_frames_, + in_file_path_str_.c_str()); simple_encode.ComputeFirstPassStats(); std::vector> frame_stats = simple_encode.ObserveFirstPassStats(); - EXPECT_EQ(frame_stats.size(), static_cast(num_frames)); - size_t data_num = frame_stats[0].size(); + EXPECT_EQ(frame_stats.size(), static_cast(num_frames_)); + const size_t data_num = frame_stats[0].size(); // Read ObserveFirstPassStats before changing FIRSTPASS_STATS. EXPECT_EQ(data_num, static_cast(25)); for (size_t i = 0; i < frame_stats.size(); ++i) { @@ -56,25 +60,27 @@ TEST(SimpleEncode, ComputeFirstPassStats) { } } -TEST(SimpleEncode, GetCodingFrameNum) { - SimpleEncode simple_encode(w, h, frame_rate_num, frame_rate_den, - target_bitrate, num_frames, infile_path); +TEST_F(SimpleEncodeTest, GetCodingFrameNum) { + SimpleEncode simple_encode(width_, height_, frame_rate_num_, frame_rate_den_, + target_bitrate_, num_frames_, + in_file_path_str_.c_str()); simple_encode.ComputeFirstPassStats(); - int num_coding_frames = simple_encode.GetCodingFrameNum(); + const int num_coding_frames = simple_encode.GetCodingFrameNum(); EXPECT_EQ(num_coding_frames, 19); } -TEST(SimpleEncode, EncodeFrame) { - SimpleEncode simple_encode(w, h, frame_rate_num, frame_rate_den, - target_bitrate, num_frames, infile_path); +TEST_F(SimpleEncodeTest, EncodeFrame) { + SimpleEncode simple_encode(width_, height_, frame_rate_num_, frame_rate_den_, + target_bitrate_, num_frames_, + in_file_path_str_.c_str()); simple_encode.ComputeFirstPassStats(); int num_coding_frames = simple_encode.GetCodingFrameNum(); - EXPECT_GE(num_coding_frames, num_frames); + EXPECT_GE(num_coding_frames, num_frames_); simple_encode.StartEncode(); size_t total_data_bit_size = 0; int coded_show_frame_count = 0; int frame_coding_index = 0; - while (coded_show_frame_count < num_frames) { + while (coded_show_frame_count < num_frames_) { const GroupOfPicture group_of_picture = simple_encode.ObserveGroupOfPicture(); const std::vector &encode_frame_list = @@ -99,22 +105,23 @@ TEST(SimpleEncode, EncodeFrame) { } coded_show_frame_count += group_of_picture.show_frame_count; } - const double bitrate = GetBitrateInKbps(total_data_bit_size, num_frames, - frame_rate_num, frame_rate_den); + const double bitrate = GetBitrateInKbps(total_data_bit_size, num_frames_, + frame_rate_num_, frame_rate_den_); const double off_target_threshold = 150; - EXPECT_LE(fabs(target_bitrate - bitrate), off_target_threshold); + EXPECT_LE(fabs(target_bitrate_ - bitrate), off_target_threshold); simple_encode.EndEncode(); } -TEST(SimpleEncode, ObserveKeyFrameMap) { - SimpleEncode simple_encode(w, h, frame_rate_num, frame_rate_den, - target_bitrate, num_frames, infile_path); +TEST_F(SimpleEncodeTest, ObserveKeyFrameMap) { + SimpleEncode simple_encode(width_, height_, frame_rate_num_, frame_rate_den_, + target_bitrate_, num_frames_, + in_file_path_str_.c_str()); simple_encode.ComputeFirstPassStats(); std::vector key_frame_map = simple_encode.ObserveKeyFrameMap(); - EXPECT_EQ(key_frame_map.size(), static_cast(num_frames)); + EXPECT_EQ(key_frame_map.size(), static_cast(num_frames_)); simple_encode.StartEncode(); int coded_show_frame_count = 0; - while (coded_show_frame_count < num_frames) { + while (coded_show_frame_count < num_frames_) { const GroupOfPicture group_of_picture = simple_encode.ObserveGroupOfPicture(); const std::vector &encode_frame_list = @@ -134,11 +141,12 @@ TEST(SimpleEncode, ObserveKeyFrameMap) { simple_encode.EndEncode(); } -TEST(SimpleEncode, EncodeFrameWithQuantizeIndex) { - SimpleEncode simple_encode(w, h, frame_rate_num, frame_rate_den, - target_bitrate, num_frames, infile_path); +TEST_F(SimpleEncodeTest, EncodeFrameWithQuantizeIndex) { + SimpleEncode simple_encode(width_, height_, frame_rate_num_, frame_rate_den_, + target_bitrate_, num_frames_, + in_file_path_str_.c_str()); simple_encode.ComputeFirstPassStats(); - int num_coding_frames = simple_encode.GetCodingFrameNum(); + const int num_coding_frames = simple_encode.GetCodingFrameNum(); simple_encode.StartEncode(); for (int i = 0; i < num_coding_frames; ++i) { const int assigned_quantize_index = 100 + i; @@ -150,15 +158,24 @@ TEST(SimpleEncode, EncodeFrameWithQuantizeIndex) { simple_encode.EndEncode(); } -TEST(SimpleEncode, EncodeConsistencyTest) { +// This test encodes the video using EncodeFrame(), where quantize indexes +// are selected by vp9 rate control. +// Encode stats and the quantize_indexes are collected. +// Then the test encodes the video again using EncodeFrameWithQuantizeIndex() +// using the quantize indexes collected from the first run. +// Then test whether the encode stats of the two encoding runs match. +TEST_F(SimpleEncodeTest, EncodeConsistencyTest) { std::vector quantize_index_list; std::vector ref_sse_list; std::vector ref_psnr_list; std::vector ref_bit_size_list; + std::vector ref_frame_type_list; + std::vector ref_show_idx_list; { // The first encode. - SimpleEncode simple_encode(w, h, frame_rate_num, frame_rate_den, - target_bitrate, num_frames, infile_path); + SimpleEncode simple_encode(width_, height_, frame_rate_num_, + frame_rate_den_, target_bitrate_, num_frames_, + in_file_path_str_.c_str()); simple_encode.ComputeFirstPassStats(); const int num_coding_frames = simple_encode.GetCodingFrameNum(); simple_encode.StartEncode(); @@ -169,13 +186,16 @@ TEST(SimpleEncode, EncodeConsistencyTest) { ref_sse_list.push_back(encode_frame_result.sse); ref_psnr_list.push_back(encode_frame_result.psnr); ref_bit_size_list.push_back(encode_frame_result.coding_data_bit_size); + ref_frame_type_list.push_back(encode_frame_result.frame_type); + ref_show_idx_list.push_back(encode_frame_result.show_idx); } simple_encode.EndEncode(); } { // The second encode with quantize index got from the first encode. - SimpleEncode simple_encode(w, h, frame_rate_num, frame_rate_den, - target_bitrate, num_frames, infile_path); + SimpleEncode simple_encode(width_, height_, frame_rate_num_, + frame_rate_den_, target_bitrate_, num_frames_, + in_file_path_str_.c_str()); simple_encode.ComputeFirstPassStats(); const int num_coding_frames = simple_encode.GetCodingFrameNum(); EXPECT_EQ(static_cast(num_coding_frames), @@ -189,6 +209,8 @@ TEST(SimpleEncode, EncodeConsistencyTest) { EXPECT_EQ(encode_frame_result.sse, ref_sse_list[i]); EXPECT_DOUBLE_EQ(encode_frame_result.psnr, ref_psnr_list[i]); EXPECT_EQ(encode_frame_result.coding_data_bit_size, ref_bit_size_list[i]); + EXPECT_EQ(encode_frame_result.frame_type, ref_frame_type_list[i]); + EXPECT_EQ(encode_frame_result.show_idx, ref_show_idx_list[i]); } simple_encode.EndEncode(); } @@ -196,13 +218,14 @@ TEST(SimpleEncode, EncodeConsistencyTest) { // Test the information (partition info and motion vector info) stored in // encoder is the same between two encode runs. -TEST(SimpleEncode, EncodeConsistencyTest2) { - const int num_rows_4x4 = GetNumUnit4x4(w); - const int num_cols_4x4 = GetNumUnit4x4(h); +TEST_F(SimpleEncodeTest, EncodeConsistencyTest2) { + const int num_rows_4x4 = GetNumUnit4x4(width_); + const int num_cols_4x4 = GetNumUnit4x4(height_); const int num_units_4x4 = num_rows_4x4 * num_cols_4x4; // The first encode. - SimpleEncode simple_encode(w, h, frame_rate_num, frame_rate_den, - target_bitrate, num_frames, infile_path); + SimpleEncode simple_encode(width_, height_, frame_rate_num_, frame_rate_den_, + target_bitrate_, num_frames_, + in_file_path_str_.c_str()); simple_encode.ComputeFirstPassStats(); const int num_coding_frames = simple_encode.GetCodingFrameNum(); std::vector partition_info_list(num_units_4x4 * @@ -222,8 +245,9 @@ TEST(SimpleEncode, EncodeConsistencyTest2) { } simple_encode.EndEncode(); // The second encode. - SimpleEncode simple_encode_2(w, h, frame_rate_num, frame_rate_den, - target_bitrate, num_frames, infile_path); + SimpleEncode simple_encode_2(width_, height_, frame_rate_num_, + frame_rate_den_, target_bitrate_, num_frames_, + in_file_path_str_.c_str()); simple_encode_2.ComputeFirstPassStats(); const int num_coding_frames_2 = simple_encode_2.GetCodingFrameNum(); simple_encode_2.StartEncode(); @@ -264,14 +288,15 @@ TEST(SimpleEncode, EncodeConsistencyTest2) { } // Test the information stored in encoder is the same between two encode runs. -TEST(SimpleEncode, EncodeConsistencyTest3) { +TEST_F(SimpleEncodeTest, EncodeConsistencyTest3) { std::vector quantize_index_list; - const int num_rows_4x4 = GetNumUnit4x4(w); - const int num_cols_4x4 = GetNumUnit4x4(h); + const int num_rows_4x4 = GetNumUnit4x4(width_); + const int num_cols_4x4 = GetNumUnit4x4(height_); const int num_units_4x4 = num_rows_4x4 * num_cols_4x4; // The first encode. - SimpleEncode simple_encode(w, h, frame_rate_num, frame_rate_den, - target_bitrate, num_frames, infile_path); + SimpleEncode simple_encode(width_, height_, frame_rate_num_, frame_rate_den_, + target_bitrate_, num_frames_, + in_file_path_str_.c_str()); simple_encode.ComputeFirstPassStats(); const int num_coding_frames = simple_encode.GetCodingFrameNum(); std::vector partition_info_list(num_units_4x4 * @@ -288,8 +313,9 @@ TEST(SimpleEncode, EncodeConsistencyTest3) { } simple_encode.EndEncode(); // The second encode. - SimpleEncode simple_encode_2(w, h, frame_rate_num, frame_rate_den, - target_bitrate, num_frames, infile_path); + SimpleEncode simple_encode_2(width_, height_, frame_rate_num_, + frame_rate_den_, target_bitrate_, num_frames_, + in_file_path_str_.c_str()); simple_encode_2.ComputeFirstPassStats(); const int num_coding_frames_2 = simple_encode_2.GetCodingFrameNum(); simple_encode_2.StartEncode(); @@ -319,21 +345,22 @@ TEST(SimpleEncode, EncodeConsistencyTest3) { // Get QPs and arf locations from the first encode. // Set external arfs and QPs for the second encode. // Expect to get matched results. -TEST(SimpleEncode, EncodeConsistencySetExternalGroupOfPicturesMap) { +TEST_F(SimpleEncodeTest, EncodeConsistencySetExternalGroupOfPicturesMap) { std::vector quantize_index_list; std::vector ref_sse_list; std::vector ref_psnr_list; std::vector ref_bit_size_list; - std::vector gop_map(num_frames, 0); + std::vector gop_map(num_frames_, 0); { // The first encode. - SimpleEncode simple_encode(w, h, frame_rate_num, frame_rate_den, - target_bitrate, num_frames, infile_path); + SimpleEncode simple_encode(width_, height_, frame_rate_num_, + frame_rate_den_, target_bitrate_, num_frames_, + in_file_path_str_.c_str()); simple_encode.ComputeFirstPassStats(); simple_encode.StartEncode(); int coded_show_frame_count = 0; - while (coded_show_frame_count < num_frames) { + while (coded_show_frame_count < num_frames_) { const GroupOfPicture group_of_picture = simple_encode.ObserveGroupOfPicture(); gop_map[coded_show_frame_count] |= kGopMapFlagStart; @@ -358,8 +385,9 @@ TEST(SimpleEncode, EncodeConsistencySetExternalGroupOfPicturesMap) { { // The second encode with quantize index got from the first encode. // The external arfs are the same as the first encode. - SimpleEncode simple_encode(w, h, frame_rate_num, frame_rate_den, - target_bitrate, num_frames, infile_path); + SimpleEncode simple_encode(width_, height_, frame_rate_num_, + frame_rate_den_, target_bitrate_, num_frames_, + in_file_path_str_.c_str()); simple_encode.ComputeFirstPassStats(); simple_encode.SetExternalGroupOfPicturesMap(gop_map); const int num_coding_frames = simple_encode.GetCodingFrameNum(); @@ -379,12 +407,13 @@ TEST(SimpleEncode, EncodeConsistencySetExternalGroupOfPicturesMap) { } } -TEST(SimpleEncode, SetExternalGroupOfPicturesMap) { - SimpleEncode simple_encode(w, h, frame_rate_num, frame_rate_den, - target_bitrate, num_frames, infile_path); +TEST_F(SimpleEncodeTest, SetExternalGroupOfPicturesMap) { + SimpleEncode simple_encode(width_, height_, frame_rate_num_, frame_rate_den_, + target_bitrate_, num_frames_, + in_file_path_str_.c_str()); simple_encode.ComputeFirstPassStats(); - std::vector gop_map(num_frames, 0); + std::vector gop_map(num_frames_, 0); // Should be the first gop group. gop_map[0] = 0; @@ -425,7 +454,7 @@ TEST(SimpleEncode, SetExternalGroupOfPicturesMap) { simple_encode.StartEncode(); int coded_show_frame_count = 0; - while (coded_show_frame_count < num_frames) { + while (coded_show_frame_count < num_frames_) { const GroupOfPicture group_of_picture = simple_encode.ObserveGroupOfPicture(); const std::vector &encode_frame_list = @@ -446,11 +475,12 @@ TEST(SimpleEncode, SetExternalGroupOfPicturesMap) { simple_encode.EndEncode(); } -TEST(SimpleEncode, GetEncodeFrameInfo) { +TEST_F(SimpleEncodeTest, GetEncodeFrameInfo) { // Makes sure that the encode_frame_info obtained from GetEncodeFrameInfo() // matches the counterpart in encode_frame_result obtained from EncodeFrame() - SimpleEncode simple_encode(w, h, frame_rate_num, frame_rate_den, - target_bitrate, num_frames, infile_path); + SimpleEncode simple_encode(width_, height_, frame_rate_num_, frame_rate_den_, + target_bitrate_, num_frames_, + in_file_path_str_.c_str()); simple_encode.ComputeFirstPassStats(); const int num_coding_frames = simple_encode.GetCodingFrameNum(); simple_encode.StartEncode(); @@ -464,11 +494,12 @@ TEST(SimpleEncode, GetEncodeFrameInfo) { simple_encode.EndEncode(); } -TEST(SimpleEncode, GetFramePixelCount) { - SimpleEncode simple_encode(w, h, frame_rate_num, frame_rate_den, - target_bitrate, num_frames, infile_path); +TEST_F(SimpleEncodeTest, GetFramePixelCount) { + SimpleEncode simple_encode(width_, height_, frame_rate_num_, frame_rate_den_, + target_bitrate_, num_frames_, + in_file_path_str_.c_str()); EXPECT_EQ(simple_encode.GetFramePixelCount(), - static_cast(w * h * 3 / 2)); + static_cast(width_ * height_ * 3 / 2)); } } // namespace From 812eb89b26e6fe458133e90e74b0a11b853df56d Mon Sep 17 00:00:00 2001 From: angiebird Date: Mon, 8 Jun 2020 16:46:15 -0700 Subject: [PATCH 0003/1000] Fix assertion error in simple_encode.cc Change-Id: I271d11cc35d34d5450a8b56fabcedaf2bb7c6565 --- vp9/simple_encode.cc | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/vp9/simple_encode.cc b/vp9/simple_encode.cc index 9d62d26e1d..bce52e28ca 100644 --- a/vp9/simple_encode.cc +++ b/vp9/simple_encode.cc @@ -848,10 +848,10 @@ T *GetVectorData(const std::vector &v) { static GOP_COMMAND GetGopCommand(const std::vector &gop_map, int start_show_index) { - assert(static_cast(start_show_index) < gop_map.size()); - assert((gop_map[start_show_index] & kGopMapFlagStart) != 0); GOP_COMMAND gop_command; if (gop_map.size() > 0) { + assert(static_cast(start_show_index) < gop_map.size()); + assert((gop_map[start_show_index] & kGopMapFlagStart) != 0); int end_show_index = start_show_index + 1; // gop_map[end_show_index] & kGopMapFlagStart == 0 means this is // the start of a gop. @@ -1099,7 +1099,8 @@ int SimpleEncode::GetCodingFrameNum() const { } std::vector SimpleEncode::ComputeKeyFrameMap() const { - assert(impl_ptr_->first_pass_stats.size() == num_frames_); + // The last entry of first_pass_stats is the overall stats. + assert(impl_ptr_->first_pass_stats.size() == num_frames_ + 1); vpx_rational_t frame_rate = make_vpx_rational(frame_rate_num_, frame_rate_den_); const VP9EncoderConfig oxcf = From e753d4930f3b2859968068ffb77f6a6159b2f3c6 Mon Sep 17 00:00:00 2001 From: angiebird Date: Thu, 11 Jun 2020 15:10:38 -0700 Subject: [PATCH 0004/1000] Let SetExternalGroupOfPicturesMap use c-style arr Change-Id: Ic92ce5a3cc5bb74120eb32fc6219e43b1b861f14 --- test/simple_encode_test.cc | 4 ++-- vp9/simple_encode.cc | 7 +++++-- vp9/simple_encode.h | 2 +- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/test/simple_encode_test.cc b/test/simple_encode_test.cc index a85675133b..69bed5a51a 100644 --- a/test/simple_encode_test.cc +++ b/test/simple_encode_test.cc @@ -389,7 +389,7 @@ TEST_F(SimpleEncodeTest, EncodeConsistencySetExternalGroupOfPicturesMap) { frame_rate_den_, target_bitrate_, num_frames_, in_file_path_str_.c_str()); simple_encode.ComputeFirstPassStats(); - simple_encode.SetExternalGroupOfPicturesMap(gop_map); + simple_encode.SetExternalGroupOfPicturesMap(gop_map.data(), gop_map.size()); const int num_coding_frames = simple_encode.GetCodingFrameNum(); EXPECT_EQ(static_cast(num_coding_frames), quantize_index_list.size()); @@ -427,7 +427,7 @@ TEST_F(SimpleEncodeTest, SetExternalGroupOfPicturesMap) { // Last gop group. gop_map[14] |= kGopMapFlagStart | kGopMapFlagUseAltRef; - simple_encode.SetExternalGroupOfPicturesMap(gop_map); + simple_encode.SetExternalGroupOfPicturesMap(gop_map.data(), gop_map.size()); std::vector observed_gop_map = simple_encode.ObserveExternalGroupOfPicturesMap(); diff --git a/vp9/simple_encode.cc b/vp9/simple_encode.cc index bce52e28ca..7bce91f7f0 100644 --- a/vp9/simple_encode.cc +++ b/vp9/simple_encode.cc @@ -806,8 +806,11 @@ std::vector> SimpleEncode::ObserveFirstPassStats() { return output_stats; } -void SimpleEncode::SetExternalGroupOfPicturesMap(std::vector gop_map) { - gop_map_ = gop_map; +void SimpleEncode::SetExternalGroupOfPicturesMap(int *gop_map, + int gop_map_size) { + for (int i = 0; i < gop_map_size; ++i) { + gop_map_.push_back(gop_map[i]); + } // The following will check and modify gop_map_ to make sure the // gop_map_ satisfies the constraints. // 1) Each key frame position should be at the start of a gop. diff --git a/vp9/simple_encode.h b/vp9/simple_encode.h index d7c9dfa140..b217320701 100644 --- a/vp9/simple_encode.h +++ b/vp9/simple_encode.h @@ -335,7 +335,7 @@ class SimpleEncode { // constraints. // 1) Each key frame position should be at the start of a gop. // 2) The last gop should not use an alt ref. - void SetExternalGroupOfPicturesMap(std::vector gop_map); + void SetExternalGroupOfPicturesMap(int *gop_map, int gop_map_size); // Observe the group of pictures map set through // SetExternalGroupOfPicturesMap(). This function should be called after From 5174eb5b9236a76c24e7bfadd0665d7b765395e1 Mon Sep 17 00:00:00 2001 From: Marco Paniconi Date: Mon, 15 Jun 2020 19:11:53 -0700 Subject: [PATCH 0005/1000] vp9-svc: Fix to dynamic resize for svc denoising Fix the logic to allow denoiser reset on resize for SVC mode, as dynamic resize is allowed for SVC under single_layer mode. Change-Id: I7776c68dadff2ccbce9b0b4a7f0d12624c2ccf90 --- vp9/encoder/vp9_encoder.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index f90aee0725..ae7b47b523 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -3747,7 +3747,7 @@ static void set_frame_size(VP9_COMP *cpi) { } #endif // !CONFIG_REALTIME_ONLY - if (oxcf->pass == 0 && oxcf->rc_mode == VPX_CBR && !cpi->use_svc && + if (oxcf->pass == 0 && oxcf->rc_mode == VPX_CBR && oxcf->resize_mode == RESIZE_DYNAMIC && cpi->resize_pending != 0) { oxcf->scaled_frame_width = (oxcf->width * cpi->resize_scale_num) / cpi->resize_scale_den; From 3101666d2a8b5b2e6bff14ffb39db685f1cc98a0 Mon Sep 17 00:00:00 2001 From: Marco Paniconi Date: Tue, 16 Jun 2020 11:41:00 -0700 Subject: [PATCH 0006/1000] vp9-svc: Add svc test for denoiser and dynamic resize This catches the assert/crash fixed in 5174eb5. Also fix to only check for dynamic resize in SVC mode for base temporal layer. Change-Id: Ie6eb7d233cc43eafb1b78cec4aeb94fb4d7fe11a --- test/svc_datarate_test.cc | 56 +++++++++++++++++++++++++++++++++++--- vp9/encoder/vp9_ratectrl.c | 3 +- 2 files changed, 54 insertions(+), 5 deletions(-) diff --git a/test/svc_datarate_test.cc b/test/svc_datarate_test.cc index 7260d8ec86..1920debfc8 100644 --- a/test/svc_datarate_test.cc +++ b/test/svc_datarate_test.cc @@ -354,6 +354,20 @@ class DatarateOnePassCbrSvc : public OnePassCbrSvc { cfg_.rc_target_bitrate = cfg_.layer_target_bitrate[0]; encoder->Config(&cfg_); } + } else if (!dynamic_drop_layer_ && single_layer_resize_) { + if (video->frame() == 2) { + cfg_.layer_target_bitrate[0] = 30; + cfg_.layer_target_bitrate[1] = 50; + cfg_.rc_target_bitrate = + (cfg_.layer_target_bitrate[0] + cfg_.layer_target_bitrate[1]); + encoder->Config(&cfg_); + } else if (video->frame() == 160) { + cfg_.layer_target_bitrate[0] = 1500; + cfg_.layer_target_bitrate[1] = 2000; + cfg_.rc_target_bitrate = + (cfg_.layer_target_bitrate[0] + cfg_.layer_target_bitrate[1]); + encoder->Config(&cfg_); + } } if (force_key_test_ && force_key_) frame_flags_ = VPX_EFLAG_FORCE_KF; @@ -452,10 +466,12 @@ class DatarateOnePassCbrSvc : public OnePassCbrSvc { parse_superframe_index(static_cast(pkt->data.frame.buf), pkt->data.frame.sz, sizes_parsed, &count); // Count may be less than number of spatial layers because of frame drops. - for (int sl = 0; sl < number_spatial_layers_; ++sl) { - if (pkt->data.frame.spatial_layer_encoded[sl]) { - sizes[sl] = sizes_parsed[num_layers_encoded]; - num_layers_encoded++; + if (number_spatial_layers_ > 1) { + for (int sl = 0; sl < number_spatial_layers_; ++sl) { + if (pkt->data.frame.spatial_layer_encoded[sl]) { + sizes[sl] = sizes_parsed[num_layers_encoded]; + num_layers_encoded++; + } } } // For superframe with Intra-only count will be +1 larger @@ -875,6 +891,38 @@ TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc2SL_SingleLayerResize) { #endif } +// For pass CBR SVC with 1 spatial and 2 temporal layers with dynamic resize +// and denoiser enabled. The resizer will resize the single layer down and back +// up again, as the bitrate goes back up. +TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc1SL2TL_DenoiseResize) { + SetSvcConfig(1, 2); + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 1; + cfg_.temporal_layering_mode = 2; + cfg_.rc_dropframe_thresh = 30; + cfg_.kf_max_dist = 9999; + cfg_.rc_resize_allowed = 1; + ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv", 1280, + 720, 12, 1, 0, 300); + top_sl_width_ = 1280; + top_sl_height_ = 720; + cfg_.rc_target_bitrate = 800; + ResetModel(); + dynamic_drop_layer_ = false; + single_layer_resize_ = true; + denoiser_on_ = 1; + base_speed_setting_ = speed_setting_; + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + // Expect at least one resize down and at least one resize back up. + EXPECT_GE(num_resize_down_, 1); + EXPECT_GE(num_resize_up_, 1); +} + // Run SVC encoder for 1 temporal layer, 2 spatial layers, with spatial // downscale 5x5. TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc2SL1TL5x5MultipleRuns) { diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index 2d03bad4d5..2a279d4fce 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -2376,7 +2376,8 @@ void vp9_rc_get_svc_params(VP9_COMP *cpi) { if (cm->show_frame) update_buffer_level_svc_preencode(cpi); if (cpi->oxcf.resize_mode == RESIZE_DYNAMIC && svc->single_layer_svc == 1 && - svc->spatial_layer_id == svc->first_spatial_layer_to_encode) { + svc->spatial_layer_id == svc->first_spatial_layer_to_encode && + svc->temporal_layer_id == 0) { LAYER_CONTEXT *lc = NULL; cpi->resize_pending = vp9_resize_one_pass_cbr(cpi); if (cpi->resize_pending) { From e9c6cb64744f6df31ba52501d4759b50ba832df2 Mon Sep 17 00:00:00 2001 From: Marco Paniconi Date: Wed, 17 Jun 2020 10:27:39 -0700 Subject: [PATCH 0007/1000] vp9-rtc: Fixes to resizer for real-time Reduce the time before sampling begins (after key) and reduce averaging window, to make resize act faster. Reset RC paramaters for temporal layers on resize. Add per-frame-bandwidth thresholds to force downsize for extreme case, for HD input. Change-Id: I8e08580b2216a2e6981502552025370703cd206c --- test/resize_test.cc | 8 ++++---- vp9/encoder/vp9_ratectrl.c | 31 +++++++++++++++++++++++++++++-- 2 files changed, 33 insertions(+), 6 deletions(-) diff --git a/test/resize_test.cc b/test/resize_test.cc index d996cbe1be..a2dc5d7a70 100644 --- a/test/resize_test.cc +++ b/test/resize_test.cc @@ -672,11 +672,11 @@ TEST_P(ResizeRealtimeTest, TestInternalResizeDownUpChangeBitRate) { ASSERT_EQ(info->h, GetFrameHeight(idx)); if (info->w != last_w || info->h != last_h) { resize_count++; - if (resize_count == 1) { + if (resize_count <= 2) { // Verify that resize down occurs. ASSERT_LT(info->w, last_w); ASSERT_LT(info->h, last_h); - } else if (resize_count == 2) { + } else if (resize_count > 2) { // Verify that resize up occurs. ASSERT_GT(info->w, last_w); ASSERT_GT(info->h, last_h); @@ -687,8 +687,8 @@ TEST_P(ResizeRealtimeTest, TestInternalResizeDownUpChangeBitRate) { } #if CONFIG_VP9_DECODER - // Verify that we get 2 resize events in this test. - ASSERT_EQ(resize_count, 2) << "Resizing should occur twice."; + // Verify that we get 4 resize events in this test. + ASSERT_EQ(resize_count, 4) << "Resizing should occur twice."; EXPECT_EQ(static_cast(0), GetMismatchFrames()); #else printf("Warning: VP9 decoder unavailable, unable to check resize count!\n"); diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index 2a279d4fce..34dd618acf 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -2391,6 +2391,11 @@ void vp9_rc_get_svc_params(VP9_COMP *cpi) { cpi->resize_scale_num * lc->scaling_factor_num; lc->scaling_factor_den_resize = cpi->resize_scale_den * lc->scaling_factor_den; + // Reset rate control for all temporal layers. + lc->rc.buffer_level = lc->rc.optimal_buffer_level; + lc->rc.bits_off_target = lc->rc.optimal_buffer_level; + lc->rc.rate_correction_factors[INTER_FRAME] = + rc->rate_correction_factors[INTER_FRAME]; } // Set the size for this current temporal layer. lc = &svc->layer_context[svc->spatial_layer_id * @@ -2663,6 +2668,7 @@ int vp9_resize_one_pass_cbr(VP9_COMP *cpi) { int min_width = (320 * 4) / 3; int min_height = (180 * 4) / 3; int down_size_on = 1; + int force_downsize_rate = 0; cpi->resize_scale_num = 1; cpi->resize_scale_den = 1; // Don't resize on key frame; reset the counters on key frame. @@ -2683,10 +2689,31 @@ int vp9_resize_one_pass_cbr(VP9_COMP *cpi) { } #endif + // Force downsize based on per-frame-bandwidth, for extreme case, + // for HD input. + if (cpi->resize_state == ORIG && cm->width * cm->height >= 1280 * 720) { + if (rc->avg_frame_bandwidth < (int)(300000 / 30)) { + resize_action = DOWN_ONEHALF; + cpi->resize_state = ONE_HALF; + force_downsize_rate = 1; + } else if (rc->avg_frame_bandwidth < (int)(400000 / 30)) { + resize_action = ONEHALFONLY_RESIZE ? DOWN_ONEHALF : DOWN_THREEFOUR; + cpi->resize_state = ONEHALFONLY_RESIZE ? ONE_HALF : THREE_QUARTER; + force_downsize_rate = 1; + } + } else if (cpi->resize_state == THREE_QUARTER && + cm->width * cm->height >= 960 * 540) { + if (rc->avg_frame_bandwidth < (int)(300000 / 30)) { + resize_action = DOWN_ONEHALF; + cpi->resize_state = ONE_HALF; + force_downsize_rate = 1; + } + } + // Resize based on average buffer underflow and QP over some window. // Ignore samples close to key frame, since QP is usually high after key. - if (cpi->rc.frames_since_key > 2 * cpi->framerate) { - const int window = (int)(4 * cpi->framerate); + if (!force_downsize_rate && cpi->rc.frames_since_key > cpi->framerate) { + const int window = VPXMIN(30, (int)(2 * cpi->framerate)); cpi->resize_avg_qp += cm->base_qindex; if (cpi->rc.buffer_level < (int)(30 * rc->optimal_buffer_level / 100)) ++cpi->resize_buffer_underflow; From 0370a43816cadc4939661d7b214a2077f8d25e88 Mon Sep 17 00:00:00 2001 From: James Zern Date: Thu, 18 Jun 2020 10:54:23 -0700 Subject: [PATCH 0008/1000] vp9_skip_loopfilter_test: make Init() return a bool ASSERT's in the function only force a return, not termination. this fixes a static analyzer issue with using a null decoder object in following calls. BUG=webm:1695 Change-Id: I79762df8076d029c5c8fef4d5e06ed655719de62 --- test/vp9_skip_loopfilter_test.cc | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/test/vp9_skip_loopfilter_test.cc b/test/vp9_skip_loopfilter_test.cc index d41a784a22..1fb7e48485 100644 --- a/test/vp9_skip_loopfilter_test.cc +++ b/test/vp9_skip_loopfilter_test.cc @@ -33,24 +33,32 @@ class SkipLoopFilterTest { } // If |threads| > 0 then set the decoder with that number of threads. - void Init(int num_threads) { + bool Init(int num_threads) { expected_md5_[0] = '\0'; junk_[0] = '\0'; video_ = new libvpx_test::WebMVideoSource(kVp9TestFile); - ASSERT_TRUE(video_ != NULL); + if (video_ == NULL) { + EXPECT_TRUE(video_ != NULL); + return false; + } video_->Init(); video_->Begin(); vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t(); if (num_threads > 0) cfg.threads = num_threads; decoder_ = new libvpx_test::VP9Decoder(cfg, 0); - ASSERT_TRUE(decoder_ != NULL); + if (decoder_ == NULL) { + EXPECT_TRUE(decoder_ != NULL); + return false; + } OpenMd5File(kVp9Md5File); + return !::testing::Test::HasFailure(); } // Set the VP9 skipLoopFilter control value. void SetSkipLoopFilter(int value, vpx_codec_err_t expected_value) { + ASSERT_NE(decoder_, nullptr); decoder_->Control(VP9_SET_SKIP_LOOP_FILTER, value, expected_value); } @@ -121,7 +129,7 @@ TEST(SkipLoopFilterTest, ShutOffLoopFilter) { const int non_zero_value = 1; const int num_threads = 0; SkipLoopFilterTest skip_loop_filter; - skip_loop_filter.Init(num_threads); + ASSERT_TRUE(skip_loop_filter.Init(num_threads)); skip_loop_filter.SetSkipLoopFilter(non_zero_value, VPX_CODEC_OK); ASSERT_EQ(VPX_CODEC_OK, skip_loop_filter.DecodeRemainingFrames()); skip_loop_filter.CheckMd5(false); @@ -131,7 +139,7 @@ TEST(SkipLoopFilterTest, ShutOffLoopFilterSingleThread) { const int non_zero_value = 1; const int num_threads = 1; SkipLoopFilterTest skip_loop_filter; - skip_loop_filter.Init(num_threads); + ASSERT_TRUE(skip_loop_filter.Init(num_threads)); skip_loop_filter.SetSkipLoopFilter(non_zero_value, VPX_CODEC_OK); ASSERT_EQ(VPX_CODEC_OK, skip_loop_filter.DecodeRemainingFrames()); skip_loop_filter.CheckMd5(false); @@ -141,7 +149,7 @@ TEST(SkipLoopFilterTest, ShutOffLoopFilter8Threads) { const int non_zero_value = 1; const int num_threads = 8; SkipLoopFilterTest skip_loop_filter; - skip_loop_filter.Init(num_threads); + ASSERT_TRUE(skip_loop_filter.Init(num_threads)); skip_loop_filter.SetSkipLoopFilter(non_zero_value, VPX_CODEC_OK); ASSERT_EQ(VPX_CODEC_OK, skip_loop_filter.DecodeRemainingFrames()); skip_loop_filter.CheckMd5(false); @@ -151,7 +159,7 @@ TEST(SkipLoopFilterTest, WithLoopFilter) { const int non_zero_value = 1; const int num_threads = 0; SkipLoopFilterTest skip_loop_filter; - skip_loop_filter.Init(num_threads); + ASSERT_TRUE(skip_loop_filter.Init(num_threads)); skip_loop_filter.SetSkipLoopFilter(non_zero_value, VPX_CODEC_OK); skip_loop_filter.SetSkipLoopFilter(0, VPX_CODEC_OK); ASSERT_EQ(VPX_CODEC_OK, skip_loop_filter.DecodeRemainingFrames()); @@ -161,7 +169,7 @@ TEST(SkipLoopFilterTest, WithLoopFilter) { TEST(SkipLoopFilterTest, ToggleLoopFilter) { const int num_threads = 0; SkipLoopFilterTest skip_loop_filter; - skip_loop_filter.Init(num_threads); + ASSERT_TRUE(skip_loop_filter.Init(num_threads)); for (int i = 0; i < 10; ++i) { skip_loop_filter.SetSkipLoopFilter(i % 2, VPX_CODEC_OK); From 83769e3d250b05df0bb97fc619f5886587b2a310 Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 17 Jun 2020 12:16:51 -0700 Subject: [PATCH 0009/1000] update googletest to v1.10.0 this moves the framework to c++11 and changes *_TEST_CASE* to _TEST_SUITE BUG=webm:1695 Change-Id: I07f2c20850312a9c7e381b38353d2f9f45889cb1 --- test/active_map_refresh_test.cc | 6 +- test/active_map_test.cc | 6 +- test/add_noise_test.cc | 6 +- test/alt_ref_aq_segment_test.cc | 8 +- test/altref_test.cc | 16 +- test/aq_segment_test.cc | 8 +- test/avg_test.cc | 94 +- test/blockiness_test.cc | 3 +- test/borders_test.cc | 4 +- test/byte_alignment_test.cc | 4 +- test/codec_factory.h | 12 +- test/comp_avg_pred_test.cc | 16 +- test/config_test.cc | 2 +- test/consistency_test.cc | 4 +- test/convolve_test.cc | 39 +- test/cpu_speed_test.cc | 10 +- test/cq_test.cc | 4 +- test/dct16x16_test.cc | 44 +- test/dct32x32_test.cc | 16 +- test/dct_partial_test.cc | 18 +- test/dct_test.cc | 74 +- test/decode_corrupted.cc | 4 +- test/decode_perf_test.cc | 8 +- test/decode_svc_test.cc | 2 +- test/encode_perf_test.cc | 4 +- test/error_resilience_test.cc | 12 +- test/external_frame_buffer_test.cc | 2 +- test/fdct8x8_test.cc | 66 +- test/hadamard_test.cc | 18 +- test/idct_test.cc | 19 +- test/invalid_file_test.cc | 19 +- test/keyframe_test.cc | 2 +- test/level_test.cc | 8 +- test/lpf_test.cc | 26 +- test/minmax_test.cc | 14 +- test/partial_idct_test.cc | 28 +- test/pp_filter_test.cc | 50 +- test/predict_test.cc | 24 +- test/quantize_test.cc | 21 +- test/realtime_test.cc | 8 +- test/resize_test.cc | 20 +- test/sad_test.cc | 54 +- test/sum_squares_test.cc | 6 +- test/superframe_test.cc | 2 +- test/svc_datarate_test.cc | 32 +- test/svc_end_to_end_test.cc | 4 +- test/test_vector_test.cc | 8 +- test/tile_independence_test.cc | 2 +- test/timestamp_test.cc | 8 +- test/variance_test.cc | 163 +- test/vp8_datarate_test.cc | 10 +- test/vp8_denoiser_sse2_test.cc | 2 +- test/vp8_fdct4x4_test.cc | 18 +- test/vp9_arf_freq_test.cc | 6 +- test/vp9_block_error_test.cc | 6 +- test/vp9_datarate_test.cc | 24 +- test/vp9_denoiser_test.cc | 4 +- test/vp9_encoder_parms_get_to_decoder.cc | 6 +- test/vp9_end_to_end_test.cc | 24 +- test/vp9_ethread_test.cc | 6 +- test/vp9_intrapred_test.cc | 32 +- test/vp9_lossless_test.cc | 8 +- test/vp9_motion_vector_test.cc | 8 +- test/vp9_quantize_test.cc | 30 +- test/vp9_scale_test.cc | 12 +- test/vp9_subtract_test.cc | 24 +- test/vp9_thread_test.cc | 2 +- test/vpx_scale_test.cc | 8 +- test/y4m_test.cc | 8 +- test/yuv_temporal_filter_test.cc | 12 +- third_party/googletest/README.libvpx | 10 +- third_party/googletest/src/README.md | 227 +- .../src/include/gtest/gtest-death-test.h | 23 +- .../src/include/gtest/gtest-matchers.h | 750 +++ .../src/include/gtest/gtest-message.h | 43 +- .../src/include/gtest/gtest-param-test.h | 1201 +--- .../src/include/gtest/gtest-param-test.h.pump | 500 -- .../src/include/gtest/gtest-printers.h | 285 +- .../googletest/src/include/gtest/gtest-spi.h | 5 +- .../src/include/gtest/gtest-test-part.h | 33 +- .../src/include/gtest/gtest-typed-test.h | 198 +- .../googletest/src/include/gtest/gtest.h | 683 +- .../src/include/gtest/gtest_pred_impl.h | 74 +- .../internal/gtest-death-test-internal.h | 114 +- .../include/gtest/internal/gtest-filepath.h | 2 +- .../include/gtest/internal/gtest-internal.h | 558 +- .../include/gtest/internal/gtest-linked_ptr.h | 243 - .../internal/gtest-param-util-generated.h | 5552 ----------------- .../gtest-param-util-generated.h.pump | 282 - .../include/gtest/internal/gtest-param-util.h | 485 +- .../include/gtest/internal/gtest-port-arch.h | 15 +- .../src/include/gtest/internal/gtest-port.h | 761 +-- .../src/include/gtest/internal/gtest-string.h | 22 +- .../src/include/gtest/internal/gtest-tuple.h | 1021 --- .../include/gtest/internal/gtest-tuple.h.pump | 348 -- .../include/gtest/internal/gtest-type-util.h | 21 +- .../gtest/internal/gtest-type-util.h.pump | 20 +- third_party/googletest/src/src/gtest-all.cc | 1 + .../googletest/src/src/gtest-death-test.cc | 414 +- .../googletest/src/src/gtest-filepath.cc | 28 +- .../googletest/src/src/gtest-internal-inl.h | 238 +- .../googletest/src/src/gtest-matchers.cc | 97 + third_party/googletest/src/src/gtest-port.cc | 277 +- .../googletest/src/src/gtest-printers.cc | 33 +- .../googletest/src/src/gtest-test-part.cc | 21 +- .../googletest/src/src/gtest-typed-test.cc | 6 +- third_party/googletest/src/src/gtest.cc | 1600 ++--- third_party/googletest/src/src/gtest_main.cc | 12 +- 108 files changed, 4696 insertions(+), 12785 deletions(-) create mode 100644 third_party/googletest/src/include/gtest/gtest-matchers.h delete mode 100644 third_party/googletest/src/include/gtest/gtest-param-test.h.pump delete mode 100644 third_party/googletest/src/include/gtest/internal/gtest-linked_ptr.h delete mode 100644 third_party/googletest/src/include/gtest/internal/gtest-param-util-generated.h delete mode 100644 third_party/googletest/src/include/gtest/internal/gtest-param-util-generated.h.pump delete mode 100644 third_party/googletest/src/include/gtest/internal/gtest-tuple.h delete mode 100644 third_party/googletest/src/include/gtest/internal/gtest-tuple.h.pump create mode 100644 third_party/googletest/src/src/gtest-matchers.cc diff --git a/test/active_map_refresh_test.cc b/test/active_map_refresh_test.cc index a985ed4f11..208d7c6d79 100644 --- a/test/active_map_refresh_test.cc +++ b/test/active_map_refresh_test.cc @@ -122,7 +122,7 @@ TEST_P(ActiveMapRefreshTest, Test) { ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); } -VP9_INSTANTIATE_TEST_CASE(ActiveMapRefreshTest, - ::testing::Values(::libvpx_test::kRealTime), - ::testing::Range(5, 6)); +VP9_INSTANTIATE_TEST_SUITE(ActiveMapRefreshTest, + ::testing::Values(::libvpx_test::kRealTime), + ::testing::Range(5, 6)); } // namespace diff --git a/test/active_map_test.cc b/test/active_map_test.cc index 03536c81ef..e08d9a0367 100644 --- a/test/active_map_test.cc +++ b/test/active_map_test.cc @@ -85,7 +85,7 @@ TEST_P(ActiveMapTest, Test) { ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); } -VP9_INSTANTIATE_TEST_CASE(ActiveMapTest, - ::testing::Values(::libvpx_test::kRealTime), - ::testing::Range(0, 9)); +VP9_INSTANTIATE_TEST_SUITE(ActiveMapTest, + ::testing::Values(::libvpx_test::kRealTime), + ::testing::Range(0, 9)); } // namespace diff --git a/test/add_noise_test.cc b/test/add_noise_test.cc index 0d1893c524..1c4403ef6a 100644 --- a/test/add_noise_test.cc +++ b/test/add_noise_test.cc @@ -129,20 +129,20 @@ TEST_P(AddNoiseTest, CheckCvsAssembly) { using std::make_tuple; -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( C, AddNoiseTest, ::testing::Values(make_tuple(3.25, vpx_plane_add_noise_c), make_tuple(4.4, vpx_plane_add_noise_c))); #if HAVE_SSE2 -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( SSE2, AddNoiseTest, ::testing::Values(make_tuple(3.25, vpx_plane_add_noise_sse2), make_tuple(4.4, vpx_plane_add_noise_sse2))); #endif #if HAVE_MSA -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( MSA, AddNoiseTest, ::testing::Values(make_tuple(3.25, vpx_plane_add_noise_msa), make_tuple(4.4, vpx_plane_add_noise_msa))); diff --git a/test/alt_ref_aq_segment_test.cc b/test/alt_ref_aq_segment_test.cc index 6e03a47852..00a00e27c5 100644 --- a/test/alt_ref_aq_segment_test.cc +++ b/test/alt_ref_aq_segment_test.cc @@ -150,8 +150,8 @@ TEST_P(AltRefAqSegmentTest, TestNoMisMatchAltRefAQ4) { ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); } -VP9_INSTANTIATE_TEST_CASE(AltRefAqSegmentTest, - ::testing::Values(::libvpx_test::kOnePassGood, - ::libvpx_test::kTwoPassGood), - ::testing::Range(2, 5)); +VP9_INSTANTIATE_TEST_SUITE(AltRefAqSegmentTest, + ::testing::Values(::libvpx_test::kOnePassGood, + ::libvpx_test::kTwoPassGood), + ::testing::Range(2, 5)); } // namespace diff --git a/test/altref_test.cc b/test/altref_test.cc index 0119be4da0..69bcef774e 100644 --- a/test/altref_test.cc +++ b/test/altref_test.cc @@ -63,8 +63,8 @@ TEST_P(AltRefTest, MonotonicTimestamps) { EXPECT_GE(altref_count(), 1); } -VP8_INSTANTIATE_TEST_CASE(AltRefTest, - ::testing::Range(kLookAheadMin, kLookAheadMax)); +VP8_INSTANTIATE_TEST_SUITE(AltRefTest, + ::testing::Range(kLookAheadMin, kLookAheadMax)); #endif // CONFIG_VP8_ENCODER @@ -142,11 +142,11 @@ TEST_P(AltRefForcedKeyTestLarge, ForcedFrameIsKey) { } } -VP8_INSTANTIATE_TEST_CASE(AltRefForcedKeyTestLarge, - ::testing::Values(::libvpx_test::kOnePassGood), - ::testing::Range(0, 9)); +VP8_INSTANTIATE_TEST_SUITE(AltRefForcedKeyTestLarge, + ::testing::Values(::libvpx_test::kOnePassGood), + ::testing::Range(0, 9)); -VP9_INSTANTIATE_TEST_CASE(AltRefForcedKeyTestLarge, - ::testing::Values(::libvpx_test::kOnePassGood), - ::testing::Range(0, 9)); +VP9_INSTANTIATE_TEST_SUITE(AltRefForcedKeyTestLarge, + ::testing::Values(::libvpx_test::kOnePassGood), + ::testing::Range(0, 9)); } // namespace diff --git a/test/aq_segment_test.cc b/test/aq_segment_test.cc index 3c4053be7f..2cbc991d0c 100644 --- a/test/aq_segment_test.cc +++ b/test/aq_segment_test.cc @@ -102,8 +102,8 @@ TEST_P(AqSegmentTest, TestNoMisMatchAQ3) { ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); } -VP9_INSTANTIATE_TEST_CASE(AqSegmentTest, - ::testing::Values(::libvpx_test::kRealTime, - ::libvpx_test::kOnePassGood), - ::testing::Range(3, 9)); +VP9_INSTANTIATE_TEST_SUITE(AqSegmentTest, + ::testing::Values(::libvpx_test::kRealTime, + ::libvpx_test::kOnePassGood), + ::testing::Range(3, 9)); } // namespace diff --git a/test/avg_test.cc b/test/avg_test.cc index 72e16f6578..601fb19012 100644 --- a/test/avg_test.cc +++ b/test/avg_test.cc @@ -557,38 +557,38 @@ TEST_P(BlockErrorTestFP, DISABLED_Speed) { using std::make_tuple; -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( C, AverageTest, ::testing::Values(make_tuple(16, 16, 1, 8, &vpx_avg_8x8_c), make_tuple(16, 16, 1, 4, &vpx_avg_4x4_c))); #if CONFIG_VP9_HIGHBITDEPTH -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( C, AverageTestHBD, ::testing::Values(make_tuple(16, 16, 1, 8, &vpx_highbd_avg_8x8_c), make_tuple(16, 16, 1, 4, &vpx_highbd_avg_4x4_c))); #if HAVE_SSE2 -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( SSE2, AverageTestHBD, ::testing::Values(make_tuple(16, 16, 1, 8, &vpx_highbd_avg_8x8_sse2), make_tuple(16, 16, 1, 4, &vpx_highbd_avg_4x4_sse2))); #endif // HAVE_SSE2 -INSTANTIATE_TEST_CASE_P(C, SatdHighbdTest, - ::testing::Values(make_tuple(16, &vpx_satd_c), - make_tuple(64, &vpx_satd_c), - make_tuple(256, &vpx_satd_c), - make_tuple(1024, &vpx_satd_c))); +INSTANTIATE_TEST_SUITE_P(C, SatdHighbdTest, + ::testing::Values(make_tuple(16, &vpx_satd_c), + make_tuple(64, &vpx_satd_c), + make_tuple(256, &vpx_satd_c), + make_tuple(1024, &vpx_satd_c))); #endif // CONFIG_VP9_HIGHBITDEPTH -INSTANTIATE_TEST_CASE_P(C, SatdLowbdTest, - ::testing::Values(make_tuple(16, &vpx_satd_c), - make_tuple(64, &vpx_satd_c), - make_tuple(256, &vpx_satd_c), - make_tuple(1024, &vpx_satd_c))); +INSTANTIATE_TEST_SUITE_P(C, SatdLowbdTest, + ::testing::Values(make_tuple(16, &vpx_satd_c), + make_tuple(64, &vpx_satd_c), + make_tuple(256, &vpx_satd_c), + make_tuple(1024, &vpx_satd_c))); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( C, BlockErrorTestFP, ::testing::Values(make_tuple(16, &vp9_block_error_fp_c), make_tuple(64, &vp9_block_error_fp_c), @@ -596,7 +596,7 @@ INSTANTIATE_TEST_CASE_P( make_tuple(1024, &vp9_block_error_fp_c))); #if HAVE_SSE2 -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( SSE2, AverageTest, ::testing::Values(make_tuple(16, 16, 0, 8, &vpx_avg_8x8_sse2), make_tuple(16, 16, 5, 8, &vpx_avg_8x8_sse2), @@ -605,27 +605,27 @@ INSTANTIATE_TEST_CASE_P( make_tuple(16, 16, 5, 4, &vpx_avg_4x4_sse2), make_tuple(32, 32, 15, 4, &vpx_avg_4x4_sse2))); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( SSE2, IntProRowTest, ::testing::Values(make_tuple(16, &vpx_int_pro_row_sse2, &vpx_int_pro_row_c), make_tuple(32, &vpx_int_pro_row_sse2, &vpx_int_pro_row_c), make_tuple(64, &vpx_int_pro_row_sse2, &vpx_int_pro_row_c))); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( SSE2, IntProColTest, ::testing::Values(make_tuple(16, &vpx_int_pro_col_sse2, &vpx_int_pro_col_c), make_tuple(32, &vpx_int_pro_col_sse2, &vpx_int_pro_col_c), make_tuple(64, &vpx_int_pro_col_sse2, &vpx_int_pro_col_c))); -INSTANTIATE_TEST_CASE_P(SSE2, SatdLowbdTest, - ::testing::Values(make_tuple(16, &vpx_satd_sse2), - make_tuple(64, &vpx_satd_sse2), - make_tuple(256, &vpx_satd_sse2), - make_tuple(1024, &vpx_satd_sse2))); +INSTANTIATE_TEST_SUITE_P(SSE2, SatdLowbdTest, + ::testing::Values(make_tuple(16, &vpx_satd_sse2), + make_tuple(64, &vpx_satd_sse2), + make_tuple(256, &vpx_satd_sse2), + make_tuple(1024, &vpx_satd_sse2))); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( SSE2, BlockErrorTestFP, ::testing::Values(make_tuple(16, &vp9_block_error_fp_sse2), make_tuple(64, &vp9_block_error_fp_sse2), @@ -634,14 +634,14 @@ INSTANTIATE_TEST_CASE_P( #endif // HAVE_SSE2 #if HAVE_AVX2 -INSTANTIATE_TEST_CASE_P(AVX2, SatdLowbdTest, - ::testing::Values(make_tuple(16, &vpx_satd_avx2), - make_tuple(64, &vpx_satd_avx2), - make_tuple(256, &vpx_satd_avx2), - make_tuple(1024, &vpx_satd_avx2))); +INSTANTIATE_TEST_SUITE_P(AVX2, SatdLowbdTest, + ::testing::Values(make_tuple(16, &vpx_satd_avx2), + make_tuple(64, &vpx_satd_avx2), + make_tuple(256, &vpx_satd_avx2), + make_tuple(1024, &vpx_satd_avx2))); #if CONFIG_VP9_HIGHBITDEPTH -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( AVX2, SatdHighbdTest, ::testing::Values(make_tuple(16, &vpx_highbd_satd_avx2), make_tuple(64, &vpx_highbd_satd_avx2), @@ -649,7 +649,7 @@ INSTANTIATE_TEST_CASE_P( make_tuple(1024, &vpx_highbd_satd_avx2))); #endif // CONFIG_VP9_HIGHBITDEPTH -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( AVX2, BlockErrorTestFP, ::testing::Values(make_tuple(16, &vp9_block_error_fp_avx2), make_tuple(64, &vp9_block_error_fp_avx2), @@ -658,7 +658,7 @@ INSTANTIATE_TEST_CASE_P( #endif #if HAVE_NEON -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( NEON, AverageTest, ::testing::Values(make_tuple(16, 16, 0, 8, &vpx_avg_8x8_neon), make_tuple(16, 16, 5, 8, &vpx_avg_8x8_neon), @@ -667,30 +667,30 @@ INSTANTIATE_TEST_CASE_P( make_tuple(16, 16, 5, 4, &vpx_avg_4x4_neon), make_tuple(32, 32, 15, 4, &vpx_avg_4x4_neon))); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( NEON, IntProRowTest, ::testing::Values(make_tuple(16, &vpx_int_pro_row_neon, &vpx_int_pro_row_c), make_tuple(32, &vpx_int_pro_row_neon, &vpx_int_pro_row_c), make_tuple(64, &vpx_int_pro_row_neon, &vpx_int_pro_row_c))); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( NEON, IntProColTest, ::testing::Values(make_tuple(16, &vpx_int_pro_col_neon, &vpx_int_pro_col_c), make_tuple(32, &vpx_int_pro_col_neon, &vpx_int_pro_col_c), make_tuple(64, &vpx_int_pro_col_neon, &vpx_int_pro_col_c))); -INSTANTIATE_TEST_CASE_P(NEON, SatdLowbdTest, - ::testing::Values(make_tuple(16, &vpx_satd_neon), - make_tuple(64, &vpx_satd_neon), - make_tuple(256, &vpx_satd_neon), - make_tuple(1024, &vpx_satd_neon))); +INSTANTIATE_TEST_SUITE_P(NEON, SatdLowbdTest, + ::testing::Values(make_tuple(16, &vpx_satd_neon), + make_tuple(64, &vpx_satd_neon), + make_tuple(256, &vpx_satd_neon), + make_tuple(1024, &vpx_satd_neon))); // TODO(jianj): Remove the highbitdepth flag once the SIMD functions are // in place. #if !CONFIG_VP9_HIGHBITDEPTH -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( NEON, BlockErrorTestFP, ::testing::Values(make_tuple(16, &vp9_block_error_fp_neon), make_tuple(64, &vp9_block_error_fp_neon), @@ -700,7 +700,7 @@ INSTANTIATE_TEST_CASE_P( #endif // HAVE_NEON #if HAVE_MSA -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( MSA, AverageTest, ::testing::Values(make_tuple(16, 16, 0, 8, &vpx_avg_8x8_msa), make_tuple(16, 16, 5, 8, &vpx_avg_8x8_msa), @@ -709,14 +709,14 @@ INSTANTIATE_TEST_CASE_P( make_tuple(16, 16, 5, 4, &vpx_avg_4x4_msa), make_tuple(32, 32, 15, 4, &vpx_avg_4x4_msa))); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( MSA, IntProRowTest, ::testing::Values(make_tuple(16, &vpx_int_pro_row_msa, &vpx_int_pro_row_c), make_tuple(32, &vpx_int_pro_row_msa, &vpx_int_pro_row_c), make_tuple(64, &vpx_int_pro_row_msa, &vpx_int_pro_row_c))); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( MSA, IntProColTest, ::testing::Values(make_tuple(16, &vpx_int_pro_col_msa, &vpx_int_pro_col_c), make_tuple(32, &vpx_int_pro_col_msa, &vpx_int_pro_col_c), @@ -726,11 +726,11 @@ INSTANTIATE_TEST_CASE_P( // TODO(jingning): Remove the highbitdepth flag once the SIMD functions are // in place. #if !CONFIG_VP9_HIGHBITDEPTH -INSTANTIATE_TEST_CASE_P(MSA, SatdLowbdTest, - ::testing::Values(make_tuple(16, &vpx_satd_msa), - make_tuple(64, &vpx_satd_msa), - make_tuple(256, &vpx_satd_msa), - make_tuple(1024, &vpx_satd_msa))); +INSTANTIATE_TEST_SUITE_P(MSA, SatdLowbdTest, + ::testing::Values(make_tuple(16, &vpx_satd_msa), + make_tuple(64, &vpx_satd_msa), + make_tuple(256, &vpx_satd_msa), + make_tuple(1024, &vpx_satd_msa))); #endif // !CONFIG_VP9_HIGHBITDEPTH #endif // HAVE_MSA diff --git a/test/blockiness_test.cc b/test/blockiness_test.cc index ced6e66c62..75aa2938e2 100644 --- a/test/blockiness_test.cc +++ b/test/blockiness_test.cc @@ -215,7 +215,8 @@ using std::make_tuple; const BlockinessParam c_vp9_tests[] = { make_tuple(320, 240), make_tuple(318, 242), make_tuple(318, 238) }; -INSTANTIATE_TEST_CASE_P(C, BlockinessVP9Test, ::testing::ValuesIn(c_vp9_tests)); +INSTANTIATE_TEST_SUITE_P(C, BlockinessVP9Test, + ::testing::ValuesIn(c_vp9_tests)); #endif } // namespace diff --git a/test/borders_test.cc b/test/borders_test.cc index b91a15b800..3c1f69a923 100644 --- a/test/borders_test.cc +++ b/test/borders_test.cc @@ -79,6 +79,6 @@ TEST_P(BordersTest, TestLowBitrate) { ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); } -VP9_INSTANTIATE_TEST_CASE(BordersTest, - ::testing::Values(::libvpx_test::kTwoPassGood)); +VP9_INSTANTIATE_TEST_SUITE(BordersTest, + ::testing::Values(::libvpx_test::kTwoPassGood)); } // namespace diff --git a/test/byte_alignment_test.cc b/test/byte_alignment_test.cc index 0ef6c4c519..9feb7bdce7 100644 --- a/test/byte_alignment_test.cc +++ b/test/byte_alignment_test.cc @@ -176,8 +176,8 @@ TEST_P(ByteAlignmentTest, TestAlignment) { } } -INSTANTIATE_TEST_CASE_P(Alignments, ByteAlignmentTest, - ::testing::ValuesIn(kBaTestParams)); +INSTANTIATE_TEST_SUITE_P(Alignments, ByteAlignmentTest, + ::testing::ValuesIn(kBaTestParams)); #endif // CONFIG_WEBM_IO diff --git a/test/codec_factory.h b/test/codec_factory.h index 17c9512ca8..77ce49de9f 100644 --- a/test/codec_factory.h +++ b/test/codec_factory.h @@ -157,15 +157,15 @@ class VP8CodecFactory : public CodecFactory { const libvpx_test::VP8CodecFactory kVP8; -#define VP8_INSTANTIATE_TEST_CASE(test, ...) \ - INSTANTIATE_TEST_CASE_P( \ +#define VP8_INSTANTIATE_TEST_SUITE(test, ...) \ + INSTANTIATE_TEST_SUITE_P( \ VP8, test, \ ::testing::Combine( \ ::testing::Values(static_cast( \ &libvpx_test::kVP8)), \ __VA_ARGS__)) #else -#define VP8_INSTANTIATE_TEST_CASE(test, ...) +#define VP8_INSTANTIATE_TEST_SUITE(test, ...) #endif // CONFIG_VP8 /* @@ -253,15 +253,15 @@ class VP9CodecFactory : public CodecFactory { const libvpx_test::VP9CodecFactory kVP9; -#define VP9_INSTANTIATE_TEST_CASE(test, ...) \ - INSTANTIATE_TEST_CASE_P( \ +#define VP9_INSTANTIATE_TEST_SUITE(test, ...) \ + INSTANTIATE_TEST_SUITE_P( \ VP9, test, \ ::testing::Combine( \ ::testing::Values(static_cast( \ &libvpx_test::kVP9)), \ __VA_ARGS__)) #else -#define VP9_INSTANTIATE_TEST_CASE(test, ...) +#define VP9_INSTANTIATE_TEST_SUITE(test, ...) #endif // CONFIG_VP9 } // namespace libvpx_test diff --git a/test/comp_avg_pred_test.cc b/test/comp_avg_pred_test.cc index 56e701e09c..b33bc3fba9 100644 --- a/test/comp_avg_pred_test.cc +++ b/test/comp_avg_pred_test.cc @@ -166,21 +166,21 @@ TEST_P(AvgPredTest, DISABLED_Speed) { } } -INSTANTIATE_TEST_CASE_P(C, AvgPredTest, - ::testing::Values(&vpx_comp_avg_pred_c)); +INSTANTIATE_TEST_SUITE_P(C, AvgPredTest, + ::testing::Values(&vpx_comp_avg_pred_c)); #if HAVE_SSE2 -INSTANTIATE_TEST_CASE_P(SSE2, AvgPredTest, - ::testing::Values(&vpx_comp_avg_pred_sse2)); +INSTANTIATE_TEST_SUITE_P(SSE2, AvgPredTest, + ::testing::Values(&vpx_comp_avg_pred_sse2)); #endif // HAVE_SSE2 #if HAVE_NEON -INSTANTIATE_TEST_CASE_P(NEON, AvgPredTest, - ::testing::Values(&vpx_comp_avg_pred_neon)); +INSTANTIATE_TEST_SUITE_P(NEON, AvgPredTest, + ::testing::Values(&vpx_comp_avg_pred_neon)); #endif // HAVE_NEON #if HAVE_VSX -INSTANTIATE_TEST_CASE_P(VSX, AvgPredTest, - ::testing::Values(&vpx_comp_avg_pred_vsx)); +INSTANTIATE_TEST_SUITE_P(VSX, AvgPredTest, + ::testing::Values(&vpx_comp_avg_pred_vsx)); #endif // HAVE_VSX } // namespace diff --git a/test/config_test.cc b/test/config_test.cc index b2f8ea5ed3..8f4c60e113 100644 --- a/test/config_test.cc +++ b/test/config_test.cc @@ -58,5 +58,5 @@ TEST_P(ConfigTest, LagIsDisabled) { EXPECT_EQ(frame_count_in_, frame_count_out_); } -VP8_INSTANTIATE_TEST_CASE(ConfigTest, ONE_PASS_TEST_MODES); +VP8_INSTANTIATE_TEST_SUITE(ConfigTest, ONE_PASS_TEST_MODES); } // namespace diff --git a/test/consistency_test.cc b/test/consistency_test.cc index 875b06f4aa..69ebaf70c4 100644 --- a/test/consistency_test.cc +++ b/test/consistency_test.cc @@ -208,8 +208,8 @@ using std::make_tuple; const ConsistencyParam c_vp9_tests[] = { make_tuple(320, 240), make_tuple(318, 242), make_tuple(318, 238) }; -INSTANTIATE_TEST_CASE_P(C, ConsistencyVP9Test, - ::testing::ValuesIn(c_vp9_tests)); +INSTANTIATE_TEST_SUITE_P(C, ConsistencyVP9Test, + ::testing::ValuesIn(c_vp9_tests)); #endif } // namespace diff --git a/test/convolve_test.cc b/test/convolve_test.cc index d8d053df5f..7330e97db0 100644 --- a/test/convolve_test.cc +++ b/test/convolve_test.cc @@ -1276,7 +1276,8 @@ const ConvolveFunctions convolve8_c( vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0); const ConvolveParam kArrayConvolve_c[] = { ALL_SIZES(convolve8_c) }; #endif -INSTANTIATE_TEST_CASE_P(C, ConvolveTest, ::testing::ValuesIn(kArrayConvolve_c)); +INSTANTIATE_TEST_SUITE_P(C, ConvolveTest, + ::testing::ValuesIn(kArrayConvolve_c)); #if HAVE_SSE2 && VPX_ARCH_X86_64 #if CONFIG_VP9_HIGHBITDEPTH @@ -1317,8 +1318,8 @@ const ConvolveFunctions convolve8_sse2( const ConvolveParam kArrayConvolve_sse2[] = { ALL_SIZES(convolve8_sse2) }; #endif // CONFIG_VP9_HIGHBITDEPTH -INSTANTIATE_TEST_CASE_P(SSE2, ConvolveTest, - ::testing::ValuesIn(kArrayConvolve_sse2)); +INSTANTIATE_TEST_SUITE_P(SSE2, ConvolveTest, + ::testing::ValuesIn(kArrayConvolve_sse2)); #endif #if HAVE_SSSE3 @@ -1330,8 +1331,8 @@ const ConvolveFunctions convolve8_ssse3( vpx_scaled_avg_vert_c, vpx_scaled_2d_ssse3, vpx_scaled_avg_2d_c, 0); const ConvolveParam kArrayConvolve8_ssse3[] = { ALL_SIZES(convolve8_ssse3) }; -INSTANTIATE_TEST_CASE_P(SSSE3, ConvolveTest, - ::testing::ValuesIn(kArrayConvolve8_ssse3)); +INSTANTIATE_TEST_SUITE_P(SSSE3, ConvolveTest, + ::testing::ValuesIn(kArrayConvolve8_ssse3)); #endif #if HAVE_AVX2 @@ -1362,8 +1363,8 @@ const ConvolveFunctions convolve12_avx2( const ConvolveParam kArrayConvolve8_avx2[] = { ALL_SIZES(convolve8_avx2), ALL_SIZES(convolve10_avx2), ALL_SIZES(convolve12_avx2) }; -INSTANTIATE_TEST_CASE_P(AVX2, ConvolveTest, - ::testing::ValuesIn(kArrayConvolve8_avx2)); +INSTANTIATE_TEST_SUITE_P(AVX2, ConvolveTest, + ::testing::ValuesIn(kArrayConvolve8_avx2)); #else // !CONFIG_VP9_HIGHBITDEPTH const ConvolveFunctions convolve8_avx2( vpx_convolve_copy_c, vpx_convolve_avg_c, vpx_convolve8_horiz_avx2, @@ -1372,8 +1373,8 @@ const ConvolveFunctions convolve8_avx2( vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c, vpx_scaled_vert_c, vpx_scaled_avg_vert_c, vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0); const ConvolveParam kArrayConvolve8_avx2[] = { ALL_SIZES(convolve8_avx2) }; -INSTANTIATE_TEST_CASE_P(AVX2, ConvolveTest, - ::testing::ValuesIn(kArrayConvolve8_avx2)); +INSTANTIATE_TEST_SUITE_P(AVX2, ConvolveTest, + ::testing::ValuesIn(kArrayConvolve8_avx2)); #endif // CONFIG_VP9_HIGHBITDEPTH #endif // HAVE_AVX2 @@ -1416,8 +1417,8 @@ const ConvolveFunctions convolve8_neon( const ConvolveParam kArrayConvolve_neon[] = { ALL_SIZES(convolve8_neon) }; #endif // CONFIG_VP9_HIGHBITDEPTH -INSTANTIATE_TEST_CASE_P(NEON, ConvolveTest, - ::testing::ValuesIn(kArrayConvolve_neon)); +INSTANTIATE_TEST_SUITE_P(NEON, ConvolveTest, + ::testing::ValuesIn(kArrayConvolve_neon)); #endif // HAVE_NEON #if HAVE_DSPR2 @@ -1429,8 +1430,8 @@ const ConvolveFunctions convolve8_dspr2( vpx_scaled_avg_vert_c, vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0); const ConvolveParam kArrayConvolve8_dspr2[] = { ALL_SIZES(convolve8_dspr2) }; -INSTANTIATE_TEST_CASE_P(DSPR2, ConvolveTest, - ::testing::ValuesIn(kArrayConvolve8_dspr2)); +INSTANTIATE_TEST_SUITE_P(DSPR2, ConvolveTest, + ::testing::ValuesIn(kArrayConvolve8_dspr2)); #endif // HAVE_DSPR2 #if HAVE_MSA @@ -1442,8 +1443,8 @@ const ConvolveFunctions convolve8_msa( vpx_scaled_avg_vert_c, vpx_scaled_2d_msa, vpx_scaled_avg_2d_c, 0); const ConvolveParam kArrayConvolve8_msa[] = { ALL_SIZES(convolve8_msa) }; -INSTANTIATE_TEST_CASE_P(MSA, ConvolveTest, - ::testing::ValuesIn(kArrayConvolve8_msa)); +INSTANTIATE_TEST_SUITE_P(MSA, ConvolveTest, + ::testing::ValuesIn(kArrayConvolve8_msa)); #endif // HAVE_MSA #if HAVE_VSX @@ -1454,8 +1455,8 @@ const ConvolveFunctions convolve8_vsx( vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c, vpx_scaled_vert_c, vpx_scaled_avg_vert_c, vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0); const ConvolveParam kArrayConvolve_vsx[] = { ALL_SIZES(convolve8_vsx) }; -INSTANTIATE_TEST_CASE_P(VSX, ConvolveTest, - ::testing::ValuesIn(kArrayConvolve_vsx)); +INSTANTIATE_TEST_SUITE_P(VSX, ConvolveTest, + ::testing::ValuesIn(kArrayConvolve_vsx)); #endif // HAVE_VSX #if HAVE_MMI @@ -1466,7 +1467,7 @@ const ConvolveFunctions convolve8_mmi( vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c, vpx_scaled_vert_c, vpx_scaled_avg_vert_c, vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0); const ConvolveParam kArrayConvolve_mmi[] = { ALL_SIZES(convolve8_mmi) }; -INSTANTIATE_TEST_CASE_P(MMI, ConvolveTest, - ::testing::ValuesIn(kArrayConvolve_mmi)); +INSTANTIATE_TEST_SUITE_P(MMI, ConvolveTest, + ::testing::ValuesIn(kArrayConvolve_mmi)); #endif // HAVE_MMI } // namespace diff --git a/test/cpu_speed_test.cc b/test/cpu_speed_test.cc index 2fb5c10eae..a7623f09ac 100644 --- a/test/cpu_speed_test.cc +++ b/test/cpu_speed_test.cc @@ -148,9 +148,9 @@ TEST_P(CpuSpeedTest, TestLowBitrate) { ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); } -VP9_INSTANTIATE_TEST_CASE(CpuSpeedTest, - ::testing::Values(::libvpx_test::kTwoPassGood, - ::libvpx_test::kOnePassGood, - ::libvpx_test::kRealTime), - ::testing::Range(0, 10)); +VP9_INSTANTIATE_TEST_SUITE(CpuSpeedTest, + ::testing::Values(::libvpx_test::kTwoPassGood, + ::libvpx_test::kOnePassGood, + ::libvpx_test::kRealTime), + ::testing::Range(0, 10)); } // namespace diff --git a/test/cq_test.cc b/test/cq_test.cc index 474b9d0fa2..3126f3b4e2 100644 --- a/test/cq_test.cc +++ b/test/cq_test.cc @@ -126,6 +126,6 @@ TEST_P(CQTest, LinearPSNRIsHigherForCQLevel) { EXPECT_GE(cq_psnr_lin, vbr_psnr_lin); } -VP8_INSTANTIATE_TEST_CASE(CQTest, ::testing::Range(kCQLevelMin, kCQLevelMax, - kCQLevelStep)); +VP8_INSTANTIATE_TEST_SUITE(CQTest, ::testing::Range(kCQLevelMin, kCQLevelMax, + kCQLevelStep)); } // namespace diff --git a/test/dct16x16_test.cc b/test/dct16x16_test.cc index 9ccf2b84f1..d586db7208 100644 --- a/test/dct16x16_test.cc +++ b/test/dct16x16_test.cc @@ -747,21 +747,21 @@ TEST_P(InvTrans16x16DCT, CompareReference) { using std::make_tuple; #if CONFIG_VP9_HIGHBITDEPTH -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( C, Trans16x16DCT, ::testing::Values( make_tuple(&vpx_highbd_fdct16x16_c, &idct16x16_10, 0, VPX_BITS_10), make_tuple(&vpx_highbd_fdct16x16_c, &idct16x16_12, 0, VPX_BITS_12), make_tuple(&vpx_fdct16x16_c, &vpx_idct16x16_256_add_c, 0, VPX_BITS_8))); #else -INSTANTIATE_TEST_CASE_P(C, Trans16x16DCT, - ::testing::Values(make_tuple(&vpx_fdct16x16_c, - &vpx_idct16x16_256_add_c, - 0, VPX_BITS_8))); +INSTANTIATE_TEST_SUITE_P(C, Trans16x16DCT, + ::testing::Values(make_tuple(&vpx_fdct16x16_c, + &vpx_idct16x16_256_add_c, + 0, VPX_BITS_8))); #endif // CONFIG_VP9_HIGHBITDEPTH #if CONFIG_VP9_HIGHBITDEPTH -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( C, Trans16x16HT, ::testing::Values( make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_10, 0, VPX_BITS_10), @@ -777,7 +777,7 @@ INSTANTIATE_TEST_CASE_P( make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 2, VPX_BITS_8), make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 3, VPX_BITS_8))); #else -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( C, Trans16x16HT, ::testing::Values( make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 0, VPX_BITS_8), @@ -787,18 +787,18 @@ INSTANTIATE_TEST_CASE_P( #endif // CONFIG_VP9_HIGHBITDEPTH #if HAVE_NEON && !CONFIG_EMULATE_HARDWARE -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( NEON, Trans16x16DCT, ::testing::Values(make_tuple(&vpx_fdct16x16_neon, &vpx_idct16x16_256_add_neon, 0, VPX_BITS_8))); #endif // HAVE_NEON && !CONFIG_EMULATE_HARDWARE #if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( SSE2, Trans16x16DCT, ::testing::Values(make_tuple(&vpx_fdct16x16_sse2, &vpx_idct16x16_256_add_sse2, 0, VPX_BITS_8))); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( SSE2, Trans16x16HT, ::testing::Values(make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 0, VPX_BITS_8), @@ -811,7 +811,7 @@ INSTANTIATE_TEST_CASE_P( #endif // HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE #if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( SSE2, Trans16x16DCT, ::testing::Values( make_tuple(&vpx_highbd_fdct16x16_sse2, &idct16x16_10, 0, VPX_BITS_10), @@ -822,7 +822,7 @@ INSTANTIATE_TEST_CASE_P( VPX_BITS_12), make_tuple(&vpx_fdct16x16_sse2, &vpx_idct16x16_256_add_c, 0, VPX_BITS_8))); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( SSE2, Trans16x16HT, ::testing::Values( make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_c, 0, VPX_BITS_8), @@ -832,7 +832,7 @@ INSTANTIATE_TEST_CASE_P( VPX_BITS_8))); // Optimizations take effect at a threshold of 3155, so we use a value close to // that to test both branches. -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( SSE2, InvTrans16x16DCT, ::testing::Values(make_tuple(&idct16x16_10_add_10_c, &idct16x16_10_add_10_sse2, 3167, VPX_BITS_10), @@ -845,11 +845,11 @@ INSTANTIATE_TEST_CASE_P( #endif // HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE #if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE -INSTANTIATE_TEST_CASE_P(MSA, Trans16x16DCT, - ::testing::Values(make_tuple(&vpx_fdct16x16_msa, - &vpx_idct16x16_256_add_msa, - 0, VPX_BITS_8))); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( + MSA, Trans16x16DCT, + ::testing::Values(make_tuple(&vpx_fdct16x16_msa, &vpx_idct16x16_256_add_msa, + 0, VPX_BITS_8))); +INSTANTIATE_TEST_SUITE_P( MSA, Trans16x16HT, ::testing::Values( make_tuple(&vp9_fht16x16_msa, &vp9_iht16x16_256_add_msa, 0, VPX_BITS_8), @@ -860,9 +860,9 @@ INSTANTIATE_TEST_CASE_P( #endif // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE #if HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE -INSTANTIATE_TEST_CASE_P(VSX, Trans16x16DCT, - ::testing::Values(make_tuple(&vpx_fdct16x16_c, - &vpx_idct16x16_256_add_vsx, - 0, VPX_BITS_8))); +INSTANTIATE_TEST_SUITE_P( + VSX, Trans16x16DCT, + ::testing::Values(make_tuple(&vpx_fdct16x16_c, &vpx_idct16x16_256_add_vsx, + 0, VPX_BITS_8))); #endif // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE } // namespace diff --git a/test/dct32x32_test.cc b/test/dct32x32_test.cc index 94d6b37fa9..8398e17e81 100644 --- a/test/dct32x32_test.cc +++ b/test/dct32x32_test.cc @@ -317,7 +317,7 @@ TEST_P(Trans32x32Test, InverseAccuracy) { using std::make_tuple; #if CONFIG_VP9_HIGHBITDEPTH -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( C, Trans32x32Test, ::testing::Values( make_tuple(&vpx_highbd_fdct32x32_c, &idct32x32_10, 0, VPX_BITS_10), @@ -328,7 +328,7 @@ INSTANTIATE_TEST_CASE_P( make_tuple(&vpx_fdct32x32_rd_c, &vpx_idct32x32_1024_add_c, 1, VPX_BITS_8))); #else -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( C, Trans32x32Test, ::testing::Values(make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_c, 0, VPX_BITS_8), @@ -337,7 +337,7 @@ INSTANTIATE_TEST_CASE_P( #endif // CONFIG_VP9_HIGHBITDEPTH #if HAVE_NEON && !CONFIG_EMULATE_HARDWARE -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( NEON, Trans32x32Test, ::testing::Values(make_tuple(&vpx_fdct32x32_neon, &vpx_idct32x32_1024_add_neon, 0, VPX_BITS_8), @@ -346,7 +346,7 @@ INSTANTIATE_TEST_CASE_P( #endif // HAVE_NEON && !CONFIG_EMULATE_HARDWARE #if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( SSE2, Trans32x32Test, ::testing::Values(make_tuple(&vpx_fdct32x32_sse2, &vpx_idct32x32_1024_add_sse2, 0, VPX_BITS_8), @@ -355,7 +355,7 @@ INSTANTIATE_TEST_CASE_P( #endif // HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE #if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( SSE2, Trans32x32Test, ::testing::Values( make_tuple(&vpx_highbd_fdct32x32_sse2, &idct32x32_10, 0, VPX_BITS_10), @@ -371,7 +371,7 @@ INSTANTIATE_TEST_CASE_P( #endif // HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE #if HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( AVX2, Trans32x32Test, ::testing::Values(make_tuple(&vpx_fdct32x32_avx2, &vpx_idct32x32_1024_add_sse2, 0, VPX_BITS_8), @@ -380,7 +380,7 @@ INSTANTIATE_TEST_CASE_P( #endif // HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE #if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( MSA, Trans32x32Test, ::testing::Values(make_tuple(&vpx_fdct32x32_msa, &vpx_idct32x32_1024_add_msa, 0, VPX_BITS_8), @@ -389,7 +389,7 @@ INSTANTIATE_TEST_CASE_P( #endif // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE #if HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( VSX, Trans32x32Test, ::testing::Values(make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_vsx, 0, VPX_BITS_8), diff --git a/test/dct_partial_test.cc b/test/dct_partial_test.cc index c889e92d70..2fd176f25a 100644 --- a/test/dct_partial_test.cc +++ b/test/dct_partial_test.cc @@ -111,7 +111,7 @@ class PartialFdctTest : public ::testing::TestWithParam { TEST_P(PartialFdctTest, PartialFdctTest) { RunTest(); } #if CONFIG_VP9_HIGHBITDEPTH -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( C, PartialFdctTest, ::testing::Values(make_tuple(&vpx_highbd_fdct32x32_1_c, 32, VPX_BITS_12), make_tuple(&vpx_highbd_fdct32x32_1_c, 32, VPX_BITS_10), @@ -124,7 +124,7 @@ INSTANTIATE_TEST_CASE_P( make_tuple(&vpx_fdct8x8_1_c, 8, VPX_BITS_8), make_tuple(&vpx_fdct4x4_1_c, 4, VPX_BITS_8))); #else -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( C, PartialFdctTest, ::testing::Values(make_tuple(&vpx_fdct32x32_1_c, 32, VPX_BITS_8), make_tuple(&vpx_fdct16x16_1_c, 16, VPX_BITS_8), @@ -133,7 +133,7 @@ INSTANTIATE_TEST_CASE_P( #endif // CONFIG_VP9_HIGHBITDEPTH #if HAVE_SSE2 -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( SSE2, PartialFdctTest, ::testing::Values(make_tuple(&vpx_fdct32x32_1_sse2, 32, VPX_BITS_8), make_tuple(&vpx_fdct16x16_1_sse2, 16, VPX_BITS_8), @@ -143,7 +143,7 @@ INSTANTIATE_TEST_CASE_P( #if HAVE_NEON #if CONFIG_VP9_HIGHBITDEPTH -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( NEON, PartialFdctTest, ::testing::Values(make_tuple(&vpx_fdct32x32_1_neon, 32, VPX_BITS_8), make_tuple(&vpx_fdct16x16_1_neon, 16, VPX_BITS_8), @@ -152,7 +152,7 @@ INSTANTIATE_TEST_CASE_P( make_tuple(&vpx_fdct8x8_1_neon, 8, VPX_BITS_8), make_tuple(&vpx_fdct4x4_1_neon, 4, VPX_BITS_8))); #else -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( NEON, PartialFdctTest, ::testing::Values(make_tuple(&vpx_fdct32x32_1_neon, 32, VPX_BITS_8), make_tuple(&vpx_fdct16x16_1_neon, 16, VPX_BITS_8), @@ -163,11 +163,11 @@ INSTANTIATE_TEST_CASE_P( #if HAVE_MSA #if CONFIG_VP9_HIGHBITDEPTH -INSTANTIATE_TEST_CASE_P(MSA, PartialFdctTest, - ::testing::Values(make_tuple(&vpx_fdct8x8_1_msa, 8, - VPX_BITS_8))); +INSTANTIATE_TEST_SUITE_P(MSA, PartialFdctTest, + ::testing::Values(make_tuple(&vpx_fdct8x8_1_msa, 8, + VPX_BITS_8))); #else // !CONFIG_VP9_HIGHBITDEPTH -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( MSA, PartialFdctTest, ::testing::Values(make_tuple(&vpx_fdct32x32_1_msa, 32, VPX_BITS_8), make_tuple(&vpx_fdct16x16_1_msa, 16, VPX_BITS_8), diff --git a/test/dct_test.cc b/test/dct_test.cc index ed12f77569..a1b766e103 100644 --- a/test/dct_test.cc +++ b/test/dct_test.cc @@ -473,7 +473,7 @@ static const FuncInfo dct_c_func_info[] = { 1 } }; -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( C, TransDCT, ::testing::Combine( ::testing::Range(0, static_cast(sizeof(dct_c_func_info) / @@ -505,7 +505,7 @@ static const FuncInfo dct_sse2_func_info[] = { &idct_wrapper, 32, 1 } }; -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( SSE2, TransDCT, ::testing::Combine( ::testing::Range(0, static_cast(sizeof(dct_sse2_func_info) / @@ -521,9 +521,9 @@ static const FuncInfo dct_ssse3_func_info = { }; // TODO(johannkoenig): high bit depth fdct8x8. -INSTANTIATE_TEST_CASE_P(SSSE3, TransDCT, - ::testing::Values(make_tuple(0, &dct_ssse3_func_info, 0, - VPX_BITS_8))); +INSTANTIATE_TEST_SUITE_P(SSSE3, TransDCT, + ::testing::Values(make_tuple(0, &dct_ssse3_func_info, + 0, VPX_BITS_8))); #endif // HAVE_SSSE3 && !CONFIG_VP9_HIGHBITDEPTH && VPX_ARCH_X86_64 #if HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH @@ -533,9 +533,9 @@ static const FuncInfo dct_avx2_func_info = { }; // TODO(johannkoenig): high bit depth fdct32x32. -INSTANTIATE_TEST_CASE_P(AVX2, TransDCT, - ::testing::Values(make_tuple(0, &dct_avx2_func_info, 0, - VPX_BITS_8))); +INSTANTIATE_TEST_SUITE_P(AVX2, TransDCT, + ::testing::Values(make_tuple(0, &dct_avx2_func_info, 0, + VPX_BITS_8))); #endif // HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH #if HAVE_NEON @@ -550,7 +550,7 @@ static const FuncInfo dct_neon_func_info[4] = { &idct_wrapper, 32, 1 } }; -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( NEON, TransDCT, ::testing::Combine(::testing::Range(0, 4), ::testing::Values(dct_neon_func_info), @@ -569,11 +569,11 @@ static const FuncInfo dct_msa_func_info[4] = { 32, 1 } }; -INSTANTIATE_TEST_CASE_P(MSA, TransDCT, - ::testing::Combine(::testing::Range(0, 4), - ::testing::Values(dct_msa_func_info), - ::testing::Values(0), - ::testing::Values(VPX_BITS_8))); +INSTANTIATE_TEST_SUITE_P( + MSA, TransDCT, + ::testing::Combine(::testing::Range(0, 4), + ::testing::Values(dct_msa_func_info), + ::testing::Values(0), ::testing::Values(VPX_BITS_8))); #endif // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH #if HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH @@ -581,9 +581,9 @@ static const FuncInfo dct_vsx_func_info = { &fdct_wrapper, &idct_wrapper, 4, 1 }; -INSTANTIATE_TEST_CASE_P(VSX, TransDCT, - ::testing::Values(make_tuple(0, &dct_vsx_func_info, 0, - VPX_BITS_8))); +INSTANTIATE_TEST_SUITE_P(VSX, TransDCT, + ::testing::Values(make_tuple(0, &dct_vsx_func_info, 0, + VPX_BITS_8))); #endif // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && #endif // !CONFIG_EMULATE_HARDWARE @@ -619,7 +619,7 @@ static const FuncInfo ht_c_func_info[] = { { &vp9_fht16x16_c, &iht_wrapper, 16, 1 } }; -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( C, TransHT, ::testing::Combine( ::testing::Range(0, static_cast(sizeof(ht_c_func_info) / @@ -645,7 +645,7 @@ static const FuncInfo ht_neon_func_info[] = { { &vp9_fht16x16_c, &iht_wrapper, 16, 1 } }; -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( NEON, TransHT, ::testing::Combine( ::testing::Range(0, static_cast(sizeof(ht_neon_func_info) / @@ -662,11 +662,11 @@ static const FuncInfo ht_sse2_func_info[3] = { { &vp9_fht16x16_sse2, &iht_wrapper, 16, 1 } }; -INSTANTIATE_TEST_CASE_P(SSE2, TransHT, - ::testing::Combine(::testing::Range(0, 3), - ::testing::Values(ht_sse2_func_info), - ::testing::Range(0, 4), - ::testing::Values(VPX_BITS_8))); +INSTANTIATE_TEST_SUITE_P( + SSE2, TransHT, + ::testing::Combine(::testing::Range(0, 3), + ::testing::Values(ht_sse2_func_info), + ::testing::Range(0, 4), ::testing::Values(VPX_BITS_8))); #endif // HAVE_SSE2 #if HAVE_SSE4_1 && CONFIG_VP9_HIGHBITDEPTH @@ -679,7 +679,7 @@ static const FuncInfo ht_sse4_1_func_info[3] = { &highbd_iht_wrapper, 16, 2 } }; -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( SSE4_1, TransHT, ::testing::Combine(::testing::Range(0, 3), ::testing::Values(ht_sse4_1_func_info), @@ -695,11 +695,11 @@ static const FuncInfo ht_vsx_func_info[3] = { { &vp9_fht16x16_c, &iht_wrapper, 16, 1 } }; -INSTANTIATE_TEST_CASE_P(VSX, TransHT, - ::testing::Combine(::testing::Range(0, 3), - ::testing::Values(ht_vsx_func_info), - ::testing::Range(0, 4), - ::testing::Values(VPX_BITS_8))); +INSTANTIATE_TEST_SUITE_P(VSX, TransHT, + ::testing::Combine(::testing::Range(0, 3), + ::testing::Values(ht_vsx_func_info), + ::testing::Range(0, 4), + ::testing::Values(VPX_BITS_8))); #endif // HAVE_VSX #endif // !CONFIG_EMULATE_HARDWARE @@ -726,7 +726,7 @@ static const FuncInfo wht_c_func_info[] = { { &fdct_wrapper, &idct_wrapper, 4, 1 } }; -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( C, TransWHT, ::testing::Combine( ::testing::Range(0, static_cast(sizeof(wht_c_func_info) / @@ -739,9 +739,9 @@ static const FuncInfo wht_sse2_func_info = { &fdct_wrapper, &idct_wrapper, 4, 1 }; -INSTANTIATE_TEST_CASE_P(SSE2, TransWHT, - ::testing::Values(make_tuple(0, &wht_sse2_func_info, 0, - VPX_BITS_8))); +INSTANTIATE_TEST_SUITE_P(SSE2, TransWHT, + ::testing::Values(make_tuple(0, &wht_sse2_func_info, 0, + VPX_BITS_8))); #endif // HAVE_SSE2 && !CONFIG_EMULATE_HARDWARE #if HAVE_VSX && !CONFIG_EMULATE_HARDWARE && !CONFIG_VP9_HIGHBITDEPTH @@ -749,8 +749,8 @@ static const FuncInfo wht_vsx_func_info = { &fdct_wrapper, &idct_wrapper, 4, 1 }; -INSTANTIATE_TEST_CASE_P(VSX, TransWHT, - ::testing::Values(make_tuple(0, &wht_vsx_func_info, 0, - VPX_BITS_8))); +INSTANTIATE_TEST_SUITE_P(VSX, TransWHT, + ::testing::Values(make_tuple(0, &wht_vsx_func_info, 0, + VPX_BITS_8))); #endif // HAVE_VSX && !CONFIG_EMULATE_HARDWARE } // namespace diff --git a/test/decode_corrupted.cc b/test/decode_corrupted.cc index b1495ce89f..31e1da69cc 100644 --- a/test/decode_corrupted.cc +++ b/test/decode_corrupted.cc @@ -87,14 +87,14 @@ TEST_P(DecodeCorruptedFrameTest, DecodeCorruptedFrame) { } #if CONFIG_VP9 -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( VP9, DecodeCorruptedFrameTest, ::testing::Values( static_cast(&libvpx_test::kVP9))); #endif // CONFIG_VP9 #if CONFIG_VP8 -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( VP8, DecodeCorruptedFrameTest, ::testing::Values( static_cast(&libvpx_test::kVP8))); diff --git a/test/decode_perf_test.cc b/test/decode_perf_test.cc index aecdd3e999..34d53862a3 100644 --- a/test/decode_perf_test.cc +++ b/test/decode_perf_test.cc @@ -107,8 +107,8 @@ TEST_P(DecodePerfTest, PerfTest) { printf("}\n"); } -INSTANTIATE_TEST_CASE_P(VP9, DecodePerfTest, - ::testing::ValuesIn(kVP9DecodePerfVectors)); +INSTANTIATE_TEST_SUITE_P(VP9, DecodePerfTest, + ::testing::ValuesIn(kVP9DecodePerfVectors)); class VP9NewEncodeDecodePerfTest : public ::libvpx_test::EncoderTest, @@ -258,6 +258,6 @@ TEST_P(VP9NewEncodeDecodePerfTest, PerfTest) { printf("}\n"); } -VP9_INSTANTIATE_TEST_CASE(VP9NewEncodeDecodePerfTest, - ::testing::Values(::libvpx_test::kTwoPassGood)); +VP9_INSTANTIATE_TEST_SUITE(VP9NewEncodeDecodePerfTest, + ::testing::Values(::libvpx_test::kTwoPassGood)); } // namespace diff --git a/test/decode_svc_test.cc b/test/decode_svc_test.cc index c6f0873f89..be0f32545e 100644 --- a/test/decode_svc_test.cc +++ b/test/decode_svc_test.cc @@ -118,7 +118,7 @@ TEST_P(DecodeSvcTest, DecodeSvcTestUpToSpatialLayer10) { ASSERT_EQ(total_frames_, kNumFrames); } -VP9_INSTANTIATE_TEST_CASE( +VP9_INSTANTIATE_TEST_SUITE( DecodeSvcTest, ::testing::ValuesIn(libvpx_test::kVP9TestVectorsSvc, libvpx_test::kVP9TestVectorsSvc + libvpx_test::kNumVP9TestVectorsSvc)); diff --git a/test/encode_perf_test.cc b/test/encode_perf_test.cc index 142d9e2da8..142a55952b 100644 --- a/test/encode_perf_test.cc +++ b/test/encode_perf_test.cc @@ -183,6 +183,6 @@ TEST_P(VP9EncodePerfTest, PerfTest) { } } -VP9_INSTANTIATE_TEST_CASE(VP9EncodePerfTest, - ::testing::Values(::libvpx_test::kRealTime)); +VP9_INSTANTIATE_TEST_SUITE(VP9EncodePerfTest, + ::testing::Values(::libvpx_test::kRealTime)); } // namespace diff --git a/test/error_resilience_test.cc b/test/error_resilience_test.cc index 030b67c572..45a327ec2f 100644 --- a/test/error_resilience_test.cc +++ b/test/error_resilience_test.cc @@ -573,10 +573,10 @@ TEST_P(ErrorResilienceTestLargeCodecControls, CodecControl3TemporalLayers) { } } -VP8_INSTANTIATE_TEST_CASE(ErrorResilienceTestLarge, ONE_PASS_TEST_MODES, - ::testing::Values(true)); -VP8_INSTANTIATE_TEST_CASE(ErrorResilienceTestLargeCodecControls, - ONE_PASS_TEST_MODES); -VP9_INSTANTIATE_TEST_CASE(ErrorResilienceTestLarge, ONE_PASS_TEST_MODES, - ::testing::Values(true)); +VP8_INSTANTIATE_TEST_SUITE(ErrorResilienceTestLarge, ONE_PASS_TEST_MODES, + ::testing::Values(true)); +VP8_INSTANTIATE_TEST_SUITE(ErrorResilienceTestLargeCodecControls, + ONE_PASS_TEST_MODES); +VP9_INSTANTIATE_TEST_SUITE(ErrorResilienceTestLarge, ONE_PASS_TEST_MODES, + ::testing::Values(true)); } // namespace diff --git a/test/external_frame_buffer_test.cc b/test/external_frame_buffer_test.cc index 438eeb3ecd..25dd30011b 100644 --- a/test/external_frame_buffer_test.cc +++ b/test/external_frame_buffer_test.cc @@ -509,7 +509,7 @@ TEST_F(ExternalFrameBufferNonRefTest, ReleaseNonRefFrameBuffer) { } #endif // CONFIG_WEBM_IO -VP9_INSTANTIATE_TEST_CASE( +VP9_INSTANTIATE_TEST_SUITE( ExternalFrameBufferMD5Test, ::testing::ValuesIn(libvpx_test::kVP9TestVectors, libvpx_test::kVP9TestVectors + diff --git a/test/fdct8x8_test.cc b/test/fdct8x8_test.cc index 1d4f31871f..fd9fdff611 100644 --- a/test/fdct8x8_test.cc +++ b/test/fdct8x8_test.cc @@ -632,21 +632,21 @@ TEST_P(InvTrans8x8DCT, CompareReference) { using std::make_tuple; #if CONFIG_VP9_HIGHBITDEPTH -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( C, FwdTrans8x8DCT, ::testing::Values( make_tuple(&vpx_fdct8x8_c, &vpx_idct8x8_64_add_c, 0, VPX_BITS_8), make_tuple(&vpx_highbd_fdct8x8_c, &idct8x8_10, 0, VPX_BITS_10), make_tuple(&vpx_highbd_fdct8x8_c, &idct8x8_12, 0, VPX_BITS_12))); #else -INSTANTIATE_TEST_CASE_P(C, FwdTrans8x8DCT, - ::testing::Values(make_tuple(&vpx_fdct8x8_c, - &vpx_idct8x8_64_add_c, 0, - VPX_BITS_8))); +INSTANTIATE_TEST_SUITE_P(C, FwdTrans8x8DCT, + ::testing::Values(make_tuple(&vpx_fdct8x8_c, + &vpx_idct8x8_64_add_c, 0, + VPX_BITS_8))); #endif // CONFIG_VP9_HIGHBITDEPTH #if CONFIG_VP9_HIGHBITDEPTH -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( C, FwdTrans8x8HT, ::testing::Values( make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 0, VPX_BITS_8), @@ -662,7 +662,7 @@ INSTANTIATE_TEST_CASE_P( make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 2, VPX_BITS_8), make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 3, VPX_BITS_8))); #else -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( C, FwdTrans8x8HT, ::testing::Values( make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 0, VPX_BITS_8), @@ -672,13 +672,13 @@ INSTANTIATE_TEST_CASE_P( #endif // CONFIG_VP9_HIGHBITDEPTH #if HAVE_NEON && !CONFIG_EMULATE_HARDWARE -INSTANTIATE_TEST_CASE_P(NEON, FwdTrans8x8DCT, - ::testing::Values(make_tuple(&vpx_fdct8x8_neon, - &vpx_idct8x8_64_add_neon, - 0, VPX_BITS_8))); +INSTANTIATE_TEST_SUITE_P(NEON, FwdTrans8x8DCT, + ::testing::Values(make_tuple(&vpx_fdct8x8_neon, + &vpx_idct8x8_64_add_neon, + 0, VPX_BITS_8))); #if !CONFIG_VP9_HIGHBITDEPTH -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( NEON, FwdTrans8x8HT, ::testing::Values( make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 0, VPX_BITS_8), @@ -689,11 +689,11 @@ INSTANTIATE_TEST_CASE_P( #endif // HAVE_NEON && !CONFIG_EMULATE_HARDWARE #if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE -INSTANTIATE_TEST_CASE_P(SSE2, FwdTrans8x8DCT, - ::testing::Values(make_tuple(&vpx_fdct8x8_sse2, - &vpx_idct8x8_64_add_sse2, - 0, VPX_BITS_8))); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P(SSE2, FwdTrans8x8DCT, + ::testing::Values(make_tuple(&vpx_fdct8x8_sse2, + &vpx_idct8x8_64_add_sse2, + 0, VPX_BITS_8))); +INSTANTIATE_TEST_SUITE_P( SSE2, FwdTrans8x8HT, ::testing::Values( make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 0, VPX_BITS_8), @@ -703,7 +703,7 @@ INSTANTIATE_TEST_CASE_P( #endif // HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE #if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( SSE2, FwdTrans8x8DCT, ::testing::Values(make_tuple(&vpx_fdct8x8_sse2, &vpx_idct8x8_64_add_c, 0, VPX_BITS_8), @@ -716,7 +716,7 @@ INSTANTIATE_TEST_CASE_P( make_tuple(&vpx_highbd_fdct8x8_sse2, &idct8x8_64_add_12_sse2, 12, VPX_BITS_12))); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( SSE2, FwdTrans8x8HT, ::testing::Values( make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_c, 0, VPX_BITS_8), @@ -726,7 +726,7 @@ INSTANTIATE_TEST_CASE_P( // Optimizations take effect at a threshold of 6201, so we use a value close to // that to test both branches. -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( SSE2, InvTrans8x8DCT, ::testing::Values( make_tuple(&idct8x8_12_add_10_c, &idct8x8_12_add_10_sse2, 6225, @@ -739,18 +739,18 @@ INSTANTIATE_TEST_CASE_P( #if HAVE_SSSE3 && VPX_ARCH_X86_64 && !CONFIG_VP9_HIGHBITDEPTH && \ !CONFIG_EMULATE_HARDWARE -INSTANTIATE_TEST_CASE_P(SSSE3, FwdTrans8x8DCT, - ::testing::Values(make_tuple(&vpx_fdct8x8_ssse3, - &vpx_idct8x8_64_add_sse2, - 0, VPX_BITS_8))); +INSTANTIATE_TEST_SUITE_P(SSSE3, FwdTrans8x8DCT, + ::testing::Values(make_tuple(&vpx_fdct8x8_ssse3, + &vpx_idct8x8_64_add_sse2, + 0, VPX_BITS_8))); #endif #if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE -INSTANTIATE_TEST_CASE_P(MSA, FwdTrans8x8DCT, - ::testing::Values(make_tuple(&vpx_fdct8x8_msa, - &vpx_idct8x8_64_add_msa, 0, - VPX_BITS_8))); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P(MSA, FwdTrans8x8DCT, + ::testing::Values(make_tuple(&vpx_fdct8x8_msa, + &vpx_idct8x8_64_add_msa, + 0, VPX_BITS_8))); +INSTANTIATE_TEST_SUITE_P( MSA, FwdTrans8x8HT, ::testing::Values( make_tuple(&vp9_fht8x8_msa, &vp9_iht8x8_64_add_msa, 0, VPX_BITS_8), @@ -760,9 +760,9 @@ INSTANTIATE_TEST_CASE_P( #endif // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE #if HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE -INSTANTIATE_TEST_CASE_P(VSX, FwdTrans8x8DCT, - ::testing::Values(make_tuple(&vpx_fdct8x8_c, - &vpx_idct8x8_64_add_vsx, 0, - VPX_BITS_8))); +INSTANTIATE_TEST_SUITE_P(VSX, FwdTrans8x8DCT, + ::testing::Values(make_tuple(&vpx_fdct8x8_c, + &vpx_idct8x8_64_add_vsx, + 0, VPX_BITS_8))); #endif // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE } // namespace diff --git a/test/hadamard_test.cc b/test/hadamard_test.cc index 6b7aae3d50..dab945a561 100644 --- a/test/hadamard_test.cc +++ b/test/hadamard_test.cc @@ -233,14 +233,14 @@ TEST_P(HadamardLowbdTest, DISABLED_Speed) { SpeedTest(10000000); } -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( C, HadamardLowbdTest, ::testing::Values(HadamardFuncWithSize(&vpx_hadamard_8x8_c, 8), HadamardFuncWithSize(&vpx_hadamard_16x16_c, 16), HadamardFuncWithSize(&vpx_hadamard_32x32_c, 32))); #if HAVE_SSE2 -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( SSE2, HadamardLowbdTest, ::testing::Values(HadamardFuncWithSize(&vpx_hadamard_8x8_sse2, 8), HadamardFuncWithSize(&vpx_hadamard_16x16_sse2, 16), @@ -248,20 +248,20 @@ INSTANTIATE_TEST_CASE_P( #endif // HAVE_SSE2 #if HAVE_AVX2 -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( AVX2, HadamardLowbdTest, ::testing::Values(HadamardFuncWithSize(&vpx_hadamard_16x16_avx2, 16), HadamardFuncWithSize(&vpx_hadamard_32x32_avx2, 32))); #endif // HAVE_AVX2 #if HAVE_SSSE3 && VPX_ARCH_X86_64 -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( SSSE3, HadamardLowbdTest, ::testing::Values(HadamardFuncWithSize(&vpx_hadamard_8x8_ssse3, 8))); #endif // HAVE_SSSE3 && VPX_ARCH_X86_64 #if HAVE_NEON -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( NEON, HadamardLowbdTest, ::testing::Values(HadamardFuncWithSize(&vpx_hadamard_8x8_neon, 8), HadamardFuncWithSize(&vpx_hadamard_16x16_neon, 16))); @@ -271,7 +271,7 @@ INSTANTIATE_TEST_CASE_P( // in place and turn on the unit test. #if !CONFIG_VP9_HIGHBITDEPTH #if HAVE_MSA -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( MSA, HadamardLowbdTest, ::testing::Values(HadamardFuncWithSize(&vpx_hadamard_8x8_msa, 8), HadamardFuncWithSize(&vpx_hadamard_16x16_msa, 16))); @@ -279,7 +279,7 @@ INSTANTIATE_TEST_CASE_P( #endif // !CONFIG_VP9_HIGHBITDEPTH #if HAVE_VSX -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( VSX, HadamardLowbdTest, ::testing::Values(HadamardFuncWithSize(&vpx_hadamard_8x8_vsx, 8), HadamardFuncWithSize(&vpx_hadamard_16x16_vsx, 16))); @@ -301,14 +301,14 @@ TEST_P(HadamardHighbdTest, DISABLED_Speed) { SpeedTest(10000000); } -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( C, HadamardHighbdTest, ::testing::Values(HadamardFuncWithSize(&vpx_highbd_hadamard_8x8_c, 8), HadamardFuncWithSize(&vpx_highbd_hadamard_16x16_c, 16), HadamardFuncWithSize(&vpx_highbd_hadamard_32x32_c, 32))); #if HAVE_AVX2 -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( AVX2, HadamardHighbdTest, ::testing::Values(HadamardFuncWithSize(&vpx_highbd_hadamard_8x8_avx2, 8), HadamardFuncWithSize(&vpx_highbd_hadamard_16x16_avx2, 16), diff --git a/test/idct_test.cc b/test/idct_test.cc index 3564c0bd5d..50d5ff433d 100644 --- a/test/idct_test.cc +++ b/test/idct_test.cc @@ -155,25 +155,26 @@ TEST_P(IDCTTest, TestWithData) { ASSERT_TRUE(output->CheckPadding()); } -INSTANTIATE_TEST_CASE_P(C, IDCTTest, ::testing::Values(vp8_short_idct4x4llm_c)); +INSTANTIATE_TEST_SUITE_P(C, IDCTTest, + ::testing::Values(vp8_short_idct4x4llm_c)); #if HAVE_NEON -INSTANTIATE_TEST_CASE_P(NEON, IDCTTest, - ::testing::Values(vp8_short_idct4x4llm_neon)); +INSTANTIATE_TEST_SUITE_P(NEON, IDCTTest, + ::testing::Values(vp8_short_idct4x4llm_neon)); #endif // HAVE_NEON #if HAVE_MMX -INSTANTIATE_TEST_CASE_P(MMX, IDCTTest, - ::testing::Values(vp8_short_idct4x4llm_mmx)); +INSTANTIATE_TEST_SUITE_P(MMX, IDCTTest, + ::testing::Values(vp8_short_idct4x4llm_mmx)); #endif // HAVE_MMX #if HAVE_MSA -INSTANTIATE_TEST_CASE_P(MSA, IDCTTest, - ::testing::Values(vp8_short_idct4x4llm_msa)); +INSTANTIATE_TEST_SUITE_P(MSA, IDCTTest, + ::testing::Values(vp8_short_idct4x4llm_msa)); #endif // HAVE_MSA #if HAVE_MMI -INSTANTIATE_TEST_CASE_P(MMI, IDCTTest, - ::testing::Values(vp8_short_idct4x4llm_mmi)); +INSTANTIATE_TEST_SUITE_P(MMI, IDCTTest, + ::testing::Values(vp8_short_idct4x4llm_mmi)); #endif // HAVE_MMI } // namespace diff --git a/test/invalid_file_test.cc b/test/invalid_file_test.cc index 8b1bc56c9b..a5054a54d6 100644 --- a/test/invalid_file_test.cc +++ b/test/invalid_file_test.cc @@ -129,8 +129,8 @@ const DecodeParam kVP8InvalidFileTests[] = { { 1, "invalid-vp80-00-comprehensive-s17661_r01-05_b6-.ivf" }, }; -VP8_INSTANTIATE_TEST_CASE(InvalidFileTest, - ::testing::ValuesIn(kVP8InvalidFileTests)); +VP8_INSTANTIATE_TEST_SUITE(InvalidFileTest, + ::testing::ValuesIn(kVP8InvalidFileTests)); #endif // CONFIG_VP8_DECODER #if CONFIG_VP9_DECODER @@ -163,8 +163,8 @@ const DecodeParam kVP9InvalidFileTests[] = { { 1, "invalid-crbug-667044.webm" }, }; -VP9_INSTANTIATE_TEST_CASE(InvalidFileTest, - ::testing::ValuesIn(kVP9InvalidFileTests)); +VP9_INSTANTIATE_TEST_SUITE(InvalidFileTest, + ::testing::ValuesIn(kVP9InvalidFileTests)); #endif // CONFIG_VP9_DECODER // This class will include test vectors that are expected to fail @@ -184,8 +184,8 @@ const DecodeParam kVP8InvalidPeekTests[] = { { 1, "invalid-vp80-00-comprehensive-018.ivf.2kf_0x6.ivf" }, }; -VP8_INSTANTIATE_TEST_CASE(InvalidFileInvalidPeekTest, - ::testing::ValuesIn(kVP8InvalidPeekTests)); +VP8_INSTANTIATE_TEST_SUITE(InvalidFileInvalidPeekTest, + ::testing::ValuesIn(kVP8InvalidPeekTests)); #endif // CONFIG_VP8_DECODER #if CONFIG_VP9_DECODER @@ -193,8 +193,9 @@ const DecodeParam kVP9InvalidFileInvalidPeekTests[] = { { 1, "invalid-vp90-01-v3.webm" }, }; -VP9_INSTANTIATE_TEST_CASE(InvalidFileInvalidPeekTest, - ::testing::ValuesIn(kVP9InvalidFileInvalidPeekTests)); +VP9_INSTANTIATE_TEST_SUITE( + InvalidFileInvalidPeekTest, + ::testing::ValuesIn(kVP9InvalidFileInvalidPeekTests)); const DecodeParam kMultiThreadedVP9InvalidFileTests[] = { { 4, "invalid-vp90-2-08-tile_1x4_frame_parallel_all_key.webm" }, @@ -210,7 +211,7 @@ const DecodeParam kMultiThreadedVP9InvalidFileTests[] = { { 4, "invalid-crbug-1562.ivf" }, }; -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( VP9MultiThreaded, InvalidFileTest, ::testing::Combine( ::testing::Values( diff --git a/test/keyframe_test.cc b/test/keyframe_test.cc index 582d448168..a13dec9ce2 100644 --- a/test/keyframe_test.cc +++ b/test/keyframe_test.cc @@ -145,5 +145,5 @@ TEST_P(KeyframeTest, TestAutoKeyframe) { } } -VP8_INSTANTIATE_TEST_CASE(KeyframeTest, ALL_TEST_MODES); +VP8_INSTANTIATE_TEST_SUITE(KeyframeTest, ALL_TEST_MODES); } // namespace diff --git a/test/level_test.cc b/test/level_test.cc index 26935a81b0..038d75f44f 100644 --- a/test/level_test.cc +++ b/test/level_test.cc @@ -140,8 +140,8 @@ TEST_P(LevelTest, TestTargetLevelApi) { EXPECT_EQ(VPX_CODEC_OK, vpx_codec_destroy(&enc)); } -VP9_INSTANTIATE_TEST_CASE(LevelTest, - ::testing::Values(::libvpx_test::kTwoPassGood, - ::libvpx_test::kOnePassGood), - ::testing::Range(0, 9)); +VP9_INSTANTIATE_TEST_SUITE(LevelTest, + ::testing::Values(::libvpx_test::kTwoPassGood, + ::libvpx_test::kOnePassGood), + ::testing::Range(0, 9)); } // namespace diff --git a/test/lpf_test.cc b/test/lpf_test.cc index 9db1181c6c..25bba74b0f 100644 --- a/test/lpf_test.cc +++ b/test/lpf_test.cc @@ -407,7 +407,7 @@ using std::make_tuple; #if HAVE_SSE2 #if CONFIG_VP9_HIGHBITDEPTH -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( SSE2, Loop8Test6Param, ::testing::Values(make_tuple(&vpx_highbd_lpf_horizontal_4_sse2, &vpx_highbd_lpf_horizontal_4_c, 8), @@ -458,7 +458,7 @@ INSTANTIATE_TEST_CASE_P( make_tuple(&vpx_highbd_lpf_vertical_16_dual_sse2, &vpx_highbd_lpf_vertical_16_dual_c, 12))); #else -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( SSE2, Loop8Test6Param, ::testing::Values( make_tuple(&vpx_lpf_horizontal_4_sse2, &vpx_lpf_horizontal_4_c, 8), @@ -475,7 +475,7 @@ INSTANTIATE_TEST_CASE_P( #endif #if HAVE_AVX2 && (!CONFIG_VP9_HIGHBITDEPTH) -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( AVX2, Loop8Test6Param, ::testing::Values(make_tuple(&vpx_lpf_horizontal_16_avx2, &vpx_lpf_horizontal_16_c, 8), @@ -485,7 +485,7 @@ INSTANTIATE_TEST_CASE_P( #if HAVE_SSE2 #if CONFIG_VP9_HIGHBITDEPTH -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( SSE2, Loop8Test9Param, ::testing::Values(make_tuple(&vpx_highbd_lpf_horizontal_4_dual_sse2, &vpx_highbd_lpf_horizontal_4_dual_c, 8), @@ -512,7 +512,7 @@ INSTANTIATE_TEST_CASE_P( make_tuple(&vpx_highbd_lpf_vertical_8_dual_sse2, &vpx_highbd_lpf_vertical_8_dual_c, 12))); #else -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( SSE2, Loop8Test9Param, ::testing::Values(make_tuple(&vpx_lpf_horizontal_4_dual_sse2, &vpx_lpf_horizontal_4_dual_c, 8), @@ -527,7 +527,7 @@ INSTANTIATE_TEST_CASE_P( #if HAVE_NEON #if CONFIG_VP9_HIGHBITDEPTH -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( NEON, Loop8Test6Param, ::testing::Values(make_tuple(&vpx_highbd_lpf_horizontal_4_neon, &vpx_highbd_lpf_horizontal_4_c, 8), @@ -577,7 +577,7 @@ INSTANTIATE_TEST_CASE_P( &vpx_highbd_lpf_vertical_16_dual_c, 10), make_tuple(&vpx_highbd_lpf_vertical_16_dual_neon, &vpx_highbd_lpf_vertical_16_dual_c, 12))); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( NEON, Loop8Test9Param, ::testing::Values(make_tuple(&vpx_highbd_lpf_horizontal_4_dual_neon, &vpx_highbd_lpf_horizontal_4_dual_c, 8), @@ -604,7 +604,7 @@ INSTANTIATE_TEST_CASE_P( make_tuple(&vpx_highbd_lpf_vertical_8_dual_neon, &vpx_highbd_lpf_vertical_8_dual_c, 12))); #else -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( NEON, Loop8Test6Param, ::testing::Values( make_tuple(&vpx_lpf_horizontal_16_neon, &vpx_lpf_horizontal_16_c, 8), @@ -617,7 +617,7 @@ INSTANTIATE_TEST_CASE_P( make_tuple(&vpx_lpf_vertical_8_neon, &vpx_lpf_vertical_8_c, 8), make_tuple(&vpx_lpf_horizontal_4_neon, &vpx_lpf_horizontal_4_c, 8), make_tuple(&vpx_lpf_vertical_4_neon, &vpx_lpf_vertical_4_c, 8))); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( NEON, Loop8Test9Param, ::testing::Values(make_tuple(&vpx_lpf_horizontal_8_dual_neon, &vpx_lpf_horizontal_8_dual_c, 8), @@ -631,7 +631,7 @@ INSTANTIATE_TEST_CASE_P( #endif // HAVE_NEON #if HAVE_DSPR2 && !CONFIG_VP9_HIGHBITDEPTH -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( DSPR2, Loop8Test6Param, ::testing::Values( make_tuple(&vpx_lpf_horizontal_4_dspr2, &vpx_lpf_horizontal_4_c, 8), @@ -645,7 +645,7 @@ INSTANTIATE_TEST_CASE_P( make_tuple(&vpx_lpf_vertical_16_dual_dspr2, &vpx_lpf_vertical_16_dual_c, 8))); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( DSPR2, Loop8Test9Param, ::testing::Values(make_tuple(&vpx_lpf_horizontal_4_dual_dspr2, &vpx_lpf_horizontal_4_dual_c, 8), @@ -658,7 +658,7 @@ INSTANTIATE_TEST_CASE_P( #endif // HAVE_DSPR2 && !CONFIG_VP9_HIGHBITDEPTH #if HAVE_MSA && (!CONFIG_VP9_HIGHBITDEPTH) -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( MSA, Loop8Test6Param, ::testing::Values( make_tuple(&vpx_lpf_horizontal_4_msa, &vpx_lpf_horizontal_4_c, 8), @@ -670,7 +670,7 @@ INSTANTIATE_TEST_CASE_P( make_tuple(&vpx_lpf_vertical_8_msa, &vpx_lpf_vertical_8_c, 8), make_tuple(&vpx_lpf_vertical_16_msa, &vpx_lpf_vertical_16_c, 8))); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( MSA, Loop8Test9Param, ::testing::Values(make_tuple(&vpx_lpf_horizontal_4_dual_msa, &vpx_lpf_horizontal_4_dual_c, 8), diff --git a/test/minmax_test.cc b/test/minmax_test.cc index 9c119116a8..12327bc188 100644 --- a/test/minmax_test.cc +++ b/test/minmax_test.cc @@ -115,21 +115,21 @@ TEST_P(MinMaxTest, CompareReferenceAndVaryStride) { } } -INSTANTIATE_TEST_CASE_P(C, MinMaxTest, ::testing::Values(&vpx_minmax_8x8_c)); +INSTANTIATE_TEST_SUITE_P(C, MinMaxTest, ::testing::Values(&vpx_minmax_8x8_c)); #if HAVE_SSE2 -INSTANTIATE_TEST_CASE_P(SSE2, MinMaxTest, - ::testing::Values(&vpx_minmax_8x8_sse2)); +INSTANTIATE_TEST_SUITE_P(SSE2, MinMaxTest, + ::testing::Values(&vpx_minmax_8x8_sse2)); #endif #if HAVE_NEON -INSTANTIATE_TEST_CASE_P(NEON, MinMaxTest, - ::testing::Values(&vpx_minmax_8x8_neon)); +INSTANTIATE_TEST_SUITE_P(NEON, MinMaxTest, + ::testing::Values(&vpx_minmax_8x8_neon)); #endif #if HAVE_MSA -INSTANTIATE_TEST_CASE_P(MSA, MinMaxTest, - ::testing::Values(&vpx_minmax_8x8_msa)); +INSTANTIATE_TEST_SUITE_P(MSA, MinMaxTest, + ::testing::Values(&vpx_minmax_8x8_msa)); #endif } // namespace diff --git a/test/partial_idct_test.cc b/test/partial_idct_test.cc index e66a695eb0..61e7c1098b 100644 --- a/test/partial_idct_test.cc +++ b/test/partial_idct_test.cc @@ -474,8 +474,8 @@ const PartialInvTxfmParam c_partial_idct_tests[] = { &wrapper, TX_4X4, 1, 8, 1) }; -INSTANTIATE_TEST_CASE_P(C, PartialIDctTest, - ::testing::ValuesIn(c_partial_idct_tests)); +INSTANTIATE_TEST_SUITE_P(C, PartialIDctTest, + ::testing::ValuesIn(c_partial_idct_tests)); #if !CONFIG_EMULATE_HARDWARE @@ -625,8 +625,8 @@ const PartialInvTxfmParam neon_partial_idct_tests[] = { &wrapper, TX_4X4, 1, 8, 1) }; -INSTANTIATE_TEST_CASE_P(NEON, PartialIDctTest, - ::testing::ValuesIn(neon_partial_idct_tests)); +INSTANTIATE_TEST_SUITE_P(NEON, PartialIDctTest, + ::testing::ValuesIn(neon_partial_idct_tests)); #endif // HAVE_NEON #if HAVE_SSE2 @@ -776,8 +776,8 @@ const PartialInvTxfmParam sse2_partial_idct_tests[] = { &wrapper, TX_4X4, 1, 8, 1) }; -INSTANTIATE_TEST_CASE_P(SSE2, PartialIDctTest, - ::testing::ValuesIn(sse2_partial_idct_tests)); +INSTANTIATE_TEST_SUITE_P(SSE2, PartialIDctTest, + ::testing::ValuesIn(sse2_partial_idct_tests)); #endif // HAVE_SSE2 @@ -791,8 +791,8 @@ const PartialInvTxfmParam ssse3_partial_idct_tests[] = { &wrapper, TX_8X8, 12, 8, 1) }; -INSTANTIATE_TEST_CASE_P(SSSE3, PartialIDctTest, - ::testing::ValuesIn(ssse3_partial_idct_tests)); +INSTANTIATE_TEST_SUITE_P(SSSE3, PartialIDctTest, + ::testing::ValuesIn(ssse3_partial_idct_tests)); #endif // HAVE_SSSE3 #if HAVE_SSE4_1 && CONFIG_VP9_HIGHBITDEPTH @@ -889,8 +889,8 @@ const PartialInvTxfmParam sse4_1_partial_idct_tests[] = { &highbd_wrapper, TX_4X4, 16, 12, 2) }; -INSTANTIATE_TEST_CASE_P(SSE4_1, PartialIDctTest, - ::testing::ValuesIn(sse4_1_partial_idct_tests)); +INSTANTIATE_TEST_SUITE_P(SSE4_1, PartialIDctTest, + ::testing::ValuesIn(sse4_1_partial_idct_tests)); #endif // HAVE_SSE4_1 && CONFIG_VP9_HIGHBITDEPTH #if HAVE_DSPR2 && !CONFIG_VP9_HIGHBITDEPTH @@ -919,8 +919,8 @@ const PartialInvTxfmParam dspr2_partial_idct_tests[] = { &wrapper, TX_4X4, 1, 8, 1) }; -INSTANTIATE_TEST_CASE_P(DSPR2, PartialIDctTest, - ::testing::ValuesIn(dspr2_partial_idct_tests)); +INSTANTIATE_TEST_SUITE_P(DSPR2, PartialIDctTest, + ::testing::ValuesIn(dspr2_partial_idct_tests)); #endif // HAVE_DSPR2 && !CONFIG_VP9_HIGHBITDEPTH #if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH @@ -950,8 +950,8 @@ const PartialInvTxfmParam msa_partial_idct_tests[] = { &wrapper, TX_4X4, 1, 8, 1) }; -INSTANTIATE_TEST_CASE_P(MSA, PartialIDctTest, - ::testing::ValuesIn(msa_partial_idct_tests)); +INSTANTIATE_TEST_SUITE_P(MSA, PartialIDctTest, + ::testing::ValuesIn(msa_partial_idct_tests)); #endif // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH #endif // !CONFIG_EMULATE_HARDWARE diff --git a/test/pp_filter_test.cc b/test/pp_filter_test.cc index 1ed261bf9b..952a954295 100644 --- a/test/pp_filter_test.cc +++ b/test/pp_filter_test.cc @@ -511,62 +511,62 @@ TEST_P(VpxMbPostProcDownTest, DISABLED_Speed) { PrintMedian("16x16"); } -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( C, VpxPostProcDownAndAcrossMbRowTest, ::testing::Values(vpx_post_proc_down_and_across_mb_row_c)); -INSTANTIATE_TEST_CASE_P(C, VpxMbPostProcAcrossIpTest, - ::testing::Values(vpx_mbpost_proc_across_ip_c)); +INSTANTIATE_TEST_SUITE_P(C, VpxMbPostProcAcrossIpTest, + ::testing::Values(vpx_mbpost_proc_across_ip_c)); -INSTANTIATE_TEST_CASE_P(C, VpxMbPostProcDownTest, - ::testing::Values(vpx_mbpost_proc_down_c)); +INSTANTIATE_TEST_SUITE_P(C, VpxMbPostProcDownTest, + ::testing::Values(vpx_mbpost_proc_down_c)); #if HAVE_SSE2 -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( SSE2, VpxPostProcDownAndAcrossMbRowTest, ::testing::Values(vpx_post_proc_down_and_across_mb_row_sse2)); -INSTANTIATE_TEST_CASE_P(SSE2, VpxMbPostProcAcrossIpTest, - ::testing::Values(vpx_mbpost_proc_across_ip_sse2)); +INSTANTIATE_TEST_SUITE_P(SSE2, VpxMbPostProcAcrossIpTest, + ::testing::Values(vpx_mbpost_proc_across_ip_sse2)); -INSTANTIATE_TEST_CASE_P(SSE2, VpxMbPostProcDownTest, - ::testing::Values(vpx_mbpost_proc_down_sse2)); +INSTANTIATE_TEST_SUITE_P(SSE2, VpxMbPostProcDownTest, + ::testing::Values(vpx_mbpost_proc_down_sse2)); #endif // HAVE_SSE2 #if HAVE_NEON -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( NEON, VpxPostProcDownAndAcrossMbRowTest, ::testing::Values(vpx_post_proc_down_and_across_mb_row_neon)); -INSTANTIATE_TEST_CASE_P(NEON, VpxMbPostProcAcrossIpTest, - ::testing::Values(vpx_mbpost_proc_across_ip_neon)); +INSTANTIATE_TEST_SUITE_P(NEON, VpxMbPostProcAcrossIpTest, + ::testing::Values(vpx_mbpost_proc_across_ip_neon)); -INSTANTIATE_TEST_CASE_P(NEON, VpxMbPostProcDownTest, - ::testing::Values(vpx_mbpost_proc_down_neon)); +INSTANTIATE_TEST_SUITE_P(NEON, VpxMbPostProcDownTest, + ::testing::Values(vpx_mbpost_proc_down_neon)); #endif // HAVE_NEON #if HAVE_MSA -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( MSA, VpxPostProcDownAndAcrossMbRowTest, ::testing::Values(vpx_post_proc_down_and_across_mb_row_msa)); -INSTANTIATE_TEST_CASE_P(MSA, VpxMbPostProcAcrossIpTest, - ::testing::Values(vpx_mbpost_proc_across_ip_msa)); +INSTANTIATE_TEST_SUITE_P(MSA, VpxMbPostProcAcrossIpTest, + ::testing::Values(vpx_mbpost_proc_across_ip_msa)); -INSTANTIATE_TEST_CASE_P(MSA, VpxMbPostProcDownTest, - ::testing::Values(vpx_mbpost_proc_down_msa)); +INSTANTIATE_TEST_SUITE_P(MSA, VpxMbPostProcDownTest, + ::testing::Values(vpx_mbpost_proc_down_msa)); #endif // HAVE_MSA #if HAVE_VSX -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( VSX, VpxPostProcDownAndAcrossMbRowTest, ::testing::Values(vpx_post_proc_down_and_across_mb_row_vsx)); -INSTANTIATE_TEST_CASE_P(VSX, VpxMbPostProcAcrossIpTest, - ::testing::Values(vpx_mbpost_proc_across_ip_vsx)); +INSTANTIATE_TEST_SUITE_P(VSX, VpxMbPostProcAcrossIpTest, + ::testing::Values(vpx_mbpost_proc_across_ip_vsx)); -INSTANTIATE_TEST_CASE_P(VSX, VpxMbPostProcDownTest, - ::testing::Values(vpx_mbpost_proc_down_vsx)); +INSTANTIATE_TEST_SUITE_P(VSX, VpxMbPostProcDownTest, + ::testing::Values(vpx_mbpost_proc_down_vsx)); #endif // HAVE_VSX } // namespace diff --git a/test/predict_test.cc b/test/predict_test.cc index d40d9c755e..d884932bae 100644 --- a/test/predict_test.cc +++ b/test/predict_test.cc @@ -298,14 +298,14 @@ TEST_P(SixtapPredictTest, TestWithPresetData) { CompareBuffers(kExpectedDst, kExpectedDstStride, dst_, dst_stride_)); } -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( C, SixtapPredictTest, ::testing::Values(make_tuple(16, 16, &vp8_sixtap_predict16x16_c), make_tuple(8, 8, &vp8_sixtap_predict8x8_c), make_tuple(8, 4, &vp8_sixtap_predict8x4_c), make_tuple(4, 4, &vp8_sixtap_predict4x4_c))); #if HAVE_NEON -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( NEON, SixtapPredictTest, ::testing::Values(make_tuple(16, 16, &vp8_sixtap_predict16x16_neon), make_tuple(8, 8, &vp8_sixtap_predict8x8_neon), @@ -313,19 +313,19 @@ INSTANTIATE_TEST_CASE_P( make_tuple(4, 4, &vp8_sixtap_predict4x4_neon))); #endif #if HAVE_MMX -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( MMX, SixtapPredictTest, ::testing::Values(make_tuple(4, 4, &vp8_sixtap_predict4x4_mmx))); #endif #if HAVE_SSE2 -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( SSE2, SixtapPredictTest, ::testing::Values(make_tuple(16, 16, &vp8_sixtap_predict16x16_sse2), make_tuple(8, 8, &vp8_sixtap_predict8x8_sse2), make_tuple(8, 4, &vp8_sixtap_predict8x4_sse2))); #endif #if HAVE_SSSE3 -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( SSSE3, SixtapPredictTest, ::testing::Values(make_tuple(16, 16, &vp8_sixtap_predict16x16_ssse3), make_tuple(8, 8, &vp8_sixtap_predict8x8_ssse3), @@ -333,7 +333,7 @@ INSTANTIATE_TEST_CASE_P( make_tuple(4, 4, &vp8_sixtap_predict4x4_ssse3))); #endif #if HAVE_MSA -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( MSA, SixtapPredictTest, ::testing::Values(make_tuple(16, 16, &vp8_sixtap_predict16x16_msa), make_tuple(8, 8, &vp8_sixtap_predict8x8_msa), @@ -342,7 +342,7 @@ INSTANTIATE_TEST_CASE_P( #endif #if HAVE_MMI -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( MMI, SixtapPredictTest, ::testing::Values(make_tuple(16, 16, &vp8_sixtap_predict16x16_mmi), make_tuple(8, 8, &vp8_sixtap_predict8x8_mmi), @@ -367,14 +367,14 @@ TEST_P(BilinearPredictTest, DISABLED_Speed) { PrintMedian(title); } -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( C, BilinearPredictTest, ::testing::Values(make_tuple(16, 16, &vp8_bilinear_predict16x16_c), make_tuple(8, 8, &vp8_bilinear_predict8x8_c), make_tuple(8, 4, &vp8_bilinear_predict8x4_c), make_tuple(4, 4, &vp8_bilinear_predict4x4_c))); #if HAVE_NEON -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( NEON, BilinearPredictTest, ::testing::Values(make_tuple(16, 16, &vp8_bilinear_predict16x16_neon), make_tuple(8, 8, &vp8_bilinear_predict8x8_neon), @@ -382,7 +382,7 @@ INSTANTIATE_TEST_CASE_P( make_tuple(4, 4, &vp8_bilinear_predict4x4_neon))); #endif #if HAVE_SSE2 -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( SSE2, BilinearPredictTest, ::testing::Values(make_tuple(16, 16, &vp8_bilinear_predict16x16_sse2), make_tuple(8, 8, &vp8_bilinear_predict8x8_sse2), @@ -390,13 +390,13 @@ INSTANTIATE_TEST_CASE_P( make_tuple(4, 4, &vp8_bilinear_predict4x4_sse2))); #endif #if HAVE_SSSE3 -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( SSSE3, BilinearPredictTest, ::testing::Values(make_tuple(16, 16, &vp8_bilinear_predict16x16_ssse3), make_tuple(8, 8, &vp8_bilinear_predict8x8_ssse3))); #endif #if HAVE_MSA -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( MSA, BilinearPredictTest, ::testing::Values(make_tuple(16, 16, &vp8_bilinear_predict16x16_msa), make_tuple(8, 8, &vp8_bilinear_predict8x8_msa), diff --git a/test/quantize_test.cc b/test/quantize_test.cc index a7497742ce..96d5241712 100644 --- a/test/quantize_test.cc +++ b/test/quantize_test.cc @@ -181,7 +181,7 @@ TEST_P(QuantizeTest, DISABLED_Speed) { } #if HAVE_SSE2 -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( SSE2, QuantizeTest, ::testing::Values( make_tuple(&vp8_fast_quantize_b_sse2, &vp8_fast_quantize_b_c), @@ -189,26 +189,27 @@ INSTANTIATE_TEST_CASE_P( #endif // HAVE_SSE2 #if HAVE_SSSE3 -INSTANTIATE_TEST_CASE_P(SSSE3, QuantizeTest, - ::testing::Values(make_tuple(&vp8_fast_quantize_b_ssse3, - &vp8_fast_quantize_b_c))); +INSTANTIATE_TEST_SUITE_P( + SSSE3, QuantizeTest, + ::testing::Values(make_tuple(&vp8_fast_quantize_b_ssse3, + &vp8_fast_quantize_b_c))); #endif // HAVE_SSSE3 #if HAVE_SSE4_1 -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( SSE4_1, QuantizeTest, ::testing::Values(make_tuple(&vp8_regular_quantize_b_sse4_1, &vp8_regular_quantize_b_c))); #endif // HAVE_SSE4_1 #if HAVE_NEON -INSTANTIATE_TEST_CASE_P(NEON, QuantizeTest, - ::testing::Values(make_tuple(&vp8_fast_quantize_b_neon, - &vp8_fast_quantize_b_c))); +INSTANTIATE_TEST_SUITE_P(NEON, QuantizeTest, + ::testing::Values(make_tuple(&vp8_fast_quantize_b_neon, + &vp8_fast_quantize_b_c))); #endif // HAVE_NEON #if HAVE_MSA -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( MSA, QuantizeTest, ::testing::Values( make_tuple(&vp8_fast_quantize_b_msa, &vp8_fast_quantize_b_c), @@ -216,7 +217,7 @@ INSTANTIATE_TEST_CASE_P( #endif // HAVE_MSA #if HAVE_MMI -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( MMI, QuantizeTest, ::testing::Values( make_tuple(&vp8_fast_quantize_b_mmi, &vp8_fast_quantize_b_c), diff --git a/test/realtime_test.cc b/test/realtime_test.cc index 6ee2121eae..25a933bcc5 100644 --- a/test/realtime_test.cc +++ b/test/realtime_test.cc @@ -76,9 +76,9 @@ TEST_P(RealtimeTest, DISABLED_IntegerOverflow) { ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); } -VP8_INSTANTIATE_TEST_CASE(RealtimeTest, - ::testing::Values(::libvpx_test::kRealTime)); -VP9_INSTANTIATE_TEST_CASE(RealtimeTest, - ::testing::Values(::libvpx_test::kRealTime)); +VP8_INSTANTIATE_TEST_SUITE(RealtimeTest, + ::testing::Values(::libvpx_test::kRealTime)); +VP9_INSTANTIATE_TEST_SUITE(RealtimeTest, + ::testing::Values(::libvpx_test::kRealTime)); } // namespace diff --git a/test/resize_test.cc b/test/resize_test.cc index d996cbe1be..b2eadb1e8e 100644 --- a/test/resize_test.cc +++ b/test/resize_test.cc @@ -793,14 +793,14 @@ TEST_P(ResizeCspTest, TestResizeCspWorks) { ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); } -VP8_INSTANTIATE_TEST_CASE(ResizeTest, ONE_PASS_TEST_MODES); -VP9_INSTANTIATE_TEST_CASE(ResizeTest, - ::testing::Values(::libvpx_test::kRealTime)); -VP9_INSTANTIATE_TEST_CASE(ResizeInternalTest, - ::testing::Values(::libvpx_test::kOnePassBest)); -VP9_INSTANTIATE_TEST_CASE(ResizeRealtimeTest, - ::testing::Values(::libvpx_test::kRealTime), - ::testing::Range(5, 9)); -VP9_INSTANTIATE_TEST_CASE(ResizeCspTest, - ::testing::Values(::libvpx_test::kRealTime)); +VP8_INSTANTIATE_TEST_SUITE(ResizeTest, ONE_PASS_TEST_MODES); +VP9_INSTANTIATE_TEST_SUITE(ResizeTest, + ::testing::Values(::libvpx_test::kRealTime)); +VP9_INSTANTIATE_TEST_SUITE(ResizeInternalTest, + ::testing::Values(::libvpx_test::kOnePassBest)); +VP9_INSTANTIATE_TEST_SUITE(ResizeRealtimeTest, + ::testing::Values(::libvpx_test::kRealTime), + ::testing::Range(5, 9)); +VP9_INSTANTIATE_TEST_SUITE(ResizeCspTest, + ::testing::Values(::libvpx_test::kRealTime)); } // namespace diff --git a/test/sad_test.cc b/test/sad_test.cc index e39775cc46..3211c1b83b 100644 --- a/test/sad_test.cc +++ b/test/sad_test.cc @@ -628,7 +628,7 @@ const SadMxNParam c_tests[] = { SadMxNParam(4, 4, &vpx_highbd_sad4x4_c, 12), #endif // CONFIG_VP9_HIGHBITDEPTH }; -INSTANTIATE_TEST_CASE_P(C, SADTest, ::testing::ValuesIn(c_tests)); +INSTANTIATE_TEST_SUITE_P(C, SADTest, ::testing::ValuesIn(c_tests)); const SadMxNAvgParam avg_c_tests[] = { SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_c), @@ -686,7 +686,7 @@ const SadMxNAvgParam avg_c_tests[] = { SadMxNAvgParam(4, 4, &vpx_highbd_sad4x4_avg_c, 12), #endif // CONFIG_VP9_HIGHBITDEPTH }; -INSTANTIATE_TEST_CASE_P(C, SADavgTest, ::testing::ValuesIn(avg_c_tests)); +INSTANTIATE_TEST_SUITE_P(C, SADavgTest, ::testing::ValuesIn(avg_c_tests)); const SadMxNx4Param x4d_c_tests[] = { SadMxNx4Param(64, 64, &vpx_sad64x64x4d_c), @@ -744,7 +744,7 @@ const SadMxNx4Param x4d_c_tests[] = { SadMxNx4Param(4, 4, &vpx_highbd_sad4x4x4d_c, 12), #endif // CONFIG_VP9_HIGHBITDEPTH }; -INSTANTIATE_TEST_CASE_P(C, SADx4Test, ::testing::ValuesIn(x4d_c_tests)); +INSTANTIATE_TEST_SUITE_P(C, SADx4Test, ::testing::ValuesIn(x4d_c_tests)); // TODO(angiebird): implement the marked-down sad functions const SadMxNx8Param x8_c_tests[] = { @@ -762,7 +762,7 @@ const SadMxNx8Param x8_c_tests[] = { // SadMxNx8Param(4, 8, &vpx_sad4x8x8_c), SadMxNx8Param(4, 4, &vpx_sad4x4x8_c), }; -INSTANTIATE_TEST_CASE_P(C, SADx8Test, ::testing::ValuesIn(x8_c_tests)); +INSTANTIATE_TEST_SUITE_P(C, SADx8Test, ::testing::ValuesIn(x8_c_tests)); //------------------------------------------------------------------------------ // ARM functions @@ -780,7 +780,7 @@ const SadMxNParam neon_tests[] = { SadMxNParam(4, 8, &vpx_sad4x8_neon), SadMxNParam(4, 4, &vpx_sad4x4_neon), }; -INSTANTIATE_TEST_CASE_P(NEON, SADTest, ::testing::ValuesIn(neon_tests)); +INSTANTIATE_TEST_SUITE_P(NEON, SADTest, ::testing::ValuesIn(neon_tests)); const SadMxNAvgParam avg_neon_tests[] = { SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_neon), @@ -797,7 +797,7 @@ const SadMxNAvgParam avg_neon_tests[] = { SadMxNAvgParam(4, 8, &vpx_sad4x8_avg_neon), SadMxNAvgParam(4, 4, &vpx_sad4x4_avg_neon), }; -INSTANTIATE_TEST_CASE_P(NEON, SADavgTest, ::testing::ValuesIn(avg_neon_tests)); +INSTANTIATE_TEST_SUITE_P(NEON, SADavgTest, ::testing::ValuesIn(avg_neon_tests)); const SadMxNx4Param x4d_neon_tests[] = { SadMxNx4Param(64, 64, &vpx_sad64x64x4d_neon), @@ -814,7 +814,7 @@ const SadMxNx4Param x4d_neon_tests[] = { SadMxNx4Param(4, 8, &vpx_sad4x8x4d_neon), SadMxNx4Param(4, 4, &vpx_sad4x4x4d_neon), }; -INSTANTIATE_TEST_CASE_P(NEON, SADx4Test, ::testing::ValuesIn(x4d_neon_tests)); +INSTANTIATE_TEST_SUITE_P(NEON, SADx4Test, ::testing::ValuesIn(x4d_neon_tests)); #endif // HAVE_NEON //------------------------------------------------------------------------------ @@ -870,7 +870,7 @@ const SadMxNParam sse2_tests[] = { SadMxNParam(8, 4, &vpx_highbd_sad8x4_sse2, 12), #endif // CONFIG_VP9_HIGHBITDEPTH }; -INSTANTIATE_TEST_CASE_P(SSE2, SADTest, ::testing::ValuesIn(sse2_tests)); +INSTANTIATE_TEST_SUITE_P(SSE2, SADTest, ::testing::ValuesIn(sse2_tests)); const SadMxNAvgParam avg_sse2_tests[] = { SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_sse2), @@ -922,7 +922,7 @@ const SadMxNAvgParam avg_sse2_tests[] = { SadMxNAvgParam(8, 4, &vpx_highbd_sad8x4_avg_sse2, 12), #endif // CONFIG_VP9_HIGHBITDEPTH }; -INSTANTIATE_TEST_CASE_P(SSE2, SADavgTest, ::testing::ValuesIn(avg_sse2_tests)); +INSTANTIATE_TEST_SUITE_P(SSE2, SADavgTest, ::testing::ValuesIn(avg_sse2_tests)); const SadMxNx4Param x4d_sse2_tests[] = { SadMxNx4Param(64, 64, &vpx_sad64x64x4d_sse2), @@ -980,7 +980,7 @@ const SadMxNx4Param x4d_sse2_tests[] = { SadMxNx4Param(4, 4, &vpx_highbd_sad4x4x4d_sse2, 12), #endif // CONFIG_VP9_HIGHBITDEPTH }; -INSTANTIATE_TEST_CASE_P(SSE2, SADx4Test, ::testing::ValuesIn(x4d_sse2_tests)); +INSTANTIATE_TEST_SUITE_P(SSE2, SADx4Test, ::testing::ValuesIn(x4d_sse2_tests)); #endif // HAVE_SSE2 #if HAVE_SSE3 @@ -999,8 +999,8 @@ const SadMxNx8Param x8_sse4_1_tests[] = { SadMxNx8Param(8, 8, &vpx_sad8x8x8_sse4_1), SadMxNx8Param(4, 4, &vpx_sad4x4x8_sse4_1), }; -INSTANTIATE_TEST_CASE_P(SSE4_1, SADx8Test, - ::testing::ValuesIn(x8_sse4_1_tests)); +INSTANTIATE_TEST_SUITE_P(SSE4_1, SADx8Test, + ::testing::ValuesIn(x8_sse4_1_tests)); #endif // HAVE_SSE4_1 #if HAVE_AVX2 @@ -1011,7 +1011,7 @@ const SadMxNParam avx2_tests[] = { SadMxNParam(32, 32, &vpx_sad32x32_avx2), SadMxNParam(32, 16, &vpx_sad32x16_avx2), }; -INSTANTIATE_TEST_CASE_P(AVX2, SADTest, ::testing::ValuesIn(avx2_tests)); +INSTANTIATE_TEST_SUITE_P(AVX2, SADTest, ::testing::ValuesIn(avx2_tests)); const SadMxNAvgParam avg_avx2_tests[] = { SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_avx2), @@ -1020,27 +1020,27 @@ const SadMxNAvgParam avg_avx2_tests[] = { SadMxNAvgParam(32, 32, &vpx_sad32x32_avg_avx2), SadMxNAvgParam(32, 16, &vpx_sad32x16_avg_avx2), }; -INSTANTIATE_TEST_CASE_P(AVX2, SADavgTest, ::testing::ValuesIn(avg_avx2_tests)); +INSTANTIATE_TEST_SUITE_P(AVX2, SADavgTest, ::testing::ValuesIn(avg_avx2_tests)); const SadMxNx4Param x4d_avx2_tests[] = { SadMxNx4Param(64, 64, &vpx_sad64x64x4d_avx2), SadMxNx4Param(32, 32, &vpx_sad32x32x4d_avx2), }; -INSTANTIATE_TEST_CASE_P(AVX2, SADx4Test, ::testing::ValuesIn(x4d_avx2_tests)); +INSTANTIATE_TEST_SUITE_P(AVX2, SADx4Test, ::testing::ValuesIn(x4d_avx2_tests)); const SadMxNx8Param x8_avx2_tests[] = { // SadMxNx8Param(64, 64, &vpx_sad64x64x8_c), SadMxNx8Param(32, 32, &vpx_sad32x32x8_avx2), }; -INSTANTIATE_TEST_CASE_P(AVX2, SADx8Test, ::testing::ValuesIn(x8_avx2_tests)); +INSTANTIATE_TEST_SUITE_P(AVX2, SADx8Test, ::testing::ValuesIn(x8_avx2_tests)); #endif // HAVE_AVX2 #if HAVE_AVX512 const SadMxNx4Param x4d_avx512_tests[] = { SadMxNx4Param(64, 64, &vpx_sad64x64x4d_avx512), }; -INSTANTIATE_TEST_CASE_P(AVX512, SADx4Test, - ::testing::ValuesIn(x4d_avx512_tests)); +INSTANTIATE_TEST_SUITE_P(AVX512, SADx4Test, + ::testing::ValuesIn(x4d_avx512_tests)); #endif // HAVE_AVX512 //------------------------------------------------------------------------------ @@ -1061,7 +1061,7 @@ const SadMxNParam msa_tests[] = { SadMxNParam(4, 8, &vpx_sad4x8_msa), SadMxNParam(4, 4, &vpx_sad4x4_msa), }; -INSTANTIATE_TEST_CASE_P(MSA, SADTest, ::testing::ValuesIn(msa_tests)); +INSTANTIATE_TEST_SUITE_P(MSA, SADTest, ::testing::ValuesIn(msa_tests)); const SadMxNAvgParam avg_msa_tests[] = { SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_msa), @@ -1078,7 +1078,7 @@ const SadMxNAvgParam avg_msa_tests[] = { SadMxNAvgParam(4, 8, &vpx_sad4x8_avg_msa), SadMxNAvgParam(4, 4, &vpx_sad4x4_avg_msa), }; -INSTANTIATE_TEST_CASE_P(MSA, SADavgTest, ::testing::ValuesIn(avg_msa_tests)); +INSTANTIATE_TEST_SUITE_P(MSA, SADavgTest, ::testing::ValuesIn(avg_msa_tests)); const SadMxNx4Param x4d_msa_tests[] = { SadMxNx4Param(64, 64, &vpx_sad64x64x4d_msa), @@ -1095,7 +1095,7 @@ const SadMxNx4Param x4d_msa_tests[] = { SadMxNx4Param(4, 8, &vpx_sad4x8x4d_msa), SadMxNx4Param(4, 4, &vpx_sad4x4x4d_msa), }; -INSTANTIATE_TEST_CASE_P(MSA, SADx4Test, ::testing::ValuesIn(x4d_msa_tests)); +INSTANTIATE_TEST_SUITE_P(MSA, SADx4Test, ::testing::ValuesIn(x4d_msa_tests)); #endif // HAVE_MSA //------------------------------------------------------------------------------ @@ -1114,7 +1114,7 @@ const SadMxNParam vsx_tests[] = { SadMxNParam(8, 8, &vpx_sad8x8_vsx), SadMxNParam(8, 4, &vpx_sad8x4_vsx), }; -INSTANTIATE_TEST_CASE_P(VSX, SADTest, ::testing::ValuesIn(vsx_tests)); +INSTANTIATE_TEST_SUITE_P(VSX, SADTest, ::testing::ValuesIn(vsx_tests)); const SadMxNAvgParam avg_vsx_tests[] = { SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_vsx), @@ -1126,7 +1126,7 @@ const SadMxNAvgParam avg_vsx_tests[] = { SadMxNAvgParam(16, 16, &vpx_sad16x16_avg_vsx), SadMxNAvgParam(16, 8, &vpx_sad16x8_avg_vsx), }; -INSTANTIATE_TEST_CASE_P(VSX, SADavgTest, ::testing::ValuesIn(avg_vsx_tests)); +INSTANTIATE_TEST_SUITE_P(VSX, SADavgTest, ::testing::ValuesIn(avg_vsx_tests)); const SadMxNx4Param x4d_vsx_tests[] = { SadMxNx4Param(64, 64, &vpx_sad64x64x4d_vsx), @@ -1138,7 +1138,7 @@ const SadMxNx4Param x4d_vsx_tests[] = { SadMxNx4Param(16, 16, &vpx_sad16x16x4d_vsx), SadMxNx4Param(16, 8, &vpx_sad16x8x4d_vsx), }; -INSTANTIATE_TEST_CASE_P(VSX, SADx4Test, ::testing::ValuesIn(x4d_vsx_tests)); +INSTANTIATE_TEST_SUITE_P(VSX, SADx4Test, ::testing::ValuesIn(x4d_vsx_tests)); #endif // HAVE_VSX //------------------------------------------------------------------------------ @@ -1159,7 +1159,7 @@ const SadMxNParam mmi_tests[] = { SadMxNParam(4, 8, &vpx_sad4x8_mmi), SadMxNParam(4, 4, &vpx_sad4x4_mmi), }; -INSTANTIATE_TEST_CASE_P(MMI, SADTest, ::testing::ValuesIn(mmi_tests)); +INSTANTIATE_TEST_SUITE_P(MMI, SADTest, ::testing::ValuesIn(mmi_tests)); const SadMxNAvgParam avg_mmi_tests[] = { SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_mmi), @@ -1176,7 +1176,7 @@ const SadMxNAvgParam avg_mmi_tests[] = { SadMxNAvgParam(4, 8, &vpx_sad4x8_avg_mmi), SadMxNAvgParam(4, 4, &vpx_sad4x4_avg_mmi), }; -INSTANTIATE_TEST_CASE_P(MMI, SADavgTest, ::testing::ValuesIn(avg_mmi_tests)); +INSTANTIATE_TEST_SUITE_P(MMI, SADavgTest, ::testing::ValuesIn(avg_mmi_tests)); const SadMxNx4Param x4d_mmi_tests[] = { SadMxNx4Param(64, 64, &vpx_sad64x64x4d_mmi), @@ -1193,6 +1193,6 @@ const SadMxNx4Param x4d_mmi_tests[] = { SadMxNx4Param(4, 8, &vpx_sad4x8x4d_mmi), SadMxNx4Param(4, 4, &vpx_sad4x4x4d_mmi), }; -INSTANTIATE_TEST_CASE_P(MMI, SADx4Test, ::testing::ValuesIn(x4d_mmi_tests)); +INSTANTIATE_TEST_SUITE_P(MMI, SADx4Test, ::testing::ValuesIn(x4d_mmi_tests)); #endif // HAVE_MMI } // namespace diff --git a/test/sum_squares_test.cc b/test/sum_squares_test.cc index d2c70f4d4b..a47f5b5b1b 100644 --- a/test/sum_squares_test.cc +++ b/test/sum_squares_test.cc @@ -106,21 +106,21 @@ TEST_P(SumSquaresTest, ExtremeValues) { using std::make_tuple; #if HAVE_NEON -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( NEON, SumSquaresTest, ::testing::Values(make_tuple(&vpx_sum_squares_2d_i16_c, &vpx_sum_squares_2d_i16_neon))); #endif // HAVE_NEON #if HAVE_SSE2 -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( SSE2, SumSquaresTest, ::testing::Values(make_tuple(&vpx_sum_squares_2d_i16_c, &vpx_sum_squares_2d_i16_sse2))); #endif // HAVE_SSE2 #if HAVE_MSA -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( MSA, SumSquaresTest, ::testing::Values(make_tuple(&vpx_sum_squares_2d_i16_c, &vpx_sum_squares_2d_i16_msa))); diff --git a/test/superframe_test.cc b/test/superframe_test.cc index 8c8d1ae290..7b2ce29eb8 100644 --- a/test/superframe_test.cc +++ b/test/superframe_test.cc @@ -95,7 +95,7 @@ TEST_P(SuperframeTest, TestSuperframeIndexIsOptional) { EXPECT_EQ(sf_count_, 1); } -VP9_INSTANTIATE_TEST_CASE( +VP9_INSTANTIATE_TEST_SUITE( SuperframeTest, ::testing::Combine(::testing::Values(::libvpx_test::kTwoPassGood), ::testing::Values(0))); diff --git a/test/svc_datarate_test.cc b/test/svc_datarate_test.cc index 1920debfc8..0a7d0032c1 100644 --- a/test/svc_datarate_test.cc +++ b/test/svc_datarate_test.cc @@ -1529,29 +1529,29 @@ TEST_P(DatarateOnePassCbrSvcPostencodeDrop, OnePassCbrSvc2QL1TLScreen) { #endif } -VP9_INSTANTIATE_TEST_CASE(DatarateOnePassCbrSvcSingleBR, - ::testing::Range(5, 10)); +VP9_INSTANTIATE_TEST_SUITE(DatarateOnePassCbrSvcSingleBR, + ::testing::Range(5, 10)); -VP9_INSTANTIATE_TEST_CASE(DatarateOnePassCbrSvcPostencodeDrop, - ::testing::Range(5, 6)); +VP9_INSTANTIATE_TEST_SUITE(DatarateOnePassCbrSvcPostencodeDrop, + ::testing::Range(5, 6)); -VP9_INSTANTIATE_TEST_CASE(DatarateOnePassCbrSvcInterLayerPredSingleBR, - ::testing::Range(5, 10), ::testing::Range(0, 3)); +VP9_INSTANTIATE_TEST_SUITE(DatarateOnePassCbrSvcInterLayerPredSingleBR, + ::testing::Range(5, 10), ::testing::Range(0, 3)); -VP9_INSTANTIATE_TEST_CASE(DatarateOnePassCbrSvcMultiBR, ::testing::Range(5, 10), - ::testing::Range(0, 3)); +VP9_INSTANTIATE_TEST_SUITE(DatarateOnePassCbrSvcMultiBR, + ::testing::Range(5, 10), ::testing::Range(0, 3)); -VP9_INSTANTIATE_TEST_CASE(DatarateOnePassCbrSvcFrameDropMultiBR, - ::testing::Range(5, 10), ::testing::Range(0, 2), - ::testing::Range(0, 3)); +VP9_INSTANTIATE_TEST_SUITE(DatarateOnePassCbrSvcFrameDropMultiBR, + ::testing::Range(5, 10), ::testing::Range(0, 2), + ::testing::Range(0, 3)); #if CONFIG_VP9_TEMPORAL_DENOISING -VP9_INSTANTIATE_TEST_CASE(DatarateOnePassCbrSvcDenoiser, - ::testing::Range(5, 10), ::testing::Range(1, 3), - ::testing::Range(0, 3), ::testing::Range(0, 4)); +VP9_INSTANTIATE_TEST_SUITE(DatarateOnePassCbrSvcDenoiser, + ::testing::Range(5, 10), ::testing::Range(1, 3), + ::testing::Range(0, 3), ::testing::Range(0, 4)); #endif -VP9_INSTANTIATE_TEST_CASE(DatarateOnePassCbrSvcSmallKF, ::testing::Range(5, 10), - ::testing::Range(32, 36)); +VP9_INSTANTIATE_TEST_SUITE(DatarateOnePassCbrSvcSmallKF, + ::testing::Range(5, 10), ::testing::Range(32, 36)); } // namespace } // namespace svc_test diff --git a/test/svc_end_to_end_test.cc b/test/svc_end_to_end_test.cc index 82259ac30c..edd4c887fc 100644 --- a/test/svc_end_to_end_test.cc +++ b/test/svc_end_to_end_test.cc @@ -470,9 +470,9 @@ TEST_P(SyncFrameOnePassCbrSvc, OnePassCbrSvc1SL3TLSyncFrameIntraOnlyQVGA) { #endif } -VP9_INSTANTIATE_TEST_CASE(SyncFrameOnePassCbrSvc, ::testing::Range(0, 3)); +VP9_INSTANTIATE_TEST_SUITE(SyncFrameOnePassCbrSvc, ::testing::Range(0, 3)); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( VP9, ScalePartitionOnePassCbrSvc, ::testing::Values( static_cast(&libvpx_test::kVP9))); diff --git a/test/test_vector_test.cc b/test/test_vector_test.cc index 5a9737122f..6adcb221ad 100644 --- a/test/test_vector_test.cc +++ b/test/test_vector_test.cc @@ -153,7 +153,7 @@ TEST_P(TestVectorTest, MD5Match) { } #if CONFIG_VP8_DECODER -VP8_INSTANTIATE_TEST_CASE( +VP8_INSTANTIATE_TEST_SUITE( TestVectorTest, ::testing::Combine( ::testing::Values(1), // Single thread. @@ -163,7 +163,7 @@ VP8_INSTANTIATE_TEST_CASE( libvpx_test::kNumVP8TestVectors))); // Test VP8 decode in with different numbers of threads. -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( VP8MultiThreaded, TestVectorTest, ::testing::Combine( ::testing::Values( @@ -178,7 +178,7 @@ INSTANTIATE_TEST_CASE_P( #endif // CONFIG_VP8_DECODER #if CONFIG_VP9_DECODER -VP9_INSTANTIATE_TEST_CASE( +VP9_INSTANTIATE_TEST_SUITE( TestVectorTest, ::testing::Combine( ::testing::Values(1), // Single thread. @@ -187,7 +187,7 @@ VP9_INSTANTIATE_TEST_CASE( libvpx_test::kVP9TestVectors + libvpx_test::kNumVP9TestVectors))); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( VP9MultiThreaded, TestVectorTest, ::testing::Combine( ::testing::Values( diff --git a/test/tile_independence_test.cc b/test/tile_independence_test.cc index 1d1020a9d3..d92c13f88e 100644 --- a/test/tile_independence_test.cc +++ b/test/tile_independence_test.cc @@ -100,5 +100,5 @@ TEST_P(TileIndependenceTest, MD5Match) { ASSERT_STREQ(md5_fw_str, md5_inv_str); } -VP9_INSTANTIATE_TEST_CASE(TileIndependenceTest, ::testing::Range(0, 2, 1)); +VP9_INSTANTIATE_TEST_SUITE(TileIndependenceTest, ::testing::Range(0, 2, 1)); } // namespace diff --git a/test/timestamp_test.cc b/test/timestamp_test.cc index 41c1928875..645a9f2ff8 100644 --- a/test/timestamp_test.cc +++ b/test/timestamp_test.cc @@ -94,8 +94,8 @@ TEST_P(TimestampTest, TestVpxRollover) { ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); } -VP8_INSTANTIATE_TEST_CASE(TimestampTest, - ::testing::Values(::libvpx_test::kTwoPassGood)); -VP9_INSTANTIATE_TEST_CASE(TimestampTest, - ::testing::Values(::libvpx_test::kTwoPassGood)); +VP8_INSTANTIATE_TEST_SUITE(TimestampTest, + ::testing::Values(::libvpx_test::kTwoPassGood)); +VP9_INSTANTIATE_TEST_SUITE(TimestampTest, + ::testing::Values(::libvpx_test::kTwoPassGood)); } // namespace diff --git a/test/variance_test.cc b/test/variance_test.cc index e9fa03c680..a6db9cff42 100644 --- a/test/variance_test.cc +++ b/test/variance_test.cc @@ -738,23 +738,23 @@ TEST_P(VpxSubpelVarianceTest, Ref) { RefTest(); } TEST_P(VpxSubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); } TEST_P(VpxSubpelAvgVarianceTest, Ref) { RefTest(); } -INSTANTIATE_TEST_CASE_P(C, SumOfSquaresTest, - ::testing::Values(vpx_get_mb_ss_c)); +INSTANTIATE_TEST_SUITE_P(C, SumOfSquaresTest, + ::testing::Values(vpx_get_mb_ss_c)); typedef TestParams SseParams; -INSTANTIATE_TEST_CASE_P(C, VpxSseTest, - ::testing::Values(SseParams(2, 2, - &vpx_get4x4sse_cs_c))); +INSTANTIATE_TEST_SUITE_P(C, VpxSseTest, + ::testing::Values(SseParams(2, 2, + &vpx_get4x4sse_cs_c))); typedef TestParams MseParams; -INSTANTIATE_TEST_CASE_P(C, VpxMseTest, - ::testing::Values(MseParams(4, 4, &vpx_mse16x16_c), - MseParams(4, 3, &vpx_mse16x8_c), - MseParams(3, 4, &vpx_mse8x16_c), - MseParams(3, 3, &vpx_mse8x8_c))); +INSTANTIATE_TEST_SUITE_P(C, VpxMseTest, + ::testing::Values(MseParams(4, 4, &vpx_mse16x16_c), + MseParams(4, 3, &vpx_mse16x8_c), + MseParams(3, 4, &vpx_mse8x16_c), + MseParams(3, 3, &vpx_mse8x8_c))); typedef TestParams VarianceParams; -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( C, VpxVarianceTest, ::testing::Values(VarianceParams(6, 6, &vpx_variance64x64_c), VarianceParams(6, 5, &vpx_variance64x32_c), @@ -771,7 +771,7 @@ INSTANTIATE_TEST_CASE_P( VarianceParams(2, 2, &vpx_variance4x4_c))); typedef TestParams SubpelVarianceParams; -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( C, VpxSubpelVarianceTest, ::testing::Values( SubpelVarianceParams(6, 6, &vpx_sub_pixel_variance64x64_c, 0), @@ -789,7 +789,7 @@ INSTANTIATE_TEST_CASE_P( SubpelVarianceParams(2, 2, &vpx_sub_pixel_variance4x4_c, 0))); typedef TestParams SubpelAvgVarianceParams; -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( C, VpxSubpelAvgVarianceTest, ::testing::Values( SubpelAvgVarianceParams(6, 6, &vpx_sub_pixel_avg_variance64x64_c, 0), @@ -825,7 +825,7 @@ TEST_P(VpxHBDSubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); } TEST_P(VpxHBDSubpelAvgVarianceTest, Ref) { RefTest(); } /* TODO(debargha): This test does not support the highbd version -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( C, VpxHBDMseTest, ::testing::Values(MseParams(4, 4, &vpx_highbd_12_mse16x16_c), MseParams(4, 4, &vpx_highbd_12_mse16x8_c), @@ -841,7 +841,7 @@ INSTANTIATE_TEST_CASE_P( MseParams(4, 4, &vpx_highbd_8_mse8x8_c))); */ -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( C, VpxHBDVarianceTest, ::testing::Values(VarianceParams(6, 6, &vpx_highbd_12_variance64x64_c, 12), VarianceParams(6, 5, &vpx_highbd_12_variance64x32_c, 12), @@ -883,7 +883,7 @@ INSTANTIATE_TEST_CASE_P( VarianceParams(2, 3, &vpx_highbd_8_variance4x8_c, 8), VarianceParams(2, 2, &vpx_highbd_8_variance4x4_c, 8))); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( C, VpxHBDSubpelVarianceTest, ::testing::Values( SubpelVarianceParams(6, 6, &vpx_highbd_8_sub_pixel_variance64x64_c, 8), @@ -941,7 +941,7 @@ INSTANTIATE_TEST_CASE_P( SubpelVarianceParams(2, 2, &vpx_highbd_12_sub_pixel_variance4x4_c, 12))); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( C, VpxHBDSubpelAvgVarianceTest, ::testing::Values( SubpelAvgVarianceParams(6, 6, @@ -1044,16 +1044,16 @@ INSTANTIATE_TEST_CASE_P( #endif // CONFIG_VP9_HIGHBITDEPTH #if HAVE_SSE2 -INSTANTIATE_TEST_CASE_P(SSE2, SumOfSquaresTest, - ::testing::Values(vpx_get_mb_ss_sse2)); +INSTANTIATE_TEST_SUITE_P(SSE2, SumOfSquaresTest, + ::testing::Values(vpx_get_mb_ss_sse2)); -INSTANTIATE_TEST_CASE_P(SSE2, VpxMseTest, - ::testing::Values(MseParams(4, 4, &vpx_mse16x16_sse2), - MseParams(4, 3, &vpx_mse16x8_sse2), - MseParams(3, 4, &vpx_mse8x16_sse2), - MseParams(3, 3, &vpx_mse8x8_sse2))); +INSTANTIATE_TEST_SUITE_P(SSE2, VpxMseTest, + ::testing::Values(MseParams(4, 4, &vpx_mse16x16_sse2), + MseParams(4, 3, &vpx_mse16x8_sse2), + MseParams(3, 4, &vpx_mse8x16_sse2), + MseParams(3, 3, &vpx_mse8x8_sse2))); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( SSE2, VpxVarianceTest, ::testing::Values(VarianceParams(6, 6, &vpx_variance64x64_sse2), VarianceParams(6, 5, &vpx_variance64x32_sse2), @@ -1069,7 +1069,7 @@ INSTANTIATE_TEST_CASE_P( VarianceParams(2, 3, &vpx_variance4x8_sse2), VarianceParams(2, 2, &vpx_variance4x4_sse2))); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( SSE2, VpxSubpelVarianceTest, ::testing::Values( SubpelVarianceParams(6, 6, &vpx_sub_pixel_variance64x64_sse2, 0), @@ -1086,7 +1086,7 @@ INSTANTIATE_TEST_CASE_P( SubpelVarianceParams(2, 3, &vpx_sub_pixel_variance4x8_sse2, 0), SubpelVarianceParams(2, 2, &vpx_sub_pixel_variance4x4_sse2, 0))); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( SSE2, VpxSubpelAvgVarianceTest, ::testing::Values( SubpelAvgVarianceParams(6, 6, &vpx_sub_pixel_avg_variance64x64_sse2, 0), @@ -1105,7 +1105,7 @@ INSTANTIATE_TEST_CASE_P( #if CONFIG_VP9_HIGHBITDEPTH /* TODO(debargha): This test does not support the highbd version -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( SSE2, VpxHBDMseTest, ::testing::Values(MseParams(4, 4, &vpx_highbd_12_mse16x16_sse2), MseParams(4, 3, &vpx_highbd_12_mse16x8_sse2), @@ -1121,7 +1121,7 @@ INSTANTIATE_TEST_CASE_P( MseParams(3, 3, &vpx_highbd_8_mse8x8_sse2))); */ -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( SSE2, VpxHBDVarianceTest, ::testing::Values( VarianceParams(6, 6, &vpx_highbd_12_variance64x64_sse2, 12), @@ -1155,7 +1155,7 @@ INSTANTIATE_TEST_CASE_P( VarianceParams(3, 4, &vpx_highbd_8_variance8x16_sse2, 8), VarianceParams(3, 3, &vpx_highbd_8_variance8x8_sse2, 8))); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( SSE2, VpxHBDSubpelVarianceTest, ::testing::Values( SubpelVarianceParams(6, 6, &vpx_highbd_12_sub_pixel_variance64x64_sse2, @@ -1224,7 +1224,7 @@ INSTANTIATE_TEST_CASE_P( SubpelVarianceParams(3, 2, &vpx_highbd_8_sub_pixel_variance8x4_sse2, 8))); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( SSE2, VpxHBDSubpelAvgVarianceTest, ::testing::Values( SubpelAvgVarianceParams(6, 6, @@ -1330,7 +1330,7 @@ INSTANTIATE_TEST_CASE_P( #endif // HAVE_SSE2 #if HAVE_SSSE3 -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( SSSE3, VpxSubpelVarianceTest, ::testing::Values( SubpelVarianceParams(6, 6, &vpx_sub_pixel_variance64x64_ssse3, 0), @@ -1347,7 +1347,7 @@ INSTANTIATE_TEST_CASE_P( SubpelVarianceParams(2, 3, &vpx_sub_pixel_variance4x8_ssse3, 0), SubpelVarianceParams(2, 2, &vpx_sub_pixel_variance4x4_ssse3, 0))); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( SSSE3, VpxSubpelAvgVarianceTest, ::testing::Values( SubpelAvgVarianceParams(6, 6, &vpx_sub_pixel_avg_variance64x64_ssse3, @@ -1374,11 +1374,11 @@ INSTANTIATE_TEST_CASE_P( #endif // HAVE_SSSE3 #if HAVE_AVX2 -INSTANTIATE_TEST_CASE_P(AVX2, VpxMseTest, - ::testing::Values(MseParams(4, 4, &vpx_mse16x16_avx2), - MseParams(4, 3, &vpx_mse16x8_avx2))); +INSTANTIATE_TEST_SUITE_P(AVX2, VpxMseTest, + ::testing::Values(MseParams(4, 4, &vpx_mse16x16_avx2), + MseParams(4, 3, &vpx_mse16x8_avx2))); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( AVX2, VpxVarianceTest, ::testing::Values(VarianceParams(6, 6, &vpx_variance64x64_avx2), VarianceParams(6, 5, &vpx_variance64x32_avx2), @@ -1389,13 +1389,13 @@ INSTANTIATE_TEST_CASE_P( VarianceParams(4, 4, &vpx_variance16x16_avx2), VarianceParams(4, 3, &vpx_variance16x8_avx2))); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( AVX2, VpxSubpelVarianceTest, ::testing::Values( SubpelVarianceParams(6, 6, &vpx_sub_pixel_variance64x64_avx2, 0), SubpelVarianceParams(5, 5, &vpx_sub_pixel_variance32x32_avx2, 0))); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( AVX2, VpxSubpelAvgVarianceTest, ::testing::Values( SubpelAvgVarianceParams(6, 6, &vpx_sub_pixel_avg_variance64x64_avx2, 0), @@ -1404,14 +1404,15 @@ INSTANTIATE_TEST_CASE_P( #endif // HAVE_AVX2 #if HAVE_NEON -INSTANTIATE_TEST_CASE_P(NEON, VpxSseTest, - ::testing::Values(SseParams(2, 2, - &vpx_get4x4sse_cs_neon))); +INSTANTIATE_TEST_SUITE_P(NEON, VpxSseTest, + ::testing::Values(SseParams(2, 2, + &vpx_get4x4sse_cs_neon))); -INSTANTIATE_TEST_CASE_P(NEON, VpxMseTest, - ::testing::Values(MseParams(4, 4, &vpx_mse16x16_neon))); +INSTANTIATE_TEST_SUITE_P(NEON, VpxMseTest, + ::testing::Values(MseParams(4, 4, + &vpx_mse16x16_neon))); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( NEON, VpxVarianceTest, ::testing::Values(VarianceParams(6, 6, &vpx_variance64x64_neon), VarianceParams(6, 5, &vpx_variance64x32_neon), @@ -1427,7 +1428,7 @@ INSTANTIATE_TEST_CASE_P( VarianceParams(2, 3, &vpx_variance4x8_neon), VarianceParams(2, 2, &vpx_variance4x4_neon))); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( NEON, VpxSubpelVarianceTest, ::testing::Values( SubpelVarianceParams(6, 6, &vpx_sub_pixel_variance64x64_neon, 0), @@ -1444,7 +1445,7 @@ INSTANTIATE_TEST_CASE_P( SubpelVarianceParams(2, 3, &vpx_sub_pixel_variance4x8_neon, 0), SubpelVarianceParams(2, 2, &vpx_sub_pixel_variance4x4_neon, 0))); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( NEON, VpxSubpelAvgVarianceTest, ::testing::Values( SubpelAvgVarianceParams(6, 6, &vpx_sub_pixel_avg_variance64x64_neon, 0), @@ -1463,20 +1464,20 @@ INSTANTIATE_TEST_CASE_P( #endif // HAVE_NEON #if HAVE_MSA -INSTANTIATE_TEST_CASE_P(MSA, SumOfSquaresTest, - ::testing::Values(vpx_get_mb_ss_msa)); +INSTANTIATE_TEST_SUITE_P(MSA, SumOfSquaresTest, + ::testing::Values(vpx_get_mb_ss_msa)); -INSTANTIATE_TEST_CASE_P(MSA, VpxSseTest, - ::testing::Values(SseParams(2, 2, - &vpx_get4x4sse_cs_msa))); +INSTANTIATE_TEST_SUITE_P(MSA, VpxSseTest, + ::testing::Values(SseParams(2, 2, + &vpx_get4x4sse_cs_msa))); -INSTANTIATE_TEST_CASE_P(MSA, VpxMseTest, - ::testing::Values(MseParams(4, 4, &vpx_mse16x16_msa), - MseParams(4, 3, &vpx_mse16x8_msa), - MseParams(3, 4, &vpx_mse8x16_msa), - MseParams(3, 3, &vpx_mse8x8_msa))); +INSTANTIATE_TEST_SUITE_P(MSA, VpxMseTest, + ::testing::Values(MseParams(4, 4, &vpx_mse16x16_msa), + MseParams(4, 3, &vpx_mse16x8_msa), + MseParams(3, 4, &vpx_mse8x16_msa), + MseParams(3, 3, &vpx_mse8x8_msa))); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( MSA, VpxVarianceTest, ::testing::Values(VarianceParams(6, 6, &vpx_variance64x64_msa), VarianceParams(6, 5, &vpx_variance64x32_msa), @@ -1492,7 +1493,7 @@ INSTANTIATE_TEST_CASE_P( VarianceParams(2, 3, &vpx_variance4x8_msa), VarianceParams(2, 2, &vpx_variance4x4_msa))); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( MSA, VpxSubpelVarianceTest, ::testing::Values( SubpelVarianceParams(2, 2, &vpx_sub_pixel_variance4x4_msa, 0), @@ -1509,7 +1510,7 @@ INSTANTIATE_TEST_CASE_P( SubpelVarianceParams(6, 5, &vpx_sub_pixel_variance64x32_msa, 0), SubpelVarianceParams(6, 6, &vpx_sub_pixel_variance64x64_msa, 0))); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( MSA, VpxSubpelAvgVarianceTest, ::testing::Values( SubpelAvgVarianceParams(6, 6, &vpx_sub_pixel_avg_variance64x64_msa, 0), @@ -1528,19 +1529,19 @@ INSTANTIATE_TEST_CASE_P( #endif // HAVE_MSA #if HAVE_VSX -INSTANTIATE_TEST_CASE_P(VSX, SumOfSquaresTest, - ::testing::Values(vpx_get_mb_ss_vsx)); - -INSTANTIATE_TEST_CASE_P(VSX, VpxSseTest, - ::testing::Values(SseParams(2, 2, - &vpx_get4x4sse_cs_vsx))); -INSTANTIATE_TEST_CASE_P(VSX, VpxMseTest, - ::testing::Values(MseParams(4, 4, &vpx_mse16x16_vsx), - MseParams(4, 3, &vpx_mse16x8_vsx), - MseParams(3, 4, &vpx_mse8x16_vsx), - MseParams(3, 3, &vpx_mse8x8_vsx))); - -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P(VSX, SumOfSquaresTest, + ::testing::Values(vpx_get_mb_ss_vsx)); + +INSTANTIATE_TEST_SUITE_P(VSX, VpxSseTest, + ::testing::Values(SseParams(2, 2, + &vpx_get4x4sse_cs_vsx))); +INSTANTIATE_TEST_SUITE_P(VSX, VpxMseTest, + ::testing::Values(MseParams(4, 4, &vpx_mse16x16_vsx), + MseParams(4, 3, &vpx_mse16x8_vsx), + MseParams(3, 4, &vpx_mse8x16_vsx), + MseParams(3, 3, &vpx_mse8x8_vsx))); + +INSTANTIATE_TEST_SUITE_P( VSX, VpxVarianceTest, ::testing::Values(VarianceParams(6, 6, &vpx_variance64x64_vsx), VarianceParams(6, 5, &vpx_variance64x32_vsx), @@ -1558,13 +1559,13 @@ INSTANTIATE_TEST_CASE_P( #endif // HAVE_VSX #if HAVE_MMI -INSTANTIATE_TEST_CASE_P(MMI, VpxMseTest, - ::testing::Values(MseParams(4, 4, &vpx_mse16x16_mmi), - MseParams(4, 3, &vpx_mse16x8_mmi), - MseParams(3, 4, &vpx_mse8x16_mmi), - MseParams(3, 3, &vpx_mse8x8_mmi))); +INSTANTIATE_TEST_SUITE_P(MMI, VpxMseTest, + ::testing::Values(MseParams(4, 4, &vpx_mse16x16_mmi), + MseParams(4, 3, &vpx_mse16x8_mmi), + MseParams(3, 4, &vpx_mse8x16_mmi), + MseParams(3, 3, &vpx_mse8x8_mmi))); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( MMI, VpxVarianceTest, ::testing::Values(VarianceParams(6, 6, &vpx_variance64x64_mmi), VarianceParams(6, 5, &vpx_variance64x32_mmi), @@ -1580,7 +1581,7 @@ INSTANTIATE_TEST_CASE_P( VarianceParams(2, 3, &vpx_variance4x8_mmi), VarianceParams(2, 2, &vpx_variance4x4_mmi))); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( MMI, VpxSubpelVarianceTest, ::testing::Values( SubpelVarianceParams(6, 6, &vpx_sub_pixel_variance64x64_mmi, 0), @@ -1597,7 +1598,7 @@ INSTANTIATE_TEST_CASE_P( SubpelVarianceParams(2, 3, &vpx_sub_pixel_variance4x8_mmi, 0), SubpelVarianceParams(2, 2, &vpx_sub_pixel_variance4x4_mmi, 0))); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( MMI, VpxSubpelAvgVarianceTest, ::testing::Values( SubpelAvgVarianceParams(6, 6, &vpx_sub_pixel_avg_variance64x64_mmi, 0), diff --git a/test/vp8_datarate_test.cc b/test/vp8_datarate_test.cc index 2ae99ee49b..dcd68a2d4c 100644 --- a/test/vp8_datarate_test.cc +++ b/test/vp8_datarate_test.cc @@ -430,9 +430,9 @@ TEST_P(DatarateTestRealTime, NV12) { << " The datarate for the file missed the target!"; } -VP8_INSTANTIATE_TEST_CASE(DatarateTestLarge, ALL_TEST_MODES, - ::testing::Values(0)); -VP8_INSTANTIATE_TEST_CASE(DatarateTestRealTime, - ::testing::Values(::libvpx_test::kRealTime), - ::testing::Values(-6, -12)); +VP8_INSTANTIATE_TEST_SUITE(DatarateTestLarge, ALL_TEST_MODES, + ::testing::Values(0)); +VP8_INSTANTIATE_TEST_SUITE(DatarateTestRealTime, + ::testing::Values(::libvpx_test::kRealTime), + ::testing::Values(-6, -12)); } // namespace diff --git a/test/vp8_denoiser_sse2_test.cc b/test/vp8_denoiser_sse2_test.cc index 2cbcf04149..0197f143f3 100644 --- a/test/vp8_denoiser_sse2_test.cc +++ b/test/vp8_denoiser_sse2_test.cc @@ -110,5 +110,5 @@ TEST_P(VP8DenoiserTest, BitexactCheck) { } // Test for all block size. -INSTANTIATE_TEST_CASE_P(SSE2, VP8DenoiserTest, ::testing::Values(0, 1)); +INSTANTIATE_TEST_SUITE_P(SSE2, VP8DenoiserTest, ::testing::Values(0, 1)); } // namespace diff --git a/test/vp8_fdct4x4_test.cc b/test/vp8_fdct4x4_test.cc index b7697d8596..d5ac253003 100644 --- a/test/vp8_fdct4x4_test.cc +++ b/test/vp8_fdct4x4_test.cc @@ -183,24 +183,24 @@ TEST_P(FdctTest, RoundTripErrorCheck) { << "Error: FDCT/IDCT has average roundtrip error > 1 per block"; }; -INSTANTIATE_TEST_CASE_P(C, FdctTest, ::testing::Values(vp8_short_fdct4x4_c)); +INSTANTIATE_TEST_SUITE_P(C, FdctTest, ::testing::Values(vp8_short_fdct4x4_c)); #if HAVE_NEON -INSTANTIATE_TEST_CASE_P(NEON, FdctTest, - ::testing::Values(vp8_short_fdct4x4_neon)); +INSTANTIATE_TEST_SUITE_P(NEON, FdctTest, + ::testing::Values(vp8_short_fdct4x4_neon)); #endif // HAVE_NEON #if HAVE_SSE2 -INSTANTIATE_TEST_CASE_P(SSE2, FdctTest, - ::testing::Values(vp8_short_fdct4x4_sse2)); +INSTANTIATE_TEST_SUITE_P(SSE2, FdctTest, + ::testing::Values(vp8_short_fdct4x4_sse2)); #endif // HAVE_SSE2 #if HAVE_MSA -INSTANTIATE_TEST_CASE_P(MSA, FdctTest, - ::testing::Values(vp8_short_fdct4x4_msa)); +INSTANTIATE_TEST_SUITE_P(MSA, FdctTest, + ::testing::Values(vp8_short_fdct4x4_msa)); #endif // HAVE_MSA #if HAVE_MMI -INSTANTIATE_TEST_CASE_P(MMI, FdctTest, - ::testing::Values(vp8_short_fdct4x4_mmi)); +INSTANTIATE_TEST_SUITE_P(MMI, FdctTest, + ::testing::Values(vp8_short_fdct4x4_mmi)); #endif // HAVE_MMI } // namespace diff --git a/test/vp9_arf_freq_test.cc b/test/vp9_arf_freq_test.cc index 9a3455b4aa..c7e6f1af02 100644 --- a/test/vp9_arf_freq_test.cc +++ b/test/vp9_arf_freq_test.cc @@ -213,7 +213,7 @@ TEST_P(ArfFreqTest, MinArfFreqTest) { } } -VP9_INSTANTIATE_TEST_CASE(ArfFreqTest, ::testing::ValuesIn(kTestVectors), - ::testing::ValuesIn(kEncodeVectors), - ::testing::ValuesIn(kMinArfVectors)); +VP9_INSTANTIATE_TEST_SUITE(ArfFreqTest, ::testing::ValuesIn(kTestVectors), + ::testing::ValuesIn(kEncodeVectors), + ::testing::ValuesIn(kMinArfVectors)); } // namespace diff --git a/test/vp9_block_error_test.cc b/test/vp9_block_error_test.cc index 71a0686d7a..dc1ff49c68 100644 --- a/test/vp9_block_error_test.cc +++ b/test/vp9_block_error_test.cc @@ -185,12 +185,12 @@ const BlockErrorParam sse2_block_error_tests[] = { &BlockError8BitWrapper, VPX_BITS_8) }; -INSTANTIATE_TEST_CASE_P(SSE2, BlockErrorTest, - ::testing::ValuesIn(sse2_block_error_tests)); +INSTANTIATE_TEST_SUITE_P(SSE2, BlockErrorTest, + ::testing::ValuesIn(sse2_block_error_tests)); #endif // HAVE_SSE2 #if HAVE_AVX2 -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( AVX2, BlockErrorTest, ::testing::Values(make_tuple(&BlockError8BitWrapper, &BlockError8BitWrapper, diff --git a/test/vp9_datarate_test.cc b/test/vp9_datarate_test.cc index f9f2aaaffa..7e4d08f111 100644 --- a/test/vp9_datarate_test.cc +++ b/test/vp9_datarate_test.cc @@ -935,23 +935,23 @@ TEST_P(DatarateTestVP9RealTimeDenoiser, DenoiserOffOn) { } #endif // CONFIG_VP9_TEMPORAL_DENOISING -VP9_INSTANTIATE_TEST_CASE(DatarateTestVP9RealTimeMultiBR, - ::testing::Range(5, 10), ::testing::Range(0, 4)); +VP9_INSTANTIATE_TEST_SUITE(DatarateTestVP9RealTimeMultiBR, + ::testing::Range(5, 10), ::testing::Range(0, 4)); -VP9_INSTANTIATE_TEST_CASE(DatarateTestVP9LargeVBR, ::testing::Range(5, 9), - ::testing::Range(0, 2)); +VP9_INSTANTIATE_TEST_SUITE(DatarateTestVP9LargeVBR, ::testing::Range(5, 9), + ::testing::Range(0, 2)); -VP9_INSTANTIATE_TEST_CASE(DatarateTestVP9RealTime, ::testing::Range(5, 10)); +VP9_INSTANTIATE_TEST_SUITE(DatarateTestVP9RealTime, ::testing::Range(5, 10)); -VP9_INSTANTIATE_TEST_CASE(DatarateTestVP9RealTimeDeltaQUV, - ::testing::Range(5, 10), - ::testing::Values(-5, -10, -15)); +VP9_INSTANTIATE_TEST_SUITE(DatarateTestVP9RealTimeDeltaQUV, + ::testing::Range(5, 10), + ::testing::Values(-5, -10, -15)); -VP9_INSTANTIATE_TEST_CASE(DatarateTestVP9PostEncodeDrop, - ::testing::Range(5, 6)); +VP9_INSTANTIATE_TEST_SUITE(DatarateTestVP9PostEncodeDrop, + ::testing::Range(5, 6)); #if CONFIG_VP9_TEMPORAL_DENOISING -VP9_INSTANTIATE_TEST_CASE(DatarateTestVP9RealTimeDenoiser, - ::testing::Range(5, 10)); +VP9_INSTANTIATE_TEST_SUITE(DatarateTestVP9RealTimeDenoiser, + ::testing::Range(5, 10)); #endif } // namespace diff --git a/test/vp9_denoiser_test.cc b/test/vp9_denoiser_test.cc index 47fa587fca..3d76edfaa2 100644 --- a/test/vp9_denoiser_test.cc +++ b/test/vp9_denoiser_test.cc @@ -104,7 +104,7 @@ using std::make_tuple; // Test for all block size. #if HAVE_SSE2 -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( SSE2, VP9DenoiserTest, ::testing::Values(make_tuple(&vp9_denoiser_filter_sse2, BLOCK_8X8), make_tuple(&vp9_denoiser_filter_sse2, BLOCK_8X16), @@ -119,7 +119,7 @@ INSTANTIATE_TEST_CASE_P( #endif // HAVE_SSE2 #if HAVE_NEON -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( NEON, VP9DenoiserTest, ::testing::Values(make_tuple(&vp9_denoiser_filter_neon, BLOCK_8X8), make_tuple(&vp9_denoiser_filter_neon, BLOCK_8X16), diff --git a/test/vp9_encoder_parms_get_to_decoder.cc b/test/vp9_encoder_parms_get_to_decoder.cc index fade08bbd4..5286d7a85f 100644 --- a/test/vp9_encoder_parms_get_to_decoder.cc +++ b/test/vp9_encoder_parms_get_to_decoder.cc @@ -147,7 +147,7 @@ TEST_P(VpxEncoderParmsGetToDecoder, BitstreamParms) { ASSERT_NO_FATAL_FAILURE(RunLoop(video.get())); } -VP9_INSTANTIATE_TEST_CASE(VpxEncoderParmsGetToDecoder, - ::testing::ValuesIn(kVP9EncodeParameterSet), - ::testing::ValuesIn(kVP9EncodePerfTestVectors)); +VP9_INSTANTIATE_TEST_SUITE(VpxEncoderParmsGetToDecoder, + ::testing::ValuesIn(kVP9EncodeParameterSet), + ::testing::ValuesIn(kVP9EncodePerfTestVectors)); } // namespace diff --git a/test/vp9_end_to_end_test.cc b/test/vp9_end_to_end_test.cc index 0165aa26e2..315ce219ed 100644 --- a/test/vp9_end_to_end_test.cc +++ b/test/vp9_end_to_end_test.cc @@ -334,21 +334,21 @@ TEST_P(EndToEndTestLoopFilterThreading, TileCountChange) { } #endif // CONFIG_VP9_DECODER -VP9_INSTANTIATE_TEST_CASE(EndToEndTestLarge, - ::testing::ValuesIn(kEncodingModeVectors), - ::testing::ValuesIn(kTestVectors), - ::testing::ValuesIn(kCpuUsedVectors)); +VP9_INSTANTIATE_TEST_SUITE(EndToEndTestLarge, + ::testing::ValuesIn(kEncodingModeVectors), + ::testing::ValuesIn(kTestVectors), + ::testing::ValuesIn(kCpuUsedVectors)); -VP9_INSTANTIATE_TEST_CASE(EndToEndNV12, - ::testing::Values(::libvpx_test::kRealTime), - ::testing::ValuesIn(kTestVectorsNv12), - ::testing::ValuesIn({ 6, 7, 8 })); +VP9_INSTANTIATE_TEST_SUITE(EndToEndNV12, + ::testing::Values(::libvpx_test::kRealTime), + ::testing::ValuesIn(kTestVectorsNv12), + ::testing::ValuesIn({ 6, 7, 8 })); -VP9_INSTANTIATE_TEST_CASE(EndToEndTestAdaptiveRDThresh, - ::testing::Values(5, 6, 7), ::testing::Values(8, 9)); +VP9_INSTANTIATE_TEST_SUITE(EndToEndTestAdaptiveRDThresh, + ::testing::Values(5, 6, 7), ::testing::Values(8, 9)); #if CONFIG_VP9_DECODER -VP9_INSTANTIATE_TEST_CASE(EndToEndTestLoopFilterThreading, ::testing::Bool(), - ::testing::Range(2, 6)); +VP9_INSTANTIATE_TEST_SUITE(EndToEndTestLoopFilterThreading, ::testing::Bool(), + ::testing::Range(2, 6)); #endif // CONFIG_VP9_DECODER } // namespace diff --git a/test/vp9_ethread_test.cc b/test/vp9_ethread_test.cc index 6de76e9e55..1041dd78c0 100644 --- a/test/vp9_ethread_test.cc +++ b/test/vp9_ethread_test.cc @@ -390,7 +390,7 @@ TEST_P(VPxEncoderThreadTest, EncoderResultTest) { EXPECT_NEAR(single_thr_psnr, multi_thr_psnr, 0.2); } -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( VP9, VPxFirstPassEncoderThreadTest, ::testing::Combine( ::testing::Values( @@ -401,7 +401,7 @@ INSTANTIATE_TEST_CASE_P( // Split this into two instantiations so that we can distinguish // between very slow runs ( ie cpu_speed 0 ) vs ones that can be // run nightly by adding Large to the title. -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( VP9, VPxEncoderThreadTest, ::testing::Combine( ::testing::Values( @@ -413,7 +413,7 @@ INSTANTIATE_TEST_CASE_P( ::testing::Range(0, 3), // tile_columns ::testing::Range(2, 5))); // threads -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( VP9Large, VPxEncoderThreadTest, ::testing::Combine( ::testing::Values( diff --git a/test/vp9_intrapred_test.cc b/test/vp9_intrapred_test.cc index 58091f875b..29f689f07d 100644 --- a/test/vp9_intrapred_test.cc +++ b/test/vp9_intrapred_test.cc @@ -132,12 +132,12 @@ TEST_P(VP9IntraPredTest, IntraPredTests) { // Instantiate a token test to avoid -Wuninitialized warnings when none of the // other tests are enabled. -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( C, VP9IntraPredTest, ::testing::Values(IntraPredParam(&vpx_d45_predictor_4x4_c, &vpx_d45_predictor_4x4_c, 4, 8))); #if HAVE_SSE2 -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( SSE2, VP9IntraPredTest, ::testing::Values( IntraPredParam(&vpx_d45_predictor_4x4_sse2, &vpx_d45_predictor_4x4_c, 4, @@ -201,7 +201,7 @@ INSTANTIATE_TEST_CASE_P( #endif // HAVE_SSE2 #if HAVE_SSSE3 -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( SSSE3, VP9IntraPredTest, ::testing::Values(IntraPredParam(&vpx_d45_predictor_16x16_ssse3, &vpx_d45_predictor_16x16_c, 16, 8), @@ -232,7 +232,7 @@ INSTANTIATE_TEST_CASE_P( #endif // HAVE_SSSE3 #if HAVE_NEON -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( NEON, VP9IntraPredTest, ::testing::Values( IntraPredParam(&vpx_d45_predictor_4x4_neon, &vpx_d45_predictor_4x4_c, 4, @@ -306,7 +306,7 @@ INSTANTIATE_TEST_CASE_P( #endif // HAVE_NEON #if HAVE_DSPR2 -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( DSPR2, VP9IntraPredTest, ::testing::Values(IntraPredParam(&vpx_dc_predictor_4x4_dspr2, &vpx_dc_predictor_4x4_c, 4, 8), @@ -327,7 +327,7 @@ INSTANTIATE_TEST_CASE_P( #endif // HAVE_DSPR2 #if HAVE_MSA -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( MSA, VP9IntraPredTest, ::testing::Values( IntraPredParam(&vpx_dc_128_predictor_4x4_msa, @@ -401,7 +401,7 @@ INSTANTIATE_TEST_CASE_P( #endif #if HAVE_VSX -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( VSX, VP9IntraPredTest, ::testing::Values(IntraPredParam(&vpx_d45_predictor_16x16_vsx, &vpx_d45_predictor_16x16_c, 16, 8), @@ -477,7 +477,7 @@ TEST_P(VP9HighbdIntraPredTest, HighbdIntraPredTests) { } #if HAVE_SSSE3 -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( SSSE3_TO_C_8, VP9HighbdIntraPredTest, ::testing::Values( HighbdIntraPredParam(&vpx_highbd_d45_predictor_4x4_ssse3, @@ -519,7 +519,7 @@ INSTANTIATE_TEST_CASE_P( HighbdIntraPredParam(&vpx_highbd_d207_predictor_32x32_ssse3, &vpx_highbd_d207_predictor_32x32_c, 32, 8))); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( SSSE3_TO_C_10, VP9HighbdIntraPredTest, ::testing::Values( HighbdIntraPredParam(&vpx_highbd_d45_predictor_4x4_ssse3, @@ -561,7 +561,7 @@ INSTANTIATE_TEST_CASE_P( HighbdIntraPredParam(&vpx_highbd_d207_predictor_32x32_ssse3, &vpx_highbd_d207_predictor_32x32_c, 32, 10))); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( SSSE3_TO_C_12, VP9HighbdIntraPredTest, ::testing::Values( HighbdIntraPredParam(&vpx_highbd_d45_predictor_4x4_ssse3, @@ -605,7 +605,7 @@ INSTANTIATE_TEST_CASE_P( #endif // HAVE_SSSE3 #if HAVE_SSE2 -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( SSE2_TO_C_8, VP9HighbdIntraPredTest, ::testing::Values( HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_4x4_sse2, @@ -675,7 +675,7 @@ INSTANTIATE_TEST_CASE_P( HighbdIntraPredParam(&vpx_highbd_v_predictor_32x32_sse2, &vpx_highbd_v_predictor_32x32_c, 32, 8))); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( SSE2_TO_C_10, VP9HighbdIntraPredTest, ::testing::Values( HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_4x4_sse2, @@ -745,7 +745,7 @@ INSTANTIATE_TEST_CASE_P( HighbdIntraPredParam(&vpx_highbd_v_predictor_32x32_sse2, &vpx_highbd_v_predictor_32x32_c, 32, 10))); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( SSE2_TO_C_12, VP9HighbdIntraPredTest, ::testing::Values( HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_4x4_sse2, @@ -817,7 +817,7 @@ INSTANTIATE_TEST_CASE_P( #endif // HAVE_SSE2 #if HAVE_NEON -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( NEON_TO_C_8, VP9HighbdIntraPredTest, ::testing::Values( HighbdIntraPredParam(&vpx_highbd_d45_predictor_4x4_neon, @@ -893,7 +893,7 @@ INSTANTIATE_TEST_CASE_P( HighbdIntraPredParam(&vpx_highbd_v_predictor_32x32_neon, &vpx_highbd_v_predictor_32x32_c, 32, 8))); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( NEON_TO_C_10, VP9HighbdIntraPredTest, ::testing::Values( HighbdIntraPredParam(&vpx_highbd_d45_predictor_4x4_neon, @@ -969,7 +969,7 @@ INSTANTIATE_TEST_CASE_P( HighbdIntraPredParam(&vpx_highbd_v_predictor_32x32_neon, &vpx_highbd_v_predictor_32x32_c, 32, 10))); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( NEON_TO_C_12, VP9HighbdIntraPredTest, ::testing::Values( HighbdIntraPredParam(&vpx_highbd_d45_predictor_4x4_neon, diff --git a/test/vp9_lossless_test.cc b/test/vp9_lossless_test.cc index 5cf0a41da4..931ac30a36 100644 --- a/test/vp9_lossless_test.cc +++ b/test/vp9_lossless_test.cc @@ -118,8 +118,8 @@ TEST_P(LosslessTest, TestLossLessEncodingCtrl) { EXPECT_GE(psnr_lossless, kMaxPsnr); } -VP9_INSTANTIATE_TEST_CASE(LosslessTest, - ::testing::Values(::libvpx_test::kRealTime, - ::libvpx_test::kOnePassGood, - ::libvpx_test::kTwoPassGood)); +VP9_INSTANTIATE_TEST_SUITE(LosslessTest, + ::testing::Values(::libvpx_test::kRealTime, + ::libvpx_test::kOnePassGood, + ::libvpx_test::kTwoPassGood)); } // namespace diff --git a/test/vp9_motion_vector_test.cc b/test/vp9_motion_vector_test.cc index b556a1c378..68eecb8232 100644 --- a/test/vp9_motion_vector_test.cc +++ b/test/vp9_motion_vector_test.cc @@ -92,8 +92,8 @@ TEST_P(MotionVectorTestLarge, OverallTest) { ASSERT_NO_FATAL_FAILURE(RunLoop(video.get())); } -VP9_INSTANTIATE_TEST_CASE(MotionVectorTestLarge, - ::testing::ValuesIn(kEncodingModeVectors), - ::testing::ValuesIn(kCpuUsedVectors), - ::testing::ValuesIn(kMVTestModes)); +VP9_INSTANTIATE_TEST_SUITE(MotionVectorTestLarge, + ::testing::ValuesIn(kEncodingModeVectors), + ::testing::ValuesIn(kCpuUsedVectors), + ::testing::ValuesIn(kMVTestModes)); } // namespace diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc index 69c2c5a0b9..083a8844bf 100644 --- a/test/vp9_quantize_test.cc +++ b/test/vp9_quantize_test.cc @@ -475,7 +475,7 @@ using std::make_tuple; #if HAVE_SSE2 #if CONFIG_VP9_HIGHBITDEPTH -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( SSE2, VP9QuantizeTest, ::testing::Values( make_tuple(&vpx_quantize_b_sse2, &vpx_quantize_b_c, VPX_BITS_8, 16, @@ -494,7 +494,7 @@ INSTANTIATE_TEST_CASE_P( &vpx_highbd_quantize_b_32x32_c, VPX_BITS_12, 32, false))); #else -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( SSE2, VP9QuantizeTest, ::testing::Values(make_tuple(&vpx_quantize_b_sse2, &vpx_quantize_b_c, VPX_BITS_8, 16, false), @@ -506,7 +506,7 @@ INSTANTIATE_TEST_CASE_P( #if HAVE_SSSE3 #if VPX_ARCH_X86_64 -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( SSSE3, VP9QuantizeTest, ::testing::Values(make_tuple(&vpx_quantize_b_ssse3, &vpx_quantize_b_c, VPX_BITS_8, 16, false), @@ -520,7 +520,7 @@ INSTANTIATE_TEST_CASE_P( &QuantFPWrapper, VPX_BITS_8, 32, true))); #else -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( SSSE3, VP9QuantizeTest, ::testing::Values(make_tuple(&vpx_quantize_b_ssse3, &vpx_quantize_b_c, VPX_BITS_8, 16, false), @@ -532,17 +532,17 @@ INSTANTIATE_TEST_CASE_P( #endif // HAVE_SSSE3 #if HAVE_AVX -INSTANTIATE_TEST_CASE_P(AVX, VP9QuantizeTest, - ::testing::Values(make_tuple(&vpx_quantize_b_avx, - &vpx_quantize_b_c, - VPX_BITS_8, 16, false), - make_tuple(&vpx_quantize_b_32x32_avx, - &vpx_quantize_b_32x32_c, - VPX_BITS_8, 32, false))); +INSTANTIATE_TEST_SUITE_P(AVX, VP9QuantizeTest, + ::testing::Values(make_tuple(&vpx_quantize_b_avx, + &vpx_quantize_b_c, + VPX_BITS_8, 16, false), + make_tuple(&vpx_quantize_b_32x32_avx, + &vpx_quantize_b_32x32_c, + VPX_BITS_8, 32, false))); #endif // HAVE_AVX #if VPX_ARCH_X86_64 && HAVE_AVX2 -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( AVX2, VP9QuantizeTest, ::testing::Values(make_tuple(&QuantFPWrapper, &QuantFPWrapper, VPX_BITS_8, @@ -550,7 +550,7 @@ INSTANTIATE_TEST_CASE_P( #endif // HAVE_AVX2 #if HAVE_NEON -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( NEON, VP9QuantizeTest, ::testing::Values(make_tuple(&vpx_quantize_b_neon, &vpx_quantize_b_c, VPX_BITS_8, 16, false), @@ -566,7 +566,7 @@ INSTANTIATE_TEST_CASE_P( #endif // HAVE_NEON #if HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( VSX, VP9QuantizeTest, ::testing::Values(make_tuple(&vpx_quantize_b_vsx, &vpx_quantize_b_c, VPX_BITS_8, 16, false), @@ -582,7 +582,7 @@ INSTANTIATE_TEST_CASE_P( #endif // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH // Only useful to compare "Speed" test results. -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( DISABLED_C, VP9QuantizeTest, ::testing::Values( make_tuple(&vpx_quantize_b_c, &vpx_quantize_b_c, VPX_BITS_8, 16, false), diff --git a/test/vp9_scale_test.cc b/test/vp9_scale_test.cc index f3e7f0a0e2..2d1203fb89 100644 --- a/test/vp9_scale_test.cc +++ b/test/vp9_scale_test.cc @@ -199,17 +199,17 @@ TEST_P(ScaleTest, DISABLED_Speed) { } } -INSTANTIATE_TEST_CASE_P(C, ScaleTest, - ::testing::Values(vp9_scale_and_extend_frame_c)); +INSTANTIATE_TEST_SUITE_P(C, ScaleTest, + ::testing::Values(vp9_scale_and_extend_frame_c)); #if HAVE_SSSE3 -INSTANTIATE_TEST_CASE_P(SSSE3, ScaleTest, - ::testing::Values(vp9_scale_and_extend_frame_ssse3)); +INSTANTIATE_TEST_SUITE_P(SSSE3, ScaleTest, + ::testing::Values(vp9_scale_and_extend_frame_ssse3)); #endif // HAVE_SSSE3 #if HAVE_NEON -INSTANTIATE_TEST_CASE_P(NEON, ScaleTest, - ::testing::Values(vp9_scale_and_extend_frame_neon)); +INSTANTIATE_TEST_SUITE_P(NEON, ScaleTest, + ::testing::Values(vp9_scale_and_extend_frame_neon)); #endif // HAVE_NEON } // namespace libvpx_test diff --git a/test/vp9_subtract_test.cc b/test/vp9_subtract_test.cc index 67e8de6c74..ef8cc207d6 100644 --- a/test/vp9_subtract_test.cc +++ b/test/vp9_subtract_test.cc @@ -126,30 +126,30 @@ TEST_P(VP9SubtractBlockTest, SimpleSubtract) { } } -INSTANTIATE_TEST_CASE_P(C, VP9SubtractBlockTest, - ::testing::Values(vpx_subtract_block_c)); +INSTANTIATE_TEST_SUITE_P(C, VP9SubtractBlockTest, + ::testing::Values(vpx_subtract_block_c)); #if HAVE_SSE2 -INSTANTIATE_TEST_CASE_P(SSE2, VP9SubtractBlockTest, - ::testing::Values(vpx_subtract_block_sse2)); +INSTANTIATE_TEST_SUITE_P(SSE2, VP9SubtractBlockTest, + ::testing::Values(vpx_subtract_block_sse2)); #endif #if HAVE_NEON -INSTANTIATE_TEST_CASE_P(NEON, VP9SubtractBlockTest, - ::testing::Values(vpx_subtract_block_neon)); +INSTANTIATE_TEST_SUITE_P(NEON, VP9SubtractBlockTest, + ::testing::Values(vpx_subtract_block_neon)); #endif #if HAVE_MSA -INSTANTIATE_TEST_CASE_P(MSA, VP9SubtractBlockTest, - ::testing::Values(vpx_subtract_block_msa)); +INSTANTIATE_TEST_SUITE_P(MSA, VP9SubtractBlockTest, + ::testing::Values(vpx_subtract_block_msa)); #endif #if HAVE_MMI -INSTANTIATE_TEST_CASE_P(MMI, VP9SubtractBlockTest, - ::testing::Values(vpx_subtract_block_mmi)); +INSTANTIATE_TEST_SUITE_P(MMI, VP9SubtractBlockTest, + ::testing::Values(vpx_subtract_block_mmi)); #endif #if HAVE_VSX -INSTANTIATE_TEST_CASE_P(VSX, VP9SubtractBlockTest, - ::testing::Values(vpx_subtract_block_vsx)); +INSTANTIATE_TEST_SUITE_P(VSX, VP9SubtractBlockTest, + ::testing::Values(vpx_subtract_block_vsx)); #endif } // namespace vp9 diff --git a/test/vp9_thread_test.cc b/test/vp9_thread_test.cc index 31b6fe57b4..67f2bec800 100644 --- a/test/vp9_thread_test.cc +++ b/test/vp9_thread_test.cc @@ -316,6 +316,6 @@ TEST(VP9DecodeMultiThreadedTest, NonFrameParallel) { } #endif // CONFIG_WEBM_IO -INSTANTIATE_TEST_CASE_P(Synchronous, VPxWorkerThreadTest, ::testing::Bool()); +INSTANTIATE_TEST_SUITE_P(Synchronous, VPxWorkerThreadTest, ::testing::Bool()); } // namespace diff --git a/test/vpx_scale_test.cc b/test/vpx_scale_test.cc index 057a2a295f..7eea437fc8 100644 --- a/test/vpx_scale_test.cc +++ b/test/vpx_scale_test.cc @@ -62,8 +62,8 @@ class ExtendBorderTest TEST_P(ExtendBorderTest, ExtendBorder) { ASSERT_NO_FATAL_FAILURE(RunTest()); } -INSTANTIATE_TEST_CASE_P(C, ExtendBorderTest, - ::testing::Values(vp8_yv12_extend_frame_borders_c)); +INSTANTIATE_TEST_SUITE_P(C, ExtendBorderTest, + ::testing::Values(vp8_yv12_extend_frame_borders_c)); class CopyFrameTest : public VpxScaleBase, public ::testing::TestWithParam { @@ -94,8 +94,8 @@ class CopyFrameTest : public VpxScaleBase, TEST_P(CopyFrameTest, CopyFrame) { ASSERT_NO_FATAL_FAILURE(RunTest()); } -INSTANTIATE_TEST_CASE_P(C, CopyFrameTest, - ::testing::Values(vp8_yv12_copy_frame_c)); +INSTANTIATE_TEST_SUITE_P(C, CopyFrameTest, + ::testing::Values(vp8_yv12_copy_frame_c)); } // namespace } // namespace libvpx_test diff --git a/test/y4m_test.cc b/test/y4m_test.cc index 76d033d52a..e7b86ac500 100644 --- a/test/y4m_test.cc +++ b/test/y4m_test.cc @@ -133,8 +133,8 @@ TEST_P(Y4mVideoSourceTest, SourceTest) { Md5Check(t.md5raw); } -INSTANTIATE_TEST_CASE_P(C, Y4mVideoSourceTest, - ::testing::ValuesIn(kY4mTestVectors)); +INSTANTIATE_TEST_SUITE_P(C, Y4mVideoSourceTest, + ::testing::ValuesIn(kY4mTestVectors)); class Y4mVideoWriteTest : public Y4mVideoSourceTest { protected: @@ -186,6 +186,6 @@ TEST_P(Y4mVideoWriteTest, WriteTest) { Md5Check(t.md5raw); } -INSTANTIATE_TEST_CASE_P(C, Y4mVideoWriteTest, - ::testing::ValuesIn(kY4mTestVectors)); +INSTANTIATE_TEST_SUITE_P(C, Y4mVideoWriteTest, + ::testing::ValuesIn(kY4mTestVectors)); } // namespace diff --git a/test/yuv_temporal_filter_test.cc b/test/yuv_temporal_filter_test.cc index 8f3c58b038..cfdc88d896 100644 --- a/test/yuv_temporal_filter_test.cc +++ b/test/yuv_temporal_filter_test.cc @@ -677,7 +677,7 @@ TEST_P(YUVTemporalFilterTest, DISABLED_Speed) { WRAP_HIGHBD_FUNC(vp9_highbd_apply_temporal_filter_c, 10); WRAP_HIGHBD_FUNC(vp9_highbd_apply_temporal_filter_c, 12); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( C, YUVTemporalFilterTest, ::testing::Values( TemporalFilterWithBd(&wrap_vp9_highbd_apply_temporal_filter_c_10, 10), @@ -686,7 +686,7 @@ INSTANTIATE_TEST_CASE_P( WRAP_HIGHBD_FUNC(vp9_highbd_apply_temporal_filter_sse4_1, 10); WRAP_HIGHBD_FUNC(vp9_highbd_apply_temporal_filter_sse4_1, 12); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( SSE4_1, YUVTemporalFilterTest, ::testing::Values( TemporalFilterWithBd(&wrap_vp9_highbd_apply_temporal_filter_sse4_1_10, @@ -695,14 +695,14 @@ INSTANTIATE_TEST_CASE_P( 12))); #endif // HAVE_SSE4_1 #else -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( C, YUVTemporalFilterTest, ::testing::Values(TemporalFilterWithBd(&vp9_apply_temporal_filter_c, 8))); #if HAVE_SSE4_1 -INSTANTIATE_TEST_CASE_P(SSE4_1, YUVTemporalFilterTest, - ::testing::Values(TemporalFilterWithBd( - &vp9_apply_temporal_filter_sse4_1, 8))); +INSTANTIATE_TEST_SUITE_P(SSE4_1, YUVTemporalFilterTest, + ::testing::Values(TemporalFilterWithBd( + &vp9_apply_temporal_filter_sse4_1, 8))); #endif // HAVE_SSE4_1 #endif // CONFIG_VP9_HIGHBITDEPTH } // namespace diff --git a/third_party/googletest/README.libvpx b/third_party/googletest/README.libvpx index 49005ddac9..111390a12e 100644 --- a/third_party/googletest/README.libvpx +++ b/third_party/googletest/README.libvpx @@ -1,5 +1,5 @@ URL: https://github.com/google/googletest.git -Version: release-1.8.1 +Version: release-1.10.0 License: BSD License File: LICENSE @@ -13,16 +13,10 @@ generation. Local Modifications: - Remove everything but: - googletest-release-1.8.1/googletest/ + googletest-release-1.10.0/googletest/ CHANGES CONTRIBUTORS include LICENSE README.md src - -- Make WithParamInterface::GetParam static in order to avoid - initialization issues - https://github.com/google/googletest/pull/1830 -- Use wcslen() instead of std::wcslen() - https://github.com/google/googletest/pull/1899 diff --git a/third_party/googletest/src/README.md b/third_party/googletest/src/README.md index e30fe80471..766ddc1e07 100644 --- a/third_party/googletest/src/README.md +++ b/third_party/googletest/src/README.md @@ -6,48 +6,7 @@ To build Google Test and your tests that use it, you need to tell your build system where to find its headers and source files. The exact way to do it depends on which build system you use, and is usually straightforward. -#### Build - -Suppose you put Google Test in directory `${GTEST_DIR}`. To build it, create a -library build target (or a project as called by Visual Studio and Xcode) to -compile - - ${GTEST_DIR}/src/gtest-all.cc - -with `${GTEST_DIR}/include` in the system header search path and `${GTEST_DIR}` -in the normal header search path. Assuming a Linux-like system and gcc, -something like the following will do: - - g++ -isystem ${GTEST_DIR}/include -I${GTEST_DIR} \ - -pthread -c ${GTEST_DIR}/src/gtest-all.cc - ar -rv libgtest.a gtest-all.o - -(We need `-pthread` as Google Test uses threads.) - -Next, you should compile your test source file with `${GTEST_DIR}/include` in -the system header search path, and link it with gtest and any other necessary -libraries: - - g++ -isystem ${GTEST_DIR}/include -pthread path/to/your_test.cc libgtest.a \ - -o your_test - -As an example, the make/ directory contains a Makefile that you can use to build -Google Test on systems where GNU make is available (e.g. Linux, Mac OS X, and -Cygwin). It doesn't try to build Google Test's own tests. Instead, it just -builds the Google Test library and a sample test. You can use it as a starting -point for your own build script. - -If the default settings are correct for your environment, the following commands -should succeed: - - cd ${GTEST_DIR}/make - make - ./sample1_unittest - -If you see errors, try to tweak the contents of `make/Makefile` to make them go -away. There are instructions in `make/Makefile` on how to do it. - -### Using CMake +### Build with CMake Google Test comes with a CMake build script ( [CMakeLists.txt](https://github.com/google/googletest/blob/master/CMakeLists.txt)) @@ -115,60 +74,64 @@ pulled into the main build with `add_subdirectory()`. For example: New file `CMakeLists.txt.in`: - cmake_minimum_required(VERSION 2.8.2) +```cmake +cmake_minimum_required(VERSION 2.8.2) - project(googletest-download NONE) +project(googletest-download NONE) - include(ExternalProject) - ExternalProject_Add(googletest - GIT_REPOSITORY https://github.com/google/googletest.git - GIT_TAG master - SOURCE_DIR "${CMAKE_BINARY_DIR}/googletest-src" - BINARY_DIR "${CMAKE_BINARY_DIR}/googletest-build" - CONFIGURE_COMMAND "" - BUILD_COMMAND "" - INSTALL_COMMAND "" - TEST_COMMAND "" - ) +include(ExternalProject) +ExternalProject_Add(googletest + GIT_REPOSITORY https://github.com/google/googletest.git + GIT_TAG master + SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/googletest-src" + BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/googletest-build" + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" + TEST_COMMAND "" +) +``` Existing build's `CMakeLists.txt`: - # Download and unpack googletest at configure time - configure_file(CMakeLists.txt.in googletest-download/CMakeLists.txt) - execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" . - RESULT_VARIABLE result - WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/googletest-download ) - if(result) - message(FATAL_ERROR "CMake step for googletest failed: ${result}") - endif() - execute_process(COMMAND ${CMAKE_COMMAND} --build . - RESULT_VARIABLE result - WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/googletest-download ) - if(result) - message(FATAL_ERROR "Build step for googletest failed: ${result}") - endif() - - # Prevent overriding the parent project's compiler/linker - # settings on Windows - set(gtest_force_shared_crt ON CACHE BOOL "" FORCE) - - # Add googletest directly to our build. This defines - # the gtest and gtest_main targets. - add_subdirectory(${CMAKE_BINARY_DIR}/googletest-src - ${CMAKE_BINARY_DIR}/googletest-build - EXCLUDE_FROM_ALL) - - # The gtest/gtest_main targets carry header search path - # dependencies automatically when using CMake 2.8.11 or - # later. Otherwise we have to add them here ourselves. - if (CMAKE_VERSION VERSION_LESS 2.8.11) - include_directories("${gtest_SOURCE_DIR}/include") - endif() - - # Now simply link against gtest or gtest_main as needed. Eg - add_executable(example example.cpp) - target_link_libraries(example gtest_main) - add_test(NAME example_test COMMAND example) +```cmake +# Download and unpack googletest at configure time +configure_file(CMakeLists.txt.in googletest-download/CMakeLists.txt) +execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" . + RESULT_VARIABLE result + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/googletest-download ) +if(result) + message(FATAL_ERROR "CMake step for googletest failed: ${result}") +endif() +execute_process(COMMAND ${CMAKE_COMMAND} --build . + RESULT_VARIABLE result + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/googletest-download ) +if(result) + message(FATAL_ERROR "Build step for googletest failed: ${result}") +endif() + +# Prevent overriding the parent project's compiler/linker +# settings on Windows +set(gtest_force_shared_crt ON CACHE BOOL "" FORCE) + +# Add googletest directly to our build. This defines +# the gtest and gtest_main targets. +add_subdirectory(${CMAKE_CURRENT_BINARY_DIR}/googletest-src + ${CMAKE_CURRENT_BINARY_DIR}/googletest-build + EXCLUDE_FROM_ALL) + +# The gtest/gtest_main targets carry header search path +# dependencies automatically when using CMake 2.8.11 or +# later. Otherwise we have to add them here ourselves. +if (CMAKE_VERSION VERSION_LESS 2.8.11) + include_directories("${gtest_SOURCE_DIR}/include") +endif() + +# Now simply link against gtest or gtest_main as needed. Eg +add_executable(example example.cpp) +target_link_libraries(example gtest_main) +add_test(NAME example_test COMMAND example) +``` Note that this approach requires CMake 2.8.2 or later due to its use of the `ExternalProject_Add()` command. The above technique is discussed in more detail @@ -188,47 +151,14 @@ Google Test already has a CMake option for this: `gtest_force_shared_crt` Enabling this option will make gtest link the runtimes dynamically too, and match the project in which it is included. -### Legacy Build Scripts - -Before settling on CMake, we have been providing hand-maintained build -projects/scripts for Visual Studio, Xcode, and Autotools. While we continue to -provide them for convenience, they are not actively maintained any more. We -highly recommend that you follow the instructions in the above sections to -integrate Google Test with your existing build system. - -If you still need to use the legacy build scripts, here's how: - -The msvc\ folder contains two solutions with Visual C++ projects. Open the -`gtest.sln` or `gtest-md.sln` file using Visual Studio, and you are ready to -build Google Test the same way you build any Visual Studio project. Files that -have names ending with -md use DLL versions of Microsoft runtime libraries (the -/MD or the /MDd compiler option). Files without that suffix use static versions -of the runtime libraries (the /MT or the /MTd option). Please note that one must -use the same option to compile both gtest and the test code. If you use Visual -Studio 2005 or above, we recommend the -md version as /MD is the default for new -projects in these versions of Visual Studio. +#### C++ Standard Version -On Mac OS X, open the `gtest.xcodeproj` in the `xcode/` folder using Xcode. -Build the "gtest" target. The universal binary framework will end up in your -selected build directory (selected in the Xcode "Preferences..." -> "Building" -pane and defaults to xcode/build). Alternatively, at the command line, enter: - - xcodebuild - -This will build the "Release" configuration of gtest.framework in your default -build location. See the "xcodebuild" man page for more information about -building different configurations and building in different locations. - -If you wish to use the Google Test Xcode project with Xcode 4.x and above, you -need to either: - -* update the SDK configuration options in xcode/Config/General.xconfig. - Comment options `SDKROOT`, `MACOS_DEPLOYMENT_TARGET`, and `GCC_VERSION`. If - you choose this route you lose the ability to target earlier versions of - MacOS X. -* Install an SDK for an earlier version. This doesn't appear to be supported - by Apple, but has been reported to work - (http://stackoverflow.com/questions/5378518). +An environment that supports C++11 is required in order to successfully build +Google Test. One way to ensure this is to specify the standard in the top-level +project, for example by using the `set(CMAKE_CXX_STANDARD 11)` command. If this +is not feasible, for example in a C project using Google Test for validation, +then it can be specified by adding it to the options for cmake via the +`DCMAKE_CXX_FLAGS` option. ### Tweaking Google Test @@ -239,41 +169,14 @@ command line. Generally, these macros are named like `GTEST_XYZ` and you define them to either 1 or 0 to enable or disable a certain feature. We list the most frequently used macros below. For a complete list, see file -[include/gtest/internal/gtest-port.h](https://github.com/google/googletest/blob/master/include/gtest/internal/gtest-port.h). - -### Choosing a TR1 Tuple Library - -Some Google Test features require the C++ Technical Report 1 (TR1) tuple -library, which is not yet available with all compilers. The good news is that -Google Test implements a subset of TR1 tuple that's enough for its own need, and -will automatically use this when the compiler doesn't provide TR1 tuple. - -Usually you don't need to care about which tuple library Google Test uses. -However, if your project already uses TR1 tuple, you need to tell Google Test to -use the same TR1 tuple library the rest of your project uses, or the two tuple -implementations will clash. To do that, add - - -DGTEST_USE_OWN_TR1_TUPLE=0 - -to the compiler flags while compiling Google Test and your tests. If you want to -force Google Test to use its own tuple library, just add - - -DGTEST_USE_OWN_TR1_TUPLE=1 - -to the compiler flags instead. - -If you don't want Google Test to use tuple at all, add - - -DGTEST_HAS_TR1_TUPLE=0 - -and all features using tuple will be disabled. +[include/gtest/internal/gtest-port.h](https://github.com/google/googletest/blob/master/googletest/include/gtest/internal/gtest-port.h). ### Multi-threaded Tests Google Test is thread-safe where the pthread library is available. After -`#include "gtest/gtest.h"`, you can check the `GTEST_IS_THREADSAFE` macro to see -whether this is the case (yes if the macro is `#defined` to 1, no if it's -undefined.). +`#include "gtest/gtest.h"`, you can check the +`GTEST_IS_THREADSAFE` macro to see whether this is the case (yes if the macro is +`#defined` to 1, no if it's undefined.). If Google Test doesn't correctly detect whether pthread is available in your environment, you can force it with diff --git a/third_party/googletest/src/include/gtest/gtest-death-test.h b/third_party/googletest/src/include/gtest/gtest-death-test.h index 20c54d8695..dc878ffbb3 100644 --- a/third_party/googletest/src/include/gtest/gtest-death-test.h +++ b/third_party/googletest/src/include/gtest/gtest-death-test.h @@ -161,7 +161,6 @@ GTEST_API_ bool InDeathTestChild(); // is rarely a problem as people usually don't put the test binary // directory in PATH. // -// FIXME: make thread-safe death tests search the PATH. // Asserts that a given statement causes the program to exit, with an // integer exit status that satisfies predicate, and emitting error output @@ -170,7 +169,7 @@ GTEST_API_ bool InDeathTestChild(); GTEST_DEATH_TEST_(statement, predicate, regex, GTEST_FATAL_FAILURE_) // Like ASSERT_EXIT, but continues on to successive tests in the -// test case, if any: +// test suite, if any: # define EXPECT_EXIT(statement, predicate, regex) \ GTEST_DEATH_TEST_(statement, predicate, regex, GTEST_NONFATAL_FAILURE_) @@ -181,7 +180,7 @@ GTEST_API_ bool InDeathTestChild(); ASSERT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, regex) // Like ASSERT_DEATH, but continues on to successive tests in the -// test case, if any: +// test suite, if any: # define EXPECT_DEATH(statement, regex) \ EXPECT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, regex) @@ -228,7 +227,7 @@ class GTEST_API_ KilledBySignal { // return 12; // } // -// TEST(TestCase, TestDieOr12WorksInDgbAndOpt) { +// TEST(TestSuite, TestDieOr12WorksInDgbAndOpt) { // int sideeffect = 0; // // Only asserts in dbg. // EXPECT_DEBUG_DEATH(DieInDebugOr12(&sideeffect), "death"); @@ -277,20 +276,20 @@ class GTEST_API_ KilledBySignal { // This macro is used for implementing macros such as // EXPECT_DEATH_IF_SUPPORTED and ASSERT_DEATH_IF_SUPPORTED on systems where // death tests are not supported. Those macros must compile on such systems -// iff EXPECT_DEATH and ASSERT_DEATH compile with the same parameters on -// systems that support death tests. This allows one to write such a macro -// on a system that does not support death tests and be sure that it will -// compile on a death-test supporting system. It is exposed publicly so that -// systems that have death-tests with stricter requirements than -// GTEST_HAS_DEATH_TEST can write their own equivalent of -// EXPECT_DEATH_IF_SUPPORTED and ASSERT_DEATH_IF_SUPPORTED. +// if and only if EXPECT_DEATH and ASSERT_DEATH compile with the same parameters +// on systems that support death tests. This allows one to write such a macro on +// a system that does not support death tests and be sure that it will compile +// on a death-test supporting system. It is exposed publicly so that systems +// that have death-tests with stricter requirements than GTEST_HAS_DEATH_TEST +// can write their own equivalent of EXPECT_DEATH_IF_SUPPORTED and +// ASSERT_DEATH_IF_SUPPORTED. // // Parameters: // statement - A statement that a macro such as EXPECT_DEATH would test // for program termination. This macro has to make sure this // statement is compiled but not executed, to ensure that // EXPECT_DEATH_IF_SUPPORTED compiles with a certain -// parameter iff EXPECT_DEATH compiles with it. +// parameter if and only if EXPECT_DEATH compiles with it. // regex - A regex that a macro such as EXPECT_DEATH would use to test // the output of statement. This parameter has to be // compiled but not evaluated by this macro, to ensure that diff --git a/third_party/googletest/src/include/gtest/gtest-matchers.h b/third_party/googletest/src/include/gtest/gtest-matchers.h new file mode 100644 index 0000000000..9de6c2e10a --- /dev/null +++ b/third_party/googletest/src/include/gtest/gtest-matchers.h @@ -0,0 +1,750 @@ +// Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// The Google C++ Testing and Mocking Framework (Google Test) +// +// This file implements just enough of the matcher interface to allow +// EXPECT_DEATH and friends to accept a matcher argument. + +// IWYU pragma: private, include "testing/base/public/gunit.h" +// IWYU pragma: friend third_party/googletest/googlemock/.* +// IWYU pragma: friend third_party/googletest/googletest/.* + +#ifndef GTEST_INCLUDE_GTEST_GTEST_MATCHERS_H_ +#define GTEST_INCLUDE_GTEST_GTEST_MATCHERS_H_ + +#include +#include +#include +#include + +#include "gtest/gtest-printers.h" +#include "gtest/internal/gtest-internal.h" +#include "gtest/internal/gtest-port.h" + +// MSVC warning C5046 is new as of VS2017 version 15.8. +#if defined(_MSC_VER) && _MSC_VER >= 1915 +#define GTEST_MAYBE_5046_ 5046 +#else +#define GTEST_MAYBE_5046_ +#endif + +GTEST_DISABLE_MSC_WARNINGS_PUSH_( + 4251 GTEST_MAYBE_5046_ /* class A needs to have dll-interface to be used by + clients of class B */ + /* Symbol involving type with internal linkage not defined */) + +namespace testing { + +// To implement a matcher Foo for type T, define: +// 1. a class FooMatcherImpl that implements the +// MatcherInterface interface, and +// 2. a factory function that creates a Matcher object from a +// FooMatcherImpl*. +// +// The two-level delegation design makes it possible to allow a user +// to write "v" instead of "Eq(v)" where a Matcher is expected, which +// is impossible if we pass matchers by pointers. It also eases +// ownership management as Matcher objects can now be copied like +// plain values. + +// MatchResultListener is an abstract class. Its << operator can be +// used by a matcher to explain why a value matches or doesn't match. +// +class MatchResultListener { + public: + // Creates a listener object with the given underlying ostream. The + // listener does not own the ostream, and does not dereference it + // in the constructor or destructor. + explicit MatchResultListener(::std::ostream* os) : stream_(os) {} + virtual ~MatchResultListener() = 0; // Makes this class abstract. + + // Streams x to the underlying ostream; does nothing if the ostream + // is NULL. + template + MatchResultListener& operator<<(const T& x) { + if (stream_ != nullptr) *stream_ << x; + return *this; + } + + // Returns the underlying ostream. + ::std::ostream* stream() { return stream_; } + + // Returns true if and only if the listener is interested in an explanation + // of the match result. A matcher's MatchAndExplain() method can use + // this information to avoid generating the explanation when no one + // intends to hear it. + bool IsInterested() const { return stream_ != nullptr; } + + private: + ::std::ostream* const stream_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(MatchResultListener); +}; + +inline MatchResultListener::~MatchResultListener() { +} + +// An instance of a subclass of this knows how to describe itself as a +// matcher. +class MatcherDescriberInterface { + public: + virtual ~MatcherDescriberInterface() {} + + // Describes this matcher to an ostream. The function should print + // a verb phrase that describes the property a value matching this + // matcher should have. The subject of the verb phrase is the value + // being matched. For example, the DescribeTo() method of the Gt(7) + // matcher prints "is greater than 7". + virtual void DescribeTo(::std::ostream* os) const = 0; + + // Describes the negation of this matcher to an ostream. For + // example, if the description of this matcher is "is greater than + // 7", the negated description could be "is not greater than 7". + // You are not required to override this when implementing + // MatcherInterface, but it is highly advised so that your matcher + // can produce good error messages. + virtual void DescribeNegationTo(::std::ostream* os) const { + *os << "not ("; + DescribeTo(os); + *os << ")"; + } +}; + +// The implementation of a matcher. +template +class MatcherInterface : public MatcherDescriberInterface { + public: + // Returns true if and only if the matcher matches x; also explains the + // match result to 'listener' if necessary (see the next paragraph), in + // the form of a non-restrictive relative clause ("which ...", + // "whose ...", etc) that describes x. For example, the + // MatchAndExplain() method of the Pointee(...) matcher should + // generate an explanation like "which points to ...". + // + // Implementations of MatchAndExplain() should add an explanation of + // the match result *if and only if* they can provide additional + // information that's not already present (or not obvious) in the + // print-out of x and the matcher's description. Whether the match + // succeeds is not a factor in deciding whether an explanation is + // needed, as sometimes the caller needs to print a failure message + // when the match succeeds (e.g. when the matcher is used inside + // Not()). + // + // For example, a "has at least 10 elements" matcher should explain + // what the actual element count is, regardless of the match result, + // as it is useful information to the reader; on the other hand, an + // "is empty" matcher probably only needs to explain what the actual + // size is when the match fails, as it's redundant to say that the + // size is 0 when the value is already known to be empty. + // + // You should override this method when defining a new matcher. + // + // It's the responsibility of the caller (Google Test) to guarantee + // that 'listener' is not NULL. This helps to simplify a matcher's + // implementation when it doesn't care about the performance, as it + // can talk to 'listener' without checking its validity first. + // However, in order to implement dummy listeners efficiently, + // listener->stream() may be NULL. + virtual bool MatchAndExplain(T x, MatchResultListener* listener) const = 0; + + // Inherits these methods from MatcherDescriberInterface: + // virtual void DescribeTo(::std::ostream* os) const = 0; + // virtual void DescribeNegationTo(::std::ostream* os) const; +}; + +namespace internal { + +// Converts a MatcherInterface to a MatcherInterface. +template +class MatcherInterfaceAdapter : public MatcherInterface { + public: + explicit MatcherInterfaceAdapter(const MatcherInterface* impl) + : impl_(impl) {} + ~MatcherInterfaceAdapter() override { delete impl_; } + + void DescribeTo(::std::ostream* os) const override { impl_->DescribeTo(os); } + + void DescribeNegationTo(::std::ostream* os) const override { + impl_->DescribeNegationTo(os); + } + + bool MatchAndExplain(const T& x, + MatchResultListener* listener) const override { + return impl_->MatchAndExplain(x, listener); + } + + private: + const MatcherInterface* const impl_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(MatcherInterfaceAdapter); +}; + +struct AnyEq { + template + bool operator()(const A& a, const B& b) const { return a == b; } +}; +struct AnyNe { + template + bool operator()(const A& a, const B& b) const { return a != b; } +}; +struct AnyLt { + template + bool operator()(const A& a, const B& b) const { return a < b; } +}; +struct AnyGt { + template + bool operator()(const A& a, const B& b) const { return a > b; } +}; +struct AnyLe { + template + bool operator()(const A& a, const B& b) const { return a <= b; } +}; +struct AnyGe { + template + bool operator()(const A& a, const B& b) const { return a >= b; } +}; + +// A match result listener that ignores the explanation. +class DummyMatchResultListener : public MatchResultListener { + public: + DummyMatchResultListener() : MatchResultListener(nullptr) {} + + private: + GTEST_DISALLOW_COPY_AND_ASSIGN_(DummyMatchResultListener); +}; + +// A match result listener that forwards the explanation to a given +// ostream. The difference between this and MatchResultListener is +// that the former is concrete. +class StreamMatchResultListener : public MatchResultListener { + public: + explicit StreamMatchResultListener(::std::ostream* os) + : MatchResultListener(os) {} + + private: + GTEST_DISALLOW_COPY_AND_ASSIGN_(StreamMatchResultListener); +}; + +// An internal class for implementing Matcher, which will derive +// from it. We put functionalities common to all Matcher +// specializations here to avoid code duplication. +template +class MatcherBase { + public: + // Returns true if and only if the matcher matches x; also explains the + // match result to 'listener'. + bool MatchAndExplain(const T& x, MatchResultListener* listener) const { + return impl_->MatchAndExplain(x, listener); + } + + // Returns true if and only if this matcher matches x. + bool Matches(const T& x) const { + DummyMatchResultListener dummy; + return MatchAndExplain(x, &dummy); + } + + // Describes this matcher to an ostream. + void DescribeTo(::std::ostream* os) const { impl_->DescribeTo(os); } + + // Describes the negation of this matcher to an ostream. + void DescribeNegationTo(::std::ostream* os) const { + impl_->DescribeNegationTo(os); + } + + // Explains why x matches, or doesn't match, the matcher. + void ExplainMatchResultTo(const T& x, ::std::ostream* os) const { + StreamMatchResultListener listener(os); + MatchAndExplain(x, &listener); + } + + // Returns the describer for this matcher object; retains ownership + // of the describer, which is only guaranteed to be alive when + // this matcher object is alive. + const MatcherDescriberInterface* GetDescriber() const { + return impl_.get(); + } + + protected: + MatcherBase() {} + + // Constructs a matcher from its implementation. + explicit MatcherBase(const MatcherInterface* impl) : impl_(impl) {} + + template + explicit MatcherBase( + const MatcherInterface* impl, + typename std::enable_if::value>::type* = + nullptr) + : impl_(new internal::MatcherInterfaceAdapter(impl)) {} + + MatcherBase(const MatcherBase&) = default; + MatcherBase& operator=(const MatcherBase&) = default; + MatcherBase(MatcherBase&&) = default; + MatcherBase& operator=(MatcherBase&&) = default; + + virtual ~MatcherBase() {} + + private: + std::shared_ptr> impl_; +}; + +} // namespace internal + +// A Matcher is a copyable and IMMUTABLE (except by assignment) +// object that can check whether a value of type T matches. The +// implementation of Matcher is just a std::shared_ptr to const +// MatcherInterface. Don't inherit from Matcher! +template +class Matcher : public internal::MatcherBase { + public: + // Constructs a null matcher. Needed for storing Matcher objects in STL + // containers. A default-constructed matcher is not yet initialized. You + // cannot use it until a valid value has been assigned to it. + explicit Matcher() {} // NOLINT + + // Constructs a matcher from its implementation. + explicit Matcher(const MatcherInterface* impl) + : internal::MatcherBase(impl) {} + + template + explicit Matcher( + const MatcherInterface* impl, + typename std::enable_if::value>::type* = + nullptr) + : internal::MatcherBase(impl) {} + + // Implicit constructor here allows people to write + // EXPECT_CALL(foo, Bar(5)) instead of EXPECT_CALL(foo, Bar(Eq(5))) sometimes + Matcher(T value); // NOLINT +}; + +// The following two specializations allow the user to write str +// instead of Eq(str) and "foo" instead of Eq("foo") when a std::string +// matcher is expected. +template <> +class GTEST_API_ Matcher + : public internal::MatcherBase { + public: + Matcher() {} + + explicit Matcher(const MatcherInterface* impl) + : internal::MatcherBase(impl) {} + + // Allows the user to write str instead of Eq(str) sometimes, where + // str is a std::string object. + Matcher(const std::string& s); // NOLINT + + // Allows the user to write "foo" instead of Eq("foo") sometimes. + Matcher(const char* s); // NOLINT +}; + +template <> +class GTEST_API_ Matcher + : public internal::MatcherBase { + public: + Matcher() {} + + explicit Matcher(const MatcherInterface* impl) + : internal::MatcherBase(impl) {} + explicit Matcher(const MatcherInterface* impl) + : internal::MatcherBase(impl) {} + + // Allows the user to write str instead of Eq(str) sometimes, where + // str is a string object. + Matcher(const std::string& s); // NOLINT + + // Allows the user to write "foo" instead of Eq("foo") sometimes. + Matcher(const char* s); // NOLINT +}; + +#if GTEST_HAS_ABSL +// The following two specializations allow the user to write str +// instead of Eq(str) and "foo" instead of Eq("foo") when a absl::string_view +// matcher is expected. +template <> +class GTEST_API_ Matcher + : public internal::MatcherBase { + public: + Matcher() {} + + explicit Matcher(const MatcherInterface* impl) + : internal::MatcherBase(impl) {} + + // Allows the user to write str instead of Eq(str) sometimes, where + // str is a std::string object. + Matcher(const std::string& s); // NOLINT + + // Allows the user to write "foo" instead of Eq("foo") sometimes. + Matcher(const char* s); // NOLINT + + // Allows the user to pass absl::string_views directly. + Matcher(absl::string_view s); // NOLINT +}; + +template <> +class GTEST_API_ Matcher + : public internal::MatcherBase { + public: + Matcher() {} + + explicit Matcher(const MatcherInterface* impl) + : internal::MatcherBase(impl) {} + explicit Matcher(const MatcherInterface* impl) + : internal::MatcherBase(impl) {} + + // Allows the user to write str instead of Eq(str) sometimes, where + // str is a std::string object. + Matcher(const std::string& s); // NOLINT + + // Allows the user to write "foo" instead of Eq("foo") sometimes. + Matcher(const char* s); // NOLINT + + // Allows the user to pass absl::string_views directly. + Matcher(absl::string_view s); // NOLINT +}; +#endif // GTEST_HAS_ABSL + +// Prints a matcher in a human-readable format. +template +std::ostream& operator<<(std::ostream& os, const Matcher& matcher) { + matcher.DescribeTo(&os); + return os; +} + +// The PolymorphicMatcher class template makes it easy to implement a +// polymorphic matcher (i.e. a matcher that can match values of more +// than one type, e.g. Eq(n) and NotNull()). +// +// To define a polymorphic matcher, a user should provide an Impl +// class that has a DescribeTo() method and a DescribeNegationTo() +// method, and define a member function (or member function template) +// +// bool MatchAndExplain(const Value& value, +// MatchResultListener* listener) const; +// +// See the definition of NotNull() for a complete example. +template +class PolymorphicMatcher { + public: + explicit PolymorphicMatcher(const Impl& an_impl) : impl_(an_impl) {} + + // Returns a mutable reference to the underlying matcher + // implementation object. + Impl& mutable_impl() { return impl_; } + + // Returns an immutable reference to the underlying matcher + // implementation object. + const Impl& impl() const { return impl_; } + + template + operator Matcher() const { + return Matcher(new MonomorphicImpl(impl_)); + } + + private: + template + class MonomorphicImpl : public MatcherInterface { + public: + explicit MonomorphicImpl(const Impl& impl) : impl_(impl) {} + + virtual void DescribeTo(::std::ostream* os) const { impl_.DescribeTo(os); } + + virtual void DescribeNegationTo(::std::ostream* os) const { + impl_.DescribeNegationTo(os); + } + + virtual bool MatchAndExplain(T x, MatchResultListener* listener) const { + return impl_.MatchAndExplain(x, listener); + } + + private: + const Impl impl_; + }; + + Impl impl_; +}; + +// Creates a matcher from its implementation. +// DEPRECATED: Especially in the generic code, prefer: +// Matcher(new MyMatcherImpl(...)); +// +// MakeMatcher may create a Matcher that accepts its argument by value, which +// leads to unnecessary copies & lack of support for non-copyable types. +template +inline Matcher MakeMatcher(const MatcherInterface* impl) { + return Matcher(impl); +} + +// Creates a polymorphic matcher from its implementation. This is +// easier to use than the PolymorphicMatcher constructor as it +// doesn't require you to explicitly write the template argument, e.g. +// +// MakePolymorphicMatcher(foo); +// vs +// PolymorphicMatcher(foo); +template +inline PolymorphicMatcher MakePolymorphicMatcher(const Impl& impl) { + return PolymorphicMatcher(impl); +} + +namespace internal { +// Implements a matcher that compares a given value with a +// pre-supplied value using one of the ==, <=, <, etc, operators. The +// two values being compared don't have to have the same type. +// +// The matcher defined here is polymorphic (for example, Eq(5) can be +// used to match an int, a short, a double, etc). Therefore we use +// a template type conversion operator in the implementation. +// +// The following template definition assumes that the Rhs parameter is +// a "bare" type (i.e. neither 'const T' nor 'T&'). +template +class ComparisonBase { + public: + explicit ComparisonBase(const Rhs& rhs) : rhs_(rhs) {} + template + operator Matcher() const { + return Matcher(new Impl(rhs_)); + } + + private: + template + static const T& Unwrap(const T& v) { return v; } + template + static const T& Unwrap(std::reference_wrapper v) { return v; } + + template + class Impl : public MatcherInterface { + public: + explicit Impl(const Rhs& rhs) : rhs_(rhs) {} + bool MatchAndExplain(Lhs lhs, + MatchResultListener* /* listener */) const override { + return Op()(lhs, Unwrap(rhs_)); + } + void DescribeTo(::std::ostream* os) const override { + *os << D::Desc() << " "; + UniversalPrint(Unwrap(rhs_), os); + } + void DescribeNegationTo(::std::ostream* os) const override { + *os << D::NegatedDesc() << " "; + UniversalPrint(Unwrap(rhs_), os); + } + + private: + Rhs rhs_; + }; + Rhs rhs_; +}; + +template +class EqMatcher : public ComparisonBase, Rhs, AnyEq> { + public: + explicit EqMatcher(const Rhs& rhs) + : ComparisonBase, Rhs, AnyEq>(rhs) { } + static const char* Desc() { return "is equal to"; } + static const char* NegatedDesc() { return "isn't equal to"; } +}; +template +class NeMatcher : public ComparisonBase, Rhs, AnyNe> { + public: + explicit NeMatcher(const Rhs& rhs) + : ComparisonBase, Rhs, AnyNe>(rhs) { } + static const char* Desc() { return "isn't equal to"; } + static const char* NegatedDesc() { return "is equal to"; } +}; +template +class LtMatcher : public ComparisonBase, Rhs, AnyLt> { + public: + explicit LtMatcher(const Rhs& rhs) + : ComparisonBase, Rhs, AnyLt>(rhs) { } + static const char* Desc() { return "is <"; } + static const char* NegatedDesc() { return "isn't <"; } +}; +template +class GtMatcher : public ComparisonBase, Rhs, AnyGt> { + public: + explicit GtMatcher(const Rhs& rhs) + : ComparisonBase, Rhs, AnyGt>(rhs) { } + static const char* Desc() { return "is >"; } + static const char* NegatedDesc() { return "isn't >"; } +}; +template +class LeMatcher : public ComparisonBase, Rhs, AnyLe> { + public: + explicit LeMatcher(const Rhs& rhs) + : ComparisonBase, Rhs, AnyLe>(rhs) { } + static const char* Desc() { return "is <="; } + static const char* NegatedDesc() { return "isn't <="; } +}; +template +class GeMatcher : public ComparisonBase, Rhs, AnyGe> { + public: + explicit GeMatcher(const Rhs& rhs) + : ComparisonBase, Rhs, AnyGe>(rhs) { } + static const char* Desc() { return "is >="; } + static const char* NegatedDesc() { return "isn't >="; } +}; + +// Implements polymorphic matchers MatchesRegex(regex) and +// ContainsRegex(regex), which can be used as a Matcher as long as +// T can be converted to a string. +class MatchesRegexMatcher { + public: + MatchesRegexMatcher(const RE* regex, bool full_match) + : regex_(regex), full_match_(full_match) {} + +#if GTEST_HAS_ABSL + bool MatchAndExplain(const absl::string_view& s, + MatchResultListener* listener) const { + return MatchAndExplain(std::string(s), listener); + } +#endif // GTEST_HAS_ABSL + + // Accepts pointer types, particularly: + // const char* + // char* + // const wchar_t* + // wchar_t* + template + bool MatchAndExplain(CharType* s, MatchResultListener* listener) const { + return s != nullptr && MatchAndExplain(std::string(s), listener); + } + + // Matches anything that can convert to std::string. + // + // This is a template, not just a plain function with const std::string&, + // because absl::string_view has some interfering non-explicit constructors. + template + bool MatchAndExplain(const MatcheeStringType& s, + MatchResultListener* /* listener */) const { + const std::string& s2(s); + return full_match_ ? RE::FullMatch(s2, *regex_) + : RE::PartialMatch(s2, *regex_); + } + + void DescribeTo(::std::ostream* os) const { + *os << (full_match_ ? "matches" : "contains") << " regular expression "; + UniversalPrinter::Print(regex_->pattern(), os); + } + + void DescribeNegationTo(::std::ostream* os) const { + *os << "doesn't " << (full_match_ ? "match" : "contain") + << " regular expression "; + UniversalPrinter::Print(regex_->pattern(), os); + } + + private: + const std::shared_ptr regex_; + const bool full_match_; +}; +} // namespace internal + +// Matches a string that fully matches regular expression 'regex'. +// The matcher takes ownership of 'regex'. +inline PolymorphicMatcher MatchesRegex( + const internal::RE* regex) { + return MakePolymorphicMatcher(internal::MatchesRegexMatcher(regex, true)); +} +inline PolymorphicMatcher MatchesRegex( + const std::string& regex) { + return MatchesRegex(new internal::RE(regex)); +} + +// Matches a string that contains regular expression 'regex'. +// The matcher takes ownership of 'regex'. +inline PolymorphicMatcher ContainsRegex( + const internal::RE* regex) { + return MakePolymorphicMatcher(internal::MatchesRegexMatcher(regex, false)); +} +inline PolymorphicMatcher ContainsRegex( + const std::string& regex) { + return ContainsRegex(new internal::RE(regex)); +} + +// Creates a polymorphic matcher that matches anything equal to x. +// Note: if the parameter of Eq() were declared as const T&, Eq("foo") +// wouldn't compile. +template +inline internal::EqMatcher Eq(T x) { return internal::EqMatcher(x); } + +// Constructs a Matcher from a 'value' of type T. The constructed +// matcher matches any value that's equal to 'value'. +template +Matcher::Matcher(T value) { *this = Eq(value); } + +// Creates a monomorphic matcher that matches anything with type Lhs +// and equal to rhs. A user may need to use this instead of Eq(...) +// in order to resolve an overloading ambiguity. +// +// TypedEq(x) is just a convenient short-hand for Matcher(Eq(x)) +// or Matcher(x), but more readable than the latter. +// +// We could define similar monomorphic matchers for other comparison +// operations (e.g. TypedLt, TypedGe, and etc), but decided not to do +// it yet as those are used much less than Eq() in practice. A user +// can always write Matcher(Lt(5)) to be explicit about the type, +// for example. +template +inline Matcher TypedEq(const Rhs& rhs) { return Eq(rhs); } + +// Creates a polymorphic matcher that matches anything >= x. +template +inline internal::GeMatcher Ge(Rhs x) { + return internal::GeMatcher(x); +} + +// Creates a polymorphic matcher that matches anything > x. +template +inline internal::GtMatcher Gt(Rhs x) { + return internal::GtMatcher(x); +} + +// Creates a polymorphic matcher that matches anything <= x. +template +inline internal::LeMatcher Le(Rhs x) { + return internal::LeMatcher(x); +} + +// Creates a polymorphic matcher that matches anything < x. +template +inline internal::LtMatcher Lt(Rhs x) { + return internal::LtMatcher(x); +} + +// Creates a polymorphic matcher that matches anything != x. +template +inline internal::NeMatcher Ne(Rhs x) { + return internal::NeMatcher(x); +} +} // namespace testing + +GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 5046 + +#endif // GTEST_INCLUDE_GTEST_GTEST_MATCHERS_H_ diff --git a/third_party/googletest/src/include/gtest/gtest-message.h b/third_party/googletest/src/include/gtest/gtest-message.h index 5ca041614c..4a80e11e6b 100644 --- a/third_party/googletest/src/include/gtest/gtest-message.h +++ b/third_party/googletest/src/include/gtest/gtest-message.h @@ -48,6 +48,7 @@ #define GTEST_INCLUDE_GTEST_GTEST_MESSAGE_H_ #include +#include #include "gtest/internal/gtest-port.h" @@ -106,14 +107,6 @@ class GTEST_API_ Message { *ss_ << str; } -#if GTEST_OS_SYMBIAN - // Streams a value (either a pointer or not) to this object. - template - inline Message& operator <<(const T& value) { - StreamHelper(typename internal::is_pointer::type(), value); - return *this; - } -#else // Streams a non-pointer value to this object. template inline Message& operator <<(const T& val) { @@ -151,14 +144,13 @@ class GTEST_API_ Message { // as "(null)". template inline Message& operator <<(T* const& pointer) { // NOLINT - if (pointer == NULL) { + if (pointer == nullptr) { *ss_ << "(null)"; } else { *ss_ << pointer; } return *this; } -#endif // GTEST_OS_SYMBIAN // Since the basic IO manipulators are overloaded for both narrow // and wide streams, we have to provide this specialized definition @@ -187,12 +179,6 @@ class GTEST_API_ Message { Message& operator <<(const ::std::wstring& wstr); #endif // GTEST_HAS_STD_WSTRING -#if GTEST_HAS_GLOBAL_WSTRING - // Converts the given wide string to a narrow string using the UTF-8 - // encoding, and streams the result to this Message object. - Message& operator <<(const ::wstring& wstr); -#endif // GTEST_HAS_GLOBAL_WSTRING - // Gets the text streamed to this object so far as an std::string. // Each '\0' character in the buffer is replaced with "\\0". // @@ -200,31 +186,8 @@ class GTEST_API_ Message { std::string GetString() const; private: -#if GTEST_OS_SYMBIAN - // These are needed as the Nokia Symbian Compiler cannot decide between - // const T& and const T* in a function template. The Nokia compiler _can_ - // decide between class template specializations for T and T*, so a - // tr1::type_traits-like is_pointer works, and we can overload on that. - template - inline void StreamHelper(internal::true_type /*is_pointer*/, T* pointer) { - if (pointer == NULL) { - *ss_ << "(null)"; - } else { - *ss_ << pointer; - } - } - template - inline void StreamHelper(internal::false_type /*is_pointer*/, - const T& value) { - // See the comments in Message& operator <<(const T&) above for why - // we need this using statement. - using ::operator <<; - *ss_ << value; - } -#endif // GTEST_OS_SYMBIAN - // We'll hold the text streamed to this object here. - const internal::scoped_ptr< ::std::stringstream> ss_; + const std::unique_ptr< ::std::stringstream> ss_; // We declare (but don't implement) this to prevent the compiler // from implementing the assignment operator. diff --git a/third_party/googletest/src/include/gtest/gtest-param-test.h b/third_party/googletest/src/include/gtest/gtest-param-test.h index 3e95e4390e..c2e6eae3d8 100644 --- a/third_party/googletest/src/include/gtest/gtest-param-test.h +++ b/third_party/googletest/src/include/gtest/gtest-param-test.h @@ -1,7 +1,3 @@ -// This file was GENERATED by command: -// pump.py gtest-param-test.h.pump -// DO NOT EDIT BY HAND!!! - // Copyright 2008, Google Inc. // All rights reserved. // @@ -75,7 +71,7 @@ TEST_P(FooTest, HasBlahBlah) { ... } -// Finally, you can use INSTANTIATE_TEST_CASE_P to instantiate the test +// Finally, you can use INSTANTIATE_TEST_SUITE_P to instantiate the test // case with any set of parameters you want. Google Test defines a number // of functions for generating test parameters. They return what we call // (surprise!) parameter generators. Here is a summary of them, which @@ -96,17 +92,17 @@ TEST_P(FooTest, HasBlahBlah) { // For more details, see comments at the definitions of these functions below // in this file. // -// The following statement will instantiate tests from the FooTest test case +// The following statement will instantiate tests from the FooTest test suite // each with parameter values "meeny", "miny", and "moe". -INSTANTIATE_TEST_CASE_P(InstantiationName, - FooTest, - Values("meeny", "miny", "moe")); +INSTANTIATE_TEST_SUITE_P(InstantiationName, + FooTest, + Values("meeny", "miny", "moe")); // To distinguish different instances of the pattern, (yes, you -// can instantiate it more then once) the first argument to the -// INSTANTIATE_TEST_CASE_P macro is a prefix that will be added to the -// actual test case name. Remember to pick unique prefixes for different +// can instantiate it more than once) the first argument to the +// INSTANTIATE_TEST_SUITE_P macro is a prefix that will be added to the +// actual test suite name. Remember to pick unique prefixes for different // instantiations. The tests from the instantiation above will have // these names: // @@ -123,7 +119,7 @@ INSTANTIATE_TEST_CASE_P(InstantiationName, // with parameter values "cat" and "dog": const char* pets[] = {"cat", "dog"}; -INSTANTIATE_TEST_CASE_P(AnotherInstantiationName, FooTest, ValuesIn(pets)); +INSTANTIATE_TEST_SUITE_P(AnotherInstantiationName, FooTest, ValuesIn(pets)); // The tests from the instantiation above will have these names: // @@ -132,9 +128,9 @@ INSTANTIATE_TEST_CASE_P(AnotherInstantiationName, FooTest, ValuesIn(pets)); // * AnotherInstantiationName/FooTest.HasBlahBlah/0 for "cat" // * AnotherInstantiationName/FooTest.HasBlahBlah/1 for "dog" // -// Please note that INSTANTIATE_TEST_CASE_P will instantiate all tests -// in the given test case, whether their definitions come before or -// AFTER the INSTANTIATE_TEST_CASE_P statement. +// Please note that INSTANTIATE_TEST_SUITE_P will instantiate all tests +// in the given test suite, whether their definitions come before or +// AFTER the INSTANTIATE_TEST_SUITE_P statement. // // Please also note that generator expressions (including parameters to the // generators) are evaluated in InitGoogleTest(), after main() has started. @@ -178,26 +174,23 @@ TEST_P(DerivedTest, DoesBlah) { #endif // 0 -#include "gtest/internal/gtest-port.h" - -#if !GTEST_OS_SYMBIAN -# include -#endif +#include +#include #include "gtest/internal/gtest-internal.h" #include "gtest/internal/gtest-param-util.h" -#include "gtest/internal/gtest-param-util-generated.h" +#include "gtest/internal/gtest-port.h" namespace testing { // Functions producing parameter generators. // // Google Test uses these generators to produce parameters for value- -// parameterized tests. When a parameterized test case is instantiated +// parameterized tests. When a parameterized test suite is instantiated // with a particular generator, Google Test creates and runs tests // for each element in the sequence produced by the generator. // -// In the following sample, tests from test case FooTest are instantiated +// In the following sample, tests from test suite FooTest are instantiated // each three times with parameter values 3, 5, and 8: // // class FooTest : public TestWithParam { ... }; @@ -206,7 +199,7 @@ namespace testing { // } // TEST_P(FooTest, TestThat) { // } -// INSTANTIATE_TEST_CASE_P(TestSequence, FooTest, Values(3, 5, 8)); +// INSTANTIATE_TEST_SUITE_P(TestSequence, FooTest, Values(3, 5, 8)); // // Range() returns generators providing sequences of values in a range. @@ -263,13 +256,13 @@ internal::ParamGenerator Range(T start, T end) { // // Examples: // -// This instantiates tests from test case StringTest +// This instantiates tests from test suite StringTest // each with C-string values of "foo", "bar", and "baz": // // const char* strings[] = {"foo", "bar", "baz"}; -// INSTANTIATE_TEST_CASE_P(StringSequence, StringTest, ValuesIn(strings)); +// INSTANTIATE_TEST_SUITE_P(StringSequence, StringTest, ValuesIn(strings)); // -// This instantiates tests from test case StlStringTest +// This instantiates tests from test suite StlStringTest // each with STL strings with values "a" and "b": // // ::std::vector< ::std::string> GetParameterStrings() { @@ -279,9 +272,9 @@ internal::ParamGenerator Range(T start, T end) { // return v; // } // -// INSTANTIATE_TEST_CASE_P(CharSequence, -// StlStringTest, -// ValuesIn(GetParameterStrings())); +// INSTANTIATE_TEST_SUITE_P(CharSequence, +// StlStringTest, +// ValuesIn(GetParameterStrings())); // // // This will also instantiate tests from CharTest @@ -294,16 +287,15 @@ internal::ParamGenerator Range(T start, T end) { // return list; // } // ::std::list l = GetParameterChars(); -// INSTANTIATE_TEST_CASE_P(CharSequence2, -// CharTest, -// ValuesIn(l.begin(), l.end())); +// INSTANTIATE_TEST_SUITE_P(CharSequence2, +// CharTest, +// ValuesIn(l.begin(), l.end())); // template internal::ParamGenerator< - typename ::testing::internal::IteratorTraits::value_type> + typename std::iterator_traits::value_type> ValuesIn(ForwardIterator begin, ForwardIterator end) { - typedef typename ::testing::internal::IteratorTraits - ::value_type ParamType; + typedef typename std::iterator_traits::value_type ParamType; return internal::ParamGenerator( new internal::ValuesInIteratorRangeGenerator(begin, end)); } @@ -326,869 +318,22 @@ internal::ParamGenerator ValuesIn( // Values(T v1, T v2, ..., T vN) // - returns a generator producing sequences with elements v1, v2, ..., vN. // -// For example, this instantiates tests from test case BarTest each +// For example, this instantiates tests from test suite BarTest each // with values "one", "two", and "three": // -// INSTANTIATE_TEST_CASE_P(NumSequence, BarTest, Values("one", "two", "three")); +// INSTANTIATE_TEST_SUITE_P(NumSequence, +// BarTest, +// Values("one", "two", "three")); // -// This instantiates tests from test case BazTest each with values 1, 2, 3.5. +// This instantiates tests from test suite BazTest each with values 1, 2, 3.5. // The exact type of values will depend on the type of parameter in BazTest. // -// INSTANTIATE_TEST_CASE_P(FloatingNumbers, BazTest, Values(1, 2, 3.5)); +// INSTANTIATE_TEST_SUITE_P(FloatingNumbers, BazTest, Values(1, 2, 3.5)); // -// Currently, Values() supports from 1 to 50 parameters. // -template -internal::ValueArray1 Values(T1 v1) { - return internal::ValueArray1(v1); -} - -template -internal::ValueArray2 Values(T1 v1, T2 v2) { - return internal::ValueArray2(v1, v2); -} - -template -internal::ValueArray3 Values(T1 v1, T2 v2, T3 v3) { - return internal::ValueArray3(v1, v2, v3); -} - -template -internal::ValueArray4 Values(T1 v1, T2 v2, T3 v3, T4 v4) { - return internal::ValueArray4(v1, v2, v3, v4); -} - -template -internal::ValueArray5 Values(T1 v1, T2 v2, T3 v3, T4 v4, - T5 v5) { - return internal::ValueArray5(v1, v2, v3, v4, v5); -} - -template -internal::ValueArray6 Values(T1 v1, T2 v2, T3 v3, - T4 v4, T5 v5, T6 v6) { - return internal::ValueArray6(v1, v2, v3, v4, v5, v6); -} - -template -internal::ValueArray7 Values(T1 v1, T2 v2, T3 v3, - T4 v4, T5 v5, T6 v6, T7 v7) { - return internal::ValueArray7(v1, v2, v3, v4, v5, - v6, v7); -} - -template -internal::ValueArray8 Values(T1 v1, T2 v2, - T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8) { - return internal::ValueArray8(v1, v2, v3, v4, - v5, v6, v7, v8); -} - -template -internal::ValueArray9 Values(T1 v1, T2 v2, - T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9) { - return internal::ValueArray9(v1, v2, v3, - v4, v5, v6, v7, v8, v9); -} - -template -internal::ValueArray10 Values(T1 v1, - T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10) { - return internal::ValueArray10(v1, - v2, v3, v4, v5, v6, v7, v8, v9, v10); -} - -template -internal::ValueArray11 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11) { - return internal::ValueArray11(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11); -} - -template -internal::ValueArray12 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12) { - return internal::ValueArray12(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12); -} - -template -internal::ValueArray13 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13) { - return internal::ValueArray13(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13); -} - -template -internal::ValueArray14 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14) { - return internal::ValueArray14(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, - v14); -} - -template -internal::ValueArray15 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, - T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15) { - return internal::ValueArray15(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, - v13, v14, v15); -} - -template -internal::ValueArray16 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, - T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, - T16 v16) { - return internal::ValueArray16(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, - v12, v13, v14, v15, v16); -} - -template -internal::ValueArray17 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, - T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, - T16 v16, T17 v17) { - return internal::ValueArray17(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, - v11, v12, v13, v14, v15, v16, v17); -} - -template -internal::ValueArray18 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, - T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, - T16 v16, T17 v17, T18 v18) { - return internal::ValueArray18(v1, v2, v3, v4, v5, v6, v7, v8, v9, - v10, v11, v12, v13, v14, v15, v16, v17, v18); -} - -template -internal::ValueArray19 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, - T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, - T15 v15, T16 v16, T17 v17, T18 v18, T19 v19) { - return internal::ValueArray19(v1, v2, v3, v4, v5, v6, v7, v8, - v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19); -} - -template -internal::ValueArray20 Values(T1 v1, T2 v2, T3 v3, T4 v4, - T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, - T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20) { - return internal::ValueArray20(v1, v2, v3, v4, v5, v6, v7, - v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20); -} - -template -internal::ValueArray21 Values(T1 v1, T2 v2, T3 v3, T4 v4, - T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, - T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21) { - return internal::ValueArray21(v1, v2, v3, v4, v5, v6, - v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21); -} - -template -internal::ValueArray22 Values(T1 v1, T2 v2, T3 v3, - T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, - T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, - T21 v21, T22 v22) { - return internal::ValueArray22(v1, v2, v3, v4, - v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, - v20, v21, v22); -} - -template -internal::ValueArray23 Values(T1 v1, T2 v2, - T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, - T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, - T21 v21, T22 v22, T23 v23) { - return internal::ValueArray23(v1, v2, v3, - v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, - v20, v21, v22, v23); -} - -template -internal::ValueArray24 Values(T1 v1, T2 v2, - T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, - T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, - T21 v21, T22 v22, T23 v23, T24 v24) { - return internal::ValueArray24(v1, v2, - v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, - v19, v20, v21, v22, v23, v24); -} - -template -internal::ValueArray25 Values(T1 v1, - T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, - T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, - T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25) { - return internal::ValueArray25(v1, - v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, - v18, v19, v20, v21, v22, v23, v24, v25); -} - -template -internal::ValueArray26 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26) { - return internal::ValueArray26(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, - v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26); -} - -template -internal::ValueArray27 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27) { - return internal::ValueArray27(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, - v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27); -} - -template -internal::ValueArray28 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28) { - return internal::ValueArray28(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, - v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, - v28); -} - -template -internal::ValueArray29 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29) { - return internal::ValueArray29(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, - v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, - v27, v28, v29); -} - -template -internal::ValueArray30 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, - T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, - T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, - T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30) { - return internal::ValueArray30(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, - v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, - v26, v27, v28, v29, v30); -} - -template -internal::ValueArray31 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, - T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, - T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, - T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31) { - return internal::ValueArray31(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, - v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, - v25, v26, v27, v28, v29, v30, v31); -} - -template -internal::ValueArray32 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, - T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, - T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, - T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, - T32 v32) { - return internal::ValueArray32(v1, v2, v3, v4, v5, v6, v7, v8, v9, - v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, - v24, v25, v26, v27, v28, v29, v30, v31, v32); -} - -template -internal::ValueArray33 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, - T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, - T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, - T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, - T32 v32, T33 v33) { - return internal::ValueArray33(v1, v2, v3, v4, v5, v6, v7, v8, - v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, - v24, v25, v26, v27, v28, v29, v30, v31, v32, v33); -} - -template -internal::ValueArray34 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, - T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, - T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, - T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, - T31 v31, T32 v32, T33 v33, T34 v34) { - return internal::ValueArray34(v1, v2, v3, v4, v5, v6, v7, - v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, - v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34); -} - -template -internal::ValueArray35 Values(T1 v1, T2 v2, T3 v3, T4 v4, - T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, - T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, - T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, - T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35) { - return internal::ValueArray35(v1, v2, v3, v4, v5, v6, - v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, - v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35); -} - -template -internal::ValueArray36 Values(T1 v1, T2 v2, T3 v3, T4 v4, - T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, - T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, - T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, - T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36) { - return internal::ValueArray36(v1, v2, v3, v4, - v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, - v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, - v34, v35, v36); -} - -template -internal::ValueArray37 Values(T1 v1, T2 v2, T3 v3, - T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, - T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, - T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, - T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, - T37 v37) { - return internal::ValueArray37(v1, v2, v3, - v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, - v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, - v34, v35, v36, v37); -} - -template -internal::ValueArray38 Values(T1 v1, T2 v2, - T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, - T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, - T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, - T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, - T37 v37, T38 v38) { - return internal::ValueArray38(v1, v2, - v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, - v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, - v33, v34, v35, v36, v37, v38); -} - -template -internal::ValueArray39 Values(T1 v1, T2 v2, - T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, - T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, - T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, - T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, - T37 v37, T38 v38, T39 v39) { - return internal::ValueArray39(v1, - v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, - v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, - v32, v33, v34, v35, v36, v37, v38, v39); -} - -template -internal::ValueArray40 Values(T1 v1, - T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, - T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, - T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, - T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, - T36 v36, T37 v37, T38 v38, T39 v39, T40 v40) { - return internal::ValueArray40(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, - v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, - v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40); -} - -template -internal::ValueArray41 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41) { - return internal::ValueArray41(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, - v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, - v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41); -} - -template -internal::ValueArray42 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, - T42 v42) { - return internal::ValueArray42(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, - v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, - v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, - v42); -} - -template -internal::ValueArray43 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, - T42 v42, T43 v43) { - return internal::ValueArray43(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, - v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, - v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, - v41, v42, v43); -} - -template -internal::ValueArray44 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, - T42 v42, T43 v43, T44 v44) { - return internal::ValueArray44(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, - v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, - v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, - v40, v41, v42, v43, v44); -} - -template -internal::ValueArray45 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, - T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, - T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, - T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, - T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, - T41 v41, T42 v42, T43 v43, T44 v44, T45 v45) { - return internal::ValueArray45(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, - v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, - v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, - v39, v40, v41, v42, v43, v44, v45); -} - -template -internal::ValueArray46 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, - T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, - T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, - T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, - T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, - T40 v40, T41 v41, T42 v42, T43 v43, T44 v44, T45 v45, T46 v46) { - return internal::ValueArray46(v1, v2, v3, v4, v5, v6, v7, v8, v9, - v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, - v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, - v38, v39, v40, v41, v42, v43, v44, v45, v46); -} - -template -internal::ValueArray47 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, - T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, - T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, - T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, - T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, - T40 v40, T41 v41, T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47) { - return internal::ValueArray47(v1, v2, v3, v4, v5, v6, v7, v8, - v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, - v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, - v38, v39, v40, v41, v42, v43, v44, v45, v46, v47); -} - -template -internal::ValueArray48 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, - T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, - T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, - T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, - T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, - T40 v40, T41 v41, T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47, - T48 v48) { - return internal::ValueArray48(v1, v2, v3, v4, v5, v6, v7, - v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, - v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, - v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48); -} - -template -internal::ValueArray49 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, - T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, - T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, - T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, - T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, - T39 v39, T40 v40, T41 v41, T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, - T47 v47, T48 v48, T49 v49) { - return internal::ValueArray49(v1, v2, v3, v4, v5, v6, - v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, - v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, - v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49); -} - -template -internal::ValueArray50 Values(T1 v1, T2 v2, T3 v3, T4 v4, - T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, - T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, - T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, - T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, - T38 v38, T39 v39, T40 v40, T41 v41, T42 v42, T43 v43, T44 v44, T45 v45, - T46 v46, T47 v47, T48 v48, T49 v49, T50 v50) { - return internal::ValueArray50(v1, v2, v3, v4, - v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, - v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, - v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, - v48, v49, v50); +template +internal::ValueArray Values(T... v) { + return internal::ValueArray(std::move(v)...); } // Bool() allows generating tests with parameters in a set of (false, true). @@ -1201,7 +346,7 @@ internal::ValueArray50 { @@ -1209,13 +354,12 @@ internal::ValueArray50 Bool() { return Values(false, true); } -# if GTEST_HAS_COMBINE // Combine() allows the user to combine two or more sequences to produce // values of a Cartesian product of those sequences' elements. // @@ -1224,184 +368,83 @@ inline internal::ParamGenerator Bool() { // - returns a generator producing sequences with elements coming from // the Cartesian product of elements from the sequences generated by // gen1, gen2, ..., genN. The sequence elements will have a type of -// tuple where T1, T2, ..., TN are the types +// std::tuple where T1, T2, ..., TN are the types // of elements from sequences produces by gen1, gen2, ..., genN. // -// Combine can have up to 10 arguments. This number is currently limited -// by the maximum number of elements in the tuple implementation used by Google -// Test. +// Combine can have up to 10 arguments. // // Example: // -// This will instantiate tests in test case AnimalTest each one with +// This will instantiate tests in test suite AnimalTest each one with // the parameter values tuple("cat", BLACK), tuple("cat", WHITE), // tuple("dog", BLACK), and tuple("dog", WHITE): // // enum Color { BLACK, GRAY, WHITE }; // class AnimalTest -// : public testing::TestWithParam > {...}; +// : public testing::TestWithParam > {...}; // // TEST_P(AnimalTest, AnimalLooksNice) {...} // -// INSTANTIATE_TEST_CASE_P(AnimalVariations, AnimalTest, -// Combine(Values("cat", "dog"), -// Values(BLACK, WHITE))); +// INSTANTIATE_TEST_SUITE_P(AnimalVariations, AnimalTest, +// Combine(Values("cat", "dog"), +// Values(BLACK, WHITE))); // // This will instantiate tests in FlagDependentTest with all variations of two // Boolean flags: // // class FlagDependentTest -// : public testing::TestWithParam > { +// : public testing::TestWithParam > { // virtual void SetUp() { // // Assigns external_flag_1 and external_flag_2 values from the tuple. -// tie(external_flag_1, external_flag_2) = GetParam(); +// std::tie(external_flag_1, external_flag_2) = GetParam(); // } // }; // // TEST_P(FlagDependentTest, TestFeature1) { // // Test your code using external_flag_1 and external_flag_2 here. // } -// INSTANTIATE_TEST_CASE_P(TwoBoolSequence, FlagDependentTest, -// Combine(Bool(), Bool())); -// -template -internal::CartesianProductHolder2 Combine( - const Generator1& g1, const Generator2& g2) { - return internal::CartesianProductHolder2( - g1, g2); -} - -template -internal::CartesianProductHolder3 Combine( - const Generator1& g1, const Generator2& g2, const Generator3& g3) { - return internal::CartesianProductHolder3( - g1, g2, g3); -} - -template -internal::CartesianProductHolder4 Combine( - const Generator1& g1, const Generator2& g2, const Generator3& g3, - const Generator4& g4) { - return internal::CartesianProductHolder4( - g1, g2, g3, g4); -} - -template -internal::CartesianProductHolder5 Combine( - const Generator1& g1, const Generator2& g2, const Generator3& g3, - const Generator4& g4, const Generator5& g5) { - return internal::CartesianProductHolder5( - g1, g2, g3, g4, g5); -} - -template -internal::CartesianProductHolder6 Combine( - const Generator1& g1, const Generator2& g2, const Generator3& g3, - const Generator4& g4, const Generator5& g5, const Generator6& g6) { - return internal::CartesianProductHolder6( - g1, g2, g3, g4, g5, g6); -} - -template -internal::CartesianProductHolder7 Combine( - const Generator1& g1, const Generator2& g2, const Generator3& g3, - const Generator4& g4, const Generator5& g5, const Generator6& g6, - const Generator7& g7) { - return internal::CartesianProductHolder7( - g1, g2, g3, g4, g5, g6, g7); -} - -template -internal::CartesianProductHolder8 Combine( - const Generator1& g1, const Generator2& g2, const Generator3& g3, - const Generator4& g4, const Generator5& g5, const Generator6& g6, - const Generator7& g7, const Generator8& g8) { - return internal::CartesianProductHolder8( - g1, g2, g3, g4, g5, g6, g7, g8); -} - -template -internal::CartesianProductHolder9 Combine( - const Generator1& g1, const Generator2& g2, const Generator3& g3, - const Generator4& g4, const Generator5& g5, const Generator6& g6, - const Generator7& g7, const Generator8& g8, const Generator9& g9) { - return internal::CartesianProductHolder9( - g1, g2, g3, g4, g5, g6, g7, g8, g9); -} - -template -internal::CartesianProductHolder10 Combine( - const Generator1& g1, const Generator2& g2, const Generator3& g3, - const Generator4& g4, const Generator5& g5, const Generator6& g6, - const Generator7& g7, const Generator8& g8, const Generator9& g9, - const Generator10& g10) { - return internal::CartesianProductHolder10( - g1, g2, g3, g4, g5, g6, g7, g8, g9, g10); -} -# endif // GTEST_HAS_COMBINE - -# define TEST_P(test_case_name, test_name) \ - class GTEST_TEST_CLASS_NAME_(test_case_name, test_name) \ - : public test_case_name { \ - public: \ - GTEST_TEST_CLASS_NAME_(test_case_name, test_name)() {} \ - virtual void TestBody(); \ - private: \ - static int AddToRegistry() { \ - ::testing::UnitTest::GetInstance()->parameterized_test_registry(). \ - GetTestCasePatternHolder(\ - #test_case_name, \ - ::testing::internal::CodeLocation(\ - __FILE__, __LINE__))->AddTestPattern(\ - GTEST_STRINGIFY_(test_case_name), \ - GTEST_STRINGIFY_(test_name), \ - new ::testing::internal::TestMetaFactory< \ - GTEST_TEST_CLASS_NAME_(\ - test_case_name, test_name)>()); \ - return 0; \ - } \ - static int gtest_registering_dummy_ GTEST_ATTRIBUTE_UNUSED_; \ - GTEST_DISALLOW_COPY_AND_ASSIGN_(\ - GTEST_TEST_CLASS_NAME_(test_case_name, test_name)); \ - }; \ - int GTEST_TEST_CLASS_NAME_(test_case_name, \ - test_name)::gtest_registering_dummy_ = \ - GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::AddToRegistry(); \ - void GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::TestBody() - -// The optional last argument to INSTANTIATE_TEST_CASE_P allows the user -// to specify a function or functor that generates custom test name suffixes -// based on the test parameters. The function should accept one argument of -// type testing::TestParamInfo, and return std::string. +// INSTANTIATE_TEST_SUITE_P(TwoBoolSequence, FlagDependentTest, +// Combine(Bool(), Bool())); +// +template +internal::CartesianProductHolder Combine(const Generator&... g) { + return internal::CartesianProductHolder(g...); +} + +#define TEST_P(test_suite_name, test_name) \ + class GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) \ + : public test_suite_name { \ + public: \ + GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)() {} \ + virtual void TestBody(); \ + \ + private: \ + static int AddToRegistry() { \ + ::testing::UnitTest::GetInstance() \ + ->parameterized_test_registry() \ + .GetTestSuitePatternHolder( \ + #test_suite_name, \ + ::testing::internal::CodeLocation(__FILE__, __LINE__)) \ + ->AddTestPattern( \ + GTEST_STRINGIFY_(test_suite_name), GTEST_STRINGIFY_(test_name), \ + new ::testing::internal::TestMetaFactory()); \ + return 0; \ + } \ + static int gtest_registering_dummy_ GTEST_ATTRIBUTE_UNUSED_; \ + GTEST_DISALLOW_COPY_AND_ASSIGN_(GTEST_TEST_CLASS_NAME_(test_suite_name, \ + test_name)); \ + }; \ + int GTEST_TEST_CLASS_NAME_(test_suite_name, \ + test_name)::gtest_registering_dummy_ = \ + GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)::AddToRegistry(); \ + void GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)::TestBody() + +// The last argument to INSTANTIATE_TEST_SUITE_P allows the user to specify +// generator and an optional function or functor that generates custom test name +// suffixes based on the test parameters. Such a function or functor should +// accept one argument of type testing::TestParamInfo, and +// return std::string. // // testing::PrintToStringParamName is a builtin test suffix generator that // returns the value of testing::PrintToString(GetParam()). @@ -1410,24 +453,50 @@ internal::CartesianProductHolder10 \ - gtest_##prefix##test_case_name##_EvalGenerator_() { return generator; } \ - static ::std::string gtest_##prefix##test_case_name##_EvalGenerateName_( \ - const ::testing::TestParamInfo& info) { \ - return ::testing::internal::GetParamNameGen \ - (__VA_ARGS__)(info); \ - } \ - static int gtest_##prefix##test_case_name##_dummy_ GTEST_ATTRIBUTE_UNUSED_ = \ - ::testing::UnitTest::GetInstance()->parameterized_test_registry(). \ - GetTestCasePatternHolder(\ - #test_case_name, \ - ::testing::internal::CodeLocation(\ - __FILE__, __LINE__))->AddTestCaseInstantiation(\ - #prefix, \ - >est_##prefix##test_case_name##_EvalGenerator_, \ - >est_##prefix##test_case_name##_EvalGenerateName_, \ - __FILE__, __LINE__) +#define GTEST_EXPAND_(arg) arg +#define GTEST_GET_FIRST_(first, ...) first +#define GTEST_GET_SECOND_(first, second, ...) second + +#define INSTANTIATE_TEST_SUITE_P(prefix, test_suite_name, ...) \ + static ::testing::internal::ParamGenerator \ + gtest_##prefix##test_suite_name##_EvalGenerator_() { \ + return GTEST_EXPAND_(GTEST_GET_FIRST_(__VA_ARGS__, DUMMY_PARAM_)); \ + } \ + static ::std::string gtest_##prefix##test_suite_name##_EvalGenerateName_( \ + const ::testing::TestParamInfo& info) { \ + if (::testing::internal::AlwaysFalse()) { \ + ::testing::internal::TestNotEmpty(GTEST_EXPAND_(GTEST_GET_SECOND_( \ + __VA_ARGS__, \ + ::testing::internal::DefaultParamName, \ + DUMMY_PARAM_))); \ + auto t = std::make_tuple(__VA_ARGS__); \ + static_assert(std::tuple_size::value <= 2, \ + "Too Many Args!"); \ + } \ + return ((GTEST_EXPAND_(GTEST_GET_SECOND_( \ + __VA_ARGS__, \ + ::testing::internal::DefaultParamName, \ + DUMMY_PARAM_))))(info); \ + } \ + static int gtest_##prefix##test_suite_name##_dummy_ \ + GTEST_ATTRIBUTE_UNUSED_ = \ + ::testing::UnitTest::GetInstance() \ + ->parameterized_test_registry() \ + .GetTestSuitePatternHolder( \ + #test_suite_name, \ + ::testing::internal::CodeLocation(__FILE__, __LINE__)) \ + ->AddTestSuiteInstantiation( \ + #prefix, >est_##prefix##test_suite_name##_EvalGenerator_, \ + >est_##prefix##test_suite_name##_EvalGenerateName_, \ + __FILE__, __LINE__) + +// Legacy API is deprecated but still available +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ +#define INSTANTIATE_TEST_CASE_P \ + static_assert(::testing::internal::InstantiateTestCase_P_IsDeprecated(), \ + ""); \ + INSTANTIATE_TEST_SUITE_P +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ } // namespace testing diff --git a/third_party/googletest/src/include/gtest/gtest-param-test.h.pump b/third_party/googletest/src/include/gtest/gtest-param-test.h.pump deleted file mode 100644 index 274f2b3b56..0000000000 --- a/third_party/googletest/src/include/gtest/gtest-param-test.h.pump +++ /dev/null @@ -1,500 +0,0 @@ -$$ -*- mode: c++; -*- -$var n = 50 $$ Maximum length of Values arguments we want to support. -$var maxtuple = 10 $$ Maximum number of Combine arguments we want to support. -// Copyright 2008, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Macros and functions for implementing parameterized tests -// in Google C++ Testing and Mocking Framework (Google Test) -// -// This file is generated by a SCRIPT. DO NOT EDIT BY HAND! -// -// GOOGLETEST_CM0001 DO NOT DELETE -#ifndef GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_ -#define GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_ - - -// Value-parameterized tests allow you to test your code with different -// parameters without writing multiple copies of the same test. -// -// Here is how you use value-parameterized tests: - -#if 0 - -// To write value-parameterized tests, first you should define a fixture -// class. It is usually derived from testing::TestWithParam (see below for -// another inheritance scheme that's sometimes useful in more complicated -// class hierarchies), where the type of your parameter values. -// TestWithParam is itself derived from testing::Test. T can be any -// copyable type. If it's a raw pointer, you are responsible for managing the -// lifespan of the pointed values. - -class FooTest : public ::testing::TestWithParam { - // You can implement all the usual class fixture members here. -}; - -// Then, use the TEST_P macro to define as many parameterized tests -// for this fixture as you want. The _P suffix is for "parameterized" -// or "pattern", whichever you prefer to think. - -TEST_P(FooTest, DoesBlah) { - // Inside a test, access the test parameter with the GetParam() method - // of the TestWithParam class: - EXPECT_TRUE(foo.Blah(GetParam())); - ... -} - -TEST_P(FooTest, HasBlahBlah) { - ... -} - -// Finally, you can use INSTANTIATE_TEST_CASE_P to instantiate the test -// case with any set of parameters you want. Google Test defines a number -// of functions for generating test parameters. They return what we call -// (surprise!) parameter generators. Here is a summary of them, which -// are all in the testing namespace: -// -// -// Range(begin, end [, step]) - Yields values {begin, begin+step, -// begin+step+step, ...}. The values do not -// include end. step defaults to 1. -// Values(v1, v2, ..., vN) - Yields values {v1, v2, ..., vN}. -// ValuesIn(container) - Yields values from a C-style array, an STL -// ValuesIn(begin,end) container, or an iterator range [begin, end). -// Bool() - Yields sequence {false, true}. -// Combine(g1, g2, ..., gN) - Yields all combinations (the Cartesian product -// for the math savvy) of the values generated -// by the N generators. -// -// For more details, see comments at the definitions of these functions below -// in this file. -// -// The following statement will instantiate tests from the FooTest test case -// each with parameter values "meeny", "miny", and "moe". - -INSTANTIATE_TEST_CASE_P(InstantiationName, - FooTest, - Values("meeny", "miny", "moe")); - -// To distinguish different instances of the pattern, (yes, you -// can instantiate it more then once) the first argument to the -// INSTANTIATE_TEST_CASE_P macro is a prefix that will be added to the -// actual test case name. Remember to pick unique prefixes for different -// instantiations. The tests from the instantiation above will have -// these names: -// -// * InstantiationName/FooTest.DoesBlah/0 for "meeny" -// * InstantiationName/FooTest.DoesBlah/1 for "miny" -// * InstantiationName/FooTest.DoesBlah/2 for "moe" -// * InstantiationName/FooTest.HasBlahBlah/0 for "meeny" -// * InstantiationName/FooTest.HasBlahBlah/1 for "miny" -// * InstantiationName/FooTest.HasBlahBlah/2 for "moe" -// -// You can use these names in --gtest_filter. -// -// This statement will instantiate all tests from FooTest again, each -// with parameter values "cat" and "dog": - -const char* pets[] = {"cat", "dog"}; -INSTANTIATE_TEST_CASE_P(AnotherInstantiationName, FooTest, ValuesIn(pets)); - -// The tests from the instantiation above will have these names: -// -// * AnotherInstantiationName/FooTest.DoesBlah/0 for "cat" -// * AnotherInstantiationName/FooTest.DoesBlah/1 for "dog" -// * AnotherInstantiationName/FooTest.HasBlahBlah/0 for "cat" -// * AnotherInstantiationName/FooTest.HasBlahBlah/1 for "dog" -// -// Please note that INSTANTIATE_TEST_CASE_P will instantiate all tests -// in the given test case, whether their definitions come before or -// AFTER the INSTANTIATE_TEST_CASE_P statement. -// -// Please also note that generator expressions (including parameters to the -// generators) are evaluated in InitGoogleTest(), after main() has started. -// This allows the user on one hand, to adjust generator parameters in order -// to dynamically determine a set of tests to run and on the other hand, -// give the user a chance to inspect the generated tests with Google Test -// reflection API before RUN_ALL_TESTS() is executed. -// -// You can see samples/sample7_unittest.cc and samples/sample8_unittest.cc -// for more examples. -// -// In the future, we plan to publish the API for defining new parameter -// generators. But for now this interface remains part of the internal -// implementation and is subject to change. -// -// -// A parameterized test fixture must be derived from testing::Test and from -// testing::WithParamInterface, where T is the type of the parameter -// values. Inheriting from TestWithParam satisfies that requirement because -// TestWithParam inherits from both Test and WithParamInterface. In more -// complicated hierarchies, however, it is occasionally useful to inherit -// separately from Test and WithParamInterface. For example: - -class BaseTest : public ::testing::Test { - // You can inherit all the usual members for a non-parameterized test - // fixture here. -}; - -class DerivedTest : public BaseTest, public ::testing::WithParamInterface { - // The usual test fixture members go here too. -}; - -TEST_F(BaseTest, HasFoo) { - // This is an ordinary non-parameterized test. -} - -TEST_P(DerivedTest, DoesBlah) { - // GetParam works just the same here as if you inherit from TestWithParam. - EXPECT_TRUE(foo.Blah(GetParam())); -} - -#endif // 0 - -#include "gtest/internal/gtest-port.h" - -#if !GTEST_OS_SYMBIAN -# include -#endif - -#include "gtest/internal/gtest-internal.h" -#include "gtest/internal/gtest-param-util.h" -#include "gtest/internal/gtest-param-util-generated.h" - -namespace testing { - -// Functions producing parameter generators. -// -// Google Test uses these generators to produce parameters for value- -// parameterized tests. When a parameterized test case is instantiated -// with a particular generator, Google Test creates and runs tests -// for each element in the sequence produced by the generator. -// -// In the following sample, tests from test case FooTest are instantiated -// each three times with parameter values 3, 5, and 8: -// -// class FooTest : public TestWithParam { ... }; -// -// TEST_P(FooTest, TestThis) { -// } -// TEST_P(FooTest, TestThat) { -// } -// INSTANTIATE_TEST_CASE_P(TestSequence, FooTest, Values(3, 5, 8)); -// - -// Range() returns generators providing sequences of values in a range. -// -// Synopsis: -// Range(start, end) -// - returns a generator producing a sequence of values {start, start+1, -// start+2, ..., }. -// Range(start, end, step) -// - returns a generator producing a sequence of values {start, start+step, -// start+step+step, ..., }. -// Notes: -// * The generated sequences never include end. For example, Range(1, 5) -// returns a generator producing a sequence {1, 2, 3, 4}. Range(1, 9, 2) -// returns a generator producing {1, 3, 5, 7}. -// * start and end must have the same type. That type may be any integral or -// floating-point type or a user defined type satisfying these conditions: -// * It must be assignable (have operator=() defined). -// * It must have operator+() (operator+(int-compatible type) for -// two-operand version). -// * It must have operator<() defined. -// Elements in the resulting sequences will also have that type. -// * Condition start < end must be satisfied in order for resulting sequences -// to contain any elements. -// -template -internal::ParamGenerator Range(T start, T end, IncrementT step) { - return internal::ParamGenerator( - new internal::RangeGenerator(start, end, step)); -} - -template -internal::ParamGenerator Range(T start, T end) { - return Range(start, end, 1); -} - -// ValuesIn() function allows generation of tests with parameters coming from -// a container. -// -// Synopsis: -// ValuesIn(const T (&array)[N]) -// - returns a generator producing sequences with elements from -// a C-style array. -// ValuesIn(const Container& container) -// - returns a generator producing sequences with elements from -// an STL-style container. -// ValuesIn(Iterator begin, Iterator end) -// - returns a generator producing sequences with elements from -// a range [begin, end) defined by a pair of STL-style iterators. These -// iterators can also be plain C pointers. -// -// Please note that ValuesIn copies the values from the containers -// passed in and keeps them to generate tests in RUN_ALL_TESTS(). -// -// Examples: -// -// This instantiates tests from test case StringTest -// each with C-string values of "foo", "bar", and "baz": -// -// const char* strings[] = {"foo", "bar", "baz"}; -// INSTANTIATE_TEST_CASE_P(StringSequence, StringTest, ValuesIn(strings)); -// -// This instantiates tests from test case StlStringTest -// each with STL strings with values "a" and "b": -// -// ::std::vector< ::std::string> GetParameterStrings() { -// ::std::vector< ::std::string> v; -// v.push_back("a"); -// v.push_back("b"); -// return v; -// } -// -// INSTANTIATE_TEST_CASE_P(CharSequence, -// StlStringTest, -// ValuesIn(GetParameterStrings())); -// -// -// This will also instantiate tests from CharTest -// each with parameter values 'a' and 'b': -// -// ::std::list GetParameterChars() { -// ::std::list list; -// list.push_back('a'); -// list.push_back('b'); -// return list; -// } -// ::std::list l = GetParameterChars(); -// INSTANTIATE_TEST_CASE_P(CharSequence2, -// CharTest, -// ValuesIn(l.begin(), l.end())); -// -template -internal::ParamGenerator< - typename ::testing::internal::IteratorTraits::value_type> -ValuesIn(ForwardIterator begin, ForwardIterator end) { - typedef typename ::testing::internal::IteratorTraits - ::value_type ParamType; - return internal::ParamGenerator( - new internal::ValuesInIteratorRangeGenerator(begin, end)); -} - -template -internal::ParamGenerator ValuesIn(const T (&array)[N]) { - return ValuesIn(array, array + N); -} - -template -internal::ParamGenerator ValuesIn( - const Container& container) { - return ValuesIn(container.begin(), container.end()); -} - -// Values() allows generating tests from explicitly specified list of -// parameters. -// -// Synopsis: -// Values(T v1, T v2, ..., T vN) -// - returns a generator producing sequences with elements v1, v2, ..., vN. -// -// For example, this instantiates tests from test case BarTest each -// with values "one", "two", and "three": -// -// INSTANTIATE_TEST_CASE_P(NumSequence, BarTest, Values("one", "two", "three")); -// -// This instantiates tests from test case BazTest each with values 1, 2, 3.5. -// The exact type of values will depend on the type of parameter in BazTest. -// -// INSTANTIATE_TEST_CASE_P(FloatingNumbers, BazTest, Values(1, 2, 3.5)); -// -// Currently, Values() supports from 1 to $n parameters. -// -$range i 1..n -$for i [[ -$range j 1..i - -template <$for j, [[typename T$j]]> -internal::ValueArray$i<$for j, [[T$j]]> Values($for j, [[T$j v$j]]) { - return internal::ValueArray$i<$for j, [[T$j]]>($for j, [[v$j]]); -} - -]] - -// Bool() allows generating tests with parameters in a set of (false, true). -// -// Synopsis: -// Bool() -// - returns a generator producing sequences with elements {false, true}. -// -// It is useful when testing code that depends on Boolean flags. Combinations -// of multiple flags can be tested when several Bool()'s are combined using -// Combine() function. -// -// In the following example all tests in the test case FlagDependentTest -// will be instantiated twice with parameters false and true. -// -// class FlagDependentTest : public testing::TestWithParam { -// virtual void SetUp() { -// external_flag = GetParam(); -// } -// } -// INSTANTIATE_TEST_CASE_P(BoolSequence, FlagDependentTest, Bool()); -// -inline internal::ParamGenerator Bool() { - return Values(false, true); -} - -# if GTEST_HAS_COMBINE -// Combine() allows the user to combine two or more sequences to produce -// values of a Cartesian product of those sequences' elements. -// -// Synopsis: -// Combine(gen1, gen2, ..., genN) -// - returns a generator producing sequences with elements coming from -// the Cartesian product of elements from the sequences generated by -// gen1, gen2, ..., genN. The sequence elements will have a type of -// tuple where T1, T2, ..., TN are the types -// of elements from sequences produces by gen1, gen2, ..., genN. -// -// Combine can have up to $maxtuple arguments. This number is currently limited -// by the maximum number of elements in the tuple implementation used by Google -// Test. -// -// Example: -// -// This will instantiate tests in test case AnimalTest each one with -// the parameter values tuple("cat", BLACK), tuple("cat", WHITE), -// tuple("dog", BLACK), and tuple("dog", WHITE): -// -// enum Color { BLACK, GRAY, WHITE }; -// class AnimalTest -// : public testing::TestWithParam > {...}; -// -// TEST_P(AnimalTest, AnimalLooksNice) {...} -// -// INSTANTIATE_TEST_CASE_P(AnimalVariations, AnimalTest, -// Combine(Values("cat", "dog"), -// Values(BLACK, WHITE))); -// -// This will instantiate tests in FlagDependentTest with all variations of two -// Boolean flags: -// -// class FlagDependentTest -// : public testing::TestWithParam > { -// virtual void SetUp() { -// // Assigns external_flag_1 and external_flag_2 values from the tuple. -// tie(external_flag_1, external_flag_2) = GetParam(); -// } -// }; -// -// TEST_P(FlagDependentTest, TestFeature1) { -// // Test your code using external_flag_1 and external_flag_2 here. -// } -// INSTANTIATE_TEST_CASE_P(TwoBoolSequence, FlagDependentTest, -// Combine(Bool(), Bool())); -// -$range i 2..maxtuple -$for i [[ -$range j 1..i - -template <$for j, [[typename Generator$j]]> -internal::CartesianProductHolder$i<$for j, [[Generator$j]]> Combine( - $for j, [[const Generator$j& g$j]]) { - return internal::CartesianProductHolder$i<$for j, [[Generator$j]]>( - $for j, [[g$j]]); -} - -]] -# endif // GTEST_HAS_COMBINE - -# define TEST_P(test_case_name, test_name) \ - class GTEST_TEST_CLASS_NAME_(test_case_name, test_name) \ - : public test_case_name { \ - public: \ - GTEST_TEST_CLASS_NAME_(test_case_name, test_name)() {} \ - virtual void TestBody(); \ - private: \ - static int AddToRegistry() { \ - ::testing::UnitTest::GetInstance()->parameterized_test_registry(). \ - GetTestCasePatternHolder(\ - #test_case_name, \ - ::testing::internal::CodeLocation(\ - __FILE__, __LINE__))->AddTestPattern(\ - GTEST_STRINGIFY_(test_case_name), \ - GTEST_STRINGIFY_(test_name), \ - new ::testing::internal::TestMetaFactory< \ - GTEST_TEST_CLASS_NAME_(\ - test_case_name, test_name)>()); \ - return 0; \ - } \ - static int gtest_registering_dummy_ GTEST_ATTRIBUTE_UNUSED_; \ - GTEST_DISALLOW_COPY_AND_ASSIGN_(\ - GTEST_TEST_CLASS_NAME_(test_case_name, test_name)); \ - }; \ - int GTEST_TEST_CLASS_NAME_(test_case_name, \ - test_name)::gtest_registering_dummy_ = \ - GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::AddToRegistry(); \ - void GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::TestBody() - -// The optional last argument to INSTANTIATE_TEST_CASE_P allows the user -// to specify a function or functor that generates custom test name suffixes -// based on the test parameters. The function should accept one argument of -// type testing::TestParamInfo, and return std::string. -// -// testing::PrintToStringParamName is a builtin test suffix generator that -// returns the value of testing::PrintToString(GetParam()). -// -// Note: test names must be non-empty, unique, and may only contain ASCII -// alphanumeric characters or underscore. Because PrintToString adds quotes -// to std::string and C strings, it won't work for these types. - -# define INSTANTIATE_TEST_CASE_P(prefix, test_case_name, generator, ...) \ - static ::testing::internal::ParamGenerator \ - gtest_##prefix##test_case_name##_EvalGenerator_() { return generator; } \ - static ::std::string gtest_##prefix##test_case_name##_EvalGenerateName_( \ - const ::testing::TestParamInfo& info) { \ - return ::testing::internal::GetParamNameGen \ - (__VA_ARGS__)(info); \ - } \ - static int gtest_##prefix##test_case_name##_dummy_ GTEST_ATTRIBUTE_UNUSED_ = \ - ::testing::UnitTest::GetInstance()->parameterized_test_registry(). \ - GetTestCasePatternHolder(\ - #test_case_name, \ - ::testing::internal::CodeLocation(\ - __FILE__, __LINE__))->AddTestCaseInstantiation(\ - #prefix, \ - >est_##prefix##test_case_name##_EvalGenerator_, \ - >est_##prefix##test_case_name##_EvalGenerateName_, \ - __FILE__, __LINE__) - -} // namespace testing - -#endif // GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_ diff --git a/third_party/googletest/src/include/gtest/gtest-printers.h b/third_party/googletest/src/include/gtest/gtest-printers.h index 51865f84e6..56a05450ef 100644 --- a/third_party/googletest/src/include/gtest/gtest-printers.h +++ b/third_party/googletest/src/include/gtest/gtest-printers.h @@ -100,17 +100,16 @@ #ifndef GTEST_INCLUDE_GTEST_GTEST_PRINTERS_H_ #define GTEST_INCLUDE_GTEST_GTEST_PRINTERS_H_ +#include #include // NOLINT #include #include +#include +#include #include #include -#include "gtest/internal/gtest-port.h" #include "gtest/internal/gtest-internal.h" - -#if GTEST_HAS_STD_TUPLE_ -# include -#endif +#include "gtest/internal/gtest-port.h" #if GTEST_HAS_ABSL #include "absl/strings/string_view.h" @@ -152,9 +151,10 @@ class TypeWithoutFormatter { public: // This default version is called when kTypeKind is kOtherType. static void PrintValue(const T& value, ::std::ostream* os) { - PrintBytesInObjectTo(static_cast( - reinterpret_cast(&value)), - sizeof(value), os); + PrintBytesInObjectTo( + static_cast( + reinterpret_cast(std::addressof(value))), + sizeof(value), os); } }; @@ -233,12 +233,12 @@ ::std::basic_ostream& operator<<( ::std::basic_ostream& os, const T& x) { TypeWithoutFormatter::value ? kProtobuf - : internal::ImplicitlyConvertible< + : std::is_convertible< const T&, internal::BiggestInt>::value ? kConvertibleToInteger : #if GTEST_HAS_ABSL - internal::ImplicitlyConvertible< + std::is_convertible< const T&, absl::string_view>::value ? kConvertibleToStringView : @@ -358,16 +358,6 @@ GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const wchar_t); GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(char, ::std::string); GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const char, ::std::string); -#if GTEST_HAS_GLOBAL_STRING -GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(char, ::string); -GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const char, ::string); -#endif - -#if GTEST_HAS_GLOBAL_WSTRING -GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(wchar_t, ::wstring); -GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const wchar_t, ::wstring); -#endif - #if GTEST_HAS_STD_WSTRING GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(wchar_t, ::std::wstring); GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const wchar_t, ::std::wstring); @@ -448,7 +438,7 @@ void DefaultPrintTo(WrapPrinterType /* dummy */, template void DefaultPrintTo(WrapPrinterType /* dummy */, T* p, ::std::ostream* os) { - if (p == NULL) { + if (p == nullptr) { *os << "NULL"; } else { // T is not a function type. We just call << to print p, @@ -460,7 +450,7 @@ void DefaultPrintTo(WrapPrinterType /* dummy */, template void DefaultPrintTo(WrapPrinterType /* dummy */, T* p, ::std::ostream* os) { - if (p == NULL) { + if (p == nullptr) { *os << "NULL"; } else { // T is a function type, so '*os << p' doesn't do what we want @@ -515,13 +505,9 @@ void PrintTo(const T& value, ::std::ostream* os) { (sizeof(IsContainerTest(0)) == sizeof(IsContainer)) && !IsRecursiveContainer::value ? kPrintContainer - : !is_pointer::value + : !std::is_pointer::value ? kPrintOther -#if GTEST_LANG_CXX11 : std::is_function::type>::value -#else - : !internal::ImplicitlyConvertible::value -#endif ? kPrintFunctionPointer : kPrintPointer > (), value, os); @@ -603,27 +589,13 @@ void PrintRawArrayTo(const T a[], size_t count, ::std::ostream* os) { } } -// Overloads for ::string and ::std::string. -#if GTEST_HAS_GLOBAL_STRING -GTEST_API_ void PrintStringTo(const ::string&s, ::std::ostream* os); -inline void PrintTo(const ::string& s, ::std::ostream* os) { - PrintStringTo(s, os); -} -#endif // GTEST_HAS_GLOBAL_STRING - +// Overloads for ::std::string. GTEST_API_ void PrintStringTo(const ::std::string&s, ::std::ostream* os); inline void PrintTo(const ::std::string& s, ::std::ostream* os) { PrintStringTo(s, os); } -// Overloads for ::wstring and ::std::wstring. -#if GTEST_HAS_GLOBAL_WSTRING -GTEST_API_ void PrintWideStringTo(const ::wstring&s, ::std::ostream* os); -inline void PrintTo(const ::wstring& s, ::std::ostream* os) { - PrintWideStringTo(s, os); -} -#endif // GTEST_HAS_GLOBAL_WSTRING - +// Overloads for ::std::wstring. #if GTEST_HAS_STD_WSTRING GTEST_API_ void PrintWideStringTo(const ::std::wstring&s, ::std::ostream* os); inline void PrintTo(const ::std::wstring& s, ::std::ostream* os) { @@ -638,99 +610,38 @@ inline void PrintTo(absl::string_view sp, ::std::ostream* os) { } #endif // GTEST_HAS_ABSL -#if GTEST_LANG_CXX11 inline void PrintTo(std::nullptr_t, ::std::ostream* os) { *os << "(nullptr)"; } -#endif // GTEST_LANG_CXX11 -#if GTEST_HAS_TR1_TUPLE || GTEST_HAS_STD_TUPLE_ -// Helper function for printing a tuple. T must be instantiated with -// a tuple type. template -void PrintTupleTo(const T& t, ::std::ostream* os); -#endif // GTEST_HAS_TR1_TUPLE || GTEST_HAS_STD_TUPLE_ - -#if GTEST_HAS_TR1_TUPLE -// Overload for ::std::tr1::tuple. Needed for printing function arguments, -// which are packed as tuples. - -// Overloaded PrintTo() for tuples of various arities. We support -// tuples of up-to 10 fields. The following implementation works -// regardless of whether tr1::tuple is implemented using the -// non-standard variadic template feature or not. - -inline void PrintTo(const ::std::tr1::tuple<>& t, ::std::ostream* os) { - PrintTupleTo(t, os); -} - -template -void PrintTo(const ::std::tr1::tuple& t, ::std::ostream* os) { - PrintTupleTo(t, os); -} - -template -void PrintTo(const ::std::tr1::tuple& t, ::std::ostream* os) { - PrintTupleTo(t, os); -} - -template -void PrintTo(const ::std::tr1::tuple& t, ::std::ostream* os) { - PrintTupleTo(t, os); -} - -template -void PrintTo(const ::std::tr1::tuple& t, ::std::ostream* os) { - PrintTupleTo(t, os); -} - -template -void PrintTo(const ::std::tr1::tuple& t, - ::std::ostream* os) { - PrintTupleTo(t, os); -} - -template -void PrintTo(const ::std::tr1::tuple& t, - ::std::ostream* os) { - PrintTupleTo(t, os); -} - -template -void PrintTo(const ::std::tr1::tuple& t, - ::std::ostream* os) { - PrintTupleTo(t, os); -} - -template -void PrintTo(const ::std::tr1::tuple& t, - ::std::ostream* os) { - PrintTupleTo(t, os); +void PrintTo(std::reference_wrapper ref, ::std::ostream* os) { + UniversalPrinter::Print(ref.get(), os); } -template -void PrintTo(const ::std::tr1::tuple& t, - ::std::ostream* os) { - PrintTupleTo(t, os); -} - -template -void PrintTo( - const ::std::tr1::tuple& t, - ::std::ostream* os) { - PrintTupleTo(t, os); +// Helper function for printing a tuple. T must be instantiated with +// a tuple type. +template +void PrintTupleTo(const T&, std::integral_constant, + ::std::ostream*) {} + +template +void PrintTupleTo(const T& t, std::integral_constant, + ::std::ostream* os) { + PrintTupleTo(t, std::integral_constant(), os); + GTEST_INTENTIONAL_CONST_COND_PUSH_() + if (I > 1) { + GTEST_INTENTIONAL_CONST_COND_POP_() + *os << ", "; + } + UniversalPrinter::type>::Print( + std::get(t), os); } -#endif // GTEST_HAS_TR1_TUPLE -#if GTEST_HAS_STD_TUPLE_ template void PrintTo(const ::std::tuple& t, ::std::ostream* os) { - PrintTupleTo(t, os); + *os << "("; + PrintTupleTo(t, std::integral_constant(), os); + *os << ")"; } -#endif // GTEST_HAS_STD_TUPLE_ // Overload for std::pair. template @@ -826,7 +737,6 @@ void UniversalPrintArray(const T* begin, size_t len, ::std::ostream* os) { // If the array has more than kThreshold elements, we'll have to // omit some details by printing only the first and the last // kChunkSize elements. - // FIXME: let the user control the threshold using a flag. if (len <= kThreshold) { PrintRawArrayTo(begin, len, os); } else { @@ -905,7 +815,7 @@ template <> class UniversalTersePrinter { public: static void Print(const char* str, ::std::ostream* os) { - if (str == NULL) { + if (str == nullptr) { *os << "NULL"; } else { UniversalPrint(std::string(str), os); @@ -925,7 +835,7 @@ template <> class UniversalTersePrinter { public: static void Print(const wchar_t* str, ::std::ostream* os) { - if (str == NULL) { + if (str == nullptr) { *os << "NULL"; } else { UniversalPrint(::std::wstring(str), os); @@ -961,109 +871,20 @@ void UniversalPrint(const T& value, ::std::ostream* os) { typedef ::std::vector< ::std::string> Strings; -// TuplePolicy must provide: -// - tuple_size -// size of tuple TupleT. -// - get(const TupleT& t) -// static function extracting element I of tuple TupleT. -// - tuple_element::type -// type of element I of tuple TupleT. -template -struct TuplePolicy; - -#if GTEST_HAS_TR1_TUPLE -template -struct TuplePolicy { - typedef TupleT Tuple; - static const size_t tuple_size = ::std::tr1::tuple_size::value; - - template - struct tuple_element : ::std::tr1::tuple_element(I), Tuple> { - }; - - template - static typename AddReference(I), Tuple>::type>::type - get(const Tuple& tuple) { - return ::std::tr1::get(tuple); - } -}; -template -const size_t TuplePolicy::tuple_size; -#endif // GTEST_HAS_TR1_TUPLE - -#if GTEST_HAS_STD_TUPLE_ -template -struct TuplePolicy< ::std::tuple > { - typedef ::std::tuple Tuple; - static const size_t tuple_size = ::std::tuple_size::value; - - template - struct tuple_element : ::std::tuple_element {}; - - template - static const typename ::std::tuple_element::type& get( - const Tuple& tuple) { - return ::std::get(tuple); - } -}; -template -const size_t TuplePolicy< ::std::tuple >::tuple_size; -#endif // GTEST_HAS_STD_TUPLE_ - -#if GTEST_HAS_TR1_TUPLE || GTEST_HAS_STD_TUPLE_ -// This helper template allows PrintTo() for tuples and -// UniversalTersePrintTupleFieldsToStrings() to be defined by -// induction on the number of tuple fields. The idea is that -// TuplePrefixPrinter::PrintPrefixTo(t, os) prints the first N -// fields in tuple t, and can be defined in terms of -// TuplePrefixPrinter. -// -// The inductive case. -template -struct TuplePrefixPrinter { - // Prints the first N fields of a tuple. - template - static void PrintPrefixTo(const Tuple& t, ::std::ostream* os) { - TuplePrefixPrinter::PrintPrefixTo(t, os); - GTEST_INTENTIONAL_CONST_COND_PUSH_() - if (N > 1) { - GTEST_INTENTIONAL_CONST_COND_POP_() - *os << ", "; - } - UniversalPrinter< - typename TuplePolicy::template tuple_element::type> - ::Print(TuplePolicy::template get(t), os); - } - // Tersely prints the first N fields of a tuple to a string vector, // one element for each field. - template - static void TersePrintPrefixToStrings(const Tuple& t, Strings* strings) { - TuplePrefixPrinter::TersePrintPrefixToStrings(t, strings); - ::std::stringstream ss; - UniversalTersePrint(TuplePolicy::template get(t), &ss); - strings->push_back(ss.str()); - } -}; - -// Base case. -template <> -struct TuplePrefixPrinter<0> { - template - static void PrintPrefixTo(const Tuple&, ::std::ostream*) {} - - template - static void TersePrintPrefixToStrings(const Tuple&, Strings*) {} -}; - -// Helper function for printing a tuple. -// Tuple must be either std::tr1::tuple or std::tuple type. template -void PrintTupleTo(const Tuple& t, ::std::ostream* os) { - *os << "("; - TuplePrefixPrinter::tuple_size>::PrintPrefixTo(t, os); - *os << ")"; +void TersePrintPrefixToStrings(const Tuple&, std::integral_constant, + Strings*) {} +template +void TersePrintPrefixToStrings(const Tuple& t, + std::integral_constant, + Strings* strings) { + TersePrintPrefixToStrings(t, std::integral_constant(), + strings); + ::std::stringstream ss; + UniversalTersePrint(std::get(t), &ss); + strings->push_back(ss.str()); } // Prints the fields of a tuple tersely to a string vector, one @@ -1072,11 +893,11 @@ void PrintTupleTo(const Tuple& t, ::std::ostream* os) { template Strings UniversalTersePrintTupleFieldsToStrings(const Tuple& value) { Strings result; - TuplePrefixPrinter::tuple_size>:: - TersePrintPrefixToStrings(value, &result); + TersePrintPrefixToStrings( + value, std::integral_constant::value>(), + &result); return result; } -#endif // GTEST_HAS_TR1_TUPLE || GTEST_HAS_STD_TUPLE_ } // namespace internal diff --git a/third_party/googletest/src/include/gtest/gtest-spi.h b/third_party/googletest/src/include/gtest/gtest-spi.h index 1e8983938e..aa38870e8e 100644 --- a/third_party/googletest/src/include/gtest/gtest-spi.h +++ b/third_party/googletest/src/include/gtest/gtest-spi.h @@ -72,14 +72,15 @@ class GTEST_API_ ScopedFakeTestPartResultReporter TestPartResultArray* result); // The d'tor restores the previous test part result reporter. - virtual ~ScopedFakeTestPartResultReporter(); + ~ScopedFakeTestPartResultReporter() override; // Appends the TestPartResult object to the TestPartResultArray // received in the constructor. // // This method is from the TestPartResultReporterInterface // interface. - virtual void ReportTestPartResult(const TestPartResult& result); + void ReportTestPartResult(const TestPartResult& result) override; + private: void Init(); diff --git a/third_party/googletest/src/include/gtest/gtest-test-part.h b/third_party/googletest/src/include/gtest/gtest-test-part.h index 1c7b89e087..05a7985358 100644 --- a/third_party/googletest/src/include/gtest/gtest-test-part.h +++ b/third_party/googletest/src/include/gtest/gtest-test-part.h @@ -53,22 +53,20 @@ class GTEST_API_ TestPartResult { enum Type { kSuccess, // Succeeded. kNonFatalFailure, // Failed but the test can continue. - kFatalFailure // Failed and the test should be terminated. + kFatalFailure, // Failed and the test should be terminated. + kSkip // Skipped. }; // C'tor. TestPartResult does NOT have a default constructor. // Always use this constructor (with parameters) to create a // TestPartResult object. - TestPartResult(Type a_type, - const char* a_file_name, - int a_line_number, + TestPartResult(Type a_type, const char* a_file_name, int a_line_number, const char* a_message) : type_(a_type), - file_name_(a_file_name == NULL ? "" : a_file_name), + file_name_(a_file_name == nullptr ? "" : a_file_name), line_number_(a_line_number), summary_(ExtractSummary(a_message)), - message_(a_message) { - } + message_(a_message) {} // Gets the outcome of the test part. Type type() const { return type_; } @@ -76,7 +74,7 @@ class GTEST_API_ TestPartResult { // Gets the name of the source file where the test part took place, or // NULL if it's unknown. const char* file_name() const { - return file_name_.empty() ? NULL : file_name_.c_str(); + return file_name_.empty() ? nullptr : file_name_.c_str(); } // Gets the line in the source file where the test part took place, @@ -89,18 +87,21 @@ class GTEST_API_ TestPartResult { // Gets the message associated with the test part. const char* message() const { return message_.c_str(); } - // Returns true iff the test part passed. - bool passed() const { return type_ == kSuccess; } + // Returns true if and only if the test part was skipped. + bool skipped() const { return type_ == kSkip; } - // Returns true iff the test part failed. - bool failed() const { return type_ != kSuccess; } + // Returns true if and only if the test part passed. + bool passed() const { return type_ == kSuccess; } - // Returns true iff the test part non-fatally failed. + // Returns true if and only if the test part non-fatally failed. bool nonfatally_failed() const { return type_ == kNonFatalFailure; } - // Returns true iff the test part fatally failed. + // Returns true if and only if the test part fatally failed. bool fatally_failed() const { return type_ == kFatalFailure; } + // Returns true if and only if the test part failed. + bool failed() const { return fatally_failed() || nonfatally_failed(); } + private: Type type_; @@ -164,8 +165,8 @@ class GTEST_API_ HasNewFatalFailureHelper : public TestPartResultReporterInterface { public: HasNewFatalFailureHelper(); - virtual ~HasNewFatalFailureHelper(); - virtual void ReportTestPartResult(const TestPartResult& result); + ~HasNewFatalFailureHelper() override; + void ReportTestPartResult(const TestPartResult& result) override; bool has_new_fatal_failure() const { return has_new_fatal_failure_; } private: bool has_new_fatal_failure_; diff --git a/third_party/googletest/src/include/gtest/gtest-typed-test.h b/third_party/googletest/src/include/gtest/gtest-typed-test.h index 74bce46bdc..095ce05802 100644 --- a/third_party/googletest/src/include/gtest/gtest-typed-test.h +++ b/third_party/googletest/src/include/gtest/gtest-typed-test.h @@ -52,22 +52,22 @@ class FooTest : public testing::Test { T value_; }; -// Next, associate a list of types with the test case, which will be +// Next, associate a list of types with the test suite, which will be // repeated for each type in the list. The typedef is necessary for // the macro to parse correctly. typedef testing::Types MyTypes; -TYPED_TEST_CASE(FooTest, MyTypes); +TYPED_TEST_SUITE(FooTest, MyTypes); // If the type list contains only one type, you can write that type // directly without Types<...>: -// TYPED_TEST_CASE(FooTest, int); +// TYPED_TEST_SUITE(FooTest, int); // Then, use TYPED_TEST() instead of TEST_F() to define as many typed -// tests for this test case as you want. +// tests for this test suite as you want. TYPED_TEST(FooTest, DoesBlah) { - // Inside a test, refer to TypeParam to get the type parameter. - // Since we are inside a derived class template, C++ requires use to - // visit the members of FooTest via 'this'. + // Inside a test, refer to the special name TypeParam to get the type + // parameter. Since we are inside a derived class template, C++ requires + // us to visit the members of FooTest via 'this'. TypeParam n = this->value_; // To visit static members of the fixture, add the TestFixture:: @@ -83,7 +83,7 @@ TYPED_TEST(FooTest, DoesBlah) { TYPED_TEST(FooTest, HasPropertyA) { ... } -// TYPED_TEST_CASE takes an optional third argument which allows to specify a +// TYPED_TEST_SUITE takes an optional third argument which allows to specify a // class that generates custom test name suffixes based on the type. This should // be a class which has a static template function GetName(int index) returning // a string for each type. The provided integer index equals the index of the @@ -99,7 +99,7 @@ TYPED_TEST(FooTest, HasPropertyA) { ... } // if (std::is_same()) return "unsignedInt"; // } // }; -// TYPED_TEST_CASE(FooTest, MyTypes, MyTypeNames); +// TYPED_TEST_SUITE(FooTest, MyTypes, MyTypeNames); #endif // 0 @@ -126,13 +126,13 @@ class FooTest : public testing::Test { ... }; -// Next, declare that you will define a type-parameterized test case +// Next, declare that you will define a type-parameterized test suite // (the _P suffix is for "parameterized" or "pattern", whichever you // prefer): -TYPED_TEST_CASE_P(FooTest); +TYPED_TEST_SUITE_P(FooTest); // Then, use TYPED_TEST_P() to define as many type-parameterized tests -// for this type-parameterized test case as you want. +// for this type-parameterized test suite as you want. TYPED_TEST_P(FooTest, DoesBlah) { // Inside a test, refer to TypeParam to get the type parameter. TypeParam n = 0; @@ -143,10 +143,10 @@ TYPED_TEST_P(FooTest, HasPropertyA) { ... } // Now the tricky part: you need to register all test patterns before // you can instantiate them. The first argument of the macro is the -// test case name; the rest are the names of the tests in this test +// test suite name; the rest are the names of the tests in this test // case. -REGISTER_TYPED_TEST_CASE_P(FooTest, - DoesBlah, HasPropertyA); +REGISTER_TYPED_TEST_SUITE_P(FooTest, + DoesBlah, HasPropertyA); // Finally, you are free to instantiate the pattern with the types you // want. If you put the above code in a header file, you can #include @@ -154,19 +154,19 @@ REGISTER_TYPED_TEST_CASE_P(FooTest, // // To distinguish different instances of the pattern, the first // argument to the INSTANTIATE_* macro is a prefix that will be added -// to the actual test case name. Remember to pick unique prefixes for +// to the actual test suite name. Remember to pick unique prefixes for // different instances. typedef testing::Types MyTypes; -INSTANTIATE_TYPED_TEST_CASE_P(My, FooTest, MyTypes); +INSTANTIATE_TYPED_TEST_SUITE_P(My, FooTest, MyTypes); // If the type list contains only one type, you can write that type // directly without Types<...>: -// INSTANTIATE_TYPED_TEST_CASE_P(My, FooTest, int); +// INSTANTIATE_TYPED_TEST_SUITE_P(My, FooTest, int); // -// Similar to the optional argument of TYPED_TEST_CASE above, -// INSTANTIATE_TEST_CASE_P takes an optional fourth argument which allows to +// Similar to the optional argument of TYPED_TEST_SUITE above, +// INSTANTIATE_TEST_SUITE_P takes an optional fourth argument which allows to // generate custom names. -// INSTANTIATE_TYPED_TEST_CASE_P(My, FooTest, MyTypes, MyTypeNames); +// INSTANTIATE_TYPED_TEST_SUITE_P(My, FooTest, MyTypes, MyTypeNames); #endif // 0 @@ -180,21 +180,18 @@ INSTANTIATE_TYPED_TEST_CASE_P(My, FooTest, MyTypes); // INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. // // Expands to the name of the typedef for the type parameters of the -// given test case. -# define GTEST_TYPE_PARAMS_(TestCaseName) gtest_type_params_##TestCaseName##_ +// given test suite. +#define GTEST_TYPE_PARAMS_(TestSuiteName) gtest_type_params_##TestSuiteName##_ // Expands to the name of the typedef for the NameGenerator, responsible for // creating the suffixes of the name. -#define GTEST_NAME_GENERATOR_(TestCaseName) \ - gtest_type_params_##TestCaseName##_NameGenerator - -// The 'Types' template argument below must have spaces around it -// since some compilers may choke on '>>' when passing a template -// instance (e.g. Types) -# define TYPED_TEST_CASE(CaseName, Types, ...) \ - typedef ::testing::internal::TypeList< Types >::type GTEST_TYPE_PARAMS_( \ - CaseName); \ - typedef ::testing::internal::NameGeneratorSelector<__VA_ARGS__>::type \ +#define GTEST_NAME_GENERATOR_(TestSuiteName) \ + gtest_type_params_##TestSuiteName##_NameGenerator + +#define TYPED_TEST_SUITE(CaseName, Types, ...) \ + typedef ::testing::internal::TypeList::type GTEST_TYPE_PARAMS_( \ + CaseName); \ + typedef ::testing::internal::NameGeneratorSelector<__VA_ARGS__>::type \ GTEST_NAME_GENERATOR_(CaseName) # define TYPED_TEST(CaseName, TestName) \ @@ -224,6 +221,13 @@ INSTANTIATE_TYPED_TEST_CASE_P(My, FooTest, MyTypes); void GTEST_TEST_CLASS_NAME_(CaseName, \ TestName)::TestBody() +// Legacy API is deprecated but still available +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ +#define TYPED_TEST_CASE \ + static_assert(::testing::internal::TypedTestCaseIsDeprecated(), ""); \ + TYPED_TEST_SUITE +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + #endif // GTEST_HAS_TYPED_TEST // Implements type-parameterized tests. @@ -233,73 +237,93 @@ INSTANTIATE_TYPED_TEST_CASE_P(My, FooTest, MyTypes); // INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. // // Expands to the namespace name that the type-parameterized tests for -// the given type-parameterized test case are defined in. The exact +// the given type-parameterized test suite are defined in. The exact // name of the namespace is subject to change without notice. -# define GTEST_CASE_NAMESPACE_(TestCaseName) \ - gtest_case_##TestCaseName##_ +#define GTEST_SUITE_NAMESPACE_(TestSuiteName) gtest_suite_##TestSuiteName##_ // INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. // // Expands to the name of the variable used to remember the names of -// the defined tests in the given test case. -# define GTEST_TYPED_TEST_CASE_P_STATE_(TestCaseName) \ - gtest_typed_test_case_p_state_##TestCaseName##_ +// the defined tests in the given test suite. +#define GTEST_TYPED_TEST_SUITE_P_STATE_(TestSuiteName) \ + gtest_typed_test_suite_p_state_##TestSuiteName##_ // INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE DIRECTLY. // // Expands to the name of the variable used to remember the names of -// the registered tests in the given test case. -# define GTEST_REGISTERED_TEST_NAMES_(TestCaseName) \ - gtest_registered_test_names_##TestCaseName##_ +// the registered tests in the given test suite. +#define GTEST_REGISTERED_TEST_NAMES_(TestSuiteName) \ + gtest_registered_test_names_##TestSuiteName##_ // The variables defined in the type-parameterized test macros are // static as typically these macros are used in a .h file that can be // #included in multiple translation units linked together. -# define TYPED_TEST_CASE_P(CaseName) \ - static ::testing::internal::TypedTestCasePState \ - GTEST_TYPED_TEST_CASE_P_STATE_(CaseName) - -# define TYPED_TEST_P(CaseName, TestName) \ - namespace GTEST_CASE_NAMESPACE_(CaseName) { \ - template \ - class TestName : public CaseName { \ - private: \ - typedef CaseName TestFixture; \ - typedef gtest_TypeParam_ TypeParam; \ - virtual void TestBody(); \ - }; \ - static bool gtest_##TestName##_defined_ GTEST_ATTRIBUTE_UNUSED_ = \ - GTEST_TYPED_TEST_CASE_P_STATE_(CaseName).AddTestName(\ - __FILE__, __LINE__, #CaseName, #TestName); \ - } \ - template \ - void GTEST_CASE_NAMESPACE_(CaseName)::TestName::TestBody() - -# define REGISTER_TYPED_TEST_CASE_P(CaseName, ...) \ - namespace GTEST_CASE_NAMESPACE_(CaseName) { \ - typedef ::testing::internal::Templates<__VA_ARGS__>::type gtest_AllTests_; \ - } \ - static const char* const GTEST_REGISTERED_TEST_NAMES_(CaseName) \ - GTEST_ATTRIBUTE_UNUSED_ = \ - GTEST_TYPED_TEST_CASE_P_STATE_(CaseName).VerifyRegisteredTestNames( \ - __FILE__, __LINE__, #__VA_ARGS__) - -// The 'Types' template argument below must have spaces around it -// since some compilers may choke on '>>' when passing a template -// instance (e.g. Types) -# define INSTANTIATE_TYPED_TEST_CASE_P(Prefix, CaseName, Types, ...) \ - static bool gtest_##Prefix##_##CaseName GTEST_ATTRIBUTE_UNUSED_ = \ - ::testing::internal::TypeParameterizedTestCase< \ - CaseName, GTEST_CASE_NAMESPACE_(CaseName)::gtest_AllTests_, \ - ::testing::internal::TypeList< Types >::type>:: \ - Register(#Prefix, \ - ::testing::internal::CodeLocation(__FILE__, __LINE__), \ - >EST_TYPED_TEST_CASE_P_STATE_(CaseName), #CaseName, \ - GTEST_REGISTERED_TEST_NAMES_(CaseName), \ - ::testing::internal::GenerateNames< \ - ::testing::internal::NameGeneratorSelector< \ - __VA_ARGS__>::type, \ - ::testing::internal::TypeList< Types >::type>()) +#define TYPED_TEST_SUITE_P(SuiteName) \ + static ::testing::internal::TypedTestSuitePState \ + GTEST_TYPED_TEST_SUITE_P_STATE_(SuiteName) + +// Legacy API is deprecated but still available +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ +#define TYPED_TEST_CASE_P \ + static_assert(::testing::internal::TypedTestCase_P_IsDeprecated(), ""); \ + TYPED_TEST_SUITE_P +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + +#define TYPED_TEST_P(SuiteName, TestName) \ + namespace GTEST_SUITE_NAMESPACE_(SuiteName) { \ + template \ + class TestName : public SuiteName { \ + private: \ + typedef SuiteName TestFixture; \ + typedef gtest_TypeParam_ TypeParam; \ + virtual void TestBody(); \ + }; \ + static bool gtest_##TestName##_defined_ GTEST_ATTRIBUTE_UNUSED_ = \ + GTEST_TYPED_TEST_SUITE_P_STATE_(SuiteName).AddTestName( \ + __FILE__, __LINE__, #SuiteName, #TestName); \ + } \ + template \ + void GTEST_SUITE_NAMESPACE_( \ + SuiteName)::TestName::TestBody() + +#define REGISTER_TYPED_TEST_SUITE_P(SuiteName, ...) \ + namespace GTEST_SUITE_NAMESPACE_(SuiteName) { \ + typedef ::testing::internal::Templates<__VA_ARGS__>::type gtest_AllTests_; \ + } \ + static const char* const GTEST_REGISTERED_TEST_NAMES_( \ + SuiteName) GTEST_ATTRIBUTE_UNUSED_ = \ + GTEST_TYPED_TEST_SUITE_P_STATE_(SuiteName).VerifyRegisteredTestNames( \ + __FILE__, __LINE__, #__VA_ARGS__) + +// Legacy API is deprecated but still available +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ +#define REGISTER_TYPED_TEST_CASE_P \ + static_assert(::testing::internal::RegisterTypedTestCase_P_IsDeprecated(), \ + ""); \ + REGISTER_TYPED_TEST_SUITE_P +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + +#define INSTANTIATE_TYPED_TEST_SUITE_P(Prefix, SuiteName, Types, ...) \ + static bool gtest_##Prefix##_##SuiteName GTEST_ATTRIBUTE_UNUSED_ = \ + ::testing::internal::TypeParameterizedTestSuite< \ + SuiteName, GTEST_SUITE_NAMESPACE_(SuiteName)::gtest_AllTests_, \ + ::testing::internal::TypeList::type>:: \ + Register(#Prefix, \ + ::testing::internal::CodeLocation(__FILE__, __LINE__), \ + >EST_TYPED_TEST_SUITE_P_STATE_(SuiteName), #SuiteName, \ + GTEST_REGISTERED_TEST_NAMES_(SuiteName), \ + ::testing::internal::GenerateNames< \ + ::testing::internal::NameGeneratorSelector< \ + __VA_ARGS__>::type, \ + ::testing::internal::TypeList::type>()) + +// Legacy API is deprecated but still available +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ +#define INSTANTIATE_TYPED_TEST_CASE_P \ + static_assert( \ + ::testing::internal::InstantiateTypedTestCase_P_IsDeprecated(), ""); \ + INSTANTIATE_TYPED_TEST_SUITE_P +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ #endif // GTEST_HAS_TYPED_TEST_P diff --git a/third_party/googletest/src/include/gtest/gtest.h b/third_party/googletest/src/include/gtest/gtest.h index 3b4bb1ee90..dbe5b1c2c3 100644 --- a/third_party/googletest/src/include/gtest/gtest.h +++ b/third_party/googletest/src/include/gtest/gtest.h @@ -52,13 +52,17 @@ #ifndef GTEST_INCLUDE_GTEST_GTEST_H_ #define GTEST_INCLUDE_GTEST_GTEST_H_ +#include #include +#include #include +#include #include #include "gtest/internal/gtest-internal.h" #include "gtest/internal/gtest-string.h" #include "gtest/gtest-death-test.h" +#include "gtest/gtest-matchers.h" #include "gtest/gtest-message.h" #include "gtest/gtest-param-test.h" #include "gtest/gtest-printers.h" @@ -69,21 +73,6 @@ GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \ /* class A needs to have dll-interface to be used by clients of class B */) -// Depending on the platform, different string classes are available. -// On Linux, in addition to ::std::string, Google also makes use of -// class ::string, which has the same interface as ::std::string, but -// has a different implementation. -// -// You can define GTEST_HAS_GLOBAL_STRING to 1 to indicate that -// ::string is available AND is a distinct type to ::std::string, or -// define it to 0 to indicate otherwise. -// -// If ::std::string and ::string are the same class on your platform -// due to aliasing, you should define GTEST_HAS_GLOBAL_STRING to 0. -// -// If you do not define GTEST_HAS_GLOBAL_STRING, it is defined -// heuristically. - namespace testing { // Silence C4100 (unreferenced formal parameter) and 4805 @@ -195,7 +184,12 @@ void ReportFailureInUnknownLocation(TestPartResult::Type result_type, // If we don't forward declare them the compiler might confuse the classes // in friendship clauses with same named classes on the scope. class Test; -class TestCase; +class TestSuite; + +// Old API is still available but deprecated +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ +using TestCase = TestSuite; +#endif class TestInfo; class UnitTest; @@ -298,9 +292,10 @@ class GTEST_API_ AssertionResult { template explicit AssertionResult( const T& success, - typename internal::EnableIf< - !internal::ImplicitlyConvertible::value>::type* - /*enabler*/ = NULL) + typename std::enable_if< + !std::is_convertible::value>::type* + /*enabler*/ + = nullptr) : success_(success) {} #if defined(_MSC_VER) && _MSC_VER < 1910 @@ -313,7 +308,7 @@ class GTEST_API_ AssertionResult { return *this; } - // Returns true iff the assertion succeeded. + // Returns true if and only if the assertion succeeded. operator bool() const { return success_; } // NOLINT // Returns the assertion's negation. Used with EXPECT/ASSERT_FALSE. @@ -324,9 +319,8 @@ class GTEST_API_ AssertionResult { // assertion's expectation). When nothing has been streamed into the // object, returns an empty string. const char* message() const { - return message_.get() != NULL ? message_->c_str() : ""; + return message_.get() != nullptr ? message_->c_str() : ""; } - // FIXME: Remove this after making sure no clients use it. // Deprecated; please use message() instead. const char* failure_message() const { return message(); } @@ -347,8 +341,7 @@ class GTEST_API_ AssertionResult { private: // Appends the contents of message to message_. void AppendMessage(const Message& a_message) { - if (message_.get() == NULL) - message_.reset(new ::std::string); + if (message_.get() == nullptr) message_.reset(new ::std::string); message_->append(a_message.GetString().c_str()); } @@ -361,7 +354,7 @@ class GTEST_API_ AssertionResult { // construct is not satisfied with the predicate's outcome. // Referenced via a pointer to avoid taking too much stack frame space // with test assertions. - internal::scoped_ptr< ::std::string> message_; + std::unique_ptr< ::std::string> message_; }; // Makes a successful assertion result. @@ -385,8 +378,8 @@ namespace testing { // The abstract class that all tests inherit from. // -// In Google Test, a unit test program contains one or many TestCases, and -// each TestCase contains one or many Tests. +// In Google Test, a unit test program contains one or many TestSuites, and +// each TestSuite contains one or many Tests. // // When you define a test using the TEST macro, you don't need to // explicitly derive from Test - the TEST macro automatically does @@ -410,49 +403,57 @@ class GTEST_API_ Test { public: friend class TestInfo; - // Defines types for pointers to functions that set up and tear down - // a test case. - typedef internal::SetUpTestCaseFunc SetUpTestCaseFunc; - typedef internal::TearDownTestCaseFunc TearDownTestCaseFunc; - // The d'tor is virtual as we intend to inherit from Test. virtual ~Test(); // Sets up the stuff shared by all tests in this test case. // - // Google Test will call Foo::SetUpTestCase() before running the first + // Google Test will call Foo::SetUpTestSuite() before running the first // test in test case Foo. Hence a sub-class can define its own - // SetUpTestCase() method to shadow the one defined in the super + // SetUpTestSuite() method to shadow the one defined in the super // class. - static void SetUpTestCase() {} + // Failures that happen during SetUpTestSuite are logged but otherwise + // ignored. + static void SetUpTestSuite() {} - // Tears down the stuff shared by all tests in this test case. + // Tears down the stuff shared by all tests in this test suite. // - // Google Test will call Foo::TearDownTestCase() after running the last + // Google Test will call Foo::TearDownTestSuite() after running the last // test in test case Foo. Hence a sub-class can define its own - // TearDownTestCase() method to shadow the one defined in the super + // TearDownTestSuite() method to shadow the one defined in the super // class. + // Failures that happen during TearDownTestSuite are logged but otherwise + // ignored. + static void TearDownTestSuite() {} + + // Legacy API is deprecated but still available +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ static void TearDownTestCase() {} + static void SetUpTestCase() {} +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ - // Returns true iff the current test has a fatal failure. + // Returns true if and only if the current test has a fatal failure. static bool HasFatalFailure(); - // Returns true iff the current test has a non-fatal failure. + // Returns true if and only if the current test has a non-fatal failure. static bool HasNonfatalFailure(); - // Returns true iff the current test has a (either fatal or + // Returns true if and only if the current test was skipped. + static bool IsSkipped(); + + // Returns true if and only if the current test has a (either fatal or // non-fatal) failure. static bool HasFailure() { return HasFatalFailure() || HasNonfatalFailure(); } - // Logs a property for the current test, test case, or for the entire + // Logs a property for the current test, test suite, or for the entire // invocation of the test program when used outside of the context of a - // test case. Only the last value for a given key is remembered. These + // test suite. Only the last value for a given key is remembered. These // are public static so they can be called from utility functions that are // not members of the test fixture. Calls to RecordProperty made during // lifespan of the test (from the moment its constructor starts to the // moment its destructor finishes) will be output in XML as attributes of // the element. Properties recorded from fixture's - // SetUpTestCase or TearDownTestCase are logged as attributes of the + // SetUpTestSuite or TearDownTestSuite are logged as attributes of the // corresponding element. Calls to RecordProperty made in the // global context (before or after invocation of RUN_ALL_TESTS and from // SetUp/TearDown method of Environment objects registered with Google @@ -471,8 +472,8 @@ class GTEST_API_ Test { virtual void TearDown(); private: - // Returns true iff the current test has the same fixture class as - // the first test in the current test case. + // Returns true if and only if the current test has the same fixture class + // as the first test in the current test suite. static bool HasSameFixtureClass(); // Runs the test after the test fixture has been set up. @@ -490,7 +491,7 @@ class GTEST_API_ Test { // internal method to avoid clashing with names used in user TESTs. void DeleteSelf_() { delete this; } - const internal::scoped_ptr< GTEST_FLAG_SAVER_ > gtest_flag_saver_; + const std::unique_ptr gtest_flag_saver_; // Often a user misspells SetUp() as Setup() and spends a long time // wondering why it is never called by Google Test. The declaration of @@ -509,7 +510,7 @@ class GTEST_API_ Test { // If you see an error about overriding the following function or // about it being private, you have mis-spelled SetUp() as Setup(). struct Setup_should_be_spelled_SetUp {}; - virtual Setup_should_be_spelled_SetUp* Setup() { return NULL; } + virtual Setup_should_be_spelled_SetUp* Setup() { return nullptr; } // We disallow copying Tests. GTEST_DISALLOW_COPY_AND_ASSIGN_(Test); @@ -573,21 +574,28 @@ class GTEST_API_ TestResult { // Returns the number of the test properties. int test_property_count() const; - // Returns true iff the test passed (i.e. no test part failed). - bool Passed() const { return !Failed(); } + // Returns true if and only if the test passed (i.e. no test part failed). + bool Passed() const { return !Skipped() && !Failed(); } + + // Returns true if and only if the test was skipped. + bool Skipped() const; - // Returns true iff the test failed. + // Returns true if and only if the test failed. bool Failed() const; - // Returns true iff the test fatally failed. + // Returns true if and only if the test fatally failed. bool HasFatalFailure() const; - // Returns true iff the test has a non-fatal failure. + // Returns true if and only if the test has a non-fatal failure. bool HasNonfatalFailure() const; // Returns the elapsed time, in milliseconds. TimeInMillis elapsed_time() const { return elapsed_time_; } + // Gets the time of the test case start, in ms from the start of the + // UNIX epoch. + TimeInMillis start_timestamp() const { return start_timestamp_; } + // Returns the i-th test part result among all the results. i can range from 0 // to total_part_count() - 1. If i is not in that range, aborts the program. const TestPartResult& GetTestPartResult(int i) const; @@ -599,7 +607,7 @@ class GTEST_API_ TestResult { private: friend class TestInfo; - friend class TestCase; + friend class TestSuite; friend class UnitTest; friend class internal::DefaultGlobalTestPartResultReporter; friend class internal::ExecDeathTest; @@ -618,6 +626,9 @@ class GTEST_API_ TestResult { return test_properties_; } + // Sets the start time. + void set_start_timestamp(TimeInMillis start) { start_timestamp_ = start; } + // Sets the elapsed time. void set_elapsed_time(TimeInMillis elapsed) { elapsed_time_ = elapsed; } @@ -631,7 +642,7 @@ class GTEST_API_ TestResult { const TestProperty& test_property); // Adds a failure if the key is a reserved attribute of Google Test - // testcase tags. Returns true if the property is valid. + // testsuite tags. Returns true if the property is valid. // FIXME: Validate attribute names are legal and human readable. static bool ValidateTestProperty(const std::string& xml_element, const TestProperty& test_property); @@ -661,6 +672,8 @@ class GTEST_API_ TestResult { std::vector test_properties_; // Running count of death tests. int death_test_count_; + // The start time, in milliseconds since UNIX Epoch. + TimeInMillis start_timestamp_; // The elapsed time, in milliseconds. TimeInMillis elapsed_time_; @@ -670,7 +683,7 @@ class GTEST_API_ TestResult { // A TestInfo object stores the following information about a test: // -// Test case name +// Test suite name // Test name // Whether the test should be run // A function pointer that creates the test object when invoked @@ -685,8 +698,13 @@ class GTEST_API_ TestInfo { // don't inherit from TestInfo. ~TestInfo(); - // Returns the test case name. - const char* test_case_name() const { return test_case_name_.c_str(); } + // Returns the test suite name. + const char* test_suite_name() const { return test_suite_name_.c_str(); } + +// Legacy API is deprecated but still available +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + const char* test_case_name() const { return test_suite_name(); } +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ // Returns the test name. const char* name() const { return name_.c_str(); } @@ -694,17 +712,15 @@ class GTEST_API_ TestInfo { // Returns the name of the parameter type, or NULL if this is not a typed // or a type-parameterized test. const char* type_param() const { - if (type_param_.get() != NULL) - return type_param_->c_str(); - return NULL; + if (type_param_.get() != nullptr) return type_param_->c_str(); + return nullptr; } // Returns the text representation of the value parameter, or NULL if this // is not a value-parameterized test. const char* value_param() const { - if (value_param_.get() != NULL) - return value_param_->c_str(); - return NULL; + if (value_param_.get() != nullptr) return value_param_->c_str(); + return nullptr; } // Returns the file name where this test is defined. @@ -721,7 +737,7 @@ class GTEST_API_ TestInfo { // been specified) and its full name matches the user-specified filter. // // Google Test allows the user to filter the tests by their full names. - // The full name of a test Bar in test case Foo is defined as + // The full name of a test Bar in test suite Foo is defined as // "Foo.Bar". Only the tests that match the filter will run. // // A filter is a colon-separated list of glob (not regex) patterns, @@ -734,7 +750,7 @@ class GTEST_API_ TestInfo { // contains the character 'A' or starts with "Foo.". bool should_run() const { return should_run_; } - // Returns true iff this test will appear in the XML report. + // Returns true if and only if this test will appear in the XML report. bool is_reportable() const { // The XML report includes tests matching the filter, excluding those // run in other shards. @@ -749,24 +765,19 @@ class GTEST_API_ TestInfo { friend class internal::DefaultDeathTestFactory; #endif // GTEST_HAS_DEATH_TEST friend class Test; - friend class TestCase; + friend class TestSuite; friend class internal::UnitTestImpl; friend class internal::StreamingListenerTest; friend TestInfo* internal::MakeAndRegisterTestInfo( - const char* test_case_name, - const char* name, - const char* type_param, - const char* value_param, - internal::CodeLocation code_location, - internal::TypeId fixture_class_id, - Test::SetUpTestCaseFunc set_up_tc, - Test::TearDownTestCaseFunc tear_down_tc, + const char* test_suite_name, const char* name, const char* type_param, + const char* value_param, internal::CodeLocation code_location, + internal::TypeId fixture_class_id, internal::SetUpTestSuiteFunc set_up_tc, + internal::TearDownTestSuiteFunc tear_down_tc, internal::TestFactoryBase* factory); // Constructs a TestInfo object. The newly constructed instance assumes // ownership of the factory object. - TestInfo(const std::string& test_case_name, - const std::string& name, + TestInfo(const std::string& test_suite_name, const std::string& name, const char* a_type_param, // NULL if not a type-parameterized test const char* a_value_param, // NULL if not a value-parameterized test internal::CodeLocation a_code_location, @@ -788,21 +799,21 @@ class GTEST_API_ TestInfo { } // These fields are immutable properties of the test. - const std::string test_case_name_; // Test case name + const std::string test_suite_name_; // test suite name const std::string name_; // Test name // Name of the parameter type, or NULL if this is not a typed or a // type-parameterized test. - const internal::scoped_ptr type_param_; + const std::unique_ptr type_param_; // Text representation of the value parameter, or NULL if this is not a // value-parameterized test. - const internal::scoped_ptr value_param_; + const std::unique_ptr value_param_; internal::CodeLocation location_; - const internal::TypeId fixture_class_id_; // ID of the test fixture class - bool should_run_; // True iff this test should run - bool is_disabled_; // True iff this test is disabled - bool matches_filter_; // True if this test matches the - // user-specified filter. - bool is_in_another_shard_; // Will be run in another shard. + const internal::TypeId fixture_class_id_; // ID of the test fixture class + bool should_run_; // True if and only if this test should run + bool is_disabled_; // True if and only if this test is disabled + bool matches_filter_; // True if this test matches the + // user-specified filter. + bool is_in_another_shard_; // Will be run in another shard. internal::TestFactoryBase* const factory_; // The factory that creates // the test object @@ -813,90 +824,96 @@ class GTEST_API_ TestInfo { GTEST_DISALLOW_COPY_AND_ASSIGN_(TestInfo); }; -// A test case, which consists of a vector of TestInfos. +// A test suite, which consists of a vector of TestInfos. // -// TestCase is not copyable. -class GTEST_API_ TestCase { +// TestSuite is not copyable. +class GTEST_API_ TestSuite { public: - // Creates a TestCase with the given name. + // Creates a TestSuite with the given name. // - // TestCase does NOT have a default constructor. Always use this - // constructor to create a TestCase object. + // TestSuite does NOT have a default constructor. Always use this + // constructor to create a TestSuite object. // // Arguments: // - // name: name of the test case + // name: name of the test suite // a_type_param: the name of the test's type parameter, or NULL if // this is not a type-parameterized test. - // set_up_tc: pointer to the function that sets up the test case - // tear_down_tc: pointer to the function that tears down the test case - TestCase(const char* name, const char* a_type_param, - Test::SetUpTestCaseFunc set_up_tc, - Test::TearDownTestCaseFunc tear_down_tc); + // set_up_tc: pointer to the function that sets up the test suite + // tear_down_tc: pointer to the function that tears down the test suite + TestSuite(const char* name, const char* a_type_param, + internal::SetUpTestSuiteFunc set_up_tc, + internal::TearDownTestSuiteFunc tear_down_tc); - // Destructor of TestCase. - virtual ~TestCase(); + // Destructor of TestSuite. + virtual ~TestSuite(); - // Gets the name of the TestCase. + // Gets the name of the TestSuite. const char* name() const { return name_.c_str(); } // Returns the name of the parameter type, or NULL if this is not a - // type-parameterized test case. + // type-parameterized test suite. const char* type_param() const { - if (type_param_.get() != NULL) - return type_param_->c_str(); - return NULL; + if (type_param_.get() != nullptr) return type_param_->c_str(); + return nullptr; } - // Returns true if any test in this test case should run. + // Returns true if any test in this test suite should run. bool should_run() const { return should_run_; } - // Gets the number of successful tests in this test case. + // Gets the number of successful tests in this test suite. int successful_test_count() const; - // Gets the number of failed tests in this test case. + // Gets the number of skipped tests in this test suite. + int skipped_test_count() const; + + // Gets the number of failed tests in this test suite. int failed_test_count() const; // Gets the number of disabled tests that will be reported in the XML report. int reportable_disabled_test_count() const; - // Gets the number of disabled tests in this test case. + // Gets the number of disabled tests in this test suite. int disabled_test_count() const; // Gets the number of tests to be printed in the XML report. int reportable_test_count() const; - // Get the number of tests in this test case that should run. + // Get the number of tests in this test suite that should run. int test_to_run_count() const; - // Gets the number of all tests in this test case. + // Gets the number of all tests in this test suite. int total_test_count() const; - // Returns true iff the test case passed. + // Returns true if and only if the test suite passed. bool Passed() const { return !Failed(); } - // Returns true iff the test case failed. + // Returns true if and only if the test suite failed. bool Failed() const { return failed_test_count() > 0; } // Returns the elapsed time, in milliseconds. TimeInMillis elapsed_time() const { return elapsed_time_; } + // Gets the time of the test suite start, in ms from the start of the + // UNIX epoch. + TimeInMillis start_timestamp() const { return start_timestamp_; } + // Returns the i-th test among all the tests. i can range from 0 to // total_test_count() - 1. If i is not in that range, returns NULL. const TestInfo* GetTestInfo(int i) const; // Returns the TestResult that holds test properties recorded during - // execution of SetUpTestCase and TearDownTestCase. + // execution of SetUpTestSuite and TearDownTestSuite. const TestResult& ad_hoc_test_result() const { return ad_hoc_test_result_; } private: friend class Test; friend class internal::UnitTestImpl; - // Gets the (mutable) vector of TestInfos in this TestCase. + // Gets the (mutable) vector of TestInfos in this TestSuite. std::vector& test_info_list() { return test_info_list_; } - // Gets the (immutable) vector of TestInfos in this TestCase. + // Gets the (immutable) vector of TestInfos in this TestSuite. const std::vector& test_info_list() const { return test_info_list_; } @@ -908,51 +925,64 @@ class GTEST_API_ TestCase { // Sets the should_run member. void set_should_run(bool should) { should_run_ = should; } - // Adds a TestInfo to this test case. Will delete the TestInfo upon - // destruction of the TestCase object. + // Adds a TestInfo to this test suite. Will delete the TestInfo upon + // destruction of the TestSuite object. void AddTestInfo(TestInfo * test_info); - // Clears the results of all tests in this test case. + // Clears the results of all tests in this test suite. void ClearResult(); - // Clears the results of all tests in the given test case. - static void ClearTestCaseResult(TestCase* test_case) { - test_case->ClearResult(); + // Clears the results of all tests in the given test suite. + static void ClearTestSuiteResult(TestSuite* test_suite) { + test_suite->ClearResult(); } - // Runs every test in this TestCase. + // Runs every test in this TestSuite. void Run(); - // Runs SetUpTestCase() for this TestCase. This wrapper is needed - // for catching exceptions thrown from SetUpTestCase(). - void RunSetUpTestCase() { (*set_up_tc_)(); } + // Runs SetUpTestSuite() for this TestSuite. This wrapper is needed + // for catching exceptions thrown from SetUpTestSuite(). + void RunSetUpTestSuite() { + if (set_up_tc_ != nullptr) { + (*set_up_tc_)(); + } + } - // Runs TearDownTestCase() for this TestCase. This wrapper is - // needed for catching exceptions thrown from TearDownTestCase(). - void RunTearDownTestCase() { (*tear_down_tc_)(); } + // Runs TearDownTestSuite() for this TestSuite. This wrapper is + // needed for catching exceptions thrown from TearDownTestSuite(). + void RunTearDownTestSuite() { + if (tear_down_tc_ != nullptr) { + (*tear_down_tc_)(); + } + } - // Returns true iff test passed. + // Returns true if and only if test passed. static bool TestPassed(const TestInfo* test_info) { return test_info->should_run() && test_info->result()->Passed(); } - // Returns true iff test failed. + // Returns true if and only if test skipped. + static bool TestSkipped(const TestInfo* test_info) { + return test_info->should_run() && test_info->result()->Skipped(); + } + + // Returns true if and only if test failed. static bool TestFailed(const TestInfo* test_info) { return test_info->should_run() && test_info->result()->Failed(); } - // Returns true iff the test is disabled and will be reported in the XML - // report. + // Returns true if and only if the test is disabled and will be reported in + // the XML report. static bool TestReportableDisabled(const TestInfo* test_info) { return test_info->is_reportable() && test_info->is_disabled_; } - // Returns true iff test is disabled. + // Returns true if and only if test is disabled. static bool TestDisabled(const TestInfo* test_info) { return test_info->is_disabled_; } - // Returns true iff this test will appear in the XML report. + // Returns true if and only if this test will appear in the XML report. static bool TestReportable(const TestInfo* test_info) { return test_info->is_reportable(); } @@ -962,17 +992,17 @@ class GTEST_API_ TestCase { return test_info->should_run(); } - // Shuffles the tests in this test case. + // Shuffles the tests in this test suite. void ShuffleTests(internal::Random* random); // Restores the test order to before the first shuffle. void UnshuffleTests(); - // Name of the test case. + // Name of the test suite. std::string name_; // Name of the parameter type, or NULL if this is not a typed or a // type-parameterized test. - const internal::scoped_ptr type_param_; + const std::unique_ptr type_param_; // The vector of TestInfos in their original order. It owns the // elements in the vector. std::vector test_info_list_; @@ -980,20 +1010,22 @@ class GTEST_API_ TestCase { // shuffling and restoring the test order. The i-th element in this // vector is the index of the i-th test in the shuffled test list. std::vector test_indices_; - // Pointer to the function that sets up the test case. - Test::SetUpTestCaseFunc set_up_tc_; - // Pointer to the function that tears down the test case. - Test::TearDownTestCaseFunc tear_down_tc_; - // True iff any test in this test case should run. + // Pointer to the function that sets up the test suite. + internal::SetUpTestSuiteFunc set_up_tc_; + // Pointer to the function that tears down the test suite. + internal::TearDownTestSuiteFunc tear_down_tc_; + // True if and only if any test in this test suite should run. bool should_run_; + // The start time, in milliseconds since UNIX Epoch. + TimeInMillis start_timestamp_; // Elapsed time, in milliseconds. TimeInMillis elapsed_time_; - // Holds test properties recorded during execution of SetUpTestCase and - // TearDownTestCase. + // Holds test properties recorded during execution of SetUpTestSuite and + // TearDownTestSuite. TestResult ad_hoc_test_result_; - // We disallow copying TestCases. - GTEST_DISALLOW_COPY_AND_ASSIGN_(TestCase); + // We disallow copying TestSuites. + GTEST_DISALLOW_COPY_AND_ASSIGN_(TestSuite); }; // An Environment object is capable of setting up and tearing down an @@ -1024,7 +1056,7 @@ class Environment { // If you see an error about overriding the following function or // about it being private, you have mis-spelled SetUp() as Setup(). struct Setup_should_be_spelled_SetUp {}; - virtual Setup_should_be_spelled_SetUp* Setup() { return NULL; } + virtual Setup_should_be_spelled_SetUp* Setup() { return nullptr; } }; #if GTEST_HAS_EXCEPTIONS @@ -1060,8 +1092,13 @@ class TestEventListener { // Fired after environment set-up for each iteration of tests ends. virtual void OnEnvironmentsSetUpEnd(const UnitTest& unit_test) = 0; - // Fired before the test case starts. - virtual void OnTestCaseStart(const TestCase& test_case) = 0; + // Fired before the test suite starts. + virtual void OnTestSuiteStart(const TestSuite& /*test_suite*/) {} + + // Legacy API is deprecated but still available +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + virtual void OnTestCaseStart(const TestCase& /*test_case*/) {} +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ // Fired before the test starts. virtual void OnTestStart(const TestInfo& test_info) = 0; @@ -1074,8 +1111,13 @@ class TestEventListener { // Fired after the test ends. virtual void OnTestEnd(const TestInfo& test_info) = 0; - // Fired after the test case ends. - virtual void OnTestCaseEnd(const TestCase& test_case) = 0; + // Fired after the test suite ends. + virtual void OnTestSuiteEnd(const TestSuite& /*test_suite*/) {} + +// Legacy API is deprecated but still available +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + virtual void OnTestCaseEnd(const TestCase& /*test_case*/) {} +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ // Fired before environment tear-down for each iteration of tests starts. virtual void OnEnvironmentsTearDownStart(const UnitTest& unit_test) = 0; @@ -1098,21 +1140,30 @@ class TestEventListener { // above. class EmptyTestEventListener : public TestEventListener { public: - virtual void OnTestProgramStart(const UnitTest& /*unit_test*/) {} - virtual void OnTestIterationStart(const UnitTest& /*unit_test*/, - int /*iteration*/) {} - virtual void OnEnvironmentsSetUpStart(const UnitTest& /*unit_test*/) {} - virtual void OnEnvironmentsSetUpEnd(const UnitTest& /*unit_test*/) {} - virtual void OnTestCaseStart(const TestCase& /*test_case*/) {} - virtual void OnTestStart(const TestInfo& /*test_info*/) {} - virtual void OnTestPartResult(const TestPartResult& /*test_part_result*/) {} - virtual void OnTestEnd(const TestInfo& /*test_info*/) {} - virtual void OnTestCaseEnd(const TestCase& /*test_case*/) {} - virtual void OnEnvironmentsTearDownStart(const UnitTest& /*unit_test*/) {} - virtual void OnEnvironmentsTearDownEnd(const UnitTest& /*unit_test*/) {} - virtual void OnTestIterationEnd(const UnitTest& /*unit_test*/, - int /*iteration*/) {} - virtual void OnTestProgramEnd(const UnitTest& /*unit_test*/) {} + void OnTestProgramStart(const UnitTest& /*unit_test*/) override {} + void OnTestIterationStart(const UnitTest& /*unit_test*/, + int /*iteration*/) override {} + void OnEnvironmentsSetUpStart(const UnitTest& /*unit_test*/) override {} + void OnEnvironmentsSetUpEnd(const UnitTest& /*unit_test*/) override {} + void OnTestSuiteStart(const TestSuite& /*test_suite*/) override {} +// Legacy API is deprecated but still available +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + void OnTestCaseStart(const TestCase& /*test_case*/) override {} +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + + void OnTestStart(const TestInfo& /*test_info*/) override {} + void OnTestPartResult(const TestPartResult& /*test_part_result*/) override {} + void OnTestEnd(const TestInfo& /*test_info*/) override {} + void OnTestSuiteEnd(const TestSuite& /*test_suite*/) override {} +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + void OnTestCaseEnd(const TestCase& /*test_case*/) override {} +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + + void OnEnvironmentsTearDownStart(const UnitTest& /*unit_test*/) override {} + void OnEnvironmentsTearDownEnd(const UnitTest& /*unit_test*/) override {} + void OnTestIterationEnd(const UnitTest& /*unit_test*/, + int /*iteration*/) override {} + void OnTestProgramEnd(const UnitTest& /*unit_test*/) override {} }; // TestEventListeners lets users add listeners to track events in Google Test. @@ -1152,7 +1203,7 @@ class GTEST_API_ TestEventListeners { } private: - friend class TestCase; + friend class TestSuite; friend class TestInfo; friend class internal::DefaultGlobalTestPartResultReporter; friend class internal::NoExecDeathTest; @@ -1193,7 +1244,7 @@ class GTEST_API_ TestEventListeners { GTEST_DISALLOW_COPY_AND_ASSIGN_(TestEventListeners); }; -// A UnitTest consists of a vector of TestCases. +// A UnitTest consists of a vector of TestSuites. // // This is a singleton class. The only instance of UnitTest is // created when UnitTest::GetInstance() is first called. This @@ -1222,10 +1273,14 @@ class GTEST_API_ UnitTest { // was executed. The UnitTest object owns the string. const char* original_working_dir() const; - // Returns the TestCase object for the test that's currently running, + // Returns the TestSuite object for the test that's currently running, // or NULL if no test is running. - const TestCase* current_test_case() const - GTEST_LOCK_EXCLUDED_(mutex_); + const TestSuite* current_test_suite() const GTEST_LOCK_EXCLUDED_(mutex_); + +// Legacy API is still available but deprecated +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + const TestCase* current_test_case() const GTEST_LOCK_EXCLUDED_(mutex_); +#endif // Returns the TestInfo object for the test that's currently running, // or NULL if no test is running. @@ -1235,29 +1290,40 @@ class GTEST_API_ UnitTest { // Returns the random seed used at the start of the current test run. int random_seed() const; - // Returns the ParameterizedTestCaseRegistry object used to keep track of + // Returns the ParameterizedTestSuiteRegistry object used to keep track of // value-parameterized tests and instantiate and register them. // // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. - internal::ParameterizedTestCaseRegistry& parameterized_test_registry() + internal::ParameterizedTestSuiteRegistry& parameterized_test_registry() GTEST_LOCK_EXCLUDED_(mutex_); - // Gets the number of successful test cases. - int successful_test_case_count() const; + // Gets the number of successful test suites. + int successful_test_suite_count() const; - // Gets the number of failed test cases. - int failed_test_case_count() const; + // Gets the number of failed test suites. + int failed_test_suite_count() const; - // Gets the number of all test cases. - int total_test_case_count() const; + // Gets the number of all test suites. + int total_test_suite_count() const; - // Gets the number of all test cases that contain at least one test + // Gets the number of all test suites that contain at least one test // that should run. + int test_suite_to_run_count() const; + + // Legacy API is deprecated but still available +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + int successful_test_case_count() const; + int failed_test_case_count() const; + int total_test_case_count() const; int test_case_to_run_count() const; +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ // Gets the number of successful tests. int successful_test_count() const; + // Gets the number of skipped tests. + int skipped_test_count() const; + // Gets the number of failed tests. int failed_test_count() const; @@ -1283,19 +1349,25 @@ class GTEST_API_ UnitTest { // Gets the elapsed time, in milliseconds. TimeInMillis elapsed_time() const; - // Returns true iff the unit test passed (i.e. all test cases passed). + // Returns true if and only if the unit test passed (i.e. all test suites + // passed). bool Passed() const; - // Returns true iff the unit test failed (i.e. some test case failed - // or something outside of all tests failed). + // Returns true if and only if the unit test failed (i.e. some test suite + // failed or something outside of all tests failed). bool Failed() const; - // Gets the i-th test case among all the test cases. i can range from 0 to - // total_test_case_count() - 1. If i is not in that range, returns NULL. + // Gets the i-th test suite among all the test suites. i can range from 0 to + // total_test_suite_count() - 1. If i is not in that range, returns NULL. + const TestSuite* GetTestSuite(int i) const; + +// Legacy API is deprecated but still available +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ const TestCase* GetTestCase(int i) const; +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ // Returns the TestResult containing information on test failures and - // properties logged outside of individual test cases. + // properties logged outside of individual test suites. const TestResult& ad_hoc_test_result() const; // Returns the list of event listeners that can be used to track events @@ -1326,15 +1398,15 @@ class GTEST_API_ UnitTest { GTEST_LOCK_EXCLUDED_(mutex_); // Adds a TestProperty to the current TestResult object when invoked from - // inside a test, to current TestCase's ad_hoc_test_result_ when invoked - // from SetUpTestCase or TearDownTestCase, or to the global property set + // inside a test, to current TestSuite's ad_hoc_test_result_ when invoked + // from SetUpTestSuite or TearDownTestSuite, or to the global property set // when invoked elsewhere. If the result already contains a property with // the same key, the value will be updated. void RecordProperty(const std::string& key, const std::string& value); - // Gets the i-th test case among all the test cases. i can range from 0 to - // total_test_case_count() - 1. If i is not in that range, returns NULL. - TestCase* GetMutableTestCase(int i); + // Gets the i-th test suite among all the test suites. i can range from 0 to + // total_test_suite_count() - 1. If i is not in that range, returns NULL. + TestSuite* GetMutableTestSuite(int i); // Accessors for the implementation object. internal::UnitTestImpl* impl() { return impl_; } @@ -1419,6 +1491,10 @@ GTEST_API_ void InitGoogleTest(int* argc, char** argv); // UNICODE mode. GTEST_API_ void InitGoogleTest(int* argc, wchar_t** argv); +// This overloaded version can be used on Arduino/embedded platforms where +// there is no argc/argv. +GTEST_API_ void InitGoogleTest(); + namespace internal { // Separate the error generating code from the code path to reduce the stack @@ -1435,6 +1511,13 @@ AssertionResult CmpHelperEQFailure(const char* lhs_expression, false); } +// This block of code defines operator==/!= +// to block lexical scope lookup. +// It prevents using invalid operator==/!= defined at namespace scope. +struct faketype {}; +inline bool operator==(faketype, faketype) { return true; } +inline bool operator!=(faketype, faketype) { return false; } + // The helper function for {ASSERT|EXPECT}_EQ. template AssertionResult CmpHelperEQ(const char* lhs_expression, @@ -1456,18 +1539,17 @@ GTEST_API_ AssertionResult CmpHelperEQ(const char* lhs_expression, BiggestInt lhs, BiggestInt rhs); -// The helper class for {ASSERT|EXPECT}_EQ. The template argument -// lhs_is_null_literal is true iff the first argument to ASSERT_EQ() -// is a null pointer literal. The following default implementation is -// for lhs_is_null_literal being false. -template class EqHelper { public: // This templatized version is for the general case. - template + template < + typename T1, typename T2, + // Disable this overload for cases where one argument is a pointer + // and the other is the null pointer constant. + typename std::enable_if::value || + !std::is_pointer::value>::type* = nullptr> static AssertionResult Compare(const char* lhs_expression, - const char* rhs_expression, - const T1& lhs, + const char* rhs_expression, const T1& lhs, const T2& rhs) { return CmpHelperEQ(lhs_expression, rhs_expression, lhs, rhs); } @@ -1484,49 +1566,15 @@ class EqHelper { BiggestInt rhs) { return CmpHelperEQ(lhs_expression, rhs_expression, lhs, rhs); } -}; -// This specialization is used when the first argument to ASSERT_EQ() -// is a null pointer literal, like NULL, false, or 0. -template <> -class EqHelper { - public: - // We define two overloaded versions of Compare(). The first - // version will be picked when the second argument to ASSERT_EQ() is - // NOT a pointer, e.g. ASSERT_EQ(0, AnIntFunction()) or - // EXPECT_EQ(false, a_bool). - template - static AssertionResult Compare( - const char* lhs_expression, - const char* rhs_expression, - const T1& lhs, - const T2& rhs, - // The following line prevents this overload from being considered if T2 - // is not a pointer type. We need this because ASSERT_EQ(NULL, my_ptr) - // expands to Compare("", "", NULL, my_ptr), which requires a conversion - // to match the Secret* in the other overload, which would otherwise make - // this template match better. - typename EnableIf::value>::type* = 0) { - return CmpHelperEQ(lhs_expression, rhs_expression, lhs, rhs); - } - - // This version will be picked when the second argument to ASSERT_EQ() is a - // pointer, e.g. ASSERT_EQ(NULL, a_pointer). template static AssertionResult Compare( - const char* lhs_expression, - const char* rhs_expression, - // We used to have a second template parameter instead of Secret*. That - // template parameter would deduce to 'long', making this a better match - // than the first overload even without the first overload's EnableIf. - // Unfortunately, gcc with -Wconversion-null warns when "passing NULL to - // non-pointer argument" (even a deduced integral argument), so the old - // implementation caused warnings in user code. - Secret* /* lhs (NULL) */, - T* rhs) { + const char* lhs_expression, const char* rhs_expression, + // Handle cases where '0' is used as a null pointer literal. + std::nullptr_t /* lhs */, T* rhs) { // We already know that 'lhs' is a null pointer. - return CmpHelperEQ(lhs_expression, rhs_expression, - static_cast(NULL), rhs); + return CmpHelperEQ(lhs_expression, rhs_expression, static_cast(nullptr), + rhs); } }; @@ -1755,6 +1803,12 @@ class GTEST_API_ AssertHelper { GTEST_DISALLOW_COPY_AND_ASSIGN_(AssertHelper); }; +enum GTestColor { COLOR_DEFAULT, COLOR_RED, COLOR_GREEN, COLOR_YELLOW }; + +GTEST_API_ GTEST_ATTRIBUTE_PRINTF_(2, 3) void ColoredPrintf(GTestColor color, + const char* fmt, + ...); + } // namespace internal // The pure interface class that all value-parameterized tests inherit from. @@ -1774,13 +1828,13 @@ class GTEST_API_ AssertHelper { // FooTest() { // // Can use GetParam() here. // } -// virtual ~FooTest() { +// ~FooTest() override { // // Can use GetParam() here. // } -// virtual void SetUp() { +// void SetUp() override { // // Can use GetParam() here. // } -// virtual void TearDown { +// void TearDown override { // // Can use GetParam() here. // } // }; @@ -1789,7 +1843,7 @@ class GTEST_API_ AssertHelper { // Foo foo; // ASSERT_TRUE(foo.DoesBar(GetParam())); // } -// INSTANTIATE_TEST_CASE_P(OneToTenRange, FooTest, ::testing::Range(1, 10)); +// INSTANTIATE_TEST_SUITE_P(OneToTenRange, FooTest, ::testing::Range(1, 10)); template class WithParamInterface { @@ -1800,7 +1854,7 @@ class WithParamInterface { // The current parameter value. Is also available in the test fixture's // constructor. static const ParamType& GetParam() { - GTEST_CHECK_(parameter_ != NULL) + GTEST_CHECK_(parameter_ != nullptr) << "GetParam() can only be called inside a value-parameterized test " << "-- did you intend to write TEST_P instead of TEST_F?"; return *parameter_; @@ -1821,7 +1875,7 @@ class WithParamInterface { }; template -const T* WithParamInterface::parameter_ = NULL; +const T* WithParamInterface::parameter_ = nullptr; // Most value-parameterized classes can ignore the existence of // WithParamInterface, and can just inherit from ::testing::TestWithParam. @@ -1832,6 +1886,11 @@ class TestWithParam : public Test, public WithParamInterface { // Macros for indicating success/failure in test code. +// Skips test in runtime. +// Skipping test aborts current function. +// Skipped tests are neither successful nor failed. +#define GTEST_SKIP() GTEST_SKIP_("Skipped") + // ADD_FAILURE unconditionally adds a failure to the current test. // SUCCEED generates a success - it doesn't automatically make the // current test successful, as a test is only successful when it has @@ -1861,6 +1920,11 @@ class TestWithParam : public Test, public WithParamInterface { // Generates a fatal failure with a generic message. #define GTEST_FAIL() GTEST_FATAL_FAILURE_("Failed") +// Like GTEST_FAIL(), but at the given source file location. +#define GTEST_FAIL_AT(file, line) \ + GTEST_MESSAGE_AT_(file, line, "Failed", \ + ::testing::TestPartResult::kFatalFailure) + // Define this macro to 1 to omit the definition of FAIL(), which is a // generic name and clashes with some other libraries. #if !GTEST_DONT_DEFINE_FAIL @@ -1961,9 +2025,7 @@ class TestWithParam : public Test, public WithParamInterface { // ASSERT_GT(records.size(), 0) << "There is no record left."; #define EXPECT_EQ(val1, val2) \ - EXPECT_PRED_FORMAT2(::testing::internal:: \ - EqHelper::Compare, \ - val1, val2) + EXPECT_PRED_FORMAT2(::testing::internal::EqHelper::Compare, val1, val2) #define EXPECT_NE(val1, val2) \ EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperNE, val1, val2) #define EXPECT_LE(val1, val2) \ @@ -1976,9 +2038,7 @@ class TestWithParam : public Test, public WithParamInterface { EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperGT, val1, val2) #define GTEST_ASSERT_EQ(val1, val2) \ - ASSERT_PRED_FORMAT2(::testing::internal:: \ - EqHelper::Compare, \ - val1, val2) + ASSERT_PRED_FORMAT2(::testing::internal::EqHelper::Compare, val1, val2) #define GTEST_ASSERT_NE(val1, val2) \ ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperNE, val1, val2) #define GTEST_ASSERT_LE(val1, val2) \ @@ -2169,12 +2229,6 @@ class GTEST_API_ ScopedTrace { PushTrace(file, line, message ? message : "(null)"); } -#if GTEST_HAS_GLOBAL_STRING - ScopedTrace(const char* file, int line, const ::string& message) { - PushTrace(file, line, message); - } -#endif - ScopedTrace(const char* file, int line, const std::string& message) { PushTrace(file, line, message); } @@ -2212,10 +2266,9 @@ class GTEST_API_ ScopedTrace { ::testing::ScopedTrace GTEST_CONCAT_TOKEN_(gtest_trace_, __LINE__)(\ __FILE__, __LINE__, (message)) - // Compile-time assertion for type equality. -// StaticAssertTypeEq() compiles iff type1 and type2 are -// the same type. The value it returns is not interesting. +// StaticAssertTypeEq() compiles if and only if type1 and type2 +// are the same type. The value it returns is not interesting. // // Instead of making StaticAssertTypeEq a class template, we make it a // function template that invokes a helper class template. This @@ -2244,18 +2297,19 @@ class GTEST_API_ ScopedTrace { // // to cause a compiler error. template -bool StaticAssertTypeEq() { - (void)internal::StaticAssertTypeEqHelper(); +constexpr bool StaticAssertTypeEq() noexcept { + static_assert(std::is_same::value, + "type1 and type2 are not the same type"); return true; } // Defines a test. // -// The first parameter is the name of the test case, and the second -// parameter is the name of the test within the test case. +// The first parameter is the name of the test suite, and the second +// parameter is the name of the test within the test suite. // -// The convention is to end the test case name with "Test". For -// example, a test case for the Foo class can be named FooTest. +// The convention is to end the test suite name with "Test". For +// example, a test suite for the Foo class can be named FooTest. // // Test code should appear between braces after an invocation of // this macro. Example: @@ -2274,28 +2328,28 @@ bool StaticAssertTypeEq() { // code. GetTestTypeId() is guaranteed to always return the same // value, as it always calls GetTypeId<>() from the Google Test // framework. -#define GTEST_TEST(test_case_name, test_name)\ - GTEST_TEST_(test_case_name, test_name, \ - ::testing::Test, ::testing::internal::GetTestTypeId()) +#define GTEST_TEST(test_suite_name, test_name) \ + GTEST_TEST_(test_suite_name, test_name, ::testing::Test, \ + ::testing::internal::GetTestTypeId()) // Define this macro to 1 to omit the definition of TEST(), which // is a generic name and clashes with some other libraries. #if !GTEST_DONT_DEFINE_TEST -# define TEST(test_case_name, test_name) GTEST_TEST(test_case_name, test_name) +#define TEST(test_suite_name, test_name) GTEST_TEST(test_suite_name, test_name) #endif // Defines a test that uses a test fixture. // // The first parameter is the name of the test fixture class, which -// also doubles as the test case name. The second parameter is the -// name of the test within the test case. +// also doubles as the test suite name. The second parameter is the +// name of the test within the test suite. // // A test fixture class must be declared earlier. The user should put // the test code between braces after using this macro. Example: // // class FooTest : public testing::Test { // protected: -// virtual void SetUp() { b_.AddElement(3); } +// void SetUp() override { b_.AddElement(3); } // // Foo a_; // Foo b_; @@ -2309,7 +2363,8 @@ bool StaticAssertTypeEq() { // EXPECT_EQ(a_.size(), 0); // EXPECT_EQ(b_.size(), 1); // } - +// +// GOOGLETEST_CM0011 DO NOT DELETE #define TEST_F(test_fixture, test_name)\ GTEST_TEST_(test_fixture, test_name, test_fixture, \ ::testing::internal::GetTypeId()) @@ -2322,6 +2377,86 @@ GTEST_API_ std::string TempDir(); # pragma warning(pop) #endif +// Dynamically registers a test with the framework. +// +// This is an advanced API only to be used when the `TEST` macros are +// insufficient. The macros should be preferred when possible, as they avoid +// most of the complexity of calling this function. +// +// The `factory` argument is a factory callable (move-constructible) object or +// function pointer that creates a new instance of the Test object. It +// handles ownership to the caller. The signature of the callable is +// `Fixture*()`, where `Fixture` is the test fixture class for the test. All +// tests registered with the same `test_suite_name` must return the same +// fixture type. This is checked at runtime. +// +// The framework will infer the fixture class from the factory and will call +// the `SetUpTestSuite` and `TearDownTestSuite` for it. +// +// Must be called before `RUN_ALL_TESTS()` is invoked, otherwise behavior is +// undefined. +// +// Use case example: +// +// class MyFixture : public ::testing::Test { +// public: +// // All of these optional, just like in regular macro usage. +// static void SetUpTestSuite() { ... } +// static void TearDownTestSuite() { ... } +// void SetUp() override { ... } +// void TearDown() override { ... } +// }; +// +// class MyTest : public MyFixture { +// public: +// explicit MyTest(int data) : data_(data) {} +// void TestBody() override { ... } +// +// private: +// int data_; +// }; +// +// void RegisterMyTests(const std::vector& values) { +// for (int v : values) { +// ::testing::RegisterTest( +// "MyFixture", ("Test" + std::to_string(v)).c_str(), nullptr, +// std::to_string(v).c_str(), +// __FILE__, __LINE__, +// // Important to use the fixture type as the return type here. +// [=]() -> MyFixture* { return new MyTest(v); }); +// } +// } +// ... +// int main(int argc, char** argv) { +// std::vector values_to_test = LoadValuesFromConfig(); +// RegisterMyTests(values_to_test); +// ... +// return RUN_ALL_TESTS(); +// } +// +template +TestInfo* RegisterTest(const char* test_suite_name, const char* test_name, + const char* type_param, const char* value_param, + const char* file, int line, Factory factory) { + using TestT = typename std::remove_pointer::type; + + class FactoryImpl : public internal::TestFactoryBase { + public: + explicit FactoryImpl(Factory f) : factory_(std::move(f)) {} + Test* CreateTest() override { return factory_(); } + + private: + Factory factory_; + }; + + return internal::MakeAndRegisterTestInfo( + test_suite_name, test_name, type_param, value_param, + internal::CodeLocation(file, line), internal::GetTypeId(), + internal::SuiteApiResolver::GetSetUpCaseOrSuite(file, line), + internal::SuiteApiResolver::GetTearDownCaseOrSuite(file, line), + new FactoryImpl{std::move(factory)}); +} + } // namespace testing // Use this function in main() to run all tests. It returns 0 if all diff --git a/third_party/googletest/src/include/gtest/gtest_pred_impl.h b/third_party/googletest/src/include/gtest/gtest_pred_impl.h index 0c1105cb8e..d514255c73 100644 --- a/third_party/googletest/src/include/gtest/gtest_pred_impl.h +++ b/third_party/googletest/src/include/gtest/gtest_pred_impl.h @@ -27,11 +27,10 @@ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// This file is AUTOMATICALLY GENERATED on 01/02/2018 by command +// This file is AUTOMATICALLY GENERATED on 01/02/2019 by command // 'gen_gtest_pred_impl.py 5'. DO NOT EDIT BY HAND! // // Implements a family of generic predicate assertion macros. - // GOOGLETEST_CM0001 DO NOT DELETE #ifndef GTEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_ @@ -67,6 +66,8 @@ namespace testing { // We also define the EXPECT_* variations. // // For now we only support predicates whose arity is at most 5. +// Please email googletestframework@googlegroups.com if you need +// support for higher arities. // GTEST_ASSERT_ is the basic statement to which all of the assertions // in this file reduce. Don't use this in your code. @@ -89,9 +90,10 @@ AssertionResult AssertPred1Helper(const char* pred_text, const T1& v1) { if (pred(v1)) return AssertionSuccess(); - return AssertionFailure() << pred_text << "(" - << e1 << ") evaluates to false, where" - << "\n" << e1 << " evaluates to " << v1; + return AssertionFailure() + << pred_text << "(" << e1 << ") evaluates to false, where" + << "\n" + << e1 << " evaluates to " << ::testing::PrintToString(v1); } // Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT1. @@ -133,11 +135,12 @@ AssertionResult AssertPred2Helper(const char* pred_text, const T2& v2) { if (pred(v1, v2)) return AssertionSuccess(); - return AssertionFailure() << pred_text << "(" - << e1 << ", " - << e2 << ") evaluates to false, where" - << "\n" << e1 << " evaluates to " << v1 - << "\n" << e2 << " evaluates to " << v2; + return AssertionFailure() + << pred_text << "(" << e1 << ", " << e2 + << ") evaluates to false, where" + << "\n" + << e1 << " evaluates to " << ::testing::PrintToString(v1) << "\n" + << e2 << " evaluates to " << ::testing::PrintToString(v2); } // Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT2. @@ -184,13 +187,13 @@ AssertionResult AssertPred3Helper(const char* pred_text, const T3& v3) { if (pred(v1, v2, v3)) return AssertionSuccess(); - return AssertionFailure() << pred_text << "(" - << e1 << ", " - << e2 << ", " - << e3 << ") evaluates to false, where" - << "\n" << e1 << " evaluates to " << v1 - << "\n" << e2 << " evaluates to " << v2 - << "\n" << e3 << " evaluates to " << v3; + return AssertionFailure() + << pred_text << "(" << e1 << ", " << e2 << ", " << e3 + << ") evaluates to false, where" + << "\n" + << e1 << " evaluates to " << ::testing::PrintToString(v1) << "\n" + << e2 << " evaluates to " << ::testing::PrintToString(v2) << "\n" + << e3 << " evaluates to " << ::testing::PrintToString(v3); } // Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT3. @@ -242,15 +245,14 @@ AssertionResult AssertPred4Helper(const char* pred_text, const T4& v4) { if (pred(v1, v2, v3, v4)) return AssertionSuccess(); - return AssertionFailure() << pred_text << "(" - << e1 << ", " - << e2 << ", " - << e3 << ", " - << e4 << ") evaluates to false, where" - << "\n" << e1 << " evaluates to " << v1 - << "\n" << e2 << " evaluates to " << v2 - << "\n" << e3 << " evaluates to " << v3 - << "\n" << e4 << " evaluates to " << v4; + return AssertionFailure() + << pred_text << "(" << e1 << ", " << e2 << ", " << e3 << ", " << e4 + << ") evaluates to false, where" + << "\n" + << e1 << " evaluates to " << ::testing::PrintToString(v1) << "\n" + << e2 << " evaluates to " << ::testing::PrintToString(v2) << "\n" + << e3 << " evaluates to " << ::testing::PrintToString(v3) << "\n" + << e4 << " evaluates to " << ::testing::PrintToString(v4); } // Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT4. @@ -307,17 +309,15 @@ AssertionResult AssertPred5Helper(const char* pred_text, const T5& v5) { if (pred(v1, v2, v3, v4, v5)) return AssertionSuccess(); - return AssertionFailure() << pred_text << "(" - << e1 << ", " - << e2 << ", " - << e3 << ", " - << e4 << ", " - << e5 << ") evaluates to false, where" - << "\n" << e1 << " evaluates to " << v1 - << "\n" << e2 << " evaluates to " << v2 - << "\n" << e3 << " evaluates to " << v3 - << "\n" << e4 << " evaluates to " << v4 - << "\n" << e5 << " evaluates to " << v5; + return AssertionFailure() + << pred_text << "(" << e1 << ", " << e2 << ", " << e3 << ", " << e4 + << ", " << e5 << ") evaluates to false, where" + << "\n" + << e1 << " evaluates to " << ::testing::PrintToString(v1) << "\n" + << e2 << " evaluates to " << ::testing::PrintToString(v2) << "\n" + << e3 << " evaluates to " << ::testing::PrintToString(v3) << "\n" + << e4 << " evaluates to " << ::testing::PrintToString(v4) << "\n" + << e5 << " evaluates to " << ::testing::PrintToString(v5); } // Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT5. diff --git a/third_party/googletest/src/include/gtest/internal/gtest-death-test-internal.h b/third_party/googletest/src/include/gtest/internal/gtest-death-test-internal.h index 0a9b42c8a5..68bd353061 100644 --- a/third_party/googletest/src/include/gtest/internal/gtest-death-test-internal.h +++ b/third_party/googletest/src/include/gtest/internal/gtest-death-test-internal.h @@ -36,9 +36,11 @@ #ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_ #define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_ +#include "gtest/gtest-matchers.h" #include "gtest/internal/gtest-internal.h" #include +#include namespace testing { namespace internal { @@ -78,7 +80,7 @@ class GTEST_API_ DeathTest { // argument is set. If the death test should be skipped, the pointer // is set to NULL; otherwise, it is set to the address of a new concrete // DeathTest object that controls the execution of the current test. - static bool Create(const char* statement, const RE* regex, + static bool Create(const char* statement, Matcher matcher, const char* file, int line, DeathTest** test); DeathTest(); virtual ~DeathTest() { } @@ -144,21 +146,44 @@ GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 class DeathTestFactory { public: virtual ~DeathTestFactory() { } - virtual bool Create(const char* statement, const RE* regex, - const char* file, int line, DeathTest** test) = 0; + virtual bool Create(const char* statement, + Matcher matcher, const char* file, + int line, DeathTest** test) = 0; }; // A concrete DeathTestFactory implementation for normal use. class DefaultDeathTestFactory : public DeathTestFactory { public: - virtual bool Create(const char* statement, const RE* regex, - const char* file, int line, DeathTest** test); + bool Create(const char* statement, Matcher matcher, + const char* file, int line, DeathTest** test) override; }; // Returns true if exit_status describes a process that was terminated // by a signal, or exited normally with a nonzero exit code. GTEST_API_ bool ExitedUnsuccessfully(int exit_status); +// A string passed to EXPECT_DEATH (etc.) is caught by one of these overloads +// and interpreted as a regex (rather than an Eq matcher) for legacy +// compatibility. +inline Matcher MakeDeathTestMatcher( + ::testing::internal::RE regex) { + return ContainsRegex(regex.pattern()); +} +inline Matcher MakeDeathTestMatcher(const char* regex) { + return ContainsRegex(regex); +} +inline Matcher MakeDeathTestMatcher( + const ::std::string& regex) { + return ContainsRegex(regex); +} + +// If a Matcher is passed to EXPECT_DEATH (etc.), it's +// used directly. +inline Matcher MakeDeathTestMatcher( + Matcher matcher) { + return matcher; +} + // Traps C++ exceptions escaping statement and reports them as test // failures. Note that trapping SEH exceptions is not implemented here. # if GTEST_HAS_EXCEPTIONS @@ -186,38 +211,38 @@ GTEST_API_ bool ExitedUnsuccessfully(int exit_status); // This macro is for implementing ASSERT_DEATH*, EXPECT_DEATH*, // ASSERT_EXIT*, and EXPECT_EXIT*. -# define GTEST_DEATH_TEST_(statement, predicate, regex, fail) \ - GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ - if (::testing::internal::AlwaysTrue()) { \ - const ::testing::internal::RE& gtest_regex = (regex); \ - ::testing::internal::DeathTest* gtest_dt; \ - if (!::testing::internal::DeathTest::Create(#statement, >est_regex, \ - __FILE__, __LINE__, >est_dt)) { \ - goto GTEST_CONCAT_TOKEN_(gtest_label_, __LINE__); \ - } \ - if (gtest_dt != NULL) { \ - ::testing::internal::scoped_ptr< ::testing::internal::DeathTest> \ - gtest_dt_ptr(gtest_dt); \ - switch (gtest_dt->AssumeRole()) { \ - case ::testing::internal::DeathTest::OVERSEE_TEST: \ - if (!gtest_dt->Passed(predicate(gtest_dt->Wait()))) { \ - goto GTEST_CONCAT_TOKEN_(gtest_label_, __LINE__); \ - } \ - break; \ - case ::testing::internal::DeathTest::EXECUTE_TEST: { \ - ::testing::internal::DeathTest::ReturnSentinel \ - gtest_sentinel(gtest_dt); \ - GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, gtest_dt); \ - gtest_dt->Abort(::testing::internal::DeathTest::TEST_DID_NOT_DIE); \ - break; \ - } \ - default: \ - break; \ - } \ - } \ - } else \ - GTEST_CONCAT_TOKEN_(gtest_label_, __LINE__): \ - fail(::testing::internal::DeathTest::LastMessage()) +#define GTEST_DEATH_TEST_(statement, predicate, regex_or_matcher, fail) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (::testing::internal::AlwaysTrue()) { \ + ::testing::internal::DeathTest* gtest_dt; \ + if (!::testing::internal::DeathTest::Create( \ + #statement, \ + ::testing::internal::MakeDeathTestMatcher(regex_or_matcher), \ + __FILE__, __LINE__, >est_dt)) { \ + goto GTEST_CONCAT_TOKEN_(gtest_label_, __LINE__); \ + } \ + if (gtest_dt != nullptr) { \ + std::unique_ptr< ::testing::internal::DeathTest> gtest_dt_ptr(gtest_dt); \ + switch (gtest_dt->AssumeRole()) { \ + case ::testing::internal::DeathTest::OVERSEE_TEST: \ + if (!gtest_dt->Passed(predicate(gtest_dt->Wait()))) { \ + goto GTEST_CONCAT_TOKEN_(gtest_label_, __LINE__); \ + } \ + break; \ + case ::testing::internal::DeathTest::EXECUTE_TEST: { \ + ::testing::internal::DeathTest::ReturnSentinel gtest_sentinel( \ + gtest_dt); \ + GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, gtest_dt); \ + gtest_dt->Abort(::testing::internal::DeathTest::TEST_DID_NOT_DIE); \ + break; \ + } \ + default: \ + break; \ + } \ + } \ + } else \ + GTEST_CONCAT_TOKEN_(gtest_label_, __LINE__) \ + : fail(::testing::internal::DeathTest::LastMessage()) // The symbol "fail" here expands to something into which a message // can be streamed. @@ -226,14 +251,13 @@ GTEST_API_ bool ExitedUnsuccessfully(int exit_status); // must accept a streamed message even though the message is never printed. // The regex object is not evaluated, but it is used to prevent "unused" // warnings and to avoid an expression that doesn't compile in debug mode. -#define GTEST_EXECUTE_STATEMENT_(statement, regex) \ - GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ - if (::testing::internal::AlwaysTrue()) { \ - GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ - } else if (!::testing::internal::AlwaysTrue()) { \ - const ::testing::internal::RE& gtest_regex = (regex); \ - static_cast(gtest_regex); \ - } else \ +#define GTEST_EXECUTE_STATEMENT_(statement, regex_or_matcher) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (::testing::internal::AlwaysTrue()) { \ + GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ + } else if (!::testing::internal::AlwaysTrue()) { \ + ::testing::internal::MakeDeathTestMatcher(regex_or_matcher); \ + } else \ ::testing::Message() // A class representing the parsed contents of the diff --git a/third_party/googletest/src/include/gtest/internal/gtest-filepath.h b/third_party/googletest/src/include/gtest/internal/gtest-filepath.h index ae38d95bf8..c11b101516 100644 --- a/third_party/googletest/src/include/gtest/internal/gtest-filepath.h +++ b/third_party/googletest/src/include/gtest/internal/gtest-filepath.h @@ -110,7 +110,7 @@ class GTEST_API_ FilePath { const FilePath& base_name, const char* extension); - // Returns true iff the path is "". + // Returns true if and only if the path is "". bool IsEmpty() const { return pathname_.empty(); } // If input name has a trailing separator character, removes it and returns diff --git a/third_party/googletest/src/include/gtest/internal/gtest-internal.h b/third_party/googletest/src/include/gtest/internal/gtest-internal.h index b762f61fc5..94c816a28b 100644 --- a/third_party/googletest/src/include/gtest/internal/gtest-internal.h +++ b/third_party/googletest/src/include/gtest/internal/gtest-internal.h @@ -58,6 +58,7 @@ #include #include #include +#include #include #include "gtest/gtest-message.h" @@ -79,7 +80,6 @@ // Stringifies its argument. #define GTEST_STRINGIFY_(name) #name -class ProtocolMessage; namespace proto2 { class Message; } namespace testing { @@ -91,7 +91,7 @@ class Message; // Represents a failure message. class Test; // Represents a test. class TestInfo; // Information about a test. class TestPartResult; // Result of a test part. -class UnitTest; // A collection of test cases. +class UnitTest; // A collection of test suites. template ::std::string PrintToString(const T& value); @@ -106,34 +106,22 @@ class UnitTestImpl; // Opaque implementation of UnitTest // stack trace. GTEST_API_ extern const char kStackTraceMarker[]; -// Two overloaded helpers for checking at compile time whether an -// expression is a null pointer literal (i.e. NULL or any 0-valued -// compile-time integral constant). Their return values have -// different sizes, so we can use sizeof() to test which version is -// picked by the compiler. These helpers have no implementations, as -// we only need their signatures. -// -// Given IsNullLiteralHelper(x), the compiler will pick the first -// version if x can be implicitly converted to Secret*, and pick the -// second version otherwise. Since Secret is a secret and incomplete -// type, the only expression a user can write that has type Secret* is -// a null pointer literal. Therefore, we know that x is a null -// pointer literal if and only if the first version is picked by the -// compiler. -char IsNullLiteralHelper(Secret* p); -char (&IsNullLiteralHelper(...))[2]; // NOLINT - -// A compile-time bool constant that is true if and only if x is a -// null pointer literal (i.e. NULL or any 0-valued compile-time -// integral constant). -#ifdef GTEST_ELLIPSIS_NEEDS_POD_ -// We lose support for NULL detection where the compiler doesn't like -// passing non-POD classes through ellipsis (...). -# define GTEST_IS_NULL_LITERAL_(x) false -#else -# define GTEST_IS_NULL_LITERAL_(x) \ - (sizeof(::testing::internal::IsNullLiteralHelper(x)) == 1) -#endif // GTEST_ELLIPSIS_NEEDS_POD_ +// An IgnoredValue object can be implicitly constructed from ANY value. +class IgnoredValue { + struct Sink {}; + public: + // This constructor template allows any value to be implicitly + // converted to IgnoredValue. The object has no data member and + // doesn't try to remember anything about the argument. We + // deliberately omit the 'explicit' keyword in order to allow the + // conversion to be implicit. + // Disable the conversion if T already has a magical conversion operator. + // Otherwise we get ambiguity. + template ::value, + int>::type = 0> + IgnoredValue(const T& /* ignored */) {} // NOLINT(runtime/explicit) +}; // Appends the user-supplied message to the Google-Test-generated message. GTEST_API_ std::string AppendUserMessage( @@ -201,7 +189,7 @@ GTEST_API_ std::string DiffStrings(const std::string& left, // expected_value: "5" // actual_value: "6" // -// The ignoring_case parameter is true iff the assertion is a +// The ignoring_case parameter is true if and only if the assertion is a // *_STRCASEEQ*. When it's true, the string " (ignoring case)" will // be inserted into the message. GTEST_API_ AssertionResult EqFailure(const char* expected_expression, @@ -330,15 +318,15 @@ class FloatingPoint { // Returns the sign bit of this number. Bits sign_bit() const { return kSignBitMask & u_.bits_; } - // Returns true iff this is NAN (not a number). + // Returns true if and only if this is NAN (not a number). bool is_nan() const { // It's a NAN if the exponent bits are all ones and the fraction // bits are not entirely zeros. return (exponent_bits() == kExponentBitMask) && (fraction_bits() != 0); } - // Returns true iff this number is at most kMaxUlps ULP's away from - // rhs. In particular, this function: + // Returns true if and only if this number is at most kMaxUlps ULP's away + // from rhs. In particular, this function: // // - returns false if either number is (or both are) NAN. // - treats really large numbers as almost equal to infinity. @@ -409,7 +397,7 @@ typedef FloatingPoint Float; typedef FloatingPoint Double; // In order to catch the mistake of putting tests that use different -// test fixture classes in the same test case, we need to assign +// test fixture classes in the same test suite, we need to assign // unique IDs to fixture classes and compare them. The TypeId type is // used to hold such IDs. The user should treat TypeId as an opaque // type: the only operation allowed on TypeId values is to compare @@ -469,7 +457,7 @@ class TestFactoryBase { template class TestFactoryImpl : public TestFactoryBase { public: - virtual Test* CreateTest() { return new TestClass; } + Test* CreateTest() override { return new TestClass; } }; #if GTEST_OS_WINDOWS @@ -485,9 +473,9 @@ GTEST_API_ AssertionResult IsHRESULTFailure(const char* expr, #endif // GTEST_OS_WINDOWS -// Types of SetUpTestCase() and TearDownTestCase() functions. -typedef void (*SetUpTestCaseFunc)(); -typedef void (*TearDownTestCaseFunc)(); +// Types of SetUpTestSuite() and TearDownTestSuite() functions. +using SetUpTestSuiteFunc = void (*)(); +using TearDownTestSuiteFunc = void (*)(); struct CodeLocation { CodeLocation(const std::string& a_file, int a_line) @@ -497,12 +485,64 @@ struct CodeLocation { int line; }; +// Helper to identify which setup function for TestCase / TestSuite to call. +// Only one function is allowed, either TestCase or TestSute but not both. + +// Utility functions to help SuiteApiResolver +using SetUpTearDownSuiteFuncType = void (*)(); + +inline SetUpTearDownSuiteFuncType GetNotDefaultOrNull( + SetUpTearDownSuiteFuncType a, SetUpTearDownSuiteFuncType def) { + return a == def ? nullptr : a; +} + +template +// Note that SuiteApiResolver inherits from T because +// SetUpTestSuite()/TearDownTestSuite() could be protected. Ths way +// SuiteApiResolver can access them. +struct SuiteApiResolver : T { + // testing::Test is only forward declared at this point. So we make it a + // dependend class for the compiler to be OK with it. + using Test = + typename std::conditional::type; + + static SetUpTearDownSuiteFuncType GetSetUpCaseOrSuite(const char* filename, + int line_num) { + SetUpTearDownSuiteFuncType test_case_fp = + GetNotDefaultOrNull(&T::SetUpTestCase, &Test::SetUpTestCase); + SetUpTearDownSuiteFuncType test_suite_fp = + GetNotDefaultOrNull(&T::SetUpTestSuite, &Test::SetUpTestSuite); + + GTEST_CHECK_(!test_case_fp || !test_suite_fp) + << "Test can not provide both SetUpTestSuite and SetUpTestCase, please " + "make sure there is only one present at " + << filename << ":" << line_num; + + return test_case_fp != nullptr ? test_case_fp : test_suite_fp; + } + + static SetUpTearDownSuiteFuncType GetTearDownCaseOrSuite(const char* filename, + int line_num) { + SetUpTearDownSuiteFuncType test_case_fp = + GetNotDefaultOrNull(&T::TearDownTestCase, &Test::TearDownTestCase); + SetUpTearDownSuiteFuncType test_suite_fp = + GetNotDefaultOrNull(&T::TearDownTestSuite, &Test::TearDownTestSuite); + + GTEST_CHECK_(!test_case_fp || !test_suite_fp) + << "Test can not provide both TearDownTestSuite and TearDownTestCase," + " please make sure there is only one present at" + << filename << ":" << line_num; + + return test_case_fp != nullptr ? test_case_fp : test_suite_fp; + } +}; + // Creates a new TestInfo object and registers it with Google Test; // returns the created object. // // Arguments: // -// test_case_name: name of the test case +// test_suite_name: name of the test suite // name: name of the test // type_param the name of the test's type parameter, or NULL if // this is not a typed or a type-parameterized test. @@ -510,21 +550,16 @@ struct CodeLocation { // or NULL if this is not a type-parameterized test. // code_location: code location where the test is defined // fixture_class_id: ID of the test fixture class -// set_up_tc: pointer to the function that sets up the test case -// tear_down_tc: pointer to the function that tears down the test case +// set_up_tc: pointer to the function that sets up the test suite +// tear_down_tc: pointer to the function that tears down the test suite // factory: pointer to the factory that creates a test object. // The newly created TestInfo instance will assume // ownership of the factory object. GTEST_API_ TestInfo* MakeAndRegisterTestInfo( - const char* test_case_name, - const char* name, - const char* type_param, - const char* value_param, - CodeLocation code_location, - TypeId fixture_class_id, - SetUpTestCaseFunc set_up_tc, - TearDownTestCaseFunc tear_down_tc, - TestFactoryBase* factory); + const char* test_suite_name, const char* name, const char* type_param, + const char* value_param, CodeLocation code_location, + TypeId fixture_class_id, SetUpTestSuiteFunc set_up_tc, + TearDownTestSuiteFunc tear_down_tc, TestFactoryBase* factory); // If *pstr starts with the given prefix, modifies *pstr to be right // past the prefix and returns true; otherwise leaves *pstr unchanged @@ -536,19 +571,20 @@ GTEST_API_ bool SkipPrefix(const char* prefix, const char** pstr); GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \ /* class A needs to have dll-interface to be used by clients of class B */) -// State of the definition of a type-parameterized test case. -class GTEST_API_ TypedTestCasePState { +// State of the definition of a type-parameterized test suite. +class GTEST_API_ TypedTestSuitePState { public: - TypedTestCasePState() : registered_(false) {} + TypedTestSuitePState() : registered_(false) {} // Adds the given test name to defined_test_names_ and return true - // if the test case hasn't been registered; otherwise aborts the + // if the test suite hasn't been registered; otherwise aborts the // program. bool AddTestName(const char* file, int line, const char* case_name, const char* test_name) { if (registered_) { - fprintf(stderr, "%s Test %s must be defined before " - "REGISTER_TYPED_TEST_CASE_P(%s, ...).\n", + fprintf(stderr, + "%s Test %s must be defined before " + "REGISTER_TYPED_TEST_SUITE_P(%s, ...).\n", FormatFileLocation(file, line).c_str(), test_name, case_name); fflush(stderr); posix::Abort(); @@ -581,14 +617,19 @@ class GTEST_API_ TypedTestCasePState { RegisteredTestsMap registered_tests_; }; +// Legacy API is deprecated but still available +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ +using TypedTestCasePState = TypedTestSuitePState; +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 // Skips to the first non-space char after the first comma in 'str'; // returns NULL if no comma is found in 'str'. inline const char* SkipComma(const char* str) { const char* comma = strchr(str, ','); - if (comma == NULL) { - return NULL; + if (comma == nullptr) { + return nullptr; } while (IsSpace(*(++comma))) {} return comma; @@ -598,7 +639,7 @@ inline const char* SkipComma(const char* str) { // the entire string if it contains no comma. inline std::string GetPrefixUntilComma(const char* str) { const char* comma = strchr(str, ','); - return comma == NULL ? str : std::string(str, comma); + return comma == nullptr ? str : std::string(str, comma); } // Splits a given string on a given delimiter, populating a given @@ -648,7 +689,7 @@ template class TypeParameterizedTest { public: // 'index' is the index of the test in the type list 'Types' - // specified in INSTANTIATE_TYPED_TEST_CASE_P(Prefix, TestCase, + // specified in INSTANTIATE_TYPED_TEST_SUITE_P(Prefix, TestSuite, // Types). Valid values for 'index' are [0, N - 1] where N is the // length of Types. static bool Register(const char* prefix, const CodeLocation& code_location, @@ -663,13 +704,17 @@ class TypeParameterizedTest { // list. MakeAndRegisterTestInfo( (std::string(prefix) + (prefix[0] == '\0' ? "" : "/") + case_name + - "/" + type_names[index]) + "/" + type_names[static_cast(index)]) .c_str(), StripTrailingSpaces(GetPrefixUntilComma(test_names)).c_str(), GetTypeName().c_str(), - NULL, // No value parameter. - code_location, GetTypeId(), TestClass::SetUpTestCase, - TestClass::TearDownTestCase, new TestFactoryImpl); + nullptr, // No value parameter. + code_location, GetTypeId(), + SuiteApiResolver::GetSetUpCaseOrSuite( + code_location.file.c_str(), code_location.line), + SuiteApiResolver::GetTearDownCaseOrSuite( + code_location.file.c_str(), code_location.line), + new TestFactoryImpl); // Next, recurses (at compile time) with the tail of the type list. return TypeParameterizedTest { } }; -// TypeParameterizedTestCase::Register() +// TypeParameterizedTestSuite::Register() // registers *all combinations* of 'Tests' and 'Types' with Google // Test. The return value is insignificant - we just need to return // something such that we can call this function in a namespace scope. template -class TypeParameterizedTestCase { +class TypeParameterizedTestSuite { public: static bool Register(const char* prefix, CodeLocation code_location, - const TypedTestCasePState* state, const char* case_name, + const TypedTestSuitePState* state, const char* case_name, const char* test_names, const std::vector& type_names = GenerateNames()) { @@ -726,20 +771,20 @@ class TypeParameterizedTestCase { prefix, test_location, case_name, test_names, 0, type_names); // Next, recurses (at compile time) with the tail of the test list. - return TypeParameterizedTestCase::Register(prefix, code_location, - state, case_name, - SkipComma(test_names), - type_names); + return TypeParameterizedTestSuite::Register(prefix, code_location, + state, case_name, + SkipComma(test_names), + type_names); } }; // The base case for the compile time recursion. template -class TypeParameterizedTestCase { +class TypeParameterizedTestSuite { public: static bool Register(const char* /*prefix*/, const CodeLocation&, - const TypedTestCasePState* /*state*/, + const TypedTestSuitePState* /*state*/, const char* /*case_name*/, const char* /*test_names*/, const std::vector& = std::vector() /*type_names*/) { @@ -802,120 +847,16 @@ class GTEST_API_ Random { GTEST_DISALLOW_COPY_AND_ASSIGN_(Random); }; -// Defining a variable of type CompileAssertTypesEqual will cause a -// compiler error iff T1 and T2 are different types. -template -struct CompileAssertTypesEqual; - -template -struct CompileAssertTypesEqual { -}; - -// Removes the reference from a type if it is a reference type, -// otherwise leaves it unchanged. This is the same as -// tr1::remove_reference, which is not widely available yet. -template -struct RemoveReference { typedef T type; }; // NOLINT -template -struct RemoveReference { typedef T type; }; // NOLINT - -// A handy wrapper around RemoveReference that works when the argument -// T depends on template parameters. -#define GTEST_REMOVE_REFERENCE_(T) \ - typename ::testing::internal::RemoveReference::type - -// Removes const from a type if it is a const type, otherwise leaves -// it unchanged. This is the same as tr1::remove_const, which is not -// widely available yet. -template -struct RemoveConst { typedef T type; }; // NOLINT -template -struct RemoveConst { typedef T type; }; // NOLINT - -// MSVC 8.0, Sun C++, and IBM XL C++ have a bug which causes the above -// definition to fail to remove the const in 'const int[3]' and 'const -// char[3][4]'. The following specialization works around the bug. -template -struct RemoveConst { - typedef typename RemoveConst::type type[N]; -}; - -#if defined(_MSC_VER) && _MSC_VER < 1400 -// This is the only specialization that allows VC++ 7.1 to remove const in -// 'const int[3] and 'const int[3][4]'. However, it causes trouble with GCC -// and thus needs to be conditionally compiled. -template -struct RemoveConst { - typedef typename RemoveConst::type type[N]; -}; -#endif - -// A handy wrapper around RemoveConst that works when the argument -// T depends on template parameters. -#define GTEST_REMOVE_CONST_(T) \ - typename ::testing::internal::RemoveConst::type - // Turns const U&, U&, const U, and U all into U. #define GTEST_REMOVE_REFERENCE_AND_CONST_(T) \ - GTEST_REMOVE_CONST_(GTEST_REMOVE_REFERENCE_(T)) - -// ImplicitlyConvertible::value is a compile-time bool -// constant that's true iff type From can be implicitly converted to -// type To. -template -class ImplicitlyConvertible { - private: - // We need the following helper functions only for their types. - // They have no implementations. - - // MakeFrom() is an expression whose type is From. We cannot simply - // use From(), as the type From may not have a public default - // constructor. - static typename AddReference::type MakeFrom(); - - // These two functions are overloaded. Given an expression - // Helper(x), the compiler will pick the first version if x can be - // implicitly converted to type To; otherwise it will pick the - // second version. - // - // The first version returns a value of size 1, and the second - // version returns a value of size 2. Therefore, by checking the - // size of Helper(x), which can be done at compile time, we can tell - // which version of Helper() is used, and hence whether x can be - // implicitly converted to type To. - static char Helper(To); - static char (&Helper(...))[2]; // NOLINT - - // We have to put the 'public' section after the 'private' section, - // or MSVC refuses to compile the code. - public: -#if defined(__BORLANDC__) - // C++Builder cannot use member overload resolution during template - // instantiation. The simplest workaround is to use its C++0x type traits - // functions (C++Builder 2009 and above only). - static const bool value = __is_convertible(From, To); -#else - // MSVC warns about implicitly converting from double to int for - // possible loss of data, so we need to temporarily disable the - // warning. - GTEST_DISABLE_MSC_WARNINGS_PUSH_(4244) - static const bool value = - sizeof(Helper(ImplicitlyConvertible::MakeFrom())) == 1; - GTEST_DISABLE_MSC_WARNINGS_POP_() -#endif // __BORLANDC__ -}; -template -const bool ImplicitlyConvertible::value; + typename std::remove_const::type>::type // IsAProtocolMessage::value is a compile-time bool constant that's -// true iff T is type ProtocolMessage, proto2::Message, or a subclass -// of those. +// true if and only if T is type proto2::Message or a subclass of it. template struct IsAProtocolMessage : public bool_constant< - ImplicitlyConvertible::value || - ImplicitlyConvertible::value> { -}; + std::is_convertible::value> {}; // When the compiler sees expression IsContainerTest(0), if C is an // STL-style container class, the first overload of IsContainerTest @@ -942,7 +883,6 @@ struct IsAProtocolMessage // IsContainerTest(typename C::const_iterator*) and // IsContainerTest(...) doesn't work with Visual Age C++ and Sun C++. typedef int IsContainer; -#if GTEST_LANG_CXX11 template ().begin()), class = decltype(::std::declval().end()), @@ -952,14 +892,6 @@ template -IsContainer IsContainerTest(int /* dummy */, - typename C::iterator* /* it */ = NULL, - typename C::const_iterator* /* const_it */ = NULL) { - return 0; -} -#endif // GTEST_LANG_CXX11 typedef char IsNotContainer; template @@ -980,47 +912,30 @@ struct IsHashTable { static char test(...); public: - static const bool value = sizeof(test(0, 0)) == sizeof(int); + static const bool value = sizeof(test(nullptr, nullptr)) == sizeof(int); }; template const bool IsHashTable::value; -template -struct VoidT { - typedef void value_type; -}; - -template -struct HasValueType : false_type {}; -template -struct HasValueType > : true_type { -}; - template (0)) == sizeof(IsContainer), - bool = HasValueType::value> + bool = sizeof(IsContainerTest(0)) == sizeof(IsContainer)> struct IsRecursiveContainerImpl; -template -struct IsRecursiveContainerImpl : public false_type {}; +template +struct IsRecursiveContainerImpl : public std::false_type {}; // Since the IsRecursiveContainerImpl depends on the IsContainerTest we need to // obey the same inconsistencies as the IsContainerTest, namely check if // something is a container is relying on only const_iterator in C++11 and // is relying on both const_iterator and iterator otherwise template -struct IsRecursiveContainerImpl : public false_type {}; - -template -struct IsRecursiveContainerImpl { - #if GTEST_LANG_CXX11 - typedef typename IteratorTraits::value_type - value_type; -#else - typedef typename IteratorTraits::value_type value_type; -#endif - typedef is_same type; +struct IsRecursiveContainerImpl { + using value_type = decltype(*std::declval()); + using type = + std::is_same::type>::type, + C>; }; // IsRecursiveContainer is a unary compile-time predicate that @@ -1032,13 +947,6 @@ struct IsRecursiveContainerImpl { template struct IsRecursiveContainer : public IsRecursiveContainerImpl::type {}; -// EnableIf::type is void when 'Cond' is true, and -// undefined when 'Cond' is false. To use SFINAE to make a function -// overload only apply when a particular expression is true, add -// "typename EnableIf::type* = 0" as the last parameter. -template struct EnableIf; -template<> struct EnableIf { typedef void type; }; // NOLINT - // Utilities for native arrays. // ArrayEq() compares two k-dimensional native arrays using the @@ -1161,10 +1069,9 @@ class NativeArray { } private: - enum { - kCheckTypeIsNotConstOrAReference = StaticAssertTypeEqHelper< - Element, GTEST_REMOVE_REFERENCE_AND_CONST_(Element)>::value - }; + static_assert(!std::is_const::value, "Type must not be const"); + static_assert(!std::is_reference::value, + "Type must not be a reference"); // Initializes this object with a copy of the input. void InitCopy(const Element* array, size_t a_size) { @@ -1189,6 +1096,139 @@ class NativeArray { GTEST_DISALLOW_ASSIGN_(NativeArray); }; +// Backport of std::index_sequence. +template +struct IndexSequence { + using type = IndexSequence; +}; + +// Double the IndexSequence, and one if plus_one is true. +template +struct DoubleSequence; +template +struct DoubleSequence, sizeofT> { + using type = IndexSequence; +}; +template +struct DoubleSequence, sizeofT> { + using type = IndexSequence; +}; + +// Backport of std::make_index_sequence. +// It uses O(ln(N)) instantiation depth. +template +struct MakeIndexSequence + : DoubleSequence::type, + N / 2>::type {}; + +template <> +struct MakeIndexSequence<0> : IndexSequence<> {}; + +// FIXME: This implementation of ElemFromList is O(1) in instantiation depth, +// but it is O(N^2) in total instantiations. Not sure if this is the best +// tradeoff, as it will make it somewhat slow to compile. +template +struct ElemFromListImpl {}; + +template +struct ElemFromListImpl { + using type = T; +}; + +// Get the Nth element from T... +// It uses O(1) instantiation depth. +template +struct ElemFromList; + +template +struct ElemFromList, T...> + : ElemFromListImpl... {}; + +template +class FlatTuple; + +template +struct FlatTupleElemBase; + +template +struct FlatTupleElemBase, I> { + using value_type = + typename ElemFromList::type, + T...>::type; + FlatTupleElemBase() = default; + explicit FlatTupleElemBase(value_type t) : value(std::move(t)) {} + value_type value; +}; + +template +struct FlatTupleBase; + +template +struct FlatTupleBase, IndexSequence> + : FlatTupleElemBase, Idx>... { + using Indices = IndexSequence; + FlatTupleBase() = default; + explicit FlatTupleBase(T... t) + : FlatTupleElemBase, Idx>(std::move(t))... {} +}; + +// Analog to std::tuple but with different tradeoffs. +// This class minimizes the template instantiation depth, thus allowing more +// elements that std::tuple would. std::tuple has been seen to require an +// instantiation depth of more than 10x the number of elements in some +// implementations. +// FlatTuple and ElemFromList are not recursive and have a fixed depth +// regardless of T... +// MakeIndexSequence, on the other hand, it is recursive but with an +// instantiation depth of O(ln(N)). +template +class FlatTuple + : private FlatTupleBase, + typename MakeIndexSequence::type> { + using Indices = typename FlatTuple::FlatTupleBase::Indices; + + public: + FlatTuple() = default; + explicit FlatTuple(T... t) : FlatTuple::FlatTupleBase(std::move(t)...) {} + + template + const typename ElemFromList::type& Get() const { + return static_cast*>(this)->value; + } + + template + typename ElemFromList::type& Get() { + return static_cast*>(this)->value; + } +}; + +// Utility functions to be called with static_assert to induce deprecation +// warnings. +GTEST_INTERNAL_DEPRECATED( + "INSTANTIATE_TEST_CASE_P is deprecated, please use " + "INSTANTIATE_TEST_SUITE_P") +constexpr bool InstantiateTestCase_P_IsDeprecated() { return true; } + +GTEST_INTERNAL_DEPRECATED( + "TYPED_TEST_CASE_P is deprecated, please use " + "TYPED_TEST_SUITE_P") +constexpr bool TypedTestCase_P_IsDeprecated() { return true; } + +GTEST_INTERNAL_DEPRECATED( + "TYPED_TEST_CASE is deprecated, please use " + "TYPED_TEST_SUITE") +constexpr bool TypedTestCaseIsDeprecated() { return true; } + +GTEST_INTERNAL_DEPRECATED( + "REGISTER_TYPED_TEST_CASE_P is deprecated, please use " + "REGISTER_TYPED_TEST_SUITE_P") +constexpr bool RegisterTypedTestCase_P_IsDeprecated() { return true; } + +GTEST_INTERNAL_DEPRECATED( + "INSTANTIATE_TYPED_TEST_CASE_P is deprecated, please use " + "INSTANTIATE_TYPED_TEST_SUITE_P") +constexpr bool InstantiateTypedTestCase_P_IsDeprecated() { return true; } + } // namespace internal } // namespace testing @@ -1208,7 +1248,10 @@ class NativeArray { #define GTEST_SUCCESS_(message) \ GTEST_MESSAGE_(message, ::testing::TestPartResult::kSuccess) -// Suppress MSVC warning 4702 (unreachable code) for the code following +#define GTEST_SKIP_(message) \ + return GTEST_MESSAGE_(message, ::testing::TestPartResult::kSkip) + +// Suppress MSVC warning 4072 (unreachable code) for the code following // statement if it returns or throws (or doesn't return or throw in some // situations). #define GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement) \ @@ -1300,31 +1343,38 @@ class NativeArray { " Actual: it does.") // Expands to the name of the class that implements the given test. -#define GTEST_TEST_CLASS_NAME_(test_case_name, test_name) \ - test_case_name##_##test_name##_Test +#define GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) \ + test_suite_name##_##test_name##_Test // Helper macro for defining tests. -#define GTEST_TEST_(test_case_name, test_name, parent_class, parent_id)\ -class GTEST_TEST_CLASS_NAME_(test_case_name, test_name) : public parent_class {\ - public:\ - GTEST_TEST_CLASS_NAME_(test_case_name, test_name)() {}\ - private:\ - virtual void TestBody();\ - static ::testing::TestInfo* const test_info_ GTEST_ATTRIBUTE_UNUSED_;\ - GTEST_DISALLOW_COPY_AND_ASSIGN_(\ - GTEST_TEST_CLASS_NAME_(test_case_name, test_name));\ -};\ -\ -::testing::TestInfo* const GTEST_TEST_CLASS_NAME_(test_case_name, test_name)\ - ::test_info_ =\ - ::testing::internal::MakeAndRegisterTestInfo(\ - #test_case_name, #test_name, NULL, NULL, \ - ::testing::internal::CodeLocation(__FILE__, __LINE__), \ - (parent_id), \ - parent_class::SetUpTestCase, \ - parent_class::TearDownTestCase, \ - new ::testing::internal::TestFactoryImpl<\ - GTEST_TEST_CLASS_NAME_(test_case_name, test_name)>);\ -void GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::TestBody() +#define GTEST_TEST_(test_suite_name, test_name, parent_class, parent_id) \ + static_assert(sizeof(GTEST_STRINGIFY_(test_suite_name)) > 1, \ + "test_suite_name must not be empty"); \ + static_assert(sizeof(GTEST_STRINGIFY_(test_name)) > 1, \ + "test_name must not be empty"); \ + class GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) \ + : public parent_class { \ + public: \ + GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)() {} \ + \ + private: \ + virtual void TestBody(); \ + static ::testing::TestInfo* const test_info_ GTEST_ATTRIBUTE_UNUSED_; \ + GTEST_DISALLOW_COPY_AND_ASSIGN_(GTEST_TEST_CLASS_NAME_(test_suite_name, \ + test_name)); \ + }; \ + \ + ::testing::TestInfo* const GTEST_TEST_CLASS_NAME_(test_suite_name, \ + test_name)::test_info_ = \ + ::testing::internal::MakeAndRegisterTestInfo( \ + #test_suite_name, #test_name, nullptr, nullptr, \ + ::testing::internal::CodeLocation(__FILE__, __LINE__), (parent_id), \ + ::testing::internal::SuiteApiResolver< \ + parent_class>::GetSetUpCaseOrSuite(__FILE__, __LINE__), \ + ::testing::internal::SuiteApiResolver< \ + parent_class>::GetTearDownCaseOrSuite(__FILE__, __LINE__), \ + new ::testing::internal::TestFactoryImpl); \ + void GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)::TestBody() #endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_ diff --git a/third_party/googletest/src/include/gtest/internal/gtest-linked_ptr.h b/third_party/googletest/src/include/gtest/internal/gtest-linked_ptr.h deleted file mode 100644 index 082b87289a..0000000000 --- a/third_party/googletest/src/include/gtest/internal/gtest-linked_ptr.h +++ /dev/null @@ -1,243 +0,0 @@ -// Copyright 2003 Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// A "smart" pointer type with reference tracking. Every pointer to a -// particular object is kept on a circular linked list. When the last pointer -// to an object is destroyed or reassigned, the object is deleted. -// -// Used properly, this deletes the object when the last reference goes away. -// There are several caveats: -// - Like all reference counting schemes, cycles lead to leaks. -// - Each smart pointer is actually two pointers (8 bytes instead of 4). -// - Every time a pointer is assigned, the entire list of pointers to that -// object is traversed. This class is therefore NOT SUITABLE when there -// will often be more than two or three pointers to a particular object. -// - References are only tracked as long as linked_ptr<> objects are copied. -// If a linked_ptr<> is converted to a raw pointer and back, BAD THINGS -// will happen (double deletion). -// -// A good use of this class is storing object references in STL containers. -// You can safely put linked_ptr<> in a vector<>. -// Other uses may not be as good. -// -// Note: If you use an incomplete type with linked_ptr<>, the class -// *containing* linked_ptr<> must have a constructor and destructor (even -// if they do nothing!). -// -// Bill Gibbons suggested we use something like this. -// -// Thread Safety: -// Unlike other linked_ptr implementations, in this implementation -// a linked_ptr object is thread-safe in the sense that: -// - it's safe to copy linked_ptr objects concurrently, -// - it's safe to copy *from* a linked_ptr and read its underlying -// raw pointer (e.g. via get()) concurrently, and -// - it's safe to write to two linked_ptrs that point to the same -// shared object concurrently. -// FIXME: rename this to safe_linked_ptr to avoid -// confusion with normal linked_ptr. - -// GOOGLETEST_CM0001 DO NOT DELETE - -#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_LINKED_PTR_H_ -#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_LINKED_PTR_H_ - -#include -#include - -#include "gtest/internal/gtest-port.h" - -namespace testing { -namespace internal { - -// Protects copying of all linked_ptr objects. -GTEST_API_ GTEST_DECLARE_STATIC_MUTEX_(g_linked_ptr_mutex); - -// This is used internally by all instances of linked_ptr<>. It needs to be -// a non-template class because different types of linked_ptr<> can refer to -// the same object (linked_ptr(obj) vs linked_ptr(obj)). -// So, it needs to be possible for different types of linked_ptr to participate -// in the same circular linked list, so we need a single class type here. -// -// DO NOT USE THIS CLASS DIRECTLY YOURSELF. Use linked_ptr. -class linked_ptr_internal { - public: - // Create a new circle that includes only this instance. - void join_new() { - next_ = this; - } - - // Many linked_ptr operations may change p.link_ for some linked_ptr - // variable p in the same circle as this object. Therefore we need - // to prevent two such operations from occurring concurrently. - // - // Note that different types of linked_ptr objects can coexist in a - // circle (e.g. linked_ptr, linked_ptr, and - // linked_ptr). Therefore we must use a single mutex to - // protect all linked_ptr objects. This can create serious - // contention in production code, but is acceptable in a testing - // framework. - - // Join an existing circle. - void join(linked_ptr_internal const* ptr) - GTEST_LOCK_EXCLUDED_(g_linked_ptr_mutex) { - MutexLock lock(&g_linked_ptr_mutex); - - linked_ptr_internal const* p = ptr; - while (p->next_ != ptr) { - assert(p->next_ != this && - "Trying to join() a linked ring we are already in. " - "Is GMock thread safety enabled?"); - p = p->next_; - } - p->next_ = this; - next_ = ptr; - } - - // Leave whatever circle we're part of. Returns true if we were the - // last member of the circle. Once this is done, you can join() another. - bool depart() - GTEST_LOCK_EXCLUDED_(g_linked_ptr_mutex) { - MutexLock lock(&g_linked_ptr_mutex); - - if (next_ == this) return true; - linked_ptr_internal const* p = next_; - while (p->next_ != this) { - assert(p->next_ != next_ && - "Trying to depart() a linked ring we are not in. " - "Is GMock thread safety enabled?"); - p = p->next_; - } - p->next_ = next_; - return false; - } - - private: - mutable linked_ptr_internal const* next_; -}; - -template -class linked_ptr { - public: - typedef T element_type; - - // Take over ownership of a raw pointer. This should happen as soon as - // possible after the object is created. - explicit linked_ptr(T* ptr = NULL) { capture(ptr); } - ~linked_ptr() { depart(); } - - // Copy an existing linked_ptr<>, adding ourselves to the list of references. - template linked_ptr(linked_ptr const& ptr) { copy(&ptr); } - linked_ptr(linked_ptr const& ptr) { // NOLINT - assert(&ptr != this); - copy(&ptr); - } - - // Assignment releases the old value and acquires the new. - template linked_ptr& operator=(linked_ptr const& ptr) { - depart(); - copy(&ptr); - return *this; - } - - linked_ptr& operator=(linked_ptr const& ptr) { - if (&ptr != this) { - depart(); - copy(&ptr); - } - return *this; - } - - // Smart pointer members. - void reset(T* ptr = NULL) { - depart(); - capture(ptr); - } - T* get() const { return value_; } - T* operator->() const { return value_; } - T& operator*() const { return *value_; } - - bool operator==(T* p) const { return value_ == p; } - bool operator!=(T* p) const { return value_ != p; } - template - bool operator==(linked_ptr const& ptr) const { - return value_ == ptr.get(); - } - template - bool operator!=(linked_ptr const& ptr) const { - return value_ != ptr.get(); - } - - private: - template - friend class linked_ptr; - - T* value_; - linked_ptr_internal link_; - - void depart() { - if (link_.depart()) delete value_; - } - - void capture(T* ptr) { - value_ = ptr; - link_.join_new(); - } - - template void copy(linked_ptr const* ptr) { - value_ = ptr->get(); - if (value_) - link_.join(&ptr->link_); - else - link_.join_new(); - } -}; - -template inline -bool operator==(T* ptr, const linked_ptr& x) { - return ptr == x.get(); -} - -template inline -bool operator!=(T* ptr, const linked_ptr& x) { - return ptr != x.get(); -} - -// A function to convert T* into linked_ptr -// Doing e.g. make_linked_ptr(new FooBarBaz(arg)) is a shorter notation -// for linked_ptr >(new FooBarBaz(arg)) -template -linked_ptr make_linked_ptr(T* ptr) { - return linked_ptr(ptr); -} - -} // namespace internal -} // namespace testing - -#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_LINKED_PTR_H_ diff --git a/third_party/googletest/src/include/gtest/internal/gtest-param-util-generated.h b/third_party/googletest/src/include/gtest/internal/gtest-param-util-generated.h deleted file mode 100644 index 4fac8c0270..0000000000 --- a/third_party/googletest/src/include/gtest/internal/gtest-param-util-generated.h +++ /dev/null @@ -1,5552 +0,0 @@ -// This file was GENERATED by command: -// pump.py gtest-param-util-generated.h.pump -// DO NOT EDIT BY HAND!!! - -// Copyright 2008 Google Inc. -// All Rights Reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - -// Type and function utilities for implementing parameterized tests. -// This file is generated by a SCRIPT. DO NOT EDIT BY HAND! -// -// Currently Google Test supports at most 50 arguments in Values, -// and at most 10 arguments in Combine. Please contact -// googletestframework@googlegroups.com if you need more. -// Please note that the number of arguments to Combine is limited -// by the maximum arity of the implementation of tuple which is -// currently set at 10. - -// GOOGLETEST_CM0001 DO NOT DELETE - -#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_GENERATED_H_ -#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_GENERATED_H_ - -#include "gtest/internal/gtest-param-util.h" -#include "gtest/internal/gtest-port.h" - -namespace testing { - -// Forward declarations of ValuesIn(), which is implemented in -// include/gtest/gtest-param-test.h. -template -internal::ParamGenerator< - typename ::testing::internal::IteratorTraits::value_type> -ValuesIn(ForwardIterator begin, ForwardIterator end); - -template -internal::ParamGenerator ValuesIn(const T (&array)[N]); - -template -internal::ParamGenerator ValuesIn( - const Container& container); - -namespace internal { - -// Used in the Values() function to provide polymorphic capabilities. -template -class ValueArray1 { - public: - explicit ValueArray1(T1 v1) : v1_(v1) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_)}; - return ValuesIn(array); - } - - ValueArray1(const ValueArray1& other) : v1_(other.v1_) {} - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray1& other); - - const T1 v1_; -}; - -template -class ValueArray2 { - public: - ValueArray2(T1 v1, T2 v2) : v1_(v1), v2_(v2) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_)}; - return ValuesIn(array); - } - - ValueArray2(const ValueArray2& other) : v1_(other.v1_), v2_(other.v2_) {} - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray2& other); - - const T1 v1_; - const T2 v2_; -}; - -template -class ValueArray3 { - public: - ValueArray3(T1 v1, T2 v2, T3 v3) : v1_(v1), v2_(v2), v3_(v3) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_)}; - return ValuesIn(array); - } - - ValueArray3(const ValueArray3& other) : v1_(other.v1_), v2_(other.v2_), - v3_(other.v3_) {} - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray3& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; -}; - -template -class ValueArray4 { - public: - ValueArray4(T1 v1, T2 v2, T3 v3, T4 v4) : v1_(v1), v2_(v2), v3_(v3), - v4_(v4) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_)}; - return ValuesIn(array); - } - - ValueArray4(const ValueArray4& other) : v1_(other.v1_), v2_(other.v2_), - v3_(other.v3_), v4_(other.v4_) {} - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray4& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; -}; - -template -class ValueArray5 { - public: - ValueArray5(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5) : v1_(v1), v2_(v2), v3_(v3), - v4_(v4), v5_(v5) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_)}; - return ValuesIn(array); - } - - ValueArray5(const ValueArray5& other) : v1_(other.v1_), v2_(other.v2_), - v3_(other.v3_), v4_(other.v4_), v5_(other.v5_) {} - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray5& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; -}; - -template -class ValueArray6 { - public: - ValueArray6(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6) : v1_(v1), v2_(v2), - v3_(v3), v4_(v4), v5_(v5), v6_(v6) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_)}; - return ValuesIn(array); - } - - ValueArray6(const ValueArray6& other) : v1_(other.v1_), v2_(other.v2_), - v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_) {} - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray6& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; -}; - -template -class ValueArray7 { - public: - ValueArray7(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7) : v1_(v1), - v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_)}; - return ValuesIn(array); - } - - ValueArray7(const ValueArray7& other) : v1_(other.v1_), v2_(other.v2_), - v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), - v7_(other.v7_) {} - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray7& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; -}; - -template -class ValueArray8 { - public: - ValueArray8(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, - T8 v8) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), - v8_(v8) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_)}; - return ValuesIn(array); - } - - ValueArray8(const ValueArray8& other) : v1_(other.v1_), v2_(other.v2_), - v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), - v7_(other.v7_), v8_(other.v8_) {} - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray8& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; -}; - -template -class ValueArray9 { - public: - ValueArray9(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, - T9 v9) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), - v8_(v8), v9_(v9) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_)}; - return ValuesIn(array); - } - - ValueArray9(const ValueArray9& other) : v1_(other.v1_), v2_(other.v2_), - v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), - v7_(other.v7_), v8_(other.v8_), v9_(other.v9_) {} - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray9& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; -}; - -template -class ValueArray10 { - public: - ValueArray10(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), - v8_(v8), v9_(v9), v10_(v10) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_)}; - return ValuesIn(array); - } - - ValueArray10(const ValueArray10& other) : v1_(other.v1_), v2_(other.v2_), - v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), - v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_) {} - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray10& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; -}; - -template -class ValueArray11 { - public: - ValueArray11(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), - v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_)}; - return ValuesIn(array); - } - - ValueArray11(const ValueArray11& other) : v1_(other.v1_), v2_(other.v2_), - v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), - v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), - v11_(other.v11_) {} - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray11& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; -}; - -template -class ValueArray12 { - public: - ValueArray12(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), - v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_)}; - return ValuesIn(array); - } - - ValueArray12(const ValueArray12& other) : v1_(other.v1_), v2_(other.v2_), - v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), - v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), - v11_(other.v11_), v12_(other.v12_) {} - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray12& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; -}; - -template -class ValueArray13 { - public: - ValueArray13(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), - v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), - v12_(v12), v13_(v13) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_)}; - return ValuesIn(array); - } - - ValueArray13(const ValueArray13& other) : v1_(other.v1_), v2_(other.v2_), - v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), - v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), - v11_(other.v11_), v12_(other.v12_), v13_(other.v13_) {} - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray13& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; -}; - -template -class ValueArray14 { - public: - ValueArray14(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14) : v1_(v1), v2_(v2), v3_(v3), - v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), - v11_(v11), v12_(v12), v13_(v13), v14_(v14) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_)}; - return ValuesIn(array); - } - - ValueArray14(const ValueArray14& other) : v1_(other.v1_), v2_(other.v2_), - v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), - v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), - v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_) {} - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray14& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; -}; - -template -class ValueArray15 { - public: - ValueArray15(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15) : v1_(v1), v2_(v2), - v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), - v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_)}; - return ValuesIn(array); - } - - ValueArray15(const ValueArray15& other) : v1_(other.v1_), v2_(other.v2_), - v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), - v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), - v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), - v15_(other.v15_) {} - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray15& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; -}; - -template -class ValueArray16 { - public: - ValueArray16(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16) : v1_(v1), - v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), - v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), - v16_(v16) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_)}; - return ValuesIn(array); - } - - ValueArray16(const ValueArray16& other) : v1_(other.v1_), v2_(other.v2_), - v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), - v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), - v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), - v15_(other.v15_), v16_(other.v16_) {} - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray16& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; -}; - -template -class ValueArray17 { - public: - ValueArray17(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, - T17 v17) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), - v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), - v15_(v15), v16_(v16), v17_(v17) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_)}; - return ValuesIn(array); - } - - ValueArray17(const ValueArray17& other) : v1_(other.v1_), v2_(other.v2_), - v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), - v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), - v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), - v15_(other.v15_), v16_(other.v16_), v17_(other.v17_) {} - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray17& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; -}; - -template -class ValueArray18 { - public: - ValueArray18(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), - v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), - v15_(v15), v16_(v16), v17_(v17), v18_(v18) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_)}; - return ValuesIn(array); - } - - ValueArray18(const ValueArray18& other) : v1_(other.v1_), v2_(other.v2_), - v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), - v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), - v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), - v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_) {} - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray18& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; -}; - -template -class ValueArray19 { - public: - ValueArray19(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), - v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), - v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_)}; - return ValuesIn(array); - } - - ValueArray19(const ValueArray19& other) : v1_(other.v1_), v2_(other.v2_), - v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), - v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), - v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), - v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), - v19_(other.v19_) {} - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray19& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; -}; - -template -class ValueArray20 { - public: - ValueArray20(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), - v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), - v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), - v19_(v19), v20_(v20) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_)}; - return ValuesIn(array); - } - - ValueArray20(const ValueArray20& other) : v1_(other.v1_), v2_(other.v2_), - v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), - v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), - v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), - v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), - v19_(other.v19_), v20_(other.v20_) {} - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray20& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; -}; - -template -class ValueArray21 { - public: - ValueArray21(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), - v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), - v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), - v18_(v18), v19_(v19), v20_(v20), v21_(v21) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_)}; - return ValuesIn(array); - } - - ValueArray21(const ValueArray21& other) : v1_(other.v1_), v2_(other.v2_), - v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), - v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), - v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), - v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), - v19_(other.v19_), v20_(other.v20_), v21_(other.v21_) {} - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray21& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; -}; - -template -class ValueArray22 { - public: - ValueArray22(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22) : v1_(v1), v2_(v2), v3_(v3), - v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), - v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), - v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_)}; - return ValuesIn(array); - } - - ValueArray22(const ValueArray22& other) : v1_(other.v1_), v2_(other.v2_), - v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), - v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), - v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), - v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), - v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_) {} - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray22& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; -}; - -template -class ValueArray23 { - public: - ValueArray23(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23) : v1_(v1), v2_(v2), - v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), - v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), - v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), - v23_(v23) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_)}; - return ValuesIn(array); - } - - ValueArray23(const ValueArray23& other) : v1_(other.v1_), v2_(other.v2_), - v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), - v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), - v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), - v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), - v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), - v23_(other.v23_) {} - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray23& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; -}; - -template -class ValueArray24 { - public: - ValueArray24(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24) : v1_(v1), - v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), - v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), - v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), - v22_(v22), v23_(v23), v24_(v24) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_)}; - return ValuesIn(array); - } - - ValueArray24(const ValueArray24& other) : v1_(other.v1_), v2_(other.v2_), - v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), - v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), - v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), - v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), - v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), - v23_(other.v23_), v24_(other.v24_) {} - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray24& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; -}; - -template -class ValueArray25 { - public: - ValueArray25(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, - T25 v25) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), - v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), - v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), - v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_)}; - return ValuesIn(array); - } - - ValueArray25(const ValueArray25& other) : v1_(other.v1_), v2_(other.v2_), - v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), - v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), - v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), - v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), - v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), - v23_(other.v23_), v24_(other.v24_), v25_(other.v25_) {} - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray25& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; -}; - -template -class ValueArray26 { - public: - ValueArray26(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), - v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), - v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), - v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_)}; - return ValuesIn(array); - } - - ValueArray26(const ValueArray26& other) : v1_(other.v1_), v2_(other.v2_), - v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), - v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), - v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), - v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), - v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), - v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_) {} - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray26& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; -}; - -template -class ValueArray27 { - public: - ValueArray27(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), - v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), - v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), - v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), - v26_(v26), v27_(v27) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_)}; - return ValuesIn(array); - } - - ValueArray27(const ValueArray27& other) : v1_(other.v1_), v2_(other.v2_), - v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), - v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), - v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), - v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), - v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), - v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_), - v27_(other.v27_) {} - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray27& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; -}; - -template -class ValueArray28 { - public: - ValueArray28(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), - v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), - v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), - v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24), - v25_(v25), v26_(v26), v27_(v27), v28_(v28) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_), static_cast(v28_)}; - return ValuesIn(array); - } - - ValueArray28(const ValueArray28& other) : v1_(other.v1_), v2_(other.v2_), - v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), - v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), - v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), - v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), - v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), - v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_), - v27_(other.v27_), v28_(other.v28_) {} - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray28& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; -}; - -template -class ValueArray29 { - public: - ValueArray29(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), - v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), - v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), - v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23), - v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_), static_cast(v28_), static_cast(v29_)}; - return ValuesIn(array); - } - - ValueArray29(const ValueArray29& other) : v1_(other.v1_), v2_(other.v2_), - v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), - v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), - v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), - v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), - v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), - v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_), - v27_(other.v27_), v28_(other.v28_), v29_(other.v29_) {} - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray29& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; -}; - -template -class ValueArray30 { - public: - ValueArray30(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30) : v1_(v1), v2_(v2), v3_(v3), - v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), - v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), - v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), - v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), - v29_(v29), v30_(v30) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_), static_cast(v28_), static_cast(v29_), - static_cast(v30_)}; - return ValuesIn(array); - } - - ValueArray30(const ValueArray30& other) : v1_(other.v1_), v2_(other.v2_), - v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), - v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), - v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), - v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), - v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), - v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_), - v27_(other.v27_), v28_(other.v28_), v29_(other.v29_), v30_(other.v30_) {} - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray30& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; -}; - -template -class ValueArray31 { - public: - ValueArray31(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31) : v1_(v1), v2_(v2), - v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), - v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), - v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), - v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), - v29_(v29), v30_(v30), v31_(v31) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_), static_cast(v28_), static_cast(v29_), - static_cast(v30_), static_cast(v31_)}; - return ValuesIn(array); - } - - ValueArray31(const ValueArray31& other) : v1_(other.v1_), v2_(other.v2_), - v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), - v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), - v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), - v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), - v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), - v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_), - v27_(other.v27_), v28_(other.v28_), v29_(other.v29_), v30_(other.v30_), - v31_(other.v31_) {} - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray31& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; -}; - -template -class ValueArray32 { - public: - ValueArray32(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32) : v1_(v1), - v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), - v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), - v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), - v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), - v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_), static_cast(v28_), static_cast(v29_), - static_cast(v30_), static_cast(v31_), static_cast(v32_)}; - return ValuesIn(array); - } - - ValueArray32(const ValueArray32& other) : v1_(other.v1_), v2_(other.v2_), - v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), - v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), - v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), - v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), - v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), - v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_), - v27_(other.v27_), v28_(other.v28_), v29_(other.v29_), v30_(other.v30_), - v31_(other.v31_), v32_(other.v32_) {} - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray32& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; - const T32 v32_; -}; - -template -class ValueArray33 { - public: - ValueArray33(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, - T33 v33) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), - v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), - v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), - v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), - v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32), - v33_(v33) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_), static_cast(v28_), static_cast(v29_), - static_cast(v30_), static_cast(v31_), static_cast(v32_), - static_cast(v33_)}; - return ValuesIn(array); - } - - ValueArray33(const ValueArray33& other) : v1_(other.v1_), v2_(other.v2_), - v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), - v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), - v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), - v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), - v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), - v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_), - v27_(other.v27_), v28_(other.v28_), v29_(other.v29_), v30_(other.v30_), - v31_(other.v31_), v32_(other.v32_), v33_(other.v33_) {} - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray33& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; - const T32 v32_; - const T33 v33_; -}; - -template -class ValueArray34 { - public: - ValueArray34(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), - v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), - v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), - v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), - v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32), - v33_(v33), v34_(v34) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_), static_cast(v28_), static_cast(v29_), - static_cast(v30_), static_cast(v31_), static_cast(v32_), - static_cast(v33_), static_cast(v34_)}; - return ValuesIn(array); - } - - ValueArray34(const ValueArray34& other) : v1_(other.v1_), v2_(other.v2_), - v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), - v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), - v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), - v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), - v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), - v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_), - v27_(other.v27_), v28_(other.v28_), v29_(other.v29_), v30_(other.v30_), - v31_(other.v31_), v32_(other.v32_), v33_(other.v33_), v34_(other.v34_) {} - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray34& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; - const T32 v32_; - const T33 v33_; - const T34 v34_; -}; - -template -class ValueArray35 { - public: - ValueArray35(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), - v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), - v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), - v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), - v26_(v26), v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), - v32_(v32), v33_(v33), v34_(v34), v35_(v35) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_), static_cast(v28_), static_cast(v29_), - static_cast(v30_), static_cast(v31_), static_cast(v32_), - static_cast(v33_), static_cast(v34_), static_cast(v35_)}; - return ValuesIn(array); - } - - ValueArray35(const ValueArray35& other) : v1_(other.v1_), v2_(other.v2_), - v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), - v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), - v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), - v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), - v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), - v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_), - v27_(other.v27_), v28_(other.v28_), v29_(other.v29_), v30_(other.v30_), - v31_(other.v31_), v32_(other.v32_), v33_(other.v33_), v34_(other.v34_), - v35_(other.v35_) {} - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray35& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; - const T32 v32_; - const T33 v33_; - const T34 v34_; - const T35 v35_; -}; - -template -class ValueArray36 { - public: - ValueArray36(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), - v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), - v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), - v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24), - v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29), v30_(v30), - v31_(v31), v32_(v32), v33_(v33), v34_(v34), v35_(v35), v36_(v36) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_), static_cast(v28_), static_cast(v29_), - static_cast(v30_), static_cast(v31_), static_cast(v32_), - static_cast(v33_), static_cast(v34_), static_cast(v35_), - static_cast(v36_)}; - return ValuesIn(array); - } - - ValueArray36(const ValueArray36& other) : v1_(other.v1_), v2_(other.v2_), - v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), - v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), - v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), - v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), - v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), - v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_), - v27_(other.v27_), v28_(other.v28_), v29_(other.v29_), v30_(other.v30_), - v31_(other.v31_), v32_(other.v32_), v33_(other.v33_), v34_(other.v34_), - v35_(other.v35_), v36_(other.v36_) {} - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray36& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; - const T32 v32_; - const T33 v33_; - const T34 v34_; - const T35 v35_; - const T36 v36_; -}; - -template -class ValueArray37 { - public: - ValueArray37(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36, T37 v37) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), - v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), - v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), - v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23), - v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29), - v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34), v35_(v35), - v36_(v36), v37_(v37) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_), static_cast(v28_), static_cast(v29_), - static_cast(v30_), static_cast(v31_), static_cast(v32_), - static_cast(v33_), static_cast(v34_), static_cast(v35_), - static_cast(v36_), static_cast(v37_)}; - return ValuesIn(array); - } - - ValueArray37(const ValueArray37& other) : v1_(other.v1_), v2_(other.v2_), - v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), - v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), - v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), - v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), - v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), - v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_), - v27_(other.v27_), v28_(other.v28_), v29_(other.v29_), v30_(other.v30_), - v31_(other.v31_), v32_(other.v32_), v33_(other.v33_), v34_(other.v34_), - v35_(other.v35_), v36_(other.v36_), v37_(other.v37_) {} - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray37& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; - const T32 v32_; - const T33 v33_; - const T34 v34_; - const T35 v35_; - const T36 v36_; - const T37 v37_; -}; - -template -class ValueArray38 { - public: - ValueArray38(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36, T37 v37, T38 v38) : v1_(v1), v2_(v2), v3_(v3), - v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), - v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), - v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), - v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), - v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34), - v35_(v35), v36_(v36), v37_(v37), v38_(v38) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_), static_cast(v28_), static_cast(v29_), - static_cast(v30_), static_cast(v31_), static_cast(v32_), - static_cast(v33_), static_cast(v34_), static_cast(v35_), - static_cast(v36_), static_cast(v37_), static_cast(v38_)}; - return ValuesIn(array); - } - - ValueArray38(const ValueArray38& other) : v1_(other.v1_), v2_(other.v2_), - v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), - v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), - v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), - v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), - v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), - v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_), - v27_(other.v27_), v28_(other.v28_), v29_(other.v29_), v30_(other.v30_), - v31_(other.v31_), v32_(other.v32_), v33_(other.v33_), v34_(other.v34_), - v35_(other.v35_), v36_(other.v36_), v37_(other.v37_), v38_(other.v38_) {} - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray38& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; - const T32 v32_; - const T33 v33_; - const T34 v34_; - const T35 v35_; - const T36 v36_; - const T37 v37_; - const T38 v38_; -}; - -template -class ValueArray39 { - public: - ValueArray39(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39) : v1_(v1), v2_(v2), - v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), - v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), - v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), - v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), - v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34), - v35_(v35), v36_(v36), v37_(v37), v38_(v38), v39_(v39) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_), static_cast(v28_), static_cast(v29_), - static_cast(v30_), static_cast(v31_), static_cast(v32_), - static_cast(v33_), static_cast(v34_), static_cast(v35_), - static_cast(v36_), static_cast(v37_), static_cast(v38_), - static_cast(v39_)}; - return ValuesIn(array); - } - - ValueArray39(const ValueArray39& other) : v1_(other.v1_), v2_(other.v2_), - v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), - v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), - v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), - v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), - v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), - v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_), - v27_(other.v27_), v28_(other.v28_), v29_(other.v29_), v30_(other.v30_), - v31_(other.v31_), v32_(other.v32_), v33_(other.v33_), v34_(other.v34_), - v35_(other.v35_), v36_(other.v36_), v37_(other.v37_), v38_(other.v38_), - v39_(other.v39_) {} - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray39& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; - const T32 v32_; - const T33 v33_; - const T34 v34_; - const T35 v35_; - const T36 v36_; - const T37 v37_; - const T38 v38_; - const T39 v39_; -}; - -template -class ValueArray40 { - public: - ValueArray40(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40) : v1_(v1), - v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), - v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), - v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), - v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), - v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33), - v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38), v39_(v39), - v40_(v40) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_), static_cast(v28_), static_cast(v29_), - static_cast(v30_), static_cast(v31_), static_cast(v32_), - static_cast(v33_), static_cast(v34_), static_cast(v35_), - static_cast(v36_), static_cast(v37_), static_cast(v38_), - static_cast(v39_), static_cast(v40_)}; - return ValuesIn(array); - } - - ValueArray40(const ValueArray40& other) : v1_(other.v1_), v2_(other.v2_), - v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), - v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), - v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), - v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), - v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), - v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_), - v27_(other.v27_), v28_(other.v28_), v29_(other.v29_), v30_(other.v30_), - v31_(other.v31_), v32_(other.v32_), v33_(other.v33_), v34_(other.v34_), - v35_(other.v35_), v36_(other.v36_), v37_(other.v37_), v38_(other.v38_), - v39_(other.v39_), v40_(other.v40_) {} - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray40& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; - const T32 v32_; - const T33 v33_; - const T34 v34_; - const T35 v35_; - const T36 v36_; - const T37 v37_; - const T38 v38_; - const T39 v39_; - const T40 v40_; -}; - -template -class ValueArray41 { - public: - ValueArray41(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, - T41 v41) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), - v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), - v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), - v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), - v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32), - v33_(v33), v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38), - v39_(v39), v40_(v40), v41_(v41) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_), static_cast(v28_), static_cast(v29_), - static_cast(v30_), static_cast(v31_), static_cast(v32_), - static_cast(v33_), static_cast(v34_), static_cast(v35_), - static_cast(v36_), static_cast(v37_), static_cast(v38_), - static_cast(v39_), static_cast(v40_), static_cast(v41_)}; - return ValuesIn(array); - } - - ValueArray41(const ValueArray41& other) : v1_(other.v1_), v2_(other.v2_), - v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), - v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), - v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), - v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), - v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), - v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_), - v27_(other.v27_), v28_(other.v28_), v29_(other.v29_), v30_(other.v30_), - v31_(other.v31_), v32_(other.v32_), v33_(other.v33_), v34_(other.v34_), - v35_(other.v35_), v36_(other.v36_), v37_(other.v37_), v38_(other.v38_), - v39_(other.v39_), v40_(other.v40_), v41_(other.v41_) {} - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray41& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; - const T32 v32_; - const T33 v33_; - const T34 v34_; - const T35 v35_; - const T36 v36_; - const T37 v37_; - const T38 v38_; - const T39 v39_; - const T40 v40_; - const T41 v41_; -}; - -template -class ValueArray42 { - public: - ValueArray42(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, - T42 v42) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), - v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), - v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), - v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), - v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32), - v33_(v33), v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38), - v39_(v39), v40_(v40), v41_(v41), v42_(v42) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_), static_cast(v28_), static_cast(v29_), - static_cast(v30_), static_cast(v31_), static_cast(v32_), - static_cast(v33_), static_cast(v34_), static_cast(v35_), - static_cast(v36_), static_cast(v37_), static_cast(v38_), - static_cast(v39_), static_cast(v40_), static_cast(v41_), - static_cast(v42_)}; - return ValuesIn(array); - } - - ValueArray42(const ValueArray42& other) : v1_(other.v1_), v2_(other.v2_), - v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), - v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), - v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), - v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), - v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), - v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_), - v27_(other.v27_), v28_(other.v28_), v29_(other.v29_), v30_(other.v30_), - v31_(other.v31_), v32_(other.v32_), v33_(other.v33_), v34_(other.v34_), - v35_(other.v35_), v36_(other.v36_), v37_(other.v37_), v38_(other.v38_), - v39_(other.v39_), v40_(other.v40_), v41_(other.v41_), v42_(other.v42_) {} - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray42& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; - const T32 v32_; - const T33 v33_; - const T34 v34_; - const T35 v35_; - const T36 v36_; - const T37 v37_; - const T38 v38_; - const T39 v39_; - const T40 v40_; - const T41 v41_; - const T42 v42_; -}; - -template -class ValueArray43 { - public: - ValueArray43(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, - T42 v42, T43 v43) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), - v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), - v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), - v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), - v26_(v26), v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), - v32_(v32), v33_(v33), v34_(v34), v35_(v35), v36_(v36), v37_(v37), - v38_(v38), v39_(v39), v40_(v40), v41_(v41), v42_(v42), v43_(v43) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_), static_cast(v28_), static_cast(v29_), - static_cast(v30_), static_cast(v31_), static_cast(v32_), - static_cast(v33_), static_cast(v34_), static_cast(v35_), - static_cast(v36_), static_cast(v37_), static_cast(v38_), - static_cast(v39_), static_cast(v40_), static_cast(v41_), - static_cast(v42_), static_cast(v43_)}; - return ValuesIn(array); - } - - ValueArray43(const ValueArray43& other) : v1_(other.v1_), v2_(other.v2_), - v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), - v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), - v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), - v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), - v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), - v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_), - v27_(other.v27_), v28_(other.v28_), v29_(other.v29_), v30_(other.v30_), - v31_(other.v31_), v32_(other.v32_), v33_(other.v33_), v34_(other.v34_), - v35_(other.v35_), v36_(other.v36_), v37_(other.v37_), v38_(other.v38_), - v39_(other.v39_), v40_(other.v40_), v41_(other.v41_), v42_(other.v42_), - v43_(other.v43_) {} - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray43& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; - const T32 v32_; - const T33 v33_; - const T34 v34_; - const T35 v35_; - const T36 v36_; - const T37 v37_; - const T38 v38_; - const T39 v39_; - const T40 v40_; - const T41 v41_; - const T42 v42_; - const T43 v43_; -}; - -template -class ValueArray44 { - public: - ValueArray44(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, - T42 v42, T43 v43, T44 v44) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), - v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), - v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), - v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24), - v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29), v30_(v30), - v31_(v31), v32_(v32), v33_(v33), v34_(v34), v35_(v35), v36_(v36), - v37_(v37), v38_(v38), v39_(v39), v40_(v40), v41_(v41), v42_(v42), - v43_(v43), v44_(v44) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_), static_cast(v28_), static_cast(v29_), - static_cast(v30_), static_cast(v31_), static_cast(v32_), - static_cast(v33_), static_cast(v34_), static_cast(v35_), - static_cast(v36_), static_cast(v37_), static_cast(v38_), - static_cast(v39_), static_cast(v40_), static_cast(v41_), - static_cast(v42_), static_cast(v43_), static_cast(v44_)}; - return ValuesIn(array); - } - - ValueArray44(const ValueArray44& other) : v1_(other.v1_), v2_(other.v2_), - v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), - v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), - v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), - v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), - v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), - v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_), - v27_(other.v27_), v28_(other.v28_), v29_(other.v29_), v30_(other.v30_), - v31_(other.v31_), v32_(other.v32_), v33_(other.v33_), v34_(other.v34_), - v35_(other.v35_), v36_(other.v36_), v37_(other.v37_), v38_(other.v38_), - v39_(other.v39_), v40_(other.v40_), v41_(other.v41_), v42_(other.v42_), - v43_(other.v43_), v44_(other.v44_) {} - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray44& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; - const T32 v32_; - const T33 v33_; - const T34 v34_; - const T35 v35_; - const T36 v36_; - const T37 v37_; - const T38 v38_; - const T39 v39_; - const T40 v40_; - const T41 v41_; - const T42 v42_; - const T43 v43_; - const T44 v44_; -}; - -template -class ValueArray45 { - public: - ValueArray45(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, - T42 v42, T43 v43, T44 v44, T45 v45) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), - v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), - v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), - v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23), - v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29), - v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34), v35_(v35), - v36_(v36), v37_(v37), v38_(v38), v39_(v39), v40_(v40), v41_(v41), - v42_(v42), v43_(v43), v44_(v44), v45_(v45) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_), static_cast(v28_), static_cast(v29_), - static_cast(v30_), static_cast(v31_), static_cast(v32_), - static_cast(v33_), static_cast(v34_), static_cast(v35_), - static_cast(v36_), static_cast(v37_), static_cast(v38_), - static_cast(v39_), static_cast(v40_), static_cast(v41_), - static_cast(v42_), static_cast(v43_), static_cast(v44_), - static_cast(v45_)}; - return ValuesIn(array); - } - - ValueArray45(const ValueArray45& other) : v1_(other.v1_), v2_(other.v2_), - v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), - v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), - v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), - v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), - v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), - v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_), - v27_(other.v27_), v28_(other.v28_), v29_(other.v29_), v30_(other.v30_), - v31_(other.v31_), v32_(other.v32_), v33_(other.v33_), v34_(other.v34_), - v35_(other.v35_), v36_(other.v36_), v37_(other.v37_), v38_(other.v38_), - v39_(other.v39_), v40_(other.v40_), v41_(other.v41_), v42_(other.v42_), - v43_(other.v43_), v44_(other.v44_), v45_(other.v45_) {} - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray45& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; - const T32 v32_; - const T33 v33_; - const T34 v34_; - const T35 v35_; - const T36 v36_; - const T37 v37_; - const T38 v38_; - const T39 v39_; - const T40 v40_; - const T41 v41_; - const T42 v42_; - const T43 v43_; - const T44 v44_; - const T45 v45_; -}; - -template -class ValueArray46 { - public: - ValueArray46(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, - T42 v42, T43 v43, T44 v44, T45 v45, T46 v46) : v1_(v1), v2_(v2), v3_(v3), - v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), - v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), - v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), - v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), - v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34), - v35_(v35), v36_(v36), v37_(v37), v38_(v38), v39_(v39), v40_(v40), - v41_(v41), v42_(v42), v43_(v43), v44_(v44), v45_(v45), v46_(v46) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_), static_cast(v28_), static_cast(v29_), - static_cast(v30_), static_cast(v31_), static_cast(v32_), - static_cast(v33_), static_cast(v34_), static_cast(v35_), - static_cast(v36_), static_cast(v37_), static_cast(v38_), - static_cast(v39_), static_cast(v40_), static_cast(v41_), - static_cast(v42_), static_cast(v43_), static_cast(v44_), - static_cast(v45_), static_cast(v46_)}; - return ValuesIn(array); - } - - ValueArray46(const ValueArray46& other) : v1_(other.v1_), v2_(other.v2_), - v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), - v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), - v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), - v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), - v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), - v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_), - v27_(other.v27_), v28_(other.v28_), v29_(other.v29_), v30_(other.v30_), - v31_(other.v31_), v32_(other.v32_), v33_(other.v33_), v34_(other.v34_), - v35_(other.v35_), v36_(other.v36_), v37_(other.v37_), v38_(other.v38_), - v39_(other.v39_), v40_(other.v40_), v41_(other.v41_), v42_(other.v42_), - v43_(other.v43_), v44_(other.v44_), v45_(other.v45_), v46_(other.v46_) {} - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray46& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; - const T32 v32_; - const T33 v33_; - const T34 v34_; - const T35 v35_; - const T36 v36_; - const T37 v37_; - const T38 v38_; - const T39 v39_; - const T40 v40_; - const T41 v41_; - const T42 v42_; - const T43 v43_; - const T44 v44_; - const T45 v45_; - const T46 v46_; -}; - -template -class ValueArray47 { - public: - ValueArray47(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, - T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47) : v1_(v1), v2_(v2), - v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), - v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), - v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), - v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), - v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34), - v35_(v35), v36_(v36), v37_(v37), v38_(v38), v39_(v39), v40_(v40), - v41_(v41), v42_(v42), v43_(v43), v44_(v44), v45_(v45), v46_(v46), - v47_(v47) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_), static_cast(v28_), static_cast(v29_), - static_cast(v30_), static_cast(v31_), static_cast(v32_), - static_cast(v33_), static_cast(v34_), static_cast(v35_), - static_cast(v36_), static_cast(v37_), static_cast(v38_), - static_cast(v39_), static_cast(v40_), static_cast(v41_), - static_cast(v42_), static_cast(v43_), static_cast(v44_), - static_cast(v45_), static_cast(v46_), static_cast(v47_)}; - return ValuesIn(array); - } - - ValueArray47(const ValueArray47& other) : v1_(other.v1_), v2_(other.v2_), - v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), - v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), - v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), - v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), - v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), - v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_), - v27_(other.v27_), v28_(other.v28_), v29_(other.v29_), v30_(other.v30_), - v31_(other.v31_), v32_(other.v32_), v33_(other.v33_), v34_(other.v34_), - v35_(other.v35_), v36_(other.v36_), v37_(other.v37_), v38_(other.v38_), - v39_(other.v39_), v40_(other.v40_), v41_(other.v41_), v42_(other.v42_), - v43_(other.v43_), v44_(other.v44_), v45_(other.v45_), v46_(other.v46_), - v47_(other.v47_) {} - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray47& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; - const T32 v32_; - const T33 v33_; - const T34 v34_; - const T35 v35_; - const T36 v36_; - const T37 v37_; - const T38 v38_; - const T39 v39_; - const T40 v40_; - const T41 v41_; - const T42 v42_; - const T43 v43_; - const T44 v44_; - const T45 v45_; - const T46 v46_; - const T47 v47_; -}; - -template -class ValueArray48 { - public: - ValueArray48(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, - T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47, T48 v48) : v1_(v1), - v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), - v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), - v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), - v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), - v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33), - v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38), v39_(v39), - v40_(v40), v41_(v41), v42_(v42), v43_(v43), v44_(v44), v45_(v45), - v46_(v46), v47_(v47), v48_(v48) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_), static_cast(v28_), static_cast(v29_), - static_cast(v30_), static_cast(v31_), static_cast(v32_), - static_cast(v33_), static_cast(v34_), static_cast(v35_), - static_cast(v36_), static_cast(v37_), static_cast(v38_), - static_cast(v39_), static_cast(v40_), static_cast(v41_), - static_cast(v42_), static_cast(v43_), static_cast(v44_), - static_cast(v45_), static_cast(v46_), static_cast(v47_), - static_cast(v48_)}; - return ValuesIn(array); - } - - ValueArray48(const ValueArray48& other) : v1_(other.v1_), v2_(other.v2_), - v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), - v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), - v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), - v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), - v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), - v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_), - v27_(other.v27_), v28_(other.v28_), v29_(other.v29_), v30_(other.v30_), - v31_(other.v31_), v32_(other.v32_), v33_(other.v33_), v34_(other.v34_), - v35_(other.v35_), v36_(other.v36_), v37_(other.v37_), v38_(other.v38_), - v39_(other.v39_), v40_(other.v40_), v41_(other.v41_), v42_(other.v42_), - v43_(other.v43_), v44_(other.v44_), v45_(other.v45_), v46_(other.v46_), - v47_(other.v47_), v48_(other.v48_) {} - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray48& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; - const T32 v32_; - const T33 v33_; - const T34 v34_; - const T35 v35_; - const T36 v36_; - const T37 v37_; - const T38 v38_; - const T39 v39_; - const T40 v40_; - const T41 v41_; - const T42 v42_; - const T43 v43_; - const T44 v44_; - const T45 v45_; - const T46 v46_; - const T47 v47_; - const T48 v48_; -}; - -template -class ValueArray49 { - public: - ValueArray49(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, - T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47, T48 v48, - T49 v49) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), - v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), - v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), - v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), - v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32), - v33_(v33), v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38), - v39_(v39), v40_(v40), v41_(v41), v42_(v42), v43_(v43), v44_(v44), - v45_(v45), v46_(v46), v47_(v47), v48_(v48), v49_(v49) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_), static_cast(v28_), static_cast(v29_), - static_cast(v30_), static_cast(v31_), static_cast(v32_), - static_cast(v33_), static_cast(v34_), static_cast(v35_), - static_cast(v36_), static_cast(v37_), static_cast(v38_), - static_cast(v39_), static_cast(v40_), static_cast(v41_), - static_cast(v42_), static_cast(v43_), static_cast(v44_), - static_cast(v45_), static_cast(v46_), static_cast(v47_), - static_cast(v48_), static_cast(v49_)}; - return ValuesIn(array); - } - - ValueArray49(const ValueArray49& other) : v1_(other.v1_), v2_(other.v2_), - v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), - v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), - v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), - v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), - v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), - v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_), - v27_(other.v27_), v28_(other.v28_), v29_(other.v29_), v30_(other.v30_), - v31_(other.v31_), v32_(other.v32_), v33_(other.v33_), v34_(other.v34_), - v35_(other.v35_), v36_(other.v36_), v37_(other.v37_), v38_(other.v38_), - v39_(other.v39_), v40_(other.v40_), v41_(other.v41_), v42_(other.v42_), - v43_(other.v43_), v44_(other.v44_), v45_(other.v45_), v46_(other.v46_), - v47_(other.v47_), v48_(other.v48_), v49_(other.v49_) {} - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray49& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; - const T32 v32_; - const T33 v33_; - const T34 v34_; - const T35 v35_; - const T36 v36_; - const T37 v37_; - const T38 v38_; - const T39 v39_; - const T40 v40_; - const T41 v41_; - const T42 v42_; - const T43 v43_; - const T44 v44_; - const T45 v45_; - const T46 v46_; - const T47 v47_; - const T48 v48_; - const T49 v49_; -}; - -template -class ValueArray50 { - public: - ValueArray50(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, - T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47, T48 v48, T49 v49, - T50 v50) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), - v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), - v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), - v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), - v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32), - v33_(v33), v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38), - v39_(v39), v40_(v40), v41_(v41), v42_(v42), v43_(v43), v44_(v44), - v45_(v45), v46_(v46), v47_(v47), v48_(v48), v49_(v49), v50_(v50) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_), static_cast(v28_), static_cast(v29_), - static_cast(v30_), static_cast(v31_), static_cast(v32_), - static_cast(v33_), static_cast(v34_), static_cast(v35_), - static_cast(v36_), static_cast(v37_), static_cast(v38_), - static_cast(v39_), static_cast(v40_), static_cast(v41_), - static_cast(v42_), static_cast(v43_), static_cast(v44_), - static_cast(v45_), static_cast(v46_), static_cast(v47_), - static_cast(v48_), static_cast(v49_), static_cast(v50_)}; - return ValuesIn(array); - } - - ValueArray50(const ValueArray50& other) : v1_(other.v1_), v2_(other.v2_), - v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), - v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), - v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), - v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), - v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), - v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_), - v27_(other.v27_), v28_(other.v28_), v29_(other.v29_), v30_(other.v30_), - v31_(other.v31_), v32_(other.v32_), v33_(other.v33_), v34_(other.v34_), - v35_(other.v35_), v36_(other.v36_), v37_(other.v37_), v38_(other.v38_), - v39_(other.v39_), v40_(other.v40_), v41_(other.v41_), v42_(other.v42_), - v43_(other.v43_), v44_(other.v44_), v45_(other.v45_), v46_(other.v46_), - v47_(other.v47_), v48_(other.v48_), v49_(other.v49_), v50_(other.v50_) {} - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray50& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; - const T32 v32_; - const T33 v33_; - const T34 v34_; - const T35 v35_; - const T36 v36_; - const T37 v37_; - const T38 v38_; - const T39 v39_; - const T40 v40_; - const T41 v41_; - const T42 v42_; - const T43 v43_; - const T44 v44_; - const T45 v45_; - const T46 v46_; - const T47 v47_; - const T48 v48_; - const T49 v49_; - const T50 v50_; -}; - -# if GTEST_HAS_COMBINE -// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. -// -// Generates values from the Cartesian product of values produced -// by the argument generators. -// -template -class CartesianProductGenerator2 - : public ParamGeneratorInterface< ::testing::tuple > { - public: - typedef ::testing::tuple ParamType; - - CartesianProductGenerator2(const ParamGenerator& g1, - const ParamGenerator& g2) - : g1_(g1), g2_(g2) {} - virtual ~CartesianProductGenerator2() {} - - virtual ParamIteratorInterface* Begin() const { - return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin()); - } - virtual ParamIteratorInterface* End() const { - return new Iterator(this, g1_, g1_.end(), g2_, g2_.end()); - } - - private: - class Iterator : public ParamIteratorInterface { - public: - Iterator(const ParamGeneratorInterface* base, - const ParamGenerator& g1, - const typename ParamGenerator::iterator& current1, - const ParamGenerator& g2, - const typename ParamGenerator::iterator& current2) - : base_(base), - begin1_(g1.begin()), end1_(g1.end()), current1_(current1), - begin2_(g2.begin()), end2_(g2.end()), current2_(current2) { - ComputeCurrentValue(); - } - virtual ~Iterator() {} - - virtual const ParamGeneratorInterface* BaseGenerator() const { - return base_; - } - // Advance should not be called on beyond-of-range iterators - // so no component iterators must be beyond end of range, either. - virtual void Advance() { - assert(!AtEnd()); - ++current2_; - if (current2_ == end2_) { - current2_ = begin2_; - ++current1_; - } - ComputeCurrentValue(); - } - virtual ParamIteratorInterface* Clone() const { - return new Iterator(*this); - } - virtual const ParamType* Current() const { return current_value_.get(); } - virtual bool Equals(const ParamIteratorInterface& other) const { - // Having the same base generator guarantees that the other - // iterator is of the same type and we can downcast. - GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) - << "The program attempted to compare iterators " - << "from different generators." << std::endl; - const Iterator* typed_other = - CheckedDowncastToActualType(&other); - // We must report iterators equal if they both point beyond their - // respective ranges. That can happen in a variety of fashions, - // so we have to consult AtEnd(). - return (AtEnd() && typed_other->AtEnd()) || - ( - current1_ == typed_other->current1_ && - current2_ == typed_other->current2_); - } - - private: - Iterator(const Iterator& other) - : base_(other.base_), - begin1_(other.begin1_), - end1_(other.end1_), - current1_(other.current1_), - begin2_(other.begin2_), - end2_(other.end2_), - current2_(other.current2_) { - ComputeCurrentValue(); - } - - void ComputeCurrentValue() { - if (!AtEnd()) - current_value_.reset(new ParamType(*current1_, *current2_)); - } - bool AtEnd() const { - // We must report iterator past the end of the range when either of the - // component iterators has reached the end of its range. - return - current1_ == end1_ || - current2_ == end2_; - } - - // No implementation - assignment is unsupported. - void operator=(const Iterator& other); - - const ParamGeneratorInterface* const base_; - // begin[i]_ and end[i]_ define the i-th range that Iterator traverses. - // current[i]_ is the actual traversing iterator. - const typename ParamGenerator::iterator begin1_; - const typename ParamGenerator::iterator end1_; - typename ParamGenerator::iterator current1_; - const typename ParamGenerator::iterator begin2_; - const typename ParamGenerator::iterator end2_; - typename ParamGenerator::iterator current2_; - linked_ptr current_value_; - }; // class CartesianProductGenerator2::Iterator - - // No implementation - assignment is unsupported. - void operator=(const CartesianProductGenerator2& other); - - const ParamGenerator g1_; - const ParamGenerator g2_; -}; // class CartesianProductGenerator2 - - -template -class CartesianProductGenerator3 - : public ParamGeneratorInterface< ::testing::tuple > { - public: - typedef ::testing::tuple ParamType; - - CartesianProductGenerator3(const ParamGenerator& g1, - const ParamGenerator& g2, const ParamGenerator& g3) - : g1_(g1), g2_(g2), g3_(g3) {} - virtual ~CartesianProductGenerator3() {} - - virtual ParamIteratorInterface* Begin() const { - return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_, - g3_.begin()); - } - virtual ParamIteratorInterface* End() const { - return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end()); - } - - private: - class Iterator : public ParamIteratorInterface { - public: - Iterator(const ParamGeneratorInterface* base, - const ParamGenerator& g1, - const typename ParamGenerator::iterator& current1, - const ParamGenerator& g2, - const typename ParamGenerator::iterator& current2, - const ParamGenerator& g3, - const typename ParamGenerator::iterator& current3) - : base_(base), - begin1_(g1.begin()), end1_(g1.end()), current1_(current1), - begin2_(g2.begin()), end2_(g2.end()), current2_(current2), - begin3_(g3.begin()), end3_(g3.end()), current3_(current3) { - ComputeCurrentValue(); - } - virtual ~Iterator() {} - - virtual const ParamGeneratorInterface* BaseGenerator() const { - return base_; - } - // Advance should not be called on beyond-of-range iterators - // so no component iterators must be beyond end of range, either. - virtual void Advance() { - assert(!AtEnd()); - ++current3_; - if (current3_ == end3_) { - current3_ = begin3_; - ++current2_; - } - if (current2_ == end2_) { - current2_ = begin2_; - ++current1_; - } - ComputeCurrentValue(); - } - virtual ParamIteratorInterface* Clone() const { - return new Iterator(*this); - } - virtual const ParamType* Current() const { return current_value_.get(); } - virtual bool Equals(const ParamIteratorInterface& other) const { - // Having the same base generator guarantees that the other - // iterator is of the same type and we can downcast. - GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) - << "The program attempted to compare iterators " - << "from different generators." << std::endl; - const Iterator* typed_other = - CheckedDowncastToActualType(&other); - // We must report iterators equal if they both point beyond their - // respective ranges. That can happen in a variety of fashions, - // so we have to consult AtEnd(). - return (AtEnd() && typed_other->AtEnd()) || - ( - current1_ == typed_other->current1_ && - current2_ == typed_other->current2_ && - current3_ == typed_other->current3_); - } - - private: - Iterator(const Iterator& other) - : base_(other.base_), - begin1_(other.begin1_), - end1_(other.end1_), - current1_(other.current1_), - begin2_(other.begin2_), - end2_(other.end2_), - current2_(other.current2_), - begin3_(other.begin3_), - end3_(other.end3_), - current3_(other.current3_) { - ComputeCurrentValue(); - } - - void ComputeCurrentValue() { - if (!AtEnd()) - current_value_.reset(new ParamType(*current1_, *current2_, *current3_)); - } - bool AtEnd() const { - // We must report iterator past the end of the range when either of the - // component iterators has reached the end of its range. - return - current1_ == end1_ || - current2_ == end2_ || - current3_ == end3_; - } - - // No implementation - assignment is unsupported. - void operator=(const Iterator& other); - - const ParamGeneratorInterface* const base_; - // begin[i]_ and end[i]_ define the i-th range that Iterator traverses. - // current[i]_ is the actual traversing iterator. - const typename ParamGenerator::iterator begin1_; - const typename ParamGenerator::iterator end1_; - typename ParamGenerator::iterator current1_; - const typename ParamGenerator::iterator begin2_; - const typename ParamGenerator::iterator end2_; - typename ParamGenerator::iterator current2_; - const typename ParamGenerator::iterator begin3_; - const typename ParamGenerator::iterator end3_; - typename ParamGenerator::iterator current3_; - linked_ptr current_value_; - }; // class CartesianProductGenerator3::Iterator - - // No implementation - assignment is unsupported. - void operator=(const CartesianProductGenerator3& other); - - const ParamGenerator g1_; - const ParamGenerator g2_; - const ParamGenerator g3_; -}; // class CartesianProductGenerator3 - - -template -class CartesianProductGenerator4 - : public ParamGeneratorInterface< ::testing::tuple > { - public: - typedef ::testing::tuple ParamType; - - CartesianProductGenerator4(const ParamGenerator& g1, - const ParamGenerator& g2, const ParamGenerator& g3, - const ParamGenerator& g4) - : g1_(g1), g2_(g2), g3_(g3), g4_(g4) {} - virtual ~CartesianProductGenerator4() {} - - virtual ParamIteratorInterface* Begin() const { - return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_, - g3_.begin(), g4_, g4_.begin()); - } - virtual ParamIteratorInterface* End() const { - return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(), - g4_, g4_.end()); - } - - private: - class Iterator : public ParamIteratorInterface { - public: - Iterator(const ParamGeneratorInterface* base, - const ParamGenerator& g1, - const typename ParamGenerator::iterator& current1, - const ParamGenerator& g2, - const typename ParamGenerator::iterator& current2, - const ParamGenerator& g3, - const typename ParamGenerator::iterator& current3, - const ParamGenerator& g4, - const typename ParamGenerator::iterator& current4) - : base_(base), - begin1_(g1.begin()), end1_(g1.end()), current1_(current1), - begin2_(g2.begin()), end2_(g2.end()), current2_(current2), - begin3_(g3.begin()), end3_(g3.end()), current3_(current3), - begin4_(g4.begin()), end4_(g4.end()), current4_(current4) { - ComputeCurrentValue(); - } - virtual ~Iterator() {} - - virtual const ParamGeneratorInterface* BaseGenerator() const { - return base_; - } - // Advance should not be called on beyond-of-range iterators - // so no component iterators must be beyond end of range, either. - virtual void Advance() { - assert(!AtEnd()); - ++current4_; - if (current4_ == end4_) { - current4_ = begin4_; - ++current3_; - } - if (current3_ == end3_) { - current3_ = begin3_; - ++current2_; - } - if (current2_ == end2_) { - current2_ = begin2_; - ++current1_; - } - ComputeCurrentValue(); - } - virtual ParamIteratorInterface* Clone() const { - return new Iterator(*this); - } - virtual const ParamType* Current() const { return current_value_.get(); } - virtual bool Equals(const ParamIteratorInterface& other) const { - // Having the same base generator guarantees that the other - // iterator is of the same type and we can downcast. - GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) - << "The program attempted to compare iterators " - << "from different generators." << std::endl; - const Iterator* typed_other = - CheckedDowncastToActualType(&other); - // We must report iterators equal if they both point beyond their - // respective ranges. That can happen in a variety of fashions, - // so we have to consult AtEnd(). - return (AtEnd() && typed_other->AtEnd()) || - ( - current1_ == typed_other->current1_ && - current2_ == typed_other->current2_ && - current3_ == typed_other->current3_ && - current4_ == typed_other->current4_); - } - - private: - Iterator(const Iterator& other) - : base_(other.base_), - begin1_(other.begin1_), - end1_(other.end1_), - current1_(other.current1_), - begin2_(other.begin2_), - end2_(other.end2_), - current2_(other.current2_), - begin3_(other.begin3_), - end3_(other.end3_), - current3_(other.current3_), - begin4_(other.begin4_), - end4_(other.end4_), - current4_(other.current4_) { - ComputeCurrentValue(); - } - - void ComputeCurrentValue() { - if (!AtEnd()) - current_value_.reset(new ParamType(*current1_, *current2_, *current3_, - *current4_)); - } - bool AtEnd() const { - // We must report iterator past the end of the range when either of the - // component iterators has reached the end of its range. - return - current1_ == end1_ || - current2_ == end2_ || - current3_ == end3_ || - current4_ == end4_; - } - - // No implementation - assignment is unsupported. - void operator=(const Iterator& other); - - const ParamGeneratorInterface* const base_; - // begin[i]_ and end[i]_ define the i-th range that Iterator traverses. - // current[i]_ is the actual traversing iterator. - const typename ParamGenerator::iterator begin1_; - const typename ParamGenerator::iterator end1_; - typename ParamGenerator::iterator current1_; - const typename ParamGenerator::iterator begin2_; - const typename ParamGenerator::iterator end2_; - typename ParamGenerator::iterator current2_; - const typename ParamGenerator::iterator begin3_; - const typename ParamGenerator::iterator end3_; - typename ParamGenerator::iterator current3_; - const typename ParamGenerator::iterator begin4_; - const typename ParamGenerator::iterator end4_; - typename ParamGenerator::iterator current4_; - linked_ptr current_value_; - }; // class CartesianProductGenerator4::Iterator - - // No implementation - assignment is unsupported. - void operator=(const CartesianProductGenerator4& other); - - const ParamGenerator g1_; - const ParamGenerator g2_; - const ParamGenerator g3_; - const ParamGenerator g4_; -}; // class CartesianProductGenerator4 - - -template -class CartesianProductGenerator5 - : public ParamGeneratorInterface< ::testing::tuple > { - public: - typedef ::testing::tuple ParamType; - - CartesianProductGenerator5(const ParamGenerator& g1, - const ParamGenerator& g2, const ParamGenerator& g3, - const ParamGenerator& g4, const ParamGenerator& g5) - : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5) {} - virtual ~CartesianProductGenerator5() {} - - virtual ParamIteratorInterface* Begin() const { - return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_, - g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin()); - } - virtual ParamIteratorInterface* End() const { - return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(), - g4_, g4_.end(), g5_, g5_.end()); - } - - private: - class Iterator : public ParamIteratorInterface { - public: - Iterator(const ParamGeneratorInterface* base, - const ParamGenerator& g1, - const typename ParamGenerator::iterator& current1, - const ParamGenerator& g2, - const typename ParamGenerator::iterator& current2, - const ParamGenerator& g3, - const typename ParamGenerator::iterator& current3, - const ParamGenerator& g4, - const typename ParamGenerator::iterator& current4, - const ParamGenerator& g5, - const typename ParamGenerator::iterator& current5) - : base_(base), - begin1_(g1.begin()), end1_(g1.end()), current1_(current1), - begin2_(g2.begin()), end2_(g2.end()), current2_(current2), - begin3_(g3.begin()), end3_(g3.end()), current3_(current3), - begin4_(g4.begin()), end4_(g4.end()), current4_(current4), - begin5_(g5.begin()), end5_(g5.end()), current5_(current5) { - ComputeCurrentValue(); - } - virtual ~Iterator() {} - - virtual const ParamGeneratorInterface* BaseGenerator() const { - return base_; - } - // Advance should not be called on beyond-of-range iterators - // so no component iterators must be beyond end of range, either. - virtual void Advance() { - assert(!AtEnd()); - ++current5_; - if (current5_ == end5_) { - current5_ = begin5_; - ++current4_; - } - if (current4_ == end4_) { - current4_ = begin4_; - ++current3_; - } - if (current3_ == end3_) { - current3_ = begin3_; - ++current2_; - } - if (current2_ == end2_) { - current2_ = begin2_; - ++current1_; - } - ComputeCurrentValue(); - } - virtual ParamIteratorInterface* Clone() const { - return new Iterator(*this); - } - virtual const ParamType* Current() const { return current_value_.get(); } - virtual bool Equals(const ParamIteratorInterface& other) const { - // Having the same base generator guarantees that the other - // iterator is of the same type and we can downcast. - GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) - << "The program attempted to compare iterators " - << "from different generators." << std::endl; - const Iterator* typed_other = - CheckedDowncastToActualType(&other); - // We must report iterators equal if they both point beyond their - // respective ranges. That can happen in a variety of fashions, - // so we have to consult AtEnd(). - return (AtEnd() && typed_other->AtEnd()) || - ( - current1_ == typed_other->current1_ && - current2_ == typed_other->current2_ && - current3_ == typed_other->current3_ && - current4_ == typed_other->current4_ && - current5_ == typed_other->current5_); - } - - private: - Iterator(const Iterator& other) - : base_(other.base_), - begin1_(other.begin1_), - end1_(other.end1_), - current1_(other.current1_), - begin2_(other.begin2_), - end2_(other.end2_), - current2_(other.current2_), - begin3_(other.begin3_), - end3_(other.end3_), - current3_(other.current3_), - begin4_(other.begin4_), - end4_(other.end4_), - current4_(other.current4_), - begin5_(other.begin5_), - end5_(other.end5_), - current5_(other.current5_) { - ComputeCurrentValue(); - } - - void ComputeCurrentValue() { - if (!AtEnd()) - current_value_.reset(new ParamType(*current1_, *current2_, *current3_, - *current4_, *current5_)); - } - bool AtEnd() const { - // We must report iterator past the end of the range when either of the - // component iterators has reached the end of its range. - return - current1_ == end1_ || - current2_ == end2_ || - current3_ == end3_ || - current4_ == end4_ || - current5_ == end5_; - } - - // No implementation - assignment is unsupported. - void operator=(const Iterator& other); - - const ParamGeneratorInterface* const base_; - // begin[i]_ and end[i]_ define the i-th range that Iterator traverses. - // current[i]_ is the actual traversing iterator. - const typename ParamGenerator::iterator begin1_; - const typename ParamGenerator::iterator end1_; - typename ParamGenerator::iterator current1_; - const typename ParamGenerator::iterator begin2_; - const typename ParamGenerator::iterator end2_; - typename ParamGenerator::iterator current2_; - const typename ParamGenerator::iterator begin3_; - const typename ParamGenerator::iterator end3_; - typename ParamGenerator::iterator current3_; - const typename ParamGenerator::iterator begin4_; - const typename ParamGenerator::iterator end4_; - typename ParamGenerator::iterator current4_; - const typename ParamGenerator::iterator begin5_; - const typename ParamGenerator::iterator end5_; - typename ParamGenerator::iterator current5_; - linked_ptr current_value_; - }; // class CartesianProductGenerator5::Iterator - - // No implementation - assignment is unsupported. - void operator=(const CartesianProductGenerator5& other); - - const ParamGenerator g1_; - const ParamGenerator g2_; - const ParamGenerator g3_; - const ParamGenerator g4_; - const ParamGenerator g5_; -}; // class CartesianProductGenerator5 - - -template -class CartesianProductGenerator6 - : public ParamGeneratorInterface< ::testing::tuple > { - public: - typedef ::testing::tuple ParamType; - - CartesianProductGenerator6(const ParamGenerator& g1, - const ParamGenerator& g2, const ParamGenerator& g3, - const ParamGenerator& g4, const ParamGenerator& g5, - const ParamGenerator& g6) - : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6) {} - virtual ~CartesianProductGenerator6() {} - - virtual ParamIteratorInterface* Begin() const { - return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_, - g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin(), g6_, g6_.begin()); - } - virtual ParamIteratorInterface* End() const { - return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(), - g4_, g4_.end(), g5_, g5_.end(), g6_, g6_.end()); - } - - private: - class Iterator : public ParamIteratorInterface { - public: - Iterator(const ParamGeneratorInterface* base, - const ParamGenerator& g1, - const typename ParamGenerator::iterator& current1, - const ParamGenerator& g2, - const typename ParamGenerator::iterator& current2, - const ParamGenerator& g3, - const typename ParamGenerator::iterator& current3, - const ParamGenerator& g4, - const typename ParamGenerator::iterator& current4, - const ParamGenerator& g5, - const typename ParamGenerator::iterator& current5, - const ParamGenerator& g6, - const typename ParamGenerator::iterator& current6) - : base_(base), - begin1_(g1.begin()), end1_(g1.end()), current1_(current1), - begin2_(g2.begin()), end2_(g2.end()), current2_(current2), - begin3_(g3.begin()), end3_(g3.end()), current3_(current3), - begin4_(g4.begin()), end4_(g4.end()), current4_(current4), - begin5_(g5.begin()), end5_(g5.end()), current5_(current5), - begin6_(g6.begin()), end6_(g6.end()), current6_(current6) { - ComputeCurrentValue(); - } - virtual ~Iterator() {} - - virtual const ParamGeneratorInterface* BaseGenerator() const { - return base_; - } - // Advance should not be called on beyond-of-range iterators - // so no component iterators must be beyond end of range, either. - virtual void Advance() { - assert(!AtEnd()); - ++current6_; - if (current6_ == end6_) { - current6_ = begin6_; - ++current5_; - } - if (current5_ == end5_) { - current5_ = begin5_; - ++current4_; - } - if (current4_ == end4_) { - current4_ = begin4_; - ++current3_; - } - if (current3_ == end3_) { - current3_ = begin3_; - ++current2_; - } - if (current2_ == end2_) { - current2_ = begin2_; - ++current1_; - } - ComputeCurrentValue(); - } - virtual ParamIteratorInterface* Clone() const { - return new Iterator(*this); - } - virtual const ParamType* Current() const { return current_value_.get(); } - virtual bool Equals(const ParamIteratorInterface& other) const { - // Having the same base generator guarantees that the other - // iterator is of the same type and we can downcast. - GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) - << "The program attempted to compare iterators " - << "from different generators." << std::endl; - const Iterator* typed_other = - CheckedDowncastToActualType(&other); - // We must report iterators equal if they both point beyond their - // respective ranges. That can happen in a variety of fashions, - // so we have to consult AtEnd(). - return (AtEnd() && typed_other->AtEnd()) || - ( - current1_ == typed_other->current1_ && - current2_ == typed_other->current2_ && - current3_ == typed_other->current3_ && - current4_ == typed_other->current4_ && - current5_ == typed_other->current5_ && - current6_ == typed_other->current6_); - } - - private: - Iterator(const Iterator& other) - : base_(other.base_), - begin1_(other.begin1_), - end1_(other.end1_), - current1_(other.current1_), - begin2_(other.begin2_), - end2_(other.end2_), - current2_(other.current2_), - begin3_(other.begin3_), - end3_(other.end3_), - current3_(other.current3_), - begin4_(other.begin4_), - end4_(other.end4_), - current4_(other.current4_), - begin5_(other.begin5_), - end5_(other.end5_), - current5_(other.current5_), - begin6_(other.begin6_), - end6_(other.end6_), - current6_(other.current6_) { - ComputeCurrentValue(); - } - - void ComputeCurrentValue() { - if (!AtEnd()) - current_value_.reset(new ParamType(*current1_, *current2_, *current3_, - *current4_, *current5_, *current6_)); - } - bool AtEnd() const { - // We must report iterator past the end of the range when either of the - // component iterators has reached the end of its range. - return - current1_ == end1_ || - current2_ == end2_ || - current3_ == end3_ || - current4_ == end4_ || - current5_ == end5_ || - current6_ == end6_; - } - - // No implementation - assignment is unsupported. - void operator=(const Iterator& other); - - const ParamGeneratorInterface* const base_; - // begin[i]_ and end[i]_ define the i-th range that Iterator traverses. - // current[i]_ is the actual traversing iterator. - const typename ParamGenerator::iterator begin1_; - const typename ParamGenerator::iterator end1_; - typename ParamGenerator::iterator current1_; - const typename ParamGenerator::iterator begin2_; - const typename ParamGenerator::iterator end2_; - typename ParamGenerator::iterator current2_; - const typename ParamGenerator::iterator begin3_; - const typename ParamGenerator::iterator end3_; - typename ParamGenerator::iterator current3_; - const typename ParamGenerator::iterator begin4_; - const typename ParamGenerator::iterator end4_; - typename ParamGenerator::iterator current4_; - const typename ParamGenerator::iterator begin5_; - const typename ParamGenerator::iterator end5_; - typename ParamGenerator::iterator current5_; - const typename ParamGenerator::iterator begin6_; - const typename ParamGenerator::iterator end6_; - typename ParamGenerator::iterator current6_; - linked_ptr current_value_; - }; // class CartesianProductGenerator6::Iterator - - // No implementation - assignment is unsupported. - void operator=(const CartesianProductGenerator6& other); - - const ParamGenerator g1_; - const ParamGenerator g2_; - const ParamGenerator g3_; - const ParamGenerator g4_; - const ParamGenerator g5_; - const ParamGenerator g6_; -}; // class CartesianProductGenerator6 - - -template -class CartesianProductGenerator7 - : public ParamGeneratorInterface< ::testing::tuple > { - public: - typedef ::testing::tuple ParamType; - - CartesianProductGenerator7(const ParamGenerator& g1, - const ParamGenerator& g2, const ParamGenerator& g3, - const ParamGenerator& g4, const ParamGenerator& g5, - const ParamGenerator& g6, const ParamGenerator& g7) - : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7) {} - virtual ~CartesianProductGenerator7() {} - - virtual ParamIteratorInterface* Begin() const { - return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_, - g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin(), g6_, g6_.begin(), g7_, - g7_.begin()); - } - virtual ParamIteratorInterface* End() const { - return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(), - g4_, g4_.end(), g5_, g5_.end(), g6_, g6_.end(), g7_, g7_.end()); - } - - private: - class Iterator : public ParamIteratorInterface { - public: - Iterator(const ParamGeneratorInterface* base, - const ParamGenerator& g1, - const typename ParamGenerator::iterator& current1, - const ParamGenerator& g2, - const typename ParamGenerator::iterator& current2, - const ParamGenerator& g3, - const typename ParamGenerator::iterator& current3, - const ParamGenerator& g4, - const typename ParamGenerator::iterator& current4, - const ParamGenerator& g5, - const typename ParamGenerator::iterator& current5, - const ParamGenerator& g6, - const typename ParamGenerator::iterator& current6, - const ParamGenerator& g7, - const typename ParamGenerator::iterator& current7) - : base_(base), - begin1_(g1.begin()), end1_(g1.end()), current1_(current1), - begin2_(g2.begin()), end2_(g2.end()), current2_(current2), - begin3_(g3.begin()), end3_(g3.end()), current3_(current3), - begin4_(g4.begin()), end4_(g4.end()), current4_(current4), - begin5_(g5.begin()), end5_(g5.end()), current5_(current5), - begin6_(g6.begin()), end6_(g6.end()), current6_(current6), - begin7_(g7.begin()), end7_(g7.end()), current7_(current7) { - ComputeCurrentValue(); - } - virtual ~Iterator() {} - - virtual const ParamGeneratorInterface* BaseGenerator() const { - return base_; - } - // Advance should not be called on beyond-of-range iterators - // so no component iterators must be beyond end of range, either. - virtual void Advance() { - assert(!AtEnd()); - ++current7_; - if (current7_ == end7_) { - current7_ = begin7_; - ++current6_; - } - if (current6_ == end6_) { - current6_ = begin6_; - ++current5_; - } - if (current5_ == end5_) { - current5_ = begin5_; - ++current4_; - } - if (current4_ == end4_) { - current4_ = begin4_; - ++current3_; - } - if (current3_ == end3_) { - current3_ = begin3_; - ++current2_; - } - if (current2_ == end2_) { - current2_ = begin2_; - ++current1_; - } - ComputeCurrentValue(); - } - virtual ParamIteratorInterface* Clone() const { - return new Iterator(*this); - } - virtual const ParamType* Current() const { return current_value_.get(); } - virtual bool Equals(const ParamIteratorInterface& other) const { - // Having the same base generator guarantees that the other - // iterator is of the same type and we can downcast. - GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) - << "The program attempted to compare iterators " - << "from different generators." << std::endl; - const Iterator* typed_other = - CheckedDowncastToActualType(&other); - // We must report iterators equal if they both point beyond their - // respective ranges. That can happen in a variety of fashions, - // so we have to consult AtEnd(). - return (AtEnd() && typed_other->AtEnd()) || - ( - current1_ == typed_other->current1_ && - current2_ == typed_other->current2_ && - current3_ == typed_other->current3_ && - current4_ == typed_other->current4_ && - current5_ == typed_other->current5_ && - current6_ == typed_other->current6_ && - current7_ == typed_other->current7_); - } - - private: - Iterator(const Iterator& other) - : base_(other.base_), - begin1_(other.begin1_), - end1_(other.end1_), - current1_(other.current1_), - begin2_(other.begin2_), - end2_(other.end2_), - current2_(other.current2_), - begin3_(other.begin3_), - end3_(other.end3_), - current3_(other.current3_), - begin4_(other.begin4_), - end4_(other.end4_), - current4_(other.current4_), - begin5_(other.begin5_), - end5_(other.end5_), - current5_(other.current5_), - begin6_(other.begin6_), - end6_(other.end6_), - current6_(other.current6_), - begin7_(other.begin7_), - end7_(other.end7_), - current7_(other.current7_) { - ComputeCurrentValue(); - } - - void ComputeCurrentValue() { - if (!AtEnd()) - current_value_.reset(new ParamType(*current1_, *current2_, *current3_, - *current4_, *current5_, *current6_, *current7_)); - } - bool AtEnd() const { - // We must report iterator past the end of the range when either of the - // component iterators has reached the end of its range. - return - current1_ == end1_ || - current2_ == end2_ || - current3_ == end3_ || - current4_ == end4_ || - current5_ == end5_ || - current6_ == end6_ || - current7_ == end7_; - } - - // No implementation - assignment is unsupported. - void operator=(const Iterator& other); - - const ParamGeneratorInterface* const base_; - // begin[i]_ and end[i]_ define the i-th range that Iterator traverses. - // current[i]_ is the actual traversing iterator. - const typename ParamGenerator::iterator begin1_; - const typename ParamGenerator::iterator end1_; - typename ParamGenerator::iterator current1_; - const typename ParamGenerator::iterator begin2_; - const typename ParamGenerator::iterator end2_; - typename ParamGenerator::iterator current2_; - const typename ParamGenerator::iterator begin3_; - const typename ParamGenerator::iterator end3_; - typename ParamGenerator::iterator current3_; - const typename ParamGenerator::iterator begin4_; - const typename ParamGenerator::iterator end4_; - typename ParamGenerator::iterator current4_; - const typename ParamGenerator::iterator begin5_; - const typename ParamGenerator::iterator end5_; - typename ParamGenerator::iterator current5_; - const typename ParamGenerator::iterator begin6_; - const typename ParamGenerator::iterator end6_; - typename ParamGenerator::iterator current6_; - const typename ParamGenerator::iterator begin7_; - const typename ParamGenerator::iterator end7_; - typename ParamGenerator::iterator current7_; - linked_ptr current_value_; - }; // class CartesianProductGenerator7::Iterator - - // No implementation - assignment is unsupported. - void operator=(const CartesianProductGenerator7& other); - - const ParamGenerator g1_; - const ParamGenerator g2_; - const ParamGenerator g3_; - const ParamGenerator g4_; - const ParamGenerator g5_; - const ParamGenerator g6_; - const ParamGenerator g7_; -}; // class CartesianProductGenerator7 - - -template -class CartesianProductGenerator8 - : public ParamGeneratorInterface< ::testing::tuple > { - public: - typedef ::testing::tuple ParamType; - - CartesianProductGenerator8(const ParamGenerator& g1, - const ParamGenerator& g2, const ParamGenerator& g3, - const ParamGenerator& g4, const ParamGenerator& g5, - const ParamGenerator& g6, const ParamGenerator& g7, - const ParamGenerator& g8) - : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7), - g8_(g8) {} - virtual ~CartesianProductGenerator8() {} - - virtual ParamIteratorInterface* Begin() const { - return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_, - g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin(), g6_, g6_.begin(), g7_, - g7_.begin(), g8_, g8_.begin()); - } - virtual ParamIteratorInterface* End() const { - return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(), - g4_, g4_.end(), g5_, g5_.end(), g6_, g6_.end(), g7_, g7_.end(), g8_, - g8_.end()); - } - - private: - class Iterator : public ParamIteratorInterface { - public: - Iterator(const ParamGeneratorInterface* base, - const ParamGenerator& g1, - const typename ParamGenerator::iterator& current1, - const ParamGenerator& g2, - const typename ParamGenerator::iterator& current2, - const ParamGenerator& g3, - const typename ParamGenerator::iterator& current3, - const ParamGenerator& g4, - const typename ParamGenerator::iterator& current4, - const ParamGenerator& g5, - const typename ParamGenerator::iterator& current5, - const ParamGenerator& g6, - const typename ParamGenerator::iterator& current6, - const ParamGenerator& g7, - const typename ParamGenerator::iterator& current7, - const ParamGenerator& g8, - const typename ParamGenerator::iterator& current8) - : base_(base), - begin1_(g1.begin()), end1_(g1.end()), current1_(current1), - begin2_(g2.begin()), end2_(g2.end()), current2_(current2), - begin3_(g3.begin()), end3_(g3.end()), current3_(current3), - begin4_(g4.begin()), end4_(g4.end()), current4_(current4), - begin5_(g5.begin()), end5_(g5.end()), current5_(current5), - begin6_(g6.begin()), end6_(g6.end()), current6_(current6), - begin7_(g7.begin()), end7_(g7.end()), current7_(current7), - begin8_(g8.begin()), end8_(g8.end()), current8_(current8) { - ComputeCurrentValue(); - } - virtual ~Iterator() {} - - virtual const ParamGeneratorInterface* BaseGenerator() const { - return base_; - } - // Advance should not be called on beyond-of-range iterators - // so no component iterators must be beyond end of range, either. - virtual void Advance() { - assert(!AtEnd()); - ++current8_; - if (current8_ == end8_) { - current8_ = begin8_; - ++current7_; - } - if (current7_ == end7_) { - current7_ = begin7_; - ++current6_; - } - if (current6_ == end6_) { - current6_ = begin6_; - ++current5_; - } - if (current5_ == end5_) { - current5_ = begin5_; - ++current4_; - } - if (current4_ == end4_) { - current4_ = begin4_; - ++current3_; - } - if (current3_ == end3_) { - current3_ = begin3_; - ++current2_; - } - if (current2_ == end2_) { - current2_ = begin2_; - ++current1_; - } - ComputeCurrentValue(); - } - virtual ParamIteratorInterface* Clone() const { - return new Iterator(*this); - } - virtual const ParamType* Current() const { return current_value_.get(); } - virtual bool Equals(const ParamIteratorInterface& other) const { - // Having the same base generator guarantees that the other - // iterator is of the same type and we can downcast. - GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) - << "The program attempted to compare iterators " - << "from different generators." << std::endl; - const Iterator* typed_other = - CheckedDowncastToActualType(&other); - // We must report iterators equal if they both point beyond their - // respective ranges. That can happen in a variety of fashions, - // so we have to consult AtEnd(). - return (AtEnd() && typed_other->AtEnd()) || - ( - current1_ == typed_other->current1_ && - current2_ == typed_other->current2_ && - current3_ == typed_other->current3_ && - current4_ == typed_other->current4_ && - current5_ == typed_other->current5_ && - current6_ == typed_other->current6_ && - current7_ == typed_other->current7_ && - current8_ == typed_other->current8_); - } - - private: - Iterator(const Iterator& other) - : base_(other.base_), - begin1_(other.begin1_), - end1_(other.end1_), - current1_(other.current1_), - begin2_(other.begin2_), - end2_(other.end2_), - current2_(other.current2_), - begin3_(other.begin3_), - end3_(other.end3_), - current3_(other.current3_), - begin4_(other.begin4_), - end4_(other.end4_), - current4_(other.current4_), - begin5_(other.begin5_), - end5_(other.end5_), - current5_(other.current5_), - begin6_(other.begin6_), - end6_(other.end6_), - current6_(other.current6_), - begin7_(other.begin7_), - end7_(other.end7_), - current7_(other.current7_), - begin8_(other.begin8_), - end8_(other.end8_), - current8_(other.current8_) { - ComputeCurrentValue(); - } - - void ComputeCurrentValue() { - if (!AtEnd()) - current_value_.reset(new ParamType(*current1_, *current2_, *current3_, - *current4_, *current5_, *current6_, *current7_, *current8_)); - } - bool AtEnd() const { - // We must report iterator past the end of the range when either of the - // component iterators has reached the end of its range. - return - current1_ == end1_ || - current2_ == end2_ || - current3_ == end3_ || - current4_ == end4_ || - current5_ == end5_ || - current6_ == end6_ || - current7_ == end7_ || - current8_ == end8_; - } - - // No implementation - assignment is unsupported. - void operator=(const Iterator& other); - - const ParamGeneratorInterface* const base_; - // begin[i]_ and end[i]_ define the i-th range that Iterator traverses. - // current[i]_ is the actual traversing iterator. - const typename ParamGenerator::iterator begin1_; - const typename ParamGenerator::iterator end1_; - typename ParamGenerator::iterator current1_; - const typename ParamGenerator::iterator begin2_; - const typename ParamGenerator::iterator end2_; - typename ParamGenerator::iterator current2_; - const typename ParamGenerator::iterator begin3_; - const typename ParamGenerator::iterator end3_; - typename ParamGenerator::iterator current3_; - const typename ParamGenerator::iterator begin4_; - const typename ParamGenerator::iterator end4_; - typename ParamGenerator::iterator current4_; - const typename ParamGenerator::iterator begin5_; - const typename ParamGenerator::iterator end5_; - typename ParamGenerator::iterator current5_; - const typename ParamGenerator::iterator begin6_; - const typename ParamGenerator::iterator end6_; - typename ParamGenerator::iterator current6_; - const typename ParamGenerator::iterator begin7_; - const typename ParamGenerator::iterator end7_; - typename ParamGenerator::iterator current7_; - const typename ParamGenerator::iterator begin8_; - const typename ParamGenerator::iterator end8_; - typename ParamGenerator::iterator current8_; - linked_ptr current_value_; - }; // class CartesianProductGenerator8::Iterator - - // No implementation - assignment is unsupported. - void operator=(const CartesianProductGenerator8& other); - - const ParamGenerator g1_; - const ParamGenerator g2_; - const ParamGenerator g3_; - const ParamGenerator g4_; - const ParamGenerator g5_; - const ParamGenerator g6_; - const ParamGenerator g7_; - const ParamGenerator g8_; -}; // class CartesianProductGenerator8 - - -template -class CartesianProductGenerator9 - : public ParamGeneratorInterface< ::testing::tuple > { - public: - typedef ::testing::tuple ParamType; - - CartesianProductGenerator9(const ParamGenerator& g1, - const ParamGenerator& g2, const ParamGenerator& g3, - const ParamGenerator& g4, const ParamGenerator& g5, - const ParamGenerator& g6, const ParamGenerator& g7, - const ParamGenerator& g8, const ParamGenerator& g9) - : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7), g8_(g8), - g9_(g9) {} - virtual ~CartesianProductGenerator9() {} - - virtual ParamIteratorInterface* Begin() const { - return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_, - g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin(), g6_, g6_.begin(), g7_, - g7_.begin(), g8_, g8_.begin(), g9_, g9_.begin()); - } - virtual ParamIteratorInterface* End() const { - return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(), - g4_, g4_.end(), g5_, g5_.end(), g6_, g6_.end(), g7_, g7_.end(), g8_, - g8_.end(), g9_, g9_.end()); - } - - private: - class Iterator : public ParamIteratorInterface { - public: - Iterator(const ParamGeneratorInterface* base, - const ParamGenerator& g1, - const typename ParamGenerator::iterator& current1, - const ParamGenerator& g2, - const typename ParamGenerator::iterator& current2, - const ParamGenerator& g3, - const typename ParamGenerator::iterator& current3, - const ParamGenerator& g4, - const typename ParamGenerator::iterator& current4, - const ParamGenerator& g5, - const typename ParamGenerator::iterator& current5, - const ParamGenerator& g6, - const typename ParamGenerator::iterator& current6, - const ParamGenerator& g7, - const typename ParamGenerator::iterator& current7, - const ParamGenerator& g8, - const typename ParamGenerator::iterator& current8, - const ParamGenerator& g9, - const typename ParamGenerator::iterator& current9) - : base_(base), - begin1_(g1.begin()), end1_(g1.end()), current1_(current1), - begin2_(g2.begin()), end2_(g2.end()), current2_(current2), - begin3_(g3.begin()), end3_(g3.end()), current3_(current3), - begin4_(g4.begin()), end4_(g4.end()), current4_(current4), - begin5_(g5.begin()), end5_(g5.end()), current5_(current5), - begin6_(g6.begin()), end6_(g6.end()), current6_(current6), - begin7_(g7.begin()), end7_(g7.end()), current7_(current7), - begin8_(g8.begin()), end8_(g8.end()), current8_(current8), - begin9_(g9.begin()), end9_(g9.end()), current9_(current9) { - ComputeCurrentValue(); - } - virtual ~Iterator() {} - - virtual const ParamGeneratorInterface* BaseGenerator() const { - return base_; - } - // Advance should not be called on beyond-of-range iterators - // so no component iterators must be beyond end of range, either. - virtual void Advance() { - assert(!AtEnd()); - ++current9_; - if (current9_ == end9_) { - current9_ = begin9_; - ++current8_; - } - if (current8_ == end8_) { - current8_ = begin8_; - ++current7_; - } - if (current7_ == end7_) { - current7_ = begin7_; - ++current6_; - } - if (current6_ == end6_) { - current6_ = begin6_; - ++current5_; - } - if (current5_ == end5_) { - current5_ = begin5_; - ++current4_; - } - if (current4_ == end4_) { - current4_ = begin4_; - ++current3_; - } - if (current3_ == end3_) { - current3_ = begin3_; - ++current2_; - } - if (current2_ == end2_) { - current2_ = begin2_; - ++current1_; - } - ComputeCurrentValue(); - } - virtual ParamIteratorInterface* Clone() const { - return new Iterator(*this); - } - virtual const ParamType* Current() const { return current_value_.get(); } - virtual bool Equals(const ParamIteratorInterface& other) const { - // Having the same base generator guarantees that the other - // iterator is of the same type and we can downcast. - GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) - << "The program attempted to compare iterators " - << "from different generators." << std::endl; - const Iterator* typed_other = - CheckedDowncastToActualType(&other); - // We must report iterators equal if they both point beyond their - // respective ranges. That can happen in a variety of fashions, - // so we have to consult AtEnd(). - return (AtEnd() && typed_other->AtEnd()) || - ( - current1_ == typed_other->current1_ && - current2_ == typed_other->current2_ && - current3_ == typed_other->current3_ && - current4_ == typed_other->current4_ && - current5_ == typed_other->current5_ && - current6_ == typed_other->current6_ && - current7_ == typed_other->current7_ && - current8_ == typed_other->current8_ && - current9_ == typed_other->current9_); - } - - private: - Iterator(const Iterator& other) - : base_(other.base_), - begin1_(other.begin1_), - end1_(other.end1_), - current1_(other.current1_), - begin2_(other.begin2_), - end2_(other.end2_), - current2_(other.current2_), - begin3_(other.begin3_), - end3_(other.end3_), - current3_(other.current3_), - begin4_(other.begin4_), - end4_(other.end4_), - current4_(other.current4_), - begin5_(other.begin5_), - end5_(other.end5_), - current5_(other.current5_), - begin6_(other.begin6_), - end6_(other.end6_), - current6_(other.current6_), - begin7_(other.begin7_), - end7_(other.end7_), - current7_(other.current7_), - begin8_(other.begin8_), - end8_(other.end8_), - current8_(other.current8_), - begin9_(other.begin9_), - end9_(other.end9_), - current9_(other.current9_) { - ComputeCurrentValue(); - } - - void ComputeCurrentValue() { - if (!AtEnd()) - current_value_.reset(new ParamType(*current1_, *current2_, *current3_, - *current4_, *current5_, *current6_, *current7_, *current8_, - *current9_)); - } - bool AtEnd() const { - // We must report iterator past the end of the range when either of the - // component iterators has reached the end of its range. - return - current1_ == end1_ || - current2_ == end2_ || - current3_ == end3_ || - current4_ == end4_ || - current5_ == end5_ || - current6_ == end6_ || - current7_ == end7_ || - current8_ == end8_ || - current9_ == end9_; - } - - // No implementation - assignment is unsupported. - void operator=(const Iterator& other); - - const ParamGeneratorInterface* const base_; - // begin[i]_ and end[i]_ define the i-th range that Iterator traverses. - // current[i]_ is the actual traversing iterator. - const typename ParamGenerator::iterator begin1_; - const typename ParamGenerator::iterator end1_; - typename ParamGenerator::iterator current1_; - const typename ParamGenerator::iterator begin2_; - const typename ParamGenerator::iterator end2_; - typename ParamGenerator::iterator current2_; - const typename ParamGenerator::iterator begin3_; - const typename ParamGenerator::iterator end3_; - typename ParamGenerator::iterator current3_; - const typename ParamGenerator::iterator begin4_; - const typename ParamGenerator::iterator end4_; - typename ParamGenerator::iterator current4_; - const typename ParamGenerator::iterator begin5_; - const typename ParamGenerator::iterator end5_; - typename ParamGenerator::iterator current5_; - const typename ParamGenerator::iterator begin6_; - const typename ParamGenerator::iterator end6_; - typename ParamGenerator::iterator current6_; - const typename ParamGenerator::iterator begin7_; - const typename ParamGenerator::iterator end7_; - typename ParamGenerator::iterator current7_; - const typename ParamGenerator::iterator begin8_; - const typename ParamGenerator::iterator end8_; - typename ParamGenerator::iterator current8_; - const typename ParamGenerator::iterator begin9_; - const typename ParamGenerator::iterator end9_; - typename ParamGenerator::iterator current9_; - linked_ptr current_value_; - }; // class CartesianProductGenerator9::Iterator - - // No implementation - assignment is unsupported. - void operator=(const CartesianProductGenerator9& other); - - const ParamGenerator g1_; - const ParamGenerator g2_; - const ParamGenerator g3_; - const ParamGenerator g4_; - const ParamGenerator g5_; - const ParamGenerator g6_; - const ParamGenerator g7_; - const ParamGenerator g8_; - const ParamGenerator g9_; -}; // class CartesianProductGenerator9 - - -template -class CartesianProductGenerator10 - : public ParamGeneratorInterface< ::testing::tuple > { - public: - typedef ::testing::tuple ParamType; - - CartesianProductGenerator10(const ParamGenerator& g1, - const ParamGenerator& g2, const ParamGenerator& g3, - const ParamGenerator& g4, const ParamGenerator& g5, - const ParamGenerator& g6, const ParamGenerator& g7, - const ParamGenerator& g8, const ParamGenerator& g9, - const ParamGenerator& g10) - : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7), g8_(g8), - g9_(g9), g10_(g10) {} - virtual ~CartesianProductGenerator10() {} - - virtual ParamIteratorInterface* Begin() const { - return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_, - g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin(), g6_, g6_.begin(), g7_, - g7_.begin(), g8_, g8_.begin(), g9_, g9_.begin(), g10_, g10_.begin()); - } - virtual ParamIteratorInterface* End() const { - return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(), - g4_, g4_.end(), g5_, g5_.end(), g6_, g6_.end(), g7_, g7_.end(), g8_, - g8_.end(), g9_, g9_.end(), g10_, g10_.end()); - } - - private: - class Iterator : public ParamIteratorInterface { - public: - Iterator(const ParamGeneratorInterface* base, - const ParamGenerator& g1, - const typename ParamGenerator::iterator& current1, - const ParamGenerator& g2, - const typename ParamGenerator::iterator& current2, - const ParamGenerator& g3, - const typename ParamGenerator::iterator& current3, - const ParamGenerator& g4, - const typename ParamGenerator::iterator& current4, - const ParamGenerator& g5, - const typename ParamGenerator::iterator& current5, - const ParamGenerator& g6, - const typename ParamGenerator::iterator& current6, - const ParamGenerator& g7, - const typename ParamGenerator::iterator& current7, - const ParamGenerator& g8, - const typename ParamGenerator::iterator& current8, - const ParamGenerator& g9, - const typename ParamGenerator::iterator& current9, - const ParamGenerator& g10, - const typename ParamGenerator::iterator& current10) - : base_(base), - begin1_(g1.begin()), end1_(g1.end()), current1_(current1), - begin2_(g2.begin()), end2_(g2.end()), current2_(current2), - begin3_(g3.begin()), end3_(g3.end()), current3_(current3), - begin4_(g4.begin()), end4_(g4.end()), current4_(current4), - begin5_(g5.begin()), end5_(g5.end()), current5_(current5), - begin6_(g6.begin()), end6_(g6.end()), current6_(current6), - begin7_(g7.begin()), end7_(g7.end()), current7_(current7), - begin8_(g8.begin()), end8_(g8.end()), current8_(current8), - begin9_(g9.begin()), end9_(g9.end()), current9_(current9), - begin10_(g10.begin()), end10_(g10.end()), current10_(current10) { - ComputeCurrentValue(); - } - virtual ~Iterator() {} - - virtual const ParamGeneratorInterface* BaseGenerator() const { - return base_; - } - // Advance should not be called on beyond-of-range iterators - // so no component iterators must be beyond end of range, either. - virtual void Advance() { - assert(!AtEnd()); - ++current10_; - if (current10_ == end10_) { - current10_ = begin10_; - ++current9_; - } - if (current9_ == end9_) { - current9_ = begin9_; - ++current8_; - } - if (current8_ == end8_) { - current8_ = begin8_; - ++current7_; - } - if (current7_ == end7_) { - current7_ = begin7_; - ++current6_; - } - if (current6_ == end6_) { - current6_ = begin6_; - ++current5_; - } - if (current5_ == end5_) { - current5_ = begin5_; - ++current4_; - } - if (current4_ == end4_) { - current4_ = begin4_; - ++current3_; - } - if (current3_ == end3_) { - current3_ = begin3_; - ++current2_; - } - if (current2_ == end2_) { - current2_ = begin2_; - ++current1_; - } - ComputeCurrentValue(); - } - virtual ParamIteratorInterface* Clone() const { - return new Iterator(*this); - } - virtual const ParamType* Current() const { return current_value_.get(); } - virtual bool Equals(const ParamIteratorInterface& other) const { - // Having the same base generator guarantees that the other - // iterator is of the same type and we can downcast. - GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) - << "The program attempted to compare iterators " - << "from different generators." << std::endl; - const Iterator* typed_other = - CheckedDowncastToActualType(&other); - // We must report iterators equal if they both point beyond their - // respective ranges. That can happen in a variety of fashions, - // so we have to consult AtEnd(). - return (AtEnd() && typed_other->AtEnd()) || - ( - current1_ == typed_other->current1_ && - current2_ == typed_other->current2_ && - current3_ == typed_other->current3_ && - current4_ == typed_other->current4_ && - current5_ == typed_other->current5_ && - current6_ == typed_other->current6_ && - current7_ == typed_other->current7_ && - current8_ == typed_other->current8_ && - current9_ == typed_other->current9_ && - current10_ == typed_other->current10_); - } - - private: - Iterator(const Iterator& other) - : base_(other.base_), - begin1_(other.begin1_), - end1_(other.end1_), - current1_(other.current1_), - begin2_(other.begin2_), - end2_(other.end2_), - current2_(other.current2_), - begin3_(other.begin3_), - end3_(other.end3_), - current3_(other.current3_), - begin4_(other.begin4_), - end4_(other.end4_), - current4_(other.current4_), - begin5_(other.begin5_), - end5_(other.end5_), - current5_(other.current5_), - begin6_(other.begin6_), - end6_(other.end6_), - current6_(other.current6_), - begin7_(other.begin7_), - end7_(other.end7_), - current7_(other.current7_), - begin8_(other.begin8_), - end8_(other.end8_), - current8_(other.current8_), - begin9_(other.begin9_), - end9_(other.end9_), - current9_(other.current9_), - begin10_(other.begin10_), - end10_(other.end10_), - current10_(other.current10_) { - ComputeCurrentValue(); - } - - void ComputeCurrentValue() { - if (!AtEnd()) - current_value_.reset(new ParamType(*current1_, *current2_, *current3_, - *current4_, *current5_, *current6_, *current7_, *current8_, - *current9_, *current10_)); - } - bool AtEnd() const { - // We must report iterator past the end of the range when either of the - // component iterators has reached the end of its range. - return - current1_ == end1_ || - current2_ == end2_ || - current3_ == end3_ || - current4_ == end4_ || - current5_ == end5_ || - current6_ == end6_ || - current7_ == end7_ || - current8_ == end8_ || - current9_ == end9_ || - current10_ == end10_; - } - - // No implementation - assignment is unsupported. - void operator=(const Iterator& other); - - const ParamGeneratorInterface* const base_; - // begin[i]_ and end[i]_ define the i-th range that Iterator traverses. - // current[i]_ is the actual traversing iterator. - const typename ParamGenerator::iterator begin1_; - const typename ParamGenerator::iterator end1_; - typename ParamGenerator::iterator current1_; - const typename ParamGenerator::iterator begin2_; - const typename ParamGenerator::iterator end2_; - typename ParamGenerator::iterator current2_; - const typename ParamGenerator::iterator begin3_; - const typename ParamGenerator::iterator end3_; - typename ParamGenerator::iterator current3_; - const typename ParamGenerator::iterator begin4_; - const typename ParamGenerator::iterator end4_; - typename ParamGenerator::iterator current4_; - const typename ParamGenerator::iterator begin5_; - const typename ParamGenerator::iterator end5_; - typename ParamGenerator::iterator current5_; - const typename ParamGenerator::iterator begin6_; - const typename ParamGenerator::iterator end6_; - typename ParamGenerator::iterator current6_; - const typename ParamGenerator::iterator begin7_; - const typename ParamGenerator::iterator end7_; - typename ParamGenerator::iterator current7_; - const typename ParamGenerator::iterator begin8_; - const typename ParamGenerator::iterator end8_; - typename ParamGenerator::iterator current8_; - const typename ParamGenerator::iterator begin9_; - const typename ParamGenerator::iterator end9_; - typename ParamGenerator::iterator current9_; - const typename ParamGenerator::iterator begin10_; - const typename ParamGenerator::iterator end10_; - typename ParamGenerator::iterator current10_; - linked_ptr current_value_; - }; // class CartesianProductGenerator10::Iterator - - // No implementation - assignment is unsupported. - void operator=(const CartesianProductGenerator10& other); - - const ParamGenerator g1_; - const ParamGenerator g2_; - const ParamGenerator g3_; - const ParamGenerator g4_; - const ParamGenerator g5_; - const ParamGenerator g6_; - const ParamGenerator g7_; - const ParamGenerator g8_; - const ParamGenerator g9_; - const ParamGenerator g10_; -}; // class CartesianProductGenerator10 - - -// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. -// -// Helper classes providing Combine() with polymorphic features. They allow -// casting CartesianProductGeneratorN to ParamGenerator if T is -// convertible to U. -// -template -class CartesianProductHolder2 { - public: -CartesianProductHolder2(const Generator1& g1, const Generator2& g2) - : g1_(g1), g2_(g2) {} - template - operator ParamGenerator< ::testing::tuple >() const { - return ParamGenerator< ::testing::tuple >( - new CartesianProductGenerator2( - static_cast >(g1_), - static_cast >(g2_))); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const CartesianProductHolder2& other); - - const Generator1 g1_; - const Generator2 g2_; -}; // class CartesianProductHolder2 - -template -class CartesianProductHolder3 { - public: -CartesianProductHolder3(const Generator1& g1, const Generator2& g2, - const Generator3& g3) - : g1_(g1), g2_(g2), g3_(g3) {} - template - operator ParamGenerator< ::testing::tuple >() const { - return ParamGenerator< ::testing::tuple >( - new CartesianProductGenerator3( - static_cast >(g1_), - static_cast >(g2_), - static_cast >(g3_))); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const CartesianProductHolder3& other); - - const Generator1 g1_; - const Generator2 g2_; - const Generator3 g3_; -}; // class CartesianProductHolder3 - -template -class CartesianProductHolder4 { - public: -CartesianProductHolder4(const Generator1& g1, const Generator2& g2, - const Generator3& g3, const Generator4& g4) - : g1_(g1), g2_(g2), g3_(g3), g4_(g4) {} - template - operator ParamGenerator< ::testing::tuple >() const { - return ParamGenerator< ::testing::tuple >( - new CartesianProductGenerator4( - static_cast >(g1_), - static_cast >(g2_), - static_cast >(g3_), - static_cast >(g4_))); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const CartesianProductHolder4& other); - - const Generator1 g1_; - const Generator2 g2_; - const Generator3 g3_; - const Generator4 g4_; -}; // class CartesianProductHolder4 - -template -class CartesianProductHolder5 { - public: -CartesianProductHolder5(const Generator1& g1, const Generator2& g2, - const Generator3& g3, const Generator4& g4, const Generator5& g5) - : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5) {} - template - operator ParamGenerator< ::testing::tuple >() const { - return ParamGenerator< ::testing::tuple >( - new CartesianProductGenerator5( - static_cast >(g1_), - static_cast >(g2_), - static_cast >(g3_), - static_cast >(g4_), - static_cast >(g5_))); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const CartesianProductHolder5& other); - - const Generator1 g1_; - const Generator2 g2_; - const Generator3 g3_; - const Generator4 g4_; - const Generator5 g5_; -}; // class CartesianProductHolder5 - -template -class CartesianProductHolder6 { - public: -CartesianProductHolder6(const Generator1& g1, const Generator2& g2, - const Generator3& g3, const Generator4& g4, const Generator5& g5, - const Generator6& g6) - : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6) {} - template - operator ParamGenerator< ::testing::tuple >() const { - return ParamGenerator< ::testing::tuple >( - new CartesianProductGenerator6( - static_cast >(g1_), - static_cast >(g2_), - static_cast >(g3_), - static_cast >(g4_), - static_cast >(g5_), - static_cast >(g6_))); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const CartesianProductHolder6& other); - - const Generator1 g1_; - const Generator2 g2_; - const Generator3 g3_; - const Generator4 g4_; - const Generator5 g5_; - const Generator6 g6_; -}; // class CartesianProductHolder6 - -template -class CartesianProductHolder7 { - public: -CartesianProductHolder7(const Generator1& g1, const Generator2& g2, - const Generator3& g3, const Generator4& g4, const Generator5& g5, - const Generator6& g6, const Generator7& g7) - : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7) {} - template - operator ParamGenerator< ::testing::tuple >() const { - return ParamGenerator< ::testing::tuple >( - new CartesianProductGenerator7( - static_cast >(g1_), - static_cast >(g2_), - static_cast >(g3_), - static_cast >(g4_), - static_cast >(g5_), - static_cast >(g6_), - static_cast >(g7_))); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const CartesianProductHolder7& other); - - const Generator1 g1_; - const Generator2 g2_; - const Generator3 g3_; - const Generator4 g4_; - const Generator5 g5_; - const Generator6 g6_; - const Generator7 g7_; -}; // class CartesianProductHolder7 - -template -class CartesianProductHolder8 { - public: -CartesianProductHolder8(const Generator1& g1, const Generator2& g2, - const Generator3& g3, const Generator4& g4, const Generator5& g5, - const Generator6& g6, const Generator7& g7, const Generator8& g8) - : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7), - g8_(g8) {} - template - operator ParamGenerator< ::testing::tuple >() const { - return ParamGenerator< ::testing::tuple >( - new CartesianProductGenerator8( - static_cast >(g1_), - static_cast >(g2_), - static_cast >(g3_), - static_cast >(g4_), - static_cast >(g5_), - static_cast >(g6_), - static_cast >(g7_), - static_cast >(g8_))); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const CartesianProductHolder8& other); - - const Generator1 g1_; - const Generator2 g2_; - const Generator3 g3_; - const Generator4 g4_; - const Generator5 g5_; - const Generator6 g6_; - const Generator7 g7_; - const Generator8 g8_; -}; // class CartesianProductHolder8 - -template -class CartesianProductHolder9 { - public: -CartesianProductHolder9(const Generator1& g1, const Generator2& g2, - const Generator3& g3, const Generator4& g4, const Generator5& g5, - const Generator6& g6, const Generator7& g7, const Generator8& g8, - const Generator9& g9) - : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7), g8_(g8), - g9_(g9) {} - template - operator ParamGenerator< ::testing::tuple >() const { - return ParamGenerator< ::testing::tuple >( - new CartesianProductGenerator9( - static_cast >(g1_), - static_cast >(g2_), - static_cast >(g3_), - static_cast >(g4_), - static_cast >(g5_), - static_cast >(g6_), - static_cast >(g7_), - static_cast >(g8_), - static_cast >(g9_))); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const CartesianProductHolder9& other); - - const Generator1 g1_; - const Generator2 g2_; - const Generator3 g3_; - const Generator4 g4_; - const Generator5 g5_; - const Generator6 g6_; - const Generator7 g7_; - const Generator8 g8_; - const Generator9 g9_; -}; // class CartesianProductHolder9 - -template -class CartesianProductHolder10 { - public: -CartesianProductHolder10(const Generator1& g1, const Generator2& g2, - const Generator3& g3, const Generator4& g4, const Generator5& g5, - const Generator6& g6, const Generator7& g7, const Generator8& g8, - const Generator9& g9, const Generator10& g10) - : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7), g8_(g8), - g9_(g9), g10_(g10) {} - template - operator ParamGenerator< ::testing::tuple >() const { - return ParamGenerator< ::testing::tuple >( - new CartesianProductGenerator10( - static_cast >(g1_), - static_cast >(g2_), - static_cast >(g3_), - static_cast >(g4_), - static_cast >(g5_), - static_cast >(g6_), - static_cast >(g7_), - static_cast >(g8_), - static_cast >(g9_), - static_cast >(g10_))); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const CartesianProductHolder10& other); - - const Generator1 g1_; - const Generator2 g2_; - const Generator3 g3_; - const Generator4 g4_; - const Generator5 g5_; - const Generator6 g6_; - const Generator7 g7_; - const Generator8 g8_; - const Generator9 g9_; - const Generator10 g10_; -}; // class CartesianProductHolder10 - -# endif // GTEST_HAS_COMBINE - -} // namespace internal -} // namespace testing - -#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_GENERATED_H_ diff --git a/third_party/googletest/src/include/gtest/internal/gtest-param-util-generated.h.pump b/third_party/googletest/src/include/gtest/internal/gtest-param-util-generated.h.pump deleted file mode 100644 index 30dffe43c3..0000000000 --- a/third_party/googletest/src/include/gtest/internal/gtest-param-util-generated.h.pump +++ /dev/null @@ -1,282 +0,0 @@ -$$ -*- mode: c++; -*- -$var n = 50 $$ Maximum length of Values arguments we want to support. -$var maxtuple = 10 $$ Maximum number of Combine arguments we want to support. -// Copyright 2008 Google Inc. -// All Rights Reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - -// Type and function utilities for implementing parameterized tests. -// This file is generated by a SCRIPT. DO NOT EDIT BY HAND! -// -// Currently Google Test supports at most $n arguments in Values, -// and at most $maxtuple arguments in Combine. Please contact -// googletestframework@googlegroups.com if you need more. -// Please note that the number of arguments to Combine is limited -// by the maximum arity of the implementation of tuple which is -// currently set at $maxtuple. - -// GOOGLETEST_CM0001 DO NOT DELETE - -#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_GENERATED_H_ -#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_GENERATED_H_ - -#include "gtest/internal/gtest-param-util.h" -#include "gtest/internal/gtest-port.h" - -namespace testing { - -// Forward declarations of ValuesIn(), which is implemented in -// include/gtest/gtest-param-test.h. -template -internal::ParamGenerator< - typename ::testing::internal::IteratorTraits::value_type> -ValuesIn(ForwardIterator begin, ForwardIterator end); - -template -internal::ParamGenerator ValuesIn(const T (&array)[N]); - -template -internal::ParamGenerator ValuesIn( - const Container& container); - -namespace internal { - -// Used in the Values() function to provide polymorphic capabilities. -$range i 1..n -$for i [[ -$range j 1..i - -template <$for j, [[typename T$j]]> -class ValueArray$i { - public: - $if i==1 [[explicit ]]ValueArray$i($for j, [[T$j v$j]]) : $for j, [[v$(j)_(v$j)]] {} - - template - operator ParamGenerator() const { - const T array[] = {$for j, [[static_cast(v$(j)_)]]}; - return ValuesIn(array); - } - - ValueArray$i(const ValueArray$i& other) : $for j, [[v$(j)_(other.v$(j)_)]] {} - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray$i& other); - -$for j [[ - - const T$j v$(j)_; -]] - -}; - -]] - -# if GTEST_HAS_COMBINE -// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. -// -// Generates values from the Cartesian product of values produced -// by the argument generators. -// -$range i 2..maxtuple -$for i [[ -$range j 1..i -$range k 2..i - -template <$for j, [[typename T$j]]> -class CartesianProductGenerator$i - : public ParamGeneratorInterface< ::testing::tuple<$for j, [[T$j]]> > { - public: - typedef ::testing::tuple<$for j, [[T$j]]> ParamType; - - CartesianProductGenerator$i($for j, [[const ParamGenerator& g$j]]) - : $for j, [[g$(j)_(g$j)]] {} - virtual ~CartesianProductGenerator$i() {} - - virtual ParamIteratorInterface* Begin() const { - return new Iterator(this, $for j, [[g$(j)_, g$(j)_.begin()]]); - } - virtual ParamIteratorInterface* End() const { - return new Iterator(this, $for j, [[g$(j)_, g$(j)_.end()]]); - } - - private: - class Iterator : public ParamIteratorInterface { - public: - Iterator(const ParamGeneratorInterface* base, $for j, [[ - - const ParamGenerator& g$j, - const typename ParamGenerator::iterator& current$(j)]]) - : base_(base), -$for j, [[ - - begin$(j)_(g$j.begin()), end$(j)_(g$j.end()), current$(j)_(current$j) -]] { - ComputeCurrentValue(); - } - virtual ~Iterator() {} - - virtual const ParamGeneratorInterface* BaseGenerator() const { - return base_; - } - // Advance should not be called on beyond-of-range iterators - // so no component iterators must be beyond end of range, either. - virtual void Advance() { - assert(!AtEnd()); - ++current$(i)_; - -$for k [[ - if (current$(i+2-k)_ == end$(i+2-k)_) { - current$(i+2-k)_ = begin$(i+2-k)_; - ++current$(i+2-k-1)_; - } - -]] - ComputeCurrentValue(); - } - virtual ParamIteratorInterface* Clone() const { - return new Iterator(*this); - } - virtual const ParamType* Current() const { return current_value_.get(); } - virtual bool Equals(const ParamIteratorInterface& other) const { - // Having the same base generator guarantees that the other - // iterator is of the same type and we can downcast. - GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) - << "The program attempted to compare iterators " - << "from different generators." << std::endl; - const Iterator* typed_other = - CheckedDowncastToActualType(&other); - // We must report iterators equal if they both point beyond their - // respective ranges. That can happen in a variety of fashions, - // so we have to consult AtEnd(). - return (AtEnd() && typed_other->AtEnd()) || - ($for j && [[ - - current$(j)_ == typed_other->current$(j)_ -]]); - } - - private: - Iterator(const Iterator& other) - : base_(other.base_), $for j, [[ - - begin$(j)_(other.begin$(j)_), - end$(j)_(other.end$(j)_), - current$(j)_(other.current$(j)_) -]] { - ComputeCurrentValue(); - } - - void ComputeCurrentValue() { - if (!AtEnd()) - current_value_.reset(new ParamType($for j, [[*current$(j)_]])); - } - bool AtEnd() const { - // We must report iterator past the end of the range when either of the - // component iterators has reached the end of its range. - return -$for j || [[ - - current$(j)_ == end$(j)_ -]]; - } - - // No implementation - assignment is unsupported. - void operator=(const Iterator& other); - - const ParamGeneratorInterface* const base_; - // begin[i]_ and end[i]_ define the i-th range that Iterator traverses. - // current[i]_ is the actual traversing iterator. -$for j [[ - - const typename ParamGenerator::iterator begin$(j)_; - const typename ParamGenerator::iterator end$(j)_; - typename ParamGenerator::iterator current$(j)_; -]] - - linked_ptr current_value_; - }; // class CartesianProductGenerator$i::Iterator - - // No implementation - assignment is unsupported. - void operator=(const CartesianProductGenerator$i& other); - - -$for j [[ - const ParamGenerator g$(j)_; - -]] -}; // class CartesianProductGenerator$i - - -]] - -// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. -// -// Helper classes providing Combine() with polymorphic features. They allow -// casting CartesianProductGeneratorN to ParamGenerator if T is -// convertible to U. -// -$range i 2..maxtuple -$for i [[ -$range j 1..i - -template <$for j, [[class Generator$j]]> -class CartesianProductHolder$i { - public: -CartesianProductHolder$i($for j, [[const Generator$j& g$j]]) - : $for j, [[g$(j)_(g$j)]] {} - template <$for j, [[typename T$j]]> - operator ParamGenerator< ::testing::tuple<$for j, [[T$j]]> >() const { - return ParamGenerator< ::testing::tuple<$for j, [[T$j]]> >( - new CartesianProductGenerator$i<$for j, [[T$j]]>( -$for j,[[ - - static_cast >(g$(j)_) -]])); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const CartesianProductHolder$i& other); - - -$for j [[ - const Generator$j g$(j)_; - -]] -}; // class CartesianProductHolder$i - -]] - -# endif // GTEST_HAS_COMBINE - -} // namespace internal -} // namespace testing - -#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_GENERATED_H_ diff --git a/third_party/googletest/src/include/gtest/internal/gtest-param-util.h b/third_party/googletest/src/include/gtest/internal/gtest-param-util.h index d64f620c4c..97533993c0 100644 --- a/third_party/googletest/src/include/gtest/internal/gtest-param-util.h +++ b/third_party/googletest/src/include/gtest/internal/gtest-param-util.h @@ -37,18 +37,19 @@ #include +#include #include +#include #include +#include #include #include #include "gtest/internal/gtest-internal.h" -#include "gtest/internal/gtest-linked_ptr.h" #include "gtest/internal/gtest-port.h" #include "gtest/gtest-printers.h" namespace testing { - // Input to a parameterized test name generator, describing a test parameter. // Consists of the parameter value and the integer parameter index. template @@ -72,13 +73,14 @@ struct PrintToStringParamName { namespace internal { // INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. -// +// Utility Functions + // Outputs a message explaining invalid registration of different -// fixture class for the same test case. This may happen when +// fixture class for the same test suite. This may happen when // TEST_P macro is used to define two tests with the same name // but in different namespaces. -GTEST_API_ void ReportInvalidTestCaseType(const char* test_case_name, - CodeLocation code_location); +GTEST_API_ void ReportInvalidTestSuiteType(const char* test_suite_name, + CodeLocation code_location); template class ParamGeneratorInterface; template class ParamGenerator; @@ -153,7 +155,7 @@ class ParamIterator { private: friend class ParamGenerator; explicit ParamIterator(ParamIteratorInterface* impl) : impl_(impl) {} - scoped_ptr > impl_; + std::unique_ptr > impl_; }; // ParamGeneratorInterface is the binary interface to access generators @@ -192,7 +194,7 @@ class ParamGenerator { iterator end() const { return iterator(impl_->End()); } private: - linked_ptr > impl_; + std::shared_ptr > impl_; }; // Generates values from a range of two comparable values. Can be used to @@ -205,12 +207,12 @@ class RangeGenerator : public ParamGeneratorInterface { RangeGenerator(T begin, T end, IncrementT step) : begin_(begin), end_(end), step_(step), end_index_(CalculateEndIndex(begin, end, step)) {} - virtual ~RangeGenerator() {} + ~RangeGenerator() override {} - virtual ParamIteratorInterface* Begin() const { + ParamIteratorInterface* Begin() const override { return new Iterator(this, begin_, 0, step_); } - virtual ParamIteratorInterface* End() const { + ParamIteratorInterface* End() const override { return new Iterator(this, end_, end_index_, step_); } @@ -220,20 +222,20 @@ class RangeGenerator : public ParamGeneratorInterface { Iterator(const ParamGeneratorInterface* base, T value, int index, IncrementT step) : base_(base), value_(value), index_(index), step_(step) {} - virtual ~Iterator() {} + ~Iterator() override {} - virtual const ParamGeneratorInterface* BaseGenerator() const { + const ParamGeneratorInterface* BaseGenerator() const override { return base_; } - virtual void Advance() { + void Advance() override { value_ = static_cast(value_ + step_); index_++; } - virtual ParamIteratorInterface* Clone() const { + ParamIteratorInterface* Clone() const override { return new Iterator(*this); } - virtual const T* Current() const { return &value_; } - virtual bool Equals(const ParamIteratorInterface& other) const { + const T* Current() const override { return &value_; } + bool Equals(const ParamIteratorInterface& other) const override { // Having the same base generator guarantees that the other // iterator is of the same type and we can downcast. GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) @@ -290,12 +292,12 @@ class ValuesInIteratorRangeGenerator : public ParamGeneratorInterface { template ValuesInIteratorRangeGenerator(ForwardIterator begin, ForwardIterator end) : container_(begin, end) {} - virtual ~ValuesInIteratorRangeGenerator() {} + ~ValuesInIteratorRangeGenerator() override {} - virtual ParamIteratorInterface* Begin() const { + ParamIteratorInterface* Begin() const override { return new Iterator(this, container_.begin()); } - virtual ParamIteratorInterface* End() const { + ParamIteratorInterface* End() const override { return new Iterator(this, container_.end()); } @@ -307,16 +309,16 @@ class ValuesInIteratorRangeGenerator : public ParamGeneratorInterface { Iterator(const ParamGeneratorInterface* base, typename ContainerType::const_iterator iterator) : base_(base), iterator_(iterator) {} - virtual ~Iterator() {} + ~Iterator() override {} - virtual const ParamGeneratorInterface* BaseGenerator() const { + const ParamGeneratorInterface* BaseGenerator() const override { return base_; } - virtual void Advance() { + void Advance() override { ++iterator_; value_.reset(); } - virtual ParamIteratorInterface* Clone() const { + ParamIteratorInterface* Clone() const override { return new Iterator(*this); } // We need to use cached value referenced by iterator_ because *iterator_ @@ -326,12 +328,11 @@ class ValuesInIteratorRangeGenerator : public ParamGeneratorInterface { // can advance iterator_ beyond the end of the range, and we cannot // detect that fact. The client code, on the other hand, is // responsible for not calling Current() on an out-of-range iterator. - virtual const T* Current() const { - if (value_.get() == NULL) - value_.reset(new T(*iterator_)); + const T* Current() const override { + if (value_.get() == nullptr) value_.reset(new T(*iterator_)); return value_.get(); } - virtual bool Equals(const ParamIteratorInterface& other) const { + bool Equals(const ParamIteratorInterface& other) const override { // Having the same base generator guarantees that the other // iterator is of the same type and we can downcast. GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) @@ -354,9 +355,9 @@ class ValuesInIteratorRangeGenerator : public ParamGeneratorInterface { // A cached value of *iterator_. We keep it here to allow access by // pointer in the wrapping iterator's operator->(). // value_ needs to be mutable to be accessed in Current(). - // Use of scoped_ptr helps manage cached value's lifetime, + // Use of std::unique_ptr helps manage cached value's lifetime, // which is bound by the lifespan of the iterator itself. - mutable scoped_ptr value_; + mutable std::unique_ptr value_; }; // class ValuesInIteratorRangeGenerator::Iterator // No implementation - assignment is unsupported. @@ -376,25 +377,12 @@ std::string DefaultParamName(const TestParamInfo& info) { return name_stream.GetString(); } -// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. -// -// Parameterized test name overload helpers, which help the -// INSTANTIATE_TEST_CASE_P macro choose between the default parameterized -// test name generator and user param name generator. -template -ParamNameGenFunctor GetParamNameGen(ParamNameGenFunctor func) { - return func; -} - -template -struct ParamNameGenFunc { - typedef std::string Type(const TestParamInfo&); -}; - -template -typename ParamNameGenFunc::Type *GetParamNameGen() { - return DefaultParamName; +template +void TestNotEmpty() { + static_assert(sizeof(T) == 0, "Empty arguments are not allowed."); } +template +void TestNotEmpty(const T&) {} // INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. // @@ -406,7 +394,7 @@ class ParameterizedTestFactory : public TestFactoryBase { typedef typename TestClass::ParamType ParamType; explicit ParameterizedTestFactory(ParamType parameter) : parameter_(parameter) {} - virtual Test* CreateTest() { + Test* CreateTest() override { TestClass::SetParam(¶meter_); return new TestClass(); } @@ -434,19 +422,19 @@ class TestMetaFactoryBase { // TestMetaFactory creates test factories for passing into // MakeAndRegisterTestInfo function. Since MakeAndRegisterTestInfo receives // ownership of test factory pointer, same factory object cannot be passed -// into that method twice. But ParameterizedTestCaseInfo is going to call +// into that method twice. But ParameterizedTestSuiteInfo is going to call // it for each Test/Parameter value combination. Thus it needs meta factory // creator class. -template +template class TestMetaFactory - : public TestMetaFactoryBase { + : public TestMetaFactoryBase { public: - typedef typename TestCase::ParamType ParamType; + using ParamType = typename TestSuite::ParamType; TestMetaFactory() {} - virtual TestFactoryBase* CreateTestFactory(ParamType parameter) { - return new ParameterizedTestFactory(parameter); + TestFactoryBase* CreateTestFactory(ParamType parameter) override { + return new ParameterizedTestFactory(parameter); } private: @@ -455,93 +443,93 @@ class TestMetaFactory // INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. // -// ParameterizedTestCaseInfoBase is a generic interface -// to ParameterizedTestCaseInfo classes. ParameterizedTestCaseInfoBase +// ParameterizedTestSuiteInfoBase is a generic interface +// to ParameterizedTestSuiteInfo classes. ParameterizedTestSuiteInfoBase // accumulates test information provided by TEST_P macro invocations -// and generators provided by INSTANTIATE_TEST_CASE_P macro invocations +// and generators provided by INSTANTIATE_TEST_SUITE_P macro invocations // and uses that information to register all resulting test instances -// in RegisterTests method. The ParameterizeTestCaseRegistry class holds -// a collection of pointers to the ParameterizedTestCaseInfo objects +// in RegisterTests method. The ParameterizeTestSuiteRegistry class holds +// a collection of pointers to the ParameterizedTestSuiteInfo objects // and calls RegisterTests() on each of them when asked. -class ParameterizedTestCaseInfoBase { +class ParameterizedTestSuiteInfoBase { public: - virtual ~ParameterizedTestCaseInfoBase() {} + virtual ~ParameterizedTestSuiteInfoBase() {} - // Base part of test case name for display purposes. - virtual const std::string& GetTestCaseName() const = 0; + // Base part of test suite name for display purposes. + virtual const std::string& GetTestSuiteName() const = 0; // Test case id to verify identity. - virtual TypeId GetTestCaseTypeId() const = 0; + virtual TypeId GetTestSuiteTypeId() const = 0; // UnitTest class invokes this method to register tests in this - // test case right before running them in RUN_ALL_TESTS macro. - // This method should not be called more then once on any single - // instance of a ParameterizedTestCaseInfoBase derived class. + // test suite right before running them in RUN_ALL_TESTS macro. + // This method should not be called more than once on any single + // instance of a ParameterizedTestSuiteInfoBase derived class. virtual void RegisterTests() = 0; protected: - ParameterizedTestCaseInfoBase() {} + ParameterizedTestSuiteInfoBase() {} private: - GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestCaseInfoBase); + GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestSuiteInfoBase); }; // INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. // -// ParameterizedTestCaseInfo accumulates tests obtained from TEST_P -// macro invocations for a particular test case and generators -// obtained from INSTANTIATE_TEST_CASE_P macro invocations for that -// test case. It registers tests with all values generated by all +// ParameterizedTestSuiteInfo accumulates tests obtained from TEST_P +// macro invocations for a particular test suite and generators +// obtained from INSTANTIATE_TEST_SUITE_P macro invocations for that +// test suite. It registers tests with all values generated by all // generators when asked. -template -class ParameterizedTestCaseInfo : public ParameterizedTestCaseInfoBase { +template +class ParameterizedTestSuiteInfo : public ParameterizedTestSuiteInfoBase { public: // ParamType and GeneratorCreationFunc are private types but are required // for declarations of public methods AddTestPattern() and - // AddTestCaseInstantiation(). - typedef typename TestCase::ParamType ParamType; + // AddTestSuiteInstantiation(). + using ParamType = typename TestSuite::ParamType; // A function that returns an instance of appropriate generator type. typedef ParamGenerator(GeneratorCreationFunc)(); - typedef typename ParamNameGenFunc::Type ParamNameGeneratorFunc; + using ParamNameGeneratorFunc = std::string(const TestParamInfo&); - explicit ParameterizedTestCaseInfo( - const char* name, CodeLocation code_location) - : test_case_name_(name), code_location_(code_location) {} + explicit ParameterizedTestSuiteInfo(const char* name, + CodeLocation code_location) + : test_suite_name_(name), code_location_(code_location) {} // Test case base name for display purposes. - virtual const std::string& GetTestCaseName() const { return test_case_name_; } + const std::string& GetTestSuiteName() const override { + return test_suite_name_; + } // Test case id to verify identity. - virtual TypeId GetTestCaseTypeId() const { return GetTypeId(); } + TypeId GetTestSuiteTypeId() const override { return GetTypeId(); } // TEST_P macro uses AddTestPattern() to record information // about a single test in a LocalTestInfo structure. - // test_case_name is the base name of the test case (without invocation + // test_suite_name is the base name of the test suite (without invocation // prefix). test_base_name is the name of an individual test without // parameter index. For the test SequenceA/FooTest.DoBar/1 FooTest is - // test case base name and DoBar is test base name. - void AddTestPattern(const char* test_case_name, - const char* test_base_name, + // test suite base name and DoBar is test base name. + void AddTestPattern(const char* test_suite_name, const char* test_base_name, TestMetaFactoryBase* meta_factory) { - tests_.push_back(linked_ptr(new TestInfo(test_case_name, - test_base_name, - meta_factory))); + tests_.push_back(std::shared_ptr( + new TestInfo(test_suite_name, test_base_name, meta_factory))); } - // INSTANTIATE_TEST_CASE_P macro uses AddGenerator() to record information + // INSTANTIATE_TEST_SUITE_P macro uses AddGenerator() to record information // about a generator. - int AddTestCaseInstantiation(const std::string& instantiation_name, - GeneratorCreationFunc* func, - ParamNameGeneratorFunc* name_func, - const char* file, int line) { + int AddTestSuiteInstantiation(const std::string& instantiation_name, + GeneratorCreationFunc* func, + ParamNameGeneratorFunc* name_func, + const char* file, int line) { instantiations_.push_back( InstantiationInfo(instantiation_name, func, name_func, file, line)); return 0; // Return value used only to run this method in namespace scope. } - // UnitTest class invokes this method to register tests in this test case - // test cases right before running tests in RUN_ALL_TESTS macro. - // This method should not be called more then once on any single - // instance of a ParameterizedTestCaseInfoBase derived class. - // UnitTest has a guard to prevent from calling this method more then once. - virtual void RegisterTests() { + // UnitTest class invokes this method to register tests in this test suite + // test suites right before running tests in RUN_ALL_TESTS macro. + // This method should not be called more than once on any single + // instance of a ParameterizedTestSuiteInfoBase derived class. + // UnitTest has a guard to prevent from calling this method more than once. + void RegisterTests() override { for (typename TestInfoContainer::iterator test_it = tests_.begin(); test_it != tests_.end(); ++test_it) { - linked_ptr test_info = *test_it; + std::shared_ptr test_info = *test_it; for (typename InstantiationContainer::iterator gen_it = instantiations_.begin(); gen_it != instantiations_.end(); ++gen_it) { @@ -551,10 +539,10 @@ class ParameterizedTestCaseInfo : public ParameterizedTestCaseInfoBase { const char* file = gen_it->file; int line = gen_it->line; - std::string test_case_name; + std::string test_suite_name; if ( !instantiation_name.empty() ) - test_case_name = instantiation_name + "/"; - test_case_name += test_info->test_case_base_name; + test_suite_name = instantiation_name + "/"; + test_suite_name += test_info->test_suite_base_name; size_t i = 0; std::set test_param_names; @@ -577,39 +565,39 @@ class ParameterizedTestCaseInfo : public ParameterizedTestCaseInfoBase { test_param_names.insert(param_name); - test_name_stream << test_info->test_base_name << "/" << param_name; + if (!test_info->test_base_name.empty()) { + test_name_stream << test_info->test_base_name << "/"; + } + test_name_stream << param_name; MakeAndRegisterTestInfo( - test_case_name.c_str(), - test_name_stream.GetString().c_str(), - NULL, // No type parameter. - PrintToString(*param_it).c_str(), - code_location_, - GetTestCaseTypeId(), - TestCase::SetUpTestCase, - TestCase::TearDownTestCase, + test_suite_name.c_str(), test_name_stream.GetString().c_str(), + nullptr, // No type parameter. + PrintToString(*param_it).c_str(), code_location_, + GetTestSuiteTypeId(), + SuiteApiResolver::GetSetUpCaseOrSuite(file, line), + SuiteApiResolver::GetTearDownCaseOrSuite(file, line), test_info->test_meta_factory->CreateTestFactory(*param_it)); } // for param_it } // for gen_it } // for test_it - } // RegisterTests + } // RegisterTests private: // LocalTestInfo structure keeps information about a single test registered // with TEST_P macro. struct TestInfo { - TestInfo(const char* a_test_case_base_name, - const char* a_test_base_name, - TestMetaFactoryBase* a_test_meta_factory) : - test_case_base_name(a_test_case_base_name), - test_base_name(a_test_base_name), - test_meta_factory(a_test_meta_factory) {} - - const std::string test_case_base_name; + TestInfo(const char* a_test_suite_base_name, const char* a_test_base_name, + TestMetaFactoryBase* a_test_meta_factory) + : test_suite_base_name(a_test_suite_base_name), + test_base_name(a_test_base_name), + test_meta_factory(a_test_meta_factory) {} + + const std::string test_suite_base_name; const std::string test_base_name; - const scoped_ptr > test_meta_factory; + const std::unique_ptr > test_meta_factory; }; - typedef ::std::vector > TestInfoContainer; - // Records data received from INSTANTIATE_TEST_CASE_P macros: + using TestInfoContainer = ::std::vector >; + // Records data received from INSTANTIATE_TEST_SUITE_P macros: // struct InstantiationInfo { @@ -646,76 +634,247 @@ class ParameterizedTestCaseInfo : public ParameterizedTestCaseInfoBase { return true; } - const std::string test_case_name_; + const std::string test_suite_name_; CodeLocation code_location_; TestInfoContainer tests_; InstantiationContainer instantiations_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestCaseInfo); -}; // class ParameterizedTestCaseInfo + GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestSuiteInfo); +}; // class ParameterizedTestSuiteInfo + +// Legacy API is deprecated but still available +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ +template +using ParameterizedTestCaseInfo = ParameterizedTestSuiteInfo; +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ // INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. // -// ParameterizedTestCaseRegistry contains a map of ParameterizedTestCaseInfoBase -// classes accessed by test case names. TEST_P and INSTANTIATE_TEST_CASE_P -// macros use it to locate their corresponding ParameterizedTestCaseInfo -// descriptors. -class ParameterizedTestCaseRegistry { +// ParameterizedTestSuiteRegistry contains a map of +// ParameterizedTestSuiteInfoBase classes accessed by test suite names. TEST_P +// and INSTANTIATE_TEST_SUITE_P macros use it to locate their corresponding +// ParameterizedTestSuiteInfo descriptors. +class ParameterizedTestSuiteRegistry { public: - ParameterizedTestCaseRegistry() {} - ~ParameterizedTestCaseRegistry() { - for (TestCaseInfoContainer::iterator it = test_case_infos_.begin(); - it != test_case_infos_.end(); ++it) { - delete *it; + ParameterizedTestSuiteRegistry() {} + ~ParameterizedTestSuiteRegistry() { + for (auto& test_suite_info : test_suite_infos_) { + delete test_suite_info; } } // Looks up or creates and returns a structure containing information about - // tests and instantiations of a particular test case. - template - ParameterizedTestCaseInfo* GetTestCasePatternHolder( - const char* test_case_name, - CodeLocation code_location) { - ParameterizedTestCaseInfo* typed_test_info = NULL; - for (TestCaseInfoContainer::iterator it = test_case_infos_.begin(); - it != test_case_infos_.end(); ++it) { - if ((*it)->GetTestCaseName() == test_case_name) { - if ((*it)->GetTestCaseTypeId() != GetTypeId()) { + // tests and instantiations of a particular test suite. + template + ParameterizedTestSuiteInfo* GetTestSuitePatternHolder( + const char* test_suite_name, CodeLocation code_location) { + ParameterizedTestSuiteInfo* typed_test_info = nullptr; + for (auto& test_suite_info : test_suite_infos_) { + if (test_suite_info->GetTestSuiteName() == test_suite_name) { + if (test_suite_info->GetTestSuiteTypeId() != GetTypeId()) { // Complain about incorrect usage of Google Test facilities // and terminate the program since we cannot guaranty correct - // test case setup and tear-down in this case. - ReportInvalidTestCaseType(test_case_name, code_location); + // test suite setup and tear-down in this case. + ReportInvalidTestSuiteType(test_suite_name, code_location); posix::Abort(); } else { // At this point we are sure that the object we found is of the same // type we are looking for, so we downcast it to that type // without further checks. typed_test_info = CheckedDowncastToActualType< - ParameterizedTestCaseInfo >(*it); + ParameterizedTestSuiteInfo >(test_suite_info); } break; } } - if (typed_test_info == NULL) { - typed_test_info = new ParameterizedTestCaseInfo( - test_case_name, code_location); - test_case_infos_.push_back(typed_test_info); + if (typed_test_info == nullptr) { + typed_test_info = new ParameterizedTestSuiteInfo( + test_suite_name, code_location); + test_suite_infos_.push_back(typed_test_info); } return typed_test_info; } void RegisterTests() { - for (TestCaseInfoContainer::iterator it = test_case_infos_.begin(); - it != test_case_infos_.end(); ++it) { - (*it)->RegisterTests(); + for (auto& test_suite_info : test_suite_infos_) { + test_suite_info->RegisterTests(); } } +// Legacy API is deprecated but still available +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + template + ParameterizedTestCaseInfo* GetTestCasePatternHolder( + const char* test_case_name, CodeLocation code_location) { + return GetTestSuitePatternHolder(test_case_name, code_location); + } + +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + + private: + using TestSuiteInfoContainer = ::std::vector; + + TestSuiteInfoContainer test_suite_infos_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestSuiteRegistry); +}; + +} // namespace internal + +// Forward declarations of ValuesIn(), which is implemented in +// include/gtest/gtest-param-test.h. +template +internal::ParamGenerator ValuesIn( + const Container& container); + +namespace internal { +// Used in the Values() function to provide polymorphic capabilities. + +template +class ValueArray { + public: + ValueArray(Ts... v) : v_{std::move(v)...} {} + + template + operator ParamGenerator() const { // NOLINT + return ValuesIn(MakeVector(MakeIndexSequence())); + } private: - typedef ::std::vector TestCaseInfoContainer; + template + std::vector MakeVector(IndexSequence) const { + return std::vector{static_cast(v_.template Get())...}; + } - TestCaseInfoContainer test_case_infos_; + FlatTuple v_; +}; - GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestCaseRegistry); +template +class CartesianProductGenerator + : public ParamGeneratorInterface<::std::tuple> { + public: + typedef ::std::tuple ParamType; + + CartesianProductGenerator(const std::tuple...>& g) + : generators_(g) {} + ~CartesianProductGenerator() override {} + + ParamIteratorInterface* Begin() const override { + return new Iterator(this, generators_, false); + } + ParamIteratorInterface* End() const override { + return new Iterator(this, generators_, true); + } + + private: + template + class IteratorImpl; + template + class IteratorImpl> + : public ParamIteratorInterface { + public: + IteratorImpl(const ParamGeneratorInterface* base, + const std::tuple...>& generators, bool is_end) + : base_(base), + begin_(std::get(generators).begin()...), + end_(std::get(generators).end()...), + current_(is_end ? end_ : begin_) { + ComputeCurrentValue(); + } + ~IteratorImpl() override {} + + const ParamGeneratorInterface* BaseGenerator() const override { + return base_; + } + // Advance should not be called on beyond-of-range iterators + // so no component iterators must be beyond end of range, either. + void Advance() override { + assert(!AtEnd()); + // Advance the last iterator. + ++std::get(current_); + // if that reaches end, propagate that up. + AdvanceIfEnd(); + ComputeCurrentValue(); + } + ParamIteratorInterface* Clone() const override { + return new IteratorImpl(*this); + } + + const ParamType* Current() const override { return current_value_.get(); } + + bool Equals(const ParamIteratorInterface& other) const override { + // Having the same base generator guarantees that the other + // iterator is of the same type and we can downcast. + GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) + << "The program attempted to compare iterators " + << "from different generators." << std::endl; + const IteratorImpl* typed_other = + CheckedDowncastToActualType(&other); + + // We must report iterators equal if they both point beyond their + // respective ranges. That can happen in a variety of fashions, + // so we have to consult AtEnd(). + if (AtEnd() && typed_other->AtEnd()) return true; + + bool same = true; + bool dummy[] = { + (same = same && std::get(current_) == + std::get(typed_other->current_))...}; + (void)dummy; + return same; + } + + private: + template + void AdvanceIfEnd() { + if (std::get(current_) != std::get(end_)) return; + + bool last = ThisI == 0; + if (last) { + // We are done. Nothing else to propagate. + return; + } + + constexpr size_t NextI = ThisI - (ThisI != 0); + std::get(current_) = std::get(begin_); + ++std::get(current_); + AdvanceIfEnd(); + } + + void ComputeCurrentValue() { + if (!AtEnd()) + current_value_ = std::make_shared(*std::get(current_)...); + } + bool AtEnd() const { + bool at_end = false; + bool dummy[] = { + (at_end = at_end || std::get(current_) == std::get(end_))...}; + (void)dummy; + return at_end; + } + + const ParamGeneratorInterface* const base_; + std::tuple::iterator...> begin_; + std::tuple::iterator...> end_; + std::tuple::iterator...> current_; + std::shared_ptr current_value_; + }; + + using Iterator = IteratorImpl::type>; + + std::tuple...> generators_; +}; + +template +class CartesianProductHolder { + public: + CartesianProductHolder(const Gen&... g) : generators_(g...) {} + template + operator ParamGenerator<::std::tuple>() const { + return ParamGenerator<::std::tuple>( + new CartesianProductGenerator(generators_)); + } + + private: + std::tuple generators_; }; } // namespace internal diff --git a/third_party/googletest/src/include/gtest/internal/gtest-port-arch.h b/third_party/googletest/src/include/gtest/internal/gtest-port-arch.h index f83700e06d..cece93dba1 100644 --- a/third_party/googletest/src/include/gtest/internal/gtest-port-arch.h +++ b/third_party/googletest/src/include/gtest/internal/gtest-port-arch.h @@ -38,14 +38,13 @@ // Determines the platform on which Google Test is compiled. #ifdef __CYGWIN__ # define GTEST_OS_CYGWIN 1 -#elif defined __SYMBIAN32__ -# define GTEST_OS_SYMBIAN 1 +# elif defined(__MINGW__) || defined(__MINGW32__) || defined(__MINGW64__) +# define GTEST_OS_WINDOWS_MINGW 1 +# define GTEST_OS_WINDOWS 1 #elif defined _WIN32 # define GTEST_OS_WINDOWS 1 # ifdef _WIN32_WCE # define GTEST_OS_WINDOWS_MOBILE 1 -# elif defined(__MINGW__) || defined(__MINGW32__) -# define GTEST_OS_WINDOWS_MINGW 1 # elif defined(WINAPI_FAMILY) # include # if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP) @@ -65,15 +64,21 @@ # else # define GTEST_OS_WINDOWS_DESKTOP 1 # endif // _WIN32_WCE +#elif defined __OS2__ +# define GTEST_OS_OS2 1 #elif defined __APPLE__ # define GTEST_OS_MAC 1 # if TARGET_OS_IPHONE # define GTEST_OS_IOS 1 # endif +#elif defined __DragonFly__ +# define GTEST_OS_DRAGONFLY 1 #elif defined __FreeBSD__ # define GTEST_OS_FREEBSD 1 #elif defined __Fuchsia__ # define GTEST_OS_FUCHSIA 1 +#elif defined(__GLIBC__) && defined(__FreeBSD_kernel__) +# define GTEST_OS_GNU_KFREEBSD 1 #elif defined __linux__ # define GTEST_OS_LINUX 1 # if defined __ANDROID__ @@ -95,6 +100,8 @@ # define GTEST_OS_OPENBSD 1 #elif defined __QNX__ # define GTEST_OS_QNX 1 +#elif defined(__HAIKU__) +#define GTEST_OS_HAIKU 1 #endif // __CYGWIN__ #endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_ARCH_H_ diff --git a/third_party/googletest/src/include/gtest/internal/gtest-port.h b/third_party/googletest/src/include/gtest/internal/gtest-port.h index 786497d854..063fcb1083 100644 --- a/third_party/googletest/src/include/gtest/internal/gtest-port.h +++ b/third_party/googletest/src/include/gtest/internal/gtest-port.h @@ -72,10 +72,6 @@ // is/isn't available. // GTEST_HAS_EXCEPTIONS - Define it to 1/0 to indicate that exceptions // are enabled. -// GTEST_HAS_GLOBAL_STRING - Define it to 1/0 to indicate that ::string -// is/isn't available -// GTEST_HAS_GLOBAL_WSTRING - Define it to 1/0 to indicate that ::wstring -// is/isn't available // GTEST_HAS_POSIX_RE - Define it to 1/0 to indicate that POSIX regular // expressions are/aren't available. // GTEST_HAS_PTHREAD - Define it to 1/0 to indicate that @@ -85,8 +81,6 @@ // GTEST_HAS_STD_WSTRING - Define it to 1/0 to indicate that // std::wstring does/doesn't work (Google Test can // be used where std::wstring is unavailable). -// GTEST_HAS_TR1_TUPLE - Define it to 1/0 to indicate tr1::tuple -// is/isn't available. // GTEST_HAS_SEH - Define it to 1/0 to indicate whether the // compiler supports Microsoft's "Structured // Exception Handling". @@ -94,12 +88,6 @@ // - Define it to 1/0 to indicate whether the // platform supports I/O stream redirection using // dup() and dup2(). -// GTEST_USE_OWN_TR1_TUPLE - Define it to 1/0 to indicate whether Google -// Test's own tr1 tuple implementation should be -// used. Unused when the user sets -// GTEST_HAS_TR1_TUPLE to 0. -// GTEST_LANG_CXX11 - Define it to 1/0 to indicate that Google Test -// is building in C++11/C++98 mode. // GTEST_LINKED_AS_SHARED_LIBRARY // - Define to 1 when compiling tests that use // Google Test as a shared library (known as @@ -125,8 +113,11 @@ // // GTEST_OS_AIX - IBM AIX // GTEST_OS_CYGWIN - Cygwin +// GTEST_OS_DRAGONFLY - DragonFlyBSD // GTEST_OS_FREEBSD - FreeBSD // GTEST_OS_FUCHSIA - Fuchsia +// GTEST_OS_GNU_KFREEBSD - GNU/kFreeBSD +// GTEST_OS_HAIKU - Haiku // GTEST_OS_HPUX - HP-UX // GTEST_OS_LINUX - Linux // GTEST_OS_LINUX_ANDROID - Google Android @@ -135,9 +126,9 @@ // GTEST_OS_NACL - Google Native Client (NaCl) // GTEST_OS_NETBSD - NetBSD // GTEST_OS_OPENBSD - OpenBSD +// GTEST_OS_OS2 - OS/2 // GTEST_OS_QNX - QNX // GTEST_OS_SOLARIS - Sun Solaris -// GTEST_OS_SYMBIAN - Symbian // GTEST_OS_WINDOWS - Windows (Desktop, MinGW, or Mobile) // GTEST_OS_WINDOWS_DESKTOP - Windows Desktop // GTEST_OS_WINDOWS_MINGW - MinGW @@ -146,7 +137,7 @@ // GTEST_OS_WINDOWS_RT - Windows Store App/WinRT // GTEST_OS_ZOS - z/OS // -// Among the platforms, Cygwin, Linux, Max OS X, and Windows have the +// Among the platforms, Cygwin, Linux, Mac OS X, and Windows have the // most stable support. Since core members of the Google Test project // don't have access to other platforms, support for them may be less // stable. If you notice any problems on your platform, please notify @@ -172,8 +163,6 @@ // EXPECT_DEATH(DoSomethingDeadly()); // #endif // -// GTEST_HAS_COMBINE - the Combine() function (for value-parameterized -// tests) // GTEST_HAS_DEATH_TEST - death tests // GTEST_HAS_TYPED_TEST - typed tests // GTEST_HAS_TYPED_TEST_P - type-parameterized tests @@ -184,7 +173,6 @@ // define themselves. // GTEST_USES_SIMPLE_RE - our own simple regex is used; // the above RE\b(s) are mutually exclusive. -// GTEST_CAN_COMPARE_NULL - accepts untyped NULL in EXPECT_EQ(). // Misc public macros // ------------------ @@ -210,23 +198,10 @@ // GTEST_INTENTIONAL_CONST_COND_POP_ - finish code section where MSVC C4127 // is suppressed. // -// C++11 feature wrappers: -// -// testing::internal::forward - portability wrapper for std::forward. -// testing::internal::move - portability wrapper for std::move. -// // Synchronization: // Mutex, MutexLock, ThreadLocal, GetThreadCount() // - synchronization primitives. // -// Template meta programming: -// is_pointer - as in TR1; needed on Symbian and IBM XL C/C++ only. -// IteratorTraits - partial implementation of std::iterator_traits, which -// is not available in libCstd when compiled with Sun C++. -// -// Smart pointers: -// scoped_ptr - as in TR2. -// // Regular expressions: // RE - a simple regular expression class using the POSIX // Extended Regular Expression syntax on UNIX-like platforms @@ -262,12 +237,20 @@ // BoolFromGTestEnv() - parses a bool environment variable. // Int32FromGTestEnv() - parses an Int32 environment variable. // StringFromGTestEnv() - parses a string environment variable. +// +// Deprecation warnings: +// GTEST_INTERNAL_DEPRECATED(message) - attribute marking a function as +// deprecated; calling a marked function +// should generate a compiler warning #include // for isspace, etc #include // for ptrdiff_t -#include #include +#include #include +#include +#include + #ifndef _WIN32_WCE # include # include @@ -278,12 +261,11 @@ # include #endif -// Brings in the definition of HAS_GLOBAL_STRING. This must be done -// BEFORE we test HAS_GLOBAL_STRING. -#include // NOLINT #include // NOLINT -#include // NOLINT -#include // NOLINT +#include // NOLINT +#include // NOLINT +#include // NOLINT +#include #include #include // NOLINT @@ -315,14 +297,14 @@ // GTEST_DISABLE_MSC_WARNINGS_PUSH_(4800 4385) // /* code that triggers warnings C4800 and C4385 */ // GTEST_DISABLE_MSC_WARNINGS_POP_() -#if _MSC_VER >= 1400 +#if defined(_MSC_VER) # define GTEST_DISABLE_MSC_WARNINGS_PUSH_(warnings) \ __pragma(warning(push)) \ __pragma(warning(disable: warnings)) # define GTEST_DISABLE_MSC_WARNINGS_POP_() \ __pragma(warning(pop)) #else -// Older versions of MSVC don't have __pragma. +// Not all compilers are MSVC # define GTEST_DISABLE_MSC_WARNINGS_PUSH_(warnings) # define GTEST_DISABLE_MSC_WARNINGS_POP_() #endif @@ -343,80 +325,6 @@ GTEST_DISABLE_MSC_WARNINGS_POP_() #endif -#ifndef GTEST_LANG_CXX11 -// gcc and clang define __GXX_EXPERIMENTAL_CXX0X__ when -// -std={c,gnu}++{0x,11} is passed. The C++11 standard specifies a -// value for __cplusplus, and recent versions of clang, gcc, and -// probably other compilers set that too in C++11 mode. -# if __GXX_EXPERIMENTAL_CXX0X__ || __cplusplus >= 201103L || _MSC_VER >= 1900 -// Compiling in at least C++11 mode. -# define GTEST_LANG_CXX11 1 -# else -# define GTEST_LANG_CXX11 0 -# endif -#endif - -// Distinct from C++11 language support, some environments don't provide -// proper C++11 library support. Notably, it's possible to build in -// C++11 mode when targeting Mac OS X 10.6, which has an old libstdc++ -// with no C++11 support. -// -// libstdc++ has sufficient C++11 support as of GCC 4.6.0, __GLIBCXX__ -// 20110325, but maintenance releases in the 4.4 and 4.5 series followed -// this date, so check for those versions by their date stamps. -// https://gcc.gnu.org/onlinedocs/libstdc++/manual/abi.html#abi.versioning -#if GTEST_LANG_CXX11 && \ - (!defined(__GLIBCXX__) || ( \ - __GLIBCXX__ >= 20110325ul && /* GCC >= 4.6.0 */ \ - /* Blacklist of patch releases of older branches: */ \ - __GLIBCXX__ != 20110416ul && /* GCC 4.4.6 */ \ - __GLIBCXX__ != 20120313ul && /* GCC 4.4.7 */ \ - __GLIBCXX__ != 20110428ul && /* GCC 4.5.3 */ \ - __GLIBCXX__ != 20120702ul)) /* GCC 4.5.4 */ -# define GTEST_STDLIB_CXX11 1 -#endif - -// Only use C++11 library features if the library provides them. -#if GTEST_STDLIB_CXX11 -# define GTEST_HAS_STD_BEGIN_AND_END_ 1 -# define GTEST_HAS_STD_FORWARD_LIST_ 1 -# if !defined(_MSC_VER) || (_MSC_FULL_VER >= 190023824) -// works only with VS2015U2 and better -# define GTEST_HAS_STD_FUNCTION_ 1 -# endif -# define GTEST_HAS_STD_INITIALIZER_LIST_ 1 -# define GTEST_HAS_STD_MOVE_ 1 -# define GTEST_HAS_STD_UNIQUE_PTR_ 1 -# define GTEST_HAS_STD_SHARED_PTR_ 1 -# define GTEST_HAS_UNORDERED_MAP_ 1 -# define GTEST_HAS_UNORDERED_SET_ 1 -#endif - -// C++11 specifies that provides std::tuple. -// Some platforms still might not have it, however. -#if GTEST_LANG_CXX11 -# define GTEST_HAS_STD_TUPLE_ 1 -# if defined(__clang__) -// Inspired by -// https://clang.llvm.org/docs/LanguageExtensions.html#include-file-checking-macros -# if defined(__has_include) && !__has_include() -# undef GTEST_HAS_STD_TUPLE_ -# endif -# elif defined(_MSC_VER) -// Inspired by boost/config/stdlib/dinkumware.hpp -# if defined(_CPPLIB_VER) && _CPPLIB_VER < 520 -# undef GTEST_HAS_STD_TUPLE_ -# endif -# elif defined(__GLIBCXX__) -// Inspired by boost/config/stdlib/libstdcpp3.hpp, -// http://gcc.gnu.org/gcc-4.2/changes.html and -// https://web.archive.org/web/20140227044429/gcc.gnu.org/onlinedocs/libstdc++/manual/bk01pt01ch01.html#manual.intro.status.standard.200x -# if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 2) -# undef GTEST_HAS_STD_TUPLE_ -# endif -# endif -#endif - // Brings in definitions for functions used in the testing::internal::posix // namespace (read, write, close, chdir, isatty, stat). We do not currently // use them on Windows Mobile. @@ -449,7 +357,8 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION; # include // NOLINT #endif -// Defines this to true iff Google Test can use POSIX regular expressions. +// Defines this to true if and only if Google Test can use POSIX regular +// expressions. #ifndef GTEST_HAS_POSIX_RE # if GTEST_OS_LINUX_ANDROID // On Android, is only available starting with Gingerbread. @@ -490,7 +399,7 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION; // The user didn't tell us whether exceptions are enabled, so we need // to figure it out. # if defined(_MSC_VER) && defined(_CPPUNWIND) -// MSVC defines _CPPUNWIND to 1 iff exceptions are enabled. +// MSVC defines _CPPUNWIND to 1 if and only if exceptions are enabled. # define GTEST_HAS_EXCEPTIONS 1 # elif defined(__BORLANDC__) // C++Builder's implementation of the STL uses the _HAS_EXCEPTIONS @@ -501,16 +410,17 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION; # endif // _HAS_EXCEPTIONS # define GTEST_HAS_EXCEPTIONS _HAS_EXCEPTIONS # elif defined(__clang__) -// clang defines __EXCEPTIONS iff exceptions are enabled before clang 220714, -// but iff cleanups are enabled after that. In Obj-C++ files, there can be -// cleanups for ObjC exceptions which also need cleanups, even if C++ exceptions -// are disabled. clang has __has_feature(cxx_exceptions) which checks for C++ -// exceptions starting at clang r206352, but which checked for cleanups prior to -// that. To reliably check for C++ exception availability with clang, check for +// clang defines __EXCEPTIONS if and only if exceptions are enabled before clang +// 220714, but if and only if cleanups are enabled after that. In Obj-C++ files, +// there can be cleanups for ObjC exceptions which also need cleanups, even if +// C++ exceptions are disabled. clang has __has_feature(cxx_exceptions) which +// checks for C++ exceptions starting at clang r206352, but which checked for +// cleanups prior to that. To reliably check for C++ exception availability with +// clang, check for // __EXCEPTIONS && __has_feature(cxx_exceptions). # define GTEST_HAS_EXCEPTIONS (__EXCEPTIONS && __has_feature(cxx_exceptions)) # elif defined(__GNUC__) && __EXCEPTIONS -// gcc defines __EXCEPTIONS to 1 iff exceptions are enabled. +// gcc defines __EXCEPTIONS to 1 if and only if exceptions are enabled. # define GTEST_HAS_EXCEPTIONS 1 # elif defined(__SUNPRO_CC) // Sun Pro CC supports exceptions. However, there is no compile-time way of @@ -518,7 +428,7 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION; // they are enabled unless the user tells us otherwise. # define GTEST_HAS_EXCEPTIONS 1 # elif defined(__IBMCPP__) && __EXCEPTIONS -// xlC defines __EXCEPTIONS to 1 iff exceptions are enabled. +// xlC defines __EXCEPTIONS to 1 if and only if exceptions are enabled. # define GTEST_HAS_EXCEPTIONS 1 # elif defined(__HP_aCC) // Exception handling is in effect by default in HP aCC compiler. It has to @@ -540,31 +450,18 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION; # error "::std::string isn't available." #endif // !defined(GTEST_HAS_STD_STRING) -#ifndef GTEST_HAS_GLOBAL_STRING -# define GTEST_HAS_GLOBAL_STRING 0 -#endif // GTEST_HAS_GLOBAL_STRING - #ifndef GTEST_HAS_STD_WSTRING // The user didn't tell us whether ::std::wstring is available, so we need // to figure it out. -// FIXME: uses autoconf to detect whether ::std::wstring -// is available. - // Cygwin 1.7 and below doesn't support ::std::wstring. // Solaris' libc++ doesn't support it either. Android has // no support for it at least as recent as Froyo (2.2). -# define GTEST_HAS_STD_WSTRING \ - (!(GTEST_OS_LINUX_ANDROID || GTEST_OS_CYGWIN || GTEST_OS_SOLARIS)) +#define GTEST_HAS_STD_WSTRING \ + (!(GTEST_OS_LINUX_ANDROID || GTEST_OS_CYGWIN || GTEST_OS_SOLARIS || \ + GTEST_OS_HAIKU)) #endif // GTEST_HAS_STD_WSTRING -#ifndef GTEST_HAS_GLOBAL_WSTRING -// The user didn't tell us whether ::wstring is available, so we need -// to figure it out. -# define GTEST_HAS_GLOBAL_WSTRING \ - (GTEST_HAS_STD_WSTRING && GTEST_HAS_GLOBAL_STRING) -#endif // GTEST_HAS_GLOBAL_WSTRING - // Determines whether RTTI is available. #ifndef GTEST_HAS_RTTI // The user didn't tell us whether RTTI is enabled, so we need to @@ -572,14 +469,15 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION; # ifdef _MSC_VER -# ifdef _CPPRTTI // MSVC defines this macro iff RTTI is enabled. +#ifdef _CPPRTTI // MSVC defines this macro if and only if RTTI is enabled. # define GTEST_HAS_RTTI 1 # else # define GTEST_HAS_RTTI 0 # endif -// Starting with version 4.3.2, gcc defines __GXX_RTTI iff RTTI is enabled. -# elif defined(__GNUC__) && (GTEST_GCC_VER_ >= 40302) +// Starting with version 4.3.2, gcc defines __GXX_RTTI if and only if RTTI is +// enabled. +# elif defined(__GNUC__) # ifdef __GXX_RTTI // When building against STLport with the Android NDK and with @@ -635,9 +533,11 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION; // // To disable threading support in Google Test, add -DGTEST_HAS_PTHREAD=0 // to your compiler flags. -#define GTEST_HAS_PTHREAD \ - (GTEST_OS_LINUX || GTEST_OS_MAC || GTEST_OS_HPUX || GTEST_OS_QNX || \ - GTEST_OS_FREEBSD || GTEST_OS_NACL || GTEST_OS_NETBSD || GTEST_OS_FUCHSIA) +#define GTEST_HAS_PTHREAD \ + (GTEST_OS_LINUX || GTEST_OS_MAC || GTEST_OS_HPUX || GTEST_OS_QNX || \ + GTEST_OS_FREEBSD || GTEST_OS_NACL || GTEST_OS_NETBSD || GTEST_OS_FUCHSIA || \ + GTEST_OS_DRAGONFLY || GTEST_OS_GNU_KFREEBSD || GTEST_OS_OPENBSD || \ + GTEST_OS_HAIKU) #endif // GTEST_HAS_PTHREAD #if GTEST_HAS_PTHREAD @@ -649,136 +549,6 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION; # include // NOLINT #endif -// Determines if hash_map/hash_set are available. -// Only used for testing against those containers. -#if !defined(GTEST_HAS_HASH_MAP_) -# if defined(_MSC_VER) && (_MSC_VER < 1900) -# define GTEST_HAS_HASH_MAP_ 1 // Indicates that hash_map is available. -# define GTEST_HAS_HASH_SET_ 1 // Indicates that hash_set is available. -# endif // _MSC_VER -#endif // !defined(GTEST_HAS_HASH_MAP_) - -// Determines whether Google Test can use tr1/tuple. You can define -// this macro to 0 to prevent Google Test from using tuple (any -// feature depending on tuple with be disabled in this mode). -#ifndef GTEST_HAS_TR1_TUPLE -# if GTEST_OS_LINUX_ANDROID && defined(_STLPORT_MAJOR) -// STLport, provided with the Android NDK, has neither or . -# define GTEST_HAS_TR1_TUPLE 0 -# elif defined(_MSC_VER) && (_MSC_VER >= 1910) -// Prevent `warning C4996: 'std::tr1': warning STL4002: -// The non-Standard std::tr1 namespace and TR1-only machinery -// are deprecated and will be REMOVED.` -# define GTEST_HAS_TR1_TUPLE 0 -# elif GTEST_LANG_CXX11 && defined(_LIBCPP_VERSION) -// libc++ doesn't support TR1. -# define GTEST_HAS_TR1_TUPLE 0 -# else -// The user didn't tell us not to do it, so we assume it's OK. -# define GTEST_HAS_TR1_TUPLE 1 -# endif -#endif // GTEST_HAS_TR1_TUPLE - -// Determines whether Google Test's own tr1 tuple implementation -// should be used. -#ifndef GTEST_USE_OWN_TR1_TUPLE -// We use our own tuple implementation on Symbian. -# if GTEST_OS_SYMBIAN -# define GTEST_USE_OWN_TR1_TUPLE 1 -# else -// The user didn't tell us, so we need to figure it out. - -// We use our own TR1 tuple if we aren't sure the user has an -// implementation of it already. At this time, libstdc++ 4.0.0+ and -// MSVC 2010 are the only mainstream standard libraries that come -// with a TR1 tuple implementation. NVIDIA's CUDA NVCC compiler -// pretends to be GCC by defining __GNUC__ and friends, but cannot -// compile GCC's tuple implementation. MSVC 2008 (9.0) provides TR1 -// tuple in a 323 MB Feature Pack download, which we cannot assume the -// user has. QNX's QCC compiler is a modified GCC but it doesn't -// support TR1 tuple. libc++ only provides std::tuple, in C++11 mode, -// and it can be used with some compilers that define __GNUC__. -# if (defined(__GNUC__) && !defined(__CUDACC__) && (GTEST_GCC_VER_ >= 40000) \ - && !GTEST_OS_QNX && !defined(_LIBCPP_VERSION)) \ - || (_MSC_VER >= 1600 && _MSC_VER < 1900) -# define GTEST_ENV_HAS_TR1_TUPLE_ 1 -# endif - -// C++11 specifies that provides std::tuple. Use that if gtest is used -// in C++11 mode and libstdc++ isn't very old (binaries targeting OS X 10.6 -// can build with clang but need to use gcc4.2's libstdc++). -# if GTEST_LANG_CXX11 && (!defined(__GLIBCXX__) || __GLIBCXX__ > 20110325) -# define GTEST_ENV_HAS_STD_TUPLE_ 1 -# endif - -# if GTEST_ENV_HAS_TR1_TUPLE_ || GTEST_ENV_HAS_STD_TUPLE_ -# define GTEST_USE_OWN_TR1_TUPLE 0 -# else -# define GTEST_USE_OWN_TR1_TUPLE 1 -# endif -# endif // GTEST_OS_SYMBIAN -#endif // GTEST_USE_OWN_TR1_TUPLE - -// To avoid conditional compilation we make it gtest-port.h's responsibility -// to #include the header implementing tuple. -#if GTEST_HAS_STD_TUPLE_ -# include // IWYU pragma: export -# define GTEST_TUPLE_NAMESPACE_ ::std -#endif // GTEST_HAS_STD_TUPLE_ - -// We include tr1::tuple even if std::tuple is available to define printers for -// them. -#if GTEST_HAS_TR1_TUPLE -# ifndef GTEST_TUPLE_NAMESPACE_ -# define GTEST_TUPLE_NAMESPACE_ ::std::tr1 -# endif // GTEST_TUPLE_NAMESPACE_ - -# if GTEST_USE_OWN_TR1_TUPLE -# include "gtest/internal/gtest-tuple.h" // IWYU pragma: export // NOLINT -# elif GTEST_OS_SYMBIAN - -// On Symbian, BOOST_HAS_TR1_TUPLE causes Boost's TR1 tuple library to -// use STLport's tuple implementation, which unfortunately doesn't -// work as the copy of STLport distributed with Symbian is incomplete. -// By making sure BOOST_HAS_TR1_TUPLE is undefined, we force Boost to -// use its own tuple implementation. -# ifdef BOOST_HAS_TR1_TUPLE -# undef BOOST_HAS_TR1_TUPLE -# endif // BOOST_HAS_TR1_TUPLE - -// This prevents , which defines -// BOOST_HAS_TR1_TUPLE, from being #included by Boost's . -# define BOOST_TR1_DETAIL_CONFIG_HPP_INCLUDED -# include // IWYU pragma: export // NOLINT - -# elif defined(__GNUC__) && (GTEST_GCC_VER_ >= 40000) -// GCC 4.0+ implements tr1/tuple in the header. This does -// not conform to the TR1 spec, which requires the header to be . - -# if !GTEST_HAS_RTTI && GTEST_GCC_VER_ < 40302 -// Until version 4.3.2, gcc has a bug that causes , -// which is #included by , to not compile when RTTI is -// disabled. _TR1_FUNCTIONAL is the header guard for -// . Hence the following #define is used to prevent -// from being included. -# define _TR1_FUNCTIONAL 1 -# include -# undef _TR1_FUNCTIONAL // Allows the user to #include - // if they choose to. -# else -# include // NOLINT -# endif // !GTEST_HAS_RTTI && GTEST_GCC_VER_ < 40302 - -// VS 2010 now has tr1 support. -# elif _MSC_VER >= 1600 -# include // IWYU pragma: export // NOLINT - -# else // GTEST_USE_OWN_TR1_TUPLE -# include // IWYU pragma: export // NOLINT -# endif // GTEST_USE_OWN_TR1_TUPLE - -#endif // GTEST_HAS_TR1_TUPLE - // Determines whether clone(2) is supported. // Usually it will only be available on Linux, excluding // Linux on the Itanium architecture. @@ -812,24 +582,21 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION; #ifndef GTEST_HAS_STREAM_REDIRECTION // By default, we assume that stream redirection is supported on all // platforms except known mobile ones. -# if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_SYMBIAN || \ - GTEST_OS_WINDOWS_PHONE || GTEST_OS_WINDOWS_RT +# if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_PHONE || GTEST_OS_WINDOWS_RT # define GTEST_HAS_STREAM_REDIRECTION 0 # else # define GTEST_HAS_STREAM_REDIRECTION 1 -# endif // !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_SYMBIAN +# endif // !GTEST_OS_WINDOWS_MOBILE #endif // GTEST_HAS_STREAM_REDIRECTION // Determines whether to support death tests. -// Google Test does not support death tests for VC 7.1 and earlier as -// abort() in a VC 7.1 application compiled as GUI in debug config // pops up a dialog window that cannot be suppressed programmatically. -#if (GTEST_OS_LINUX || GTEST_OS_CYGWIN || GTEST_OS_SOLARIS || \ - (GTEST_OS_MAC && !GTEST_OS_IOS) || \ - (GTEST_OS_WINDOWS_DESKTOP && _MSC_VER >= 1400) || \ - GTEST_OS_WINDOWS_MINGW || GTEST_OS_AIX || GTEST_OS_HPUX || \ - GTEST_OS_OPENBSD || GTEST_OS_QNX || GTEST_OS_FREEBSD || \ - GTEST_OS_NETBSD || GTEST_OS_FUCHSIA) +#if (GTEST_OS_LINUX || GTEST_OS_CYGWIN || GTEST_OS_SOLARIS || \ + (GTEST_OS_MAC && !GTEST_OS_IOS) || \ + (GTEST_OS_WINDOWS_DESKTOP && _MSC_VER) || GTEST_OS_WINDOWS_MINGW || \ + GTEST_OS_AIX || GTEST_OS_HPUX || GTEST_OS_OPENBSD || GTEST_OS_QNX || \ + GTEST_OS_FREEBSD || GTEST_OS_NETBSD || GTEST_OS_FUCHSIA || \ + GTEST_OS_DRAGONFLY || GTEST_OS_GNU_KFREEBSD || GTEST_OS_HAIKU) # define GTEST_HAS_DEATH_TEST 1 #endif @@ -837,26 +604,19 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION; // Typed tests need and variadic macros, which GCC, VC++ 8.0, // Sun Pro CC, IBM Visual Age, and HP aCC support. -#if defined(__GNUC__) || (_MSC_VER >= 1400) || defined(__SUNPRO_CC) || \ +#if defined(__GNUC__) || defined(_MSC_VER) || defined(__SUNPRO_CC) || \ defined(__IBMCPP__) || defined(__HP_aCC) # define GTEST_HAS_TYPED_TEST 1 # define GTEST_HAS_TYPED_TEST_P 1 #endif -// Determines whether to support Combine(). This only makes sense when -// value-parameterized tests are enabled. The implementation doesn't -// work on Sun Studio since it doesn't understand templated conversion -// operators. -#if (GTEST_HAS_TR1_TUPLE || GTEST_HAS_STD_TUPLE_) && !defined(__SUNPRO_CC) -# define GTEST_HAS_COMBINE 1 -#endif - // Determines whether the system compiler uses UTF-16 for encoding wide strings. #define GTEST_WIDE_STRING_USES_UTF16_ \ - (GTEST_OS_WINDOWS || GTEST_OS_CYGWIN || GTEST_OS_SYMBIAN || GTEST_OS_AIX) + (GTEST_OS_WINDOWS || GTEST_OS_CYGWIN || GTEST_OS_AIX || GTEST_OS_OS2) // Determines whether test results can be streamed to a socket. -#if GTEST_OS_LINUX +#if GTEST_OS_LINUX || GTEST_OS_GNU_KFREEBSD || GTEST_OS_DRAGONFLY || \ + GTEST_OS_FREEBSD || GTEST_OS_NETBSD || GTEST_OS_OPENBSD # define GTEST_CAN_STREAM_RESULTS_ 1 #endif @@ -898,12 +658,6 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION; # define GTEST_ATTRIBUTE_UNUSED_ #endif -#if GTEST_LANG_CXX11 -# define GTEST_CXX11_EQUALS_DELETE_ = delete -#else // GTEST_LANG_CXX11 -# define GTEST_CXX11_EQUALS_DELETE_ -#endif // GTEST_LANG_CXX11 - // Use this annotation before a function that takes a printf format string. #if (defined(__GNUC__) || defined(__clang__)) && !defined(COMPILER_ICC) # if defined(__MINGW_PRINTF_FORMAT) @@ -925,12 +679,12 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION; // A macro to disallow operator= // This should be used in the private: declarations for a class. #define GTEST_DISALLOW_ASSIGN_(type) \ - void operator=(type const &) GTEST_CXX11_EQUALS_DELETE_ + void operator=(type const &) = delete // A macro to disallow copy constructor and operator= // This should be used in the private: declarations for a class. #define GTEST_DISALLOW_COPY_AND_ASSIGN_(type) \ - type(type const &) GTEST_CXX11_EQUALS_DELETE_; \ + type(type const &) = delete; \ GTEST_DISALLOW_ASSIGN_(type) // Tell the compiler to warn about unused return values for functions declared @@ -938,11 +692,11 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION; // following the argument list: // // Sprocket* AllocateSprocket() GTEST_MUST_USE_RESULT_; -#if defined(__GNUC__) && (GTEST_GCC_VER_ >= 30400) && !defined(COMPILER_ICC) +#if defined(__GNUC__) && !defined(COMPILER_ICC) # define GTEST_MUST_USE_RESULT_ __attribute__ ((warn_unused_result)) #else # define GTEST_MUST_USE_RESULT_ -#endif // __GNUC__ && (GTEST_GCC_VER_ >= 30400) && !COMPILER_ICC +#endif // __GNUC__ && !COMPILER_ICC // MS C++ compiler emits warning when a conditional expression is compile time // constant. In some contexts this warning is false positive and needs to be @@ -971,13 +725,17 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION; # define GTEST_HAS_SEH 0 # endif -#define GTEST_IS_THREADSAFE \ - (GTEST_HAS_MUTEX_AND_THREAD_LOCAL_ \ - || (GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT) \ - || GTEST_HAS_PTHREAD) - #endif // GTEST_HAS_SEH +#ifndef GTEST_IS_THREADSAFE + +#define GTEST_IS_THREADSAFE \ + (GTEST_HAS_MUTEX_AND_THREAD_LOCAL_ || \ + (GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT) || \ + GTEST_HAS_PTHREAD) + +#endif // GTEST_IS_THREADSAFE + // GTEST_API_ qualifies all symbols that must be exported. The definitions below // are guarded by #ifndef to give embedders a chance to define GTEST_API_ in // gtest/internal/custom/gtest-port.h @@ -1044,6 +802,18 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION; # define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ #endif // __clang__ +// A function level attribute to disable HWAddressSanitizer instrumentation. +#if defined(__clang__) +# if __has_feature(hwaddress_sanitizer) +# define GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_ \ + __attribute__((no_sanitize("hwaddress"))) +# else +# define GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_ +# endif // __has_feature(hwaddress_sanitizer) +#else +# define GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_ +#endif // __clang__ + // A function level attribute to disable ThreadSanitizer instrumentation. #if defined(__clang__) # if __has_feature(thread_sanitizer) @@ -1060,16 +830,13 @@ namespace testing { class Message; -#if defined(GTEST_TUPLE_NAMESPACE_) -// Import tuple and friends into the ::testing namespace. -// It is part of our interface, having them in ::testing allows us to change -// their types as needed. -using GTEST_TUPLE_NAMESPACE_::get; -using GTEST_TUPLE_NAMESPACE_::make_tuple; -using GTEST_TUPLE_NAMESPACE_::tuple; -using GTEST_TUPLE_NAMESPACE_::tuple_size; -using GTEST_TUPLE_NAMESPACE_::tuple_element; -#endif // defined(GTEST_TUPLE_NAMESPACE_) +// Legacy imports for backwards compatibility. +// New code should use std:: names directly. +using std::get; +using std::make_tuple; +using std::tuple; +using std::tuple_element; +using std::tuple_size; namespace internal { @@ -1078,158 +845,24 @@ namespace internal { // Secret object, which is what we want. class Secret; -// The GTEST_COMPILE_ASSERT_ macro can be used to verify that a compile time -// expression is true. For example, you could use it to verify the -// size of a static array: +// The GTEST_COMPILE_ASSERT_ is a legacy macro used to verify that a compile +// time expression is true (in new code, use static_assert instead). For +// example, you could use it to verify the size of a static array: // // GTEST_COMPILE_ASSERT_(GTEST_ARRAY_SIZE_(names) == NUM_NAMES, // names_incorrect_size); // -// or to make sure a struct is smaller than a certain size: -// -// GTEST_COMPILE_ASSERT_(sizeof(foo) < 128, foo_too_large); -// -// The second argument to the macro is the name of the variable. If -// the expression is false, most compilers will issue a warning/error -// containing the name of the variable. - -#if GTEST_LANG_CXX11 -# define GTEST_COMPILE_ASSERT_(expr, msg) static_assert(expr, #msg) -#else // !GTEST_LANG_CXX11 -template - struct CompileAssert { -}; - -# define GTEST_COMPILE_ASSERT_(expr, msg) \ - typedef ::testing::internal::CompileAssert<(static_cast(expr))> \ - msg[static_cast(expr) ? 1 : -1] GTEST_ATTRIBUTE_UNUSED_ -#endif // !GTEST_LANG_CXX11 - -// Implementation details of GTEST_COMPILE_ASSERT_: -// -// (In C++11, we simply use static_assert instead of the following) -// -// - GTEST_COMPILE_ASSERT_ works by defining an array type that has -1 -// elements (and thus is invalid) when the expression is false. -// -// - The simpler definition -// -// #define GTEST_COMPILE_ASSERT_(expr, msg) typedef char msg[(expr) ? 1 : -1] -// -// does not work, as gcc supports variable-length arrays whose sizes -// are determined at run-time (this is gcc's extension and not part -// of the C++ standard). As a result, gcc fails to reject the -// following code with the simple definition: -// -// int foo; -// GTEST_COMPILE_ASSERT_(foo, msg); // not supposed to compile as foo is -// // not a compile-time constant. -// -// - By using the type CompileAssert<(bool(expr))>, we ensures that -// expr is a compile-time constant. (Template arguments must be -// determined at compile-time.) -// -// - The outter parentheses in CompileAssert<(bool(expr))> are necessary -// to work around a bug in gcc 3.4.4 and 4.0.1. If we had written -// -// CompileAssert -// -// instead, these compilers will refuse to compile -// -// GTEST_COMPILE_ASSERT_(5 > 0, some_message); -// -// (They seem to think the ">" in "5 > 0" marks the end of the -// template argument list.) -// -// - The array size is (bool(expr) ? 1 : -1), instead of simply -// -// ((expr) ? 1 : -1). -// -// This is to avoid running into a bug in MS VC 7.1, which -// causes ((0.0) ? 1 : -1) to incorrectly evaluate to 1. - -// StaticAssertTypeEqHelper is used by StaticAssertTypeEq defined in gtest.h. -// -// This template is declared, but intentionally undefined. -template -struct StaticAssertTypeEqHelper; - -template -struct StaticAssertTypeEqHelper { - enum { value = true }; -}; - -// Same as std::is_same<>. -template -struct IsSame { - enum { value = false }; -}; -template -struct IsSame { - enum { value = true }; -}; +// The second argument to the macro must be a valid C++ identifier. If the +// expression is false, compiler will issue an error containing this identifier. +#define GTEST_COMPILE_ASSERT_(expr, msg) static_assert(expr, #msg) // Evaluates to the number of elements in 'array'. #define GTEST_ARRAY_SIZE_(array) (sizeof(array) / sizeof(array[0])) -#if GTEST_HAS_GLOBAL_STRING -typedef ::string string; -#else -typedef ::std::string string; -#endif // GTEST_HAS_GLOBAL_STRING - -#if GTEST_HAS_GLOBAL_WSTRING -typedef ::wstring wstring; -#elif GTEST_HAS_STD_WSTRING -typedef ::std::wstring wstring; -#endif // GTEST_HAS_GLOBAL_WSTRING - // A helper for suppressing warnings on constant condition. It just // returns 'condition'. GTEST_API_ bool IsTrue(bool condition); -// Defines scoped_ptr. - -// This implementation of scoped_ptr is PARTIAL - it only contains -// enough stuff to satisfy Google Test's need. -template -class scoped_ptr { - public: - typedef T element_type; - - explicit scoped_ptr(T* p = NULL) : ptr_(p) {} - ~scoped_ptr() { reset(); } - - T& operator*() const { return *ptr_; } - T* operator->() const { return ptr_; } - T* get() const { return ptr_; } - - T* release() { - T* const ptr = ptr_; - ptr_ = NULL; - return ptr; - } - - void reset(T* p = NULL) { - if (p != ptr_) { - if (IsTrue(sizeof(T) > 0)) { // Makes sure T is a complete type. - delete ptr_; - } - ptr_ = p; - } - } - - friend void swap(scoped_ptr& a, scoped_ptr& b) { - using std::swap; - swap(a.ptr_, b.ptr_); - } - - private: - T* ptr_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(scoped_ptr); -}; - // Defines RE. #if GTEST_USES_PCRE @@ -1247,25 +880,16 @@ class GTEST_API_ RE { // Constructs an RE from a string. RE(const ::std::string& regex) { Init(regex.c_str()); } // NOLINT -# if GTEST_HAS_GLOBAL_STRING - - RE(const ::string& regex) { Init(regex.c_str()); } // NOLINT - -# endif // GTEST_HAS_GLOBAL_STRING - RE(const char* regex) { Init(regex); } // NOLINT ~RE(); // Returns the string representation of the regex. const char* pattern() const { return pattern_; } - // FullMatch(str, re) returns true iff regular expression re matches - // the entire str. - // PartialMatch(str, re) returns true iff regular expression re + // FullMatch(str, re) returns true if and only if regular expression re + // matches the entire str. + // PartialMatch(str, re) returns true if and only if regular expression re // matches a substring of str (including str itself). - // - // FIXME: make FullMatch() and PartialMatch() work - // when str contains NUL characters. static bool FullMatch(const ::std::string& str, const RE& re) { return FullMatch(str.c_str(), re); } @@ -1273,26 +897,11 @@ class GTEST_API_ RE { return PartialMatch(str.c_str(), re); } -# if GTEST_HAS_GLOBAL_STRING - - static bool FullMatch(const ::string& str, const RE& re) { - return FullMatch(str.c_str(), re); - } - static bool PartialMatch(const ::string& str, const RE& re) { - return PartialMatch(str.c_str(), re); - } - -# endif // GTEST_HAS_GLOBAL_STRING - static bool FullMatch(const char* str, const RE& re); static bool PartialMatch(const char* str, const RE& re); private: void Init(const char* regex); - - // We use a const char* instead of an std::string, as Google Test used to be - // used where std::string is not available. FIXME: change to - // std::string. const char* pattern_; bool is_valid_; @@ -1360,7 +969,7 @@ class GTEST_API_ GTestLog { __FILE__, __LINE__).GetStream() inline void LogToStderr() {} -inline void FlushInfoLog() { fflush(NULL); } +inline void FlushInfoLog() { fflush(nullptr); } #endif // !defined(GTEST_LOG_) @@ -1397,19 +1006,6 @@ inline void FlushInfoLog() { fflush(NULL); } GTEST_LOG_(FATAL) << #posix_call << "failed with error " \ << gtest_error -// Adds reference to a type if it is not a reference type, -// otherwise leaves it unchanged. This is the same as -// tr1::add_reference, which is not widely available yet. -template -struct AddReference { typedef T& type; }; // NOLINT -template -struct AddReference { typedef T& type; }; // NOLINT - -// A handy wrapper around AddReference that works when the argument T -// depends on template parameters. -#define GTEST_ADD_REFERENCE_(T) \ - typename ::testing::internal::AddReference::type - // Transforms "T" into "const T&" according to standard reference collapsing // rules (this is only needed as a backport for C++98 compilers that do not // support reference collapsing). Specifically, it transforms: @@ -1430,28 +1026,6 @@ struct ConstRef { typedef T& type; }; #define GTEST_REFERENCE_TO_CONST_(T) \ typename ::testing::internal::ConstRef::type -#if GTEST_HAS_STD_MOVE_ -using std::forward; -using std::move; - -template -struct RvalueRef { - typedef T&& type; -}; -#else // GTEST_HAS_STD_MOVE_ -template -const T& move(const T& t) { - return t; -} -template -GTEST_ADD_REFERENCE_(T) forward(GTEST_ADD_REFERENCE_(T) t) { return t; } - -template -struct RvalueRef { - typedef const T& type; -}; -#endif // GTEST_HAS_STD_MOVE_ - // INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. // // Use ImplicitCast_ as a safe version of static_cast for upcasting in @@ -1505,13 +1079,13 @@ inline To DownCast_(From* f) { // so we only accept pointers GTEST_INTENTIONAL_CONST_COND_PUSH_() if (false) { GTEST_INTENTIONAL_CONST_COND_POP_() - const To to = NULL; - ::testing::internal::ImplicitCast_(to); + const To to = nullptr; + ::testing::internal::ImplicitCast_(to); } #if GTEST_HAS_RTTI // RTTI: debug mode only! - GTEST_CHECK_(f == NULL || dynamic_cast(f) != NULL); + GTEST_CHECK_(f == nullptr || dynamic_cast(f) != nullptr); #endif return static_cast(f); } @@ -1565,9 +1139,6 @@ std::vector GetInjectableArgvs(); // Deprecated: pass the args vector by value instead. void SetInjectableArgvs(const std::vector* new_argvs); void SetInjectableArgvs(const std::vector& new_argvs); -#if GTEST_HAS_GLOBAL_STRING -void SetInjectableArgvs(const std::vector< ::string>& new_argvs); -#endif // GTEST_HAS_GLOBAL_STRING void ClearInjectableArgvs(); #endif // GTEST_HAS_DEATH_TEST @@ -1583,7 +1154,7 @@ inline void SleepMilliseconds(int n) { 0, // 0 seconds. n * 1000L * 1000L, // And n ms. }; - nanosleep(&time, NULL); + nanosleep(&time, nullptr); } # endif // GTEST_HAS_PTHREAD @@ -1601,7 +1172,7 @@ inline void SleepMilliseconds(int n) { class Notification { public: Notification() : notified_(false) { - GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_init(&mutex_, NULL)); + GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_init(&mutex_, nullptr)); } ~Notification() { pthread_mutex_destroy(&mutex_); @@ -1659,7 +1230,8 @@ class GTEST_API_ AutoHandle { void Reset(Handle handle); private: - // Returns true iff the handle is a valid handle object that can be closed. + // Returns true if and only if the handle is a valid handle object that can be + // closed. bool IsCloseable() const; Handle handle_; @@ -1710,7 +1282,7 @@ class ThreadWithParamBase { // pass into pthread_create(). extern "C" inline void* ThreadFuncWithCLinkage(void* thread) { static_cast(thread)->Run(); - return NULL; + return nullptr; } // Helper class for testing Google Test's multi-threading constructs. @@ -1739,20 +1311,19 @@ class ThreadWithParam : public ThreadWithParamBase { // The thread can be created only after all fields except thread_ // have been initialized. GTEST_CHECK_POSIX_SUCCESS_( - pthread_create(&thread_, 0, &ThreadFuncWithCLinkage, base)); + pthread_create(&thread_, nullptr, &ThreadFuncWithCLinkage, base)); } - ~ThreadWithParam() { Join(); } + ~ThreadWithParam() override { Join(); } void Join() { if (!finished_) { - GTEST_CHECK_POSIX_SUCCESS_(pthread_join(thread_, 0)); + GTEST_CHECK_POSIX_SUCCESS_(pthread_join(thread_, nullptr)); finished_ = true; } } - virtual void Run() { - if (thread_can_start_ != NULL) - thread_can_start_->WaitForNotification(); + void Run() override { + if (thread_can_start_ != nullptr) thread_can_start_->WaitForNotification(); func_(param_); } @@ -1762,7 +1333,8 @@ class ThreadWithParam : public ThreadWithParamBase { // When non-NULL, used to block execution until the controller thread // notifies. Notification* const thread_can_start_; - bool finished_; // true iff we know that the thread function has finished. + bool finished_; // true if and only if we know that the thread function has + // finished. pthread_t thread_; // The native thread object. GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadWithParam); @@ -2046,7 +1618,7 @@ class ThreadLocal : public ThreadLocalBase { GTEST_DISALLOW_COPY_AND_ASSIGN_(InstanceValueHolderFactory); }; - scoped_ptr default_factory_; + std::unique_ptr default_factory_; GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadLocal); }; @@ -2115,7 +1687,7 @@ class MutexBase { class Mutex : public MutexBase { public: Mutex() { - GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_init(&mutex_, NULL)); + GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_init(&mutex_, nullptr)); has_owner_ = false; } ~Mutex() { @@ -2213,7 +1785,7 @@ class GTEST_API_ ThreadLocal { T* GetOrCreateValue() const { ThreadLocalValueHolderBase* const holder = static_cast(pthread_getspecific(key_)); - if (holder != NULL) { + if (holder != nullptr) { return CheckedDowncastToActualType(holder)->pointer(); } @@ -2257,7 +1829,7 @@ class GTEST_API_ ThreadLocal { // A key pthreads uses for looking up per-thread values. const pthread_key_t key_; - scoped_ptr default_factory_; + std::unique_ptr default_factory_; GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadLocal); }; @@ -2315,67 +1887,8 @@ class GTEST_API_ ThreadLocal { // we cannot detect it. GTEST_API_ size_t GetThreadCount(); -// Passing non-POD classes through ellipsis (...) crashes the ARM -// compiler and generates a warning in Sun Studio before 12u4. The Nokia Symbian -// and the IBM XL C/C++ compiler try to instantiate a copy constructor -// for objects passed through ellipsis (...), failing for uncopyable -// objects. We define this to ensure that only POD is passed through -// ellipsis on these systems. -#if defined(__SYMBIAN32__) || defined(__IBMCPP__) || \ - (defined(__SUNPRO_CC) && __SUNPRO_CC < 0x5130) -// We lose support for NULL detection where the compiler doesn't like -// passing non-POD classes through ellipsis (...). -# define GTEST_ELLIPSIS_NEEDS_POD_ 1 -#else -# define GTEST_CAN_COMPARE_NULL 1 -#endif - -// The Nokia Symbian and IBM XL C/C++ compilers cannot decide between -// const T& and const T* in a function template. These compilers -// _can_ decide between class template specializations for T and T*, -// so a tr1::type_traits-like is_pointer works. -#if defined(__SYMBIAN32__) || defined(__IBMCPP__) -# define GTEST_NEEDS_IS_POINTER_ 1 -#endif - -template -struct bool_constant { - typedef bool_constant type; - static const bool value = bool_value; -}; -template const bool bool_constant::value; - -typedef bool_constant false_type; -typedef bool_constant true_type; - -template -struct is_same : public false_type {}; - -template -struct is_same : public true_type {}; - - -template -struct is_pointer : public false_type {}; - -template -struct is_pointer : public true_type {}; - -template -struct IteratorTraits { - typedef typename Iterator::value_type value_type; -}; - - -template -struct IteratorTraits { - typedef T value_type; -}; - -template -struct IteratorTraits { - typedef T value_type; -}; +template +using bool_constant = std::integral_constant; #if GTEST_OS_WINDOWS # define GTEST_PATH_SEP_ "\\" @@ -2535,12 +2048,12 @@ inline const char* GetEnv(const char* name) { #if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_PHONE || GTEST_OS_WINDOWS_RT // We are on Windows CE, which has no environment variables. static_cast(name); // To prevent 'unused argument' warning. - return NULL; + return nullptr; #elif defined(__BORLANDC__) || defined(__SunOS_5_8) || defined(__SunOS_5_9) // Environment variables which we programmatically clear will be set to the // empty string rather than unset (NULL). Handle that case. const char* const env = getenv(name); - return (env != NULL && env[0] != '\0') ? env : NULL; + return (env != nullptr && env[0] != '\0') ? env : nullptr; #else return getenv(name); #endif @@ -2552,9 +2065,9 @@ GTEST_DISABLE_MSC_DEPRECATED_POP_() // Windows CE has no C library. The abort() function is used in // several places in Google Test. This implementation provides a reasonable // imitation of standard behaviour. -void Abort(); +[[noreturn]] void Abort(); #else -inline void Abort() { abort(); } +[[noreturn]] inline void Abort() { abort(); } #endif // GTEST_OS_WINDOWS_MOBILE } // namespace posix @@ -2564,13 +2077,12 @@ inline void Abort() { abort(); } // MSVC-based platforms. We map the GTEST_SNPRINTF_ macro to the appropriate // function in order to achieve that. We use macro definition here because // snprintf is a variadic function. -#if _MSC_VER >= 1400 && !GTEST_OS_WINDOWS_MOBILE +#if _MSC_VER && !GTEST_OS_WINDOWS_MOBILE // MSVC 2005 and above support variadic macros. # define GTEST_SNPRINTF_(buffer, size, format, ...) \ _snprintf_s(buffer, size, size, format, __VA_ARGS__) #elif defined(_MSC_VER) -// Windows CE does not define _snprintf_s and MSVC prior to 2005 doesn't -// complain about _snprintf. +// Windows CE does not define _snprintf_s # define GTEST_SNPRINTF_ _snprintf #else # define GTEST_SNPRINTF_ snprintf @@ -2684,9 +2196,6 @@ typedef TypeWithSize<8>::Int TimeInMillis; // Represents time in milliseconds. // Parses 'str' for a 32-bit signed integer. If successful, writes the result // to *value and returns true; otherwise leaves *value unchanged and returns // false. -// FIXME: Find a better way to refactor flag and environment parsing -// out of both gtest-port.cc and gtest.cc to avoid exporting this utility -// function. bool ParseInt32(const Message& src_text, const char* str, Int32* value); // Parses a bool/Int32/string from the environment variable @@ -2699,4 +2208,24 @@ const char* StringFromGTestEnv(const char* flag, const char* default_val); } // namespace internal } // namespace testing +#if !defined(GTEST_INTERNAL_DEPRECATED) + +// Internal Macro to mark an API deprecated, for googletest usage only +// Usage: class GTEST_INTERNAL_DEPRECATED(message) MyClass or +// GTEST_INTERNAL_DEPRECATED(message) myFunction(); Every usage of +// a deprecated entity will trigger a warning when compiled with +// `-Wdeprecated-declarations` option (clang, gcc, any __GNUC__ compiler). +// For msvc /W3 option will need to be used +// Note that for 'other' compilers this macro evaluates to nothing to prevent +// compilations errors. +#if defined(_MSC_VER) +#define GTEST_INTERNAL_DEPRECATED(message) __declspec(deprecated(message)) +#elif defined(__GNUC__) +#define GTEST_INTERNAL_DEPRECATED(message) __attribute__((deprecated(message))) +#else +#define GTEST_INTERNAL_DEPRECATED(message) +#endif + +#endif // !defined(GTEST_INTERNAL_DEPRECATED) + #endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_ diff --git a/third_party/googletest/src/include/gtest/internal/gtest-string.h b/third_party/googletest/src/include/gtest/internal/gtest-string.h index 4c9b6262c3..82aaa63bf4 100644 --- a/third_party/googletest/src/include/gtest/internal/gtest-string.h +++ b/third_party/googletest/src/include/gtest/internal/gtest-string.h @@ -94,7 +94,8 @@ class GTEST_API_ String { static const char* Utf16ToAnsi(LPCWSTR utf16_str); #endif - // Compares two C strings. Returns true iff they have the same content. + // Compares two C strings. Returns true if and only if they have the same + // content. // // Unlike strcmp(), this function can handle NULL argument(s). A // NULL C string is considered different to any non-NULL C string, @@ -107,16 +108,16 @@ class GTEST_API_ String { // returned. static std::string ShowWideCString(const wchar_t* wide_c_str); - // Compares two wide C strings. Returns true iff they have the same - // content. + // Compares two wide C strings. Returns true if and only if they have the + // same content. // // Unlike wcscmp(), this function can handle NULL argument(s). A // NULL C string is considered different to any non-NULL C string, // including the empty string. static bool WideCStringEquals(const wchar_t* lhs, const wchar_t* rhs); - // Compares two C strings, ignoring case. Returns true iff they - // have the same content. + // Compares two C strings, ignoring case. Returns true if and only if + // they have the same content. // // Unlike strcasecmp(), this function can handle NULL argument(s). // A NULL C string is considered different to any non-NULL C string, @@ -124,8 +125,8 @@ class GTEST_API_ String { static bool CaseInsensitiveCStringEquals(const char* lhs, const char* rhs); - // Compares two wide C strings, ignoring case. Returns true iff they - // have the same content. + // Compares two wide C strings, ignoring case. Returns true if and only if + // they have the same content. // // Unlike wcscasecmp(), this function can handle NULL argument(s). // A NULL C string is considered different to any non-NULL wide C string, @@ -139,8 +140,8 @@ class GTEST_API_ String { static bool CaseInsensitiveWideCStringEquals(const wchar_t* lhs, const wchar_t* rhs); - // Returns true iff the given string ends with the given suffix, ignoring - // case. Any string is considered to end with an empty suffix. + // Returns true if and only if the given string ends with the given suffix, + // ignoring case. Any string is considered to end with an empty suffix. static bool EndsWithCaseInsensitive( const std::string& str, const std::string& suffix); @@ -150,6 +151,9 @@ class GTEST_API_ String { // Formats an int value as "%X". static std::string FormatHexInt(int value); + // Formats an int value as "%X". + static std::string FormatHexUInt32(UInt32 value); + // Formats a byte as "%02X". static std::string FormatByte(unsigned char value); diff --git a/third_party/googletest/src/include/gtest/internal/gtest-tuple.h b/third_party/googletest/src/include/gtest/internal/gtest-tuple.h deleted file mode 100644 index 78a3a6a01f..0000000000 --- a/third_party/googletest/src/include/gtest/internal/gtest-tuple.h +++ /dev/null @@ -1,1021 +0,0 @@ -// This file was GENERATED by command: -// pump.py gtest-tuple.h.pump -// DO NOT EDIT BY HAND!!! - -// Copyright 2009 Google Inc. -// All Rights Reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - -// Implements a subset of TR1 tuple needed by Google Test and Google Mock. - -// GOOGLETEST_CM0001 DO NOT DELETE - -#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TUPLE_H_ -#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TUPLE_H_ - -#include // For ::std::pair. - -// The compiler used in Symbian has a bug that prevents us from declaring the -// tuple template as a friend (it complains that tuple is redefined). This -// bypasses the bug by declaring the members that should otherwise be -// private as public. -// Sun Studio versions < 12 also have the above bug. -#if defined(__SYMBIAN32__) || (defined(__SUNPRO_CC) && __SUNPRO_CC < 0x590) -# define GTEST_DECLARE_TUPLE_AS_FRIEND_ public: -#else -# define GTEST_DECLARE_TUPLE_AS_FRIEND_ \ - template friend class tuple; \ - private: -#endif - -// Visual Studio 2010, 2012, and 2013 define symbols in std::tr1 that conflict -// with our own definitions. Therefore using our own tuple does not work on -// those compilers. -#if defined(_MSC_VER) && _MSC_VER >= 1600 /* 1600 is Visual Studio 2010 */ -# error "gtest's tuple doesn't compile on Visual Studio 2010 or later. \ -GTEST_USE_OWN_TR1_TUPLE must be set to 0 on those compilers." -#endif - -// GTEST_n_TUPLE_(T) is the type of an n-tuple. -#define GTEST_0_TUPLE_(T) tuple<> -#define GTEST_1_TUPLE_(T) tuple -#define GTEST_2_TUPLE_(T) tuple -#define GTEST_3_TUPLE_(T) tuple -#define GTEST_4_TUPLE_(T) tuple -#define GTEST_5_TUPLE_(T) tuple -#define GTEST_6_TUPLE_(T) tuple -#define GTEST_7_TUPLE_(T) tuple -#define GTEST_8_TUPLE_(T) tuple -#define GTEST_9_TUPLE_(T) tuple -#define GTEST_10_TUPLE_(T) tuple - -// GTEST_n_TYPENAMES_(T) declares a list of n typenames. -#define GTEST_0_TYPENAMES_(T) -#define GTEST_1_TYPENAMES_(T) typename T##0 -#define GTEST_2_TYPENAMES_(T) typename T##0, typename T##1 -#define GTEST_3_TYPENAMES_(T) typename T##0, typename T##1, typename T##2 -#define GTEST_4_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \ - typename T##3 -#define GTEST_5_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \ - typename T##3, typename T##4 -#define GTEST_6_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \ - typename T##3, typename T##4, typename T##5 -#define GTEST_7_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \ - typename T##3, typename T##4, typename T##5, typename T##6 -#define GTEST_8_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \ - typename T##3, typename T##4, typename T##5, typename T##6, typename T##7 -#define GTEST_9_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \ - typename T##3, typename T##4, typename T##5, typename T##6, \ - typename T##7, typename T##8 -#define GTEST_10_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \ - typename T##3, typename T##4, typename T##5, typename T##6, \ - typename T##7, typename T##8, typename T##9 - -// In theory, defining stuff in the ::std namespace is undefined -// behavior. We can do this as we are playing the role of a standard -// library vendor. -namespace std { -namespace tr1 { - -template -class tuple; - -// Anything in namespace gtest_internal is Google Test's INTERNAL -// IMPLEMENTATION DETAIL and MUST NOT BE USED DIRECTLY in user code. -namespace gtest_internal { - -// ByRef::type is T if T is a reference; otherwise it's const T&. -template -struct ByRef { typedef const T& type; }; // NOLINT -template -struct ByRef { typedef T& type; }; // NOLINT - -// A handy wrapper for ByRef. -#define GTEST_BY_REF_(T) typename ::std::tr1::gtest_internal::ByRef::type - -// AddRef::type is T if T is a reference; otherwise it's T&. This -// is the same as tr1::add_reference::type. -template -struct AddRef { typedef T& type; }; // NOLINT -template -struct AddRef { typedef T& type; }; // NOLINT - -// A handy wrapper for AddRef. -#define GTEST_ADD_REF_(T) typename ::std::tr1::gtest_internal::AddRef::type - -// A helper for implementing get(). -template class Get; - -// A helper for implementing tuple_element. kIndexValid is true -// iff k < the number of fields in tuple type T. -template -struct TupleElement; - -template -struct TupleElement { - typedef T0 type; -}; - -template -struct TupleElement { - typedef T1 type; -}; - -template -struct TupleElement { - typedef T2 type; -}; - -template -struct TupleElement { - typedef T3 type; -}; - -template -struct TupleElement { - typedef T4 type; -}; - -template -struct TupleElement { - typedef T5 type; -}; - -template -struct TupleElement { - typedef T6 type; -}; - -template -struct TupleElement { - typedef T7 type; -}; - -template -struct TupleElement { - typedef T8 type; -}; - -template -struct TupleElement { - typedef T9 type; -}; - -} // namespace gtest_internal - -template <> -class tuple<> { - public: - tuple() {} - tuple(const tuple& /* t */) {} - tuple& operator=(const tuple& /* t */) { return *this; } -}; - -template -class GTEST_1_TUPLE_(T) { - public: - template friend class gtest_internal::Get; - - tuple() : f0_() {} - - explicit tuple(GTEST_BY_REF_(T0) f0) : f0_(f0) {} - - tuple(const tuple& t) : f0_(t.f0_) {} - - template - tuple(const GTEST_1_TUPLE_(U)& t) : f0_(t.f0_) {} - - tuple& operator=(const tuple& t) { return CopyFrom(t); } - - template - tuple& operator=(const GTEST_1_TUPLE_(U)& t) { - return CopyFrom(t); - } - - GTEST_DECLARE_TUPLE_AS_FRIEND_ - - template - tuple& CopyFrom(const GTEST_1_TUPLE_(U)& t) { - f0_ = t.f0_; - return *this; - } - - T0 f0_; -}; - -template -class GTEST_2_TUPLE_(T) { - public: - template friend class gtest_internal::Get; - - tuple() : f0_(), f1_() {} - - explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1) : f0_(f0), - f1_(f1) {} - - tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_) {} - - template - tuple(const GTEST_2_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_) {} - template - tuple(const ::std::pair& p) : f0_(p.first), f1_(p.second) {} - - tuple& operator=(const tuple& t) { return CopyFrom(t); } - - template - tuple& operator=(const GTEST_2_TUPLE_(U)& t) { - return CopyFrom(t); - } - template - tuple& operator=(const ::std::pair& p) { - f0_ = p.first; - f1_ = p.second; - return *this; - } - - GTEST_DECLARE_TUPLE_AS_FRIEND_ - - template - tuple& CopyFrom(const GTEST_2_TUPLE_(U)& t) { - f0_ = t.f0_; - f1_ = t.f1_; - return *this; - } - - T0 f0_; - T1 f1_; -}; - -template -class GTEST_3_TUPLE_(T) { - public: - template friend class gtest_internal::Get; - - tuple() : f0_(), f1_(), f2_() {} - - explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1, - GTEST_BY_REF_(T2) f2) : f0_(f0), f1_(f1), f2_(f2) {} - - tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_) {} - - template - tuple(const GTEST_3_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_) {} - - tuple& operator=(const tuple& t) { return CopyFrom(t); } - - template - tuple& operator=(const GTEST_3_TUPLE_(U)& t) { - return CopyFrom(t); - } - - GTEST_DECLARE_TUPLE_AS_FRIEND_ - - template - tuple& CopyFrom(const GTEST_3_TUPLE_(U)& t) { - f0_ = t.f0_; - f1_ = t.f1_; - f2_ = t.f2_; - return *this; - } - - T0 f0_; - T1 f1_; - T2 f2_; -}; - -template -class GTEST_4_TUPLE_(T) { - public: - template friend class gtest_internal::Get; - - tuple() : f0_(), f1_(), f2_(), f3_() {} - - explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1, - GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3) : f0_(f0), f1_(f1), f2_(f2), - f3_(f3) {} - - tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_) {} - - template - tuple(const GTEST_4_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), - f3_(t.f3_) {} - - tuple& operator=(const tuple& t) { return CopyFrom(t); } - - template - tuple& operator=(const GTEST_4_TUPLE_(U)& t) { - return CopyFrom(t); - } - - GTEST_DECLARE_TUPLE_AS_FRIEND_ - - template - tuple& CopyFrom(const GTEST_4_TUPLE_(U)& t) { - f0_ = t.f0_; - f1_ = t.f1_; - f2_ = t.f2_; - f3_ = t.f3_; - return *this; - } - - T0 f0_; - T1 f1_; - T2 f2_; - T3 f3_; -}; - -template -class GTEST_5_TUPLE_(T) { - public: - template friend class gtest_internal::Get; - - tuple() : f0_(), f1_(), f2_(), f3_(), f4_() {} - - explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1, - GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, - GTEST_BY_REF_(T4) f4) : f0_(f0), f1_(f1), f2_(f2), f3_(f3), f4_(f4) {} - - tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_), - f4_(t.f4_) {} - - template - tuple(const GTEST_5_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), - f3_(t.f3_), f4_(t.f4_) {} - - tuple& operator=(const tuple& t) { return CopyFrom(t); } - - template - tuple& operator=(const GTEST_5_TUPLE_(U)& t) { - return CopyFrom(t); - } - - GTEST_DECLARE_TUPLE_AS_FRIEND_ - - template - tuple& CopyFrom(const GTEST_5_TUPLE_(U)& t) { - f0_ = t.f0_; - f1_ = t.f1_; - f2_ = t.f2_; - f3_ = t.f3_; - f4_ = t.f4_; - return *this; - } - - T0 f0_; - T1 f1_; - T2 f2_; - T3 f3_; - T4 f4_; -}; - -template -class GTEST_6_TUPLE_(T) { - public: - template friend class gtest_internal::Get; - - tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_() {} - - explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1, - GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4, - GTEST_BY_REF_(T5) f5) : f0_(f0), f1_(f1), f2_(f2), f3_(f3), f4_(f4), - f5_(f5) {} - - tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_), - f4_(t.f4_), f5_(t.f5_) {} - - template - tuple(const GTEST_6_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), - f3_(t.f3_), f4_(t.f4_), f5_(t.f5_) {} - - tuple& operator=(const tuple& t) { return CopyFrom(t); } - - template - tuple& operator=(const GTEST_6_TUPLE_(U)& t) { - return CopyFrom(t); - } - - GTEST_DECLARE_TUPLE_AS_FRIEND_ - - template - tuple& CopyFrom(const GTEST_6_TUPLE_(U)& t) { - f0_ = t.f0_; - f1_ = t.f1_; - f2_ = t.f2_; - f3_ = t.f3_; - f4_ = t.f4_; - f5_ = t.f5_; - return *this; - } - - T0 f0_; - T1 f1_; - T2 f2_; - T3 f3_; - T4 f4_; - T5 f5_; -}; - -template -class GTEST_7_TUPLE_(T) { - public: - template friend class gtest_internal::Get; - - tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_(), f6_() {} - - explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1, - GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4, - GTEST_BY_REF_(T5) f5, GTEST_BY_REF_(T6) f6) : f0_(f0), f1_(f1), f2_(f2), - f3_(f3), f4_(f4), f5_(f5), f6_(f6) {} - - tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_), - f4_(t.f4_), f5_(t.f5_), f6_(t.f6_) {} - - template - tuple(const GTEST_7_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), - f3_(t.f3_), f4_(t.f4_), f5_(t.f5_), f6_(t.f6_) {} - - tuple& operator=(const tuple& t) { return CopyFrom(t); } - - template - tuple& operator=(const GTEST_7_TUPLE_(U)& t) { - return CopyFrom(t); - } - - GTEST_DECLARE_TUPLE_AS_FRIEND_ - - template - tuple& CopyFrom(const GTEST_7_TUPLE_(U)& t) { - f0_ = t.f0_; - f1_ = t.f1_; - f2_ = t.f2_; - f3_ = t.f3_; - f4_ = t.f4_; - f5_ = t.f5_; - f6_ = t.f6_; - return *this; - } - - T0 f0_; - T1 f1_; - T2 f2_; - T3 f3_; - T4 f4_; - T5 f5_; - T6 f6_; -}; - -template -class GTEST_8_TUPLE_(T) { - public: - template friend class gtest_internal::Get; - - tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_(), f6_(), f7_() {} - - explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1, - GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4, - GTEST_BY_REF_(T5) f5, GTEST_BY_REF_(T6) f6, - GTEST_BY_REF_(T7) f7) : f0_(f0), f1_(f1), f2_(f2), f3_(f3), f4_(f4), - f5_(f5), f6_(f6), f7_(f7) {} - - tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_), - f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_) {} - - template - tuple(const GTEST_8_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), - f3_(t.f3_), f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_) {} - - tuple& operator=(const tuple& t) { return CopyFrom(t); } - - template - tuple& operator=(const GTEST_8_TUPLE_(U)& t) { - return CopyFrom(t); - } - - GTEST_DECLARE_TUPLE_AS_FRIEND_ - - template - tuple& CopyFrom(const GTEST_8_TUPLE_(U)& t) { - f0_ = t.f0_; - f1_ = t.f1_; - f2_ = t.f2_; - f3_ = t.f3_; - f4_ = t.f4_; - f5_ = t.f5_; - f6_ = t.f6_; - f7_ = t.f7_; - return *this; - } - - T0 f0_; - T1 f1_; - T2 f2_; - T3 f3_; - T4 f4_; - T5 f5_; - T6 f6_; - T7 f7_; -}; - -template -class GTEST_9_TUPLE_(T) { - public: - template friend class gtest_internal::Get; - - tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_(), f6_(), f7_(), f8_() {} - - explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1, - GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4, - GTEST_BY_REF_(T5) f5, GTEST_BY_REF_(T6) f6, GTEST_BY_REF_(T7) f7, - GTEST_BY_REF_(T8) f8) : f0_(f0), f1_(f1), f2_(f2), f3_(f3), f4_(f4), - f5_(f5), f6_(f6), f7_(f7), f8_(f8) {} - - tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_), - f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_), f8_(t.f8_) {} - - template - tuple(const GTEST_9_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), - f3_(t.f3_), f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_), f8_(t.f8_) {} - - tuple& operator=(const tuple& t) { return CopyFrom(t); } - - template - tuple& operator=(const GTEST_9_TUPLE_(U)& t) { - return CopyFrom(t); - } - - GTEST_DECLARE_TUPLE_AS_FRIEND_ - - template - tuple& CopyFrom(const GTEST_9_TUPLE_(U)& t) { - f0_ = t.f0_; - f1_ = t.f1_; - f2_ = t.f2_; - f3_ = t.f3_; - f4_ = t.f4_; - f5_ = t.f5_; - f6_ = t.f6_; - f7_ = t.f7_; - f8_ = t.f8_; - return *this; - } - - T0 f0_; - T1 f1_; - T2 f2_; - T3 f3_; - T4 f4_; - T5 f5_; - T6 f6_; - T7 f7_; - T8 f8_; -}; - -template -class tuple { - public: - template friend class gtest_internal::Get; - - tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_(), f6_(), f7_(), f8_(), - f9_() {} - - explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1, - GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4, - GTEST_BY_REF_(T5) f5, GTEST_BY_REF_(T6) f6, GTEST_BY_REF_(T7) f7, - GTEST_BY_REF_(T8) f8, GTEST_BY_REF_(T9) f9) : f0_(f0), f1_(f1), f2_(f2), - f3_(f3), f4_(f4), f5_(f5), f6_(f6), f7_(f7), f8_(f8), f9_(f9) {} - - tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_), - f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_), f8_(t.f8_), f9_(t.f9_) {} - - template - tuple(const GTEST_10_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), - f3_(t.f3_), f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_), f8_(t.f8_), - f9_(t.f9_) {} - - tuple& operator=(const tuple& t) { return CopyFrom(t); } - - template - tuple& operator=(const GTEST_10_TUPLE_(U)& t) { - return CopyFrom(t); - } - - GTEST_DECLARE_TUPLE_AS_FRIEND_ - - template - tuple& CopyFrom(const GTEST_10_TUPLE_(U)& t) { - f0_ = t.f0_; - f1_ = t.f1_; - f2_ = t.f2_; - f3_ = t.f3_; - f4_ = t.f4_; - f5_ = t.f5_; - f6_ = t.f6_; - f7_ = t.f7_; - f8_ = t.f8_; - f9_ = t.f9_; - return *this; - } - - T0 f0_; - T1 f1_; - T2 f2_; - T3 f3_; - T4 f4_; - T5 f5_; - T6 f6_; - T7 f7_; - T8 f8_; - T9 f9_; -}; - -// 6.1.3.2 Tuple creation functions. - -// Known limitations: we don't support passing an -// std::tr1::reference_wrapper to make_tuple(). And we don't -// implement tie(). - -inline tuple<> make_tuple() { return tuple<>(); } - -template -inline GTEST_1_TUPLE_(T) make_tuple(const T0& f0) { - return GTEST_1_TUPLE_(T)(f0); -} - -template -inline GTEST_2_TUPLE_(T) make_tuple(const T0& f0, const T1& f1) { - return GTEST_2_TUPLE_(T)(f0, f1); -} - -template -inline GTEST_3_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2) { - return GTEST_3_TUPLE_(T)(f0, f1, f2); -} - -template -inline GTEST_4_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2, - const T3& f3) { - return GTEST_4_TUPLE_(T)(f0, f1, f2, f3); -} - -template -inline GTEST_5_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2, - const T3& f3, const T4& f4) { - return GTEST_5_TUPLE_(T)(f0, f1, f2, f3, f4); -} - -template -inline GTEST_6_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2, - const T3& f3, const T4& f4, const T5& f5) { - return GTEST_6_TUPLE_(T)(f0, f1, f2, f3, f4, f5); -} - -template -inline GTEST_7_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2, - const T3& f3, const T4& f4, const T5& f5, const T6& f6) { - return GTEST_7_TUPLE_(T)(f0, f1, f2, f3, f4, f5, f6); -} - -template -inline GTEST_8_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2, - const T3& f3, const T4& f4, const T5& f5, const T6& f6, const T7& f7) { - return GTEST_8_TUPLE_(T)(f0, f1, f2, f3, f4, f5, f6, f7); -} - -template -inline GTEST_9_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2, - const T3& f3, const T4& f4, const T5& f5, const T6& f6, const T7& f7, - const T8& f8) { - return GTEST_9_TUPLE_(T)(f0, f1, f2, f3, f4, f5, f6, f7, f8); -} - -template -inline GTEST_10_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2, - const T3& f3, const T4& f4, const T5& f5, const T6& f6, const T7& f7, - const T8& f8, const T9& f9) { - return GTEST_10_TUPLE_(T)(f0, f1, f2, f3, f4, f5, f6, f7, f8, f9); -} - -// 6.1.3.3 Tuple helper classes. - -template struct tuple_size; - -template -struct tuple_size { - static const int value = 0; -}; - -template -struct tuple_size { - static const int value = 1; -}; - -template -struct tuple_size { - static const int value = 2; -}; - -template -struct tuple_size { - static const int value = 3; -}; - -template -struct tuple_size { - static const int value = 4; -}; - -template -struct tuple_size { - static const int value = 5; -}; - -template -struct tuple_size { - static const int value = 6; -}; - -template -struct tuple_size { - static const int value = 7; -}; - -template -struct tuple_size { - static const int value = 8; -}; - -template -struct tuple_size { - static const int value = 9; -}; - -template -struct tuple_size { - static const int value = 10; -}; - -template -struct tuple_element { - typedef typename gtest_internal::TupleElement< - k < (tuple_size::value), k, Tuple>::type type; -}; - -#define GTEST_TUPLE_ELEMENT_(k, Tuple) typename tuple_element::type - -// 6.1.3.4 Element access. - -namespace gtest_internal { - -template <> -class Get<0> { - public: - template - static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(0, Tuple)) - Field(Tuple& t) { return t.f0_; } // NOLINT - - template - static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(0, Tuple)) - ConstField(const Tuple& t) { return t.f0_; } -}; - -template <> -class Get<1> { - public: - template - static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(1, Tuple)) - Field(Tuple& t) { return t.f1_; } // NOLINT - - template - static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(1, Tuple)) - ConstField(const Tuple& t) { return t.f1_; } -}; - -template <> -class Get<2> { - public: - template - static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(2, Tuple)) - Field(Tuple& t) { return t.f2_; } // NOLINT - - template - static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(2, Tuple)) - ConstField(const Tuple& t) { return t.f2_; } -}; - -template <> -class Get<3> { - public: - template - static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(3, Tuple)) - Field(Tuple& t) { return t.f3_; } // NOLINT - - template - static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(3, Tuple)) - ConstField(const Tuple& t) { return t.f3_; } -}; - -template <> -class Get<4> { - public: - template - static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(4, Tuple)) - Field(Tuple& t) { return t.f4_; } // NOLINT - - template - static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(4, Tuple)) - ConstField(const Tuple& t) { return t.f4_; } -}; - -template <> -class Get<5> { - public: - template - static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(5, Tuple)) - Field(Tuple& t) { return t.f5_; } // NOLINT - - template - static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(5, Tuple)) - ConstField(const Tuple& t) { return t.f5_; } -}; - -template <> -class Get<6> { - public: - template - static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(6, Tuple)) - Field(Tuple& t) { return t.f6_; } // NOLINT - - template - static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(6, Tuple)) - ConstField(const Tuple& t) { return t.f6_; } -}; - -template <> -class Get<7> { - public: - template - static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(7, Tuple)) - Field(Tuple& t) { return t.f7_; } // NOLINT - - template - static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(7, Tuple)) - ConstField(const Tuple& t) { return t.f7_; } -}; - -template <> -class Get<8> { - public: - template - static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(8, Tuple)) - Field(Tuple& t) { return t.f8_; } // NOLINT - - template - static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(8, Tuple)) - ConstField(const Tuple& t) { return t.f8_; } -}; - -template <> -class Get<9> { - public: - template - static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(9, Tuple)) - Field(Tuple& t) { return t.f9_; } // NOLINT - - template - static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(9, Tuple)) - ConstField(const Tuple& t) { return t.f9_; } -}; - -} // namespace gtest_internal - -template -GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(k, GTEST_10_TUPLE_(T))) -get(GTEST_10_TUPLE_(T)& t) { - return gtest_internal::Get::Field(t); -} - -template -GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(k, GTEST_10_TUPLE_(T))) -get(const GTEST_10_TUPLE_(T)& t) { - return gtest_internal::Get::ConstField(t); -} - -// 6.1.3.5 Relational operators - -// We only implement == and !=, as we don't have a need for the rest yet. - -namespace gtest_internal { - -// SameSizeTuplePrefixComparator::Eq(t1, t2) returns true if the -// first k fields of t1 equals the first k fields of t2. -// SameSizeTuplePrefixComparator(k1, k2) would be a compiler error if -// k1 != k2. -template -struct SameSizeTuplePrefixComparator; - -template <> -struct SameSizeTuplePrefixComparator<0, 0> { - template - static bool Eq(const Tuple1& /* t1 */, const Tuple2& /* t2 */) { - return true; - } -}; - -template -struct SameSizeTuplePrefixComparator { - template - static bool Eq(const Tuple1& t1, const Tuple2& t2) { - return SameSizeTuplePrefixComparator::Eq(t1, t2) && - ::std::tr1::get(t1) == ::std::tr1::get(t2); - } -}; - -} // namespace gtest_internal - -template -inline bool operator==(const GTEST_10_TUPLE_(T)& t, - const GTEST_10_TUPLE_(U)& u) { - return gtest_internal::SameSizeTuplePrefixComparator< - tuple_size::value, - tuple_size::value>::Eq(t, u); -} - -template -inline bool operator!=(const GTEST_10_TUPLE_(T)& t, - const GTEST_10_TUPLE_(U)& u) { return !(t == u); } - -// 6.1.4 Pairs. -// Unimplemented. - -} // namespace tr1 -} // namespace std - -#undef GTEST_0_TUPLE_ -#undef GTEST_1_TUPLE_ -#undef GTEST_2_TUPLE_ -#undef GTEST_3_TUPLE_ -#undef GTEST_4_TUPLE_ -#undef GTEST_5_TUPLE_ -#undef GTEST_6_TUPLE_ -#undef GTEST_7_TUPLE_ -#undef GTEST_8_TUPLE_ -#undef GTEST_9_TUPLE_ -#undef GTEST_10_TUPLE_ - -#undef GTEST_0_TYPENAMES_ -#undef GTEST_1_TYPENAMES_ -#undef GTEST_2_TYPENAMES_ -#undef GTEST_3_TYPENAMES_ -#undef GTEST_4_TYPENAMES_ -#undef GTEST_5_TYPENAMES_ -#undef GTEST_6_TYPENAMES_ -#undef GTEST_7_TYPENAMES_ -#undef GTEST_8_TYPENAMES_ -#undef GTEST_9_TYPENAMES_ -#undef GTEST_10_TYPENAMES_ - -#undef GTEST_DECLARE_TUPLE_AS_FRIEND_ -#undef GTEST_BY_REF_ -#undef GTEST_ADD_REF_ -#undef GTEST_TUPLE_ELEMENT_ - -#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TUPLE_H_ diff --git a/third_party/googletest/src/include/gtest/internal/gtest-tuple.h.pump b/third_party/googletest/src/include/gtest/internal/gtest-tuple.h.pump deleted file mode 100644 index bb626e049f..0000000000 --- a/third_party/googletest/src/include/gtest/internal/gtest-tuple.h.pump +++ /dev/null @@ -1,348 +0,0 @@ -$$ -*- mode: c++; -*- -$var n = 10 $$ Maximum number of tuple fields we want to support. -$$ This meta comment fixes auto-indentation in Emacs. }} -// Copyright 2009 Google Inc. -// All Rights Reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - -// Implements a subset of TR1 tuple needed by Google Test and Google Mock. - -// GOOGLETEST_CM0001 DO NOT DELETE - -#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TUPLE_H_ -#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TUPLE_H_ - -#include // For ::std::pair. - -// The compiler used in Symbian has a bug that prevents us from declaring the -// tuple template as a friend (it complains that tuple is redefined). This -// bypasses the bug by declaring the members that should otherwise be -// private as public. -// Sun Studio versions < 12 also have the above bug. -#if defined(__SYMBIAN32__) || (defined(__SUNPRO_CC) && __SUNPRO_CC < 0x590) -# define GTEST_DECLARE_TUPLE_AS_FRIEND_ public: -#else -# define GTEST_DECLARE_TUPLE_AS_FRIEND_ \ - template friend class tuple; \ - private: -#endif - -// Visual Studio 2010, 2012, and 2013 define symbols in std::tr1 that conflict -// with our own definitions. Therefore using our own tuple does not work on -// those compilers. -#if defined(_MSC_VER) && _MSC_VER >= 1600 /* 1600 is Visual Studio 2010 */ -# error "gtest's tuple doesn't compile on Visual Studio 2010 or later. \ -GTEST_USE_OWN_TR1_TUPLE must be set to 0 on those compilers." -#endif - - -$range i 0..n-1 -$range j 0..n -$range k 1..n -// GTEST_n_TUPLE_(T) is the type of an n-tuple. -#define GTEST_0_TUPLE_(T) tuple<> - -$for k [[ -$range m 0..k-1 -$range m2 k..n-1 -#define GTEST_$(k)_TUPLE_(T) tuple<$for m, [[T##$m]]$for m2 [[, void]]> - -]] - -// GTEST_n_TYPENAMES_(T) declares a list of n typenames. - -$for j [[ -$range m 0..j-1 -#define GTEST_$(j)_TYPENAMES_(T) $for m, [[typename T##$m]] - - -]] - -// In theory, defining stuff in the ::std namespace is undefined -// behavior. We can do this as we are playing the role of a standard -// library vendor. -namespace std { -namespace tr1 { - -template <$for i, [[typename T$i = void]]> -class tuple; - -// Anything in namespace gtest_internal is Google Test's INTERNAL -// IMPLEMENTATION DETAIL and MUST NOT BE USED DIRECTLY in user code. -namespace gtest_internal { - -// ByRef::type is T if T is a reference; otherwise it's const T&. -template -struct ByRef { typedef const T& type; }; // NOLINT -template -struct ByRef { typedef T& type; }; // NOLINT - -// A handy wrapper for ByRef. -#define GTEST_BY_REF_(T) typename ::std::tr1::gtest_internal::ByRef::type - -// AddRef::type is T if T is a reference; otherwise it's T&. This -// is the same as tr1::add_reference::type. -template -struct AddRef { typedef T& type; }; // NOLINT -template -struct AddRef { typedef T& type; }; // NOLINT - -// A handy wrapper for AddRef. -#define GTEST_ADD_REF_(T) typename ::std::tr1::gtest_internal::AddRef::type - -// A helper for implementing get(). -template class Get; - -// A helper for implementing tuple_element. kIndexValid is true -// iff k < the number of fields in tuple type T. -template -struct TupleElement; - - -$for i [[ -template -struct TupleElement { - typedef T$i type; -}; - - -]] -} // namespace gtest_internal - -template <> -class tuple<> { - public: - tuple() {} - tuple(const tuple& /* t */) {} - tuple& operator=(const tuple& /* t */) { return *this; } -}; - - -$for k [[ -$range m 0..k-1 -template -class $if k < n [[GTEST_$(k)_TUPLE_(T)]] $else [[tuple]] { - public: - template friend class gtest_internal::Get; - - tuple() : $for m, [[f$(m)_()]] {} - - explicit tuple($for m, [[GTEST_BY_REF_(T$m) f$m]]) : [[]] -$for m, [[f$(m)_(f$m)]] {} - - tuple(const tuple& t) : $for m, [[f$(m)_(t.f$(m)_)]] {} - - template - tuple(const GTEST_$(k)_TUPLE_(U)& t) : $for m, [[f$(m)_(t.f$(m)_)]] {} - -$if k == 2 [[ - template - tuple(const ::std::pair& p) : f0_(p.first), f1_(p.second) {} - -]] - - tuple& operator=(const tuple& t) { return CopyFrom(t); } - - template - tuple& operator=(const GTEST_$(k)_TUPLE_(U)& t) { - return CopyFrom(t); - } - -$if k == 2 [[ - template - tuple& operator=(const ::std::pair& p) { - f0_ = p.first; - f1_ = p.second; - return *this; - } - -]] - - GTEST_DECLARE_TUPLE_AS_FRIEND_ - - template - tuple& CopyFrom(const GTEST_$(k)_TUPLE_(U)& t) { - -$for m [[ - f$(m)_ = t.f$(m)_; - -]] - return *this; - } - - -$for m [[ - T$m f$(m)_; - -]] -}; - - -]] -// 6.1.3.2 Tuple creation functions. - -// Known limitations: we don't support passing an -// std::tr1::reference_wrapper to make_tuple(). And we don't -// implement tie(). - -inline tuple<> make_tuple() { return tuple<>(); } - -$for k [[ -$range m 0..k-1 - -template -inline GTEST_$(k)_TUPLE_(T) make_tuple($for m, [[const T$m& f$m]]) { - return GTEST_$(k)_TUPLE_(T)($for m, [[f$m]]); -} - -]] - -// 6.1.3.3 Tuple helper classes. - -template struct tuple_size; - - -$for j [[ -template -struct tuple_size { - static const int value = $j; -}; - - -]] -template -struct tuple_element { - typedef typename gtest_internal::TupleElement< - k < (tuple_size::value), k, Tuple>::type type; -}; - -#define GTEST_TUPLE_ELEMENT_(k, Tuple) typename tuple_element::type - -// 6.1.3.4 Element access. - -namespace gtest_internal { - - -$for i [[ -template <> -class Get<$i> { - public: - template - static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_($i, Tuple)) - Field(Tuple& t) { return t.f$(i)_; } // NOLINT - - template - static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_($i, Tuple)) - ConstField(const Tuple& t) { return t.f$(i)_; } -}; - - -]] -} // namespace gtest_internal - -template -GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(k, GTEST_$(n)_TUPLE_(T))) -get(GTEST_$(n)_TUPLE_(T)& t) { - return gtest_internal::Get::Field(t); -} - -template -GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(k, GTEST_$(n)_TUPLE_(T))) -get(const GTEST_$(n)_TUPLE_(T)& t) { - return gtest_internal::Get::ConstField(t); -} - -// 6.1.3.5 Relational operators - -// We only implement == and !=, as we don't have a need for the rest yet. - -namespace gtest_internal { - -// SameSizeTuplePrefixComparator::Eq(t1, t2) returns true if the -// first k fields of t1 equals the first k fields of t2. -// SameSizeTuplePrefixComparator(k1, k2) would be a compiler error if -// k1 != k2. -template -struct SameSizeTuplePrefixComparator; - -template <> -struct SameSizeTuplePrefixComparator<0, 0> { - template - static bool Eq(const Tuple1& /* t1 */, const Tuple2& /* t2 */) { - return true; - } -}; - -template -struct SameSizeTuplePrefixComparator { - template - static bool Eq(const Tuple1& t1, const Tuple2& t2) { - return SameSizeTuplePrefixComparator::Eq(t1, t2) && - ::std::tr1::get(t1) == ::std::tr1::get(t2); - } -}; - -} // namespace gtest_internal - -template -inline bool operator==(const GTEST_$(n)_TUPLE_(T)& t, - const GTEST_$(n)_TUPLE_(U)& u) { - return gtest_internal::SameSizeTuplePrefixComparator< - tuple_size::value, - tuple_size::value>::Eq(t, u); -} - -template -inline bool operator!=(const GTEST_$(n)_TUPLE_(T)& t, - const GTEST_$(n)_TUPLE_(U)& u) { return !(t == u); } - -// 6.1.4 Pairs. -// Unimplemented. - -} // namespace tr1 -} // namespace std - - -$for j [[ -#undef GTEST_$(j)_TUPLE_ - -]] - - -$for j [[ -#undef GTEST_$(j)_TYPENAMES_ - -]] - -#undef GTEST_DECLARE_TUPLE_AS_FRIEND_ -#undef GTEST_BY_REF_ -#undef GTEST_ADD_REF_ -#undef GTEST_TUPLE_ELEMENT_ - -#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TUPLE_H_ diff --git a/third_party/googletest/src/include/gtest/internal/gtest-type-util.h b/third_party/googletest/src/include/gtest/internal/gtest-type-util.h index 28e4112453..3d7542d1fb 100644 --- a/third_party/googletest/src/include/gtest/internal/gtest-type-util.h +++ b/third_party/googletest/src/include/gtest/internal/gtest-type-util.h @@ -31,12 +31,11 @@ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - // Type utilities needed for implementing typed and type-parameterized // tests. This file is generated by a SCRIPT. DO NOT EDIT BY HAND! // // Currently we support at most 50 types in a list, and at most 50 -// type-parameterized tests in one type-parameterized test case. +// type-parameterized tests in one type-parameterized test suite. // Please contact googletestframework@googlegroups.com if you need // more. @@ -89,7 +88,7 @@ std::string GetTypeName() { # if GTEST_HAS_CXXABI_H_ using abi::__cxa_demangle; # endif // GTEST_HAS_CXXABI_H_ - char* const readable_name = __cxa_demangle(name, 0, 0, &status); + char* const readable_name = __cxa_demangle(name, nullptr, nullptr, &status); const std::string name_str(status == 0 ? readable_name : name); free(readable_name); return CanonicalizeForStdLibVersioning(name_str); @@ -106,18 +105,6 @@ std::string GetTypeName() { #if GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P -// AssertyTypeEq::type is defined iff T1 and T2 are the same -// type. This can be used as a compile-time assertion to ensure that -// two types are equal. - -template -struct AssertTypeEq; - -template -struct AssertTypeEq { - typedef bool type; -}; - // A unique type used as the default value for the arguments of class // template Types. This allows us to simulate variadic templates // (e.g. Types, Type, and etc), which C++ doesn't @@ -3312,8 +3299,8 @@ struct Templates list in TYPED_TEST_CASE() and -// INSTANTIATE_TYPED_TEST_CASE_P(). +// or a Types<...> list in TYPED_TEST_SUITE() and +// INSTANTIATE_TYPED_TEST_SUITE_P(). template struct TypeList { diff --git a/third_party/googletest/src/include/gtest/internal/gtest-type-util.h.pump b/third_party/googletest/src/include/gtest/internal/gtest-type-util.h.pump index 0001a5d39d..5e31b7b320 100644 --- a/third_party/googletest/src/include/gtest/internal/gtest-type-util.h.pump +++ b/third_party/googletest/src/include/gtest/internal/gtest-type-util.h.pump @@ -34,7 +34,7 @@ $var n = 50 $$ Maximum length of type lists we want to support. // tests. This file is generated by a SCRIPT. DO NOT EDIT BY HAND! // // Currently we support at most $n types in a list, and at most $n -// type-parameterized tests in one type-parameterized test case. +// type-parameterized tests in one type-parameterized test suite. // Please contact googletestframework@googlegroups.com if you need // more. @@ -87,7 +87,7 @@ std::string GetTypeName() { # if GTEST_HAS_CXXABI_H_ using abi::__cxa_demangle; # endif // GTEST_HAS_CXXABI_H_ - char* const readable_name = __cxa_demangle(name, 0, 0, &status); + char* const readable_name = __cxa_demangle(name, nullptr, nullptr, &status); const std::string name_str(status == 0 ? readable_name : name); free(readable_name); return CanonicalizeForStdLibVersioning(name_str); @@ -104,18 +104,6 @@ std::string GetTypeName() { #if GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P -// AssertyTypeEq::type is defined iff T1 and T2 are the same -// type. This can be used as a compile-time assertion to ensure that -// two types are equal. - -template -struct AssertTypeEq; - -template -struct AssertTypeEq { - typedef bool type; -}; - // A unique type used as the default value for the arguments of class // template Types. This allows us to simulate variadic templates // (e.g. Types, Type, and etc), which C++ doesn't @@ -291,8 +279,8 @@ struct Templates<$for j, [[T$j]]$for k[[, NoneT]]> { ]] // The TypeList template makes it possible to use either a single type -// or a Types<...> list in TYPED_TEST_CASE() and -// INSTANTIATE_TYPED_TEST_CASE_P(). +// or a Types<...> list in TYPED_TEST_SUITE() and +// INSTANTIATE_TYPED_TEST_SUITE_P(). template struct TypeList { diff --git a/third_party/googletest/src/src/gtest-all.cc b/third_party/googletest/src/src/gtest-all.cc index b217a18006..ad292905cf 100644 --- a/third_party/googletest/src/src/gtest-all.cc +++ b/third_party/googletest/src/src/gtest-all.cc @@ -41,6 +41,7 @@ #include "src/gtest.cc" #include "src/gtest-death-test.cc" #include "src/gtest-filepath.cc" +#include "src/gtest-matchers.cc" #include "src/gtest-port.cc" #include "src/gtest-printers.cc" #include "src/gtest-test-part.cc" diff --git a/third_party/googletest/src/src/gtest-death-test.cc b/third_party/googletest/src/src/gtest-death-test.cc index 0908355161..da09a1cfc2 100644 --- a/third_party/googletest/src/src/gtest-death-test.cc +++ b/third_party/googletest/src/src/gtest-death-test.cc @@ -31,6 +31,9 @@ // This file implements death tests. #include "gtest/gtest-death-test.h" + +#include + #include "gtest/internal/gtest-port.h" #include "gtest/internal/custom/gtest.h" @@ -62,10 +65,16 @@ # endif // GTEST_OS_QNX # if GTEST_OS_FUCHSIA +# include # include # include +# include +# include +# include +# include # include # include +# include # include # endif // GTEST_OS_FUCHSIA @@ -113,8 +122,8 @@ GTEST_DEFINE_string_( "Indicates the file, line number, temporal index of " "the single death test to run, and a file descriptor to " "which a success code may be sent, all separated by " - "the '|' characters. This flag is specified if and only if the current " - "process is a sub-process launched for running a thread-safe " + "the '|' characters. This flag is specified if and only if the " + "current process is a sub-process launched for running a thread-safe " "death test. FOR INTERNAL USE ONLY."); } // namespace internal @@ -266,8 +275,6 @@ static const int kFuchsiaReadPipeFd = 3; // statement, which is not allowed; THREW means that the test statement // returned control by throwing an exception. IN_PROGRESS means the test // has not yet concluded. -// FIXME: Unify names and possibly values for -// AbortReason, DeathTestOutcome, and flag characters above. enum DeathTestOutcome { IN_PROGRESS, DIED, LIVED, RETURNED, THREW }; // Routine for aborting the program which is safe to call from an @@ -281,7 +288,7 @@ static void DeathTestAbort(const std::string& message) { // the heap for any additional non-minuscule memory requirements. const InternalRunDeathTestFlag* const flag = GetUnitTestImpl()->internal_run_death_test_flag(); - if (flag != NULL) { + if (flag != nullptr) { FILE* parent = posix::FDOpen(flag->write_fd(), "w"); fputc(kDeathTestInternalError, parent); fprintf(parent, "%s", message.c_str()); @@ -361,7 +368,7 @@ static void FailFromInternalError(int fd) { // for the current test. DeathTest::DeathTest() { TestInfo* const info = GetUnitTestImpl()->current_test_info(); - if (info == NULL) { + if (info == nullptr) { DeathTestAbort("Cannot run a death test outside of a TEST or " "TEST_F construct"); } @@ -369,10 +376,11 @@ DeathTest::DeathTest() { // Creates and returns a death test by dispatching to the current // death test factory. -bool DeathTest::Create(const char* statement, const RE* regex, - const char* file, int line, DeathTest** test) { +bool DeathTest::Create(const char* statement, + Matcher matcher, const char* file, + int line, DeathTest** test) { return GetUnitTestImpl()->death_test_factory()->Create( - statement, regex, file, line, test); + statement, std::move(matcher), file, line, test); } const char* DeathTest::LastMessage() { @@ -388,9 +396,9 @@ std::string DeathTest::last_death_test_message_; // Provides cross platform implementation for some death functionality. class DeathTestImpl : public DeathTest { protected: - DeathTestImpl(const char* a_statement, const RE* a_regex) + DeathTestImpl(const char* a_statement, Matcher matcher) : statement_(a_statement), - regex_(a_regex), + matcher_(std::move(matcher)), spawned_(false), status_(-1), outcome_(IN_PROGRESS), @@ -398,13 +406,12 @@ class DeathTestImpl : public DeathTest { write_fd_(-1) {} // read_fd_ is expected to be closed and cleared by a derived class. - ~DeathTestImpl() { GTEST_DEATH_TEST_CHECK_(read_fd_ == -1); } + ~DeathTestImpl() override { GTEST_DEATH_TEST_CHECK_(read_fd_ == -1); } - void Abort(AbortReason reason); - virtual bool Passed(bool status_ok); + void Abort(AbortReason reason) override; + bool Passed(bool status_ok) override; const char* statement() const { return statement_; } - const RE* regex() const { return regex_; } bool spawned() const { return spawned_; } void set_spawned(bool is_spawned) { spawned_ = is_spawned; } int status() const { return status_; } @@ -422,13 +429,15 @@ class DeathTestImpl : public DeathTest { // case of unexpected codes. void ReadAndInterpretStatusByte(); + // Returns stderr output from the child process. + virtual std::string GetErrorLogs(); + private: // The textual content of the code this object is testing. This class // doesn't own this string and should not attempt to delete it. const char* const statement_; - // The regular expression which test output must match. DeathTestImpl - // doesn't own this object and should not attempt to delete it. - const RE* const regex_; + // A matcher that's expected to match the stderr output by the child process. + Matcher matcher_; // True if the death test child process has been successfully spawned. bool spawned_; // The exit status of the child process. @@ -490,6 +499,10 @@ void DeathTestImpl::ReadAndInterpretStatusByte() { set_read_fd(-1); } +std::string DeathTestImpl::GetErrorLogs() { + return GetCapturedStderr(); +} + // Signals that the death test code which should have exited, didn't. // Should be called only in a death test child process. // Writes a status byte to the child's status file descriptor, then @@ -543,22 +556,21 @@ static ::std::string FormatDeathTestOutput(const ::std::string& output) { // in the format specified by wait(2). On Windows, this is the // value supplied to the ExitProcess() API or a numeric code // of the exception that terminated the program. -// regex: A regular expression object to be applied to -// the test's captured standard error output; the death test -// fails if it does not match. +// matcher_: A matcher that's expected to match the stderr output by the child +// process. // // Argument: // status_ok: true if exit_status is acceptable in the context of // this particular death test, which fails if it is false // -// Returns true iff all of the above conditions are met. Otherwise, the -// first failing condition, in the order given above, is the one that is +// Returns true if and only if all of the above conditions are met. Otherwise, +// the first failing condition, in the order given above, is the one that is // reported. Also sets the last death test message string. bool DeathTestImpl::Passed(bool status_ok) { if (!spawned()) return false; - const std::string error_message = GetCapturedStderr(); + const std::string error_message = GetErrorLogs(); bool success = false; Message buffer; @@ -579,18 +591,15 @@ bool DeathTestImpl::Passed(bool status_ok) { break; case DIED: if (status_ok) { -# if GTEST_USES_PCRE - // PCRE regexes support embedded NULs. - const bool matched = RE::PartialMatch(error_message, *regex()); -# else - const bool matched = RE::PartialMatch(error_message.c_str(), *regex()); -# endif // GTEST_USES_PCRE - if (matched) { + if (matcher_.Matches(error_message)) { success = true; } else { + std::ostringstream stream; + matcher_.DescribeTo(&stream); buffer << " Result: died but not with expected error.\n" - << " Expected: " << regex()->pattern() << "\n" - << "Actual msg:\n" << FormatDeathTestOutput(error_message); + << " Expected: " << stream.str() << "\n" + << "Actual msg:\n" + << FormatDeathTestOutput(error_message); } } else { buffer << " Result: died but not with expected exit code:\n" @@ -639,11 +648,11 @@ bool DeathTestImpl::Passed(bool status_ok) { // class WindowsDeathTest : public DeathTestImpl { public: - WindowsDeathTest(const char* a_statement, - const RE* a_regex, - const char* file, - int line) - : DeathTestImpl(a_statement, a_regex), file_(file), line_(line) {} + WindowsDeathTest(const char* a_statement, Matcher matcher, + const char* file, int line) + : DeathTestImpl(a_statement, std::move(matcher)), + file_(file), + line_(line) {} // All of these virtual functions are inherited from DeathTest. virtual int Wait(); @@ -720,7 +729,7 @@ DeathTest::TestRole WindowsDeathTest::AssumeRole() { const TestInfo* const info = impl->current_test_info(); const int death_test_index = info->result()->death_test_count(); - if (flag != NULL) { + if (flag != nullptr) { // ParseInternalRunDeathTestFlag() has performed all the necessary // processing. set_write_fd(flag->write_fd()); @@ -729,8 +738,8 @@ DeathTest::TestRole WindowsDeathTest::AssumeRole() { // WindowsDeathTest uses an anonymous pipe to communicate results of // a death test. - SECURITY_ATTRIBUTES handles_are_inheritable = { - sizeof(SECURITY_ATTRIBUTES), NULL, TRUE }; + SECURITY_ATTRIBUTES handles_are_inheritable = {sizeof(SECURITY_ATTRIBUTES), + nullptr, TRUE}; HANDLE read_handle, write_handle; GTEST_DEATH_TEST_CHECK_( ::CreatePipe(&read_handle, &write_handle, &handles_are_inheritable, @@ -741,13 +750,13 @@ DeathTest::TestRole WindowsDeathTest::AssumeRole() { write_handle_.Reset(write_handle); event_handle_.Reset(::CreateEvent( &handles_are_inheritable, - TRUE, // The event will automatically reset to non-signaled state. - FALSE, // The initial state is non-signalled. - NULL)); // The even is unnamed. - GTEST_DEATH_TEST_CHECK_(event_handle_.Get() != NULL); - const std::string filter_flag = - std::string("--") + GTEST_FLAG_PREFIX_ + kFilterFlag + "=" + - info->test_case_name() + "." + info->name(); + TRUE, // The event will automatically reset to non-signaled state. + FALSE, // The initial state is non-signalled. + nullptr)); // The even is unnamed. + GTEST_DEATH_TEST_CHECK_(event_handle_.Get() != nullptr); + const std::string filter_flag = std::string("--") + GTEST_FLAG_PREFIX_ + + kFilterFlag + "=" + info->test_suite_name() + + "." + info->name(); const std::string internal_flag = std::string("--") + GTEST_FLAG_PREFIX_ + kInternalRunDeathTestFlag + "=" + file_ + "|" + StreamableToString(line_) + "|" + @@ -760,10 +769,9 @@ DeathTest::TestRole WindowsDeathTest::AssumeRole() { "|" + StreamableToString(reinterpret_cast(event_handle_.Get())); char executable_path[_MAX_PATH + 1]; // NOLINT - GTEST_DEATH_TEST_CHECK_( - _MAX_PATH + 1 != ::GetModuleFileNameA(NULL, - executable_path, - _MAX_PATH)); + GTEST_DEATH_TEST_CHECK_(_MAX_PATH + 1 != ::GetModuleFileNameA(nullptr, + executable_path, + _MAX_PATH)); std::string command_line = std::string(::GetCommandLineA()) + " " + filter_flag + " \"" + @@ -784,17 +792,16 @@ DeathTest::TestRole WindowsDeathTest::AssumeRole() { startup_info.hStdError = ::GetStdHandle(STD_ERROR_HANDLE); PROCESS_INFORMATION process_info; - GTEST_DEATH_TEST_CHECK_(::CreateProcessA( - executable_path, - const_cast(command_line.c_str()), - NULL, // Retuned process handle is not inheritable. - NULL, // Retuned thread handle is not inheritable. - TRUE, // Child inherits all inheritable handles (for write_handle_). - 0x0, // Default creation flags. - NULL, // Inherit the parent's environment. - UnitTest::GetInstance()->original_working_dir(), - &startup_info, - &process_info) != FALSE); + GTEST_DEATH_TEST_CHECK_( + ::CreateProcessA( + executable_path, const_cast(command_line.c_str()), + nullptr, // Retuned process handle is not inheritable. + nullptr, // Retuned thread handle is not inheritable. + TRUE, // Child inherits all inheritable handles (for write_handle_). + 0x0, // Default creation flags. + nullptr, // Inherit the parent's environment. + UnitTest::GetInstance()->original_working_dir(), &startup_info, + &process_info) != FALSE); child_handle_.Reset(process_info.hProcess); ::CloseHandle(process_info.hThread); set_spawned(true); @@ -805,38 +812,34 @@ DeathTest::TestRole WindowsDeathTest::AssumeRole() { class FuchsiaDeathTest : public DeathTestImpl { public: - FuchsiaDeathTest(const char* a_statement, - const RE* a_regex, - const char* file, - int line) - : DeathTestImpl(a_statement, a_regex), file_(file), line_(line) {} - virtual ~FuchsiaDeathTest() { - zx_status_t status = zx_handle_close(child_process_); - GTEST_DEATH_TEST_CHECK_(status == ZX_OK); - status = zx_handle_close(port_); - GTEST_DEATH_TEST_CHECK_(status == ZX_OK); - } + FuchsiaDeathTest(const char* a_statement, Matcher matcher, + const char* file, int line) + : DeathTestImpl(a_statement, std::move(matcher)), + file_(file), + line_(line) {} // All of these virtual functions are inherited from DeathTest. - virtual int Wait(); - virtual TestRole AssumeRole(); + int Wait() override; + TestRole AssumeRole() override; + std::string GetErrorLogs() override; private: // The name of the file in which the death test is located. const char* const file_; // The line number on which the death test is located. const int line_; + // The stderr data captured by the child process. + std::string captured_stderr_; - zx_handle_t child_process_ = ZX_HANDLE_INVALID; - zx_handle_t port_ = ZX_HANDLE_INVALID; + zx::process child_process_; + zx::channel exception_channel_; + zx::socket stderr_socket_; }; // Utility class for accumulating command-line arguments. class Arguments { public: - Arguments() { - args_.push_back(NULL); - } + Arguments() { args_.push_back(nullptr); } ~Arguments() { for (std::vector::iterator i = args_.begin(); i != args_.end(); @@ -872,51 +875,88 @@ class Arguments { // status, or 0 if no child process exists. As a side effect, sets the // outcome data member. int FuchsiaDeathTest::Wait() { + const int kProcessKey = 0; + const int kSocketKey = 1; + const int kExceptionKey = 2; + if (!spawned()) return 0; - // Register to wait for the child process to terminate. + // Create a port to wait for socket/task/exception events. zx_status_t status_zx; - status_zx = zx_object_wait_async(child_process_, - port_, - 0 /* key */, - ZX_PROCESS_TERMINATED, - ZX_WAIT_ASYNC_ONCE); + zx::port port; + status_zx = zx::port::create(0, &port); GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK); - // Wait for it to terminate, or an exception to be received. - zx_port_packet_t packet; - status_zx = zx_port_wait(port_, ZX_TIME_INFINITE, &packet); + // Register to wait for the child process to terminate. + status_zx = child_process_.wait_async( + port, kProcessKey, ZX_PROCESS_TERMINATED, ZX_WAIT_ASYNC_ONCE); GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK); - if (ZX_PKT_IS_EXCEPTION(packet.type)) { - // Process encountered an exception. Kill it directly rather than letting - // other handlers process the event. - status_zx = zx_task_kill(child_process_); - GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK); + // Register to wait for the socket to be readable or closed. + status_zx = stderr_socket_.wait_async( + port, kSocketKey, ZX_SOCKET_READABLE | ZX_SOCKET_PEER_CLOSED, + ZX_WAIT_ASYNC_ONCE); + GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK); - // Now wait for |child_process_| to terminate. - zx_signals_t signals = 0; - status_zx = zx_object_wait_one( - child_process_, ZX_PROCESS_TERMINATED, ZX_TIME_INFINITE, &signals); + // Register to wait for an exception. + status_zx = exception_channel_.wait_async( + port, kExceptionKey, ZX_CHANNEL_READABLE, ZX_WAIT_ASYNC_ONCE); + GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK); + + bool process_terminated = false; + bool socket_closed = false; + do { + zx_port_packet_t packet = {}; + status_zx = port.wait(zx::time::infinite(), &packet); GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK); - GTEST_DEATH_TEST_CHECK_(signals & ZX_PROCESS_TERMINATED); - } else { - // Process terminated. - GTEST_DEATH_TEST_CHECK_(ZX_PKT_IS_SIGNAL_ONE(packet.type)); - GTEST_DEATH_TEST_CHECK_(packet.signal.observed & ZX_PROCESS_TERMINATED); - } + + if (packet.key == kExceptionKey) { + // Process encountered an exception. Kill it directly rather than + // letting other handlers process the event. We will get a kProcessKey + // event when the process actually terminates. + status_zx = child_process_.kill(); + GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK); + } else if (packet.key == kProcessKey) { + // Process terminated. + GTEST_DEATH_TEST_CHECK_(ZX_PKT_IS_SIGNAL_ONE(packet.type)); + GTEST_DEATH_TEST_CHECK_(packet.signal.observed & ZX_PROCESS_TERMINATED); + process_terminated = true; + } else if (packet.key == kSocketKey) { + GTEST_DEATH_TEST_CHECK_(ZX_PKT_IS_SIGNAL_ONE(packet.type)); + if (packet.signal.observed & ZX_SOCKET_READABLE) { + // Read data from the socket. + constexpr size_t kBufferSize = 1024; + do { + size_t old_length = captured_stderr_.length(); + size_t bytes_read = 0; + captured_stderr_.resize(old_length + kBufferSize); + status_zx = stderr_socket_.read( + 0, &captured_stderr_.front() + old_length, kBufferSize, + &bytes_read); + captured_stderr_.resize(old_length + bytes_read); + } while (status_zx == ZX_OK); + if (status_zx == ZX_ERR_PEER_CLOSED) { + socket_closed = true; + } else { + GTEST_DEATH_TEST_CHECK_(status_zx == ZX_ERR_SHOULD_WAIT); + status_zx = stderr_socket_.wait_async( + port, kSocketKey, ZX_SOCKET_READABLE | ZX_SOCKET_PEER_CLOSED, + ZX_WAIT_ASYNC_ONCE); + GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK); + } + } else { + GTEST_DEATH_TEST_CHECK_(packet.signal.observed & ZX_SOCKET_PEER_CLOSED); + socket_closed = true; + } + } + } while (!process_terminated && !socket_closed); ReadAndInterpretStatusByte(); zx_info_process_t buffer; - status_zx = zx_object_get_info( - child_process_, - ZX_INFO_PROCESS, - &buffer, - sizeof(buffer), - nullptr, - nullptr); + status_zx = child_process_.get_info( + ZX_INFO_PROCESS, &buffer, sizeof(buffer), nullptr, nullptr); GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK); GTEST_DEATH_TEST_CHECK_(buffer.exited); @@ -936,21 +976,20 @@ DeathTest::TestRole FuchsiaDeathTest::AssumeRole() { const TestInfo* const info = impl->current_test_info(); const int death_test_index = info->result()->death_test_count(); - if (flag != NULL) { + if (flag != nullptr) { // ParseInternalRunDeathTestFlag() has performed all the necessary // processing. set_write_fd(kFuchsiaReadPipeFd); return EXECUTE_TEST; } - CaptureStderr(); // Flush the log buffers since the log streams are shared with the child. FlushInfoLog(); // Build the child process command line. - const std::string filter_flag = - std::string("--") + GTEST_FLAG_PREFIX_ + kFilterFlag + "=" - + info->test_case_name() + "." + info->name(); + const std::string filter_flag = std::string("--") + GTEST_FLAG_PREFIX_ + + kFilterFlag + "=" + info->test_suite_name() + + "." + info->name(); const std::string internal_flag = std::string("--") + GTEST_FLAG_PREFIX_ + kInternalRunDeathTestFlag + "=" + file_ + "|" @@ -964,35 +1003,68 @@ DeathTest::TestRole FuchsiaDeathTest::AssumeRole() { // Build the pipe for communication with the child. zx_status_t status; zx_handle_t child_pipe_handle; - uint32_t type; - status = fdio_pipe_half(&child_pipe_handle, &type); - GTEST_DEATH_TEST_CHECK_(status >= 0); - set_read_fd(status); + int child_pipe_fd; + status = fdio_pipe_half(&child_pipe_fd, &child_pipe_handle); + GTEST_DEATH_TEST_CHECK_(status == ZX_OK); + set_read_fd(child_pipe_fd); // Set the pipe handle for the child. - fdio_spawn_action_t add_handle_action = {}; - add_handle_action.action = FDIO_SPAWN_ACTION_ADD_HANDLE; - add_handle_action.h.id = PA_HND(type, kFuchsiaReadPipeFd); - add_handle_action.h.handle = child_pipe_handle; + fdio_spawn_action_t spawn_actions[2] = {}; + fdio_spawn_action_t* add_handle_action = &spawn_actions[0]; + add_handle_action->action = FDIO_SPAWN_ACTION_ADD_HANDLE; + add_handle_action->h.id = PA_HND(PA_FD, kFuchsiaReadPipeFd); + add_handle_action->h.handle = child_pipe_handle; + + // Create a socket pair will be used to receive the child process' stderr. + zx::socket stderr_producer_socket; + status = + zx::socket::create(0, &stderr_producer_socket, &stderr_socket_); + GTEST_DEATH_TEST_CHECK_(status >= 0); + int stderr_producer_fd = -1; + status = + fdio_fd_create(stderr_producer_socket.release(), &stderr_producer_fd); + GTEST_DEATH_TEST_CHECK_(status >= 0); - // Spawn the child process. - status = fdio_spawn_etc(ZX_HANDLE_INVALID, FDIO_SPAWN_CLONE_ALL, - args.Argv()[0], args.Argv(), nullptr, 1, - &add_handle_action, &child_process_, nullptr); + // Make the stderr socket nonblocking. + GTEST_DEATH_TEST_CHECK_(fcntl(stderr_producer_fd, F_SETFL, 0) == 0); + + fdio_spawn_action_t* add_stderr_action = &spawn_actions[1]; + add_stderr_action->action = FDIO_SPAWN_ACTION_CLONE_FD; + add_stderr_action->fd.local_fd = stderr_producer_fd; + add_stderr_action->fd.target_fd = STDERR_FILENO; + + // Create a child job. + zx_handle_t child_job = ZX_HANDLE_INVALID; + status = zx_job_create(zx_job_default(), 0, & child_job); + GTEST_DEATH_TEST_CHECK_(status == ZX_OK); + zx_policy_basic_t policy; + policy.condition = ZX_POL_NEW_ANY; + policy.policy = ZX_POL_ACTION_ALLOW; + status = zx_job_set_policy( + child_job, ZX_JOB_POL_RELATIVE, ZX_JOB_POL_BASIC, &policy, 1); GTEST_DEATH_TEST_CHECK_(status == ZX_OK); - // Create an exception port and attach it to the |child_process_|, to allow + // Create an exception channel attached to the |child_job|, to allow // us to suppress the system default exception handler from firing. - status = zx_port_create(0, &port_); + status = + zx_task_create_exception_channel( + child_job, 0, exception_channel_.reset_and_get_address()); GTEST_DEATH_TEST_CHECK_(status == ZX_OK); - status = zx_task_bind_exception_port( - child_process_, port_, 0 /* key */, 0 /*options */); + + // Spawn the child process. + status = fdio_spawn_etc( + child_job, FDIO_SPAWN_CLONE_ALL, args.Argv()[0], args.Argv(), nullptr, + 2, spawn_actions, child_process_.reset_and_get_address(), nullptr); GTEST_DEATH_TEST_CHECK_(status == ZX_OK); set_spawned(true); return OVERSEE_TEST; } +std::string FuchsiaDeathTest::GetErrorLogs() { + return captured_stderr_; +} + #else // We are neither on Windows, nor on Fuchsia. // ForkingDeathTest provides implementations for most of the abstract @@ -1000,10 +1072,10 @@ DeathTest::TestRole FuchsiaDeathTest::AssumeRole() { // left undefined. class ForkingDeathTest : public DeathTestImpl { public: - ForkingDeathTest(const char* statement, const RE* regex); + ForkingDeathTest(const char* statement, Matcher matcher); // All of these virtual functions are inherited from DeathTest. - virtual int Wait(); + int Wait() override; protected: void set_child_pid(pid_t child_pid) { child_pid_ = child_pid; } @@ -1014,9 +1086,9 @@ class ForkingDeathTest : public DeathTestImpl { }; // Constructs a ForkingDeathTest. -ForkingDeathTest::ForkingDeathTest(const char* a_statement, const RE* a_regex) - : DeathTestImpl(a_statement, a_regex), - child_pid_(-1) {} +ForkingDeathTest::ForkingDeathTest(const char* a_statement, + Matcher matcher) + : DeathTestImpl(a_statement, std::move(matcher)), child_pid_(-1) {} // Waits for the child in a death test to exit, returning its exit // status, or 0 if no child process exists. As a side effect, sets the @@ -1037,9 +1109,9 @@ int ForkingDeathTest::Wait() { // in the child process. class NoExecDeathTest : public ForkingDeathTest { public: - NoExecDeathTest(const char* a_statement, const RE* a_regex) : - ForkingDeathTest(a_statement, a_regex) { } - virtual TestRole AssumeRole(); + NoExecDeathTest(const char* a_statement, Matcher matcher) + : ForkingDeathTest(a_statement, std::move(matcher)) {} + TestRole AssumeRole() override; }; // The AssumeRole process for a fork-and-run death test. It implements a @@ -1092,10 +1164,13 @@ DeathTest::TestRole NoExecDeathTest::AssumeRole() { // only this specific death test to be run. class ExecDeathTest : public ForkingDeathTest { public: - ExecDeathTest(const char* a_statement, const RE* a_regex, - const char* file, int line) : - ForkingDeathTest(a_statement, a_regex), file_(file), line_(line) { } - virtual TestRole AssumeRole(); + ExecDeathTest(const char* a_statement, Matcher matcher, + const char* file, int line) + : ForkingDeathTest(a_statement, std::move(matcher)), + file_(file), + line_(line) {} + TestRole AssumeRole() override; + private: static ::std::vector GetArgvsForDeathTestChildProcess() { ::std::vector args = GetInjectableArgvs(); @@ -1115,9 +1190,7 @@ class ExecDeathTest : public ForkingDeathTest { // Utility class for accumulating command-line arguments. class Arguments { public: - Arguments() { - args_.push_back(NULL); - } + Arguments() { args_.push_back(nullptr); } ~Arguments() { for (std::vector::iterator i = args_.begin(); i != args_.end(); @@ -1211,6 +1284,9 @@ static int ExecDeathTestChildMain(void* child_arg) { // correct answer. static void StackLowerThanAddress(const void* ptr, bool* result) GTEST_NO_INLINE_; +// HWAddressSanitizer add a random tag to the MSB of the local variable address, +// making comparison result unpredictable. +GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_ static void StackLowerThanAddress(const void* ptr, bool* result) { int dummy; *result = (&dummy < ptr); @@ -1218,6 +1294,7 @@ static void StackLowerThanAddress(const void* ptr, bool* result) { // Make sure AddressSanitizer does not tamper with the stack here. GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ +GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_ static bool StackGrowsDown() { int dummy; bool result; @@ -1262,7 +1339,8 @@ static pid_t ExecDeathTestSpawnChild(char* const* argv, int close_fd) { fd_flags | FD_CLOEXEC)); struct inheritance inherit = {0}; // spawn is a system call. - child_pid = spawn(args.argv[0], 0, NULL, &inherit, args.argv, GetEnviron()); + child_pid = + spawn(args.argv[0], 0, nullptr, &inherit, args.argv, GetEnviron()); // Restores the current working directory. GTEST_DEATH_TEST_CHECK_(fchdir(cwd_fd) != -1); GTEST_DEATH_TEST_CHECK_SYSCALL_(close(cwd_fd)); @@ -1286,9 +1364,9 @@ static pid_t ExecDeathTestSpawnChild(char* const* argv, int close_fd) { if (!use_fork) { static const bool stack_grows_down = StackGrowsDown(); - const size_t stack_size = getpagesize(); + const auto stack_size = static_cast(getpagesize()); // MMAP_ANONYMOUS is not defined on Mac, so we use MAP_ANON instead. - void* const stack = mmap(NULL, stack_size, PROT_READ | PROT_WRITE, + void* const stack = mmap(nullptr, stack_size, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE, -1, 0); GTEST_DEATH_TEST_CHECK_(stack != MAP_FAILED); @@ -1302,8 +1380,9 @@ static pid_t ExecDeathTestSpawnChild(char* const* argv, int close_fd) { void* const stack_top = static_cast(stack) + (stack_grows_down ? stack_size - kMaxStackAlignment : 0); - GTEST_DEATH_TEST_CHECK_(stack_size > kMaxStackAlignment && - reinterpret_cast(stack_top) % kMaxStackAlignment == 0); + GTEST_DEATH_TEST_CHECK_( + static_cast(stack_size) > kMaxStackAlignment && + reinterpret_cast(stack_top) % kMaxStackAlignment == 0); child_pid = clone(&ExecDeathTestChildMain, stack_top, SIGCHLD, &args); @@ -1320,7 +1399,7 @@ static pid_t ExecDeathTestSpawnChild(char* const* argv, int close_fd) { # endif // GTEST_OS_QNX # if GTEST_OS_LINUX GTEST_DEATH_TEST_CHECK_SYSCALL_( - sigaction(SIGPROF, &saved_sigprof_action, NULL)); + sigaction(SIGPROF, &saved_sigprof_action, nullptr)); # endif // GTEST_OS_LINUX GTEST_DEATH_TEST_CHECK_(child_pid != -1); @@ -1338,7 +1417,7 @@ DeathTest::TestRole ExecDeathTest::AssumeRole() { const TestInfo* const info = impl->current_test_info(); const int death_test_index = info->result()->death_test_count(); - if (flag != NULL) { + if (flag != nullptr) { set_write_fd(flag->write_fd()); return EXECUTE_TEST; } @@ -1349,9 +1428,9 @@ DeathTest::TestRole ExecDeathTest::AssumeRole() { // it be closed when the child process does an exec: GTEST_DEATH_TEST_CHECK_(fcntl(pipe_fd[1], F_SETFD, 0) != -1); - const std::string filter_flag = - std::string("--") + GTEST_FLAG_PREFIX_ + kFilterFlag + "=" - + info->test_case_name() + "." + info->name(); + const std::string filter_flag = std::string("--") + GTEST_FLAG_PREFIX_ + + kFilterFlag + "=" + info->test_suite_name() + + "." + info->name(); const std::string internal_flag = std::string("--") + GTEST_FLAG_PREFIX_ + kInternalRunDeathTestFlag + "=" + file_ + "|" + StreamableToString(line_) + "|" @@ -1384,7 +1463,8 @@ DeathTest::TestRole ExecDeathTest::AssumeRole() { // by the "test" argument to its address. If the test should be // skipped, sets that pointer to NULL. Returns true, unless the // flag is set to an invalid value. -bool DefaultDeathTestFactory::Create(const char* statement, const RE* regex, +bool DefaultDeathTestFactory::Create(const char* statement, + Matcher matcher, const char* file, int line, DeathTest** test) { UnitTestImpl* const impl = GetUnitTestImpl(); @@ -1393,7 +1473,7 @@ bool DefaultDeathTestFactory::Create(const char* statement, const RE* regex, const int death_test_index = impl->current_test_info() ->increment_death_test_count(); - if (flag != NULL) { + if (flag != nullptr) { if (death_test_index > flag->index()) { DeathTest::set_last_death_test_message( "Death test count (" + StreamableToString(death_test_index) @@ -1404,7 +1484,7 @@ bool DefaultDeathTestFactory::Create(const char* statement, const RE* regex, if (!(flag->file() == file && flag->line() == line && flag->index() == death_test_index)) { - *test = NULL; + *test = nullptr; return true; } } @@ -1413,22 +1493,22 @@ bool DefaultDeathTestFactory::Create(const char* statement, const RE* regex, if (GTEST_FLAG(death_test_style) == "threadsafe" || GTEST_FLAG(death_test_style) == "fast") { - *test = new WindowsDeathTest(statement, regex, file, line); + *test = new WindowsDeathTest(statement, std::move(matcher), file, line); } # elif GTEST_OS_FUCHSIA if (GTEST_FLAG(death_test_style) == "threadsafe" || GTEST_FLAG(death_test_style) == "fast") { - *test = new FuchsiaDeathTest(statement, regex, file, line); + *test = new FuchsiaDeathTest(statement, std::move(matcher), file, line); } # else if (GTEST_FLAG(death_test_style) == "threadsafe") { - *test = new ExecDeathTest(statement, regex, file, line); + *test = new ExecDeathTest(statement, std::move(matcher), file, line); } else if (GTEST_FLAG(death_test_style) == "fast") { - *test = new NoExecDeathTest(statement, regex); + *test = new NoExecDeathTest(statement, std::move(matcher)); } # endif // GTEST_OS_WINDOWS @@ -1458,8 +1538,6 @@ static int GetStatusFileDescriptor(unsigned int parent_process_id, StreamableToString(parent_process_id)); } - // FIXME: Replace the following check with a - // compile-time assertion when available. GTEST_CHECK_(sizeof(HANDLE) <= sizeof(size_t)); const HANDLE write_handle = @@ -1515,7 +1593,7 @@ static int GetStatusFileDescriptor(unsigned int parent_process_id, // initialized from the GTEST_FLAG(internal_run_death_test) flag if // the flag is specified; otherwise returns NULL. InternalRunDeathTestFlag* ParseInternalRunDeathTestFlag() { - if (GTEST_FLAG(internal_run_death_test) == "") return NULL; + if (GTEST_FLAG(internal_run_death_test) == "") return nullptr; // GTEST_HAS_DEATH_TEST implies that we have ::std::string, so we // can use it here. diff --git a/third_party/googletest/src/src/gtest-filepath.cc b/third_party/googletest/src/src/gtest-filepath.cc index a7e65c082a..bd7b99ff03 100644 --- a/third_party/googletest/src/src/gtest-filepath.cc +++ b/third_party/googletest/src/src/gtest-filepath.cc @@ -38,9 +38,6 @@ #elif GTEST_OS_WINDOWS # include # include -#elif GTEST_OS_SYMBIAN -// Symbian OpenC has PATH_MAX in sys/syslimits.h -# include #else # include # include // Some Linux distributions define PATH_MAX here. @@ -95,13 +92,14 @@ static bool IsPathSeparator(char c) { // Returns the current working directory, or "" if unsuccessful. FilePath FilePath::GetCurrentDir() { -#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_PHONE || GTEST_OS_WINDOWS_RT - // Windows CE doesn't have a current directory, so we just return +#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_PHONE || \ + GTEST_OS_WINDOWS_RT || ARDUINO || defined(ESP_PLATFORM) + // These platforms do not have a current directory, so we just return // something reasonable. return FilePath(kCurrentDirectoryString); #elif GTEST_OS_WINDOWS char cwd[GTEST_PATH_MAX_ + 1] = { '\0' }; - return FilePath(_getcwd(cwd, sizeof(cwd)) == NULL ? "" : cwd); + return FilePath(_getcwd(cwd, sizeof(cwd)) == nullptr ? "" : cwd); #else char cwd[GTEST_PATH_MAX_ + 1] = { '\0' }; char* result = getcwd(cwd, sizeof(cwd)); @@ -109,9 +107,9 @@ FilePath FilePath::GetCurrentDir() { // getcwd will likely fail in NaCl due to the sandbox, so return something // reasonable. The user may have provided a shim implementation for getcwd, // however, so fallback only when failure is detected. - return FilePath(result == NULL ? kCurrentDirectoryString : cwd); + return FilePath(result == nullptr ? kCurrentDirectoryString : cwd); # endif // GTEST_OS_NACL - return FilePath(result == NULL ? "" : cwd); + return FilePath(result == nullptr ? "" : cwd); #endif // GTEST_OS_WINDOWS_MOBILE } @@ -136,8 +134,8 @@ const char* FilePath::FindLastPathSeparator() const { #if GTEST_HAS_ALT_PATH_SEP_ const char* const last_alt_sep = strrchr(c_str(), kAlternatePathSeparator); // Comparing two pointers of which only one is NULL is undefined. - if (last_alt_sep != NULL && - (last_sep == NULL || last_alt_sep > last_sep)) { + if (last_alt_sep != nullptr && + (last_sep == nullptr || last_alt_sep > last_sep)) { return last_alt_sep; } #endif @@ -165,7 +163,7 @@ FilePath FilePath::RemoveFileName() const { const char* const last_sep = FindLastPathSeparator(); std::string dir; if (last_sep) { - dir = std::string(c_str(), last_sep + 1 - c_str()); + dir = std::string(c_str(), static_cast(last_sep + 1 - c_str())); } else { dir = kCurrentDirectoryString; } @@ -250,9 +248,6 @@ bool FilePath::DirectoryExists() const { // root directory per disk drive.) bool FilePath::IsRootDirectory() const { #if GTEST_OS_WINDOWS - // FIXME: on Windows a network share like - // \\server\share can be a root directory, although it cannot be the - // current directory. Handle this properly. return pathname_.length() == 3 && IsAbsolutePath(); #else return pathname_.length() == 1 && IsPathSeparator(pathname_.c_str()[0]); @@ -324,7 +319,7 @@ bool FilePath::CreateFolder() const { #if GTEST_OS_WINDOWS_MOBILE FilePath removed_sep(this->RemoveTrailingPathSeparator()); LPCWSTR unicode = String::AnsiToUtf16(removed_sep.c_str()); - int result = CreateDirectory(unicode, NULL) ? 0 : -1; + int result = CreateDirectory(unicode, nullptr) ? 0 : -1; delete [] unicode; #elif GTEST_OS_WINDOWS int result = _mkdir(pathname_.c_str()); @@ -350,9 +345,8 @@ FilePath FilePath::RemoveTrailingPathSeparator() const { // Removes any redundant separators that might be in the pathname. // For example, "bar///foo" becomes "bar/foo". Does not eliminate other // redundancies that might be in a pathname involving "." or "..". -// FIXME: handle Windows network shares (e.g. \\server\share). void FilePath::Normalize() { - if (pathname_.c_str() == NULL) { + if (pathname_.c_str() == nullptr) { pathname_ = ""; return; } diff --git a/third_party/googletest/src/src/gtest-internal-inl.h b/third_party/googletest/src/src/gtest-internal-inl.h index 479004149b..8ed70daab0 100644 --- a/third_party/googletest/src/src/gtest-internal-inl.h +++ b/third_party/googletest/src/src/gtest-internal-inl.h @@ -42,6 +42,7 @@ #include // For memmove. #include +#include #include #include @@ -98,14 +99,14 @@ const char kFlagfileFlag[] = "flagfile"; // A valid random seed must be in [1, kMaxRandomSeed]. const int kMaxRandomSeed = 99999; -// g_help_flag is true iff the --help flag or an equivalent form is -// specified on the command line. +// g_help_flag is true if and only if the --help flag or an equivalent form +// is specified on the command line. GTEST_API_ extern bool g_help_flag; // Returns the current time in milliseconds. GTEST_API_ TimeInMillis GetTimeInMillis(); -// Returns true iff Google Test should use colors in the output. +// Returns true if and only if Google Test should use colors in the output. GTEST_API_ bool ShouldUseColor(bool stdout_is_tty); // Formats the given time in milliseconds as seconds. @@ -230,7 +231,7 @@ GTEST_API_ std::string CodePointToUtf8(UInt32 code_point); // Converts a wide string to a narrow string in UTF-8 encoding. // The wide string is assumed to have the following encoding: -// UTF-16 if sizeof(wchar_t) == 2 (on Windows, Cygwin, Symbian OS) +// UTF-16 if sizeof(wchar_t) == 2 (on Windows, Cygwin) // UTF-32 if sizeof(wchar_t) == 4 (on Linux) // Parameter str points to a null-terminated wide string. // Parameter num_chars may additionally limit the number @@ -265,8 +266,8 @@ GTEST_API_ bool ShouldShard(const char* total_shards_str, GTEST_API_ Int32 Int32FromEnvOrDie(const char* env_var, Int32 default_val); // Given the total number of shards, the shard index, and the test id, -// returns true iff the test should be run on this shard. The test id is -// some arbitrary but unique non-negative integer assigned to each test +// returns true if and only if the test should be run on this shard. The test id +// is some arbitrary but unique non-negative integer assigned to each test // method. Assumes that 0 <= shard_index < total_shards. GTEST_API_ bool ShouldRunTestOnShard( int total_shards, int shard_index, int test_id); @@ -297,7 +298,8 @@ void ForEach(const Container& c, Functor functor) { // in range [0, v.size()). template inline E GetElementOr(const std::vector& v, int i, E default_value) { - return (i < 0 || i >= static_cast(v.size())) ? default_value : v[i]; + return (i < 0 || i >= static_cast(v.size())) ? default_value + : v[static_cast(i)]; } // Performs an in-place shuffle of a range of the vector's elements. @@ -319,8 +321,11 @@ void ShuffleRange(internal::Random* random, int begin, int end, // http://en.wikipedia.org/wiki/Fisher-Yates_shuffle for (int range_width = end - begin; range_width >= 2; range_width--) { const int last_in_range = begin + range_width - 1; - const int selected = begin + random->Generate(range_width); - std::swap((*v)[selected], (*v)[last_in_range]); + const int selected = + begin + + static_cast(random->Generate(static_cast(range_width))); + std::swap((*v)[static_cast(selected)], + (*v)[static_cast(last_in_range)]); } } @@ -347,7 +352,7 @@ class TestPropertyKeyIs { // TestPropertyKeyIs has NO default constructor. explicit TestPropertyKeyIs(const std::string& key) : key_(key) {} - // Returns true iff the test name of test property matches on key_. + // Returns true if and only if the test name of test property matches on key_. bool operator()(const TestProperty& test_property) const { return test_property.key() == key_; } @@ -380,17 +385,17 @@ class GTEST_API_ UnitTestOptions { // Functions for processing the gtest_filter flag. - // Returns true iff the wildcard pattern matches the string. The - // first ':' or '\0' character in pattern marks the end of it. + // Returns true if and only if the wildcard pattern matches the string. + // The first ':' or '\0' character in pattern marks the end of it. // // This recursive algorithm isn't very efficient, but is clear and // works well enough for matching test names, which are short. static bool PatternMatchesString(const char *pattern, const char *str); - // Returns true iff the user-specified filter matches the test case - // name and the test name. - static bool FilterMatchesTest(const std::string &test_case_name, - const std::string &test_name); + // Returns true if and only if the user-specified filter matches the test + // suite name and the test name. + static bool FilterMatchesTest(const std::string& test_suite_name, + const std::string& test_name); #if GTEST_OS_WINDOWS // Function for supporting the gtest_catch_exception flag. @@ -442,8 +447,8 @@ class OsStackTraceGetter : public OsStackTraceGetterInterface { public: OsStackTraceGetter() {} - virtual std::string CurrentStackTrace(int max_depth, int skip_count); - virtual void UponLeavingGTest(); + std::string CurrentStackTrace(int max_depth, int skip_count) override; + void UponLeavingGTest() override; private: #if GTEST_HAS_ABSL @@ -474,7 +479,7 @@ class DefaultGlobalTestPartResultReporter explicit DefaultGlobalTestPartResultReporter(UnitTestImpl* unit_test); // Implements the TestPartResultReporterInterface. Reports the test part // result in the current test. - virtual void ReportTestPartResult(const TestPartResult& result); + void ReportTestPartResult(const TestPartResult& result) override; private: UnitTestImpl* const unit_test_; @@ -490,7 +495,7 @@ class DefaultPerThreadTestPartResultReporter explicit DefaultPerThreadTestPartResultReporter(UnitTestImpl* unit_test); // Implements the TestPartResultReporterInterface. The implementation just // delegates to the current global test part result reporter of *unit_test_. - virtual void ReportTestPartResult(const TestPartResult& result); + void ReportTestPartResult(const TestPartResult& result) override; private: UnitTestImpl* const unit_test_; @@ -528,22 +533,25 @@ class GTEST_API_ UnitTestImpl { void SetTestPartResultReporterForCurrentThread( TestPartResultReporterInterface* reporter); - // Gets the number of successful test cases. - int successful_test_case_count() const; + // Gets the number of successful test suites. + int successful_test_suite_count() const; - // Gets the number of failed test cases. - int failed_test_case_count() const; + // Gets the number of failed test suites. + int failed_test_suite_count() const; - // Gets the number of all test cases. - int total_test_case_count() const; + // Gets the number of all test suites. + int total_test_suite_count() const; - // Gets the number of all test cases that contain at least one test + // Gets the number of all test suites that contain at least one test // that should run. - int test_case_to_run_count() const; + int test_suite_to_run_count() const; // Gets the number of successful tests. int successful_test_count() const; + // Gets the number of skipped tests. + int skipped_test_count() const; + // Gets the number of failed tests. int failed_test_count() const; @@ -569,27 +577,33 @@ class GTEST_API_ UnitTestImpl { // Gets the elapsed time, in milliseconds. TimeInMillis elapsed_time() const { return elapsed_time_; } - // Returns true iff the unit test passed (i.e. all test cases passed). + // Returns true if and only if the unit test passed (i.e. all test suites + // passed). bool Passed() const { return !Failed(); } - // Returns true iff the unit test failed (i.e. some test case failed - // or something outside of all tests failed). + // Returns true if and only if the unit test failed (i.e. some test suite + // failed or something outside of all tests failed). bool Failed() const { - return failed_test_case_count() > 0 || ad_hoc_test_result()->Failed(); + return failed_test_suite_count() > 0 || ad_hoc_test_result()->Failed(); } - // Gets the i-th test case among all the test cases. i can range from 0 to - // total_test_case_count() - 1. If i is not in that range, returns NULL. - const TestCase* GetTestCase(int i) const { - const int index = GetElementOr(test_case_indices_, i, -1); - return index < 0 ? NULL : test_cases_[i]; + // Gets the i-th test suite among all the test suites. i can range from 0 to + // total_test_suite_count() - 1. If i is not in that range, returns NULL. + const TestSuite* GetTestSuite(int i) const { + const int index = GetElementOr(test_suite_indices_, i, -1); + return index < 0 ? nullptr : test_suites_[static_cast(i)]; } - // Gets the i-th test case among all the test cases. i can range from 0 to - // total_test_case_count() - 1. If i is not in that range, returns NULL. - TestCase* GetMutableTestCase(int i) { - const int index = GetElementOr(test_case_indices_, i, -1); - return index < 0 ? NULL : test_cases_[index]; + // Legacy API is deprecated but still available +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + const TestCase* GetTestCase(int i) const { return GetTestSuite(i); } +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + + // Gets the i-th test suite among all the test suites. i can range from 0 to + // total_test_suite_count() - 1. If i is not in that range, returns NULL. + TestSuite* GetMutableSuiteCase(int i) { + const int index = GetElementOr(test_suite_indices_, i, -1); + return index < 0 ? nullptr : test_suites_[static_cast(index)]; } // Provides access to the event listener list. @@ -626,30 +640,38 @@ class GTEST_API_ UnitTestImpl { // trace but Bar() and CurrentOsStackTraceExceptTop() won't. std::string CurrentOsStackTraceExceptTop(int skip_count) GTEST_NO_INLINE_; - // Finds and returns a TestCase with the given name. If one doesn't + // Finds and returns a TestSuite with the given name. If one doesn't // exist, creates one and returns it. // // Arguments: // - // test_case_name: name of the test case + // test_suite_name: name of the test suite // type_param: the name of the test's type parameter, or NULL if // this is not a typed or a type-parameterized test. - // set_up_tc: pointer to the function that sets up the test case - // tear_down_tc: pointer to the function that tears down the test case - TestCase* GetTestCase(const char* test_case_name, - const char* type_param, - Test::SetUpTestCaseFunc set_up_tc, - Test::TearDownTestCaseFunc tear_down_tc); + // set_up_tc: pointer to the function that sets up the test suite + // tear_down_tc: pointer to the function that tears down the test suite + TestSuite* GetTestSuite(const char* test_suite_name, const char* type_param, + internal::SetUpTestSuiteFunc set_up_tc, + internal::TearDownTestSuiteFunc tear_down_tc); + +// Legacy API is deprecated but still available +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + TestCase* GetTestCase(const char* test_case_name, const char* type_param, + internal::SetUpTestSuiteFunc set_up_tc, + internal::TearDownTestSuiteFunc tear_down_tc) { + return GetTestSuite(test_case_name, type_param, set_up_tc, tear_down_tc); + } +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ // Adds a TestInfo to the unit test. // // Arguments: // - // set_up_tc: pointer to the function that sets up the test case - // tear_down_tc: pointer to the function that tears down the test case + // set_up_tc: pointer to the function that sets up the test suite + // tear_down_tc: pointer to the function that tears down the test suite // test_info: the TestInfo object - void AddTestInfo(Test::SetUpTestCaseFunc set_up_tc, - Test::TearDownTestCaseFunc tear_down_tc, + void AddTestInfo(internal::SetUpTestSuiteFunc set_up_tc, + internal::TearDownTestSuiteFunc tear_down_tc, TestInfo* test_info) { // In order to support thread-safe death tests, we need to // remember the original working directory when the test program @@ -664,21 +686,20 @@ class GTEST_API_ UnitTestImpl { << "Failed to get the current working directory."; } - GetTestCase(test_info->test_case_name(), - test_info->type_param(), - set_up_tc, - tear_down_tc)->AddTestInfo(test_info); + GetTestSuite(test_info->test_suite_name(), test_info->type_param(), + set_up_tc, tear_down_tc) + ->AddTestInfo(test_info); } - // Returns ParameterizedTestCaseRegistry object used to keep track of + // Returns ParameterizedTestSuiteRegistry object used to keep track of // value-parameterized tests and instantiate and register them. - internal::ParameterizedTestCaseRegistry& parameterized_test_registry() { + internal::ParameterizedTestSuiteRegistry& parameterized_test_registry() { return parameterized_test_registry_; } - // Sets the TestCase object for the test that's currently running. - void set_current_test_case(TestCase* a_current_test_case) { - current_test_case_ = a_current_test_case; + // Sets the TestSuite object for the test that's currently running. + void set_current_test_suite(TestSuite* a_current_test_suite) { + current_test_suite_ = a_current_test_suite; } // Sets the TestInfo object for the test that's currently running. If @@ -689,7 +710,7 @@ class GTEST_API_ UnitTestImpl { } // Registers all parameterized tests defined using TEST_P and - // INSTANTIATE_TEST_CASE_P, creating regular tests for each test/parameter + // INSTANTIATE_TEST_SUITE_P, creating regular tests for each test/parameter // combination. This method can be called more then once; it has guards // protecting from registering the tests more then once. If // value-parameterized tests are disabled, RegisterParameterizedTests is @@ -704,7 +725,7 @@ class GTEST_API_ UnitTestImpl { // Clears the results of all tests, except the ad hoc tests. void ClearNonAdHocTestResult() { - ForEach(test_cases_, TestCase::ClearTestCaseResult); + ForEach(test_suites_, TestSuite::ClearTestSuiteResult); } // Clears the results of ad-hoc test assertions. @@ -713,7 +734,7 @@ class GTEST_API_ UnitTestImpl { } // Adds a TestProperty to the current TestResult object when invoked in a - // context of a test or a test case, or to the global property set. If the + // context of a test or a test suite, or to the global property set. If the // result already contains a property with the same key, the value will be // updated. void RecordProperty(const TestProperty& test_property); @@ -725,7 +746,7 @@ class GTEST_API_ UnitTestImpl { // Matches the full name of each test against the user-specified // filter to decide whether the test should run, then records the - // result in each TestCase and TestInfo object. + // result in each TestSuite and TestInfo object. // If shard_tests == HONOR_SHARDING_PROTOCOL, further filters tests // based on sharding variables in the environment. // Returns the number of tests that should run. @@ -734,7 +755,7 @@ class GTEST_API_ UnitTestImpl { // Prints the names of the tests matching the user-specified filter flag. void ListTestsMatchingFilter(); - const TestCase* current_test_case() const { return current_test_case_; } + const TestSuite* current_test_suite() const { return current_test_suite_; } TestInfo* current_test_info() { return current_test_info_; } const TestInfo* current_test_info() const { return current_test_info_; } @@ -795,11 +816,11 @@ class GTEST_API_ UnitTestImpl { // Gets the random number generator. internal::Random* random() { return &random_; } - // Shuffles all test cases, and the tests within each test case, + // Shuffles all test suites, and the tests within each test suite, // making sure that death tests are still run first. void ShuffleTests(); - // Restores the test cases and tests to their order before the first shuffle. + // Restores the test suites and tests to their order before the first shuffle. void UnshuffleTests(); // Returns the value of GTEST_FLAG(catch_exceptions) at the moment @@ -839,31 +860,31 @@ class GTEST_API_ UnitTestImpl { // before/after the tests are run. std::vector environments_; - // The vector of TestCases in their original order. It owns the + // The vector of TestSuites in their original order. It owns the // elements in the vector. - std::vector test_cases_; + std::vector test_suites_; - // Provides a level of indirection for the test case list to allow - // easy shuffling and restoring the test case order. The i-th - // element of this vector is the index of the i-th test case in the + // Provides a level of indirection for the test suite list to allow + // easy shuffling and restoring the test suite order. The i-th + // element of this vector is the index of the i-th test suite in the // shuffled order. - std::vector test_case_indices_; + std::vector test_suite_indices_; // ParameterizedTestRegistry object used to register value-parameterized // tests. - internal::ParameterizedTestCaseRegistry parameterized_test_registry_; + internal::ParameterizedTestSuiteRegistry parameterized_test_registry_; // Indicates whether RegisterParameterizedTests() has been called already. bool parameterized_tests_registered_; - // Index of the last death test case registered. Initially -1. - int last_death_test_case_; + // Index of the last death test suite registered. Initially -1. + int last_death_test_suite_; - // This points to the TestCase for the currently running test. It - // changes as Google Test goes through one test case after another. + // This points to the TestSuite for the currently running test. It + // changes as Google Test goes through one test suite after another. // When no test is running, this is set to NULL and Google Test // stores assertion results in ad_hoc_test_result_. Initially NULL. - TestCase* current_test_case_; + TestSuite* current_test_suite_; // This points to the TestInfo for the currently running test. It // changes as Google Test goes through one test after another. When @@ -891,7 +912,7 @@ class GTEST_API_ UnitTestImpl { // desired. OsStackTraceGetterInterface* os_stack_trace_getter_; - // True iff PostFlagParsingInit() has been called. + // True if and only if PostFlagParsingInit() has been called. bool post_flag_parse_init_performed_; // The random number seed used at the beginning of the test run. @@ -910,8 +931,8 @@ class GTEST_API_ UnitTestImpl { #if GTEST_HAS_DEATH_TEST // The decomposed components of the gtest_internal_run_death_test flag, // parsed when RUN_ALL_TESTS is called. - internal::scoped_ptr internal_run_death_test_flag_; - internal::scoped_ptr death_test_factory_; + std::unique_ptr internal_run_death_test_flag_; + std::unique_ptr death_test_factory_; #endif // GTEST_HAS_DEATH_TEST // A per-thread stack of traces created by the SCOPED_TRACE() macro. @@ -994,8 +1015,6 @@ bool ParseNaturalNumber(const ::std::string& str, Integer* number) { const bool parse_success = *end == '\0' && errno == 0; - // FIXME: Convert this to compile time assertion when it is - // available. GTEST_CHECK_(sizeof(Integer) <= sizeof(parsed)); const Integer result = static_cast(parsed); @@ -1059,18 +1078,18 @@ class StreamingListener : public EmptyTestEventListener { MakeConnection(); } - virtual ~SocketWriter() { + ~SocketWriter() override { if (sockfd_ != -1) CloseConnection(); } // Sends a string to the socket. - virtual void Send(const std::string& message) { + void Send(const std::string& message) override { GTEST_CHECK_(sockfd_ != -1) << "Send() can be called only when there is a connection."; - const int len = static_cast(message.length()); - if (write(sockfd_, message.c_str(), len) != len) { + const auto len = static_cast(message.length()); + if (write(sockfd_, message.c_str(), len) != static_cast(len)) { GTEST_LOG_(WARNING) << "stream_result_to: failed to stream to " << host_name_ << ":" << port_num_; @@ -1082,7 +1101,7 @@ class StreamingListener : public EmptyTestEventListener { void MakeConnection(); // Closes the socket. - void CloseConnection() { + void CloseConnection() override { GTEST_CHECK_(sockfd_ != -1) << "CloseConnection() can be called only when there is a connection."; @@ -1108,11 +1127,11 @@ class StreamingListener : public EmptyTestEventListener { explicit StreamingListener(AbstractSocketWriter* socket_writer) : socket_writer_(socket_writer) { Start(); } - void OnTestProgramStart(const UnitTest& /* unit_test */) { + void OnTestProgramStart(const UnitTest& /* unit_test */) override { SendLn("event=TestProgramStart"); } - void OnTestProgramEnd(const UnitTest& unit_test) { + void OnTestProgramEnd(const UnitTest& unit_test) override { // Note that Google Test current only report elapsed time for each // test iteration, not for the entire test program. SendLn("event=TestProgramEnd&passed=" + FormatBool(unit_test.Passed())); @@ -1121,42 +1140,47 @@ class StreamingListener : public EmptyTestEventListener { socket_writer_->CloseConnection(); } - void OnTestIterationStart(const UnitTest& /* unit_test */, int iteration) { + void OnTestIterationStart(const UnitTest& /* unit_test */, + int iteration) override { SendLn("event=TestIterationStart&iteration=" + StreamableToString(iteration)); } - void OnTestIterationEnd(const UnitTest& unit_test, int /* iteration */) { + void OnTestIterationEnd(const UnitTest& unit_test, + int /* iteration */) override { SendLn("event=TestIterationEnd&passed=" + FormatBool(unit_test.Passed()) + "&elapsed_time=" + StreamableToString(unit_test.elapsed_time()) + "ms"); } - void OnTestCaseStart(const TestCase& test_case) { + // Note that "event=TestCaseStart" is a wire format and has to remain + // "case" for compatibilty + void OnTestCaseStart(const TestCase& test_case) override { SendLn(std::string("event=TestCaseStart&name=") + test_case.name()); } - void OnTestCaseEnd(const TestCase& test_case) { - SendLn("event=TestCaseEnd&passed=" + FormatBool(test_case.Passed()) - + "&elapsed_time=" + StreamableToString(test_case.elapsed_time()) - + "ms"); + // Note that "event=TestCaseEnd" is a wire format and has to remain + // "case" for compatibilty + void OnTestCaseEnd(const TestCase& test_case) override { + SendLn("event=TestCaseEnd&passed=" + FormatBool(test_case.Passed()) + + "&elapsed_time=" + StreamableToString(test_case.elapsed_time()) + + "ms"); } - void OnTestStart(const TestInfo& test_info) { + void OnTestStart(const TestInfo& test_info) override { SendLn(std::string("event=TestStart&name=") + test_info.name()); } - void OnTestEnd(const TestInfo& test_info) { + void OnTestEnd(const TestInfo& test_info) override { SendLn("event=TestEnd&passed=" + FormatBool((test_info.result())->Passed()) + "&elapsed_time=" + StreamableToString((test_info.result())->elapsed_time()) + "ms"); } - void OnTestPartResult(const TestPartResult& test_part_result) { + void OnTestPartResult(const TestPartResult& test_part_result) override { const char* file_name = test_part_result.file_name(); - if (file_name == NULL) - file_name = ""; + if (file_name == nullptr) file_name = ""; SendLn("event=TestPartResult&file=" + UrlEncode(file_name) + "&line=" + StreamableToString(test_part_result.line_number()) + "&message=" + UrlEncode(test_part_result.message())); @@ -1172,7 +1196,7 @@ class StreamingListener : public EmptyTestEventListener { std::string FormatBool(bool value) { return value ? "1" : "0"; } - const scoped_ptr socket_writer_; + const std::unique_ptr socket_writer_; GTEST_DISALLOW_COPY_AND_ASSIGN_(StreamingListener); }; // class StreamingListener diff --git a/third_party/googletest/src/src/gtest-matchers.cc b/third_party/googletest/src/src/gtest-matchers.cc new file mode 100644 index 0000000000..7d2fb6851e --- /dev/null +++ b/third_party/googletest/src/src/gtest-matchers.cc @@ -0,0 +1,97 @@ +// Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// The Google C++ Testing and Mocking Framework (Google Test) +// +// This file implements just enough of the matcher interface to allow +// EXPECT_DEATH and friends to accept a matcher argument. + +#include "gtest/internal/gtest-internal.h" +#include "gtest/internal/gtest-port.h" +#include "gtest/gtest-matchers.h" + +#include + +namespace testing { + +// Constructs a matcher that matches a const std::string& whose value is +// equal to s. +Matcher::Matcher(const std::string& s) { *this = Eq(s); } + +// Constructs a matcher that matches a const std::string& whose value is +// equal to s. +Matcher::Matcher(const char* s) { + *this = Eq(std::string(s)); +} + +// Constructs a matcher that matches a std::string whose value is equal to +// s. +Matcher::Matcher(const std::string& s) { *this = Eq(s); } + +// Constructs a matcher that matches a std::string whose value is equal to +// s. +Matcher::Matcher(const char* s) { *this = Eq(std::string(s)); } + +#if GTEST_HAS_ABSL +// Constructs a matcher that matches a const absl::string_view& whose value is +// equal to s. +Matcher::Matcher(const std::string& s) { + *this = Eq(s); +} + +// Constructs a matcher that matches a const absl::string_view& whose value is +// equal to s. +Matcher::Matcher(const char* s) { + *this = Eq(std::string(s)); +} + +// Constructs a matcher that matches a const absl::string_view& whose value is +// equal to s. +Matcher::Matcher(absl::string_view s) { + *this = Eq(std::string(s)); +} + +// Constructs a matcher that matches a absl::string_view whose value is equal to +// s. +Matcher::Matcher(const std::string& s) { *this = Eq(s); } + +// Constructs a matcher that matches a absl::string_view whose value is equal to +// s. +Matcher::Matcher(const char* s) { + *this = Eq(std::string(s)); +} + +// Constructs a matcher that matches a absl::string_view whose value is equal to +// s. +Matcher::Matcher(absl::string_view s) { + *this = Eq(std::string(s)); +} +#endif // GTEST_HAS_ABSL + +} // namespace testing diff --git a/third_party/googletest/src/src/gtest-port.cc b/third_party/googletest/src/src/gtest-port.cc index fecb5d11c2..fc5ba6becc 100644 --- a/third_party/googletest/src/src/gtest-port.cc +++ b/third_party/googletest/src/src/gtest-port.cc @@ -31,16 +31,20 @@ #include "gtest/internal/gtest-port.h" #include -#include #include +#include #include #include +#include #if GTEST_OS_WINDOWS # include # include # include # include // Used in ThreadLocal. +# ifdef _MSC_VER +# include +# endif // _MSC_VER #else # include #endif // GTEST_OS_WINDOWS @@ -51,6 +55,14 @@ # include #endif // GTEST_OS_MAC +#if GTEST_OS_DRAGONFLY || GTEST_OS_FREEBSD || GTEST_OS_GNU_KFREEBSD || \ + GTEST_OS_NETBSD || GTEST_OS_OPENBSD +# include +# if GTEST_OS_DRAGONFLY || GTEST_OS_FREEBSD || GTEST_OS_GNU_KFREEBSD +# include +# endif +#endif + #if GTEST_OS_QNX # include # include @@ -105,7 +117,7 @@ T ReadProcFileField(const std::string& filename, int field) { size_t GetThreadCount() { const std::string filename = (Message() << "/proc/" << getpid() << "/stat").GetString(); - return ReadProcFileField(filename, 19); + return ReadProcFileField(filename, 19); } #elif GTEST_OS_MAC @@ -127,6 +139,81 @@ size_t GetThreadCount() { } } +#elif GTEST_OS_DRAGONFLY || GTEST_OS_FREEBSD || GTEST_OS_GNU_KFREEBSD || \ + GTEST_OS_NETBSD + +#if GTEST_OS_NETBSD +#undef KERN_PROC +#define KERN_PROC KERN_PROC2 +#define kinfo_proc kinfo_proc2 +#endif + +#if GTEST_OS_DRAGONFLY +#define KP_NLWP(kp) (kp.kp_nthreads) +#elif GTEST_OS_FREEBSD || GTEST_OS_GNU_KFREEBSD +#define KP_NLWP(kp) (kp.ki_numthreads) +#elif GTEST_OS_NETBSD +#define KP_NLWP(kp) (kp.p_nlwps) +#endif + +// Returns the number of threads running in the process, or 0 to indicate that +// we cannot detect it. +size_t GetThreadCount() { + int mib[] = { + CTL_KERN, + KERN_PROC, + KERN_PROC_PID, + getpid(), +#if GTEST_OS_NETBSD + sizeof(struct kinfo_proc), + 1, +#endif + }; + u_int miblen = sizeof(mib) / sizeof(mib[0]); + struct kinfo_proc info; + size_t size = sizeof(info); + if (sysctl(mib, miblen, &info, &size, NULL, 0)) { + return 0; + } + return static_cast(KP_NLWP(info)); +} +#elif GTEST_OS_OPENBSD + +// Returns the number of threads running in the process, or 0 to indicate that +// we cannot detect it. +size_t GetThreadCount() { + int mib[] = { + CTL_KERN, + KERN_PROC, + KERN_PROC_PID | KERN_PROC_SHOW_THREADS, + getpid(), + sizeof(struct kinfo_proc), + 0, + }; + u_int miblen = sizeof(mib) / sizeof(mib[0]); + + // get number of structs + size_t size; + if (sysctl(mib, miblen, NULL, &size, NULL, 0)) { + return 0; + } + mib[5] = size / mib[4]; + + // populate array of structs + struct kinfo_proc info[mib[5]]; + if (sysctl(mib, miblen, &info, &size, NULL, 0)) { + return 0; + } + + // exclude empty members + int nthreads = 0; + for (int i = 0; i < size / mib[4]; i++) { + if (info[i].p_tid != -1) + nthreads++; + } + return nthreads; +} + #elif GTEST_OS_QNX // Returns the number of threads running in the process, or 0 to indicate that @@ -138,7 +225,7 @@ size_t GetThreadCount() { } procfs_info process_info; const int status = - devctl(fd, DCMD_PROC_INFO, &process_info, sizeof(process_info), NULL); + devctl(fd, DCMD_PROC_INFO, &process_info, sizeof(process_info), nullptr); close(fd); if (status == EOK) { return static_cast(process_info.num_threads); @@ -152,7 +239,7 @@ size_t GetThreadCount() { size_t GetThreadCount() { struct procentry64 entry; pid_t pid = getpid(); - int status = getprocs64(&entry, sizeof(entry), NULL, 0, &pid, 1); + int status = getprocs64(&entry, sizeof(entry), nullptr, 0, &pid, 1); if (status == 1) { return entry.pi_thcount; } else { @@ -192,7 +279,7 @@ size_t GetThreadCount() { #if GTEST_IS_THREADSAFE && GTEST_OS_WINDOWS void SleepMilliseconds(int n) { - ::Sleep(n); + ::Sleep(static_cast(n)); } AutoHandle::AutoHandle() @@ -230,15 +317,15 @@ void AutoHandle::Reset(HANDLE handle) { bool AutoHandle::IsCloseable() const { // Different Windows APIs may use either of these values to represent an // invalid handle. - return handle_ != NULL && handle_ != INVALID_HANDLE_VALUE; + return handle_ != nullptr && handle_ != INVALID_HANDLE_VALUE; } Notification::Notification() - : event_(::CreateEvent(NULL, // Default security attributes. - TRUE, // Do not reset automatically. - FALSE, // Initially unset. - NULL)) { // Anonymous event. - GTEST_CHECK_(event_.Get() != NULL); + : event_(::CreateEvent(nullptr, // Default security attributes. + TRUE, // Do not reset automatically. + FALSE, // Initially unset. + nullptr)) { // Anonymous event. + GTEST_CHECK_(event_.Get() != nullptr); } void Notification::Notify() { @@ -261,13 +348,10 @@ Mutex::Mutex() Mutex::~Mutex() { // Static mutexes are leaked intentionally. It is not thread-safe to try // to clean them up. - // FIXME: Switch to Slim Reader/Writer (SRW) Locks, which requires - // nothing to clean it up but is available only on Vista and later. - // https://docs.microsoft.com/en-us/windows/desktop/Sync/slim-reader-writer--srw--locks if (type_ == kDynamic) { ::DeleteCriticalSection(critical_section_); delete critical_section_; - critical_section_ = NULL; + critical_section_ = nullptr; } } @@ -296,6 +380,7 @@ void Mutex::AssertHeld() { namespace { +#ifdef _MSC_VER // Use the RAII idiom to flag mem allocs that are intentionally never // deallocated. The motivation is to silence the false positive mem leaks // that are reported by the debug version of MS's CRT which can only detect @@ -308,19 +393,15 @@ class MemoryIsNotDeallocated { public: MemoryIsNotDeallocated() : old_crtdbg_flag_(0) { -#ifdef _MSC_VER old_crtdbg_flag_ = _CrtSetDbgFlag(_CRTDBG_REPORT_FLAG); // Set heap allocation block type to _IGNORE_BLOCK so that MS debug CRT // doesn't report mem leak if there's no matching deallocation. _CrtSetDbgFlag(old_crtdbg_flag_ & ~_CRTDBG_ALLOC_MEM_DF); -#endif // _MSC_VER } ~MemoryIsNotDeallocated() { -#ifdef _MSC_VER // Restore the original _CRTDBG_ALLOC_MEM_DF flag _CrtSetDbgFlag(old_crtdbg_flag_); -#endif // _MSC_VER } private: @@ -328,6 +409,7 @@ class MemoryIsNotDeallocated GTEST_DISALLOW_COPY_AND_ASSIGN_(MemoryIsNotDeallocated); }; +#endif // _MSC_VER } // namespace @@ -343,7 +425,9 @@ void Mutex::ThreadSafeLazyInit() { owner_thread_id_ = 0; { // Use RAII to flag that following mem alloc is never deallocated. +#ifdef _MSC_VER MemoryIsNotDeallocated memory_is_not_deallocated; +#endif // _MSC_VER critical_section_ = new CRITICAL_SECTION; } ::InitializeCriticalSection(critical_section_); @@ -384,17 +468,16 @@ class ThreadWithParamSupport : public ThreadWithParamBase { Notification* thread_can_start) { ThreadMainParam* param = new ThreadMainParam(runnable, thread_can_start); DWORD thread_id; - // FIXME: Consider to use _beginthreadex instead. HANDLE thread_handle = ::CreateThread( - NULL, // Default security. - 0, // Default stack size. + nullptr, // Default security. + 0, // Default stack size. &ThreadWithParamSupport::ThreadMain, - param, // Parameter to ThreadMainStatic - 0x0, // Default creation flags. + param, // Parameter to ThreadMainStatic + 0x0, // Default creation flags. &thread_id); // Need a valid pointer for the call to work under Win98. - GTEST_CHECK_(thread_handle != NULL) << "CreateThread failed with error " - << ::GetLastError() << "."; - if (thread_handle == NULL) { + GTEST_CHECK_(thread_handle != nullptr) + << "CreateThread failed with error " << ::GetLastError() << "."; + if (thread_handle == nullptr) { delete param; } return thread_handle; @@ -406,15 +489,15 @@ class ThreadWithParamSupport : public ThreadWithParamBase { : runnable_(runnable), thread_can_start_(thread_can_start) { } - scoped_ptr runnable_; + std::unique_ptr runnable_; // Does not own. Notification* thread_can_start_; }; static DWORD WINAPI ThreadMain(void* ptr) { // Transfers ownership. - scoped_ptr param(static_cast(ptr)); - if (param->thread_can_start_ != NULL) + std::unique_ptr param(static_cast(ptr)); + if (param->thread_can_start_ != nullptr) param->thread_can_start_->WaitForNotification(); param->runnable_->Run(); return 0; @@ -472,7 +555,7 @@ class ThreadLocalRegistryImpl { thread_local_values .insert(std::make_pair( thread_local_instance, - linked_ptr( + std::shared_ptr( thread_local_instance->NewValueForCurrentThread()))) .first; } @@ -481,7 +564,7 @@ class ThreadLocalRegistryImpl { static void OnThreadLocalDestroyed( const ThreadLocalBase* thread_local_instance) { - std::vector > value_holders; + std::vector > value_holders; // Clean up the ThreadLocalValues data structure while holding the lock, but // defer the destruction of the ThreadLocalValueHolderBases. { @@ -509,7 +592,7 @@ class ThreadLocalRegistryImpl { static void OnThreadExit(DWORD thread_id) { GTEST_CHECK_(thread_id != 0) << ::GetLastError(); - std::vector > value_holders; + std::vector > value_holders; // Clean up the ThreadIdToThreadLocals data structure while holding the // lock, but defer the destruction of the ThreadLocalValueHolderBases. { @@ -536,7 +619,8 @@ class ThreadLocalRegistryImpl { private: // In a particular thread, maps a ThreadLocal object to its value. typedef std::map > ThreadLocalValues; + std::shared_ptr > + ThreadLocalValues; // Stores all ThreadIdToThreadLocals having values in a thread, indexed by // thread's ID. typedef std::map ThreadIdToThreadLocals; @@ -551,18 +635,17 @@ class ThreadLocalRegistryImpl { HANDLE thread = ::OpenThread(SYNCHRONIZE | THREAD_QUERY_INFORMATION, FALSE, thread_id); - GTEST_CHECK_(thread != NULL); + GTEST_CHECK_(thread != nullptr); // We need to pass a valid thread ID pointer into CreateThread for it // to work correctly under Win98. DWORD watcher_thread_id; HANDLE watcher_thread = ::CreateThread( - NULL, // Default security. - 0, // Default stack size + nullptr, // Default security. + 0, // Default stack size &ThreadLocalRegistryImpl::WatcherThreadFunc, reinterpret_cast(new ThreadIdAndHandle(thread_id, thread)), - CREATE_SUSPENDED, - &watcher_thread_id); - GTEST_CHECK_(watcher_thread != NULL); + CREATE_SUSPENDED, &watcher_thread_id); + GTEST_CHECK_(watcher_thread != nullptr); // Give the watcher thread the same priority as ours to avoid being // blocked by it. ::SetThreadPriority(watcher_thread, @@ -587,7 +670,9 @@ class ThreadLocalRegistryImpl { // Returns map of thread local instances. static ThreadIdToThreadLocals* GetThreadLocalsMapLocked() { mutex_.AssertHeld(); +#ifdef _MSC_VER MemoryIsNotDeallocated memory_is_not_deallocated; +#endif // _MSC_VER static ThreadIdToThreadLocals* map = new ThreadIdToThreadLocals(); return map; } @@ -630,7 +715,7 @@ RE::~RE() { free(const_cast(pattern_)); } -// Returns true iff regular expression re matches the entire str. +// Returns true if and only if regular expression re matches the entire str. bool RE::FullMatch(const char* str, const RE& re) { if (!re.is_valid_) return false; @@ -638,8 +723,8 @@ bool RE::FullMatch(const char* str, const RE& re) { return regexec(&re.full_regex_, str, 1, &match, 0) == 0; } -// Returns true iff regular expression re matches a substring of str -// (including str itself). +// Returns true if and only if regular expression re matches a substring of +// str (including str itself). bool RE::PartialMatch(const char* str, const RE& re) { if (!re.is_valid_) return false; @@ -679,14 +764,14 @@ void RE::Init(const char* regex) { #elif GTEST_USES_SIMPLE_RE -// Returns true iff ch appears anywhere in str (excluding the +// Returns true if and only if ch appears anywhere in str (excluding the // terminating '\0' character). bool IsInSet(char ch, const char* str) { - return ch != '\0' && strchr(str, ch) != NULL; + return ch != '\0' && strchr(str, ch) != nullptr; } -// Returns true iff ch belongs to the given classification. Unlike -// similar functions in , these aren't affected by the +// Returns true if and only if ch belongs to the given classification. +// Unlike similar functions in , these aren't affected by the // current locale. bool IsAsciiDigit(char ch) { return '0' <= ch && ch <= '9'; } bool IsAsciiPunct(char ch) { @@ -699,13 +784,13 @@ bool IsAsciiWordChar(char ch) { ('0' <= ch && ch <= '9') || ch == '_'; } -// Returns true iff "\\c" is a supported escape sequence. +// Returns true if and only if "\\c" is a supported escape sequence. bool IsValidEscape(char c) { return (IsAsciiPunct(c) || IsInSet(c, "dDfnrsStvwW")); } -// Returns true iff the given atom (specified by escaped and pattern) -// matches ch. The result is undefined if the atom is invalid. +// Returns true if and only if the given atom (specified by escaped and +// pattern) matches ch. The result is undefined if the atom is invalid. bool AtomMatchesChar(bool escaped, char pattern_char, char ch) { if (escaped) { // "\\p" where p is pattern_char. switch (pattern_char) { @@ -736,17 +821,14 @@ static std::string FormatRegexSyntaxError(const char* regex, int index) { // Generates non-fatal failures and returns false if regex is invalid; // otherwise returns true. bool ValidateRegex(const char* regex) { - if (regex == NULL) { - // FIXME: fix the source file location in the - // assertion failures to match where the regex is used in user - // code. + if (regex == nullptr) { ADD_FAILURE() << "NULL is not a valid simple regular expression."; return false; } bool is_valid = true; - // True iff ?, *, or + can follow the previous atom. + // True if and only if ?, *, or + can follow the previous atom. bool prev_repeatable = false; for (int i = 0; regex[i]; i++) { if (regex[i] == '\\') { // An escape sequence @@ -822,8 +904,8 @@ bool MatchRepetitionAndRegexAtHead( return false; } -// Returns true iff regex matches a prefix of str. regex must be a -// valid simple regular expression and not start with "^", or the +// Returns true if and only if regex matches a prefix of str. regex must +// be a valid simple regular expression and not start with "^", or the // result is undefined. bool MatchRegexAtHead(const char* regex, const char* str) { if (*regex == '\0') // An empty regex matches a prefix of anything. @@ -853,8 +935,8 @@ bool MatchRegexAtHead(const char* regex, const char* str) { } } -// Returns true iff regex matches any substring of str. regex must be -// a valid simple regular expression, or the result is undefined. +// Returns true if and only if regex matches any substring of str. regex must +// be a valid simple regular expression, or the result is undefined. // // The algorithm is recursive, but the recursion depth doesn't exceed // the regex length, so we won't need to worry about running out of @@ -862,8 +944,7 @@ bool MatchRegexAtHead(const char* regex, const char* str) { // exponential with respect to the regex length + the string length, // but usually it's must faster (often close to linear). bool MatchRegexAnywhere(const char* regex, const char* str) { - if (regex == NULL || str == NULL) - return false; + if (regex == nullptr || str == nullptr) return false; if (*regex == '^') return MatchRegexAtHead(regex + 1, str); @@ -883,21 +964,21 @@ RE::~RE() { free(const_cast(full_pattern_)); } -// Returns true iff regular expression re matches the entire str. +// Returns true if and only if regular expression re matches the entire str. bool RE::FullMatch(const char* str, const RE& re) { return re.is_valid_ && MatchRegexAnywhere(re.full_pattern_, str); } -// Returns true iff regular expression re matches a substring of str -// (including str itself). +// Returns true if and only if regular expression re matches a substring of +// str (including str itself). bool RE::PartialMatch(const char* str, const RE& re) { return re.is_valid_ && MatchRegexAnywhere(re.pattern_, str); } // Initializes an RE from its string representation. void RE::Init(const char* regex) { - pattern_ = full_pattern_ = NULL; - if (regex != NULL) { + pattern_ = full_pattern_ = nullptr; + if (regex != nullptr) { pattern_ = posix::StrDup(regex); } @@ -935,7 +1016,7 @@ const char kUnknownFile[] = "unknown file"; // Formats a source file path and a line number as they would appear // in an error message from the compiler used to compile this code. GTEST_API_ ::std::string FormatFileLocation(const char* file, int line) { - const std::string file_name(file == NULL ? kUnknownFile : file); + const std::string file_name(file == nullptr ? kUnknownFile : file); if (line < 0) { return file_name + ":"; @@ -954,7 +1035,7 @@ GTEST_API_ ::std::string FormatFileLocation(const char* file, int line) { // to the file location it produces, unlike FormatFileLocation(). GTEST_API_ ::std::string FormatCompilerIndependentFileLocation( const char* file, int line) { - const std::string file_name(file == NULL ? kUnknownFile : file); + const std::string file_name(file == nullptr ? kUnknownFile : file); if (line < 0) return file_name; @@ -1021,20 +1102,22 @@ class CapturedStream { // code as part of a regular standalone executable, which doesn't // run in a Dalvik process (e.g. when running it through 'adb shell'). // - // The location /sdcard is directly accessible from native code - // and is the only location (unofficially) supported by the Android - // team. It's generally a symlink to the real SD Card mount point - // which can be /mnt/sdcard, /mnt/sdcard0, /system/media/sdcard, or - // other OEM-customized locations. Never rely on these, and always - // use /sdcard. - char name_template[] = "/sdcard/gtest_captured_stream.XXXXXX"; + // The location /data/local/tmp is directly accessible from native code. + // '/sdcard' and other variants cannot be relied on, as they are not + // guaranteed to be mounted, or may have a delay in mounting. + char name_template[] = "/data/local/tmp/gtest_captured_stream.XXXXXX"; # else char name_template[] = "/tmp/captured_stream.XXXXXX"; # endif // GTEST_OS_LINUX_ANDROID const int captured_fd = mkstemp(name_template); + if (captured_fd == -1) { + GTEST_LOG_(WARNING) + << "Failed to create tmp file " << name_template + << " for test; does the test have access to the /tmp directory?"; + } filename_ = name_template; # endif // GTEST_OS_WINDOWS - fflush(NULL); + fflush(nullptr); dup2(captured_fd, fd_); close(captured_fd); } @@ -1046,13 +1129,17 @@ class CapturedStream { std::string GetCapturedString() { if (uncaptured_fd_ != -1) { // Restores the original stream. - fflush(NULL); + fflush(nullptr); dup2(uncaptured_fd_, fd_); close(uncaptured_fd_); uncaptured_fd_ = -1; } FILE* const file = posix::FOpen(filename_.c_str(), "r"); + if (file == nullptr) { + GTEST_LOG_(FATAL) << "Failed to open tmp file " << filename_ + << " for capturing stream."; + } const std::string content = ReadEntireFile(file); posix::FClose(file); return content; @@ -1069,13 +1156,13 @@ class CapturedStream { GTEST_DISABLE_MSC_DEPRECATED_POP_() -static CapturedStream* g_captured_stderr = NULL; -static CapturedStream* g_captured_stdout = NULL; +static CapturedStream* g_captured_stderr = nullptr; +static CapturedStream* g_captured_stdout = nullptr; // Starts capturing an output stream (stdout/stderr). static void CaptureStream(int fd, const char* stream_name, CapturedStream** stream) { - if (*stream != NULL) { + if (*stream != nullptr) { GTEST_LOG_(FATAL) << "Only one " << stream_name << " capturer can exist at a time."; } @@ -1087,7 +1174,7 @@ static std::string GetCapturedStream(CapturedStream** captured_stream) { const std::string content = (*captured_stream)->GetCapturedString(); delete *captured_stream; - *captured_stream = NULL; + *captured_stream = nullptr; return content; } @@ -1146,10 +1233,11 @@ std::string ReadEntireFile(FILE* file) { } #if GTEST_HAS_DEATH_TEST -static const std::vector* g_injected_test_argvs = NULL; // Owned. +static const std::vector* g_injected_test_argvs = + nullptr; // Owned. std::vector GetInjectableArgvs() { - if (g_injected_test_argvs != NULL) { + if (g_injected_test_argvs != nullptr) { return *g_injected_test_argvs; } return GetArgvs(); @@ -1165,16 +1253,9 @@ void SetInjectableArgvs(const std::vector& new_argvs) { new std::vector(new_argvs.begin(), new_argvs.end())); } -#if GTEST_HAS_GLOBAL_STRING -void SetInjectableArgvs(const std::vector< ::string>& new_argvs) { - SetInjectableArgvs( - new std::vector(new_argvs.begin(), new_argvs.end())); -} -#endif // GTEST_HAS_GLOBAL_STRING - void ClearInjectableArgvs() { delete g_injected_test_argvs; - g_injected_test_argvs = NULL; + g_injected_test_argvs = nullptr; } #endif // GTEST_HAS_DEATH_TEST @@ -1207,7 +1288,7 @@ static std::string FlagToEnvVar(const char* flag) { // unchanged and returns false. bool ParseInt32(const Message& src_text, const char* str, Int32* value) { // Parses the environment variable as a decimal integer. - char* end = NULL; + char* end = nullptr; const long long_value = strtol(str, &end, 10); // NOLINT // Has strtol() consumed all characters in the string? @@ -1246,15 +1327,15 @@ bool ParseInt32(const Message& src_text, const char* str, Int32* value) { // Reads and returns the Boolean environment variable corresponding to // the given flag; if it's not set, returns default_value. // -// The value is considered true iff it's not "0". +// The value is considered true if and only if it's not "0". bool BoolFromGTestEnv(const char* flag, bool default_value) { #if defined(GTEST_GET_BOOL_FROM_ENV_) return GTEST_GET_BOOL_FROM_ENV_(flag, default_value); #else const std::string env_var = FlagToEnvVar(flag); const char* const string_value = posix::GetEnv(env_var.c_str()); - return string_value == NULL ? - default_value : strcmp(string_value, "0") != 0; + return string_value == nullptr ? default_value + : strcmp(string_value, "0") != 0; #endif // defined(GTEST_GET_BOOL_FROM_ENV_) } @@ -1267,7 +1348,7 @@ Int32 Int32FromGTestEnv(const char* flag, Int32 default_value) { #else const std::string env_var = FlagToEnvVar(flag); const char* const string_value = posix::GetEnv(env_var.c_str()); - if (string_value == NULL) { + if (string_value == nullptr) { // The environment variable is not set. return default_value; } @@ -1296,7 +1377,7 @@ Int32 Int32FromGTestEnv(const char* flag, Int32 default_value) { std::string OutputFlagAlsoCheckEnvVar(){ std::string default_value_for_output_flag = ""; const char* xml_output_file_env = posix::GetEnv("XML_OUTPUT_FILE"); - if (NULL != xml_output_file_env) { + if (nullptr != xml_output_file_env) { default_value_for_output_flag = std::string("xml:") + xml_output_file_env; } return default_value_for_output_flag; @@ -1310,7 +1391,7 @@ const char* StringFromGTestEnv(const char* flag, const char* default_value) { #else const std::string env_var = FlagToEnvVar(flag); const char* const value = posix::GetEnv(env_var.c_str()); - return value == NULL ? default_value : value; + return value == nullptr ? default_value : value; #endif // defined(GTEST_GET_STRING_FROM_ENV_) } diff --git a/third_party/googletest/src/src/gtest-printers.cc b/third_party/googletest/src/src/gtest-printers.cc index b5022549f9..3337be312e 100644 --- a/third_party/googletest/src/src/gtest-printers.cc +++ b/third_party/googletest/src/src/gtest-printers.cc @@ -59,6 +59,7 @@ using ::std::ostream; // Prints a segment of bytes in the given object. GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ +GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_ GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ void PrintByteSegmentInObjectTo(const unsigned char* obj_bytes, size_t start, size_t count, ostream* os) { @@ -89,7 +90,6 @@ void PrintBytesInObjectToImpl(const unsigned char* obj_bytes, size_t count, // If the object size is bigger than kThreshold, we'll have to omit // some details by printing only the first and the last kChunkSize // bytes. - // FIXME: let the user control the threshold using a flag. if (count < kThreshold) { PrintByteSegmentInObjectTo(obj_bytes, 0, count, os); } else { @@ -144,7 +144,8 @@ inline bool IsPrintableAscii(wchar_t c) { // which is the type of c. template static CharFormat PrintAsCharLiteralTo(Char c, ostream* os) { - switch (static_cast(c)) { + wchar_t w_c = static_cast(c); + switch (w_c) { case L'\0': *os << "\\0"; break; @@ -176,7 +177,7 @@ static CharFormat PrintAsCharLiteralTo(Char c, ostream* os) { *os << "\\v"; break; default: - if (IsPrintableAscii(c)) { + if (IsPrintableAscii(w_c)) { *os << static_cast(c); return kAsIs; } else { @@ -236,7 +237,7 @@ void PrintCharAndCodeTo(Char c, ostream* os) { if (format == kHexEscape || (1 <= c && c <= 9)) { // Do nothing. } else { - *os << ", 0x" << String::FormatHexInt(static_cast(c)); + *os << ", 0x" << String::FormatHexInt(static_cast(c)); } *os << ")"; } @@ -261,6 +262,7 @@ void PrintTo(wchar_t wc, ostream* os) { template GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ +GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_ GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ static CharFormat PrintCharsAsStringTo( const CharType* begin, size_t len, ostream* os) { @@ -291,6 +293,7 @@ static CharFormat PrintCharsAsStringTo( template GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ +GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_ GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ static void UniversalPrintCharArray( const CharType* begin, size_t len, ostream* os) { @@ -327,7 +330,7 @@ void UniversalPrintArray(const wchar_t* begin, size_t len, ostream* os) { // Prints the given C string to the ostream. void PrintTo(const char* s, ostream* os) { - if (s == NULL) { + if (s == nullptr) { *os << "NULL"; } else { *os << ImplicitCast_(s) << " pointing to "; @@ -344,7 +347,7 @@ void PrintTo(const char* s, ostream* os) { #if !defined(_MSC_VER) || defined(_NATIVE_WCHAR_T_DEFINED) // Prints the given wide C string to the ostream. void PrintTo(const wchar_t* s, ostream* os) { - if (s == NULL) { + if (s == nullptr) { *os << "NULL"; } else { *os << ImplicitCast_(s) << " pointing to "; @@ -420,17 +423,6 @@ void ConditionalPrintAsText(const char* str, size_t length, ostream* os) { } // anonymous namespace -// Prints a ::string object. -#if GTEST_HAS_GLOBAL_STRING -void PrintStringTo(const ::string& s, ostream* os) { - if (PrintCharsAsStringTo(s.data(), s.size(), os) == kHexEscape) { - if (GTEST_FLAG(print_utf8)) { - ConditionalPrintAsText(s.data(), s.size(), os); - } - } -} -#endif // GTEST_HAS_GLOBAL_STRING - void PrintStringTo(const ::std::string& s, ostream* os) { if (PrintCharsAsStringTo(s.data(), s.size(), os) == kHexEscape) { if (GTEST_FLAG(print_utf8)) { @@ -439,13 +431,6 @@ void PrintStringTo(const ::std::string& s, ostream* os) { } } -// Prints a ::wstring object. -#if GTEST_HAS_GLOBAL_WSTRING -void PrintWideStringTo(const ::wstring& s, ostream* os) { - PrintCharsAsStringTo(s.data(), s.size(), os); -} -#endif // GTEST_HAS_GLOBAL_WSTRING - #if GTEST_HAS_STD_WSTRING void PrintWideStringTo(const ::std::wstring& s, ostream* os) { PrintCharsAsStringTo(s.data(), s.size(), os); diff --git a/third_party/googletest/src/src/gtest-test-part.cc b/third_party/googletest/src/src/gtest-test-part.cc index c88860d923..178317a6bc 100644 --- a/third_party/googletest/src/src/gtest-test-part.cc +++ b/third_party/googletest/src/src/gtest-test-part.cc @@ -41,18 +41,21 @@ using internal::GetUnitTestImpl; // in it. std::string TestPartResult::ExtractSummary(const char* message) { const char* const stack_trace = strstr(message, internal::kStackTraceMarker); - return stack_trace == NULL ? message : - std::string(message, stack_trace); + return stack_trace == nullptr ? message : std::string(message, stack_trace); } // Prints a TestPartResult object. std::ostream& operator<<(std::ostream& os, const TestPartResult& result) { - return os - << result.file_name() << ":" << result.line_number() << ": " - << (result.type() == TestPartResult::kSuccess ? "Success" : - result.type() == TestPartResult::kFatalFailure ? "Fatal failure" : - "Non-fatal failure") << ":\n" - << result.message() << std::endl; + return os << result.file_name() << ":" << result.line_number() << ": " + << (result.type() == TestPartResult::kSuccess + ? "Success" + : result.type() == TestPartResult::kSkip + ? "Skipped" + : result.type() == TestPartResult::kFatalFailure + ? "Fatal failure" + : "Non-fatal failure") + << ":\n" + << result.message() << std::endl; } // Appends a TestPartResult to the array. @@ -67,7 +70,7 @@ const TestPartResult& TestPartResultArray::GetTestPartResult(int index) const { internal::posix::Abort(); } - return array_[index]; + return array_[static_cast(index)]; } // Returns the number of TestPartResult objects in the array. diff --git a/third_party/googletest/src/src/gtest-typed-test.cc b/third_party/googletest/src/src/gtest-typed-test.cc index 1dc2ad38ba..8677caf732 100644 --- a/third_party/googletest/src/src/gtest-typed-test.cc +++ b/third_party/googletest/src/src/gtest-typed-test.cc @@ -48,7 +48,7 @@ static const char* SkipSpaces(const char* str) { static std::vector SplitIntoTestNames(const char* src) { std::vector name_vec; src = SkipSpaces(src); - for (; src != NULL; src = SkipComma(src)) { + for (; src != nullptr; src = SkipComma(src)) { name_vec.push_back(StripTrailingSpaces(GetPrefixUntilComma(src))); } return name_vec; @@ -57,7 +57,7 @@ static std::vector SplitIntoTestNames(const char* src) { // Verifies that registered_tests match the test names in // registered_tests_; returns registered_tests if successful, or // aborts the program otherwise. -const char* TypedTestCasePState::VerifyRegisteredTestNames( +const char* TypedTestSuitePState::VerifyRegisteredTestNames( const char* file, int line, const char* registered_tests) { typedef RegisteredTestsMap::const_iterator RegisteredTestIter; registered_ = true; @@ -89,7 +89,7 @@ const char* TypedTestCasePState::VerifyRegisteredTestNames( tests.insert(name); } else { errors << "No test named " << name - << " can be found in this test case.\n"; + << " can be found in this test suite.\n"; } } diff --git a/third_party/googletest/src/src/gtest.cc b/third_party/googletest/src/src/gtest.cc index 96b07c68ab..a5b4e5ac78 100644 --- a/third_party/googletest/src/src/gtest.cc +++ b/third_party/googletest/src/src/gtest.cc @@ -54,8 +54,6 @@ #if GTEST_OS_LINUX -// FIXME: Use autoconf to detect availability of -// gettimeofday(). # define GTEST_HAS_GETTIMEOFDAY_ 1 # include // NOLINT @@ -68,10 +66,6 @@ # include // NOLINT # include -#elif GTEST_OS_SYMBIAN -# define GTEST_HAS_GETTIMEOFDAY_ 1 -# include // NOLINT - #elif GTEST_OS_ZOS # define GTEST_HAS_GETTIMEOFDAY_ 1 # include // NOLINT @@ -86,6 +80,11 @@ #elif GTEST_OS_WINDOWS // We are on Windows proper. +# include // NOLINT +# undef min + +# include // NOLINT +# include // NOLINT # include // NOLINT # include // NOLINT # include // NOLINT @@ -93,25 +92,13 @@ # if GTEST_OS_WINDOWS_MINGW // MinGW has gettimeofday() but not _ftime64(). -// FIXME: Use autoconf to detect availability of -// gettimeofday(). -// FIXME: There are other ways to get the time on -// Windows, like GetTickCount() or GetSystemTimeAsFileTime(). MinGW -// supports these. consider using them instead. # define GTEST_HAS_GETTIMEOFDAY_ 1 # include // NOLINT # endif // GTEST_OS_WINDOWS_MINGW -// cpplint thinks that the header is already included, so we want to -// silence it. -# include // NOLINT -# undef min - #else // Assume other platforms have gettimeofday(). -// FIXME: Use autoconf to detect availability of -// gettimeofday(). # define GTEST_HAS_GETTIMEOFDAY_ 1 // cpplint thinks that the header is already included, so we want to @@ -160,14 +147,14 @@ using internal::Shuffle; // Constants. -// A test whose test case name or test name matches this filter is +// A test whose test suite name or test name matches this filter is // disabled and not run. static const char kDisableTestFilter[] = "DISABLED_*:*/DISABLED_*"; -// A test case whose name matches this filter is considered a death -// test case and will be run before test cases whose name doesn't +// A test suite whose name matches this filter is considered a death +// test suite and will be run before test suites whose name doesn't // match this filter. -static const char kDeathTestCaseFilter[] = "*DeathTest:*DeathTest/*"; +static const char kDeathTestSuiteFilter[] = "*DeathTest:*DeathTest/*"; // A test filter that matches everything. static const char kUniversalFilter[] = "*"; @@ -190,20 +177,20 @@ namespace internal { // stack trace. const char kStackTraceMarker[] = "\nStack trace:\n"; -// g_help_flag is true iff the --help flag or an equivalent form is -// specified on the command line. +// g_help_flag is true if and only if the --help flag or an equivalent form +// is specified on the command line. bool g_help_flag = false; // Utilty function to Open File for Writing static FILE* OpenFileForWriting(const std::string& output_file) { - FILE* fileout = NULL; + FILE* fileout = nullptr; FilePath output_file_path(output_file); FilePath output_dir(output_file_path.RemoveFileName()); if (output_dir.CreateDirectoriesRecursively()) { fileout = posix::FOpen(output_file.c_str(), "w"); } - if (fileout == NULL) { + if (fileout == nullptr) { GTEST_LOG_(FATAL) << "Unable to open file \"" << output_file << "\""; } return fileout; @@ -216,7 +203,7 @@ static FILE* OpenFileForWriting(const std::string& output_file) { static const char* GetDefaultFilter() { const char* const testbridge_test_only = internal::posix::GetEnv("TESTBRIDGE_TEST_ONLY"); - if (testbridge_test_only != NULL) { + if (testbridge_test_only != nullptr) { return testbridge_test_only; } return kUniversalFilter; @@ -228,15 +215,14 @@ GTEST_DEFINE_bool_( "Run disabled tests too, in addition to the tests normally being run."); GTEST_DEFINE_bool_( - break_on_failure, - internal::BoolFromGTestEnv("break_on_failure", false), - "True iff a failed assertion should be a debugger break-point."); + break_on_failure, internal::BoolFromGTestEnv("break_on_failure", false), + "True if and only if a failed assertion should be a debugger " + "break-point."); -GTEST_DEFINE_bool_( - catch_exceptions, - internal::BoolFromGTestEnv("catch_exceptions", true), - "True iff " GTEST_NAME_ - " should catch exceptions and treat them as test failures."); +GTEST_DEFINE_bool_(catch_exceptions, + internal::BoolFromGTestEnv("catch_exceptions", true), + "True if and only if " GTEST_NAME_ + " should catch exceptions and treat them as test failures."); GTEST_DEFINE_string_( color, @@ -283,17 +269,13 @@ GTEST_DEFINE_string_( "executable's name and, if necessary, made unique by adding " "digits."); -GTEST_DEFINE_bool_( - print_time, - internal::BoolFromGTestEnv("print_time", true), - "True iff " GTEST_NAME_ - " should display elapsed time in text output."); +GTEST_DEFINE_bool_(print_time, internal::BoolFromGTestEnv("print_time", true), + "True if and only if " GTEST_NAME_ + " should display elapsed time in text output."); -GTEST_DEFINE_bool_( - print_utf8, - internal::BoolFromGTestEnv("print_utf8", true), - "True iff " GTEST_NAME_ - " prints UTF8 characters as text."); +GTEST_DEFINE_bool_(print_utf8, internal::BoolFromGTestEnv("print_utf8", true), + "True if and only if " GTEST_NAME_ + " prints UTF8 characters as text."); GTEST_DEFINE_int32_( random_seed, @@ -307,16 +289,14 @@ GTEST_DEFINE_int32_( "How many times to repeat each test. Specify a negative number " "for repeating forever. Useful for shaking out flaky tests."); -GTEST_DEFINE_bool_( - show_internal_stack_frames, false, - "True iff " GTEST_NAME_ " should include internal stack frames when " - "printing test failure stack traces."); +GTEST_DEFINE_bool_(show_internal_stack_frames, false, + "True if and only if " GTEST_NAME_ + " should include internal stack frames when " + "printing test failure stack traces."); -GTEST_DEFINE_bool_( - shuffle, - internal::BoolFromGTestEnv("shuffle", false), - "True iff " GTEST_NAME_ - " should randomize tests' order on every run."); +GTEST_DEFINE_bool_(shuffle, internal::BoolFromGTestEnv("shuffle", false), + "True if and only if " GTEST_NAME_ + " should randomize tests' order on every run."); GTEST_DEFINE_int32_( stack_trace_depth, @@ -367,16 +347,16 @@ UInt32 Random::Generate(UInt32 range) { return state_ % range; } -// GTestIsInitialized() returns true iff the user has initialized +// GTestIsInitialized() returns true if and only if the user has initialized // Google Test. Useful for catching the user mistake of not initializing // Google Test before calling RUN_ALL_TESTS(). static bool GTestIsInitialized() { return GetArgvs().size() > 0; } -// Iterates over a vector of TestCases, keeping a running sum of the +// Iterates over a vector of TestSuites, keeping a running sum of the // results of calling a given int-returning method on each. // Returns the sum. -static int SumOverTestCaseList(const std::vector& case_list, - int (TestCase::*method)() const) { +static int SumOverTestSuiteList(const std::vector& case_list, + int (TestSuite::*method)() const) { int sum = 0; for (size_t i = 0; i < case_list.size(); i++) { sum += (case_list[i]->*method)(); @@ -384,20 +364,20 @@ static int SumOverTestCaseList(const std::vector& case_list, return sum; } -// Returns true iff the test case passed. -static bool TestCasePassed(const TestCase* test_case) { - return test_case->should_run() && test_case->Passed(); +// Returns true if and only if the test suite passed. +static bool TestSuitePassed(const TestSuite* test_suite) { + return test_suite->should_run() && test_suite->Passed(); } -// Returns true iff the test case failed. -static bool TestCaseFailed(const TestCase* test_case) { - return test_case->should_run() && test_case->Failed(); +// Returns true if and only if the test suite failed. +static bool TestSuiteFailed(const TestSuite* test_suite) { + return test_suite->should_run() && test_suite->Failed(); } -// Returns true iff test_case contains at least one test that should -// run. -static bool ShouldRunTestCase(const TestCase* test_case) { - return test_case->should_run(); +// Returns true if and only if test_suite contains at least one test that +// should run. +static bool ShouldRunTestSuite(const TestSuite* test_suite) { + return test_suite->should_run(); } // AssertHelper constructor. @@ -423,9 +403,6 @@ void AssertHelper::operator=(const Message& message) const { ); // NOLINT } -// Mutex for linked pointers. -GTEST_API_ GTEST_DEFINE_STATIC_MUTEX_(g_linked_ptr_mutex); - // A copy of all command line arguments. Set by InitGoogleTest(). static ::std::vector g_argvs; @@ -445,7 +422,7 @@ ::std::vector GetArgvs() { FilePath GetCurrentExecutableName() { FilePath result; -#if GTEST_OS_WINDOWS +#if GTEST_OS_WINDOWS || GTEST_OS_OS2 result.Set(FilePath(GetArgvs()[0]).RemoveExtension("exe")); #else result.Set(FilePath(GetArgvs()[0])); @@ -460,9 +437,10 @@ FilePath GetCurrentExecutableName() { std::string UnitTestOptions::GetOutputFormat() { const char* const gtest_output_flag = GTEST_FLAG(output).c_str(); const char* const colon = strchr(gtest_output_flag, ':'); - return (colon == NULL) ? - std::string(gtest_output_flag) : - std::string(gtest_output_flag, colon - gtest_output_flag); + return (colon == nullptr) + ? std::string(gtest_output_flag) + : std::string(gtest_output_flag, + static_cast(colon - gtest_output_flag)); } // Returns the name of the requested output file, or the default if none @@ -475,7 +453,7 @@ std::string UnitTestOptions::GetAbsolutePathToOutputFile() { format = std::string(kDefaultOutputFormat); const char* const colon = strchr(gtest_output_flag, ':'); - if (colon == NULL) + if (colon == nullptr) return internal::FilePath::MakeFileName( internal::FilePath( UnitTest::GetInstance()->original_working_dir()), @@ -484,10 +462,6 @@ std::string UnitTestOptions::GetAbsolutePathToOutputFile() { internal::FilePath output_name(colon + 1); if (!output_name.IsAbsolutePath()) - // FIXME: on Windows \some\path is not an absolute - // path (as its meaning depends on the current drive), yet the - // following logic for turning it into an absolute path is wrong. - // Fix it. output_name = internal::FilePath::ConcatPaths( internal::FilePath(UnitTest::GetInstance()->original_working_dir()), internal::FilePath(colon + 1)); @@ -501,8 +475,8 @@ std::string UnitTestOptions::GetAbsolutePathToOutputFile() { return result.string(); } -// Returns true iff the wildcard pattern matches the string. The -// first ':' or '\0' character in pattern marks the end of it. +// Returns true if and only if the wildcard pattern matches the string. +// The first ':' or '\0' character in pattern marks the end of it. // // This recursive algorithm isn't very efficient, but is clear and // works well enough for matching test names, which are short. @@ -535,7 +509,7 @@ bool UnitTestOptions::MatchesFilter( cur_pattern = strchr(cur_pattern, ':'); // Returns if no more pattern can be found. - if (cur_pattern == NULL) { + if (cur_pattern == nullptr) { return false; } @@ -544,11 +518,11 @@ bool UnitTestOptions::MatchesFilter( } } -// Returns true iff the user-specified filter matches the test case -// name and the test name. -bool UnitTestOptions::FilterMatchesTest(const std::string &test_case_name, - const std::string &test_name) { - const std::string& full_name = test_case_name + "." + test_name.c_str(); +// Returns true if and only if the user-specified filter matches the test +// suite name and the test name. +bool UnitTestOptions::FilterMatchesTest(const std::string& test_suite_name, + const std::string& test_name) { + const std::string& full_name = test_suite_name + "." + test_name.c_str(); // Split --gtest_filter at '-', if there is one, to separate into // positive filter and negative filter portions @@ -556,7 +530,7 @@ bool UnitTestOptions::FilterMatchesTest(const std::string &test_case_name, const char* const dash = strchr(p, '-'); std::string positive; std::string negative; - if (dash == NULL) { + if (dash == nullptr) { positive = GTEST_FLAG(filter).c_str(); // Whole string is a positive filter negative = ""; } else { @@ -701,7 +675,7 @@ static AssertionResult HasOneFailure(const char* /* results_expr */, << r; } - if (strstr(r.message(), substr.c_str()) == NULL) { + if (strstr(r.message(), substr.c_str()) == nullptr) { return AssertionFailure() << "Expected: " << expected << " containing \"" << substr << "\"\n" << " Actual:\n" @@ -770,61 +744,66 @@ void UnitTestImpl::SetTestPartResultReporterForCurrentThread( per_thread_test_part_result_reporter_.set(reporter); } -// Gets the number of successful test cases. -int UnitTestImpl::successful_test_case_count() const { - return CountIf(test_cases_, TestCasePassed); +// Gets the number of successful test suites. +int UnitTestImpl::successful_test_suite_count() const { + return CountIf(test_suites_, TestSuitePassed); } -// Gets the number of failed test cases. -int UnitTestImpl::failed_test_case_count() const { - return CountIf(test_cases_, TestCaseFailed); +// Gets the number of failed test suites. +int UnitTestImpl::failed_test_suite_count() const { + return CountIf(test_suites_, TestSuiteFailed); } -// Gets the number of all test cases. -int UnitTestImpl::total_test_case_count() const { - return static_cast(test_cases_.size()); +// Gets the number of all test suites. +int UnitTestImpl::total_test_suite_count() const { + return static_cast(test_suites_.size()); } -// Gets the number of all test cases that contain at least one test +// Gets the number of all test suites that contain at least one test // that should run. -int UnitTestImpl::test_case_to_run_count() const { - return CountIf(test_cases_, ShouldRunTestCase); +int UnitTestImpl::test_suite_to_run_count() const { + return CountIf(test_suites_, ShouldRunTestSuite); } // Gets the number of successful tests. int UnitTestImpl::successful_test_count() const { - return SumOverTestCaseList(test_cases_, &TestCase::successful_test_count); + return SumOverTestSuiteList(test_suites_, &TestSuite::successful_test_count); +} + +// Gets the number of skipped tests. +int UnitTestImpl::skipped_test_count() const { + return SumOverTestSuiteList(test_suites_, &TestSuite::skipped_test_count); } // Gets the number of failed tests. int UnitTestImpl::failed_test_count() const { - return SumOverTestCaseList(test_cases_, &TestCase::failed_test_count); + return SumOverTestSuiteList(test_suites_, &TestSuite::failed_test_count); } // Gets the number of disabled tests that will be reported in the XML report. int UnitTestImpl::reportable_disabled_test_count() const { - return SumOverTestCaseList(test_cases_, - &TestCase::reportable_disabled_test_count); + return SumOverTestSuiteList(test_suites_, + &TestSuite::reportable_disabled_test_count); } // Gets the number of disabled tests. int UnitTestImpl::disabled_test_count() const { - return SumOverTestCaseList(test_cases_, &TestCase::disabled_test_count); + return SumOverTestSuiteList(test_suites_, &TestSuite::disabled_test_count); } // Gets the number of tests to be printed in the XML report. int UnitTestImpl::reportable_test_count() const { - return SumOverTestCaseList(test_cases_, &TestCase::reportable_test_count); + return SumOverTestSuiteList(test_suites_, &TestSuite::reportable_test_count); } // Gets the number of all tests. int UnitTestImpl::total_test_count() const { - return SumOverTestCaseList(test_cases_, &TestCase::total_test_count); + return SumOverTestSuiteList(test_suites_, &TestSuite::total_test_count); } // Gets the number of tests that should run. int UnitTestImpl::test_to_run_count() const { - return SumOverTestCaseList(test_cases_, &TestCase::test_to_run_count); + return SumOverTestSuiteList(test_suites_, &TestSuite::test_to_run_count); } // Returns the current OS stack trace as an std::string. @@ -858,8 +837,6 @@ TimeInMillis GetTimeInMillis() { SYSTEMTIME now_systime; FILETIME now_filetime; ULARGE_INTEGER now_int64; - // FIXME: Shouldn't this just use - // GetSystemTimeAsFileTime()? GetSystemTime(&now_systime); if (SystemTimeToFileTime(&now_systime, &now_filetime)) { now_int64.LowPart = now_filetime.dwLowDateTime; @@ -874,8 +851,6 @@ TimeInMillis GetTimeInMillis() { // MSVC 8 deprecates _ftime64(), so we want to suppress warning 4996 // (deprecated function) there. - // FIXME: Use GetTickCount()? Or use - // SystemTimeToFileTime() GTEST_DISABLE_MSC_DEPRECATED_PUSH_() _ftime64(&now); GTEST_DISABLE_MSC_DEPRECATED_POP_() @@ -883,7 +858,7 @@ TimeInMillis GetTimeInMillis() { return static_cast(now.time) * 1000 + now.millitm; #elif GTEST_HAS_GETTIMEOFDAY_ struct timeval now; - gettimeofday(&now, NULL); + gettimeofday(&now, nullptr); return static_cast(now.tv_sec) * 1000 + now.tv_usec / 1000; #else # error "Don't know how to get the current time on your system." @@ -900,11 +875,10 @@ TimeInMillis GetTimeInMillis() { // value using delete[]. Returns the wide string, or NULL if the // input is NULL. LPCWSTR String::AnsiToUtf16(const char* ansi) { - if (!ansi) return NULL; + if (!ansi) return nullptr; const int length = strlen(ansi); const int unicode_length = - MultiByteToWideChar(CP_ACP, 0, ansi, length, - NULL, 0); + MultiByteToWideChar(CP_ACP, 0, ansi, length, nullptr, 0); WCHAR* unicode = new WCHAR[unicode_length + 1]; MultiByteToWideChar(CP_ACP, 0, ansi, length, unicode, unicode_length); @@ -917,33 +891,33 @@ LPCWSTR String::AnsiToUtf16(const char* ansi) { // value using delete[]. Returns the ANSI string, or NULL if the // input is NULL. const char* String::Utf16ToAnsi(LPCWSTR utf16_str) { - if (!utf16_str) return NULL; - const int ansi_length = - WideCharToMultiByte(CP_ACP, 0, utf16_str, -1, - NULL, 0, NULL, NULL); + if (!utf16_str) return nullptr; + const int ansi_length = WideCharToMultiByte(CP_ACP, 0, utf16_str, -1, nullptr, + 0, nullptr, nullptr); char* ansi = new char[ansi_length + 1]; - WideCharToMultiByte(CP_ACP, 0, utf16_str, -1, - ansi, ansi_length, NULL, NULL); + WideCharToMultiByte(CP_ACP, 0, utf16_str, -1, ansi, ansi_length, nullptr, + nullptr); ansi[ansi_length] = 0; return ansi; } #endif // GTEST_OS_WINDOWS_MOBILE -// Compares two C strings. Returns true iff they have the same content. +// Compares two C strings. Returns true if and only if they have the same +// content. // // Unlike strcmp(), this function can handle NULL argument(s). A NULL // C string is considered different to any non-NULL C string, // including the empty string. bool String::CStringEquals(const char * lhs, const char * rhs) { - if ( lhs == NULL ) return rhs == NULL; + if (lhs == nullptr) return rhs == nullptr; - if ( rhs == NULL ) return false; + if (rhs == nullptr) return false; return strcmp(lhs, rhs) == 0; } -#if GTEST_HAS_STD_WSTRING || GTEST_HAS_GLOBAL_WSTRING +#if GTEST_HAS_STD_WSTRING // Converts an array of wide chars to a narrow string using the UTF-8 // encoding, and streams the result to the given Message object. @@ -961,7 +935,7 @@ static void StreamWideCharsToMessage(const wchar_t* wstr, size_t length, } } -#endif // GTEST_HAS_STD_WSTRING || GTEST_HAS_GLOBAL_WSTRING +#endif // GTEST_HAS_STD_WSTRING void SplitString(const ::std::string& str, char delimiter, ::std::vector< ::std::string>* dest) { @@ -1011,15 +985,6 @@ Message& Message::operator <<(const ::std::wstring& wstr) { } #endif // GTEST_HAS_STD_WSTRING -#if GTEST_HAS_GLOBAL_WSTRING -// Converts the given wide string to a narrow string using the UTF-8 -// encoding, and streams the result to this Message object. -Message& Message::operator <<(const ::wstring& wstr) { - internal::StreamWideCharsToMessage(wstr.c_str(), wstr.length(), this); - return *this; -} -#endif // GTEST_HAS_GLOBAL_WSTRING - // Gets the text streamed to this object so far as an std::string. // Each '\0' character in the buffer is replaced with "\\0". std::string Message::GetString() const { @@ -1030,10 +995,9 @@ std::string Message::GetString() const { // Used in EXPECT_TRUE/FALSE(assertion_result). AssertionResult::AssertionResult(const AssertionResult& other) : success_(other.success_), - message_(other.message_.get() != NULL ? - new ::std::string(*other.message_) : - static_cast< ::std::string*>(NULL)) { -} + message_(other.message_.get() != nullptr + ? new ::std::string(*other.message_) + : static_cast< ::std::string*>(nullptr)) {} // Swaps two AssertionResults. void AssertionResult::swap(AssertionResult& other) { @@ -1045,8 +1009,7 @@ void AssertionResult::swap(AssertionResult& other) { // Returns the assertion's negation. Used with EXPECT/ASSERT_FALSE. AssertionResult AssertionResult::operator!() const { AssertionResult negation(!success_); - if (message_.get() != NULL) - negation << *message_; + if (message_.get() != nullptr) negation << *message_; return negation; } @@ -1272,9 +1235,10 @@ std::string CreateUnifiedDiff(const std::vector& left, for (; edit_i < edits.size(); ++edit_i) { if (n_suffix >= context) { // Continue only if the next hunk is very close. - std::vector::const_iterator it = edits.begin() + edit_i; + auto it = edits.begin() + static_cast(edit_i); while (it != edits.end() && *it == kMatch) ++it; - if (it == edits.end() || (it - edits.begin()) - edit_i >= context) { + if (it == edits.end() || + static_cast(it - edits.begin()) - edit_i >= context) { // There is no next edit or it is too far away. break; } @@ -1350,7 +1314,7 @@ std::vector SplitEscapedString(const std::string& str) { // lhs_value: "5" // rhs_value: "6" // -// The ignoring_case parameter is true iff the assertion is a +// The ignoring_case parameter is true if and only if the assertion is a // *_STRCASEEQ*. When it's true, the string "Ignoring case" will // be inserted into the message. AssertionResult EqFailure(const char* lhs_expression, @@ -1413,8 +1377,6 @@ AssertionResult DoubleNearPredFormat(const char* expr1, const double diff = fabs(val1 - val2); if (diff <= abs_error) return AssertionSuccess(); - // FIXME: do not print the value of an expression if it's - // already a literal. return AssertionFailure() << "The difference between " << expr1 << " and " << expr2 << " is " << diff << ", which exceeds " << abs_error_expr << ", where\n" @@ -1595,22 +1557,20 @@ namespace { // Helper functions for implementing IsSubString() and IsNotSubstring(). -// This group of overloaded functions return true iff needle is a -// substring of haystack. NULL is considered a substring of itself -// only. +// This group of overloaded functions return true if and only if needle +// is a substring of haystack. NULL is considered a substring of +// itself only. bool IsSubstringPred(const char* needle, const char* haystack) { - if (needle == NULL || haystack == NULL) - return needle == haystack; + if (needle == nullptr || haystack == nullptr) return needle == haystack; - return strstr(haystack, needle) != NULL; + return strstr(haystack, needle) != nullptr; } bool IsSubstringPred(const wchar_t* needle, const wchar_t* haystack) { - if (needle == NULL || haystack == NULL) - return needle == haystack; + if (needle == nullptr || haystack == nullptr) return needle == haystack; - return wcsstr(haystack, needle) != NULL; + return wcsstr(haystack, needle) != nullptr; } // StringType here can be either ::std::string or ::std::wstring. @@ -1724,12 +1684,12 @@ AssertionResult HRESULTFailureHelper(const char* expr, // Gets the system's human readable message string for this HRESULT. char error_text[kBufSize] = { '\0' }; DWORD message_length = ::FormatMessageA(kFlags, - 0, // no source, we're asking system - hr, // the error - 0, // no line width restrictions + 0, // no source, we're asking system + static_cast(hr), // the error + 0, // no line width restrictions error_text, // output buffer - kBufSize, // buf size - NULL); // no arguments for inserts + kBufSize, // buf size + nullptr); // no arguments for inserts // Trims tailing white space (FormatMessage leaves a trailing CR-LF) for (; message_length && IsSpace(error_text[message_length - 1]); --message_length) { @@ -1803,7 +1763,7 @@ inline UInt32 ChopLowBits(UInt32* bits, int n) { // to "(Invalid Unicode 0xXXXXXXXX)". std::string CodePointToUtf8(UInt32 code_point) { if (code_point > kMaxCodePoint4) { - return "(Invalid Unicode 0x" + String::FormatHexInt(code_point) + ")"; + return "(Invalid Unicode 0x" + String::FormatHexUInt32(code_point) + ")"; } char str[5]; // Big enough for the largest valid code point. @@ -1831,7 +1791,7 @@ std::string CodePointToUtf8(UInt32 code_point) { // The following two functions only make sense if the system // uses UTF-16 for wide string encoding. All supported systems -// with 16 bit wchar_t (Windows, Cygwin, Symbian OS) do use UTF-16. +// with 16 bit wchar_t (Windows, Cygwin) do use UTF-16. // Determines if the arguments constitute UTF-16 surrogate pair // and thus should be combined into a single Unicode code point @@ -1844,17 +1804,20 @@ inline bool IsUtf16SurrogatePair(wchar_t first, wchar_t second) { // Creates a Unicode code point from UTF16 surrogate pair. inline UInt32 CreateCodePointFromUtf16SurrogatePair(wchar_t first, wchar_t second) { + const auto first_u = static_cast(first); + const auto second_u = static_cast(second); const UInt32 mask = (1 << 10) - 1; - return (sizeof(wchar_t) == 2) ? - (((first & mask) << 10) | (second & mask)) + 0x10000 : - // This function should not be called when the condition is - // false, but we provide a sensible default in case it is. - static_cast(first); + return (sizeof(wchar_t) == 2) + ? (((first_u & mask) << 10) | (second_u & mask)) + 0x10000 + : + // This function should not be called when the condition is + // false, but we provide a sensible default in case it is. + first_u; } // Converts a wide string to a narrow string in UTF-8 encoding. // The wide string is assumed to have the following encoding: -// UTF-16 if sizeof(wchar_t) == 2 (on Windows, Cygwin, Symbian OS) +// UTF-16 if sizeof(wchar_t) == 2 (on Windows, Cygwin) // UTF-32 if sizeof(wchar_t) == 4 (on Linux) // Parameter str points to a null-terminated wide string. // Parameter num_chars may additionally limit the number @@ -1891,21 +1854,21 @@ std::string WideStringToUtf8(const wchar_t* str, int num_chars) { // Converts a wide C string to an std::string using the UTF-8 encoding. // NULL will be converted to "(null)". std::string String::ShowWideCString(const wchar_t * wide_c_str) { - if (wide_c_str == NULL) return "(null)"; + if (wide_c_str == nullptr) return "(null)"; return internal::WideStringToUtf8(wide_c_str, -1); } -// Compares two wide C strings. Returns true iff they have the same -// content. +// Compares two wide C strings. Returns true if and only if they have the +// same content. // // Unlike wcscmp(), this function can handle NULL argument(s). A NULL // C string is considered different to any non-NULL C string, // including the empty string. bool String::WideCStringEquals(const wchar_t * lhs, const wchar_t * rhs) { - if (lhs == NULL) return rhs == NULL; + if (lhs == nullptr) return rhs == nullptr; - if (rhs == NULL) return false; + if (rhs == nullptr) return false; return wcscmp(lhs, rhs) == 0; } @@ -1941,37 +1904,35 @@ AssertionResult CmpHelperSTRNE(const char* s1_expression, << " vs " << PrintToString(s2); } -// Compares two C strings, ignoring case. Returns true iff they have +// Compares two C strings, ignoring case. Returns true if and only if they have // the same content. // // Unlike strcasecmp(), this function can handle NULL argument(s). A // NULL C string is considered different to any non-NULL C string, // including the empty string. bool String::CaseInsensitiveCStringEquals(const char * lhs, const char * rhs) { - if (lhs == NULL) - return rhs == NULL; - if (rhs == NULL) - return false; + if (lhs == nullptr) return rhs == nullptr; + if (rhs == nullptr) return false; return posix::StrCaseCmp(lhs, rhs) == 0; } - // Compares two wide C strings, ignoring case. Returns true iff they - // have the same content. - // - // Unlike wcscasecmp(), this function can handle NULL argument(s). - // A NULL C string is considered different to any non-NULL wide C string, - // including the empty string. - // NB: The implementations on different platforms slightly differ. - // On windows, this method uses _wcsicmp which compares according to LC_CTYPE - // environment variable. On GNU platform this method uses wcscasecmp - // which compares according to LC_CTYPE category of the current locale. - // On MacOS X, it uses towlower, which also uses LC_CTYPE category of the - // current locale. +// Compares two wide C strings, ignoring case. Returns true if and only if they +// have the same content. +// +// Unlike wcscasecmp(), this function can handle NULL argument(s). +// A NULL C string is considered different to any non-NULL wide C string, +// including the empty string. +// NB: The implementations on different platforms slightly differ. +// On windows, this method uses _wcsicmp which compares according to LC_CTYPE +// environment variable. On GNU platform this method uses wcscasecmp +// which compares according to LC_CTYPE category of the current locale. +// On MacOS X, it uses towlower, which also uses LC_CTYPE category of the +// current locale. bool String::CaseInsensitiveWideCStringEquals(const wchar_t* lhs, const wchar_t* rhs) { - if (lhs == NULL) return rhs == NULL; + if (lhs == nullptr) return rhs == nullptr; - if (rhs == NULL) return false; + if (rhs == nullptr) return false; #if GTEST_OS_WINDOWS return _wcsicmp(lhs, rhs) == 0; @@ -1982,14 +1943,14 @@ bool String::CaseInsensitiveWideCStringEquals(const wchar_t* lhs, // Other unknown OSes may not define it either. wint_t left, right; do { - left = towlower(*lhs++); - right = towlower(*rhs++); + left = towlower(static_cast(*lhs++)); + right = towlower(static_cast(*rhs++)); } while (left && left == right); return left == right; #endif // OS selector } -// Returns true iff str ends with the given suffix, ignoring case. +// Returns true if and only if str ends with the given suffix, ignoring case. // Any string is considered to end with an empty suffix. bool String::EndsWithCaseInsensitive( const std::string& str, const std::string& suffix) { @@ -2008,12 +1969,17 @@ std::string String::FormatIntWidth2(int value) { } // Formats an int value as "%X". -std::string String::FormatHexInt(int value) { +std::string String::FormatHexUInt32(UInt32 value) { std::stringstream ss; ss << std::hex << std::uppercase << value; return ss.str(); } +// Formats an int value as "%X". +std::string String::FormatHexInt(int value) { + return FormatHexUInt32(static_cast(value)); +} + // Formats a byte as "%02X". std::string String::FormatByte(unsigned char value) { std::stringstream ss; @@ -2030,7 +1996,7 @@ std::string StringStreamToString(::std::stringstream* ss) { const char* const end = start + str.length(); std::string result; - result.reserve(2 * (end - start)); + result.reserve(static_cast(2 * (end - start))); for (const char* ch = start; ch != end; ++ch) { if (*ch == '\0') { result += "\\0"; // Replaces NUL with "\\0"; @@ -2060,9 +2026,7 @@ std::string AppendUserMessage(const std::string& gtest_msg, // Creates an empty TestResult. TestResult::TestResult() - : death_test_count_(0), - elapsed_time_(0) { -} + : death_test_count_(0), start_timestamp_(0), elapsed_time_(0) {} // D'tor. TestResult::~TestResult() { @@ -2074,7 +2038,7 @@ TestResult::~TestResult() { const TestPartResult& TestResult::GetTestPartResult(int i) const { if (i < 0 || i >= total_part_count()) internal::posix::Abort(); - return test_part_results_.at(i); + return test_part_results_.at(static_cast(i)); } // Returns the i-th test property. i can range from 0 to @@ -2083,7 +2047,7 @@ const TestPartResult& TestResult::GetTestPartResult(int i) const { const TestProperty& TestResult::GetTestProperty(int i) const { if (i < 0 || i >= test_property_count()) internal::posix::Abort(); - return test_properties_.at(i); + return test_properties_.at(static_cast(i)); } // Clears the test part results. @@ -2131,18 +2095,18 @@ static const char* const kReservedTestSuitesAttributes[] = { // The list of reserved attributes used in the element of XML // output. static const char* const kReservedTestSuiteAttributes[] = { - "disabled", - "errors", - "failures", - "name", - "tests", - "time" -}; + "disabled", "errors", "failures", "name", "tests", "time", "timestamp"}; // The list of reserved attributes used in the element of XML output. static const char* const kReservedTestCaseAttributes[] = { - "classname", "name", "status", "time", - "type_param", "value_param", "file", "line"}; + "classname", "name", "status", "time", "type_param", + "value_param", "file", "line"}; + +// Use a slightly different set for allowed output to ensure existing tests can +// still RecordProperty("result") or "RecordProperty(timestamp") +static const char* const kReservedOutputTestCaseAttributes[] = { + "classname", "name", "status", "time", "type_param", + "value_param", "file", "line", "result", "timestamp"}; template std::vector ArrayAsVector(const char* const (&array)[kSize]) { @@ -2164,6 +2128,22 @@ static std::vector GetReservedAttributesForElement( return std::vector(); } +// TODO(jdesprez): Merge the two getReserved attributes once skip is improved +static std::vector GetReservedOutputAttributesForElement( + const std::string& xml_element) { + if (xml_element == "testsuites") { + return ArrayAsVector(kReservedTestSuitesAttributes); + } else if (xml_element == "testsuite") { + return ArrayAsVector(kReservedTestSuiteAttributes); + } else if (xml_element == "testcase") { + return ArrayAsVector(kReservedOutputTestCaseAttributes); + } else { + GTEST_CHECK_(false) << "Unrecognized xml_element provided: " << xml_element; + } + // This code is unreachable but some compilers may not realizes that. + return std::vector(); +} + static std::string FormatWordList(const std::vector& words) { Message word_list; for (size_t i = 0; i < words.size(); ++i) { @@ -2207,7 +2187,17 @@ void TestResult::Clear() { elapsed_time_ = 0; } -// Returns true iff the test failed. +// Returns true off the test part was skipped. +static bool TestPartSkipped(const TestPartResult& result) { + return result.skipped(); +} + +// Returns true if and only if the test was skipped. +bool TestResult::Skipped() const { + return !Failed() && CountIf(test_part_results_, TestPartSkipped) > 0; +} + +// Returns true if and only if the test failed. bool TestResult::Failed() const { for (int i = 0; i < total_part_count(); ++i) { if (GetTestPartResult(i).failed()) @@ -2216,22 +2206,22 @@ bool TestResult::Failed() const { return false; } -// Returns true iff the test part fatally failed. +// Returns true if and only if the test part fatally failed. static bool TestPartFatallyFailed(const TestPartResult& result) { return result.fatally_failed(); } -// Returns true iff the test fatally failed. +// Returns true if and only if the test fatally failed. bool TestResult::HasFatalFailure() const { return CountIf(test_part_results_, TestPartFatallyFailed) > 0; } -// Returns true iff the test part non-fatally failed. +// Returns true if and only if the test part non-fatally failed. static bool TestPartNonfatallyFailed(const TestPartResult& result) { return result.nonfatally_failed(); } -// Returns true iff the test has a non-fatal failure. +// Returns true if and only if the test has a non-fatal failure. bool TestResult::HasNonfatalFailure() const { return CountIf(test_part_results_, TestPartNonfatallyFailed) > 0; } @@ -2294,25 +2284,25 @@ void ReportFailureInUnknownLocation(TestPartResult::Type result_type, // AddTestPartResult. UnitTest::GetInstance()->AddTestPartResult( result_type, - NULL, // No info about the source file where the exception occurred. - -1, // We have no info on which line caused the exception. + nullptr, // No info about the source file where the exception occurred. + -1, // We have no info on which line caused the exception. message, - ""); // No stack trace, either. + ""); // No stack trace, either. } } // namespace internal -// Google Test requires all tests in the same test case to use the same test +// Google Test requires all tests in the same test suite to use the same test // fixture class. This function checks if the current test has the -// same fixture class as the first test in the current test case. If +// same fixture class as the first test in the current test suite. If // yes, it returns true; otherwise it generates a Google Test failure and // returns false. bool Test::HasSameFixtureClass() { internal::UnitTestImpl* const impl = internal::GetUnitTestImpl(); - const TestCase* const test_case = impl->current_test_case(); + const TestSuite* const test_suite = impl->current_test_suite(); - // Info about the first test in the current test case. - const TestInfo* const first_test_info = test_case->test_info_list()[0]; + // Info about the first test in the current test suite. + const TestInfo* const first_test_info = test_suite->test_info_list()[0]; const internal::TypeId first_fixture_id = first_test_info->fixture_class_id_; const char* const first_test_name = first_test_info->name(); @@ -2328,7 +2318,7 @@ bool Test::HasSameFixtureClass() { const bool this_is_TEST = this_fixture_id == internal::GetTestTypeId(); if (first_is_TEST || this_is_TEST) { - // Both TEST and TEST_F appear in same test case, which is incorrect. + // Both TEST and TEST_F appear in same test suite, which is incorrect. // Tell the user how to fix this. // Gets the name of the TEST and the name of the TEST_F. Note @@ -2340,9 +2330,9 @@ bool Test::HasSameFixtureClass() { first_is_TEST ? this_test_name : first_test_name; ADD_FAILURE() - << "All tests in the same test case must use the same test fixture\n" - << "class, so mixing TEST_F and TEST in the same test case is\n" - << "illegal. In test case " << this_test_info->test_case_name() + << "All tests in the same test suite must use the same test fixture\n" + << "class, so mixing TEST_F and TEST in the same test suite is\n" + << "illegal. In test suite " << this_test_info->test_suite_name() << ",\n" << "test " << TEST_F_name << " is defined using TEST_F but\n" << "test " << TEST_name << " is defined using TEST. You probably\n" @@ -2352,15 +2342,15 @@ bool Test::HasSameFixtureClass() { // Two fixture classes with the same name appear in two different // namespaces, which is not allowed. Tell the user how to fix this. ADD_FAILURE() - << "All tests in the same test case must use the same test fixture\n" - << "class. However, in test case " - << this_test_info->test_case_name() << ",\n" - << "you defined test " << first_test_name - << " and test " << this_test_name << "\n" + << "All tests in the same test suite must use the same test fixture\n" + << "class. However, in test suite " + << this_test_info->test_suite_name() << ",\n" + << "you defined test " << first_test_name << " and test " + << this_test_name << "\n" << "using two different test fixture classes. This can happen if\n" << "the two classes are from different namespaces or translation\n" << "units and have the same name. You should probably rename one\n" - << "of the classes to put the tests into different test cases."; + << "of the classes to put the tests into different test suites."; } return false; } @@ -2393,7 +2383,7 @@ namespace internal { static std::string FormatCxxExceptionMessage(const char* description, const char* location) { Message message; - if (description != NULL) { + if (description != nullptr) { message << "C++ exception with description \"" << description << "\""; } else { message << "Unknown C++ exception"; @@ -2491,7 +2481,7 @@ Result HandleExceptionsInMethodIfSupported( } catch (...) { // NOLINT internal::ReportFailureInUnknownLocation( TestPartResult::kFatalFailure, - FormatCxxExceptionMessage(NULL, location)); + FormatCxxExceptionMessage(nullptr, location)); } return static_cast(0); #else @@ -2511,8 +2501,9 @@ void Test::Run() { internal::UnitTestImpl* const impl = internal::GetUnitTestImpl(); impl->os_stack_trace_getter()->UponLeavingGTest(); internal::HandleExceptionsInMethodIfSupported(this, &Test::SetUp, "SetUp()"); - // We will run the test only if SetUp() was successful. - if (!HasFatalFailure()) { + // We will run the test only if SetUp() was successful and didn't call + // GTEST_SKIP(). + if (!HasFatalFailure() && !IsSkipped()) { impl->os_stack_trace_getter()->UponLeavingGTest(); internal::HandleExceptionsInMethodIfSupported( this, &Test::TestBody, "the test body"); @@ -2526,32 +2517,36 @@ void Test::Run() { this, &Test::TearDown, "TearDown()"); } -// Returns true iff the current test has a fatal failure. +// Returns true if and only if the current test has a fatal failure. bool Test::HasFatalFailure() { return internal::GetUnitTestImpl()->current_test_result()->HasFatalFailure(); } -// Returns true iff the current test has a non-fatal failure. +// Returns true if and only if the current test has a non-fatal failure. bool Test::HasNonfatalFailure() { return internal::GetUnitTestImpl()->current_test_result()-> HasNonfatalFailure(); } +// Returns true if and only if the current test was skipped. +bool Test::IsSkipped() { + return internal::GetUnitTestImpl()->current_test_result()->Skipped(); +} + // class TestInfo // Constructs a TestInfo object. It assumes ownership of the test factory // object. -TestInfo::TestInfo(const std::string& a_test_case_name, - const std::string& a_name, - const char* a_type_param, +TestInfo::TestInfo(const std::string& a_test_suite_name, + const std::string& a_name, const char* a_type_param, const char* a_value_param, internal::CodeLocation a_code_location, internal::TypeId fixture_class_id, internal::TestFactoryBase* factory) - : test_case_name_(a_test_case_name), + : test_suite_name_(a_test_suite_name), name_(a_name), - type_param_(a_type_param ? new std::string(a_type_param) : NULL), - value_param_(a_value_param ? new std::string(a_value_param) : NULL), + type_param_(a_type_param ? new std::string(a_type_param) : nullptr), + value_param_(a_value_param ? new std::string(a_value_param) : nullptr), location_(a_code_location), fixture_class_id_(fixture_class_id), should_run_(false), @@ -2570,7 +2565,7 @@ namespace internal { // // Arguments: // -// test_case_name: name of the test case +// test_suite_name: name of the test suite // name: name of the test // type_param: the name of the test's type parameter, or NULL if // this is not a typed or a type-parameterized test. @@ -2578,40 +2573,35 @@ namespace internal { // or NULL if this is not a value-parameterized test. // code_location: code location where the test is defined // fixture_class_id: ID of the test fixture class -// set_up_tc: pointer to the function that sets up the test case -// tear_down_tc: pointer to the function that tears down the test case +// set_up_tc: pointer to the function that sets up the test suite +// tear_down_tc: pointer to the function that tears down the test suite // factory: pointer to the factory that creates a test object. // The newly created TestInfo instance will assume // ownership of the factory object. TestInfo* MakeAndRegisterTestInfo( - const char* test_case_name, - const char* name, - const char* type_param, - const char* value_param, - CodeLocation code_location, - TypeId fixture_class_id, - SetUpTestCaseFunc set_up_tc, - TearDownTestCaseFunc tear_down_tc, - TestFactoryBase* factory) { + const char* test_suite_name, const char* name, const char* type_param, + const char* value_param, CodeLocation code_location, + TypeId fixture_class_id, SetUpTestSuiteFunc set_up_tc, + TearDownTestSuiteFunc tear_down_tc, TestFactoryBase* factory) { TestInfo* const test_info = - new TestInfo(test_case_name, name, type_param, value_param, + new TestInfo(test_suite_name, name, type_param, value_param, code_location, fixture_class_id, factory); GetUnitTestImpl()->AddTestInfo(set_up_tc, tear_down_tc, test_info); return test_info; } -void ReportInvalidTestCaseType(const char* test_case_name, - CodeLocation code_location) { +void ReportInvalidTestSuiteType(const char* test_suite_name, + CodeLocation code_location) { Message errors; errors - << "Attempted redefinition of test case " << test_case_name << ".\n" - << "All tests in the same test case must use the same test fixture\n" - << "class. However, in test case " << test_case_name << ", you tried\n" + << "Attempted redefinition of test suite " << test_suite_name << ".\n" + << "All tests in the same test suite must use the same test fixture\n" + << "class. However, in test suite " << test_suite_name << ", you tried\n" << "to define a test using a fixture class different from the one\n" << "used earlier. This can happen if the two fixture classes are\n" << "from different namespaces and have the same name. You should\n" << "probably rename one of the classes to put the tests into different\n" - << "test cases."; + << "test suites."; GTEST_LOG_(ERROR) << FormatFileLocation(code_location.file.c_str(), code_location.line) @@ -2624,7 +2614,7 @@ namespace { // A predicate that checks the test name of a TestInfo against a known // value. // -// This is used for implementation of the TestCase class only. We put +// This is used for implementation of the TestSuite class only. We put // it in the anonymous namespace to prevent polluting the outer // namespace. // @@ -2637,7 +2627,7 @@ class TestNameIs { explicit TestNameIs(const char* name) : name_(name) {} - // Returns true iff the test name of test_info matches name_. + // Returns true if and only if the test name of test_info matches name_. bool operator()(const TestInfo * test_info) const { return test_info && test_info->name() == name_; } @@ -2651,7 +2641,7 @@ class TestNameIs { namespace internal { // This method expands all parameterized tests registered with macros TEST_P -// and INSTANTIATE_TEST_CASE_P into regular tests and registers those. +// and INSTANTIATE_TEST_SUITE_P into regular tests and registers those. // This will be done just once during the program runtime. void UnitTestImpl::RegisterParameterizedTests() { if (!parameterized_tests_registered_) { @@ -2685,19 +2675,23 @@ void TestInfo::Run() { factory_, &internal::TestFactoryBase::CreateTest, "the test fixture's constructor"); - // Runs the test if the constructor didn't generate a fatal failure. + // Runs the test if the constructor didn't generate a fatal failure or invoke + // GTEST_SKIP(). // Note that the object will not be null - if (!Test::HasFatalFailure()) { + if (!Test::HasFatalFailure() && !Test::IsSkipped()) { // This doesn't throw as all user code that can throw are wrapped into // exception handling code. test->Run(); } + if (test != nullptr) { // Deletes the test object. impl->os_stack_trace_getter()->UponLeavingGTest(); internal::HandleExceptionsInMethodIfSupported( test, &Test::DeleteSelf_, "the test fixture's destructor"); + } + result_.set_start_timestamp(start); result_.set_elapsed_time(internal::GetTimeInMillis() - start); // Notifies the unit test event listener that a test has just finished. @@ -2705,134 +2699,151 @@ void TestInfo::Run() { // Tells UnitTest to stop associating assertion results to this // test. - impl->set_current_test_info(NULL); + impl->set_current_test_info(nullptr); } -// class TestCase +// class TestSuite -// Gets the number of successful tests in this test case. -int TestCase::successful_test_count() const { +// Gets the number of successful tests in this test suite. +int TestSuite::successful_test_count() const { return CountIf(test_info_list_, TestPassed); } -// Gets the number of failed tests in this test case. -int TestCase::failed_test_count() const { +// Gets the number of successful tests in this test suite. +int TestSuite::skipped_test_count() const { + return CountIf(test_info_list_, TestSkipped); +} + +// Gets the number of failed tests in this test suite. +int TestSuite::failed_test_count() const { return CountIf(test_info_list_, TestFailed); } // Gets the number of disabled tests that will be reported in the XML report. -int TestCase::reportable_disabled_test_count() const { +int TestSuite::reportable_disabled_test_count() const { return CountIf(test_info_list_, TestReportableDisabled); } -// Gets the number of disabled tests in this test case. -int TestCase::disabled_test_count() const { +// Gets the number of disabled tests in this test suite. +int TestSuite::disabled_test_count() const { return CountIf(test_info_list_, TestDisabled); } // Gets the number of tests to be printed in the XML report. -int TestCase::reportable_test_count() const { +int TestSuite::reportable_test_count() const { return CountIf(test_info_list_, TestReportable); } -// Get the number of tests in this test case that should run. -int TestCase::test_to_run_count() const { +// Get the number of tests in this test suite that should run. +int TestSuite::test_to_run_count() const { return CountIf(test_info_list_, ShouldRunTest); } // Gets the number of all tests. -int TestCase::total_test_count() const { +int TestSuite::total_test_count() const { return static_cast(test_info_list_.size()); } -// Creates a TestCase with the given name. +// Creates a TestSuite with the given name. // // Arguments: // -// name: name of the test case -// a_type_param: the name of the test case's type parameter, or NULL if -// this is not a typed or a type-parameterized test case. -// set_up_tc: pointer to the function that sets up the test case -// tear_down_tc: pointer to the function that tears down the test case -TestCase::TestCase(const char* a_name, const char* a_type_param, - Test::SetUpTestCaseFunc set_up_tc, - Test::TearDownTestCaseFunc tear_down_tc) +// name: name of the test suite +// a_type_param: the name of the test suite's type parameter, or NULL if +// this is not a typed or a type-parameterized test suite. +// set_up_tc: pointer to the function that sets up the test suite +// tear_down_tc: pointer to the function that tears down the test suite +TestSuite::TestSuite(const char* a_name, const char* a_type_param, + internal::SetUpTestSuiteFunc set_up_tc, + internal::TearDownTestSuiteFunc tear_down_tc) : name_(a_name), - type_param_(a_type_param ? new std::string(a_type_param) : NULL), + type_param_(a_type_param ? new std::string(a_type_param) : nullptr), set_up_tc_(set_up_tc), tear_down_tc_(tear_down_tc), should_run_(false), - elapsed_time_(0) { -} + start_timestamp_(0), + elapsed_time_(0) {} -// Destructor of TestCase. -TestCase::~TestCase() { +// Destructor of TestSuite. +TestSuite::~TestSuite() { // Deletes every Test in the collection. ForEach(test_info_list_, internal::Delete); } // Returns the i-th test among all the tests. i can range from 0 to // total_test_count() - 1. If i is not in that range, returns NULL. -const TestInfo* TestCase::GetTestInfo(int i) const { +const TestInfo* TestSuite::GetTestInfo(int i) const { const int index = GetElementOr(test_indices_, i, -1); - return index < 0 ? NULL : test_info_list_[index]; + return index < 0 ? nullptr : test_info_list_[static_cast(index)]; } // Returns the i-th test among all the tests. i can range from 0 to // total_test_count() - 1. If i is not in that range, returns NULL. -TestInfo* TestCase::GetMutableTestInfo(int i) { +TestInfo* TestSuite::GetMutableTestInfo(int i) { const int index = GetElementOr(test_indices_, i, -1); - return index < 0 ? NULL : test_info_list_[index]; + return index < 0 ? nullptr : test_info_list_[static_cast(index)]; } -// Adds a test to this test case. Will delete the test upon -// destruction of the TestCase object. -void TestCase::AddTestInfo(TestInfo * test_info) { +// Adds a test to this test suite. Will delete the test upon +// destruction of the TestSuite object. +void TestSuite::AddTestInfo(TestInfo* test_info) { test_info_list_.push_back(test_info); test_indices_.push_back(static_cast(test_indices_.size())); } -// Runs every test in this TestCase. -void TestCase::Run() { +// Runs every test in this TestSuite. +void TestSuite::Run() { if (!should_run_) return; internal::UnitTestImpl* const impl = internal::GetUnitTestImpl(); - impl->set_current_test_case(this); + impl->set_current_test_suite(this); TestEventListener* repeater = UnitTest::GetInstance()->listeners().repeater(); + // Call both legacy and the new API + repeater->OnTestSuiteStart(*this); +// Legacy API is deprecated but still available +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI repeater->OnTestCaseStart(*this); +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI + impl->os_stack_trace_getter()->UponLeavingGTest(); internal::HandleExceptionsInMethodIfSupported( - this, &TestCase::RunSetUpTestCase, "SetUpTestCase()"); + this, &TestSuite::RunSetUpTestSuite, "SetUpTestSuite()"); - const internal::TimeInMillis start = internal::GetTimeInMillis(); + start_timestamp_ = internal::GetTimeInMillis(); for (int i = 0; i < total_test_count(); i++) { GetMutableTestInfo(i)->Run(); } - elapsed_time_ = internal::GetTimeInMillis() - start; + elapsed_time_ = internal::GetTimeInMillis() - start_timestamp_; impl->os_stack_trace_getter()->UponLeavingGTest(); internal::HandleExceptionsInMethodIfSupported( - this, &TestCase::RunTearDownTestCase, "TearDownTestCase()"); + this, &TestSuite::RunTearDownTestSuite, "TearDownTestSuite()"); + // Call both legacy and the new API + repeater->OnTestSuiteEnd(*this); +// Legacy API is deprecated but still available +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI repeater->OnTestCaseEnd(*this); - impl->set_current_test_case(NULL); +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI + + impl->set_current_test_suite(nullptr); } -// Clears the results of all tests in this test case. -void TestCase::ClearResult() { +// Clears the results of all tests in this test suite. +void TestSuite::ClearResult() { ad_hoc_test_result_.Clear(); ForEach(test_info_list_, TestInfo::ClearTestResult); } -// Shuffles the tests in this test case. -void TestCase::ShuffleTests(internal::Random* random) { +// Shuffles the tests in this test suite. +void TestSuite::ShuffleTests(internal::Random* random) { Shuffle(random, &test_indices_); } // Restores the test order to before the first shuffle. -void TestCase::UnshuffleTests() { +void TestSuite::UnshuffleTests() { for (size_t i = 0; i < test_indices_.size(); i++) { test_indices_[i] = static_cast(i); } @@ -2855,9 +2866,9 @@ static std::string FormatTestCount(int test_count) { return FormatCountableNoun(test_count, "test", "tests"); } -// Formats the count of test cases. -static std::string FormatTestCaseCount(int test_case_count) { - return FormatCountableNoun(test_case_count, "test case", "test cases"); +// Formats the count of test suites. +static std::string FormatTestSuiteCount(int test_suite_count) { + return FormatCountableNoun(test_suite_count, "test suite", "test suites"); } // Converts a TestPartResult::Type enum to human-friendly string @@ -2866,6 +2877,8 @@ static std::string FormatTestCaseCount(int test_case_count) { // between the two when viewing the test result. static const char * TestPartResultTypeToString(TestPartResult::Type type) { switch (type) { + case TestPartResult::kSkip: + return "Skipped"; case TestPartResult::kSuccess: return "Success"; @@ -2913,14 +2926,6 @@ static void PrintTestPartResult(const TestPartResult& test_part_result) { } // class PrettyUnitTestResultPrinter - -enum GTestColor { - COLOR_DEFAULT, - COLOR_RED, - COLOR_GREEN, - COLOR_YELLOW -}; - #if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE && \ !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT && !GTEST_OS_WINDOWS_MINGW @@ -2974,13 +2979,14 @@ static const char* GetAnsiColorCode(GTestColor color) { case COLOR_RED: return "1"; case COLOR_GREEN: return "2"; case COLOR_YELLOW: return "3"; - default: return NULL; - }; + default: + return nullptr; + } } #endif // GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE -// Returns true iff Google Test should use colors in the output. +// Returns true if and only if Google Test should use colors in the output. bool ShouldUseColor(bool stdout_is_tty) { const char* const gtest_color = GTEST_FLAG(color).c_str(); @@ -3021,19 +3027,18 @@ bool ShouldUseColor(bool stdout_is_tty) { // cannot simply emit special characters and have the terminal change colors. // This routine must actually emit the characters rather than return a string // that would be colored when printed, as can be done on Linux. -static void ColoredPrintf(GTestColor color, const char* fmt, ...) { +void ColoredPrintf(GTestColor color, const char* fmt, ...) { va_list args; va_start(args, fmt); -#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_SYMBIAN || GTEST_OS_ZOS || \ - GTEST_OS_IOS || GTEST_OS_WINDOWS_PHONE || GTEST_OS_WINDOWS_RT +#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_ZOS || GTEST_OS_IOS || \ + GTEST_OS_WINDOWS_PHONE || GTEST_OS_WINDOWS_RT || defined(ESP_PLATFORM) const bool use_color = AlwaysFalse(); #else static const bool in_color_mode = ShouldUseColor(posix::IsATTY(posix::FileNo(stdout)) != 0); const bool use_color = in_color_mode && (color != COLOR_DEFAULT); -#endif // GTEST_OS_WINDOWS_MOBILE || GTEST_OS_SYMBIAN || GTEST_OS_ZOS - // The '!= 0' comparison is necessary to satisfy MSVC 7.1. +#endif // GTEST_OS_WINDOWS_MOBILE || GTEST_OS_ZOS if (!use_color) { vprintf(fmt, args); @@ -3079,14 +3084,13 @@ static void PrintFullTestCommentIfPresent(const TestInfo& test_info) { const char* const type_param = test_info.type_param(); const char* const value_param = test_info.value_param(); - if (type_param != NULL || value_param != NULL) { + if (type_param != nullptr || value_param != nullptr) { printf(", where "); - if (type_param != NULL) { + if (type_param != nullptr) { printf("%s = %s", kTypeParamLabel, type_param); - if (value_param != NULL) - printf(" and "); + if (value_param != nullptr) printf(" and "); } - if (value_param != NULL) { + if (value_param != nullptr) { printf("%s = %s", kValueParamLabel, value_param); } } @@ -3098,27 +3102,39 @@ static void PrintFullTestCommentIfPresent(const TestInfo& test_info) { class PrettyUnitTestResultPrinter : public TestEventListener { public: PrettyUnitTestResultPrinter() {} - static void PrintTestName(const char * test_case, const char * test) { - printf("%s.%s", test_case, test); + static void PrintTestName(const char* test_suite, const char* test) { + printf("%s.%s", test_suite, test); } // The following methods override what's in the TestEventListener class. - virtual void OnTestProgramStart(const UnitTest& /*unit_test*/) {} - virtual void OnTestIterationStart(const UnitTest& unit_test, int iteration); - virtual void OnEnvironmentsSetUpStart(const UnitTest& unit_test); - virtual void OnEnvironmentsSetUpEnd(const UnitTest& /*unit_test*/) {} - virtual void OnTestCaseStart(const TestCase& test_case); - virtual void OnTestStart(const TestInfo& test_info); - virtual void OnTestPartResult(const TestPartResult& result); - virtual void OnTestEnd(const TestInfo& test_info); - virtual void OnTestCaseEnd(const TestCase& test_case); - virtual void OnEnvironmentsTearDownStart(const UnitTest& unit_test); - virtual void OnEnvironmentsTearDownEnd(const UnitTest& /*unit_test*/) {} - virtual void OnTestIterationEnd(const UnitTest& unit_test, int iteration); - virtual void OnTestProgramEnd(const UnitTest& /*unit_test*/) {} + void OnTestProgramStart(const UnitTest& /*unit_test*/) override {} + void OnTestIterationStart(const UnitTest& unit_test, int iteration) override; + void OnEnvironmentsSetUpStart(const UnitTest& unit_test) override; + void OnEnvironmentsSetUpEnd(const UnitTest& /*unit_test*/) override {} +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + void OnTestCaseStart(const TestCase& test_case) override; +#else + void OnTestSuiteStart(const TestSuite& test_suite) override; +#endif // OnTestCaseStart + + void OnTestStart(const TestInfo& test_info) override; + + void OnTestPartResult(const TestPartResult& result) override; + void OnTestEnd(const TestInfo& test_info) override; +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + void OnTestCaseEnd(const TestCase& test_case) override; +#else + void OnTestSuiteEnd(const TestSuite& test_suite) override; +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + + void OnEnvironmentsTearDownStart(const UnitTest& unit_test) override; + void OnEnvironmentsTearDownEnd(const UnitTest& /*unit_test*/) override {} + void OnTestIterationEnd(const UnitTest& unit_test, int iteration) override; + void OnTestProgramEnd(const UnitTest& /*unit_test*/) override {} private: static void PrintFailedTests(const UnitTest& unit_test); + static void PrintSkippedTests(const UnitTest& unit_test); }; // Fired before each iteration of tests starts. @@ -3153,7 +3169,7 @@ void PrettyUnitTestResultPrinter::OnTestIterationStart( ColoredPrintf(COLOR_GREEN, "[==========] "); printf("Running %s from %s.\n", FormatTestCount(unit_test.test_to_run_count()).c_str(), - FormatTestCaseCount(unit_test.test_case_to_run_count()).c_str()); + FormatTestSuiteCount(unit_test.test_suite_to_run_count()).c_str()); fflush(stdout); } @@ -3164,22 +3180,38 @@ void PrettyUnitTestResultPrinter::OnEnvironmentsSetUpStart( fflush(stdout); } +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ void PrettyUnitTestResultPrinter::OnTestCaseStart(const TestCase& test_case) { const std::string counts = FormatCountableNoun(test_case.test_to_run_count(), "test", "tests"); ColoredPrintf(COLOR_GREEN, "[----------] "); printf("%s from %s", counts.c_str(), test_case.name()); - if (test_case.type_param() == NULL) { + if (test_case.type_param() == nullptr) { printf("\n"); } else { printf(", where %s = %s\n", kTypeParamLabel, test_case.type_param()); } fflush(stdout); } +#else +void PrettyUnitTestResultPrinter::OnTestSuiteStart( + const TestSuite& test_suite) { + const std::string counts = + FormatCountableNoun(test_suite.test_to_run_count(), "test", "tests"); + ColoredPrintf(COLOR_GREEN, "[----------] "); + printf("%s from %s", counts.c_str(), test_suite.name()); + if (test_suite.type_param() == nullptr) { + printf("\n"); + } else { + printf(", where %s = %s\n", kTypeParamLabel, test_suite.type_param()); + } + fflush(stdout); +} +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ void PrettyUnitTestResultPrinter::OnTestStart(const TestInfo& test_info) { ColoredPrintf(COLOR_GREEN, "[ RUN ] "); - PrintTestName(test_info.test_case_name(), test_info.name()); + PrintTestName(test_info.test_suite_name(), test_info.name()); printf("\n"); fflush(stdout); } @@ -3187,22 +3219,29 @@ void PrettyUnitTestResultPrinter::OnTestStart(const TestInfo& test_info) { // Called after an assertion failure. void PrettyUnitTestResultPrinter::OnTestPartResult( const TestPartResult& result) { - // If the test part succeeded, we don't need to do anything. - if (result.type() == TestPartResult::kSuccess) - return; - - // Print failure message from the assertion (e.g. expected this and got that). - PrintTestPartResult(result); - fflush(stdout); + switch (result.type()) { + // If the test part succeeded, or was skipped, + // we don't need to do anything. + case TestPartResult::kSkip: + case TestPartResult::kSuccess: + return; + default: + // Print failure message from the assertion + // (e.g. expected this and got that). + PrintTestPartResult(result); + fflush(stdout); + } } void PrettyUnitTestResultPrinter::OnTestEnd(const TestInfo& test_info) { if (test_info.result()->Passed()) { ColoredPrintf(COLOR_GREEN, "[ OK ] "); + } else if (test_info.result()->Skipped()) { + ColoredPrintf(COLOR_GREEN, "[ SKIPPED ] "); } else { ColoredPrintf(COLOR_RED, "[ FAILED ] "); } - PrintTestName(test_info.test_case_name(), test_info.name()); + PrintTestName(test_info.test_suite_name(), test_info.name()); if (test_info.result()->Failed()) PrintFullTestCommentIfPresent(test_info); @@ -3215,17 +3254,29 @@ void PrettyUnitTestResultPrinter::OnTestEnd(const TestInfo& test_info) { fflush(stdout); } +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ void PrettyUnitTestResultPrinter::OnTestCaseEnd(const TestCase& test_case) { if (!GTEST_FLAG(print_time)) return; const std::string counts = FormatCountableNoun(test_case.test_to_run_count(), "test", "tests"); ColoredPrintf(COLOR_GREEN, "[----------] "); - printf("%s from %s (%s ms total)\n\n", - counts.c_str(), test_case.name(), + printf("%s from %s (%s ms total)\n\n", counts.c_str(), test_case.name(), internal::StreamableToString(test_case.elapsed_time()).c_str()); fflush(stdout); } +#else +void PrettyUnitTestResultPrinter::OnTestSuiteEnd(const TestSuite& test_suite) { + if (!GTEST_FLAG(print_time)) return; + + const std::string counts = + FormatCountableNoun(test_suite.test_to_run_count(), "test", "tests"); + ColoredPrintf(COLOR_GREEN, "[----------] "); + printf("%s from %s (%s ms total)\n\n", counts.c_str(), test_suite.name(), + internal::StreamableToString(test_suite.elapsed_time()).c_str()); + fflush(stdout); +} +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ void PrettyUnitTestResultPrinter::OnEnvironmentsTearDownStart( const UnitTest& /*unit_test*/) { @@ -3241,30 +3292,54 @@ void PrettyUnitTestResultPrinter::PrintFailedTests(const UnitTest& unit_test) { return; } - for (int i = 0; i < unit_test.total_test_case_count(); ++i) { - const TestCase& test_case = *unit_test.GetTestCase(i); - if (!test_case.should_run() || (test_case.failed_test_count() == 0)) { + for (int i = 0; i < unit_test.total_test_suite_count(); ++i) { + const TestSuite& test_suite = *unit_test.GetTestSuite(i); + if (!test_suite.should_run() || (test_suite.failed_test_count() == 0)) { continue; } - for (int j = 0; j < test_case.total_test_count(); ++j) { - const TestInfo& test_info = *test_case.GetTestInfo(j); - if (!test_info.should_run() || test_info.result()->Passed()) { + for (int j = 0; j < test_suite.total_test_count(); ++j) { + const TestInfo& test_info = *test_suite.GetTestInfo(j); + if (!test_info.should_run() || !test_info.result()->Failed()) { continue; } ColoredPrintf(COLOR_RED, "[ FAILED ] "); - printf("%s.%s", test_case.name(), test_info.name()); + printf("%s.%s", test_suite.name(), test_info.name()); PrintFullTestCommentIfPresent(test_info); printf("\n"); } } } +// Internal helper for printing the list of skipped tests. +void PrettyUnitTestResultPrinter::PrintSkippedTests(const UnitTest& unit_test) { + const int skipped_test_count = unit_test.skipped_test_count(); + if (skipped_test_count == 0) { + return; + } + + for (int i = 0; i < unit_test.total_test_suite_count(); ++i) { + const TestSuite& test_suite = *unit_test.GetTestSuite(i); + if (!test_suite.should_run() || (test_suite.skipped_test_count() == 0)) { + continue; + } + for (int j = 0; j < test_suite.total_test_count(); ++j) { + const TestInfo& test_info = *test_suite.GetTestInfo(j); + if (!test_info.should_run() || !test_info.result()->Skipped()) { + continue; + } + ColoredPrintf(COLOR_GREEN, "[ SKIPPED ] "); + printf("%s.%s", test_suite.name(), test_info.name()); + printf("\n"); + } + } +} + void PrettyUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test, int /*iteration*/) { ColoredPrintf(COLOR_GREEN, "[==========] "); printf("%s from %s ran.", FormatTestCount(unit_test.test_to_run_count()).c_str(), - FormatTestCaseCount(unit_test.test_case_to_run_count()).c_str()); + FormatTestSuiteCount(unit_test.test_suite_to_run_count()).c_str()); if (GTEST_FLAG(print_time)) { printf(" (%s ms total)", internal::StreamableToString(unit_test.elapsed_time()).c_str()); @@ -3273,6 +3348,13 @@ void PrettyUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test, ColoredPrintf(COLOR_GREEN, "[ PASSED ] "); printf("%s.\n", FormatTestCount(unit_test.successful_test_count()).c_str()); + const int skipped_test_count = unit_test.skipped_test_count(); + if (skipped_test_count > 0) { + ColoredPrintf(COLOR_GREEN, "[ SKIPPED ] "); + printf("%s, listed below:\n", FormatTestCount(skipped_test_count).c_str()); + PrintSkippedTests(unit_test); + } + int num_failures = unit_test.failed_test_count(); if (!unit_test.Passed()) { const int failed_test_count = unit_test.failed_test_count(); @@ -3305,7 +3387,7 @@ void PrettyUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test, class TestEventRepeater : public TestEventListener { public: TestEventRepeater() : forwarding_enabled_(true) {} - virtual ~TestEventRepeater(); + ~TestEventRepeater() override; void Append(TestEventListener *listener); TestEventListener* Release(TestEventListener* listener); @@ -3314,19 +3396,27 @@ class TestEventRepeater : public TestEventListener { bool forwarding_enabled() const { return forwarding_enabled_; } void set_forwarding_enabled(bool enable) { forwarding_enabled_ = enable; } - virtual void OnTestProgramStart(const UnitTest& unit_test); - virtual void OnTestIterationStart(const UnitTest& unit_test, int iteration); - virtual void OnEnvironmentsSetUpStart(const UnitTest& unit_test); - virtual void OnEnvironmentsSetUpEnd(const UnitTest& unit_test); - virtual void OnTestCaseStart(const TestCase& test_case); - virtual void OnTestStart(const TestInfo& test_info); - virtual void OnTestPartResult(const TestPartResult& result); - virtual void OnTestEnd(const TestInfo& test_info); - virtual void OnTestCaseEnd(const TestCase& test_case); - virtual void OnEnvironmentsTearDownStart(const UnitTest& unit_test); - virtual void OnEnvironmentsTearDownEnd(const UnitTest& unit_test); - virtual void OnTestIterationEnd(const UnitTest& unit_test, int iteration); - virtual void OnTestProgramEnd(const UnitTest& unit_test); + void OnTestProgramStart(const UnitTest& unit_test) override; + void OnTestIterationStart(const UnitTest& unit_test, int iteration) override; + void OnEnvironmentsSetUpStart(const UnitTest& unit_test) override; + void OnEnvironmentsSetUpEnd(const UnitTest& unit_test) override; +// Legacy API is deprecated but still available +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + void OnTestCaseStart(const TestSuite& parameter) override; +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + void OnTestSuiteStart(const TestSuite& parameter) override; + void OnTestStart(const TestInfo& test_info) override; + void OnTestPartResult(const TestPartResult& result) override; + void OnTestEnd(const TestInfo& test_info) override; +// Legacy API is deprecated but still available +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + void OnTestCaseEnd(const TestCase& parameter) override; +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + void OnTestSuiteEnd(const TestSuite& parameter) override; + void OnEnvironmentsTearDownStart(const UnitTest& unit_test) override; + void OnEnvironmentsTearDownEnd(const UnitTest& unit_test) override; + void OnTestIterationEnd(const UnitTest& unit_test, int iteration) override; + void OnTestProgramEnd(const UnitTest& unit_test) override; private: // Controls whether events will be forwarded to listeners_. Set to false @@ -3346,16 +3436,15 @@ void TestEventRepeater::Append(TestEventListener *listener) { listeners_.push_back(listener); } -// FIXME: Factor the search functionality into Vector::Find. TestEventListener* TestEventRepeater::Release(TestEventListener *listener) { for (size_t i = 0; i < listeners_.size(); ++i) { if (listeners_[i] == listener) { - listeners_.erase(listeners_.begin() + i); + listeners_.erase(listeners_.begin() + static_cast(i)); return listener; } } - return NULL; + return nullptr; } // Since most methods are very similar, use macros to reduce boilerplate. @@ -3370,25 +3459,33 @@ void TestEventRepeater::Name(const Type& parameter) { \ } // This defines a member that forwards the call to all listeners in reverse // order. -#define GTEST_REVERSE_REPEATER_METHOD_(Name, Type) \ -void TestEventRepeater::Name(const Type& parameter) { \ - if (forwarding_enabled_) { \ - for (int i = static_cast(listeners_.size()) - 1; i >= 0; i--) { \ - listeners_[i]->Name(parameter); \ - } \ - } \ -} +#define GTEST_REVERSE_REPEATER_METHOD_(Name, Type) \ + void TestEventRepeater::Name(const Type& parameter) { \ + if (forwarding_enabled_) { \ + for (size_t i = listeners_.size(); i != 0; i--) { \ + listeners_[i - 1]->Name(parameter); \ + } \ + } \ + } GTEST_REPEATER_METHOD_(OnTestProgramStart, UnitTest) GTEST_REPEATER_METHOD_(OnEnvironmentsSetUpStart, UnitTest) -GTEST_REPEATER_METHOD_(OnTestCaseStart, TestCase) +// Legacy API is deprecated but still available +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ +GTEST_REPEATER_METHOD_(OnTestCaseStart, TestSuite) +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ +GTEST_REPEATER_METHOD_(OnTestSuiteStart, TestSuite) GTEST_REPEATER_METHOD_(OnTestStart, TestInfo) GTEST_REPEATER_METHOD_(OnTestPartResult, TestPartResult) GTEST_REPEATER_METHOD_(OnEnvironmentsTearDownStart, UnitTest) GTEST_REVERSE_REPEATER_METHOD_(OnEnvironmentsSetUpEnd, UnitTest) GTEST_REVERSE_REPEATER_METHOD_(OnEnvironmentsTearDownEnd, UnitTest) GTEST_REVERSE_REPEATER_METHOD_(OnTestEnd, TestInfo) -GTEST_REVERSE_REPEATER_METHOD_(OnTestCaseEnd, TestCase) +// Legacy API is deprecated but still available +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ +GTEST_REVERSE_REPEATER_METHOD_(OnTestCaseEnd, TestSuite) +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ +GTEST_REVERSE_REPEATER_METHOD_(OnTestSuiteEnd, TestSuite) GTEST_REVERSE_REPEATER_METHOD_(OnTestProgramEnd, UnitTest) #undef GTEST_REPEATER_METHOD_ @@ -3406,8 +3503,8 @@ void TestEventRepeater::OnTestIterationStart(const UnitTest& unit_test, void TestEventRepeater::OnTestIterationEnd(const UnitTest& unit_test, int iteration) { if (forwarding_enabled_) { - for (int i = static_cast(listeners_.size()) - 1; i >= 0; i--) { - listeners_[i]->OnTestIterationEnd(unit_test, iteration); + for (size_t i = listeners_.size(); i > 0; i--) { + listeners_[i - 1]->OnTestIterationEnd(unit_test, iteration); } } } @@ -3419,12 +3516,12 @@ class XmlUnitTestResultPrinter : public EmptyTestEventListener { public: explicit XmlUnitTestResultPrinter(const char* output_file); - virtual void OnTestIterationEnd(const UnitTest& unit_test, int iteration); - void ListTestsMatchingFilter(const std::vector& test_cases); + void OnTestIterationEnd(const UnitTest& unit_test, int iteration) override; + void ListTestsMatchingFilter(const std::vector& test_suites); // Prints an XML summary of all unit tests. static void PrintXmlTestsList(std::ostream* stream, - const std::vector& test_cases); + const std::vector& test_suites); private: // Is c a whitespace character that is normalized to a space character @@ -3469,12 +3566,12 @@ class XmlUnitTestResultPrinter : public EmptyTestEventListener { // Streams an XML representation of a TestInfo object. static void OutputXmlTestInfo(::std::ostream* stream, - const char* test_case_name, + const char* test_suite_name, const TestInfo& test_info); - // Prints an XML representation of a TestCase object - static void PrintXmlTestCase(::std::ostream* stream, - const TestCase& test_case); + // Prints an XML representation of a TestSuite object + static void PrintXmlTestSuite(::std::ostream* stream, + const TestSuite& test_suite); // Prints an XML summary of unit_test to output stream out. static void PrintXmlUnitTest(::std::ostream* stream, @@ -3516,10 +3613,10 @@ void XmlUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test, } void XmlUnitTestResultPrinter::ListTestsMatchingFilter( - const std::vector& test_cases) { + const std::vector& test_suites) { FILE* xmlout = OpenFileForWriting(output_file_); std::stringstream stream; - PrintXmlTestsList(&stream, test_cases); + PrintXmlTestsList(&stream, test_suites); fprintf(xmlout, "%s", StringStreamToString(&stream).c_str()); fclose(xmlout); } @@ -3534,8 +3631,6 @@ void XmlUnitTestResultPrinter::ListTestsMatchingFilter( // module will consist of ordinary English text. // If this module is ever modified to produce version 1.1 XML output, // most invalid characters can be retained using character references. -// FIXME: It might be nice to have a minimally invasive, human-readable -// escaping scheme for invalid characters, rather than dropping them. std::string XmlUnitTestResultPrinter::EscapeXml( const std::string& str, bool is_attribute) { Message m; @@ -3600,7 +3695,7 @@ std::string XmlUnitTestResultPrinter::RemoveInvalidXmlCharacters( // This is how Google Test concepts map to the DTD: // // <-- corresponds to a UnitTest object -// <-- corresponds to a TestCase object +// <-- corresponds to a TestSuite object // <-- corresponds to a TestInfo object // ... // ... @@ -3624,12 +3719,11 @@ static bool PortableLocaltime(time_t seconds, struct tm* out) { // MINGW provides neither localtime_r nor localtime_s, but uses // Windows' localtime(), which has a thread-local tm buffer. struct tm* tm_ptr = localtime(&seconds); // NOLINT - if (tm_ptr == NULL) - return false; + if (tm_ptr == nullptr) return false; *out = *tm_ptr; return true; #else - return localtime_r(&seconds, out) != NULL; + return localtime_r(&seconds, out) != nullptr; #endif } @@ -3655,7 +3749,7 @@ void XmlUnitTestResultPrinter::OutputXmlCDataSection(::std::ostream* stream, *stream << ""); - if (next_segment != NULL) { + if (next_segment != nullptr) { stream->write( segment, static_cast(next_segment - segment)); *stream << "]]>]]>& allowed_names = - GetReservedAttributesForElement(element_name); + GetReservedOutputAttributesForElement(element_name); GTEST_CHECK_(std::find(allowed_names.begin(), allowed_names.end(), name) != allowed_names.end()) @@ -3685,40 +3779,47 @@ void XmlUnitTestResultPrinter::OutputXmlAttribute( } // Prints an XML representation of a TestInfo object. -// FIXME: There is also value in printing properties with the plain printer. void XmlUnitTestResultPrinter::OutputXmlTestInfo(::std::ostream* stream, - const char* test_case_name, + const char* test_suite_name, const TestInfo& test_info) { const TestResult& result = *test_info.result(); - const std::string kTestcase = "testcase"; + const std::string kTestsuite = "testcase"; if (test_info.is_in_another_shard()) { return; } *stream << " \n"; return; } - OutputXmlAttribute(stream, kTestcase, "status", + OutputXmlAttribute(stream, kTestsuite, "status", test_info.should_run() ? "run" : "notrun"); - OutputXmlAttribute(stream, kTestcase, "time", + OutputXmlAttribute(stream, kTestsuite, "result", + test_info.should_run() + ? (result.Skipped() ? "skipped" : "completed") + : "suppressed"); + OutputXmlAttribute(stream, kTestsuite, "time", FormatTimeInMillisAsSeconds(result.elapsed_time())); - OutputXmlAttribute(stream, kTestcase, "classname", test_case_name); + OutputXmlAttribute( + stream, kTestsuite, "timestamp", + FormatEpochTimeInMillisAsIso8601(result.start_timestamp())); + OutputXmlAttribute(stream, kTestsuite, "classname", test_suite_name); int failures = 0; for (int i = 0; i < result.total_part_count(); ++i) { @@ -3751,29 +3852,32 @@ void XmlUnitTestResultPrinter::OutputXmlTestInfo(::std::ostream* stream, } } -// Prints an XML representation of a TestCase object -void XmlUnitTestResultPrinter::PrintXmlTestCase(std::ostream* stream, - const TestCase& test_case) { +// Prints an XML representation of a TestSuite object +void XmlUnitTestResultPrinter::PrintXmlTestSuite(std::ostream* stream, + const TestSuite& test_suite) { const std::string kTestsuite = "testsuite"; *stream << " <" << kTestsuite; - OutputXmlAttribute(stream, kTestsuite, "name", test_case.name()); + OutputXmlAttribute(stream, kTestsuite, "name", test_suite.name()); OutputXmlAttribute(stream, kTestsuite, "tests", - StreamableToString(test_case.reportable_test_count())); + StreamableToString(test_suite.reportable_test_count())); if (!GTEST_FLAG(list_tests)) { OutputXmlAttribute(stream, kTestsuite, "failures", - StreamableToString(test_case.failed_test_count())); + StreamableToString(test_suite.failed_test_count())); OutputXmlAttribute( stream, kTestsuite, "disabled", - StreamableToString(test_case.reportable_disabled_test_count())); + StreamableToString(test_suite.reportable_disabled_test_count())); OutputXmlAttribute(stream, kTestsuite, "errors", "0"); OutputXmlAttribute(stream, kTestsuite, "time", - FormatTimeInMillisAsSeconds(test_case.elapsed_time())); - *stream << TestPropertiesAsXmlAttributes(test_case.ad_hoc_test_result()); + FormatTimeInMillisAsSeconds(test_suite.elapsed_time())); + OutputXmlAttribute( + stream, kTestsuite, "timestamp", + FormatEpochTimeInMillisAsIso8601(test_suite.start_timestamp())); + *stream << TestPropertiesAsXmlAttributes(test_suite.ad_hoc_test_result()); } *stream << ">\n"; - for (int i = 0; i < test_case.total_test_count(); ++i) { - if (test_case.GetTestInfo(i)->is_reportable()) - OutputXmlTestInfo(stream, test_case.name(), *test_case.GetTestInfo(i)); + for (int i = 0; i < test_suite.total_test_count(); ++i) { + if (test_suite.GetTestInfo(i)->is_reportable()) + OutputXmlTestInfo(stream, test_suite.name(), *test_suite.GetTestInfo(i)); } *stream << " \n"; } @@ -3794,11 +3898,11 @@ void XmlUnitTestResultPrinter::PrintXmlUnitTest(std::ostream* stream, stream, kTestsuites, "disabled", StreamableToString(unit_test.reportable_disabled_test_count())); OutputXmlAttribute(stream, kTestsuites, "errors", "0"); + OutputXmlAttribute(stream, kTestsuites, "time", + FormatTimeInMillisAsSeconds(unit_test.elapsed_time())); OutputXmlAttribute( stream, kTestsuites, "timestamp", FormatEpochTimeInMillisAsIso8601(unit_test.start_timestamp())); - OutputXmlAttribute(stream, kTestsuites, "time", - FormatTimeInMillisAsSeconds(unit_test.elapsed_time())); if (GTEST_FLAG(shuffle)) { OutputXmlAttribute(stream, kTestsuites, "random_seed", @@ -3809,31 +3913,31 @@ void XmlUnitTestResultPrinter::PrintXmlUnitTest(std::ostream* stream, OutputXmlAttribute(stream, kTestsuites, "name", "AllTests"); *stream << ">\n"; - for (int i = 0; i < unit_test.total_test_case_count(); ++i) { - if (unit_test.GetTestCase(i)->reportable_test_count() > 0) - PrintXmlTestCase(stream, *unit_test.GetTestCase(i)); + for (int i = 0; i < unit_test.total_test_suite_count(); ++i) { + if (unit_test.GetTestSuite(i)->reportable_test_count() > 0) + PrintXmlTestSuite(stream, *unit_test.GetTestSuite(i)); } *stream << "\n"; } void XmlUnitTestResultPrinter::PrintXmlTestsList( - std::ostream* stream, const std::vector& test_cases) { + std::ostream* stream, const std::vector& test_suites) { const std::string kTestsuites = "testsuites"; *stream << "\n"; *stream << "<" << kTestsuites; int total_tests = 0; - for (size_t i = 0; i < test_cases.size(); ++i) { - total_tests += test_cases[i]->total_test_count(); + for (auto test_suite : test_suites) { + total_tests += test_suite->total_test_count(); } OutputXmlAttribute(stream, kTestsuites, "tests", StreamableToString(total_tests)); OutputXmlAttribute(stream, kTestsuites, "name", "AllTests"); *stream << ">\n"; - for (size_t i = 0; i < test_cases.size(); ++i) { - PrintXmlTestCase(stream, *test_cases[i]); + for (auto test_suite : test_suites) { + PrintXmlTestSuite(stream, *test_suite); } *stream << "\n"; } @@ -3878,11 +3982,11 @@ class JsonUnitTestResultPrinter : public EmptyTestEventListener { public: explicit JsonUnitTestResultPrinter(const char* output_file); - virtual void OnTestIterationEnd(const UnitTest& unit_test, int iteration); + void OnTestIterationEnd(const UnitTest& unit_test, int iteration) override; // Prints an JSON summary of all unit tests. static void PrintJsonTestList(::std::ostream* stream, - const std::vector& test_cases); + const std::vector& test_suites); private: // Returns an JSON-escaped copy of the input string str. @@ -3905,12 +4009,12 @@ class JsonUnitTestResultPrinter : public EmptyTestEventListener { // Streams a JSON representation of a TestInfo object. static void OutputJsonTestInfo(::std::ostream* stream, - const char* test_case_name, + const char* test_suite_name, const TestInfo& test_info); - // Prints a JSON representation of a TestCase object - static void PrintJsonTestCase(::std::ostream* stream, - const TestCase& test_case); + // Prints a JSON representation of a TestSuite object + static void PrintJsonTestSuite(::std::ostream* stream, + const TestSuite& test_suite); // Prints a JSON summary of unit_test to output stream out. static void PrintJsonUnitTest(::std::ostream* stream, @@ -4009,7 +4113,7 @@ static std::string FormatEpochTimeInMillisAsRFC3339(TimeInMillis ms) { String::FormatIntWidth2(time_struct.tm_sec) + "Z"; } -static inline std::string Indent(int width) { +static inline std::string Indent(size_t width) { return std::string(width, ' '); } @@ -4021,7 +4125,7 @@ void JsonUnitTestResultPrinter::OutputJsonKey( const std::string& indent, bool comma) { const std::vector& allowed_names = - GetReservedAttributesForElement(element_name); + GetReservedOutputAttributesForElement(element_name); GTEST_CHECK_(std::find(allowed_names.begin(), allowed_names.end(), name) != allowed_names.end()) @@ -4041,7 +4145,7 @@ void JsonUnitTestResultPrinter::OutputJsonKey( const std::string& indent, bool comma) { const std::vector& allowed_names = - GetReservedAttributesForElement(element_name); + GetReservedOutputAttributesForElement(element_name); GTEST_CHECK_(std::find(allowed_names.begin(), allowed_names.end(), name) != allowed_names.end()) @@ -4055,35 +4159,44 @@ void JsonUnitTestResultPrinter::OutputJsonKey( // Prints a JSON representation of a TestInfo object. void JsonUnitTestResultPrinter::OutputJsonTestInfo(::std::ostream* stream, - const char* test_case_name, + const char* test_suite_name, const TestInfo& test_info) { const TestResult& result = *test_info.result(); - const std::string kTestcase = "testcase"; + const std::string kTestsuite = "testcase"; const std::string kIndent = Indent(10); *stream << Indent(8) << "{\n"; - OutputJsonKey(stream, kTestcase, "name", test_info.name(), kIndent); + OutputJsonKey(stream, kTestsuite, "name", test_info.name(), kIndent); - if (test_info.value_param() != NULL) { - OutputJsonKey(stream, kTestcase, "value_param", - test_info.value_param(), kIndent); + if (test_info.value_param() != nullptr) { + OutputJsonKey(stream, kTestsuite, "value_param", test_info.value_param(), + kIndent); } - if (test_info.type_param() != NULL) { - OutputJsonKey(stream, kTestcase, "type_param", test_info.type_param(), + if (test_info.type_param() != nullptr) { + OutputJsonKey(stream, kTestsuite, "type_param", test_info.type_param(), kIndent); } if (GTEST_FLAG(list_tests)) { - OutputJsonKey(stream, kTestcase, "file", test_info.file(), kIndent); - OutputJsonKey(stream, kTestcase, "line", test_info.line(), kIndent, false); + OutputJsonKey(stream, kTestsuite, "file", test_info.file(), kIndent); + OutputJsonKey(stream, kTestsuite, "line", test_info.line(), kIndent, false); *stream << "\n" << Indent(8) << "}"; return; } - OutputJsonKey(stream, kTestcase, "status", + OutputJsonKey(stream, kTestsuite, "status", test_info.should_run() ? "RUN" : "NOTRUN", kIndent); - OutputJsonKey(stream, kTestcase, "time", + OutputJsonKey(stream, kTestsuite, "result", + test_info.should_run() + ? (result.Skipped() ? "SKIPPED" : "COMPLETED") + : "SUPPRESSED", + kIndent); + OutputJsonKey(stream, kTestsuite, "timestamp", + FormatEpochTimeInMillisAsRFC3339(result.start_timestamp()), + kIndent); + OutputJsonKey(stream, kTestsuite, "time", FormatTimeInMillisAsDuration(result.elapsed_time()), kIndent); - OutputJsonKey(stream, kTestcase, "classname", test_case_name, kIndent, false); + OutputJsonKey(stream, kTestsuite, "classname", test_suite_name, kIndent, + false); *stream << TestPropertiesAsJson(result, kIndent); int failures = 0; @@ -4110,40 +4223,44 @@ void JsonUnitTestResultPrinter::OutputJsonTestInfo(::std::ostream* stream, *stream << "\n" << Indent(8) << "}"; } -// Prints an JSON representation of a TestCase object -void JsonUnitTestResultPrinter::PrintJsonTestCase(std::ostream* stream, - const TestCase& test_case) { +// Prints an JSON representation of a TestSuite object +void JsonUnitTestResultPrinter::PrintJsonTestSuite( + std::ostream* stream, const TestSuite& test_suite) { const std::string kTestsuite = "testsuite"; const std::string kIndent = Indent(6); *stream << Indent(4) << "{\n"; - OutputJsonKey(stream, kTestsuite, "name", test_case.name(), kIndent); - OutputJsonKey(stream, kTestsuite, "tests", test_case.reportable_test_count(), + OutputJsonKey(stream, kTestsuite, "name", test_suite.name(), kIndent); + OutputJsonKey(stream, kTestsuite, "tests", test_suite.reportable_test_count(), kIndent); if (!GTEST_FLAG(list_tests)) { - OutputJsonKey(stream, kTestsuite, "failures", test_case.failed_test_count(), - kIndent); + OutputJsonKey(stream, kTestsuite, "failures", + test_suite.failed_test_count(), kIndent); OutputJsonKey(stream, kTestsuite, "disabled", - test_case.reportable_disabled_test_count(), kIndent); + test_suite.reportable_disabled_test_count(), kIndent); OutputJsonKey(stream, kTestsuite, "errors", 0, kIndent); + OutputJsonKey( + stream, kTestsuite, "timestamp", + FormatEpochTimeInMillisAsRFC3339(test_suite.start_timestamp()), + kIndent); OutputJsonKey(stream, kTestsuite, "time", - FormatTimeInMillisAsDuration(test_case.elapsed_time()), + FormatTimeInMillisAsDuration(test_suite.elapsed_time()), kIndent, false); - *stream << TestPropertiesAsJson(test_case.ad_hoc_test_result(), kIndent) + *stream << TestPropertiesAsJson(test_suite.ad_hoc_test_result(), kIndent) << ",\n"; } *stream << kIndent << "\"" << kTestsuite << "\": [\n"; bool comma = false; - for (int i = 0; i < test_case.total_test_count(); ++i) { - if (test_case.GetTestInfo(i)->is_reportable()) { + for (int i = 0; i < test_suite.total_test_count(); ++i) { + if (test_suite.GetTestInfo(i)->is_reportable()) { if (comma) { *stream << ",\n"; } else { comma = true; } - OutputJsonTestInfo(stream, test_case.name(), *test_case.GetTestInfo(i)); + OutputJsonTestInfo(stream, test_suite.name(), *test_suite.GetTestInfo(i)); } } *stream << "\n" << kIndent << "]\n" << Indent(4) << "}"; @@ -4181,14 +4298,14 @@ void JsonUnitTestResultPrinter::PrintJsonUnitTest(std::ostream* stream, *stream << kIndent << "\"" << kTestsuites << "\": [\n"; bool comma = false; - for (int i = 0; i < unit_test.total_test_case_count(); ++i) { - if (unit_test.GetTestCase(i)->reportable_test_count() > 0) { + for (int i = 0; i < unit_test.total_test_suite_count(); ++i) { + if (unit_test.GetTestSuite(i)->reportable_test_count() > 0) { if (comma) { *stream << ",\n"; } else { comma = true; } - PrintJsonTestCase(stream, *unit_test.GetTestCase(i)); + PrintJsonTestSuite(stream, *unit_test.GetTestSuite(i)); } } @@ -4196,24 +4313,24 @@ void JsonUnitTestResultPrinter::PrintJsonUnitTest(std::ostream* stream, } void JsonUnitTestResultPrinter::PrintJsonTestList( - std::ostream* stream, const std::vector& test_cases) { + std::ostream* stream, const std::vector& test_suites) { const std::string kTestsuites = "testsuites"; const std::string kIndent = Indent(2); *stream << "{\n"; int total_tests = 0; - for (size_t i = 0; i < test_cases.size(); ++i) { - total_tests += test_cases[i]->total_test_count(); + for (auto test_suite : test_suites) { + total_tests += test_suite->total_test_count(); } OutputJsonKey(stream, kTestsuites, "tests", total_tests, kIndent); OutputJsonKey(stream, kTestsuites, "name", "AllTests", kIndent); *stream << kIndent << "\"" << kTestsuites << "\": [\n"; - for (size_t i = 0; i < test_cases.size(); ++i) { + for (size_t i = 0; i < test_suites.size(); ++i) { if (i != 0) { *stream << ",\n"; } - PrintJsonTestCase(stream, *test_cases[i]); + PrintJsonTestSuite(stream, *test_suites[i]); } *stream << "\n" @@ -4269,7 +4386,7 @@ void StreamingListener::SocketWriter::MakeConnection() { memset(&hints, 0, sizeof(hints)); hints.ai_family = AF_UNSPEC; // To allow both IPv4 and IPv6 addresses. hints.ai_socktype = SOCK_STREAM; - addrinfo* servinfo = NULL; + addrinfo* servinfo = nullptr; // Use the getaddrinfo() to get a linked list of IP addresses for // the given host name. @@ -4281,7 +4398,7 @@ void StreamingListener::SocketWriter::MakeConnection() { } // Loop through all the results and connect to the first we can. - for (addrinfo* cur_addr = servinfo; sockfd_ == -1 && cur_addr != NULL; + for (addrinfo* cur_addr = servinfo; sockfd_ == -1 && cur_addr != nullptr; cur_addr = cur_addr->ai_next) { sockfd_ = socket( cur_addr->ai_family, cur_addr->ai_socktype, cur_addr->ai_protocol); @@ -4413,9 +4530,8 @@ class ScopedPrematureExitFile { TestEventListeners::TestEventListeners() : repeater_(new internal::TestEventRepeater()), - default_result_printer_(NULL), - default_xml_generator_(NULL) { -} + default_result_printer_(nullptr), + default_xml_generator_(nullptr) {} TestEventListeners::~TestEventListeners() { delete repeater_; } @@ -4432,9 +4548,9 @@ void TestEventListeners::Append(TestEventListener* listener) { // NULL if the listener is not found in the list. TestEventListener* TestEventListeners::Release(TestEventListener* listener) { if (listener == default_result_printer_) - default_result_printer_ = NULL; + default_result_printer_ = nullptr; else if (listener == default_xml_generator_) - default_xml_generator_ = NULL; + default_xml_generator_ = nullptr; return repeater_->Release(listener); } @@ -4453,8 +4569,7 @@ void TestEventListeners::SetDefaultResultPrinter(TestEventListener* listener) { // list. delete Release(default_result_printer_); default_result_printer_ = listener; - if (listener != NULL) - Append(listener); + if (listener != nullptr) Append(listener); } } @@ -4469,8 +4584,7 @@ void TestEventListeners::SetDefaultXmlGenerator(TestEventListener* listener) { // list. delete Release(default_xml_generator_); default_xml_generator_ = listener; - if (listener != NULL) - Append(listener); + if (listener != nullptr) Append(listener); } } @@ -4494,52 +4608,66 @@ void TestEventListeners::SuppressEventForwarding() { // call this before main() starts, from which point on the return // value will never change. UnitTest* UnitTest::GetInstance() { - // When compiled with MSVC 7.1 in optimized mode, destroying the - // UnitTest object upon exiting the program messes up the exit code, - // causing successful tests to appear failed. We have to use a - // different implementation in this case to bypass the compiler bug. - // This implementation makes the compiler happy, at the cost of - // leaking the UnitTest object. - // CodeGear C++Builder insists on a public destructor for the // default implementation. Use this implementation to keep good OO // design with private destructor. -#if (_MSC_VER == 1310 && !defined(_DEBUG)) || defined(__BORLANDC__) +#if defined(__BORLANDC__) static UnitTest* const instance = new UnitTest; return instance; #else static UnitTest instance; return &instance; -#endif // (_MSC_VER == 1310 && !defined(_DEBUG)) || defined(__BORLANDC__) +#endif // defined(__BORLANDC__) } -// Gets the number of successful test cases. -int UnitTest::successful_test_case_count() const { - return impl()->successful_test_case_count(); +// Gets the number of successful test suites. +int UnitTest::successful_test_suite_count() const { + return impl()->successful_test_suite_count(); } -// Gets the number of failed test cases. -int UnitTest::failed_test_case_count() const { - return impl()->failed_test_case_count(); +// Gets the number of failed test suites. +int UnitTest::failed_test_suite_count() const { + return impl()->failed_test_suite_count(); } -// Gets the number of all test cases. -int UnitTest::total_test_case_count() const { - return impl()->total_test_case_count(); +// Gets the number of all test suites. +int UnitTest::total_test_suite_count() const { + return impl()->total_test_suite_count(); } -// Gets the number of all test cases that contain at least one test +// Gets the number of all test suites that contain at least one test // that should run. +int UnitTest::test_suite_to_run_count() const { + return impl()->test_suite_to_run_count(); +} + +// Legacy API is deprecated but still available +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ +int UnitTest::successful_test_case_count() const { + return impl()->successful_test_suite_count(); +} +int UnitTest::failed_test_case_count() const { + return impl()->failed_test_suite_count(); +} +int UnitTest::total_test_case_count() const { + return impl()->total_test_suite_count(); +} int UnitTest::test_case_to_run_count() const { - return impl()->test_case_to_run_count(); + return impl()->test_suite_to_run_count(); } +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ // Gets the number of successful tests. int UnitTest::successful_test_count() const { return impl()->successful_test_count(); } +// Gets the number of skipped tests. +int UnitTest::skipped_test_count() const { + return impl()->skipped_test_count(); +} + // Gets the number of failed tests. int UnitTest::failed_test_count() const { return impl()->failed_test_count(); } @@ -4575,29 +4703,37 @@ internal::TimeInMillis UnitTest::elapsed_time() const { return impl()->elapsed_time(); } -// Returns true iff the unit test passed (i.e. all test cases passed). +// Returns true if and only if the unit test passed (i.e. all test suites +// passed). bool UnitTest::Passed() const { return impl()->Passed(); } -// Returns true iff the unit test failed (i.e. some test case failed -// or something outside of all tests failed). +// Returns true if and only if the unit test failed (i.e. some test suite +// failed or something outside of all tests failed). bool UnitTest::Failed() const { return impl()->Failed(); } -// Gets the i-th test case among all the test cases. i can range from 0 to -// total_test_case_count() - 1. If i is not in that range, returns NULL. +// Gets the i-th test suite among all the test suites. i can range from 0 to +// total_test_suite_count() - 1. If i is not in that range, returns NULL. +const TestSuite* UnitTest::GetTestSuite(int i) const { + return impl()->GetTestSuite(i); +} + +// Legacy API is deprecated but still available +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ const TestCase* UnitTest::GetTestCase(int i) const { return impl()->GetTestCase(i); } +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ // Returns the TestResult containing information on test failures and -// properties logged outside of individual test cases. +// properties logged outside of individual test suites. const TestResult& UnitTest::ad_hoc_test_result() const { return *impl()->ad_hoc_test_result(); } -// Gets the i-th test case among all the test cases. i can range from 0 to -// total_test_case_count() - 1. If i is not in that range, returns NULL. -TestCase* UnitTest::GetMutableTestCase(int i) { - return impl()->GetMutableTestCase(i); +// Gets the i-th test suite among all the test suites. i can range from 0 to +// total_test_suite_count() - 1. If i is not in that range, returns NULL. +TestSuite* UnitTest::GetMutableTestSuite(int i) { + return impl()->GetMutableSuiteCase(i); } // Returns the list of event listeners that can be used to track events @@ -4617,8 +4753,8 @@ TestEventListeners& UnitTest::listeners() { // We don't protect this under mutex_, as we only support calling it // from the main thread. Environment* UnitTest::AddEnvironment(Environment* env) { - if (env == NULL) { - return NULL; + if (env == nullptr) { + return nullptr; } impl_->environments().push_back(env); @@ -4642,25 +4778,24 @@ void UnitTest::AddTestPartResult( if (impl_->gtest_trace_stack().size() > 0) { msg << "\n" << GTEST_NAME_ << " trace:"; - for (int i = static_cast(impl_->gtest_trace_stack().size()); - i > 0; --i) { + for (size_t i = impl_->gtest_trace_stack().size(); i > 0; --i) { const internal::TraceInfo& trace = impl_->gtest_trace_stack()[i - 1]; msg << "\n" << internal::FormatFileLocation(trace.file, trace.line) << " " << trace.message; } } - if (os_stack_trace.c_str() != NULL && !os_stack_trace.empty()) { + if (os_stack_trace.c_str() != nullptr && !os_stack_trace.empty()) { msg << internal::kStackTraceMarker << os_stack_trace; } - const TestPartResult result = - TestPartResult(result_type, file_name, line_number, - msg.GetString().c_str()); + const TestPartResult result = TestPartResult( + result_type, file_name, line_number, msg.GetString().c_str()); impl_->GetTestPartResultReporterForCurrentThread()-> ReportTestPartResult(result); - if (result_type != TestPartResult::kSuccess) { + if (result_type != TestPartResult::kSuccess && + result_type != TestPartResult::kSkip) { // gtest_break_on_failure takes precedence over // gtest_throw_on_failure. This allows a user to set the latter // in the code (perhaps in order to use Google Test assertions @@ -4678,11 +4813,10 @@ void UnitTest::AddTestPartResult( // with clang/gcc we can achieve the same effect on x86 by invoking int3 asm("int3"); #else - // Dereference NULL through a volatile pointer to prevent the compiler + // Dereference nullptr through a volatile pointer to prevent the compiler // from removing. We use this rather than abort() or __builtin_trap() for - // portability: Symbian doesn't implement abort() well, and some debuggers - // don't correctly trap abort(). - *static_cast(NULL) = 1; + // portability: some debuggers don't correctly trap abort(). + *static_cast(nullptr) = 1; #endif // GTEST_OS_WINDOWS } else if (GTEST_FLAG(throw_on_failure)) { #if GTEST_HAS_EXCEPTIONS @@ -4697,8 +4831,8 @@ void UnitTest::AddTestPartResult( } // Adds a TestProperty to the current TestResult object when invoked from -// inside a test, to current TestCase's ad_hoc_test_result_ when invoked -// from SetUpTestCase or TearDownTestCase, or to the global property set +// inside a test, to current TestSuite's ad_hoc_test_result_ when invoked +// from SetUpTestSuite or TearDownTestSuite, or to the global property set // when invoked elsewhere. If the result already contains a property with // the same key, the value will be updated. void UnitTest::RecordProperty(const std::string& key, @@ -4737,8 +4871,9 @@ int UnitTest::Run() { // that understands the premature-exit-file protocol to report the // test as having failed. const internal::ScopedPrematureExitFile premature_exit_file( - in_death_test_child_process ? - NULL : internal::posix::GetEnv("TEST_PREMATURE_EXIT_FILE")); + in_death_test_child_process + ? nullptr + : internal::posix::GetEnv("TEST_PREMATURE_EXIT_FILE")); // Captures the value of GTEST_FLAG(catch_exceptions). This value will be // used for the duration of the program. @@ -4763,23 +4898,27 @@ int UnitTest::Run() { _set_error_mode(_OUT_TO_STDERR); # endif -# if _MSC_VER >= 1400 && !GTEST_OS_WINDOWS_MOBILE +# if defined(_MSC_VER) && !GTEST_OS_WINDOWS_MOBILE // In the debug version, Visual Studio pops up a separate dialog // offering a choice to debug the aborted program. We need to suppress // this dialog or it will pop up for every EXPECT/ASSERT_DEATH statement // executed. Google Test will notify the user of any unexpected // failure via stderr. - // - // VC++ doesn't define _set_abort_behavior() prior to the version 8.0. - // Users of prior VC versions shall suffer the agony and pain of - // clicking through the countless debug dialogs. - // FIXME: find a way to suppress the abort dialog() in the - // debug mode when compiled with VC 7.1 or lower. if (!GTEST_FLAG(break_on_failure)) _set_abort_behavior( 0x0, // Clear the following flags: _WRITE_ABORT_MSG | _CALL_REPORTFAULT); // pop-up window, core dump. # endif + + // In debug mode, the Windows CRT can crash with an assertion over invalid + // input (e.g. passing an invalid file descriptor). The default handling + // for these assertions is to pop up a dialog and wait for user input. + // Instead ask the CRT to dump such assertions to stderr non-interactively. + if (!IsDebuggerPresent()) { + (void)_CrtSetReportMode(_CRT_ASSERT, + _CRTDBG_MODE_FILE | _CRTDBG_MODE_DEBUG); + (void)_CrtSetReportFile(_CRT_ASSERT, _CRTDBG_FILE_STDERR); + } } #endif // GTEST_OS_WINDOWS @@ -4795,13 +4934,22 @@ const char* UnitTest::original_working_dir() const { return impl_->original_working_dir_.c_str(); } -// Returns the TestCase object for the test that's currently running, +// Returns the TestSuite object for the test that's currently running, // or NULL if no test is running. +const TestSuite* UnitTest::current_test_suite() const + GTEST_LOCK_EXCLUDED_(mutex_) { + internal::MutexLock lock(&mutex_); + return impl_->current_test_suite(); +} + +// Legacy API is still available but deprecated +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ const TestCase* UnitTest::current_test_case() const GTEST_LOCK_EXCLUDED_(mutex_) { internal::MutexLock lock(&mutex_); - return impl_->current_test_case(); + return impl_->current_test_suite(); } +#endif // Returns the TestInfo object for the test that's currently running, // or NULL if no test is running. @@ -4814,11 +4962,10 @@ const TestInfo* UnitTest::current_test_info() const // Returns the random seed used at the start of the current test run. int UnitTest::random_seed() const { return impl_->random_seed(); } -// Returns ParameterizedTestCaseRegistry object used to keep track of +// Returns ParameterizedTestSuiteRegistry object used to keep track of // value-parameterized tests and instantiate and register them. -internal::ParameterizedTestCaseRegistry& - UnitTest::parameterized_test_registry() - GTEST_LOCK_EXCLUDED_(mutex_) { +internal::ParameterizedTestSuiteRegistry& +UnitTest::parameterized_test_registry() GTEST_LOCK_EXCLUDED_(mutex_) { return impl_->parameterized_test_registry(); } @@ -4852,23 +4999,22 @@ namespace internal { UnitTestImpl::UnitTestImpl(UnitTest* parent) : parent_(parent), GTEST_DISABLE_MSC_WARNINGS_PUSH_(4355 /* using this in initializer */) - default_global_test_part_result_reporter_(this), + default_global_test_part_result_reporter_(this), default_per_thread_test_part_result_reporter_(this), - GTEST_DISABLE_MSC_WARNINGS_POP_() - global_test_part_result_repoter_( + GTEST_DISABLE_MSC_WARNINGS_POP_() global_test_part_result_repoter_( &default_global_test_part_result_reporter_), per_thread_test_part_result_reporter_( &default_per_thread_test_part_result_reporter_), parameterized_test_registry_(), parameterized_tests_registered_(false), - last_death_test_case_(-1), - current_test_case_(NULL), - current_test_info_(NULL), + last_death_test_suite_(-1), + current_test_suite_(nullptr), + current_test_info_(nullptr), ad_hoc_test_result_(), - os_stack_trace_getter_(NULL), + os_stack_trace_getter_(nullptr), post_flag_parse_init_performed_(false), random_seed_(0), // Will be overridden by the flag before first use. - random_(0), // Will be reseeded before first use. + random_(0), // Will be reseeded before first use. start_timestamp_(0), elapsed_time_(0), #if GTEST_HAS_DEATH_TEST @@ -4880,8 +5026,8 @@ UnitTestImpl::UnitTestImpl(UnitTest* parent) } UnitTestImpl::~UnitTestImpl() { - // Deletes every TestCase. - ForEach(test_cases_, internal::Delete); + // Deletes every TestSuite. + ForEach(test_suites_, internal::Delete); // Deletes every Environment. ForEach(environments_, internal::Delete); @@ -4890,20 +5036,20 @@ UnitTestImpl::~UnitTestImpl() { } // Adds a TestProperty to the current TestResult object when invoked in a -// context of a test, to current test case's ad_hoc_test_result when invoke -// from SetUpTestCase/TearDownTestCase, or to the global property set +// context of a test, to current test suite's ad_hoc_test_result when invoke +// from SetUpTestSuite/TearDownTestSuite, or to the global property set // otherwise. If the result already contains a property with the same key, // the value will be updated. void UnitTestImpl::RecordProperty(const TestProperty& test_property) { std::string xml_element; TestResult* test_result; // TestResult appropriate for property recording. - if (current_test_info_ != NULL) { + if (current_test_info_ != nullptr) { xml_element = "testcase"; test_result = &(current_test_info_->result_); - } else if (current_test_case_ != NULL) { + } else if (current_test_suite_ != nullptr) { xml_element = "testsuite"; - test_result = &(current_test_case_->ad_hoc_test_result_); + test_result = &(current_test_suite_->ad_hoc_test_result_); } else { xml_element = "testsuites"; test_result = &ad_hoc_test_result_; @@ -4915,7 +5061,7 @@ void UnitTestImpl::RecordProperty(const TestProperty& test_property) { // Disables event forwarding if the control is currently in a death test // subprocess. Must not be called before InitGoogleTest. void UnitTestImpl::SuppressTestEventsIfInSubprocess() { - if (internal_run_death_test_flag_.get() != NULL) + if (internal_run_death_test_flag_.get() != nullptr) listeners()->SuppressEventForwarding(); } #endif // GTEST_HAS_DEATH_TEST @@ -4997,74 +5143,73 @@ void UnitTestImpl::PostFlagParsingInit() { } } -// A predicate that checks the name of a TestCase against a known +// A predicate that checks the name of a TestSuite against a known // value. // // This is used for implementation of the UnitTest class only. We put // it in the anonymous namespace to prevent polluting the outer // namespace. // -// TestCaseNameIs is copyable. -class TestCaseNameIs { +// TestSuiteNameIs is copyable. +class TestSuiteNameIs { public: // Constructor. - explicit TestCaseNameIs(const std::string& name) - : name_(name) {} + explicit TestSuiteNameIs(const std::string& name) : name_(name) {} - // Returns true iff the name of test_case matches name_. - bool operator()(const TestCase* test_case) const { - return test_case != NULL && strcmp(test_case->name(), name_.c_str()) == 0; + // Returns true if and only if the name of test_suite matches name_. + bool operator()(const TestSuite* test_suite) const { + return test_suite != nullptr && + strcmp(test_suite->name(), name_.c_str()) == 0; } private: std::string name_; }; -// Finds and returns a TestCase with the given name. If one doesn't +// Finds and returns a TestSuite with the given name. If one doesn't // exist, creates one and returns it. It's the CALLER'S // RESPONSIBILITY to ensure that this function is only called WHEN THE // TESTS ARE NOT SHUFFLED. // // Arguments: // -// test_case_name: name of the test case -// type_param: the name of the test case's type parameter, or NULL if -// this is not a typed or a type-parameterized test case. -// set_up_tc: pointer to the function that sets up the test case -// tear_down_tc: pointer to the function that tears down the test case -TestCase* UnitTestImpl::GetTestCase(const char* test_case_name, - const char* type_param, - Test::SetUpTestCaseFunc set_up_tc, - Test::TearDownTestCaseFunc tear_down_tc) { - // Can we find a TestCase with the given name? - const std::vector::const_reverse_iterator test_case = - std::find_if(test_cases_.rbegin(), test_cases_.rend(), - TestCaseNameIs(test_case_name)); - - if (test_case != test_cases_.rend()) - return *test_case; +// test_suite_name: name of the test suite +// type_param: the name of the test suite's type parameter, or NULL if +// this is not a typed or a type-parameterized test suite. +// set_up_tc: pointer to the function that sets up the test suite +// tear_down_tc: pointer to the function that tears down the test suite +TestSuite* UnitTestImpl::GetTestSuite( + const char* test_suite_name, const char* type_param, + internal::SetUpTestSuiteFunc set_up_tc, + internal::TearDownTestSuiteFunc tear_down_tc) { + // Can we find a TestSuite with the given name? + const auto test_suite = + std::find_if(test_suites_.rbegin(), test_suites_.rend(), + TestSuiteNameIs(test_suite_name)); + + if (test_suite != test_suites_.rend()) return *test_suite; // No. Let's create one. - TestCase* const new_test_case = - new TestCase(test_case_name, type_param, set_up_tc, tear_down_tc); - - // Is this a death test case? - if (internal::UnitTestOptions::MatchesFilter(test_case_name, - kDeathTestCaseFilter)) { - // Yes. Inserts the test case after the last death test case - // defined so far. This only works when the test cases haven't + auto* const new_test_suite = + new TestSuite(test_suite_name, type_param, set_up_tc, tear_down_tc); + + // Is this a death test suite? + if (internal::UnitTestOptions::MatchesFilter(test_suite_name, + kDeathTestSuiteFilter)) { + // Yes. Inserts the test suite after the last death test suite + // defined so far. This only works when the test suites haven't // been shuffled. Otherwise we may end up running a death test // after a non-death test. - ++last_death_test_case_; - test_cases_.insert(test_cases_.begin() + last_death_test_case_, - new_test_case); + ++last_death_test_suite_; + test_suites_.insert(test_suites_.begin() + last_death_test_suite_, + new_test_suite); } else { // No. Appends to the end of the list. - test_cases_.push_back(new_test_case); + test_suites_.push_back(new_test_suite); } - test_case_indices_.push_back(static_cast(test_case_indices_.size())); - return new_test_case; + test_suite_indices_.push_back(static_cast(test_suite_indices_.size())); + return new_test_suite; } // Helpers for setting up / tearing down the given environment. They @@ -5082,7 +5227,8 @@ static void TearDownEnvironment(Environment* env) { env->TearDown(); } // All other functions called from RunAllTests() may safely assume that // parameterized tests are ready to be counted and run. bool UnitTestImpl::RunAllTests() { - // True iff Google Test is initialized before RUN_ALL_TESTS() is called. + // True if and only if Google Test is initialized before RUN_ALL_TESTS() is + // called. const bool gtest_is_initialized_before_run_all_tests = GTestIsInitialized(); // Do not run any test if the --help flag was specified. @@ -5098,12 +5244,13 @@ bool UnitTestImpl::RunAllTests() { // protocol. internal::WriteToShardStatusFileIfNeeded(); - // True iff we are in a subprocess for running a thread-safe-style + // True if and only if we are in a subprocess for running a thread-safe-style // death test. bool in_subprocess_for_death_test = false; #if GTEST_HAS_DEATH_TEST - in_subprocess_for_death_test = (internal_run_death_test_flag_.get() != NULL); + in_subprocess_for_death_test = + (internal_run_death_test_flag_.get() != nullptr); # if defined(GTEST_EXTRA_DEATH_TEST_CHILD_SETUP_) if (in_subprocess_for_death_test) { GTEST_EXTRA_DEATH_TEST_CHILD_SETUP_(); @@ -5130,7 +5277,7 @@ bool UnitTestImpl::RunAllTests() { random_seed_ = GTEST_FLAG(shuffle) ? GetRandomSeedFromFlag(GTEST_FLAG(random_seed)) : 0; - // True iff at least one test has failed. + // True if and only if at least one test has failed. bool failed = false; TestEventListener* repeater = listeners()->repeater(); @@ -5142,17 +5289,17 @@ bool UnitTestImpl::RunAllTests() { // when we are inside the subprocess of a death test. const int repeat = in_subprocess_for_death_test ? 1 : GTEST_FLAG(repeat); // Repeats forever if the repeat count is negative. - const bool forever = repeat < 0; - for (int i = 0; forever || i != repeat; i++) { + const bool gtest_repeat_forever = repeat < 0; + for (int i = 0; gtest_repeat_forever || i != repeat; i++) { // We want to preserve failures generated by ad-hoc test // assertions executed before RUN_ALL_TESTS(). ClearNonAdHocTestResult(); const TimeInMillis start = GetTimeInMillis(); - // Shuffles test cases and tests if requested. + // Shuffles test suites and tests if requested. if (has_tests_to_run && GTEST_FLAG(shuffle)) { - random()->Reseed(random_seed_); + random()->Reseed(static_cast(random_seed_)); // This should be done before calling OnTestIterationStart(), // such that a test event listener can see the actual test order // in the event. @@ -5162,19 +5309,33 @@ bool UnitTestImpl::RunAllTests() { // Tells the unit test event listeners that the tests are about to start. repeater->OnTestIterationStart(*parent_, i); - // Runs each test case if there is at least one test to run. + // Runs each test suite if there is at least one test to run. if (has_tests_to_run) { // Sets up all environments beforehand. repeater->OnEnvironmentsSetUpStart(*parent_); ForEach(environments_, SetUpEnvironment); repeater->OnEnvironmentsSetUpEnd(*parent_); - // Runs the tests only if there was no fatal failure during global - // set-up. - if (!Test::HasFatalFailure()) { - for (int test_index = 0; test_index < total_test_case_count(); + // Runs the tests only if there was no fatal failure or skip triggered + // during global set-up. + if (Test::IsSkipped()) { + // Emit diagnostics when global set-up calls skip, as it will not be + // emitted by default. + TestResult& test_result = + *internal::GetUnitTestImpl()->current_test_result(); + for (int j = 0; j < test_result.total_part_count(); ++j) { + const TestPartResult& test_part_result = + test_result.GetTestPartResult(j); + if (test_part_result.type() == TestPartResult::kSkip) { + const std::string& result = test_part_result.message(); + printf("%s\n", result.c_str()); + } + } + fflush(stdout); + } else if (!Test::HasFatalFailure()) { + for (int test_index = 0; test_index < total_test_suite_count(); test_index++) { - GetMutableTestCase(test_index)->Run(); + GetMutableSuiteCase(test_index)->Run(); } } @@ -5234,9 +5395,9 @@ bool UnitTestImpl::RunAllTests() { // be created, prints an error and exits. void WriteToShardStatusFileIfNeeded() { const char* const test_shard_file = posix::GetEnv(kTestShardStatusFile); - if (test_shard_file != NULL) { + if (test_shard_file != nullptr) { FILE* const file = posix::FOpen(test_shard_file, "w"); - if (file == NULL) { + if (file == nullptr) { ColoredPrintf(COLOR_RED, "Could not write to the test shard status file \"%s\" " "specified by the %s environment variable.\n", @@ -5271,7 +5432,7 @@ bool ShouldShard(const char* total_shards_env, << "Invalid environment variables: you have " << kTestShardIndex << " = " << shard_index << ", but have left " << kTestTotalShards << " unset.\n"; - ColoredPrintf(COLOR_RED, msg.GetString().c_str()); + ColoredPrintf(COLOR_RED, "%s", msg.GetString().c_str()); fflush(stdout); exit(EXIT_FAILURE); } else if (total_shards != -1 && shard_index == -1) { @@ -5279,7 +5440,7 @@ bool ShouldShard(const char* total_shards_env, << "Invalid environment variables: you have " << kTestTotalShards << " = " << total_shards << ", but have left " << kTestShardIndex << " unset.\n"; - ColoredPrintf(COLOR_RED, msg.GetString().c_str()); + ColoredPrintf(COLOR_RED, "%s", msg.GetString().c_str()); fflush(stdout); exit(EXIT_FAILURE); } else if (shard_index < 0 || shard_index >= total_shards) { @@ -5288,7 +5449,7 @@ bool ShouldShard(const char* total_shards_env, << kTestShardIndex << " < " << kTestTotalShards << ", but you have " << kTestShardIndex << "=" << shard_index << ", " << kTestTotalShards << "=" << total_shards << ".\n"; - ColoredPrintf(COLOR_RED, msg.GetString().c_str()); + ColoredPrintf(COLOR_RED, "%s", msg.GetString().c_str()); fflush(stdout); exit(EXIT_FAILURE); } @@ -5301,7 +5462,7 @@ bool ShouldShard(const char* total_shards_env, // and aborts. Int32 Int32FromEnvOrDie(const char* var, Int32 default_val) { const char* str_val = posix::GetEnv(var); - if (str_val == NULL) { + if (str_val == nullptr) { return default_val; } @@ -5314,8 +5475,8 @@ Int32 Int32FromEnvOrDie(const char* var, Int32 default_val) { } // Given the total number of shards, the shard index, and the test id, -// returns true iff the test should be run on this shard. The test id is -// some arbitrary but unique non-negative integer assigned to each test +// returns true if and only if the test should be run on this shard. The test id +// is some arbitrary but unique non-negative integer assigned to each test // method. Assumes that 0 <= shard_index < total_shards. bool ShouldRunTestOnShard(int total_shards, int shard_index, int test_id) { return (test_id % total_shards) == shard_index; @@ -5323,7 +5484,7 @@ bool ShouldRunTestOnShard(int total_shards, int shard_index, int test_id) { // Compares the name of each test with the user-specified filter to // decide whether the test should be run, then records the result in -// each TestCase and TestInfo object. +// each TestSuite and TestInfo object. // If shard_tests == true, further filters tests based on sharding // variables in the environment - see // https://github.com/google/googletest/blob/master/googletest/docs/advanced.md @@ -5340,26 +5501,23 @@ int UnitTestImpl::FilterTests(ReactionToSharding shard_tests) { // this shard. int num_runnable_tests = 0; int num_selected_tests = 0; - for (size_t i = 0; i < test_cases_.size(); i++) { - TestCase* const test_case = test_cases_[i]; - const std::string &test_case_name = test_case->name(); - test_case->set_should_run(false); + for (auto* test_suite : test_suites_) { + const std::string& test_suite_name = test_suite->name(); + test_suite->set_should_run(false); - for (size_t j = 0; j < test_case->test_info_list().size(); j++) { - TestInfo* const test_info = test_case->test_info_list()[j]; + for (size_t j = 0; j < test_suite->test_info_list().size(); j++) { + TestInfo* const test_info = test_suite->test_info_list()[j]; const std::string test_name(test_info->name()); - // A test is disabled if test case name or test name matches + // A test is disabled if test suite name or test name matches // kDisableTestFilter. - const bool is_disabled = - internal::UnitTestOptions::MatchesFilter(test_case_name, - kDisableTestFilter) || - internal::UnitTestOptions::MatchesFilter(test_name, - kDisableTestFilter); + const bool is_disabled = internal::UnitTestOptions::MatchesFilter( + test_suite_name, kDisableTestFilter) || + internal::UnitTestOptions::MatchesFilter( + test_name, kDisableTestFilter); test_info->is_disabled_ = is_disabled; - const bool matches_filter = - internal::UnitTestOptions::FilterMatchesTest(test_case_name, - test_name); + const bool matches_filter = internal::UnitTestOptions::FilterMatchesTest( + test_suite_name, test_name); test_info->matches_filter_ = matches_filter; const bool is_runnable = @@ -5376,7 +5534,7 @@ int UnitTestImpl::FilterTests(ReactionToSharding shard_tests) { num_selected_tests += is_selected; test_info->should_run_ = is_selected; - test_case->set_should_run(test_case->should_run() || is_selected); + test_suite->set_should_run(test_suite->should_run() || is_selected); } } return num_selected_tests; @@ -5387,7 +5545,7 @@ int UnitTestImpl::FilterTests(ReactionToSharding shard_tests) { // max_length characters, only prints the first max_length characters // and "...". static void PrintOnOneLine(const char* str, int max_length) { - if (str != NULL) { + if (str != nullptr) { for (int i = 0; *str != '\0'; ++str) { if (i >= max_length) { printf("..."); @@ -5409,27 +5567,25 @@ void UnitTestImpl::ListTestsMatchingFilter() { // Print at most this many characters for each type/value parameter. const int kMaxParamLength = 250; - for (size_t i = 0; i < test_cases_.size(); i++) { - const TestCase* const test_case = test_cases_[i]; - bool printed_test_case_name = false; + for (auto* test_suite : test_suites_) { + bool printed_test_suite_name = false; - for (size_t j = 0; j < test_case->test_info_list().size(); j++) { - const TestInfo* const test_info = - test_case->test_info_list()[j]; + for (size_t j = 0; j < test_suite->test_info_list().size(); j++) { + const TestInfo* const test_info = test_suite->test_info_list()[j]; if (test_info->matches_filter_) { - if (!printed_test_case_name) { - printed_test_case_name = true; - printf("%s.", test_case->name()); - if (test_case->type_param() != NULL) { + if (!printed_test_suite_name) { + printed_test_suite_name = true; + printf("%s.", test_suite->name()); + if (test_suite->type_param() != nullptr) { printf(" # %s = ", kTypeParamLabel); // We print the type parameter on a single line to make // the output easy to parse by a program. - PrintOnOneLine(test_case->type_param(), kMaxParamLength); + PrintOnOneLine(test_suite->type_param(), kMaxParamLength); } printf("\n"); } printf(" %s", test_info->name()); - if (test_info->value_param() != NULL) { + if (test_info->value_param() != nullptr) { printf(" # %s = ", kValueParamLabel); // We print the value parameter on a single line to make the // output easy to parse by a program. @@ -5448,11 +5604,11 @@ void UnitTestImpl::ListTestsMatchingFilter() { if (output_format == "xml") { XmlUnitTestResultPrinter( UnitTestOptions::GetAbsolutePathToOutputFile().c_str()) - .PrintXmlTestsList(&stream, test_cases_); + .PrintXmlTestsList(&stream, test_suites_); } else if (output_format == "json") { JsonUnitTestResultPrinter( UnitTestOptions::GetAbsolutePathToOutputFile().c_str()) - .PrintJsonTestList(&stream, test_cases_); + .PrintJsonTestList(&stream, test_suites_); } fprintf(fileout, "%s", StringStreamToString(&stream).c_str()); fclose(fileout); @@ -5476,7 +5632,7 @@ void UnitTestImpl::set_os_stack_trace_getter( // otherwise, creates an OsStackTraceGetter, makes it the current // getter, and returns it. OsStackTraceGetterInterface* UnitTestImpl::os_stack_trace_getter() { - if (os_stack_trace_getter_ == NULL) { + if (os_stack_trace_getter_ == nullptr) { #ifdef GTEST_OS_STACK_TRACE_GETTER_ os_stack_trace_getter_ = new GTEST_OS_STACK_TRACE_GETTER_; #else @@ -5489,38 +5645,38 @@ OsStackTraceGetterInterface* UnitTestImpl::os_stack_trace_getter() { // Returns the most specific TestResult currently running. TestResult* UnitTestImpl::current_test_result() { - if (current_test_info_ != NULL) { + if (current_test_info_ != nullptr) { return ¤t_test_info_->result_; } - if (current_test_case_ != NULL) { - return ¤t_test_case_->ad_hoc_test_result_; + if (current_test_suite_ != nullptr) { + return ¤t_test_suite_->ad_hoc_test_result_; } return &ad_hoc_test_result_; } -// Shuffles all test cases, and the tests within each test case, +// Shuffles all test suites, and the tests within each test suite, // making sure that death tests are still run first. void UnitTestImpl::ShuffleTests() { - // Shuffles the death test cases. - ShuffleRange(random(), 0, last_death_test_case_ + 1, &test_case_indices_); + // Shuffles the death test suites. + ShuffleRange(random(), 0, last_death_test_suite_ + 1, &test_suite_indices_); - // Shuffles the non-death test cases. - ShuffleRange(random(), last_death_test_case_ + 1, - static_cast(test_cases_.size()), &test_case_indices_); + // Shuffles the non-death test suites. + ShuffleRange(random(), last_death_test_suite_ + 1, + static_cast(test_suites_.size()), &test_suite_indices_); - // Shuffles the tests inside each test case. - for (size_t i = 0; i < test_cases_.size(); i++) { - test_cases_[i]->ShuffleTests(random()); + // Shuffles the tests inside each test suite. + for (auto& test_suite : test_suites_) { + test_suite->ShuffleTests(random()); } } -// Restores the test cases and tests to their order before the first shuffle. +// Restores the test suites and tests to their order before the first shuffle. void UnitTestImpl::UnshuffleTests() { - for (size_t i = 0; i < test_cases_.size(); i++) { - // Unshuffles the tests in each test case. - test_cases_[i]->UnshuffleTests(); - // Resets the index of each test case. - test_case_indices_[i] = static_cast(i); + for (size_t i = 0; i < test_suites_.size(); i++) { + // Unshuffles the tests in each test suite. + test_suites_[i]->UnshuffleTests(); + // Resets the index of each test suite. + test_suite_indices_[i] = static_cast(i); } } @@ -5579,12 +5735,12 @@ bool SkipPrefix(const char* prefix, const char** pstr) { static const char* ParseFlagValue(const char* str, const char* flag, bool def_optional) { // str and flag must not be NULL. - if (str == NULL || flag == NULL) return NULL; + if (str == nullptr || flag == nullptr) return nullptr; // The flag must start with "--" followed by GTEST_FLAG_PREFIX_. const std::string flag_str = std::string("--") + GTEST_FLAG_PREFIX_ + flag; const size_t flag_len = flag_str.length(); - if (strncmp(str, flag_str.c_str(), flag_len) != 0) return NULL; + if (strncmp(str, flag_str.c_str(), flag_len) != 0) return nullptr; // Skips the flag name. const char* flag_end = str + flag_len; @@ -5597,7 +5753,7 @@ static const char* ParseFlagValue(const char* str, const char* flag, // If def_optional is true and there are more characters after the // flag name, or if def_optional is false, there must be a '=' after // the flag name. - if (flag_end[0] != '=') return NULL; + if (flag_end[0] != '=') return nullptr; // Returns the string after "=". return flag_end + 1; @@ -5618,7 +5774,7 @@ static bool ParseBoolFlag(const char* str, const char* flag, bool* value) { const char* const value_str = ParseFlagValue(str, flag, true); // Aborts if the parsing failed. - if (value_str == NULL) return false; + if (value_str == nullptr) return false; // Converts the string value to a bool. *value = !(*value_str == '0' || *value_str == 'f' || *value_str == 'F'); @@ -5635,7 +5791,7 @@ bool ParseInt32Flag(const char* str, const char* flag, Int32* value) { const char* const value_str = ParseFlagValue(str, flag, false); // Aborts if the parsing failed. - if (value_str == NULL) return false; + if (value_str == nullptr) return false; // Sets *value to the value of the flag. return ParseInt32(Message() << "The value of flag --" << flag, @@ -5653,7 +5809,7 @@ static bool ParseStringFlag(const char* str, const char* flag, String* value) { const char* const value_str = ParseFlagValue(str, flag, false); // Aborts if the parsing failed. - if (value_str == NULL) return false; + if (value_str == nullptr) return false; // Sets *value to the value of the flag. *value = value_str; @@ -5684,8 +5840,6 @@ static bool HasGoogleTestFlagPrefix(const char* str) { // @Y changes the color to yellow. // @D changes to the default terminal text color. // -// FIXME: Write tests for this once we add stdout -// capturing to Google Test. static void PrintColorEncoded(const char* str) { GTestColor color = COLOR_DEFAULT; // The current color. @@ -5695,7 +5849,7 @@ static void PrintColorEncoded(const char* str) { // next segment. for (;;) { const char* p = strchr(str, '@'); - if (p == NULL) { + if (p == nullptr) { ColoredPrintf(color, "%s", str); return; } @@ -5753,7 +5907,7 @@ static const char kColorEncodedHelpMessage[] = " @G--" GTEST_FLAG_PREFIX_ "output=@Y(@Gjson@Y|@Gxml@Y)[@G:@YDIRECTORY_PATH@G" GTEST_PATH_SEP_ "@Y|@G:@YFILE_PATH]@D\n" " Generate a JSON or XML report in the given directory or with the given\n" -" file name. @YFILE_PATH@D defaults to @Gtest_details.xml@D.\n" +" file name. @YFILE_PATH@D defaults to @Gtest_detail.xml@D.\n" # if GTEST_CAN_STREAM_RESULTS_ " @G--" GTEST_FLAG_PREFIX_ "stream_result_to=@YHOST@G:@YPORT@D\n" " Stream test results to the given server.\n" @@ -5895,7 +6049,7 @@ void ParseGoogleTestFlagsOnlyImpl(int* argc, CharType** argv) { void ParseGoogleTestFlagsOnly(int* argc, char** argv) { ParseGoogleTestFlagsOnlyImpl(argc, argv); - // Fix the value of *_NSGetArgc() on macOS, but iff + // Fix the value of *_NSGetArgc() on macOS, but if and only if // *_NSGetArgv() == argv // Only applicable to char** version of argv #if GTEST_OS_MAC @@ -5963,6 +6117,22 @@ void InitGoogleTest(int* argc, wchar_t** argv) { #endif // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_) } +// This overloaded version can be used on Arduino/embedded platforms where +// there is no argc/argv. +void InitGoogleTest() { + // Since Arduino doesn't have a command line, fake out the argc/argv arguments + int argc = 1; + const auto arg0 = "dummy"; + char* argv0 = const_cast(arg0); + char** argv = &argv0; + +#if defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_) + GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_(&argc, argv); +#else // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_) + internal::InitGoogleTestImpl(&argc, argv); +#endif // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_) +} + std::string TempDir() { #if defined(GTEST_CUSTOM_TEMPDIR_FUNCTION_) return GTEST_CUSTOM_TEMPDIR_FUNCTION_(); @@ -5972,7 +6142,7 @@ std::string TempDir() { return "\\temp\\"; #elif GTEST_OS_WINDOWS const char* temp_dir = internal::posix::GetEnv("TEMP"); - if (temp_dir == NULL || temp_dir[0] == '\0') + if (temp_dir == nullptr || temp_dir[0] == '\0') return "\\temp\\"; else if (temp_dir[strlen(temp_dir) - 1] == '\\') return temp_dir; diff --git a/third_party/googletest/src/src/gtest_main.cc b/third_party/googletest/src/src/gtest_main.cc index 2113f621e6..f6e1dd96fb 100644 --- a/third_party/googletest/src/src/gtest_main.cc +++ b/third_party/googletest/src/src/gtest_main.cc @@ -27,11 +27,21 @@ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -#include +#include #include "gtest/gtest.h" +#ifdef ARDUINO +void setup() { + testing::InitGoogleTest(); +} + +void loop() { RUN_ALL_TESTS(); } + +#else + GTEST_API_ int main(int argc, char **argv) { printf("Running main() from %s\n", __FILE__); testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); } +#endif From 769129fb29fc66720be2b01276a472c334757d2d Mon Sep 17 00:00:00 2001 From: Marco Paniconi Date: Thu, 18 Jun 2020 11:24:28 -0700 Subject: [PATCH 0010/1000] vp9-rtc: Fix to resetting drop_spatial_layer The reset happens on the base spatial layer, before encoding. But it should be reset on the first_spatial_layer_to_encode, which may not be 0. Change-Id: I38ef686b4459ca7433062adbfe32ef2134e1ad60 --- vp9/encoder/vp9_svc_layercontext.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c index 6f23d0b4d2..27dc8ec50f 100644 --- a/vp9/encoder/vp9_svc_layercontext.c +++ b/vp9/encoder/vp9_svc_layercontext.c @@ -864,8 +864,9 @@ int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) { } } - // Reset the drop flags for all spatial layers, on the base layer. - if (svc->spatial_layer_id == 0) { + // Reset the drop flags for all spatial layers, on the + // first_spatial_layer_to_encode. + if (svc->spatial_layer_id == svc->first_spatial_layer_to_encode) { vp9_zero(svc->drop_spatial_layer); // TODO(jianj/marpan): Investigate why setting svc->lst/gld/alt_fb_idx // causes an issue with frame dropping and temporal layers, when the frame From 1c9fd977aa5e8a2f115554a15e4a5e804e638ccb Mon Sep 17 00:00:00 2001 From: James Zern Date: Thu, 18 Jun 2020 18:34:55 -0700 Subject: [PATCH 0011/1000] tools/lint-hunks.py: skip third_party files Change-Id: I2fda3119c08b5755f1a9b2fad1125090b0d86850 --- tools/lint-hunks.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/lint-hunks.py b/tools/lint-hunks.py index 6e25d93624..30d3249193 100755 --- a/tools/lint-hunks.py +++ b/tools/lint-hunks.py @@ -106,6 +106,8 @@ def main(argv=None): for filename, affected_lines in file_affected_line_map.iteritems(): if filename.split(".")[-1] not in ("c", "h", "cc"): continue + if filename.startswith("third_party"): + continue if args: # File contents come from git From 3f18b08397674e4fe5b327c735f4945cb1892c41 Mon Sep 17 00:00:00 2001 From: Marco Paniconi Date: Tue, 23 Jun 2020 19:20:01 -0700 Subject: [PATCH 0012/1000] vp9-svc: Allow scale_references for single layer svc This is needed to allow for newmv search in nonrd_pickmode for resize/scaled frame, and for int_pro_motion_estimation on resized/scaled frame. Change-Id: I5e2fdbc4706a10813c1b00f6194e2442f648905a --- vp9/encoder/vp9_encoder.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index ae7b47b523..2780c73843 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -4040,8 +4040,11 @@ static int encode_without_recode_loop(VP9_COMP *cpi, size_t *size, // For 1 pass CBR SVC, only ZEROMV is allowed for spatial reference frame // when svc->force_zero_mode_spatial_ref = 1. Under those conditions we can // avoid this frame-level upsampling (for non intra_only frames). + // For SVC single_layer mode, dynamic resize is allowed and we need to + // scale references for this case. if (frame_is_intra_only(cm) == 0 && - !(is_one_pass_cbr_svc(cpi) && svc->force_zero_mode_spatial_ref)) { + ((svc->single_layer_svc && cpi->oxcf.resize_mode == RESIZE_DYNAMIC) || + !(is_one_pass_cbr_svc(cpi) && svc->force_zero_mode_spatial_ref))) { vp9_scale_references(cpi); } From de4aedaec33c6c29f882f99a740713596713a1f9 Mon Sep 17 00:00:00 2001 From: Marco Paniconi Date: Fri, 26 Jun 2020 15:34:35 -0700 Subject: [PATCH 0013/1000] vp9-svc: Fix to setting frame size for dynamic resize For svc with dynamic resize (only for single_layer_svc mode), add flag to indicate resized width/height has already been set, otherwise on the resized/trigger frame (resize_pending=1), the wrong resolution may be set if oxcf->width/height is different than layer width/height in single_layer_svc mode. Change-Id: I24403ee93fc96b830a9bf7c66d763a48762cdcb4 --- vp9/encoder/vp9_encoder.c | 18 +++++++++++------- vp9/encoder/vp9_ratectrl.c | 2 ++ vp9/encoder/vp9_svc_layercontext.c | 1 + vp9/encoder/vp9_svc_layercontext.h | 1 + 4 files changed, 15 insertions(+), 7 deletions(-) diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 2780c73843..06dadc42a6 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -3749,13 +3749,17 @@ static void set_frame_size(VP9_COMP *cpi) { if (oxcf->pass == 0 && oxcf->rc_mode == VPX_CBR && oxcf->resize_mode == RESIZE_DYNAMIC && cpi->resize_pending != 0) { - oxcf->scaled_frame_width = - (oxcf->width * cpi->resize_scale_num) / cpi->resize_scale_den; - oxcf->scaled_frame_height = - (oxcf->height * cpi->resize_scale_num) / cpi->resize_scale_den; - // There has been a change in frame size. - vp9_set_size_literal(cpi, oxcf->scaled_frame_width, - oxcf->scaled_frame_height); + // For SVC scaled width/height will have been set (svc->resize_set=1) + // in get_svc_params based on the layer width/height. + if (!cpi->use_svc || !cpi->svc.resize_set) { + oxcf->scaled_frame_width = + (oxcf->width * cpi->resize_scale_num) / cpi->resize_scale_den; + oxcf->scaled_frame_height = + (oxcf->height * cpi->resize_scale_num) / cpi->resize_scale_den; + // There has been a change in frame size. + vp9_set_size_literal(cpi, oxcf->scaled_frame_width, + oxcf->scaled_frame_height); + } // TODO(agrange) Scale cpi->max_mv_magnitude if frame-size has changed. set_mv_search_params(cpi); diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index 34dd618acf..3beb9f41f9 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -2405,9 +2405,11 @@ void vp9_rc_get_svc_params(VP9_COMP *cpi) { lc->scaling_factor_num_resize, lc->scaling_factor_den_resize, &width, &height); vp9_set_size_literal(cpi, width, height); + svc->resize_set = 1; } } else { cpi->resize_pending = 0; + svc->resize_set = 0; } } diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c index 27dc8ec50f..d85b3632cd 100644 --- a/vp9/encoder/vp9_svc_layercontext.c +++ b/vp9/encoder/vp9_svc_layercontext.c @@ -56,6 +56,7 @@ void vp9_init_layer_context(VP9_COMP *const cpi) { svc->num_encoded_top_layer = 0; svc->simulcast_mode = 0; svc->single_layer_svc = 0; + svc->resize_set = 0; for (i = 0; i < REF_FRAMES; ++i) { svc->fb_idx_spatial_layer_id[i] = 0xff; diff --git a/vp9/encoder/vp9_svc_layercontext.h b/vp9/encoder/vp9_svc_layercontext.h index 7e46500b58..e7d9712aae 100644 --- a/vp9/encoder/vp9_svc_layercontext.h +++ b/vp9/encoder/vp9_svc_layercontext.h @@ -198,6 +198,7 @@ typedef struct SVC { // Flag to indicate SVC is dynamically switched to a single layer. int single_layer_svc; + int resize_set; } SVC; struct VP9_COMP; From c039b5442b342cf0d99ce3cfaf596fd1c26a7208 Mon Sep 17 00:00:00 2001 From: jinbo Date: Wed, 24 Jun 2020 17:10:20 +0800 Subject: [PATCH 0014/1000] vp8,vpx_dsp:[loongson] fix specification of instruction name 1.'xor,or,and' to 'pxor,por,pand'. In the case of operating FPR, gcc supports both of them, clang only supports the second type. 2.'dsrl,srl' to 'ssrld,ssrlw'. In the case of operating FPR, gcc supports both of them, clang only supports the second type. Change-Id: I93b47348e7c6580d99f57dc11165b4645236533c --- vp8/common/mips/mmi/dequantize_mmi.c | 2 +- vp8/common/mips/mmi/idctllm_mmi.c | 16 +- vp8/common/mips/mmi/loopfilter_filters_mmi.c | 250 +++++++++---------- vp8/common/mips/mmi/sixtap_filter_mmi.c | 12 +- vp8/encoder/mips/mmi/dct_mmi.c | 28 +-- vp8/encoder/mips/mmi/vp8_quantize_mmi.c | 38 +-- vpx_dsp/mips/sad_mmi.c | 20 +- vpx_dsp/mips/subtract_mmi.c | 6 +- vpx_dsp/mips/variance_mmi.c | 102 ++++---- vpx_dsp/mips/vpx_convolve8_mmi.c | 10 +- vpx_ports/asmdefs_mmi.h | 4 +- 11 files changed, 244 insertions(+), 244 deletions(-) diff --git a/vp8/common/mips/mmi/dequantize_mmi.c b/vp8/common/mips/mmi/dequantize_mmi.c index b3f8084aee..b9330a6663 100644 --- a/vp8/common/mips/mmi/dequantize_mmi.c +++ b/vp8/common/mips/mmi/dequantize_mmi.c @@ -100,7 +100,7 @@ void vp8_dequant_idct_add_mmi(int16_t *input, int16_t *dq, unsigned char *dest, vp8_short_idct4x4llm_mmi(input, dest, stride, dest, stride); __asm__ volatile( - "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" "gssdlc1 %[ftmp0], 0x07(%[input]) \n\t" "gssdrc1 %[ftmp0], 0x00(%[input]) \n\t" "sdl $0, 0x0f(%[input]) \n\t" diff --git a/vp8/common/mips/mmi/idctllm_mmi.c b/vp8/common/mips/mmi/idctllm_mmi.c index 5e48f59166..4fad1d347f 100644 --- a/vp8/common/mips/mmi/idctllm_mmi.c +++ b/vp8/common/mips/mmi/idctllm_mmi.c @@ -13,25 +13,25 @@ #include "vpx_ports/asmdefs_mmi.h" #define TRANSPOSE_4H \ - "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" \ + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" \ MMI_LI(%[tmp0], 0x93) \ "mtc1 %[tmp0], %[ftmp10] \n\t" \ "punpcklhw %[ftmp5], %[ftmp1], %[ftmp0] \n\t" \ "punpcklhw %[ftmp9], %[ftmp2], %[ftmp0] \n\t" \ "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \ - "or %[ftmp5], %[ftmp5], %[ftmp9] \n\t" \ + "por %[ftmp5], %[ftmp5], %[ftmp9] \n\t" \ "punpckhhw %[ftmp6], %[ftmp1], %[ftmp0] \n\t" \ "punpckhhw %[ftmp9], %[ftmp2], %[ftmp0] \n\t" \ "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \ - "or %[ftmp6], %[ftmp6], %[ftmp9] \n\t" \ + "por %[ftmp6], %[ftmp6], %[ftmp9] \n\t" \ "punpcklhw %[ftmp7], %[ftmp3], %[ftmp0] \n\t" \ "punpcklhw %[ftmp9], %[ftmp4], %[ftmp0] \n\t" \ "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \ - "or %[ftmp7], %[ftmp7], %[ftmp9] \n\t" \ + "por %[ftmp7], %[ftmp7], %[ftmp9] \n\t" \ "punpckhhw %[ftmp8], %[ftmp3], %[ftmp0] \n\t" \ "punpckhhw %[ftmp9], %[ftmp4], %[ftmp0] \n\t" \ "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \ - "or %[ftmp8], %[ftmp8], %[ftmp9] \n\t" \ + "por %[ftmp8], %[ftmp8], %[ftmp9] \n\t" \ "punpcklwd %[ftmp1], %[ftmp5], %[ftmp7] \n\t" \ "punpckhwd %[ftmp2], %[ftmp5], %[ftmp7] \n\t" \ "punpcklwd %[ftmp3], %[ftmp6], %[ftmp8] \n\t" \ @@ -49,7 +49,7 @@ void vp8_short_idct4x4llm_mmi(int16_t *input, unsigned char *pred_ptr, __asm__ volatile ( MMI_LI(%[tmp0], 0x02) "mtc1 %[tmp0], %[ftmp11] \n\t" - "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" "gsldlc1 %[ftmp1], 0x07(%[ip]) \n\t" "gsldrc1 %[ftmp1], 0x00(%[ip]) \n\t" @@ -203,7 +203,7 @@ void vp8_dc_only_idct_add_mmi(int16_t input_dc, unsigned char *pred_ptr, int low32; __asm__ volatile ( - "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" "pshufh %[a1], %[a1], %[ftmp0] \n\t" "ulw %[low32], 0x00(%[pred_ptr]) \n\t" "mtc1 %[low32], %[ftmp1] \n\t" @@ -260,7 +260,7 @@ void vp8_short_inv_walsh4x4_mmi(int16_t *input, int16_t *mb_dqcoeff) { __asm__ volatile ( MMI_LI(%[tmp0], 0x03) - "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" "mtc1 %[tmp0], %[ftmp11] \n\t" "gsldlc1 %[ftmp1], 0x07(%[ip]) \n\t" "gsldrc1 %[ftmp1], 0x00(%[ip]) \n\t" diff --git a/vp8/common/mips/mmi/loopfilter_filters_mmi.c b/vp8/common/mips/mmi/loopfilter_filters_mmi.c index f2182f95c9..fc1240cc27 100644 --- a/vp8/common/mips/mmi/loopfilter_filters_mmi.c +++ b/vp8/common/mips/mmi/loopfilter_filters_mmi.c @@ -56,14 +56,14 @@ void vp8_loop_filter_horizontal_edge_mmi( "gsldrc1 %[ftmp4], 0x00(%[addr1]) \n\t" "pasubub %[ftmp1], %[ftmp3], %[ftmp4] \n\t" "psubusb %[ftmp1], %[ftmp1], %[ftmp10] \n\t" - "or %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t" MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step]) "gsldlc1 %[ftmp5], 0x07(%[addr1]) \n\t" "gsldrc1 %[ftmp5], 0x00(%[addr1]) \n\t" "pasubub %[ftmp9], %[ftmp4], %[ftmp5] \n\t" "psubusb %[ftmp1], %[ftmp9], %[ftmp10] \n\t" - "or %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t" "gsldlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t" "gsldrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t" @@ -72,21 +72,21 @@ void vp8_loop_filter_horizontal_edge_mmi( "gsldrc1 %[ftmp7], 0x00(%[addr0]) \n\t" "pasubub %[ftmp11], %[ftmp7], %[ftmp6] \n\t" "psubusb %[ftmp1], %[ftmp11], %[ftmp10] \n\t" - "or %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t" MMI_ADDU(%[addr1], %[src_ptr], %[src_pixel_step_x2]) "gsldlc1 %[ftmp8], 0x07(%[addr1]) \n\t" "gsldrc1 %[ftmp8], 0x00(%[addr1]) \n\t" "pasubub %[ftmp1], %[ftmp8], %[ftmp7] \n\t" "psubusb %[ftmp1], %[ftmp1], %[ftmp10] \n\t" - "or %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t" MMI_ADDU(%[addr1], %[addr0], %[src_pixel_step_x2]) "gsldlc1 %[ftmp2], 0x07(%[addr1]) \n\t" "gsldrc1 %[ftmp2], 0x00(%[addr1]) \n\t" "pasubub %[ftmp1], %[ftmp2], %[ftmp8] \n\t" "psubusb %[ftmp1], %[ftmp1], %[ftmp10] \n\t" - "or %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t" "pasubub %[ftmp1], %[ftmp5], %[ftmp6] \n\t" "paddusb %[ftmp1], %[ftmp1], %[ftmp1] \n\t" @@ -99,8 +99,8 @@ void vp8_loop_filter_horizontal_edge_mmi( "gsldlc1 %[ftmp10], 0x07(%[blimit]) \n\t" "gsldrc1 %[ftmp10], 0x00(%[blimit]) \n\t" "psubusb %[ftmp1], %[ftmp1], %[ftmp10] \n\t" - "or %[ftmp0], %[ftmp0], %[ftmp1] \n\t" - "xor %[ftmp10], %[ftmp10], %[ftmp10] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "pxor %[ftmp10], %[ftmp10], %[ftmp10] \n\t" "pcmpeqb %[ftmp0], %[ftmp0], %[ftmp10] \n\t" "gsldlc1 %[ftmp10], 0x07(%[thresh]) \n\t" @@ -108,29 +108,29 @@ void vp8_loop_filter_horizontal_edge_mmi( "psubusb %[ftmp1], %[ftmp9], %[ftmp10] \n\t" "psubusb %[ftmp2], %[ftmp11], %[ftmp10] \n\t" "paddb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" - "xor %[ftmp2], %[ftmp2], %[ftmp2] \n\t" + "pxor %[ftmp2], %[ftmp2], %[ftmp2] \n\t" "pcmpeqb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" "pcmpeqb %[ftmp2], %[ftmp2], %[ftmp2] \n\t" - "xor %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + "pxor %[ftmp1], %[ftmp1], %[ftmp2] \n\t" - "xor %[ftmp4], %[ftmp4], %[ff_pb_80] \n\t" - "xor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t" - "xor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" - "xor %[ftmp7], %[ftmp7], %[ff_pb_80] \n\t" + "pxor %[ftmp4], %[ftmp4], %[ff_pb_80] \n\t" + "pxor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t" + "pxor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" + "pxor %[ftmp7], %[ftmp7], %[ff_pb_80] \n\t" "psubsb %[ftmp2], %[ftmp4], %[ftmp7] \n\t" - "and %[ftmp2], %[ftmp2], %[ftmp1] \n\t" + "pand %[ftmp2], %[ftmp2], %[ftmp1] \n\t" "psubsb %[ftmp3], %[ftmp6], %[ftmp5] \n\t" "paddsb %[ftmp2], %[ftmp2], %[ftmp3] \n\t" "paddsb %[ftmp2], %[ftmp2], %[ftmp3] \n\t" "paddsb %[ftmp2], %[ftmp2], %[ftmp3] \n\t" - "and %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + "pand %[ftmp2], %[ftmp2], %[ftmp0] \n\t" "paddsb %[ftmp8], %[ftmp2], %[ff_pb_03] \n\t" "paddsb %[ftmp9], %[ftmp2], %[ff_pb_04] \n\t" - "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" - "xor %[ftmp11], %[ftmp11], %[ftmp11] \n\t" + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "pxor %[ftmp11], %[ftmp11], %[ftmp11] \n\t" "punpcklbh %[ftmp0], %[ftmp0], %[ftmp8] \n\t" "punpckhbh %[ftmp11], %[ftmp11], %[ftmp8] \n\t" @@ -139,10 +139,10 @@ void vp8_loop_filter_horizontal_edge_mmi( "psrah %[ftmp0], %[ftmp0], %[ftmp10] \n\t" "psrah %[ftmp11], %[ftmp11], %[ftmp10] \n\t" "packsshb %[ftmp8], %[ftmp0], %[ftmp11] \n\t" - "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" "punpcklbh %[ftmp0], %[ftmp0], %[ftmp9] \n\t" "psrah %[ftmp0], %[ftmp0], %[ftmp10] \n\t" - "xor %[ftmp11], %[ftmp11], %[ftmp11] \n\t" + "pxor %[ftmp11], %[ftmp11], %[ftmp11] \n\t" "punpckhbh %[ftmp9], %[ftmp11], %[ftmp9] \n\t" "psrah %[ftmp9], %[ftmp9], %[ftmp10] \n\t" "paddsh %[ftmp11], %[ftmp0], %[ff_ph_01] \n\t" @@ -156,24 +156,24 @@ void vp8_loop_filter_horizontal_edge_mmi( "packsshb %[ftmp11], %[ftmp11], %[ftmp9] \n\t" "pandn %[ftmp1], %[ftmp1], %[ftmp11] \n\t" "paddsb %[ftmp5], %[ftmp5], %[ftmp8] \n\t" - "xor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t" + "pxor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t" MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step]) "gssdlc1 %[ftmp5], 0x07(%[addr1]) \n\t" "gssdrc1 %[ftmp5], 0x00(%[addr1]) \n\t" MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x2]) "paddsb %[ftmp4], %[ftmp4], %[ftmp1] \n\t" - "xor %[ftmp4], %[ftmp4], %[ff_pb_80] \n\t" + "pxor %[ftmp4], %[ftmp4], %[ff_pb_80] \n\t" "gssdlc1 %[ftmp4], 0x07(%[addr1]) \n\t" "gssdrc1 %[ftmp4], 0x00(%[addr1]) \n\t" "psubsb %[ftmp6], %[ftmp6], %[ftmp0] \n\t" - "xor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" + "pxor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" "gssdlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t" "gssdrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t" "psubsb %[ftmp7], %[ftmp7], %[ftmp1] \n\t" - "xor %[ftmp7], %[ftmp7], %[ff_pb_80] \n\t" + "pxor %[ftmp7], %[ftmp7], %[ff_pb_80] \n\t" "gssdlc1 %[ftmp7], 0x07(%[addr0]) \n\t" "gssdrc1 %[ftmp7], 0x00(%[addr0]) \n\t" @@ -288,23 +288,23 @@ void vp8_loop_filter_vertical_edge_mmi(unsigned char *src_ptr, /* abs (q2-q1) */ "pasubub %[ftmp7], %[ftmp11], %[ftmp10] \n\t" "psubusb %[ftmp7], %[ftmp7], %[ftmp8] \n\t" - "or %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp7] \n\t" /* ftmp3: abs(q1-q0) */ "pasubub %[ftmp3], %[ftmp10], %[ftmp9] \n\t" "psubusb %[ftmp7], %[ftmp3], %[ftmp8] \n\t" - "or %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp7] \n\t" /* ftmp4: abs(p1-p0) */ "pasubub %[ftmp4], %[ftmp5], %[ftmp6] \n\t" "psubusb %[ftmp7], %[ftmp4], %[ftmp8] \n\t" - "or %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp7] \n\t" /* abs (p2-p1) */ "pasubub %[ftmp7], %[ftmp2], %[ftmp5] \n\t" "psubusb %[ftmp7], %[ftmp7], %[ftmp8] \n\t" - "or %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp7] \n\t" /* abs (p3-p2) */ "pasubub %[ftmp7], %[ftmp1], %[ftmp2] \n\t" "psubusb %[ftmp7], %[ftmp7], %[ftmp8] \n\t" - "or %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp7] \n\t" "gsldlc1 %[ftmp8], 0x07(%[blimit]) \n\t" "gsldrc1 %[ftmp8], 0x00(%[blimit]) \n\t" @@ -314,14 +314,14 @@ void vp8_loop_filter_vertical_edge_mmi(unsigned char *src_ptr, "paddusb %[ftmp11], %[ftmp11], %[ftmp11] \n\t" /* abs (p1-q1) */ "pasubub %[ftmp12], %[ftmp10], %[ftmp5] \n\t" - "and %[ftmp12], %[ftmp12], %[ff_pb_fe] \n\t" + "pand %[ftmp12], %[ftmp12], %[ff_pb_fe] \n\t" "li %[tmp0], 0x01 \n\t" "mtc1 %[tmp0], %[ftmp1] \n\t" "psrlh %[ftmp12], %[ftmp12], %[ftmp1] \n\t" "paddusb %[ftmp1], %[ftmp11], %[ftmp12] \n\t" "psubusb %[ftmp1], %[ftmp1], %[ftmp8] \n\t" - "or %[ftmp0], %[ftmp0], %[ftmp1] \n\t" - "xor %[ftmp1], %[ftmp1], %[ftmp1] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "pxor %[ftmp1], %[ftmp1], %[ftmp1] \n\t" /* ftmp0:mask */ "pcmpeqb %[ftmp0], %[ftmp0], %[ftmp1] \n\t" @@ -331,41 +331,41 @@ void vp8_loop_filter_vertical_edge_mmi(unsigned char *src_ptr, /* ftmp3: abs(q1-q0) ftmp4: abs(p1-p0) */ "psubusb %[ftmp4], %[ftmp4], %[ftmp8] \n\t" "psubusb %[ftmp3], %[ftmp3], %[ftmp8] \n\t" - "or %[ftmp2], %[ftmp4], %[ftmp3] \n\t" + "por %[ftmp2], %[ftmp4], %[ftmp3] \n\t" "pcmpeqb %[ftmp2], %[ftmp2], %[ftmp1] \n\t" "pcmpeqb %[ftmp1], %[ftmp1], %[ftmp1] \n\t" /* ftmp1:hev */ - "xor %[ftmp1], %[ftmp2], %[ftmp1] \n\t" + "pxor %[ftmp1], %[ftmp2], %[ftmp1] \n\t" - "xor %[ftmp10], %[ftmp10], %[ff_pb_80] \n\t" - "xor %[ftmp9], %[ftmp9], %[ff_pb_80] \n\t" - "xor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" - "xor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t" + "pxor %[ftmp10], %[ftmp10], %[ff_pb_80] \n\t" + "pxor %[ftmp9], %[ftmp9], %[ff_pb_80] \n\t" + "pxor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" + "pxor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t" "psubsb %[ftmp2], %[ftmp5], %[ftmp10] \n\t" - "and %[ftmp2], %[ftmp2], %[ftmp1] \n\t" + "pand %[ftmp2], %[ftmp2], %[ftmp1] \n\t" "psubsb %[ftmp3], %[ftmp9], %[ftmp6] \n\t" "paddsb %[ftmp2], %[ftmp2], %[ftmp3] \n\t" "paddsb %[ftmp2], %[ftmp2], %[ftmp3] \n\t" "paddsb %[ftmp2], %[ftmp2], %[ftmp3] \n\t" /* ftmp2:filter_value */ - "and %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + "pand %[ftmp2], %[ftmp2], %[ftmp0] \n\t" "paddsb %[ftmp11], %[ftmp2], %[ff_pb_04] \n\t" "paddsb %[ftmp12], %[ftmp2], %[ff_pb_03] \n\t" "li %[tmp0], 0x0b \n\t" "mtc1 %[tmp0], %[ftmp7] \n\t" - "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" - "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "pxor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" "punpcklbh %[ftmp0], %[ftmp0], %[ftmp12] \n\t" "punpckhbh %[ftmp8], %[ftmp8], %[ftmp12] \n\t" "psrah %[ftmp0], %[ftmp0], %[ftmp7] \n\t" "psrah %[ftmp8], %[ftmp8], %[ftmp7] \n\t" "packsshb %[ftmp12], %[ftmp0], %[ftmp8] \n\t" - "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" - "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "pxor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" "punpcklbh %[ftmp0], %[ftmp0], %[ftmp11] \n\t" "punpckhbh %[ftmp8], %[ftmp8], %[ftmp11] \n\t" "psrah %[ftmp0], %[ftmp0], %[ftmp7] \n\t" @@ -373,9 +373,9 @@ void vp8_loop_filter_vertical_edge_mmi(unsigned char *src_ptr, "packsshb %[ftmp11], %[ftmp0], %[ftmp8] \n\t" "psubsb %[ftmp9], %[ftmp9], %[ftmp11] \n\t" - "xor %[ftmp9], %[ftmp9], %[ff_pb_80] \n\t" + "pxor %[ftmp9], %[ftmp9], %[ff_pb_80] \n\t" "paddsb %[ftmp6], %[ftmp6], %[ftmp12] \n\t" - "xor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" + "pxor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" "paddsh %[ftmp0], %[ftmp0], %[ff_ph_01] \n\t" "paddsh %[ftmp8], %[ftmp8], %[ff_ph_01] \n\t" @@ -386,9 +386,9 @@ void vp8_loop_filter_vertical_edge_mmi(unsigned char *src_ptr, "packsshb %[ftmp2], %[ftmp0], %[ftmp8] \n\t" "pandn %[ftmp2], %[ftmp1], %[ftmp2] \n\t" "psubsb %[ftmp10], %[ftmp10], %[ftmp2] \n\t" - "xor %[ftmp10], %[ftmp10], %[ff_pb_80] \n\t" + "pxor %[ftmp10], %[ftmp10], %[ff_pb_80] \n\t" "paddsb %[ftmp5], %[ftmp5], %[ftmp2] \n\t" - "xor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t" + "pxor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t" /* ftmp5: *op1 ; ftmp6: *op0 */ "punpcklbh %[ftmp2], %[ftmp5], %[ftmp6] \n\t" @@ -408,7 +408,7 @@ void vp8_loop_filter_vertical_edge_mmi(unsigned char *src_ptr, "li %[tmp0], 0x20 \n\t" "mtc1 %[tmp0], %[ftmp9] \n\t" - "dsrl %[ftmp2], %[ftmp2], %[ftmp9] \n\t" + "ssrld %[ftmp2], %[ftmp2], %[ftmp9] \n\t" MMI_SLL(%[tmp0], %[src_pixel_step], 0x02) MMI_SUBU(%[addr1], %[addr0], %[tmp0]) "gsswlc1 %[ftmp2], 0x05(%[addr1]) \n\t" @@ -419,21 +419,21 @@ void vp8_loop_filter_vertical_edge_mmi(unsigned char *src_ptr, "gsswlc1 %[ftmp6], 0x05(%[addr1]) \n\t" "gsswrc1 %[ftmp6], 0x02(%[addr1]) \n\t" - "dsrl %[ftmp6], %[ftmp6], %[ftmp9] \n\t" + "ssrld %[ftmp6], %[ftmp6], %[ftmp9] \n\t" MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step]) "gsswlc1 %[ftmp6], 0x05(%[addr1]) \n\t" "gsswrc1 %[ftmp6], 0x02(%[addr1]) \n\t" "gsswlc1 %[ftmp1], 0x05(%[src_ptr]) \n\t" "gsswrc1 %[ftmp1], 0x02(%[src_ptr]) \n\t" - "dsrl %[ftmp1], %[ftmp1], %[ftmp9] \n\t" + "ssrld %[ftmp1], %[ftmp1], %[ftmp9] \n\t" "gsswlc1 %[ftmp1], 0x05(%[addr0]) \n\t" "gsswrc1 %[ftmp1], 0x02(%[addr0]) \n\t" MMI_ADDU(%[addr1], %[addr0], %[src_pixel_step]) "gsswlc1 %[ftmp5], 0x05(%[addr1]) \n\t" "gsswrc1 %[ftmp5], 0x02(%[addr1]) \n\t" - "dsrl %[ftmp5], %[ftmp5], %[ftmp9] \n\t" + "ssrld %[ftmp5], %[ftmp5], %[ftmp9] \n\t" MMI_ADDU(%[addr1], %[addr0], %[tmp0]) "gsswlc1 %[ftmp5], 0x05(%[addr1]) \n\t" "gsswrc1 %[ftmp5], 0x02(%[addr1]) \n\t" @@ -532,31 +532,31 @@ void vp8_mbloop_filter_horizontal_edge_mmi( "psubusb %[ftmp0], %[ftmp0], %[ftmp9] \n\t" "pasubub %[ftmp1], %[ftmp3], %[ftmp4] \n\t" "psubusb %[ftmp1], %[ftmp1], %[ftmp9] \n\t" - "or %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t" "pasubub %[ftmp10], %[ftmp4], %[ftmp5] \n\t" "psubusb %[ftmp1], %[ftmp10], %[ftmp9] \n\t" - "or %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t" "pasubub %[ftmp11], %[ftmp7], %[ftmp6] \n\t" "psubusb %[ftmp1], %[ftmp11], %[ftmp9] \n\t" - "or %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t" "pasubub %[ftmp1], %[ftmp8], %[ftmp7] \n\t" "psubusb %[ftmp1], %[ftmp1], %[ftmp9] \n\t" - "or %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t" "pasubub %[ftmp1], %[ftmp2], %[ftmp8] \n\t" "psubusb %[ftmp1], %[ftmp1], %[ftmp9] \n\t" - "or %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t" "pasubub %[ftmp1], %[ftmp5], %[ftmp6] \n\t" "paddusb %[ftmp1], %[ftmp1], %[ftmp1] \n\t" "pasubub %[ftmp2], %[ftmp4], %[ftmp7] \n\t" - "and %[ftmp2], %[ftmp2], %[ff_pb_fe] \n\t" + "pand %[ftmp2], %[ftmp2], %[ff_pb_fe] \n\t" "li %[tmp0], 0x01 \n\t" "mtc1 %[tmp0], %[ftmp9] \n\t" "psrlh %[ftmp2], %[ftmp2], %[ftmp9] \n\t" "paddusb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" "psubusb %[ftmp1], %[ftmp1], %[ftmp12] \n\t" - "or %[ftmp0], %[ftmp0], %[ftmp1] \n\t" - "xor %[ftmp9], %[ftmp9], %[ftmp9] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "pxor %[ftmp9], %[ftmp9], %[ftmp9] \n\t" /* ftmp0: mask */ "pcmpeqb %[ftmp0], %[ftmp0], %[ftmp9] \n\t" @@ -565,24 +565,24 @@ void vp8_mbloop_filter_horizontal_edge_mmi( "psubusb %[ftmp1], %[ftmp10], %[ftmp9] \n\t" "psubusb %[ftmp2], %[ftmp11], %[ftmp9] \n\t" "paddb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" - "xor %[ftmp2], %[ftmp2], %[ftmp2] \n\t" + "pxor %[ftmp2], %[ftmp2], %[ftmp2] \n\t" "pcmpeqb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" "pcmpeqb %[ftmp2], %[ftmp2], %[ftmp2] \n\t" /* ftmp1: hev */ - "xor %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + "pxor %[ftmp1], %[ftmp1], %[ftmp2] \n\t" - "xor %[ftmp4], %[ftmp4], %[ff_pb_80] \n\t" - "xor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t" - "xor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" - "xor %[ftmp7], %[ftmp7], %[ff_pb_80] \n\t" + "pxor %[ftmp4], %[ftmp4], %[ff_pb_80] \n\t" + "pxor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t" + "pxor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" + "pxor %[ftmp7], %[ftmp7], %[ff_pb_80] \n\t" "psubsb %[ftmp2], %[ftmp4], %[ftmp7] \n\t" "psubsb %[ftmp9], %[ftmp6], %[ftmp5] \n\t" "paddsb %[ftmp2], %[ftmp2], %[ftmp9] \n\t" "paddsb %[ftmp2], %[ftmp2], %[ftmp9] \n\t" "paddsb %[ftmp2], %[ftmp2], %[ftmp9] \n\t" - "and %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + "pand %[ftmp2], %[ftmp2], %[ftmp0] \n\t" "pandn %[ftmp12], %[ftmp1], %[ftmp2] \n\t" - "and %[ftmp2], %[ftmp2], %[ftmp1] \n\t" + "pand %[ftmp2], %[ftmp2], %[ftmp1] \n\t" "li %[tmp0], 0x0b \n\t" "mtc1 %[tmp0], %[ftmp9] \n\t" @@ -595,13 +595,13 @@ void vp8_mbloop_filter_horizontal_edge_mmi( "li %[tmp0], 0x07 \n\t" "mtc1 %[tmp0], %[ftmp9] \n\t" - "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" VP8_MBLOOP_HPSRAB_ADD(%[ff_ph_1b00]) "psubsb %[ftmp6], %[ftmp6], %[ftmp1] \n\t" "paddsb %[ftmp5], %[ftmp5], %[ftmp1] \n\t" - "xor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" - "xor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t" + "pxor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" + "pxor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t" MMI_SLL(%[tmp0], %[src_pixel_step], 0x02) MMI_SUBU(%[src_ptr], %[src_ptr], %[tmp0]) "gssdlc1 %[ftmp5], 0x07(%[src_ptr]) \n\t" @@ -613,8 +613,8 @@ void vp8_mbloop_filter_horizontal_edge_mmi( VP8_MBLOOP_HPSRAB_ADD(%[ff_ph_1200]) "paddsb %[ftmp4], %[ftmp4], %[ftmp1] \n\t" "psubsb %[ftmp7], %[ftmp7], %[ftmp1] \n\t" - "xor %[ftmp4], %[ftmp4], %[ff_pb_80] \n\t" - "xor %[ftmp7], %[ftmp7], %[ff_pb_80] \n\t" + "pxor %[ftmp4], %[ftmp4], %[ff_pb_80] \n\t" + "pxor %[ftmp7], %[ftmp7], %[ff_pb_80] \n\t" MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) "gssdlc1 %[ftmp7], 0x07(%[src_ptr]) \n\t" "gssdrc1 %[ftmp7], 0x00(%[src_ptr]) \n\t" @@ -624,12 +624,12 @@ void vp8_mbloop_filter_horizontal_edge_mmi( "gssdrc1 %[ftmp4], 0x00(%[src_ptr]) \n\t" VP8_MBLOOP_HPSRAB_ADD(%[ff_ph_0900]) - "xor %[ftmp3], %[ftmp3], %[ff_pb_80] \n\t" - "xor %[ftmp8], %[ftmp8], %[ff_pb_80] \n\t" + "pxor %[ftmp3], %[ftmp3], %[ff_pb_80] \n\t" + "pxor %[ftmp8], %[ftmp8], %[ff_pb_80] \n\t" "paddsb %[ftmp3], %[ftmp3], %[ftmp1] \n\t" "psubsb %[ftmp8], %[ftmp8], %[ftmp1] \n\t" - "xor %[ftmp3], %[ftmp3], %[ff_pb_80] \n\t" - "xor %[ftmp8], %[ftmp8], %[ff_pb_80] \n\t" + "pxor %[ftmp3], %[ftmp3], %[ff_pb_80] \n\t" + "pxor %[ftmp8], %[ftmp8], %[ff_pb_80] \n\t" MMI_ADDU(%[src_ptr], %[src_ptr], %[tmp0]) "gssdlc1 %[ftmp8], 0x07(%[src_ptr]) \n\t" "gssdrc1 %[ftmp8], 0x00(%[src_ptr]) \n\t" @@ -662,8 +662,8 @@ void vp8_mbloop_filter_horizontal_edge_mmi( } #define VP8_MBLOOP_VPSRAB_ADDH \ - "xor %[ftmp7], %[ftmp7], %[ftmp7] \n\t" \ - "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" \ + "pxor %[ftmp7], %[ftmp7], %[ftmp7] \n\t" \ + "pxor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" \ "punpcklbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t" \ "punpckhbh %[ftmp8], %[ftmp8], %[ftmp0] \n\t" @@ -755,23 +755,23 @@ void vp8_mbloop_filter_vertical_edge_mmi( /* abs (q2-q1) */ "pasubub %[ftmp7], %[ftmp11], %[ftmp10] \n\t" "psubusb %[ftmp7], %[ftmp7], %[ftmp13] \n\t" - "or %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp7] \n\t" /* ftmp3: abs(q1-q0) */ "pasubub %[ftmp3], %[ftmp10], %[ftmp9] \n\t" "psubusb %[ftmp7], %[ftmp3], %[ftmp13] \n\t" - "or %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp7] \n\t" /* ftmp4: abs(p1-p0) */ "pasubub %[ftmp4], %[ftmp5], %[ftmp6] \n\t" "psubusb %[ftmp7], %[ftmp4], %[ftmp13] \n\t" - "or %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp7] \n\t" /* abs (p2-p1) */ "pasubub %[ftmp7], %[ftmp2], %[ftmp5] \n\t" "psubusb %[ftmp7], %[ftmp7], %[ftmp13] \n\t" - "or %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp7] \n\t" /* abs (p3-p2) */ "pasubub %[ftmp7], %[ftmp1], %[ftmp2] \n\t" "psubusb %[ftmp7], %[ftmp7], %[ftmp13] \n\t" - "or %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp7] \n\t" "gsldlc1 %[ftmp13], 0x07(%[blimit]) \n\t" "gsldrc1 %[ftmp13], 0x00(%[blimit]) \n\t" @@ -782,14 +782,14 @@ void vp8_mbloop_filter_vertical_edge_mmi( "paddusb %[ftmp1], %[ftmp1], %[ftmp1] \n\t" /* abs (p1-q1) / 2 */ "pasubub %[ftmp12], %[ftmp10], %[ftmp5] \n\t" - "and %[ftmp12], %[ftmp12], %[ff_pb_fe] \n\t" + "pand %[ftmp12], %[ftmp12], %[ff_pb_fe] \n\t" "li %[tmp0], 0x01 \n\t" "mtc1 %[tmp0], %[ftmp8] \n\t" "psrlh %[ftmp12], %[ftmp12], %[ftmp8] \n\t" "paddusb %[ftmp12], %[ftmp1], %[ftmp12] \n\t" "psubusb %[ftmp12], %[ftmp12], %[ftmp13] \n\t" - "or %[ftmp0], %[ftmp0], %[ftmp12] \n\t" - "xor %[ftmp12], %[ftmp12], %[ftmp12] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp12] \n\t" + "pxor %[ftmp12], %[ftmp12], %[ftmp12] \n\t" /* ftmp0: mask */ "pcmpeqb %[ftmp0], %[ftmp0], %[ftmp12] \n\t" @@ -797,19 +797,19 @@ void vp8_mbloop_filter_vertical_edge_mmi( "psubusb %[ftmp4], %[ftmp4], %[ftmp7] \n\t" /* abs(q1-q0) - thresh */ "psubusb %[ftmp3], %[ftmp3], %[ftmp7] \n\t" - "or %[ftmp3], %[ftmp4], %[ftmp3] \n\t" + "por %[ftmp3], %[ftmp4], %[ftmp3] \n\t" "pcmpeqb %[ftmp3], %[ftmp3], %[ftmp12] \n\t" "pcmpeqb %[ftmp1], %[ftmp1], %[ftmp1] \n\t" /* ftmp1: hev */ - "xor %[ftmp1], %[ftmp3], %[ftmp1] \n\t" + "pxor %[ftmp1], %[ftmp3], %[ftmp1] \n\t" /* ftmp2:ps2, ftmp5:ps1, ftmp6:ps0, ftmp9:qs0, ftmp10:qs1, ftmp11:qs2 */ - "xor %[ftmp11], %[ftmp11], %[ff_pb_80] \n\t" - "xor %[ftmp10], %[ftmp10], %[ff_pb_80] \n\t" - "xor %[ftmp9], %[ftmp9], %[ff_pb_80] \n\t" - "xor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" - "xor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t" - "xor %[ftmp2], %[ftmp2], %[ff_pb_80] \n\t" + "pxor %[ftmp11], %[ftmp11], %[ff_pb_80] \n\t" + "pxor %[ftmp10], %[ftmp10], %[ff_pb_80] \n\t" + "pxor %[ftmp9], %[ftmp9], %[ff_pb_80] \n\t" + "pxor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" + "pxor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t" + "pxor %[ftmp2], %[ftmp2], %[ff_pb_80] \n\t" "psubsb %[ftmp3], %[ftmp5], %[ftmp10] \n\t" "psubsb %[ftmp4], %[ftmp9], %[ftmp6] \n\t" @@ -817,9 +817,9 @@ void vp8_mbloop_filter_vertical_edge_mmi( "paddsb %[ftmp3], %[ftmp3], %[ftmp4] \n\t" "paddsb %[ftmp3], %[ftmp3], %[ftmp4] \n\t" /* filter_value &= mask */ - "and %[ftmp0], %[ftmp0], %[ftmp3] \n\t" + "pand %[ftmp0], %[ftmp0], %[ftmp3] \n\t" /* Filter2 = filter_value & hev */ - "and %[ftmp3], %[ftmp1], %[ftmp0] \n\t" + "pand %[ftmp3], %[ftmp1], %[ftmp0] \n\t" /* filter_value &= ~hev */ "pandn %[ftmp0], %[ftmp1], %[ftmp0] \n\t" @@ -852,10 +852,10 @@ void vp8_mbloop_filter_vertical_edge_mmi( VP8_MBLOOP_VPSRAB_ADDT "psubsb %[ftmp4], %[ftmp9], %[ftmp3] \n\t" /* ftmp9: oq0 */ - "xor %[ftmp9], %[ftmp4], %[ff_pb_80] \n\t" + "pxor %[ftmp9], %[ftmp4], %[ff_pb_80] \n\t" "paddsb %[ftmp4], %[ftmp6], %[ftmp3] \n\t" /* ftmp6: op0 */ - "xor %[ftmp6], %[ftmp4], %[ff_pb_80] \n\t" + "pxor %[ftmp6], %[ftmp4], %[ff_pb_80] \n\t" VP8_MBLOOP_VPSRAB_ADDH "paddh %[ftmp1], %[ff_ph_0900], %[ff_ph_0900] \n\t" @@ -864,10 +864,10 @@ void vp8_mbloop_filter_vertical_edge_mmi( VP8_MBLOOP_VPSRAB_ADDT "psubsb %[ftmp4], %[ftmp10], %[ftmp3] \n\t" /* ftmp10: oq1 */ - "xor %[ftmp10], %[ftmp4], %[ff_pb_80] \n\t" + "pxor %[ftmp10], %[ftmp4], %[ff_pb_80] \n\t" "paddsb %[ftmp4], %[ftmp5], %[ftmp3] \n\t" /* ftmp5: op1 */ - "xor %[ftmp5], %[ftmp4], %[ff_pb_80] \n\t" + "pxor %[ftmp5], %[ftmp4], %[ff_pb_80] \n\t" VP8_MBLOOP_VPSRAB_ADDH "pmulhh %[ftmp7], %[ftmp7], %[ff_ph_0900] \n\t" @@ -875,10 +875,10 @@ void vp8_mbloop_filter_vertical_edge_mmi( VP8_MBLOOP_VPSRAB_ADDT "psubsb %[ftmp4], %[ftmp11], %[ftmp3] \n\t" /* ftmp11: oq2 */ - "xor %[ftmp11], %[ftmp4], %[ff_pb_80] \n\t" + "pxor %[ftmp11], %[ftmp4], %[ff_pb_80] \n\t" "paddsb %[ftmp4], %[ftmp2], %[ftmp3] \n\t" /* ftmp2: op2 */ - "xor %[ftmp2], %[ftmp4], %[ff_pb_80] \n\t" + "pxor %[ftmp2], %[ftmp4], %[ff_pb_80] \n\t" "ldc1 %[ftmp12], 0x00(%[srct]) \n\t" "ldc1 %[ftmp8], 0x08(%[srct]) \n\t" @@ -965,7 +965,7 @@ void vp8_mbloop_filter_vertical_edge_mmi( "psrlh %[ftmp0], %[ftmp0], %[ftmp8] \n\t" \ "psrah %[ftmp1], %[ftmp5], %[ftmp10] \n\t" \ "psllh %[ftmp1], %[ftmp1], %[ftmp8] \n\t" \ - "or %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t" void vp8_loop_filter_simple_horizontal_edge_mmi(unsigned char *src_ptr, int src_pixel_step, @@ -1008,14 +1008,14 @@ void vp8_loop_filter_simple_horizontal_edge_mmi(unsigned char *src_ptr, "paddusb %[ftmp5], %[ftmp5], %[ftmp5] \n\t" "paddusb %[ftmp5], %[ftmp5], %[ftmp1] \n\t" "psubusb %[ftmp5], %[ftmp5], %[ftmp3] \n\t" - "xor %[ftmp3], %[ftmp3], %[ftmp3] \n\t" + "pxor %[ftmp3], %[ftmp3], %[ftmp3] \n\t" "pcmpeqb %[ftmp5], %[ftmp5], %[ftmp3] \n\t" - "xor %[ftmp2], %[ftmp2], %[ff_pb_80] \n\t" - "xor %[ftmp7], %[ftmp7], %[ff_pb_80] \n\t" + "pxor %[ftmp2], %[ftmp2], %[ff_pb_80] \n\t" + "pxor %[ftmp7], %[ftmp7], %[ff_pb_80] \n\t" "psubsb %[ftmp2], %[ftmp2], %[ftmp7] \n\t" - "xor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" - "xor %[ftmp3], %[ftmp0], %[ff_pb_80] \n\t" + "pxor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" + "pxor %[ftmp3], %[ftmp0], %[ff_pb_80] \n\t" "psubsb %[ftmp0], %[ftmp3], %[ftmp6] \n\t" "paddsb %[ftmp2], %[ftmp2], %[ftmp0] \n\t" "paddsb %[ftmp2], %[ftmp2], %[ftmp0] \n\t" @@ -1025,14 +1025,14 @@ void vp8_loop_filter_simple_horizontal_edge_mmi(unsigned char *src_ptr, "paddsb %[ftmp5], %[ftmp5], %[ff_pb_04] \n\t" VP8_SIMPLE_HPSRAB "psubsb %[ftmp3], %[ftmp3], %[ftmp0] \n\t" - "xor %[ftmp3], %[ftmp3], %[ff_pb_80] \n\t" + "pxor %[ftmp3], %[ftmp3], %[ff_pb_80] \n\t" "gssdlc1 %[ftmp3], 0x07(%[src_ptr]) \n\t" "gssdrc1 %[ftmp3], 0x00(%[src_ptr]) \n\t" "psubsb %[ftmp5], %[ftmp5], %[ff_pb_01] \n\t" VP8_SIMPLE_HPSRAB "paddsb %[ftmp6], %[ftmp6], %[ftmp0] \n\t" - "xor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" + "pxor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step]) "gssdlc1 %[ftmp6], 0x07(%[addr1]) \n\t" "gssdrc1 %[ftmp6], 0x00(%[addr1]) \n\t" @@ -1121,7 +1121,7 @@ void vp8_loop_filter_simple_vertical_edge_mmi(unsigned char *src_ptr, "li %[tmp0], 0x01 \n\t" "mtc1 %[tmp0], %[ftmp9] \n\t" "pasubub %[ftmp6], %[ftmp3], %[ftmp0] \n\t" - "and %[ftmp6], %[ftmp6], %[ff_pb_fe] \n\t" + "pand %[ftmp6], %[ftmp6], %[ff_pb_fe] \n\t" "psrlh %[ftmp6], %[ftmp6], %[ftmp9] \n\t" "pasubub %[ftmp5], %[ftmp1], %[ftmp2] \n\t" "paddusb %[ftmp5], %[ftmp5], %[ftmp5] \n\t" @@ -1130,23 +1130,23 @@ void vp8_loop_filter_simple_vertical_edge_mmi(unsigned char *src_ptr, "gsldlc1 %[ftmp7], 0x07(%[blimit]) \n\t" "gsldrc1 %[ftmp7], 0x00(%[blimit]) \n\t" "psubusb %[ftmp5], %[ftmp5], %[ftmp7] \n\t" - "xor %[ftmp7], %[ftmp7], %[ftmp7] \n\t" + "pxor %[ftmp7], %[ftmp7], %[ftmp7] \n\t" "pcmpeqb %[ftmp5], %[ftmp5], %[ftmp7] \n\t" "sdc1 %[ftmp0], 0x00(%[srct]) \n\t" "sdc1 %[ftmp3], 0x08(%[srct]) \n\t" - "xor %[ftmp0], %[ftmp0], %[ff_pb_80] \n\t" - "xor %[ftmp3], %[ftmp3], %[ff_pb_80] \n\t" + "pxor %[ftmp0], %[ftmp0], %[ff_pb_80] \n\t" + "pxor %[ftmp3], %[ftmp3], %[ff_pb_80] \n\t" "psubsb %[ftmp0], %[ftmp0], %[ftmp3] \n\t" - "xor %[ftmp6], %[ftmp1], %[ff_pb_80] \n\t" - "xor %[ftmp3], %[ftmp2], %[ff_pb_80] \n\t" + "pxor %[ftmp6], %[ftmp1], %[ff_pb_80] \n\t" + "pxor %[ftmp3], %[ftmp2], %[ff_pb_80] \n\t" "psubsb %[ftmp7], %[ftmp3], %[ftmp6] \n\t" "paddsb %[ftmp0], %[ftmp0], %[ftmp7] \n\t" "paddsb %[ftmp0], %[ftmp0], %[ftmp7] \n\t" "paddsb %[ftmp0], %[ftmp0], %[ftmp7] \n\t" - "and %[ftmp5], %[ftmp5], %[ftmp0] \n\t" + "pand %[ftmp5], %[ftmp5], %[ftmp0] \n\t" "paddsb %[ftmp5], %[ftmp5], %[ff_pb_04] \n\t" "li %[tmp0], 0x03 \n\t" @@ -1159,9 +1159,9 @@ void vp8_loop_filter_simple_vertical_edge_mmi(unsigned char *src_ptr, "mtc1 %[tmp0], %[ftmp9] \n\t" "psrah %[ftmp7], %[ftmp5], %[ftmp9] \n\t" "psllh %[ftmp7], %[ftmp7], %[ftmp8] \n\t" - "or %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp7] \n\t" "psubsb %[ftmp3], %[ftmp3], %[ftmp0] \n\t" - "xor %[ftmp3], %[ftmp3], %[ff_pb_80] \n\t" + "pxor %[ftmp3], %[ftmp3], %[ff_pb_80] \n\t" "psubsb %[ftmp5], %[ftmp5], %[ff_pb_01] \n\t" "li %[tmp0], 0x03 \n\t" @@ -1174,9 +1174,9 @@ void vp8_loop_filter_simple_vertical_edge_mmi(unsigned char *src_ptr, "mtc1 %[tmp0], %[ftmp9] \n\t" "psrah %[ftmp5], %[ftmp5], %[ftmp9] \n\t" "psllh %[ftmp5], %[ftmp5], %[ftmp8] \n\t" - "or %[ftmp0], %[ftmp0], %[ftmp5] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp5] \n\t" "paddsb %[ftmp6], %[ftmp6], %[ftmp0] \n\t" - "xor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" + "pxor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" "ldc1 %[ftmp0], 0x00(%[srct]) \n\t" "ldc1 %[ftmp4], 0x08(%[srct]) \n\t" @@ -1195,7 +1195,7 @@ void vp8_loop_filter_simple_vertical_edge_mmi(unsigned char *src_ptr, "punpckhhw %[ftmp5], %[ftmp1], %[ftmp3] \n\t" "punpcklhw %[ftmp1], %[ftmp1], %[ftmp3] \n\t" - "dsrl %[ftmp0], %[ftmp0], %[ftmp10] \n\t" + "ssrld %[ftmp0], %[ftmp0], %[ftmp10] \n\t" MMI_SUBU(%[addr1], %[addr0], %[src_pixel_step_x4]) "gsswlc1 %[ftmp0], 0x03(%[addr1]) \n\t" "gsswrc1 %[ftmp0], 0x00(%[addr1]) \n\t" @@ -1203,7 +1203,7 @@ void vp8_loop_filter_simple_vertical_edge_mmi(unsigned char *src_ptr, "gsswlc1 %[ftmp6], 0x03(%[addr1]) \n\t" "gsswrc1 %[ftmp6], 0x00(%[addr1]) \n\t" - "dsrl %[ftmp6], %[ftmp6], %[ftmp10] \n\t" + "ssrld %[ftmp6], %[ftmp6], %[ftmp10] \n\t" "gsswlc1 %[ftmp1], 0x03(%[src_ptr]) \n\t" "gsswrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" @@ -1215,11 +1215,11 @@ void vp8_loop_filter_simple_vertical_edge_mmi(unsigned char *src_ptr, "gsswlc1 %[ftmp5], 0x03(%[addr1]) \n\t" "gsswrc1 %[ftmp5], 0x00(%[addr1]) \n\t" - "dsrl %[ftmp1], %[ftmp1], %[ftmp10] \n\t" + "ssrld %[ftmp1], %[ftmp1], %[ftmp10] \n\t" "gsswlc1 %[ftmp1], 0x03(%[addr0]) \n\t" "gsswrc1 %[ftmp1], 0x00(%[addr0]) \n\t" - "dsrl %[ftmp5], %[ftmp5], %[ftmp10] \n\t" + "ssrld %[ftmp5], %[ftmp5], %[ftmp10] \n\t" MMI_ADDU(%[addr1], %[addr0], %[src_pixel_step_x2]) "gsswlc1 %[ftmp5], 0x03(%[addr1]) \n\t" "gsswrc1 %[ftmp5], 0x00(%[addr1]) \n\t" diff --git a/vp8/common/mips/mmi/sixtap_filter_mmi.c b/vp8/common/mips/mmi/sixtap_filter_mmi.c index 77d665d456..dbe35d09f3 100644 --- a/vp8/common/mips/mmi/sixtap_filter_mmi.c +++ b/vp8/common/mips/mmi/sixtap_filter_mmi.c @@ -110,7 +110,7 @@ static INLINE void vp8_filter_block1d_h6_mmi(unsigned char *src_ptr, "ldc1 %[ftmp3], 0x30(%[vp8_filter]) \n\t" "ldc1 %[ftmp4], 0x40(%[vp8_filter]) \n\t" "ldc1 %[ftmp5], 0x50(%[vp8_filter]) \n\t" - "xor %[fzero], %[fzero], %[fzero] \n\t" + "pxor %[fzero], %[fzero], %[fzero] \n\t" "li %[tmp0], 0x07 \n\t" "mtc1 %[tmp0], %[ftmp7] \n\t" "li %[tmp0], 0x08 \n\t" @@ -137,12 +137,12 @@ static INLINE void vp8_filter_block1d_h6_mmi(unsigned char *src_ptr, "pmullh %[ftmp6], %[ftmp6], %[ftmp5] \n\t" "paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t" - "dsrl %[ftmp10], %[ftmp10], %[ftmp11] \n\t" + "ssrld %[ftmp10], %[ftmp10], %[ftmp11] \n\t" "punpcklbh %[ftmp6], %[ftmp10], %[fzero] \n\t" "pmullh %[ftmp6], %[ftmp6], %[ftmp2] \n\t" "paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t" - "dsrl %[ftmp10], %[ftmp10], %[ftmp11] \n\t" + "ssrld %[ftmp10], %[ftmp10], %[ftmp11] \n\t" "punpcklbh %[ftmp6], %[ftmp10], %[fzero] \n\t" "pmullh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" "paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t" @@ -222,7 +222,7 @@ static INLINE void vp8_filter_block1dc_v6_mmi( "ldc1 %[ftmp3], 0x30(%[vp8_filter]) \n\t" "ldc1 %[ftmp4], 0x40(%[vp8_filter]) \n\t" "ldc1 %[ftmp5], 0x50(%[vp8_filter]) \n\t" - "xor %[fzero], %[fzero], %[fzero] \n\t" + "pxor %[fzero], %[fzero], %[fzero] \n\t" "li %[tmp0], 0x07 \n\t" "mtc1 %[tmp0], %[ftmp13] \n\t" @@ -314,7 +314,7 @@ static INLINE void vp8_filter_block1d_h6_filter0_mmi( #endif // _MIPS_SIM == _ABIO32 __asm__ volatile ( - "xor %[fzero], %[fzero], %[fzero] \n\t" + "pxor %[fzero], %[fzero], %[fzero] \n\t" "1: \n\t" "gsldlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t" @@ -351,7 +351,7 @@ static INLINE void vp8_filter_block1dc_v6_filter0_mmi( #endif // _MIPS_SIM == _ABIO32 __asm__ volatile ( - "xor %[fzero], %[fzero], %[fzero] \n\t" + "pxor %[fzero], %[fzero], %[fzero] \n\t" "1: \n\t" "gsldlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t" diff --git a/vp8/encoder/mips/mmi/dct_mmi.c b/vp8/encoder/mips/mmi/dct_mmi.c index 1f60a692d0..b5ecf0f1ca 100644 --- a/vp8/encoder/mips/mmi/dct_mmi.c +++ b/vp8/encoder/mips/mmi/dct_mmi.c @@ -24,19 +24,19 @@ "punpcklhw %[ftmp5], %[ftmp1], %[ftmp0] \n\t" \ "punpcklhw %[ftmp9], %[ftmp2], %[ftmp0] \n\t" \ "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \ - "or %[ftmp5], %[ftmp5], %[ftmp9] \n\t" \ + "por %[ftmp5], %[ftmp5], %[ftmp9] \n\t" \ "punpckhhw %[ftmp6], %[ftmp1], %[ftmp0] \n\t" \ "punpckhhw %[ftmp9], %[ftmp2], %[ftmp0] \n\t" \ "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \ - "or %[ftmp6], %[ftmp6], %[ftmp9] \n\t" \ + "por %[ftmp6], %[ftmp6], %[ftmp9] \n\t" \ "punpcklhw %[ftmp7], %[ftmp3], %[ftmp0] \n\t" \ "punpcklhw %[ftmp9], %[ftmp4], %[ftmp0] \n\t" \ "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \ - "or %[ftmp7], %[ftmp7], %[ftmp9] \n\t" \ + "por %[ftmp7], %[ftmp7], %[ftmp9] \n\t" \ "punpckhhw %[ftmp8], %[ftmp3], %[ftmp0] \n\t" \ "punpckhhw %[ftmp9], %[ftmp4], %[ftmp0] \n\t" \ "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \ - "or %[ftmp8], %[ftmp8], %[ftmp9] \n\t" \ + "por %[ftmp8], %[ftmp8], %[ftmp9] \n\t" \ "punpcklwd %[ftmp1], %[ftmp5], %[ftmp7] \n\t" \ "punpckhwd %[ftmp2], %[ftmp5], %[ftmp7] \n\t" \ "punpcklwd %[ftmp3], %[ftmp6], %[ftmp8] \n\t" \ @@ -90,7 +90,7 @@ void vp8_short_fdct4x4_mmi(int16_t *input, int16_t *output, int pitch) { DECLARE_ALIGNED(8, const uint64_t, ff_ph_8) = { 0x0008000800080008ULL }; __asm__ volatile ( - "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" "gsldlc1 %[ftmp1], 0x07(%[ip]) \n\t" "gsldrc1 %[ftmp1], 0x00(%[ip]) \n\t" MMI_ADDU(%[ip], %[ip], %[pitch]) @@ -237,7 +237,7 @@ void vp8_short_walsh4x4_mmi(int16_t *input, int16_t *output, int pitch) { __asm__ volatile ( MMI_LI(%[tmp0], 0x02) - "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" "mtc1 %[tmp0], %[ftmp11] \n\t" "gsldlc1 %[ftmp1], 0x07(%[ip]) \n\t" @@ -340,49 +340,49 @@ void vp8_short_walsh4x4_mmi(int16_t *input, int16_t *output, int pitch) { "mtc1 %[tmp0], %[ftmp11] \n\t" "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp1] \n\t" - "and %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t" + "pand %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t" "paddw %[ftmp1], %[ftmp1], %[ftmp9] \n\t" "paddw %[ftmp1], %[ftmp1], %[ff_pw_03] \n\t" "psraw %[ftmp1], %[ftmp1], %[ftmp11] \n\t" "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp2] \n\t" - "and %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t" + "pand %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t" "paddw %[ftmp2], %[ftmp2], %[ftmp9] \n\t" "paddw %[ftmp2], %[ftmp2], %[ff_pw_03] \n\t" "psraw %[ftmp2], %[ftmp2], %[ftmp11] \n\t" "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp3] \n\t" - "and %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t" + "pand %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t" "paddw %[ftmp3], %[ftmp3], %[ftmp9] \n\t" "paddw %[ftmp3], %[ftmp3], %[ff_pw_03] \n\t" "psraw %[ftmp3], %[ftmp3], %[ftmp11] \n\t" "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp4] \n\t" - "and %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t" + "pand %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t" "paddw %[ftmp4], %[ftmp4], %[ftmp9] \n\t" "paddw %[ftmp4], %[ftmp4], %[ff_pw_03] \n\t" "psraw %[ftmp4], %[ftmp4], %[ftmp11] \n\t" "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp5] \n\t" - "and %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t" + "pand %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t" "paddw %[ftmp5], %[ftmp5], %[ftmp9] \n\t" "paddw %[ftmp5], %[ftmp5], %[ff_pw_03] \n\t" "psraw %[ftmp5], %[ftmp5], %[ftmp11] \n\t" "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp6] \n\t" - "and %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t" + "pand %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t" "paddw %[ftmp6], %[ftmp6], %[ftmp9] \n\t" "paddw %[ftmp6], %[ftmp6], %[ff_pw_03] \n\t" "psraw %[ftmp6], %[ftmp6], %[ftmp11] \n\t" "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp7] \n\t" - "and %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t" + "pand %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t" "paddw %[ftmp7], %[ftmp7], %[ftmp9] \n\t" "paddw %[ftmp7], %[ftmp7], %[ff_pw_03] \n\t" "psraw %[ftmp7], %[ftmp7], %[ftmp11] \n\t" "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp8] \n\t" - "and %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t" + "pand %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t" "paddw %[ftmp8], %[ftmp8], %[ftmp9] \n\t" "paddw %[ftmp8], %[ftmp8], %[ff_pw_03] \n\t" "psraw %[ftmp8], %[ftmp8], %[ftmp11] \n\t" diff --git a/vp8/encoder/mips/mmi/vp8_quantize_mmi.c b/vp8/encoder/mips/mmi/vp8_quantize_mmi.c index 3ccb196ffb..69a9e5e018 100644 --- a/vp8/encoder/mips/mmi/vp8_quantize_mmi.c +++ b/vp8/encoder/mips/mmi/vp8_quantize_mmi.c @@ -47,7 +47,7 @@ void vp8_fast_quantize_b_mmi(BLOCK *b, BLOCKD *d) { __asm__ volatile( // loop 0 ~ 7 - "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" "gsldlc1 %[ftmp1], 0x07(%[coeff_ptr]) \n\t" "gsldrc1 %[ftmp1], 0x00(%[coeff_ptr]) \n\t" "li %[tmp0], 0x0f \n\t" @@ -56,10 +56,10 @@ void vp8_fast_quantize_b_mmi(BLOCK *b, BLOCKD *d) { "gsldrc1 %[ftmp2], 0x08(%[coeff_ptr]) \n\t" "psrah %[ftmp3], %[ftmp1], %[ftmp9] \n\t" - "xor %[ftmp1], %[ftmp3], %[ftmp1] \n\t" + "pxor %[ftmp1], %[ftmp3], %[ftmp1] \n\t" "psubh %[ftmp1], %[ftmp1], %[ftmp3] \n\t" "psrah %[ftmp4], %[ftmp2], %[ftmp9] \n\t" - "xor %[ftmp2], %[ftmp4], %[ftmp2] \n\t" + "pxor %[ftmp2], %[ftmp4], %[ftmp2] \n\t" "psubh %[ftmp2], %[ftmp2], %[ftmp4] \n\t" "gsldlc1 %[ftmp5], 0x07(%[round_ptr]) \n\t" @@ -75,8 +75,8 @@ void vp8_fast_quantize_b_mmi(BLOCK *b, BLOCKD *d) { "pmulhuh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" "pmulhuh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" - "xor %[ftmp7], %[ftmp5], %[ftmp3] \n\t" - "xor %[ftmp8], %[ftmp6], %[ftmp4] \n\t" + "pxor %[ftmp7], %[ftmp5], %[ftmp3] \n\t" + "pxor %[ftmp8], %[ftmp6], %[ftmp4] \n\t" "psubh %[ftmp7], %[ftmp7], %[ftmp3] \n\t" "psubh %[ftmp8], %[ftmp8], %[ftmp4] \n\t" "gssdlc1 %[ftmp7], 0x07(%[qcoeff_ptr]) \n\t" @@ -90,10 +90,10 @@ void vp8_fast_quantize_b_mmi(BLOCK *b, BLOCKD *d) { "gsldrc1 %[ftmp2], 0x08(%[inv_zig_zag]) \n\t" "pcmpeqh %[ftmp5], %[ftmp5], %[ftmp0] \n\t" "pcmpeqh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" - "xor %[ftmp5], %[ftmp5], %[ones] \n\t" - "xor %[ftmp6], %[ftmp6], %[ones] \n\t" - "and %[ftmp5], %[ftmp5], %[ftmp1] \n\t" - "and %[ftmp6], %[ftmp6], %[ftmp2] \n\t" + "pxor %[ftmp5], %[ftmp5], %[ones] \n\t" + "pxor %[ftmp6], %[ftmp6], %[ones] \n\t" + "pand %[ftmp5], %[ftmp5], %[ftmp1] \n\t" + "pand %[ftmp6], %[ftmp6], %[ftmp2] \n\t" "pmaxsh %[ftmp10], %[ftmp5], %[ftmp6] \n\t" "gsldlc1 %[ftmp5], 0x07(%[dequant_ptr]) \n\t" @@ -114,10 +114,10 @@ void vp8_fast_quantize_b_mmi(BLOCK *b, BLOCKD *d) { "gsldrc1 %[ftmp2], 0x18(%[coeff_ptr]) \n\t" "psrah %[ftmp3], %[ftmp1], %[ftmp9] \n\t" - "xor %[ftmp1], %[ftmp3], %[ftmp1] \n\t" + "pxor %[ftmp1], %[ftmp3], %[ftmp1] \n\t" "psubh %[ftmp1], %[ftmp1], %[ftmp3] \n\t" "psrah %[ftmp4], %[ftmp2], %[ftmp9] \n\t" - "xor %[ftmp2], %[ftmp4], %[ftmp2] \n\t" + "pxor %[ftmp2], %[ftmp4], %[ftmp2] \n\t" "psubh %[ftmp2], %[ftmp2], %[ftmp4] \n\t" "gsldlc1 %[ftmp5], 0x17(%[round_ptr]) \n\t" @@ -133,8 +133,8 @@ void vp8_fast_quantize_b_mmi(BLOCK *b, BLOCKD *d) { "pmulhuh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" "pmulhuh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" - "xor %[ftmp7], %[ftmp5], %[ftmp3] \n\t" - "xor %[ftmp8], %[ftmp6], %[ftmp4] \n\t" + "pxor %[ftmp7], %[ftmp5], %[ftmp3] \n\t" + "pxor %[ftmp8], %[ftmp6], %[ftmp4] \n\t" "psubh %[ftmp7], %[ftmp7], %[ftmp3] \n\t" "psubh %[ftmp8], %[ftmp8], %[ftmp4] \n\t" "gssdlc1 %[ftmp7], 0x17(%[qcoeff_ptr]) \n\t" @@ -148,10 +148,10 @@ void vp8_fast_quantize_b_mmi(BLOCK *b, BLOCKD *d) { "gsldrc1 %[ftmp2], 0x18(%[inv_zig_zag]) \n\t" "pcmpeqh %[ftmp5], %[ftmp5], %[ftmp0] \n\t" "pcmpeqh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" - "xor %[ftmp5], %[ftmp5], %[ones] \n\t" - "xor %[ftmp6], %[ftmp6], %[ones] \n\t" - "and %[ftmp5], %[ftmp5], %[ftmp1] \n\t" - "and %[ftmp6], %[ftmp6], %[ftmp2] \n\t" + "pxor %[ftmp5], %[ftmp5], %[ones] \n\t" + "pxor %[ftmp6], %[ftmp6], %[ones] \n\t" + "pand %[ftmp5], %[ftmp5], %[ftmp1] \n\t" + "pand %[ftmp6], %[ftmp6], %[ftmp2] \n\t" "pmaxsh %[ftmp11], %[ftmp5], %[ftmp6] \n\t" "gsldlc1 %[ftmp5], 0x17(%[dequant_ptr]) \n\t" @@ -177,7 +177,7 @@ void vp8_fast_quantize_b_mmi(BLOCK *b, BLOCKD *d) { "pmaxsh %[ftmp10], %[ftmp10], %[ftmp11] \n\t" "li %[tmp0], 0xffff \n\t" "mtc1 %[tmp0], %[ftmp9] \n\t" - "and %[ftmp10], %[ftmp10], %[ftmp9] \n\t" + "pand %[ftmp10], %[ftmp10], %[ftmp9] \n\t" "gssdlc1 %[ftmp10], 0x07(%[eob]) \n\t" "gssdrc1 %[ftmp10], 0x00(%[eob]) \n\t" : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]), @@ -217,7 +217,7 @@ void vp8_regular_quantize_b_mmi(BLOCK *b, BLOCKD *d) { // memset(dqcoeff_ptr, 0, 32); /* clang-format off */ __asm__ volatile ( - "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" "gssdlc1 %[ftmp0], 0x07(%[qcoeff_ptr]) \n\t" "gssdrc1 %[ftmp0], 0x00(%[qcoeff_ptr]) \n\t" "gssdlc1 %[ftmp0], 0x0f(%[qcoeff_ptr]) \n\t" diff --git a/vpx_dsp/mips/sad_mmi.c b/vpx_dsp/mips/sad_mmi.c index 4368db5fdb..5dee3164bc 100644 --- a/vpx_dsp/mips/sad_mmi.c +++ b/vpx_dsp/mips/sad_mmi.c @@ -365,7 +365,7 @@ static inline unsigned int vpx_sad64x(const uint8_t *src, int src_stride, mips_reg l_counter = counter; __asm__ volatile ( - "xor %[ftmp5], %[ftmp5], %[ftmp5] \n\t" + "pxor %[ftmp5], %[ftmp5], %[ftmp5] \n\t" "1: \n\t" // Include two loop body, to reduce loop time. SAD_SRC_REF_ABS_SUB_64 @@ -407,7 +407,7 @@ static inline unsigned int vpx_sad_avg64x(const uint8_t *src, int src_stride, mips_reg l_counter = counter; __asm__ volatile ( - "xor %[ftmp5], %[ftmp5], %[ftmp5] \n\t" + "pxor %[ftmp5], %[ftmp5], %[ftmp5] \n\t" "1: \n\t" // Include two loop body, to reduce loop time. SAD_SRC_AVGREF_ABS_SUB_64 @@ -451,7 +451,7 @@ static inline unsigned int vpx_sad32x(const uint8_t *src, int src_stride, mips_reg l_counter = counter; __asm__ volatile ( - "xor %[ftmp5], %[ftmp5], %[ftmp5] \n\t" + "pxor %[ftmp5], %[ftmp5], %[ftmp5] \n\t" "1: \n\t" // Include two loop body, to reduce loop time. SAD_SRC_REF_ABS_SUB_32 @@ -495,7 +495,7 @@ static inline unsigned int vpx_sad_avg32x(const uint8_t *src, int src_stride, mips_reg l_counter = counter; __asm__ volatile ( - "xor %[ftmp5], %[ftmp5], %[ftmp5] \n\t" + "pxor %[ftmp5], %[ftmp5], %[ftmp5] \n\t" "1: \n\t" // Include two loop body, to reduce loop time. SAD_SRC_AVGREF_ABS_SUB_32 @@ -540,7 +540,7 @@ static inline unsigned int vpx_sad16x(const uint8_t *src, int src_stride, mips_reg l_counter = counter; __asm__ volatile ( - "xor %[ftmp5], %[ftmp5], %[ftmp5] \n\t" + "pxor %[ftmp5], %[ftmp5], %[ftmp5] \n\t" "1: \n\t" // Include two loop body, to reduce loop time. SAD_SRC_REF_ABS_SUB_16 @@ -588,7 +588,7 @@ static inline unsigned int vpx_sad_avg16x(const uint8_t *src, int src_stride, mips_reg l_counter = counter; __asm__ volatile ( - "xor %[ftmp5], %[ftmp5], %[ftmp5] \n\t" + "pxor %[ftmp5], %[ftmp5], %[ftmp5] \n\t" "1: \n\t" // Include two loop body, to reduce loop time. SAD_SRC_AVGREF_ABS_SUB_16 @@ -633,7 +633,7 @@ static inline unsigned int vpx_sad8x(const uint8_t *src, int src_stride, mips_reg l_counter = counter; __asm__ volatile ( - "xor %[ftmp3], %[ftmp3], %[ftmp3] \n\t" + "pxor %[ftmp3], %[ftmp3], %[ftmp3] \n\t" "1: \n\t" // Include two loop body, to reduce loop time. SAD_SRC_REF_ABS_SUB_8 @@ -681,7 +681,7 @@ static inline unsigned int vpx_sad_avg8x(const uint8_t *src, int src_stride, mips_reg l_counter = counter; __asm__ volatile ( - "xor %[ftmp3], %[ftmp3], %[ftmp3] \n\t" + "pxor %[ftmp3], %[ftmp3], %[ftmp3] \n\t" "1: \n\t" // Include two loop body, to reduce loop time. SAD_SRC_AVGREF_ABS_SUB_8 @@ -725,7 +725,7 @@ static inline unsigned int vpx_sad4x(const uint8_t *src, int src_stride, mips_reg l_counter = counter; __asm__ volatile ( - "xor %[ftmp3], %[ftmp3], %[ftmp3] \n\t" + "pxor %[ftmp3], %[ftmp3], %[ftmp3] \n\t" "1: \n\t" // Include two loop body, to reduce loop time. SAD_SRC_REF_ABS_SUB_4 @@ -769,7 +769,7 @@ static inline unsigned int vpx_sad_avg4x(const uint8_t *src, int src_stride, mips_reg l_counter = counter; __asm__ volatile ( - "xor %[ftmp3], %[ftmp3], %[ftmp3] \n\t" + "pxor %[ftmp3], %[ftmp3], %[ftmp3] \n\t" "1: \n\t" // Include two loop body, to reduce loop time. SAD_SRC_AVGREF_ABS_SUB_4 diff --git a/vpx_dsp/mips/subtract_mmi.c b/vpx_dsp/mips/subtract_mmi.c index 9f361704a8..8bd7e6977c 100644 --- a/vpx_dsp/mips/subtract_mmi.c +++ b/vpx_dsp/mips/subtract_mmi.c @@ -24,7 +24,7 @@ void vpx_subtract_block_mmi(int rows, int cols, int16_t *diff, switch (rows) { case 4: __asm__ volatile( - "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" #if _MIPS_SIM == _ABIO32 "ulw %[tmp0], 0x00(%[src]) \n\t" "mtc1 %[tmp0], %[ftmp1] \n\t" @@ -118,7 +118,7 @@ void vpx_subtract_block_mmi(int rows, int cols, int16_t *diff, break; case 8: __asm__ volatile( - "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" "li %[tmp0], 0x02 \n\t" "1: \n\t" "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" @@ -206,7 +206,7 @@ void vpx_subtract_block_mmi(int rows, int cols, int16_t *diff, break; case 16: __asm__ volatile( - "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" "li %[tmp0], 0x08 \n\t" "1: \n\t" "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" diff --git a/vpx_dsp/mips/variance_mmi.c b/vpx_dsp/mips/variance_mmi.c index c1780c33af..29e52a1a89 100644 --- a/vpx_dsp/mips/variance_mmi.c +++ b/vpx_dsp/mips/variance_mmi.c @@ -150,7 +150,7 @@ static const uint8_t bilinear_filters[8][2] = { "psrlh %[ftmp2], %[ftmp2], %[ftmp6] \n\t" \ \ /* store: temp2[0] ~ temp2[3] */ \ - "and %[ftmp2], %[ftmp2], %[mask] \n\t" \ + "pand %[ftmp2], %[ftmp2], %[mask] \n\t" \ "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t" \ "gssdrc1 %[ftmp2], 0x00(%[temp2_ptr]) \n\t" @@ -163,7 +163,7 @@ static const uint8_t bilinear_filters[8][2] = { "psrlh %[ftmp4], %[ftmp4], %[ftmp6] \n\t" \ \ /* store: temp2[0] ~ temp2[3] */ \ - "and %[ftmp4], %[ftmp4], %[mask] \n\t" \ + "pand %[ftmp4], %[ftmp4], %[mask] \n\t" \ "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t" \ "gssdrc1 %[ftmp4], 0x00(%[temp2_ptr]) \n\t" @@ -225,8 +225,8 @@ static const uint8_t bilinear_filters[8][2] = { "psrlh %[ftmp3], %[ftmp3], %[ftmp14] \n\t" \ \ /* store: temp2[0] ~ temp2[7] */ \ - "and %[ftmp2], %[ftmp2], %[mask] \n\t" \ - "and %[ftmp3], %[ftmp3], %[mask] \n\t" \ + "pand %[ftmp2], %[ftmp2], %[mask] \n\t" \ + "pand %[ftmp3], %[ftmp3], %[mask] \n\t" \ "packushb %[ftmp2], %[ftmp2], %[ftmp3] \n\t" \ "gssdlc1 %[ftmp2], 0x07(%[temp2_ptr]) \n\t" \ "gssdrc1 %[ftmp2], 0x00(%[temp2_ptr]) \n\t" @@ -247,8 +247,8 @@ static const uint8_t bilinear_filters[8][2] = { "psrlh %[ftmp9], %[ftmp9], %[ftmp14] \n\t" \ \ /* store: temp2[0] ~ temp2[7] */ \ - "and %[ftmp8], %[ftmp8], %[mask] \n\t" \ - "and %[ftmp9], %[ftmp9], %[mask] \n\t" \ + "pand %[ftmp8], %[ftmp8], %[mask] \n\t" \ + "pand %[ftmp9], %[ftmp9], %[mask] \n\t" \ "packushb %[ftmp8], %[ftmp8], %[ftmp9] \n\t" \ "gssdlc1 %[ftmp8], 0x07(%[temp2_ptr]) \n\t" \ "gssdrc1 %[ftmp8], 0x00(%[temp2_ptr]) \n\t" @@ -319,8 +319,8 @@ static const uint8_t bilinear_filters[8][2] = { "psrlh %[ftmp5], %[ftmp5], %[ftmp14] \n\t" \ \ /* store: temp2[8] ~ temp2[15] */ \ - "and %[ftmp4], %[ftmp4], %[mask] \n\t" \ - "and %[ftmp5], %[ftmp5], %[mask] \n\t" \ + "pand %[ftmp4], %[ftmp4], %[mask] \n\t" \ + "pand %[ftmp5], %[ftmp5], %[mask] \n\t" \ "packushb %[ftmp4], %[ftmp4], %[ftmp5] \n\t" \ "gssdlc1 %[ftmp4], 0x0f(%[temp2_ptr]) \n\t" \ "gssdrc1 %[ftmp4], 0x08(%[temp2_ptr]) \n\t" @@ -343,8 +343,8 @@ static const uint8_t bilinear_filters[8][2] = { "psrlh %[ftmp11], %[ftmp11], %[ftmp14] \n\t" \ \ /* store: temp2[8] ~ temp2[15] */ \ - "and %[ftmp10], %[ftmp10], %[mask] \n\t" \ - "and %[ftmp11], %[ftmp11], %[mask] \n\t" \ + "pand %[ftmp10], %[ftmp10], %[mask] \n\t" \ + "pand %[ftmp11], %[ftmp11], %[mask] \n\t" \ "packushb %[ftmp10], %[ftmp10], %[ftmp11] \n\t" \ "gssdlc1 %[ftmp10], 0x0f(%[temp2_ptr]) \n\t" \ "gssdrc1 %[ftmp10], 0x08(%[temp2_ptr]) \n\t" @@ -418,9 +418,9 @@ static inline uint32_t vpx_variance64x(const uint8_t *src_ptr, int src_stride, "li %[tmp0], 0x20 \n\t" "mtc1 %[tmp0], %[ftmp11] \n\t" MMI_L(%[tmp0], %[high], 0x00) - "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" - "xor %[ftmp9], %[ftmp9], %[ftmp9] \n\t" - "xor %[ftmp10], %[ftmp10], %[ftmp10] \n\t" + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "pxor %[ftmp9], %[ftmp9], %[ftmp9] \n\t" + "pxor %[ftmp10], %[ftmp10], %[ftmp10] \n\t" "1: \n\t" "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" @@ -478,7 +478,7 @@ static inline uint32_t vpx_variance64x(const uint8_t *src_ptr, int src_stride, "mfc1 %[tmp1], %[ftmp9] \n\t" "mfhc1 %[tmp2], %[ftmp9] \n\t" "addu %[sum], %[tmp1], %[tmp2] \n\t" - "dsrl %[ftmp1], %[ftmp10], %[ftmp11] \n\t" + "ssrld %[ftmp1], %[ftmp10], %[ftmp11] \n\t" "paddw %[ftmp1], %[ftmp1], %[ftmp10] \n\t" "swc1 %[ftmp1], 0x00(%[sse]) \n\t" : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), @@ -523,9 +523,9 @@ uint32_t vpx_variance32x64_mmi(const uint8_t *src_ptr, int src_stride, "li %[tmp0], 0x20 \n\t" "mtc1 %[tmp0], %[ftmp11] \n\t" "li %[tmp0], 0x40 \n\t" - "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" - "xor %[ftmp9], %[ftmp9], %[ftmp9] \n\t" - "xor %[ftmp10], %[ftmp10], %[ftmp10] \n\t" + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "pxor %[ftmp9], %[ftmp9], %[ftmp9] \n\t" + "pxor %[ftmp10], %[ftmp10], %[ftmp10] \n\t" "1: \n\t" "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" @@ -559,7 +559,7 @@ uint32_t vpx_variance32x64_mmi(const uint8_t *src_ptr, int src_stride, "mfc1 %[tmp1], %[ftmp9] \n\t" "mfhc1 %[tmp2], %[ftmp9] \n\t" "addu %[sum], %[tmp1], %[tmp2] \n\t" - "dsrl %[ftmp1], %[ftmp10], %[ftmp11] \n\t" + "ssrld %[ftmp1], %[ftmp10], %[ftmp11] \n\t" "paddw %[ftmp1], %[ftmp1], %[ftmp10] \n\t" "swc1 %[ftmp1], 0x00(%[sse]) \n\t" : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), @@ -594,10 +594,10 @@ static inline uint32_t vpx_variance32x(const uint8_t *src_ptr, int src_stride, "li %[tmp0], 0x20 \n\t" "mtc1 %[tmp0], %[ftmp11] \n\t" MMI_L(%[tmp0], %[high], 0x00) - "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" - "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" - "xor %[ftmp10], %[ftmp10], %[ftmp10] \n\t" - "xor %[ftmp12], %[ftmp12], %[ftmp12] \n\t" + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "pxor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" + "pxor %[ftmp10], %[ftmp10], %[ftmp10] \n\t" + "pxor %[ftmp12], %[ftmp12], %[ftmp12] \n\t" "1: \n\t" "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" @@ -625,7 +625,7 @@ static inline uint32_t vpx_variance32x(const uint8_t *src_ptr, int src_stride, MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride]) "bnez %[tmp0], 1b \n\t" - "dsrl %[ftmp9], %[ftmp8], %[ftmp11] \n\t" + "ssrld %[ftmp9], %[ftmp8], %[ftmp11] \n\t" "paddw %[ftmp9], %[ftmp9], %[ftmp8] \n\t" "swc1 %[ftmp9], 0x00(%[sse]) \n\t" @@ -636,7 +636,7 @@ static inline uint32_t vpx_variance32x(const uint8_t *src_ptr, int src_stride, "paddw %[ftmp3], %[ftmp3], %[ftmp4] \n\t" "psubw %[ftmp3], %[ftmp3], %[ftmp5] \n\t" "psubw %[ftmp3], %[ftmp3], %[ftmp6] \n\t" - "dsrl %[ftmp0], %[ftmp3], %[ftmp11] \n\t" + "ssrld %[ftmp0], %[ftmp3], %[ftmp11] \n\t" "paddw %[ftmp0], %[ftmp0], %[ftmp3] \n\t" "swc1 %[ftmp0], 0x00(%[sum]) \n\t" @@ -680,10 +680,10 @@ static inline uint32_t vpx_variance16x(const uint8_t *src_ptr, int src_stride, "li %[tmp0], 0x20 \n\t" "mtc1 %[tmp0], %[ftmp11] \n\t" MMI_L(%[tmp0], %[high], 0x00) - "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" - "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" - "xor %[ftmp10], %[ftmp10], %[ftmp10] \n\t" - "xor %[ftmp12], %[ftmp12], %[ftmp12] \n\t" + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "pxor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" + "pxor %[ftmp10], %[ftmp10], %[ftmp10] \n\t" + "pxor %[ftmp12], %[ftmp12], %[ftmp12] \n\t" "1: \n\t" "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" @@ -701,7 +701,7 @@ static inline uint32_t vpx_variance16x(const uint8_t *src_ptr, int src_stride, MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride]) "bnez %[tmp0], 1b \n\t" - "dsrl %[ftmp9], %[ftmp8], %[ftmp11] \n\t" + "ssrld %[ftmp9], %[ftmp8], %[ftmp11] \n\t" "paddw %[ftmp9], %[ftmp9], %[ftmp8] \n\t" "swc1 %[ftmp9], 0x00(%[sse]) \n\t" @@ -712,7 +712,7 @@ static inline uint32_t vpx_variance16x(const uint8_t *src_ptr, int src_stride, "paddw %[ftmp3], %[ftmp3], %[ftmp4] \n\t" "psubw %[ftmp3], %[ftmp3], %[ftmp5] \n\t" "psubw %[ftmp3], %[ftmp3], %[ftmp6] \n\t" - "dsrl %[ftmp0], %[ftmp3], %[ftmp11] \n\t" + "ssrld %[ftmp0], %[ftmp3], %[ftmp11] \n\t" "paddw %[ftmp0], %[ftmp0], %[ftmp3] \n\t" "swc1 %[ftmp0], 0x00(%[sum]) \n\t" @@ -757,10 +757,10 @@ static inline uint32_t vpx_variance8x(const uint8_t *src_ptr, int src_stride, "li %[tmp0], 0x20 \n\t" "mtc1 %[tmp0], %[ftmp11] \n\t" MMI_L(%[tmp0], %[high], 0x00) - "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" - "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" - "xor %[ftmp10], %[ftmp10], %[ftmp10] \n\t" - "xor %[ftmp12], %[ftmp12], %[ftmp12] \n\t" + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "pxor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" + "pxor %[ftmp10], %[ftmp10], %[ftmp10] \n\t" + "pxor %[ftmp12], %[ftmp12], %[ftmp12] \n\t" "1: \n\t" "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" @@ -773,7 +773,7 @@ static inline uint32_t vpx_variance8x(const uint8_t *src_ptr, int src_stride, MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride]) "bnez %[tmp0], 1b \n\t" - "dsrl %[ftmp9], %[ftmp8], %[ftmp11] \n\t" + "ssrld %[ftmp9], %[ftmp8], %[ftmp11] \n\t" "paddw %[ftmp9], %[ftmp9], %[ftmp8] \n\t" "swc1 %[ftmp9], 0x00(%[sse]) \n\t" @@ -784,7 +784,7 @@ static inline uint32_t vpx_variance8x(const uint8_t *src_ptr, int src_stride, "paddw %[ftmp3], %[ftmp3], %[ftmp4] \n\t" "psubw %[ftmp3], %[ftmp3], %[ftmp5] \n\t" "psubw %[ftmp3], %[ftmp3], %[ftmp6] \n\t" - "dsrl %[ftmp0], %[ftmp3], %[ftmp11] \n\t" + "ssrld %[ftmp0], %[ftmp3], %[ftmp11] \n\t" "paddw %[ftmp0], %[ftmp0], %[ftmp3] \n\t" "swc1 %[ftmp0], 0x00(%[sum]) \n\t" @@ -829,10 +829,10 @@ static inline uint32_t vpx_variance4x(const uint8_t *src_ptr, int src_stride, "li %[tmp0], 0x20 \n\t" "mtc1 %[tmp0], %[ftmp10] \n\t" MMI_L(%[tmp0], %[high], 0x00) - "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" - "xor %[ftmp6], %[ftmp6], %[ftmp6] \n\t" - "xor %[ftmp7], %[ftmp7], %[ftmp7] \n\t" - "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "pxor %[ftmp6], %[ftmp6], %[ftmp6] \n\t" + "pxor %[ftmp7], %[ftmp7], %[ftmp7] \n\t" + "pxor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" "1: \n\t" "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" @@ -845,7 +845,7 @@ static inline uint32_t vpx_variance4x(const uint8_t *src_ptr, int src_stride, MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride]) "bnez %[tmp0], 1b \n\t" - "dsrl %[ftmp9], %[ftmp6], %[ftmp10] \n\t" + "ssrld %[ftmp9], %[ftmp6], %[ftmp10] \n\t" "paddw %[ftmp9], %[ftmp9], %[ftmp6] \n\t" "swc1 %[ftmp9], 0x00(%[sse]) \n\t" @@ -856,7 +856,7 @@ static inline uint32_t vpx_variance4x(const uint8_t *src_ptr, int src_stride, "paddw %[ftmp3], %[ftmp3], %[ftmp4] \n\t" "psubw %[ftmp3], %[ftmp3], %[ftmp5] \n\t" "psubw %[ftmp3], %[ftmp3], %[ftmp6] \n\t" - "dsrl %[ftmp0], %[ftmp3], %[ftmp10] \n\t" + "ssrld %[ftmp0], %[ftmp3], %[ftmp10] \n\t" "paddw %[ftmp0], %[ftmp0], %[ftmp3] \n\t" "swc1 %[ftmp0], 0x00(%[sum]) \n\t" : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), @@ -898,8 +898,8 @@ static inline uint32_t vpx_mse16x(const uint8_t *src_ptr, int src_stride, "li %[tmp0], 0x20 \n\t" "mtc1 %[tmp0], %[ftmp11] \n\t" MMI_L(%[tmp0], %[high], 0x00) - "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" - "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "pxor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" "1: \n\t" VARIANCE_SSE_16 @@ -909,7 +909,7 @@ static inline uint32_t vpx_mse16x(const uint8_t *src_ptr, int src_stride, MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride]) "bnez %[tmp0], 1b \n\t" - "dsrl %[ftmp9], %[ftmp8], %[ftmp11] \n\t" + "ssrld %[ftmp9], %[ftmp8], %[ftmp11] \n\t" "paddw %[ftmp9], %[ftmp9], %[ftmp8] \n\t" "swc1 %[ftmp9], 0x00(%[sse]) \n\t" : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), @@ -951,8 +951,8 @@ static inline uint32_t vpx_mse8x(const uint8_t *src_ptr, int src_stride, "li %[tmp0], 0x20 \n\t" "mtc1 %[tmp0], %[ftmp11] \n\t" MMI_L(%[tmp0], %[high], 0x00) - "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" - "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "pxor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" "1: \n\t" VARIANCE_SSE_8 @@ -962,7 +962,7 @@ static inline uint32_t vpx_mse8x(const uint8_t *src_ptr, int src_stride, MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride]) "bnez %[tmp0], 1b \n\t" - "dsrl %[ftmp9], %[ftmp8], %[ftmp11] \n\t" + "ssrld %[ftmp9], %[ftmp8], %[ftmp11] \n\t" "paddw %[ftmp9], %[ftmp9], %[ftmp8] \n\t" "swc1 %[ftmp9], 0x00(%[sse]) \n\t" : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), @@ -1029,7 +1029,7 @@ static inline void var_filter_block2d_bil_16x(const uint8_t *src_ptr, const uint8_t *filter_y = bilinear_filters[y_offset]; __asm__ volatile ( - "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" MMI_LI(%[tmp0], 0x07) MMI_MTC1(%[tmp0], %[ftmp14]) "pshufh %[filter_x0], %[filter_x0], %[ftmp0] \n\t" @@ -1111,7 +1111,7 @@ static inline void var_filter_block2d_bil_8x(const uint8_t *src_ptr, const uint8_t *filter_y = bilinear_filters[y_offset]; __asm__ volatile ( - "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" MMI_LI(%[tmp0], 0x07) MMI_MTC1(%[tmp0], %[ftmp14]) "pshufh %[filter_x0], %[filter_x0], %[ftmp0] \n\t" @@ -1194,7 +1194,7 @@ static inline void var_filter_block2d_bil_4x(const uint8_t *src_ptr, const uint8_t *filter_y = bilinear_filters[y_offset]; __asm__ volatile ( - "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" MMI_LI(%[tmp0], 0x07) MMI_MTC1(%[tmp0], %[ftmp6]) "pshufh %[filter_x0], %[filter_x0], %[ftmp0] \n\t" diff --git a/vpx_dsp/mips/vpx_convolve8_mmi.c b/vpx_dsp/mips/vpx_convolve8_mmi.c index ba9ceb8665..cb7bca5589 100644 --- a/vpx_dsp/mips/vpx_convolve8_mmi.c +++ b/vpx_dsp/mips/vpx_convolve8_mmi.c @@ -105,7 +105,7 @@ static void convolve_horiz_mmi(const uint8_t *src, ptrdiff_t src_stride, /* clang-format off */ __asm__ volatile( "move %[tmp1], %[width] \n\t" - "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" "gsldlc1 %[filter1], 0x03(%[filter]) \n\t" "gsldrc1 %[filter1], 0x00(%[filter]) \n\t" "gsldlc1 %[filter2], 0x0b(%[filter]) \n\t" @@ -178,7 +178,7 @@ static void convolve_vert_mmi(const uint8_t *src, ptrdiff_t src_stride, (void)y_step_q4; __asm__ volatile( - "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" "gsldlc1 %[ftmp4], 0x03(%[filter]) \n\t" "gsldrc1 %[ftmp4], 0x00(%[filter]) \n\t" "gsldlc1 %[ftmp5], 0x0b(%[filter]) \n\t" @@ -271,7 +271,7 @@ static void convolve_avg_horiz_mmi(const uint8_t *src, ptrdiff_t src_stride, __asm__ volatile( "move %[tmp1], %[width] \n\t" - "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" "gsldlc1 %[filter1], 0x03(%[filter]) \n\t" "gsldrc1 %[filter1], 0x00(%[filter]) \n\t" "gsldlc1 %[filter2], 0x0b(%[filter]) \n\t" @@ -354,7 +354,7 @@ static void convolve_avg_vert_mmi(const uint8_t *src, ptrdiff_t src_stride, (void)y_step_q4; __asm__ volatile( - "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" "gsldlc1 %[ftmp4], 0x03(%[filter]) \n\t" "gsldrc1 %[ftmp4], 0x00(%[filter]) \n\t" "gsldlc1 %[ftmp5], 0x0b(%[filter]) \n\t" @@ -467,7 +467,7 @@ void vpx_convolve_avg_mmi(const uint8_t *src, ptrdiff_t src_stride, __asm__ volatile( "move %[tmp1], %[width] \n\t" - "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" "li %[tmp0], 0x10001 \n\t" MMI_MTC1(%[tmp0], %[ftmp3]) "punpcklhw %[ftmp3], %[ftmp3], %[ftmp3] \n\t" diff --git a/vpx_ports/asmdefs_mmi.h b/vpx_ports/asmdefs_mmi.h index 28355bf9fb..400a51cc32 100644 --- a/vpx_ports/asmdefs_mmi.h +++ b/vpx_ports/asmdefs_mmi.h @@ -34,7 +34,7 @@ "ld " #reg ", " #bias "(" #addr ") \n\t" #define MMI_SRL(reg1, reg2, shift) \ - "dsrl " #reg1 ", " #reg2 ", " #shift " \n\t" + "ssrld " #reg1 ", " #reg2 ", " #shift " \n\t" #define MMI_SLL(reg1, reg2, shift) \ "dsll " #reg1 ", " #reg2 ", " #shift " \n\t" @@ -63,7 +63,7 @@ "lw " #reg ", " #bias "(" #addr ") \n\t" #define MMI_SRL(reg1, reg2, shift) \ - "srl " #reg1 ", " #reg2 ", " #shift " \n\t" + "ssrlw " #reg1 ", " #reg2 ", " #shift " \n\t" #define MMI_SLL(reg1, reg2, shift) \ "sll " #reg1 ", " #reg2 ", " #shift " \n\t" From 220b00dd0d624a24a081fa60e35b024357c506ad Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 29 Jun 2020 19:43:18 -0700 Subject: [PATCH 0015/1000] add CONTRIBUTING.md serves as a brief introduction and adds a link to the gerrit instructions on webmproject.org. Bug: webm:1669 Change-Id: If1d483eb48e2edcda8c51e66bdd1a86b7c35b986 --- CONTRIBUTING.md | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 CONTRIBUTING.md diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000000..577c96a6b5 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,30 @@ +# How to Contribute + +We'd love to accept your patches and contributions to this project. There are +just a few small guidelines you need to follow. + +## Contributor License Agreement + +Contributions to this project must be accompanied by a Contributor License +Agreement. You (or your employer) retain the copyright to your contribution; +this simply gives us permission to use and redistribute your contributions as +part of the project. Head over to to see +your current agreements on file or to sign a new one. + +You generally only need to submit a CLA once, so if you've already submitted one +(even if it was for a different project), you probably don't need to do it +again. + +## Code reviews + +All submissions, including submissions by project members, require review. We +use a [Gerrit](https://www.gerritcodereview.com) instance hosted at +https://chromium-review.googlesource.com for this purpose. + +See https://www.webmproject.org/code/contribute/submitting-patches for an +example of a typical gerrit workflow. + +## Community Guidelines + +This project follows +[Google's Open Source Community Guidelines](https://opensource.google.com/conduct/). From 1e9929390c8c18ffda02e0073481625e5afb2529 Mon Sep 17 00:00:00 2001 From: Sreerenj Balachandran Date: Tue, 30 Jun 2020 19:46:17 -0700 Subject: [PATCH 0016/1000] vp9-svc: Fix the bitrate control for spatial svc Make sure to initialize the layer context for spatial-svc which has a single temporal layer. Change-Id: I026ecec483555658e09d6d8893e56ab62ee6914b --- AUTHORS | 1 + vp9/ratectrl_rtc.cc | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/AUTHORS b/AUTHORS index 3eb03e9233..ab11bfdd7d 100644 --- a/AUTHORS +++ b/AUTHORS @@ -165,6 +165,7 @@ Shimon Doodkin Shiyou Yin Shubham Tandle Shunyao Li +Sreerenj Balachandran Stefan Holmer Suman Sunkara Supradeep T R diff --git a/vp9/ratectrl_rtc.cc b/vp9/ratectrl_rtc.cc index 3d6afc5d03..6238b3a955 100644 --- a/vp9/ratectrl_rtc.cc +++ b/vp9/ratectrl_rtc.cc @@ -107,7 +107,8 @@ void VP9RateControlRTC::UpdateRateControl( } vp9_set_rc_buffer_sizes(cpi_); vp9_new_framerate(cpi_, cpi_->framerate); - if (cpi_->svc.number_temporal_layers > 1) { + if (cpi_->svc.number_temporal_layers > 1 || + cpi_->svc.number_spatial_layers > 1) { if (cm->current_video_frame == 0) vp9_init_layer_context(cpi_); vp9_update_layer_context_change_config(cpi_, (int)cpi_->oxcf.target_bandwidth); From 9e15c3058543d71867af9e02e314a7303f6049b3 Mon Sep 17 00:00:00 2001 From: Marco Paniconi Date: Mon, 6 Jul 2020 11:28:56 -0700 Subject: [PATCH 0017/1000] vp9: Fix to use last_q for resize check For temporal layers resize is only checked on the base/TL0 frames. So rc->last_q should be used, which because rc is in the layer context, rc->last_q will correspond to the qindex on last TL0 frame. In the previous code cm->base_qindex was used, which would correspond to qindex on last encoded frame, which is not TL0 when temporal_layers > 1. Change-Id: Iaf86f7156d2d48ae99a1b34ad576d453d490e746 --- vp9/encoder/vp9_ratectrl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index 3beb9f41f9..7301e80b84 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -2716,7 +2716,7 @@ int vp9_resize_one_pass_cbr(VP9_COMP *cpi) { // Ignore samples close to key frame, since QP is usually high after key. if (!force_downsize_rate && cpi->rc.frames_since_key > cpi->framerate) { const int window = VPXMIN(30, (int)(2 * cpi->framerate)); - cpi->resize_avg_qp += cm->base_qindex; + cpi->resize_avg_qp += rc->last_q[INTER_FRAME]; if (cpi->rc.buffer_level < (int)(30 * rc->optimal_buffer_level / 100)) ++cpi->resize_buffer_underflow; ++cpi->resize_count; From a1cee8dc919df1980d802e1a9bce1259ec34cba8 Mon Sep 17 00:00:00 2001 From: Marco Paniconi Date: Mon, 6 Jul 2020 14:04:57 -0700 Subject: [PATCH 0018/1000] vp9: Update last_q for dropped frames last_q is used in resize logic, should always be last Q selected for previous frame, encoded or dropped. Change-Id: Ie9019ccf5a9e3acc8456a2e70cc2aa8d1c90236e --- vp9/encoder/vp9_ratectrl.c | 1 + 1 file changed, 1 insertion(+) diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index 7301e80b84..c8b3e0b29a 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -1986,6 +1986,7 @@ void vp9_rc_postencode_update_drop_frame(VP9_COMP *cpi) { cpi->rc.rc_2_frame = 0; cpi->rc.rc_1_frame = 0; cpi->rc.last_avg_frame_bandwidth = cpi->rc.avg_frame_bandwidth; + cpi->rc.last_q[INTER_FRAME] = cpi->common.base_qindex; // For SVC on dropped frame when framedrop_mode != LAYER_DROP: // in this mode the whole superframe may be dropped if only a single layer // has buffer underflow (below threshold). Since this can then lead to From 5b7882139c2f64d4ff830e1665e04a4d72f9e484 Mon Sep 17 00:00:00 2001 From: jinbo Date: Wed, 1 Jul 2020 08:56:25 +0800 Subject: [PATCH 0019/1000] vp8,vpx_dsp:[loongson] fix bugs reported by clang 1. Adjust variable type to match clang compiler. Clang is more strict on the type of asm operands, float or double type variable should use constraint 'f', integer variable should use constraint 'r'. 2. Fix prob of using r-value in output operands. clang report error: 'invalid use of a cast in a inline asm context requiring an l-value: remove the cast or build with -fheinous-gnu-extensions'. Change-Id: Iae9e08f55f249059066c391534013e320812463e --- vp8/common/mips/mmi/idctllm_mmi.c | 43 +-- vp8/common/mips/mmi/loopfilter_filters_mmi.c | 282 ++++++++++++------- vp8/common/mips/mmi/sixtap_filter_mmi.c | 45 +-- vp8/encoder/mips/mmi/dct_mmi.c | 53 ++-- vp8/encoder/mips/mmi/vp8_quantize_mmi.c | 27 +- vpx_dsp/mips/sad_mmi.c | 35 ++- vpx_dsp/mips/variance_mmi.c | 146 +++++++--- 7 files changed, 414 insertions(+), 217 deletions(-) diff --git a/vp8/common/mips/mmi/idctllm_mmi.c b/vp8/common/mips/mmi/idctllm_mmi.c index 4fad1d347f..a35689dd30 100644 --- a/vp8/common/mips/mmi/idctllm_mmi.c +++ b/vp8/common/mips/mmi/idctllm_mmi.c @@ -41,14 +41,18 @@ void vp8_short_idct4x4llm_mmi(int16_t *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride) { double ftmp[12]; - uint32_t tmp[0]; - DECLARE_ALIGNED(8, const uint64_t, ff_ph_04) = { 0x0004000400040004ULL }; - DECLARE_ALIGNED(8, const uint64_t, ff_ph_4e7b) = { 0x4e7b4e7b4e7b4e7bULL }; - DECLARE_ALIGNED(8, const uint64_t, ff_ph_22a3) = { 0x22a322a322a322a3ULL }; + uint64_t tmp[1]; + double ff_ph_04, ff_ph_4e7b, ff_ph_22a3; __asm__ volatile ( + "dli %[tmp0], 0x0004000400040004 \n\t" + "dmtc1 %[tmp0], %[ff_ph_04] \n\t" + "dli %[tmp0], 0x4e7b4e7b4e7b4e7b \n\t" + "dmtc1 %[tmp0], %[ff_ph_4e7b] \n\t" + "dli %[tmp0], 0x22a322a322a322a3 \n\t" + "dmtc1 %[tmp0], %[ff_ph_22a3] \n\t" MMI_LI(%[tmp0], 0x02) - "mtc1 %[tmp0], %[ftmp11] \n\t" + "dmtc1 %[tmp0], %[ftmp11] \n\t" "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" "gsldlc1 %[ftmp1], 0x07(%[ip]) \n\t" @@ -186,9 +190,10 @@ void vp8_short_idct4x4llm_mmi(int16_t *input, unsigned char *pred_ptr, [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), [tmp0]"=&r"(tmp[0]), - [pred_ptr]"+&r"(pred_ptr), [dst_ptr]"+&r"(dst_ptr) - : [ip]"r"(input), [ff_ph_22a3]"f"(ff_ph_22a3), - [ff_ph_4e7b]"f"(ff_ph_4e7b), [ff_ph_04]"f"(ff_ph_04), + [pred_ptr]"+&r"(pred_ptr), [dst_ptr]"+&r"(dst_ptr), + [ff_ph_4e7b]"=&f"(ff_ph_4e7b), [ff_ph_04]"=&f"(ff_ph_04), + [ff_ph_22a3]"=&f"(ff_ph_22a3) + : [ip]"r"(input), [pred_stride]"r"((mips_reg)pred_stride), [dst_stride]"r"((mips_reg)dst_stride) : "memory" @@ -198,12 +203,13 @@ void vp8_short_idct4x4llm_mmi(int16_t *input, unsigned char *pred_ptr, void vp8_dc_only_idct_add_mmi(int16_t input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride) { - int a1 = ((input_dc + 4) >> 3); - double ftmp[5]; + int a0 = ((input_dc + 4) >> 3); + double a1, ftmp[5]; int low32; __asm__ volatile ( "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "dmtc1 %[a0], %[a1] \n\t" "pshufh %[a1], %[a1], %[ftmp0] \n\t" "ulw %[low32], 0x00(%[pred_ptr]) \n\t" "mtc1 %[low32], %[ftmp1] \n\t" @@ -244,9 +250,9 @@ void vp8_dc_only_idct_add_mmi(int16_t input_dc, unsigned char *pred_ptr, "gsswrc1 %[ftmp1], 0x00(%[dst_ptr]) \n\t" : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), [ftmp4]"=&f"(ftmp[4]), [low32]"=&r"(low32), - [dst_ptr]"+&r"(dst_ptr), [pred_ptr]"+&r"(pred_ptr) + [dst_ptr]"+&r"(dst_ptr), [pred_ptr]"+&r"(pred_ptr), [a1]"=&f"(a1) : [dst_stride]"r"((mips_reg)dst_stride), - [pred_stride]"r"((mips_reg)pred_stride), [a1]"f"(a1) + [pred_stride]"r"((mips_reg)pred_stride), [a0]"r"(a0) : "memory" ); } @@ -254,14 +260,15 @@ void vp8_dc_only_idct_add_mmi(int16_t input_dc, unsigned char *pred_ptr, void vp8_short_inv_walsh4x4_mmi(int16_t *input, int16_t *mb_dqcoeff) { int i; int16_t output[16]; - double ftmp[12]; - uint32_t tmp[1]; - DECLARE_ALIGNED(8, const uint64_t, ff_ph_03) = { 0x0003000300030003ULL }; + double ff_ph_03, ftmp[12]; + uint64_t tmp[1]; __asm__ volatile ( + "dli %[tmp0], 0x0003000300030003 \n\t" + "dmtc1 %[tmp0], %[ff_ph_03] \n\t" MMI_LI(%[tmp0], 0x03) "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" - "mtc1 %[tmp0], %[ftmp11] \n\t" + "dmtc1 %[tmp0], %[ftmp11] \n\t" "gsldlc1 %[ftmp1], 0x07(%[ip]) \n\t" "gsldrc1 %[ftmp1], 0x00(%[ip]) \n\t" "gsldlc1 %[ftmp2], 0x0f(%[ip]) \n\t" @@ -317,8 +324,8 @@ void vp8_short_inv_walsh4x4_mmi(int16_t *input, int16_t *mb_dqcoeff) { [ftmp3]"=&f"(ftmp[3]), [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), [ftmp10]"=&f"(ftmp[10]), - [ftmp11]"=&f"(ftmp[11]), [tmp0]"=&r"(tmp[0]) - : [ip]"r"(input), [op]"r"(output), [ff_ph_03]"f"(ff_ph_03) + [ftmp11]"=&f"(ftmp[11]), [tmp0]"=&r"(tmp[0]), [ff_ph_03]"=&f"(ff_ph_03) + : [ip]"r"(input), [op]"r"(output) : "memory" ); diff --git a/vp8/common/mips/mmi/loopfilter_filters_mmi.c b/vp8/common/mips/mmi/loopfilter_filters_mmi.c index fc1240cc27..a07a7e3b41 100644 --- a/vp8/common/mips/mmi/loopfilter_filters_mmi.c +++ b/vp8/common/mips/mmi/loopfilter_filters_mmi.c @@ -13,28 +13,25 @@ #include "vp8/common/onyxc_int.h" #include "vpx_ports/asmdefs_mmi.h" -DECLARE_ALIGNED(8, static const uint64_t, ff_ph_01) = { 0x0001000100010001ULL }; -DECLARE_ALIGNED(8, static const uint64_t, - ff_ph_003f) = { 0x003f003f003f003fULL }; -DECLARE_ALIGNED(8, static const uint64_t, - ff_ph_0900) = { 0x0900090009000900ULL }; -DECLARE_ALIGNED(8, static const uint64_t, - ff_ph_1200) = { 0x1200120012001200ULL }; -DECLARE_ALIGNED(8, static const uint64_t, - ff_ph_1b00) = { 0x1b001b001b001b00ULL }; -DECLARE_ALIGNED(8, static const uint64_t, ff_pb_fe) = { 0xfefefefefefefefeULL }; -DECLARE_ALIGNED(8, static const uint64_t, ff_pb_80) = { 0x8080808080808080ULL }; -DECLARE_ALIGNED(8, static const uint64_t, ff_pb_04) = { 0x0404040404040404ULL }; -DECLARE_ALIGNED(8, static const uint64_t, ff_pb_03) = { 0x0303030303030303ULL }; -DECLARE_ALIGNED(8, static const uint64_t, ff_pb_01) = { 0x0101010101010101ULL }; - void vp8_loop_filter_horizontal_edge_mmi( unsigned char *src_ptr, int src_pixel_step, const unsigned char *blimit, const unsigned char *limit, const unsigned char *thresh, int count) { - uint32_t tmp[1]; + uint64_t tmp[1]; mips_reg addr[2]; double ftmp[12]; + double ff_ph_01, ff_pb_fe, ff_pb_80, ff_pb_04, ff_pb_03; + /* clang-format off */ __asm__ volatile ( + "dli %[tmp0], 0x0001000100010001 \n\t" + "dmtc1 %[tmp0], %[ff_ph_01] \n\t" + "dli %[tmp0], 0xfefefefefefefefe \n\t" + "dmtc1 %[tmp0], %[ff_pb_fe] \n\t" + "dli %[tmp0], 0x8080808080808080 \n\t" + "dmtc1 %[tmp0], %[ff_pb_80] \n\t" + "dli %[tmp0], 0x0404040404040404 \n\t" + "dmtc1 %[tmp0], %[ff_pb_04] \n\t" + "dli %[tmp0], 0x0303030303030303 \n\t" + "dmtc1 %[tmp0], %[ff_pb_03] \n\t" "1: \n\t" "gsldlc1 %[ftmp10], 0x07(%[limit]) \n\t" "gsldrc1 %[ftmp10], 0x00(%[limit]) \n\t" @@ -91,9 +88,9 @@ void vp8_loop_filter_horizontal_edge_mmi( "pasubub %[ftmp1], %[ftmp5], %[ftmp6] \n\t" "paddusb %[ftmp1], %[ftmp1], %[ftmp1] \n\t" "pasubub %[ftmp2], %[ftmp4], %[ftmp7] \n\t" - "and %[ftmp2], %[ftmp2], %[ff_pb_fe] \n\t" - "li %[tmp0], 0x01 \n\t" - "mtc1 %[tmp0], %[ftmp10] \n\t" + "pand %[ftmp2], %[ftmp2], %[ff_pb_fe] \n\t" + "dli %[tmp0], 0x01 \n\t" + "dmtc1 %[tmp0], %[ftmp10] \n\t" "psrlh %[ftmp2], %[ftmp2], %[ftmp10] \n\t" "paddusb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" "gsldlc1 %[ftmp10], 0x07(%[blimit]) \n\t" @@ -134,8 +131,8 @@ void vp8_loop_filter_horizontal_edge_mmi( "punpcklbh %[ftmp0], %[ftmp0], %[ftmp8] \n\t" "punpckhbh %[ftmp11], %[ftmp11], %[ftmp8] \n\t" - "li %[tmp0], 0x0b \n\t" - "mtc1 %[tmp0], %[ftmp10] \n\t" + "dli %[tmp0], 0x0b \n\t" + "dmtc1 %[tmp0], %[ftmp10] \n\t" "psrah %[ftmp0], %[ftmp0], %[ftmp10] \n\t" "psrah %[ftmp11], %[ftmp11], %[ftmp10] \n\t" "packsshb %[ftmp8], %[ftmp0], %[ftmp11] \n\t" @@ -149,8 +146,8 @@ void vp8_loop_filter_horizontal_edge_mmi( "packsshb %[ftmp0], %[ftmp0], %[ftmp9] \n\t" "paddsh %[ftmp9], %[ftmp9], %[ff_ph_01] \n\t" - "li %[tmp0], 0x01 \n\t" - "mtc1 %[tmp0], %[ftmp10] \n\t" + "dli %[tmp0], 0x01 \n\t" + "dmtc1 %[tmp0], %[ftmp10] \n\t" "psrah %[ftmp11], %[ftmp11], %[ftmp10] \n\t" "psrah %[ftmp9], %[ftmp9], %[ftmp10] \n\t" "packsshb %[ftmp11], %[ftmp11], %[ftmp9] \n\t" @@ -188,17 +185,18 @@ void vp8_loop_filter_horizontal_edge_mmi( [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), [tmp0]"=&r"(tmp[0]), [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]), - [src_ptr]"+&r"(src_ptr), [count]"+&r"(count) + [src_ptr]"+&r"(src_ptr), [count]"+&r"(count), + [ff_ph_01]"=&f"(ff_ph_01), [ff_pb_fe]"=&f"(ff_pb_fe), + [ff_pb_80]"=&f"(ff_pb_80), [ff_pb_04]"=&f"(ff_pb_04), + [ff_pb_03]"=&f"(ff_pb_03) : [limit]"r"(limit), [blimit]"r"(blimit), [thresh]"r"(thresh), [src_pixel_step]"r"((mips_reg)src_pixel_step), [src_pixel_step_x2]"r"((mips_reg)(src_pixel_step<<1)), - [src_pixel_step_x4]"r"((mips_reg)(src_pixel_step<<2)), - [ff_ph_01]"f"(ff_ph_01), [ff_pb_fe]"f"(ff_pb_fe), - [ff_pb_80]"f"(ff_pb_80), [ff_pb_04]"f"(ff_pb_04), - [ff_pb_03]"f"(ff_pb_03) + [src_pixel_step_x4]"r"((mips_reg)(src_pixel_step<<2)) : "memory" ); + /* clang-format on */ } void vp8_loop_filter_vertical_edge_mmi(unsigned char *src_ptr, @@ -206,11 +204,23 @@ void vp8_loop_filter_vertical_edge_mmi(unsigned char *src_ptr, const unsigned char *blimit, const unsigned char *limit, const unsigned char *thresh, int count) { - uint32_t tmp[1]; + uint64_t tmp[1]; mips_reg addr[2]; double ftmp[13]; + double ff_pb_fe, ff_ph_01, ff_pb_03, ff_pb_04, ff_pb_80; + /* clang-format off */ __asm__ volatile ( + "dli %[tmp0], 0xfefefefefefefefe \n\t" + "dmtc1 %[tmp0], %[ff_pb_fe] \n\t" + "dli %[tmp0], 0x0001000100010001 \n\t" + "dmtc1 %[tmp0], %[ff_ph_01] \n\t" + "dli %[tmp0], 0x0303030303030303 \n\t" + "dmtc1 %[tmp0], %[ff_pb_03] \n\t" + "dli %[tmp0], 0x0404040404040404 \n\t" + "dmtc1 %[tmp0], %[ff_pb_04] \n\t" + "dli %[tmp0], 0x8080808080808080 \n\t" + "dmtc1 %[tmp0], %[ff_pb_80] \n\t" MMI_SLL(%[tmp0], %[src_pixel_step], 0x02) MMI_ADDU(%[src_ptr], %[src_ptr], %[tmp0]) MMI_SUBU(%[src_ptr], %[src_ptr], 0x04) @@ -315,8 +325,8 @@ void vp8_loop_filter_vertical_edge_mmi(unsigned char *src_ptr, /* abs (p1-q1) */ "pasubub %[ftmp12], %[ftmp10], %[ftmp5] \n\t" "pand %[ftmp12], %[ftmp12], %[ff_pb_fe] \n\t" - "li %[tmp0], 0x01 \n\t" - "mtc1 %[tmp0], %[ftmp1] \n\t" + "dli %[tmp0], 0x01 \n\t" + "dmtc1 %[tmp0], %[ftmp1] \n\t" "psrlh %[ftmp12], %[ftmp12], %[ftmp1] \n\t" "paddusb %[ftmp1], %[ftmp11], %[ftmp12] \n\t" "psubusb %[ftmp1], %[ftmp1], %[ftmp8] \n\t" @@ -354,8 +364,8 @@ void vp8_loop_filter_vertical_edge_mmi(unsigned char *src_ptr, "paddsb %[ftmp11], %[ftmp2], %[ff_pb_04] \n\t" "paddsb %[ftmp12], %[ftmp2], %[ff_pb_03] \n\t" - "li %[tmp0], 0x0b \n\t" - "mtc1 %[tmp0], %[ftmp7] \n\t" + "dli %[tmp0], 0x0b \n\t" + "dmtc1 %[tmp0], %[ftmp7] \n\t" "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" "pxor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" "punpcklbh %[ftmp0], %[ftmp0], %[ftmp12] \n\t" @@ -379,8 +389,8 @@ void vp8_loop_filter_vertical_edge_mmi(unsigned char *src_ptr, "paddsh %[ftmp0], %[ftmp0], %[ff_ph_01] \n\t" "paddsh %[ftmp8], %[ftmp8], %[ff_ph_01] \n\t" - "li %[tmp0], 0x01 \n\t" - "mtc1 %[tmp0], %[ftmp7] \n\t" + "dli %[tmp0], 0x01 \n\t" + "dmtc1 %[tmp0], %[ftmp7] \n\t" "psrah %[ftmp0], %[ftmp0], %[ftmp7] \n\t" "psrah %[ftmp8], %[ftmp8], %[ftmp7] \n\t" "packsshb %[ftmp2], %[ftmp0], %[ftmp8] \n\t" @@ -450,15 +460,16 @@ void vp8_loop_filter_vertical_edge_mmi(unsigned char *src_ptr, [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), [ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]), [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]), - [src_ptr]"+&r"(src_ptr), [count]"+&r"(count) + [src_ptr]"+&r"(src_ptr), [count]"+&r"(count), + [ff_ph_01]"=&f"(ff_ph_01), [ff_pb_03]"=&f"(ff_pb_03), + [ff_pb_04]"=&f"(ff_pb_04), [ff_pb_80]"=&f"(ff_pb_80), + [ff_pb_fe]"=&f"(ff_pb_fe) : [limit]"r"(limit), [blimit]"r"(blimit), [thresh]"r"(thresh), - [src_pixel_step]"r"((mips_reg)src_pixel_step), - [ff_ph_01]"f"(ff_ph_01), [ff_pb_03]"f"(ff_pb_03), - [ff_pb_04]"f"(ff_pb_04), [ff_pb_80]"f"(ff_pb_80), - [ff_pb_fe]"f"(ff_pb_fe) + [src_pixel_step]"r"((mips_reg)src_pixel_step) : "memory" ); + /* clang-format on */ } /* clang-format off */ @@ -484,10 +495,29 @@ void vp8_loop_filter_vertical_edge_mmi(unsigned char *src_ptr, void vp8_mbloop_filter_horizontal_edge_mmi( unsigned char *src_ptr, int src_pixel_step, const unsigned char *blimit, const unsigned char *limit, const unsigned char *thresh, int count) { - uint32_t tmp[1]; + uint64_t tmp[1]; double ftmp[13]; + double ff_pb_fe, ff_pb_80, ff_pb_04, ff_pb_03, ff_ph_003f, ff_ph_0900, + ff_ph_1200, ff_ph_1b00; + /* clang-format off */ __asm__ volatile ( + "dli %[tmp0], 0xfefefefefefefefe \n\t" + "dmtc1 %[tmp0], %[ff_pb_fe] \n\t" + "dli %[tmp0], 0x8080808080808080 \n\t" + "dmtc1 %[tmp0], %[ff_pb_80] \n\t" + "dli %[tmp0], 0x0404040404040404 \n\t" + "dmtc1 %[tmp0], %[ff_pb_04] \n\t" + "dli %[tmp0], 0x0303030303030303 \n\t" + "dmtc1 %[tmp0], %[ff_pb_03] \n\t" + "dli %[tmp0], 0x003f003f003f003f \n\t" + "dmtc1 %[tmp0], %[ff_ph_003f] \n\t" + "dli %[tmp0], 0x0900090009000900 \n\t" + "dmtc1 %[tmp0], %[ff_ph_0900] \n\t" + "dli %[tmp0], 0x1200120012001200 \n\t" + "dmtc1 %[tmp0], %[ff_ph_1200] \n\t" + "dli %[tmp0], 0x1b001b001b001b00 \n\t" + "dmtc1 %[tmp0], %[ff_ph_1b00] \n\t" MMI_SLL(%[tmp0], %[src_pixel_step], 0x02) MMI_SUBU(%[src_ptr], %[src_ptr], %[tmp0]) "1: \n\t" @@ -550,8 +580,8 @@ void vp8_mbloop_filter_horizontal_edge_mmi( "paddusb %[ftmp1], %[ftmp1], %[ftmp1] \n\t" "pasubub %[ftmp2], %[ftmp4], %[ftmp7] \n\t" "pand %[ftmp2], %[ftmp2], %[ff_pb_fe] \n\t" - "li %[tmp0], 0x01 \n\t" - "mtc1 %[tmp0], %[ftmp9] \n\t" + "dli %[tmp0], 0x01 \n\t" + "dmtc1 %[tmp0], %[ftmp9] \n\t" "psrlh %[ftmp2], %[ftmp2], %[ftmp9] \n\t" "paddusb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" "psubusb %[ftmp1], %[ftmp1], %[ftmp12] \n\t" @@ -584,8 +614,8 @@ void vp8_mbloop_filter_horizontal_edge_mmi( "pandn %[ftmp12], %[ftmp1], %[ftmp2] \n\t" "pand %[ftmp2], %[ftmp2], %[ftmp1] \n\t" - "li %[tmp0], 0x0b \n\t" - "mtc1 %[tmp0], %[ftmp9] \n\t" + "dli %[tmp0], 0x0b \n\t" + "dmtc1 %[tmp0], %[ftmp9] \n\t" "paddsb %[ftmp0], %[ftmp2], %[ff_pb_03] \n\t" VP8_MBLOOP_HPSRAB "paddsb %[ftmp5], %[ftmp5], %[ftmp0] \n\t" @@ -593,8 +623,8 @@ void vp8_mbloop_filter_horizontal_edge_mmi( VP8_MBLOOP_HPSRAB "psubsb %[ftmp6], %[ftmp6], %[ftmp0] \n\t" - "li %[tmp0], 0x07 \n\t" - "mtc1 %[tmp0], %[ftmp9] \n\t" + "dli %[tmp0], 0x07 \n\t" + "dmtc1 %[tmp0], %[ftmp9] \n\t" "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" VP8_MBLOOP_HPSRAB_ADD(%[ff_ph_1b00]) @@ -649,18 +679,20 @@ void vp8_mbloop_filter_horizontal_edge_mmi( [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), [ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]), - [src_ptr]"+&r"(src_ptr), [count]"+&r"(count) + [src_ptr]"+&r"(src_ptr), [count]"+&r"(count), + [ff_pb_fe]"=&f"(ff_pb_fe), [ff_pb_80]"=&f"(ff_pb_80), + [ff_pb_04]"=&f"(ff_pb_04), [ff_pb_03]"=&f"(ff_pb_03), + [ff_ph_0900]"=&f"(ff_ph_0900), [ff_ph_1b00]"=&f"(ff_ph_1b00), + [ff_ph_1200]"=&f"(ff_ph_1200), [ff_ph_003f]"=&f"(ff_ph_003f) : [limit]"r"(limit), [blimit]"r"(blimit), [thresh]"r"(thresh), - [src_pixel_step]"r"((mips_reg)src_pixel_step), - [ff_pb_fe]"f"(ff_pb_fe), [ff_pb_80]"f"(ff_pb_80), - [ff_pb_04]"f"(ff_pb_04), [ff_pb_03]"f"(ff_pb_03), - [ff_ph_0900]"f"(ff_ph_0900), [ff_ph_1b00]"f"(ff_ph_1b00), - [ff_ph_1200]"f"(ff_ph_1200), [ff_ph_003f]"f"(ff_ph_003f) + [src_pixel_step]"r"((mips_reg)src_pixel_step) : "memory" ); + /* clang-format on */ } +/* clang-format off */ #define VP8_MBLOOP_VPSRAB_ADDH \ "pxor %[ftmp7], %[ftmp7], %[ftmp7] \n\t" \ "pxor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" \ @@ -673,15 +705,30 @@ void vp8_mbloop_filter_horizontal_edge_mmi( "psrah %[ftmp7], %[ftmp7], %[ftmp12] \n\t" \ "psrah %[ftmp8], %[ftmp8], %[ftmp12] \n\t" \ "packsshb %[ftmp3], %[ftmp7], %[ftmp8] \n\t" +/* clang-format on */ void vp8_mbloop_filter_vertical_edge_mmi( unsigned char *src_ptr, int src_pixel_step, const unsigned char *blimit, const unsigned char *limit, const unsigned char *thresh, int count) { mips_reg tmp[1]; - DECLARE_ALIGNED(8, const uint64_t, srct[1]); + DECLARE_ALIGNED(8, const uint64_t, srct[2]); double ftmp[14]; + double ff_ph_003f, ff_ph_0900, ff_pb_fe, ff_pb_80, ff_pb_04, ff_pb_03; + /* clang-format off */ __asm__ volatile ( + "dli %[tmp0], 0x003f003f003f003f \n\t" + "dmtc1 %[tmp0], %[ff_ph_003f] \n\t" + "dli %[tmp0], 0x0900090009000900 \n\t" + "dmtc1 %[tmp0], %[ff_ph_0900] \n\t" + "dli %[tmp0], 0xfefefefefefefefe \n\t" + "dmtc1 %[tmp0], %[ff_pb_fe] \n\t" + "dli %[tmp0], 0x8080808080808080 \n\t" + "dmtc1 %[tmp0], %[ff_pb_80] \n\t" + "dli %[tmp0], 0x0404040404040404 \n\t" + "dmtc1 %[tmp0], %[ff_pb_04] \n\t" + "dli %[tmp0], 0x0303030303030303 \n\t" + "dmtc1 %[tmp0], %[ff_pb_03] \n\t" MMI_SUBU(%[src_ptr], %[src_ptr], 0x04) "1: \n\t" @@ -783,8 +830,8 @@ void vp8_mbloop_filter_vertical_edge_mmi( /* abs (p1-q1) / 2 */ "pasubub %[ftmp12], %[ftmp10], %[ftmp5] \n\t" "pand %[ftmp12], %[ftmp12], %[ff_pb_fe] \n\t" - "li %[tmp0], 0x01 \n\t" - "mtc1 %[tmp0], %[ftmp8] \n\t" + "dli %[tmp0], 0x01 \n\t" + "dmtc1 %[tmp0], %[ftmp8] \n\t" "psrlh %[ftmp12], %[ftmp12], %[ftmp8] \n\t" "paddusb %[ftmp12], %[ftmp1], %[ftmp12] \n\t" "psubusb %[ftmp12], %[ftmp12], %[ftmp13] \n\t" @@ -824,8 +871,8 @@ void vp8_mbloop_filter_vertical_edge_mmi( "pandn %[ftmp0], %[ftmp1], %[ftmp0] \n\t" "paddsb %[ftmp4], %[ftmp3], %[ff_pb_04] \n\t" - "li %[tmp0], 0x0b \n\t" - "mtc1 %[tmp0], %[ftmp12] \n\t" + "dli %[tmp0], 0x0b \n\t" + "dmtc1 %[tmp0], %[ftmp12] \n\t" "punpcklbh %[ftmp7], %[ftmp7], %[ftmp4] \n\t" "punpckhbh %[ftmp8], %[ftmp8], %[ftmp4] \n\t" "psrah %[ftmp7], %[ftmp7], %[ftmp12] \n\t" @@ -842,8 +889,8 @@ void vp8_mbloop_filter_vertical_edge_mmi( /* ftmp6: ps0 */ "paddsb %[ftmp6], %[ftmp6], %[ftmp3] \n\t" - "li %[tmp0], 0x07 \n\t" - "mtc1 %[tmp0], %[ftmp12] \n\t" + "dli %[tmp0], 0x07 \n\t" + "dmtc1 %[tmp0], %[ftmp12] \n\t" VP8_MBLOOP_VPSRAB_ADDH "paddh %[ftmp1], %[ff_ph_0900], %[ff_ph_0900] \n\t" "paddh %[ftmp1], %[ftmp1], %[ff_ph_0900] \n\t" @@ -948,17 +995,19 @@ void vp8_mbloop_filter_vertical_edge_mmi( [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), [ftmp12]"=&f"(ftmp[12]), [ftmp13]"=&f"(ftmp[13]), [tmp0]"=&r"(tmp[0]), [src_ptr]"+&r"(src_ptr), - [count]"+&r"(count) + [count]"+&r"(count), + [ff_ph_003f]"=&f"(ff_ph_003f), [ff_ph_0900]"=&f"(ff_ph_0900), + [ff_pb_03]"=&f"(ff_pb_03), [ff_pb_04]"=&f"(ff_pb_04), + [ff_pb_80]"=&f"(ff_pb_80), [ff_pb_fe]"=&f"(ff_pb_fe) : [limit]"r"(limit), [blimit]"r"(blimit), [srct]"r"(srct), [thresh]"r"(thresh), - [src_pixel_step]"r"((mips_reg)src_pixel_step), - [ff_ph_003f]"f"(ff_ph_003f), [ff_ph_0900]"f"(ff_ph_0900), - [ff_pb_03]"f"(ff_pb_03), [ff_pb_04]"f"(ff_pb_04), - [ff_pb_80]"f"(ff_pb_80), [ff_pb_fe]"f"(ff_pb_fe) + [src_pixel_step]"r"((mips_reg)src_pixel_step) : "memory" ); + /* clang-format on */ } +/* clang-format off */ #define VP8_SIMPLE_HPSRAB \ "psllh %[ftmp0], %[ftmp5], %[ftmp8] \n\t" \ "psrah %[ftmp0], %[ftmp0], %[ftmp9] \n\t" \ @@ -966,23 +1015,38 @@ void vp8_mbloop_filter_vertical_edge_mmi( "psrah %[ftmp1], %[ftmp5], %[ftmp10] \n\t" \ "psllh %[ftmp1], %[ftmp1], %[ftmp8] \n\t" \ "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t" +/* clang-format on */ void vp8_loop_filter_simple_horizontal_edge_mmi(unsigned char *src_ptr, int src_pixel_step, const unsigned char *blimit) { - uint32_t tmp[1], count = 2; + uint64_t tmp[1], count = 2; mips_reg addr[2]; double ftmp[12]; + double ff_pb_fe, ff_pb_80, ff_pb_04, ff_pb_01; + /* clang-format off */ __asm__ volatile ( - "li %[tmp0], 0x08 \n\t" - "mtc1 %[tmp0], %[ftmp8] \n\t" - "li %[tmp0], 0x03 \n\t" - "mtc1 %[tmp0], %[ftmp9] \n\t" - "li %[tmp0], 0x0b \n\t" - "mtc1 %[tmp0], %[ftmp10] \n\t" - "li %[tmp0], 0x01 \n\t" - "mtc1 %[tmp0], %[ftmp11] \n\t" + "dli %[tmp0], 0x0b \n\t" + "dmtc1 %[tmp0], %[ftmp10] \n\t" + "dli %[tmp0], 0x01 \n\t" + "dmtc1 %[tmp0], %[ftmp11] \n\t" + "dli %[tmp0], 0x08 \n\t" + "dmtc1 %[tmp0], %[ftmp8] \n\t" + "dli %[tmp0], 0x03 \n\t" + "dmtc1 %[tmp0], %[ftmp9] \n\t" + "dli %[tmp0], 0x0b \n\t" + "dmtc1 %[tmp0], %[ftmp10] \n\t" + "dli %[tmp0], 0x01 \n\t" + "dmtc1 %[tmp0], %[ftmp11] \n\t" + "dli %[tmp0], 0xfefefefefefefefe \n\t" + "dmtc1 %[tmp0], %[ff_pb_fe] \n\t" + "dli %[tmp0], 0x8080808080808080 \n\t" + "dmtc1 %[tmp0], %[ff_pb_80] \n\t" + "dli %[tmp0], 0x0404040404040404 \n\t" + "dmtc1 %[tmp0], %[ff_pb_04] \n\t" + "dli %[tmp0], 0x0101010101010101 \n\t" + "dmtc1 %[tmp0], %[ff_pb_01] \n\t" "1: \n\t" "gsldlc1 %[ftmp3], 0x07(%[blimit]) \n\t" @@ -996,7 +1060,7 @@ void vp8_loop_filter_simple_horizontal_edge_mmi(unsigned char *src_ptr, "gsldlc1 %[ftmp7], 0x07(%[addr0]) \n\t" "gsldrc1 %[ftmp7], 0x00(%[addr0]) \n\t" "pasubub %[ftmp1], %[ftmp7], %[ftmp2] \n\t" - "and %[ftmp1], %[ftmp1], %[ff_pb_fe] \n\t" + "pand %[ftmp1], %[ftmp1], %[ff_pb_fe] \n\t" "psrlh %[ftmp1], %[ftmp1], %[ftmp11] \n\t" MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step]) @@ -1020,7 +1084,7 @@ void vp8_loop_filter_simple_horizontal_edge_mmi(unsigned char *src_ptr, "paddsb %[ftmp2], %[ftmp2], %[ftmp0] \n\t" "paddsb %[ftmp2], %[ftmp2], %[ftmp0] \n\t" "paddsb %[ftmp2], %[ftmp2], %[ftmp0] \n\t" - "and %[ftmp5], %[ftmp5], %[ftmp2] \n\t" + "pand %[ftmp5], %[ftmp5], %[ftmp2] \n\t" "paddsb %[ftmp5], %[ftmp5], %[ff_pb_04] \n\t" VP8_SIMPLE_HPSRAB @@ -1048,30 +1112,43 @@ void vp8_loop_filter_simple_horizontal_edge_mmi(unsigned char *src_ptr, [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), [tmp0]"=&r"(tmp[0]), [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]), - [src_ptr]"+&r"(src_ptr), [count]"+&r"(count) + [src_ptr]"+&r"(src_ptr), [count]"+&r"(count), + [ff_pb_fe]"=&f"(ff_pb_fe), [ff_pb_80]"=&f"(ff_pb_80), + [ff_pb_04]"=&f"(ff_pb_04), [ff_pb_01]"=&f"(ff_pb_01) : [blimit]"r"(blimit), [src_pixel_step]"r"((mips_reg)src_pixel_step), - [src_pixel_step_x2]"r"((mips_reg)(src_pixel_step<<1)), - [ff_pb_fe]"f"(ff_pb_fe), [ff_pb_80]"f"(ff_pb_80), - [ff_pb_04]"f"(ff_pb_04), [ff_pb_01]"f"(ff_pb_01) + [src_pixel_step_x2]"r"((mips_reg)(src_pixel_step<<1)) : "memory" ); + /* clang-format on */ } void vp8_loop_filter_simple_vertical_edge_mmi(unsigned char *src_ptr, int src_pixel_step, const unsigned char *blimit) { - uint32_t tmp[1], count = 2; + uint64_t tmp[1], count = 2; mips_reg addr[2]; - DECLARE_ALIGNED(8, const uint64_t, srct[1]); - double ftmp[12]; + DECLARE_ALIGNED(8, const uint64_t, srct[2]); + double ftmp[12], ff_pb_fe, ff_pb_80, ff_pb_04, ff_pb_01; + /* clang-format off */ __asm__ volatile ( - "li %[tmp0], 0x08 \n\t" - "mtc1 %[tmp0], %[ftmp8] \n\t" - "li %[tmp0], 0x20 \n\t" - "mtc1 %[tmp0], %[ftmp10] \n\t" - + "dli %[tmp0], 0x08 \n\t" + "dmtc1 %[tmp0], %[ftmp8] \n\t" + "dli %[tmp0], 0x20 \n\t" + "dmtc1 %[tmp0], %[ftmp10] \n\t" + "dli %[tmp0], 0x08 \n\t" + "dmtc1 %[tmp0], %[ftmp8] \n\t" + "dli %[tmp0], 0x20 \n\t" + "dmtc1 %[tmp0], %[ftmp10] \n\t" + "dli %[tmp0], 0xfefefefefefefefe \n\t" + "dmtc1 %[tmp0], %[ff_pb_fe] \n\t" + "dli %[tmp0], 0x8080808080808080 \n\t" + "dmtc1 %[tmp0], %[ff_pb_80] \n\t" + "dli %[tmp0], 0x0404040404040404 \n\t" + "dmtc1 %[tmp0], %[ff_pb_04] \n\t" + "dli %[tmp0], 0x0101010101010101 \n\t" + "dmtc1 %[tmp0], %[ff_pb_01] \n\t" MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step_x4]) MMI_SUBU(%[src_ptr], %[src_ptr], 0x02) @@ -1118,8 +1195,8 @@ void vp8_loop_filter_simple_vertical_edge_mmi(unsigned char *src_ptr, "punpckhwd %[ftmp3], %[ftmp2], %[ftmp5] \n\t" "punpcklwd %[ftmp2], %[ftmp2], %[ftmp5] \n\t" - "li %[tmp0], 0x01 \n\t" - "mtc1 %[tmp0], %[ftmp9] \n\t" + "dli %[tmp0], 0x01 \n\t" + "dmtc1 %[tmp0], %[ftmp9] \n\t" "pasubub %[ftmp6], %[ftmp3], %[ftmp0] \n\t" "pand %[ftmp6], %[ftmp6], %[ff_pb_fe] \n\t" "psrlh %[ftmp6], %[ftmp6], %[ftmp9] \n\t" @@ -1149,14 +1226,14 @@ void vp8_loop_filter_simple_vertical_edge_mmi(unsigned char *src_ptr, "pand %[ftmp5], %[ftmp5], %[ftmp0] \n\t" "paddsb %[ftmp5], %[ftmp5], %[ff_pb_04] \n\t" - "li %[tmp0], 0x03 \n\t" - "mtc1 %[tmp0], %[ftmp9] \n\t" + "dli %[tmp0], 0x03 \n\t" + "dmtc1 %[tmp0], %[ftmp9] \n\t" "psllh %[ftmp0], %[ftmp5], %[ftmp8] \n\t" "psrah %[ftmp0], %[ftmp0], %[ftmp9] \n\t" "psrlh %[ftmp0], %[ftmp0], %[ftmp8] \n\t" - "li %[tmp0], 0x0b \n\t" - "mtc1 %[tmp0], %[ftmp9] \n\t" + "dli %[tmp0], 0x0b \n\t" + "dmtc1 %[tmp0], %[ftmp9] \n\t" "psrah %[ftmp7], %[ftmp5], %[ftmp9] \n\t" "psllh %[ftmp7], %[ftmp7], %[ftmp8] \n\t" "por %[ftmp0], %[ftmp0], %[ftmp7] \n\t" @@ -1164,14 +1241,14 @@ void vp8_loop_filter_simple_vertical_edge_mmi(unsigned char *src_ptr, "pxor %[ftmp3], %[ftmp3], %[ff_pb_80] \n\t" "psubsb %[ftmp5], %[ftmp5], %[ff_pb_01] \n\t" - "li %[tmp0], 0x03 \n\t" - "mtc1 %[tmp0], %[ftmp9] \n\t" + "dli %[tmp0], 0x03 \n\t" + "dmtc1 %[tmp0], %[ftmp9] \n\t" "psllh %[ftmp0], %[ftmp5], %[ftmp8] \n\t" "psrah %[ftmp0], %[ftmp0], %[ftmp9] \n\t" "psrlh %[ftmp0], %[ftmp0], %[ftmp8] \n\t" - "li %[tmp0], 0x0b \n\t" - "mtc1 %[tmp0], %[ftmp9] \n\t" + "dli %[tmp0], 0x0b \n\t" + "dmtc1 %[tmp0], %[ftmp9] \n\t" "psrah %[ftmp5], %[ftmp5], %[ftmp9] \n\t" "psllh %[ftmp5], %[ftmp5], %[ftmp8] \n\t" "por %[ftmp0], %[ftmp0], %[ftmp5] \n\t" @@ -1235,16 +1312,17 @@ void vp8_loop_filter_simple_vertical_edge_mmi(unsigned char *src_ptr, [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), [tmp0]"=&r"(tmp[0]), [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]), - [src_ptr]"+&r"(src_ptr), [count]"+&r"(count) + [src_ptr]"+&r"(src_ptr), [count]"+&r"(count), + [ff_pb_fe]"=&f"(ff_pb_fe), [ff_pb_80]"=&f"(ff_pb_80), + [ff_pb_04]"=&f"(ff_pb_04), [ff_pb_01]"=&f"(ff_pb_01) : [blimit]"r"(blimit), [srct]"r"(srct), [src_pixel_step]"r"((mips_reg)src_pixel_step), [src_pixel_step_x2]"r"((mips_reg)(src_pixel_step<<1)), [src_pixel_step_x4]"r"((mips_reg)(src_pixel_step<<2)), - [src_pixel_step_x8]"r"((mips_reg)(src_pixel_step<<3)), - [ff_pb_fe]"f"(ff_pb_fe), [ff_pb_80]"f"(ff_pb_80), - [ff_pb_04]"f"(ff_pb_04), [ff_pb_01]"f"(ff_pb_01) + [src_pixel_step_x8]"r"((mips_reg)(src_pixel_step<<3)) : "memory" ); + /* clang-format on */ } /* Horizontal MB filtering */ diff --git a/vp8/common/mips/mmi/sixtap_filter_mmi.c b/vp8/common/mips/mmi/sixtap_filter_mmi.c index dbe35d09f3..b85f73fdff 100644 --- a/vp8/common/mips/mmi/sixtap_filter_mmi.c +++ b/vp8/common/mips/mmi/sixtap_filter_mmi.c @@ -70,9 +70,8 @@ static INLINE void vp8_filter_block1d_h6_mmi(unsigned char *src_ptr, unsigned int output_height, unsigned int output_width, const int16_t *vp8_filter) { - uint32_t tmp[1]; - DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL }; - + uint64_t tmp[1]; + double ff_ph_40; #if _MIPS_SIM == _ABIO32 register double fzero asm("$f0"); register double ftmp0 asm("$f2"); @@ -103,7 +102,10 @@ static INLINE void vp8_filter_block1d_h6_mmi(unsigned char *src_ptr, register double ftmp11 asm("$f12"); #endif // _MIPS_SIM == _ABIO32 + /* clang-format off */ __asm__ volatile ( + "dli %[tmp0], 0x0040004000400040 \n\t" + "dmtc1 %[tmp0], %[ff_ph_40] \n\t" "ldc1 %[ftmp0], 0x00(%[vp8_filter]) \n\t" "ldc1 %[ftmp1], 0x10(%[vp8_filter]) \n\t" "ldc1 %[ftmp2], 0x20(%[vp8_filter]) \n\t" @@ -111,10 +113,10 @@ static INLINE void vp8_filter_block1d_h6_mmi(unsigned char *src_ptr, "ldc1 %[ftmp4], 0x40(%[vp8_filter]) \n\t" "ldc1 %[ftmp5], 0x50(%[vp8_filter]) \n\t" "pxor %[fzero], %[fzero], %[fzero] \n\t" - "li %[tmp0], 0x07 \n\t" - "mtc1 %[tmp0], %[ftmp7] \n\t" - "li %[tmp0], 0x08 \n\t" - "mtc1 %[tmp0], %[ftmp11] \n\t" + "dli %[tmp0], 0x07 \n\t" + "dmtc1 %[tmp0], %[ftmp7] \n\t" + "dli %[tmp0], 0x08 \n\t" + "dmtc1 %[tmp0], %[ftmp11] \n\t" "1: \n\t" "gsldlc1 %[ftmp9], 0x05(%[src_ptr]) \n\t" @@ -166,21 +168,22 @@ static INLINE void vp8_filter_block1d_h6_mmi(unsigned char *src_ptr, [ftmp9]"=&f"(ftmp9), [ftmp10]"=&f"(ftmp10), [ftmp11]"=&f"(ftmp11), [tmp0]"=&r"(tmp[0]), [output_ptr]"+&r"(output_ptr), [output_height]"+&r"(output_height), - [src_ptr]"+&r"(src_ptr) + [src_ptr]"+&r"(src_ptr), [ff_ph_40]"=&f"(ff_ph_40) : [src_pixels_per_line]"r"((mips_reg)src_pixels_per_line), - [vp8_filter]"r"(vp8_filter), [output_width]"r"(output_width), - [ff_ph_40]"f"(ff_ph_40) + [vp8_filter]"r"(vp8_filter), [output_width]"r"(output_width) : "memory" ); + /* clang-format on */ } /* Horizontal filter: pixel_step is always W */ static INLINE void vp8_filter_block1dc_v6_mmi( uint16_t *src_ptr, unsigned char *output_ptr, unsigned int output_height, int output_pitch, unsigned int pixels_per_line, const int16_t *vp8_filter) { - DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL }; - uint32_t tmp[1]; + double ff_ph_40; + uint64_t tmp[1]; mips_reg addr[1]; + #if _MIPS_SIM == _ABIO32 register double fzero asm("$f0"); register double ftmp0 asm("$f2"); @@ -215,7 +218,10 @@ static INLINE void vp8_filter_block1dc_v6_mmi( register double ftmp13 asm("$f14"); #endif // _MIPS_SIM == _ABIO32 + /* clang-format off */ __asm__ volatile ( + "dli %[tmp0], 0x0040004000400040 \n\t" + "dmtc1 %[tmp0], %[ff_ph_40] \n\t" "ldc1 %[ftmp0], 0x00(%[vp8_filter]) \n\t" "ldc1 %[ftmp1], 0x10(%[vp8_filter]) \n\t" "ldc1 %[ftmp2], 0x20(%[vp8_filter]) \n\t" @@ -223,8 +229,8 @@ static INLINE void vp8_filter_block1dc_v6_mmi( "ldc1 %[ftmp4], 0x40(%[vp8_filter]) \n\t" "ldc1 %[ftmp5], 0x50(%[vp8_filter]) \n\t" "pxor %[fzero], %[fzero], %[fzero] \n\t" - "li %[tmp0], 0x07 \n\t" - "mtc1 %[tmp0], %[ftmp13] \n\t" + "dli %[tmp0], 0x07 \n\t" + "dmtc1 %[tmp0], %[ftmp13] \n\t" /* In order to make full use of memory load delay slot, * Operation of memory loading and calculating has been rearranged. @@ -285,15 +291,16 @@ static INLINE void vp8_filter_block1dc_v6_mmi( [ftmp11]"=&f"(ftmp11), [ftmp12]"=&f"(ftmp12), [ftmp13]"=&f"(ftmp13), [tmp0]"=&r"(tmp[0]), [addr0]"=&r"(addr[0]), [src_ptr]"+&r"(src_ptr), - [output_ptr]"+&r"(output_ptr), [output_height]"+&r"(output_height) + [output_ptr]"+&r"(output_ptr), [output_height]"+&r"(output_height), + [ff_ph_40]"=&f"(ff_ph_40) : [pixels_per_line]"r"((mips_reg)pixels_per_line), [pixels_per_line_x2]"r"((mips_reg)(pixels_per_line<<1)), [pixels_per_line_x4]"r"((mips_reg)(pixels_per_line<<2)), [vp8_filter]"r"(vp8_filter), - [output_pitch]"r"((mips_reg)output_pitch), - [ff_ph_40]"f"(ff_ph_40) + [output_pitch]"r"((mips_reg)output_pitch) : "memory" ); + /* clang-format on */ } /* When xoffset == 0, vp8_filter= {0,0,128,0,0,0}, @@ -313,6 +320,7 @@ static INLINE void vp8_filter_block1d_h6_filter0_mmi( register double ftmp1 asm("$f2"); #endif // _MIPS_SIM == _ABIO32 + /* clang-format off */ __asm__ volatile ( "pxor %[fzero], %[fzero], %[fzero] \n\t" @@ -335,6 +343,7 @@ static INLINE void vp8_filter_block1d_h6_filter0_mmi( [output_width]"r"(output_width) : "memory" ); + /* clang-format on */ } static INLINE void vp8_filter_block1dc_v6_filter0_mmi( @@ -350,6 +359,7 @@ static INLINE void vp8_filter_block1dc_v6_filter0_mmi( register double ftmp1 asm("$f2"); #endif // _MIPS_SIM == _ABIO32 + /* clang-format on */ __asm__ volatile ( "pxor %[fzero], %[fzero], %[fzero] \n\t" @@ -371,6 +381,7 @@ static INLINE void vp8_filter_block1dc_v6_filter0_mmi( [output_pitch]"r"((mips_reg)output_pitch) : "memory" ); + /* clang-format on */ } #define sixtapNxM(n, m) \ diff --git a/vp8/encoder/mips/mmi/dct_mmi.c b/vp8/encoder/mips/mmi/dct_mmi.c index b5ecf0f1ca..0fd25fcda5 100644 --- a/vp8/encoder/mips/mmi/dct_mmi.c +++ b/vp8/encoder/mips/mmi/dct_mmi.c @@ -46,6 +46,7 @@ void vp8_short_fdct4x4_mmi(int16_t *input, int16_t *output, int pitch) { uint64_t tmp[1]; int16_t *ip = input; + double ff_ph_op1, ff_ph_op3; #if _MIPS_SIM == _ABIO32 register double ftmp0 asm("$f0"); @@ -83,13 +84,16 @@ void vp8_short_fdct4x4_mmi(int16_t *input, int16_t *output, int pitch) { DECLARE_ALIGNED(8, const uint64_t, ff_pw_51000) = { 0x0000c7380000c738ULL }; DECLARE_ALIGNED(8, const uint64_t, ff_pw_14500) = { 0x000038a4000038a4ULL }; DECLARE_ALIGNED(8, const uint64_t, ff_pw_7500) = { 0x00001d4c00001d4cULL }; - DECLARE_ALIGNED(8, const uint64_t, ff_ph_op1) = { 0x14e808a914e808a9ULL }; - DECLARE_ALIGNED(8, const uint64_t, ff_ph_op3) = { 0xeb1808a9eb1808a9ULL }; DECLARE_ALIGNED(8, const uint64_t, ff_pw_5352) = { 0x000014e8000014e8ULL }; DECLARE_ALIGNED(8, const uint64_t, ff_pw_2217) = { 0x000008a9000008a9ULL }; DECLARE_ALIGNED(8, const uint64_t, ff_ph_8) = { 0x0008000800080008ULL }; + /* clang-format off */ __asm__ volatile ( + "dli %[tmp0], 0x14e808a914e808a9 \n\t" + "dmtc1 %[tmp0], %[ff_ph_op1] \n\t" + "dli %[tmp0], 0xeb1808a9eb1808a9 \n\t" + "dmtc1 %[tmp0], %[ff_ph_op3] \n\t" "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" "gsldlc1 %[ftmp1], 0x07(%[ip]) \n\t" "gsldrc1 %[ftmp1], 0x00(%[ip]) \n\t" @@ -129,7 +133,7 @@ void vp8_short_fdct4x4_mmi(int16_t *input, int16_t *output, int pitch) { // op[1] = (c1 * 2217 + d1 * 5352 + 14500) >> 12 MMI_LI(%[tmp0], 0x0c) - "mtc1 %[tmp0], %[ftmp11] \n\t" + "dmtc1 %[tmp0], %[ftmp11] \n\t" "ldc1 %[ftmp12], %[ff_pw_14500] \n\t" "punpcklhw %[ftmp9], %[ftmp7], %[ftmp8] \n\t" "pmaddhw %[ftmp5], %[ftmp9], %[ff_ph_op1] \n\t" @@ -169,7 +173,7 @@ void vp8_short_fdct4x4_mmi(int16_t *input, int16_t *output, int pitch) { "paddh %[ftmp1], %[ftmp1], %[ftmp9] \n\t" "paddh %[ftmp2], %[ftmp2], %[ftmp9] \n\t" MMI_LI(%[tmp0], 0x04) - "mtc1 %[tmp0], %[ftmp9] \n\t" + "dmtc1 %[tmp0], %[ftmp9] \n\t" "psrah %[ftmp1], %[ftmp1], %[ftmp9] \n\t" "psrah %[ftmp2], %[ftmp2], %[ftmp9] \n\t" @@ -211,15 +215,16 @@ void vp8_short_fdct4x4_mmi(int16_t *input, int16_t *output, int pitch) { [ftmp3] "=&f"(ftmp3), [ftmp4] "=&f"(ftmp4), [ftmp5] "=&f"(ftmp5), [ftmp6] "=&f"(ftmp6), [ftmp7] "=&f"(ftmp7), [ftmp8] "=&f"(ftmp8), [ftmp9] "=&f"(ftmp9), [ftmp10] "=&f"(ftmp10), [ftmp11] "=&f"(ftmp11), - [ftmp12] "=&f"(ftmp12), [tmp0] "=&r"(tmp[0]), [ip]"+&r"(ip) + [ftmp12] "=&f"(ftmp12), [tmp0] "=&r"(tmp[0]), [ip]"+&r"(ip), + [ff_ph_op1] "=&f"(ff_ph_op1), [ff_ph_op3] "=&f"(ff_ph_op3) : [ff_ph_01] "m"(ff_ph_01), [ff_ph_07] "m"(ff_ph_07), - [ff_ph_op1] "f"(ff_ph_op1), [ff_ph_op3] "f"(ff_ph_op3), [ff_pw_14500] "m"(ff_pw_14500), [ff_pw_7500] "m"(ff_pw_7500), [ff_pw_12000] "m"(ff_pw_12000), [ff_pw_51000] "m"(ff_pw_51000), [ff_pw_5352]"m"(ff_pw_5352), [ff_pw_2217]"m"(ff_pw_2217), [ff_ph_8]"m"(ff_ph_8), [pitch]"r"(pitch), [output] "r"(output) : "memory" ); + /* clang-format on */ } void vp8_short_fdct8x4_mmi(int16_t *input, int16_t *output, int pitch) { @@ -228,17 +233,22 @@ void vp8_short_fdct8x4_mmi(int16_t *input, int16_t *output, int pitch) { } void vp8_short_walsh4x4_mmi(int16_t *input, int16_t *output, int pitch) { - double ftmp[13]; - uint32_t tmp[1]; - DECLARE_ALIGNED(8, const uint64_t, ff_ph_01) = { 0x0001000100010001ULL }; - DECLARE_ALIGNED(8, const uint64_t, ff_pw_01) = { 0x0000000100000001ULL }; - DECLARE_ALIGNED(8, const uint64_t, ff_pw_03) = { 0x0000000300000003ULL }; - DECLARE_ALIGNED(8, const uint64_t, ff_pw_mask) = { 0x0001000000010000ULL }; + double ftmp[13], ff_ph_01, ff_pw_01, ff_pw_03, ff_pw_mask; + uint64_t tmp[1]; + /* clang-format off */ __asm__ volatile ( + "dli %[tmp0], 0x0001000100010001 \n\t" + "dmtc1 %[tmp0], %[ff_ph_01] \n\t" + "dli %[tmp0], 0x0000000100000001 \n\t" + "dmtc1 %[tmp0], %[ff_pw_01] \n\t" + "dli %[tmp0], 0x0000000300000003 \n\t" + "dmtc1 %[tmp0], %[ff_pw_03] \n\t" + "dli %[tmp0], 0x0001000000010000 \n\t" + "dmtc1 %[tmp0], %[ff_pw_mask] \n\t" MMI_LI(%[tmp0], 0x02) "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" - "mtc1 %[tmp0], %[ftmp11] \n\t" + "dmtc1 %[tmp0], %[ftmp11] \n\t" "gsldlc1 %[ftmp1], 0x07(%[ip]) \n\t" "gsldrc1 %[ftmp1], 0x00(%[ip]) \n\t" @@ -337,7 +347,7 @@ void vp8_short_walsh4x4_mmi(int16_t *input, int16_t *output, int pitch) { "psubw %[ftmp4], %[ftmp9], %[ftmp10] \n\t" MMI_LI(%[tmp0], 0x03) - "mtc1 %[tmp0], %[ftmp11] \n\t" + "dmtc1 %[tmp0], %[ftmp11] \n\t" "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp1] \n\t" "pand %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t" @@ -393,7 +403,7 @@ void vp8_short_walsh4x4_mmi(int16_t *input, int16_t *output, int pitch) { "packsswh %[ftmp4], %[ftmp4], %[ftmp8] \n\t" MMI_LI(%[tmp0], 0x72) - "mtc1 %[tmp0], %[ftmp11] \n\t" + "dmtc1 %[tmp0], %[ftmp11] \n\t" "pshufh %[ftmp1], %[ftmp1], %[ftmp11] \n\t" "pshufh %[ftmp2], %[ftmp2], %[ftmp11] \n\t" "pshufh %[ftmp3], %[ftmp3], %[ftmp11] \n\t" @@ -413,13 +423,12 @@ void vp8_short_walsh4x4_mmi(int16_t *input, int16_t *output, int pitch) { [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), - [ftmp12]"=&f"(ftmp[12]), - [tmp0]"=&r"(tmp[0]), - [ip]"+&r"(input) - : [op]"r"(output), - [ff_pw_01]"f"(ff_pw_01), [pitch]"r"((mips_reg)pitch), - [ff_pw_03]"f"(ff_pw_03), [ff_pw_mask]"f"(ff_pw_mask), - [ff_ph_01]"f"(ff_ph_01) + [ftmp12]"=&f"(ftmp[12]), [ff_pw_mask]"=&f"(ff_pw_mask), + [tmp0]"=&r"(tmp[0]), [ff_pw_01]"=&f"(ff_pw_01), + [ip]"+&r"(input), [ff_pw_03]"=&f"(ff_pw_03), + [ff_ph_01]"=&f"(ff_ph_01) + : [op]"r"(output), [pitch]"r"((mips_reg)pitch) : "memory" ); + /* clang-format on */ } diff --git a/vp8/encoder/mips/mmi/vp8_quantize_mmi.c b/vp8/encoder/mips/mmi/vp8_quantize_mmi.c index 69a9e5e018..1986444aa3 100644 --- a/vp8/encoder/mips/mmi/vp8_quantize_mmi.c +++ b/vp8/encoder/mips/mmi/vp8_quantize_mmi.c @@ -42,16 +42,17 @@ void vp8_fast_quantize_b_mmi(BLOCK *b, BLOCKD *d) { double ftmp[13]; uint64_t tmp[1]; - DECLARE_ALIGNED(8, const uint64_t, ones) = { 0xffffffffffffffffULL }; - int eob = 0; + int64_t eob = 0; + double ones; __asm__ volatile( // loop 0 ~ 7 "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "pcmpeqh %[ones], %[ones], %[ones] \n\t" "gsldlc1 %[ftmp1], 0x07(%[coeff_ptr]) \n\t" "gsldrc1 %[ftmp1], 0x00(%[coeff_ptr]) \n\t" - "li %[tmp0], 0x0f \n\t" - "mtc1 %[tmp0], %[ftmp9] \n\t" + "dli %[tmp0], 0x0f \n\t" + "dmtc1 %[tmp0], %[ftmp9] \n\t" "gsldlc1 %[ftmp2], 0x0f(%[coeff_ptr]) \n\t" "gsldrc1 %[ftmp2], 0x08(%[coeff_ptr]) \n\t" @@ -165,18 +166,18 @@ void vp8_fast_quantize_b_mmi(BLOCK *b, BLOCKD *d) { "gssdlc1 %[ftmp6], 0x1f(%[dqcoeff_ptr]) \n\t" "gssdrc1 %[ftmp6], 0x18(%[dqcoeff_ptr]) \n\t" - "li %[tmp0], 0x10 \n\t" - "mtc1 %[tmp0], %[ftmp9] \n\t" + "dli %[tmp0], 0x10 \n\t" + "dmtc1 %[tmp0], %[ftmp9] \n\t" "pmaxsh %[ftmp10], %[ftmp10], %[ftmp11] \n\t" "psrlw %[ftmp11], %[ftmp10], %[ftmp9] \n\t" "pmaxsh %[ftmp10], %[ftmp10], %[ftmp11] \n\t" - "li %[tmp0], 0xaa \n\t" - "mtc1 %[tmp0], %[ftmp9] \n\t" + "dli %[tmp0], 0xaa \n\t" + "dmtc1 %[tmp0], %[ftmp9] \n\t" "pshufh %[ftmp11], %[ftmp10], %[ftmp9] \n\t" "pmaxsh %[ftmp10], %[ftmp10], %[ftmp11] \n\t" - "li %[tmp0], 0xffff \n\t" - "mtc1 %[tmp0], %[ftmp9] \n\t" + "dli %[tmp0], 0xffff \n\t" + "dmtc1 %[tmp0], %[ftmp9] \n\t" "pand %[ftmp10], %[ftmp10], %[ftmp9] \n\t" "gssdlc1 %[ftmp10], 0x07(%[eob]) \n\t" "gssdrc1 %[ftmp10], 0x00(%[eob]) \n\t" @@ -184,15 +185,15 @@ void vp8_fast_quantize_b_mmi(BLOCK *b, BLOCKD *d) { [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]), [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]), [ftmp8] "=&f"(ftmp[8]), [ftmp9] "=&f"(ftmp[9]), [ftmp10] "=&f"(ftmp[10]), - [ftmp11] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]) + [ftmp11] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]), + [tmp0] "=&r"(tmp[0]), [ones] "=&f"(ones) : [coeff_ptr] "r"((mips_reg)coeff_ptr), [qcoeff_ptr] "r"((mips_reg)qcoeff_ptr), [dequant_ptr] "r"((mips_reg)dequant_ptr), [round_ptr] "r"((mips_reg)round_ptr), [quant_ptr] "r"((mips_reg)quant_ptr), [dqcoeff_ptr] "r"((mips_reg)dqcoeff_ptr), - [inv_zig_zag] "r"((mips_reg)inv_zig_zag), [eob] "r"((mips_reg)&eob), - [ones] "f"(ones) + [inv_zig_zag] "r"((mips_reg)inv_zig_zag), [eob] "r"((mips_reg)&eob) : "memory"); *d->eob = eob; diff --git a/vpx_dsp/mips/sad_mmi.c b/vpx_dsp/mips/sad_mmi.c index 5dee3164bc..eaca4773f2 100644 --- a/vpx_dsp/mips/sad_mmi.c +++ b/vpx_dsp/mips/sad_mmi.c @@ -364,6 +364,7 @@ static inline unsigned int vpx_sad64x(const uint8_t *src, int src_stride, double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5; mips_reg l_counter = counter; + /* clang-format off */ __asm__ volatile ( "pxor %[ftmp5], %[ftmp5], %[ftmp5] \n\t" "1: \n\t" @@ -383,6 +384,7 @@ static inline unsigned int vpx_sad64x(const uint8_t *src, int src_stride, : [src_stride]"r"((mips_reg)src_stride), [ref_stride]"r"((mips_reg)ref_stride) ); + /* clang-format on */ return sad; } @@ -405,7 +407,9 @@ static inline unsigned int vpx_sad_avg64x(const uint8_t *src, int src_stride, unsigned int sad; double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5; mips_reg l_counter = counter; + mips_reg l_second_pred = (mips_reg)second_pred; + /* clang-format off */ __asm__ volatile ( "pxor %[ftmp5], %[ftmp5], %[ftmp5] \n\t" "1: \n\t" @@ -424,11 +428,12 @@ static inline unsigned int vpx_sad_avg64x(const uint8_t *src, int src_stride, : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3), [ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter), [src]"+&r"(src), [ref]"+&r"(ref), - [second_pred]"+&r"((mips_reg)second_pred), + [second_pred]"+&r"(l_second_pred), [sad]"=&r"(sad) : [src_stride]"r"((mips_reg)src_stride), [ref_stride]"r"((mips_reg)ref_stride) ); + /* clang-format on */ return sad; } @@ -450,6 +455,7 @@ static inline unsigned int vpx_sad32x(const uint8_t *src, int src_stride, double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5; mips_reg l_counter = counter; + /* clang-format off */ __asm__ volatile ( "pxor %[ftmp5], %[ftmp5], %[ftmp5] \n\t" "1: \n\t" @@ -469,6 +475,7 @@ static inline unsigned int vpx_sad32x(const uint8_t *src, int src_stride, : [src_stride]"r"((mips_reg)src_stride), [ref_stride]"r"((mips_reg)ref_stride) ); + /* clang-format on */ return sad; } @@ -493,7 +500,9 @@ static inline unsigned int vpx_sad_avg32x(const uint8_t *src, int src_stride, unsigned int sad; double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5; mips_reg l_counter = counter; + mips_reg l_second_pred = (mips_reg)second_pred; + /* clang-format off */ __asm__ volatile ( "pxor %[ftmp5], %[ftmp5], %[ftmp5] \n\t" "1: \n\t" @@ -512,11 +521,12 @@ static inline unsigned int vpx_sad_avg32x(const uint8_t *src, int src_stride, : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3), [ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter), [src]"+&r"(src), [ref]"+&r"(ref), - [second_pred]"+&r"((mips_reg)second_pred), + [second_pred]"+&r"(l_second_pred), [sad]"=&r"(sad) : [src_stride]"r"((mips_reg)src_stride), [ref_stride]"r"((mips_reg)ref_stride) ); + /* clang-format on */ return sad; } @@ -539,6 +549,7 @@ static inline unsigned int vpx_sad16x(const uint8_t *src, int src_stride, double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5; mips_reg l_counter = counter; + /* clang-format off */ __asm__ volatile ( "pxor %[ftmp5], %[ftmp5], %[ftmp5] \n\t" "1: \n\t" @@ -558,6 +569,7 @@ static inline unsigned int vpx_sad16x(const uint8_t *src, int src_stride, : [src_stride]"r"((mips_reg)src_stride), [ref_stride]"r"((mips_reg)ref_stride) ); + /* clang-format on */ return sad; } @@ -586,7 +598,9 @@ static inline unsigned int vpx_sad_avg16x(const uint8_t *src, int src_stride, unsigned int sad; double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5; mips_reg l_counter = counter; + mips_reg l_second_pred = (mips_reg)second_pred; + /* clang-format off */ __asm__ volatile ( "pxor %[ftmp5], %[ftmp5], %[ftmp5] \n\t" "1: \n\t" @@ -605,11 +619,12 @@ static inline unsigned int vpx_sad_avg16x(const uint8_t *src, int src_stride, : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3), [ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter), [src]"+&r"(src), [ref]"+&r"(ref), - [second_pred]"+&r"((mips_reg)second_pred), + [second_pred]"+&r"(l_second_pred), [sad]"=&r"(sad) : [src_stride]"r"((mips_reg)src_stride), [ref_stride]"r"((mips_reg)ref_stride) ); + /* clang-format on */ return sad; } @@ -632,6 +647,7 @@ static inline unsigned int vpx_sad8x(const uint8_t *src, int src_stride, double ftmp1, ftmp2, ftmp3; mips_reg l_counter = counter; + /* clang-format off */ __asm__ volatile ( "pxor %[ftmp3], %[ftmp3], %[ftmp3] \n\t" "1: \n\t" @@ -651,6 +667,7 @@ static inline unsigned int vpx_sad8x(const uint8_t *src, int src_stride, : [src_stride]"r"((mips_reg)src_stride), [ref_stride]"r"((mips_reg)ref_stride) ); + /* clang-format on */ return sad; } @@ -679,7 +696,9 @@ static inline unsigned int vpx_sad_avg8x(const uint8_t *src, int src_stride, unsigned int sad; double ftmp1, ftmp2, ftmp3; mips_reg l_counter = counter; + mips_reg l_second_pred = (mips_reg)second_pred; + /* clang-format off */ __asm__ volatile ( "pxor %[ftmp3], %[ftmp3], %[ftmp3] \n\t" "1: \n\t" @@ -697,11 +716,12 @@ static inline unsigned int vpx_sad_avg8x(const uint8_t *src, int src_stride, "mfc1 %[sad], %[ftmp3] \n\t" : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3), [counter]"+&r"(l_counter), [src]"+&r"(src), [ref]"+&r"(ref), - [second_pred]"+&r"((mips_reg)second_pred), + [second_pred]"+&r"(l_second_pred), [sad]"=&r"(sad) : [src_stride]"r"((mips_reg)src_stride), [ref_stride]"r"((mips_reg)ref_stride) ); + /* clang-format on */ return sad; } @@ -724,6 +744,7 @@ static inline unsigned int vpx_sad4x(const uint8_t *src, int src_stride, double ftmp1, ftmp2, ftmp3; mips_reg l_counter = counter; + /* clang-format off */ __asm__ volatile ( "pxor %[ftmp3], %[ftmp3], %[ftmp3] \n\t" "1: \n\t" @@ -743,6 +764,7 @@ static inline unsigned int vpx_sad4x(const uint8_t *src, int src_stride, : [src_stride]"r"((mips_reg)src_stride), [ref_stride]"r"((mips_reg)ref_stride) ); + /* clang-format on */ return sad; } @@ -767,7 +789,9 @@ static inline unsigned int vpx_sad_avg4x(const uint8_t *src, int src_stride, unsigned int sad; double ftmp1, ftmp2, ftmp3; mips_reg l_counter = counter; + mips_reg l_second_pred = (mips_reg)second_pred; + /* clang-format off */ __asm__ volatile ( "pxor %[ftmp3], %[ftmp3], %[ftmp3] \n\t" "1: \n\t" @@ -785,11 +809,12 @@ static inline unsigned int vpx_sad_avg4x(const uint8_t *src, int src_stride, "mfc1 %[sad], %[ftmp3] \n\t" : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3), [counter]"+&r"(l_counter), [src]"+&r"(src), [ref]"+&r"(ref), - [second_pred]"+&r"((mips_reg)second_pred), + [second_pred]"+&r"(l_second_pred), [sad]"=&r"(sad) : [src_stride]"r"((mips_reg)src_stride), [ref_stride]"r"((mips_reg)ref_stride) ); + /* clang-format on */ return sad; } diff --git a/vpx_dsp/mips/variance_mmi.c b/vpx_dsp/mips/variance_mmi.c index 29e52a1a89..c2adcfa018 100644 --- a/vpx_dsp/mips/variance_mmi.c +++ b/vpx_dsp/mips/variance_mmi.c @@ -414,6 +414,7 @@ static inline uint32_t vpx_variance64x(const uint8_t *src_ptr, int src_stride, *sse = 0; + /* clang-format off */ __asm__ volatile ( "li %[tmp0], 0x20 \n\t" "mtc1 %[tmp0], %[ftmp11] \n\t" @@ -496,6 +497,7 @@ static inline uint32_t vpx_variance64x(const uint8_t *src_ptr, int src_stride, [high]"r"(&high), [sse]"r"(sse) : "memory" ); + /* clang-format on */ return *sse - (((int64_t)sum * sum) / (64 * high)); } @@ -519,6 +521,7 @@ uint32_t vpx_variance32x64_mmi(const uint8_t *src_ptr, int src_stride, *sse = 0; + /* clang-format off */ __asm__ volatile ( "li %[tmp0], 0x20 \n\t" "mtc1 %[tmp0], %[ftmp11] \n\t" @@ -577,6 +580,7 @@ uint32_t vpx_variance32x64_mmi(const uint8_t *src_ptr, int src_stride, [sse]"r"(sse) : "memory" ); + /* clang-format on */ return *sse - (((int64_t)sum * sum) / 2048); } @@ -590,6 +594,7 @@ static inline uint32_t vpx_variance32x(const uint8_t *src_ptr, int src_stride, *sse = 0; + /* clang-format off */ __asm__ volatile ( "li %[tmp0], 0x20 \n\t" "mtc1 %[tmp0], %[ftmp11] \n\t" @@ -653,6 +658,7 @@ static inline uint32_t vpx_variance32x(const uint8_t *src_ptr, int src_stride, [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum) : "memory" ); + /* clang-format on */ return *sse - (((int64_t)sum * sum) / (32 * high)); } @@ -676,6 +682,7 @@ static inline uint32_t vpx_variance16x(const uint8_t *src_ptr, int src_stride, *sse = 0; + /* clang-format off */ __asm__ volatile ( "li %[tmp0], 0x20 \n\t" "mtc1 %[tmp0], %[ftmp11] \n\t" @@ -729,6 +736,7 @@ static inline uint32_t vpx_variance16x(const uint8_t *src_ptr, int src_stride, [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum) : "memory" ); + /* clang-format on */ return *sse - (((int64_t)sum * sum) / (16 * high)); } @@ -753,6 +761,7 @@ static inline uint32_t vpx_variance8x(const uint8_t *src_ptr, int src_stride, *sse = 0; + /* clang-format off */ __asm__ volatile ( "li %[tmp0], 0x20 \n\t" "mtc1 %[tmp0], %[ftmp11] \n\t" @@ -801,6 +810,7 @@ static inline uint32_t vpx_variance8x(const uint8_t *src_ptr, int src_stride, [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum) : "memory" ); + /* clang-format on */ return *sse - (((int64_t)sum * sum) / (8 * high)); } @@ -825,6 +835,7 @@ static inline uint32_t vpx_variance4x(const uint8_t *src_ptr, int src_stride, *sse = 0; + /* clang-format off */ __asm__ volatile ( "li %[tmp0], 0x20 \n\t" "mtc1 %[tmp0], %[ftmp10] \n\t" @@ -872,6 +883,7 @@ static inline uint32_t vpx_variance4x(const uint8_t *src_ptr, int src_stride, [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum) : "memory" ); + /* clang-format on */ return *sse - (((int64_t)sum * sum) / (4 * high)); } @@ -894,6 +906,7 @@ static inline uint32_t vpx_mse16x(const uint8_t *src_ptr, int src_stride, *sse = 0; + /* clang-format off */ __asm__ volatile ( "li %[tmp0], 0x20 \n\t" "mtc1 %[tmp0], %[ftmp11] \n\t" @@ -925,6 +938,7 @@ static inline uint32_t vpx_mse16x(const uint8_t *src_ptr, int src_stride, [high]"r"(&high), [sse]"r"(sse) : "memory" ); + /* clang-format on */ return *sse; } @@ -947,6 +961,7 @@ static inline uint32_t vpx_mse8x(const uint8_t *src_ptr, int src_stride, *sse = 0; + /* clang-format off */ __asm__ volatile ( "li %[tmp0], 0x20 \n\t" "mtc1 %[tmp0], %[ftmp11] \n\t" @@ -978,6 +993,7 @@ static inline uint32_t vpx_mse8x(const uint8_t *src_ptr, int src_stride, [high]"r"(&high), [sse]"r"(sse) : "memory" ); + /* clang-format on */ return *sse; } @@ -1021,22 +1037,39 @@ static inline void var_filter_block2d_bil_16x(const uint8_t *src_ptr, uint8_t *temp2_ptr = temp2; mips_reg l_counter = counter; double ftmp[15]; + double ff_ph_40, mask; + double filter_x0, filter_x1, filter_y0, filter_y1; mips_reg tmp[2]; - DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL }; - DECLARE_ALIGNED(8, const uint64_t, mask) = { 0x00ff00ff00ff00ffULL }; + uint64_t x0, x1, y0, y1, all; const uint8_t *filter_x = bilinear_filters[x_offset]; const uint8_t *filter_y = bilinear_filters[y_offset]; + x0 = (uint64_t)filter_x[0]; + x1 = (uint64_t)filter_x[1]; + y0 = (uint64_t)filter_y[0]; + y1 = (uint64_t)filter_y[1]; + all = x0 | x1 << 8 | y0 << 16 | y1 << 24; + /* clang-format off */ __asm__ volatile ( "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + MMI_MTC1(%[all], %[ftmp14]) + "punpcklbh %[ftmp14], %[ftmp14], %[ftmp0] \n\t" + "pshufh %[filter_x0], %[ftmp14], %[ftmp0] \n\t" + MMI_LI(%[tmp0], 0x10) + MMI_MTC1(%[tmp0], %[mask]) + "ssrld %[ftmp14], %[ftmp14], %[mask] \n\t" + "pshufh %[filter_x1], %[ftmp14], %[ftmp0] \n\t" + "ssrld %[ftmp14], %[ftmp14], %[mask] \n\t" + "pshufh %[filter_y0], %[ftmp14], %[ftmp0] \n\t" + "ssrld %[ftmp14], %[ftmp14], %[mask] \n\t" + "pshufh %[filter_y1], %[ftmp14], %[ftmp0] \n\t" MMI_LI(%[tmp0], 0x07) MMI_MTC1(%[tmp0], %[ftmp14]) - "pshufh %[filter_x0], %[filter_x0], %[ftmp0] \n\t" - "pshufh %[filter_x1], %[filter_x1], %[ftmp0] \n\t" - "pshufh %[filter_y0], %[filter_y0], %[ftmp0] \n\t" - "pshufh %[filter_y1], %[filter_y1], %[ftmp0] \n\t" - + MMI_LI(%[tmp0], 0x0040004000400040) + MMI_MTC1(%[tmp0], %[ff_ph_40]) + MMI_LI(%[tmp0], 0x00ff00ff00ff00ff) + MMI_MTC1(%[tmp0], %[mask]) // fdata3: fdata3[0] ~ fdata3[15] VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A @@ -1072,15 +1105,13 @@ static inline void var_filter_block2d_bil_16x(const uint8_t *src_ptr, [ftmp11] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]), [ftmp13] "=&f"(ftmp[13]), [ftmp14] "=&f"(ftmp[14]), [tmp0] "=&r"(tmp[0]), [src_ptr] "+&r"(src_ptr), [temp2_ptr] "+&r"(temp2_ptr), - [counter]"+&r"(l_counter) - : [filter_x0] "f"((uint64_t)filter_x[0]), - [filter_x1] "f"((uint64_t)filter_x[1]), - [filter_y0] "f"((uint64_t)filter_y[0]), - [filter_y1] "f"((uint64_t)filter_y[1]), - [src_stride] "r"((mips_reg)src_stride), [ff_ph_40] "f"(ff_ph_40), - [mask] "f"(mask) + [counter]"+&r"(l_counter), [ff_ph_40] "=&f"(ff_ph_40), [mask] "=&f"(mask), + [filter_x0] "=&f"(filter_x0), [filter_x1] "=&f"(filter_x1), + [filter_y0] "=&f"(filter_y0), [filter_y1] "=&f"(filter_y1) + : [src_stride] "r"((mips_reg)src_stride), [all] "r"(all) : "memory" ); + /* clang-format on */ } #define SUBPIX_VAR16XN(H) \ @@ -1105,19 +1136,38 @@ static inline void var_filter_block2d_bil_8x(const uint8_t *src_ptr, mips_reg l_counter = counter; double ftmp[15]; mips_reg tmp[2]; - DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL }; - DECLARE_ALIGNED(8, const uint64_t, mask) = { 0x00ff00ff00ff00ffULL }; + double ff_ph_40, mask; + uint64_t x0, x1, y0, y1, all; + double filter_x0, filter_x1, filter_y0, filter_y1; const uint8_t *filter_x = bilinear_filters[x_offset]; const uint8_t *filter_y = bilinear_filters[y_offset]; + x0 = (uint64_t)filter_x[0]; + x1 = (uint64_t)filter_x[1]; + y0 = (uint64_t)filter_y[0]; + y1 = (uint64_t)filter_y[1]; + all = x0 | x1 << 8 | y0 << 16 | y1 << 24; + /* clang-format off */ __asm__ volatile ( + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + MMI_MTC1(%[all], %[ftmp14]) + "punpcklbh %[ftmp14], %[ftmp14], %[ftmp0] \n\t" + "pshufh %[filter_x0], %[ftmp14], %[ftmp0] \n\t" + MMI_LI(%[tmp0], 0x10) + MMI_MTC1(%[tmp0], %[mask]) + "ssrld %[ftmp14], %[ftmp14], %[mask] \n\t" + "pshufh %[filter_x1], %[ftmp14], %[ftmp0] \n\t" + "ssrld %[ftmp14], %[ftmp14], %[mask] \n\t" + "pshufh %[filter_y0], %[ftmp14], %[ftmp0] \n\t" + "ssrld %[ftmp14], %[ftmp14], %[mask] \n\t" + "pshufh %[filter_y1], %[ftmp14], %[ftmp0] \n\t" "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" MMI_LI(%[tmp0], 0x07) MMI_MTC1(%[tmp0], %[ftmp14]) - "pshufh %[filter_x0], %[filter_x0], %[ftmp0] \n\t" - "pshufh %[filter_x1], %[filter_x1], %[ftmp0] \n\t" - "pshufh %[filter_y0], %[filter_y0], %[ftmp0] \n\t" - "pshufh %[filter_y1], %[filter_y1], %[ftmp0] \n\t" + MMI_LI(%[tmp0], 0x0040004000400040) + MMI_MTC1(%[tmp0], %[ff_ph_40]) + MMI_LI(%[tmp0], 0x00ff00ff00ff00ff) + MMI_MTC1(%[tmp0], %[mask]) // fdata3: fdata3[0] ~ fdata3[7] VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A @@ -1154,15 +1204,13 @@ static inline void var_filter_block2d_bil_8x(const uint8_t *src_ptr, [ftmp11] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]), [ftmp13] "=&f"(ftmp[13]), [ftmp14] "=&f"(ftmp[14]), [tmp0] "=&r"(tmp[0]), [src_ptr] "+&r"(src_ptr), [temp2_ptr] "+&r"(temp2_ptr), - [counter]"+&r"(l_counter) - : [filter_x0] "f"((uint64_t)filter_x[0]), - [filter_x1] "f"((uint64_t)filter_x[1]), - [filter_y0] "f"((uint64_t)filter_y[0]), - [filter_y1] "f"((uint64_t)filter_y[1]), - [src_stride] "r"((mips_reg)src_stride), [ff_ph_40] "f"(ff_ph_40), - [mask] "f"(mask) + [counter]"+&r"(l_counter), [ff_ph_40] "=&f"(ff_ph_40), [mask] "=&f"(mask), + [filter_x0] "=&f"(filter_x0), [filter_x1] "=&f"(filter_x1), + [filter_y0] "=&f"(filter_y0), [filter_y1] "=&f"(filter_y1) + : [src_stride] "r"((mips_reg)src_stride), [all] "r"(all) : "memory" ); + /* clang-format on */ } #define SUBPIX_VAR8XN(H) \ @@ -1188,19 +1236,38 @@ static inline void var_filter_block2d_bil_4x(const uint8_t *src_ptr, mips_reg l_counter = counter; double ftmp[7]; mips_reg tmp[2]; - DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL }; - DECLARE_ALIGNED(8, const uint64_t, mask) = { 0x00ff00ff00ff00ffULL }; + double ff_ph_40, mask; + uint64_t x0, x1, y0, y1, all; + double filter_x0, filter_x1, filter_y0, filter_y1; const uint8_t *filter_x = bilinear_filters[x_offset]; const uint8_t *filter_y = bilinear_filters[y_offset]; + x0 = (uint64_t)filter_x[0]; + x1 = (uint64_t)filter_x[1]; + y0 = (uint64_t)filter_y[0]; + y1 = (uint64_t)filter_y[1]; + all = x0 | x1 << 8 | y0 << 16 | y1 << 24; + /* clang-format off */ __asm__ volatile ( + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + MMI_MTC1(%[all], %[ftmp6]) + "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "pshufh %[filter_x0], %[ftmp6], %[ftmp0] \n\t" + MMI_LI(%[tmp0], 0x10) + MMI_MTC1(%[tmp0], %[mask]) + "ssrld %[ftmp6], %[ftmp6], %[mask] \n\t" + "pshufh %[filter_x1], %[ftmp6], %[ftmp0] \n\t" + "ssrld %[ftmp6], %[ftmp6], %[mask] \n\t" + "pshufh %[filter_y0], %[ftmp6], %[ftmp0] \n\t" + "ssrld %[ftmp6], %[ftmp6], %[mask] \n\t" + "pshufh %[filter_y1], %[ftmp6], %[ftmp0] \n\t" "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" MMI_LI(%[tmp0], 0x07) MMI_MTC1(%[tmp0], %[ftmp6]) - "pshufh %[filter_x0], %[filter_x0], %[ftmp0] \n\t" - "pshufh %[filter_x1], %[filter_x1], %[ftmp0] \n\t" - "pshufh %[filter_y0], %[filter_y0], %[ftmp0] \n\t" - "pshufh %[filter_y1], %[filter_y1], %[ftmp0] \n\t" + MMI_LI(%[tmp0], 0x0040004000400040) + MMI_MTC1(%[tmp0], %[ff_ph_40]) + MMI_LI(%[tmp0], 0x00ff00ff00ff00ff) + MMI_MTC1(%[tmp0], %[mask]) // fdata3: fdata3[0] ~ fdata3[3] VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A @@ -1232,15 +1299,14 @@ static inline void var_filter_block2d_bil_4x(const uint8_t *src_ptr, : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]), [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]), [ftmp6] "=&f"(ftmp[6]), [tmp0] "=&r"(tmp[0]), [src_ptr] "+&r"(src_ptr), - [temp2_ptr] "+&r"(temp2_ptr), [counter]"+&r"(l_counter) - : [filter_x0] "f"((uint64_t)filter_x[0]), - [filter_x1] "f"((uint64_t)filter_x[1]), - [filter_y0] "f"((uint64_t)filter_y[0]), - [filter_y1] "f"((uint64_t)filter_y[1]), - [src_stride] "r"((mips_reg)src_stride), [ff_ph_40] "f"(ff_ph_40), - [mask] "f"(mask) + [temp2_ptr] "+&r"(temp2_ptr), [counter]"+&r"(l_counter), + [ff_ph_40] "=&f"(ff_ph_40), [mask] "=&f"(mask), + [filter_x0] "=&f"(filter_x0), [filter_x1] "=&f"(filter_x1), + [filter_y0] "=&f"(filter_y0), [filter_y1] "=&f"(filter_y1) + : [src_stride] "r"((mips_reg)src_stride), [all] "r"(all) : "memory" ); + /* clang-format on */ } #define SUBPIX_VAR4XN(H) \ From 6ee3f3649f21b83cfec6d08265e3724693a846af Mon Sep 17 00:00:00 2001 From: James Zern Date: Thu, 9 Jul 2020 16:38:00 -0700 Subject: [PATCH 0020/1000] test/*: rename *TestCase to TestSuite similar to the TEST_CASE -> TEST_SUITE changes in: 83769e3d2 update googletest to v1.10.0 BUG=webm:1695 Change-Id: Ib2bdb6bc0e4ed02d61523f8a8315b017b8ad6dad --- test/blockiness_test.cc | 4 ++-- test/consistency_test.cc | 4 ++-- test/convolve_test.cc | 4 ++-- test/cq_test.cc | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/test/blockiness_test.cc b/test/blockiness_test.cc index 75aa2938e2..1ad444a044 100644 --- a/test/blockiness_test.cc +++ b/test/blockiness_test.cc @@ -35,14 +35,14 @@ class BlockinessTestBase : public ::testing::Test { public: BlockinessTestBase(int width, int height) : width_(width), height_(height) {} - static void SetUpTestCase() { + static void SetUpTestSuite() { source_data_ = reinterpret_cast( vpx_memalign(kDataAlignment, kDataBufferSize)); reference_data_ = reinterpret_cast( vpx_memalign(kDataAlignment, kDataBufferSize)); } - static void TearDownTestCase() { + static void TearDownTestSuite() { vpx_free(source_data_); source_data_ = NULL; vpx_free(reference_data_); diff --git a/test/consistency_test.cc b/test/consistency_test.cc index 69ebaf70c4..f31fd8c923 100644 --- a/test/consistency_test.cc +++ b/test/consistency_test.cc @@ -39,7 +39,7 @@ class ConsistencyTestBase : public ::testing::Test { public: ConsistencyTestBase(int width, int height) : width_(width), height_(height) {} - static void SetUpTestCase() { + static void SetUpTestSuite() { source_data_[0] = reinterpret_cast( vpx_memalign(kDataAlignment, kDataBufferSize)); reference_data_[0] = reinterpret_cast( @@ -52,7 +52,7 @@ class ConsistencyTestBase : public ::testing::Test { } static void ClearSsim() { memset(ssim_array_, 0, kDataBufferSize / 16); } - static void TearDownTestCase() { + static void TearDownTestSuite() { vpx_free(source_data_[0]); source_data_[0] = NULL; vpx_free(reference_data_[0]); diff --git a/test/convolve_test.cc b/test/convolve_test.cc index 7330e97db0..6eef26f93a 100644 --- a/test/convolve_test.cc +++ b/test/convolve_test.cc @@ -341,7 +341,7 @@ void wrapper_filter_block2d_8_c(const uint8_t *src_ptr, class ConvolveTest : public ::testing::TestWithParam { public: - static void SetUpTestCase() { + static void SetUpTestSuite() { // Force input_ to be unaligned, output to be 16 byte aligned. input_ = reinterpret_cast( vpx_memalign(kDataAlignment, kInputBufferSize + 1)) + @@ -363,7 +363,7 @@ class ConvolveTest : public ::testing::TestWithParam { virtual void TearDown() { libvpx_test::ClearSystemState(); } - static void TearDownTestCase() { + static void TearDownTestSuite() { vpx_free(input_ - 1); input_ = NULL; vpx_free(output_); diff --git a/test/cq_test.cc b/test/cq_test.cc index 3126f3b4e2..292adb0d04 100644 --- a/test/cq_test.cc +++ b/test/cq_test.cc @@ -29,9 +29,9 @@ class CQTest : public ::libvpx_test::EncoderTest, // maps the cqlevel to the bitrate produced. typedef std::map BitrateMap; - static void SetUpTestCase() { bitrates_.clear(); } + static void SetUpTestSuite() { bitrates_.clear(); } - static void TearDownTestCase() { + static void TearDownTestSuite() { ASSERT_TRUE(!HasFailure()) << "skipping bitrate validation due to earlier failure."; uint32_t prev_actual_bitrate = kCQTargetBitrate; From baefbe85d09f7b884923437d9413b3e6ba4a1c7d Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Tue, 7 Jul 2020 17:44:13 -0700 Subject: [PATCH 0021/1000] Cap target bitrate to raw rate internally BUG=webm:1685 Change-Id: Ida72fe854fadb19c3745724e74b67d88087eb83c --- test/realtime_test.cc | 7 +------ vp8/encoder/bitstream.c | 2 +- vp8/encoder/onyx_if.c | 7 ++++++- vp9/encoder/vp9_pickmode.c | 2 +- vp9/encoder/vp9_ratectrl.c | 6 ++++-- vp9/vp9_cx_iface.c | 11 +++++++++-- 6 files changed, 22 insertions(+), 13 deletions(-) diff --git a/test/realtime_test.cc b/test/realtime_test.cc index 25a933bcc5..63a5347d99 100644 --- a/test/realtime_test.cc +++ b/test/realtime_test.cc @@ -63,15 +63,10 @@ TEST_P(RealtimeTest, RealtimeFirstPassProducesFrames) { EXPECT_EQ(kFramesToEncode, frame_packets_); } -// TODO(https://crbug.com/webm/1685): the following passes -fsanitize=undefined -// with bitrate set to 140000000 for vp8 and 128000 for vp9. There are -// additional failures with lower bitrates using -fsanitize=integer. -TEST_P(RealtimeTest, DISABLED_IntegerOverflow) { +TEST_P(RealtimeTest, IntegerOverflow) { ::libvpx_test::RandomVideoSource video; video.SetSize(800, 480); video.set_limit(20); - // TODO(https://crbug.com/webm/1685): this should be silently capped - // internally to the raw yuv rate or below. cfg_.rc_target_bitrate = 140000000; ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); } diff --git a/vp8/encoder/bitstream.c b/vp8/encoder/bitstream.c index 3daa4e2c2a..80cbb882fd 100644 --- a/vp8/encoder/bitstream.c +++ b/vp8/encoder/bitstream.c @@ -222,7 +222,7 @@ void vp8_pack_tokens(vp8_writer *w, const TOKENEXTRA *p, int xcount) { validate_buffer(w->buffer + w->pos, 1, w->buffer_end, w->error); - w->buffer[w->pos++] = (lowvalue >> (24 - offset)); + w->buffer[w->pos++] = (lowvalue >> (24 - offset)) & 0xff; lowvalue <<= offset; shift = count; lowvalue &= 0xffffff; diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index dccc6ebb1a..aeed719d1f 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -1430,6 +1430,7 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) { VP8_COMMON *cm = &cpi->common; int last_w, last_h; unsigned int prev_number_of_layers; + unsigned int raw_target_rate; if (!cpi) return; @@ -1570,6 +1571,10 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) { cpi->oxcf.maximum_buffer_size_in_ms = 240000; } + raw_target_rate = (unsigned int)((int64_t)cpi->oxcf.Width * cpi->oxcf.Height * + 8 * 3 * cpi->framerate / 1000); + if (cpi->oxcf.target_bandwidth > raw_target_rate) + cpi->oxcf.target_bandwidth = raw_target_rate; /* Convert target bandwidth from Kbit/s to Bit/s */ cpi->oxcf.target_bandwidth *= 1000; @@ -3615,7 +3620,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, if (cpi->this_key_frame_forced) { if (cpi->active_best_quality > cpi->avg_frame_qindex * 7 / 8) { cpi->active_best_quality = cpi->avg_frame_qindex * 7 / 8; - } else if (cpi->active_best_qualityavg_frame_qindex>> 2) { + } else if (cpi->active_best_quality < (cpi->avg_frame_qindex >> 2)) { cpi->active_best_quality = cpi->avg_frame_qindex >> 2; } } diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c index 23c943c21d..695fd484fc 100644 --- a/vp9/encoder/vp9_pickmode.c +++ b/vp9/encoder/vp9_pickmode.c @@ -1127,7 +1127,7 @@ static INLINE void update_thresh_freq_fact_row_mt( } static INLINE void update_thresh_freq_fact( - VP9_COMP *cpi, TileDataEnc *tile_data, int source_variance, + VP9_COMP *cpi, TileDataEnc *tile_data, unsigned int source_variance, BLOCK_SIZE bsize, MV_REFERENCE_FRAME ref_frame, THR_MODES best_mode_idx, PREDICTION_MODE mode) { THR_MODES thr_mode_idx = mode_idx[ref_frame][mode_offset(mode)]; diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index c8b3e0b29a..2afa3618e8 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -1695,8 +1695,10 @@ void vp9_rc_compute_frame_size_bounds(const VP9_COMP *cpi, int frame_target, } else { // For very small rate targets where the fractional adjustment // may be tiny make sure there is at least a minimum range. - const int tol_low = (cpi->sf.recode_tolerance_low * frame_target) / 100; - const int tol_high = (cpi->sf.recode_tolerance_high * frame_target) / 100; + const int tol_low = + (int)(((int64_t)cpi->sf.recode_tolerance_low * frame_target) / 100); + const int tol_high = + (int)(((int64_t)cpi->sf.recode_tolerance_high * frame_target) / 100); *frame_under_shoot_limit = VPXMAX(frame_target - tol_low - 100, 0); *frame_over_shoot_limit = VPXMIN(frame_target + tol_high + 100, cpi->rc.max_frame_bandwidth); diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index 15aa7e60f7..9074e1b4ec 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -470,10 +470,11 @@ static vpx_rational64_t get_g_timebase_in_ts(vpx_rational_t g_timebase) { } static vpx_codec_err_t set_encoder_config( - VP9EncoderConfig *oxcf, const vpx_codec_enc_cfg_t *cfg, + VP9EncoderConfig *oxcf, vpx_codec_enc_cfg_t *cfg, const struct vp9_extracfg *extra_cfg) { const int is_vbr = cfg->rc_end_usage == VPX_VBR; int sl, tl; + unsigned int raw_target_rate; oxcf->profile = cfg->g_profile; oxcf->max_threads = (int)cfg->g_threads; oxcf->width = cfg->g_w; @@ -500,8 +501,14 @@ static vpx_codec_err_t set_encoder_config( cfg->g_pass == VPX_RC_FIRST_PASS ? 0 : cfg->g_lag_in_frames; oxcf->rc_mode = cfg->rc_end_usage; + raw_target_rate = + (unsigned int)((int64_t)oxcf->width * oxcf->height * oxcf->bit_depth * 3 * + oxcf->init_framerate / 1000); + // Cap target bitrate to raw rate + cfg->rc_target_bitrate = VPXMIN(raw_target_rate, cfg->rc_target_bitrate); + // Convert target bandwidth from Kbit/s to Bit/s - oxcf->target_bandwidth = 1000 * cfg->rc_target_bitrate; + oxcf->target_bandwidth = 1000 * (int64_t)cfg->rc_target_bitrate; oxcf->rc_max_intra_bitrate_pct = extra_cfg->rc_max_intra_bitrate_pct; oxcf->rc_max_inter_bitrate_pct = extra_cfg->rc_max_inter_bitrate_pct; oxcf->gf_cbr_boost_pct = extra_cfg->gf_cbr_boost_pct; From 16935397ee6712c407f0554e80160054d5eb7188 Mon Sep 17 00:00:00 2001 From: angiebird Date: Mon, 15 Jun 2020 15:09:01 -0700 Subject: [PATCH 0022/1000] Add SetEncodeSpeed() to SimpleEncode Change-Id: I2fcf37045a96bb101de3359e2e69dcc266c1dc10 --- vp9/simple_encode.cc | 13 +++++++++---- vp9/simple_encode.h | 10 ++++++++++ vp9/vp9_cx_iface.c | 7 +++++-- vp9/vp9_cx_iface.h | 2 +- 4 files changed, 25 insertions(+), 7 deletions(-) diff --git a/vp9/simple_encode.cc b/vp9/simple_encode.cc index 7bce91f7f0..1d7214ce58 100644 --- a/vp9/simple_encode.cc +++ b/vp9/simple_encode.cc @@ -706,6 +706,7 @@ SimpleEncode::SimpleEncode(int frame_width, int frame_height, frame_rate_den_ = frame_rate_den; target_bitrate_ = target_bitrate; num_frames_ = num_frames; + encode_speed_ = 0; frame_coding_index_ = 0; show_frame_count_ = 0; @@ -727,12 +728,16 @@ SimpleEncode::SimpleEncode(int frame_width, int frame_height, InitRefFrameInfo(&ref_frame_info_); } +void SimpleEncode::SetEncodeSpeed(int encode_speed) { + encode_speed_ = encode_speed; +} + void SimpleEncode::ComputeFirstPassStats() { vpx_rational_t frame_rate = make_vpx_rational(frame_rate_num_, frame_rate_den_); const VP9EncoderConfig oxcf = vp9_get_encoder_config(frame_width_, frame_height_, frame_rate, - target_bitrate_, VPX_RC_FIRST_PASS); + target_bitrate_, encode_speed_, VPX_RC_FIRST_PASS); VP9_COMP *cpi = init_encoder(&oxcf, impl_ptr_->img_fmt); struct lookahead_ctx *lookahead = cpi->lookahead; int i; @@ -881,7 +886,7 @@ void SimpleEncode::StartEncode() { make_vpx_rational(frame_rate_num_, frame_rate_den_); VP9EncoderConfig oxcf = vp9_get_encoder_config(frame_width_, frame_height_, frame_rate, - target_bitrate_, VPX_RC_LAST_PASS); + target_bitrate_, encode_speed_, VPX_RC_LAST_PASS); vpx_fixed_buf_t stats; stats.buf = GetVectorData(impl_ptr_->first_pass_stats); stats.sz = sizeof(impl_ptr_->first_pass_stats[0]) * @@ -1091,7 +1096,7 @@ int SimpleEncode::GetCodingFrameNum() const { make_vpx_rational(frame_rate_num_, frame_rate_den_); const VP9EncoderConfig oxcf = vp9_get_encoder_config(frame_width_, frame_height_, frame_rate, - target_bitrate_, VPX_RC_LAST_PASS); + target_bitrate_, encode_speed_, VPX_RC_LAST_PASS); FRAME_INFO frame_info = vp9_get_frame_info(&oxcf); FIRST_PASS_INFO first_pass_info; fps_init_first_pass_info(&first_pass_info, @@ -1108,7 +1113,7 @@ std::vector SimpleEncode::ComputeKeyFrameMap() const { make_vpx_rational(frame_rate_num_, frame_rate_den_); const VP9EncoderConfig oxcf = vp9_get_encoder_config(frame_width_, frame_height_, frame_rate, - target_bitrate_, VPX_RC_LAST_PASS); + target_bitrate_, encode_speed_, VPX_RC_LAST_PASS); FRAME_INFO frame_info = vp9_get_frame_info(&oxcf); FIRST_PASS_INFO first_pass_info; fps_init_first_pass_info(&first_pass_info, diff --git a/vp9/simple_encode.h b/vp9/simple_encode.h index b217320701..77197e7a23 100644 --- a/vp9/simple_encode.h +++ b/vp9/simple_encode.h @@ -304,6 +304,15 @@ class SimpleEncode { SimpleEncode(SimpleEncode &) = delete; SimpleEncode &operator=(const SimpleEncode &) = delete; + // Adjusts the encoder's coding speed. + // If this function is not called, the encoder will use default encode_speed + // 0. Call this function before ComputeFirstPassStats() if needed. + // The encode_speed is equivalent to --cpu-used of the vpxenc command. + // The encode_speed's range should be [0, 9]. + // Setting the encode_speed to a higher level will yield faster coding + // at the cost of lower compression efficiency. + void SetEncodeSpeed(int encode_speed); + // Makes encoder compute the first pass stats and store it at // impl_ptr_->first_pass_stats. key_frame_map_ is also computed based on the // first pass stats. @@ -405,6 +414,7 @@ class SimpleEncode { int frame_rate_den_; int target_bitrate_; int num_frames_; + int encode_speed_; std::FILE *in_file_; std::FILE *out_file_; diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index 9074e1b4ec..6caa4f7390 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -1899,7 +1899,7 @@ static vp9_extracfg get_extra_cfg() { VP9EncoderConfig vp9_get_encoder_config(int frame_width, int frame_height, vpx_rational_t frame_rate, - int target_bitrate, + int target_bitrate, int encode_speed, vpx_enc_pass enc_pass) { /* This function will generate the same VP9EncoderConfig used by the * vpxenc command given below. @@ -1910,6 +1910,7 @@ VP9EncoderConfig vp9_get_encoder_config(int frame_width, int frame_height, * HEIGHT: frame_height * FPS: frame_rate * BITRATE: target_bitrate + * CPU_USED:encode_speed * * INPUT, OUTPUT, LIMIT will not affect VP9EncoderConfig * @@ -1921,9 +1922,10 @@ VP9EncoderConfig vp9_get_encoder_config(int frame_width, int frame_height, * BITRATE=600 * FPS=30/1 * LIMIT=150 + * CPU_USED=0 * ./vpxenc --limit=$LIMIT --width=$WIDTH --height=$HEIGHT --fps=$FPS * --lag-in-frames=25 \ - * --codec=vp9 --good --cpu-used=0 --threads=0 --profile=0 \ + * --codec=vp9 --good --cpu-used=CPU_USED --threads=0 --profile=0 \ * --min-q=0 --max-q=63 --auto-alt-ref=1 --passes=2 --kf-max-dist=150 \ * --kf-min-dist=0 --drop-frame=0 --static-thresh=0 --bias-pct=50 \ * --minsection-pct=0 --maxsection-pct=150 --arnr-maxframes=7 --psnr \ @@ -1946,6 +1948,7 @@ VP9EncoderConfig vp9_get_encoder_config(int frame_width, int frame_height, oxcf.tile_columns = 0; oxcf.frame_parallel_decoding_mode = 0; oxcf.two_pass_vbrmax_section = 150; + oxcf.speed = abs(encode_speed); return oxcf; } diff --git a/vp9/vp9_cx_iface.h b/vp9/vp9_cx_iface.h index 08569fcc96..3188575dd6 100644 --- a/vp9/vp9_cx_iface.h +++ b/vp9/vp9_cx_iface.h @@ -19,7 +19,7 @@ extern "C" { VP9EncoderConfig vp9_get_encoder_config(int frame_width, int frame_height, vpx_rational_t frame_rate, - int target_bitrate, + int target_bitrate, int encode_speed, vpx_enc_pass enc_pass); void vp9_dump_encoder_config(const VP9EncoderConfig *oxcf); From 56345d256ae191e6de042ad82ccf458de3102b8a Mon Sep 17 00:00:00 2001 From: angiebird Date: Wed, 15 Jul 2020 14:52:02 -0700 Subject: [PATCH 0023/1000] Build libsimple_encode.a separately BUG=webm:1689 Change-Id: Id920816315c6586cd652ba6cd1b3a76dfc1f12b7 --- libs.mk | 59 ++++++++++++++++++++++++++++++++++---- test/simple_encode_test.cc | 5 ++++ test/test.mk | 5 +++- vp9/vp9cx.mk | 3 -- 4 files changed, 63 insertions(+), 9 deletions(-) diff --git a/libs.mk b/libs.mk index d73578be92..39f11d4bc8 100644 --- a/libs.mk +++ b/libs.mk @@ -173,7 +173,18 @@ INSTALL-LIBS-$(CONFIG_STATIC) += $(LIBSUBDIR)/libvpx.a INSTALL-LIBS-$(CONFIG_DEBUG_LIBS) += $(LIBSUBDIR)/libvpx_g.a endif +ifeq ($(CONFIG_VP9_ENCODER)$(CONFIG_RATE_CTRL),yesyes) + SIMPLE_ENCODE_SRCS := $(call enabled,CODEC_SRCS) + SIMPLE_ENCODE_SRCS += $(VP9_PREFIX)simple_encode.cc + SIMPLE_ENCODE_SRCS += $(VP9_PREFIX)simple_encode.h + SIMPLE_ENCODE_SRCS += ivfenc.h + SIMPLE_ENCODE_SRCS += ivfenc.c + INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(VP9_PREFIX)simple_encode.cc + INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(VP9_PREFIX)simple_encode.h +endif + CODEC_SRCS=$(call enabled,CODEC_SRCS) + INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(CODEC_SRCS) INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(call enabled,CODEC_EXPORTS) @@ -263,8 +274,8 @@ PROJECTS-yes += vp9rc.$(VCPROJ_SFX) vp9rc.$(VCPROJ_SFX): vpx_config.asm vp9rc.$(VCPROJ_SFX): $(RTCD) -endif -else +endif # ifeq ($(CONFIG_MSVS),yes) +else # ifeq ($(CONFIG_EXTERNAL_BUILD),yes) LIBVPX_OBJS=$(call objs, $(filter-out $(ASM_INCLUDES), $(CODEC_SRCS))) OBJS-yes += $(LIBVPX_OBJS) LIBS-$(if yes,$(CONFIG_STATIC)) += $(BUILD_PFX)libvpx.a $(BUILD_PFX)libvpx_g.a @@ -377,8 +388,15 @@ ifeq ($(CONFIG_VP9_ENCODER),yes) $(BUILD_PFX)libvp9rc_g.a: $(RC_RTC_OBJS) endif +ifeq ($(CONFIG_VP9_ENCODER)$(CONFIG_RATE_CTRL),yesyes) + SIMPLE_ENCODE_OBJS=$(call objs,$(SIMPLE_ENCODE_SRCS)) + OBJS-yes += $(SIMPLE_ENCODE_OBJS) + LIBS-yes += $(BUILD_PFX)libsimple_encode.a $(BUILD_PFX)libsimple_encode_g.a + $(BUILD_PFX)libsimple_encode_g.a: $(SIMPLE_ENCODE_OBJS) endif +endif # ifeq ($(CONFIG_EXTERNAL_BUILD),yes) + libvpx.ver: $(call enabled,CODEC_EXPORTS) @echo " [CREATE] $@" $(qexec)echo "{ global:" > $@ @@ -434,20 +452,39 @@ ifeq ($(CONFIG_UNIT_TESTS),yes) LIBVPX_TEST_DATA_PATH ?= . include $(SRC_PATH_BARE)/test/test.mk -LIBVPX_TEST_SRCS=$(addprefix test/,$(call enabled,LIBVPX_TEST_SRCS)) + +# addprefix_clean behaves like addprefix if the target doesn't start with "../" +# However, if the target starts with "../", instead of adding prefix, +# it will remove "../". +# Using addprefix_clean, we can avoid two different targets building the +# same file, i.e. +# test/../ivfenc.c.d: ivfenc.o +# ivfenc.c.d: ivfenc.o +# Note that the other way to solve this problem is using "realpath". +# The "realpath" is supported by make 3.81 or later. +addprefix_clean=$(patsubst $(1)../%,%,$(addprefix $(1), $(2))) +LIBVPX_TEST_SRCS=$(call addprefix_clean,test/,$(call enabled,LIBVPX_TEST_SRCS)) + LIBVPX_TEST_BIN=./test_libvpx$(EXE_SFX) LIBVPX_TEST_DATA=$(addprefix $(LIBVPX_TEST_DATA_PATH)/,\ $(call enabled,LIBVPX_TEST_DATA)) libvpx_test_data_url=https://storage.googleapis.com/downloads.webmproject.org/test_data/libvpx/$(1) TEST_INTRA_PRED_SPEED_BIN=./test_intra_pred_speed$(EXE_SFX) -TEST_INTRA_PRED_SPEED_SRCS=$(addprefix test/,$(call enabled,TEST_INTRA_PRED_SPEED_SRCS)) +TEST_INTRA_PRED_SPEED_SRCS=$(call addprefix_clean,test/,\ + $(call enabled,TEST_INTRA_PRED_SPEED_SRCS)) TEST_INTRA_PRED_SPEED_OBJS := $(sort $(call objs,$(TEST_INTRA_PRED_SPEED_SRCS))) RC_INTERFACE_TEST_BIN=./test_rc_interface$(EXE_SFX) -RC_INTERFACE_TEST_SRCS=$(addprefix test/,$(call enabled,RC_INTERFACE_TEST_SRCS)) +RC_INTERFACE_TEST_SRCS=$(call addprefix_clean,test/,\ + $(call enabled,RC_INTERFACE_TEST_SRCS)) RC_INTERFACE_TEST_OBJS := $(sort $(call objs,$(RC_INTERFACE_TEST_SRCS))) +SIMPLE_ENCODE_TEST_BIN=./test_simple_encode$(EXE_SFX) +SIMPLE_ENCODE_TEST_SRCS=$(call addprefix_clean,test/,\ + $(call enabled,SIMPLE_ENCODE_TEST_SRCS)) +SIMPLE_ENCODE_TEST_OBJS := $(sort $(call objs,$(SIMPLE_ENCODE_TEST_SRCS))) + libvpx_test_srcs.txt: @echo " [CREATE] $@" @echo $(LIBVPX_TEST_SRCS) | xargs -n1 echo | LC_ALL=C sort -u > $@ @@ -609,6 +646,18 @@ $(eval $(call linkerxx_template,$(RC_INTERFACE_TEST_BIN), \ -L. -lvpx -lgtest -lvp9rc $(extralibs) -lm)) endif # RC_INTERFACE_TEST +ifneq ($(strip $(SIMPLE_ENCODE_TEST_OBJS)),) +$(SIMPLE_ENCODE_TEST_OBJS) $(SIMPLE_ENCODE_TEST_OBJS:.o=.d): \ + CXXFLAGS += $(GTEST_INCLUDES) +OBJS-yes += $(SIMPLE_ENCODE_TEST_OBJS) +BINS-yes += $(SIMPLE_ENCODE_TEST_BIN) + +$(SIMPLE_ENCODE_TEST_BIN): $(TEST_LIBS) libsimple_encode.a +$(eval $(call linkerxx_template,$(SIMPLE_ENCODE_TEST_BIN), \ + $(SIMPLE_ENCODE_TEST_OBJS) \ + -L. -lsimple_encode -lvpx -lgtest $(extralibs) -lm)) +endif # SIMPLE_ENCODE_TEST + endif # CONFIG_UNIT_TESTS # Install test sources only if codec source is included diff --git a/test/simple_encode_test.cc b/test/simple_encode_test.cc index 69bed5a51a..6848351682 100644 --- a/test/simple_encode_test.cc +++ b/test/simple_encode_test.cc @@ -504,3 +504,8 @@ TEST_F(SimpleEncodeTest, GetFramePixelCount) { } // namespace } // namespace vp9 + +int main(int argc, char **argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/test/test.mk b/test/test.mk index e289a78535..5aecb013f3 100644 --- a/test/test.mk +++ b/test/test.mk @@ -58,7 +58,6 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += svc_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += svc_test.h LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += svc_end_to_end_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += timestamp_test.cc -LIBVPX_TEST_SRCS-$(CONFIG_RATE_CTRL) += simple_encode_test.cc LIBVPX_TEST_SRCS-yes += decode_test_driver.cc LIBVPX_TEST_SRCS-yes += decode_test_driver.h @@ -193,6 +192,10 @@ LIBVPX_TEST_SRCS-yes += vp9_denoiser_test.cc endif LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_arf_freq_test.cc +ifeq ($(CONFIG_VP9_ENCODER),yes) +SIMPLE_ENCODE_TEST_SRCS-$(CONFIG_RATE_CTRL) := simple_encode_test.cc +endif + endif # VP9 ## Multi-codec / unconditional whitebox tests. diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk index ad774505c6..666f228827 100644 --- a/vp9/vp9cx.mk +++ b/vp9/vp9cx.mk @@ -18,9 +18,6 @@ VP9_CX_SRCS_REMOVE-no += $(VP9_COMMON_SRCS_REMOVE-no) VP9_CX_SRCS-yes += vp9_cx_iface.c VP9_CX_SRCS-yes += vp9_cx_iface.h -VP9_CX_SRCS-$(CONFIG_RATE_CTRL) += simple_encode.cc -VP9_CX_SRCS-$(CONFIG_RATE_CTRL) += simple_encode.h - VP9_CX_SRCS-yes += encoder/vp9_bitstream.c VP9_CX_SRCS-yes += encoder/vp9_context_tree.c VP9_CX_SRCS-yes += encoder/vp9_context_tree.h From c2f82351e45ee51a87f0d4d3badcdc1b99a5fd50 Mon Sep 17 00:00:00 2001 From: jinbo Date: Sat, 18 Jul 2020 12:25:47 +0800 Subject: [PATCH 0024/1000] vp8,vpx_dsp: [loongson] fix msa optimization bugs Fix two bugs reported by clang when enable msa optimizatons: 1. clang dose not support uld instruction. 2. ulw instruction will result in unit cases coredump. Change-Id: I171bed11d18b58252cbc8853428c039e2549cb95 --- vp8/common/mips/msa/vp8_macros_msa.h | 16 ++++++++-------- vpx_dsp/mips/macros_msa.h | 16 ++++++++-------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/vp8/common/mips/msa/vp8_macros_msa.h b/vp8/common/mips/msa/vp8_macros_msa.h index 14f83799ff..ddc881a7fc 100644 --- a/vp8/common/mips/msa/vp8_macros_msa.h +++ b/vp8/common/mips/msa/vp8_macros_msa.h @@ -122,10 +122,10 @@ const uint8_t *psrc_m = (const uint8_t *)(psrc); \ uint32_t val_m; \ \ - asm volatile("ulw %[val_m], %[psrc_m] \n\t" \ - \ - : [val_m] "=r"(val_m) \ - : [psrc_m] "m"(*psrc_m)); \ + asm volatile("lwr %[val_m], 0(%[psrc_m]) \n\t" \ + "lwl %[val_m], 3(%[psrc_m]) \n\t" \ + : [val_m] "=&r"(val_m) \ + : [psrc_m] "r"(psrc_m)); \ \ val_m; \ }) @@ -136,10 +136,10 @@ const uint8_t *psrc_m = (const uint8_t *)(psrc); \ uint64_t val_m = 0; \ \ - asm volatile("uld %[val_m], %[psrc_m] \n\t" \ - \ - : [val_m] "=r"(val_m) \ - : [psrc_m] "m"(*psrc_m)); \ + asm volatile("ldr %[val_m], 0(%[psrc_m]) \n\t" \ + "ldl %[val_m], 7(%[psrc_m]) \n\t" \ + : [val_m] "=&r"(val_m) \ + : [psrc_m] "r"(psrc_m)); \ \ val_m; \ }) diff --git a/vpx_dsp/mips/macros_msa.h b/vpx_dsp/mips/macros_msa.h index a3a5a4dfee..3c2f50c790 100644 --- a/vpx_dsp/mips/macros_msa.h +++ b/vpx_dsp/mips/macros_msa.h @@ -88,10 +88,10 @@ const uint8_t *psrc_lw_m = (const uint8_t *)(psrc); \ uint32_t val_lw_m; \ \ - __asm__ __volatile__("ulw %[val_lw_m], %[psrc_lw_m] \n\t" \ - \ - : [val_lw_m] "=r"(val_lw_m) \ - : [psrc_lw_m] "m"(*psrc_lw_m)); \ + __asm__ __volatile__("lwr %[val_lw_m], 0(%[psrc_lw_m]) \n\t" \ + "lwl %[val_lw_m], 3(%[psrc_lw_m]) \n\t" \ + : [val_lw_m] "=&r"(val_lw_m) \ + : [psrc_lw_m] "r"(psrc_lw_m)); \ \ val_lw_m; \ }) @@ -102,10 +102,10 @@ const uint8_t *psrc_ld_m = (const uint8_t *)(psrc); \ uint64_t val_ld_m = 0; \ \ - __asm__ __volatile__("uld %[val_ld_m], %[psrc_ld_m] \n\t" \ - \ - : [val_ld_m] "=r"(val_ld_m) \ - : [psrc_ld_m] "m"(*psrc_ld_m)); \ + __asm__ __volatile__("ldr %[val_ld_m], 0(%[psrc_ld_m]) \n\t" \ + "ldl %[val_ld_m], 7(%[psrc_ld_m]) \n\t" \ + : [val_ld_m] "=&r"(val_ld_m) \ + : [psrc_ld_m] "r"(psrc_ld_m)); \ \ val_ld_m; \ }) From 642f6a195d7c9ec327b2a1445d7dd09dfa4f9bbc Mon Sep 17 00:00:00 2001 From: angiebird Date: Fri, 17 Jul 2020 18:43:06 -0700 Subject: [PATCH 0025/1000] Add init version of EncodeFrameWithTargetFrameBits() Will add a unit test in a followup CL. Change-Id: I6a6354f307c427e1a352be7c6421927323eb5e1b --- vp9/encoder/vp9_encoder.h | 28 +++++++++++++++++++++------- vp9/encoder/vp9_ratectrl.c | 9 ++++++++- vp9/simple_encode.cc | 8 ++++++++ vp9/simple_encode.h | 6 ++++++ 4 files changed, 43 insertions(+), 8 deletions(-) diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 7aa4bf7dca..34ae0a09fb 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -563,16 +563,11 @@ static INLINE int gop_command_coding_frame_count( typedef struct ENCODE_COMMAND { int use_external_quantize_index; int external_quantize_index; + int use_external_target_frame_bits; + int target_frame_bits; GOP_COMMAND gop_command; } ENCODE_COMMAND; -static INLINE void encode_command_init(ENCODE_COMMAND *encode_command) { - vp9_zero(*encode_command); - encode_command->use_external_quantize_index = 0; - encode_command->external_quantize_index = -1; - gop_command_off(&encode_command->gop_command); -} - static INLINE void encode_command_set_gop_command( ENCODE_COMMAND *encode_command, GOP_COMMAND gop_command) { encode_command->gop_command = gop_command; @@ -590,6 +585,25 @@ static INLINE void encode_command_reset_external_quantize_index( encode_command->external_quantize_index = -1; } +static INLINE void encode_command_set_target_frame_bits( + ENCODE_COMMAND *encode_command, int target_frame_bits) { + encode_command->use_external_target_frame_bits = 1; + encode_command->target_frame_bits = target_frame_bits; +} + +static INLINE void encode_command_reset_target_frame_bits( + ENCODE_COMMAND *encode_command) { + encode_command->use_external_target_frame_bits = 0; + encode_command->target_frame_bits = -1; +} + +static INLINE void encode_command_init(ENCODE_COMMAND *encode_command) { + vp9_zero(*encode_command); + encode_command_reset_external_quantize_index(encode_command); + encode_command_reset_target_frame_bits(encode_command); + gop_command_off(&encode_command->gop_command); +} + // Returns number of units in size of 4, if not multiple not a multiple of 4, // round it up. For example, size is 7, return 2. static INLINE int get_num_unit_4x4(int size) { return (size + 3) >> 2; } diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index 2afa3618e8..9bb8d45d2d 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -1713,9 +1713,16 @@ void vp9_rc_set_frame_target(VP9_COMP *cpi, int target) { // Modify frame size target when down-scaling. if (cpi->oxcf.resize_mode == RESIZE_DYNAMIC && - rc->frame_size_selector != UNSCALED) + rc->frame_size_selector != UNSCALED) { rc->this_frame_target = (int)(rc->this_frame_target * rate_thresh_mult[rc->frame_size_selector]); + } + +#if CONFIG_RATE_CTRL + if (cpi->encode_command.use_external_target_frame_bits) { + rc->this_frame_target = cpi->encode_command.target_frame_bits; + } +#endif // Target rate per SB64 (including partial SB64s. rc->sb64_target_rate = (int)(((int64_t)rc->this_frame_target * 64 * 64) / diff --git a/vp9/simple_encode.cc b/vp9/simple_encode.cc index 1d7214ce58..d083a44c2d 100644 --- a/vp9/simple_encode.cc +++ b/vp9/simple_encode.cc @@ -1071,6 +1071,14 @@ void SimpleEncode::EncodeFrameWithQuantizeIndex( encode_command_reset_external_quantize_index(&impl_ptr_->cpi->encode_command); } +void SimpleEncode::EncodeFrameWithTargetFrameBits( + EncodeFrameResult *encode_frame_result, int target_frame_bits) { + encode_command_set_target_frame_bits(&impl_ptr_->cpi->encode_command, + target_frame_bits); + EncodeFrame(encode_frame_result); + encode_command_reset_target_frame_bits(&impl_ptr_->cpi->encode_command); +} + static int GetCodingFrameNumFromGopMap(const std::vector &gop_map) { int start_show_index = 0; int coding_frame_count = 0; diff --git a/vp9/simple_encode.h b/vp9/simple_encode.h index 77197e7a23..ae36eb2c55 100644 --- a/vp9/simple_encode.h +++ b/vp9/simple_encode.h @@ -382,6 +382,12 @@ class SimpleEncode { void EncodeFrameWithQuantizeIndex(EncodeFrameResult *encode_frame_result, int quantize_index); + // Encode a frame with target frame bits usage. + // The encoder will find a quantize index to make the actual frame bits usage + // match the target. + void EncodeFrameWithTargetFrameBits(EncodeFrameResult *encode_frame_result, + int target_frame_bits); + // Gets the number of coding frames for the video. The coding frames include // show frame and no show frame. // This function should be called after ComputeFirstPassStats(). From fbfd3fdfb7fa136debada4d4fd65825c80d19db9 Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 22 Jul 2020 11:37:40 -0700 Subject: [PATCH 0026/1000] update googletest to release-1.10.0-224-g23b2a3b1 this matches libaom and provides GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST BUG=webm:1695 BUG=b/159031848 Change-Id: Icdaf61481ab2012dd0e517dd1e600045c937c0dd --- third_party/googletest/README.libvpx | 5 +- third_party/googletest/src/CHANGES | 157 - third_party/googletest/src/CONTRIBUTORS | 1 + third_party/googletest/src/README.md | 4 +- .../src/include/gtest/gtest-matchers.h | 44 +- .../src/include/gtest/gtest-message.h | 1 + .../src/include/gtest/gtest-param-test.h | 16 +- .../src/include/gtest/gtest-printers.h | 31 +- .../src/include/gtest/gtest-typed-test.h | 55 +- .../googletest/src/include/gtest/gtest.h | 25 +- .../include/gtest/internal/gtest-internal.h | 136 +- .../include/gtest/internal/gtest-param-util.h | 53 +- .../include/gtest/internal/gtest-port-arch.h | 4 + .../src/include/gtest/internal/gtest-port.h | 173 +- .../src/include/gtest/internal/gtest-string.h | 3 +- .../include/gtest/internal/gtest-type-util.h | 3234 +---------------- .../gtest/internal/gtest-type-util.h.pump | 302 -- .../googletest/src/src/gtest-death-test.cc | 2 +- .../googletest/src/src/gtest-filepath.cc | 5 +- .../googletest/src/src/gtest-internal-inl.h | 53 +- .../googletest/src/src/gtest-matchers.cc | 28 +- third_party/googletest/src/src/gtest-port.cc | 16 +- .../googletest/src/src/gtest-test-part.cc | 6 +- .../googletest/src/src/gtest-typed-test.cc | 5 +- third_party/googletest/src/src/gtest.cc | 276 +- third_party/googletest/src/src/gtest_main.cc | 9 +- 26 files changed, 697 insertions(+), 3947 deletions(-) delete mode 100644 third_party/googletest/src/CHANGES delete mode 100644 third_party/googletest/src/include/gtest/internal/gtest-type-util.h.pump diff --git a/third_party/googletest/README.libvpx b/third_party/googletest/README.libvpx index 111390a12e..0f6202cb7b 100644 --- a/third_party/googletest/README.libvpx +++ b/third_party/googletest/README.libvpx @@ -1,5 +1,5 @@ URL: https://github.com/google/googletest.git -Version: release-1.10.0 +Version: release-1.10.0-224-g23b2a3b1 License: BSD License File: LICENSE @@ -13,8 +13,7 @@ generation. Local Modifications: - Remove everything but: - googletest-release-1.10.0/googletest/ - CHANGES + googletest/ CONTRIBUTORS include LICENSE diff --git a/third_party/googletest/src/CHANGES b/third_party/googletest/src/CHANGES deleted file mode 100644 index 0552132421..0000000000 --- a/third_party/googletest/src/CHANGES +++ /dev/null @@ -1,157 +0,0 @@ -Changes for 1.7.0: - -* New feature: death tests are supported on OpenBSD and in iOS - simulator now. -* New feature: Google Test now implements a protocol to allow - a test runner to detect that a test program has exited - prematurely and report it as a failure (before it would be - falsely reported as a success if the exit code is 0). -* New feature: Test::RecordProperty() can now be used outside of the - lifespan of a test method, in which case it will be attributed to - the current test case or the test program in the XML report. -* New feature (potentially breaking): --gtest_list_tests now prints - the type parameters and value parameters for each test. -* Improvement: char pointers and char arrays are now escaped properly - in failure messages. -* Improvement: failure summary in XML reports now includes file and - line information. -* Improvement: the XML element now has a timestamp attribute. -* Improvement: When --gtest_filter is specified, XML report now doesn't - contain information about tests that are filtered out. -* Fixed the bug where long --gtest_filter flag values are truncated in - death tests. -* Potentially breaking change: RUN_ALL_TESTS() is now implemented as a - function instead of a macro in order to work better with Clang. -* Compatibility fixes with C++ 11 and various platforms. -* Bug/warning fixes. - -Changes for 1.6.0: - -* New feature: ADD_FAILURE_AT() for reporting a test failure at the - given source location -- useful for writing testing utilities. -* New feature: the universal value printer is moved from Google Mock - to Google Test. -* New feature: type parameters and value parameters are reported in - the XML report now. -* A gtest_disable_pthreads CMake option. -* Colored output works in GNU Screen sessions now. -* Parameters of value-parameterized tests are now printed in the - textual output. -* Failures from ad hoc test assertions run before RUN_ALL_TESTS() are - now correctly reported. -* Arguments of ASSERT_XY and EXPECT_XY no longer need to support << to - ostream. -* More complete handling of exceptions. -* GTEST_ASSERT_XY can be used instead of ASSERT_XY in case the latter - name is already used by another library. -* --gtest_catch_exceptions is now true by default, allowing a test - program to continue after an exception is thrown. -* Value-parameterized test fixtures can now derive from Test and - WithParamInterface separately, easing conversion of legacy tests. -* Death test messages are clearly marked to make them more - distinguishable from other messages. -* Compatibility fixes for Android, Google Native Client, MinGW, HP UX, - PowerPC, Lucid autotools, libCStd, Sun C++, Borland C++ Builder (Code Gear), - IBM XL C++ (Visual Age C++), and C++0x. -* Bug fixes and implementation clean-ups. -* Potentially incompatible changes: disables the harmful 'make install' - command in autotools. - -Changes for 1.5.0: - - * New feature: assertions can be safely called in multiple threads - where the pthreads library is available. - * New feature: predicates used inside EXPECT_TRUE() and friends - can now generate custom failure messages. - * New feature: Google Test can now be compiled as a DLL. - * New feature: fused source files are included. - * New feature: prints help when encountering unrecognized Google Test flags. - * Experimental feature: CMake build script (requires CMake 2.6.4+). - * Experimental feature: the Pump script for meta programming. - * double values streamed to an assertion are printed with enough precision - to differentiate any two different values. - * Google Test now works on Solaris and AIX. - * Build and test script improvements. - * Bug fixes and implementation clean-ups. - - Potentially breaking changes: - - * Stopped supporting VC++ 7.1 with exceptions disabled. - * Dropped support for 'make install'. - -Changes for 1.4.0: - - * New feature: the event listener API - * New feature: test shuffling - * New feature: the XML report format is closer to junitreport and can - be parsed by Hudson now. - * New feature: when a test runs under Visual Studio, its failures are - integrated in the IDE. - * New feature: /MD(d) versions of VC++ projects. - * New feature: elapsed time for the tests is printed by default. - * New feature: comes with a TR1 tuple implementation such that Boost - is no longer needed for Combine(). - * New feature: EXPECT_DEATH_IF_SUPPORTED macro and friends. - * New feature: the Xcode project can now produce static gtest - libraries in addition to a framework. - * Compatibility fixes for Solaris, Cygwin, minGW, Windows Mobile, - Symbian, gcc, and C++Builder. - * Bug fixes and implementation clean-ups. - -Changes for 1.3.0: - - * New feature: death tests on Windows, Cygwin, and Mac. - * New feature: ability to use Google Test assertions in other testing - frameworks. - * New feature: ability to run disabled test via - --gtest_also_run_disabled_tests. - * New feature: the --help flag for printing the usage. - * New feature: access to Google Test flag values in user code. - * New feature: a script that packs Google Test into one .h and one - .cc file for easy deployment. - * New feature: support for distributing test functions to multiple - machines (requires support from the test runner). - * Bug fixes and implementation clean-ups. - -Changes for 1.2.1: - - * Compatibility fixes for Linux IA-64 and IBM z/OS. - * Added support for using Boost and other TR1 implementations. - * Changes to the build scripts to support upcoming release of Google C++ - Mocking Framework. - * Added Makefile to the distribution package. - * Improved build instructions in README. - -Changes for 1.2.0: - - * New feature: value-parameterized tests. - * New feature: the ASSERT/EXPECT_(NON)FATAL_FAILURE(_ON_ALL_THREADS) - macros. - * Changed the XML report format to match JUnit/Ant's. - * Added tests to the Xcode project. - * Added scons/SConscript for building with SCons. - * Added src/gtest-all.cc for building Google Test from a single file. - * Fixed compatibility with Solaris and z/OS. - * Enabled running Python tests on systems with python 2.3 installed, - e.g. Mac OS X 10.4. - * Bug fixes. - -Changes for 1.1.0: - - * New feature: type-parameterized tests. - * New feature: exception assertions. - * New feature: printing elapsed time of tests. - * Improved the robustness of death tests. - * Added an Xcode project and samples. - * Adjusted the output format on Windows to be understandable by Visual Studio. - * Minor bug fixes. - -Changes for 1.0.1: - - * Added project files for Visual Studio 7.1. - * Fixed issues with compiling on Mac OS X. - * Fixed issues with compiling on Cygwin. - -Changes for 1.0.0: - - * Initial Open Source release of Google Test diff --git a/third_party/googletest/src/CONTRIBUTORS b/third_party/googletest/src/CONTRIBUTORS index feae2fc044..1e4afe2182 100644 --- a/third_party/googletest/src/CONTRIBUTORS +++ b/third_party/googletest/src/CONTRIBUTORS @@ -17,6 +17,7 @@ Jói Sigurðsson Keir Mierle Keith Ray Kenton Varda +Krystian Kuzniarek Manuel Klimek Markus Heule Mika Raento diff --git a/third_party/googletest/src/README.md b/third_party/googletest/src/README.md index 766ddc1e07..904048f484 100644 --- a/third_party/googletest/src/README.md +++ b/third_party/googletest/src/README.md @@ -8,8 +8,8 @@ depends on which build system you use, and is usually straightforward. ### Build with CMake -Google Test comes with a CMake build script ( -[CMakeLists.txt](https://github.com/google/googletest/blob/master/CMakeLists.txt)) +Google Test comes with a CMake build script +([CMakeLists.txt](https://github.com/google/googletest/blob/master/CMakeLists.txt)) that can be used on a wide range of platforms ("C" stands for cross-platform.). If you don't have CMake installed already, you can download it for free from . diff --git a/third_party/googletest/src/include/gtest/gtest-matchers.h b/third_party/googletest/src/include/gtest/gtest-matchers.h index 9de6c2e10a..a61cef4093 100644 --- a/third_party/googletest/src/include/gtest/gtest-matchers.h +++ b/third_party/googletest/src/include/gtest/gtest-matchers.h @@ -384,18 +384,18 @@ class GTEST_API_ Matcher Matcher(const char* s); // NOLINT }; -#if GTEST_HAS_ABSL +#if GTEST_INTERNAL_HAS_STRING_VIEW // The following two specializations allow the user to write str // instead of Eq(str) and "foo" instead of Eq("foo") when a absl::string_view // matcher is expected. template <> -class GTEST_API_ Matcher - : public internal::MatcherBase { +class GTEST_API_ Matcher + : public internal::MatcherBase { public: Matcher() {} - explicit Matcher(const MatcherInterface* impl) - : internal::MatcherBase(impl) {} + explicit Matcher(const MatcherInterface* impl) + : internal::MatcherBase(impl) {} // Allows the user to write str instead of Eq(str) sometimes, where // str is a std::string object. @@ -404,20 +404,20 @@ class GTEST_API_ Matcher // Allows the user to write "foo" instead of Eq("foo") sometimes. Matcher(const char* s); // NOLINT - // Allows the user to pass absl::string_views directly. - Matcher(absl::string_view s); // NOLINT + // Allows the user to pass absl::string_views or std::string_views directly. + Matcher(internal::StringView s); // NOLINT }; template <> -class GTEST_API_ Matcher - : public internal::MatcherBase { +class GTEST_API_ Matcher + : public internal::MatcherBase { public: Matcher() {} - explicit Matcher(const MatcherInterface* impl) - : internal::MatcherBase(impl) {} - explicit Matcher(const MatcherInterface* impl) - : internal::MatcherBase(impl) {} + explicit Matcher(const MatcherInterface* impl) + : internal::MatcherBase(impl) {} + explicit Matcher(const MatcherInterface* impl) + : internal::MatcherBase(impl) {} // Allows the user to write str instead of Eq(str) sometimes, where // str is a std::string object. @@ -426,10 +426,10 @@ class GTEST_API_ Matcher // Allows the user to write "foo" instead of Eq("foo") sometimes. Matcher(const char* s); // NOLINT - // Allows the user to pass absl::string_views directly. - Matcher(absl::string_view s); // NOLINT + // Allows the user to pass absl::string_views or std::string_views directly. + Matcher(internal::StringView s); // NOLINT }; -#endif // GTEST_HAS_ABSL +#endif // GTEST_INTERNAL_HAS_STRING_VIEW // Prints a matcher in a human-readable format. template @@ -474,13 +474,13 @@ class PolymorphicMatcher { public: explicit MonomorphicImpl(const Impl& impl) : impl_(impl) {} - virtual void DescribeTo(::std::ostream* os) const { impl_.DescribeTo(os); } + void DescribeTo(::std::ostream* os) const override { impl_.DescribeTo(os); } - virtual void DescribeNegationTo(::std::ostream* os) const { + void DescribeNegationTo(::std::ostream* os) const override { impl_.DescribeNegationTo(os); } - virtual bool MatchAndExplain(T x, MatchResultListener* listener) const { + bool MatchAndExplain(T x, MatchResultListener* listener) const override { return impl_.MatchAndExplain(x, listener); } @@ -620,12 +620,12 @@ class MatchesRegexMatcher { MatchesRegexMatcher(const RE* regex, bool full_match) : regex_(regex), full_match_(full_match) {} -#if GTEST_HAS_ABSL - bool MatchAndExplain(const absl::string_view& s, +#if GTEST_INTERNAL_HAS_STRING_VIEW + bool MatchAndExplain(const internal::StringView& s, MatchResultListener* listener) const { return MatchAndExplain(std::string(s), listener); } -#endif // GTEST_HAS_ABSL +#endif // GTEST_INTERNAL_HAS_STRING_VIEW // Accepts pointer types, particularly: // const char* diff --git a/third_party/googletest/src/include/gtest/gtest-message.h b/third_party/googletest/src/include/gtest/gtest-message.h index 4a80e11e6b..21899232a2 100644 --- a/third_party/googletest/src/include/gtest/gtest-message.h +++ b/third_party/googletest/src/include/gtest/gtest-message.h @@ -49,6 +49,7 @@ #include #include +#include #include "gtest/internal/gtest-port.h" diff --git a/third_party/googletest/src/include/gtest/gtest-param-test.h b/third_party/googletest/src/include/gtest/gtest-param-test.h index c2e6eae3d8..5b039df9f6 100644 --- a/third_party/googletest/src/include/gtest/gtest-param-test.h +++ b/third_party/googletest/src/include/gtest/gtest-param-test.h @@ -416,14 +416,14 @@ internal::CartesianProductHolder Combine(const Generator&... g) { : public test_suite_name { \ public: \ GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)() {} \ - virtual void TestBody(); \ + void TestBody() override; \ \ private: \ static int AddToRegistry() { \ ::testing::UnitTest::GetInstance() \ ->parameterized_test_registry() \ .GetTestSuitePatternHolder( \ - #test_suite_name, \ + GTEST_STRINGIFY_(test_suite_name), \ ::testing::internal::CodeLocation(__FILE__, __LINE__)) \ ->AddTestPattern( \ GTEST_STRINGIFY_(test_suite_name), GTEST_STRINGIFY_(test_name), \ @@ -483,13 +483,21 @@ internal::CartesianProductHolder Combine(const Generator&... g) { ::testing::UnitTest::GetInstance() \ ->parameterized_test_registry() \ .GetTestSuitePatternHolder( \ - #test_suite_name, \ + GTEST_STRINGIFY_(test_suite_name), \ ::testing::internal::CodeLocation(__FILE__, __LINE__)) \ ->AddTestSuiteInstantiation( \ - #prefix, >est_##prefix##test_suite_name##_EvalGenerator_, \ + GTEST_STRINGIFY_(prefix), \ + >est_##prefix##test_suite_name##_EvalGenerator_, \ >est_##prefix##test_suite_name##_EvalGenerateName_, \ __FILE__, __LINE__) + +// Allow Marking a Parameterized test class as not needing to be instantiated. +#define GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(T) \ + namespace gtest_do_not_use_outside_namespace_scope {} \ + static const ::testing::internal::MarkAsIgnored gtest_allow_ignore_##T( \ + GTEST_STRINGIFY_(T)) + // Legacy API is deprecated but still available #ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ #define INSTANTIATE_TEST_CASE_P \ diff --git a/third_party/googletest/src/include/gtest/gtest-printers.h b/third_party/googletest/src/include/gtest/gtest-printers.h index 56a05450ef..407d1f1859 100644 --- a/third_party/googletest/src/include/gtest/gtest-printers.h +++ b/third_party/googletest/src/include/gtest/gtest-printers.h @@ -135,9 +135,9 @@ enum TypeKind { kProtobuf, // a protobuf type kConvertibleToInteger, // a type implicitly convertible to BiggestInt // (e.g. a named or unnamed enum type) -#if GTEST_HAS_ABSL +#if GTEST_INTERNAL_HAS_STRING_VIEW kConvertibleToStringView, // a type implicitly convertible to - // absl::string_view + // absl::string_view or std::string_view #endif kOtherType // anything else }; @@ -191,12 +191,13 @@ class TypeWithoutFormatter { } }; -#if GTEST_HAS_ABSL +#if GTEST_INTERNAL_HAS_STRING_VIEW template class TypeWithoutFormatter { public: // Since T has neither operator<< nor PrintTo() but can be implicitly - // converted to absl::string_view, we print it as a absl::string_view. + // converted to absl::string_view, we print it as a absl::string_view + // (or std::string_view). // // Note: the implementation is further below, as it depends on // internal::PrintTo symbol which is defined later in the file. @@ -237,9 +238,9 @@ ::std::basic_ostream& operator<<( const T&, internal::BiggestInt>::value ? kConvertibleToInteger : -#if GTEST_HAS_ABSL +#if GTEST_INTERNAL_HAS_STRING_VIEW std::is_convertible< - const T&, absl::string_view>::value + const T&, internal::StringView>::value ? kConvertibleToStringView : #endif @@ -266,10 +267,8 @@ void DefaultPrintNonContainerTo(const T& value, ::std::ostream* os) { // 7.3.4-1 [namespace.udir]. This allows us to fall back onto // testing::internal2::operator<< in case T doesn't come with a << // operator. - // - // We cannot write 'using ::testing::internal2::operator<<;', which - // gcc 3.3 fails to compile due to a compiler bug. - using namespace ::testing::internal2; // NOLINT + + using ::testing::internal2::operator<<; // Assuming T is defined in namespace foo, in the next statement, // the compiler will consider all of: @@ -603,12 +602,12 @@ inline void PrintTo(const ::std::wstring& s, ::std::ostream* os) { } #endif // GTEST_HAS_STD_WSTRING -#if GTEST_HAS_ABSL -// Overload for absl::string_view. -inline void PrintTo(absl::string_view sp, ::std::ostream* os) { +#if GTEST_INTERNAL_HAS_STRING_VIEW +// Overload for internal::StringView. +inline void PrintTo(internal::StringView sp, ::std::ostream* os) { PrintTo(::std::string(sp), os); } -#endif // GTEST_HAS_ABSL +#endif // GTEST_INTERNAL_HAS_STRING_VIEW inline void PrintTo(std::nullptr_t, ::std::ostream* os) { *os << "(nullptr)"; } @@ -901,12 +900,12 @@ Strings UniversalTersePrintTupleFieldsToStrings(const Tuple& value) { } // namespace internal -#if GTEST_HAS_ABSL +#if GTEST_INTERNAL_HAS_STRING_VIEW namespace internal2 { template void TypeWithoutFormatter::PrintValue( const T& value, ::std::ostream* os) { - internal::PrintTo(absl::string_view(value), os); + internal::PrintTo(internal::StringView(value), os); } } // namespace internal2 #endif diff --git a/third_party/googletest/src/include/gtest/gtest-typed-test.h b/third_party/googletest/src/include/gtest/gtest-typed-test.h index 095ce05802..3ffa50b739 100644 --- a/third_party/googletest/src/include/gtest/gtest-typed-test.h +++ b/third_party/googletest/src/include/gtest/gtest-typed-test.h @@ -27,7 +27,6 @@ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - // GOOGLETEST_CM0001 DO NOT DELETE #ifndef GTEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_ @@ -170,6 +169,7 @@ INSTANTIATE_TYPED_TEST_SUITE_P(My, FooTest, MyTypes); #endif // 0 +#include "gtest/internal/gtest-internal.h" #include "gtest/internal/gtest-port.h" #include "gtest/internal/gtest-type-util.h" @@ -188,24 +188,25 @@ INSTANTIATE_TYPED_TEST_SUITE_P(My, FooTest, MyTypes); #define GTEST_NAME_GENERATOR_(TestSuiteName) \ gtest_type_params_##TestSuiteName##_NameGenerator -#define TYPED_TEST_SUITE(CaseName, Types, ...) \ - typedef ::testing::internal::TypeList::type GTEST_TYPE_PARAMS_( \ - CaseName); \ - typedef ::testing::internal::NameGeneratorSelector<__VA_ARGS__>::type \ +#define TYPED_TEST_SUITE(CaseName, Types, ...) \ + typedef ::testing::internal::GenerateTypeList::type \ + GTEST_TYPE_PARAMS_(CaseName); \ + typedef ::testing::internal::NameGeneratorSelector<__VA_ARGS__>::type \ GTEST_NAME_GENERATOR_(CaseName) -# define TYPED_TEST(CaseName, TestName) \ +#define TYPED_TEST(CaseName, TestName) \ + static_assert(sizeof(GTEST_STRINGIFY_(TestName)) > 1, \ + "test-name must not be empty"); \ template \ class GTEST_TEST_CLASS_NAME_(CaseName, TestName) \ : public CaseName { \ private: \ typedef CaseName TestFixture; \ typedef gtest_TypeParam_ TypeParam; \ - virtual void TestBody(); \ + void TestBody() override; \ }; \ static bool gtest_##CaseName##_##TestName##_registered_ \ - GTEST_ATTRIBUTE_UNUSED_ = \ - ::testing::internal::TypeParameterizedTest< \ + GTEST_ATTRIBUTE_UNUSED_ = ::testing::internal::TypeParameterizedTest< \ CaseName, \ ::testing::internal::TemplateSel, \ @@ -213,7 +214,8 @@ INSTANTIATE_TYPED_TEST_SUITE_P(My, FooTest, MyTypes); CaseName)>::Register("", \ ::testing::internal::CodeLocation( \ __FILE__, __LINE__), \ - #CaseName, #TestName, 0, \ + GTEST_STRINGIFY_(CaseName), \ + GTEST_STRINGIFY_(TestName), 0, \ ::testing::internal::GenerateNames< \ GTEST_NAME_GENERATOR_(CaseName), \ GTEST_TYPE_PARAMS_(CaseName)>()); \ @@ -276,24 +278,26 @@ INSTANTIATE_TYPED_TEST_SUITE_P(My, FooTest, MyTypes); private: \ typedef SuiteName TestFixture; \ typedef gtest_TypeParam_ TypeParam; \ - virtual void TestBody(); \ + void TestBody() override; \ }; \ static bool gtest_##TestName##_defined_ GTEST_ATTRIBUTE_UNUSED_ = \ GTEST_TYPED_TEST_SUITE_P_STATE_(SuiteName).AddTestName( \ - __FILE__, __LINE__, #SuiteName, #TestName); \ + __FILE__, __LINE__, GTEST_STRINGIFY_(SuiteName), \ + GTEST_STRINGIFY_(TestName)); \ } \ template \ void GTEST_SUITE_NAMESPACE_( \ SuiteName)::TestName::TestBody() -#define REGISTER_TYPED_TEST_SUITE_P(SuiteName, ...) \ - namespace GTEST_SUITE_NAMESPACE_(SuiteName) { \ - typedef ::testing::internal::Templates<__VA_ARGS__>::type gtest_AllTests_; \ - } \ - static const char* const GTEST_REGISTERED_TEST_NAMES_( \ - SuiteName) GTEST_ATTRIBUTE_UNUSED_ = \ - GTEST_TYPED_TEST_SUITE_P_STATE_(SuiteName).VerifyRegisteredTestNames( \ - __FILE__, __LINE__, #__VA_ARGS__) +// Note: this won't work correctly if the trailing arguments are macros. +#define REGISTER_TYPED_TEST_SUITE_P(SuiteName, ...) \ + namespace GTEST_SUITE_NAMESPACE_(SuiteName) { \ + typedef ::testing::internal::Templates<__VA_ARGS__> gtest_AllTests_; \ + } \ + static const char* const GTEST_REGISTERED_TEST_NAMES_( \ + SuiteName) GTEST_ATTRIBUTE_UNUSED_ = \ + GTEST_TYPED_TEST_SUITE_P_STATE_(SuiteName).VerifyRegisteredTestNames( \ + GTEST_STRINGIFY_(SuiteName), __FILE__, __LINE__, #__VA_ARGS__) // Legacy API is deprecated but still available #ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ @@ -304,18 +308,21 @@ INSTANTIATE_TYPED_TEST_SUITE_P(My, FooTest, MyTypes); #endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ #define INSTANTIATE_TYPED_TEST_SUITE_P(Prefix, SuiteName, Types, ...) \ + static_assert(sizeof(GTEST_STRINGIFY_(Prefix)) > 1, \ + "test-suit-prefix must not be empty"); \ static bool gtest_##Prefix##_##SuiteName GTEST_ATTRIBUTE_UNUSED_ = \ ::testing::internal::TypeParameterizedTestSuite< \ SuiteName, GTEST_SUITE_NAMESPACE_(SuiteName)::gtest_AllTests_, \ - ::testing::internal::TypeList::type>:: \ - Register(#Prefix, \ + ::testing::internal::GenerateTypeList::type>:: \ + Register(GTEST_STRINGIFY_(Prefix), \ ::testing::internal::CodeLocation(__FILE__, __LINE__), \ - >EST_TYPED_TEST_SUITE_P_STATE_(SuiteName), #SuiteName, \ + >EST_TYPED_TEST_SUITE_P_STATE_(SuiteName), \ + GTEST_STRINGIFY_(SuiteName), \ GTEST_REGISTERED_TEST_NAMES_(SuiteName), \ ::testing::internal::GenerateNames< \ ::testing::internal::NameGeneratorSelector< \ __VA_ARGS__>::type, \ - ::testing::internal::TypeList::type>()) + ::testing::internal::GenerateTypeList::type>()) // Legacy API is deprecated but still available #ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ diff --git a/third_party/googletest/src/include/gtest/gtest.h b/third_party/googletest/src/include/gtest/gtest.h index dbe5b1c2c3..39cff08d65 100644 --- a/third_party/googletest/src/include/gtest/gtest.h +++ b/third_party/googletest/src/include/gtest/gtest.h @@ -177,6 +177,7 @@ class FuchsiaDeathTest; class UnitTestImpl* GetUnitTestImpl(); void ReportFailureInUnknownLocation(TestPartResult::Type result_type, const std::string& message); +std::set* GetIgnoredParameterizedTestSuites(); } // namespace internal @@ -278,7 +279,11 @@ class GTEST_API_ AssertionResult { // Used in EXPECT_TRUE/FALSE(assertion_result). AssertionResult(const AssertionResult& other); -#if defined(_MSC_VER) && _MSC_VER < 1910 +// C4800 is a level 3 warning in Visual Studio 2015 and earlier. +// This warning is not emitted in Visual Studio 2017. +// This warning is off by default starting in Visual Studio 2019 but can be +// enabled with command-line options. +#if defined(_MSC_VER) && (_MSC_VER < 1910 || _MSC_VER >= 1920) GTEST_DISABLE_MSC_WARNINGS_PUSH_(4800 /* forcing value to bool */) #endif @@ -298,7 +303,7 @@ class GTEST_API_ AssertionResult { = nullptr) : success_(success) {} -#if defined(_MSC_VER) && _MSC_VER < 1910 +#if defined(_MSC_VER) && (_MSC_VER < 1910 || _MSC_VER >= 1920) GTEST_DISABLE_MSC_WARNINGS_POP_() #endif @@ -412,8 +417,6 @@ class GTEST_API_ Test { // test in test case Foo. Hence a sub-class can define its own // SetUpTestSuite() method to shadow the one defined in the super // class. - // Failures that happen during SetUpTestSuite are logged but otherwise - // ignored. static void SetUpTestSuite() {} // Tears down the stuff shared by all tests in this test suite. @@ -422,8 +425,6 @@ class GTEST_API_ Test { // test in test case Foo. Hence a sub-class can define its own // TearDownTestSuite() method to shadow the one defined in the super // class. - // Failures that happen during TearDownTestSuite are logged but otherwise - // ignored. static void TearDownTestSuite() {} // Legacy API is deprecated but still available @@ -889,7 +890,9 @@ class GTEST_API_ TestSuite { bool Passed() const { return !Failed(); } // Returns true if and only if the test suite failed. - bool Failed() const { return failed_test_count() > 0; } + bool Failed() const { + return failed_test_count() > 0 || ad_hoc_test_result().Failed(); + } // Returns the elapsed time, in milliseconds. TimeInMillis elapsed_time() const { return elapsed_time_; } @@ -1420,6 +1423,7 @@ class GTEST_API_ UnitTest { friend class internal::StreamingListenerTest; friend class internal::UnitTestRecordPropertyTestHelper; friend Environment* AddGlobalTestEnvironment(Environment* env); + friend std::set* internal::GetIgnoredParameterizedTestSuites(); friend internal::UnitTestImpl* internal::GetUnitTestImpl(); friend void internal::ReportFailureInUnknownLocation( TestPartResult::Type result_type, @@ -1889,7 +1893,7 @@ class TestWithParam : public Test, public WithParamInterface { // Skips test in runtime. // Skipping test aborts current function. // Skipped tests are neither successful nor failed. -#define GTEST_SKIP() GTEST_SKIP_("Skipped") +#define GTEST_SKIP() GTEST_SKIP_("") // ADD_FAILURE unconditionally adds a failure to the current test. // SUCCEED generates a success - it doesn't automatically make the @@ -2298,8 +2302,7 @@ class GTEST_API_ ScopedTrace { // to cause a compiler error. template constexpr bool StaticAssertTypeEq() noexcept { - static_assert(std::is_same::value, - "type1 and type2 are not the same type"); + static_assert(std::is_same::value, "T1 and T2 are not the same type"); return true; } @@ -2365,9 +2368,11 @@ constexpr bool StaticAssertTypeEq() noexcept { // } // // GOOGLETEST_CM0011 DO NOT DELETE +#if !GTEST_DONT_DEFINE_TEST #define TEST_F(test_fixture, test_name)\ GTEST_TEST_(test_fixture, test_name, test_fixture, \ ::testing::internal::GetTypeId()) +#endif // !GTEST_DONT_DEFINE_TEST // Returns a path to temporary directory. // Tries to determine an appropriate directory for the platform. diff --git a/third_party/googletest/src/include/gtest/internal/gtest-internal.h b/third_party/googletest/src/include/gtest/internal/gtest-internal.h index 94c816a28b..6bad8780b5 100644 --- a/third_party/googletest/src/include/gtest/internal/gtest-internal.h +++ b/third_party/googletest/src/include/gtest/internal/gtest-internal.h @@ -53,6 +53,7 @@ #include #include #include +#include #include #include #include @@ -78,7 +79,16 @@ #define GTEST_CONCAT_TOKEN_IMPL_(foo, bar) foo ## bar // Stringifies its argument. -#define GTEST_STRINGIFY_(name) #name +// Work around a bug in visual studio which doesn't accept code like this: +// +// #define GTEST_STRINGIFY_(name) #name +// #define MACRO(a, b, c) ... GTEST_STRINGIFY_(a) ... +// MACRO(, x, y) +// +// Complaining about the argument to GTEST_STRINGIFY_ being empty. +// This is allowed by the spec. +#define GTEST_STRINGIFY_HELPER_(name, ...) #name +#define GTEST_STRINGIFY_(...) GTEST_STRINGIFY_HELPER_(__VA_ARGS__, ) namespace proto2 { class Message; } @@ -607,8 +617,9 @@ class GTEST_API_ TypedTestSuitePState { // Verifies that registered_tests match the test names in // defined_test_names_; returns registered_tests if successful, or // aborts the program otherwise. - const char* VerifyRegisteredTestNames( - const char* file, int line, const char* registered_tests); + const char* VerifyRegisteredTestNames(const char* test_suite_name, + const char* file, int line, + const char* registered_tests); private: typedef ::std::map RegisteredTestsMap; @@ -662,7 +673,7 @@ struct NameGeneratorSelector { }; template -void GenerateNamesRecursively(Types0, std::vector*, int) {} +void GenerateNamesRecursively(internal::None, std::vector*, int) {} template void GenerateNamesRecursively(Types, std::vector* result, int i) { @@ -729,7 +740,7 @@ class TypeParameterizedTest { // The base case for the compile time recursion. template -class TypeParameterizedTest { +class TypeParameterizedTest { public: static bool Register(const char* /*prefix*/, const CodeLocation&, const char* /*case_name*/, const char* /*test_names*/, @@ -740,6 +751,11 @@ class TypeParameterizedTest { } }; +GTEST_API_ void RegisterTypeParameterizedTestSuite(const char* test_suite_name, + CodeLocation code_location); +GTEST_API_ void RegisterTypeParameterizedTestSuiteInstantiation( + const char* case_name); + // TypeParameterizedTestSuite::Register() // registers *all combinations* of 'Tests' and 'Types' with Google // Test. The return value is insignificant - we just need to return @@ -752,6 +768,7 @@ class TypeParameterizedTestSuite { const char* test_names, const std::vector& type_names = GenerateNames()) { + RegisterTypeParameterizedTestSuiteInstantiation(case_name); std::string test_name = StripTrailingSpaces( GetPrefixUntilComma(test_names)); if (!state->TestExists(test_name)) { @@ -781,7 +798,7 @@ class TypeParameterizedTestSuite { // The base case for the compile time recursion. template -class TypeParameterizedTestSuite { +class TypeParameterizedTestSuite { public: static bool Register(const char* /*prefix*/, const CodeLocation&, const TypedTestSuitePState* /*state*/, @@ -825,6 +842,16 @@ struct GTEST_API_ ConstCharPtr { const char* value; }; +// Helper for declaring std::string within 'if' statement +// in pre C++17 build environment. +struct TrueWithString { + TrueWithString() = default; + explicit TrueWithString(const char* str) : value(str) {} + explicit TrueWithString(const std::string& str) : value(str) {} + explicit operator bool() const { return true; } + std::string value; +}; + // A simple Linear Congruential Generator for generating random // numbers with a uniform distribution. Unlike rand() and srand(), it // doesn't use global state (and therefore can't interfere with user @@ -832,18 +859,18 @@ struct GTEST_API_ ConstCharPtr { // but it's good enough for our purposes. class GTEST_API_ Random { public: - static const UInt32 kMaxRange = 1u << 31; + static const uint32_t kMaxRange = 1u << 31; - explicit Random(UInt32 seed) : state_(seed) {} + explicit Random(uint32_t seed) : state_(seed) {} - void Reseed(UInt32 seed) { state_ = seed; } + void Reseed(uint32_t seed) { state_ = seed; } // Generates a random number from [0, range). Crashes if 'range' is // 0 or greater than kMaxRange. - UInt32 Generate(UInt32 range); + uint32_t Generate(uint32_t range); private: - UInt32 state_; + uint32_t state_; GTEST_DISALLOW_COPY_AND_ASSIGN_(Random); }; @@ -855,8 +882,7 @@ class GTEST_API_ Random { // true if and only if T is type proto2::Message or a subclass of it. template struct IsAProtocolMessage - : public bool_constant< - std::is_convertible::value> {}; + : public std::is_convertible {}; // When the compiler sees expression IsContainerTest(0), if C is an // STL-style container class, the first overload of IsContainerTest @@ -1124,25 +1150,29 @@ struct MakeIndexSequence template <> struct MakeIndexSequence<0> : IndexSequence<> {}; -// FIXME: This implementation of ElemFromList is O(1) in instantiation depth, -// but it is O(N^2) in total instantiations. Not sure if this is the best -// tradeoff, as it will make it somewhat slow to compile. -template -struct ElemFromListImpl {}; - -template -struct ElemFromListImpl { - using type = T; +template +struct Ignore { + Ignore(...); // NOLINT }; -// Get the Nth element from T... -// It uses O(1) instantiation depth. -template -struct ElemFromList; +template +struct ElemFromListImpl; +template +struct ElemFromListImpl> { + // We make Ignore a template to solve a problem with MSVC. + // A non-template Ignore would work fine with `decltype(Ignore(I))...`, but + // MSVC doesn't understand how to deal with that pack expansion. + // Use `0 * I` to have a single instantiation of Ignore. + template + static R Apply(Ignore<0 * I>..., R (*)(), ...); +}; -template -struct ElemFromList, T...> - : ElemFromListImpl... {}; +template +struct ElemFromList { + using type = + decltype(ElemFromListImpl::type>::Apply( + static_cast(nullptr)...)); +}; template class FlatTuple; @@ -1152,9 +1182,7 @@ struct FlatTupleElemBase; template struct FlatTupleElemBase, I> { - using value_type = - typename ElemFromList::type, - T...>::type; + using value_type = typename ElemFromList::type; FlatTupleElemBase() = default; explicit FlatTupleElemBase(value_type t) : value(std::move(t)) {} value_type value; @@ -1174,7 +1202,7 @@ struct FlatTupleBase, IndexSequence> // Analog to std::tuple but with different tradeoffs. // This class minimizes the template instantiation depth, thus allowing more -// elements that std::tuple would. std::tuple has been seen to require an +// elements than std::tuple would. std::tuple has been seen to require an // instantiation depth of more than 10x the number of elements in some // implementations. // FlatTuple and ElemFromList are not recursive and have a fixed depth @@ -1185,19 +1213,20 @@ template class FlatTuple : private FlatTupleBase, typename MakeIndexSequence::type> { - using Indices = typename FlatTuple::FlatTupleBase::Indices; + using Indices = typename FlatTupleBase< + FlatTuple, typename MakeIndexSequence::type>::Indices; public: FlatTuple() = default; explicit FlatTuple(T... t) : FlatTuple::FlatTupleBase(std::move(t)...) {} template - const typename ElemFromList::type& Get() const { + const typename ElemFromList::type& Get() const { return static_cast*>(this)->value; } template - typename ElemFromList::type& Get() { + typename ElemFromList::type& Get() { return static_cast*>(this)->value; } }; @@ -1283,19 +1312,39 @@ constexpr bool InstantiateTypedTestCase_P_IsDeprecated() { return true; } GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__): \ fail(gtest_msg.value) +#if GTEST_HAS_EXCEPTIONS + +#define GTEST_TEST_NO_THROW_CATCH_STD_EXCEPTION_() \ + catch (std::exception const& e) { \ + gtest_msg.value = ( \ + "it throws std::exception-derived exception with description: \"" \ + ); \ + gtest_msg.value += e.what(); \ + gtest_msg.value += "\"."; \ + goto GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__); \ + } + +#else // GTEST_HAS_EXCEPTIONS + +#define GTEST_TEST_NO_THROW_CATCH_STD_EXCEPTION_() + +#endif // GTEST_HAS_EXCEPTIONS + #define GTEST_TEST_NO_THROW_(statement, fail) \ GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ - if (::testing::internal::AlwaysTrue()) { \ + if (::testing::internal::TrueWithString gtest_msg{}) { \ try { \ GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ } \ + GTEST_TEST_NO_THROW_CATCH_STD_EXCEPTION_() \ catch (...) { \ + gtest_msg.value = "it throws."; \ goto GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__); \ } \ } else \ GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__): \ - fail("Expected: " #statement " doesn't throw an exception.\n" \ - " Actual: it throws.") + fail(("Expected: " #statement " doesn't throw an exception.\n" \ + " Actual: " + gtest_msg.value).c_str()) #define GTEST_TEST_ANY_THROW_(statement, fail) \ GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ @@ -1356,12 +1405,15 @@ constexpr bool InstantiateTypedTestCase_P_IsDeprecated() { return true; } : public parent_class { \ public: \ GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)() {} \ + ~GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)() override = default; \ + GTEST_DISALLOW_COPY_AND_ASSIGN_(GTEST_TEST_CLASS_NAME_(test_suite_name, \ + test_name)); \ + GTEST_DISALLOW_MOVE_AND_ASSIGN_(GTEST_TEST_CLASS_NAME_(test_suite_name, \ + test_name)); \ \ private: \ - virtual void TestBody(); \ + void TestBody() override; \ static ::testing::TestInfo* const test_info_ GTEST_ATTRIBUTE_UNUSED_; \ - GTEST_DISALLOW_COPY_AND_ASSIGN_(GTEST_TEST_CLASS_NAME_(test_suite_name, \ - test_name)); \ }; \ \ ::testing::TestInfo* const GTEST_TEST_CLASS_NAME_(test_suite_name, \ diff --git a/third_party/googletest/src/include/gtest/internal/gtest-param-util.h b/third_party/googletest/src/include/gtest/internal/gtest-param-util.h index 97533993c0..7f7a13bf84 100644 --- a/third_party/googletest/src/include/gtest/internal/gtest-param-util.h +++ b/third_party/googletest/src/include/gtest/internal/gtest-param-util.h @@ -42,12 +42,14 @@ #include #include #include +#include #include #include #include "gtest/internal/gtest-internal.h" #include "gtest/internal/gtest-port.h" #include "gtest/gtest-printers.h" +#include "gtest/gtest-test-part.h" namespace testing { // Input to a parameterized test name generator, describing a test parameter. @@ -472,6 +474,17 @@ class ParameterizedTestSuiteInfoBase { GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestSuiteInfoBase); }; +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// +// Report a the name of a test_suit as safe to ignore +// as the side effect of construction of this type. +struct MarkAsIgnored { + explicit MarkAsIgnored(const char* test_suite); +}; + +GTEST_API_ void InsertSyntheticTestCase(const std::string& name, + CodeLocation location, bool has_test_p); + // INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. // // ParameterizedTestSuiteInfo accumulates tests obtained from TEST_P @@ -522,11 +535,13 @@ class ParameterizedTestSuiteInfo : public ParameterizedTestSuiteInfoBase { return 0; // Return value used only to run this method in namespace scope. } // UnitTest class invokes this method to register tests in this test suite - // test suites right before running tests in RUN_ALL_TESTS macro. + // right before running tests in RUN_ALL_TESTS macro. // This method should not be called more than once on any single // instance of a ParameterizedTestSuiteInfoBase derived class. // UnitTest has a guard to prevent from calling this method more than once. void RegisterTests() override { + bool generated_instantiations = false; + for (typename TestInfoContainer::iterator test_it = tests_.begin(); test_it != tests_.end(); ++test_it) { std::shared_ptr test_info = *test_it; @@ -549,6 +564,8 @@ class ParameterizedTestSuiteInfo : public ParameterizedTestSuiteInfoBase { for (typename ParamGenerator::iterator param_it = generator.begin(); param_it != generator.end(); ++param_it, ++i) { + generated_instantiations = true; + Message test_name_stream; std::string param_name = name_func( @@ -580,6 +597,12 @@ class ParameterizedTestSuiteInfo : public ParameterizedTestSuiteInfoBase { } // for param_it } // for gen_it } // for test_it + + if (!generated_instantiations) { + // There are no generaotrs, or they all generate nothing ... + InsertSyntheticTestCase(GetTestSuiteName(), code_location_, + !tests_.empty()); + } } // RegisterTests private: @@ -717,6 +740,34 @@ class ParameterizedTestSuiteRegistry { GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestSuiteRegistry); }; +// Keep track of what type-parameterized test suite are defined and +// where as well as which are intatiated. This allows susequently +// identifying suits that are defined but never used. +class TypeParameterizedTestSuiteRegistry { + public: + // Add a suite definition + void RegisterTestSuite(const char* test_suite_name, + CodeLocation code_location); + + // Add an instantiation of a suit. + void RegisterInstantiation(const char* test_suite_name); + + // For each suit repored as defined but not reported as instantiation, + // emit a test that reports that fact (configurably, as an error). + void CheckForInstantiations(); + + private: + struct TypeParameterizedTestSuiteInfo { + explicit TypeParameterizedTestSuiteInfo(CodeLocation c) + : code_location(c), instantiated(false) {} + + CodeLocation code_location; + bool instantiated; + }; + + std::map suites_; +}; + } // namespace internal // Forward declarations of ValuesIn(), which is implemented in diff --git a/third_party/googletest/src/include/gtest/internal/gtest-port-arch.h b/third_party/googletest/src/include/gtest/internal/gtest-port-arch.h index cece93dba1..d3239b25ba 100644 --- a/third_party/googletest/src/include/gtest/internal/gtest-port-arch.h +++ b/third_party/googletest/src/include/gtest/internal/gtest-port-arch.h @@ -102,6 +102,10 @@ # define GTEST_OS_QNX 1 #elif defined(__HAIKU__) #define GTEST_OS_HAIKU 1 +#elif defined ESP8266 +#define GTEST_OS_ESP8266 1 +#elif defined ESP32 +#define GTEST_OS_ESP32 1 #endif // __CYGWIN__ #endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_ARCH_H_ diff --git a/third_party/googletest/src/include/gtest/internal/gtest-port.h b/third_party/googletest/src/include/gtest/internal/gtest-port.h index 063fcb1083..60ff47164f 100644 --- a/third_party/googletest/src/include/gtest/internal/gtest-port.h +++ b/third_party/googletest/src/include/gtest/internal/gtest-port.h @@ -190,13 +190,18 @@ // GTEST_AMBIGUOUS_ELSE_BLOCKER_ - for disabling a gcc warning. // GTEST_ATTRIBUTE_UNUSED_ - declares that a class' instances or a // variable don't have to be used. -// GTEST_DISALLOW_ASSIGN_ - disables operator=. +// GTEST_DISALLOW_ASSIGN_ - disables copy operator=. // GTEST_DISALLOW_COPY_AND_ASSIGN_ - disables copy ctor and operator=. +// GTEST_DISALLOW_MOVE_ASSIGN_ - disables move operator=. +// GTEST_DISALLOW_MOVE_AND_ASSIGN_ - disables move ctor and operator=. // GTEST_MUST_USE_RESULT_ - declares that a function's result must be used. // GTEST_INTENTIONAL_CONST_COND_PUSH_ - start code section where MSVC C4127 is // suppressed (constant conditional). // GTEST_INTENTIONAL_CONST_COND_POP_ - finish code section where MSVC C4127 // is suppressed. +// GTEST_INTERNAL_HAS_STRING_VIEW - for enabling Matcher or +// Matcher +// specializations. // // Synchronization: // Mutex, MutexLock, ThreadLocal, GetThreadCount() @@ -223,8 +228,7 @@ // // Integer types: // TypeWithSize - maps an integer to a int type. -// Int32, UInt32, Int64, UInt64, TimeInMillis -// - integers of known sizes. +// TimeInMillis - integers of known sizes. // BiggestInt - the biggest signed integer type. // // Command-line utilities: @@ -235,7 +239,7 @@ // Environment variable utilities: // GetEnv() - gets the value of an environment variable. // BoolFromGTestEnv() - parses a bool environment variable. -// Int32FromGTestEnv() - parses an Int32 environment variable. +// Int32FromGTestEnv() - parses an int32_t environment variable. // StringFromGTestEnv() - parses a string environment variable. // // Deprecation warnings: @@ -248,7 +252,8 @@ #include #include #include -#include +#include +#include #include #ifndef _WIN32_WCE @@ -261,16 +266,14 @@ # include #endif -#include // NOLINT -#include // NOLINT -#include // NOLINT -#include // NOLINT +#include // NOLINT +#include +#include // NOLINT #include -#include #include // NOLINT -#include "gtest/internal/gtest-port-arch.h" #include "gtest/internal/custom/gtest-port.h" +#include "gtest/internal/gtest-port-arch.h" #if !defined(GTEST_DEV_EMAIL_) # define GTEST_DEV_EMAIL_ "googletestframework@@googlegroups.com" @@ -441,15 +444,6 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION; # endif // defined(_MSC_VER) || defined(__BORLANDC__) #endif // GTEST_HAS_EXCEPTIONS -#if !defined(GTEST_HAS_STD_STRING) -// Even though we don't use this macro any longer, we keep it in case -// some clients still depend on it. -# define GTEST_HAS_STD_STRING 1 -#elif !GTEST_HAS_STD_STRING -// The user told us that ::std::string isn't available. -# error "::std::string isn't available." -#endif // !defined(GTEST_HAS_STD_STRING) - #ifndef GTEST_HAS_STD_WSTRING // The user didn't tell us whether ::std::wstring is available, so we need // to figure it out. @@ -458,7 +452,7 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION; // no support for it at least as recent as Froyo (2.2). #define GTEST_HAS_STD_WSTRING \ (!(GTEST_OS_LINUX_ANDROID || GTEST_OS_CYGWIN || GTEST_OS_SOLARIS || \ - GTEST_OS_HAIKU)) + GTEST_OS_HAIKU || GTEST_OS_ESP32 || GTEST_OS_ESP8266)) #endif // GTEST_HAS_STD_WSTRING @@ -582,7 +576,8 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION; #ifndef GTEST_HAS_STREAM_REDIRECTION // By default, we assume that stream redirection is supported on all // platforms except known mobile ones. -# if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_PHONE || GTEST_OS_WINDOWS_RT +#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_PHONE || \ + GTEST_OS_WINDOWS_RT || GTEST_OS_ESP8266 # define GTEST_HAS_STREAM_REDIRECTION 0 # else # define GTEST_HAS_STREAM_REDIRECTION 1 @@ -676,10 +671,10 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION; #endif -// A macro to disallow operator= +// A macro to disallow copy operator= // This should be used in the private: declarations for a class. #define GTEST_DISALLOW_ASSIGN_(type) \ - void operator=(type const &) = delete + type& operator=(type const &) = delete // A macro to disallow copy constructor and operator= // This should be used in the private: declarations for a class. @@ -687,6 +682,17 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION; type(type const &) = delete; \ GTEST_DISALLOW_ASSIGN_(type) +// A macro to disallow move operator= +// This should be used in the private: declarations for a class. +#define GTEST_DISALLOW_MOVE_ASSIGN_(type) \ + type& operator=(type &&) noexcept = delete + +// A macro to disallow move constructor and operator= +// This should be used in the private: declarations for a class. +#define GTEST_DISALLOW_MOVE_AND_ASSIGN_(type) \ + type(type &&) noexcept = delete; \ + GTEST_DISALLOW_MOVE_ASSIGN_(type) + // Tell the compiler to warn about unused return values for functions declared // with this macro. The macro should be used on function declarations // following the argument list: @@ -856,9 +862,6 @@ class Secret; // expression is false, compiler will issue an error containing this identifier. #define GTEST_COMPILE_ASSERT_(expr, msg) static_assert(expr, #msg) -// Evaluates to the number of elements in 'array'. -#define GTEST_ARRAY_SIZE_(array) (sizeof(array) / sizeof(array[0])) - // A helper for suppressing warnings on constant condition. It just // returns 'condition'. GTEST_API_ bool IsTrue(bool condition); @@ -1599,7 +1602,7 @@ class ThreadLocal : public ThreadLocalBase { class DefaultValueHolderFactory : public ValueHolderFactory { public: DefaultValueHolderFactory() {} - virtual ValueHolder* MakeNewHolder() const { return new ValueHolder(); } + ValueHolder* MakeNewHolder() const override { return new ValueHolder(); } private: GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultValueHolderFactory); @@ -1608,7 +1611,7 @@ class ThreadLocal : public ThreadLocalBase { class InstanceValueHolderFactory : public ValueHolderFactory { public: explicit InstanceValueHolderFactory(const T& value) : value_(value) {} - virtual ValueHolder* MakeNewHolder() const { + ValueHolder* MakeNewHolder() const override { return new ValueHolder(value_); } @@ -1808,7 +1811,7 @@ class GTEST_API_ ThreadLocal { class DefaultValueHolderFactory : public ValueHolderFactory { public: DefaultValueHolderFactory() {} - virtual ValueHolder* MakeNewHolder() const { return new ValueHolder(); } + ValueHolder* MakeNewHolder() const override { return new ValueHolder(); } private: GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultValueHolderFactory); @@ -1817,7 +1820,7 @@ class GTEST_API_ ThreadLocal { class InstanceValueHolderFactory : public ValueHolderFactory { public: explicit InstanceValueHolderFactory(const T& value) : value_(value) {} - virtual ValueHolder* MakeNewHolder() const { + ValueHolder* MakeNewHolder() const override { return new ValueHolder(value_); } @@ -1887,18 +1890,12 @@ class GTEST_API_ ThreadLocal { // we cannot detect it. GTEST_API_ size_t GetThreadCount(); -template -using bool_constant = std::integral_constant; - #if GTEST_OS_WINDOWS # define GTEST_PATH_SEP_ "\\" # define GTEST_HAS_ALT_PATH_SEP_ 1 -// The biggest signed integer type the compiler supports. -typedef __int64 BiggestInt; #else # define GTEST_PATH_SEP_ "/" # define GTEST_HAS_ALT_PATH_SEP_ 0 -typedef long long BiggestInt; // NOLINT #endif // GTEST_OS_WINDOWS // Utilities for char. @@ -1993,6 +1990,22 @@ inline bool IsDir(const StatStruct& st) { } # endif // GTEST_OS_WINDOWS_MOBILE +#elif GTEST_OS_ESP8266 +typedef struct stat StatStruct; + +inline int FileNo(FILE* file) { return fileno(file); } +inline int IsATTY(int fd) { return isatty(fd); } +inline int Stat(const char* path, StatStruct* buf) { + // stat function not implemented on ESP8266 + return 0; +} +inline int StrCaseCmp(const char* s1, const char* s2) { + return strcasecmp(s1, s2); +} +inline char* StrDup(const char* src) { return strdup(src); } +inline int RmDir(const char* dir) { return rmdir(dir); } +inline bool IsDir(const StatStruct& st) { return S_ISDIR(st.st_mode); } + #else typedef struct stat StatStruct; @@ -2013,10 +2026,6 @@ inline bool IsDir(const StatStruct& st) { return S_ISDIR(st.st_mode); } GTEST_DISABLE_MSC_DEPRECATED_PUSH_() -inline const char* StrNCpy(char* dest, const char* src, size_t n) { - return strncpy(dest, src, n); -} - // ChDir(), FReopen(), FDOpen(), Read(), Write(), Close(), and // StrError() aren't needed on Windows CE at this time and thus not // defined there. @@ -2045,8 +2054,9 @@ inline int Close(int fd) { return close(fd); } inline const char* StrError(int errnum) { return strerror(errnum); } #endif inline const char* GetEnv(const char* name) { -#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_PHONE || GTEST_OS_WINDOWS_RT - // We are on Windows CE, which has no environment variables. +#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_PHONE || \ + GTEST_OS_WINDOWS_RT || GTEST_OS_ESP8266 + // We are on an embedded platform, which has no environment variables. static_cast(name); // To prevent 'unused argument' warning. return nullptr; #elif defined(__BORLANDC__) || defined(__SunOS_5_8) || defined(__SunOS_5_9) @@ -2088,15 +2098,13 @@ GTEST_DISABLE_MSC_DEPRECATED_POP_() # define GTEST_SNPRINTF_ snprintf #endif -// The maximum number a BiggestInt can represent. This definition -// works no matter BiggestInt is represented in one's complement or -// two's complement. +// The biggest signed integer type the compiler supports. // -// We cannot rely on numeric_limits in STL, as __int64 and long long -// are not part of standard C++ and numeric_limits doesn't need to be -// defined for them. -const BiggestInt kMaxBiggestInt = - ~(static_cast(1) << (8*sizeof(BiggestInt) - 1)); +// long long is guaranteed to be at least 64-bits in C++11. +using BiggestInt = long long; // NOLINT + +// The maximum number a BiggestInt can represent. +constexpr BiggestInt kMaxBiggestInt = (std::numeric_limits::max)(); // This template class serves as a compile-time function from size to // type. It maps a size in bytes to a primitive type with that @@ -2121,40 +2129,27 @@ class TypeWithSize { public: // This prevents the user from using TypeWithSize with incorrect // values of N. - typedef void UInt; + using UInt = void; }; // The specialization for size 4. template <> class TypeWithSize<4> { public: - // unsigned int has size 4 in both gcc and MSVC. - // - // As base/basictypes.h doesn't compile on Windows, we cannot use - // uint32, uint64, and etc here. - typedef int Int; - typedef unsigned int UInt; + using Int = std::int32_t; + using UInt = std::uint32_t; }; // The specialization for size 8. template <> class TypeWithSize<8> { public: -#if GTEST_OS_WINDOWS - typedef __int64 Int; - typedef unsigned __int64 UInt; -#else - typedef long long Int; // NOLINT - typedef unsigned long long UInt; // NOLINT -#endif // GTEST_OS_WINDOWS + using Int = std::int64_t; + using UInt = std::uint64_t; }; // Integer types of known sizes. -typedef TypeWithSize<4>::Int Int32; -typedef TypeWithSize<4>::UInt UInt32; -typedef TypeWithSize<8>::Int Int64; -typedef TypeWithSize<8>::UInt UInt64; -typedef TypeWithSize<8>::Int TimeInMillis; // Represents time in milliseconds. +using TimeInMillis = int64_t; // Represents time in milliseconds. // Utilities for command line flags and environment variables. @@ -2173,7 +2168,7 @@ typedef TypeWithSize<8>::Int TimeInMillis; // Represents time in milliseconds. // Macros for declaring flags. # define GTEST_DECLARE_bool_(name) GTEST_API_ extern bool GTEST_FLAG(name) # define GTEST_DECLARE_int32_(name) \ - GTEST_API_ extern ::testing::internal::Int32 GTEST_FLAG(name) + GTEST_API_ extern std::int32_t GTEST_FLAG(name) # define GTEST_DECLARE_string_(name) \ GTEST_API_ extern ::std::string GTEST_FLAG(name) @@ -2181,7 +2176,7 @@ typedef TypeWithSize<8>::Int TimeInMillis; // Represents time in milliseconds. # define GTEST_DEFINE_bool_(name, default_val, doc) \ GTEST_API_ bool GTEST_FLAG(name) = (default_val) # define GTEST_DEFINE_int32_(name, default_val, doc) \ - GTEST_API_ ::testing::internal::Int32 GTEST_FLAG(name) = (default_val) + GTEST_API_ std::int32_t GTEST_FLAG(name) = (default_val) # define GTEST_DEFINE_string_(name, default_val, doc) \ GTEST_API_ ::std::string GTEST_FLAG(name) = (default_val) @@ -2196,12 +2191,12 @@ typedef TypeWithSize<8>::Int TimeInMillis; // Represents time in milliseconds. // Parses 'str' for a 32-bit signed integer. If successful, writes the result // to *value and returns true; otherwise leaves *value unchanged and returns // false. -bool ParseInt32(const Message& src_text, const char* str, Int32* value); +bool ParseInt32(const Message& src_text, const char* str, int32_t* value); -// Parses a bool/Int32/string from the environment variable +// Parses a bool/int32_t/string from the environment variable // corresponding to the given Google Test flag. bool BoolFromGTestEnv(const char* flag, bool default_val); -GTEST_API_ Int32 Int32FromGTestEnv(const char* flag, Int32 default_val); +GTEST_API_ int32_t Int32FromGTestEnv(const char* flag, int32_t default_val); std::string OutputFlagAlsoCheckEnvVar(); const char* StringFromGTestEnv(const char* flag, const char* default_val); @@ -2228,4 +2223,32 @@ const char* StringFromGTestEnv(const char* flag, const char* default_val); #endif // !defined(GTEST_INTERNAL_DEPRECATED) +#if GTEST_HAS_ABSL +// Always use absl::string_view for Matcher<> specializations if googletest +// is built with absl support. +# define GTEST_INTERNAL_HAS_STRING_VIEW 1 +#include "absl/strings/string_view.h" +namespace testing { +namespace internal { +using StringView = ::absl::string_view; +} // namespace internal +} // namespace testing +#else +# ifdef __has_include +# if __has_include() && __cplusplus >= 201703L +// Otherwise for C++17 and higher use std::string_view for Matcher<> +// specializations. +# define GTEST_INTERNAL_HAS_STRING_VIEW 1 +#include +namespace testing { +namespace internal { +using StringView = ::std::string_view; +} // namespace internal +} // namespace testing +// The case where absl is configured NOT to alias std::string_view is not +// supported. +# endif // __has_include() && __cplusplus >= 201703L +# endif // __has_include +#endif // GTEST_HAS_ABSL + #endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_ diff --git a/third_party/googletest/src/include/gtest/internal/gtest-string.h b/third_party/googletest/src/include/gtest/internal/gtest-string.h index 82aaa63bf4..0b2a91a5dc 100644 --- a/third_party/googletest/src/include/gtest/internal/gtest-string.h +++ b/third_party/googletest/src/include/gtest/internal/gtest-string.h @@ -47,6 +47,7 @@ #endif #include +#include #include #include "gtest/internal/gtest-port.h" @@ -152,7 +153,7 @@ class GTEST_API_ String { static std::string FormatHexInt(int value); // Formats an int value as "%X". - static std::string FormatHexUInt32(UInt32 value); + static std::string FormatHexUInt32(uint32_t value); // Formats a byte as "%02X". static std::string FormatByte(unsigned char value); diff --git a/third_party/googletest/src/include/gtest/internal/gtest-type-util.h b/third_party/googletest/src/include/gtest/internal/gtest-type-util.h index 3d7542d1fb..082fdad12c 100644 --- a/third_party/googletest/src/include/gtest/internal/gtest-type-util.h +++ b/third_party/googletest/src/include/gtest/internal/gtest-type-util.h @@ -1,7 +1,3 @@ -// This file was GENERATED by command: -// pump.py gtest-type-util.h.pump -// DO NOT EDIT BY HAND!!! - // Copyright 2008 Google Inc. // All Rights Reserved. // @@ -32,12 +28,7 @@ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // Type utilities needed for implementing typed and type-parameterized -// tests. This file is generated by a SCRIPT. DO NOT EDIT BY HAND! -// -// Currently we support at most 50 types in a list, and at most 50 -// type-parameterized tests in one type-parameterized test suite. -// Please contact googletestframework@googlegroups.com if you need -// more. +// tests. // GOOGLETEST_CM0001 DO NOT DELETE @@ -105,1524 +96,9 @@ std::string GetTypeName() { #if GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P -// A unique type used as the default value for the arguments of class -// template Types. This allows us to simulate variadic templates -// (e.g. Types, Type, and etc), which C++ doesn't -// support directly. +// A unique type indicating an empty node struct None {}; -// The following family of struct and struct templates are used to -// represent type lists. In particular, TypesN -// represents a type list with N types (T1, T2, ..., and TN) in it. -// Except for Types0, every struct in the family has two member types: -// Head for the first type in the list, and Tail for the rest of the -// list. - -// The empty type list. -struct Types0 {}; - -// Type lists of length 1, 2, 3, and so on. - -template -struct Types1 { - typedef T1 Head; - typedef Types0 Tail; -}; -template -struct Types2 { - typedef T1 Head; - typedef Types1 Tail; -}; - -template -struct Types3 { - typedef T1 Head; - typedef Types2 Tail; -}; - -template -struct Types4 { - typedef T1 Head; - typedef Types3 Tail; -}; - -template -struct Types5 { - typedef T1 Head; - typedef Types4 Tail; -}; - -template -struct Types6 { - typedef T1 Head; - typedef Types5 Tail; -}; - -template -struct Types7 { - typedef T1 Head; - typedef Types6 Tail; -}; - -template -struct Types8 { - typedef T1 Head; - typedef Types7 Tail; -}; - -template -struct Types9 { - typedef T1 Head; - typedef Types8 Tail; -}; - -template -struct Types10 { - typedef T1 Head; - typedef Types9 Tail; -}; - -template -struct Types11 { - typedef T1 Head; - typedef Types10 Tail; -}; - -template -struct Types12 { - typedef T1 Head; - typedef Types11 Tail; -}; - -template -struct Types13 { - typedef T1 Head; - typedef Types12 Tail; -}; - -template -struct Types14 { - typedef T1 Head; - typedef Types13 Tail; -}; - -template -struct Types15 { - typedef T1 Head; - typedef Types14 Tail; -}; - -template -struct Types16 { - typedef T1 Head; - typedef Types15 Tail; -}; - -template -struct Types17 { - typedef T1 Head; - typedef Types16 Tail; -}; - -template -struct Types18 { - typedef T1 Head; - typedef Types17 Tail; -}; - -template -struct Types19 { - typedef T1 Head; - typedef Types18 Tail; -}; - -template -struct Types20 { - typedef T1 Head; - typedef Types19 Tail; -}; - -template -struct Types21 { - typedef T1 Head; - typedef Types20 Tail; -}; - -template -struct Types22 { - typedef T1 Head; - typedef Types21 Tail; -}; - -template -struct Types23 { - typedef T1 Head; - typedef Types22 Tail; -}; - -template -struct Types24 { - typedef T1 Head; - typedef Types23 Tail; -}; - -template -struct Types25 { - typedef T1 Head; - typedef Types24 Tail; -}; - -template -struct Types26 { - typedef T1 Head; - typedef Types25 Tail; -}; - -template -struct Types27 { - typedef T1 Head; - typedef Types26 Tail; -}; - -template -struct Types28 { - typedef T1 Head; - typedef Types27 Tail; -}; - -template -struct Types29 { - typedef T1 Head; - typedef Types28 Tail; -}; - -template -struct Types30 { - typedef T1 Head; - typedef Types29 Tail; -}; - -template -struct Types31 { - typedef T1 Head; - typedef Types30 Tail; -}; - -template -struct Types32 { - typedef T1 Head; - typedef Types31 Tail; -}; - -template -struct Types33 { - typedef T1 Head; - typedef Types32 Tail; -}; - -template -struct Types34 { - typedef T1 Head; - typedef Types33 Tail; -}; - -template -struct Types35 { - typedef T1 Head; - typedef Types34 Tail; -}; - -template -struct Types36 { - typedef T1 Head; - typedef Types35 Tail; -}; - -template -struct Types37 { - typedef T1 Head; - typedef Types36 Tail; -}; - -template -struct Types38 { - typedef T1 Head; - typedef Types37 Tail; -}; - -template -struct Types39 { - typedef T1 Head; - typedef Types38 Tail; -}; - -template -struct Types40 { - typedef T1 Head; - typedef Types39 Tail; -}; - -template -struct Types41 { - typedef T1 Head; - typedef Types40 Tail; -}; - -template -struct Types42 { - typedef T1 Head; - typedef Types41 Tail; -}; - -template -struct Types43 { - typedef T1 Head; - typedef Types42 Tail; -}; - -template -struct Types44 { - typedef T1 Head; - typedef Types43 Tail; -}; - -template -struct Types45 { - typedef T1 Head; - typedef Types44 Tail; -}; - -template -struct Types46 { - typedef T1 Head; - typedef Types45 Tail; -}; - -template -struct Types47 { - typedef T1 Head; - typedef Types46 Tail; -}; - -template -struct Types48 { - typedef T1 Head; - typedef Types47 Tail; -}; - -template -struct Types49 { - typedef T1 Head; - typedef Types48 Tail; -}; - -template -struct Types50 { - typedef T1 Head; - typedef Types49 Tail; -}; - - -} // namespace internal - -// We don't want to require the users to write TypesN<...> directly, -// as that would require them to count the length. Types<...> is much -// easier to write, but generates horrible messages when there is a -// compiler error, as gcc insists on printing out each template -// argument, even if it has the default value (this means Types -// will appear as Types in the compiler -// errors). -// -// Our solution is to combine the best part of the two approaches: a -// user would write Types, and Google Test will translate -// that to TypesN internally to make error messages -// readable. The translation is done by the 'type' member of the -// Types template. -template -struct Types { - typedef internal::Types50 type; -}; - -template <> -struct Types { - typedef internal::Types0 type; -}; -template -struct Types { - typedef internal::Types1 type; -}; -template -struct Types { - typedef internal::Types2 type; -}; -template -struct Types { - typedef internal::Types3 type; -}; -template -struct Types { - typedef internal::Types4 type; -}; -template -struct Types { - typedef internal::Types5 type; -}; -template -struct Types { - typedef internal::Types6 type; -}; -template -struct Types { - typedef internal::Types7 type; -}; -template -struct Types { - typedef internal::Types8 type; -}; -template -struct Types { - typedef internal::Types9 type; -}; -template -struct Types { - typedef internal::Types10 type; -}; -template -struct Types { - typedef internal::Types11 type; -}; -template -struct Types { - typedef internal::Types12 type; -}; -template -struct Types { - typedef internal::Types13 type; -}; -template -struct Types { - typedef internal::Types14 type; -}; -template -struct Types { - typedef internal::Types15 type; -}; -template -struct Types { - typedef internal::Types16 type; -}; -template -struct Types { - typedef internal::Types17 type; -}; -template -struct Types { - typedef internal::Types18 type; -}; -template -struct Types { - typedef internal::Types19 type; -}; -template -struct Types { - typedef internal::Types20 type; -}; -template -struct Types { - typedef internal::Types21 type; -}; -template -struct Types { - typedef internal::Types22 type; -}; -template -struct Types { - typedef internal::Types23 type; -}; -template -struct Types { - typedef internal::Types24 type; -}; -template -struct Types { - typedef internal::Types25 type; -}; -template -struct Types { - typedef internal::Types26 type; -}; -template -struct Types { - typedef internal::Types27 type; -}; -template -struct Types { - typedef internal::Types28 type; -}; -template -struct Types { - typedef internal::Types29 type; -}; -template -struct Types { - typedef internal::Types30 type; -}; -template -struct Types { - typedef internal::Types31 type; -}; -template -struct Types { - typedef internal::Types32 type; -}; -template -struct Types { - typedef internal::Types33 type; -}; -template -struct Types { - typedef internal::Types34 type; -}; -template -struct Types { - typedef internal::Types35 type; -}; -template -struct Types { - typedef internal::Types36 type; -}; -template -struct Types { - typedef internal::Types37 type; -}; -template -struct Types { - typedef internal::Types38 type; -}; -template -struct Types { - typedef internal::Types39 type; -}; -template -struct Types { - typedef internal::Types40 type; -}; -template -struct Types { - typedef internal::Types41 type; -}; -template -struct Types { - typedef internal::Types42 type; -}; -template -struct Types { - typedef internal::Types43 type; -}; -template -struct Types { - typedef internal::Types44 type; -}; -template -struct Types { - typedef internal::Types45 type; -}; -template -struct Types { - typedef internal::Types46 type; -}; -template -struct Types { - typedef internal::Types47 type; -}; -template -struct Types { - typedef internal::Types48 type; -}; -template -struct Types { - typedef internal::Types49 type; -}; - -namespace internal { - # define GTEST_TEMPLATE_ template class // The template "selector" struct TemplateSel is used to @@ -1644,1692 +120,64 @@ struct TemplateSel { # define GTEST_BIND_(TmplSel, T) \ TmplSel::template Bind::type -// A unique struct template used as the default value for the -// arguments of class template Templates. This allows us to simulate -// variadic templates (e.g. Templates, Templates, -// and etc), which C++ doesn't support directly. -template -struct NoneT {}; - -// The following family of struct and struct templates are used to -// represent template lists. In particular, TemplatesN represents a list of N templates (T1, T2, ..., and TN). Except -// for Templates0, every struct in the family has two member types: -// Head for the selector of the first template in the list, and Tail -// for the rest of the list. - -// The empty template list. -struct Templates0 {}; - -// Template lists of length 1, 2, 3, and so on. - -template -struct Templates1 { - typedef TemplateSel Head; - typedef Templates0 Tail; -}; -template -struct Templates2 { - typedef TemplateSel Head; - typedef Templates1 Tail; -}; - -template -struct Templates3 { - typedef TemplateSel Head; - typedef Templates2 Tail; -}; - -template -struct Templates4 { - typedef TemplateSel Head; - typedef Templates3 Tail; -}; - -template -struct Templates5 { - typedef TemplateSel Head; - typedef Templates4 Tail; -}; - -template -struct Templates6 { - typedef TemplateSel Head; - typedef Templates5 Tail; -}; - -template -struct Templates7 { - typedef TemplateSel Head; - typedef Templates6 Tail; -}; - -template -struct Templates8 { - typedef TemplateSel Head; - typedef Templates7 Tail; -}; - -template -struct Templates9 { - typedef TemplateSel Head; - typedef Templates8 Tail; -}; - -template -struct Templates10 { - typedef TemplateSel Head; - typedef Templates9 Tail; -}; - -template -struct Templates11 { - typedef TemplateSel Head; - typedef Templates10 Tail; -}; - -template -struct Templates12 { - typedef TemplateSel Head; - typedef Templates11 Tail; -}; - -template -struct Templates13 { - typedef TemplateSel Head; - typedef Templates12 Tail; -}; - -template -struct Templates14 { - typedef TemplateSel Head; - typedef Templates13 Tail; -}; - -template -struct Templates15 { - typedef TemplateSel Head; - typedef Templates14 Tail; -}; - -template -struct Templates16 { - typedef TemplateSel Head; - typedef Templates15 Tail; -}; - -template -struct Templates17 { - typedef TemplateSel Head; - typedef Templates16 Tail; -}; - -template -struct Templates18 { - typedef TemplateSel Head; - typedef Templates17 Tail; -}; - -template -struct Templates19 { - typedef TemplateSel Head; - typedef Templates18 Tail; -}; - -template -struct Templates20 { - typedef TemplateSel Head; - typedef Templates19 Tail; -}; - -template -struct Templates21 { - typedef TemplateSel Head; - typedef Templates20 Tail; -}; - -template -struct Templates22 { - typedef TemplateSel Head; - typedef Templates21 Tail; -}; - -template -struct Templates23 { - typedef TemplateSel Head; - typedef Templates22 Tail; -}; - -template -struct Templates24 { - typedef TemplateSel Head; - typedef Templates23 Tail; -}; - -template -struct Templates25 { - typedef TemplateSel Head; - typedef Templates24 Tail; -}; - -template -struct Templates26 { - typedef TemplateSel Head; - typedef Templates25 Tail; -}; - -template -struct Templates27 { - typedef TemplateSel Head; - typedef Templates26 Tail; -}; - -template -struct Templates28 { - typedef TemplateSel Head; - typedef Templates27 Tail; -}; - -template -struct Templates29 { - typedef TemplateSel Head; - typedef Templates28 Tail; -}; - -template -struct Templates30 { - typedef TemplateSel Head; - typedef Templates29 Tail; -}; - -template -struct Templates31 { - typedef TemplateSel Head; - typedef Templates30 Tail; -}; - -template -struct Templates32 { - typedef TemplateSel Head; - typedef Templates31 Tail; -}; - -template -struct Templates33 { - typedef TemplateSel Head; - typedef Templates32 Tail; -}; - -template -struct Templates34 { - typedef TemplateSel Head; - typedef Templates33 Tail; -}; - -template -struct Templates35 { - typedef TemplateSel Head; - typedef Templates34 Tail; -}; - -template -struct Templates36 { - typedef TemplateSel Head; - typedef Templates35 Tail; -}; - -template -struct Templates37 { - typedef TemplateSel Head; - typedef Templates36 Tail; -}; - -template -struct Templates38 { - typedef TemplateSel Head; - typedef Templates37 Tail; -}; - -template -struct Templates39 { - typedef TemplateSel Head; - typedef Templates38 Tail; -}; - -template -struct Templates40 { - typedef TemplateSel Head; - typedef Templates39 Tail; -}; - -template -struct Templates41 { - typedef TemplateSel Head; - typedef Templates40 Tail; -}; - -template -struct Templates42 { - typedef TemplateSel Head; - typedef Templates41 Tail; -}; - -template -struct Templates43 { - typedef TemplateSel Head; - typedef Templates42 Tail; -}; - -template -struct Templates44 { - typedef TemplateSel Head; - typedef Templates43 Tail; -}; - -template -struct Templates45 { - typedef TemplateSel Head; - typedef Templates44 Tail; -}; - -template -struct Templates46 { - typedef TemplateSel Head; - typedef Templates45 Tail; +template +struct Templates { + using Head = TemplateSel; + using Tail = Templates; }; -template -struct Templates47 { - typedef TemplateSel Head; - typedef Templates46 Tail; +template +struct Templates { + using Head = TemplateSel; + using Tail = None; }; -template -struct Templates48 { - typedef TemplateSel Head; - typedef Templates47 Tail; +// Tuple-like type lists +template +struct Types { + using Head = Head_; + using Tail = Types; }; -template -struct Templates49 { - typedef TemplateSel Head; - typedef Templates48 Tail; +template +struct Types { + using Head = Head_; + using Tail = None; }; -template -struct Templates50 { - typedef TemplateSel Head; - typedef Templates49 Tail; +// Helper metafunctions to tell apart a single type from types +// generated by ::testing::Types +template +struct ProxyTypeList { + using type = Types; }; +template +struct is_proxy_type_list : std::false_type {}; -// We don't want to require the users to write TemplatesN<...> directly, -// as that would require them to count the length. Templates<...> is much -// easier to write, but generates horrible messages when there is a -// compiler error, as gcc insists on printing out each template -// argument, even if it has the default value (this means Templates -// will appear as Templates in the compiler -// errors). -// -// Our solution is to combine the best part of the two approaches: a -// user would write Templates, and Google Test will translate -// that to TemplatesN internally to make error messages -// readable. The translation is done by the 'type' member of the -// Templates template. -template -struct Templates { - typedef Templates50 type; -}; - -template <> -struct Templates { - typedef Templates0 type; -}; -template -struct Templates { - typedef Templates1 type; -}; -template -struct Templates { - typedef Templates2 type; -}; -template -struct Templates { - typedef Templates3 type; -}; -template -struct Templates { - typedef Templates4 type; -}; -template -struct Templates { - typedef Templates5 type; -}; -template -struct Templates { - typedef Templates6 type; -}; -template -struct Templates { - typedef Templates7 type; -}; -template -struct Templates { - typedef Templates8 type; -}; -template -struct Templates { - typedef Templates9 type; -}; -template -struct Templates { - typedef Templates10 type; -}; -template -struct Templates { - typedef Templates11 type; -}; -template -struct Templates { - typedef Templates12 type; -}; -template -struct Templates { - typedef Templates13 type; -}; -template -struct Templates { - typedef Templates14 type; -}; -template -struct Templates { - typedef Templates15 type; -}; -template -struct Templates { - typedef Templates16 type; -}; -template -struct Templates { - typedef Templates17 type; -}; -template -struct Templates { - typedef Templates18 type; -}; -template -struct Templates { - typedef Templates19 type; -}; -template -struct Templates { - typedef Templates20 type; -}; -template -struct Templates { - typedef Templates21 type; -}; -template -struct Templates { - typedef Templates22 type; -}; -template -struct Templates { - typedef Templates23 type; -}; -template -struct Templates { - typedef Templates24 type; -}; -template -struct Templates { - typedef Templates25 type; -}; -template -struct Templates { - typedef Templates26 type; -}; -template -struct Templates { - typedef Templates27 type; -}; -template -struct Templates { - typedef Templates28 type; -}; -template -struct Templates { - typedef Templates29 type; -}; -template -struct Templates { - typedef Templates30 type; -}; -template -struct Templates { - typedef Templates31 type; -}; -template -struct Templates { - typedef Templates32 type; -}; -template -struct Templates { - typedef Templates33 type; -}; -template -struct Templates { - typedef Templates34 type; -}; -template -struct Templates { - typedef Templates35 type; -}; -template -struct Templates { - typedef Templates36 type; -}; -template -struct Templates { - typedef Templates37 type; -}; -template -struct Templates { - typedef Templates38 type; -}; -template -struct Templates { - typedef Templates39 type; -}; -template -struct Templates { - typedef Templates40 type; -}; -template -struct Templates { - typedef Templates41 type; -}; -template -struct Templates { - typedef Templates42 type; -}; -template -struct Templates { - typedef Templates43 type; -}; -template -struct Templates { - typedef Templates44 type; -}; -template -struct Templates { - typedef Templates45 type; -}; -template -struct Templates { - typedef Templates46 type; -}; -template -struct Templates { - typedef Templates47 type; -}; -template -struct Templates { - typedef Templates48 type; -}; -template -struct Templates { - typedef Templates49 type; -}; - -// The TypeList template makes it possible to use either a single type -// or a Types<...> list in TYPED_TEST_SUITE() and -// INSTANTIATE_TYPED_TEST_SUITE_P(). +template +struct is_proxy_type_list> : std::true_type {}; +// Generator which conditionally creates type lists. +// It recognizes if a requested type list should be created +// and prevents creating a new type list nested within another one. template -struct TypeList { - typedef Types1 type; -}; +struct GenerateTypeList { + private: + using proxy = typename std::conditional::value, T, + ProxyTypeList>::type; -template -struct TypeList > { - typedef typename Types::type type; + public: + using type = typename proxy::type; }; #endif // GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P } // namespace internal + +template +using Types = internal::ProxyTypeList; + } // namespace testing #endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_ diff --git a/third_party/googletest/src/include/gtest/internal/gtest-type-util.h.pump b/third_party/googletest/src/include/gtest/internal/gtest-type-util.h.pump deleted file mode 100644 index 5e31b7b320..0000000000 --- a/third_party/googletest/src/include/gtest/internal/gtest-type-util.h.pump +++ /dev/null @@ -1,302 +0,0 @@ -$$ -*- mode: c++; -*- -$var n = 50 $$ Maximum length of type lists we want to support. -// Copyright 2008 Google Inc. -// All Rights Reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - -// Type utilities needed for implementing typed and type-parameterized -// tests. This file is generated by a SCRIPT. DO NOT EDIT BY HAND! -// -// Currently we support at most $n types in a list, and at most $n -// type-parameterized tests in one type-parameterized test suite. -// Please contact googletestframework@googlegroups.com if you need -// more. - -// GOOGLETEST_CM0001 DO NOT DELETE - -#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_ -#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_ - -#include "gtest/internal/gtest-port.h" - -// #ifdef __GNUC__ is too general here. It is possible to use gcc without using -// libstdc++ (which is where cxxabi.h comes from). -# if GTEST_HAS_CXXABI_H_ -# include -# elif defined(__HP_aCC) -# include -# endif // GTEST_HASH_CXXABI_H_ - -namespace testing { -namespace internal { - -// Canonicalizes a given name with respect to the Standard C++ Library. -// This handles removing the inline namespace within `std` that is -// used by various standard libraries (e.g., `std::__1`). Names outside -// of namespace std are returned unmodified. -inline std::string CanonicalizeForStdLibVersioning(std::string s) { - static const char prefix[] = "std::__"; - if (s.compare(0, strlen(prefix), prefix) == 0) { - std::string::size_type end = s.find("::", strlen(prefix)); - if (end != s.npos) { - // Erase everything between the initial `std` and the second `::`. - s.erase(strlen("std"), end - strlen("std")); - } - } - return s; -} - -// GetTypeName() returns a human-readable name of type T. -// NB: This function is also used in Google Mock, so don't move it inside of -// the typed-test-only section below. -template -std::string GetTypeName() { -# if GTEST_HAS_RTTI - - const char* const name = typeid(T).name(); -# if GTEST_HAS_CXXABI_H_ || defined(__HP_aCC) - int status = 0; - // gcc's implementation of typeid(T).name() mangles the type name, - // so we have to demangle it. -# if GTEST_HAS_CXXABI_H_ - using abi::__cxa_demangle; -# endif // GTEST_HAS_CXXABI_H_ - char* const readable_name = __cxa_demangle(name, nullptr, nullptr, &status); - const std::string name_str(status == 0 ? readable_name : name); - free(readable_name); - return CanonicalizeForStdLibVersioning(name_str); -# else - return name; -# endif // GTEST_HAS_CXXABI_H_ || __HP_aCC - -# else - - return ""; - -# endif // GTEST_HAS_RTTI -} - -#if GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P - -// A unique type used as the default value for the arguments of class -// template Types. This allows us to simulate variadic templates -// (e.g. Types, Type, and etc), which C++ doesn't -// support directly. -struct None {}; - -// The following family of struct and struct templates are used to -// represent type lists. In particular, TypesN -// represents a type list with N types (T1, T2, ..., and TN) in it. -// Except for Types0, every struct in the family has two member types: -// Head for the first type in the list, and Tail for the rest of the -// list. - -// The empty type list. -struct Types0 {}; - -// Type lists of length 1, 2, 3, and so on. - -template -struct Types1 { - typedef T1 Head; - typedef Types0 Tail; -}; - -$range i 2..n - -$for i [[ -$range j 1..i -$range k 2..i -template <$for j, [[typename T$j]]> -struct Types$i { - typedef T1 Head; - typedef Types$(i-1)<$for k, [[T$k]]> Tail; -}; - - -]] - -} // namespace internal - -// We don't want to require the users to write TypesN<...> directly, -// as that would require them to count the length. Types<...> is much -// easier to write, but generates horrible messages when there is a -// compiler error, as gcc insists on printing out each template -// argument, even if it has the default value (this means Types -// will appear as Types in the compiler -// errors). -// -// Our solution is to combine the best part of the two approaches: a -// user would write Types, and Google Test will translate -// that to TypesN internally to make error messages -// readable. The translation is done by the 'type' member of the -// Types template. - -$range i 1..n -template <$for i, [[typename T$i = internal::None]]> -struct Types { - typedef internal::Types$n<$for i, [[T$i]]> type; -}; - -template <> -struct Types<$for i, [[internal::None]]> { - typedef internal::Types0 type; -}; - -$range i 1..n-1 -$for i [[ -$range j 1..i -$range k i+1..n -template <$for j, [[typename T$j]]> -struct Types<$for j, [[T$j]]$for k[[, internal::None]]> { - typedef internal::Types$i<$for j, [[T$j]]> type; -}; - -]] - -namespace internal { - -# define GTEST_TEMPLATE_ template class - -// The template "selector" struct TemplateSel is used to -// represent Tmpl, which must be a class template with one type -// parameter, as a type. TemplateSel::Bind::type is defined -// as the type Tmpl. This allows us to actually instantiate the -// template "selected" by TemplateSel. -// -// This trick is necessary for simulating typedef for class templates, -// which C++ doesn't support directly. -template -struct TemplateSel { - template - struct Bind { - typedef Tmpl type; - }; -}; - -# define GTEST_BIND_(TmplSel, T) \ - TmplSel::template Bind::type - -// A unique struct template used as the default value for the -// arguments of class template Templates. This allows us to simulate -// variadic templates (e.g. Templates, Templates, -// and etc), which C++ doesn't support directly. -template -struct NoneT {}; - -// The following family of struct and struct templates are used to -// represent template lists. In particular, TemplatesN represents a list of N templates (T1, T2, ..., and TN). Except -// for Templates0, every struct in the family has two member types: -// Head for the selector of the first template in the list, and Tail -// for the rest of the list. - -// The empty template list. -struct Templates0 {}; - -// Template lists of length 1, 2, 3, and so on. - -template -struct Templates1 { - typedef TemplateSel Head; - typedef Templates0 Tail; -}; - -$range i 2..n - -$for i [[ -$range j 1..i -$range k 2..i -template <$for j, [[GTEST_TEMPLATE_ T$j]]> -struct Templates$i { - typedef TemplateSel Head; - typedef Templates$(i-1)<$for k, [[T$k]]> Tail; -}; - - -]] - -// We don't want to require the users to write TemplatesN<...> directly, -// as that would require them to count the length. Templates<...> is much -// easier to write, but generates horrible messages when there is a -// compiler error, as gcc insists on printing out each template -// argument, even if it has the default value (this means Templates -// will appear as Templates in the compiler -// errors). -// -// Our solution is to combine the best part of the two approaches: a -// user would write Templates, and Google Test will translate -// that to TemplatesN internally to make error messages -// readable. The translation is done by the 'type' member of the -// Templates template. - -$range i 1..n -template <$for i, [[GTEST_TEMPLATE_ T$i = NoneT]]> -struct Templates { - typedef Templates$n<$for i, [[T$i]]> type; -}; - -template <> -struct Templates<$for i, [[NoneT]]> { - typedef Templates0 type; -}; - -$range i 1..n-1 -$for i [[ -$range j 1..i -$range k i+1..n -template <$for j, [[GTEST_TEMPLATE_ T$j]]> -struct Templates<$for j, [[T$j]]$for k[[, NoneT]]> { - typedef Templates$i<$for j, [[T$j]]> type; -}; - -]] - -// The TypeList template makes it possible to use either a single type -// or a Types<...> list in TYPED_TEST_SUITE() and -// INSTANTIATE_TYPED_TEST_SUITE_P(). - -template -struct TypeList { - typedef Types1 type; -}; - - -$range i 1..n -template <$for i, [[typename T$i]]> -struct TypeList > { - typedef typename Types<$for i, [[T$i]]>::type type; -}; - -#endif // GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P - -} // namespace internal -} // namespace testing - -#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_ diff --git a/third_party/googletest/src/src/gtest-death-test.cc b/third_party/googletest/src/src/gtest-death-test.cc index da09a1cfc2..5d1031bea2 100644 --- a/third_party/googletest/src/src/gtest-death-test.cc +++ b/third_party/googletest/src/src/gtest-death-test.cc @@ -1364,7 +1364,7 @@ static pid_t ExecDeathTestSpawnChild(char* const* argv, int close_fd) { if (!use_fork) { static const bool stack_grows_down = StackGrowsDown(); - const auto stack_size = static_cast(getpagesize()); + const auto stack_size = static_cast(getpagesize() * 2); // MMAP_ANONYMOUS is not defined on Mac, so we use MAP_ANON instead. void* const stack = mmap(nullptr, stack_size, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE, -1, 0); diff --git a/third_party/googletest/src/src/gtest-filepath.cc b/third_party/googletest/src/src/gtest-filepath.cc index bd7b99ff03..9aad12fbd1 100644 --- a/third_party/googletest/src/src/gtest-filepath.cc +++ b/third_party/googletest/src/src/gtest-filepath.cc @@ -93,7 +93,7 @@ static bool IsPathSeparator(char c) { // Returns the current working directory, or "" if unsuccessful. FilePath FilePath::GetCurrentDir() { #if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_PHONE || \ - GTEST_OS_WINDOWS_RT || ARDUINO || defined(ESP_PLATFORM) + GTEST_OS_WINDOWS_RT || GTEST_OS_ESP8266 || GTEST_OS_ESP32 // These platforms do not have a current directory, so we just return // something reasonable. return FilePath(kCurrentDirectoryString); @@ -323,6 +323,9 @@ bool FilePath::CreateFolder() const { delete [] unicode; #elif GTEST_OS_WINDOWS int result = _mkdir(pathname_.c_str()); +#elif GTEST_OS_ESP8266 + // do nothing + int result = 0; #else int result = mkdir(pathname_.c_str(), 0777); #endif // GTEST_OS_WINDOWS_MOBILE diff --git a/third_party/googletest/src/src/gtest-internal-inl.h b/third_party/googletest/src/src/gtest-internal-inl.h index 8ed70daab0..e42ff47539 100644 --- a/third_party/googletest/src/src/gtest-internal-inl.h +++ b/third_party/googletest/src/src/gtest-internal-inl.h @@ -42,6 +42,7 @@ #include // For memmove. #include +#include #include #include #include @@ -123,11 +124,11 @@ GTEST_API_ std::string FormatEpochTimeInMillisAsIso8601(TimeInMillis ms); // On success, stores the value of the flag in *value, and returns // true. On failure, returns false without changing *value. GTEST_API_ bool ParseInt32Flag( - const char* str, const char* flag, Int32* value); + const char* str, const char* flag, int32_t* value); // Returns a random seed in range [1, kMaxRandomSeed] based on the // given --gtest_random_seed flag value. -inline int GetRandomSeedFromFlag(Int32 random_seed_flag) { +inline int GetRandomSeedFromFlag(int32_t random_seed_flag) { const unsigned int raw_seed = (random_seed_flag == 0) ? static_cast(GetTimeInMillis()) : static_cast(random_seed_flag); @@ -213,10 +214,10 @@ class GTestFlagSaver { std::string output_; bool print_time_; bool print_utf8_; - internal::Int32 random_seed_; - internal::Int32 repeat_; + int32_t random_seed_; + int32_t repeat_; bool shuffle_; - internal::Int32 stack_trace_depth_; + int32_t stack_trace_depth_; std::string stream_result_to_; bool throw_on_failure_; } GTEST_ATTRIBUTE_UNUSED_; @@ -227,7 +228,7 @@ class GTestFlagSaver { // If the code_point is not a valid Unicode code point // (i.e. outside of Unicode range U+0 to U+10FFFF) it will be converted // to "(Invalid Unicode 0xXXXXXXXX)". -GTEST_API_ std::string CodePointToUtf8(UInt32 code_point); +GTEST_API_ std::string CodePointToUtf8(uint32_t code_point); // Converts a wide string to a narrow string in UTF-8 encoding. // The wide string is assumed to have the following encoding: @@ -260,10 +261,10 @@ GTEST_API_ bool ShouldShard(const char* total_shards_str, const char* shard_index_str, bool in_subprocess_for_death_test); -// Parses the environment variable var as an Int32. If it is unset, -// returns default_val. If it is not an Int32, prints an error and +// Parses the environment variable var as a 32-bit integer. If it is unset, +// returns default_val. If it is not a 32-bit integer, prints an error and // and aborts. -GTEST_API_ Int32 Int32FromEnvOrDie(const char* env_var, Int32 default_val); +GTEST_API_ int32_t Int32FromEnvOrDie(const char* env_var, int32_t default_val); // Given the total number of shards, the shard index, and the test id, // returns true if and only if the test should be run on this shard. The test id @@ -323,7 +324,7 @@ void ShuffleRange(internal::Random* random, int begin, int end, const int last_in_range = begin + range_width - 1; const int selected = begin + - static_cast(random->Generate(static_cast(range_width))); + static_cast(random->Generate(static_cast(range_width))); std::swap((*v)[static_cast(selected)], (*v)[static_cast(last_in_range)]); } @@ -697,6 +698,17 @@ class GTEST_API_ UnitTestImpl { return parameterized_test_registry_; } + std::set* ignored_parameterized_test_suites() { + return &ignored_parameterized_test_suites_; + } + + // Returns TypeParameterizedTestSuiteRegistry object used to keep track of + // type-parameterized tests and instantiations of them. + internal::TypeParameterizedTestSuiteRegistry& + type_parameterized_test_registry() { + return type_parameterized_test_registry_; + } + // Sets the TestSuite object for the test that's currently running. void set_current_test_suite(TestSuite* a_current_test_suite) { current_test_suite_ = a_current_test_suite; @@ -873,6 +885,12 @@ class GTEST_API_ UnitTestImpl { // ParameterizedTestRegistry object used to register value-parameterized // tests. internal::ParameterizedTestSuiteRegistry parameterized_test_registry_; + internal::TypeParameterizedTestSuiteRegistry + type_parameterized_test_registry_; + + // The set holding the name of parameterized + // test suites that may go uninstantiated. + std::set ignored_parameterized_test_suites_; // Indicates whether RegisterParameterizedTests() has been called already. bool parameterized_tests_registered_; @@ -999,20 +1017,9 @@ bool ParseNaturalNumber(const ::std::string& str, Integer* number) { char* end; // BiggestConvertible is the largest integer type that system-provided // string-to-number conversion routines can return. + using BiggestConvertible = unsigned long long; // NOLINT -# if GTEST_OS_WINDOWS && !defined(__GNUC__) - - // MSVC and C++ Builder define __int64 instead of the standard long long. - typedef unsigned __int64 BiggestConvertible; - const BiggestConvertible parsed = _strtoui64(str.c_str(), &end, 10); - -# else - - typedef unsigned long long BiggestConvertible; // NOLINT - const BiggestConvertible parsed = strtoull(str.c_str(), &end, 10); - -# endif // GTEST_OS_WINDOWS && !defined(__GNUC__) - + const BiggestConvertible parsed = strtoull(str.c_str(), &end, 10); // NOLINT const bool parse_success = *end == '\0' && errno == 0; GTEST_CHECK_(sizeof(Integer) <= sizeof(parsed)); diff --git a/third_party/googletest/src/src/gtest-matchers.cc b/third_party/googletest/src/src/gtest-matchers.cc index 7d2fb6851e..65104ebab1 100644 --- a/third_party/googletest/src/src/gtest-matchers.cc +++ b/third_party/googletest/src/src/gtest-matchers.cc @@ -58,40 +58,40 @@ Matcher::Matcher(const std::string& s) { *this = Eq(s); } // s. Matcher::Matcher(const char* s) { *this = Eq(std::string(s)); } -#if GTEST_HAS_ABSL -// Constructs a matcher that matches a const absl::string_view& whose value is +#if GTEST_INTERNAL_HAS_STRING_VIEW +// Constructs a matcher that matches a const StringView& whose value is // equal to s. -Matcher::Matcher(const std::string& s) { +Matcher::Matcher(const std::string& s) { *this = Eq(s); } -// Constructs a matcher that matches a const absl::string_view& whose value is +// Constructs a matcher that matches a const StringView& whose value is // equal to s. -Matcher::Matcher(const char* s) { +Matcher::Matcher(const char* s) { *this = Eq(std::string(s)); } -// Constructs a matcher that matches a const absl::string_view& whose value is +// Constructs a matcher that matches a const StringView& whose value is // equal to s. -Matcher::Matcher(absl::string_view s) { +Matcher::Matcher(internal::StringView s) { *this = Eq(std::string(s)); } -// Constructs a matcher that matches a absl::string_view whose value is equal to +// Constructs a matcher that matches a StringView whose value is equal to // s. -Matcher::Matcher(const std::string& s) { *this = Eq(s); } +Matcher::Matcher(const std::string& s) { *this = Eq(s); } -// Constructs a matcher that matches a absl::string_view whose value is equal to +// Constructs a matcher that matches a StringView whose value is equal to // s. -Matcher::Matcher(const char* s) { +Matcher::Matcher(const char* s) { *this = Eq(std::string(s)); } -// Constructs a matcher that matches a absl::string_view whose value is equal to +// Constructs a matcher that matches a StringView whose value is equal to // s. -Matcher::Matcher(absl::string_view s) { +Matcher::Matcher(internal::StringView s) { *this = Eq(std::string(s)); } -#endif // GTEST_HAS_ABSL +#endif // GTEST_INTERNAL_HAS_STRING_VIEW } // namespace testing diff --git a/third_party/googletest/src/src/gtest-port.cc b/third_party/googletest/src/src/gtest-port.cc index fc5ba6becc..a05c50a39b 100644 --- a/third_party/googletest/src/src/gtest-port.cc +++ b/third_party/googletest/src/src/gtest-port.cc @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -536,6 +537,9 @@ class ThreadLocalRegistryImpl { // Returns a value that can be used to identify the thread from other threads. static ThreadLocalValueHolderBase* GetValueOnCurrentThread( const ThreadLocalBase* thread_local_instance) { +#ifdef _MSC_VER + MemoryIsNotDeallocated memory_is_not_deallocated; +#endif // _MSC_VER DWORD current_thread = ::GetCurrentThreadId(); MutexLock lock(&mutex_); ThreadIdToThreadLocals* const thread_to_thread_locals = @@ -1286,7 +1290,7 @@ static std::string FlagToEnvVar(const char* flag) { // Parses 'str' for a 32-bit signed integer. If successful, writes // the result to *value and returns true; otherwise leaves *value // unchanged and returns false. -bool ParseInt32(const Message& src_text, const char* str, Int32* value) { +bool ParseInt32(const Message& src_text, const char* str, int32_t* value) { // Parses the environment variable as a decimal integer. char* end = nullptr; const long long_value = strtol(str, &end, 10); // NOLINT @@ -1303,13 +1307,13 @@ bool ParseInt32(const Message& src_text, const char* str, Int32* value) { return false; } - // Is the parsed value in the range of an Int32? - const Int32 result = static_cast(long_value); + // Is the parsed value in the range of an int32_t? + const auto result = static_cast(long_value); if (long_value == LONG_MAX || long_value == LONG_MIN || // The parsed value overflows as a long. (strtol() returns // LONG_MAX or LONG_MIN when the input overflows.) result != long_value - // The parsed value overflows as an Int32. + // The parsed value overflows as an int32_t. ) { Message msg; msg << "WARNING: " << src_text @@ -1342,7 +1346,7 @@ bool BoolFromGTestEnv(const char* flag, bool default_value) { // Reads and returns a 32-bit integer stored in the environment // variable corresponding to the given flag; if it isn't set or // doesn't represent a valid 32-bit integer, returns default_value. -Int32 Int32FromGTestEnv(const char* flag, Int32 default_value) { +int32_t Int32FromGTestEnv(const char* flag, int32_t default_value) { #if defined(GTEST_GET_INT32_FROM_ENV_) return GTEST_GET_INT32_FROM_ENV_(flag, default_value); #else @@ -1353,7 +1357,7 @@ Int32 Int32FromGTestEnv(const char* flag, Int32 default_value) { return default_value; } - Int32 result = default_value; + int32_t result = default_value; if (!ParseInt32(Message() << "Environment variable " << env_var, string_value, &result)) { printf("The default value %s is used.\n", diff --git a/third_party/googletest/src/src/gtest-test-part.cc b/third_party/googletest/src/src/gtest-test-part.cc index 178317a6bc..a938683ced 100644 --- a/third_party/googletest/src/src/gtest-test-part.cc +++ b/third_party/googletest/src/src/gtest-test-part.cc @@ -31,6 +31,8 @@ // The Google C++ Testing and Mocking Framework (Google Test) #include "gtest/gtest-test-part.h" + +#include "gtest/internal/gtest-port.h" #include "src/gtest-internal-inl.h" namespace testing { @@ -46,7 +48,9 @@ std::string TestPartResult::ExtractSummary(const char* message) { // Prints a TestPartResult object. std::ostream& operator<<(std::ostream& os, const TestPartResult& result) { - return os << result.file_name() << ":" << result.line_number() << ": " + return os << internal::FormatFileLocation(result.file_name(), + result.line_number()) + << " " << (result.type() == TestPartResult::kSuccess ? "Success" : result.type() == TestPartResult::kSkip diff --git a/third_party/googletest/src/src/gtest-typed-test.cc b/third_party/googletest/src/src/gtest-typed-test.cc index 8677caf732..1b1cfb0dc1 100644 --- a/third_party/googletest/src/src/gtest-typed-test.cc +++ b/third_party/googletest/src/src/gtest-typed-test.cc @@ -58,7 +58,10 @@ static std::vector SplitIntoTestNames(const char* src) { // registered_tests_; returns registered_tests if successful, or // aborts the program otherwise. const char* TypedTestSuitePState::VerifyRegisteredTestNames( - const char* file, int line, const char* registered_tests) { + const char* test_suite_name, const char* file, int line, + const char* registered_tests) { + RegisterTypeParameterizedTestSuite(test_suite_name, CodeLocation(file, line)); + typedef RegisteredTestsMap::const_iterator RegisteredTestIter; registered_ = true; diff --git a/third_party/googletest/src/src/gtest.cc b/third_party/googletest/src/src/gtest.cc index a5b4e5ac78..095778e6e9 100644 --- a/third_party/googletest/src/src/gtest.cc +++ b/third_party/googletest/src/src/gtest.cc @@ -44,6 +44,7 @@ #include #include +#include #include #include #include @@ -83,8 +84,11 @@ # include // NOLINT # undef min +#ifdef _MSC_VER # include // NOLINT # include // NOLINT +#endif + # include // NOLINT # include // NOLINT # include // NOLINT @@ -330,10 +334,10 @@ namespace internal { // Generates a random number from [0, range), using a Linear // Congruential Generator (LCG). Crashes if 'range' is 0 or greater // than kMaxRange. -UInt32 Random::Generate(UInt32 range) { +uint32_t Random::Generate(uint32_t range) { // These constants are the same as are used in glibc's rand(3). // Use wider types than necessary to prevent unsigned overflow diagnostics. - state_ = static_cast(1103515245ULL*state_ + 12345U) % kMaxRange; + state_ = static_cast(1103515245ULL*state_ + 12345U) % kMaxRange; GTEST_CHECK_(range > 0) << "Cannot generate a number in the range [0, 0)."; @@ -403,6 +407,162 @@ void AssertHelper::operator=(const Message& message) const { ); // NOLINT } +namespace { + +// When TEST_P is found without a matching INSTANTIATE_TEST_SUITE_P +// to creates test cases for it, a syntetic test case is +// inserted to report ether an error or a log message. +// +// This configuration bit will likely be removed at some point. +constexpr bool kErrorOnUninstantiatedParameterizedTest = false; +constexpr bool kErrorOnUninstantiatedTypeParameterizedTest = false; + +// A test that fails at a given file/line location with a given message. +class FailureTest : public Test { + public: + explicit FailureTest(const CodeLocation& loc, std::string error_message, + bool as_error) + : loc_(loc), + error_message_(std::move(error_message)), + as_error_(as_error) {} + + void TestBody() override { + if (as_error_) { + AssertHelper(TestPartResult::kNonFatalFailure, loc_.file.c_str(), + loc_.line, "") = Message() << error_message_; + } else { + std::cout << error_message_ << std::endl; + } + } + + private: + const CodeLocation loc_; + const std::string error_message_; + const bool as_error_; +}; + + +} // namespace + +std::set* GetIgnoredParameterizedTestSuites() { + return UnitTest::GetInstance()->impl()->ignored_parameterized_test_suites(); +} + +// Add a given test_suit to the list of them allow to go un-instantiated. +MarkAsIgnored::MarkAsIgnored(const char* test_suite) { + GetIgnoredParameterizedTestSuites()->insert(test_suite); +} + +// If this parameterized test suite has no instantiations (and that +// has not been marked as okay), emit a test case reporting that. +void InsertSyntheticTestCase(const std::string& name, CodeLocation location, + bool has_test_p) { + const auto& ignored = *GetIgnoredParameterizedTestSuites(); + if (ignored.find(name) != ignored.end()) return; + + const char kMissingInstantiation[] = // + " is defined via TEST_P, but never instantiated. None of the test cases " + "will run. Either no INSTANTIATE_TEST_SUITE_P is provided or the only " + "ones provided expand to nothing." + "\n\n" + "Ideally, TEST_P definitions should only ever be included as part of " + "binaries that intend to use them. (As opposed to, for example, being " + "placed in a library that may be linked in to get other utilities.)"; + + const char kMissingTestCase[] = // + " is instantiated via INSTANTIATE_TEST_SUITE_P, but no tests are " + "defined via TEST_P . No test cases will run." + "\n\n" + "Ideally, INSTANTIATE_TEST_SUITE_P should only ever be invoked from " + "code that always depend on code that provides TEST_P. Failing to do " + "so is often an indication of dead code, e.g. the last TEST_P was " + "removed but the rest got left behind."; + + std::string message = + "Paramaterized test suite " + name + + (has_test_p ? kMissingInstantiation : kMissingTestCase) + + "\n\n" + "To suppress this error for this test suite, insert the following line " + "(in a non-header) in the namespace it is defined in:" + "\n\n" + "GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(" + name + ");"; + + std::string full_name = "UninstantiatedParamaterizedTestSuite<" + name + ">"; + RegisterTest( // + "GoogleTestVerification", full_name.c_str(), + nullptr, // No type parameter. + nullptr, // No value parameter. + location.file.c_str(), location.line, [message, location] { + return new FailureTest(location, message, + kErrorOnUninstantiatedParameterizedTest); + }); +} + +void RegisterTypeParameterizedTestSuite(const char* test_suite_name, + CodeLocation code_location) { + GetUnitTestImpl()->type_parameterized_test_registry().RegisterTestSuite( + test_suite_name, code_location); +} + +void RegisterTypeParameterizedTestSuiteInstantiation(const char* case_name) { + GetUnitTestImpl() + ->type_parameterized_test_registry() + .RegisterInstantiation(case_name); +} + +void TypeParameterizedTestSuiteRegistry::RegisterTestSuite( + const char* test_suite_name, CodeLocation code_location) { + suites_.emplace(std::string(test_suite_name), + TypeParameterizedTestSuiteInfo(code_location)); +} + +void TypeParameterizedTestSuiteRegistry::RegisterInstantiation( + const char* test_suite_name) { + auto it = suites_.find(std::string(test_suite_name)); + if (it != suites_.end()) { + it->second.instantiated = true; + } else { + GTEST_LOG_(ERROR) << "Unknown type parameterized test suit '" + << test_suite_name << "'"; + } +} + +void TypeParameterizedTestSuiteRegistry::CheckForInstantiations() { + const auto& ignored = *GetIgnoredParameterizedTestSuites(); + for (const auto& testcase : suites_) { + if (testcase.second.instantiated) continue; + if (ignored.find(testcase.first) != ignored.end()) continue; + + std::string message = + "Type paramaterized test suite " + testcase.first + + " is defined via REGISTER_TYPED_TEST_SUITE_P, but never instantiated " + "via INSTANTIATE_TYPED_TEST_SUITE_P. None of the test cases will run." + "\n\n" + "Ideally, TYPED_TEST_P definitions should only ever be included as " + "part of binaries that intend to use them. (As opposed to, for " + "example, being placed in a library that may be linked in to get other " + "utilities.)" + "\n\n" + "To suppress this error for this test suite, insert the following line " + "(in a non-header) in the namespace it is definedin in:" + "\n\n" + "GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(" + + testcase.first + ");"; + + std::string full_name = + "UninstantiatedTypeParamaterizedTestSuite<" + testcase.first + ">"; + RegisterTest( // + "GoogleTestVerification", full_name.c_str(), + nullptr, // No type parameter. + nullptr, // No value parameter. + testcase.second.code_location.file.c_str(), + testcase.second.code_location.line, [message, testcase] { + return new FailureTest(testcase.second.code_location, message, + kErrorOnUninstantiatedTypeParameterizedTest); + }); + } +} + // A copy of all command line arguments. Set by InitGoogleTest(). static ::std::vector g_argvs; @@ -1735,33 +1895,33 @@ AssertionResult IsHRESULTFailure(const char* expr, long hr) { // NOLINT // 17 - 21 bits 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx // The maximum code-point a one-byte UTF-8 sequence can represent. -const UInt32 kMaxCodePoint1 = (static_cast(1) << 7) - 1; +constexpr uint32_t kMaxCodePoint1 = (static_cast(1) << 7) - 1; // The maximum code-point a two-byte UTF-8 sequence can represent. -const UInt32 kMaxCodePoint2 = (static_cast(1) << (5 + 6)) - 1; +constexpr uint32_t kMaxCodePoint2 = (static_cast(1) << (5 + 6)) - 1; // The maximum code-point a three-byte UTF-8 sequence can represent. -const UInt32 kMaxCodePoint3 = (static_cast(1) << (4 + 2*6)) - 1; +constexpr uint32_t kMaxCodePoint3 = (static_cast(1) << (4 + 2*6)) - 1; // The maximum code-point a four-byte UTF-8 sequence can represent. -const UInt32 kMaxCodePoint4 = (static_cast(1) << (3 + 3*6)) - 1; +constexpr uint32_t kMaxCodePoint4 = (static_cast(1) << (3 + 3*6)) - 1; // Chops off the n lowest bits from a bit pattern. Returns the n // lowest bits. As a side effect, the original bit pattern will be // shifted to the right by n bits. -inline UInt32 ChopLowBits(UInt32* bits, int n) { - const UInt32 low_bits = *bits & ((static_cast(1) << n) - 1); +inline uint32_t ChopLowBits(uint32_t* bits, int n) { + const uint32_t low_bits = *bits & ((static_cast(1) << n) - 1); *bits >>= n; return low_bits; } // Converts a Unicode code point to a narrow string in UTF-8 encoding. -// code_point parameter is of type UInt32 because wchar_t may not be +// code_point parameter is of type uint32_t because wchar_t may not be // wide enough to contain a code point. // If the code_point is not a valid Unicode code point // (i.e. outside of Unicode range U+0 to U+10FFFF) it will be converted // to "(Invalid Unicode 0xXXXXXXXX)". -std::string CodePointToUtf8(UInt32 code_point) { +std::string CodePointToUtf8(uint32_t code_point) { if (code_point > kMaxCodePoint4) { return "(Invalid Unicode 0x" + String::FormatHexUInt32(code_point) + ")"; } @@ -1802,11 +1962,11 @@ inline bool IsUtf16SurrogatePair(wchar_t first, wchar_t second) { } // Creates a Unicode code point from UTF16 surrogate pair. -inline UInt32 CreateCodePointFromUtf16SurrogatePair(wchar_t first, - wchar_t second) { - const auto first_u = static_cast(first); - const auto second_u = static_cast(second); - const UInt32 mask = (1 << 10) - 1; +inline uint32_t CreateCodePointFromUtf16SurrogatePair(wchar_t first, + wchar_t second) { + const auto first_u = static_cast(first); + const auto second_u = static_cast(second); + const uint32_t mask = (1 << 10) - 1; return (sizeof(wchar_t) == 2) ? (((first_u & mask) << 10) | (second_u & mask)) + 0x10000 : @@ -1834,7 +1994,7 @@ std::string WideStringToUtf8(const wchar_t* str, int num_chars) { ::std::stringstream stream; for (int i = 0; i < num_chars; ++i) { - UInt32 unicode_code_point; + uint32_t unicode_code_point; if (str[i] == L'\0') { break; @@ -1843,7 +2003,7 @@ std::string WideStringToUtf8(const wchar_t* str, int num_chars) { str[i + 1]); i++; } else { - unicode_code_point = static_cast(str[i]); + unicode_code_point = static_cast(str[i]); } stream << CodePointToUtf8(unicode_code_point); @@ -1969,7 +2129,7 @@ std::string String::FormatIntWidth2(int value) { } // Formats an int value as "%X". -std::string String::FormatHexUInt32(UInt32 value) { +std::string String::FormatHexUInt32(uint32_t value) { std::stringstream ss; ss << std::hex << std::uppercase << value; return ss.str(); @@ -1977,7 +2137,7 @@ std::string String::FormatHexUInt32(UInt32 value) { // Formats an int value as "%X". std::string String::FormatHexInt(int value) { - return FormatHexUInt32(static_cast(value)); + return FormatHexUInt32(static_cast(value)); } // Formats a byte as "%02X". @@ -2646,6 +2806,7 @@ namespace internal { void UnitTestImpl::RegisterParameterizedTests() { if (!parameterized_tests_registered_) { parameterized_test_registry_.RegisterTests(); + type_parameterized_test_registry_.CheckForInstantiations(); parameterized_tests_registered_ = true; } } @@ -3134,6 +3295,7 @@ class PrettyUnitTestResultPrinter : public TestEventListener { private: static void PrintFailedTests(const UnitTest& unit_test); + static void PrintFailedTestSuites(const UnitTest& unit_test); static void PrintSkippedTests(const UnitTest& unit_test); }; @@ -3153,7 +3315,7 @@ void PrettyUnitTestResultPrinter::OnTestIterationStart( } if (internal::ShouldShard(kTestTotalShards, kTestShardIndex, false)) { - const Int32 shard_index = Int32FromEnvOrDie(kTestShardIndex, -1); + const int32_t shard_index = Int32FromEnvOrDie(kTestShardIndex, -1); ColoredPrintf(COLOR_YELLOW, "Note: This is test shard %d of %s.\n", static_cast(shard_index) + 1, @@ -3220,9 +3382,7 @@ void PrettyUnitTestResultPrinter::OnTestStart(const TestInfo& test_info) { void PrettyUnitTestResultPrinter::OnTestPartResult( const TestPartResult& result) { switch (result.type()) { - // If the test part succeeded, or was skipped, - // we don't need to do anything. - case TestPartResult::kSkip: + // If the test part succeeded, we don't need to do anything. case TestPartResult::kSuccess: return; default: @@ -3288,9 +3448,8 @@ void PrettyUnitTestResultPrinter::OnEnvironmentsTearDownStart( // Internal helper for printing the list of failed tests. void PrettyUnitTestResultPrinter::PrintFailedTests(const UnitTest& unit_test) { const int failed_test_count = unit_test.failed_test_count(); - if (failed_test_count == 0) { - return; - } + ColoredPrintf(COLOR_RED, "[ FAILED ] "); + printf("%s, listed below:\n", FormatTestCount(failed_test_count).c_str()); for (int i = 0; i < unit_test.total_test_suite_count(); ++i) { const TestSuite& test_suite = *unit_test.GetTestSuite(i); @@ -3308,6 +3467,30 @@ void PrettyUnitTestResultPrinter::PrintFailedTests(const UnitTest& unit_test) { printf("\n"); } } + printf("\n%2d FAILED %s\n", failed_test_count, + failed_test_count == 1 ? "TEST" : "TESTS"); +} + +// Internal helper for printing the list of test suite failures not covered by +// PrintFailedTests. +void PrettyUnitTestResultPrinter::PrintFailedTestSuites( + const UnitTest& unit_test) { + int suite_failure_count = 0; + for (int i = 0; i < unit_test.total_test_suite_count(); ++i) { + const TestSuite& test_suite = *unit_test.GetTestSuite(i); + if (!test_suite.should_run()) { + continue; + } + if (test_suite.ad_hoc_test_result().Failed()) { + ColoredPrintf(COLOR_RED, "[ FAILED ] "); + printf("%s: SetUpTestSuite or TearDownTestSuite\n", test_suite.name()); + ++suite_failure_count; + } + } + if (suite_failure_count > 0) { + printf("\n%2d FAILED TEST %s\n", suite_failure_count, + suite_failure_count == 1 ? "SUITE" : "SUITES"); + } } // Internal helper for printing the list of skipped tests. @@ -3355,19 +3538,14 @@ void PrettyUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test, PrintSkippedTests(unit_test); } - int num_failures = unit_test.failed_test_count(); if (!unit_test.Passed()) { - const int failed_test_count = unit_test.failed_test_count(); - ColoredPrintf(COLOR_RED, "[ FAILED ] "); - printf("%s, listed below:\n", FormatTestCount(failed_test_count).c_str()); PrintFailedTests(unit_test); - printf("\n%2d FAILED %s\n", num_failures, - num_failures == 1 ? "TEST" : "TESTS"); + PrintFailedTestSuites(unit_test); } int num_disabled = unit_test.reportable_disabled_test_count(); if (num_disabled && !GTEST_FLAG(also_run_disabled_tests)) { - if (!num_failures) { + if (unit_test.Passed()) { printf("\n"); // Add a spacer if no FAILURE banner is displayed. } ColoredPrintf(COLOR_YELLOW, @@ -4508,6 +4686,7 @@ class ScopedPrematureExitFile { } ~ScopedPrematureExitFile() { +#if !defined GTEST_OS_ESP8266 if (!premature_exit_filepath_.empty()) { int retval = remove(premature_exit_filepath_.c_str()); if (retval) { @@ -4516,6 +4695,7 @@ class ScopedPrematureExitFile { << retval; } } +#endif } private: @@ -4908,7 +5088,6 @@ int UnitTest::Run() { _set_abort_behavior( 0x0, // Clear the following flags: _WRITE_ABORT_MSG | _CALL_REPORTFAULT); // pop-up window, core dump. -# endif // In debug mode, the Windows CRT can crash with an assertion over invalid // input (e.g. passing an invalid file descriptor). The default handling @@ -4919,6 +5098,7 @@ int UnitTest::Run() { _CRTDBG_MODE_FILE | _CRTDBG_MODE_DEBUG); (void)_CrtSetReportFile(_CRT_ASSERT, _CRTDBG_FILE_STDERR); } +# endif } #endif // GTEST_OS_WINDOWS @@ -5299,7 +5479,7 @@ bool UnitTestImpl::RunAllTests() { // Shuffles test suites and tests if requested. if (has_tests_to_run && GTEST_FLAG(shuffle)) { - random()->Reseed(static_cast(random_seed_)); + random()->Reseed(static_cast(random_seed_)); // This should be done before calling OnTestIterationStart(), // such that a test event listener can see the actual test order // in the event. @@ -5422,8 +5602,8 @@ bool ShouldShard(const char* total_shards_env, return false; } - const Int32 total_shards = Int32FromEnvOrDie(total_shards_env, -1); - const Int32 shard_index = Int32FromEnvOrDie(shard_index_env, -1); + const int32_t total_shards = Int32FromEnvOrDie(total_shards_env, -1); + const int32_t shard_index = Int32FromEnvOrDie(shard_index_env, -1); if (total_shards == -1 && shard_index == -1) { return false; @@ -5460,13 +5640,13 @@ bool ShouldShard(const char* total_shards_env, // Parses the environment variable var as an Int32. If it is unset, // returns default_val. If it is not an Int32, prints an error // and aborts. -Int32 Int32FromEnvOrDie(const char* var, Int32 default_val) { +int32_t Int32FromEnvOrDie(const char* var, int32_t default_val) { const char* str_val = posix::GetEnv(var); if (str_val == nullptr) { return default_val; } - Int32 result; + int32_t result; if (!ParseInt32(Message() << "The value of environment variable " << var, str_val, &result)) { exit(EXIT_FAILURE); @@ -5490,9 +5670,9 @@ bool ShouldRunTestOnShard(int total_shards, int shard_index, int test_id) { // https://github.com/google/googletest/blob/master/googletest/docs/advanced.md // . Returns the number of tests that should run. int UnitTestImpl::FilterTests(ReactionToSharding shard_tests) { - const Int32 total_shards = shard_tests == HONOR_SHARDING_PROTOCOL ? + const int32_t total_shards = shard_tests == HONOR_SHARDING_PROTOCOL ? Int32FromEnvOrDie(kTestTotalShards, -1) : -1; - const Int32 shard_index = shard_tests == HONOR_SHARDING_PROTOCOL ? + const int32_t shard_index = shard_tests == HONOR_SHARDING_PROTOCOL ? Int32FromEnvOrDie(kTestShardIndex, -1) : -1; // num_runnable_tests are the number of tests that will @@ -5781,12 +5961,11 @@ static bool ParseBoolFlag(const char* str, const char* flag, bool* value) { return true; } -// Parses a string for an Int32 flag, in the form of -// "--flag=value". +// Parses a string for an int32_t flag, in the form of "--flag=value". // // On success, stores the value of the flag in *value, and returns // true. On failure, returns false without changing *value. -bool ParseInt32Flag(const char* str, const char* flag, Int32* value) { +bool ParseInt32Flag(const char* str, const char* flag, int32_t* value) { // Gets the value of the flag as a string. const char* const value_str = ParseFlagValue(str, flag, false); @@ -5798,8 +5977,7 @@ bool ParseInt32Flag(const char* str, const char* flag, Int32* value) { value_str, value); } -// Parses a string for a string flag, in the form of -// "--flag=value". +// Parses a string for a string flag, in the form of "--flag=value". // // On success, stores the value of the flag in *value, and returns // true. On failure, returns false without changing *value. @@ -6149,7 +6327,11 @@ std::string TempDir() { else return std::string(temp_dir) + "\\"; #elif GTEST_OS_LINUX_ANDROID - return "/sdcard/"; + const char* temp_dir = internal::posix::GetEnv("TEST_TMPDIR"); + if (temp_dir == nullptr || temp_dir[0] == '\0') + return "/data/local/tmp/"; + else + return temp_dir; #else return "/tmp/"; #endif // GTEST_OS_WINDOWS_MOBILE diff --git a/third_party/googletest/src/src/gtest_main.cc b/third_party/googletest/src/src/gtest_main.cc index f6e1dd96fb..46b27c3d7d 100644 --- a/third_party/googletest/src/src/gtest_main.cc +++ b/third_party/googletest/src/src/gtest_main.cc @@ -30,13 +30,20 @@ #include #include "gtest/gtest.h" -#ifdef ARDUINO +#if GTEST_OS_ESP8266 || GTEST_OS_ESP32 +#if GTEST_OS_ESP8266 +extern "C" { +#endif void setup() { testing::InitGoogleTest(); } void loop() { RUN_ALL_TESTS(); } +#if GTEST_OS_ESP8266 +} +#endif + #else GTEST_API_ int main(int argc, char **argv) { From 7a92a785f233138dcb88c789ada16adec38fb0d2 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Tue, 21 Jul 2020 16:13:39 -0700 Subject: [PATCH 0027/1000] Silience warnings about uninitiated test cases BUG=b/159031848 Change-Id: I6bb88c24bd08e0590ec6b8ebfb696fd9b07ed011 --- test/avg_test.cc | 4 ++++ test/dct16x16_test.cc | 2 ++ test/fdct8x8_test.cc | 2 ++ test/lpf_test.cc | 8 ++++++++ test/test.mk | 10 +++++++++- test/variance_test.cc | 6 +++--- test/vp9_intrapred_test.cc | 3 ++- 7 files changed, 30 insertions(+), 5 deletions(-) diff --git a/test/avg_test.cc b/test/avg_test.cc index 601fb19012..3b06aa849c 100644 --- a/test/avg_test.cc +++ b/test/avg_test.cc @@ -152,6 +152,7 @@ class AverageTestHBD : public AverageTestBase, }; #endif // CONFIG_VP9_HIGHBITDEPTH +#if HAVE_NEON || HAVE_SSE2 || HAVE_MSA typedef void (*IntProRowFunc)(int16_t hbuf[16], uint8_t const *ref, const int ref_stride, const int height); @@ -226,6 +227,7 @@ class IntProColTest : public AverageTestBase, int16_t sum_asm_; int16_t sum_c_; }; +#endif // HAVE_NEON || HAVE_SSE2 || HAVE_MSA typedef int (*SatdFunc)(const tran_low_t *coeffs, int length); typedef std::tuple SatdTestParam; @@ -378,6 +380,7 @@ TEST_P(AverageTestHBD, Random) { } #endif // CONFIG_VP9_HIGHBITDEPTH +#if HAVE_NEON || HAVE_SSE2 || HAVE_MSA TEST_P(IntProRowTest, MinValue) { FillConstant(0); RunComparison(); @@ -407,6 +410,7 @@ TEST_P(IntProColTest, Random) { FillRandom(); RunComparison(); } +#endif TEST_P(SatdLowbdTest, MinValue) { const int kMin = -32640; diff --git a/test/dct16x16_test.cc b/test/dct16x16_test.cc index d586db7208..321c66e4e9 100644 --- a/test/dct16x16_test.cc +++ b/test/dct16x16_test.cc @@ -714,6 +714,7 @@ TEST_P(Trans16x16HT, QuantCheck) { RunQuantCheck(429, 729); } +#if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE class InvTrans16x16DCT : public Trans16x16TestBase, public ::testing::TestWithParam { public: @@ -743,6 +744,7 @@ class InvTrans16x16DCT : public Trans16x16TestBase, TEST_P(InvTrans16x16DCT, CompareReference) { CompareInvReference(ref_txfm_, thresh_); } +#endif // HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE using std::make_tuple; diff --git a/test/fdct8x8_test.cc b/test/fdct8x8_test.cc index fd9fdff611..770337da00 100644 --- a/test/fdct8x8_test.cc +++ b/test/fdct8x8_test.cc @@ -598,6 +598,7 @@ TEST_P(FwdTrans8x8HT, RoundTripErrorCheck) { RunRoundTripErrorCheck(); } TEST_P(FwdTrans8x8HT, ExtremalCheck) { RunExtremalCheck(); } +#if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE class InvTrans8x8DCT : public FwdTrans8x8TestBase, public ::testing::TestWithParam { public: @@ -628,6 +629,7 @@ class InvTrans8x8DCT : public FwdTrans8x8TestBase, TEST_P(InvTrans8x8DCT, CompareReference) { CompareInvReference(ref_txfm_, thresh_); } +#endif // HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE using std::make_tuple; diff --git a/test/lpf_test.cc b/test/lpf_test.cc index 25bba74b0f..2c632a5a8c 100644 --- a/test/lpf_test.cc +++ b/test/lpf_test.cc @@ -146,6 +146,8 @@ class Loop8Test6Param : public ::testing::TestWithParam { loop_op_t ref_loopfilter_op_; }; +#if HAVE_NEON || HAVE_SSE2 || \ + (HAVE_DSPR2 || HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH) class Loop8Test9Param : public ::testing::TestWithParam { public: virtual ~Loop8Test9Param() {} @@ -164,6 +166,8 @@ class Loop8Test9Param : public ::testing::TestWithParam { dual_loop_op_t loopfilter_op_; dual_loop_op_t ref_loopfilter_op_; }; +#endif // HAVE_NEON || HAVE_SSE2 || (HAVE_DSPR2 || HAVE_MSA && + // (!CONFIG_VP9_HIGHBITDEPTH)) TEST_P(Loop8Test6Param, OperationCheck) { ACMRandom rnd(ACMRandom::DeterministicSeed()); @@ -275,6 +279,8 @@ TEST_P(Loop8Test6Param, ValueCheck) { << "First failed at test case " << first_failure; } +#if HAVE_NEON || HAVE_SSE2 || \ + (HAVE_DSPR2 || HAVE_MSA && (!CONFIG_VP9_HIGHBITDEPTH)) TEST_P(Loop8Test9Param, OperationCheck) { ACMRandom rnd(ACMRandom::DeterministicSeed()); const int count_test_block = number_of_iterations; @@ -402,6 +408,8 @@ TEST_P(Loop8Test9Param, ValueCheck) { "loopfilter output. " << "First failed at test case " << first_failure; } +#endif // HAVE_NEON || HAVE_SSE2 || (HAVE_DSPR2 || HAVE_MSA && + // (!CONFIG_VP9_HIGHBITDEPTH)) using std::make_tuple; diff --git a/test/test.mk b/test/test.mk index 5aecb013f3..c12fb786e7 100644 --- a/test/test.mk +++ b/test/test.mk @@ -121,11 +121,13 @@ ifeq ($(CONFIG_VP8_ENCODER)$(CONFIG_VP8_DECODER),yesyes) LIBVPX_TEST_SRCS-yes += vp8_boolcoder_test.cc LIBVPX_TEST_SRCS-yes += vp8_fragments_test.cc endif - LIBVPX_TEST_SRCS-$(CONFIG_POSTPROC) += add_noise_test.cc LIBVPX_TEST_SRCS-$(CONFIG_POSTPROC) += pp_filter_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP8_DECODER) += vp8_decrypt_test.cc +ifneq (, $(filter yes, $(HAVE_SSE2) $(HAVE_SSSE3) $(HAVE_SSE4_1) $(HAVE_NEON) \ + $(HAVE_MSA) $(HAVE_MMI))) LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += quantize_test.cc +endif LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += set_roi.cc LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += variance_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += vp8_fdct4x4_test.cc @@ -174,7 +176,9 @@ ifneq ($(CONFIG_REALTIME_ONLY),yes) LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += yuv_temporal_filter_test.cc endif LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += variance_test.cc +ifneq (, $(filter yes, $(HAVE_SSE2) $(HAVE_AVX2))) LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_block_error_test.cc +endif LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_quantize_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_subtract_test.cc @@ -188,8 +192,10 @@ LIBVPX_TEST_SRCS-$(CONFIG_NON_GREEDY_MV) += non_greedy_mv_test.cc endif ifeq ($(CONFIG_VP9_ENCODER)$(CONFIG_VP9_TEMPORAL_DENOISING),yesyes) +ifneq (, $(filter yes, $(HAVE_SSE2) $(HAVE_AVX2))) LIBVPX_TEST_SRCS-yes += vp9_denoiser_test.cc endif +endif LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_arf_freq_test.cc ifeq ($(CONFIG_VP9_ENCODER),yes) @@ -201,7 +207,9 @@ endif # VP9 ## Multi-codec / unconditional whitebox tests. LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += sad_test.cc +ifneq (, $(filter yes, $(HAVE_NEON) $(HAVE_SSE2) $(HAVE_MSA))) LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += sum_squares_test.cc +endif TEST_INTRA_PRED_SPEED_SRCS-yes := test_intra_pred_speed.cc TEST_INTRA_PRED_SPEED_SRCS-yes += ../md5_utils.h ../md5_utils.c diff --git a/test/variance_test.cc b/test/variance_test.cc index a6db9cff42..3a220a8103 100644 --- a/test/variance_test.cc +++ b/test/variance_test.cc @@ -807,14 +807,11 @@ INSTANTIATE_TEST_SUITE_P( SubpelAvgVarianceParams(2, 2, &vpx_sub_pixel_avg_variance4x4_c, 0))); #if CONFIG_VP9_HIGHBITDEPTH -typedef MainTestClass VpxHBDMseTest; typedef MainTestClass VpxHBDVarianceTest; typedef SubpelVarianceTest VpxHBDSubpelVarianceTest; typedef SubpelVarianceTest VpxHBDSubpelAvgVarianceTest; -TEST_P(VpxHBDMseTest, RefMse) { RefTestMse(); } -TEST_P(VpxHBDMseTest, MaxMse) { MaxTestMse(); } TEST_P(VpxHBDVarianceTest, Zero) { ZeroTest(); } TEST_P(VpxHBDVarianceTest, Ref) { RefTest(); } TEST_P(VpxHBDVarianceTest, RefStride) { RefStrideTest(); } @@ -825,6 +822,9 @@ TEST_P(VpxHBDSubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); } TEST_P(VpxHBDSubpelAvgVarianceTest, Ref) { RefTest(); } /* TODO(debargha): This test does not support the highbd version +typedef MainTestClass VpxHBDMseTest; +TEST_P(VpxHBDMseTest, RefMse) { RefTestMse(); } +TEST_P(VpxHBDMseTest, MaxMse) { MaxTestMse(); } INSTANTIATE_TEST_SUITE_P( C, VpxHBDMseTest, ::testing::Values(MseParams(4, 4, &vpx_highbd_12_mse16x16_c), diff --git a/test/vp9_intrapred_test.cc b/test/vp9_intrapred_test.cc index 29f689f07d..62e1771dc2 100644 --- a/test/vp9_intrapred_test.cc +++ b/test/vp9_intrapred_test.cc @@ -457,6 +457,7 @@ struct HighbdIntraPredParam { int bit_depth; }; +#if HAVE_SSSE3 || HAVE_NEON || HAVE_SSE2 template <> void IntraPredTest::Predict() { const int bit_depth = params_.bit_depth; @@ -466,7 +467,6 @@ void IntraPredTest::Predict() { } typedef IntraPredTest VP9HighbdIntraPredTest; - TEST_P(VP9HighbdIntraPredTest, HighbdIntraPredTests) { // max block size is 32 DECLARE_ALIGNED(16, uint16_t, left_col[2 * 32]); @@ -475,6 +475,7 @@ TEST_P(VP9HighbdIntraPredTest, HighbdIntraPredTests) { DECLARE_ALIGNED(16, uint16_t, ref_dst[3 * 32 * 32]); RunTest(left_col, above_data, dst, ref_dst); } +#endif #if HAVE_SSSE3 INSTANTIATE_TEST_SUITE_P( From 859d66fabf8b673904a70e1f603e82b230092692 Mon Sep 17 00:00:00 2001 From: James Zern Date: Thu, 23 Jul 2020 11:48:54 -0700 Subject: [PATCH 0028/1000] libs.mk: quiet curl output + fix error return Change-Id: I48a9ed70fe05df603a49b3c11f813119906fc4fb --- libs.mk | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/libs.mk b/libs.mk index 39f11d4bc8..6362418e50 100644 --- a/libs.mk +++ b/libs.mk @@ -490,16 +490,17 @@ libvpx_test_srcs.txt: @echo $(LIBVPX_TEST_SRCS) | xargs -n1 echo | LC_ALL=C sort -u > $@ CLEAN-OBJS += libvpx_test_srcs.txt +# Attempt to download the file using curl, retrying once if it fails for a +# partial file (18). $(LIBVPX_TEST_DATA): $(SRC_PATH_BARE)/test/test-data.sha1 @echo " [DOWNLOAD] $@" - # Attempt to download the file using curl, retrying once if it fails for a - # partial file (18). $(qexec)( \ trap 'rm -f $@' INT TERM; \ - curl="curl --retry 1 -L -o $@ $(call libvpx_test_data_url,$(@F))"; \ - $$curl; \ - case "$$?" in \ - 18) $$curl -C -;; \ + curl="curl -S -s --retry 1 -L -o $@ $(call libvpx_test_data_url,$(@F))"; \ + $$curl; ret=$$?; \ + case "$$ret" in \ + 18) $$curl -C - ;; \ + *) exit $$ret ;; \ esac \ ) From b358f9076f153835680cd3af3765c7e30474c030 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Thu, 23 Jul 2020 16:46:01 -0700 Subject: [PATCH 0029/1000] NULL -> nullptr in CPP files This should clean up clangtidy warnings Change-Id: Ifb5a986121b2d0bd71b9ad39a79dd46c63bdb998 --- test/active_map_refresh_test.cc | 2 +- test/active_map_test.cc | 2 +- test/add_noise_test.cc | 6 +- test/avg_test.cc | 25 +-- test/blockiness_test.cc | 8 +- test/byte_alignment_test.cc | 17 +- test/comp_avg_pred_test.cc | 6 +- test/consistency_test.cc | 14 +- test/convolve_test.cc | 46 +++--- test/dct_partial_test.cc | 4 +- test/dct_test.cc | 16 +- test/decode_perf_test.cc | 10 +- test/decode_svc_test.cc | 8 +- test/decode_test_driver.cc | 10 +- test/encode_api_test.cc | 26 +-- test/encode_test_driver.cc | 10 +- test/external_frame_buffer_test.cc | 60 +++---- test/frame_size_tests.cc | 4 +- test/idct_test.cc | 10 +- test/invalid_file_test.cc | 10 +- test/non_greedy_mv_test.cc | 14 +- test/partial_idct_test.cc | 6 +- test/pp_filter_test.cc | 2 +- test/predict_test.cc | 16 +- test/quantize_test.cc | 4 +- test/resize_test.cc | 8 +- test/sad_test.cc | 12 +- test/superframe_test.cc | 2 +- test/test_intra_pred_speed.cc | 198 ++++++++++++----------- test/test_vector_test.cc | 8 +- test/user_priv_test.cc | 8 +- test/variance_test.cc | 16 +- test/vp9_boolcoder_test.cc | 2 +- test/vp9_datarate_test.cc | 2 +- test/vp9_encoder_parms_get_to_decoder.cc | 2 +- test/vp9_end_to_end_test.cc | 6 +- test/vp9_ethread_test.cc | 2 +- test/vp9_intrapred_test.cc | 7 +- test/vp9_motion_vector_test.cc | 2 +- test/vp9_quantize_test.cc | 14 +- test/vp9_skip_loopfilter_test.cc | 19 +-- test/vp9_thread_test.cc | 26 +-- test/y4m_test.cc | 12 +- vp9/ratectrl_rtc.cc | 2 +- webmdec.cc | 42 ++--- webmenc.cc | 4 +- 46 files changed, 376 insertions(+), 354 deletions(-) diff --git a/test/active_map_refresh_test.cc b/test/active_map_refresh_test.cc index 208d7c6d79..68d8856eaa 100644 --- a/test/active_map_refresh_test.cc +++ b/test/active_map_refresh_test.cc @@ -80,7 +80,7 @@ class ActiveMapRefreshTest } else if (video->frame() >= 2 && video->img()) { vpx_image_t *current = video->img(); vpx_image_t *previous = y4m_holder_->img(); - ASSERT_TRUE(previous != NULL); + ASSERT_NE(previous, nullptr); vpx_active_map_t map = vpx_active_map_t(); const int width = static_cast(current->d_w); const int height = static_cast(current->d_h); diff --git a/test/active_map_test.cc b/test/active_map_test.cc index e08d9a0367..9c55f9a8b1 100644 --- a/test/active_map_test.cc +++ b/test/active_map_test.cc @@ -62,7 +62,7 @@ class ActiveMapTest vpx_active_map_t map = vpx_active_map_t(); map.cols = (kWidth + 15) / 16; map.rows = (kHeight + 15) / 16; - map.active_map = NULL; + map.active_map = nullptr; encoder->Control(VP8E_SET_ACTIVEMAP, &map); } } diff --git a/test/add_noise_test.cc b/test/add_noise_test.cc index 1c4403ef6a..25de4279c2 100644 --- a/test/add_noise_test.cc +++ b/test/add_noise_test.cc @@ -53,7 +53,7 @@ TEST_P(AddNoiseTest, CheckNoiseAdded) { const int clamp = vpx_setup_noise(GET_PARAM(0), noise, kNoiseSize); uint8_t *const s = reinterpret_cast(vpx_calloc(image_size, sizeof(*s))); - ASSERT_TRUE(s != NULL); + ASSERT_NE(s, nullptr); memset(s, 99, image_size * sizeof(*s)); ASM_REGISTER_STATE_CHECK( @@ -106,8 +106,8 @@ TEST_P(AddNoiseTest, CheckCvsAssembly) { uint8_t *const s = reinterpret_cast(vpx_calloc(image_size, 1)); uint8_t *const d = reinterpret_cast(vpx_calloc(image_size, 1)); - ASSERT_TRUE(s != NULL); - ASSERT_TRUE(d != NULL); + ASSERT_NE(s, nullptr); + ASSERT_NE(d, nullptr); memset(s, 99, image_size); memset(d, 99, image_size); diff --git a/test/avg_test.cc b/test/avg_test.cc index 3b06aa849c..4bc1944b1c 100644 --- a/test/avg_test.cc +++ b/test/avg_test.cc @@ -35,12 +35,12 @@ template class AverageTestBase : public ::testing::Test { public: AverageTestBase(int width, int height) - : width_(width), height_(height), source_data_(NULL), source_stride_(0), - bit_depth_(8) {} + : width_(width), height_(height), source_data_(nullptr), + source_stride_(0), bit_depth_(8) {} virtual void TearDown() { vpx_free(source_data_); - source_data_ = NULL; + source_data_ = nullptr; libvpx_test::ClearSystemState(); } @@ -52,7 +52,7 @@ class AverageTestBase : public ::testing::Test { virtual void SetUp() { source_data_ = reinterpret_cast( vpx_memalign(kDataAlignment, kDataBlockSize * sizeof(source_data_[0]))); - ASSERT_TRUE(source_data_ != NULL); + ASSERT_NE(source_data_, nullptr); source_stride_ = (width_ + 31) & ~31; bit_depth_ = 8; rnd_.Reset(ACMRandom::DeterministicSeed()); @@ -162,7 +162,8 @@ class IntProRowTest : public AverageTestBase, public ::testing::WithParamInterface { public: IntProRowTest() - : AverageTestBase(16, GET_PARAM(0)), hbuf_asm_(NULL), hbuf_c_(NULL) { + : AverageTestBase(16, GET_PARAM(0)), hbuf_asm_(nullptr), + hbuf_c_(nullptr) { asm_func_ = GET_PARAM(1); c_func_ = GET_PARAM(2); } @@ -171,7 +172,7 @@ class IntProRowTest : public AverageTestBase, virtual void SetUp() { source_data_ = reinterpret_cast( vpx_memalign(kDataAlignment, kDataBlockSize * sizeof(source_data_[0]))); - ASSERT_TRUE(source_data_ != NULL); + ASSERT_NE(source_data_, nullptr); hbuf_asm_ = reinterpret_cast( vpx_memalign(kDataAlignment, sizeof(*hbuf_asm_) * 16)); @@ -181,11 +182,11 @@ class IntProRowTest : public AverageTestBase, virtual void TearDown() { vpx_free(source_data_); - source_data_ = NULL; + source_data_ = nullptr; vpx_free(hbuf_c_); - hbuf_c_ = NULL; + hbuf_c_ = nullptr; vpx_free(hbuf_asm_); - hbuf_asm_ = NULL; + hbuf_asm_ = nullptr; } void RunComparison() { @@ -241,7 +242,7 @@ class SatdTest : public ::testing::Test, rnd_.Reset(ACMRandom::DeterministicSeed()); src_ = reinterpret_cast( vpx_memalign(16, sizeof(*src_) * satd_size_)); - ASSERT_TRUE(src_ != NULL); + ASSERT_NE(src_, nullptr); } virtual void TearDown() { @@ -297,8 +298,8 @@ class BlockErrorTestFP vpx_memalign(16, sizeof(*coeff_) * txfm_size_)); dqcoeff_ = reinterpret_cast( vpx_memalign(16, sizeof(*dqcoeff_) * txfm_size_)); - ASSERT_TRUE(coeff_ != NULL); - ASSERT_TRUE(dqcoeff_ != NULL); + ASSERT_NE(coeff_, nullptr); + ASSERT_NE(dqcoeff_, nullptr); } virtual void TearDown() { diff --git a/test/blockiness_test.cc b/test/blockiness_test.cc index 1ad444a044..11b2a3f61f 100644 --- a/test/blockiness_test.cc +++ b/test/blockiness_test.cc @@ -44,9 +44,9 @@ class BlockinessTestBase : public ::testing::Test { static void TearDownTestSuite() { vpx_free(source_data_); - source_data_ = NULL; + source_data_ = nullptr; vpx_free(reference_data_); - reference_data_ = NULL; + reference_data_ = nullptr; } virtual void TearDown() { libvpx_test::ClearSystemState(); } @@ -154,8 +154,8 @@ class BlockinessVP9Test }; #endif // CONFIG_VP9_ENCODER -uint8_t *BlockinessTestBase::source_data_ = NULL; -uint8_t *BlockinessTestBase::reference_data_ = NULL; +uint8_t *BlockinessTestBase::source_data_ = nullptr; +uint8_t *BlockinessTestBase::reference_data_ = nullptr; #if CONFIG_VP9_ENCODER TEST_P(BlockinessVP9Test, SourceBlockierThanReference) { diff --git a/test/byte_alignment_test.cc b/test/byte_alignment_test.cc index 9feb7bdce7..1e0ffceb8d 100644 --- a/test/byte_alignment_test.cc +++ b/test/byte_alignment_test.cc @@ -55,23 +55,24 @@ const ByteAlignmentTestParam kBaTestParams[] = { class ByteAlignmentTest : public ::testing::TestWithParam { protected: - ByteAlignmentTest() : video_(NULL), decoder_(NULL), md5_file_(NULL) {} + ByteAlignmentTest() + : video_(nullptr), decoder_(nullptr), md5_file_(nullptr) {} virtual void SetUp() { video_ = new libvpx_test::WebMVideoSource(kVP9TestFile); - ASSERT_TRUE(video_ != NULL); + ASSERT_NE(video_, nullptr); video_->Init(); video_->Begin(); const vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t(); decoder_ = new libvpx_test::VP9Decoder(cfg, 0); - ASSERT_TRUE(decoder_ != NULL); + ASSERT_NE(decoder_, nullptr); OpenMd5File(kVP9Md5File); } virtual void TearDown() { - if (md5_file_ != NULL) fclose(md5_file_); + if (md5_file_ != nullptr) fclose(md5_file_); delete decoder_; delete video_; @@ -90,7 +91,7 @@ class ByteAlignmentTest } vpx_codec_err_t DecodeRemainingFrames(int byte_alignment_to_check) { - for (; video_->cxdata() != NULL; video_->Next()) { + for (; video_->cxdata() != nullptr; video_->Next()) { const vpx_codec_err_t res = decoder_->DecodeFrame(video_->cxdata(), video_->frame_size()); if (res != VPX_CODEC_OK) return res; @@ -113,7 +114,7 @@ class ByteAlignmentTest const vpx_image_t *img; // Get decompressed data - while ((img = dec_iter.Next()) != NULL) { + while ((img = dec_iter.Next()) != nullptr) { if (byte_alignment_to_check == kLegacyByteAlignment) { CheckByteAlignment(img->planes[0], kLegacyYPlaneByteAlignment); } else { @@ -128,12 +129,12 @@ class ByteAlignmentTest // TODO(fgalligan): Move the MD5 testing code into another class. void OpenMd5File(const std::string &md5_file_name_) { md5_file_ = libvpx_test::OpenTestDataFile(md5_file_name_); - ASSERT_TRUE(md5_file_ != NULL) + ASSERT_NE(md5_file_, nullptr) << "MD5 file open failed. Filename: " << md5_file_name_; } void CheckMd5(const vpx_image_t &img) { - ASSERT_TRUE(md5_file_ != NULL); + ASSERT_NE(md5_file_, nullptr); char expected_md5[33]; char junk[128]; diff --git a/test/comp_avg_pred_test.cc b/test/comp_avg_pred_test.cc index b33bc3fba9..b9201a20f9 100644 --- a/test/comp_avg_pred_test.cc +++ b/test/comp_avg_pred_test.cc @@ -29,9 +29,9 @@ uint8_t avg_with_rounding(uint8_t a, uint8_t b) { return (a + b + 1) >> 1; } void reference_pred(const Buffer &pred, const Buffer &ref, int width, int height, Buffer *avg) { - ASSERT_TRUE(avg->TopLeftPixel() != NULL); - ASSERT_TRUE(pred.TopLeftPixel() != NULL); - ASSERT_TRUE(ref.TopLeftPixel() != NULL); + ASSERT_NE(avg->TopLeftPixel(), nullptr); + ASSERT_NE(pred.TopLeftPixel(), nullptr); + ASSERT_NE(ref.TopLeftPixel(), nullptr); for (int y = 0; y < height; ++y) { for (int x = 0; x < width; ++x) { diff --git a/test/consistency_test.cc b/test/consistency_test.cc index f31fd8c923..f0e2cb297e 100644 --- a/test/consistency_test.cc +++ b/test/consistency_test.cc @@ -54,13 +54,13 @@ class ConsistencyTestBase : public ::testing::Test { static void ClearSsim() { memset(ssim_array_, 0, kDataBufferSize / 16); } static void TearDownTestSuite() { vpx_free(source_data_[0]); - source_data_[0] = NULL; + source_data_[0] = nullptr; vpx_free(reference_data_[0]); - reference_data_[0] = NULL; + reference_data_[0] = nullptr; vpx_free(source_data_[1]); - source_data_[1] = NULL; + source_data_[1] = nullptr; vpx_free(reference_data_[1]); - reference_data_[1] = NULL; + reference_data_[1] = nullptr; delete[] ssim_array_; } @@ -145,9 +145,9 @@ class ConsistencyVP9Test }; #endif // CONFIG_VP9_ENCODER -uint8_t *ConsistencyTestBase::source_data_[2] = { NULL, NULL }; -uint8_t *ConsistencyTestBase::reference_data_[2] = { NULL, NULL }; -Ssimv *ConsistencyTestBase::ssim_array_ = NULL; +uint8_t *ConsistencyTestBase::source_data_[2] = { nullptr, nullptr }; +uint8_t *ConsistencyTestBase::reference_data_[2] = { nullptr, nullptr }; +Ssimv *ConsistencyTestBase::ssim_array_ = nullptr; #if CONFIG_VP9_ENCODER TEST_P(ConsistencyVP9Test, ConsistencyIsZero) { diff --git a/test/convolve_test.cc b/test/convolve_test.cc index 6eef26f93a..4b2dadefac 100644 --- a/test/convolve_test.cc +++ b/test/convolve_test.cc @@ -365,18 +365,18 @@ class ConvolveTest : public ::testing::TestWithParam { static void TearDownTestSuite() { vpx_free(input_ - 1); - input_ = NULL; + input_ = nullptr; vpx_free(output_); - output_ = NULL; + output_ = nullptr; vpx_free(output_ref_); - output_ref_ = NULL; + output_ref_ = nullptr; #if CONFIG_VP9_HIGHBITDEPTH vpx_free(input16_ - 1); - input16_ = NULL; + input16_ = nullptr; vpx_free(output16_); - output16_ = NULL; + output16_ = nullptr; vpx_free(output16_ref_); - output16_ref_ = NULL; + output16_ref_ = nullptr; #endif } @@ -541,13 +541,13 @@ class ConvolveTest : public ::testing::TestWithParam { #endif }; -uint8_t *ConvolveTest::input_ = NULL; -uint8_t *ConvolveTest::output_ = NULL; -uint8_t *ConvolveTest::output_ref_ = NULL; +uint8_t *ConvolveTest::input_ = nullptr; +uint8_t *ConvolveTest::output_ = nullptr; +uint8_t *ConvolveTest::output_ref_ = nullptr; #if CONFIG_VP9_HIGHBITDEPTH -uint16_t *ConvolveTest::input16_ = NULL; -uint16_t *ConvolveTest::output16_ = NULL; -uint16_t *ConvolveTest::output16_ref_ = NULL; +uint16_t *ConvolveTest::input16_ = nullptr; +uint16_t *ConvolveTest::output16_ = nullptr; +uint16_t *ConvolveTest::output16_ref_ = nullptr; #endif TEST_P(ConvolveTest, GuardBlocks) { CheckGuardBlocks(); } @@ -562,7 +562,7 @@ TEST_P(ConvolveTest, DISABLED_Copy_Speed) { vpx_usec_timer_start(&timer); for (int n = 0; n < kNumTests; ++n) { - UUT_->copy_[0](in, kInputStride, out, kOutputStride, NULL, 0, 0, 0, 0, + UUT_->copy_[0](in, kInputStride, out, kOutputStride, nullptr, 0, 0, 0, 0, width, height); } vpx_usec_timer_mark(&timer); @@ -582,7 +582,7 @@ TEST_P(ConvolveTest, DISABLED_Avg_Speed) { vpx_usec_timer_start(&timer); for (int n = 0; n < kNumTests; ++n) { - UUT_->copy_[1](in, kInputStride, out, kOutputStride, NULL, 0, 0, 0, 0, + UUT_->copy_[1](in, kInputStride, out, kOutputStride, nullptr, 0, 0, 0, 0, width, height); } vpx_usec_timer_mark(&timer); @@ -780,7 +780,8 @@ TEST_P(ConvolveTest, Copy) { uint8_t *const out = output(); ASM_REGISTER_STATE_CHECK(UUT_->copy_[0](in, kInputStride, out, kOutputStride, - NULL, 0, 0, 0, 0, Width(), Height())); + nullptr, 0, 0, 0, 0, Width(), + Height())); CheckGuardBlocks(); @@ -799,7 +800,8 @@ TEST_P(ConvolveTest, Avg) { CopyOutputToRef(); ASM_REGISTER_STATE_CHECK(UUT_->copy_[1](in, kInputStride, out, kOutputStride, - NULL, 0, 0, 0, 0, Width(), Height())); + nullptr, 0, 0, 0, 0, Width(), + Height())); CheckGuardBlocks(); @@ -955,9 +957,9 @@ TEST_P(ConvolveTest, MatchesReferenceSubpixelFilter) { UUT_->h8_[i](in, kInputStride, out, kOutputStride, filters, filter_x, 16, 0, 16, Width(), Height())); else - ASM_REGISTER_STATE_CHECK(UUT_->copy_[i](in, kInputStride, out, - kOutputStride, NULL, 0, 0, - 0, 0, Width(), Height())); + ASM_REGISTER_STATE_CHECK( + UUT_->copy_[i](in, kInputStride, out, kOutputStride, nullptr, 0, + 0, 0, 0, Width(), Height())); CheckGuardBlocks(); @@ -1053,9 +1055,9 @@ TEST_P(ConvolveTest, FilterExtremes) { UUT_->h8_[0](in, kInputStride, out, kOutputStride, filters, filter_x, 16, 0, 16, Width(), Height())); else - ASM_REGISTER_STATE_CHECK(UUT_->copy_[0](in, kInputStride, out, - kOutputStride, NULL, 0, 0, - 0, 0, Width(), Height())); + ASM_REGISTER_STATE_CHECK( + UUT_->copy_[0](in, kInputStride, out, kOutputStride, nullptr, + 0, 0, 0, 0, Width(), Height())); for (int y = 0; y < Height(); ++y) { for (int x = 0; x < Width(); ++x) diff --git a/test/dct_partial_test.cc b/test/dct_partial_test.cc index 2fd176f25a..8d0e3a912e 100644 --- a/test/dct_partial_test.cc +++ b/test/dct_partial_test.cc @@ -39,7 +39,7 @@ typedef tuple tran_low_t partial_fdct_ref(const Buffer &in, int size) { int64_t sum = 0; - if (in.TopLeftPixel() != NULL) { + if (in.TopLeftPixel() != nullptr) { for (int y = 0; y < size; ++y) { for (int x = 0; x < size; ++x) { sum += in.TopLeftPixel()[y * in.stride() + x]; @@ -81,7 +81,7 @@ class PartialFdctTest : public ::testing::TestWithParam { Buffer output_block = Buffer(size_, size_, 0, 16); ASSERT_TRUE(output_block.Init()); - if (output_block.TopLeftPixel() != NULL) { + if (output_block.TopLeftPixel() != nullptr) { for (int i = 0; i < 100; ++i) { if (i == 0) { input_block.Set(maxvalue); diff --git a/test/dct_test.cc b/test/dct_test.cc index a1b766e103..9541869535 100644 --- a/test/dct_test.cc +++ b/test/dct_test.cc @@ -160,17 +160,17 @@ class TransTestBase : public ::testing::TestWithParam { src_ = reinterpret_cast( vpx_memalign(16, pixel_size_ * block_size_)); - ASSERT_TRUE(src_ != NULL); + ASSERT_NE(src_, nullptr); dst_ = reinterpret_cast( vpx_memalign(16, pixel_size_ * block_size_)); - ASSERT_TRUE(dst_ != NULL); + ASSERT_NE(dst_, nullptr); } virtual void TearDown() { vpx_free(src_); - src_ = NULL; + src_ = nullptr; vpx_free(dst_); - dst_ = NULL; + dst_ = nullptr; libvpx_test::ClearSystemState(); } @@ -211,7 +211,7 @@ class TransTestBase : public ::testing::TestWithParam { Buffer test_input_block = Buffer(size_, size_, 8, size_ == 4 ? 0 : 16); ASSERT_TRUE(test_input_block.Init()); - ASSERT_TRUE(test_input_block.TopLeftPixel() != NULL); + ASSERT_NE(test_input_block.TopLeftPixel(), nullptr); Buffer test_temp_block = Buffer(size_, size_, 0, 16); ASSERT_TRUE(test_temp_block.Init()); @@ -316,7 +316,7 @@ class TransTestBase : public ::testing::TestWithParam { } else if (i == 1) { input_extreme_block.Set(-max_pixel_value_); } else { - ASSERT_TRUE(input_extreme_block.TopLeftPixel() != NULL); + ASSERT_NE(input_extreme_block.TopLeftPixel(), nullptr); for (int h = 0; h < size_; ++h) { for (int w = 0; w < size_; ++w) { input_extreme_block @@ -331,7 +331,7 @@ class TransTestBase : public ::testing::TestWithParam { // The minimum quant value is 4. EXPECT_TRUE(output_block.CheckValues(output_ref_block)); - ASSERT_TRUE(output_block.TopLeftPixel() != NULL); + ASSERT_NE(output_block.TopLeftPixel(), nullptr); for (int h = 0; h < size_; ++h) { for (int w = 0; w < size_; ++w) { EXPECT_GE( @@ -369,7 +369,7 @@ class TransTestBase : public ::testing::TestWithParam { for (int i = 0; i < count_test_block; ++i) { InitMem(); - ASSERT_TRUE(in.TopLeftPixel() != NULL); + ASSERT_NE(in.TopLeftPixel(), nullptr); // Initialize a test block with input range [-max_pixel_value_, // max_pixel_value_]. for (int h = 0; h < size_; ++h) { diff --git a/test/decode_perf_test.cc b/test/decode_perf_test.cc index 34d53862a3..e07a667440 100644 --- a/test/decode_perf_test.cc +++ b/test/decode_perf_test.cc @@ -87,7 +87,7 @@ TEST_P(DecodePerfTest, PerfTest) { vpx_usec_timer t; vpx_usec_timer_start(&t); - for (video.Begin(); video.cxdata() != NULL; video.Next()) { + for (video.Begin(); video.cxdata() != nullptr; video.Next()) { decoder.DecodeFrame(video.cxdata(), video.frame_size()); } @@ -150,16 +150,16 @@ class VP9NewEncodeDecodePerfTest const std::string data_path = getenv("LIBVPX_TEST_DATA_PATH"); const std::string path_to_source = data_path + "/" + kNewEncodeOutputFile; outfile_ = fopen(path_to_source.c_str(), "wb"); - ASSERT_TRUE(outfile_ != NULL); + ASSERT_NE(outfile_, nullptr); } virtual void EndPassHook() { - if (outfile_ != NULL) { + if (outfile_ != nullptr) { if (!fseek(outfile_, 0, SEEK_SET)) { ivf_write_file_header(outfile_, &cfg_, VP9_FOURCC, out_frames_); } fclose(outfile_); - outfile_ = NULL; + outfile_ = nullptr; } } @@ -236,7 +236,7 @@ TEST_P(VP9NewEncodeDecodePerfTest, PerfTest) { vpx_usec_timer t; vpx_usec_timer_start(&t); - for (decode_video.Begin(); decode_video.cxdata() != NULL; + for (decode_video.Begin(); decode_video.cxdata() != nullptr; decode_video.Next()) { decoder.DecodeFrame(decode_video.cxdata(), decode_video.frame_size()); } diff --git a/test/decode_svc_test.cc b/test/decode_svc_test.cc index be0f32545e..ec9935da79 100644 --- a/test/decode_svc_test.cc +++ b/test/decode_svc_test.cc @@ -56,7 +56,7 @@ TEST_P(DecodeSvcTest, DecodeSvcTestUpToSpatialLayer0) { const std::string filename = GET_PARAM(1); std::unique_ptr video; video.reset(new libvpx_test::IVFVideoSource(filename)); - ASSERT_TRUE(video.get() != NULL); + ASSERT_NE(video.get(), nullptr); video->Init(); total_frames_ = 0; spatial_layer_ = 0; @@ -73,7 +73,7 @@ TEST_P(DecodeSvcTest, DecodeSvcTestUpToSpatialLayer1) { const std::string filename = GET_PARAM(1); std::unique_ptr video; video.reset(new libvpx_test::IVFVideoSource(filename)); - ASSERT_TRUE(video.get() != NULL); + ASSERT_NE(video.get(), nullptr); video->Init(); total_frames_ = 0; spatial_layer_ = 1; @@ -90,7 +90,7 @@ TEST_P(DecodeSvcTest, DecodeSvcTestUpToSpatialLayer2) { const std::string filename = GET_PARAM(1); std::unique_ptr video; video.reset(new libvpx_test::IVFVideoSource(filename)); - ASSERT_TRUE(video.get() != NULL); + ASSERT_NE(video.get(), nullptr); video->Init(); total_frames_ = 0; spatial_layer_ = 2; @@ -108,7 +108,7 @@ TEST_P(DecodeSvcTest, DecodeSvcTestUpToSpatialLayer10) { const std::string filename = GET_PARAM(1); std::unique_ptr video; video.reset(new libvpx_test::IVFVideoSource(filename)); - ASSERT_TRUE(video.get() != NULL); + ASSERT_NE(video.get(), nullptr); video->Init(); total_frames_ = 0; spatial_layer_ = 10; diff --git a/test/decode_test_driver.cc b/test/decode_test_driver.cc index ae23587759..773d673d37 100644 --- a/test/decode_test_driver.cc +++ b/test/decode_test_driver.cc @@ -26,7 +26,7 @@ vpx_codec_err_t Decoder::PeekStream(const uint8_t *cxdata, size_t size, } vpx_codec_err_t Decoder::DecodeFrame(const uint8_t *cxdata, size_t size) { - return DecodeFrame(cxdata, size, NULL); + return DecodeFrame(cxdata, size, nullptr); } vpx_codec_err_t Decoder::DecodeFrame(const uint8_t *cxdata, size_t size, @@ -67,7 +67,7 @@ void DecoderTest::HandlePeekResult(Decoder *const decoder, void DecoderTest::RunLoop(CompressedVideoSource *video, const vpx_codec_dec_cfg_t &dec_cfg) { Decoder *const decoder = codec_->CreateDecoder(dec_cfg, flags_); - ASSERT_TRUE(decoder != NULL); + ASSERT_NE(decoder, nullptr); bool end_of_file = false; // Decode frames. @@ -78,7 +78,7 @@ void DecoderTest::RunLoop(CompressedVideoSource *video, vpx_codec_stream_info_t stream_info; stream_info.sz = sizeof(stream_info); - if (video->cxdata() != NULL) { + if (video->cxdata() != nullptr) { const vpx_codec_err_t res_peek = decoder->PeekStream( video->cxdata(), video->frame_size(), &stream_info); HandlePeekResult(decoder, video, res_peek); @@ -89,13 +89,13 @@ void DecoderTest::RunLoop(CompressedVideoSource *video, if (!HandleDecodeResult(res_dec, *video, decoder)) break; } else { // Signal end of the file to the decoder. - const vpx_codec_err_t res_dec = decoder->DecodeFrame(NULL, 0); + const vpx_codec_err_t res_dec = decoder->DecodeFrame(nullptr, 0); ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder->DecodeError(); end_of_file = true; } DxDataIterator dec_iter = decoder->GetDxData(); - const vpx_image_t *img = NULL; + const vpx_image_t *img = nullptr; // Get decompressed data while (!::testing::Test::HasFailure() && (img = dec_iter.Next())) { diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc index 87e29b61db..6bd7e593da 100644 --- a/test/encode_api_test.cc +++ b/test/encode_api_test.cc @@ -34,29 +34,33 @@ TEST(EncodeAPI, InvalidParams) { EXPECT_EQ(&img, vpx_img_wrap(&img, VPX_IMG_FMT_I420, 1, 1, 1, buf)); - EXPECT_EQ(VPX_CODEC_INVALID_PARAM, vpx_codec_enc_init(NULL, NULL, NULL, 0)); - EXPECT_EQ(VPX_CODEC_INVALID_PARAM, vpx_codec_enc_init(&enc, NULL, NULL, 0)); - EXPECT_EQ(VPX_CODEC_INVALID_PARAM, vpx_codec_encode(NULL, NULL, 0, 0, 0, 0)); - EXPECT_EQ(VPX_CODEC_INVALID_PARAM, vpx_codec_encode(NULL, &img, 0, 0, 0, 0)); - EXPECT_EQ(VPX_CODEC_INVALID_PARAM, vpx_codec_destroy(NULL)); EXPECT_EQ(VPX_CODEC_INVALID_PARAM, - vpx_codec_enc_config_default(NULL, NULL, 0)); + vpx_codec_enc_init(nullptr, nullptr, nullptr, 0)); EXPECT_EQ(VPX_CODEC_INVALID_PARAM, - vpx_codec_enc_config_default(NULL, &cfg, 0)); - EXPECT_TRUE(vpx_codec_error(NULL) != NULL); + vpx_codec_enc_init(&enc, nullptr, nullptr, 0)); + EXPECT_EQ(VPX_CODEC_INVALID_PARAM, + vpx_codec_encode(nullptr, nullptr, 0, 0, 0, 0)); + EXPECT_EQ(VPX_CODEC_INVALID_PARAM, + vpx_codec_encode(nullptr, &img, 0, 0, 0, 0)); + EXPECT_EQ(VPX_CODEC_INVALID_PARAM, vpx_codec_destroy(nullptr)); + EXPECT_EQ(VPX_CODEC_INVALID_PARAM, + vpx_codec_enc_config_default(nullptr, nullptr, 0)); + EXPECT_EQ(VPX_CODEC_INVALID_PARAM, + vpx_codec_enc_config_default(nullptr, &cfg, 0)); + EXPECT_NE(vpx_codec_error(nullptr), nullptr); for (int i = 0; i < NELEMENTS(kCodecs); ++i) { SCOPED_TRACE(vpx_codec_iface_name(kCodecs[i])); EXPECT_EQ(VPX_CODEC_INVALID_PARAM, - vpx_codec_enc_init(NULL, kCodecs[i], NULL, 0)); + vpx_codec_enc_init(nullptr, kCodecs[i], nullptr, 0)); EXPECT_EQ(VPX_CODEC_INVALID_PARAM, - vpx_codec_enc_init(&enc, kCodecs[i], NULL, 0)); + vpx_codec_enc_init(&enc, kCodecs[i], nullptr, 0)); EXPECT_EQ(VPX_CODEC_INVALID_PARAM, vpx_codec_enc_config_default(kCodecs[i], &cfg, 1)); EXPECT_EQ(VPX_CODEC_OK, vpx_codec_enc_config_default(kCodecs[i], &cfg, 0)); EXPECT_EQ(VPX_CODEC_OK, vpx_codec_enc_init(&enc, kCodecs[i], &cfg, 0)); - EXPECT_EQ(VPX_CODEC_OK, vpx_codec_encode(&enc, NULL, 0, 0, 0, 0)); + EXPECT_EQ(VPX_CODEC_OK, vpx_codec_encode(&enc, nullptr, 0, 0, 0, 0)); EXPECT_EQ(VPX_CODEC_OK, vpx_codec_destroy(&enc)); } diff --git a/test/encode_test_driver.cc b/test/encode_test_driver.cc index 8fdbdb62ae..6914804ec0 100644 --- a/test/encode_test_driver.cc +++ b/test/encode_test_driver.cc @@ -91,7 +91,7 @@ void Encoder::EncodeFrameInternal(const VideoSource &video, void Encoder::Flush() { const vpx_codec_err_t res = - vpx_codec_encode(&encoder_, NULL, 0, 0, 0, deadline_); + vpx_codec_encode(&encoder_, nullptr, 0, 0, 0, deadline_); if (!encoder_.priv) ASSERT_EQ(VPX_CODEC_ERROR, res) << EncoderError(); else @@ -182,7 +182,7 @@ void EncoderTest::RunLoop(VideoSource *video) { BeginPassHook(pass); std::unique_ptr encoder( codec_->CreateEncoder(cfg_, deadline_, init_flags_, &stats_)); - ASSERT_TRUE(encoder.get() != NULL); + ASSERT_NE(encoder.get(), nullptr); ASSERT_NO_FATAL_FAILURE(video->Begin()); encoder->InitEncoder(video); @@ -198,7 +198,7 @@ void EncoderTest::RunLoop(VideoSource *video) { codec_->CreateDecoder(dec_cfg, dec_init_flags)); bool again; for (again = true; again; video->Next()) { - again = (video->img() != NULL); + again = (video->img() != nullptr); PreEncodeFrameHook(video); PreEncodeFrameHook(video, encoder.get()); @@ -216,7 +216,7 @@ void EncoderTest::RunLoop(VideoSource *video) { switch (pkt->kind) { case VPX_CODEC_CX_FRAME_PKT: has_cxdata = true; - if (decoder.get() != NULL && DoDecode()) { + if (decoder.get() != nullptr && DoDecode()) { PreDecodeFrameHook(video, decoder.get()); vpx_codec_err_t res_dec = decoder->DecodeFrame( (const uint8_t *)pkt->data.frame.buf, pkt->data.frame.sz); @@ -240,7 +240,7 @@ void EncoderTest::RunLoop(VideoSource *video) { // Flush the decoder when there are no more fragments. if ((init_flags_ & VPX_CODEC_USE_OUTPUT_PARTITION) && has_dxdata) { - const vpx_codec_err_t res_dec = decoder->DecodeFrame(NULL, 0); + const vpx_codec_err_t res_dec = decoder->DecodeFrame(nullptr, 0); if (!HandleDecodeResult(res_dec, *video, decoder.get())) break; } diff --git a/test/external_frame_buffer_test.cc b/test/external_frame_buffer_test.cc index 25dd30011b..3bd4a1c473 100644 --- a/test/external_frame_buffer_test.cc +++ b/test/external_frame_buffer_test.cc @@ -36,7 +36,7 @@ struct ExternalFrameBuffer { class ExternalFrameBufferList { public: ExternalFrameBufferList() - : num_buffers_(0), num_used_buffers_(0), ext_fb_list_(NULL) {} + : num_buffers_(0), num_used_buffers_(0), ext_fb_list_(nullptr) {} virtual ~ExternalFrameBufferList() { for (int i = 0; i < num_buffers_; ++i) { @@ -51,7 +51,7 @@ class ExternalFrameBufferList { num_buffers_ = num_buffers; ext_fb_list_ = new ExternalFrameBuffer[num_buffers_]; - EXPECT_TRUE(ext_fb_list_ != NULL); + EXPECT_NE(ext_fb_list_, nullptr); memset(ext_fb_list_, 0, sizeof(ext_fb_list_[0]) * num_buffers_); return true; } @@ -61,7 +61,7 @@ class ExternalFrameBufferList { // frame buffer is in use by libvpx. Finally sets |fb| to point to the // external frame buffer. Returns < 0 on an error. int GetFreeFrameBuffer(size_t min_size, vpx_codec_frame_buffer_t *fb) { - EXPECT_TRUE(fb != NULL); + EXPECT_NE(fb, nullptr); const int idx = FindFreeBufferIndex(); if (idx == num_buffers_) return -1; @@ -81,13 +81,13 @@ class ExternalFrameBufferList { // Test function that will not allocate any data for the frame buffer. // Returns < 0 on an error. int GetZeroFrameBuffer(size_t min_size, vpx_codec_frame_buffer_t *fb) { - EXPECT_TRUE(fb != NULL); + EXPECT_NE(fb, nullptr); const int idx = FindFreeBufferIndex(); if (idx == num_buffers_) return -1; if (ext_fb_list_[idx].size < min_size) { delete[] ext_fb_list_[idx].data; - ext_fb_list_[idx].data = NULL; + ext_fb_list_[idx].data = nullptr; ext_fb_list_[idx].size = min_size; } @@ -98,14 +98,14 @@ class ExternalFrameBufferList { // Marks the external frame buffer that |fb| is pointing to as free. // Returns < 0 on an error. int ReturnFrameBuffer(vpx_codec_frame_buffer_t *fb) { - if (fb == NULL) { - EXPECT_TRUE(fb != NULL); + if (fb == nullptr) { + EXPECT_NE(fb, nullptr); return -1; } ExternalFrameBuffer *const ext_fb = reinterpret_cast(fb->priv); - if (ext_fb == NULL) { - EXPECT_TRUE(ext_fb != NULL); + if (ext_fb == nullptr) { + EXPECT_NE(ext_fb, nullptr); return -1; } EXPECT_EQ(1, ext_fb->in_use); @@ -117,7 +117,7 @@ class ExternalFrameBufferList { // Checks that the vpx_image_t data is contained within the external frame // buffer private data passed back in the vpx_image_t. void CheckImageFrameBuffer(const vpx_image_t *img) { - if (img->fb_priv != NULL) { + if (img->fb_priv != nullptr) { const struct ExternalFrameBuffer *const ext_fb = reinterpret_cast(img->fb_priv); @@ -143,7 +143,7 @@ class ExternalFrameBufferList { // Sets |fb| to an external frame buffer. idx is the index into the frame // buffer list. void SetFrameBuffer(int idx, vpx_codec_frame_buffer_t *fb) { - ASSERT_TRUE(fb != NULL); + ASSERT_NE(fb, nullptr); fb->data = ext_fb_list_[idx].data; fb->size = ext_fb_list_[idx].size; ASSERT_EQ(0, ext_fb_list_[idx].in_use); @@ -208,10 +208,10 @@ class ExternalFrameBufferMD5Test protected: ExternalFrameBufferMD5Test() : DecoderTest(GET_PARAM(::libvpx_test::kCodecFactoryParam)), - md5_file_(NULL), num_buffers_(0) {} + md5_file_(nullptr), num_buffers_(0) {} virtual ~ExternalFrameBufferMD5Test() { - if (md5_file_ != NULL) fclose(md5_file_); + if (md5_file_ != nullptr) fclose(md5_file_); } virtual void PreDecodeFrameHook( @@ -228,13 +228,13 @@ class ExternalFrameBufferMD5Test void OpenMD5File(const std::string &md5_file_name_) { md5_file_ = libvpx_test::OpenTestDataFile(md5_file_name_); - ASSERT_TRUE(md5_file_ != NULL) + ASSERT_NE(md5_file_, nullptr) << "Md5 file open failed. Filename: " << md5_file_name_; } virtual void DecompressedFrameHook(const vpx_image_t &img, const unsigned int frame_number) { - ASSERT_TRUE(md5_file_ != NULL); + ASSERT_NE(md5_file_, nullptr); char expected_md5[33]; char junk[128]; @@ -286,24 +286,25 @@ const char kVP9NonRefTestFile[] = "vp90-2-22-svc_1280x720_1.webm"; // Class for testing passing in external frame buffers to libvpx. class ExternalFrameBufferTest : public ::testing::Test { protected: - ExternalFrameBufferTest() : video_(NULL), decoder_(NULL), num_buffers_(0) {} + ExternalFrameBufferTest() + : video_(nullptr), decoder_(nullptr), num_buffers_(0) {} virtual void SetUp() { video_ = new libvpx_test::WebMVideoSource(kVP9TestFile); - ASSERT_TRUE(video_ != NULL); + ASSERT_NE(video_, nullptr); video_->Init(); video_->Begin(); vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t(); decoder_ = new libvpx_test::VP9Decoder(cfg, 0); - ASSERT_TRUE(decoder_ != NULL); + ASSERT_NE(decoder_, nullptr); } virtual void TearDown() { delete decoder_; - decoder_ = NULL; + decoder_ = nullptr; delete video_; - video_ = NULL; + video_ = nullptr; } // Passes the external frame buffer information to libvpx. @@ -327,7 +328,7 @@ class ExternalFrameBufferTest : public ::testing::Test { } vpx_codec_err_t DecodeRemainingFrames() { - for (; video_->cxdata() != NULL; video_->Next()) { + for (; video_->cxdata() != nullptr; video_->Next()) { const vpx_codec_err_t res = decoder_->DecodeFrame(video_->cxdata(), video_->frame_size()); if (res != VPX_CODEC_OK) return res; @@ -338,10 +339,10 @@ class ExternalFrameBufferTest : public ::testing::Test { void CheckDecodedFrames() { libvpx_test::DxDataIterator dec_iter = decoder_->GetDxData(); - const vpx_image_t *img = NULL; + const vpx_image_t *img = nullptr; // Get decompressed data - while ((img = dec_iter.Next()) != NULL) { + while ((img = dec_iter.Next()) != nullptr) { fb_list_.CheckImageFrameBuffer(img); } } @@ -356,13 +357,13 @@ class ExternalFrameBufferNonRefTest : public ExternalFrameBufferTest { protected: virtual void SetUp() { video_ = new libvpx_test::WebMVideoSource(kVP9NonRefTestFile); - ASSERT_TRUE(video_ != NULL); + ASSERT_NE(video_, nullptr); video_->Init(); video_->Begin(); vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t(); decoder_ = new libvpx_test::VP9Decoder(cfg, 0); - ASSERT_TRUE(decoder_ != NULL); + ASSERT_NE(decoder_, nullptr); } virtual void CheckFrameBufferRelease() { @@ -405,7 +406,7 @@ TEST_P(ExternalFrameBufferMD5Test, ExtFBMD5Match) { return; #endif } - ASSERT_TRUE(video.get() != NULL); + ASSERT_NE(video.get(), nullptr); video->Init(); // Construct md5 file name. @@ -482,13 +483,14 @@ TEST_F(ExternalFrameBufferTest, NullGetFunction) { const int num_buffers = VP9_MAXIMUM_REF_BUFFERS + VPX_MAXIMUM_WORK_BUFFERS; ASSERT_EQ( VPX_CODEC_INVALID_PARAM, - SetFrameBufferFunctions(num_buffers, NULL, release_vp9_frame_buffer)); + SetFrameBufferFunctions(num_buffers, nullptr, release_vp9_frame_buffer)); } TEST_F(ExternalFrameBufferTest, NullReleaseFunction) { const int num_buffers = VP9_MAXIMUM_REF_BUFFERS + VPX_MAXIMUM_WORK_BUFFERS; - ASSERT_EQ(VPX_CODEC_INVALID_PARAM, - SetFrameBufferFunctions(num_buffers, get_vp9_frame_buffer, NULL)); + ASSERT_EQ( + VPX_CODEC_INVALID_PARAM, + SetFrameBufferFunctions(num_buffers, get_vp9_frame_buffer, nullptr)); } TEST_F(ExternalFrameBufferTest, SetAfterDecode) { diff --git a/test/frame_size_tests.cc b/test/frame_size_tests.cc index 9397af2b2d..d85c193e0b 100644 --- a/test/frame_size_tests.cc +++ b/test/frame_size_tests.cc @@ -69,7 +69,7 @@ class EncoderWithExpectedError : public ::libvpx_test::Encoder { #if CONFIG_VP9_ENCODER return &vpx_codec_vp9_cx_algo; #else - return NULL; + return nullptr; #endif } }; @@ -130,7 +130,7 @@ class VP9FrameSizeTestsLarge : public ::libvpx_test::EncoderTest, encoder->InitEncoder(video); ASSERT_FALSE(::testing::Test::HasFatalFailure()); for (bool again = true; again; video->Next()) { - again = (video->img() != NULL); + again = (video->img() != nullptr); PreEncodeFrameHook(video, encoder.get()); encoder->EncodeFrame(video, frame_flags_, expected_err); diff --git a/test/idct_test.cc b/test/idct_test.cc index 50d5ff433d..1b9532e1c1 100644 --- a/test/idct_test.cc +++ b/test/idct_test.cc @@ -31,13 +31,13 @@ class IDCTTest : public ::testing::TestWithParam { UUT = GetParam(); input = new Buffer(4, 4, 0); - ASSERT_TRUE(input != NULL); + ASSERT_NE(input, nullptr); ASSERT_TRUE(input->Init()); predict = new Buffer(4, 4, 3); - ASSERT_TRUE(predict != NULL); + ASSERT_NE(predict, nullptr); ASSERT_TRUE(predict->Init()); output = new Buffer(4, 4, 3); - ASSERT_TRUE(output != NULL); + ASSERT_NE(output, nullptr); ASSERT_TRUE(output->Init()); } @@ -72,7 +72,7 @@ TEST_P(IDCTTest, TestAllZeros) { TEST_P(IDCTTest, TestAllOnes) { input->Set(0); - ASSERT_TRUE(input->TopLeftPixel() != NULL); + ASSERT_NE(input->TopLeftPixel(), nullptr); // When the first element is '4' it will fill the output buffer with '1'. input->TopLeftPixel()[0] = 4; predict->Set(0); @@ -90,7 +90,7 @@ TEST_P(IDCTTest, TestAddOne) { // Set the transform output to '1' and make sure it gets added to the // prediction buffer. input->Set(0); - ASSERT_TRUE(input->TopLeftPixel() != NULL); + ASSERT_NE(input->TopLeftPixel(), nullptr); input->TopLeftPixel()[0] = 4; output->Set(0); diff --git a/test/invalid_file_test.cc b/test/invalid_file_test.cc index a5054a54d6..762d585f59 100644 --- a/test/invalid_file_test.cc +++ b/test/invalid_file_test.cc @@ -38,15 +38,15 @@ std::ostream &operator<<(std::ostream &os, const DecodeParam &dp) { class InvalidFileTest : public ::libvpx_test::DecoderTest, public ::libvpx_test::CodecTestWithParam { protected: - InvalidFileTest() : DecoderTest(GET_PARAM(0)), res_file_(NULL) {} + InvalidFileTest() : DecoderTest(GET_PARAM(0)), res_file_(nullptr) {} virtual ~InvalidFileTest() { - if (res_file_ != NULL) fclose(res_file_); + if (res_file_ != nullptr) fclose(res_file_); } void OpenResFile(const std::string &res_file_name_) { res_file_ = libvpx_test::OpenTestDataFile(res_file_name_); - ASSERT_TRUE(res_file_ != NULL) + ASSERT_NE(res_file_, nullptr) << "Result file open failed. Filename: " << res_file_name_; } @@ -54,7 +54,7 @@ class InvalidFileTest : public ::libvpx_test::DecoderTest, const vpx_codec_err_t res_dec, const libvpx_test::CompressedVideoSource &video, libvpx_test::Decoder *decoder) { - EXPECT_TRUE(res_file_ != NULL); + EXPECT_NE(res_file_, nullptr); int expected_res_dec; // Read integer result. @@ -102,7 +102,7 @@ class InvalidFileTest : public ::libvpx_test::DecoderTest, return; #endif } - ASSERT_TRUE(video.get() != NULL); + ASSERT_NE(video.get(), nullptr); video->Init(); // Construct result file name. The file holds a list of expected integer diff --git a/test/non_greedy_mv_test.cc b/test/non_greedy_mv_test.cc index c78331b285..927029de45 100644 --- a/test/non_greedy_mv_test.cc +++ b/test/non_greedy_mv_test.cc @@ -129,11 +129,11 @@ TEST(non_greedy_mv, smooth_mf) { const char *ground_truth_file = "non_greedy_mv_test_files/ground_truth_16x16.txt"; BLOCK_SIZE bsize = BLOCK_32X32; - MV *search_mf = NULL; - MV *smooth_mf = NULL; - MV *estimation = NULL; - MV *ground_truth = NULL; - int(*local_var)[MF_LOCAL_STRUCTURE_SIZE] = NULL; + MV *search_mf = nullptr; + MV *smooth_mf = nullptr; + MV *estimation = nullptr; + MV *ground_truth = nullptr; + int(*local_var)[MF_LOCAL_STRUCTURE_SIZE] = nullptr; int rows = 0, cols = 0; int alpha = 100, max_iter = 100; @@ -169,8 +169,8 @@ TEST(non_greedy_mv, local_var) { const char *gt_local_var_file = "non_greedy_mv_test_files/localVar_16x16.txt"; const char *search_mf_file = "non_greedy_mv_test_files/exhaust_16x16.txt"; BLOCK_SIZE bsize = BLOCK_16X16; - int(*gt_local_var)[MF_LOCAL_STRUCTURE_SIZE] = NULL; - int(*est_local_var)[MF_LOCAL_STRUCTURE_SIZE] = NULL; + int(*gt_local_var)[MF_LOCAL_STRUCTURE_SIZE] = nullptr; + int(*est_local_var)[MF_LOCAL_STRUCTURE_SIZE] = nullptr; YV12_BUFFER_CONFIG ref_frame, cur_frame; int rows, cols; MV *search_mf; diff --git a/test/partial_idct_test.cc b/test/partial_idct_test.cc index 61e7c1098b..a160120de1 100644 --- a/test/partial_idct_test.cc +++ b/test/partial_idct_test.cc @@ -102,11 +102,11 @@ class PartialIDctTest : public ::testing::TestWithParam { virtual void TearDown() { vpx_free(input_block_); - input_block_ = NULL; + input_block_ = nullptr; vpx_free(output_block_); - output_block_ = NULL; + output_block_ = nullptr; vpx_free(output_block_ref_); - output_block_ref_ = NULL; + output_block_ref_ = nullptr; libvpx_test::ClearSystemState(); } diff --git a/test/pp_filter_test.cc b/test/pp_filter_test.cc index 952a954295..a511ffbe98 100644 --- a/test/pp_filter_test.cc +++ b/test/pp_filter_test.cc @@ -459,7 +459,7 @@ TEST_P(VpxMbPostProcDownTest, CheckLowFilterOutput) { SetRows(src_c_.TopLeftPixel(), rows_, cols_, src_c_.stride()); unsigned char *expected_output = new unsigned char[rows_ * cols_]; - ASSERT_TRUE(expected_output != NULL); + ASSERT_NE(expected_output, nullptr); SetRows(expected_output, rows_, cols_, cols_); RunFilterLevel(src_c_.TopLeftPixel(), rows_, cols_, src_c_.stride(), q2mbl(0), diff --git a/test/predict_test.cc b/test/predict_test.cc index d884932bae..7472970576 100644 --- a/test/predict_test.cc +++ b/test/predict_test.cc @@ -41,11 +41,11 @@ class PredictTestBase : public AbstractBench, public: PredictTestBase() : width_(GET_PARAM(0)), height_(GET_PARAM(1)), predict_(GET_PARAM(2)), - src_(NULL), padded_dst_(NULL), dst_(NULL), dst_c_(NULL) {} + src_(nullptr), padded_dst_(nullptr), dst_(nullptr), dst_c_(nullptr) {} virtual void SetUp() { src_ = new uint8_t[kSrcSize]; - ASSERT_TRUE(src_ != NULL); + ASSERT_NE(src_, nullptr); // padded_dst_ provides a buffer of kBorderSize around the destination // memory to facilitate detecting out of bounds writes. @@ -53,11 +53,11 @@ class PredictTestBase : public AbstractBench, padded_dst_size_ = dst_stride_ * (kBorderSize + height_ + kBorderSize); padded_dst_ = reinterpret_cast(vpx_memalign(16, padded_dst_size_)); - ASSERT_TRUE(padded_dst_ != NULL); + ASSERT_NE(padded_dst_, nullptr); dst_ = padded_dst_ + (kBorderSize * dst_stride_) + kBorderSize; dst_c_ = new uint8_t[16 * 16]; - ASSERT_TRUE(dst_c_ != NULL); + ASSERT_NE(dst_c_, nullptr); memset(src_, 0, kSrcSize); memset(padded_dst_, 128, padded_dst_size_); @@ -66,12 +66,12 @@ class PredictTestBase : public AbstractBench, virtual void TearDown() { delete[] src_; - src_ = NULL; + src_ = nullptr; vpx_free(padded_dst_); - padded_dst_ = NULL; - dst_ = NULL; + padded_dst_ = nullptr; + dst_ = nullptr; delete[] dst_c_; - dst_c_ = NULL; + dst_c_ = nullptr; libvpx_test::ClearSystemState(); } diff --git a/test/quantize_test.cc b/test/quantize_test.cc index 96d5241712..b30b90f8f5 100644 --- a/test/quantize_test.cc +++ b/test/quantize_test.cc @@ -46,9 +46,9 @@ class QuantizeTestBase { public: virtual ~QuantizeTestBase() { vp8_remove_compressor(&vp8_comp_); - vp8_comp_ = NULL; + vp8_comp_ = nullptr; vpx_free(macroblockd_dst_); - macroblockd_dst_ = NULL; + macroblockd_dst_ = nullptr; libvpx_test::ClearSystemState(); } diff --git a/test/resize_test.cc b/test/resize_test.cc index 8ac0049162..65b94fa4f6 100644 --- a/test/resize_test.cc +++ b/test/resize_test.cc @@ -350,7 +350,7 @@ class ResizeInternalTest : public ResizeTest { protected: #if WRITE_COMPRESSED_STREAM ResizeInternalTest() - : ResizeTest(), frame0_psnr_(0.0), outfile_(NULL), out_frames_(0) {} + : ResizeTest(), frame0_psnr_(0.0), outfile_(nullptr), out_frames_(0) {} #else ResizeInternalTest() : ResizeTest(), frame0_psnr_(0.0) {} #endif @@ -369,7 +369,7 @@ class ResizeInternalTest : public ResizeTest { if (!fseek(outfile_, 0, SEEK_SET)) write_ivf_file_header(&cfg_, out_frames_, outfile_); fclose(outfile_); - outfile_ = NULL; + outfile_ = nullptr; } #endif } @@ -705,7 +705,7 @@ class ResizeCspTest : public ResizeTest { protected: #if WRITE_COMPRESSED_STREAM ResizeCspTest() - : ResizeTest(), frame0_psnr_(0.0), outfile_(NULL), out_frames_(0) {} + : ResizeTest(), frame0_psnr_(0.0), outfile_(nullptr), out_frames_(0) {} #else ResizeCspTest() : ResizeTest(), frame0_psnr_(0.0) {} #endif @@ -724,7 +724,7 @@ class ResizeCspTest : public ResizeTest { if (!fseek(outfile_, 0, SEEK_SET)) write_ivf_file_header(&cfg_, out_frames_, outfile_); fclose(outfile_); - outfile_ = NULL; + outfile_ = nullptr; } #endif } diff --git a/test/sad_test.cc b/test/sad_test.cc index 3211c1b83b..34cb26ed11 100644 --- a/test/sad_test.cc +++ b/test/sad_test.cc @@ -99,17 +99,17 @@ class SADTestBase : public ::testing::TestWithParam { virtual void TearDown() { vpx_free(source_data8_); - source_data8_ = NULL; + source_data8_ = nullptr; vpx_free(reference_data8_); - reference_data8_ = NULL; + reference_data8_ = nullptr; vpx_free(second_pred8_); - second_pred8_ = NULL; + second_pred8_ = nullptr; vpx_free(source_data16_); - source_data16_ = NULL; + source_data16_ = nullptr; vpx_free(reference_data16_); - reference_data16_ = NULL; + reference_data16_ = nullptr; vpx_free(second_pred16_); - second_pred16_ = NULL; + second_pred16_ = nullptr; libvpx_test::ClearSystemState(); } diff --git a/test/superframe_test.cc b/test/superframe_test.cc index 7b2ce29eb8..a5c92e9147 100644 --- a/test/superframe_test.cc +++ b/test/superframe_test.cc @@ -27,7 +27,7 @@ class SuperframeTest public ::libvpx_test::CodecTestWithParam { protected: SuperframeTest() - : EncoderTest(GET_PARAM(0)), modified_buf_(NULL), last_sf_pts_(0) {} + : EncoderTest(GET_PARAM(0)), modified_buf_(nullptr), last_sf_pts_(0) {} virtual ~SuperframeTest() {} virtual void SetUp() { diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc index 0be9feefd9..08100a146e 100644 --- a/test/test_intra_pred_speed.cc +++ b/test/test_intra_pred_speed.cc @@ -85,7 +85,7 @@ void TestIntraPred(const char name[], VpxPredFunc const *pred_funcs, intra_pred_test_mem.Init(block_size, 8); for (int k = 0; k < kNumVp9IntraPredFuncs; ++k) { - if (pred_funcs[k] == NULL) continue; + if (pred_funcs[k] == nullptr) continue; memcpy(intra_pred_test_mem.src, intra_pred_test_mem.ref_src, sizeof(intra_pred_test_mem.src)); vpx_usec_timer timer; @@ -206,58 +206,64 @@ INTRA_PRED_TEST(C, TestIntraPred32, vpx_dc_predictor_32x32_c, INTRA_PRED_TEST(SSE2, TestIntraPred4, vpx_dc_predictor_4x4_sse2, vpx_dc_left_predictor_4x4_sse2, vpx_dc_top_predictor_4x4_sse2, vpx_dc_128_predictor_4x4_sse2, vpx_v_predictor_4x4_sse2, - vpx_h_predictor_4x4_sse2, vpx_d45_predictor_4x4_sse2, NULL, - NULL, NULL, vpx_d207_predictor_4x4_sse2, NULL, + vpx_h_predictor_4x4_sse2, vpx_d45_predictor_4x4_sse2, nullptr, + nullptr, nullptr, vpx_d207_predictor_4x4_sse2, nullptr, vpx_tm_predictor_4x4_sse2) INTRA_PRED_TEST(SSE2, TestIntraPred8, vpx_dc_predictor_8x8_sse2, vpx_dc_left_predictor_8x8_sse2, vpx_dc_top_predictor_8x8_sse2, vpx_dc_128_predictor_8x8_sse2, vpx_v_predictor_8x8_sse2, - vpx_h_predictor_8x8_sse2, vpx_d45_predictor_8x8_sse2, NULL, - NULL, NULL, NULL, NULL, vpx_tm_predictor_8x8_sse2) + vpx_h_predictor_8x8_sse2, vpx_d45_predictor_8x8_sse2, nullptr, + nullptr, nullptr, nullptr, nullptr, vpx_tm_predictor_8x8_sse2) INTRA_PRED_TEST(SSE2, TestIntraPred16, vpx_dc_predictor_16x16_sse2, vpx_dc_left_predictor_16x16_sse2, vpx_dc_top_predictor_16x16_sse2, vpx_dc_128_predictor_16x16_sse2, vpx_v_predictor_16x16_sse2, - vpx_h_predictor_16x16_sse2, NULL, NULL, NULL, NULL, NULL, NULL, - vpx_tm_predictor_16x16_sse2) + vpx_h_predictor_16x16_sse2, nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, vpx_tm_predictor_16x16_sse2) INTRA_PRED_TEST(SSE2, TestIntraPred32, vpx_dc_predictor_32x32_sse2, vpx_dc_left_predictor_32x32_sse2, vpx_dc_top_predictor_32x32_sse2, vpx_dc_128_predictor_32x32_sse2, vpx_v_predictor_32x32_sse2, - vpx_h_predictor_32x32_sse2, NULL, NULL, NULL, NULL, NULL, NULL, - vpx_tm_predictor_32x32_sse2) + vpx_h_predictor_32x32_sse2, nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, vpx_tm_predictor_32x32_sse2) #endif // HAVE_SSE2 #if HAVE_SSSE3 -INTRA_PRED_TEST(SSSE3, TestIntraPred4, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, vpx_d153_predictor_4x4_ssse3, NULL, - vpx_d63_predictor_4x4_ssse3, NULL) -INTRA_PRED_TEST(SSSE3, TestIntraPred8, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, vpx_d153_predictor_8x8_ssse3, - vpx_d207_predictor_8x8_ssse3, vpx_d63_predictor_8x8_ssse3, NULL) -INTRA_PRED_TEST(SSSE3, TestIntraPred16, NULL, NULL, NULL, NULL, NULL, NULL, - vpx_d45_predictor_16x16_ssse3, NULL, NULL, - vpx_d153_predictor_16x16_ssse3, vpx_d207_predictor_16x16_ssse3, - vpx_d63_predictor_16x16_ssse3, NULL) -INTRA_PRED_TEST(SSSE3, TestIntraPred32, NULL, NULL, NULL, NULL, NULL, NULL, - vpx_d45_predictor_32x32_ssse3, NULL, NULL, - vpx_d153_predictor_32x32_ssse3, vpx_d207_predictor_32x32_ssse3, - vpx_d63_predictor_32x32_ssse3, NULL) +INTRA_PRED_TEST(SSSE3, TestIntraPred4, nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, nullptr, nullptr, nullptr, + vpx_d153_predictor_4x4_ssse3, nullptr, + vpx_d63_predictor_4x4_ssse3, nullptr) +INTRA_PRED_TEST(SSSE3, TestIntraPred8, nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, nullptr, nullptr, nullptr, + vpx_d153_predictor_8x8_ssse3, vpx_d207_predictor_8x8_ssse3, + vpx_d63_predictor_8x8_ssse3, nullptr) +INTRA_PRED_TEST(SSSE3, TestIntraPred16, nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, vpx_d45_predictor_16x16_ssse3, nullptr, + nullptr, vpx_d153_predictor_16x16_ssse3, + vpx_d207_predictor_16x16_ssse3, vpx_d63_predictor_16x16_ssse3, + nullptr) +INTRA_PRED_TEST(SSSE3, TestIntraPred32, nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, vpx_d45_predictor_32x32_ssse3, nullptr, + nullptr, vpx_d153_predictor_32x32_ssse3, + vpx_d207_predictor_32x32_ssse3, vpx_d63_predictor_32x32_ssse3, + nullptr) #endif // HAVE_SSSE3 #if HAVE_DSPR2 -INTRA_PRED_TEST(DSPR2, TestIntraPred4, vpx_dc_predictor_4x4_dspr2, NULL, NULL, - NULL, NULL, vpx_h_predictor_4x4_dspr2, NULL, NULL, NULL, NULL, - NULL, NULL, vpx_tm_predictor_4x4_dspr2) -INTRA_PRED_TEST(DSPR2, TestIntraPred8, vpx_dc_predictor_8x8_dspr2, NULL, NULL, - NULL, NULL, vpx_h_predictor_8x8_dspr2, NULL, NULL, NULL, NULL, - NULL, NULL, vpx_tm_predictor_8x8_c) -INTRA_PRED_TEST(DSPR2, TestIntraPred16, vpx_dc_predictor_16x16_dspr2, NULL, - NULL, NULL, NULL, vpx_h_predictor_16x16_dspr2, NULL, NULL, NULL, - NULL, NULL, NULL, NULL) +INTRA_PRED_TEST(DSPR2, TestIntraPred4, vpx_dc_predictor_4x4_dspr2, nullptr, + nullptr, nullptr, nullptr, vpx_h_predictor_4x4_dspr2, nullptr, + nullptr, nullptr, nullptr, nullptr, nullptr, + vpx_tm_predictor_4x4_dspr2) +INTRA_PRED_TEST(DSPR2, TestIntraPred8, vpx_dc_predictor_8x8_dspr2, nullptr, + nullptr, nullptr, nullptr, vpx_h_predictor_8x8_dspr2, nullptr, + nullptr, nullptr, nullptr, nullptr, nullptr, + vpx_tm_predictor_8x8_c) +INTRA_PRED_TEST(DSPR2, TestIntraPred16, vpx_dc_predictor_16x16_dspr2, nullptr, + nullptr, nullptr, nullptr, vpx_h_predictor_16x16_dspr2, nullptr, + nullptr, nullptr, nullptr, nullptr, nullptr, nullptr) #endif // HAVE_DSPR2 #if HAVE_NEON @@ -265,78 +271,78 @@ INTRA_PRED_TEST(NEON, TestIntraPred4, vpx_dc_predictor_4x4_neon, vpx_dc_left_predictor_4x4_neon, vpx_dc_top_predictor_4x4_neon, vpx_dc_128_predictor_4x4_neon, vpx_v_predictor_4x4_neon, vpx_h_predictor_4x4_neon, vpx_d45_predictor_4x4_neon, - vpx_d135_predictor_4x4_neon, NULL, NULL, NULL, NULL, + vpx_d135_predictor_4x4_neon, nullptr, nullptr, nullptr, nullptr, vpx_tm_predictor_4x4_neon) INTRA_PRED_TEST(NEON, TestIntraPred8, vpx_dc_predictor_8x8_neon, vpx_dc_left_predictor_8x8_neon, vpx_dc_top_predictor_8x8_neon, vpx_dc_128_predictor_8x8_neon, vpx_v_predictor_8x8_neon, vpx_h_predictor_8x8_neon, vpx_d45_predictor_8x8_neon, - vpx_d135_predictor_8x8_neon, NULL, NULL, NULL, NULL, + vpx_d135_predictor_8x8_neon, nullptr, nullptr, nullptr, nullptr, vpx_tm_predictor_8x8_neon) INTRA_PRED_TEST(NEON, TestIntraPred16, vpx_dc_predictor_16x16_neon, vpx_dc_left_predictor_16x16_neon, vpx_dc_top_predictor_16x16_neon, vpx_dc_128_predictor_16x16_neon, vpx_v_predictor_16x16_neon, vpx_h_predictor_16x16_neon, vpx_d45_predictor_16x16_neon, - vpx_d135_predictor_16x16_neon, NULL, NULL, NULL, NULL, - vpx_tm_predictor_16x16_neon) + vpx_d135_predictor_16x16_neon, nullptr, nullptr, nullptr, + nullptr, vpx_tm_predictor_16x16_neon) INTRA_PRED_TEST(NEON, TestIntraPred32, vpx_dc_predictor_32x32_neon, vpx_dc_left_predictor_32x32_neon, vpx_dc_top_predictor_32x32_neon, vpx_dc_128_predictor_32x32_neon, vpx_v_predictor_32x32_neon, vpx_h_predictor_32x32_neon, vpx_d45_predictor_32x32_neon, - vpx_d135_predictor_32x32_neon, NULL, NULL, NULL, NULL, - vpx_tm_predictor_32x32_neon) + vpx_d135_predictor_32x32_neon, nullptr, nullptr, nullptr, + nullptr, vpx_tm_predictor_32x32_neon) #endif // HAVE_NEON #if HAVE_MSA INTRA_PRED_TEST(MSA, TestIntraPred4, vpx_dc_predictor_4x4_msa, vpx_dc_left_predictor_4x4_msa, vpx_dc_top_predictor_4x4_msa, vpx_dc_128_predictor_4x4_msa, vpx_v_predictor_4x4_msa, - vpx_h_predictor_4x4_msa, NULL, NULL, NULL, NULL, NULL, NULL, - vpx_tm_predictor_4x4_msa) + vpx_h_predictor_4x4_msa, nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, vpx_tm_predictor_4x4_msa) INTRA_PRED_TEST(MSA, TestIntraPred8, vpx_dc_predictor_8x8_msa, vpx_dc_left_predictor_8x8_msa, vpx_dc_top_predictor_8x8_msa, vpx_dc_128_predictor_8x8_msa, vpx_v_predictor_8x8_msa, - vpx_h_predictor_8x8_msa, NULL, NULL, NULL, NULL, NULL, NULL, - vpx_tm_predictor_8x8_msa) + vpx_h_predictor_8x8_msa, nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, vpx_tm_predictor_8x8_msa) INTRA_PRED_TEST(MSA, TestIntraPred16, vpx_dc_predictor_16x16_msa, vpx_dc_left_predictor_16x16_msa, vpx_dc_top_predictor_16x16_msa, vpx_dc_128_predictor_16x16_msa, vpx_v_predictor_16x16_msa, - vpx_h_predictor_16x16_msa, NULL, NULL, NULL, NULL, NULL, NULL, - vpx_tm_predictor_16x16_msa) + vpx_h_predictor_16x16_msa, nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, vpx_tm_predictor_16x16_msa) INTRA_PRED_TEST(MSA, TestIntraPred32, vpx_dc_predictor_32x32_msa, vpx_dc_left_predictor_32x32_msa, vpx_dc_top_predictor_32x32_msa, vpx_dc_128_predictor_32x32_msa, vpx_v_predictor_32x32_msa, - vpx_h_predictor_32x32_msa, NULL, NULL, NULL, NULL, NULL, NULL, - vpx_tm_predictor_32x32_msa) + vpx_h_predictor_32x32_msa, nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, vpx_tm_predictor_32x32_msa) #endif // HAVE_MSA #if HAVE_VSX // TODO(crbug.com/webm/1522): Fix test failures. #if 0 -INTRA_PRED_TEST(VSX, TestIntraPred4, NULL, NULL, NULL, NULL, NULL, - vpx_h_predictor_4x4_vsx, NULL, NULL, NULL, NULL, NULL, NULL, - vpx_tm_predictor_4x4_vsx) - -INTRA_PRED_TEST(VSX, TestIntraPred8, vpx_dc_predictor_8x8_vsx, NULL, NULL, NULL, - NULL, vpx_h_predictor_8x8_vsx, vpx_d45_predictor_8x8_vsx, NULL, - NULL, NULL, NULL, vpx_d63_predictor_8x8_vsx, - vpx_tm_predictor_8x8_vsx) +INTRA_PRED_TEST(VSX, TestIntraPred4, nullptr, nullptr, nullptr, nullptr, + nullptr, vpx_h_predictor_4x4_vsx, nullptr, nullptr, nullptr, + nullptr, nullptr, nullptr, vpx_tm_predictor_4x4_vsx) + +INTRA_PRED_TEST(VSX, TestIntraPred8, vpx_dc_predictor_8x8_vsx, nullptr, nullptr, + nullptr, nullptr, vpx_h_predictor_8x8_vsx, + vpx_d45_predictor_8x8_vsx, nullptr, nullptr, nullptr, nullptr, + vpx_d63_predictor_8x8_vsx, vpx_tm_predictor_8x8_vsx) #endif INTRA_PRED_TEST(VSX, TestIntraPred16, vpx_dc_predictor_16x16_vsx, vpx_dc_left_predictor_16x16_vsx, vpx_dc_top_predictor_16x16_vsx, vpx_dc_128_predictor_16x16_vsx, vpx_v_predictor_16x16_vsx, - vpx_h_predictor_16x16_vsx, vpx_d45_predictor_16x16_vsx, NULL, - NULL, NULL, NULL, vpx_d63_predictor_16x16_vsx, + vpx_h_predictor_16x16_vsx, vpx_d45_predictor_16x16_vsx, nullptr, + nullptr, nullptr, nullptr, vpx_d63_predictor_16x16_vsx, vpx_tm_predictor_16x16_vsx) INTRA_PRED_TEST(VSX, TestIntraPred32, vpx_dc_predictor_32x32_vsx, vpx_dc_left_predictor_32x32_vsx, vpx_dc_top_predictor_32x32_vsx, vpx_dc_128_predictor_32x32_vsx, vpx_v_predictor_32x32_vsx, - vpx_h_predictor_32x32_vsx, vpx_d45_predictor_32x32_vsx, NULL, - NULL, NULL, NULL, vpx_d63_predictor_32x32_vsx, + vpx_h_predictor_32x32_vsx, vpx_d45_predictor_32x32_vsx, nullptr, + nullptr, nullptr, nullptr, vpx_d63_predictor_32x32_vsx, vpx_tm_predictor_32x32_vsx) #endif // HAVE_VSX @@ -361,7 +367,7 @@ void TestHighbdIntraPred(const char name[], VpxHighbdPredFunc const *pred_funcs, intra_pred_test_mem.Init(block_size, 12); for (int k = 0; k < kNumVp9IntraPredFuncs; ++k) { - if (pred_funcs[k] == NULL) continue; + if (pred_funcs[k] == nullptr) continue; memcpy(intra_pred_test_mem.src, intra_pred_test_mem.ref_src, sizeof(intra_pred_test_mem.src)); vpx_usec_timer timer; @@ -487,19 +493,17 @@ HIGHBD_INTRA_PRED_TEST( SSE2, TestHighbdIntraPred4, vpx_highbd_dc_predictor_4x4_sse2, vpx_highbd_dc_left_predictor_4x4_sse2, vpx_highbd_dc_top_predictor_4x4_sse2, vpx_highbd_dc_128_predictor_4x4_sse2, vpx_highbd_v_predictor_4x4_sse2, - vpx_highbd_h_predictor_4x4_sse2, NULL, vpx_highbd_d135_predictor_4x4_sse2, - vpx_highbd_d117_predictor_4x4_sse2, vpx_highbd_d153_predictor_4x4_sse2, - vpx_highbd_d207_predictor_4x4_sse2, vpx_highbd_d63_predictor_4x4_sse2, - vpx_highbd_tm_predictor_4x4_c) - -HIGHBD_INTRA_PRED_TEST(SSE2, TestHighbdIntraPred8, - vpx_highbd_dc_predictor_8x8_sse2, - vpx_highbd_dc_left_predictor_8x8_sse2, - vpx_highbd_dc_top_predictor_8x8_sse2, - vpx_highbd_dc_128_predictor_8x8_sse2, - vpx_highbd_v_predictor_8x8_sse2, - vpx_highbd_h_predictor_8x8_sse2, NULL, NULL, NULL, NULL, - NULL, NULL, vpx_highbd_tm_predictor_8x8_sse2) + vpx_highbd_h_predictor_4x4_sse2, nullptr, + vpx_highbd_d135_predictor_4x4_sse2, vpx_highbd_d117_predictor_4x4_sse2, + vpx_highbd_d153_predictor_4x4_sse2, vpx_highbd_d207_predictor_4x4_sse2, + vpx_highbd_d63_predictor_4x4_sse2, vpx_highbd_tm_predictor_4x4_c) + +HIGHBD_INTRA_PRED_TEST( + SSE2, TestHighbdIntraPred8, vpx_highbd_dc_predictor_8x8_sse2, + vpx_highbd_dc_left_predictor_8x8_sse2, vpx_highbd_dc_top_predictor_8x8_sse2, + vpx_highbd_dc_128_predictor_8x8_sse2, vpx_highbd_v_predictor_8x8_sse2, + vpx_highbd_h_predictor_8x8_sse2, nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, vpx_highbd_tm_predictor_8x8_sse2) HIGHBD_INTRA_PRED_TEST(SSE2, TestHighbdIntraPred16, vpx_highbd_dc_predictor_16x16_sse2, @@ -507,8 +511,9 @@ HIGHBD_INTRA_PRED_TEST(SSE2, TestHighbdIntraPred16, vpx_highbd_dc_top_predictor_16x16_sse2, vpx_highbd_dc_128_predictor_16x16_sse2, vpx_highbd_v_predictor_16x16_sse2, - vpx_highbd_h_predictor_16x16_sse2, NULL, NULL, NULL, - NULL, NULL, NULL, vpx_highbd_tm_predictor_16x16_sse2) + vpx_highbd_h_predictor_16x16_sse2, nullptr, nullptr, + nullptr, nullptr, nullptr, nullptr, + vpx_highbd_tm_predictor_16x16_sse2) HIGHBD_INTRA_PRED_TEST(SSE2, TestHighbdIntraPred32, vpx_highbd_dc_predictor_32x32_sse2, @@ -516,35 +521,40 @@ HIGHBD_INTRA_PRED_TEST(SSE2, TestHighbdIntraPred32, vpx_highbd_dc_top_predictor_32x32_sse2, vpx_highbd_dc_128_predictor_32x32_sse2, vpx_highbd_v_predictor_32x32_sse2, - vpx_highbd_h_predictor_32x32_sse2, NULL, NULL, NULL, - NULL, NULL, NULL, vpx_highbd_tm_predictor_32x32_sse2) + vpx_highbd_h_predictor_32x32_sse2, nullptr, nullptr, + nullptr, nullptr, nullptr, nullptr, + vpx_highbd_tm_predictor_32x32_sse2) #endif // HAVE_SSE2 #if HAVE_SSSE3 -HIGHBD_INTRA_PRED_TEST(SSSE3, TestHighbdIntraPred4, NULL, NULL, NULL, NULL, - NULL, NULL, vpx_highbd_d45_predictor_4x4_ssse3, NULL, - NULL, NULL, NULL, NULL, NULL) -HIGHBD_INTRA_PRED_TEST(SSSE3, TestHighbdIntraPred8, NULL, NULL, NULL, NULL, - NULL, NULL, vpx_highbd_d45_predictor_8x8_ssse3, +HIGHBD_INTRA_PRED_TEST(SSSE3, TestHighbdIntraPred4, nullptr, nullptr, nullptr, + nullptr, nullptr, nullptr, + vpx_highbd_d45_predictor_4x4_ssse3, nullptr, nullptr, + nullptr, nullptr, nullptr, nullptr) +HIGHBD_INTRA_PRED_TEST(SSSE3, TestHighbdIntraPred8, nullptr, nullptr, nullptr, + nullptr, nullptr, nullptr, + vpx_highbd_d45_predictor_8x8_ssse3, vpx_highbd_d135_predictor_8x8_ssse3, vpx_highbd_d117_predictor_8x8_ssse3, vpx_highbd_d153_predictor_8x8_ssse3, vpx_highbd_d207_predictor_8x8_ssse3, - vpx_highbd_d63_predictor_8x8_ssse3, NULL) -HIGHBD_INTRA_PRED_TEST(SSSE3, TestHighbdIntraPred16, NULL, NULL, NULL, NULL, - NULL, NULL, vpx_highbd_d45_predictor_16x16_ssse3, + vpx_highbd_d63_predictor_8x8_ssse3, nullptr) +HIGHBD_INTRA_PRED_TEST(SSSE3, TestHighbdIntraPred16, nullptr, nullptr, nullptr, + nullptr, nullptr, nullptr, + vpx_highbd_d45_predictor_16x16_ssse3, vpx_highbd_d135_predictor_16x16_ssse3, vpx_highbd_d117_predictor_16x16_ssse3, vpx_highbd_d153_predictor_16x16_ssse3, vpx_highbd_d207_predictor_16x16_ssse3, - vpx_highbd_d63_predictor_16x16_ssse3, NULL) -HIGHBD_INTRA_PRED_TEST(SSSE3, TestHighbdIntraPred32, NULL, NULL, NULL, NULL, - NULL, NULL, vpx_highbd_d45_predictor_32x32_ssse3, + vpx_highbd_d63_predictor_16x16_ssse3, nullptr) +HIGHBD_INTRA_PRED_TEST(SSSE3, TestHighbdIntraPred32, nullptr, nullptr, nullptr, + nullptr, nullptr, nullptr, + vpx_highbd_d45_predictor_32x32_ssse3, vpx_highbd_d135_predictor_32x32_ssse3, vpx_highbd_d117_predictor_32x32_ssse3, vpx_highbd_d153_predictor_32x32_ssse3, vpx_highbd_d207_predictor_32x32_ssse3, - vpx_highbd_d63_predictor_32x32_ssse3, NULL) + vpx_highbd_d63_predictor_32x32_ssse3, nullptr) #endif // HAVE_SSSE3 #if HAVE_NEON @@ -553,14 +563,14 @@ HIGHBD_INTRA_PRED_TEST( vpx_highbd_dc_left_predictor_4x4_neon, vpx_highbd_dc_top_predictor_4x4_neon, vpx_highbd_dc_128_predictor_4x4_neon, vpx_highbd_v_predictor_4x4_neon, vpx_highbd_h_predictor_4x4_neon, vpx_highbd_d45_predictor_4x4_neon, - vpx_highbd_d135_predictor_4x4_neon, NULL, NULL, NULL, NULL, + vpx_highbd_d135_predictor_4x4_neon, nullptr, nullptr, nullptr, nullptr, vpx_highbd_tm_predictor_4x4_neon) HIGHBD_INTRA_PRED_TEST( NEON, TestHighbdIntraPred8, vpx_highbd_dc_predictor_8x8_neon, vpx_highbd_dc_left_predictor_8x8_neon, vpx_highbd_dc_top_predictor_8x8_neon, vpx_highbd_dc_128_predictor_8x8_neon, vpx_highbd_v_predictor_8x8_neon, vpx_highbd_h_predictor_8x8_neon, vpx_highbd_d45_predictor_8x8_neon, - vpx_highbd_d135_predictor_8x8_neon, NULL, NULL, NULL, NULL, + vpx_highbd_d135_predictor_8x8_neon, nullptr, nullptr, nullptr, nullptr, vpx_highbd_tm_predictor_8x8_neon) HIGHBD_INTRA_PRED_TEST(NEON, TestHighbdIntraPred16, vpx_highbd_dc_predictor_16x16_neon, @@ -570,8 +580,8 @@ HIGHBD_INTRA_PRED_TEST(NEON, TestHighbdIntraPred16, vpx_highbd_v_predictor_16x16_neon, vpx_highbd_h_predictor_16x16_neon, vpx_highbd_d45_predictor_16x16_neon, - vpx_highbd_d135_predictor_16x16_neon, NULL, NULL, NULL, - NULL, vpx_highbd_tm_predictor_16x16_neon) + vpx_highbd_d135_predictor_16x16_neon, nullptr, nullptr, + nullptr, nullptr, vpx_highbd_tm_predictor_16x16_neon) HIGHBD_INTRA_PRED_TEST(NEON, TestHighbdIntraPred32, vpx_highbd_dc_predictor_32x32_neon, vpx_highbd_dc_left_predictor_32x32_neon, @@ -580,8 +590,8 @@ HIGHBD_INTRA_PRED_TEST(NEON, TestHighbdIntraPred32, vpx_highbd_v_predictor_32x32_neon, vpx_highbd_h_predictor_32x32_neon, vpx_highbd_d45_predictor_32x32_neon, - vpx_highbd_d135_predictor_32x32_neon, NULL, NULL, NULL, - NULL, vpx_highbd_tm_predictor_32x32_neon) + vpx_highbd_d135_predictor_32x32_neon, nullptr, nullptr, + nullptr, nullptr, vpx_highbd_tm_predictor_32x32_neon) #endif // HAVE_NEON #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/test/test_vector_test.cc b/test/test_vector_test.cc index 6adcb221ad..ca990f4dd4 100644 --- a/test/test_vector_test.cc +++ b/test/test_vector_test.cc @@ -40,7 +40,7 @@ typedef std::tuple DecodeParam; class TestVectorTest : public ::libvpx_test::DecoderTest, public ::libvpx_test::CodecTestWithParam { protected: - TestVectorTest() : DecoderTest(GET_PARAM(0)), md5_file_(NULL) { + TestVectorTest() : DecoderTest(GET_PARAM(0)), md5_file_(nullptr) { #if CONFIG_VP9_DECODER resize_clips_.insert(::libvpx_test::kVP9TestVectorsResize, ::libvpx_test::kVP9TestVectorsResize + @@ -54,7 +54,7 @@ class TestVectorTest : public ::libvpx_test::DecoderTest, void OpenMD5File(const std::string &md5_file_name_) { md5_file_ = libvpx_test::OpenTestDataFile(md5_file_name_); - ASSERT_TRUE(md5_file_ != NULL) + ASSERT_NE(md5_file_, nullptr) << "Md5 file open failed. Filename: " << md5_file_name_; } @@ -79,7 +79,7 @@ class TestVectorTest : public ::libvpx_test::DecoderTest, virtual void DecompressedFrameHook(const vpx_image_t &img, const unsigned int frame_number) { - ASSERT_TRUE(md5_file_ != NULL); + ASSERT_NE(md5_file_, nullptr); char expected_md5[33]; char junk[128]; @@ -137,7 +137,7 @@ TEST_P(TestVectorTest, MD5Match) { return; #endif } - ASSERT_TRUE(video.get() != NULL); + ASSERT_NE(video.get(), nullptr); video->Init(); // Construct md5 file name. diff --git a/test/user_priv_test.cc b/test/user_priv_test.cc index 7bea76b0a9..20741f8268 100644 --- a/test/user_priv_test.cc +++ b/test/user_priv_test.cc @@ -57,18 +57,18 @@ string DecodeFile(const string &filename) { void *user_priv = reinterpret_cast(&frame_num); const vpx_codec_err_t res = decoder.DecodeFrame(video.cxdata(), video.frame_size(), - (frame_num == 0) ? NULL : user_priv); + (frame_num == 0) ? nullptr : user_priv); if (res != VPX_CODEC_OK) { EXPECT_EQ(VPX_CODEC_OK, res) << decoder.DecodeError(); break; } libvpx_test::DxDataIterator dec_iter = decoder.GetDxData(); - const vpx_image_t *img = NULL; + const vpx_image_t *img = nullptr; // Get decompressed data. while ((img = dec_iter.Next())) { if (frame_num == 0) { - CheckUserPrivateData(img->user_priv, NULL); + CheckUserPrivateData(img->user_priv, nullptr); } else { CheckUserPrivateData(img->user_priv, &frame_num); @@ -78,7 +78,7 @@ string DecodeFile(const string &filename) { ref.idx = rnd.Rand8() % 3; decoder.Control(VP9_GET_REFERENCE, &ref); - CheckUserPrivateData(ref.img.user_priv, NULL); + CheckUserPrivateData(ref.img.user_priv, nullptr); } md5.Add(img); } diff --git a/test/variance_test.cc b/test/variance_test.cc index 3a220a8103..72306d90c3 100644 --- a/test/variance_test.cc +++ b/test/variance_test.cc @@ -253,7 +253,7 @@ void SumOfSquaresTest::RefTest() { template struct TestParams { - TestParams(int log2w = 0, int log2h = 0, Func function = NULL, + TestParams(int log2w = 0, int log2h = 0, Func function = nullptr, int bit_depth_value = 0) : log2width(log2w), log2height(log2h), func(function) { use_high_bit_depth = (bit_depth_value > 0); @@ -297,8 +297,8 @@ class MainTestClass use_high_bit_depth() ? sizeof(uint16_t) : sizeof(uint8_t); src_ = reinterpret_cast(vpx_memalign(16, block_size() * unit)); ref_ = new uint8_t[block_size() * unit]; - ASSERT_TRUE(src_ != NULL); - ASSERT_TRUE(ref_ != NULL); + ASSERT_NE(src_, nullptr); + ASSERT_NE(ref_, nullptr); #if CONFIG_VP9_HIGHBITDEPTH if (use_high_bit_depth()) { // TODO(skal): remove! @@ -319,8 +319,8 @@ class MainTestClass vpx_free(src_); delete[] ref_; - src_ = NULL; - ref_ = NULL; + src_ = nullptr; + ref_ = nullptr; libvpx_test::ClearSystemState(); } @@ -573,9 +573,9 @@ class SubpelVarianceTest (block_size() + width() + height() + 1) * sizeof(uint16_t)))); #endif // CONFIG_VP9_HIGHBITDEPTH } - ASSERT_TRUE(src_ != NULL); - ASSERT_TRUE(sec_ != NULL); - ASSERT_TRUE(ref_ != NULL); + ASSERT_NE(src_, nullptr); + ASSERT_NE(sec_, nullptr); + ASSERT_NE(ref_, nullptr); } virtual void TearDown() { diff --git a/test/vp9_boolcoder_test.cc b/test/vp9_boolcoder_test.cc index 0cafa6730e..6ba171a000 100644 --- a/test/vp9_boolcoder_test.cc +++ b/test/vp9_boolcoder_test.cc @@ -74,7 +74,7 @@ TEST(VP9, TestBitIO) { GTEST_ASSERT_EQ(bw_buffer[0] & 0x80, 0); vpx_reader br; - vpx_reader_init(&br, bw_buffer, kBufferSize, NULL, NULL); + vpx_reader_init(&br, bw_buffer, kBufferSize, nullptr, nullptr); bit_rnd.Reset(random_seed); for (int i = 0; i < kBitsToTest; ++i) { if (bit_method == 2) { diff --git a/test/vp9_datarate_test.cc b/test/vp9_datarate_test.cc index 7e4d08f111..9930c754c7 100644 --- a/test/vp9_datarate_test.cc +++ b/test/vp9_datarate_test.cc @@ -701,7 +701,7 @@ TEST_P(DatarateTestVP9RealTime, RegionOfInterest) { // Use 2 states: 1 is center square, 0 is the rest. roi_.roi_map = reinterpret_cast( calloc(roi_.rows * roi_.cols, sizeof(*roi_.roi_map))); - ASSERT_TRUE(roi_.roi_map != NULL); + ASSERT_NE(roi_.roi_map, nullptr); for (unsigned int i = 0; i < roi_.rows; ++i) { for (unsigned int j = 0; j < roi_.cols; ++j) { diff --git a/test/vp9_encoder_parms_get_to_decoder.cc b/test/vp9_encoder_parms_get_to_decoder.cc index 5286d7a85f..ce2198c592 100644 --- a/test/vp9_encoder_parms_get_to_decoder.cc +++ b/test/vp9_encoder_parms_get_to_decoder.cc @@ -142,7 +142,7 @@ TEST_P(VpxEncoderParmsGetToDecoder, BitstreamParms) { std::unique_ptr video( new libvpx_test::Y4mVideoSource(test_video_.name, 0, test_video_.frames)); - ASSERT_TRUE(video.get() != NULL); + ASSERT_NE(video.get(), nullptr); ASSERT_NO_FATAL_FAILURE(RunLoop(video.get())); } diff --git a/test/vp9_end_to_end_test.cc b/test/vp9_end_to_end_test.cc index 315ce219ed..4e3a78fac4 100644 --- a/test/vp9_end_to_end_test.cc +++ b/test/vp9_end_to_end_test.cc @@ -257,7 +257,7 @@ TEST_P(EndToEndNV12, EndtoEndNV12Test) { video.reset(new libvpx_test::YUVVideoSource(test_video_param_.filename, test_video_param_.fmt, 352, 288, 30, 1, 0, 100)); - ASSERT_TRUE(video.get() != NULL); + ASSERT_NE(video.get(), nullptr); ASSERT_NO_FATAL_FAILURE(RunLoop(video.get())); } @@ -280,7 +280,7 @@ TEST_P(EndToEndTestLarge, EndtoEndPSNRTest) { test_video_param_.filename, test_video_param_.fmt, kWidth, kHeight, kFramerate, 1, 0, kFrames)); } - ASSERT_TRUE(video.get() != NULL); + ASSERT_NE(video.get(), nullptr); ASSERT_NO_FATAL_FAILURE(RunLoop(video.get())); const double psnr = GetAveragePsnr(); @@ -307,7 +307,7 @@ TEST_P(EndToEndTestLarge, EndtoEndPSNRDenoiserAQTest) { test_video_param_.filename, test_video_param_.fmt, kWidth, kHeight, kFramerate, 1, 0, kFrames)); } - ASSERT_TRUE(video.get() != NULL); + ASSERT_NE(video.get(), nullptr); ASSERT_NO_FATAL_FAILURE(RunLoop(video.get())); const double psnr = GetAveragePsnr(); diff --git a/test/vp9_ethread_test.cc b/test/vp9_ethread_test.cc index 1041dd78c0..21caf7918c 100644 --- a/test/vp9_ethread_test.cc +++ b/test/vp9_ethread_test.cc @@ -41,7 +41,7 @@ class VPxFirstPassEncoderThreadTest row_mt_mode_ = 1; first_pass_only_ = true; - firstpass_stats_.buf = NULL; + firstpass_stats_.buf = nullptr; firstpass_stats_.sz = 0; } virtual ~VPxFirstPassEncoderThreadTest() { free(firstpass_stats_.buf); } diff --git a/test/vp9_intrapred_test.cc b/test/vp9_intrapred_test.cc index 62e1771dc2..fdcccd5c22 100644 --- a/test/vp9_intrapred_test.cc +++ b/test/vp9_intrapred_test.cc @@ -32,7 +32,7 @@ typedef void (*IntraPredFunc)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); struct IntraPredParam { - IntraPredParam(IntraPredFunc pred = NULL, IntraPredFunc ref = NULL, + IntraPredParam(IntraPredFunc pred = nullptr, IntraPredFunc ref = nullptr, int block_size_value = 0, int bit_depth_value = 0) : pred_fn(pred), ref_fn(ref), block_size(block_size_value), bit_depth(bit_depth_value) {} @@ -446,8 +446,9 @@ typedef void (*HighbdIntraPred)(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bps); struct HighbdIntraPredParam { - HighbdIntraPredParam(HighbdIntraPred pred = NULL, HighbdIntraPred ref = NULL, - int block_size_value = 0, int bit_depth_value = 0) + HighbdIntraPredParam(HighbdIntraPred pred = nullptr, + HighbdIntraPred ref = nullptr, int block_size_value = 0, + int bit_depth_value = 0) : pred_fn(pred), ref_fn(ref), block_size(block_size_value), bit_depth(bit_depth_value) {} diff --git a/test/vp9_motion_vector_test.cc b/test/vp9_motion_vector_test.cc index 68eecb8232..6b1082a106 100644 --- a/test/vp9_motion_vector_test.cc +++ b/test/vp9_motion_vector_test.cc @@ -88,7 +88,7 @@ TEST_P(MotionVectorTestLarge, OverallTest) { "niklas_640_480_30.yuv", VPX_IMG_FMT_I420, 3840, 2160, // 2048, 1080, 30, 1, 0, 5)); - ASSERT_TRUE(video.get() != NULL); + ASSERT_NE(video.get(), nullptr); ASSERT_NO_FATAL_FAILURE(RunLoop(video.get())); } diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc index 083a8844bf..cb4481b103 100644 --- a/test/vp9_quantize_test.cc +++ b/test/vp9_quantize_test.cc @@ -110,13 +110,13 @@ class VP9QuantizeBase : public AbstractBench { vpx_free(quant_ptr_); vpx_free(quant_shift_ptr_); vpx_free(dequant_ptr_); - zbin_ptr_ = NULL; - round_fp_ptr_ = NULL; - quant_fp_ptr_ = NULL; - round_ptr_ = NULL; - quant_ptr_ = NULL; - quant_shift_ptr_ = NULL; - dequant_ptr_ = NULL; + zbin_ptr_ = nullptr; + round_fp_ptr_ = nullptr; + quant_fp_ptr_ = nullptr; + round_ptr_ = nullptr; + quant_ptr_ = nullptr; + quant_shift_ptr_ = nullptr; + dequant_ptr_ = nullptr; libvpx_test::ClearSystemState(); } diff --git a/test/vp9_skip_loopfilter_test.cc b/test/vp9_skip_loopfilter_test.cc index 1fb7e48485..c080a2caae 100644 --- a/test/vp9_skip_loopfilter_test.cc +++ b/test/vp9_skip_loopfilter_test.cc @@ -24,10 +24,11 @@ const char kVp9Md5File[] = "vp90-2-08-tile_1x8_frame_parallel.webm.md5"; // Class for testing shutting off the loop filter. class SkipLoopFilterTest { public: - SkipLoopFilterTest() : video_(NULL), decoder_(NULL), md5_file_(NULL) {} + SkipLoopFilterTest() + : video_(nullptr), decoder_(nullptr), md5_file_(nullptr) {} ~SkipLoopFilterTest() { - if (md5_file_ != NULL) fclose(md5_file_); + if (md5_file_ != nullptr) fclose(md5_file_); delete decoder_; delete video_; } @@ -37,8 +38,8 @@ class SkipLoopFilterTest { expected_md5_[0] = '\0'; junk_[0] = '\0'; video_ = new libvpx_test::WebMVideoSource(kVp9TestFile); - if (video_ == NULL) { - EXPECT_TRUE(video_ != NULL); + if (video_ == nullptr) { + EXPECT_NE(video_, nullptr); return false; } video_->Init(); @@ -47,8 +48,8 @@ class SkipLoopFilterTest { vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t(); if (num_threads > 0) cfg.threads = num_threads; decoder_ = new libvpx_test::VP9Decoder(cfg, 0); - if (decoder_ == NULL) { - EXPECT_TRUE(decoder_ != NULL); + if (decoder_ == nullptr) { + EXPECT_NE(decoder_, nullptr); return false; } @@ -73,7 +74,7 @@ class SkipLoopFilterTest { } vpx_codec_err_t DecodeRemainingFrames() { - for (; video_->cxdata() != NULL; video_->Next()) { + for (; video_->cxdata() != nullptr; video_->Next()) { const vpx_codec_err_t res = decoder_->DecodeFrame(video_->cxdata(), video_->frame_size()); if (res != VPX_CODEC_OK) return res; @@ -93,13 +94,13 @@ class SkipLoopFilterTest { // TODO(fgalligan): Move the MD5 testing code into another class. void OpenMd5File(const std::string &md5_file_name) { md5_file_ = libvpx_test::OpenTestDataFile(md5_file_name); - ASSERT_TRUE(md5_file_ != NULL) + ASSERT_NE(md5_file_, nullptr) << "MD5 file open failed. Filename: " << md5_file_name; } // Reads the next line of the MD5 file. void ReadMd5() { - ASSERT_TRUE(md5_file_ != NULL); + ASSERT_NE(md5_file_, nullptr); const int res = fscanf(md5_file_, "%s %s", expected_md5_, junk_); ASSERT_NE(EOF, res) << "Read md5 data failed"; expected_md5_[32] = '\0'; diff --git a/test/vp9_thread_test.cc b/test/vp9_thread_test.cc index 67f2bec800..352ad71eca 100644 --- a/test/vp9_thread_test.cc +++ b/test/vp9_thread_test.cc @@ -128,18 +128,18 @@ TEST_P(VPxWorkerThreadTest, EndWithoutSync) { } TEST(VPxWorkerThreadTest, TestInterfaceAPI) { - EXPECT_EQ(0, vpx_set_worker_interface(NULL)); - EXPECT_TRUE(vpx_get_worker_interface() != NULL); + EXPECT_EQ(0, vpx_set_worker_interface(nullptr)); + EXPECT_NE(vpx_get_worker_interface(), nullptr); for (int i = 0; i < 6; ++i) { VPxWorkerInterface winterface = *vpx_get_worker_interface(); switch (i) { default: - case 0: winterface.init = NULL; break; - case 1: winterface.reset = NULL; break; - case 2: winterface.sync = NULL; break; - case 3: winterface.launch = NULL; break; - case 4: winterface.execute = NULL; break; - case 5: winterface.end = NULL; break; + case 0: winterface.init = nullptr; break; + case 1: winterface.reset = nullptr; break; + case 2: winterface.sync = nullptr; break; + case 3: winterface.launch = nullptr; break; + case 4: winterface.execute = nullptr; break; + case 5: winterface.end = nullptr; break; } EXPECT_EQ(0, vpx_set_worker_interface(&winterface)); } @@ -172,7 +172,7 @@ string DecodeFile(const string &filename, int num_threads) { } libvpx_test::DxDataIterator dec_iter = decoder.GetDxData(); - const vpx_image_t *img = NULL; + const vpx_image_t *img = nullptr; // Get decompressed data while ((img = dec_iter.Next())) { @@ -183,7 +183,7 @@ string DecodeFile(const string &filename, int num_threads) { } void DecodeFiles(const FileList files[]) { - for (const FileList *iter = files; iter->name != NULL; ++iter) { + for (const FileList *iter = files; iter->name != nullptr; ++iter) { SCOPED_TRACE(iter->name); for (int t = 1; t <= 8; ++t) { EXPECT_EQ(iter->expected_md5, DecodeFile(iter->name, t)) @@ -245,7 +245,7 @@ TEST(VP9DecodeMultiThreadedTest, FrameParallel) { "368ebc6ebf3a5e478d85b2c3149b2848" }, { "vp90-2-08-tile_1x8_frame_parallel.webm", "17e439da2388aff3a0f69cb22579c6c1" }, - { NULL, NULL } }; + { nullptr, nullptr } }; DecodeFiles(files); } @@ -296,7 +296,7 @@ TEST(VP9DecodeMultiThreadedTest, FrameParallelResize) { "ae96f21f21b6370cc0125621b441fc52" }, { "vp90-2-14-resize-fp-tiles-8-4.webm", "3eb4f24f10640d42218f7fd7b9fd30d4" }, - { NULL, NULL } + { nullptr, nullptr } }; DecodeFiles(files); @@ -309,7 +309,7 @@ TEST(VP9DecodeMultiThreadedTest, NonFrameParallel) { { "vp90-2-08-tile_1x8.webm", "0941902a52e9092cb010905eab16364c" }, { "vp90-2-08-tile-4x1.webm", "06505aade6647c583c8e00a2f582266f" }, { "vp90-2-08-tile-4x4.webm", "85c2299892460d76e2c600502d52bfe2" }, - { NULL, NULL } + { nullptr, nullptr } }; DecodeFiles(files); diff --git a/test/y4m_test.cc b/test/y4m_test.cc index e7b86ac500..46cb5cff80 100644 --- a/test/y4m_test.cc +++ b/test/y4m_test.cc @@ -90,7 +90,7 @@ class Y4mVideoSourceTest : public ::testing::TestWithParam, // Checks y4m header information void HeaderChecks(unsigned int bit_depth, vpx_img_fmt_t fmt) { - ASSERT_TRUE(input_file_ != NULL); + ASSERT_NE(input_file_, nullptr); ASSERT_EQ(y4m_.pic_w, (int)kWidth); ASSERT_EQ(y4m_.pic_h, (int)kHeight); ASSERT_EQ(img()->d_w, kWidth); @@ -116,7 +116,7 @@ class Y4mVideoSourceTest : public ::testing::TestWithParam, // Checks MD5 of the raw frame data void Md5Check(const string &expected_md5) { - ASSERT_TRUE(input_file_ != NULL); + ASSERT_NE(input_file_, nullptr); libvpx_test::MD5 md5; for (unsigned int i = start_; i < limit_; i++) { md5.Add(img()); @@ -138,11 +138,11 @@ INSTANTIATE_TEST_SUITE_P(C, Y4mVideoSourceTest, class Y4mVideoWriteTest : public Y4mVideoSourceTest { protected: - Y4mVideoWriteTest() : tmpfile_(NULL) {} + Y4mVideoWriteTest() : tmpfile_(nullptr) {} virtual ~Y4mVideoWriteTest() { delete tmpfile_; - input_file_ = NULL; + input_file_ = nullptr; } void ReplaceInputFile(FILE *input_file) { @@ -155,11 +155,11 @@ class Y4mVideoWriteTest : public Y4mVideoSourceTest { // Writes out a y4m file and then reads it back void WriteY4mAndReadBack() { - ASSERT_TRUE(input_file_ != NULL); + ASSERT_NE(input_file_, nullptr); char buf[Y4M_BUFFER_SIZE] = { 0 }; const struct VpxRational framerate = { y4m_.fps_n, y4m_.fps_d }; tmpfile_ = new libvpx_test::TempOutFile; - ASSERT_TRUE(tmpfile_->file() != NULL); + ASSERT_NE(tmpfile_->file(), nullptr); y4m_write_file_header(buf, sizeof(buf), kWidth, kHeight, &framerate, y4m_.vpx_fmt, y4m_.bit_depth); fputs(buf, tmpfile_->file()); diff --git a/vp9/ratectrl_rtc.cc b/vp9/ratectrl_rtc.cc index 6238b3a955..47f9f3ba33 100644 --- a/vp9/ratectrl_rtc.cc +++ b/vp9/ratectrl_rtc.cc @@ -159,7 +159,7 @@ int VP9RateControlRTC::GetQP() const { return cpi_->common.base_qindex; } int VP9RateControlRTC::GetLoopfilterLevel() const { struct loopfilter *const lf = &cpi_->common.lf; - vp9_pick_filter_level(NULL, cpi_, LPF_PICK_FROM_Q); + vp9_pick_filter_level(nullptr, cpi_, LPF_PICK_FROM_Q); return lf->filter_level; } diff --git a/webmdec.cc b/webmdec.cc index d609075a93..68c6f4782d 100644 --- a/webmdec.cc +++ b/webmdec.cc @@ -19,25 +19,25 @@ namespace { void reset(struct WebmInputContext *const webm_ctx) { - if (webm_ctx->reader != NULL) { + if (webm_ctx->reader != nullptr) { mkvparser::MkvReader *const reader = reinterpret_cast(webm_ctx->reader); delete reader; } - if (webm_ctx->segment != NULL) { + if (webm_ctx->segment != nullptr) { mkvparser::Segment *const segment = reinterpret_cast(webm_ctx->segment); delete segment; } - if (webm_ctx->buffer != NULL) { + if (webm_ctx->buffer != nullptr) { delete[] webm_ctx->buffer; } - webm_ctx->reader = NULL; - webm_ctx->segment = NULL; - webm_ctx->buffer = NULL; - webm_ctx->cluster = NULL; - webm_ctx->block_entry = NULL; - webm_ctx->block = NULL; + webm_ctx->reader = nullptr; + webm_ctx->segment = nullptr; + webm_ctx->buffer = nullptr; + webm_ctx->cluster = nullptr; + webm_ctx->block_entry = nullptr; + webm_ctx->block = nullptr; webm_ctx->block_frame_index = 0; webm_ctx->video_track_index = 0; webm_ctx->timestamp_ns = 0; @@ -84,7 +84,7 @@ int file_is_webm(struct WebmInputContext *webm_ctx, } const mkvparser::Tracks *const tracks = segment->GetTracks(); - const mkvparser::VideoTrack *video_track = NULL; + const mkvparser::VideoTrack *video_track = nullptr; for (unsigned long i = 0; i < tracks->GetTracksCount(); ++i) { const mkvparser::Track *const track = tracks->GetTrackByIndex(i); if (track->GetType() == mkvparser::Track::kVideo) { @@ -94,7 +94,7 @@ int file_is_webm(struct WebmInputContext *webm_ctx, } } - if (video_track == NULL || video_track->GetCodecId() == NULL) { + if (video_track == nullptr || video_track->GetCodecId() == nullptr) { rewind_and_reset(webm_ctx, vpx_ctx); return 0; } @@ -137,12 +137,12 @@ int webm_read_frame(struct WebmInputContext *webm_ctx, uint8_t **buffer, do { long status = 0; bool get_new_block = false; - if (block_entry == NULL && !block_entry_eos) { + if (block_entry == nullptr && !block_entry_eos) { status = cluster->GetFirst(block_entry); get_new_block = true; } else if (block_entry_eos || block_entry->EOS()) { cluster = segment->GetNext(cluster); - if (cluster == NULL || cluster->EOS()) { + if (cluster == nullptr || cluster->EOS()) { *buffer_size = 0; webm_ctx->reached_eos = 1; return 1; @@ -150,22 +150,22 @@ int webm_read_frame(struct WebmInputContext *webm_ctx, uint8_t **buffer, status = cluster->GetFirst(block_entry); block_entry_eos = false; get_new_block = true; - } else if (block == NULL || + } else if (block == nullptr || webm_ctx->block_frame_index == block->GetFrameCount() || block->GetTrackNumber() != webm_ctx->video_track_index) { status = cluster->GetNext(block_entry, block_entry); - if (block_entry == NULL || block_entry->EOS()) { + if (block_entry == nullptr || block_entry->EOS()) { block_entry_eos = true; continue; } get_new_block = true; } - if (status || block_entry == NULL) { + if (status || block_entry == nullptr) { return -1; } if (get_new_block) { block = block_entry->GetBlock(); - if (block == NULL) return -1; + if (block == nullptr) return -1; webm_ctx->block_frame_index = 0; } } while (block_entry_eos || @@ -181,7 +181,7 @@ int webm_read_frame(struct WebmInputContext *webm_ctx, uint8_t **buffer, if (frame.len > static_cast(*buffer_size)) { delete[] * buffer; *buffer = new uint8_t[frame.len]; - if (*buffer == NULL) { + if (*buffer == nullptr) { return -1; } webm_ctx->buffer = *buffer; @@ -198,7 +198,7 @@ int webm_read_frame(struct WebmInputContext *webm_ctx, uint8_t **buffer, int webm_guess_framerate(struct WebmInputContext *webm_ctx, struct VpxInputContext *vpx_ctx) { uint32_t i = 0; - uint8_t *buffer = NULL; + uint8_t *buffer = nullptr; size_t buffer_size = 0; while (webm_ctx->timestamp_ns < 1000000000 && i < 50) { if (webm_read_frame(webm_ctx, &buffer, &buffer_size)) { @@ -212,8 +212,8 @@ int webm_guess_framerate(struct WebmInputContext *webm_ctx, delete[] buffer; get_first_cluster(webm_ctx); - webm_ctx->block = NULL; - webm_ctx->block_entry = NULL; + webm_ctx->block = nullptr; + webm_ctx->block_entry = nullptr; webm_ctx->block_frame_index = 0; webm_ctx->timestamp_ns = 0; webm_ctx->reached_eos = 0; diff --git a/webmenc.cc b/webmenc.cc index 66606674b0..c718ab5a9f 100644 --- a/webmenc.cc +++ b/webmenc.cc @@ -90,6 +90,6 @@ void write_webm_file_footer(struct WebmOutputContext *webm_ctx) { segment->Finalize(); delete segment; delete writer; - webm_ctx->writer = NULL; - webm_ctx->segment = NULL; + webm_ctx->writer = nullptr; + webm_ctx->segment = nullptr; } From bdbf8725243c7eec4c8d0c59310a94bc002c81e2 Mon Sep 17 00:00:00 2001 From: Hui Su Date: Thu, 30 Jul 2020 22:48:09 -0700 Subject: [PATCH 0030/1000] Assign correct values for zcoeff_blk in sub8x8 RDO This fixes a lossless encoding bug as reported in the issue tracker. Coding performance change is neutral. BUG=webm:1700 Change-Id: I0f034b16b57e917e722709a7e9addef864b83d27 --- vp9/encoder/vp9_rdopt.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index 39b99d50c4..37de4e4839 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -4443,6 +4443,7 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data, tmp_best_sse = total_sse; tmp_best_skippable = skippable; tmp_best_mbmode = *mi; + x->sum_y_eobs[TX_4X4] = 0; for (i = 0; i < 4; i++) { tmp_best_bmodes[i] = xd->mi[0]->bmi[i]; x->zcoeff_blk[TX_4X4][i] = !x->plane[0].eobs[i]; @@ -4476,6 +4477,11 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data, &rate, &rate_y, &distortion, &skippable, &total_sse, (int)this_rd_thresh, seg_mvs, bsi, 0, mi_row, mi_col); if (tmp_rd == INT64_MAX) continue; + x->sum_y_eobs[TX_4X4] = 0; + for (i = 0; i < 4; i++) { + x->zcoeff_blk[TX_4X4][i] = !x->plane[0].eobs[i]; + x->sum_y_eobs[TX_4X4] += x->plane[0].eobs[i]; + } } else { total_sse = tmp_best_sse; rate = tmp_best_rate; From 566905e91ec746835ffc91b1a87ddd6d352aed08 Mon Sep 17 00:00:00 2001 From: angiebird Date: Sat, 1 Aug 2020 13:03:46 -0700 Subject: [PATCH 0031/1000] Add recode loop logics for rate_ctrl experiment Change-Id: I4de5a38e25d6b0836d90e8fcd0e56d268e5fd838 --- vp9/encoder/vp9_encoder.c | 169 +++++++++++++++++++++++++++++++++++++- vp9/encoder/vp9_encoder.h | 37 +++++++++ 2 files changed, 205 insertions(+), 1 deletion(-) diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 06dadc42a6..68669ebf63 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -4201,6 +4201,134 @@ static int get_qstep_adj(int rate_excess, int rate_limit) { return VPXMIN(qstep, MAX_QSTEP_ADJ); } +#if CONFIG_RATE_CTRL +#define RATE_CTRL_MAX_RECODE_NUM 7 + +typedef struct RATE_QINDEX_HISTORY { + int recode_count; + int q_index_history[RATE_CTRL_MAX_RECODE_NUM]; + int rate_history[RATE_CTRL_MAX_RECODE_NUM]; + int q_index_high; + int q_index_low; +} RATE_QINDEX_HISTORY; + +static void init_rq_history(RATE_QINDEX_HISTORY *rq_history) { + rq_history->recode_count = 0; + rq_history->q_index_high = 255; + rq_history->q_index_low = 0; +} + +static void update_rq_history(RATE_QINDEX_HISTORY *rq_history, int target_bits, + int actual_bits, int q_index) { + rq_history->q_index_history[rq_history->recode_count] = q_index; + rq_history->rate_history[rq_history->recode_count] = actual_bits; + if (actual_bits <= target_bits) { + rq_history->q_index_high = q_index; + } + if (actual_bits >= target_bits) { + rq_history->q_index_low = q_index; + } + rq_history->recode_count += 1; +} + +static int guess_q_index_from_model(const RATE_QSTEP_MODEL *rq_model, + int target_bits) { + // The model predicts bits as follows. + // target_bits = bias - ratio * log2(q_step) + // Given the target_bits, we compute the q_step as follows. + const double q_step = + pow(2.0, (rq_model->bias - target_bits) / rq_model->ratio); + // TODO(angiebird): Make this function support highbitdepth. + return vp9_convert_q_to_qindex(q_step, VPX_BITS_8); +} + +static int guess_q_index_linear(int prev_q_index, int target_bits, + int actual_bits, int gap) { + int q_index = prev_q_index; + if (actual_bits < target_bits) { + q_index -= gap; + q_index = VPXMAX(q_index, 0); + } else { + q_index += gap; + q_index = VPXMIN(q_index, 255); + } + return q_index; +} + +static double get_bits_percent_diff(int target_bits, int actual_bits) { + double diff = abs(target_bits - actual_bits) * 1. / target_bits; + diff *= 100; + return diff; +} + +static int rq_model_predict_q_index(const RATE_QSTEP_MODEL *rq_model, + const RATE_QINDEX_HISTORY *rq_history, + int target_bits) { + int q_index = -1; + if (rq_history->recode_count > 0) { + const int actual_bits = + rq_history->rate_history[rq_history->recode_count - 1]; + const int prev_q_index = + rq_history->q_index_history[rq_history->recode_count - 1]; + const double percent_diff = get_bits_percent_diff(target_bits, actual_bits); + if (percent_diff > 50) { + // Binary search. + // When the actual_bits and target_bits are far apart, binary search + // q_index is faster. + q_index = (rq_history->q_index_low + rq_history->q_index_high) / 2; + } else { + if (rq_model->ready) { + q_index = guess_q_index_from_model(rq_model, target_bits); + } else { + // TODO(angiebird): Find a better way to set the gap. + q_index = + guess_q_index_linear(prev_q_index, target_bits, actual_bits, 20); + } + } + } else { + if (rq_model->ready) { + q_index = guess_q_index_from_model(rq_model, target_bits); + } + } + + assert(rq_history->q_index_low <= rq_history->q_index_high); + if (q_index <= rq_history->q_index_low) { + q_index = rq_history->q_index_low + 1; + } + if (q_index >= rq_history->q_index_high) { + q_index = rq_history->q_index_high - 1; + } + return q_index; +} + +static void rq_model_update(const RATE_QINDEX_HISTORY *rq_history, + int target_bits, RATE_QSTEP_MODEL *rq_model) { + const int recode_count = rq_history->recode_count; + if (recode_count >= 2) { + // Fit the ratio and bias of rq_model based on last two recode histories. + const double s1 = vp9_convert_qindex_to_q( + rq_history->q_index_history[recode_count - 2], VPX_BITS_8); + const double s2 = vp9_convert_qindex_to_q( + rq_history->q_index_history[recode_count - 1], VPX_BITS_8); + const double r1 = rq_history->rate_history[recode_count - 2]; + const double r2 = rq_history->rate_history[recode_count - 1]; + rq_model->ratio = (r2 - r1) / (log2(s1) - log2(s2)); + rq_model->bias = r1 + (rq_model->ratio) * log2(s1); + rq_model->ready = 1; + } else if (recode_count == 1) { + if (rq_model->ready) { + // Update the ratio only when the initial model exists and we only have + // one recode history. + const int prev_q = rq_history->q_index_history[recode_count - 1]; + const double prev_q_step = vp9_convert_qindex_to_q(prev_q, VPX_BITS_8); + const int actual_bits = rq_history->rate_history[recode_count - 1]; + rq_model->ratio = + rq_model->ratio - (target_bits - actual_bits) / log2(prev_q_step); + } + } +} +#endif // CONFIG_RATE_CTRL + static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest) { const VP9EncoderConfig *const oxcf = &cpi->oxcf; @@ -4220,6 +4348,15 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, int qrange_adj = 1; #endif +#if CONFIG_RATE_CTRL + const FRAME_UPDATE_TYPE update_type = + cpi->twopass.gf_group.update_type[cpi->twopass.gf_group.index]; + const ENCODE_FRAME_TYPE frame_type = get_encode_frame_type(update_type); + RATE_QSTEP_MODEL *rq_model = &cpi->rq_model[frame_type]; + RATE_QINDEX_HISTORY rq_history; + init_rq_history(&rq_history); +#endif // CONFIG_RATE_CTRL + if (cm->show_existing_frame) { rc->this_frame_target = 0; if (is_psnr_calc_enabled(cpi)) set_raw_source_frame(cpi); @@ -4266,6 +4403,15 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, loop_at_this_size = 0; } +#if CONFIG_RATE_CTRL + { + const int suggested_q_index = rq_model_predict_q_index( + rq_model, &rq_history, rc->this_frame_target); + if (suggested_q_index != -1) { + q = suggested_q_index; + } + } +#endif // CONFIG_RATE_CTRL // Decide frame size bounds first time through. if (loop_count == 0) { vp9_rc_compute_frame_size_bounds(cpi, rc->this_frame_target, @@ -4359,7 +4505,28 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, if (cpi->encode_command.use_external_quantize_index) { break; } -#endif + + if (cpi->encode_command.use_external_target_frame_bits) { + const double percent_diff = get_bits_percent_diff( + rc->this_frame_target, rc->projected_frame_size); + update_rq_history(&rq_history, rc->this_frame_target, + rc->projected_frame_size, q); + loop_count += 1; + + rq_model_update(&rq_history, rc->this_frame_target, rq_model); + + // Check if we hit the target bitrate. + if (percent_diff <= 15 || + rq_history.recode_count >= RATE_CTRL_MAX_RECODE_NUM || + rq_history.q_index_low >= rq_history.q_index_high) { + break; + } + + loop = 1; + restore_coding_context(cpi); + continue; + } +#endif // CONFIG_RATE_CTRL if (oxcf->rc_mode == VPX_Q) { loop = 0; diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 34ae0a09fb..31fbce24f4 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -560,6 +560,41 @@ static INLINE int gop_command_coding_frame_count( return gop_command->show_frame_count + gop_command->use_alt_ref; } +// TODO(angiebird): See if we can merge this one with FrameType in +// simple_encode.h +typedef enum ENCODE_FRAME_TYPE { + ENCODE_FRAME_TYPE_KEY, + ENCODE_FRAME_TYPE_INTER, + ENCODE_FRAME_TYPE_ALTREF, + ENCODE_FRAME_TYPE_OVERLAY, + ENCODE_FRAME_TYPE_GOLDEN, + ENCODE_FRAME_TYPES, +} ENCODE_FRAME_TYPE; + +// TODO(angiebird): Merge this function with get_frame_type_from_update_type() +static INLINE ENCODE_FRAME_TYPE +get_encode_frame_type(FRAME_UPDATE_TYPE update_type) { + switch (update_type) { + case KF_UPDATE: return ENCODE_FRAME_TYPE_KEY; + case ARF_UPDATE: return ENCODE_FRAME_TYPE_ALTREF; + case GF_UPDATE: return ENCODE_FRAME_TYPE_GOLDEN; + case OVERLAY_UPDATE: return ENCODE_FRAME_TYPE_OVERLAY; + case LF_UPDATE: return ENCODE_FRAME_TYPE_INTER; + default: + fprintf(stderr, "Unsupported update_type %d\n", update_type); + abort(); + return ENCODE_FRAME_TYPE_INTER; + } +} + +typedef struct RATE_QSTEP_MODEL { + // The rq model predict the bit usage as follows. + // rate = bias - ratio * log2(q_step) + int ready; + double bias; + double ratio; +} RATE_QSTEP_MODEL; + typedef struct ENCODE_COMMAND { int use_external_quantize_index; int external_quantize_index; @@ -917,6 +952,8 @@ typedef struct VP9_COMP { ENCODE_COMMAND encode_command; PARTITION_INFO *partition_info; MOTION_VECTOR_INFO *motion_vector_info; + + RATE_QSTEP_MODEL rq_model[ENCODE_FRAME_TYPES]; #endif } VP9_COMP; From f9ab864199a7a4d94df8c3ba3492d4feb3cfa90d Mon Sep 17 00:00:00 2001 From: Cheng Chen Date: Wed, 29 Jul 2020 12:49:51 -0700 Subject: [PATCH 0032/1000] L2E: Add ObserveFirstPassMotionVector Store motion vectors for each 16x16 block found in the first pass motion search. Provide an api "ObserveFirstPassMotionVector()" in SimpleEncode class, similar to "ObserveFirstPassStats()". Change-Id: Ia86386b7e4aa549f7000e7965c287380bf52e62c --- test/simple_encode_test.cc | 21 +++++++++++++++++++++ vp9/encoder/vp9_encoder.c | 2 ++ vp9/encoder/vp9_encoder.h | 26 ++++++++++++++++++++++++++ vp9/encoder/vp9_firstpass.c | 27 +++++++++++++++++++++++++++ vp9/simple_encode.cc | 17 +++++++++++++++-- vp9/simple_encode.h | 27 +++++++++++++++++++++++---- 6 files changed, 114 insertions(+), 6 deletions(-) diff --git a/test/simple_encode_test.cc b/test/simple_encode_test.cc index 6848351682..9a938a8d1e 100644 --- a/test/simple_encode_test.cc +++ b/test/simple_encode_test.cc @@ -60,6 +60,27 @@ TEST_F(SimpleEncodeTest, ComputeFirstPassStats) { } } +TEST_F(SimpleEncodeTest, ObserveFirstPassMotionVectors) { + SimpleEncode simple_encode(width_, height_, frame_rate_num_, frame_rate_den_, + target_bitrate_, num_frames_, + in_file_path_str_.c_str()); + simple_encode.ComputeFirstPassStats(); + std::vector> fps_motion_vectors = + simple_encode.ObserveFirstPassMotionVectors(); + EXPECT_EQ(fps_motion_vectors.size(), static_cast(num_frames_)); + const size_t num_blocks = ((width_ + 15) >> 4) * ((height_ + 15) >> 4); + EXPECT_EQ(num_blocks, fps_motion_vectors[0].size()); + for (size_t i = 0; i < fps_motion_vectors.size(); ++i) { + EXPECT_EQ(num_blocks, fps_motion_vectors[i].size()); + for (size_t j = 0; j < num_blocks; ++j) { + const int mv_count = fps_motion_vectors[i][j].mv_count; + const int ref_count = (fps_motion_vectors[i][j].ref_frame[0] > 0) + + (fps_motion_vectors[i][j].ref_frame[1] > 0); + EXPECT_EQ(mv_count, ref_count); + } + } +} + TEST_F(SimpleEncodeTest, GetCodingFrameNum) { SimpleEncode simple_encode(width_, height_, frame_rate_num_, frame_rate_den_, target_bitrate_, num_frames_, diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 68669ebf63..5471f9902c 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -1024,6 +1024,7 @@ static void dealloc_compressor_data(VP9_COMP *cpi) { #if CONFIG_RATE_CTRL free_partition_info(cpi); free_motion_vector_info(cpi); + free_fp_motion_vector_info(cpi); #endif vp9_free_ref_frame_buffers(cm->buffer_pool); @@ -2661,6 +2662,7 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf, encode_command_init(&cpi->encode_command); partition_info_init(cpi); motion_vector_info_init(cpi); + fp_motion_vector_info_init(cpi); #endif return cpi; diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 31fbce24f4..85071459fd 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -642,6 +642,9 @@ static INLINE void encode_command_init(ENCODE_COMMAND *encode_command) { // Returns number of units in size of 4, if not multiple not a multiple of 4, // round it up. For example, size is 7, return 2. static INLINE int get_num_unit_4x4(int size) { return (size + 3) >> 2; } +// Returns number of units in size of 16, if not multiple not a multiple of 16, +// round it up. For example, size is 17, return 2. +static INLINE int get_num_unit_16x16(int size) { return (size + 15) >> 4; } #endif // CONFIG_RATE_CTRL typedef struct VP9_COMP { @@ -952,6 +955,7 @@ typedef struct VP9_COMP { ENCODE_COMMAND encode_command; PARTITION_INFO *partition_info; MOTION_VECTOR_INFO *motion_vector_info; + MOTION_VECTOR_INFO *fp_motion_vector_info; RATE_QSTEP_MODEL rq_model[ENCODE_FRAME_TYPES]; #endif @@ -1000,6 +1004,27 @@ static INLINE void free_motion_vector_info(struct VP9_COMP *cpi) { cpi->motion_vector_info = NULL; } +// Allocates memory for the first pass motion vector information. +// The unit size is each 16x16 block. +// Only called once in vp9_create_compressor(). +static INLINE void fp_motion_vector_info_init(struct VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + const int unit_width = get_num_unit_16x16(cpi->frame_info.frame_width); + const int unit_height = get_num_unit_16x16(cpi->frame_info.frame_height); + CHECK_MEM_ERROR(cm, cpi->fp_motion_vector_info, + (MOTION_VECTOR_INFO *)vpx_calloc(unit_width * unit_height, + sizeof(MOTION_VECTOR_INFO))); + memset(cpi->fp_motion_vector_info, 0, + unit_width * unit_height * sizeof(MOTION_VECTOR_INFO)); +} + +// Frees memory of the first pass motion vector information. +// Only called once in dealloc_compressor_data(). +static INLINE void free_fp_motion_vector_info(struct VP9_COMP *cpi) { + vpx_free(cpi->fp_motion_vector_info); + cpi->fp_motion_vector_info = NULL; +} + // This is the c-version counter part of ImageBuffer typedef struct IMAGE_BUFFER { int allocated; @@ -1021,6 +1046,7 @@ typedef struct ENCODE_FRAME_RESULT { FRAME_COUNTS frame_counts; const PARTITION_INFO *partition_info; const MOTION_VECTOR_INFO *motion_vector_info; + const MOTION_VECTOR_INFO *fp_motion_vector_info; IMAGE_BUFFER coded_frame; #endif // CONFIG_RATE_CTRL int quantize_index; diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index 45b003e1e8..009bab7c50 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -839,6 +839,26 @@ static void accumulate_fp_mb_row_stat(TileDataEnc *this_tile, fp_acc_data->image_data_start_row); } +#if CONFIG_RATE_CTRL +static void store_fp_motion_vector(VP9_COMP *cpi, const MV *mv, + const int mb_row, const int mb_col, + const int is_second_mv) { + VP9_COMMON *const cm = &cpi->common; + const int mb_index = mb_row * cm->mb_cols + mb_col; + MOTION_VECTOR_INFO *this_motion_vector_info = + &cpi->fp_motion_vector_info[mb_index]; + if (!is_second_mv) { + this_motion_vector_info->ref_frame[0] = LAST_FRAME; + this_motion_vector_info->mv[0].as_mv.row = mv->row; + this_motion_vector_info->mv[0].as_mv.col = mv->col; + return; + } + this_motion_vector_info->ref_frame[1] = GOLDEN_FRAME; + this_motion_vector_info->mv[1].as_mv.row = mv->row; + this_motion_vector_info->mv[1].as_mv.col = mv->col; +} +#endif // CONFIG_RATE_CTRL + #define NZ_MOTION_PENALTY 128 #define INTRA_MODE_PENALTY 1024 void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, @@ -1137,6 +1157,9 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, vp9_get_mvpred_var(x, &tmp_mv, &zero_mv, &v_fn_ptr, 0); } } +#if CONFIG_RATE_CTRL + store_fp_motion_vector(cpi, &mv, mb_row, mb_col, /*is_second_mv=*/0); +#endif // CONFIG_RAGE_CTRL // Search in an older reference frame. if ((cm->current_video_frame > 1) && gld_yv12 != NULL) { @@ -1158,6 +1181,10 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, #endif // CONFIG_VP9_HIGHBITDEPTH first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv, &gf_motion_error); +#if CONFIG_RATE_CTRL + store_fp_motion_vector(cpi, &tmp_mv, mb_row, mb_col, + /*is_second_mv=*/1); +#endif // CONFIG_RAGE_CTRL if (gf_motion_error < motion_error && gf_motion_error < this_error) ++(fp_acc_data->second_ref_count); diff --git a/vp9/simple_encode.cc b/vp9/simple_encode.cc index d083a44c2d..678cf8adde 100644 --- a/vp9/simple_encode.cc +++ b/vp9/simple_encode.cc @@ -471,8 +471,8 @@ static bool init_encode_frame_result(EncodeFrameResult *encode_frame_result, encode_frame_result->coding_data.reset( new (std::nothrow) uint8_t[max_coding_data_byte_size]); - encode_frame_result->num_rows_4x4 = get_num_unit_4x4(frame_width); - encode_frame_result->num_cols_4x4 = get_num_unit_4x4(frame_height); + encode_frame_result->num_rows_4x4 = get_num_unit_4x4(frame_height); + encode_frame_result->num_cols_4x4 = get_num_unit_4x4(frame_width); encode_frame_result->partition_info.resize(encode_frame_result->num_rows_4x4 * encode_frame_result->num_cols_4x4); encode_frame_result->motion_vector_info.resize( @@ -742,6 +742,8 @@ void SimpleEncode::ComputeFirstPassStats() { struct lookahead_ctx *lookahead = cpi->lookahead; int i; int use_highbitdepth = 0; + const int num_rows_16x16 = get_num_unit_16x16(frame_height_); + const int num_cols_16x16 = get_num_unit_16x16(frame_width_); #if CONFIG_VP9_HIGHBITDEPTH use_highbitdepth = cpi->common.use_highbitdepth; #endif @@ -774,6 +776,12 @@ void SimpleEncode::ComputeFirstPassStats() { // vp9_get_compressed_data only generates first pass stats not // compresses data assert(size == 0); + // Get vp9 first pass motion vector info. + std::vector mv_info(num_rows_16x16 * num_cols_16x16); + update_motion_vector_info(&encode_frame_info.fp_motion_vector_info[0], + num_rows_16x16, num_cols_16x16, + mv_info.data()); + fp_motion_vector_info_.push_back(mv_info); } impl_ptr_->first_pass_stats.push_back(vp9_get_frame_stats(&cpi->twopass)); } @@ -811,6 +819,11 @@ std::vector> SimpleEncode::ObserveFirstPassStats() { return output_stats; } +std::vector> +SimpleEncode::ObserveFirstPassMotionVectors() { + return fp_motion_vector_info_; +} + void SimpleEncode::SetExternalGroupOfPicturesMap(int *gop_map, int gop_map_size) { for (int i = 0; i < gop_map_size; ++i) { diff --git a/vp9/simple_encode.h b/vp9/simple_encode.h index ae36eb2c55..6c66aafda4 100644 --- a/vp9/simple_encode.h +++ b/vp9/simple_encode.h @@ -60,7 +60,9 @@ struct PartitionInfo { constexpr int kMotionVectorPrecision = 8; -// The frame is split to 4x4 blocks. +// In the first pass. The frame is split to 16x16 blocks. +// This structure contains the information of each 16x16 block. +// In the second pass. The frame is split to 4x4 blocks. // This structure contains the information of each 4x4 block. struct MotionVectorInfo { // Number of valid motion vectors, always 0 if this block is in the key frame. @@ -68,8 +70,8 @@ struct MotionVectorInfo { int mv_count; // The reference frame for motion vectors. If the second motion vector does // not exist (mv_count = 1), the reference frame is kNoneRefFrame. - // Otherwise, the reference frame is either kLastFrame, or kGoldenFrame, - // or kAltRefFrame. + // Otherwise, the reference frame is either kRefFrameTypeLast, or + // kRefFrameTypePast, or kRefFrameTypeFuture. RefFrameType ref_frame[2]; // The row offset of motion vectors in the unit of pixel. // If the second motion vector does not exist, the value is 0. @@ -245,7 +247,7 @@ struct EncodeFrameResult { std::vector partition_info; // A vector of the motion vector information of the frame. // The number of elements is |num_rows_4x4| * |num_cols_4x4|. - // The frame is divided 4x4 blocks of |num_rows_4x4| rows and + // The frame is divided into 4x4 blocks of |num_rows_4x4| rows and // |num_cols_4x4| columns. // Each 4x4 block contains 0 motion vector if this is an intra predicted // frame (for example, the key frame). If the frame is inter predicted, @@ -324,6 +326,12 @@ class SimpleEncode { // values. For details, please check FIRSTPASS_STATS in vp9_firstpass.h std::vector> ObserveFirstPassStats(); + // Outputs the first pass motion vectors represented by a 2-D vector. + // One can use the frame index at first dimension to retrieve the mvs for + // each video frame. The frame is divided into 16x16 blocks. The number of + // elements is round_up(|num_rows_4x4| / 4) * round_up(|num_cols_4x4| / 4). + std::vector> ObserveFirstPassMotionVectors(); + // Ouputs a copy of key_frame_map_, a binary vector with size equal to the // number of show frames in the video. For each entry in the vector, 1 // indicates the position is a key frame and 0 indicates it's not a key frame. @@ -451,6 +459,17 @@ class SimpleEncode { // frame appears? // Reference frames info of the to-be-coded frame. RefFrameInfo ref_frame_info_; + + // A 2-D vector of motion vector information of the frame collected + // from the first pass. The first dimension is the frame index. + // Each frame is divided into 16x16 blocks. The number of elements is + // round_up(|num_rows_4x4| / 4) * round_up(|num_cols_4x4| / 4). + // Each 16x16 block contains 0 motion vector if this is an intra predicted + // frame (for example, the key frame). If the frame is inter predicted, + // each 16x16 block contains either 1 or 2 motion vectors. + // The first motion vector is always from the LAST_FRAME. + // The second motion vector is always from the GOLDEN_FRAME. + std::vector> fp_motion_vector_info_; }; } // namespace vp9 From 3e0967af8fda2c67266a6c594b713cae0353954d Mon Sep 17 00:00:00 2001 From: angiebird Date: Mon, 3 Aug 2020 19:52:31 -0700 Subject: [PATCH 0033/1000] Cosmetic changes for rate_ctrl experiment Change-Id: I133c93c2ad4c824fc97a18de3ac2cb2aedac9013 --- vp9/encoder/vp9_encoder.c | 3 +-- vp9/encoder/vp9_encoder.h | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 68669ebf63..560bde3828 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -4257,8 +4257,7 @@ static int guess_q_index_linear(int prev_q_index, int target_bits, static double get_bits_percent_diff(int target_bits, int actual_bits) { double diff = abs(target_bits - actual_bits) * 1. / target_bits; - diff *= 100; - return diff; + return diff * 100; } static int rq_model_predict_q_index(const RATE_QSTEP_MODEL *rq_model, diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 31fbce24f4..9d2c8414b3 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -588,7 +588,7 @@ get_encode_frame_type(FRAME_UPDATE_TYPE update_type) { } typedef struct RATE_QSTEP_MODEL { - // The rq model predict the bit usage as follows. + // The rq model predicts the bit usage as follows. // rate = bias - ratio * log2(q_step) int ready; double bias; From 927fad48472b969065226e6e727e5e7670d2ff68 Mon Sep 17 00:00:00 2001 From: angiebird Date: Wed, 5 Aug 2020 12:16:50 -0700 Subject: [PATCH 0034/1000] Correct rq_model_update when recode_count == 1 This will reduce the avg recode times per frame form 3.19 to 2.81 when targeting 15% error margin for target bitrate per frame. Change-Id: I28c9ec09a1b1318c09fe5229ccb7e51b32b9dfb9 --- vp9/encoder/vp9_encoder.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 458c2ae4bb..7985e18d80 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -4324,7 +4324,7 @@ static void rq_model_update(const RATE_QINDEX_HISTORY *rq_history, const double prev_q_step = vp9_convert_qindex_to_q(prev_q, VPX_BITS_8); const int actual_bits = rq_history->rate_history[recode_count - 1]; rq_model->ratio = - rq_model->ratio - (target_bits - actual_bits) / log2(prev_q_step); + rq_model->ratio + (target_bits - actual_bits) / log2(prev_q_step); } } } From e3ae48b861d18badb414c8fbe201e5c4aafb5180 Mon Sep 17 00:00:00 2001 From: angiebird Date: Wed, 5 Aug 2020 13:44:01 -0700 Subject: [PATCH 0035/1000] Make initial q_index guess at 128 This reduce the average recode times per frame from 2.81 to 2.73 when targeting 15% error for target bitrate per frame. Change-Id: I58f0be86443643ba23623cb1d522ae41897734a3 --- vp9/encoder/vp9_encoder.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 7985e18d80..169cd190eb 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -4265,7 +4265,7 @@ static double get_bits_percent_diff(int target_bits, int actual_bits) { static int rq_model_predict_q_index(const RATE_QSTEP_MODEL *rq_model, const RATE_QINDEX_HISTORY *rq_history, int target_bits) { - int q_index = -1; + int q_index = 128; if (rq_history->recode_count > 0) { const int actual_bits = rq_history->rate_history[rq_history->recode_count - 1]; @@ -4405,12 +4405,9 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, } #if CONFIG_RATE_CTRL - { - const int suggested_q_index = rq_model_predict_q_index( - rq_model, &rq_history, rc->this_frame_target); - if (suggested_q_index != -1) { - q = suggested_q_index; - } + if (cpi->encode_command.use_external_target_frame_bits) { + q = rq_model_predict_q_index(rq_model, &rq_history, + rc->this_frame_target); } #endif // CONFIG_RATE_CTRL // Decide frame size bounds first time through. From 68e1198375c34d165848cd69a35060b0684e3958 Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 7 Aug 2020 13:32:39 -0700 Subject: [PATCH 0036/1000] test/*: use canonical downloads.webmproject url prefer https://storage.googleapis.com/downloads.webmproject.org/ to http://downloads.webmproject.org/ similar to libs.mk BUG=b/163149610 Change-Id: I6abe0848120849b9512fc5a6122ddc54b5cc2240 --- test/android/README | 2 +- test/stress.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/test/android/README b/test/android/README index f67fea50c7..0cd30779d4 100644 --- a/test/android/README +++ b/test/android/README @@ -16,7 +16,7 @@ Note: Both adb and ndk-build are available at: 3) Run get_files.py to download the test files: python get_files.py -i /path/to/test-data.sha1 -o /path/to/put/files \ - -u http://downloads.webmproject.org/test_data/libvpx + -u https://storage.googleapis.com/downloads.webmproject.org/test_data/libvpx 4) Transfer files to device using adb. Ensure you have proper permissions for the target diff --git a/test/stress.sh b/test/stress.sh index b11c658a96..ba79a52ac3 100755 --- a/test/stress.sh +++ b/test/stress.sh @@ -18,7 +18,7 @@ YUV="${LIBVPX_TEST_DATA_PATH}/niklas_1280_720_30.yuv" VP8="${LIBVPX_TEST_DATA_PATH}/tos_vp8.webm" VP9="${LIBVPX_TEST_DATA_PATH}/vp90-2-sintel_1920x818_tile_1x4_fpm_2279kbps.webm" -DATA_URL="http://downloads.webmproject.org/test_data/libvpx/" +DATA_URL="https://storage.googleapis.com/downloads.webmproject.org/test_data/libvpx/" SHA1_FILE="$(dirname $0)/test-data.sha1" # Set sha1sum to proper sha program (sha1sum, shasum, sha1). This code is From bb7a2ccc38475ce0e70be134b5bf862000a3fc82 Mon Sep 17 00:00:00 2001 From: angiebird Date: Fri, 7 Aug 2020 15:44:08 -0700 Subject: [PATCH 0037/1000] Fix ObserveFirstPassMotionVectors() 1) Use kRefFrameTypeNone in the unit test 2) Reset mv_info in fp_motion_vector_info_init 3) Call fp_motion_vector_info_init() in first_pass_encode() 4) Set mv_info for intra frame. 5) Set mv_info with zero mv as default for inter frame 6) Remove duplicated fp_motion_vector_info in encode_frame_info Change-Id: I2f7db5cd4cf1f19db039c9ce638d17b832f45b6e --- test/simple_encode_test.cc | 5 +++-- vp9/encoder/vp9_encoder.h | 14 +++++++++++--- vp9/encoder/vp9_firstpass.c | 31 +++++++++++++++++++------------ vp9/simple_encode.cc | 5 ++--- 4 files changed, 35 insertions(+), 20 deletions(-) diff --git a/test/simple_encode_test.cc b/test/simple_encode_test.cc index 9a938a8d1e..1790b56ac3 100644 --- a/test/simple_encode_test.cc +++ b/test/simple_encode_test.cc @@ -74,8 +74,9 @@ TEST_F(SimpleEncodeTest, ObserveFirstPassMotionVectors) { EXPECT_EQ(num_blocks, fps_motion_vectors[i].size()); for (size_t j = 0; j < num_blocks; ++j) { const int mv_count = fps_motion_vectors[i][j].mv_count; - const int ref_count = (fps_motion_vectors[i][j].ref_frame[0] > 0) + - (fps_motion_vectors[i][j].ref_frame[1] > 0); + const int ref_count = + (fps_motion_vectors[i][j].ref_frame[0] != kRefFrameTypeNone) + + (fps_motion_vectors[i][j].ref_frame[1] != kRefFrameTypeNone); EXPECT_EQ(mv_count, ref_count); } } diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 3315a425cc..3758ea3aea 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -983,6 +983,13 @@ static INLINE void free_partition_info(struct VP9_COMP *cpi) { cpi->partition_info = NULL; } +static INLINE void reset_mv_info(MOTION_VECTOR_INFO *mv_info) { + mv_info->ref_frame[0] = NONE; + mv_info->ref_frame[1] = NONE; + mv_info->mv[0].as_int = INVALID_MV; + mv_info->mv[1].as_int = INVALID_MV; +} + // Allocates memory for the motion vector information. // The unit size is each 4x4 block. // Only called once in vp9_create_compressor(). @@ -1011,11 +1018,13 @@ static INLINE void fp_motion_vector_info_init(struct VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; const int unit_width = get_num_unit_16x16(cpi->frame_info.frame_width); const int unit_height = get_num_unit_16x16(cpi->frame_info.frame_height); + int i; CHECK_MEM_ERROR(cm, cpi->fp_motion_vector_info, (MOTION_VECTOR_INFO *)vpx_calloc(unit_width * unit_height, sizeof(MOTION_VECTOR_INFO))); - memset(cpi->fp_motion_vector_info, 0, - unit_width * unit_height * sizeof(MOTION_VECTOR_INFO)); + for (i = 0; i < unit_width * unit_height; ++i) { + reset_mv_info(cpi->fp_motion_vector_info + i); + } } // Frees memory of the first pass motion vector information. @@ -1046,7 +1055,6 @@ typedef struct ENCODE_FRAME_RESULT { FRAME_COUNTS frame_counts; const PARTITION_INFO *partition_info; const MOTION_VECTOR_INFO *motion_vector_info; - const MOTION_VECTOR_INFO *fp_motion_vector_info; IMAGE_BUFFER coded_frame; #endif // CONFIG_RATE_CTRL int quantize_index; diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index 009bab7c50..3847755073 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -842,20 +842,16 @@ static void accumulate_fp_mb_row_stat(TileDataEnc *this_tile, #if CONFIG_RATE_CTRL static void store_fp_motion_vector(VP9_COMP *cpi, const MV *mv, const int mb_row, const int mb_col, - const int is_second_mv) { + MV_REFERENCE_FRAME frame_type, + const int mv_idx) { VP9_COMMON *const cm = &cpi->common; const int mb_index = mb_row * cm->mb_cols + mb_col; MOTION_VECTOR_INFO *this_motion_vector_info = &cpi->fp_motion_vector_info[mb_index]; - if (!is_second_mv) { - this_motion_vector_info->ref_frame[0] = LAST_FRAME; - this_motion_vector_info->mv[0].as_mv.row = mv->row; - this_motion_vector_info->mv[0].as_mv.col = mv->col; - return; + this_motion_vector_info->ref_frame[mv_idx] = frame_type; + if (frame_type != INTRA_FRAME) { + this_motion_vector_info->mv[mv_idx].as_mv = *mv; } - this_motion_vector_info->ref_frame[1] = GOLDEN_FRAME; - this_motion_vector_info->mv[1].as_mv.row = mv->row; - this_motion_vector_info->mv[1].as_mv.col = mv->col; } #endif // CONFIG_RATE_CTRL @@ -1093,6 +1089,11 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, struct buf_2d unscaled_last_source_buf_2d; vp9_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[bsize]; +#if CONFIG_RATE_CTRL + // Store zero mv as default + store_fp_motion_vector(cpi, &mv, mb_row, mb_col, LAST_FRAME, 0); +#endif // CONFIG_RAGE_CTRL + xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset; #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { @@ -1158,7 +1159,7 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, } } #if CONFIG_RATE_CTRL - store_fp_motion_vector(cpi, &mv, mb_row, mb_col, /*is_second_mv=*/0); + store_fp_motion_vector(cpi, &mv, mb_row, mb_col, LAST_FRAME, 0); #endif // CONFIG_RAGE_CTRL // Search in an older reference frame. @@ -1182,8 +1183,7 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv, &gf_motion_error); #if CONFIG_RATE_CTRL - store_fp_motion_vector(cpi, &tmp_mv, mb_row, mb_col, - /*is_second_mv=*/1); + store_fp_motion_vector(cpi, &tmp_mv, mb_row, mb_col, GOLDEN_FRAME, 1); #endif // CONFIG_RAGE_CTRL if (gf_motion_error < motion_error && gf_motion_error < this_error) @@ -1358,6 +1358,9 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, } } else { fp_acc_data->sr_coded_error += (int64_t)this_error; +#if CONFIG_RATE_CTRL + store_fp_motion_vector(cpi, NULL, mb_row, mb_col, INTRA_FRAME, 0); +#endif // CONFIG_RAGE_CTRL } fp_acc_data->coded_error += (int64_t)this_error; @@ -1384,6 +1387,10 @@ static void first_pass_encode(VP9_COMP *cpi, FIRSTPASS_DATA *fp_acc_data) { // Tiling is ignored in the first pass. vp9_tile_init(tile, cm, 0, 0); +#if CONFIG_RATE_CTRL + fp_motion_vector_info_init(cpi); +#endif + for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) { best_ref_mv = zero_mv; vp9_first_pass_encode_tile_mb_row(cpi, &cpi->td, fp_acc_data, &tile_data, diff --git a/vp9/simple_encode.cc b/vp9/simple_encode.cc index 678cf8adde..4ee0bd2e84 100644 --- a/vp9/simple_encode.cc +++ b/vp9/simple_encode.cc @@ -778,9 +778,8 @@ void SimpleEncode::ComputeFirstPassStats() { assert(size == 0); // Get vp9 first pass motion vector info. std::vector mv_info(num_rows_16x16 * num_cols_16x16); - update_motion_vector_info(&encode_frame_info.fp_motion_vector_info[0], - num_rows_16x16, num_cols_16x16, - mv_info.data()); + update_motion_vector_info(cpi->fp_motion_vector_info, num_rows_16x16, + num_cols_16x16, mv_info.data()); fp_motion_vector_info_.push_back(mv_info); } impl_ptr_->first_pass_stats.push_back(vp9_get_frame_stats(&cpi->twopass)); From 3ec043a795cdf4dfa4cf31a9a0c82cbeef52c866 Mon Sep 17 00:00:00 2001 From: angiebird Date: Mon, 3 Aug 2020 19:49:43 -0700 Subject: [PATCH 0038/1000] Add rq_history to encode_frame_result Change-Id: Ic2a52dcf5e5a6d57b80d390a2c48ee498e89e7b2 --- test/simple_encode_test.cc | 34 +++++++++++++++++++++++++++++++++ vp9/encoder/vp9_encoder.c | 39 ++++++++++++++++++-------------------- vp9/encoder/vp9_encoder.h | 12 ++++++++++++ vp9/simple_encode.cc | 14 ++++++++++++++ vp9/simple_encode.h | 6 ++++++ 5 files changed, 84 insertions(+), 21 deletions(-) diff --git a/test/simple_encode_test.cc b/test/simple_encode_test.cc index 1790b56ac3..c5674466d7 100644 --- a/test/simple_encode_test.cc +++ b/test/simple_encode_test.cc @@ -163,6 +163,40 @@ TEST_F(SimpleEncodeTest, ObserveKeyFrameMap) { simple_encode.EndEncode(); } +TEST_F(SimpleEncodeTest, EncodeFrameWithTargetFrameBits) { + SimpleEncode simple_encode(width_, height_, frame_rate_num_, frame_rate_den_, + target_bitrate_, num_frames_, + in_file_path_str_.c_str()); + simple_encode.ComputeFirstPassStats(); + const int num_coding_frames = simple_encode.GetCodingFrameNum(); + simple_encode.StartEncode(); + for (int i = 0; i < num_coding_frames; ++i) { + EncodeFrameInfo encode_frame_info = simple_encode.GetNextEncodeFrameInfo(); + int target_frame_bits = 20000; + if (encode_frame_info.frame_type == kFrameTypeKey || + encode_frame_info.frame_type == kFrameTypeAltRef || + encode_frame_info.frame_type == kFrameTypeGolden) { + target_frame_bits = 100000; + } + if (encode_frame_info.frame_type == kFrameTypeOverlay) { + target_frame_bits = 2000; + } + + EncodeFrameResult encode_frame_result; + simple_encode.EncodeFrameWithTargetFrameBits(&encode_frame_result, + target_frame_bits); + const int recode_count = encode_frame_result.recode_count; + // TODO(angiebird): Replace 7 by RATE_CTRL_MAX_RECODE_NUM + EXPECT_LE(recode_count, 7); + EXPECT_GE(recode_count, 1); + + double diff = fabs((double)encode_frame_result.coding_data_bit_size - + target_frame_bits); + EXPECT_LE(diff * 100 / target_frame_bits, 15); + } + simple_encode.EndEncode(); +} + TEST_F(SimpleEncodeTest, EncodeFrameWithQuantizeIndex) { SimpleEncode simple_encode(width_, height_, frame_rate_num_, frame_rate_den_, target_bitrate_, num_frames_, diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 169cd190eb..4820b2529f 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -4204,16 +4204,6 @@ static int get_qstep_adj(int rate_excess, int rate_limit) { } #if CONFIG_RATE_CTRL -#define RATE_CTRL_MAX_RECODE_NUM 7 - -typedef struct RATE_QINDEX_HISTORY { - int recode_count; - int q_index_history[RATE_CTRL_MAX_RECODE_NUM]; - int rate_history[RATE_CTRL_MAX_RECODE_NUM]; - int q_index_high; - int q_index_low; -} RATE_QINDEX_HISTORY; - static void init_rq_history(RATE_QINDEX_HISTORY *rq_history) { rq_history->recode_count = 0; rq_history->q_index_high = 255; @@ -4330,8 +4320,12 @@ static void rq_model_update(const RATE_QINDEX_HISTORY *rq_history, } #endif // CONFIG_RATE_CTRL -static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, - uint8_t *dest) { +static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest +#if CONFIG_RATE_CTRL + , + RATE_QINDEX_HISTORY *rq_history +#endif // CONFIG_RATE_CTRL +) { const VP9EncoderConfig *const oxcf = &cpi->oxcf; VP9_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; @@ -4354,8 +4348,7 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, cpi->twopass.gf_group.update_type[cpi->twopass.gf_group.index]; const ENCODE_FRAME_TYPE frame_type = get_encode_frame_type(update_type); RATE_QSTEP_MODEL *rq_model = &cpi->rq_model[frame_type]; - RATE_QINDEX_HISTORY rq_history; - init_rq_history(&rq_history); + init_rq_history(rq_history); #endif // CONFIG_RATE_CTRL if (cm->show_existing_frame) { @@ -4406,8 +4399,7 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, #if CONFIG_RATE_CTRL if (cpi->encode_command.use_external_target_frame_bits) { - q = rq_model_predict_q_index(rq_model, &rq_history, - rc->this_frame_target); + q = rq_model_predict_q_index(rq_model, rq_history, rc->this_frame_target); } #endif // CONFIG_RATE_CTRL // Decide frame size bounds first time through. @@ -4507,16 +4499,16 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, if (cpi->encode_command.use_external_target_frame_bits) { const double percent_diff = get_bits_percent_diff( rc->this_frame_target, rc->projected_frame_size); - update_rq_history(&rq_history, rc->this_frame_target, + update_rq_history(rq_history, rc->this_frame_target, rc->projected_frame_size, q); loop_count += 1; - rq_model_update(&rq_history, rc->this_frame_target, rq_model); + rq_model_update(rq_history, rc->this_frame_target, rq_model); // Check if we hit the target bitrate. if (percent_diff <= 15 || - rq_history.recode_count >= RATE_CTRL_MAX_RECODE_NUM || - rq_history.q_index_low >= rq_history.q_index_high) { + rq_history->recode_count >= RATE_CTRL_MAX_RECODE_NUM || + rq_history->q_index_low >= rq_history->q_index_high) { break; } @@ -5374,8 +5366,12 @@ static void encode_frame_to_data_rate( if (!encode_without_recode_loop(cpi, size, dest)) return; } else { #if !CONFIG_REALTIME_ONLY +#if CONFIG_RATE_CTRL + encode_with_recode_loop(cpi, size, dest, &encode_frame_result->rq_history); +#else // CONFIG_RATE_CTRL encode_with_recode_loop(cpi, size, dest); -#endif +#endif // CONFIG_RATE_CTRL +#endif // !CONFIG_REALTIME_ONLY } // TODO(jingning): When using show existing frame mode, we assume that the @@ -7561,6 +7557,7 @@ void vp9_init_encode_frame_result(ENCODE_FRAME_RESULT *encode_frame_result) { encode_frame_result->frame_coding_index = -1; vp9_zero(encode_frame_result->coded_frame); encode_frame_result->coded_frame.allocated = 0; + init_rq_history(&encode_frame_result->rq_history); #endif // CONFIG_RATE_CTRL } diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 3758ea3aea..0d537ca972 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -1041,6 +1041,17 @@ typedef struct IMAGE_BUFFER { int plane_height[3]; uint8_t *plane_buffer[3]; } IMAGE_BUFFER; + +#define RATE_CTRL_MAX_RECODE_NUM 7 + +typedef struct RATE_QINDEX_HISTORY { + int recode_count; + int q_index_history[RATE_CTRL_MAX_RECODE_NUM]; + int rate_history[RATE_CTRL_MAX_RECODE_NUM]; + int q_index_high; + int q_index_low; +} RATE_QINDEX_HISTORY; + #endif // CONFIG_RATE_CTRL typedef struct ENCODE_FRAME_RESULT { @@ -1056,6 +1067,7 @@ typedef struct ENCODE_FRAME_RESULT { const PARTITION_INFO *partition_info; const MOTION_VECTOR_INFO *motion_vector_info; IMAGE_BUFFER coded_frame; + RATE_QINDEX_HISTORY rq_history; #endif // CONFIG_RATE_CTRL int quantize_index; } ENCODE_FRAME_RESULT; diff --git a/vp9/simple_encode.cc b/vp9/simple_encode.cc index 4ee0bd2e84..da9c4e10aa 100644 --- a/vp9/simple_encode.cc +++ b/vp9/simple_encode.cc @@ -485,6 +485,18 @@ static bool init_encode_frame_result(EncodeFrameResult *encode_frame_result, frame_height, img_fmt); } +static void encode_frame_result_update_rq_history( + const RATE_QINDEX_HISTORY *rq_history, + EncodeFrameResult *encode_frame_result) { + encode_frame_result->recode_count = rq_history->recode_count; + for (int i = 0; i < encode_frame_result->recode_count; ++i) { + const int q_index = rq_history->q_index_history[i]; + const int rate = rq_history->rate_history[i]; + encode_frame_result->q_index_history.push_back(q_index); + encode_frame_result->rate_history.push_back(rate); + } +} + static void update_encode_frame_result( EncodeFrameResult *encode_frame_result, const ENCODE_FRAME_RESULT *encode_frame_info) { @@ -514,6 +526,8 @@ static void update_encode_frame_result( &encode_frame_result->motion_vector_info[0]); update_frame_counts(&encode_frame_info->frame_counts, &encode_frame_result->frame_counts); + encode_frame_result_update_rq_history(&encode_frame_info->rq_history, + encode_frame_result); } static void IncreaseGroupOfPictureIndex(GroupOfPicture *group_of_picture) { diff --git a/vp9/simple_encode.h b/vp9/simple_encode.h index 6c66aafda4..702895e05c 100644 --- a/vp9/simple_encode.h +++ b/vp9/simple_encode.h @@ -256,6 +256,12 @@ struct EncodeFrameResult { // share the same motion vector information. std::vector motion_vector_info; ImageBuffer coded_frame; + + // recode_count, q_index_history and rate_history are only available when + // EncodeFrameWithTargetFrameBits() is used. + int recode_count; + std::vector q_index_history; + std::vector rate_history; }; struct GroupOfPicture { From d6f2ae2c127b4dd61fd7bedf725152138d02b20e Mon Sep 17 00:00:00 2001 From: angiebird Date: Fri, 7 Aug 2020 18:12:18 -0700 Subject: [PATCH 0039/1000] Avoid division by zero for rate q_step model Change-Id: Ic5709b79131a3969fcb2a0eb3f53994f788b5cc9 --- vp9/encoder/vp9_encoder.c | 52 +++++++++++++++++++++++++++------------ 1 file changed, 36 insertions(+), 16 deletions(-) diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 4820b2529f..3348cf66ff 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -4228,8 +4228,9 @@ static int guess_q_index_from_model(const RATE_QSTEP_MODEL *rq_model, // The model predicts bits as follows. // target_bits = bias - ratio * log2(q_step) // Given the target_bits, we compute the q_step as follows. - const double q_step = - pow(2.0, (rq_model->bias - target_bits) / rq_model->ratio); + double q_step; + assert(rq_model->ratio > 0); + q_step = pow(2.0, (rq_model->bias - target_bits) / rq_model->ratio); // TODO(angiebird): Make this function support highbitdepth. return vp9_convert_q_to_qindex(q_step, VPX_BITS_8); } @@ -4248,7 +4249,9 @@ static int guess_q_index_linear(int prev_q_index, int target_bits, } static double get_bits_percent_diff(int target_bits, int actual_bits) { - double diff = abs(target_bits - actual_bits) * 1. / target_bits; + double diff; + target_bits = VPXMAX(target_bits, 1); + diff = abs(target_bits - actual_bits) * 1. / target_bits; return diff * 100; } @@ -4295,26 +4298,43 @@ static int rq_model_predict_q_index(const RATE_QSTEP_MODEL *rq_model, static void rq_model_update(const RATE_QINDEX_HISTORY *rq_history, int target_bits, RATE_QSTEP_MODEL *rq_model) { const int recode_count = rq_history->recode_count; + const double delta = 0.00001; if (recode_count >= 2) { - // Fit the ratio and bias of rq_model based on last two recode histories. - const double s1 = vp9_convert_qindex_to_q( - rq_history->q_index_history[recode_count - 2], VPX_BITS_8); - const double s2 = vp9_convert_qindex_to_q( - rq_history->q_index_history[recode_count - 1], VPX_BITS_8); - const double r1 = rq_history->rate_history[recode_count - 2]; - const double r2 = rq_history->rate_history[recode_count - 1]; - rq_model->ratio = (r2 - r1) / (log2(s1) - log2(s2)); - rq_model->bias = r1 + (rq_model->ratio) * log2(s1); - rq_model->ready = 1; + const int q_index1 = rq_history->q_index_history[recode_count - 2]; + const int q_index2 = rq_history->q_index_history[recode_count - 1]; + const int r1 = rq_history->rate_history[recode_count - 2]; + const int r2 = rq_history->rate_history[recode_count - 1]; + int valid = 0; + // lower q_index should yield higher bit rate + if (q_index1 < q_index2) { + valid = r1 > r2; + } else if (q_index1 > q_index2) { + valid = r1 < r2; + } + // Only update the model when the q_index and rate behave normally. + if (valid) { + // Fit the ratio and bias of rq_model based on last two recode histories. + const double s1 = vp9_convert_qindex_to_q(q_index1, VPX_BITS_8); + const double s2 = vp9_convert_qindex_to_q(q_index2, VPX_BITS_8); + if (fabs(log2(s1) - log2(s2)) > delta) { + rq_model->ratio = (r2 - r1) / (log2(s1) - log2(s2)); + rq_model->bias = r1 + (rq_model->ratio) * log2(s1); + if (rq_model->ratio > delta && rq_model->bias > delta) { + rq_model->ready = 1; + } + } + } } else if (recode_count == 1) { if (rq_model->ready) { // Update the ratio only when the initial model exists and we only have // one recode history. const int prev_q = rq_history->q_index_history[recode_count - 1]; const double prev_q_step = vp9_convert_qindex_to_q(prev_q, VPX_BITS_8); - const int actual_bits = rq_history->rate_history[recode_count - 1]; - rq_model->ratio = - rq_model->ratio + (target_bits - actual_bits) / log2(prev_q_step); + if (fabs(log2(prev_q_step)) > delta) { + const int actual_bits = rq_history->rate_history[recode_count - 1]; + rq_model->ratio = + rq_model->ratio + (target_bits - actual_bits) / log2(prev_q_step); + } } } } From 246a65c6961cfa314d41780ec3909a2308a568c6 Mon Sep 17 00:00:00 2001 From: angiebird Date: Fri, 7 Aug 2020 18:39:18 -0700 Subject: [PATCH 0040/1000] Make target_frame_bits error margin configurable. Change-Id: I05dd4d60741743c13951727ce6608acf4224ebec --- test/simple_encode_test.cc | 10 +++++++--- vp9/encoder/vp9_encoder.c | 2 +- vp9/encoder/vp9_encoder.h | 9 ++++++++- vp9/simple_encode.cc | 5 +++-- vp9/simple_encode.h | 9 +++++++-- 5 files changed, 26 insertions(+), 9 deletions(-) diff --git a/test/simple_encode_test.cc b/test/simple_encode_test.cc index c5674466d7..eeb65c2d28 100644 --- a/test/simple_encode_test.cc +++ b/test/simple_encode_test.cc @@ -182,9 +182,13 @@ TEST_F(SimpleEncodeTest, EncodeFrameWithTargetFrameBits) { target_frame_bits = 2000; } + double percent_diff = 15; + if (encode_frame_info.frame_type == kFrameTypeOverlay) { + percent_diff = 100; + } EncodeFrameResult encode_frame_result; - simple_encode.EncodeFrameWithTargetFrameBits(&encode_frame_result, - target_frame_bits); + simple_encode.EncodeFrameWithTargetFrameBits( + &encode_frame_result, target_frame_bits, percent_diff); const int recode_count = encode_frame_result.recode_count; // TODO(angiebird): Replace 7 by RATE_CTRL_MAX_RECODE_NUM EXPECT_LE(recode_count, 7); @@ -192,7 +196,7 @@ TEST_F(SimpleEncodeTest, EncodeFrameWithTargetFrameBits) { double diff = fabs((double)encode_frame_result.coding_data_bit_size - target_frame_bits); - EXPECT_LE(diff * 100 / target_frame_bits, 15); + EXPECT_LE(diff * 100 / target_frame_bits, percent_diff); } simple_encode.EndEncode(); } diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 3348cf66ff..0485590615 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -4526,7 +4526,7 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest rq_model_update(rq_history, rc->this_frame_target, rq_model); // Check if we hit the target bitrate. - if (percent_diff <= 15 || + if (percent_diff <= cpi->encode_command.target_frame_bits_error_percent || rq_history->recode_count >= RATE_CTRL_MAX_RECODE_NUM || rq_history->q_index_low >= rq_history->q_index_high) { break; diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 0d537ca972..26d35cccf9 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -598,8 +598,11 @@ typedef struct RATE_QSTEP_MODEL { typedef struct ENCODE_COMMAND { int use_external_quantize_index; int external_quantize_index; + int use_external_target_frame_bits; int target_frame_bits; + double target_frame_bits_error_percent; + GOP_COMMAND gop_command; } ENCODE_COMMAND; @@ -621,15 +624,19 @@ static INLINE void encode_command_reset_external_quantize_index( } static INLINE void encode_command_set_target_frame_bits( - ENCODE_COMMAND *encode_command, int target_frame_bits) { + ENCODE_COMMAND *encode_command, int target_frame_bits, + double target_frame_bits_error_percent) { encode_command->use_external_target_frame_bits = 1; encode_command->target_frame_bits = target_frame_bits; + encode_command->target_frame_bits_error_percent = + target_frame_bits_error_percent; } static INLINE void encode_command_reset_target_frame_bits( ENCODE_COMMAND *encode_command) { encode_command->use_external_target_frame_bits = 0; encode_command->target_frame_bits = -1; + encode_command->target_frame_bits_error_percent = 0; } static INLINE void encode_command_init(ENCODE_COMMAND *encode_command) { diff --git a/vp9/simple_encode.cc b/vp9/simple_encode.cc index da9c4e10aa..bb7f544792 100644 --- a/vp9/simple_encode.cc +++ b/vp9/simple_encode.cc @@ -1098,9 +1098,10 @@ void SimpleEncode::EncodeFrameWithQuantizeIndex( } void SimpleEncode::EncodeFrameWithTargetFrameBits( - EncodeFrameResult *encode_frame_result, int target_frame_bits) { + EncodeFrameResult *encode_frame_result, int target_frame_bits, + double percent_diff) { encode_command_set_target_frame_bits(&impl_ptr_->cpi->encode_command, - target_frame_bits); + target_frame_bits, percent_diff); EncodeFrame(encode_frame_result); encode_command_reset_target_frame_bits(&impl_ptr_->cpi->encode_command); } diff --git a/vp9/simple_encode.h b/vp9/simple_encode.h index 702895e05c..5db85fd7d3 100644 --- a/vp9/simple_encode.h +++ b/vp9/simple_encode.h @@ -398,9 +398,14 @@ class SimpleEncode { // Encode a frame with target frame bits usage. // The encoder will find a quantize index to make the actual frame bits usage - // match the target. + // match the target. EncodeFrameWithTargetFrameBits() will recode the frame + // update to 7 times to find a q_index to make the actual_frame_bits to + // satisfy following inequality. + // |actual_frame_bits - target_frame_bits| * 100 / target_frame_bits + // <= percent_diff. void EncodeFrameWithTargetFrameBits(EncodeFrameResult *encode_frame_result, - int target_frame_bits); + int target_frame_bits, + double percent_diff); // Gets the number of coding frames for the video. The coding frames include // show frame and no show frame. From 7122eea6a42b136c6e4b59663847ba39e021b73a Mon Sep 17 00:00:00 2001 From: angiebird Date: Mon, 10 Aug 2020 15:46:37 -0700 Subject: [PATCH 0041/1000] Cosmetic change for simple_encode_test.cc Change-Id: I50b4d38f7deceb5b416e72dd944d2ed31e42dafa --- test/simple_encode_test.cc | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/test/simple_encode_test.cc b/test/simple_encode_test.cc index eeb65c2d28..ab893045d8 100644 --- a/test/simple_encode_test.cc +++ b/test/simple_encode_test.cc @@ -172,14 +172,14 @@ TEST_F(SimpleEncodeTest, EncodeFrameWithTargetFrameBits) { simple_encode.StartEncode(); for (int i = 0; i < num_coding_frames; ++i) { EncodeFrameInfo encode_frame_info = simple_encode.GetNextEncodeFrameInfo(); - int target_frame_bits = 20000; - if (encode_frame_info.frame_type == kFrameTypeKey || - encode_frame_info.frame_type == kFrameTypeAltRef || - encode_frame_info.frame_type == kFrameTypeGolden) { - target_frame_bits = 100000; - } - if (encode_frame_info.frame_type == kFrameTypeOverlay) { - target_frame_bits = 2000; + int target_frame_bits; + switch (encode_frame_info.frame_type) { + case kFrameTypeInter: target_frame_bits = 20000; break; + case kFrameTypeKey: + case kFrameTypeAltRef: + case kFrameTypeGolden: target_frame_bits = 100000; break; + case kFrameTypeOverlay: target_frame_bits = 2000; break; + default: target_frame_bits = 20000; } double percent_diff = 15; @@ -194,8 +194,8 @@ TEST_F(SimpleEncodeTest, EncodeFrameWithTargetFrameBits) { EXPECT_LE(recode_count, 7); EXPECT_GE(recode_count, 1); - double diff = fabs((double)encode_frame_result.coding_data_bit_size - - target_frame_bits); + const double diff = fabs((double)encode_frame_result.coding_data_bit_size - + target_frame_bits); EXPECT_LE(diff * 100 / target_frame_bits, percent_diff); } simple_encode.EndEncode(); From 04db83211cd1607f51155c6a8e51381f45a441a5 Mon Sep 17 00:00:00 2001 From: angiebird Date: Mon, 10 Aug 2020 15:59:30 -0700 Subject: [PATCH 0042/1000] Correct the first pass motion vector scale Change-Id: I005a648f7f9ead9d36a39330dfbb096919affb34 --- vp9/simple_encode.cc | 17 ++++++++++------- vp9/simple_encode.h | 3 ++- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/vp9/simple_encode.cc b/vp9/simple_encode.cc index bb7f544792..46b25d1fdf 100644 --- a/vp9/simple_encode.cc +++ b/vp9/simple_encode.cc @@ -167,7 +167,8 @@ static RefFrameType mv_ref_frame_to_ref_frame_type( static void update_motion_vector_info( const MOTION_VECTOR_INFO *input_motion_vector_info, const int num_rows_4x4, - const int num_cols_4x4, MotionVectorInfo *output_motion_vector_info) { + const int num_cols_4x4, MotionVectorInfo *output_motion_vector_info, + int motion_vector_scale) { const int num_units_4x4 = num_rows_4x4 * num_cols_4x4; for (int i = 0; i < num_units_4x4; ++i) { const MV_REFERENCE_FRAME *in_ref_frame = @@ -185,16 +186,16 @@ static void update_motion_vector_info( mv_ref_frame_to_ref_frame_type(in_ref_frame[1]); output_motion_vector_info[i].mv_row[0] = (double)input_motion_vector_info[i].mv[0].as_mv.row / - kMotionVectorPrecision; + motion_vector_scale; output_motion_vector_info[i].mv_column[0] = (double)input_motion_vector_info[i].mv[0].as_mv.col / - kMotionVectorPrecision; + motion_vector_scale; output_motion_vector_info[i].mv_row[1] = (double)input_motion_vector_info[i].mv[1].as_mv.row / - kMotionVectorPrecision; + motion_vector_scale; output_motion_vector_info[i].mv_column[1] = (double)input_motion_vector_info[i].mv[1].as_mv.col / - kMotionVectorPrecision; + motion_vector_scale; } } @@ -523,7 +524,8 @@ static void update_encode_frame_result( update_motion_vector_info(encode_frame_info->motion_vector_info, encode_frame_result->num_rows_4x4, encode_frame_result->num_cols_4x4, - &encode_frame_result->motion_vector_info[0]); + &encode_frame_result->motion_vector_info[0], + kMotionVectorSubPixelPrecision); update_frame_counts(&encode_frame_info->frame_counts, &encode_frame_result->frame_counts); encode_frame_result_update_rq_history(&encode_frame_info->rq_history, @@ -793,7 +795,8 @@ void SimpleEncode::ComputeFirstPassStats() { // Get vp9 first pass motion vector info. std::vector mv_info(num_rows_16x16 * num_cols_16x16); update_motion_vector_info(cpi->fp_motion_vector_info, num_rows_16x16, - num_cols_16x16, mv_info.data()); + num_cols_16x16, mv_info.data(), + kMotionVectorFullPixelPrecision); fp_motion_vector_info_.push_back(mv_info); } impl_ptr_->first_pass_stats.push_back(vp9_get_frame_stats(&cpi->twopass)); diff --git a/vp9/simple_encode.h b/vp9/simple_encode.h index 5db85fd7d3..8f0d374393 100644 --- a/vp9/simple_encode.h +++ b/vp9/simple_encode.h @@ -58,7 +58,8 @@ struct PartitionInfo { int height; // prediction block height }; -constexpr int kMotionVectorPrecision = 8; +constexpr int kMotionVectorSubPixelPrecision = 8; +constexpr int kMotionVectorFullPixelPrecision = 1; // In the first pass. The frame is split to 16x16 blocks. // This structure contains the information of each 16x16 block. From 7370cecd8929141adb8140b924d3dd8ac1887d36 Mon Sep 17 00:00:00 2001 From: angiebird Date: Mon, 10 Aug 2020 15:37:24 -0700 Subject: [PATCH 0043/1000] Close out file in EndEncode() Change-Id: Ib6549f954ce6d5d966eef09a119b46f0cc2f54f7 --- vp9/simple_encode.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/vp9/simple_encode.cc b/vp9/simple_encode.cc index 46b25d1fdf..ba076fd586 100644 --- a/vp9/simple_encode.cc +++ b/vp9/simple_encode.cc @@ -959,6 +959,10 @@ void SimpleEncode::EndEncode() { impl_ptr_->cpi = nullptr; vpx_img_free(&impl_ptr_->tmp_img); rewind(in_file_); + if (out_file_ != nullptr) { + fclose(out_file_); + out_file_ = nullptr; + } } void SimpleEncode::UpdateKeyFrameGroup(int key_frame_show_index) { From 343c4dca644aaa78315866a8e166c27d9b2b85cb Mon Sep 17 00:00:00 2001 From: angiebird Date: Mon, 10 Aug 2020 19:37:30 -0700 Subject: [PATCH 0044/1000] Cosmetic changes in simple_encode.h Change-Id: If7d2711e7f37f00629874914f7c4d2396358e39d --- vp9/simple_encode.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vp9/simple_encode.h b/vp9/simple_encode.h index 8f0d374393..dc055e9754 100644 --- a/vp9/simple_encode.h +++ b/vp9/simple_encode.h @@ -400,9 +400,9 @@ class SimpleEncode { // Encode a frame with target frame bits usage. // The encoder will find a quantize index to make the actual frame bits usage // match the target. EncodeFrameWithTargetFrameBits() will recode the frame - // update to 7 times to find a q_index to make the actual_frame_bits to - // satisfy following inequality. - // |actual_frame_bits - target_frame_bits| * 100 / target_frame_bits + // up to 7 times to find a q_index to make the actual_frame_bits satisfy the + // following inequality. |actual_frame_bits - target_frame_bits| * 100 / + // target_frame_bits // <= percent_diff. void EncodeFrameWithTargetFrameBits(EncodeFrameResult *encode_frame_result, int target_frame_bits, From 6ac8a0c9b01b9a4b65e5926814d399f13981902f Mon Sep 17 00:00:00 2001 From: angiebird Date: Mon, 10 Aug 2020 19:53:10 -0700 Subject: [PATCH 0045/1000] Avoid re-allocating fp_motion_vector_info Replace fp_motion_vector_info_init() by fp_motion_vector_info_reset() in first_pass_encode() Change-Id: Iadacb1ecc4f07435340399564fdd3bfd4ac702f4 --- vp9/encoder/vp9_encoder.h | 11 +++++++++-- vp9/encoder/vp9_firstpass.c | 4 +++- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 26d35cccf9..b201e25c16 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -1025,12 +1025,19 @@ static INLINE void fp_motion_vector_info_init(struct VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; const int unit_width = get_num_unit_16x16(cpi->frame_info.frame_width); const int unit_height = get_num_unit_16x16(cpi->frame_info.frame_height); - int i; CHECK_MEM_ERROR(cm, cpi->fp_motion_vector_info, (MOTION_VECTOR_INFO *)vpx_calloc(unit_width * unit_height, sizeof(MOTION_VECTOR_INFO))); +} + +static INLINE void fp_motion_vector_info_reset( + int frame_width, int frame_height, + MOTION_VECTOR_INFO *fp_motion_vector_info) { + const int unit_width = get_num_unit_16x16(frame_width); + const int unit_height = get_num_unit_16x16(frame_height); + int i; for (i = 0; i < unit_width * unit_height; ++i) { - reset_mv_info(cpi->fp_motion_vector_info + i); + reset_mv_info(fp_motion_vector_info + i); } } diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index 3847755073..de954f7575 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -1388,7 +1388,9 @@ static void first_pass_encode(VP9_COMP *cpi, FIRSTPASS_DATA *fp_acc_data) { vp9_tile_init(tile, cm, 0, 0); #if CONFIG_RATE_CTRL - fp_motion_vector_info_init(cpi); + fp_motion_vector_info_reset(cpi->frame_info.frame_width, + cpi->frame_info.frame_height, + cpi->fp_motion_vector_info); #endif for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) { From 529c29bb0f6511a9b3e25177cece0d843827458b Mon Sep 17 00:00:00 2001 From: Marco Paniconi Date: Mon, 17 Aug 2020 21:49:41 -0700 Subject: [PATCH 0046/1000] rtc-vp9: Fix to rcstats in vp9_spatial_svc_encoder Fixes the rcstats for case when #spatial_layers = 1. Change-Id: Ie28d99852033307bc4c69c7e738e1d4cab4e8cf5 --- examples/vp9_spatial_svc_encoder.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/examples/vp9_spatial_svc_encoder.c b/examples/vp9_spatial_svc_encoder.c index b987989a86..f273c1208a 100644 --- a/examples/vp9_spatial_svc_encoder.c +++ b/examples/vp9_spatial_svc_encoder.c @@ -828,12 +828,15 @@ static void svc_output_rc_stats( vpx_codec_control(codec, VP9E_GET_SVC_LAYER_ID, layer_id); parse_superframe_index(cx_pkt->data.frame.buf, cx_pkt->data.frame.sz, sizes_parsed, &count); - if (enc_cfg->ss_number_layers == 1) sizes[0] = cx_pkt->data.frame.sz; - for (sl = 0; sl < enc_cfg->ss_number_layers; ++sl) { - sizes[sl] = 0; - if (cx_pkt->data.frame.spatial_layer_encoded[sl]) { - sizes[sl] = sizes_parsed[num_layers_encoded]; - num_layers_encoded++; + if (enc_cfg->ss_number_layers == 1) { + sizes[0] = cx_pkt->data.frame.sz; + } else { + for (sl = 0; sl < enc_cfg->ss_number_layers; ++sl) { + sizes[sl] = 0; + if (cx_pkt->data.frame.spatial_layer_encoded[sl]) { + sizes[sl] = sizes_parsed[num_layers_encoded]; + num_layers_encoded++; + } } } for (sl = 0; sl < enc_cfg->ss_number_layers; ++sl) { From ea6562f2fcd8ea6baa946eaad69cbb5aec5dd278 Mon Sep 17 00:00:00 2001 From: jinbo Date: Sat, 15 Aug 2020 17:20:36 +0800 Subject: [PATCH 0047/1000] Refine MMI & MSA detection for mips 1.Add compile check to probe the native ability of toolchain to decide whether a feature can be enabled. 2.Add runtime check to probe cpu supported features. MSA will be prefered if MSA and MMI are both supported. 3.You can configure and build as following commands: ./configure --cpu=loongson3a && make -j4 Change-Id: I057553216dbc79cfaba9c691d5f4cdab144e1123 --- build/make/Makefile | 4 ++ build/make/configure.sh | 20 +++++----- build/make/rtcd.pl | 35 ++++++++++++++--- vp8/common/generic/systemdependent.c | 4 ++ vpx_ports/mips.h | 27 +++++++++++++ vpx_ports/mips_cpudetect.c | 57 ++++++++++++++++++++++++++++ vpx_ports/vpx_ports.mk | 3 ++ 7 files changed, 135 insertions(+), 15 deletions(-) create mode 100644 vpx_ports/mips.h create mode 100644 vpx_ports/mips_cpudetect.c diff --git a/build/make/Makefile b/build/make/Makefile index be28102295..9ca97c8c64 100644 --- a/build/make/Makefile +++ b/build/make/Makefile @@ -147,6 +147,10 @@ $(BUILD_PFX)%_avx512.c.o: CFLAGS += -mavx512f -mavx512cd -mavx512bw -mavx512dq - $(BUILD_PFX)%_vsx.c.d: CFLAGS += -maltivec -mvsx $(BUILD_PFX)%_vsx.c.o: CFLAGS += -maltivec -mvsx +# MIPS +$(BUILD_PFX)%_msa.c.d: CFLAGS += -mmsa +$(BUILD_PFX)%_msa.c.o: CFLAGS += -mmsa + $(BUILD_PFX)%.c.d: %.c $(if $(quiet),@echo " [DEP] $@") $(qexec)mkdir -p $(dir $@) diff --git a/build/make/configure.sh b/build/make/configure.sh index 206b54f775..f8aa102775 100644 --- a/build/make/configure.sh +++ b/build/make/configure.sh @@ -1195,25 +1195,27 @@ EOF check_add_asflags -mips64r6 -mabi=64 -mhard-float -mfp64 check_add_ldflags -mips64r6 -mabi=64 -mfp64 ;; + loongson3*) + check_cflags -march=loongson3a && soft_enable mmi \ + || disable_feature mmi + check_cflags -mmsa && soft_enable msa \ + || disable_feature msa + tgt_isa=loongson3a + ;; esac + if enabled mmi || enabled msa; then + soft_enable runtime_cpu_detect + fi + if enabled msa; then # TODO(libyuv:793) # The new mips functions in libyuv do not build # with the toolchains we currently use for testing. soft_disable libyuv - - add_cflags -mmsa - add_asflags -mmsa - add_ldflags -mmsa fi fi - if enabled mmi; then - tgt_isa=loongson3a - check_add_ldflags -march=loongson3a - fi - check_add_cflags -march=${tgt_isa} check_add_asflags -march=${tgt_isa} check_add_asflags -KPIC diff --git a/build/make/rtcd.pl b/build/make/rtcd.pl index 7483200411..acb9f6e466 100755 --- a/build/make/rtcd.pl +++ b/build/make/rtcd.pl @@ -315,14 +315,26 @@ () sub mips() { determine_indirection("c", @ALL_ARCHS); + + # Assign the helper variable for each enabled extension + foreach my $opt (@ALL_ARCHS) { + my $opt_uc = uc $opt; + eval "\$have_${opt}=\"flags & HAS_${opt_uc}\""; + } + common_top; print <) { if (/HAVE_DSPR2=yes/) { - @ALL_ARCHS = filter("$opts{arch}", qw/dspr2/); - last; + $have_dspr2 = 1; } if (/HAVE_MSA=yes/) { - @ALL_ARCHS = filter("$opts{arch}", qw/msa/); - last; + $have_msa = 1; } if (/HAVE_MMI=yes/) { - @ALL_ARCHS = filter("$opts{arch}", qw/mmi/); - last; + $have_mmi = 1; } } close CONFIG_FILE; + if ($have_dspr2 == 1) { + @ALL_ARCHS = filter("$opts{arch}", qw/dspr2/); + } elsif ($have_msa == 1 && $have_mmi == 1) { + @ALL_ARCHS = filter("$opts{arch}", qw/mmi msa/); + } elsif ($have_msa == 1) { + @ALL_ARCHS = filter("$opts{arch}", qw/msa/); + } elsif ($have_mmi == 1) { + @ALL_ARCHS = filter("$opts{arch}", qw/mmi/); + } else { + unoptimized; + } mips; } elsif ($opts{arch} =~ /armv7\w?/) { @ALL_ARCHS = filter(qw/neon_asm neon/); diff --git a/vp8/common/generic/systemdependent.c b/vp8/common/generic/systemdependent.c index 75ce7ef359..cd1b02c9cc 100644 --- a/vp8/common/generic/systemdependent.c +++ b/vp8/common/generic/systemdependent.c @@ -16,6 +16,8 @@ #include "vpx_ports/x86.h" #elif VPX_ARCH_PPC #include "vpx_ports/ppc.h" +#elif VPX_ARCH_MIPS +#include "vpx_ports/mips.h" #endif #include "vp8/common/onyxc_int.h" #include "vp8/common/systemdependent.h" @@ -96,6 +98,8 @@ void vp8_machine_specific_config(VP8_COMMON *ctx) { ctx->cpu_caps = x86_simd_caps(); #elif VPX_ARCH_PPC ctx->cpu_caps = ppc_simd_caps(); +#elif VPX_ARCH_MIPS + ctx->cpu_caps = mips_cpu_caps(); #else // generic-gnu targets. ctx->cpu_caps = 0; diff --git a/vpx_ports/mips.h b/vpx_ports/mips.h new file mode 100644 index 0000000000..bdc7525f7b --- /dev/null +++ b/vpx_ports/mips.h @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2020 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_PORTS_MIPS_H_ +#define VPX_PORTS_MIPS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#define HAS_MMI 0x01 +#define HAS_MSA 0x02 + +int mips_cpu_caps(void); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_PORTS_MIPS_H_ diff --git a/vpx_ports/mips_cpudetect.c b/vpx_ports/mips_cpudetect.c new file mode 100644 index 0000000000..e0eca2d48d --- /dev/null +++ b/vpx_ports/mips_cpudetect.c @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2020 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include +#include +#include "./vpx_config.h" +#include "vpx_ports/mips.h" + +#if CONFIG_RUNTIME_CPU_DETECT +#if defined(__mips__) && defined(__linux__) +int mips_cpu_caps(void) { + char cpuinfo_line[512]; + int flag = 0x0; + FILE *f = fopen("/proc/cpuinfo", "r"); + if (!f) { + // Assume nothing if /proc/cpuinfo is unavailable. + // This will occur for Chrome sandbox for Pepper or Render process. + return 0; + } + while (fgets(cpuinfo_line, sizeof(cpuinfo_line) - 1, f)) { + if (memcmp(cpuinfo_line, "cpu model", 9) == 0) { + // Workaround early kernel without mmi in ASEs line. + if (strstr(cpuinfo_line, "Loongson-3")) { + flag |= HAS_MMI; + } else if (strstr(cpuinfo_line, "Loongson-2K")) { + flag |= HAS_MMI | HAS_MSA; + } + } + if (memcmp(cpuinfo_line, "ASEs implemented", 16) == 0) { + if (strstr(cpuinfo_line, "loongson-mmi") && + strstr(cpuinfo_line, "loongson-ext")) { + flag |= HAS_MMI; + } + if (strstr(cpuinfo_line, "msa")) { + flag |= HAS_MSA; + } + // ASEs is the last line, so we can break here. + break; + } + } + fclose(f); + return flag; +} +#else /* end __mips__ && __linux__ */ +#error \ + "--enable-runtime-cpu-detect selected, but no CPU detection method " \ +"available for your platform. Reconfigure with --disable-runtime-cpu-detect." +#endif +#else /* end CONFIG_RUNTIME_CPU_DETECT */ +int mips_cpu_caps(void) { return 0; } +#endif diff --git a/vpx_ports/vpx_ports.mk b/vpx_ports/vpx_ports.mk index 2331773693..e5001be496 100644 --- a/vpx_ports/vpx_ports.mk +++ b/vpx_ports/vpx_ports.mk @@ -42,6 +42,9 @@ PORTS_SRCS-$(VPX_ARCH_ARM) += arm.h PORTS_SRCS-$(VPX_ARCH_PPC) += ppc_cpudetect.c PORTS_SRCS-$(VPX_ARCH_PPC) += ppc.h +PORTS_SRCS-$(VPX_ARCH_MIPS) += mips_cpudetect.c +PORTS_SRCS-$(VPX_ARCH_MIPS) += mips.h + ifeq ($(VPX_ARCH_MIPS), yes) PORTS_SRCS-yes += asmdefs_mmi.h endif From 53747dfe65eaf670a7192f55117f3bf1e0280743 Mon Sep 17 00:00:00 2001 From: Marco Paniconi Date: Tue, 18 Aug 2020 17:56:51 -0700 Subject: [PATCH 0048/1000] vp9-svc: Fix to resetting RC for temporal layers Fix to reset RC for temporal layers: the first_spatial_layer_to_encode is usually/default 0, so the logic to reset for temporal layers was not being executed. Use VPXMAX(1, ) to make sure all temporal layers will be reset (when max-q is used for overshoot). Change-Id: Iec669870c865420d01d52eab9425cd6c7714eddc --- vp9/encoder/vp9_ratectrl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index 9bb8d45d2d..27a529914d 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -3262,7 +3262,7 @@ int vp9_encodedframe_overshoot(VP9_COMP *cpi, int frame_size, int *q) { int tl = 0; int sl = 0; SVC *svc = &cpi->svc; - for (sl = 0; sl < svc->first_spatial_layer_to_encode; ++sl) { + for (sl = 0; sl < VPXMAX(1, svc->first_spatial_layer_to_encode); ++sl) { for (tl = 0; tl < svc->number_temporal_layers; ++tl) { const int layer = LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers); From c413c8f18eb1932b100850505031980e27160d5f Mon Sep 17 00:00:00 2001 From: Daniel Sommermann Date: Mon, 17 Aug 2020 09:42:08 -0700 Subject: [PATCH 0049/1000] Escape number sign in Makefiles Number signs are handled differently in Makefile variable parsing as compared to bash variable parsing. See this demo: ``` $ cat Makefile A=foo#bar B='foo#bar' C="foo#bar" D=foo\#bar E='foo\#bar' F="foo\#bar" $(info $(A)) $(info $(B)) $(info $(C)) $(info $(D)) $(info $(E)) $(info $(F)) $ make foo 'foo "foo foo#bar 'foo#bar' "foo#bar" make: *** No targets. Stop. $ make -v GNU Make 4.2.1 ``` In other words, the `#` character is evaluated first when parsing Makefiles, causing the rest of the line to become a comment. The effect of this is that paths that contain embedded `#` symbols are not handled properly in the vpx build system. To test this change, clone vpx to a directory containing a `#` symbol and attempt a build. With this change, it worked for me on Fedora 31, however without the change the build failed. Change-Id: Iaee6383e2435049b680484cc5cefdea9f2d9df46 --- build/make/configure.sh | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/build/make/configure.sh b/build/make/configure.sh index f8aa102775..8bdfef39db 100644 --- a/build/make/configure.sh +++ b/build/make/configure.sh @@ -262,6 +262,9 @@ if [ -z "$source_path" ] || [ "$source_path" = "." ]; then source_path="`pwd`" disable_feature source_path_used fi +# Makefiles greedily process the '#' character as a comment, even if it is +# inside quotes. So, this character must be escaped in all paths in Makefiles. +source_path_mk=$(echo $source_path | sed -e 's;\#;\\\#;g') if test ! -z "$TMPDIR" ; then TMPDIRx="${TMPDIR}" @@ -481,11 +484,11 @@ write_common_target_config_mk() { cat >> $1 << EOF # This file automatically generated by configure. Do not edit! -SRC_PATH="$source_path" -SRC_PATH_BARE=$source_path +SRC_PATH="$source_path_mk" +SRC_PATH_BARE=$source_path_mk BUILD_PFX=${BUILD_PFX} TOOLCHAIN=${toolchain} -ASM_CONVERSION=${asm_conversion_cmd:-${source_path}/build/make/ads2gas.pl} +ASM_CONVERSION=${asm_conversion_cmd:-${source_path_mk}/build/make/ads2gas.pl} GEN_VCPROJ=${gen_vcproj_cmd} MSVS_ARCH_DIR=${msvs_arch_dir} @@ -984,7 +987,7 @@ EOF fi enabled debug && add_asflags -g - asm_conversion_cmd="${source_path}/build/make/ads2gas.pl" + asm_conversion_cmd="${source_path_mk}/build/make/ads2gas.pl" case ${tgt_os} in win*) @@ -1006,7 +1009,7 @@ EOF # respective SDKs' limitations. Fortunately, these are all 32-bit ABIs # and so can be selected as 'win32'. if [ ${tgt_os} = "win32" ]; then - asm_conversion_cmd="${source_path}/build/make/ads2armasm_ms.pl" + asm_conversion_cmd="${source_path_mk}/build/make/ads2armasm_ms.pl" AS_SFX=.S msvs_arch_dir=arm-msvs disable_feature multithread @@ -1141,7 +1144,7 @@ EOF fi fi - asm_conversion_cmd="${source_path}/build/make/ads2gas_apple.pl" + asm_conversion_cmd="${source_path_mk}/build/make/ads2gas_apple.pl" ;; linux*) From d1a78971ebcfd728c9c73b0cfbee69f470d4dc72 Mon Sep 17 00:00:00 2001 From: Marco Paniconi Date: Mon, 11 May 2020 12:22:19 -0700 Subject: [PATCH 0050/1000] vp9-rtc: Add control to disable maxq on overshoot Add encoder control to disable feature to increase Q on overshoot detection, for CBR. Default (no usage of the control) means the feature is internally enabled. Add the control to the sample encoders, but keep it disabled as default (set to 0, so feature is on). Change-Id: Ia2237bc4aaea9770e5080dab20bfff9e3fd09199 --- examples/vp9_spatial_svc_encoder.c | 2 ++ examples/vpx_temporal_svc_encoder.c | 1 + vp9/encoder/vp9_ratectrl.c | 1 + vp9/encoder/vp9_ratectrl.h | 3 ++- vp9/encoder/vp9_speed_features.c | 4 ++-- vp9/vp9_cx_iface.c | 9 +++++++++ vpx/vp8cx.h | 10 ++++++++++ 7 files changed, 27 insertions(+), 3 deletions(-) diff --git a/examples/vp9_spatial_svc_encoder.c b/examples/vp9_spatial_svc_encoder.c index f273c1208a..5bb5a3183f 100644 --- a/examples/vp9_spatial_svc_encoder.c +++ b/examples/vp9_spatial_svc_encoder.c @@ -1053,6 +1053,8 @@ int main(int argc, const char **argv) { vpx_codec_control(&encoder, VP9E_SET_TUNE_CONTENT, app_input.tune_content); + vpx_codec_control(&encoder, VP9E_SET_DISABLE_OVERSHOOT_MAXQ_CBR, 0); + svc_drop_frame.framedrop_mode = FULL_SUPERFRAME_DROP; for (sl = 0; sl < (unsigned int)svc_ctx.spatial_layers; ++sl) svc_drop_frame.framedrop_thresh[sl] = enc_cfg.rc_dropframe_thresh; diff --git a/examples/vpx_temporal_svc_encoder.c b/examples/vpx_temporal_svc_encoder.c index 872751cefe..c5a3228b11 100644 --- a/examples/vpx_temporal_svc_encoder.c +++ b/examples/vpx_temporal_svc_encoder.c @@ -831,6 +831,7 @@ int main(int argc, char **argv) { } else if (strncmp(encoder->name, "vp9", 3) == 0) { vpx_svc_extra_cfg_t svc_params; memset(&svc_params, 0, sizeof(svc_params)); + vpx_codec_control(&codec, VP9E_SET_DISABLE_OVERSHOOT_MAXQ_CBR, 0); vpx_codec_control(&codec, VP8E_SET_CPUUSED, speed); vpx_codec_control(&codec, VP9E_SET_AQ_MODE, 3); vpx_codec_control(&codec, VP9E_SET_GF_CBR_BOOST_PCT, 0); diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index 27a529914d..4693309ce4 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -441,6 +441,7 @@ void vp9_rc_init(const VP9EncoderConfig *oxcf, int pass, RATE_CONTROL *rc) { rc->last_post_encode_dropped_scene_change = 0; rc->use_post_encode_drop = 0; rc->ext_use_post_encode_drop = 0; + rc->disable_overshoot_maxq_cbr = 0; rc->arf_active_best_quality_adjustment_factor = 1.0; rc->arf_increase_active_best_quality = 0; rc->preserve_arf_as_gld = 0; diff --git a/vp9/encoder/vp9_ratectrl.h b/vp9/encoder/vp9_ratectrl.h index c5ffb153c8..0120f90a01 100644 --- a/vp9/encoder/vp9_ratectrl.h +++ b/vp9/encoder/vp9_ratectrl.h @@ -195,7 +195,8 @@ typedef struct { int use_post_encode_drop; // External flag to enable post encode frame dropping, controlled by user. int ext_use_post_encode_drop; - + // Flag to disable CBR feature to increase Q on overshoot detection. + int disable_overshoot_maxq_cbr; int damped_adjustment[RATE_FACTOR_LEVELS]; double arf_active_best_quality_adjustment_factor; int arf_increase_active_best_quality; diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c index 5bbe549236..585c9604c6 100644 --- a/vp9/encoder/vp9_speed_features.c +++ b/vp9/encoder/vp9_speed_features.c @@ -621,7 +621,7 @@ static void set_rt_speed_feature_framesize_independent( // increase in encoding time. if (cpi->use_svc && svc->spatial_layer_id > 0) sf->nonrd_keyframe = 1; if (cm->frame_type != KEY_FRAME && cpi->resize_state == ORIG && - cpi->oxcf.rc_mode == VPX_CBR) { + cpi->oxcf.rc_mode == VPX_CBR && !cpi->rc.disable_overshoot_maxq_cbr) { if (cm->width * cm->height <= 352 * 288 && !cpi->use_svc && cpi->oxcf.content != VP9E_CONTENT_SCREEN) sf->overshoot_detection_cbr_rt = RE_ENCODE_MAXQ; @@ -668,7 +668,7 @@ static void set_rt_speed_feature_framesize_independent( sf->base_mv_aggressive = 1; } if (cm->frame_type != KEY_FRAME && cpi->resize_state == ORIG && - cpi->oxcf.rc_mode == VPX_CBR) + cpi->oxcf.rc_mode == VPX_CBR && !cpi->rc.disable_overshoot_maxq_cbr) sf->overshoot_detection_cbr_rt = FAST_DETECTION_MAXQ; } diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index 6caa4f7390..172a248230 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -1716,6 +1716,14 @@ static vpx_codec_err_t ctrl_set_postencode_drop(vpx_codec_alg_priv_t *ctx, return VPX_CODEC_OK; } +static vpx_codec_err_t ctrl_set_disable_overshoot_maxq_cbr( + vpx_codec_alg_priv_t *ctx, va_list args) { + VP9_COMP *const cpi = ctx->cpi; + const unsigned int data = va_arg(args, unsigned int); + cpi->rc.disable_overshoot_maxq_cbr = data; + return VPX_CODEC_OK; +} + static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = { { VP8_COPY_REFERENCE, ctrl_copy_reference }, @@ -1760,6 +1768,7 @@ static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = { { VP9E_SET_TARGET_LEVEL, ctrl_set_target_level }, { VP9E_SET_ROW_MT, ctrl_set_row_mt }, { VP9E_SET_POSTENCODE_DROP, ctrl_set_postencode_drop }, + { VP9E_SET_DISABLE_OVERSHOOT_MAXQ_CBR, ctrl_set_disable_overshoot_maxq_cbr }, { VP9E_ENABLE_MOTION_VECTOR_UNIT_TEST, ctrl_enable_motion_vector_unit_test }, { VP9E_SET_SVC_INTER_LAYER_PRED, ctrl_set_svc_inter_layer_pred }, { VP9E_SET_SVC_FRAME_DROP_LAYER, ctrl_set_svc_frame_drop_layer }, diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h index dcdd710c0b..ed3e2516af 100644 --- a/vpx/vp8cx.h +++ b/vpx/vp8cx.h @@ -684,6 +684,14 @@ enum vp8e_enc_control_id { * Supported in codecs: VP9 */ VP9E_SET_DELTA_Q_UV, + + /*!\brief Codec control function to disable increase Q on overshoot in CBR. + * + * 0: On (default), 1: Disable. + * + * Supported in codecs: VP9 + */ + VP9E_SET_DISABLE_OVERSHOOT_MAXQ_CBR, }; /*!\brief vpx 1-D scaling mode @@ -1034,6 +1042,8 @@ VPX_CTRL_USE_TYPE(VP9E_SET_POSTENCODE_DROP, unsigned int) VPX_CTRL_USE_TYPE(VP9E_SET_DELTA_Q_UV, int) #define VPX_CTRL_VP9E_SET_DELTA_Q_UV +VPX_CTRL_USE_TYPE(VP9E_SET_DISABLE_OVERSHOOT_MAXQ_CBR, int) +#define VPX_CTRL_VP9E_SET_DISABLE_OVERSHOOT_MAXQ_CBR /*!\endcond */ /*! @} - end defgroup vp8_encoder */ #ifdef __cplusplus From ea0bc1b321f1b8a51e5f5701ac48f597cb417ef3 Mon Sep 17 00:00:00 2001 From: Sarah Parker Date: Wed, 9 Sep 2020 16:10:17 -0700 Subject: [PATCH 0051/1000] Upstream GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST BUG=b/159031848 Change-Id: I013770f4e54d0ea92304fa3e9cf4d46f5723f129 --- test/avg_test.cc | 2 ++ test/dct16x16_test.cc | 1 + test/fdct8x8_test.cc | 1 + test/lpf_test.cc | 2 ++ test/quantize_test.cc | 1 + test/sum_squares_test.cc | 1 + test/variance_test.cc | 1 + test/vp9_block_error_test.cc | 1 + test/vp9_denoiser_test.cc | 1 + test/vp9_intrapred_test.cc | 2 ++ 10 files changed, 13 insertions(+) diff --git a/test/avg_test.cc b/test/avg_test.cc index 4bc1944b1c..196522ce58 100644 --- a/test/avg_test.cc +++ b/test/avg_test.cc @@ -202,6 +202,7 @@ class IntProRowTest : public AverageTestBase, int16_t *hbuf_asm_; int16_t *hbuf_c_; }; +GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(IntProRowTest); typedef int16_t (*IntProColFunc)(uint8_t const *ref, const int width); @@ -228,6 +229,7 @@ class IntProColTest : public AverageTestBase, int16_t sum_asm_; int16_t sum_c_; }; +GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(IntProColTest); #endif // HAVE_NEON || HAVE_SSE2 || HAVE_MSA typedef int (*SatdFunc)(const tran_low_t *coeffs, int length); diff --git a/test/dct16x16_test.cc b/test/dct16x16_test.cc index 321c66e4e9..c04880ec95 100644 --- a/test/dct16x16_test.cc +++ b/test/dct16x16_test.cc @@ -740,6 +740,7 @@ class InvTrans16x16DCT : public Trans16x16TestBase, IdctFunc inv_txfm_; int thresh_; }; +GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(InvTrans16x16DCT); TEST_P(InvTrans16x16DCT, CompareReference) { CompareInvReference(ref_txfm_, thresh_); diff --git a/test/fdct8x8_test.cc b/test/fdct8x8_test.cc index 770337da00..0822666e70 100644 --- a/test/fdct8x8_test.cc +++ b/test/fdct8x8_test.cc @@ -625,6 +625,7 @@ class InvTrans8x8DCT : public FwdTrans8x8TestBase, IdctFunc inv_txfm_; int thresh_; }; +GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(InvTrans8x8DCT); TEST_P(InvTrans8x8DCT, CompareReference) { CompareInvReference(ref_txfm_, thresh_); diff --git a/test/lpf_test.cc b/test/lpf_test.cc index 2c632a5a8c..62c6f30a07 100644 --- a/test/lpf_test.cc +++ b/test/lpf_test.cc @@ -145,6 +145,7 @@ class Loop8Test6Param : public ::testing::TestWithParam { loop_op_t loopfilter_op_; loop_op_t ref_loopfilter_op_; }; +GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(Loop8Test6Param); #if HAVE_NEON || HAVE_SSE2 || \ (HAVE_DSPR2 || HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH) @@ -166,6 +167,7 @@ class Loop8Test9Param : public ::testing::TestWithParam { dual_loop_op_t loopfilter_op_; dual_loop_op_t ref_loopfilter_op_; }; +GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(Loop8Test9Param); #endif // HAVE_NEON || HAVE_SSE2 || (HAVE_DSPR2 || HAVE_MSA && // (!CONFIG_VP9_HIGHBITDEPTH)) diff --git a/test/quantize_test.cc b/test/quantize_test.cc index b30b90f8f5..792b21432e 100644 --- a/test/quantize_test.cc +++ b/test/quantize_test.cc @@ -146,6 +146,7 @@ class QuantizeTest : public QuantizeTestBase, VP8Quantize asm_quant_; VP8Quantize c_quant_; }; +GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(QuantizeTest); TEST_P(QuantizeTest, TestZeroInput) { FillCoeffConstant(0); diff --git a/test/sum_squares_test.cc b/test/sum_squares_test.cc index a47f5b5b1b..df6da84037 100644 --- a/test/sum_squares_test.cc +++ b/test/sum_squares_test.cc @@ -45,6 +45,7 @@ class SumSquaresTest : public ::testing::TestWithParam { SSI16Func ref_func_; SSI16Func tst_func_; }; +GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(SumSquaresTest); TEST_P(SumSquaresTest, OperationCheck) { ACMRandom rnd(ACMRandom::DeterministicSeed()); diff --git a/test/variance_test.cc b/test/variance_test.cc index 72306d90c3..1b76b20419 100644 --- a/test/variance_test.cc +++ b/test/variance_test.cc @@ -840,6 +840,7 @@ INSTANTIATE_TEST_SUITE_P( MseParams(4, 4, &vpx_highbd_8_mse8x16_c), MseParams(4, 4, &vpx_highbd_8_mse8x8_c))); */ +GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(VpxHBDMseTest); INSTANTIATE_TEST_SUITE_P( C, VpxHBDVarianceTest, diff --git a/test/vp9_block_error_test.cc b/test/vp9_block_error_test.cc index dc1ff49c68..b93b014e65 100644 --- a/test/vp9_block_error_test.cc +++ b/test/vp9_block_error_test.cc @@ -67,6 +67,7 @@ class BlockErrorTest : public ::testing::TestWithParam { HBDBlockErrorFunc error_block_op_; HBDBlockErrorFunc ref_error_block_op_; }; +GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(BlockErrorTest); TEST_P(BlockErrorTest, OperationCheck) { ACMRandom rnd(ACMRandom::DeterministicSeed()); diff --git a/test/vp9_denoiser_test.cc b/test/vp9_denoiser_test.cc index 3d76edfaa2..d884b7eb92 100644 --- a/test/vp9_denoiser_test.cc +++ b/test/vp9_denoiser_test.cc @@ -51,6 +51,7 @@ class VP9DenoiserTest protected: BLOCK_SIZE bs_; }; +GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(VP9DenoiserTest); TEST_P(VP9DenoiserTest, BitexactCheck) { ACMRandom rnd(ACMRandom::DeterministicSeed()); diff --git a/test/vp9_intrapred_test.cc b/test/vp9_intrapred_test.cc index fdcccd5c22..ccace719ea 100644 --- a/test/vp9_intrapred_test.cc +++ b/test/vp9_intrapred_test.cc @@ -468,6 +468,8 @@ void IntraPredTest::Predict() { } typedef IntraPredTest VP9HighbdIntraPredTest; +GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(VP9HighbdIntraPredTest); + TEST_P(VP9HighbdIntraPredTest, HighbdIntraPredTests) { // max block size is 32 DECLARE_ALIGNED(16, uint16_t, left_col[2 * 32]); From 478c70f6d2c1f83d4c4e82ced533e71d9e19ef32 Mon Sep 17 00:00:00 2001 From: Sarah Parker Date: Thu, 10 Sep 2020 20:33:02 -0700 Subject: [PATCH 0052/1000] googletest: enable failure on uninstantiated tests Similar to the change in https://aomedia-review.googlesource.com/c/aom/+/115162. This currently is a warning, but the tree should be clean now in the default x86-64 configuration so we can use it to prevent regressions and find any remaining issues in other configurations. BUG=b/159031844 Change-Id: I097537ff018668492d37164fdba5edd241dc5dbe --- third_party/googletest/README.libvpx | 2 ++ third_party/googletest/src/src/gtest.cc | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/third_party/googletest/README.libvpx b/third_party/googletest/README.libvpx index 0f6202cb7b..ed55fb09f9 100644 --- a/third_party/googletest/README.libvpx +++ b/third_party/googletest/README.libvpx @@ -19,3 +19,5 @@ Local Modifications: LICENSE README.md src +- Enable kErrorOnUninstantiatedParameterizedTest and + kErrorOnUninstantiatedTypeParameterizedTest in gtest.cc diff --git a/third_party/googletest/src/src/gtest.cc b/third_party/googletest/src/src/gtest.cc index 095778e6e9..b8f6a5c31c 100644 --- a/third_party/googletest/src/src/gtest.cc +++ b/third_party/googletest/src/src/gtest.cc @@ -414,8 +414,8 @@ namespace { // inserted to report ether an error or a log message. // // This configuration bit will likely be removed at some point. -constexpr bool kErrorOnUninstantiatedParameterizedTest = false; -constexpr bool kErrorOnUninstantiatedTypeParameterizedTest = false; +constexpr bool kErrorOnUninstantiatedParameterizedTest = true; +constexpr bool kErrorOnUninstantiatedTypeParameterizedTest = true; // A test that fails at a given file/line location with a given message. class FailureTest : public Test { From 97356acb50e212fcfb7c91715718ec70953f780c Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Sun, 13 Sep 2020 22:05:47 -0400 Subject: [PATCH 0053/1000] vp8: Remove sched_yield on POSIX systems libvpx does sched_yield() on Linux. This is highly frowned upon these days mainly because it is not needed and causes high scheduler overhead. It is not needed because the kernel will preempt the task while it is spinning which will imply a yield. On ChromeOS, not yielding has the following improvements: 1. power_VideoCall test as seen on perf profile: With yield: 9.40% [kernel] [k] __pi___clean_dcache_area_poc 7.32% [kernel] [k] _raw_spin_unlock_irq <-- kernel scheduler Without yield: 8.76% [kernel] [k] __pi___clean_dcache_area_poc 2.27% [kernel] [k] _raw_spin_unlock_irq <-- kernel scheduler As you can see, there is a 5% drop in the scheduler's CPU utilization. 2. power_VideoCall test results: There is a 3% improvement on max video FPS, from 30 to 31. This improvement is consistent. Also note that the sched_yield() manpage itself says it is intended only for RT tasks. From manpagE: "sched_yield() is intended for use with real-time scheduling policies (i.e., SCHED_FIFO or SCHED_RR) and very likely means your application design is broken." BUG=b/168205004 Change-Id: Idb84ab19e94f6d0c7f9e544e7a407c946d5ced5c Signed-off-by: Joel Fernandes --- vp8/common/threading.h | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/vp8/common/threading.h b/vp8/common/threading.h index f921369386..1cfb9fec51 100644 --- a/vp8/common/threading.h +++ b/vp8/common/threading.h @@ -171,17 +171,20 @@ static inline int sem_destroy(sem_t *sem) { #define sem_wait(sem) (semaphore_wait(*sem)) #define sem_post(sem) semaphore_signal(*sem) #define sem_destroy(sem) semaphore_destroy(mach_task_self(), *sem) -#define thread_sleep(nms) -/* { struct timespec ts;ts.tv_sec=0; ts.tv_nsec = - 1000*nms;nanosleep(&ts, NULL);} */ #else #include #include -#define thread_sleep(nms) sched_yield(); +#endif /* __APPLE__ */ +/* Not Windows. Assume pthreads */ + +/* thread_sleep implementation: yield unless Linux/Unix. */ +#if defined(__unix__) || defined(__APPLE__) +#define thread_sleep(nms) /* {struct timespec ts;ts.tv_sec=0; ts.tv_nsec = 1000*nms;nanosleep(&ts, NULL);} */ -#endif -/* Not Windows. Assume pthreads */ +#else +#define thread_sleep(nms) sched_yield(); +#endif /* __unix__ || __APPLE__ */ #endif From 58d02a85016311406a3b2bd648b1e75d2f844b0e Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 16 Sep 2020 13:40:01 -0700 Subject: [PATCH 0054/1000] test/encode_test_driver: rm redundant get() w/unique_ptr Change-Id: I3c1ece92ba9f43df4cbaf47109e35aaf0a807d97 --- test/encode_test_driver.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/encode_test_driver.cc b/test/encode_test_driver.cc index 6914804ec0..1ce39eaeff 100644 --- a/test/encode_test_driver.cc +++ b/test/encode_test_driver.cc @@ -216,7 +216,7 @@ void EncoderTest::RunLoop(VideoSource *video) { switch (pkt->kind) { case VPX_CODEC_CX_FRAME_PKT: has_cxdata = true; - if (decoder.get() != nullptr && DoDecode()) { + if (decoder != nullptr && DoDecode()) { PreDecodeFrameHook(video, decoder.get()); vpx_codec_err_t res_dec = decoder->DecodeFrame( (const uint8_t *)pkt->data.frame.buf, pkt->data.frame.sz); From c211b82d08231a959aee8354e76db179e7dd2139 Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 16 Sep 2020 13:43:05 -0700 Subject: [PATCH 0055/1000] vp9_ratectrl,vp9_resize_one_pass_cbr: rm redundant casts avg_frame_bandwidth is an int, quiets a clang-tidy warning Change-Id: I2a2822652ca6a06e9d1d6d4318f544d419d437e8 --- vp9/encoder/vp9_ratectrl.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index 4693309ce4..4b87ff2f0c 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -2705,18 +2705,18 @@ int vp9_resize_one_pass_cbr(VP9_COMP *cpi) { // Force downsize based on per-frame-bandwidth, for extreme case, // for HD input. if (cpi->resize_state == ORIG && cm->width * cm->height >= 1280 * 720) { - if (rc->avg_frame_bandwidth < (int)(300000 / 30)) { + if (rc->avg_frame_bandwidth < 300000 / 30) { resize_action = DOWN_ONEHALF; cpi->resize_state = ONE_HALF; force_downsize_rate = 1; - } else if (rc->avg_frame_bandwidth < (int)(400000 / 30)) { + } else if (rc->avg_frame_bandwidth < 400000 / 30) { resize_action = ONEHALFONLY_RESIZE ? DOWN_ONEHALF : DOWN_THREEFOUR; cpi->resize_state = ONEHALFONLY_RESIZE ? ONE_HALF : THREE_QUARTER; force_downsize_rate = 1; } } else if (cpi->resize_state == THREE_QUARTER && cm->width * cm->height >= 960 * 540) { - if (rc->avg_frame_bandwidth < (int)(300000 / 30)) { + if (rc->avg_frame_bandwidth < 300000 / 30) { resize_action = DOWN_ONEHALF; cpi->resize_state = ONE_HALF; force_downsize_rate = 1; From 979e27c9700448bded96fb1d4ba31441a8da82db Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 25 Sep 2020 13:21:11 -0700 Subject: [PATCH 0056/1000] configure: add darwin20 support this release will have arm64 and x86_64 support. in the future it might be useful to move to mac/iphone targets to help disambiguate arm64-darwin-gcc and arm64-darwin20-gcc. Change-Id: I1f8b145303204af316955822f5e8bab51c47f353 --- README | 2 ++ build/make/configure.sh | 14 +++++++++++--- configure | 2 ++ 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/README b/README index fd4c93a465..62fef34593 100644 --- a/README +++ b/README @@ -62,6 +62,7 @@ COMPILING THE APPLICATIONS/LIBRARIES: arm64-android-gcc arm64-darwin-gcc + arm64-darwin20-gcc arm64-linux-gcc arm64-win64-gcc arm64-win64-vs15 @@ -113,6 +114,7 @@ COMPILING THE APPLICATIONS/LIBRARIES: x86_64-darwin17-gcc x86_64-darwin18-gcc x86_64-darwin19-gcc + x86_64-darwin20-gcc x86_64-iphonesimulator-gcc x86_64-linux-gcc x86_64-linux-icc diff --git a/build/make/configure.sh b/build/make/configure.sh index 8bdfef39db..3c09aa37d3 100644 --- a/build/make/configure.sh +++ b/build/make/configure.sh @@ -774,6 +774,10 @@ process_common_toolchain() { tgt_isa=x86_64 tgt_os=`echo $gcctarget | sed 's/.*\(darwin1[0-9]\).*/\1/'` ;; + *darwin20*) + tgt_isa=`uname -m` + tgt_os=`echo $gcctarget | sed 's/.*\(darwin2[0-9]\).*/\1/'` + ;; x86_64*mingw32*) tgt_os=win64 ;; @@ -848,7 +852,7 @@ process_common_toolchain() { # Handle darwin variants. Newer SDKs allow targeting older # platforms, so use the newest one available. case ${toolchain} in - arm*-darwin*) + arm*-darwin-) add_cflags "-miphoneos-version-min=${IOS_VERSION_MIN}" iphoneos_sdk_dir="$(show_darwin_sdk_path iphoneos)" if [ -d "${iphoneos_sdk_dir}" ]; then @@ -856,7 +860,7 @@ process_common_toolchain() { add_ldflags "-isysroot ${iphoneos_sdk_dir}" fi ;; - x86*-darwin*) + *-darwin*) osx_sdk_dir="$(show_darwin_sdk_path macosx)" if [ -d "${osx_sdk_dir}" ]; then add_cflags "-isysroot ${osx_sdk_dir}" @@ -914,6 +918,10 @@ process_common_toolchain() { add_cflags "-mmacosx-version-min=10.15" add_ldflags "-mmacosx-version-min=10.15" ;; + *-darwin20-*) + add_cflags "-mmacosx-version-min=10.16" + add_ldflags "-mmacosx-version-min=10.16" + ;; *-iphonesimulator-*) add_cflags "-miphoneos-version-min=${IOS_VERSION_MIN}" add_ldflags "-miphoneos-version-min=${IOS_VERSION_MIN}" @@ -1087,7 +1095,7 @@ EOF soft_enable unit_tests ;; - darwin*) + darwin) if ! enabled external_build; then XCRUN_FIND="xcrun --sdk iphoneos --find" CXX="$(${XCRUN_FIND} clang++)" diff --git a/configure b/configure index 32272ce36f..f7e11aaf2d 100755 --- a/configure +++ b/configure @@ -99,6 +99,7 @@ EOF # alphabetically by architecture, generic-gnu last. all_platforms="${all_platforms} arm64-android-gcc" all_platforms="${all_platforms} arm64-darwin-gcc" +all_platforms="${all_platforms} arm64-darwin20-gcc" all_platforms="${all_platforms} arm64-linux-gcc" all_platforms="${all_platforms} arm64-win64-gcc" all_platforms="${all_platforms} arm64-win64-vs15" @@ -150,6 +151,7 @@ all_platforms="${all_platforms} x86_64-darwin16-gcc" all_platforms="${all_platforms} x86_64-darwin17-gcc" all_platforms="${all_platforms} x86_64-darwin18-gcc" all_platforms="${all_platforms} x86_64-darwin19-gcc" +all_platforms="${all_platforms} x86_64-darwin20-gcc" all_platforms="${all_platforms} x86_64-iphonesimulator-gcc" all_platforms="${all_platforms} x86_64-linux-gcc" all_platforms="${all_platforms} x86_64-linux-icc" From 956d3cac34711ce90e25593b65fcd3f9d0baa763 Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 29 Sep 2020 10:38:41 -0700 Subject: [PATCH 0057/1000] configure.sh: fix arm64-darwin-gcc match after: 979e27c97 configure: add darwin20 support make the condition more specific by including the trailing -gcc (-*) Change-Id: I78f481b6c5ad9137e6b6973198e8671e806ee82c --- build/make/configure.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build/make/configure.sh b/build/make/configure.sh index 3c09aa37d3..91a64b5041 100644 --- a/build/make/configure.sh +++ b/build/make/configure.sh @@ -852,7 +852,7 @@ process_common_toolchain() { # Handle darwin variants. Newer SDKs allow targeting older # platforms, so use the newest one available. case ${toolchain} in - arm*-darwin-) + arm*-darwin-*) add_cflags "-miphoneos-version-min=${IOS_VERSION_MIN}" iphoneos_sdk_dir="$(show_darwin_sdk_path iphoneos)" if [ -d "${iphoneos_sdk_dir}" ]; then From 3c31fb80531baf040cee8da3d85c1b6286bda9bf Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 29 Sep 2020 10:58:56 -0700 Subject: [PATCH 0058/1000] ratectrl_rtc_test.cc: fix signed/unsigned comparison Change-Id: Id522c12faf4c959f60b5df1b0f7312f14a71720d --- test/ratectrl_rtc_test.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/ratectrl_rtc_test.cc b/test/ratectrl_rtc_test.cc index e044885e7c..aefb4d975c 100644 --- a/test/ratectrl_rtc_test.cc +++ b/test/ratectrl_rtc_test.cc @@ -88,7 +88,7 @@ class RcInterfaceTest : public ::testing::Test { std::ifstream one_layer_file; one_layer_file.open(libvpx_test::GetDataPath() + "/rc_interface_test_one_layer"); - ASSERT_EQ(one_layer_file.rdstate() & std::ifstream::failbit, 0); + ASSERT_TRUE(one_layer_file.good()); for (size_t i = 0; i < kNumFrame; i++) { one_layer_file >> frame_info; if (frame_info.frame_id > 0) frame_params.frame_type = INTER_FRAME; @@ -117,7 +117,7 @@ class RcInterfaceTest : public ::testing::Test { std::ifstream svc_file; svc_file.open(std::string(std::getenv("LIBVPX_TEST_DATA_PATH")) + "/rc_interface_test_svc"); - ASSERT_EQ(svc_file.rdstate() & std::ifstream::failbit, 0); + ASSERT_TRUE(svc_file.good()); for (size_t i = 0; i < kNumFrame * rc_cfg_.ss_number_layers; i++) { svc_file >> frame_info; if (frame_info.frame_id > 0) frame_params.frame_type = INTER_FRAME; From 73681bf6a421d1ebf2efb949d6e3e855221d09cf Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Wed, 30 Sep 2020 11:01:30 -0700 Subject: [PATCH 0059/1000] Add file for rate control interface test. Change-Id: Id09dc5b653c1e5bb2b02f63579ac776f887ce0eb --- test/ratectrl_rtc_test.cc | 4 ++-- test/test-data.mk | 2 ++ test/test-data.sha1 | 2 ++ 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/test/ratectrl_rtc_test.cc b/test/ratectrl_rtc_test.cc index e044885e7c..98b35544a5 100644 --- a/test/ratectrl_rtc_test.cc +++ b/test/ratectrl_rtc_test.cc @@ -79,7 +79,7 @@ class RcInterfaceTest : public ::testing::Test { protected: void RunOneLayer() { SetConfigOneLayer(); - rc_api_->Create(rc_cfg_); + rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_); FrameInfo frame_info; libvpx::VP9FrameParamsQpRTC frame_params; frame_params.frame_type = KEY_FRAME; @@ -110,7 +110,7 @@ class RcInterfaceTest : public ::testing::Test { void RunSVC() { SetConfigSVC(); - rc_api_->Create(rc_cfg_); + rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_); FrameInfo frame_info; libvpx::VP9FrameParamsQpRTC frame_params; frame_params.frame_type = KEY_FRAME; diff --git a/test/test-data.mk b/test/test-data.mk index ca2e11442e..436dfa4e49 100644 --- a/test/test-data.mk +++ b/test/test-data.mk @@ -27,6 +27,8 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += noisy_clip_640_360.y4m LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += rush_hour_444.y4m LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += screendata.y4m LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += niklas_640_480_30.yuv +LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += rc_interface_test_one_layer +LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += rc_interface_test_svc LIBVPX_TEST_DATA-$(CONFIG_RATE_CTRL) += bus_352x288_420_f20_b8.yuv # Test vectors diff --git a/test/test-data.sha1 b/test/test-data.sha1 index 668992fba2..6f9021554d 100644 --- a/test/test-data.sha1 +++ b/test/test-data.sha1 @@ -869,3 +869,5 @@ bac455906360b45338a16dd626ac5f19bc36a307 *desktop_office1.1280_720-020.yuv 518a0be998afece76d3df76047d51e256c591ff2 *invalid-bug-148271109.ivf d3964f9dad9f60363c81b688324d95b4ec7c8038 *invalid-bug-148271109.ivf.res ad18ca16f0a249fb3b7c38de0d9b327fed273f96 *hantro_collage_w352h288_nv12.yuv +03f827c0e36ff9a6e23c5cc11936924e4f1827ab *rc_interface_test_one_layer +99e4f4c2961d46dc286db230090a39d78460b25d *rc_interface_test_svc From 7e8ea22e4056a3da04b139fcc812a3f6937bbed7 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Fri, 2 Oct 2020 10:47:47 -0700 Subject: [PATCH 0060/1000] Add codec control to disable loopfilter for vp9 Change-Id: I6d693e84570c353d20ec314acea43363956c0590 --- examples/vp9_spatial_svc_encoder.c | 1 + examples/vpx_temporal_svc_encoder.c | 1 + vp9/encoder/vp9_encoder.c | 6 ++++++ vp9/encoder/vp9_encoder.h | 8 ++++++++ vp9/vp9_cx_iface.c | 9 +++++++++ vpx/vp8cx.h | 12 ++++++++++++ 6 files changed, 37 insertions(+) diff --git a/examples/vp9_spatial_svc_encoder.c b/examples/vp9_spatial_svc_encoder.c index 5bb5a3183f..6305572979 100644 --- a/examples/vp9_spatial_svc_encoder.c +++ b/examples/vp9_spatial_svc_encoder.c @@ -1054,6 +1054,7 @@ int main(int argc, const char **argv) { vpx_codec_control(&encoder, VP9E_SET_TUNE_CONTENT, app_input.tune_content); vpx_codec_control(&encoder, VP9E_SET_DISABLE_OVERSHOOT_MAXQ_CBR, 0); + vpx_codec_control(&encoder, VP9E_SET_DISABLE_LOOPFILTER, 0); svc_drop_frame.framedrop_mode = FULL_SUPERFRAME_DROP; for (sl = 0; sl < (unsigned int)svc_ctx.spatial_layers; ++sl) diff --git a/examples/vpx_temporal_svc_encoder.c b/examples/vpx_temporal_svc_encoder.c index c5a3228b11..ffeae2abc4 100644 --- a/examples/vpx_temporal_svc_encoder.c +++ b/examples/vpx_temporal_svc_encoder.c @@ -841,6 +841,7 @@ int main(int argc, char **argv) { vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 1); vpx_codec_control(&codec, VP9E_SET_TUNE_CONTENT, 0); vpx_codec_control(&codec, VP9E_SET_TILE_COLUMNS, get_msb(cfg.g_threads)); + vpx_codec_control(&codec, VP9E_SET_DISABLE_LOOPFILTER, 0); #if ROI_MAP set_roi_map(encoder->name, &cfg, &roi); if (vpx_codec_control(&codec, VP9E_SET_ROI_MAP, &roi)) diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 0485590615..e513cef84c 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -3316,6 +3316,12 @@ static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) { return; } + if (cpi->loopfilter_ctrl == NO_LOOPFILTER || + (!is_reference_frame && cpi->loopfilter_ctrl == LOOPFILTER_REFERENCE)) { + lf->filter_level = 0; + return; + } + if (xd->lossless) { lf->filter_level = 0; lf->last_filt_level = 0; diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index b201e25c16..263b10cdbb 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -147,6 +147,12 @@ typedef enum { kVeryHighSad = 6, } CONTENT_STATE_SB; +typedef enum { + LOOPFILTER_ALL = 0, + LOOPFILTER_REFERENCE = 1, // Disable loopfilter on non reference frames. + NO_LOOPFILTER = 2, // Disable loopfilter on all frames. +} LOOPFILTER_CONTROL; + typedef struct VP9EncoderConfig { BITSTREAM_PROFILE profile; vpx_bit_depth_t bit_depth; // Codec bit-depth. @@ -958,6 +964,8 @@ typedef struct VP9_COMP { int multi_layer_arf; vpx_roi_map_t roi; + + LOOPFILTER_CONTROL loopfilter_ctrl; #if CONFIG_RATE_CTRL ENCODE_COMMAND encode_command; PARTITION_INFO *partition_info; diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index 172a248230..9586937927 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -1724,6 +1724,14 @@ static vpx_codec_err_t ctrl_set_disable_overshoot_maxq_cbr( return VPX_CODEC_OK; } +static vpx_codec_err_t ctrl_set_disable_loopfilter(vpx_codec_alg_priv_t *ctx, + va_list args) { + VP9_COMP *const cpi = ctx->cpi; + const unsigned int data = va_arg(args, unsigned int); + cpi->loopfilter_ctrl = data; + return VPX_CODEC_OK; +} + static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = { { VP8_COPY_REFERENCE, ctrl_copy_reference }, @@ -1775,6 +1783,7 @@ static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = { { VP9E_SET_SVC_GF_TEMPORAL_REF, ctrl_set_svc_gf_temporal_ref }, { VP9E_SET_SVC_SPATIAL_LAYER_SYNC, ctrl_set_svc_spatial_layer_sync }, { VP9E_SET_DELTA_Q_UV, ctrl_set_delta_q_uv }, + { VP9E_SET_DISABLE_LOOPFILTER, ctrl_set_disable_loopfilter }, // Getters { VP8E_GET_LAST_QUANTIZER, ctrl_get_quantizer }, diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h index ed3e2516af..59753cdfde 100644 --- a/vpx/vp8cx.h +++ b/vpx/vp8cx.h @@ -692,6 +692,15 @@ enum vp8e_enc_control_id { * Supported in codecs: VP9 */ VP9E_SET_DISABLE_OVERSHOOT_MAXQ_CBR, + + /*!\brief Codec control function to disable loopfilter. + * + * 0: Loopfilter on all frames, 1: Disable on non reference frames. + * 2: Disable on all frames. + * + * Supported in codecs: VP9 + */ + VP9E_SET_DISABLE_LOOPFILTER, }; /*!\brief vpx 1-D scaling mode @@ -1044,6 +1053,9 @@ VPX_CTRL_USE_TYPE(VP9E_SET_DELTA_Q_UV, int) VPX_CTRL_USE_TYPE(VP9E_SET_DISABLE_OVERSHOOT_MAXQ_CBR, int) #define VPX_CTRL_VP9E_SET_DISABLE_OVERSHOOT_MAXQ_CBR + +VPX_CTRL_USE_TYPE(VP9E_SET_DISABLE_LOOPFILTER, int) +#define VPX_CTRL_VP9E_SET_DISABLE_LOOPFILTER /*!\endcond */ /*! @} - end defgroup vp8_encoder */ #ifdef __cplusplus From da7c503fe585de1e1f613f289a24b31567fd7113 Mon Sep 17 00:00:00 2001 From: angiebird Date: Thu, 1 Oct 2020 19:18:09 -0700 Subject: [PATCH 0061/1000] Add SetEncodeConfig and DumpEncodeConfigs Change-Id: Ie6864b1133c26021d9c4883df033ecd2969585ed --- vp9/simple_encode.cc | 110 ++++++++++++++++++++++++++++++++---- vp9/simple_encode.h | 31 ++++++++++ vp9/vp9_cx_iface.c | 132 +++++++++++++++++++++---------------------- vp9/vp9_cx_iface.h | 2 +- 4 files changed, 196 insertions(+), 79 deletions(-) diff --git a/vp9/simple_encode.cc b/vp9/simple_encode.cc index 46b25d1fdf..568df97aaa 100644 --- a/vp9/simple_encode.cc +++ b/vp9/simple_encode.cc @@ -90,12 +90,20 @@ static int img_read(vpx_image_t *img, FILE *file) { return 1; } +// Assume every config in VP9EncoderConfig is less than 100 characters. +#define ENCODE_CONFIG_BUF_SIZE 100 +struct EncodeConfig { + char name[ENCODE_CONFIG_BUF_SIZE]; + char value[ENCODE_CONFIG_BUF_SIZE]; +}; + class SimpleEncode::EncodeImpl { public: VP9_COMP *cpi; vpx_img_fmt_t img_fmt; vpx_image_t tmp_img; std::vector first_pass_stats; + std::vector encode_config_list; }; static VP9_COMP *init_encoder(const VP9EncoderConfig *oxcf, @@ -711,6 +719,50 @@ static void UpdateGroupOfPicture(const VP9_COMP *cpi, int start_coding_index, start_ref_frame_info, group_of_picture); } +#define SET_STRUCT_VALUE(config, structure, ret, field) \ + if (strcmp(config.name, #field) == 0) { \ + structure->field = atoi(config.value); \ + ret = 1; \ + } + +static void UpdateEncodeConfig(const EncodeConfig &config, + VP9EncoderConfig *oxcf) { + int ret = 0; + SET_STRUCT_VALUE(config, oxcf, ret, key_freq); + SET_STRUCT_VALUE(config, oxcf, ret, two_pass_vbrmin_section); + SET_STRUCT_VALUE(config, oxcf, ret, two_pass_vbrmax_section); + SET_STRUCT_VALUE(config, oxcf, ret, under_shoot_pct); + SET_STRUCT_VALUE(config, oxcf, ret, over_shoot_pct); + SET_STRUCT_VALUE(config, oxcf, ret, max_threads); + SET_STRUCT_VALUE(config, oxcf, ret, frame_parallel_decoding_mode); + SET_STRUCT_VALUE(config, oxcf, ret, tile_columns); + SET_STRUCT_VALUE(config, oxcf, ret, arnr_max_frames); + SET_STRUCT_VALUE(config, oxcf, ret, arnr_strength); + SET_STRUCT_VALUE(config, oxcf, ret, lag_in_frames); + SET_STRUCT_VALUE(config, oxcf, ret, encode_breakout); + SET_STRUCT_VALUE(config, oxcf, ret, enable_tpl_model); + SET_STRUCT_VALUE(config, oxcf, ret, enable_auto_arf); + if (ret == 0) { + fprintf(stderr, "Ignored unsupported encode_config %s\n", config.name); + } +} + +static VP9EncoderConfig GetEncodeConfig( + int frame_width, int frame_height, vpx_rational_t frame_rate, + int target_bitrate, int encode_speed, vpx_enc_pass enc_pass, + const std::vector &encode_config_list) { + VP9EncoderConfig oxcf = + vp9_get_encoder_config(frame_width, frame_height, frame_rate, + target_bitrate, encode_speed, enc_pass); + for (const auto &config : encode_config_list) { + UpdateEncodeConfig(config, &oxcf); + } + if (enc_pass == VPX_RC_FIRST_PASS) { + oxcf.lag_in_frames = 0; + } + return oxcf; +} + SimpleEncode::SimpleEncode(int frame_width, int frame_height, int frame_rate_num, int frame_rate_den, int target_bitrate, int num_frames, @@ -748,12 +800,45 @@ void SimpleEncode::SetEncodeSpeed(int encode_speed) { encode_speed_ = encode_speed; } +StatusCode SimpleEncode::SetEncodeConfig(const char *name, const char *value) { + if (name == nullptr || value == nullptr) { + fprintf(stderr, "SetEncodeConfig: null pointer, name %p value %p\n", name, + value); + return StatusError; + } + EncodeConfig config; + snprintf(config.name, ENCODE_CONFIG_BUF_SIZE, "%s", name); + snprintf(config.value, ENCODE_CONFIG_BUF_SIZE, "%s", value); + impl_ptr_->encode_config_list.push_back(config); + return StatusOk; +} + +StatusCode SimpleEncode::DumpEncodeConfigs(int pass, FILE *fp) { + if (fp == nullptr) { + fprintf(stderr, "DumpEncodeConfigs: null pointer, fp %p\n", fp); + return StatusError; + } + vpx_enc_pass enc_pass; + if (pass == 1) { + enc_pass = VPX_RC_FIRST_PASS; + } else { + enc_pass = VPX_RC_LAST_PASS; + } + const vpx_rational_t frame_rate = + make_vpx_rational(frame_rate_num_, frame_rate_den_); + const VP9EncoderConfig oxcf = + GetEncodeConfig(frame_width_, frame_height_, frame_rate, target_bitrate_, + encode_speed_, enc_pass, impl_ptr_->encode_config_list); + vp9_dump_encoder_config(&oxcf, fp); + return StatusOk; +} + void SimpleEncode::ComputeFirstPassStats() { vpx_rational_t frame_rate = make_vpx_rational(frame_rate_num_, frame_rate_den_); - const VP9EncoderConfig oxcf = - vp9_get_encoder_config(frame_width_, frame_height_, frame_rate, - target_bitrate_, encode_speed_, VPX_RC_FIRST_PASS); + const VP9EncoderConfig oxcf = GetEncodeConfig( + frame_width_, frame_height_, frame_rate, target_bitrate_, encode_speed_, + VPX_RC_FIRST_PASS, impl_ptr_->encode_config_list); VP9_COMP *cpi = init_encoder(&oxcf, impl_ptr_->img_fmt); struct lookahead_ctx *lookahead = cpi->lookahead; int i; @@ -913,9 +998,10 @@ void SimpleEncode::StartEncode() { assert(impl_ptr_->first_pass_stats.size() > 0); vpx_rational_t frame_rate = make_vpx_rational(frame_rate_num_, frame_rate_den_); - VP9EncoderConfig oxcf = - vp9_get_encoder_config(frame_width_, frame_height_, frame_rate, - target_bitrate_, encode_speed_, VPX_RC_LAST_PASS); + VP9EncoderConfig oxcf = GetEncodeConfig( + frame_width_, frame_height_, frame_rate, target_bitrate_, encode_speed_, + VPX_RC_LAST_PASS, impl_ptr_->encode_config_list); + vpx_fixed_buf_t stats; stats.buf = GetVectorData(impl_ptr_->first_pass_stats); stats.sz = sizeof(impl_ptr_->first_pass_stats[0]) * @@ -1132,9 +1218,9 @@ int SimpleEncode::GetCodingFrameNum() const { const int allow_alt_ref = 1; vpx_rational_t frame_rate = make_vpx_rational(frame_rate_num_, frame_rate_den_); - const VP9EncoderConfig oxcf = - vp9_get_encoder_config(frame_width_, frame_height_, frame_rate, - target_bitrate_, encode_speed_, VPX_RC_LAST_PASS); + const VP9EncoderConfig oxcf = GetEncodeConfig( + frame_width_, frame_height_, frame_rate, target_bitrate_, encode_speed_, + VPX_RC_LAST_PASS, impl_ptr_->encode_config_list); FRAME_INFO frame_info = vp9_get_frame_info(&oxcf); FIRST_PASS_INFO first_pass_info; fps_init_first_pass_info(&first_pass_info, @@ -1149,9 +1235,9 @@ std::vector SimpleEncode::ComputeKeyFrameMap() const { assert(impl_ptr_->first_pass_stats.size() == num_frames_ + 1); vpx_rational_t frame_rate = make_vpx_rational(frame_rate_num_, frame_rate_den_); - const VP9EncoderConfig oxcf = - vp9_get_encoder_config(frame_width_, frame_height_, frame_rate, - target_bitrate_, encode_speed_, VPX_RC_LAST_PASS); + const VP9EncoderConfig oxcf = GetEncodeConfig( + frame_width_, frame_height_, frame_rate, target_bitrate_, encode_speed_, + VPX_RC_LAST_PASS, impl_ptr_->encode_config_list); FRAME_INFO frame_info = vp9_get_frame_info(&oxcf); FIRST_PASS_INFO first_pass_info; fps_init_first_pass_info(&first_pass_info, diff --git a/vp9/simple_encode.h b/vp9/simple_encode.h index dc055e9754..ccbde9934b 100644 --- a/vp9/simple_encode.h +++ b/vp9/simple_encode.h @@ -19,6 +19,11 @@ namespace vp9 { +enum StatusCode { + StatusOk = 0, + StatusError, +}; + // TODO(angiebird): Add description for each frame type. enum FrameType { kFrameTypeKey = 0, @@ -322,6 +327,32 @@ class SimpleEncode { // at the cost of lower compression efficiency. void SetEncodeSpeed(int encode_speed); + // Set encoder config + // The following configs in VP9EncoderConfig are allowed to change in this + // function. See https://ffmpeg.org/ffmpeg-codecs.html#libvpx for each + // config's meaning. + // Configs in VP9EncoderConfig: Equivalent configs in ffmpeg: + // 1 key_freq -g + // 2 two_pass_vbrmin_section -minrate * 100LL / bit_rate + // 3 two_pass_vbrmax_section -maxrate * 100LL / bit_rate + // 4 under_shoot_pct -undershoot-pct + // 5 over_shoot_pct -overshoot-pct + // 6 max_threads -threads + // 7 frame_parallel_decoding_mode -frame-parallel + // 8 tile_column -tile-columns + // 9 arnr_max_frames -arnr-maxframes + // 10 arnr_strength -arnr-strength + // 11 lag_in_frames -rc_lookahead + // 12 encode_breakout -static-thresh + // 13 enable_tpl_model -enable-tpl + // 14 enable_auto_arf -auto-alt-ref + StatusCode SetEncodeConfig(const char *name, const char *value); + + // A debug function that dumps configs from VP9EncoderConfig + // pass = 1: first pass, pass = 2: second pass + // fp: file pointer for dumping config + StatusCode DumpEncodeConfigs(int pass, FILE *fp); + // Makes encoder compute the first pass stats and store it at // impl_ptr_->first_pass_stats. key_frame_map_ is also computed based on the // first pass stats. diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index 9586937927..7b828a5a35 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -633,7 +633,7 @@ static vpx_codec_err_t set_encoder_config( } if (get_level_index(oxcf->target_level) >= 0) config_target_level(oxcf); - // vp9_dump_encoder_config(oxcf); + // vp9_dump_encoder_config(oxcf, stderr); return VPX_CODEC_OK; } @@ -1970,46 +1970,46 @@ VP9EncoderConfig vp9_get_encoder_config(int frame_width, int frame_height, return oxcf; } -#define DUMP_STRUCT_VALUE(struct, value) \ - printf(#value " %" PRId64 "\n", (int64_t)(struct)->value) +#define DUMP_STRUCT_VALUE(fp, structure, value) \ + fprintf(fp, #value " %" PRId64 "\n", (int64_t)(structure)->value) -void vp9_dump_encoder_config(const VP9EncoderConfig *oxcf) { - DUMP_STRUCT_VALUE(oxcf, profile); - DUMP_STRUCT_VALUE(oxcf, bit_depth); - DUMP_STRUCT_VALUE(oxcf, width); - DUMP_STRUCT_VALUE(oxcf, height); - DUMP_STRUCT_VALUE(oxcf, input_bit_depth); - DUMP_STRUCT_VALUE(oxcf, init_framerate); +void vp9_dump_encoder_config(const VP9EncoderConfig *oxcf, FILE *fp) { + DUMP_STRUCT_VALUE(fp, oxcf, profile); + DUMP_STRUCT_VALUE(fp, oxcf, bit_depth); + DUMP_STRUCT_VALUE(fp, oxcf, width); + DUMP_STRUCT_VALUE(fp, oxcf, height); + DUMP_STRUCT_VALUE(fp, oxcf, input_bit_depth); + DUMP_STRUCT_VALUE(fp, oxcf, init_framerate); // TODO(angiebird): dump g_timebase // TODO(angiebird): dump g_timebase_in_ts - DUMP_STRUCT_VALUE(oxcf, target_bandwidth); + DUMP_STRUCT_VALUE(fp, oxcf, target_bandwidth); - DUMP_STRUCT_VALUE(oxcf, noise_sensitivity); - DUMP_STRUCT_VALUE(oxcf, sharpness); - DUMP_STRUCT_VALUE(oxcf, speed); - DUMP_STRUCT_VALUE(oxcf, rc_max_intra_bitrate_pct); - DUMP_STRUCT_VALUE(oxcf, rc_max_inter_bitrate_pct); - DUMP_STRUCT_VALUE(oxcf, gf_cbr_boost_pct); + DUMP_STRUCT_VALUE(fp, oxcf, noise_sensitivity); + DUMP_STRUCT_VALUE(fp, oxcf, sharpness); + DUMP_STRUCT_VALUE(fp, oxcf, speed); + DUMP_STRUCT_VALUE(fp, oxcf, rc_max_intra_bitrate_pct); + DUMP_STRUCT_VALUE(fp, oxcf, rc_max_inter_bitrate_pct); + DUMP_STRUCT_VALUE(fp, oxcf, gf_cbr_boost_pct); - DUMP_STRUCT_VALUE(oxcf, mode); - DUMP_STRUCT_VALUE(oxcf, pass); + DUMP_STRUCT_VALUE(fp, oxcf, mode); + DUMP_STRUCT_VALUE(fp, oxcf, pass); // Key Framing Operations - DUMP_STRUCT_VALUE(oxcf, auto_key); - DUMP_STRUCT_VALUE(oxcf, key_freq); + DUMP_STRUCT_VALUE(fp, oxcf, auto_key); + DUMP_STRUCT_VALUE(fp, oxcf, key_freq); - DUMP_STRUCT_VALUE(oxcf, lag_in_frames); + DUMP_STRUCT_VALUE(fp, oxcf, lag_in_frames); // ---------------------------------------------------------------- // DATARATE CONTROL OPTIONS // vbr, cbr, constrained quality or constant quality - DUMP_STRUCT_VALUE(oxcf, rc_mode); + DUMP_STRUCT_VALUE(fp, oxcf, rc_mode); // buffer targeting aggressiveness - DUMP_STRUCT_VALUE(oxcf, under_shoot_pct); - DUMP_STRUCT_VALUE(oxcf, over_shoot_pct); + DUMP_STRUCT_VALUE(fp, oxcf, under_shoot_pct); + DUMP_STRUCT_VALUE(fp, oxcf, over_shoot_pct); // buffering parameters // TODO(angiebird): dump tarting_buffer_level_ms @@ -2017,37 +2017,37 @@ void vp9_dump_encoder_config(const VP9EncoderConfig *oxcf) { // TODO(angiebird): dump maximum_buffer_size_ms // Frame drop threshold. - DUMP_STRUCT_VALUE(oxcf, drop_frames_water_mark); + DUMP_STRUCT_VALUE(fp, oxcf, drop_frames_water_mark); // controlling quality - DUMP_STRUCT_VALUE(oxcf, fixed_q); - DUMP_STRUCT_VALUE(oxcf, worst_allowed_q); - DUMP_STRUCT_VALUE(oxcf, best_allowed_q); - DUMP_STRUCT_VALUE(oxcf, cq_level); - DUMP_STRUCT_VALUE(oxcf, aq_mode); + DUMP_STRUCT_VALUE(fp, oxcf, fixed_q); + DUMP_STRUCT_VALUE(fp, oxcf, worst_allowed_q); + DUMP_STRUCT_VALUE(fp, oxcf, best_allowed_q); + DUMP_STRUCT_VALUE(fp, oxcf, cq_level); + DUMP_STRUCT_VALUE(fp, oxcf, aq_mode); // Special handling of Adaptive Quantization for AltRef frames - DUMP_STRUCT_VALUE(oxcf, alt_ref_aq); + DUMP_STRUCT_VALUE(fp, oxcf, alt_ref_aq); // Internal frame size scaling. - DUMP_STRUCT_VALUE(oxcf, resize_mode); - DUMP_STRUCT_VALUE(oxcf, scaled_frame_width); - DUMP_STRUCT_VALUE(oxcf, scaled_frame_height); + DUMP_STRUCT_VALUE(fp, oxcf, resize_mode); + DUMP_STRUCT_VALUE(fp, oxcf, scaled_frame_width); + DUMP_STRUCT_VALUE(fp, oxcf, scaled_frame_height); // Enable feature to reduce the frame quantization every x frames. - DUMP_STRUCT_VALUE(oxcf, frame_periodic_boost); + DUMP_STRUCT_VALUE(fp, oxcf, frame_periodic_boost); // two pass datarate control - DUMP_STRUCT_VALUE(oxcf, two_pass_vbrbias); - DUMP_STRUCT_VALUE(oxcf, two_pass_vbrmin_section); - DUMP_STRUCT_VALUE(oxcf, two_pass_vbrmax_section); - DUMP_STRUCT_VALUE(oxcf, vbr_corpus_complexity); + DUMP_STRUCT_VALUE(fp, oxcf, two_pass_vbrbias); + DUMP_STRUCT_VALUE(fp, oxcf, two_pass_vbrmin_section); + DUMP_STRUCT_VALUE(fp, oxcf, two_pass_vbrmax_section); + DUMP_STRUCT_VALUE(fp, oxcf, vbr_corpus_complexity); // END DATARATE CONTROL OPTIONS // ---------------------------------------------------------------- // Spatial and temporal scalability. - DUMP_STRUCT_VALUE(oxcf, ss_number_layers); - DUMP_STRUCT_VALUE(oxcf, ts_number_layers); + DUMP_STRUCT_VALUE(fp, oxcf, ss_number_layers); + DUMP_STRUCT_VALUE(fp, oxcf, ts_number_layers); // Bitrate allocation for spatial layers. // TODO(angiebird): dump layer_target_bitrate[VPX_MAX_LAYERS] @@ -2055,25 +2055,25 @@ void vp9_dump_encoder_config(const VP9EncoderConfig *oxcf) { // TODO(angiebird): dump ss_enable_auto_arf[VPX_SS_MAX_LAYERS] // TODO(angiebird): dump ts_rate_decimator[VPX_TS_MAX_LAYERS] - DUMP_STRUCT_VALUE(oxcf, enable_auto_arf); - DUMP_STRUCT_VALUE(oxcf, encode_breakout); - DUMP_STRUCT_VALUE(oxcf, error_resilient_mode); - DUMP_STRUCT_VALUE(oxcf, frame_parallel_decoding_mode); + DUMP_STRUCT_VALUE(fp, oxcf, enable_auto_arf); + DUMP_STRUCT_VALUE(fp, oxcf, encode_breakout); + DUMP_STRUCT_VALUE(fp, oxcf, error_resilient_mode); + DUMP_STRUCT_VALUE(fp, oxcf, frame_parallel_decoding_mode); - DUMP_STRUCT_VALUE(oxcf, arnr_max_frames); - DUMP_STRUCT_VALUE(oxcf, arnr_strength); + DUMP_STRUCT_VALUE(fp, oxcf, arnr_max_frames); + DUMP_STRUCT_VALUE(fp, oxcf, arnr_strength); - DUMP_STRUCT_VALUE(oxcf, min_gf_interval); - DUMP_STRUCT_VALUE(oxcf, max_gf_interval); + DUMP_STRUCT_VALUE(fp, oxcf, min_gf_interval); + DUMP_STRUCT_VALUE(fp, oxcf, max_gf_interval); - DUMP_STRUCT_VALUE(oxcf, tile_columns); - DUMP_STRUCT_VALUE(oxcf, tile_rows); + DUMP_STRUCT_VALUE(fp, oxcf, tile_columns); + DUMP_STRUCT_VALUE(fp, oxcf, tile_rows); - DUMP_STRUCT_VALUE(oxcf, enable_tpl_model); + DUMP_STRUCT_VALUE(fp, oxcf, enable_tpl_model); - DUMP_STRUCT_VALUE(oxcf, max_threads); + DUMP_STRUCT_VALUE(fp, oxcf, max_threads); - DUMP_STRUCT_VALUE(oxcf, target_level); + DUMP_STRUCT_VALUE(fp, oxcf, target_level); // TODO(angiebird): dump two_pass_stats_in @@ -2081,19 +2081,19 @@ void vp9_dump_encoder_config(const VP9EncoderConfig *oxcf) { // TODO(angiebird): dump firstpass_mb_stats_in #endif - DUMP_STRUCT_VALUE(oxcf, tuning); - DUMP_STRUCT_VALUE(oxcf, content); + DUMP_STRUCT_VALUE(fp, oxcf, tuning); + DUMP_STRUCT_VALUE(fp, oxcf, content); #if CONFIG_VP9_HIGHBITDEPTH - DUMP_STRUCT_VALUE(oxcf, use_highbitdepth); + DUMP_STRUCT_VALUE(fp, oxcf, use_highbitdepth); #endif - DUMP_STRUCT_VALUE(oxcf, color_space); - DUMP_STRUCT_VALUE(oxcf, color_range); - DUMP_STRUCT_VALUE(oxcf, render_width); - DUMP_STRUCT_VALUE(oxcf, render_height); - DUMP_STRUCT_VALUE(oxcf, temporal_layering_mode); - - DUMP_STRUCT_VALUE(oxcf, row_mt); - DUMP_STRUCT_VALUE(oxcf, motion_vector_unit_test); + DUMP_STRUCT_VALUE(fp, oxcf, color_space); + DUMP_STRUCT_VALUE(fp, oxcf, color_range); + DUMP_STRUCT_VALUE(fp, oxcf, render_width); + DUMP_STRUCT_VALUE(fp, oxcf, render_height); + DUMP_STRUCT_VALUE(fp, oxcf, temporal_layering_mode); + + DUMP_STRUCT_VALUE(fp, oxcf, row_mt); + DUMP_STRUCT_VALUE(fp, oxcf, motion_vector_unit_test); } FRAME_INFO vp9_get_frame_info(const VP9EncoderConfig *oxcf) { diff --git a/vp9/vp9_cx_iface.h b/vp9/vp9_cx_iface.h index 3188575dd6..01338adb4e 100644 --- a/vp9/vp9_cx_iface.h +++ b/vp9/vp9_cx_iface.h @@ -22,7 +22,7 @@ VP9EncoderConfig vp9_get_encoder_config(int frame_width, int frame_height, int target_bitrate, int encode_speed, vpx_enc_pass enc_pass); -void vp9_dump_encoder_config(const VP9EncoderConfig *oxcf); +void vp9_dump_encoder_config(const VP9EncoderConfig *oxcf, FILE *fp); FRAME_INFO vp9_get_frame_info(const VP9EncoderConfig *oxcf); From a04f68148f2b78c4eb1f356c12952c716f67adf0 Mon Sep 17 00:00:00 2001 From: angiebird Date: Thu, 24 Sep 2020 11:33:55 -0700 Subject: [PATCH 0062/1000] Add codec control for external rate control lib VP9E_SET_EXTERNAL_RATE_CONTROL One can assign an external library using the control flag, VP9E_SET_EXTERNAL_RATE_CONTROL. The args alongside the control flag should be of type char**. args[0]: char* points to the path of rate control library args[1]: char* points to the config of the rate control library. Change-Id: Iae47362cdfafa00614bac427884bffcf6944c583 --- vp9/encoder/vp9_encoder.h | 7 +++++++ vp9/vp9_cx_iface.c | 19 +++++++++++++++++++ vpx/vp8cx.h | 14 ++++++++++++++ 3 files changed, 40 insertions(+) diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 263b10cdbb..67574ec0eb 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -660,6 +660,12 @@ static INLINE int get_num_unit_4x4(int size) { return (size + 3) >> 2; } static INLINE int get_num_unit_16x16(int size) { return (size + 15) >> 4; } #endif // CONFIG_RATE_CTRL +#define MAX_EXT_RATECTRL_BUF_SIZE 500 +typedef struct EXT_RATECTRL { + char library_path[MAX_EXT_RATECTRL_BUF_SIZE]; + char config[MAX_EXT_RATECTRL_BUF_SIZE]; +} EXT_RATECTRL; + typedef struct VP9_COMP { FRAME_INFO frame_info; QUANTS quants; @@ -974,6 +980,7 @@ typedef struct VP9_COMP { RATE_QSTEP_MODEL rq_model[ENCODE_FRAME_TYPES]; #endif + EXT_RATECTRL ext_ratectrl; } VP9_COMP; #if CONFIG_RATE_CTRL diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index 7b828a5a35..588290de0e 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -1732,6 +1732,24 @@ static vpx_codec_err_t ctrl_set_disable_loopfilter(vpx_codec_alg_priv_t *ctx, return VPX_CODEC_OK; } +static vpx_codec_err_t ctrl_set_external_rate_control(vpx_codec_alg_priv_t *ctx, + va_list args) { + EXT_RATECTRL *ext_ratectrl = &ctx->cpi->ext_ratectrl; + char **str_ptr = CAST(VP9E_SET_EXTERNAL_RATE_CONTROL, args); + const char *library_path = str_ptr[0]; + const char *config = str_ptr[1]; + if (strlen(library_path) >= MAX_EXT_RATECTRL_BUF_SIZE) { + return VPX_CODEC_ERROR; + } + if (strlen(config) >= MAX_EXT_RATECTRL_BUF_SIZE) { + return VPX_CODEC_ERROR; + } + snprintf(ext_ratectrl->library_path, MAX_EXT_RATECTRL_BUF_SIZE, "%s", + library_path); + snprintf(ext_ratectrl->config, MAX_EXT_RATECTRL_BUF_SIZE, "%s", config); + return VPX_CODEC_OK; +} + static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = { { VP8_COPY_REFERENCE, ctrl_copy_reference }, @@ -1784,6 +1802,7 @@ static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = { { VP9E_SET_SVC_SPATIAL_LAYER_SYNC, ctrl_set_svc_spatial_layer_sync }, { VP9E_SET_DELTA_Q_UV, ctrl_set_delta_q_uv }, { VP9E_SET_DISABLE_LOOPFILTER, ctrl_set_disable_loopfilter }, + { VP9E_SET_EXTERNAL_RATE_CONTROL, ctrl_set_external_rate_control }, // Getters { VP8E_GET_LAST_QUANTIZER, ctrl_get_quantizer }, diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h index 59753cdfde..8b0b1af6ee 100644 --- a/vpx/vp8cx.h +++ b/vpx/vp8cx.h @@ -701,6 +701,16 @@ enum vp8e_enc_control_id { * Supported in codecs: VP9 */ VP9E_SET_DISABLE_LOOPFILTER, + + /*!\brief Codec control function to enable external rate control library. + * + * args[0]: path of the rate control library + * + * args[1]: private config of the rate control library + * + * Supported in codecs: VP9 + */ + VP9E_SET_EXTERNAL_RATE_CONTROL, }; /*!\brief vpx 1-D scaling mode @@ -1056,6 +1066,10 @@ VPX_CTRL_USE_TYPE(VP9E_SET_DISABLE_OVERSHOOT_MAXQ_CBR, int) VPX_CTRL_USE_TYPE(VP9E_SET_DISABLE_LOOPFILTER, int) #define VPX_CTRL_VP9E_SET_DISABLE_LOOPFILTER + +VPX_CTRL_USE_TYPE(VP9E_SET_EXTERNAL_RATE_CONTROL, char **) +#define VPX_CTRL_VP9E_SET_EXTERNAL_RATE_CONTROL + /*!\endcond */ /*! @} - end defgroup vp8_encoder */ #ifdef __cplusplus From 6dba0d0a058c42ace95aee8048eb043c890531e8 Mon Sep 17 00:00:00 2001 From: angiebird Date: Wed, 7 Oct 2020 15:29:51 -0700 Subject: [PATCH 0063/1000] Add callback functions for external_rate_control Change-Id: I20a1179a2131d2cd069dae9076aa2c18b80784f3 --- vp9/encoder/vp9_encoder.h | 7 ++- vp9/vp9_cx_iface.c | 15 +---- vpx/vp8cx.h | 3 +- vpx/vpx_codec.mk | 1 + vpx/vpx_ext_ratectrl.h | 124 ++++++++++++++++++++++++++++++++++++++ 5 files changed, 133 insertions(+), 17 deletions(-) create mode 100644 vpx/vpx_ext_ratectrl.h diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 67574ec0eb..28e00299b0 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -15,6 +15,7 @@ #include "./vpx_config.h" #include "vpx/internal/vpx_codec_internal.h" +#include "vpx/vpx_ext_ratectrl.h" #include "vpx/vp8cx.h" #if CONFIG_INTERNAL_STATS #include "vpx_dsp/ssim.h" @@ -660,10 +661,10 @@ static INLINE int get_num_unit_4x4(int size) { return (size + 3) >> 2; } static INLINE int get_num_unit_16x16(int size) { return (size + 15) >> 4; } #endif // CONFIG_RATE_CTRL -#define MAX_EXT_RATECTRL_BUF_SIZE 500 typedef struct EXT_RATECTRL { - char library_path[MAX_EXT_RATECTRL_BUF_SIZE]; - char config[MAX_EXT_RATECTRL_BUF_SIZE]; + int ready; + vpx_rc_model_t model; + vpx_rc_funcs_t funcs; } EXT_RATECTRL; typedef struct VP9_COMP { diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index 588290de0e..42847edee4 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -13,6 +13,7 @@ #include "./vpx_config.h" #include "vpx/vpx_encoder.h" +#include "vpx/vpx_ext_ratectrl.h" #include "vpx_dsp/psnr.h" #include "vpx_ports/vpx_once.h" #include "vpx_ports/static_assert.h" @@ -1734,19 +1735,7 @@ static vpx_codec_err_t ctrl_set_disable_loopfilter(vpx_codec_alg_priv_t *ctx, static vpx_codec_err_t ctrl_set_external_rate_control(vpx_codec_alg_priv_t *ctx, va_list args) { - EXT_RATECTRL *ext_ratectrl = &ctx->cpi->ext_ratectrl; - char **str_ptr = CAST(VP9E_SET_EXTERNAL_RATE_CONTROL, args); - const char *library_path = str_ptr[0]; - const char *config = str_ptr[1]; - if (strlen(library_path) >= MAX_EXT_RATECTRL_BUF_SIZE) { - return VPX_CODEC_ERROR; - } - if (strlen(config) >= MAX_EXT_RATECTRL_BUF_SIZE) { - return VPX_CODEC_ERROR; - } - snprintf(ext_ratectrl->library_path, MAX_EXT_RATECTRL_BUF_SIZE, "%s", - library_path); - snprintf(ext_ratectrl->config, MAX_EXT_RATECTRL_BUF_SIZE, "%s", config); + ctx->cpi->ext_ratectrl.funcs = *CAST(VP9E_SET_EXTERNAL_RATE_CONTROL, args); return VPX_CODEC_OK; } diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h index 8b0b1af6ee..37ad07d33d 100644 --- a/vpx/vp8cx.h +++ b/vpx/vp8cx.h @@ -17,6 +17,7 @@ */ #include "./vp8.h" #include "./vpx_encoder.h" +#include "./vpx_ext_ratectrl.h" /*!\file * \brief Provides definitions for using VP8 or VP9 encoder algorithm within the @@ -1067,7 +1068,7 @@ VPX_CTRL_USE_TYPE(VP9E_SET_DISABLE_OVERSHOOT_MAXQ_CBR, int) VPX_CTRL_USE_TYPE(VP9E_SET_DISABLE_LOOPFILTER, int) #define VPX_CTRL_VP9E_SET_DISABLE_LOOPFILTER -VPX_CTRL_USE_TYPE(VP9E_SET_EXTERNAL_RATE_CONTROL, char **) +VPX_CTRL_USE_TYPE(VP9E_SET_EXTERNAL_RATE_CONTROL, vpx_rc_funcs_t *) #define VPX_CTRL_VP9E_SET_EXTERNAL_RATE_CONTROL /*!\endcond */ diff --git a/vpx/vpx_codec.mk b/vpx/vpx_codec.mk index 4ed77ad6d9..67812f21b2 100644 --- a/vpx/vpx_codec.mk +++ b/vpx/vpx_codec.mk @@ -39,3 +39,4 @@ API_SRCS-yes += vpx_codec.mk API_SRCS-yes += vpx_frame_buffer.h API_SRCS-yes += vpx_image.h API_SRCS-yes += vpx_integer.h +API_SRCS-yes += vpx_ext_ratectrl.h diff --git a/vpx/vpx_ext_ratectrl.h b/vpx/vpx_ext_ratectrl.h new file mode 100644 index 0000000000..08e8470dd7 --- /dev/null +++ b/vpx/vpx_ext_ratectrl.h @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2020 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_VPX_EXT_RATECTRL_H_ +#define VPX_VPX_VPX_EXT_RATECTRL_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "./vpx_integer.h" + +typedef void *vpx_rc_model_t; + +typedef struct vpx_rc_encodeframe_decision { + int q_index; +} vpx_rc_encodeframe_decision_t; + +typedef struct vpx_rc_encodeframe_info { + int frame_type; + int show_index; + int coding_index; +} vpx_rc_encodeframe_info_t; + +typedef struct vpx_rc_encodeframe_result { + int64_t sse; + int64_t bit_count; + int64_t pixel_count; +} vpx_rc_encodeframe_result_t; + +typedef struct vpx_rc_firstpass_stats { + double (*frame_stats)[26]; + int num_frames; +} vpx_rc_firstpass_stats_t; + +typedef struct vpx_rc_config { + int frame_width; + int frame_height; + int show_frame_count; + int target_bitrate_kbps; +} vpx_rc_config_t; + +/*!\brief Create an external rate control model callback prototype + * + * This callback is invoked by the encoder to create an external rate control + * model. + * + * \param[in] priv Callback's private data + * \param[in] ratectrl_config Pointer to vpx_rc_config_t + * \param[out] rate_ctrl_model_pt Pointer to vpx_rc_model_t + */ +typedef int (*vpx_rc_create_model_cb_fn_t)( + void *priv, const vpx_rc_config_t *ratectrl_config, + vpx_rc_model_t *rate_ctrl_model_pt); + +/*!\brief Send first pass stats to the external rate control model callback + * prototype + * + * This callback is invoked by the encoder to send first pass stats to the + * external rate control model. + * + * \param[in] rate_ctrl_model rate control model + * \param[in] first_pass_stats first pass stats + */ +typedef int (*vpx_rc_send_firstpass_stats_cb_fn_t)( + vpx_rc_model_t rate_ctrl_model, + const vpx_rc_firstpass_stats_t *first_pass_stats); + +/*!\brief Receive encode frame decision callback prototype + * + * This callback is invoked by the encoder to receive encode frame decision from + * the external rate control model. + * + * \param[in] rate_ctrl_model rate control model + * \param[in] encode_frame_info information of the coding frame + * \param[out] frame_decision encode decision of the coding frame + */ +typedef int (*vpx_rc_get_encodeframe_decision_cb_fn_t)( + vpx_rc_model_t rate_ctrl_model, + const vpx_rc_encodeframe_info_t *encode_frame_info, + vpx_rc_encodeframe_decision_t *frame_decision); + +/*!\brief Update encode frame result callback prototype + * + * This callback is invoked by the encoder to update encode frame result to the + * external rate control model. + * + * \param[in] rate_ctrl_model rate control model + * \param[out] encode_frame_result encode result of the coding frame + */ +typedef int (*vpx_rc_update_encodeframe_result_cb_fn_t)( + vpx_rc_model_t rate_ctrl_model, + vpx_rc_encodeframe_result_t *encode_frame_result); + +/*!\brief Delete the external rate control model callback prototype + * + * This callback is invoked by the encoder to delete the external rate control + * model. + * + * \param[in] rate_ctrl_model rate control model + */ +typedef int (*vpx_rc_delete_model_cb_fn_t)(vpx_rc_model_t rate_ctrl_model); + +typedef struct vpx_rc_funcs { + vpx_rc_create_model_cb_fn_t create_model; + vpx_rc_send_firstpass_stats_cb_fn_t send_firstpass_stats; + vpx_rc_get_encodeframe_decision_cb_fn_t get_encodeframe_decision; + vpx_rc_update_encodeframe_result_cb_fn_t update_encodeframe_result; + vpx_rc_delete_model_cb_fn_t delete_model; + void *priv; +} vpx_rc_funcs_t; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VPX_VPX_EXT_RATECTRL_H_ From 20bca1350a16af4c3346cfbf7f8f6c181e21a9ca Mon Sep 17 00:00:00 2001 From: angiebird Date: Wed, 7 Oct 2020 15:29:51 -0700 Subject: [PATCH 0064/1000] Add vp9_extrc_init/create/delete Change-Id: I9fcb9f4cc5c565794229593fadde87286fcf0ffd --- vp9/encoder/vp9_encoder.c | 3 +++ vp9/encoder/vp9_encoder.h | 7 +------ vp9/encoder/vp9_ext_ratectrl.c | 32 ++++++++++++++++++++++++++++++++ vp9/encoder/vp9_ext_ratectrl.h | 30 ++++++++++++++++++++++++++++++ vp9/vp9_cx_iface.c | 19 ++++++++++++++++++- vp9/vp9cx.mk | 2 ++ vpx/vpx_ext_ratectrl.h | 2 ++ 7 files changed, 88 insertions(+), 7 deletions(-) create mode 100644 vp9/encoder/vp9_ext_ratectrl.c create mode 100644 vp9/encoder/vp9_ext_ratectrl.h diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index e513cef84c..80bc4355b0 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -2664,6 +2664,7 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf, motion_vector_info_init(cpi); fp_motion_vector_info_init(cpi); #endif + vp9_extrc_init(&cpi->ext_ratectrl); return cpi; } @@ -2834,6 +2835,8 @@ void vp9_remove_compressor(VP9_COMP *cpi) { } #endif + vp9_extrc_delete(&cpi->ext_ratectrl); + vp9_remove_common(cm); vp9_free_ref_frame_buffers(cm->buffer_pool); #if CONFIG_VP9_POSTPROC diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 28e00299b0..b35d56608e 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -39,6 +39,7 @@ #include "vp9/encoder/vp9_context_tree.h" #include "vp9/encoder/vp9_encodemb.h" #include "vp9/encoder/vp9_ethread.h" +#include "vp9/encoder/vp9_ext_ratectrl.h" #include "vp9/encoder/vp9_firstpass.h" #include "vp9/encoder/vp9_job_queue.h" #include "vp9/encoder/vp9_lookahead.h" @@ -661,12 +662,6 @@ static INLINE int get_num_unit_4x4(int size) { return (size + 3) >> 2; } static INLINE int get_num_unit_16x16(int size) { return (size + 15) >> 4; } #endif // CONFIG_RATE_CTRL -typedef struct EXT_RATECTRL { - int ready; - vpx_rc_model_t model; - vpx_rc_funcs_t funcs; -} EXT_RATECTRL; - typedef struct VP9_COMP { FRAME_INFO frame_info; QUANTS quants; diff --git a/vp9/encoder/vp9_ext_ratectrl.c b/vp9/encoder/vp9_ext_ratectrl.c new file mode 100644 index 0000000000..5a1263bd87 --- /dev/null +++ b/vp9/encoder/vp9_ext_ratectrl.c @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2020 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vp9/encoder/vp9_ext_ratectrl.h" +#include "vp9/common/vp9_common.h" + +void vp9_extrc_init(EXT_RATECTRL *ext_ratectrl) { vp9_zero(*ext_ratectrl); } + +void vp9_extrc_create(vpx_rc_funcs_t funcs, vpx_rc_config_t ratectrl_config, + EXT_RATECTRL *ext_ratectrl) { + vp9_extrc_delete(ext_ratectrl); + ext_ratectrl->funcs = funcs; + ext_ratectrl->ratectrl_config = ratectrl_config; + ext_ratectrl->funcs.create_model(ext_ratectrl->funcs.priv, + &ext_ratectrl->ratectrl_config, + ext_ratectrl->model); + ext_ratectrl->ready = 1; +} + +void vp9_extrc_delete(EXT_RATECTRL *ext_ratectrl) { + if (ext_ratectrl->ready) { + ext_ratectrl->funcs.delete_model(ext_ratectrl->model); + } + vp9_extrc_init(ext_ratectrl); +} diff --git a/vp9/encoder/vp9_ext_ratectrl.h b/vp9/encoder/vp9_ext_ratectrl.h new file mode 100644 index 0000000000..4505b0fd69 --- /dev/null +++ b/vp9/encoder/vp9_ext_ratectrl.h @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2020 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_ENCODER_VP9_EXT_RATECTRL_H_ +#define VPX_VP9_ENCODER_VP9_EXT_RATECTRL_H_ + +#include "vpx/vpx_ext_ratectrl.h" + +typedef struct EXT_RATECTRL { + int ready; + vpx_rc_model_t model; + vpx_rc_funcs_t funcs; + vpx_rc_config_t ratectrl_config; +} EXT_RATECTRL; + +void vp9_extrc_init(EXT_RATECTRL *ext_ratectrl); + +void vp9_extrc_create(vpx_rc_funcs_t funcs, vpx_rc_config_t ratectrl_config, + EXT_RATECTRL *ext_ratectrl); + +void vp9_extrc_delete(EXT_RATECTRL *ext_ratectrl); + +#endif // VPX_VP9_ENCODER_VP9_EXT_RATECTRL_H_ diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index 42847edee4..aa13fc9cf1 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -1735,7 +1735,24 @@ static vpx_codec_err_t ctrl_set_disable_loopfilter(vpx_codec_alg_priv_t *ctx, static vpx_codec_err_t ctrl_set_external_rate_control(vpx_codec_alg_priv_t *ctx, va_list args) { - ctx->cpi->ext_ratectrl.funcs = *CAST(VP9E_SET_EXTERNAL_RATE_CONTROL, args); + vpx_rc_funcs_t funcs = *CAST(VP9E_SET_EXTERNAL_RATE_CONTROL, args); + VP9_COMP *cpi = ctx->cpi; + EXT_RATECTRL *ext_ratectrl = &cpi->ext_ratectrl; + const VP9EncoderConfig *oxcf = &cpi->oxcf; + const FRAME_INFO *frame_info = &cpi->frame_info; + vpx_rc_config_t ratectrl_config; + + ratectrl_config.frame_width = frame_info->frame_width; + ratectrl_config.frame_height = frame_info->frame_height; + ratectrl_config.show_frame_count = cpi->twopass.first_pass_info.num_frames; + + // TODO(angiebird): Double check whether this is the proper way to set up + // target_bitrate and frame_rate. + ratectrl_config.target_bitrate_kbps = (int)(oxcf->target_bandwidth / 1000); + ratectrl_config.frame_rate_num = oxcf->g_timebase.den; + ratectrl_config.frame_rate_den = oxcf->g_timebase.num; + + vp9_extrc_create(funcs, ratectrl_config, ext_ratectrl); return VPX_CODEC_OK; } diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk index 666f228827..38e99165af 100644 --- a/vp9/vp9cx.mk +++ b/vp9/vp9cx.mk @@ -96,6 +96,8 @@ VP9_CX_SRCS-yes += encoder/vp9_skin_detection.c VP9_CX_SRCS-yes += encoder/vp9_skin_detection.h VP9_CX_SRCS-yes += encoder/vp9_noise_estimate.c VP9_CX_SRCS-yes += encoder/vp9_noise_estimate.h +VP9_CX_SRCS-yes += encoder/vp9_ext_ratectrl.c +VP9_CX_SRCS-yes += encoder/vp9_ext_ratectrl.h ifeq ($(CONFIG_VP9_POSTPROC),yes) VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/vp9_postproc.h VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/vp9_postproc.c diff --git a/vpx/vpx_ext_ratectrl.h b/vpx/vpx_ext_ratectrl.h index 08e8470dd7..3b00e1181e 100644 --- a/vpx/vpx_ext_ratectrl.h +++ b/vpx/vpx_ext_ratectrl.h @@ -45,6 +45,8 @@ typedef struct vpx_rc_config { int frame_height; int show_frame_count; int target_bitrate_kbps; + int frame_rate_num; + int frame_rate_den; } vpx_rc_config_t; /*!\brief Create an external rate control model callback prototype From e6208a95073f12f035fb554c1bd3e6a32a92f877 Mon Sep 17 00:00:00 2001 From: angiebird Date: Thu, 8 Oct 2020 18:40:34 -0700 Subject: [PATCH 0065/1000] Add vp9_extrc_send_firstpass_stats() Change-Id: Ia2457b416200a2b2d1558600bff90ac2746cf396 --- vp9/encoder/vp9_ext_ratectrl.c | 51 ++++++++++++++++++++++++++++++++++ vp9/encoder/vp9_ext_ratectrl.h | 5 ++++ vpx/vpx_ext_ratectrl.h | 2 +- 3 files changed, 57 insertions(+), 1 deletion(-) diff --git a/vp9/encoder/vp9_ext_ratectrl.c b/vp9/encoder/vp9_ext_ratectrl.c index 5a1263bd87..204611cc60 100644 --- a/vp9/encoder/vp9_ext_ratectrl.c +++ b/vp9/encoder/vp9_ext_ratectrl.c @@ -15,12 +15,18 @@ void vp9_extrc_init(EXT_RATECTRL *ext_ratectrl) { vp9_zero(*ext_ratectrl); } void vp9_extrc_create(vpx_rc_funcs_t funcs, vpx_rc_config_t ratectrl_config, EXT_RATECTRL *ext_ratectrl) { + vpx_rc_firstpass_stats_t *rc_firstpass_stats; vp9_extrc_delete(ext_ratectrl); ext_ratectrl->funcs = funcs; ext_ratectrl->ratectrl_config = ratectrl_config; ext_ratectrl->funcs.create_model(ext_ratectrl->funcs.priv, &ext_ratectrl->ratectrl_config, ext_ratectrl->model); + rc_firstpass_stats = &ext_ratectrl->rc_firstpass_stats; + rc_firstpass_stats->num_frames = ratectrl_config.show_frame_count; + rc_firstpass_stats->frame_stats = + vpx_malloc(sizeof(*rc_firstpass_stats->frame_stats) * + rc_firstpass_stats->num_frames); ext_ratectrl->ready = 1; } @@ -30,3 +36,48 @@ void vp9_extrc_delete(EXT_RATECTRL *ext_ratectrl) { } vp9_extrc_init(ext_ratectrl); } + +static void gen_rc_firstpass_stats(const FIRSTPASS_STATS *stats, + double *rc_frame_stats) { + rc_frame_stats[0] = stats->frame; + rc_frame_stats[1] = stats->weight; + rc_frame_stats[2] = stats->intra_error; + rc_frame_stats[3] = stats->coded_error; + rc_frame_stats[4] = stats->sr_coded_error; + rc_frame_stats[5] = stats->frame_noise_energy; + rc_frame_stats[6] = stats->pcnt_inter; + rc_frame_stats[7] = stats->pcnt_motion; + rc_frame_stats[8] = stats->pcnt_second_ref; + rc_frame_stats[9] = stats->pcnt_neutral; + rc_frame_stats[10] = stats->pcnt_intra_low; + rc_frame_stats[11] = stats->pcnt_intra_high; + rc_frame_stats[12] = stats->intra_skip_pct; + rc_frame_stats[13] = stats->intra_smooth_pct; + rc_frame_stats[14] = stats->inactive_zone_rows; + rc_frame_stats[15] = stats->inactive_zone_cols; + rc_frame_stats[16] = stats->MVr; + rc_frame_stats[17] = stats->mvr_abs; + rc_frame_stats[18] = stats->MVc; + rc_frame_stats[19] = stats->mvc_abs; + rc_frame_stats[20] = stats->MVrv; + rc_frame_stats[21] = stats->MVcv; + rc_frame_stats[22] = stats->mv_in_out_count; + rc_frame_stats[23] = stats->duration; + rc_frame_stats[24] = stats->count; +} + +void vp9_extrc_send_firstpass_stats(const FIRST_PASS_INFO *first_pass_info, + EXT_RATECTRL *ext_ratectrl) { + if (ext_ratectrl->ready) { + vpx_rc_firstpass_stats_t *rc_firstpass_stats = + &ext_ratectrl->rc_firstpass_stats; + int i; + assert(rc_firstpass_stats->num_frames == first_pass_info->num_frames); + for (i = 0; i < rc_firstpass_stats->num_frames; ++i) { + gen_rc_firstpass_stats(&first_pass_info->stats[i], + rc_firstpass_stats->frame_stats[i]); + } + ext_ratectrl->funcs.send_firstpass_stats(ext_ratectrl->model, + rc_firstpass_stats); + } +} diff --git a/vp9/encoder/vp9_ext_ratectrl.h b/vp9/encoder/vp9_ext_ratectrl.h index 4505b0fd69..b04598cd45 100644 --- a/vp9/encoder/vp9_ext_ratectrl.h +++ b/vp9/encoder/vp9_ext_ratectrl.h @@ -12,12 +12,14 @@ #define VPX_VP9_ENCODER_VP9_EXT_RATECTRL_H_ #include "vpx/vpx_ext_ratectrl.h" +#include "vp9/encoder/vp9_firstpass.h" typedef struct EXT_RATECTRL { int ready; vpx_rc_model_t model; vpx_rc_funcs_t funcs; vpx_rc_config_t ratectrl_config; + vpx_rc_firstpass_stats_t rc_firstpass_stats; } EXT_RATECTRL; void vp9_extrc_init(EXT_RATECTRL *ext_ratectrl); @@ -27,4 +29,7 @@ void vp9_extrc_create(vpx_rc_funcs_t funcs, vpx_rc_config_t ratectrl_config, void vp9_extrc_delete(EXT_RATECTRL *ext_ratectrl); +void vp9_extrc_send_firstpass_stats(const FIRST_PASS_INFO *first_pass_info, + EXT_RATECTRL *ext_ratectrl); + #endif // VPX_VP9_ENCODER_VP9_EXT_RATECTRL_H_ diff --git a/vpx/vpx_ext_ratectrl.h b/vpx/vpx_ext_ratectrl.h index 3b00e1181e..fc6d845239 100644 --- a/vpx/vpx_ext_ratectrl.h +++ b/vpx/vpx_ext_ratectrl.h @@ -36,7 +36,7 @@ typedef struct vpx_rc_encodeframe_result { } vpx_rc_encodeframe_result_t; typedef struct vpx_rc_firstpass_stats { - double (*frame_stats)[26]; + double (*frame_stats)[25]; int num_frames; } vpx_rc_firstpass_stats_t; From 705bf9de8c96cfe5301451f1d7e5c90a41c64e5f Mon Sep 17 00:00:00 2001 From: angiebird Date: Fri, 9 Oct 2020 17:12:51 -0700 Subject: [PATCH 0066/1000] Add vpx_rc_frame_stats_t Change-Id: I496ce13592f71779bb00cc8bbb601835bca8ff09 --- vp9/encoder/vp9_ext_ratectrl.c | 54 +++++++++++++++++----------------- vpx/vpx_ext_ratectrl.h | 32 +++++++++++++++++++- 2 files changed, 58 insertions(+), 28 deletions(-) diff --git a/vp9/encoder/vp9_ext_ratectrl.c b/vp9/encoder/vp9_ext_ratectrl.c index 204611cc60..f253f5c226 100644 --- a/vp9/encoder/vp9_ext_ratectrl.c +++ b/vp9/encoder/vp9_ext_ratectrl.c @@ -38,32 +38,32 @@ void vp9_extrc_delete(EXT_RATECTRL *ext_ratectrl) { } static void gen_rc_firstpass_stats(const FIRSTPASS_STATS *stats, - double *rc_frame_stats) { - rc_frame_stats[0] = stats->frame; - rc_frame_stats[1] = stats->weight; - rc_frame_stats[2] = stats->intra_error; - rc_frame_stats[3] = stats->coded_error; - rc_frame_stats[4] = stats->sr_coded_error; - rc_frame_stats[5] = stats->frame_noise_energy; - rc_frame_stats[6] = stats->pcnt_inter; - rc_frame_stats[7] = stats->pcnt_motion; - rc_frame_stats[8] = stats->pcnt_second_ref; - rc_frame_stats[9] = stats->pcnt_neutral; - rc_frame_stats[10] = stats->pcnt_intra_low; - rc_frame_stats[11] = stats->pcnt_intra_high; - rc_frame_stats[12] = stats->intra_skip_pct; - rc_frame_stats[13] = stats->intra_smooth_pct; - rc_frame_stats[14] = stats->inactive_zone_rows; - rc_frame_stats[15] = stats->inactive_zone_cols; - rc_frame_stats[16] = stats->MVr; - rc_frame_stats[17] = stats->mvr_abs; - rc_frame_stats[18] = stats->MVc; - rc_frame_stats[19] = stats->mvc_abs; - rc_frame_stats[20] = stats->MVrv; - rc_frame_stats[21] = stats->MVcv; - rc_frame_stats[22] = stats->mv_in_out_count; - rc_frame_stats[23] = stats->duration; - rc_frame_stats[24] = stats->count; + vpx_rc_frame_stats_t *rc_frame_stats) { + rc_frame_stats->frame = stats->frame; + rc_frame_stats->weight = stats->weight; + rc_frame_stats->intra_error = stats->intra_error; + rc_frame_stats->coded_error = stats->coded_error; + rc_frame_stats->sr_coded_error = stats->sr_coded_error; + rc_frame_stats->frame_noise_energy = stats->frame_noise_energy; + rc_frame_stats->pcnt_inter = stats->pcnt_inter; + rc_frame_stats->pcnt_motion = stats->pcnt_motion; + rc_frame_stats->pcnt_second_ref = stats->pcnt_second_ref; + rc_frame_stats->pcnt_neutral = stats->pcnt_neutral; + rc_frame_stats->pcnt_intra_low = stats->pcnt_intra_low; + rc_frame_stats->pcnt_intra_high = stats->pcnt_intra_high; + rc_frame_stats->intra_skip_pct = stats->intra_skip_pct; + rc_frame_stats->intra_smooth_pct = stats->intra_smooth_pct; + rc_frame_stats->inactive_zone_rows = stats->inactive_zone_rows; + rc_frame_stats->inactive_zone_cols = stats->inactive_zone_cols; + rc_frame_stats->MVr = stats->MVr; + rc_frame_stats->mvr_abs = stats->mvr_abs; + rc_frame_stats->MVc = stats->MVc; + rc_frame_stats->mvc_abs = stats->mvc_abs; + rc_frame_stats->MVrv = stats->MVrv; + rc_frame_stats->MVcv = stats->MVcv; + rc_frame_stats->mv_in_out_count = stats->mv_in_out_count; + rc_frame_stats->duration = stats->duration; + rc_frame_stats->count = stats->count; } void vp9_extrc_send_firstpass_stats(const FIRST_PASS_INFO *first_pass_info, @@ -75,7 +75,7 @@ void vp9_extrc_send_firstpass_stats(const FIRST_PASS_INFO *first_pass_info, assert(rc_firstpass_stats->num_frames == first_pass_info->num_frames); for (i = 0; i < rc_firstpass_stats->num_frames; ++i) { gen_rc_firstpass_stats(&first_pass_info->stats[i], - rc_firstpass_stats->frame_stats[i]); + &rc_firstpass_stats->frame_stats[i]); } ext_ratectrl->funcs.send_firstpass_stats(ext_ratectrl->model, rc_firstpass_stats); diff --git a/vpx/vpx_ext_ratectrl.h b/vpx/vpx_ext_ratectrl.h index fc6d845239..c464de8845 100644 --- a/vpx/vpx_ext_ratectrl.h +++ b/vpx/vpx_ext_ratectrl.h @@ -35,8 +35,38 @@ typedef struct vpx_rc_encodeframe_result { int64_t pixel_count; } vpx_rc_encodeframe_result_t; +// This is a mirror of vp9's FIRSTPASS_STATS +// Only spatial_layer_id is omitted +typedef struct vpx_rc_frame_stats { + double frame; + double weight; + double intra_error; + double coded_error; + double sr_coded_error; + double frame_noise_energy; + double pcnt_inter; + double pcnt_motion; + double pcnt_second_ref; + double pcnt_neutral; + double pcnt_intra_low; + double pcnt_intra_high; + double intra_skip_pct; + double intra_smooth_pct; + double inactive_zone_rows; + double inactive_zone_cols; + double MVr; + double mvr_abs; + double MVc; + double mvc_abs; + double MVrv; + double MVcv; + double mv_in_out_count; + double duration; + double count; +} vpx_rc_frame_stats_t; + typedef struct vpx_rc_firstpass_stats { - double (*frame_stats)[25]; + vpx_rc_frame_stats_t *frame_stats; int num_frames; } vpx_rc_firstpass_stats_t; From 9857515cd6deae3e981792c74613e0dac624db56 Mon Sep 17 00:00:00 2001 From: angiebird Date: Fri, 9 Oct 2020 17:47:02 -0700 Subject: [PATCH 0067/1000] Call vp9_extrc_send_firstpass_stats() properly Change-Id: I28db5010ba647cc91b8c0aa59309d7e953cd1216 --- vp9/encoder/vp9_encoder.c | 5 ++++- vp9/encoder/vp9_ext_ratectrl.c | 4 ++-- vp9/encoder/vp9_ext_ratectrl.h | 4 ++-- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 80bc4355b0..251a834d13 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -2463,6 +2463,8 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf, cpi->allow_encode_breakout = ENCODE_BREAKOUT_ENABLED; + vp9_extrc_init(&cpi->ext_ratectrl); + #if !CONFIG_REALTIME_ONLY if (oxcf->pass == 1) { vp9_init_first_pass(cpi); @@ -2536,6 +2538,8 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf, num_frames = packets - 1; fps_init_first_pass_info(&cpi->twopass.first_pass_info, oxcf->two_pass_stats_in.buf, num_frames); + vp9_extrc_send_firstpass_stats(&cpi->ext_ratectrl, + &cpi->twopass.first_pass_info); vp9_init_second_pass(cpi); } @@ -2664,7 +2668,6 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf, motion_vector_info_init(cpi); fp_motion_vector_info_init(cpi); #endif - vp9_extrc_init(&cpi->ext_ratectrl); return cpi; } diff --git a/vp9/encoder/vp9_ext_ratectrl.c b/vp9/encoder/vp9_ext_ratectrl.c index f253f5c226..51139f6ace 100644 --- a/vp9/encoder/vp9_ext_ratectrl.c +++ b/vp9/encoder/vp9_ext_ratectrl.c @@ -66,8 +66,8 @@ static void gen_rc_firstpass_stats(const FIRSTPASS_STATS *stats, rc_frame_stats->count = stats->count; } -void vp9_extrc_send_firstpass_stats(const FIRST_PASS_INFO *first_pass_info, - EXT_RATECTRL *ext_ratectrl) { +void vp9_extrc_send_firstpass_stats(EXT_RATECTRL *ext_ratectrl, + const FIRST_PASS_INFO *first_pass_info) { if (ext_ratectrl->ready) { vpx_rc_firstpass_stats_t *rc_firstpass_stats = &ext_ratectrl->rc_firstpass_stats; diff --git a/vp9/encoder/vp9_ext_ratectrl.h b/vp9/encoder/vp9_ext_ratectrl.h index b04598cd45..77d3bdd99f 100644 --- a/vp9/encoder/vp9_ext_ratectrl.h +++ b/vp9/encoder/vp9_ext_ratectrl.h @@ -29,7 +29,7 @@ void vp9_extrc_create(vpx_rc_funcs_t funcs, vpx_rc_config_t ratectrl_config, void vp9_extrc_delete(EXT_RATECTRL *ext_ratectrl); -void vp9_extrc_send_firstpass_stats(const FIRST_PASS_INFO *first_pass_info, - EXT_RATECTRL *ext_ratectrl); +void vp9_extrc_send_firstpass_stats(EXT_RATECTRL *ext_ratectrl, + const FIRST_PASS_INFO *first_pass_info); #endif // VPX_VP9_ENCODER_VP9_EXT_RATECTRL_H_ From 122a74eda7011f3f6b95b075054a25d171abaca5 Mon Sep 17 00:00:00 2001 From: James Zern Date: Thu, 15 Oct 2020 15:24:36 -0700 Subject: [PATCH 0068/1000] install vpx_ext_ratectrl.h fixes encoder detection / compile with installed headers after: 6dba0d0a0 Add callback functions for external_rate_control Bug: webm:1707 Change-Id: I370d8c94d6f1b8201002a722077ecf6b3d8cede5 --- libs.mk | 6 +++++- vpx/vpx_codec.mk | 1 + vpx/vpx_ext_ratectrl.h | 8 ++++++++ 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/libs.mk b/libs.mk index 78f2c92786..d14439a3da 100644 --- a/libs.mk +++ b/libs.mk @@ -63,6 +63,7 @@ ifeq ($(CONFIG_VP8_ENCODER),yes) CODEC_SRCS-yes += $(addprefix $(VP8_PREFIX),$(call enabled,VP8_CX_SRCS)) CODEC_EXPORTS-yes += $(addprefix $(VP8_PREFIX),$(VP8_CX_EXPORTS)) INSTALL-LIBS-yes += include/vpx/vp8.h include/vpx/vp8cx.h + INSTALL-LIBS-yes += include/vpx/vpx_ext_ratectrl.h INSTALL_MAPS += include/vpx/% $(SRC_PATH_BARE)/$(VP8_PREFIX)/% CODEC_DOC_SECTIONS += vp8 vp8_encoder endif @@ -87,13 +88,16 @@ ifeq ($(CONFIG_VP9_ENCODER),yes) CODEC_SRCS-yes += $(addprefix $(VP9_PREFIX),$(call enabled,VP9_CX_SRCS)) CODEC_EXPORTS-yes += $(addprefix $(VP9_PREFIX),$(VP9_CX_EXPORTS)) CODEC_SRCS-yes += $(VP9_PREFIX)vp9cx.mk vpx/vp8.h vpx/vp8cx.h + CODEC_SRCS-yes += vpx/vpx_ext_ratectrl.h INSTALL-LIBS-yes += include/vpx/vp8.h include/vpx/vp8cx.h + INSTALL-LIBS-yes += include/vpx/vpx_ext_ratectrl.h INSTALL_MAPS += include/vpx/% $(SRC_PATH_BARE)/$(VP9_PREFIX)/% - CODEC_DOC_SRCS += vpx/vp8.h vpx/vp8cx.h + CODEC_DOC_SRCS += vpx/vp8.h vpx/vp8cx.h vpx/vpx_ext_ratectrl.h CODEC_DOC_SECTIONS += vp9 vp9_encoder RC_RTC_SRCS := $(addprefix $(VP9_PREFIX),$(call enabled,VP9_CX_SRCS)) RC_RTC_SRCS += $(VP9_PREFIX)vp9cx.mk vpx/vp8.h vpx/vp8cx.h + RC_RTC_SRCS += vpx/vpx_ext_ratectrl.h RC_RTC_SRCS += $(VP9_PREFIX)ratectrl_rtc.cc RC_RTC_SRCS += $(VP9_PREFIX)ratectrl_rtc.h INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(VP9_PREFIX)ratectrl_rtc.cc diff --git a/vpx/vpx_codec.mk b/vpx/vpx_codec.mk index 67812f21b2..350dc247bc 100644 --- a/vpx/vpx_codec.mk +++ b/vpx/vpx_codec.mk @@ -24,6 +24,7 @@ API_DOC_SRCS-$(CONFIG_VP8_DECODER) += vp8dx.h API_DOC_SRCS-yes += vpx_codec.h API_DOC_SRCS-yes += vpx_decoder.h API_DOC_SRCS-yes += vpx_encoder.h +API_DOC_SRCS-yes += vpx_ext_ratectrl.h API_DOC_SRCS-yes += vpx_frame_buffer.h API_DOC_SRCS-yes += vpx_image.h diff --git a/vpx/vpx_ext_ratectrl.h b/vpx/vpx_ext_ratectrl.h index c464de8845..940227ac29 100644 --- a/vpx/vpx_ext_ratectrl.h +++ b/vpx/vpx_ext_ratectrl.h @@ -17,6 +17,10 @@ extern "C" { #include "./vpx_integer.h" +/*!\cond + TODO(angiebird): document these structures and fields to clear doxygen + warnings.*/ + typedef void *vpx_rc_model_t; typedef struct vpx_rc_encodeframe_decision { @@ -149,6 +153,10 @@ typedef struct vpx_rc_funcs { void *priv; } vpx_rc_funcs_t; +/*!\endcond + TODO(angiebird): document these structures and fields to clear doxygen + warnings.*/ + #ifdef __cplusplus } // extern "C" #endif From f71dd6e23e9d238ab0b03cdf105db733b8486778 Mon Sep 17 00:00:00 2001 From: angiebird Date: Fri, 9 Oct 2020 19:03:23 -0700 Subject: [PATCH 0069/1000] vp9_extrc_get_encodeframe_decision() Bug: webm:1707 Change-Id: I90a327b97d7158b65767fe3fbfd5f260030e17f5 --- vp9/common/vp9_onyxc_int.h | 6 ------ vp9/encoder/vp9_encoder.c | 11 +++++++++++ vp9/encoder/vp9_ext_ratectrl.c | 31 +++++++++++++++++++++++++++++++ vp9/encoder/vp9_ext_ratectrl.h | 4 ++++ vp9/simple_encode.h | 8 ++++---- 5 files changed, 50 insertions(+), 10 deletions(-) diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h index 6f9c6985f0..ea47218b84 100644 --- a/vp9/common/vp9_onyxc_int.h +++ b/vp9/common/vp9_onyxc_int.h @@ -240,13 +240,11 @@ typedef struct VP9Common { // TODO(angiebird): current_video_frame/current_frame_coding_index into a // structure unsigned int current_video_frame; -#if CONFIG_RATE_CTRL // Each show or no show frame is assigned with a coding index based on its // coding order (starting from zero). // Current frame's coding index. int current_frame_coding_index; -#endif BITSTREAM_PROFILE profile; // VPX_BITS_8 in profile 0 or 1, VPX_BITS_10 or VPX_BITS_12 in profile 2 or 3. @@ -276,9 +274,7 @@ typedef struct VP9Common { static INLINE void init_frame_indexes(VP9_COMMON *cm) { cm->current_video_frame = 0; -#if CONFIG_RATE_CTRL cm->current_frame_coding_index = 0; -#endif // CONFIG_RATE_CTRL } static INLINE void update_frame_indexes(VP9_COMMON *cm, int show_frame) { @@ -287,9 +283,7 @@ static INLINE void update_frame_indexes(VP9_COMMON *cm, int show_frame) { // update not a real frame ++cm->current_video_frame; } -#if CONFIG_RATE_CTRL ++cm->current_frame_coding_index; -#endif // CONFIG_RATE_CTRL } typedef struct { diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 251a834d13..bf338eed8c 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -4480,6 +4480,14 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest q = cpi->encode_command.external_quantize_index; } #endif + if (cpi->ext_ratectrl.ready) { + const GF_GROUP *gf_group = &cpi->twopass.gf_group; + vpx_rc_encodeframe_decision_t encode_frame_decision; + vp9_extrc_get_encodeframe_decision( + &cpi->ext_ratectrl, gf_group, cm->current_video_frame, + cm->current_frame_coding_index, &encode_frame_decision); + q = encode_frame_decision.q_index; + } vp9_set_quantizer(cpi, q); @@ -4519,6 +4527,9 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest if (frame_over_shoot_limit == 0) frame_over_shoot_limit = 1; } + if (cpi->ext_ratectrl.ready) { + break; + } #if CONFIG_RATE_CTRL // This part needs to be after save_coding_context() because // restore_coding_context will be called in the end of this function. diff --git a/vp9/encoder/vp9_ext_ratectrl.c b/vp9/encoder/vp9_ext_ratectrl.c index 51139f6ace..59b5c09b8a 100644 --- a/vp9/encoder/vp9_ext_ratectrl.c +++ b/vp9/encoder/vp9_ext_ratectrl.c @@ -81,3 +81,34 @@ void vp9_extrc_send_firstpass_stats(EXT_RATECTRL *ext_ratectrl, rc_firstpass_stats); } } + +static int extrc_get_frame_type(FRAME_UPDATE_TYPE update_type) { + // TODO(angiebird): Add unit test to make sure this function behaves like + // get_frame_type_from_update_type() + // TODO(angiebird): Merge this function with get_frame_type_from_update_type() + switch (update_type) { + case KF_UPDATE: return 0; // kFrameTypeKey; + case ARF_UPDATE: return 2; // kFrameTypeAltRef; + case GF_UPDATE: return 4; // kFrameTypeGolden; + case OVERLAY_UPDATE: return 3; // kFrameTypeOverlay; + case LF_UPDATE: return 1; // kFrameTypeInter; + default: + fprintf(stderr, "Unsupported update_type %d\n", update_type); + abort(); + return 1; + } +} + +void vp9_extrc_get_encodeframe_decision( + EXT_RATECTRL *ext_ratectrl, const GF_GROUP *gf_group, int show_index, + int coding_index, vpx_rc_encodeframe_decision_t *encode_frame_decision) { + if (ext_ratectrl->ready) { + FRAME_UPDATE_TYPE update_type = gf_group->update_type[gf_group->index]; + vpx_rc_encodeframe_info_t encode_frame_info; + encode_frame_info.show_index = show_index; + encode_frame_info.coding_index = coding_index; + encode_frame_info.frame_type = extrc_get_frame_type(update_type); + ext_ratectrl->funcs.get_encodeframe_decision( + ext_ratectrl->model, &encode_frame_info, encode_frame_decision); + } +} diff --git a/vp9/encoder/vp9_ext_ratectrl.h b/vp9/encoder/vp9_ext_ratectrl.h index 77d3bdd99f..990735ebc0 100644 --- a/vp9/encoder/vp9_ext_ratectrl.h +++ b/vp9/encoder/vp9_ext_ratectrl.h @@ -32,4 +32,8 @@ void vp9_extrc_delete(EXT_RATECTRL *ext_ratectrl); void vp9_extrc_send_firstpass_stats(EXT_RATECTRL *ext_ratectrl, const FIRST_PASS_INFO *first_pass_info); +void vp9_extrc_get_encodeframe_decision( + EXT_RATECTRL *ext_ratectrl, const GF_GROUP *gf_group, int show_index, + int coding_index, vpx_rc_encodeframe_decision_t *encode_frame_decision); + #endif // VPX_VP9_ENCODER_VP9_EXT_RATECTRL_H_ diff --git a/vp9/simple_encode.h b/vp9/simple_encode.h index ccbde9934b..ce370a795e 100644 --- a/vp9/simple_encode.h +++ b/vp9/simple_encode.h @@ -27,10 +27,10 @@ enum StatusCode { // TODO(angiebird): Add description for each frame type. enum FrameType { kFrameTypeKey = 0, - kFrameTypeInter, - kFrameTypeAltRef, - kFrameTypeOverlay, - kFrameTypeGolden, + kFrameTypeInter = 1, + kFrameTypeAltRef = 2, + kFrameTypeOverlay = 3, + kFrameTypeGolden = 4, }; // TODO(angiebird): Add description for each reference frame type. From 8bfc92063138476dee2d719ec45bb95d16a1a38f Mon Sep 17 00:00:00 2001 From: angiebird Date: Mon, 12 Oct 2020 17:58:16 -0700 Subject: [PATCH 0070/1000] Add vp9_extrc_update_encodeframe_result() Bug: webm:1707 Change-Id: I962ffa23f03b953f7c0dfd81f49dc79d1975bbba --- vp9/encoder/vp9_encoder.c | 9 ++++++++- vp9/encoder/vp9_ext_ratectrl.c | 26 ++++++++++++++++++++++++++ vp9/encoder/vp9_ext_ratectrl.h | 6 ++++++ vpx/vpx_ext_ratectrl.h | 2 +- 4 files changed, 41 insertions(+), 2 deletions(-) diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index bf338eed8c..3838a9de77 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -5479,6 +5479,13 @@ static void encode_frame_to_data_rate( // build the bitstream vp9_pack_bitstream(cpi, dest, size); + { + const RefCntBuffer *coded_frame_buf = + get_ref_cnt_buffer(cm, cm->new_fb_idx); + vp9_extrc_update_encodeframe_result(&cpi->ext_ratectrl, (*size) << 3, + cpi->Source, &coded_frame_buf->buf, + cpi->oxcf.input_bit_depth); + } #if CONFIG_REALTIME_ONLY (void)encode_frame_result; assert(encode_frame_result == NULL); @@ -7541,7 +7548,7 @@ static void update_encode_frame_result( #if CONFIG_RATE_CTRL PSNR_STATS psnr; #if CONFIG_VP9_HIGHBITDEPTH - vpx_calc_highbd_psnr(source_frame, coded_frame_buf->buf, &psnr, bit_depth, + vpx_calc_highbd_psnr(source_frame, &coded_frame_buf->buf, &psnr, bit_depth, input_bit_depth); #else // CONFIG_VP9_HIGHBITDEPTH (void)bit_depth; diff --git a/vp9/encoder/vp9_ext_ratectrl.c b/vp9/encoder/vp9_ext_ratectrl.c index 59b5c09b8a..64414cd38a 100644 --- a/vp9/encoder/vp9_ext_ratectrl.c +++ b/vp9/encoder/vp9_ext_ratectrl.c @@ -10,6 +10,7 @@ #include "vp9/encoder/vp9_ext_ratectrl.h" #include "vp9/common/vp9_common.h" +#include "vpx_dsp/psnr.h" void vp9_extrc_init(EXT_RATECTRL *ext_ratectrl) { vp9_zero(*ext_ratectrl); } @@ -112,3 +113,28 @@ void vp9_extrc_get_encodeframe_decision( ext_ratectrl->model, &encode_frame_info, encode_frame_decision); } } + +void vp9_extrc_update_encodeframe_result(EXT_RATECTRL *ext_ratectrl, + int64_t bit_count, + const YV12_BUFFER_CONFIG *source_frame, + const YV12_BUFFER_CONFIG *coded_frame, + uint32_t input_bit_depth) { + if (ext_ratectrl->ready) { + PSNR_STATS psnr; + vpx_rc_encodeframe_result_t encode_frame_result; + encode_frame_result.bit_count = bit_count; + encode_frame_result.pixel_count = + source_frame->y_width * source_frame->y_height + + 2 * source_frame->uv_width * source_frame->uv_height; +#if CONFIG_VP9_HIGHBITDEPTH + vpx_calc_highbd_psnr(source_frame, coded_frame, &psnr, + source_frame->bit_depth, input_bit_depth); +#else + (void)input_bit_depth; + vpx_calc_psnr(source_frame, coded_frame, &psnr); +#endif + encode_frame_result.sse = psnr.sse[0]; + ext_ratectrl->funcs.update_encodeframe_result(ext_ratectrl->model, + &encode_frame_result); + } +} diff --git a/vp9/encoder/vp9_ext_ratectrl.h b/vp9/encoder/vp9_ext_ratectrl.h index 990735ebc0..82f300c3a5 100644 --- a/vp9/encoder/vp9_ext_ratectrl.h +++ b/vp9/encoder/vp9_ext_ratectrl.h @@ -36,4 +36,10 @@ void vp9_extrc_get_encodeframe_decision( EXT_RATECTRL *ext_ratectrl, const GF_GROUP *gf_group, int show_index, int coding_index, vpx_rc_encodeframe_decision_t *encode_frame_decision); +void vp9_extrc_update_encodeframe_result(EXT_RATECTRL *ext_ratectrl, + int64_t bit_count, + const YV12_BUFFER_CONFIG *source_frame, + const YV12_BUFFER_CONFIG *coded_frame, + uint32_t input_bit_depth); + #endif // VPX_VP9_ENCODER_VP9_EXT_RATECTRL_H_ diff --git a/vpx/vpx_ext_ratectrl.h b/vpx/vpx_ext_ratectrl.h index 940227ac29..c99434457b 100644 --- a/vpx/vpx_ext_ratectrl.h +++ b/vpx/vpx_ext_ratectrl.h @@ -133,7 +133,7 @@ typedef int (*vpx_rc_get_encodeframe_decision_cb_fn_t)( */ typedef int (*vpx_rc_update_encodeframe_result_cb_fn_t)( vpx_rc_model_t rate_ctrl_model, - vpx_rc_encodeframe_result_t *encode_frame_result); + const vpx_rc_encodeframe_result_t *encode_frame_result); /*!\brief Delete the external rate control model callback prototype * From e94000aa35c5beb8901952abb9c0d037d3c977f1 Mon Sep 17 00:00:00 2001 From: Angie Chiang Date: Thu, 15 Oct 2020 12:05:37 -0700 Subject: [PATCH 0071/1000] Add unit test for vp9_ext_ratectrl Fix three bugs along the way. 1) Call vp9_extrc_send_firstpass_stats() after vp9_extrc_create() 2) Pass in model pointer in vp9_extrc_create() 3) Free frame_stats buffer in vp9_extrc_delete() Bug: webm:1707 Change-Id: Ic8bd62c7b4ebd85a7479ae5e4c82d7f6059d782f --- test/encode_test_driver.h | 7 ++ test/test.mk | 1 + test/vp9_ext_ratectrl_test.cc | 156 +++++++++++++++++++++++++++++++++ vp9/encoder/vp9_encoder.c | 15 ++-- vp9/encoder/vp9_ext_ratectrl.c | 9 +- vp9/encoder/vp9_ext_ratectrl.h | 1 + vp9/vp9_cx_iface.c | 31 ++++--- 7 files changed, 197 insertions(+), 23 deletions(-) create mode 100644 test/vp9_ext_ratectrl_test.cc diff --git a/test/encode_test_driver.h b/test/encode_test_driver.h index 3edba4b926..38c61952eb 100644 --- a/test/encode_test_driver.h +++ b/test/encode_test_driver.h @@ -148,6 +148,13 @@ class Encoder { ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError(); } +#if CONFIG_VP9_ENCODER + void Control(int ctrl_id, vpx_rc_funcs_t *arg) { + const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg); + ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError(); + } +#endif // CONFIG_VP9_ENCODER + #if CONFIG_VP8_ENCODER || CONFIG_VP9_ENCODER void Control(int ctrl_id, vpx_active_map_t *arg) { const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg); diff --git a/test/test.mk b/test/test.mk index c12fb786e7..04902382db 100644 --- a/test/test.mk +++ b/test/test.mk @@ -58,6 +58,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += svc_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += svc_test.h LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += svc_end_to_end_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += timestamp_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_ext_ratectrl_test.cc LIBVPX_TEST_SRCS-yes += decode_test_driver.cc LIBVPX_TEST_SRCS-yes += decode_test_driver.h diff --git a/test/vp9_ext_ratectrl_test.cc b/test/vp9_ext_ratectrl_test.cc new file mode 100644 index 0000000000..aa36093445 --- /dev/null +++ b/test/vp9_ext_ratectrl_test.cc @@ -0,0 +1,156 @@ +/* + * Copyright (c) 2020 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/util.h" +#include "test/yuv_video_source.h" +#include "third_party/googletest/src/include/gtest/gtest.h" +#include "vpx/vpx_ext_ratectrl.h" + +namespace { + +constexpr int kModelMagicNumber = 51396; +constexpr unsigned int PrivMagicNumber = 5566UL; +constexpr int kFrameNum = 5; +constexpr int kLosslessCodingIndex = 2; + +struct ToyRateCtrl { + int magic_number; + int coding_index; +}; + +int rc_create_model(void *priv, const vpx_rc_config_t *ratectrl_config, + vpx_rc_model_t *rate_ctrl_model_pt) { + ToyRateCtrl *toy_rate_ctrl = new (std::nothrow) ToyRateCtrl; + EXPECT_NE(toy_rate_ctrl, nullptr); + toy_rate_ctrl->magic_number = kModelMagicNumber; + toy_rate_ctrl->coding_index = -1; + *rate_ctrl_model_pt = (vpx_rc_model_t)toy_rate_ctrl; + EXPECT_EQ(priv, reinterpret_cast(PrivMagicNumber)); + EXPECT_EQ(ratectrl_config->frame_width, 352); + EXPECT_EQ(ratectrl_config->frame_height, 288); + EXPECT_EQ(ratectrl_config->show_frame_count, kFrameNum); + EXPECT_EQ(ratectrl_config->target_bitrate_kbps, 24000); + EXPECT_EQ(ratectrl_config->frame_rate_num, 30); + EXPECT_EQ(ratectrl_config->frame_rate_den, 1); + return 0; +} + +int rc_send_firstpass_stats(vpx_rc_model_t rate_ctrl_model, + const vpx_rc_firstpass_stats_t *first_pass_stats) { + const ToyRateCtrl *toy_rate_ctrl = + static_cast(rate_ctrl_model); + EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); + EXPECT_EQ(first_pass_stats->num_frames, kFrameNum); + for (int i = 0; i < first_pass_stats->num_frames; ++i) { + EXPECT_DOUBLE_EQ(first_pass_stats->frame_stats[i].frame, i); + } + return 0; +} + +int rc_get_encodeframe_decision( + vpx_rc_model_t rate_ctrl_model, + const vpx_rc_encodeframe_info_t *encode_frame_info, + vpx_rc_encodeframe_decision_t *frame_decision) { + ToyRateCtrl *toy_rate_ctrl = static_cast(rate_ctrl_model); + toy_rate_ctrl->coding_index += 1; + + EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); + + EXPECT_LT(encode_frame_info->show_index, kFrameNum); + EXPECT_EQ(encode_frame_info->coding_index, toy_rate_ctrl->coding_index); + + if (encode_frame_info->coding_index == 0) { + EXPECT_EQ(encode_frame_info->frame_type, 0 /*kFrameTypeKey*/); + } + + if (encode_frame_info->coding_index == 1) { + EXPECT_EQ(encode_frame_info->frame_type, 2 /*kFrameTypeAltRef*/); + } + + if (encode_frame_info->coding_index >= 2 && + encode_frame_info->coding_index < 5) { + EXPECT_EQ(encode_frame_info->frame_type, 1 /*kFrameTypeInter*/); + } + + if (encode_frame_info->coding_index == 5) { + EXPECT_EQ(encode_frame_info->frame_type, 3 /*kFrameTypeOverlay*/); + } + if (encode_frame_info->coding_index == kLosslessCodingIndex) { + // We should get sse == 0 at rc_update_encodeframe_result() + frame_decision->q_index = 0; + } else { + frame_decision->q_index = 100; + } + return 0; +} + +int rc_update_encodeframe_result( + vpx_rc_model_t rate_ctrl_model, + const vpx_rc_encodeframe_result_t *encode_frame_result) { + const ToyRateCtrl *toy_rate_ctrl = + static_cast(rate_ctrl_model); + EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); + + int64_t ref_pixel_count = 352 * 288 * 3 / 2; + EXPECT_EQ(encode_frame_result->pixel_count, ref_pixel_count); + if (toy_rate_ctrl->coding_index == kLosslessCodingIndex) { + EXPECT_EQ(encode_frame_result->sse, 0); + } + return 0; +} + +int rc_delete_model(vpx_rc_model_t rate_ctrl_model) { + ToyRateCtrl *toy_rate_ctrl = static_cast(rate_ctrl_model); + EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); + delete toy_rate_ctrl; + return 0; +} + +class ExtRateCtrlTest : public ::libvpx_test::EncoderTest, + public ::testing::Test { + protected: + ExtRateCtrlTest() : EncoderTest(&::libvpx_test::kVP9) {} + + ~ExtRateCtrlTest() = default; + + void SetUp() override { + InitializeConfig(); + SetMode(::libvpx_test::kTwoPassGood); + } + + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { + if (video->frame() == 0) { + vpx_rc_funcs_t rc_funcs; + rc_funcs.create_model = rc_create_model; + rc_funcs.send_firstpass_stats = rc_send_firstpass_stats; + rc_funcs.get_encodeframe_decision = rc_get_encodeframe_decision; + rc_funcs.update_encodeframe_result = rc_update_encodeframe_result; + rc_funcs.delete_model = rc_delete_model; + rc_funcs.priv = reinterpret_cast(PrivMagicNumber); + encoder->Control(VP9E_SET_EXTERNAL_RATE_CONTROL, &rc_funcs); + } + } +}; + +TEST_F(ExtRateCtrlTest, EncodeTest) { + cfg_.rc_target_bitrate = 24000; + + std::unique_ptr video; + video.reset(new libvpx_test::YUVVideoSource("bus_352x288_420_f20_b8.yuv", + VPX_IMG_FMT_I420, 352, 288, 30, 1, + 0, kFrameNum)); + + ASSERT_NE(video.get(), nullptr); + ASSERT_NO_FATAL_FAILURE(RunLoop(video.get())); +} +} // namespace diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 3838a9de77..084e1bab89 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -2538,8 +2538,6 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf, num_frames = packets - 1; fps_init_first_pass_info(&cpi->twopass.first_pass_info, oxcf->two_pass_stats_in.buf, num_frames); - vp9_extrc_send_firstpass_stats(&cpi->ext_ratectrl, - &cpi->twopass.first_pass_info); vp9_init_second_pass(cpi); } @@ -5482,9 +5480,9 @@ static void encode_frame_to_data_rate( { const RefCntBuffer *coded_frame_buf = get_ref_cnt_buffer(cm, cm->new_fb_idx); - vp9_extrc_update_encodeframe_result(&cpi->ext_ratectrl, (*size) << 3, - cpi->Source, &coded_frame_buf->buf, - cpi->oxcf.input_bit_depth); + vp9_extrc_update_encodeframe_result( + &cpi->ext_ratectrl, (*size) << 3, cpi->Source, &coded_frame_buf->buf, + cm->bit_depth, cpi->oxcf.input_bit_depth); } #if CONFIG_REALTIME_ONLY (void)encode_frame_result; @@ -5516,7 +5514,7 @@ static void encode_frame_to_data_rate( ref_frame_flags, cpi->twopass.gf_group.update_type[cpi->twopass.gf_group.index], cpi->Source, coded_frame_buf, ref_frame_bufs, vp9_get_quantizer(cpi), - cpi->oxcf.input_bit_depth, cm->bit_depth, cpi->td.counts, + cm->bit_depth, cpi->oxcf.input_bit_depth, cpi->td.counts, #if CONFIG_RATE_CTRL cpi->partition_info, cpi->motion_vector_info, #endif // CONFIG_RATE_CTRL @@ -5673,6 +5671,11 @@ static void Pass2Encode(VP9_COMP *cpi, size_t *size, uint8_t *dest, unsigned int *frame_flags, ENCODE_FRAME_RESULT *encode_frame_result) { cpi->allow_encode_breakout = ENCODE_BREAKOUT_ENABLED; + + if (cpi->common.current_frame_coding_index == 0) { + vp9_extrc_send_firstpass_stats(&cpi->ext_ratectrl, + &cpi->twopass.first_pass_info); + } #if CONFIG_MISMATCH_DEBUG mismatch_move_frame_idx_w(); #endif diff --git a/vp9/encoder/vp9_ext_ratectrl.c b/vp9/encoder/vp9_ext_ratectrl.c index 64414cd38a..ca75651d44 100644 --- a/vp9/encoder/vp9_ext_ratectrl.c +++ b/vp9/encoder/vp9_ext_ratectrl.c @@ -22,7 +22,7 @@ void vp9_extrc_create(vpx_rc_funcs_t funcs, vpx_rc_config_t ratectrl_config, ext_ratectrl->ratectrl_config = ratectrl_config; ext_ratectrl->funcs.create_model(ext_ratectrl->funcs.priv, &ext_ratectrl->ratectrl_config, - ext_ratectrl->model); + &ext_ratectrl->model); rc_firstpass_stats = &ext_ratectrl->rc_firstpass_stats; rc_firstpass_stats->num_frames = ratectrl_config.show_frame_count; rc_firstpass_stats->frame_stats = @@ -34,6 +34,7 @@ void vp9_extrc_create(vpx_rc_funcs_t funcs, vpx_rc_config_t ratectrl_config, void vp9_extrc_delete(EXT_RATECTRL *ext_ratectrl) { if (ext_ratectrl->ready) { ext_ratectrl->funcs.delete_model(ext_ratectrl->model); + vpx_free(ext_ratectrl->rc_firstpass_stats.frame_stats); } vp9_extrc_init(ext_ratectrl); } @@ -118,6 +119,7 @@ void vp9_extrc_update_encodeframe_result(EXT_RATECTRL *ext_ratectrl, int64_t bit_count, const YV12_BUFFER_CONFIG *source_frame, const YV12_BUFFER_CONFIG *coded_frame, + uint32_t bit_depth, uint32_t input_bit_depth) { if (ext_ratectrl->ready) { PSNR_STATS psnr; @@ -127,9 +129,10 @@ void vp9_extrc_update_encodeframe_result(EXT_RATECTRL *ext_ratectrl, source_frame->y_width * source_frame->y_height + 2 * source_frame->uv_width * source_frame->uv_height; #if CONFIG_VP9_HIGHBITDEPTH - vpx_calc_highbd_psnr(source_frame, coded_frame, &psnr, - source_frame->bit_depth, input_bit_depth); + vpx_calc_highbd_psnr(source_frame, coded_frame, &psnr, bit_depth, + input_bit_depth); #else + (void)bit_depth; (void)input_bit_depth; vpx_calc_psnr(source_frame, coded_frame, &psnr); #endif diff --git a/vp9/encoder/vp9_ext_ratectrl.h b/vp9/encoder/vp9_ext_ratectrl.h index 82f300c3a5..fe8a66cf3c 100644 --- a/vp9/encoder/vp9_ext_ratectrl.h +++ b/vp9/encoder/vp9_ext_ratectrl.h @@ -40,6 +40,7 @@ void vp9_extrc_update_encodeframe_result(EXT_RATECTRL *ext_ratectrl, int64_t bit_count, const YV12_BUFFER_CONFIG *source_frame, const YV12_BUFFER_CONFIG *coded_frame, + uint32_t bit_depth, uint32_t input_bit_depth); #endif // VPX_VP9_ENCODER_VP9_EXT_RATECTRL_H_ diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index aa13fc9cf1..52da52ab87 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -1739,20 +1739,23 @@ static vpx_codec_err_t ctrl_set_external_rate_control(vpx_codec_alg_priv_t *ctx, VP9_COMP *cpi = ctx->cpi; EXT_RATECTRL *ext_ratectrl = &cpi->ext_ratectrl; const VP9EncoderConfig *oxcf = &cpi->oxcf; - const FRAME_INFO *frame_info = &cpi->frame_info; - vpx_rc_config_t ratectrl_config; - - ratectrl_config.frame_width = frame_info->frame_width; - ratectrl_config.frame_height = frame_info->frame_height; - ratectrl_config.show_frame_count = cpi->twopass.first_pass_info.num_frames; - - // TODO(angiebird): Double check whether this is the proper way to set up - // target_bitrate and frame_rate. - ratectrl_config.target_bitrate_kbps = (int)(oxcf->target_bandwidth / 1000); - ratectrl_config.frame_rate_num = oxcf->g_timebase.den; - ratectrl_config.frame_rate_den = oxcf->g_timebase.num; - - vp9_extrc_create(funcs, ratectrl_config, ext_ratectrl); + // TODO(angiebird): Check the possibility of this flag being set at pass == 1 + if (oxcf->pass == 2) { + const FRAME_INFO *frame_info = &cpi->frame_info; + vpx_rc_config_t ratectrl_config; + + ratectrl_config.frame_width = frame_info->frame_width; + ratectrl_config.frame_height = frame_info->frame_height; + ratectrl_config.show_frame_count = cpi->twopass.first_pass_info.num_frames; + + // TODO(angiebird): Double check whether this is the proper way to set up + // target_bitrate and frame_rate. + ratectrl_config.target_bitrate_kbps = (int)(oxcf->target_bandwidth / 1000); + ratectrl_config.frame_rate_num = oxcf->g_timebase.den; + ratectrl_config.frame_rate_den = oxcf->g_timebase.num; + + vp9_extrc_create(funcs, ratectrl_config, ext_ratectrl); + } return VPX_CODEC_OK; } From 94384b5c685ad3baac8989f19ee587eb72093a7f Mon Sep 17 00:00:00 2001 From: Marco Paniconi Date: Sun, 18 Oct 2020 22:42:48 -0700 Subject: [PATCH 0072/1000] vp9-rtc: Fix to control for disabling loopfilter Adding unit test. Change-Id: Ic3c03fee7e9c2c224d927bb09914551422bdf816 --- test/svc_end_to_end_test.cc | 96 +++++++++++++++++++++++++++++++++++-- vp9/encoder/vp9_encoder.c | 1 + 2 files changed, 93 insertions(+), 4 deletions(-) diff --git a/test/svc_end_to_end_test.cc b/test/svc_end_to_end_test.cc index edd4c887fc..b1ab0d7d95 100644 --- a/test/svc_end_to_end_test.cc +++ b/test/svc_end_to_end_test.cc @@ -121,7 +121,8 @@ class SyncFrameOnePassCbrSvc : public OnePassCbrSvc, frame_to_start_decode_(0), frame_to_sync_(0), inter_layer_pred_mode_(GET_PARAM(1)), decode_to_layer_before_sync_(-1), decode_to_layer_after_sync_(-1), denoiser_on_(0), - intra_only_test_(false), mismatch_nframes_(0), num_nonref_frames_(0) { + intra_only_test_(false), loopfilter_off_(0), mismatch_nframes_(0), + num_nonref_frames_(0) { SetMode(::libvpx_test::kRealTime); memset(&svc_layer_sync_, 0, sizeof(svc_layer_sync_)); } @@ -154,6 +155,8 @@ class SyncFrameOnePassCbrSvc : public OnePassCbrSvc, // So set it here in these tess to avoid encoder-decoder // mismatch check on color space setting. encoder->Control(VP9E_SET_COLOR_SPACE, VPX_CS_BT_601); + + encoder->Control(VP9E_SET_DISABLE_LOOPFILTER, loopfilter_off_); } if (video->frame() == frame_to_sync_) { encoder->Control(VP9E_SET_SVC_SPATIAL_LAYER_SYNC, &svc_layer_sync_); @@ -214,7 +217,10 @@ class SyncFrameOnePassCbrSvc : public OnePassCbrSvc, int decode_to_layer_after_sync_; int denoiser_on_; bool intra_only_test_; + int loopfilter_off_; vpx_svc_spatial_layer_sync_t svc_layer_sync_; + unsigned int mismatch_nframes_; + unsigned int num_nonref_frames_; private: virtual void SetConfig(const int num_temporal_layer) { @@ -243,9 +249,6 @@ class SyncFrameOnePassCbrSvc : public OnePassCbrSvc, cfg_.temporal_layering_mode = 1; } } - - unsigned int mismatch_nframes_; - unsigned int num_nonref_frames_; }; // Test for sync layer for 1 pass CBR SVC: 3 spatial layers and @@ -470,8 +473,93 @@ TEST_P(SyncFrameOnePassCbrSvc, OnePassCbrSvc1SL3TLSyncFrameIntraOnlyQVGA) { #endif } +// Params: Loopfilter modes. +class LoopfilterOnePassCbrSvc : public OnePassCbrSvc, + public ::libvpx_test::CodecTestWithParam { + public: + LoopfilterOnePassCbrSvc() + : OnePassCbrSvc(GET_PARAM(0)), loopfilter_off_(GET_PARAM(1)), + mismatch_nframes_(0), num_nonref_frames_(0) { + SetMode(::libvpx_test::kRealTime); + } + + protected: + virtual ~LoopfilterOnePassCbrSvc() {} + + virtual void SetUp() { + InitializeConfig(); + speed_setting_ = 7; + } + + virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) { + PreEncodeFrameHookSetup(video, encoder); + encoder->Control(VP9E_SET_DISABLE_LOOPFILTER, loopfilter_off_); + } + + virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + // Keep track of number of non-reference frames, needed for mismatch check. + // Non-reference frames are top spatial and temporal layer frames, + // for TL > 0. + if (temporal_layer_id_ == number_temporal_layers_ - 1 && + temporal_layer_id_ > 0 && + pkt->data.frame.spatial_layer_encoded[number_spatial_layers_ - 1]) + num_nonref_frames_++; + } + + virtual void MismatchHook(const vpx_image_t * /*img1*/, + const vpx_image_t * /*img2*/) { + ++mismatch_nframes_; + } + + virtual void SetConfig(const int /*num_temporal_layer*/) {} + + int GetMismatchFrames() const { return mismatch_nframes_; } + int GetNonRefFrames() const { return num_nonref_frames_; } + + int loopfilter_off_; + + private: + int mismatch_nframes_; + int num_nonref_frames_; +}; + +TEST_P(LoopfilterOnePassCbrSvc, OnePassCbrSvc1SL3TLLoopfilterOff) { + SetSvcConfig(1, 3); + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 1; + cfg_.rc_dropframe_thresh = 0; + cfg_.rc_target_bitrate = 800; + cfg_.kf_max_dist = 9999; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + cfg_.g_error_resilient = 1; + cfg_.ts_rate_decimator[0] = 4; + cfg_.ts_rate_decimator[1] = 2; + cfg_.ts_rate_decimator[2] = 1; + cfg_.temporal_layering_mode = 3; + + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + cfg_.rc_target_bitrate = 600; + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +#if CONFIG_VP9_DECODER + if (loopfilter_off_ == 0) + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); + else + EXPECT_EQ(GetMismatchFrames(), 0); +#endif +} + VP9_INSTANTIATE_TEST_SUITE(SyncFrameOnePassCbrSvc, ::testing::Range(0, 3)); +VP9_INSTANTIATE_TEST_SUITE(LoopfilterOnePassCbrSvc, ::testing::Range(0, 3)); + INSTANTIATE_TEST_SUITE_P( VP9, ScalePartitionOnePassCbrSvc, ::testing::Values( diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 3838a9de77..6ffe41edfe 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -3325,6 +3325,7 @@ static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) { if (cpi->loopfilter_ctrl == NO_LOOPFILTER || (!is_reference_frame && cpi->loopfilter_ctrl == LOOPFILTER_REFERENCE)) { lf->filter_level = 0; + vpx_extend_frame_inner_borders(cm->frame_to_show); return; } From 90271b22016b89a115206a38ab4790a76cce3553 Mon Sep 17 00:00:00 2001 From: angiebird Date: Mon, 12 Oct 2020 15:12:03 -0700 Subject: [PATCH 0073/1000] Add vpx_rc_status_t Let callback functions in vpx_ext_ratectrl.h return vpx_rc_status_t Bug: webm:1707 Change-Id: I4ebed47278b228740f6c73b07aa472787b2617d2 --- test/vp9_ext_ratectrl_test.cc | 26 ++++++++++++++------------ vpx/vpx_ext_ratectrl.h | 16 +++++++++++----- 2 files changed, 25 insertions(+), 17 deletions(-) diff --git a/test/vp9_ext_ratectrl_test.cc b/test/vp9_ext_ratectrl_test.cc index aa36093445..2de3cb036f 100644 --- a/test/vp9_ext_ratectrl_test.cc +++ b/test/vp9_ext_ratectrl_test.cc @@ -27,8 +27,9 @@ struct ToyRateCtrl { int coding_index; }; -int rc_create_model(void *priv, const vpx_rc_config_t *ratectrl_config, - vpx_rc_model_t *rate_ctrl_model_pt) { +vpx_rc_status_t rc_create_model(void *priv, + const vpx_rc_config_t *ratectrl_config, + vpx_rc_model_t *rate_ctrl_model_pt) { ToyRateCtrl *toy_rate_ctrl = new (std::nothrow) ToyRateCtrl; EXPECT_NE(toy_rate_ctrl, nullptr); toy_rate_ctrl->magic_number = kModelMagicNumber; @@ -41,11 +42,12 @@ int rc_create_model(void *priv, const vpx_rc_config_t *ratectrl_config, EXPECT_EQ(ratectrl_config->target_bitrate_kbps, 24000); EXPECT_EQ(ratectrl_config->frame_rate_num, 30); EXPECT_EQ(ratectrl_config->frame_rate_den, 1); - return 0; + return vpx_rc_ok; } -int rc_send_firstpass_stats(vpx_rc_model_t rate_ctrl_model, - const vpx_rc_firstpass_stats_t *first_pass_stats) { +vpx_rc_status_t rc_send_firstpass_stats( + vpx_rc_model_t rate_ctrl_model, + const vpx_rc_firstpass_stats_t *first_pass_stats) { const ToyRateCtrl *toy_rate_ctrl = static_cast(rate_ctrl_model); EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); @@ -53,10 +55,10 @@ int rc_send_firstpass_stats(vpx_rc_model_t rate_ctrl_model, for (int i = 0; i < first_pass_stats->num_frames; ++i) { EXPECT_DOUBLE_EQ(first_pass_stats->frame_stats[i].frame, i); } - return 0; + return vpx_rc_ok; } -int rc_get_encodeframe_decision( +vpx_rc_status_t rc_get_encodeframe_decision( vpx_rc_model_t rate_ctrl_model, const vpx_rc_encodeframe_info_t *encode_frame_info, vpx_rc_encodeframe_decision_t *frame_decision) { @@ -90,10 +92,10 @@ int rc_get_encodeframe_decision( } else { frame_decision->q_index = 100; } - return 0; + return vpx_rc_ok; } -int rc_update_encodeframe_result( +vpx_rc_status_t rc_update_encodeframe_result( vpx_rc_model_t rate_ctrl_model, const vpx_rc_encodeframe_result_t *encode_frame_result) { const ToyRateCtrl *toy_rate_ctrl = @@ -105,14 +107,14 @@ int rc_update_encodeframe_result( if (toy_rate_ctrl->coding_index == kLosslessCodingIndex) { EXPECT_EQ(encode_frame_result->sse, 0); } - return 0; + return vpx_rc_ok; } -int rc_delete_model(vpx_rc_model_t rate_ctrl_model) { +vpx_rc_status_t rc_delete_model(vpx_rc_model_t rate_ctrl_model) { ToyRateCtrl *toy_rate_ctrl = static_cast(rate_ctrl_model); EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); delete toy_rate_ctrl; - return 0; + return vpx_rc_ok; } class ExtRateCtrlTest : public ::libvpx_test::EncoderTest, diff --git a/vpx/vpx_ext_ratectrl.h b/vpx/vpx_ext_ratectrl.h index c99434457b..2f42660960 100644 --- a/vpx/vpx_ext_ratectrl.h +++ b/vpx/vpx_ext_ratectrl.h @@ -39,6 +39,11 @@ typedef struct vpx_rc_encodeframe_result { int64_t pixel_count; } vpx_rc_encodeframe_result_t; +typedef enum vpx_rc_status { + vpx_rc_ok = 0, + vpx_rc_error = 1, +} vpx_rc_status_t; + // This is a mirror of vp9's FIRSTPASS_STATS // Only spatial_layer_id is omitted typedef struct vpx_rc_frame_stats { @@ -92,7 +97,7 @@ typedef struct vpx_rc_config { * \param[in] ratectrl_config Pointer to vpx_rc_config_t * \param[out] rate_ctrl_model_pt Pointer to vpx_rc_model_t */ -typedef int (*vpx_rc_create_model_cb_fn_t)( +typedef vpx_rc_status_t (*vpx_rc_create_model_cb_fn_t)( void *priv, const vpx_rc_config_t *ratectrl_config, vpx_rc_model_t *rate_ctrl_model_pt); @@ -105,7 +110,7 @@ typedef int (*vpx_rc_create_model_cb_fn_t)( * \param[in] rate_ctrl_model rate control model * \param[in] first_pass_stats first pass stats */ -typedef int (*vpx_rc_send_firstpass_stats_cb_fn_t)( +typedef vpx_rc_status_t (*vpx_rc_send_firstpass_stats_cb_fn_t)( vpx_rc_model_t rate_ctrl_model, const vpx_rc_firstpass_stats_t *first_pass_stats); @@ -118,7 +123,7 @@ typedef int (*vpx_rc_send_firstpass_stats_cb_fn_t)( * \param[in] encode_frame_info information of the coding frame * \param[out] frame_decision encode decision of the coding frame */ -typedef int (*vpx_rc_get_encodeframe_decision_cb_fn_t)( +typedef vpx_rc_status_t (*vpx_rc_get_encodeframe_decision_cb_fn_t)( vpx_rc_model_t rate_ctrl_model, const vpx_rc_encodeframe_info_t *encode_frame_info, vpx_rc_encodeframe_decision_t *frame_decision); @@ -131,7 +136,7 @@ typedef int (*vpx_rc_get_encodeframe_decision_cb_fn_t)( * \param[in] rate_ctrl_model rate control model * \param[out] encode_frame_result encode result of the coding frame */ -typedef int (*vpx_rc_update_encodeframe_result_cb_fn_t)( +typedef vpx_rc_status_t (*vpx_rc_update_encodeframe_result_cb_fn_t)( vpx_rc_model_t rate_ctrl_model, const vpx_rc_encodeframe_result_t *encode_frame_result); @@ -142,7 +147,8 @@ typedef int (*vpx_rc_update_encodeframe_result_cb_fn_t)( * * \param[in] rate_ctrl_model rate control model */ -typedef int (*vpx_rc_delete_model_cb_fn_t)(vpx_rc_model_t rate_ctrl_model); +typedef vpx_rc_status_t (*vpx_rc_delete_model_cb_fn_t)( + vpx_rc_model_t rate_ctrl_model); typedef struct vpx_rc_funcs { vpx_rc_create_model_cb_fn_t create_model; From 9bfdf4a9d0c35706012fdcf7657d57c68bca0eb8 Mon Sep 17 00:00:00 2001 From: Angie Chiang Date: Fri, 16 Oct 2020 18:36:09 -0700 Subject: [PATCH 0074/1000] Add ref frame info to vpx_rc_encodeframe_info_t Bug: webm:1707 Change-Id: I2ff9e54a9c8ae535628c1c471a2d078652f49a31 --- test/vp9_ext_ratectrl_test.cc | 26 +++++++++ vp9/common/vp9_onyxc_int.h | 6 +- vp9/encoder/vp9_encoder.c | 102 ++++++++++++++++++--------------- vp9/encoder/vp9_encoder.h | 5 ++ vp9/encoder/vp9_ext_ratectrl.c | 13 ++++- vp9/encoder/vp9_ext_ratectrl.h | 6 +- vpx/vpx_ext_ratectrl.h | 2 + 7 files changed, 106 insertions(+), 54 deletions(-) diff --git a/test/vp9_ext_ratectrl_test.cc b/test/vp9_ext_ratectrl_test.cc index 2de3cb036f..708718164f 100644 --- a/test/vp9_ext_ratectrl_test.cc +++ b/test/vp9_ext_ratectrl_test.cc @@ -72,10 +72,24 @@ vpx_rc_status_t rc_get_encodeframe_decision( if (encode_frame_info->coding_index == 0) { EXPECT_EQ(encode_frame_info->frame_type, 0 /*kFrameTypeKey*/); + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0], + 0); // kRefFrameTypeLast + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1], + 0); // kRefFrameTypePast + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2], + 0); // kRefFrameTypeFuture } if (encode_frame_info->coding_index == 1) { EXPECT_EQ(encode_frame_info->frame_type, 2 /*kFrameTypeAltRef*/); + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0], + 1); // kRefFrameTypeLast + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1], + 0); // kRefFrameTypePast + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2], + 0); // kRefFrameTypeFuture + EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[0], + 0); // kRefFrameTypeLast } if (encode_frame_info->coding_index >= 2 && @@ -85,6 +99,18 @@ vpx_rc_status_t rc_get_encodeframe_decision( if (encode_frame_info->coding_index == 5) { EXPECT_EQ(encode_frame_info->frame_type, 3 /*kFrameTypeOverlay*/); + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0], + 1); // kRefFrameTypeLast + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1], + 1); // kRefFrameTypePast + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2], + 1); // kRefFrameTypeFuture + EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[0], + 4); // kRefFrameTypeLast + EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[1], + 0); // kRefFrameTypePast + EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[2], + 1); // kRefFrameTypeFuture } if (encode_frame_info->coding_index == kLosslessCodingIndex) { // We should get sse == 0 at rc_update_encodeframe_result() diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h index ea47218b84..1cfc12f6fa 100644 --- a/vp9/common/vp9_onyxc_int.h +++ b/vp9/common/vp9_onyxc_int.h @@ -75,12 +75,10 @@ typedef struct { // TODO(angiebird): Set frame_index/frame_coding_index on the decoder side // properly. - int frame_index; // Display order in the video, it's equivalent to the - // show_idx defined in EncodeFrameInfo. -#if CONFIG_RATE_CTRL + int frame_index; // Display order in the video, it's equivalent to the + // show_idx defined in EncodeFrameInfo. int frame_coding_index; // The coding order (starting from zero) of this // frame. -#endif // CONFIG_RATE_CTRL vpx_codec_frame_buffer_t raw_frame_buffer; YV12_BUFFER_CONFIG buf; } RefCntBuffer; diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 084e1bab89..cac04405f7 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -4205,6 +4205,27 @@ static int encode_without_recode_loop(VP9_COMP *cpi, size_t *size, return 1; } +static int get_ref_frame_flags(const VP9_COMP *cpi) { + const int *const map = cpi->common.ref_frame_map; + const int gold_is_last = map[cpi->gld_fb_idx] == map[cpi->lst_fb_idx]; + const int alt_is_last = map[cpi->alt_fb_idx] == map[cpi->lst_fb_idx]; + const int gold_is_alt = map[cpi->gld_fb_idx] == map[cpi->alt_fb_idx]; + int flags = VP9_ALT_FLAG | VP9_GOLD_FLAG | VP9_LAST_FLAG; + + if (gold_is_last) flags &= ~VP9_GOLD_FLAG; + + if (cpi->rc.frames_till_gf_update_due == INT_MAX && + (cpi->svc.number_temporal_layers == 1 && + cpi->svc.number_spatial_layers == 1)) + flags &= ~VP9_GOLD_FLAG; + + if (alt_is_last) flags &= ~VP9_ALT_FLAG; + + if (gold_is_alt) flags &= ~VP9_ALT_FLAG; + + return flags; +} + #if !CONFIG_REALTIME_ONLY #define MAX_QSTEP_ADJ 4 static int get_qstep_adj(int rate_excess, int rate_limit) { @@ -4481,9 +4502,14 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest if (cpi->ext_ratectrl.ready) { const GF_GROUP *gf_group = &cpi->twopass.gf_group; vpx_rc_encodeframe_decision_t encode_frame_decision; + FRAME_UPDATE_TYPE update_type = gf_group->update_type[gf_group->index]; + const int ref_frame_flags = get_ref_frame_flags(cpi); + RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES]; + get_ref_frame_bufs(cpi, ref_frame_bufs); vp9_extrc_get_encodeframe_decision( - &cpi->ext_ratectrl, gf_group, cm->current_video_frame, - cm->current_frame_coding_index, &encode_frame_decision); + &cpi->ext_ratectrl, cm->current_video_frame, + cm->current_frame_coding_index, update_type, ref_frame_bufs, + ref_frame_flags, &encode_frame_decision); q = encode_frame_decision.q_index; } @@ -4772,27 +4798,6 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest } #endif // !CONFIG_REALTIME_ONLY -static int get_ref_frame_flags(const VP9_COMP *cpi) { - const int *const map = cpi->common.ref_frame_map; - const int gold_is_last = map[cpi->gld_fb_idx] == map[cpi->lst_fb_idx]; - const int alt_is_last = map[cpi->alt_fb_idx] == map[cpi->lst_fb_idx]; - const int gold_is_alt = map[cpi->gld_fb_idx] == map[cpi->alt_fb_idx]; - int flags = VP9_ALT_FLAG | VP9_GOLD_FLAG | VP9_LAST_FLAG; - - if (gold_is_last) flags &= ~VP9_GOLD_FLAG; - - if (cpi->rc.frames_till_gf_update_due == INT_MAX && - (cpi->svc.number_temporal_layers == 1 && - cpi->svc.number_spatial_layers == 1)) - flags &= ~VP9_GOLD_FLAG; - - if (alt_is_last) flags &= ~VP9_ALT_FLAG; - - if (gold_is_alt) flags &= ~VP9_ALT_FLAG; - - return flags; -} - static void set_ext_overrides(VP9_COMP *cpi) { // Overrides the defaults with the externally supplied values with // vp9_update_reference() and vp9_update_entropy() calls @@ -5097,9 +5102,7 @@ static void set_frame_index(VP9_COMP *cpi, VP9_COMMON *cm) { const GF_GROUP *const gf_group = &cpi->twopass.gf_group; ref_buffer->frame_index = cm->current_video_frame + gf_group->arf_src_offset[gf_group->index]; -#if CONFIG_RATE_CTRL ref_buffer->frame_coding_index = cm->current_frame_coding_index; -#endif // CONFIG_RATE_CTRL } } @@ -7391,6 +7394,30 @@ static void setup_tpl_stats(VP9_COMP *cpi) { #endif // CONFIG_NON_GREEDY_MV } +void vp9_get_ref_frame_info(FRAME_UPDATE_TYPE update_type, int ref_frame_flags, + RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES], + int *ref_frame_coding_indexes, + int *ref_frame_valid_list) { + if (update_type != KF_UPDATE) { + const VP9_REFFRAME inter_ref_flags[MAX_INTER_REF_FRAMES] = { VP9_LAST_FLAG, + VP9_GOLD_FLAG, + VP9_ALT_FLAG }; + int i; + for (i = 0; i < MAX_INTER_REF_FRAMES; ++i) { + assert(ref_frame_bufs[i] != NULL); + ref_frame_coding_indexes[i] = ref_frame_bufs[i]->frame_coding_index; + ref_frame_valid_list[i] = (ref_frame_flags & inter_ref_flags[i]) != 0; + } + } else { + // No reference frame is available when this is a key frame. + int i; + for (i = 0; i < MAX_INTER_REF_FRAMES; ++i) { + ref_frame_coding_indexes[i] = -1; + ref_frame_valid_list[i] = 0; + } + } +} + #if !CONFIG_REALTIME_ONLY #if CONFIG_RATE_CTRL static void copy_frame_counts(const FRAME_COUNTS *input_counts, @@ -7538,6 +7565,7 @@ static void yv12_buffer_to_image_buffer(const YV12_BUFFER_CONFIG *yv12_buffer, } } #endif // CONFIG_RATE_CTRL + static void update_encode_frame_result( int ref_frame_flags, FRAME_UPDATE_TYPE update_type, const YV12_BUFFER_CONFIG *source_frame, const RefCntBuffer *coded_frame_buf, @@ -7560,26 +7588,10 @@ static void update_encode_frame_result( #endif // CONFIG_VP9_HIGHBITDEPTH encode_frame_result->frame_coding_index = coded_frame_buf->frame_coding_index; - if (update_type != KF_UPDATE) { - const VP9_REFFRAME inter_ref_flags[MAX_INTER_REF_FRAMES] = { VP9_LAST_FLAG, - VP9_GOLD_FLAG, - VP9_ALT_FLAG }; - int i; - for (i = 0; i < MAX_INTER_REF_FRAMES; ++i) { - assert(ref_frame_bufs[i] != NULL); - encode_frame_result->ref_frame_coding_indexes[i] = - ref_frame_bufs[i]->frame_coding_index; - encode_frame_result->ref_frame_valid_list[i] = - (ref_frame_flags & inter_ref_flags[i]) != 0; - } - } else { - // No reference frame is available when this is a key frame. - int i; - for (i = 0; i < MAX_INTER_REF_FRAMES; ++i) { - encode_frame_result->ref_frame_coding_indexes[i] = -1; - encode_frame_result->ref_frame_valid_list[i] = 0; - } - } + vp9_get_ref_frame_info(update_type, ref_frame_flags, ref_frame_bufs, + encode_frame_result->ref_frame_coding_indexes, + encode_frame_result->ref_frame_valid_list); + encode_frame_result->psnr = psnr.psnr[0]; encode_frame_result->sse = psnr.sse[0]; copy_frame_counts(counts, &encode_frame_result->frame_counts); diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index b35d56608e..91cb6f5b14 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -1268,6 +1268,11 @@ void vp9_scale_references(VP9_COMP *cpi); void vp9_update_reference_frames(VP9_COMP *cpi); +void vp9_get_ref_frame_info(FRAME_UPDATE_TYPE update_type, int ref_frame_flags, + RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES], + int *ref_frame_coding_indexes, + int *ref_frame_valid_list); + void vp9_set_high_precision_mv(VP9_COMP *cpi, int allow_high_precision_mv); YV12_BUFFER_CONFIG *vp9_svc_twostage_scale( diff --git a/vp9/encoder/vp9_ext_ratectrl.c b/vp9/encoder/vp9_ext_ratectrl.c index ca75651d44..94c2addd25 100644 --- a/vp9/encoder/vp9_ext_ratectrl.c +++ b/vp9/encoder/vp9_ext_ratectrl.c @@ -9,6 +9,7 @@ */ #include "vp9/encoder/vp9_ext_ratectrl.h" +#include "vp9/encoder/vp9_encoder.h" #include "vp9/common/vp9_common.h" #include "vpx_dsp/psnr.h" @@ -102,14 +103,20 @@ static int extrc_get_frame_type(FRAME_UPDATE_TYPE update_type) { } void vp9_extrc_get_encodeframe_decision( - EXT_RATECTRL *ext_ratectrl, const GF_GROUP *gf_group, int show_index, - int coding_index, vpx_rc_encodeframe_decision_t *encode_frame_decision) { + EXT_RATECTRL *ext_ratectrl, int show_index, int coding_index, + FRAME_UPDATE_TYPE update_type, + RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES], int ref_frame_flags, + vpx_rc_encodeframe_decision_t *encode_frame_decision) { if (ext_ratectrl->ready) { - FRAME_UPDATE_TYPE update_type = gf_group->update_type[gf_group->index]; vpx_rc_encodeframe_info_t encode_frame_info; encode_frame_info.show_index = show_index; encode_frame_info.coding_index = coding_index; encode_frame_info.frame_type = extrc_get_frame_type(update_type); + + vp9_get_ref_frame_info(update_type, ref_frame_flags, ref_frame_bufs, + encode_frame_info.ref_frame_coding_indexes, + encode_frame_info.ref_frame_valid_list); + ext_ratectrl->funcs.get_encodeframe_decision( ext_ratectrl->model, &encode_frame_info, encode_frame_decision); } diff --git a/vp9/encoder/vp9_ext_ratectrl.h b/vp9/encoder/vp9_ext_ratectrl.h index fe8a66cf3c..fb6cfe1ac8 100644 --- a/vp9/encoder/vp9_ext_ratectrl.h +++ b/vp9/encoder/vp9_ext_ratectrl.h @@ -33,8 +33,10 @@ void vp9_extrc_send_firstpass_stats(EXT_RATECTRL *ext_ratectrl, const FIRST_PASS_INFO *first_pass_info); void vp9_extrc_get_encodeframe_decision( - EXT_RATECTRL *ext_ratectrl, const GF_GROUP *gf_group, int show_index, - int coding_index, vpx_rc_encodeframe_decision_t *encode_frame_decision); + EXT_RATECTRL *ext_ratectrl, int show_index, int coding_index, + FRAME_UPDATE_TYPE update_type, + RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES], int ref_frame_flags, + vpx_rc_encodeframe_decision_t *encode_frame_decision); void vp9_extrc_update_encodeframe_result(EXT_RATECTRL *ext_ratectrl, int64_t bit_count, diff --git a/vpx/vpx_ext_ratectrl.h b/vpx/vpx_ext_ratectrl.h index 2f42660960..5d5a7c92d5 100644 --- a/vpx/vpx_ext_ratectrl.h +++ b/vpx/vpx_ext_ratectrl.h @@ -31,6 +31,8 @@ typedef struct vpx_rc_encodeframe_info { int frame_type; int show_index; int coding_index; + int ref_frame_coding_indexes[3]; + int ref_frame_valid_list[3]; } vpx_rc_encodeframe_info_t; typedef struct vpx_rc_encodeframe_result { From a207a0f6b9af8e05e4379d5a8f29f549d3b32c61 Mon Sep 17 00:00:00 2001 From: Angie Chiang Date: Tue, 20 Oct 2020 17:32:27 -0700 Subject: [PATCH 0075/1000] Small changes of vp9_ext_ratectrl_test.cc Change-Id: I27932c41a826cd3c10cc7801956cd32e4877133a --- test/vp9_ext_ratectrl_test.cc | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/test/vp9_ext_ratectrl_test.cc b/test/vp9_ext_ratectrl_test.cc index 708718164f..a8c4032d45 100644 --- a/test/vp9_ext_ratectrl_test.cc +++ b/test/vp9_ext_ratectrl_test.cc @@ -8,6 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include + #include "test/codec_factory.h" #include "test/encode_test_driver.h" #include "test/util.h" @@ -18,7 +20,7 @@ namespace { constexpr int kModelMagicNumber = 51396; -constexpr unsigned int PrivMagicNumber = 5566UL; +constexpr unsigned int PrivMagicNumber = 5566; constexpr int kFrameNum = 5; constexpr int kLosslessCodingIndex = 2; @@ -34,7 +36,7 @@ vpx_rc_status_t rc_create_model(void *priv, EXPECT_NE(toy_rate_ctrl, nullptr); toy_rate_ctrl->magic_number = kModelMagicNumber; toy_rate_ctrl->coding_index = -1; - *rate_ctrl_model_pt = (vpx_rc_model_t)toy_rate_ctrl; + *rate_ctrl_model_pt = toy_rate_ctrl; EXPECT_EQ(priv, reinterpret_cast(PrivMagicNumber)); EXPECT_EQ(ratectrl_config->frame_width, 352); EXPECT_EQ(ratectrl_config->frame_height, 288); @@ -128,7 +130,7 @@ vpx_rc_status_t rc_update_encodeframe_result( static_cast(rate_ctrl_model); EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); - int64_t ref_pixel_count = 352 * 288 * 3 / 2; + const int64_t ref_pixel_count = 352 * 288 * 3 / 2; EXPECT_EQ(encode_frame_result->pixel_count, ref_pixel_count); if (toy_rate_ctrl->coding_index == kLosslessCodingIndex) { EXPECT_EQ(encode_frame_result->sse, 0); @@ -174,11 +176,12 @@ TEST_F(ExtRateCtrlTest, EncodeTest) { cfg_.rc_target_bitrate = 24000; std::unique_ptr video; - video.reset(new libvpx_test::YUVVideoSource("bus_352x288_420_f20_b8.yuv", - VPX_IMG_FMT_I420, 352, 288, 30, 1, - 0, kFrameNum)); + video.reset(new (std::nothrow) libvpx_test::YUVVideoSource( + "bus_352x288_420_f20_b8.yuv", VPX_IMG_FMT_I420, 352, 288, 30, 1, 0, + kFrameNum)); ASSERT_NE(video.get(), nullptr); ASSERT_NO_FATAL_FAILURE(RunLoop(video.get())); } + } // namespace From 16154dae714c3e88bfa17c25d5d6ad8198fac63a Mon Sep 17 00:00:00 2001 From: Angie Chiang Date: Mon, 26 Oct 2020 11:09:59 -0700 Subject: [PATCH 0076/1000] Download bus_352x288_420_f20_b8.yuv properly Bug: webm:1707 Change-Id: I6aabad7cdcddf2bc41a0cc7b5cdfd7d9759f9fae --- test/test-data.mk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test-data.mk b/test/test-data.mk index 436dfa4e49..744901b115 100644 --- a/test/test-data.mk +++ b/test/test-data.mk @@ -29,7 +29,7 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += screendata.y4m LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += niklas_640_480_30.yuv LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += rc_interface_test_one_layer LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += rc_interface_test_svc -LIBVPX_TEST_DATA-$(CONFIG_RATE_CTRL) += bus_352x288_420_f20_b8.yuv +LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += bus_352x288_420_f20_b8.yuv # Test vectors LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-001.ivf From 8b8b15e086dae3ff99c6096c5c6b6b85eb2d017a Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Tue, 27 Oct 2020 11:02:13 -0700 Subject: [PATCH 0077/1000] Add cmd line option to control loopfilter for vpxenc Change-Id: I4f5e6ce2f1b535a586bdb6c9e55a3d49ebf61af4 --- vpxenc.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/vpxenc.c b/vpxenc.c index 64288e83d2..8c92f23917 100644 --- a/vpxenc.c +++ b/vpxenc.c @@ -465,6 +465,13 @@ static const arg_def_t target_level = ARG_DEF( static const arg_def_t row_mt = ARG_DEF(NULL, "row-mt", 1, "Enable row based non-deterministic multi-threading in VP9"); + +static const arg_def_t disable_loopfilter = + ARG_DEF(NULL, "disable-loopfilter", 1, + "Control Loopfilter in VP9\n" + "0: Loopfilter on for all frames (default)\n" + "1: Loopfilter off for non reference frames\n" + "2: Loopfilter off for all frames"); #endif #if CONFIG_VP9_ENCODER @@ -495,6 +502,7 @@ static const arg_def_t *vp9_args[] = { &cpu_used_vp9, &max_gf_interval, &target_level, &row_mt, + &disable_loopfilter, #if CONFIG_VP9_HIGHBITDEPTH &bitdeptharg, &inbitdeptharg, @@ -527,6 +535,7 @@ static const int vp9_arg_ctrl_map[] = { VP8E_SET_CPUUSED, VP9E_SET_MAX_GF_INTERVAL, VP9E_SET_TARGET_LEVEL, VP9E_SET_ROW_MT, + VP9E_SET_DISABLE_LOOPFILTER, 0 }; #endif From 89ddf6f32a94c636ac9e6de4096295a058269222 Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 27 Oct 2020 17:09:08 -0700 Subject: [PATCH 0078/1000] vp9_ext_ratectrl_test: add missing override for ~ExtRateCtrlTest() Change-Id: I311a400093c8c1ee2c002ba000d0b33c4fde209f --- test/vp9_ext_ratectrl_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/vp9_ext_ratectrl_test.cc b/test/vp9_ext_ratectrl_test.cc index a8c4032d45..8db0a358d0 100644 --- a/test/vp9_ext_ratectrl_test.cc +++ b/test/vp9_ext_ratectrl_test.cc @@ -150,7 +150,7 @@ class ExtRateCtrlTest : public ::libvpx_test::EncoderTest, protected: ExtRateCtrlTest() : EncoderTest(&::libvpx_test::kVP9) {} - ~ExtRateCtrlTest() = default; + ~ExtRateCtrlTest() override = default; void SetUp() override { InitializeConfig(); From 8b27a92490347d7e5e818a9371783c17dc0a4da8 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Sun, 20 Sep 2020 11:57:13 -0700 Subject: [PATCH 0079/1000] Add a comment about bitdeptharg and inbitdeptharg Add a comment to vp9_args to point out that bitdeptharg and inbitdeptharg do not have a corresponding entry in vp9_arg_ctrl_map and must be listed at the end of vp9_args. Change-Id: Ic9834ab72599c067156ca5a315824c7f0760824a --- vpxenc.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vpxenc.c b/vpxenc.c index 8c92f23917..5d7546eb28 100644 --- a/vpxenc.c +++ b/vpxenc.c @@ -503,6 +503,9 @@ static const arg_def_t *vp9_args[] = { &cpu_used_vp9, &target_level, &row_mt, &disable_loopfilter, +// NOTE: The entries above have a corresponding entry in vp9_arg_ctrl_map. The +// entries below do not have a corresponding entry in vp9_arg_ctrl_map. They +// must be listed at the end of vp9_args. #if CONFIG_VP9_HIGHBITDEPTH &bitdeptharg, &inbitdeptharg, From 9ab65c55d987be33601c6922ce62a1456b122fe2 Mon Sep 17 00:00:00 2001 From: James Zern Date: Thu, 29 Oct 2020 15:46:21 -0700 Subject: [PATCH 0080/1000] libs.mk: set LC_ALL=C w/egrep invocations this guarantees consistent interpretation of the character ranges BUG=webm:1711 Change-Id: Ia9123f079cc7ac248b9eff4d817e2e103d627b2b --- libs.mk | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libs.mk b/libs.mk index d14439a3da..b5bc35755c 100644 --- a/libs.mk +++ b/libs.mk @@ -420,13 +420,13 @@ ifeq ($(VPX_ARCH_X86)$(VPX_ARCH_X86_64),yes) # YASM $(BUILD_PFX)vpx_config.asm: $(BUILD_PFX)vpx_config.h @echo " [CREATE] $@" - @egrep "#define [A-Z0-9_]+ [01]" $< \ + @LC_ALL=C egrep "#define [A-Z0-9_]+ [01]" $< \ | awk '{print $$2 " equ " $$3}' > $@ else ADS2GAS=$(if $(filter yes,$(CONFIG_GCC)),| $(ASM_CONVERSION)) $(BUILD_PFX)vpx_config.asm: $(BUILD_PFX)vpx_config.h @echo " [CREATE] $@" - @egrep "#define [A-Z0-9_]+ [01]" $< \ + @LC_ALL=C egrep "#define [A-Z0-9_]+ [01]" $< \ | awk '{print $$2 " EQU " $$3}' $(ADS2GAS) > $@ @echo " END" $(ADS2GAS) >> $@ CLEAN-OBJS += $(BUILD_PFX)vpx_config.asm From b1d704f12af9b96b39ce1e1493c36bb4b3a3fb2a Mon Sep 17 00:00:00 2001 From: Cheng Chen Date: Thu, 5 Nov 2020 15:26:54 -0800 Subject: [PATCH 0081/1000] Accumulate frame tpl stats and pass through rate control api Tpl stats is computed at the beginning of encoding the altref frame. We aggregate tpl stats of all blocks for every frame of the current group of picture. After the altref frame is encoded, the tpl stats is passed through the encode frame result to external environment. Change-Id: I2284f8cf9c45d35ba02f3ea45f0187edbbf48294 --- vp9/encoder/vp9_encoder.c | 53 ++++++++++++++++++++++++++++++++++++++- vp9/encoder/vp9_encoder.h | 19 ++++++++++++++ vp9/simple_encode.cc | 30 ++++++++++++++++++++-- vp9/simple_encode.h | 30 ++++++++++++++++++++++ 4 files changed, 129 insertions(+), 3 deletions(-) diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index f2e6ba1acd..f4587d42d9 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -1025,6 +1025,7 @@ static void dealloc_compressor_data(VP9_COMP *cpi) { free_partition_info(cpi); free_motion_vector_info(cpi); free_fp_motion_vector_info(cpi); + free_tpl_stats_info(cpi); #endif vp9_free_ref_frame_buffers(cm->buffer_pool); @@ -2665,6 +2666,7 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf, partition_info_init(cpi); motion_vector_info_init(cpi); fp_motion_vector_info_init(cpi); + tpl_stats_info_init(cpi); #endif return cpi; @@ -5306,6 +5308,7 @@ static void update_encode_frame_result( #if CONFIG_RATE_CTRL const PARTITION_INFO *partition_info, const MOTION_VECTOR_INFO *motion_vector_info, + const TplDepStats *tpl_stats_info, #endif // CONFIG_RATE_CTRL ENCODE_FRAME_RESULT *encode_frame_result); #endif // !CONFIG_REALTIME_ONLY @@ -5520,7 +5523,7 @@ static void encode_frame_to_data_rate( cpi->Source, coded_frame_buf, ref_frame_bufs, vp9_get_quantizer(cpi), cm->bit_depth, cpi->oxcf.input_bit_depth, cpi->td.counts, #if CONFIG_RATE_CTRL - cpi->partition_info, cpi->motion_vector_info, + cpi->partition_info, cpi->motion_vector_info, cpi->tpl_stats_info, #endif // CONFIG_RATE_CTRL encode_frame_result); } @@ -7371,6 +7374,48 @@ static void free_tpl_buffer(VP9_COMP *cpi) { } } +#if CONFIG_RATE_CTRL +static void accumulate_frame_tpl_stats(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + const GF_GROUP *gf_group = &cpi->twopass.gf_group; + int show_frame_count = 0; + int frame_idx; + // Accumulate tpl stats for each frame in the current group of picture. + for (frame_idx = 1; frame_idx < gf_group->gf_group_size; ++frame_idx) { + TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx]; + if (!tpl_frame->is_valid) continue; + + TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr; + const int tpl_stride = tpl_frame->stride; + int64_t intra_cost_base = 0; + int64_t inter_cost_base = 0; + int64_t mc_dep_cost_base = 0; + int64_t mc_ref_cost_base = 0; + int64_t mc_flow_base = 0; + int row, col; + + for (row = 0; row < cm->mi_rows && tpl_frame->is_valid; ++row) { + for (col = 0; col < cm->mi_cols; ++col) { + TplDepStats *this_stats = &tpl_stats[row * tpl_stride + col]; + intra_cost_base += this_stats->intra_cost; + inter_cost_base += this_stats->inter_cost; + mc_dep_cost_base += this_stats->mc_dep_cost; + mc_ref_cost_base += this_stats->mc_ref_cost; + mc_flow_base += this_stats->mc_flow; + } + } + + cpi->tpl_stats_info[show_frame_count].intra_cost = intra_cost_base; + cpi->tpl_stats_info[show_frame_count].inter_cost = inter_cost_base; + cpi->tpl_stats_info[show_frame_count].mc_dep_cost = mc_dep_cost_base; + cpi->tpl_stats_info[show_frame_count].mc_ref_cost = mc_ref_cost_base; + cpi->tpl_stats_info[show_frame_count].mc_flow = mc_flow_base; + + ++show_frame_count; + } +} +#endif // CONFIG_RATE_CTRL + static void setup_tpl_stats(VP9_COMP *cpi) { GF_PICTURE gf_picture[MAX_ARF_GOP_SIZE]; const GF_GROUP *gf_group = &cpi->twopass.gf_group; @@ -7393,6 +7438,10 @@ static void setup_tpl_stats(VP9_COMP *cpi) { dump_tpl_stats(cpi, tpl_group_frames, gf_group, gf_picture, cpi->tpl_bsize); #endif // DUMP_TPL_STATS #endif // CONFIG_NON_GREEDY_MV + +#if CONFIG_RATE_CTRL + accumulate_frame_tpl_stats(cpi); +#endif // CONFIG_RATE_CTRL } void vp9_get_ref_frame_info(FRAME_UPDATE_TYPE update_type, int ref_frame_flags, @@ -7575,6 +7624,7 @@ static void update_encode_frame_result( #if CONFIG_RATE_CTRL const PARTITION_INFO *partition_info, const MOTION_VECTOR_INFO *motion_vector_info, + const TplDepStats *tpl_stats_info, #endif // CONFIG_RATE_CTRL ENCODE_FRAME_RESULT *encode_frame_result) { #if CONFIG_RATE_CTRL @@ -7598,6 +7648,7 @@ static void update_encode_frame_result( copy_frame_counts(counts, &encode_frame_result->frame_counts); encode_frame_result->partition_info = partition_info; encode_frame_result->motion_vector_info = motion_vector_info; + encode_frame_result->tpl_stats_info = tpl_stats_info; if (encode_frame_result->coded_frame.allocated) { yv12_buffer_to_image_buffer(&coded_frame_buf->buf, &encode_frame_result->coded_frame); diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 91cb6f5b14..8763a5e789 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -973,6 +973,7 @@ typedef struct VP9_COMP { PARTITION_INFO *partition_info; MOTION_VECTOR_INFO *motion_vector_info; MOTION_VECTOR_INFO *fp_motion_vector_info; + TplDepStats *tpl_stats_info; RATE_QSTEP_MODEL rq_model[ENCODE_FRAME_TYPES]; #endif @@ -1029,6 +1030,23 @@ static INLINE void free_motion_vector_info(struct VP9_COMP *cpi) { cpi->motion_vector_info = NULL; } +// Allocates memory for the tpl stats information. +// Only called once in vp9_create_compressor(). +static INLINE void tpl_stats_info_init(struct VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + CHECK_MEM_ERROR( + cm, cpi->tpl_stats_info, + (TplDepStats *)vpx_calloc(MAX_LAG_BUFFERS, sizeof(TplDepStats))); + memset(cpi->tpl_stats_info, 0, MAX_LAG_BUFFERS * sizeof(TplDepStats)); +} + +// Frees memory of the tpl stats information. +// Only called once in dealloc_compressor_data(). +static INLINE void free_tpl_stats_info(struct VP9_COMP *cpi) { + vpx_free(cpi->tpl_stats_info); + cpi->tpl_stats_info = NULL; +} + // Allocates memory for the first pass motion vector information. // The unit size is each 16x16 block. // Only called once in vp9_create_compressor(). @@ -1091,6 +1109,7 @@ typedef struct ENCODE_FRAME_RESULT { FRAME_COUNTS frame_counts; const PARTITION_INFO *partition_info; const MOTION_VECTOR_INFO *motion_vector_info; + const TplDepStats *tpl_stats_info; IMAGE_BUFFER coded_frame; RATE_QINDEX_HISTORY rq_history; #endif // CONFIG_RATE_CTRL diff --git a/vp9/simple_encode.cc b/vp9/simple_encode.cc index 568df97aaa..afda6e2035 100644 --- a/vp9/simple_encode.cc +++ b/vp9/simple_encode.cc @@ -207,6 +207,24 @@ static void update_motion_vector_info( } } +static void update_tpl_stats_info(const TplDepStats *input_tpl_stats_info, + const int show_frame_count, + TplStatsInfo *output_tpl_stats_info) { + int frame_idx; + for (frame_idx = 0; frame_idx < show_frame_count; ++frame_idx) { + output_tpl_stats_info[frame_idx].intra_cost = + input_tpl_stats_info[frame_idx].intra_cost; + output_tpl_stats_info[frame_idx].inter_cost = + input_tpl_stats_info[frame_idx].inter_cost; + output_tpl_stats_info[frame_idx].mc_flow = + input_tpl_stats_info[frame_idx].mc_flow; + output_tpl_stats_info[frame_idx].mc_dep_cost = + input_tpl_stats_info[frame_idx].mc_dep_cost; + output_tpl_stats_info[frame_idx].mc_ref_cost = + input_tpl_stats_info[frame_idx].mc_ref_cost; + } +} + static void update_frame_counts(const FRAME_COUNTS *input_counts, FrameCounts *output_counts) { // Init array sizes. @@ -486,6 +504,7 @@ static bool init_encode_frame_result(EncodeFrameResult *encode_frame_result, encode_frame_result->num_cols_4x4); encode_frame_result->motion_vector_info.resize( encode_frame_result->num_rows_4x4 * encode_frame_result->num_cols_4x4); + encode_frame_result->tpl_stats_info.resize(MAX_LAG_BUFFERS); if (encode_frame_result->coding_data.get() == nullptr) { return false; @@ -507,7 +526,7 @@ static void encode_frame_result_update_rq_history( } static void update_encode_frame_result( - EncodeFrameResult *encode_frame_result, + EncodeFrameResult *encode_frame_result, const int show_frame_count, const ENCODE_FRAME_RESULT *encode_frame_info) { encode_frame_result->coding_data_bit_size = encode_frame_result->coding_data_byte_size * 8; @@ -536,6 +555,10 @@ static void update_encode_frame_result( kMotionVectorSubPixelPrecision); update_frame_counts(&encode_frame_info->frame_counts, &encode_frame_result->frame_counts); + if (encode_frame_result->frame_type == kFrameTypeAltRef) { + update_tpl_stats_info(encode_frame_info->tpl_stats_info, show_frame_count, + &encode_frame_result->tpl_stats_info[0]); + } encode_frame_result_update_rq_history(&encode_frame_info->rq_history, encode_frame_result); } @@ -1169,7 +1192,10 @@ void SimpleEncode::EncodeFrame(EncodeFrameResult *encode_frame_result) { abort(); } - update_encode_frame_result(encode_frame_result, &encode_frame_info); + const GroupOfPicture group_of_picture = this->ObserveGroupOfPicture(); + const int show_frame_count = group_of_picture.show_frame_count; + update_encode_frame_result(encode_frame_result, show_frame_count, + &encode_frame_info); PostUpdateState(*encode_frame_result); } else { // TODO(angiebird): Clean up encode_frame_result. diff --git a/vp9/simple_encode.h b/vp9/simple_encode.h index ce370a795e..380e8118fc 100644 --- a/vp9/simple_encode.h +++ b/vp9/simple_encode.h @@ -87,6 +87,24 @@ struct MotionVectorInfo { double mv_column[2]; }; +// Accumulated tpl stats of all blocks in one frame. +// For each frame, the tpl stats are computed per 32x32 block. +struct TplStatsInfo { + // Intra complexity: the sum of absolute transform difference (SATD) of + // intra predicted residuals. + int64_t intra_cost; + // Inter complexity: the SATD of inter predicted residuals. + int64_t inter_cost; + // Motion compensated information flow. It measures how much information + // is propagated from the current frame to other frames. + int64_t mc_flow; + // Motion compensated dependency cost. It equals to its own intra_cost + // plus the mc_flow. + int64_t mc_dep_cost; + // Motion compensated reference cost. + int64_t mc_ref_cost; +}; + struct RefFrameInfo { int coding_indexes[kRefFrameTypeMax]; @@ -261,6 +279,18 @@ struct EncodeFrameResult { // Similar to partition info, all 4x4 blocks inside the same partition block // share the same motion vector information. std::vector motion_vector_info; + // A vector of the tpl stats information. + // The tpl stats measure the complexity of a frame, as well as the + // informatioin propagated along the motion trajactory between frames, in + // the reference frame structure. + // The tpl stats could be used as a more accurate spatial and temporal + // complexity measure in addition to the first pass stats. + // The vector contains tpl stats for all show frames in a GOP. + // The tpl stats stored in the vector is according to the encoding order. + // For example, suppose there are N show frames for the current GOP. + // Then tpl_stats_info[0] stores the information of the first frame to be + // encoded for this GOP, i.e, the AltRef frame. + std::vector tpl_stats_info; ImageBuffer coded_frame; // recode_count, q_index_history and rate_history are only available when From 7beafefd16b9d41eaf0bfc09e6bbb843ada9e952 Mon Sep 17 00:00:00 2001 From: Marco Paniconi Date: Wed, 11 Nov 2020 23:11:16 -0800 Subject: [PATCH 0082/1000] vp9: Allow for disabling loopfilter per spatial layer For SVC: add parameter to the control SET_SVC_PARAMS to allow for disabling the loopfilter per spatial layer. Note this svc setting will override the setting via VP9E_SET_DISABLE_LOOPFILTER (which should only be used for non-SVC). Add unittest to handle both SVC (spatial or temporal layers) and non-SVC (single layer) case. Change-Id: I4092f01668bae42aac724a6df5b6f6a604337448 --- test/svc_end_to_end_test.cc | 85 +++++++++++++++++++++++++++++- test/svc_test.cc | 13 ++--- vp9/encoder/vp9_svc_layercontext.c | 2 + vp9/encoder/vp9_svc_layercontext.h | 1 + vp9/vp9_cx_iface.c | 1 + vpx/vpx_encoder.h | 1 + 6 files changed, 96 insertions(+), 7 deletions(-) diff --git a/test/svc_end_to_end_test.cc b/test/svc_end_to_end_test.cc index b1ab0d7d95..518824d03f 100644 --- a/test/svc_end_to_end_test.cc +++ b/test/svc_end_to_end_test.cc @@ -494,7 +494,31 @@ class LoopfilterOnePassCbrSvc : public OnePassCbrSvc, virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, ::libvpx_test::Encoder *encoder) { PreEncodeFrameHookSetup(video, encoder); - encoder->Control(VP9E_SET_DISABLE_LOOPFILTER, loopfilter_off_); + if (number_temporal_layers_ > 1 || number_spatial_layers_ > 1) { + // Consider 3 cases: + if (loopfilter_off_ == 0) { + // loopfilter is on for all spatial layers on every superrframe. + for (int i = 0; i < VPX_SS_MAX_LAYERS; ++i) { + svc_params_.loopfilter_ctrl[i] = 0; + } + } else if (loopfilter_off_ == 1) { + // loopfilter is off for non-reference frames for all spatial layers. + for (int i = 0; i < VPX_SS_MAX_LAYERS; ++i) { + svc_params_.loopfilter_ctrl[i] = 1; + } + } else { + // loopfilter is off for all SL0 frames, and off only for non-reference + // frames for SL > 0. + svc_params_.loopfilter_ctrl[0] = 2; + for (int i = 1; i < VPX_SS_MAX_LAYERS; ++i) { + svc_params_.loopfilter_ctrl[i] = 1; + } + } + encoder->Control(VP9E_SET_SVC_PARAMETERS, &svc_params_); + } else if (number_temporal_layers_ == 1 && number_spatial_layers_ == 1) { + // For non-SVC mode use the single layer control. + encoder->Control(VP9E_SET_DISABLE_LOOPFILTER, loopfilter_off_); + } } virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { @@ -524,6 +548,35 @@ class LoopfilterOnePassCbrSvc : public OnePassCbrSvc, int num_nonref_frames_; }; +TEST_P(LoopfilterOnePassCbrSvc, OnePassCbrSvc1SL1TLLoopfilterOff) { + SetSvcConfig(1, 1); + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 1; + cfg_.rc_dropframe_thresh = 0; + cfg_.rc_target_bitrate = 800; + cfg_.kf_max_dist = 9999; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + cfg_.g_error_resilient = 1; + cfg_.ts_rate_decimator[0] = 1; + cfg_.temporal_layering_mode = 0; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + cfg_.rc_target_bitrate = 600; + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +#if CONFIG_VP9_DECODER + if (loopfilter_off_ == 0) + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); + else + EXPECT_EQ(GetMismatchFrames(), 0); +#endif +} + TEST_P(LoopfilterOnePassCbrSvc, OnePassCbrSvc1SL3TLLoopfilterOff) { SetSvcConfig(1, 3); cfg_.rc_buf_initial_sz = 500; @@ -542,7 +595,37 @@ TEST_P(LoopfilterOnePassCbrSvc, OnePassCbrSvc1SL3TLLoopfilterOff) { cfg_.ts_rate_decimator[1] = 2; cfg_.ts_rate_decimator[2] = 1; cfg_.temporal_layering_mode = 3; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + cfg_.rc_target_bitrate = 600; + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +#if CONFIG_VP9_DECODER + if (loopfilter_off_ == 0) + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); + else + EXPECT_EQ(GetMismatchFrames(), 0); +#endif +} +TEST_P(LoopfilterOnePassCbrSvc, OnePassCbrSvc3SL3TLLoopfilterOff) { + SetSvcConfig(3, 3); + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 1; + cfg_.rc_dropframe_thresh = 0; + cfg_.rc_target_bitrate = 800; + cfg_.kf_max_dist = 9999; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + cfg_.g_error_resilient = 1; + cfg_.ts_rate_decimator[0] = 4; + cfg_.ts_rate_decimator[1] = 2; + cfg_.ts_rate_decimator[2] = 1; + cfg_.temporal_layering_mode = 3; ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, 0, 400); cfg_.rc_target_bitrate = 600; diff --git a/test/svc_test.cc b/test/svc_test.cc index 4798c77183..cbc0abe032 100644 --- a/test/svc_test.cc +++ b/test/svc_test.cc @@ -43,13 +43,14 @@ void OnePassCbrSvc::PreEncodeFrameHookSetup(::libvpx_test::VideoSource *video, svc_params_.max_quantizers[i] = 63; svc_params_.min_quantizers[i] = 0; } - svc_params_.speed_per_layer[0] = base_speed_setting_; - for (int i = 1; i < VPX_SS_MAX_LAYERS; ++i) { - svc_params_.speed_per_layer[i] = speed_setting_; + if (number_temporal_layers_ > 1 || number_spatial_layers_ > 1) { + svc_params_.speed_per_layer[0] = base_speed_setting_; + for (int i = 1; i < VPX_SS_MAX_LAYERS; ++i) { + svc_params_.speed_per_layer[i] = speed_setting_; + } + encoder->Control(VP9E_SET_SVC, 1); + encoder->Control(VP9E_SET_SVC_PARAMETERS, &svc_params_); } - - encoder->Control(VP9E_SET_SVC, 1); - encoder->Control(VP9E_SET_SVC_PARAMETERS, &svc_params_); encoder->Control(VP8E_SET_CPUUSED, speed_setting_); encoder->Control(VP9E_SET_AQ_MODE, 3); encoder->Control(VP8E_SET_MAX_INTRA_BITRATE_PCT, 300); diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c index d85b3632cd..9c75d77263 100644 --- a/vp9/encoder/vp9_svc_layercontext.c +++ b/vp9/encoder/vp9_svc_layercontext.c @@ -357,6 +357,8 @@ void vp9_restore_layer_context(VP9_COMP *const cpi) { if (is_one_pass_cbr_svc(cpi) && lc->speed > 0) { cpi->oxcf.speed = lc->speed; } + if (lc->loopfilter_ctrl >= 0 || lc->loopfilter_ctrl < 3) + cpi->loopfilter_ctrl = lc->loopfilter_ctrl; // Reset the frames_since_key and frames_to_key counters to their values // before the layer restore. Keep these defined for the stream (not layer). if (cpi->svc.number_temporal_layers > 1 || diff --git a/vp9/encoder/vp9_svc_layercontext.h b/vp9/encoder/vp9_svc_layercontext.h index e7d9712aae..b12e7e01a7 100644 --- a/vp9/encoder/vp9_svc_layercontext.h +++ b/vp9/encoder/vp9_svc_layercontext.h @@ -71,6 +71,7 @@ typedef struct { int actual_num_seg2_blocks; int counter_encode_maxq_scene_change; uint8_t speed; + int loopfilter_ctrl; } LAYER_CONTEXT; typedef struct SVC { diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index aa13fc9cf1..0ccb6750b6 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -1573,6 +1573,7 @@ static vpx_codec_err_t ctrl_set_svc_parameters(vpx_codec_alg_priv_t *ctx, lc->scaling_factor_num = params->scaling_factor_num[sl]; lc->scaling_factor_den = params->scaling_factor_den[sl]; lc->speed = params->speed_per_layer[sl]; + lc->loopfilter_ctrl = params->loopfilter_ctrl[sl]; } } diff --git a/vpx/vpx_encoder.h b/vpx/vpx_encoder.h index c84d40f7f7..39b2aef625 100644 --- a/vpx/vpx_encoder.h +++ b/vpx/vpx_encoder.h @@ -705,6 +705,7 @@ typedef struct vpx_svc_parameters { int scaling_factor_den[VPX_MAX_LAYERS]; /**< Scaling factor-denominator */ int speed_per_layer[VPX_MAX_LAYERS]; /**< Speed setting for each sl */ int temporal_layering_mode; /**< Temporal layering mode */ + int loopfilter_ctrl[VPX_MAX_LAYERS]; /**< Loopfilter ctrl for each sl */ } vpx_svc_extra_cfg_t; /*!\brief Initialize an encoder instance From d4453c73ff9467d5c7cd4ce8b6070dbfa24eff37 Mon Sep 17 00:00:00 2001 From: Angie Chiang Date: Fri, 13 Nov 2020 18:13:14 -0800 Subject: [PATCH 0083/1000] Fix the warning of C90 mixed declarations and code Change-Id: I1a6c57525bbe8bf1a97057ecd64985bc23d1df2e --- vp9/encoder/vp9_encoder.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index f4587d42d9..8d60a0c001 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -7383,8 +7383,6 @@ static void accumulate_frame_tpl_stats(VP9_COMP *cpi) { // Accumulate tpl stats for each frame in the current group of picture. for (frame_idx = 1; frame_idx < gf_group->gf_group_size; ++frame_idx) { TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx]; - if (!tpl_frame->is_valid) continue; - TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr; const int tpl_stride = tpl_frame->stride; int64_t intra_cost_base = 0; @@ -7394,6 +7392,8 @@ static void accumulate_frame_tpl_stats(VP9_COMP *cpi) { int64_t mc_flow_base = 0; int row, col; + if (!tpl_frame->is_valid) continue; + for (row = 0; row < cm->mi_rows && tpl_frame->is_valid; ++row) { for (col = 0; col < cm->mi_cols; ++col) { TplDepStats *this_stats = &tpl_stats[row * tpl_stride + col]; From 4e7fd0273ade37a1f128ad131ed1d6a263f13d22 Mon Sep 17 00:00:00 2001 From: Angie Chiang Date: Fri, 13 Nov 2020 18:17:48 -0800 Subject: [PATCH 0084/1000] Fix uninitialized warning in resize_test.cc Change-Id: I12a72d3aa57b13dbcbeb037e1deea41529ea4194 --- test/resize_test.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/resize_test.cc b/test/resize_test.cc index 65b94fa4f6..c57170ff9b 100644 --- a/test/resize_test.cc +++ b/test/resize_test.cc @@ -271,8 +271,8 @@ class ResizingVideoSource : public ::libvpx_test::DummyVideoSource { protected: virtual void Next() { ++frame_; - unsigned int width; - unsigned int height; + unsigned int width = 0; + unsigned int height = 0; ScaleForFrameNumber(frame_, kInitialWidth, kInitialHeight, &width, &height, flag_codec_, smaller_width_larger_size_); SetSize(width, height); From b5d77a48d740e211a130c8e45d9353ef8c154a47 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Mon, 16 Nov 2020 14:12:50 -0800 Subject: [PATCH 0085/1000] Remove condition on copying svc loopfilter flag Change-Id: Ib37ef0aa3dc0ec73b25332be6d89969093bd7aeb --- vp9/encoder/vp9_svc_layercontext.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c index 9c75d77263..b6c7c74e17 100644 --- a/vp9/encoder/vp9_svc_layercontext.c +++ b/vp9/encoder/vp9_svc_layercontext.c @@ -357,8 +357,7 @@ void vp9_restore_layer_context(VP9_COMP *const cpi) { if (is_one_pass_cbr_svc(cpi) && lc->speed > 0) { cpi->oxcf.speed = lc->speed; } - if (lc->loopfilter_ctrl >= 0 || lc->loopfilter_ctrl < 3) - cpi->loopfilter_ctrl = lc->loopfilter_ctrl; + cpi->loopfilter_ctrl = lc->loopfilter_ctrl; // Reset the frames_since_key and frames_to_key counters to their values // before the layer restore. Keep these defined for the stream (not layer). if (cpi->svc.number_temporal_layers > 1 || From ca7a16babc8bed02f060dd98f7297db7f3c90443 Mon Sep 17 00:00:00 2001 From: Angie Chiang Date: Fri, 13 Nov 2020 18:32:34 -0800 Subject: [PATCH 0086/1000] Add doxygen to structs in vpx_ext_ratectrl.h Bug: webm:1707 Change-Id: Ib5f6b6f143f55e5279e39eb386fcd3340211de59 --- vpx/vpx_ext_ratectrl.h | 57 ++++++++++++++++++++++++++++++++++-------- 1 file changed, 46 insertions(+), 11 deletions(-) diff --git a/vpx/vpx_ext_ratectrl.h b/vpx/vpx_ext_ratectrl.h index 5d5a7c92d5..6919f2ac6f 100644 --- a/vpx/vpx_ext_ratectrl.h +++ b/vpx/vpx_ext_ratectrl.h @@ -17,35 +17,70 @@ extern "C" { #include "./vpx_integer.h" -/*!\cond - TODO(angiebird): document these structures and fields to clear doxygen - warnings.*/ - +/*!\brief Abstract rate control model handler + * + * The encoder will receive the model handler from create_model() defined in + * vpx_rc_funcs_t. + */ typedef void *vpx_rc_model_t; +/*!\brief Encode frame decision made by the external rate control model + * + * The encoder will receive the decision from the external rate control model + * through get_encodeframe_decision() defined in vpx_rc_funcs_t. + */ typedef struct vpx_rc_encodeframe_decision { - int q_index; + int q_index; /**< Quantizer step index [0..255]*/ } vpx_rc_encodeframe_decision_t; +/*!\brief Information for the frame to be encoded. + * + * The encoder will send the information to external rate control model through + * get_encodeframe_decision() defined in vpx_rc_funcs_t. + * + */ typedef struct vpx_rc_encodeframe_info { + /*! + * 0: Key frame + * 1: Inter frame + * 2: Alternate reference frame + * 3: Overlay frame + * 4: Golden frame + */ int frame_type; - int show_index; - int coding_index; - int ref_frame_coding_indexes[3]; + int show_index; /**< display index, starts from zero*/ + int coding_index; /**< coding index, starts from zero*/ + int ref_frame_coding_indexes[3]; /**< three reference frames' coding indices*/ + /*! + * The validity of the three reference frames. + * 0: Invalid + * 1: Valid + */ int ref_frame_valid_list[3]; } vpx_rc_encodeframe_info_t; +/*!\brief Frame coding result + * + * The encoder will send the result to the external rate control model through + * update_encodeframe_result() defined in vpx_rc_funcs_t. + */ typedef struct vpx_rc_encodeframe_result { - int64_t sse; - int64_t bit_count; - int64_t pixel_count; + int64_t sse; /**< sum of squared error of the reconstructed frame */ + int64_t bit_count; /**< number of bits spent on coding the frame*/ + int64_t pixel_count; /**< number of pixels in YUV planes of the frame*/ } vpx_rc_encodeframe_result_t; +/*!\brief Status returned by rate control callback functions. + */ typedef enum vpx_rc_status { vpx_rc_ok = 0, vpx_rc_error = 1, } vpx_rc_status_t; +/*!\cond + TODO(angiebird): document these structures and fields to clear doxygen + warnings.*/ + // This is a mirror of vp9's FIRSTPASS_STATS // Only spatial_layer_id is omitted typedef struct vpx_rc_frame_stats { From c22a783bea8512a3413d9dd4abf82622cd89adcd Mon Sep 17 00:00:00 2001 From: Angie Chiang Date: Fri, 13 Nov 2020 19:26:14 -0800 Subject: [PATCH 0087/1000] Copy first pass stats documentation from AV1 to VP9 Bug: webm:1707 Change-Id: Iae7eaa9ba681272b70b6dad17cd2247edab6ef79 --- vpx/vpx_ext_ratectrl.h | 120 ++++++++++++++++++++++++++++++++++++++--- 1 file changed, 114 insertions(+), 6 deletions(-) diff --git a/vpx/vpx_ext_ratectrl.h b/vpx/vpx_ext_ratectrl.h index 6919f2ac6f..494b149554 100644 --- a/vpx/vpx_ext_ratectrl.h +++ b/vpx/vpx_ext_ratectrl.h @@ -77,45 +77,153 @@ typedef enum vpx_rc_status { vpx_rc_error = 1, } vpx_rc_status_t; -/*!\cond - TODO(angiebird): document these structures and fields to clear doxygen - warnings.*/ - -// This is a mirror of vp9's FIRSTPASS_STATS -// Only spatial_layer_id is omitted +/*!\brief First pass frame stats + * This is a mirror of vp9's FIRSTPASS_STATS except that spatial_layer_id is + * omitted + */ typedef struct vpx_rc_frame_stats { + /*! + * Frame number in display order, if stats are for a single frame. + * No real meaning for a collection of frames. + */ double frame; + /*! + * Weight assigned to this frame (or total weight for the collection of + * frames) currently based on intra factor and brightness factor. This is used + * to distribute bits between easier and harder frames. + */ double weight; + /*! + * Intra prediction error. + */ double intra_error; + /*! + * Best of intra pred error and inter pred error using last frame as ref. + */ double coded_error; + /*! + * Best of intra pred error and inter pred error using golden frame as ref. + */ double sr_coded_error; + /*! + * Estimate the noise energy of the current frame. + */ double frame_noise_energy; + /*! + * Percentage of blocks with inter pred error < intra pred error. + */ double pcnt_inter; + /*! + * Percentage of blocks using (inter prediction and) non-zero motion vectors. + */ double pcnt_motion; + /*! + * Percentage of blocks where golden frame was better than last or intra: + * inter pred error using golden frame < inter pred error using last frame and + * inter pred error using golden frame < intra pred error + */ double pcnt_second_ref; + /*! + * Percentage of blocks where intra and inter prediction errors were very + * close. Note that this is a 'weighted count', that is, the so blocks may be + * weighted by how close the two errors were. + */ double pcnt_neutral; + /*! + * Percentage of blocks that have intra error < inter error and inter error < + * LOW_I_THRESH LOW_I_THRESH = 24000 using bit_depth 8 LOW_I_THRESH = 24000 << + * 4 using bit_depth 10 LOW_I_THRESH = 24000 << 8 using bit_depth 12 + */ double pcnt_intra_low; + /*! + * Percentage of blocks that have intra error < inter error and intra error < + * LOW_I_THRESH but inter error >= LOW_I_THRESH LOW_I_THRESH = 24000 using + * bit_depth 8 LOW_I_THRESH = 24000 << 4 using bit_depth 10 LOW_I_THRESH = + * 24000 << 8 using bit_depth 12 + */ double pcnt_intra_high; + /*! + * Percentage of blocks that have almost no intra error residual + * (i.e. are in effect completely flat and untextured in the intra + * domain). In natural videos this is uncommon, but it is much more + * common in animations, graphics and screen content, so may be used + * as a signal to detect these types of content. + */ double intra_skip_pct; + /*! + * Percentage of blocks that have intra error < SMOOTH_INTRA_THRESH + * SMOOTH_INTRA_THRESH = 4000 using bit_depth 8 + * SMOOTH_INTRA_THRESH = 4000 << 4 using bit_depth 10 + * SMOOTH_INTRA_THRESH = 4000 << 8 using bit_depth 12 + */ double intra_smooth_pct; + /*! + * Image mask rows top and bottom. + */ double inactive_zone_rows; + /*! + * Image mask columns at left and right edges. + */ double inactive_zone_cols; + /*! + * Average of row motion vectors. + */ double MVr; + /*! + * Mean of absolute value of row motion vectors. + */ double mvr_abs; + /*! + * Mean of column motion vectors. + */ double MVc; + /*! + * Mean of absolute value of column motion vectors. + */ double mvc_abs; + /*! + * Variance of row motion vectors. + */ double MVrv; + /*! + * Variance of column motion vectors. + */ double MVcv; + /*! + * Value in range [-1,1] indicating fraction of row and column motion vectors + * that point inwards (negative MV value) or outwards (positive MV value). + * For example, value of 1 indicates, all row/column MVs are inwards. + */ double mv_in_out_count; + /*! + * Duration of the frame / collection of frames. + */ double duration; + /*! + * 1.0 if stats are for a single frame, OR + * Number of frames in this collection for which the stats are accumulated. + */ double count; } vpx_rc_frame_stats_t; +/*!\brief Collection of first pass frame stats + */ typedef struct vpx_rc_firstpass_stats { + /*! + * Pointer to first pass frame stats. + * The pointed array of vpx_rc_frame_stats_t should have length equal to + * number of show frames in the video. + */ vpx_rc_frame_stats_t *frame_stats; + /*! + * Number of show frames in the video. + */ int num_frames; } vpx_rc_firstpass_stats_t; +/*!\cond + TODO(angiebird): document these structures and fields to clear doxygen + warnings.*/ typedef struct vpx_rc_config { int frame_width; int frame_height; From a7731ba488202ea62adfedf3fb49477cafe80b88 Mon Sep 17 00:00:00 2001 From: Angie Chiang Date: Fri, 13 Nov 2020 19:40:13 -0800 Subject: [PATCH 0088/1000] Add doxygen for vpx_rc_config Bug: webm:1707 Change-Id: I65bab6b2b792653e70cb136a5f9a21796e34b829 --- vpx/vpx_ext_ratectrl.h | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/vpx/vpx_ext_ratectrl.h b/vpx/vpx_ext_ratectrl.h index 494b149554..bc0ed98191 100644 --- a/vpx/vpx_ext_ratectrl.h +++ b/vpx/vpx_ext_ratectrl.h @@ -221,16 +221,18 @@ typedef struct vpx_rc_firstpass_stats { int num_frames; } vpx_rc_firstpass_stats_t; -/*!\cond - TODO(angiebird): document these structures and fields to clear doxygen - warnings.*/ +/*!\brief Encode config sent to external rate control model + */ typedef struct vpx_rc_config { - int frame_width; - int frame_height; - int show_frame_count; + int frame_width; /**< frame width */ + int frame_height; /**< frame height */ + int show_frame_count; /**< number of visible frames in the video */ + /*! + * Target bitrate in kilobytes per second + */ int target_bitrate_kbps; - int frame_rate_num; - int frame_rate_den; + int frame_rate_num; /**< numerator of frame rate */ + int frame_rate_den; /**< denominator of frame rate */ } vpx_rc_config_t; /*!\brief Create an external rate control model callback prototype @@ -295,6 +297,10 @@ typedef vpx_rc_status_t (*vpx_rc_update_encodeframe_result_cb_fn_t)( typedef vpx_rc_status_t (*vpx_rc_delete_model_cb_fn_t)( vpx_rc_model_t rate_ctrl_model); +/*!\cond + TODO(angiebird): document these structures and fields to clear doxygen + warnings.*/ + typedef struct vpx_rc_funcs { vpx_rc_create_model_cb_fn_t create_model; vpx_rc_send_firstpass_stats_cb_fn_t send_firstpass_stats; From 275c2769933d599ae74002610563fe11321668bc Mon Sep 17 00:00:00 2001 From: Angie Chiang Date: Tue, 17 Nov 2020 14:37:20 -0800 Subject: [PATCH 0089/1000] Add doxygen for vpx_rc_funcs_t Change-Id: If75215d574fe0b075add50154a9eece5d387741a --- vpx/vpx_ext_ratectrl.h | 32 ++++++++++++++++++++++++-------- 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/vpx/vpx_ext_ratectrl.h b/vpx/vpx_ext_ratectrl.h index bc0ed98191..8aee4f4d82 100644 --- a/vpx/vpx_ext_ratectrl.h +++ b/vpx/vpx_ext_ratectrl.h @@ -297,23 +297,39 @@ typedef vpx_rc_status_t (*vpx_rc_update_encodeframe_result_cb_fn_t)( typedef vpx_rc_status_t (*vpx_rc_delete_model_cb_fn_t)( vpx_rc_model_t rate_ctrl_model); -/*!\cond - TODO(angiebird): document these structures and fields to clear doxygen - warnings.*/ - +/*!\brief Callback function set for external rate control. + * + * The user can enable external rate control by registering + * a set of callback functions with the codec control flag + * VP9E_SET_EXTERNAL_RATE_CONTROL. + */ typedef struct vpx_rc_funcs { + /*! + * Create an external rate control model. + */ vpx_rc_create_model_cb_fn_t create_model; + /*! + * Send first pass stats to the external rate control model. + */ vpx_rc_send_firstpass_stats_cb_fn_t send_firstpass_stats; + /*! + * Get encodeframe decision from the external rate control model. + */ vpx_rc_get_encodeframe_decision_cb_fn_t get_encodeframe_decision; + /*! + * Update encodeframe result to the external rate control model. + */ vpx_rc_update_encodeframe_result_cb_fn_t update_encodeframe_result; + /*! + * Delete the external rate control model. + */ vpx_rc_delete_model_cb_fn_t delete_model; + /*! + * Private data for the external rate control model. + */ void *priv; } vpx_rc_funcs_t; -/*!\endcond - TODO(angiebird): document these structures and fields to clear doxygen - warnings.*/ - #ifdef __cplusplus } // extern "C" #endif From 5b63f0f821e94f8072eb483014cfc33b05978bb9 Mon Sep 17 00:00:00 2001 From: Angie Chiang Date: Tue, 17 Nov 2020 15:30:55 -0800 Subject: [PATCH 0090/1000] Capitalize VPX_RC_OK / VPX_RC_ERROR Change-Id: I526bd6a6c2d2095db564f96d63c7ab7ee4dd90ad --- test/vp9_ext_ratectrl_test.cc | 10 +++++----- vpx/vpx_ext_ratectrl.h | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/test/vp9_ext_ratectrl_test.cc b/test/vp9_ext_ratectrl_test.cc index 8db0a358d0..01d8019969 100644 --- a/test/vp9_ext_ratectrl_test.cc +++ b/test/vp9_ext_ratectrl_test.cc @@ -44,7 +44,7 @@ vpx_rc_status_t rc_create_model(void *priv, EXPECT_EQ(ratectrl_config->target_bitrate_kbps, 24000); EXPECT_EQ(ratectrl_config->frame_rate_num, 30); EXPECT_EQ(ratectrl_config->frame_rate_den, 1); - return vpx_rc_ok; + return VPX_RC_OK; } vpx_rc_status_t rc_send_firstpass_stats( @@ -57,7 +57,7 @@ vpx_rc_status_t rc_send_firstpass_stats( for (int i = 0; i < first_pass_stats->num_frames; ++i) { EXPECT_DOUBLE_EQ(first_pass_stats->frame_stats[i].frame, i); } - return vpx_rc_ok; + return VPX_RC_OK; } vpx_rc_status_t rc_get_encodeframe_decision( @@ -120,7 +120,7 @@ vpx_rc_status_t rc_get_encodeframe_decision( } else { frame_decision->q_index = 100; } - return vpx_rc_ok; + return VPX_RC_OK; } vpx_rc_status_t rc_update_encodeframe_result( @@ -135,14 +135,14 @@ vpx_rc_status_t rc_update_encodeframe_result( if (toy_rate_ctrl->coding_index == kLosslessCodingIndex) { EXPECT_EQ(encode_frame_result->sse, 0); } - return vpx_rc_ok; + return VPX_RC_OK; } vpx_rc_status_t rc_delete_model(vpx_rc_model_t rate_ctrl_model) { ToyRateCtrl *toy_rate_ctrl = static_cast(rate_ctrl_model); EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); delete toy_rate_ctrl; - return vpx_rc_ok; + return VPX_RC_OK; } class ExtRateCtrlTest : public ::libvpx_test::EncoderTest, diff --git a/vpx/vpx_ext_ratectrl.h b/vpx/vpx_ext_ratectrl.h index 8aee4f4d82..bb3caa6148 100644 --- a/vpx/vpx_ext_ratectrl.h +++ b/vpx/vpx_ext_ratectrl.h @@ -73,8 +73,8 @@ typedef struct vpx_rc_encodeframe_result { /*!\brief Status returned by rate control callback functions. */ typedef enum vpx_rc_status { - vpx_rc_ok = 0, - vpx_rc_error = 1, + VPX_RC_OK = 0, + VPX_RC_ERROR = 1, } vpx_rc_status_t; /*!\brief First pass frame stats From e56e8dcd6fc9e2b04316be5144c18ca6772f6263 Mon Sep 17 00:00:00 2001 From: Angie Chiang Date: Tue, 17 Nov 2020 17:25:31 -0800 Subject: [PATCH 0091/1000] Add gop_index to vpx_ext_ratectrl.h Bug: webm:1707 Change-Id: I48826d5f3a7cc292825a7f1e30ac6d0f57adc569 --- test/vp9_ext_ratectrl_test.cc | 5 +++++ vp9/encoder/vp9_encoder.c | 4 ++-- vp9/encoder/vp9_ext_ratectrl.c | 3 ++- vp9/encoder/vp9_ext_ratectrl.h | 2 +- vpx/vpx_encoder.h | 4 +++- vpx/vpx_ext_ratectrl.h | 18 ++++++++++++++++-- 6 files changed, 29 insertions(+), 7 deletions(-) diff --git a/test/vp9_ext_ratectrl_test.cc b/test/vp9_ext_ratectrl_test.cc index 01d8019969..812a18ed22 100644 --- a/test/vp9_ext_ratectrl_test.cc +++ b/test/vp9_ext_ratectrl_test.cc @@ -73,6 +73,7 @@ vpx_rc_status_t rc_get_encodeframe_decision( EXPECT_EQ(encode_frame_info->coding_index, toy_rate_ctrl->coding_index); if (encode_frame_info->coding_index == 0) { + EXPECT_EQ(encode_frame_info->gop_index, 0); EXPECT_EQ(encode_frame_info->frame_type, 0 /*kFrameTypeKey*/); EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0], 0); // kRefFrameTypeLast @@ -83,6 +84,7 @@ vpx_rc_status_t rc_get_encodeframe_decision( } if (encode_frame_info->coding_index == 1) { + EXPECT_EQ(encode_frame_info->gop_index, 1); EXPECT_EQ(encode_frame_info->frame_type, 2 /*kFrameTypeAltRef*/); EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0], 1); // kRefFrameTypeLast @@ -96,10 +98,13 @@ vpx_rc_status_t rc_get_encodeframe_decision( if (encode_frame_info->coding_index >= 2 && encode_frame_info->coding_index < 5) { + // In the first group of pictures, coding_index and gop_index are equal. + EXPECT_EQ(encode_frame_info->gop_index, encode_frame_info->coding_index); EXPECT_EQ(encode_frame_info->frame_type, 1 /*kFrameTypeInter*/); } if (encode_frame_info->coding_index == 5) { + EXPECT_EQ(encode_frame_info->gop_index, 0); EXPECT_EQ(encode_frame_info->frame_type, 3 /*kFrameTypeOverlay*/); EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0], 1); // kRefFrameTypeLast diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 8d60a0c001..37f644501d 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -4511,8 +4511,8 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest get_ref_frame_bufs(cpi, ref_frame_bufs); vp9_extrc_get_encodeframe_decision( &cpi->ext_ratectrl, cm->current_video_frame, - cm->current_frame_coding_index, update_type, ref_frame_bufs, - ref_frame_flags, &encode_frame_decision); + cm->current_frame_coding_index, gf_group->index, update_type, + ref_frame_bufs, ref_frame_flags, &encode_frame_decision); q = encode_frame_decision.q_index; } diff --git a/vp9/encoder/vp9_ext_ratectrl.c b/vp9/encoder/vp9_ext_ratectrl.c index 94c2addd25..a6a3e21d3b 100644 --- a/vp9/encoder/vp9_ext_ratectrl.c +++ b/vp9/encoder/vp9_ext_ratectrl.c @@ -103,7 +103,7 @@ static int extrc_get_frame_type(FRAME_UPDATE_TYPE update_type) { } void vp9_extrc_get_encodeframe_decision( - EXT_RATECTRL *ext_ratectrl, int show_index, int coding_index, + EXT_RATECTRL *ext_ratectrl, int show_index, int coding_index, int gop_index, FRAME_UPDATE_TYPE update_type, RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES], int ref_frame_flags, vpx_rc_encodeframe_decision_t *encode_frame_decision) { @@ -111,6 +111,7 @@ void vp9_extrc_get_encodeframe_decision( vpx_rc_encodeframe_info_t encode_frame_info; encode_frame_info.show_index = show_index; encode_frame_info.coding_index = coding_index; + encode_frame_info.gop_index = gop_index; encode_frame_info.frame_type = extrc_get_frame_type(update_type); vp9_get_ref_frame_info(update_type, ref_frame_flags, ref_frame_bufs, diff --git a/vp9/encoder/vp9_ext_ratectrl.h b/vp9/encoder/vp9_ext_ratectrl.h index fb6cfe1ac8..6a86218dac 100644 --- a/vp9/encoder/vp9_ext_ratectrl.h +++ b/vp9/encoder/vp9_ext_ratectrl.h @@ -33,7 +33,7 @@ void vp9_extrc_send_firstpass_stats(EXT_RATECTRL *ext_ratectrl, const FIRST_PASS_INFO *first_pass_info); void vp9_extrc_get_encodeframe_decision( - EXT_RATECTRL *ext_ratectrl, int show_index, int coding_index, + EXT_RATECTRL *ext_ratectrl, int show_index, int coding_index, int gop_index, FRAME_UPDATE_TYPE update_type, RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES], int ref_frame_flags, vpx_rc_encodeframe_decision_t *encode_frame_decision); diff --git a/vpx/vpx_encoder.h b/vpx/vpx_encoder.h index 39b2aef625..da36095775 100644 --- a/vpx/vpx_encoder.h +++ b/vpx/vpx_encoder.h @@ -30,6 +30,7 @@ extern "C" { #endif #include "./vpx_codec.h" +#include "./vpx_ext_ratectrl.h" /*! Temporal Scalability: Maximum length of the sequence defining frame * layer membership @@ -57,7 +58,8 @@ extern "C" { * fields to structures */ #define VPX_ENCODER_ABI_VERSION \ - (14 + VPX_CODEC_ABI_VERSION) /**<\hideinitializer*/ + (14 + VPX_CODEC_ABI_VERSION + \ + VPX_EXT_RATECTRL_ABI_VERSION) /**<\hideinitializer*/ /*! \brief Encoder capabilities bitfield * diff --git a/vpx/vpx_ext_ratectrl.h b/vpx/vpx_ext_ratectrl.h index bb3caa6148..5c57cf3319 100644 --- a/vpx/vpx_ext_ratectrl.h +++ b/vpx/vpx_ext_ratectrl.h @@ -17,6 +17,16 @@ extern "C" { #include "./vpx_integer.h" +/*!\brief Current ABI version number + * + * \internal + * If this file is altered in any way that changes the ABI, this value + * must be bumped. Examples include, but are not limited to, changing + * types, removing or reassigning enums, adding/removing/rearranging + * fields to structures. + */ +#define VPX_EXT_RATECTRL_ABI_VERSION (1) + /*!\brief Abstract rate control model handler * * The encoder will receive the model handler from create_model() defined in @@ -48,8 +58,12 @@ typedef struct vpx_rc_encodeframe_info { * 4: Golden frame */ int frame_type; - int show_index; /**< display index, starts from zero*/ - int coding_index; /**< coding index, starts from zero*/ + int show_index; /**< display index, starts from zero*/ + int coding_index; /**< coding index, starts from zero*/ + /*! + * index in group of picture, starts from zero. + */ + int gop_index; int ref_frame_coding_indexes[3]; /**< three reference frames' coding indices*/ /*! * The validity of the three reference frames. From 2ccee3928d9bb2995dbf634ac6f9f172d4d86f3f Mon Sep 17 00:00:00 2001 From: Angie Chiang Date: Thu, 19 Nov 2020 19:55:33 -0800 Subject: [PATCH 0092/1000] Allow user to set rc_mode and cq_level in SimpleEncode Change-Id: If3f56837e2c78a8b0fe7e0040f297c3f3ddb9c8b --- vp9/simple_encode.cc | 10 ++++++++++ vp9/simple_encode.h | 41 ++++++++++++++++++++++++++--------------- 2 files changed, 36 insertions(+), 15 deletions(-) diff --git a/vp9/simple_encode.cc b/vp9/simple_encode.cc index afda6e2035..d4eb0c669d 100644 --- a/vp9/simple_encode.cc +++ b/vp9/simple_encode.cc @@ -765,6 +765,16 @@ static void UpdateEncodeConfig(const EncodeConfig &config, SET_STRUCT_VALUE(config, oxcf, ret, encode_breakout); SET_STRUCT_VALUE(config, oxcf, ret, enable_tpl_model); SET_STRUCT_VALUE(config, oxcf, ret, enable_auto_arf); + if (strcmp(config.name, "rc_mode") == 0) { + int rc_mode = atoi(config.value); + if (rc_mode >= VPX_VBR && rc_mode <= VPX_Q) { + oxcf->rc_mode = (enum vpx_rc_mode)rc_mode; + ret = 1; + } else { + fprintf(stderr, "Invalid rc_mode value: %d\n", rc_mode); + } + } + SET_STRUCT_VALUE(config, oxcf, ret, cq_level); if (ret == 0) { fprintf(stderr, "Ignored unsupported encode_config %s\n", config.name); } diff --git a/vp9/simple_encode.h b/vp9/simple_encode.h index 380e8118fc..e3ef3cea95 100644 --- a/vp9/simple_encode.h +++ b/vp9/simple_encode.h @@ -361,21 +361,32 @@ class SimpleEncode { // The following configs in VP9EncoderConfig are allowed to change in this // function. See https://ffmpeg.org/ffmpeg-codecs.html#libvpx for each // config's meaning. - // Configs in VP9EncoderConfig: Equivalent configs in ffmpeg: - // 1 key_freq -g - // 2 two_pass_vbrmin_section -minrate * 100LL / bit_rate - // 3 two_pass_vbrmax_section -maxrate * 100LL / bit_rate - // 4 under_shoot_pct -undershoot-pct - // 5 over_shoot_pct -overshoot-pct - // 6 max_threads -threads - // 7 frame_parallel_decoding_mode -frame-parallel - // 8 tile_column -tile-columns - // 9 arnr_max_frames -arnr-maxframes - // 10 arnr_strength -arnr-strength - // 11 lag_in_frames -rc_lookahead - // 12 encode_breakout -static-thresh - // 13 enable_tpl_model -enable-tpl - // 14 enable_auto_arf -auto-alt-ref + // Configs in VP9EncoderConfig: Equivalent configs in ffmpeg: + // 1 key_freq -g + // 2 two_pass_vbrmin_section -minrate * 100LL / bit_rate + // 3 two_pass_vbrmax_section -maxrate * 100LL / bit_rate + // 4 under_shoot_pct -undershoot-pct + // 5 over_shoot_pct -overshoot-pct + // 6 max_threads -threads + // 7 frame_parallel_decoding_mode -frame-parallel + // 8 tile_column -tile-columns + // 9 arnr_max_frames -arnr-maxframes + // 10 arnr_strength -arnr-strength + // 11 lag_in_frames -rc_lookahead + // 12 encode_breakout -static-thresh + // 13 enable_tpl_model -enable-tpl + // 14 enable_auto_arf -auto-alt-ref + // 15 rc_mode + // Possible Settings: + // 0 - Variable Bit Rate (VPX_VBR) -b:v + // 1 - Constant Bit Rate (VPX_CBR) -b:v -minrate + // -maxrate + // two_pass_vbrmin_section == 100 i.e. bit_rate == minrate == maxrate + // two_pass_vbrmax_section == 100 + // 2 - Constrained Quality (VPX_CQ) -crf -b:v bit_rate + // 3 - Constant Quality (VPX_Q) -crf -b:v 0 + // See https://trac.ffmpeg.org/wiki/Encode/VP9 for more details. + // 16 cq_level see rc_mode for details. StatusCode SetEncodeConfig(const char *name, const char *value); // A debug function that dumps configs from VP9EncoderConfig From c341440874f9f469e3861d905ea5f2f725b4f16b Mon Sep 17 00:00:00 2001 From: Angie Chiang Date: Fri, 20 Nov 2020 17:41:09 -0800 Subject: [PATCH 0093/1000] Refine documentation of vpx_ext_ratectrl.h Bug: webm:1707 Change-Id: Iba04b5292c157e22dd8618a79e8c977ec9fc2199 --- vpx/vpx_ext_ratectrl.h | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/vpx/vpx_ext_ratectrl.h b/vpx/vpx_ext_ratectrl.h index 5c57cf3319..dc4d856a8b 100644 --- a/vpx/vpx_ext_ratectrl.h +++ b/vpx/vpx_ext_ratectrl.h @@ -139,21 +139,23 @@ typedef struct vpx_rc_frame_stats { double pcnt_second_ref; /*! * Percentage of blocks where intra and inter prediction errors were very - * close. Note that this is a 'weighted count', that is, the so blocks may be - * weighted by how close the two errors were. + * close. */ double pcnt_neutral; /*! * Percentage of blocks that have intra error < inter error and inter error < - * LOW_I_THRESH LOW_I_THRESH = 24000 using bit_depth 8 LOW_I_THRESH = 24000 << - * 4 using bit_depth 10 LOW_I_THRESH = 24000 << 8 using bit_depth 12 + * LOW_I_THRESH + * - bit_depth 8: LOW_I_THRESH = 24000 + * - bit_depth 10: LOW_I_THRESH = 24000 << 4 + * - bit_depth 12: LOW_I_THRESH = 24000 << 8 */ double pcnt_intra_low; /*! * Percentage of blocks that have intra error < inter error and intra error < - * LOW_I_THRESH but inter error >= LOW_I_THRESH LOW_I_THRESH = 24000 using - * bit_depth 8 LOW_I_THRESH = 24000 << 4 using bit_depth 10 LOW_I_THRESH = - * 24000 << 8 using bit_depth 12 + * LOW_I_THRESH but inter error >= LOW_I_THRESH LOW_I_THRESH + * - bit_depth 8: LOW_I_THRESH = 24000 + * - bit_depth 10: LOW_I_THRESH = 24000 << 4 + * - bit_depth 12: LOW_I_THRESH = 24000 << 8 */ double pcnt_intra_high; /*! @@ -166,9 +168,9 @@ typedef struct vpx_rc_frame_stats { double intra_skip_pct; /*! * Percentage of blocks that have intra error < SMOOTH_INTRA_THRESH - * SMOOTH_INTRA_THRESH = 4000 using bit_depth 8 - * SMOOTH_INTRA_THRESH = 4000 << 4 using bit_depth 10 - * SMOOTH_INTRA_THRESH = 4000 << 8 using bit_depth 12 + * - bit_depth 8: SMOOTH_INTRA_THRESH = 4000 + * - bit_depth 10: SMOOTH_INTRA_THRESH = 4000 << 4 + * - bit_depth 12: SMOOTH_INTRA_THRESH = 4000 << 8 */ double intra_smooth_pct; /*! @@ -180,7 +182,7 @@ typedef struct vpx_rc_frame_stats { */ double inactive_zone_cols; /*! - * Average of row motion vectors. + * Mean of row motion vectors. */ double MVr; /*! @@ -214,8 +216,8 @@ typedef struct vpx_rc_frame_stats { */ double duration; /*! - * 1.0 if stats are for a single frame, OR - * Number of frames in this collection for which the stats are accumulated. + * 1.0 if stats are for a single frame, or + * number of frames whose stats are accumulated. */ double count; } vpx_rc_frame_stats_t; From 5459c4ab98c3d4377ab8f3379984c809ccf6e7eb Mon Sep 17 00:00:00 2001 From: Angie Chiang Date: Tue, 24 Nov 2020 02:55:24 +0000 Subject: [PATCH 0094/1000] Revert "Close out file in EndEncode()" This reverts commit 7370cecd8929141adb8140b924d3dd8ac1887d36. Reason for revert: I accidentally check in this CL Change-Id: I71ff0b98649070df3edd13b98170a7091541057b --- vp9/simple_encode.cc | 4 ---- 1 file changed, 4 deletions(-) diff --git a/vp9/simple_encode.cc b/vp9/simple_encode.cc index ba076fd586..46b25d1fdf 100644 --- a/vp9/simple_encode.cc +++ b/vp9/simple_encode.cc @@ -959,10 +959,6 @@ void SimpleEncode::EndEncode() { impl_ptr_->cpi = nullptr; vpx_img_free(&impl_ptr_->tmp_img); rewind(in_file_); - if (out_file_ != nullptr) { - fclose(out_file_); - out_file_ = nullptr; - } } void SimpleEncode::UpdateKeyFrameGroup(int key_frame_show_index) { From ebac57ce9250aac85701f5258ec54f9cf9bf14a8 Mon Sep 17 00:00:00 2001 From: Angie Chiang Date: Wed, 25 Nov 2020 12:58:24 -0800 Subject: [PATCH 0095/1000] Fix typos in simple_encode.h Change-Id: Id83eff6cc12c441ce991fb1a73820d106311cf5e --- vp9/simple_encode.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vp9/simple_encode.h b/vp9/simple_encode.h index e3ef3cea95..8ec7069e83 100644 --- a/vp9/simple_encode.h +++ b/vp9/simple_encode.h @@ -281,7 +281,7 @@ struct EncodeFrameResult { std::vector motion_vector_info; // A vector of the tpl stats information. // The tpl stats measure the complexity of a frame, as well as the - // informatioin propagated along the motion trajactory between frames, in + // information propagated along the motion trajectory between frames, in // the reference frame structure. // The tpl stats could be used as a more accurate spatial and temporal // complexity measure in addition to the first pass stats. From ffc179d8bfb836d7b39aaf8595a9051b25a7b437 Mon Sep 17 00:00:00 2001 From: Jeremy Leconte Date: Thu, 10 Dec 2020 17:54:54 +0100 Subject: [PATCH 0096/1000] Fix nullptr with offset. The error occurs with low resolution when LibvpxVp8Encoder::NumberOfThreads returns 1. Bug: b:175283098 Change-Id: Icc9387c75f4ac6e4f09f102b3143e83c998c5e38 --- vp8/encoder/encodeframe.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c index 2b3d9564ce..2f84381d24 100644 --- a/vp8/encoder/encodeframe.c +++ b/vp8/encoder/encodeframe.c @@ -343,8 +343,11 @@ static void encode_mb_row(VP8_COMP *cpi, VP8_COMMON *cm, int mb_row, const int nsync = cpi->mt_sync_range; vpx_atomic_int rightmost_col = VPX_ATOMIC_INIT(cm->mb_cols + nsync); const vpx_atomic_int *last_row_current_mb_col; - vpx_atomic_int *current_mb_col = &cpi->mt_current_mb_col[mb_row]; + vpx_atomic_int *current_mb_col = NULL; + if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) != 0) { + current_mb_col = &cpi->mt_current_mb_col[mb_row]; + } if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) != 0 && mb_row != 0) { last_row_current_mb_col = &cpi->mt_current_mb_col[mb_row - 1]; } else { From 723fca7dd122c94e209864316e01535a23dc6d09 Mon Sep 17 00:00:00 2001 From: Gregor Jasny Date: Fri, 11 Dec 2020 08:00:07 +0100 Subject: [PATCH 0097/1000] configure: add darwin20 cross-compile support Change-Id: I91c0e832a6e76172397e97413329fd43edc81c78 --- build/make/configure.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/build/make/configure.sh b/build/make/configure.sh index 91a64b5041..c4e938fc72 100644 --- a/build/make/configure.sh +++ b/build/make/configure.sh @@ -919,8 +919,8 @@ process_common_toolchain() { add_ldflags "-mmacosx-version-min=10.15" ;; *-darwin20-*) - add_cflags "-mmacosx-version-min=10.16" - add_ldflags "-mmacosx-version-min=10.16" + add_cflags "-mmacosx-version-min=10.16 -arch ${toolchain%%-*}" + add_ldflags "-mmacosx-version-min=10.16 -arch ${toolchain%%-*}" ;; *-iphonesimulator-*) add_cflags "-miphoneos-version-min=${IOS_VERSION_MIN}" From 8ed23d5f7fd1a003ecd1eb543f3a662772306155 Mon Sep 17 00:00:00 2001 From: Hui Su Date: Tue, 15 Dec 2020 22:40:09 -0800 Subject: [PATCH 0098/1000] First pass: skip motion search for intra-only BUG=webm:1713 Change-Id: Ibad79cf5d12aa913e8c87a31d7d2124c00958691 --- vp9/encoder/vp9_firstpass.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index de954f7575..2a9cf52898 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -1081,8 +1081,8 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, x->mv_limits.col_max = ((cm->mb_cols - 1 - mb_col) * 16) + BORDER_MV_PIXELS_B16; - // Other than for the first frame do a motion search. - if (cm->current_video_frame > 0) { + // Other than for intra-only frame do a motion search. + if (!frame_is_intra_only(cm)) { int tmp_err, motion_error, this_motion_error, raw_motion_error; // Assume 0,0 motion with no mv overhead. MV mv = { 0, 0 }, tmp_mv = { 0, 0 }; From 67b1d7f1740158e795a26a722491424e6257c0dc Mon Sep 17 00:00:00 2001 From: Angie Chiang Date: Thu, 17 Dec 2020 17:32:26 -0800 Subject: [PATCH 0099/1000] Correct pixel_count in encode_frame_result Change-Id: I3270af4f793f8e453e10d1caf8ffa1a8d5d584a7 --- vp9/encoder/vp9_ext_ratectrl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vp9/encoder/vp9_ext_ratectrl.c b/vp9/encoder/vp9_ext_ratectrl.c index a6a3e21d3b..7d553a2ecd 100644 --- a/vp9/encoder/vp9_ext_ratectrl.c +++ b/vp9/encoder/vp9_ext_ratectrl.c @@ -134,8 +134,8 @@ void vp9_extrc_update_encodeframe_result(EXT_RATECTRL *ext_ratectrl, vpx_rc_encodeframe_result_t encode_frame_result; encode_frame_result.bit_count = bit_count; encode_frame_result.pixel_count = - source_frame->y_width * source_frame->y_height + - 2 * source_frame->uv_width * source_frame->uv_height; + source_frame->y_crop_width * source_frame->y_crop_height + + 2 * source_frame->uv_crop_width * source_frame->uv_crop_height; #if CONFIG_VP9_HIGHBITDEPTH vpx_calc_highbd_psnr(source_frame, coded_frame, &psnr, bit_depth, input_bit_depth); From 3a38edea2cd114d53914cab017cab2e43a600031 Mon Sep 17 00:00:00 2001 From: Angie Chiang Date: Thu, 17 Dec 2020 18:09:55 -0800 Subject: [PATCH 0100/1000] Fix show_index in vp9_extrc_encodeframe_decision() Change-Id: I93bb1fb3c14126d881d3f691d30875a0062e436c --- test/vp9_ext_ratectrl_test.cc | 3 +++ vp9/encoder/vp9_encoder.c | 4 +++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/test/vp9_ext_ratectrl_test.cc b/test/vp9_ext_ratectrl_test.cc index 812a18ed22..4b3693a347 100644 --- a/test/vp9_ext_ratectrl_test.cc +++ b/test/vp9_ext_ratectrl_test.cc @@ -73,6 +73,7 @@ vpx_rc_status_t rc_get_encodeframe_decision( EXPECT_EQ(encode_frame_info->coding_index, toy_rate_ctrl->coding_index); if (encode_frame_info->coding_index == 0) { + EXPECT_EQ(encode_frame_info->show_index, 0); EXPECT_EQ(encode_frame_info->gop_index, 0); EXPECT_EQ(encode_frame_info->frame_type, 0 /*kFrameTypeKey*/); EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0], @@ -84,6 +85,7 @@ vpx_rc_status_t rc_get_encodeframe_decision( } if (encode_frame_info->coding_index == 1) { + EXPECT_EQ(encode_frame_info->show_index, 4); EXPECT_EQ(encode_frame_info->gop_index, 1); EXPECT_EQ(encode_frame_info->frame_type, 2 /*kFrameTypeAltRef*/); EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0], @@ -104,6 +106,7 @@ vpx_rc_status_t rc_get_encodeframe_decision( } if (encode_frame_info->coding_index == 5) { + EXPECT_EQ(encode_frame_info->show_index, 4); EXPECT_EQ(encode_frame_info->gop_index, 0); EXPECT_EQ(encode_frame_info->frame_type, 3 /*kFrameTypeOverlay*/); EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0], diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 37f644501d..6968e57919 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -4508,9 +4508,11 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest FRAME_UPDATE_TYPE update_type = gf_group->update_type[gf_group->index]; const int ref_frame_flags = get_ref_frame_flags(cpi); RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES]; + const RefCntBuffer *curr_frame_buf = + get_ref_cnt_buffer(cm, cm->new_fb_idx); get_ref_frame_bufs(cpi, ref_frame_bufs); vp9_extrc_get_encodeframe_decision( - &cpi->ext_ratectrl, cm->current_video_frame, + &cpi->ext_ratectrl, curr_frame_buf->frame_index, cm->current_frame_coding_index, gf_group->index, update_type, ref_frame_bufs, ref_frame_flags, &encode_frame_decision); q = encode_frame_decision.q_index; From 576e0801f9281fd54e2c69ad5be5fef7af656011 Mon Sep 17 00:00:00 2001 From: Hui Su Date: Wed, 13 Jan 2021 10:51:39 -0800 Subject: [PATCH 0101/1000] vpxenc: initalize the image object Otherwise it would cause problem when calling vpx_img_free() at the end if no frame is read. Change-Id: Ide0ed28eeb142d65d04703442cc4f098ac8edb34 --- vpxenc.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/vpxenc.c b/vpxenc.c index 5d7546eb28..5042e688c9 100644 --- a/vpxenc.c +++ b/vpxenc.c @@ -1636,6 +1636,7 @@ int main(int argc, const char **argv_) { int res = 0; memset(&input, 0, sizeof(input)); + memset(&raw, 0, sizeof(raw)); exec_name = argv_[0]; /* Setup default input stream settings */ @@ -1781,14 +1782,10 @@ int main(int argc, const char **argv_) { FOREACH_STREAM(show_stream_config(stream, &global, &input)); if (pass == (global.pass ? global.pass - 1 : 0)) { - if (input.file_type == FILE_TYPE_Y4M) - /*The Y4M reader does its own allocation. - Just initialize this here to avoid problems if we never read any - frames.*/ - memset(&raw, 0, sizeof(raw)); - else + // The Y4M reader does its own allocation. + if (input.file_type != FILE_TYPE_Y4M) { vpx_img_alloc(&raw, input.fmt, input.width, input.height, 32); - + } FOREACH_STREAM(stream->rate_hist = init_rate_histogram( &stream->config.cfg, &global.framerate)); } From ecbb0e0e2a9b0500db432922b436d1f59ae9b011 Mon Sep 17 00:00:00 2001 From: Elliott Karpilovsky Date: Thu, 14 Jan 2021 14:17:08 -0800 Subject: [PATCH 0102/1000] Relax constraints on Y4M header parsing Some refactoring and cleanup -- do not count the first 9 bytes against the header limit. Add a unit test. BUG=aomedia:2876 Change-Id: Id897d565e2917b48460cc77cd082cec4c98b42cb --- test/y4m_test.cc | 25 +++ y4minput.c | 394 +++++++++++++++++++++++++---------------------- y4minput.h | 12 +- 3 files changed, 241 insertions(+), 190 deletions(-) diff --git a/test/y4m_test.cc b/test/y4m_test.cc index 46cb5cff80..5df389f520 100644 --- a/test/y4m_test.cc +++ b/test/y4m_test.cc @@ -188,4 +188,29 @@ TEST_P(Y4mVideoWriteTest, WriteTest) { INSTANTIATE_TEST_SUITE_P(C, Y4mVideoWriteTest, ::testing::ValuesIn(kY4mTestVectors)); + +static const char kY4MRegularHeader[] = + "YUV4MPEG2 W4 H4 F30:1 Ip A0:0 C420jpeg XYSCSS=420JPEG\n" + "FRAME\n" + "012345678912345601230123"; + +TEST(Y4MHeaderTest, RegularHeader) { + libvpx_test::TempOutFile f; + fwrite(kY4MRegularHeader, 1, sizeof(kY4MRegularHeader), f.file()); + fflush(f.file()); + EXPECT_EQ(0, fseek(f.file(), 0, 0)); + + y4m_input y4m; + EXPECT_EQ(y4m_input_open(&y4m, f.file(), /*skip_buffer=*/NULL, + /*num_skip=*/0, /*only_420=*/0), + 0); + EXPECT_EQ(y4m.pic_w, 4); + EXPECT_EQ(y4m.pic_h, 4); + EXPECT_EQ(y4m.fps_n, 30); + EXPECT_EQ(y4m.fps_d, 1); + EXPECT_EQ(y4m.interlace, 'p'); + EXPECT_EQ(strcmp("420jpeg", y4m.chroma_type), 0); + y4m_input_close(&y4m); +} + } // namespace diff --git a/y4minput.c b/y4minput.c index 007bd9971b..68000768c9 100644 --- a/y4minput.c +++ b/y4minput.c @@ -10,6 +10,7 @@ * Based on code from the OggTheora software codec source code, * Copyright (C) 2002-2010 The Xiph.Org Foundation and contributors. */ +#include #include #include #include @@ -784,277 +785,294 @@ static void y4m_convert_null(y4m_input *_y4m, unsigned char *_dst, (void)_aux; } -int y4m_input_open(y4m_input *_y4m, FILE *_fin, char *_skip, int _nskip, - int only_420) { +static const char TAG[] = "YUV4MPEG2"; + +int y4m_input_open(y4m_input *y4m_ctx, FILE *file, char *skip_buffer, + int num_skip, int only_420) { + // File must start with |TAG|. + char tag_buffer[9]; // 9 == strlen(TAG) char buffer[80] = { 0 }; int ret; int i; + // Read as much as possible from |skip_buffer|, which were characters + // that were previously read from the file to do input-type detection. + assert(num_skip >= 0 && num_skip <= 8); + if (num_skip > 0) { + memcpy(tag_buffer, skip_buffer, num_skip); + } + // Start reading from the file now that the |skip_buffer| is depleted. + if (!file_read(tag_buffer + num_skip, 9 - num_skip, file)) { + return -1; + } + if (memcmp(TAG, tag_buffer, 9) != 0) { + fprintf(stderr, "Error parsing header: must start with %s\n", TAG); + return -1; + } + // Next character must be a space. + if (!file_read(buffer, 1, file) || buffer[0] != ' ') { + fprintf(stderr, "Error parsing header: space must follow %s\n", TAG); + return -1; + } /*Read until newline, or 80 cols, whichever happens first.*/ for (i = 0; i < 79; i++) { - if (_nskip > 0) { - buffer[i] = *_skip++; - _nskip--; - } else { - if (!file_read(buffer + i, 1, _fin)) return -1; - } + if (!file_read(buffer + i, 1, file)) return -1; if (buffer[i] == '\n') break; } /*We skipped too much header data.*/ - if (_nskip > 0) return -1; if (i == 79) { - fprintf(stderr, "Error parsing header; not a YUV2MPEG2 file?\n"); + fprintf(stderr, "Error parsing header; not a YUV4MPEG2 file?\n"); return -1; } buffer[i] = '\0'; - if (memcmp(buffer, "YUV4MPEG", 8)) { - fprintf(stderr, "Incomplete magic for YUV4MPEG file.\n"); - return -1; - } - if (buffer[8] != '2') { - fprintf(stderr, "Incorrect YUV input file version; YUV4MPEG2 required.\n"); - } - ret = y4m_parse_tags(_y4m, buffer + 5); + ret = y4m_parse_tags(y4m_ctx, buffer); if (ret < 0) { fprintf(stderr, "Error parsing YUV4MPEG2 header.\n"); return ret; } - if (_y4m->interlace == '?') { + if (y4m_ctx->interlace == '?') { fprintf(stderr, "Warning: Input video interlacing format unknown; " "assuming progressive scan.\n"); - } else if (_y4m->interlace != 'p') { + } else if (y4m_ctx->interlace != 'p') { fprintf(stderr, "Input video is interlaced; " "Only progressive scan handled.\n"); return -1; } - _y4m->vpx_fmt = VPX_IMG_FMT_I420; - _y4m->bps = 12; - _y4m->bit_depth = 8; - if (strcmp(_y4m->chroma_type, "420") == 0 || - strcmp(_y4m->chroma_type, "420jpeg") == 0) { - _y4m->src_c_dec_h = _y4m->dst_c_dec_h = _y4m->src_c_dec_v = - _y4m->dst_c_dec_v = 2; - _y4m->dst_buf_read_sz = - _y4m->pic_w * _y4m->pic_h + - 2 * ((_y4m->pic_w + 1) / 2) * ((_y4m->pic_h + 1) / 2); + y4m_ctx->vpx_fmt = VPX_IMG_FMT_I420; + y4m_ctx->bps = 12; + y4m_ctx->bit_depth = 8; + y4m_ctx->aux_buf = NULL; + y4m_ctx->dst_buf = NULL; + if (strcmp(y4m_ctx->chroma_type, "420") == 0 || + strcmp(y4m_ctx->chroma_type, "420jpeg") == 0) { + y4m_ctx->src_c_dec_h = y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_v = + y4m_ctx->dst_c_dec_v = 2; + y4m_ctx->dst_buf_read_sz = + y4m_ctx->pic_w * y4m_ctx->pic_h + + 2 * ((y4m_ctx->pic_w + 1) / 2) * ((y4m_ctx->pic_h + 1) / 2); /* Natively supported: no conversion required. */ - _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0; - _y4m->convert = y4m_convert_null; - } else if (strcmp(_y4m->chroma_type, "420p10") == 0) { - _y4m->src_c_dec_h = 2; - _y4m->dst_c_dec_h = 2; - _y4m->src_c_dec_v = 2; - _y4m->dst_c_dec_v = 2; - _y4m->dst_buf_read_sz = - 2 * (_y4m->pic_w * _y4m->pic_h + - 2 * ((_y4m->pic_w + 1) / 2) * ((_y4m->pic_h + 1) / 2)); + y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0; + y4m_ctx->convert = y4m_convert_null; + } else if (strcmp(y4m_ctx->chroma_type, "420p10") == 0) { + y4m_ctx->src_c_dec_h = 2; + y4m_ctx->dst_c_dec_h = 2; + y4m_ctx->src_c_dec_v = 2; + y4m_ctx->dst_c_dec_v = 2; + y4m_ctx->dst_buf_read_sz = + 2 * (y4m_ctx->pic_w * y4m_ctx->pic_h + + 2 * ((y4m_ctx->pic_w + 1) / 2) * ((y4m_ctx->pic_h + 1) / 2)); /* Natively supported: no conversion required. */ - _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0; - _y4m->convert = y4m_convert_null; - _y4m->bit_depth = 10; - _y4m->bps = 15; - _y4m->vpx_fmt = VPX_IMG_FMT_I42016; + y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0; + y4m_ctx->convert = y4m_convert_null; + y4m_ctx->bit_depth = 10; + y4m_ctx->bps = 15; + y4m_ctx->vpx_fmt = VPX_IMG_FMT_I42016; if (only_420) { fprintf(stderr, "Unsupported conversion from 420p10 to 420jpeg\n"); return -1; } - } else if (strcmp(_y4m->chroma_type, "420p12") == 0) { - _y4m->src_c_dec_h = 2; - _y4m->dst_c_dec_h = 2; - _y4m->src_c_dec_v = 2; - _y4m->dst_c_dec_v = 2; - _y4m->dst_buf_read_sz = - 2 * (_y4m->pic_w * _y4m->pic_h + - 2 * ((_y4m->pic_w + 1) / 2) * ((_y4m->pic_h + 1) / 2)); + } else if (strcmp(y4m_ctx->chroma_type, "420p12") == 0) { + y4m_ctx->src_c_dec_h = 2; + y4m_ctx->dst_c_dec_h = 2; + y4m_ctx->src_c_dec_v = 2; + y4m_ctx->dst_c_dec_v = 2; + y4m_ctx->dst_buf_read_sz = + 2 * (y4m_ctx->pic_w * y4m_ctx->pic_h + + 2 * ((y4m_ctx->pic_w + 1) / 2) * ((y4m_ctx->pic_h + 1) / 2)); /* Natively supported: no conversion required. */ - _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0; - _y4m->convert = y4m_convert_null; - _y4m->bit_depth = 12; - _y4m->bps = 18; - _y4m->vpx_fmt = VPX_IMG_FMT_I42016; + y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0; + y4m_ctx->convert = y4m_convert_null; + y4m_ctx->bit_depth = 12; + y4m_ctx->bps = 18; + y4m_ctx->vpx_fmt = VPX_IMG_FMT_I42016; if (only_420) { fprintf(stderr, "Unsupported conversion from 420p12 to 420jpeg\n"); return -1; } - } else if (strcmp(_y4m->chroma_type, "420mpeg2") == 0) { - _y4m->src_c_dec_h = _y4m->dst_c_dec_h = _y4m->src_c_dec_v = - _y4m->dst_c_dec_v = 2; - _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h; + } else if (strcmp(y4m_ctx->chroma_type, "420mpeg2") == 0) { + y4m_ctx->src_c_dec_h = y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_v = + y4m_ctx->dst_c_dec_v = 2; + y4m_ctx->dst_buf_read_sz = y4m_ctx->pic_w * y4m_ctx->pic_h; /*Chroma filter required: read into the aux buf first.*/ - _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = - 2 * ((_y4m->pic_w + 1) / 2) * ((_y4m->pic_h + 1) / 2); - _y4m->convert = y4m_convert_42xmpeg2_42xjpeg; - } else if (strcmp(_y4m->chroma_type, "420paldv") == 0) { - _y4m->src_c_dec_h = _y4m->dst_c_dec_h = _y4m->src_c_dec_v = - _y4m->dst_c_dec_v = 2; - _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h; + y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = + 2 * ((y4m_ctx->pic_w + 1) / 2) * ((y4m_ctx->pic_h + 1) / 2); + y4m_ctx->convert = y4m_convert_42xmpeg2_42xjpeg; + } else if (strcmp(y4m_ctx->chroma_type, "420paldv") == 0) { + y4m_ctx->src_c_dec_h = y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_v = + y4m_ctx->dst_c_dec_v = 2; + y4m_ctx->dst_buf_read_sz = y4m_ctx->pic_w * y4m_ctx->pic_h; /*Chroma filter required: read into the aux buf first. We need to make two filter passes, so we need some extra space in the aux buffer.*/ - _y4m->aux_buf_sz = 3 * ((_y4m->pic_w + 1) / 2) * ((_y4m->pic_h + 1) / 2); - _y4m->aux_buf_read_sz = - 2 * ((_y4m->pic_w + 1) / 2) * ((_y4m->pic_h + 1) / 2); - _y4m->convert = y4m_convert_42xpaldv_42xjpeg; - } else if (strcmp(_y4m->chroma_type, "422jpeg") == 0) { - _y4m->src_c_dec_h = _y4m->dst_c_dec_h = 2; - _y4m->src_c_dec_v = 1; - _y4m->dst_c_dec_v = 2; - _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h; + y4m_ctx->aux_buf_sz = + 3 * ((y4m_ctx->pic_w + 1) / 2) * ((y4m_ctx->pic_h + 1) / 2); + y4m_ctx->aux_buf_read_sz = + 2 * ((y4m_ctx->pic_w + 1) / 2) * ((y4m_ctx->pic_h + 1) / 2); + y4m_ctx->convert = y4m_convert_42xpaldv_42xjpeg; + } else if (strcmp(y4m_ctx->chroma_type, "422jpeg") == 0) { + y4m_ctx->src_c_dec_h = y4m_ctx->dst_c_dec_h = 2; + y4m_ctx->src_c_dec_v = 1; + y4m_ctx->dst_c_dec_v = 2; + y4m_ctx->dst_buf_read_sz = y4m_ctx->pic_w * y4m_ctx->pic_h; /*Chroma filter required: read into the aux buf first.*/ - _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = - 2 * ((_y4m->pic_w + 1) / 2) * _y4m->pic_h; - _y4m->convert = y4m_convert_422jpeg_420jpeg; - } else if (strcmp(_y4m->chroma_type, "422") == 0) { - _y4m->src_c_dec_h = 2; - _y4m->src_c_dec_v = 1; + y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = + 2 * ((y4m_ctx->pic_w + 1) / 2) * y4m_ctx->pic_h; + y4m_ctx->convert = y4m_convert_422jpeg_420jpeg; + } else if (strcmp(y4m_ctx->chroma_type, "422") == 0) { + y4m_ctx->src_c_dec_h = 2; + y4m_ctx->src_c_dec_v = 1; if (only_420) { - _y4m->dst_c_dec_h = 2; - _y4m->dst_c_dec_v = 2; - _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h; + y4m_ctx->dst_c_dec_h = 2; + y4m_ctx->dst_c_dec_v = 2; + y4m_ctx->dst_buf_read_sz = y4m_ctx->pic_w * y4m_ctx->pic_h; /*Chroma filter required: read into the aux buf first. We need to make two filter passes, so we need some extra space in the aux buffer.*/ - _y4m->aux_buf_read_sz = 2 * ((_y4m->pic_w + 1) / 2) * _y4m->pic_h; - _y4m->aux_buf_sz = - _y4m->aux_buf_read_sz + ((_y4m->pic_w + 1) / 2) * _y4m->pic_h; - _y4m->convert = y4m_convert_422_420jpeg; + y4m_ctx->aux_buf_read_sz = + 2 * ((y4m_ctx->pic_w + 1) / 2) * y4m_ctx->pic_h; + y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz + + ((y4m_ctx->pic_w + 1) / 2) * y4m_ctx->pic_h; + y4m_ctx->convert = y4m_convert_422_420jpeg; } else { - _y4m->vpx_fmt = VPX_IMG_FMT_I422; - _y4m->bps = 16; - _y4m->dst_c_dec_h = _y4m->src_c_dec_h; - _y4m->dst_c_dec_v = _y4m->src_c_dec_v; - _y4m->dst_buf_read_sz = - _y4m->pic_w * _y4m->pic_h + 2 * ((_y4m->pic_w + 1) / 2) * _y4m->pic_h; + y4m_ctx->vpx_fmt = VPX_IMG_FMT_I422; + y4m_ctx->bps = 16; + y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_h; + y4m_ctx->dst_c_dec_v = y4m_ctx->src_c_dec_v; + y4m_ctx->dst_buf_read_sz = + y4m_ctx->pic_w * y4m_ctx->pic_h + + 2 * ((y4m_ctx->pic_w + 1) / 2) * y4m_ctx->pic_h; /*Natively supported: no conversion required.*/ - _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0; - _y4m->convert = y4m_convert_null; + y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0; + y4m_ctx->convert = y4m_convert_null; } - } else if (strcmp(_y4m->chroma_type, "422p10") == 0) { - _y4m->src_c_dec_h = 2; - _y4m->src_c_dec_v = 1; - _y4m->vpx_fmt = VPX_IMG_FMT_I42216; - _y4m->bps = 20; - _y4m->bit_depth = 10; - _y4m->dst_c_dec_h = _y4m->src_c_dec_h; - _y4m->dst_c_dec_v = _y4m->src_c_dec_v; - _y4m->dst_buf_read_sz = 2 * (_y4m->pic_w * _y4m->pic_h + - 2 * ((_y4m->pic_w + 1) / 2) * _y4m->pic_h); - _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0; - _y4m->convert = y4m_convert_null; + } else if (strcmp(y4m_ctx->chroma_type, "422p10") == 0) { + y4m_ctx->src_c_dec_h = 2; + y4m_ctx->src_c_dec_v = 1; + y4m_ctx->vpx_fmt = VPX_IMG_FMT_I42216; + y4m_ctx->bps = 20; + y4m_ctx->bit_depth = 10; + y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_h; + y4m_ctx->dst_c_dec_v = y4m_ctx->src_c_dec_v; + y4m_ctx->dst_buf_read_sz = + 2 * (y4m_ctx->pic_w * y4m_ctx->pic_h + + 2 * ((y4m_ctx->pic_w + 1) / 2) * y4m_ctx->pic_h); + y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0; + y4m_ctx->convert = y4m_convert_null; if (only_420) { fprintf(stderr, "Unsupported conversion from 422p10 to 420jpeg\n"); return -1; } - } else if (strcmp(_y4m->chroma_type, "422p12") == 0) { - _y4m->src_c_dec_h = 2; - _y4m->src_c_dec_v = 1; - _y4m->vpx_fmt = VPX_IMG_FMT_I42216; - _y4m->bps = 24; - _y4m->bit_depth = 12; - _y4m->dst_c_dec_h = _y4m->src_c_dec_h; - _y4m->dst_c_dec_v = _y4m->src_c_dec_v; - _y4m->dst_buf_read_sz = 2 * (_y4m->pic_w * _y4m->pic_h + - 2 * ((_y4m->pic_w + 1) / 2) * _y4m->pic_h); - _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0; - _y4m->convert = y4m_convert_null; + } else if (strcmp(y4m_ctx->chroma_type, "422p12") == 0) { + y4m_ctx->src_c_dec_h = 2; + y4m_ctx->src_c_dec_v = 1; + y4m_ctx->vpx_fmt = VPX_IMG_FMT_I42216; + y4m_ctx->bps = 24; + y4m_ctx->bit_depth = 12; + y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_h; + y4m_ctx->dst_c_dec_v = y4m_ctx->src_c_dec_v; + y4m_ctx->dst_buf_read_sz = + 2 * (y4m_ctx->pic_w * y4m_ctx->pic_h + + 2 * ((y4m_ctx->pic_w + 1) / 2) * y4m_ctx->pic_h); + y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0; + y4m_ctx->convert = y4m_convert_null; if (only_420) { fprintf(stderr, "Unsupported conversion from 422p12 to 420jpeg\n"); return -1; } - } else if (strcmp(_y4m->chroma_type, "411") == 0) { - _y4m->src_c_dec_h = 4; - _y4m->dst_c_dec_h = 2; - _y4m->src_c_dec_v = 1; - _y4m->dst_c_dec_v = 2; - _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h; + } else if (strcmp(y4m_ctx->chroma_type, "411") == 0) { + y4m_ctx->src_c_dec_h = 4; + y4m_ctx->dst_c_dec_h = 2; + y4m_ctx->src_c_dec_v = 1; + y4m_ctx->dst_c_dec_v = 2; + y4m_ctx->dst_buf_read_sz = y4m_ctx->pic_w * y4m_ctx->pic_h; /*Chroma filter required: read into the aux buf first. We need to make two filter passes, so we need some extra space in the aux buffer.*/ - _y4m->aux_buf_read_sz = 2 * ((_y4m->pic_w + 3) / 4) * _y4m->pic_h; - _y4m->aux_buf_sz = - _y4m->aux_buf_read_sz + ((_y4m->pic_w + 1) / 2) * _y4m->pic_h; - _y4m->convert = y4m_convert_411_420jpeg; + y4m_ctx->aux_buf_read_sz = 2 * ((y4m_ctx->pic_w + 3) / 4) * y4m_ctx->pic_h; + y4m_ctx->aux_buf_sz = + y4m_ctx->aux_buf_read_sz + ((y4m_ctx->pic_w + 1) / 2) * y4m_ctx->pic_h; + y4m_ctx->convert = y4m_convert_411_420jpeg; fprintf(stderr, "Unsupported conversion from yuv 411\n"); return -1; - } else if (strcmp(_y4m->chroma_type, "444") == 0) { - _y4m->src_c_dec_h = 1; - _y4m->src_c_dec_v = 1; + } else if (strcmp(y4m_ctx->chroma_type, "444") == 0) { + y4m_ctx->src_c_dec_h = 1; + y4m_ctx->src_c_dec_v = 1; if (only_420) { - _y4m->dst_c_dec_h = 2; - _y4m->dst_c_dec_v = 2; - _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h; + y4m_ctx->dst_c_dec_h = 2; + y4m_ctx->dst_c_dec_v = 2; + y4m_ctx->dst_buf_read_sz = y4m_ctx->pic_w * y4m_ctx->pic_h; /*Chroma filter required: read into the aux buf first. We need to make two filter passes, so we need some extra space in the aux buffer.*/ - _y4m->aux_buf_read_sz = 2 * _y4m->pic_w * _y4m->pic_h; - _y4m->aux_buf_sz = - _y4m->aux_buf_read_sz + ((_y4m->pic_w + 1) / 2) * _y4m->pic_h; - _y4m->convert = y4m_convert_444_420jpeg; + y4m_ctx->aux_buf_read_sz = 2 * y4m_ctx->pic_w * y4m_ctx->pic_h; + y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz + + ((y4m_ctx->pic_w + 1) / 2) * y4m_ctx->pic_h; + y4m_ctx->convert = y4m_convert_444_420jpeg; } else { - _y4m->vpx_fmt = VPX_IMG_FMT_I444; - _y4m->bps = 24; - _y4m->dst_c_dec_h = _y4m->src_c_dec_h; - _y4m->dst_c_dec_v = _y4m->src_c_dec_v; - _y4m->dst_buf_read_sz = 3 * _y4m->pic_w * _y4m->pic_h; + y4m_ctx->vpx_fmt = VPX_IMG_FMT_I444; + y4m_ctx->bps = 24; + y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_h; + y4m_ctx->dst_c_dec_v = y4m_ctx->src_c_dec_v; + y4m_ctx->dst_buf_read_sz = 3 * y4m_ctx->pic_w * y4m_ctx->pic_h; /*Natively supported: no conversion required.*/ - _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0; - _y4m->convert = y4m_convert_null; + y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0; + y4m_ctx->convert = y4m_convert_null; } - } else if (strcmp(_y4m->chroma_type, "444p10") == 0) { - _y4m->src_c_dec_h = 1; - _y4m->src_c_dec_v = 1; - _y4m->vpx_fmt = VPX_IMG_FMT_I44416; - _y4m->bps = 30; - _y4m->bit_depth = 10; - _y4m->dst_c_dec_h = _y4m->src_c_dec_h; - _y4m->dst_c_dec_v = _y4m->src_c_dec_v; - _y4m->dst_buf_read_sz = 2 * 3 * _y4m->pic_w * _y4m->pic_h; - _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0; - _y4m->convert = y4m_convert_null; + } else if (strcmp(y4m_ctx->chroma_type, "444p10") == 0) { + y4m_ctx->src_c_dec_h = 1; + y4m_ctx->src_c_dec_v = 1; + y4m_ctx->vpx_fmt = VPX_IMG_FMT_I44416; + y4m_ctx->bps = 30; + y4m_ctx->bit_depth = 10; + y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_h; + y4m_ctx->dst_c_dec_v = y4m_ctx->src_c_dec_v; + y4m_ctx->dst_buf_read_sz = 2 * 3 * y4m_ctx->pic_w * y4m_ctx->pic_h; + y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0; + y4m_ctx->convert = y4m_convert_null; if (only_420) { fprintf(stderr, "Unsupported conversion from 444p10 to 420jpeg\n"); return -1; } - } else if (strcmp(_y4m->chroma_type, "444p12") == 0) { - _y4m->src_c_dec_h = 1; - _y4m->src_c_dec_v = 1; - _y4m->vpx_fmt = VPX_IMG_FMT_I44416; - _y4m->bps = 36; - _y4m->bit_depth = 12; - _y4m->dst_c_dec_h = _y4m->src_c_dec_h; - _y4m->dst_c_dec_v = _y4m->src_c_dec_v; - _y4m->dst_buf_read_sz = 2 * 3 * _y4m->pic_w * _y4m->pic_h; - _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0; - _y4m->convert = y4m_convert_null; + } else if (strcmp(y4m_ctx->chroma_type, "444p12") == 0) { + y4m_ctx->src_c_dec_h = 1; + y4m_ctx->src_c_dec_v = 1; + y4m_ctx->vpx_fmt = VPX_IMG_FMT_I44416; + y4m_ctx->bps = 36; + y4m_ctx->bit_depth = 12; + y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_h; + y4m_ctx->dst_c_dec_v = y4m_ctx->src_c_dec_v; + y4m_ctx->dst_buf_read_sz = 2 * 3 * y4m_ctx->pic_w * y4m_ctx->pic_h; + y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0; + y4m_ctx->convert = y4m_convert_null; if (only_420) { fprintf(stderr, "Unsupported conversion from 444p12 to 420jpeg\n"); return -1; } - } else if (strcmp(_y4m->chroma_type, "mono") == 0) { - _y4m->src_c_dec_h = _y4m->src_c_dec_v = 0; - _y4m->dst_c_dec_h = _y4m->dst_c_dec_v = 2; - _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h; + } else if (strcmp(y4m_ctx->chroma_type, "mono") == 0) { + y4m_ctx->src_c_dec_h = y4m_ctx->src_c_dec_v = 0; + y4m_ctx->dst_c_dec_h = y4m_ctx->dst_c_dec_v = 2; + y4m_ctx->dst_buf_read_sz = y4m_ctx->pic_w * y4m_ctx->pic_h; /*No extra space required, but we need to clear the chroma planes.*/ - _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0; - _y4m->convert = y4m_convert_mono_420jpeg; + y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0; + y4m_ctx->convert = y4m_convert_mono_420jpeg; } else { - fprintf(stderr, "Unknown chroma sampling type: %s\n", _y4m->chroma_type); + fprintf(stderr, "Unknown chroma sampling type: %s\n", y4m_ctx->chroma_type); return -1; } /*The size of the final frame buffers is always computed from the destination chroma decimation type.*/ - _y4m->dst_buf_sz = - _y4m->pic_w * _y4m->pic_h + - 2 * ((_y4m->pic_w + _y4m->dst_c_dec_h - 1) / _y4m->dst_c_dec_h) * - ((_y4m->pic_h + _y4m->dst_c_dec_v - 1) / _y4m->dst_c_dec_v); - if (_y4m->bit_depth == 8) - _y4m->dst_buf = (unsigned char *)malloc(_y4m->dst_buf_sz); + y4m_ctx->dst_buf_sz = + y4m_ctx->pic_w * y4m_ctx->pic_h + + 2 * ((y4m_ctx->pic_w + y4m_ctx->dst_c_dec_h - 1) / y4m_ctx->dst_c_dec_h) * + ((y4m_ctx->pic_h + y4m_ctx->dst_c_dec_v - 1) / y4m_ctx->dst_c_dec_v); + if (y4m_ctx->bit_depth == 8) + y4m_ctx->dst_buf = (unsigned char *)malloc(y4m_ctx->dst_buf_sz); else - _y4m->dst_buf = (unsigned char *)malloc(2 * _y4m->dst_buf_sz); + y4m_ctx->dst_buf = (unsigned char *)malloc(2 * y4m_ctx->dst_buf_sz); - if (_y4m->aux_buf_sz > 0) - _y4m->aux_buf = (unsigned char *)malloc(_y4m->aux_buf_sz); + if (y4m_ctx->aux_buf_sz > 0) + y4m_ctx->aux_buf = (unsigned char *)malloc(y4m_ctx->aux_buf_sz); return 0; } diff --git a/y4minput.h b/y4minput.h index a4a8b18dc5..573750d749 100644 --- a/y4minput.h +++ b/y4minput.h @@ -56,8 +56,16 @@ struct y4m_input { unsigned int bit_depth; }; -int y4m_input_open(y4m_input *_y4m, FILE *_fin, char *_skip, int _nskip, - int only_420); +/** + * Open the input file, treating it as Y4M. |y4m_ctx| is filled in after + * reading it. The |skip_buffer| indicates bytes that were previously read + * from |file|, to do input-type detection; this buffer will be read before + * the |file| is read. It is of size |num_skip|, which *must* be 8 or less. + * + * Returns 0 on success, -1 on failure. + */ +int y4m_input_open(y4m_input *y4m_ctx, FILE *file, char *skip_buffer, + int num_skip, int only_420); void y4m_input_close(y4m_input *_y4m); int y4m_input_fetch_frame(y4m_input *_y4m, FILE *_fin, vpx_image_t *img); From fe1c96d1113ad73370841f64913dfcd361ff9bf5 Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 19 Jan 2021 18:38:23 -0800 Subject: [PATCH 0103/1000] {highbd_,}loopfilter_neon.c: quiet -Wmaybe-uninitialized Seen with arm-linux-gnueabihf-gcc-8 (8.3.0 & 8.4.0) Without reworking the code or adding an additional branch this warning cannot be silenced otherwise. The loopfilter is only called when needed for a block so these output pixels will be set. BUG=b/176822719 Change-Id: I9cf6e59bd5de901e168867ccbe021d28d0c04933 --- vpx_dsp/arm/highbd_loopfilter_neon.c | 15 +++++++++++++++ vpx_dsp/arm/loopfilter_neon.c | 15 +++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/vpx_dsp/arm/highbd_loopfilter_neon.c b/vpx_dsp/arm/highbd_loopfilter_neon.c index 5530c6425b..8d6e8acc4c 100644 --- a/vpx_dsp/arm/highbd_loopfilter_neon.c +++ b/vpx_dsp/arm/highbd_loopfilter_neon.c @@ -661,6 +661,17 @@ void vpx_highbd_lpf_vertical_8_dual_neon( vpx_highbd_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1, bd); } +// Quiet warnings of the form: 'vpx_dsp/arm/highbd_loopfilter_neon.c|675 col 67| +// warning: 'oq1' may be used uninitialized in this function +// [-Wmaybe-uninitialized]', for oq1-op1. Without reworking the code or adding +// an additional branch this warning cannot be silenced otherwise. The +// loopfilter is only called when needed for a block so these output pixels +// will be set. +#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#endif + static void lpf_horizontal_16_kernel(uint16_t *s, int p, const uint16x8_t blimit_vec, const uint16x8_t limit_vec, @@ -723,6 +734,10 @@ static void lpf_vertical_16_kernel(uint16_t *s, int p, } } +#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(__clang__) +#pragma GCC diagnostic pop +#endif + void vpx_highbd_lpf_horizontal_16_neon(uint16_t *s, int p, const uint8_t *blimit, const uint8_t *limit, diff --git a/vpx_dsp/arm/loopfilter_neon.c b/vpx_dsp/arm/loopfilter_neon.c index 7419cea022..c54e588239 100644 --- a/vpx_dsp/arm/loopfilter_neon.c +++ b/vpx_dsp/arm/loopfilter_neon.c @@ -975,6 +975,17 @@ FUN_LPF_16_KERNEL(_, 8) // lpf_16_kernel FUN_LPF_16_KERNEL(_dual_, 16) // lpf_16_dual_kernel #undef FUN_LPF_16_KERNEL +// Quiet warnings of the form: 'vpx_dsp/arm/loopfilter_neon.c|981 col 42| +// warning: 'oq1' may be used uninitialized in this function +// [-Wmaybe-uninitialized]', for oq1-op1. Without reworking the code or adding +// an additional branch this warning cannot be silenced otherwise. The +// loopfilter is only called when needed for a block so these output pixels +// will be set. +#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#endif + void vpx_lpf_horizontal_16_neon(uint8_t *s, int p, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { uint8x8_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7, op6, @@ -1090,3 +1101,7 @@ void vpx_lpf_vertical_16_dual_neon(uint8_t *s, int p, const uint8_t *blimit, vget_high_u8(oq0), vget_high_u8(oq1)); } } + +#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(__clang__) +#pragma GCC diagnostic pop +#endif From f4fc562489bcee227403ca00f589e70043ebc5dc Mon Sep 17 00:00:00 2001 From: Angie Chiang Date: Tue, 19 Jan 2021 17:36:12 -0800 Subject: [PATCH 0104/1000] Return status in vp9_extrc_create/init/delete Bug: webm:1716 Change-Id: I0b98741db8c639bdddd899fd6ad359da7b916086 --- vp9/encoder/vp9_ext_ratectrl.c | 43 +++++++++++++++++++++++++++------- vp9/encoder/vp9_ext_ratectrl.h | 9 +++---- 2 files changed, 39 insertions(+), 13 deletions(-) diff --git a/vp9/encoder/vp9_ext_ratectrl.c b/vp9/encoder/vp9_ext_ratectrl.c index 7d553a2ecd..d93abd60d3 100644 --- a/vp9/encoder/vp9_ext_ratectrl.c +++ b/vp9/encoder/vp9_ext_ratectrl.c @@ -13,31 +13,56 @@ #include "vp9/common/vp9_common.h" #include "vpx_dsp/psnr.h" -void vp9_extrc_init(EXT_RATECTRL *ext_ratectrl) { vp9_zero(*ext_ratectrl); } +vpx_codec_err_t vp9_extrc_init(EXT_RATECTRL *ext_ratectrl) { + if (ext_ratectrl == NULL) { + return VPX_CODEC_ERROR; + } + vp9_zero(*ext_ratectrl); + return VPX_CODEC_OK; +} -void vp9_extrc_create(vpx_rc_funcs_t funcs, vpx_rc_config_t ratectrl_config, - EXT_RATECTRL *ext_ratectrl) { +vpx_codec_err_t vp9_extrc_create(vpx_rc_funcs_t funcs, + vpx_rc_config_t ratectrl_config, + EXT_RATECTRL *ext_ratectrl) { + vpx_rc_status_t rc_status; vpx_rc_firstpass_stats_t *rc_firstpass_stats; + if (ext_ratectrl == NULL) { + return VPX_CODEC_ERROR; + } vp9_extrc_delete(ext_ratectrl); ext_ratectrl->funcs = funcs; ext_ratectrl->ratectrl_config = ratectrl_config; - ext_ratectrl->funcs.create_model(ext_ratectrl->funcs.priv, - &ext_ratectrl->ratectrl_config, - &ext_ratectrl->model); + rc_status = ext_ratectrl->funcs.create_model(ext_ratectrl->funcs.priv, + &ext_ratectrl->ratectrl_config, + &ext_ratectrl->model); + if (rc_status == VPX_RC_ERROR) { + return VPX_CODEC_ERROR; + } rc_firstpass_stats = &ext_ratectrl->rc_firstpass_stats; rc_firstpass_stats->num_frames = ratectrl_config.show_frame_count; rc_firstpass_stats->frame_stats = vpx_malloc(sizeof(*rc_firstpass_stats->frame_stats) * rc_firstpass_stats->num_frames); + if (rc_firstpass_stats->frame_stats == NULL) { + return VPX_CODEC_MEM_ERROR; + } ext_ratectrl->ready = 1; + return VPX_CODEC_OK; } -void vp9_extrc_delete(EXT_RATECTRL *ext_ratectrl) { +vpx_codec_err_t vp9_extrc_delete(EXT_RATECTRL *ext_ratectrl) { + if (ext_ratectrl == NULL) { + return VPX_CODEC_ERROR; + } if (ext_ratectrl->ready) { - ext_ratectrl->funcs.delete_model(ext_ratectrl->model); + vpx_rc_status_t rc_status = + ext_ratectrl->funcs.delete_model(ext_ratectrl->model); + if (rc_status == VPX_RC_ERROR) { + return VPX_CODEC_ERROR; + } vpx_free(ext_ratectrl->rc_firstpass_stats.frame_stats); } - vp9_extrc_init(ext_ratectrl); + return vp9_extrc_init(ext_ratectrl); } static void gen_rc_firstpass_stats(const FIRSTPASS_STATS *stats, diff --git a/vp9/encoder/vp9_ext_ratectrl.h b/vp9/encoder/vp9_ext_ratectrl.h index 6a86218dac..e4d56c0b22 100644 --- a/vp9/encoder/vp9_ext_ratectrl.h +++ b/vp9/encoder/vp9_ext_ratectrl.h @@ -22,12 +22,13 @@ typedef struct EXT_RATECTRL { vpx_rc_firstpass_stats_t rc_firstpass_stats; } EXT_RATECTRL; -void vp9_extrc_init(EXT_RATECTRL *ext_ratectrl); +vpx_codec_err_t vp9_extrc_init(EXT_RATECTRL *ext_ratectrl); -void vp9_extrc_create(vpx_rc_funcs_t funcs, vpx_rc_config_t ratectrl_config, - EXT_RATECTRL *ext_ratectrl); +vpx_codec_err_t vp9_extrc_create(vpx_rc_funcs_t funcs, + vpx_rc_config_t ratectrl_config, + EXT_RATECTRL *ext_ratectrl); -void vp9_extrc_delete(EXT_RATECTRL *ext_ratectrl); +vpx_codec_err_t vp9_extrc_delete(EXT_RATECTRL *ext_ratectrl); void vp9_extrc_send_firstpass_stats(EXT_RATECTRL *ext_ratectrl, const FIRST_PASS_INFO *first_pass_info); From 27f1838519ea1354bb8a038ec4e9d2c6da0da994 Mon Sep 17 00:00:00 2001 From: Angie Chiang Date: Tue, 19 Jan 2021 17:48:07 -0800 Subject: [PATCH 0105/1000] Return status in vp9_extrc_send_firstpass_stats Bug: webm:1716 Change-Id: I96b18436c58ed888fcf677097819cc0093b6f41d --- vp9/encoder/vp9_ext_ratectrl.c | 16 ++++++++++++---- vp9/encoder/vp9_ext_ratectrl.h | 4 ++-- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/vp9/encoder/vp9_ext_ratectrl.c b/vp9/encoder/vp9_ext_ratectrl.c index d93abd60d3..4a2e1b82dc 100644 --- a/vp9/encoder/vp9_ext_ratectrl.c +++ b/vp9/encoder/vp9_ext_ratectrl.c @@ -94,9 +94,13 @@ static void gen_rc_firstpass_stats(const FIRSTPASS_STATS *stats, rc_frame_stats->count = stats->count; } -void vp9_extrc_send_firstpass_stats(EXT_RATECTRL *ext_ratectrl, - const FIRST_PASS_INFO *first_pass_info) { +vpx_codec_err_t vp9_extrc_send_firstpass_stats( + EXT_RATECTRL *ext_ratectrl, const FIRST_PASS_INFO *first_pass_info) { + if (ext_ratectrl == NULL) { + return VPX_CODEC_ERROR; + } if (ext_ratectrl->ready) { + vpx_rc_status_t rc_status; vpx_rc_firstpass_stats_t *rc_firstpass_stats = &ext_ratectrl->rc_firstpass_stats; int i; @@ -105,9 +109,13 @@ void vp9_extrc_send_firstpass_stats(EXT_RATECTRL *ext_ratectrl, gen_rc_firstpass_stats(&first_pass_info->stats[i], &rc_firstpass_stats->frame_stats[i]); } - ext_ratectrl->funcs.send_firstpass_stats(ext_ratectrl->model, - rc_firstpass_stats); + rc_status = ext_ratectrl->funcs.send_firstpass_stats(ext_ratectrl->model, + rc_firstpass_stats); + if (rc_status == VPX_RC_ERROR) { + return VPX_CODEC_ERROR; + } } + return VPX_CODEC_OK; } static int extrc_get_frame_type(FRAME_UPDATE_TYPE update_type) { diff --git a/vp9/encoder/vp9_ext_ratectrl.h b/vp9/encoder/vp9_ext_ratectrl.h index e4d56c0b22..fbb5ebf05d 100644 --- a/vp9/encoder/vp9_ext_ratectrl.h +++ b/vp9/encoder/vp9_ext_ratectrl.h @@ -30,8 +30,8 @@ vpx_codec_err_t vp9_extrc_create(vpx_rc_funcs_t funcs, vpx_codec_err_t vp9_extrc_delete(EXT_RATECTRL *ext_ratectrl); -void vp9_extrc_send_firstpass_stats(EXT_RATECTRL *ext_ratectrl, - const FIRST_PASS_INFO *first_pass_info); +vpx_codec_err_t vp9_extrc_send_firstpass_stats( + EXT_RATECTRL *ext_ratectrl, const FIRST_PASS_INFO *first_pass_info); void vp9_extrc_get_encodeframe_decision( EXT_RATECTRL *ext_ratectrl, int show_index, int coding_index, int gop_index, From d890579a2ec8a8a36b1e75b3fe6662faa99608e5 Mon Sep 17 00:00:00 2001 From: Angie Chiang Date: Tue, 19 Jan 2021 17:57:00 -0800 Subject: [PATCH 0106/1000] Add status in vp9_extrc_get_encodeframe_decision Bug: webm:1716 Change-Id: Ie6d63a68539369c51fefefa528e299b00a967e29 --- vp9/encoder/vp9_ext_ratectrl.c | 12 ++++++++++-- vp9/encoder/vp9_ext_ratectrl.h | 2 +- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/vp9/encoder/vp9_ext_ratectrl.c b/vp9/encoder/vp9_ext_ratectrl.c index 4a2e1b82dc..ec6e198af0 100644 --- a/vp9/encoder/vp9_ext_ratectrl.c +++ b/vp9/encoder/vp9_ext_ratectrl.c @@ -135,12 +135,16 @@ static int extrc_get_frame_type(FRAME_UPDATE_TYPE update_type) { } } -void vp9_extrc_get_encodeframe_decision( +vpx_codec_err_t vp9_extrc_get_encodeframe_decision( EXT_RATECTRL *ext_ratectrl, int show_index, int coding_index, int gop_index, FRAME_UPDATE_TYPE update_type, RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES], int ref_frame_flags, vpx_rc_encodeframe_decision_t *encode_frame_decision) { + if (ext_ratectrl == NULL) { + return VPX_CODEC_ERROR; + } if (ext_ratectrl->ready) { + vpx_rc_status_t rc_status; vpx_rc_encodeframe_info_t encode_frame_info; encode_frame_info.show_index = show_index; encode_frame_info.coding_index = coding_index; @@ -151,9 +155,13 @@ void vp9_extrc_get_encodeframe_decision( encode_frame_info.ref_frame_coding_indexes, encode_frame_info.ref_frame_valid_list); - ext_ratectrl->funcs.get_encodeframe_decision( + rc_status = ext_ratectrl->funcs.get_encodeframe_decision( ext_ratectrl->model, &encode_frame_info, encode_frame_decision); + if (rc_status == VPX_RC_ERROR) { + return VPX_CODEC_ERROR; + } } + return VPX_CODEC_OK; } void vp9_extrc_update_encodeframe_result(EXT_RATECTRL *ext_ratectrl, diff --git a/vp9/encoder/vp9_ext_ratectrl.h b/vp9/encoder/vp9_ext_ratectrl.h index fbb5ebf05d..2082cd530e 100644 --- a/vp9/encoder/vp9_ext_ratectrl.h +++ b/vp9/encoder/vp9_ext_ratectrl.h @@ -33,7 +33,7 @@ vpx_codec_err_t vp9_extrc_delete(EXT_RATECTRL *ext_ratectrl); vpx_codec_err_t vp9_extrc_send_firstpass_stats( EXT_RATECTRL *ext_ratectrl, const FIRST_PASS_INFO *first_pass_info); -void vp9_extrc_get_encodeframe_decision( +vpx_codec_err_t vp9_extrc_get_encodeframe_decision( EXT_RATECTRL *ext_ratectrl, int show_index, int coding_index, int gop_index, FRAME_UPDATE_TYPE update_type, RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES], int ref_frame_flags, From d49700e25b280eba27ab3804cc769aed68ffb3e5 Mon Sep 17 00:00:00 2001 From: Angie Chiang Date: Tue, 19 Jan 2021 18:18:20 -0800 Subject: [PATCH 0107/1000] Add return to vp9_extrc_update_encodeframe_result Bug: webm:1716 Change-Id: Ib016ab5a49c765971366cc8d2b75bcca3ed5bd0f --- vp9/encoder/vp9_ext_ratectrl.c | 23 +++++++++++++++-------- vp9/encoder/vp9_ext_ratectrl.h | 11 +++++------ 2 files changed, 20 insertions(+), 14 deletions(-) diff --git a/vp9/encoder/vp9_ext_ratectrl.c b/vp9/encoder/vp9_ext_ratectrl.c index ec6e198af0..70d6dd9c22 100644 --- a/vp9/encoder/vp9_ext_ratectrl.c +++ b/vp9/encoder/vp9_ext_ratectrl.c @@ -164,14 +164,17 @@ vpx_codec_err_t vp9_extrc_get_encodeframe_decision( return VPX_CODEC_OK; } -void vp9_extrc_update_encodeframe_result(EXT_RATECTRL *ext_ratectrl, - int64_t bit_count, - const YV12_BUFFER_CONFIG *source_frame, - const YV12_BUFFER_CONFIG *coded_frame, - uint32_t bit_depth, - uint32_t input_bit_depth) { +vpx_codec_err_t vp9_extrc_update_encodeframe_result( + EXT_RATECTRL *ext_ratectrl, int64_t bit_count, + const YV12_BUFFER_CONFIG *source_frame, + const YV12_BUFFER_CONFIG *coded_frame, uint32_t bit_depth, + uint32_t input_bit_depth) { + if (ext_ratectrl == NULL) { + return VPX_CODEC_ERROR; + } if (ext_ratectrl->ready) { PSNR_STATS psnr; + vpx_rc_status_t rc_status; vpx_rc_encodeframe_result_t encode_frame_result; encode_frame_result.bit_count = bit_count; encode_frame_result.pixel_count = @@ -186,7 +189,11 @@ void vp9_extrc_update_encodeframe_result(EXT_RATECTRL *ext_ratectrl, vpx_calc_psnr(source_frame, coded_frame, &psnr); #endif encode_frame_result.sse = psnr.sse[0]; - ext_ratectrl->funcs.update_encodeframe_result(ext_ratectrl->model, - &encode_frame_result); + rc_status = ext_ratectrl->funcs.update_encodeframe_result( + ext_ratectrl->model, &encode_frame_result); + if (rc_status == VPX_RC_ERROR) { + return VPX_CODEC_ERROR; + } } + return VPX_CODEC_OK; } diff --git a/vp9/encoder/vp9_ext_ratectrl.h b/vp9/encoder/vp9_ext_ratectrl.h index 2082cd530e..11e9102a65 100644 --- a/vp9/encoder/vp9_ext_ratectrl.h +++ b/vp9/encoder/vp9_ext_ratectrl.h @@ -39,11 +39,10 @@ vpx_codec_err_t vp9_extrc_get_encodeframe_decision( RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES], int ref_frame_flags, vpx_rc_encodeframe_decision_t *encode_frame_decision); -void vp9_extrc_update_encodeframe_result(EXT_RATECTRL *ext_ratectrl, - int64_t bit_count, - const YV12_BUFFER_CONFIG *source_frame, - const YV12_BUFFER_CONFIG *coded_frame, - uint32_t bit_depth, - uint32_t input_bit_depth); +vpx_codec_err_t vp9_extrc_update_encodeframe_result( + EXT_RATECTRL *ext_ratectrl, int64_t bit_count, + const YV12_BUFFER_CONFIG *source_frame, + const YV12_BUFFER_CONFIG *coded_frame, uint32_t bit_depth, + uint32_t input_bit_depth); #endif // VPX_VP9_ENCODER_VP9_EXT_RATECTRL_H_ From f57fa3f1df45ea80049ff831a054ac66a12aebdc Mon Sep 17 00:00:00 2001 From: Angie Chiang Date: Tue, 19 Jan 2021 18:45:38 -0800 Subject: [PATCH 0108/1000] Handle vp9_extrc functions' return status properly Bug: webm:1716 Change-Id: I204cd3ab35b493759808500b799da3b9e55686d4 --- vp9/encoder/vp9_encoder.c | 29 ++++++++++++++++++++++++----- vp9/vp9_cx_iface.c | 6 +++++- 2 files changed, 29 insertions(+), 6 deletions(-) diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 6968e57919..eea2f18400 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -2464,7 +2464,12 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf, cpi->allow_encode_breakout = ENCODE_BREAKOUT_ENABLED; - vp9_extrc_init(&cpi->ext_ratectrl); + { + vpx_codec_err_t codec_status = vp9_extrc_init(&cpi->ext_ratectrl); + if (codec_status != VPX_CODEC_OK) { + vpx_internal_error(&cm->error, codec_status, "vp9_extrc_init() failed"); + } + } #if !CONFIG_REALTIME_ONLY if (oxcf->pass == 1) { @@ -4503,6 +4508,7 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest } #endif if (cpi->ext_ratectrl.ready) { + vpx_codec_err_t codec_status; const GF_GROUP *gf_group = &cpi->twopass.gf_group; vpx_rc_encodeframe_decision_t encode_frame_decision; FRAME_UPDATE_TYPE update_type = gf_group->update_type[gf_group->index]; @@ -4511,10 +4517,14 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest const RefCntBuffer *curr_frame_buf = get_ref_cnt_buffer(cm, cm->new_fb_idx); get_ref_frame_bufs(cpi, ref_frame_bufs); - vp9_extrc_get_encodeframe_decision( + codec_status = vp9_extrc_get_encodeframe_decision( &cpi->ext_ratectrl, curr_frame_buf->frame_index, cm->current_frame_coding_index, gf_group->index, update_type, ref_frame_bufs, ref_frame_flags, &encode_frame_decision); + if (codec_status != VPX_CODEC_OK) { + vpx_internal_error(&cm->error, codec_status, + "vp9_extrc_get_encodeframe_decision() failed"); + } q = encode_frame_decision.q_index; } @@ -5489,9 +5499,13 @@ static void encode_frame_to_data_rate( { const RefCntBuffer *coded_frame_buf = get_ref_cnt_buffer(cm, cm->new_fb_idx); - vp9_extrc_update_encodeframe_result( + vpx_codec_err_t codec_status = vp9_extrc_update_encodeframe_result( &cpi->ext_ratectrl, (*size) << 3, cpi->Source, &coded_frame_buf->buf, cm->bit_depth, cpi->oxcf.input_bit_depth); + if (codec_status != VPX_CODEC_OK) { + vpx_internal_error(&cm->error, codec_status, + "vp9_extrc_update_encodeframe_result() failed"); + } } #if CONFIG_REALTIME_ONLY (void)encode_frame_result; @@ -5682,8 +5696,13 @@ static void Pass2Encode(VP9_COMP *cpi, size_t *size, uint8_t *dest, cpi->allow_encode_breakout = ENCODE_BREAKOUT_ENABLED; if (cpi->common.current_frame_coding_index == 0) { - vp9_extrc_send_firstpass_stats(&cpi->ext_ratectrl, - &cpi->twopass.first_pass_info); + VP9_COMMON *cm = &cpi->common; + const vpx_codec_err_t codec_status = vp9_extrc_send_firstpass_stats( + &cpi->ext_ratectrl, &cpi->twopass.first_pass_info); + if (codec_status != VPX_CODEC_OK) { + vpx_internal_error(&cm->error, codec_status, + "vp9_extrc_send_firstpass_stats() failed"); + } } #if CONFIG_MISMATCH_DEBUG mismatch_move_frame_idx_w(); diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index a73683dfe0..ecfacfaf43 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -1744,6 +1744,7 @@ static vpx_codec_err_t ctrl_set_external_rate_control(vpx_codec_alg_priv_t *ctx, if (oxcf->pass == 2) { const FRAME_INFO *frame_info = &cpi->frame_info; vpx_rc_config_t ratectrl_config; + vpx_codec_err_t codec_status; ratectrl_config.frame_width = frame_info->frame_width; ratectrl_config.frame_height = frame_info->frame_height; @@ -1755,7 +1756,10 @@ static vpx_codec_err_t ctrl_set_external_rate_control(vpx_codec_alg_priv_t *ctx, ratectrl_config.frame_rate_num = oxcf->g_timebase.den; ratectrl_config.frame_rate_den = oxcf->g_timebase.num; - vp9_extrc_create(funcs, ratectrl_config, ext_ratectrl); + codec_status = vp9_extrc_create(funcs, ratectrl_config, ext_ratectrl); + if (codec_status != VPX_CODEC_OK) { + return codec_status; + } } return VPX_CODEC_OK; } From b0050f27e27c59312a12c037b263a1bb71df4f3c Mon Sep 17 00:00:00 2001 From: Angie Chiang Date: Tue, 19 Jan 2021 18:56:48 -0800 Subject: [PATCH 0109/1000] Use VPX_CODEC_INVALID_PARAM when ext_ratectrl=NULL Bug: webm:1716 Change-Id: Ic60c367aabfc03d94816e85476895b988aced5f1 --- vp9/encoder/vp9_ext_ratectrl.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/vp9/encoder/vp9_ext_ratectrl.c b/vp9/encoder/vp9_ext_ratectrl.c index 70d6dd9c22..a27eb653ba 100644 --- a/vp9/encoder/vp9_ext_ratectrl.c +++ b/vp9/encoder/vp9_ext_ratectrl.c @@ -15,7 +15,7 @@ vpx_codec_err_t vp9_extrc_init(EXT_RATECTRL *ext_ratectrl) { if (ext_ratectrl == NULL) { - return VPX_CODEC_ERROR; + return VPX_CODEC_INVALID_PARAM; } vp9_zero(*ext_ratectrl); return VPX_CODEC_OK; @@ -27,7 +27,7 @@ vpx_codec_err_t vp9_extrc_create(vpx_rc_funcs_t funcs, vpx_rc_status_t rc_status; vpx_rc_firstpass_stats_t *rc_firstpass_stats; if (ext_ratectrl == NULL) { - return VPX_CODEC_ERROR; + return VPX_CODEC_INVALID_PARAM; } vp9_extrc_delete(ext_ratectrl); ext_ratectrl->funcs = funcs; @@ -52,7 +52,7 @@ vpx_codec_err_t vp9_extrc_create(vpx_rc_funcs_t funcs, vpx_codec_err_t vp9_extrc_delete(EXT_RATECTRL *ext_ratectrl) { if (ext_ratectrl == NULL) { - return VPX_CODEC_ERROR; + return VPX_CODEC_INVALID_PARAM; } if (ext_ratectrl->ready) { vpx_rc_status_t rc_status = @@ -97,7 +97,7 @@ static void gen_rc_firstpass_stats(const FIRSTPASS_STATS *stats, vpx_codec_err_t vp9_extrc_send_firstpass_stats( EXT_RATECTRL *ext_ratectrl, const FIRST_PASS_INFO *first_pass_info) { if (ext_ratectrl == NULL) { - return VPX_CODEC_ERROR; + return VPX_CODEC_INVALID_PARAM; } if (ext_ratectrl->ready) { vpx_rc_status_t rc_status; @@ -141,7 +141,7 @@ vpx_codec_err_t vp9_extrc_get_encodeframe_decision( RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES], int ref_frame_flags, vpx_rc_encodeframe_decision_t *encode_frame_decision) { if (ext_ratectrl == NULL) { - return VPX_CODEC_ERROR; + return VPX_CODEC_INVALID_PARAM; } if (ext_ratectrl->ready) { vpx_rc_status_t rc_status; @@ -170,7 +170,7 @@ vpx_codec_err_t vp9_extrc_update_encodeframe_result( const YV12_BUFFER_CONFIG *coded_frame, uint32_t bit_depth, uint32_t input_bit_depth) { if (ext_ratectrl == NULL) { - return VPX_CODEC_ERROR; + return VPX_CODEC_INVALID_PARAM; } if (ext_ratectrl->ready) { PSNR_STATS psnr; From 7b93b56ab9b27f3c2a72e05b7ea3b5e85a06f5fa Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Thu, 21 Jan 2021 13:07:20 -0800 Subject: [PATCH 0110/1000] Do not reuse mv in base spatial layer if curr buf same as prev. Bug: b/154890543 Change-Id: Iad5791912f781d225e610a61bc13f3dbaef81bb9 --- vp9/encoder/vp9_encoder.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index f4587d42d9..4823d5f0f1 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -7861,9 +7861,12 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, cm->new_fb_idx = get_free_fb(cm); if (cm->new_fb_idx == INVALID_IDX) return -1; - cm->cur_frame = &pool->frame_bufs[cm->new_fb_idx]; - + // If the frame buffer for current frame is the same as previous frame, MV in + // the base layer shouldn't be used as it'll cause data race. + if (cm->cur_frame == cm->prev_frame) { + cpi->svc.use_base_mv = 0; + } // Start with a 0 size frame. *size = 0; From bd8dfea54d10adb2c0b19ccdaa6891757b1e5ae0 Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 26 Jan 2021 18:03:17 -0800 Subject: [PATCH 0111/1000] sad_test: fix compilation w/gcc 4.8.5 use a #define for kDataAlignment as it's used with DECLARE_ALIGNED (__attribute__((aligned(n)))) and this version under CentOS is more strict over integer constants: ../vpx_ports/mem.h:18:72: error: requested alignment is not an integer constant #define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n))) Bug: webm:1690 Change-Id: I8d4661ec1c2c1b1522bdc210689715d2302c7e72 --- test/sad_test.cc | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/test/sad_test.cc b/test/sad_test.cc index 34cb26ed11..ee10a46389 100644 --- a/test/sad_test.cc +++ b/test/sad_test.cc @@ -26,6 +26,10 @@ #include "vpx_ports/msvc.h" #include "vpx_ports/vpx_timer.h" +// const[expr] should be sufficient for DECLARE_ALIGNED but early +// implementations of c++11 appear to have some issues with it. +#define kDataAlignment 32 + template struct TestParams { TestParams(int w, int h, Function f, int bd = -1) @@ -117,9 +121,6 @@ class SADTestBase : public ::testing::TestWithParam { protected: // Handle blocks up to 4 blocks 64x64 with stride up to 128 // crbug.com/webm/1660 - // const[expr] should be sufficient for DECLARE_ALIGNED but early - // implementations of c++11 appear to have some issues with it. - enum { kDataAlignment = 32 }; static const int kDataBlockSize = 64 * 128; static const int kDataBufferSize = 4 * kDataBlockSize; From 987dd3a9be85a4221733fd4a320eabce7463e56a Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 26 Jan 2021 18:05:47 -0800 Subject: [PATCH 0112/1000] vp9_end_to_end_test: fix compile with gcc 4.8.5 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit use Values() rather than ValuesIn() with an initializer list as this version of gcc under CentOS fails to deduce the type: ../third_party/googletest/src/include/gtest/gtest-param-test.h:304:29: note: template argument deduction/substitution failed: ../test/vp9_end_to_end_test.cc:346:59: note: couldn't deduce template parameter ‘T’ ::testing::ValuesIn({ 6, 7, 8 })); Bug: webm:1690 Change-Id: I43d9d4777fcd74a4f8fa8bdcd9834cdca5e546ff --- test/vp9_end_to_end_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/vp9_end_to_end_test.cc b/test/vp9_end_to_end_test.cc index 4e3a78fac4..7cc126ea58 100644 --- a/test/vp9_end_to_end_test.cc +++ b/test/vp9_end_to_end_test.cc @@ -342,7 +342,7 @@ VP9_INSTANTIATE_TEST_SUITE(EndToEndTestLarge, VP9_INSTANTIATE_TEST_SUITE(EndToEndNV12, ::testing::Values(::libvpx_test::kRealTime), ::testing::ValuesIn(kTestVectorsNv12), - ::testing::ValuesIn({ 6, 7, 8 })); + ::testing::Values(6, 7, 8)); VP9_INSTANTIATE_TEST_SUITE(EndToEndTestAdaptiveRDThresh, ::testing::Values(5, 6, 7), ::testing::Values(8, 9)); From f46b66ac83279e5403091d307cd3be7d97059949 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Wed, 27 Jan 2021 09:41:18 -0800 Subject: [PATCH 0113/1000] svc: turn off use_base_mv on non base layer. Change-Id: I4a9402f468e54c58081c882ed37f59ee0269c0fc --- vp9/encoder/vp9_encoder.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 4823d5f0f1..4750f5b7bc 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -7864,7 +7864,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, cm->cur_frame = &pool->frame_bufs[cm->new_fb_idx]; // If the frame buffer for current frame is the same as previous frame, MV in // the base layer shouldn't be used as it'll cause data race. - if (cm->cur_frame == cm->prev_frame) { + if (cpi->svc.spatial_layer_id > 0 && cm->cur_frame == cm->prev_frame) { cpi->svc.use_base_mv = 0; } // Start with a 0 size frame. From ebb5ffc1d462c70dfb2283a5c7afcb75288c7692 Mon Sep 17 00:00:00 2001 From: Elliott Karpilovsky Date: Thu, 28 Jan 2021 11:22:58 -0800 Subject: [PATCH 0114/1000] Relax constraints on Y4M header parsing Previous parser assumed that the header would not exceed 80 characters. However, with latest FFMPEG changes, the header of Y4M files can exceed this limit. New parser can parse up to ~200 characters. Arbitrary parsing in future commit. BUG=aomedia:2876 Change-Id: I2ab8a7930cb5b76004e6731321d0ea20ddf333c1 --- test/y4m_test.cc | 26 +++++++++++++++++ y4minput.c | 76 ++++++++++++++++++++++++++++-------------------- 2 files changed, 71 insertions(+), 31 deletions(-) diff --git a/test/y4m_test.cc b/test/y4m_test.cc index 5df389f520..8272263f66 100644 --- a/test/y4m_test.cc +++ b/test/y4m_test.cc @@ -213,4 +213,30 @@ TEST(Y4MHeaderTest, RegularHeader) { y4m_input_close(&y4m); } +// Testing that headers over 100 characters can be parsed. +static const char kY4MLongHeader[] = + "YUV4MPEG2 W4 H4 F30:1 Ip A0:0 C420jpeg XYSCSS=420JPEG " + "XCOLORRANGE=LIMITED XSOME_UNKNOWN_METADATA XOTHER_UNKNOWN_METADATA\n" + "FRAME\n" + "012345678912345601230123"; + +TEST(Y4MHeaderTest, LongHeader) { + libvpx_test::TempOutFile f; + fwrite(kY4MLongHeader, 1, sizeof(kY4MLongHeader), f.file()); + fflush(f.file()); + EXPECT_EQ(fseek(f.file(), 0, 0), 0); + + y4m_input y4m; + EXPECT_EQ(y4m_input_open(&y4m, f.file(), /*skip_buffer=*/NULL, + /*num_skip=*/0, /*only_420=*/0), + 0); + EXPECT_EQ(y4m.pic_w, 4); + EXPECT_EQ(y4m.pic_h, 4); + EXPECT_EQ(y4m.fps_n, 30); + EXPECT_EQ(y4m.fps_d, 1); + EXPECT_EQ(y4m.interlace, 'p'); + EXPECT_EQ(strcmp("420jpeg", y4m.chroma_type), 0); + y4m_input_close(&y4m); +} + } // namespace diff --git a/y4minput.c b/y4minput.c index 68000768c9..1983021a1e 100644 --- a/y4minput.c +++ b/y4minput.c @@ -52,15 +52,8 @@ static int file_read(void *buf, size_t size, FILE *file) { } static int y4m_parse_tags(y4m_input *_y4m, char *_tags) { - int got_w; - int got_h; - int got_fps; - int got_interlace; - int got_par; - int got_chroma; char *p; char *q; - got_w = got_h = got_fps = got_interlace = got_par = got_chroma = 0; for (p = _tags;; p = q) { /*Skip any leading spaces.*/ while (*p == ' ') p++; @@ -73,52 +66,74 @@ static int y4m_parse_tags(y4m_input *_y4m, char *_tags) { switch (p[0]) { case 'W': { if (sscanf(p + 1, "%d", &_y4m->pic_w) != 1) return -1; - got_w = 1; break; } case 'H': { if (sscanf(p + 1, "%d", &_y4m->pic_h) != 1) return -1; - got_h = 1; break; } case 'F': { if (sscanf(p + 1, "%d:%d", &_y4m->fps_n, &_y4m->fps_d) != 2) { return -1; } - got_fps = 1; break; } case 'I': { _y4m->interlace = p[1]; - got_interlace = 1; break; } case 'A': { if (sscanf(p + 1, "%d:%d", &_y4m->par_n, &_y4m->par_d) != 2) { return -1; } - got_par = 1; break; } case 'C': { if (q - p > 16) return -1; memcpy(_y4m->chroma_type, p + 1, q - p - 1); _y4m->chroma_type[q - p - 1] = '\0'; - got_chroma = 1; break; } /*Ignore unknown tags.*/ } } - if (!got_w || !got_h || !got_fps) return -1; - if (!got_interlace) _y4m->interlace = '?'; - if (!got_par) _y4m->par_n = _y4m->par_d = 0; - /*Chroma-type is not specified in older files, e.g., those generated by - mplayer.*/ - if (!got_chroma) strcpy(_y4m->chroma_type, "420"); return 0; } +/* Returns 1 if tags were parsed successfully, 0 otherwise. */ +static int parse_tags(y4m_input *y4m_ctx, char *buffer) { + /* Set Y4M tags to defaults, updating them as processing occurs. Mandatory + fields are marked with -1 and will be checked after the tags are parsed. */ + int ret; + y4m_ctx->pic_w = -1; + y4m_ctx->pic_h = -1; + y4m_ctx->fps_n = -1; /* Also serves as marker for fps_d */ + y4m_ctx->par_n = 0; + y4m_ctx->par_d = 0; + y4m_ctx->interlace = '?'; + snprintf(y4m_ctx->chroma_type, sizeof(y4m_ctx->chroma_type), "420"); + + ret = y4m_parse_tags(y4m_ctx, buffer); + if (ret < 0) { + return 0; + } + + /* Check the mandatory fields. */ + if (y4m_ctx->pic_w == -1) { + fprintf(stderr, "Width field missing\n"); + return 0; + } + if (y4m_ctx->pic_h == -1) { + fprintf(stderr, "Height field missing\n"); + return 0; + } + if (y4m_ctx->fps_n == -1) { + fprintf(stderr, "FPS field missing\n"); + return 0; + } + return 1; +} + /*All anti-aliasing filters in the following conversion functions are based on one of two window functions: The 6-tap Lanczos window (for down-sampling and shifts): @@ -786,13 +801,14 @@ static void y4m_convert_null(y4m_input *_y4m, unsigned char *_dst, } static const char TAG[] = "YUV4MPEG2"; +/* Temporary until arbitrary header parsing submitted. */ +#define Y4M_HEADER_BUF_SIZE 200 int y4m_input_open(y4m_input *y4m_ctx, FILE *file, char *skip_buffer, int num_skip, int only_420) { // File must start with |TAG|. - char tag_buffer[9]; // 9 == strlen(TAG) - char buffer[80] = { 0 }; - int ret; + char tag_buffer[9]; // 9 == strlen(TAG) + char buffer[Y4M_HEADER_BUF_SIZE] = { 0 }; // Rest of header. int i; // Read as much as possible from |skip_buffer|, which were characters // that were previously read from the file to do input-type detection. @@ -813,21 +829,19 @@ int y4m_input_open(y4m_input *y4m_ctx, FILE *file, char *skip_buffer, fprintf(stderr, "Error parsing header: space must follow %s\n", TAG); return -1; } - /*Read until newline, or 80 cols, whichever happens first.*/ - for (i = 0; i < 79; i++) { + /*Read until newline, or Y4M_HEADER_BUF_SIZE cols, whichever happens first.*/ + for (i = 0; i < Y4M_HEADER_BUF_SIZE - 1; i++) { if (!file_read(buffer + i, 1, file)) return -1; if (buffer[i] == '\n') break; } - /*We skipped too much header data.*/ - if (i == 79) { - fprintf(stderr, "Error parsing header; not a YUV4MPEG2 file?\n"); + if (i == Y4M_HEADER_BUF_SIZE - 1) { + fprintf(stderr, "Error parsing header; not a %s file?\n", TAG); return -1; } buffer[i] = '\0'; - ret = y4m_parse_tags(y4m_ctx, buffer); - if (ret < 0) { - fprintf(stderr, "Error parsing YUV4MPEG2 header.\n"); - return ret; + if (!parse_tags(y4m_ctx, buffer)) { + fprintf(stderr, "Error parsing %s header.\n", TAG); + return -1; } if (y4m_ctx->interlace == '?') { fprintf(stderr, From 61edec1efbea1c02d71857e2aff9426d9cd2df4e Mon Sep 17 00:00:00 2001 From: Elliott Karpilovsky Date: Fri, 29 Jan 2021 09:37:31 -0800 Subject: [PATCH 0115/1000] Relax constraints on Y4M header parsing Previous parser assumed that the header would not exceed 80 characters. However, with latest FFMPEG changes, the header of Y4M files can exceed this limit. New parser can parse an arbitrarily long header, as long each tag is 255 or less characters. BUG=aomedia:2876 Change-Id: I9e6e42c50f4e49251dd697eef8036485ad5a1228 --- y4minput.c | 78 +++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 54 insertions(+), 24 deletions(-) diff --git a/y4minput.c b/y4minput.c index 1983021a1e..f923eda34a 100644 --- a/y4minput.c +++ b/y4minput.c @@ -100,11 +100,50 @@ static int y4m_parse_tags(y4m_input *_y4m, char *_tags) { return 0; } +// Copy a single tag into the buffer, along with a null character. +// Returns 0 if any file IO errors occur. +static int copy_tag(char *buf, size_t buf_len, char *end_tag, FILE *file) { + size_t i; + assert(buf_len >= 1); + // Skip leading space characters. + do { + if (!file_read(buf, 1, file)) { + return 0; + } + } while (buf[0] == ' '); + + // If we hit the newline, treat this as the "empty" tag. + if (buf[0] == '\n') { + buf[0] = '\0'; + *end_tag = '\n'; + return 1; + } + + // Copy over characters until a space is hit, or the buffer is exhausted. + for (i = 1; i < buf_len; ++i) { + if (!file_read(buf + i, 1, file)) { + return 0; + } + if (buf[i] == ' ' || buf[i] == '\n') { + break; + } + } + if (i == buf_len) { + fprintf(stderr, "Error: Y4M header tags must be less than %lu characters\n", + (unsigned long)i); + return 0; + } + *end_tag = buf[i]; + buf[i] = '\0'; + return 1; +} + /* Returns 1 if tags were parsed successfully, 0 otherwise. */ -static int parse_tags(y4m_input *y4m_ctx, char *buffer) { +static int parse_tags(y4m_input *y4m_ctx, FILE *file) { + char tag[256]; + char end; /* Character denoting the end of the tag, ' ' or '\n'. */ /* Set Y4M tags to defaults, updating them as processing occurs. Mandatory fields are marked with -1 and will be checked after the tags are parsed. */ - int ret; y4m_ctx->pic_w = -1; y4m_ctx->pic_h = -1; y4m_ctx->fps_n = -1; /* Also serves as marker for fps_d */ @@ -113,10 +152,16 @@ static int parse_tags(y4m_input *y4m_ctx, char *buffer) { y4m_ctx->interlace = '?'; snprintf(y4m_ctx->chroma_type, sizeof(y4m_ctx->chroma_type), "420"); - ret = y4m_parse_tags(y4m_ctx, buffer); - if (ret < 0) { - return 0; - } + /* Find one tag at a time. */ + do { + if (!copy_tag(tag, sizeof(tag), &end, file)) { + return 0; + } + /* y4m_parse_tags returns 0 on success. */ + if (y4m_parse_tags(y4m_ctx, tag)) { + return 0; + } + } while (end != '\n'); /* Check the mandatory fields. */ if (y4m_ctx->pic_w == -1) { @@ -801,15 +846,11 @@ static void y4m_convert_null(y4m_input *_y4m, unsigned char *_dst, } static const char TAG[] = "YUV4MPEG2"; -/* Temporary until arbitrary header parsing submitted. */ -#define Y4M_HEADER_BUF_SIZE 200 int y4m_input_open(y4m_input *y4m_ctx, FILE *file, char *skip_buffer, int num_skip, int only_420) { // File must start with |TAG|. - char tag_buffer[9]; // 9 == strlen(TAG) - char buffer[Y4M_HEADER_BUF_SIZE] = { 0 }; // Rest of header. - int i; + char tag_buffer[9]; // 9 == strlen(TAG) // Read as much as possible from |skip_buffer|, which were characters // that were previously read from the file to do input-type detection. assert(num_skip >= 0 && num_skip <= 8); @@ -825,23 +866,12 @@ int y4m_input_open(y4m_input *y4m_ctx, FILE *file, char *skip_buffer, return -1; } // Next character must be a space. - if (!file_read(buffer, 1, file) || buffer[0] != ' ') { + if (!file_read(tag_buffer, 1, file) || tag_buffer[0] != ' ') { fprintf(stderr, "Error parsing header: space must follow %s\n", TAG); return -1; } - /*Read until newline, or Y4M_HEADER_BUF_SIZE cols, whichever happens first.*/ - for (i = 0; i < Y4M_HEADER_BUF_SIZE - 1; i++) { - if (!file_read(buffer + i, 1, file)) return -1; - if (buffer[i] == '\n') break; - } - if (i == Y4M_HEADER_BUF_SIZE - 1) { - fprintf(stderr, "Error parsing header; not a %s file?\n", TAG); - return -1; - } - buffer[i] = '\0'; - if (!parse_tags(y4m_ctx, buffer)) { + if (!parse_tags(y4m_ctx, file)) { fprintf(stderr, "Error parsing %s header.\n", TAG); - return -1; } if (y4m_ctx->interlace == '?') { fprintf(stderr, From 6c5377fd355f7ae76ddea43f0a5732aa4337d31b Mon Sep 17 00:00:00 2001 From: Marco Paniconi Date: Wed, 3 Feb 2021 10:04:52 -0800 Subject: [PATCH 0116/1000] Fix to vpx_temporal_svc_encoder Avoid division by zero. Change-Id: Icf3f40aa32fe30f42c46417a1437ebe235e3ac96 --- examples/vpx_temporal_svc_encoder.c | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/vpx_temporal_svc_encoder.c b/examples/vpx_temporal_svc_encoder.c index ffeae2abc4..04212e5d7d 100644 --- a/examples/vpx_temporal_svc_encoder.c +++ b/examples/vpx_temporal_svc_encoder.c @@ -930,6 +930,7 @@ int main(int argc, char **argv) { // Update for short-time encoding bitrate states, for moving window // of size rc->window, shifted by rc->window / 2. // Ignore first window segment, due to key frame. + if (rc.window_size == 0) rc.window_size = 15; if (frame_cnt > rc.window_size) { sum_bitrate += 0.001 * 8.0 * pkt->data.frame.sz * framerate; if (frame_cnt % rc.window_size == 0) { From 557368a8fa9f839afd7a6ded6a95f18829ff3365 Mon Sep 17 00:00:00 2001 From: Cheng Chen Date: Mon, 1 Feb 2021 16:57:09 -0800 Subject: [PATCH 0117/1000] L2E: let external rate control pass in a max frame size And allow the frame to recode when the frame size is larger than the input max frame size. If the max frame size is not specified, let vp9 decide whether to recode. The recode follows the vp9's current recoding mechanism. The rate control api will return the new qindex back to the external model. Change-Id: I796fbf713ad50a5b413b0e2501583b565ed2343f --- test/vp9_ext_ratectrl_test.cc | 6 ++++++ vp9/encoder/vp9_encoder.c | 35 +++++++++++++++++++++++++++++++--- vp9/encoder/vp9_ext_ratectrl.c | 3 ++- vp9/encoder/vp9_ext_ratectrl.h | 2 +- vpx/vpx_ext_ratectrl.h | 9 ++++++++- 5 files changed, 49 insertions(+), 6 deletions(-) diff --git a/test/vp9_ext_ratectrl_test.cc b/test/vp9_ext_ratectrl_test.cc index 4b3693a347..b6b5b2eaec 100644 --- a/test/vp9_ext_ratectrl_test.cc +++ b/test/vp9_ext_ratectrl_test.cc @@ -128,6 +128,7 @@ vpx_rc_status_t rc_get_encodeframe_decision( } else { frame_decision->q_index = 100; } + frame_decision->max_frame_size = 0; return VPX_RC_OK; } @@ -143,6 +144,11 @@ vpx_rc_status_t rc_update_encodeframe_result( if (toy_rate_ctrl->coding_index == kLosslessCodingIndex) { EXPECT_EQ(encode_frame_result->sse, 0); } + if (toy_rate_ctrl->coding_index == kLosslessCodingIndex) { + EXPECT_EQ(encode_frame_result->actual_encoding_qindex, 0); + } else { + EXPECT_EQ(encode_frame_result->actual_encoding_qindex, 100); + } return VPX_RC_OK; } diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 6a21a1c183..2757bc4285 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -4402,6 +4402,17 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest int qrange_adj = 1; #endif + // A flag which indicates whether we are recoding the current frame + // when the current frame size is larger than the max frame size in the + // external rate control model. + // This flag doesn't have any impact when external rate control is not used. + int ext_rc_recode = 0; + // Maximal frame size allowed by the external rate control. + // case: 0, we ignore the max frame size limit, and encode with the qindex + // passed in by the external rate control model. + // case: -1, we take VP9's decision for the max frame size. + int ext_rc_max_frame_size = 0; + #if CONFIG_RATE_CTRL const FRAME_UPDATE_TYPE update_type = cpi->twopass.gf_group.update_type[cpi->twopass.gf_group.index]; @@ -4507,7 +4518,7 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest q = cpi->encode_command.external_quantize_index; } #endif - if (cpi->ext_ratectrl.ready) { + if (cpi->ext_ratectrl.ready && !ext_rc_recode) { vpx_codec_err_t codec_status; const GF_GROUP *gf_group = &cpi->twopass.gf_group; vpx_rc_encodeframe_decision_t encode_frame_decision; @@ -4526,6 +4537,7 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest "vp9_extrc_get_encodeframe_decision() failed"); } q = encode_frame_decision.q_index; + ext_rc_max_frame_size = encode_frame_decision.max_frame_size; } vp9_set_quantizer(cpi, q); @@ -4567,7 +4579,24 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest } if (cpi->ext_ratectrl.ready) { - break; + // In general, for the external rate control, we take the qindex provided + // as input and encode the frame with this qindex faithfully. However, + // in some extreme scenarios, the provided qindex leads to a massive + // overshoot of frame size. In this case, we fall back to VP9's decision + // to pick a new qindex and recode the frame. We return the new qindex + // through the API to the external model. + if (ext_rc_max_frame_size == 0) { + break; + } else if (ext_rc_max_frame_size == -1) { + if (rc->projected_frame_size < rc->max_frame_bandwidth) { + break; + } + } else { + if (rc->projected_frame_size < ext_rc_max_frame_size) { + break; + } + } + ext_rc_recode = 1; } #if CONFIG_RATE_CTRL // This part needs to be after save_coding_context() because @@ -5501,7 +5530,7 @@ static void encode_frame_to_data_rate( get_ref_cnt_buffer(cm, cm->new_fb_idx); vpx_codec_err_t codec_status = vp9_extrc_update_encodeframe_result( &cpi->ext_ratectrl, (*size) << 3, cpi->Source, &coded_frame_buf->buf, - cm->bit_depth, cpi->oxcf.input_bit_depth); + cm->bit_depth, cpi->oxcf.input_bit_depth, cm->base_qindex); if (codec_status != VPX_CODEC_OK) { vpx_internal_error(&cm->error, codec_status, "vp9_extrc_update_encodeframe_result() failed"); diff --git a/vp9/encoder/vp9_ext_ratectrl.c b/vp9/encoder/vp9_ext_ratectrl.c index a27eb653ba..9f0098ab5a 100644 --- a/vp9/encoder/vp9_ext_ratectrl.c +++ b/vp9/encoder/vp9_ext_ratectrl.c @@ -168,7 +168,7 @@ vpx_codec_err_t vp9_extrc_update_encodeframe_result( EXT_RATECTRL *ext_ratectrl, int64_t bit_count, const YV12_BUFFER_CONFIG *source_frame, const YV12_BUFFER_CONFIG *coded_frame, uint32_t bit_depth, - uint32_t input_bit_depth) { + uint32_t input_bit_depth, const int actual_encoding_qindex) { if (ext_ratectrl == NULL) { return VPX_CODEC_INVALID_PARAM; } @@ -180,6 +180,7 @@ vpx_codec_err_t vp9_extrc_update_encodeframe_result( encode_frame_result.pixel_count = source_frame->y_crop_width * source_frame->y_crop_height + 2 * source_frame->uv_crop_width * source_frame->uv_crop_height; + encode_frame_result.actual_encoding_qindex = actual_encoding_qindex; #if CONFIG_VP9_HIGHBITDEPTH vpx_calc_highbd_psnr(source_frame, coded_frame, &psnr, bit_depth, input_bit_depth); diff --git a/vp9/encoder/vp9_ext_ratectrl.h b/vp9/encoder/vp9_ext_ratectrl.h index 11e9102a65..2142363085 100644 --- a/vp9/encoder/vp9_ext_ratectrl.h +++ b/vp9/encoder/vp9_ext_ratectrl.h @@ -43,6 +43,6 @@ vpx_codec_err_t vp9_extrc_update_encodeframe_result( EXT_RATECTRL *ext_ratectrl, int64_t bit_count, const YV12_BUFFER_CONFIG *source_frame, const YV12_BUFFER_CONFIG *coded_frame, uint32_t bit_depth, - uint32_t input_bit_depth); + uint32_t input_bit_depth, int actual_encoding_qindex); #endif // VPX_VP9_ENCODER_VP9_EXT_RATECTRL_H_ diff --git a/vpx/vpx_ext_ratectrl.h b/vpx/vpx_ext_ratectrl.h index dc4d856a8b..a193e55953 100644 --- a/vpx/vpx_ext_ratectrl.h +++ b/vpx/vpx_ext_ratectrl.h @@ -38,9 +38,15 @@ typedef void *vpx_rc_model_t; * * The encoder will receive the decision from the external rate control model * through get_encodeframe_decision() defined in vpx_rc_funcs_t. + * + * If max_frame_size = 0, the encoding ignores max frame size limit. + * If max_frame_size = -1, the encoding uses VP9's max frame size as the limit. + * If the encoded frame size is larger than max_frame_size, the frame is + * recoded to meet the size limit, following VP9's recoding principles. */ typedef struct vpx_rc_encodeframe_decision { - int q_index; /**< Quantizer step index [0..255]*/ + int q_index; /**< Quantizer step index [0..255]*/ + int max_frame_size; /**< Maximal frame size allowed to encode a frame*/ } vpx_rc_encodeframe_decision_t; /*!\brief Information for the frame to be encoded. @@ -82,6 +88,7 @@ typedef struct vpx_rc_encodeframe_result { int64_t sse; /**< sum of squared error of the reconstructed frame */ int64_t bit_count; /**< number of bits spent on coding the frame*/ int64_t pixel_count; /**< number of pixels in YUV planes of the frame*/ + int actual_encoding_qindex; /**< the actual qindex used to encode the frame*/ } vpx_rc_encodeframe_result_t; /*!\brief Status returned by rate control callback functions. From b3506b33076b082ab5baf53deabac185cb255a38 Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 2 Feb 2021 18:14:44 -0800 Subject: [PATCH 0118/1000] vp8_denoiser_sse2_test: use ASSERT instead of EXPECT when test block contents to avoid producing unnecessary output on failure. Bug: webm:1718 Change-Id: Ie2cf8245ec8c03556549ad1eea65c8bef15a9735 --- test/vp8_denoiser_sse2_test.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/vp8_denoiser_sse2_test.cc b/test/vp8_denoiser_sse2_test.cc index 0197f143f3..ae547f007f 100644 --- a/test/vp8_denoiser_sse2_test.cc +++ b/test/vp8_denoiser_sse2_test.cc @@ -87,7 +87,7 @@ TEST_P(VP8DenoiserTest, BitexactCheck) { // Check bitexactness. for (int h = 0; h < 16; ++h) { for (int w = 0; w < 16; ++w) { - EXPECT_EQ(avg_block_c[h * stride + w], avg_block_sse2[h * stride + w]); + ASSERT_EQ(avg_block_c[h * stride + w], avg_block_sse2[h * stride + w]); } } @@ -103,7 +103,7 @@ TEST_P(VP8DenoiserTest, BitexactCheck) { // Check bitexactness. for (int h = 0; h < 16; ++h) { for (int w = 0; w < 16; ++w) { - EXPECT_EQ(avg_block_c[h * stride + w], avg_block_sse2[h * stride + w]); + ASSERT_EQ(avg_block_c[h * stride + w], avg_block_sse2[h * stride + w]); } } } From 158aa20c950bd905252041b7047c72d3aca71766 Mon Sep 17 00:00:00 2001 From: Marco Paniconi Date: Sun, 31 Jan 2021 22:11:33 -0800 Subject: [PATCH 0119/1000] svc: Unittest for ksvc flexible mode with no updates on TL > 0 Catches tsan issue fixed in: 7b93b56 Change-Id: I34b17c289afd0f8691987a1e4afa533f6c7f2806 --- test/svc_datarate_test.cc | 72 +++++++++++++++++++++++++++++++++++---- 1 file changed, 65 insertions(+), 7 deletions(-) diff --git a/test/svc_datarate_test.cc b/test/svc_datarate_test.cc index 0a7d0032c1..3af2255963 100644 --- a/test/svc_datarate_test.cc +++ b/test/svc_datarate_test.cc @@ -84,6 +84,7 @@ class DatarateOnePassCbrSvc : public OnePassCbrSvc { prev_frame_width[i] = 320; prev_frame_height[i] = 240; } + ksvc_flex_noupd_tlenh_ = false; } virtual void BeginPassHook(unsigned int /*pass*/) {} @@ -91,9 +92,10 @@ class DatarateOnePassCbrSvc : public OnePassCbrSvc { // bypass/flexible mode. The pattern corresponds to the pattern // VP9E_TEMPORAL_LAYERING_MODE_0101 (temporal_layering_mode == 2) used in // non-flexible mode, except that we disable inter-layer prediction. - void set_frame_flags_bypass_mode( - int tl, int num_spatial_layers, int is_key_frame, - vpx_svc_ref_frame_config_t *ref_frame_config) { + void set_frame_flags_bypass_mode(int tl, int num_spatial_layers, + int is_key_frame, + vpx_svc_ref_frame_config_t *ref_frame_config, + int noupdate_tlenh) { for (int sl = 0; sl < num_spatial_layers; ++sl) ref_frame_config->update_buffer_slot[sl] = 0; @@ -154,6 +156,9 @@ class DatarateOnePassCbrSvc : public OnePassCbrSvc { ref_frame_config->update_buffer_slot[sl] |= 1 << ref_frame_config->alt_fb_idx[sl]; } + // Force no update on all spatial layers for temporal enhancement layer + // frames. + if (noupdate_tlenh) ref_frame_config->update_buffer_slot[sl] = 0; } } } @@ -244,6 +249,22 @@ class DatarateOnePassCbrSvc : public OnePassCbrSvc { } } + if (ksvc_flex_noupd_tlenh_) { + vpx_svc_layer_id_t layer_id; + layer_id.spatial_layer_id = 0; + layer_id.temporal_layer_id = (video->frame() % 2 != 0); + temporal_layer_id_ = layer_id.temporal_layer_id; + for (int i = 0; i < number_spatial_layers_; i++) { + layer_id.temporal_layer_id_per_spatial[i] = temporal_layer_id_; + ref_frame_config.duration[i] = 1; + } + encoder->Control(VP9E_SET_SVC_LAYER_ID, &layer_id); + set_frame_flags_bypass_mode(layer_id.temporal_layer_id, + number_spatial_layers_, 0, &ref_frame_config, + 1); + encoder->Control(VP9E_SET_SVC_REF_FRAME_CONFIG, &ref_frame_config); + } + if (update_pattern_ && video->frame() >= 100) { vpx_svc_layer_id_t layer_id; if (video->frame() == 100) { @@ -258,7 +279,8 @@ class DatarateOnePassCbrSvc : public OnePassCbrSvc { layer_id.temporal_layer_id_per_spatial[i] = temporal_layer_id_; encoder->Control(VP9E_SET_SVC_LAYER_ID, &layer_id); set_frame_flags_bypass_mode(layer_id.temporal_layer_id, - number_spatial_layers_, 0, &ref_frame_config); + number_spatial_layers_, 0, &ref_frame_config, + 0); encoder->Control(VP9E_SET_SVC_REF_FRAME_CONFIG, &ref_frame_config); } @@ -557,9 +579,14 @@ class DatarateOnePassCbrSvc : public OnePassCbrSvc { } virtual void MismatchHook(const vpx_image_t *img1, const vpx_image_t *img2) { - double mismatch_psnr = compute_psnr(img1, img2); - mismatch_psnr_ += mismatch_psnr; - ++mismatch_nframes_; + // TODO(marpan): Look into why an assert is triggered in compute_psnr + // for mismatch frames for the special test case: ksvc_flex_noupd_tlenh. + // Has to do with dropped frames in bypass/flexible svc mode. + if (!ksvc_flex_noupd_tlenh_) { + double mismatch_psnr = compute_psnr(img1, img2); + mismatch_psnr_ += mismatch_psnr; + ++mismatch_nframes_; + } } unsigned int GetMismatchFrames() { return mismatch_nframes_; } @@ -604,6 +631,7 @@ class DatarateOnePassCbrSvc : public OnePassCbrSvc { int num_resize_down_; unsigned int prev_frame_width[VPX_MAX_LAYERS]; unsigned int prev_frame_height[VPX_MAX_LAYERS]; + bool ksvc_flex_noupd_tlenh_; private: virtual void SetConfig(const int num_temporal_layer) { @@ -1106,6 +1134,36 @@ TEST_P(DatarateOnePassCbrSvcFrameDropMultiBR, OnePassCbrSvc3SL3TL4Threads) { #endif } +// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and +// 2 temporal layers, for KSVC in flexible mode with no update of reference +// frames for all spatial layers on TL > 0 superframes. +// Run HD clip with 4 threads. +TEST_P(DatarateOnePassCbrSvcFrameDropMultiBR, OnePassCbrSvc3SL2TL4ThKSVCFlex) { + SetSvcConfig(3, 2); + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 4; + cfg_.rc_dropframe_thresh = 30; + cfg_.kf_max_dist = 9999; + ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); + top_sl_width_ = 1280; + top_sl_height_ = 720; + layer_framedrop_ = 0; + const int bitrates[3] = { 200, 400, 600 }; + cfg_.rc_target_bitrate = bitrates[GET_PARAM(3)]; + ResetModel(); + layer_framedrop_ = GET_PARAM(2); + AssignLayerBitrates(); + ksvc_flex_noupd_tlenh_ = true; + cfg_.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_BYPASS; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.58, + 1.2); +} + // Params: speed setting, inter-layer prediction mode. class DatarateOnePassCbrSvcInterLayerPredSingleBR : public DatarateOnePassCbrSvc, From 0d8354669ac525a27c78bc8c761e98e0f8c3905c Mon Sep 17 00:00:00 2001 From: Marco Paniconi Date: Wed, 3 Feb 2021 22:09:24 -0800 Subject: [PATCH 0120/1000] svc: Fix an existing unittest for flexible mode The flag update_pattern_ was being set to 0 (because it was set before reset) instead of 1. And the example flexible mode pattern was not setting non-reference frame on top temporal top spatial. Change-Id: I8aee56ce13cc4e0d614126592f9d0f691fe527b0 --- test/svc_datarate_test.cc | 12 +++++++----- vp9/encoder/vp9_svc_layercontext.c | 2 +- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/test/svc_datarate_test.cc b/test/svc_datarate_test.cc index 3af2255963..95d82ce54e 100644 --- a/test/svc_datarate_test.cc +++ b/test/svc_datarate_test.cc @@ -153,8 +153,8 @@ class DatarateOnePassCbrSvc : public OnePassCbrSvc { ref_frame_config->reference_last[sl] = 1; ref_frame_config->reference_golden[sl] = 0; ref_frame_config->reference_alt_ref[sl] = 0; - ref_frame_config->update_buffer_slot[sl] |= - 1 << ref_frame_config->alt_fb_idx[sl]; + // Non reference frame on top temporal top spatial. + ref_frame_config->update_buffer_slot[sl] = 0; } // Force no update on all spatial layers for temporal enhancement layer // frames. @@ -275,8 +275,10 @@ class DatarateOnePassCbrSvc : public OnePassCbrSvc { layer_id.spatial_layer_id = 0; layer_id.temporal_layer_id = (video->frame() % 2 != 0); temporal_layer_id_ = layer_id.temporal_layer_id; - for (int i = 0; i < number_spatial_layers_; i++) + for (int i = 0; i < number_spatial_layers_; i++) { layer_id.temporal_layer_id_per_spatial[i] = temporal_layer_id_; + ref_frame_config.duration[i] = 1; + } encoder->Control(VP9E_SET_SVC_LAYER_ID, &layer_id); set_frame_flags_bypass_mode(layer_id.temporal_layer_id, number_spatial_layers_, 0, &ref_frame_config, @@ -750,14 +752,14 @@ TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc3SL2TLDynamicPatternChange) { cfg_.g_threads = 1; cfg_.rc_dropframe_thresh = 30; cfg_.kf_max_dist = 9999; - // Change SVC pattern on the fly. - update_pattern_ = 1; ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, 0, 400); top_sl_width_ = 640; top_sl_height_ = 480; cfg_.rc_target_bitrate = 800; ResetModel(); + // Change SVC pattern on the fly. + update_pattern_ = 1; AssignLayerBitrates(); ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.78, diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c index b6c7c74e17..f9a0de62a0 100644 --- a/vp9/encoder/vp9_svc_layercontext.c +++ b/vp9/encoder/vp9_svc_layercontext.c @@ -956,7 +956,7 @@ int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) { if (cpi->common.frame_type != KEY_FRAME && !cpi->ext_refresh_last_frame && !cpi->ext_refresh_golden_frame && !cpi->ext_refresh_alt_ref_frame) svc->non_reference_frame = 1; - // For non-flexible mode, where update_buffer_slot is used, need to check if + // For flexible mode, where update_buffer_slot is used, need to check if // all buffer slots are not refreshed. if (svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS) { if (svc->update_buffer_slot[svc->spatial_layer_id] != 0) From 24bd0733efad6ee63eda3c49ecb730e316eb2483 Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 5 Feb 2021 10:21:39 -0800 Subject: [PATCH 0121/1000] vp8_denoiser_sse2_test: disable BitexactCheck w/gcc-8+ this test fails under gcc 8-10, but not with other compilers Bug: webm:1718 Change-Id: I8c6c7a25c4aaf019a7f91f835a1a2c9a731cfadc --- test/vp8_denoiser_sse2_test.cc | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/test/vp8_denoiser_sse2_test.cc b/test/vp8_denoiser_sse2_test.cc index ae547f007f..8cb84ddd8e 100644 --- a/test/vp8_denoiser_sse2_test.cc +++ b/test/vp8_denoiser_sse2_test.cc @@ -40,7 +40,12 @@ class VP8DenoiserTest : public ::testing::TestWithParam { int increase_denoising_; }; +// TODO(https://crbug.com/webm/1718): This test fails with gcc 8-10. +#if defined(__GNUC__) && __GNUC__ >= 8 +TEST_P(VP8DenoiserTest, DISABLED_BitexactCheck) { +#else TEST_P(VP8DenoiserTest, BitexactCheck) { +#endif ACMRandom rnd(ACMRandom::DeterministicSeed()); const int count_test_block = 4000; const int stride = 16; From 02392eecccde436a76aca6c86a6fdf643e98eb38 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Wed, 17 Feb 2021 17:34:27 -0800 Subject: [PATCH 0122/1000] Remove two pass related code from svc sample encoder. SVC sample encoder is only supposed to be used for realtime SVC. Bug: webm:1705 Change-Id: I5c0c3491732db3e148073aaf7f90ee8d662b57b5 --- examples/vp9_spatial_svc_encoder.c | 69 +++--------------------------- 1 file changed, 6 insertions(+), 63 deletions(-) diff --git a/examples/vp9_spatial_svc_encoder.c b/examples/vp9_spatial_svc_encoder.c index 6305572979..c37e608d17 100644 --- a/examples/vp9_spatial_svc_encoder.c +++ b/examples/vp9_spatial_svc_encoder.c @@ -66,12 +66,6 @@ static const arg_def_t kf_dist_arg = ARG_DEF("k", "kf-dist", 1, "number of frames between keyframes"); static const arg_def_t scale_factors_arg = ARG_DEF("r", "scale-factors", 1, "scale factors (lowest to highest layer)"); -static const arg_def_t passes_arg = - ARG_DEF("p", "passes", 1, "Number of passes (1/2)"); -static const arg_def_t pass_arg = - ARG_DEF(NULL, "pass", 1, "Pass to execute (1/2)"); -static const arg_def_t fpf_name_arg = - ARG_DEF(NULL, "fpf", 1, "First pass statistics file name"); static const arg_def_t min_q_arg = ARG_DEF(NULL, "min-q", 1, "Minimum quantizer"); static const arg_def_t max_q_arg = @@ -125,9 +119,6 @@ static const arg_def_t *svc_args[] = { &frames_arg, &spatial_layers_arg, &kf_dist_arg, &scale_factors_arg, - &passes_arg, - &pass_arg, - &fpf_name_arg, &min_q_arg, &max_q_arg, &min_bitrate_arg, @@ -173,8 +164,6 @@ typedef struct { uint32_t frames_to_skip; struct VpxInputContext input_ctx; stats_io_t rc_stats; - int passes; - int pass; int tune_content; int inter_layer_pred; } AppInput; @@ -197,9 +186,6 @@ static void parse_command_line(int argc, const char **argv_, char **argi = NULL; char **argj = NULL; vpx_codec_err_t res; - int passes = 0; - int pass = 0; - const char *fpf_file_name = NULL; unsigned int min_bitrate = 0; unsigned int max_bitrate = 0; char string_options[1024] = { 0 }; @@ -289,18 +275,6 @@ static void parse_command_line(int argc, const char **argv_, sizeof(string_options) - strlen(string_options) - 1); strncat(string_options, arg.val, sizeof(string_options) - strlen(string_options) - 1); - } else if (arg_match(&arg, &passes_arg, argi)) { - passes = arg_parse_uint(&arg); - if (passes < 1 || passes > 2) { - die("Error: Invalid number of passes (%d)\n", passes); - } - } else if (arg_match(&arg, &pass_arg, argi)) { - pass = arg_parse_uint(&arg); - if (pass < 1 || pass > 2) { - die("Error: Invalid pass selected (%d)\n", pass); - } - } else if (arg_match(&arg, &fpf_name_arg, argi)) { - fpf_file_name = arg.val; } else if (arg_match(&arg, &min_q_arg, argi)) { strncat(string_options, " min-quantizers=", sizeof(string_options) - strlen(string_options) - 1); @@ -355,35 +329,7 @@ static void parse_command_line(int argc, const char **argv_, if (strlen(string_options) > 0) vpx_svc_set_options(svc_ctx, string_options + 1); - if (passes == 0 || passes == 1) { - if (pass) { - fprintf(stderr, "pass is ignored since there's only one pass\n"); - } - enc_cfg->g_pass = VPX_RC_ONE_PASS; - } else { - if (pass == 0) { - die("pass must be specified when passes is 2\n"); - } - - if (fpf_file_name == NULL) { - die("fpf must be specified when passes is 2\n"); - } - - if (pass == 1) { - enc_cfg->g_pass = VPX_RC_FIRST_PASS; - if (!stats_open_file(&app_input->rc_stats, fpf_file_name, 0)) { - fatal("Failed to open statistics store"); - } - } else { - enc_cfg->g_pass = VPX_RC_LAST_PASS; - if (!stats_open_file(&app_input->rc_stats, fpf_file_name, 1)) { - fatal("Failed to open statistics store"); - } - enc_cfg->rc_twopass_stats_in = stats_get(&app_input->rc_stats); - } - app_input->passes = passes; - app_input->pass = pass; - } + enc_cfg->g_pass = VPX_RC_ONE_PASS; if (enc_cfg->rc_target_bitrate > 0) { if (min_bitrate > 0) { @@ -1004,13 +950,11 @@ int main(int argc, const char **argv) { info.time_base.numerator = enc_cfg.g_timebase.num; info.time_base.denominator = enc_cfg.g_timebase.den; - if (!(app_input.passes == 2 && app_input.pass == 1)) { - // We don't save the bitstream for the 1st pass on two pass rate control - writer = - vpx_video_writer_open(app_input.output_filename, kContainerIVF, &info); - if (!writer) - die("Failed to open %s for writing\n", app_input.output_filename); - } + writer = + vpx_video_writer_open(app_input.output_filename, kContainerIVF, &info); + if (!writer) + die("Failed to open %s for writing\n", app_input.output_filename); + #if OUTPUT_RC_STATS // Write out spatial layer stream. // TODO(marpan/jianj): allow for writing each spatial and temporal stream. @@ -1230,7 +1174,6 @@ int main(int argc, const char **argv) { #endif if (vpx_codec_destroy(&encoder)) die_codec(&encoder, "Failed to destroy codec"); - if (app_input.passes == 2) stats_close(&app_input.rc_stats, 1); if (writer) { vpx_video_writer_close(writer); } From ebefb90b75f07ea5ab06d6b2a5ea5355c843d266 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Fri, 26 Feb 2021 18:02:24 -0800 Subject: [PATCH 0123/1000] Remove comments for removed 'active_map' parameter Change-Id: I8635f6121e13089c25e201df033d5bc68e2862b4 --- vp9/encoder/vp9_lookahead.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/vp9/encoder/vp9_lookahead.h b/vp9/encoder/vp9_lookahead.h index dbbe3af584..6ac6736673 100644 --- a/vp9/encoder/vp9_lookahead.h +++ b/vp9/encoder/vp9_lookahead.h @@ -82,15 +82,11 @@ int vp9_lookahead_next_show_idx(const struct lookahead_ctx *ctx); * This function will copy the source image into a new framebuffer with * the expected stride/border. * - * If active_map is non-NULL and there is only one frame in the queue, then copy - * only active macroblocks. - * * \param[in] ctx Pointer to the lookahead context * \param[in] src Pointer to the image to enqueue * \param[in] ts_start Timestamp for the start of this frame * \param[in] ts_end Timestamp for the end of this frame * \param[in] flags Flags set on this frame - * \param[in] active_map Map that specifies which macroblock is active */ int vp9_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src, int64_t ts_start, int64_t ts_end, int use_highbitdepth, From d0567bd779febe995020668cc7f6c1193e3e41d6 Mon Sep 17 00:00:00 2001 From: Paul Wilkins Date: Wed, 3 Mar 2021 16:45:42 +0000 Subject: [PATCH 0124/1000] Add fields into RC for Vizier ML experiments. This patch adds fields into the RC data structure for the Vizier. The added fields allow control of some extra rate control parameters and rate distortion. This patch also adds functions to initialize the various parameters though many are not yet used / wired in and for now all are set to default values. Ultimately many will be set through new command line options. Change-Id: I41591bb627d3837d2104fb363845adedbddf2e02 --- vp9/encoder/vp9_encoder.c | 1 + vp9/encoder/vp9_encoder.h | 1 + vp9/encoder/vp9_ext_ratectrl.h | 2 +- vp9/encoder/vp9_firstpass.c | 164 ++++++++++++++++++++++++++++----- vp9/encoder/vp9_ratectrl.h | 13 +++ vp9/encoder/vp9_rd.c | 103 +++++++++++++++++---- vp9/encoder/vp9_rd.h | 14 +++ 7 files changed, 257 insertions(+), 41 deletions(-) diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 2757bc4285..ecd15cb011 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -2317,6 +2317,7 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf, cpi->frame_info = vp9_get_frame_info(oxcf); vp9_rc_init(&cpi->oxcf, oxcf->pass, &cpi->rc); + vp9_init_rd_parameters(cpi); init_frame_indexes(cm); cpi->partition_search_skippable_frame = 0; diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 8763a5e789..12520fb82a 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -746,6 +746,7 @@ typedef struct VP9_COMP { // Ambient reconstruction err target for force key frames int64_t ambient_err; + RD_CONTROL rd_ctrl; RD_OPT rd; CODING_CONTEXT coding_context; diff --git a/vp9/encoder/vp9_ext_ratectrl.h b/vp9/encoder/vp9_ext_ratectrl.h index 2142363085..74fd68b96d 100644 --- a/vp9/encoder/vp9_ext_ratectrl.h +++ b/vp9/encoder/vp9_ext_ratectrl.h @@ -43,6 +43,6 @@ vpx_codec_err_t vp9_extrc_update_encodeframe_result( EXT_RATECTRL *ext_ratectrl, int64_t bit_count, const YV12_BUFFER_CONFIG *source_frame, const YV12_BUFFER_CONFIG *coded_frame, uint32_t bit_depth, - uint32_t input_bit_depth, int actual_encoding_qindex); + uint32_t input_bit_depth, const int actual_encoding_qindex); #endif // VPX_VP9_ENCODER_VP9_EXT_RATECTRL_H_ diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index 2a9cf52898..fe7abef07e 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -54,6 +54,31 @@ #define NCOUNT_INTRA_THRESH 8192 #define NCOUNT_INTRA_FACTOR 3 +#define SR_DIFF_PART 0.0015 +#define INTRA_PART 0.005 +#define DEFAULT_DECAY_LIMIT 0.75 +#define LOW_SR_DIFF_TRHESH 0.1 +#define SR_DIFF_MAX 128.0 +#define LOW_CODED_ERR_PER_MB 10.0 +#define NCOUNT_FRAME_II_THRESH 6.0 +#define BASELINE_ERR_PER_MB 12500.0 +#define GF_MAX_FRAME_BOOST 96.0 + +#ifdef AGGRESSIVE_VBR +#define KF_MAX_FRAME_BOOST 80.0 +#define MAX_KF_TOT_BOOST 4800 +#else +#define KF_MAX_FRAME_BOOST 96.0 +#define MAX_KF_TOT_BOOST 5400 +#endif + +#define ZM_POWER_FACTOR 0.75 +#define MINQ_ADJ_LIMIT 48 +#define MINQ_ADJ_LIMIT_CQ 20 +#define HIGH_UNDERSHOOT_RATIO 2 +#define AV_WQ_FACTOR 4.0 +#define DEF_EPMB_LOW 2000.0 + #define DOUBLE_DIVIDE_CHECK(x) ((x) < 0 ? (x)-0.000001 : (x) + 0.000001) #if ARF_STATS_OUTPUT @@ -1807,14 +1832,6 @@ void vp9_init_second_pass(VP9_COMP *cpi) { twopass->arnr_strength_adjustment = 0; } -#define SR_DIFF_PART 0.0015 -#define INTRA_PART 0.005 -#define DEFAULT_DECAY_LIMIT 0.75 -#define LOW_SR_DIFF_TRHESH 0.1 -#define SR_DIFF_MAX 128.0 -#define LOW_CODED_ERR_PER_MB 10.0 -#define NCOUNT_FRAME_II_THRESH 6.0 - static double get_sr_decay_rate(const FRAME_INFO *frame_info, const FIRSTPASS_STATS *frame) { double sr_diff = (frame->sr_coded_error - frame->coded_error); @@ -1853,8 +1870,6 @@ static double get_zero_motion_factor(const FRAME_INFO *frame_info, return VPXMIN(sr_decay, zero_motion_pct); } -#define ZM_POWER_FACTOR 0.75 - static double get_prediction_decay_rate(const FRAME_INFO *frame_info, const FIRSTPASS_STATS *frame_stats) { const double sr_decay_rate = get_sr_decay_rate(frame_info, frame_stats); @@ -1942,8 +1957,6 @@ static void accumulate_frame_motion_stats(const FIRSTPASS_STATS *stats, } } -#define BASELINE_ERR_PER_MB 12500.0 -#define GF_MAX_BOOST 96.0 static double calc_frame_boost(const FRAME_INFO *frame_info, const FIRSTPASS_STATS *this_frame, int avg_frame_qindex, @@ -1965,7 +1978,7 @@ static double calc_frame_boost(const FRAME_INFO *frame_info, // Q correction and scalling frame_boost = frame_boost * boost_q_correction; - return VPXMIN(frame_boost, GF_MAX_BOOST * boost_q_correction); + return VPXMIN(frame_boost, GF_MAX_FRAME_BOOST * boost_q_correction); } static double kf_err_per_mb(VP9_COMP *cpi) { @@ -3159,14 +3172,6 @@ static int test_candidate_kf(const FIRST_PASS_INFO *first_pass_info, #define MIN_SCAN_FRAMES_FOR_KF_BOOST 32 #define KF_ABS_ZOOM_THRESH 6.0 -#ifdef AGGRESSIVE_VBR -#define KF_MAX_FRAME_BOOST 80.0 -#define MAX_KF_TOT_BOOST 4800 -#else -#define KF_MAX_FRAME_BOOST 96.0 -#define MAX_KF_TOT_BOOST 5400 -#endif - int vp9_get_frames_to_next_key(const VP9EncoderConfig *oxcf, const FRAME_INFO *frame_info, const FIRST_PASS_INFO *first_pass_info, @@ -3470,6 +3475,113 @@ static int is_skippable_frame(const VP9_COMP *cpi) { twopass->stats_in->pcnt_inter - twopass->stats_in->pcnt_motion == 1); } +// Configure image size specific vizier parameters. +// Later these will be set via additional command line options +static void init_vizier_params(RATE_CONTROL *const rc, int screen_area) { + if (1) { + // Force defaults for now + rc->active_wq_factor = AV_WQ_FACTOR; + rc->base_err_per_mb = BASELINE_ERR_PER_MB; + rc->sr_default_decay_limit = DEFAULT_DECAY_LIMIT; + rc->sr_diff_part = SR_DIFF_PART; + rc->gf_frame_max_boost = GF_MAX_FRAME_BOOST; + rc->gf_max_total_boost = MAX_GF_BOOST; + rc->kf_err_per_mb = DEF_EPMB_LOW; + rc->kf_frame_max_boost_first = KF_MAX_FRAME_BOOST; // Max for first kf. + rc->kf_frame_max_boost_subs = KF_MAX_FRAME_BOOST / 2; // Max for other kfs. + rc->kf_max_total_boost = MAX_KF_TOT_BOOST; + rc->zm_power_factor = ZM_POWER_FACTOR; + } else { + // Vizer experimental parameters from training. + // Later these will be set via the command line. + if (screen_area <= 176 * 144) { + rc->active_wq_factor = 46.0; + rc->base_err_per_mb = 37597.399760969536; + rc->sr_default_decay_limit = 0.3905639800962774; + rc->sr_diff_part = 0.009599023654146284; + rc->gf_frame_max_boost = 87.27362648627846; + rc->gf_max_total_boost = MAX_GF_BOOST; + rc->kf_err_per_mb = 1854.8255436877148; + rc->kf_frame_max_boost_first = KF_MAX_FRAME_BOOST; + rc->kf_frame_max_boost_subs = rc->kf_frame_max_boost_first / 2; + rc->kf_max_total_boost = MAX_KF_TOT_BOOST; + rc->zm_power_factor = 2.93715229184991; + } else if (screen_area <= 320 * 240) { + rc->active_wq_factor = 55.0; + rc->base_err_per_mb = 34525.33177195309; + rc->sr_default_decay_limit = 0.23901360046804604; + rc->sr_diff_part = 0.008581014394766773; + rc->gf_frame_max_boost = 127.34978204980285; + rc->gf_max_total_boost = MAX_GF_BOOST; + rc->kf_err_per_mb = 723.8337508755031; + rc->kf_frame_max_boost_first = KF_MAX_FRAME_BOOST; + rc->kf_frame_max_boost_subs = rc->kf_frame_max_boost_first / 2; + rc->kf_max_total_boost = MAX_KF_TOT_BOOST; + rc->zm_power_factor = 3.5299221493593413; + } else if (screen_area <= 640 * 360) { + rc->active_wq_factor = 12.5; + rc->base_err_per_mb = 18823.978018028298; + rc->sr_default_decay_limit = 0.6043527690301296; + rc->sr_diff_part = 0.00343296783885544; + rc->gf_frame_max_boost = 75.17672317013668; + rc->gf_max_total_boost = MAX_GF_BOOST; + rc->kf_err_per_mb = 422.2871502380377; + rc->kf_frame_max_boost_first = KF_MAX_FRAME_BOOST; + rc->kf_frame_max_boost_subs = rc->kf_frame_max_boost_first / 2; + rc->kf_max_total_boost = MAX_KF_TOT_BOOST; + rc->zm_power_factor = 2.265742666649307; + } else if (screen_area <= 854 * 480) { + rc->active_wq_factor = 51.5; + rc->base_err_per_mb = 33718.98307662595; + rc->sr_default_decay_limit = 0.33633414970713393; + rc->sr_diff_part = 0.00868988716928333; + rc->gf_frame_max_boost = 85.2868528581522; + rc->gf_max_total_boost = MAX_GF_BOOST; + rc->kf_err_per_mb = 1513.4883914008383; + rc->kf_frame_max_boost_first = KF_MAX_FRAME_BOOST; + rc->kf_frame_max_boost_subs = rc->kf_frame_max_boost_first / 2; + rc->kf_max_total_boost = MAX_KF_TOT_BOOST; + rc->zm_power_factor = 3.552278528517416; + } else if (screen_area <= 1280 * 720) { + rc->active_wq_factor = 41.5; + rc->base_err_per_mb = 29527.46375825401; + rc->sr_default_decay_limit = 0.5009117586299728; + rc->sr_diff_part = 0.005007364627260114; + rc->gf_frame_max_boost = GF_MAX_FRAME_BOOST; + rc->gf_max_total_boost = MAX_GF_BOOST; + rc->kf_err_per_mb = 998.6342911785146; + rc->kf_frame_max_boost_first = KF_MAX_FRAME_BOOST; + rc->kf_frame_max_boost_subs = rc->kf_frame_max_boost_first / 2; + rc->kf_max_total_boost = MAX_KF_TOT_BOOST; + rc->zm_power_factor = 2.568627575572356; + } else if (screen_area <= 1920 * 1080) { + rc->active_wq_factor = 31.0; + rc->base_err_per_mb = 34474.723463367416; + rc->sr_default_decay_limit = 0.23346886902707745; + rc->sr_diff_part = 0.011431716637966029; + rc->gf_frame_max_boost = 81.00472969483079; + rc->gf_max_total_boost = MAX_GF_BOOST; + rc->kf_err_per_mb = 35931.25734431429; + rc->kf_frame_max_boost_first = KF_MAX_FRAME_BOOST; + rc->kf_frame_max_boost_subs = rc->kf_frame_max_boost_first / 2; + rc->kf_max_total_boost = MAX_KF_TOT_BOOST; + rc->zm_power_factor = 5.5776463538431935; + } else { + rc->active_wq_factor = AV_WQ_FACTOR; + rc->base_err_per_mb = BASELINE_ERR_PER_MB; + rc->sr_default_decay_limit = DEFAULT_DECAY_LIMIT; + rc->sr_diff_part = SR_DIFF_PART; + rc->gf_frame_max_boost = GF_MAX_FRAME_BOOST; + rc->gf_max_total_boost = MAX_GF_BOOST; + rc->kf_err_per_mb = DEF_EPMB_LOW; + rc->kf_frame_max_boost_first = KF_MAX_FRAME_BOOST; + rc->kf_frame_max_boost_subs = rc->kf_frame_max_boost_first / 2; + rc->kf_max_total_boost = MAX_KF_TOT_BOOST; + rc->zm_power_factor = ZM_POWER_FACTOR; + } + } +} + void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; @@ -3480,6 +3592,13 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { if (!twopass->stats_in) return; + // Configure image size specific vizier parameters + if (cm->current_video_frame == 0) { + unsigned int screen_area = (cm->width * cm->height); + + init_vizier_params(rc, screen_area); + } + // If this is an arf frame then we dont want to read the stats file or // advance the input pointer as we already have what we need. if (gf_group->update_type[gf_group->index] == ARF_UPDATE) { @@ -3605,9 +3724,6 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { subtract_stats(&twopass->total_left_stats, &this_frame); } -#define MINQ_ADJ_LIMIT 48 -#define MINQ_ADJ_LIMIT_CQ 20 -#define HIGH_UNDERSHOOT_RATIO 2 void vp9_twopass_postencode_update(VP9_COMP *cpi) { TWO_PASS *const twopass = &cpi->twopass; RATE_CONTROL *const rc = &cpi->rc; diff --git a/vp9/encoder/vp9_ratectrl.h b/vp9/encoder/vp9_ratectrl.h index 0120f90a01..7437d309e0 100644 --- a/vp9/encoder/vp9_ratectrl.h +++ b/vp9/encoder/vp9_ratectrl.h @@ -204,6 +204,19 @@ typedef struct { int preserve_arf_as_gld; int preserve_next_arf_as_gld; int show_arf_as_gld; + + // Vizeir project experimental rate control parameters. + double active_wq_factor; + double base_err_per_mb; + double sr_default_decay_limit; + double sr_diff_part; + double kf_frame_max_boost_first; // Max for first kf in a chunk. + double kf_frame_max_boost_subs; // Max for subsequent mid chunk kfs. + double kf_max_total_boost; + double kf_err_per_mb; + double gf_frame_max_boost; + double gf_max_total_boost; + double zm_power_factor; } RATE_CONTROL; struct VP9_COMP; diff --git a/vp9/encoder/vp9_rd.c b/vp9/encoder/vp9_rd.c index 34c74424ce..b126d8708d 100644 --- a/vp9/encoder/vp9_rd.c +++ b/vp9/encoder/vp9_rd.c @@ -197,28 +197,99 @@ static const int rd_boost_factor[16] = { 64, 32, 32, 32, 24, 16, 12, 12, static const int rd_frame_type_factor[FRAME_UPDATE_TYPES] = { 128, 144, 128, 128, 144, 144 }; +// Configure Vizier RD parameters. +// Later this function will use passed in command line values. +void vp9_init_rd_parameters(VP9_COMP *cpi) { + RD_CONTROL *const rdc = &cpi->rd_ctrl; + unsigned int screen_area = (cpi->common.width * cpi->common.height); + + // Make sure this function is floating point safe. + vpx_clear_system_state(); + + if (1) { + // Non/pre-Vizer defaults + rdc->rd_mult_q_sq_inter_low_qp = 4.0; + rdc->rd_mult_q_sq_inter_mid_qp = 4.5; + rdc->rd_mult_q_sq_inter_high_qp = 3.0; + rdc->rd_mult_q_sq_key_ultralow_qp = 4.0; + rdc->rd_mult_q_sq_key_low_qp = 3.5; + rdc->rd_mult_q_sq_key_mid_qp = 4.5; + rdc->rd_mult_q_sq_key_high_qp = 7.5; + } else if (screen_area <= 176 * 144) { + rdc->rd_mult_q_sq_inter_high_qp = 4.295745965132044; + rdc->rd_mult_q_sq_inter_low_qp = 4.0718581295922025; + rdc->rd_mult_q_sq_inter_mid_qp = 4.031435609256739; + rdc->rd_mult_q_sq_key_low_qp = 5.7037775720838155; + rdc->rd_mult_q_sq_key_mid_qp = 4.72424015517201; + rdc->rd_mult_q_sq_key_ultralow_qp = 4.290774097327333; + } else if (screen_area <= 320 * 240) { + rdc->rd_mult_q_sq_inter_high_qp = 4.388244213131458; + rdc->rd_mult_q_sq_inter_low_qp = 4.506676356706102; + rdc->rd_mult_q_sq_inter_mid_qp = 4.489349899621181; + rdc->rd_mult_q_sq_key_low_qp = 4.497000582319771; + rdc->rd_mult_q_sq_key_mid_qp = 4.2825894884789735; + rdc->rd_mult_q_sq_key_ultralow_qp = 4.217074424696166; + } else if (screen_area <= 640 * 360) { + rdc->rd_mult_q_sq_inter_high_qp = 4.3702861603380025; + rdc->rd_mult_q_sq_inter_low_qp = 4.730644123689013; + rdc->rd_mult_q_sq_inter_mid_qp = 4.314589509578551; + rdc->rd_mult_q_sq_key_low_qp = 6.068652999601526; + rdc->rd_mult_q_sq_key_mid_qp = 4.817707474077241; + rdc->rd_mult_q_sq_key_ultralow_qp = 4.576902541873747; + } else if (screen_area <= 854 * 480) { + rdc->rd_mult_q_sq_inter_high_qp = 3.969083125219539; + rdc->rd_mult_q_sq_inter_low_qp = 4.811470143416073; + rdc->rd_mult_q_sq_inter_mid_qp = 4.621618127750201; + rdc->rd_mult_q_sq_key_low_qp = 5.073157238799473; + rdc->rd_mult_q_sq_key_mid_qp = 5.7587672849242635; + rdc->rd_mult_q_sq_key_ultralow_qp = 4.9854544277222566; + } else if (screen_area <= 1280 * 720) { + rdc->rd_mult_q_sq_inter_high_qp = 4.410712348825541; + rdc->rd_mult_q_sq_inter_low_qp = 5.119381136011107; + rdc->rd_mult_q_sq_inter_mid_qp = 4.518613675766538; + rdc->rd_mult_q_sq_key_low_qp = 5.848703119971484; + rdc->rd_mult_q_sq_key_mid_qp = 5.368947246228739; + rdc->rd_mult_q_sq_key_ultralow_qp = 3.9468491666607326; + } else if (screen_area <= 1920 * 1080) { + rdc->rd_mult_q_sq_inter_high_qp = 3.2141187537667797; + rdc->rd_mult_q_sq_inter_low_qp = 6.00569815296199; + rdc->rd_mult_q_sq_inter_mid_qp = 3.932565684947023; + rdc->rd_mult_q_sq_key_low_qp = 10.582906599488298; + rdc->rd_mult_q_sq_key_mid_qp = 6.274162346360692; + rdc->rd_mult_q_sq_key_ultralow_qp = 4.399795006320089; + } +} + int vp9_compute_rd_mult_based_on_qindex(const VP9_COMP *cpi, int qindex) { - // largest dc_quant is 21387, therefore rdmult should always fit in int32_t + const RD_CONTROL *rdc = &cpi->rd_ctrl; const int q = vp9_dc_quant(qindex, 0, cpi->common.bit_depth); - uint32_t rdmult = q * q; + // largest dc_quant is 21387, therefore rdmult should fit in int32_t + int rdmult = q * q; + + // Make sure this function is floating point safe. + vpx_clear_system_state(); if (cpi->common.frame_type != KEY_FRAME) { - if (qindex < 128) - rdmult = rdmult * 4; - else if (qindex < 190) - rdmult = rdmult * 4 + rdmult / 2; - else - rdmult = rdmult * 3; + if (qindex < 128) { + rdmult = (int)((double)rdmult * rdc->rd_mult_q_sq_inter_low_qp); + } else if (qindex < 190) { + rdmult = (int)((double)rdmult * rdc->rd_mult_q_sq_inter_mid_qp); + } else { + rdmult = (int)((double)rdmult * rdc->rd_mult_q_sq_inter_high_qp); + } } else { - if (qindex < 64) - rdmult = rdmult * 4; - else if (qindex <= 128) - rdmult = rdmult * 3 + rdmult / 2; - else if (qindex < 190) - rdmult = rdmult * 4 + rdmult / 2; - else - rdmult = rdmult * 7 + rdmult / 2; + if (qindex < 64) { + rdmult = (int)((double)rdmult * rdc->rd_mult_q_sq_key_ultralow_qp); + } else if (qindex <= 128) { + rdmult = (int)((double)rdmult * rdc->rd_mult_q_sq_key_low_qp); + } else if (qindex < 190) { + rdmult = (int)((double)rdmult * rdc->rd_mult_q_sq_key_mid_qp); + + } else { + rdmult = (int)((double)rdmult * rdc->rd_mult_q_sq_key_high_qp); + } } + #if CONFIG_VP9_HIGHBITDEPTH switch (cpi->common.bit_depth) { case VPX_BITS_10: rdmult = ROUND_POWER_OF_TWO(rdmult, 4); break; diff --git a/vp9/encoder/vp9_rd.h b/vp9/encoder/vp9_rd.h index 4c04c95482..2c9f5e7408 100644 --- a/vp9/encoder/vp9_rd.h +++ b/vp9/encoder/vp9_rd.h @@ -101,6 +101,18 @@ typedef enum { THR_INTRA, } THR_MODES_SUB8X8; +typedef struct { + // RD control parameters + // Added for Vizier project. + double rd_mult_q_sq_inter_low_qp; + double rd_mult_q_sq_inter_mid_qp; + double rd_mult_q_sq_inter_high_qp; + double rd_mult_q_sq_key_ultralow_qp; + double rd_mult_q_sq_key_low_qp; + double rd_mult_q_sq_key_mid_qp; + double rd_mult_q_sq_key_high_qp; +} RD_CONTROL; + typedef struct RD_OPT { // Thresh_mult is used to set a threshold for the rd score. A higher value // means that we will accept the best mode so far more often. This number @@ -144,6 +156,8 @@ struct TileDataEnc; struct VP9_COMP; struct macroblock; +void vp9_init_rd_parameters(struct VP9_COMP *cpi); + int vp9_compute_rd_mult_based_on_qindex(const struct VP9_COMP *cpi, int qindex); int vp9_compute_rd_mult(const struct VP9_COMP *cpi, int qindex); From 2570e33eceb7c489851e689a07473d1889206059 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Wed, 3 Mar 2021 14:46:48 -0800 Subject: [PATCH 0125/1000] override assembler with --as option on msvs Bug: webm:1709 Change-Id: I962a64c00042fe95cc1cd845b187f71ad6cfd1b7 --- build/make/configure.sh | 4 ---- build/make/gen_msvs_vcxproj.sh | 10 ++++++---- examples.mk | 1 + libs.mk | 6 ++++++ tools.mk | 1 + 5 files changed, 14 insertions(+), 8 deletions(-) diff --git a/build/make/configure.sh b/build/make/configure.sh index c4e938fc72..81d30a16c7 100644 --- a/build/make/configure.sh +++ b/build/make/configure.sh @@ -1296,10 +1296,6 @@ EOF enabled optimizations && disabled gprof && check_add_cflags -fomit-frame-pointer ;; vs*) - # When building with Microsoft Visual Studio the assembler is - # invoked directly. Checking at configure time is unnecessary. - # Skip the check by setting AS arbitrarily - AS=msvs msvs_arch_dir=x86-msvs case ${tgt_cc##vs} in 14) diff --git a/build/make/gen_msvs_vcxproj.sh b/build/make/gen_msvs_vcxproj.sh index bb1c31d230..6f91ad4781 100755 --- a/build/make/gen_msvs_vcxproj.sh +++ b/build/make/gen_msvs_vcxproj.sh @@ -157,6 +157,8 @@ for opt in "$@"; do ;; --lib) proj_kind="lib" ;; + --as=*) as="${optval}" + ;; --src-path-bare=*) src_path_bare=$(fix_path "$optval") src_path_bare=${src_path_bare%/} @@ -247,13 +249,13 @@ libs=${libs// /;} case "$target" in x86_64*) platforms[0]="x64" - asm_Debug_cmdline="yasm -Xvc -g cv8 -f win64 ${yasmincs} "%(FullPath)"" - asm_Release_cmdline="yasm -Xvc -f win64 ${yasmincs} "%(FullPath)"" + asm_Debug_cmdline="${as} -Xvc -gcv8 -f win64 ${yasmincs} "%(FullPath)"" + asm_Release_cmdline="${as} -Xvc -f win64 ${yasmincs} "%(FullPath)"" ;; x86*) platforms[0]="Win32" - asm_Debug_cmdline="yasm -Xvc -g cv8 -f win32 ${yasmincs} "%(FullPath)"" - asm_Release_cmdline="yasm -Xvc -f win32 ${yasmincs} "%(FullPath)"" + asm_Debug_cmdline="${as} -Xvc -gcv8 -f win32 ${yasmincs} "%(FullPath)"" + asm_Release_cmdline="${as} -Xvc -f win32 ${yasmincs} "%(FullPath)"" ;; arm64*) platforms[0]="ARM64" diff --git a/examples.mk b/examples.mk index a28e529359..42886f1e15 100644 --- a/examples.mk +++ b/examples.mk @@ -376,6 +376,7 @@ $(1): $($(1:.$(VCPROJ_SFX)=).SRCS) vpx.$(VCPROJ_SFX) --ver=$$(CONFIG_VS_VERSION)\ --proj-guid=$$($$(@:.$(VCPROJ_SFX)=).GUID)\ --src-path-bare="$(SRC_PATH_BARE)" \ + --as=$$(AS) \ $$(if $$(CONFIG_STATIC_MSVCRT),--static-crt) \ --out=$$@ $$(INTERNAL_CFLAGS) $$(CFLAGS) \ $$(INTERNAL_LDFLAGS) $$(LDFLAGS) -l$$(CODEC_LIB) $$^ diff --git a/libs.mk b/libs.mk index b5bc35755c..cabd4ed141 100644 --- a/libs.mk +++ b/libs.mk @@ -232,6 +232,7 @@ vpx.$(VCPROJ_SFX): $(CODEC_SRCS) vpx.def --ver=$(CONFIG_VS_VERSION) \ --src-path-bare="$(SRC_PATH_BARE)" \ --out=$@ $(CFLAGS) \ + --as=$(AS) \ $(filter $(SRC_PATH_BARE)/vp8/%.c, $(VCPROJ_SRCS)) \ $(filter $(SRC_PATH_BARE)/vp8/%.h, $(VCPROJ_SRCS)) \ $(filter $(SRC_PATH_BARE)/vp9/%.c, $(VCPROJ_SRCS)) \ @@ -262,6 +263,7 @@ vp9rc.$(VCPROJ_SFX): $(RC_RTC_SRCS) --ver=$(CONFIG_VS_VERSION) \ --src-path-bare="$(SRC_PATH_BARE)" \ --out=$@ $(CFLAGS) \ + --as=$(AS) \ $(filter $(SRC_PATH_BARE)/vp9/%.c, $(VCPROJ_SRCS)) \ $(filter $(SRC_PATH_BARE)/vp9/%.cc, $(VCPROJ_SRCS)) \ $(filter $(SRC_PATH_BARE)/vp9/%.h, $(VCPROJ_SRCS)) \ @@ -536,6 +538,7 @@ gtest.$(VCPROJ_SFX): $(SRC_PATH_BARE)/third_party/googletest/src/src/gtest-all.c --proj-guid=EC00E1EC-AF68-4D92-A255-181690D1C9B1 \ --ver=$(CONFIG_VS_VERSION) \ --src-path-bare="$(SRC_PATH_BARE)" \ + --as=$(AS) \ -D_VARIADIC_MAX=10 \ --out=gtest.$(VCPROJ_SFX) $(SRC_PATH_BARE)/third_party/googletest/src/src/gtest-all.cc \ -I. -I"$(SRC_PATH_BARE)/third_party/googletest/src/include" -I"$(SRC_PATH_BARE)/third_party/googletest/src" @@ -552,6 +555,7 @@ test_libvpx.$(VCPROJ_SFX): $(LIBVPX_TEST_SRCS) vpx.$(VCPROJ_SFX) gtest.$(VCPROJ_ --proj-guid=CD837F5F-52D8-4314-A370-895D614166A7 \ --ver=$(CONFIG_VS_VERSION) \ --src-path-bare="$(SRC_PATH_BARE)" \ + --as=$(AS) \ $(if $(CONFIG_STATIC_MSVCRT),--static-crt) \ --out=$@ $(INTERNAL_CFLAGS) $(CFLAGS) \ -I. -I"$(SRC_PATH_BARE)/third_party/googletest/src/include" \ @@ -574,6 +578,7 @@ test_intra_pred_speed.$(VCPROJ_SFX): $(TEST_INTRA_PRED_SPEED_SRCS) vpx.$(VCPROJ_ --proj-guid=CD837F5F-52D8-4314-A370-895D614166A7 \ --ver=$(CONFIG_VS_VERSION) \ --src-path-bare="$(SRC_PATH_BARE)" \ + --as=$(AS) \ $(if $(CONFIG_STATIC_MSVCRT),--static-crt) \ --out=$@ $(INTERNAL_CFLAGS) $(CFLAGS) \ -I. -I"$(SRC_PATH_BARE)/third_party/googletest/src/include" \ @@ -592,6 +597,7 @@ test_rc_interface.$(VCPROJ_SFX): $(RC_INTERFACE_TEST_SRCS) vpx.$(VCPROJ_SFX) \ -D_VARIADIC_MAX=10 \ --proj-guid=30458F88-1BC6-4689-B41C-50F3737AAB27 \ --ver=$(CONFIG_VS_VERSION) \ + --as=$(AS) \ --src-path-bare="$(SRC_PATH_BARE)" \ $(if $(CONFIG_STATIC_MSVCRT),--static-crt) \ --out=$@ $(INTERNAL_CFLAGS) $(CFLAGS) \ diff --git a/tools.mk b/tools.mk index 1d005b2acf..dd2ebeb3d5 100644 --- a/tools.mk +++ b/tools.mk @@ -79,6 +79,7 @@ $(1): $($(1:.$(VCPROJ_SFX)=).SRCS) vpx.$(VCPROJ_SFX) --ver=$$(CONFIG_VS_VERSION)\ --proj-guid=$$($$(@:.$(VCPROJ_SFX)=).GUID)\ --src-path-bare="$(SRC_PATH_BARE)" \ + --as=$$(AS) \ $$(if $$(CONFIG_STATIC_MSVCRT),--static-crt) \ --out=$$@ $$(INTERNAL_CFLAGS) $$(CFLAGS) \ $$(INTERNAL_LDFLAGS) $$(LDFLAGS) $$^ From f7c386bab0e637dff65c3fe546a83e9564028aff Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Wed, 3 Mar 2021 17:33:30 -0800 Subject: [PATCH 0126/1000] Use -std=gnu++11 instead of -std=c++11 Cygwin and msys2 have stricter compliance requirement over standard c headers. Bug: webm:1708 Change-Id: I676b1227b9dd304149e50016468df0f057c6a78f --- configure | 12 ++++++------ examples/vpx_dec_fuzzer.cc | 2 +- third_party/libwebm/Android.mk | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/configure b/configure index f7e11aaf2d..da631a45e1 100755 --- a/configure +++ b/configure @@ -731,33 +731,33 @@ process_toolchain() { soft_enable libyuv ;; *-android-*) - check_add_cxxflags -std=c++11 && soft_enable webm_io + check_add_cxxflags -std=gnu++11 && soft_enable webm_io soft_enable libyuv # GTestLog must be modified to use Android logging utilities. ;; *-darwin-*) - check_add_cxxflags -std=c++11 + check_add_cxxflags -std=gnu++11 # iOS/ARM builds do not work with gtest. This does not match # x86 targets. ;; *-iphonesimulator-*) - check_add_cxxflags -std=c++11 && soft_enable webm_io + check_add_cxxflags -std=gnu++11 && soft_enable webm_io soft_enable libyuv ;; *-win*) # Some mingw toolchains don't have pthread available by default. # Treat these more like visual studio where threading in gtest # would be disabled for the same reason. - check_add_cxxflags -std=c++11 && soft_enable unit_tests \ + check_add_cxxflags -std=gnu++11 && soft_enable unit_tests \ && soft_enable webm_io check_cxx "$@" < Date: Thu, 4 Mar 2021 18:43:29 -0800 Subject: [PATCH 0127/1000] Check for _WIN32 instead of WIN32. _WIN32 is predefined for the Windows platform in MSVC, whereas WIN32 is not, and WIN32 is also not defined in the makefiles. Change-Id: I8b58e42d891608dbe1e1313dc9629c2be588d9ec --- vp8/decoder/threading.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vp8/decoder/threading.c b/vp8/decoder/threading.c index 561922de32..491e2ce4c1 100644 --- a/vp8/decoder/threading.c +++ b/vp8/decoder/threading.c @@ -10,7 +10,7 @@ #include "vpx_config.h" #include "vp8_rtcd.h" -#if !defined(WIN32) && CONFIG_OS_SUPPORT == 1 +#if !defined(_WIN32) && CONFIG_OS_SUPPORT == 1 #include #endif #include "onyxd_int.h" From f27c62c5dfb75c27130e03c724fc5e6f22e03510 Mon Sep 17 00:00:00 2001 From: Paul Wilkins Date: Thu, 4 Mar 2021 17:10:09 +0000 Subject: [PATCH 0128/1000] Further integration for Vizier. Further integration of Vizier adjustable parameters, This patch connects up additional configurable two pass rate control parameters for the Vizier project. This still needs to be connected up to a command line interface and at the moment should still be using default values that match previous behavior. Do not submit until verified that defaults are all working correctly. Change-Id: If1241c2dba6759395e6efa349c4659a0c345361d --- vp9/encoder/vp9_firstpass.c | 340 +++++++++++++++++++----------------- vp9/encoder/vp9_firstpass.h | 17 +- vp9/encoder/vp9_ratectrl.h | 13 -- 3 files changed, 191 insertions(+), 179 deletions(-) diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index fe7abef07e..6d6aa5087f 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -77,7 +77,6 @@ #define MINQ_ADJ_LIMIT_CQ 20 #define HIGH_UNDERSHOOT_RATIO 2 #define AV_WQ_FACTOR 4.0 -#define DEF_EPMB_LOW 2000.0 #define DOUBLE_DIVIDE_CHECK(x) ((x) < 0 ? (x)-0.000001 : (x) + 0.000001) @@ -1833,6 +1832,7 @@ void vp9_init_second_pass(VP9_COMP *cpi) { } static double get_sr_decay_rate(const FRAME_INFO *frame_info, + const TWO_PASS *const twopass, const FIRSTPASS_STATS *frame) { double sr_diff = (frame->sr_coded_error - frame->coded_error); double sr_decay = 1.0; @@ -1854,28 +1854,31 @@ static double get_sr_decay_rate(const FRAME_INFO *frame_info, if ((sr_diff > LOW_SR_DIFF_TRHESH)) { sr_diff = VPXMIN(sr_diff, SR_DIFF_MAX); - sr_decay = 1.0 - (SR_DIFF_PART * sr_diff) - motion_amplitude_part - + sr_decay = 1.0 - (twopass->sr_diff_part * sr_diff) - motion_amplitude_part - (INTRA_PART * modified_pcnt_intra); } - return VPXMAX(sr_decay, DEFAULT_DECAY_LIMIT); + return VPXMAX(sr_decay, twopass->sr_default_decay_limit); } // This function gives an estimate of how badly we believe the prediction // quality is decaying from frame to frame. static double get_zero_motion_factor(const FRAME_INFO *frame_info, + const TWO_PASS *const twopass, const FIRSTPASS_STATS *frame_stats) { const double zero_motion_pct = frame_stats->pcnt_inter - frame_stats->pcnt_motion; - double sr_decay = get_sr_decay_rate(frame_info, frame_stats); + double sr_decay = get_sr_decay_rate(frame_info, twopass, frame_stats); return VPXMIN(sr_decay, zero_motion_pct); } static double get_prediction_decay_rate(const FRAME_INFO *frame_info, + const TWO_PASS *const twopass, const FIRSTPASS_STATS *frame_stats) { - const double sr_decay_rate = get_sr_decay_rate(frame_info, frame_stats); + const double sr_decay_rate = + get_sr_decay_rate(frame_info, twopass, frame_stats); const double zero_motion_factor = (0.95 * pow((frame_stats->pcnt_inter - frame_stats->pcnt_motion), - ZM_POWER_FACTOR)); + twopass->zm_power_factor)); return VPXMAX(zero_motion_factor, (sr_decay_rate + ((1.0 - sr_decay_rate) * zero_motion_factor))); @@ -1959,6 +1962,7 @@ static void accumulate_frame_motion_stats(const FIRSTPASS_STATS *stats, static double calc_frame_boost(const FRAME_INFO *frame_info, const FIRSTPASS_STATS *this_frame, + const TWO_PASS *const twopass, int avg_frame_qindex, double this_frame_mv_in_out) { double frame_boost; @@ -1968,7 +1972,7 @@ static double calc_frame_boost(const FRAME_INFO *frame_info, const double active_area = calculate_active_area(frame_info, this_frame); // Underlying boost factor is based on inter error ratio. - frame_boost = (BASELINE_ERR_PER_MB * active_area) / + frame_boost = (twopass->base_err_per_mb * active_area) / DOUBLE_DIVIDE_CHECK(this_frame->coded_error); // Small adjustment for cases where there is a zoom out @@ -1978,37 +1982,25 @@ static double calc_frame_boost(const FRAME_INFO *frame_info, // Q correction and scalling frame_boost = frame_boost * boost_q_correction; - return VPXMIN(frame_boost, GF_MAX_FRAME_BOOST * boost_q_correction); -} - -static double kf_err_per_mb(VP9_COMP *cpi) { - const VP9_COMMON *const cm = &cpi->common; - unsigned int screen_area = (cm->width * cm->height); - - // Use a different error per mb factor for calculating boost for - // different formats. - if (screen_area < 1280 * 720) { - return 2000.0; - } else if (screen_area < 1920 * 1080) { - return 500.0; - } - return 250.0; + return VPXMIN(frame_boost, twopass->gf_frame_max_boost * boost_q_correction); } static double calc_kf_frame_boost(VP9_COMP *cpi, const FIRSTPASS_STATS *this_frame, double *sr_accumulator, double this_frame_mv_in_out, - double max_boost) { + double zm_factor) { + TWO_PASS *const twopass = &cpi->twopass; double frame_boost; const double lq = vp9_convert_qindex_to_q( cpi->rc.avg_frame_qindex[INTER_FRAME], cpi->common.bit_depth); const double boost_q_correction = VPXMIN((0.50 + (lq * 0.015)), 2.00); const double active_area = calculate_active_area(&cpi->frame_info, this_frame); + double max_boost; // Underlying boost factor is based on inter error ratio. - frame_boost = (kf_err_per_mb(cpi) * active_area) / + frame_boost = (twopass->kf_err_per_mb * active_area) / DOUBLE_DIVIDE_CHECK(this_frame->coded_error + *sr_accumulator); // Update the accumulator for second ref error difference. @@ -2027,13 +2019,20 @@ static double calc_kf_frame_boost(VP9_COMP *cpi, // boost calculation. frame_boost = ((frame_boost + 40.0) * boost_q_correction); - return VPXMIN(frame_boost, max_boost * boost_q_correction); + // Maximum allowed boost this frame. May be different for first vs subsequent + // key frames. + max_boost = (cpi->common.current_video_frame == 0) + ? twopass->kf_frame_max_boost_first + : twopass->kf_frame_max_boost_subs; + max_boost *= zm_factor * boost_q_correction; + + return VPXMIN(frame_boost, max_boost); } static int compute_arf_boost(const FRAME_INFO *frame_info, - const FIRST_PASS_INFO *first_pass_info, - int arf_show_idx, int f_frames, int b_frames, - int avg_frame_qindex) { + TWO_PASS *const twopass, int arf_show_idx, + int f_frames, int b_frames, int avg_frame_qindex) { + const FIRST_PASS_INFO *first_pass_info = &twopass->first_pass_info; int i; double boost_score = 0.0; double mv_ratio_accumulator = 0.0; @@ -2064,14 +2063,15 @@ static int compute_arf_boost(const FRAME_INFO *frame_info, // Accumulate the effect of prediction quality decay. if (!flash_detected) { - decay_accumulator *= get_prediction_decay_rate(frame_info, this_frame); + decay_accumulator *= + get_prediction_decay_rate(frame_info, twopass, this_frame); decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR ? MIN_DECAY_FACTOR : decay_accumulator; } - boost_score += decay_accumulator * calc_frame_boost(frame_info, this_frame, - avg_frame_qindex, - this_frame_mv_in_out); + boost_score += decay_accumulator * + calc_frame_boost(frame_info, this_frame, twopass, + avg_frame_qindex, this_frame_mv_in_out); } arf_boost = (int)boost_score; @@ -2104,14 +2104,15 @@ static int compute_arf_boost(const FRAME_INFO *frame_info, // Cumulative effect of prediction quality decay. if (!flash_detected) { - decay_accumulator *= get_prediction_decay_rate(frame_info, this_frame); + decay_accumulator *= + get_prediction_decay_rate(frame_info, twopass, this_frame); decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR ? MIN_DECAY_FACTOR : decay_accumulator; } - boost_score += decay_accumulator * calc_frame_boost(frame_info, this_frame, - avg_frame_qindex, - this_frame_mv_in_out); + boost_score += decay_accumulator * + calc_frame_boost(frame_info, this_frame, twopass, + avg_frame_qindex, this_frame_mv_in_out); } arf_boost += (int)boost_score; @@ -2127,8 +2128,8 @@ static int calc_arf_boost(VP9_COMP *cpi, int f_frames, int b_frames) { TWO_PASS *const twopass = &cpi->twopass; const int avg_inter_frame_qindex = cpi->rc.avg_frame_qindex[INTER_FRAME]; int arf_show_idx = get_show_idx(twopass); - return compute_arf_boost(frame_info, &twopass->first_pass_info, arf_show_idx, - f_frames, b_frames, avg_inter_frame_qindex); + return compute_arf_boost(frame_info, twopass, arf_show_idx, f_frames, + b_frames, avg_inter_frame_qindex); } // Calculate a section intra ratio used in setting max loop filter. @@ -2557,10 +2558,11 @@ typedef struct RANGE { * structs. */ static int get_gop_coding_frame_num( - int *use_alt_ref, const FRAME_INFO *frame_info, - const FIRST_PASS_INFO *first_pass_info, const RATE_CONTROL *rc, - int gf_start_show_idx, const RANGE *active_gf_interval, - double gop_intra_factor, int lag_in_frames) { + int *use_alt_ref, const FRAME_INFO *frame_info, TWO_PASS *const twopass, + const RATE_CONTROL *rc, int gf_start_show_idx, + const RANGE *active_gf_interval, double gop_intra_factor, + int lag_in_frames) { + const FIRST_PASS_INFO *first_pass_info = &twopass->first_pass_info; double loop_decay_rate = 1.00; double mv_ratio_accumulator = 0.0; double this_frame_mv_in_out = 0.0; @@ -2603,13 +2605,14 @@ static int get_gop_coding_frame_num( if ((rc->frames_since_key + gop_coding_frames - 1) > 1) { zero_motion_accumulator = VPXMIN(zero_motion_accumulator, - get_zero_motion_factor(frame_info, next_frame)); + get_zero_motion_factor(frame_info, twopass, next_frame)); } // Accumulate the effect of prediction quality decay. if (!flash_detected) { double last_loop_decay_rate = loop_decay_rate; - loop_decay_rate = get_prediction_decay_rate(frame_info, next_frame); + loop_decay_rate = + get_prediction_decay_rate(frame_info, twopass, next_frame); // Break clause to detect very still sections after motion. For example, // a static image after a fade or other transition. @@ -2807,14 +2810,14 @@ static void define_gf_group(VP9_COMP *cpi, int gf_start_show_idx) { use_alt_ref = gop_command->use_alt_ref; } else { gop_coding_frames = get_gop_coding_frame_num( - &use_alt_ref, frame_info, first_pass_info, rc, gf_start_show_idx, + &use_alt_ref, frame_info, twopass, rc, gf_start_show_idx, &active_gf_interval, gop_intra_factor, cpi->oxcf.lag_in_frames); use_alt_ref &= allow_alt_ref; } } #else gop_coding_frames = get_gop_coding_frame_num( - &use_alt_ref, frame_info, first_pass_info, rc, gf_start_show_idx, + &use_alt_ref, frame_info, twopass, rc, gf_start_show_idx, &active_gf_interval, gop_intra_factor, cpi->oxcf.lag_in_frames); use_alt_ref &= allow_alt_ref; #endif @@ -2836,8 +2839,8 @@ static void define_gf_group(VP9_COMP *cpi, int gf_start_show_idx) { // Calculate the boost for alt ref. rc->gfu_boost = - compute_arf_boost(frame_info, first_pass_info, arf_show_idx, f_frames, - b_frames, avg_inter_frame_qindex); + compute_arf_boost(frame_info, twopass, arf_show_idx, f_frames, b_frames, + avg_inter_frame_qindex); rc->source_alt_ref_pending = 1; } else { const int f_frames = gop_coding_frames - 1; @@ -2847,9 +2850,9 @@ static void define_gf_group(VP9_COMP *cpi, int gf_start_show_idx) { const int gld_show_idx = VPXMIN(gf_start_show_idx + 1, fps_get_num_frames(first_pass_info)); const int arf_boost = - compute_arf_boost(frame_info, first_pass_info, gld_show_idx, f_frames, - b_frames, avg_inter_frame_qindex); - rc->gfu_boost = VPXMIN(MAX_GF_BOOST, arf_boost); + compute_arf_boost(frame_info, twopass, gld_show_idx, f_frames, b_frames, + avg_inter_frame_qindex); + rc->gfu_boost = VPXMIN(twopass->gf_max_total_boost, arf_boost); rc->source_alt_ref_pending = 0; } @@ -2952,7 +2955,9 @@ static void define_gf_group(VP9_COMP *cpi, int gf_start_show_idx) { cpi, group_av_err, (group_av_skip_pct + group_av_inactive_zone), group_av_noise, vbr_group_bits_per_frame); twopass->active_worst_quality = - (tmp_q + (twopass->active_worst_quality * 3)) >> 2; + (int)((tmp_q + (twopass->active_worst_quality * + (twopass->active_wq_factor - 1))) / + twopass->active_wq_factor); #if CONFIG_ALWAYS_ADJUST_BPM // Reset rolling actual and target bits counters for ARF groups. @@ -3174,8 +3179,9 @@ static int test_candidate_kf(const FIRST_PASS_INFO *first_pass_info, int vp9_get_frames_to_next_key(const VP9EncoderConfig *oxcf, const FRAME_INFO *frame_info, - const FIRST_PASS_INFO *first_pass_info, - int kf_show_idx, int min_gf_interval) { + const TWO_PASS *const twopass, int kf_show_idx, + int min_gf_interval) { + const FIRST_PASS_INFO *first_pass_info = &twopass->first_pass_info; double recent_loop_decay[FRAMES_TO_CHECK_DECAY]; int j; int frames_to_key; @@ -3202,7 +3208,8 @@ int vp9_get_frames_to_next_key(const VP9EncoderConfig *oxcf, break; // How fast is the prediction quality decaying? - loop_decay_rate = get_prediction_decay_rate(frame_info, next_frame); + loop_decay_rate = + get_prediction_decay_rate(frame_info, twopass, next_frame); // We want to know something about the recent past... rather than // as used elsewhere where we are concerned with decay in prediction @@ -3289,7 +3296,7 @@ static void find_next_key_frame(VP9_COMP *cpi, int kf_show_idx) { mean_mod_score, av_err); rc->frames_to_key = vp9_get_frames_to_next_key( - oxcf, frame_info, first_pass_info, kf_show_idx, rc->min_gf_interval); + oxcf, frame_info, twopass, kf_show_idx, rc->min_gf_interval); // If there is a max kf interval set by the user we must obey it. // We already breakout of the loop above at 2x max. @@ -3369,9 +3376,9 @@ static void find_next_key_frame(VP9_COMP *cpi, int kf_show_idx) { // Monitor for static sections. // First frame in kf group the second ref indicator is invalid. if (i > 0) { - zero_motion_accumulator = - VPXMIN(zero_motion_accumulator, - get_zero_motion_factor(&cpi->frame_info, &next_frame)); + zero_motion_accumulator = VPXMIN( + zero_motion_accumulator, + get_zero_motion_factor(&cpi->frame_info, twopass, &next_frame)); } else { zero_motion_accumulator = next_frame.pcnt_inter - next_frame.pcnt_motion; @@ -3385,8 +3392,8 @@ static void find_next_key_frame(VP9_COMP *cpi, int kf_show_idx) { // the first key frame or it points to a refernce before the new key // frame. if (i < 2) sr_accumulator = 0.0; - frame_boost = calc_kf_frame_boost(cpi, &next_frame, &sr_accumulator, 0, - KF_MAX_FRAME_BOOST * zm_factor); + frame_boost = + calc_kf_frame_boost(cpi, &next_frame, &sr_accumulator, 0, zm_factor); boost_score += frame_boost; @@ -3415,12 +3422,12 @@ static void find_next_key_frame(VP9_COMP *cpi, int kf_show_idx) { // Special case for static / slide show content but dont apply // if the kf group is very short. if ((zero_motion_accumulator > 0.99) && (rc->frames_to_key > 8)) { - rc->kf_boost = MAX_KF_TOT_BOOST; + rc->kf_boost = twopass->kf_max_total_boost; } else { - // Apply various clamps for min and max boost + // Apply various clamps for min and max oost rc->kf_boost = VPXMAX((int)boost_score, (rc->frames_to_key * 3)); rc->kf_boost = VPXMAX(rc->kf_boost, MIN_KF_TOT_BOOST); - rc->kf_boost = VPXMIN(rc->kf_boost, MAX_KF_TOT_BOOST); + rc->kf_boost = VPXMIN(rc->kf_boost, twopass->kf_max_total_boost); } // Work out how many bits to allocate for the key frame itself. @@ -3477,107 +3484,113 @@ static int is_skippable_frame(const VP9_COMP *cpi) { // Configure image size specific vizier parameters. // Later these will be set via additional command line options -static void init_vizier_params(RATE_CONTROL *const rc, int screen_area) { +static void init_vizier_params(TWO_PASS *const twopass, int screen_area) { if (1) { // Force defaults for now - rc->active_wq_factor = AV_WQ_FACTOR; - rc->base_err_per_mb = BASELINE_ERR_PER_MB; - rc->sr_default_decay_limit = DEFAULT_DECAY_LIMIT; - rc->sr_diff_part = SR_DIFF_PART; - rc->gf_frame_max_boost = GF_MAX_FRAME_BOOST; - rc->gf_max_total_boost = MAX_GF_BOOST; - rc->kf_err_per_mb = DEF_EPMB_LOW; - rc->kf_frame_max_boost_first = KF_MAX_FRAME_BOOST; // Max for first kf. - rc->kf_frame_max_boost_subs = KF_MAX_FRAME_BOOST / 2; // Max for other kfs. - rc->kf_max_total_boost = MAX_KF_TOT_BOOST; - rc->zm_power_factor = ZM_POWER_FACTOR; + twopass->active_wq_factor = AV_WQ_FACTOR; + twopass->base_err_per_mb = BASELINE_ERR_PER_MB; + twopass->sr_default_decay_limit = DEFAULT_DECAY_LIMIT; + twopass->sr_diff_part = SR_DIFF_PART; + twopass->gf_frame_max_boost = GF_MAX_FRAME_BOOST; + twopass->gf_max_total_boost = MAX_GF_BOOST; + if (screen_area < 1280 * 720) { + twopass->kf_err_per_mb = 2000.0; + } else if (screen_area < 1920 * 1080) { + twopass->kf_err_per_mb = 500.0; + } else { + twopass->kf_err_per_mb = 250.0; + } + twopass->kf_frame_max_boost_first = KF_MAX_FRAME_BOOST; + twopass->kf_frame_max_boost_subs = twopass->kf_frame_max_boost_first; + twopass->kf_max_total_boost = MAX_KF_TOT_BOOST; + twopass->zm_power_factor = ZM_POWER_FACTOR; } else { // Vizer experimental parameters from training. // Later these will be set via the command line. if (screen_area <= 176 * 144) { - rc->active_wq_factor = 46.0; - rc->base_err_per_mb = 37597.399760969536; - rc->sr_default_decay_limit = 0.3905639800962774; - rc->sr_diff_part = 0.009599023654146284; - rc->gf_frame_max_boost = 87.27362648627846; - rc->gf_max_total_boost = MAX_GF_BOOST; - rc->kf_err_per_mb = 1854.8255436877148; - rc->kf_frame_max_boost_first = KF_MAX_FRAME_BOOST; - rc->kf_frame_max_boost_subs = rc->kf_frame_max_boost_first / 2; - rc->kf_max_total_boost = MAX_KF_TOT_BOOST; - rc->zm_power_factor = 2.93715229184991; + twopass->active_wq_factor = 46.0; + twopass->base_err_per_mb = 37597.399760969536; + twopass->sr_default_decay_limit = 0.3905639800962774; + twopass->sr_diff_part = 0.009599023654146284; + twopass->gf_frame_max_boost = 87.27362648627846; + twopass->gf_max_total_boost = MAX_GF_BOOST; + twopass->kf_err_per_mb = 1854.8255436877148; + twopass->kf_frame_max_boost_first = KF_MAX_FRAME_BOOST; + twopass->kf_frame_max_boost_subs = twopass->kf_frame_max_boost_first; + twopass->kf_max_total_boost = MAX_KF_TOT_BOOST; + twopass->zm_power_factor = 2.93715229184991; } else if (screen_area <= 320 * 240) { - rc->active_wq_factor = 55.0; - rc->base_err_per_mb = 34525.33177195309; - rc->sr_default_decay_limit = 0.23901360046804604; - rc->sr_diff_part = 0.008581014394766773; - rc->gf_frame_max_boost = 127.34978204980285; - rc->gf_max_total_boost = MAX_GF_BOOST; - rc->kf_err_per_mb = 723.8337508755031; - rc->kf_frame_max_boost_first = KF_MAX_FRAME_BOOST; - rc->kf_frame_max_boost_subs = rc->kf_frame_max_boost_first / 2; - rc->kf_max_total_boost = MAX_KF_TOT_BOOST; - rc->zm_power_factor = 3.5299221493593413; + twopass->active_wq_factor = 55.0; + twopass->base_err_per_mb = 34525.33177195309; + twopass->sr_default_decay_limit = 0.23901360046804604; + twopass->sr_diff_part = 0.008581014394766773; + twopass->gf_frame_max_boost = 127.34978204980285; + twopass->gf_max_total_boost = MAX_GF_BOOST; + twopass->kf_err_per_mb = 723.8337508755031; + twopass->kf_frame_max_boost_first = KF_MAX_FRAME_BOOST; + twopass->kf_frame_max_boost_subs = twopass->kf_frame_max_boost_first; + twopass->kf_max_total_boost = MAX_KF_TOT_BOOST; + twopass->zm_power_factor = 3.5299221493593413; } else if (screen_area <= 640 * 360) { - rc->active_wq_factor = 12.5; - rc->base_err_per_mb = 18823.978018028298; - rc->sr_default_decay_limit = 0.6043527690301296; - rc->sr_diff_part = 0.00343296783885544; - rc->gf_frame_max_boost = 75.17672317013668; - rc->gf_max_total_boost = MAX_GF_BOOST; - rc->kf_err_per_mb = 422.2871502380377; - rc->kf_frame_max_boost_first = KF_MAX_FRAME_BOOST; - rc->kf_frame_max_boost_subs = rc->kf_frame_max_boost_first / 2; - rc->kf_max_total_boost = MAX_KF_TOT_BOOST; - rc->zm_power_factor = 2.265742666649307; + twopass->active_wq_factor = 12.5; + twopass->base_err_per_mb = 18823.978018028298; + twopass->sr_default_decay_limit = 0.6043527690301296; + twopass->sr_diff_part = 0.00343296783885544; + twopass->gf_frame_max_boost = 75.17672317013668; + twopass->gf_max_total_boost = MAX_GF_BOOST; + twopass->kf_err_per_mb = 422.2871502380377; + twopass->kf_frame_max_boost_first = KF_MAX_FRAME_BOOST; + twopass->kf_frame_max_boost_subs = twopass->kf_frame_max_boost_first; + twopass->kf_max_total_boost = MAX_KF_TOT_BOOST; + twopass->zm_power_factor = 2.265742666649307; } else if (screen_area <= 854 * 480) { - rc->active_wq_factor = 51.5; - rc->base_err_per_mb = 33718.98307662595; - rc->sr_default_decay_limit = 0.33633414970713393; - rc->sr_diff_part = 0.00868988716928333; - rc->gf_frame_max_boost = 85.2868528581522; - rc->gf_max_total_boost = MAX_GF_BOOST; - rc->kf_err_per_mb = 1513.4883914008383; - rc->kf_frame_max_boost_first = KF_MAX_FRAME_BOOST; - rc->kf_frame_max_boost_subs = rc->kf_frame_max_boost_first / 2; - rc->kf_max_total_boost = MAX_KF_TOT_BOOST; - rc->zm_power_factor = 3.552278528517416; + twopass->active_wq_factor = 51.5; + twopass->base_err_per_mb = 33718.98307662595; + twopass->sr_default_decay_limit = 0.33633414970713393; + twopass->sr_diff_part = 0.00868988716928333; + twopass->gf_frame_max_boost = 85.2868528581522; + twopass->gf_max_total_boost = MAX_GF_BOOST; + twopass->kf_err_per_mb = 1513.4883914008383; + twopass->kf_frame_max_boost_first = KF_MAX_FRAME_BOOST; + twopass->kf_frame_max_boost_subs = twopass->kf_frame_max_boost_first; + twopass->kf_max_total_boost = MAX_KF_TOT_BOOST; + twopass->zm_power_factor = 3.552278528517416; } else if (screen_area <= 1280 * 720) { - rc->active_wq_factor = 41.5; - rc->base_err_per_mb = 29527.46375825401; - rc->sr_default_decay_limit = 0.5009117586299728; - rc->sr_diff_part = 0.005007364627260114; - rc->gf_frame_max_boost = GF_MAX_FRAME_BOOST; - rc->gf_max_total_boost = MAX_GF_BOOST; - rc->kf_err_per_mb = 998.6342911785146; - rc->kf_frame_max_boost_first = KF_MAX_FRAME_BOOST; - rc->kf_frame_max_boost_subs = rc->kf_frame_max_boost_first / 2; - rc->kf_max_total_boost = MAX_KF_TOT_BOOST; - rc->zm_power_factor = 2.568627575572356; + twopass->active_wq_factor = 41.5; + twopass->base_err_per_mb = 29527.46375825401; + twopass->sr_default_decay_limit = 0.5009117586299728; + twopass->sr_diff_part = 0.005007364627260114; + twopass->gf_frame_max_boost = GF_MAX_FRAME_BOOST; + twopass->gf_max_total_boost = MAX_GF_BOOST; + twopass->kf_err_per_mb = 998.6342911785146; + twopass->kf_frame_max_boost_first = KF_MAX_FRAME_BOOST; + twopass->kf_frame_max_boost_subs = twopass->kf_frame_max_boost_first; + twopass->kf_max_total_boost = MAX_KF_TOT_BOOST; + twopass->zm_power_factor = 2.568627575572356; } else if (screen_area <= 1920 * 1080) { - rc->active_wq_factor = 31.0; - rc->base_err_per_mb = 34474.723463367416; - rc->sr_default_decay_limit = 0.23346886902707745; - rc->sr_diff_part = 0.011431716637966029; - rc->gf_frame_max_boost = 81.00472969483079; - rc->gf_max_total_boost = MAX_GF_BOOST; - rc->kf_err_per_mb = 35931.25734431429; - rc->kf_frame_max_boost_first = KF_MAX_FRAME_BOOST; - rc->kf_frame_max_boost_subs = rc->kf_frame_max_boost_first / 2; - rc->kf_max_total_boost = MAX_KF_TOT_BOOST; - rc->zm_power_factor = 5.5776463538431935; + twopass->active_wq_factor = 31.0; + twopass->base_err_per_mb = 34474.723463367416; + twopass->sr_default_decay_limit = 0.23346886902707745; + twopass->sr_diff_part = 0.011431716637966029; + twopass->gf_frame_max_boost = 81.00472969483079; + twopass->gf_max_total_boost = MAX_GF_BOOST; + twopass->kf_err_per_mb = 35931.25734431429; + twopass->kf_frame_max_boost_first = KF_MAX_FRAME_BOOST; + twopass->kf_frame_max_boost_subs = twopass->kf_frame_max_boost_first; + twopass->kf_max_total_boost = MAX_KF_TOT_BOOST; + twopass->zm_power_factor = 5.5776463538431935; } else { - rc->active_wq_factor = AV_WQ_FACTOR; - rc->base_err_per_mb = BASELINE_ERR_PER_MB; - rc->sr_default_decay_limit = DEFAULT_DECAY_LIMIT; - rc->sr_diff_part = SR_DIFF_PART; - rc->gf_frame_max_boost = GF_MAX_FRAME_BOOST; - rc->gf_max_total_boost = MAX_GF_BOOST; - rc->kf_err_per_mb = DEF_EPMB_LOW; - rc->kf_frame_max_boost_first = KF_MAX_FRAME_BOOST; - rc->kf_frame_max_boost_subs = rc->kf_frame_max_boost_first / 2; - rc->kf_max_total_boost = MAX_KF_TOT_BOOST; - rc->zm_power_factor = ZM_POWER_FACTOR; + twopass->active_wq_factor = AV_WQ_FACTOR; + twopass->base_err_per_mb = BASELINE_ERR_PER_MB; + twopass->sr_default_decay_limit = DEFAULT_DECAY_LIMIT; + twopass->sr_diff_part = SR_DIFF_PART; + twopass->gf_frame_max_boost = GF_MAX_FRAME_BOOST; + twopass->gf_max_total_boost = MAX_GF_BOOST; + twopass->kf_err_per_mb = 250.0; + twopass->kf_frame_max_boost_first = KF_MAX_FRAME_BOOST; + twopass->kf_frame_max_boost_subs = twopass->kf_frame_max_boost_first; + twopass->kf_max_total_boost = MAX_KF_TOT_BOOST; + twopass->zm_power_factor = ZM_POWER_FACTOR; } } } @@ -3596,7 +3609,7 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { if (cm->current_video_frame == 0) { unsigned int screen_area = (cm->width * cm->height); - init_vizier_params(rc, screen_area); + init_vizier_params(twopass, screen_area); } // If this is an arf frame then we dont want to read the stats file or @@ -3862,9 +3875,9 @@ void vp9_get_next_group_of_picture(const VP9_COMP *cpi, int *first_is_key_frame, *first_is_key_frame = 0; if (rc.frames_to_key == 0) { - rc.frames_to_key = vp9_get_frames_to_next_key( - &cpi->oxcf, &cpi->frame_info, &cpi->twopass.first_pass_info, - *first_show_idx, rc.min_gf_interval); + rc.frames_to_key = + vp9_get_frames_to_next_key(&cpi->oxcf, &cpi->frame_info, twopass, + *first_show_idx, rc.min_gf_interval); rc.frames_since_key = 0; *first_is_key_frame = 1; } @@ -3903,9 +3916,9 @@ int vp9_get_gop_coding_frame_count(const VP9EncoderConfig *oxcf, gop_intra_factor = 1.0; } - frame_count = get_gop_coding_frame_num( - use_alt_ref, frame_info, first_pass_info, rc, show_idx, - &active_gf_interval, gop_intra_factor, oxcf->lag_in_frames); + frame_count = get_gop_coding_frame_num(use_alt_ref, frame_info, twopass, rc, + show_idx, &active_gf_interval, + gop_intra_factor, oxcf->lag_in_frames); *use_alt_ref &= allow_alt_ref; return frame_count; } @@ -3929,7 +3942,7 @@ int vp9_get_coding_frame_num(const VP9EncoderConfig *oxcf, int first_is_key_frame = 0; if (rc.frames_to_key == 0) { rc.frames_to_key = vp9_get_frames_to_next_key( - oxcf, frame_info, first_pass_info, show_idx, rc.min_gf_interval); + oxcf, frame_info, twopass, show_idx, rc.min_gf_interval); rc.frames_since_key = 0; first_is_key_frame = 1; } @@ -3951,8 +3964,7 @@ int vp9_get_coding_frame_num(const VP9EncoderConfig *oxcf, void vp9_get_key_frame_map(const VP9EncoderConfig *oxcf, const FRAME_INFO *frame_info, - const FIRST_PASS_INFO *first_pass_info, - int *key_frame_map) { + const TWO_PASS *const twopass, int *key_frame_map) { int show_idx = 0; RATE_CONTROL rc; vp9_rc_init(oxcf, 1, &rc); @@ -3966,7 +3978,7 @@ void vp9_get_key_frame_map(const VP9EncoderConfig *oxcf, int key_frame_group_size; key_frame_map[show_idx] = 1; key_frame_group_size = vp9_get_frames_to_next_key( - oxcf, frame_info, first_pass_info, show_idx, rc.min_gf_interval); + oxcf, frame_info, twopass, show_idx, rc.min_gf_interval); assert(key_frame_group_size > 0); show_idx += key_frame_group_size; } diff --git a/vp9/encoder/vp9_firstpass.h b/vp9/encoder/vp9_firstpass.h index b1047eab22..6a347c8b14 100644 --- a/vp9/encoder/vp9_firstpass.h +++ b/vp9/encoder/vp9_firstpass.h @@ -221,6 +221,19 @@ typedef struct { int last_qindex_of_arf_layer[MAX_ARF_LAYERS]; GF_GROUP gf_group; + + // Vizeir project experimental two pass rate control parameters. + double active_wq_factor; + double base_err_per_mb; + double sr_default_decay_limit; + double sr_diff_part; + double kf_err_per_mb; + double kf_frame_max_boost_first; // Max for first kf in a chunk. + double kf_frame_max_boost_subs; // Max for subsequent mid chunk kfs. + int kf_max_total_boost; + int gf_max_total_boost; + double gf_frame_max_boost; + double zm_power_factor; } TWO_PASS; struct VP9_COMP; @@ -249,8 +262,8 @@ void calculate_coded_size(struct VP9_COMP *cpi, int *scaled_frame_width, struct VP9EncoderConfig; int vp9_get_frames_to_next_key(const struct VP9EncoderConfig *oxcf, const FRAME_INFO *frame_info, - const FIRST_PASS_INFO *first_pass_info, - int kf_show_idx, int min_gf_interval); + const TWO_PASS *const twopass, int kf_show_idx, + int min_gf_interval); #if CONFIG_RATE_CTRL /* Call this function to get info about the next group of pictures. * This function should be called after vp9_create_compressor() when encoding diff --git a/vp9/encoder/vp9_ratectrl.h b/vp9/encoder/vp9_ratectrl.h index 7437d309e0..0120f90a01 100644 --- a/vp9/encoder/vp9_ratectrl.h +++ b/vp9/encoder/vp9_ratectrl.h @@ -204,19 +204,6 @@ typedef struct { int preserve_arf_as_gld; int preserve_next_arf_as_gld; int show_arf_as_gld; - - // Vizeir project experimental rate control parameters. - double active_wq_factor; - double base_err_per_mb; - double sr_default_decay_limit; - double sr_diff_part; - double kf_frame_max_boost_first; // Max for first kf in a chunk. - double kf_frame_max_boost_subs; // Max for subsequent mid chunk kfs. - double kf_max_total_boost; - double kf_err_per_mb; - double gf_frame_max_boost; - double gf_max_total_boost; - double zm_power_factor; } RATE_CONTROL; struct VP9_COMP; From 36013909a5ac97bf6f08e28e9471261ff8e133d3 Mon Sep 17 00:00:00 2001 From: Cheng Chen Date: Mon, 8 Mar 2021 12:25:31 -0800 Subject: [PATCH 0129/1000] L2E: let vp9 encoder respect external max frame size constraint Change-Id: Ib926e694d4bc4675af1435a32f6316a587756380 --- vp9/encoder/vp9_encoder.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index ecd15cb011..34646465a6 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -4398,6 +4398,7 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest int frame_over_shoot_limit; int frame_under_shoot_limit; int q = 0, q_low = 0, q_high = 0; + int last_q_attempt = 0; int enable_acl; #ifdef AGGRESSIVE_VBR int qrange_adj = 1; @@ -4413,6 +4414,7 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest // passed in by the external rate control model. // case: -1, we take VP9's decision for the max frame size. int ext_rc_max_frame_size = 0; + const int orig_rc_max_frame_bandwidth = rc->max_frame_bandwidth; #if CONFIG_RATE_CTRL const FRAME_UPDATE_TYPE update_type = @@ -4580,6 +4582,7 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest } if (cpi->ext_ratectrl.ready) { + last_q_attempt = q; // In general, for the external rate control, we take the qindex provided // as input and encode the frame with this qindex faithfully. However, // in some extreme scenarios, the provided qindex leads to a massive @@ -4597,6 +4600,10 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest break; } } + rc->max_frame_bandwidth = ext_rc_max_frame_size; + // If the current frame size exceeds the ext_rc_max_frame_size, + // we adjust the worst qindex to meet the frame size constraint. + q_high = 255; ext_rc_recode = 1; } #if CONFIG_RATE_CTRL @@ -4796,6 +4803,23 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest rc->projected_frame_size < rc->max_frame_bandwidth) loop = 0; + // Special handling of external max frame size constraint + if (ext_rc_recode) { + // If the largest q is not able to meet the max frame size limit, + // do nothing. + if (rc->projected_frame_size > ext_rc_max_frame_size && + last_q_attempt == 255) { + break; + } + // If VP9's q selection leads to a smaller q, we force it to use + // a larger q to better approximate the external max frame size + // constraint. + if (rc->projected_frame_size > ext_rc_max_frame_size && + q <= last_q_attempt) { + q = VPXMIN(255, last_q_attempt + 1); + } + } + if (loop) { ++loop_count; ++loop_at_this_size; @@ -4809,6 +4833,8 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest if (loop) restore_coding_context(cpi); } while (loop); + rc->max_frame_bandwidth = orig_rc_max_frame_bandwidth; + #ifdef AGGRESSIVE_VBR if (two_pass_first_group_inter(cpi)) { cpi->twopass.active_worst_quality = From cc3444f01c448f1cf6acdd283d65e7ec5d0a9fdd Mon Sep 17 00:00:00 2001 From: Paul Wilkins Date: Tue, 9 Mar 2021 14:07:48 +0000 Subject: [PATCH 0130/1000] Vizier: Add defaults for > 1080P Previous code did not have sensible defaults for larger image formats. Added defaults for Vizier RD parameters for sizes > 1080P and changed the first pass parameters for large formats to use the 1080P values. No supplied value for rd_mult_q_sq_key_high_qp case yet so set to old hard wired default value. If the Vizier parameters were enabled the lack of sensible defaults caused a large regression for 2K clips in one of our test sets. Change-Id: I306c0cd76eab00d50880c91fadb5842faf6661ff --- vp9/encoder/vp9_firstpass.c | 14 +------------- vp9/encoder/vp9_rd.c | 28 ++++++++++++++-------------- 2 files changed, 15 insertions(+), 27 deletions(-) diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index 6d6aa5087f..7c67efe4b7 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -3567,7 +3567,7 @@ static void init_vizier_params(TWO_PASS *const twopass, int screen_area) { twopass->kf_frame_max_boost_subs = twopass->kf_frame_max_boost_first; twopass->kf_max_total_boost = MAX_KF_TOT_BOOST; twopass->zm_power_factor = 2.568627575572356; - } else if (screen_area <= 1920 * 1080) { + } else { twopass->active_wq_factor = 31.0; twopass->base_err_per_mb = 34474.723463367416; twopass->sr_default_decay_limit = 0.23346886902707745; @@ -3579,18 +3579,6 @@ static void init_vizier_params(TWO_PASS *const twopass, int screen_area) { twopass->kf_frame_max_boost_subs = twopass->kf_frame_max_boost_first; twopass->kf_max_total_boost = MAX_KF_TOT_BOOST; twopass->zm_power_factor = 5.5776463538431935; - } else { - twopass->active_wq_factor = AV_WQ_FACTOR; - twopass->base_err_per_mb = BASELINE_ERR_PER_MB; - twopass->sr_default_decay_limit = DEFAULT_DECAY_LIMIT; - twopass->sr_diff_part = SR_DIFF_PART; - twopass->gf_frame_max_boost = GF_MAX_FRAME_BOOST; - twopass->gf_max_total_boost = MAX_GF_BOOST; - twopass->kf_err_per_mb = 250.0; - twopass->kf_frame_max_boost_first = KF_MAX_FRAME_BOOST; - twopass->kf_frame_max_boost_subs = twopass->kf_frame_max_boost_first; - twopass->kf_max_total_boost = MAX_KF_TOT_BOOST; - twopass->zm_power_factor = ZM_POWER_FACTOR; } } } diff --git a/vp9/encoder/vp9_rd.c b/vp9/encoder/vp9_rd.c index b126d8708d..3b2e0b088b 100644 --- a/vp9/encoder/vp9_rd.c +++ b/vp9/encoder/vp9_rd.c @@ -206,6 +206,7 @@ void vp9_init_rd_parameters(VP9_COMP *cpi) { // Make sure this function is floating point safe. vpx_clear_system_state(); + rdc->rd_mult_q_sq_key_high_qp = 7.5; // No defined Vizer values yet if (1) { // Non/pre-Vizer defaults rdc->rd_mult_q_sq_inter_low_qp = 4.0; @@ -214,49 +215,48 @@ void vp9_init_rd_parameters(VP9_COMP *cpi) { rdc->rd_mult_q_sq_key_ultralow_qp = 4.0; rdc->rd_mult_q_sq_key_low_qp = 3.5; rdc->rd_mult_q_sq_key_mid_qp = 4.5; - rdc->rd_mult_q_sq_key_high_qp = 7.5; } else if (screen_area <= 176 * 144) { - rdc->rd_mult_q_sq_inter_high_qp = 4.295745965132044; rdc->rd_mult_q_sq_inter_low_qp = 4.0718581295922025; rdc->rd_mult_q_sq_inter_mid_qp = 4.031435609256739; + rdc->rd_mult_q_sq_inter_high_qp = 4.295745965132044; + rdc->rd_mult_q_sq_key_ultralow_qp = 4.290774097327333; rdc->rd_mult_q_sq_key_low_qp = 5.7037775720838155; rdc->rd_mult_q_sq_key_mid_qp = 4.72424015517201; - rdc->rd_mult_q_sq_key_ultralow_qp = 4.290774097327333; } else if (screen_area <= 320 * 240) { - rdc->rd_mult_q_sq_inter_high_qp = 4.388244213131458; rdc->rd_mult_q_sq_inter_low_qp = 4.506676356706102; rdc->rd_mult_q_sq_inter_mid_qp = 4.489349899621181; + rdc->rd_mult_q_sq_inter_high_qp = 4.388244213131458; + rdc->rd_mult_q_sq_key_ultralow_qp = 4.217074424696166; rdc->rd_mult_q_sq_key_low_qp = 4.497000582319771; rdc->rd_mult_q_sq_key_mid_qp = 4.2825894884789735; - rdc->rd_mult_q_sq_key_ultralow_qp = 4.217074424696166; } else if (screen_area <= 640 * 360) { - rdc->rd_mult_q_sq_inter_high_qp = 4.3702861603380025; rdc->rd_mult_q_sq_inter_low_qp = 4.730644123689013; rdc->rd_mult_q_sq_inter_mid_qp = 4.314589509578551; + rdc->rd_mult_q_sq_inter_high_qp = 4.3702861603380025; + rdc->rd_mult_q_sq_key_ultralow_qp = 4.576902541873747; rdc->rd_mult_q_sq_key_low_qp = 6.068652999601526; rdc->rd_mult_q_sq_key_mid_qp = 4.817707474077241; - rdc->rd_mult_q_sq_key_ultralow_qp = 4.576902541873747; } else if (screen_area <= 854 * 480) { - rdc->rd_mult_q_sq_inter_high_qp = 3.969083125219539; rdc->rd_mult_q_sq_inter_low_qp = 4.811470143416073; rdc->rd_mult_q_sq_inter_mid_qp = 4.621618127750201; + rdc->rd_mult_q_sq_inter_high_qp = 3.969083125219539; + rdc->rd_mult_q_sq_key_ultralow_qp = 4.9854544277222566; rdc->rd_mult_q_sq_key_low_qp = 5.073157238799473; rdc->rd_mult_q_sq_key_mid_qp = 5.7587672849242635; - rdc->rd_mult_q_sq_key_ultralow_qp = 4.9854544277222566; } else if (screen_area <= 1280 * 720) { - rdc->rd_mult_q_sq_inter_high_qp = 4.410712348825541; rdc->rd_mult_q_sq_inter_low_qp = 5.119381136011107; rdc->rd_mult_q_sq_inter_mid_qp = 4.518613675766538; + rdc->rd_mult_q_sq_inter_high_qp = 4.410712348825541; + rdc->rd_mult_q_sq_key_ultralow_qp = 3.9468491666607326; rdc->rd_mult_q_sq_key_low_qp = 5.848703119971484; rdc->rd_mult_q_sq_key_mid_qp = 5.368947246228739; - rdc->rd_mult_q_sq_key_ultralow_qp = 3.9468491666607326; - } else if (screen_area <= 1920 * 1080) { - rdc->rd_mult_q_sq_inter_high_qp = 3.2141187537667797; + } else { rdc->rd_mult_q_sq_inter_low_qp = 6.00569815296199; rdc->rd_mult_q_sq_inter_mid_qp = 3.932565684947023; + rdc->rd_mult_q_sq_inter_high_qp = 3.2141187537667797; + rdc->rd_mult_q_sq_key_ultralow_qp = 4.399795006320089; rdc->rd_mult_q_sq_key_low_qp = 10.582906599488298; rdc->rd_mult_q_sq_key_mid_qp = 6.274162346360692; - rdc->rd_mult_q_sq_key_ultralow_qp = 4.399795006320089; } } From 8851ed5787b6e30b1f212b171da6cbaa9778a799 Mon Sep 17 00:00:00 2001 From: Paul Wilkins Date: Tue, 9 Mar 2021 14:47:25 +0000 Subject: [PATCH 0131/1000] Vizier: Add in field for min kf frame boost. Added kf_frame_min_boost field to hold the minimum per frame boost in key frame boost calculations. Replaces hard wired value. To be used in conjunction with and tied to the maximum value. Change-Id: I67a39ecb3f21b5918512a5ccd9a1b214d7971e45 --- vp9/encoder/vp9_firstpass.c | 12 +++++++++++- vp9/encoder/vp9_firstpass.h | 1 + 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index 7c67efe4b7..d1044d7815 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -65,9 +65,11 @@ #define GF_MAX_FRAME_BOOST 96.0 #ifdef AGGRESSIVE_VBR +#define KF_MIN_FRAME_BOOST 40.0 #define KF_MAX_FRAME_BOOST 80.0 #define MAX_KF_TOT_BOOST 4800 #else +#define KF_MIN_FRAME_BOOST 40.0 #define KF_MAX_FRAME_BOOST 96.0 #define MAX_KF_TOT_BOOST 5400 #endif @@ -2017,7 +2019,8 @@ static double calc_kf_frame_boost(VP9_COMP *cpi, // The 40.0 value here is an experimentally derived baseline minimum. // This value is in line with the minimum per frame boost in the alt_ref // boost calculation. - frame_boost = ((frame_boost + 40.0) * boost_q_correction); + frame_boost = + ((frame_boost + twopass->kf_frame_min_boost) * boost_q_correction); // Maximum allowed boost this frame. May be different for first vs subsequent // key frames. @@ -3500,6 +3503,7 @@ static void init_vizier_params(TWO_PASS *const twopass, int screen_area) { } else { twopass->kf_err_per_mb = 250.0; } + twopass->kf_frame_min_boost = KF_MIN_FRAME_BOOST; twopass->kf_frame_max_boost_first = KF_MAX_FRAME_BOOST; twopass->kf_frame_max_boost_subs = twopass->kf_frame_max_boost_first; twopass->kf_max_total_boost = MAX_KF_TOT_BOOST; @@ -3515,6 +3519,7 @@ static void init_vizier_params(TWO_PASS *const twopass, int screen_area) { twopass->gf_frame_max_boost = 87.27362648627846; twopass->gf_max_total_boost = MAX_GF_BOOST; twopass->kf_err_per_mb = 1854.8255436877148; + twopass->kf_frame_min_boost = KF_MIN_FRAME_BOOST; twopass->kf_frame_max_boost_first = KF_MAX_FRAME_BOOST; twopass->kf_frame_max_boost_subs = twopass->kf_frame_max_boost_first; twopass->kf_max_total_boost = MAX_KF_TOT_BOOST; @@ -3527,6 +3532,7 @@ static void init_vizier_params(TWO_PASS *const twopass, int screen_area) { twopass->gf_frame_max_boost = 127.34978204980285; twopass->gf_max_total_boost = MAX_GF_BOOST; twopass->kf_err_per_mb = 723.8337508755031; + twopass->kf_frame_min_boost = KF_MIN_FRAME_BOOST; twopass->kf_frame_max_boost_first = KF_MAX_FRAME_BOOST; twopass->kf_frame_max_boost_subs = twopass->kf_frame_max_boost_first; twopass->kf_max_total_boost = MAX_KF_TOT_BOOST; @@ -3539,6 +3545,7 @@ static void init_vizier_params(TWO_PASS *const twopass, int screen_area) { twopass->gf_frame_max_boost = 75.17672317013668; twopass->gf_max_total_boost = MAX_GF_BOOST; twopass->kf_err_per_mb = 422.2871502380377; + twopass->kf_frame_min_boost = KF_MIN_FRAME_BOOST; twopass->kf_frame_max_boost_first = KF_MAX_FRAME_BOOST; twopass->kf_frame_max_boost_subs = twopass->kf_frame_max_boost_first; twopass->kf_max_total_boost = MAX_KF_TOT_BOOST; @@ -3551,6 +3558,7 @@ static void init_vizier_params(TWO_PASS *const twopass, int screen_area) { twopass->gf_frame_max_boost = 85.2868528581522; twopass->gf_max_total_boost = MAX_GF_BOOST; twopass->kf_err_per_mb = 1513.4883914008383; + twopass->kf_frame_min_boost = KF_MIN_FRAME_BOOST; twopass->kf_frame_max_boost_first = KF_MAX_FRAME_BOOST; twopass->kf_frame_max_boost_subs = twopass->kf_frame_max_boost_first; twopass->kf_max_total_boost = MAX_KF_TOT_BOOST; @@ -3563,6 +3571,7 @@ static void init_vizier_params(TWO_PASS *const twopass, int screen_area) { twopass->gf_frame_max_boost = GF_MAX_FRAME_BOOST; twopass->gf_max_total_boost = MAX_GF_BOOST; twopass->kf_err_per_mb = 998.6342911785146; + twopass->kf_frame_min_boost = KF_MIN_FRAME_BOOST; twopass->kf_frame_max_boost_first = KF_MAX_FRAME_BOOST; twopass->kf_frame_max_boost_subs = twopass->kf_frame_max_boost_first; twopass->kf_max_total_boost = MAX_KF_TOT_BOOST; @@ -3575,6 +3584,7 @@ static void init_vizier_params(TWO_PASS *const twopass, int screen_area) { twopass->gf_frame_max_boost = 81.00472969483079; twopass->gf_max_total_boost = MAX_GF_BOOST; twopass->kf_err_per_mb = 35931.25734431429; + twopass->kf_frame_min_boost = KF_MIN_FRAME_BOOST; twopass->kf_frame_max_boost_first = KF_MAX_FRAME_BOOST; twopass->kf_frame_max_boost_subs = twopass->kf_frame_max_boost_first; twopass->kf_max_total_boost = MAX_KF_TOT_BOOST; diff --git a/vp9/encoder/vp9_firstpass.h b/vp9/encoder/vp9_firstpass.h index 6a347c8b14..9613f57fb9 100644 --- a/vp9/encoder/vp9_firstpass.h +++ b/vp9/encoder/vp9_firstpass.h @@ -228,6 +228,7 @@ typedef struct { double sr_default_decay_limit; double sr_diff_part; double kf_err_per_mb; + double kf_frame_min_boost; double kf_frame_max_boost_first; // Max for first kf in a chunk. double kf_frame_max_boost_subs; // Max for subsequent mid chunk kfs. int kf_max_total_boost; From cbc4ead58691c899262513025a83f03f5932e50e Mon Sep 17 00:00:00 2001 From: Paul Wilkins Date: Tue, 9 Mar 2021 15:11:41 +0000 Subject: [PATCH 0132/1000] Vizer: Added in experimental max KF boost values. Added the experimental max per frame KF boost values derived from the Vizier experiments. These are still all off by default. When enabled I expect these to cause significant regression as they fluctuate wildly and in a way that makes no sense from format to format. I suspect these values reflect over fitting perhaps from a subset of training clips with more frequent mid chunk key frames and or short key frame groups. Also fixed incorrect value for gf boost for one format. Experiment to moderate these values and use different values for first and subsequent KF groups to follow. Change-Id: Ibeb4268957f2edacdb4549d74930255a22a2fcc5 --- vp9/encoder/vp9_firstpass.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index d1044d7815..8c771c7ffc 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -3520,7 +3520,7 @@ static void init_vizier_params(TWO_PASS *const twopass, int screen_area) { twopass->gf_max_total_boost = MAX_GF_BOOST; twopass->kf_err_per_mb = 1854.8255436877148; twopass->kf_frame_min_boost = KF_MIN_FRAME_BOOST; - twopass->kf_frame_max_boost_first = KF_MAX_FRAME_BOOST; + twopass->kf_frame_max_boost_first = 25.5; twopass->kf_frame_max_boost_subs = twopass->kf_frame_max_boost_first; twopass->kf_max_total_boost = MAX_KF_TOT_BOOST; twopass->zm_power_factor = 2.93715229184991; @@ -3533,7 +3533,7 @@ static void init_vizier_params(TWO_PASS *const twopass, int screen_area) { twopass->gf_max_total_boost = MAX_GF_BOOST; twopass->kf_err_per_mb = 723.8337508755031; twopass->kf_frame_min_boost = KF_MIN_FRAME_BOOST; - twopass->kf_frame_max_boost_first = KF_MAX_FRAME_BOOST; + twopass->kf_frame_max_boost_first = 185.0; twopass->kf_frame_max_boost_subs = twopass->kf_frame_max_boost_first; twopass->kf_max_total_boost = MAX_KF_TOT_BOOST; twopass->zm_power_factor = 3.5299221493593413; @@ -3546,7 +3546,7 @@ static void init_vizier_params(TWO_PASS *const twopass, int screen_area) { twopass->gf_max_total_boost = MAX_GF_BOOST; twopass->kf_err_per_mb = 422.2871502380377; twopass->kf_frame_min_boost = KF_MIN_FRAME_BOOST; - twopass->kf_frame_max_boost_first = KF_MAX_FRAME_BOOST; + twopass->kf_frame_max_boost_first = 224.5; twopass->kf_frame_max_boost_subs = twopass->kf_frame_max_boost_first; twopass->kf_max_total_boost = MAX_KF_TOT_BOOST; twopass->zm_power_factor = 2.265742666649307; @@ -3559,7 +3559,7 @@ static void init_vizier_params(TWO_PASS *const twopass, int screen_area) { twopass->gf_max_total_boost = MAX_GF_BOOST; twopass->kf_err_per_mb = 1513.4883914008383; twopass->kf_frame_min_boost = KF_MIN_FRAME_BOOST; - twopass->kf_frame_max_boost_first = KF_MAX_FRAME_BOOST; + twopass->kf_frame_max_boost_first = 28.0; twopass->kf_frame_max_boost_subs = twopass->kf_frame_max_boost_first; twopass->kf_max_total_boost = MAX_KF_TOT_BOOST; twopass->zm_power_factor = 3.552278528517416; @@ -3568,11 +3568,11 @@ static void init_vizier_params(TWO_PASS *const twopass, int screen_area) { twopass->base_err_per_mb = 29527.46375825401; twopass->sr_default_decay_limit = 0.5009117586299728; twopass->sr_diff_part = 0.005007364627260114; - twopass->gf_frame_max_boost = GF_MAX_FRAME_BOOST; + twopass->gf_frame_max_boost = 81.00472969483079; twopass->gf_max_total_boost = MAX_GF_BOOST; twopass->kf_err_per_mb = 998.6342911785146; twopass->kf_frame_min_boost = KF_MIN_FRAME_BOOST; - twopass->kf_frame_max_boost_first = KF_MAX_FRAME_BOOST; + twopass->kf_frame_max_boost_first = 53.0; twopass->kf_frame_max_boost_subs = twopass->kf_frame_max_boost_first; twopass->kf_max_total_boost = MAX_KF_TOT_BOOST; twopass->zm_power_factor = 2.568627575572356; @@ -3581,11 +3581,11 @@ static void init_vizier_params(TWO_PASS *const twopass, int screen_area) { twopass->base_err_per_mb = 34474.723463367416; twopass->sr_default_decay_limit = 0.23346886902707745; twopass->sr_diff_part = 0.011431716637966029; - twopass->gf_frame_max_boost = 81.00472969483079; + twopass->gf_frame_max_boost = 213.2940230360479; twopass->gf_max_total_boost = MAX_GF_BOOST; twopass->kf_err_per_mb = 35931.25734431429; twopass->kf_frame_min_boost = KF_MIN_FRAME_BOOST; - twopass->kf_frame_max_boost_first = KF_MAX_FRAME_BOOST; + twopass->kf_frame_max_boost_first = 419.5; twopass->kf_frame_max_boost_subs = twopass->kf_frame_max_boost_first; twopass->kf_max_total_boost = MAX_KF_TOT_BOOST; twopass->zm_power_factor = 5.5776463538431935; From 24b43c4ea5c2b54e32b107921cb06e89a71f916e Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Mon, 8 Mar 2021 16:07:02 -0800 Subject: [PATCH 0133/1000] Prepare for v1.10.0 release. Update CHANGELOG, AUTHORS, README, libs.mk Bug: webm:1712 Change-Id: Ic99de12b91a92c32f8a9485dcb759c48bc3eccd6 --- .mailmap | 2 ++ AUTHORS | 5 +++++ CHANGELOG | 30 ++++++++++++++++++++++++++++++ README | 17 +++++++++-------- libs.mk | 14 +++++++++++++- 5 files changed, 59 insertions(+), 9 deletions(-) diff --git a/.mailmap b/.mailmap index 8d97500994..376ca83ae3 100644 --- a/.mailmap +++ b/.mailmap @@ -12,6 +12,8 @@ Deb Mukherjee Elliott Karpilovsky Erik Niemeyer Fyodor Kyslov +Gregor Jasny +Gregor Jasny Guillaume Martres Hangyu Kuang Hui Su diff --git a/AUTHORS b/AUTHORS index 352c91feda..e804842f78 100644 --- a/AUTHORS +++ b/AUTHORS @@ -37,6 +37,7 @@ Christian Duvivier Clement Courbet Daniele Castagna Daniel Kang +Daniel Sommermann Dan Zhu Deb Mukherjee Deepa K G @@ -73,6 +74,7 @@ Ivan Maltz Jacek Caban Jacky Chen James Berry +James Touton James Yu James Zern Jan Gerber @@ -82,11 +84,14 @@ Jean-Yves Avenard Jeff Faust Jeff Muizelaar Jeff Petkau +Jeremy Leconte Jerome Jiang Jia Jia Jian Zhou Jim Bankoski +jinbo Jingning Han +Joel Fernandes Joey Parrish Johann Koenig John Koleszar diff --git a/CHANGELOG b/CHANGELOG index e731fc6121..6338caa380 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,3 +1,33 @@ +2021-03-09 v1.10.0 "Ruddy Duck" + This maintenance release adds support for darwin20 and new codec controls, as + well as numerous bug fixes. + + - Upgrading: + New codec control is added to disable loopfilter for VP9. + + New encoder control is added to disable feature to increase Q on overshoot + detection for CBR. + + Configure support for darwin20 is added. + + New codec control is added for VP9 rate control. The control ID of this + interface is VP9E_SET_EXTERNAL_RATE_CONTROL. To make VP9 use a customized + external rate control model, users will have to implement each callback + function in vpx_rc_funcs_t and register them using libvpx API + vpx_codec_control_() with the control ID. + + - Enhancement: + Use -std=gnu++11 instead of -std=c++11 for c++ files. + + - Bug fixes: + Override assembler with --as option of configure for MSVS. + Fix several compilation issues with gcc 4.8.5. + Fix to resetting rate control for temporal layers. + Fix to the rate control stats of SVC example encoder when number of spatial + layers is 1. + Fix to reusing motion vectors from the base spatial layer in SVC. + 2 pass related flags removed from SVC example encoder. + 2020-07-29 v1.9.0 "Quacking Duck" This release adds support for NV12, a separate library for rate control, as well as incremental improvements. diff --git a/README b/README index 62fef34593..ddbcb9f695 100644 --- a/README +++ b/README @@ -1,4 +1,4 @@ -README - 20 July 2020 +README - 08 March 2021 Welcome to the WebM VP8/VP9 Codec SDK! @@ -10,14 +10,14 @@ COMPILING THE APPLICATIONS/LIBRARIES: 1. Prerequisites * All x86 targets require the Yasm[1] assembler be installed[2]. - * All Windows builds require that Cygwin[3] be installed. - * Building the documentation requires Doxygen[4]. If you do not + * All Windows builds require that Cygwin[3] or MSYS2[4] be installed. + * Building the documentation requires Doxygen[5]. If you do not have this package, the install-docs option will be disabled. - * Downloading the data for the unit tests requires curl[5] and sha1sum. + * Downloading the data for the unit tests requires curl[6] and sha1sum. sha1sum is provided via the GNU coreutils, installed by default on many *nix platforms, as well as MinGW and Cygwin. If coreutils is not available, a compatible version of sha1sum can be built from - source[6]. These requirements are optional if not running the unit + source[7]. These requirements are optional if not running the unit tests. [1]: http://www.tortall.net/projects/yasm @@ -26,9 +26,10 @@ COMPILING THE APPLICATIONS/LIBRARIES: yasm--.exe to yasm.exe and place it in: Program Files (x86)/Microsoft Visual Studio/2017//Common7/Tools/ [3]: http://www.cygwin.com - [4]: http://www.doxygen.org - [5]: http://curl.haxx.se - [6]: http://www.microbrew.org/tools/md5sha1sum/ + [4]: http://www.msys2.org/ + [5]: http://www.doxygen.org + [6]: http://curl.haxx.se + [7]: http://www.microbrew.org/tools/md5sha1sum/ 2. Out-of-tree builds Out of tree builds are a supported method of building the application. For diff --git a/libs.mk b/libs.mk index cabd4ed141..d05eee966d 100644 --- a/libs.mk +++ b/libs.mk @@ -287,8 +287,20 @@ OBJS-yes += $(LIBVPX_OBJS) LIBS-$(if yes,$(CONFIG_STATIC)) += $(BUILD_PFX)libvpx.a $(BUILD_PFX)libvpx_g.a $(BUILD_PFX)libvpx_g.a: $(LIBVPX_OBJS) +# Updating version info. +# https://www.gnu.org/software/libtool/manual/libtool.html#Updating-version-info +# For libtool: c=, a=, r= +# libtool generates .so file as .so.[c-a].a.r, while -version-info c:r:a is +# passed to libtool. +# +# libvpx library file is generated as libvpx.so... +# MAJOR = c-a, MINOR = a, PATCH = r +# +# To determine SO_VERSION_{MAJOR,MINOR,PATCH}, calculate c,a,r with current +# SO_VERSION_* then follow the rules in the link to detemine the new version +# (c1, a1, r1) and set MAJOR to [c1-a1], MINOR to a1 and PATCH to r1 SO_VERSION_MAJOR := 6 -SO_VERSION_MINOR := 3 +SO_VERSION_MINOR := 4 SO_VERSION_PATCH := 0 ifeq ($(filter darwin%,$(TGT_OS)),$(TGT_OS)) LIBVPX_SO := libvpx.$(SO_VERSION_MAJOR).dylib From 973726c38bb0fd64fb14b1c95caabe52bd39d17d Mon Sep 17 00:00:00 2001 From: Marco Paniconi Date: Fri, 12 Mar 2021 09:54:03 -0800 Subject: [PATCH 0134/1000] vp9-rtc: Add postencode_drop control to sample encoder Change-Id: I1c989f26b0a7b9239adf37df8d96776f33b89a8b --- examples/vpx_temporal_svc_encoder.c | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/vpx_temporal_svc_encoder.c b/examples/vpx_temporal_svc_encoder.c index 04212e5d7d..ad3e79c713 100644 --- a/examples/vpx_temporal_svc_encoder.c +++ b/examples/vpx_temporal_svc_encoder.c @@ -831,6 +831,7 @@ int main(int argc, char **argv) { } else if (strncmp(encoder->name, "vp9", 3) == 0) { vpx_svc_extra_cfg_t svc_params; memset(&svc_params, 0, sizeof(svc_params)); + vpx_codec_control(&codec, VP9E_SET_POSTENCODE_DROP, 0); vpx_codec_control(&codec, VP9E_SET_DISABLE_OVERSHOOT_MAXQ_CBR, 0); vpx_codec_control(&codec, VP8E_SET_CPUUSED, speed); vpx_codec_control(&codec, VP9E_SET_AQ_MODE, 3); From 04086a30664d2a3e89d6a6e4e1c18f1a82c8f958 Mon Sep 17 00:00:00 2001 From: "Adam B. Goode" Date: Wed, 17 Mar 2021 14:11:57 -0500 Subject: [PATCH 0135/1000] Msvc builds convert to windows path w/msys env Bug: webm:1720 Change-Id: I56689ad408f8086c511e1711dfa9c8d404727b2e --- build/make/msvs_common.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/build/make/msvs_common.sh b/build/make/msvs_common.sh index 27ddf7fd91..3989fec0d5 100644 --- a/build/make/msvs_common.sh +++ b/build/make/msvs_common.sh @@ -9,7 +9,8 @@ ## be found in the AUTHORS file in the root of the source tree. ## -if [ "$(uname -o 2>/dev/null)" = "Cygwin" ] \ +shell_name="$(uname -o 2>/dev/null)" +if [[ "$shell_name" = "Cygwin" || "$shell_name" = "Msys" ]] \ && cygpath --help >/dev/null 2>&1; then FIXPATH='cygpath -m' else From b5e754a840511c9956c033561955745a184495cb Mon Sep 17 00:00:00 2001 From: Paul Wilkins Date: Wed, 10 Mar 2021 14:39:43 +0000 Subject: [PATCH 0136/1000] Change SR_diff calculation and representation This patch changes the way prediction decay is calculated. We expect that frames that are further from an ALT-REF frame (or Golden Frame) will be less well predicted by that ALT-REF frame. As such it is desirable that they should contribute less to the boost calculation used to assign bits to the ALT_REF. This code looks at the reduction in prediction quality between the last frame and the second reference frame (usually two frames old). We make the assumption that we can accumulate this to get a proxy for the likely loss of prediction quality over multiple frames. Previously the calculation looked at the absolute difference in the coded errors. The issue here is that the meaning of a unit difference is not the same for very complex frames as it is for easy frames. In this patch we scale the decay value based on how the error difference compares to the overall frame complexity as represented by the intra coding error. This was tuned experimentally to give test results that were approximately neutral for our various test sets. There was a slight drop in Overall PSNR but a consistent improvement in SSIM. This balance may be improved with tuning further as it is noteworthy that it was much better on the hd_res set. Results (Overall PSNR, SSIM -ve better) for low_res, ugc360, midres2, ugc480P and hd_res are as follows: 0.173 -0.688 0.118 -0.153 0.132 -0.239 0.261 -0.405 -0.305 -1.109 As part of this adjustment the contribution of motion amplitude was removed. This patch also changes the control mechanism that will be exposed on the command line for use by the Vizier project. The control is now a linear factor which defaults to 1.0, where values < 1.0 mean a lower decay rate and values > 1.0 mean an increased decay rate. This presents a more easily understandable interface for use in optimizing the decay behavior for various formats, where it is clear what a passed in value means relative to the default. With the new decay mechanism the current values for various formats are almost certainly wrong and we still need to define sensible upper and lower bounds for use during future training. Change-Id: Ib1074bbea97c725cdbf25772ee8ed66831461ce3 --- vp9/encoder/vp9_firstpass.c | 90 ++++++++++++++++------------------- vp9/encoder/vp9_firstpass.h | 4 +- vp9/encoder/vp9_rd.c | 94 +++++++++++++++++++------------------ 3 files changed, 91 insertions(+), 97 deletions(-) diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index 8c771c7ffc..a43099e946 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -58,7 +58,6 @@ #define INTRA_PART 0.005 #define DEFAULT_DECAY_LIMIT 0.75 #define LOW_SR_DIFF_TRHESH 0.1 -#define SR_DIFF_MAX 128.0 #define LOW_CODED_ERR_PER_MB 10.0 #define NCOUNT_FRAME_II_THRESH 6.0 #define BASELINE_ERR_PER_MB 12500.0 @@ -1833,17 +1832,21 @@ void vp9_init_second_pass(VP9_COMP *cpi) { twopass->arnr_strength_adjustment = 0; } -static double get_sr_decay_rate(const FRAME_INFO *frame_info, - const TWO_PASS *const twopass, +/* This function considers how the quality of prediction may be deteriorating + * with distance. It comapres the coded error for the last frame and the + * second reference frame (usually two frames old) and also applies a factor + * based on the extent of INTRA coding. + * + * The decay factor is then used to reduce the contribution of frames further + * from the alt-ref or golden frame, to the bitframe boost calculation for that + * alt-ref or golden frame. + */ +static double get_sr_decay_rate(const TWO_PASS *const twopass, const FIRSTPASS_STATS *frame) { double sr_diff = (frame->sr_coded_error - frame->coded_error); double sr_decay = 1.0; double modified_pct_inter; double modified_pcnt_intra; - const double motion_amplitude_part = - frame->pcnt_motion * - ((frame->mvc_abs + frame->mvr_abs) / - (frame_info->frame_height + frame_info->frame_width)); modified_pct_inter = frame->pcnt_inter; if ((frame->coded_error > LOW_CODED_ERR_PER_MB) && @@ -1855,29 +1858,26 @@ static double get_sr_decay_rate(const FRAME_INFO *frame_info, modified_pcnt_intra = 100 * (1.0 - modified_pct_inter); if ((sr_diff > LOW_SR_DIFF_TRHESH)) { - sr_diff = VPXMIN(sr_diff, SR_DIFF_MAX); - sr_decay = 1.0 - (twopass->sr_diff_part * sr_diff) - motion_amplitude_part - - (INTRA_PART * modified_pcnt_intra); + double sr_diff_part = + twopass->sr_diff_factor * ((sr_diff * 0.25) / frame->intra_error); + sr_decay = 1.0 - sr_diff_part - (INTRA_PART * modified_pcnt_intra); } return VPXMAX(sr_decay, twopass->sr_default_decay_limit); } // This function gives an estimate of how badly we believe the prediction // quality is decaying from frame to frame. -static double get_zero_motion_factor(const FRAME_INFO *frame_info, - const TWO_PASS *const twopass, +static double get_zero_motion_factor(const TWO_PASS *const twopass, const FIRSTPASS_STATS *frame_stats) { const double zero_motion_pct = frame_stats->pcnt_inter - frame_stats->pcnt_motion; - double sr_decay = get_sr_decay_rate(frame_info, twopass, frame_stats); + double sr_decay = get_sr_decay_rate(twopass, frame_stats); return VPXMIN(sr_decay, zero_motion_pct); } -static double get_prediction_decay_rate(const FRAME_INFO *frame_info, - const TWO_PASS *const twopass, +static double get_prediction_decay_rate(const TWO_PASS *const twopass, const FIRSTPASS_STATS *frame_stats) { - const double sr_decay_rate = - get_sr_decay_rate(frame_info, twopass, frame_stats); + const double sr_decay_rate = get_sr_decay_rate(twopass, frame_stats); const double zero_motion_factor = (0.95 * pow((frame_stats->pcnt_inter - frame_stats->pcnt_motion), twopass->zm_power_factor)); @@ -2066,8 +2066,7 @@ static int compute_arf_boost(const FRAME_INFO *frame_info, // Accumulate the effect of prediction quality decay. if (!flash_detected) { - decay_accumulator *= - get_prediction_decay_rate(frame_info, twopass, this_frame); + decay_accumulator *= get_prediction_decay_rate(twopass, this_frame); decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR ? MIN_DECAY_FACTOR : decay_accumulator; @@ -2107,8 +2106,7 @@ static int compute_arf_boost(const FRAME_INFO *frame_info, // Cumulative effect of prediction quality decay. if (!flash_detected) { - decay_accumulator *= - get_prediction_decay_rate(frame_info, twopass, this_frame); + decay_accumulator *= get_prediction_decay_rate(twopass, this_frame); decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR ? MIN_DECAY_FACTOR : decay_accumulator; @@ -2606,16 +2604,14 @@ static int get_gop_coding_frame_num( // Monitor for static sections. if ((rc->frames_since_key + gop_coding_frames - 1) > 1) { - zero_motion_accumulator = - VPXMIN(zero_motion_accumulator, - get_zero_motion_factor(frame_info, twopass, next_frame)); + zero_motion_accumulator = VPXMIN( + zero_motion_accumulator, get_zero_motion_factor(twopass, next_frame)); } // Accumulate the effect of prediction quality decay. if (!flash_detected) { double last_loop_decay_rate = loop_decay_rate; - loop_decay_rate = - get_prediction_decay_rate(frame_info, twopass, next_frame); + loop_decay_rate = get_prediction_decay_rate(twopass, next_frame); // Break clause to detect very still sections after motion. For example, // a static image after a fade or other transition. @@ -3181,7 +3177,6 @@ static int test_candidate_kf(const FIRST_PASS_INFO *first_pass_info, #define KF_ABS_ZOOM_THRESH 6.0 int vp9_get_frames_to_next_key(const VP9EncoderConfig *oxcf, - const FRAME_INFO *frame_info, const TWO_PASS *const twopass, int kf_show_idx, int min_gf_interval) { const FIRST_PASS_INFO *first_pass_info = &twopass->first_pass_info; @@ -3211,8 +3206,7 @@ int vp9_get_frames_to_next_key(const VP9EncoderConfig *oxcf, break; // How fast is the prediction quality decaying? - loop_decay_rate = - get_prediction_decay_rate(frame_info, twopass, next_frame); + loop_decay_rate = get_prediction_decay_rate(twopass, next_frame); // We want to know something about the recent past... rather than // as used elsewhere where we are concerned with decay in prediction @@ -3298,8 +3292,8 @@ static void find_next_key_frame(VP9_COMP *cpi, int kf_show_idx) { kf_mod_err = calc_norm_frame_score(oxcf, frame_info, keyframe_stats, mean_mod_score, av_err); - rc->frames_to_key = vp9_get_frames_to_next_key( - oxcf, frame_info, twopass, kf_show_idx, rc->min_gf_interval); + rc->frames_to_key = vp9_get_frames_to_next_key(oxcf, twopass, kf_show_idx, + rc->min_gf_interval); // If there is a max kf interval set by the user we must obey it. // We already breakout of the loop above at 2x max. @@ -3379,9 +3373,9 @@ static void find_next_key_frame(VP9_COMP *cpi, int kf_show_idx) { // Monitor for static sections. // First frame in kf group the second ref indicator is invalid. if (i > 0) { - zero_motion_accumulator = VPXMIN( - zero_motion_accumulator, - get_zero_motion_factor(&cpi->frame_info, twopass, &next_frame)); + zero_motion_accumulator = + VPXMIN(zero_motion_accumulator, + get_zero_motion_factor(twopass, &next_frame)); } else { zero_motion_accumulator = next_frame.pcnt_inter - next_frame.pcnt_motion; @@ -3493,7 +3487,7 @@ static void init_vizier_params(TWO_PASS *const twopass, int screen_area) { twopass->active_wq_factor = AV_WQ_FACTOR; twopass->base_err_per_mb = BASELINE_ERR_PER_MB; twopass->sr_default_decay_limit = DEFAULT_DECAY_LIMIT; - twopass->sr_diff_part = SR_DIFF_PART; + twopass->sr_diff_factor = 1.0; twopass->gf_frame_max_boost = GF_MAX_FRAME_BOOST; twopass->gf_max_total_boost = MAX_GF_BOOST; if (screen_area < 1280 * 720) { @@ -3515,7 +3509,7 @@ static void init_vizier_params(TWO_PASS *const twopass, int screen_area) { twopass->active_wq_factor = 46.0; twopass->base_err_per_mb = 37597.399760969536; twopass->sr_default_decay_limit = 0.3905639800962774; - twopass->sr_diff_part = 0.009599023654146284; + twopass->sr_diff_factor = 6.4; twopass->gf_frame_max_boost = 87.27362648627846; twopass->gf_max_total_boost = MAX_GF_BOOST; twopass->kf_err_per_mb = 1854.8255436877148; @@ -3528,7 +3522,7 @@ static void init_vizier_params(TWO_PASS *const twopass, int screen_area) { twopass->active_wq_factor = 55.0; twopass->base_err_per_mb = 34525.33177195309; twopass->sr_default_decay_limit = 0.23901360046804604; - twopass->sr_diff_part = 0.008581014394766773; + twopass->sr_diff_factor = 5.73; twopass->gf_frame_max_boost = 127.34978204980285; twopass->gf_max_total_boost = MAX_GF_BOOST; twopass->kf_err_per_mb = 723.8337508755031; @@ -3541,7 +3535,7 @@ static void init_vizier_params(TWO_PASS *const twopass, int screen_area) { twopass->active_wq_factor = 12.5; twopass->base_err_per_mb = 18823.978018028298; twopass->sr_default_decay_limit = 0.6043527690301296; - twopass->sr_diff_part = 0.00343296783885544; + twopass->sr_diff_factor = 2.28; twopass->gf_frame_max_boost = 75.17672317013668; twopass->gf_max_total_boost = MAX_GF_BOOST; twopass->kf_err_per_mb = 422.2871502380377; @@ -3554,7 +3548,7 @@ static void init_vizier_params(TWO_PASS *const twopass, int screen_area) { twopass->active_wq_factor = 51.5; twopass->base_err_per_mb = 33718.98307662595; twopass->sr_default_decay_limit = 0.33633414970713393; - twopass->sr_diff_part = 0.00868988716928333; + twopass->sr_diff_factor = 5.8; twopass->gf_frame_max_boost = 85.2868528581522; twopass->gf_max_total_boost = MAX_GF_BOOST; twopass->kf_err_per_mb = 1513.4883914008383; @@ -3567,7 +3561,7 @@ static void init_vizier_params(TWO_PASS *const twopass, int screen_area) { twopass->active_wq_factor = 41.5; twopass->base_err_per_mb = 29527.46375825401; twopass->sr_default_decay_limit = 0.5009117586299728; - twopass->sr_diff_part = 0.005007364627260114; + twopass->sr_diff_factor = 3.33; twopass->gf_frame_max_boost = 81.00472969483079; twopass->gf_max_total_boost = MAX_GF_BOOST; twopass->kf_err_per_mb = 998.6342911785146; @@ -3580,7 +3574,7 @@ static void init_vizier_params(TWO_PASS *const twopass, int screen_area) { twopass->active_wq_factor = 31.0; twopass->base_err_per_mb = 34474.723463367416; twopass->sr_default_decay_limit = 0.23346886902707745; - twopass->sr_diff_part = 0.011431716637966029; + twopass->sr_diff_factor = 7.6; twopass->gf_frame_max_boost = 213.2940230360479; twopass->gf_max_total_boost = MAX_GF_BOOST; twopass->kf_err_per_mb = 35931.25734431429; @@ -3873,9 +3867,8 @@ void vp9_get_next_group_of_picture(const VP9_COMP *cpi, int *first_is_key_frame, *first_is_key_frame = 0; if (rc.frames_to_key == 0) { - rc.frames_to_key = - vp9_get_frames_to_next_key(&cpi->oxcf, &cpi->frame_info, twopass, - *first_show_idx, rc.min_gf_interval); + rc.frames_to_key = vp9_get_frames_to_next_key( + &cpi->oxcf, twopass, *first_show_idx, rc.min_gf_interval); rc.frames_since_key = 0; *first_is_key_frame = 1; } @@ -3939,8 +3932,8 @@ int vp9_get_coding_frame_num(const VP9EncoderConfig *oxcf, int use_alt_ref; int first_is_key_frame = 0; if (rc.frames_to_key == 0) { - rc.frames_to_key = vp9_get_frames_to_next_key( - oxcf, frame_info, twopass, show_idx, rc.min_gf_interval); + rc.frames_to_key = vp9_get_frames_to_next_key(oxcf, twopass, show_idx, + rc.min_gf_interval); rc.frames_since_key = 0; first_is_key_frame = 1; } @@ -3961,7 +3954,6 @@ int vp9_get_coding_frame_num(const VP9EncoderConfig *oxcf, } void vp9_get_key_frame_map(const VP9EncoderConfig *oxcf, - const FRAME_INFO *frame_info, const TWO_PASS *const twopass, int *key_frame_map) { int show_idx = 0; RATE_CONTROL rc; @@ -3975,8 +3967,8 @@ void vp9_get_key_frame_map(const VP9EncoderConfig *oxcf, while (show_idx < first_pass_info->num_frames) { int key_frame_group_size; key_frame_map[show_idx] = 1; - key_frame_group_size = vp9_get_frames_to_next_key( - oxcf, frame_info, twopass, show_idx, rc.min_gf_interval); + key_frame_group_size = + vp9_get_frames_to_next_key(oxcf, twopass, show_idx, rc.min_gf_interval); assert(key_frame_group_size > 0); show_idx += key_frame_group_size; } diff --git a/vp9/encoder/vp9_firstpass.h b/vp9/encoder/vp9_firstpass.h index 9613f57fb9..624fccd428 100644 --- a/vp9/encoder/vp9_firstpass.h +++ b/vp9/encoder/vp9_firstpass.h @@ -226,7 +226,7 @@ typedef struct { double active_wq_factor; double base_err_per_mb; double sr_default_decay_limit; - double sr_diff_part; + double sr_diff_factor; double kf_err_per_mb; double kf_frame_min_boost; double kf_frame_max_boost_first; // Max for first kf in a chunk. @@ -262,7 +262,6 @@ void calculate_coded_size(struct VP9_COMP *cpi, int *scaled_frame_width, struct VP9EncoderConfig; int vp9_get_frames_to_next_key(const struct VP9EncoderConfig *oxcf, - const FRAME_INFO *frame_info, const TWO_PASS *const twopass, int kf_show_idx, int min_gf_interval); #if CONFIG_RATE_CTRL @@ -311,7 +310,6 @@ int vp9_get_coding_frame_num(const struct VP9EncoderConfig *oxcf, * number of show frames in the video. */ void vp9_get_key_frame_map(const struct VP9EncoderConfig *oxcf, - const FRAME_INFO *frame_info, const FIRST_PASS_INFO *first_pass_info, int *key_frame_map); #endif // CONFIG_RATE_CTRL diff --git a/vp9/encoder/vp9_rd.c b/vp9/encoder/vp9_rd.c index 3b2e0b088b..d5d668f964 100644 --- a/vp9/encoder/vp9_rd.c +++ b/vp9/encoder/vp9_rd.c @@ -201,62 +201,66 @@ static const int rd_frame_type_factor[FRAME_UPDATE_TYPES] = { 128, 144, 128, // Later this function will use passed in command line values. void vp9_init_rd_parameters(VP9_COMP *cpi) { RD_CONTROL *const rdc = &cpi->rd_ctrl; - unsigned int screen_area = (cpi->common.width * cpi->common.height); // Make sure this function is floating point safe. vpx_clear_system_state(); rdc->rd_mult_q_sq_key_high_qp = 7.5; // No defined Vizer values yet - if (1) { - // Non/pre-Vizer defaults + + if (0) { + unsigned int screen_area = (cpi->common.width * cpi->common.height); + + if (screen_area <= 176 * 144) { + rdc->rd_mult_q_sq_inter_low_qp = 4.0718581295922025; + rdc->rd_mult_q_sq_inter_mid_qp = 4.031435609256739; + rdc->rd_mult_q_sq_inter_high_qp = 4.295745965132044; + rdc->rd_mult_q_sq_key_ultralow_qp = 4.290774097327333; + rdc->rd_mult_q_sq_key_low_qp = 5.7037775720838155; + rdc->rd_mult_q_sq_key_mid_qp = 4.72424015517201; + } else if (screen_area <= 320 * 240) { + rdc->rd_mult_q_sq_inter_low_qp = 4.506676356706102; + rdc->rd_mult_q_sq_inter_mid_qp = 4.489349899621181; + rdc->rd_mult_q_sq_inter_high_qp = 4.388244213131458; + rdc->rd_mult_q_sq_key_ultralow_qp = 4.217074424696166; + rdc->rd_mult_q_sq_key_low_qp = 4.497000582319771; + rdc->rd_mult_q_sq_key_mid_qp = 4.2825894884789735; + } else if (screen_area <= 640 * 360) { + rdc->rd_mult_q_sq_inter_low_qp = 4.730644123689013; + rdc->rd_mult_q_sq_inter_mid_qp = 4.314589509578551; + rdc->rd_mult_q_sq_inter_high_qp = 4.3702861603380025; + rdc->rd_mult_q_sq_key_ultralow_qp = 4.576902541873747; + rdc->rd_mult_q_sq_key_low_qp = 6.068652999601526; + rdc->rd_mult_q_sq_key_mid_qp = 4.817707474077241; + } else if (screen_area <= 854 * 480) { + rdc->rd_mult_q_sq_inter_low_qp = 4.811470143416073; + rdc->rd_mult_q_sq_inter_mid_qp = 4.621618127750201; + rdc->rd_mult_q_sq_inter_high_qp = 3.969083125219539; + rdc->rd_mult_q_sq_key_ultralow_qp = 4.9854544277222566; + rdc->rd_mult_q_sq_key_low_qp = 5.073157238799473; + rdc->rd_mult_q_sq_key_mid_qp = 5.7587672849242635; + } else if (screen_area <= 1280 * 720) { + rdc->rd_mult_q_sq_inter_low_qp = 5.119381136011107; + rdc->rd_mult_q_sq_inter_mid_qp = 4.518613675766538; + rdc->rd_mult_q_sq_inter_high_qp = 4.410712348825541; + rdc->rd_mult_q_sq_key_ultralow_qp = 3.9468491666607326; + rdc->rd_mult_q_sq_key_low_qp = 5.848703119971484; + rdc->rd_mult_q_sq_key_mid_qp = 5.368947246228739; + } else { + rdc->rd_mult_q_sq_inter_low_qp = 6.00569815296199; + rdc->rd_mult_q_sq_inter_mid_qp = 3.932565684947023; + rdc->rd_mult_q_sq_inter_high_qp = 3.2141187537667797; + rdc->rd_mult_q_sq_key_ultralow_qp = 4.399795006320089; + rdc->rd_mult_q_sq_key_low_qp = 10.582906599488298; + rdc->rd_mult_q_sq_key_mid_qp = 6.274162346360692; + } + } else { + // For now force defaults unless testing rdc->rd_mult_q_sq_inter_low_qp = 4.0; rdc->rd_mult_q_sq_inter_mid_qp = 4.5; rdc->rd_mult_q_sq_inter_high_qp = 3.0; rdc->rd_mult_q_sq_key_ultralow_qp = 4.0; rdc->rd_mult_q_sq_key_low_qp = 3.5; rdc->rd_mult_q_sq_key_mid_qp = 4.5; - } else if (screen_area <= 176 * 144) { - rdc->rd_mult_q_sq_inter_low_qp = 4.0718581295922025; - rdc->rd_mult_q_sq_inter_mid_qp = 4.031435609256739; - rdc->rd_mult_q_sq_inter_high_qp = 4.295745965132044; - rdc->rd_mult_q_sq_key_ultralow_qp = 4.290774097327333; - rdc->rd_mult_q_sq_key_low_qp = 5.7037775720838155; - rdc->rd_mult_q_sq_key_mid_qp = 4.72424015517201; - } else if (screen_area <= 320 * 240) { - rdc->rd_mult_q_sq_inter_low_qp = 4.506676356706102; - rdc->rd_mult_q_sq_inter_mid_qp = 4.489349899621181; - rdc->rd_mult_q_sq_inter_high_qp = 4.388244213131458; - rdc->rd_mult_q_sq_key_ultralow_qp = 4.217074424696166; - rdc->rd_mult_q_sq_key_low_qp = 4.497000582319771; - rdc->rd_mult_q_sq_key_mid_qp = 4.2825894884789735; - } else if (screen_area <= 640 * 360) { - rdc->rd_mult_q_sq_inter_low_qp = 4.730644123689013; - rdc->rd_mult_q_sq_inter_mid_qp = 4.314589509578551; - rdc->rd_mult_q_sq_inter_high_qp = 4.3702861603380025; - rdc->rd_mult_q_sq_key_ultralow_qp = 4.576902541873747; - rdc->rd_mult_q_sq_key_low_qp = 6.068652999601526; - rdc->rd_mult_q_sq_key_mid_qp = 4.817707474077241; - } else if (screen_area <= 854 * 480) { - rdc->rd_mult_q_sq_inter_low_qp = 4.811470143416073; - rdc->rd_mult_q_sq_inter_mid_qp = 4.621618127750201; - rdc->rd_mult_q_sq_inter_high_qp = 3.969083125219539; - rdc->rd_mult_q_sq_key_ultralow_qp = 4.9854544277222566; - rdc->rd_mult_q_sq_key_low_qp = 5.073157238799473; - rdc->rd_mult_q_sq_key_mid_qp = 5.7587672849242635; - } else if (screen_area <= 1280 * 720) { - rdc->rd_mult_q_sq_inter_low_qp = 5.119381136011107; - rdc->rd_mult_q_sq_inter_mid_qp = 4.518613675766538; - rdc->rd_mult_q_sq_inter_high_qp = 4.410712348825541; - rdc->rd_mult_q_sq_key_ultralow_qp = 3.9468491666607326; - rdc->rd_mult_q_sq_key_low_qp = 5.848703119971484; - rdc->rd_mult_q_sq_key_mid_qp = 5.368947246228739; - } else { - rdc->rd_mult_q_sq_inter_low_qp = 6.00569815296199; - rdc->rd_mult_q_sq_inter_mid_qp = 3.932565684947023; - rdc->rd_mult_q_sq_inter_high_qp = 3.2141187537667797; - rdc->rd_mult_q_sq_key_ultralow_qp = 4.399795006320089; - rdc->rd_mult_q_sq_key_low_qp = 10.582906599488298; - rdc->rd_mult_q_sq_key_mid_qp = 6.274162346360692; } } From b41ffb53f1000ab2227c1736d8c1355aa5081c40 Mon Sep 17 00:00:00 2001 From: "Adam B. Goode" Date: Wed, 17 Mar 2021 14:11:57 -0500 Subject: [PATCH 0137/1000] Msvc builds convert to windows path w/msys env Bug: webm:1720 Change-Id: I56689ad408f8086c511e1711dfa9c8d404727b2e (cherry picked from commit 04086a30664d2a3e89d6a6e4e1c18f1a82c8f958) --- build/make/msvs_common.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/build/make/msvs_common.sh b/build/make/msvs_common.sh index 27ddf7fd91..3989fec0d5 100644 --- a/build/make/msvs_common.sh +++ b/build/make/msvs_common.sh @@ -9,7 +9,8 @@ ## be found in the AUTHORS file in the root of the source tree. ## -if [ "$(uname -o 2>/dev/null)" = "Cygwin" ] \ +shell_name="$(uname -o 2>/dev/null)" +if [[ "$shell_name" = "Cygwin" || "$shell_name" = "Msys" ]] \ && cygpath --help >/dev/null 2>&1; then FIXPATH='cygpath -m' else From e37ee40f7ee0dbafa41e7d1c32dc34740727c7a1 Mon Sep 17 00:00:00 2001 From: Paul Wilkins Date: Mon, 22 Mar 2021 19:45:10 +0000 Subject: [PATCH 0138/1000] Convert Vizier RD parameters to normalized factors This patch converts the Vizier custom RD multipliers, to factors that adjust each RD multiplier either side of its default value, where a factor of 1.0 will give the previous default behavior. Ultimately I would like to replace the multiple RD multipliers triggered at different Q thresholds (eg, low, medium, high q) with a function that adjusts the rd behavior smoothly as Q changes. Vizier could then be presented with a single adjustment control for each of key frame and inter frame rd. The current behavior is problematic. Firstly having hard threshold Q values at which rd behavior changes may cause anomalies in the rate distortion curve, where in some situations, raising Q, for example, may not cause the expected drop in rate and rise in distortion, because we have crossed a threshold where the rate distortion multiplier changes sharply and this alters the balance of bits spent in the prediction and residual parts of the signal. Having a single value that is used for a range of Q index values (eg 0-64), (65-128) may also cause problems and over-fitting in the context of the Vizier ML project. This project tries to optimize the values for each Q range, for various YT formats, but does so by analyzing the results of single point encodes on a set of clips. For a given format all the clips are encoded with the same parameters (target rate etc) so there is likely to be clustering in regards to the Q values used. For example the training set may give a new value for the Q range 0-64 but most of the data points used may have Q close 64. It will likely require several iterations working with the Vizier team to get this right. This patch just gives an initial framework for testing. Change-Id: Iaa4cd5561b95a202bcae7a1d876c4f40ef444fa2 --- vp9/encoder/vp9_rd.c | 117 ++++++++++++++++++++++++------------------- vp9/encoder/vp9_rd.h | 17 +++---- 2 files changed, 74 insertions(+), 60 deletions(-) diff --git a/vp9/encoder/vp9_rd.c b/vp9/encoder/vp9_rd.c index d5d668f964..d3bf3d6d4c 100644 --- a/vp9/encoder/vp9_rd.c +++ b/vp9/encoder/vp9_rd.c @@ -205,65 +205,74 @@ void vp9_init_rd_parameters(VP9_COMP *cpi) { // Make sure this function is floating point safe. vpx_clear_system_state(); - rdc->rd_mult_q_sq_key_high_qp = 7.5; // No defined Vizer values yet + rdc->rd_mult_key_high_qp_fac = 1.0; // Default: no Vizer values yet if (0) { unsigned int screen_area = (cpi->common.width * cpi->common.height); if (screen_area <= 176 * 144) { - rdc->rd_mult_q_sq_inter_low_qp = 4.0718581295922025; - rdc->rd_mult_q_sq_inter_mid_qp = 4.031435609256739; - rdc->rd_mult_q_sq_inter_high_qp = 4.295745965132044; - rdc->rd_mult_q_sq_key_ultralow_qp = 4.290774097327333; - rdc->rd_mult_q_sq_key_low_qp = 5.7037775720838155; - rdc->rd_mult_q_sq_key_mid_qp = 4.72424015517201; + rdc->rd_mult_inter_low_qp_fac = 1.018; + rdc->rd_mult_inter_mid_qp_fac = 0.896; + rdc->rd_mult_inter_high_qp_fac = 1.432; + rdc->rd_mult_key_ultralow_qp_fac = 1.073; + rdc->rd_mult_key_low_qp_fac = 1.630; + rdc->rd_mult_key_mid_qp_fac = 1.050; } else if (screen_area <= 320 * 240) { - rdc->rd_mult_q_sq_inter_low_qp = 4.506676356706102; - rdc->rd_mult_q_sq_inter_mid_qp = 4.489349899621181; - rdc->rd_mult_q_sq_inter_high_qp = 4.388244213131458; - rdc->rd_mult_q_sq_key_ultralow_qp = 4.217074424696166; - rdc->rd_mult_q_sq_key_low_qp = 4.497000582319771; - rdc->rd_mult_q_sq_key_mid_qp = 4.2825894884789735; + rdc->rd_mult_inter_low_qp_fac = 1.127; + rdc->rd_mult_inter_mid_qp_fac = 0.998; + rdc->rd_mult_inter_high_qp_fac = 1.463; + rdc->rd_mult_key_ultralow_qp_fac = 1.054; + rdc->rd_mult_key_low_qp_fac = 1.285; + rdc->rd_mult_key_mid_qp_fac = 0.952; } else if (screen_area <= 640 * 360) { - rdc->rd_mult_q_sq_inter_low_qp = 4.730644123689013; - rdc->rd_mult_q_sq_inter_mid_qp = 4.314589509578551; - rdc->rd_mult_q_sq_inter_high_qp = 4.3702861603380025; - rdc->rd_mult_q_sq_key_ultralow_qp = 4.576902541873747; - rdc->rd_mult_q_sq_key_low_qp = 6.068652999601526; - rdc->rd_mult_q_sq_key_mid_qp = 4.817707474077241; + rdc->rd_mult_inter_low_qp_fac = 1.183; + rdc->rd_mult_inter_mid_qp_fac = 0.959; + rdc->rd_mult_inter_high_qp_fac = 1.457; + rdc->rd_mult_key_ultralow_qp_fac = 1.144; + rdc->rd_mult_key_low_qp_fac = 1.734; + rdc->rd_mult_key_mid_qp_fac = 1.071; } else if (screen_area <= 854 * 480) { - rdc->rd_mult_q_sq_inter_low_qp = 4.811470143416073; - rdc->rd_mult_q_sq_inter_mid_qp = 4.621618127750201; - rdc->rd_mult_q_sq_inter_high_qp = 3.969083125219539; - rdc->rd_mult_q_sq_key_ultralow_qp = 4.9854544277222566; - rdc->rd_mult_q_sq_key_low_qp = 5.073157238799473; - rdc->rd_mult_q_sq_key_mid_qp = 5.7587672849242635; + rdc->rd_mult_inter_low_qp_fac = 1.203; + rdc->rd_mult_inter_mid_qp_fac = 1.027; + rdc->rd_mult_inter_high_qp_fac = 1.027; + rdc->rd_mult_key_ultralow_qp_fac = 1.246; + rdc->rd_mult_key_low_qp_fac = 1.246; + rdc->rd_mult_key_mid_qp_fac = 1.280; } else if (screen_area <= 1280 * 720) { - rdc->rd_mult_q_sq_inter_low_qp = 5.119381136011107; - rdc->rd_mult_q_sq_inter_mid_qp = 4.518613675766538; - rdc->rd_mult_q_sq_inter_high_qp = 4.410712348825541; - rdc->rd_mult_q_sq_key_ultralow_qp = 3.9468491666607326; - rdc->rd_mult_q_sq_key_low_qp = 5.848703119971484; - rdc->rd_mult_q_sq_key_mid_qp = 5.368947246228739; + rdc->rd_mult_inter_low_qp_fac = 1.280; + rdc->rd_mult_inter_mid_qp_fac = 1.004; + rdc->rd_mult_inter_high_qp_fac = 1.470; + rdc->rd_mult_key_ultralow_qp_fac = 0.987; + rdc->rd_mult_key_low_qp_fac = 1.671; + rdc->rd_mult_key_mid_qp_fac = 1.193; } else { - rdc->rd_mult_q_sq_inter_low_qp = 6.00569815296199; - rdc->rd_mult_q_sq_inter_mid_qp = 3.932565684947023; - rdc->rd_mult_q_sq_inter_high_qp = 3.2141187537667797; - rdc->rd_mult_q_sq_key_ultralow_qp = 4.399795006320089; - rdc->rd_mult_q_sq_key_low_qp = 10.582906599488298; - rdc->rd_mult_q_sq_key_mid_qp = 6.274162346360692; + rdc->rd_mult_inter_low_qp_fac = 1.50; + rdc->rd_mult_inter_mid_qp_fac = 0.874; + rdc->rd_mult_inter_high_qp_fac = 1.07; + rdc->rd_mult_key_ultralow_qp_fac = 1.1; + rdc->rd_mult_key_low_qp_fac = 2.35; + rdc->rd_mult_key_mid_qp_fac = 0.837; } } else { // For now force defaults unless testing - rdc->rd_mult_q_sq_inter_low_qp = 4.0; - rdc->rd_mult_q_sq_inter_mid_qp = 4.5; - rdc->rd_mult_q_sq_inter_high_qp = 3.0; - rdc->rd_mult_q_sq_key_ultralow_qp = 4.0; - rdc->rd_mult_q_sq_key_low_qp = 3.5; - rdc->rd_mult_q_sq_key_mid_qp = 4.5; + rdc->rd_mult_inter_low_qp_fac = 1.0; + rdc->rd_mult_inter_mid_qp_fac = 1.0; + rdc->rd_mult_inter_high_qp_fac = 1.0; + rdc->rd_mult_key_ultralow_qp_fac = 1.0; + rdc->rd_mult_key_low_qp_fac = 1.0; + rdc->rd_mult_key_mid_qp_fac = 1.0; } } +// Default Rd multiplier values for Q ranges +#define INTER_LOW_QP_RDM 4.0 +#define INTER_MID_QP_RDM 4.5 +#define INTER_HIGH_QP_RDM 3.0 +#define KEY_ULOW_QP_RDM 4.0 +#define KEY_LOW_QP_RDM 3.5 +#define KEY_MID_QP_RDM 4.5 +#define KEY_HIGH_QP_RDM 7.5 + int vp9_compute_rd_mult_based_on_qindex(const VP9_COMP *cpi, int qindex) { const RD_CONTROL *rdc = &cpi->rd_ctrl; const int q = vp9_dc_quant(qindex, 0, cpi->common.bit_depth); @@ -275,22 +284,28 @@ int vp9_compute_rd_mult_based_on_qindex(const VP9_COMP *cpi, int qindex) { if (cpi->common.frame_type != KEY_FRAME) { if (qindex < 128) { - rdmult = (int)((double)rdmult * rdc->rd_mult_q_sq_inter_low_qp); + rdmult = (int)((double)rdmult * INTER_LOW_QP_RDM * + rdc->rd_mult_inter_low_qp_fac); } else if (qindex < 190) { - rdmult = (int)((double)rdmult * rdc->rd_mult_q_sq_inter_mid_qp); + rdmult = (int)((double)rdmult * INTER_MID_QP_RDM * + rdc->rd_mult_inter_mid_qp_fac); } else { - rdmult = (int)((double)rdmult * rdc->rd_mult_q_sq_inter_high_qp); + rdmult = (int)((double)rdmult * INTER_HIGH_QP_RDM * + rdc->rd_mult_inter_high_qp_fac); } } else { if (qindex < 64) { - rdmult = (int)((double)rdmult * rdc->rd_mult_q_sq_key_ultralow_qp); + rdmult = (int)((double)rdmult * KEY_ULOW_QP_RDM * + rdc->rd_mult_key_ultralow_qp_fac); } else if (qindex <= 128) { - rdmult = (int)((double)rdmult * rdc->rd_mult_q_sq_key_low_qp); + rdmult = + (int)((double)rdmult * KEY_LOW_QP_RDM * rdc->rd_mult_key_low_qp_fac); } else if (qindex < 190) { - rdmult = (int)((double)rdmult * rdc->rd_mult_q_sq_key_mid_qp); - + rdmult = + (int)((double)rdmult * KEY_MID_QP_RDM * rdc->rd_mult_key_mid_qp_fac); } else { - rdmult = (int)((double)rdmult * rdc->rd_mult_q_sq_key_high_qp); + rdmult = (int)((double)rdmult * KEY_HIGH_QP_RDM * + rdc->rd_mult_key_high_qp_fac); } } diff --git a/vp9/encoder/vp9_rd.h b/vp9/encoder/vp9_rd.h index 2c9f5e7408..4899e1ae0f 100644 --- a/vp9/encoder/vp9_rd.h +++ b/vp9/encoder/vp9_rd.h @@ -102,15 +102,14 @@ typedef enum { } THR_MODES_SUB8X8; typedef struct { - // RD control parameters - // Added for Vizier project. - double rd_mult_q_sq_inter_low_qp; - double rd_mult_q_sq_inter_mid_qp; - double rd_mult_q_sq_inter_high_qp; - double rd_mult_q_sq_key_ultralow_qp; - double rd_mult_q_sq_key_low_qp; - double rd_mult_q_sq_key_mid_qp; - double rd_mult_q_sq_key_high_qp; + // RD multiplier control factors added for Vizier project. + double rd_mult_inter_low_qp_fac; + double rd_mult_inter_mid_qp_fac; + double rd_mult_inter_high_qp_fac; + double rd_mult_key_ultralow_qp_fac; + double rd_mult_key_low_qp_fac; + double rd_mult_key_mid_qp_fac; + double rd_mult_key_high_qp_fac; } RD_CONTROL; typedef struct RD_OPT { From deef8955067bcfb7bb9d3d9b1719f178ea8e37a2 Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 26 Mar 2021 10:55:29 -0700 Subject: [PATCH 0139/1000] vp9_ext_ratectrl_test: use uintptr_t for void* value this avoids a warning about differences in size between void* and unsigned int under msvc: vp9_ext_ratectrl_test.cc(40,3): warning C4312: 'reinterpret_cast': conversion from 'const unsigned int' to 'void *' of greater size Change-Id: I5a412ec785ddcaeff2ec71bb83a6048505400293 --- test/vp9_ext_ratectrl_test.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/vp9_ext_ratectrl_test.cc b/test/vp9_ext_ratectrl_test.cc index b6b5b2eaec..60a350b84e 100644 --- a/test/vp9_ext_ratectrl_test.cc +++ b/test/vp9_ext_ratectrl_test.cc @@ -8,6 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include #include #include "test/codec_factory.h" @@ -20,7 +21,7 @@ namespace { constexpr int kModelMagicNumber = 51396; -constexpr unsigned int PrivMagicNumber = 5566; +constexpr uintptr_t PrivMagicNumber = 5566; constexpr int kFrameNum = 5; constexpr int kLosslessCodingIndex = 2; From c19de35ed2b772fd21092c727c5e090034876d4f Mon Sep 17 00:00:00 2001 From: Cheng Chen Date: Tue, 16 Mar 2021 21:27:18 -0700 Subject: [PATCH 0140/1000] Add command line options for a few rc parameters These rate control parameters are for the Vizier experiment. They are defined as rational numbers. Change-Id: I23f382dd49158db463b75b5ad8a82d8e0d536308 --- vp8/vp8_cx_iface.c | 28 +++++++++++----- vp9/vp9_cx_iface.c | 28 +++++++++++----- vpx/vpx_encoder.h | 84 ++++++++++++++++++++++++++++++++++++++++++++++ vpxenc.c | 71 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 195 insertions(+), 16 deletions(-) diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c index 32dd3c7708..32bb1d04f2 100644 --- a/vp8/vp8_cx_iface.c +++ b/vp8/vp8_cx_iface.c @@ -1281,14 +1281,26 @@ static vpx_codec_enc_cfg_map_t vp8e_usage_cfg_map[] = { VPX_SS_DEFAULT_LAYERS, /* ss_number_layers */ { 0 }, - { 0 }, /* ss_target_bitrate */ - 1, /* ts_number_layers */ - { 0 }, /* ts_target_bitrate */ - { 0 }, /* ts_rate_decimator */ - 0, /* ts_periodicity */ - { 0 }, /* ts_layer_id */ - { 0 }, /* layer_target_bitrate */ - 0 /* temporal_layering_mode */ + { 0 }, /* ss_target_bitrate */ + 1, /* ts_number_layers */ + { 0 }, /* ts_target_bitrate */ + { 0 }, /* ts_rate_decimator */ + 0, /* ts_periodicity */ + { 0 }, /* ts_layer_id */ + { 0 }, /* layer_target_bitrate */ + 0, /* temporal_layering_mode */ + { 0, 0 }, /* active_wq_factor */ + { 0, 0 }, /* base_err_per_mb */ + { 0, 0 }, /* sr_default_decay_limit */ + { 0, 0 }, /* sr_diff_factor */ + { 0, 0 }, /* kf_err_per_mb */ + { 0, 0 }, /* kf_frame_min_boost */ + { 0, 0 }, /* kf_frame_max_boost_first */ + { 0, 0 }, /* kf_frame_max_boost_subs */ + { 0, 0 }, /* kf_max_total_boost */ + { 0, 0 }, /* gf_max_total_boost */ + { 0, 0 }, /* gf_frame_max_boost */ + { 0, 0 }, /* zm_power_factor */ } }, }; diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index ecfacfaf43..75dda0bed0 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -1883,14 +1883,26 @@ static vpx_codec_enc_cfg_map_t encoder_usage_cfg_map[] = { VPX_SS_DEFAULT_LAYERS, // ss_number_layers { 0 }, - { 0 }, // ss_target_bitrate - 1, // ts_number_layers - { 0 }, // ts_target_bitrate - { 0 }, // ts_rate_decimator - 0, // ts_periodicity - { 0 }, // ts_layer_id - { 0 }, // layer_taget_bitrate - 0 // temporal_layering_mode + { 0 }, // ss_target_bitrate + 1, // ts_number_layers + { 0 }, // ts_target_bitrate + { 0 }, // ts_rate_decimator + 0, // ts_periodicity + { 0 }, // ts_layer_id + { 0 }, // layer_taget_bitrate + 0, // temporal_layering_mode + { 0, 0 }, // active_wq_factor + { 0, 0 }, // base_err_per_mb + { 0, 0 }, // sr_default_decay_limit + { 0, 0 }, // sr_diff_factor + { 0, 0 }, // kf_err_per_mb + { 0, 0 }, // kf_frame_min_boost + { 0, 0 }, // kf_frame_max_boost_first + { 0, 0 }, // kf_frame_max_boost_subs + { 0, 0 }, // kf_max_total_boost + { 0, 0 }, // gf_max_total_boost + { 0, 0 }, // gf_frame_max_boost + { 0, 0 }, // zm_power_factor } }, }; diff --git a/vpx/vpx_encoder.h b/vpx/vpx_encoder.h index da36095775..accc127f64 100644 --- a/vpx/vpx_encoder.h +++ b/vpx/vpx_encoder.h @@ -693,6 +693,90 @@ typedef struct vpx_codec_enc_cfg { * */ int temporal_layering_mode; + + /*!\brief Active worst quality factor. + * + * Rate control parameters, set from external experiment results. + * + */ + vpx_rational_t active_wq_factor; + + /*!\brief Base error per macroblock. + * + * Rate control parameters, set from external experiment results. + * + */ + vpx_rational_t base_err_per_mb; + + /*!\brief Second reference default decay limit. + * + * Rate control parameters, set from external experiment results. + * + */ + vpx_rational_t sr_default_decay_limit; + + /*!\brief Second reference difference factor. + * + * Rate control parameters, set from external experiment results. + * + */ + vpx_rational_t sr_diff_factor; + + /*!\brief Keyframe error per macroblock. + * + * Rate control parameters, set from external experiment results. + * + */ + vpx_rational_t kf_err_per_mb; + + /*!\brief Keyframe minimum boost. + * + * Rate control parameters, set from external experiment results. + * + */ + vpx_rational_t kf_frame_min_boost; + + /*!\brief Keyframe maximum boost, for the first keyframe in a chunk. + * + * Rate control parameters, set from external experiment results. + * + */ + vpx_rational_t kf_frame_max_boost_first; + + /*!\brief Keyframe maximum boost, for subsequent keyframes. + * + * Rate control parameters, set from external experiment results. + * + */ + vpx_rational_t kf_frame_max_boost_subs; + + /*!\brief Keyframe maximum total boost. + * + * Rate control parameters, set from external experiment results. + * + */ + vpx_rational_t kf_max_total_boost; + + /*!\brief Golden frame maximum total boost. + * + * Rate control parameters, set from external experiment results. + * + */ + vpx_rational_t gf_max_total_boost; + + /*!\brief Golden frame maximum boost. + * + * Rate control parameters, set from external experiment results. + * + */ + vpx_rational_t gf_frame_max_boost; + + /*!\brief Zero motion power factor. + * + * Rate control parameters, set from external experiment results. + * + */ + vpx_rational_t zm_power_factor; } vpx_codec_enc_cfg_t; /**< alias for struct vpx_codec_enc_cfg */ /*!\brief vp9 svc extra configure parameters diff --git a/vpxenc.c b/vpxenc.c index 5042e688c9..3c04c64b3b 100644 --- a/vpxenc.c +++ b/vpxenc.c @@ -287,6 +287,46 @@ static const arg_def_t *rc_args[] = { &buf_sz, &buf_initial_sz, &buf_optimal_sz, NULL }; +#if CONFIG_VP9_ENCODER +static const arg_def_t active_wq_factor = + ARG_DEF(NULL, "active-wq-factor", 1, "Active worst quality factor"); +static const arg_def_t base_err_per_mb = + ARG_DEF(NULL, "base-err-per-mb", 1, "Base error per macroblock"); +static const arg_def_t sr_default_decay_limit = ARG_DEF( + NULL, "sr-default-decay-limit", 1, "Second reference default decay limit"); +static const arg_def_t sr_diff_factor = + ARG_DEF(NULL, "sr-diff-factor", 1, "Second reference diff factor"); +static const arg_def_t kf_err_per_mb = + ARG_DEF(NULL, "kf-err-per-mb", 1, "Keyframe error per macroblock"); +static const arg_def_t kf_frame_min_boost = + ARG_DEF(NULL, "kf-frame-min-boost", 1, "Keyframe min boost"); +static const arg_def_t kf_frame_max_boost_first = ARG_DEF( + NULL, "kf-frame-max-boost-first", 1, "Max for the first keyframe boost"); +static const arg_def_t kf_frame_max_boost_subs = ARG_DEF( + NULL, "kf-frame-max-boost-subs", 1, "Max for subsequent keyframe boost"); +static const arg_def_t kf_max_total_boost = + ARG_DEF(NULL, "kf-max-total-boost", 1, "Keyframe max total boost"); +static const arg_def_t gf_max_total_boost = + ARG_DEF(NULL, "gf-max-total-boost", 1, "Golden frame max total boost"); +static const arg_def_t gf_frame_max_boost = + ARG_DEF(NULL, "gf-frame-max-boost", 1, "Golden frame max boost"); +static const arg_def_t zm_power_factor = + ARG_DEF(NULL, "zm-power-factor", 1, "Zero motion power factor"); +static const arg_def_t *vizier_rc_args[] = { &active_wq_factor, + &base_err_per_mb, + &sr_default_decay_limit, + &sr_diff_factor, + &kf_err_per_mb, + &kf_frame_min_boost, + &kf_frame_max_boost_first, + &kf_frame_max_boost_subs, + &kf_max_total_boost, + &gf_max_total_boost, + &gf_frame_max_boost, + &zm_power_factor, + NULL }; +#endif + static const arg_def_t bias_pct = ARG_DEF(NULL, "bias-pct", 1, "CBR/VBR bias (0=CBR, 100=VBR)"); static const arg_def_t minsection_pct = @@ -573,6 +613,8 @@ static void show_help(FILE *fout, int shorthelp) { #if CONFIG_VP9_ENCODER fprintf(fout, "\nVP9 Specific Options:\n"); arg_show_usage(fout, vp9_args); + fprintf(fout, "\nVizier Rate Control Options:\n"); + arg_show_usage(fout, vizier_rc_args); #endif fprintf(fout, "\nStream timebase (--timebase):\n" @@ -983,6 +1025,32 @@ static int parse_stream_params(struct VpxEncoderConfig *global, config->cfg.kf_max_dist = arg_parse_uint(&arg); } else if (arg_match(&arg, &kf_disabled, argi)) { config->cfg.kf_mode = VPX_KF_DISABLED; +#if CONFIG_VP9_ENCODER + } else if (arg_match(&arg, &active_wq_factor, argi)) { + config->cfg.active_wq_factor = arg_parse_rational(&arg); + } else if (arg_match(&arg, &base_err_per_mb, argi)) { + config->cfg.base_err_per_mb = arg_parse_rational(&arg); + } else if (arg_match(&arg, &sr_default_decay_limit, argi)) { + config->cfg.sr_default_decay_limit = arg_parse_rational(&arg); + } else if (arg_match(&arg, &sr_diff_factor, argi)) { + config->cfg.sr_diff_factor = arg_parse_rational(&arg); + } else if (arg_match(&arg, &kf_err_per_mb, argi)) { + config->cfg.kf_err_per_mb = arg_parse_rational(&arg); + } else if (arg_match(&arg, &kf_frame_min_boost, argi)) { + config->cfg.kf_frame_min_boost = arg_parse_rational(&arg); + } else if (arg_match(&arg, &kf_frame_max_boost_first, argi)) { + config->cfg.kf_frame_max_boost_first = arg_parse_rational(&arg); + } else if (arg_match(&arg, &kf_frame_max_boost_subs, argi)) { + config->cfg.kf_frame_max_boost_subs = arg_parse_rational(&arg); + } else if (arg_match(&arg, &kf_max_total_boost, argi)) { + config->cfg.kf_max_total_boost = arg_parse_rational(&arg); + } else if (arg_match(&arg, &gf_max_total_boost, argi)) { + config->cfg.gf_max_total_boost = arg_parse_rational(&arg); + } else if (arg_match(&arg, &gf_frame_max_boost, argi)) { + config->cfg.gf_frame_max_boost = arg_parse_rational(&arg); + } else if (arg_match(&arg, &zm_power_factor, argi)) { + config->cfg.zm_power_factor = arg_parse_rational(&arg); +#endif #if CONFIG_VP9_HIGHBITDEPTH } else if (arg_match(&arg, &test16bitinternalarg, argi)) { if (strcmp(global->codec->name, "vp9") == 0) { @@ -1177,6 +1245,9 @@ static void show_stream_config(struct stream_state *stream, SHOW(kf_mode); SHOW(kf_min_dist); SHOW(kf_max_dist); + // Temporary use for debug + SHOW(active_wq_factor.num); + SHOW(active_wq_factor.den); } static void open_output_file(struct stream_state *stream, From f32829a2e5db3cd1624e8a7a530af84c382762ef Mon Sep 17 00:00:00 2001 From: Cheng Chen Date: Tue, 16 Mar 2021 22:42:40 -0700 Subject: [PATCH 0141/1000] Pass vizier rc parameter values from command line to twopass Change-Id: I02eabeccf2fe4604875820d38e23c2586a63e290 --- vp9/vp9_cx_iface.c | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index 75dda0bed0..f601ca1622 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -638,6 +638,41 @@ static vpx_codec_err_t set_encoder_config( return VPX_CODEC_OK; } +static vpx_codec_err_t set_twopass_params_from_config( + const vpx_codec_enc_cfg_t *const cfg, struct VP9_COMP *cpi) { + if (cpi == NULL) return VPX_CODEC_ERROR; + + cpi->twopass.active_wq_factor = + (double)cfg->active_wq_factor.num / (double)cfg->active_wq_factor.den; + cpi->twopass.base_err_per_mb = + (double)cfg->base_err_per_mb.num / (double)cfg->base_err_per_mb.den; + cpi->twopass.sr_default_decay_limit = + (double)cfg->sr_default_decay_limit.num / + (double)cfg->sr_default_decay_limit.den; + cpi->twopass.sr_diff_factor = + (double)cfg->sr_diff_factor.num / (double)cfg->sr_diff_factor.den; + cpi->twopass.kf_err_per_mb = + (double)cfg->kf_err_per_mb.num / (double)cfg->kf_err_per_mb.den; + cpi->twopass.kf_frame_min_boost = + (double)cfg->kf_frame_min_boost.num / (double)cfg->kf_frame_min_boost.den; + cpi->twopass.kf_frame_max_boost_first = + (double)cfg->kf_frame_max_boost_first.num / + (double)cfg->kf_frame_max_boost_first.den; + cpi->twopass.kf_frame_max_boost_subs = + (double)cfg->kf_frame_max_boost_subs.num / + (double)cfg->kf_frame_max_boost_subs.den; + cpi->twopass.kf_max_total_boost = (int)((double)cfg->kf_max_total_boost.num / + (double)cfg->kf_max_total_boost.den); + cpi->twopass.gf_max_total_boost = (int)((double)cfg->gf_max_total_boost.num / + (double)cfg->gf_max_total_boost.den); + cpi->twopass.gf_frame_max_boost = + (double)cfg->gf_frame_max_boost.num / (double)cfg->gf_frame_max_boost.den; + cpi->twopass.zm_power_factor = + (double)cfg->zm_power_factor.num / (double)cfg->zm_power_factor.den; + + return VPX_CODEC_OK; +} + static vpx_codec_err_t encoder_set_config(vpx_codec_alg_priv_t *ctx, const vpx_codec_enc_cfg_t *cfg) { vpx_codec_err_t res; @@ -664,6 +699,7 @@ static vpx_codec_err_t encoder_set_config(vpx_codec_alg_priv_t *ctx, if (res == VPX_CODEC_OK) { ctx->cfg = *cfg; set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg); + set_twopass_params_from_config(&ctx->cfg, ctx->cpi); // On profile change, request a key frame force_key |= ctx->cpi->common.profile != ctx->oxcf.profile; vp9_change_config(ctx->cpi, &ctx->oxcf); @@ -696,6 +732,7 @@ static vpx_codec_err_t update_extra_cfg(vpx_codec_alg_priv_t *ctx, if (res == VPX_CODEC_OK) { ctx->extra_cfg = *extra_cfg; set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg); + set_twopass_params_from_config(&ctx->cfg, ctx->cpi); vp9_change_config(ctx->cpi, &ctx->oxcf); } return res; @@ -940,6 +977,7 @@ static vpx_codec_err_t encoder_init(vpx_codec_ctx_t *ctx, #endif priv->cpi = vp9_create_compressor(&priv->oxcf, priv->buffer_pool); if (priv->cpi == NULL) res = VPX_CODEC_MEM_ERROR; + set_twopass_params_from_config(&priv->cfg, priv->cpi); } } From d3aaac367bd716b2db06e774f0a8eea7768dd184 Mon Sep 17 00:00:00 2001 From: Paul Wilkins Date: Wed, 31 Mar 2021 16:58:51 +0100 Subject: [PATCH 0142/1000] Change calculation of rd multiplier. Change the way the rd multiplier is adjusted for Q and frame type. Previously in VP9 the rd multiplier was adjusted based on crude Q bins and whether the frame was a key frame or inter frame. The Q bins create some problems as they potentially introduce discontinuities in the RD curve. For example, rate rising with a stepwise increase in Q instead of falling. As such, in AV1 they have been removed. A further issue was identified when examining the first round of results from from the Vizier project. Here the multiplier for each Q bin and each frame type was optimized for a training set, for various video formats, using single point encodes at the appropriate YT rates. These initial results appeared to show a trend for increased rd multiplier at higher Q for key frames. This fits with intuition as in this encoding context a higher Q indicates that a clip is harder to encode and frames less well predicted. However, the situation appeared to reverse for inter frames with higher rd multipliers chosen at low Q. My initial suspicion was that this was a result of over fitting, but on closer analysis I realized that this may be more related to frame type within the broader inter frame classification. Specifically frames coded at low Q are predominantly ARF frames, for the mid Q bin there will likely be a mix of ARF and normal inter frames, and for the high Q bin the frames will almost exclusively be normal inter frames from difficult content. ARF frames are inherently less well predicted than other inter frames being further apart and not having access to as many prediction modes. We also know from previous work that ARF frames have a higher incidence of INTRA coding and may well behave more like key frames in this context. This patch replaces the bin based approach with a linear function that applies a small but smooth Q based adjustment. It also splits ARF frames and normal inter frames into separate categories. With this done number of parameters that will be exposed for the next round of Vizier training is reduced from 7 to 3 (one adjustment factor each for inter, ARF and key frames) This patch gives net BDATE gains for our test sets even with the baseline / default factors as follows: (% BDRATE change in overall PSNR and SSIM, -ve is better) LowRes -0.231, -0.050 ugc360p 0.160, -0.315 midres2 -0.348, -1.170 hdres2 -0.407, -0.691 Change-Id: I46dd2fea77b1c2849c122f10fd0df74bbd3fcc7f --- vp9/encoder/vp9_rd.c | 114 ++++++++++++++++++------------------------- vp9/encoder/vp9_rd.h | 10 ++-- 2 files changed, 50 insertions(+), 74 deletions(-) diff --git a/vp9/encoder/vp9_rd.c b/vp9/encoder/vp9_rd.c index d3bf3d6d4c..9efd7425c6 100644 --- a/vp9/encoder/vp9_rd.c +++ b/vp9/encoder/vp9_rd.c @@ -205,62 +205,36 @@ void vp9_init_rd_parameters(VP9_COMP *cpi) { // Make sure this function is floating point safe. vpx_clear_system_state(); - rdc->rd_mult_key_high_qp_fac = 1.0; // Default: no Vizer values yet + rdc->rd_mult_arf_qp_fac = 1.0; // Default: No Vizier values yet + // These hard wired estimates for the Vizier values will be removed later + // as the per format factors will be set on the command line. if (0) { unsigned int screen_area = (cpi->common.width * cpi->common.height); if (screen_area <= 176 * 144) { - rdc->rd_mult_inter_low_qp_fac = 1.018; - rdc->rd_mult_inter_mid_qp_fac = 0.896; - rdc->rd_mult_inter_high_qp_fac = 1.432; - rdc->rd_mult_key_ultralow_qp_fac = 1.073; - rdc->rd_mult_key_low_qp_fac = 1.630; - rdc->rd_mult_key_mid_qp_fac = 1.050; + rdc->rd_mult_inter_qp_fac = 0.896; + rdc->rd_mult_key_qp_fac = 1.050; } else if (screen_area <= 320 * 240) { - rdc->rd_mult_inter_low_qp_fac = 1.127; - rdc->rd_mult_inter_mid_qp_fac = 0.998; - rdc->rd_mult_inter_high_qp_fac = 1.463; - rdc->rd_mult_key_ultralow_qp_fac = 1.054; - rdc->rd_mult_key_low_qp_fac = 1.285; - rdc->rd_mult_key_mid_qp_fac = 0.952; + rdc->rd_mult_inter_qp_fac = 0.998; + rdc->rd_mult_key_qp_fac = 0.952; } else if (screen_area <= 640 * 360) { - rdc->rd_mult_inter_low_qp_fac = 1.183; - rdc->rd_mult_inter_mid_qp_fac = 0.959; - rdc->rd_mult_inter_high_qp_fac = 1.457; - rdc->rd_mult_key_ultralow_qp_fac = 1.144; - rdc->rd_mult_key_low_qp_fac = 1.734; - rdc->rd_mult_key_mid_qp_fac = 1.071; + rdc->rd_mult_inter_qp_fac = 0.959; + rdc->rd_mult_key_qp_fac = 1.071; } else if (screen_area <= 854 * 480) { - rdc->rd_mult_inter_low_qp_fac = 1.203; - rdc->rd_mult_inter_mid_qp_fac = 1.027; - rdc->rd_mult_inter_high_qp_fac = 1.027; - rdc->rd_mult_key_ultralow_qp_fac = 1.246; - rdc->rd_mult_key_low_qp_fac = 1.246; - rdc->rd_mult_key_mid_qp_fac = 1.280; + rdc->rd_mult_inter_qp_fac = 1.027; + rdc->rd_mult_key_qp_fac = 1.280; } else if (screen_area <= 1280 * 720) { - rdc->rd_mult_inter_low_qp_fac = 1.280; - rdc->rd_mult_inter_mid_qp_fac = 1.004; - rdc->rd_mult_inter_high_qp_fac = 1.470; - rdc->rd_mult_key_ultralow_qp_fac = 0.987; - rdc->rd_mult_key_low_qp_fac = 1.671; - rdc->rd_mult_key_mid_qp_fac = 1.193; + rdc->rd_mult_inter_qp_fac = 1.004; + rdc->rd_mult_key_qp_fac = 1.193; } else { - rdc->rd_mult_inter_low_qp_fac = 1.50; - rdc->rd_mult_inter_mid_qp_fac = 0.874; - rdc->rd_mult_inter_high_qp_fac = 1.07; - rdc->rd_mult_key_ultralow_qp_fac = 1.1; - rdc->rd_mult_key_low_qp_fac = 2.35; - rdc->rd_mult_key_mid_qp_fac = 0.837; + rdc->rd_mult_inter_qp_fac = 0.874; + rdc->rd_mult_key_qp_fac = 0.837; } } else { // For now force defaults unless testing - rdc->rd_mult_inter_low_qp_fac = 1.0; - rdc->rd_mult_inter_mid_qp_fac = 1.0; - rdc->rd_mult_inter_high_qp_fac = 1.0; - rdc->rd_mult_key_ultralow_qp_fac = 1.0; - rdc->rd_mult_key_low_qp_fac = 1.0; - rdc->rd_mult_key_mid_qp_fac = 1.0; + rdc->rd_mult_inter_qp_fac = 1.0; + rdc->rd_mult_key_qp_fac = 1.0; } } @@ -273,6 +247,27 @@ void vp9_init_rd_parameters(VP9_COMP *cpi) { #define KEY_MID_QP_RDM 4.5 #define KEY_HIGH_QP_RDM 7.5 +// Returns the default rd multiplier for inter frames for a given qindex. +// The function here is a first pass estimate based on data from +// a previous Vizer run +static double def_inter_rd_multiplier(int qindex) { + return 4.15 + (0.001 * (double)qindex); +} + +// Returns the default rd multiplier for ARF/Golden Frames for a given qindex. +// The function here is a first pass estimate based on data from +// a previous Vizer run +static double def_arf_rd_multiplier(int qindex) { + return 4.25 + (0.001 * (double)qindex); +} + +// Returns the default rd multiplier for key frames for a given qindex. +// The function here is a first pass estimate based on data from +// a previous Vizer run +static double def_kf_rd_multiplier(int qindex) { + return 4.35 + (0.001 * (double)qindex); +} + int vp9_compute_rd_mult_based_on_qindex(const VP9_COMP *cpi, int qindex) { const RD_CONTROL *rdc = &cpi->rd_ctrl; const int q = vp9_dc_quant(qindex, 0, cpi->common.bit_depth); @@ -282,31 +277,16 @@ int vp9_compute_rd_mult_based_on_qindex(const VP9_COMP *cpi, int qindex) { // Make sure this function is floating point safe. vpx_clear_system_state(); - if (cpi->common.frame_type != KEY_FRAME) { - if (qindex < 128) { - rdmult = (int)((double)rdmult * INTER_LOW_QP_RDM * - rdc->rd_mult_inter_low_qp_fac); - } else if (qindex < 190) { - rdmult = (int)((double)rdmult * INTER_MID_QP_RDM * - rdc->rd_mult_inter_mid_qp_fac); - } else { - rdmult = (int)((double)rdmult * INTER_HIGH_QP_RDM * - rdc->rd_mult_inter_high_qp_fac); - } + if (cpi->common.frame_type == KEY_FRAME) { + double def_rd_q_mult = def_kf_rd_multiplier(qindex); + rdmult = (int)((double)rdmult * def_rd_q_mult * rdc->rd_mult_key_qp_fac); + } else if (!cpi->rc.is_src_frame_alt_ref && + (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) { + double def_rd_q_mult = def_arf_rd_multiplier(qindex); + rdmult = (int)((double)rdmult * def_rd_q_mult * rdc->rd_mult_arf_qp_fac); } else { - if (qindex < 64) { - rdmult = (int)((double)rdmult * KEY_ULOW_QP_RDM * - rdc->rd_mult_key_ultralow_qp_fac); - } else if (qindex <= 128) { - rdmult = - (int)((double)rdmult * KEY_LOW_QP_RDM * rdc->rd_mult_key_low_qp_fac); - } else if (qindex < 190) { - rdmult = - (int)((double)rdmult * KEY_MID_QP_RDM * rdc->rd_mult_key_mid_qp_fac); - } else { - rdmult = (int)((double)rdmult * KEY_HIGH_QP_RDM * - rdc->rd_mult_key_high_qp_fac); - } + double def_rd_q_mult = def_inter_rd_multiplier(qindex); + rdmult = (int)((double)rdmult * def_rd_q_mult * rdc->rd_mult_inter_qp_fac); } #if CONFIG_VP9_HIGHBITDEPTH diff --git a/vp9/encoder/vp9_rd.h b/vp9/encoder/vp9_rd.h index 4899e1ae0f..d2bc5e60ed 100644 --- a/vp9/encoder/vp9_rd.h +++ b/vp9/encoder/vp9_rd.h @@ -103,13 +103,9 @@ typedef enum { typedef struct { // RD multiplier control factors added for Vizier project. - double rd_mult_inter_low_qp_fac; - double rd_mult_inter_mid_qp_fac; - double rd_mult_inter_high_qp_fac; - double rd_mult_key_ultralow_qp_fac; - double rd_mult_key_low_qp_fac; - double rd_mult_key_mid_qp_fac; - double rd_mult_key_high_qp_fac; + double rd_mult_inter_qp_fac; + double rd_mult_arf_qp_fac; + double rd_mult_key_qp_fac; } RD_CONTROL; typedef struct RD_OPT { From 8b3e575a45792fe490b5bc08c3fe08f01553756b Mon Sep 17 00:00:00 2001 From: Tom Finegan Date: Fri, 2 Apr 2021 09:40:09 -0700 Subject: [PATCH 0143/1000] Revert "Pass vizier rc parameter values from command line to twopass" This reverts commit f32829a2e5db3cd1624e8a7a530af84c382762ef. BUG=webm:1723 Change-Id: I866cdf288f9873c350b32091515a6d5f4df362a3 --- vp9/vp9_cx_iface.c | 38 -------------------------------------- 1 file changed, 38 deletions(-) diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index f601ca1622..75dda0bed0 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -638,41 +638,6 @@ static vpx_codec_err_t set_encoder_config( return VPX_CODEC_OK; } -static vpx_codec_err_t set_twopass_params_from_config( - const vpx_codec_enc_cfg_t *const cfg, struct VP9_COMP *cpi) { - if (cpi == NULL) return VPX_CODEC_ERROR; - - cpi->twopass.active_wq_factor = - (double)cfg->active_wq_factor.num / (double)cfg->active_wq_factor.den; - cpi->twopass.base_err_per_mb = - (double)cfg->base_err_per_mb.num / (double)cfg->base_err_per_mb.den; - cpi->twopass.sr_default_decay_limit = - (double)cfg->sr_default_decay_limit.num / - (double)cfg->sr_default_decay_limit.den; - cpi->twopass.sr_diff_factor = - (double)cfg->sr_diff_factor.num / (double)cfg->sr_diff_factor.den; - cpi->twopass.kf_err_per_mb = - (double)cfg->kf_err_per_mb.num / (double)cfg->kf_err_per_mb.den; - cpi->twopass.kf_frame_min_boost = - (double)cfg->kf_frame_min_boost.num / (double)cfg->kf_frame_min_boost.den; - cpi->twopass.kf_frame_max_boost_first = - (double)cfg->kf_frame_max_boost_first.num / - (double)cfg->kf_frame_max_boost_first.den; - cpi->twopass.kf_frame_max_boost_subs = - (double)cfg->kf_frame_max_boost_subs.num / - (double)cfg->kf_frame_max_boost_subs.den; - cpi->twopass.kf_max_total_boost = (int)((double)cfg->kf_max_total_boost.num / - (double)cfg->kf_max_total_boost.den); - cpi->twopass.gf_max_total_boost = (int)((double)cfg->gf_max_total_boost.num / - (double)cfg->gf_max_total_boost.den); - cpi->twopass.gf_frame_max_boost = - (double)cfg->gf_frame_max_boost.num / (double)cfg->gf_frame_max_boost.den; - cpi->twopass.zm_power_factor = - (double)cfg->zm_power_factor.num / (double)cfg->zm_power_factor.den; - - return VPX_CODEC_OK; -} - static vpx_codec_err_t encoder_set_config(vpx_codec_alg_priv_t *ctx, const vpx_codec_enc_cfg_t *cfg) { vpx_codec_err_t res; @@ -699,7 +664,6 @@ static vpx_codec_err_t encoder_set_config(vpx_codec_alg_priv_t *ctx, if (res == VPX_CODEC_OK) { ctx->cfg = *cfg; set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg); - set_twopass_params_from_config(&ctx->cfg, ctx->cpi); // On profile change, request a key frame force_key |= ctx->cpi->common.profile != ctx->oxcf.profile; vp9_change_config(ctx->cpi, &ctx->oxcf); @@ -732,7 +696,6 @@ static vpx_codec_err_t update_extra_cfg(vpx_codec_alg_priv_t *ctx, if (res == VPX_CODEC_OK) { ctx->extra_cfg = *extra_cfg; set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg); - set_twopass_params_from_config(&ctx->cfg, ctx->cpi); vp9_change_config(ctx->cpi, &ctx->oxcf); } return res; @@ -977,7 +940,6 @@ static vpx_codec_err_t encoder_init(vpx_codec_ctx_t *ctx, #endif priv->cpi = vp9_create_compressor(&priv->oxcf, priv->buffer_pool); if (priv->cpi == NULL) res = VPX_CODEC_MEM_ERROR; - set_twopass_params_from_config(&priv->cfg, priv->cpi); } } From afe1ba7f3f8d270d0eb215ceeb80e89290e1a37d Mon Sep 17 00:00:00 2001 From: Cheng Chen Date: Fri, 2 Apr 2021 10:56:29 -0700 Subject: [PATCH 0144/1000] Pass vizier rc parameter values with range check This is similar to the change: https://chromium-review.googlesource.com/c/webm/libvpx/+/2771081 Which fails libvpx nightly test. Here we add range check to get rid of the warning of "divided by zero". BUG=webm:1723 Change-Id: I7712efe7abd4b11cdb725643d51fd1c0a300d924 --- vp8/vp8_cx_iface.c | 37 ++++++++++++++-------- vp9/vp9_cx_iface.c | 76 ++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 89 insertions(+), 24 deletions(-) diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c index 32bb1d04f2..3f09ec38a4 100644 --- a/vp8/vp8_cx_iface.c +++ b/vp8/vp8_cx_iface.c @@ -257,6 +257,19 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, ERROR("g_threads cannot be bigger than number of token partitions"); #endif + // The range below shall be further tuned. + RANGE_CHECK(cfg, active_wq_factor.den, 1, 1000); + RANGE_CHECK(cfg, base_err_per_mb.den, 1, 1000); + RANGE_CHECK(cfg, sr_default_decay_limit.den, 1, 1000); + RANGE_CHECK(cfg, sr_diff_factor.den, 1, 1000); + RANGE_CHECK(cfg, kf_err_per_mb.den, 1, 1000); + RANGE_CHECK(cfg, kf_frame_min_boost.den, 1, 1000); + RANGE_CHECK(cfg, kf_frame_max_boost_subs.den, 1, 1000); + RANGE_CHECK(cfg, kf_max_total_boost.den, 1, 1000); + RANGE_CHECK(cfg, gf_max_total_boost.den, 1, 1000); + RANGE_CHECK(cfg, gf_frame_max_boost.den, 1, 1000); + RANGE_CHECK(cfg, zm_power_factor.den, 1, 1000); + return VPX_CODEC_OK; } @@ -1289,18 +1302,18 @@ static vpx_codec_enc_cfg_map_t vp8e_usage_cfg_map[] = { { 0 }, /* ts_layer_id */ { 0 }, /* layer_target_bitrate */ 0, /* temporal_layering_mode */ - { 0, 0 }, /* active_wq_factor */ - { 0, 0 }, /* base_err_per_mb */ - { 0, 0 }, /* sr_default_decay_limit */ - { 0, 0 }, /* sr_diff_factor */ - { 0, 0 }, /* kf_err_per_mb */ - { 0, 0 }, /* kf_frame_min_boost */ - { 0, 0 }, /* kf_frame_max_boost_first */ - { 0, 0 }, /* kf_frame_max_boost_subs */ - { 0, 0 }, /* kf_max_total_boost */ - { 0, 0 }, /* gf_max_total_boost */ - { 0, 0 }, /* gf_frame_max_boost */ - { 0, 0 }, /* zm_power_factor */ + { 0, 1 }, /* active_wq_factor */ + { 0, 1 }, /* base_err_per_mb */ + { 0, 1 }, /* sr_default_decay_limit */ + { 0, 1 }, /* sr_diff_factor */ + { 0, 1 }, /* kf_err_per_mb */ + { 0, 1 }, /* kf_frame_min_boost */ + { 0, 1 }, /* kf_frame_max_boost_first */ + { 0, 1 }, /* kf_frame_max_boost_subs */ + { 0, 1 }, /* kf_max_total_boost */ + { 0, 1 }, /* gf_max_total_boost */ + { 0, 1 }, /* gf_frame_max_boost */ + { 0, 1 }, /* zm_power_factor */ } }, }; diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index 75dda0bed0..7530850ea1 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -348,6 +348,20 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, } RANGE_CHECK(extra_cfg, color_space, VPX_CS_UNKNOWN, VPX_CS_SRGB); RANGE_CHECK(extra_cfg, color_range, VPX_CR_STUDIO_RANGE, VPX_CR_FULL_RANGE); + + // The range below shall be further tuned. + RANGE_CHECK(cfg, active_wq_factor.den, 1, 1000); + RANGE_CHECK(cfg, base_err_per_mb.den, 1, 1000); + RANGE_CHECK(cfg, sr_default_decay_limit.den, 1, 1000); + RANGE_CHECK(cfg, sr_diff_factor.den, 1, 1000); + RANGE_CHECK(cfg, kf_err_per_mb.den, 1, 1000); + RANGE_CHECK(cfg, kf_frame_min_boost.den, 1, 1000); + RANGE_CHECK(cfg, kf_frame_max_boost_subs.den, 1, 1000); + RANGE_CHECK(cfg, kf_max_total_boost.den, 1, 1000); + RANGE_CHECK(cfg, gf_max_total_boost.den, 1, 1000); + RANGE_CHECK(cfg, gf_frame_max_boost.den, 1, 1000); + RANGE_CHECK(cfg, zm_power_factor.den, 1, 1000); + return VPX_CODEC_OK; } @@ -638,6 +652,41 @@ static vpx_codec_err_t set_encoder_config( return VPX_CODEC_OK; } +static vpx_codec_err_t set_twopass_params_from_config( + const vpx_codec_enc_cfg_t *const cfg, struct VP9_COMP *cpi) { + if (cpi == NULL) return VPX_CODEC_ERROR; + + cpi->twopass.active_wq_factor = + (double)cfg->active_wq_factor.num / (double)cfg->active_wq_factor.den; + cpi->twopass.base_err_per_mb = + (double)cfg->base_err_per_mb.num / (double)cfg->base_err_per_mb.den; + cpi->twopass.sr_default_decay_limit = + (double)cfg->sr_default_decay_limit.num / + (double)cfg->sr_default_decay_limit.den; + cpi->twopass.sr_diff_factor = + (double)cfg->sr_diff_factor.num / (double)cfg->sr_diff_factor.den; + cpi->twopass.kf_err_per_mb = + (double)cfg->kf_err_per_mb.num / (double)cfg->kf_err_per_mb.den; + cpi->twopass.kf_frame_min_boost = + (double)cfg->kf_frame_min_boost.num / (double)cfg->kf_frame_min_boost.den; + cpi->twopass.kf_frame_max_boost_first = + (double)cfg->kf_frame_max_boost_first.num / + (double)cfg->kf_frame_max_boost_first.den; + cpi->twopass.kf_frame_max_boost_subs = + (double)cfg->kf_frame_max_boost_subs.num / + (double)cfg->kf_frame_max_boost_subs.den; + cpi->twopass.kf_max_total_boost = (int)((double)cfg->kf_max_total_boost.num / + (double)cfg->kf_max_total_boost.den); + cpi->twopass.gf_max_total_boost = (int)((double)cfg->gf_max_total_boost.num / + (double)cfg->gf_max_total_boost.den); + cpi->twopass.gf_frame_max_boost = + (double)cfg->gf_frame_max_boost.num / (double)cfg->gf_frame_max_boost.den; + cpi->twopass.zm_power_factor = + (double)cfg->zm_power_factor.num / (double)cfg->zm_power_factor.den; + + return VPX_CODEC_OK; +} + static vpx_codec_err_t encoder_set_config(vpx_codec_alg_priv_t *ctx, const vpx_codec_enc_cfg_t *cfg) { vpx_codec_err_t res; @@ -664,6 +713,7 @@ static vpx_codec_err_t encoder_set_config(vpx_codec_alg_priv_t *ctx, if (res == VPX_CODEC_OK) { ctx->cfg = *cfg; set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg); + set_twopass_params_from_config(&ctx->cfg, ctx->cpi); // On profile change, request a key frame force_key |= ctx->cpi->common.profile != ctx->oxcf.profile; vp9_change_config(ctx->cpi, &ctx->oxcf); @@ -696,6 +746,7 @@ static vpx_codec_err_t update_extra_cfg(vpx_codec_alg_priv_t *ctx, if (res == VPX_CODEC_OK) { ctx->extra_cfg = *extra_cfg; set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg); + set_twopass_params_from_config(&ctx->cfg, ctx->cpi); vp9_change_config(ctx->cpi, &ctx->oxcf); } return res; @@ -940,6 +991,7 @@ static vpx_codec_err_t encoder_init(vpx_codec_ctx_t *ctx, #endif priv->cpi = vp9_create_compressor(&priv->oxcf, priv->buffer_pool); if (priv->cpi == NULL) res = VPX_CODEC_MEM_ERROR; + set_twopass_params_from_config(&priv->cfg, priv->cpi); } } @@ -1891,18 +1943,18 @@ static vpx_codec_enc_cfg_map_t encoder_usage_cfg_map[] = { { 0 }, // ts_layer_id { 0 }, // layer_taget_bitrate 0, // temporal_layering_mode - { 0, 0 }, // active_wq_factor - { 0, 0 }, // base_err_per_mb - { 0, 0 }, // sr_default_decay_limit - { 0, 0 }, // sr_diff_factor - { 0, 0 }, // kf_err_per_mb - { 0, 0 }, // kf_frame_min_boost - { 0, 0 }, // kf_frame_max_boost_first - { 0, 0 }, // kf_frame_max_boost_subs - { 0, 0 }, // kf_max_total_boost - { 0, 0 }, // gf_max_total_boost - { 0, 0 }, // gf_frame_max_boost - { 0, 0 }, // zm_power_factor + { 0, 1 }, // active_wq_factor + { 0, 1 }, // base_err_per_mb + { 0, 1 }, // sr_default_decay_limit + { 0, 1 }, // sr_diff_factor + { 0, 1 }, // kf_err_per_mb + { 0, 1 }, // kf_frame_min_boost + { 0, 1 }, // kf_frame_max_boost_first + { 0, 1 }, // kf_frame_max_boost_subs + { 0, 1 }, // kf_max_total_boost + { 0, 1 }, // gf_max_total_boost + { 0, 1 }, // gf_frame_max_boost + { 0, 1 }, // zm_power_factor } }, }; From 0d05ca39f25925eec32cdcea41581c974db3f623 Mon Sep 17 00:00:00 2001 From: Paul Wilkins Date: Tue, 6 Apr 2021 20:08:01 +0100 Subject: [PATCH 0145/1000] Delete unused constants. Delete some #defines that are no longer needed. Change-Id: I9e4e4df10716598b0d62b0c70f538d4b78a32296 --- vp9/encoder/vp9_rd.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/vp9/encoder/vp9_rd.c b/vp9/encoder/vp9_rd.c index 9efd7425c6..87b9a691a7 100644 --- a/vp9/encoder/vp9_rd.c +++ b/vp9/encoder/vp9_rd.c @@ -238,15 +238,6 @@ void vp9_init_rd_parameters(VP9_COMP *cpi) { } } -// Default Rd multiplier values for Q ranges -#define INTER_LOW_QP_RDM 4.0 -#define INTER_MID_QP_RDM 4.5 -#define INTER_HIGH_QP_RDM 3.0 -#define KEY_ULOW_QP_RDM 4.0 -#define KEY_LOW_QP_RDM 3.5 -#define KEY_MID_QP_RDM 4.5 -#define KEY_HIGH_QP_RDM 7.5 - // Returns the default rd multiplier for inter frames for a given qindex. // The function here is a first pass estimate based on data from // a previous Vizer run From ab4383063c79e558ee5d8c8140d38626825ebfec Mon Sep 17 00:00:00 2001 From: Paul Wilkins Date: Tue, 6 Apr 2021 20:05:48 +0100 Subject: [PATCH 0146/1000] Change zm_factor for Vizier. Changes the exposed zm_factor parameter. This patch alters the meaning of the zm_factor parameter that will be exposed for the Vizier project. The previous power factor was hard to interpret in terms of its meaning and effect and has been replaced by a linear factor. Given that the initial Vizier results suggested a lower zero motion effect for all formats, the default impact has been reduced. The patch as it stands gives a modest improvement for PSNR but is slightly down on some sets for SSIM (overall psnr, ssim % bdrate change: -ve is better) lowres -0.111, 0.001 ugc360p -0.282, -0.068 midres2 -0.183, 0.059 hdres2 -0.042, 0.172 Change-Id: Id6566433ceed8470d5fad1f30282daed56de385d --- vp8/vp8_cx_iface.c | 4 ++-- vp9/encoder/vp9_firstpass.c | 30 +++++++++++++++++++----------- vp9/encoder/vp9_firstpass.h | 2 +- vp9/vp9_cx_iface.c | 8 ++++---- vpx/vpx_encoder.h | 2 +- vpxenc.c | 8 ++++---- 6 files changed, 31 insertions(+), 23 deletions(-) diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c index 3f09ec38a4..1ffd81924f 100644 --- a/vp8/vp8_cx_iface.c +++ b/vp8/vp8_cx_iface.c @@ -268,7 +268,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, RANGE_CHECK(cfg, kf_max_total_boost.den, 1, 1000); RANGE_CHECK(cfg, gf_max_total_boost.den, 1, 1000); RANGE_CHECK(cfg, gf_frame_max_boost.den, 1, 1000); - RANGE_CHECK(cfg, zm_power_factor.den, 1, 1000); + RANGE_CHECK(cfg, zm_factor.den, 1, 1000); return VPX_CODEC_OK; } @@ -1313,7 +1313,7 @@ static vpx_codec_enc_cfg_map_t vp8e_usage_cfg_map[] = { { 0, 1 }, /* kf_max_total_boost */ { 0, 1 }, /* gf_max_total_boost */ { 0, 1 }, /* gf_frame_max_boost */ - { 0, 1 }, /* zm_power_factor */ + { 0, 1 }, /* zm_factor */ } }, }; diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index a43099e946..f142f2611f 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -73,7 +73,7 @@ #define MAX_KF_TOT_BOOST 5400 #endif -#define ZM_POWER_FACTOR 0.75 +#define DEFAULT_ZM_FACTOR 0.5 #define MINQ_ADJ_LIMIT 48 #define MINQ_ADJ_LIMIT_CQ 20 #define HIGH_UNDERSHOOT_RATIO 2 @@ -1878,9 +1878,17 @@ static double get_zero_motion_factor(const TWO_PASS *const twopass, static double get_prediction_decay_rate(const TWO_PASS *const twopass, const FIRSTPASS_STATS *frame_stats) { const double sr_decay_rate = get_sr_decay_rate(twopass, frame_stats); - const double zero_motion_factor = - (0.95 * pow((frame_stats->pcnt_inter - frame_stats->pcnt_motion), - twopass->zm_power_factor)); + double zero_motion_factor = + twopass->zm_factor * DEFAULT_ZM_FACTOR * + (frame_stats->pcnt_inter - frame_stats->pcnt_motion); + + // Clamp value to range 0.0 to 1.0 + // This should happen anyway if input values are sensibly clamped but checked + // here just in case. + if (zero_motion_factor > 1.0) + zero_motion_factor = 1.0; + else if (zero_motion_factor < 0.0) + zero_motion_factor = 0.0; return VPXMAX(zero_motion_factor, (sr_decay_rate + ((1.0 - sr_decay_rate) * zero_motion_factor))); @@ -3501,7 +3509,7 @@ static void init_vizier_params(TWO_PASS *const twopass, int screen_area) { twopass->kf_frame_max_boost_first = KF_MAX_FRAME_BOOST; twopass->kf_frame_max_boost_subs = twopass->kf_frame_max_boost_first; twopass->kf_max_total_boost = MAX_KF_TOT_BOOST; - twopass->zm_power_factor = ZM_POWER_FACTOR; + twopass->zm_factor = 1.0; } else { // Vizer experimental parameters from training. // Later these will be set via the command line. @@ -3517,7 +3525,7 @@ static void init_vizier_params(TWO_PASS *const twopass, int screen_area) { twopass->kf_frame_max_boost_first = 25.5; twopass->kf_frame_max_boost_subs = twopass->kf_frame_max_boost_first; twopass->kf_max_total_boost = MAX_KF_TOT_BOOST; - twopass->zm_power_factor = 2.93715229184991; + twopass->zm_factor = 1.0; } else if (screen_area <= 320 * 240) { twopass->active_wq_factor = 55.0; twopass->base_err_per_mb = 34525.33177195309; @@ -3530,7 +3538,7 @@ static void init_vizier_params(TWO_PASS *const twopass, int screen_area) { twopass->kf_frame_max_boost_first = 185.0; twopass->kf_frame_max_boost_subs = twopass->kf_frame_max_boost_first; twopass->kf_max_total_boost = MAX_KF_TOT_BOOST; - twopass->zm_power_factor = 3.5299221493593413; + twopass->zm_factor = 1.0; } else if (screen_area <= 640 * 360) { twopass->active_wq_factor = 12.5; twopass->base_err_per_mb = 18823.978018028298; @@ -3543,7 +3551,7 @@ static void init_vizier_params(TWO_PASS *const twopass, int screen_area) { twopass->kf_frame_max_boost_first = 224.5; twopass->kf_frame_max_boost_subs = twopass->kf_frame_max_boost_first; twopass->kf_max_total_boost = MAX_KF_TOT_BOOST; - twopass->zm_power_factor = 2.265742666649307; + twopass->zm_factor = 1.0; } else if (screen_area <= 854 * 480) { twopass->active_wq_factor = 51.5; twopass->base_err_per_mb = 33718.98307662595; @@ -3556,7 +3564,7 @@ static void init_vizier_params(TWO_PASS *const twopass, int screen_area) { twopass->kf_frame_max_boost_first = 28.0; twopass->kf_frame_max_boost_subs = twopass->kf_frame_max_boost_first; twopass->kf_max_total_boost = MAX_KF_TOT_BOOST; - twopass->zm_power_factor = 3.552278528517416; + twopass->zm_factor = 1.0; } else if (screen_area <= 1280 * 720) { twopass->active_wq_factor = 41.5; twopass->base_err_per_mb = 29527.46375825401; @@ -3569,7 +3577,7 @@ static void init_vizier_params(TWO_PASS *const twopass, int screen_area) { twopass->kf_frame_max_boost_first = 53.0; twopass->kf_frame_max_boost_subs = twopass->kf_frame_max_boost_first; twopass->kf_max_total_boost = MAX_KF_TOT_BOOST; - twopass->zm_power_factor = 2.568627575572356; + twopass->zm_factor = 1.0; } else { twopass->active_wq_factor = 31.0; twopass->base_err_per_mb = 34474.723463367416; @@ -3582,7 +3590,7 @@ static void init_vizier_params(TWO_PASS *const twopass, int screen_area) { twopass->kf_frame_max_boost_first = 419.5; twopass->kf_frame_max_boost_subs = twopass->kf_frame_max_boost_first; twopass->kf_max_total_boost = MAX_KF_TOT_BOOST; - twopass->zm_power_factor = 5.5776463538431935; + twopass->zm_factor = 1.0; } } } diff --git a/vp9/encoder/vp9_firstpass.h b/vp9/encoder/vp9_firstpass.h index 624fccd428..8ec8a44748 100644 --- a/vp9/encoder/vp9_firstpass.h +++ b/vp9/encoder/vp9_firstpass.h @@ -234,7 +234,7 @@ typedef struct { int kf_max_total_boost; int gf_max_total_boost; double gf_frame_max_boost; - double zm_power_factor; + double zm_factor; } TWO_PASS; struct VP9_COMP; diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index 7530850ea1..94b1afbcc6 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -360,7 +360,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, RANGE_CHECK(cfg, kf_max_total_boost.den, 1, 1000); RANGE_CHECK(cfg, gf_max_total_boost.den, 1, 1000); RANGE_CHECK(cfg, gf_frame_max_boost.den, 1, 1000); - RANGE_CHECK(cfg, zm_power_factor.den, 1, 1000); + RANGE_CHECK(cfg, zm_factor.den, 1, 1000); return VPX_CODEC_OK; } @@ -681,8 +681,8 @@ static vpx_codec_err_t set_twopass_params_from_config( (double)cfg->gf_max_total_boost.den); cpi->twopass.gf_frame_max_boost = (double)cfg->gf_frame_max_boost.num / (double)cfg->gf_frame_max_boost.den; - cpi->twopass.zm_power_factor = - (double)cfg->zm_power_factor.num / (double)cfg->zm_power_factor.den; + cpi->twopass.zm_factor = + (double)cfg->zm_factor.num / (double)cfg->zm_factor.den; return VPX_CODEC_OK; } @@ -1954,7 +1954,7 @@ static vpx_codec_enc_cfg_map_t encoder_usage_cfg_map[] = { { 0, 1 }, // kf_max_total_boost { 0, 1 }, // gf_max_total_boost { 0, 1 }, // gf_frame_max_boost - { 0, 1 }, // zm_power_factor + { 0, 1 }, // zm_factor } }, }; diff --git a/vpx/vpx_encoder.h b/vpx/vpx_encoder.h index accc127f64..497051e037 100644 --- a/vpx/vpx_encoder.h +++ b/vpx/vpx_encoder.h @@ -776,7 +776,7 @@ typedef struct vpx_codec_enc_cfg { * Rate control parameters, set from external experiment results. * */ - vpx_rational_t zm_power_factor; + vpx_rational_t zm_factor; } vpx_codec_enc_cfg_t; /**< alias for struct vpx_codec_enc_cfg */ /*!\brief vp9 svc extra configure parameters diff --git a/vpxenc.c b/vpxenc.c index 3c04c64b3b..874dddb137 100644 --- a/vpxenc.c +++ b/vpxenc.c @@ -310,7 +310,7 @@ static const arg_def_t gf_max_total_boost = ARG_DEF(NULL, "gf-max-total-boost", 1, "Golden frame max total boost"); static const arg_def_t gf_frame_max_boost = ARG_DEF(NULL, "gf-frame-max-boost", 1, "Golden frame max boost"); -static const arg_def_t zm_power_factor = +static const arg_def_t zm_factor = ARG_DEF(NULL, "zm-power-factor", 1, "Zero motion power factor"); static const arg_def_t *vizier_rc_args[] = { &active_wq_factor, &base_err_per_mb, @@ -323,7 +323,7 @@ static const arg_def_t *vizier_rc_args[] = { &active_wq_factor, &kf_max_total_boost, &gf_max_total_boost, &gf_frame_max_boost, - &zm_power_factor, + &zm_factor, NULL }; #endif @@ -1048,8 +1048,8 @@ static int parse_stream_params(struct VpxEncoderConfig *global, config->cfg.gf_max_total_boost = arg_parse_rational(&arg); } else if (arg_match(&arg, &gf_frame_max_boost, argi)) { config->cfg.gf_frame_max_boost = arg_parse_rational(&arg); - } else if (arg_match(&arg, &zm_power_factor, argi)) { - config->cfg.zm_power_factor = arg_parse_rational(&arg); + } else if (arg_match(&arg, &zm_factor, argi)) { + config->cfg.zm_factor = arg_parse_rational(&arg); #endif #if CONFIG_VP9_HIGHBITDEPTH } else if (arg_match(&arg, &test16bitinternalarg, argi)) { From 06fee5a89b7729cc178180e9c6743138faba7e79 Mon Sep 17 00:00:00 2001 From: Cheng Chen Date: Tue, 6 Apr 2021 17:20:39 -0700 Subject: [PATCH 0147/1000] Adjust end to end psnr value A recent change leads to slight difference of encoding results: d3aaac367 Change calculation of rd multiplier, which is caught by Jenkins nightly test. Adjust the threshold to silence the test failure. BUG=webm:1725 Change-Id: I7e8b3a26b72c831ae4d88d0fca681b354314739d --- test/vp9_end_to_end_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/vp9_end_to_end_test.cc b/test/vp9_end_to_end_test.cc index 7cc126ea58..7a85db26a4 100644 --- a/test/vp9_end_to_end_test.cc +++ b/test/vp9_end_to_end_test.cc @@ -31,7 +31,7 @@ const double kPsnrThreshold[][5] = { { 36.0, 37.0, 37.0, 37.0, 37.0 }, { 35.0, 36.0, 36.0, 36.0, 36.0 }, { 34.0, 35.0, 35.0, 35.0, 35.0 }, { 33.0, 34.0, 34.0, 34.0, 34.0 }, { 32.0, 33.0, 33.0, 33.0, 33.0 }, { 28.0, 32.0, 32.0, 32.0, 32.0 }, - { 28.5, 31.0, 31.0, 31.0, 31.0 }, { 27.5, 30.0, 30.0, 30.0, 30.0 }, + { 28.4, 31.0, 31.0, 31.0, 31.0 }, { 27.5, 30.0, 30.0, 30.0, 30.0 }, }; typedef struct { From 7a5596fa78ad41c428c2e3a4196d3b391d4ac77c Mon Sep 17 00:00:00 2001 From: Cheng Chen Date: Fri, 2 Apr 2021 14:27:16 -0700 Subject: [PATCH 0148/1000] Fix compilation for CONFIG_RATE_CTRL Recently, some function signatures have been changed. This change fixes compilation error if --enable-rate-ctrl is used. Change-Id: Ib8e9cb5e181ba1d4a6969883e377f3dd93e9289a --- vp9/encoder/vp9_firstpass.c | 23 ++++++++++++++--------- vp9/encoder/vp9_firstpass.h | 12 ++++++------ vp9/simple_encode.cc | 14 +++++++------- 3 files changed, 27 insertions(+), 22 deletions(-) diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index a43099e946..b3f19b6068 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -2545,6 +2545,9 @@ typedef struct RANGE { * (The following fields will remain unchanged after initialization of encoder.) * rc->static_scene_max_gf_interval * rc->min_gf_interval + * twopass->sr_diff_factor + * twopass->sr_default_decay_limit + * twopass->zm_factor * * Dynamic fields: * (The following fields will be updated before or after coding each frame.) @@ -2559,10 +2562,10 @@ typedef struct RANGE { * structs. */ static int get_gop_coding_frame_num( - int *use_alt_ref, const FRAME_INFO *frame_info, TWO_PASS *const twopass, - const RATE_CONTROL *rc, int gf_start_show_idx, - const RANGE *active_gf_interval, double gop_intra_factor, - int lag_in_frames) { + int *use_alt_ref, const FRAME_INFO *frame_info, + const TWO_PASS *const twopass, const RATE_CONTROL *rc, + int gf_start_show_idx, const RANGE *active_gf_interval, + double gop_intra_factor, int lag_in_frames) { const FIRST_PASS_INFO *first_pass_info = &twopass->first_pass_info; double loop_decay_rate = 1.00; double mv_ratio_accumulator = 0.0; @@ -3868,7 +3871,7 @@ void vp9_get_next_group_of_picture(const VP9_COMP *cpi, int *first_is_key_frame, *first_is_key_frame = 0; if (rc.frames_to_key == 0) { rc.frames_to_key = vp9_get_frames_to_next_key( - &cpi->oxcf, twopass, *first_show_idx, rc.min_gf_interval); + &cpi->oxcf, &cpi->twopass, *first_show_idx, rc.min_gf_interval); rc.frames_since_key = 0; *first_is_key_frame = 1; } @@ -3879,15 +3882,15 @@ void vp9_get_next_group_of_picture(const VP9_COMP *cpi, int *first_is_key_frame, assert(*coding_frame_count < rc.frames_to_key); } else { *coding_frame_count = vp9_get_gop_coding_frame_count( - &cpi->oxcf, &cpi->frame_info, &cpi->twopass.first_pass_info, &rc, - *first_show_idx, multi_layer_arf, allow_alt_ref, *first_is_key_frame, + &cpi->oxcf, &cpi->twopass, &cpi->frame_info, &rc, *first_show_idx, + multi_layer_arf, allow_alt_ref, *first_is_key_frame, *last_gop_use_alt_ref, use_alt_ref); } } int vp9_get_gop_coding_frame_count(const VP9EncoderConfig *oxcf, + const TWO_PASS *const twopass, const FRAME_INFO *frame_info, - const FIRST_PASS_INFO *first_pass_info, const RATE_CONTROL *rc, int show_idx, int multi_layer_arf, int allow_alt_ref, int first_is_key_frame, @@ -3917,6 +3920,7 @@ int vp9_get_gop_coding_frame_count(const VP9EncoderConfig *oxcf, // Under CONFIG_RATE_CTRL, once the first_pass_info is ready, the number of // coding frames (including show frame and alt ref) can be determined. int vp9_get_coding_frame_num(const VP9EncoderConfig *oxcf, + const TWO_PASS *const twopass, const FRAME_INFO *frame_info, const FIRST_PASS_INFO *first_pass_info, int multi_layer_arf, int allow_alt_ref) { @@ -3939,7 +3943,7 @@ int vp9_get_coding_frame_num(const VP9EncoderConfig *oxcf, } gop_coding_frame_count = vp9_get_gop_coding_frame_count( - oxcf, frame_info, first_pass_info, &rc, show_idx, multi_layer_arf, + oxcf, twopass, frame_info, &rc, show_idx, multi_layer_arf, allow_alt_ref, first_is_key_frame, last_gop_use_alt_ref, &use_alt_ref); rc.source_alt_ref_active = use_alt_ref; @@ -3955,6 +3959,7 @@ int vp9_get_coding_frame_num(const VP9EncoderConfig *oxcf, void vp9_get_key_frame_map(const VP9EncoderConfig *oxcf, const TWO_PASS *const twopass, int *key_frame_map) { + const FIRST_PASS_INFO *first_pass_info = &twopass->first_pass_info; int show_idx = 0; RATE_CONTROL rc; vp9_rc_init(oxcf, 1, &rc); diff --git a/vp9/encoder/vp9_firstpass.h b/vp9/encoder/vp9_firstpass.h index 624fccd428..7586ce868a 100644 --- a/vp9/encoder/vp9_firstpass.h +++ b/vp9/encoder/vp9_firstpass.h @@ -278,8 +278,8 @@ void vp9_get_next_group_of_picture(const struct VP9_COMP *cpi, /*!\brief Call this function before coding a new group of pictures to get * information about it. * \param[in] oxcf Encoder config + * \param[in] twopass Twopass info * \param[in] frame_info Frame info - * \param[in] first_pass_info First pass stats * \param[in] rc Rate control state * \param[in] show_idx Show index of the first frame in the group * \param[in] multi_layer_arf Is multi-layer alternate reference used @@ -292,26 +292,26 @@ void vp9_get_next_group_of_picture(const struct VP9_COMP *cpi, * \return Returns coding frame count */ int vp9_get_gop_coding_frame_count(const struct VP9EncoderConfig *oxcf, + const TWO_PASS *const twopass, const FRAME_INFO *frame_info, - const FIRST_PASS_INFO *first_pass_info, const RATE_CONTROL *rc, int show_idx, int multi_layer_arf, int allow_alt_ref, int first_is_key_frame, int last_gop_use_alt_ref, int *use_alt_ref); int vp9_get_coding_frame_num(const struct VP9EncoderConfig *oxcf, + const TWO_PASS *const twopass, const FRAME_INFO *frame_info, const FIRST_PASS_INFO *first_pass_info, int multi_layer_arf, int allow_alt_ref); /*!\brief Compute a key frame binary map indicates whether key frames appear * in the corresponding positions. The passed in key_frame_map must point to an - * integer array with length equal to first_pass_info->num_frames, which is the - * number of show frames in the video. + * integer array with length equal to twopass->first_pass_info.num_frames, + * which is the number of show frames in the video. */ void vp9_get_key_frame_map(const struct VP9EncoderConfig *oxcf, - const FIRST_PASS_INFO *first_pass_info, - int *key_frame_map); + const TWO_PASS *const twopass, int *key_frame_map); #endif // CONFIG_RATE_CTRL FIRSTPASS_STATS vp9_get_frame_stats(const TWO_PASS *twopass); diff --git a/vp9/simple_encode.cc b/vp9/simple_encode.cc index d4eb0c669d..efdc71eb98 100644 --- a/vp9/simple_encode.cc +++ b/vp9/simple_encode.cc @@ -1084,8 +1084,7 @@ void SimpleEncode::UpdateKeyFrameGroup(int key_frame_show_index) { const VP9_COMP *cpi = impl_ptr_->cpi; key_frame_group_index_ = 0; key_frame_group_size_ = vp9_get_frames_to_next_key( - &cpi->oxcf, &cpi->frame_info, &cpi->twopass.first_pass_info, - key_frame_show_index, cpi->rc.min_gf_interval); + &cpi->oxcf, &cpi->twopass, key_frame_show_index, cpi->rc.min_gf_interval); assert(key_frame_group_size_ > 0); // Init the reference frame info when a new key frame group appears. InitRefFrameInfo(&ref_frame_info_); @@ -1250,6 +1249,7 @@ int SimpleEncode::GetCodingFrameNum() const { } // These are the default settings for now. + const VP9_COMP *cpi = impl_ptr_->cpi; const int multi_layer_arf = 0; const int allow_alt_ref = 1; vpx_rational_t frame_rate = @@ -1262,11 +1262,13 @@ int SimpleEncode::GetCodingFrameNum() const { fps_init_first_pass_info(&first_pass_info, GetVectorData(impl_ptr_->first_pass_stats), num_frames_); - return vp9_get_coding_frame_num(&oxcf, &frame_info, &first_pass_info, - multi_layer_arf, allow_alt_ref); + return vp9_get_coding_frame_num(&oxcf, &cpi->twopass, &frame_info, + &first_pass_info, multi_layer_arf, + allow_alt_ref); } std::vector SimpleEncode::ComputeKeyFrameMap() const { + const VP9_COMP *cpi = impl_ptr_->cpi; // The last entry of first_pass_stats is the overall stats. assert(impl_ptr_->first_pass_stats.size() == num_frames_ + 1); vpx_rational_t frame_rate = @@ -1274,14 +1276,12 @@ std::vector SimpleEncode::ComputeKeyFrameMap() const { const VP9EncoderConfig oxcf = GetEncodeConfig( frame_width_, frame_height_, frame_rate, target_bitrate_, encode_speed_, VPX_RC_LAST_PASS, impl_ptr_->encode_config_list); - FRAME_INFO frame_info = vp9_get_frame_info(&oxcf); FIRST_PASS_INFO first_pass_info; fps_init_first_pass_info(&first_pass_info, GetVectorData(impl_ptr_->first_pass_stats), num_frames_); std::vector key_frame_map(num_frames_, 0); - vp9_get_key_frame_map(&oxcf, &frame_info, &first_pass_info, - GetVectorData(key_frame_map)); + vp9_get_key_frame_map(&oxcf, &cpi->twopass, GetVectorData(key_frame_map)); return key_frame_map; } From 1c792f2991e632be26414ecb62835bf148c086b4 Mon Sep 17 00:00:00 2001 From: James Zern Date: Thu, 8 Apr 2021 17:34:16 -0700 Subject: [PATCH 0149/1000] vpx_image: clear user provided vpx_image_t early this avoids uninitialized values and potential misuse of them which could lead to a crash should the function fail this is the same fix that was applied in libaom: d0cac70b5 Fix a free on invalid ptr when img allocation fails Bug: webm:1722 Change-Id: If7a8d08c4b010f12e2e1d848613c0fa7328f1f9c --- vpx/src/vpx_image.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vpx/src/vpx_image.c b/vpx/src/vpx_image.c index ff496b5d34..2b7411f94f 100644 --- a/vpx/src/vpx_image.c +++ b/vpx/src/vpx_image.c @@ -24,6 +24,8 @@ static vpx_image_t *img_alloc_helper(vpx_image_t *img, vpx_img_fmt_t fmt, unsigned int stride_in_bytes; int align; + if (img != NULL) memset(img, 0, sizeof(vpx_image_t)); + /* Treat align==0 like align==1 */ if (!buf_align) buf_align = 1; @@ -88,8 +90,6 @@ static vpx_image_t *img_alloc_helper(vpx_image_t *img, vpx_img_fmt_t fmt, if (!img) goto fail; img->self_allocd = 1; - } else { - memset(img, 0, sizeof(vpx_image_t)); } img->img_data = img_data; From c77a7f60040ff63c8dd778cfdd322cd6d7f62e17 Mon Sep 17 00:00:00 2001 From: Paul Wilkins Date: Mon, 12 Apr 2021 13:51:44 +0100 Subject: [PATCH 0150/1000] Removed unused constant Deleted #define that is no longer referenced. Change-Id: If0b132c5a40dd8910f535fffdee7d2d1c7df4748 --- vp9/encoder/vp9_firstpass.c | 1 - 1 file changed, 1 deletion(-) diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index f142f2611f..61ad3b6524 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -54,7 +54,6 @@ #define NCOUNT_INTRA_THRESH 8192 #define NCOUNT_INTRA_FACTOR 3 -#define SR_DIFF_PART 0.0015 #define INTRA_PART 0.005 #define DEFAULT_DECAY_LIMIT 0.75 #define LOW_SR_DIFF_TRHESH 0.1 From 1b07aae9e47a5480f78025dc3434331faaa68e33 Mon Sep 17 00:00:00 2001 From: Cheng Chen Date: Tue, 13 Apr 2021 11:59:40 -0700 Subject: [PATCH 0151/1000] Set vizier rc parameters If pass --use-vizier-rc-params=1, the rc parameters are overwittern by pass in values. It --use-vizier-rc-params=0, the rc parameters remain the default values. Change-Id: I7a3e806e0918f49e8970997379a6e99af6bb7cac --- vp8/vp8_cx_iface.c | 2 + vp9/encoder/vp9_firstpass.c | 125 +++++++----------------------------- vp9/encoder/vp9_firstpass.h | 4 ++ vp9/vp9_cx_iface.c | 4 ++ vpx/vpx_encoder.h | 30 +++++++++ vpxenc.c | 10 ++- 6 files changed, 72 insertions(+), 103 deletions(-) diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c index 1ffd81924f..64d01e535e 100644 --- a/vp8/vp8_cx_iface.c +++ b/vp8/vp8_cx_iface.c @@ -258,6 +258,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, #endif // The range below shall be further tuned. + RANGE_CHECK(cfg, use_vizier_rc_params, 0, 1); RANGE_CHECK(cfg, active_wq_factor.den, 1, 1000); RANGE_CHECK(cfg, base_err_per_mb.den, 1, 1000); RANGE_CHECK(cfg, sr_default_decay_limit.den, 1, 1000); @@ -1302,6 +1303,7 @@ static vpx_codec_enc_cfg_map_t vp8e_usage_cfg_map[] = { { 0 }, /* ts_layer_id */ { 0 }, /* layer_target_bitrate */ 0, /* temporal_layering_mode */ + 0, /* use_vizier_rc_params */ { 0, 1 }, /* active_wq_factor */ { 0, 1 }, /* base_err_per_mb */ { 0, 1 }, /* sr_default_decay_limit */ diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index 07d8eddd0b..9b3d7425e2 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -3493,109 +3493,32 @@ static int is_skippable_frame(const VP9_COMP *cpi) { // Configure image size specific vizier parameters. // Later these will be set via additional command line options static void init_vizier_params(TWO_PASS *const twopass, int screen_area) { - if (1) { - // Force defaults for now - twopass->active_wq_factor = AV_WQ_FACTOR; - twopass->base_err_per_mb = BASELINE_ERR_PER_MB; - twopass->sr_default_decay_limit = DEFAULT_DECAY_LIMIT; - twopass->sr_diff_factor = 1.0; - twopass->gf_frame_max_boost = GF_MAX_FRAME_BOOST; - twopass->gf_max_total_boost = MAX_GF_BOOST; - if (screen_area < 1280 * 720) { - twopass->kf_err_per_mb = 2000.0; - } else if (screen_area < 1920 * 1080) { - twopass->kf_err_per_mb = 500.0; - } else { - twopass->kf_err_per_mb = 250.0; - } - twopass->kf_frame_min_boost = KF_MIN_FRAME_BOOST; - twopass->kf_frame_max_boost_first = KF_MAX_FRAME_BOOST; - twopass->kf_frame_max_boost_subs = twopass->kf_frame_max_boost_first; - twopass->kf_max_total_boost = MAX_KF_TOT_BOOST; - twopass->zm_factor = 1.0; + // When |use_vizier_rc_params| is 1, we expect the rc parameters have been + // initialized by the pass in values. + // Be careful that parameters below are only initialized to 0, if we do not + // pass values to them. It is desired to take care of each parameter when + // using |use_vizier_rc_params|. + if (twopass->use_vizier_rc_params) return; + + // When |use_vizier_rc_params| is 0, use defaults for now. + twopass->active_wq_factor = AV_WQ_FACTOR; + twopass->base_err_per_mb = BASELINE_ERR_PER_MB; + twopass->sr_default_decay_limit = DEFAULT_DECAY_LIMIT; + twopass->sr_diff_factor = 1.0; + twopass->gf_frame_max_boost = GF_MAX_FRAME_BOOST; + twopass->gf_max_total_boost = MAX_GF_BOOST; + if (screen_area < 1280 * 720) { + twopass->kf_err_per_mb = 2000.0; + } else if (screen_area < 1920 * 1080) { + twopass->kf_err_per_mb = 500.0; } else { - // Vizer experimental parameters from training. - // Later these will be set via the command line. - if (screen_area <= 176 * 144) { - twopass->active_wq_factor = 46.0; - twopass->base_err_per_mb = 37597.399760969536; - twopass->sr_default_decay_limit = 0.3905639800962774; - twopass->sr_diff_factor = 6.4; - twopass->gf_frame_max_boost = 87.27362648627846; - twopass->gf_max_total_boost = MAX_GF_BOOST; - twopass->kf_err_per_mb = 1854.8255436877148; - twopass->kf_frame_min_boost = KF_MIN_FRAME_BOOST; - twopass->kf_frame_max_boost_first = 25.5; - twopass->kf_frame_max_boost_subs = twopass->kf_frame_max_boost_first; - twopass->kf_max_total_boost = MAX_KF_TOT_BOOST; - twopass->zm_factor = 1.0; - } else if (screen_area <= 320 * 240) { - twopass->active_wq_factor = 55.0; - twopass->base_err_per_mb = 34525.33177195309; - twopass->sr_default_decay_limit = 0.23901360046804604; - twopass->sr_diff_factor = 5.73; - twopass->gf_frame_max_boost = 127.34978204980285; - twopass->gf_max_total_boost = MAX_GF_BOOST; - twopass->kf_err_per_mb = 723.8337508755031; - twopass->kf_frame_min_boost = KF_MIN_FRAME_BOOST; - twopass->kf_frame_max_boost_first = 185.0; - twopass->kf_frame_max_boost_subs = twopass->kf_frame_max_boost_first; - twopass->kf_max_total_boost = MAX_KF_TOT_BOOST; - twopass->zm_factor = 1.0; - } else if (screen_area <= 640 * 360) { - twopass->active_wq_factor = 12.5; - twopass->base_err_per_mb = 18823.978018028298; - twopass->sr_default_decay_limit = 0.6043527690301296; - twopass->sr_diff_factor = 2.28; - twopass->gf_frame_max_boost = 75.17672317013668; - twopass->gf_max_total_boost = MAX_GF_BOOST; - twopass->kf_err_per_mb = 422.2871502380377; - twopass->kf_frame_min_boost = KF_MIN_FRAME_BOOST; - twopass->kf_frame_max_boost_first = 224.5; - twopass->kf_frame_max_boost_subs = twopass->kf_frame_max_boost_first; - twopass->kf_max_total_boost = MAX_KF_TOT_BOOST; - twopass->zm_factor = 1.0; - } else if (screen_area <= 854 * 480) { - twopass->active_wq_factor = 51.5; - twopass->base_err_per_mb = 33718.98307662595; - twopass->sr_default_decay_limit = 0.33633414970713393; - twopass->sr_diff_factor = 5.8; - twopass->gf_frame_max_boost = 85.2868528581522; - twopass->gf_max_total_boost = MAX_GF_BOOST; - twopass->kf_err_per_mb = 1513.4883914008383; - twopass->kf_frame_min_boost = KF_MIN_FRAME_BOOST; - twopass->kf_frame_max_boost_first = 28.0; - twopass->kf_frame_max_boost_subs = twopass->kf_frame_max_boost_first; - twopass->kf_max_total_boost = MAX_KF_TOT_BOOST; - twopass->zm_factor = 1.0; - } else if (screen_area <= 1280 * 720) { - twopass->active_wq_factor = 41.5; - twopass->base_err_per_mb = 29527.46375825401; - twopass->sr_default_decay_limit = 0.5009117586299728; - twopass->sr_diff_factor = 3.33; - twopass->gf_frame_max_boost = 81.00472969483079; - twopass->gf_max_total_boost = MAX_GF_BOOST; - twopass->kf_err_per_mb = 998.6342911785146; - twopass->kf_frame_min_boost = KF_MIN_FRAME_BOOST; - twopass->kf_frame_max_boost_first = 53.0; - twopass->kf_frame_max_boost_subs = twopass->kf_frame_max_boost_first; - twopass->kf_max_total_boost = MAX_KF_TOT_BOOST; - twopass->zm_factor = 1.0; - } else { - twopass->active_wq_factor = 31.0; - twopass->base_err_per_mb = 34474.723463367416; - twopass->sr_default_decay_limit = 0.23346886902707745; - twopass->sr_diff_factor = 7.6; - twopass->gf_frame_max_boost = 213.2940230360479; - twopass->gf_max_total_boost = MAX_GF_BOOST; - twopass->kf_err_per_mb = 35931.25734431429; - twopass->kf_frame_min_boost = KF_MIN_FRAME_BOOST; - twopass->kf_frame_max_boost_first = 419.5; - twopass->kf_frame_max_boost_subs = twopass->kf_frame_max_boost_first; - twopass->kf_max_total_boost = MAX_KF_TOT_BOOST; - twopass->zm_factor = 1.0; - } + twopass->kf_err_per_mb = 250.0; } + twopass->kf_frame_min_boost = KF_MIN_FRAME_BOOST; + twopass->kf_frame_max_boost_first = KF_MAX_FRAME_BOOST; + twopass->kf_frame_max_boost_subs = twopass->kf_frame_max_boost_first; + twopass->kf_max_total_boost = MAX_KF_TOT_BOOST; + twopass->zm_factor = 1.0; } void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { diff --git a/vp9/encoder/vp9_firstpass.h b/vp9/encoder/vp9_firstpass.h index 441859a342..1418b67a48 100644 --- a/vp9/encoder/vp9_firstpass.h +++ b/vp9/encoder/vp9_firstpass.h @@ -223,6 +223,10 @@ typedef struct { GF_GROUP gf_group; // Vizeir project experimental two pass rate control parameters. + // When |use_vizier_rc_params| is 1, the following parameters will + // be overwritten by pass in values. Otherwise, they are initialized + // by default values. + int use_vizier_rc_params; double active_wq_factor; double base_err_per_mb; double sr_default_decay_limit; diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index 94b1afbcc6..e35b6f1e24 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -350,6 +350,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, RANGE_CHECK(extra_cfg, color_range, VPX_CR_STUDIO_RANGE, VPX_CR_FULL_RANGE); // The range below shall be further tuned. + RANGE_CHECK(cfg, use_vizier_rc_params, 0, 1); RANGE_CHECK(cfg, active_wq_factor.den, 1, 1000); RANGE_CHECK(cfg, base_err_per_mb.den, 1, 1000); RANGE_CHECK(cfg, sr_default_decay_limit.den, 1, 1000); @@ -654,8 +655,10 @@ static vpx_codec_err_t set_encoder_config( static vpx_codec_err_t set_twopass_params_from_config( const vpx_codec_enc_cfg_t *const cfg, struct VP9_COMP *cpi) { + if (!cfg->use_vizier_rc_params) return VPX_CODEC_OK; if (cpi == NULL) return VPX_CODEC_ERROR; + cpi->twopass.use_vizier_rc_params = cfg->use_vizier_rc_params; cpi->twopass.active_wq_factor = (double)cfg->active_wq_factor.num / (double)cfg->active_wq_factor.den; cpi->twopass.base_err_per_mb = @@ -1943,6 +1946,7 @@ static vpx_codec_enc_cfg_map_t encoder_usage_cfg_map[] = { { 0 }, // ts_layer_id { 0 }, // layer_taget_bitrate 0, // temporal_layering_mode + 0, // use_vizier_rc_params { 0, 1 }, // active_wq_factor { 0, 1 }, // base_err_per_mb { 0, 1 }, // sr_default_decay_limit diff --git a/vpx/vpx_encoder.h b/vpx/vpx_encoder.h index 497051e037..3c9304b37a 100644 --- a/vpx/vpx_encoder.h +++ b/vpx/vpx_encoder.h @@ -694,9 +694,17 @@ typedef struct vpx_codec_enc_cfg { */ int temporal_layering_mode; + /*!\brief A flag indicating whether to use external rate control parameters. + * By default is 0. If set to 1, the following parameters will be used in the + * rate control system. + */ + int use_vizier_rc_params; + /*!\brief Active worst quality factor. * * Rate control parameters, set from external experiment results. + * Only when |use_vizier_rc_params| is set to 1, the pass in value will be + * used. Otherwise, the default value is used. * */ vpx_rational_t active_wq_factor; @@ -704,6 +712,8 @@ typedef struct vpx_codec_enc_cfg { /*!\brief Base error per macroblock. * * Rate control parameters, set from external experiment results. + * Only when |use_vizier_rc_params| is set to 1, the pass in value will be + * used. Otherwise, the default value is used. * */ vpx_rational_t base_err_per_mb; @@ -711,6 +721,8 @@ typedef struct vpx_codec_enc_cfg { /*!\brief Second reference default decay limit. * * Rate control parameters, set from external experiment results. + * Only when |use_vizier_rc_params| is set to 1, the pass in value will be + * used. Otherwise, the default value is used. * */ vpx_rational_t sr_default_decay_limit; @@ -718,6 +730,8 @@ typedef struct vpx_codec_enc_cfg { /*!\brief Second reference difference factor. * * Rate control parameters, set from external experiment results. + * Only when |use_vizier_rc_params| is set to 1, the pass in value will be + * used. Otherwise, the default value is used. * */ vpx_rational_t sr_diff_factor; @@ -725,6 +739,8 @@ typedef struct vpx_codec_enc_cfg { /*!\brief Keyframe error per macroblock. * * Rate control parameters, set from external experiment results. + * Only when |use_vizier_rc_params| is set to 1, the pass in value will be + * used. Otherwise, the default value is used. * */ vpx_rational_t kf_err_per_mb; @@ -732,6 +748,8 @@ typedef struct vpx_codec_enc_cfg { /*!\brief Keyframe minimum boost. * * Rate control parameters, set from external experiment results. + * Only when |use_vizier_rc_params| is set to 1, the pass in value will be + * used. Otherwise, the default value is used. * */ vpx_rational_t kf_frame_min_boost; @@ -739,6 +757,8 @@ typedef struct vpx_codec_enc_cfg { /*!\brief Keyframe maximum boost, for the first keyframe in a chunk. * * Rate control parameters, set from external experiment results. + * Only when |use_vizier_rc_params| is set to 1, the pass in value will be + * used. Otherwise, the default value is used. * */ vpx_rational_t kf_frame_max_boost_first; @@ -746,6 +766,8 @@ typedef struct vpx_codec_enc_cfg { /*!\brief Keyframe maximum boost, for subsequent keyframes. * * Rate control parameters, set from external experiment results. + * Only when |use_vizier_rc_params| is set to 1, the pass in value will be + * used. Otherwise, the default value is used. * */ vpx_rational_t kf_frame_max_boost_subs; @@ -753,6 +775,8 @@ typedef struct vpx_codec_enc_cfg { /*!\brief Keyframe maximum total boost. * * Rate control parameters, set from external experiment results. + * Only when |use_vizier_rc_params| is set to 1, the pass in value will be + * used. Otherwise, the default value is used. * */ vpx_rational_t kf_max_total_boost; @@ -760,6 +784,8 @@ typedef struct vpx_codec_enc_cfg { /*!\brief Golden frame maximum total boost. * * Rate control parameters, set from external experiment results. + * Only when |use_vizier_rc_params| is set to 1, the pass in value will be + * used. Otherwise, the default value is used. * */ vpx_rational_t gf_max_total_boost; @@ -767,6 +793,8 @@ typedef struct vpx_codec_enc_cfg { /*!\brief Golden frame maximum boost. * * Rate control parameters, set from external experiment results. + * Only when |use_vizier_rc_params| is set to 1, the pass in value will be + * used. Otherwise, the default value is used. * */ vpx_rational_t gf_frame_max_boost; @@ -774,6 +802,8 @@ typedef struct vpx_codec_enc_cfg { /*!\brief Zero motion power factor. * * Rate control parameters, set from external experiment results. + * Only when |use_vizier_rc_params| is set to 1, the pass in value will be + * used. Otherwise, the default value is used. * */ vpx_rational_t zm_factor; diff --git a/vpxenc.c b/vpxenc.c index 874dddb137..c9e386a06b 100644 --- a/vpxenc.c +++ b/vpxenc.c @@ -288,6 +288,8 @@ static const arg_def_t *rc_args[] = { }; #if CONFIG_VP9_ENCODER +static const arg_def_t use_vizier_rc_params = + ARG_DEF(NULL, "use-vizier-rc-params", 1, "Use vizier rc params"); static const arg_def_t active_wq_factor = ARG_DEF(NULL, "active-wq-factor", 1, "Active worst quality factor"); static const arg_def_t base_err_per_mb = @@ -311,8 +313,9 @@ static const arg_def_t gf_max_total_boost = static const arg_def_t gf_frame_max_boost = ARG_DEF(NULL, "gf-frame-max-boost", 1, "Golden frame max boost"); static const arg_def_t zm_factor = - ARG_DEF(NULL, "zm-power-factor", 1, "Zero motion power factor"); -static const arg_def_t *vizier_rc_args[] = { &active_wq_factor, + ARG_DEF(NULL, "zm-factor", 1, "Zero motion power factor"); +static const arg_def_t *vizier_rc_args[] = { &use_vizier_rc_params, + &active_wq_factor, &base_err_per_mb, &sr_default_decay_limit, &sr_diff_factor, @@ -1026,6 +1029,8 @@ static int parse_stream_params(struct VpxEncoderConfig *global, } else if (arg_match(&arg, &kf_disabled, argi)) { config->cfg.kf_mode = VPX_KF_DISABLED; #if CONFIG_VP9_ENCODER + } else if (arg_match(&arg, &use_vizier_rc_params, argi)) { + config->cfg.use_vizier_rc_params = arg_parse_int(&arg); } else if (arg_match(&arg, &active_wq_factor, argi)) { config->cfg.active_wq_factor = arg_parse_rational(&arg); } else if (arg_match(&arg, &base_err_per_mb, argi)) { @@ -1246,6 +1251,7 @@ static void show_stream_config(struct stream_state *stream, SHOW(kf_min_dist); SHOW(kf_max_dist); // Temporary use for debug + SHOW(use_vizier_rc_params); SHOW(active_wq_factor.num); SHOW(active_wq_factor.den); } From 665cccfd6ccbc1be2db7d550b68388679b573410 Mon Sep 17 00:00:00 2001 From: Cheng Chen Date: Wed, 14 Apr 2021 22:15:54 -0700 Subject: [PATCH 0152/1000] Pass vizier rd parameter values Add command line options for three rd parameters. They are controlled by --use_vizier_rc_params, together with other rc parameters. If not set from command line, current default values will be used. Change-Id: Ie1b9a98a50326551cc1d5940c4b637cb01a61aa0 --- vp8/vp8_cx_iface.c | 6 ++++++ vp9/encoder/vp9_rd.c | 41 ++++++++++------------------------------- vp9/vp9_cx_iface.c | 12 ++++++++++++ vpx/vpx_encoder.h | 30 ++++++++++++++++++++++++++++++ vpxenc.c | 15 +++++++++++++++ 5 files changed, 73 insertions(+), 31 deletions(-) diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c index 64d01e535e..872710f138 100644 --- a/vp8/vp8_cx_iface.c +++ b/vp8/vp8_cx_iface.c @@ -270,6 +270,9 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, RANGE_CHECK(cfg, gf_max_total_boost.den, 1, 1000); RANGE_CHECK(cfg, gf_frame_max_boost.den, 1, 1000); RANGE_CHECK(cfg, zm_factor.den, 1, 1000); + RANGE_CHECK(cfg, rd_mult_inter_qp_fac.den, 1, 1000); + RANGE_CHECK(cfg, rd_mult_arf_qp_fac.den, 1, 1000); + RANGE_CHECK(cfg, rd_mult_key_qp_fac.den, 1, 1000); return VPX_CODEC_OK; } @@ -1316,6 +1319,9 @@ static vpx_codec_enc_cfg_map_t vp8e_usage_cfg_map[] = { { 0, 1 }, /* gf_max_total_boost */ { 0, 1 }, /* gf_frame_max_boost */ { 0, 1 }, /* zm_factor */ + { 1, 1 }, /* rd_mult_inter_qp_fac */ + { 1, 1 }, /* rd_mult_arf_qp_fac */ + { 1, 1 }, /* rd_mult_key_qp_fac */ } }, }; diff --git a/vp9/encoder/vp9_rd.c b/vp9/encoder/vp9_rd.c index 87b9a691a7..9fa3ff1865 100644 --- a/vp9/encoder/vp9_rd.c +++ b/vp9/encoder/vp9_rd.c @@ -202,40 +202,19 @@ static const int rd_frame_type_factor[FRAME_UPDATE_TYPES] = { 128, 144, 128, void vp9_init_rd_parameters(VP9_COMP *cpi) { RD_CONTROL *const rdc = &cpi->rd_ctrl; + // When |use_vizier_rc_params| is 1, we expect the rd parameters have been + // initialized by the pass in values. + // Be careful that parameters below are only initialized to 1, if we do not + // pass values to them. It is desired to take care of each parameter when + // using |use_vizier_rc_params|. + if (cpi->twopass.use_vizier_rc_params) return; + // Make sure this function is floating point safe. vpx_clear_system_state(); - rdc->rd_mult_arf_qp_fac = 1.0; // Default: No Vizier values yet - - // These hard wired estimates for the Vizier values will be removed later - // as the per format factors will be set on the command line. - if (0) { - unsigned int screen_area = (cpi->common.width * cpi->common.height); - - if (screen_area <= 176 * 144) { - rdc->rd_mult_inter_qp_fac = 0.896; - rdc->rd_mult_key_qp_fac = 1.050; - } else if (screen_area <= 320 * 240) { - rdc->rd_mult_inter_qp_fac = 0.998; - rdc->rd_mult_key_qp_fac = 0.952; - } else if (screen_area <= 640 * 360) { - rdc->rd_mult_inter_qp_fac = 0.959; - rdc->rd_mult_key_qp_fac = 1.071; - } else if (screen_area <= 854 * 480) { - rdc->rd_mult_inter_qp_fac = 1.027; - rdc->rd_mult_key_qp_fac = 1.280; - } else if (screen_area <= 1280 * 720) { - rdc->rd_mult_inter_qp_fac = 1.004; - rdc->rd_mult_key_qp_fac = 1.193; - } else { - rdc->rd_mult_inter_qp_fac = 0.874; - rdc->rd_mult_key_qp_fac = 0.837; - } - } else { - // For now force defaults unless testing - rdc->rd_mult_inter_qp_fac = 1.0; - rdc->rd_mult_key_qp_fac = 1.0; - } + rdc->rd_mult_inter_qp_fac = 1.0; + rdc->rd_mult_arf_qp_fac = 1.0; + rdc->rd_mult_key_qp_fac = 1.0; } // Returns the default rd multiplier for inter frames for a given qindex. diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index e35b6f1e24..c2ca215f10 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -362,6 +362,9 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, RANGE_CHECK(cfg, gf_max_total_boost.den, 1, 1000); RANGE_CHECK(cfg, gf_frame_max_boost.den, 1, 1000); RANGE_CHECK(cfg, zm_factor.den, 1, 1000); + RANGE_CHECK(cfg, rd_mult_inter_qp_fac.den, 1, 1000); + RANGE_CHECK(cfg, rd_mult_arf_qp_fac.den, 1, 1000); + RANGE_CHECK(cfg, rd_mult_key_qp_fac.den, 1, 1000); return VPX_CODEC_OK; } @@ -686,6 +689,12 @@ static vpx_codec_err_t set_twopass_params_from_config( (double)cfg->gf_frame_max_boost.num / (double)cfg->gf_frame_max_boost.den; cpi->twopass.zm_factor = (double)cfg->zm_factor.num / (double)cfg->zm_factor.den; + cpi->rd_ctrl.rd_mult_inter_qp_fac = (double)cfg->rd_mult_inter_qp_fac.num / + (double)cfg->rd_mult_inter_qp_fac.den; + cpi->rd_ctrl.rd_mult_arf_qp_fac = + (double)cfg->rd_mult_arf_qp_fac.num / (double)cfg->rd_mult_arf_qp_fac.den; + cpi->rd_ctrl.rd_mult_key_qp_fac = + (double)cfg->rd_mult_key_qp_fac.num / (double)cfg->rd_mult_key_qp_fac.den; return VPX_CODEC_OK; } @@ -1959,6 +1968,9 @@ static vpx_codec_enc_cfg_map_t encoder_usage_cfg_map[] = { { 0, 1 }, // gf_max_total_boost { 0, 1 }, // gf_frame_max_boost { 0, 1 }, // zm_factor + { 1, 1 }, // rd_mult_inter_qp_fac + { 1, 1 }, // rd_mult_arf_qp_fac + { 1, 1 }, // rd_mult_key_qp_fac } }, }; diff --git a/vpx/vpx_encoder.h b/vpx/vpx_encoder.h index 3c9304b37a..255cb693ee 100644 --- a/vpx/vpx_encoder.h +++ b/vpx/vpx_encoder.h @@ -807,6 +807,36 @@ typedef struct vpx_codec_enc_cfg { * */ vpx_rational_t zm_factor; + + /*!\brief Rate-distortion multiplier for inter frames. + * The multiplier is a crucial parameter in the calculation of rate distortion + * cost. It is often related to the qp (qindex) value. + * Rate control parameters, could be set from external experiment results. + * Only when |use_vizier_rc_params| is set to 1, the pass in value will be + * used. Otherwise, the default value is used. + * + */ + vpx_rational_t rd_mult_inter_qp_fac; + + /*!\brief Rate-distortion multiplier for alt-ref frames. + * The multiplier is a crucial parameter in the calculation of rate distortion + * cost. It is often related to the qp (qindex) value. + * Rate control parameters, could be set from external experiment results. + * Only when |use_vizier_rc_params| is set to 1, the pass in value will be + * used. Otherwise, the default value is used. + * + */ + vpx_rational_t rd_mult_arf_qp_fac; + + /*!\brief Rate-distortion multiplier for key frames. + * The multiplier is a crucial parameter in the calculation of rate distortion + * cost. It is often related to the qp (qindex) value. + * Rate control parameters, could be set from external experiment results. + * Only when |use_vizier_rc_params| is set to 1, the pass in value will be + * used. Otherwise, the default value is used. + * + */ + vpx_rational_t rd_mult_key_qp_fac; } vpx_codec_enc_cfg_t; /**< alias for struct vpx_codec_enc_cfg */ /*!\brief vp9 svc extra configure parameters diff --git a/vpxenc.c b/vpxenc.c index c9e386a06b..1a2b4a9214 100644 --- a/vpxenc.c +++ b/vpxenc.c @@ -314,6 +314,12 @@ static const arg_def_t gf_frame_max_boost = ARG_DEF(NULL, "gf-frame-max-boost", 1, "Golden frame max boost"); static const arg_def_t zm_factor = ARG_DEF(NULL, "zm-factor", 1, "Zero motion power factor"); +static const arg_def_t rd_mult_inter_qp_fac = + ARG_DEF(NULL, "rd-mult-inter-qp-fac", 1, "RD multiplier for inter frames"); +static const arg_def_t rd_mult_arf_qp_fac = + ARG_DEF(NULL, "rd-mult-arf-qp-fac", 1, "RD multiplier for alt-ref frames"); +static const arg_def_t rd_mult_key_qp_fac = + ARG_DEF(NULL, "rd-mult-key-qp-fac", 1, "RD multiplier for key frames"); static const arg_def_t *vizier_rc_args[] = { &use_vizier_rc_params, &active_wq_factor, &base_err_per_mb, @@ -327,6 +333,9 @@ static const arg_def_t *vizier_rc_args[] = { &use_vizier_rc_params, &gf_max_total_boost, &gf_frame_max_boost, &zm_factor, + &rd_mult_inter_qp_fac, + &rd_mult_arf_qp_fac, + &rd_mult_key_qp_fac, NULL }; #endif @@ -1055,6 +1064,12 @@ static int parse_stream_params(struct VpxEncoderConfig *global, config->cfg.gf_frame_max_boost = arg_parse_rational(&arg); } else if (arg_match(&arg, &zm_factor, argi)) { config->cfg.zm_factor = arg_parse_rational(&arg); + } else if (arg_match(&arg, &rd_mult_inter_qp_fac, argi)) { + config->cfg.rd_mult_inter_qp_fac = arg_parse_rational(&arg); + } else if (arg_match(&arg, &rd_mult_arf_qp_fac, argi)) { + config->cfg.rd_mult_arf_qp_fac = arg_parse_rational(&arg); + } else if (arg_match(&arg, &rd_mult_key_qp_fac, argi)) { + config->cfg.rd_mult_key_qp_fac = arg_parse_rational(&arg); #endif #if CONFIG_VP9_HIGHBITDEPTH } else if (arg_match(&arg, &test16bitinternalarg, argi)) { From c911c2d9c51698548c4529a4c35fb495ea95c435 Mon Sep 17 00:00:00 2001 From: Paul Wilkins Date: Tue, 20 Apr 2021 17:26:22 +0100 Subject: [PATCH 0153/1000] Further normalization of Vizier parameters. Further changes to normalize the Vizier command line parameters. The intent is that the default behavior for any given parameter is signaled by the value 1.0 (expressed on the command line as a rational). The final values used in the two pass code are obtained by multiplying the passed in factors by a default values if use_vizier_rc_params is 1. Where use_vizier_rc_params is 0 the values are explicitly set to the defaults. This patch also changes the default value of each parameter to 1.0 even if not set explicitly. This should ensure safe /default behavior if the user sets use_vizier_rc_params to 1 but does not set all the the individual parameters. Change-Id: Ied08b3c22df18f42f446a4cc9363473cad097f69 --- vp8/vp8_cx_iface.c | 38 +++++++-------- vp9/encoder/vp9_firstpass.c | 85 +++++++++++++++++++++------------ vp9/encoder/vp9_firstpass.h | 6 +-- vp9/vp9_cx_iface.c | 74 +++++++++++++++-------------- vpx/vpx_encoder.h | 33 ++++++------- vpxenc.c | 94 ++++++++++++++++++++----------------- 6 files changed, 182 insertions(+), 148 deletions(-) diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c index 872710f138..6e12e38989 100644 --- a/vp8/vp8_cx_iface.c +++ b/vp8/vp8_cx_iface.c @@ -260,15 +260,15 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, // The range below shall be further tuned. RANGE_CHECK(cfg, use_vizier_rc_params, 0, 1); RANGE_CHECK(cfg, active_wq_factor.den, 1, 1000); - RANGE_CHECK(cfg, base_err_per_mb.den, 1, 1000); + RANGE_CHECK(cfg, err_per_mb_factor.den, 1, 1000); RANGE_CHECK(cfg, sr_default_decay_limit.den, 1, 1000); RANGE_CHECK(cfg, sr_diff_factor.den, 1, 1000); - RANGE_CHECK(cfg, kf_err_per_mb.den, 1, 1000); - RANGE_CHECK(cfg, kf_frame_min_boost.den, 1, 1000); - RANGE_CHECK(cfg, kf_frame_max_boost_subs.den, 1, 1000); - RANGE_CHECK(cfg, kf_max_total_boost.den, 1, 1000); - RANGE_CHECK(cfg, gf_max_total_boost.den, 1, 1000); - RANGE_CHECK(cfg, gf_frame_max_boost.den, 1, 1000); + RANGE_CHECK(cfg, kf_err_per_mb_factor.den, 1, 1000); + RANGE_CHECK(cfg, kf_frame_min_boost_factor.den, 1, 1000); + RANGE_CHECK(cfg, kf_frame_max_boost_subs_factor.den, 1, 1000); + RANGE_CHECK(cfg, kf_max_total_boost_factor.den, 1, 1000); + RANGE_CHECK(cfg, gf_max_total_boost_factor.den, 1, 1000); + RANGE_CHECK(cfg, gf_frame_max_boost_factor.den, 1, 1000); RANGE_CHECK(cfg, zm_factor.den, 1, 1000); RANGE_CHECK(cfg, rd_mult_inter_qp_fac.den, 1, 1000); RANGE_CHECK(cfg, rd_mult_arf_qp_fac.den, 1, 1000); @@ -1307,18 +1307,18 @@ static vpx_codec_enc_cfg_map_t vp8e_usage_cfg_map[] = { { 0 }, /* layer_target_bitrate */ 0, /* temporal_layering_mode */ 0, /* use_vizier_rc_params */ - { 0, 1 }, /* active_wq_factor */ - { 0, 1 }, /* base_err_per_mb */ - { 0, 1 }, /* sr_default_decay_limit */ - { 0, 1 }, /* sr_diff_factor */ - { 0, 1 }, /* kf_err_per_mb */ - { 0, 1 }, /* kf_frame_min_boost */ - { 0, 1 }, /* kf_frame_max_boost_first */ - { 0, 1 }, /* kf_frame_max_boost_subs */ - { 0, 1 }, /* kf_max_total_boost */ - { 0, 1 }, /* gf_max_total_boost */ - { 0, 1 }, /* gf_frame_max_boost */ - { 0, 1 }, /* zm_factor */ + { 1, 1 }, /* active_wq_factor */ + { 1, 1 }, /* err_per_mb_factor */ + { 1, 1 }, /* sr_default_decay_limit */ + { 1, 1 }, /* sr_diff_factor */ + { 1, 1 }, /* kf_err_per_mb_factor */ + { 1, 1 }, /* kf_frame_min_boost_factor */ + { 1, 1 }, /* kf_frame_max_boost_first_factor */ + { 1, 1 }, /* kf_frame_max_boost_subs_factor */ + { 1, 1 }, /* kf_max_total_boost_factor */ + { 1, 1 }, /* gf_max_total_boost_factor */ + { 1, 1 }, /* gf_frame_max_boost_factor */ + { 1, 1 }, /* zm_factor */ { 1, 1 }, /* rd_mult_inter_qp_fac */ { 1, 1 }, /* rd_mult_arf_qp_fac */ { 1, 1 }, /* rd_mult_key_qp_fac */ diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index 76ef11909c..a4717ad036 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -1878,8 +1878,7 @@ static double get_prediction_decay_rate(const TWO_PASS *const twopass, const FIRSTPASS_STATS *frame_stats) { const double sr_decay_rate = get_sr_decay_rate(twopass, frame_stats); double zero_motion_factor = - twopass->zm_factor * DEFAULT_ZM_FACTOR * - (frame_stats->pcnt_inter - frame_stats->pcnt_motion); + twopass->zm_factor * (frame_stats->pcnt_inter - frame_stats->pcnt_motion); // Clamp value to range 0.0 to 1.0 // This should happen anyway if input values are sensibly clamped but checked @@ -1981,7 +1980,7 @@ static double calc_frame_boost(const FRAME_INFO *frame_info, const double active_area = calculate_active_area(frame_info, this_frame); // Underlying boost factor is based on inter error ratio. - frame_boost = (twopass->base_err_per_mb * active_area) / + frame_boost = (twopass->err_per_mb * active_area) / DOUBLE_DIVIDE_CHECK(this_frame->coded_error); // Small adjustment for cases where there is a zoom out @@ -2027,7 +2026,7 @@ static double calc_kf_frame_boost(VP9_COMP *cpi, // This value is in line with the minimum per frame boost in the alt_ref // boost calculation. frame_boost = - ((frame_boost + twopass->kf_frame_min_boost) * boost_q_correction); + (frame_boost + twopass->kf_frame_min_boost) * boost_q_correction; // Maximum allowed boost this frame. May be different for first vs subsequent // key frames. @@ -2861,7 +2860,7 @@ static void define_gf_group(VP9_COMP *cpi, int gf_start_show_idx) { const int arf_boost = compute_arf_boost(frame_info, twopass, gld_show_idx, f_frames, b_frames, avg_inter_frame_qindex); - rc->gfu_boost = VPXMIN(twopass->gf_max_total_boost, arf_boost); + rc->gfu_boost = VPXMIN((int)twopass->gf_max_total_boost, arf_boost); rc->source_alt_ref_pending = 0; } @@ -3429,12 +3428,12 @@ static void find_next_key_frame(VP9_COMP *cpi, int kf_show_idx) { // Special case for static / slide show content but dont apply // if the kf group is very short. if ((zero_motion_accumulator > 0.99) && (rc->frames_to_key > 8)) { - rc->kf_boost = twopass->kf_max_total_boost; + rc->kf_boost = (int)(twopass->kf_max_total_boost); } else { // Apply various clamps for min and max oost rc->kf_boost = VPXMAX((int)boost_score, (rc->frames_to_key * 3)); rc->kf_boost = VPXMAX(rc->kf_boost, MIN_KF_TOT_BOOST); - rc->kf_boost = VPXMIN(rc->kf_boost, twopass->kf_max_total_boost); + rc->kf_boost = VPXMIN(rc->kf_boost, (int)(twopass->kf_max_total_boost)); } // Work out how many bits to allocate for the key frame itself. @@ -3492,32 +3491,56 @@ static int is_skippable_frame(const VP9_COMP *cpi) { // Configure image size specific vizier parameters. // Later these will be set via additional command line options static void init_vizier_params(TWO_PASS *const twopass, int screen_area) { - // When |use_vizier_rc_params| is 1, we expect the rc parameters have been - // initialized by the pass in values. - // Be careful that parameters below are only initialized to 0, if we do not - // pass values to them. It is desired to take care of each parameter when - // using |use_vizier_rc_params|. - if (twopass->use_vizier_rc_params) return; - - // When |use_vizier_rc_params| is 0, use defaults for now. - twopass->active_wq_factor = AV_WQ_FACTOR; - twopass->base_err_per_mb = BASELINE_ERR_PER_MB; - twopass->sr_default_decay_limit = DEFAULT_DECAY_LIMIT; - twopass->sr_diff_factor = 1.0; - twopass->gf_frame_max_boost = GF_MAX_FRAME_BOOST; - twopass->gf_max_total_boost = MAX_GF_BOOST; - if (screen_area < 1280 * 720) { - twopass->kf_err_per_mb = 2000.0; - } else if (screen_area < 1920 * 1080) { - twopass->kf_err_per_mb = 500.0; + // When |use_vizier_rc_params| is 1, we expect the rc parameters below to + // have been initialised on the command line as adjustment factors such + // that a factor of 1.0 will match the default behavior when + // |use_vizier_rc_params| is 0 + if (twopass->use_vizier_rc_params) { + twopass->active_wq_factor *= AV_WQ_FACTOR; + twopass->err_per_mb *= BASELINE_ERR_PER_MB; + twopass->sr_default_decay_limit *= DEFAULT_DECAY_LIMIT; + twopass->sr_diff_factor *= 1.0; + twopass->gf_frame_max_boost *= GF_MAX_FRAME_BOOST; + twopass->gf_max_total_boost *= MAX_GF_BOOST; + twopass->kf_frame_min_boost *= KF_MIN_FRAME_BOOST; + twopass->kf_frame_max_boost_first *= KF_MAX_FRAME_BOOST; + twopass->kf_frame_max_boost_subs *= KF_MAX_FRAME_BOOST; + twopass->kf_max_total_boost *= MAX_KF_TOT_BOOST; + twopass->zm_factor *= DEFAULT_ZM_FACTOR; + + // Correction for the fact that the kf_err_per_mb_factor default is + // already different for different video formats and ensures that a passed + // in value of 1.0 on the vizier command line will still match the current + // default. + if (screen_area < 1280 * 720) { + twopass->kf_err_per_mb *= 2000.0; + } else if (screen_area < 1920 * 1080) { + twopass->kf_err_per_mb *= 500.0; + } else { + twopass->kf_err_per_mb *= 250.0; + } } else { - twopass->kf_err_per_mb = 250.0; + // When |use_vizier_rc_params| is 0, use defaults. + twopass->active_wq_factor = AV_WQ_FACTOR; + twopass->err_per_mb = BASELINE_ERR_PER_MB; + twopass->sr_default_decay_limit = DEFAULT_DECAY_LIMIT; + twopass->sr_diff_factor = 1.0; + twopass->gf_frame_max_boost = GF_MAX_FRAME_BOOST; + twopass->gf_max_total_boost = MAX_GF_BOOST; + twopass->kf_frame_min_boost = KF_MIN_FRAME_BOOST; + twopass->kf_frame_max_boost_first = KF_MAX_FRAME_BOOST; + twopass->kf_frame_max_boost_subs = KF_MAX_FRAME_BOOST; + twopass->kf_max_total_boost = MAX_KF_TOT_BOOST; + twopass->zm_factor = DEFAULT_ZM_FACTOR; + + if (screen_area < 1280 * 720) { + twopass->kf_err_per_mb = 2000.0; + } else if (screen_area < 1920 * 1080) { + twopass->kf_err_per_mb = 500.0; + } else { + twopass->kf_err_per_mb = 250.0; + } } - twopass->kf_frame_min_boost = KF_MIN_FRAME_BOOST; - twopass->kf_frame_max_boost_first = KF_MAX_FRAME_BOOST; - twopass->kf_frame_max_boost_subs = twopass->kf_frame_max_boost_first; - twopass->kf_max_total_boost = MAX_KF_TOT_BOOST; - twopass->zm_factor = 1.0; } void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { diff --git a/vp9/encoder/vp9_firstpass.h b/vp9/encoder/vp9_firstpass.h index 1418b67a48..cdbcb52412 100644 --- a/vp9/encoder/vp9_firstpass.h +++ b/vp9/encoder/vp9_firstpass.h @@ -228,15 +228,15 @@ typedef struct { // by default values. int use_vizier_rc_params; double active_wq_factor; - double base_err_per_mb; + double err_per_mb; double sr_default_decay_limit; double sr_diff_factor; double kf_err_per_mb; double kf_frame_min_boost; double kf_frame_max_boost_first; // Max for first kf in a chunk. double kf_frame_max_boost_subs; // Max for subsequent mid chunk kfs. - int kf_max_total_boost; - int gf_max_total_boost; + double kf_max_total_boost; + double gf_max_total_boost; double gf_frame_max_boost; double zm_factor; } TWO_PASS; diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index c2ca215f10..c700620ef3 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -352,15 +352,15 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, // The range below shall be further tuned. RANGE_CHECK(cfg, use_vizier_rc_params, 0, 1); RANGE_CHECK(cfg, active_wq_factor.den, 1, 1000); - RANGE_CHECK(cfg, base_err_per_mb.den, 1, 1000); + RANGE_CHECK(cfg, err_per_mb_factor.den, 1, 1000); RANGE_CHECK(cfg, sr_default_decay_limit.den, 1, 1000); RANGE_CHECK(cfg, sr_diff_factor.den, 1, 1000); - RANGE_CHECK(cfg, kf_err_per_mb.den, 1, 1000); - RANGE_CHECK(cfg, kf_frame_min_boost.den, 1, 1000); - RANGE_CHECK(cfg, kf_frame_max_boost_subs.den, 1, 1000); - RANGE_CHECK(cfg, kf_max_total_boost.den, 1, 1000); - RANGE_CHECK(cfg, gf_max_total_boost.den, 1, 1000); - RANGE_CHECK(cfg, gf_frame_max_boost.den, 1, 1000); + RANGE_CHECK(cfg, kf_err_per_mb_factor.den, 1, 1000); + RANGE_CHECK(cfg, kf_frame_min_boost_factor.den, 1, 1000); + RANGE_CHECK(cfg, kf_frame_max_boost_subs_factor.den, 1, 1000); + RANGE_CHECK(cfg, kf_max_total_boost_factor.den, 1, 1000); + RANGE_CHECK(cfg, gf_max_total_boost_factor.den, 1, 1000); + RANGE_CHECK(cfg, gf_frame_max_boost_factor.den, 1, 1000); RANGE_CHECK(cfg, zm_factor.den, 1, 1000); RANGE_CHECK(cfg, rd_mult_inter_qp_fac.den, 1, 1000); RANGE_CHECK(cfg, rd_mult_arf_qp_fac.den, 1, 1000); @@ -662,31 +662,35 @@ static vpx_codec_err_t set_twopass_params_from_config( if (cpi == NULL) return VPX_CODEC_ERROR; cpi->twopass.use_vizier_rc_params = cfg->use_vizier_rc_params; + + // The values set here are factors that will be applied to default values + // to get the final value used in the two pass code. 1.0 will hence + // match the default behaviour when not using passed in values. cpi->twopass.active_wq_factor = (double)cfg->active_wq_factor.num / (double)cfg->active_wq_factor.den; - cpi->twopass.base_err_per_mb = - (double)cfg->base_err_per_mb.num / (double)cfg->base_err_per_mb.den; + cpi->twopass.err_per_mb = + (double)cfg->err_per_mb_factor.num / (double)cfg->err_per_mb_factor.den; cpi->twopass.sr_default_decay_limit = (double)cfg->sr_default_decay_limit.num / (double)cfg->sr_default_decay_limit.den; cpi->twopass.sr_diff_factor = (double)cfg->sr_diff_factor.num / (double)cfg->sr_diff_factor.den; - cpi->twopass.kf_err_per_mb = - (double)cfg->kf_err_per_mb.num / (double)cfg->kf_err_per_mb.den; - cpi->twopass.kf_frame_min_boost = - (double)cfg->kf_frame_min_boost.num / (double)cfg->kf_frame_min_boost.den; + cpi->twopass.kf_err_per_mb = (double)cfg->kf_err_per_mb_factor.num / + (double)cfg->kf_err_per_mb_factor.den; + cpi->twopass.kf_frame_min_boost = (double)cfg->kf_frame_min_boost_factor.num / + (double)cfg->kf_frame_min_boost_factor.den; cpi->twopass.kf_frame_max_boost_first = - (double)cfg->kf_frame_max_boost_first.num / - (double)cfg->kf_frame_max_boost_first.den; + (double)cfg->kf_frame_max_boost_first_factor.num / + (double)cfg->kf_frame_max_boost_first_factor.den; cpi->twopass.kf_frame_max_boost_subs = - (double)cfg->kf_frame_max_boost_subs.num / - (double)cfg->kf_frame_max_boost_subs.den; - cpi->twopass.kf_max_total_boost = (int)((double)cfg->kf_max_total_boost.num / - (double)cfg->kf_max_total_boost.den); - cpi->twopass.gf_max_total_boost = (int)((double)cfg->gf_max_total_boost.num / - (double)cfg->gf_max_total_boost.den); - cpi->twopass.gf_frame_max_boost = - (double)cfg->gf_frame_max_boost.num / (double)cfg->gf_frame_max_boost.den; + (double)cfg->kf_frame_max_boost_subs_factor.num / + (double)cfg->kf_frame_max_boost_subs_factor.den; + cpi->twopass.kf_max_total_boost = (double)cfg->kf_max_total_boost_factor.num / + (double)cfg->kf_max_total_boost_factor.den; + cpi->twopass.gf_max_total_boost = (double)cfg->gf_max_total_boost_factor.num / + (double)cfg->gf_max_total_boost_factor.den; + cpi->twopass.gf_frame_max_boost = (double)cfg->gf_frame_max_boost_factor.num / + (double)cfg->gf_frame_max_boost_factor.den; cpi->twopass.zm_factor = (double)cfg->zm_factor.num / (double)cfg->zm_factor.den; cpi->rd_ctrl.rd_mult_inter_qp_fac = (double)cfg->rd_mult_inter_qp_fac.num / @@ -1956,18 +1960,18 @@ static vpx_codec_enc_cfg_map_t encoder_usage_cfg_map[] = { { 0 }, // layer_taget_bitrate 0, // temporal_layering_mode 0, // use_vizier_rc_params - { 0, 1 }, // active_wq_factor - { 0, 1 }, // base_err_per_mb - { 0, 1 }, // sr_default_decay_limit - { 0, 1 }, // sr_diff_factor - { 0, 1 }, // kf_err_per_mb - { 0, 1 }, // kf_frame_min_boost - { 0, 1 }, // kf_frame_max_boost_first - { 0, 1 }, // kf_frame_max_boost_subs - { 0, 1 }, // kf_max_total_boost - { 0, 1 }, // gf_max_total_boost - { 0, 1 }, // gf_frame_max_boost - { 0, 1 }, // zm_factor + { 1, 1 }, // active_wq_factor + { 1, 1 }, // err_per_mb_factor + { 1, 1 }, // sr_default_decay_limit + { 1, 1 }, // sr_diff_factor + { 1, 1 }, // kf_err_per_mb_factor + { 1, 1 }, // kf_frame_min_boost_factor + { 1, 1 }, // kf_frame_max_boost_first_factor + { 1, 1 }, // kf_frame_max_boost_subs_factor + { 1, 1 }, // kf_max_total_boost_factor + { 1, 1 }, // gf_max_total_boost_factor + { 1, 1 }, // gf_frame_max_boost_factor + { 1, 1 }, // zm_factor { 1, 1 }, // rd_mult_inter_qp_fac { 1, 1 }, // rd_mult_arf_qp_fac { 1, 1 }, // rd_mult_key_qp_fac diff --git a/vpx/vpx_encoder.h b/vpx/vpx_encoder.h index 255cb693ee..d62d9ee7e1 100644 --- a/vpx/vpx_encoder.h +++ b/vpx/vpx_encoder.h @@ -709,14 +709,14 @@ typedef struct vpx_codec_enc_cfg { */ vpx_rational_t active_wq_factor; - /*!\brief Base error per macroblock. + /*!\brief Error per macroblock adjustment factor. * * Rate control parameters, set from external experiment results. * Only when |use_vizier_rc_params| is set to 1, the pass in value will be * used. Otherwise, the default value is used. * */ - vpx_rational_t base_err_per_mb; + vpx_rational_t err_per_mb_factor; /*!\brief Second reference default decay limit. * @@ -736,68 +736,69 @@ typedef struct vpx_codec_enc_cfg { */ vpx_rational_t sr_diff_factor; - /*!\brief Keyframe error per macroblock. + /*!\brief Keyframe error per macroblock adjustment factor. * * Rate control parameters, set from external experiment results. * Only when |use_vizier_rc_params| is set to 1, the pass in value will be * used. Otherwise, the default value is used. * */ - vpx_rational_t kf_err_per_mb; + vpx_rational_t kf_err_per_mb_factor; - /*!\brief Keyframe minimum boost. + /*!\brief Keyframe minimum boost adjustment factor. * * Rate control parameters, set from external experiment results. * Only when |use_vizier_rc_params| is set to 1, the pass in value will be * used. Otherwise, the default value is used. * */ - vpx_rational_t kf_frame_min_boost; + vpx_rational_t kf_frame_min_boost_factor; - /*!\brief Keyframe maximum boost, for the first keyframe in a chunk. + /*!\brief Keyframe maximum boost adjustment factor, for the first keyframe + * in a chunk. * * Rate control parameters, set from external experiment results. * Only when |use_vizier_rc_params| is set to 1, the pass in value will be * used. Otherwise, the default value is used. * */ - vpx_rational_t kf_frame_max_boost_first; + vpx_rational_t kf_frame_max_boost_first_factor; - /*!\brief Keyframe maximum boost, for subsequent keyframes. + /*!\brief Keyframe maximum boost adjustment factor, for subsequent keyframes. * * Rate control parameters, set from external experiment results. * Only when |use_vizier_rc_params| is set to 1, the pass in value will be * used. Otherwise, the default value is used. * */ - vpx_rational_t kf_frame_max_boost_subs; + vpx_rational_t kf_frame_max_boost_subs_factor; - /*!\brief Keyframe maximum total boost. + /*!\brief Keyframe maximum total boost adjustment factor. * * Rate control parameters, set from external experiment results. * Only when |use_vizier_rc_params| is set to 1, the pass in value will be * used. Otherwise, the default value is used. * */ - vpx_rational_t kf_max_total_boost; + vpx_rational_t kf_max_total_boost_factor; - /*!\brief Golden frame maximum total boost. + /*!\brief Golden frame maximum total boost adjustment factor. * * Rate control parameters, set from external experiment results. * Only when |use_vizier_rc_params| is set to 1, the pass in value will be * used. Otherwise, the default value is used. * */ - vpx_rational_t gf_max_total_boost; + vpx_rational_t gf_max_total_boost_factor; - /*!\brief Golden frame maximum boost. + /*!\brief Golden frame maximum boost adjustment factor. * * Rate control parameters, set from external experiment results. * Only when |use_vizier_rc_params| is set to 1, the pass in value will be * used. Otherwise, the default value is used. * */ - vpx_rational_t gf_frame_max_boost; + vpx_rational_t gf_frame_max_boost_factor; /*!\brief Zero motion power factor. * diff --git a/vpxenc.c b/vpxenc.c index 1a2b4a9214..276ee9b902 100644 --- a/vpxenc.c +++ b/vpxenc.c @@ -292,46 +292,52 @@ static const arg_def_t use_vizier_rc_params = ARG_DEF(NULL, "use-vizier-rc-params", 1, "Use vizier rc params"); static const arg_def_t active_wq_factor = ARG_DEF(NULL, "active-wq-factor", 1, "Active worst quality factor"); -static const arg_def_t base_err_per_mb = - ARG_DEF(NULL, "base-err-per-mb", 1, "Base error per macroblock"); +static const arg_def_t err_per_mb_factor = + ARG_DEF(NULL, "err-per-mb-factor", 1, "Error per macroblock factor"); static const arg_def_t sr_default_decay_limit = ARG_DEF( NULL, "sr-default-decay-limit", 1, "Second reference default decay limit"); static const arg_def_t sr_diff_factor = ARG_DEF(NULL, "sr-diff-factor", 1, "Second reference diff factor"); -static const arg_def_t kf_err_per_mb = - ARG_DEF(NULL, "kf-err-per-mb", 1, "Keyframe error per macroblock"); -static const arg_def_t kf_frame_min_boost = - ARG_DEF(NULL, "kf-frame-min-boost", 1, "Keyframe min boost"); -static const arg_def_t kf_frame_max_boost_first = ARG_DEF( - NULL, "kf-frame-max-boost-first", 1, "Max for the first keyframe boost"); -static const arg_def_t kf_frame_max_boost_subs = ARG_DEF( - NULL, "kf-frame-max-boost-subs", 1, "Max for subsequent keyframe boost"); -static const arg_def_t kf_max_total_boost = - ARG_DEF(NULL, "kf-max-total-boost", 1, "Keyframe max total boost"); -static const arg_def_t gf_max_total_boost = - ARG_DEF(NULL, "gf-max-total-boost", 1, "Golden frame max total boost"); -static const arg_def_t gf_frame_max_boost = - ARG_DEF(NULL, "gf-frame-max-boost", 1, "Golden frame max boost"); +static const arg_def_t kf_err_per_mb_factor = ARG_DEF( + NULL, "kf-err-per-mb-factor", 1, "Keyframe error per macroblock factor"); +static const arg_def_t kf_frame_min_boost_factor = + ARG_DEF(NULL, "kf-frame-min-boost-factor", 1, "Keyframe min boost"); +static const arg_def_t kf_frame_max_boost_first_factor = + ARG_DEF(NULL, "kf-frame-max-boost-first-factor", 1, + "Max keyframe boost adjustment factor for first frame"); +static const arg_def_t kf_frame_max_boost_subs_factor = + ARG_DEF(NULL, "kf-frame-max-boost-subs-factor", 1, + "Max boost adjustment factor for subsequent KFs"); +static const arg_def_t kf_max_total_boost_factor = ARG_DEF( + NULL, "kf-max-total-boost-factor", 1, "Keyframe max total boost factor"); +static const arg_def_t gf_max_total_boost_factor = + ARG_DEF(NULL, "gf-max-total-boost-factor", 1, + "Golden frame max total boost factor"); +static const arg_def_t gf_frame_max_boost_factor = + ARG_DEF(NULL, "gf-frame-max-boost-factor", 1, + "Golden frame max per frame boost factor"); static const arg_def_t zm_factor = ARG_DEF(NULL, "zm-factor", 1, "Zero motion power factor"); static const arg_def_t rd_mult_inter_qp_fac = - ARG_DEF(NULL, "rd-mult-inter-qp-fac", 1, "RD multiplier for inter frames"); + ARG_DEF(NULL, "rd-mult-inter-qp-fac", 1, + "RD multiplier adjustment for inter frames"); static const arg_def_t rd_mult_arf_qp_fac = - ARG_DEF(NULL, "rd-mult-arf-qp-fac", 1, "RD multiplier for alt-ref frames"); -static const arg_def_t rd_mult_key_qp_fac = - ARG_DEF(NULL, "rd-mult-key-qp-fac", 1, "RD multiplier for key frames"); + ARG_DEF(NULL, "rd-mult-arf-qp-fac", 1, + "RD multiplier adjustment for alt-ref frames"); +static const arg_def_t rd_mult_key_qp_fac = ARG_DEF( + NULL, "rd-mult-key-qp-fac", 1, "RD multiplier adjustment for key frames"); static const arg_def_t *vizier_rc_args[] = { &use_vizier_rc_params, &active_wq_factor, - &base_err_per_mb, + &err_per_mb_factor, &sr_default_decay_limit, &sr_diff_factor, - &kf_err_per_mb, - &kf_frame_min_boost, - &kf_frame_max_boost_first, - &kf_frame_max_boost_subs, - &kf_max_total_boost, - &gf_max_total_boost, - &gf_frame_max_boost, + &kf_err_per_mb_factor, + &kf_frame_min_boost_factor, + &kf_frame_max_boost_first_factor, + &kf_frame_max_boost_subs_factor, + &kf_max_total_boost_factor, + &gf_max_total_boost_factor, + &gf_frame_max_boost_factor, &zm_factor, &rd_mult_inter_qp_fac, &rd_mult_arf_qp_fac, @@ -1042,26 +1048,26 @@ static int parse_stream_params(struct VpxEncoderConfig *global, config->cfg.use_vizier_rc_params = arg_parse_int(&arg); } else if (arg_match(&arg, &active_wq_factor, argi)) { config->cfg.active_wq_factor = arg_parse_rational(&arg); - } else if (arg_match(&arg, &base_err_per_mb, argi)) { - config->cfg.base_err_per_mb = arg_parse_rational(&arg); + } else if (arg_match(&arg, &err_per_mb_factor, argi)) { + config->cfg.err_per_mb_factor = arg_parse_rational(&arg); } else if (arg_match(&arg, &sr_default_decay_limit, argi)) { config->cfg.sr_default_decay_limit = arg_parse_rational(&arg); } else if (arg_match(&arg, &sr_diff_factor, argi)) { config->cfg.sr_diff_factor = arg_parse_rational(&arg); - } else if (arg_match(&arg, &kf_err_per_mb, argi)) { - config->cfg.kf_err_per_mb = arg_parse_rational(&arg); - } else if (arg_match(&arg, &kf_frame_min_boost, argi)) { - config->cfg.kf_frame_min_boost = arg_parse_rational(&arg); - } else if (arg_match(&arg, &kf_frame_max_boost_first, argi)) { - config->cfg.kf_frame_max_boost_first = arg_parse_rational(&arg); - } else if (arg_match(&arg, &kf_frame_max_boost_subs, argi)) { - config->cfg.kf_frame_max_boost_subs = arg_parse_rational(&arg); - } else if (arg_match(&arg, &kf_max_total_boost, argi)) { - config->cfg.kf_max_total_boost = arg_parse_rational(&arg); - } else if (arg_match(&arg, &gf_max_total_boost, argi)) { - config->cfg.gf_max_total_boost = arg_parse_rational(&arg); - } else if (arg_match(&arg, &gf_frame_max_boost, argi)) { - config->cfg.gf_frame_max_boost = arg_parse_rational(&arg); + } else if (arg_match(&arg, &kf_err_per_mb_factor, argi)) { + config->cfg.kf_err_per_mb_factor = arg_parse_rational(&arg); + } else if (arg_match(&arg, &kf_frame_min_boost_factor, argi)) { + config->cfg.kf_frame_min_boost_factor = arg_parse_rational(&arg); + } else if (arg_match(&arg, &kf_frame_max_boost_first_factor, argi)) { + config->cfg.kf_frame_max_boost_first_factor = arg_parse_rational(&arg); + } else if (arg_match(&arg, &kf_frame_max_boost_subs_factor, argi)) { + config->cfg.kf_frame_max_boost_subs_factor = arg_parse_rational(&arg); + } else if (arg_match(&arg, &kf_max_total_boost_factor, argi)) { + config->cfg.kf_max_total_boost_factor = arg_parse_rational(&arg); + } else if (arg_match(&arg, &gf_max_total_boost_factor, argi)) { + config->cfg.gf_max_total_boost_factor = arg_parse_rational(&arg); + } else if (arg_match(&arg, &gf_frame_max_boost_factor, argi)) { + config->cfg.gf_frame_max_boost_factor = arg_parse_rational(&arg); } else if (arg_match(&arg, &zm_factor, argi)) { config->cfg.zm_factor = arg_parse_rational(&arg); } else if (arg_match(&arg, &rd_mult_inter_qp_fac, argi)) { From adc185feb7b7aabd6407cc41ad1f3c1e9c1e8b17 Mon Sep 17 00:00:00 2001 From: Sreerenj Balachandran Date: Wed, 21 Apr 2021 11:34:03 -0700 Subject: [PATCH 0154/1000] vp8: enc: Fix valid range for under/over_shoot pct The overshoot_pct & undershoot_pct attributes for rate control are expressed as a percentage of the target bitrate, so the range should be 0-100. Change-Id: I67af3c8be7ab814c711c2eaf30786f1e2fa4f5a3 --- vp8/vp8_cx_iface.c | 4 ++-- vpx/vpx_encoder.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c index 872710f138..06ee5d7dfb 100644 --- a/vp8/vp8_cx_iface.c +++ b/vp8/vp8_cx_iface.c @@ -152,8 +152,8 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, RANGE_CHECK_HI(cfg, g_lag_in_frames, 25); #endif RANGE_CHECK(cfg, rc_end_usage, VPX_VBR, VPX_Q); - RANGE_CHECK_HI(cfg, rc_undershoot_pct, 1000); - RANGE_CHECK_HI(cfg, rc_overshoot_pct, 1000); + RANGE_CHECK_HI(cfg, rc_undershoot_pct, 100); + RANGE_CHECK_HI(cfg, rc_overshoot_pct, 100); RANGE_CHECK_HI(cfg, rc_2pass_vbr_bias_pct, 100); RANGE_CHECK(cfg, kf_mode, VPX_KF_DISABLED, VPX_KF_AUTO); diff --git a/vpx/vpx_encoder.h b/vpx/vpx_encoder.h index 255cb693ee..1682931a7f 100644 --- a/vpx/vpx_encoder.h +++ b/vpx/vpx_encoder.h @@ -498,7 +498,7 @@ typedef struct vpx_codec_enc_cfg { * undershoot level (current rate vs target) beyond which more aggressive * corrective measures are taken. * * - * Valid values in the range VP8:0-1000 VP9: 0-100. + * Valid values in the range VP8:0-100 VP9: 0-100. */ unsigned int rc_undershoot_pct; @@ -513,7 +513,7 @@ typedef struct vpx_codec_enc_cfg { * overshoot level (current rate vs target) beyond which more aggressive * corrective measures are taken. * - * Valid values in the range VP8:0-1000 VP9: 0-100. + * Valid values in the range VP8:0-100 VP9: 0-100. */ unsigned int rc_overshoot_pct; From c15555c62fc35872a4ff803c4d57ef5d2f4f81d9 Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 23 Apr 2021 16:50:48 -0700 Subject: [PATCH 0155/1000] sync CONTRIBUTING.md w/libwebm Change-Id: I63ffea52d079b0d50002526e209ae3fb64811bac --- CONTRIBUTING.md | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 577c96a6b5..7a73a30317 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -19,10 +19,9 @@ again. All submissions, including submissions by project members, require review. We use a [Gerrit](https://www.gerritcodereview.com) instance hosted at -https://chromium-review.googlesource.com for this purpose. - -See https://www.webmproject.org/code/contribute/submitting-patches for an -example of a typical gerrit workflow. +https://chromium-review.googlesource.com for this purpose. See the +[WebM Project page](https://www.webmproject.org/code/contribute/submitting-patches/) +for additional details. ## Community Guidelines From 9f57bc4d6c7a577538042a49ede8ee98dc8cc300 Mon Sep 17 00:00:00 2001 From: Paul Wilkins Date: Mon, 26 Apr 2021 15:06:54 +0100 Subject: [PATCH 0156/1000] Add limits to Vizier input parameters. Imposed provisional upper and lower limits to each parameter that can be adjusted in the Vizier ML experiment. Also in some cases applied secondary limits on on the range of the final "used" values. Defaults and limits may well require further tuning after subsequent rounds of experimentation. Re-factor get_sr_decay_rate(). Change-Id: I28e804ce3d3710f30cd51a203348e4ab23ef06c0 --- vp9/encoder/vp9_firstpass.c | 41 +++++++++++-------- vp9/vp9_cx_iface.c | 79 ++++++++++++++++++++++++++++++++++++- 2 files changed, 103 insertions(+), 17 deletions(-) diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index a4717ad036..ce7590fe4c 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -1832,33 +1832,35 @@ void vp9_init_second_pass(VP9_COMP *cpi) { } /* This function considers how the quality of prediction may be deteriorating - * with distance. It comapres the coded error for the last frame and the + * with distance. It compares the coded error for the last frame and the * second reference frame (usually two frames old) and also applies a factor * based on the extent of INTRA coding. * * The decay factor is then used to reduce the contribution of frames further - * from the alt-ref or golden frame, to the bitframe boost calculation for that + * from the alt-ref or golden frame, to the bitrate boost calculation for that * alt-ref or golden frame. */ static double get_sr_decay_rate(const TWO_PASS *const twopass, const FIRSTPASS_STATS *frame) { double sr_diff = (frame->sr_coded_error - frame->coded_error); double sr_decay = 1.0; - double modified_pct_inter; - double modified_pcnt_intra; - - modified_pct_inter = frame->pcnt_inter; - if ((frame->coded_error > LOW_CODED_ERR_PER_MB) && - ((frame->intra_error / DOUBLE_DIVIDE_CHECK(frame->coded_error)) < - (double)NCOUNT_FRAME_II_THRESH)) { - modified_pct_inter = - frame->pcnt_inter + frame->pcnt_intra_low - frame->pcnt_neutral; - } - modified_pcnt_intra = 100 * (1.0 - modified_pct_inter); + // Do nothing if the second ref to last frame error difference is + // very small or even negative. if ((sr_diff > LOW_SR_DIFF_TRHESH)) { - double sr_diff_part = + const double sr_diff_part = twopass->sr_diff_factor * ((sr_diff * 0.25) / frame->intra_error); + double modified_pct_inter = frame->pcnt_inter; + double modified_pcnt_intra; + + if ((frame->coded_error > LOW_CODED_ERR_PER_MB) && + ((frame->intra_error / DOUBLE_DIVIDE_CHECK(frame->coded_error)) < + (double)NCOUNT_FRAME_II_THRESH)) { + modified_pct_inter = + frame->pcnt_inter + frame->pcnt_intra_low - frame->pcnt_neutral; + } + modified_pcnt_intra = 100 * (1.0 - modified_pct_inter); + sr_decay = 1.0 - sr_diff_part - (INTRA_PART * modified_pcnt_intra); } return VPXMAX(sr_decay, twopass->sr_default_decay_limit); @@ -1979,7 +1981,7 @@ static double calc_frame_boost(const FRAME_INFO *frame_info, const double boost_q_correction = VPXMIN((0.5 + (lq * 0.015)), 1.5); const double active_area = calculate_active_area(frame_info, this_frame); - // Underlying boost factor is based on inter error ratio. + // Frame booost is based on inter error. frame_boost = (twopass->err_per_mb * active_area) / DOUBLE_DIVIDE_CHECK(this_frame->coded_error); @@ -2007,7 +2009,7 @@ static double calc_kf_frame_boost(VP9_COMP *cpi, calculate_active_area(&cpi->frame_info, this_frame); double max_boost; - // Underlying boost factor is based on inter error ratio. + // Frame booost is based on inter error. frame_boost = (twopass->kf_err_per_mb * active_area) / DOUBLE_DIVIDE_CHECK(this_frame->coded_error + *sr_accumulator); @@ -3499,14 +3501,21 @@ static void init_vizier_params(TWO_PASS *const twopass, int screen_area) { twopass->active_wq_factor *= AV_WQ_FACTOR; twopass->err_per_mb *= BASELINE_ERR_PER_MB; twopass->sr_default_decay_limit *= DEFAULT_DECAY_LIMIT; + if (twopass->sr_default_decay_limit > 1.0) // > 1.0 here makes no sense + twopass->sr_default_decay_limit = 1.0; twopass->sr_diff_factor *= 1.0; twopass->gf_frame_max_boost *= GF_MAX_FRAME_BOOST; twopass->gf_max_total_boost *= MAX_GF_BOOST; + // NOTE: In use max boost has precedence over min boost. So even if min is + // somehow set higher than max the final boost value will be clamped to the + // appropriate maximum. twopass->kf_frame_min_boost *= KF_MIN_FRAME_BOOST; twopass->kf_frame_max_boost_first *= KF_MAX_FRAME_BOOST; twopass->kf_frame_max_boost_subs *= KF_MAX_FRAME_BOOST; twopass->kf_max_total_boost *= MAX_KF_TOT_BOOST; twopass->zm_factor *= DEFAULT_ZM_FACTOR; + if (twopass->zm_factor > 1.0) // > 1.0 here makes no sense + twopass->zm_factor = 1.0; // Correction for the fact that the kf_err_per_mb_factor default is // already different for different video formats and ensures that a passed diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index c700620ef3..438f9b5ed9 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -664,41 +664,118 @@ static vpx_codec_err_t set_twopass_params_from_config( cpi->twopass.use_vizier_rc_params = cfg->use_vizier_rc_params; // The values set here are factors that will be applied to default values - // to get the final value used in the two pass code. 1.0 will hence + // to get the final value used in the two pass code. Hence 1.0 will // match the default behaviour when not using passed in values. + // We also apply limits here to prevent the user from applying settings + // that make no sense. cpi->twopass.active_wq_factor = (double)cfg->active_wq_factor.num / (double)cfg->active_wq_factor.den; + if (cpi->twopass.active_wq_factor < 0.25) + cpi->twopass.active_wq_factor = 0.25; + else if (cpi->twopass.active_wq_factor > 16.0) + cpi->twopass.active_wq_factor = 16.0; + cpi->twopass.err_per_mb = (double)cfg->err_per_mb_factor.num / (double)cfg->err_per_mb_factor.den; + if (cpi->twopass.err_per_mb < 0.25) + cpi->twopass.err_per_mb = 0.25; + else if (cpi->twopass.err_per_mb > 4.0) + cpi->twopass.err_per_mb = 4.0; + cpi->twopass.sr_default_decay_limit = (double)cfg->sr_default_decay_limit.num / (double)cfg->sr_default_decay_limit.den; + if (cpi->twopass.sr_default_decay_limit < 0.25) + cpi->twopass.sr_default_decay_limit = 0.25; + // If the default changes this will need to change. + else if (cpi->twopass.sr_default_decay_limit > 1.33) + cpi->twopass.sr_default_decay_limit = 1.33; + cpi->twopass.sr_diff_factor = (double)cfg->sr_diff_factor.num / (double)cfg->sr_diff_factor.den; + if (cpi->twopass.sr_diff_factor < 0.25) + cpi->twopass.sr_diff_factor = 0.25; + else if (cpi->twopass.sr_diff_factor > 4.0) + cpi->twopass.sr_diff_factor = 4.0; + cpi->twopass.kf_err_per_mb = (double)cfg->kf_err_per_mb_factor.num / (double)cfg->kf_err_per_mb_factor.den; + if (cpi->twopass.kf_err_per_mb < 0.25) + cpi->twopass.kf_err_per_mb = 0.25; + else if (cpi->twopass.kf_err_per_mb > 4.0) + cpi->twopass.kf_err_per_mb = 4.0; + cpi->twopass.kf_frame_min_boost = (double)cfg->kf_frame_min_boost_factor.num / (double)cfg->kf_frame_min_boost_factor.den; + if (cpi->twopass.kf_frame_min_boost < 0.25) + cpi->twopass.kf_frame_min_boost = 0.25; + else if (cpi->twopass.kf_frame_min_boost > 4.0) + cpi->twopass.kf_frame_min_boost = 4.0; + cpi->twopass.kf_frame_max_boost_first = (double)cfg->kf_frame_max_boost_first_factor.num / (double)cfg->kf_frame_max_boost_first_factor.den; + if (cpi->twopass.kf_frame_max_boost_first < 0.25) + cpi->twopass.kf_frame_max_boost_first = 0.25; + else if (cpi->twopass.kf_frame_max_boost_first > 4.0) + cpi->twopass.kf_frame_max_boost_first = 4.0; + cpi->twopass.kf_frame_max_boost_subs = (double)cfg->kf_frame_max_boost_subs_factor.num / (double)cfg->kf_frame_max_boost_subs_factor.den; + if (cpi->twopass.kf_frame_max_boost_subs < 0.25) + cpi->twopass.kf_frame_max_boost_subs = 0.25; + else if (cpi->twopass.kf_frame_max_boost_subs > 4.0) + cpi->twopass.kf_frame_max_boost_subs = 4.0; + cpi->twopass.kf_max_total_boost = (double)cfg->kf_max_total_boost_factor.num / (double)cfg->kf_max_total_boost_factor.den; + if (cpi->twopass.kf_max_total_boost < 0.25) + cpi->twopass.kf_max_total_boost = 0.25; + else if (cpi->twopass.kf_max_total_boost > 4.0) + cpi->twopass.kf_max_total_boost = 4.0; + cpi->twopass.gf_max_total_boost = (double)cfg->gf_max_total_boost_factor.num / (double)cfg->gf_max_total_boost_factor.den; + if (cpi->twopass.gf_max_total_boost < 0.25) + cpi->twopass.gf_max_total_boost = 0.25; + else if (cpi->twopass.gf_max_total_boost > 4.0) + cpi->twopass.gf_max_total_boost = 4.0; + cpi->twopass.gf_frame_max_boost = (double)cfg->gf_frame_max_boost_factor.num / (double)cfg->gf_frame_max_boost_factor.den; + if (cpi->twopass.gf_frame_max_boost < 0.25) + cpi->twopass.gf_frame_max_boost = 0.25; + else if (cpi->twopass.gf_frame_max_boost > 4.0) + cpi->twopass.gf_frame_max_boost = 4.0; + cpi->twopass.zm_factor = (double)cfg->zm_factor.num / (double)cfg->zm_factor.den; + if (cpi->twopass.zm_factor < 0.25) + cpi->twopass.zm_factor = 0.25; + else if (cpi->twopass.zm_factor > 2.0) + cpi->twopass.zm_factor = 2.0; + cpi->rd_ctrl.rd_mult_inter_qp_fac = (double)cfg->rd_mult_inter_qp_fac.num / (double)cfg->rd_mult_inter_qp_fac.den; + if (cpi->rd_ctrl.rd_mult_inter_qp_fac < 0.25) + cpi->rd_ctrl.rd_mult_inter_qp_fac = 0.25; + else if (cpi->rd_ctrl.rd_mult_inter_qp_fac > 4.0) + cpi->rd_ctrl.rd_mult_inter_qp_fac = 4.0; + cpi->rd_ctrl.rd_mult_arf_qp_fac = (double)cfg->rd_mult_arf_qp_fac.num / (double)cfg->rd_mult_arf_qp_fac.den; + if (cpi->rd_ctrl.rd_mult_arf_qp_fac < 0.25) + cpi->rd_ctrl.rd_mult_arf_qp_fac = 0.25; + else if (cpi->rd_ctrl.rd_mult_arf_qp_fac > 4.0) + cpi->rd_ctrl.rd_mult_arf_qp_fac = 4.0; + cpi->rd_ctrl.rd_mult_key_qp_fac = (double)cfg->rd_mult_key_qp_fac.num / (double)cfg->rd_mult_key_qp_fac.den; + if (cpi->rd_ctrl.rd_mult_key_qp_fac < 0.25) + cpi->rd_ctrl.rd_mult_key_qp_fac = 0.25; + else if (cpi->rd_ctrl.rd_mult_key_qp_fac > 4.0) + cpi->rd_ctrl.rd_mult_key_qp_fac = 4.0; return VPX_CODEC_OK; } From 5ca8c5f31d2d1c5d9760577a651e0d710cef7663 Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 27 Apr 2021 18:02:35 -0700 Subject: [PATCH 0157/1000] vpx_convolve_copy_neon: prefer != 0 to > 0 in tests this produces better assembly code Change-Id: I80ed1a165512e941b35a4965faa0c44403357e91 --- vpx_dsp/arm/vpx_convolve_copy_neon.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vpx_dsp/arm/vpx_convolve_copy_neon.c b/vpx_dsp/arm/vpx_convolve_copy_neon.c index 7abed67a40..361ec8a806 100644 --- a/vpx_dsp/arm/vpx_convolve_copy_neon.c +++ b/vpx_dsp/arm/vpx_convolve_copy_neon.c @@ -33,7 +33,7 @@ void vpx_convolve_copy_neon(const uint8_t *src, ptrdiff_t src_stride, src += src_stride; dst += dst_stride; h -= 2; - } while (h > 0); + } while (h != 0); } else if (w == 8) { // copy8 uint8x8_t s0, s1; do { @@ -47,7 +47,7 @@ void vpx_convolve_copy_neon(const uint8_t *src, ptrdiff_t src_stride, vst1_u8(dst, s1); dst += dst_stride; h -= 2; - } while (h > 0); + } while (h != 0); } else if (w < 32) { // copy16 uint8x16_t s0, s1; do { @@ -61,7 +61,7 @@ void vpx_convolve_copy_neon(const uint8_t *src, ptrdiff_t src_stride, vst1q_u8(dst, s1); dst += dst_stride; h -= 2; - } while (h > 0); + } while (h != 0); } else if (w == 32) { // copy32 uint8x16_t s0, s1, s2, s3; do { @@ -79,7 +79,7 @@ void vpx_convolve_copy_neon(const uint8_t *src, ptrdiff_t src_stride, vst1q_u8(dst + 16, s3); dst += dst_stride; h -= 2; - } while (h > 0); + } while (h != 0); } else { // copy64 uint8x16_t s0, s1, s2, s3; do { From 07cf024d4d061feac503054d15d039c2cfbce35e Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 27 Apr 2021 18:02:35 -0700 Subject: [PATCH 0158/1000] vpx_convolve_avg_neon: prefer != 0 to > 0 in tests this produces better assembly code Change-Id: I174b67a595d7efeb60c921f066302043b1c7d84e --- vpx_dsp/arm/vpx_convolve_avg_neon.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vpx_dsp/arm/vpx_convolve_avg_neon.c b/vpx_dsp/arm/vpx_convolve_avg_neon.c index 07349d03ae..8e3ee599f4 100644 --- a/vpx_dsp/arm/vpx_convolve_avg_neon.c +++ b/vpx_dsp/arm/vpx_convolve_avg_neon.c @@ -43,7 +43,7 @@ void vpx_convolve_avg_neon(const uint8_t *src, ptrdiff_t src_stride, vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(dd0), 1); dst += dst_stride; h -= 2; - } while (h > 0); + } while (h != 0); } else if (w == 8) { // avg8 uint8x8_t s0, s1, d0, d1; uint8x16_t s01, d01; @@ -64,7 +64,7 @@ void vpx_convolve_avg_neon(const uint8_t *src, ptrdiff_t src_stride, vst1_u8(dst, vget_high_u8(d01)); dst += dst_stride; h -= 2; - } while (h > 0); + } while (h != 0); } else if (w < 32) { // avg16 uint8x16_t s0, s1, d0, d1; do { @@ -83,7 +83,7 @@ void vpx_convolve_avg_neon(const uint8_t *src, ptrdiff_t src_stride, vst1q_u8(dst, d1); dst += dst_stride; h -= 2; - } while (h > 0); + } while (h != 0); } else if (w == 32) { // avg32 uint8x16_t s0, s1, s2, s3, d0, d1, d2, d3; do { @@ -110,7 +110,7 @@ void vpx_convolve_avg_neon(const uint8_t *src, ptrdiff_t src_stride, vst1q_u8(dst + 16, d3); dst += dst_stride; h -= 2; - } while (h > 0); + } while (h != 0); } else { // avg64 uint8x16_t s0, s1, s2, s3, d0, d1, d2, d3; do { From ff67c848115ae1356f21d361342091140d176c1d Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 27 Apr 2021 18:02:35 -0700 Subject: [PATCH 0159/1000] vpx_convolve_neon: prefer != 0 to > 0 in tests this produces better assembly code; the horizontal convolve is called with an adjusted intermediate_height where it may over process some rows so the checks in those functions remain. Change-Id: Iebe9842f2a13a4960d9a5addde9489452f5ce33a --- vpx_dsp/arm/vpx_convolve8_neon.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/vpx_dsp/arm/vpx_convolve8_neon.c b/vpx_dsp/arm/vpx_convolve8_neon.c index 08ae17dbab..c55c9fb568 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon.c +++ b/vpx_dsp/arm/vpx_convolve8_neon.c @@ -145,7 +145,7 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, src += 4; dst += 4; w -= 4; - } while (w > 0); + } while (w != 0); } else { const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3); const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0); @@ -296,7 +296,7 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, s += 8; d += 8; width -= 8; - } while (width > 0); + } while (width != 0); src += 8 * src_stride; dst += 8 * dst_stride; h -= 8; @@ -402,7 +402,7 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, src += 4; dst += 4; w -= 4; - } while (w > 0); + } while (w != 0); } else { const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3); const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0); @@ -586,7 +586,7 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, s += 8; d += 8; width -= 8; - } while (width > 0); + } while (width != 0); src += 8 * src_stride; dst += 8 * dst_stride; h -= 8; @@ -679,7 +679,7 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, s5 = s9; s6 = s10; h -= 4; - } while (h > 0); + } while (h != 0); } else { const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3); const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0); @@ -759,11 +759,11 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, s5 = s9; s6 = s10; height -= 4; - } while (height > 0); + } while (height != 0); src += 8; dst += 8; w -= 8; - } while (w > 0); + } while (w != 0); } } @@ -860,7 +860,7 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, s5 = s9; s6 = s10; h -= 4; - } while (h > 0); + } while (h != 0); } else { const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3); const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0); @@ -950,10 +950,10 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, s5 = s9; s6 = s10; height -= 4; - } while (height > 0); + } while (height != 0); src += 8; dst += 8; w -= 8; - } while (w > 0); + } while (w != 0); } } From 4ec84326cc65c5b042bf06d222d51e51d7e5461d Mon Sep 17 00:00:00 2001 From: Cheng Chen Date: Wed, 28 Apr 2021 13:54:07 -0700 Subject: [PATCH 0160/1000] Bump ABI version Due to recent changes to command line options for rate control parameters. Change-Id: I1de7cb4ff2850a3ed19ec216dd9d07f64a118e92 --- vpx/vpx_encoder.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vpx/vpx_encoder.h b/vpx/vpx_encoder.h index c4589bea10..f8fdfc0307 100644 --- a/vpx/vpx_encoder.h +++ b/vpx/vpx_encoder.h @@ -58,7 +58,7 @@ extern "C" { * fields to structures */ #define VPX_ENCODER_ABI_VERSION \ - (14 + VPX_CODEC_ABI_VERSION + \ + (15 + VPX_CODEC_ABI_VERSION + \ VPX_EXT_RATECTRL_ABI_VERSION) /**<\hideinitializer*/ /*! \brief Encoder capabilities bitfield From e14026ac21a6b6c93fd0fc954055643ed30972e0 Mon Sep 17 00:00:00 2001 From: Paul Wilkins Date: Thu, 29 Apr 2021 11:06:16 +0100 Subject: [PATCH 0161/1000] Add assert for zero_motion_factor range Change clamp to an assert so we are warned if changes to input ranges or defaults in the future lead to an invalid value. Change-Id: Idb4e0729f477a519bfff3083cdce3891e2fc6faa --- vp9/encoder/vp9_firstpass.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index ce7590fe4c..b63d47a05e 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -1882,13 +1882,8 @@ static double get_prediction_decay_rate(const TWO_PASS *const twopass, double zero_motion_factor = twopass->zm_factor * (frame_stats->pcnt_inter - frame_stats->pcnt_motion); - // Clamp value to range 0.0 to 1.0 - // This should happen anyway if input values are sensibly clamped but checked - // here just in case. - if (zero_motion_factor > 1.0) - zero_motion_factor = 1.0; - else if (zero_motion_factor < 0.0) - zero_motion_factor = 0.0; + // Check that the zero motion factor is valid + assert(zero_motion_factor >= 0.0 && zero_motion_factor <= 1.0); return VPXMAX(zero_motion_factor, (sr_decay_rate + ((1.0 - sr_decay_rate) * zero_motion_factor))); From abc7105acdfbbeaeecf41c675148683a1cb8b4f7 Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 4 May 2021 12:10:21 -0700 Subject: [PATCH 0162/1000] test.mk: enable vp9_denoiser_test w/NEON this file uses GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST so it's safe to enable unconditionally. the filter check fell out of sync with the code, there's a sse2 and neon implementation for the filter. Change-Id: I2a3336ccef3fb524ca5d9b8f88279240c9a276aa --- test/test.mk | 2 -- 1 file changed, 2 deletions(-) diff --git a/test/test.mk b/test/test.mk index 04902382db..b0319fb0de 100644 --- a/test/test.mk +++ b/test/test.mk @@ -193,10 +193,8 @@ LIBVPX_TEST_SRCS-$(CONFIG_NON_GREEDY_MV) += non_greedy_mv_test.cc endif ifeq ($(CONFIG_VP9_ENCODER)$(CONFIG_VP9_TEMPORAL_DENOISING),yesyes) -ifneq (, $(filter yes, $(HAVE_SSE2) $(HAVE_AVX2))) LIBVPX_TEST_SRCS-yes += vp9_denoiser_test.cc endif -endif LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_arf_freq_test.cc ifeq ($(CONFIG_VP9_ENCODER),yes) From 12a14913947b510514746389319b49a188a53579 Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 4 May 2021 12:13:17 -0700 Subject: [PATCH 0163/1000] vp9_denoiser_neon,horizontal_add_s8x16: use vaddlv w/aarch64 this reduces the number of instructions to compute the sum Change-Id: Icae4d4fb3e343d5b6e5a095c60ac6d171b3e7d54 --- vp9/encoder/arm/neon/vp9_denoiser_neon.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/vp9/encoder/arm/neon/vp9_denoiser_neon.c b/vp9/encoder/arm/neon/vp9_denoiser_neon.c index 4152e7bb5d..53e8c7e498 100644 --- a/vp9/encoder/arm/neon/vp9_denoiser_neon.c +++ b/vp9/encoder/arm/neon/vp9_denoiser_neon.c @@ -21,6 +21,9 @@ // Compute the sum of all pixel differences of this MB. static INLINE int horizontal_add_s8x16(const int8x16_t v_sum_diff_total) { +#if defined(__aarch64__) + return vaddlvq_s8(v_sum_diff_total); +#else const int16x8_t fe_dc_ba_98_76_54_32_10 = vpaddlq_s8(v_sum_diff_total); const int32x4_t fedc_ba98_7654_3210 = vpaddlq_s16(fe_dc_ba_98_76_54_32_10); const int64x2_t fedcba98_76543210 = vpaddlq_s32(fedc_ba98_7654_3210); @@ -28,6 +31,7 @@ static INLINE int horizontal_add_s8x16(const int8x16_t v_sum_diff_total) { vget_low_s64(fedcba98_76543210)); const int sum_diff = vget_lane_s32(vreinterpret_s32_s64(x), 0); return sum_diff; +#endif } // Denoise a 16x1 vector. From c1f77a3689a6cf5e95e1c1ae35d76f4f171f5ef3 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Sun, 11 Apr 2021 15:20:36 +0100 Subject: [PATCH 0164/1000] Implement horizontal convolution using Neon SDOT instruction Add an alternative AArch64 implementation of vpx_convolve8_horiz_neon for targets that implement the Armv8.4-A SDOT (signed dot product) instruction. The existing MLA-based implementation of vpx_convolve8_horiz_neon is retained and used on target CPUs that do not implement the SDOT instruction (or CPUs executing in AArch32 mode). The availability of the SDOT instruction is indicated by the feature macro __ARM_FEATURE_DOTPROD. Co-authored by: James Greenhalgh Change-Id: I5337286b0f5f2775ad7cdbc0174785ae694363cc --- vpx_dsp/arm/mem_neon.h | 18 +++++ vpx_dsp/arm/vpx_convolve8_neon.c | 109 +++++++++++++++++++++++++++++++ vpx_dsp/arm/vpx_convolve8_neon.h | 63 ++++++++++++++++++ 3 files changed, 190 insertions(+) diff --git a/vpx_dsp/arm/mem_neon.h b/vpx_dsp/arm/mem_neon.h index 943865b3c2..c89f92d1ad 100644 --- a/vpx_dsp/arm/mem_neon.h +++ b/vpx_dsp/arm/mem_neon.h @@ -19,6 +19,24 @@ #include "vpx/vpx_integer.h" #include "vpx_dsp/vpx_dsp_common.h" +// Support for these xN intrinsics is lacking in older versions of GCC. +#if defined(__GNUC__) && !defined(__clang__) +#if __GNUC__ < 8 || defined(__arm__) +static INLINE uint8x16x2_t vld1q_u8_x2(uint8_t const *ptr) { + uint8x16x2_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16) } }; + return res; +} +#endif + +#if __GNUC__ < 9 || defined(__arm__) +static INLINE uint8x16x3_t vld1q_u8_x3(uint8_t const *ptr) { + uint8x16x3_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16), + vld1q_u8(ptr + 2 * 16) } }; + return res; +} +#endif +#endif + static INLINE int16x4_t create_s16x4_neon(const int16_t c0, const int16_t c1, const int16_t c2, const int16_t c3) { return vcreate_s16((uint16_t)c0 | ((uint32_t)c1 << 16) | diff --git a/vpx_dsp/arm/vpx_convolve8_neon.c b/vpx_dsp/arm/vpx_convolve8_neon.c index c55c9fb568..a86adb4e72 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon.c +++ b/vpx_dsp/arm/vpx_convolve8_neon.c @@ -14,6 +14,7 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/transpose_neon.h" #include "vpx_dsp/arm/vpx_convolve8_neon.h" #include "vpx_ports/mem.h" @@ -52,6 +53,112 @@ static INLINE void store_u8_8x8(uint8_t *s, const ptrdiff_t p, vst1_u8(s, s7); } +#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \ + (__ARM_FEATURE_DOTPROD == 1) + +DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = { + 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, + 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, + 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 +}; + +void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4])); + const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[x0_q4]), 128); + const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp)); + const uint8x16_t range_limit = vdupq_n_u8(128); + uint8x16_t s0, s1, s2, s3; + + assert(!((intptr_t)dst & 3)); + assert(!(dst_stride & 3)); + assert(x_step_q4 == 16); + + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + + src -= 3; + + if (w == 4) { + const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + do { + int32x4_t t0, t1, t2, t3; + int16x8_t t01, t23; + uint8x8_t d01, d23; + + s0 = vld1q_u8(src); + src += src_stride; + s1 = vld1q_u8(src); + src += src_stride; + s2 = vld1q_u8(src); + src += src_stride; + s3 = vld1q_u8(src); + src += src_stride; + + t0 = convolve8_4_dot(s0, filters, correction, range_limit, permute_tbl); + t1 = convolve8_4_dot(s1, filters, correction, range_limit, permute_tbl); + t2 = convolve8_4_dot(s2, filters, correction, range_limit, permute_tbl); + t3 = convolve8_4_dot(s3, filters, correction, range_limit, permute_tbl); + + t01 = vcombine_s16(vqmovn_s32(t0), vqmovn_s32(t1)); + t23 = vcombine_s16(vqmovn_s32(t2), vqmovn_s32(t3)); + d01 = vqrshrun_n_s16(t01, 7); + d23 = vqrshrun_n_s16(t23, 7); + + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0); + dst += dst_stride; + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 1); + dst += dst_stride; + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 0); + dst += dst_stride; + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 1); + dst += dst_stride; + h -= 4; + } while (h > 0); + } else { + const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + const uint8_t *s; + uint8_t *d; + int width; + uint8x8_t d0, d1, d2, d3; + + do { + width = w; + s = src; + d = dst; + do { + s0 = vld1q_u8(s + 0 * src_stride); + s1 = vld1q_u8(s + 1 * src_stride); + s2 = vld1q_u8(s + 2 * src_stride); + s3 = vld1q_u8(s + 3 * src_stride); + + d0 = convolve8_8_dot(s0, filters, correction, range_limit, permute_tbl); + d1 = convolve8_8_dot(s1, filters, correction, range_limit, permute_tbl); + d2 = convolve8_8_dot(s2, filters, correction, range_limit, permute_tbl); + d3 = convolve8_8_dot(s3, filters, correction, range_limit, permute_tbl); + + vst1_u8(d + 0 * dst_stride, d0); + vst1_u8(d + 1 * dst_stride, d1); + vst1_u8(d + 2 * dst_stride, d2); + vst1_u8(d + 3 * dst_stride, d3); + + s += 8; + d += 8; + width -= 8; + } while (width > 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 0); + } +} + +#else + void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, @@ -305,6 +412,8 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, } } +#endif + void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, diff --git a/vpx_dsp/arm/vpx_convolve8_neon.h b/vpx_dsp/arm/vpx_convolve8_neon.h index 4f27da9d2f..14e7488540 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon.h +++ b/vpx_dsp/arm/vpx_convolve8_neon.h @@ -72,6 +72,69 @@ static INLINE void load_u8_16x8(const uint8_t *s, const ptrdiff_t p, *s7 = vld1q_u8(s); } +#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \ + (__ARM_FEATURE_DOTPROD == 1) + +static INLINE int32x4_t convolve8_4_dot(uint8x16_t samples, + const int8x8_t filters, + const int32x4_t correction, + const uint8x16_t range_limit, + const uint8x16x2_t permute_tbl) { + int8x16_t clamped_samples, permuted_samples[2]; + int32x4_t sum; + + /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ + clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit)); + + /* Permute samples ready for dot product. */ + /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */ + permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]); + /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */ + permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]); + + /* Accumulate dot product into 'correction' to account for range clamp. */ + sum = vdotq_lane_s32(correction, permuted_samples[0], filters, 0); + sum = vdotq_lane_s32(sum, permuted_samples[1], filters, 1); + + /* Narrowing and packing is performed by the caller. */ + return sum; +} + +static INLINE uint8x8_t convolve8_8_dot(uint8x16_t samples, + const int8x8_t filters, + const int32x4_t correction, + const uint8x16_t range_limit, + const uint8x16x3_t permute_tbl) { + int8x16_t clamped_samples, permuted_samples[3]; + int32x4_t sum0, sum1; + int16x8_t sum; + + /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ + clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit)); + + /* Permute samples ready for dot product. */ + /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */ + permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]); + /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */ + permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]); + /* { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */ + permuted_samples[2] = vqtbl1q_s8(clamped_samples, permute_tbl.val[2]); + + /* Accumulate dot product into 'correction' to account for range clamp. */ + /* First 4 output values. */ + sum0 = vdotq_lane_s32(correction, permuted_samples[0], filters, 0); + sum0 = vdotq_lane_s32(sum0, permuted_samples[1], filters, 1); + /* Second 4 output values. */ + sum1 = vdotq_lane_s32(correction, permuted_samples[1], filters, 0); + sum1 = vdotq_lane_s32(sum1, permuted_samples[2], filters, 1); + + /* Narrow and re-pack. */ + sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1)); + return vqrshrun_n_s16(sum, 7); +} + +#endif + static INLINE int16x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, From a28d43658e3347d55d70655e6ee3d87d0d3fba8a Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Thu, 6 May 2021 14:51:05 +0100 Subject: [PATCH 0165/1000] Optimize Neon SAD reductions using wider ADDP instruction Implement AArch64-only paths for each of the Neon SAD reduction functions, making use of a wider pairwise addition instruction only available on AArch64. This change removes the need for shuffling between high and low halves of Neon vectors - resulting in a faster reduction that requires fewer instructions. Bug: b/181236880 Change-Id: I1c48580b4aec27222538eeab44e38ecc1f2009dc --- vpx_dsp/arm/sad4d_neon.c | 53 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/vpx_dsp/arm/sad4d_neon.c b/vpx_dsp/arm/sad4d_neon.c index 06443c6995..34c0a7adef 100644 --- a/vpx_dsp/arm/sad4d_neon.c +++ b/vpx_dsp/arm/sad4d_neon.c @@ -34,7 +34,9 @@ static INLINE void sad4x_4d(const uint8_t *const src_ptr, const int src_stride, uint32_t *const res) { int i; uint16x8_t abs[2] = { vdupq_n_u16(0), vdupq_n_u16(0) }; +#if !defined(__aarch64__) uint16x4_t a[2]; +#endif uint32x4_t r; assert(!((intptr_t)src_ptr % sizeof(uint32_t))); @@ -51,9 +53,14 @@ static INLINE void sad4x_4d(const uint8_t *const src_ptr, const int src_stride, abs[1] = vabal_u8(abs[1], s, ref23); } +#if defined(__aarch64__) + abs[0] = vpaddq_u16(abs[0], abs[1]); + r = vpaddlq_u16(abs[0]); +#else a[0] = vpadd_u16(vget_low_u16(abs[0]), vget_high_u16(abs[0])); a[1] = vpadd_u16(vget_low_u16(abs[1]), vget_high_u16(abs[1])); r = vpaddlq_u16(vcombine_u16(a[0], a[1])); +#endif vst1q_u32(res, r); } @@ -74,6 +81,12 @@ void vpx_sad4x8x4d_neon(const uint8_t *src_ptr, int src_stride, // Can handle 512 pixels' sad sum (such as 16x32 or 32x16) static INLINE void sad_512_pel_final_neon(const uint16x8_t *sum /*[4]*/, uint32_t *const res) { +#if defined(__aarch64__) + const uint16x8_t a0 = vpaddq_u16(sum[0], sum[1]); + const uint16x8_t a1 = vpaddq_u16(sum[2], sum[3]); + const uint16x8_t b0 = vpaddq_u16(a0, a1); + const uint32x4_t r = vpaddlq_u16(b0); +#else const uint16x4_t a0 = vadd_u16(vget_low_u16(sum[0]), vget_high_u16(sum[0])); const uint16x4_t a1 = vadd_u16(vget_low_u16(sum[1]), vget_high_u16(sum[1])); const uint16x4_t a2 = vadd_u16(vget_low_u16(sum[2]), vget_high_u16(sum[2])); @@ -81,12 +94,21 @@ static INLINE void sad_512_pel_final_neon(const uint16x8_t *sum /*[4]*/, const uint16x4_t b0 = vpadd_u16(a0, a1); const uint16x4_t b1 = vpadd_u16(a2, a3); const uint32x4_t r = vpaddlq_u16(vcombine_u16(b0, b1)); +#endif vst1q_u32(res, r); } // Can handle 1024 pixels' sad sum (such as 32x32) static INLINE void sad_1024_pel_final_neon(const uint16x8_t *sum /*[4]*/, uint32_t *const res) { +#if defined(__aarch64__) + const uint16x8_t a0 = vpaddq_u16(sum[0], sum[1]); + const uint16x8_t a1 = vpaddq_u16(sum[2], sum[3]); + const uint32x4_t b0 = vpaddlq_u16(a0); + const uint32x4_t b1 = vpaddlq_u16(a1); + const uint32x4_t r = vpaddq_u32(b0, b1); + vst1q_u32(res, r); +#else const uint16x4_t a0 = vpadd_u16(vget_low_u16(sum[0]), vget_high_u16(sum[0])); const uint16x4_t a1 = vpadd_u16(vget_low_u16(sum[1]), vget_high_u16(sum[1])); const uint16x4_t a2 = vpadd_u16(vget_low_u16(sum[2]), vget_high_u16(sum[2])); @@ -96,11 +118,22 @@ static INLINE void sad_1024_pel_final_neon(const uint16x8_t *sum /*[4]*/, const uint32x2_t c0 = vpadd_u32(vget_low_u32(b0), vget_high_u32(b0)); const uint32x2_t c1 = vpadd_u32(vget_low_u32(b1), vget_high_u32(b1)); vst1q_u32(res, vcombine_u32(c0, c1)); +#endif } // Can handle 2048 pixels' sad sum (such as 32x64 or 64x32) static INLINE void sad_2048_pel_final_neon(const uint16x8_t *sum /*[4]*/, uint32_t *const res) { +#if defined(__aarch64__) + const uint32x4_t a0 = vpaddlq_u16(sum[0]); + const uint32x4_t a1 = vpaddlq_u16(sum[1]); + const uint32x4_t a2 = vpaddlq_u16(sum[2]); + const uint32x4_t a3 = vpaddlq_u16(sum[3]); + const uint32x4_t b0 = vpaddq_u32(a0, a1); + const uint32x4_t b1 = vpaddq_u32(a2, a3); + const uint32x4_t r = vpaddq_u32(b0, b1); + vst1q_u32(res, r); +#else const uint32x4_t a0 = vpaddlq_u16(sum[0]); const uint32x4_t a1 = vpaddlq_u16(sum[1]); const uint32x4_t a2 = vpaddlq_u16(sum[2]); @@ -112,11 +145,30 @@ static INLINE void sad_2048_pel_final_neon(const uint16x8_t *sum /*[4]*/, const uint32x2_t c0 = vpadd_u32(b0, b1); const uint32x2_t c1 = vpadd_u32(b2, b3); vst1q_u32(res, vcombine_u32(c0, c1)); +#endif } // Can handle 4096 pixels' sad sum (such as 64x64) static INLINE void sad_4096_pel_final_neon(const uint16x8_t *sum /*[8]*/, uint32_t *const res) { +#if defined(__aarch64__) + const uint32x4_t a0 = vpaddlq_u16(sum[0]); + const uint32x4_t a1 = vpaddlq_u16(sum[1]); + const uint32x4_t a2 = vpaddlq_u16(sum[2]); + const uint32x4_t a3 = vpaddlq_u16(sum[3]); + const uint32x4_t a4 = vpaddlq_u16(sum[4]); + const uint32x4_t a5 = vpaddlq_u16(sum[5]); + const uint32x4_t a6 = vpaddlq_u16(sum[6]); + const uint32x4_t a7 = vpaddlq_u16(sum[7]); + const uint32x4_t b0 = vaddq_u32(a0, a1); + const uint32x4_t b1 = vaddq_u32(a2, a3); + const uint32x4_t b2 = vaddq_u32(a4, a5); + const uint32x4_t b3 = vaddq_u32(a6, a7); + const uint32x4_t c0 = vpaddq_u32(b0, b1); + const uint32x4_t c1 = vpaddq_u32(b2, b3); + const uint32x4_t r = vpaddq_u32(c0, c1); + vst1q_u32(res, r); +#else const uint32x4_t a0 = vpaddlq_u16(sum[0]); const uint32x4_t a1 = vpaddlq_u16(sum[1]); const uint32x4_t a2 = vpaddlq_u16(sum[2]); @@ -136,6 +188,7 @@ static INLINE void sad_4096_pel_final_neon(const uint16x8_t *sum /*[8]*/, const uint32x2_t d0 = vpadd_u32(c0, c1); const uint32x2_t d1 = vpadd_u32(c2, c3); vst1q_u32(res, vcombine_u32(d0, d1)); +#endif } static INLINE void sad8x_4d(const uint8_t *src_ptr, int src_stride, From f7364c05748b70a1e0fd57849665a9d9f0990803 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Thu, 6 May 2021 15:11:52 +0100 Subject: [PATCH 0166/1000] Manually unroll the inner loop of Neon sad16x_4d() Manually unrolling the inner loop is sufficient to stop the compiler getting confused and emitting inefficient code. Co-authored by: James Greenhalgh Bug: b/181236880 Change-Id: I860768ce0e6c0e0b6286d3fc1b94f0eae95d0a1a --- vpx_dsp/arm/sad4d_neon.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/vpx_dsp/arm/sad4d_neon.c b/vpx_dsp/arm/sad4d_neon.c index 34c0a7adef..256bc41ce7 100644 --- a/vpx_dsp/arm/sad4d_neon.c +++ b/vpx_dsp/arm/sad4d_neon.c @@ -243,7 +243,7 @@ static INLINE void sad16_neon(const uint8_t *ref_ptr, const uint8x16_t src_ptr, static INLINE void sad16x_4d(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t *res, const int height) { - int i, j; + int i; const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], ref_array[3] }; uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), @@ -252,10 +252,15 @@ static INLINE void sad16x_4d(const uint8_t *src_ptr, int src_stride, for (i = 0; i < height; ++i) { const uint8x16_t s = vld1q_u8(src_ptr); src_ptr += src_stride; - for (j = 0; j < 4; ++j) { - sad16_neon(ref_loop[j], s, &sum[j]); - ref_loop[j] += ref_stride; - } + /* Manual unrolling here stops the compiler from getting confused. */ + sad16_neon(ref_loop[0], s, &sum[0]); + ref_loop[0] += ref_stride; + sad16_neon(ref_loop[1], s, &sum[1]); + ref_loop[1] += ref_stride; + sad16_neon(ref_loop[2], s, &sum[2]); + ref_loop[2] += ref_stride; + sad16_neon(ref_loop[3], s, &sum[3]); + ref_loop[3] += ref_stride; } sad_512_pel_final_neon(sum, res); From 43df64a9ac32491a25772ac9c678f45b2b7004d2 Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 7 May 2021 19:35:25 -0700 Subject: [PATCH 0167/1000] img_alloc_helper: make align var unsigned quiets an integer sanitizer warning: vpx/src/vpx_image.c:101:25: runtime error: implicit conversion from type 'int' of value -2 (32-bit, signed) to type 'unsigned int' changed the value to 4294967294 (32-bit, unsigned) Change-Id: Ifeac31cc80811081c1ba10aadaa94dc36cd46efa --- vpx/src/vpx_image.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vpx/src/vpx_image.c b/vpx/src/vpx_image.c index 2b7411f94f..2a7afc00c2 100644 --- a/vpx/src/vpx_image.c +++ b/vpx/src/vpx_image.c @@ -22,7 +22,7 @@ static vpx_image_t *img_alloc_helper(vpx_image_t *img, vpx_img_fmt_t fmt, unsigned char *img_data) { unsigned int h, w, s, xcs, ycs, bps; unsigned int stride_in_bytes; - int align; + unsigned int align; if (img != NULL) memset(img, 0, sizeof(vpx_image_t)); From 0f563e5fadbccb10fabd6ac80c256a4321401e22 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Fri, 7 May 2021 13:25:51 +0100 Subject: [PATCH 0168/1000] Optimize Neon reductions in sum_neon.h using ADDV instruction Use the AArch64-only ADDV and ADDLV instructions to accelerate reductions that add across a Neon vector in sum_neon.h. This commit also refactors the inline functions to return a scalar instead of a vector - allowing for optimization of the surrounding code at each call site. Bug: b/181236880 Change-Id: Ieed2a2dd3c74f8a52957bf404141ffc044bd5d79 --- vpx_dsp/arm/avg_neon.c | 11 +++-------- vpx_dsp/arm/fdct_partial_neon.c | 32 +++++++++++--------------------- vpx_dsp/arm/sad_neon.c | 24 ++++++++++++------------ vpx_dsp/arm/sum_neon.h | 33 ++++++++++++++++++++++++--------- vpx_dsp/arm/variance_neon.c | 21 +++++++++------------ 5 files changed, 59 insertions(+), 62 deletions(-) diff --git a/vpx_dsp/arm/avg_neon.c b/vpx_dsp/arm/avg_neon.c index fa7dd09600..8e57bdaa50 100644 --- a/vpx_dsp/arm/avg_neon.c +++ b/vpx_dsp/arm/avg_neon.c @@ -22,15 +22,13 @@ uint32_t vpx_avg_4x4_neon(const uint8_t *a, int a_stride) { const uint8x16_t b = load_unaligned_u8q(a, a_stride); const uint16x8_t c = vaddl_u8(vget_low_u8(b), vget_high_u8(b)); - const uint32x2_t d = horizontal_add_uint16x8(c); - return vget_lane_u32(vrshr_n_u32(d, 4), 0); + return (horizontal_add_uint16x8(c) + (1 << 3)) >> 4; } uint32_t vpx_avg_8x8_neon(const uint8_t *a, int a_stride) { int i; uint8x8_t b, c; uint16x8_t sum; - uint32x2_t d; b = vld1_u8(a); a += a_stride; c = vld1_u8(a); @@ -43,9 +41,7 @@ uint32_t vpx_avg_8x8_neon(const uint8_t *a, int a_stride) { sum = vaddw_u8(sum, d); } - d = horizontal_add_uint16x8(sum); - - return vget_lane_u32(vrshr_n_u32(d, 6), 0); + return (horizontal_add_uint16x8(sum) + (1 << 5)) >> 6; } // coeff: 16 bits, dynamic range [-32640, 32640]. @@ -139,8 +135,7 @@ int16_t vpx_int_pro_col_neon(uint8_t const *ref, const int width) { ref += 16; } - return vget_lane_s16(vreinterpret_s16_u32(horizontal_add_uint16x8(vec_sum)), - 0); + return (int16_t)horizontal_add_uint16x8(vec_sum); } // ref, src = [0, 510] - max diff = 16-bits diff --git a/vpx_dsp/arm/fdct_partial_neon.c b/vpx_dsp/arm/fdct_partial_neon.c index e73de41d77..0a1cdca41d 100644 --- a/vpx_dsp/arm/fdct_partial_neon.c +++ b/vpx_dsp/arm/fdct_partial_neon.c @@ -15,19 +15,10 @@ #include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/sum_neon.h" -static INLINE tran_low_t get_lane(const int32x2_t a) { -#if CONFIG_VP9_HIGHBITDEPTH - return vget_lane_s32(a, 0); -#else - return vget_lane_s16(vreinterpret_s16_s32(a), 0); -#endif // CONFIG_VP9_HIGHBITDETPH -} - void vpx_fdct4x4_1_neon(const int16_t *input, tran_low_t *output, int stride) { int16x4_t a0, a1, a2, a3; int16x8_t b0, b1; int16x8_t c; - int32x2_t d; a0 = vld1_s16(input); input += stride; @@ -42,9 +33,7 @@ void vpx_fdct4x4_1_neon(const int16_t *input, tran_low_t *output, int stride) { c = vaddq_s16(b0, b1); - d = horizontal_add_int16x8(c); - - output[0] = get_lane(vshl_n_s32(d, 1)); + output[0] = (tran_low_t)(horizontal_add_int16x8(c) << 1); output[1] = 0; } @@ -57,7 +46,7 @@ void vpx_fdct8x8_1_neon(const int16_t *input, tran_low_t *output, int stride) { sum = vaddq_s16(sum, input_00); } - output[0] = get_lane(horizontal_add_int16x8(sum)); + output[0] = (tran_low_t)horizontal_add_int16x8(sum); output[1] = 0; } @@ -66,7 +55,7 @@ void vpx_fdct16x16_1_neon(const int16_t *input, tran_low_t *output, int r; int16x8_t left = vld1q_s16(input); int16x8_t right = vld1q_s16(input + 8); - int32x2_t sum; + int32_t sum; input += stride; for (r = 1; r < 16; ++r) { @@ -77,9 +66,9 @@ void vpx_fdct16x16_1_neon(const int16_t *input, tran_low_t *output, right = vaddq_s16(right, b); } - sum = vadd_s32(horizontal_add_int16x8(left), horizontal_add_int16x8(right)); + sum = horizontal_add_int16x8(left) + horizontal_add_int16x8(right); - output[0] = get_lane(vshr_n_s32(sum, 1)); + output[0] = (tran_low_t)(sum >> 1); output[1] = 0; } @@ -90,7 +79,7 @@ void vpx_fdct32x32_1_neon(const int16_t *input, tran_low_t *output, int16x8_t a1 = vld1q_s16(input + 8); int16x8_t a2 = vld1q_s16(input + 16); int16x8_t a3 = vld1q_s16(input + 24); - int32x2_t sum; + int32_t sum; input += stride; for (r = 1; r < 32; ++r) { @@ -105,9 +94,10 @@ void vpx_fdct32x32_1_neon(const int16_t *input, tran_low_t *output, a3 = vaddq_s16(a3, b3); } - sum = vadd_s32(horizontal_add_int16x8(a0), horizontal_add_int16x8(a1)); - sum = vadd_s32(sum, horizontal_add_int16x8(a2)); - sum = vadd_s32(sum, horizontal_add_int16x8(a3)); - output[0] = get_lane(vshr_n_s32(sum, 3)); + sum = horizontal_add_int16x8(a0); + sum += horizontal_add_int16x8(a1); + sum += horizontal_add_int16x8(a2); + sum += horizontal_add_int16x8(a3); + output[0] = (tran_low_t)(sum >> 3); output[1] = 0; } diff --git a/vpx_dsp/arm/sad_neon.c b/vpx_dsp/arm/sad_neon.c index c4a49e366d..59567bda5b 100644 --- a/vpx_dsp/arm/sad_neon.c +++ b/vpx_dsp/arm/sad_neon.c @@ -23,7 +23,7 @@ uint32_t vpx_sad4x4_neon(const uint8_t *src_ptr, int src_stride, const uint8x16_t ref_u8 = load_unaligned_u8q(ref_ptr, ref_stride); uint16x8_t abs = vabdl_u8(vget_low_u8(src_u8), vget_low_u8(ref_u8)); abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(ref_u8)); - return vget_lane_u32(horizontal_add_uint16x8(abs), 0); + return horizontal_add_uint16x8(abs); } uint32_t vpx_sad4x4_avg_neon(const uint8_t *src_ptr, int src_stride, @@ -35,7 +35,7 @@ uint32_t vpx_sad4x4_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8x16_t avg = vrhaddq_u8(ref_u8, second_pred_u8); uint16x8_t abs = vabdl_u8(vget_low_u8(src_u8), vget_low_u8(avg)); abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(avg)); - return vget_lane_u32(horizontal_add_uint16x8(abs), 0); + return horizontal_add_uint16x8(abs); } uint32_t vpx_sad4x8_neon(const uint8_t *src_ptr, int src_stride, @@ -51,7 +51,7 @@ uint32_t vpx_sad4x8_neon(const uint8_t *src_ptr, int src_stride, abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(ref_u8)); } - return vget_lane_u32(horizontal_add_uint16x8(abs), 0); + return horizontal_add_uint16x8(abs); } uint32_t vpx_sad4x8_avg_neon(const uint8_t *src_ptr, int src_stride, @@ -71,7 +71,7 @@ uint32_t vpx_sad4x8_avg_neon(const uint8_t *src_ptr, int src_stride, abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(avg)); } - return vget_lane_u32(horizontal_add_uint16x8(abs), 0); + return horizontal_add_uint16x8(abs); } static INLINE uint16x8_t sad8x(const uint8_t *src_ptr, int src_stride, @@ -114,7 +114,7 @@ static INLINE uint16x8_t sad8x_avg(const uint8_t *src_ptr, int src_stride, uint32_t vpx_sad8x##n##_neon(const uint8_t *src_ptr, int src_stride, \ const uint8_t *ref_ptr, int ref_stride) { \ const uint16x8_t abs = sad8x(src_ptr, src_stride, ref_ptr, ref_stride, n); \ - return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \ + return horizontal_add_uint16x8(abs); \ } \ \ uint32_t vpx_sad8x##n##_avg_neon(const uint8_t *src_ptr, int src_stride, \ @@ -122,7 +122,7 @@ static INLINE uint16x8_t sad8x_avg(const uint8_t *src_ptr, int src_stride, const uint8_t *second_pred) { \ const uint16x8_t abs = \ sad8x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \ - return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \ + return horizontal_add_uint16x8(abs); \ } sad8xN(4); @@ -172,7 +172,7 @@ static INLINE uint16x8_t sad16x_avg(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride) { \ const uint16x8_t abs = \ sad16x(src_ptr, src_stride, ref_ptr, ref_stride, n); \ - return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \ + return horizontal_add_uint16x8(abs); \ } \ \ uint32_t vpx_sad16x##n##_avg_neon(const uint8_t *src_ptr, int src_stride, \ @@ -180,7 +180,7 @@ static INLINE uint16x8_t sad16x_avg(const uint8_t *src_ptr, int src_stride, const uint8_t *second_pred) { \ const uint16x8_t abs = \ sad16x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \ - return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \ + return horizontal_add_uint16x8(abs); \ } sad16xN(8); @@ -240,7 +240,7 @@ static INLINE uint16x8_t sad32x_avg(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride) { \ const uint16x8_t abs = \ sad32x(src_ptr, src_stride, ref_ptr, ref_stride, n); \ - return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \ + return horizontal_add_uint16x8(abs); \ } \ \ uint32_t vpx_sad32x##n##_avg_neon(const uint8_t *src_ptr, int src_stride, \ @@ -248,7 +248,7 @@ static INLINE uint16x8_t sad32x_avg(const uint8_t *src_ptr, int src_stride, const uint8_t *second_pred) { \ const uint16x8_t abs = \ sad32x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \ - return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \ + return horizontal_add_uint16x8(abs); \ } sad32xN(16); @@ -338,7 +338,7 @@ static INLINE uint32x4_t sad64x_avg(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride) { \ const uint32x4_t abs = \ sad64x(src_ptr, src_stride, ref_ptr, ref_stride, n); \ - return vget_lane_u32(horizontal_add_uint32x4(abs), 0); \ + return horizontal_add_uint32x4(abs); \ } \ \ uint32_t vpx_sad64x##n##_avg_neon(const uint8_t *src_ptr, int src_stride, \ @@ -346,7 +346,7 @@ static INLINE uint32x4_t sad64x_avg(const uint8_t *src_ptr, int src_stride, const uint8_t *second_pred) { \ const uint32x4_t abs = \ sad64x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \ - return vget_lane_u32(horizontal_add_uint32x4(abs), 0); \ + return horizontal_add_uint32x4(abs); \ } sad64xN(32); diff --git a/vpx_dsp/arm/sum_neon.h b/vpx_dsp/arm/sum_neon.h index 9e6833aad3..630296237b 100644 --- a/vpx_dsp/arm/sum_neon.h +++ b/vpx_dsp/arm/sum_neon.h @@ -16,23 +16,38 @@ #include "./vpx_config.h" #include "vpx/vpx_integer.h" -static INLINE int32x2_t horizontal_add_int16x8(const int16x8_t a) { +static INLINE int32_t horizontal_add_int16x8(const int16x8_t a) { +#if defined(__aarch64__) + return vaddlvq_s16(a); +#else const int32x4_t b = vpaddlq_s16(a); const int64x2_t c = vpaddlq_s32(b); - return vadd_s32(vreinterpret_s32_s64(vget_low_s64(c)), - vreinterpret_s32_s64(vget_high_s64(c))); + const int32x2_t d = vadd_s32(vreinterpret_s32_s64(vget_low_s64(c)), + vreinterpret_s32_s64(vget_high_s64(c))); + return vget_lane_s32(d, 0); +#endif } -static INLINE uint32x2_t horizontal_add_uint16x8(const uint16x8_t a) { +static INLINE uint32_t horizontal_add_uint16x8(const uint16x8_t a) { +#if defined(__aarch64__) + return vaddlvq_u16(a); +#else const uint32x4_t b = vpaddlq_u16(a); const uint64x2_t c = vpaddlq_u32(b); - return vadd_u32(vreinterpret_u32_u64(vget_low_u64(c)), - vreinterpret_u32_u64(vget_high_u64(c))); + const uint32x2_t d = vadd_u32(vreinterpret_u32_u64(vget_low_u64(c)), + vreinterpret_u32_u64(vget_high_u64(c))); + return vget_lane_u32(d, 0); +#endif } -static INLINE uint32x2_t horizontal_add_uint32x4(const uint32x4_t a) { +static INLINE uint32_t horizontal_add_uint32x4(const uint32x4_t a) { +#if defined(__aarch64__) + return vaddvq_u32(a); +#else const uint64x2_t b = vpaddlq_u32(a); - return vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), - vreinterpret_u32_u64(vget_high_u64(b))); + const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), + vreinterpret_u32_u64(vget_high_u64(b))); + return vget_lane_u32(c, 0); +#endif } #endif // VPX_VPX_DSP_ARM_SUM_NEON_H_ diff --git a/vpx_dsp/arm/variance_neon.c b/vpx_dsp/arm/variance_neon.c index 77b1015b74..e08f31f2c5 100644 --- a/vpx_dsp/arm/variance_neon.c +++ b/vpx_dsp/arm/variance_neon.c @@ -66,10 +66,9 @@ static void variance_neon_w4x4(const uint8_t *src_ptr, int src_stride, ref_ptr += 4 * ref_stride; } - *sum = vget_lane_s32(horizontal_add_int16x8(sum_s16), 0); - *sse = vget_lane_u32(horizontal_add_uint32x4(vreinterpretq_u32_s32( - vaddq_s32(sse_lo_s32, sse_hi_s32))), - 0); + *sum = horizontal_add_int16x8(sum_s16); + *sse = horizontal_add_uint32x4( + vreinterpretq_u32_s32(vaddq_s32(sse_lo_s32, sse_hi_s32))); } // Process a block of any size where the width is divisible by 16. @@ -115,10 +114,9 @@ static void variance_neon_w16(const uint8_t *src_ptr, int src_stride, ref_ptr += ref_stride; } - *sum = vget_lane_s32(horizontal_add_int16x8(sum_s16), 0); - *sse = vget_lane_u32(horizontal_add_uint32x4(vreinterpretq_u32_s32( - vaddq_s32(sse_lo_s32, sse_hi_s32))), - 0); + *sum = horizontal_add_int16x8(sum_s16); + *sse = horizontal_add_uint32x4( + vreinterpretq_u32_s32(vaddq_s32(sse_lo_s32, sse_hi_s32))); } // Process a block of width 8 two rows at a time. @@ -157,10 +155,9 @@ static void variance_neon_w8x2(const uint8_t *src_ptr, int src_stride, i += 2; } while (i < h); - *sum = vget_lane_s32(horizontal_add_int16x8(sum_s16), 0); - *sse = vget_lane_u32(horizontal_add_uint32x4(vreinterpretq_u32_s32( - vaddq_s32(sse_lo_s32, sse_hi_s32))), - 0); + *sum = horizontal_add_int16x8(sum_s16); + *sse = horizontal_add_uint32x4( + vreinterpretq_u32_s32(vaddq_s32(sse_lo_s32, sse_hi_s32))); } void vpx_get8x8var_neon(const uint8_t *src_ptr, int src_stride, From 2db85c269bc5479e48ea7cd4fde85236ee0bc347 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Mon, 10 May 2021 12:22:03 +0100 Subject: [PATCH 0169/1000] Use ABD and UDOT to implement Neon sad_4d functions Implementing sad16_neon using ABD, UDOT instead of ABAL, ABAL2 saves a cycle and removes resource contention for a single SIMD pipe on modern out-of-order Arm CPUs. The UDOT accumulation into 32-bit elements also allows for a faster reduction at the end of each SAD function. The existing implementation is retained for CPUs that do not implement the Armv8.4-A UDOT instruction, and CPUs executing in AArch32 mode. Bug: b/181236880 Change-Id: Ibd0da46e86751d2f808c7b1e424f82b046a1aa6f --- vpx_dsp/arm/sad4d_neon.c | 214 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 214 insertions(+) diff --git a/vpx_dsp/arm/sad4d_neon.c b/vpx_dsp/arm/sad4d_neon.c index 256bc41ce7..5c7a0fcaf0 100644 --- a/vpx_dsp/arm/sad4d_neon.c +++ b/vpx_dsp/arm/sad4d_neon.c @@ -98,6 +98,8 @@ static INLINE void sad_512_pel_final_neon(const uint16x8_t *sum /*[4]*/, vst1q_u32(res, r); } +#if defined(__arm__) || !defined(__ARM_FEATURE_DOTPROD) + // Can handle 1024 pixels' sad sum (such as 32x32) static INLINE void sad_1024_pel_final_neon(const uint16x8_t *sum /*[4]*/, uint32_t *const res) { @@ -191,6 +193,8 @@ static INLINE void sad_4096_pel_final_neon(const uint16x8_t *sum /*[8]*/, #endif } +#endif + static INLINE void sad8x_4d(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t *res, const int height) { @@ -233,6 +237,41 @@ void vpx_sad8x16x4d_neon(const uint8_t *src_ptr, int src_stride, //////////////////////////////////////////////////////////////////////////////// +#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \ + (__ARM_FEATURE_DOTPROD == 1) + +static INLINE void sad16_neon(const uint8_t *ref_ptr, const uint8x16_t src_ptr, + uint32x4_t *const sum) { + const uint8x16_t r = vld1q_u8(ref_ptr); + const uint8x16_t diff = vabdq_u8(src_ptr, r); + *sum = vdotq_u32(*sum, diff, vdupq_n_u8(1)); +} + +static INLINE void sad16x_4d(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], int ref_stride, + uint32_t *res, const int height) { + int i; + uint32x4_t r0, r1; + const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], + ref_array[3] }; + uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + + for (i = 0; i < height; ++i) { + const uint8x16_t s = vld1q_u8(src_ptr + i * src_stride); + sad16_neon(ref_loop[0] + i * ref_stride, s, &sum[0]); + sad16_neon(ref_loop[1] + i * ref_stride, s, &sum[1]); + sad16_neon(ref_loop[2] + i * ref_stride, s, &sum[2]); + sad16_neon(ref_loop[3] + i * ref_stride, s, &sum[3]); + } + + r0 = vpaddq_u32(sum[0], sum[1]); + r1 = vpaddq_u32(sum[2], sum[3]); + vst1q_u32(res, vpaddq_u32(r0, r1)); +} + +#else + static INLINE void sad16_neon(const uint8_t *ref_ptr, const uint8x16_t src_ptr, uint16x8_t *const sum) { const uint8x16_t r = vld1q_u8(ref_ptr); @@ -266,6 +305,8 @@ static INLINE void sad16x_4d(const uint8_t *src_ptr, int src_stride, sad_512_pel_final_neon(sum, res); } +#endif + void vpx_sad16x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t *res) { @@ -286,6 +327,67 @@ void vpx_sad16x32x4d_neon(const uint8_t *src_ptr, int src_stride, //////////////////////////////////////////////////////////////////////////////// +#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \ + (__ARM_FEATURE_DOTPROD == 1) + +static INLINE void sad32x_4d(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], int ref_stride, + uint32_t *res, const int height) { + int i; + uint32x4_t r0, r1; + const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], + ref_array[3] }; + + uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + + for (i = 0; i < height; ++i) { + uint8x16_t s; + + s = vld1q_u8(src_ptr + 0 * 16); + sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]); + sad16_neon(ref_loop[1] + 0 * 16, s, &sum[1]); + sad16_neon(ref_loop[2] + 0 * 16, s, &sum[2]); + sad16_neon(ref_loop[3] + 0 * 16, s, &sum[3]); + + s = vld1q_u8(src_ptr + 1 * 16); + sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]); + sad16_neon(ref_loop[1] + 1 * 16, s, &sum[1]); + sad16_neon(ref_loop[2] + 1 * 16, s, &sum[2]); + sad16_neon(ref_loop[3] + 1 * 16, s, &sum[3]); + + src_ptr += src_stride; + ref_loop[0] += ref_stride; + ref_loop[1] += ref_stride; + ref_loop[2] += ref_stride; + ref_loop[3] += ref_stride; + } + + r0 = vpaddq_u32(sum[0], sum[1]); + r1 = vpaddq_u32(sum[2], sum[3]); + vst1q_u32(res, vpaddq_u32(r0, r1)); +} + +void vpx_sad32x16x4d_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], int ref_stride, + uint32_t *res) { + sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 16); +} + +void vpx_sad32x32x4d_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], int ref_stride, + uint32_t *res) { + sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 32); +} + +void vpx_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], int ref_stride, + uint32_t *res) { + sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 64); +} + +#else + static INLINE void sad32x_4d(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, const int height, uint16x8_t *const sum) { @@ -342,8 +444,118 @@ void vpx_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride, sad_2048_pel_final_neon(sum, res); } +#endif + //////////////////////////////////////////////////////////////////////////////// +#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \ + (__ARM_FEATURE_DOTPROD == 1) + +void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], int ref_stride, + uint32_t *res) { + int i; + uint32x4_t r0, r1; + const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], + ref_array[3] }; + uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + + for (i = 0; i < 32; ++i) { + uint8x16_t s; + + s = vld1q_u8(src_ptr + 0 * 16); + sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]); + sad16_neon(ref_loop[1] + 0 * 16, s, &sum[1]); + sad16_neon(ref_loop[2] + 0 * 16, s, &sum[2]); + sad16_neon(ref_loop[3] + 0 * 16, s, &sum[3]); + + s = vld1q_u8(src_ptr + 1 * 16); + sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]); + sad16_neon(ref_loop[1] + 1 * 16, s, &sum[1]); + sad16_neon(ref_loop[2] + 1 * 16, s, &sum[2]); + sad16_neon(ref_loop[3] + 1 * 16, s, &sum[3]); + + s = vld1q_u8(src_ptr + 2 * 16); + sad16_neon(ref_loop[0] + 2 * 16, s, &sum[0]); + sad16_neon(ref_loop[1] + 2 * 16, s, &sum[1]); + sad16_neon(ref_loop[2] + 2 * 16, s, &sum[2]); + sad16_neon(ref_loop[3] + 2 * 16, s, &sum[3]); + + s = vld1q_u8(src_ptr + 3 * 16); + sad16_neon(ref_loop[0] + 3 * 16, s, &sum[0]); + sad16_neon(ref_loop[1] + 3 * 16, s, &sum[1]); + sad16_neon(ref_loop[2] + 3 * 16, s, &sum[2]); + sad16_neon(ref_loop[3] + 3 * 16, s, &sum[3]); + + src_ptr += src_stride; + ref_loop[0] += ref_stride; + ref_loop[1] += ref_stride; + ref_loop[2] += ref_stride; + ref_loop[3] += ref_stride; + } + + r0 = vpaddq_u32(sum[0], sum[1]); + r1 = vpaddq_u32(sum[2], sum[3]); + vst1q_u32(res, vpaddq_u32(r0, r1)); +} + +void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], int ref_stride, + uint32_t *res) { + int i; + uint32x4_t r0, r1, r2, r3; + const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], + ref_array[3] }; + uint32x4_t sum[8] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0), vdupq_n_u32(0) }; + + for (i = 0; i < 64; ++i) { + uint8x16_t s; + + s = vld1q_u8(src_ptr + 0 * 16); + sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]); + sad16_neon(ref_loop[1] + 0 * 16, s, &sum[2]); + sad16_neon(ref_loop[2] + 0 * 16, s, &sum[4]); + sad16_neon(ref_loop[3] + 0 * 16, s, &sum[6]); + + s = vld1q_u8(src_ptr + 1 * 16); + sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]); + sad16_neon(ref_loop[1] + 1 * 16, s, &sum[2]); + sad16_neon(ref_loop[2] + 1 * 16, s, &sum[4]); + sad16_neon(ref_loop[3] + 1 * 16, s, &sum[6]); + + s = vld1q_u8(src_ptr + 2 * 16); + sad16_neon(ref_loop[0] + 2 * 16, s, &sum[1]); + sad16_neon(ref_loop[1] + 2 * 16, s, &sum[3]); + sad16_neon(ref_loop[2] + 2 * 16, s, &sum[5]); + sad16_neon(ref_loop[3] + 2 * 16, s, &sum[7]); + + s = vld1q_u8(src_ptr + 3 * 16); + sad16_neon(ref_loop[0] + 3 * 16, s, &sum[1]); + sad16_neon(ref_loop[1] + 3 * 16, s, &sum[3]); + sad16_neon(ref_loop[2] + 3 * 16, s, &sum[5]); + sad16_neon(ref_loop[3] + 3 * 16, s, &sum[7]); + + src_ptr += src_stride; + ref_loop[0] += ref_stride; + ref_loop[1] += ref_stride; + ref_loop[2] += ref_stride; + ref_loop[3] += ref_stride; + } + + r0 = vpaddq_u32(sum[0], sum[1]); + r1 = vpaddq_u32(sum[2], sum[3]); + r2 = vpaddq_u32(sum[4], sum[5]); + r3 = vpaddq_u32(sum[6], sum[7]); + r0 = vpaddq_u32(r0, r1); + r1 = vpaddq_u32(r2, r3); + vst1q_u32(res, vpaddq_u32(r0, r1)); +} + +#else + void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t *res) { @@ -436,3 +648,5 @@ void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride, sad_4096_pel_final_neon(sum, res); } + +#endif From c8b0432505d32820af0c42a94b219aa83eed5db9 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Tue, 11 May 2021 13:17:44 +0100 Subject: [PATCH 0170/1000] Implement Neon variance functions using UDOT instruction Accelerate Neon variance functions by implementing the sum of squares calculation using the Armv8.4-A UDOT instruction instead of 4 MLAs. The previous implementation is retained for use on CPUs that do not implement the Armv8.4-A dot product instructions. Bug: b/181236880 Change-Id: I9ab3d52634278b9b6f0011f39390a1195210bc75 --- vpx_dsp/arm/sum_neon.h | 27 +++++++++++ vpx_dsp/arm/variance_neon.c | 96 +++++++++++++++++++++++++++++++++++++ 2 files changed, 123 insertions(+) diff --git a/vpx_dsp/arm/sum_neon.h b/vpx_dsp/arm/sum_neon.h index 630296237b..9a7c424e8e 100644 --- a/vpx_dsp/arm/sum_neon.h +++ b/vpx_dsp/arm/sum_neon.h @@ -40,6 +40,33 @@ static INLINE uint32_t horizontal_add_uint16x8(const uint16x8_t a) { #endif } +static INLINE int32_t horizontal_add_int32x2(const int32x2_t a) { +#if defined(__aarch64__) + return vaddv_s32(a); +#else + return vget_lane_s32(a, 0) + vget_lane_s32(a, 1); +#endif +} + +static INLINE uint32_t horizontal_add_uint32x2(const uint32x2_t a) { +#if defined(__aarch64__) + return vaddv_u32(a); +#else + return vget_lane_u32(a, 0) + vget_lane_u32(a, 1); +#endif +} + +static INLINE int32_t horizontal_add_int32x4(const int32x4_t a) { +#if defined(__aarch64__) + return vaddvq_s32(a); +#else + const int64x2_t b = vpaddlq_s32(a); + const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), + vreinterpret_s32_s64(vget_high_s64(b))); + return vget_lane_s32(c, 0); +#endif +} + static INLINE uint32_t horizontal_add_uint32x4(const uint32x4_t a) { #if defined(__aarch64__) return vaddvq_u32(a); diff --git a/vpx_dsp/arm/variance_neon.c b/vpx_dsp/arm/variance_neon.c index e08f31f2c5..19aaac7b69 100644 --- a/vpx_dsp/arm/variance_neon.c +++ b/vpx_dsp/arm/variance_neon.c @@ -19,6 +19,100 @@ #include "vpx_dsp/arm/sum_neon.h" #include "vpx_ports/mem.h" +#if defined(__ARM_FEATURE_DOTPROD) && (__ARM_FEATURE_DOTPROD == 1) + +// Process a block of width 4 four rows at a time. +static void variance_neon_w4x4(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, int h, + uint32_t *sse, int *sum) { + int i; + uint32x4_t sum_a = vdupq_n_u32(0); + uint32x4_t sum_b = vdupq_n_u32(0); + uint32x4_t sse_u32 = vdupq_n_u32(0); + + for (i = 0; i < h; i += 4) { + const uint8x16_t a = load_unaligned_u8q(src_ptr, src_stride); + const uint8x16_t b = load_unaligned_u8q(ref_ptr, ref_stride); + + const uint8x16_t abs_diff = vabdq_u8(a, b); + sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff); + + sum_a = vdotq_u32(sum_a, a, vdupq_n_u8(1)); + sum_b = vdotq_u32(sum_b, b, vdupq_n_u8(1)); + + src_ptr += 4 * src_stride; + ref_ptr += 4 * ref_stride; + } + + *sum = horizontal_add_int32x4(vreinterpretq_s32_u32(vsubq_u32(sum_a, sum_b))); + *sse = horizontal_add_uint32x4(sse_u32); +} + +// Process a block of any size where the width is divisible by 16. +static void variance_neon_w16(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, int w, + int h, uint32_t *sse, int *sum) { + int i, j; + uint32x4_t sum_a = vdupq_n_u32(0); + uint32x4_t sum_b = vdupq_n_u32(0); + uint32x4_t sse_u32 = vdupq_n_u32(0); + + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 16) { + const uint8x16_t a = vld1q_u8(src_ptr + j); + const uint8x16_t b = vld1q_u8(ref_ptr + j); + + const uint8x16_t abs_diff = vabdq_u8(a, b); + sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff); + + sum_a = vdotq_u32(sum_a, a, vdupq_n_u8(1)); + sum_b = vdotq_u32(sum_b, b, vdupq_n_u8(1)); + } + src_ptr += src_stride; + ref_ptr += ref_stride; + } + + *sum = horizontal_add_int32x4(vreinterpretq_s32_u32(vsubq_u32(sum_a, sum_b))); + *sse = horizontal_add_uint32x4(sse_u32); +} + +// Process a block of width 8 two rows at a time. +static void variance_neon_w8x2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, int h, + uint32_t *sse, int *sum) { + int i = 0; + uint32x2_t sum_a = vdup_n_u32(0); + uint32x2_t sum_b = vdup_n_u32(0); + uint32x2_t sse_lo_u32 = vdup_n_u32(0); + uint32x2_t sse_hi_u32 = vdup_n_u32(0); + + do { + const uint8x8_t a_0 = vld1_u8(src_ptr); + const uint8x8_t a_1 = vld1_u8(src_ptr + src_stride); + const uint8x8_t b_0 = vld1_u8(ref_ptr); + const uint8x8_t b_1 = vld1_u8(ref_ptr + ref_stride); + + const uint8x8_t abs_diff_0 = vabd_u8(a_0, b_0); + const uint8x8_t abs_diff_1 = vabd_u8(a_1, b_1); + sse_lo_u32 = vdot_u32(sse_lo_u32, abs_diff_0, abs_diff_0); + sse_hi_u32 = vdot_u32(sse_hi_u32, abs_diff_1, abs_diff_1); + + sum_a = vdot_u32(sum_a, a_0, vdup_n_u8(1)); + sum_b = vdot_u32(sum_b, b_0, vdup_n_u8(1)); + sum_a = vdot_u32(sum_a, a_1, vdup_n_u8(1)); + sum_b = vdot_u32(sum_b, b_1, vdup_n_u8(1)); + + src_ptr += src_stride + src_stride; + ref_ptr += ref_stride + ref_stride; + i += 2; + } while (i < h); + + *sum = horizontal_add_int32x2(vreinterpret_s32_u32(vsub_u32(sum_a, sum_b))); + *sse = horizontal_add_uint32x2(vadd_u32(sse_lo_u32, sse_hi_u32)); +} + +#else + // The variance helper functions use int16_t for sum. 8 values are accumulated // and then added (at which point they expand up to int32_t). To avoid overflow, // there can be no more than 32767 / 255 ~= 128 values accumulated in each @@ -160,6 +254,8 @@ static void variance_neon_w8x2(const uint8_t *src_ptr, int src_stride, vreinterpretq_u32_s32(vaddq_s32(sse_lo_s32, sse_hi_s32))); } +#endif + void vpx_get8x8var_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum) { From 231aa6ae32fca53efc45ffd39e14650346fcb030 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Tue, 20 Apr 2021 12:03:56 +0100 Subject: [PATCH 0171/1000] Implement vertical convolution using Neon SDOT instruction Add an alternative AArch64 implementation of vpx_convolve8_vert_neon for targets that implement the Armv8.4-A SDOT (signed dot product) instruction. The existing MLA-based implementation of vpx_convolve8_vert_neon is retained and used on target CPUs that do not implement the SDOT instruction (or CPUs executing in AArch32 mode). The availability of the SDOT instruction is indicated by the feature macro __ARM_FEATURE_DOTPROD. Bug: b/181236880 Change-Id: Iebb8c77aba1d45b553b5112f3d87071fef3076f0 --- vpx_dsp/arm/vpx_convolve8_neon.c | 184 +++++++++++++++++++++++++++++++ 1 file changed, 184 insertions(+) diff --git a/vpx_dsp/arm/vpx_convolve8_neon.c b/vpx_dsp/arm/vpx_convolve8_neon.c index a86adb4e72..7394968169 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon.c +++ b/vpx_dsp/arm/vpx_convolve8_neon.c @@ -704,6 +704,188 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, } } +#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \ + (__ARM_FEATURE_DOTPROD == 1) + +void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4])); + const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[y0_q4]), 128); + const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp)); + const uint8x16_t range_limit = vdupq_n_u8(128); + + assert(!((intptr_t)dst & 3)); + assert(!(dst_stride & 3)); + assert(y_step_q4 == 16); + + (void)x0_q4; + (void)x_step_q4; + (void)y_step_q4; + + src -= 3 * src_stride; + + if (w == 4) { + const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, d01, d23; + uint8x16_t s0, s1, s2, s3; + int32x4_t d0, d1, d2, d3; + + load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + transpose_u8_4x8(&t0, &t1, &t2, &t3, t4, t5, t6, t7); + src += 8 * src_stride; + + do { + load_u8_8x4(src, src_stride, &t8, &t9, &t10, &t11); + transpose_u8_8x4(&t8, &t9, &t10, &t11); + s0 = vcombine_u8(t0, t8); + s1 = vcombine_u8(t1, t9); + s2 = vcombine_u8(t2, t10); + s3 = vcombine_u8(t3, t11); + + d0 = convolve8_4_dot(s0, filters, correction, range_limit, permute_tbl); + d1 = convolve8_4_dot(s1, filters, correction, range_limit, permute_tbl); + d2 = convolve8_4_dot(s2, filters, correction, range_limit, permute_tbl); + d3 = convolve8_4_dot(s3, filters, correction, range_limit, permute_tbl); + + d01 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d0), vqmovn_s32(d1)), 7); + d23 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d2), vqmovn_s32(d3)), 7); + transpose_u8_4x4(&d01, &d23); + + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0); + dst += dst_stride; + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 0); + dst += dst_stride; + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 1); + dst += dst_stride; + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 1); + dst += dst_stride; + + t0 = vext_u8(t0, t8, 4); + t1 = vext_u8(t1, t9, 4); + t2 = vext_u8(t2, t10, 4); + t3 = vext_u8(t3, t11, 4); + src += 4 * src_stride; + h -= 4; + } while (h > 0); + } else if (h == 4) { + const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, d04, d15, d26, d37; + uint8x16_t s0, s1, s2, s3, s4, s5, s6, s7; + int32x4_t d0, d1, d2, d3, d4, d5, d6, d7; + const uint8_t *s; + uint8_t *d; + + do { + s = src; + d = dst; + load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + s += 8 * src_stride; + t8 = vld1_u8(s); + s += src_stride; + t9 = vld1_u8(s); + s += src_stride; + t10 = vld1_u8(s); + s += src_stride; + + transpose_u8_8x16(t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, + vdup_n_u8(0), vdup_n_u8(0), vdup_n_u8(0), vdup_n_u8(0), + vdup_n_u8(0), &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + + d0 = convolve8_4_dot(s0, filters, correction, range_limit, permute_tbl); + d1 = convolve8_4_dot(s1, filters, correction, range_limit, permute_tbl); + d2 = convolve8_4_dot(s2, filters, correction, range_limit, permute_tbl); + d3 = convolve8_4_dot(s3, filters, correction, range_limit, permute_tbl); + d4 = convolve8_4_dot(s4, filters, correction, range_limit, permute_tbl); + d5 = convolve8_4_dot(s5, filters, correction, range_limit, permute_tbl); + d6 = convolve8_4_dot(s6, filters, correction, range_limit, permute_tbl); + d7 = convolve8_4_dot(s7, filters, correction, range_limit, permute_tbl); + + d04 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d0), vqmovn_s32(d4)), 7); + d15 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d1), vqmovn_s32(d5)), 7); + d26 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d2), vqmovn_s32(d6)), 7); + d37 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d3), vqmovn_s32(d7)), 7); + + transpose_u8_8x4(&d04, &d15, &d26, &d37); + + vst1_u8(d, d04); + d += dst_stride; + vst1_u8(d, d15); + d += dst_stride; + vst1_u8(d, d26); + d += dst_stride; + vst1_u8(d, d37); + d += dst_stride; + + src += 8; + dst += 8; + w -= 8; + } while (w > 0); + } else { + const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, + t15, d0, d1, d2, d3, d4, d5, d6, d7; + uint8x16_t s0, s1, s2, s3, s4, s5, s6, s7; + const uint8_t *s; + uint8_t *d; + int height; + + do { + height = h; + s = src; + d = dst; + load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + s += 8 * src_stride; + + do { + load_u8_8x8(s, src_stride, &t8, &t9, &t10, &t11, &t12, &t13, &t14, + &t15); + transpose_u8_8x8(&t8, &t9, &t10, &t11, &t12, &t13, &t14, &t15); + s0 = vcombine_u8(t0, t8); + s1 = vcombine_u8(t1, t9); + s2 = vcombine_u8(t2, t10); + s3 = vcombine_u8(t3, t11); + s4 = vcombine_u8(t4, t12); + s5 = vcombine_u8(t5, t13); + s6 = vcombine_u8(t6, t14); + s7 = vcombine_u8(t7, t15); + + d0 = convolve8_8_dot(s0, filters, correction, range_limit, permute_tbl); + d1 = convolve8_8_dot(s1, filters, correction, range_limit, permute_tbl); + d2 = convolve8_8_dot(s2, filters, correction, range_limit, permute_tbl); + d3 = convolve8_8_dot(s3, filters, correction, range_limit, permute_tbl); + d4 = convolve8_8_dot(s4, filters, correction, range_limit, permute_tbl); + d5 = convolve8_8_dot(s5, filters, correction, range_limit, permute_tbl); + d6 = convolve8_8_dot(s6, filters, correction, range_limit, permute_tbl); + d7 = convolve8_8_dot(s7, filters, correction, range_limit, permute_tbl); + + transpose_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); + store_u8_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7); + + t0 = t8; + t1 = t9; + t2 = t10; + t3 = t11; + t4 = t12; + t5 = t13; + t6 = t14; + t7 = t15; + s += 8 * src_stride; + d += 8 * dst_stride; + height -= 8; + } while (height > 0); + src += 8; + dst += 8; + w -= 8; + } while (w > 0); + } +} + +#else + void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, @@ -876,6 +1058,8 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, } } +#endif + void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, From 4808d831dbc4e9ff83fa0efe11207bc135c6d6f5 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Wed, 12 May 2021 16:05:56 +0100 Subject: [PATCH 0172/1000] Optimize remaining mse and sse functions in variance_neon.c Implement sum of squared difference calculations in vpx_mse16x16_neon and vpx_get4x4sse_cs_neon using the ABD and UDOT instructions - instead of widening subtracts followed by a sequence of MLAs. The existing implementation is retained for use on CPUs that do not implement the Armv8.4-A UDOT instruction. This commit also updates the variable names used in the existing implementations to be more descriptive. Bug: b/181236880 Change-Id: Id4ad8ea7c808af1ac9bb5f1b63327ab487e4b1c7 --- vpx_dsp/arm/variance_neon.c | 218 ++++++++++++++++++++++-------------- 1 file changed, 133 insertions(+), 85 deletions(-) diff --git a/vpx_dsp/arm/variance_neon.c b/vpx_dsp/arm/variance_neon.c index 19aaac7b69..410ce7d9e6 100644 --- a/vpx_dsp/arm/variance_neon.c +++ b/vpx_dsp/arm/variance_neon.c @@ -357,117 +357,165 @@ unsigned int vpx_variance64x64_neon(const uint8_t *src_ptr, int src_stride, return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 12); } +#if defined(__ARM_FEATURE_DOTPROD) && (__ARM_FEATURE_DOTPROD == 1) + unsigned int vpx_mse16x16_neon(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse) { int i; - int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16; - int64x1_t d0s64; - uint8x16_t q0u8, q1u8, q2u8, q3u8; - int32x4_t q7s32, q8s32, q9s32, q10s32; - uint16x8_t q11u16, q12u16, q13u16, q14u16; - int64x2_t q1s64; - - q7s32 = vdupq_n_s32(0); - q8s32 = vdupq_n_s32(0); - q9s32 = vdupq_n_s32(0); - q10s32 = vdupq_n_s32(0); - - for (i = 0; i < 8; i++) { // mse16x16_neon_loop - q0u8 = vld1q_u8(src_ptr); + uint8x16_t a[2], b[2], abs_diff[2]; + uint32x4_t sse_vec[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + for (i = 0; i < 8; i++) { + a[0] = vld1q_u8(src_ptr); src_ptr += src_stride; - q1u8 = vld1q_u8(src_ptr); + a[1] = vld1q_u8(src_ptr); src_ptr += src_stride; - q2u8 = vld1q_u8(ref_ptr); + b[0] = vld1q_u8(ref_ptr); ref_ptr += ref_stride; - q3u8 = vld1q_u8(ref_ptr); + b[1] = vld1q_u8(ref_ptr); ref_ptr += ref_stride; - q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8)); - q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8)); - q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8)); - q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8)); - - d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); - d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); - q7s32 = vmlal_s16(q7s32, d22s16, d22s16); - q8s32 = vmlal_s16(q8s32, d23s16, d23s16); - - d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); - d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); - q9s32 = vmlal_s16(q9s32, d24s16, d24s16); - q10s32 = vmlal_s16(q10s32, d25s16, d25s16); - - d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); - d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); - q7s32 = vmlal_s16(q7s32, d26s16, d26s16); - q8s32 = vmlal_s16(q8s32, d27s16, d27s16); - - d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16)); - d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16)); - q9s32 = vmlal_s16(q9s32, d28s16, d28s16); - q10s32 = vmlal_s16(q10s32, d29s16, d29s16); - } - - q7s32 = vaddq_s32(q7s32, q8s32); - q9s32 = vaddq_s32(q9s32, q10s32); - q10s32 = vaddq_s32(q7s32, q9s32); + abs_diff[0] = vabdq_u8(a[0], b[0]); + abs_diff[1] = vabdq_u8(a[1], b[1]); - q1s64 = vpaddlq_s32(q10s32); - d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); + sse_vec[0] = vdotq_u32(sse_vec[0], abs_diff[0], abs_diff[0]); + sse_vec[1] = vdotq_u32(sse_vec[1], abs_diff[1], abs_diff[1]); + } - vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d0s64), 0); - return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0); + *sse = horizontal_add_uint32x4(vaddq_u32(sse_vec[0], sse_vec[1])); + return horizontal_add_uint32x4(vaddq_u32(sse_vec[0], sse_vec[1])); } unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride) { - int16x4_t d22s16, d24s16, d26s16, d28s16; - int64x1_t d0s64; - uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8; - int32x4_t q7s32, q8s32, q9s32, q10s32; - uint16x8_t q11u16, q12u16, q13u16, q14u16; - int64x2_t q1s64; - - d0u8 = vld1_u8(src_ptr); - src_ptr += src_stride; - d4u8 = vld1_u8(ref_ptr); - ref_ptr += ref_stride; - d1u8 = vld1_u8(src_ptr); + uint8x8_t a[4], b[4], abs_diff[4]; + uint32x2_t sse = vdup_n_u32(0); + + a[0] = vld1_u8(src_ptr); src_ptr += src_stride; - d5u8 = vld1_u8(ref_ptr); + b[0] = vld1_u8(ref_ptr); ref_ptr += ref_stride; - d2u8 = vld1_u8(src_ptr); + a[1] = vld1_u8(src_ptr); src_ptr += src_stride; - d6u8 = vld1_u8(ref_ptr); + b[1] = vld1_u8(ref_ptr); ref_ptr += ref_stride; - d3u8 = vld1_u8(src_ptr); + a[2] = vld1_u8(src_ptr); src_ptr += src_stride; - d7u8 = vld1_u8(ref_ptr); + b[2] = vld1_u8(ref_ptr); ref_ptr += ref_stride; + a[3] = vld1_u8(src_ptr); + b[3] = vld1_u8(ref_ptr); + + abs_diff[0] = vabd_u8(a[0], b[0]); + abs_diff[1] = vabd_u8(a[1], b[1]); + abs_diff[2] = vabd_u8(a[2], b[2]); + abs_diff[3] = vabd_u8(a[3], b[3]); + + sse = vdot_u32(sse, abs_diff[0], abs_diff[0]); + sse = vdot_u32(sse, abs_diff[1], abs_diff[1]); + sse = vdot_u32(sse, abs_diff[2], abs_diff[2]); + sse = vdot_u32(sse, abs_diff[3], abs_diff[3]); + + return vget_lane_u32(sse, 0); +} + +#else + +unsigned int vpx_mse16x16_neon(const unsigned char *src_ptr, int src_stride, + const unsigned char *ref_ptr, int ref_stride, + unsigned int *sse) { + int i; + uint8x16_t a[2], b[2]; + int16x4_t diff_lo[4], diff_hi[4]; + uint16x8_t diff[4]; + int32x4_t sse_vec[4] = { vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0), + vdupq_n_s32(0) }; + + for (i = 0; i < 8; i++) { + a[0] = vld1q_u8(src_ptr); + src_ptr += src_stride; + a[1] = vld1q_u8(src_ptr); + src_ptr += src_stride; + b[0] = vld1q_u8(ref_ptr); + ref_ptr += ref_stride; + b[1] = vld1q_u8(ref_ptr); + ref_ptr += ref_stride; + + diff[0] = vsubl_u8(vget_low_u8(a[0]), vget_low_u8(b[0])); + diff[1] = vsubl_u8(vget_high_u8(a[0]), vget_high_u8(b[0])); + diff[2] = vsubl_u8(vget_low_u8(a[1]), vget_low_u8(b[1])); + diff[3] = vsubl_u8(vget_high_u8(a[1]), vget_high_u8(b[1])); + + diff_lo[0] = vreinterpret_s16_u16(vget_low_u16(diff[0])); + diff_lo[1] = vreinterpret_s16_u16(vget_low_u16(diff[1])); + sse_vec[0] = vmlal_s16(sse_vec[0], diff_lo[0], diff_lo[0]); + sse_vec[1] = vmlal_s16(sse_vec[1], diff_lo[1], diff_lo[1]); + + diff_lo[2] = vreinterpret_s16_u16(vget_low_u16(diff[2])); + diff_lo[3] = vreinterpret_s16_u16(vget_low_u16(diff[3])); + sse_vec[2] = vmlal_s16(sse_vec[2], diff_lo[2], diff_lo[2]); + sse_vec[3] = vmlal_s16(sse_vec[3], diff_lo[3], diff_lo[3]); + + diff_hi[0] = vreinterpret_s16_u16(vget_high_u16(diff[0])); + diff_hi[1] = vreinterpret_s16_u16(vget_high_u16(diff[1])); + sse_vec[0] = vmlal_s16(sse_vec[0], diff_hi[0], diff_hi[0]); + sse_vec[1] = vmlal_s16(sse_vec[1], diff_hi[1], diff_hi[1]); + + diff_hi[2] = vreinterpret_s16_u16(vget_high_u16(diff[2])); + diff_hi[3] = vreinterpret_s16_u16(vget_high_u16(diff[3])); + sse_vec[2] = vmlal_s16(sse_vec[2], diff_hi[2], diff_hi[2]); + sse_vec[3] = vmlal_s16(sse_vec[3], diff_hi[3], diff_hi[3]); + } + + sse_vec[0] = vaddq_s32(sse_vec[0], sse_vec[1]); + sse_vec[2] = vaddq_s32(sse_vec[2], sse_vec[3]); + sse_vec[0] = vaddq_s32(sse_vec[0], sse_vec[2]); - q11u16 = vsubl_u8(d0u8, d4u8); - q12u16 = vsubl_u8(d1u8, d5u8); - q13u16 = vsubl_u8(d2u8, d6u8); - q14u16 = vsubl_u8(d3u8, d7u8); + *sse = horizontal_add_uint32x4(vreinterpretq_u32_s32(sse_vec[0])); + return horizontal_add_uint32x4(vreinterpretq_u32_s32(sse_vec[0])); +} + +unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int src_stride, + const unsigned char *ref_ptr, + int ref_stride) { + uint8x8_t a[4], b[4]; + int16x4_t diff_lo[4]; + uint16x8_t diff[4]; + int32x4_t sse; - d22s16 = vget_low_s16(vreinterpretq_s16_u16(q11u16)); - d24s16 = vget_low_s16(vreinterpretq_s16_u16(q12u16)); - d26s16 = vget_low_s16(vreinterpretq_s16_u16(q13u16)); - d28s16 = vget_low_s16(vreinterpretq_s16_u16(q14u16)); + a[0] = vld1_u8(src_ptr); + src_ptr += src_stride; + b[0] = vld1_u8(ref_ptr); + ref_ptr += ref_stride; + a[1] = vld1_u8(src_ptr); + src_ptr += src_stride; + b[1] = vld1_u8(ref_ptr); + ref_ptr += ref_stride; + a[2] = vld1_u8(src_ptr); + src_ptr += src_stride; + b[2] = vld1_u8(ref_ptr); + ref_ptr += ref_stride; + a[3] = vld1_u8(src_ptr); + b[3] = vld1_u8(ref_ptr); - q7s32 = vmull_s16(d22s16, d22s16); - q8s32 = vmull_s16(d24s16, d24s16); - q9s32 = vmull_s16(d26s16, d26s16); - q10s32 = vmull_s16(d28s16, d28s16); + diff[0] = vsubl_u8(a[0], b[0]); + diff[1] = vsubl_u8(a[1], b[1]); + diff[2] = vsubl_u8(a[2], b[2]); + diff[3] = vsubl_u8(a[3], b[3]); - q7s32 = vaddq_s32(q7s32, q8s32); - q9s32 = vaddq_s32(q9s32, q10s32); - q9s32 = vaddq_s32(q7s32, q9s32); + diff_lo[0] = vget_low_s16(vreinterpretq_s16_u16(diff[0])); + diff_lo[1] = vget_low_s16(vreinterpretq_s16_u16(diff[1])); + diff_lo[2] = vget_low_s16(vreinterpretq_s16_u16(diff[2])); + diff_lo[3] = vget_low_s16(vreinterpretq_s16_u16(diff[3])); - q1s64 = vpaddlq_s32(q9s32); - d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); + sse = vmull_s16(diff_lo[0], diff_lo[0]); + sse = vmlal_s16(sse, diff_lo[1], diff_lo[1]); + sse = vmlal_s16(sse, diff_lo[2], diff_lo[2]); + sse = vmlal_s16(sse, diff_lo[3], diff_lo[3]); - return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0); + return horizontal_add_uint32x4(vreinterpretq_u32_s32(sse)); } + +#endif From 66c1ff6850fd53bcf5c17247569bea1d700d6247 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Mon, 17 May 2021 10:53:07 +0100 Subject: [PATCH 0173/1000] Implement vpx_convolve8_avg_horiz_neon using SDOT instruction Add an alternative AArch64 implementation of vpx_convolve8_avg_horiz_neon for targets that implement the Armv8.4-A SDOT (signed dot product) instruction. The existing MLA-based implementation of vpx_convolve8_avg_horiz_neon is retained and used on target CPUs that do not implement the SDOT instruction (or CPUs executing in AArch32 mode). The availability of the SDOT instruction is indicated by the feature macro __ARM_FEATURE_DOTPROD. Bug: b/181236880 Change-Id: Ib435107c47c485f325248da87ba5618d68b0c8ed --- vpx_dsp/arm/vpx_convolve8_neon.c | 115 ++++++++++++++++++++++++++++++- 1 file changed, 113 insertions(+), 2 deletions(-) diff --git a/vpx_dsp/arm/vpx_convolve8_neon.c b/vpx_dsp/arm/vpx_convolve8_neon.c index 7394968169..acb128c855 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon.c +++ b/vpx_dsp/arm/vpx_convolve8_neon.c @@ -157,6 +157,117 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, } } +void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4])); + const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[x0_q4]), 128); + const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp)); + const uint8x16_t range_limit = vdupq_n_u8(128); + uint8x16_t s0, s1, s2, s3; + + assert(!((intptr_t)dst & 3)); + assert(!(dst_stride & 3)); + assert(x_step_q4 == 16); + + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + + src -= 3; + + if (w == 4) { + const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + do { + int32x4_t t0, t1, t2, t3; + int16x8_t t01, t23; + uint8x8_t d01, d23, dd01, dd23; + dd01 = vdup_n_u8(0); + dd23 = vdup_n_u8(0); + + s0 = vld1q_u8(src); + src += src_stride; + s1 = vld1q_u8(src); + src += src_stride; + s2 = vld1q_u8(src); + src += src_stride; + s3 = vld1q_u8(src); + src += src_stride; + + t0 = convolve8_4_dot(s0, filters, correction, range_limit, permute_tbl); + t1 = convolve8_4_dot(s1, filters, correction, range_limit, permute_tbl); + t2 = convolve8_4_dot(s2, filters, correction, range_limit, permute_tbl); + t3 = convolve8_4_dot(s3, filters, correction, range_limit, permute_tbl); + + t01 = vcombine_s16(vqmovn_s32(t0), vqmovn_s32(t1)); + t23 = vcombine_s16(vqmovn_s32(t2), vqmovn_s32(t3)); + d01 = vqrshrun_n_s16(t01, 7); + d23 = vqrshrun_n_s16(t23, 7); + + dd01 = load_u8(dst + 0 * dst_stride, dst_stride); + dd23 = load_u8(dst + 2 * dst_stride, dst_stride); + d01 = vrhadd_u8(d01, dd01); + d23 = vrhadd_u8(d23, dd23); + + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0); + dst += dst_stride; + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 1); + dst += dst_stride; + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 0); + dst += dst_stride; + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 1); + dst += dst_stride; + h -= 4; + } while (h > 0); + } else { + const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + const uint8_t *s; + uint8_t *d; + int width; + uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3; + + do { + width = w; + s = src; + d = dst; + do { + s0 = vld1q_u8(s + 0 * src_stride); + s1 = vld1q_u8(s + 1 * src_stride); + s2 = vld1q_u8(s + 2 * src_stride); + s3 = vld1q_u8(s + 3 * src_stride); + + d0 = convolve8_8_dot(s0, filters, correction, range_limit, permute_tbl); + d1 = convolve8_8_dot(s1, filters, correction, range_limit, permute_tbl); + d2 = convolve8_8_dot(s2, filters, correction, range_limit, permute_tbl); + d3 = convolve8_8_dot(s3, filters, correction, range_limit, permute_tbl); + + dd0 = vld1_u8(d + 0 * dst_stride); + dd1 = vld1_u8(d + 1 * dst_stride); + dd2 = vld1_u8(d + 2 * dst_stride); + dd3 = vld1_u8(d + 3 * dst_stride); + d0 = vrhadd_u8(d0, dd0); + d1 = vrhadd_u8(d1, dd1); + d2 = vrhadd_u8(d2, dd2); + d3 = vrhadd_u8(d3, dd3); + + vst1_u8(d + 0 * dst_stride, d0); + vst1_u8(d + 1 * dst_stride, d1); + vst1_u8(d + 2 * dst_stride, d2); + vst1_u8(d + 3 * dst_stride, d3); + + s += 8; + d += 8; + width -= 8; + } while (width > 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 0); + } +} + #else void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, @@ -412,8 +523,6 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, } } -#endif - void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, @@ -704,6 +813,8 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, } } +#endif + #if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \ (__ARM_FEATURE_DOTPROD == 1) From 10823f54681747b9f64deb3002531c95cc67d17f Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Sat, 22 May 2021 22:07:25 +0100 Subject: [PATCH 0174/1000] Merge transpose and permute in Neon SDOT vertical convolution The original dot-product implementation of vpx_convolve8_vert_neon used a separate transpose before and after the convolution operation. This patch merges the first transpose with the TBL permute (necessary before using SDOT to compute the convolution) to significantly reduce the amount of data re-arrangement. This new approach also allows for more effective data re-use between loop iterations. Co-authored by: James Greenhalgh Bug: b/181236880 Change-Id: I87fe4dadd312c3ad6216943b71a5410ddf4a1b5b --- vpx_dsp/arm/vpx_convolve8_neon.c | 385 ++++++++++++++++++++----------- vpx_dsp/arm/vpx_convolve8_neon.h | 38 +++ 2 files changed, 282 insertions(+), 141 deletions(-) diff --git a/vpx_dsp/arm/vpx_convolve8_neon.c b/vpx_dsp/arm/vpx_convolve8_neon.c index acb128c855..25a59a2da9 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon.c +++ b/vpx_dsp/arm/vpx_convolve8_neon.c @@ -31,37 +31,72 @@ // instructions. This optimization is much faster in speed unit test, but slowed // down the whole decoder by 5%. -static INLINE void store_u8_8x8(uint8_t *s, const ptrdiff_t p, - const uint8x8_t s0, const uint8x8_t s1, - const uint8x8_t s2, const uint8x8_t s3, - const uint8x8_t s4, const uint8x8_t s5, - const uint8x8_t s6, const uint8x8_t s7) { - vst1_u8(s, s0); - s += p; - vst1_u8(s, s1); - s += p; - vst1_u8(s, s2); - s += p; - vst1_u8(s, s3); - s += p; - vst1_u8(s, s4); - s += p; - vst1_u8(s, s5); - s += p; - vst1_u8(s, s6); - s += p; - vst1_u8(s, s7); -} - #if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \ (__ARM_FEATURE_DOTPROD == 1) - DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }; +DECLARE_ALIGNED(16, static const uint8_t, dot_prod_tran_concat_tbl[32]) = { + 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27, + 4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31 +}; + +DECLARE_ALIGNED(16, static const uint8_t, dot_prod_merge_block_tbl[48]) = { + /* Shift left and insert new last column in transposed 4x4 block. */ + 1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28, + /* Shift left and insert two new columns in transposed 4x4 block. */ + 2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29, + /* Shift left and insert three new columns in transposed 4x4 block. */ + 3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30 +}; + +static INLINE void transpose_concat_4x4(int8x8_t *a0, int8x8_t *a1, + int8x8_t *a2, int8x8_t *a3, + int8x16_t *b, + const uint8x16_t permute_tbl) { + /* Transpose 8-bit elements and concatenate result rows as follows: + * a0: 00, 01, 02, 03, XX, XX, XX, XX + * a1: 10, 11, 12, 13, XX, XX, XX, XX + * a2: 20, 21, 22, 23, XX, XX, XX, XX + * a3: 30, 31, 32, 33, XX, XX, XX, XX + * + * b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 + * + * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it + * as an argument is preferable to loading it directly from memory as this + * inline helper is called many times from the same parent function. + */ + + int8x16x2_t samples = { { vcombine_s8(*a0, *a1), vcombine_s8(*a2, *a3) } }; + *b = vqtbl2q_s8(samples, permute_tbl); +} + +static INLINE void transpose_concat_8x4(int8x8_t *a0, int8x8_t *a1, + int8x8_t *a2, int8x8_t *a3, + int8x16_t *b0, int8x16_t *b1, + const uint8x16x2_t permute_tbl) { + /* Transpose 8-bit elements and concatenate result rows as follows: + * a0: 00, 01, 02, 03, 04, 05, 06, 07 + * a1: 10, 11, 12, 13, 14, 15, 16, 17 + * a2: 20, 21, 22, 23, 24, 25, 26, 27 + * a3: 30, 31, 32, 33, 34, 35, 36, 37 + * + * b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 + * b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37 + * + * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it + * as an argument is preferable to loading it directly from memory as this + * inline helper is called many times from the same parent function. + */ + + int8x16x2_t samples = { { vcombine_s8(*a0, *a1), vcombine_s8(*a2, *a3) } }; + *b0 = vqtbl2q_s8(samples, permute_tbl.val[0]); + *b1 = vqtbl2q_s8(samples, permute_tbl.val[1]); +} + void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, @@ -270,6 +305,28 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, #else +static INLINE void store_u8_8x8(uint8_t *s, const ptrdiff_t p, + const uint8x8_t s0, const uint8x8_t s1, + const uint8x8_t s2, const uint8x8_t s3, + const uint8x8_t s4, const uint8x8_t s5, + const uint8x8_t s6, const uint8x8_t s7) { + vst1_u8(s, s0); + s += p; + vst1_u8(s, s1); + s += p; + vst1_u8(s, s2); + s += p; + vst1_u8(s, s3); + s += p; + vst1_u8(s, s4); + s += p; + vst1_u8(s, s5); + s += p; + vst1_u8(s, s6); + s += p; + vst1_u8(s, s7); +} + void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, @@ -826,7 +883,11 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4])); const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[y0_q4]), 128); const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp)); - const uint8x16_t range_limit = vdupq_n_u8(128); + const uint8x8_t range_limit = vdup_n_u8(128); + const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); + uint8x8_t t0, t1, t2, t3, t4, t5, t6; + int8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + int8x16x2_t samples_LUT; assert(!((intptr_t)dst & 3)); assert(!(dst_stride & 3)); @@ -839,154 +900,196 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, src -= 3 * src_stride; if (w == 4) { - const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl); - uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, d01, d23; - uint8x16_t s0, s1, s2, s3; + const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); + int8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910; int32x4_t d0, d1, d2, d3; + uint8x8_t d01, d23; - load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); - transpose_u8_4x8(&t0, &t1, &t2, &t3, t4, t5, t6, t7); - src += 8 * src_stride; + load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); + src += 4 * src_stride; + t4 = vld1_u8(src); + src += src_stride; + t5 = vld1_u8(src); + src += src_stride; + t6 = vld1_u8(src); + src += src_stride; + + /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ + s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); + s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit)); + s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit)); + s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit)); + s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); + s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); + s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); + s7 = vdup_n_s8(0); + s8 = vdup_n_s8(0); + s9 = vdup_n_s8(0); + + /* This operation combines a conventional transpose and the sample permute + * (see horizontal case) required before computing the dot product. + */ + transpose_concat_4x4(&s0, &s1, &s2, &s3, &s0123, tran_concat_tbl); + transpose_concat_4x4(&s1, &s2, &s3, &s4, &s1234, tran_concat_tbl); + transpose_concat_4x4(&s2, &s3, &s4, &s5, &s2345, tran_concat_tbl); + transpose_concat_4x4(&s3, &s4, &s5, &s6, &s3456, tran_concat_tbl); + transpose_concat_4x4(&s4, &s5, &s6, &s7, &s4567, tran_concat_tbl); + transpose_concat_4x4(&s5, &s6, &s7, &s8, &s5678, tran_concat_tbl); + transpose_concat_4x4(&s6, &s7, &s8, &s9, &s6789, tran_concat_tbl); do { - load_u8_8x4(src, src_stride, &t8, &t9, &t10, &t11); - transpose_u8_8x4(&t8, &t9, &t10, &t11); - s0 = vcombine_u8(t0, t8); - s1 = vcombine_u8(t1, t9); - s2 = vcombine_u8(t2, t10); - s3 = vcombine_u8(t3, t11); - - d0 = convolve8_4_dot(s0, filters, correction, range_limit, permute_tbl); - d1 = convolve8_4_dot(s1, filters, correction, range_limit, permute_tbl); - d2 = convolve8_4_dot(s2, filters, correction, range_limit, permute_tbl); - d3 = convolve8_4_dot(s3, filters, correction, range_limit, permute_tbl); + uint8x8_t t7, t8, t9, t10; + + load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10); + + s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit)); + s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit)); + s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); + s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); + + transpose_concat_4x4(&s7, &s8, &s9, &s10, &s78910, tran_concat_tbl); + + /* Merge new data into block from previous iteration. */ + samples_LUT.val[0] = s3456; + samples_LUT.val[1] = s78910; + s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); + s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); + s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); + + d0 = convolve8_4_dot_partial(s0123, s4567, correction, filters); + d1 = convolve8_4_dot_partial(s1234, s5678, correction, filters); + d2 = convolve8_4_dot_partial(s2345, s6789, correction, filters); + d3 = convolve8_4_dot_partial(s3456, s78910, correction, filters); d01 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d0), vqmovn_s32(d1)), 7); d23 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d2), vqmovn_s32(d3)), 7); - transpose_u8_4x4(&d01, &d23); vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0); dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 0); - dst += dst_stride; vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 1); dst += dst_stride; + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 0); + dst += dst_stride; vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 1); dst += dst_stride; - t0 = vext_u8(t0, t8, 4); - t1 = vext_u8(t1, t9, 4); - t2 = vext_u8(t2, t10, 4); - t3 = vext_u8(t3, t11, 4); + /* Prepare block for next iteration - re-using as much as possible. */ + /* Shuffle everything up four rows. */ + s0123 = s4567; + s1234 = s5678; + s2345 = s6789; + s3456 = s78910; + src += 4 * src_stride; h -= 4; } while (h > 0); - } else if (h == 4) { - const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl); - uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, d04, d15, d26, d37; - uint8x16_t s0, s1, s2, s3, s4, s5, s6, s7; - int32x4_t d0, d1, d2, d3, d4, d5, d6, d7; + } else { + const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl); + int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, + s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, + s6789_hi, s78910_lo, s78910_hi; + uint8x8_t d0, d1, d2, d3; const uint8_t *s; uint8_t *d; + int height; do { + height = h; s = src; d = dst; - load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); - s += 8 * src_stride; - t8 = vld1_u8(s); + + load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); + s += 4 * src_stride; + t4 = vld1_u8(s); s += src_stride; - t9 = vld1_u8(s); + t5 = vld1_u8(s); s += src_stride; - t10 = vld1_u8(s); + t6 = vld1_u8(s); s += src_stride; - transpose_u8_8x16(t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, - vdup_n_u8(0), vdup_n_u8(0), vdup_n_u8(0), vdup_n_u8(0), - vdup_n_u8(0), &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); - - d0 = convolve8_4_dot(s0, filters, correction, range_limit, permute_tbl); - d1 = convolve8_4_dot(s1, filters, correction, range_limit, permute_tbl); - d2 = convolve8_4_dot(s2, filters, correction, range_limit, permute_tbl); - d3 = convolve8_4_dot(s3, filters, correction, range_limit, permute_tbl); - d4 = convolve8_4_dot(s4, filters, correction, range_limit, permute_tbl); - d5 = convolve8_4_dot(s5, filters, correction, range_limit, permute_tbl); - d6 = convolve8_4_dot(s6, filters, correction, range_limit, permute_tbl); - d7 = convolve8_4_dot(s7, filters, correction, range_limit, permute_tbl); - - d04 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d0), vqmovn_s32(d4)), 7); - d15 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d1), vqmovn_s32(d5)), 7); - d26 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d2), vqmovn_s32(d6)), 7); - d37 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d3), vqmovn_s32(d7)), 7); - - transpose_u8_8x4(&d04, &d15, &d26, &d37); - - vst1_u8(d, d04); - d += dst_stride; - vst1_u8(d, d15); - d += dst_stride; - vst1_u8(d, d26); - d += dst_stride; - vst1_u8(d, d37); - d += dst_stride; - - src += 8; - dst += 8; - w -= 8; - } while (w > 0); - } else { - const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); - uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, - t15, d0, d1, d2, d3, d4, d5, d6, d7; - uint8x16_t s0, s1, s2, s3, s4, s5, s6, s7; - const uint8_t *s; - uint8_t *d; - int height; - - do { - height = h; - s = src; - d = dst; - load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); - transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); - s += 8 * src_stride; + /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ + s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); + s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit)); + s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit)); + s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit)); + s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); + s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); + s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); + s7 = vdup_n_s8(0); + s8 = vdup_n_s8(0); + s9 = vdup_n_s8(0); + + /* This operation combines a conventional transpose and the sample permute + * (see horizontal case) required before computing the dot product. + */ + transpose_concat_8x4(&s0, &s1, &s2, &s3, &s0123_lo, &s0123_hi, + tran_concat_tbl); + transpose_concat_8x4(&s1, &s2, &s3, &s4, &s1234_lo, &s1234_hi, + tran_concat_tbl); + transpose_concat_8x4(&s2, &s3, &s4, &s5, &s2345_lo, &s2345_hi, + tran_concat_tbl); + transpose_concat_8x4(&s3, &s4, &s5, &s6, &s3456_lo, &s3456_hi, + tran_concat_tbl); + transpose_concat_8x4(&s4, &s5, &s6, &s7, &s4567_lo, &s4567_hi, + tran_concat_tbl); + transpose_concat_8x4(&s5, &s6, &s7, &s8, &s5678_lo, &s5678_hi, + tran_concat_tbl); + transpose_concat_8x4(&s6, &s7, &s8, &s9, &s6789_lo, &s6789_hi, + tran_concat_tbl); do { - load_u8_8x8(s, src_stride, &t8, &t9, &t10, &t11, &t12, &t13, &t14, - &t15); - transpose_u8_8x8(&t8, &t9, &t10, &t11, &t12, &t13, &t14, &t15); - s0 = vcombine_u8(t0, t8); - s1 = vcombine_u8(t1, t9); - s2 = vcombine_u8(t2, t10); - s3 = vcombine_u8(t3, t11); - s4 = vcombine_u8(t4, t12); - s5 = vcombine_u8(t5, t13); - s6 = vcombine_u8(t6, t14); - s7 = vcombine_u8(t7, t15); + uint8x8_t t7, t8, t9, t10; + + load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10); + + s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit)); + s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit)); + s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); + s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); + + transpose_concat_8x4(&s7, &s8, &s9, &s10, &s78910_lo, &s78910_hi, + tran_concat_tbl); + + /* Merge new data into block from previous iteration. */ + samples_LUT.val[0] = s3456_lo; + samples_LUT.val[1] = s78910_lo; + s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); + s5678_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); + s6789_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); + + samples_LUT.val[0] = s3456_hi; + samples_LUT.val[1] = s78910_hi; + s4567_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); + s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); + s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); + + d0 = convolve8_8_dot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi, + correction, filters); + d1 = convolve8_8_dot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi, + correction, filters); + d2 = convolve8_8_dot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi, + correction, filters); + d3 = convolve8_8_dot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi, + correction, filters); + vst1_u8(d + 0 * dst_stride, d0); + vst1_u8(d + 1 * dst_stride, d1); + vst1_u8(d + 2 * dst_stride, d2); + vst1_u8(d + 3 * dst_stride, d3); - d0 = convolve8_8_dot(s0, filters, correction, range_limit, permute_tbl); - d1 = convolve8_8_dot(s1, filters, correction, range_limit, permute_tbl); - d2 = convolve8_8_dot(s2, filters, correction, range_limit, permute_tbl); - d3 = convolve8_8_dot(s3, filters, correction, range_limit, permute_tbl); - d4 = convolve8_8_dot(s4, filters, correction, range_limit, permute_tbl); - d5 = convolve8_8_dot(s5, filters, correction, range_limit, permute_tbl); - d6 = convolve8_8_dot(s6, filters, correction, range_limit, permute_tbl); - d7 = convolve8_8_dot(s7, filters, correction, range_limit, permute_tbl); - - transpose_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); - store_u8_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7); - - t0 = t8; - t1 = t9; - t2 = t10; - t3 = t11; - t4 = t12; - t5 = t13; - t6 = t14; - t7 = t15; - s += 8 * src_stride; - d += 8 * dst_stride; - height -= 8; + /* Prepare block for next iteration - re-using as much as possible. */ + /* Shuffle everything up four rows. */ + s0123_lo = s4567_lo; + s0123_hi = s4567_hi; + s1234_lo = s5678_lo; + s1234_hi = s5678_hi; + s2345_lo = s6789_lo; + s2345_hi = s6789_hi; + s3456_lo = s78910_lo; + s3456_hi = s78910_hi; + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; } while (height > 0); src += 8; dst += 8; diff --git a/vpx_dsp/arm/vpx_convolve8_neon.h b/vpx_dsp/arm/vpx_convolve8_neon.h index 14e7488540..857b6d54e2 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon.h +++ b/vpx_dsp/arm/vpx_convolve8_neon.h @@ -75,6 +75,21 @@ static INLINE void load_u8_16x8(const uint8_t *s, const ptrdiff_t p, #if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \ (__ARM_FEATURE_DOTPROD == 1) +static INLINE int32x4_t convolve8_4_dot_partial(const int8x16_t samples_lo, + const int8x16_t samples_hi, + const int32x4_t correction, + const int8x8_t filters) { + /* Sample range-clamping and permutation are performed by the caller. */ + int32x4_t sum; + + /* Accumulate dot product into 'correction' to account for range clamp. */ + sum = vdotq_lane_s32(correction, samples_lo, filters, 0); + sum = vdotq_lane_s32(sum, samples_hi, filters, 1); + + /* Narrowing and packing is performed by the caller. */ + return sum; +} + static INLINE int32x4_t convolve8_4_dot(uint8x16_t samples, const int8x8_t filters, const int32x4_t correction, @@ -100,6 +115,29 @@ static INLINE int32x4_t convolve8_4_dot(uint8x16_t samples, return sum; } +static INLINE uint8x8_t convolve8_8_dot_partial(const int8x16_t samples0_lo, + const int8x16_t samples0_hi, + const int8x16_t samples1_lo, + const int8x16_t samples1_hi, + const int32x4_t correction, + const int8x8_t filters) { + /* Sample range-clamping and permutation are performed by the caller. */ + int32x4_t sum0, sum1; + int16x8_t sum; + + /* Accumulate dot product into 'correction' to account for range clamp. */ + /* First 4 output values. */ + sum0 = vdotq_lane_s32(correction, samples0_lo, filters, 0); + sum0 = vdotq_lane_s32(sum0, samples0_hi, filters, 1); + /* Second 4 output values. */ + sum1 = vdotq_lane_s32(correction, samples1_lo, filters, 0); + sum1 = vdotq_lane_s32(sum1, samples1_hi, filters, 1); + + /* Narrow and re-pack. */ + sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1)); + return vqrshrun_n_s16(sum, 7); +} + static INLINE uint8x8_t convolve8_8_dot(uint8x16_t samples, const int8x8_t filters, const int32x4_t correction, From 35bce9389ea875b57b352a0f5f532b96aa47bff6 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Sun, 23 May 2021 13:35:15 +0100 Subject: [PATCH 0175/1000] Implement vpx_convolve8_avg_vert_neon using SDOT instruction Add an alternative AArch64 implementation of vpx_convolve8_avg_vert_neon for targets that implement the Armv8.4-A SDOT (signed dot product) instruction. The existing MLA-based implementation of vpx_convolve8_avg_vert_neon is retained and used on target CPUs that do not implement the SDOT instruction (or CPUs executing in AArch32 mode). The availability of the SDOT instruction is indicated by the feature macro __ARM_FEATURE_DOTPROD. Bug: b/181236880 Change-Id: I971c626116155e1384bff4c76fd3420312c7a15b --- vpx_dsp/arm/vpx_convolve8_neon.c | 695 ++++++++++++++++++++----------- 1 file changed, 463 insertions(+), 232 deletions(-) diff --git a/vpx_dsp/arm/vpx_convolve8_neon.c b/vpx_dsp/arm/vpx_convolve8_neon.c index 25a59a2da9..06b58c438f 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon.c +++ b/vpx_dsp/arm/vpx_convolve8_neon.c @@ -303,6 +303,467 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, } } +void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4])); + const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[y0_q4]), 128); + const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp)); + const uint8x8_t range_limit = vdup_n_u8(128); + const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); + uint8x8_t t0, t1, t2, t3, t4, t5, t6; + int8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + int8x16x2_t samples_LUT; + + assert(!((intptr_t)dst & 3)); + assert(!(dst_stride & 3)); + assert(y_step_q4 == 16); + + (void)x0_q4; + (void)x_step_q4; + (void)y_step_q4; + + src -= 3 * src_stride; + + if (w == 4) { + const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); + int8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910; + int32x4_t d0, d1, d2, d3; + uint8x8_t d01, d23; + + load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); + src += 4 * src_stride; + t4 = vld1_u8(src); + src += src_stride; + t5 = vld1_u8(src); + src += src_stride; + t6 = vld1_u8(src); + src += src_stride; + + /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ + s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); + s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit)); + s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit)); + s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit)); + s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); + s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); + s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); + s7 = vdup_n_s8(0); + s8 = vdup_n_s8(0); + s9 = vdup_n_s8(0); + + /* This operation combines a conventional transpose and the sample permute + * (see horizontal case) required before computing the dot product. + */ + transpose_concat_4x4(&s0, &s1, &s2, &s3, &s0123, tran_concat_tbl); + transpose_concat_4x4(&s1, &s2, &s3, &s4, &s1234, tran_concat_tbl); + transpose_concat_4x4(&s2, &s3, &s4, &s5, &s2345, tran_concat_tbl); + transpose_concat_4x4(&s3, &s4, &s5, &s6, &s3456, tran_concat_tbl); + transpose_concat_4x4(&s4, &s5, &s6, &s7, &s4567, tran_concat_tbl); + transpose_concat_4x4(&s5, &s6, &s7, &s8, &s5678, tran_concat_tbl); + transpose_concat_4x4(&s6, &s7, &s8, &s9, &s6789, tran_concat_tbl); + + do { + uint8x8_t t7, t8, t9, t10; + + load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10); + + s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit)); + s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit)); + s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); + s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); + + transpose_concat_4x4(&s7, &s8, &s9, &s10, &s78910, tran_concat_tbl); + + /* Merge new data into block from previous iteration. */ + samples_LUT.val[0] = s3456; + samples_LUT.val[1] = s78910; + s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); + s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); + s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); + + d0 = convolve8_4_dot_partial(s0123, s4567, correction, filters); + d1 = convolve8_4_dot_partial(s1234, s5678, correction, filters); + d2 = convolve8_4_dot_partial(s2345, s6789, correction, filters); + d3 = convolve8_4_dot_partial(s3456, s78910, correction, filters); + + d01 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d0), vqmovn_s32(d1)), 7); + d23 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d2), vqmovn_s32(d3)), 7); + + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0); + dst += dst_stride; + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 1); + dst += dst_stride; + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 0); + dst += dst_stride; + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 1); + dst += dst_stride; + + /* Prepare block for next iteration - re-using as much as possible. */ + /* Shuffle everything up four rows. */ + s0123 = s4567; + s1234 = s5678; + s2345 = s6789; + s3456 = s78910; + + src += 4 * src_stride; + h -= 4; + } while (h > 0); + } else { + const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl); + int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, + s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, + s6789_hi, s78910_lo, s78910_hi; + uint8x8_t d0, d1, d2, d3; + const uint8_t *s; + uint8_t *d; + int height; + + do { + height = h; + s = src; + d = dst; + + load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); + s += 4 * src_stride; + t4 = vld1_u8(s); + s += src_stride; + t5 = vld1_u8(s); + s += src_stride; + t6 = vld1_u8(s); + s += src_stride; + + /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ + s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); + s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit)); + s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit)); + s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit)); + s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); + s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); + s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); + s7 = vdup_n_s8(0); + s8 = vdup_n_s8(0); + s9 = vdup_n_s8(0); + + /* This operation combines a conventional transpose and the sample permute + * (see horizontal case) required before computing the dot product. + */ + transpose_concat_8x4(&s0, &s1, &s2, &s3, &s0123_lo, &s0123_hi, + tran_concat_tbl); + transpose_concat_8x4(&s1, &s2, &s3, &s4, &s1234_lo, &s1234_hi, + tran_concat_tbl); + transpose_concat_8x4(&s2, &s3, &s4, &s5, &s2345_lo, &s2345_hi, + tran_concat_tbl); + transpose_concat_8x4(&s3, &s4, &s5, &s6, &s3456_lo, &s3456_hi, + tran_concat_tbl); + transpose_concat_8x4(&s4, &s5, &s6, &s7, &s4567_lo, &s4567_hi, + tran_concat_tbl); + transpose_concat_8x4(&s5, &s6, &s7, &s8, &s5678_lo, &s5678_hi, + tran_concat_tbl); + transpose_concat_8x4(&s6, &s7, &s8, &s9, &s6789_lo, &s6789_hi, + tran_concat_tbl); + + do { + uint8x8_t t7, t8, t9, t10; + + load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10); + + s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit)); + s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit)); + s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); + s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); + + transpose_concat_8x4(&s7, &s8, &s9, &s10, &s78910_lo, &s78910_hi, + tran_concat_tbl); + + /* Merge new data into block from previous iteration. */ + samples_LUT.val[0] = s3456_lo; + samples_LUT.val[1] = s78910_lo; + s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); + s5678_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); + s6789_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); + + samples_LUT.val[0] = s3456_hi; + samples_LUT.val[1] = s78910_hi; + s4567_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); + s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); + s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); + + d0 = convolve8_8_dot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi, + correction, filters); + d1 = convolve8_8_dot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi, + correction, filters); + d2 = convolve8_8_dot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi, + correction, filters); + d3 = convolve8_8_dot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi, + correction, filters); + vst1_u8(d + 0 * dst_stride, d0); + vst1_u8(d + 1 * dst_stride, d1); + vst1_u8(d + 2 * dst_stride, d2); + vst1_u8(d + 3 * dst_stride, d3); + + /* Prepare block for next iteration - re-using as much as possible. */ + /* Shuffle everything up four rows. */ + s0123_lo = s4567_lo; + s0123_hi = s4567_hi; + s1234_lo = s5678_lo; + s1234_hi = s5678_hi; + s2345_lo = s6789_lo; + s2345_hi = s6789_hi; + s3456_lo = s78910_lo; + s3456_hi = s78910_hi; + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height > 0); + src += 8; + dst += 8; + w -= 8; + } while (w > 0); + } +} + +void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4])); + const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[y0_q4]), 128); + const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp)); + const uint8x8_t range_limit = vdup_n_u8(128); + const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); + uint8x8_t t0, t1, t2, t3, t4, t5, t6; + int8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + int8x16x2_t samples_LUT; + + assert(!((intptr_t)dst & 3)); + assert(!(dst_stride & 3)); + assert(y_step_q4 == 16); + + (void)x0_q4; + (void)x_step_q4; + (void)y_step_q4; + + src -= 3 * src_stride; + + if (w == 4) { + const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); + int8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910; + int32x4_t d0, d1, d2, d3; + uint8x8_t d01, d23, dd01, dd23; + + load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); + src += 4 * src_stride; + t4 = vld1_u8(src); + src += src_stride; + t5 = vld1_u8(src); + src += src_stride; + t6 = vld1_u8(src); + src += src_stride; + + /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ + s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); + s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit)); + s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit)); + s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit)); + s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); + s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); + s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); + s7 = vdup_n_s8(0); + s8 = vdup_n_s8(0); + s9 = vdup_n_s8(0); + + /* This operation combines a conventional transpose and the sample permute + * (see horizontal case) required before computing the dot product. + */ + transpose_concat_4x4(&s0, &s1, &s2, &s3, &s0123, tran_concat_tbl); + transpose_concat_4x4(&s1, &s2, &s3, &s4, &s1234, tran_concat_tbl); + transpose_concat_4x4(&s2, &s3, &s4, &s5, &s2345, tran_concat_tbl); + transpose_concat_4x4(&s3, &s4, &s5, &s6, &s3456, tran_concat_tbl); + transpose_concat_4x4(&s4, &s5, &s6, &s7, &s4567, tran_concat_tbl); + transpose_concat_4x4(&s5, &s6, &s7, &s8, &s5678, tran_concat_tbl); + transpose_concat_4x4(&s6, &s7, &s8, &s9, &s6789, tran_concat_tbl); + + do { + uint8x8_t t7, t8, t9, t10; + + load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10); + + s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit)); + s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit)); + s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); + s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); + + transpose_concat_4x4(&s7, &s8, &s9, &s10, &s78910, tran_concat_tbl); + + /* Merge new data into block from previous iteration. */ + samples_LUT.val[0] = s3456; + samples_LUT.val[1] = s78910; + s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); + s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); + s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); + + d0 = convolve8_4_dot_partial(s0123, s4567, correction, filters); + d1 = convolve8_4_dot_partial(s1234, s5678, correction, filters); + d2 = convolve8_4_dot_partial(s2345, s6789, correction, filters); + d3 = convolve8_4_dot_partial(s3456, s78910, correction, filters); + + d01 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d0), vqmovn_s32(d1)), 7); + d23 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d2), vqmovn_s32(d3)), 7); + + dd01 = load_u8(dst + 0 * dst_stride, dst_stride); + dd23 = load_u8(dst + 2 * dst_stride, dst_stride); + d01 = vrhadd_u8(d01, dd01); + d23 = vrhadd_u8(d23, dd23); + + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0); + dst += dst_stride; + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 1); + dst += dst_stride; + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 0); + dst += dst_stride; + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 1); + dst += dst_stride; + + /* Prepare block for next iteration - re-using as much as possible. */ + /* Shuffle everything up four rows. */ + s0123 = s4567; + s1234 = s5678; + s2345 = s6789; + s3456 = s78910; + + src += 4 * src_stride; + h -= 4; + } while (h > 0); + } else { + const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl); + int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, + s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, + s6789_hi, s78910_lo, s78910_hi; + uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3; + const uint8_t *s; + uint8_t *d; + int height; + + do { + height = h; + s = src; + d = dst; + + load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); + s += 4 * src_stride; + t4 = vld1_u8(s); + s += src_stride; + t5 = vld1_u8(s); + s += src_stride; + t6 = vld1_u8(s); + s += src_stride; + + /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ + s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); + s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit)); + s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit)); + s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit)); + s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); + s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); + s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); + s7 = vdup_n_s8(0); + s8 = vdup_n_s8(0); + s9 = vdup_n_s8(0); + + /* This operation combines a conventional transpose and the sample permute + * (see horizontal case) required before computing the dot product. + */ + transpose_concat_8x4(&s0, &s1, &s2, &s3, &s0123_lo, &s0123_hi, + tran_concat_tbl); + transpose_concat_8x4(&s1, &s2, &s3, &s4, &s1234_lo, &s1234_hi, + tran_concat_tbl); + transpose_concat_8x4(&s2, &s3, &s4, &s5, &s2345_lo, &s2345_hi, + tran_concat_tbl); + transpose_concat_8x4(&s3, &s4, &s5, &s6, &s3456_lo, &s3456_hi, + tran_concat_tbl); + transpose_concat_8x4(&s4, &s5, &s6, &s7, &s4567_lo, &s4567_hi, + tran_concat_tbl); + transpose_concat_8x4(&s5, &s6, &s7, &s8, &s5678_lo, &s5678_hi, + tran_concat_tbl); + transpose_concat_8x4(&s6, &s7, &s8, &s9, &s6789_lo, &s6789_hi, + tran_concat_tbl); + + do { + uint8x8_t t7, t8, t9, t10; + + load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10); + + s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit)); + s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit)); + s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); + s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); + + transpose_concat_8x4(&s7, &s8, &s9, &s10, &s78910_lo, &s78910_hi, + tran_concat_tbl); + + /* Merge new data into block from previous iteration. */ + samples_LUT.val[0] = s3456_lo; + samples_LUT.val[1] = s78910_lo; + s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); + s5678_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); + s6789_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); + + samples_LUT.val[0] = s3456_hi; + samples_LUT.val[1] = s78910_hi; + s4567_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); + s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); + s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); + + d0 = convolve8_8_dot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi, + correction, filters); + d1 = convolve8_8_dot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi, + correction, filters); + d2 = convolve8_8_dot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi, + correction, filters); + d3 = convolve8_8_dot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi, + correction, filters); + + dd0 = vld1_u8(d + 0 * dst_stride); + dd1 = vld1_u8(d + 1 * dst_stride); + dd2 = vld1_u8(d + 2 * dst_stride); + dd3 = vld1_u8(d + 3 * dst_stride); + d0 = vrhadd_u8(d0, dd0); + d1 = vrhadd_u8(d1, dd1); + d2 = vrhadd_u8(d2, dd2); + d3 = vrhadd_u8(d3, dd3); + + vst1_u8(d + 0 * dst_stride, d0); + vst1_u8(d + 1 * dst_stride, d1); + vst1_u8(d + 2 * dst_stride, d2); + vst1_u8(d + 3 * dst_stride, d3); + + /* Prepare block for next iteration - re-using as much as possible. */ + /* Shuffle everything up four rows. */ + s0123_lo = s4567_lo; + s0123_hi = s4567_hi; + s1234_lo = s5678_lo; + s1234_hi = s5678_hi; + s2345_lo = s6789_lo; + s2345_hi = s6789_hi; + s3456_lo = s78910_lo; + s3456_hi = s78910_hi; + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height > 0); + src += 8; + dst += 8; + w -= 8; + } while (w > 0); + } +} + #else static INLINE void store_u8_8x8(uint8_t *s, const ptrdiff_t p, @@ -870,236 +1331,6 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, } } -#endif - -#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \ - (__ARM_FEATURE_DOTPROD == 1) - -void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *filter, int x0_q4, - int x_step_q4, int y0_q4, int y_step_q4, int w, - int h) { - const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4])); - const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[y0_q4]), 128); - const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp)); - const uint8x8_t range_limit = vdup_n_u8(128); - const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); - uint8x8_t t0, t1, t2, t3, t4, t5, t6; - int8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; - int8x16x2_t samples_LUT; - - assert(!((intptr_t)dst & 3)); - assert(!(dst_stride & 3)); - assert(y_step_q4 == 16); - - (void)x0_q4; - (void)x_step_q4; - (void)y_step_q4; - - src -= 3 * src_stride; - - if (w == 4) { - const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); - int8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910; - int32x4_t d0, d1, d2, d3; - uint8x8_t d01, d23; - - load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); - src += 4 * src_stride; - t4 = vld1_u8(src); - src += src_stride; - t5 = vld1_u8(src); - src += src_stride; - t6 = vld1_u8(src); - src += src_stride; - - /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ - s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); - s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit)); - s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit)); - s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit)); - s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); - s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); - s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); - s7 = vdup_n_s8(0); - s8 = vdup_n_s8(0); - s9 = vdup_n_s8(0); - - /* This operation combines a conventional transpose and the sample permute - * (see horizontal case) required before computing the dot product. - */ - transpose_concat_4x4(&s0, &s1, &s2, &s3, &s0123, tran_concat_tbl); - transpose_concat_4x4(&s1, &s2, &s3, &s4, &s1234, tran_concat_tbl); - transpose_concat_4x4(&s2, &s3, &s4, &s5, &s2345, tran_concat_tbl); - transpose_concat_4x4(&s3, &s4, &s5, &s6, &s3456, tran_concat_tbl); - transpose_concat_4x4(&s4, &s5, &s6, &s7, &s4567, tran_concat_tbl); - transpose_concat_4x4(&s5, &s6, &s7, &s8, &s5678, tran_concat_tbl); - transpose_concat_4x4(&s6, &s7, &s8, &s9, &s6789, tran_concat_tbl); - - do { - uint8x8_t t7, t8, t9, t10; - - load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10); - - s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit)); - s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit)); - s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); - s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); - - transpose_concat_4x4(&s7, &s8, &s9, &s10, &s78910, tran_concat_tbl); - - /* Merge new data into block from previous iteration. */ - samples_LUT.val[0] = s3456; - samples_LUT.val[1] = s78910; - s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); - s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); - s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); - - d0 = convolve8_4_dot_partial(s0123, s4567, correction, filters); - d1 = convolve8_4_dot_partial(s1234, s5678, correction, filters); - d2 = convolve8_4_dot_partial(s2345, s6789, correction, filters); - d3 = convolve8_4_dot_partial(s3456, s78910, correction, filters); - - d01 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d0), vqmovn_s32(d1)), 7); - d23 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d2), vqmovn_s32(d3)), 7); - - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 1); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 0); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 1); - dst += dst_stride; - - /* Prepare block for next iteration - re-using as much as possible. */ - /* Shuffle everything up four rows. */ - s0123 = s4567; - s1234 = s5678; - s2345 = s6789; - s3456 = s78910; - - src += 4 * src_stride; - h -= 4; - } while (h > 0); - } else { - const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl); - int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, - s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, - s6789_hi, s78910_lo, s78910_hi; - uint8x8_t d0, d1, d2, d3; - const uint8_t *s; - uint8_t *d; - int height; - - do { - height = h; - s = src; - d = dst; - - load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); - s += 4 * src_stride; - t4 = vld1_u8(s); - s += src_stride; - t5 = vld1_u8(s); - s += src_stride; - t6 = vld1_u8(s); - s += src_stride; - - /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ - s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); - s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit)); - s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit)); - s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit)); - s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); - s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); - s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); - s7 = vdup_n_s8(0); - s8 = vdup_n_s8(0); - s9 = vdup_n_s8(0); - - /* This operation combines a conventional transpose and the sample permute - * (see horizontal case) required before computing the dot product. - */ - transpose_concat_8x4(&s0, &s1, &s2, &s3, &s0123_lo, &s0123_hi, - tran_concat_tbl); - transpose_concat_8x4(&s1, &s2, &s3, &s4, &s1234_lo, &s1234_hi, - tran_concat_tbl); - transpose_concat_8x4(&s2, &s3, &s4, &s5, &s2345_lo, &s2345_hi, - tran_concat_tbl); - transpose_concat_8x4(&s3, &s4, &s5, &s6, &s3456_lo, &s3456_hi, - tran_concat_tbl); - transpose_concat_8x4(&s4, &s5, &s6, &s7, &s4567_lo, &s4567_hi, - tran_concat_tbl); - transpose_concat_8x4(&s5, &s6, &s7, &s8, &s5678_lo, &s5678_hi, - tran_concat_tbl); - transpose_concat_8x4(&s6, &s7, &s8, &s9, &s6789_lo, &s6789_hi, - tran_concat_tbl); - - do { - uint8x8_t t7, t8, t9, t10; - - load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10); - - s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit)); - s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit)); - s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); - s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); - - transpose_concat_8x4(&s7, &s8, &s9, &s10, &s78910_lo, &s78910_hi, - tran_concat_tbl); - - /* Merge new data into block from previous iteration. */ - samples_LUT.val[0] = s3456_lo; - samples_LUT.val[1] = s78910_lo; - s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); - s5678_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); - s6789_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); - - samples_LUT.val[0] = s3456_hi; - samples_LUT.val[1] = s78910_hi; - s4567_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); - s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); - s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); - - d0 = convolve8_8_dot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi, - correction, filters); - d1 = convolve8_8_dot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi, - correction, filters); - d2 = convolve8_8_dot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi, - correction, filters); - d3 = convolve8_8_dot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi, - correction, filters); - vst1_u8(d + 0 * dst_stride, d0); - vst1_u8(d + 1 * dst_stride, d1); - vst1_u8(d + 2 * dst_stride, d2); - vst1_u8(d + 3 * dst_stride, d3); - - /* Prepare block for next iteration - re-using as much as possible. */ - /* Shuffle everything up four rows. */ - s0123_lo = s4567_lo; - s0123_hi = s4567_hi; - s1234_lo = s5678_lo; - s1234_hi = s5678_hi; - s2345_lo = s6789_lo; - s2345_hi = s6789_hi; - s3456_lo = s78910_lo; - s3456_hi = s78910_hi; - - s += 4 * src_stride; - d += 4 * dst_stride; - height -= 4; - } while (height > 0); - src += 8; - dst += 8; - w -= 8; - } while (w > 0); - } -} - -#else - void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, @@ -1272,8 +1503,6 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, } } -#endif - void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, @@ -1464,3 +1693,5 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, } while (w != 0); } } + +#endif From dbda032fcfb323bfa74af52f86b26f337b0dc6be Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Mon, 24 May 2021 11:42:09 +0100 Subject: [PATCH 0176/1000] Use 'ptrdiff_t' instead of 'int' for pointer offset parameters A number of the load/store functions in mem_neon.h use type 'int' for the 'stride' pointer offset parameter. This causes Clang to generate the following warning every time these functions are called with a wider type passed in for 'stride': warning: implicit conversion loses integer precision: 'ptrdiff_t' (aka 'long') to 'int' [-Wshorten-64-to-32] This patch changes all such instances of 'int' to 'ptrdiff_t'. Bug: b/181236880 Change-Id: I2e86b005219e1fbb54f7cf2465e918b7c077f7ee --- vpx_dsp/arm/mem_neon.h | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/vpx_dsp/arm/mem_neon.h b/vpx_dsp/arm/mem_neon.h index c89f92d1ad..50aaa94fe0 100644 --- a/vpx_dsp/arm/mem_neon.h +++ b/vpx_dsp/arm/mem_neon.h @@ -113,7 +113,8 @@ static INLINE void uint32_to_mem(uint8_t *buf, uint32_t a) { } // Load 2 sets of 4 bytes when alignment is not guaranteed. -static INLINE uint8x8_t load_unaligned_u8(const uint8_t *buf, int stride) { +static INLINE uint8x8_t load_unaligned_u8(const uint8_t *buf, + ptrdiff_t stride) { uint32_t a; uint32x2_t a_u32 = vdup_n_u32(0); if (stride == 4) return vld1_u8(buf); @@ -126,7 +127,7 @@ static INLINE uint8x8_t load_unaligned_u8(const uint8_t *buf, int stride) { } // Store 2 sets of 4 bytes when alignment is not guaranteed. -static INLINE void store_unaligned_u8(uint8_t *buf, int stride, +static INLINE void store_unaligned_u8(uint8_t *buf, ptrdiff_t stride, const uint8x8_t a) { const uint32x2_t a_u32 = vreinterpret_u32_u8(a); if (stride == 4) { @@ -139,7 +140,8 @@ static INLINE void store_unaligned_u8(uint8_t *buf, int stride, } // Load 4 sets of 4 bytes when alignment is not guaranteed. -static INLINE uint8x16_t load_unaligned_u8q(const uint8_t *buf, int stride) { +static INLINE uint8x16_t load_unaligned_u8q(const uint8_t *buf, + ptrdiff_t stride) { uint32_t a; uint32x4_t a_u32 = vdupq_n_u32(0); if (stride == 4) return vld1q_u8(buf); @@ -159,7 +161,7 @@ static INLINE uint8x16_t load_unaligned_u8q(const uint8_t *buf, int stride) { } // Store 4 sets of 4 bytes when alignment is not guaranteed. -static INLINE void store_unaligned_u8q(uint8_t *buf, int stride, +static INLINE void store_unaligned_u8q(uint8_t *buf, ptrdiff_t stride, const uint8x16_t a) { const uint32x4_t a_u32 = vreinterpretq_u32_u8(a); if (stride == 4) { @@ -176,7 +178,7 @@ static INLINE void store_unaligned_u8q(uint8_t *buf, int stride, } // Load 2 sets of 4 bytes when alignment is guaranteed. -static INLINE uint8x8_t load_u8(const uint8_t *buf, int stride) { +static INLINE uint8x8_t load_u8(const uint8_t *buf, ptrdiff_t stride) { uint32x2_t a = vdup_n_u32(0); assert(!((intptr_t)buf % sizeof(uint32_t))); @@ -189,7 +191,7 @@ static INLINE uint8x8_t load_u8(const uint8_t *buf, int stride) { } // Store 2 sets of 4 bytes when alignment is guaranteed. -static INLINE void store_u8(uint8_t *buf, int stride, const uint8x8_t a) { +static INLINE void store_u8(uint8_t *buf, ptrdiff_t stride, const uint8x8_t a) { uint32x2_t a_u32 = vreinterpret_u32_u8(a); assert(!((intptr_t)buf % sizeof(uint32_t))); From d42b93a15f182b0cd2bcc639e6951c40da6da8ce Mon Sep 17 00:00:00 2001 From: Chunbo Hua Date: Wed, 26 May 2021 02:02:07 -0700 Subject: [PATCH 0177/1000] Fixed redundant wording for decoder algorithm interface Change-Id: Id56e03dc9cf6d4e70c4681896f29893a9b4c76f2 --- vpx/internal/vpx_codec_internal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vpx/internal/vpx_codec_internal.h b/vpx/internal/vpx_codec_internal.h index 4ef93057f5..961b0bfe4c 100644 --- a/vpx/internal/vpx_codec_internal.h +++ b/vpx/internal/vpx_codec_internal.h @@ -283,7 +283,7 @@ typedef const struct vpx_codec_enc_cfg_map { vpx_codec_enc_cfg_t cfg; } vpx_codec_enc_cfg_map_t; -/*!\brief Decoder algorithm interface interface +/*!\brief Decoder algorithm interface * * All decoders \ref MUST expose a variable of this type. */ From b8273e8ae5c14bccefde96170507336a4f15c98c Mon Sep 17 00:00:00 2001 From: Cheng Chen Date: Mon, 24 May 2021 15:53:06 -0700 Subject: [PATCH 0178/1000] Fix simple encode Properly init and delete cpi struct in simple encode functions. Change-Id: I6e66bcac852cbb3dec9b754ba3fb01a348ac98b8 --- vp9/encoder/vp9_firstpass.c | 6 ++--- vp9/encoder/vp9_firstpass.h | 5 ++-- vp9/simple_encode.cc | 48 +++++++++++++++++++------------------ 3 files changed, 30 insertions(+), 29 deletions(-) diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index b63d47a05e..aeb17aa6fa 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -3878,9 +3878,9 @@ int vp9_get_gop_coding_frame_count(const VP9EncoderConfig *oxcf, // coding frames (including show frame and alt ref) can be determined. int vp9_get_coding_frame_num(const VP9EncoderConfig *oxcf, const TWO_PASS *const twopass, - const FRAME_INFO *frame_info, - const FIRST_PASS_INFO *first_pass_info, - int multi_layer_arf, int allow_alt_ref) { + const FRAME_INFO *frame_info, int multi_layer_arf, + int allow_alt_ref) { + const FIRST_PASS_INFO *first_pass_info = &twopass->first_pass_info; int coding_frame_num = 0; RATE_CONTROL rc; int gop_coding_frame_count; diff --git a/vp9/encoder/vp9_firstpass.h b/vp9/encoder/vp9_firstpass.h index cdbcb52412..e504528f15 100644 --- a/vp9/encoder/vp9_firstpass.h +++ b/vp9/encoder/vp9_firstpass.h @@ -305,9 +305,8 @@ int vp9_get_gop_coding_frame_count(const struct VP9EncoderConfig *oxcf, int vp9_get_coding_frame_num(const struct VP9EncoderConfig *oxcf, const TWO_PASS *const twopass, - const FRAME_INFO *frame_info, - const FIRST_PASS_INFO *first_pass_info, - int multi_layer_arf, int allow_alt_ref); + const FRAME_INFO *frame_info, int multi_layer_arf, + int allow_alt_ref); /*!\brief Compute a key frame binary map indicates whether key frames appear * in the corresponding positions. The passed in key_frame_map must point to an diff --git a/vp9/simple_encode.cc b/vp9/simple_encode.cc index efdc71eb98..48551ef72f 100644 --- a/vp9/simple_encode.cc +++ b/vp9/simple_encode.cc @@ -872,14 +872,14 @@ void SimpleEncode::ComputeFirstPassStats() { const VP9EncoderConfig oxcf = GetEncodeConfig( frame_width_, frame_height_, frame_rate, target_bitrate_, encode_speed_, VPX_RC_FIRST_PASS, impl_ptr_->encode_config_list); - VP9_COMP *cpi = init_encoder(&oxcf, impl_ptr_->img_fmt); - struct lookahead_ctx *lookahead = cpi->lookahead; + impl_ptr_->cpi = init_encoder(&oxcf, impl_ptr_->img_fmt); + struct lookahead_ctx *lookahead = impl_ptr_->cpi->lookahead; int i; int use_highbitdepth = 0; const int num_rows_16x16 = get_num_unit_16x16(frame_height_); const int num_cols_16x16 = get_num_unit_16x16(frame_width_); #if CONFIG_VP9_HIGHBITDEPTH - use_highbitdepth = cpi->common.use_highbitdepth; + use_highbitdepth = impl_ptr_->cpi->common.use_highbitdepth; #endif vpx_image_t img; vpx_img_alloc(&img, impl_ptr_->img_fmt, frame_width_, frame_height_, 1); @@ -905,30 +905,38 @@ void SimpleEncode::ComputeFirstPassStats() { ENCODE_FRAME_RESULT encode_frame_info; vp9_init_encode_frame_result(&encode_frame_info); // TODO(angiebird): Call vp9_first_pass directly - vp9_get_compressed_data(cpi, &frame_flags, &size, nullptr, &time_stamp, - &time_end, flush, &encode_frame_info); + vp9_get_compressed_data(impl_ptr_->cpi, &frame_flags, &size, nullptr, + &time_stamp, &time_end, flush, + &encode_frame_info); // vp9_get_compressed_data only generates first pass stats not // compresses data assert(size == 0); // Get vp9 first pass motion vector info. std::vector mv_info(num_rows_16x16 * num_cols_16x16); - update_motion_vector_info(cpi->fp_motion_vector_info, num_rows_16x16, - num_cols_16x16, mv_info.data(), - kMotionVectorFullPixelPrecision); + update_motion_vector_info( + impl_ptr_->cpi->fp_motion_vector_info, num_rows_16x16, + num_cols_16x16, mv_info.data(), kMotionVectorFullPixelPrecision); fp_motion_vector_info_.push_back(mv_info); } - impl_ptr_->first_pass_stats.push_back(vp9_get_frame_stats(&cpi->twopass)); + impl_ptr_->first_pass_stats.push_back( + vp9_get_frame_stats(&impl_ptr_->cpi->twopass)); } } - vp9_end_first_pass(cpi); // TODO(angiebird): Store the total_stats apart form first_pass_stats - impl_ptr_->first_pass_stats.push_back(vp9_get_total_stats(&cpi->twopass)); - free_encoder(cpi); - rewind(in_file_); - vpx_img_free(&img); + impl_ptr_->first_pass_stats.push_back( + vp9_get_total_stats(&impl_ptr_->cpi->twopass)); + vp9_end_first_pass(impl_ptr_->cpi); + fps_init_first_pass_info(&cpi->twopass.first_pass_info, + GetVectorData(impl_ptr_->first_pass_stats), + num_frames_); // Generate key_frame_map based on impl_ptr_->first_pass_stats. key_frame_map_ = ComputeKeyFrameMap(); + + free_encoder(impl_ptr_->cpi); + impl_ptr_->cpi = nullptr; + rewind(in_file_); + vpx_img_free(&img); } std::vector> SimpleEncode::ObserveFirstPassStats() { @@ -1249,7 +1257,7 @@ int SimpleEncode::GetCodingFrameNum() const { } // These are the default settings for now. - const VP9_COMP *cpi = impl_ptr_->cpi; + VP9_COMP *cpi = impl_ptr_->cpi; const int multi_layer_arf = 0; const int allow_alt_ref = 1; vpx_rational_t frame_rate = @@ -1258,13 +1266,11 @@ int SimpleEncode::GetCodingFrameNum() const { frame_width_, frame_height_, frame_rate, target_bitrate_, encode_speed_, VPX_RC_LAST_PASS, impl_ptr_->encode_config_list); FRAME_INFO frame_info = vp9_get_frame_info(&oxcf); - FIRST_PASS_INFO first_pass_info; - fps_init_first_pass_info(&first_pass_info, + fps_init_first_pass_info(&cpi->twopass.first_pass_info, GetVectorData(impl_ptr_->first_pass_stats), num_frames_); return vp9_get_coding_frame_num(&oxcf, &cpi->twopass, &frame_info, - &first_pass_info, multi_layer_arf, - allow_alt_ref); + multi_layer_arf, allow_alt_ref); } std::vector SimpleEncode::ComputeKeyFrameMap() const { @@ -1276,10 +1282,6 @@ std::vector SimpleEncode::ComputeKeyFrameMap() const { const VP9EncoderConfig oxcf = GetEncodeConfig( frame_width_, frame_height_, frame_rate, target_bitrate_, encode_speed_, VPX_RC_LAST_PASS, impl_ptr_->encode_config_list); - FIRST_PASS_INFO first_pass_info; - fps_init_first_pass_info(&first_pass_info, - GetVectorData(impl_ptr_->first_pass_stats), - num_frames_); std::vector key_frame_map(num_frames_, 0); vp9_get_key_frame_map(&oxcf, &cpi->twopass, GetVectorData(key_frame_map)); return key_frame_map; From 463d33145de28770f815466db0ffc85d14442043 Mon Sep 17 00:00:00 2001 From: Cheng Chen Date: Thu, 27 May 2021 15:38:28 -0700 Subject: [PATCH 0179/1000] L2E: properly init two pass rc parameters Two pass rc parameters are only initialized in the second pass in vp9 normal two pass encoding. However, the simple_encode API queries the keyframe group, arf group, and number of coding frames without going throught the two pass route. Since recent libvpx rc changes, parameters in the TWO_PASS struct have a great influence on the determination of the above information. We therefore need to properly init two pass rc parameters in the simple_encode related environment. Change-Id: Ie14b86d6e7ebf171b638d2da24a7fdcf5a15c3d9 --- vp9/encoder/vp9_firstpass.c | 6 +++--- vp9/encoder/vp9_firstpass.h | 3 ++- vp9/simple_encode.cc | 25 ++++++++++++++++--------- 3 files changed, 21 insertions(+), 13 deletions(-) diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index aeb17aa6fa..28a22ded6d 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -3487,7 +3487,7 @@ static int is_skippable_frame(const VP9_COMP *cpi) { // Configure image size specific vizier parameters. // Later these will be set via additional command line options -static void init_vizier_params(TWO_PASS *const twopass, int screen_area) { +void vp9_init_vizier_params(TWO_PASS *const twopass, int screen_area) { // When |use_vizier_rc_params| is 1, we expect the rc parameters below to // have been initialised on the command line as adjustment factors such // that a factor of 1.0 will match the default behavior when @@ -3561,7 +3561,7 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { if (cm->current_video_frame == 0) { unsigned int screen_area = (cm->width * cm->height); - init_vizier_params(twopass, screen_area); + vp9_init_vizier_params(twopass, screen_area); } // If this is an arf frame then we dont want to read the stats file or @@ -3877,7 +3877,7 @@ int vp9_get_gop_coding_frame_count(const VP9EncoderConfig *oxcf, // Under CONFIG_RATE_CTRL, once the first_pass_info is ready, the number of // coding frames (including show frame and alt ref) can be determined. int vp9_get_coding_frame_num(const VP9EncoderConfig *oxcf, - const TWO_PASS *const twopass, + TWO_PASS *const twopass, const FRAME_INFO *frame_info, int multi_layer_arf, int allow_alt_ref) { const FIRST_PASS_INFO *first_pass_info = &twopass->first_pass_info; diff --git a/vp9/encoder/vp9_firstpass.h b/vp9/encoder/vp9_firstpass.h index e504528f15..ff0eb40c87 100644 --- a/vp9/encoder/vp9_firstpass.h +++ b/vp9/encoder/vp9_firstpass.h @@ -257,6 +257,7 @@ void vp9_first_pass_encode_tile_mb_row(struct VP9_COMP *cpi, void vp9_init_second_pass(struct VP9_COMP *cpi); void vp9_rc_get_second_pass_params(struct VP9_COMP *cpi); +void vp9_init_vizier_params(TWO_PASS *const twopass, int screen_area); // Post encode update of the rate control parameters for 2-pass void vp9_twopass_postencode_update(struct VP9_COMP *cpi); @@ -304,7 +305,7 @@ int vp9_get_gop_coding_frame_count(const struct VP9EncoderConfig *oxcf, int last_gop_use_alt_ref, int *use_alt_ref); int vp9_get_coding_frame_num(const struct VP9EncoderConfig *oxcf, - const TWO_PASS *const twopass, + TWO_PASS *const twopass, const FRAME_INFO *frame_info, int multi_layer_arf, int allow_alt_ref); diff --git a/vp9/simple_encode.cc b/vp9/simple_encode.cc index 48551ef72f..7371eee8b9 100644 --- a/vp9/simple_encode.cc +++ b/vp9/simple_encode.cc @@ -926,9 +926,6 @@ void SimpleEncode::ComputeFirstPassStats() { impl_ptr_->first_pass_stats.push_back( vp9_get_total_stats(&impl_ptr_->cpi->twopass)); vp9_end_first_pass(impl_ptr_->cpi); - fps_init_first_pass_info(&cpi->twopass.first_pass_info, - GetVectorData(impl_ptr_->first_pass_stats), - num_frames_); // Generate key_frame_map based on impl_ptr_->first_pass_stats. key_frame_map_ = ComputeKeyFrameMap(); @@ -1057,6 +1054,11 @@ void SimpleEncode::StartEncode() { frame_coding_index_ = 0; show_frame_count_ = 0; + assert(impl_ptr_->cpi != nullptr); + FRAME_INFO frame_info = vp9_get_frame_info(&oxcf); + unsigned int screen_area = frame_info.frame_width * frame_info.frame_height; + vp9_init_vizier_params(&impl_ptr_->cpi->twopass, screen_area); + UpdateKeyFrameGroup(show_frame_count_); const GOP_COMMAND gop_command = GetGopCommand(gop_map_, show_frame_count_); @@ -1257,7 +1259,7 @@ int SimpleEncode::GetCodingFrameNum() const { } // These are the default settings for now. - VP9_COMP *cpi = impl_ptr_->cpi; + TWO_PASS twopass; const int multi_layer_arf = 0; const int allow_alt_ref = 1; vpx_rational_t frame_rate = @@ -1266,15 +1268,16 @@ int SimpleEncode::GetCodingFrameNum() const { frame_width_, frame_height_, frame_rate, target_bitrate_, encode_speed_, VPX_RC_LAST_PASS, impl_ptr_->encode_config_list); FRAME_INFO frame_info = vp9_get_frame_info(&oxcf); - fps_init_first_pass_info(&cpi->twopass.first_pass_info, + fps_init_first_pass_info(&twopass.first_pass_info, GetVectorData(impl_ptr_->first_pass_stats), num_frames_); - return vp9_get_coding_frame_num(&oxcf, &cpi->twopass, &frame_info, - multi_layer_arf, allow_alt_ref); + unsigned int screen_area = frame_info.frame_width * frame_info.frame_height; + vp9_init_vizier_params(&twopass, screen_area); + return vp9_get_coding_frame_num(&oxcf, &twopass, &frame_info, multi_layer_arf, + allow_alt_ref); } std::vector SimpleEncode::ComputeKeyFrameMap() const { - const VP9_COMP *cpi = impl_ptr_->cpi; // The last entry of first_pass_stats is the overall stats. assert(impl_ptr_->first_pass_stats.size() == num_frames_ + 1); vpx_rational_t frame_rate = @@ -1282,8 +1285,12 @@ std::vector SimpleEncode::ComputeKeyFrameMap() const { const VP9EncoderConfig oxcf = GetEncodeConfig( frame_width_, frame_height_, frame_rate, target_bitrate_, encode_speed_, VPX_RC_LAST_PASS, impl_ptr_->encode_config_list); + TWO_PASS twopass; + fps_init_first_pass_info(&twopass.first_pass_info, + GetVectorData(impl_ptr_->first_pass_stats), + num_frames_); std::vector key_frame_map(num_frames_, 0); - vp9_get_key_frame_map(&oxcf, &cpi->twopass, GetVectorData(key_frame_map)); + vp9_get_key_frame_map(&oxcf, &twopass, GetVectorData(key_frame_map)); return key_frame_map; } From 71d09c34fff6f51a153b8732eef6bfb4e381fcbf Mon Sep 17 00:00:00 2001 From: James Zern Date: Sat, 5 Jun 2021 19:30:04 -0700 Subject: [PATCH 0180/1000] simple_encode_test: fix input file path this allows the file to be located in LIBVPX_TEST_DATA_PATH similar to other test sources. Bug: webm:1731 Change-Id: I51606635d91871e7c179aa8d20d4841b0d60b6ad --- test/simple_encode_test.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test/simple_encode_test.cc b/test/simple_encode_test.cc index ab893045d8..03e28e3387 100644 --- a/test/simple_encode_test.cc +++ b/test/simple_encode_test.cc @@ -13,6 +13,7 @@ #include #include #include "third_party/googletest/src/include/gtest/gtest.h" +#include "test/video_source.h" #include "vp9/simple_encode.h" namespace vp9 { @@ -36,7 +37,8 @@ class SimpleEncodeTest : public ::testing::Test { const int frame_rate_den_ = 1; const int target_bitrate_ = 1000; const int num_frames_ = 17; - const std::string in_file_path_str_ = "bus_352x288_420_f20_b8.yuv"; + const std::string in_file_path_str_ = + libvpx_test::GetDataPath() + "/bus_352x288_420_f20_b8.yuv"; }; TEST_F(SimpleEncodeTest, ComputeFirstPassStats) { From 5d678fe78a5a0ece72cddbf7d7071ef8dc3598dc Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 9 Jun 2021 15:07:15 -0700 Subject: [PATCH 0181/1000] simple_encode: fix some -Wsign-compare warnings Bug: webm:1731 Change-Id: I1db777c0c3a8784fb3dcf7cd39f78ebf833ab915 --- vp9/simple_encode.cc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vp9/simple_encode.cc b/vp9/simple_encode.cc index 7371eee8b9..8ff5ad3c98 100644 --- a/vp9/simple_encode.cc +++ b/vp9/simple_encode.cc @@ -1248,7 +1248,7 @@ static int GetCodingFrameNumFromGopMap(const std::vector &gop_map) { start_show_index += gop_command.show_frame_count; coding_frame_count += gop_command_coding_frame_count(&gop_command); } - assert(start_show_index == gop_map.size()); + assert(static_cast(start_show_index) == gop_map.size()); return coding_frame_count; } @@ -1279,7 +1279,8 @@ int SimpleEncode::GetCodingFrameNum() const { std::vector SimpleEncode::ComputeKeyFrameMap() const { // The last entry of first_pass_stats is the overall stats. - assert(impl_ptr_->first_pass_stats.size() == num_frames_ + 1); + assert(impl_ptr_->first_pass_stats.size() == + static_cast(num_frames_) + 1); vpx_rational_t frame_rate = make_vpx_rational(frame_rate_num_, frame_rate_den_); const VP9EncoderConfig oxcf = GetEncodeConfig( From d85c54d4e870e979062e275a1a58a3a44f64e601 Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 11 Jun 2021 16:34:41 -0700 Subject: [PATCH 0182/1000] Update some comments for rc_target_bitrate this mirrors the change from libaom: 5b150b150 Update some comments for rc_target_bitrate Change-Id: Iaabee5924e0320609a29dc8ab71327923fb4c5d2 --- vp8/vp8_cx_iface.c | 2 +- vpx/vpx_encoder.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c index 8d3044f6a4..78631e7976 100644 --- a/vp8/vp8_cx_iface.c +++ b/vp8/vp8_cx_iface.c @@ -1276,7 +1276,7 @@ static vpx_codec_enc_cfg_map_t vp8e_usage_cfg_map[] = { VPX_VBR, /* rc_end_usage */ { NULL, 0 }, /* rc_twopass_stats_in */ { NULL, 0 }, /* rc_firstpass_mb_stats_in */ - 256, /* rc_target_bandwidth */ + 256, /* rc_target_bitrate */ 4, /* rc_min_quantizer */ 63, /* rc_max_quantizer */ 100, /* rc_undershoot_pct */ diff --git a/vpx/vpx_encoder.h b/vpx/vpx_encoder.h index f8fdfc0307..21254bb547 100644 --- a/vpx/vpx_encoder.h +++ b/vpx/vpx_encoder.h @@ -457,7 +457,7 @@ typedef struct vpx_codec_enc_cfg { /*!\brief Target data rate * - * Target bandwidth to use for this stream, in kilobits per second. + * Target bitrate to use for this stream, in kilobits per second. */ unsigned int rc_target_bitrate; From 9a25e3169b59ca822558024423c5675790ffcf5b Mon Sep 17 00:00:00 2001 From: Marco Paniconi Date: Mon, 14 Jun 2021 15:02:52 -0700 Subject: [PATCH 0183/1000] vp9-rtc: Refactor 1 pass vbr rate control This refactoring is needed to allow the RC_rtc library to support VBR. Change-Id: I863a4a65096fed06b02307098febf7976360e0f3 --- vp9/encoder/vp9_ratectrl.c | 63 ++++++++++++++++++++++---------------- vp9/encoder/vp9_ratectrl.h | 3 ++ 2 files changed, 39 insertions(+), 27 deletions(-) diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index 4b87ff2f0c..b89166466c 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -2009,7 +2009,7 @@ void vp9_rc_postencode_update_drop_frame(VP9_COMP *cpi) { } } -static int calc_pframe_target_size_one_pass_vbr(const VP9_COMP *const cpi) { +int vp9_calc_pframe_target_size_one_pass_vbr(const VP9_COMP *const cpi) { const RATE_CONTROL *const rc = &cpi->rc; const int af_ratio = rc->af_ratio_onepass_vbr; int64_t target = @@ -2024,7 +2024,7 @@ static int calc_pframe_target_size_one_pass_vbr(const VP9_COMP *const cpi) { return vp9_rc_clamp_pframe_target_size(cpi, (int)target); } -static int calc_iframe_target_size_one_pass_vbr(const VP9_COMP *const cpi) { +int vp9_calc_iframe_target_size_one_pass_vbr(const VP9_COMP *const cpi) { static const int kf_ratio = 25; const RATE_CONTROL *rc = &cpi->rc; const int target = rc->avg_frame_bandwidth * kf_ratio; @@ -2050,22 +2050,9 @@ static void adjust_gfint_frame_constraint(VP9_COMP *cpi, int frame_constraint) { } } -void vp9_rc_get_one_pass_vbr_params(VP9_COMP *cpi) { - VP9_COMMON *const cm = &cpi->common; +void vp9_set_gf_update_one_pass_vbr(VP9_COMP *const cpi) { RATE_CONTROL *const rc = &cpi->rc; - int target; - if (!cpi->refresh_alt_ref_frame && - (cm->current_video_frame == 0 || (cpi->frame_flags & FRAMEFLAGS_KEY) || - rc->frames_to_key == 0)) { - cm->frame_type = KEY_FRAME; - rc->this_key_frame_forced = - cm->current_video_frame != 0 && rc->frames_to_key == 0; - rc->frames_to_key = cpi->oxcf.key_freq; - rc->kf_boost = DEFAULT_KF_BOOST; - rc->source_alt_ref_active = 0; - } else { - cm->frame_type = INTER_FRAME; - } + VP9_COMMON *const cm = &cpi->common; if (rc->frames_till_gf_update_due == 0) { double rate_err = 1.0; rc->gfu_boost = DEFAULT_GF_BOOST; @@ -2084,15 +2071,18 @@ void vp9_rc_get_one_pass_vbr_params(VP9_COMP *cpi) { rate_err > 3.5) { rc->baseline_gf_interval = VPXMIN(15, (3 * rc->baseline_gf_interval) >> 1); - } else if (rc->avg_frame_low_motion < 20) { + } else if (rc->avg_frame_low_motion > 0 && + rc->avg_frame_low_motion < 20) { // Decrease gf interval for high motion case. rc->baseline_gf_interval = VPXMAX(6, rc->baseline_gf_interval >> 1); } - // Adjust boost and af_ratio based on avg_frame_low_motion, which varies - // between 0 and 100 (stationary, 100% zero/small motion). - rc->gfu_boost = - VPXMAX(500, DEFAULT_GF_BOOST * (rc->avg_frame_low_motion << 1) / - (rc->avg_frame_low_motion + 100)); + if (rc->avg_frame_low_motion > 0) { + // Adjust boost and af_ratio based on avg_frame_low_motion, which + // varies between 0 and 100 (stationary, 100% zero/small motion). + rc->gfu_boost = + VPXMAX(500, DEFAULT_GF_BOOST * (rc->avg_frame_low_motion << 1) / + (rc->avg_frame_low_motion + 100)); + } rc->af_ratio_onepass_vbr = VPXMIN(15, VPXMAX(5, 3 * rc->gfu_boost / 400)); } adjust_gfint_frame_constraint(cpi, rc->frames_to_key); @@ -2105,10 +2095,29 @@ void vp9_rc_get_one_pass_vbr_params(VP9_COMP *cpi) { rc->alt_ref_gf_group = 1; } } +} + +void vp9_rc_get_one_pass_vbr_params(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + RATE_CONTROL *const rc = &cpi->rc; + int target; + if (!cpi->refresh_alt_ref_frame && + (cm->current_video_frame == 0 || (cpi->frame_flags & FRAMEFLAGS_KEY) || + rc->frames_to_key == 0)) { + cm->frame_type = KEY_FRAME; + rc->this_key_frame_forced = + cm->current_video_frame != 0 && rc->frames_to_key == 0; + rc->frames_to_key = cpi->oxcf.key_freq; + rc->kf_boost = DEFAULT_KF_BOOST; + rc->source_alt_ref_active = 0; + } else { + cm->frame_type = INTER_FRAME; + } + vp9_set_gf_update_one_pass_vbr(cpi); if (cm->frame_type == KEY_FRAME) - target = calc_iframe_target_size_one_pass_vbr(cpi); + target = vp9_calc_iframe_target_size_one_pass_vbr(cpi); else - target = calc_pframe_target_size_one_pass_vbr(cpi); + target = vp9_calc_pframe_target_size_one_pass_vbr(cpi); vp9_rc_set_frame_target(cpi, target); if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cpi->oxcf.pass == 0) vp9_cyclic_refresh_update_parameters(cpi); @@ -2953,7 +2962,7 @@ static void adjust_gf_boost_lag_one_pass_vbr(VP9_COMP *cpi, } } } - target = calc_pframe_target_size_one_pass_vbr(cpi); + target = vp9_calc_pframe_target_size_one_pass_vbr(cpi); vp9_rc_set_frame_target(cpi, target); } rc->prev_avg_source_sad_lag = avg_source_sad_lag; @@ -3163,7 +3172,7 @@ void vp9_scene_detection_onepass(VP9_COMP *cpi) { VPXMIN(20, VPXMAX(10, rc->baseline_gf_interval)); adjust_gfint_frame_constraint(cpi, rc->frames_to_key); rc->frames_till_gf_update_due = rc->baseline_gf_interval; - target = calc_pframe_target_size_one_pass_vbr(cpi); + target = vp9_calc_pframe_target_size_one_pass_vbr(cpi); vp9_rc_set_frame_target(cpi, target); rc->count_last_scene_change = 0; } else { diff --git a/vp9/encoder/vp9_ratectrl.h b/vp9/encoder/vp9_ratectrl.h index 0120f90a01..8ef10c94a3 100644 --- a/vp9/encoder/vp9_ratectrl.h +++ b/vp9/encoder/vp9_ratectrl.h @@ -255,6 +255,9 @@ void vp9_rc_get_one_pass_vbr_params(struct VP9_COMP *cpi); void vp9_rc_get_one_pass_cbr_params(struct VP9_COMP *cpi); int vp9_calc_pframe_target_size_one_pass_cbr(const struct VP9_COMP *cpi); int vp9_calc_iframe_target_size_one_pass_cbr(const struct VP9_COMP *cpi); +int vp9_calc_pframe_target_size_one_pass_vbr(const struct VP9_COMP *cpi); +int vp9_calc_iframe_target_size_one_pass_vbr(const struct VP9_COMP *cpi); +void vp9_set_gf_update_one_pass_vbr(struct VP9_COMP *const cpi); void vp9_update_buffer_level_preencode(struct VP9_COMP *cpi); void vp9_rc_get_svc_params(struct VP9_COMP *cpi); From a945f344e04d2851cd675cca48182cca2e7d8a4e Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Tue, 15 Jun 2021 14:55:29 -0700 Subject: [PATCH 0184/1000] Change the data path in svc rate control test Change-Id: Iba58e2aa2578964b5c8b48ab0acbee9b44bcdada --- test/ratectrl_rtc_test.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/test/ratectrl_rtc_test.cc b/test/ratectrl_rtc_test.cc index 3a1c8469ee..58bfac3a7a 100644 --- a/test/ratectrl_rtc_test.cc +++ b/test/ratectrl_rtc_test.cc @@ -115,8 +115,7 @@ class RcInterfaceTest : public ::testing::Test { libvpx::VP9FrameParamsQpRTC frame_params; frame_params.frame_type = KEY_FRAME; std::ifstream svc_file; - svc_file.open(std::string(std::getenv("LIBVPX_TEST_DATA_PATH")) + - "/rc_interface_test_svc"); + svc_file.open(libvpx_test::GetDataPath() + "/rc_interface_test_svc"); ASSERT_TRUE(svc_file.good()); for (size_t i = 0; i < kNumFrame * rc_cfg_.ss_number_layers; i++) { svc_file >> frame_info; From 364f0e31fed78be436d00c177574dac00c0d85a4 Mon Sep 17 00:00:00 2001 From: Chunbo Hua Date: Wed, 16 Jun 2021 01:51:44 -0700 Subject: [PATCH 0185/1000] Initialize VP9EncoderConfig profile and bit depth Change-Id: I5c42013a08677cdef8d47f348458118338ff0138 --- vp9/ratectrl_rtc.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vp9/ratectrl_rtc.cc b/vp9/ratectrl_rtc.cc index 47f9f3ba33..8f77fc842d 100644 --- a/vp9/ratectrl_rtc.cc +++ b/vp9/ratectrl_rtc.cc @@ -39,6 +39,8 @@ void VP9RateControlRTC::InitRateControl(const VP9RateControlRtcConfig &rc_cfg) { cm->bit_depth = VPX_BITS_8; cm->show_frame = 1; oxcf->rc_mode = VPX_CBR; + oxcf->profile = cm->profile; + oxcf->bit_depth = cm->bit_depth; oxcf->pass = 0; oxcf->aq_mode = NO_AQ; oxcf->content = VP9E_CONTENT_DEFAULT; From 338013712e516d07388651437918e6328ea909f5 Mon Sep 17 00:00:00 2001 From: Marco Paniconi Date: Thu, 17 Jun 2021 12:00:33 -0700 Subject: [PATCH 0186/1000] vp9: Adjust logic for gf update in 1 pass vbr This reduces some regression when external RC is used, for which avg_frame_low_motion is not set/updated (=0). Change-Id: I2408e62bd97592e892cefa0f183357c641aa5eea --- vp9/encoder/vp9_ratectrl.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index b89166466c..3775d22361 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -2076,13 +2076,14 @@ void vp9_set_gf_update_one_pass_vbr(VP9_COMP *const cpi) { // Decrease gf interval for high motion case. rc->baseline_gf_interval = VPXMAX(6, rc->baseline_gf_interval >> 1); } - if (rc->avg_frame_low_motion > 0) { - // Adjust boost and af_ratio based on avg_frame_low_motion, which - // varies between 0 and 100 (stationary, 100% zero/small motion). + // Adjust boost and af_ratio based on avg_frame_low_motion, which + // varies between 0 and 100 (stationary, 100% zero/small motion). + if (rc->avg_frame_low_motion > 0) rc->gfu_boost = VPXMAX(500, DEFAULT_GF_BOOST * (rc->avg_frame_low_motion << 1) / (rc->avg_frame_low_motion + 100)); - } + else if (rc->avg_frame_low_motion == 0 && rate_err > 1.0) + rc->gfu_boost = DEFAULT_GF_BOOST >> 1; rc->af_ratio_onepass_vbr = VPXMIN(15, VPXMAX(5, 3 * rc->gfu_boost / 400)); } adjust_gfint_frame_constraint(cpi, rc->frames_to_key); From 2380e13da8a5bba3e8afdb14e0aa61fd980a49c9 Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 18 Jun 2021 11:56:27 -0700 Subject: [PATCH 0187/1000] normalize vp9_calc_[ip]frame declarations and definitions fixes warnings under visual studio: vp9\encoder\vp9_ratectrl.c(2012): warning C4028: formal parameter 1 different from declaration vp9\encoder\vp9_ratectrl.c(2027): warning C4028: formal parameter 1 different from declaration Change-Id: Ia0740db597fb7a259f90d362b483f58662f9f584 --- vp9/encoder/vp9_ratectrl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index 3775d22361..51fb2aab8f 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -2009,7 +2009,7 @@ void vp9_rc_postencode_update_drop_frame(VP9_COMP *cpi) { } } -int vp9_calc_pframe_target_size_one_pass_vbr(const VP9_COMP *const cpi) { +int vp9_calc_pframe_target_size_one_pass_vbr(const VP9_COMP *cpi) { const RATE_CONTROL *const rc = &cpi->rc; const int af_ratio = rc->af_ratio_onepass_vbr; int64_t target = @@ -2024,7 +2024,7 @@ int vp9_calc_pframe_target_size_one_pass_vbr(const VP9_COMP *const cpi) { return vp9_rc_clamp_pframe_target_size(cpi, (int)target); } -int vp9_calc_iframe_target_size_one_pass_vbr(const VP9_COMP *const cpi) { +int vp9_calc_iframe_target_size_one_pass_vbr(const VP9_COMP *cpi) { static const int kf_ratio = 25; const RATE_CONTROL *rc = &cpi->rc; const int target = rc->avg_frame_bandwidth * kf_ratio; From 1f45e7b07ec839dae7a90455e00c3b2d553ea772 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Tue, 15 Jun 2021 12:54:13 -0700 Subject: [PATCH 0188/1000] vp9 rc: add vbr to rtc rate control library Change-Id: I3d2565572c2b905966d60bcaa6e5e6f057b1bd51 --- test/ratectrl_rtc_test.cc | 104 +++++++++++++++++++++++++++++++++---- test/test-data.mk | 2 + vp9/encoder/vp9_ratectrl.c | 3 -- vp9/encoder/vp9_ratectrl.h | 3 ++ vp9/ratectrl_rtc.cc | 31 ++++++++--- vp9/ratectrl_rtc.h | 4 ++ 6 files changed, 126 insertions(+), 21 deletions(-) diff --git a/test/ratectrl_rtc_test.cc b/test/ratectrl_rtc_test.cc index 58bfac3a7a..e9a9f15e9f 100644 --- a/test/ratectrl_rtc_test.cc +++ b/test/ratectrl_rtc_test.cc @@ -32,6 +32,7 @@ struct FrameInfo { info.bytes_used; return is; } + int frame_id; int spatial_id; int temporal_id; @@ -48,24 +49,32 @@ struct FrameInfo { // This test runs the rate control interface and compare against ground truth // generated by encoders. // Settings for the encoder: -// For 1 layer: +// For 1 layer CBR: +// - AQ_Mode 0 +// - Disable golden refresh +// - Bitrate x 2 at frame/superframe 200 +// - Bitrate / 4 at frame/superframe 400 +// examples/vpx_temporal_svc_encoder gipsrec_motion1.1280_720.yuv out vp9 +// 1280 720 1 30 7 0 0 1 0 1000 // +// For 1 layer VBR: +// - Set rc_end_usage to VPX_VBR +// - AQ Mode 0 +// - Disable vp9_compute_frame_low_motion in vp9_encoder.c // examples/vpx_temporal_svc_encoder gipsrec_motion1.1280_720.yuv out vp9 // 1280 720 1 30 7 0 0 1 0 1000 // // For SVC (3 temporal layers, 3 spatial layers): -// +// - AQ_Mode 0 +// - Disable golden refresh +// - Bitrate x 2 at frame/superframe 200 +// - Bitrate / 4 at frame/superframe 400 // examples/vp9_spatial_svc_encoder -f 10000 -w 1280 -h 720 -t 1/30 -sl 3 // -k 10000 -bl 100,140,200,250,350,500,450,630,900 -b 1600 --rc-end-usage=1 // --lag-in-frames=0 --passes=1 --speed=7 --threads=1 // --temporal-layering-mode=3 -aq 1 -rcstat 1 // gipsrec_motion1.1280_720.yuv -o out.webm // -// - AQ_Mode 0 -// - Disable golden refresh -// - Bitrate x 2 at frame/superframe 200 -// - Bitrate / 4 at frame/superframe 400 -// // The generated file includes: // frame number, spatial layer ID, temporal layer ID, base QP, target // bandwidth, buffer level, loopfilter level, encoded frame size @@ -77,8 +86,8 @@ class RcInterfaceTest : public ::testing::Test { virtual ~RcInterfaceTest() {} protected: - void RunOneLayer() { - SetConfigOneLayer(); + void RunOneLayerCBR() { + SetConfigOneLayerCBR(); rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_); FrameInfo frame_info; libvpx::VP9FrameParamsQpRTC frame_params; @@ -144,8 +153,58 @@ class RcInterfaceTest : public ::testing::Test { } } + void RunOneLayerVBR() { + SetConfigOneLayerVBR(); + rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_); + FrameInfo frame_info; + libvpx::VP9FrameParamsQpRTC frame_params; + frame_params.frame_type = KEY_FRAME; + frame_params.spatial_layer_id = 0; + frame_params.temporal_layer_id = 0; + std::ifstream one_layer_file; + one_layer_file.open(libvpx_test::GetDataPath() + + "/rc_interface_test_one_layer_vbr"); + ASSERT_TRUE(one_layer_file.good()); + for (size_t i = 0; i < kNumFrame; i++) { + one_layer_file >> frame_info; + if (frame_info.frame_id > 0) frame_params.frame_type = INTER_FRAME; + ASSERT_EQ(frame_info.spatial_id, 0); + ASSERT_EQ(frame_info.temporal_id, 0); + rc_api_->ComputeQP(frame_params); + ASSERT_EQ(rc_api_->GetQP(), frame_info.base_q); + ASSERT_EQ(rc_api_->GetLoopfilterLevel(), frame_info.filter_level_); + rc_api_->PostEncodeUpdate(frame_info.bytes_used); + } + } + + void RunOneLayerVBRPeriodicKey() { + SetConfigOneLayerVBRPeriodicKey(); + rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_); + FrameInfo frame_info; + libvpx::VP9FrameParamsQpRTC frame_params; + frame_params.frame_type = KEY_FRAME; + frame_params.spatial_layer_id = 0; + frame_params.temporal_layer_id = 0; + std::ifstream one_layer_file; + one_layer_file.open(libvpx_test::GetDataPath() + + "/rc_interface_test_one_layer_vbr_periodic_key"); + ASSERT_TRUE(one_layer_file.good()); + for (size_t i = 0; i < kNumFrame; i++) { + one_layer_file >> frame_info; + if (frame_info.frame_id > 0) frame_params.frame_type = INTER_FRAME; + if (frame_info.frame_id % rc_cfg_.key_freq == 0) + frame_params.frame_type = KEY_FRAME; + ASSERT_EQ(frame_info.spatial_id, 0); + ASSERT_EQ(frame_info.temporal_id, 0); + rc_api_->ComputeQP(frame_params); + ASSERT_EQ(rc_api_->GetQP(), frame_info.base_q); + ASSERT_EQ(rc_api_->GetLoopfilterLevel(), frame_info.filter_level_); + rc_api_->PostEncodeUpdate(frame_info.bytes_used); + } + } + private: - void SetConfigOneLayer() { + void SetConfig() { rc_cfg_.width = 1280; rc_cfg_.height = 720; rc_cfg_.max_quantizer = 52; @@ -167,6 +226,24 @@ class RcInterfaceTest : public ::testing::Test { rc_cfg_.min_quantizers[0] = 2; } + void SetConfigOneLayerCBR() { + SetConfig(); + rc_cfg_.rc_mode = VPX_CBR; + rc_cfg_.key_freq = 3000; + } + + void SetConfigOneLayerVBR() { + SetConfig(); + rc_cfg_.rc_mode = VPX_VBR; + rc_cfg_.key_freq = 3000; + } + + void SetConfigOneLayerVBRPeriodicKey() { + SetConfig(); + rc_cfg_.rc_mode = VPX_VBR; + rc_cfg_.key_freq = 300; + } + void SetConfigSVC() { rc_cfg_.width = 1280; rc_cfg_.height = 720; @@ -182,6 +259,7 @@ class RcInterfaceTest : public ::testing::Test { rc_cfg_.framerate = 30.0; rc_cfg_.ss_number_layers = 3; rc_cfg_.ts_number_layers = 3; + rc_cfg_.rc_mode = VPX_CBR; rc_cfg_.scaling_factor_num[0] = 1; rc_cfg_.scaling_factor_den[0] = 4; @@ -217,7 +295,11 @@ class RcInterfaceTest : public ::testing::Test { libvpx::VP9RateControlRtcConfig rc_cfg_; }; -TEST_F(RcInterfaceTest, OneLayer) { RunOneLayer(); } +TEST_F(RcInterfaceTest, OneLayerCBR) { RunOneLayerCBR(); } + +TEST_F(RcInterfaceTest, OneLayerVBR) { RunOneLayerVBR(); } + +TEST_F(RcInterfaceTest, OneLayerVBRPeriodicKey) { RunOneLayerVBRPeriodicKey(); } TEST_F(RcInterfaceTest, SVC) { RunSVC(); } } // namespace diff --git a/test/test-data.mk b/test/test-data.mk index 744901b115..379fc6e7a9 100644 --- a/test/test-data.mk +++ b/test/test-data.mk @@ -28,6 +28,8 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += rush_hour_444.y4m LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += screendata.y4m LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += niklas_640_480_30.yuv LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += rc_interface_test_one_layer +LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += rc_interface_test_one_layer_vbr +LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += rc_interface_test_one_layer_vbr_periodic_key LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += rc_interface_test_svc LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += bus_352x288_420_f20_b8.yuv diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index 3775d22361..dbbd458c96 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -39,9 +39,6 @@ #define MAX_MB_RATE 250 #define MAXRATE_1080P 4000000 -#define DEFAULT_KF_BOOST 2000 -#define DEFAULT_GF_BOOST 2000 - #define LIMIT_QRANGE_FOR_ALTREF_AND_KEY 1 #define MIN_BPB_FACTOR 0.005 diff --git a/vp9/encoder/vp9_ratectrl.h b/vp9/encoder/vp9_ratectrl.h index 8ef10c94a3..bdddd2df8b 100644 --- a/vp9/encoder/vp9_ratectrl.h +++ b/vp9/encoder/vp9_ratectrl.h @@ -27,6 +27,9 @@ extern "C" { // Bits Per MB at different Q (Multiplied by 512) #define BPER_MB_NORMBITS 9 +#define DEFAULT_KF_BOOST 2000 +#define DEFAULT_GF_BOOST 2000 + #define MIN_GF_INTERVAL 4 #define MAX_GF_INTERVAL 16 #define FIXED_GF_INTERVAL 8 // Used in some testing modes only diff --git a/vp9/ratectrl_rtc.cc b/vp9/ratectrl_rtc.cc index 8f77fc842d..8455ca9a3d 100644 --- a/vp9/ratectrl_rtc.cc +++ b/vp9/ratectrl_rtc.cc @@ -38,13 +38,16 @@ void VP9RateControlRTC::InitRateControl(const VP9RateControlRtcConfig &rc_cfg) { cm->profile = PROFILE_0; cm->bit_depth = VPX_BITS_8; cm->show_frame = 1; - oxcf->rc_mode = VPX_CBR; oxcf->profile = cm->profile; oxcf->bit_depth = cm->bit_depth; + oxcf->rc_mode = rc_cfg.rc_mode; oxcf->pass = 0; oxcf->aq_mode = NO_AQ; oxcf->content = VP9E_CONTENT_DEFAULT; oxcf->drop_frames_water_mark = 0; + cm->current_video_frame = 0; + oxcf->key_freq = rc_cfg.key_freq; + rc->kf_boost = DEFAULT_KF_BOOST; UpdateRateControl(rc_cfg); @@ -57,8 +60,8 @@ void VP9RateControlRTC::InitRateControl(const VP9RateControlRtcConfig &rc_cfg) { rc->rc_2_frame = 0; vp9_rc_init_minq_luts(); vp9_rc_init(oxcf, 0, rc); + rc->frames_to_key = oxcf->key_freq; cpi_->sf.use_nonrd_pick_mode = 1; - cm->current_video_frame = 0; } void VP9RateControlRTC::UpdateRateControl( @@ -75,6 +78,7 @@ void VP9RateControlRTC::UpdateRateControl( oxcf->best_allowed_q = vp9_quantizer_to_qindex(rc_cfg.min_quantizer); rc->worst_quality = oxcf->worst_allowed_q; rc->best_quality = oxcf->best_allowed_q; + oxcf->init_framerate = rc_cfg.framerate; oxcf->target_bandwidth = 1000 * rc_cfg.target_bandwidth; oxcf->starting_buffer_level_ms = rc_cfg.buf_initial_sz; oxcf->optimal_buffer_level_ms = rc_cfg.buf_optimal_sz; @@ -140,11 +144,24 @@ void VP9RateControlRTC::ComputeQP(const VP9FrameParamsQpRTC &frame_params) { cpi_->sf.use_nonrd_pick_mode = 1; if (cpi_->svc.number_spatial_layers == 1 && cpi_->svc.number_temporal_layers == 1) { - int target; - if (frame_is_intra_only(cm)) - target = vp9_calc_iframe_target_size_one_pass_cbr(cpi_); - else - target = vp9_calc_pframe_target_size_one_pass_cbr(cpi_); + int target = 0; + if (cpi_->oxcf.rc_mode == VPX_CBR) { + if (frame_is_intra_only(cm)) + target = vp9_calc_iframe_target_size_one_pass_cbr(cpi_); + else + target = vp9_calc_pframe_target_size_one_pass_cbr(cpi_); + } else if (cpi_->oxcf.rc_mode == VPX_VBR) { + if (cm->frame_type == KEY_FRAME) { + cpi_->rc.this_key_frame_forced = + cm->current_video_frame != 0 && cpi_->rc.frames_to_key == 0; + cpi_->rc.frames_to_key = cpi_->oxcf.key_freq; + } + vp9_set_gf_update_one_pass_vbr(cpi_); + if (frame_is_intra_only(cm)) + target = vp9_calc_iframe_target_size_one_pass_vbr(cpi_); + else + target = vp9_calc_pframe_target_size_one_pass_vbr(cpi_); + } vp9_rc_set_frame_target(cpi_, target); vp9_update_buffer_level_preencode(cpi_); } else { diff --git a/vp9/ratectrl_rtc.h b/vp9/ratectrl_rtc.h index 72ea40fd68..a1f2767126 100644 --- a/vp9/ratectrl_rtc.h +++ b/vp9/ratectrl_rtc.h @@ -49,6 +49,10 @@ struct VP9RateControlRtcConfig { int scaling_factor_den[VPX_SS_MAX_LAYERS]; int layer_target_bitrate[VPX_MAX_LAYERS]; int ts_rate_decimator[VPX_TS_MAX_LAYERS]; + // vbr, cbr + enum vpx_rc_mode rc_mode; + // key frame frequency + int key_freq; }; struct VP9FrameParamsQpRTC { From 9873d61b252c4bada8515a4fd8df8377cda012f1 Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 21 Jun 2021 13:33:44 -0700 Subject: [PATCH 0189/1000] test-data.sha1: add missing sha sums for rc_interface_test_one_layer_vbr and rc_interface_test_one_layer_vbr_periodic_key added in: 1f45e7b07 vp9 rc: add vbr to rtc rate control library Change-Id: I8bfa3698284c8ff289e830f7b8fa1ca42b752563 --- test/test-data.sha1 | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/test-data.sha1 b/test/test-data.sha1 index 6f9021554d..bcf9612fba 100644 --- a/test/test-data.sha1 +++ b/test/test-data.sha1 @@ -871,3 +871,5 @@ d3964f9dad9f60363c81b688324d95b4ec7c8038 *invalid-bug-148271109.ivf.res ad18ca16f0a249fb3b7c38de0d9b327fed273f96 *hantro_collage_w352h288_nv12.yuv 03f827c0e36ff9a6e23c5cc11936924e4f1827ab *rc_interface_test_one_layer 99e4f4c2961d46dc286db230090a39d78460b25d *rc_interface_test_svc +9dcaafd91bc61ed360c23616b4788437b9f9b96b *rc_interface_test_one_layer_vbr +babd17cca2e93cc74753c6ed80de87457bc3a5f3 *rc_interface_test_one_layer_vbr_periodic_key From a1fdfbb174487e5efb76e6e77119d2e50840086e Mon Sep 17 00:00:00 2001 From: Angie Chiang Date: Fri, 18 Jun 2021 16:09:41 -0700 Subject: [PATCH 0190/1000] Fix flaky assertions in SimpleEncode Bug: webm:1731 Change-Id: Ieecb98a7ac19e6291acd5d51432dc6a3789e9552 --- vp9/encoder/vp9_firstpass.c | 2 +- vp9/simple_encode.cc | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index 28a22ded6d..375438839b 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -3836,7 +3836,7 @@ void vp9_get_next_group_of_picture(const VP9_COMP *cpi, int *first_is_key_frame, if (gop_command->use) { *coding_frame_count = gop_command_coding_frame_count(gop_command); *use_alt_ref = gop_command->use_alt_ref; - assert(*coding_frame_count < rc.frames_to_key); + assert(gop_command->show_frame_count <= rc.frames_to_key); } else { *coding_frame_count = vp9_get_gop_coding_frame_count( &cpi->oxcf, &cpi->twopass, &cpi->frame_info, &rc, *first_show_idx, diff --git a/vp9/simple_encode.cc b/vp9/simple_encode.cc index 8ff5ad3c98..87727cb12a 100644 --- a/vp9/simple_encode.cc +++ b/vp9/simple_encode.cc @@ -1009,8 +1009,7 @@ T *GetVectorData(const std::vector &v) { static GOP_COMMAND GetGopCommand(const std::vector &gop_map, int start_show_index) { GOP_COMMAND gop_command; - if (gop_map.size() > 0) { - assert(static_cast(start_show_index) < gop_map.size()); + if (static_cast(start_show_index) < gop_map.size()) { assert((gop_map[start_show_index] & kGopMapFlagStart) != 0); int end_show_index = start_show_index + 1; // gop_map[end_show_index] & kGopMapFlagStart == 0 means this is From 0bb7bb6df8c2dcce22d0151676183f589bcf27a1 Mon Sep 17 00:00:00 2001 From: Angie Chiang Date: Thu, 17 Jun 2021 20:23:30 -0700 Subject: [PATCH 0191/1000] Add use_simple_encode_api to oxcf Use this flag to change the encoder behavior when SimpleEncode APIs are used BUG=webm:1733 Change-Id: I9f0852a03ff99faa01cdd8eee8ab71718cc58632 --- vp9/encoder/vp9_encodeframe.c | 37 ++-- vp9/encoder/vp9_encoder.c | 394 ++++++++++++---------------------- vp9/encoder/vp9_encoder.h | 1 + vp9/encoder/vp9_firstpass.c | 106 +++++---- vp9/encoder/vp9_firstpass.h | 2 +- vp9/encoder/vp9_ratectrl.c | 29 +-- vp9/simple_encode.cc | 1 + vp9/vp9_cx_iface.c | 3 + 8 files changed, 248 insertions(+), 325 deletions(-) diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index dcd6476581..00855319d6 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -4603,15 +4603,18 @@ static int rd_pick_partition(VP9_COMP *cpi, ThreadData *td, encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, output_enabled, bsize, pc_tree); #if CONFIG_RATE_CTRL - // Store partition, motion vector of the superblock. - if (output_enabled) { - const int num_unit_rows = get_num_unit_4x4(cpi->frame_info.frame_height); - const int num_unit_cols = get_num_unit_4x4(cpi->frame_info.frame_width); - store_superblock_info(pc_tree, cm->mi_grid_visible, cm->mi_stride, - num_4x4_blocks_wide_lookup[BLOCK_64X64], - num_unit_rows, num_unit_cols, mi_row << 1, - mi_col << 1, cpi->partition_info, - cpi->motion_vector_info); + if (oxcf->use_simple_encode_api) { + // Store partition, motion vector of the superblock. + if (output_enabled) { + const int num_unit_rows = + get_num_unit_4x4(cpi->frame_info.frame_height); + const int num_unit_cols = get_num_unit_4x4(cpi->frame_info.frame_width); + store_superblock_info(pc_tree, cm->mi_grid_visible, cm->mi_stride, + num_4x4_blocks_wide_lookup[BLOCK_64X64], + num_unit_rows, num_unit_cols, mi_row << 1, + mi_col << 1, cpi->partition_info, + cpi->motion_vector_info); + } } #endif // CONFIG_RATE_CTRL } @@ -5981,9 +5984,14 @@ void vp9_init_tile_data(VP9_COMP *cpi) { for (i = 0; i < BLOCK_SIZES; ++i) { for (j = 0; j < MAX_MODES; ++j) { tile_data->thresh_freq_fact[i][j] = RD_THRESH_INIT_FACT; -#if CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL +#if CONFIG_RATE_CTRL + if (cpi->oxcf.use_simple_encode_api) { + tile_data->thresh_freq_fact_prev[i][j] = RD_THRESH_INIT_FACT; + } +#endif // CONFIG_RATE_CTRL +#if CONFIG_CONSISTENT_RECODE tile_data->thresh_freq_fact_prev[i][j] = RD_THRESH_INIT_FACT; -#endif // CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL +#endif // CONFIG_CONSISTENT_RECODE tile_data->mode_map[i][j] = j; } } @@ -6406,7 +6414,12 @@ static void restore_encode_params(VP9_COMP *cpi) { void vp9_encode_frame(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; -#if CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL +#if CONFIG_RATE_CTRL + if (cpi->oxcf.use_simple_encode_api) { + restore_encode_params(cpi); + } +#endif // CONFIG_RATE_CTRL +#if CONFIG_CONSISTENT_RECODE restore_encode_params(cpi); #endif diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 34646465a6..bbd6dd030d 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -1022,10 +1022,12 @@ static void dealloc_compressor_data(VP9_COMP *cpi) { cpi->mi_ssim_rdmult_scaling_factors = NULL; #if CONFIG_RATE_CTRL - free_partition_info(cpi); - free_motion_vector_info(cpi); - free_fp_motion_vector_info(cpi); - free_tpl_stats_info(cpi); + if (cpi->oxcf.use_simple_encode_api) { + free_partition_info(cpi); + free_motion_vector_info(cpi); + free_fp_motion_vector_info(cpi); + free_tpl_stats_info(cpi); + } #endif vp9_free_ref_frame_buffers(cm->buffer_pool); @@ -2669,10 +2671,12 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf, #if CONFIG_RATE_CTRL encode_command_init(&cpi->encode_command); - partition_info_init(cpi); - motion_vector_info_init(cpi); - fp_motion_vector_info_init(cpi); - tpl_stats_info_init(cpi); + if (oxcf->use_simple_encode_api) { + partition_info_init(cpi); + motion_vector_info_init(cpi); + fp_motion_vector_info_init(cpi); + tpl_stats_info_init(cpi); + } #endif return cpi; @@ -4470,11 +4474,6 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest loop_at_this_size = 0; } -#if CONFIG_RATE_CTRL - if (cpi->encode_command.use_external_target_frame_bits) { - q = rq_model_predict_q_index(rq_model, rq_history, rc->this_frame_target); - } -#endif // CONFIG_RATE_CTRL // Decide frame size bounds first time through. if (loop_count == 0) { vp9_rc_compute_frame_size_bounds(cpi, rc->this_frame_target, @@ -4517,10 +4516,16 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest #if CONFIG_RATE_CTRL // TODO(angiebird): This is a hack for making sure the encoder use the // external_quantize_index exactly. Avoid this kind of hack later. - if (cpi->encode_command.use_external_quantize_index) { - q = cpi->encode_command.external_quantize_index; + if (cpi->oxcf.use_simple_encode_api) { + if (cpi->encode_command.use_external_target_frame_bits) { + q = rq_model_predict_q_index(rq_model, rq_history, + rc->this_frame_target); + } + if (cpi->encode_command.use_external_quantize_index) { + q = cpi->encode_command.external_quantize_index; + } } -#endif +#endif // CONFIG_RATE_CTRL if (cpi->ext_ratectrl.ready && !ext_rc_recode) { vpx_codec_err_t codec_status; const GF_GROUP *gf_group = &cpi->twopass.gf_group; @@ -4607,33 +4612,36 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest ext_rc_recode = 1; } #if CONFIG_RATE_CTRL - // This part needs to be after save_coding_context() because - // restore_coding_context will be called in the end of this function. - // TODO(angiebird): This is a hack for making sure the encoder use the - // external_quantize_index exactly. Avoid this kind of hack later. - if (cpi->encode_command.use_external_quantize_index) { - break; - } + if (cpi->oxcf.use_simple_encode_api) { + // This part needs to be after save_coding_context() because + // restore_coding_context will be called in the end of this function. + // TODO(angiebird): This is a hack for making sure the encoder use the + // external_quantize_index exactly. Avoid this kind of hack later. + if (cpi->encode_command.use_external_quantize_index) { + break; + } - if (cpi->encode_command.use_external_target_frame_bits) { - const double percent_diff = get_bits_percent_diff( - rc->this_frame_target, rc->projected_frame_size); - update_rq_history(rq_history, rc->this_frame_target, - rc->projected_frame_size, q); - loop_count += 1; + if (cpi->encode_command.use_external_target_frame_bits) { + const double percent_diff = get_bits_percent_diff( + rc->this_frame_target, rc->projected_frame_size); + update_rq_history(rq_history, rc->this_frame_target, + rc->projected_frame_size, q); + loop_count += 1; - rq_model_update(rq_history, rc->this_frame_target, rq_model); + rq_model_update(rq_history, rc->this_frame_target, rq_model); - // Check if we hit the target bitrate. - if (percent_diff <= cpi->encode_command.target_frame_bits_error_percent || - rq_history->recode_count >= RATE_CTRL_MAX_RECODE_NUM || - rq_history->q_index_low >= rq_history->q_index_high) { - break; - } + // Check if we hit the target bitrate. + if (percent_diff <= + cpi->encode_command.target_frame_bits_error_percent || + rq_history->recode_count >= RATE_CTRL_MAX_RECODE_NUM || + rq_history->q_index_low >= rq_history->q_index_high) { + break; + } - loop = 1; - restore_coding_context(cpi); - continue; + loop = 1; + restore_coding_context(cpi); + continue; + } } #endif // CONFIG_RATE_CTRL @@ -5368,17 +5376,81 @@ static void set_mb_wiener_variance(VP9_COMP *cpi) { } #if !CONFIG_REALTIME_ONLY -static void update_encode_frame_result( +static void update_encode_frame_result_basic( + FRAME_UPDATE_TYPE update_type, int show_idx, int quantize_index, + ENCODE_FRAME_RESULT *encode_frame_result) { + encode_frame_result->show_idx = show_idx; + encode_frame_result->update_type = update_type; + encode_frame_result->quantize_index = quantize_index; +} + +#if CONFIG_RATE_CTRL +static void yv12_buffer_to_image_buffer(const YV12_BUFFER_CONFIG *yv12_buffer, + IMAGE_BUFFER *image_buffer) { + const uint8_t *src_buf_ls[3] = { yv12_buffer->y_buffer, yv12_buffer->u_buffer, + yv12_buffer->v_buffer }; + const int src_stride_ls[3] = { yv12_buffer->y_stride, yv12_buffer->uv_stride, + yv12_buffer->uv_stride }; + const int w_ls[3] = { yv12_buffer->y_crop_width, yv12_buffer->uv_crop_width, + yv12_buffer->uv_crop_width }; + const int h_ls[3] = { yv12_buffer->y_crop_height, yv12_buffer->uv_crop_height, + yv12_buffer->uv_crop_height }; + int plane; + for (plane = 0; plane < 3; ++plane) { + const int src_stride = src_stride_ls[plane]; + const int w = w_ls[plane]; + const int h = h_ls[plane]; + const uint8_t *src_buf = src_buf_ls[plane]; + uint8_t *dst_buf = image_buffer->plane_buffer[plane]; + int r; + assert(image_buffer->plane_width[plane] == w); + assert(image_buffer->plane_height[plane] == h); + for (r = 0; r < h; ++r) { + memcpy(dst_buf, src_buf, sizeof(*src_buf) * w); + src_buf += src_stride; + dst_buf += w; + } + } +} +// This function will update extra information specific for simple_encode APIs +static void update_encode_frame_result_simple_encode( int ref_frame_flags, FRAME_UPDATE_TYPE update_type, const YV12_BUFFER_CONFIG *source_frame, const RefCntBuffer *coded_frame_buf, - RefCntBuffer *ref_frame_buf[MAX_INTER_REF_FRAMES], int quantize_index, + RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES], int quantize_index, uint32_t bit_depth, uint32_t input_bit_depth, const FRAME_COUNTS *counts, -#if CONFIG_RATE_CTRL const PARTITION_INFO *partition_info, const MOTION_VECTOR_INFO *motion_vector_info, const TplDepStats *tpl_stats_info, + ENCODE_FRAME_RESULT *encode_frame_result) { + PSNR_STATS psnr; + update_encode_frame_result_basic(update_type, coded_frame_buf->frame_index, + quantize_index, encode_frame_result); +#if CONFIG_VP9_HIGHBITDEPTH + vpx_calc_highbd_psnr(source_frame, &coded_frame_buf->buf, &psnr, bit_depth, + input_bit_depth); +#else // CONFIG_VP9_HIGHBITDEPTH + (void)bit_depth; + (void)input_bit_depth; + vpx_calc_psnr(source_frame, &coded_frame_buf->buf, &psnr); +#endif // CONFIG_VP9_HIGHBITDEPTH + encode_frame_result->frame_coding_index = coded_frame_buf->frame_coding_index; + + vp9_get_ref_frame_info(update_type, ref_frame_flags, ref_frame_bufs, + encode_frame_result->ref_frame_coding_indexes, + encode_frame_result->ref_frame_valid_list); + + encode_frame_result->psnr = psnr.psnr[0]; + encode_frame_result->sse = psnr.sse[0]; + encode_frame_result->frame_counts = *counts; + encode_frame_result->partition_info = partition_info; + encode_frame_result->motion_vector_info = motion_vector_info; + encode_frame_result->tpl_stats_info = tpl_stats_info; + if (encode_frame_result->coded_frame.allocated) { + yv12_buffer_to_image_buffer(&coded_frame_buf->buf, + &encode_frame_result->coded_frame); + } +} #endif // CONFIG_RATE_CTRL - ENCODE_FRAME_RESULT *encode_frame_result); #endif // !CONFIG_REALTIME_ONLY static void encode_frame_to_data_rate( @@ -5473,10 +5545,14 @@ static void encode_frame_to_data_rate( memset(cpi->mode_chosen_counts, 0, MAX_MODES * sizeof(*cpi->mode_chosen_counts)); #endif -#if CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL +#if CONFIG_CONSISTENT_RECODE // Backup to ensure consistency between recodes save_encode_params(cpi); -#endif // CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL +#elif CONFIG_RATE_CTRL + if (cpi->oxcf.use_simple_encode_api) { + save_encode_params(cpi); + } +#endif if (cpi->sf.recode_loop == DISALLOW_RECODE) { if (!encode_without_recode_loop(cpi, size, dest)) return; @@ -5568,10 +5644,12 @@ static void encode_frame_to_data_rate( assert(encode_frame_result == NULL); #else // CONFIG_REALTIME_ONLY if (encode_frame_result != NULL) { - const int ref_frame_flags = get_ref_frame_flags(cpi); const RefCntBuffer *coded_frame_buf = get_ref_cnt_buffer(cm, cm->new_fb_idx); RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES]; + FRAME_UPDATE_TYPE update_type = + cpi->twopass.gf_group.update_type[cpi->twopass.gf_group.index]; + int quantize_index = vp9_get_quantizer(cpi); get_ref_frame_bufs(cpi, ref_frame_bufs); // update_encode_frame_result() depends on twopass.gf_group.index and // cm->new_fb_idx, cpi->Source, cpi->lst_fb_idx, cpi->gld_fb_idx and @@ -5589,15 +5667,21 @@ static void encode_frame_to_data_rate( // This function needs to be called before vp9_update_reference_frames(). // TODO(angiebird): Improve the codebase to make the update of frame // dependent variables more robust. - update_encode_frame_result( - ref_frame_flags, - cpi->twopass.gf_group.update_type[cpi->twopass.gf_group.index], - cpi->Source, coded_frame_buf, ref_frame_bufs, vp9_get_quantizer(cpi), - cm->bit_depth, cpi->oxcf.input_bit_depth, cpi->td.counts, + + update_encode_frame_result_basic(update_type, coded_frame_buf->frame_index, + quantize_index, encode_frame_result); #if CONFIG_RATE_CTRL - cpi->partition_info, cpi->motion_vector_info, cpi->tpl_stats_info, + if (cpi->oxcf.use_simple_encode_api) { + const int ref_frame_flags = get_ref_frame_flags(cpi); + update_encode_frame_result_simple_encode( + ref_frame_flags, + cpi->twopass.gf_group.update_type[cpi->twopass.gf_group.index], + cpi->Source, coded_frame_buf, ref_frame_bufs, quantize_index, + cm->bit_depth, cpi->oxcf.input_bit_depth, cpi->td.counts, + cpi->partition_info, cpi->motion_vector_info, cpi->tpl_stats_info, + encode_frame_result); + } #endif // CONFIG_RATE_CTRL - encode_frame_result); } #endif // CONFIG_REALTIME_ONLY @@ -7517,7 +7601,9 @@ static void setup_tpl_stats(VP9_COMP *cpi) { #endif // CONFIG_NON_GREEDY_MV #if CONFIG_RATE_CTRL - accumulate_frame_tpl_stats(cpi); + if (cpi->oxcf.use_simple_encode_api) { + accumulate_frame_tpl_stats(cpi); + } #endif // CONFIG_RATE_CTRL } @@ -7545,206 +7631,6 @@ void vp9_get_ref_frame_info(FRAME_UPDATE_TYPE update_type, int ref_frame_flags, } } -#if !CONFIG_REALTIME_ONLY -#if CONFIG_RATE_CTRL -static void copy_frame_counts(const FRAME_COUNTS *input_counts, - FRAME_COUNTS *output_counts) { - int i, j, k, l, m, n; - for (i = 0; i < BLOCK_SIZE_GROUPS; ++i) { - for (j = 0; j < INTRA_MODES; ++j) { - output_counts->y_mode[i][j] = input_counts->y_mode[i][j]; - } - } - for (i = 0; i < INTRA_MODES; ++i) { - for (j = 0; j < INTRA_MODES; ++j) { - output_counts->uv_mode[i][j] = input_counts->uv_mode[i][j]; - } - } - for (i = 0; i < PARTITION_CONTEXTS; ++i) { - for (j = 0; j < PARTITION_TYPES; ++j) { - output_counts->partition[i][j] = input_counts->partition[i][j]; - } - } - for (i = 0; i < TX_SIZES; ++i) { - for (j = 0; j < PLANE_TYPES; ++j) { - for (k = 0; k < REF_TYPES; ++k) { - for (l = 0; l < COEF_BANDS; ++l) { - for (m = 0; m < COEFF_CONTEXTS; ++m) { - output_counts->eob_branch[i][j][k][l][m] = - input_counts->eob_branch[i][j][k][l][m]; - for (n = 0; n < UNCONSTRAINED_NODES + 1; ++n) { - output_counts->coef[i][j][k][l][m][n] = - input_counts->coef[i][j][k][l][m][n]; - } - } - } - } - } - } - for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i) { - for (j = 0; j < SWITCHABLE_FILTERS; ++j) { - output_counts->switchable_interp[i][j] = - input_counts->switchable_interp[i][j]; - } - } - for (i = 0; i < INTER_MODE_CONTEXTS; ++i) { - for (j = 0; j < INTER_MODES; ++j) { - output_counts->inter_mode[i][j] = input_counts->inter_mode[i][j]; - } - } - for (i = 0; i < INTRA_INTER_CONTEXTS; ++i) { - for (j = 0; j < 2; ++j) { - output_counts->intra_inter[i][j] = input_counts->intra_inter[i][j]; - } - } - for (i = 0; i < COMP_INTER_CONTEXTS; ++i) { - for (j = 0; j < 2; ++j) { - output_counts->comp_inter[i][j] = input_counts->comp_inter[i][j]; - } - } - for (i = 0; i < REF_CONTEXTS; ++i) { - for (j = 0; j < 2; ++j) { - for (k = 0; k < 2; ++k) { - output_counts->single_ref[i][j][k] = input_counts->single_ref[i][j][k]; - } - } - } - for (i = 0; i < REF_CONTEXTS; ++i) { - for (j = 0; j < 2; ++j) { - output_counts->comp_ref[i][j] = input_counts->comp_ref[i][j]; - } - } - for (i = 0; i < SKIP_CONTEXTS; ++i) { - for (j = 0; j < 2; ++j) { - output_counts->skip[i][j] = input_counts->skip[i][j]; - } - } - for (i = 0; i < TX_SIZE_CONTEXTS; i++) { - for (j = 0; j < TX_SIZES; j++) { - output_counts->tx.p32x32[i][j] = input_counts->tx.p32x32[i][j]; - } - for (j = 0; j < TX_SIZES - 1; j++) { - output_counts->tx.p16x16[i][j] = input_counts->tx.p16x16[i][j]; - } - for (j = 0; j < TX_SIZES - 2; j++) { - output_counts->tx.p8x8[i][j] = input_counts->tx.p8x8[i][j]; - } - } - for (i = 0; i < TX_SIZES; i++) { - output_counts->tx.tx_totals[i] = input_counts->tx.tx_totals[i]; - } - for (i = 0; i < MV_JOINTS; i++) { - output_counts->mv.joints[i] = input_counts->mv.joints[i]; - } - for (k = 0; k < 2; k++) { - nmv_component_counts *const comps = &output_counts->mv.comps[k]; - const nmv_component_counts *const comps_t = &input_counts->mv.comps[k]; - for (i = 0; i < 2; i++) { - comps->sign[i] = comps_t->sign[i]; - comps->class0_hp[i] = comps_t->class0_hp[i]; - comps->hp[i] = comps_t->hp[i]; - } - for (i = 0; i < MV_CLASSES; i++) { - comps->classes[i] = comps_t->classes[i]; - } - for (i = 0; i < CLASS0_SIZE; i++) { - comps->class0[i] = comps_t->class0[i]; - for (j = 0; j < MV_FP_SIZE; j++) { - comps->class0_fp[i][j] = comps_t->class0_fp[i][j]; - } - } - for (i = 0; i < MV_OFFSET_BITS; i++) { - for (j = 0; j < 2; j++) { - comps->bits[i][j] = comps_t->bits[i][j]; - } - } - for (i = 0; i < MV_FP_SIZE; i++) { - comps->fp[i] = comps_t->fp[i]; - } - } -} - -static void yv12_buffer_to_image_buffer(const YV12_BUFFER_CONFIG *yv12_buffer, - IMAGE_BUFFER *image_buffer) { - const uint8_t *src_buf_ls[3] = { yv12_buffer->y_buffer, yv12_buffer->u_buffer, - yv12_buffer->v_buffer }; - const int src_stride_ls[3] = { yv12_buffer->y_stride, yv12_buffer->uv_stride, - yv12_buffer->uv_stride }; - const int w_ls[3] = { yv12_buffer->y_crop_width, yv12_buffer->uv_crop_width, - yv12_buffer->uv_crop_width }; - const int h_ls[3] = { yv12_buffer->y_crop_height, yv12_buffer->uv_crop_height, - yv12_buffer->uv_crop_height }; - int plane; - for (plane = 0; plane < 3; ++plane) { - const int src_stride = src_stride_ls[plane]; - const int w = w_ls[plane]; - const int h = h_ls[plane]; - const uint8_t *src_buf = src_buf_ls[plane]; - uint8_t *dst_buf = image_buffer->plane_buffer[plane]; - int r; - assert(image_buffer->plane_width[plane] == w); - assert(image_buffer->plane_height[plane] == h); - for (r = 0; r < h; ++r) { - memcpy(dst_buf, src_buf, sizeof(*src_buf) * w); - src_buf += src_stride; - dst_buf += w; - } - } -} -#endif // CONFIG_RATE_CTRL - -static void update_encode_frame_result( - int ref_frame_flags, FRAME_UPDATE_TYPE update_type, - const YV12_BUFFER_CONFIG *source_frame, const RefCntBuffer *coded_frame_buf, - RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES], int quantize_index, - uint32_t bit_depth, uint32_t input_bit_depth, const FRAME_COUNTS *counts, -#if CONFIG_RATE_CTRL - const PARTITION_INFO *partition_info, - const MOTION_VECTOR_INFO *motion_vector_info, - const TplDepStats *tpl_stats_info, -#endif // CONFIG_RATE_CTRL - ENCODE_FRAME_RESULT *encode_frame_result) { -#if CONFIG_RATE_CTRL - PSNR_STATS psnr; -#if CONFIG_VP9_HIGHBITDEPTH - vpx_calc_highbd_psnr(source_frame, &coded_frame_buf->buf, &psnr, bit_depth, - input_bit_depth); -#else // CONFIG_VP9_HIGHBITDEPTH - (void)bit_depth; - (void)input_bit_depth; - vpx_calc_psnr(source_frame, &coded_frame_buf->buf, &psnr); -#endif // CONFIG_VP9_HIGHBITDEPTH - encode_frame_result->frame_coding_index = coded_frame_buf->frame_coding_index; - - vp9_get_ref_frame_info(update_type, ref_frame_flags, ref_frame_bufs, - encode_frame_result->ref_frame_coding_indexes, - encode_frame_result->ref_frame_valid_list); - - encode_frame_result->psnr = psnr.psnr[0]; - encode_frame_result->sse = psnr.sse[0]; - copy_frame_counts(counts, &encode_frame_result->frame_counts); - encode_frame_result->partition_info = partition_info; - encode_frame_result->motion_vector_info = motion_vector_info; - encode_frame_result->tpl_stats_info = tpl_stats_info; - if (encode_frame_result->coded_frame.allocated) { - yv12_buffer_to_image_buffer(&coded_frame_buf->buf, - &encode_frame_result->coded_frame); - } -#else // CONFIG_RATE_CTRL - (void)ref_frame_flags; - (void)bit_depth; - (void)input_bit_depth; - (void)source_frame; - (void)coded_frame_buf; - (void)ref_frame_bufs; - (void)counts; -#endif // CONFIG_RATE_CTRL - encode_frame_result->show_idx = coded_frame_buf->frame_index; - encode_frame_result->update_type = update_type; - encode_frame_result->quantize_index = quantize_index; -} -#endif // !CONFIG_REALTIME_ONLY - void vp9_init_encode_frame_result(ENCODE_FRAME_RESULT *encode_frame_result) { encode_frame_result->show_idx = -1; // Actual encoding doesn't happen. #if CONFIG_RATE_CTRL diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 12520fb82a..65a1d33286 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -291,6 +291,7 @@ typedef struct VP9EncoderConfig { int row_mt; unsigned int motion_vector_unit_test; int delta_q_uv; + int use_simple_encode_api; // Use SimpleEncode APIs or not } VP9EncoderConfig; static INLINE int is_lossless_requested(const VP9EncoderConfig *cfg) { diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index 375438839b..7343d1bc66 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -1114,8 +1114,10 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, vp9_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[bsize]; #if CONFIG_RATE_CTRL - // Store zero mv as default - store_fp_motion_vector(cpi, &mv, mb_row, mb_col, LAST_FRAME, 0); + if (cpi->oxcf.use_simple_encode_api) { + // Store zero mv as default + store_fp_motion_vector(cpi, &mv, mb_row, mb_col, LAST_FRAME, 0); + } #endif // CONFIG_RAGE_CTRL xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset; @@ -1183,7 +1185,9 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, } } #if CONFIG_RATE_CTRL - store_fp_motion_vector(cpi, &mv, mb_row, mb_col, LAST_FRAME, 0); + if (cpi->oxcf.use_simple_encode_api) { + store_fp_motion_vector(cpi, &mv, mb_row, mb_col, LAST_FRAME, 0); + } #endif // CONFIG_RAGE_CTRL // Search in an older reference frame. @@ -1207,7 +1211,10 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv, &gf_motion_error); #if CONFIG_RATE_CTRL - store_fp_motion_vector(cpi, &tmp_mv, mb_row, mb_col, GOLDEN_FRAME, 1); + if (cpi->oxcf.use_simple_encode_api) { + store_fp_motion_vector(cpi, &tmp_mv, mb_row, mb_col, GOLDEN_FRAME, + 1); + } #endif // CONFIG_RAGE_CTRL if (gf_motion_error < motion_error && gf_motion_error < this_error) @@ -1383,7 +1390,9 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, } else { fp_acc_data->sr_coded_error += (int64_t)this_error; #if CONFIG_RATE_CTRL - store_fp_motion_vector(cpi, NULL, mb_row, mb_col, INTRA_FRAME, 0); + if (cpi->oxcf.use_simple_encode_api) { + store_fp_motion_vector(cpi, NULL, mb_row, mb_col, INTRA_FRAME, 0); + } #endif // CONFIG_RAGE_CTRL } fp_acc_data->coded_error += (int64_t)this_error; @@ -1412,9 +1421,11 @@ static void first_pass_encode(VP9_COMP *cpi, FIRSTPASS_DATA *fp_acc_data) { vp9_tile_init(tile, cm, 0, 0); #if CONFIG_RATE_CTRL - fp_motion_vector_info_reset(cpi->frame_info.frame_width, - cpi->frame_info.frame_height, - cpi->fp_motion_vector_info); + if (cpi->oxcf.use_simple_encode_api) { + fp_motion_vector_info_reset(cpi->frame_info.frame_width, + cpi->frame_info.frame_height, + cpi->fp_motion_vector_info); + } #endif for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) { @@ -2677,25 +2688,25 @@ static int get_gop_coding_frame_num( return gop_coding_frames; } -static RANGE get_active_gf_inverval_range( - const FRAME_INFO *frame_info, const RATE_CONTROL *rc, int arf_active_or_kf, - int gf_start_show_idx, int active_worst_quality, int last_boosted_qindex) { +static RANGE get_active_gf_inverval_range_simple(int min_gf_interval, + int arf_active_or_kf, + int frames_to_key) { RANGE active_gf_interval; -#if CONFIG_RATE_CTRL - (void)frame_info; - (void)gf_start_show_idx; - (void)active_worst_quality; - (void)last_boosted_qindex; - active_gf_interval.min = rc->min_gf_interval + arf_active_or_kf + 2; - + active_gf_interval.min = min_gf_interval + arf_active_or_kf + 2; active_gf_interval.max = 16 + arf_active_or_kf; - if ((active_gf_interval.max <= rc->frames_to_key) && - (active_gf_interval.max >= (rc->frames_to_key - rc->min_gf_interval))) { - active_gf_interval.min = rc->frames_to_key / 2; - active_gf_interval.max = rc->frames_to_key / 2; + if ((active_gf_interval.max <= frames_to_key) && + (active_gf_interval.max >= (frames_to_key - min_gf_interval))) { + active_gf_interval.min = frames_to_key / 2; + active_gf_interval.max = frames_to_key / 2; } -#else + return active_gf_interval; +} + +static RANGE get_active_gf_inverval_range( + const FRAME_INFO *frame_info, const RATE_CONTROL *rc, int arf_active_or_kf, + int gf_start_show_idx, int active_worst_quality, int last_boosted_qindex) { + RANGE active_gf_interval; int int_max_q = (int)(vp9_convert_qindex_to_q(active_worst_quality, frame_info->bit_depth)); int q_term = (gf_start_show_idx == 0) @@ -2733,7 +2744,6 @@ static RANGE get_active_gf_inverval_range( } active_gf_interval.max = VPXMAX(active_gf_interval.max, active_gf_interval.min); -#endif return active_gf_interval; } @@ -2794,9 +2804,14 @@ static void define_gf_group(VP9_COMP *cpi, int gf_start_show_idx) { vpx_clear_system_state(); - active_gf_interval = get_active_gf_inverval_range( - frame_info, rc, arf_active_or_kf, gf_start_show_idx, - twopass->active_worst_quality, rc->last_boosted_qindex); + if (oxcf->use_simple_encode_api) { + active_gf_interval = get_active_gf_inverval_range_simple( + rc->min_gf_interval, arf_active_or_kf, rc->frames_to_key); + } else { + active_gf_interval = get_active_gf_inverval_range( + frame_info, rc, arf_active_or_kf, gf_start_show_idx, + twopass->active_worst_quality, rc->last_boosted_qindex); + } if (cpi->multi_layer_arf) { int arf_layers = get_arf_layers(cpi->multi_layer_arf, oxcf->enable_auto_arf, @@ -2806,25 +2821,21 @@ static void define_gf_group(VP9_COMP *cpi, int gf_start_show_idx) { gop_intra_factor = 1.0; } + gop_coding_frames = get_gop_coding_frame_num( + &use_alt_ref, frame_info, twopass, rc, gf_start_show_idx, + &active_gf_interval, gop_intra_factor, cpi->oxcf.lag_in_frames); + use_alt_ref &= allow_alt_ref; #if CONFIG_RATE_CTRL - { + // If the external gop_command is on, we will override the decisions + // of gop_coding_frames and use_alt_ref. + if (cpi->oxcf.use_simple_encode_api) { const GOP_COMMAND *gop_command = &cpi->encode_command.gop_command; assert(allow_alt_ref == 1); if (gop_command->use) { gop_coding_frames = gop_command_coding_frame_count(gop_command); use_alt_ref = gop_command->use_alt_ref; - } else { - gop_coding_frames = get_gop_coding_frame_num( - &use_alt_ref, frame_info, twopass, rc, gf_start_show_idx, - &active_gf_interval, gop_intra_factor, cpi->oxcf.lag_in_frames); - use_alt_ref &= allow_alt_ref; } } -#else - gop_coding_frames = get_gop_coding_frame_num( - &use_alt_ref, frame_info, twopass, rc, gf_start_show_idx, - &active_gf_interval, gop_intra_factor, cpi->oxcf.lag_in_frames); - use_alt_ref &= allow_alt_ref; #endif // Was the group length constrained by the requirement for a new KF? @@ -3855,12 +3866,19 @@ int vp9_get_gop_coding_frame_count(const VP9EncoderConfig *oxcf, int frame_count; double gop_intra_factor; const int arf_active_or_kf = last_gop_use_alt_ref || first_is_key_frame; - RANGE active_gf_interval = get_active_gf_inverval_range( - frame_info, rc, arf_active_or_kf, show_idx, /*active_worst_quality=*/0, - /*last_boosted_qindex=*/0); + RANGE active_gf_interval; + int arf_layers; + if (oxcf->use_simple_encode_api) { + active_gf_interval = get_active_gf_inverval_range_simple( + rc->min_gf_interval, arf_active_or_kf, rc->frames_to_key); + } else { + active_gf_interval = get_active_gf_inverval_range( + frame_info, rc, arf_active_or_kf, show_idx, /*active_worst_quality=*/0, + /*last_boosted_qindex=*/0); + } - const int arf_layers = get_arf_layers(multi_layer_arf, oxcf->enable_auto_arf, - active_gf_interval.max); + arf_layers = get_arf_layers(multi_layer_arf, oxcf->enable_auto_arf, + active_gf_interval.max); if (multi_layer_arf) { gop_intra_factor = 1.0 + 0.25 * arf_layers; } else { @@ -3877,7 +3895,7 @@ int vp9_get_gop_coding_frame_count(const VP9EncoderConfig *oxcf, // Under CONFIG_RATE_CTRL, once the first_pass_info is ready, the number of // coding frames (including show frame and alt ref) can be determined. int vp9_get_coding_frame_num(const VP9EncoderConfig *oxcf, - TWO_PASS *const twopass, + const TWO_PASS *const twopass, const FRAME_INFO *frame_info, int multi_layer_arf, int allow_alt_ref) { const FIRST_PASS_INFO *first_pass_info = &twopass->first_pass_info; diff --git a/vp9/encoder/vp9_firstpass.h b/vp9/encoder/vp9_firstpass.h index ff0eb40c87..ddfc87d894 100644 --- a/vp9/encoder/vp9_firstpass.h +++ b/vp9/encoder/vp9_firstpass.h @@ -305,7 +305,7 @@ int vp9_get_gop_coding_frame_count(const struct VP9EncoderConfig *oxcf, int last_gop_use_alt_ref, int *use_alt_ref); int vp9_get_coding_frame_num(const struct VP9EncoderConfig *oxcf, - TWO_PASS *const twopass, + const TWO_PASS *const twopass, const FRAME_INFO *frame_info, int multi_layer_arf, int allow_alt_ref); diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index 3775d22361..043036721a 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -1720,10 +1720,12 @@ void vp9_rc_set_frame_target(VP9_COMP *cpi, int target) { } #if CONFIG_RATE_CTRL - if (cpi->encode_command.use_external_target_frame_bits) { - rc->this_frame_target = cpi->encode_command.target_frame_bits; + if (cpi->oxcf.use_simple_encode_api) { + if (cpi->encode_command.use_external_target_frame_bits) { + rc->this_frame_target = cpi->encode_command.target_frame_bits; + } } -#endif +#endif // CONFIG_RATE_CTRL // Target rate per SB64 (including partial SB64s. rc->sb64_target_rate = (int)(((int64_t)rc->this_frame_target * 64 * 64) / @@ -2536,26 +2538,25 @@ void vp9_rc_set_gf_interval_range(const VP9_COMP *const cpi, rc->min_gf_interval = FIXED_GF_INTERVAL; rc->static_scene_max_gf_interval = FIXED_GF_INTERVAL; } else { + double framerate = cpi->framerate; // Set Maximum gf/arf interval rc->max_gf_interval = oxcf->max_gf_interval; rc->min_gf_interval = oxcf->min_gf_interval; #if CONFIG_RATE_CTRL + if (oxcf->use_simple_encode_api) { + // In this experiment, we avoid framerate being changed dynamically during + // encoding. + framerate = oxcf->init_framerate; + } +#endif // CONFIG_RATE_CTRL if (rc->min_gf_interval == 0) { rc->min_gf_interval = vp9_rc_get_default_min_gf_interval( - oxcf->width, oxcf->height, oxcf->init_framerate); + oxcf->width, oxcf->height, framerate); } if (rc->max_gf_interval == 0) { - rc->max_gf_interval = vp9_rc_get_default_max_gf_interval( - oxcf->init_framerate, rc->min_gf_interval); + rc->max_gf_interval = + vp9_rc_get_default_max_gf_interval(framerate, rc->min_gf_interval); } -#else - if (rc->min_gf_interval == 0) - rc->min_gf_interval = vp9_rc_get_default_min_gf_interval( - oxcf->width, oxcf->height, cpi->framerate); - if (rc->max_gf_interval == 0) - rc->max_gf_interval = vp9_rc_get_default_max_gf_interval( - cpi->framerate, rc->min_gf_interval); -#endif // Extended max interval for genuinely static scenes like slide shows. rc->static_scene_max_gf_interval = MAX_STATIC_GF_GROUP_LENGTH; diff --git a/vp9/simple_encode.cc b/vp9/simple_encode.cc index 87727cb12a..6ba37a321c 100644 --- a/vp9/simple_encode.cc +++ b/vp9/simple_encode.cc @@ -793,6 +793,7 @@ static VP9EncoderConfig GetEncodeConfig( if (enc_pass == VPX_RC_FIRST_PASS) { oxcf.lag_in_frames = 0; } + oxcf.use_simple_encode_api = 1; return oxcf; } diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index 438f9b5ed9..7697806ce0 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -652,6 +652,7 @@ static vpx_codec_err_t set_encoder_config( } if (get_level_index(oxcf->target_level) >= 0) config_target_level(oxcf); + oxcf->use_simple_encode_api = 0; // vp9_dump_encoder_config(oxcf, stderr); return VPX_CODEC_OK; } @@ -2288,6 +2289,8 @@ void vp9_dump_encoder_config(const VP9EncoderConfig *oxcf, FILE *fp) { DUMP_STRUCT_VALUE(fp, oxcf, row_mt); DUMP_STRUCT_VALUE(fp, oxcf, motion_vector_unit_test); + DUMP_STRUCT_VALUE(fp, oxcf, delta_q_uv); + DUMP_STRUCT_VALUE(fp, oxcf, use_simple_encode_api); } FRAME_INFO vp9_get_frame_info(const VP9EncoderConfig *oxcf) { From a00c56373ea7d12bf6b1ab8060ccdb4af03c1794 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Mon, 21 Jun 2021 17:22:51 -0700 Subject: [PATCH 0192/1000] rc: turn off gf constrain for external RC Added a new flag in rate control which turns off gf interval constrain on key frame frequency for external RC. It remains on for libvpx. Change-Id: I18bb0d8247a421193f023619f906d0362b873b31 --- test/ratectrl_rtc_test.cc | 7 ++----- vp9/encoder/vp9_ratectrl.c | 4 +++- vp9/encoder/vp9_ratectrl.h | 4 ++++ vp9/ratectrl_rtc.cc | 6 ++---- vp9/ratectrl_rtc.h | 2 -- 5 files changed, 11 insertions(+), 12 deletions(-) diff --git a/test/ratectrl_rtc_test.cc b/test/ratectrl_rtc_test.cc index e9a9f15e9f..5e5a179b29 100644 --- a/test/ratectrl_rtc_test.cc +++ b/test/ratectrl_rtc_test.cc @@ -61,6 +61,7 @@ struct FrameInfo { // - Set rc_end_usage to VPX_VBR // - AQ Mode 0 // - Disable vp9_compute_frame_low_motion in vp9_encoder.c +// - Set rc->constrain_gf_key_freq_onepass_vbr = 0 in vp9_rc_init // examples/vpx_temporal_svc_encoder gipsrec_motion1.1280_720.yuv out vp9 // 1280 720 1 30 7 0 0 1 0 1000 // @@ -192,8 +193,7 @@ class RcInterfaceTest : public ::testing::Test { for (size_t i = 0; i < kNumFrame; i++) { one_layer_file >> frame_info; if (frame_info.frame_id > 0) frame_params.frame_type = INTER_FRAME; - if (frame_info.frame_id % rc_cfg_.key_freq == 0) - frame_params.frame_type = KEY_FRAME; + if (frame_info.frame_id % 300 == 0) frame_params.frame_type = KEY_FRAME; ASSERT_EQ(frame_info.spatial_id, 0); ASSERT_EQ(frame_info.temporal_id, 0); rc_api_->ComputeQP(frame_params); @@ -229,19 +229,16 @@ class RcInterfaceTest : public ::testing::Test { void SetConfigOneLayerCBR() { SetConfig(); rc_cfg_.rc_mode = VPX_CBR; - rc_cfg_.key_freq = 3000; } void SetConfigOneLayerVBR() { SetConfig(); rc_cfg_.rc_mode = VPX_VBR; - rc_cfg_.key_freq = 3000; } void SetConfigOneLayerVBRPeriodicKey() { SetConfig(); rc_cfg_.rc_mode = VPX_VBR; - rc_cfg_.key_freq = 300; } void SetConfigSVC() { diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index dbbd458c96..26e1d5381f 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -407,6 +407,7 @@ void vp9_rc_init(const VP9EncoderConfig *oxcf, int pass, RATE_CONTROL *rc) { rc->source_alt_ref_active = 0; rc->frames_till_gf_update_due = 0; + rc->constrain_gf_key_freq_onepass_vbr = 1; rc->ni_av_qi = oxcf->worst_allowed_q; rc->ni_tot_qi = 0; rc->ni_frames = 0; @@ -2083,7 +2084,8 @@ void vp9_set_gf_update_one_pass_vbr(VP9_COMP *const cpi) { rc->gfu_boost = DEFAULT_GF_BOOST >> 1; rc->af_ratio_onepass_vbr = VPXMIN(15, VPXMAX(5, 3 * rc->gfu_boost / 400)); } - adjust_gfint_frame_constraint(cpi, rc->frames_to_key); + if (rc->constrain_gf_key_freq_onepass_vbr) + adjust_gfint_frame_constraint(cpi, rc->frames_to_key); rc->frames_till_gf_update_due = rc->baseline_gf_interval; cpi->refresh_golden_frame = 1; rc->source_alt_ref_pending = 0; diff --git a/vp9/encoder/vp9_ratectrl.h b/vp9/encoder/vp9_ratectrl.h index bdddd2df8b..83a12cde73 100644 --- a/vp9/encoder/vp9_ratectrl.h +++ b/vp9/encoder/vp9_ratectrl.h @@ -207,6 +207,10 @@ typedef struct { int preserve_arf_as_gld; int preserve_next_arf_as_gld; int show_arf_as_gld; + + // Flag to constrain golden frame interval on key frame frequency for 1 pass + // VBR. + int constrain_gf_key_freq_onepass_vbr; } RATE_CONTROL; struct VP9_COMP; diff --git a/vp9/ratectrl_rtc.cc b/vp9/ratectrl_rtc.cc index 8455ca9a3d..2595a2bc07 100644 --- a/vp9/ratectrl_rtc.cc +++ b/vp9/ratectrl_rtc.cc @@ -46,7 +46,6 @@ void VP9RateControlRTC::InitRateControl(const VP9RateControlRtcConfig &rc_cfg) { oxcf->content = VP9E_CONTENT_DEFAULT; oxcf->drop_frames_water_mark = 0; cm->current_video_frame = 0; - oxcf->key_freq = rc_cfg.key_freq; rc->kf_boost = DEFAULT_KF_BOOST; UpdateRateControl(rc_cfg); @@ -60,7 +59,7 @@ void VP9RateControlRTC::InitRateControl(const VP9RateControlRtcConfig &rc_cfg) { rc->rc_2_frame = 0; vp9_rc_init_minq_luts(); vp9_rc_init(oxcf, 0, rc); - rc->frames_to_key = oxcf->key_freq; + rc->constrain_gf_key_freq_onepass_vbr = 0; cpi_->sf.use_nonrd_pick_mode = 1; } @@ -152,8 +151,7 @@ void VP9RateControlRTC::ComputeQP(const VP9FrameParamsQpRTC &frame_params) { target = vp9_calc_pframe_target_size_one_pass_cbr(cpi_); } else if (cpi_->oxcf.rc_mode == VPX_VBR) { if (cm->frame_type == KEY_FRAME) { - cpi_->rc.this_key_frame_forced = - cm->current_video_frame != 0 && cpi_->rc.frames_to_key == 0; + cpi_->rc.this_key_frame_forced = cm->current_video_frame != 0; cpi_->rc.frames_to_key = cpi_->oxcf.key_freq; } vp9_set_gf_update_one_pass_vbr(cpi_); diff --git a/vp9/ratectrl_rtc.h b/vp9/ratectrl_rtc.h index a1f2767126..c7c0505e33 100644 --- a/vp9/ratectrl_rtc.h +++ b/vp9/ratectrl_rtc.h @@ -51,8 +51,6 @@ struct VP9RateControlRtcConfig { int ts_rate_decimator[VPX_TS_MAX_LAYERS]; // vbr, cbr enum vpx_rc_mode rc_mode; - // key frame frequency - int key_freq; }; struct VP9FrameParamsQpRTC { From bd53f0cc9faefbca2dcb6b21b6849d5e24141c9c Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Thu, 24 Jun 2021 13:13:50 -0700 Subject: [PATCH 0193/1000] Add constructor to VP9RateControlRtcConfig Also add max_inter_bitrate_pct Change-Id: Ie2c0e7f1397ca0bb55214251906412cdf24e42e2 --- vp9/ratectrl_rtc.cc | 1 + vp9/ratectrl_rtc.h | 31 +++++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+) diff --git a/vp9/ratectrl_rtc.cc b/vp9/ratectrl_rtc.cc index 2595a2bc07..b38a0db9c0 100644 --- a/vp9/ratectrl_rtc.cc +++ b/vp9/ratectrl_rtc.cc @@ -90,6 +90,7 @@ void VP9RateControlRTC::UpdateRateControl( (rc_cfg.ts_number_layers > 1) ? rc_cfg.ts_number_layers : 0); cpi_->oxcf.rc_max_intra_bitrate_pct = rc_cfg.max_intra_bitrate_pct; + cpi_->oxcf.rc_max_inter_bitrate_pct = rc_cfg.max_inter_bitrate_pct; cpi_->framerate = rc_cfg.framerate; cpi_->svc.number_spatial_layers = rc_cfg.ss_number_layers; cpi_->svc.number_temporal_layers = rc_cfg.ts_number_layers; diff --git a/vp9/ratectrl_rtc.h b/vp9/ratectrl_rtc.h index c7c0505e33..a30ec1da23 100644 --- a/vp9/ratectrl_rtc.h +++ b/vp9/ratectrl_rtc.h @@ -26,6 +26,36 @@ namespace libvpx { struct VP9RateControlRtcConfig { + public: + VP9RateControlRtcConfig() { + width = 1280; + height = 720; + max_quantizer = 63; + min_quantizer = 2; + target_bandwidth = 1000; + buf_initial_sz = 600; + buf_optimal_sz = 600; + buf_sz = 1000; + undershoot_pct = overshoot_pct = 50; + max_intra_bitrate_pct = 50; + max_inter_bitrate_pct = 0; + framerate = 30.0; + ss_number_layers = ts_number_layers = 1; + rc_mode = VPX_CBR; + vp9_zero(max_quantizers); + vp9_zero(min_quantizers); + vp9_zero(scaling_factor_den); + vp9_zero(scaling_factor_num); + vp9_zero(layer_target_bitrate); + vp9_zero(ts_rate_decimator); + scaling_factor_num[0] = 1; + scaling_factor_den[0] = 1; + layer_target_bitrate[0] = target_bandwidth; + max_quantizers[0] = max_quantizer; + min_quantizers[0] = min_quantizer; + ts_rate_decimator[0] = 1; + } + int width; int height; // 0-63 @@ -38,6 +68,7 @@ struct VP9RateControlRtcConfig { int undershoot_pct; int overshoot_pct; int max_intra_bitrate_pct; + int max_inter_bitrate_pct; double framerate; // Number of spatial layers int ss_number_layers; From 67bfbcfbf706766c841ac900b6cd2165c651983c Mon Sep 17 00:00:00 2001 From: Marco Paniconi Date: Thu, 24 Jun 2021 23:34:36 -0700 Subject: [PATCH 0194/1000] vp9-rtc: Extract content dependency in cyclic refresh For usage in the external RC. When content_mode = 0, the cyclic refresh has no dependency on the content (motion, spatial variance, motion vectors, etc,). The content_mode = 0, when compared to content_mode = 1, on rtc set for speed 7: has some regression on some clips (~3-5%), but overall/average bdrate loss is about ~1-2%. Comparing aq_mode=3 with content_mode = 0, vs aq_mode=3: about ~14% avg/overall bdrate gain, but has ~3-7% regression on some hard motion clip (e.g.m street). Change-Id: I93117fabb8f7f89032c15baf1292b201e8c07362 --- vp9/encoder/vp9_aq_cyclicrefresh.c | 23 +++++++++++++++++------ vp9/encoder/vp9_aq_cyclicrefresh.h | 1 + vp9/encoder/vp9_encodeframe.c | 9 ++++++--- vp9/encoder/vp9_encoder.c | 2 +- 4 files changed, 25 insertions(+), 10 deletions(-) diff --git a/vp9/encoder/vp9_aq_cyclicrefresh.c b/vp9/encoder/vp9_aq_cyclicrefresh.c index 858a416546..e6edf5a925 100644 --- a/vp9/encoder/vp9_aq_cyclicrefresh.c +++ b/vp9/encoder/vp9_aq_cyclicrefresh.c @@ -48,6 +48,7 @@ CYCLIC_REFRESH *vp9_cyclic_refresh_alloc(int mi_rows, int mi_cols) { assert(MAXQ <= 255); memset(cr->last_coded_q_map, MAXQ, last_coded_q_map_size); cr->counter_encode_maxq_scene_change = 0; + cr->content_mode = 1; return cr; } @@ -326,7 +327,8 @@ void vp9_cyclic_refresh_set_golden_update(VP9_COMP *const cpi) { else rc->baseline_gf_interval = 40; if (cpi->oxcf.rc_mode == VPX_VBR) rc->baseline_gf_interval = 20; - if (rc->avg_frame_low_motion < 50 && rc->frames_since_key > 40) + if (rc->avg_frame_low_motion < 50 && rc->frames_since_key > 40 && + cr->content_mode) rc->baseline_gf_interval = 10; } @@ -388,7 +390,8 @@ static void cyclic_refresh_update_map(VP9_COMP *const cpi) { ? vp9_get_qindex(&cm->seg, CR_SEGMENT_ID_BOOST2, cm->base_qindex) : vp9_get_qindex(&cm->seg, CR_SEGMENT_ID_BOOST1, cm->base_qindex); // More aggressive settings for noisy content. - if (cpi->noise_estimate.enabled && cpi->noise_estimate.level >= kMedium) { + if (cpi->noise_estimate.enabled && cpi->noise_estimate.level >= kMedium && + cr->content_mode) { consec_zero_mv_thresh = 60; qindex_thresh = VPXMAX(vp9_get_qindex(&cm->seg, CR_SEGMENT_ID_BOOST1, cm->base_qindex), @@ -409,7 +412,7 @@ static void cyclic_refresh_update_map(VP9_COMP *const cpi) { #if CONFIG_VP9_HIGHBITDEPTH if (cpi->common.use_highbitdepth) compute_content = 0; #endif - if (cpi->Last_Source == NULL || + if (cr->content_mode == 0 || cpi->Last_Source == NULL || cpi->Last_Source->y_width != cpi->Source->y_width || cpi->Last_Source->y_height != cpi->Source->y_height) compute_content = 0; @@ -430,7 +433,8 @@ static void cyclic_refresh_update_map(VP9_COMP *const cpi) { // reset to 0 later depending on the coding mode. if (cr->map[bl_index2] == 0) { count_tot++; - if (cr->last_coded_q_map[bl_index2] > qindex_thresh || + if (cr->content_mode == 0 || + cr->last_coded_q_map[bl_index2] > qindex_thresh || cpi->consec_zero_mv[bl_index2] < consec_zero_mv_thresh_block) { sum_map++; count_sel++; @@ -489,7 +493,8 @@ void vp9_cyclic_refresh_update_parameters(VP9_COMP *const cpi) { rc->avg_frame_qindex[INTER_FRAME] < qp_thresh || (cpi->use_svc && cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame) || - (!cpi->use_svc && rc->avg_frame_low_motion < thresh_low_motion && + (!cpi->use_svc && cr->content_mode && + rc->avg_frame_low_motion < thresh_low_motion && rc->frames_since_key > 40) || (!cpi->use_svc && rc->avg_frame_qindex[INTER_FRAME] > qp_max_thresh && rc->frames_since_key > 20)) { @@ -528,7 +533,7 @@ void vp9_cyclic_refresh_update_parameters(VP9_COMP *const cpi) { cr->percent_refresh = (cr->skip_flat_static_blocks) ? 5 : 10; // Increase the amount of refresh on scene change that is encoded at max Q, // increase for a few cycles of the refresh period (~100 / percent_refresh). - if (cr->counter_encode_maxq_scene_change < 30) + if (cr->content_mode && cr->counter_encode_maxq_scene_change < 30) cr->percent_refresh = (cr->skip_flat_static_blocks) ? 10 : 15; cr->rate_ratio_qdelta = 2.0; cr->rate_boost_fac = 10; @@ -575,6 +580,12 @@ void vp9_cyclic_refresh_update_parameters(VP9_COMP *const cpi) { (double)(cr->actual_num_seg1_blocks + cr->actual_num_seg2_blocks) / num8x8bl; cr->weight_segment = weight_segment; + if (cr->content_mode == 0) { + cr->actual_num_seg1_blocks = + cr->percent_refresh * cm->mi_rows * cm->mi_cols / 100; + cr->actual_num_seg2_blocks = 0; + cr->weight_segment = (double)(cr->actual_num_seg1_blocks) / num8x8bl; + } } // Setup cyclic background refresh: set delta q and segmentation map. diff --git a/vp9/encoder/vp9_aq_cyclicrefresh.h b/vp9/encoder/vp9_aq_cyclicrefresh.h index b6d7fdeae7..c74cee4743 100644 --- a/vp9/encoder/vp9_aq_cyclicrefresh.h +++ b/vp9/encoder/vp9_aq_cyclicrefresh.h @@ -70,6 +70,7 @@ struct CYCLIC_REFRESH { int apply_cyclic_refresh; int counter_encode_maxq_scene_change; int skip_flat_static_blocks; + int content_mode; }; struct VP9_COMP; diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index 00855319d6..969fad59b1 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -1842,7 +1842,8 @@ static void update_state(VP9_COMP *cpi, ThreadData *td, PICK_MODE_CONTEXT *ctx, } // Else for cyclic refresh mode update the segment map, set the segment id // and then update the quantizer. - if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) { + if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && + cpi->cyclic_refresh->content_mode) { vp9_cyclic_refresh_update_segment(cpi, xd->mi[0], mi_row, mi_col, bsize, ctx->rate, ctx->dist, x->skip, p); } @@ -2539,7 +2540,8 @@ static void update_state_rt(VP9_COMP *cpi, ThreadData *td, if (seg->enabled && (cpi->oxcf.aq_mode != NO_AQ || cpi->roi.enabled)) { // Setting segmentation map for cyclic_refresh. - if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) { + if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && + cpi->cyclic_refresh->content_mode) { vp9_cyclic_refresh_update_segment(cpi, mi, mi_row, mi_col, bsize, ctx->rate, ctx->dist, x->skip, p); } else { @@ -6716,7 +6718,8 @@ static void encode_superblock(VP9_COMP *cpi, ThreadData *td, TOKENEXTRA **t, ++td->counts->tx.tx_totals[mi->tx_size]; ++td->counts->tx.tx_totals[get_uv_tx_size(mi, &xd->plane[1])]; - if (cm->seg.enabled && cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) + if (cm->seg.enabled && cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && + cpi->cyclic_refresh->content_mode) vp9_cyclic_refresh_update_sb_postencode(cpi, mi, mi_row, mi_col, bsize); if (cpi->oxcf.pass == 0 && cpi->svc.temporal_layer_id == 0 && (!cpi->use_svc || diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index bbd6dd030d..1af83e405a 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -4208,7 +4208,7 @@ static int encode_without_recode_loop(VP9_COMP *cpi, size_t *size, // Update some stats from cyclic refresh, and check for golden frame update. if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled && - !frame_is_intra_only(cm)) + !frame_is_intra_only(cm) && cpi->cyclic_refresh->content_mode) vp9_cyclic_refresh_postencode(cpi); // Update the skip mb flag probabilities based on the distribution From fe1c7d2d8cce13d9cd1edfe11a6703e5521ae561 Mon Sep 17 00:00:00 2001 From: Cheng Chen Date: Thu, 17 Jun 2021 15:36:18 -0700 Subject: [PATCH 0195/1000] Disallow skipping transform and quantization The encoder has a feature to skip transform and quantization based on model rd analysis. It could happen that the model based analysis lets the encoder skips transform and quantization, while a bad prediction occurs, leading to bad reconstructed blocks, which are intrusive and apparently coding errors. We add a speed feature to guard the skipping feature. Due to the risk of bad perceptual quality, we disallow such skipping by default. On hdres test set, speed 2, the coding performance difference is 0.025%, speed difference is 1.2%, which can be considered non significant. BUG=webm:1729 Change-Id: I48af01ae8dcc7a76c05c695f3f3e68b866c89574 --- vp9/encoder/vp9_block.h | 3 +++ vp9/encoder/vp9_rdopt.c | 23 ++++++++++------------- vp9/encoder/vp9_speed_features.c | 1 + vp9/encoder/vp9_speed_features.h | 6 ++++++ 4 files changed, 20 insertions(+), 13 deletions(-) diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h index 37a4605ad8..20294b4b94 100644 --- a/vp9/encoder/vp9_block.h +++ b/vp9/encoder/vp9_block.h @@ -157,6 +157,9 @@ struct macroblock { // skip forward transform and quantization uint8_t skip_txfm[MAX_MB_PLANE << 2]; #define SKIP_TXFM_NONE 0 +// TODO(chengchen): consider remove SKIP_TXFM_AC_DC from vp9 completely +// since it increases risks of bad perceptual quality. +// https://crbug.com/webm/1729 #define SKIP_TXFM_AC_DC 1 #define SKIP_TXFM_AC_ONLY 2 diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index 37de4e4839..a1687dcf46 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -745,8 +745,8 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col, MODE_INFO *const mi = xd->mi[0]; int64_t rd1, rd2, rd; int rate; - int64_t dist; - int64_t sse; + int64_t dist = INT64_MAX; + int64_t sse = INT64_MAX; const int coeff_ctx = combine_entropy_contexts(args->t_left[blk_row], args->t_above[blk_col]); struct buf_2d *recon = args->this_recon; @@ -799,6 +799,13 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col, if (max_txsize_lookup[plane_bsize] == tx_size) skip_txfm_flag = x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))]; + // This reduces the risk of bad perceptual quality due to bad prediction. + // We always force the encoder to perform transform and quantization. + if (!args->cpi->sf.allow_skip_txfm_ac_dc && + skip_txfm_flag == SKIP_TXFM_AC_DC) { + skip_txfm_flag = SKIP_TXFM_NONE; + } + if (skip_txfm_flag == SKIP_TXFM_NONE || (recon && skip_txfm_flag == SKIP_TXFM_AC_ONLY)) { // full forward transform and quantization @@ -827,17 +834,7 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col, dist = VPXMAX(0, sse - dc_correct); } } else { - // SKIP_TXFM_AC_DC - // skip forward transform. Because this is handled here, the quantization - // does not need to do it. - x->plane[plane].eobs[block] = 0; - sse = x->bsse[(plane << 2) + (block >> (tx_size << 1))] << 4; - dist = sse; - if (recon) { - uint8_t *rec_ptr = &recon->buf[4 * (blk_row * recon->stride + blk_col)]; - copy_block_visible(xd, pd, dst, dst_stride, rec_ptr, recon->stride, - blk_row, blk_col, plane_bsize, tx_bsize); - } + assert(0 && "allow_skip_txfm_ac_dc does not allow SKIP_TXFM_AC_DC."); } } diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c index 585c9604c6..fc7a67c9f1 100644 --- a/vp9/encoder/vp9_speed_features.c +++ b/vp9/encoder/vp9_speed_features.c @@ -940,6 +940,7 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi, int speed) { sf->enable_tpl_model = oxcf->enable_tpl_model; sf->prune_ref_frame_for_rect_partitions = 0; sf->temporal_filter_search_method = MESH; + sf->allow_skip_txfm_ac_dc = 0; for (i = 0; i < TX_SIZES; i++) { sf->intra_y_mode_mask[i] = INTRA_ALL; diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h index ca284ded82..5ea04709ec 100644 --- a/vp9/encoder/vp9_speed_features.h +++ b/vp9/encoder/vp9_speed_features.h @@ -612,6 +612,12 @@ typedef struct SPEED_FEATURES { // For real-time mode: force DC only under intra search when content // does not have high souce SAD. int rt_intra_dc_only_low_content; + + // The encoder has a feature that skips forward transform and quantization + // based on a model rd estimation to reduce encoding time. + // However, this feature is dangerous since it could lead to bad perceptual + // quality. This flag is added to guard the feature. + int allow_skip_txfm_ac_dc; } SPEED_FEATURES; struct VP9_COMP; From 5f345a9246b71374cbedeb28b0e0b0101701732a Mon Sep 17 00:00:00 2001 From: "Jorge E. Moreira" Date: Wed, 30 Jun 2021 11:33:51 -0700 Subject: [PATCH 0196/1000] Avoid overflow in calc_iframe_target_size The changed product was observed to attempt to multiply 1800 by 2500000, which overflows unsigned 32 bits. Converting to unsigned 64 bits first and testing whether the final result fits in 32 bits solves the problem. BUG=b:179686142 Change-Id: I5d27317bf14b0311b739144c451d8e172db01945 --- vp8/encoder/ratectrl.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/vp8/encoder/ratectrl.c b/vp8/encoder/ratectrl.c index ba124c359e..d2b8dff06a 100644 --- a/vp8/encoder/ratectrl.c +++ b/vp8/encoder/ratectrl.c @@ -349,8 +349,12 @@ static void calc_iframe_target_size(VP8_COMP *cpi) { } if (cpi->oxcf.rc_max_intra_bitrate_pct) { - unsigned int max_rate = - cpi->per_frame_bandwidth * cpi->oxcf.rc_max_intra_bitrate_pct / 100; + unsigned int max_rate; + // This product may overflow unsigned int + uint64_t product = cpi->per_frame_bandwidth; + product *= cpi->oxcf.rc_max_intra_bitrate_pct; + product /= 100; + max_rate = (unsigned int)VPXMIN(INT_MAX, product); if (target > max_rate) target = max_rate; } From 350b0b47f2b126ae33607002590d58aca18033bc Mon Sep 17 00:00:00 2001 From: James Zern Date: Thu, 1 Jul 2021 22:16:42 -0700 Subject: [PATCH 0197/1000] ratectrl_rtc.h: quiet MSVC int64_t->int conv warning target_bandwidth is int64_t, but layer_target_bitrate[0] is an int. this is safe in the only place it's set because target_bandwidth defaults to 1000. target_bandwidth is later used to populate the cpi's target, which is an unsigned int so there may be further fixes/cleanups that can be done. Change-Id: I35dbaa2e55a0fca22e0e2680dcac9ea4c6b2815a --- vp9/ratectrl_rtc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vp9/ratectrl_rtc.h b/vp9/ratectrl_rtc.h index a30ec1da23..f219f24500 100644 --- a/vp9/ratectrl_rtc.h +++ b/vp9/ratectrl_rtc.h @@ -50,7 +50,7 @@ struct VP9RateControlRtcConfig { vp9_zero(ts_rate_decimator); scaling_factor_num[0] = 1; scaling_factor_den[0] = 1; - layer_target_bitrate[0] = target_bandwidth; + layer_target_bitrate[0] = static_cast(target_bandwidth); max_quantizers[0] = max_quantizer; min_quantizers[0] = min_quantizer; ts_rate_decimator[0] = 1; From c64022fa3c3019da117fbafbe80535a9ffbd8163 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Fri, 2 Jul 2021 11:28:48 -0700 Subject: [PATCH 0198/1000] Add codec control to get loopfilter level Change-Id: I70d417da900082160e7ba53315af98eceede257c --- vp9/vp9_cx_iface.c | 9 +++++++++ vpx/vp8cx.h | 9 +++++++++ 2 files changed, 18 insertions(+) diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index 7697806ce0..2c09a3992d 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -834,6 +834,14 @@ static vpx_codec_err_t ctrl_get_quantizer64(vpx_codec_alg_priv_t *ctx, return VPX_CODEC_OK; } +static vpx_codec_err_t ctrl_get_loopfilter_level(vpx_codec_alg_priv_t *ctx, + va_list args) { + int *const arg = va_arg(args, int *); + if (arg == NULL) return VPX_CODEC_INVALID_PARAM; + *arg = ctx->cpi->common.lf.filter_level; + return VPX_CODEC_OK; +} + static vpx_codec_err_t update_extra_cfg(vpx_codec_alg_priv_t *ctx, const struct vp9_extracfg *extra_cfg) { const vpx_codec_err_t res = validate_config(ctx, &ctx->cfg, extra_cfg); @@ -1967,6 +1975,7 @@ static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = { // Getters { VP8E_GET_LAST_QUANTIZER, ctrl_get_quantizer }, { VP8E_GET_LAST_QUANTIZER_64, ctrl_get_quantizer64 }, + { VP9E_GET_LOOPFILTER_LEVEL, ctrl_get_loopfilter_level }, { VP9_GET_REFERENCE, ctrl_get_reference }, { VP9E_GET_SVC_LAYER_ID, ctrl_get_svc_layer_id }, { VP9E_GET_ACTIVEMAP, ctrl_get_active_map }, diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h index 37ad07d33d..a5dd324b70 100644 --- a/vpx/vp8cx.h +++ b/vpx/vp8cx.h @@ -712,6 +712,12 @@ enum vp8e_enc_control_id { * Supported in codecs: VP9 */ VP9E_SET_EXTERNAL_RATE_CONTROL, + + /*!\brief Codec control function to get loopfilter level in the encoder. + * + * Supported in codecs: VP9 + */ + VP9E_GET_LOOPFILTER_LEVEL, }; /*!\brief vpx 1-D scaling mode @@ -1037,6 +1043,9 @@ VPX_CTRL_USE_TYPE(VP9E_SET_ROW_MT, unsigned int) VPX_CTRL_USE_TYPE(VP9E_GET_LEVEL, int *) #define VPX_CTRL_VP9E_GET_LEVEL +VPX_CTRL_USE_TYPE(VP9E_GET_LOOPFILTER_LEVEL, int *) +#define VPX_CTRL_VP9E_GET_LOOPFILTER_LEVEL + VPX_CTRL_USE_TYPE(VP9E_ENABLE_MOTION_VECTOR_UNIT_TEST, unsigned int) #define VPX_CTRL_VP9E_ENABLE_MOTION_VECTOR_UNIT_TEST From df7dc31cdfaa81e20fd0f4aed4c5eff037f484c4 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Thu, 8 Jul 2021 15:08:05 -0700 Subject: [PATCH 0199/1000] Document vpx_img_set_rect() more precisely Document the side effects and return value of vpx_img_set_rect() more precisely. Change-Id: Id1120bc478ff090a70b4ddd23c4798026bbefe10 --- vpx/vpx_image.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vpx/vpx_image.h b/vpx/vpx_image.h index bc23be50c5..1adc9b9d9e 100644 --- a/vpx/vpx_image.h +++ b/vpx/vpx_image.h @@ -171,7 +171,8 @@ vpx_image_t *vpx_img_wrap(vpx_image_t *img, vpx_img_fmt_t fmt, unsigned int d_w, /*!\brief Set the rectangle identifying the displayed portion of the image * * Updates the displayed rectangle (aka viewport) on the image surface to - * match the specified coordinates and size. + * match the specified coordinates and size. Specifically, sets img->d_w, + * img->d_h, and elements of the img->planes[] array. * * \param[in] img Image descriptor * \param[in] x leftmost column @@ -179,7 +180,7 @@ vpx_image_t *vpx_img_wrap(vpx_image_t *img, vpx_img_fmt_t fmt, unsigned int d_w, * \param[in] w width * \param[in] h height * - * \return 0 if the requested rectangle is valid, nonzero otherwise. + * \return 0 if the requested rectangle is valid, nonzero (-1) otherwise. */ int vpx_img_set_rect(vpx_image_t *img, unsigned int x, unsigned int y, unsigned int w, unsigned int h); From 69fc604636f740a57482f3898c2527d29663ee6d Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Thu, 8 Jul 2021 15:17:48 -0700 Subject: [PATCH 0200/1000] Check for addition overflows in vpx_img_set_rect() Check for x + w and y + h overflows in vpx_img_set_rect(). Move the declaration of the local variable 'data' to the block it is used in. Change-Id: I6bda875e1853c03135ec6ce29015bcc78bb8b7ba --- vpx/src/vpx_image.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vpx/src/vpx_image.c b/vpx/src/vpx_image.c index 2a7afc00c2..f9f0dd6025 100644 --- a/vpx/src/vpx_image.c +++ b/vpx/src/vpx_image.c @@ -8,6 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include #include #include @@ -152,9 +153,8 @@ vpx_image_t *vpx_img_wrap(vpx_image_t *img, vpx_img_fmt_t fmt, unsigned int d_w, int vpx_img_set_rect(vpx_image_t *img, unsigned int x, unsigned int y, unsigned int w, unsigned int h) { - unsigned char *data; - - if (x + w <= img->w && y + h <= img->h) { + if (x <= UINT_MAX - w && x + w <= img->w && y <= UINT_MAX - h && + y + h <= img->h) { img->d_w = w; img->d_h = h; @@ -165,7 +165,7 @@ int vpx_img_set_rect(vpx_image_t *img, unsigned int x, unsigned int y, } else { const int bytes_per_sample = (img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1; - data = img->img_data; + unsigned char *data = img->img_data; if (img->fmt & VPX_IMG_FMT_HAS_ALPHA) { img->planes[VPX_PLANE_ALPHA] = From 76ad30b6fb85f1462b28323220960d165d167e78 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Tue, 13 Jul 2021 11:54:34 -0700 Subject: [PATCH 0201/1000] Add codec control for rtc external ratectrl lib This will do 3 things: Turn off low motion computation Turn off gf update constrain on key frame frequency turn off content mode for cyclic refresh Those are used to verify the external ratectrl lib works as expected. Change-Id: Ic6e61498de82d6b3973e58df246cf5e05f838680 --- vp9/encoder/vp9_encoder.c | 4 +++- vp9/encoder/vp9_encoder.h | 2 ++ vp9/vp9_cx_iface.c | 13 +++++++++++++ vpx/vp8cx.h | 17 +++++++++++++++++ 4 files changed, 35 insertions(+), 1 deletion(-) diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 1af83e405a..f50b979979 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -2304,6 +2304,7 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf, cm, cm->frame_contexts, (FRAME_CONTEXT *)vpx_calloc(FRAME_CONTEXTS, sizeof(*cm->frame_contexts))); + cpi->compute_frame_low_motion_onepass = 1; cpi->use_svc = 0; cpi->resize_state = ORIG; cpi->external_resize = 0; @@ -5747,7 +5748,8 @@ static void encode_frame_to_data_rate( vp9_rc_postencode_update(cpi, *size); - if (oxcf->pass == 0 && !frame_is_intra_only(cm) && + if (cpi->compute_frame_low_motion_onepass && oxcf->pass == 0 && + !frame_is_intra_only(cm) && (!cpi->use_svc || (cpi->use_svc && !cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame && diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 65a1d33286..ea2d59e1b5 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -961,6 +961,8 @@ typedef struct VP9_COMP { int compute_source_sad_onepass; + int compute_frame_low_motion_onepass; + LevelConstraint level_constraint; uint8_t *count_arf_frame_usage; diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index 2c09a3992d..906f2b0b85 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -1039,6 +1039,18 @@ static vpx_codec_err_t ctrl_set_row_mt(vpx_codec_alg_priv_t *ctx, return update_extra_cfg(ctx, &extra_cfg); } +static vpx_codec_err_t ctrl_set_rtc_external_ratectrl(vpx_codec_alg_priv_t *ctx, + va_list args) { + VP9_COMP *const cpi = ctx->cpi; + const unsigned int data = va_arg(args, unsigned int); + if (data) { + cpi->compute_frame_low_motion_onepass = 0; + cpi->rc.constrain_gf_key_freq_onepass_vbr = 0; + cpi->cyclic_refresh->content_mode = 0; + } + return VPX_CODEC_OK; +} + static vpx_codec_err_t ctrl_enable_motion_vector_unit_test( vpx_codec_alg_priv_t *ctx, va_list args) { struct vp9_extracfg extra_cfg = ctx->extra_cfg; @@ -1970,6 +1982,7 @@ static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = { { VP9E_SET_SVC_SPATIAL_LAYER_SYNC, ctrl_set_svc_spatial_layer_sync }, { VP9E_SET_DELTA_Q_UV, ctrl_set_delta_q_uv }, { VP9E_SET_DISABLE_LOOPFILTER, ctrl_set_disable_loopfilter }, + { VP9E_SET_RTC_EXTERNAL_RATECTRL, ctrl_set_rtc_external_ratectrl }, { VP9E_SET_EXTERNAL_RATE_CONTROL, ctrl_set_external_rate_control }, // Getters diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h index a5dd324b70..011dfcba52 100644 --- a/vpx/vp8cx.h +++ b/vpx/vp8cx.h @@ -713,6 +713,20 @@ enum vp8e_enc_control_id { */ VP9E_SET_EXTERNAL_RATE_CONTROL, + /*!\brief Codec control to disable internal features in rate control. + * + * This will do 3 things, only for 1 pass: + * - Turn off low motion computation + * - Turn off gf update constraint on key frame frequency + * - Turn off content mode for cyclic refresh + * + * With those, the rate control is expected to work exactly the same as the + * interface provided in ratectrl_rtc.cc/h + * + * Supported in codecs: VP9 + */ + VP9E_SET_RTC_EXTERNAL_RATECTRL, + /*!\brief Codec control function to get loopfilter level in the encoder. * * Supported in codecs: VP9 @@ -1077,6 +1091,9 @@ VPX_CTRL_USE_TYPE(VP9E_SET_DISABLE_OVERSHOOT_MAXQ_CBR, int) VPX_CTRL_USE_TYPE(VP9E_SET_DISABLE_LOOPFILTER, int) #define VPX_CTRL_VP9E_SET_DISABLE_LOOPFILTER +VPX_CTRL_USE_TYPE(VP9E_SET_RTC_EXTERNAL_RATECTRL, int) +#define VPX_CTRL_VP9E_SET_RTC_EXTERNAL_RATECTRL + VPX_CTRL_USE_TYPE(VP9E_SET_EXTERNAL_RATE_CONTROL, vpx_rc_funcs_t *) #define VPX_CTRL_VP9E_SET_EXTERNAL_RATE_CONTROL From b1f2532b4d670b2286f50d240992e58402a70aea Mon Sep 17 00:00:00 2001 From: Bohan Li Date: Thu, 15 Jul 2021 13:21:35 -0700 Subject: [PATCH 0202/1000] Avoid chroma resampling for 420mpeg2 input BUG=aomedia:3080 Change-Id: I4ed81abf4b799224085485560f675c10c318cde6 --- y4minput.c | 31 ++----------------------------- 1 file changed, 2 insertions(+), 29 deletions(-) diff --git a/y4minput.c b/y4minput.c index f923eda34a..9a4bdbd7b5 100644 --- a/y4minput.c +++ b/y4minput.c @@ -285,26 +285,6 @@ static void y4m_42xmpeg2_42xjpeg_helper(unsigned char *_dst, } } -/*Handles both 422 and 420mpeg2 to 422jpeg and 420jpeg, respectively.*/ -static void y4m_convert_42xmpeg2_42xjpeg(y4m_input *_y4m, unsigned char *_dst, - unsigned char *_aux) { - int c_w; - int c_h; - int c_sz; - int pli; - /*Skip past the luma data.*/ - _dst += _y4m->pic_w * _y4m->pic_h; - /*Compute the size of each chroma plane.*/ - c_w = (_y4m->pic_w + _y4m->dst_c_dec_h - 1) / _y4m->dst_c_dec_h; - c_h = (_y4m->pic_h + _y4m->dst_c_dec_v - 1) / _y4m->dst_c_dec_v; - c_sz = c_w * c_h; - for (pli = 1; pli < 3; pli++) { - y4m_42xmpeg2_42xjpeg_helper(_dst, _aux, c_w, c_h); - _dst += c_sz; - _aux += c_sz; - } -} - /*This format is only used for interlaced content, but is included for completeness. @@ -889,7 +869,8 @@ int y4m_input_open(y4m_input *y4m_ctx, FILE *file, char *skip_buffer, y4m_ctx->aux_buf = NULL; y4m_ctx->dst_buf = NULL; if (strcmp(y4m_ctx->chroma_type, "420") == 0 || - strcmp(y4m_ctx->chroma_type, "420jpeg") == 0) { + strcmp(y4m_ctx->chroma_type, "420jpeg") == 0 || + strcmp(y4m_ctx->chroma_type, "420mpeg2") == 0) { y4m_ctx->src_c_dec_h = y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_v = y4m_ctx->dst_c_dec_v = 2; y4m_ctx->dst_buf_read_sz = @@ -934,14 +915,6 @@ int y4m_input_open(y4m_input *y4m_ctx, FILE *file, char *skip_buffer, fprintf(stderr, "Unsupported conversion from 420p12 to 420jpeg\n"); return -1; } - } else if (strcmp(y4m_ctx->chroma_type, "420mpeg2") == 0) { - y4m_ctx->src_c_dec_h = y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_v = - y4m_ctx->dst_c_dec_v = 2; - y4m_ctx->dst_buf_read_sz = y4m_ctx->pic_w * y4m_ctx->pic_h; - /*Chroma filter required: read into the aux buf first.*/ - y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = - 2 * ((y4m_ctx->pic_w + 1) / 2) * ((y4m_ctx->pic_h + 1) / 2); - y4m_ctx->convert = y4m_convert_42xmpeg2_42xjpeg; } else if (strcmp(y4m_ctx->chroma_type, "420paldv") == 0) { y4m_ctx->src_c_dec_h = y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_v = y4m_ctx->dst_c_dec_v = 2; From f9b565f7ecebb5f76c8d406e35dba1bd25a6398d Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Mon, 12 Jul 2021 14:04:12 -0700 Subject: [PATCH 0203/1000] Refactor rtc rate control test Remove golden files. Run actual encoding as the ground truth. Change-Id: I1cea001278c1e9409bb02d33823cf69192c790a4 --- libs.mk | 6 + test/ratectrl_rtc_test.cc | 418 ++++++++++++++++++++++---------------- test/test-data.mk | 4 - test/test-data.sha1 | 4 - test/test.mk | 5 + vp9/ratectrl_rtc.h | 17 +- 6 files changed, 264 insertions(+), 190 deletions(-) diff --git a/libs.mk b/libs.mk index d05eee966d..f5b43abadc 100644 --- a/libs.mk +++ b/libs.mk @@ -493,10 +493,12 @@ TEST_INTRA_PRED_SPEED_SRCS=$(call addprefix_clean,test/,\ $(call enabled,TEST_INTRA_PRED_SPEED_SRCS)) TEST_INTRA_PRED_SPEED_OBJS := $(sort $(call objs,$(TEST_INTRA_PRED_SPEED_SRCS))) +ifeq ($(CONFIG_VP9_ENCODER),yes) RC_INTERFACE_TEST_BIN=./test_rc_interface$(EXE_SFX) RC_INTERFACE_TEST_SRCS=$(call addprefix_clean,test/,\ $(call enabled,RC_INTERFACE_TEST_SRCS)) RC_INTERFACE_TEST_OBJS := $(sort $(call objs,$(RC_INTERFACE_TEST_SRCS))) +endif SIMPLE_ENCODE_TEST_BIN=./test_simple_encode$(EXE_SFX) SIMPLE_ENCODE_TEST_SRCS=$(call addprefix_clean,test/,\ @@ -597,6 +599,7 @@ test_intra_pred_speed.$(VCPROJ_SFX): $(TEST_INTRA_PRED_SPEED_SRCS) vpx.$(VCPROJ_ -L. -l$(CODEC_LIB) -l$(GTEST_LIB) $^ endif # TEST_INTRA_PRED_SPEED +ifeq ($(CONFIG_VP9_ENCODER),yes) ifneq ($(strip $(RC_INTERFACE_TEST_OBJS)),) PROJECTS-$(CONFIG_MSVS) += test_rc_interface.$(VCPROJ_SFX) test_rc_interface.$(VCPROJ_SFX): $(RC_INTERFACE_TEST_SRCS) vpx.$(VCPROJ_SFX) \ @@ -616,6 +619,7 @@ test_rc_interface.$(VCPROJ_SFX): $(RC_INTERFACE_TEST_SRCS) vpx.$(VCPROJ_SFX) \ -I. -I"$(SRC_PATH_BARE)/third_party/googletest/src/include" \ -L. -l$(CODEC_LIB) -l$(RC_RTC_LIB) -l$(GTEST_LIB) $^ endif # RC_INTERFACE_TEST +endif # CONFIG_VP9_ENCODER endif else @@ -657,6 +661,7 @@ $(eval $(call linkerxx_template,$(TEST_INTRA_PRED_SPEED_BIN), \ -L. -lvpx -lgtest $(extralibs) -lm)) endif # TEST_INTRA_PRED_SPEED +ifeq ($(CONFIG_VP9_ENCODER),yes) ifneq ($(strip $(RC_INTERFACE_TEST_OBJS)),) $(RC_INTERFACE_TEST_OBJS) $(RC_INTERFACE_TEST_OBJS:.o=.d): \ CXXFLAGS += $(GTEST_INCLUDES) @@ -668,6 +673,7 @@ $(eval $(call linkerxx_template,$(RC_INTERFACE_TEST_BIN), \ $(RC_INTERFACE_TEST_OBJS) \ -L. -lvpx -lgtest -lvp9rc $(extralibs) -lm)) endif # RC_INTERFACE_TEST +endif # CONFIG_VP9_ENCODER ifneq ($(strip $(SIMPLE_ENCODE_TEST_OBJS)),) $(SIMPLE_ENCODE_TEST_OBJS) $(SIMPLE_ENCODE_TEST_OBJS:.o=.d): \ diff --git a/test/ratectrl_rtc_test.cc b/test/ratectrl_rtc_test.cc index 5e5a179b29..1414377082 100644 --- a/test/ratectrl_rtc_test.cc +++ b/test/ratectrl_rtc_test.cc @@ -16,6 +16,7 @@ #include "third_party/googletest/src/include/gtest/gtest.h" #include "test/codec_factory.h" #include "test/encode_test_driver.h" +#include "test/i420_video_source.h" #include "test/util.h" #include "test/video_source.h" #include "vpx/vpx_codec.h" @@ -23,188 +24,89 @@ namespace { -const size_t kNumFrame = 850; +const size_t kNumFrames = 300; -struct FrameInfo { - friend std::istream &operator>>(std::istream &is, FrameInfo &info) { - is >> info.frame_id >> info.spatial_id >> info.temporal_id >> info.base_q >> - info.target_bandwidth >> info.buffer_level >> info.filter_level_ >> - info.bytes_used; - return is; - } - - int frame_id; - int spatial_id; - int temporal_id; - // Base QP - int base_q; - size_t target_bandwidth; - size_t buffer_level; - // Loopfilter level - int filter_level_; - // Frame size for current frame, used for pose encode update - size_t bytes_used; -}; +const int kTemporalId[4] = { 0, 2, 1, 2 }; -// This test runs the rate control interface and compare against ground truth -// generated by encoders. -// Settings for the encoder: -// For 1 layer CBR: -// - AQ_Mode 0 -// - Disable golden refresh -// - Bitrate x 2 at frame/superframe 200 -// - Bitrate / 4 at frame/superframe 400 -// examples/vpx_temporal_svc_encoder gipsrec_motion1.1280_720.yuv out vp9 -// 1280 720 1 30 7 0 0 1 0 1000 -// -// For 1 layer VBR: -// - Set rc_end_usage to VPX_VBR -// - AQ Mode 0 -// - Disable vp9_compute_frame_low_motion in vp9_encoder.c -// - Set rc->constrain_gf_key_freq_onepass_vbr = 0 in vp9_rc_init -// examples/vpx_temporal_svc_encoder gipsrec_motion1.1280_720.yuv out vp9 -// 1280 720 1 30 7 0 0 1 0 1000 -// -// For SVC (3 temporal layers, 3 spatial layers): -// - AQ_Mode 0 -// - Disable golden refresh -// - Bitrate x 2 at frame/superframe 200 -// - Bitrate / 4 at frame/superframe 400 -// examples/vp9_spatial_svc_encoder -f 10000 -w 1280 -h 720 -t 1/30 -sl 3 -// -k 10000 -bl 100,140,200,250,350,500,450,630,900 -b 1600 --rc-end-usage=1 -// --lag-in-frames=0 --passes=1 --speed=7 --threads=1 -// --temporal-layering-mode=3 -aq 1 -rcstat 1 -// gipsrec_motion1.1280_720.yuv -o out.webm -// -// The generated file includes: -// frame number, spatial layer ID, temporal layer ID, base QP, target -// bandwidth, buffer level, loopfilter level, encoded frame size -// TODO(jianj): Remove golden files, and run actual encoding in this test. -class RcInterfaceTest : public ::testing::Test { +class RcInterfaceTest + : public ::libvpx_test::EncoderTest, + public ::libvpx_test::CodecTestWith2Params { public: - explicit RcInterfaceTest() {} + RcInterfaceTest() + : EncoderTest(GET_PARAM(0)), aq_mode_(GET_PARAM(1)), key_interval_(3000), + encoder_exit_(false) {} virtual ~RcInterfaceTest() {} protected: - void RunOneLayerCBR() { - SetConfigOneLayerCBR(); - rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_); - FrameInfo frame_info; - libvpx::VP9FrameParamsQpRTC frame_params; - frame_params.frame_type = KEY_FRAME; - frame_params.spatial_layer_id = 0; - frame_params.temporal_layer_id = 0; - std::ifstream one_layer_file; - one_layer_file.open(libvpx_test::GetDataPath() + - "/rc_interface_test_one_layer"); - ASSERT_TRUE(one_layer_file.good()); - for (size_t i = 0; i < kNumFrame; i++) { - one_layer_file >> frame_info; - if (frame_info.frame_id > 0) frame_params.frame_type = INTER_FRAME; - if (frame_info.frame_id == 200) { - rc_cfg_.target_bandwidth = rc_cfg_.target_bandwidth * 2; - rc_api_->UpdateRateControl(rc_cfg_); - } else if (frame_info.frame_id == 400) { - rc_cfg_.target_bandwidth = rc_cfg_.target_bandwidth / 4; - rc_api_->UpdateRateControl(rc_cfg_); - } - ASSERT_EQ(frame_info.spatial_id, 0); - ASSERT_EQ(frame_info.temporal_id, 0); - rc_api_->ComputeQP(frame_params); - ASSERT_EQ(rc_api_->GetQP(), frame_info.base_q); - ASSERT_EQ(rc_api_->GetLoopfilterLevel(), frame_info.filter_level_); - rc_api_->PostEncodeUpdate(frame_info.bytes_used); + virtual void SetUp() { + InitializeConfig(); + SetMode(::libvpx_test::kRealTime); + } + + virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video, + libvpx_test::Encoder *encoder) { + if (video->frame() == 0) { + encoder->Control(VP8E_SET_CPUUSED, 7); + encoder->Control(VP9E_SET_AQ_MODE, aq_mode_); + encoder->Control(VP9E_SET_TUNE_CONTENT, 0); + encoder->Control(VP8E_SET_MAX_INTRA_BITRATE_PCT, 1000); + encoder->Control(VP9E_SET_RTC_EXTERNAL_RATECTRL, 1); } + frame_params_.frame_type = + video->frame() % key_interval_ == 0 ? KEY_FRAME : INTER_FRAME; + if (rc_cfg_.rc_mode == VPX_CBR && frame_params_.frame_type == INTER_FRAME) { + // Disable golden frame update. + frame_flags_ |= VP8_EFLAG_NO_UPD_GF; + frame_flags_ |= VP8_EFLAG_NO_UPD_ARF; + } + encoder_exit_ = video->frame() == kNumFrames; } - void RunSVC() { - SetConfigSVC(); - rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_); - FrameInfo frame_info; - libvpx::VP9FrameParamsQpRTC frame_params; - frame_params.frame_type = KEY_FRAME; - std::ifstream svc_file; - svc_file.open(libvpx_test::GetDataPath() + "/rc_interface_test_svc"); - ASSERT_TRUE(svc_file.good()); - for (size_t i = 0; i < kNumFrame * rc_cfg_.ss_number_layers; i++) { - svc_file >> frame_info; - if (frame_info.frame_id > 0) frame_params.frame_type = INTER_FRAME; - if (frame_info.frame_id == 200 * rc_cfg_.ss_number_layers) { - for (int layer = 0; - layer < rc_cfg_.ss_number_layers * rc_cfg_.ts_number_layers; - layer++) - rc_cfg_.layer_target_bitrate[layer] *= 2; - rc_cfg_.target_bandwidth *= 2; - rc_api_->UpdateRateControl(rc_cfg_); - } else if (frame_info.frame_id == 400 * rc_cfg_.ss_number_layers) { - for (int layer = 0; - layer < rc_cfg_.ss_number_layers * rc_cfg_.ts_number_layers; - layer++) - rc_cfg_.layer_target_bitrate[layer] /= 4; - rc_cfg_.target_bandwidth /= 4; - rc_api_->UpdateRateControl(rc_cfg_); - } - frame_params.spatial_layer_id = frame_info.spatial_id; - frame_params.temporal_layer_id = frame_info.temporal_id; - rc_api_->ComputeQP(frame_params); - ASSERT_EQ(rc_api_->GetQP(), frame_info.base_q); - ASSERT_EQ(rc_api_->GetLoopfilterLevel(), frame_info.filter_level_); - rc_api_->PostEncodeUpdate(frame_info.bytes_used); + virtual void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) { + if (encoder_exit_) { + return; } + int loopfilter_level, qp; + encoder->Control(VP9E_GET_LOOPFILTER_LEVEL, &loopfilter_level); + encoder->Control(VP8E_GET_LAST_QUANTIZER, &qp); + rc_api_->ComputeQP(frame_params_); + ASSERT_EQ(rc_api_->GetQP(), qp); + ASSERT_EQ(rc_api_->GetLoopfilterLevel(), loopfilter_level); } - void RunOneLayerVBR() { - SetConfigOneLayerVBR(); + virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + rc_api_->PostEncodeUpdate(pkt->data.frame.sz); + } + + void RunOneLayer() { + SetConfig(GET_PARAM(2)); rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_); - FrameInfo frame_info; - libvpx::VP9FrameParamsQpRTC frame_params; - frame_params.frame_type = KEY_FRAME; - frame_params.spatial_layer_id = 0; - frame_params.temporal_layer_id = 0; - std::ifstream one_layer_file; - one_layer_file.open(libvpx_test::GetDataPath() + - "/rc_interface_test_one_layer_vbr"); - ASSERT_TRUE(one_layer_file.good()); - for (size_t i = 0; i < kNumFrame; i++) { - one_layer_file >> frame_info; - if (frame_info.frame_id > 0) frame_params.frame_type = INTER_FRAME; - ASSERT_EQ(frame_info.spatial_id, 0); - ASSERT_EQ(frame_info.temporal_id, 0); - rc_api_->ComputeQP(frame_params); - ASSERT_EQ(rc_api_->GetQP(), frame_info.base_q); - ASSERT_EQ(rc_api_->GetLoopfilterLevel(), frame_info.filter_level_); - rc_api_->PostEncodeUpdate(frame_info.bytes_used); - } + frame_params_.spatial_layer_id = 0; + frame_params_.temporal_layer_id = 0; + + ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv", + 1280, 720, 30, 1, 0, kNumFrames); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); } void RunOneLayerVBRPeriodicKey() { - SetConfigOneLayerVBRPeriodicKey(); + if (GET_PARAM(2) != VPX_VBR) return; + key_interval_ = 100; + SetConfig(VPX_VBR); rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_); - FrameInfo frame_info; - libvpx::VP9FrameParamsQpRTC frame_params; - frame_params.frame_type = KEY_FRAME; - frame_params.spatial_layer_id = 0; - frame_params.temporal_layer_id = 0; - std::ifstream one_layer_file; - one_layer_file.open(libvpx_test::GetDataPath() + - "/rc_interface_test_one_layer_vbr_periodic_key"); - ASSERT_TRUE(one_layer_file.good()); - for (size_t i = 0; i < kNumFrame; i++) { - one_layer_file >> frame_info; - if (frame_info.frame_id > 0) frame_params.frame_type = INTER_FRAME; - if (frame_info.frame_id % 300 == 0) frame_params.frame_type = KEY_FRAME; - ASSERT_EQ(frame_info.spatial_id, 0); - ASSERT_EQ(frame_info.temporal_id, 0); - rc_api_->ComputeQP(frame_params); - ASSERT_EQ(rc_api_->GetQP(), frame_info.base_q); - ASSERT_EQ(rc_api_->GetLoopfilterLevel(), frame_info.filter_level_); - rc_api_->PostEncodeUpdate(frame_info.bytes_used); - } + frame_params_.spatial_layer_id = 0; + frame_params_.temporal_layer_id = 0; + + ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv", + 1280, 720, 30, 1, 0, kNumFrames); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); } private: - void SetConfig() { + void SetConfig(vpx_rc_mode rc_mode) { rc_cfg_.width = 1280; rc_cfg_.height = 720; rc_cfg_.max_quantizer = 52; @@ -224,24 +126,182 @@ class RcInterfaceTest : public ::testing::Test { rc_cfg_.layer_target_bitrate[0] = 1000; rc_cfg_.max_quantizers[0] = 52; rc_cfg_.min_quantizers[0] = 2; + rc_cfg_.rc_mode = rc_mode; + + // Encoder settings for ground truth. + cfg_.g_w = 1280; + cfg_.g_h = 720; + cfg_.rc_undershoot_pct = 50; + cfg_.rc_overshoot_pct = 50; + cfg_.rc_buf_initial_sz = 600; + cfg_.rc_buf_optimal_sz = 600; + cfg_.rc_buf_sz = 1000; + cfg_.rc_dropframe_thresh = 0; + cfg_.rc_min_quantizer = 2; + cfg_.rc_max_quantizer = 52; + cfg_.rc_end_usage = rc_mode; + cfg_.g_lag_in_frames = 0; + cfg_.g_error_resilient = 0; + cfg_.rc_target_bitrate = 1000; + cfg_.kf_min_dist = key_interval_; + cfg_.kf_max_dist = key_interval_; } - void SetConfigOneLayerCBR() { - SetConfig(); - rc_cfg_.rc_mode = VPX_CBR; + std::unique_ptr rc_api_; + libvpx::VP9RateControlRtcConfig rc_cfg_; + int aq_mode_; + int key_interval_; + libvpx::VP9FrameParamsQpRTC frame_params_; + bool encoder_exit_; +}; + +class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest, + public ::libvpx_test::CodecTestWithParam { + public: + RcInterfaceSvcTest() : EncoderTest(GET_PARAM(0)), aq_mode_(GET_PARAM(1)) {} + virtual ~RcInterfaceSvcTest() {} + + protected: + virtual void SetUp() { + InitializeConfig(); + SetMode(::libvpx_test::kRealTime); } - void SetConfigOneLayerVBR() { - SetConfig(); - rc_cfg_.rc_mode = VPX_VBR; + virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) { + if (video->frame() == 0) { + encoder->Control(VP8E_SET_CPUUSED, 7); + encoder->Control(VP9E_SET_AQ_MODE, aq_mode_); + encoder->Control(VP9E_SET_TUNE_CONTENT, 0); + encoder->Control(VP8E_SET_MAX_INTRA_BITRATE_PCT, 900); + encoder->Control(VP9E_SET_RTC_EXTERNAL_RATECTRL, 1); + encoder->Control(VP9E_SET_SVC, 1); + encoder->Control(VP9E_SET_SVC_PARAMETERS, &svc_params_); + } + + frame_params_.frame_type = video->frame() == 0 ? KEY_FRAME : INTER_FRAME; + if (rc_cfg_.rc_mode == VPX_CBR && frame_params_.frame_type == INTER_FRAME) { + // Disable golden frame update. + frame_flags_ |= VP8_EFLAG_NO_UPD_GF; + frame_flags_ |= VP8_EFLAG_NO_UPD_ARF; + } + encoder_exit_ = video->frame() == kNumFrames; + current_superframe_ = video->frame(); } - void SetConfigOneLayerVBRPeriodicKey() { - SetConfig(); - rc_cfg_.rc_mode = VPX_VBR; + virtual void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) { + ::libvpx_test::CxDataIterator iter = encoder->GetCxData(); + while (const vpx_codec_cx_pkt_t *pkt = iter.Next()) { + ParseSuperframeSizes(static_cast(pkt->data.frame.buf), + pkt->data.frame.sz); + for (int sl = 0; sl < rc_cfg_.ss_number_layers; sl++) { + frame_params_.spatial_layer_id = sl; + frame_params_.temporal_layer_id = kTemporalId[current_superframe_ % 4]; + rc_api_->ComputeQP(frame_params_); + frame_params_.frame_type = INTER_FRAME; + rc_api_->PostEncodeUpdate(sizes_[sl]); + } + } + if (!encoder_exit_) { + int loopfilter_level, qp; + encoder->Control(VP9E_GET_LOOPFILTER_LEVEL, &loopfilter_level); + encoder->Control(VP8E_GET_LAST_QUANTIZER, &qp); + ASSERT_EQ(rc_api_->GetQP(), qp); + ASSERT_EQ(rc_api_->GetLoopfilterLevel(), loopfilter_level); + } + } + // This method needs to be overridden because non-reference frames are + // expected to be mismatched frames as the encoder will avoid loopfilter on + // these frames. + virtual void MismatchHook(const vpx_image_t * /*img1*/, + const vpx_image_t * /*img2*/) {} + + void RunSvc() { + SetConfigSvc(); + rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_); + SetEncoderSvc(); + + ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv", + 1280, 720, 30, 1, 0, kNumFrames); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + } + + private: + vpx_codec_err_t ParseSuperframeSizes(const uint8_t *data, size_t data_sz) { + uint8_t marker = *(data + data_sz - 1); + if ((marker & 0xe0) == 0xc0) { + const uint32_t frames = (marker & 0x7) + 1; + const uint32_t mag = ((marker >> 3) & 0x3) + 1; + const size_t index_sz = 2 + mag * frames; + // This chunk is marked as having a superframe index but doesn't have + // enough data for it, thus it's an invalid superframe index. + if (data_sz < index_sz) return VPX_CODEC_CORRUPT_FRAME; + { + const uint8_t marker2 = *(data + data_sz - index_sz); + // This chunk is marked as having a superframe index but doesn't have + // the matching marker byte at the front of the index therefore it's an + // invalid chunk. + if (marker != marker2) return VPX_CODEC_CORRUPT_FRAME; + } + const uint8_t *x = &data[data_sz - index_sz + 1]; + for (uint32_t i = 0; i < frames; ++i) { + uint32_t this_sz = 0; + + for (uint32_t j = 0; j < mag; ++j) this_sz |= (*x++) << (j * 8); + sizes_[i] = this_sz; + } + } + return VPX_CODEC_OK; + } + + void SetEncoderSvc() { + cfg_.ss_number_layers = 3; + cfg_.ts_number_layers = 3; + cfg_.g_timebase.num = 1; + cfg_.g_timebase.den = 30; + svc_params_.scaling_factor_num[0] = 72; + svc_params_.scaling_factor_den[0] = 288; + svc_params_.scaling_factor_num[1] = 144; + svc_params_.scaling_factor_den[1] = 288; + svc_params_.scaling_factor_num[2] = 288; + svc_params_.scaling_factor_den[2] = 288; + for (int i = 0; i < VPX_MAX_LAYERS; ++i) { + svc_params_.max_quantizers[i] = 56; + svc_params_.min_quantizers[i] = 2; + } + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + cfg_.g_error_resilient = 0; + // 3 temporal layers + cfg_.ts_rate_decimator[0] = 4; + cfg_.ts_rate_decimator[1] = 2; + cfg_.ts_rate_decimator[2] = 1; + cfg_.temporal_layering_mode = 3; + + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 600; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 2; + cfg_.rc_max_quantizer = 56; + cfg_.g_threads = 1; + cfg_.kf_max_dist = 9999; + cfg_.rc_target_bitrate = 1600; + cfg_.rc_overshoot_pct = 50; + cfg_.rc_undershoot_pct = 50; + + cfg_.layer_target_bitrate[0] = 100; + cfg_.layer_target_bitrate[1] = 140; + cfg_.layer_target_bitrate[2] = 200; + cfg_.layer_target_bitrate[3] = 250; + cfg_.layer_target_bitrate[4] = 350; + cfg_.layer_target_bitrate[5] = 500; + cfg_.layer_target_bitrate[6] = 450; + cfg_.layer_target_bitrate[7] = 630; + cfg_.layer_target_bitrate[8] = 900; } - void SetConfigSVC() { + void SetConfigSvc() { rc_cfg_.width = 1280; rc_cfg_.height = 720; rc_cfg_.max_quantizer = 56; @@ -288,17 +348,25 @@ class RcInterfaceTest : public ::testing::Test { } } + int aq_mode_; std::unique_ptr rc_api_; libvpx::VP9RateControlRtcConfig rc_cfg_; + vpx_svc_extra_cfg_t svc_params_; + libvpx::VP9FrameParamsQpRTC frame_params_; + bool encoder_exit_; + int current_superframe_; + uint32_t sizes_[8]; }; -TEST_F(RcInterfaceTest, OneLayerCBR) { RunOneLayerCBR(); } +TEST_P(RcInterfaceTest, OneLayer) { RunOneLayer(); } -TEST_F(RcInterfaceTest, OneLayerVBR) { RunOneLayerVBR(); } +TEST_P(RcInterfaceTest, OneLayerVBRPeriodicKey) { RunOneLayerVBRPeriodicKey(); } -TEST_F(RcInterfaceTest, OneLayerVBRPeriodicKey) { RunOneLayerVBRPeriodicKey(); } +TEST_P(RcInterfaceSvcTest, Svc) { RunSvc(); } -TEST_F(RcInterfaceTest, SVC) { RunSVC(); } +VP9_INSTANTIATE_TEST_SUITE(RcInterfaceTest, ::testing::Values(0), + ::testing::Values(VPX_CBR, VPX_VBR)); +VP9_INSTANTIATE_TEST_SUITE(RcInterfaceSvcTest, ::testing::Values(0)); } // namespace int main(int argc, char **argv) { diff --git a/test/test-data.mk b/test/test-data.mk index 379fc6e7a9..46fe359898 100644 --- a/test/test-data.mk +++ b/test/test-data.mk @@ -27,10 +27,6 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += noisy_clip_640_360.y4m LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += rush_hour_444.y4m LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += screendata.y4m LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += niklas_640_480_30.yuv -LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += rc_interface_test_one_layer -LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += rc_interface_test_one_layer_vbr -LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += rc_interface_test_one_layer_vbr_periodic_key -LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += rc_interface_test_svc LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += bus_352x288_420_f20_b8.yuv # Test vectors diff --git a/test/test-data.sha1 b/test/test-data.sha1 index bcf9612fba..668992fba2 100644 --- a/test/test-data.sha1 +++ b/test/test-data.sha1 @@ -869,7 +869,3 @@ bac455906360b45338a16dd626ac5f19bc36a307 *desktop_office1.1280_720-020.yuv 518a0be998afece76d3df76047d51e256c591ff2 *invalid-bug-148271109.ivf d3964f9dad9f60363c81b688324d95b4ec7c8038 *invalid-bug-148271109.ivf.res ad18ca16f0a249fb3b7c38de0d9b327fed273f96 *hantro_collage_w352h288_nv12.yuv -03f827c0e36ff9a6e23c5cc11936924e4f1827ab *rc_interface_test_one_layer -99e4f4c2961d46dc286db230090a39d78460b25d *rc_interface_test_svc -9dcaafd91bc61ed360c23616b4788437b9f9b96b *rc_interface_test_one_layer_vbr -babd17cca2e93cc74753c6ed80de87457bc3a5f3 *rc_interface_test_one_layer_vbr_periodic_key diff --git a/test/test.mk b/test/test.mk index b0319fb0de..11228ecdda 100644 --- a/test/test.mk +++ b/test/test.mk @@ -214,6 +214,11 @@ TEST_INTRA_PRED_SPEED_SRCS-yes := test_intra_pred_speed.cc TEST_INTRA_PRED_SPEED_SRCS-yes += ../md5_utils.h ../md5_utils.c RC_INTERFACE_TEST_SRCS-$(CONFIG_VP9_ENCODER) := ratectrl_rtc_test.cc +RC_INTERFACE_TEST_SRCS-$(CONFIG_VP9_ENCODER) += encode_test_driver.cc +RC_INTERFACE_TEST_SRCS-$(CONFIG_VP9_ENCODER) += encode_test_driver.h +RC_INTERFACE_TEST_SRCS-yes += decode_test_driver.cc +RC_INTERFACE_TEST_SRCS-yes += decode_test_driver.h +RC_INTERFACE_TEST_SRCS-yes += codec_factory.h endif # CONFIG_SHARED diff --git a/vp9/ratectrl_rtc.h b/vp9/ratectrl_rtc.h index f219f24500..4e0cb8b4c0 100644 --- a/vp9/ratectrl_rtc.h +++ b/vp9/ratectrl_rtc.h @@ -117,13 +117,16 @@ class VP9RateControlRTC { const VP9RateControlRtcConfig &cfg); ~VP9RateControlRTC() { if (cpi_) { - for (int sl = 0; sl < cpi_->svc.number_spatial_layers; sl++) { - for (int tl = 0; tl < cpi_->svc.number_temporal_layers; tl++) { - int layer = LAYER_IDS_TO_IDX(sl, tl, cpi_->oxcf.ts_number_layers); - LAYER_CONTEXT *const lc = &cpi_->svc.layer_context[layer]; - vpx_free(lc->map); - vpx_free(lc->last_coded_q_map); - vpx_free(lc->consec_zero_mv); + if (cpi_->svc.number_spatial_layers > 1 || + cpi_->svc.number_temporal_layers > 1) { + for (int sl = 0; sl < cpi_->svc.number_spatial_layers; sl++) { + for (int tl = 0; tl < cpi_->svc.number_temporal_layers; tl++) { + int layer = LAYER_IDS_TO_IDX(sl, tl, cpi_->oxcf.ts_number_layers); + LAYER_CONTEXT *const lc = &cpi_->svc.layer_context[layer]; + vpx_free(lc->map); + vpx_free(lc->last_coded_q_map); + vpx_free(lc->consec_zero_mv); + } } } vpx_free(cpi_); From 6b4b82fd7a47720d608f6349bdb2cb2b81adb6a1 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Thu, 15 Jul 2021 16:05:16 -0700 Subject: [PATCH 0204/1000] Use round to be more accurate casting float to int Change-Id: Ifd5961917831752b176dd75d39d6b2cba6ce72fa --- vp8/encoder/onyx_if.c | 6 +++--- vp9/encoder/vp9_svc_layercontext.c | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index aeed719d1f..71ef057a4a 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -301,9 +301,9 @@ static void init_temporal_layer_context(VP8_COMP *cpi, VP8_CONFIG *oxcf, /* Work out the average size of a frame within this layer */ if (layer > 0) { lc->avg_frame_size_for_layer = - (int)((cpi->oxcf.target_bitrate[layer] - - cpi->oxcf.target_bitrate[layer - 1]) * - 1000 / (lc->framerate - prev_layer_framerate)); + (int)round((cpi->oxcf.target_bitrate[layer] - + cpi->oxcf.target_bitrate[layer - 1]) * + 1000 / (lc->framerate - prev_layer_framerate)); } lc->active_worst_quality = cpi->oxcf.worst_allowed_q; diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c index f9a0de62a0..ad3a8f7afa 100644 --- a/vp9/encoder/vp9_svc_layercontext.c +++ b/vp9/encoder/vp9_svc_layercontext.c @@ -322,8 +322,8 @@ void vp9_update_temporal_layer_framerate(VP9_COMP *const cpi) { const int prev_layer_target_bandwidth = oxcf->layer_target_bitrate[st_idx - 1]; lc->avg_frame_size = - (int)((lc->target_bandwidth - prev_layer_target_bandwidth) / - (lc->framerate - prev_layer_framerate)); + (int)round((lc->target_bandwidth - prev_layer_target_bandwidth) / + (lc->framerate - prev_layer_framerate)); } } From cd260eba10b9155f6e6086c999bd4c9d3ca6c706 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Tue, 29 Jun 2021 14:48:35 -0700 Subject: [PATCH 0205/1000] Add cyclic refresh to vp9 rtc external ratecontrol Change-Id: Ia2a881399aa31ca0f34481b975362ddd4ad87f1c --- test/ratectrl_rtc_test.cc | 3 ++- vp9/encoder/vp9_aq_cyclicrefresh.c | 3 ++- vp9/ratectrl_rtc.cc | 27 ++++++++++++++++++++++++++- vp9/ratectrl_rtc.h | 10 ++++++++++ 4 files changed, 40 insertions(+), 3 deletions(-) diff --git a/test/ratectrl_rtc_test.cc b/test/ratectrl_rtc_test.cc index 1414377082..8136bd8b93 100644 --- a/test/ratectrl_rtc_test.cc +++ b/test/ratectrl_rtc_test.cc @@ -127,6 +127,7 @@ class RcInterfaceTest rc_cfg_.max_quantizers[0] = 52; rc_cfg_.min_quantizers[0] = 2; rc_cfg_.rc_mode = rc_mode; + rc_cfg_.aq_mode = aq_mode_; // Encoder settings for ground truth. cfg_.g_w = 1280; @@ -364,7 +365,7 @@ TEST_P(RcInterfaceTest, OneLayerVBRPeriodicKey) { RunOneLayerVBRPeriodicKey(); } TEST_P(RcInterfaceSvcTest, Svc) { RunSvc(); } -VP9_INSTANTIATE_TEST_SUITE(RcInterfaceTest, ::testing::Values(0), +VP9_INSTANTIATE_TEST_SUITE(RcInterfaceTest, ::testing::Values(0, 3), ::testing::Values(VPX_CBR, VPX_VBR)); VP9_INSTANTIATE_TEST_SUITE(RcInterfaceSvcTest, ::testing::Values(0)); } // namespace diff --git a/vp9/encoder/vp9_aq_cyclicrefresh.c b/vp9/encoder/vp9_aq_cyclicrefresh.c index e6edf5a925..f06fe47268 100644 --- a/vp9/encoder/vp9_aq_cyclicrefresh.c +++ b/vp9/encoder/vp9_aq_cyclicrefresh.c @@ -516,7 +516,8 @@ void vp9_cyclic_refresh_update_parameters(VP9_COMP *const cpi) { cr->rate_ratio_qdelta = 3.0; } else { cr->rate_ratio_qdelta = 2.0; - if (cpi->noise_estimate.enabled && cpi->noise_estimate.level >= kMedium) { + if (cr->content_mode && cpi->noise_estimate.enabled && + cpi->noise_estimate.level >= kMedium) { // Reduce the delta-qp if the estimated source noise is above threshold. cr->rate_ratio_qdelta = 1.7; cr->rate_boost_fac = 13; diff --git a/vp9/ratectrl_rtc.cc b/vp9/ratectrl_rtc.cc index b38a0db9c0..0f56e67e80 100644 --- a/vp9/ratectrl_rtc.cc +++ b/vp9/ratectrl_rtc.cc @@ -11,6 +11,7 @@ #include +#include "vp9/common/vp9_common.h" #include "vp9/encoder/vp9_encoder.h" #include "vp9/encoder/vp9_picklpf.h" #include "vpx/vp8cx.h" @@ -28,6 +29,15 @@ std::unique_ptr VP9RateControlRTC::Create( return nullptr; } rc_api->InitRateControl(cfg); + if (cfg.aq_mode) { + VP9_COMP *const cpi = rc_api->cpi_; + cpi->segmentation_map = static_cast( + vpx_calloc(cpi->common.mi_rows * cpi->common.mi_cols, + sizeof(*cpi->segmentation_map))); + cpi->cyclic_refresh = + vp9_cyclic_refresh_alloc(cpi->common.mi_rows, cpi->common.mi_cols); + cpi->cyclic_refresh->content_mode = 0; + } return rc_api; } @@ -42,13 +52,14 @@ void VP9RateControlRTC::InitRateControl(const VP9RateControlRtcConfig &rc_cfg) { oxcf->bit_depth = cm->bit_depth; oxcf->rc_mode = rc_cfg.rc_mode; oxcf->pass = 0; - oxcf->aq_mode = NO_AQ; + oxcf->aq_mode = rc_cfg.aq_mode ? CYCLIC_REFRESH_AQ : NO_AQ; oxcf->content = VP9E_CONTENT_DEFAULT; oxcf->drop_frames_water_mark = 0; cm->current_video_frame = 0; rc->kf_boost = DEFAULT_KF_BOOST; UpdateRateControl(rc_cfg); + vp9_set_mb_mi(cm, cm->width, cm->height); cpi_->use_svc = (cpi_->svc.number_spatial_layers > 1 || cpi_->svc.number_temporal_layers > 1) @@ -146,6 +157,8 @@ void VP9RateControlRTC::ComputeQP(const VP9FrameParamsQpRTC &frame_params) { cpi_->svc.number_temporal_layers == 1) { int target = 0; if (cpi_->oxcf.rc_mode == VPX_CBR) { + if (cpi_->oxcf.aq_mode == CYCLIC_REFRESH_AQ) + vp9_cyclic_refresh_update_parameters(cpi_); if (frame_is_intra_only(cm)) target = vp9_calc_iframe_target_size_one_pass_cbr(cpi_); else @@ -156,6 +169,8 @@ void VP9RateControlRTC::ComputeQP(const VP9FrameParamsQpRTC &frame_params) { cpi_->rc.frames_to_key = cpi_->oxcf.key_freq; } vp9_set_gf_update_one_pass_vbr(cpi_); + if (cpi_->oxcf.aq_mode == CYCLIC_REFRESH_AQ) + vp9_cyclic_refresh_update_parameters(cpi_); if (frame_is_intra_only(cm)) target = vp9_calc_iframe_target_size_one_pass_vbr(cpi_); else @@ -171,6 +186,8 @@ void VP9RateControlRTC::ComputeQP(const VP9FrameParamsQpRTC &frame_params) { int bottom_index, top_index; cpi_->common.base_qindex = vp9_rc_pick_q_and_bounds(cpi_, &bottom_index, &top_index); + + if (cpi_->oxcf.aq_mode == CYCLIC_REFRESH_AQ) vp9_cyclic_refresh_setup(cpi_); } int VP9RateControlRTC::GetQP() const { return cpi_->common.base_qindex; } @@ -181,6 +198,14 @@ int VP9RateControlRTC::GetLoopfilterLevel() const { return lf->filter_level; } +signed char *VP9RateControlRTC::GetCyclicRefreshMap() const { + return cpi_->cyclic_refresh->map; +} + +int *VP9RateControlRTC::GetDeltaQ() const { + return cpi_->cyclic_refresh->qindex_delta; +} + void VP9RateControlRTC::PostEncodeUpdate(uint64_t encoded_frame_size) { vp9_rc_postencode_update(cpi_, encoded_frame_size); if (cpi_->svc.number_spatial_layers > 1 || diff --git a/vp9/ratectrl_rtc.h b/vp9/ratectrl_rtc.h index 4e0cb8b4c0..5cc7ec9457 100644 --- a/vp9/ratectrl_rtc.h +++ b/vp9/ratectrl_rtc.h @@ -18,6 +18,7 @@ #include "vp9/common/vp9_enums.h" #include "vp9/common/vp9_onyxc_int.h" #include "vp9/vp9_iface_common.h" +#include "vp9/encoder/vp9_aq_cyclicrefresh.h" #include "vp9/encoder/vp9_encoder.h" #include "vp9/encoder/vp9_firstpass.h" #include "vp9/vp9_cx_iface.h" @@ -42,6 +43,7 @@ struct VP9RateControlRtcConfig { framerate = 30.0; ss_number_layers = ts_number_layers = 1; rc_mode = VPX_CBR; + aq_mode = 0; vp9_zero(max_quantizers); vp9_zero(min_quantizers); vp9_zero(scaling_factor_den); @@ -82,6 +84,7 @@ struct VP9RateControlRtcConfig { int ts_rate_decimator[VPX_TS_MAX_LAYERS]; // vbr, cbr enum vpx_rc_mode rc_mode; + int aq_mode; }; struct VP9FrameParamsQpRTC { @@ -129,6 +132,11 @@ class VP9RateControlRTC { } } } + if (cpi_->oxcf.aq_mode == CYCLIC_REFRESH_AQ) { + vpx_free(cpi_->segmentation_map); + cpi_->segmentation_map = NULL; + vp9_cyclic_refresh_free(cpi_->cyclic_refresh); + } vpx_free(cpi_); } } @@ -137,6 +145,8 @@ class VP9RateControlRTC { // GetQP() needs to be called after ComputeQP() to get the latest QP int GetQP() const; int GetLoopfilterLevel() const; + signed char *GetCyclicRefreshMap() const; + int *GetDeltaQ() const; void ComputeQP(const VP9FrameParamsQpRTC &frame_params); // Feedback to rate control with the size of current encoded frame void PostEncodeUpdate(uint64_t encoded_frame_size); From 4a4ea28a3826d6a9843369bb8880cf49dc7c5dc0 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Wed, 21 Jul 2021 14:32:27 -0700 Subject: [PATCH 0206/1000] Add control to get QP for all spatial layers Change-Id: I77a9884351e71649c8f8632293d9515c60f6adbc --- vp9/encoder/vp9_encoder.c | 4 ++++ vp9/encoder/vp9_svc_layercontext.h | 2 ++ vp9/vp9_cx_iface.c | 12 ++++++++++++ vpx/vp8cx.h | 13 +++++++++++++ 4 files changed, 31 insertions(+) diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index f50b979979..c964eb68bb 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -3709,6 +3709,10 @@ static void set_size_dependent_vars(VP9_COMP *cpi, int *q, int *bottom_index, cpi->rc.force_max_q = 0; } + if (cpi->use_svc) { + cpi->svc.base_qindex[cpi->svc.spatial_layer_id] = *q; + } + if (!frame_is_intra_only(cm)) { vp9_set_high_precision_mv(cpi, (*q) < HIGH_PRECISION_MV_QTHRESH); } diff --git a/vp9/encoder/vp9_svc_layercontext.h b/vp9/encoder/vp9_svc_layercontext.h index b12e7e01a7..b2d1d1b98f 100644 --- a/vp9/encoder/vp9_svc_layercontext.h +++ b/vp9/encoder/vp9_svc_layercontext.h @@ -173,6 +173,8 @@ typedef struct SVC { uint8_t fb_idx_temporal_layer_id[REF_FRAMES]; int spatial_layer_sync[VPX_SS_MAX_LAYERS]; + // Quantizer for each spatial layer. + int base_qindex[VPX_SS_MAX_LAYERS]; uint8_t set_intra_only_frame; uint8_t previous_frame_is_intra_only; uint8_t superframe_has_layer_sync; diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index 906f2b0b85..0da54d2d0d 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -834,6 +834,17 @@ static vpx_codec_err_t ctrl_get_quantizer64(vpx_codec_alg_priv_t *ctx, return VPX_CODEC_OK; } +static vpx_codec_err_t ctrl_get_quantizer_svc_layers(vpx_codec_alg_priv_t *ctx, + va_list args) { + int *const arg = va_arg(args, int *); + int i; + if (arg == NULL) return VPX_CODEC_INVALID_PARAM; + for (i = 0; i < VPX_SS_MAX_LAYERS; i++) { + arg[i] = ctx->cpi->svc.base_qindex[i]; + } + return VPX_CODEC_OK; +} + static vpx_codec_err_t ctrl_get_loopfilter_level(vpx_codec_alg_priv_t *ctx, va_list args) { int *const arg = va_arg(args, int *); @@ -1988,6 +1999,7 @@ static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = { // Getters { VP8E_GET_LAST_QUANTIZER, ctrl_get_quantizer }, { VP8E_GET_LAST_QUANTIZER_64, ctrl_get_quantizer64 }, + { VP9E_GET_LAST_QUANTIZER_SVC_LAYERS, ctrl_get_quantizer_svc_layers }, { VP9E_GET_LOOPFILTER_LEVEL, ctrl_get_loopfilter_level }, { VP9_GET_REFERENCE, ctrl_get_reference }, { VP9E_GET_SVC_LAYER_ID, ctrl_get_svc_layer_id }, diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h index 011dfcba52..7d0dee0b78 100644 --- a/vpx/vp8cx.h +++ b/vpx/vp8cx.h @@ -732,6 +732,16 @@ enum vp8e_enc_control_id { * Supported in codecs: VP9 */ VP9E_GET_LOOPFILTER_LEVEL, + + /*!\brief Codec control to get last quantizers for all spatial layers. + * + * Return value uses an array of internal quantizers scale defined by the + * codec, for all spatial layers. + * The size of the array passed in should be #VPX_SS_MAX_LAYERS. + * + * Supported in codecs: VP9 + */ + VP9E_GET_LAST_QUANTIZER_SVC_LAYERS, }; /*!\brief vpx 1-D scaling mode @@ -989,6 +999,9 @@ VPX_CTRL_USE_TYPE(VP8E_GET_LAST_QUANTIZER, int *) #define VPX_CTRL_VP8E_GET_LAST_QUANTIZER VPX_CTRL_USE_TYPE(VP8E_GET_LAST_QUANTIZER_64, int *) #define VPX_CTRL_VP8E_GET_LAST_QUANTIZER_64 +VPX_CTRL_USE_TYPE(VP9E_GET_LAST_QUANTIZER_SVC_LAYERS, int *) +#define VPX_CTRL_VP9E_GET_LAST_QUANTIZER_SVC_LAYERS + VPX_CTRL_USE_TYPE(VP9E_GET_SVC_LAYER_ID, vpx_svc_layer_id_t *) #define VPX_CTRL_VP9E_GET_SVC_LAYER_ID From cf64eb2805c73f4feaac3fec2b890d2fb3eef7da Mon Sep 17 00:00:00 2001 From: Yunqing Wang Date: Fri, 23 Jul 2021 10:55:10 -0700 Subject: [PATCH 0207/1000] Disable allow_partition_search_skip feature This feature was added to help speed up still images and slideshows. It didn't work anymore, and thus was disabled. Code cleanup will follow. This had negligible impact to regular test sets. Borg test result on ugc360p set at speed 3. avg_psnr: ovr_psnr: ssim: speed: -0.244 -0.278 -0.153 -0.973 Change-Id: If74edabce0c93be1361e645ffd2eec063c2db76b --- vp9/encoder/vp9_speed_features.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c index fc7a67c9f1..1431446d9e 100644 --- a/vp9/encoder/vp9_speed_features.c +++ b/vp9/encoder/vp9_speed_features.c @@ -345,7 +345,7 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, sf->intra_y_mode_mask[TX_32X32] = INTRA_DC; sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC; sf->adaptive_interp_filter_search = 1; - sf->allow_partition_search_skip = 1; + sf->allow_partition_search_skip = 0; if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) { for (i = 0; i < MAX_MESH_STEP; ++i) { From 7c00f0ce18811dbe6d538ebda9fec7339fed3a90 Mon Sep 17 00:00:00 2001 From: Yunqing Wang Date: Fri, 23 Jul 2021 22:34:01 -0700 Subject: [PATCH 0208/1000] Clean up allow_partition_search_skip code Change-Id: Ia05157fc3e613d93f10df5abddd77a740a0005ca --- vp9/encoder/vp9_encodeframe.c | 22 ++------------------- vp9/encoder/vp9_encoder.c | 1 - vp9/encoder/vp9_encoder.h | 3 --- vp9/encoder/vp9_firstpass.c | 33 -------------------------------- vp9/encoder/vp9_speed_features.c | 2 -- vp9/encoder/vp9_speed_features.h | 3 --- 6 files changed, 2 insertions(+), 62 deletions(-) diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index 969fad59b1..3c54aa548d 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -160,6 +160,7 @@ unsigned int vp9_high_get_sby_perpixel_variance(VP9_COMP *cpi, #endif // CONFIG_VP9_HIGHBITDEPTH #if !CONFIG_REALTIME_ONLY +#if CONFIG_FP_MB_STATS static unsigned int get_sby_perpixel_diff_variance(VP9_COMP *cpi, const struct buf_2d *ref, int mi_row, int mi_col, @@ -174,20 +175,8 @@ static unsigned int get_sby_perpixel_diff_variance(VP9_COMP *cpi, var = cpi->fn_ptr[bs].vf(ref->buf, ref->stride, last_y, last->y_stride, &sse); return ROUND_POWER_OF_TWO(var, num_pels_log2_lookup[bs]); } +#endif -static BLOCK_SIZE get_rd_var_based_fixed_partition(VP9_COMP *cpi, MACROBLOCK *x, - int mi_row, int mi_col) { - unsigned int var = get_sby_perpixel_diff_variance( - cpi, &x->plane[0].src, mi_row, mi_col, BLOCK_64X64); - if (var < 8) - return BLOCK_64X64; - else if (var < 128) - return BLOCK_32X32; - else if (var < 2048) - return BLOCK_16X16; - else - return BLOCK_8X8; -} #endif // !CONFIG_REALTIME_ONLY static void set_segment_index(VP9_COMP *cpi, MACROBLOCK *const x, int mi_row, @@ -4705,13 +4694,6 @@ static void encode_rd_sb_row(VP9_COMP *cpi, ThreadData *td, set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize); rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, BLOCK_64X64, &dummy_rate, &dummy_dist, 1, td->pc_root); - } else if (cpi->partition_search_skippable_frame) { - BLOCK_SIZE bsize; - set_offsets(cpi, tile_info, x, mi_row, mi_col, BLOCK_64X64); - bsize = get_rd_var_based_fixed_partition(cpi, x, mi_row, mi_col); - set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize); - rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, BLOCK_64X64, - &dummy_rate, &dummy_dist, 1, td->pc_root); } else if (sf->partition_search_type == VAR_BASED_PARTITION && cm->frame_type != KEY_FRAME) { choose_partitioning(cpi, tile_info, x, mi_row, mi_col); diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index f50b979979..f1a96e8b01 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -2323,7 +2323,6 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf, vp9_init_rd_parameters(cpi); init_frame_indexes(cm); - cpi->partition_search_skippable_frame = 0; cpi->tile_data = NULL; realloc_segmentation_maps(cpi); diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index ea2d59e1b5..7a3f354bcd 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -711,9 +711,6 @@ typedef struct VP9_COMP { TileDataEnc *tile_data; int allocated_tiles; // Keep track of memory allocated for tiles. - // For a still frame, this flag is set to 1 to skip partition search. - int partition_search_skippable_frame; - int scaled_ref_idx[REFS_PER_FRAME]; int lst_fb_idx; int gld_fb_idx; diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index 7343d1bc66..e30e2bed01 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -3477,25 +3477,6 @@ static void find_next_key_frame(VP9_COMP *cpi, int kf_show_idx) { } } -static int is_skippable_frame(const VP9_COMP *cpi) { - // If the current frame does not have non-zero motion vector detected in the - // first pass, and so do its previous and forward frames, then this frame - // can be skipped for partition check, and the partition size is assigned - // according to the variance - const TWO_PASS *const twopass = &cpi->twopass; - - return (!frame_is_intra_only(&cpi->common) && - twopass->stats_in - 2 > twopass->stats_in_start && - twopass->stats_in < twopass->stats_in_end && - (twopass->stats_in - 1)->pcnt_inter - - (twopass->stats_in - 1)->pcnt_motion == - 1 && - (twopass->stats_in - 2)->pcnt_inter - - (twopass->stats_in - 2)->pcnt_motion == - 1 && - twopass->stats_in->pcnt_inter - twopass->stats_in->pcnt_motion == 1); -} - // Configure image size specific vizier parameters. // Later these will be set via additional command line options void vp9_init_vizier_params(TWO_PASS *const twopass, int screen_area) { @@ -3593,13 +3574,6 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { cm->frame_type = INTER_FRAME; - // Do the firstpass stats indicate that this frame is skippable for the - // partition search? - if (cpi->sf.allow_partition_search_skip && cpi->oxcf.pass == 2 && - !cpi->use_svc) { - cpi->partition_search_skippable_frame = is_skippable_frame(cpi); - } - // The multiplication by 256 reverses a scaling factor of (>> 8) // applied when combining MB error values for the frame. twopass->mb_av_energy = log((this_frame.intra_error * 256.0) + 1.0); @@ -3682,13 +3656,6 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { vp9_configure_buffer_updates(cpi, gf_group->index); - // Do the firstpass stats indicate that this frame is skippable for the - // partition search? - if (cpi->sf.allow_partition_search_skip && cpi->oxcf.pass == 2 && - !cpi->use_svc) { - cpi->partition_search_skippable_frame = is_skippable_frame(cpi); - } - rc->base_frame_target = gf_group->bit_allocation[gf_group->index]; // The multiplication by 256 reverses a scaling factor of (>> 8) diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c index 1431446d9e..81695e9156 100644 --- a/vp9/encoder/vp9_speed_features.c +++ b/vp9/encoder/vp9_speed_features.c @@ -345,7 +345,6 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, sf->intra_y_mode_mask[TX_32X32] = INTRA_DC; sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC; sf->adaptive_interp_filter_search = 1; - sf->allow_partition_search_skip = 0; if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) { for (i = 0; i < MAX_MESH_STEP; ++i) { @@ -931,7 +930,6 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi, int speed) { sf->max_delta_qindex = 0; sf->disable_filter_search_var_thresh = 0; sf->adaptive_interp_filter_search = 0; - sf->allow_partition_search_skip = 0; sf->allow_txfm_domain_distortion = 0; sf->tx_domain_thresh = 99.0; sf->allow_quant_coeff_opt = sf->optimize_coefficients; diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h index 5ea04709ec..c2ae970b77 100644 --- a/vp9/encoder/vp9_speed_features.h +++ b/vp9/encoder/vp9_speed_features.h @@ -525,9 +525,6 @@ typedef struct SPEED_FEATURES { int prune_rect_thresh[4]; } rd_ml_partition; - // Allow skipping partition search for still image frame - int allow_partition_search_skip; - // Fast approximation of vp9_model_rd_from_var_lapndz int simple_model_rd_from_var; From 0973ac05bae529e54d9e04e0f69fa4e58869d923 Mon Sep 17 00:00:00 2001 From: Yunqing Wang Date: Fri, 23 Jul 2021 22:45:45 -0700 Subject: [PATCH 0209/1000] Remove unused old FP_MB_STATS code Change-Id: I78ac1f8ce1598de295efd2ac1fe8244072d9b501 --- vp9/encoder/vp9_encodeframe.c | 200 ---------------------------------- vp9/encoder/vp9_encoder.c | 30 ----- vp9/encoder/vp9_encoder.h | 8 -- vp9/encoder/vp9_firstpass.c | 86 --------------- vp9/encoder/vp9_firstpass.h | 27 ----- vp9/vp9_cx_iface.c | 9 -- vpxenc.c | 51 --------- 7 files changed, 411 deletions(-) diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index 3c54aa548d..f08300976e 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -159,26 +159,6 @@ unsigned int vp9_high_get_sby_perpixel_variance(VP9_COMP *cpi, } #endif // CONFIG_VP9_HIGHBITDEPTH -#if !CONFIG_REALTIME_ONLY -#if CONFIG_FP_MB_STATS -static unsigned int get_sby_perpixel_diff_variance(VP9_COMP *cpi, - const struct buf_2d *ref, - int mi_row, int mi_col, - BLOCK_SIZE bs) { - unsigned int sse, var; - uint8_t *last_y; - const YV12_BUFFER_CONFIG *last = get_ref_frame_buffer(cpi, LAST_FRAME); - - assert(last != NULL); - last_y = - &last->y_buffer[mi_row * MI_SIZE * last->y_stride + mi_col * MI_SIZE]; - var = cpi->fn_ptr[bs].vf(ref->buf, ref->stride, last_y, last->y_stride, &sse); - return ROUND_POWER_OF_TWO(var, num_pels_log2_lookup[bs]); -} -#endif - -#endif // !CONFIG_REALTIME_ONLY - static void set_segment_index(VP9_COMP *cpi, MACROBLOCK *const x, int mi_row, int mi_col, BLOCK_SIZE bsize, int segment_index) { VP9_COMMON *const cm = &cpi->common; @@ -3110,54 +3090,6 @@ static INLINE void load_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) { memcpy(x->pred_mv, ctx->pred_mv, sizeof(x->pred_mv)); } -#if CONFIG_FP_MB_STATS -const int num_16x16_blocks_wide_lookup[BLOCK_SIZES] = { 1, 1, 1, 1, 1, 1, 1, - 1, 2, 2, 2, 4, 4 }; -const int num_16x16_blocks_high_lookup[BLOCK_SIZES] = { 1, 1, 1, 1, 1, 1, 1, - 2, 1, 2, 4, 2, 4 }; -const int qindex_skip_threshold_lookup[BLOCK_SIZES] = { 0, 10, 10, 30, 40, - 40, 60, 80, 80, 90, - 100, 100, 120 }; -const int qindex_split_threshold_lookup[BLOCK_SIZES] = { 0, 3, 3, 7, 15, - 15, 30, 40, 40, 60, - 80, 80, 120 }; -const int complexity_16x16_blocks_threshold[BLOCK_SIZES] = { 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, - 4, 4, 6 }; - -typedef enum { - MV_ZERO = 0, - MV_LEFT = 1, - MV_UP = 2, - MV_RIGHT = 3, - MV_DOWN = 4, - MV_INVALID -} MOTION_DIRECTION; - -static INLINE MOTION_DIRECTION get_motion_direction_fp(uint8_t fp_byte) { - if (fp_byte & FPMB_MOTION_ZERO_MASK) { - return MV_ZERO; - } else if (fp_byte & FPMB_MOTION_LEFT_MASK) { - return MV_LEFT; - } else if (fp_byte & FPMB_MOTION_RIGHT_MASK) { - return MV_RIGHT; - } else if (fp_byte & FPMB_MOTION_UP_MASK) { - return MV_UP; - } else { - return MV_DOWN; - } -} - -static INLINE int get_motion_inconsistency(MOTION_DIRECTION this_mv, - MOTION_DIRECTION that_mv) { - if (this_mv == that_mv) { - return 0; - } else { - return abs(this_mv - that_mv) == 2 ? 2 : 1; - } -} -#endif - // Calculate prediction based on the given input features and neural net config. // Assume there are no more than NN_MAX_NODES_PER_LAYER nodes in each hidden // layer. @@ -4055,11 +3987,6 @@ static int rd_pick_partition(VP9_COMP *cpi, ThreadData *td, BLOCK_SIZE min_size = x->min_partition_size; BLOCK_SIZE max_size = x->max_partition_size; -#if CONFIG_FP_MB_STATS - unsigned int src_diff_var = UINT_MAX; - int none_complexity = 0; -#endif - int partition_none_allowed = !force_horz_split && !force_vert_split; int partition_horz_allowed = !force_vert_split && yss <= xss && bsize >= BLOCK_8X8; @@ -4146,65 +4073,6 @@ static int rd_pick_partition(VP9_COMP *cpi, ThreadData *td, save_context(x, mi_row, mi_col, a, l, sa, sl, bsize); -#if CONFIG_FP_MB_STATS - if (cpi->use_fp_mb_stats) { - set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize); - src_diff_var = get_sby_perpixel_diff_variance(cpi, &x->plane[0].src, mi_row, - mi_col, bsize); - } -#endif - -#if CONFIG_FP_MB_STATS - // Decide whether we shall split directly and skip searching NONE by using - // the first pass block statistics - if (cpi->use_fp_mb_stats && bsize >= BLOCK_32X32 && do_split && - partition_none_allowed && src_diff_var > 4 && - cm->base_qindex < qindex_split_threshold_lookup[bsize]) { - int mb_row = mi_row >> 1; - int mb_col = mi_col >> 1; - int mb_row_end = - VPXMIN(mb_row + num_16x16_blocks_high_lookup[bsize], cm->mb_rows); - int mb_col_end = - VPXMIN(mb_col + num_16x16_blocks_wide_lookup[bsize], cm->mb_cols); - int r, c; - - // compute a complexity measure, basically measure inconsistency of motion - // vectors obtained from the first pass in the current block - for (r = mb_row; r < mb_row_end; r++) { - for (c = mb_col; c < mb_col_end; c++) { - const int mb_index = r * cm->mb_cols + c; - - MOTION_DIRECTION this_mv; - MOTION_DIRECTION right_mv; - MOTION_DIRECTION bottom_mv; - - this_mv = - get_motion_direction_fp(cpi->twopass.this_frame_mb_stats[mb_index]); - - // to its right - if (c != mb_col_end - 1) { - right_mv = get_motion_direction_fp( - cpi->twopass.this_frame_mb_stats[mb_index + 1]); - none_complexity += get_motion_inconsistency(this_mv, right_mv); - } - - // to its bottom - if (r != mb_row_end - 1) { - bottom_mv = get_motion_direction_fp( - cpi->twopass.this_frame_mb_stats[mb_index + cm->mb_cols]); - none_complexity += get_motion_inconsistency(this_mv, bottom_mv); - } - - // do not count its left and top neighbors to avoid double counting - } - } - - if (none_complexity > complexity_16x16_blocks_threshold[bsize]) { - partition_none_allowed = 0; - } - } -#endif - pc_tree->partitioning = PARTITION_NONE; if (cpi->sf.rd_ml_partition.var_pruning && !frame_is_intra_only(cm)) { @@ -4282,53 +4150,6 @@ static int rd_pick_partition(VP9_COMP *cpi, ThreadData *td, } } } - -#if CONFIG_FP_MB_STATS - // Check if every 16x16 first pass block statistics has zero - // motion and the corresponding first pass residue is small enough. - // If that is the case, check the difference variance between the - // current frame and the last frame. If the variance is small enough, - // stop further splitting in RD optimization - if (cpi->use_fp_mb_stats && do_split != 0 && - cm->base_qindex > qindex_skip_threshold_lookup[bsize]) { - int mb_row = mi_row >> 1; - int mb_col = mi_col >> 1; - int mb_row_end = - VPXMIN(mb_row + num_16x16_blocks_high_lookup[bsize], cm->mb_rows); - int mb_col_end = - VPXMIN(mb_col + num_16x16_blocks_wide_lookup[bsize], cm->mb_cols); - int r, c; - - int skip = 1; - for (r = mb_row; r < mb_row_end; r++) { - for (c = mb_col; c < mb_col_end; c++) { - const int mb_index = r * cm->mb_cols + c; - if (!(cpi->twopass.this_frame_mb_stats[mb_index] & - FPMB_MOTION_ZERO_MASK) || - !(cpi->twopass.this_frame_mb_stats[mb_index] & - FPMB_ERROR_SMALL_MASK)) { - skip = 0; - break; - } - } - if (skip == 0) { - break; - } - } - - if (skip) { - if (src_diff_var == UINT_MAX) { - set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize); - src_diff_var = get_sby_perpixel_diff_variance( - cpi, &x->plane[0].src, mi_row, mi_col, bsize); - } - if (src_diff_var < 8) { - do_split = 0; - do_rect = 0; - } - } - } -#endif } } restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize); @@ -6064,20 +5885,6 @@ static void encode_tiles(VP9_COMP *cpi) { vp9_encode_tile(cpi, &cpi->td, tile_row, tile_col); } -#if CONFIG_FP_MB_STATS -static int input_fpmb_stats(FIRSTPASS_MB_STATS *firstpass_mb_stats, - VP9_COMMON *cm, uint8_t **this_frame_mb_stats) { - uint8_t *mb_stats_in = firstpass_mb_stats->mb_stats_start + - cm->current_video_frame * cm->MBs * sizeof(uint8_t); - - if (mb_stats_in > firstpass_mb_stats->mb_stats_end) return EOF; - - *this_frame_mb_stats = mb_stats_in; - - return 1; -} -#endif - static int compare_kmeans_data(const void *a, const void *b) { if (((const KMEANS_DATA *)a)->value > ((const KMEANS_DATA *)b)->value) { return 1; @@ -6284,13 +6091,6 @@ static void encode_frame_internal(VP9_COMP *cpi) { struct vpx_usec_timer emr_timer; vpx_usec_timer_start(&emr_timer); -#if CONFIG_FP_MB_STATS - if (cpi->use_fp_mb_stats) { - input_fpmb_stats(&cpi->twopass.firstpass_mb_stats, cm, - &cpi->twopass.this_frame_mb_stats); - } -#endif - if (!cpi->row_mt) { cpi->row_mt_sync_read_ptr = vp9_row_mt_sync_read_dummy; cpi->row_mt_sync_write_ptr = vp9_row_mt_sync_write_dummy; diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index f1a96e8b01..9001d018f3 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -2363,17 +2363,6 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf, vpx_calloc(cm->MBs * sizeof(*cpi->mbgraph_stats[i].mb_stats), 1)); } -#if CONFIG_FP_MB_STATS - cpi->use_fp_mb_stats = 0; - if (cpi->use_fp_mb_stats) { - // a place holder used to store the first pass mb stats in the first pass - CHECK_MEM_ERROR(cm, cpi->twopass.frame_mb_stats_buf, - vpx_calloc(cm->MBs * sizeof(uint8_t), 1)); - } else { - cpi->twopass.frame_mb_stats_buf = NULL; - } -#endif - cpi->refresh_alt_ref_frame = 0; cpi->b_calculate_psnr = CONFIG_INTERNAL_STATS; @@ -2526,18 +2515,6 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf, vp9_init_second_pass_spatial_svc(cpi); } else { int num_frames; -#if CONFIG_FP_MB_STATS - if (cpi->use_fp_mb_stats) { - const size_t psz = cpi->common.MBs * sizeof(uint8_t); - const int ps = (int)(oxcf->firstpass_mb_stats_in.sz / psz); - - cpi->twopass.firstpass_mb_stats.mb_stats_start = - oxcf->firstpass_mb_stats_in.buf; - cpi->twopass.firstpass_mb_stats.mb_stats_end = - cpi->twopass.firstpass_mb_stats.mb_stats_start + - (ps - 1) * cpi->common.MBs * sizeof(uint8_t); - } -#endif cpi->twopass.stats_in_start = oxcf->two_pass_stats_in.buf; cpi->twopass.stats_in = cpi->twopass.stats_in_start; @@ -2841,13 +2818,6 @@ void vp9_remove_compressor(VP9_COMP *cpi) { vpx_free(cpi->mbgraph_stats[i].mb_stats); } -#if CONFIG_FP_MB_STATS - if (cpi->use_fp_mb_stats) { - vpx_free(cpi->twopass.frame_mb_stats_buf); - cpi->twopass.frame_mb_stats_buf = NULL; - } -#endif - vp9_extrc_delete(&cpi->ext_ratectrl); vp9_remove_common(cm); diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 7a3f354bcd..9774a64ccf 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -273,10 +273,6 @@ typedef struct VP9EncoderConfig { vpx_fixed_buf_t two_pass_stats_in; -#if CONFIG_FP_MB_STATS - vpx_fixed_buf_t firstpass_mb_stats_in; -#endif - vp8e_tuning tuning; vp9e_tune_content content; #if CONFIG_VP9_HIGHBITDEPTH @@ -803,10 +799,6 @@ typedef struct VP9_COMP { uint64_t time_pick_lpf; uint64_t time_encode_sb_row; -#if CONFIG_FP_MB_STATS - int use_fp_mb_stats; -#endif - TWO_PASS twopass; // Force recalculation of segment_ids for each mode info diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index e30e2bed01..67302ed035 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -135,17 +135,6 @@ static void output_stats(FIRSTPASS_STATS *stats) { #endif } -#if CONFIG_FP_MB_STATS -static void output_fpmb_stats(uint8_t *this_frame_mb_stats, VP9_COMMON *cm, - struct vpx_codec_pkt_list *pktlist) { - struct vpx_codec_cx_pkt pkt; - pkt.kind = VPX_CODEC_FPMB_STATS_PKT; - pkt.data.firstpass_mb_stats.buf = this_frame_mb_stats; - pkt.data.firstpass_mb_stats.sz = cm->initial_mbs * sizeof(uint8_t); - vpx_codec_pkt_list_add(pktlist, &pkt); -} -#endif - static void zero_stats(FIRSTPASS_STATS *section) { section->frame = 0.0; section->weight = 0.0; @@ -953,10 +942,6 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, int level_sample; const int mb_index = mb_row * cm->mb_cols + mb_col; -#if CONFIG_FP_MB_STATS - const int mb_index = mb_row * cm->mb_cols + mb_col; -#endif - (*(cpi->row_mt_sync_read_ptr))(&tile_data->row_mt_sync, mb_row, c); // Adjust to the next column of MBs. @@ -1092,13 +1077,6 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, // Accumulate the intra error. fp_acc_data->intra_error += (int64_t)this_error; -#if CONFIG_FP_MB_STATS - if (cpi->use_fp_mb_stats) { - // initialization - cpi->twopass.frame_mb_stats_buf[mb_index] = 0; - } -#endif - // Set up limit values for motion vectors to prevent them extending // outside the UMV borders. x->mv_limits.col_min = -((mb_col * 16) + BORDER_MV_PIXELS_B16); @@ -1244,20 +1222,6 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, best_ref_mv->row = 0; best_ref_mv->col = 0; -#if CONFIG_FP_MB_STATS - if (cpi->use_fp_mb_stats) { - // intra prediction statistics - cpi->twopass.frame_mb_stats_buf[mb_index] = 0; - cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_DCINTRA_MASK; - cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_MOTION_ZERO_MASK; - if (this_error > FPMB_ERROR_LARGE_TH) { - cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_LARGE_MASK; - } else if (this_error < FPMB_ERROR_SMALL_TH) { - cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_SMALL_MASK; - } - } -#endif - if (motion_error <= this_error) { vpx_clear_system_state(); @@ -1302,47 +1266,9 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, *best_ref_mv = mv; -#if CONFIG_FP_MB_STATS - if (cpi->use_fp_mb_stats) { - // inter prediction statistics - cpi->twopass.frame_mb_stats_buf[mb_index] = 0; - cpi->twopass.frame_mb_stats_buf[mb_index] &= ~FPMB_DCINTRA_MASK; - cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_MOTION_ZERO_MASK; - if (this_error > FPMB_ERROR_LARGE_TH) { - cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_LARGE_MASK; - } else if (this_error < FPMB_ERROR_SMALL_TH) { - cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_SMALL_MASK; - } - } -#endif - if (!is_zero_mv(&mv)) { ++(fp_acc_data->mvcount); -#if CONFIG_FP_MB_STATS - if (cpi->use_fp_mb_stats) { - cpi->twopass.frame_mb_stats_buf[mb_index] &= ~FPMB_MOTION_ZERO_MASK; - // check estimated motion direction - if (mv.as_mv.col > 0 && mv.as_mv.col >= abs(mv.as_mv.row)) { - // right direction - cpi->twopass.frame_mb_stats_buf[mb_index] |= - FPMB_MOTION_RIGHT_MASK; - } else if (mv.as_mv.row < 0 && - abs(mv.as_mv.row) >= abs(mv.as_mv.col)) { - // up direction - cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_MOTION_UP_MASK; - } else if (mv.as_mv.col < 0 && - abs(mv.as_mv.col) >= abs(mv.as_mv.row)) { - // left direction - cpi->twopass.frame_mb_stats_buf[mb_index] |= - FPMB_MOTION_LEFT_MASK; - } else { - // down direction - cpi->twopass.frame_mb_stats_buf[mb_index] |= - FPMB_MOTION_DOWN_MASK; - } - } -#endif // Does the row vector point inwards or outwards? if (mb_row < cm->mb_rows / 2) { if (mv.row > 0) @@ -1459,12 +1385,6 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { assert(new_yv12 != NULL); assert(frame_is_intra_only(cm) || (lst_yv12 != NULL)); -#if CONFIG_FP_MB_STATS - if (cpi->use_fp_mb_stats) { - vp9_zero_array(cpi->twopass.frame_mb_stats_buf, cm->initial_mbs); - } -#endif - set_first_pass_params(cpi); vp9_set_quantizer(cpi, find_fp_qindex(cm->bit_depth)); @@ -1525,12 +1445,6 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { twopass->this_frame_stats = fps; output_stats(&twopass->this_frame_stats); accumulate_stats(&twopass->total_stats, &fps); - -#if CONFIG_FP_MB_STATS - if (cpi->use_fp_mb_stats) { - output_fpmb_stats(twopass->frame_mb_stats_buf, cm, cpi->output_pkt_list); - } -#endif } // Copy the previous Last Frame back into gf and and arf buffers if diff --git a/vp9/encoder/vp9_firstpass.h b/vp9/encoder/vp9_firstpass.h index ddfc87d894..cdcf568723 100644 --- a/vp9/encoder/vp9_firstpass.h +++ b/vp9/encoder/vp9_firstpass.h @@ -21,27 +21,6 @@ extern "C" { #endif -#if CONFIG_FP_MB_STATS - -#define FPMB_DCINTRA_MASK 0x01 - -#define FPMB_MOTION_ZERO_MASK 0x02 -#define FPMB_MOTION_LEFT_MASK 0x04 -#define FPMB_MOTION_RIGHT_MASK 0x08 -#define FPMB_MOTION_UP_MASK 0x10 -#define FPMB_MOTION_DOWN_MASK 0x20 - -#define FPMB_ERROR_SMALL_MASK 0x40 -#define FPMB_ERROR_LARGE_MASK 0x80 -#define FPMB_ERROR_SMALL_TH 2000 -#define FPMB_ERROR_LARGE_TH 48000 - -typedef struct { - uint8_t *mb_stats_start; - uint8_t *mb_stats_end; -} FIRSTPASS_MB_STATS; -#endif - #define INVALID_ROW (-1) #define MAX_ARF_LAYERS 6 @@ -188,12 +167,6 @@ typedef struct { double mb_av_energy; double mb_smooth_pct; -#if CONFIG_FP_MB_STATS - uint8_t *frame_mb_stats_buf; - uint8_t *this_frame_mb_stats; - FIRSTPASS_MB_STATS firstpass_mb_stats; -#endif - FP_MB_FLOAT_STATS *fp_mb_float_stats; // An indication of the content type of the current frame diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index 906f2b0b85..7d927ba80b 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -583,10 +583,6 @@ static vpx_codec_err_t set_encoder_config( vp9_set_first_pass_stats(oxcf, &cfg->rc_twopass_stats_in); -#if CONFIG_FP_MB_STATS - oxcf->firstpass_mb_stats_in = cfg->rc_firstpass_mb_stats_in; -#endif - oxcf->color_space = extra_cfg->color_space; oxcf->color_range = extra_cfg->color_range; oxcf->render_width = extra_cfg->render_width; @@ -2293,11 +2289,6 @@ void vp9_dump_encoder_config(const VP9EncoderConfig *oxcf, FILE *fp) { DUMP_STRUCT_VALUE(fp, oxcf, target_level); // TODO(angiebird): dump two_pass_stats_in - -#if CONFIG_FP_MB_STATS - // TODO(angiebird): dump firstpass_mb_stats_in -#endif - DUMP_STRUCT_VALUE(fp, oxcf, tuning); DUMP_STRUCT_VALUE(fp, oxcf, content); #if CONFIG_VP9_HIGHBITDEPTH diff --git a/vpxenc.c b/vpxenc.c index 276ee9b902..a0122ef804 100644 --- a/vpxenc.c +++ b/vpxenc.c @@ -114,10 +114,6 @@ static const arg_def_t pass_arg = ARG_DEF(NULL, "pass", 1, "Pass to execute (1/2)"); static const arg_def_t fpf_name = ARG_DEF(NULL, "fpf", 1, "First pass statistics file name"); -#if CONFIG_FP_MB_STATS -static const arg_def_t fpmbf_name = - ARG_DEF(NULL, "fpmbf", 1, "First pass block statistics file name"); -#endif static const arg_def_t limit = ARG_DEF(NULL, "limit", 1, "Stop encoding after n input frames"); static const arg_def_t skip = @@ -674,9 +670,6 @@ struct stream_config { struct vpx_codec_enc_cfg cfg; const char *out_fn; const char *stats_fn; -#if CONFIG_FP_MB_STATS - const char *fpmb_stats_fn; -#endif stereo_format_t stereo_fmt; int arg_ctrls[ARG_CTRL_CNT_MAX][2]; int arg_ctrl_cnt; @@ -704,9 +697,6 @@ struct stream_state { uint64_t cx_time; size_t nbytes; stats_io_t stats; -#if CONFIG_FP_MB_STATS - stats_io_t fpmb_stats; -#endif struct vpx_image *img; vpx_codec_ctx_t decoder; int mismatch_seen; @@ -943,10 +933,6 @@ static int parse_stream_params(struct VpxEncoderConfig *global, config->out_fn = arg.val; } else if (arg_match(&arg, &fpf_name, argi)) { config->stats_fn = arg.val; -#if CONFIG_FP_MB_STATS - } else if (arg_match(&arg, &fpmbf_name, argi)) { - config->fpmb_stats_fn = arg.val; -#endif } else if (arg_match(&arg, &use_webm, argi)) { #if CONFIG_WEBM_IO config->write_webm = 1; @@ -1169,17 +1155,6 @@ static void validate_stream_config(const struct stream_state *stream, fatal("Stream %d: duplicate stats file (from stream %d)", streami->index, stream->index); } - -#if CONFIG_FP_MB_STATS - /* Check for two streams sharing a mb stats file. */ - if (streami != stream) { - const char *a = stream->config.fpmb_stats_fn; - const char *b = streami->config.fpmb_stats_fn; - if (a && b && !strcmp(a, b)) - fatal("Stream %d: duplicate mb stats file (from stream %d)", - streami->index, stream->index); - } -#endif } } @@ -1338,26 +1313,11 @@ static void setup_pass(struct stream_state *stream, fatal("Failed to open statistics store"); } -#if CONFIG_FP_MB_STATS - if (stream->config.fpmb_stats_fn) { - if (!stats_open_file(&stream->fpmb_stats, stream->config.fpmb_stats_fn, - pass)) - fatal("Failed to open mb statistics store"); - } else { - if (!stats_open_mem(&stream->fpmb_stats, pass)) - fatal("Failed to open mb statistics store"); - } -#endif - stream->config.cfg.g_pass = global->passes == 2 ? pass ? VPX_RC_LAST_PASS : VPX_RC_FIRST_PASS : VPX_RC_ONE_PASS; if (pass) { stream->config.cfg.rc_twopass_stats_in = stats_get(&stream->stats); -#if CONFIG_FP_MB_STATS - stream->config.cfg.rc_firstpass_mb_stats_in = - stats_get(&stream->fpmb_stats); -#endif } stream->cx_time = 0; @@ -1569,13 +1529,6 @@ static void get_cx_data(struct stream_state *stream, pkt->data.twopass_stats.sz); stream->nbytes += pkt->data.raw.sz; break; -#if CONFIG_FP_MB_STATS - case VPX_CODEC_FPMB_STATS_PKT: - stats_write(&stream->fpmb_stats, pkt->data.firstpass_mb_stats.buf, - pkt->data.firstpass_mb_stats.sz); - stream->nbytes += pkt->data.raw.sz; - break; -#endif case VPX_CODEC_PSNR_PKT: if (global->show_psnr) { @@ -2069,10 +2022,6 @@ int main(int argc, const char **argv_) { FOREACH_STREAM(stats_close(&stream->stats, global.passes - 1)); -#if CONFIG_FP_MB_STATS - FOREACH_STREAM(stats_close(&stream->fpmb_stats, global.passes - 1)); -#endif - if (global.pass) break; } From fc04a9491ebaaa8e2b1c7c8e0587c8a1873531d6 Mon Sep 17 00:00:00 2001 From: Peter Kasting Date: Mon, 26 Jul 2021 03:57:55 -0700 Subject: [PATCH 0210/1000] Fix some instances of -Wunused-but-set-variable. Bug: chromium:1203071 Change-Id: Ieb628f95d676ba3814b5caf8a02a884330928c77 --- vp8/encoder/bitstream.c | 3 --- vp9/encoder/vp9_encodeframe.c | 18 +++--------------- vpx_ports/x86.h | 12 ++++++++++++ 3 files changed, 15 insertions(+), 18 deletions(-) diff --git a/vp8/encoder/bitstream.c b/vp8/encoder/bitstream.c index 80cbb882fd..87825fa6fe 100644 --- a/vp8/encoder/bitstream.c +++ b/vp8/encoder/bitstream.c @@ -866,7 +866,6 @@ void vp8_update_coef_probs(VP8_COMP *cpi) { #if !(CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING) vp8_writer *const w = cpi->bc; #endif - int savings = 0; vpx_clear_system_state(); @@ -940,8 +939,6 @@ void vp8_update_coef_probs(VP8_COMP *cpi) { #if !(CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING) vp8_write_literal(w, newp, 8); #endif - - savings += s; } } while (++t < ENTROPY_NODES); diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index f08300976e..131c4887f2 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -784,8 +784,8 @@ static void fill_variance_8x8avg(const uint8_t *s, int sp, const uint8_t *d, // Check if most of the superblock is skin content, and if so, force split to // 32x32, and set x->sb_is_skin for use in mode selection. -static int skin_sb_split(VP9_COMP *cpi, MACROBLOCK *x, const int low_res, - int mi_row, int mi_col, int *force_split) { +static int skin_sb_split(VP9_COMP *cpi, const int low_res, int mi_row, + int mi_col, int *force_split) { VP9_COMMON *const cm = &cpi->common; #if CONFIG_VP9_HIGHBITDEPTH if (cm->use_highbitdepth) return 0; @@ -797,11 +797,6 @@ static int skin_sb_split(VP9_COMP *cpi, MACROBLOCK *x, const int low_res, mi_row + 8 < cm->mi_rows)) { int num_16x16_skin = 0; int num_16x16_nonskin = 0; - uint8_t *ysignal = x->plane[0].src.buf; - uint8_t *usignal = x->plane[1].src.buf; - uint8_t *vsignal = x->plane[2].src.buf; - int sp = x->plane[0].src.stride; - int spuv = x->plane[1].src.stride; const int block_index = mi_row * cm->mi_cols + mi_col; const int bw = num_8x8_blocks_wide_lookup[BLOCK_64X64]; const int bh = num_8x8_blocks_high_lookup[BLOCK_64X64]; @@ -820,13 +815,7 @@ static int skin_sb_split(VP9_COMP *cpi, MACROBLOCK *x, const int low_res, i = ymis; break; } - ysignal += 16; - usignal += 8; - vsignal += 8; } - ysignal += (sp << 4) - 64; - usignal += (spuv << 3) - 32; - vsignal += (spuv << 3) - 32; } if (num_16x16_skin > 12) { *force_split = 1; @@ -1503,8 +1492,7 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, vp9_build_inter_predictors_sb(xd, mi_row, mi_col, BLOCK_64X64); if (cpi->use_skin_detection) - x->sb_is_skin = - skin_sb_split(cpi, x, low_res, mi_row, mi_col, force_split); + x->sb_is_skin = skin_sb_split(cpi, low_res, mi_row, mi_col, force_split); d = xd->plane[0].dst.buf; dp = xd->plane[0].dst.stride; diff --git a/vpx_ports/x86.h b/vpx_ports/x86.h index 14f4344495..ad3da84aca 100644 --- a/vpx_ports/x86.h +++ b/vpx_ports/x86.h @@ -223,6 +223,8 @@ static INLINE int x86_simd_caps(void) { } } + (void)reg_eax; // Avoid compiler warning on unused-but-set variable. + return flags & mask; } @@ -307,6 +309,11 @@ static INLINE unsigned int x86_readtscp(void) { static INLINE unsigned int x86_tsc_start(void) { unsigned int reg_eax, reg_ebx, reg_ecx, reg_edx; cpuid(0, 0, reg_eax, reg_ebx, reg_ecx, reg_edx); + // Avoid compiler warnings on unused-but-set variables. + (void)reg_eax; + (void)reg_ebx; + (void)reg_ecx; + (void)reg_edx; return x86_readtsc(); } @@ -314,6 +321,11 @@ static INLINE unsigned int x86_tsc_end(void) { uint32_t v = x86_readtscp(); unsigned int reg_eax, reg_ebx, reg_ecx, reg_edx; cpuid(0, 0, reg_eax, reg_ebx, reg_ecx, reg_edx); + // Avoid compiler warnings on unused-but-set variables. + (void)reg_eax; + (void)reg_ebx; + (void)reg_ecx; + (void)reg_edx; return v; } From 0d1aec7373b6e43825281f0a4f9d40df77323e0a Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 26 Jul 2021 16:52:56 -0700 Subject: [PATCH 0211/1000] vpx_ports/x86.h: sync with aom_ports/x86.h adds a few comments and makes the file ascii: 854b2766a Replace non-ASCII characters Change-Id: I6c2d76b293158bcad9f1ded7a91a81bda1e700fb --- vpx_ports/x86.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vpx_ports/x86.h b/vpx_ports/x86.h index ad3da84aca..4d5391b78d 100644 --- a/vpx_ports/x86.h +++ b/vpx_ports/x86.h @@ -242,7 +242,7 @@ static INLINE int x86_simd_caps(void) { // x86_readtsc directly, but prevent the CPU's out-of-order execution from // affecting the measurement (by having earlier/later instructions be evaluated // in the time interval). See the white paper, "How to Benchmark Code -// Execution Times on Intel® IA-32 and IA-64 Instruction Set Architectures" by +// Execution Times on Intel(R) IA-32 and IA-64 Instruction Set Architectures" by // Gabriele Paoloni for more information. // // If you are timing a large function (CPU time > a couple of seconds), use @@ -308,6 +308,7 @@ static INLINE unsigned int x86_readtscp(void) { static INLINE unsigned int x86_tsc_start(void) { unsigned int reg_eax, reg_ebx, reg_ecx, reg_edx; + // This call should not be removed. See function notes above. cpuid(0, 0, reg_eax, reg_ebx, reg_ecx, reg_edx); // Avoid compiler warnings on unused-but-set variables. (void)reg_eax; @@ -320,6 +321,7 @@ static INLINE unsigned int x86_tsc_start(void) { static INLINE unsigned int x86_tsc_end(void) { uint32_t v = x86_readtscp(); unsigned int reg_eax, reg_ebx, reg_ecx, reg_edx; + // This call should not be removed. See function notes above. cpuid(0, 0, reg_eax, reg_ebx, reg_ecx, reg_edx); // Avoid compiler warnings on unused-but-set variables. (void)reg_eax; From f685d508da549b0eccfc455c04e2b6fbc3eeb251 Mon Sep 17 00:00:00 2001 From: Hirokazu Honda Date: Fri, 30 Jul 2021 02:42:35 +0900 Subject: [PATCH 0212/1000] vp9 rc: Fills VP9_COMP zero at initialization Change-Id: Ib1a544ce87e8fdbe23c0e54b6426ee228011b126 --- vp9/ratectrl_rtc.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vp9/ratectrl_rtc.cc b/vp9/ratectrl_rtc.cc index 0f56e67e80..6446120f5b 100644 --- a/vp9/ratectrl_rtc.cc +++ b/vp9/ratectrl_rtc.cc @@ -25,9 +25,9 @@ std::unique_ptr VP9RateControlRTC::Create( VP9RateControlRTC()); if (!rc_api) return nullptr; rc_api->cpi_ = static_cast(vpx_memalign(32, sizeof(*cpi_))); - if (rc_api->cpi_ == nullptr) { - return nullptr; - } + if (!rc_api->cpi_) return nullptr; + vp9_zero(*rc_api->cpi_); + rc_api->InitRateControl(cfg); if (cfg.aq_mode) { VP9_COMP *const cpi = rc_api->cpi_; From 59c9e1d87ef33bc82fca82cfcf5202d4b86c92e7 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Tue, 24 Aug 2021 14:30:54 -0700 Subject: [PATCH 0213/1000] vp9 rc lib: Allow aq 3 to work for SVC with unit test Also use round to cast float to int with more accurate calculation to avoid error accumulation which causes qp to be different after ~290 frames. Change-Id: Iff65a8fdc67401814fd253dbf148afe9887df97f --- test/ratectrl_rtc_test.cc | 4 +++- vp9/encoder/vp9_ratectrl.c | 4 ++-- vp9/ratectrl_rtc.cc | 2 +- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/test/ratectrl_rtc_test.cc b/test/ratectrl_rtc_test.cc index 8136bd8b93..22bc5ecf74 100644 --- a/test/ratectrl_rtc_test.cc +++ b/test/ratectrl_rtc_test.cc @@ -270,6 +270,7 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest, for (int i = 0; i < VPX_MAX_LAYERS; ++i) { svc_params_.max_quantizers[i] = 56; svc_params_.min_quantizers[i] = 2; + svc_params_.speed_per_layer[i] = 7; } cfg_.rc_end_usage = VPX_CBR; cfg_.g_lag_in_frames = 0; @@ -318,6 +319,7 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest, rc_cfg_.ss_number_layers = 3; rc_cfg_.ts_number_layers = 3; rc_cfg_.rc_mode = VPX_CBR; + rc_cfg_.aq_mode = aq_mode_; rc_cfg_.scaling_factor_num[0] = 1; rc_cfg_.scaling_factor_den[0] = 4; @@ -367,7 +369,7 @@ TEST_P(RcInterfaceSvcTest, Svc) { RunSvc(); } VP9_INSTANTIATE_TEST_SUITE(RcInterfaceTest, ::testing::Values(0, 3), ::testing::Values(VPX_CBR, VPX_VBR)); -VP9_INSTANTIATE_TEST_SUITE(RcInterfaceSvcTest, ::testing::Values(0)); +VP9_INSTANTIATE_TEST_SUITE(RcInterfaceSvcTest, ::testing::Values(0, 3)); } // namespace int main(int argc, char **argv) { diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index d0d83a8342..e38464c72c 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -277,9 +277,9 @@ static void update_buffer_level_svc_preencode(VP9_COMP *cpi) { svc->current_superframe > 0) { // TODO(marpan): This may need to be modified for temporal layers. const double framerate_pts = 10000000.0 / ts_delta; - lrc->bits_off_target += (int)(lc->target_bandwidth / framerate_pts); + lrc->bits_off_target += (int)round(lc->target_bandwidth / framerate_pts); } else { - lrc->bits_off_target += (int)(lc->target_bandwidth / lc->framerate); + lrc->bits_off_target += (int)round(lc->target_bandwidth / lc->framerate); } // Clip buffer level to maximum buffer size for the layer. lrc->bits_off_target = diff --git a/vp9/ratectrl_rtc.cc b/vp9/ratectrl_rtc.cc index 6446120f5b..76ff367c06 100644 --- a/vp9/ratectrl_rtc.cc +++ b/vp9/ratectrl_rtc.cc @@ -105,7 +105,7 @@ void VP9RateControlRTC::UpdateRateControl( cpi_->framerate = rc_cfg.framerate; cpi_->svc.number_spatial_layers = rc_cfg.ss_number_layers; cpi_->svc.number_temporal_layers = rc_cfg.ts_number_layers; - + vp9_set_mb_mi(cm, cm->width, cm->height); for (int sl = 0; sl < cpi_->svc.number_spatial_layers; ++sl) { for (int tl = 0; tl < cpi_->svc.number_temporal_layers; ++tl) { const int layer = From ee73384f0304c7e8a84a214ddc8863d40fe716ad Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Thu, 2 Sep 2021 16:15:13 -0700 Subject: [PATCH 0214/1000] Add codec control for vp8 external rc disable cyclic refresh Change-Id: I7905602919d5780831fad840577e97730ce0afc2 --- vp8/vp8_cx_iface.c | 11 +++++++++++ vpx/vp8cx.h | 14 ++++++++++++++ 2 files changed, 25 insertions(+) diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c index 78631e7976..20e18aee78 100644 --- a/vp8/vp8_cx_iface.c +++ b/vp8/vp8_cx_iface.c @@ -605,6 +605,16 @@ static vpx_codec_err_t set_screen_content_mode(vpx_codec_alg_priv_t *ctx, return update_extracfg(ctx, &extra_cfg); } +static vpx_codec_err_t ctrl_set_rtc_external_ratectrl(vpx_codec_alg_priv_t *ctx, + va_list args) { + VP8_COMP *cpi = ctx->cpi; + const unsigned int data = CAST(VP8E_SET_GF_CBR_BOOST_PCT, args); + if (data) { + cpi->cyclic_refresh_mode_enabled = 0; + } + return VPX_CODEC_OK; +} + static vpx_codec_err_t vp8e_mr_alloc_mem(const vpx_codec_enc_cfg_t *cfg, void **mem_loc) { vpx_codec_err_t res = VPX_CODEC_OK; @@ -1243,6 +1253,7 @@ static vpx_codec_ctrl_fn_map_t vp8e_ctf_maps[] = { { VP8E_SET_MAX_INTRA_BITRATE_PCT, set_rc_max_intra_bitrate_pct }, { VP8E_SET_SCREEN_CONTENT_MODE, set_screen_content_mode }, { VP8E_SET_GF_CBR_BOOST_PCT, ctrl_set_rc_gf_cbr_boost_pct }, + { VP8E_SET_RTC_EXTERNAL_RATECTRL, ctrl_set_rtc_external_ratectrl }, { -1, NULL }, }; diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h index 7d0dee0b78..17afac754b 100644 --- a/vpx/vp8cx.h +++ b/vpx/vp8cx.h @@ -742,6 +742,17 @@ enum vp8e_enc_control_id { * Supported in codecs: VP9 */ VP9E_GET_LAST_QUANTIZER_SVC_LAYERS, + + /*!\brief Codec control to disable internal features in rate control. + * + * This will turn off cyclic refresh for vp8. + * + * With those, the rate control is expected to work exactly the same as the + * interface provided in vp8_ratectrl_rtc.cc/h + * + * Supported in codecs: VP8 + */ + VP8E_SET_RTC_EXTERNAL_RATECTRL, }; /*!\brief vpx 1-D scaling mode @@ -1107,6 +1118,9 @@ VPX_CTRL_USE_TYPE(VP9E_SET_DISABLE_LOOPFILTER, int) VPX_CTRL_USE_TYPE(VP9E_SET_RTC_EXTERNAL_RATECTRL, int) #define VPX_CTRL_VP9E_SET_RTC_EXTERNAL_RATECTRL +VPX_CTRL_USE_TYPE(VP8E_SET_RTC_EXTERNAL_RATECTRL, int) +#define VPX_CTRL_VP8E_SET_RTC_EXTERNAL_RATECTRL + VPX_CTRL_USE_TYPE(VP9E_SET_EXTERNAL_RATE_CONTROL, vpx_rc_funcs_t *) #define VPX_CTRL_VP9E_SET_EXTERNAL_RATE_CONTROL From ca40ca9bed87687eb0b534bf3974c95182dd29a1 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Wed, 8 Sep 2021 16:52:51 -0700 Subject: [PATCH 0215/1000] vp8 rc: always update correction factor Change-Id: Id40b9cb5a85a15fb313a2a93f14f6768259f7c15 --- vp8/encoder/onyx_if.c | 4 +++- vp8/encoder/onyx_int.h | 4 ++++ vp8/vp8_cx_iface.c | 1 + 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index 71ef057a4a..57c94071b0 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -1910,6 +1910,7 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) { cpi->force_maxqp = 0; cpi->frames_since_last_drop_overshoot = 0; + cpi->rt_always_update_correction_factor = 0; cpi->b_calculate_psnr = CONFIG_INTERNAL_STATS; #if CONFIG_INTERNAL_STATS @@ -4445,7 +4446,8 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, } } - if (!active_worst_qchanged) vp8_update_rate_correction_factors(cpi, 2); + if (cpi->rt_always_update_correction_factor || !active_worst_qchanged) + vp8_update_rate_correction_factors(cpi, 2); cpi->last_q[cm->frame_type] = cm->base_qindex; diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h index b96f9b1dc5..a29994a135 100644 --- a/vp8/encoder/onyx_int.h +++ b/vp8/encoder/onyx_int.h @@ -702,6 +702,10 @@ typedef struct VP8_COMP { int use_roi_static_threshold; int ext_refresh_frame_flags_pending; + + // Always update correction factor used for rate control after each frame for + // realtime encoding. + int rt_always_update_correction_factor; } VP8_COMP; void vp8_initialize_enc(void); diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c index 20e18aee78..893b7a5132 100644 --- a/vp8/vp8_cx_iface.c +++ b/vp8/vp8_cx_iface.c @@ -611,6 +611,7 @@ static vpx_codec_err_t ctrl_set_rtc_external_ratectrl(vpx_codec_alg_priv_t *ctx, const unsigned int data = CAST(VP8E_SET_GF_CBR_BOOST_PCT, args); if (data) { cpi->cyclic_refresh_mode_enabled = 0; + cpi->rt_always_update_correction_factor = 1; } return VPX_CODEC_OK; } From 65a1751e5b98bf7f1d21bcbfdef352af34fb205d Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Tue, 31 Aug 2021 10:22:22 -0700 Subject: [PATCH 0216/1000] Add vp8 support to rc lib For 1 layer CBR only. Support for temporal layers comes later. Rename the library to libvpxrc Bug: b/188853141 Change-Id: Ib7f977b64c05b1a0596870cb7f8e6768cb483850 --- libs.mk | 54 ++-- test/test.mk | 8 +- test/test_rc_interface.cc | 6 + test/vp8_ratectrl_rtc_test.cc | 180 +++++++++++ ...l_rtc_test.cc => vp9_ratectrl_rtc_test.cc} | 5 - vp8/encoder/onyx_if.c | 6 +- vp8/encoder/ratectrl.c | 3 +- vp8/vp8_ratectrl_rtc.cc | 288 ++++++++++++++++++ vp8/vp8_ratectrl_rtc.h | 62 ++++ vp9/ratectrl_rtc.h | 41 +-- vpx/internal/vpx_ratectrl_rtc.h | 62 ++++ vpx/vp8cx.h | 2 +- vpx/vpx_codec.mk | 1 + 13 files changed, 646 insertions(+), 72 deletions(-) create mode 100644 test/test_rc_interface.cc create mode 100644 test/vp8_ratectrl_rtc_test.cc rename test/{ratectrl_rtc_test.cc => vp9_ratectrl_rtc_test.cc} (99%) create mode 100644 vp8/vp8_ratectrl_rtc.cc create mode 100644 vp8/vp8_ratectrl_rtc.h create mode 100644 vpx/internal/vpx_ratectrl_rtc.h diff --git a/libs.mk b/libs.mk index f5b43abadc..d4763efca0 100644 --- a/libs.mk +++ b/libs.mk @@ -94,15 +94,28 @@ ifeq ($(CONFIG_VP9_ENCODER),yes) INSTALL_MAPS += include/vpx/% $(SRC_PATH_BARE)/$(VP9_PREFIX)/% CODEC_DOC_SRCS += vpx/vp8.h vpx/vp8cx.h vpx/vpx_ext_ratectrl.h CODEC_DOC_SECTIONS += vp9 vp9_encoder +endif - RC_RTC_SRCS := $(addprefix $(VP9_PREFIX),$(call enabled,VP9_CX_SRCS)) - RC_RTC_SRCS += $(VP9_PREFIX)vp9cx.mk vpx/vp8.h vpx/vp8cx.h - RC_RTC_SRCS += vpx/vpx_ext_ratectrl.h +RC_RTC_SRCS := vpx/vp8.h vpx/vp8cx.h +RC_RTC_SRCS += vpx/vpx_ext_ratectrl.h +RC_RTC_SRCS += vpx/internal/vpx_ratectrl_rtc.h +ifeq ($(CONFIG_VP9_ENCODER),yes) + VP9_PREFIX=vp9/ + RC_RTC_SRCS += $(addprefix $(VP9_PREFIX),$(call enabled,VP9_CX_SRCS)) + RC_RTC_SRCS += $(VP9_PREFIX)vp9cx.mk RC_RTC_SRCS += $(VP9_PREFIX)ratectrl_rtc.cc RC_RTC_SRCS += $(VP9_PREFIX)ratectrl_rtc.h INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(VP9_PREFIX)ratectrl_rtc.cc INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(VP9_PREFIX)ratectrl_rtc.h endif +ifeq ($(CONFIG_VP8_ENCODER),yes) + VP8_PREFIX=vp8/ + RC_RTC_SRCS += $(addprefix $(VP8_PREFIX),$(call enabled,VP8_CX_SRCS)) + RC_RTC_SRCS += $(VP8_PREFIX)vp8_ratectrl_rtc.cc + RC_RTC_SRCS += $(VP8_PREFIX)vp8_ratectrl_rtc.h + INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(VP8_PREFIX)vp8_ratectrl_rtc.cc + INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(VP8_PREFIX)vp8_ratectrl_rtc.h +endif ifeq ($(CONFIG_VP9_DECODER),yes) VP9_PREFIX=vp9/ @@ -126,7 +139,7 @@ endif ifeq ($(CONFIG_MSVS),yes) CODEC_LIB=$(if $(CONFIG_STATIC_MSVCRT),vpxmt,vpxmd) GTEST_LIB=$(if $(CONFIG_STATIC_MSVCRT),gtestmt,gtestmd) -RC_RTC_LIB=$(if $(CONFIG_STATIC_MSVCRT),vp9rcmt,vp9rcmd) +RC_RTC_LIB=$(if $(CONFIG_STATIC_MSVCRT),vpxrcmt,vpxrcmd) # This variable uses deferred expansion intentionally, since the results of # $(wildcard) may change during the course of the Make. VS_PLATFORMS = $(foreach d,$(wildcard */Release/$(CODEC_LIB).lib),$(word 1,$(subst /, ,$(d)))) @@ -249,16 +262,16 @@ PROJECTS-yes += vpx.$(VCPROJ_SFX) vpx.$(VCPROJ_SFX): vpx_config.asm vpx.$(VCPROJ_SFX): $(RTCD) -vp9rc.$(VCPROJ_SFX): \ +vpxrc.$(VCPROJ_SFX): \ VCPROJ_SRCS=$(filter-out $(addprefix %, $(ASM_INCLUDES)), $^) -vp9rc.$(VCPROJ_SFX): $(RC_RTC_SRCS) +vpxrc.$(VCPROJ_SFX): $(RC_RTC_SRCS) @echo " [CREATE] $@" $(qexec)$(GEN_VCPROJ) \ $(if $(CONFIG_SHARED),--dll,--lib) \ --target=$(TOOLCHAIN) \ $(if $(CONFIG_STATIC_MSVCRT),--static-crt) \ - --name=vp9rc \ + --name=vpxrc \ --proj-guid=C26FF952-9494-4838-9A3F-7F3D4F613385 \ --ver=$(CONFIG_VS_VERSION) \ --src-path-bare="$(SRC_PATH_BARE)" \ @@ -275,10 +288,10 @@ vp9rc.$(VCPROJ_SFX): $(RC_RTC_SRCS) $(VCPROJ_SRCS)) \ --src-path-bare="$(SRC_PATH_BARE)" \ -PROJECTS-yes += vp9rc.$(VCPROJ_SFX) +PROJECTS-yes += vpxrc.$(VCPROJ_SFX) -vp9rc.$(VCPROJ_SFX): vpx_config.asm -vp9rc.$(VCPROJ_SFX): $(RTCD) +vpxrc.$(VCPROJ_SFX): vpx_config.asm +vpxrc.$(VCPROJ_SFX): $(RTCD) endif # ifeq ($(CONFIG_MSVS),yes) else # ifeq ($(CONFIG_EXTERNAL_BUILD),yes) @@ -398,12 +411,11 @@ INSTALL-LIBS-yes += $(LIBSUBDIR)/pkgconfig/vpx.pc INSTALL_MAPS += $(LIBSUBDIR)/pkgconfig/%.pc %.pc CLEAN-OBJS += vpx.pc -ifeq ($(CONFIG_VP9_ENCODER),yes) - RC_RTC_OBJS=$(call objs,$(RC_RTC_SRCS)) +ifeq ($(CONFIG_ENCODERS),yes) RC_RTC_OBJS=$(call objs,$(RC_RTC_SRCS)) OBJS-yes += $(RC_RTC_OBJS) - LIBS-yes += $(BUILD_PFX)libvp9rc.a $(BUILD_PFX)libvp9rc_g.a - $(BUILD_PFX)libvp9rc_g.a: $(RC_RTC_OBJS) + LIBS-yes += $(BUILD_PFX)libvpxrc.a $(BUILD_PFX)libvpxrc_g.a + $(BUILD_PFX)libvpxrc_g.a: $(RC_RTC_OBJS) endif ifeq ($(CONFIG_VP9_ENCODER)$(CONFIG_RATE_CTRL),yesyes) @@ -493,7 +505,7 @@ TEST_INTRA_PRED_SPEED_SRCS=$(call addprefix_clean,test/,\ $(call enabled,TEST_INTRA_PRED_SPEED_SRCS)) TEST_INTRA_PRED_SPEED_OBJS := $(sort $(call objs,$(TEST_INTRA_PRED_SPEED_SRCS))) -ifeq ($(CONFIG_VP9_ENCODER),yes) +ifeq ($(CONFIG_ENCODERS),yes) RC_INTERFACE_TEST_BIN=./test_rc_interface$(EXE_SFX) RC_INTERFACE_TEST_SRCS=$(call addprefix_clean,test/,\ $(call enabled,RC_INTERFACE_TEST_SRCS)) @@ -599,11 +611,11 @@ test_intra_pred_speed.$(VCPROJ_SFX): $(TEST_INTRA_PRED_SPEED_SRCS) vpx.$(VCPROJ_ -L. -l$(CODEC_LIB) -l$(GTEST_LIB) $^ endif # TEST_INTRA_PRED_SPEED -ifeq ($(CONFIG_VP9_ENCODER),yes) +ifeq ($(CONFIG_ENCODERS),yes) ifneq ($(strip $(RC_INTERFACE_TEST_OBJS)),) PROJECTS-$(CONFIG_MSVS) += test_rc_interface.$(VCPROJ_SFX) test_rc_interface.$(VCPROJ_SFX): $(RC_INTERFACE_TEST_SRCS) vpx.$(VCPROJ_SFX) \ - vp9rc.$(VCPROJ_SFX) gtest.$(VCPROJ_SFX) + vpxrc.$(VCPROJ_SFX) gtest.$(VCPROJ_SFX) @echo " [CREATE] $@" $(qexec)$(GEN_VCPROJ) \ --exe \ @@ -661,19 +673,19 @@ $(eval $(call linkerxx_template,$(TEST_INTRA_PRED_SPEED_BIN), \ -L. -lvpx -lgtest $(extralibs) -lm)) endif # TEST_INTRA_PRED_SPEED -ifeq ($(CONFIG_VP9_ENCODER),yes) +ifeq ($(CONFIG_ENCODERS),yes) ifneq ($(strip $(RC_INTERFACE_TEST_OBJS)),) $(RC_INTERFACE_TEST_OBJS) $(RC_INTERFACE_TEST_OBJS:.o=.d): \ CXXFLAGS += $(GTEST_INCLUDES) OBJS-yes += $(RC_INTERFACE_TEST_OBJS) BINS-yes += $(RC_INTERFACE_TEST_BIN) -$(RC_INTERFACE_TEST_BIN): $(TEST_LIBS) libvp9rc.a +$(RC_INTERFACE_TEST_BIN): $(TEST_LIBS) libvpxrc.a $(eval $(call linkerxx_template,$(RC_INTERFACE_TEST_BIN), \ $(RC_INTERFACE_TEST_OBJS) \ - -L. -lvpx -lgtest -lvp9rc $(extralibs) -lm)) + -L. -lvpx -lgtest -lvpxrc $(extralibs) -lm)) endif # RC_INTERFACE_TEST -endif # CONFIG_VP9_ENCODER +endif # CONFIG_ENCODERS ifneq ($(strip $(SIMPLE_ENCODE_TEST_OBJS)),) $(SIMPLE_ENCODE_TEST_OBJS) $(SIMPLE_ENCODE_TEST_OBJS:.o=.d): \ diff --git a/test/test.mk b/test/test.mk index 11228ecdda..41dfd5d835 100644 --- a/test/test.mk +++ b/test/test.mk @@ -213,9 +213,11 @@ endif TEST_INTRA_PRED_SPEED_SRCS-yes := test_intra_pred_speed.cc TEST_INTRA_PRED_SPEED_SRCS-yes += ../md5_utils.h ../md5_utils.c -RC_INTERFACE_TEST_SRCS-$(CONFIG_VP9_ENCODER) := ratectrl_rtc_test.cc -RC_INTERFACE_TEST_SRCS-$(CONFIG_VP9_ENCODER) += encode_test_driver.cc -RC_INTERFACE_TEST_SRCS-$(CONFIG_VP9_ENCODER) += encode_test_driver.h +RC_INTERFACE_TEST_SRCS-yes := test_rc_interface.cc +RC_INTERFACE_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_ratectrl_rtc_test.cc +RC_INTERFACE_TEST_SRCS-$(CONFIG_VP8_ENCODER) += vp8_ratectrl_rtc_test.cc +RC_INTERFACE_TEST_SRCS-$(CONFIG_ENCODERS) += encode_test_driver.cc +RC_INTERFACE_TEST_SRCS-$(CONFIG_ENCODERS) += encode_test_driver.h RC_INTERFACE_TEST_SRCS-yes += decode_test_driver.cc RC_INTERFACE_TEST_SRCS-yes += decode_test_driver.h RC_INTERFACE_TEST_SRCS-yes += codec_factory.h diff --git a/test/test_rc_interface.cc b/test/test_rc_interface.cc new file mode 100644 index 0000000000..ec75700f73 --- /dev/null +++ b/test/test_rc_interface.cc @@ -0,0 +1,6 @@ +#include "third_party/googletest/src/include/gtest/gtest.h" + +int main(int argc, char **argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/test/vp8_ratectrl_rtc_test.cc b/test/vp8_ratectrl_rtc_test.cc new file mode 100644 index 0000000000..d5032b38e7 --- /dev/null +++ b/test/vp8_ratectrl_rtc_test.cc @@ -0,0 +1,180 @@ +/* + * Copyright (c) 2021 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include // NOLINT +#include + +#include "./vpx_config.h" +#include "third_party/googletest/src/include/gtest/gtest.h" +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/i420_video_source.h" +#include "test/util.h" +#include "test/video_source.h" +#include "vp8/vp8_ratectrl_rtc.h" +#include "vpx/vpx_codec.h" +#include "vpx_ports/bitops.h" + +namespace { + +struct Vp8RCTestVideo { + Vp8RCTestVideo() {} + Vp8RCTestVideo(const char *name_, int width_, int height_, + unsigned int frames_) + : name(name_), width(width_), height(height_), frames(frames_) {} + + friend std::ostream &operator<<(std::ostream &os, + const Vp8RCTestVideo &video) { + os << video.name << " " << video.width << " " << video.height << " " + << video.frames; + return os; + } + const char *name; + int width; + int height; + unsigned int frames; +}; + +const Vp8RCTestVideo kVp8RCTestVectors[] = { + Vp8RCTestVideo("niklas_640_480_30.yuv", 640, 480, 470), + Vp8RCTestVideo("desktop_office1.1280_720-020.yuv", 1280, 720, 300), +}; + +class Vp8RcInterfaceTest + : public ::libvpx_test::EncoderTest, + public ::libvpx_test::CodecTestWith2Params { + public: + Vp8RcInterfaceTest() + : EncoderTest(GET_PARAM(0)), key_interval_(3000), encoder_exit_(false) {} + virtual ~Vp8RcInterfaceTest() {} + + protected: + virtual void SetUp() { + InitializeConfig(); + SetMode(::libvpx_test::kRealTime); + } + + virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) { + if (video->frame() == 0) { + encoder->Control(VP8E_SET_CPUUSED, -6); + encoder->Control(VP8E_SET_RTC_EXTERNAL_RATECTRL, 1); + encoder->Control(VP8E_SET_MAX_INTRA_BITRATE_PCT, 1000); + } + frame_params_.frame_type = + video->frame() % key_interval_ == 0 ? KEY_FRAME : INTER_FRAME; + if (frame_params_.frame_type == INTER_FRAME) { + // Disable golden frame update. + frame_flags_ |= VP8_EFLAG_NO_UPD_GF; + frame_flags_ |= VP8_EFLAG_NO_UPD_ARF; + } + encoder_exit_ = video->frame() == test_video_.frames; + } + + virtual void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) { + if (encoder_exit_) { + return; + } + int qp; + encoder->Control(VP8E_GET_LAST_QUANTIZER, &qp); + rc_api_->ComputeQP(frame_params_); + ASSERT_EQ(rc_api_->GetQP(), qp); + } + + virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + rc_api_->PostEncodeUpdate(pkt->data.frame.sz); + } + + void RunOneLayer() { + test_video_ = GET_PARAM(2); + target_bitrate_ = GET_PARAM(1); + if (test_video_.width == 1280 && target_bitrate_ == 200) return; + if (test_video_.width == 640 && target_bitrate_ == 1000) return; + SetConfig(); + rc_api_ = libvpx::VP8RateControlRTC::Create(rc_cfg_); + rc_api_->UpdateRateControl(rc_cfg_); + + ::libvpx_test::I420VideoSource video(test_video_.name, test_video_.width, + test_video_.height, 30, 1, 0, + test_video_.frames); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + } + + void RunPeriodicKey() { + test_video_ = GET_PARAM(2); + target_bitrate_ = GET_PARAM(1); + if (test_video_.width == 1280 && target_bitrate_ == 200) return; + if (test_video_.width == 640 && target_bitrate_ == 1000) return; + key_interval_ = 100; + SetConfig(); + rc_api_ = libvpx::VP8RateControlRTC::Create(rc_cfg_); + rc_api_->UpdateRateControl(rc_cfg_); + + ::libvpx_test::I420VideoSource video(test_video_.name, test_video_.width, + test_video_.height, 30, 1, 0, + test_video_.frames); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + } + + private: + void SetConfig() { + rc_cfg_.width = test_video_.width; + rc_cfg_.height = test_video_.height; + rc_cfg_.max_quantizer = 60; + rc_cfg_.min_quantizer = 2; + rc_cfg_.target_bandwidth = target_bitrate_; + rc_cfg_.buf_initial_sz = 600; + rc_cfg_.buf_optimal_sz = 600; + rc_cfg_.buf_sz = target_bitrate_; + rc_cfg_.undershoot_pct = 50; + rc_cfg_.overshoot_pct = 50; + rc_cfg_.max_intra_bitrate_pct = 1000; + rc_cfg_.framerate = 30.0; + rc_cfg_.layer_target_bitrate[0] = target_bitrate_; + + // Encoder settings for ground truth. + cfg_.g_w = test_video_.width; + cfg_.g_h = test_video_.height; + cfg_.rc_undershoot_pct = 50; + cfg_.rc_overshoot_pct = 50; + cfg_.rc_buf_initial_sz = 600; + cfg_.rc_buf_optimal_sz = 600; + cfg_.rc_buf_sz = target_bitrate_; + cfg_.rc_dropframe_thresh = 0; + cfg_.rc_min_quantizer = 2; + cfg_.rc_max_quantizer = 60; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + cfg_.g_error_resilient = 1; + cfg_.rc_target_bitrate = target_bitrate_; + cfg_.kf_min_dist = key_interval_; + cfg_.kf_max_dist = key_interval_; + } + + std::unique_ptr rc_api_; + libvpx::VP8RateControlRtcConfig rc_cfg_; + int key_interval_; + int target_bitrate_; + Vp8RCTestVideo test_video_; + libvpx::VP8FrameParamsQpRTC frame_params_; + bool encoder_exit_; +}; + +TEST_P(Vp8RcInterfaceTest, OneLayer) { RunOneLayer(); } + +TEST_P(Vp8RcInterfaceTest, OneLayerPeriodicKey) { RunPeriodicKey(); } + +VP8_INSTANTIATE_TEST_SUITE(Vp8RcInterfaceTest, + ::testing::Values(200, 400, 1000), + ::testing::ValuesIn(kVp8RCTestVectors)); + +} // namespace diff --git a/test/ratectrl_rtc_test.cc b/test/vp9_ratectrl_rtc_test.cc similarity index 99% rename from test/ratectrl_rtc_test.cc rename to test/vp9_ratectrl_rtc_test.cc index 22bc5ecf74..b09a45bb76 100644 --- a/test/ratectrl_rtc_test.cc +++ b/test/vp9_ratectrl_rtc_test.cc @@ -371,8 +371,3 @@ VP9_INSTANTIATE_TEST_SUITE(RcInterfaceTest, ::testing::Values(0, 3), ::testing::Values(VPX_CBR, VPX_VBR)); VP9_INSTANTIATE_TEST_SUITE(RcInterfaceSvcTest, ::testing::Values(0, 3)); } // namespace - -int main(int argc, char **argv) { - ::testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index 57c94071b0..fc154afd14 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -4017,7 +4017,8 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, if (frame_over_shoot_limit == 0) frame_over_shoot_limit = 1; /* Are we are overshooting and up against the limit of active max Q. */ - if (((cpi->pass != 2) || + if (!cpi->rt_always_update_correction_factor && + ((cpi->pass != 2) || (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)) && (Q == cpi->active_worst_quality) && (cpi->active_worst_quality < cpi->worst_quality) && @@ -4446,8 +4447,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, } } - if (cpi->rt_always_update_correction_factor || !active_worst_qchanged) - vp8_update_rate_correction_factors(cpi, 2); + if (!active_worst_qchanged) vp8_update_rate_correction_factors(cpi, 2); cpi->last_q[cm->frame_type] = cm->base_qindex; diff --git a/vp8/encoder/ratectrl.c b/vp8/encoder/ratectrl.c index d2b8dff06a..4b76cc6429 100644 --- a/vp8/encoder/ratectrl.c +++ b/vp8/encoder/ratectrl.c @@ -327,7 +327,8 @@ static void calc_iframe_target_size(VP8_COMP *cpi) { int initial_boost = 32; /* |3.0 * per_frame_bandwidth| */ /* Boost depends somewhat on frame rate: only used for 1 layer case. */ if (cpi->oxcf.number_of_layers == 1) { - kf_boost = VPXMAX(initial_boost, (int)(2 * cpi->output_framerate - 16)); + kf_boost = + VPXMAX(initial_boost, (int)round(2 * cpi->output_framerate - 16)); } else { /* Initial factor: set target size to: |3.0 * per_frame_bandwidth|. */ kf_boost = initial_boost; diff --git a/vp8/vp8_ratectrl_rtc.cc b/vp8/vp8_ratectrl_rtc.cc new file mode 100644 index 0000000000..c42ab971e2 --- /dev/null +++ b/vp8/vp8_ratectrl_rtc.cc @@ -0,0 +1,288 @@ +/* + * Copyright (c) 2021 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include "vp8/vp8_ratectrl_rtc.h" +#include "vp8/encoder/ratectrl.h" + +namespace libvpx { +/* Quant MOD */ +static const int kQTrans[] = { + 0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 12, 13, 15, 17, 18, 19, + 20, 21, 23, 24, 25, 26, 27, 28, 29, 30, 31, 33, 35, 37, 39, 41, + 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 64, 67, 70, 73, 76, 79, + 82, 85, 88, 91, 94, 97, 100, 103, 106, 109, 112, 115, 118, 121, 124, 127, +}; + +static const unsigned char kf_high_motion_minq[QINDEX_RANGE] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, + 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, + 5, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 8, 8, 8, 8, 9, 9, 10, 10, + 10, 10, 11, 11, 11, 11, 12, 12, 13, 13, 13, 13, 14, 14, 15, 15, 15, 15, 16, + 16, 16, 16, 17, 17, 18, 18, 18, 18, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21, + 22, 22, 23, 23, 24, 25, 25, 26, 26, 27, 28, 28, 29, 30 +}; + +static const unsigned char inter_minq[QINDEX_RANGE] = { + 0, 0, 1, 1, 2, 3, 3, 4, 4, 5, 6, 6, 7, 8, 8, 9, 9, 10, 11, + 11, 12, 13, 13, 14, 15, 15, 16, 17, 17, 18, 19, 20, 20, 21, 22, 22, 23, 24, + 24, 25, 26, 27, 27, 28, 29, 30, 30, 31, 32, 33, 33, 34, 35, 36, 36, 37, 38, + 39, 39, 40, 41, 42, 42, 43, 44, 45, 46, 46, 47, 48, 49, 50, 50, 51, 52, 53, + 54, 55, 55, 56, 57, 58, 59, 60, 60, 61, 62, 63, 64, 65, 66, 67, 67, 68, 69, + 70, 71, 72, 73, 74, 75, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 86, + 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100 +}; + +static int rescale(int val, int num, int denom) { + int64_t llnum = num; + int64_t llden = denom; + int64_t llval = val; + + return (int)(llval * llnum / llden); +} + +std::unique_ptr VP8RateControlRTC::Create( + const VP8RateControlRtcConfig &cfg) { + std::unique_ptr rc_api(new (std::nothrow) + VP8RateControlRTC()); + if (!rc_api) return nullptr; + rc_api->cpi_ = static_cast(vpx_memalign(32, sizeof(*cpi_))); + if (!rc_api->cpi_) return nullptr; + vp8_zero(*rc_api->cpi_); + + rc_api->InitRateControl(cfg); + + return rc_api; +} + +void VP8RateControlRTC::InitRateControl(const VP8RateControlRtcConfig &rc_cfg) { + VP8_COMMON *cm = &cpi_->common; + VP8_CONFIG *oxcf = &cpi_->oxcf; + oxcf->end_usage = USAGE_STREAM_FROM_SERVER; + cpi_->pass = 0; + cm->show_frame = 1; + oxcf->drop_frames_water_mark = 0; + cm->current_video_frame = 0; + cpi_->auto_gold = 1; + cpi_->key_frame_count = 1; + cpi_->rate_correction_factor = 1.0; + cpi_->key_frame_rate_correction_factor = 1.0; + cpi_->cyclic_refresh_mode_enabled = 0; + cpi_->auto_worst_q = 1; + cpi_->kf_overspend_bits = 0; + cpi_->kf_bitrate_adjustment = 0; + cpi_->gf_overspend_bits = 0; + cpi_->non_gf_bitrate_adjustment = 0; + UpdateRateControl(rc_cfg); + cpi_->buffer_level = oxcf->starting_buffer_level; + cpi_->bits_off_target = oxcf->starting_buffer_level; +} + +void VP8RateControlRTC::UpdateRateControl( + const VP8RateControlRtcConfig &rc_cfg) { + VP8_COMMON *cm = &cpi_->common; + VP8_CONFIG *oxcf = &cpi_->oxcf; + + cm->Width = rc_cfg.width; + cm->Height = rc_cfg.height; + oxcf->Width = rc_cfg.width; + oxcf->Height = rc_cfg.height; + oxcf->worst_allowed_q = kQTrans[rc_cfg.max_quantizer]; + oxcf->best_allowed_q = kQTrans[rc_cfg.min_quantizer]; + cpi_->worst_quality = oxcf->worst_allowed_q; + cpi_->best_quality = oxcf->best_allowed_q; + cpi_->output_framerate = rc_cfg.framerate; + oxcf->target_bandwidth = 1000 * rc_cfg.target_bandwidth; + oxcf->fixed_q = -1; + oxcf->error_resilient_mode = 1; + oxcf->starting_buffer_level_in_ms = rc_cfg.buf_initial_sz; + oxcf->optimal_buffer_level_in_ms = rc_cfg.buf_optimal_sz; + oxcf->maximum_buffer_size_in_ms = rc_cfg.buf_sz; + oxcf->starting_buffer_level = rc_cfg.buf_initial_sz; + oxcf->optimal_buffer_level = rc_cfg.buf_optimal_sz; + oxcf->maximum_buffer_size = rc_cfg.buf_sz; + oxcf->number_of_layers = 1; + cpi_->buffered_mode = oxcf->optimal_buffer_level > 0; + oxcf->under_shoot_pct = rc_cfg.undershoot_pct; + oxcf->over_shoot_pct = rc_cfg.overshoot_pct; + cpi_->oxcf.rc_max_intra_bitrate_pct = rc_cfg.max_intra_bitrate_pct; + cpi_->framerate = rc_cfg.framerate; + for (int i = 0; i < KEY_FRAME_CONTEXT; ++i) { + cpi_->prior_key_frame_distance[i] = + static_cast(cpi_->output_framerate); + } + + cpi_->total_actual_bits = 0; + cpi_->total_target_vs_actual = 0; + + cm->mb_rows = cm->Height >> 4; + cm->mb_cols = cm->Width >> 4; + cm->MBs = cm->mb_rows * cm->mb_cols; + cm->mode_info_stride = cm->mb_cols + 1; + + oxcf->starting_buffer_level = + rescale((int)oxcf->starting_buffer_level, oxcf->target_bandwidth, 1000); + /* Set or reset optimal and maximum buffer levels. */ + if (oxcf->optimal_buffer_level == 0) { + oxcf->optimal_buffer_level = oxcf->target_bandwidth / 8; + } else { + oxcf->optimal_buffer_level = + rescale((int)oxcf->optimal_buffer_level, oxcf->target_bandwidth, 1000); + } + if (oxcf->maximum_buffer_size == 0) { + oxcf->maximum_buffer_size = oxcf->target_bandwidth / 8; + } else { + oxcf->maximum_buffer_size = + rescale((int)oxcf->maximum_buffer_size, oxcf->target_bandwidth, 1000); + } + + if (cpi_->bits_off_target > oxcf->maximum_buffer_size) { + cpi_->bits_off_target = oxcf->maximum_buffer_size; + cpi_->buffer_level = cpi_->bits_off_target; + } + + vp8_new_framerate(cpi_, cpi_->framerate); +} + +void VP8RateControlRTC::ComputeQP(const VP8FrameParamsQpRTC &frame_params) { + VP8_COMMON *const cm = &cpi_->common; + cm->frame_type = frame_params.frame_type; + cm->refresh_golden_frame = (cm->frame_type == KEY_FRAME) ? 1 : 0; + cm->refresh_alt_ref_frame = (cm->frame_type == KEY_FRAME) ? 1 : 0; + if (cm->frame_type == KEY_FRAME && cpi_->common.current_video_frame > 0) { + cpi_->common.frame_flags |= FRAMEFLAGS_KEY; + } + + vp8_pick_frame_size(cpi_); + + if (cpi_->buffer_level >= cpi_->oxcf.optimal_buffer_level && + cpi_->buffered_mode) { + /* Max adjustment is 1/4 */ + int Adjustment = cpi_->active_worst_quality / 4; + if (Adjustment) { + int buff_lvl_step; + if (cpi_->buffer_level < cpi_->oxcf.maximum_buffer_size) { + buff_lvl_step = (int)((cpi_->oxcf.maximum_buffer_size - + cpi_->oxcf.optimal_buffer_level) / + Adjustment); + if (buff_lvl_step) { + Adjustment = + (int)((cpi_->buffer_level - cpi_->oxcf.optimal_buffer_level) / + buff_lvl_step); + } else { + Adjustment = 0; + } + } + cpi_->active_worst_quality -= Adjustment; + if (cpi_->active_worst_quality < cpi_->active_best_quality) { + cpi_->active_worst_quality = cpi_->active_best_quality; + } + } + } + + if (cpi_->ni_frames > 150) { + int q = cpi_->active_worst_quality; + if (cm->frame_type == KEY_FRAME) { + cpi_->active_best_quality = kf_high_motion_minq[q]; + } else { + cpi_->active_best_quality = inter_minq[q]; + } + + if (cpi_->buffer_level >= cpi_->oxcf.maximum_buffer_size) { + cpi_->active_best_quality = cpi_->best_quality; + + } else if (cpi_->buffer_level > cpi_->oxcf.optimal_buffer_level) { + int Fraction = + (int)(((cpi_->buffer_level - cpi_->oxcf.optimal_buffer_level) * 128) / + (cpi_->oxcf.maximum_buffer_size - + cpi_->oxcf.optimal_buffer_level)); + int min_qadjustment = + ((cpi_->active_best_quality - cpi_->best_quality) * Fraction) / 128; + + cpi_->active_best_quality -= min_qadjustment; + } + } + + /* Clip the active best and worst quality values to limits */ + if (cpi_->active_worst_quality > cpi_->worst_quality) { + cpi_->active_worst_quality = cpi_->worst_quality; + } + if (cpi_->active_best_quality < cpi_->best_quality) { + cpi_->active_best_quality = cpi_->best_quality; + } + if (cpi_->active_worst_quality < cpi_->active_best_quality) { + cpi_->active_worst_quality = cpi_->active_best_quality; + } + + q_ = vp8_regulate_q(cpi_, cpi_->this_frame_target); + vp8_set_quantizer(cpi_, q_); +} + +int VP8RateControlRTC::GetQP() const { return q_; } + +void VP8RateControlRTC::PostEncodeUpdate(uint64_t encoded_frame_size) { + VP8_COMMON *const cm = &cpi_->common; + + cpi_->total_byte_count += encoded_frame_size; + cpi_->projected_frame_size = static_cast(encoded_frame_size << 3); + + vp8_update_rate_correction_factors(cpi_, 2); + + cpi_->last_q[cm->frame_type] = cm->base_qindex; + + if (cm->frame_type == KEY_FRAME) { + vp8_adjust_key_frame_context(cpi_); + } + + /* Keep a record of ambient average Q. */ + if (cm->frame_type != KEY_FRAME) { + cpi_->avg_frame_qindex = + (2 + 3 * cpi_->avg_frame_qindex + cm->base_qindex) >> 2; + } + /* Keep a record from which we can calculate the average Q excluding + * key frames. + */ + if (cm->frame_type != KEY_FRAME) { + cpi_->ni_frames++; + /* Damp value for first few frames */ + if (cpi_->ni_frames > 150) { + cpi_->ni_tot_qi += q_; + cpi_->ni_av_qi = (cpi_->ni_tot_qi / cpi_->ni_frames); + } else { + cpi_->ni_tot_qi += q_; + cpi_->ni_av_qi = + ((cpi_->ni_tot_qi / cpi_->ni_frames) + cpi_->worst_quality + 1) / 2; + } + + /* If the average Q is higher than what was used in the last + * frame (after going through the recode loop to keep the frame + * size within range) then use the last frame value - 1. The -1 + * is designed to stop Q and hence the data rate, from + * progressively falling away during difficult sections, but at + * the same time reduce the number of itterations around the + * recode loop. + */ + if (q_ > cpi_->ni_av_qi) cpi_->ni_av_qi = q_ - 1; + } + + cpi_->bits_off_target += + cpi_->av_per_frame_bandwidth - cpi_->projected_frame_size; + if (cpi_->bits_off_target > cpi_->oxcf.maximum_buffer_size) { + cpi_->bits_off_target = cpi_->oxcf.maximum_buffer_size; + } + + cpi_->total_actual_bits += cpi_->projected_frame_size; + cpi_->buffer_level = cpi_->bits_off_target; + + cpi_->common.current_video_frame++; + cpi_->frames_since_key++; +} +} // namespace libvpx diff --git a/vp8/vp8_ratectrl_rtc.h b/vp8/vp8_ratectrl_rtc.h new file mode 100644 index 0000000000..a1cd52b051 --- /dev/null +++ b/vp8/vp8_ratectrl_rtc.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2021 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP8_RATECTRL_RTC_H_ +#define VPX_VP8_RATECTRL_RTC_H_ + +#include +#include + +#include "vp8/encoder/onyx_int.h" +#include "vp8/common/common.h" +#include "vpx/internal/vpx_ratectrl_rtc.h" + +namespace libvpx { +struct VP8RateControlRtcConfig : public VpxRateControlRtcConfig { + public: + VP8RateControlRtcConfig() { + vp8_zero(layer_target_bitrate); + vp8_zero(ts_rate_decimator); + } +}; + +struct VP8FrameParamsQpRTC { + FRAME_TYPE frame_type; +}; + +class VP8RateControlRTC { + public: + static std::unique_ptr Create( + const VP8RateControlRtcConfig &cfg); + ~VP8RateControlRTC() { + if (cpi_) { + vpx_free(cpi_->gf_active_flags); + vpx_free(cpi_); + } + } + + void UpdateRateControl(const VP8RateControlRtcConfig &rc_cfg); + // GetQP() needs to be called after ComputeQP() to get the latest QP + int GetQP() const; + // int GetLoopfilterLevel() const; + void ComputeQP(const VP8FrameParamsQpRTC &frame_params); + // Feedback to rate control with the size of current encoded frame + void PostEncodeUpdate(uint64_t encoded_frame_size); + + private: + VP8RateControlRTC() {} + void InitRateControl(const VP8RateControlRtcConfig &cfg); + VP8_COMP *cpi_; + int q_; +}; + +} // namespace libvpx + +#endif // VPX_VP8_RATECTRL_RTC_H_ diff --git a/vp9/ratectrl_rtc.h b/vp9/ratectrl_rtc.h index 5cc7ec9457..d2b9417aef 100644 --- a/vp9/ratectrl_rtc.h +++ b/vp9/ratectrl_rtc.h @@ -22,28 +22,14 @@ #include "vp9/encoder/vp9_encoder.h" #include "vp9/encoder/vp9_firstpass.h" #include "vp9/vp9_cx_iface.h" +#include "vpx/internal/vpx_ratectrl_rtc.h" #include "vpx_mem/vpx_mem.h" namespace libvpx { -struct VP9RateControlRtcConfig { +struct VP9RateControlRtcConfig : public VpxRateControlRtcConfig { public: VP9RateControlRtcConfig() { - width = 1280; - height = 720; - max_quantizer = 63; - min_quantizer = 2; - target_bandwidth = 1000; - buf_initial_sz = 600; - buf_optimal_sz = 600; - buf_sz = 1000; - undershoot_pct = overshoot_pct = 50; - max_intra_bitrate_pct = 50; - max_inter_bitrate_pct = 0; - framerate = 30.0; - ss_number_layers = ts_number_layers = 1; - rc_mode = VPX_CBR; - aq_mode = 0; vp9_zero(max_quantizers); vp9_zero(min_quantizers); vp9_zero(scaling_factor_den); @@ -52,26 +38,10 @@ struct VP9RateControlRtcConfig { vp9_zero(ts_rate_decimator); scaling_factor_num[0] = 1; scaling_factor_den[0] = 1; - layer_target_bitrate[0] = static_cast(target_bandwidth); max_quantizers[0] = max_quantizer; min_quantizers[0] = min_quantizer; - ts_rate_decimator[0] = 1; } - int width; - int height; - // 0-63 - int max_quantizer; - int min_quantizer; - int64_t target_bandwidth; - int64_t buf_initial_sz; - int64_t buf_optimal_sz; - int64_t buf_sz; - int undershoot_pct; - int overshoot_pct; - int max_intra_bitrate_pct; - int max_inter_bitrate_pct; - double framerate; // Number of spatial layers int ss_number_layers; // Number of temporal layers @@ -80,11 +50,6 @@ struct VP9RateControlRtcConfig { int min_quantizers[VPX_MAX_LAYERS]; int scaling_factor_num[VPX_SS_MAX_LAYERS]; int scaling_factor_den[VPX_SS_MAX_LAYERS]; - int layer_target_bitrate[VPX_MAX_LAYERS]; - int ts_rate_decimator[VPX_TS_MAX_LAYERS]; - // vbr, cbr - enum vpx_rc_mode rc_mode; - int aq_mode; }; struct VP9FrameParamsQpRTC { @@ -94,7 +59,7 @@ struct VP9FrameParamsQpRTC { }; // This interface allows using VP9 real-time rate control without initializing -// the encoder. To use this interface, you need to link with libvp9rc.a. +// the encoder. To use this interface, you need to link with libvpxrc.a. // // #include "vp9/ratectrl_rtc.h" // VP9RateControlRTC rc_api; diff --git a/vpx/internal/vpx_ratectrl_rtc.h b/vpx/internal/vpx_ratectrl_rtc.h new file mode 100644 index 0000000000..0474e0a85b --- /dev/null +++ b/vpx/internal/vpx_ratectrl_rtc.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2021 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_RATECTRL_RTC_H_ +#define VPX_VPX_RATECTRL_RTC_H_ + +#include "vpx/vpx_encoder.h" + +namespace libvpx { +struct VpxRateControlRtcConfig { + public: + VpxRateControlRtcConfig() { + width = 1280; + height = 720; + max_quantizer = 63; + min_quantizer = 2; + target_bandwidth = 1000; + buf_initial_sz = 600; + buf_optimal_sz = 600; + buf_sz = 1000; + undershoot_pct = overshoot_pct = 50; + max_intra_bitrate_pct = 50; + max_inter_bitrate_pct = 0; + framerate = 30.0; + ts_number_layers = 1; + rc_mode = VPX_CBR; + aq_mode = 0; + layer_target_bitrate[0] = static_cast(target_bandwidth); + ts_rate_decimator[0] = 1; + } + + int width; + int height; + // 0-63 + int max_quantizer; + int min_quantizer; + int64_t target_bandwidth; + int64_t buf_initial_sz; + int64_t buf_optimal_sz; + int64_t buf_sz; + int undershoot_pct; + int overshoot_pct; + int max_intra_bitrate_pct; + int max_inter_bitrate_pct; + double framerate; + // Number of temporal layers + int ts_number_layers; + int layer_target_bitrate[VPX_MAX_LAYERS]; + int ts_rate_decimator[VPX_TS_MAX_LAYERS]; + // vbr, cbr + enum vpx_rc_mode rc_mode; + int aq_mode; +}; +} // namespace libvpx +#endif diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h index 17afac754b..28bd861747 100644 --- a/vpx/vp8cx.h +++ b/vpx/vp8cx.h @@ -747,7 +747,7 @@ enum vp8e_enc_control_id { * * This will turn off cyclic refresh for vp8. * - * With those, the rate control is expected to work exactly the same as the + * With this, the rate control is expected to work exactly the same as the * interface provided in vp8_ratectrl_rtc.cc/h * * Supported in codecs: VP8 diff --git a/vpx/vpx_codec.mk b/vpx/vpx_codec.mk index 350dc247bc..de86579d58 100644 --- a/vpx/vpx_codec.mk +++ b/vpx/vpx_codec.mk @@ -33,6 +33,7 @@ API_SRCS-yes += vpx_decoder.h API_SRCS-yes += src/vpx_encoder.c API_SRCS-yes += vpx_encoder.h API_SRCS-yes += internal/vpx_codec_internal.h +API_SRCS-yes += internal/vpx_ratectrl_rtc.h API_SRCS-yes += src/vpx_codec.c API_SRCS-yes += src/vpx_image.c API_SRCS-yes += vpx_codec.h From 8a6fbc0b4eb8538e213782bcdc3969a08b44e73b Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Fri, 10 Sep 2021 15:54:51 -0700 Subject: [PATCH 0217/1000] Define the VPX_NO_RETURN macro for MSVC Define VPX_NO_RETURN as __declspec(noreturn) for MSVC. See https://docs.microsoft.com/en-us/cpp/cpp/noreturn?view=msvc-160 This requires moving VPX_NO_RETURN before function declarations because __declspec(noreturn) must be placed there. Fortunately GCC's __attribute__((noreturn)) can be placed either before or after function declarations. Change-Id: Id9bb0077e2a4f16ec2ca9c913dd93673a0e385cf --- args.c | 6 ++++-- tools_common.h | 10 ++++++---- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/args.c b/args.c index a87b138b9d..17b615584e 100644 --- a/args.c +++ b/args.c @@ -16,8 +16,10 @@ #include "vpx/vpx_integer.h" #include "vpx_ports/msvc.h" -#if defined(__GNUC__) && __GNUC__ -extern void die(const char *fmt, ...) __attribute__((noreturn)); +#if defined(__GNUC__) +__attribute__((noreturn)) extern void die(const char *fmt, ...); +#elif defined(_MSC_VER) +__declspec(noreturn) extern void die(const char *fmt, ...); #else extern void die(const char *fmt, ...); #endif diff --git a/tools_common.h b/tools_common.h index 4526d9f165..4e8851fc15 100644 --- a/tools_common.h +++ b/tools_common.h @@ -110,6 +110,8 @@ extern "C" { #if defined(__GNUC__) #define VPX_NO_RETURN __attribute__((noreturn)) +#elif defined(_MSC_VER) +#define VPX_NO_RETURN __declspec(noreturn) #else #define VPX_NO_RETURN #endif @@ -117,14 +119,14 @@ extern "C" { /* Sets a stdio stream into binary mode */ FILE *set_binary_mode(FILE *stream); -void die(const char *fmt, ...) VPX_NO_RETURN; -void fatal(const char *fmt, ...) VPX_NO_RETURN; +VPX_NO_RETURN void die(const char *fmt, ...); +VPX_NO_RETURN void fatal(const char *fmt, ...); void warn(const char *fmt, ...); -void die_codec(vpx_codec_ctx_t *ctx, const char *s) VPX_NO_RETURN; +VPX_NO_RETURN void die_codec(vpx_codec_ctx_t *ctx, const char *s); /* The tool including this file must define usage_exit() */ -void usage_exit(void) VPX_NO_RETURN; +VPX_NO_RETURN void usage_exit(void); #undef VPX_NO_RETURN From 7366195e5a7098de0b7c131f40dd5238b9065a56 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Thu, 16 Sep 2021 10:19:09 -0700 Subject: [PATCH 0218/1000] vp8 rc: explicit cast to avoid VS build failure Change-Id: I6a4daca12b79cf996964661e1af85aa6e258b446 --- vp8/vp8_ratectrl_rtc.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vp8/vp8_ratectrl_rtc.cc b/vp8/vp8_ratectrl_rtc.cc index c42ab971e2..b489940cb7 100644 --- a/vp8/vp8_ratectrl_rtc.cc +++ b/vp8/vp8_ratectrl_rtc.cc @@ -100,7 +100,8 @@ void VP8RateControlRTC::UpdateRateControl( cpi_->worst_quality = oxcf->worst_allowed_q; cpi_->best_quality = oxcf->best_allowed_q; cpi_->output_framerate = rc_cfg.framerate; - oxcf->target_bandwidth = 1000 * rc_cfg.target_bandwidth; + oxcf->target_bandwidth = + static_cast(1000 * rc_cfg.target_bandwidth); oxcf->fixed_q = -1; oxcf->error_resilient_mode = 1; oxcf->starting_buffer_level_in_ms = rc_cfg.buf_initial_sz; From 09775194ffdb84b4979f3988e7ef301575b661df Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Mon, 20 Sep 2021 13:37:43 -0700 Subject: [PATCH 0219/1000] Cap duration to avoid overflow Bug: webm:1728 Change-Id: Id13475660fa921e8ddcc89847e978da4c8d85886 --- vp8/encoder/onyx_if.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index fc154afd14..cdcb0a09f7 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -4921,6 +4921,8 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, this_duration = cpi->source->ts_end - cpi->last_end_time_stamp_seen; last_duration = cpi->last_end_time_stamp_seen - cpi->last_time_stamp_seen; + // Cap this to avoid overflow of (this_duration - last_duration) * 10 + this_duration = VPXMIN(this_duration, INT64_MAX / 10); /* do a step update if the duration changes by 10% */ if (last_duration) { step = (int)(((this_duration - last_duration) * 10 / last_duration)); From 0de415cf6a945457115783807a702a5249f44a9d Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Thu, 16 Sep 2021 10:16:44 -0700 Subject: [PATCH 0220/1000] vp8 rc: support temporal layers Change-Id: I2c7d5de0e17b072cb763f1659b1badce4fe0b82b --- test/vp8_ratectrl_rtc_test.cc | 181 ++++++++++++++++++++++++++++++++-- vp8/encoder/onyx_if.c | 40 ++++---- vp8/encoder/onyx_int.h | 6 ++ vp8/vp8_ratectrl_rtc.cc | 61 +++++++++++- vp8/vp8_ratectrl_rtc.h | 1 + 5 files changed, 257 insertions(+), 32 deletions(-) diff --git a/test/vp8_ratectrl_rtc_test.cc b/test/vp8_ratectrl_rtc_test.cc index d5032b38e7..ad310666e7 100644 --- a/test/vp8_ratectrl_rtc_test.cc +++ b/test/vp8_ratectrl_rtc_test.cc @@ -61,20 +61,81 @@ class Vp8RcInterfaceTest SetMode(::libvpx_test::kRealTime); } + // From error_resilience_test.cc + int SetFrameFlags(int frame_num, int num_temp_layers) { + int frame_flags = 0; + if (num_temp_layers == 2) { + if (frame_num % 2 == 0) { + // Layer 0: predict from L and ARF, update L. + frame_flags = + VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF; + } else { + // Layer 1: predict from L, G and ARF, and update G. + frame_flags = VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_LAST | + VP8_EFLAG_NO_UPD_ENTROPY; + } + } else if (num_temp_layers == 3) { + if (frame_num % 4 == 0) { + // Layer 0: predict from L, update L. + frame_flags = VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF | + VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF; + } else if ((frame_num - 2) % 4 == 0) { + // Layer 1: predict from L, G, update G. + frame_flags = + VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_REF_ARF; + } else if ((frame_num - 1) % 2 == 0) { + // Layer 2: predict from L, G, ARF; update ARG. + frame_flags = VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_LAST; + } + } + return frame_flags; + } + + int SetLayerId(int frame_num, int num_temp_layers) { + int layer_id = 0; + if (num_temp_layers == 2) { + if (frame_num % 2 == 0) { + layer_id = 0; + } else { + layer_id = 1; + } + } else if (num_temp_layers == 3) { + if (frame_num % 4 == 0) { + layer_id = 0; + } else if ((frame_num - 2) % 4 == 0) { + layer_id = 1; + } else if ((frame_num - 1) % 2 == 0) { + layer_id = 2; + } + } + return layer_id; + } + virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, ::libvpx_test::Encoder *encoder) { - if (video->frame() == 0) { - encoder->Control(VP8E_SET_CPUUSED, -6); - encoder->Control(VP8E_SET_RTC_EXTERNAL_RATECTRL, 1); - encoder->Control(VP8E_SET_MAX_INTRA_BITRATE_PCT, 1000); + if (rc_cfg_.ts_number_layers > 1) { + const int layer_id = SetLayerId(video->frame(), cfg_.ts_number_layers); + const int frame_flags = + SetFrameFlags(video->frame(), cfg_.ts_number_layers); + frame_params_.temporal_layer_id = layer_id; + if (video->frame() > 0) { + encoder->Control(VP8E_SET_TEMPORAL_LAYER_ID, layer_id); + encoder->Control(VP8E_SET_FRAME_FLAGS, frame_flags); + } + } else { + if (video->frame() == 0) { + encoder->Control(VP8E_SET_CPUUSED, -6); + encoder->Control(VP8E_SET_RTC_EXTERNAL_RATECTRL, 1); + encoder->Control(VP8E_SET_MAX_INTRA_BITRATE_PCT, 1000); + } + if (frame_params_.frame_type == INTER_FRAME) { + // Disable golden frame update. + frame_flags_ |= VP8_EFLAG_NO_UPD_GF; + frame_flags_ |= VP8_EFLAG_NO_UPD_ARF; + } } frame_params_.frame_type = video->frame() % key_interval_ == 0 ? KEY_FRAME : INTER_FRAME; - if (frame_params_.frame_type == INTER_FRAME) { - // Disable golden frame update. - frame_flags_ |= VP8_EFLAG_NO_UPD_GF; - frame_flags_ |= VP8_EFLAG_NO_UPD_ARF; - } encoder_exit_ = video->frame() == test_video_.frames; } @@ -125,6 +186,38 @@ class Vp8RcInterfaceTest ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); } + void RunTemporalLayers2TL() { + test_video_ = GET_PARAM(2); + target_bitrate_ = GET_PARAM(1); + if (test_video_.width == 1280 && target_bitrate_ == 200) return; + if (test_video_.width == 640 && target_bitrate_ == 1000) return; + SetConfigTemporalLayers(2); + rc_api_ = libvpx::VP8RateControlRTC::Create(rc_cfg_); + rc_api_->UpdateRateControl(rc_cfg_); + + ::libvpx_test::I420VideoSource video(test_video_.name, test_video_.width, + test_video_.height, 30, 1, 0, + test_video_.frames); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + } + + void RunTemporalLayers3TL() { + test_video_ = GET_PARAM(2); + target_bitrate_ = GET_PARAM(1); + if (test_video_.width == 1280 && target_bitrate_ == 200) return; + if (test_video_.width == 640 && target_bitrate_ == 1000) return; + SetConfigTemporalLayers(3); + rc_api_ = libvpx::VP8RateControlRTC::Create(rc_cfg_); + rc_api_->UpdateRateControl(rc_cfg_); + + ::libvpx_test::I420VideoSource video(test_video_.name, test_video_.width, + test_video_.height, 30, 1, 0, + test_video_.frames); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + } + private: void SetConfig() { rc_cfg_.width = test_video_.width; @@ -160,6 +253,72 @@ class Vp8RcInterfaceTest cfg_.kf_max_dist = key_interval_; } + void SetConfigTemporalLayers(int temporal_layers) { + rc_cfg_.width = test_video_.width; + rc_cfg_.height = test_video_.height; + rc_cfg_.max_quantizer = 60; + rc_cfg_.min_quantizer = 2; + rc_cfg_.target_bandwidth = target_bitrate_; + rc_cfg_.buf_initial_sz = 600; + rc_cfg_.buf_optimal_sz = 600; + rc_cfg_.buf_sz = target_bitrate_; + rc_cfg_.undershoot_pct = 50; + rc_cfg_.overshoot_pct = 50; + rc_cfg_.max_intra_bitrate_pct = 1000; + rc_cfg_.framerate = 30.0; + if (temporal_layers == 2) { + rc_cfg_.layer_target_bitrate[0] = 60 * target_bitrate_ / 100; + rc_cfg_.layer_target_bitrate[1] = target_bitrate_; + rc_cfg_.ts_rate_decimator[0] = 2; + rc_cfg_.ts_rate_decimator[1] = 1; + } else if (temporal_layers == 3) { + rc_cfg_.layer_target_bitrate[0] = 40 * target_bitrate_ / 100; + rc_cfg_.layer_target_bitrate[1] = 60 * target_bitrate_ / 100; + rc_cfg_.layer_target_bitrate[2] = target_bitrate_; + rc_cfg_.ts_rate_decimator[0] = 4; + rc_cfg_.ts_rate_decimator[1] = 2; + rc_cfg_.ts_rate_decimator[2] = 1; + } + + rc_cfg_.ts_number_layers = temporal_layers; + + // Encoder settings for ground truth. + cfg_.g_w = test_video_.width; + cfg_.g_h = test_video_.height; + cfg_.rc_undershoot_pct = 50; + cfg_.rc_overshoot_pct = 50; + cfg_.rc_buf_initial_sz = 600; + cfg_.rc_buf_optimal_sz = 600; + cfg_.rc_buf_sz = target_bitrate_; + cfg_.rc_dropframe_thresh = 0; + cfg_.rc_min_quantizer = 2; + cfg_.rc_max_quantizer = 60; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + cfg_.g_error_resilient = 1; + cfg_.rc_target_bitrate = target_bitrate_; + cfg_.kf_min_dist = key_interval_; + cfg_.kf_max_dist = key_interval_; + // 2 Temporal layers, no spatial layers, CBR mode. + cfg_.ss_number_layers = 1; + cfg_.ts_number_layers = temporal_layers; + if (temporal_layers == 2) { + cfg_.ts_rate_decimator[0] = 2; + cfg_.ts_rate_decimator[1] = 1; + cfg_.ts_periodicity = 2; + cfg_.ts_target_bitrate[0] = 60 * cfg_.rc_target_bitrate / 100; + cfg_.ts_target_bitrate[1] = cfg_.rc_target_bitrate; + } else if (temporal_layers == 3) { + cfg_.ts_rate_decimator[0] = 4; + cfg_.ts_rate_decimator[1] = 2; + cfg_.ts_rate_decimator[2] = 1; + cfg_.ts_periodicity = 4; + cfg_.ts_target_bitrate[0] = 40 * cfg_.rc_target_bitrate / 100; + cfg_.ts_target_bitrate[1] = 60 * cfg_.rc_target_bitrate / 100; + cfg_.ts_target_bitrate[2] = cfg_.rc_target_bitrate; + } + } + std::unique_ptr rc_api_; libvpx::VP8RateControlRtcConfig rc_cfg_; int key_interval_; @@ -173,6 +332,10 @@ TEST_P(Vp8RcInterfaceTest, OneLayer) { RunOneLayer(); } TEST_P(Vp8RcInterfaceTest, OneLayerPeriodicKey) { RunPeriodicKey(); } +TEST_P(Vp8RcInterfaceTest, TemporalLayers2TL) { RunTemporalLayers2TL(); } + +TEST_P(Vp8RcInterfaceTest, TemporalLayers3TL) { RunTemporalLayers3TL(); } + VP8_INSTANTIATE_TEST_SUITE(Vp8RcInterfaceTest, ::testing::Values(200, 400, 1000), ::testing::ValuesIn(kVp8RCTestVectors)); diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index fc154afd14..5e00732786 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -183,7 +183,7 @@ static const unsigned char inter_minq[QINDEX_RANGE] = { extern FILE *vpxlogc; #endif -static void save_layer_context(VP8_COMP *cpi) { +void vp8_save_layer_context(VP8_COMP *cpi) { LAYER_CONTEXT *lc = &cpi->layer_context[cpi->current_layer]; /* Save layer dependent coding state */ @@ -222,7 +222,7 @@ static void save_layer_context(VP8_COMP *cpi) { sizeof(cpi->mb.count_mb_ref_frame_usage)); } -static void restore_layer_context(VP8_COMP *cpi, const int layer) { +void vp8_restore_layer_context(VP8_COMP *cpi, const int layer) { LAYER_CONTEXT *lc = &cpi->layer_context[layer]; /* Restore layer dependent coding state */ @@ -269,9 +269,9 @@ static int rescale(int val, int num, int denom) { return (int)(llval * llnum / llden); } -static void init_temporal_layer_context(VP8_COMP *cpi, VP8_CONFIG *oxcf, - const int layer, - double prev_layer_framerate) { +void vp8_init_temporal_layer_context(VP8_COMP *cpi, VP8_CONFIG *oxcf, + const int layer, + double prev_layer_framerate) { LAYER_CONTEXT *lc = &cpi->layer_context[layer]; lc->framerate = cpi->output_framerate / cpi->oxcf.rate_decimator[layer]; @@ -336,12 +336,12 @@ static void reset_temporal_layer_change(VP8_COMP *cpi, VP8_CONFIG *oxcf, // We need this to set the layer context for the new layers below. if (prev_num_layers == 1) { cpi->current_layer = 0; - save_layer_context(cpi); + vp8_save_layer_context(cpi); } for (i = 0; i < curr_num_layers; ++i) { LAYER_CONTEXT *lc = &cpi->layer_context[i]; if (i >= prev_num_layers) { - init_temporal_layer_context(cpi, oxcf, i, prev_layer_framerate); + vp8_init_temporal_layer_context(cpi, oxcf, i, prev_layer_framerate); } // The initial buffer levels are set based on their starting levels. // We could set the buffer levels based on the previous state (normalized @@ -356,7 +356,7 @@ static void reset_temporal_layer_change(VP8_COMP *cpi, VP8_CONFIG *oxcf, // state (to smooth-out quality dips/rate fluctuation at transition)? // We need to treat the 1 layer case separately: oxcf.target_bitrate[i] - // is not set for 1 layer, and the restore_layer_context/save_context() + // is not set for 1 layer, and the vp8_restore_layer_context/save_context() // are not called in the encoding loop, so we need to call it here to // pass the layer context state to |cpi|. if (curr_num_layers == 1) { @@ -364,7 +364,7 @@ static void reset_temporal_layer_change(VP8_COMP *cpi, VP8_CONFIG *oxcf, lc->buffer_level = cpi->oxcf.starting_buffer_level_in_ms * lc->target_bandwidth / 1000; lc->bits_off_target = lc->buffer_level; - restore_layer_context(cpi, 0); + vp8_restore_layer_context(cpi, 0); } prev_layer_framerate = cpi->output_framerate / cpi->oxcf.rate_decimator[i]; } @@ -1274,7 +1274,7 @@ void vp8_new_framerate(VP8_COMP *cpi, double framerate) { cpi->framerate = framerate; cpi->output_framerate = framerate; cpi->per_frame_bandwidth = - (int)(cpi->oxcf.target_bandwidth / cpi->output_framerate); + (int)round(cpi->oxcf.target_bandwidth / cpi->output_framerate); cpi->av_per_frame_bandwidth = cpi->per_frame_bandwidth; cpi->min_frame_bandwidth = (int)(cpi->av_per_frame_bandwidth * cpi->oxcf.two_pass_vbrmin_section / 100); @@ -1365,7 +1365,7 @@ static void init_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) { double prev_layer_framerate = 0; for (i = 0; i < cpi->oxcf.number_of_layers; ++i) { - init_temporal_layer_context(cpi, oxcf, i, prev_layer_framerate); + vp8_init_temporal_layer_context(cpi, oxcf, i, prev_layer_framerate); prev_layer_framerate = cpi->output_framerate / cpi->oxcf.rate_decimator[i]; } @@ -1382,7 +1382,7 @@ static void init_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) { #endif } -static void update_layer_contexts(VP8_COMP *cpi) { +void vp8_update_layer_contexts(VP8_COMP *cpi) { VP8_CONFIG *oxcf = &cpi->oxcf; /* Update snapshots of the layer contexts to reflect new parameters */ @@ -1417,8 +1417,8 @@ static void update_layer_contexts(VP8_COMP *cpi) { /* Work out the average size of a frame within this layer */ if (i > 0) { lc->avg_frame_size_for_layer = - (int)((oxcf->target_bitrate[i] - oxcf->target_bitrate[i - 1]) * - 1000 / (lc->framerate - prev_layer_framerate)); + (int)round((oxcf->target_bitrate[i] - oxcf->target_bitrate[i - 1]) * + 1000 / (lc->framerate - prev_layer_framerate)); } prev_layer_framerate = lc->framerate; @@ -3261,7 +3261,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, #endif // !CONFIG_REALTIME_ONLY default: cpi->per_frame_bandwidth = - (int)(cpi->target_bandwidth / cpi->output_framerate); + (int)round(cpi->target_bandwidth / cpi->output_framerate); break; } @@ -4554,8 +4554,8 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, for (i = cpi->current_layer + 1; i < cpi->oxcf.number_of_layers; ++i) { LAYER_CONTEXT *lc = &cpi->layer_context[i]; - int bits_off_for_this_layer = (int)(lc->target_bandwidth / lc->framerate - - cpi->projected_frame_size); + int bits_off_for_this_layer = (int)round( + lc->target_bandwidth / lc->framerate - cpi->projected_frame_size); lc->bits_off_target += bits_off_for_this_layer; @@ -4990,7 +4990,7 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, if (cpi->oxcf.number_of_layers > 1) { int layer; - update_layer_contexts(cpi); + vp8_update_layer_contexts(cpi); /* Restore layer specific context & set frame rate */ if (cpi->temporal_layer_id >= 0) { @@ -5000,7 +5000,7 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, cpi->oxcf .layer_id[cpi->temporal_pattern_counter % cpi->oxcf.periodicity]; } - restore_layer_context(cpi, layer); + vp8_restore_layer_context(cpi, layer); vp8_new_framerate(cpi, cpi->layer_context[layer].framerate); } @@ -5131,7 +5131,7 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, } /* Save layer specific state */ - if (cpi->oxcf.number_of_layers > 1) save_layer_context(cpi); + if (cpi->oxcf.number_of_layers > 1) vp8_save_layer_context(cpi); vpx_usec_timer_mark(&cmptimer); cpi->time_compress_data += vpx_usec_timer_elapsed(&cmptimer); diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h index a29994a135..7f8298e44a 100644 --- a/vp8/encoder/onyx_int.h +++ b/vp8/encoder/onyx_int.h @@ -712,6 +712,12 @@ void vp8_initialize_enc(void); void vp8_alloc_compressor_data(VP8_COMP *cpi); int vp8_reverse_trans(int x); +void vp8_init_temporal_layer_context(VP8_COMP *cpi, VP8_CONFIG *oxcf, + const int layer, + double prev_layer_framerate); +void vp8_update_layer_contexts(VP8_COMP *cpi); +void vp8_save_layer_context(VP8_COMP *cpi); +void vp8_restore_layer_context(VP8_COMP *cpi, const int layer); void vp8_new_framerate(VP8_COMP *cpi, double framerate); void vp8_loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm); diff --git a/vp8/vp8_ratectrl_rtc.cc b/vp8/vp8_ratectrl_rtc.cc index b489940cb7..2098edaf97 100644 --- a/vp8/vp8_ratectrl_rtc.cc +++ b/vp8/vp8_ratectrl_rtc.cc @@ -8,9 +8,11 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include #include #include "vp8/vp8_ratectrl_rtc.h" #include "vp8/encoder/ratectrl.h" +#include "vpx_ports/system_state.h" namespace libvpx { /* Quant MOD */ @@ -90,7 +92,7 @@ void VP8RateControlRTC::UpdateRateControl( const VP8RateControlRtcConfig &rc_cfg) { VP8_COMMON *cm = &cpi_->common; VP8_CONFIG *oxcf = &cpi_->oxcf; - + vpx_clear_system_state(); cm->Width = rc_cfg.width; cm->Height = rc_cfg.height; oxcf->Width = rc_cfg.width; @@ -102,6 +104,7 @@ void VP8RateControlRTC::UpdateRateControl( cpi_->output_framerate = rc_cfg.framerate; oxcf->target_bandwidth = static_cast(1000 * rc_cfg.target_bandwidth); + cpi_->ref_framerate = cpi_->output_framerate; oxcf->fixed_q = -1; oxcf->error_resilient_mode = 1; oxcf->starting_buffer_level_in_ms = rc_cfg.buf_initial_sz; @@ -110,7 +113,7 @@ void VP8RateControlRTC::UpdateRateControl( oxcf->starting_buffer_level = rc_cfg.buf_initial_sz; oxcf->optimal_buffer_level = rc_cfg.buf_optimal_sz; oxcf->maximum_buffer_size = rc_cfg.buf_sz; - oxcf->number_of_layers = 1; + oxcf->number_of_layers = rc_cfg.ts_number_layers; cpi_->buffered_mode = oxcf->optimal_buffer_level > 0; oxcf->under_shoot_pct = rc_cfg.undershoot_pct; oxcf->over_shoot_pct = rc_cfg.overshoot_pct; @@ -121,6 +124,20 @@ void VP8RateControlRTC::UpdateRateControl( static_cast(cpi_->output_framerate); } + if (oxcf->number_of_layers > 1) { + memcpy(oxcf->target_bitrate, rc_cfg.layer_target_bitrate, + sizeof(rc_cfg.layer_target_bitrate)); + memcpy(oxcf->rate_decimator, rc_cfg.ts_rate_decimator, + sizeof(rc_cfg.ts_rate_decimator)); + oxcf->periodicity = 2; + + double prev_layer_framerate = 0; + for (unsigned int i = 0; i < oxcf->number_of_layers; ++i) { + vp8_init_temporal_layer_context(cpi_, oxcf, i, prev_layer_framerate); + prev_layer_framerate = cpi_->output_framerate / oxcf->rate_decimator[i]; + } + } + cpi_->total_actual_bits = 0; cpi_->total_target_vs_actual = 0; @@ -155,6 +172,15 @@ void VP8RateControlRTC::UpdateRateControl( void VP8RateControlRTC::ComputeQP(const VP8FrameParamsQpRTC &frame_params) { VP8_COMMON *const cm = &cpi_->common; + vpx_clear_system_state(); + if (cpi_->oxcf.number_of_layers > 1) { + cpi_->temporal_layer_id = frame_params.temporal_layer_id; + const int layer = frame_params.temporal_layer_id; + vp8_update_layer_contexts(cpi_); + /* Restore layer specific context & set frame rate */ + vp8_restore_layer_context(cpi_, layer); + vp8_new_framerate(cpi_, cpi_->layer_context[layer].framerate); + } cm->frame_type = frame_params.frame_type; cm->refresh_golden_frame = (cm->frame_type == KEY_FRAME) ? 1 : 0; cm->refresh_alt_ref_frame = (cm->frame_type == KEY_FRAME) ? 1 : 0; @@ -231,9 +257,15 @@ int VP8RateControlRTC::GetQP() const { return q_; } void VP8RateControlRTC::PostEncodeUpdate(uint64_t encoded_frame_size) { VP8_COMMON *const cm = &cpi_->common; - + vpx_clear_system_state(); cpi_->total_byte_count += encoded_frame_size; cpi_->projected_frame_size = static_cast(encoded_frame_size << 3); + if (cpi_->oxcf.number_of_layers > 1) { + for (unsigned int i = cpi_->current_layer + 1; + i < cpi_->oxcf.number_of_layers; ++i) { + cpi_->layer_context[i].total_byte_count += encoded_frame_size; + } + } vp8_update_rate_correction_factors(cpi_, 2); @@ -283,7 +315,30 @@ void VP8RateControlRTC::PostEncodeUpdate(uint64_t encoded_frame_size) { cpi_->total_actual_bits += cpi_->projected_frame_size; cpi_->buffer_level = cpi_->bits_off_target; + /* Propagate values to higher temporal layers */ + if (cpi_->oxcf.number_of_layers > 1) { + for (unsigned int i = cpi_->current_layer + 1; + i < cpi_->oxcf.number_of_layers; ++i) { + LAYER_CONTEXT *lc = &cpi_->layer_context[i]; + int bits_off_for_this_layer = (int)round( + lc->target_bandwidth / lc->framerate - cpi_->projected_frame_size); + + lc->bits_off_target += bits_off_for_this_layer; + + /* Clip buffer level to maximum buffer size for the layer */ + if (lc->bits_off_target > lc->maximum_buffer_size) { + lc->bits_off_target = lc->maximum_buffer_size; + } + + lc->total_actual_bits += cpi_->projected_frame_size; + lc->total_target_vs_actual += bits_off_for_this_layer; + lc->buffer_level = lc->bits_off_target; + } + } + cpi_->common.current_video_frame++; cpi_->frames_since_key++; + + if (cpi_->oxcf.number_of_layers > 1) vp8_save_layer_context(cpi_); } } // namespace libvpx diff --git a/vp8/vp8_ratectrl_rtc.h b/vp8/vp8_ratectrl_rtc.h index a1cd52b051..def7dd8f9e 100644 --- a/vp8/vp8_ratectrl_rtc.h +++ b/vp8/vp8_ratectrl_rtc.h @@ -29,6 +29,7 @@ struct VP8RateControlRtcConfig : public VpxRateControlRtcConfig { struct VP8FrameParamsQpRTC { FRAME_TYPE frame_type; + int temporal_layer_id; }; class VP8RateControlRTC { From b68877a7ebfe764714f8ce7aeb2a7f6d12b77989 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Fri, 24 Sep 2021 14:56:00 -0700 Subject: [PATCH 0221/1000] vp8 rc: Clear system state at the end of calls Clear system state at the end of rc calls to make sure the state is consistent before and after Change-Id: I59fe9c99485b1a8603c20db37961339b7575455f --- vp8/vp8_ratectrl_rtc.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vp8/vp8_ratectrl_rtc.cc b/vp8/vp8_ratectrl_rtc.cc index 2098edaf97..2f23c5b1d9 100644 --- a/vp8/vp8_ratectrl_rtc.cc +++ b/vp8/vp8_ratectrl_rtc.cc @@ -168,6 +168,7 @@ void VP8RateControlRTC::UpdateRateControl( } vp8_new_framerate(cpi_, cpi_->framerate); + vpx_clear_system_state(); } void VP8RateControlRTC::ComputeQP(const VP8FrameParamsQpRTC &frame_params) { @@ -251,6 +252,7 @@ void VP8RateControlRTC::ComputeQP(const VP8FrameParamsQpRTC &frame_params) { q_ = vp8_regulate_q(cpi_, cpi_->this_frame_target); vp8_set_quantizer(cpi_, q_); + vpx_clear_system_state(); } int VP8RateControlRTC::GetQP() const { return q_; } @@ -340,5 +342,6 @@ void VP8RateControlRTC::PostEncodeUpdate(uint64_t encoded_frame_size) { cpi_->frames_since_key++; if (cpi_->oxcf.number_of_layers > 1) vp8_save_layer_context(cpi_); + vpx_clear_system_state(); } } // namespace libvpx From 5df4195b43e5b69572cdb1903d67d6f6c2917285 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Fri, 10 Sep 2021 15:54:51 -0700 Subject: [PATCH 0222/1000] Define the VPX_NO_RETURN macro for MSVC Define VPX_NO_RETURN as __declspec(noreturn) for MSVC. See https://docs.microsoft.com/en-us/cpp/cpp/noreturn?view=msvc-160 This requires moving VPX_NO_RETURN before function declarations because __declspec(noreturn) must be placed there. Fortunately GCC's __attribute__((noreturn)) can be placed either before or after function declarations. Change-Id: Id9bb0077e2a4f16ec2ca9c913dd93673a0e385cf (cherry picked from commit 8a6fbc0b4eb8538e213782bcdc3969a08b44e73b) --- args.c | 6 ++++-- tools_common.h | 10 ++++++---- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/args.c b/args.c index a87b138b9d..17b615584e 100644 --- a/args.c +++ b/args.c @@ -16,8 +16,10 @@ #include "vpx/vpx_integer.h" #include "vpx_ports/msvc.h" -#if defined(__GNUC__) && __GNUC__ -extern void die(const char *fmt, ...) __attribute__((noreturn)); +#if defined(__GNUC__) +__attribute__((noreturn)) extern void die(const char *fmt, ...); +#elif defined(_MSC_VER) +__declspec(noreturn) extern void die(const char *fmt, ...); #else extern void die(const char *fmt, ...); #endif diff --git a/tools_common.h b/tools_common.h index 4526d9f165..4e8851fc15 100644 --- a/tools_common.h +++ b/tools_common.h @@ -110,6 +110,8 @@ extern "C" { #if defined(__GNUC__) #define VPX_NO_RETURN __attribute__((noreturn)) +#elif defined(_MSC_VER) +#define VPX_NO_RETURN __declspec(noreturn) #else #define VPX_NO_RETURN #endif @@ -117,14 +119,14 @@ extern "C" { /* Sets a stdio stream into binary mode */ FILE *set_binary_mode(FILE *stream); -void die(const char *fmt, ...) VPX_NO_RETURN; -void fatal(const char *fmt, ...) VPX_NO_RETURN; +VPX_NO_RETURN void die(const char *fmt, ...); +VPX_NO_RETURN void fatal(const char *fmt, ...); void warn(const char *fmt, ...); -void die_codec(vpx_codec_ctx_t *ctx, const char *s) VPX_NO_RETURN; +VPX_NO_RETURN void die_codec(vpx_codec_ctx_t *ctx, const char *s); /* The tool including this file must define usage_exit() */ -void usage_exit(void) VPX_NO_RETURN; +VPX_NO_RETURN void usage_exit(void); #undef VPX_NO_RETURN From d00e68ad8789dc8bb210961532e20f0e9f6d55ae Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Mon, 20 Sep 2021 13:37:43 -0700 Subject: [PATCH 0223/1000] Cap duration to avoid overflow Bug: webm:1728 Change-Id: Id13475660fa921e8ddcc89847e978da4c8d85886 (cherry picked from commit 09775194ffdb84b4979f3988e7ef301575b661df) --- vp8/encoder/onyx_if.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index 71ef057a4a..c6c162347f 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -4919,6 +4919,8 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, this_duration = cpi->source->ts_end - cpi->last_end_time_stamp_seen; last_duration = cpi->last_end_time_stamp_seen - cpi->last_time_stamp_seen; + // Cap this to avoid overflow of (this_duration - last_duration) * 10 + this_duration = VPXMIN(this_duration, INT64_MAX / 10); /* do a step update if the duration changes by 10% */ if (last_duration) { step = (int)(((this_duration - last_duration) * 10 / last_duration)); From 16837ae1680bbc73381570cc783439b0ea121ba6 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Mon, 27 Sep 2021 15:52:53 -0700 Subject: [PATCH 0224/1000] CHANGELOG for Smew v1.11.0 Bug: webm:1732 Change-Id: I6038f401cf1dfdcaca85b81d0b8b2c04967b44dd --- CHANGELOG | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/CHANGELOG b/CHANGELOG index 6338caa380..ea2fc9d81c 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,3 +1,29 @@ +2021-09-27 v1.11.0 "Smew Duck" + This maintenance release adds support for VBR mode in VP9 rate control + interface, new codec controls to get quantization parameters and loop filter + levels, and includes several improvements to NEON and numerous bug fixes. + + - Upgrading: + New codec control is added to get quantization parameters and loop filter + levels. + + VBR mode is supported in VP9 rate control library. + + - Enhancement: + Numerous improvements for Neon optimizations. + Code clean-up and refactoring. + Calculation of rd multiplier is changed with BDRATE gains. + + - Bug fixes: + Fix to overflow on duration. + Fix to several instances of -Wunused-but-set-variable. + Fix to avoid chroma resampling for 420mpeg2 input. + Fix to overflow in calc_iframe_target_size. + Fix to disallow skipping transform and quantization. + Fix some -Wsign-compare warnings in simple_encode. + Fix input file path in simple_encode_test. + Fix valid range for under/over_shoot pct. + 2021-03-09 v1.10.0 "Ruddy Duck" This maintenance release adds support for darwin20 and new codec controls, as well as numerous bug fixes. From f8733b3fb7eb3cf1154a9e693351097ec42005a2 Mon Sep 17 00:00:00 2001 From: Marco Paniconi Date: Fri, 1 Oct 2021 11:54:53 -0700 Subject: [PATCH 0225/1000] vp8: For screen mode: clip buffer from below Condition already existed for screen content mode, but only when frame-dropper was off. Remove the frame drop condition. Change-Id: Ie7357041f5ca05b01e78b4bd3b40da060382591b --- vp8/encoder/onyx_if.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index c57c746469..6890a470a9 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -4516,10 +4516,10 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, cpi->bits_off_target = cpi->oxcf.maximum_buffer_size; } - // If the frame dropper is not enabled, don't let the buffer level go below - // some threshold, given here by -|maximum_buffer_size|. For now we only do - // this for screen content input. - if (cpi->drop_frames_allowed == 0 && cpi->oxcf.screen_content_mode && + // Don't let the buffer level go below some threshold, given here + // by -|maximum_buffer_size|. For now we only do this for + // screen content input. + if (cpi->oxcf.screen_content_mode && cpi->bits_off_target < -cpi->oxcf.maximum_buffer_size) { cpi->bits_off_target = -cpi->oxcf.maximum_buffer_size; } From 167de33ca8b24f072a88fcf51fbe782763717d00 Mon Sep 17 00:00:00 2001 From: Marco Paniconi Date: Fri, 1 Oct 2021 13:16:56 -0700 Subject: [PATCH 0226/1000] vp8: Condition decimation drop logic on drop_frames_allowed This allows user to make sure frame will be encoded when drop_frames is set off (on the fly), no matter the state of the buffer. Change-Id: Ia7b39b93fe3721dd586bdbede72c525db87b6890 --- vp8/encoder/onyx_if.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index c57c746469..8466bba7c7 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -3481,7 +3481,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, * Note that dropping a key frame can be problematic if spatial * resampling is also active */ - if (cpi->decimation_factor > 0) { + if (cpi->decimation_factor > 0 && cpi->drop_frames_allowed) { switch (cpi->decimation_factor) { case 1: cpi->per_frame_bandwidth = cpi->per_frame_bandwidth * 3 / 2; From fccaa5fa7a3e134949f5ea9fe3d4f3c388d4243b Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 1 Oct 2021 13:46:02 -0700 Subject: [PATCH 0227/1000] {vp8,vp9}_set_roi_map: fix validation with INT_MIN previously ranges were checked with abs() whose behavior is undefined with INT_MIN. this fixes a crash when the original value is returned and it later used as and offset into a table. Bug: webm:1742 Change-Id: I345970b75c46699587a4fbc4a059e59277f4c2c8 --- test/encode_api_test.cc | 122 ++++++++++++++++++++++++++++++++++++-- vp8/encoder/onyx_if.c | 18 +++--- vp9/encoder/vp9_encoder.c | 13 ++-- 3 files changed, 134 insertions(+), 19 deletions(-) diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc index 6bd7e593da..dec19b2268 100644 --- a/test/encode_api_test.cc +++ b/test/encode_api_test.cc @@ -8,6 +8,9 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include +#include + #include "third_party/googletest/src/include/gtest/gtest.h" #include "./vpx_config.h" @@ -18,6 +21,12 @@ namespace { #define NELEMENTS(x) static_cast(sizeof(x) / sizeof(x[0])) +bool IsVP9(const vpx_codec_iface_t *iface) { + static const char kVP9Name[] = "WebM Project VP9"; + return strncmp(kVP9Name, vpx_codec_iface_name(iface), sizeof(kVP9Name) - 1) == + 0; +} + TEST(EncodeAPI, InvalidParams) { static const vpx_codec_iface_t *kCodecs[] = { #if CONFIG_VP8_ENCODER @@ -184,10 +193,7 @@ TEST(EncodeAPI, MultiResEncode) { } // VP9 should report incapable, VP8 invalid for all configurations. - const char kVP9Name[] = "WebM Project VP9"; - const bool is_vp9 = strncmp(kVP9Name, vpx_codec_iface_name(iface), - sizeof(kVP9Name) - 1) == 0; - EXPECT_EQ(is_vp9 ? VPX_CODEC_INCAPABLE : VPX_CODEC_INVALID_PARAM, + EXPECT_EQ(IsVP9(iface) ? VPX_CODEC_INCAPABLE : VPX_CODEC_INVALID_PARAM, vpx_codec_enc_init_multi(&enc[0], iface, &cfg[0], 2, 0, &dsf[0])); for (int i = 0; i < 2; i++) { @@ -196,4 +202,112 @@ TEST(EncodeAPI, MultiResEncode) { } } +TEST(EncodeAPI, SetRoi) { + static struct { + const vpx_codec_iface_t *iface; + int ctrl_id; + } kCodecs[] = { +#if CONFIG_VP8_ENCODER + { &vpx_codec_vp8_cx_algo, VP8E_SET_ROI_MAP }, +#endif +#if CONFIG_VP9_ENCODER + { &vpx_codec_vp9_cx_algo, VP9E_SET_ROI_MAP }, +#endif + }; + constexpr int kWidth = 64; + constexpr int kHeight = 64; + + for (const auto &codec : kCodecs) { + SCOPED_TRACE(vpx_codec_iface_name(codec.iface)); + vpx_codec_ctx_t enc; + vpx_codec_enc_cfg_t cfg; + + EXPECT_EQ(vpx_codec_enc_config_default(codec.iface, &cfg, 0), VPX_CODEC_OK); + cfg.g_w = kWidth; + cfg.g_h = kHeight; + EXPECT_EQ(vpx_codec_enc_init(&enc, codec.iface, &cfg, 0), VPX_CODEC_OK); + + vpx_roi_map_t roi = {}; + uint8_t roi_map[kWidth * kHeight] = {}; + if (IsVP9(codec.iface)) { + roi.rows = (cfg.g_w + 7) >> 3; + roi.cols = (cfg.g_h + 7) >> 3; + } else { + roi.rows = (cfg.g_w + 15) >> 4; + roi.cols = (cfg.g_h + 15) >> 4; + } + EXPECT_EQ(vpx_codec_control_(&enc, codec.ctrl_id, &roi), VPX_CODEC_OK); + + roi.roi_map = roi_map; + // VP8 only. This value isn't range checked. + roi.static_threshold[1] = 1000; + roi.static_threshold[2] = INT_MIN; + roi.static_threshold[3] = INT_MAX; + + for (const auto delta : { -63, -1, 0, 1, 63 }) { + for (int i = 0; i < 8; ++i) { + roi.delta_q[i] = delta; + roi.delta_lf[i] = delta; + // VP9 only. + roi.skip[i] ^= 1; + roi.ref_frame[i] = (roi.ref_frame[i] + 1) % 4; + EXPECT_EQ(vpx_codec_control_(&enc, codec.ctrl_id, &roi), VPX_CODEC_OK); + } + } + + vpx_codec_err_t expected_error; + for (const auto delta : { -64, 64, INT_MIN, INT_MAX }) { + expected_error = VPX_CODEC_INVALID_PARAM; + for (int i = 0; i < 8; ++i) { + roi.delta_q[i] = delta; + // The max segment count for VP8 is 4, the remainder of the entries are + // ignored. + if (i >= 4 && !IsVP9(codec.iface)) expected_error = VPX_CODEC_OK; + + EXPECT_EQ(vpx_codec_control_(&enc, codec.ctrl_id, &roi), expected_error) + << "delta_q[" << i << "]: " << delta; + roi.delta_q[i] = 0; + + roi.delta_lf[i] = delta; + EXPECT_EQ(vpx_codec_control_(&enc, codec.ctrl_id, &roi), expected_error) + << "delta_lf[" << i << "]: " << delta; + roi.delta_lf[i] = 0; + } + } + + // VP8 should ignore skip[] and ref_frame[] values. + expected_error = + IsVP9(codec.iface) ? VPX_CODEC_INVALID_PARAM : VPX_CODEC_OK; + for (const auto skip : { -2, 2, INT_MIN, INT_MAX }) { + for (int i = 0; i < 8; ++i) { + roi.skip[i] = skip; + EXPECT_EQ(vpx_codec_control_(&enc, codec.ctrl_id, &roi), expected_error) + << "skip[" << i << "]: " << skip; + roi.skip[i] = 0; + } + } + + // VP9 allows negative values to be used to disable segmentation. + for (int ref_frame = -3; ref_frame < 0; ++ref_frame) { + for (int i = 0; i < 8; ++i) { + roi.ref_frame[i] = ref_frame; + EXPECT_EQ(vpx_codec_control_(&enc, codec.ctrl_id, &roi), VPX_CODEC_OK) + << "ref_frame[" << i << "]: " << ref_frame; + roi.ref_frame[i] = 0; + } + } + + for (const auto ref_frame : { 4, INT_MIN, INT_MAX }) { + for (int i = 0; i < 8; ++i) { + roi.ref_frame[i] = ref_frame; + EXPECT_EQ(vpx_codec_control_(&enc, codec.ctrl_id, &roi), expected_error) + << "ref_frame[" << i << "]: " << ref_frame; + roi.ref_frame[i] = 0; + } + } + + EXPECT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK); + } +} + } // namespace diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index c57c746469..1b181cebe4 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -5320,17 +5320,13 @@ int vp8_set_roimap(VP8_COMP *cpi, unsigned char *map, unsigned int rows, return -1; } - // Range check the delta Q values and convert the external Q range values - // to internal ones. - if ((abs(delta_q[0]) > range) || (abs(delta_q[1]) > range) || - (abs(delta_q[2]) > range) || (abs(delta_q[3]) > range)) { - return -1; - } - - // Range check the delta lf values - if ((abs(delta_lf[0]) > range) || (abs(delta_lf[1]) > range) || - (abs(delta_lf[2]) > range) || (abs(delta_lf[3]) > range)) { - return -1; + for (i = 0; i < MAX_MB_SEGMENTS; ++i) { + // Note abs() alone can't be used as the behavior of abs(INT_MIN) is + // undefined. + if (delta_q[i] > range || delta_q[i] < -range || delta_lf[i] > range || + delta_lf[i] < -range) { + return -1; + } } // Also disable segmentation if no deltas are specified. diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 6cd8cb80e8..7e80835f6c 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -654,10 +654,15 @@ static void init_level_info(Vp9LevelInfo *level_info) { } static int check_seg_range(int seg_data[8], int range) { - return !(abs(seg_data[0]) > range || abs(seg_data[1]) > range || - abs(seg_data[2]) > range || abs(seg_data[3]) > range || - abs(seg_data[4]) > range || abs(seg_data[5]) > range || - abs(seg_data[6]) > range || abs(seg_data[7]) > range); + int i; + for (i = 0; i < 8; ++i) { + // Note abs() alone can't be used as the behavior of abs(INT_MIN) is + // undefined. + if (seg_data[i] > range || seg_data[i] < -range) { + return 0; + } + } + return 1; } VP9_LEVEL vp9_get_level(const Vp9LevelSpec *const level_spec) { From fe3b58cffa250aaa8c20ee2e53ce3e92cac3f440 Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 1 Oct 2021 15:42:50 -0700 Subject: [PATCH 0228/1000] vpx_roi_map: add delta range info Change-Id: If2ef4400562075b4e7abadc01638a46c0c7f1859 --- vpx/vp8cx.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h index 28bd861747..47c38d3b5e 100644 --- a/vpx/vp8cx.h +++ b/vpx/vp8cx.h @@ -808,8 +808,8 @@ typedef struct vpx_roi_map { unsigned int rows; /**< Number of rows. */ unsigned int cols; /**< Number of columns. */ /*! VP8 only uses the first 4 segments. VP9 uses 8 segments. */ - int delta_q[8]; /**< Quantizer deltas. */ - int delta_lf[8]; /**< Loop filter deltas. */ + int delta_q[8]; /**< Quantizer deltas. Valid range: [-63, 63].*/ + int delta_lf[8]; /**< Loop filter deltas. Valid range: [-63, 63].*/ /*! skip and ref frame segment is only used in VP9. */ int skip[8]; /**< Skip this block. */ int ref_frame[8]; /**< Reference frame for this block. */ From 2ea1b908d87b29bcc6214efd3073b92392d495ff Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 1 Oct 2021 13:46:02 -0700 Subject: [PATCH 0229/1000] {vp8,vp9}_set_roi_map: fix validation with INT_MIN previously ranges were checked with abs() whose behavior is undefined with INT_MIN. this fixes a crash when the original value is returned and it later used as and offset into a table. Bug: webm:1742 Change-Id: I345970b75c46699587a4fbc4a059e59277f4c2c8 --- test/encode_api_test.cc | 122 ++++++++++++++++++++++++++++++++++++-- vp8/encoder/onyx_if.c | 18 +++--- vp9/encoder/vp9_encoder.c | 13 ++-- 3 files changed, 134 insertions(+), 19 deletions(-) diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc index 6bd7e593da..dec19b2268 100644 --- a/test/encode_api_test.cc +++ b/test/encode_api_test.cc @@ -8,6 +8,9 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include +#include + #include "third_party/googletest/src/include/gtest/gtest.h" #include "./vpx_config.h" @@ -18,6 +21,12 @@ namespace { #define NELEMENTS(x) static_cast(sizeof(x) / sizeof(x[0])) +bool IsVP9(const vpx_codec_iface_t *iface) { + static const char kVP9Name[] = "WebM Project VP9"; + return strncmp(kVP9Name, vpx_codec_iface_name(iface), sizeof(kVP9Name) - 1) == + 0; +} + TEST(EncodeAPI, InvalidParams) { static const vpx_codec_iface_t *kCodecs[] = { #if CONFIG_VP8_ENCODER @@ -184,10 +193,7 @@ TEST(EncodeAPI, MultiResEncode) { } // VP9 should report incapable, VP8 invalid for all configurations. - const char kVP9Name[] = "WebM Project VP9"; - const bool is_vp9 = strncmp(kVP9Name, vpx_codec_iface_name(iface), - sizeof(kVP9Name) - 1) == 0; - EXPECT_EQ(is_vp9 ? VPX_CODEC_INCAPABLE : VPX_CODEC_INVALID_PARAM, + EXPECT_EQ(IsVP9(iface) ? VPX_CODEC_INCAPABLE : VPX_CODEC_INVALID_PARAM, vpx_codec_enc_init_multi(&enc[0], iface, &cfg[0], 2, 0, &dsf[0])); for (int i = 0; i < 2; i++) { @@ -196,4 +202,112 @@ TEST(EncodeAPI, MultiResEncode) { } } +TEST(EncodeAPI, SetRoi) { + static struct { + const vpx_codec_iface_t *iface; + int ctrl_id; + } kCodecs[] = { +#if CONFIG_VP8_ENCODER + { &vpx_codec_vp8_cx_algo, VP8E_SET_ROI_MAP }, +#endif +#if CONFIG_VP9_ENCODER + { &vpx_codec_vp9_cx_algo, VP9E_SET_ROI_MAP }, +#endif + }; + constexpr int kWidth = 64; + constexpr int kHeight = 64; + + for (const auto &codec : kCodecs) { + SCOPED_TRACE(vpx_codec_iface_name(codec.iface)); + vpx_codec_ctx_t enc; + vpx_codec_enc_cfg_t cfg; + + EXPECT_EQ(vpx_codec_enc_config_default(codec.iface, &cfg, 0), VPX_CODEC_OK); + cfg.g_w = kWidth; + cfg.g_h = kHeight; + EXPECT_EQ(vpx_codec_enc_init(&enc, codec.iface, &cfg, 0), VPX_CODEC_OK); + + vpx_roi_map_t roi = {}; + uint8_t roi_map[kWidth * kHeight] = {}; + if (IsVP9(codec.iface)) { + roi.rows = (cfg.g_w + 7) >> 3; + roi.cols = (cfg.g_h + 7) >> 3; + } else { + roi.rows = (cfg.g_w + 15) >> 4; + roi.cols = (cfg.g_h + 15) >> 4; + } + EXPECT_EQ(vpx_codec_control_(&enc, codec.ctrl_id, &roi), VPX_CODEC_OK); + + roi.roi_map = roi_map; + // VP8 only. This value isn't range checked. + roi.static_threshold[1] = 1000; + roi.static_threshold[2] = INT_MIN; + roi.static_threshold[3] = INT_MAX; + + for (const auto delta : { -63, -1, 0, 1, 63 }) { + for (int i = 0; i < 8; ++i) { + roi.delta_q[i] = delta; + roi.delta_lf[i] = delta; + // VP9 only. + roi.skip[i] ^= 1; + roi.ref_frame[i] = (roi.ref_frame[i] + 1) % 4; + EXPECT_EQ(vpx_codec_control_(&enc, codec.ctrl_id, &roi), VPX_CODEC_OK); + } + } + + vpx_codec_err_t expected_error; + for (const auto delta : { -64, 64, INT_MIN, INT_MAX }) { + expected_error = VPX_CODEC_INVALID_PARAM; + for (int i = 0; i < 8; ++i) { + roi.delta_q[i] = delta; + // The max segment count for VP8 is 4, the remainder of the entries are + // ignored. + if (i >= 4 && !IsVP9(codec.iface)) expected_error = VPX_CODEC_OK; + + EXPECT_EQ(vpx_codec_control_(&enc, codec.ctrl_id, &roi), expected_error) + << "delta_q[" << i << "]: " << delta; + roi.delta_q[i] = 0; + + roi.delta_lf[i] = delta; + EXPECT_EQ(vpx_codec_control_(&enc, codec.ctrl_id, &roi), expected_error) + << "delta_lf[" << i << "]: " << delta; + roi.delta_lf[i] = 0; + } + } + + // VP8 should ignore skip[] and ref_frame[] values. + expected_error = + IsVP9(codec.iface) ? VPX_CODEC_INVALID_PARAM : VPX_CODEC_OK; + for (const auto skip : { -2, 2, INT_MIN, INT_MAX }) { + for (int i = 0; i < 8; ++i) { + roi.skip[i] = skip; + EXPECT_EQ(vpx_codec_control_(&enc, codec.ctrl_id, &roi), expected_error) + << "skip[" << i << "]: " << skip; + roi.skip[i] = 0; + } + } + + // VP9 allows negative values to be used to disable segmentation. + for (int ref_frame = -3; ref_frame < 0; ++ref_frame) { + for (int i = 0; i < 8; ++i) { + roi.ref_frame[i] = ref_frame; + EXPECT_EQ(vpx_codec_control_(&enc, codec.ctrl_id, &roi), VPX_CODEC_OK) + << "ref_frame[" << i << "]: " << ref_frame; + roi.ref_frame[i] = 0; + } + } + + for (const auto ref_frame : { 4, INT_MIN, INT_MAX }) { + for (int i = 0; i < 8; ++i) { + roi.ref_frame[i] = ref_frame; + EXPECT_EQ(vpx_codec_control_(&enc, codec.ctrl_id, &roi), expected_error) + << "ref_frame[" << i << "]: " << ref_frame; + roi.ref_frame[i] = 0; + } + } + + EXPECT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK); + } +} + } // namespace diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index c6c162347f..2b059a1e44 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -5318,17 +5318,13 @@ int vp8_set_roimap(VP8_COMP *cpi, unsigned char *map, unsigned int rows, return -1; } - // Range check the delta Q values and convert the external Q range values - // to internal ones. - if ((abs(delta_q[0]) > range) || (abs(delta_q[1]) > range) || - (abs(delta_q[2]) > range) || (abs(delta_q[3]) > range)) { - return -1; - } - - // Range check the delta lf values - if ((abs(delta_lf[0]) > range) || (abs(delta_lf[1]) > range) || - (abs(delta_lf[2]) > range) || (abs(delta_lf[3]) > range)) { - return -1; + for (i = 0; i < MAX_MB_SEGMENTS; ++i) { + // Note abs() alone can't be used as the behavior of abs(INT_MIN) is + // undefined. + if (delta_q[i] > range || delta_q[i] < -range || delta_lf[i] > range || + delta_lf[i] < -range) { + return -1; + } } // Also disable segmentation if no deltas are specified. diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 6cd8cb80e8..7e80835f6c 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -654,10 +654,15 @@ static void init_level_info(Vp9LevelInfo *level_info) { } static int check_seg_range(int seg_data[8], int range) { - return !(abs(seg_data[0]) > range || abs(seg_data[1]) > range || - abs(seg_data[2]) > range || abs(seg_data[3]) > range || - abs(seg_data[4]) > range || abs(seg_data[5]) > range || - abs(seg_data[6]) > range || abs(seg_data[7]) > range); + int i; + for (i = 0; i < 8; ++i) { + // Note abs() alone can't be used as the behavior of abs(INT_MIN) is + // undefined. + if (seg_data[i] > range || seg_data[i] < -range) { + return 0; + } + } + return 1; } VP9_LEVEL vp9_get_level(const Vp9LevelSpec *const level_spec) { From 626ff35955c2c35b806b3e0ecf551a1a8611cdbf Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Tue, 5 Oct 2021 15:57:34 -0700 Subject: [PATCH 0230/1000] Update AUTHORS and version info in libs.mk Bug: webm:1732 Change-Id: I29ce77c7d02bd2f5cb0ef8412333df032744b668 --- AUTHORS | 6 ++++++ libs.mk | 4 ++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/AUTHORS b/AUTHORS index e804842f78..174cc59ee7 100644 --- a/AUTHORS +++ b/AUTHORS @@ -3,6 +3,7 @@ Aaron Watry Abo Talib Mahfoodh +Adam B. Goode Adrian Grange Ahmad Sharif Aidan Welch @@ -25,6 +26,7 @@ Angie Chiang Aron Rosenberg Attila Nagy Birk Magnussen +Bohan Li Brian Foley Brion Vibber changjun.yang @@ -34,6 +36,7 @@ Chi Yo Tsai chm Chris Cunningham Christian Duvivier +Chunbo Hua Clement Courbet Daniele Castagna Daniel Kang @@ -68,6 +71,7 @@ Han Shen Harish Mahendrakar Henrik Lundin Hien Ho +Hirokazu Honda Hui Su Ivan Krasin Ivan Maltz @@ -97,6 +101,7 @@ Johann Koenig John Koleszar Johnny Klonaris John Stark +Jonathan Wright Jon Kunkee Jorge E. Moreira Joshua Bleecher Snyder @@ -146,6 +151,7 @@ Pengchong Jin Peter Boström Peter Collingbourne Peter de Rivaz +Peter Kasting Philip Jägenstedt Priit Laes Rafael Ávila de Espíndola diff --git a/libs.mk b/libs.mk index f5b43abadc..7cd973bd18 100644 --- a/libs.mk +++ b/libs.mk @@ -299,8 +299,8 @@ $(BUILD_PFX)libvpx_g.a: $(LIBVPX_OBJS) # To determine SO_VERSION_{MAJOR,MINOR,PATCH}, calculate c,a,r with current # SO_VERSION_* then follow the rules in the link to detemine the new version # (c1, a1, r1) and set MAJOR to [c1-a1], MINOR to a1 and PATCH to r1 -SO_VERSION_MAJOR := 6 -SO_VERSION_MINOR := 4 +SO_VERSION_MAJOR := 7 +SO_VERSION_MINOR := 0 SO_VERSION_PATCH := 0 ifeq ($(filter darwin%,$(TGT_OS)),$(TGT_OS)) LIBVPX_SO := libvpx.$(SO_VERSION_MAJOR).dylib From 27b8a778bdbaaf803c2a15eb4d96837757480106 Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 8 Oct 2021 16:24:23 -0700 Subject: [PATCH 0231/1000] vp8_yv12_realloc_frame_buffer: move allocation check to before the memset used under msan to avoid any spurious reports in OOM conditions Change-Id: I0c4ee92829bbcb356e94f503a4615caf891bb49d --- vpx_scale/generic/yv12config.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/vpx_scale/generic/yv12config.c b/vpx_scale/generic/yv12config.c index eee291c30d..c52dab0588 100644 --- a/vpx_scale/generic/yv12config.c +++ b/vpx_scale/generic/yv12config.c @@ -64,6 +64,10 @@ int vp8_yv12_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, if (!ybf->buffer_alloc) { ybf->buffer_alloc = (uint8_t *)vpx_memalign(32, frame_size); + if (!ybf->buffer_alloc) { + ybf->buffer_alloc_sz = 0; + return -1; + } #if defined(__has_feature) #if __has_feature(memory_sanitizer) // This memset is needed for fixing the issue of using uninitialized @@ -75,7 +79,7 @@ int vp8_yv12_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, ybf->buffer_alloc_sz = frame_size; } - if (!ybf->buffer_alloc || ybf->buffer_alloc_sz < frame_size) return -1; + if (ybf->buffer_alloc_sz < frame_size) return -1; /* Only support allocating buffers that have a border that's a multiple * of 32. The border restriction is required to get 16-byte alignment of From 9039995e94ef5c70e6f3cbf5fe43367da18a0dcc Mon Sep 17 00:00:00 2001 From: James Zern Date: Sat, 9 Oct 2021 10:33:37 -0700 Subject: [PATCH 0232/1000] Android.mk: import LICENSE indicators from AOSP https://android-review.googlesource.com/c/platform/external/libvpx/+/1588942 https://android.googlesource.com/platform/external/libvpx/+/099828b5c770ef8630741721be4b6c25a8394204 Change-Id: Ieca1c882f82bcbc7546944b43af7fab358f925d2 --- build/make/Android.mk | 3 +++ 1 file changed, 3 insertions(+) diff --git a/build/make/Android.mk b/build/make/Android.mk index 6cb3af027b..b8032e67aa 100644 --- a/build/make/Android.mk +++ b/build/make/Android.mk @@ -166,6 +166,9 @@ LOCAL_CFLAGS += \ -I$(ASM_CNV_PATH)/libvpx LOCAL_MODULE := libvpx +LOCAL_LICENSE_KINDS := SPDX-license-identifier-BSD +LOCAL_LICENSE_CONDITIONS := notice +LOCAL_NOTICE_FILE := $(LOCAL_PATH)/../../LICENSE $(LOCAL_PATH)/../../PATENTS ifeq ($(CONFIG_RUNTIME_CPU_DETECT),yes) LOCAL_STATIC_LIBRARIES := cpufeatures From e259e6951d794ca6a6f2f3c9c40c5c99818613d3 Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 12 Oct 2021 11:57:39 -0700 Subject: [PATCH 0233/1000] test/Android.mk: import LICENSE indicators from AOSP https://android-review.googlesource.com/c/platform/external/libvpx/+/1853628 https://android.googlesource.com/platform/external/libvpx/+/e40f8afb1e51d3bd13d662c1881e3cfb616fa2b8 Change-Id: I15f185ab7c7661f4456c4ad7296fdda01dfb8d53 --- test/android/Android.mk | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/test/android/Android.mk b/test/android/Android.mk index 7318de2fc4..87155fcb58 100644 --- a/test/android/Android.mk +++ b/test/android/Android.mk @@ -34,6 +34,9 @@ LOCAL_C_INCLUDES := $(LOCAL_PATH)/third_party/googletest/src/ LOCAL_C_INCLUDES += $(LOCAL_PATH)/third_party/googletest/src/include/ LOCAL_EXPORT_C_INCLUDES := $(LOCAL_PATH)/third_party/googletest/src/include/ LOCAL_SRC_FILES := ./third_party/googletest/src/src/gtest-all.cc +LOCAL_LICENSE_KINDS := SPDX-license-identifier-BSD +LOCAL_LICENSE_CONDITIONS := notice +LOCAL_NOTICE_FILE := $(LOCAL_PATH)/../../LICENSE $(LOCAL_PATH)/../../PATENTS include $(BUILD_STATIC_LIBRARY) #libvpx_test @@ -48,6 +51,9 @@ else LOCAL_STATIC_LIBRARIES += vpx endif +LOCAL_LICENSE_KINDS := SPDX-license-identifier-BSD +LOCAL_LICENSE_CONDITIONS := notice +LOCAL_NOTICE_FILE := $(LOCAL_PATH)/../../LICENSE $(LOCAL_PATH)/../../PATENTS include $(LOCAL_PATH)/test/test.mk LOCAL_C_INCLUDES := $(BINDINGS_DIR) FILTERED_SRC := $(sort $(filter %.cc %.c, $(LIBVPX_TEST_SRCS-yes))) From 340f60524ffa35c7324c54fe404d84cc1a1ac402 Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 2 Nov 2021 16:29:52 -0700 Subject: [PATCH 0234/1000] vpx_codec_internal.h: add LIBVPX_FORMAT_PRINTF and use it to set the format attribute for the printf like function vpx_internal_error(). this allows the main library to be built with -Wformat-nonliteral without producing warnings; the examples will be handled in a followup. Bug: webm:1744 Change-Id: Iebc322e24db35d902c5a2b1ed767d2e10e9c91b9 --- vpx/internal/vpx_codec_internal.h | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/vpx/internal/vpx_codec_internal.h b/vpx/internal/vpx_codec_internal.h index 961b0bfe4c..670fe380ed 100644 --- a/vpx/internal/vpx_codec_internal.h +++ b/vpx/internal/vpx_codec_internal.h @@ -435,9 +435,21 @@ struct vpx_internal_error_info { #endif #endif +// Tells the compiler to perform `printf` format string checking if the +// compiler supports it; see the 'format' attribute in +// . +#define LIBVPX_FORMAT_PRINTF(string_index, first_to_check) +#if defined(__has_attribute) +#if __has_attribute(format) +#undef LIBVPX_FORMAT_PRINTF +#define LIBVPX_FORMAT_PRINTF(string_index, first_to_check) \ + __attribute__((__format__(__printf__, string_index, first_to_check))) +#endif +#endif + void vpx_internal_error(struct vpx_internal_error_info *info, - vpx_codec_err_t error, const char *fmt, - ...) CLANG_ANALYZER_NORETURN; + vpx_codec_err_t error, const char *fmt, ...) + LIBVPX_FORMAT_PRINTF(3, 4) CLANG_ANALYZER_NORETURN; #ifdef __cplusplus } // extern "C" From dd10ac8f69c1bc77fc69cd10de51092d07fbebb5 Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 2 Nov 2021 17:19:10 -0700 Subject: [PATCH 0235/1000] tools_common.h: add VPX_TOOLS_FORMAT_PRINTF and use it to set the format attribute for printf like functions. this allows the examples to be built with -Wformat-nonliteral without producing warnings. Bug: webm:1744 Change-Id: I26b4c41c9a42790053b1ae0e4a678af8f2cd1d82 Fixed: webm:1744 --- configure | 1 + examples/svc_encodeframe.c | 17 +++++++----- examples/twopass_encoder.c | 2 +- examples/vp8_multi_resolution_encoder.c | 6 ++--- examples/vpx_temporal_svc_encoder.c | 4 +-- rate_hist.c | 35 +++++++++++++------------ tools_common.h | 18 ++++++++++--- vpxenc.c | 12 +++++---- vpxstats.c | 2 +- warnings.c | 2 +- 10 files changed, 59 insertions(+), 40 deletions(-) diff --git a/configure b/configure index da631a45e1..e3babbe824 100755 --- a/configure +++ b/configure @@ -621,6 +621,7 @@ process_toolchain() { check_add_cflags -Wdeclaration-after-statement check_add_cflags -Wdisabled-optimization check_add_cflags -Wfloat-conversion + check_add_cflags -Wformat=2 check_add_cflags -Wparentheses-equality check_add_cflags -Wpointer-arith check_add_cflags -Wtype-limits diff --git a/examples/svc_encodeframe.c b/examples/svc_encodeframe.c index a73ee8ed66..08bda0e5c9 100644 --- a/examples/svc_encodeframe.c +++ b/examples/svc_encodeframe.c @@ -21,6 +21,7 @@ #include #include #define VPX_DISABLE_CTRL_TYPECHECKS 1 +#include "../tools_common.h" #include "./vpx_config.h" #include "./svc_context.h" #include "vpx/vp8cx.h" @@ -95,8 +96,9 @@ static const SvcInternal_t *get_const_svc_internal(const SvcContext *svc_ctx) { return (const SvcInternal_t *)svc_ctx->internal; } -static int svc_log(SvcContext *svc_ctx, SVC_LOG_LEVEL level, const char *fmt, - ...) { +static VPX_TOOLS_FORMAT_PRINTF(3, 4) int svc_log(SvcContext *svc_ctx, + SVC_LOG_LEVEL level, + const char *fmt, ...) { char buf[512]; int retval = 0; va_list ap; @@ -264,7 +266,7 @@ static vpx_codec_err_t parse_options(SvcContext *svc_ctx, const char *options) { if (alt_ref_enabled > REF_FRAMES - svc_ctx->spatial_layers) { svc_log(svc_ctx, SVC_LOG_ERROR, "svc: auto alt ref: Maxinum %d(REF_FRAMES - layers) layers could" - "enabled auto alt reference frame, but % layers are enabled\n", + "enabled auto alt reference frame, but %d layers are enabled\n", REF_FRAMES - svc_ctx->spatial_layers, alt_ref_enabled); res = VPX_CODEC_INVALID_PARAM; } @@ -456,10 +458,11 @@ vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx, svc_ctx->temporal_layers = VPX_TS_MAX_LAYERS; if (svc_ctx->temporal_layers * svc_ctx->spatial_layers > VPX_MAX_LAYERS) { - svc_log(svc_ctx, SVC_LOG_ERROR, - "spatial layers * temporal layers exceeds the maximum number of " - "allowed layers of %d\n", - svc_ctx->spatial_layers * svc_ctx->temporal_layers, VPX_MAX_LAYERS); + svc_log( + svc_ctx, SVC_LOG_ERROR, + "spatial layers * temporal layers (%d) exceeds the maximum number of " + "allowed layers of %d\n", + svc_ctx->spatial_layers * svc_ctx->temporal_layers, VPX_MAX_LAYERS); return VPX_CODEC_INVALID_PARAM; } res = assign_layer_bitrates(svc_ctx, enc_cfg); diff --git a/examples/twopass_encoder.c b/examples/twopass_encoder.c index 3d950b2c4b..07ba37dfd0 100644 --- a/examples/twopass_encoder.c +++ b/examples/twopass_encoder.c @@ -221,7 +221,7 @@ int main(int argc, char **argv) { die("Invalid frame size: %dx%d", w, h); if (!vpx_img_alloc(&raw, VPX_IMG_FMT_I420, w, h, 1)) - die("Failed to allocate image", w, h); + die("Failed to allocate image (%dx%d)", w, h); printf("Using %s\n", vpx_codec_iface_name(encoder->codec_interface())); diff --git a/examples/vp8_multi_resolution_encoder.c b/examples/vp8_multi_resolution_encoder.c index e72f8a0197..62d96de557 100644 --- a/examples/vp8_multi_resolution_encoder.c +++ b/examples/vp8_multi_resolution_encoder.c @@ -352,7 +352,7 @@ int main(int argc, char **argv) { framerate = (int)strtol(argv[3], NULL, 0); if (width < 16 || width % 2 || height < 16 || height % 2) - die("Invalid resolution: %ldx%ld", width, height); + die("Invalid resolution: %dx%d", width, height); /* Open input video file for encoding */ if (!(infile = fopen(argv[4], "rb"))) @@ -380,7 +380,7 @@ int main(int argc, char **argv) { (int)strtol(argv[2 * NUM_ENCODERS + 5 + i], NULL, 0); if (num_temporal_layers[i] < 1 || num_temporal_layers[i] > 3) die("Invalid temporal layers: %d, Must be 1, 2, or 3. \n", - num_temporal_layers); + num_temporal_layers[i]); } /* Open file to write out each spatially downsampled input stream. */ @@ -468,7 +468,7 @@ int main(int argc, char **argv) { /* Allocate image for each encoder */ for (i = 0; i < NUM_ENCODERS; i++) if (!vpx_img_alloc(&raw[i], VPX_IMG_FMT_I420, cfg[i].g_w, cfg[i].g_h, 32)) - die("Failed to allocate image", cfg[i].g_w, cfg[i].g_h); + die("Failed to allocate image (%dx%d)", cfg[i].g_w, cfg[i].g_h); if (raw[0].stride[VPX_PLANE_Y] == (int)raw[0].d_w) read_frame_p = mulres_read_frame; diff --git a/examples/vpx_temporal_svc_encoder.c b/examples/vpx_temporal_svc_encoder.c index ad3e79c713..47f30751eb 100644 --- a/examples/vpx_temporal_svc_encoder.c +++ b/examples/vpx_temporal_svc_encoder.c @@ -687,14 +687,14 @@ int main(int argc, char **argv) { &raw, bit_depth == VPX_BITS_8 ? VPX_IMG_FMT_I420 : VPX_IMG_FMT_I42016, width, height, 32)) { - die("Failed to allocate image", width, height); + die("Failed to allocate image (%dx%d)", width, height); } } #else // Y4M reader has its own allocation. if (input_ctx.file_type != FILE_TYPE_Y4M) { if (!vpx_img_alloc(&raw, VPX_IMG_FMT_I420, width, height, 32)) { - die("Failed to allocate image", width, height); + die("Failed to allocate image (%dx%d)", width, height); } } #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/rate_hist.c b/rate_hist.c index 6cf8ce7bb0..d10e754fee 100644 --- a/rate_hist.c +++ b/rate_hist.c @@ -193,7 +193,7 @@ static int merge_hist_buckets(struct hist_bucket *bucket, int max_buckets, static void show_histogram(const struct hist_bucket *bucket, int buckets, int total, int scale) { - const char *pat1, *pat2; + int width1, width2; int i; assert(bucket != NULL); @@ -201,32 +201,32 @@ static void show_histogram(const struct hist_bucket *bucket, int buckets, switch ((int)(log(bucket[buckets - 1].high) / log(10)) + 1) { case 1: case 2: - pat1 = "%4d %2s: "; - pat2 = "%4d-%2d: "; + width1 = 4; + width2 = 2; break; case 3: - pat1 = "%5d %3s: "; - pat2 = "%5d-%3d: "; + width1 = 5; + width2 = 3; break; case 4: - pat1 = "%6d %4s: "; - pat2 = "%6d-%4d: "; + width1 = 6; + width2 = 4; break; case 5: - pat1 = "%7d %5s: "; - pat2 = "%7d-%5d: "; + width1 = 7; + width2 = 5; break; case 6: - pat1 = "%8d %6s: "; - pat2 = "%8d-%6d: "; + width1 = 8; + width2 = 6; break; case 7: - pat1 = "%9d %7s: "; - pat2 = "%9d-%7d: "; + width1 = 9; + width2 = 7; break; default: - pat1 = "%12d %10s: "; - pat2 = "%12d-%10d: "; + width1 = 12; + width2 = 10; break; } @@ -241,9 +241,10 @@ static void show_histogram(const struct hist_bucket *bucket, int buckets, assert(len <= HIST_BAR_MAX); if (bucket[i].low == bucket[i].high) - fprintf(stderr, pat1, bucket[i].low, ""); + fprintf(stderr, "%*d %*s: ", width1, bucket[i].low, width2, ""); else - fprintf(stderr, pat2, bucket[i].low, bucket[i].high); + fprintf(stderr, "%*d-%*d: ", width1, bucket[i].low, width2, + bucket[i].high); for (j = 0; j < HIST_BAR_MAX; j++) fprintf(stderr, j < len ? "=" : " "); fprintf(stderr, "\t%5d (%6.2f%%)\n", bucket[i].count, pct); diff --git a/tools_common.h b/tools_common.h index 4e8851fc15..b9cfb9cc85 100644 --- a/tools_common.h +++ b/tools_common.h @@ -116,12 +116,24 @@ extern "C" { #define VPX_NO_RETURN #endif +// Tells the compiler to perform `printf` format string checking if the +// compiler supports it; see the 'format' attribute in +// . +#define VPX_TOOLS_FORMAT_PRINTF(string_index, first_to_check) +#if defined(__has_attribute) +#if __has_attribute(format) +#undef VPX_TOOLS_FORMAT_PRINTF +#define VPX_TOOLS_FORMAT_PRINTF(string_index, first_to_check) \ + __attribute__((__format__(__printf__, string_index, first_to_check))) +#endif +#endif + /* Sets a stdio stream into binary mode */ FILE *set_binary_mode(FILE *stream); -VPX_NO_RETURN void die(const char *fmt, ...); -VPX_NO_RETURN void fatal(const char *fmt, ...); -void warn(const char *fmt, ...); +VPX_NO_RETURN void die(const char *fmt, ...) VPX_TOOLS_FORMAT_PRINTF(1, 2); +VPX_NO_RETURN void fatal(const char *fmt, ...) VPX_TOOLS_FORMAT_PRINTF(1, 2); +void warn(const char *fmt, ...) VPX_TOOLS_FORMAT_PRINTF(1, 2); VPX_NO_RETURN void die_codec(vpx_codec_ctx_t *ctx, const char *s); diff --git a/vpxenc.c b/vpxenc.c index a0122ef804..b64b6cf441 100644 --- a/vpxenc.c +++ b/vpxenc.c @@ -58,8 +58,8 @@ static size_t wrap_fwrite(const void *ptr, size_t size, size_t nmemb, static const char *exec_name; -static void warn_or_exit_on_errorv(vpx_codec_ctx_t *ctx, int fatal, - const char *s, va_list ap) { +static VPX_TOOLS_FORMAT_PRINTF(3, 0) void warn_or_exit_on_errorv( + vpx_codec_ctx_t *ctx, int fatal, const char *s, va_list ap) { if (ctx->err) { const char *detail = vpx_codec_error_detail(ctx); @@ -72,7 +72,9 @@ static void warn_or_exit_on_errorv(vpx_codec_ctx_t *ctx, int fatal, } } -static void ctx_exit_on_error(vpx_codec_ctx_t *ctx, const char *s, ...) { +static VPX_TOOLS_FORMAT_PRINTF(2, + 3) void ctx_exit_on_error(vpx_codec_ctx_t *ctx, + const char *s, ...) { va_list ap; va_start(ap, s); @@ -80,8 +82,8 @@ static void ctx_exit_on_error(vpx_codec_ctx_t *ctx, const char *s, ...) { va_end(ap); } -static void warn_or_exit_on_error(vpx_codec_ctx_t *ctx, int fatal, - const char *s, ...) { +static VPX_TOOLS_FORMAT_PRINTF(3, 4) void warn_or_exit_on_error( + vpx_codec_ctx_t *ctx, int fatal, const char *s, ...) { va_list ap; va_start(ap, s); diff --git a/vpxstats.c b/vpxstats.c index 142e367bb4..c0dd14e450 100644 --- a/vpxstats.c +++ b/vpxstats.c @@ -41,7 +41,7 @@ int stats_open_file(stats_io_t *stats, const char *fpf, int pass) { stats->buf.buf = malloc(stats->buf_alloc_sz); if (!stats->buf.buf) - fatal("Failed to allocate first-pass stats buffer (%lu bytes)", + fatal("Failed to allocate first-pass stats buffer (%u bytes)", (unsigned int)stats->buf_alloc_sz); nbytes = fread(stats->buf.buf, 1, stats->buf.sz, stats->file); diff --git a/warnings.c b/warnings.c index a80da527f7..3e6e702536 100644 --- a/warnings.c +++ b/warnings.c @@ -98,7 +98,7 @@ void check_encoder_config(int disable_prompt, /* Count and print warnings. */ for (warning = warning_list.warning_node; warning != NULL; warning = warning->next_warning, ++num_warnings) { - warn(warning->warning_string); + warn("%s", warning->warning_string); } free_warning_list(&warning_list); From f3b95b1f56ed0b8d3fce8b998f431341f5d8c680 Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 3 Nov 2021 16:23:06 -0700 Subject: [PATCH 0236/1000] update tools/cpplint.py https://github.com/google/styleguide.git 100755 blob 4a82bde4f95cef8103520bc2c019483397ec51f4 cpplint/cpplint.py Bug: aomedia:3178 Change-Id: I9e11d647096fc2082b18d74731026dabb52639bb --- tools/cpplint.py | 3442 +++++++++++++++++++++++++++++++++------------- 1 file changed, 2465 insertions(+), 977 deletions(-) diff --git a/tools/cpplint.py b/tools/cpplint.py index 25fbef73d8..e3ebde2f5a 100755 --- a/tools/cpplint.py +++ b/tools/cpplint.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/env python # # Copyright (c) 2009 Google Inc. All rights reserved. # @@ -51,16 +51,23 @@ import string import sys import unicodedata +import sysconfig + +try: + xrange # Python 2 +except NameError: + xrange = range # Python 3 _USAGE = """ Syntax: cpplint.py [--verbose=#] [--output=vs7] [--filter=-x,+y,...] [--counting=total|toplevel|detailed] [--root=subdir] - [--linelength=digits] + [--linelength=digits] [--headers=x,y,...] + [--quiet] [file] ... The style guidelines this tries to follow are those in - http://google-styleguide.googlecode.com/svn/trunk/cppguide.xml + https://google-styleguide.googlecode.com/svn/trunk/cppguide.xml Every problem is given a confidence score from 1-5, with 5 meaning we are certain of the problem, and 1 meaning it could be a legitimate construct. @@ -83,6 +90,9 @@ verbose=# Specify a number 0-5 to restrict errors to certain verbosity levels. + quiet + Don't print anything if no errors are found. + filter=-x,+y,... Specify a comma-separated list of category-filters to apply: only error messages whose category names pass the filters will be printed. @@ -114,12 +124,13 @@ ignored. Examples: - Assuing that src/.git exists, the header guard CPP variables for - src/chrome/browser/ui/browser.h are: + Assuming that top/src/.git exists (and cwd=top/src), the header guard + CPP variables for top/src/chrome/browser/ui/browser.h are: No flag => CHROME_BROWSER_UI_BROWSER_H_ --root=chrome => BROWSER_UI_BROWSER_H_ --root=chrome/browser => UI_BROWSER_H_ + --root=.. => SRC_CHROME_BROWSER_UI_BROWSER_H_ linelength=digits This is the allowed line length for the project. The default value is @@ -133,6 +144,57 @@ Examples: --extensions=hpp,cpp + + headers=x,y,... + The header extensions that cpplint will treat as .h in checks. Values are + automatically added to --extensions list. + + Examples: + --headers=hpp,hxx + --headers=hpp + + cpplint.py supports per-directory configurations specified in CPPLINT.cfg + files. CPPLINT.cfg file can contain a number of key=value pairs. + Currently the following options are supported: + + set noparent + filter=+filter1,-filter2,... + exclude_files=regex + linelength=80 + root=subdir + headers=x,y,... + + "set noparent" option prevents cpplint from traversing directory tree + upwards looking for more .cfg files in parent directories. This option + is usually placed in the top-level project directory. + + The "filter" option is similar in function to --filter flag. It specifies + message filters in addition to the |_DEFAULT_FILTERS| and those specified + through --filter command-line flag. + + "exclude_files" allows to specify a regular expression to be matched against + a file name. If the expression matches, the file is skipped and not run + through liner. + + "linelength" allows to specify the allowed line length for the project. + + The "root" option is similar in function to the --root flag (see example + above). Paths are relative to the directory of the CPPLINT.cfg. + + The "headers" option is similar in function to the --headers flag + (see example above). + + CPPLINT.cfg has an effect on files in the same directory and all + sub-directories, unless overridden by a nested configuration file. + + Example file: + filter=-build/include_order,+build/include_alpha + exclude_files=.*\.cc + + The above example disables build/include_order warning and enables + build/include_alpha as well as excludes all .cc from being + processed by linter, in the current directory (where the .cfg + file is located) and all sub-directories. """ # We categorize each error message we print. Here are the categories. @@ -140,81 +202,101 @@ # If you add a new error message with a new category, add it to the list # here! cpplint_unittest.py should tell you if you forget to do this. _ERROR_CATEGORIES = [ - 'build/class', - 'build/deprecated', - 'build/endif_comment', - 'build/explicit_make_pair', - 'build/forward_decl', - 'build/header_guard', - 'build/include', - 'build/include_alpha', - 'build/include_order', - 'build/include_what_you_use', - 'build/namespaces', - 'build/printf_format', - 'build/storage_class', - 'legal/copyright', - 'readability/alt_tokens', - 'readability/braces', - 'readability/casting', - 'readability/check', - 'readability/constructors', - 'readability/fn_size', - 'readability/function', - 'readability/multiline_comment', - 'readability/multiline_string', - 'readability/namespace', - 'readability/nolint', - 'readability/nul', - 'readability/streams', - 'readability/todo', - 'readability/utf8', - 'runtime/arrays', - 'runtime/casting', - 'runtime/explicit', - 'runtime/int', - 'runtime/init', - 'runtime/invalid_increment', - 'runtime/member_string_references', - 'runtime/memset', - 'runtime/operator', - 'runtime/printf', - 'runtime/printf_format', - 'runtime/references', - 'runtime/sizeof', - 'runtime/string', - 'runtime/threadsafe_fn', - 'runtime/vlog', - 'whitespace/blank_line', - 'whitespace/braces', - 'whitespace/comma', - 'whitespace/comments', - 'whitespace/empty_conditional_body', - 'whitespace/empty_loop_body', - 'whitespace/end_of_line', - 'whitespace/ending_newline', - 'whitespace/forcolon', - 'whitespace/indent', - 'whitespace/line_length', - 'whitespace/newline', - 'whitespace/operators', - 'whitespace/parens', - 'whitespace/semicolon', - 'whitespace/tab', - 'whitespace/todo' - ] - -# The default state of the category filter. This is overrided by the --filter= + 'build/class', + 'build/c++11', + 'build/c++14', + 'build/c++tr1', + 'build/deprecated', + 'build/endif_comment', + 'build/explicit_make_pair', + 'build/forward_decl', + 'build/header_guard', + 'build/include', + 'build/include_alpha', + 'build/include_order', + 'build/include_what_you_use', + 'build/namespaces', + 'build/printf_format', + 'build/storage_class', + 'legal/copyright', + 'readability/alt_tokens', + 'readability/braces', + 'readability/casting', + 'readability/check', + 'readability/constructors', + 'readability/fn_size', + 'readability/inheritance', + 'readability/multiline_comment', + 'readability/multiline_string', + 'readability/namespace', + 'readability/nolint', + 'readability/nul', + 'readability/strings', + 'readability/todo', + 'readability/utf8', + 'runtime/arrays', + 'runtime/casting', + 'runtime/explicit', + 'runtime/int', + 'runtime/init', + 'runtime/invalid_increment', + 'runtime/member_string_references', + 'runtime/memset', + 'runtime/indentation_namespace', + 'runtime/operator', + 'runtime/printf', + 'runtime/printf_format', + 'runtime/references', + 'runtime/string', + 'runtime/threadsafe_fn', + 'runtime/vlog', + 'whitespace/blank_line', + 'whitespace/braces', + 'whitespace/comma', + 'whitespace/comments', + 'whitespace/empty_conditional_body', + 'whitespace/empty_if_body', + 'whitespace/empty_loop_body', + 'whitespace/end_of_line', + 'whitespace/ending_newline', + 'whitespace/forcolon', + 'whitespace/indent', + 'whitespace/line_length', + 'whitespace/newline', + 'whitespace/operators', + 'whitespace/parens', + 'whitespace/semicolon', + 'whitespace/tab', + 'whitespace/todo', + ] + +# These error categories are no longer enforced by cpplint, but for backwards- +# compatibility they may still appear in NOLINT comments. +_LEGACY_ERROR_CATEGORIES = [ + 'readability/streams', + 'readability/function', + ] + +# The default state of the category filter. This is overridden by the --filter= # flag. By default all errors are on, so only add here categories that should be # off by default (i.e., categories that must be enabled by the --filter= flags). # All entries here should start with a '-' or '+', as in the --filter= flag. _DEFAULT_FILTERS = ['-build/include_alpha'] +# The default list of categories suppressed for C (not C++) files. +_DEFAULT_C_SUPPRESSED_CATEGORIES = [ + 'readability/casting', + ] + +# The default list of categories suppressed for Linux Kernel files. +_DEFAULT_KERNEL_SUPPRESSED_CATEGORIES = [ + 'whitespace/tab', + ] + # We used to check for high-bit characters, but after much discussion we # decided those were OK, as long as they were in UTF-8 and didn't represent # hard-coded international strings, which belong in a separate i18n file. - # C++ headers _CPP_HEADERS = frozenset([ # Legacy @@ -304,6 +386,7 @@ 'random', 'ratio', 'regex', + 'scoped_allocator', 'set', 'sstream', 'stack', @@ -351,15 +434,40 @@ 'cwctype', ]) +# Type names +_TYPES = re.compile( + r'^(?:' + # [dcl.type.simple] + r'(char(16_t|32_t)?)|wchar_t|' + r'bool|short|int|long|signed|unsigned|float|double|' + # [support.types] + r'(ptrdiff_t|size_t|max_align_t|nullptr_t)|' + # [cstdint.syn] + r'(u?int(_fast|_least)?(8|16|32|64)_t)|' + r'(u?int(max|ptr)_t)|' + r')$') + + +# These headers are excluded from [build/include] and [build/include_order] +# checks: +# - Anything not following google file name conventions (containing an +# uppercase character, such as Python.h or nsStringAPI.h, for example). +# - Lua headers. +_THIRD_PARTY_HEADERS_PATTERN = re.compile( + r'^(?:[^/]*[A-Z][^/]*\.h|lua\.h|lauxlib\.h|lualib\.h)$') + +# Pattern for matching FileInfo.BaseName() against test file name +_TEST_FILE_SUFFIX = r'(_test|_unittest|_regtest)$' + +# Pattern that matches only complete whitespace, possibly across multiple lines. +_EMPTY_CONDITIONAL_BODY_PATTERN = re.compile(r'^\s*$', re.DOTALL) + # Assertion macros. These are defined in base/logging.h and -# testing/base/gunit.h. Note that the _M versions need to come first -# for substring matching to work. +# testing/base/public/gunit.h. _CHECK_MACROS = [ 'DCHECK', 'CHECK', - 'EXPECT_TRUE_M', 'EXPECT_TRUE', - 'ASSERT_TRUE_M', 'ASSERT_TRUE', - 'EXPECT_FALSE_M', 'EXPECT_FALSE', - 'ASSERT_FALSE_M', 'ASSERT_FALSE', + 'EXPECT_TRUE', 'ASSERT_TRUE', + 'EXPECT_FALSE', 'ASSERT_FALSE', ] # Replacement macros for CHECK/DCHECK/EXPECT_TRUE/EXPECT_FALSE @@ -372,16 +480,12 @@ _CHECK_REPLACEMENT['CHECK'][op] = 'CHECK_%s' % replacement _CHECK_REPLACEMENT['EXPECT_TRUE'][op] = 'EXPECT_%s' % replacement _CHECK_REPLACEMENT['ASSERT_TRUE'][op] = 'ASSERT_%s' % replacement - _CHECK_REPLACEMENT['EXPECT_TRUE_M'][op] = 'EXPECT_%s_M' % replacement - _CHECK_REPLACEMENT['ASSERT_TRUE_M'][op] = 'ASSERT_%s_M' % replacement for op, inv_replacement in [('==', 'NE'), ('!=', 'EQ'), ('>=', 'LT'), ('>', 'LE'), ('<=', 'GT'), ('<', 'GE')]: _CHECK_REPLACEMENT['EXPECT_FALSE'][op] = 'EXPECT_%s' % inv_replacement _CHECK_REPLACEMENT['ASSERT_FALSE'][op] = 'ASSERT_%s' % inv_replacement - _CHECK_REPLACEMENT['EXPECT_FALSE_M'][op] = 'EXPECT_%s_M' % inv_replacement - _CHECK_REPLACEMENT['ASSERT_FALSE_M'][op] = 'ASSERT_%s_M' % inv_replacement # Alternative tokens and their replacements. For full list, see section 2.5 # Alternative tokens [lex.digraph] in the C++ standard. @@ -430,11 +534,14 @@ r'(?:\s+(volatile|__volatile__))?' r'\s*[{(]') +# Match strings that indicate we're working on a C (not C++) file. +_SEARCH_C_FILE = re.compile(r'\b(?:LINT_C_FILE|' + r'vim?:\s*.*(\s*|:)filetype=c(\s*|:|$))') -_regexp_compile_cache = {} +# Match string that indicates we're working on a Linux Kernel file. +_SEARCH_KERNEL_FILE = re.compile(r'\b(?:LINT_KERNEL_FILE)') -# Finds occurrences of NOLINT or NOLINT(...). -_RE_SUPPRESSION = re.compile(r'\bNOLINT\b(\([^)]*\))?') +_regexp_compile_cache = {} # {str, set(int)}: a map from error categories to sets of linenumbers # on which those errors are expected and should be suppressed. @@ -443,6 +550,7 @@ # The root directory used for deriving header guard CPP variable. # This is set by --root flag. _root = None +_root_debug = False # The allowed line length of files. # This is set by --linelength flag. @@ -452,8 +560,28 @@ # This is set by --extensions flag. _valid_extensions = set(['cc', 'h', 'cpp', 'cu', 'cuh']) +# Treat all headers starting with 'h' equally: .h, .hpp, .hxx etc. +# This is set by --headers flag. +_hpp_headers = set(['h']) + +# {str, bool}: a map from error categories to booleans which indicate if the +# category should be suppressed for every line. +_global_error_suppressions = {} + +def ProcessHppHeadersOption(val): + global _hpp_headers + try: + _hpp_headers = set(val.split(',')) + # Automatically append to extensions list so it does not have to be set 2 times + _valid_extensions.update(_hpp_headers) + except ValueError: + PrintUsage('Header extensions must be comma separated list.') + +def IsHeaderExtension(file_extension): + return file_extension in _hpp_headers + def ParseNolintSuppressions(filename, raw_line, linenum, error): - """Updates the global list of error-suppressions. + """Updates the global list of line error-suppressions. Parses any NOLINT comments on the current line, updating the global error_suppressions store. Reports an error if the NOLINT comment @@ -465,42 +593,67 @@ def ParseNolintSuppressions(filename, raw_line, linenum, error): linenum: int, the number of the current line. error: function, an error handler. """ - # FIXME(adonovan): "NOLINT(" is misparsed as NOLINT(*). - matched = _RE_SUPPRESSION.search(raw_line) + matched = Search(r'\bNOLINT(NEXTLINE)?\b(\([^)]+\))?', raw_line) if matched: - category = matched.group(1) + if matched.group(1): + suppressed_line = linenum + 1 + else: + suppressed_line = linenum + category = matched.group(2) if category in (None, '(*)'): # => "suppress all" - _error_suppressions.setdefault(None, set()).add(linenum) + _error_suppressions.setdefault(None, set()).add(suppressed_line) else: if category.startswith('(') and category.endswith(')'): category = category[1:-1] if category in _ERROR_CATEGORIES: - _error_suppressions.setdefault(category, set()).add(linenum) - else: + _error_suppressions.setdefault(category, set()).add(suppressed_line) + elif category not in _LEGACY_ERROR_CATEGORIES: error(filename, linenum, 'readability/nolint', 5, 'Unknown NOLINT error category: %s' % category) +def ProcessGlobalSuppresions(lines): + """Updates the list of global error suppressions. + + Parses any lint directives in the file that have global effect. + + Args: + lines: An array of strings, each representing a line of the file, with the + last element being empty if the file is terminated with a newline. + """ + for line in lines: + if _SEARCH_C_FILE.search(line): + for category in _DEFAULT_C_SUPPRESSED_CATEGORIES: + _global_error_suppressions[category] = True + if _SEARCH_KERNEL_FILE.search(line): + for category in _DEFAULT_KERNEL_SUPPRESSED_CATEGORIES: + _global_error_suppressions[category] = True + + def ResetNolintSuppressions(): - "Resets the set of NOLINT suppressions to empty." + """Resets the set of NOLINT suppressions to empty.""" _error_suppressions.clear() + _global_error_suppressions.clear() def IsErrorSuppressedByNolint(category, linenum): """Returns true if the specified error category is suppressed on this line. Consults the global error_suppressions map populated by - ParseNolintSuppressions/ResetNolintSuppressions. + ParseNolintSuppressions/ProcessGlobalSuppresions/ResetNolintSuppressions. Args: category: str, the category of the error. linenum: int, the current line number. Returns: - bool, True iff the error should be suppressed due to a NOLINT comment. + bool, True iff the error should be suppressed due to a NOLINT comment or + global suppression. """ - return (linenum in _error_suppressions.get(category, set()) or + return (_global_error_suppressions.get(category, False) or + linenum in _error_suppressions.get(category, set()) or linenum in _error_suppressions.get(None, set())) + def Match(pattern, s): """Matches the string with the pattern, caching the compiled regexp.""" # The regexp compilation caching is inlined in both Match and Search for @@ -536,11 +689,17 @@ def Search(pattern, s): return _regexp_compile_cache[pattern].search(s) -class _IncludeState(dict): +def _IsSourceExtension(s): + """File extension (excluding dot) matches a source file extension.""" + return s in ('c', 'cc', 'cpp', 'cxx') + + +class _IncludeState(object): """Tracks line numbers for includes, and the order in which includes appear. - As a dict, an _IncludeState object serves as a mapping between include - filename and line number on which that file was included. + include_list contains list of lists of (header, line number) pairs. + It's a lists of lists rather than just one flat list to make it + easier to update across preprocessor boundaries. Call CheckNextIncludeOrder() once for each header in the file, passing in the type constants defined above. Calls in an illegal order will @@ -571,15 +730,42 @@ class _IncludeState(dict): } def __init__(self): - dict.__init__(self) - self.ResetSection() + self.include_list = [[]] + self.ResetSection('') + + def FindHeader(self, header): + """Check if a header has already been included. + + Args: + header: header to check. + Returns: + Line number of previous occurrence, or -1 if the header has not + been seen before. + """ + for section_list in self.include_list: + for f in section_list: + if f[0] == header: + return f[1] + return -1 + + def ResetSection(self, directive): + """Reset section checking for preprocessor directive. - def ResetSection(self): + Args: + directive: preprocessor directive (e.g. "if", "else"). + """ # The name of the current section. self._section = self._INITIAL_SECTION # The path of last found header. self._last_header = '' + # Update list of includes. Note that we never pop from the + # include list. + if directive in ('if', 'ifdef', 'ifndef'): + self.include_list.append([]) + elif directive in ('else', 'elif'): + self.include_list[-1] = [] + def SetLastHeader(self, header_path): self._last_header = header_path @@ -615,7 +801,7 @@ def IsInAlphabeticalOrder(self, clean_lines, linenum, header_path): # If previous line was a blank line, assume that the headers are # intentionally sorted the way they are. if (self._last_header > header_path and - not Match(r'^\s*$', clean_lines.elided[linenum - 1])): + Match(r'^\s*#\s*include\b', clean_lines.elided[linenum - 1])): return False return True @@ -681,8 +867,11 @@ def __init__(self): self.error_count = 0 # global count of reported errors # filters to apply when emitting error messages self.filters = _DEFAULT_FILTERS[:] + # backup of filter list. Used to restore the state after each file. + self._filters_backup = self.filters[:] self.counting = 'total' # In what way are we counting errors? self.errors_by_category = {} # string to int dict storing error counts + self.quiet = False # Suppress non-error messagess? # output format: # "emacs" - format that emacs can parse (default) @@ -693,6 +882,12 @@ def SetOutputFormat(self, output_format): """Sets the output format for errors.""" self.output_format = output_format + def SetQuiet(self, quiet): + """Sets the module's quiet settings, and returns the previous setting.""" + last_quiet = self.quiet + self.quiet = quiet + return last_quiet + def SetVerboseLevel(self, level): """Sets the module's verbosity, and returns the previous setting.""" last_verbose_level = self.verbose_level @@ -719,6 +914,10 @@ def SetFilters(self, filters): """ # Default filters always have less priority than the flag ones. self.filters = _DEFAULT_FILTERS[:] + self.AddFilters(filters) + + def AddFilters(self, filters): + """ Adds more filters to the existing list of error-message filters. """ for filt in filters.split(','): clean_filt = filt.strip() if clean_filt: @@ -728,6 +927,14 @@ def SetFilters(self, filters): raise ValueError('Every filter in --filters must start with + or -' ' (%s does not)' % filt) + def BackupFilters(self): + """ Saves the current filter list to backup storage.""" + self._filters_backup = self.filters[:] + + def RestoreFilters(self): + """ Restores filters previously backed up.""" + self.filters = self._filters_backup[:] + def ResetErrorCounts(self): """Sets the module's error statistic back to zero.""" self.error_count = 0 @@ -748,7 +955,7 @@ def PrintErrorCounts(self): for category, count in self.errors_by_category.iteritems(): sys.stderr.write('Category \'%s\' errors found: %d\n' % (category, count)) - sys.stderr.write('Total errors found: %d\n' % self.error_count) + sys.stdout.write('Total errors found: %d\n' % self.error_count) _cpplint_state = _CppLintState() @@ -762,6 +969,14 @@ def _SetOutputFormat(output_format): """Sets the module's output format.""" _cpplint_state.SetOutputFormat(output_format) +def _Quiet(): + """Return's the module's quiet setting.""" + return _cpplint_state.quiet + +def _SetQuiet(quiet): + """Set the module's quiet status, and return previous setting.""" + return _cpplint_state.SetQuiet(quiet) + def _VerboseLevel(): """Returns the module's verbosity setting.""" @@ -795,6 +1010,25 @@ def _SetFilters(filters): """ _cpplint_state.SetFilters(filters) +def _AddFilters(filters): + """Adds more filter overrides. + + Unlike _SetFilters, this function does not reset the current list of filters + available. + + Args: + filters: A string of comma-separated filters (eg "whitespace/indent"). + Each filter should start with + or -; else we die. + """ + _cpplint_state.AddFilters(filters) + +def _BackupFilters(): + """ Saves the current filter list to backup storage.""" + _cpplint_state.BackupFilters() + +def _RestoreFilters(): + """ Restores filters previously backed up.""" + _cpplint_state.RestoreFilters() class _FunctionState(object): """Tracks current function name and the number of lines in its body.""" @@ -830,6 +1064,9 @@ def Check(self, error, filename, linenum): filename: The name of the current file. linenum: The number of the line to check. """ + if not self.in_a_function: + return + if Match(r'T(EST|est)', self.current_function): base_trigger = self._TEST_TRIGGER else: @@ -857,7 +1094,7 @@ class _IncludeError(Exception): pass -class FileInfo: +class FileInfo(object): """Provides utility functions for filenames. FileInfo provides easy access to the components of a file's path @@ -900,12 +1137,13 @@ def RepositoryName(self): # Not SVN <= 1.6? Try to find a git, hg, or svn top level directory by # searching up from the current path. - root_dir = os.path.dirname(fullname) - while (root_dir != os.path.dirname(root_dir) and - not os.path.exists(os.path.join(root_dir, ".git")) and - not os.path.exists(os.path.join(root_dir, ".hg")) and - not os.path.exists(os.path.join(root_dir, ".svn"))): - root_dir = os.path.dirname(root_dir) + root_dir = current_dir = os.path.dirname(fullname) + while current_dir != os.path.dirname(current_dir): + if (os.path.exists(os.path.join(current_dir, ".git")) or + os.path.exists(os.path.join(current_dir, ".hg")) or + os.path.exists(os.path.join(current_dir, ".svn"))): + root_dir = current_dir + current_dir = os.path.dirname(current_dir) if (os.path.exists(os.path.join(root_dir, ".git")) or os.path.exists(os.path.join(root_dir, ".hg")) or @@ -944,7 +1182,7 @@ def NoExtension(self): def IsSource(self): """File has a source file extension.""" - return self.Extension()[1:] in ('c', 'cc', 'cpp', 'cxx') + return _IsSourceExtension(self.Extension()[1:]) def _ShouldPrintError(category, confidence, linenum): @@ -955,6 +1193,7 @@ def _ShouldPrintError(category, confidence, linenum): # the verbosity level isn't high enough, or the filters filter it out. if IsErrorSuppressedByNolint(category, linenum): return False + if confidence < _cpplint_state.verbose_level: return False @@ -999,8 +1238,8 @@ def Error(filename, linenum, category, confidence, message): if _ShouldPrintError(category, confidence, linenum): _cpplint_state.IncrementErrorCount(category) if _cpplint_state.output_format == 'vs7': - sys.stderr.write('%s(%s): %s [%s] [%d]\n' % ( - filename, linenum, message, category, confidence)) + sys.stderr.write('%s(%s): error cpplint: [%s] %s [%d]\n' % ( + filename, linenum, category, message, confidence)) elif _cpplint_state.output_format == 'eclipse': sys.stderr.write('%s:%s: warning: %s [%s] [%d]\n' % ( filename, linenum, message, category, confidence)) @@ -1012,11 +1251,9 @@ def Error(filename, linenum, category, confidence, message): # Matches standard C++ escape sequences per 2.13.2.3 of the C++ standard. _RE_PATTERN_CLEANSE_LINE_ESCAPES = re.compile( r'\\([abfnrtv?"\\\']|\d+|x[0-9a-fA-F]+)') -# Matches strings. Escape codes should already be removed by ESCAPES. -_RE_PATTERN_CLEANSE_LINE_DOUBLE_QUOTES = re.compile(r'"[^"]*"') -# Matches characters. Escape codes should already be removed by ESCAPES. -_RE_PATTERN_CLEANSE_LINE_SINGLE_QUOTES = re.compile(r"'.'") -# Matches multi-line C++ comments. +# Match a single C style comment on the same line. +_RE_PATTERN_C_COMMENTS = r'/\*(?:[^*]|\*(?!/))*\*/' +# Matches multi-line C style comments. # This RE is a little bit more complicated than one might expect, because we # have to take care of space removals tools so we can handle comments inside # statements better. @@ -1025,10 +1262,10 @@ def Error(filename, linenum, category, confidence, message): # if this doesn't work we try on left side but only if there's a non-character # on the right. _RE_PATTERN_CLEANSE_LINE_C_COMMENTS = re.compile( - r"""(\s*/\*.*\*/\s*$| - /\*.*\*/\s+| - \s+/\*.*\*/(?=\W)| - /\*.*\*/)""", re.VERBOSE) + r'(\s*' + _RE_PATTERN_C_COMMENTS + r'\s*$|' + + _RE_PATTERN_C_COMMENTS + r'\s+|' + + r'\s+' + _RE_PATTERN_C_COMMENTS + r'(?=\W)|' + + _RE_PATTERN_C_COMMENTS + r')') def IsCppString(line): @@ -1083,13 +1320,26 @@ def CleanseRawStrings(raw_lines): delimiter = None else: # Haven't found the end yet, append a blank line. - line = '' + line = '""' - else: + # Look for beginning of a raw string, and replace them with + # empty strings. This is done in a loop to handle multiple raw + # strings on the same line. + while delimiter is None: # Look for beginning of a raw string. # See 2.14.15 [lex.string] for syntax. - matched = Match(r'^(.*)\b(?:R|u8R|uR|UR|LR)"([^\s\\()]*)\((.*)$', line) - if matched: + # + # Once we have matched a raw string, we check the prefix of the + # line to make sure that the line is not part of a single line + # comment. It's done this way because we remove raw strings + # before removing comments as opposed to removing comments + # before removing raw strings. This is because there are some + # cpplint checks that requires the comments to be preserved, but + # we don't want to check comments that are inside raw strings. + matched = Match(r'^(.*?)\b(?:R|u8R|uR|UR|LR)"([^\s\\()]*)\((.*)$', line) + if (matched and + not Match(r'^([^\'"]|\'(\\.|[^\'])*\'|"(\\.|[^"])*")*//', + matched.group(1))): delimiter = ')' + matched.group(2) + '"' end = matched.group(3).find(delimiter) @@ -1101,6 +1351,8 @@ def CleanseRawStrings(raw_lines): else: # Start of a multi-line raw string line = matched.group(1) + '""' + else: + break lines_without_raw_strings.append(line) @@ -1131,10 +1383,10 @@ def FindNextMultiLineCommentEnd(lines, lineix): def RemoveMultiLineCommentsFromRange(lines, begin, end): """Clears a range of lines for multi-line comments.""" - # Having // dummy comments makes the lines non-empty, so we will not get + # Having // comments makes the lines non-empty, so we will not get # unnecessary blank line warnings later in the code. for i in range(begin, end): - lines[i] = '// dummy' + lines[i] = '/**/' def RemoveMultiLineComments(filename, lines, error): @@ -1170,12 +1422,14 @@ def CleanseComments(line): class CleansedLines(object): - """Holds 3 copies of all lines with different preprocessing applied to them. + """Holds 4 copies of all lines with different preprocessing applied to them. - 1) elided member contains lines without strings and comments, - 2) lines member contains lines without comments, and + 1) elided member contains lines without strings and comments. + 2) lines member contains lines without comments. 3) raw_lines member contains all the lines without processing. - All these three members are of , and of the same length. + 4) lines_without_raw_strings member is same as raw_lines, but with C++11 raw + strings removed. + All these members are of , and of the same length. """ def __init__(self, lines): @@ -1206,38 +1460,138 @@ def _CollapseStrings(elided): Returns: The line with collapsed strings. """ - if not _RE_PATTERN_INCLUDE.match(elided): - # Remove escaped characters first to make quote/single quote collapsing - # basic. Things that look like escaped characters shouldn't occur - # outside of strings and chars. - elided = _RE_PATTERN_CLEANSE_LINE_ESCAPES.sub('', elided) - elided = _RE_PATTERN_CLEANSE_LINE_SINGLE_QUOTES.sub("''", elided) - elided = _RE_PATTERN_CLEANSE_LINE_DOUBLE_QUOTES.sub('""', elided) - return elided + if _RE_PATTERN_INCLUDE.match(elided): + return elided + + # Remove escaped characters first to make quote/single quote collapsing + # basic. Things that look like escaped characters shouldn't occur + # outside of strings and chars. + elided = _RE_PATTERN_CLEANSE_LINE_ESCAPES.sub('', elided) + + # Replace quoted strings and digit separators. Both single quotes + # and double quotes are processed in the same loop, otherwise + # nested quotes wouldn't work. + collapsed = '' + while True: + # Find the first quote character + match = Match(r'^([^\'"]*)([\'"])(.*)$', elided) + if not match: + collapsed += elided + break + head, quote, tail = match.groups() + + if quote == '"': + # Collapse double quoted strings + second_quote = tail.find('"') + if second_quote >= 0: + collapsed += head + '""' + elided = tail[second_quote + 1:] + else: + # Unmatched double quote, don't bother processing the rest + # of the line since this is probably a multiline string. + collapsed += elided + break + else: + # Found single quote, check nearby text to eliminate digit separators. + # + # There is no special handling for floating point here, because + # the integer/fractional/exponent parts would all be parsed + # correctly as long as there are digits on both sides of the + # separator. So we are fine as long as we don't see something + # like "0.'3" (gcc 4.9.0 will not allow this literal). + if Search(r'\b(?:0[bBxX]?|[1-9])[0-9a-fA-F]*$', head): + match_literal = Match(r'^((?:\'?[0-9a-zA-Z_])*)(.*)$', "'" + tail) + collapsed += head + match_literal.group(1).replace("'", '') + elided = match_literal.group(2) + else: + second_quote = tail.find('\'') + if second_quote >= 0: + collapsed += head + "''" + elided = tail[second_quote + 1:] + else: + # Unmatched single quote + collapsed += elided + break + return collapsed -def FindEndOfExpressionInLine(line, startpos, depth, startchar, endchar): - """Find the position just after the matching endchar. + +def FindEndOfExpressionInLine(line, startpos, stack): + """Find the position just after the end of current parenthesized expression. Args: line: a CleansedLines line. startpos: start searching at this position. - depth: nesting level at startpos. - startchar: expression opening character. - endchar: expression closing character. + stack: nesting stack at startpos. Returns: - On finding matching endchar: (index just after matching endchar, 0) - Otherwise: (-1, new depth at end of this line) + On finding matching end: (index just after matching end, None) + On finding an unclosed expression: (-1, None) + Otherwise: (-1, new stack at end of this line) """ for i in xrange(startpos, len(line)): - if line[i] == startchar: - depth += 1 - elif line[i] == endchar: - depth -= 1 - if depth == 0: - return (i + 1, 0) - return (-1, depth) + char = line[i] + if char in '([{': + # Found start of parenthesized expression, push to expression stack + stack.append(char) + elif char == '<': + # Found potential start of template argument list + if i > 0 and line[i - 1] == '<': + # Left shift operator + if stack and stack[-1] == '<': + stack.pop() + if not stack: + return (-1, None) + elif i > 0 and Search(r'\boperator\s*$', line[0:i]): + # operator<, don't add to stack + continue + else: + # Tentative start of template argument list + stack.append('<') + elif char in ')]}': + # Found end of parenthesized expression. + # + # If we are currently expecting a matching '>', the pending '<' + # must have been an operator. Remove them from expression stack. + while stack and stack[-1] == '<': + stack.pop() + if not stack: + return (-1, None) + if ((stack[-1] == '(' and char == ')') or + (stack[-1] == '[' and char == ']') or + (stack[-1] == '{' and char == '}')): + stack.pop() + if not stack: + return (i + 1, None) + else: + # Mismatched parentheses + return (-1, None) + elif char == '>': + # Found potential end of template argument list. + + # Ignore "->" and operator functions + if (i > 0 and + (line[i - 1] == '-' or Search(r'\boperator\s*$', line[0:i - 1]))): + continue + + # Pop the stack if there is a matching '<'. Otherwise, ignore + # this '>' since it must be an operator. + if stack: + if stack[-1] == '<': + stack.pop() + if not stack: + return (i + 1, None) + elif char == ';': + # Found something that look like end of statements. If we are currently + # expecting a '>', the matching '<' must have been an operator, since + # template argument list should not contain statements. + while stack and stack[-1] == '<': + stack.pop() + if not stack: + return (-1, None) + + # Did not find end of expression or unbalanced parentheses on this line + return (-1, stack) def CloseExpression(clean_lines, linenum, pos): @@ -1246,6 +1600,11 @@ def CloseExpression(clean_lines, linenum, pos): If lines[linenum][pos] points to a '(' or '{' or '[' or '<', finds the linenum/pos that correspond to the closing of the expression. + TODO(unknown): cpplint spends a fair bit of time matching parentheses. + Ideally we would want to index all opening and closing parentheses once + and have CloseExpression be just a simple lookup, but due to preprocessor + tricks, this is not so easy. + Args: clean_lines: A CleansedLines instance containing the file. linenum: The number of the line to check. @@ -1259,35 +1618,28 @@ def CloseExpression(clean_lines, linenum, pos): """ line = clean_lines.elided[linenum] - startchar = line[pos] - if startchar not in '({[<': + if (line[pos] not in '({[<') or Match(r'<[<=]', line[pos:]): return (line, clean_lines.NumLines(), -1) - if startchar == '(': endchar = ')' - if startchar == '[': endchar = ']' - if startchar == '{': endchar = '}' - if startchar == '<': endchar = '>' # Check first line - (end_pos, num_open) = FindEndOfExpressionInLine( - line, pos, 0, startchar, endchar) + (end_pos, stack) = FindEndOfExpressionInLine(line, pos, []) if end_pos > -1: return (line, linenum, end_pos) # Continue scanning forward - while linenum < clean_lines.NumLines() - 1: + while stack and linenum < clean_lines.NumLines() - 1: linenum += 1 line = clean_lines.elided[linenum] - (end_pos, num_open) = FindEndOfExpressionInLine( - line, 0, num_open, startchar, endchar) + (end_pos, stack) = FindEndOfExpressionInLine(line, 0, stack) if end_pos > -1: return (line, linenum, end_pos) - # Did not find endchar before end of file, give up + # Did not find end of expression before end of file, give up return (line, clean_lines.NumLines(), -1) -def FindStartOfExpressionInLine(line, endpos, depth, startchar, endchar): - """Find position at the matching startchar. +def FindStartOfExpressionInLine(line, endpos, stack): + """Find position at the matching start of current expression. This is almost the reverse of FindEndOfExpressionInLine, but note that the input position and returned position differs by 1. @@ -1295,22 +1647,72 @@ def FindStartOfExpressionInLine(line, endpos, depth, startchar, endchar): Args: line: a CleansedLines line. endpos: start searching at this position. - depth: nesting level at endpos. - startchar: expression opening character. - endchar: expression closing character. + stack: nesting stack at endpos. Returns: - On finding matching startchar: (index at matching startchar, 0) - Otherwise: (-1, new depth at beginning of this line) + On finding matching start: (index at matching start, None) + On finding an unclosed expression: (-1, None) + Otherwise: (-1, new stack at beginning of this line) """ - for i in xrange(endpos, -1, -1): - if line[i] == endchar: - depth += 1 - elif line[i] == startchar: - depth -= 1 - if depth == 0: - return (i, 0) - return (-1, depth) + i = endpos + while i >= 0: + char = line[i] + if char in ')]}': + # Found end of expression, push to expression stack + stack.append(char) + elif char == '>': + # Found potential end of template argument list. + # + # Ignore it if it's a "->" or ">=" or "operator>" + if (i > 0 and + (line[i - 1] == '-' or + Match(r'\s>=\s', line[i - 1:]) or + Search(r'\boperator\s*$', line[0:i]))): + i -= 1 + else: + stack.append('>') + elif char == '<': + # Found potential start of template argument list + if i > 0 and line[i - 1] == '<': + # Left shift operator + i -= 1 + else: + # If there is a matching '>', we can pop the expression stack. + # Otherwise, ignore this '<' since it must be an operator. + if stack and stack[-1] == '>': + stack.pop() + if not stack: + return (i, None) + elif char in '([{': + # Found start of expression. + # + # If there are any unmatched '>' on the stack, they must be + # operators. Remove those. + while stack and stack[-1] == '>': + stack.pop() + if not stack: + return (-1, None) + if ((char == '(' and stack[-1] == ')') or + (char == '[' and stack[-1] == ']') or + (char == '{' and stack[-1] == '}')): + stack.pop() + if not stack: + return (i, None) + else: + # Mismatched parentheses + return (-1, None) + elif char == ';': + # Found something that look like end of statements. If we are currently + # expecting a '<', the matching '>' must have been an operator, since + # template argument list should not contain statements. + while stack and stack[-1] == '>': + stack.pop() + if not stack: + return (-1, None) + + i -= 1 + + return (-1, stack) def ReverseCloseExpression(clean_lines, linenum, pos): @@ -1331,30 +1733,23 @@ def ReverseCloseExpression(clean_lines, linenum, pos): return is the 'cleansed' line at linenum. """ line = clean_lines.elided[linenum] - endchar = line[pos] - if endchar not in ')}]>': + if line[pos] not in ')}]>': return (line, 0, -1) - if endchar == ')': startchar = '(' - if endchar == ']': startchar = '[' - if endchar == '}': startchar = '{' - if endchar == '>': startchar = '<' # Check last line - (start_pos, num_open) = FindStartOfExpressionInLine( - line, pos, 0, startchar, endchar) + (start_pos, stack) = FindStartOfExpressionInLine(line, pos, []) if start_pos > -1: return (line, linenum, start_pos) # Continue scanning backward - while linenum > 0: + while stack and linenum > 0: linenum -= 1 line = clean_lines.elided[linenum] - (start_pos, num_open) = FindStartOfExpressionInLine( - line, len(line) - 1, num_open, startchar, endchar) + (start_pos, stack) = FindStartOfExpressionInLine(line, len(line) - 1, stack) if start_pos > -1: return (line, linenum, start_pos) - # Did not find startchar before beginning of file, give up + # Did not find start of expression before beginning of file, give up return (line, 0, -1) @@ -1362,7 +1757,7 @@ def CheckForCopyright(filename, lines, error): """Logs an error if no Copyright message appears at the top of the file.""" # We'll say it should occur by line 10. Don't forget there's a - # dummy line at the front. + # placeholder line at the front. for line in xrange(1, min(len(lines), 11)): if re.search(r'Copyright', lines[line], re.I): break else: # means no copyright line was found @@ -1371,6 +1766,46 @@ def CheckForCopyright(filename, lines, error): 'You should have a line: "Copyright [year] "') +def GetIndentLevel(line): + """Return the number of leading spaces in line. + + Args: + line: A string to check. + + Returns: + An integer count of leading spaces, possibly zero. + """ + indent = Match(r'^( *)\S', line) + if indent: + return len(indent.group(1)) + else: + return 0 + +def PathSplitToList(path): + """Returns the path split into a list by the separator. + + Args: + path: An absolute or relative path (e.g. '/a/b/c/' or '../a') + + Returns: + A list of path components (e.g. ['a', 'b', 'c]). + """ + lst = [] + while True: + (head, tail) = os.path.split(path) + if head == path: # absolute paths end + lst.append(head) + break + if tail == path: # relative paths end + lst.append(tail) + break + + path = head + lst.append(tail) + + lst.reverse() + return lst + def GetHeaderGuardCPPVariable(filename): """Returns the CPP variable that should be used as a header guard. @@ -1387,15 +1822,67 @@ def GetHeaderGuardCPPVariable(filename): # flymake. filename = re.sub(r'_flymake\.h$', '.h', filename) filename = re.sub(r'/\.flymake/([^/]*)$', r'/\1', filename) + # Replace 'c++' with 'cpp'. + filename = filename.replace('C++', 'cpp').replace('c++', 'cpp') fileinfo = FileInfo(filename) file_path_from_root = fileinfo.RepositoryName() - if _root: - file_path_from_root = re.sub('^' + _root + os.sep, '', file_path_from_root) - return re.sub(r'[-./\s]', '_', file_path_from_root).upper() + '_' + def FixupPathFromRoot(): + if _root_debug: + sys.stderr.write("\n_root fixup, _root = '%s', repository name = '%s'\n" + %(_root, fileinfo.RepositoryName())) + + # Process the file path with the --root flag if it was set. + if not _root: + if _root_debug: + sys.stderr.write("_root unspecified\n") + return file_path_from_root + + def StripListPrefix(lst, prefix): + # f(['x', 'y'], ['w, z']) -> None (not a valid prefix) + if lst[:len(prefix)] != prefix: + return None + # f(['a, 'b', 'c', 'd'], ['a', 'b']) -> ['c', 'd'] + return lst[(len(prefix)):] + + # root behavior: + # --root=subdir , lstrips subdir from the header guard + maybe_path = StripListPrefix(PathSplitToList(file_path_from_root), + PathSplitToList(_root)) + + if _root_debug: + sys.stderr.write(("_root lstrip (maybe_path=%s, file_path_from_root=%s," + + " _root=%s)\n") %(maybe_path, file_path_from_root, _root)) + + if maybe_path: + return os.path.join(*maybe_path) + + # --root=.. , will prepend the outer directory to the header guard + full_path = fileinfo.FullName() + root_abspath = os.path.abspath(_root) + + maybe_path = StripListPrefix(PathSplitToList(full_path), + PathSplitToList(root_abspath)) + + if _root_debug: + sys.stderr.write(("_root prepend (maybe_path=%s, full_path=%s, " + + "root_abspath=%s)\n") %(maybe_path, full_path, root_abspath)) + + if maybe_path: + return os.path.join(*maybe_path) -def CheckForHeaderGuard(filename, lines, error): + if _root_debug: + sys.stderr.write("_root ignore, returning %s\n" %(file_path_from_root)) + + # --root=FAKE_DIR is ignored + return file_path_from_root + + file_path_from_root = FixupPathFromRoot() + return re.sub(r'[^a-zA-Z0-9]', '_', file_path_from_root).upper() + '_' + + +def CheckForHeaderGuard(filename, clean_lines, error): """Checks that the file contains a header guard. Logs an error if no #ifndef header guard is present. For other @@ -1403,18 +1890,29 @@ def CheckForHeaderGuard(filename, lines, error): Args: filename: The name of the C++ header file. - lines: An array of strings, each representing a line of the file. + clean_lines: A CleansedLines instance containing the file. error: The function to call with any errors found. """ + # Don't check for header guards if there are error suppression + # comments somewhere in this file. + # + # Because this is silencing a warning for a nonexistent line, we + # only support the very specific NOLINT(build/header_guard) syntax, + # and not the general NOLINT or NOLINT(*) syntax. + raw_lines = clean_lines.lines_without_raw_strings + for i in raw_lines: + if Search(r'//\s*NOLINT\(build/header_guard\)', i): + return + cppvar = GetHeaderGuardCPPVariable(filename) - ifndef = None + ifndef = '' ifndef_linenum = 0 - define = None - endif = None + define = '' + endif = '' endif_linenum = 0 - for linenum, line in enumerate(lines): + for linenum, line in enumerate(raw_lines): linesplit = line.split() if len(linesplit) >= 2: # find the first occurrence of #ifndef and #define, save arg @@ -1429,18 +1927,12 @@ def CheckForHeaderGuard(filename, lines, error): endif = line endif_linenum = linenum - if not ifndef: + if not ifndef or not define or ifndef != define: error(filename, 0, 'build/header_guard', 5, 'No #ifndef header guard found, suggested CPP variable is: %s' % cppvar) return - if not define: - error(filename, 0, 'build/header_guard', 5, - 'No #define header guard found, suggested CPP variable is: %s' % - cppvar) - return - # The guard should be PATH_FILE_H_, but we also allow PATH_FILE_H__ # for backward compatibility. if ifndef != cppvar: @@ -1448,26 +1940,69 @@ def CheckForHeaderGuard(filename, lines, error): if ifndef != cppvar + '_': error_level = 5 - ParseNolintSuppressions(filename, lines[ifndef_linenum], ifndef_linenum, + ParseNolintSuppressions(filename, raw_lines[ifndef_linenum], ifndef_linenum, error) error(filename, ifndef_linenum, 'build/header_guard', error_level, '#ifndef header guard has wrong style, please use: %s' % cppvar) - if define != ifndef: - error(filename, 0, 'build/header_guard', 5, - '#ifndef and #define don\'t match, suggested CPP variable is: %s' % - cppvar) + # Check for "//" comments on endif line. + ParseNolintSuppressions(filename, raw_lines[endif_linenum], endif_linenum, + error) + match = Match(r'#endif\s*//\s*' + cppvar + r'(_)?\b', endif) + if match: + if match.group(1) == '_': + # Issue low severity warning for deprecated double trailing underscore + error(filename, endif_linenum, 'build/header_guard', 0, + '#endif line should be "#endif // %s"' % cppvar) return - if endif != ('#endif // %s' % cppvar): - error_level = 0 - if endif != ('#endif // %s' % (cppvar + '_')): - error_level = 5 + # Didn't find the corresponding "//" comment. If this file does not + # contain any "//" comments at all, it could be that the compiler + # only wants "/**/" comments, look for those instead. + no_single_line_comments = True + for i in xrange(1, len(raw_lines) - 1): + line = raw_lines[i] + if Match(r'^(?:(?:\'(?:\.|[^\'])*\')|(?:"(?:\.|[^"])*")|[^\'"])*//', line): + no_single_line_comments = False + break - ParseNolintSuppressions(filename, lines[endif_linenum], endif_linenum, - error) - error(filename, endif_linenum, 'build/header_guard', error_level, - '#endif line should be "#endif // %s"' % cppvar) + if no_single_line_comments: + match = Match(r'#endif\s*/\*\s*' + cppvar + r'(_)?\s*\*/', endif) + if match: + if match.group(1) == '_': + # Low severity warning for double trailing underscore + error(filename, endif_linenum, 'build/header_guard', 0, + '#endif line should be "#endif /* %s */"' % cppvar) + return + + # Didn't find anything + error(filename, endif_linenum, 'build/header_guard', 5, + '#endif line should be "#endif // %s"' % cppvar) + + +def CheckHeaderFileIncluded(filename, include_state, error): + """Logs an error if a .cc file does not include its header.""" + + # Do not check test files + fileinfo = FileInfo(filename) + if Search(_TEST_FILE_SUFFIX, fileinfo.BaseName()): + return + + headerfile = filename[0:len(filename) - len(fileinfo.Extension())] + '.h' + if not os.path.exists(headerfile): + return + headername = FileInfo(headerfile).RepositoryName() + first_include = 0 + for section_list in include_state.include_list: + for f in section_list: + if headername in f[0] or f[0] in headername: + return + if not first_include: + first_include = f[1] + + error(filename, first_include, 'build/include', 5, + '%s should include its header file %s' % (fileinfo.RepositoryName(), + headername)) def CheckForBadCharacters(filename, lines, error): @@ -1551,19 +2086,33 @@ def CheckForMultilineCommentsAndStrings(filename, clean_lines, linenum, error): 'Use C++11 raw strings or concatenation instead.') -threading_list = ( - ('asctime(', 'asctime_r('), - ('ctime(', 'ctime_r('), - ('getgrgid(', 'getgrgid_r('), - ('getgrnam(', 'getgrnam_r('), - ('getlogin(', 'getlogin_r('), - ('getpwnam(', 'getpwnam_r('), - ('getpwuid(', 'getpwuid_r('), - ('gmtime(', 'gmtime_r('), - ('localtime(', 'localtime_r('), - ('rand(', 'rand_r('), - ('strtok(', 'strtok_r('), - ('ttyname(', 'ttyname_r('), +# (non-threadsafe name, thread-safe alternative, validation pattern) +# +# The validation pattern is used to eliminate false positives such as: +# _rand(); // false positive due to substring match. +# ->rand(); // some member function rand(). +# ACMRandom rand(seed); // some variable named rand. +# ISAACRandom rand(); // another variable named rand. +# +# Basically we require the return value of these functions to be used +# in some expression context on the same line by matching on some +# operator before the function name. This eliminates constructors and +# member function calls. +_UNSAFE_FUNC_PREFIX = r'(?:[-+*/=%^&|(<]\s*|>\s+)' +_THREADING_LIST = ( + ('asctime(', 'asctime_r(', _UNSAFE_FUNC_PREFIX + r'asctime\([^)]+\)'), + ('ctime(', 'ctime_r(', _UNSAFE_FUNC_PREFIX + r'ctime\([^)]+\)'), + ('getgrgid(', 'getgrgid_r(', _UNSAFE_FUNC_PREFIX + r'getgrgid\([^)]+\)'), + ('getgrnam(', 'getgrnam_r(', _UNSAFE_FUNC_PREFIX + r'getgrnam\([^)]+\)'), + ('getlogin(', 'getlogin_r(', _UNSAFE_FUNC_PREFIX + r'getlogin\(\)'), + ('getpwnam(', 'getpwnam_r(', _UNSAFE_FUNC_PREFIX + r'getpwnam\([^)]+\)'), + ('getpwuid(', 'getpwuid_r(', _UNSAFE_FUNC_PREFIX + r'getpwuid\([^)]+\)'), + ('gmtime(', 'gmtime_r(', _UNSAFE_FUNC_PREFIX + r'gmtime\([^)]+\)'), + ('localtime(', 'localtime_r(', _UNSAFE_FUNC_PREFIX + r'localtime\([^)]+\)'), + ('rand(', 'rand_r(', _UNSAFE_FUNC_PREFIX + r'rand\(\)'), + ('strtok(', 'strtok_r(', + _UNSAFE_FUNC_PREFIX + r'strtok\([^)]+\)'), + ('ttyname(', 'ttyname_r(', _UNSAFE_FUNC_PREFIX + r'ttyname\([^)]+\)'), ) @@ -1583,14 +2132,13 @@ def CheckPosixThreading(filename, clean_lines, linenum, error): error: The function to call with any errors found. """ line = clean_lines.elided[linenum] - for single_thread_function, multithread_safe_function in threading_list: - ix = line.find(single_thread_function) - # Comparisons made explicit for clarity -- pylint: disable=g-explicit-bool-comparison - if ix >= 0 and (ix == 0 or (not line[ix - 1].isalnum() and - line[ix - 1] not in ('_', '.', '>'))): + for single_thread_func, multithread_safe_func, pattern in _THREADING_LIST: + # Additional pattern matching check to confirm that this is the + # function we are looking for + if Search(pattern, line): error(filename, linenum, 'runtime/threadsafe_fn', 2, - 'Consider using ' + multithread_safe_function + - '...) instead of ' + single_thread_function + + 'Consider using ' + multithread_safe_func + + '...) instead of ' + single_thread_func + '...) for improved thread safety.') @@ -1612,7 +2160,6 @@ def CheckVlogArguments(filename, clean_lines, linenum, error): 'VLOG() should be used with numeric verbosity level. ' 'Use LOG() if you want symbolic severity levels.') - # Matches invalid increment: *count++, which moves pointer instead of # incrementing a value. _RE_PATTERN_INVALID_INCREMENT = re.compile( @@ -1641,13 +2188,29 @@ def CheckInvalidIncrement(filename, clean_lines, linenum, error): 'Changing pointer instead of value (or unused value of operator*).') +def IsMacroDefinition(clean_lines, linenum): + if Search(r'^#define', clean_lines[linenum]): + return True + + if linenum > 0 and Search(r'\\$', clean_lines[linenum - 1]): + return True + + return False + + +def IsForwardClassDeclaration(clean_lines, linenum): + return Match(r'^\s*(\btemplate\b)*.*class\s+\w+;\s*$', clean_lines[linenum]) + + class _BlockInfo(object): """Stores information about a generic block of code.""" - def __init__(self, seen_open_brace): + def __init__(self, linenum, seen_open_brace): + self.starting_linenum = linenum self.seen_open_brace = seen_open_brace self.open_parentheses = 0 self.inline_asm = _NO_ASM + self.check_namespace_indentation = False def CheckBegin(self, filename, clean_lines, linenum, error): """Run checks that applies to text up to the opening brace. @@ -1677,15 +2240,33 @@ def CheckEnd(self, filename, clean_lines, linenum, error): """ pass + def IsBlockInfo(self): + """Returns true if this block is a _BlockInfo. + + This is convenient for verifying that an object is an instance of + a _BlockInfo, but not an instance of any of the derived classes. + + Returns: + True for this class, False for derived classes. + """ + return self.__class__ == _BlockInfo + + +class _ExternCInfo(_BlockInfo): + """Stores information about an 'extern "C"' block.""" + + def __init__(self, linenum): + _BlockInfo.__init__(self, linenum, True) + class _ClassInfo(_BlockInfo): """Stores information about a class.""" def __init__(self, name, class_or_struct, clean_lines, linenum): - _BlockInfo.__init__(self, False) + _BlockInfo.__init__(self, linenum, False) self.name = name - self.starting_linenum = linenum self.is_derived = False + self.check_namespace_indentation = True if class_or_struct == 'struct': self.access = 'public' self.is_struct = True @@ -1695,11 +2276,7 @@ def __init__(self, name, class_or_struct, clean_lines, linenum): # Remember initial indentation level for this class. Using raw_lines here # instead of elided to account for leading comments. - initial_indent = Match(r'^( *)\S', clean_lines.raw_lines[linenum]) - if initial_indent: - self.class_indent = len(initial_indent.group(1)) - else: - self.class_indent = 0 + self.class_indent = GetIndentLevel(clean_lines.raw_lines[linenum]) # Try to find the end of the class. This will be confused by things like: # class A { @@ -1721,6 +2298,23 @@ def CheckBegin(self, filename, clean_lines, linenum, error): self.is_derived = True def CheckEnd(self, filename, clean_lines, linenum, error): + # If there is a DISALLOW macro, it should appear near the end of + # the class. + seen_last_thing_in_class = False + for i in xrange(linenum - 1, self.starting_linenum, -1): + match = Search( + r'\b(DISALLOW_COPY_AND_ASSIGN|DISALLOW_IMPLICIT_CONSTRUCTORS)\(' + + self.name + r'\)', + clean_lines.elided[i]) + if match: + if seen_last_thing_in_class: + error(filename, i, 'readability/constructors', 3, + match.group(1) + ' should be the last thing in the class') + break + + if not Match(r'^\s*$', clean_lines.elided[i]): + seen_last_thing_in_class = True + # Check that closing brace is aligned with beginning of the class. # Only do this if the closing brace is indented by only whitespaces. # This means we will not check single-line class definitions. @@ -1738,9 +2332,9 @@ class _NamespaceInfo(_BlockInfo): """Stores information about a namespace.""" def __init__(self, name, linenum): - _BlockInfo.__init__(self, False) + _BlockInfo.__init__(self, linenum, False) self.name = name or '' - self.starting_linenum = linenum + self.check_namespace_indentation = True def CheckEnd(self, filename, clean_lines, linenum, error): """Check end of namespace comments.""" @@ -1758,7 +2352,7 @@ def CheckEnd(self, filename, clean_lines, linenum, error): # deciding what these nontrivial things are, so this check is # triggered by namespace size only, which works most of the time. if (linenum - self.starting_linenum < 10 - and not Match(r'};*\s*(//|/\*).*\bnamespace\b', line)): + and not Match(r'^\s*};*\s*(//|/\*).*\bnamespace\b', line)): return # Look for matching comment at end of namespace. @@ -1775,17 +2369,24 @@ def CheckEnd(self, filename, clean_lines, linenum, error): # expected namespace. if self.name: # Named namespace - if not Match((r'};*\s*(//|/\*).*\bnamespace\s+' + re.escape(self.name) + - r'[\*/\.\\\s]*$'), + if not Match((r'^\s*};*\s*(//|/\*).*\bnamespace\s+' + + re.escape(self.name) + r'[\*/\.\\\s]*$'), line): error(filename, linenum, 'readability/namespace', 5, 'Namespace should be terminated with "// namespace %s"' % self.name) else: # Anonymous namespace - if not Match(r'};*\s*(//|/\*).*\bnamespace[\*/\.\\\s]*$', line): - error(filename, linenum, 'readability/namespace', 5, - 'Namespace should be terminated with "// namespace"') + if not Match(r'^\s*};*\s*(//|/\*).*\bnamespace[\*/\.\\\s]*$', line): + # If "// namespace anonymous" or "// anonymous namespace (more text)", + # mention "// anonymous namespace" as an acceptable form + if Match(r'^\s*}.*\b(namespace anonymous|anonymous namespace)\b', line): + error(filename, linenum, 'readability/namespace', 5, + 'Anonymous namespace should be terminated with "// namespace"' + ' or "// anonymous namespace"') + else: + error(filename, linenum, 'readability/namespace', 5, + 'Anonymous namespace should be terminated with "// namespace"') class _PreprocessorInfo(object): @@ -1802,7 +2403,7 @@ def __init__(self, stack_before_if): self.seen_else = False -class _NestingState(object): +class NestingState(object): """Holds states related to parsing braces.""" def __init__(self): @@ -1814,6 +2415,17 @@ def __init__(self): # - _BlockInfo: some other type of block. self.stack = [] + # Top of the previous stack before each Update(). + # + # Because the nesting_stack is updated at the end of each line, we + # had to do some convoluted checks to find out what is the current + # scope at the beginning of the line. This check is simplified by + # saving the previous top of nesting stack. + # + # We could save the full stack, but we only need the top. Copying + # the full nesting stack would slow down cpplint by ~10%. + self.previous_stack_top = [] + # Stack of _PreprocessorInfo objects. self.pp_stack = [] @@ -1834,6 +2446,82 @@ def InNamespaceBody(self): """ return self.stack and isinstance(self.stack[-1], _NamespaceInfo) + def InExternC(self): + """Check if we are currently one level inside an 'extern "C"' block. + + Returns: + True if top of the stack is an extern block, False otherwise. + """ + return self.stack and isinstance(self.stack[-1], _ExternCInfo) + + def InClassDeclaration(self): + """Check if we are currently one level inside a class or struct declaration. + + Returns: + True if top of the stack is a class/struct, False otherwise. + """ + return self.stack and isinstance(self.stack[-1], _ClassInfo) + + def InAsmBlock(self): + """Check if we are currently one level inside an inline ASM block. + + Returns: + True if the top of the stack is a block containing inline ASM. + """ + return self.stack and self.stack[-1].inline_asm != _NO_ASM + + def InTemplateArgumentList(self, clean_lines, linenum, pos): + """Check if current position is inside template argument list. + + Args: + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + pos: position just after the suspected template argument. + Returns: + True if (linenum, pos) is inside template arguments. + """ + while linenum < clean_lines.NumLines(): + # Find the earliest character that might indicate a template argument + line = clean_lines.elided[linenum] + match = Match(r'^[^{};=\[\]\.<>]*(.)', line[pos:]) + if not match: + linenum += 1 + pos = 0 + continue + token = match.group(1) + pos += len(match.group(0)) + + # These things do not look like template argument list: + # class Suspect { + # class Suspect x; } + if token in ('{', '}', ';'): return False + + # These things look like template argument list: + # template + # template + # template + # template + if token in ('>', '=', '[', ']', '.'): return True + + # Check if token is an unmatched '<'. + # If not, move on to the next character. + if token != '<': + pos += 1 + if pos >= len(line): + linenum += 1 + pos = 0 + continue + + # We can't be sure if we just find a single '<', and need to + # find the matching '>'. + (_, end_line, end_pos) = CloseExpression(clean_lines, linenum, pos - 1) + if end_pos < 0: + # Not sure if template argument list or syntax error in file + return False + linenum = end_line + pos = end_pos + return False + def UpdatePreprocessor(self, line): """Update preprocessor stack. @@ -1890,6 +2578,7 @@ def UpdatePreprocessor(self, line): # TODO(unknown): unexpected #endif, issue warning? pass + # TODO(unknown): Update() is too long, but we will refactor later. def Update(self, filename, clean_lines, linenum, error): """Update nesting state with current line. @@ -1901,7 +2590,17 @@ def Update(self, filename, clean_lines, linenum, error): """ line = clean_lines.elided[linenum] - # Update pp_stack first + # Remember top of the previous nesting stack. + # + # The stack is always pushed/popped and not modified in place, so + # we can just do a shallow copy instead of copy.deepcopy. Using + # deepcopy would slow down cpplint by ~28%. + if self.stack: + self.previous_stack_top = self.stack[-1] + else: + self.previous_stack_top = None + + # Update pp_stack self.UpdatePreprocessor(line) # Count parentheses. This is to avoid adding struct arguments to @@ -1952,32 +2651,27 @@ def Update(self, filename, clean_lines, linenum, error): # such as in: # class LOCKABLE API Object { # }; - # - # Templates with class arguments may confuse the parser, for example: - # template , - # class Vector = vector > - # class HeapQueue { - # - # Because this parser has no nesting state about templates, by the - # time it saw "class Comparator", it may think that it's a new class. - # Nested templates have a similar problem: - # template < - # typename ExportedType, - # typename TupleType, - # template class ImplTemplate> - # - # To avoid these cases, we ignore classes that are followed by '=' or '>' class_decl_match = Match( - r'\s*(template\s*<[\w\s<>,:]*>\s*)?' - r'(class|struct)\s+([A-Z_]+\s+)*(\w+(?:::\w+)*)' - r'(([^=>]|<[^<>]*>|<[^<>]*<[^<>]*>\s*>)*)$', line) + r'^(\s*(?:template\s*<[\w\s<>,:]*>\s*)?' + r'(class|struct)\s+(?:[A-Z_]+\s+)*(\w+(?:::\w+)*))' + r'(.*)$', line) if (class_decl_match and (not self.stack or self.stack[-1].open_parentheses == 0)): - self.stack.append(_ClassInfo( - class_decl_match.group(4), class_decl_match.group(2), - clean_lines, linenum)) - line = class_decl_match.group(5) + # We do not want to accept classes that are actually template arguments: + # template , + # template class Ignore3> + # void Function() {}; + # + # To avoid template argument cases, we scan forward and look for + # an unmatched '>'. If we see one, assume we are inside a + # template argument list. + end_declaration = len(class_decl_match.group(1)) + if not self.InTemplateArgumentList(clean_lines, linenum, end_declaration): + self.stack.append(_ClassInfo( + class_decl_match.group(3), class_decl_match.group(2), + clean_lines, linenum)) + line = class_decl_match.group(4) # If we have not yet seen the opening brace for the innermost block, # run checks here. @@ -2024,10 +2718,13 @@ def Update(self, filename, clean_lines, linenum, error): # stack otherwise. if not self.SeenOpenBrace(): self.stack[-1].seen_open_brace = True + elif Match(r'^extern\s*"[^"]*"\s*\{', line): + self.stack.append(_ExternCInfo(linenum)) else: - self.stack.append(_BlockInfo(True)) + self.stack.append(_BlockInfo(linenum, True)) if _MATCH_ASM.match(line): self.stack[-1].inline_asm = _BLOCK_ASM + elif token == ';' or token == ')': # If we haven't seen an opening brace yet, but we already saw # a semicolon, this is probably a forward declaration. Pop @@ -2103,7 +2800,7 @@ def CheckForNonStandardConstructs(filename, clean_lines, linenum, filename: The name of the current file. clean_lines: A CleansedLines instance containing the file. linenum: The number of the line to check. - nesting_state: A _NestingState instance which maintains information about + nesting_state: A NestingState instance which maintains information about the current stack of nested blocks being parsed. error: A callable to which errors are reported, which takes 4 arguments: filename, line number, error level, and message @@ -2136,7 +2833,8 @@ def CheckForNonStandardConstructs(filename, clean_lines, linenum, r'\s+(register|static|extern|typedef)\b', line): error(filename, linenum, 'build/storage_class', 5, - 'Storage class (static, extern, typedef, etc) should be first.') + 'Storage-class specifier (static, extern, typedef, etc) should be ' + 'at the beginning of the declaration.') if Match(r'\s*#\s*endif\s*[^/\s]+', line): error(filename, linenum, 'build/endif_comment', 5, @@ -2176,26 +2874,79 @@ def CheckForNonStandardConstructs(filename, clean_lines, linenum, # Look for single-argument constructors that aren't marked explicit. # Technically a valid construct, but against style. - args = Match(r'\s+(?:inline\s+)?%s\s*\(([^,()]+)\)' - % re.escape(base_classname), - line) - if (args and - args.group(1) != 'void' and - not Match(r'(const\s+)?%s(\s+const)?\s*(?:<\w+>\s*)?&' - % re.escape(base_classname), args.group(1).strip())): - error(filename, linenum, 'runtime/explicit', 5, - 'Single-argument constructors should be marked explicit.') - - -def CheckSpacingForFunctionCall(filename, line, linenum, error): + explicit_constructor_match = Match( + r'\s+(?:(?:inline|constexpr)\s+)*(explicit\s+)?' + r'(?:(?:inline|constexpr)\s+)*%s\s*' + r'\(((?:[^()]|\([^()]*\))*)\)' + % re.escape(base_classname), + line) + + if explicit_constructor_match: + is_marked_explicit = explicit_constructor_match.group(1) + + if not explicit_constructor_match.group(2): + constructor_args = [] + else: + constructor_args = explicit_constructor_match.group(2).split(',') + + # collapse arguments so that commas in template parameter lists and function + # argument parameter lists don't split arguments in two + i = 0 + while i < len(constructor_args): + constructor_arg = constructor_args[i] + while (constructor_arg.count('<') > constructor_arg.count('>') or + constructor_arg.count('(') > constructor_arg.count(')')): + constructor_arg += ',' + constructor_args[i + 1] + del constructor_args[i + 1] + constructor_args[i] = constructor_arg + i += 1 + + defaulted_args = [arg for arg in constructor_args if '=' in arg] + noarg_constructor = (not constructor_args or # empty arg list + # 'void' arg specifier + (len(constructor_args) == 1 and + constructor_args[0].strip() == 'void')) + onearg_constructor = ((len(constructor_args) == 1 and # exactly one arg + not noarg_constructor) or + # all but at most one arg defaulted + (len(constructor_args) >= 1 and + not noarg_constructor and + len(defaulted_args) >= len(constructor_args) - 1)) + initializer_list_constructor = bool( + onearg_constructor and + Search(r'\bstd\s*::\s*initializer_list\b', constructor_args[0])) + copy_constructor = bool( + onearg_constructor and + Match(r'(const\s+)?%s(\s*<[^>]*>)?(\s+const)?\s*(?:<\w+>\s*)?&' + % re.escape(base_classname), constructor_args[0].strip())) + + if (not is_marked_explicit and + onearg_constructor and + not initializer_list_constructor and + not copy_constructor): + if defaulted_args: + error(filename, linenum, 'runtime/explicit', 5, + 'Constructors callable with one argument ' + 'should be marked explicit.') + else: + error(filename, linenum, 'runtime/explicit', 5, + 'Single-parameter constructors should be marked explicit.') + elif is_marked_explicit and not onearg_constructor: + if noarg_constructor: + error(filename, linenum, 'runtime/explicit', 5, + 'Zero-parameter constructors should not be marked explicit.') + + +def CheckSpacingForFunctionCall(filename, clean_lines, linenum, error): """Checks for the correctness of various spacing around function calls. Args: filename: The name of the current file. - line: The text of the line to check. + clean_lines: A CleansedLines instance containing the file. linenum: The number of the line to check. error: The function to call with any errors found. """ + line = clean_lines.elided[linenum] # Since function calls often occur inside if/for/while/switch # expressions - which have their own, more liberal conventions - we @@ -2238,10 +2989,18 @@ def CheckSpacingForFunctionCall(filename, line, linenum, error): error(filename, linenum, 'whitespace/parens', 2, 'Extra space after (') if (Search(r'\w\s+\(', fncall) and - not Search(r'#\s*define|typedef', fncall) and - not Search(r'\w\s+\((\w+::)*\*\w+\)\(', fncall)): - error(filename, linenum, 'whitespace/parens', 4, - 'Extra space before ( in function call') + not Search(r'_{0,2}asm_{0,2}\s+_{0,2}volatile_{0,2}\s+\(', fncall) and + not Search(r'#\s*define|typedef|using\s+\w+\s*=', fncall) and + not Search(r'\w\s+\((\w+::)*\*\w+\)\(', fncall) and + not Search(r'\bcase\s+\(', fncall)): + # TODO(unknown): Space after an operator function seem to be a common + # error, silence those for now by restricting them to highest verbosity. + if Search(r'\boperator_*\b', line): + error(filename, linenum, 'whitespace/parens', 0, + 'Extra space before ( in function call') + else: + error(filename, linenum, 'whitespace/parens', 4, + 'Extra space before ( in function call') # If the ) is followed only by a newline or a { + newline, assume it's # part of a control statement (if/while/etc), and don't complain if Search(r'[^)]\s+\)\s*[^{\s]', fncall): @@ -2270,12 +3029,26 @@ def IsBlankLine(line): return not line or line.isspace() +def CheckForNamespaceIndentation(filename, nesting_state, clean_lines, line, + error): + is_namespace_indent_item = ( + len(nesting_state.stack) > 1 and + nesting_state.stack[-1].check_namespace_indentation and + isinstance(nesting_state.previous_stack_top, _NamespaceInfo) and + nesting_state.previous_stack_top == nesting_state.stack[-2]) + + if ShouldCheckNamespaceIndentation(nesting_state, is_namespace_indent_item, + clean_lines.elided, line): + CheckItemIndentationInNamespace(filename, clean_lines.elided, + line, error) + + def CheckForFunctionLengths(filename, clean_lines, linenum, function_state, error): """Reports for long function bodies. For an overview why this is done, see: - http://google-styleguide.googlecode.com/svn/trunk/cppguide.xml#Write_Short_Functions + https://google-styleguide.googlecode.com/svn/trunk/cppguide.xml#Write_Short_Functions Uses a simplistic algorithm assuming other style guidelines (especially spacing) are followed. @@ -2295,8 +3068,6 @@ def CheckForFunctionLengths(filename, clean_lines, linenum, """ lines = clean_lines.lines line = lines[linenum] - raw = clean_lines.raw_lines - raw_line = raw[linenum] joined_line = '' starting_func = False @@ -2343,190 +3114,58 @@ def CheckForFunctionLengths(filename, clean_lines, linenum, _RE_PATTERN_TODO = re.compile(r'^//(\s*)TODO(\(.+?\))?:?(\s|$)?') -def CheckComment(comment, filename, linenum, error): - """Checks for common mistakes in TODO comments. +def CheckComment(line, filename, linenum, next_line_start, error): + """Checks for common mistakes in comments. Args: - comment: The text of the comment from the line in question. + line: The line in question. filename: The name of the current file. linenum: The number of the line to check. + next_line_start: The first non-whitespace column of the next line. error: The function to call with any errors found. """ - match = _RE_PATTERN_TODO.match(comment) - if match: - # One whitespace is correct; zero whitespace is handled elsewhere. - leading_whitespace = match.group(1) - if len(leading_whitespace) > 1: - error(filename, linenum, 'whitespace/todo', 2, - 'Too many spaces before TODO') - - username = match.group(2) - if not username: - error(filename, linenum, 'readability/todo', 2, - 'Missing username in TODO; it should look like ' - '"// TODO(my_username): Stuff."') - - middle_whitespace = match.group(3) - # Comparisons made explicit for correctness -- pylint: disable=g-explicit-bool-comparison - if middle_whitespace != ' ' and middle_whitespace != '': - error(filename, linenum, 'whitespace/todo', 2, - 'TODO(my_username) should be followed by a space') - -def CheckAccess(filename, clean_lines, linenum, nesting_state, error): - """Checks for improper use of DISALLOW* macros. + commentpos = line.find('//') + if commentpos != -1: + # Check if the // may be in quotes. If so, ignore it + if re.sub(r'\\.', '', line[0:commentpos]).count('"') % 2 == 0: + # Allow one space for new scopes, two spaces otherwise: + if (not (Match(r'^.*{ *//', line) and next_line_start == commentpos) and + ((commentpos >= 1 and + line[commentpos-1] not in string.whitespace) or + (commentpos >= 2 and + line[commentpos-2] not in string.whitespace))): + error(filename, linenum, 'whitespace/comments', 2, + 'At least two spaces is best between code and comments') - Args: - filename: The name of the current file. - clean_lines: A CleansedLines instance containing the file. - linenum: The number of the line to check. - nesting_state: A _NestingState instance which maintains information about - the current stack of nested blocks being parsed. - error: The function to call with any errors found. - """ - line = clean_lines.elided[linenum] # get rid of comments and strings - - matched = Match((r'\s*(DISALLOW_COPY_AND_ASSIGN|' - r'DISALLOW_EVIL_CONSTRUCTORS|' - r'DISALLOW_IMPLICIT_CONSTRUCTORS)'), line) - if not matched: - return - if nesting_state.stack and isinstance(nesting_state.stack[-1], _ClassInfo): - if nesting_state.stack[-1].access != 'private': - error(filename, linenum, 'readability/constructors', 3, - '%s must be in the private: section' % matched.group(1)) - - else: - # Found DISALLOW* macro outside a class declaration, or perhaps it - # was used inside a function when it should have been part of the - # class declaration. We could issue a warning here, but it - # probably resulted in a compiler error already. - pass - - -def FindNextMatchingAngleBracket(clean_lines, linenum, init_suffix): - """Find the corresponding > to close a template. - - Args: - clean_lines: A CleansedLines instance containing the file. - linenum: Current line number. - init_suffix: Remainder of the current line after the initial <. - - Returns: - True if a matching bracket exists. - """ - line = init_suffix - nesting_stack = ['<'] - while True: - # Find the next operator that can tell us whether < is used as an - # opening bracket or as a less-than operator. We only want to - # warn on the latter case. - # - # We could also check all other operators and terminate the search - # early, e.g. if we got something like this "a(),;\[\]]*([<>(),;\[\]])(.*)$', line) - if match: - # Found an operator, update nesting stack - operator = match.group(1) - line = match.group(2) - - if nesting_stack[-1] == '<': - # Expecting closing angle bracket - if operator in ('<', '(', '['): - nesting_stack.append(operator) - elif operator == '>': - nesting_stack.pop() - if not nesting_stack: - # Found matching angle bracket - return True - elif operator == ',': - # Got a comma after a bracket, this is most likely a template - # argument. We have not seen a closing angle bracket yet, but - # it's probably a few lines later if we look for it, so just - # return early here. - return True - else: - # Got some other operator. - return False - - else: - # Expecting closing parenthesis or closing bracket - if operator in ('<', '(', '['): - nesting_stack.append(operator) - elif operator in (')', ']'): - # We don't bother checking for matching () or []. If we got - # something like (] or [), it would have been a syntax error. - nesting_stack.pop() - - else: - # Scan the next line - linenum += 1 - if linenum >= len(clean_lines.elided): - break - line = clean_lines.elided[linenum] - - # Exhausted all remaining lines and still no matching angle bracket. - # Most likely the input was incomplete, otherwise we should have - # seen a semicolon and returned early. - return True - - -def FindPreviousMatchingAngleBracket(clean_lines, linenum, init_prefix): - """Find the corresponding < that started a template. - - Args: - clean_lines: A CleansedLines instance containing the file. - linenum: Current line number. - init_prefix: Part of the current line before the initial >. - - Returns: - True if a matching bracket exists. - """ - line = init_prefix - nesting_stack = ['>'] - while True: - # Find the previous operator - match = Search(r'^(.*)([<>(),;\[\]])[^<>(),;\[\]]*$', line) - if match: - # Found an operator, update nesting stack - operator = match.group(2) - line = match.group(1) - - if nesting_stack[-1] == '>': - # Expecting opening angle bracket - if operator in ('>', ')', ']'): - nesting_stack.append(operator) - elif operator == '<': - nesting_stack.pop() - if not nesting_stack: - # Found matching angle bracket - return True - elif operator == ',': - # Got a comma before a bracket, this is most likely a - # template argument. The opening angle bracket is probably - # there if we look for it, so just return early here. - return True - else: - # Got some other operator. - return False - - else: - # Expecting opening parenthesis or opening bracket - if operator in ('>', ')', ']'): - nesting_stack.append(operator) - elif operator in ('(', '['): - nesting_stack.pop() - - else: - # Scan the previous line - linenum -= 1 - if linenum < 0: - break - line = clean_lines.elided[linenum] - - # Exhausted all earlier lines and still no matching angle bracket. - return False + # Checks for common mistakes in TODO comments. + comment = line[commentpos:] + match = _RE_PATTERN_TODO.match(comment) + if match: + # One whitespace is correct; zero whitespace is handled elsewhere. + leading_whitespace = match.group(1) + if len(leading_whitespace) > 1: + error(filename, linenum, 'whitespace/todo', 2, + 'Too many spaces before TODO') + + username = match.group(2) + if not username: + error(filename, linenum, 'readability/todo', 2, + 'Missing username in TODO; it should look like ' + '"// TODO(my_username): Stuff."') + + middle_whitespace = match.group(3) + # Comparisons made explicit for correctness -- pylint: disable=g-explicit-bool-comparison + if middle_whitespace != ' ' and middle_whitespace != '': + error(filename, linenum, 'whitespace/todo', 2, + 'TODO(my_username) should be followed by a space') + + # If the comment contains an alphanumeric character, there + # should be a space somewhere between it and the // unless + # it's a /// or //! Doxygen comment. + if (Match(r'//[^ ]*\w', comment) and + not Match(r'(///|//\!)(\s+|$)', comment)): + error(filename, linenum, 'whitespace/comments', 4, + 'Should have a space between // and comment') def CheckSpacing(filename, clean_lines, linenum, nesting_state, error): @@ -2542,7 +3181,7 @@ def CheckSpacing(filename, clean_lines, linenum, nesting_state, error): filename: The name of the current file. clean_lines: A CleansedLines instance containing the file. linenum: The number of the line to check. - nesting_state: A _NestingState instance which maintains information about + nesting_state: A NestingState instance which maintains information about the current stack of nested blocks being parsed. error: The function to call with any errors found. """ @@ -2565,7 +3204,12 @@ def CheckSpacing(filename, clean_lines, linenum, nesting_state, error): # } # # A warning about missing end of namespace comments will be issued instead. - if IsBlankLine(line) and not nesting_state.InNamespaceBody(): + # + # Also skip blank line checks for 'extern "C"' blocks, which are formatted + # like namespaces. + if (IsBlankLine(line) and + not nesting_state.InNamespaceBody() and + not nesting_state.InExternC()): elided = clean_lines.elided prev_line = elided[linenum - 1] prevbrace = prev_line.rfind('{') @@ -2628,54 +3272,64 @@ def CheckSpacing(filename, clean_lines, linenum, nesting_state, error): error(filename, linenum, 'whitespace/blank_line', 3, 'Do not leave a blank line after "%s:"' % matched.group(1)) - # Next, we complain if there's a comment too near the text - commentpos = line.find('//') - if commentpos != -1: - # Check if the // may be in quotes. If so, ignore it - # Comparisons made explicit for clarity -- pylint: disable=g-explicit-bool-comparison - if (line.count('"', 0, commentpos) - - line.count('\\"', 0, commentpos)) % 2 == 0: # not in quotes - # Allow one space for new scopes, two spaces otherwise: - if (not Match(r'^\s*{ //', line) and - ((commentpos >= 1 and - line[commentpos-1] not in string.whitespace) or - (commentpos >= 2 and - line[commentpos-2] not in string.whitespace))): - error(filename, linenum, 'whitespace/comments', 2, - 'At least two spaces is best between code and comments') - # There should always be a space between the // and the comment - commentend = commentpos + 2 - if commentend < len(line) and not line[commentend] == ' ': - # but some lines are exceptions -- e.g. if they're big - # comment delimiters like: - # //---------------------------------------------------------- - # or are an empty C++ style Doxygen comment, like: - # /// - # or C++ style Doxygen comments placed after the variable: - # ///< Header comment - # //!< Header comment - # or they begin with multiple slashes followed by a space: - # //////// Header comment - match = (Search(r'[=/-]{4,}\s*$', line[commentend:]) or - Search(r'^/$', line[commentend:]) or - Search(r'^!< ', line[commentend:]) or - Search(r'^/< ', line[commentend:]) or - Search(r'^/+ ', line[commentend:])) - if not match: - error(filename, linenum, 'whitespace/comments', 4, - 'Should have a space between // and comment') - CheckComment(line[commentpos:], filename, linenum, error) - - line = clean_lines.elided[linenum] # get rid of comments and strings - - # Don't try to do spacing checks for operator methods - line = re.sub(r'operator(==|!=|<|<<|<=|>=|>>|>)\(', 'operator\(', line) + # Next, check comments + next_line_start = 0 + if linenum + 1 < clean_lines.NumLines(): + next_line = raw[linenum + 1] + next_line_start = len(next_line) - len(next_line.lstrip()) + CheckComment(line, filename, linenum, next_line_start, error) + + # get rid of comments and strings + line = clean_lines.elided[linenum] + + # You shouldn't have spaces before your brackets, except maybe after + # 'delete []', 'return []() {};', or 'auto [abc, ...] = ...;'. + if Search(r'\w\s+\[', line) and not Search(r'(?:auto&?|delete|return)\s+\[', line): + error(filename, linenum, 'whitespace/braces', 5, + 'Extra space before [') + + # In range-based for, we wanted spaces before and after the colon, but + # not around "::" tokens that might appear. + if (Search(r'for *\(.*[^:]:[^: ]', line) or + Search(r'for *\(.*[^: ]:[^:]', line)): + error(filename, linenum, 'whitespace/forcolon', 2, + 'Missing space around colon in range-based for loop') + + +def CheckOperatorSpacing(filename, clean_lines, linenum, error): + """Checks for horizontal spacing around operators. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + line = clean_lines.elided[linenum] + + # Don't try to do spacing checks for operator methods. Do this by + # replacing the troublesome characters with something else, + # preserving column position for all other characters. + # + # The replacement is done repeatedly to avoid false positives from + # operators that call operators. + while True: + match = Match(r'^(.*\boperator\b)(\S+)(\s*\(.*)$', line) + if match: + line = match.group(1) + ('_' * len(match.group(2))) + match.group(3) + else: + break # We allow no-spaces around = within an if: "if ( (a=Foo()) == 0 )". # Otherwise not. Note we only check for non-spaces on *both* sides; # sometimes people put non-spaces on one side when aligning ='s among # many lines (not that this is behavior that I approve of...) - if Search(r'[\w.]=[\w.]', line) and not Search(r'\b(if|while) ', line): + if ((Search(r'[\w.]=', line) or + Search(r'=[\w.]', line)) + and not Search(r'\b(if|while|for) ', line) + # Operators taken from [lex.operators] in C++11 standard. + and not Search(r'(>=|<=|==|!=|&=|\^=|\|=|\+=|\*=|\/=|\%=)', line) + and not Search(r'operator=', line)): error(filename, linenum, 'whitespace/operators', 4, 'Missing spaces around =') @@ -2687,42 +3341,51 @@ def CheckSpacing(filename, clean_lines, linenum, nesting_state, error): # # Check <= and >= first to avoid false positives with < and >, then # check non-include lines for spacing around < and >. - match = Search(r'[^<>=!\s](==|!=|<=|>=)[^<>=!\s]', line) + # + # If the operator is followed by a comma, assume it's be used in a + # macro context and don't do any checks. This avoids false + # positives. + # + # Note that && is not included here. This is because there are too + # many false positives due to RValue references. + match = Search(r'[^<>=!\s](==|!=|<=|>=|\|\|)[^<>=!\s,;\)]', line) if match: error(filename, linenum, 'whitespace/operators', 3, 'Missing spaces around %s' % match.group(1)) - # We allow no-spaces around << when used like this: 10<<20, but - # not otherwise (particularly, not when used as streams) - # Also ignore using ns::operator<<; - match = Search(r'(operator|\S)(?:L|UL|ULL|l|ul|ull)?<<(\S)', line) - if (match and - not (match.group(1).isdigit() and match.group(2).isdigit()) and - not (match.group(1) == 'operator' and match.group(2) == ';')): - error(filename, linenum, 'whitespace/operators', 3, - 'Missing spaces around <<') elif not Match(r'#.*include', line): - # Avoid false positives on -> - reduced_line = line.replace('->', '') - # Look for < that is not surrounded by spaces. This is only # triggered if both sides are missing spaces, even though # technically should should flag if at least one side is missing a # space. This is done to avoid some false positives with shifts. - match = Search(r'[^\s<]<([^\s=<].*)', reduced_line) - if (match and - not FindNextMatchingAngleBracket(clean_lines, linenum, match.group(1))): - error(filename, linenum, 'whitespace/operators', 3, - 'Missing spaces around <') + match = Match(r'^(.*[^\s<])<[^\s=<,]', line) + if match: + (_, _, end_pos) = CloseExpression( + clean_lines, linenum, len(match.group(1))) + if end_pos <= -1: + error(filename, linenum, 'whitespace/operators', 3, + 'Missing spaces around <') # Look for > that is not surrounded by spaces. Similar to the # above, we only trigger if both sides are missing spaces to avoid # false positives with shifts. - match = Search(r'^(.*[^\s>])>[^\s=>]', reduced_line) - if (match and - not FindPreviousMatchingAngleBracket(clean_lines, linenum, - match.group(1))): - error(filename, linenum, 'whitespace/operators', 3, - 'Missing spaces around >') + match = Match(r'^(.*[^-\s>])>[^\s=>,]', line) + if match: + (_, _, start_pos) = ReverseCloseExpression( + clean_lines, linenum, len(match.group(1))) + if start_pos <= -1: + error(filename, linenum, 'whitespace/operators', 3, + 'Missing spaces around >') + + # We allow no-spaces around << when used like this: 10<<20, but + # not otherwise (particularly, not when used as streams) + # + # We also allow operators following an opening parenthesis, since + # those tend to be macros that deal with operators. + match = Search(r'(operator|[^\s(<])(?:L|UL|LL|ULL|l|ul|ll|ull)?<<([^\s,=<])', line) + if (match and not (match.group(1).isdigit() and match.group(2).isdigit()) and + not (match.group(1) == 'operator' and match.group(2) == ';')): + error(filename, linenum, 'whitespace/operators', 3, + 'Missing spaces around <<') # We allow no-spaces around >> for almost anything. This is because # C++11 allows ">>" to close nested templates, which accounts for @@ -2747,7 +3410,19 @@ def CheckSpacing(filename, clean_lines, linenum, nesting_state, error): error(filename, linenum, 'whitespace/operators', 4, 'Extra space for operator %s' % match.group(1)) - # A pet peeve of mine: no spaces after an if, while, switch, or for + +def CheckParenthesisSpacing(filename, clean_lines, linenum, error): + """Checks for horizontal spacing around parentheses. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + line = clean_lines.elided[linenum] + + # No spaces after an if, while, switch, or for match = Search(r' (if\(|for\(|while\(|switch\()', line) if match: error(filename, linenum, 'whitespace/parens', 5, @@ -2773,6 +3448,19 @@ def CheckSpacing(filename, clean_lines, linenum, nesting_state, error): 'Should have zero or one spaces inside ( and ) in %s' % match.group(1)) + +def CheckCommaSpacing(filename, clean_lines, linenum, error): + """Checks for horizontal spacing near commas and semicolons. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + raw = clean_lines.lines_without_raw_strings + line = clean_lines.elided[linenum] + # You should always have a space after a comma (either as fn arg or operator) # # This does not apply when the non-space character following the @@ -2783,7 +3471,8 @@ def CheckSpacing(filename, clean_lines, linenum, nesting_state, error): # verify that lines contain missing whitespaces, second pass on raw # lines to confirm that those missing whitespaces are not due to # elided comments. - if Search(r',[^,\s]', line) and Search(r',[^,\s]', raw[linenum]): + if (Search(r',[^,\s]', ReplaceAll(r'\boperator\s*,\s*\(', 'F(', line)) and + Search(r',[^,\s]', raw[linenum])): error(filename, linenum, 'whitespace/comma', 3, 'Missing space after ,') @@ -2795,14 +3484,91 @@ def CheckSpacing(filename, clean_lines, linenum, nesting_state, error): error(filename, linenum, 'whitespace/semicolon', 3, 'Missing space after ;') - # Next we will look for issues with function calls. - CheckSpacingForFunctionCall(filename, line, linenum, error) + +def _IsType(clean_lines, nesting_state, expr): + """Check if expression looks like a type name, returns true if so. + + Args: + clean_lines: A CleansedLines instance containing the file. + nesting_state: A NestingState instance which maintains information about + the current stack of nested blocks being parsed. + expr: The expression to check. + Returns: + True, if token looks like a type. + """ + # Keep only the last token in the expression + last_word = Match(r'^.*(\b\S+)$', expr) + if last_word: + token = last_word.group(1) + else: + token = expr + + # Match native types and stdint types + if _TYPES.match(token): + return True + + # Try a bit harder to match templated types. Walk up the nesting + # stack until we find something that resembles a typename + # declaration for what we are looking for. + typename_pattern = (r'\b(?:typename|class|struct)\s+' + re.escape(token) + + r'\b') + block_index = len(nesting_state.stack) - 1 + while block_index >= 0: + if isinstance(nesting_state.stack[block_index], _NamespaceInfo): + return False + + # Found where the opening brace is. We want to scan from this + # line up to the beginning of the function, minus a few lines. + # template + # class C + # : public ... { // start scanning here + last_line = nesting_state.stack[block_index].starting_linenum + + next_block_start = 0 + if block_index > 0: + next_block_start = nesting_state.stack[block_index - 1].starting_linenum + first_line = last_line + while first_line >= next_block_start: + if clean_lines.elided[first_line].find('template') >= 0: + break + first_line -= 1 + if first_line < next_block_start: + # Didn't find any "template" keyword before reaching the next block, + # there are probably no template things to check for this block + block_index -= 1 + continue + + # Look for typename in the specified range + for i in xrange(first_line, last_line + 1, 1): + if Search(typename_pattern, clean_lines.elided[i]): + return True + block_index -= 1 + + return False + + +def CheckBracesSpacing(filename, clean_lines, linenum, nesting_state, error): + """Checks for horizontal spacing near commas. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + nesting_state: A NestingState instance which maintains information about + the current stack of nested blocks being parsed. + error: The function to call with any errors found. + """ + line = clean_lines.elided[linenum] # Except after an opening paren, or after another opening brace (in case of # an initializer list, for instance), you should have spaces before your - # braces. And since you should never have braces at the beginning of a line, - # this is an easy test. - match = Match(r'^(.*[^ ({]){', line) + # braces when they are delimiting blocks, classes, namespaces etc. + # And since you should never have braces at the beginning of a line, + # this is an easy test. Except that braces used for initialization don't + # follow the same rule; we often don't want spaces before those. + match = Match(r'^(.*[^ ({>]){', line) + if match: # Try a bit harder to check for brace initialization. This # happens in one of the following forms: @@ -2813,10 +3579,12 @@ def CheckSpacing(filename, clean_lines, linenum, nesting_state, error): # LastArgument(..., type{}); # LOG(INFO) << type{} << " ..."; # map_of_type[{...}] = ...; + # ternary = expr ? new type{} : nullptr; + # OuterTemplate{}> # # We check for the character following the closing brace, and # silence the warning if it's one of those listed above, i.e. - # "{.;,)<]". + # "{.;,)<>]:". # # To account for nested initializer list, we allow any number of # closing braces up to "{;,)<". We can't simply silence the @@ -2830,6 +3598,7 @@ def CheckSpacing(filename, clean_lines, linenum, nesting_state, error): # There is a false negative with this approach if people inserted # spurious semicolons, e.g. "if (cond){};", but we will catch the # spurious semicolon with a separate check. + leading_text = match.group(1) (endline, endlinenum, endpos) = CloseExpression( clean_lines, linenum, len(match.group(1))) trailing_text = '' @@ -2838,7 +3607,11 @@ def CheckSpacing(filename, clean_lines, linenum, nesting_state, error): for offset in xrange(endlinenum + 1, min(endlinenum + 3, clean_lines.NumLines() - 1)): trailing_text += clean_lines.elided[offset] - if not Match(r'^[\s}]*[{.;,)<\]]', trailing_text): + # We also suppress warnings for `uint64_t{expression}` etc., as the style + # guide recommends brace initialization for integral types to avoid + # overflow/truncation. + if (not Match(r'^[\s}]*[{.;,)<>\]:]', trailing_text) + and not _IsType(clean_lines, nesting_state, leading_text)): error(filename, linenum, 'whitespace/braces', 5, 'Missing space before {') @@ -2847,12 +3620,6 @@ def CheckSpacing(filename, clean_lines, linenum, nesting_state, error): error(filename, linenum, 'whitespace/braces', 5, 'Missing space before else') - # You shouldn't have spaces before your brackets, except maybe after - # 'delete []' or 'new char * []'. - if Search(r'\w\s+\[', line) and not Search(r'delete\s+\[', line): - error(filename, linenum, 'whitespace/braces', 5, - 'Extra space before [') - # You shouldn't have a space before a semicolon at the end of the line. # There's a special case for "for" since the style guide allows space before # the semicolon there. @@ -2869,12 +3636,23 @@ def CheckSpacing(filename, clean_lines, linenum, nesting_state, error): 'Extra space before last semicolon. If this should be an empty ' 'statement, use {} instead.') - # In range-based for, we wanted spaces before and after the colon, but - # not around "::" tokens that might appear. - if (Search('for *\(.*[^:]:[^: ]', line) or - Search('for *\(.*[^: ]:[^:]', line)): - error(filename, linenum, 'whitespace/forcolon', 2, - 'Missing space around colon in range-based for loop') + +def IsDecltype(clean_lines, linenum, column): + """Check if the token ending on (linenum, column) is decltype(). + + Args: + clean_lines: A CleansedLines instance containing the file. + linenum: the number of the line to check. + column: end column of the token to check. + Returns: + True if this token is decltype() expression, False otherwise. + """ + (text, _, start_col) = ReverseCloseExpression(clean_lines, linenum, column) + if start_col < 0: + return False + if Search(r'\bdecltype\s*$', text[0:start_col]): + return True + return False def CheckSectionSpacing(filename, clean_lines, class_info, linenum, error): @@ -2974,15 +3752,18 @@ def CheckBraces(filename, clean_lines, linenum, error): # used for brace initializers inside function calls. We don't detect this # perfectly: we just don't complain if the last non-whitespace character on # the previous non-blank line is ',', ';', ':', '(', '{', or '}', or if the - # previous line starts a preprocessor block. + # previous line starts a preprocessor block. We also allow a brace on the + # following line if it is part of an array initialization and would not fit + # within the 80 character limit of the preceding line. prevline = GetPreviousNonBlankLine(clean_lines, linenum)[0] if (not Search(r'[,;:}{(]\s*$', prevline) and - not Match(r'\s*#', prevline)): + not Match(r'\s*#', prevline) and + not (GetLineWidth(prevline) > _line_length - 2 and '[]' in prevline)): error(filename, linenum, 'whitespace/braces', 4, '{ should almost always be at the end of the previous line') # An else clause should be on the same line as the preceding closing brace. - if Match(r'\s*else\s*', line): + if Match(r'\s*else\b\s*(?:if\b|\{|$)', line): prevline = GetPreviousNonBlankLine(clean_lines, linenum)[0] if Match(r'\s*}\s*$', prevline): error(filename, linenum, 'whitespace/newline', 4, @@ -2990,19 +3771,20 @@ def CheckBraces(filename, clean_lines, linenum, error): # If braces come on one side of an else, they should be on both. # However, we have to worry about "else if" that spans multiple lines! - if Search(r'}\s*else[^{]*$', line) or Match(r'[^}]*else\s*{', line): - if Search(r'}\s*else if([^{]*)$', line): # could be multi-line if - # find the ( after the if - pos = line.find('else if') - pos = line.find('(', pos) - if pos > 0: - (endline, _, endpos) = CloseExpression(clean_lines, linenum, pos) - if endline[endpos:].find('{') == -1: # must be brace after if - error(filename, linenum, 'readability/braces', 5, - 'If an else has a brace on one side, it should have it on both') - else: # common case: else not followed by a multi-line if - error(filename, linenum, 'readability/braces', 5, - 'If an else has a brace on one side, it should have it on both') + if Search(r'else if\s*\(', line): # could be multi-line if + brace_on_left = bool(Search(r'}\s*else if\s*\(', line)) + # find the ( after the if + pos = line.find('else if') + pos = line.find('(', pos) + if pos > 0: + (endline, _, endpos) = CloseExpression(clean_lines, linenum, pos) + brace_on_right = endline[endpos:].find('{') != -1 + if brace_on_left != brace_on_right: # must be brace after if + error(filename, linenum, 'readability/braces', 5, + 'If an else has a brace on one side, it should have it on both') + elif Search(r'}\s*else[^{]*$', line) or Match(r'[^}]*else\s*{', line): + error(filename, linenum, 'readability/braces', 5, + 'If an else has a brace on one side, it should have it on both') # Likewise, an else should never have the else clause on the same line if Search(r'\belse [^\s{]', line) and not Search(r'\belse if\b', line): @@ -3014,11 +3796,79 @@ def CheckBraces(filename, clean_lines, linenum, error): error(filename, linenum, 'whitespace/newline', 4, 'do/while clauses should not be on a single line') + # Check single-line if/else bodies. The style guide says 'curly braces are not + # required for single-line statements'. We additionally allow multi-line, + # single statements, but we reject anything with more than one semicolon in + # it. This means that the first semicolon after the if should be at the end of + # its line, and the line after that should have an indent level equal to or + # lower than the if. We also check for ambiguous if/else nesting without + # braces. + if_else_match = Search(r'\b(if\s*\(|else\b)', line) + if if_else_match and not Match(r'\s*#', line): + if_indent = GetIndentLevel(line) + endline, endlinenum, endpos = line, linenum, if_else_match.end() + if_match = Search(r'\bif\s*\(', line) + if if_match: + # This could be a multiline if condition, so find the end first. + pos = if_match.end() - 1 + (endline, endlinenum, endpos) = CloseExpression(clean_lines, linenum, pos) + # Check for an opening brace, either directly after the if or on the next + # line. If found, this isn't a single-statement conditional. + if (not Match(r'\s*{', endline[endpos:]) + and not (Match(r'\s*$', endline[endpos:]) + and endlinenum < (len(clean_lines.elided) - 1) + and Match(r'\s*{', clean_lines.elided[endlinenum + 1]))): + while (endlinenum < len(clean_lines.elided) + and ';' not in clean_lines.elided[endlinenum][endpos:]): + endlinenum += 1 + endpos = 0 + if endlinenum < len(clean_lines.elided): + endline = clean_lines.elided[endlinenum] + # We allow a mix of whitespace and closing braces (e.g. for one-liner + # methods) and a single \ after the semicolon (for macros) + endpos = endline.find(';') + if not Match(r';[\s}]*(\\?)$', endline[endpos:]): + # Semicolon isn't the last character, there's something trailing. + # Output a warning if the semicolon is not contained inside + # a lambda expression. + if not Match(r'^[^{};]*\[[^\[\]]*\][^{}]*\{[^{}]*\}\s*\)*[;,]\s*$', + endline): + error(filename, linenum, 'readability/braces', 4, + 'If/else bodies with multiple statements require braces') + elif endlinenum < len(clean_lines.elided) - 1: + # Make sure the next line is dedented + next_line = clean_lines.elided[endlinenum + 1] + next_indent = GetIndentLevel(next_line) + # With ambiguous nested if statements, this will error out on the + # if that *doesn't* match the else, regardless of whether it's the + # inner one or outer one. + if (if_match and Match(r'\s*else\b', next_line) + and next_indent != if_indent): + error(filename, linenum, 'readability/braces', 4, + 'Else clause should be indented at the same level as if. ' + 'Ambiguous nested if/else chains require braces.') + elif next_indent > if_indent: + error(filename, linenum, 'readability/braces', 4, + 'If/else bodies with multiple statements require braces') + + +def CheckTrailingSemicolon(filename, clean_lines, linenum, error): + """Looks for redundant trailing semicolon. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + + line = clean_lines.elided[linenum] + # Block bodies should not be followed by a semicolon. Due to C++11 # brace initialization, there are more places where semicolons are - # required than not, so we use a whitelist approach to check these - # rather than a blacklist. These are the places where "};" should - # be replaced by just "}": + # required than not, so we explicitly list the allowed rules rather + # than listing the disallowed ones. These are the places where "};" + # should be replaced by just "}": # 1. Some flavor of block following closing parenthesis: # for (;;) {}; # while (...) {}; @@ -3074,28 +3924,40 @@ def CheckBraces(filename, clean_lines, linenum, error): # - INTERFACE_DEF # - EXCLUSIVE_LOCKS_REQUIRED, SHARED_LOCKS_REQUIRED, LOCKS_EXCLUDED: # - # We implement a whitelist of safe macros instead of a blacklist of + # We implement a list of safe macros instead of a list of # unsafe macros, even though the latter appears less frequently in # google code and would have been easier to implement. This is because - # the downside for getting the whitelist wrong means some extra - # semicolons, while the downside for getting the blacklist wrong + # the downside for getting the allowed checks wrong means some extra + # semicolons, while the downside for getting disallowed checks wrong # would result in compile errors. # - # In addition to macros, we also don't want to warn on compound - # literals. + # In addition to macros, we also don't want to warn on + # - Compound literals + # - Lambdas + # - alignas specifier with anonymous structs + # - decltype closing_brace_pos = match.group(1).rfind(')') opening_parenthesis = ReverseCloseExpression( clean_lines, linenum, closing_brace_pos) if opening_parenthesis[2] > -1: line_prefix = opening_parenthesis[0][0:opening_parenthesis[2]] - macro = Search(r'\b([A-Z_]+)\s*$', line_prefix) + macro = Search(r'\b([A-Z_][A-Z0-9_]*)\s*$', line_prefix) + func = Match(r'^(.*\])\s*$', line_prefix) if ((macro and macro.group(1) not in ( 'TEST', 'TEST_F', 'MATCHER', 'MATCHER_P', 'TYPED_TEST', 'EXCLUSIVE_LOCKS_REQUIRED', 'SHARED_LOCKS_REQUIRED', 'LOCKS_EXCLUDED', 'INTERFACE_DEF')) or + (func and not Search(r'\boperator\s*\[\s*\]', func.group(1))) or + Search(r'\b(?:struct|union)\s+alignas\s*$', line_prefix) or + Search(r'\bdecltype$', line_prefix) or Search(r'\s+=\s*$', line_prefix)): match = None + if (match and + opening_parenthesis[1] > 1 and + Search(r'\]\s*$', clean_lines.elided[opening_parenthesis[1] - 1])): + # Multi-line lambda-expression + match = None else: # Try matching cases 2-3. @@ -3125,6 +3987,14 @@ def CheckBraces(filename, clean_lines, linenum, error): # outputting warnings for the matching closing brace, if there are # nested blocks with trailing semicolons, we will get the error # messages in reversed order. + + # We need to check the line forward for NOLINT + raw_lines = clean_lines.raw_lines + ParseNolintSuppressions(filename, raw_lines[endlinenum-1], endlinenum-1, + error) + ParseNolintSuppressions(filename, raw_lines[endlinenum], endlinenum, + error) + error(filename, endlinenum, 'readability/braces', 4, "You don't need a ; after a }") @@ -3148,7 +4018,7 @@ def CheckEmptyBlockBody(filename, clean_lines, linenum, error): line = clean_lines.elided[linenum] matched = Match(r'\s*(for|while|if)\s*\(', line) if matched: - # Find the end of the conditional expression + # Find the end of the conditional expression. (end_line, end_linenum, end_pos) = CloseExpression( clean_lines, linenum, line.find('(')) @@ -3163,6 +4033,98 @@ def CheckEmptyBlockBody(filename, clean_lines, linenum, error): error(filename, end_linenum, 'whitespace/empty_loop_body', 5, 'Empty loop bodies should use {} or continue') + # Check for if statements that have completely empty bodies (no comments) + # and no else clauses. + if end_pos >= 0 and matched.group(1) == 'if': + # Find the position of the opening { for the if statement. + # Return without logging an error if it has no brackets. + opening_linenum = end_linenum + opening_line_fragment = end_line[end_pos:] + # Loop until EOF or find anything that's not whitespace or opening {. + while not Search(r'^\s*\{', opening_line_fragment): + if Search(r'^(?!\s*$)', opening_line_fragment): + # Conditional has no brackets. + return + opening_linenum += 1 + if opening_linenum == len(clean_lines.elided): + # Couldn't find conditional's opening { or any code before EOF. + return + opening_line_fragment = clean_lines.elided[opening_linenum] + # Set opening_line (opening_line_fragment may not be entire opening line). + opening_line = clean_lines.elided[opening_linenum] + + # Find the position of the closing }. + opening_pos = opening_line_fragment.find('{') + if opening_linenum == end_linenum: + # We need to make opening_pos relative to the start of the entire line. + opening_pos += end_pos + (closing_line, closing_linenum, closing_pos) = CloseExpression( + clean_lines, opening_linenum, opening_pos) + if closing_pos < 0: + return + + # Now construct the body of the conditional. This consists of the portion + # of the opening line after the {, all lines until the closing line, + # and the portion of the closing line before the }. + if (clean_lines.raw_lines[opening_linenum] != + CleanseComments(clean_lines.raw_lines[opening_linenum])): + # Opening line ends with a comment, so conditional isn't empty. + return + if closing_linenum > opening_linenum: + # Opening line after the {. Ignore comments here since we checked above. + body = list(opening_line[opening_pos+1:]) + # All lines until closing line, excluding closing line, with comments. + body.extend(clean_lines.raw_lines[opening_linenum+1:closing_linenum]) + # Closing line before the }. Won't (and can't) have comments. + body.append(clean_lines.elided[closing_linenum][:closing_pos-1]) + body = '\n'.join(body) + else: + # If statement has brackets and fits on a single line. + body = opening_line[opening_pos+1:closing_pos-1] + + # Check if the body is empty + if not _EMPTY_CONDITIONAL_BODY_PATTERN.search(body): + return + # The body is empty. Now make sure there's not an else clause. + current_linenum = closing_linenum + current_line_fragment = closing_line[closing_pos:] + # Loop until EOF or find anything that's not whitespace or else clause. + while Search(r'^\s*$|^(?=\s*else)', current_line_fragment): + if Search(r'^(?=\s*else)', current_line_fragment): + # Found an else clause, so don't log an error. + return + current_linenum += 1 + if current_linenum == len(clean_lines.elided): + break + current_line_fragment = clean_lines.elided[current_linenum] + + # The body is empty and there's no else clause until EOF or other code. + error(filename, end_linenum, 'whitespace/empty_if_body', 4, + ('If statement had no body and no else clause')) + + +def FindCheckMacro(line): + """Find a replaceable CHECK-like macro. + + Args: + line: line to search on. + Returns: + (macro name, start position), or (None, -1) if no replaceable + macro is found. + """ + for macro in _CHECK_MACROS: + i = line.find(macro) + if i >= 0: + # Find opening parenthesis. Do a regular expression match here + # to make sure that we are matching the expected CHECK macro, as + # opposed to some other macro that happens to contain the CHECK + # substring. + matched = Match(r'^(.*\b' + macro + r'\s*)\(', line) + if not matched: + continue + return (macro, len(matched.group(1))) + return (None, -1) + def CheckCheck(filename, clean_lines, linenum, error): """Checks the use of CHECK and EXPECT macros. @@ -3176,24 +4138,8 @@ def CheckCheck(filename, clean_lines, linenum, error): # Decide the set of replacement macros that should be suggested lines = clean_lines.elided - check_macro = None - start_pos = -1 - for macro in _CHECK_MACROS: - i = lines[linenum].find(macro) - if i >= 0: - check_macro = macro - - # Find opening parenthesis. Do a regular expression match here - # to make sure that we are matching the expected CHECK macro, as - # opposed to some other macro that happens to contain the CHECK - # substring. - matched = Match(r'^(.*\b' + check_macro + r'\s*)\(', lines[linenum]) - if not matched: - continue - start_pos = len(matched.group(1)) - break - if not check_macro or start_pos < 0: - # Don't waste time here if line doesn't contain 'CHECK' or 'EXPECT' + (check_macro, start_pos) = FindCheckMacro(lines[linenum]) + if not check_macro: return # Find end of the boolean expression by matching parentheses @@ -3201,6 +4147,13 @@ def CheckCheck(filename, clean_lines, linenum, error): clean_lines, linenum, start_pos) if end_pos < 0: return + + # If the check macro is followed by something other than a + # semicolon, assume users will log their own custom error messages + # and don't suggest any replacements. + if not Match(r'\s*;', last_line[end_pos:]): + return + if linenum == end_line: expression = lines[linenum][start_pos + 1:end_pos - 1] else: @@ -3223,7 +4176,7 @@ def CheckCheck(filename, clean_lines, linenum, error): if token == '(': # Parenthesized operand expression = matched.group(2) - (end, _) = FindEndOfExpressionInLine(expression, 0, 1, '(', ')') + (end, _) = FindEndOfExpressionInLine(expression, 0, ['(']) if end < 0: return # Unmatched parenthesis lhs += '(' + expression[0:end] @@ -3339,6 +4292,16 @@ def GetLineWidth(line): if unicodedata.east_asian_width(uc) in ('W', 'F'): width += 2 elif not unicodedata.combining(uc): + # Issue 337 + # https://mail.python.org/pipermail/python-list/2012-August/628809.html + if (sys.version_info.major, sys.version_info.minor) <= (3, 2): + # https://github.com/python/cpython/blob/2.7/Include/unicodeobject.h#L81 + is_wide_build = sysconfig.get_config_var("Py_UNICODE_SIZE") >= 4 + # https://github.com/python/cpython/blob/2.7/Objects/unicodeobject.c#L564 + is_low_surrogate = 0xDC00 <= ord(uc) <= 0xDFFF + if not is_wide_build and is_low_surrogate: + width -= 1 + width += 1 return width else: @@ -3358,7 +4321,7 @@ def CheckStyle(filename, clean_lines, linenum, file_extension, nesting_state, clean_lines: A CleansedLines instance containing the file. linenum: The number of the line to check. file_extension: The extension (without the dot) of the filename. - nesting_state: A _NestingState instance which maintains information about + nesting_state: A NestingState instance which maintains information about the current stack of nested blocks being parsed. error: The function to call with any errors found. """ @@ -3368,6 +4331,7 @@ def CheckStyle(filename, clean_lines, linenum, file_extension, nesting_state, # raw strings, raw_lines = clean_lines.lines_without_raw_strings line = raw_lines[linenum] + prev = raw_lines[linenum - 1] if linenum > 0 else '' if line.find('\t') != -1: error(filename, linenum, 'whitespace/tab', 1, @@ -3385,23 +4349,33 @@ def CheckStyle(filename, clean_lines, linenum, file_extension, nesting_state, # if(match($0, " <<")) complain = 0; # if(match(prev, " +for \\(")) complain = 0; # if(prevodd && match(prevprev, " +for \\(")) complain = 0; + scope_or_label_pattern = r'\s*\w+\s*:\s*\\?$' + classinfo = nesting_state.InnermostClass() initial_spaces = 0 cleansed_line = clean_lines.elided[linenum] while initial_spaces < len(line) and line[initial_spaces] == ' ': initial_spaces += 1 - if line and line[-1].isspace(): - error(filename, linenum, 'whitespace/end_of_line', 4, - 'Line ends in whitespace. Consider deleting these extra spaces.') - # There are certain situations we allow one space, notably for section labels - elif ((initial_spaces == 1 or initial_spaces == 3) and - not Match(r'\s*\w+\s*:\s*$', cleansed_line)): + # There are certain situations we allow one space, notably for + # section labels, and also lines containing multi-line raw strings. + # We also don't check for lines that look like continuation lines + # (of lines ending in double quotes, commas, equals, or angle brackets) + # because the rules for how to indent those are non-trivial. + if (not Search(r'[",=><] *$', prev) and + (initial_spaces == 1 or initial_spaces == 3) and + not Match(scope_or_label_pattern, cleansed_line) and + not (clean_lines.raw_lines[linenum] != line and + Match(r'^\s*""', line))): error(filename, linenum, 'whitespace/indent', 3, 'Weird number of spaces at line-start. ' 'Are you using a 2-space indent?') + if line and line[-1].isspace(): + error(filename, linenum, 'whitespace/end_of_line', 4, + 'Line ends in whitespace. Consider deleting these extra spaces.') + # Check if the line is a header guard. is_header_guard = False - if file_extension == 'h': + if IsHeaderExtension(file_extension): cppvar = GetHeaderGuardCPPVariable(filename) if (line.startswith('#ifndef %s' % cppvar) or line.startswith('#define %s' % cppvar) or @@ -3417,14 +4391,10 @@ def CheckStyle(filename, clean_lines, linenum, file_extension, nesting_state, # developers fault. if (not line.startswith('#include') and not is_header_guard and not Match(r'^\s*//.*http(s?)://\S*$', line) and + not Match(r'^\s*//\s*[^\s]*$', line) and not Match(r'^// \$Id:.*#[0-9]+ \$$', line)): line_width = GetLineWidth(line) - extended_length = int((_line_length * 1.25)) - if line_width > extended_length: - error(filename, linenum, 'whitespace/line_length', 4, - 'Lines should very rarely be longer than %i characters' % - extended_length) - elif line_width > _line_length: + if line_width > _line_length: error(filename, linenum, 'whitespace/line_length', 2, 'Lines should be <= %i characters long' % _line_length) @@ -3442,9 +4412,14 @@ def CheckStyle(filename, clean_lines, linenum, file_extension, nesting_state, # Some more style checks CheckBraces(filename, clean_lines, linenum, error) + CheckTrailingSemicolon(filename, clean_lines, linenum, error) CheckEmptyBlockBody(filename, clean_lines, linenum, error) - CheckAccess(filename, clean_lines, linenum, nesting_state, error) CheckSpacing(filename, clean_lines, linenum, nesting_state, error) + CheckOperatorSpacing(filename, clean_lines, linenum, error) + CheckParenthesisSpacing(filename, clean_lines, linenum, error) + CheckCommaSpacing(filename, clean_lines, linenum, error) + CheckBracesSpacing(filename, clean_lines, linenum, nesting_state, error) + CheckSpacingForFunctionCall(filename, clean_lines, linenum, error) CheckCheck(filename, clean_lines, linenum, error) CheckAltTokens(filename, clean_lines, linenum, error) classinfo = nesting_state.InnermostClass() @@ -3452,7 +4427,6 @@ def CheckStyle(filename, clean_lines, linenum, file_extension, nesting_state, CheckSectionSpacing(filename, clean_lines, classinfo, linenum, error) -_RE_PATTERN_INCLUDE_NEW_STYLE = re.compile(r'#include +"[^/]+\.h"') _RE_PATTERN_INCLUDE = re.compile(r'^\s*#\s*include\s*([<"])([^>"]*)[>"].*$') # Matches the first component of a filename delimited by -s and _s. That is: # _RE_FIRST_COMPONENT.match('foo').group(0) == 'foo' @@ -3489,23 +4463,6 @@ def _DropCommonSuffixes(filename): return os.path.splitext(filename)[0] -def _IsTestFilename(filename): - """Determines if the given filename has a suffix that identifies it as a test. - - Args: - filename: The input filename. - - Returns: - True if 'filename' looks like a test, False otherwise. - """ - if (filename.endswith('_test.cc') or - filename.endswith('_unittest.cc') or - filename.endswith('_regtest.cc')): - return True - else: - return False - - def _ClassifyInclude(fileinfo, include, is_system): """Figures out what kind of header 'include' is. @@ -3581,11 +4538,17 @@ def CheckIncludeLine(filename, clean_lines, linenum, include_state, error): error: The function to call with any errors found. """ fileinfo = FileInfo(filename) - line = clean_lines.lines[linenum] # "include" should use the new style "foo/bar.h" instead of just "bar.h" - if _RE_PATTERN_INCLUDE_NEW_STYLE.search(line): + # Only do this check if the included header follows google naming + # conventions. If not, assume that it's a 3rd party API that + # requires special include conventions. + # + # We also make an exception for Lua headers, which follow google + # naming convention but not the include convention. + match = Match(r'#include\s*"([^/]+\.h)"', line) + if match and not _THIRD_PARTY_HEADERS_PATTERN.match(match.group(1)): error(filename, linenum, 'build/include', 4, 'Include the directory when naming .h files') @@ -3596,12 +4559,17 @@ def CheckIncludeLine(filename, clean_lines, linenum, include_state, error): if match: include = match.group(2) is_system = (match.group(1) == '<') - if include in include_state: + duplicate_line = include_state.FindHeader(include) + if duplicate_line >= 0: error(filename, linenum, 'build/include', 4, '"%s" already included at %s:%s' % - (include, filename, include_state[include])) - else: - include_state[include] = linenum + (include, filename, duplicate_line)) + elif (include.endswith('.cc') and + os.path.dirname(fileinfo.RepositoryName()) != os.path.dirname(include)): + error(filename, linenum, 'build/include', 4, + 'Do not include .cc files from other packages') + elif not _THIRD_PARTY_HEADERS_PATTERN.match(include): + include_state.include_list[-1].append((include, linenum)) # We want to ensure that headers appear in the right order: # 1) for foo.cc, foo.h (preferred location) @@ -3627,15 +4595,6 @@ def CheckIncludeLine(filename, clean_lines, linenum, include_state, error): 'Include "%s" not in alphabetical order' % include) include_state.SetLastHeader(canonical_include) - # Look for any of the stream classes that are part of standard C++. - match = _RE_PATTERN_INCLUDE.match(line) - if match: - include = match.group(2) - if Match(r'(f|ind|io|i|o|parse|pf|stdio|str|)?stream$', include): - # Many unit tests use cout, so we exempt them. - if not _IsTestFilename(filename): - error(filename, linenum, 'readability/streams', 3, - 'Streams are highly discouraged.') def _GetTextInside(text, start_pattern): @@ -3658,7 +4617,7 @@ def _GetTextInside(text, start_pattern): The extracted text. None if either the opening string or ending punctuation could not be found. """ - # TODO(sugawarayu): Audit cpplint.py to see what places could be profitably + # TODO(unknown): Audit cpplint.py to see what places could be profitably # rewritten to use _GetTextInside (and use inferior regexp matching today). # Give opening punctuations to get the matching close-punctuations. @@ -3718,6 +4677,9 @@ def _GetTextInside(text, start_pattern): _RE_PATTERN_CONST_REF_PARAM = ( r'(?:.*\s*\bconst\s*&\s*' + _RE_PATTERN_IDENT + r'|const\s+' + _RE_PATTERN_TYPE + r'\s*&\s*' + _RE_PATTERN_IDENT + r')') +# Stream types. +_RE_PATTERN_REF_STREAM_PARAM = ( + r'(?:.*stream\s*&\s*' + _RE_PATTERN_IDENT + r')') def CheckLanguage(filename, clean_lines, linenum, file_extension, @@ -3733,7 +4695,7 @@ def CheckLanguage(filename, clean_lines, linenum, file_extension, linenum: The number of the line to check. file_extension: The extension (without the dot) of the filename. include_state: An _IncludeState instance in which the headers are inserted. - nesting_state: A _NestingState instance which maintains information about + nesting_state: A NestingState instance which maintains information about the current stack of nested blocks being parsed. error: The function to call with any errors found. """ @@ -3750,129 +4712,23 @@ def CheckLanguage(filename, clean_lines, linenum, file_extension, # Reset include state across preprocessor directives. This is meant # to silence warnings for conditional includes. - if Match(r'^\s*#\s*(?:ifdef|elif|else|endif)\b', line): - include_state.ResetSection() + match = Match(r'^\s*#\s*(if|ifdef|ifndef|elif|else|endif)\b', line) + if match: + include_state.ResetSection(match.group(1)) # Make Windows paths like Unix. fullname = os.path.abspath(filename).replace('\\', '/') - # TODO(unknown): figure out if they're using default arguments in fn proto. + # Perform other checks now that we are sure that this is not an include line + CheckCasts(filename, clean_lines, linenum, error) + CheckGlobalStatic(filename, clean_lines, linenum, error) + CheckPrintf(filename, clean_lines, linenum, error) - # Check to see if they're using an conversion function cast. - # I just try to capture the most common basic types, though there are more. - # Parameterless conversion functions, such as bool(), are allowed as they are - # probably a member operator declaration or default constructor. - match = Search( - r'(\bnew\s+)?\b' # Grab 'new' operator, if it's there - r'(int|float|double|bool|char|int32|uint32|int64|uint64)' - r'(\([^)].*)', line) - if match: - matched_new = match.group(1) - matched_type = match.group(2) - matched_funcptr = match.group(3) - - # gMock methods are defined using some variant of MOCK_METHODx(name, type) - # where type may be float(), int(string), etc. Without context they are - # virtually indistinguishable from int(x) casts. Likewise, gMock's - # MockCallback takes a template parameter of the form return_type(arg_type), - # which looks much like the cast we're trying to detect. - # - # std::function<> wrapper has a similar problem. - # - # Return types for function pointers also look like casts if they - # don't have an extra space. - if (matched_new is None and # If new operator, then this isn't a cast - not (Match(r'^\s*MOCK_(CONST_)?METHOD\d+(_T)?\(', line) or - Search(r'\bMockCallback<.*>', line) or - Search(r'\bstd::function<.*>', line)) and - not (matched_funcptr and - Match(r'\((?:[^() ]+::\s*\*\s*)?[^() ]+\)\s*\(', - matched_funcptr))): - # Try a bit harder to catch gmock lines: the only place where - # something looks like an old-style cast is where we declare the - # return type of the mocked method, and the only time when we - # are missing context is if MOCK_METHOD was split across - # multiple lines. The missing MOCK_METHOD is usually one or two - # lines back, so scan back one or two lines. - # - # It's not possible for gmock macros to appear in the first 2 - # lines, since the class head + section name takes up 2 lines. - if (linenum < 2 or - not (Match(r'^\s*MOCK_(?:CONST_)?METHOD\d+(?:_T)?\((?:\S+,)?\s*$', - clean_lines.elided[linenum - 1]) or - Match(r'^\s*MOCK_(?:CONST_)?METHOD\d+(?:_T)?\(\s*$', - clean_lines.elided[linenum - 2]))): - error(filename, linenum, 'readability/casting', 4, - 'Using deprecated casting style. ' - 'Use static_cast<%s>(...) instead' % - matched_type) - - CheckCStyleCast(filename, linenum, line, clean_lines.raw_lines[linenum], - 'static_cast', - r'\((int|float|double|bool|char|u?int(16|32|64))\)', error) - - # This doesn't catch all cases. Consider (const char * const)"hello". - # - # (char *) "foo" should always be a const_cast (reinterpret_cast won't - # compile). - if CheckCStyleCast(filename, linenum, line, clean_lines.raw_lines[linenum], - 'const_cast', r'\((char\s?\*+\s?)\)\s*"', error): - pass - else: - # Check pointer casts for other than string constants - CheckCStyleCast(filename, linenum, line, clean_lines.raw_lines[linenum], - 'reinterpret_cast', r'\((\w+\s?\*+\s?)\)', error) - - # In addition, we look for people taking the address of a cast. This - # is dangerous -- casts can assign to temporaries, so the pointer doesn't - # point where you think. - match = Search( - r'(?:&\(([^)]+)\)[\w(])|' - r'(?:&(static|dynamic|down|reinterpret)_cast\b)', line) - if match and match.group(1) != '*': - error(filename, linenum, 'runtime/casting', 4, - ('Are you taking an address of a cast? ' - 'This is dangerous: could be a temp var. ' - 'Take the address before doing the cast, rather than after')) - - # Create an extended_line, which is the concatenation of the current and - # next lines, for more effective checking of code that may span more than one - # line. - if linenum + 1 < clean_lines.NumLines(): - extended_line = line + clean_lines.elided[linenum + 1] - else: - extended_line = line - - # Check for people declaring static/global STL strings at the top level. - # This is dangerous because the C++ language does not guarantee that - # globals with constructors are initialized before the first access. - match = Match( - r'((?:|static +)(?:|const +))string +([a-zA-Z0-9_:]+)\b(.*)', - line) - # Make sure it's not a function. - # Function template specialization looks like: "string foo(...". - # Class template definitions look like: "string Foo::Method(...". - # - # Also ignore things that look like operators. These are matched separately - # because operator names cross non-word boundaries. If we change the pattern - # above, we would decrease the accuracy of matching identifiers. - if (match and - not Search(r'\boperator\W', line) and - not Match(r'\s*(<.*>)?(::[a-zA-Z0-9_]+)?\s*\(([^"]|$)', match.group(3))): - error(filename, linenum, 'runtime/string', 4, - 'For a static/global string constant, use a C style string instead: ' - '"%schar %s[]".' % - (match.group(1), match.group(2))) - - if Search(r'\b([A-Za-z0-9_]*_)\(\1\)', line): - error(filename, linenum, 'runtime/init', 4, - 'You seem to be initializing a member variable with itself.') - - if file_extension == 'h': + if IsHeaderExtension(file_extension): # TODO(unknown): check that 1-arg constructors are explicit. # How to tell it's a constructor? # (handled in CheckForNonStandardConstructs for now) - # TODO(unknown): check that classes have DISALLOW_EVIL_CONSTRUCTORS + # TODO(unknown): check that classes declare or disable copy/assign # (level 1 error) pass @@ -3888,23 +4744,6 @@ def CheckLanguage(filename, clean_lines, linenum, file_extension, error(filename, linenum, 'runtime/int', 4, 'Use int16/int64/etc, rather than the C type %s' % match.group(1)) - # When snprintf is used, the second argument shouldn't be a literal. - match = Search(r'snprintf\s*\(([^,]*),\s*([0-9]*)\s*,', line) - if match and match.group(2) != '0': - # If 2nd arg is zero, snprintf is used to calculate size. - error(filename, linenum, 'runtime/printf', 3, - 'If you can, use sizeof(%s) instead of %s as the 2nd arg ' - 'to snprintf.' % (match.group(1), match.group(2))) - - # Check if some verboten C functions are being used. - if Search(r'\bsprintf\b', line): - error(filename, linenum, 'runtime/printf', 5, - 'Never use sprintf. Use snprintf instead.') - match = Search(r'\b(strcpy|strcat)\b', line) - if match: - error(filename, linenum, 'runtime/printf', 4, - 'Almost always, snprintf is better than %s' % match.group(1)) - # Check if some verboten operator overloading is going on # TODO(unknown): catch out-of-line unary operator&: # class X {}; @@ -3924,7 +4763,7 @@ def CheckLanguage(filename, clean_lines, linenum, file_extension, # Check for potential format string bugs like printf(foo). # We constrain the pattern not to pick things like DocidForPrintf(foo). # Not perfect but it can catch printf(foo.c_str()) and printf(foo->c_str()) - # TODO(sugawarayu): Catch the following case. Need to change the calling + # TODO(unknown): Catch the following case. Need to change the calling # convention of the whole function to process multiple line to handle it. # printf( # boy_this_is_a_really_long_variable_that_cannot_fit_on_the_prev_line); @@ -3989,37 +4828,188 @@ def CheckLanguage(filename, clean_lines, linenum, file_extension, 'Do not use variable-length arrays. Use an appropriately named ' "('k' followed by CamelCase) compile-time constant for the size.") - # If DISALLOW_EVIL_CONSTRUCTORS, DISALLOW_COPY_AND_ASSIGN, or - # DISALLOW_IMPLICIT_CONSTRUCTORS is present, then it should be the last thing - # in the class declaration. - match = Match( - (r'\s*' - r'(DISALLOW_(EVIL_CONSTRUCTORS|COPY_AND_ASSIGN|IMPLICIT_CONSTRUCTORS))' - r'\(.*\);$'), - line) - if match and linenum + 1 < clean_lines.NumLines(): - next_line = clean_lines.elided[linenum + 1] - # We allow some, but not all, declarations of variables to be present - # in the statement that defines the class. The [\w\*,\s]* fragment of - # the regular expression below allows users to declare instances of - # the class or pointers to instances, but not less common types such - # as function pointers or arrays. It's a tradeoff between allowing - # reasonable code and avoiding trying to parse more C++ using regexps. - if not Search(r'^\s*}[\w\*,\s]*;', next_line): - error(filename, linenum, 'readability/constructors', 3, - match.group(1) + ' should be the last thing in the class') - # Check for use of unnamed namespaces in header files. Registration # macros are typically OK, so we allow use of "namespace {" on lines # that end with backslashes. - if (file_extension == 'h' + if (IsHeaderExtension(file_extension) and Search(r'\bnamespace\s*{', line) and line[-1] != '\\'): error(filename, linenum, 'build/namespaces', 4, 'Do not use unnamed namespaces in header files. See ' - 'http://google-styleguide.googlecode.com/svn/trunk/cppguide.xml#Namespaces' + 'https://google-styleguide.googlecode.com/svn/trunk/cppguide.xml#Namespaces' ' for more information.') + +def CheckGlobalStatic(filename, clean_lines, linenum, error): + """Check for unsafe global or static objects. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + line = clean_lines.elided[linenum] + + # Match two lines at a time to support multiline declarations + if linenum + 1 < clean_lines.NumLines() and not Search(r'[;({]', line): + line += clean_lines.elided[linenum + 1].strip() + + # Check for people declaring static/global STL strings at the top level. + # This is dangerous because the C++ language does not guarantee that + # globals with constructors are initialized before the first access, and + # also because globals can be destroyed when some threads are still running. + # TODO(unknown): Generalize this to also find static unique_ptr instances. + # TODO(unknown): File bugs for clang-tidy to find these. + match = Match( + r'((?:|static +)(?:|const +))(?::*std::)?string( +const)? +' + r'([a-zA-Z0-9_:]+)\b(.*)', + line) + + # Remove false positives: + # - String pointers (as opposed to values). + # string *pointer + # const string *pointer + # string const *pointer + # string *const pointer + # + # - Functions and template specializations. + # string Function(... + # string Class::Method(... + # + # - Operators. These are matched separately because operator names + # cross non-word boundaries, and trying to match both operators + # and functions at the same time would decrease accuracy of + # matching identifiers. + # string Class::operator*() + if (match and + not Search(r'\bstring\b(\s+const)?\s*[\*\&]\s*(const\s+)?\w', line) and + not Search(r'\boperator\W', line) and + not Match(r'\s*(<.*>)?(::[a-zA-Z0-9_]+)*\s*\(([^"]|$)', match.group(4))): + if Search(r'\bconst\b', line): + error(filename, linenum, 'runtime/string', 4, + 'For a static/global string constant, use a C style string ' + 'instead: "%schar%s %s[]".' % + (match.group(1), match.group(2) or '', match.group(3))) + else: + error(filename, linenum, 'runtime/string', 4, + 'Static/global string variables are not permitted.') + + if (Search(r'\b([A-Za-z0-9_]*_)\(\1\)', line) or + Search(r'\b([A-Za-z0-9_]*_)\(CHECK_NOTNULL\(\1\)\)', line)): + error(filename, linenum, 'runtime/init', 4, + 'You seem to be initializing a member variable with itself.') + + +def CheckPrintf(filename, clean_lines, linenum, error): + """Check for printf related issues. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + line = clean_lines.elided[linenum] + + # When snprintf is used, the second argument shouldn't be a literal. + match = Search(r'snprintf\s*\(([^,]*),\s*([0-9]*)\s*,', line) + if match and match.group(2) != '0': + # If 2nd arg is zero, snprintf is used to calculate size. + error(filename, linenum, 'runtime/printf', 3, + 'If you can, use sizeof(%s) instead of %s as the 2nd arg ' + 'to snprintf.' % (match.group(1), match.group(2))) + + # Check if some verboten C functions are being used. + if Search(r'\bsprintf\s*\(', line): + error(filename, linenum, 'runtime/printf', 5, + 'Never use sprintf. Use snprintf instead.') + match = Search(r'\b(strcpy|strcat)\s*\(', line) + if match: + error(filename, linenum, 'runtime/printf', 4, + 'Almost always, snprintf is better than %s' % match.group(1)) + + +def IsDerivedFunction(clean_lines, linenum): + """Check if current line contains an inherited function. + + Args: + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + Returns: + True if current line contains a function with "override" + virt-specifier. + """ + # Scan back a few lines for start of current function + for i in xrange(linenum, max(-1, linenum - 10), -1): + match = Match(r'^([^()]*\w+)\(', clean_lines.elided[i]) + if match: + # Look for "override" after the matching closing parenthesis + line, _, closing_paren = CloseExpression( + clean_lines, i, len(match.group(1))) + return (closing_paren >= 0 and + Search(r'\boverride\b', line[closing_paren:])) + return False + + +def IsOutOfLineMethodDefinition(clean_lines, linenum): + """Check if current line contains an out-of-line method definition. + + Args: + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + Returns: + True if current line contains an out-of-line method definition. + """ + # Scan back a few lines for start of current function + for i in xrange(linenum, max(-1, linenum - 10), -1): + if Match(r'^([^()]*\w+)\(', clean_lines.elided[i]): + return Match(r'^[^()]*\w+::\w+\(', clean_lines.elided[i]) is not None + return False + + +def IsInitializerList(clean_lines, linenum): + """Check if current line is inside constructor initializer list. + + Args: + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + Returns: + True if current line appears to be inside constructor initializer + list, False otherwise. + """ + for i in xrange(linenum, 1, -1): + line = clean_lines.elided[i] + if i == linenum: + remove_function_body = Match(r'^(.*)\{\s*$', line) + if remove_function_body: + line = remove_function_body.group(1) + + if Search(r'\s:\s*\w+[({]', line): + # A lone colon tend to indicate the start of a constructor + # initializer list. It could also be a ternary operator, which + # also tend to appear in constructor initializer lists as + # opposed to parameter lists. + return True + if Search(r'\}\s*,\s*$', line): + # A closing brace followed by a comma is probably the end of a + # brace-initialized member in constructor initializer list. + return True + if Search(r'[{};]\s*$', line): + # Found one of the following: + # - A closing brace or semicolon, probably the end of the previous + # function. + # - An opening brace, probably the start of current class or namespace. + # + # Current line is probably not inside an initializer list since + # we saw one of those things without seeing the starting colon. + return False + + # Got to the beginning of the file without seeing the start of + # constructor initializer list. + return False + + def CheckForNonConstReference(filename, clean_lines, linenum, nesting_state, error): """Check for non-const references. @@ -4031,7 +5021,7 @@ def CheckForNonConstReference(filename, clean_lines, linenum, filename: The name of the current file. clean_lines: A CleansedLines instance containing the file. linenum: The number of the line to check. - nesting_state: A _NestingState instance which maintains information about + nesting_state: A NestingState instance which maintains information about the current stack of nested blocks being parsed. error: The function to call with any errors found. """ @@ -4040,6 +5030,17 @@ def CheckForNonConstReference(filename, clean_lines, linenum, if '&' not in line: return + # If a function is inherited, current function doesn't have much of + # a choice, so any non-const references should not be blamed on + # derived function. + if IsDerivedFunction(clean_lines, linenum): + return + + # Don't warn on out-of-line method definitions, as we would warn on the + # in-line declaration, if it isn't marked with 'override'. + if IsOutOfLineMethodDefinition(clean_lines, linenum): + return + # Long type names may be broken across multiple lines, usually in one # of these forms: # LongType @@ -4088,60 +5089,192 @@ def CheckForNonConstReference(filename, clean_lines, linenum, # inside declarators: reference parameter # We will exclude the first two cases by checking that we are not inside a # function body, including one that was just introduced by a trailing '{'. - # TODO(unknwon): Doesn't account for preprocessor directives. # TODO(unknown): Doesn't account for 'catch(Exception& e)' [rare]. - check_params = False - if not nesting_state.stack: - check_params = True # top level - elif (isinstance(nesting_state.stack[-1], _ClassInfo) or - isinstance(nesting_state.stack[-1], _NamespaceInfo)): - check_params = True # within class or namespace - elif Match(r'.*{\s*$', line): - if (len(nesting_state.stack) == 1 or - isinstance(nesting_state.stack[-2], _ClassInfo) or - isinstance(nesting_state.stack[-2], _NamespaceInfo)): - check_params = True # just opened global/class/namespace block + if (nesting_state.previous_stack_top and + not (isinstance(nesting_state.previous_stack_top, _ClassInfo) or + isinstance(nesting_state.previous_stack_top, _NamespaceInfo))): + # Not at toplevel, not within a class, and not within a namespace + return + + # Avoid initializer lists. We only need to scan back from the + # current line for something that starts with ':'. + # + # We don't need to check the current line, since the '&' would + # appear inside the second set of parentheses on the current line as + # opposed to the first set. + if linenum > 0: + for i in xrange(linenum - 1, max(0, linenum - 10), -1): + previous_line = clean_lines.elided[i] + if not Search(r'[),]\s*$', previous_line): + break + if Match(r'^\s*:\s+\S', previous_line): + return + + # Avoid preprocessors + if Search(r'\\\s*$', line): + return + + # Avoid constructor initializer lists + if IsInitializerList(clean_lines, linenum): + return + # We allow non-const references in a few standard places, like functions # called "swap()" or iostream operators like "<<" or ">>". Do not check # those function parameters. # # We also accept & in static_assert, which looks like a function but # it's actually a declaration expression. - whitelisted_functions = (r'(?:[sS]wap(?:<\w:+>)?|' + allowed_functions = (r'(?:[sS]wap(?:<\w:+>)?|' r'operator\s*[<>][<>]|' r'static_assert|COMPILE_ASSERT' r')\s*\(') - if Search(whitelisted_functions, line): - check_params = False + if Search(allowed_functions, line): + return elif not Search(r'\S+\([^)]*$', line): - # Don't see a whitelisted function on this line. Actually we + # Don't see an allowed function on this line. Actually we # didn't see any function name on this line, so this is likely a # multi-line parameter list. Try a bit harder to catch this case. for i in xrange(2): if (linenum > i and - Search(whitelisted_functions, clean_lines.elided[linenum - i - 1])): - check_params = False - break + Search(allowed_functions, clean_lines.elided[linenum - i - 1])): + return - if check_params: - decls = ReplaceAll(r'{[^}]*}', ' ', line) # exclude function body - for parameter in re.findall(_RE_PATTERN_REF_PARAM, decls): - if not Match(_RE_PATTERN_CONST_REF_PARAM, parameter): - error(filename, linenum, 'runtime/references', 2, - 'Is this a non-const reference? ' - 'If so, make const or use a pointer: ' + - ReplaceAll(' *<', '<', parameter)) + decls = ReplaceAll(r'{[^}]*}', ' ', line) # exclude function body + for parameter in re.findall(_RE_PATTERN_REF_PARAM, decls): + if (not Match(_RE_PATTERN_CONST_REF_PARAM, parameter) and + not Match(_RE_PATTERN_REF_STREAM_PARAM, parameter)): + error(filename, linenum, 'runtime/references', 2, + 'Is this a non-const reference? ' + 'If so, make const or use a pointer: ' + + ReplaceAll(' *<', '<', parameter)) -def CheckCStyleCast(filename, linenum, line, raw_line, cast_type, pattern, - error): +def CheckCasts(filename, clean_lines, linenum, error): + """Various cast related checks. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + line = clean_lines.elided[linenum] + + # Check to see if they're using an conversion function cast. + # I just try to capture the most common basic types, though there are more. + # Parameterless conversion functions, such as bool(), are allowed as they are + # probably a member operator declaration or default constructor. + match = Search( + r'(\bnew\s+(?:const\s+)?|\S<\s*(?:const\s+)?)?\b' + r'(int|float|double|bool|char|int32|uint32|int64|uint64)' + r'(\([^)].*)', line) + expecting_function = ExpectingFunctionArgs(clean_lines, linenum) + if match and not expecting_function: + matched_type = match.group(2) + + # matched_new_or_template is used to silence two false positives: + # - New operators + # - Template arguments with function types + # + # For template arguments, we match on types immediately following + # an opening bracket without any spaces. This is a fast way to + # silence the common case where the function type is the first + # template argument. False negative with less-than comparison is + # avoided because those operators are usually followed by a space. + # + # function // bracket + no space = false positive + # value < double(42) // bracket + space = true positive + matched_new_or_template = match.group(1) + + # Avoid arrays by looking for brackets that come after the closing + # parenthesis. + if Match(r'\([^()]+\)\s*\[', match.group(3)): + return + + # Other things to ignore: + # - Function pointers + # - Casts to pointer types + # - Placement new + # - Alias declarations + matched_funcptr = match.group(3) + if (matched_new_or_template is None and + not (matched_funcptr and + (Match(r'\((?:[^() ]+::\s*\*\s*)?[^() ]+\)\s*\(', + matched_funcptr) or + matched_funcptr.startswith('(*)'))) and + not Match(r'\s*using\s+\S+\s*=\s*' + matched_type, line) and + not Search(r'new\(\S+\)\s*' + matched_type, line)): + error(filename, linenum, 'readability/casting', 4, + 'Using deprecated casting style. ' + 'Use static_cast<%s>(...) instead' % + matched_type) + + if not expecting_function: + CheckCStyleCast(filename, clean_lines, linenum, 'static_cast', + r'\((int|float|double|bool|char|u?int(16|32|64))\)', error) + + # This doesn't catch all cases. Consider (const char * const)"hello". + # + # (char *) "foo" should always be a const_cast (reinterpret_cast won't + # compile). + if CheckCStyleCast(filename, clean_lines, linenum, 'const_cast', + r'\((char\s?\*+\s?)\)\s*"', error): + pass + else: + # Check pointer casts for other than string constants + CheckCStyleCast(filename, clean_lines, linenum, 'reinterpret_cast', + r'\((\w+\s?\*+\s?)\)', error) + + # In addition, we look for people taking the address of a cast. This + # is dangerous -- casts can assign to temporaries, so the pointer doesn't + # point where you think. + # + # Some non-identifier character is required before the '&' for the + # expression to be recognized as a cast. These are casts: + # expression = &static_cast(temporary()); + # function(&(int*)(temporary())); + # + # This is not a cast: + # reference_type&(int* function_param); + match = Search( + r'(?:[^\w]&\(([^)*][^)]*)\)[\w(])|' + r'(?:[^\w]&(static|dynamic|down|reinterpret)_cast\b)', line) + if match: + # Try a better error message when the & is bound to something + # dereferenced by the casted pointer, as opposed to the casted + # pointer itself. + parenthesis_error = False + match = Match(r'^(.*&(?:static|dynamic|down|reinterpret)_cast\b)<', line) + if match: + _, y1, x1 = CloseExpression(clean_lines, linenum, len(match.group(1))) + if x1 >= 0 and clean_lines.elided[y1][x1] == '(': + _, y2, x2 = CloseExpression(clean_lines, y1, x1) + if x2 >= 0: + extended_line = clean_lines.elided[y2][x2:] + if y2 < clean_lines.NumLines() - 1: + extended_line += clean_lines.elided[y2 + 1] + if Match(r'\s*(?:->|\[)', extended_line): + parenthesis_error = True + + if parenthesis_error: + error(filename, linenum, 'readability/casting', 4, + ('Are you taking an address of something dereferenced ' + 'from a cast? Wrapping the dereferenced expression in ' + 'parentheses will make the binding more obvious')) + else: + error(filename, linenum, 'runtime/casting', 4, + ('Are you taking an address of a cast? ' + 'This is dangerous: could be a temp var. ' + 'Take the address before doing the cast, rather than after')) + + +def CheckCStyleCast(filename, clean_lines, linenum, cast_type, pattern, error): """Checks for a C-style cast by looking for the pattern. Args: filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. linenum: The number of the line to check. - line: The line of code to check. - raw_line: The raw line of code to check, with comments. cast_type: The string for the C++ cast to recommend. This is either reinterpret_cast, static_cast, or const_cast, depending. pattern: The regular expression used to find C-style casts. @@ -4151,75 +5284,34 @@ def CheckCStyleCast(filename, linenum, line, raw_line, cast_type, pattern, True if an error was emitted. False otherwise. """ + line = clean_lines.elided[linenum] match = Search(pattern, line) if not match: return False - # e.g., sizeof(int) - sizeof_match = Match(r'.*sizeof\s*$', line[0:match.start(1) - 1]) - if sizeof_match: - error(filename, linenum, 'runtime/sizeof', 1, - 'Using sizeof(type). Use sizeof(varname) instead if possible') - return True + # Exclude lines with keywords that tend to look like casts + context = line[0:match.start(1) - 1] + if Match(r'.*\b(?:sizeof|alignof|alignas|[_A-Z][_A-Z0-9]*)\s*$', context): + return False + + # Try expanding current context to see if we one level of + # parentheses inside a macro. + if linenum > 0: + for i in xrange(linenum - 1, max(0, linenum - 5), -1): + context = clean_lines.elided[i] + context + if Match(r'.*\b[_A-Z][_A-Z0-9]*\s*\((?:\([^()]*\)|[^()])*$', context): + return False # operator++(int) and operator--(int) - if (line[0:match.start(1) - 1].endswith(' operator++') or - line[0:match.start(1) - 1].endswith(' operator--')): + if context.endswith(' operator++') or context.endswith(' operator--'): return False - # A single unnamed argument for a function tends to look like old - # style cast. If we see those, don't issue warnings for deprecated - # casts, instead issue warnings for unnamed arguments where - # appropriate. - # - # These are things that we want warnings for, since the style guide - # explicitly require all parameters to be named: - # Function(int); - # Function(int) { - # ConstMember(int) const; - # ConstMember(int) const { - # ExceptionMember(int) throw (...); - # ExceptionMember(int) throw (...) { - # PureVirtual(int) = 0; - # - # These are functions of some sort, where the compiler would be fine - # if they had named parameters, but people often omit those - # identifiers to reduce clutter: - # (FunctionPointer)(int); - # (FunctionPointer)(int) = value; - # Function((function_pointer_arg)(int)) - # ; - # <(FunctionPointerTemplateArgument)(int)>; + # A single unnamed argument for a function tends to look like old style cast. + # If we see those, don't issue warnings for deprecated casts. remainder = line[match.end(0):] - if Match(r'^\s*(?:;|const\b|throw\b|=|>|\{|\))', remainder): - # Looks like an unnamed parameter. - - # Don't warn on any kind of template arguments. - if Match(r'^\s*>', remainder): - return False - - # Don't warn on assignments to function pointers, but keep warnings for - # unnamed parameters to pure virtual functions. Note that this pattern - # will also pass on assignments of "0" to function pointers, but the - # preferred values for those would be "nullptr" or "NULL". - matched_zero = Match(r'^\s=\s*(\S+)\s*;', remainder) - if matched_zero and matched_zero.group(1) != '0': - return False - - # Don't warn on function pointer declarations. For this we need - # to check what came before the "(type)" string. - if Match(r'.*\)\s*$', line[0:match.start(0)]): - return False - - # Don't warn if the parameter is named with block comments, e.g.: - # Function(int /*unused_param*/); - if '/*' in raw_line: - return False - - # Passed all filters, issue warning here. - error(filename, linenum, 'readability/function', 3, - 'All parameters should be named in a function') - return True + if Match(r'^\s*(?:;|const\b|throw\b|final\b|override\b|[=>{),]|->)', + remainder): + return False # At this point, all that should be left is actual casts. error(filename, linenum, 'readability/casting', 4, @@ -4229,6 +5321,28 @@ def CheckCStyleCast(filename, linenum, line, raw_line, cast_type, pattern, return True +def ExpectingFunctionArgs(clean_lines, linenum): + """Checks whether where function type arguments are expected. + + Args: + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + + Returns: + True if the line at 'linenum' is inside something that expects arguments + of function types. + """ + line = clean_lines.elided[linenum] + return (Match(r'^\s*MOCK_(CONST_)?METHOD\d+(_T)?\(', line) or + (linenum >= 2 and + (Match(r'^\s*MOCK_(?:CONST_)?METHOD\d+(?:_T)?\((?:\S+,)?\s*$', + clean_lines.elided[linenum - 1]) or + Match(r'^\s*MOCK_(?:CONST_)?METHOD\d+(?:_T)?\(\s*$', + clean_lines.elided[linenum - 2]) or + Search(r'\bstd::m?function\s*\<\s*$', + clean_lines.elided[linenum - 1])))) + + _HEADERS_CONTAINING_TEMPLATES = ( ('', ('deque',)), ('', ('unary_function', 'binary_function', @@ -4251,11 +5365,15 @@ def CheckCStyleCast(filename, linenum, line, raw_line, cast_type, pattern, ('', ('numeric_limits',)), ('', ('list',)), ('', ('map', 'multimap',)), - ('', ('allocator',)), + ('', ('allocator', 'make_shared', 'make_unique', 'shared_ptr', + 'unique_ptr', 'weak_ptr')), ('', ('queue', 'priority_queue',)), ('', ('set', 'multiset',)), ('', ('stack',)), ('', ('char_traits', 'basic_string',)), + ('', ('tuple',)), + ('', ('unordered_map', 'unordered_multimap')), + ('', ('unordered_set', 'unordered_multiset')), ('', ('pair',)), ('', ('vector',)), @@ -4266,18 +5384,26 @@ def CheckCStyleCast(filename, linenum, line, raw_line, cast_type, pattern, ('', ('slist',)), ) -_RE_PATTERN_STRING = re.compile(r'\bstring\b') +_HEADERS_MAYBE_TEMPLATES = ( + ('', ('copy', 'max', 'min', 'min_element', 'sort', + 'transform', + )), + ('', ('forward', 'make_pair', 'move', 'swap')), + ) -_re_pattern_algorithm_header = [] -for _template in ('copy', 'max', 'min', 'min_element', 'sort', 'swap', - 'transform'): - # Match max(..., ...), max(..., ...), but not foo->max, foo.max or - # type::max(). - _re_pattern_algorithm_header.append( - (re.compile(r'[^>.]\b' + _template + r'(<.*?>)?\([^\)]'), - _template, - '')) +_RE_PATTERN_STRING = re.compile(r'\bstring\b') +_re_pattern_headers_maybe_templates = [] +for _header, _templates in _HEADERS_MAYBE_TEMPLATES: + for _template in _templates: + # Match max(..., ...), max(..., ...), but not foo->max, foo.max or + # type::max(). + _re_pattern_headers_maybe_templates.append( + (re.compile(r'[^>.]\b' + _template + r'(<.*?>)?\([^\)]'), + _template, + _header)) + +# Other scripts may reach in and modify this pattern. _re_pattern_templates = [] for _header, _templates in _HEADERS_CONTAINING_TEMPLATES: for _template in _templates: @@ -4317,13 +5443,13 @@ def FilesBelongToSameModule(filename_cc, filename_h): string: the additional prefix needed to open the header file. """ - if not filename_cc.endswith('.cc'): + fileinfo = FileInfo(filename_cc) + if not fileinfo.IsSource(): return (False, '') - filename_cc = filename_cc[:-len('.cc')] - if filename_cc.endswith('_unittest'): - filename_cc = filename_cc[:-len('_unittest')] - elif filename_cc.endswith('_test'): - filename_cc = filename_cc[:-len('_test')] + filename_cc = filename_cc[:-len(fileinfo.Extension())] + matched_test_suffix = Search(_TEST_FILE_SUFFIX, fileinfo.BaseName()) + if matched_test_suffix: + filename_cc = filename_cc[:-len(matched_test_suffix.group(1))] filename_cc = filename_cc.replace('/public/', '/') filename_cc = filename_cc.replace('/internal/', '/') @@ -4342,16 +5468,16 @@ def FilesBelongToSameModule(filename_cc, filename_h): return files_belong_to_same_module, common_path -def UpdateIncludeState(filename, include_state, io=codecs): - """Fill up the include_state with new includes found from the file. +def UpdateIncludeState(filename, include_dict, io=codecs): + """Fill up the include_dict with new includes found from the file. Args: filename: the name of the header to read. - include_state: an _IncludeState instance in which the headers are inserted. + include_dict: a dictionary in which the headers are inserted. io: The io factory to use to read the file. Provided for testability. Returns: - True if a header was succesfully added. False otherwise. + True if a header was successfully added. False otherwise. """ headerfile = None try: @@ -4365,9 +5491,7 @@ def UpdateIncludeState(filename, include_state, io=codecs): match = _RE_PATTERN_INCLUDE.search(clean_line) if match: include = match.group(2) - # The value formatting is cute, but not really used right now. - # What matters here is that the key is in include_state. - include_state.setdefault(include, '%s:%d' % (filename, linenum)) + include_dict.setdefault(include, linenum) return True @@ -4406,7 +5530,7 @@ def CheckForIncludeWhatYouUse(filename, clean_lines, include_state, error, if prefix.endswith('std::') or not prefix.endswith('::'): required[''] = (linenum, 'string') - for pattern, template, header in _re_pattern_algorithm_header: + for pattern, template, header in _re_pattern_headers_maybe_templates: if pattern.search(line): required[header] = (linenum, template) @@ -4415,15 +5539,21 @@ def CheckForIncludeWhatYouUse(filename, clean_lines, include_state, error, continue for pattern, template, header in _re_pattern_templates: - if pattern.search(line): - required[header] = (linenum, template) + matched = pattern.search(line) + if matched: + # Don't warn about IWYU in non-STL namespaces: + # (We check only the first match per line; good enough.) + prefix = line[:matched.start()] + if prefix.endswith('std::') or not prefix.endswith('::'): + required[header] = (linenum, template) # The policy is that if you #include something in foo.h you don't need to # include it again in foo.cc. Here, we will look at possible includes. - # Let's copy the include_state so it is only messed up within this function. - include_state = include_state.copy() + # Let's flatten the include_state include_list and copy it into a dictionary. + include_dict = dict([item for sublist in include_state.include_list + for item in sublist]) - # Did we find the header for this file (if any) and succesfully load it? + # Did we find the header for this file (if any) and successfully load it? header_found = False # Use the absolute path so that matching works properly. @@ -4438,13 +5568,13 @@ def CheckForIncludeWhatYouUse(filename, clean_lines, include_state, error, # instead of 'foo_flymake.h' abs_filename = re.sub(r'_flymake\.cc$', '.cc', abs_filename) - # include_state is modified during iteration, so we iterate over a copy of + # include_dict is modified during iteration, so we iterate over a copy of # the keys. - header_keys = include_state.keys() + header_keys = include_dict.keys() for header in header_keys: (same_module, common_path) = FilesBelongToSameModule(abs_filename, header) fullpath = common_path + header - if same_module and UpdateIncludeState(fullpath, include_state, io): + if same_module and UpdateIncludeState(fullpath, include_dict, io): header_found = True # If we can't find the header file for a .cc, assume it's because we don't @@ -4458,7 +5588,7 @@ def CheckForIncludeWhatYouUse(filename, clean_lines, include_state, error, # All the lines have been processed, report the errors found. for required_header_unstripped in required: template = required[required_header_unstripped][1] - if required_header_unstripped.strip('<>"') not in include_state: + if required_header_unstripped.strip('<>"') not in include_dict: error(filename, required[required_header_unstripped][0], 'build/include_what_you_use', 4, 'Add #include ' + required_header_unstripped + ' for ' + template) @@ -4470,7 +5600,7 @@ def CheckForIncludeWhatYouUse(filename, clean_lines, include_state, error, def CheckMakePairUsesDeduction(filename, clean_lines, linenum, error): """Check that make_pair's template arguments are deduced. - G++ 4.6 in C++0x mode fails badly if make_pair's template arguments are + G++ 4.6 in C++11 mode fails badly if make_pair's template arguments are specified explicitly, and such use isn't intended in any case. Args: @@ -4488,6 +5618,165 @@ def CheckMakePairUsesDeduction(filename, clean_lines, linenum, error): ' OR use pair directly OR if appropriate, construct a pair directly') +def CheckRedundantVirtual(filename, clean_lines, linenum, error): + """Check if line contains a redundant "virtual" function-specifier. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + # Look for "virtual" on current line. + line = clean_lines.elided[linenum] + virtual = Match(r'^(.*)(\bvirtual\b)(.*)$', line) + if not virtual: return + + # Ignore "virtual" keywords that are near access-specifiers. These + # are only used in class base-specifier and do not apply to member + # functions. + if (Search(r'\b(public|protected|private)\s+$', virtual.group(1)) or + Match(r'^\s+(public|protected|private)\b', virtual.group(3))): + return + + # Ignore the "virtual" keyword from virtual base classes. Usually + # there is a column on the same line in these cases (virtual base + # classes are rare in google3 because multiple inheritance is rare). + if Match(r'^.*[^:]:[^:].*$', line): return + + # Look for the next opening parenthesis. This is the start of the + # parameter list (possibly on the next line shortly after virtual). + # TODO(unknown): doesn't work if there are virtual functions with + # decltype() or other things that use parentheses, but csearch suggests + # that this is rare. + end_col = -1 + end_line = -1 + start_col = len(virtual.group(2)) + for start_line in xrange(linenum, min(linenum + 3, clean_lines.NumLines())): + line = clean_lines.elided[start_line][start_col:] + parameter_list = Match(r'^([^(]*)\(', line) + if parameter_list: + # Match parentheses to find the end of the parameter list + (_, end_line, end_col) = CloseExpression( + clean_lines, start_line, start_col + len(parameter_list.group(1))) + break + start_col = 0 + + if end_col < 0: + return # Couldn't find end of parameter list, give up + + # Look for "override" or "final" after the parameter list + # (possibly on the next few lines). + for i in xrange(end_line, min(end_line + 3, clean_lines.NumLines())): + line = clean_lines.elided[i][end_col:] + match = Search(r'\b(override|final)\b', line) + if match: + error(filename, linenum, 'readability/inheritance', 4, + ('"virtual" is redundant since function is ' + 'already declared as "%s"' % match.group(1))) + + # Set end_col to check whole lines after we are done with the + # first line. + end_col = 0 + if Search(r'[^\w]\s*$', line): + break + + +def CheckRedundantOverrideOrFinal(filename, clean_lines, linenum, error): + """Check if line contains a redundant "override" or "final" virt-specifier. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + # Look for closing parenthesis nearby. We need one to confirm where + # the declarator ends and where the virt-specifier starts to avoid + # false positives. + line = clean_lines.elided[linenum] + declarator_end = line.rfind(')') + if declarator_end >= 0: + fragment = line[declarator_end:] + else: + if linenum > 1 and clean_lines.elided[linenum - 1].rfind(')') >= 0: + fragment = line + else: + return + + # Check that at most one of "override" or "final" is present, not both + if Search(r'\boverride\b', fragment) and Search(r'\bfinal\b', fragment): + error(filename, linenum, 'readability/inheritance', 4, + ('"override" is redundant since function is ' + 'already declared as "final"')) + + + + +# Returns true if we are at a new block, and it is directly +# inside of a namespace. +def IsBlockInNameSpace(nesting_state, is_forward_declaration): + """Checks that the new block is directly in a namespace. + + Args: + nesting_state: The _NestingState object that contains info about our state. + is_forward_declaration: If the class is a forward declared class. + Returns: + Whether or not the new block is directly in a namespace. + """ + if is_forward_declaration: + if len(nesting_state.stack) >= 1 and ( + isinstance(nesting_state.stack[-1], _NamespaceInfo)): + return True + else: + return False + + return (len(nesting_state.stack) > 1 and + nesting_state.stack[-1].check_namespace_indentation and + isinstance(nesting_state.stack[-2], _NamespaceInfo)) + + +def ShouldCheckNamespaceIndentation(nesting_state, is_namespace_indent_item, + raw_lines_no_comments, linenum): + """This method determines if we should apply our namespace indentation check. + + Args: + nesting_state: The current nesting state. + is_namespace_indent_item: If we just put a new class on the stack, True. + If the top of the stack is not a class, or we did not recently + add the class, False. + raw_lines_no_comments: The lines without the comments. + linenum: The current line number we are processing. + + Returns: + True if we should apply our namespace indentation check. Currently, it + only works for classes and namespaces inside of a namespace. + """ + + is_forward_declaration = IsForwardClassDeclaration(raw_lines_no_comments, + linenum) + + if not (is_namespace_indent_item or is_forward_declaration): + return False + + # If we are in a macro, we do not want to check the namespace indentation. + if IsMacroDefinition(raw_lines_no_comments, linenum): + return False + + return IsBlockInNameSpace(nesting_state, is_forward_declaration) + + +# Call this method if the line is directly inside of a namespace. +# If the line above is blank (excluding comments) or the start of +# an inner namespace, it cannot be indented. +def CheckItemIndentationInNamespace(filename, raw_lines_no_comments, linenum, + error): + line = raw_lines_no_comments[linenum] + if Match(r'^\s+', line): + error(filename, linenum, 'runtime/indentation_namespace', 4, + 'Do not indent within a namespace') + + def ProcessLine(filename, file_extension, clean_lines, line, include_state, function_state, nesting_state, error, extra_check_functions=[]): @@ -4501,7 +5790,7 @@ def ProcessLine(filename, file_extension, clean_lines, line, line: Number of line being processed. include_state: An _IncludeState instance in which the headers are inserted. function_state: A _FunctionState instance which counts function lines, etc. - nesting_state: A _NestingState instance which maintains information about + nesting_state: A NestingState instance which maintains information about the current stack of nested blocks being parsed. error: A callable to which errors are reported, which takes 4 arguments: filename, line number, error level, and message @@ -4512,8 +5801,9 @@ def ProcessLine(filename, file_extension, clean_lines, line, raw_lines = clean_lines.raw_lines ParseNolintSuppressions(filename, raw_lines[line], line, error) nesting_state.Update(filename, clean_lines, line, error) - if nesting_state.stack and nesting_state.stack[-1].inline_asm != _NO_ASM: - return + CheckForNamespaceIndentation(filename, nesting_state, clean_lines, line, + error) + if nesting_state.InAsmBlock(): return CheckForFunctionLengths(filename, clean_lines, line, function_state, error) CheckForMultilineCommentsAndStrings(filename, clean_lines, line, error) CheckStyle(filename, clean_lines, line, file_extension, nesting_state, error) @@ -4526,9 +5816,82 @@ def ProcessLine(filename, file_extension, clean_lines, line, CheckPosixThreading(filename, clean_lines, line, error) CheckInvalidIncrement(filename, clean_lines, line, error) CheckMakePairUsesDeduction(filename, clean_lines, line, error) + CheckRedundantVirtual(filename, clean_lines, line, error) + CheckRedundantOverrideOrFinal(filename, clean_lines, line, error) for check_fn in extra_check_functions: check_fn(filename, clean_lines, line, error) +def FlagCxx11Features(filename, clean_lines, linenum, error): + """Flag those c++11 features that we only allow in certain places. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + line = clean_lines.elided[linenum] + + include = Match(r'\s*#\s*include\s+[<"]([^<"]+)[">]', line) + + # Flag unapproved C++ TR1 headers. + if include and include.group(1).startswith('tr1/'): + error(filename, linenum, 'build/c++tr1', 5, + ('C++ TR1 headers such as <%s> are unapproved.') % include.group(1)) + + # Flag unapproved C++11 headers. + if include and include.group(1) in ('cfenv', + 'condition_variable', + 'fenv.h', + 'future', + 'mutex', + 'thread', + 'chrono', + 'ratio', + 'regex', + 'system_error', + ): + error(filename, linenum, 'build/c++11', 5, + ('<%s> is an unapproved C++11 header.') % include.group(1)) + + # The only place where we need to worry about C++11 keywords and library + # features in preprocessor directives is in macro definitions. + if Match(r'\s*#', line) and not Match(r'\s*#\s*define\b', line): return + + # These are classes and free functions. The classes are always + # mentioned as std::*, but we only catch the free functions if + # they're not found by ADL. They're alphabetical by header. + for top_name in ( + # type_traits + 'alignment_of', + 'aligned_union', + ): + if Search(r'\bstd::%s\b' % top_name, line): + error(filename, linenum, 'build/c++11', 5, + ('std::%s is an unapproved C++11 class or function. Send c-style ' + 'an example of where it would make your code more readable, and ' + 'they may let you use it.') % top_name) + + +def FlagCxx14Features(filename, clean_lines, linenum, error): + """Flag those C++14 features that we restrict. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + line = clean_lines.elided[linenum] + + include = Match(r'\s*#\s*include\s+[<"]([^<"]+)[">]', line) + + # Flag unapproved C++14 headers. + if include and include.group(1) in ('scoped_allocator', 'shared_mutex'): + error(filename, linenum, 'build/c++14', 5, + ('<%s> is an unapproved C++14 header.') % include.group(1)) + + def ProcessFileData(filename, file_extension, lines, error, extra_check_functions=[]): """Performs lint checks and reports any errors to the given error function. @@ -4549,31 +5912,122 @@ def ProcessFileData(filename, file_extension, lines, error, include_state = _IncludeState() function_state = _FunctionState() - nesting_state = _NestingState() + nesting_state = NestingState() ResetNolintSuppressions() CheckForCopyright(filename, lines, error) - - if file_extension == 'h': - CheckForHeaderGuard(filename, lines, error) - + ProcessGlobalSuppresions(lines) RemoveMultiLineComments(filename, lines, error) clean_lines = CleansedLines(lines) + + if IsHeaderExtension(file_extension): + CheckForHeaderGuard(filename, clean_lines, error) + for line in xrange(clean_lines.NumLines()): ProcessLine(filename, file_extension, clean_lines, line, include_state, function_state, nesting_state, error, extra_check_functions) + FlagCxx11Features(filename, clean_lines, line, error) nesting_state.CheckCompletedBlocks(filename, error) CheckForIncludeWhatYouUse(filename, clean_lines, include_state, error) + # Check that the .cc file has included its header if it exists. + if _IsSourceExtension(file_extension): + CheckHeaderFileIncluded(filename, include_state, error) + # We check here rather than inside ProcessLine so that we see raw # lines rather than "cleaned" lines. CheckForBadCharacters(filename, lines, error) CheckForNewlineAtEOF(filename, lines, error) +def ProcessConfigOverrides(filename): + """ Loads the configuration files and processes the config overrides. + + Args: + filename: The name of the file being processed by the linter. + + Returns: + False if the current |filename| should not be processed further. + """ + + abs_filename = os.path.abspath(filename) + cfg_filters = [] + keep_looking = True + while keep_looking: + abs_path, base_name = os.path.split(abs_filename) + if not base_name: + break # Reached the root directory. + + cfg_file = os.path.join(abs_path, "CPPLINT.cfg") + abs_filename = abs_path + if not os.path.isfile(cfg_file): + continue + + try: + with open(cfg_file) as file_handle: + for line in file_handle: + line, _, _ = line.partition('#') # Remove comments. + if not line.strip(): + continue + + name, _, val = line.partition('=') + name = name.strip() + val = val.strip() + if name == 'set noparent': + keep_looking = False + elif name == 'filter': + cfg_filters.append(val) + elif name == 'exclude_files': + # When matching exclude_files pattern, use the base_name of + # the current file name or the directory name we are processing. + # For example, if we are checking for lint errors in /foo/bar/baz.cc + # and we found the .cfg file at /foo/CPPLINT.cfg, then the config + # file's "exclude_files" filter is meant to be checked against "bar" + # and not "baz" nor "bar/baz.cc". + if base_name: + pattern = re.compile(val) + if pattern.match(base_name): + if _cpplint_state.quiet: + # Suppress "Ignoring file" warning when using --quiet. + return False + sys.stderr.write('Ignoring "%s": file excluded by "%s". ' + 'File path component "%s" matches ' + 'pattern "%s"\n' % + (filename, cfg_file, base_name, val)) + return False + elif name == 'linelength': + global _line_length + try: + _line_length = int(val) + except ValueError: + sys.stderr.write('Line length must be numeric.') + elif name == 'root': + global _root + # root directories are specified relative to CPPLINT.cfg dir. + _root = os.path.join(os.path.dirname(cfg_file), val) + elif name == 'headers': + ProcessHppHeadersOption(val) + else: + sys.stderr.write( + 'Invalid configuration option (%s) in file %s\n' % + (name, cfg_file)) + + except IOError: + sys.stderr.write( + "Skipping config file '%s': Can't open for reading\n" % cfg_file) + keep_looking = False + + # Apply all the accumulated filters in reverse order (top-level directory + # config options having the least priority). + for filter in reversed(cfg_filters): + _AddFilters(filter) + + return True + + def ProcessFile(filename, vlevel, extra_check_functions=[]): """Does google-lint on a single file. @@ -4589,7 +6043,15 @@ def ProcessFile(filename, vlevel, extra_check_functions=[]): """ _SetVerboseLevel(vlevel) + _BackupFilters() + old_errors = _cpplint_state.error_count + + if not ProcessConfigOverrides(filename): + _RestoreFilters() + return + lf_lines = [] + crlf_lines = [] try: # Support the UNIX convention of using "-" for stdin. Note that # we are not opening the file with universal newline support @@ -4597,10 +6059,7 @@ def ProcessFile(filename, vlevel, extra_check_functions=[]): # contain trailing '\r' characters if we are reading a file that # has CRLF endings. # If after the split a trailing '\r' is present, it is removed - # below. If it is not expected to be present (i.e. os.linesep != - # '\r\n' as in Windows), a warning is issued below if this file - # is processed. - + # below. if filename == '-': lines = codecs.StreamReaderWriter(sys.stdin, codecs.getreader('utf8'), @@ -4609,16 +6068,19 @@ def ProcessFile(filename, vlevel, extra_check_functions=[]): else: lines = codecs.open(filename, 'r', 'utf8', 'replace').read().split('\n') - carriage_return_found = False # Remove trailing '\r'. - for linenum in range(len(lines)): + # The -1 accounts for the extra trailing blank line we get from split() + for linenum in range(len(lines) - 1): if lines[linenum].endswith('\r'): lines[linenum] = lines[linenum].rstrip('\r') - carriage_return_found = True + crlf_lines.append(linenum + 1) + else: + lf_lines.append(linenum + 1) except IOError: sys.stderr.write( "Skipping input '%s': Can't open for reading\n" % filename) + _RestoreFilters() return # Note, if no dot is found, this will give the entire filename as the ext. @@ -4632,14 +6094,30 @@ def ProcessFile(filename, vlevel, extra_check_functions=[]): else: ProcessFileData(filename, file_extension, lines, Error, extra_check_functions) - if carriage_return_found and os.linesep != '\r\n': - # Use 0 for linenum since outputting only one error for potentially - # several lines. - Error(filename, 0, 'whitespace/newline', 1, - 'One or more unexpected \\r (^M) found;' - 'better to use only a \\n') - sys.stderr.write('Done processing %s\n' % filename) + # If end-of-line sequences are a mix of LF and CR-LF, issue + # warnings on the lines with CR. + # + # Don't issue any warnings if all lines are uniformly LF or CR-LF, + # since critique can handle these just fine, and the style guide + # doesn't dictate a particular end of line sequence. + # + # We can't depend on os.linesep to determine what the desired + # end-of-line sequence should be, since that will return the + # server-side end-of-line sequence. + if lf_lines and crlf_lines: + # Warn on every line with CR. An alternative approach might be to + # check whether the file is mostly CRLF or just LF, and warn on the + # minority, we bias toward LF here since most tools prefer LF. + for linenum in crlf_lines: + Error(filename, linenum, 'whitespace/newline', 1, + 'Unexpected \\r (^M) found; better to use only \\n') + + # Suppress printing anything if --quiet was passed unless the error + # count has increased after processing this file. + if not _cpplint_state.quiet or old_errors != _cpplint_state.error_count: + sys.stdout.write('Done processing %s\n' % filename) + _RestoreFilters() def PrintUsage(message): @@ -4681,13 +6159,16 @@ def ParseArguments(args): 'filter=', 'root=', 'linelength=', - 'extensions=']) + 'extensions=', + 'headers=', + 'quiet']) except getopt.GetoptError: PrintUsage('Invalid arguments.') verbosity = _VerboseLevel() output_format = _OutputFormat() filters = '' + quiet = _Quiet() counting_style = '' for (opt, val) in opts: @@ -4697,6 +6178,8 @@ def ParseArguments(args): if val not in ('emacs', 'vs7', 'eclipse'): PrintUsage('The only allowed output formats are emacs, vs7 and eclipse.') output_format = val + elif opt == '--quiet': + quiet = True elif opt == '--verbose': verbosity = int(val) elif opt == '--filter': @@ -4721,12 +6204,15 @@ def ParseArguments(args): try: _valid_extensions = set(val.split(',')) except ValueError: - PrintUsage('Extensions must be comma seperated list.') + PrintUsage('Extensions must be comma separated list.') + elif opt == '--headers': + ProcessHppHeadersOption(val) if not filenames: PrintUsage('No files were specified.') _SetOutputFormat(output_format) + _SetQuiet(quiet) _SetVerboseLevel(verbosity) _SetFilters(filters) _SetCountingStyle(counting_style) @@ -4747,7 +6233,9 @@ def main(): _cpplint_state.ResetErrorCounts() for filename in filenames: ProcessFile(filename, _cpplint_state.verbose_level) - _cpplint_state.PrintErrorCounts() + # If --quiet is passed, suppress printing error count unless there are errors. + if not _cpplint_state.quiet or _cpplint_state.error_count > 0: + _cpplint_state.PrintErrorCounts() sys.exit(_cpplint_state.error_count > 0) From ca93fc740edcc84da5c61f0951b86cf98c0c4752 Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 27 Oct 2021 10:05:19 -0700 Subject: [PATCH 0237/1000] update libwebm to libwebm-1.0.0.28-20-g206d268 picks up Android.mk license updates from AOSP and fixes as part of the 1.0.0.28 release changelog: https://chromium.googlesource.com/webm/libwebm/+log/37d9b86..206d268 Change-Id: I18d5238f7d1aff2678d903018929da952410fa0e --- third_party/libwebm/Android.mk | 3 + third_party/libwebm/README.libvpx | 2 +- third_party/libwebm/mkvmuxer/mkvmuxer.cc | 30 ++++--- third_party/libwebm/mkvmuxer/mkvmuxerutil.cc | 4 +- third_party/libwebm/mkvparser/mkvparser.cc | 87 +++++++++++++------- 5 files changed, 84 insertions(+), 42 deletions(-) diff --git a/third_party/libwebm/Android.mk b/third_party/libwebm/Android.mk index 3b3dd1d390..23f935f2db 100644 --- a/third_party/libwebm/Android.mk +++ b/third_party/libwebm/Android.mk @@ -14,4 +14,7 @@ LOCAL_SRC_FILES:= common/file_util.cc \ mkvmuxer/mkvmuxer.cc \ mkvmuxer/mkvmuxerutil.cc \ mkvmuxer/mkvwriter.cc +LOCAL_LICENSE_KINDS := SPDX-license-identifier-BSD +LOCAL_LICENSE_CONDITIONS := notice +LOCAL_NOTICE_FILE := $(LOCAL_PATH)/LICENSE.TXT $(LOCAL_PATH)/PATENTS.TXT include $(BUILD_STATIC_LIBRARY) diff --git a/third_party/libwebm/README.libvpx b/third_party/libwebm/README.libvpx index 1e87afd3d1..5cc0a83701 100644 --- a/third_party/libwebm/README.libvpx +++ b/third_party/libwebm/README.libvpx @@ -1,5 +1,5 @@ URL: https://chromium.googlesource.com/webm/libwebm -Version: 37d9b860ebbf40cb0f6dcb7a6fef452d798062da +Version: 206d268d4d8066e5a37c49025325b80c95c771dd License: BSD License File: LICENSE.txt diff --git a/third_party/libwebm/mkvmuxer/mkvmuxer.cc b/third_party/libwebm/mkvmuxer/mkvmuxer.cc index 5120312119..24c288863f 100644 --- a/third_party/libwebm/mkvmuxer/mkvmuxer.cc +++ b/third_party/libwebm/mkvmuxer/mkvmuxer.cc @@ -774,7 +774,7 @@ bool Track::Write(IMkvWriter* writer) const { return false; // AV1 tracks require a CodecPrivate. See - // https://github.com/Matroska-Org/matroska-specification/blob/av1-mappin/codec/av1.md + // https://github.com/ietf-wg-cellar/matroska-specification/blob/HEAD/codec/av1.md // TODO(tomfinegan): Update the above link to the AV1 Matroska mappings to // point to a stable version once it is finalized, or our own WebM mappings // page on webmproject.org should we decide to release them. @@ -2622,7 +2622,8 @@ bool Cluster::Finalize(bool set_last_frame_duration, uint64_t duration) { uint64_t Cluster::Size() const { const uint64_t element_size = - EbmlMasterElementSize(libwebm::kMkvCluster, 0xFFFFFFFFFFFFFFFFULL) + + EbmlMasterElementSize(static_cast(libwebm::kMkvCluster), + uint64_t{0xFFFFFFFFFFFFFFFFU}) + payload_size_; return element_size; } @@ -3084,6 +3085,7 @@ Segment::Segment() accurate_cluster_duration_(false), fixed_size_cluster_timecode_(false), estimate_file_duration_(false), + ebml_header_size_(0), payload_pos_(0), size_position_(0), doc_type_version_(kDefaultDocTypeVersion), @@ -4105,12 +4107,16 @@ int Segment::WriteFramesAll() { // places where |doc_type_version_| needs to be updated. if (frame->discard_padding() != 0) doc_type_version_ = 4; - if (!cluster->AddFrame(frame)) - return -1; + if (!cluster->AddFrame(frame)) { + delete frame; + continue; + } if (new_cuepoint_ && cues_track_ == frame->track_number()) { - if (!AddCuePoint(frame->timestamp(), cues_track_)) - return -1; + if (!AddCuePoint(frame->timestamp(), cues_track_)) { + delete frame; + continue; + } } if (frame->timestamp() > last_timestamp_) { @@ -4153,12 +4159,16 @@ bool Segment::WriteFramesLessThan(uint64_t timestamp) { const Frame* const frame_prev = frames_[i - 1]; if (frame_prev->discard_padding() != 0) doc_type_version_ = 4; - if (!cluster->AddFrame(frame_prev)) - return false; + if (!cluster->AddFrame(frame_prev)) { + delete frame_prev; + continue; + } if (new_cuepoint_ && cues_track_ == frame_prev->track_number()) { - if (!AddCuePoint(frame_prev->timestamp(), cues_track_)) - return false; + if (!AddCuePoint(frame_prev->timestamp(), cues_track_)) { + delete frame_prev; + continue; + } } ++shift_left; diff --git a/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc b/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc index 6436817c9b..bd2f769138 100644 --- a/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc +++ b/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc @@ -606,8 +606,8 @@ uint64 WriteVoidElement(IMkvWriter* writer, uint64 size) { void GetVersion(int32* major, int32* minor, int32* build, int32* revision) { *major = 0; - *minor = 2; - *build = 1; + *minor = 3; + *build = 0; *revision = 0; } diff --git a/third_party/libwebm/mkvparser/mkvparser.cc b/third_party/libwebm/mkvparser/mkvparser.cc index ace65bd595..de8884b381 100644 --- a/third_party/libwebm/mkvparser/mkvparser.cc +++ b/third_party/libwebm/mkvparser/mkvparser.cc @@ -54,9 +54,9 @@ Type* SafeArrayAlloc(unsigned long long num_elements, void GetVersion(int& major, int& minor, int& build, int& revision) { major = 1; - minor = 0; + minor = 1; build = 0; - revision = 30; + revision = 0; } long long ReadUInt(IMkvReader* pReader, long long pos, long& len) { @@ -1502,8 +1502,8 @@ long SeekHead::Parse() { // first count the seek head entries - int entry_count = 0; - int void_element_count = 0; + long long entry_count = 0; + long long void_element_count = 0; while (pos < stop) { long long id, size; @@ -1513,10 +1513,15 @@ long SeekHead::Parse() { if (status < 0) // error return status; - if (id == libwebm::kMkvSeek) + if (id == libwebm::kMkvSeek) { ++entry_count; - else if (id == libwebm::kMkvVoid) + if (entry_count > INT_MAX) + return E_PARSE_FAILED; + } else if (id == libwebm::kMkvVoid) { ++void_element_count; + if (void_element_count > INT_MAX) + return E_PARSE_FAILED; + } pos += size; // consume payload @@ -1528,14 +1533,15 @@ long SeekHead::Parse() { return E_FILE_FORMAT_INVALID; if (entry_count > 0) { - m_entries = new (std::nothrow) Entry[entry_count]; + m_entries = new (std::nothrow) Entry[static_cast(entry_count)]; if (m_entries == NULL) return -1; } if (void_element_count > 0) { - m_void_elements = new (std::nothrow) VoidElement[void_element_count]; + m_void_elements = + new (std::nothrow) VoidElement[static_cast(void_element_count)]; if (m_void_elements == NULL) return -1; @@ -1582,13 +1588,13 @@ long SeekHead::Parse() { ptrdiff_t count_ = ptrdiff_t(pEntry - m_entries); assert(count_ >= 0); - assert(count_ <= entry_count); + assert(static_cast(count_) <= entry_count); m_entry_count = static_cast(count_); count_ = ptrdiff_t(pVoidElement - m_void_elements); assert(count_ >= 0); - assert(count_ <= void_element_count); + assert(static_cast(count_) <= void_element_count); m_void_element_count = static_cast(count_); @@ -2299,7 +2305,7 @@ bool CuePoint::Load(IMkvReader* pReader) { long long pos = pos_; // First count number of track positions - + unsigned long long track_positions_count = 0; while (pos < stop) { long len; @@ -2323,12 +2329,17 @@ bool CuePoint::Load(IMkvReader* pReader) { if (id == libwebm::kMkvCueTime) m_timecode = UnserializeUInt(pReader, pos, size); - else if (id == libwebm::kMkvCueTrackPositions) - ++m_track_positions_count; + else if (id == libwebm::kMkvCueTrackPositions) { + ++track_positions_count; + if (track_positions_count > UINT_MAX) + return E_PARSE_FAILED; + } pos += size; // consume payload } + m_track_positions_count = static_cast(track_positions_count); + if (m_timecode < 0 || m_track_positions_count <= 0) { return false; } @@ -4194,8 +4205,8 @@ long ContentEncoding::ParseContentEncodingEntry(long long start, long long size, const long long stop = start + size; // Count ContentCompression and ContentEncryption elements. - int compression_count = 0; - int encryption_count = 0; + long long compression_count = 0; + long long encryption_count = 0; while (pos < stop) { long long id, size; @@ -4203,11 +4214,17 @@ long ContentEncoding::ParseContentEncodingEntry(long long start, long long size, if (status < 0) // error return status; - if (id == libwebm::kMkvContentCompression) + if (id == libwebm::kMkvContentCompression) { ++compression_count; + if (compression_count > INT_MAX) + return E_PARSE_FAILED; + } - if (id == libwebm::kMkvContentEncryption) + if (id == libwebm::kMkvContentEncryption) { ++encryption_count; + if (encryption_count > INT_MAX) + return E_PARSE_FAILED; + } pos += size; // consume payload if (pos > stop) @@ -4218,16 +4235,16 @@ long ContentEncoding::ParseContentEncodingEntry(long long start, long long size, return -1; if (compression_count > 0) { - compression_entries_ = - new (std::nothrow) ContentCompression*[compression_count]; + compression_entries_ = new (std::nothrow) + ContentCompression*[static_cast(compression_count)]; if (!compression_entries_) return -1; compression_entries_end_ = compression_entries_; } if (encryption_count > 0) { - encryption_entries_ = - new (std::nothrow) ContentEncryption*[encryption_count]; + encryption_entries_ = new (std::nothrow) + ContentEncryption*[static_cast(encryption_count)]; if (!encryption_entries_) { delete[] compression_entries_; compression_entries_ = NULL; @@ -4918,7 +4935,7 @@ long Track::ParseContentEncodingsEntry(long long start, long long size) { const long long stop = start + size; // Count ContentEncoding elements. - int count = 0; + long long count = 0; while (pos < stop) { long long id, size; const long status = ParseElementHeader(pReader, pos, stop, id, size); @@ -4926,8 +4943,11 @@ long Track::ParseContentEncodingsEntry(long long start, long long size) { return status; // pos now designates start of element - if (id == libwebm::kMkvContentEncoding) + if (id == libwebm::kMkvContentEncoding) { ++count; + if (count > INT_MAX) + return E_PARSE_FAILED; + } pos += size; // consume payload if (pos > stop) @@ -4937,7 +4957,8 @@ long Track::ParseContentEncodingsEntry(long long start, long long size) { if (count <= 0) return -1; - content_encoding_entries_ = new (std::nothrow) ContentEncoding*[count]; + content_encoding_entries_ = + new (std::nothrow) ContentEncoding*[static_cast(count)]; if (!content_encoding_entries_) return -1; @@ -5229,6 +5250,8 @@ bool Projection::Parse(IMkvReader* reader, long long start, long long size, projection_ptr->type = static_cast(projection_type); } else if (child_id == libwebm::kMkvProjectionPrivate) { + if (projection_ptr->private_data != NULL) + return false; unsigned char* data = SafeArrayAlloc(1, child_size); if (data == NULL) @@ -5286,6 +5309,7 @@ VideoTrack::VideoTrack(Segment* pSegment, long long element_start, m_projection(NULL) {} VideoTrack::~VideoTrack() { + delete[] m_colour_space; delete m_colour; delete m_projection; } @@ -5307,7 +5331,7 @@ long VideoTrack::Parse(Segment* pSegment, const Info& info, long long stereo_mode = 0; double rate = 0.0; - char* colour_space = NULL; + std::unique_ptr colour_space_ptr; IMkvReader* const pReader = pSegment->m_pReader; @@ -5384,9 +5408,11 @@ long VideoTrack::Parse(Segment* pSegment, const Info& info, projection_ptr.reset(projection); } } else if (id == libwebm::kMkvColourSpace) { + char* colour_space = NULL; const long status = UnserializeString(pReader, pos, size, colour_space); if (status < 0) return status; + colour_space_ptr.reset(colour_space); } pos += size; // consume payload @@ -5418,7 +5444,7 @@ long VideoTrack::Parse(Segment* pSegment, const Info& info, pTrack->m_stereo_mode = stereo_mode; pTrack->m_rate = rate; pTrack->m_colour = colour_ptr.release(); - pTrack->m_colour_space = colour_space; + pTrack->m_colour_space = colour_space_ptr.release(); pTrack->m_projection = projection_ptr.release(); pResult = pTrack; @@ -5648,7 +5674,7 @@ long Tracks::Parse() { const long long stop = m_start + m_size; IMkvReader* const pReader = m_pSegment->m_pReader; - int count = 0; + long long count = 0; long long pos = m_start; while (pos < stop) { @@ -5662,8 +5688,11 @@ long Tracks::Parse() { if (size == 0) // weird continue; - if (id == libwebm::kMkvTrackEntry) + if (id == libwebm::kMkvTrackEntry) { ++count; + if (count > INT_MAX) + return E_PARSE_FAILED; + } pos += size; // consume payload if (pos > stop) @@ -5676,7 +5705,7 @@ long Tracks::Parse() { if (count <= 0) return 0; // success - m_trackEntries = new (std::nothrow) Track*[count]; + m_trackEntries = new (std::nothrow) Track*[static_cast(count)]; if (m_trackEntries == NULL) return -1; From 69d08cb9d33af163ebef67d8c4dc45c897106905 Mon Sep 17 00:00:00 2001 From: James Zern Date: Sat, 6 Nov 2021 10:42:46 -0700 Subject: [PATCH 0238/1000] vp8_update_rate_correction_factors: fix integer overflow the intermediate value in the correction_factor calculation may exceed integer bounds Bug: b/189602769 Change-Id: I75726b12f3095663911d78333f3ea26eb6dee21e --- vp8/encoder/ratectrl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vp8/encoder/ratectrl.c b/vp8/encoder/ratectrl.c index 4b76cc6429..d591680ce3 100644 --- a/vp8/encoder/ratectrl.c +++ b/vp8/encoder/ratectrl.c @@ -1079,8 +1079,8 @@ void vp8_update_rate_correction_factors(VP8_COMP *cpi, int damp_var) { /* Work out a size correction factor. */ if (projected_size_based_on_q > 0) { - correction_factor = - (100 * cpi->projected_frame_size) / projected_size_based_on_q; + correction_factor = (int)((100 * (int64_t)cpi->projected_frame_size) / + projected_size_based_on_q); } /* More heavily damped adjustment used if we have been oscillating From 23796337ce5b0a2f58eb7386c9aded5e6a4b84f6 Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 8 Nov 2021 12:57:12 -0800 Subject: [PATCH 0239/1000] vp8,calc_pframe_target_size: fix integer overflow this is similar to the fix for calc_iframe_target_size: 5f345a924 Avoid overflow in calc_iframe_target_size Bug: chromium:1264506 Change-Id: I2f0e161cf9da59ca0724692d581f1594c8098ebb --- vp8/encoder/ratectrl.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/vp8/encoder/ratectrl.c b/vp8/encoder/ratectrl.c index d591680ce3..3df34009ab 100644 --- a/vp8/encoder/ratectrl.c +++ b/vp8/encoder/ratectrl.c @@ -781,6 +781,7 @@ static void calc_pframe_target_size(VP8_COMP *cpi) { } } else { int percent_high = 0; + int64_t target = cpi->this_frame_target; if ((cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) && (cpi->buffer_level > cpi->oxcf.optimal_buffer_level)) { @@ -798,7 +799,9 @@ static void calc_pframe_target_size(VP8_COMP *cpi) { percent_high = 0; } - cpi->this_frame_target += (cpi->this_frame_target * percent_high) / 200; + target += (target * percent_high) / 200; + target = VPXMIN(target, INT_MAX); + cpi->this_frame_target = (int)target; /* Are we allowing control of active_worst_allowed_q according * to buffer level. From 40c21ff6fe32c12d7e0a7f66b0a2f7ca67a26695 Mon Sep 17 00:00:00 2001 From: James Zern Date: Sat, 6 Nov 2021 10:42:37 -0700 Subject: [PATCH 0240/1000] video_source.h,ReallocImage: quiet implicit conv warning with -fsanitize=undefined test/video_source.h:194:33: runtime error: implicit conversion from type 'int' of value -32 (32-bit, signed) to type 'unsigned int' changed the value to 4294967264 (32-bit, unsigned) Change-Id: I92013086d517fecf01c9e4cdfe6737b8ce733a1f --- test/video_source.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/video_source.h b/test/video_source.h index e9340f21e9..2ba3f64211 100644 --- a/test/video_source.h +++ b/test/video_source.h @@ -191,7 +191,7 @@ class DummyVideoSource : public VideoSource { void ReallocImage() { vpx_img_free(img_); img_ = vpx_img_alloc(NULL, format_, width_, height_, 32); - raw_sz_ = ((img_->w + 31) & ~31) * img_->h * img_->bps / 8; + raw_sz_ = ((img_->w + 31) & ~31u) * img_->h * img_->bps / 8; } vpx_image_t *img_; From 1676cddaaa4feebc766c64767f035ca9b0e5739f Mon Sep 17 00:00:00 2001 From: James Zern Date: Sat, 6 Nov 2021 16:33:00 -0700 Subject: [PATCH 0241/1000] vp8: fix some implicit signed -> unsigned conv warnings and vice-versa mostly when dealing with bitmasks w/clang-11 -fsanitize=undefined Change-Id: I6d8f676bf87679ba1dad9cb7f55eea172103d9d3 --- vp8/decoder/decodeframe.c | 4 ++-- vp8/encoder/encodeframe.c | 4 ++-- vp8/encoder/encodemv.c | 2 +- vp8/encoder/lookahead.c | 4 ++-- vp8/encoder/ratectrl.c | 2 +- vpx_ports/x86.h | 2 +- 6 files changed, 9 insertions(+), 9 deletions(-) diff --git a/vp8/decoder/decodeframe.c b/vp8/decoder/decodeframe.c index 67c254fa14..1c1566766b 100644 --- a/vp8/decoder/decodeframe.c +++ b/vp8/decoder/decodeframe.c @@ -872,8 +872,8 @@ static void init_frame(VP8D_COMP *pbi) { xd->mode_info_stride = pc->mode_info_stride; xd->corrupted = 0; /* init without corruption */ - xd->fullpixel_mask = 0xffffffff; - if (pc->full_pixel) xd->fullpixel_mask = 0xfffffff8; + xd->fullpixel_mask = ~0; + if (pc->full_pixel) xd->fullpixel_mask = ~7; } int vp8_decode_frame(VP8D_COMP *pbi) { diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c index 2f84381d24..69271f1a73 100644 --- a/vp8/encoder/encodeframe.c +++ b/vp8/encoder/encodeframe.c @@ -634,8 +634,8 @@ static void init_encode_frame_mb_context(VP8_COMP *cpi) { cpi->prob_last_coded, cpi->prob_gf_coded); } - xd->fullpixel_mask = 0xffffffff; - if (cm->full_pixel) xd->fullpixel_mask = 0xfffffff8; + xd->fullpixel_mask = ~0; + if (cm->full_pixel) xd->fullpixel_mask = ~7; vp8_zero(x->coef_counts); vp8_zero(x->ymode_count); diff --git a/vp8/encoder/encodemv.c b/vp8/encoder/encodemv.c index 04adf105b9..ff38965393 100644 --- a/vp8/encoder/encodemv.c +++ b/vp8/encoder/encodemv.c @@ -160,7 +160,7 @@ static void calc_prob(vp8_prob *p, const unsigned int ct[2]) { const unsigned int tot = ct[0] + ct[1]; if (tot) { - const vp8_prob x = ((ct[0] * 255) / tot) & -2; + const vp8_prob x = ((ct[0] * 255) / tot) & ~1u; *p = x ? x : 1; } } diff --git a/vp8/encoder/lookahead.c b/vp8/encoder/lookahead.c index 37aa9eee84..49f851d019 100644 --- a/vp8/encoder/lookahead.c +++ b/vp8/encoder/lookahead.c @@ -66,8 +66,8 @@ struct lookahead_ctx *vp8_lookahead_init(unsigned int width, depth += 1; /* Align the buffer dimensions */ - width = (width + 15) & ~15; - height = (height + 15) & ~15; + width = (width + 15) & ~15u; + height = (height + 15) & ~15u; /* Allocate the lookahead structures */ ctx = calloc(1, sizeof(*ctx)); diff --git a/vp8/encoder/ratectrl.c b/vp8/encoder/ratectrl.c index 4b76cc6429..0a346ccfbe 100644 --- a/vp8/encoder/ratectrl.c +++ b/vp8/encoder/ratectrl.c @@ -314,7 +314,7 @@ static void calc_iframe_target_size(VP8_COMP *cpi) { * bandwidth per second * fraction of the initial buffer * level */ - target = cpi->oxcf.starting_buffer_level / 2; + target = (uint64_t)cpi->oxcf.starting_buffer_level / 2; if (target > cpi->oxcf.target_bandwidth * 3 / 2) { target = cpi->oxcf.target_bandwidth * 3 / 2; diff --git a/vpx_ports/x86.h b/vpx_ports/x86.h index 4d5391b78d..651ff64606 100644 --- a/vpx_ports/x86.h +++ b/vpx_ports/x86.h @@ -391,7 +391,7 @@ static INLINE unsigned int x87_set_double_precision(void) { // Reserved 01B // Double Precision (53-Bits) 10B // Extended Precision (64-Bits) 11B - x87_set_control_word((mode & ~0x300) | 0x200); + x87_set_control_word((mode & ~0x300u) | 0x200u); return mode; } From 2e73da326a26ddc367317e860e24e274947c26d8 Mon Sep 17 00:00:00 2001 From: James Zern Date: Sat, 6 Nov 2021 16:43:11 -0700 Subject: [PATCH 0242/1000] mem_sse2.h: storeu_uint32 -> storeu_int32 this changes the parameter to int32_t which matches the type with usage of this call using _mm_cvtsi128_si32() as a parameter. quiets an implicit conversion warning with clang-11 -fsanitize=undefined Change-Id: I1e9e9ffac5d2996962d29611458311221eca8ea0 --- vp8/common/x86/bilinear_filter_sse2.c | 4 ++-- vpx_dsp/x86/loopfilter_sse2.c | 16 ++++++++-------- vpx_dsp/x86/mem_sse2.h | 2 +- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/vp8/common/x86/bilinear_filter_sse2.c b/vp8/common/x86/bilinear_filter_sse2.c index 9bf65d8045..ff6cbbd68c 100644 --- a/vp8/common/x86/bilinear_filter_sse2.c +++ b/vp8/common/x86/bilinear_filter_sse2.c @@ -313,10 +313,10 @@ static INLINE void vertical_4x4(uint16_t *src, uint8_t *dst, const int stride, const __m128i compensated = _mm_add_epi16(sum, round_factor); const __m128i shifted = _mm_srai_epi16(compensated, VP8_FILTER_SHIFT); __m128i packed = _mm_packus_epi16(shifted, shifted); - storeu_uint32(dst, _mm_cvtsi128_si32(packed)); + storeu_int32(dst, _mm_cvtsi128_si32(packed)); packed = _mm_srli_si128(packed, 4); dst += stride; - storeu_uint32(dst, _mm_cvtsi128_si32(packed)); + storeu_int32(dst, _mm_cvtsi128_si32(packed)); dst += stride; src += 8; } diff --git a/vpx_dsp/x86/loopfilter_sse2.c b/vpx_dsp/x86/loopfilter_sse2.c index b6ff24834b..347c9fdbe9 100644 --- a/vpx_dsp/x86/loopfilter_sse2.c +++ b/vpx_dsp/x86/loopfilter_sse2.c @@ -211,21 +211,21 @@ void vpx_lpf_vertical_4_sse2(uint8_t *s, int pitch, const uint8_t *blimit, // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 ps1ps0 = _mm_unpacklo_epi8(ps1ps0, x0); - storeu_uint32(s + 0 * pitch - 2, _mm_cvtsi128_si32(ps1ps0)); + storeu_int32(s + 0 * pitch - 2, _mm_cvtsi128_si32(ps1ps0)); ps1ps0 = _mm_srli_si128(ps1ps0, 4); - storeu_uint32(s + 1 * pitch - 2, _mm_cvtsi128_si32(ps1ps0)); + storeu_int32(s + 1 * pitch - 2, _mm_cvtsi128_si32(ps1ps0)); ps1ps0 = _mm_srli_si128(ps1ps0, 4); - storeu_uint32(s + 2 * pitch - 2, _mm_cvtsi128_si32(ps1ps0)); + storeu_int32(s + 2 * pitch - 2, _mm_cvtsi128_si32(ps1ps0)); ps1ps0 = _mm_srli_si128(ps1ps0, 4); - storeu_uint32(s + 3 * pitch - 2, _mm_cvtsi128_si32(ps1ps0)); + storeu_int32(s + 3 * pitch - 2, _mm_cvtsi128_si32(ps1ps0)); - storeu_uint32(s + 4 * pitch - 2, _mm_cvtsi128_si32(qs1qs0)); + storeu_int32(s + 4 * pitch - 2, _mm_cvtsi128_si32(qs1qs0)); qs1qs0 = _mm_srli_si128(qs1qs0, 4); - storeu_uint32(s + 5 * pitch - 2, _mm_cvtsi128_si32(qs1qs0)); + storeu_int32(s + 5 * pitch - 2, _mm_cvtsi128_si32(qs1qs0)); qs1qs0 = _mm_srli_si128(qs1qs0, 4); - storeu_uint32(s + 6 * pitch - 2, _mm_cvtsi128_si32(qs1qs0)); + storeu_int32(s + 6 * pitch - 2, _mm_cvtsi128_si32(qs1qs0)); qs1qs0 = _mm_srli_si128(qs1qs0, 4); - storeu_uint32(s + 7 * pitch - 2, _mm_cvtsi128_si32(qs1qs0)); + storeu_int32(s + 7 * pitch - 2, _mm_cvtsi128_si32(qs1qs0)); } void vpx_lpf_horizontal_16_sse2(unsigned char *s, int pitch, diff --git a/vpx_dsp/x86/mem_sse2.h b/vpx_dsp/x86/mem_sse2.h index 258ab38e60..75fa2b0b72 100644 --- a/vpx_dsp/x86/mem_sse2.h +++ b/vpx_dsp/x86/mem_sse2.h @@ -16,7 +16,7 @@ #include "./vpx_config.h" -static INLINE void storeu_uint32(void *dst, uint32_t v) { +static INLINE void storeu_int32(void *dst, int32_t v) { memcpy(dst, &v, sizeof(v)); } From 16333de2890ea84fff5c2d788003f4eea2ce5600 Mon Sep 17 00:00:00 2001 From: James Zern Date: Sat, 6 Nov 2021 16:48:13 -0700 Subject: [PATCH 0243/1000] mem_sse2.h: loadu_uint32 -> loadu_int32 this changes the return to int32_t which matches the type with usage of this call as input to _mm_cvtsi32_si128(), _mm_set_epi32(), etc. fixes implicit conversion warning with clang-11 -fsanitize=undefined Change-Id: I1425f12d4f79155dd5d7af0eb00fbdb9f1940544 --- vpx_dsp/x86/avg_pred_sse2.c | 6 +++--- vpx_dsp/x86/mem_sse2.h | 4 ++-- vpx_dsp/x86/variance_sse2.c | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/vpx_dsp/x86/avg_pred_sse2.c b/vpx_dsp/x86/avg_pred_sse2.c index e4e1e0e7a2..c6e70f744e 100644 --- a/vpx_dsp/x86/avg_pred_sse2.c +++ b/vpx_dsp/x86/avg_pred_sse2.c @@ -46,9 +46,9 @@ void vpx_comp_avg_pred_sse2(uint8_t *comp_pred, const uint8_t *pred, int width, r = _mm_loadu_si128((const __m128i *)ref); ref += 16; } else if (width == 4) { - r = _mm_set_epi32(loadu_uint32(ref + 3 * ref_stride), - loadu_uint32(ref + 2 * ref_stride), - loadu_uint32(ref + ref_stride), loadu_uint32(ref)); + r = _mm_set_epi32(loadu_int32(ref + 3 * ref_stride), + loadu_int32(ref + 2 * ref_stride), + loadu_int32(ref + ref_stride), loadu_int32(ref)); ref += 4 * ref_stride; } else { diff --git a/vpx_dsp/x86/mem_sse2.h b/vpx_dsp/x86/mem_sse2.h index 75fa2b0b72..8b6d4d1dd4 100644 --- a/vpx_dsp/x86/mem_sse2.h +++ b/vpx_dsp/x86/mem_sse2.h @@ -20,8 +20,8 @@ static INLINE void storeu_int32(void *dst, int32_t v) { memcpy(dst, &v, sizeof(v)); } -static INLINE uint32_t loadu_uint32(const void *src) { - uint32_t v; +static INLINE int32_t loadu_int32(const void *src) { + int32_t v; memcpy(&v, src, sizeof(v)); return v; } diff --git a/vpx_dsp/x86/variance_sse2.c b/vpx_dsp/x86/variance_sse2.c index 37ef64ecaa..67645c57ac 100644 --- a/vpx_dsp/x86/variance_sse2.c +++ b/vpx_dsp/x86/variance_sse2.c @@ -36,8 +36,8 @@ unsigned int vpx_get_mb_ss_sse2(const int16_t *src_ptr) { } static INLINE __m128i load4x2_sse2(const uint8_t *const p, const int stride) { - const __m128i p0 = _mm_cvtsi32_si128(loadu_uint32(p + 0 * stride)); - const __m128i p1 = _mm_cvtsi32_si128(loadu_uint32(p + 1 * stride)); + const __m128i p0 = _mm_cvtsi32_si128(loadu_int32(p + 0 * stride)); + const __m128i p1 = _mm_cvtsi32_si128(loadu_int32(p + 1 * stride)); const __m128i p01 = _mm_unpacklo_epi32(p0, p1); return _mm_unpacklo_epi8(p01, _mm_setzero_si128()); } From 888bafc78d8bddb5cfc4262c93f456c812763571 Mon Sep 17 00:00:00 2001 From: James Zern Date: Sat, 6 Nov 2021 10:42:46 -0700 Subject: [PATCH 0244/1000] vp8 encoder: fix some integer overflows cap the bitrate to 1000Mbps to avoid many instances of bitrate * 3 / 2 overflowing. this adds coverage for 2048x2048 in the default test for VP8 with TODOs for issues at that resolution for VP9 and at max resolution for both. Bug: b/189602769 Bug: chromium:1264506 Bug: webm:1748 Bug: webm:1749 Bug: webm:1750 Bug: webm:1751 Change-Id: Iedee4dd8d3609c2504271f94d22433dfcd828429 --- test/realtime_test.cc | 43 ++++++++++++++++++++++++++++++++++++++----- vp8/vp8_cx_iface.c | 4 +++- 2 files changed, 41 insertions(+), 6 deletions(-) diff --git a/test/realtime_test.cc b/test/realtime_test.cc index 63a5347d99..b32a35513c 100644 --- a/test/realtime_test.cc +++ b/test/realtime_test.cc @@ -7,6 +7,8 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ +#include + #include "test/codec_factory.h" #include "test/encode_test_driver.h" #include "test/util.h" @@ -52,6 +54,22 @@ class RealtimeTest frame_packets_++; } + bool IsVP9() const { +#if CONFIG_VP9_ENCODER + return codec_ == &libvpx_test::kVP9; +#else + return false; +#endif + } + + void TestIntegerOverflow(unsigned int width, unsigned int height) { + ::libvpx_test::RandomVideoSource video; + video.SetSize(width, height); + video.set_limit(20); + cfg_.rc_target_bitrate = UINT_MAX; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + } + int frame_packets_; }; @@ -64,11 +82,26 @@ TEST_P(RealtimeTest, RealtimeFirstPassProducesFrames) { } TEST_P(RealtimeTest, IntegerOverflow) { - ::libvpx_test::RandomVideoSource video; - video.SetSize(800, 480); - video.set_limit(20); - cfg_.rc_target_bitrate = 140000000; - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + if (IsVP9()) { + // TODO(https://crbug.com/webm/1749): This should match VP8. + TestIntegerOverflow(800, 480); + } else { + TestIntegerOverflow(2048, 2048); + } +} + +TEST_P(RealtimeTest, IntegerOverflowLarge) { + if (IsVP9()) { + GTEST_SKIP() << "TODO(https://crbug.com/webm/1750): Enable this test after " + "undefined sanitizer warnings are fixed."; + // TestIntegerOverflow(16384, 16384); + } else { + GTEST_SKIP() + << "TODO(https://crbug.com/webm/1748,https://crbug.com/webm/1751):" + << " Enable this test after bitstream errors & undefined sanitizer " + "warnings are fixed."; + // TestIntegerOverflow(16383, 16383); + } } VP8_INSTANTIATE_TEST_SUITE(RealtimeTest, diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c index 893b7a5132..ab954c46f2 100644 --- a/vp8/vp8_cx_iface.c +++ b/vp8/vp8_cx_iface.c @@ -339,7 +339,9 @@ static vpx_codec_err_t set_vp8e_config(VP8_CONFIG *oxcf, oxcf->end_usage = USAGE_CONSTANT_QUALITY; } - oxcf->target_bandwidth = cfg.rc_target_bitrate; + // Cap the target rate to 1000 Mbps to avoid some integer overflows in + // target bandwidth calculations. + oxcf->target_bandwidth = VPXMIN(cfg.rc_target_bitrate, 1000000); oxcf->rc_max_intra_bitrate_pct = vp8_cfg.rc_max_intra_bitrate_pct; oxcf->gf_cbr_boost_pct = vp8_cfg.gf_cbr_boost_pct; From 7e4c6fed0c9b212dcee4787d461042c94dee4468 Mon Sep 17 00:00:00 2001 From: James Zern Date: Thu, 11 Nov 2021 13:42:35 -0800 Subject: [PATCH 0245/1000] test/DummyVideoSource::ReallocImage: check img_ alloc prevents a crash on the next line accessing img_ members Bug: aomedia:3191 Change-Id: I430fb4ee662b0001629096eb8b554f8a2b30cce0 --- test/video_source.h | 1 + 1 file changed, 1 insertion(+) diff --git a/test/video_source.h b/test/video_source.h index 2ba3f64211..7a2dbe7ef7 100644 --- a/test/video_source.h +++ b/test/video_source.h @@ -191,6 +191,7 @@ class DummyVideoSource : public VideoSource { void ReallocImage() { vpx_img_free(img_); img_ = vpx_img_alloc(NULL, format_, width_, height_, 32); + ASSERT_NE(img_, nullptr); raw_sz_ = ((img_->w + 31) & ~31u) * img_->h * img_->bps / 8; } From 9fb780c5e75584fe198a89f0cf1898cebb542104 Mon Sep 17 00:00:00 2001 From: Mikko Koivisto Date: Mon, 15 Nov 2021 18:47:05 +0000 Subject: [PATCH 0246/1000] vp9: Fix multiplication overflow Fix UBSan error reported from aosp Cuttlefish device: /vp9/encoder/vp9_ratectrl.c:238:33: unsigned integer overflow: 2500000 * 1800 cannot be represented in type 'unsigned int' ...by casting the operand and the result of multiplication to 64bit integer. Test: vp9 webrtc streaming with Cuttlefish Change-Id: Id5bb3d4071a96179caffae0829d3cc4e48c7614b --- vp9/encoder/vp9_ratectrl.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index e38464c72c..ac346115fb 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -223,9 +223,10 @@ int vp9_rc_clamp_pframe_target_size(const VP9_COMP *const cpi, int target) { if (target > rc->max_frame_bandwidth) target = rc->max_frame_bandwidth; if (oxcf->rc_max_inter_bitrate_pct) { - const int max_rate = - rc->avg_frame_bandwidth * oxcf->rc_max_inter_bitrate_pct / 100; - target = VPXMIN(target, max_rate); + const int64_t max_rate = + (int64_t)rc->avg_frame_bandwidth * oxcf->rc_max_inter_bitrate_pct / 100; + // target is of type int and VPXMIN cannot evaluate to larger than target + target = (int)VPXMIN(target, max_rate); } return target; } @@ -234,9 +235,9 @@ int vp9_rc_clamp_iframe_target_size(const VP9_COMP *const cpi, int target) { const RATE_CONTROL *rc = &cpi->rc; const VP9EncoderConfig *oxcf = &cpi->oxcf; if (oxcf->rc_max_intra_bitrate_pct) { - const int max_rate = - rc->avg_frame_bandwidth * oxcf->rc_max_intra_bitrate_pct / 100; - target = VPXMIN(target, max_rate); + const int64_t max_rate = + (int64_t)rc->avg_frame_bandwidth * oxcf->rc_max_intra_bitrate_pct / 100; + target = (int)VPXMIN(target, max_rate); } if (target > rc->max_frame_bandwidth) target = rc->max_frame_bandwidth; return target; From bf93b61f789bc890b83b0fd34bb9665ec6137638 Mon Sep 17 00:00:00 2001 From: Mikko Koivisto Date: Tue, 16 Nov 2021 12:29:21 +0000 Subject: [PATCH 0247/1000] vp9: fix ubsan sub-overflows Fix errors reported by UBSan diagnostics: 1. /vp9/encoder/vp9_pickmode.c:308:29: unsigned integer overflow: 99 - 100 cannot be represented in type 'unsigned int' 2. /vp9/encoder/vp9_pickmode.c:330:27: unsigned integer overflow: 21976 - 21978 cannot be represented in type 'unsigned int' 3. /vp9/encoder/vp9_pickmode.c:468:13: unsigned integer overflow: 18852144 - 18852149 cannot be represented in type 'unsigned int' (Notice that line numbers might vary a bit because fixes have been applied incrementally i.e. fix for error #1 affects line number reported in #2) Fix by calculating difference instead of wrapping around to a value near maximum. Test: Cuttlefish webrtc with VP9 codec Change-Id: I4f85712028647e915a4e2da31e4b0a266e9e2705 --- vp9/encoder/vp9_pickmode.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c index 695fd484fc..1a66c0a867 100644 --- a/vp9/encoder/vp9_pickmode.c +++ b/vp9/encoder/vp9_pickmode.c @@ -268,6 +268,7 @@ static void block_variance(const uint8_t *src, int src_stride, #endif uint32_t *sse8x8, int *sum8x8, uint32_t *var8x8) { int i, j, k = 0; + uint32_t k_sqr = 0; *sse = 0; *sum = 0; @@ -305,7 +306,8 @@ static void block_variance(const uint8_t *src, int src_stride, #endif *sse += sse8x8[k]; *sum += sum8x8[k]; - var8x8[k] = sse8x8[k] - (uint32_t)(((int64_t)sum8x8[k] * sum8x8[k]) >> 6); + k_sqr = (uint32_t)(((int64_t)sum8x8[k] * sum8x8[k]) >> 6); + var8x8[k] = sse8x8[k] > k_sqr ? sse8x8[k] - k_sqr : k_sqr - sse8x8[k]; k++; } } @@ -319,6 +321,7 @@ static void calculate_variance(int bw, int bh, TX_SIZE tx_size, const int nw = 1 << (bw - b_width_log2_lookup[unit_size]); const int nh = 1 << (bh - b_height_log2_lookup[unit_size]); int i, j, k = 0; + uint32_t k_sqr = 0; for (i = 0; i < nh; i += 2) { for (j = 0; j < nw; j += 2) { @@ -326,9 +329,10 @@ static void calculate_variance(int bw, int bh, TX_SIZE tx_size, sse_i[(i + 1) * nw + j] + sse_i[(i + 1) * nw + j + 1]; sum_o[k] = sum_i[i * nw + j] + sum_i[i * nw + j + 1] + sum_i[(i + 1) * nw + j] + sum_i[(i + 1) * nw + j + 1]; - var_o[k] = sse_o[k] - (uint32_t)(((int64_t)sum_o[k] * sum_o[k]) >> - (b_width_log2_lookup[unit_size] + - b_height_log2_lookup[unit_size] + 6)); + k_sqr = (uint32_t)(((int64_t)sum_o[k] * sum_o[k]) >> + (b_width_log2_lookup[unit_size] + + b_height_log2_lookup[unit_size] + 6)); + var_o[k] = sse_o[k] > k_sqr ? sse_o[k] - k_sqr : k_sqr - sse_o[k]; k++; } } @@ -452,6 +456,7 @@ static void model_rd_for_sb_y_large(VP9_COMP *cpi, BLOCK_SIZE bsize, unsigned int var8x8[64] = { 0 }; TX_SIZE tx_size; int i, k; + uint32_t sum_sqr; #if CONFIG_VP9_HIGHBITDEPTH const vpx_bit_depth_t bd = cpi->common.bit_depth; #endif @@ -463,7 +468,8 @@ static void model_rd_for_sb_y_large(VP9_COMP *cpi, BLOCK_SIZE bsize, cpi->common.use_highbitdepth, bd, #endif sse8x8, sum8x8, var8x8); - var = sse - (unsigned int)(((int64_t)sum * sum) >> (bw + bh + 4)); + sum_sqr = (uint32_t)((int64_t)sum * sum) >> (bw + bh + 4); + var = sse > sum_sqr ? sse - sum_sqr : sum_sqr - sse; *var_y = var; *sse_y = sse; From c59de7bc914d40305c1a7b066b9965d809d81533 Mon Sep 17 00:00:00 2001 From: Johann Date: Wed, 17 Nov 2021 06:02:15 +0900 Subject: [PATCH 0248/1000] MacOS 12 is darwin21 Remove -mmacosx-version-min. The library does not use any calls which are affected by the platform version. There is also no version 10.16 as it went from 10.15 to 11 and now to 12. At some point it may be good to clarify that the bare -darwin- target is for iOS and the -darwinN- targets are for macOS. Change-Id: I2fd5f7cae2637905acf3ab77bfddfbe367abbb68 --- build/make/configure.sh | 8 ++++---- configure | 2 ++ 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/build/make/configure.sh b/build/make/configure.sh index 81d30a16c7..b24e79a0d2 100644 --- a/build/make/configure.sh +++ b/build/make/configure.sh @@ -774,7 +774,7 @@ process_common_toolchain() { tgt_isa=x86_64 tgt_os=`echo $gcctarget | sed 's/.*\(darwin1[0-9]\).*/\1/'` ;; - *darwin20*) + *darwin2[0-1]*) tgt_isa=`uname -m` tgt_os=`echo $gcctarget | sed 's/.*\(darwin2[0-9]\).*/\1/'` ;; @@ -918,9 +918,9 @@ process_common_toolchain() { add_cflags "-mmacosx-version-min=10.15" add_ldflags "-mmacosx-version-min=10.15" ;; - *-darwin20-*) - add_cflags "-mmacosx-version-min=10.16 -arch ${toolchain%%-*}" - add_ldflags "-mmacosx-version-min=10.16 -arch ${toolchain%%-*}" + *-darwin2[0-1]-*) + add_cflags "-arch ${toolchain%%-*}" + add_ldflags "-arch ${toolchain%%-*}" ;; *-iphonesimulator-*) add_cflags "-miphoneos-version-min=${IOS_VERSION_MIN}" diff --git a/configure b/configure index e3babbe824..d39db6cb08 100755 --- a/configure +++ b/configure @@ -100,6 +100,7 @@ EOF all_platforms="${all_platforms} arm64-android-gcc" all_platforms="${all_platforms} arm64-darwin-gcc" all_platforms="${all_platforms} arm64-darwin20-gcc" +all_platforms="${all_platforms} arm64-darwin21-gcc" all_platforms="${all_platforms} arm64-linux-gcc" all_platforms="${all_platforms} arm64-win64-gcc" all_platforms="${all_platforms} arm64-win64-vs15" @@ -152,6 +153,7 @@ all_platforms="${all_platforms} x86_64-darwin17-gcc" all_platforms="${all_platforms} x86_64-darwin18-gcc" all_platforms="${all_platforms} x86_64-darwin19-gcc" all_platforms="${all_platforms} x86_64-darwin20-gcc" +all_platforms="${all_platforms} x86_64-darwin21-gcc" all_platforms="${all_platforms} x86_64-iphonesimulator-gcc" all_platforms="${all_platforms} x86_64-linux-gcc" all_platforms="${all_platforms} x86_64-linux-icc" From c0ba429863bb9f85484430f1c62edf636e4fe6de Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 15 Nov 2021 19:20:36 -0800 Subject: [PATCH 0249/1000] encode_api_test.cc: unify kCodecs[] definitions and rename the table to kCodecIfaces[] to be a little more specific and avoid shadowing kCodecs[] in SetRoi() Change-Id: I64905f48d8bf76e812bdba8374b82e3f7654686f --- test/encode_api_test.cc | 42 ++++++++++++++++------------------------- 1 file changed, 16 insertions(+), 26 deletions(-) diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc index dec19b2268..6f940b629e 100644 --- a/test/encode_api_test.cc +++ b/test/encode_api_test.cc @@ -19,7 +19,14 @@ namespace { -#define NELEMENTS(x) static_cast(sizeof(x) / sizeof(x[0])) +const vpx_codec_iface_t *kCodecIfaces[] = { +#if CONFIG_VP8_ENCODER + &vpx_codec_vp8_cx_algo, +#endif +#if CONFIG_VP9_ENCODER + &vpx_codec_vp9_cx_algo, +#endif +}; bool IsVP9(const vpx_codec_iface_t *iface) { static const char kVP9Name[] = "WebM Project VP9"; @@ -28,14 +35,6 @@ bool IsVP9(const vpx_codec_iface_t *iface) { } TEST(EncodeAPI, InvalidParams) { - static const vpx_codec_iface_t *kCodecs[] = { -#if CONFIG_VP8_ENCODER - &vpx_codec_vp8_cx_algo, -#endif -#if CONFIG_VP9_ENCODER - &vpx_codec_vp9_cx_algo, -#endif - }; uint8_t buf[1] = { 0 }; vpx_image_t img; vpx_codec_ctx_t enc; @@ -58,17 +57,17 @@ TEST(EncodeAPI, InvalidParams) { vpx_codec_enc_config_default(nullptr, &cfg, 0)); EXPECT_NE(vpx_codec_error(nullptr), nullptr); - for (int i = 0; i < NELEMENTS(kCodecs); ++i) { - SCOPED_TRACE(vpx_codec_iface_name(kCodecs[i])); + for (const auto *iface : kCodecIfaces) { + SCOPED_TRACE(vpx_codec_iface_name(iface)); EXPECT_EQ(VPX_CODEC_INVALID_PARAM, - vpx_codec_enc_init(nullptr, kCodecs[i], nullptr, 0)); + vpx_codec_enc_init(nullptr, iface, nullptr, 0)); EXPECT_EQ(VPX_CODEC_INVALID_PARAM, - vpx_codec_enc_init(&enc, kCodecs[i], nullptr, 0)); + vpx_codec_enc_init(&enc, iface, nullptr, 0)); EXPECT_EQ(VPX_CODEC_INVALID_PARAM, - vpx_codec_enc_config_default(kCodecs[i], &cfg, 1)); + vpx_codec_enc_config_default(iface, &cfg, 1)); - EXPECT_EQ(VPX_CODEC_OK, vpx_codec_enc_config_default(kCodecs[i], &cfg, 0)); - EXPECT_EQ(VPX_CODEC_OK, vpx_codec_enc_init(&enc, kCodecs[i], &cfg, 0)); + EXPECT_EQ(VPX_CODEC_OK, vpx_codec_enc_config_default(iface, &cfg, 0)); + EXPECT_EQ(VPX_CODEC_OK, vpx_codec_enc_init(&enc, iface, &cfg, 0)); EXPECT_EQ(VPX_CODEC_OK, vpx_codec_encode(&enc, nullptr, 0, 0, 0, 0)); EXPECT_EQ(VPX_CODEC_OK, vpx_codec_destroy(&enc)); @@ -124,14 +123,6 @@ TEST(EncodeAPI, ImageSizeSetting) { // (ts_target_bitrate[]) to 0 for both layers. This should fail independent of // CONFIG_MULTI_RES_ENCODING. TEST(EncodeAPI, MultiResEncode) { - static const vpx_codec_iface_t *kCodecs[] = { -#if CONFIG_VP8_ENCODER - &vpx_codec_vp8_cx_algo, -#endif -#if CONFIG_VP9_ENCODER - &vpx_codec_vp9_cx_algo, -#endif - }; const int width = 1280; const int height = 720; const int width_down = width / 2; @@ -139,8 +130,7 @@ TEST(EncodeAPI, MultiResEncode) { const int target_bitrate = 1000; const int framerate = 30; - for (int c = 0; c < NELEMENTS(kCodecs); ++c) { - const vpx_codec_iface_t *const iface = kCodecs[c]; + for (const auto *iface : kCodecIfaces) { vpx_codec_ctx_t enc[2]; vpx_codec_enc_cfg_t cfg[2]; vpx_rational_t dsf[2] = { { 2, 1 }, { 2, 1 } }; From 4a5a0a9a795782f33b8eb04461f9a5dfc9a146e1 Mon Sep 17 00:00:00 2001 From: Ilya Kurdyukov Date: Sat, 13 Nov 2021 18:22:14 +0700 Subject: [PATCH 0250/1000] faster vp8_regular_quantize_b_sse4_1 Gives 10% faster VP8 encoding in simple tests. This patch requires testing on wider datasets and encoder settings to see if this speedup is achieved on most data. Change-Id: If8e04819623e78fff126c413db66c964c0b4c11a --- vp8/encoder/x86/quantize_sse4.c | 95 ++++++++++++++++++++------------- vpx_ports/bitops.h | 23 +++++++- 2 files changed, 78 insertions(+), 40 deletions(-) diff --git a/vp8/encoder/x86/quantize_sse4.c b/vp8/encoder/x86/quantize_sse4.c index 389c16705d..6d03365fcb 100644 --- a/vp8/encoder/x86/quantize_sse4.c +++ b/vp8/encoder/x86/quantize_sse4.c @@ -11,28 +11,14 @@ #include /* SSE4.1 */ #include "./vp8_rtcd.h" -#include "vp8/common/entropy.h" /* vp8_default_inv_zig_zag */ #include "vp8/encoder/block.h" - -#define SELECT_EOB(i, z, x, y, q) \ - do { \ - short boost = *zbin_boost_ptr; \ - /* Technically _mm_extract_epi16() returns an int: */ \ - /* https://bugs.llvm.org/show_bug.cgi?id=41657 */ \ - short x_z = (short)_mm_extract_epi16(x, z); \ - short y_z = (short)_mm_extract_epi16(y, z); \ - int cmp = (x_z < boost) | (y_z == 0); \ - zbin_boost_ptr++; \ - if (cmp) break; \ - q = _mm_insert_epi16(q, y_z, z); \ - eob = i; \ - zbin_boost_ptr = b->zrun_zbin_boost; \ - } while (0) +#include "vpx_ports/bitops.h" /* get_lsb */ void vp8_regular_quantize_b_sse4_1(BLOCK *b, BLOCKD *d) { - char eob = 0; + int eob = -1; short *zbin_boost_ptr = b->zrun_zbin_boost; - + __m128i zbin_boost0 = _mm_load_si128((__m128i *)(zbin_boost_ptr)); + __m128i zbin_boost1 = _mm_load_si128((__m128i *)(zbin_boost_ptr + 8)); __m128i x0, x1, y0, y1, x_minus_zbin0, x_minus_zbin1, dqcoeff0, dqcoeff1; __m128i quant_shift0 = _mm_load_si128((__m128i *)(b->quant_shift)); __m128i quant_shift1 = _mm_load_si128((__m128i *)(b->quant_shift + 8)); @@ -47,8 +33,12 @@ void vp8_regular_quantize_b_sse4_1(BLOCK *b, BLOCKD *d) { __m128i quant1 = _mm_load_si128((__m128i *)(b->quant + 8)); __m128i dequant0 = _mm_load_si128((__m128i *)(d->dequant)); __m128i dequant1 = _mm_load_si128((__m128i *)(d->dequant + 8)); - __m128i qcoeff0 = _mm_setzero_si128(); - __m128i qcoeff1 = _mm_setzero_si128(); + __m128i qcoeff0, qcoeff1, t0, t1, x_shuf0, x_shuf1; + uint32_t mask, ymask; + DECLARE_ALIGNED(16, static const uint8_t, + zig_zag_mask[16]) = { 0, 1, 4, 8, 5, 2, 3, 6, + 9, 12, 13, 10, 7, 11, 14, 15 }; + DECLARE_ALIGNED(16, uint16_t, qcoeff[16]) = { 0 }; /* Duplicate to all lanes. */ zbin_extra = _mm_shufflelo_epi16(zbin_extra, 0); @@ -88,23 +78,52 @@ void vp8_regular_quantize_b_sse4_1(BLOCK *b, BLOCKD *d) { y0 = _mm_sign_epi16(y0, z0); y1 = _mm_sign_epi16(y1, z1); - /* The loop gets unrolled anyway. Avoid the vp8_default_zig_zag1d lookup. */ - SELECT_EOB(1, 0, x_minus_zbin0, y0, qcoeff0); - SELECT_EOB(2, 1, x_minus_zbin0, y0, qcoeff0); - SELECT_EOB(3, 4, x_minus_zbin0, y0, qcoeff0); - SELECT_EOB(4, 0, x_minus_zbin1, y1, qcoeff1); - SELECT_EOB(5, 5, x_minus_zbin0, y0, qcoeff0); - SELECT_EOB(6, 2, x_minus_zbin0, y0, qcoeff0); - SELECT_EOB(7, 3, x_minus_zbin0, y0, qcoeff0); - SELECT_EOB(8, 6, x_minus_zbin0, y0, qcoeff0); - SELECT_EOB(9, 1, x_minus_zbin1, y1, qcoeff1); - SELECT_EOB(10, 4, x_minus_zbin1, y1, qcoeff1); - SELECT_EOB(11, 5, x_minus_zbin1, y1, qcoeff1); - SELECT_EOB(12, 2, x_minus_zbin1, y1, qcoeff1); - SELECT_EOB(13, 7, x_minus_zbin0, y0, qcoeff0); - SELECT_EOB(14, 3, x_minus_zbin1, y1, qcoeff1); - SELECT_EOB(15, 6, x_minus_zbin1, y1, qcoeff1); - SELECT_EOB(16, 7, x_minus_zbin1, y1, qcoeff1); + { + const __m128i zig_zag_i16_0 = + _mm_setr_epi8(0, 1, 2, 3, 8, 9, 14, 15, 10, 11, 4, 5, 6, 7, 12, 13); + const __m128i zig_zag_i16_1 = + _mm_setr_epi8(0, 1, 6, 7, 8, 9, 2, 3, 14, 15, 4, 5, 10, 11, 12, 13); + + /* The first part of the zig zag needs a value + * from x_minus_zbin1 and vice versa. */ + t1 = _mm_alignr_epi8(x_minus_zbin1, x_minus_zbin1, 2); + t0 = _mm_blend_epi16(x_minus_zbin0, t1, 0x80); + t1 = _mm_blend_epi16(t1, x_minus_zbin0, 0x80); + x_shuf0 = _mm_shuffle_epi8(t0, zig_zag_i16_0); + x_shuf1 = _mm_shuffle_epi8(t1, zig_zag_i16_1); + } + + /* Check if y is nonzero and put it in zig zag order. */ + t0 = _mm_packs_epi16(y0, y1); + t0 = _mm_cmpeq_epi8(t0, _mm_setzero_si128()); + t0 = _mm_shuffle_epi8(t0, _mm_load_si128((const __m128i *)zig_zag_mask)); + ymask = _mm_movemask_epi8(t0) ^ 0xffff; + + for (;;) { + t0 = _mm_cmpgt_epi16(zbin_boost0, x_shuf0); + t1 = _mm_cmpgt_epi16(zbin_boost1, x_shuf1); + t0 = _mm_packs_epi16(t0, t1); + mask = _mm_movemask_epi8(t0); + mask = ~mask & ymask; + if (!mask) break; + /* |eob| will contain the index of the next found element where: + * boost[i - old_eob - 1] <= x[zigzag[i]] && y[zigzag[i]] != 0 */ + eob = get_lsb(mask); + /* Need to clear the mask from processed elements so that + * they are no longer counted in the next iteration. */ + ymask &= ~1U << eob; + /* It's safe to read ahead of this buffer if struct VP8_COMP has at + * least 32 bytes before the zrun_zbin_boost_* fields (it has 384). + * Any data read outside of the buffer is masked by the updated |ymask|. */ + zbin_boost0 = _mm_loadu_si128((__m128i *)(zbin_boost_ptr - eob - 1)); + zbin_boost1 = _mm_loadu_si128((__m128i *)(zbin_boost_ptr - eob + 7)); + qcoeff[zig_zag_mask[eob]] = 0xffff; + } + + qcoeff0 = _mm_load_si128((__m128i *)(qcoeff)); + qcoeff1 = _mm_load_si128((__m128i *)(qcoeff + 8)); + qcoeff0 = _mm_and_si128(qcoeff0, y0); + qcoeff1 = _mm_and_si128(qcoeff1, y1); _mm_store_si128((__m128i *)(d->qcoeff), qcoeff0); _mm_store_si128((__m128i *)(d->qcoeff + 8), qcoeff1); @@ -115,5 +134,5 @@ void vp8_regular_quantize_b_sse4_1(BLOCK *b, BLOCKD *d) { _mm_store_si128((__m128i *)(d->dqcoeff), dqcoeff0); _mm_store_si128((__m128i *)(d->dqcoeff + 8), dqcoeff1); - *d->eob = eob; + *d->eob = eob + 1; } diff --git a/vpx_ports/bitops.h b/vpx_ports/bitops.h index 5b2f31cd11..1b5cdaa6dd 100644 --- a/vpx_ports/bitops.h +++ b/vpx_ports/bitops.h @@ -26,20 +26,32 @@ extern "C" { #endif -// These versions of get_msb() are only valid when n != 0 because all -// of the optimized versions are undefined when n == 0: +// These versions of get_lsb() and get_msb() are only valid when n != 0 +// because all of the optimized versions are undefined when n == 0: // https://gcc.gnu.org/onlinedocs/gcc/Other-Builtins.html // use GNU builtins where available. #if defined(__GNUC__) && \ ((__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || __GNUC__ >= 4) +static INLINE int get_lsb(unsigned int n) { + assert(n != 0); + return __builtin_ctz(n); +} + static INLINE int get_msb(unsigned int n) { assert(n != 0); return 31 ^ __builtin_clz(n); } #elif defined(USE_MSC_INTRINSICS) +#pragma intrinsic(_BitScanForward) #pragma intrinsic(_BitScanReverse) +static INLINE int get_lsb(unsigned int n) { + unsigned long first_set_bit; // NOLINT(runtime/int) + _BitScanForward(&first_set_bit, n); + return first_set_bit; +} + static INLINE int get_msb(unsigned int n) { unsigned long first_set_bit; assert(n != 0); @@ -48,6 +60,13 @@ static INLINE int get_msb(unsigned int n) { } #undef USE_MSC_INTRINSICS #else +static INLINE int get_lsb(unsigned int n) { + int i; + assert(n != 0); + for (i = 0; i < 32 && !(n & 1); ++i) n >>= 1; + return i; +} + // Returns (int)floor(log2(n)). n must be > 0. static INLINE int get_msb(unsigned int n) { int log = 0; From 87ce2bc3e3266e670e0da71d7915c3c40e948c15 Mon Sep 17 00:00:00 2001 From: Ilya Kurdyukov Date: Wed, 17 Nov 2021 14:16:02 +0700 Subject: [PATCH 0251/1000] replaced bsr() with get_msb() from bitops.h The modified line should now compile into two instructions instead of four. Change-Id: Ie2eb6b13ff1e29b3107cb9e76f37ff9065504316 --- vp8/encoder/x86/vp8_quantize_ssse3.c | 33 +++++----------------------- 1 file changed, 6 insertions(+), 27 deletions(-) diff --git a/vp8/encoder/x86/vp8_quantize_ssse3.c b/vp8/encoder/x86/vp8_quantize_ssse3.c index 147c30cc35..f6df146f08 100644 --- a/vp8/encoder/x86/vp8_quantize_ssse3.c +++ b/vp8/encoder/x86/vp8_quantize_ssse3.c @@ -12,31 +12,7 @@ #include "./vp8_rtcd.h" #include "vp8/encoder/block.h" - -/* bitscan reverse (bsr) */ -#if defined(_MSC_VER) -#include -#pragma intrinsic(_BitScanReverse) -static int bsr(int mask) { - unsigned long eob; - _BitScanReverse(&eob, mask); - eob++; - if (mask == 0) eob = 0; - return eob; -} -#else -static int bsr(int mask) { - int eob; -#if defined(__GNUC__) && __GNUC__ - __asm__ __volatile__("bsr %1, %0" : "=r"(eob) : "r"(mask) : "flags"); -#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC) - asm volatile("bsr %1, %0" : "=r"(eob) : "r"(mask) : "flags"); -#endif - eob++; - if (mask == 0) eob = 0; - return eob; -} -#endif +#include "vpx_ports/bitops.h" /* get_msb */ void vp8_fast_quantize_b_ssse3(BLOCK *b, BLOCKD *d) { int eob, mask; @@ -108,7 +84,10 @@ void vp8_fast_quantize_b_ssse3(BLOCK *b, BLOCKD *d) { mask = _mm_movemask_epi8(x); - eob = bsr(mask); + /* x2 is needed to increase the result from non-zero masks by 1, + * +1 is needed to mask undefined behavior for a null argument, + * the result of get_msb(1) is 0 */ + eob = get_msb(mask * 2 + 1); - *d->eob = 0xFF & eob; + *d->eob = eob; } From 2d9e4d3c7a07aa50dc5e12c59d190f28b9e1bcb7 Mon Sep 17 00:00:00 2001 From: Fyodor Kyslov Date: Wed, 17 Nov 2021 13:15:00 -0800 Subject: [PATCH 0252/1000] vp9 encoder: fix some integer overflows cap bitrate to 1000Mbps, change bitsaving budget to int64_t this make test coverage for 2048x2048 - same as for vp8 Bug: webm:1749 Fixed: webm:1749 Change-Id: Ic58d73cb7529b0826d1f501ad09af8e80f706a6e --- test/realtime_test.cc | 9 +-------- vp9/encoder/vp9_bitstream.c | 2 +- vp9/vp9_cx_iface.c | 5 +++-- 3 files changed, 5 insertions(+), 11 deletions(-) diff --git a/test/realtime_test.cc b/test/realtime_test.cc index b32a35513c..ab2080a85d 100644 --- a/test/realtime_test.cc +++ b/test/realtime_test.cc @@ -81,14 +81,7 @@ TEST_P(RealtimeTest, RealtimeFirstPassProducesFrames) { EXPECT_EQ(kFramesToEncode, frame_packets_); } -TEST_P(RealtimeTest, IntegerOverflow) { - if (IsVP9()) { - // TODO(https://crbug.com/webm/1749): This should match VP8. - TestIntegerOverflow(800, 480); - } else { - TestIntegerOverflow(2048, 2048); - } -} +TEST_P(RealtimeTest, IntegerOverflow) { TestIntegerOverflow(2048, 2048); } TEST_P(RealtimeTest, IntegerOverflowLarge) { if (IsVP9()) { diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c index 3eff4ce830..c23e150a45 100644 --- a/vp9/encoder/vp9_bitstream.c +++ b/vp9/encoder/vp9_bitstream.c @@ -554,7 +554,7 @@ static void update_coef_probs_common(vpx_writer *const bc, VP9_COMP *cpi, switch (cpi->sf.use_fast_coef_updates) { case TWO_LOOP: { /* dry run to see if there is any update at all needed */ - int savings = 0; + int64_t savings = 0; int update[2] = { 0, 0 }; for (i = 0; i < PLANE_TYPES; ++i) { for (j = 0; j < REF_TYPES; ++j) { diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index 48d555532d..cc4081c4f5 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -523,8 +523,9 @@ static vpx_codec_err_t set_encoder_config( raw_target_rate = (unsigned int)((int64_t)oxcf->width * oxcf->height * oxcf->bit_depth * 3 * oxcf->init_framerate / 1000); - // Cap target bitrate to raw rate - cfg->rc_target_bitrate = VPXMIN(raw_target_rate, cfg->rc_target_bitrate); + // Cap target bitrate to raw rate or 1000Mbps, whichever is less + cfg->rc_target_bitrate = + VPXMIN(VPXMIN(raw_target_rate, cfg->rc_target_bitrate), 1000000); // Convert target bandwidth from Kbit/s to Bit/s oxcf->target_bandwidth = 1000 * (int64_t)cfg->rc_target_bitrate; From 1794f6db24d400ddc834b543510c547d777216cb Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 16 Nov 2021 18:21:34 -0800 Subject: [PATCH 0253/1000] vp9 encoder: fix row-mt crash w/thread config change previously row-mt would allocate thread data once, so increasing the number of threads with a config change would cause a heap overflow. Bug: chromium:1261415 Bug: chromium:1270689 Change-Id: I3c5ec8444ae91964fa34a19dd780bd2cbb0368bf --- test/encode_api_test.cc | 60 +++++++++++++++++++++ vp9/encoder/vp9_encoder.c | 25 ++------- vp9/encoder/vp9_ethread.c | 110 +++++++++++++++++++++++--------------- vp9/encoder/vp9_ethread.h | 5 ++ 4 files changed, 134 insertions(+), 66 deletions(-) diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc index 6f940b629e..6f61c77502 100644 --- a/test/encode_api_test.cc +++ b/test/encode_api_test.cc @@ -10,10 +10,12 @@ #include #include +#include #include "third_party/googletest/src/include/gtest/gtest.h" #include "./vpx_config.h" +#include "test/video_source.h" #include "vpx/vp8cx.h" #include "vpx/vpx_encoder.h" @@ -300,4 +302,62 @@ TEST(EncodeAPI, SetRoi) { } } +void InitCodec(const vpx_codec_iface_t &iface, int width, int height, + vpx_codec_ctx_t *enc, vpx_codec_enc_cfg_t *cfg) { + ASSERT_EQ(vpx_codec_enc_config_default(&iface, cfg, 0), VPX_CODEC_OK); + cfg->g_w = width; + cfg->g_h = height; + cfg->g_lag_in_frames = 0; + cfg->g_pass = VPX_RC_ONE_PASS; + ASSERT_EQ(vpx_codec_enc_init(enc, &iface, cfg, 0), VPX_CODEC_OK); + + ASSERT_EQ(vpx_codec_control_(enc, VP8E_SET_CPUUSED, 2), VPX_CODEC_OK); +} + +// Encodes 1 frame of size |cfg.g_w| x |cfg.g_h| setting |enc|'s configuration +// to |cfg|. +void EncodeWithConfig(const vpx_codec_enc_cfg_t &cfg, vpx_codec_ctx_t *enc) { + libvpx_test::DummyVideoSource video; + video.SetSize(cfg.g_w, cfg.g_h); + video.Begin(); + EXPECT_EQ(vpx_codec_enc_config_set(enc, &cfg), VPX_CODEC_OK) + << vpx_codec_error_detail(enc); + + EXPECT_EQ(vpx_codec_encode(enc, video.img(), video.pts(), video.duration(), + /*flags=*/0, VPX_DL_GOOD_QUALITY), + VPX_CODEC_OK) + << vpx_codec_error_detail(enc); +} + +TEST(EncodeAPI, ConfigChangeThreadCount) { + constexpr int kWidth = 1920; + constexpr int kHeight = 1080; + + for (const auto *iface : kCodecIfaces) { + SCOPED_TRACE(vpx_codec_iface_name(iface)); + for (int i = 0; i < (IsVP9(iface) ? 2 : 1); ++i) { + vpx_codec_enc_cfg_t cfg; + struct Encoder { + ~Encoder() { EXPECT_EQ(vpx_codec_destroy(&ctx), VPX_CODEC_OK); } + vpx_codec_ctx_t ctx = {}; + } enc; + + EXPECT_NO_FATAL_FAILURE( + InitCodec(*iface, kWidth, kHeight, &enc.ctx, &cfg)); + if (IsVP9(iface)) { + EXPECT_EQ(vpx_codec_control_(&enc.ctx, VP9E_SET_TILE_COLUMNS, 6), + VPX_CODEC_OK); + EXPECT_EQ(vpx_codec_control_(&enc.ctx, VP9E_SET_ROW_MT, i), + VPX_CODEC_OK); + } + + for (const auto threads : { 1, 4, 8, 6, 2, 1 }) { + cfg.g_threads = threads; + EXPECT_NO_FATAL_FAILURE(EncodeWithConfig(cfg, &enc.ctx)) + << "iteration: " << i << " threads: " << threads; + } + } + } +} + } // namespace diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 7e80835f6c..8fdd86916f 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -2676,7 +2676,6 @@ static void free_tpl_buffer(VP9_COMP *cpi); void vp9_remove_compressor(VP9_COMP *cpi) { VP9_COMMON *cm; unsigned int i; - int t; if (!cpi) return; @@ -2789,28 +2788,10 @@ void vp9_remove_compressor(VP9_COMP *cpi) { free_tpl_buffer(cpi); - for (t = 0; t < cpi->num_workers; ++t) { - VPxWorker *const worker = &cpi->workers[t]; - EncWorkerData *const thread_data = &cpi->tile_thr_data[t]; - - // Deallocate allocated threads. - vpx_get_worker_interface()->end(worker); - - // Deallocate allocated thread data. - if (t < cpi->num_workers - 1) { - vpx_free(thread_data->td->counts); - vp9_free_pc_tree(thread_data->td); - vpx_free(thread_data->td); - } - } - vpx_free(cpi->tile_thr_data); - vpx_free(cpi->workers); + vp9_loop_filter_dealloc(&cpi->lf_row_sync); + vp9_bitstream_encode_tiles_buffer_dealloc(cpi); vp9_row_mt_mem_dealloc(cpi); - - if (cpi->num_workers > 1) { - vp9_loop_filter_dealloc(&cpi->lf_row_sync); - vp9_bitstream_encode_tiles_buffer_dealloc(cpi); - } + vp9_encode_free_mt_data(cpi); #if !CONFIG_REALTIME_ONLY vp9_alt_ref_aq_destroy(cpi->alt_ref_aq); diff --git a/vp9/encoder/vp9_ethread.c b/vp9/encoder/vp9_ethread.c index e7f8a537d4..453fe2e0df 100644 --- a/vp9/encoder/vp9_ethread.c +++ b/vp9/encoder/vp9_ethread.c @@ -8,6 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include "vp9/common/vp9_thread_common.h" +#include "vp9/encoder/vp9_bitstream.h" #include "vp9/encoder/vp9_encodeframe.h" #include "vp9/encoder/vp9_encoder.h" #include "vp9/encoder/vp9_ethread.h" @@ -79,60 +81,59 @@ static void create_enc_workers(VP9_COMP *cpi, int num_workers) { VP9_COMMON *const cm = &cpi->common; const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); int i; + // While using SVC, we need to allocate threads according to the highest + // resolution. When row based multithreading is enabled, it is OK to + // allocate more threads than the number of max tile columns. + if (cpi->use_svc && !cpi->row_mt) { + int max_tile_cols = get_max_tile_cols(cpi); + num_workers = VPXMIN(cpi->oxcf.max_threads, max_tile_cols); + } + assert(num_workers > 0); + if (num_workers == cpi->num_workers) return; + vp9_loop_filter_dealloc(&cpi->lf_row_sync); + vp9_bitstream_encode_tiles_buffer_dealloc(cpi); + vp9_encode_free_mt_data(cpi); - // Only run once to create threads and allocate thread data. - if (cpi->num_workers == 0) { - int allocated_workers = num_workers; - - // While using SVC, we need to allocate threads according to the highest - // resolution. When row based multithreading is enabled, it is OK to - // allocate more threads than the number of max tile columns. - if (cpi->use_svc && !cpi->row_mt) { - int max_tile_cols = get_max_tile_cols(cpi); - allocated_workers = VPXMIN(cpi->oxcf.max_threads, max_tile_cols); - } - - CHECK_MEM_ERROR(cm, cpi->workers, - vpx_malloc(allocated_workers * sizeof(*cpi->workers))); + CHECK_MEM_ERROR(cm, cpi->workers, + vpx_malloc(num_workers * sizeof(*cpi->workers))); - CHECK_MEM_ERROR(cm, cpi->tile_thr_data, - vpx_calloc(allocated_workers, sizeof(*cpi->tile_thr_data))); + CHECK_MEM_ERROR(cm, cpi->tile_thr_data, + vpx_calloc(num_workers, sizeof(*cpi->tile_thr_data))); - for (i = 0; i < allocated_workers; i++) { - VPxWorker *const worker = &cpi->workers[i]; - EncWorkerData *thread_data = &cpi->tile_thr_data[i]; + for (i = 0; i < num_workers; i++) { + VPxWorker *const worker = &cpi->workers[i]; + EncWorkerData *thread_data = &cpi->tile_thr_data[i]; - ++cpi->num_workers; - winterface->init(worker); + ++cpi->num_workers; + winterface->init(worker); - if (i < allocated_workers - 1) { - thread_data->cpi = cpi; + if (i < num_workers - 1) { + thread_data->cpi = cpi; - // Allocate thread data. - CHECK_MEM_ERROR(cm, thread_data->td, - vpx_memalign(32, sizeof(*thread_data->td))); - vp9_zero(*thread_data->td); + // Allocate thread data. + CHECK_MEM_ERROR(cm, thread_data->td, + vpx_memalign(32, sizeof(*thread_data->td))); + vp9_zero(*thread_data->td); - // Set up pc_tree. - thread_data->td->leaf_tree = NULL; - thread_data->td->pc_tree = NULL; - vp9_setup_pc_tree(cm, thread_data->td); + // Set up pc_tree. + thread_data->td->leaf_tree = NULL; + thread_data->td->pc_tree = NULL; + vp9_setup_pc_tree(cm, thread_data->td); - // Allocate frame counters in thread data. - CHECK_MEM_ERROR(cm, thread_data->td->counts, - vpx_calloc(1, sizeof(*thread_data->td->counts))); + // Allocate frame counters in thread data. + CHECK_MEM_ERROR(cm, thread_data->td->counts, + vpx_calloc(1, sizeof(*thread_data->td->counts))); - // Create threads - if (!winterface->reset(worker)) - vpx_internal_error(&cm->error, VPX_CODEC_ERROR, - "Tile encoder thread creation failed"); - } else { - // Main thread acts as a worker and uses the thread data in cpi. - thread_data->cpi = cpi; - thread_data->td = &cpi->td; - } - winterface->sync(worker); + // Create threads + if (!winterface->reset(worker)) + vpx_internal_error(&cm->error, VPX_CODEC_ERROR, + "Tile encoder thread creation failed"); + } else { + // Main thread acts as a worker and uses the thread data in cpi. + thread_data->cpi = cpi; + thread_data->td = &cpi->td; } + winterface->sync(worker); } } @@ -169,6 +170,27 @@ static void launch_enc_workers(VP9_COMP *cpi, VPxWorkerHook hook, void *data2, } } +void vp9_encode_free_mt_data(struct VP9_COMP *cpi) { + int t; + for (t = 0; t < cpi->num_workers; ++t) { + VPxWorker *const worker = &cpi->workers[t]; + EncWorkerData *const thread_data = &cpi->tile_thr_data[t]; + + // Deallocate allocated threads. + vpx_get_worker_interface()->end(worker); + + // Deallocate allocated thread data. + if (t < cpi->num_workers - 1) { + vpx_free(thread_data->td->counts); + vp9_free_pc_tree(thread_data->td); + vpx_free(thread_data->td); + } + } + vpx_free(cpi->tile_thr_data); + vpx_free(cpi->workers); + cpi->num_workers = 0; +} + void vp9_encode_tiles_mt(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; const int tile_cols = 1 << cm->log2_tile_cols; diff --git a/vp9/encoder/vp9_ethread.h b/vp9/encoder/vp9_ethread.h index cda0293bcf..4c192da515 100644 --- a/vp9/encoder/vp9_ethread.h +++ b/vp9/encoder/vp9_ethread.h @@ -42,6 +42,11 @@ typedef struct VP9RowMTSyncData { int rows; } VP9RowMTSync; +// Frees EncWorkerData related allocations made by vp9_encode_*_mt(). +// row_mt specific data is freed with vp9_row_mt_mem_dealloc() and is not +// called by this function. +void vp9_encode_free_mt_data(struct VP9_COMP *cpi); + void vp9_encode_tiles_mt(struct VP9_COMP *cpi); void vp9_encode_tiles_row_mt(struct VP9_COMP *cpi); From 31b954debe62026f957e9a13354c8c75b12e537a Mon Sep 17 00:00:00 2001 From: James Zern Date: Thu, 2 Dec 2021 13:11:56 -0800 Subject: [PATCH 0254/1000] clear -Wextra-semi/-Wextra-semi-stmt warnings Bug: chromium:1257449 Change-Id: Ia9aafccc09b611521d4a7aedfe3723393a840c62 --- configure | 2 + examples/postproc.c | 2 +- examples/vpx_temporal_svc_encoder.c | 2 +- test/pp_filter_test.cc | 4 +- test/set_roi.cc | 2 +- test/vp8_fdct4x4_test.cc | 4 +- test/yuv_temporal_filter_test.cc | 8 +- tools/tiny_ssim.c | 30 ++-- vp8/common/blockd.h | 2 +- vp8/common/common.h | 12 +- vp8/encoder/encodeframe.c | 13 +- vp8/encoder/encodemv.c | 7 +- vp8/encoder/mcomp.c | 54 +++--- vp8/encoder/onyx_if.c | 2 +- vp9/common/vp9_common.h | 4 +- vp9/encoder/vp9_mcomp.c | 194 +++++++++++---------- vpx/src/vpx_encoder.c | 2 +- vpx_dsp/x86/highbd_convolve_avx2.c | 12 +- vpx_dsp/x86/highbd_variance_sse2.c | 86 ++++----- vpx_dsp/x86/sad_avx2.c | 36 ++-- vpx_dsp/x86/variance_sse2.c | 60 +++---- vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c | 24 +-- vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c | 12 +- vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c | 12 +- vpx_ports/x86.h | 8 +- 25 files changed, 307 insertions(+), 287 deletions(-) diff --git a/configure b/configure index d39db6cb08..b68f9fd781 100755 --- a/configure +++ b/configure @@ -622,6 +622,8 @@ process_toolchain() { check_add_cflags -Wall check_add_cflags -Wdeclaration-after-statement check_add_cflags -Wdisabled-optimization + check_add_cflags -Wextra-semi + check_add_cflags -Wextra-semi-stmt check_add_cflags -Wfloat-conversion check_add_cflags -Wformat=2 check_add_cflags -Wparentheses-equality diff --git a/examples/postproc.c b/examples/postproc.c index be999b429e..b53c15ea15 100644 --- a/examples/postproc.c +++ b/examples/postproc.c @@ -109,7 +109,7 @@ int main(int argc, char **argv) { 0 }; if (vpx_codec_control(&codec, VP8_SET_POSTPROC, &pp)) die_codec(&codec, "Failed to turn on postproc."); - }; + } // Decode the frame with 15ms deadline if (vpx_codec_decode(&codec, frame, (unsigned int)frame_size, NULL, 15000)) diff --git a/examples/vpx_temporal_svc_encoder.c b/examples/vpx_temporal_svc_encoder.c index 47f30751eb..bb761a4117 100644 --- a/examples/vpx_temporal_svc_encoder.c +++ b/examples/vpx_temporal_svc_encoder.c @@ -30,7 +30,7 @@ #define ROI_MAP 0 -#define zero(Dest) memset(&(Dest), 0, sizeof(Dest)); +#define zero(Dest) memset(&(Dest), 0, sizeof(Dest)) static const char *exec_name; diff --git a/test/pp_filter_test.cc b/test/pp_filter_test.cc index a511ffbe98..775f7f36a3 100644 --- a/test/pp_filter_test.cc +++ b/test/pp_filter_test.cc @@ -115,7 +115,7 @@ TEST_P(VpxPostProcDownAndAcrossMbRowTest, CheckFilterOutput) { } vpx_free(flimits_); -}; +} TEST_P(VpxPostProcDownAndAcrossMbRowTest, CheckCvsAssembly) { // Size of the underlying data block that will be filtered. @@ -214,7 +214,7 @@ TEST_P(VpxPostProcDownAndAcrossMbRowTest, DISABLED_Speed) { PrintMedian("16x16"); vpx_free(flimits_); -}; +} class VpxMbPostProcAcrossIpTest : public AbstractBench, diff --git a/test/set_roi.cc b/test/set_roi.cc index f639547523..167cf908fd 100644 --- a/test/set_roi.cc +++ b/test/set_roi.cc @@ -161,6 +161,6 @@ TEST(VP8RoiMapTest, ParameterCheck) { // Free allocated memory if (cpi.segmentation_map) vpx_free(cpi.segmentation_map); if (roi_map) vpx_free(roi_map); -}; +} } // namespace diff --git a/test/vp8_fdct4x4_test.cc b/test/vp8_fdct4x4_test.cc index d5ac253003..3e4305be73 100644 --- a/test/vp8_fdct4x4_test.cc +++ b/test/vp8_fdct4x4_test.cc @@ -148,7 +148,7 @@ TEST_P(FdctTest, SignBiasCheck) { EXPECT_EQ(true, bias_acceptable) << "Error: 4x4 FDCT has a sign bias > 10% for input range [-15, 15]"; -}; +} TEST_P(FdctTest, RoundTripErrorCheck) { int max_error = 0; @@ -181,7 +181,7 @@ TEST_P(FdctTest, RoundTripErrorCheck) { EXPECT_GE(count_test_block, total_error) << "Error: FDCT/IDCT has average roundtrip error > 1 per block"; -}; +} INSTANTIATE_TEST_SUITE_P(C, FdctTest, ::testing::Values(vp8_short_fdct4x4_c)); diff --git a/test/yuv_temporal_filter_test.cc b/test/yuv_temporal_filter_test.cc index cfdc88d896..2bdcf4d86f 100644 --- a/test/yuv_temporal_filter_test.cc +++ b/test/yuv_temporal_filter_test.cc @@ -674,8 +674,8 @@ TEST_P(YUVTemporalFilterTest, DISABLED_Speed) { v_count); \ } -WRAP_HIGHBD_FUNC(vp9_highbd_apply_temporal_filter_c, 10); -WRAP_HIGHBD_FUNC(vp9_highbd_apply_temporal_filter_c, 12); +WRAP_HIGHBD_FUNC(vp9_highbd_apply_temporal_filter_c, 10) +WRAP_HIGHBD_FUNC(vp9_highbd_apply_temporal_filter_c, 12) INSTANTIATE_TEST_SUITE_P( C, YUVTemporalFilterTest, @@ -683,8 +683,8 @@ INSTANTIATE_TEST_SUITE_P( TemporalFilterWithBd(&wrap_vp9_highbd_apply_temporal_filter_c_10, 10), TemporalFilterWithBd(&wrap_vp9_highbd_apply_temporal_filter_c_12, 12))); #if HAVE_SSE4_1 -WRAP_HIGHBD_FUNC(vp9_highbd_apply_temporal_filter_sse4_1, 10); -WRAP_HIGHBD_FUNC(vp9_highbd_apply_temporal_filter_sse4_1, 12); +WRAP_HIGHBD_FUNC(vp9_highbd_apply_temporal_filter_sse4_1, 10) +WRAP_HIGHBD_FUNC(vp9_highbd_apply_temporal_filter_sse4_1, 12) INSTANTIATE_TEST_SUITE_P( SSE4_1, YUVTemporalFilterTest, diff --git a/tools/tiny_ssim.c b/tools/tiny_ssim.c index ff4634ade4..1577970488 100644 --- a/tools/tiny_ssim.c +++ b/tools/tiny_ssim.c @@ -425,20 +425,24 @@ int main(int argc, char *argv[]) { break; } #if CONFIG_VP9_HIGHBITDEPTH -#define psnr_and_ssim(ssim, psnr, buf0, buf1, w, h) \ - if (bit_depth < 9) { \ - ssim = ssim2(buf0, buf1, w, w, w, h); \ - psnr = calc_plane_error(buf0, w, buf1, w, w, h); \ - } else { \ - ssim = highbd_ssim2(CONVERT_TO_BYTEPTR(buf0), CONVERT_TO_BYTEPTR(buf1), w, \ - w, w, h, bit_depth); \ - psnr = calc_plane_error16(CAST_TO_SHORTPTR(buf0), w, \ - CAST_TO_SHORTPTR(buf1), w, w, h); \ - } +#define psnr_and_ssim(ssim, psnr, buf0, buf1, w, h) \ + do { \ + if (bit_depth < 9) { \ + ssim = ssim2(buf0, buf1, w, w, w, h); \ + psnr = calc_plane_error(buf0, w, buf1, w, w, h); \ + } else { \ + ssim = highbd_ssim2(CONVERT_TO_BYTEPTR(buf0), CONVERT_TO_BYTEPTR(buf1), \ + w, w, w, h, bit_depth); \ + psnr = calc_plane_error16(CAST_TO_SHORTPTR(buf0), w, \ + CAST_TO_SHORTPTR(buf1), w, w, h); \ + } \ + } while (0) #else -#define psnr_and_ssim(ssim, psnr, buf0, buf1, w, h) \ - ssim = ssim2(buf0, buf1, w, w, w, h); \ - psnr = calc_plane_error(buf0, w, buf1, w, w, h); +#define psnr_and_ssim(ssim, psnr, buf0, buf1, w, h) \ + do { \ + ssim = ssim2(buf0, buf1, w, w, w, h); \ + psnr = calc_plane_error(buf0, w, buf1, w, w, h); \ + } while (0) #endif // CONFIG_VP9_HIGHBITDEPTH if (n_frames == allocated_frames) { diff --git a/vp8/common/blockd.h b/vp8/common/blockd.h index 02abe053cb..405443449d 100644 --- a/vp8/common/blockd.h +++ b/vp8/common/blockd.h @@ -58,7 +58,7 @@ typedef struct { extern const unsigned char vp8_block2left[25]; extern const unsigned char vp8_block2above[25]; -#define VP8_COMBINEENTROPYCONTEXTS(Dest, A, B) Dest = (A) + (B); +#define VP8_COMBINEENTROPYCONTEXTS(Dest, A, B) Dest = (A) + (B) typedef enum { KEY_FRAME = 0, INTER_FRAME = 1 } FRAME_TYPE; diff --git a/vp8/common/common.h b/vp8/common/common.h index 2c30e8d6c5..562569f9ab 100644 --- a/vp8/common/common.h +++ b/vp8/common/common.h @@ -24,22 +24,22 @@ extern "C" { /* Only need this for fixed-size arrays, for structs just assign. */ #define vp8_copy(Dest, Src) \ - { \ + do { \ assert(sizeof(Dest) == sizeof(Src)); \ memcpy(Dest, Src, sizeof(Src)); \ - } + } while (0) /* Use this for variably-sized arrays. */ #define vp8_copy_array(Dest, Src, N) \ - { \ + do { \ assert(sizeof(*(Dest)) == sizeof(*(Src))); \ memcpy(Dest, Src, (N) * sizeof(*(Src))); \ - } + } while (0) -#define vp8_zero(Dest) memset(&(Dest), 0, sizeof(Dest)); +#define vp8_zero(Dest) memset(&(Dest), 0, sizeof(Dest)) -#define vp8_zero_array(Dest, N) memset(Dest, 0, (N) * sizeof(*(Dest))); +#define vp8_zero_array(Dest, N) memset(Dest, 0, (N) * sizeof(*(Dest))) #ifdef __cplusplus } // extern "C" diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c index 69271f1a73..4df35f6edb 100644 --- a/vp8/encoder/encodeframe.c +++ b/vp8/encoder/encodeframe.c @@ -639,7 +639,8 @@ static void init_encode_frame_mb_context(VP8_COMP *cpi) { vp8_zero(x->coef_counts); vp8_zero(x->ymode_count); - vp8_zero(x->uv_mode_count) x->prediction_error = 0; + vp8_zero(x->uv_mode_count); + x->prediction_error = 0; x->intra_error = 0; vp8_zero(x->count_mb_ref_frame_usage); } @@ -766,12 +767,12 @@ void vp8_encode_frame(VP8_COMP *cpi) { for (mb_row = 0; mb_row < cm->mb_rows; mb_row += (cpi->encoding_thread_count + 1)) { - vp8_zero(cm->left_context) + vp8_zero(cm->left_context); #if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING - tp = cpi->tok; + tp = cpi->tok; #else - tp = cpi->tok + mb_row * (cm->mb_cols * 16 * 24); + tp = cpi->tok + mb_row * (cm->mb_cols * 16 * 24); #endif encode_mb_row(cpi, cm, mb_row, x, xd, &tp, segment_counts, &totalrate); @@ -858,10 +859,10 @@ void vp8_encode_frame(VP8_COMP *cpi) { /* for each macroblock row in image */ for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) { - vp8_zero(cm->left_context) + vp8_zero(cm->left_context); #if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING - tp = cpi->tok; + tp = cpi->tok; #endif encode_mb_row(cpi, cm, mb_row, x, xd, &tp, segment_counts, &totalrate); diff --git a/vp8/encoder/encodemv.c b/vp8/encoder/encodemv.c index ff38965393..c88ea1653e 100644 --- a/vp8/encoder/encodemv.c +++ b/vp8/encoder/encodemv.c @@ -205,8 +205,11 @@ static void write_component_probs(vp8_writer *const w, (void)rc; vp8_copy_array(Pnew, default_mvc, MVPcount); - vp8_zero(is_short_ct) vp8_zero(sign_ct) vp8_zero(bit_ct) vp8_zero(short_ct) - vp8_zero(short_bct) + vp8_zero(is_short_ct); + vp8_zero(sign_ct); + vp8_zero(bit_ct); + vp8_zero(short_ct); + vp8_zero(short_bct); /* j=0 */ { diff --git a/vp8/encoder/mcomp.c b/vp8/encoder/mcomp.c index 9e7f5c7ace..4ab6c7b3d0 100644 --- a/vp8/encoder/mcomp.c +++ b/vp8/encoder/mcomp.c @@ -204,19 +204,21 @@ void vp8_init3smotion_compensation(MACROBLOCK *x, int stride) { /* returns distortion + motion vector cost */ #define ERR(r, c) (MVC(r, c) + DIST(r, c)) /* checks if (r,c) has better score than previous best */ -#define CHECK_BETTER(v, r, c) \ - IFMVCV(r, c, \ - { \ - thismse = DIST(r, c); \ - if ((v = (MVC(r, c) + thismse)) < besterr) { \ - besterr = v; \ - br = r; \ - bc = c; \ - *distortion = thismse; \ - *sse1 = sse; \ - } \ - }, \ - v = UINT_MAX;) +#define CHECK_BETTER(v, r, c) \ + do { \ + IFMVCV(r, c, \ + { \ + thismse = DIST(r, c); \ + if ((v = (MVC(r, c) + thismse)) < besterr) { \ + besterr = v; \ + br = r; \ + bc = c; \ + *distortion = thismse; \ + *sse1 = sse; \ + } \ + }, \ + v = UINT_MAX;) \ + } while (0) int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *bestmv, int_mv *ref_mv, @@ -800,13 +802,13 @@ int vp8_find_best_half_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, } #define CHECK_BOUNDS(range) \ - { \ + do { \ all_in = 1; \ all_in &= ((br - range) >= x->mv_row_min); \ all_in &= ((br + range) <= x->mv_row_max); \ all_in &= ((bc - range) >= x->mv_col_min); \ all_in &= ((bc + range) <= x->mv_col_max); \ - } + } while (0) #define CHECK_POINT \ { \ @@ -817,7 +819,7 @@ int vp8_find_best_half_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, } #define CHECK_BETTER \ - { \ + do { \ if (thissad < bestsad) { \ thissad += \ mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, sad_per_bit); \ @@ -826,7 +828,7 @@ int vp8_find_best_half_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, best_site = i; \ } \ } \ - } + } while (0) static const MV next_chkpts[6][3] = { { { -2, 0 }, { -1, -2 }, { 1, -2 } }, { { -1, -2 }, { 1, -2 }, { 2, 0 } }, @@ -901,7 +903,7 @@ int vp8_hex_search(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, #endif /* hex search */ - CHECK_BOUNDS(2) + CHECK_BOUNDS(2); if (all_in) { for (i = 0; i < 6; ++i) { @@ -910,7 +912,7 @@ int vp8_hex_search(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, this_offset = base_offset + (this_mv.as_mv.row * in_what_stride) + this_mv.as_mv.col; thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride); - CHECK_BETTER + CHECK_BETTER; } } else { for (i = 0; i < 6; ++i) { @@ -920,7 +922,7 @@ int vp8_hex_search(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, this_offset = base_offset + (this_mv.as_mv.row * in_what_stride) + this_mv.as_mv.col; thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride); - CHECK_BETTER + CHECK_BETTER; } } @@ -934,7 +936,7 @@ int vp8_hex_search(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, for (j = 1; j < hex_range; ++j) { best_site = -1; - CHECK_BOUNDS(2) + CHECK_BOUNDS(2); if (all_in) { for (i = 0; i < 3; ++i) { @@ -943,7 +945,7 @@ int vp8_hex_search(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + this_mv.as_mv.col; thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride); - CHECK_BETTER + CHECK_BETTER; } } else { for (i = 0; i < 3; ++i) { @@ -953,7 +955,7 @@ int vp8_hex_search(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + this_mv.as_mv.col; thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride); - CHECK_BETTER + CHECK_BETTER; } } @@ -975,7 +977,7 @@ int vp8_hex_search(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, cal_neighbors: for (j = 0; j < dia_range; ++j) { best_site = -1; - CHECK_BOUNDS(1) + CHECK_BOUNDS(1); if (all_in) { for (i = 0; i < 4; ++i) { @@ -984,7 +986,7 @@ int vp8_hex_search(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + this_mv.as_mv.col; thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride); - CHECK_BETTER + CHECK_BETTER; } } else { for (i = 0; i < 4; ++i) { @@ -994,7 +996,7 @@ int vp8_hex_search(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + this_mv.as_mv.col; thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride); - CHECK_BETTER + CHECK_BETTER; } } diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index 15c9d72f5d..59bce951e0 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -1023,7 +1023,7 @@ void vp8_set_speed_features(VP8_COMP *cpi) { memset(cpi->mb.error_bins, 0, sizeof(cpi->mb.error_bins)); - }; /* switch */ + } /* switch */ /* Slow quant, dct and trellis not worthwhile for first pass * so make sure they are always turned off. diff --git a/vp9/common/vp9_common.h b/vp9/common/vp9_common.h index e3c5535ddb..3cec53bfd8 100644 --- a/vp9/common/vp9_common.h +++ b/vp9/common/vp9_common.h @@ -27,10 +27,10 @@ extern "C" { // Only need this for fixed-size arrays, for structs just assign. #define vp9_copy(dest, src) \ - { \ + do { \ assert(sizeof(dest) == sizeof(src)); \ memcpy(dest, src, sizeof(src)); \ - } + } while (0) // Use this for variably-sized arrays. #define vp9_copy_array(dest, src, n) \ diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c index ac29f36ec1..cd67064203 100644 --- a/vp9/encoder/vp9_mcomp.c +++ b/vp9/encoder/vp9_mcomp.c @@ -159,59 +159,63 @@ static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) { #if CONFIG_VP9_HIGHBITDEPTH /* checks if (r, c) has better score than previous best */ -#define CHECK_BETTER(v, r, c) \ - if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \ - int64_t tmpmse; \ - const MV mv = { r, c }; \ - const MV ref_mv = { rr, rc }; \ - if (second_pred == NULL) { \ - thismse = vfp->svf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, \ - src_stride, &sse); \ - } else { \ - thismse = vfp->svaf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, \ - src_stride, &sse, second_pred); \ - } \ - tmpmse = thismse; \ - tmpmse += mv_err_cost(&mv, &ref_mv, mvjcost, mvcost, error_per_bit); \ - if (tmpmse >= INT_MAX) { \ - v = INT_MAX; \ - } else if ((v = (uint32_t)tmpmse) < besterr) { \ - besterr = v; \ - br = r; \ - bc = c; \ - *distortion = thismse; \ - *sse1 = sse; \ - } \ - } else { \ - v = INT_MAX; \ - } +#define CHECK_BETTER(v, r, c) \ + do { \ + if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \ + int64_t tmpmse; \ + const MV mv = { r, c }; \ + const MV ref_mv = { rr, rc }; \ + if (second_pred == NULL) { \ + thismse = vfp->svf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, \ + src_stride, &sse); \ + } else { \ + thismse = vfp->svaf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, \ + src_stride, &sse, second_pred); \ + } \ + tmpmse = thismse; \ + tmpmse += mv_err_cost(&mv, &ref_mv, mvjcost, mvcost, error_per_bit); \ + if (tmpmse >= INT_MAX) { \ + v = INT_MAX; \ + } else if ((v = (uint32_t)tmpmse) < besterr) { \ + besterr = v; \ + br = r; \ + bc = c; \ + *distortion = thismse; \ + *sse1 = sse; \ + } \ + } else { \ + v = INT_MAX; \ + } \ + } while (0) #else /* checks if (r, c) has better score than previous best */ -#define CHECK_BETTER(v, r, c) \ - if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \ - const MV mv = { r, c }; \ - const MV ref_mv = { rr, rc }; \ - if (second_pred == NULL) \ - thismse = vfp->svf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, \ - src_stride, &sse); \ - else \ - thismse = vfp->svaf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, \ - src_stride, &sse, second_pred); \ - if ((v = mv_err_cost(&mv, &ref_mv, mvjcost, mvcost, error_per_bit) + \ - thismse) < besterr) { \ - besterr = v; \ - br = r; \ - bc = c; \ - *distortion = thismse; \ - *sse1 = sse; \ - } \ - } else { \ - v = INT_MAX; \ - } +#define CHECK_BETTER(v, r, c) \ + do { \ + if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \ + const MV mv = { r, c }; \ + const MV ref_mv = { rr, rc }; \ + if (second_pred == NULL) \ + thismse = vfp->svf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, \ + src_stride, &sse); \ + else \ + thismse = vfp->svaf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, \ + src_stride, &sse, second_pred); \ + if ((v = mv_err_cost(&mv, &ref_mv, mvjcost, mvcost, error_per_bit) + \ + thismse) < besterr) { \ + besterr = v; \ + br = r; \ + bc = c; \ + *distortion = thismse; \ + *sse1 = sse; \ + } \ + } else { \ + v = INT_MAX; \ + } \ + } while (0) #endif #define FIRST_LEVEL_CHECKS \ - { \ + do { \ unsigned int left, right, up, down, diag; \ CHECK_BETTER(left, tr, tc - hstep); \ CHECK_BETTER(right, tr, tc + hstep); \ @@ -224,10 +228,10 @@ static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) { case 2: CHECK_BETTER(diag, tr + hstep, tc - hstep); break; \ case 3: CHECK_BETTER(diag, tr + hstep, tc + hstep); break; \ } \ - } + } while (0) #define SECOND_LEVEL_CHECKS \ - { \ + do { \ int kr, kc; \ unsigned int second; \ if (tr != br && tc != bc) { \ @@ -256,7 +260,7 @@ static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) { case 3: CHECK_BETTER(second, tr + kr, tc - hstep); break; \ } \ } \ - } + } while (0) #define SETUP_SUBPEL_SEARCH \ const uint8_t *const z = x->plane[0].src.buf; \ @@ -290,7 +294,7 @@ static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) { maxr = subpel_mv_limits.row_max; \ \ bestmv->row *= 8; \ - bestmv->col *= 8; + bestmv->col *= 8 static unsigned int setup_center_error( const MACROBLOCKD *xd, const MV *bestmv, const MV *ref_mv, @@ -678,48 +682,52 @@ static int accurate_sub_pel_search( // TODO(yunqing): this part can be further refactored. #if CONFIG_VP9_HIGHBITDEPTH /* checks if (r, c) has better score than previous best */ -#define CHECK_BETTER1(v, r, c) \ - if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \ - int64_t tmpmse; \ - const MV mv = { r, c }; \ - const MV ref_mv = { rr, rc }; \ - thismse = \ - accurate_sub_pel_search(xd, &mv, x->me_sf, kernel, vfp, z, src_stride, \ - y, y_stride, second_pred, w, h, &sse); \ - tmpmse = thismse; \ - tmpmse += mv_err_cost(&mv, &ref_mv, mvjcost, mvcost, error_per_bit); \ - if (tmpmse >= INT_MAX) { \ - v = INT_MAX; \ - } else if ((v = (uint32_t)tmpmse) < besterr) { \ - besterr = v; \ - br = r; \ - bc = c; \ - *distortion = thismse; \ - *sse1 = sse; \ - } \ - } else { \ - v = INT_MAX; \ - } +#define CHECK_BETTER1(v, r, c) \ + do { \ + if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \ + int64_t tmpmse; \ + const MV mv = { r, c }; \ + const MV ref_mv = { rr, rc }; \ + thismse = accurate_sub_pel_search(xd, &mv, x->me_sf, kernel, vfp, z, \ + src_stride, y, y_stride, second_pred, \ + w, h, &sse); \ + tmpmse = thismse; \ + tmpmse += mv_err_cost(&mv, &ref_mv, mvjcost, mvcost, error_per_bit); \ + if (tmpmse >= INT_MAX) { \ + v = INT_MAX; \ + } else if ((v = (uint32_t)tmpmse) < besterr) { \ + besterr = v; \ + br = r; \ + bc = c; \ + *distortion = thismse; \ + *sse1 = sse; \ + } \ + } else { \ + v = INT_MAX; \ + } \ + } while (0) #else /* checks if (r, c) has better score than previous best */ -#define CHECK_BETTER1(v, r, c) \ - if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \ - const MV mv = { r, c }; \ - const MV ref_mv = { rr, rc }; \ - thismse = \ - accurate_sub_pel_search(xd, &mv, x->me_sf, kernel, vfp, z, src_stride, \ - y, y_stride, second_pred, w, h, &sse); \ - if ((v = mv_err_cost(&mv, &ref_mv, mvjcost, mvcost, error_per_bit) + \ - thismse) < besterr) { \ - besterr = v; \ - br = r; \ - bc = c; \ - *distortion = thismse; \ - *sse1 = sse; \ - } \ - } else { \ - v = INT_MAX; \ - } +#define CHECK_BETTER1(v, r, c) \ + do { \ + if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \ + const MV mv = { r, c }; \ + const MV ref_mv = { rr, rc }; \ + thismse = accurate_sub_pel_search(xd, &mv, x->me_sf, kernel, vfp, z, \ + src_stride, y, y_stride, second_pred, \ + w, h, &sse); \ + if ((v = mv_err_cost(&mv, &ref_mv, mvjcost, mvcost, error_per_bit) + \ + thismse) < besterr) { \ + besterr = v; \ + br = r; \ + bc = c; \ + *distortion = thismse; \ + *sse1 = sse; \ + } \ + } else { \ + v = INT_MAX; \ + } \ + } while (0) #endif @@ -2962,7 +2970,7 @@ int vp9_full_pixel_search(const VP9_COMP *const cpi, const MACROBLOCK *const x, (void)sse; \ (void)thismse; \ (void)cost_list; \ - (void)use_accurate_subpel_search; + (void)use_accurate_subpel_search // Return the maximum MV. uint32_t vp9_return_max_sub_pixel_mv( diff --git a/vpx/src/vpx_encoder.c b/vpx/src/vpx_encoder.c index f636b54a33..846638fe55 100644 --- a/vpx/src/vpx_encoder.c +++ b/vpx/src/vpx_encoder.c @@ -173,7 +173,7 @@ vpx_codec_err_t vpx_codec_enc_config_default(vpx_codec_iface_t *iface, #include "vpx_ports/x86.h" #define FLOATING_POINT_INIT() \ do { \ - unsigned short x87_orig_mode = x87_set_double_precision(); + unsigned short x87_orig_mode = x87_set_double_precision() #define FLOATING_POINT_RESTORE() \ x87_set_control_word(x87_orig_mode); \ } \ diff --git a/vpx_dsp/x86/highbd_convolve_avx2.c b/vpx_dsp/x86/highbd_convolve_avx2.c index 3209625617..01a52ec8bf 100644 --- a/vpx_dsp/x86/highbd_convolve_avx2.c +++ b/vpx_dsp/x86/highbd_convolve_avx2.c @@ -1465,10 +1465,10 @@ highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_sse2; #define vpx_highbd_filter_block1d4_h4_avg_avx2 \ vpx_highbd_filter_block1d4_h8_avg_avx2 -HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2, 0); +HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2, 0) HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v, - src - src_stride * (num_taps / 2 - 1), , avx2, 0); -HIGH_FUN_CONV_2D(, avx2, 0); + src - src_stride * (num_taps / 2 - 1), , avx2, 0) +HIGH_FUN_CONV_2D(, avx2, 0) // From vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm. highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_avg_sse2; @@ -1487,9 +1487,9 @@ highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_avg_sse2; #define vpx_highbd_filter_block1d4_v2_avg_avx2 \ vpx_highbd_filter_block1d4_v2_avg_sse2 -HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2, 1); +HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2, 1) HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, - src - src_stride * (num_taps / 2 - 1), avg_, avx2, 1); -HIGH_FUN_CONV_2D(avg_, avx2, 1); + src - src_stride * (num_taps / 2 - 1), avg_, avx2, 1) +HIGH_FUN_CONV_2D(avg_, avx2, 1) #undef HIGHBD_FUNC diff --git a/vpx_dsp/x86/highbd_variance_sse2.c b/vpx_dsp/x86/highbd_variance_sse2.c index dd6cfbb2c4..7c8d79b09e 100644 --- a/vpx_dsp/x86/highbd_variance_sse2.c +++ b/vpx_dsp/x86/highbd_variance_sse2.c @@ -121,8 +121,8 @@ static void highbd_12_variance_sse2(const uint16_t *src, int src_stride, *sse = ROUND_POWER_OF_TWO(*sse, 8); \ } -HIGH_GET_VAR(16); -HIGH_GET_VAR(8); +HIGH_GET_VAR(16) +HIGH_GET_VAR(8) #undef HIGH_GET_VAR @@ -167,16 +167,16 @@ HIGH_GET_VAR(8); return (var >= 0) ? (uint32_t)var : 0; \ } -VAR_FN(64, 64, 16, 12); -VAR_FN(64, 32, 16, 11); -VAR_FN(32, 64, 16, 11); -VAR_FN(32, 32, 16, 10); -VAR_FN(32, 16, 16, 9); -VAR_FN(16, 32, 16, 9); -VAR_FN(16, 16, 16, 8); -VAR_FN(16, 8, 8, 7); -VAR_FN(8, 16, 8, 7); -VAR_FN(8, 8, 8, 6); +VAR_FN(64, 64, 16, 12) +VAR_FN(64, 32, 16, 11) +VAR_FN(32, 64, 16, 11) +VAR_FN(32, 32, 16, 10) +VAR_FN(32, 16, 16, 9) +VAR_FN(16, 32, 16, 9) +VAR_FN(16, 16, 16, 8) +VAR_FN(16, 8, 8, 7) +VAR_FN(8, 16, 8, 7) +VAR_FN(8, 8, 8, 6) #undef VAR_FN @@ -255,10 +255,10 @@ unsigned int vpx_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride, const uint16_t *ref, ptrdiff_t ref_stride, int height, \ unsigned int *sse, void *unused0, void *unused); #define DECLS(opt) \ - DECL(8, opt); \ + DECL(8, opt) \ DECL(16, opt) -DECLS(sse2); +DECLS(sse2) #undef DECLS #undef DECL @@ -383,20 +383,20 @@ DECLS(sse2); return (var >= 0) ? (uint32_t)var : 0; \ } -#define FNS(opt) \ - FN(64, 64, 16, 6, 6, opt, (int64_t)); \ - FN(64, 32, 16, 6, 5, opt, (int64_t)); \ - FN(32, 64, 16, 5, 6, opt, (int64_t)); \ - FN(32, 32, 16, 5, 5, opt, (int64_t)); \ - FN(32, 16, 16, 5, 4, opt, (int64_t)); \ - FN(16, 32, 16, 4, 5, opt, (int64_t)); \ - FN(16, 16, 16, 4, 4, opt, (int64_t)); \ - FN(16, 8, 16, 4, 3, opt, (int64_t)); \ - FN(8, 16, 8, 3, 4, opt, (int64_t)); \ - FN(8, 8, 8, 3, 3, opt, (int64_t)); \ - FN(8, 4, 8, 3, 2, opt, (int64_t)); - -FNS(sse2); +#define FNS(opt) \ + FN(64, 64, 16, 6, 6, opt, (int64_t)) \ + FN(64, 32, 16, 6, 5, opt, (int64_t)) \ + FN(32, 64, 16, 5, 6, opt, (int64_t)) \ + FN(32, 32, 16, 5, 5, opt, (int64_t)) \ + FN(32, 16, 16, 5, 4, opt, (int64_t)) \ + FN(16, 32, 16, 4, 5, opt, (int64_t)) \ + FN(16, 16, 16, 4, 4, opt, (int64_t)) \ + FN(16, 8, 16, 4, 3, opt, (int64_t)) \ + FN(8, 16, 8, 3, 4, opt, (int64_t)) \ + FN(8, 8, 8, 3, 3, opt, (int64_t)) \ + FN(8, 4, 8, 3, 2, opt, (int64_t)) + +FNS(sse2) #undef FNS #undef FN @@ -412,7 +412,7 @@ FNS(sse2); DECL(16, opt1) \ DECL(8, opt1) -DECLS(sse2); +DECLS(sse2) #undef DECL #undef DECLS @@ -542,20 +542,20 @@ DECLS(sse2); return (var >= 0) ? (uint32_t)var : 0; \ } -#define FNS(opt1) \ - FN(64, 64, 16, 6, 6, opt1, (int64_t)); \ - FN(64, 32, 16, 6, 5, opt1, (int64_t)); \ - FN(32, 64, 16, 5, 6, opt1, (int64_t)); \ - FN(32, 32, 16, 5, 5, opt1, (int64_t)); \ - FN(32, 16, 16, 5, 4, opt1, (int64_t)); \ - FN(16, 32, 16, 4, 5, opt1, (int64_t)); \ - FN(16, 16, 16, 4, 4, opt1, (int64_t)); \ - FN(16, 8, 16, 4, 3, opt1, (int64_t)); \ - FN(8, 16, 8, 4, 3, opt1, (int64_t)); \ - FN(8, 8, 8, 3, 3, opt1, (int64_t)); \ - FN(8, 4, 8, 3, 2, opt1, (int64_t)); - -FNS(sse2); +#define FNS(opt1) \ + FN(64, 64, 16, 6, 6, opt1, (int64_t)) \ + FN(64, 32, 16, 6, 5, opt1, (int64_t)) \ + FN(32, 64, 16, 5, 6, opt1, (int64_t)) \ + FN(32, 32, 16, 5, 5, opt1, (int64_t)) \ + FN(32, 16, 16, 5, 4, opt1, (int64_t)) \ + FN(16, 32, 16, 4, 5, opt1, (int64_t)) \ + FN(16, 16, 16, 4, 4, opt1, (int64_t)) \ + FN(16, 8, 16, 4, 3, opt1, (int64_t)) \ + FN(8, 16, 8, 4, 3, opt1, (int64_t)) \ + FN(8, 8, 8, 3, 3, opt1, (int64_t)) \ + FN(8, 4, 8, 3, 2, opt1, (int64_t)) + +FNS(sse2) #undef FNS #undef FN diff --git a/vpx_dsp/x86/sad_avx2.c b/vpx_dsp/x86/sad_avx2.c index d944134305..3b48acd510 100644 --- a/vpx_dsp/x86/sad_avx2.c +++ b/vpx_dsp/x86/sad_avx2.c @@ -71,17 +71,17 @@ return res; \ } -#define FSAD64 \ - FSAD64_H(64); \ - FSAD64_H(32); +#define FSAD64 \ + FSAD64_H(64) \ + FSAD64_H(32) -#define FSAD32 \ - FSAD32_H(64); \ - FSAD32_H(32); \ - FSAD32_H(16); +#define FSAD32 \ + FSAD32_H(64) \ + FSAD32_H(32) \ + FSAD32_H(16) -FSAD64; -FSAD32; +FSAD64 +FSAD32 #undef FSAD64 #undef FSAD32 @@ -160,17 +160,17 @@ FSAD32; return res; \ } -#define FSADAVG64 \ - FSADAVG64_H(64); \ - FSADAVG64_H(32); +#define FSADAVG64 \ + FSADAVG64_H(64) \ + FSADAVG64_H(32) -#define FSADAVG32 \ - FSADAVG32_H(64); \ - FSADAVG32_H(32); \ - FSADAVG32_H(16); +#define FSADAVG32 \ + FSADAVG32_H(64) \ + FSADAVG32_H(32) \ + FSADAVG32_H(16) -FSADAVG64; -FSADAVG32; +FSADAVG64 +FSADAVG32 #undef FSADAVG64 #undef FSADAVG32 diff --git a/vpx_dsp/x86/variance_sse2.c b/vpx_dsp/x86/variance_sse2.c index 67645c57ac..a67c92aadb 100644 --- a/vpx_dsp/x86/variance_sse2.c +++ b/vpx_dsp/x86/variance_sse2.c @@ -471,23 +471,23 @@ DECLS(ssse3, ssse3); (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \ } -#define FNS(opt1, opt2) \ - FN(64, 64, 16, 6, 6, opt1, (int64_t), (int64_t)); \ - FN(64, 32, 16, 6, 5, opt1, (int64_t), (int64_t)); \ - FN(32, 64, 16, 5, 6, opt1, (int64_t), (int64_t)); \ - FN(32, 32, 16, 5, 5, opt1, (int64_t), (int64_t)); \ - FN(32, 16, 16, 5, 4, opt1, (int64_t), (int64_t)); \ - FN(16, 32, 16, 4, 5, opt1, (int64_t), (int64_t)); \ - FN(16, 16, 16, 4, 4, opt1, (uint32_t), (int64_t)); \ - FN(16, 8, 16, 4, 3, opt1, (int32_t), (int32_t)); \ - FN(8, 16, 8, 3, 4, opt1, (int32_t), (int32_t)); \ - FN(8, 8, 8, 3, 3, opt1, (int32_t), (int32_t)); \ - FN(8, 4, 8, 3, 2, opt1, (int32_t), (int32_t)); \ - FN(4, 8, 4, 2, 3, opt1, (int32_t), (int32_t)); \ +#define FNS(opt1, opt2) \ + FN(64, 64, 16, 6, 6, opt1, (int64_t), (int64_t)) \ + FN(64, 32, 16, 6, 5, opt1, (int64_t), (int64_t)) \ + FN(32, 64, 16, 5, 6, opt1, (int64_t), (int64_t)) \ + FN(32, 32, 16, 5, 5, opt1, (int64_t), (int64_t)) \ + FN(32, 16, 16, 5, 4, opt1, (int64_t), (int64_t)) \ + FN(16, 32, 16, 4, 5, opt1, (int64_t), (int64_t)) \ + FN(16, 16, 16, 4, 4, opt1, (uint32_t), (int64_t)) \ + FN(16, 8, 16, 4, 3, opt1, (int32_t), (int32_t)) \ + FN(8, 16, 8, 3, 4, opt1, (int32_t), (int32_t)) \ + FN(8, 8, 8, 3, 3, opt1, (int32_t), (int32_t)) \ + FN(8, 4, 8, 3, 2, opt1, (int32_t), (int32_t)) \ + FN(4, 8, 4, 2, 3, opt1, (int32_t), (int32_t)) \ FN(4, 4, 4, 2, 2, opt1, (int32_t), (int32_t)) -FNS(sse2, sse2); -FNS(ssse3, ssse3); +FNS(sse2, sse2) +FNS(ssse3, ssse3) #undef FNS #undef FN @@ -543,23 +543,23 @@ DECLS(ssse3, ssse3); (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \ } -#define FNS(opt1, opt2) \ - FN(64, 64, 16, 6, 6, opt1, (int64_t), (int64_t)); \ - FN(64, 32, 16, 6, 5, opt1, (int64_t), (int64_t)); \ - FN(32, 64, 16, 5, 6, opt1, (int64_t), (int64_t)); \ - FN(32, 32, 16, 5, 5, opt1, (int64_t), (int64_t)); \ - FN(32, 16, 16, 5, 4, opt1, (int64_t), (int64_t)); \ - FN(16, 32, 16, 4, 5, opt1, (int64_t), (int64_t)); \ - FN(16, 16, 16, 4, 4, opt1, (uint32_t), (int64_t)); \ - FN(16, 8, 16, 4, 3, opt1, (uint32_t), (int32_t)); \ - FN(8, 16, 8, 3, 4, opt1, (uint32_t), (int32_t)); \ - FN(8, 8, 8, 3, 3, opt1, (uint32_t), (int32_t)); \ - FN(8, 4, 8, 3, 2, opt1, (uint32_t), (int32_t)); \ - FN(4, 8, 4, 2, 3, opt1, (uint32_t), (int32_t)); \ +#define FNS(opt1, opt2) \ + FN(64, 64, 16, 6, 6, opt1, (int64_t), (int64_t)) \ + FN(64, 32, 16, 6, 5, opt1, (int64_t), (int64_t)) \ + FN(32, 64, 16, 5, 6, opt1, (int64_t), (int64_t)) \ + FN(32, 32, 16, 5, 5, opt1, (int64_t), (int64_t)) \ + FN(32, 16, 16, 5, 4, opt1, (int64_t), (int64_t)) \ + FN(16, 32, 16, 4, 5, opt1, (int64_t), (int64_t)) \ + FN(16, 16, 16, 4, 4, opt1, (uint32_t), (int64_t)) \ + FN(16, 8, 16, 4, 3, opt1, (uint32_t), (int32_t)) \ + FN(8, 16, 8, 3, 4, opt1, (uint32_t), (int32_t)) \ + FN(8, 8, 8, 3, 3, opt1, (uint32_t), (int32_t)) \ + FN(8, 4, 8, 3, 2, opt1, (uint32_t), (int32_t)) \ + FN(4, 8, 4, 2, 3, opt1, (uint32_t), (int32_t)) \ FN(4, 4, 4, 2, 2, opt1, (uint32_t), (int32_t)) -FNS(sse2, sse); -FNS(ssse3, ssse3); +FNS(sse2, sse) +FNS(ssse3, ssse3) #undef FNS #undef FN diff --git a/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c b/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c index 2391790284..0cbd151dc3 100644 --- a/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c +++ b/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c @@ -1040,12 +1040,12 @@ filter8_1dfunction vpx_filter_block1d4_h2_avg_sse2; // const InterpKernel *filter, int x0_q4, // int32_t x_step_q4, int y0_q4, int y_step_q4, // int w, int h); -FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2, 0); +FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2, 0) FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - (num_taps / 2 - 1) * src_stride, , - sse2, 0); -FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, sse2, 1); + sse2, 0) +FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, sse2, 1) FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, - src - (num_taps / 2 - 1) * src_stride, avg_, sse2, 1); + src - (num_taps / 2 - 1) * src_stride, avg_, sse2, 1) // void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, @@ -1057,8 +1057,8 @@ FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, // const InterpKernel *filter, int x0_q4, // int32_t x_step_q4, int y0_q4, int y_step_q4, // int w, int h); -FUN_CONV_2D(, sse2, 0); -FUN_CONV_2D(avg_, sse2, 1); +FUN_CONV_2D(, sse2, 0) +FUN_CONV_2D(avg_, sse2, 1) #if CONFIG_VP9_HIGHBITDEPTH && VPX_ARCH_X86_64 // From vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm. @@ -1139,12 +1139,12 @@ highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_avg_sse2; // const int16_t *filter_y, // int y_step_q4, // int w, int h, int bd); -HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2, 0); +HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2, 0) HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v, - src - src_stride * (num_taps / 2 - 1), , sse2, 0); -HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, sse2, 1); + src - src_stride * (num_taps / 2 - 1), , sse2, 0) +HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, sse2, 1) HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, - src - src_stride * (num_taps / 2 - 1), avg_, sse2, 1); + src - src_stride * (num_taps / 2 - 1), avg_, sse2, 1) // void vpx_highbd_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, @@ -1156,6 +1156,6 @@ HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, // const InterpKernel *filter, int x0_q4, // int32_t x_step_q4, int y0_q4, // int y_step_q4, int w, int h, int bd); -HIGH_FUN_CONV_2D(, sse2, 0); -HIGH_FUN_CONV_2D(avg_, sse2, 1); +HIGH_FUN_CONV_2D(, sse2, 0) +HIGH_FUN_CONV_2D(avg_, sse2, 1) #endif // CONFIG_VP9_HIGHBITDEPTH && VPX_ARCH_X86_64 diff --git a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c index 1eaa19bfc5..6f2983a4b5 100644 --- a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c +++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c @@ -969,12 +969,12 @@ filter8_1dfunction vpx_filter_block1d4_h2_avg_ssse3; // const InterpKernel *filter, int x0_q4, // int32_t x_step_q4, int y0_q4, // int y_step_q4, int w, int h); -FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2, 0); +FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2, 0) FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * (num_taps / 2 - 1), , - avx2, 0); -FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2, 1); + avx2, 0) +FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2, 1) FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, - src - src_stride * (num_taps / 2 - 1), avg_, avx2, 1); + src - src_stride * (num_taps / 2 - 1), avg_, avx2, 1) // void vpx_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, @@ -986,6 +986,6 @@ FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, // const InterpKernel *filter, int x0_q4, // int32_t x_step_q4, int y0_q4, int y_step_q4, // int w, int h); -FUN_CONV_2D(, avx2, 0); -FUN_CONV_2D(avg_, avx2, 1); +FUN_CONV_2D(, avx2, 0) +FUN_CONV_2D(avg_, avx2, 1) #endif // HAVE_AX2 && HAVE_SSSE3 diff --git a/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c b/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c index 77355a2085..ed46d6245d 100644 --- a/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c +++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c @@ -731,12 +731,12 @@ filter8_1dfunction vpx_filter_block1d4_h2_avg_ssse3; // const InterpKernel *filter, int x0_q4, // int32_t x_step_q4, int y0_q4, // int y_step_q4, int w, int h); -FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , ssse3, 0); +FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , ssse3, 0) FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * (num_taps / 2 - 1), , - ssse3, 0); -FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, ssse3, 1); + ssse3, 0) +FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, ssse3, 1) FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, - src - src_stride * (num_taps / 2 - 1), avg_, ssse3, 1); + src - src_stride * (num_taps / 2 - 1), avg_, ssse3, 1) static void filter_horiz_w8_ssse3(const uint8_t *const src, const ptrdiff_t src_stride, @@ -1083,5 +1083,5 @@ void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, // const InterpKernel *filter, int x0_q4, // int32_t x_step_q4, int y0_q4, int y_step_q4, // int w, int h); -FUN_CONV_2D(, ssse3, 0); -FUN_CONV_2D(avg_, ssse3, 1); +FUN_CONV_2D(, ssse3, 0) +FUN_CONV_2D(avg_, ssse3, 1) diff --git a/vpx_ports/x86.h b/vpx_ports/x86.h index 651ff64606..795fb2923f 100644 --- a/vpx_ports/x86.h +++ b/vpx_ports/x86.h @@ -47,7 +47,7 @@ typedef enum { #define cpuid(func, func2, ax, bx, cx, dx) \ __asm__ __volatile__("cpuid \n\t" \ : "=a"(ax), "=b"(bx), "=c"(cx), "=d"(dx) \ - : "a"(func), "c"(func2)); + : "a"(func), "c"(func2)) #else #define cpuid(func, func2, ax, bx, cx, dx) \ __asm__ __volatile__( \ @@ -55,7 +55,7 @@ typedef enum { "cpuid \n\t" \ "xchg %%edi, %%ebx \n\t" \ : "=a"(ax), "=D"(bx), "=c"(cx), "=d"(dx) \ - : "a"(func), "c"(func2)); + : "a"(func), "c"(func2)) #endif #elif defined(__SUNPRO_C) || \ defined(__SUNPRO_CC) /* end __GNUC__ or __ANDROID__*/ @@ -67,7 +67,7 @@ typedef enum { "movl %ebx, %edi \n\t" \ "xchg %rsi, %rbx \n\t" \ : "=a"(ax), "=D"(bx), "=c"(cx), "=d"(dx) \ - : "a"(func), "c"(func2)); + : "a"(func), "c"(func2)) #else #define cpuid(func, func2, ax, bx, cx, dx) \ asm volatile( \ @@ -76,7 +76,7 @@ typedef enum { "movl %ebx, %edi \n\t" \ "popl %ebx \n\t" \ : "=a"(ax), "=D"(bx), "=c"(cx), "=d"(dx) \ - : "a"(func), "c"(func2)); + : "a"(func), "c"(func2)) #endif #else /* end __SUNPRO__ */ #if VPX_ARCH_X86_64 From ab35ee100a38347433af24df05a5e1578172a2ae Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 7 Dec 2021 13:11:46 -0800 Subject: [PATCH 0255/1000] clear -Wextra-semi/-Wextra-semi-stmt warnings x2 some additional neon file updates after: 31b954deb clear -Wextra-semi/-Wextra-semi-stmt warnings Bug: chromium:1257449 Change-Id: I3e2664f2bd8f6f7328ec91bf6595ba5fc09862bd --- vpx_dsp/arm/fdct32x32_neon.c | 2 +- vpx_dsp/arm/sad_neon.c | 30 +++++++-------- vpx_dsp/arm/subpel_variance_neon.c | 56 ++++++++++++++-------------- vpx_dsp/arm/variance_neon.c | 22 +++++------ vpx_dsp/arm/vpx_convolve8_neon_asm.c | 8 ++-- vpx_dsp/arm/vpx_convolve8_neon_asm.h | 16 ++++---- 6 files changed, 67 insertions(+), 67 deletions(-) diff --git a/vpx_dsp/arm/fdct32x32_neon.c b/vpx_dsp/arm/fdct32x32_neon.c index e9cd34904b..de74e6630b 100644 --- a/vpx_dsp/arm/fdct32x32_neon.c +++ b/vpx_dsp/arm/fdct32x32_neon.c @@ -153,7 +153,7 @@ static INLINE void load(const int16_t *a, int stride, int16x8_t *b) { do { \ store_s16q_to_tran_low(dest, src[index]); \ dest += 8; \ - } while (0); + } while (0) // Store 32 16x8 values, assuming stride == 32. // Slight twist: store horizontally in blocks of 8. diff --git a/vpx_dsp/arm/sad_neon.c b/vpx_dsp/arm/sad_neon.c index 59567bda5b..b1509d883a 100644 --- a/vpx_dsp/arm/sad_neon.c +++ b/vpx_dsp/arm/sad_neon.c @@ -110,7 +110,7 @@ static INLINE uint16x8_t sad8x_avg(const uint8_t *src_ptr, int src_stride, return abs; } -#define sad8xN(n) \ +#define SAD8XN(n) \ uint32_t vpx_sad8x##n##_neon(const uint8_t *src_ptr, int src_stride, \ const uint8_t *ref_ptr, int ref_stride) { \ const uint16x8_t abs = sad8x(src_ptr, src_stride, ref_ptr, ref_stride, n); \ @@ -125,9 +125,9 @@ static INLINE uint16x8_t sad8x_avg(const uint8_t *src_ptr, int src_stride, return horizontal_add_uint16x8(abs); \ } -sad8xN(4); -sad8xN(8); -sad8xN(16); +SAD8XN(4) +SAD8XN(8) +SAD8XN(16) static INLINE uint16x8_t sad16x(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, @@ -167,7 +167,7 @@ static INLINE uint16x8_t sad16x_avg(const uint8_t *src_ptr, int src_stride, return abs; } -#define sad16xN(n) \ +#define SAD16XN(n) \ uint32_t vpx_sad16x##n##_neon(const uint8_t *src_ptr, int src_stride, \ const uint8_t *ref_ptr, int ref_stride) { \ const uint16x8_t abs = \ @@ -183,9 +183,9 @@ static INLINE uint16x8_t sad16x_avg(const uint8_t *src_ptr, int src_stride, return horizontal_add_uint16x8(abs); \ } -sad16xN(8); -sad16xN(16); -sad16xN(32); +SAD16XN(8) +SAD16XN(16) +SAD16XN(32) static INLINE uint16x8_t sad32x(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, @@ -235,7 +235,7 @@ static INLINE uint16x8_t sad32x_avg(const uint8_t *src_ptr, int src_stride, return abs; } -#define sad32xN(n) \ +#define SAD32XN(n) \ uint32_t vpx_sad32x##n##_neon(const uint8_t *src_ptr, int src_stride, \ const uint8_t *ref_ptr, int ref_stride) { \ const uint16x8_t abs = \ @@ -251,9 +251,9 @@ static INLINE uint16x8_t sad32x_avg(const uint8_t *src_ptr, int src_stride, return horizontal_add_uint16x8(abs); \ } -sad32xN(16); -sad32xN(32); -sad32xN(64); +SAD32XN(16) +SAD32XN(32) +SAD32XN(64) static INLINE uint32x4_t sad64x(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, @@ -333,7 +333,7 @@ static INLINE uint32x4_t sad64x_avg(const uint8_t *src_ptr, int src_stride, } } -#define sad64xN(n) \ +#define SAD64XN(n) \ uint32_t vpx_sad64x##n##_neon(const uint8_t *src_ptr, int src_stride, \ const uint8_t *ref_ptr, int ref_stride) { \ const uint32x4_t abs = \ @@ -349,5 +349,5 @@ static INLINE uint32x4_t sad64x_avg(const uint8_t *src_ptr, int src_stride, return horizontal_add_uint32x4(abs); \ } -sad64xN(32); -sad64xN(64); +SAD64XN(32) +SAD64XN(64) diff --git a/vpx_dsp/arm/subpel_variance_neon.c b/vpx_dsp/arm/subpel_variance_neon.c index 37bfd1cd1f..a3befdc348 100644 --- a/vpx_dsp/arm/subpel_variance_neon.c +++ b/vpx_dsp/arm/subpel_variance_neon.c @@ -97,7 +97,7 @@ static void var_filter_block2d_bil_w16(const uint8_t *src_ptr, // 4xM filter writes an extra row to fdata because it processes two rows at a // time. -#define sub_pixel_varianceNxM(n, m) \ +#define SUB_PIXEL_VARIANCENXM(n, m) \ uint32_t vpx_sub_pixel_variance##n##x##m##_neon( \ const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \ @@ -123,23 +123,23 @@ static void var_filter_block2d_bil_w16(const uint8_t *src_ptr, return vpx_variance##n##x##m(temp1, n, ref_ptr, ref_stride, sse); \ } -sub_pixel_varianceNxM(4, 4); -sub_pixel_varianceNxM(4, 8); -sub_pixel_varianceNxM(8, 4); -sub_pixel_varianceNxM(8, 8); -sub_pixel_varianceNxM(8, 16); -sub_pixel_varianceNxM(16, 8); -sub_pixel_varianceNxM(16, 16); -sub_pixel_varianceNxM(16, 32); -sub_pixel_varianceNxM(32, 16); -sub_pixel_varianceNxM(32, 32); -sub_pixel_varianceNxM(32, 64); -sub_pixel_varianceNxM(64, 32); -sub_pixel_varianceNxM(64, 64); +SUB_PIXEL_VARIANCENXM(4, 4) +SUB_PIXEL_VARIANCENXM(4, 8) +SUB_PIXEL_VARIANCENXM(8, 4) +SUB_PIXEL_VARIANCENXM(8, 8) +SUB_PIXEL_VARIANCENXM(8, 16) +SUB_PIXEL_VARIANCENXM(16, 8) +SUB_PIXEL_VARIANCENXM(16, 16) +SUB_PIXEL_VARIANCENXM(16, 32) +SUB_PIXEL_VARIANCENXM(32, 16) +SUB_PIXEL_VARIANCENXM(32, 32) +SUB_PIXEL_VARIANCENXM(32, 64) +SUB_PIXEL_VARIANCENXM(64, 32) +SUB_PIXEL_VARIANCENXM(64, 64) // 4xM filter writes an extra row to fdata because it processes two rows at a // time. -#define sub_pixel_avg_varianceNxM(n, m) \ +#define SUB_PIXEL_AVG_VARIANCENXM(n, m) \ uint32_t vpx_sub_pixel_avg_variance##n##x##m##_neon( \ const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \ @@ -169,16 +169,16 @@ sub_pixel_varianceNxM(64, 64); return vpx_variance##n##x##m(temp0, n, ref_ptr, ref_stride, sse); \ } -sub_pixel_avg_varianceNxM(4, 4); -sub_pixel_avg_varianceNxM(4, 8); -sub_pixel_avg_varianceNxM(8, 4); -sub_pixel_avg_varianceNxM(8, 8); -sub_pixel_avg_varianceNxM(8, 16); -sub_pixel_avg_varianceNxM(16, 8); -sub_pixel_avg_varianceNxM(16, 16); -sub_pixel_avg_varianceNxM(16, 32); -sub_pixel_avg_varianceNxM(32, 16); -sub_pixel_avg_varianceNxM(32, 32); -sub_pixel_avg_varianceNxM(32, 64); -sub_pixel_avg_varianceNxM(64, 32); -sub_pixel_avg_varianceNxM(64, 64); +SUB_PIXEL_AVG_VARIANCENXM(4, 4) +SUB_PIXEL_AVG_VARIANCENXM(4, 8) +SUB_PIXEL_AVG_VARIANCENXM(8, 4) +SUB_PIXEL_AVG_VARIANCENXM(8, 8) +SUB_PIXEL_AVG_VARIANCENXM(8, 16) +SUB_PIXEL_AVG_VARIANCENXM(16, 8) +SUB_PIXEL_AVG_VARIANCENXM(16, 16) +SUB_PIXEL_AVG_VARIANCENXM(16, 32) +SUB_PIXEL_AVG_VARIANCENXM(32, 16) +SUB_PIXEL_AVG_VARIANCENXM(32, 32) +SUB_PIXEL_AVG_VARIANCENXM(32, 64) +SUB_PIXEL_AVG_VARIANCENXM(64, 32) +SUB_PIXEL_AVG_VARIANCENXM(64, 64) diff --git a/vpx_dsp/arm/variance_neon.c b/vpx_dsp/arm/variance_neon.c index 410ce7d9e6..7b93f142b1 100644 --- a/vpx_dsp/arm/variance_neon.c +++ b/vpx_dsp/arm/variance_neon.c @@ -268,7 +268,7 @@ void vpx_get16x16var_neon(const uint8_t *src_ptr, int src_stride, variance_neon_w16(src_ptr, src_stride, ref_ptr, ref_stride, 16, 16, sse, sum); } -#define varianceNxM(n, m, shift) \ +#define VARIANCENXM(n, m, shift) \ unsigned int vpx_variance##n##x##m##_neon( \ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ int ref_stride, unsigned int *sse) { \ @@ -288,16 +288,16 @@ void vpx_get16x16var_neon(const uint8_t *src_ptr, int src_stride, return *sse - (uint32_t)(((int64_t)sum * sum) >> shift); \ } -varianceNxM(4, 4, 4); -varianceNxM(4, 8, 5); -varianceNxM(8, 4, 5); -varianceNxM(8, 8, 6); -varianceNxM(8, 16, 7); -varianceNxM(16, 8, 7); -varianceNxM(16, 16, 8); -varianceNxM(16, 32, 9); -varianceNxM(32, 16, 9); -varianceNxM(32, 32, 10); +VARIANCENXM(4, 4, 4) +VARIANCENXM(4, 8, 5) +VARIANCENXM(8, 4, 5) +VARIANCENXM(8, 8, 6) +VARIANCENXM(8, 16, 7) +VARIANCENXM(16, 8, 7) +VARIANCENXM(16, 16, 8) +VARIANCENXM(16, 32, 9) +VARIANCENXM(32, 16, 9) +VARIANCENXM(32, 32, 10) unsigned int vpx_variance32x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, diff --git a/vpx_dsp/arm/vpx_convolve8_neon_asm.c b/vpx_dsp/arm/vpx_convolve8_neon_asm.c index 4470b28b88..c4177c5385 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon_asm.c +++ b/vpx_dsp/arm/vpx_convolve8_neon_asm.c @@ -35,7 +35,7 @@ } \ } -DEFINE_FILTER(horiz); -DEFINE_FILTER(avg_horiz); -DEFINE_FILTER(vert); -DEFINE_FILTER(avg_vert); +DEFINE_FILTER(horiz) +DEFINE_FILTER(avg_horiz) +DEFINE_FILTER(vert) +DEFINE_FILTER(avg_vert) diff --git a/vpx_dsp/arm/vpx_convolve8_neon_asm.h b/vpx_dsp/arm/vpx_convolve8_neon_asm.h index b123d1cb08..f1c7d62ed0 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon_asm.h +++ b/vpx_dsp/arm/vpx_convolve8_neon_asm.h @@ -17,13 +17,13 @@ ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, \ int x_step_q4, int y0_q4, int y_step_q4, int w, int h); -DECLARE_FILTER(horiz, type1); -DECLARE_FILTER(avg_horiz, type1); -DECLARE_FILTER(horiz, type2); -DECLARE_FILTER(avg_horiz, type2); -DECLARE_FILTER(vert, type1); -DECLARE_FILTER(avg_vert, type1); -DECLARE_FILTER(vert, type2); -DECLARE_FILTER(avg_vert, type2); +DECLARE_FILTER(horiz, type1) +DECLARE_FILTER(avg_horiz, type1) +DECLARE_FILTER(horiz, type2) +DECLARE_FILTER(avg_horiz, type2) +DECLARE_FILTER(vert, type1) +DECLARE_FILTER(avg_vert, type1) +DECLARE_FILTER(vert, type2) +DECLARE_FILTER(avg_vert, type2) #endif // VPX_VPX_DSP_ARM_VPX_CONVOLVE8_NEON_ASM_H_ From 69146697b51d3344e8dfe7608cd141699e1f6a59 Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 8 Dec 2021 19:34:47 -0800 Subject: [PATCH 0256/1000] vp9_thread_test.cc: remove incorrect TODO the row-based loop filter is ok (and being used) in this case; since it's serialized the previous row will always be done Change-Id: I024a0c78e7488178956cc22a4c4680a00dc6eade --- test/vp9_thread_test.cc | 4 ---- 1 file changed, 4 deletions(-) diff --git a/test/vp9_thread_test.cc b/test/vp9_thread_test.cc index 352ad71eca..5cac9ea0ee 100644 --- a/test/vp9_thread_test.cc +++ b/test/vp9_thread_test.cc @@ -216,10 +216,6 @@ TEST(VPxWorkerThreadTest, TestSerialInterface) { static const VPxWorkerInterface serial_interface = { impl::Init, impl::Reset, impl::Sync, impl::Launch, impl::Execute, impl::End }; - // TODO(jzern): Avoid using a file that will use the row-based thread - // loopfilter, with the simple serialized implementation it will hang. This is - // due to its expectation that rows will be run in parallel as they wait on - // progress in the row above before proceeding. static const char expected_md5[] = "b35a1b707b28e82be025d960aba039bc"; static const char filename[] = "vp90-2-03-size-226x226.webm"; VPxWorkerInterface default_interface = *vpx_get_worker_interface(); From 093a8c4824729be62e582b7d3f00e18830aee3b8 Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 8 Dec 2021 21:35:26 -0800 Subject: [PATCH 0257/1000] test_intra_pred_speed: match above ext w/reconintra only 2 x block_size is needed + remove a related TODO; C & assembly rely on this extension Change-Id: Iea430267624251cccbbdaec8045eb81d01ae1db1 --- test/test_intra_pred_speed.cc | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc index 08100a146e..28b3484a03 100644 --- a/test/test_intra_pred_speed.cc +++ b/test/test_intra_pred_speed.cc @@ -48,11 +48,9 @@ struct IntraPredTestMem { for (int i = 0; i < kBPS; ++i) left[i] = rnd.Rand16() & mask; for (int i = -1; i < kBPS; ++i) above[i] = rnd.Rand16() & mask; - // some code assumes the top row has been extended: - // d45/d63 C-code, for instance, but not the assembly. - // TODO(jzern): this style of extension isn't strictly necessary. + // d45/d63 require the top row to be extended. ASSERT_LE(block_size, kBPS); - for (int i = block_size; i < 2 * kBPS; ++i) { + for (int i = block_size; i < 2 * block_size; ++i) { above[i] = above[block_size - 1]; } } From 7fbcee49da63a61feee00147746efa33e31087e8 Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 8 Dec 2021 21:42:28 -0800 Subject: [PATCH 0258/1000] quiet -Warray-parameter warnings w/gcc-11 this matches the definition of the function with the declaration Change-Id: I757b731b9560cb0b0ceec4ec258ec5af5a183b3d --- vpx_dsp/avg.c | 2 +- vpx_dsp/x86/sad4d_avx2.c | 8 ++++---- vpx_dsp/x86/sad4d_avx512.c | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/vpx_dsp/avg.c b/vpx_dsp/avg.c index 1c45e8a73d..c87ab20ffe 100644 --- a/vpx_dsp/avg.c +++ b/vpx_dsp/avg.c @@ -340,7 +340,7 @@ int vpx_satd_c(const tran_low_t *coeff, int length) { // Integer projection onto row vectors. // height: value range {16, 32, 64}. -void vpx_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref, +void vpx_int_pro_row_c(int16_t *hbuf /*[16]*/, const uint8_t *ref, const int ref_stride, const int height) { int idx; const int norm_factor = height >> 1; diff --git a/vpx_dsp/x86/sad4d_avx2.c b/vpx_dsp/x86/sad4d_avx2.c index a5c4f8c537..9dd0666918 100644 --- a/vpx_dsp/x86/sad4d_avx2.c +++ b/vpx_dsp/x86/sad4d_avx2.c @@ -22,8 +22,8 @@ static INLINE void calc_final_4(const __m256i *const sums /*[4]*/, } void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { + const uint8_t *const ref_array[/*4*/], int ref_stride, + uint32_t *sad_array /*[4]*/) { int i; const uint8_t *refs[4]; __m256i sums[4]; @@ -127,8 +127,8 @@ void vpx_sad32x32x8_avx2(const uint8_t *src_ptr, int src_stride, } void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { + const uint8_t *const ref_array[/*4*/], int ref_stride, + uint32_t *sad_array /*[4]*/) { __m256i sums[4]; int i; const uint8_t *refs[4]; diff --git a/vpx_dsp/x86/sad4d_avx512.c b/vpx_dsp/x86/sad4d_avx512.c index 4c5d70464d..2fa9108718 100644 --- a/vpx_dsp/x86/sad4d_avx512.c +++ b/vpx_dsp/x86/sad4d_avx512.c @@ -12,8 +12,8 @@ #include "vpx/vpx_integer.h" void vpx_sad64x64x4d_avx512(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t res[4]) { + const uint8_t *const ref_array[/*4*/], + int ref_stride, uint32_t *res /*[4]*/) { __m512i src_reg, ref0_reg, ref1_reg, ref2_reg, ref3_reg; __m512i sum_ref0, sum_ref1, sum_ref2, sum_ref3; __m512i sum_mlow, sum_mhigh; From f3e2a690cd474eae47376b431f5bddf6d73e377c Mon Sep 17 00:00:00 2001 From: James Zern Date: Thu, 9 Dec 2021 17:32:30 -0800 Subject: [PATCH 0259/1000] vp9_bitstream.c: quiet -Wstringop-overflow warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit w/gcc-11 as noted in the size of interp_filter_selected[][]'s first dimension varies between VP9_COMP and VP9BitstreamWorkerData as noted in the latter's definition: // The size of interp_filter_selected in VP9_COMP is actually // MAX_REFERENCE_FRAMES x SWITCHABLE. But when encoding tiles, all we ever do // is increment the very first index (index 0) for the first dimension. Hence // this is sufficient. int interp_filter_selected[1][SWITCHABLE]; normalize the function signatures of write_modes*(), etc. to take this into account. vp9/encoder/vp9_bitstream.c|948 col 3| warning: ‘write_modes’ accessing 64 bytes in a region of size 16 [-Wstringop-overflow=] || 948 | write_modes(cpi, xd, &cpi->tile_data[data->tile_idx].tile_info, || | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ || 949 | &data->bit_writer, tile_row, data->tile_idx, || | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ || 950 | &data->max_mv_magnitude, data->interp_filter_selected); || | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ vp9/encoder/vp9_bitstream.c|948 col 3| note: referencing argument 8 of type ‘int (*)[4]’ vp9/encoder/vp9_bitstream.c|488 col 13| note: in a call to function ‘write_modes’ Change-Id: I0898cd7c3431633c382a0c3a1be2f0a0bea8d0f9 --- vp9/encoder/vp9_bitstream.c | 42 ++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c index c23e150a45..d4ac4ffc79 100644 --- a/vp9/encoder/vp9_bitstream.c +++ b/vp9/encoder/vp9_bitstream.c @@ -236,11 +236,11 @@ static void write_ref_frames(const VP9_COMMON *cm, const MACROBLOCKD *const xd, } } -static void pack_inter_mode_mvs( - VP9_COMP *cpi, const MACROBLOCKD *const xd, - const MB_MODE_INFO_EXT *const mbmi_ext, vpx_writer *w, - unsigned int *const max_mv_magnitude, - int interp_filter_selected[MAX_REF_FRAMES][SWITCHABLE]) { +static void pack_inter_mode_mvs(VP9_COMP *cpi, const MACROBLOCKD *const xd, + const MB_MODE_INFO_EXT *const mbmi_ext, + vpx_writer *w, + unsigned int *const max_mv_magnitude, + int interp_filter_selected[][SWITCHABLE]) { VP9_COMMON *const cm = &cpi->common; const nmv_context *nmvc = &cm->fc->nmvc; const struct segmentation *const seg = &cm->seg; @@ -373,11 +373,12 @@ static void write_mb_modes_kf(const VP9_COMMON *cm, const MACROBLOCKD *xd, write_intra_mode(w, mi->uv_mode, vp9_kf_uv_mode_prob[mi->mode]); } -static void write_modes_b( - VP9_COMP *cpi, MACROBLOCKD *const xd, const TileInfo *const tile, - vpx_writer *w, TOKENEXTRA **tok, const TOKENEXTRA *const tok_end, - int mi_row, int mi_col, unsigned int *const max_mv_magnitude, - int interp_filter_selected[MAX_REF_FRAMES][SWITCHABLE]) { +static void write_modes_b(VP9_COMP *cpi, MACROBLOCKD *const xd, + const TileInfo *const tile, vpx_writer *w, + TOKENEXTRA **tok, const TOKENEXTRA *const tok_end, + int mi_row, int mi_col, + unsigned int *const max_mv_magnitude, + int interp_filter_selected[][SWITCHABLE]) { const VP9_COMMON *const cm = &cpi->common; const MB_MODE_INFO_EXT *const mbmi_ext = cpi->td.mb.mbmi_ext_base + (mi_row * cm->mi_cols + mi_col); @@ -422,12 +423,12 @@ static void write_partition(const VP9_COMMON *const cm, } } -static void write_modes_sb( - VP9_COMP *cpi, MACROBLOCKD *const xd, const TileInfo *const tile, - vpx_writer *w, TOKENEXTRA **tok, const TOKENEXTRA *const tok_end, - int mi_row, int mi_col, BLOCK_SIZE bsize, - unsigned int *const max_mv_magnitude, - int interp_filter_selected[MAX_REF_FRAMES][SWITCHABLE]) { +static void write_modes_sb(VP9_COMP *cpi, MACROBLOCKD *const xd, + const TileInfo *const tile, vpx_writer *w, + TOKENEXTRA **tok, const TOKENEXTRA *const tok_end, + int mi_row, int mi_col, BLOCK_SIZE bsize, + unsigned int *const max_mv_magnitude, + int interp_filter_selected[][SWITCHABLE]) { const VP9_COMMON *const cm = &cpi->common; const int bsl = b_width_log2_lookup[bsize]; const int bs = (1 << bsl) / 4; @@ -485,11 +486,10 @@ static void write_modes_sb( update_partition_context(xd, mi_row, mi_col, subsize, bsize); } -static void write_modes( - VP9_COMP *cpi, MACROBLOCKD *const xd, const TileInfo *const tile, - vpx_writer *w, int tile_row, int tile_col, - unsigned int *const max_mv_magnitude, - int interp_filter_selected[MAX_REF_FRAMES][SWITCHABLE]) { +static void write_modes(VP9_COMP *cpi, MACROBLOCKD *const xd, + const TileInfo *const tile, vpx_writer *w, int tile_row, + int tile_col, unsigned int *const max_mv_magnitude, + int interp_filter_selected[][SWITCHABLE]) { const VP9_COMMON *const cm = &cpi->common; int mi_row, mi_col, tile_sb_row; TOKENEXTRA *tok = NULL; From 3cff8be3d8b662d96d49b01a53e4fa22278709b2 Mon Sep 17 00:00:00 2001 From: James Zern Date: Thu, 9 Dec 2021 18:02:30 -0800 Subject: [PATCH 0260/1000] vp9_diamond_search_sad_avx: quiet -Wmaybe-uninitialized warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit w/gcc-11 v_these_mv_w is always initialized in this block with _mm_add_epi16(); converting this to a _mm_storeu_si32(tmp) call also works, but introduces more stack usage || ../vp9/encoder/x86/vp9_diamond_search_sad_avx.c: In function ‘vp9_diamond_search_sad_avx’: vp9/encoder/x86/vp9_diamond_search_sad_avx.c|285 col 19| warning: ‘v_these_mv_w’ may be used uninitialized [-Wmaybe-uninitialized] || 285 | new_bmv = ((const int_mv *)&v_these_mv_w)[local_best_idx]; || | ~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ vp9/encoder/x86/vp9_diamond_search_sad_avx.c|149 col 21| note: ‘v_these_mv_w’ declared here || 149 | const __m128i v_these_mv_w = _mm_add_epi16(v_bmv_w, v_ss_mv_w); || | ^~~~~~~~~~~~ Change-Id: I1cd2fcb41030db16f51c94f3a70eb8eb2a526401 --- vp9/encoder/x86/vp9_diamond_search_sad_avx.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/vp9/encoder/x86/vp9_diamond_search_sad_avx.c b/vp9/encoder/x86/vp9_diamond_search_sad_avx.c index 4be6a5ea02..fcf50eb2a7 100644 --- a/vp9/encoder/x86/vp9_diamond_search_sad_avx.c +++ b/vp9/encoder/x86/vp9_diamond_search_sad_avx.c @@ -282,7 +282,14 @@ int vp9_diamond_search_sad_avx(const MACROBLOCK *x, // Update the global minimum if the local minimum is smaller if (LIKELY(local_best_sad < best_sad)) { +#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#endif new_bmv = ((const int_mv *)&v_these_mv_w)[local_best_idx]; +#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(__clang__) +#pragma GCC diagnostic pop +#endif new_best_address = ((const uint8_t **)v_blocka)[local_best_idx]; best_sad = local_best_sad; From 03a81068467076b4ce4a41dafaac9a9e5cc5f01c Mon Sep 17 00:00:00 2001 From: James Zern Date: Thu, 9 Dec 2021 18:34:18 -0800 Subject: [PATCH 0261/1000] vp[89]_initalize_enc(): protect against multiple invocations this removes the burden from callers; the rtcd functions are left with a mostly redundant (outside of tests) once() as top-level functions should ensure their constraints are met Change-Id: I5bdbcfa4671c6a1492cfe9c7d886c361c26caaa9 --- vp8/encoder/onyx_if.c | 14 ++++++-------- vp8/vp8_cx_iface.c | 3 +-- vp9/common/vp9_rtcd.c | 6 +----- vp9/encoder/vp9_encoder.c | 26 ++++++++++++-------------- vp9/vp9_cx_iface.c | 3 +-- vpx_ports/vpx_once.h | 4 ++-- 6 files changed, 23 insertions(+), 33 deletions(-) diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index 59bce951e0..f09177c7f5 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -36,6 +36,7 @@ #include "vp8/common/swapyv12buffer.h" #include "vp8/common/threading.h" #include "vpx_ports/system_state.h" +#include "vpx_ports/vpx_once.h" #include "vpx_ports/vpx_timer.h" #include "vpx_util/vpx_write_yuv_frame.h" #if VPX_ARCH_ARM @@ -394,16 +395,13 @@ static void setup_features(VP8_COMP *cpi) { static void dealloc_raw_frame_buffers(VP8_COMP *cpi); -void vp8_initialize_enc(void) { - static volatile int init_done = 0; - - if (!init_done) { - vpx_dsp_rtcd(); - vp8_init_intra_predictors(); - init_done = 1; - } +static void initialize_enc(void) { + vpx_dsp_rtcd(); + vp8_init_intra_predictors(); } +void vp8_initialize_enc(void) { once(initialize_enc); } + static void dealloc_compressor_data(VP8_COMP *cpi) { vpx_free(cpi->tplist); cpi->tplist = NULL; diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c index ab954c46f2..21fed0e8ed 100644 --- a/vp8/vp8_cx_iface.c +++ b/vp8/vp8_cx_iface.c @@ -18,7 +18,6 @@ #include "vpx_mem/vpx_mem.h" #include "vpx_ports/static_assert.h" #include "vpx_ports/system_state.h" -#include "vpx_ports/vpx_once.h" #include "vpx_util/vpx_timestamp.h" #include "vp8/encoder/onyx_int.h" #include "vpx/vp8cx.h" @@ -694,7 +693,7 @@ static vpx_codec_err_t vp8e_init(vpx_codec_ctx_t *ctx, ctx->priv->enc.total_encoders = 1; } - once(vp8_initialize_enc); + vp8_initialize_enc(); res = validate_config(priv, &priv->cfg, &priv->vp8_cfg, 0); diff --git a/vp9/common/vp9_rtcd.c b/vp9/common/vp9_rtcd.c index d8c870aa3f..37762ca15a 100644 --- a/vp9/common/vp9_rtcd.c +++ b/vp9/common/vp9_rtcd.c @@ -12,8 +12,4 @@ #include "./vp9_rtcd.h" #include "vpx_ports/vpx_once.h" -void vp9_rtcd() { - // TODO(JBB): Remove this once, by insuring that both the encoder and - // decoder setup functions are protected by once(); - once(setup_rtcd_internal); -} +void vp9_rtcd() { once(setup_rtcd_internal); } diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 8fdd86916f..8d5ec5a360 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -25,6 +25,7 @@ #endif #include "vpx_ports/mem.h" #include "vpx_ports/system_state.h" +#include "vpx_ports/vpx_once.h" #include "vpx_ports/vpx_timer.h" #if CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG #include "vpx_util/vpx_debug_util.h" @@ -929,24 +930,21 @@ static void vp9_swap_mi_and_prev_mi(VP9_COMMON *cm) { cm->prev_mi_grid_visible = cm->prev_mi_grid_base + cm->mi_stride + 1; } -void vp9_initialize_enc(void) { - static volatile int init_done = 0; - - if (!init_done) { - vp9_rtcd(); - vpx_dsp_rtcd(); - vpx_scale_rtcd(); - vp9_init_intra_predictors(); - vp9_init_me_luts(); - vp9_rc_init_minq_luts(); - vp9_entropy_mv_init(); +static void initialize_enc(void) { + vp9_rtcd(); + vpx_dsp_rtcd(); + vpx_scale_rtcd(); + vp9_init_intra_predictors(); + vp9_init_me_luts(); + vp9_rc_init_minq_luts(); + vp9_entropy_mv_init(); #if !CONFIG_REALTIME_ONLY - vp9_temporal_filter_init(); + vp9_temporal_filter_init(); #endif - init_done = 1; - } } +void vp9_initialize_enc(void) { once(initialize_enc); } + static void dealloc_compressor_data(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; int i; diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index cc4081c4f5..9f03ed1728 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -15,7 +15,6 @@ #include "vpx/vpx_encoder.h" #include "vpx/vpx_ext_ratectrl.h" #include "vpx_dsp/psnr.h" -#include "vpx_ports/vpx_once.h" #include "vpx_ports/static_assert.h" #include "vpx_ports/system_state.h" #include "vpx_util/vpx_timestamp.h" @@ -1096,7 +1095,7 @@ static vpx_codec_err_t encoder_init(vpx_codec_ctx_t *ctx, } priv->extra_cfg = default_extra_cfg; - once(vp9_initialize_enc); + vp9_initialize_enc(); res = validate_config(priv, &priv->cfg, &priv->extra_cfg); diff --git a/vpx_ports/vpx_once.h b/vpx_ports/vpx_once.h index 4eb592b87e..d8a8ed89fe 100644 --- a/vpx_ports/vpx_once.h +++ b/vpx_ports/vpx_once.h @@ -95,7 +95,7 @@ static void once(void (*func)(void)) { #define INCL_DOS #include static void once(void (*func)(void)) { - static int done; + static volatile int done; /* If the initialization is complete, return early. */ if (done) return; @@ -128,7 +128,7 @@ static void once(void (*func)(void)) { */ static void once(void (*func)(void)) { - static int done; + static volatile int done; if (!done) { func(); From e7f33a53cf404bbb3688af9b13375b5c090daae4 Mon Sep 17 00:00:00 2001 From: Jianhui Dai Date: Thu, 9 Dec 2021 13:38:22 +0800 Subject: [PATCH 0262/1000] Set unused reference frames to first ref If a reference frame is not referenced, then set the index for that reference to the first one used/referenced instead of unused slot. Unused slot means key frame, as key frame resets all slots with itself. This CL extracts `get_first_ref_frame()` from `reset_fb_idx_unused()` with a typo fixing, and sets all unused reference frames to first ref in vp9 uncompressed header. Bug: webrtc:13442 Change-Id: I99523bc2ceedf27efe376d1113851ff342982181 --- vp9/encoder/vp9_bitstream.c | 17 ++++++--- vp9/encoder/vp9_encoder.h | 22 ++++++++---- vp9/encoder/vp9_svc_layercontext.c | 55 ++++++++++++------------------ vpx/vp8cx.h | 17 +++++---- 4 files changed, 61 insertions(+), 50 deletions(-) diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c index c23e150a45..3c4bdc9914 100644 --- a/vp9/encoder/vp9_bitstream.c +++ b/vp9/encoder/vp9_bitstream.c @@ -1241,12 +1241,21 @@ static void write_uncompressed_header(VP9_COMP *cpi, vpx_wb_write_literal(wb, vp9_get_refresh_mask(cpi), REF_FRAMES); write_frame_size(cm, wb); } else { + static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, + VP9_ALT_FLAG }; + const MV_REFERENCE_FRAME first_ref = get_first_ref_frame(cpi); + const int first_ref_map_idx = get_ref_frame_map_idx(cpi, first_ref); MV_REFERENCE_FRAME ref_frame; vpx_wb_write_literal(wb, vp9_get_refresh_mask(cpi), REF_FRAMES); - for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { - assert(get_ref_frame_map_idx(cpi, ref_frame) != INVALID_IDX); - vpx_wb_write_literal(wb, get_ref_frame_map_idx(cpi, ref_frame), - REF_FRAMES_LOG2); + + // If a reference frame is not referenced, then set the index for that + // reference to the first one used/referenced. + for (ref_frame = LAST_FRAME; ref_frame < MAX_REF_FRAMES; ++ref_frame) { + const int referenced = cpi->ref_frame_flags & flag_list[ref_frame]; + const int map_idx = referenced ? get_ref_frame_map_idx(cpi, ref_frame) + : first_ref_map_idx; + assert(map_idx != INVALID_IDX); + vpx_wb_write_literal(wb, map_idx, REF_FRAMES_LOG2); vpx_wb_write_bit(wb, cm->ref_frame_sign_bias[ref_frame]); } diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 9774a64ccf..1bca7ded75 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -1196,14 +1196,24 @@ static INLINE int frame_is_kf_gf_arf(const VP9_COMP *cpi) { (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref); } +static INLINE MV_REFERENCE_FRAME get_first_ref_frame(VP9_COMP *const cpi) { + static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, + VP9_ALT_FLAG }; + MV_REFERENCE_FRAME ref_frame = LAST_FRAME; + while (ref_frame < MAX_REF_FRAMES) { + if (cpi->ref_frame_flags & flag_list[ref_frame]) break; + ref_frame++; + } + return ref_frame; +} + static INLINE int get_ref_frame_map_idx(const VP9_COMP *cpi, MV_REFERENCE_FRAME ref_frame) { - if (ref_frame == LAST_FRAME) { - return cpi->lst_fb_idx; - } else if (ref_frame == GOLDEN_FRAME) { - return cpi->gld_fb_idx; - } else { - return cpi->alt_fb_idx; + switch (ref_frame) { + case LAST_FRAME: return cpi->lst_fb_idx; + case GOLDEN_FRAME: return cpi->gld_fb_idx; + case ALTREF_FRAME: return cpi->alt_fb_idx; + default: return INVALID_IDX; } } diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c index ad3a8f7afa..f01cb17a2f 100644 --- a/vp9/encoder/vp9_svc_layercontext.c +++ b/vp9/encoder/vp9_svc_layercontext.c @@ -73,7 +73,7 @@ void vp9_init_layer_context(VP9_COMP *const cpi) { svc->downsample_filter_type[sl] = BILINEAR; svc->downsample_filter_phase[sl] = 8; // Set to 8 for averaging filter. svc->framedrop_thresh[sl] = oxcf->drop_frames_water_mark; - svc->fb_idx_upd_tl0[sl] = -1; + svc->fb_idx_upd_tl0[sl] = INVALID_IDX; svc->drop_count[sl] = 0; svc->spatial_layer_sync[sl] = 0; svc->force_drop_constrained_from_above[sl] = 0; @@ -462,32 +462,21 @@ static void reset_fb_idx_unused(VP9_COMP *const cpi) { // fb_idx for that reference to the first one used/referenced. // This is to avoid setting fb_idx for a reference to a slot that is not // used/needed (i.e., since that reference is not referenced or refreshed). - static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, - VP9_ALT_FLAG }; - MV_REFERENCE_FRAME ref_frame; - MV_REFERENCE_FRAME first_ref = 0; - int first_fb_idx = 0; - int fb_idx[3] = { cpi->lst_fb_idx, cpi->gld_fb_idx, cpi->alt_fb_idx }; - for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) { - if (cpi->ref_frame_flags & flag_list[ref_frame]) { - first_ref = ref_frame; - first_fb_idx = fb_idx[ref_frame - 1]; - break; + const MV_REFERENCE_FRAME first_ref = get_first_ref_frame(cpi); + const int map_idx = get_ref_frame_map_idx(cpi, first_ref); + if (map_idx != INVALID_IDX) { + if (!(cpi->ref_frame_flags & VP9_LAST_FLAG || + cpi->ext_refresh_last_frame)) { + cpi->lst_fb_idx = map_idx; + } + if (!(cpi->ref_frame_flags & VP9_GOLD_FLAG || + cpi->ext_refresh_golden_frame)) { + cpi->gld_fb_idx = map_idx; + } + if (!(cpi->ref_frame_flags & VP9_ALT_FLAG || + cpi->ext_refresh_alt_ref_frame)) { + cpi->alt_fb_idx = map_idx; } - } - if (first_ref > 0) { - if (first_ref != LAST_FRAME && - !(cpi->ref_frame_flags & flag_list[LAST_FRAME]) && - !cpi->ext_refresh_last_frame) - cpi->lst_fb_idx = first_fb_idx; - else if (first_ref != GOLDEN_FRAME && - !(cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) && - !cpi->ext_refresh_golden_frame) - cpi->gld_fb_idx = first_fb_idx; - else if (first_ref != ALTREF_FRAME && - !(cpi->ref_frame_flags & flag_list[ALTREF_FRAME]) && - !cpi->ext_refresh_alt_ref_frame) - cpi->alt_fb_idx = first_fb_idx; } } @@ -716,9 +705,9 @@ static void set_flags_and_fb_idx_bypass_via_set_ref_frame_config( int sl = svc->spatial_layer_id = svc->spatial_layer_to_encode; cpi->svc.temporal_layer_id = cpi->svc.temporal_layer_id_per_spatial[sl]; cpi->ext_refresh_frame_flags_pending = 1; - cpi->lst_fb_idx = svc->lst_fb_idx[sl]; - cpi->gld_fb_idx = svc->gld_fb_idx[sl]; - cpi->alt_fb_idx = svc->alt_fb_idx[sl]; + if (svc->reference_last[sl]) cpi->lst_fb_idx = svc->lst_fb_idx[sl]; + if (svc->reference_golden[sl]) cpi->gld_fb_idx = svc->gld_fb_idx[sl]; + if (svc->reference_altref[sl]) cpi->alt_fb_idx = svc->alt_fb_idx[sl]; cpi->ext_refresh_last_frame = 0; cpi->ext_refresh_golden_frame = 0; cpi->ext_refresh_alt_ref_frame = 0; @@ -875,9 +864,9 @@ int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) { // flags are passed via the encode call (bypass mode). Issue is that we're // resetting ext_refresh_frame_flags_pending to 0 on frame drops. if (svc->temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_BYPASS) { - memset(&svc->lst_fb_idx, -1, sizeof(svc->lst_fb_idx)); - memset(&svc->gld_fb_idx, -1, sizeof(svc->lst_fb_idx)); - memset(&svc->alt_fb_idx, -1, sizeof(svc->lst_fb_idx)); + memset(&svc->lst_fb_idx, INVALID_IDX, sizeof(svc->lst_fb_idx)); + memset(&svc->gld_fb_idx, INVALID_IDX, sizeof(svc->lst_fb_idx)); + memset(&svc->alt_fb_idx, INVALID_IDX, sizeof(svc->lst_fb_idx)); // These are set by API before the superframe is encoded and they are // passed to encoder layer by layer. Don't reset them on layer 0 in bypass // mode. @@ -970,7 +959,7 @@ int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) { if (svc->temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_BYPASS && svc->last_layer_dropped[svc->spatial_layer_id] && - svc->fb_idx_upd_tl0[svc->spatial_layer_id] != -1 && + svc->fb_idx_upd_tl0[svc->spatial_layer_id] != INVALID_IDX && !svc->layer_context[svc->temporal_layer_id].is_key_frame) { // For fixed/non-flexible mode, if the previous frame (same spatial layer // from previous superframe) was dropped, make sure the lst_fb_idx diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h index 47c38d3b5e..b3c50a9b67 100644 --- a/vpx/vp8cx.h +++ b/vpx/vp8cx.h @@ -897,13 +897,16 @@ typedef struct vpx_svc_ref_frame_config { int alt_fb_idx[VPX_SS_MAX_LAYERS]; /**< Altref buffer index. */ int update_buffer_slot[VPX_SS_MAX_LAYERS]; /**< Update reference frames. */ // TODO(jianj): Remove update_last/golden/alt_ref, these are deprecated. - int update_last[VPX_SS_MAX_LAYERS]; /**< Update last. */ - int update_golden[VPX_SS_MAX_LAYERS]; /**< Update golden. */ - int update_alt_ref[VPX_SS_MAX_LAYERS]; /**< Update altref. */ - int reference_last[VPX_SS_MAX_LAYERS]; /**< Last as reference. */ - int reference_golden[VPX_SS_MAX_LAYERS]; /**< Golden as reference. */ - int reference_alt_ref[VPX_SS_MAX_LAYERS]; /**< Altref as reference. */ - int64_t duration[VPX_SS_MAX_LAYERS]; /**< Duration per spatial layer. */ + int update_last[VPX_SS_MAX_LAYERS]; /**< Update last. */ + int update_golden[VPX_SS_MAX_LAYERS]; /**< Update golden. */ + int update_alt_ref[VPX_SS_MAX_LAYERS]; /**< Update altref. */ + int reference_last[VPX_SS_MAX_LAYERS]; + /**< Last as reference. Use first referenced index if FALSE. */ + int reference_golden[VPX_SS_MAX_LAYERS]; + /**< Golden as reference. Use first referenced index if FALSE. */ + int reference_alt_ref[VPX_SS_MAX_LAYERS]; + /**< Altref as reference. Use first referenced index if FALSE. */ + int64_t duration[VPX_SS_MAX_LAYERS]; /**< Duration per spatial layer. */ } vpx_svc_ref_frame_config_t; /*!\brief VP9 svc frame dropping mode. From ea042a676ee09987dc5c8fccaef6ea941eaea258 Mon Sep 17 00:00:00 2001 From: Fyodor Kyslov Date: Tue, 14 Dec 2021 09:59:17 -0800 Subject: [PATCH 0263/1000] vp9 encoder: fix integer overflows fixing integer overflow with 16K content and enabling the test Bug: webm:1750 Fixed: webm:1750 Change-Id: I76eebd915bcae55bc755613251a98e1716dea4c0 --- test/realtime_test.cc | 4 +--- vp9/encoder/vp9_bitstream.c | 6 +++--- vp9/encoder/vp9_cost.h | 5 ++--- vp9/encoder/vp9_subexp.c | 35 ++++++++++++++++++----------------- vp9/encoder/vp9_subexp.h | 15 ++++++++------- 5 files changed, 32 insertions(+), 33 deletions(-) diff --git a/test/realtime_test.cc b/test/realtime_test.cc index ab2080a85d..107f2e224f 100644 --- a/test/realtime_test.cc +++ b/test/realtime_test.cc @@ -85,9 +85,7 @@ TEST_P(RealtimeTest, IntegerOverflow) { TestIntegerOverflow(2048, 2048); } TEST_P(RealtimeTest, IntegerOverflowLarge) { if (IsVP9()) { - GTEST_SKIP() << "TODO(https://crbug.com/webm/1750): Enable this test after " - "undefined sanitizer warnings are fixed."; - // TestIntegerOverflow(16384, 16384); + TestIntegerOverflow(16384, 16384); } else { GTEST_SKIP() << "TODO(https://crbug.com/webm/1748,https://crbug.com/webm/1751):" diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c index c23e150a45..e23ca97730 100644 --- a/vp9/encoder/vp9_bitstream.c +++ b/vp9/encoder/vp9_bitstream.c @@ -563,7 +563,7 @@ static void update_coef_probs_common(vpx_writer *const bc, VP9_COMP *cpi, for (t = 0; t < entropy_nodes_update; ++t) { vpx_prob newp = new_coef_probs[i][j][k][l][t]; const vpx_prob oldp = old_coef_probs[i][j][k][l][t]; - int s; + int64_t s; int u = 0; if (t == PIVOT_NODE) s = vp9_prob_diff_update_savings_search_model( @@ -600,7 +600,7 @@ static void update_coef_probs_common(vpx_writer *const bc, VP9_COMP *cpi, vpx_prob newp = new_coef_probs[i][j][k][l][t]; vpx_prob *oldp = old_coef_probs[i][j][k][l] + t; const vpx_prob upd = DIFF_UPDATE_PROB; - int s; + int64_t s; int u = 0; if (t == PIVOT_NODE) s = vp9_prob_diff_update_savings_search_model( @@ -636,7 +636,7 @@ static void update_coef_probs_common(vpx_writer *const bc, VP9_COMP *cpi, for (t = 0; t < entropy_nodes_update; ++t) { vpx_prob newp = new_coef_probs[i][j][k][l][t]; vpx_prob *oldp = old_coef_probs[i][j][k][l] + t; - int s; + int64_t s; int u = 0; if (t == PIVOT_NODE) { diff --git a/vp9/encoder/vp9_cost.h b/vp9/encoder/vp9_cost.h index 638d72a916..ee0033fa31 100644 --- a/vp9/encoder/vp9_cost.h +++ b/vp9/encoder/vp9_cost.h @@ -29,9 +29,8 @@ extern const uint16_t vp9_prob_cost[256]; #define vp9_cost_bit(prob, bit) vp9_cost_zero((bit) ? 256 - (prob) : (prob)) -static INLINE unsigned int cost_branch256(const unsigned int ct[2], - vpx_prob p) { - return ct[0] * vp9_cost_zero(p) + ct[1] * vp9_cost_one(p); +static INLINE uint64_t cost_branch256(const unsigned int ct[2], vpx_prob p) { + return (uint64_t)ct[0] * vp9_cost_zero(p) + (uint64_t)ct[1] * vp9_cost_one(p); } static INLINE int treed_cost(vpx_tree tree, const vpx_prob *probs, int bits, diff --git a/vp9/encoder/vp9_subexp.c b/vp9/encoder/vp9_subexp.c index 19bbd5373f..661294ba04 100644 --- a/vp9/encoder/vp9_subexp.c +++ b/vp9/encoder/vp9_subexp.c @@ -114,19 +114,20 @@ void vp9_write_prob_diff_update(vpx_writer *w, vpx_prob newp, vpx_prob oldp) { encode_term_subexp(w, delp); } -int vp9_prob_diff_update_savings_search(const unsigned int *ct, vpx_prob oldp, - vpx_prob *bestp, vpx_prob upd) { - const int old_b = cost_branch256(ct, oldp); - int bestsavings = 0; +int64_t vp9_prob_diff_update_savings_search(const unsigned int *ct, + vpx_prob oldp, vpx_prob *bestp, + vpx_prob upd) { + const int64_t old_b = cost_branch256(ct, oldp); + int64_t bestsavings = 0; vpx_prob newp, bestnewp = oldp; const int step = *bestp > oldp ? -1 : 1; const int upd_cost = vp9_cost_one(upd) - vp9_cost_zero(upd); if (old_b > upd_cost + (MIN_DELP_BITS << VP9_PROB_COST_SHIFT)) { for (newp = *bestp; newp != oldp; newp += step) { - const int new_b = cost_branch256(ct, newp); - const int update_b = prob_diff_update_cost(newp, oldp) + upd_cost; - const int savings = old_b - new_b - update_b; + const int64_t new_b = cost_branch256(ct, newp); + const int64_t update_b = prob_diff_update_cost(newp, oldp) + upd_cost; + const int64_t savings = old_b - new_b - update_b; if (savings > bestsavings) { bestsavings = savings; bestnewp = newp; @@ -137,15 +138,15 @@ int vp9_prob_diff_update_savings_search(const unsigned int *ct, vpx_prob oldp, return bestsavings; } -int vp9_prob_diff_update_savings_search_model(const unsigned int *ct, - const vpx_prob oldp, - vpx_prob *bestp, vpx_prob upd, - int stepsize) { - int i, old_b, new_b, update_b, savings, bestsavings; - int newp; - const int step_sign = *bestp > oldp ? -1 : 1; - const int step = stepsize * step_sign; - const int upd_cost = vp9_cost_one(upd) - vp9_cost_zero(upd); +int64_t vp9_prob_diff_update_savings_search_model(const unsigned int *ct, + const vpx_prob oldp, + vpx_prob *bestp, vpx_prob upd, + int stepsize) { + int64_t i, old_b, new_b, update_b, savings, bestsavings; + int64_t newp; + const int64_t step_sign = *bestp > oldp ? -1 : 1; + const int64_t step = stepsize * step_sign; + const int64_t upd_cost = vp9_cost_one(upd) - vp9_cost_zero(upd); const vpx_prob *newplist, *oldplist; vpx_prob bestnewp; oldplist = vp9_pareto8_full[oldp - 1]; @@ -182,7 +183,7 @@ void vp9_cond_prob_diff_update(vpx_writer *w, vpx_prob *oldp, const unsigned int ct[2]) { const vpx_prob upd = DIFF_UPDATE_PROB; vpx_prob newp = get_binary_prob(ct[0], ct[1]); - const int savings = + const int64_t savings = vp9_prob_diff_update_savings_search(ct, *oldp, &newp, upd); assert(newp >= 1); if (savings > 0) { diff --git a/vp9/encoder/vp9_subexp.h b/vp9/encoder/vp9_subexp.h index f0d544b527..2d016d24c5 100644 --- a/vp9/encoder/vp9_subexp.h +++ b/vp9/encoder/vp9_subexp.h @@ -25,13 +25,14 @@ void vp9_write_prob_diff_update(struct vpx_writer *w, vpx_prob newp, void vp9_cond_prob_diff_update(struct vpx_writer *w, vpx_prob *oldp, const unsigned int ct[2]); -int vp9_prob_diff_update_savings_search(const unsigned int *ct, vpx_prob oldp, - vpx_prob *bestp, vpx_prob upd); - -int vp9_prob_diff_update_savings_search_model(const unsigned int *ct, - const vpx_prob oldp, - vpx_prob *bestp, vpx_prob upd, - int stepsize); +int64_t vp9_prob_diff_update_savings_search(const unsigned int *ct, + vpx_prob oldp, vpx_prob *bestp, + vpx_prob upd); + +int64_t vp9_prob_diff_update_savings_search_model(const unsigned int *ct, + const vpx_prob oldp, + vpx_prob *bestp, vpx_prob upd, + int stepsize); #ifdef __cplusplus } // extern "C" From 6bf761a7ef8e8532d8a88c95d255a2873077de5d Mon Sep 17 00:00:00 2001 From: Fyodor Kyslov Date: Wed, 15 Dec 2021 23:11:15 -0800 Subject: [PATCH 0264/1000] vp9 encoder: fix test failure on 32 bit arch test fails with memory error. Reducing testing resolution bug: webm:1750 Change-Id: I75664088022aa660bdf6e69de2d11121db44716f --- test/realtime_test.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/realtime_test.cc b/test/realtime_test.cc index 107f2e224f..853b942824 100644 --- a/test/realtime_test.cc +++ b/test/realtime_test.cc @@ -85,7 +85,11 @@ TEST_P(RealtimeTest, IntegerOverflow) { TestIntegerOverflow(2048, 2048); } TEST_P(RealtimeTest, IntegerOverflowLarge) { if (IsVP9()) { +#if VPX_ARCH_X86_64 TestIntegerOverflow(16384, 16384); +#else + TestIntegerOverflow(4096, 4096); +#endif } else { GTEST_SKIP() << "TODO(https://crbug.com/webm/1748,https://crbug.com/webm/1751):" From 94972ca7ea2a93806d0f902dd222ec164e81997d Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 20 Dec 2021 19:56:01 -0800 Subject: [PATCH 0265/1000] vpx_int_pro_row: normalize declaration w/aom this is a followup to: 7fbcee49d quiet -Warray-parameter warnings and conforms to aom in: 06e13e817 quiet -Warray-parameter warnings the sad functions are more varied in libvpx and will require a separate pass Change-Id: I765fd6704df615e836ba0b184ff8266ce926c394 --- vpx_dsp/avg.c | 2 +- vpx_dsp/vpx_dsp_rtcd_defs.pl | 2 +- vpx_dsp/x86/avg_intrin_sse2.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/vpx_dsp/avg.c b/vpx_dsp/avg.c index c87ab20ffe..1c45e8a73d 100644 --- a/vpx_dsp/avg.c +++ b/vpx_dsp/avg.c @@ -340,7 +340,7 @@ int vpx_satd_c(const tran_low_t *coeff, int length) { // Integer projection onto row vectors. // height: value range {16, 32, 64}. -void vpx_int_pro_row_c(int16_t *hbuf /*[16]*/, const uint8_t *ref, +void vpx_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height) { int idx; const int norm_factor = height >> 1; diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index fd7eefdad0..0144b90c26 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -824,7 +824,7 @@ () specialize qw/vpx_satd avx2 sse2 neon msa/; } - add_proto qw/void vpx_int_pro_row/, "int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height"; + add_proto qw/void vpx_int_pro_row/, "int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height"; specialize qw/vpx_int_pro_row sse2 neon msa/; add_proto qw/int16_t vpx_int_pro_col/, "const uint8_t *ref, const int width"; diff --git a/vpx_dsp/x86/avg_intrin_sse2.c b/vpx_dsp/x86/avg_intrin_sse2.c index 3cba258f61..9da2f34c9b 100644 --- a/vpx_dsp/x86/avg_intrin_sse2.c +++ b/vpx_dsp/x86/avg_intrin_sse2.c @@ -464,7 +464,7 @@ int vpx_satd_sse2(const tran_low_t *coeff, int length) { return _mm_cvtsi128_si32(accum); } -void vpx_int_pro_row_sse2(int16_t *hbuf, const uint8_t *ref, +void vpx_int_pro_row_sse2(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height) { int idx; __m128i zero = _mm_setzero_si128(); From b685d6f02f1e3ccad9b1debd8dcf4c7a06bfaab6 Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 21 Dec 2021 11:53:51 -0800 Subject: [PATCH 0266/1000] vp9_prob_diff_update_savings_search_model: quiet conv warnings under Visual Studio: Warning C4244 '=': conversion from 'int64_t' to 'vpx_prob', possible loss of data after: ea042a676 vp9 encoder: fix integer overflows 'newp' has already been range checked earlier in the loop so the cast won't have any unexpected results Change-Id: Ic10877db2c0633d53fffdf8852d5095403c23a02 --- vp9/encoder/vp9_subexp.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vp9/encoder/vp9_subexp.c b/vp9/encoder/vp9_subexp.c index 661294ba04..3953253dbb 100644 --- a/vp9/encoder/vp9_subexp.c +++ b/vp9/encoder/vp9_subexp.c @@ -163,14 +163,14 @@ int64_t vp9_prob_diff_update_savings_search_model(const unsigned int *ct, for (newp = *bestp; (newp - oldp) * step_sign < 0; newp += step) { if (newp < 1 || newp > 255) continue; newplist = vp9_pareto8_full[newp - 1]; - new_b = cost_branch256(ct + 2 * PIVOT_NODE, newp); + new_b = cost_branch256(ct + 2 * PIVOT_NODE, (vpx_prob)newp); for (i = UNCONSTRAINED_NODES; i < ENTROPY_NODES; ++i) new_b += cost_branch256(ct + 2 * i, newplist[i - UNCONSTRAINED_NODES]); - update_b = prob_diff_update_cost(newp, oldp) + upd_cost; + update_b = prob_diff_update_cost((vpx_prob)newp, oldp) + upd_cost; savings = old_b - new_b - update_b; if (savings > bestsavings) { bestsavings = savings; - bestnewp = newp; + bestnewp = (vpx_prob)newp; } } } From 44e611482e13fdffa0acde780a20dd68ee153498 Mon Sep 17 00:00:00 2001 From: Jianhui Dai Date: Sat, 1 Jan 2022 08:01:48 +0800 Subject: [PATCH 0267/1000] Add vp9 ref frame to flag map function Change-Id: I371c2346b9e0153c0f8053cab399ce14cd286c56 --- vp9/encoder/vp9_bitstream.c | 7 +++--- vp9/encoder/vp9_encoder.c | 4 +-- vp9/encoder/vp9_encoder.h | 15 ++++++++---- vp9/encoder/vp9_pickmode.c | 39 ++++++++++++++---------------- vp9/encoder/vp9_rdopt.c | 16 ++++++------ vp9/encoder/vp9_speed_features.c | 5 ++-- vp9/encoder/vp9_svc_layercontext.c | 18 +++++--------- 7 files changed, 47 insertions(+), 57 deletions(-) diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c index 7644930c1c..c5145acf08 100644 --- a/vp9/encoder/vp9_bitstream.c +++ b/vp9/encoder/vp9_bitstream.c @@ -1241,9 +1241,7 @@ static void write_uncompressed_header(VP9_COMP *cpi, vpx_wb_write_literal(wb, vp9_get_refresh_mask(cpi), REF_FRAMES); write_frame_size(cm, wb); } else { - static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, - VP9_ALT_FLAG }; - const MV_REFERENCE_FRAME first_ref = get_first_ref_frame(cpi); + const int first_ref = get_first_ref_frame(cpi); const int first_ref_map_idx = get_ref_frame_map_idx(cpi, first_ref); MV_REFERENCE_FRAME ref_frame; vpx_wb_write_literal(wb, vp9_get_refresh_mask(cpi), REF_FRAMES); @@ -1251,7 +1249,8 @@ static void write_uncompressed_header(VP9_COMP *cpi, // If a reference frame is not referenced, then set the index for that // reference to the first one used/referenced. for (ref_frame = LAST_FRAME; ref_frame < MAX_REF_FRAMES; ++ref_frame) { - const int referenced = cpi->ref_frame_flags & flag_list[ref_frame]; + const int referenced = + cpi->ref_frame_flags & ref_frame_to_flag(ref_frame); const int map_idx = referenced ? get_ref_frame_map_idx(cpi, ref_frame) : first_ref_map_idx; assert(map_idx != INVALID_IDX); diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 8d5ec5a360..1038bd9515 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -586,8 +586,6 @@ static void apply_roi_map(VP9_COMP *cpi) { int ref_frame[8]; int internal_delta_q[MAX_SEGMENTS]; int i; - static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, - VP9_ALT_FLAG }; // TODO(jianj): Investigate why ROI not working in speed < 5 or in non // realtime mode. @@ -628,7 +626,7 @@ static void apply_roi_map(VP9_COMP *cpi) { valid_ref = 0; // If GOLDEN is selected, make sure it's set as reference. if (ref_frame[i] == GOLDEN_FRAME && - !(cpi->ref_frame_flags & flag_list[ref_frame[i]])) { + !(cpi->ref_frame_flags & ref_frame_to_flag(ref_frame[i]))) { valid_ref = 0; } // GOLDEN was updated in previous encoded frame, so GOLDEN and LAST are diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 1bca7ded75..0059e881be 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -1196,12 +1196,17 @@ static INLINE int frame_is_kf_gf_arf(const VP9_COMP *cpi) { (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref); } -static INLINE MV_REFERENCE_FRAME get_first_ref_frame(VP9_COMP *const cpi) { - static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, - VP9_ALT_FLAG }; - MV_REFERENCE_FRAME ref_frame = LAST_FRAME; +static INLINE int ref_frame_to_flag(int8_t ref_frame) { + static const int kVp9RefFlagList[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, + VP9_ALT_FLAG }; + assert(ref_frame >= LAST_FRAME && ref_frame <= ALTREF_FRAME); + return kVp9RefFlagList[ref_frame]; +} + +static INLINE int get_first_ref_frame(VP9_COMP *const cpi) { + int ref_frame = LAST_FRAME; while (ref_frame < MAX_REF_FRAMES) { - if (cpi->ref_frame_flags & flag_list[ref_frame]) break; + if (cpi->ref_frame_flags & ref_frame_to_flag(ref_frame)) break; ref_frame++; } return ref_frame; diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c index 695fd484fc..c8e167f25b 100644 --- a/vp9/encoder/vp9_pickmode.c +++ b/vp9/encoder/vp9_pickmode.c @@ -1247,7 +1247,7 @@ static INLINE void find_predictors( VP9_COMP *cpi, MACROBLOCK *x, MV_REFERENCE_FRAME ref_frame, int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES], int const_motion[MAX_REF_FRAMES], int *ref_frame_skip_mask, - const int flag_list[4], TileDataEnc *tile_data, int mi_row, int mi_col, + TileDataEnc *tile_data, int mi_row, int mi_col, struct buf_2d yv12_mb[4][MAX_MB_PLANE], BLOCK_SIZE bsize, int force_skip_low_temp_var, int comp_pred_allowed) { VP9_COMMON *const cm = &cpi->common; @@ -1259,7 +1259,7 @@ static INLINE void find_predictors( frame_mv[NEWMV][ref_frame].as_int = INVALID_MV; frame_mv[ZEROMV][ref_frame].as_int = 0; // this needs various further optimizations. to be continued.. - if ((cpi->ref_frame_flags & flag_list[ref_frame]) && (yv12 != NULL)) { + if ((cpi->ref_frame_flags & ref_frame_to_flag(ref_frame)) && (yv12 != NULL)) { int_mv *const candidates = x->mbmi_ext->ref_mvs[ref_frame]; const struct scale_factors *const sf = &cm->frame_refs[ref_frame - 1].sf; vp9_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf, sf); @@ -1690,8 +1690,6 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES]; uint8_t mode_checked[MB_MODE_COUNT][MAX_REF_FRAMES]; struct buf_2d yv12_mb[4][MAX_MB_PLANE]; - static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, - VP9_ALT_FLAG }; RD_COST this_rdc, best_rdc; // var_y and sse_y are saved to be used in skipping checking unsigned int var_y = UINT_MAX; @@ -1925,14 +1923,14 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, // constrain the inter mode to only test zero motion. if (cpi->use_svc && svc->force_zero_mode_spatial_ref && svc->spatial_layer_id > 0 && !gf_temporal_ref) { - if (cpi->ref_frame_flags & flag_list[LAST_FRAME]) { + if (cpi->ref_frame_flags & VP9_LAST_FLAG) { struct scale_factors *const sf = &cm->frame_refs[LAST_FRAME - 1].sf; if (vp9_is_scaled(sf)) { svc_force_zero_mode[LAST_FRAME - 1] = 1; inter_layer_ref = LAST_FRAME; } } - if (cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) { + if (cpi->ref_frame_flags & VP9_GOLD_FLAG) { struct scale_factors *const sf = &cm->frame_refs[GOLDEN_FRAME - 1].sf; if (vp9_is_scaled(sf)) { svc_force_zero_mode[GOLDEN_FRAME - 1] = 1; @@ -1957,7 +1955,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, cpi->rc.avg_frame_low_motion < 60)) usable_ref_frame = LAST_FRAME; - if (!((cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) && + if (!((cpi->ref_frame_flags & VP9_GOLD_FLAG) && !svc_force_zero_mode[GOLDEN_FRAME - 1] && !force_skip_low_temp_var)) use_golden_nonzeromv = 0; @@ -1985,12 +1983,11 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, // Skip find_predictor if the reference frame is not in the // ref_frame_flags (i.e., not used as a reference for this frame). skip_ref_find_pred[ref_frame] = - !(cpi->ref_frame_flags & flag_list[ref_frame]); + !(cpi->ref_frame_flags & ref_frame_to_flag(ref_frame)); if (!skip_ref_find_pred[ref_frame]) { find_predictors(cpi, x, ref_frame, frame_mv, const_motion, - &ref_frame_skip_mask, flag_list, tile_data, mi_row, - mi_col, yv12_mb, bsize, force_skip_low_temp_var, - comp_modes > 0); + &ref_frame_skip_mask, tile_data, mi_row, mi_col, yv12_mb, + bsize, force_skip_low_temp_var, comp_modes > 0); } } @@ -2014,7 +2011,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, // than current layer: force check of GF-ZEROMV before early exit // due to skip flag. if (svc->spatial_layer_id > 0 && no_scaling && - (cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) && + (cpi->ref_frame_flags & VP9_GOLD_FLAG) && cm->base_qindex > svc->lower_layer_qindex + 10) force_test_gf_zeromv = 1; @@ -2094,7 +2091,8 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, if (comp_pred) { if (!cpi->allow_comp_inter_inter) continue; // Skip compound inter modes if ARF is not available. - if (!(cpi->ref_frame_flags & flag_list[second_ref_frame])) continue; + if (!(cpi->ref_frame_flags & ref_frame_to_flag(second_ref_frame))) + continue; // Do not allow compound prediction if the segment level reference frame // feature is in use as in this case there can only be one reference. if (segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME)) continue; @@ -2107,7 +2105,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, (!cpi->use_svc && sse_zeromv_normalized < thresh_skip_golden))) continue; - if (!(cpi->ref_frame_flags & flag_list[ref_frame])) continue; + if (!(cpi->ref_frame_flags & ref_frame_to_flag(ref_frame))) continue; // For screen content. If zero_temp_sad source is computed: skip // non-zero motion check for stationary blocks. If the superblock is @@ -2190,7 +2188,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, if (usable_ref_frame < ALTREF_FRAME) { if (!force_skip_low_temp_var && usable_ref_frame > LAST_FRAME) { i = (ref_frame == LAST_FRAME) ? GOLDEN_FRAME : LAST_FRAME; - if ((cpi->ref_frame_flags & flag_list[i])) + if ((cpi->ref_frame_flags & ref_frame_to_flag(i))) if (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[i] << 1)) ref_frame_skip_mask |= (1 << ref_frame); } @@ -2199,9 +2197,9 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, ref_frame == ALTREF_FRAME)) { int ref1 = (ref_frame == GOLDEN_FRAME) ? LAST_FRAME : GOLDEN_FRAME; int ref2 = (ref_frame == ALTREF_FRAME) ? LAST_FRAME : ALTREF_FRAME; - if (((cpi->ref_frame_flags & flag_list[ref1]) && + if (((cpi->ref_frame_flags & ref_frame_to_flag(ref1)) && (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[ref1] << 1))) || - ((cpi->ref_frame_flags & flag_list[ref2]) && + ((cpi->ref_frame_flags & ref_frame_to_flag(ref2)) && (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[ref2] << 1)))) ref_frame_skip_mask |= (1 << ref_frame); } @@ -2488,7 +2486,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, perform_intra_pred = svc->temporal_layer_id == 0 || svc->layer_context[svc->temporal_layer_id].is_key_frame || - !(cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) || + !(cpi->ref_frame_flags & VP9_GOLD_FLAG) || (!svc->layer_context[svc->temporal_layer_id].is_key_frame && svc_force_zero_mode[best_pickmode.best_ref_frame - 1]); inter_mode_thresh = (inter_mode_thresh << 1) + inter_mode_thresh; @@ -2747,8 +2745,6 @@ void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, int mi_row, MV_REFERENCE_FRAME best_ref_frame = NONE; unsigned char segment_id = mi->segment_id; struct buf_2d yv12_mb[4][MAX_MB_PLANE]; - static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, - VP9_ALT_FLAG }; int64_t best_rd = INT64_MAX; b_mode_info bsi[MAX_REF_FRAMES][4]; int ref_frame_skip_mask = 0; @@ -2764,7 +2760,8 @@ void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, int mi_row, int_mv dummy_mv[2]; x->pred_mv_sad[ref_frame] = INT_MAX; - if ((cpi->ref_frame_flags & flag_list[ref_frame]) && (yv12 != NULL)) { + if ((cpi->ref_frame_flags & ref_frame_to_flag(ref_frame)) && + (yv12 != NULL)) { int_mv *const candidates = mbmi_ext->ref_mvs[ref_frame]; const struct scale_factors *const sf = &cm->frame_refs[ref_frame - 1].sf; vp9_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf, diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index a1687dcf46..0171a05720 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -3315,8 +3315,6 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, int_mv single_newmv[MAX_REF_FRAMES] = { { 0 } }; INTERP_FILTER single_inter_filter[MB_MODE_COUNT][MAX_REF_FRAMES]; int single_skippable[MB_MODE_COUNT][MAX_REF_FRAMES]; - static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, - VP9_ALT_FLAG }; int64_t best_rd = best_rd_so_far; int64_t best_pred_diff[REFERENCE_MODES]; int64_t best_pred_rd[REFERENCE_MODES]; @@ -3392,7 +3390,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { x->pred_mv_sad[ref_frame] = INT_MAX; - if ((cpi->ref_frame_flags & flag_list[ref_frame]) && + if ((cpi->ref_frame_flags & ref_frame_to_flag(ref_frame)) && !(is_rect_partition && (ctx->skip_ref_frame_mask & (1 << ref_frame)))) { assert(get_ref_frame_buffer(cpi, ref_frame) != NULL); setup_buffer_inter(cpi, x, ref_frame, bsize, mi_row, mi_col, @@ -3403,7 +3401,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, } for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { - if (!(cpi->ref_frame_flags & flag_list[ref_frame])) { + if (!(cpi->ref_frame_flags & ref_frame_to_flag(ref_frame))) { // Skip checking missing references in both single and compound reference // modes. Note that a mode will be skipped if both reference frames // are masked out. @@ -3609,7 +3607,8 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, continue; // Skip compound inter modes if ARF is not available. - if (!(cpi->ref_frame_flags & flag_list[second_ref_frame])) continue; + if (!(cpi->ref_frame_flags & ref_frame_to_flag(second_ref_frame))) + continue; // Do not allow compound prediction if the segment level reference frame // feature is in use as in this case there can only be one reference. @@ -4140,8 +4139,6 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data, int comp_pred, i; int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES]; struct buf_2d yv12_mb[4][MAX_MB_PLANE]; - static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, - VP9_ALT_FLAG }; int64_t best_rd = best_rd_so_far; int64_t best_yrd = best_rd_so_far; // FIXME(rbultje) more precise int64_t best_pred_diff[REFERENCE_MODES]; @@ -4191,7 +4188,7 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data, rd_cost->rate = INT_MAX; for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) { - if (cpi->ref_frame_flags & flag_list[ref_frame]) { + if (cpi->ref_frame_flags & ref_frame_to_flag(ref_frame)) { setup_buffer_inter(cpi, x, ref_frame, bsize, mi_row, mi_col, frame_mv[NEARESTMV], frame_mv[NEARMV], yv12_mb); } else { @@ -4276,7 +4273,8 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data, cm->ref_frame_sign_bias[second_ref_frame]) continue; - if (!(cpi->ref_frame_flags & flag_list[second_ref_frame])) continue; + if (!(cpi->ref_frame_flags & ref_frame_to_flag(second_ref_frame))) + continue; // Do not allow compound prediction if the segment level reference frame // feature is in use as in this case there can only be one reference. if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) continue; diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c index 81695e9156..7d7b2c3fb4 100644 --- a/vp9/encoder/vp9_speed_features.c +++ b/vp9/encoder/vp9_speed_features.c @@ -495,11 +495,10 @@ static void set_rt_speed_feature_framesize_independent( (cpi->external_resize == 1 || cpi->oxcf.resize_mode == RESIZE_DYNAMIC)) { MV_REFERENCE_FRAME ref_frame; - static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, - VP9_ALT_FLAG }; for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame); - if (yv12 != NULL && (cpi->ref_frame_flags & flag_list[ref_frame])) { + if (yv12 != NULL && + (cpi->ref_frame_flags & ref_frame_to_flag(ref_frame))) { const struct scale_factors *const scale_fac = &cm->frame_refs[ref_frame - 1].sf; if (vp9_is_scaled(scale_fac)) sf->reference_masking = 0; diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c index f01cb17a2f..30c17fd8e6 100644 --- a/vp9/encoder/vp9_svc_layercontext.c +++ b/vp9/encoder/vp9_svc_layercontext.c @@ -719,8 +719,6 @@ static void set_flags_and_fb_idx_bypass_via_set_ref_frame_config( void vp9_copy_flags_ref_update_idx(VP9_COMP *const cpi) { SVC *const svc = &cpi->svc; - static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, - VP9_ALT_FLAG }; int sl = svc->spatial_layer_id; svc->lst_fb_idx[sl] = cpi->lst_fb_idx; svc->gld_fb_idx[sl] = cpi->gld_fb_idx; @@ -743,12 +741,9 @@ void vp9_copy_flags_ref_update_idx(VP9_COMP *const cpi) { svc->update_golden[sl] = (uint8_t)cpi->refresh_golden_frame; svc->update_altref[sl] = (uint8_t)cpi->refresh_alt_ref_frame; - svc->reference_last[sl] = - (uint8_t)(cpi->ref_frame_flags & flag_list[LAST_FRAME]); - svc->reference_golden[sl] = - (uint8_t)(cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]); - svc->reference_altref[sl] = - (uint8_t)(cpi->ref_frame_flags & flag_list[ALTREF_FRAME]); + svc->reference_last[sl] = (uint8_t)(cpi->ref_frame_flags & VP9_LAST_FLAG); + svc->reference_golden[sl] = (uint8_t)(cpi->ref_frame_flags & VP9_GOLD_FLAG); + svc->reference_altref[sl] = (uint8_t)(cpi->ref_frame_flags & VP9_ALT_FLAG); } int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) { @@ -1069,15 +1064,14 @@ void vp9_svc_constrain_inter_layer_pred(VP9_COMP *const cpi) { svc->disable_inter_layer_pred == INTER_LAYER_PRED_OFF || svc->drop_spatial_layer[sl - 1]) { MV_REFERENCE_FRAME ref_frame; - static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, - VP9_ALT_FLAG }; for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame); - if (yv12 != NULL && (cpi->ref_frame_flags & flag_list[ref_frame])) { + if (yv12 != NULL && + (cpi->ref_frame_flags & ref_frame_to_flag(ref_frame))) { const struct scale_factors *const scale_fac = &cm->frame_refs[ref_frame - 1].sf; if (vp9_is_scaled(scale_fac)) { - cpi->ref_frame_flags &= (~flag_list[ref_frame]); + cpi->ref_frame_flags &= (~ref_frame_to_flag(ref_frame)); // Point golden/altref frame buffer index to last. if (!svc->simulcast_mode) { if (ref_frame == GOLDEN_FRAME) From 6982214de5cc62f1f4dc733f1bcc3ffbd74780b0 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Tue, 11 Jan 2022 08:46:59 -0800 Subject: [PATCH 0268/1000] Revert "Add vp9 ref frame to flag map function" This reverts commit 44e611482e13fdffa0acde780a20dd68ee153498. Change-Id: Ic900cc01be4de7983fab42178a488277efab77b3 --- vp9/encoder/vp9_bitstream.c | 7 +++--- vp9/encoder/vp9_encoder.c | 4 ++- vp9/encoder/vp9_encoder.h | 15 ++++-------- vp9/encoder/vp9_pickmode.c | 39 ++++++++++++++++-------------- vp9/encoder/vp9_rdopt.c | 16 ++++++------ vp9/encoder/vp9_speed_features.c | 5 ++-- vp9/encoder/vp9_svc_layercontext.c | 18 +++++++++----- 7 files changed, 57 insertions(+), 47 deletions(-) diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c index c5145acf08..7644930c1c 100644 --- a/vp9/encoder/vp9_bitstream.c +++ b/vp9/encoder/vp9_bitstream.c @@ -1241,7 +1241,9 @@ static void write_uncompressed_header(VP9_COMP *cpi, vpx_wb_write_literal(wb, vp9_get_refresh_mask(cpi), REF_FRAMES); write_frame_size(cm, wb); } else { - const int first_ref = get_first_ref_frame(cpi); + static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, + VP9_ALT_FLAG }; + const MV_REFERENCE_FRAME first_ref = get_first_ref_frame(cpi); const int first_ref_map_idx = get_ref_frame_map_idx(cpi, first_ref); MV_REFERENCE_FRAME ref_frame; vpx_wb_write_literal(wb, vp9_get_refresh_mask(cpi), REF_FRAMES); @@ -1249,8 +1251,7 @@ static void write_uncompressed_header(VP9_COMP *cpi, // If a reference frame is not referenced, then set the index for that // reference to the first one used/referenced. for (ref_frame = LAST_FRAME; ref_frame < MAX_REF_FRAMES; ++ref_frame) { - const int referenced = - cpi->ref_frame_flags & ref_frame_to_flag(ref_frame); + const int referenced = cpi->ref_frame_flags & flag_list[ref_frame]; const int map_idx = referenced ? get_ref_frame_map_idx(cpi, ref_frame) : first_ref_map_idx; assert(map_idx != INVALID_IDX); diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 1038bd9515..8d5ec5a360 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -586,6 +586,8 @@ static void apply_roi_map(VP9_COMP *cpi) { int ref_frame[8]; int internal_delta_q[MAX_SEGMENTS]; int i; + static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, + VP9_ALT_FLAG }; // TODO(jianj): Investigate why ROI not working in speed < 5 or in non // realtime mode. @@ -626,7 +628,7 @@ static void apply_roi_map(VP9_COMP *cpi) { valid_ref = 0; // If GOLDEN is selected, make sure it's set as reference. if (ref_frame[i] == GOLDEN_FRAME && - !(cpi->ref_frame_flags & ref_frame_to_flag(ref_frame[i]))) { + !(cpi->ref_frame_flags & flag_list[ref_frame[i]])) { valid_ref = 0; } // GOLDEN was updated in previous encoded frame, so GOLDEN and LAST are diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 0059e881be..1bca7ded75 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -1196,17 +1196,12 @@ static INLINE int frame_is_kf_gf_arf(const VP9_COMP *cpi) { (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref); } -static INLINE int ref_frame_to_flag(int8_t ref_frame) { - static const int kVp9RefFlagList[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, - VP9_ALT_FLAG }; - assert(ref_frame >= LAST_FRAME && ref_frame <= ALTREF_FRAME); - return kVp9RefFlagList[ref_frame]; -} - -static INLINE int get_first_ref_frame(VP9_COMP *const cpi) { - int ref_frame = LAST_FRAME; +static INLINE MV_REFERENCE_FRAME get_first_ref_frame(VP9_COMP *const cpi) { + static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, + VP9_ALT_FLAG }; + MV_REFERENCE_FRAME ref_frame = LAST_FRAME; while (ref_frame < MAX_REF_FRAMES) { - if (cpi->ref_frame_flags & ref_frame_to_flag(ref_frame)) break; + if (cpi->ref_frame_flags & flag_list[ref_frame]) break; ref_frame++; } return ref_frame; diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c index c8e167f25b..695fd484fc 100644 --- a/vp9/encoder/vp9_pickmode.c +++ b/vp9/encoder/vp9_pickmode.c @@ -1247,7 +1247,7 @@ static INLINE void find_predictors( VP9_COMP *cpi, MACROBLOCK *x, MV_REFERENCE_FRAME ref_frame, int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES], int const_motion[MAX_REF_FRAMES], int *ref_frame_skip_mask, - TileDataEnc *tile_data, int mi_row, int mi_col, + const int flag_list[4], TileDataEnc *tile_data, int mi_row, int mi_col, struct buf_2d yv12_mb[4][MAX_MB_PLANE], BLOCK_SIZE bsize, int force_skip_low_temp_var, int comp_pred_allowed) { VP9_COMMON *const cm = &cpi->common; @@ -1259,7 +1259,7 @@ static INLINE void find_predictors( frame_mv[NEWMV][ref_frame].as_int = INVALID_MV; frame_mv[ZEROMV][ref_frame].as_int = 0; // this needs various further optimizations. to be continued.. - if ((cpi->ref_frame_flags & ref_frame_to_flag(ref_frame)) && (yv12 != NULL)) { + if ((cpi->ref_frame_flags & flag_list[ref_frame]) && (yv12 != NULL)) { int_mv *const candidates = x->mbmi_ext->ref_mvs[ref_frame]; const struct scale_factors *const sf = &cm->frame_refs[ref_frame - 1].sf; vp9_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf, sf); @@ -1690,6 +1690,8 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES]; uint8_t mode_checked[MB_MODE_COUNT][MAX_REF_FRAMES]; struct buf_2d yv12_mb[4][MAX_MB_PLANE]; + static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, + VP9_ALT_FLAG }; RD_COST this_rdc, best_rdc; // var_y and sse_y are saved to be used in skipping checking unsigned int var_y = UINT_MAX; @@ -1923,14 +1925,14 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, // constrain the inter mode to only test zero motion. if (cpi->use_svc && svc->force_zero_mode_spatial_ref && svc->spatial_layer_id > 0 && !gf_temporal_ref) { - if (cpi->ref_frame_flags & VP9_LAST_FLAG) { + if (cpi->ref_frame_flags & flag_list[LAST_FRAME]) { struct scale_factors *const sf = &cm->frame_refs[LAST_FRAME - 1].sf; if (vp9_is_scaled(sf)) { svc_force_zero_mode[LAST_FRAME - 1] = 1; inter_layer_ref = LAST_FRAME; } } - if (cpi->ref_frame_flags & VP9_GOLD_FLAG) { + if (cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) { struct scale_factors *const sf = &cm->frame_refs[GOLDEN_FRAME - 1].sf; if (vp9_is_scaled(sf)) { svc_force_zero_mode[GOLDEN_FRAME - 1] = 1; @@ -1955,7 +1957,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, cpi->rc.avg_frame_low_motion < 60)) usable_ref_frame = LAST_FRAME; - if (!((cpi->ref_frame_flags & VP9_GOLD_FLAG) && + if (!((cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) && !svc_force_zero_mode[GOLDEN_FRAME - 1] && !force_skip_low_temp_var)) use_golden_nonzeromv = 0; @@ -1983,11 +1985,12 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, // Skip find_predictor if the reference frame is not in the // ref_frame_flags (i.e., not used as a reference for this frame). skip_ref_find_pred[ref_frame] = - !(cpi->ref_frame_flags & ref_frame_to_flag(ref_frame)); + !(cpi->ref_frame_flags & flag_list[ref_frame]); if (!skip_ref_find_pred[ref_frame]) { find_predictors(cpi, x, ref_frame, frame_mv, const_motion, - &ref_frame_skip_mask, tile_data, mi_row, mi_col, yv12_mb, - bsize, force_skip_low_temp_var, comp_modes > 0); + &ref_frame_skip_mask, flag_list, tile_data, mi_row, + mi_col, yv12_mb, bsize, force_skip_low_temp_var, + comp_modes > 0); } } @@ -2011,7 +2014,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, // than current layer: force check of GF-ZEROMV before early exit // due to skip flag. if (svc->spatial_layer_id > 0 && no_scaling && - (cpi->ref_frame_flags & VP9_GOLD_FLAG) && + (cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) && cm->base_qindex > svc->lower_layer_qindex + 10) force_test_gf_zeromv = 1; @@ -2091,8 +2094,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, if (comp_pred) { if (!cpi->allow_comp_inter_inter) continue; // Skip compound inter modes if ARF is not available. - if (!(cpi->ref_frame_flags & ref_frame_to_flag(second_ref_frame))) - continue; + if (!(cpi->ref_frame_flags & flag_list[second_ref_frame])) continue; // Do not allow compound prediction if the segment level reference frame // feature is in use as in this case there can only be one reference. if (segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME)) continue; @@ -2105,7 +2107,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, (!cpi->use_svc && sse_zeromv_normalized < thresh_skip_golden))) continue; - if (!(cpi->ref_frame_flags & ref_frame_to_flag(ref_frame))) continue; + if (!(cpi->ref_frame_flags & flag_list[ref_frame])) continue; // For screen content. If zero_temp_sad source is computed: skip // non-zero motion check for stationary blocks. If the superblock is @@ -2188,7 +2190,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, if (usable_ref_frame < ALTREF_FRAME) { if (!force_skip_low_temp_var && usable_ref_frame > LAST_FRAME) { i = (ref_frame == LAST_FRAME) ? GOLDEN_FRAME : LAST_FRAME; - if ((cpi->ref_frame_flags & ref_frame_to_flag(i))) + if ((cpi->ref_frame_flags & flag_list[i])) if (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[i] << 1)) ref_frame_skip_mask |= (1 << ref_frame); } @@ -2197,9 +2199,9 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, ref_frame == ALTREF_FRAME)) { int ref1 = (ref_frame == GOLDEN_FRAME) ? LAST_FRAME : GOLDEN_FRAME; int ref2 = (ref_frame == ALTREF_FRAME) ? LAST_FRAME : ALTREF_FRAME; - if (((cpi->ref_frame_flags & ref_frame_to_flag(ref1)) && + if (((cpi->ref_frame_flags & flag_list[ref1]) && (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[ref1] << 1))) || - ((cpi->ref_frame_flags & ref_frame_to_flag(ref2)) && + ((cpi->ref_frame_flags & flag_list[ref2]) && (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[ref2] << 1)))) ref_frame_skip_mask |= (1 << ref_frame); } @@ -2486,7 +2488,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, perform_intra_pred = svc->temporal_layer_id == 0 || svc->layer_context[svc->temporal_layer_id].is_key_frame || - !(cpi->ref_frame_flags & VP9_GOLD_FLAG) || + !(cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) || (!svc->layer_context[svc->temporal_layer_id].is_key_frame && svc_force_zero_mode[best_pickmode.best_ref_frame - 1]); inter_mode_thresh = (inter_mode_thresh << 1) + inter_mode_thresh; @@ -2745,6 +2747,8 @@ void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, int mi_row, MV_REFERENCE_FRAME best_ref_frame = NONE; unsigned char segment_id = mi->segment_id; struct buf_2d yv12_mb[4][MAX_MB_PLANE]; + static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, + VP9_ALT_FLAG }; int64_t best_rd = INT64_MAX; b_mode_info bsi[MAX_REF_FRAMES][4]; int ref_frame_skip_mask = 0; @@ -2760,8 +2764,7 @@ void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, int mi_row, int_mv dummy_mv[2]; x->pred_mv_sad[ref_frame] = INT_MAX; - if ((cpi->ref_frame_flags & ref_frame_to_flag(ref_frame)) && - (yv12 != NULL)) { + if ((cpi->ref_frame_flags & flag_list[ref_frame]) && (yv12 != NULL)) { int_mv *const candidates = mbmi_ext->ref_mvs[ref_frame]; const struct scale_factors *const sf = &cm->frame_refs[ref_frame - 1].sf; vp9_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf, diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index 0171a05720..a1687dcf46 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -3315,6 +3315,8 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, int_mv single_newmv[MAX_REF_FRAMES] = { { 0 } }; INTERP_FILTER single_inter_filter[MB_MODE_COUNT][MAX_REF_FRAMES]; int single_skippable[MB_MODE_COUNT][MAX_REF_FRAMES]; + static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, + VP9_ALT_FLAG }; int64_t best_rd = best_rd_so_far; int64_t best_pred_diff[REFERENCE_MODES]; int64_t best_pred_rd[REFERENCE_MODES]; @@ -3390,7 +3392,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { x->pred_mv_sad[ref_frame] = INT_MAX; - if ((cpi->ref_frame_flags & ref_frame_to_flag(ref_frame)) && + if ((cpi->ref_frame_flags & flag_list[ref_frame]) && !(is_rect_partition && (ctx->skip_ref_frame_mask & (1 << ref_frame)))) { assert(get_ref_frame_buffer(cpi, ref_frame) != NULL); setup_buffer_inter(cpi, x, ref_frame, bsize, mi_row, mi_col, @@ -3401,7 +3403,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, } for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { - if (!(cpi->ref_frame_flags & ref_frame_to_flag(ref_frame))) { + if (!(cpi->ref_frame_flags & flag_list[ref_frame])) { // Skip checking missing references in both single and compound reference // modes. Note that a mode will be skipped if both reference frames // are masked out. @@ -3607,8 +3609,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, continue; // Skip compound inter modes if ARF is not available. - if (!(cpi->ref_frame_flags & ref_frame_to_flag(second_ref_frame))) - continue; + if (!(cpi->ref_frame_flags & flag_list[second_ref_frame])) continue; // Do not allow compound prediction if the segment level reference frame // feature is in use as in this case there can only be one reference. @@ -4139,6 +4140,8 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data, int comp_pred, i; int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES]; struct buf_2d yv12_mb[4][MAX_MB_PLANE]; + static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, + VP9_ALT_FLAG }; int64_t best_rd = best_rd_so_far; int64_t best_yrd = best_rd_so_far; // FIXME(rbultje) more precise int64_t best_pred_diff[REFERENCE_MODES]; @@ -4188,7 +4191,7 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data, rd_cost->rate = INT_MAX; for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) { - if (cpi->ref_frame_flags & ref_frame_to_flag(ref_frame)) { + if (cpi->ref_frame_flags & flag_list[ref_frame]) { setup_buffer_inter(cpi, x, ref_frame, bsize, mi_row, mi_col, frame_mv[NEARESTMV], frame_mv[NEARMV], yv12_mb); } else { @@ -4273,8 +4276,7 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data, cm->ref_frame_sign_bias[second_ref_frame]) continue; - if (!(cpi->ref_frame_flags & ref_frame_to_flag(second_ref_frame))) - continue; + if (!(cpi->ref_frame_flags & flag_list[second_ref_frame])) continue; // Do not allow compound prediction if the segment level reference frame // feature is in use as in this case there can only be one reference. if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) continue; diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c index 7d7b2c3fb4..81695e9156 100644 --- a/vp9/encoder/vp9_speed_features.c +++ b/vp9/encoder/vp9_speed_features.c @@ -495,10 +495,11 @@ static void set_rt_speed_feature_framesize_independent( (cpi->external_resize == 1 || cpi->oxcf.resize_mode == RESIZE_DYNAMIC)) { MV_REFERENCE_FRAME ref_frame; + static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, + VP9_ALT_FLAG }; for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame); - if (yv12 != NULL && - (cpi->ref_frame_flags & ref_frame_to_flag(ref_frame))) { + if (yv12 != NULL && (cpi->ref_frame_flags & flag_list[ref_frame])) { const struct scale_factors *const scale_fac = &cm->frame_refs[ref_frame - 1].sf; if (vp9_is_scaled(scale_fac)) sf->reference_masking = 0; diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c index 30c17fd8e6..f01cb17a2f 100644 --- a/vp9/encoder/vp9_svc_layercontext.c +++ b/vp9/encoder/vp9_svc_layercontext.c @@ -719,6 +719,8 @@ static void set_flags_and_fb_idx_bypass_via_set_ref_frame_config( void vp9_copy_flags_ref_update_idx(VP9_COMP *const cpi) { SVC *const svc = &cpi->svc; + static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, + VP9_ALT_FLAG }; int sl = svc->spatial_layer_id; svc->lst_fb_idx[sl] = cpi->lst_fb_idx; svc->gld_fb_idx[sl] = cpi->gld_fb_idx; @@ -741,9 +743,12 @@ void vp9_copy_flags_ref_update_idx(VP9_COMP *const cpi) { svc->update_golden[sl] = (uint8_t)cpi->refresh_golden_frame; svc->update_altref[sl] = (uint8_t)cpi->refresh_alt_ref_frame; - svc->reference_last[sl] = (uint8_t)(cpi->ref_frame_flags & VP9_LAST_FLAG); - svc->reference_golden[sl] = (uint8_t)(cpi->ref_frame_flags & VP9_GOLD_FLAG); - svc->reference_altref[sl] = (uint8_t)(cpi->ref_frame_flags & VP9_ALT_FLAG); + svc->reference_last[sl] = + (uint8_t)(cpi->ref_frame_flags & flag_list[LAST_FRAME]); + svc->reference_golden[sl] = + (uint8_t)(cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]); + svc->reference_altref[sl] = + (uint8_t)(cpi->ref_frame_flags & flag_list[ALTREF_FRAME]); } int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) { @@ -1064,14 +1069,15 @@ void vp9_svc_constrain_inter_layer_pred(VP9_COMP *const cpi) { svc->disable_inter_layer_pred == INTER_LAYER_PRED_OFF || svc->drop_spatial_layer[sl - 1]) { MV_REFERENCE_FRAME ref_frame; + static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, + VP9_ALT_FLAG }; for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame); - if (yv12 != NULL && - (cpi->ref_frame_flags & ref_frame_to_flag(ref_frame))) { + if (yv12 != NULL && (cpi->ref_frame_flags & flag_list[ref_frame])) { const struct scale_factors *const scale_fac = &cm->frame_refs[ref_frame - 1].sf; if (vp9_is_scaled(scale_fac)) { - cpi->ref_frame_flags &= (~ref_frame_to_flag(ref_frame)); + cpi->ref_frame_flags &= (~flag_list[ref_frame]); // Point golden/altref frame buffer index to last. if (!svc->simulcast_mode) { if (ref_frame == GOLDEN_FRAME) From 51415c4076578d3cbc32fcd0d683161c3e887814 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Tue, 11 Jan 2022 08:47:52 -0800 Subject: [PATCH 0269/1000] Revert "Set unused reference frames to first ref" This reverts commit e7f33a53cf404bbb3688af9b13375b5c090daae4. Change-Id: I54e807220885cb78af6f3c6e48b3eb2c9f1e70b4 --- vp9/encoder/vp9_bitstream.c | 17 +++------ vp9/encoder/vp9_encoder.h | 22 ++++-------- vp9/encoder/vp9_svc_layercontext.c | 55 ++++++++++++++++++------------ vpx/vp8cx.h | 17 ++++----- 4 files changed, 50 insertions(+), 61 deletions(-) diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c index 7644930c1c..99cc2ee831 100644 --- a/vp9/encoder/vp9_bitstream.c +++ b/vp9/encoder/vp9_bitstream.c @@ -1241,21 +1241,12 @@ static void write_uncompressed_header(VP9_COMP *cpi, vpx_wb_write_literal(wb, vp9_get_refresh_mask(cpi), REF_FRAMES); write_frame_size(cm, wb); } else { - static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, - VP9_ALT_FLAG }; - const MV_REFERENCE_FRAME first_ref = get_first_ref_frame(cpi); - const int first_ref_map_idx = get_ref_frame_map_idx(cpi, first_ref); MV_REFERENCE_FRAME ref_frame; vpx_wb_write_literal(wb, vp9_get_refresh_mask(cpi), REF_FRAMES); - - // If a reference frame is not referenced, then set the index for that - // reference to the first one used/referenced. - for (ref_frame = LAST_FRAME; ref_frame < MAX_REF_FRAMES; ++ref_frame) { - const int referenced = cpi->ref_frame_flags & flag_list[ref_frame]; - const int map_idx = referenced ? get_ref_frame_map_idx(cpi, ref_frame) - : first_ref_map_idx; - assert(map_idx != INVALID_IDX); - vpx_wb_write_literal(wb, map_idx, REF_FRAMES_LOG2); + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + assert(get_ref_frame_map_idx(cpi, ref_frame) != INVALID_IDX); + vpx_wb_write_literal(wb, get_ref_frame_map_idx(cpi, ref_frame), + REF_FRAMES_LOG2); vpx_wb_write_bit(wb, cm->ref_frame_sign_bias[ref_frame]); } diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 1bca7ded75..9774a64ccf 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -1196,24 +1196,14 @@ static INLINE int frame_is_kf_gf_arf(const VP9_COMP *cpi) { (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref); } -static INLINE MV_REFERENCE_FRAME get_first_ref_frame(VP9_COMP *const cpi) { - static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, - VP9_ALT_FLAG }; - MV_REFERENCE_FRAME ref_frame = LAST_FRAME; - while (ref_frame < MAX_REF_FRAMES) { - if (cpi->ref_frame_flags & flag_list[ref_frame]) break; - ref_frame++; - } - return ref_frame; -} - static INLINE int get_ref_frame_map_idx(const VP9_COMP *cpi, MV_REFERENCE_FRAME ref_frame) { - switch (ref_frame) { - case LAST_FRAME: return cpi->lst_fb_idx; - case GOLDEN_FRAME: return cpi->gld_fb_idx; - case ALTREF_FRAME: return cpi->alt_fb_idx; - default: return INVALID_IDX; + if (ref_frame == LAST_FRAME) { + return cpi->lst_fb_idx; + } else if (ref_frame == GOLDEN_FRAME) { + return cpi->gld_fb_idx; + } else { + return cpi->alt_fb_idx; } } diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c index f01cb17a2f..ad3a8f7afa 100644 --- a/vp9/encoder/vp9_svc_layercontext.c +++ b/vp9/encoder/vp9_svc_layercontext.c @@ -73,7 +73,7 @@ void vp9_init_layer_context(VP9_COMP *const cpi) { svc->downsample_filter_type[sl] = BILINEAR; svc->downsample_filter_phase[sl] = 8; // Set to 8 for averaging filter. svc->framedrop_thresh[sl] = oxcf->drop_frames_water_mark; - svc->fb_idx_upd_tl0[sl] = INVALID_IDX; + svc->fb_idx_upd_tl0[sl] = -1; svc->drop_count[sl] = 0; svc->spatial_layer_sync[sl] = 0; svc->force_drop_constrained_from_above[sl] = 0; @@ -462,22 +462,33 @@ static void reset_fb_idx_unused(VP9_COMP *const cpi) { // fb_idx for that reference to the first one used/referenced. // This is to avoid setting fb_idx for a reference to a slot that is not // used/needed (i.e., since that reference is not referenced or refreshed). - const MV_REFERENCE_FRAME first_ref = get_first_ref_frame(cpi); - const int map_idx = get_ref_frame_map_idx(cpi, first_ref); - if (map_idx != INVALID_IDX) { - if (!(cpi->ref_frame_flags & VP9_LAST_FLAG || - cpi->ext_refresh_last_frame)) { - cpi->lst_fb_idx = map_idx; - } - if (!(cpi->ref_frame_flags & VP9_GOLD_FLAG || - cpi->ext_refresh_golden_frame)) { - cpi->gld_fb_idx = map_idx; - } - if (!(cpi->ref_frame_flags & VP9_ALT_FLAG || - cpi->ext_refresh_alt_ref_frame)) { - cpi->alt_fb_idx = map_idx; + static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, + VP9_ALT_FLAG }; + MV_REFERENCE_FRAME ref_frame; + MV_REFERENCE_FRAME first_ref = 0; + int first_fb_idx = 0; + int fb_idx[3] = { cpi->lst_fb_idx, cpi->gld_fb_idx, cpi->alt_fb_idx }; + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) { + if (cpi->ref_frame_flags & flag_list[ref_frame]) { + first_ref = ref_frame; + first_fb_idx = fb_idx[ref_frame - 1]; + break; } } + if (first_ref > 0) { + if (first_ref != LAST_FRAME && + !(cpi->ref_frame_flags & flag_list[LAST_FRAME]) && + !cpi->ext_refresh_last_frame) + cpi->lst_fb_idx = first_fb_idx; + else if (first_ref != GOLDEN_FRAME && + !(cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) && + !cpi->ext_refresh_golden_frame) + cpi->gld_fb_idx = first_fb_idx; + else if (first_ref != ALTREF_FRAME && + !(cpi->ref_frame_flags & flag_list[ALTREF_FRAME]) && + !cpi->ext_refresh_alt_ref_frame) + cpi->alt_fb_idx = first_fb_idx; + } } // Never refresh any reference frame buffers on top temporal layers in @@ -705,9 +716,9 @@ static void set_flags_and_fb_idx_bypass_via_set_ref_frame_config( int sl = svc->spatial_layer_id = svc->spatial_layer_to_encode; cpi->svc.temporal_layer_id = cpi->svc.temporal_layer_id_per_spatial[sl]; cpi->ext_refresh_frame_flags_pending = 1; - if (svc->reference_last[sl]) cpi->lst_fb_idx = svc->lst_fb_idx[sl]; - if (svc->reference_golden[sl]) cpi->gld_fb_idx = svc->gld_fb_idx[sl]; - if (svc->reference_altref[sl]) cpi->alt_fb_idx = svc->alt_fb_idx[sl]; + cpi->lst_fb_idx = svc->lst_fb_idx[sl]; + cpi->gld_fb_idx = svc->gld_fb_idx[sl]; + cpi->alt_fb_idx = svc->alt_fb_idx[sl]; cpi->ext_refresh_last_frame = 0; cpi->ext_refresh_golden_frame = 0; cpi->ext_refresh_alt_ref_frame = 0; @@ -864,9 +875,9 @@ int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) { // flags are passed via the encode call (bypass mode). Issue is that we're // resetting ext_refresh_frame_flags_pending to 0 on frame drops. if (svc->temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_BYPASS) { - memset(&svc->lst_fb_idx, INVALID_IDX, sizeof(svc->lst_fb_idx)); - memset(&svc->gld_fb_idx, INVALID_IDX, sizeof(svc->lst_fb_idx)); - memset(&svc->alt_fb_idx, INVALID_IDX, sizeof(svc->lst_fb_idx)); + memset(&svc->lst_fb_idx, -1, sizeof(svc->lst_fb_idx)); + memset(&svc->gld_fb_idx, -1, sizeof(svc->lst_fb_idx)); + memset(&svc->alt_fb_idx, -1, sizeof(svc->lst_fb_idx)); // These are set by API before the superframe is encoded and they are // passed to encoder layer by layer. Don't reset them on layer 0 in bypass // mode. @@ -959,7 +970,7 @@ int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) { if (svc->temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_BYPASS && svc->last_layer_dropped[svc->spatial_layer_id] && - svc->fb_idx_upd_tl0[svc->spatial_layer_id] != INVALID_IDX && + svc->fb_idx_upd_tl0[svc->spatial_layer_id] != -1 && !svc->layer_context[svc->temporal_layer_id].is_key_frame) { // For fixed/non-flexible mode, if the previous frame (same spatial layer // from previous superframe) was dropped, make sure the lst_fb_idx diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h index b3c50a9b67..47c38d3b5e 100644 --- a/vpx/vp8cx.h +++ b/vpx/vp8cx.h @@ -897,16 +897,13 @@ typedef struct vpx_svc_ref_frame_config { int alt_fb_idx[VPX_SS_MAX_LAYERS]; /**< Altref buffer index. */ int update_buffer_slot[VPX_SS_MAX_LAYERS]; /**< Update reference frames. */ // TODO(jianj): Remove update_last/golden/alt_ref, these are deprecated. - int update_last[VPX_SS_MAX_LAYERS]; /**< Update last. */ - int update_golden[VPX_SS_MAX_LAYERS]; /**< Update golden. */ - int update_alt_ref[VPX_SS_MAX_LAYERS]; /**< Update altref. */ - int reference_last[VPX_SS_MAX_LAYERS]; - /**< Last as reference. Use first referenced index if FALSE. */ - int reference_golden[VPX_SS_MAX_LAYERS]; - /**< Golden as reference. Use first referenced index if FALSE. */ - int reference_alt_ref[VPX_SS_MAX_LAYERS]; - /**< Altref as reference. Use first referenced index if FALSE. */ - int64_t duration[VPX_SS_MAX_LAYERS]; /**< Duration per spatial layer. */ + int update_last[VPX_SS_MAX_LAYERS]; /**< Update last. */ + int update_golden[VPX_SS_MAX_LAYERS]; /**< Update golden. */ + int update_alt_ref[VPX_SS_MAX_LAYERS]; /**< Update altref. */ + int reference_last[VPX_SS_MAX_LAYERS]; /**< Last as reference. */ + int reference_golden[VPX_SS_MAX_LAYERS]; /**< Golden as reference. */ + int reference_alt_ref[VPX_SS_MAX_LAYERS]; /**< Altref as reference. */ + int64_t duration[VPX_SS_MAX_LAYERS]; /**< Duration per spatial layer. */ } vpx_svc_ref_frame_config_t; /*!\brief VP9 svc frame dropping mode. From 82014b6675ef9acf20cb2bb42c83f95d9e33906b Mon Sep 17 00:00:00 2001 From: Jianhui Dai Date: Sat, 1 Jan 2022 08:01:48 +0800 Subject: [PATCH 0270/1000] Reland "Add vp9 ref frame to flag map function" Original change's description: > Add vp9 ref frame to flag map function > > Change-Id: I371c2346b9e0153c0f8053cab399ce14cd286c56 Change-Id: I04a407ee0ef66c01a0d224b4468e043213f8791f --- vp9/encoder/vp9_encoder.c | 4 +-- vp9/encoder/vp9_encoder.h | 7 ++++++ vp9/encoder/vp9_pickmode.c | 39 ++++++++++++++---------------- vp9/encoder/vp9_rdopt.c | 16 ++++++------ vp9/encoder/vp9_speed_features.c | 5 ++-- vp9/encoder/vp9_svc_layercontext.c | 29 ++++++++-------------- 6 files changed, 45 insertions(+), 55 deletions(-) diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 8d5ec5a360..1038bd9515 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -586,8 +586,6 @@ static void apply_roi_map(VP9_COMP *cpi) { int ref_frame[8]; int internal_delta_q[MAX_SEGMENTS]; int i; - static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, - VP9_ALT_FLAG }; // TODO(jianj): Investigate why ROI not working in speed < 5 or in non // realtime mode. @@ -628,7 +626,7 @@ static void apply_roi_map(VP9_COMP *cpi) { valid_ref = 0; // If GOLDEN is selected, make sure it's set as reference. if (ref_frame[i] == GOLDEN_FRAME && - !(cpi->ref_frame_flags & flag_list[ref_frame[i]])) { + !(cpi->ref_frame_flags & ref_frame_to_flag(ref_frame[i]))) { valid_ref = 0; } // GOLDEN was updated in previous encoded frame, so GOLDEN and LAST are diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 9774a64ccf..1d58945250 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -1196,6 +1196,13 @@ static INLINE int frame_is_kf_gf_arf(const VP9_COMP *cpi) { (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref); } +static INLINE int ref_frame_to_flag(int8_t ref_frame) { + static const int kVp9RefFlagList[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, + VP9_ALT_FLAG }; + assert(ref_frame >= LAST_FRAME && ref_frame <= ALTREF_FRAME); + return kVp9RefFlagList[ref_frame]; +} + static INLINE int get_ref_frame_map_idx(const VP9_COMP *cpi, MV_REFERENCE_FRAME ref_frame) { if (ref_frame == LAST_FRAME) { diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c index 695fd484fc..c8e167f25b 100644 --- a/vp9/encoder/vp9_pickmode.c +++ b/vp9/encoder/vp9_pickmode.c @@ -1247,7 +1247,7 @@ static INLINE void find_predictors( VP9_COMP *cpi, MACROBLOCK *x, MV_REFERENCE_FRAME ref_frame, int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES], int const_motion[MAX_REF_FRAMES], int *ref_frame_skip_mask, - const int flag_list[4], TileDataEnc *tile_data, int mi_row, int mi_col, + TileDataEnc *tile_data, int mi_row, int mi_col, struct buf_2d yv12_mb[4][MAX_MB_PLANE], BLOCK_SIZE bsize, int force_skip_low_temp_var, int comp_pred_allowed) { VP9_COMMON *const cm = &cpi->common; @@ -1259,7 +1259,7 @@ static INLINE void find_predictors( frame_mv[NEWMV][ref_frame].as_int = INVALID_MV; frame_mv[ZEROMV][ref_frame].as_int = 0; // this needs various further optimizations. to be continued.. - if ((cpi->ref_frame_flags & flag_list[ref_frame]) && (yv12 != NULL)) { + if ((cpi->ref_frame_flags & ref_frame_to_flag(ref_frame)) && (yv12 != NULL)) { int_mv *const candidates = x->mbmi_ext->ref_mvs[ref_frame]; const struct scale_factors *const sf = &cm->frame_refs[ref_frame - 1].sf; vp9_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf, sf); @@ -1690,8 +1690,6 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES]; uint8_t mode_checked[MB_MODE_COUNT][MAX_REF_FRAMES]; struct buf_2d yv12_mb[4][MAX_MB_PLANE]; - static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, - VP9_ALT_FLAG }; RD_COST this_rdc, best_rdc; // var_y and sse_y are saved to be used in skipping checking unsigned int var_y = UINT_MAX; @@ -1925,14 +1923,14 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, // constrain the inter mode to only test zero motion. if (cpi->use_svc && svc->force_zero_mode_spatial_ref && svc->spatial_layer_id > 0 && !gf_temporal_ref) { - if (cpi->ref_frame_flags & flag_list[LAST_FRAME]) { + if (cpi->ref_frame_flags & VP9_LAST_FLAG) { struct scale_factors *const sf = &cm->frame_refs[LAST_FRAME - 1].sf; if (vp9_is_scaled(sf)) { svc_force_zero_mode[LAST_FRAME - 1] = 1; inter_layer_ref = LAST_FRAME; } } - if (cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) { + if (cpi->ref_frame_flags & VP9_GOLD_FLAG) { struct scale_factors *const sf = &cm->frame_refs[GOLDEN_FRAME - 1].sf; if (vp9_is_scaled(sf)) { svc_force_zero_mode[GOLDEN_FRAME - 1] = 1; @@ -1957,7 +1955,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, cpi->rc.avg_frame_low_motion < 60)) usable_ref_frame = LAST_FRAME; - if (!((cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) && + if (!((cpi->ref_frame_flags & VP9_GOLD_FLAG) && !svc_force_zero_mode[GOLDEN_FRAME - 1] && !force_skip_low_temp_var)) use_golden_nonzeromv = 0; @@ -1985,12 +1983,11 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, // Skip find_predictor if the reference frame is not in the // ref_frame_flags (i.e., not used as a reference for this frame). skip_ref_find_pred[ref_frame] = - !(cpi->ref_frame_flags & flag_list[ref_frame]); + !(cpi->ref_frame_flags & ref_frame_to_flag(ref_frame)); if (!skip_ref_find_pred[ref_frame]) { find_predictors(cpi, x, ref_frame, frame_mv, const_motion, - &ref_frame_skip_mask, flag_list, tile_data, mi_row, - mi_col, yv12_mb, bsize, force_skip_low_temp_var, - comp_modes > 0); + &ref_frame_skip_mask, tile_data, mi_row, mi_col, yv12_mb, + bsize, force_skip_low_temp_var, comp_modes > 0); } } @@ -2014,7 +2011,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, // than current layer: force check of GF-ZEROMV before early exit // due to skip flag. if (svc->spatial_layer_id > 0 && no_scaling && - (cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) && + (cpi->ref_frame_flags & VP9_GOLD_FLAG) && cm->base_qindex > svc->lower_layer_qindex + 10) force_test_gf_zeromv = 1; @@ -2094,7 +2091,8 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, if (comp_pred) { if (!cpi->allow_comp_inter_inter) continue; // Skip compound inter modes if ARF is not available. - if (!(cpi->ref_frame_flags & flag_list[second_ref_frame])) continue; + if (!(cpi->ref_frame_flags & ref_frame_to_flag(second_ref_frame))) + continue; // Do not allow compound prediction if the segment level reference frame // feature is in use as in this case there can only be one reference. if (segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME)) continue; @@ -2107,7 +2105,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, (!cpi->use_svc && sse_zeromv_normalized < thresh_skip_golden))) continue; - if (!(cpi->ref_frame_flags & flag_list[ref_frame])) continue; + if (!(cpi->ref_frame_flags & ref_frame_to_flag(ref_frame))) continue; // For screen content. If zero_temp_sad source is computed: skip // non-zero motion check for stationary blocks. If the superblock is @@ -2190,7 +2188,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, if (usable_ref_frame < ALTREF_FRAME) { if (!force_skip_low_temp_var && usable_ref_frame > LAST_FRAME) { i = (ref_frame == LAST_FRAME) ? GOLDEN_FRAME : LAST_FRAME; - if ((cpi->ref_frame_flags & flag_list[i])) + if ((cpi->ref_frame_flags & ref_frame_to_flag(i))) if (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[i] << 1)) ref_frame_skip_mask |= (1 << ref_frame); } @@ -2199,9 +2197,9 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, ref_frame == ALTREF_FRAME)) { int ref1 = (ref_frame == GOLDEN_FRAME) ? LAST_FRAME : GOLDEN_FRAME; int ref2 = (ref_frame == ALTREF_FRAME) ? LAST_FRAME : ALTREF_FRAME; - if (((cpi->ref_frame_flags & flag_list[ref1]) && + if (((cpi->ref_frame_flags & ref_frame_to_flag(ref1)) && (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[ref1] << 1))) || - ((cpi->ref_frame_flags & flag_list[ref2]) && + ((cpi->ref_frame_flags & ref_frame_to_flag(ref2)) && (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[ref2] << 1)))) ref_frame_skip_mask |= (1 << ref_frame); } @@ -2488,7 +2486,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, perform_intra_pred = svc->temporal_layer_id == 0 || svc->layer_context[svc->temporal_layer_id].is_key_frame || - !(cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) || + !(cpi->ref_frame_flags & VP9_GOLD_FLAG) || (!svc->layer_context[svc->temporal_layer_id].is_key_frame && svc_force_zero_mode[best_pickmode.best_ref_frame - 1]); inter_mode_thresh = (inter_mode_thresh << 1) + inter_mode_thresh; @@ -2747,8 +2745,6 @@ void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, int mi_row, MV_REFERENCE_FRAME best_ref_frame = NONE; unsigned char segment_id = mi->segment_id; struct buf_2d yv12_mb[4][MAX_MB_PLANE]; - static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, - VP9_ALT_FLAG }; int64_t best_rd = INT64_MAX; b_mode_info bsi[MAX_REF_FRAMES][4]; int ref_frame_skip_mask = 0; @@ -2764,7 +2760,8 @@ void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, int mi_row, int_mv dummy_mv[2]; x->pred_mv_sad[ref_frame] = INT_MAX; - if ((cpi->ref_frame_flags & flag_list[ref_frame]) && (yv12 != NULL)) { + if ((cpi->ref_frame_flags & ref_frame_to_flag(ref_frame)) && + (yv12 != NULL)) { int_mv *const candidates = mbmi_ext->ref_mvs[ref_frame]; const struct scale_factors *const sf = &cm->frame_refs[ref_frame - 1].sf; vp9_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf, diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index a1687dcf46..0171a05720 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -3315,8 +3315,6 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, int_mv single_newmv[MAX_REF_FRAMES] = { { 0 } }; INTERP_FILTER single_inter_filter[MB_MODE_COUNT][MAX_REF_FRAMES]; int single_skippable[MB_MODE_COUNT][MAX_REF_FRAMES]; - static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, - VP9_ALT_FLAG }; int64_t best_rd = best_rd_so_far; int64_t best_pred_diff[REFERENCE_MODES]; int64_t best_pred_rd[REFERENCE_MODES]; @@ -3392,7 +3390,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { x->pred_mv_sad[ref_frame] = INT_MAX; - if ((cpi->ref_frame_flags & flag_list[ref_frame]) && + if ((cpi->ref_frame_flags & ref_frame_to_flag(ref_frame)) && !(is_rect_partition && (ctx->skip_ref_frame_mask & (1 << ref_frame)))) { assert(get_ref_frame_buffer(cpi, ref_frame) != NULL); setup_buffer_inter(cpi, x, ref_frame, bsize, mi_row, mi_col, @@ -3403,7 +3401,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, } for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { - if (!(cpi->ref_frame_flags & flag_list[ref_frame])) { + if (!(cpi->ref_frame_flags & ref_frame_to_flag(ref_frame))) { // Skip checking missing references in both single and compound reference // modes. Note that a mode will be skipped if both reference frames // are masked out. @@ -3609,7 +3607,8 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, continue; // Skip compound inter modes if ARF is not available. - if (!(cpi->ref_frame_flags & flag_list[second_ref_frame])) continue; + if (!(cpi->ref_frame_flags & ref_frame_to_flag(second_ref_frame))) + continue; // Do not allow compound prediction if the segment level reference frame // feature is in use as in this case there can only be one reference. @@ -4140,8 +4139,6 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data, int comp_pred, i; int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES]; struct buf_2d yv12_mb[4][MAX_MB_PLANE]; - static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, - VP9_ALT_FLAG }; int64_t best_rd = best_rd_so_far; int64_t best_yrd = best_rd_so_far; // FIXME(rbultje) more precise int64_t best_pred_diff[REFERENCE_MODES]; @@ -4191,7 +4188,7 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data, rd_cost->rate = INT_MAX; for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) { - if (cpi->ref_frame_flags & flag_list[ref_frame]) { + if (cpi->ref_frame_flags & ref_frame_to_flag(ref_frame)) { setup_buffer_inter(cpi, x, ref_frame, bsize, mi_row, mi_col, frame_mv[NEARESTMV], frame_mv[NEARMV], yv12_mb); } else { @@ -4276,7 +4273,8 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data, cm->ref_frame_sign_bias[second_ref_frame]) continue; - if (!(cpi->ref_frame_flags & flag_list[second_ref_frame])) continue; + if (!(cpi->ref_frame_flags & ref_frame_to_flag(second_ref_frame))) + continue; // Do not allow compound prediction if the segment level reference frame // feature is in use as in this case there can only be one reference. if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) continue; diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c index 81695e9156..7d7b2c3fb4 100644 --- a/vp9/encoder/vp9_speed_features.c +++ b/vp9/encoder/vp9_speed_features.c @@ -495,11 +495,10 @@ static void set_rt_speed_feature_framesize_independent( (cpi->external_resize == 1 || cpi->oxcf.resize_mode == RESIZE_DYNAMIC)) { MV_REFERENCE_FRAME ref_frame; - static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, - VP9_ALT_FLAG }; for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame); - if (yv12 != NULL && (cpi->ref_frame_flags & flag_list[ref_frame])) { + if (yv12 != NULL && + (cpi->ref_frame_flags & ref_frame_to_flag(ref_frame))) { const struct scale_factors *const scale_fac = &cm->frame_refs[ref_frame - 1].sf; if (vp9_is_scaled(scale_fac)) sf->reference_masking = 0; diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c index ad3a8f7afa..6655643654 100644 --- a/vp9/encoder/vp9_svc_layercontext.c +++ b/vp9/encoder/vp9_svc_layercontext.c @@ -462,30 +462,27 @@ static void reset_fb_idx_unused(VP9_COMP *const cpi) { // fb_idx for that reference to the first one used/referenced. // This is to avoid setting fb_idx for a reference to a slot that is not // used/needed (i.e., since that reference is not referenced or refreshed). - static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, - VP9_ALT_FLAG }; MV_REFERENCE_FRAME ref_frame; MV_REFERENCE_FRAME first_ref = 0; int first_fb_idx = 0; int fb_idx[3] = { cpi->lst_fb_idx, cpi->gld_fb_idx, cpi->alt_fb_idx }; for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) { - if (cpi->ref_frame_flags & flag_list[ref_frame]) { + if (cpi->ref_frame_flags & ref_frame_to_flag(ref_frame)) { first_ref = ref_frame; first_fb_idx = fb_idx[ref_frame - 1]; break; } } if (first_ref > 0) { - if (first_ref != LAST_FRAME && - !(cpi->ref_frame_flags & flag_list[LAST_FRAME]) && + if (first_ref != LAST_FRAME && !(cpi->ref_frame_flags & VP9_LAST_FLAG) && !cpi->ext_refresh_last_frame) cpi->lst_fb_idx = first_fb_idx; else if (first_ref != GOLDEN_FRAME && - !(cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) && + !(cpi->ref_frame_flags & VP9_GOLD_FLAG) && !cpi->ext_refresh_golden_frame) cpi->gld_fb_idx = first_fb_idx; else if (first_ref != ALTREF_FRAME && - !(cpi->ref_frame_flags & flag_list[ALTREF_FRAME]) && + !(cpi->ref_frame_flags & VP9_ALT_FLAG) && !cpi->ext_refresh_alt_ref_frame) cpi->alt_fb_idx = first_fb_idx; } @@ -730,8 +727,6 @@ static void set_flags_and_fb_idx_bypass_via_set_ref_frame_config( void vp9_copy_flags_ref_update_idx(VP9_COMP *const cpi) { SVC *const svc = &cpi->svc; - static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, - VP9_ALT_FLAG }; int sl = svc->spatial_layer_id; svc->lst_fb_idx[sl] = cpi->lst_fb_idx; svc->gld_fb_idx[sl] = cpi->gld_fb_idx; @@ -754,12 +749,9 @@ void vp9_copy_flags_ref_update_idx(VP9_COMP *const cpi) { svc->update_golden[sl] = (uint8_t)cpi->refresh_golden_frame; svc->update_altref[sl] = (uint8_t)cpi->refresh_alt_ref_frame; - svc->reference_last[sl] = - (uint8_t)(cpi->ref_frame_flags & flag_list[LAST_FRAME]); - svc->reference_golden[sl] = - (uint8_t)(cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]); - svc->reference_altref[sl] = - (uint8_t)(cpi->ref_frame_flags & flag_list[ALTREF_FRAME]); + svc->reference_last[sl] = (uint8_t)(cpi->ref_frame_flags & VP9_LAST_FLAG); + svc->reference_golden[sl] = (uint8_t)(cpi->ref_frame_flags & VP9_GOLD_FLAG); + svc->reference_altref[sl] = (uint8_t)(cpi->ref_frame_flags & VP9_ALT_FLAG); } int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) { @@ -1080,15 +1072,14 @@ void vp9_svc_constrain_inter_layer_pred(VP9_COMP *const cpi) { svc->disable_inter_layer_pred == INTER_LAYER_PRED_OFF || svc->drop_spatial_layer[sl - 1]) { MV_REFERENCE_FRAME ref_frame; - static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, - VP9_ALT_FLAG }; for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame); - if (yv12 != NULL && (cpi->ref_frame_flags & flag_list[ref_frame])) { + if (yv12 != NULL && + (cpi->ref_frame_flags & ref_frame_to_flag(ref_frame))) { const struct scale_factors *const scale_fac = &cm->frame_refs[ref_frame - 1].sf; if (vp9_is_scaled(scale_fac)) { - cpi->ref_frame_flags &= (~flag_list[ref_frame]); + cpi->ref_frame_flags &= (~ref_frame_to_flag(ref_frame)); // Point golden/altref frame buffer index to last. if (!svc->simulcast_mode) { if (ref_frame == GOLDEN_FRAME) From 395732f679e3f7842f5b2094e3a91de036f85708 Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 25 Jan 2022 20:06:59 -0800 Subject: [PATCH 0271/1000] libwebm: update to libwebm-1.0.0.28-28-gee0bab5 https://chromium.googlesource.com/webm/libwebm/+log/206d268d4d8066e5a37c49025325b80c95c771dd..ee0bab576c338c9807249b99588e352b7268cb62 only one commit affects this snapshot: ee0bab5 Revert "mkvmuxer,Cluster::Size: make uint64 conversion explicit" Change-Id: Ib1f21fc5589098af346d110ff88c94bb1ba0a027 --- third_party/libwebm/README.libvpx | 2 +- third_party/libwebm/mkvmuxer/mkvmuxer.cc | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/third_party/libwebm/README.libvpx b/third_party/libwebm/README.libvpx index 5cc0a83701..325604cc66 100644 --- a/third_party/libwebm/README.libvpx +++ b/third_party/libwebm/README.libvpx @@ -1,5 +1,5 @@ URL: https://chromium.googlesource.com/webm/libwebm -Version: 206d268d4d8066e5a37c49025325b80c95c771dd +Version: ee0bab576c338c9807249b99588e352b7268cb62 License: BSD License File: LICENSE.txt diff --git a/third_party/libwebm/mkvmuxer/mkvmuxer.cc b/third_party/libwebm/mkvmuxer/mkvmuxer.cc index 24c288863f..ae36531439 100644 --- a/third_party/libwebm/mkvmuxer/mkvmuxer.cc +++ b/third_party/libwebm/mkvmuxer/mkvmuxer.cc @@ -2622,8 +2622,7 @@ bool Cluster::Finalize(bool set_last_frame_duration, uint64_t duration) { uint64_t Cluster::Size() const { const uint64_t element_size = - EbmlMasterElementSize(static_cast(libwebm::kMkvCluster), - uint64_t{0xFFFFFFFFFFFFFFFFU}) + + EbmlMasterElementSize(libwebm::kMkvCluster, 0xFFFFFFFFFFFFFFFFULL) + payload_size_; return element_size; } From ae5d16173d2af45dcb80d43635f53129a045b946 Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 26 Jan 2022 15:05:22 -0800 Subject: [PATCH 0272/1000] fix some include guards Change-Id: I0233d352c134bdda3ca160d41b4671d1c45ab01c --- vpx/internal/vpx_ratectrl_rtc.h | 6 +++--- vpx_ports/mips.h | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/vpx/internal/vpx_ratectrl_rtc.h b/vpx/internal/vpx_ratectrl_rtc.h index 0474e0a85b..65398c654d 100644 --- a/vpx/internal/vpx_ratectrl_rtc.h +++ b/vpx/internal/vpx_ratectrl_rtc.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_VPX_RATECTRL_RTC_H_ -#define VPX_VPX_RATECTRL_RTC_H_ +#ifndef VPX_VPX_INTERNAL_VPX_RATECTRL_RTC_H_ +#define VPX_VPX_INTERNAL_VPX_RATECTRL_RTC_H_ #include "vpx/vpx_encoder.h" @@ -59,4 +59,4 @@ struct VpxRateControlRtcConfig { int aq_mode; }; } // namespace libvpx -#endif +#endif // VPX_VPX_INTERNAL_VPX_RATECTRL_RTC_H_ diff --git a/vpx_ports/mips.h b/vpx_ports/mips.h index bdc7525f7b..439de754fd 100644 --- a/vpx_ports/mips.h +++ b/vpx_ports/mips.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_PORTS_MIPS_H_ -#define VPX_PORTS_MIPS_H_ +#ifndef VPX_VPX_PORTS_MIPS_H_ +#define VPX_VPX_PORTS_MIPS_H_ #ifdef __cplusplus extern "C" { @@ -24,4 +24,4 @@ int mips_cpu_caps(void); } // extern "C" #endif -#endif // VPX_PORTS_MIPS_H_ +#endif // VPX_VPX_PORTS_MIPS_H_ From 935350958690c086c16ec645e4da53d241cd36bf Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 26 Jan 2022 19:41:41 -0800 Subject: [PATCH 0273/1000] vp8dx.h: add missing define for VP9_SET_BYTE_ALIGNMENT Change-Id: I4e643c837bb010bd58f4fc8179045f8df18f8ae1 --- vpx/vp8dx.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vpx/vp8dx.h b/vpx/vp8dx.h index af92f21ae3..08c3451445 100644 --- a/vpx/vp8dx.h +++ b/vpx/vp8dx.h @@ -189,6 +189,8 @@ VPX_CTRL_USE_TYPE(VP9D_GET_BIT_DEPTH, unsigned int *) #define VPX_CTRL_VP9D_GET_BIT_DEPTH VPX_CTRL_USE_TYPE(VP9D_GET_FRAME_SIZE, int *) #define VPX_CTRL_VP9D_GET_FRAME_SIZE +VPX_CTRL_USE_TYPE(VP9_SET_BYTE_ALIGNMENT, int) +#define VPX_CTRL_VP9_SET_BYTE_ALIGNMENT VPX_CTRL_USE_TYPE(VP9_INVERT_TILE_DECODE_ORDER, int) #define VPX_CTRL_VP9_INVERT_TILE_DECODE_ORDER #define VPX_CTRL_VP9_DECODE_SVC_SPATIAL_LAYER From 531c60e2a2c0a98a0754502f3e6c28f3d5002c4d Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 26 Jan 2022 19:44:33 -0800 Subject: [PATCH 0274/1000] vp8dx.h,cosmetics: normalize #define/type order Change-Id: I2db20130cc366bead5e576b375479917f9aee024 --- vpx/vp8dx.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vpx/vp8dx.h b/vpx/vp8dx.h index 08c3451445..dcf7a62860 100644 --- a/vpx/vp8dx.h +++ b/vpx/vp8dx.h @@ -193,14 +193,14 @@ VPX_CTRL_USE_TYPE(VP9_SET_BYTE_ALIGNMENT, int) #define VPX_CTRL_VP9_SET_BYTE_ALIGNMENT VPX_CTRL_USE_TYPE(VP9_INVERT_TILE_DECODE_ORDER, int) #define VPX_CTRL_VP9_INVERT_TILE_DECODE_ORDER -#define VPX_CTRL_VP9_DECODE_SVC_SPATIAL_LAYER VPX_CTRL_USE_TYPE(VP9_DECODE_SVC_SPATIAL_LAYER, int) -#define VPX_CTRL_VP9_SET_SKIP_LOOP_FILTER +#define VPX_CTRL_VP9_DECODE_SVC_SPATIAL_LAYER VPX_CTRL_USE_TYPE(VP9_SET_SKIP_LOOP_FILTER, int) -#define VPX_CTRL_VP9_DECODE_SET_ROW_MT +#define VPX_CTRL_VP9_SET_SKIP_LOOP_FILTER VPX_CTRL_USE_TYPE(VP9D_SET_ROW_MT, int) -#define VPX_CTRL_VP9_SET_LOOP_FILTER_OPT +#define VPX_CTRL_VP9_DECODE_SET_ROW_MT VPX_CTRL_USE_TYPE(VP9D_SET_LOOP_FILTER_OPT, int) +#define VPX_CTRL_VP9_SET_LOOP_FILTER_OPT /*!\endcond */ /*! @} - end defgroup vp8_decoder */ From 8a0af65f34bdf43fc63b4ce4ac9393aceab0abbf Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Tue, 28 Sep 2021 16:59:21 -0700 Subject: [PATCH 0275/1000] Use background segmentation mask with ROI RTC sample encoder vpx_temporal_svc_encoder can take mask files as input when ROI_MAP is set to 1. Uses ROI and segmentation of vp9 to skip background encoding when source_sad is low and the correspond block in previous frame is also skipped. Change-Id: I8590e6f9a88cecfa1d7f375d4cc480f0f2af87b6 --- examples/vpx_temporal_svc_encoder.c | 99 +++++++++++++++++-- test/test-data.mk | 1 + test/test-data.sha1 | 1 + test/test.mk | 1 + test/vp9_roi_test.cc | 143 ++++++++++++++++++++++++++++ vp9/common/vp9_seg_common.h | 5 + vp9/encoder/vp9_aq_cyclicrefresh.c | 4 +- vp9/encoder/vp9_encodeframe.c | 42 ++++++-- vp9/encoder/vp9_encoder.c | 19 +++- 9 files changed, 292 insertions(+), 23 deletions(-) create mode 100644 test/vp9_roi_test.cc diff --git a/examples/vpx_temporal_svc_encoder.c b/examples/vpx_temporal_svc_encoder.c index ad3e79c713..e528179f3f 100644 --- a/examples/vpx_temporal_svc_encoder.c +++ b/examples/vpx_temporal_svc_encoder.c @@ -240,6 +240,38 @@ static void set_roi_map(const char *enc_name, vpx_codec_enc_cfg_t *cfg, } } } + +static void set_roi_skip_map(vpx_codec_enc_cfg_t *cfg, vpx_roi_map_t *roi, + int *skip_map, int *prev_mask_map, int frame_num) { + const int block_size = 8; + unsigned int i, j; + roi->rows = (cfg->g_h + block_size - 1) / block_size; + roi->cols = (cfg->g_w + block_size - 1) / block_size; + zero(roi->skip); + zero(roi->delta_q); + zero(roi->delta_lf); + memset(roi->ref_frame, -1, sizeof(roi->ref_frame)); + roi->ref_frame[1] = 1; + // Use segment 3 for skip. + roi->skip[3] = 1; + roi->roi_map = + (uint8_t *)calloc(roi->rows * roi->cols, sizeof(*roi->roi_map)); + for (i = 0; i < roi->rows; ++i) { + for (j = 0; j < roi->cols; ++j) { + const int idx = i * roi->cols + j; + // Use segment 3 for skip. + // prev_mask_map keeps track of blocks that have been stably on segment 3 + // for the past 10 frames. Only skip when the block is on segment 3 in + // both current map and prev_mask_map. + if (skip_map[idx] == 1 && prev_mask_map[idx] == 1) roi->roi_map[idx] = 3; + // Reset it every 10 frames so it doesn't propagate for too many frames. + if (frame_num % 10 == 0) + prev_mask_map[idx] = skip_map[idx]; + else if (prev_mask_map[idx] == 1 && skip_map[idx] == 0) + prev_mask_map[idx] = 0; + } + } +} #endif // Temporal scaling parameters: @@ -574,6 +606,23 @@ static void set_temporal_layer_pattern(int layering_mode, } } +#if ROI_MAP +static void read_mask(FILE *mask_file, int *seg_map) { + int mask_rows, mask_cols, i, j; + int *map_start = seg_map; + fscanf(mask_file, "%d %d\n", &mask_cols, &mask_rows); + for (i = 0; i < mask_rows; i++) { + for (j = 0; j < mask_cols; j++) { + fscanf(mask_file, "%d ", &seg_map[j]); + // reverse the bit + seg_map[j] = 1 - seg_map[j]; + } + seg_map += mask_cols; + } + seg_map = map_start; +} +#endif + int main(int argc, char **argv) { VpxVideoWriter *outfile[VPX_TS_MAX_LAYERS] = { NULL }; vpx_codec_ctx_t codec; @@ -613,7 +662,14 @@ int main(int argc, char **argv) { double sum_bitrate = 0.0; double sum_bitrate2 = 0.0; double framerate = 30.0; - +#if ROI_MAP + FILE *mask_file = NULL; + int block_size = 8; + int mask_rows = 0; + int mask_cols = 0; + int *mask_map; + int *prev_mask_map; +#endif zero(rc.layer_target_bitrate); memset(&layer_id, 0, sizeof(vpx_svc_layer_id_t)); memset(&input_ctx, 0, sizeof(input_ctx)); @@ -657,9 +713,15 @@ int main(int argc, char **argv) { die("Invalid layering mode (0..12) %s", argv[12]); } +#if ROI_MAP + if (argc != min_args + mode_to_num_layers[layering_mode] + 1) { + die("Invalid number of arguments"); + } +#else if (argc != min_args + mode_to_num_layers[layering_mode]) { die("Invalid number of arguments"); } +#endif input_ctx.filename = argv[1]; open_input_file(&input_ctx); @@ -817,6 +879,13 @@ int main(int argc, char **argv) { #endif // CONFIG_VP9_HIGHBITDEPTH die("Failed to initialize encoder"); +#if ROI_MAP + mask_rows = (cfg.g_h + block_size - 1) / block_size; + mask_cols = (cfg.g_w + block_size - 1) / block_size; + mask_map = (int *)calloc(mask_rows * mask_cols, sizeof(*mask_map)); + prev_mask_map = (int *)calloc(mask_rows * mask_cols, sizeof(*mask_map)); +#endif + if (strncmp(encoder->name, "vp8", 3) == 0) { vpx_codec_control(&codec, VP8E_SET_CPUUSED, -speed); vpx_codec_control(&codec, VP8E_SET_NOISE_SENSITIVITY, kVp8DenoiserOff); @@ -827,7 +896,6 @@ int main(int argc, char **argv) { if (vpx_codec_control(&codec, VP8E_SET_ROI_MAP, &roi)) die_codec(&codec, "Failed to set ROI map"); #endif - } else if (strncmp(encoder->name, "vp9", 3) == 0) { vpx_svc_extra_cfg_t svc_params; memset(&svc_params, 0, sizeof(svc_params)); @@ -843,12 +911,7 @@ int main(int argc, char **argv) { vpx_codec_control(&codec, VP9E_SET_TUNE_CONTENT, 0); vpx_codec_control(&codec, VP9E_SET_TILE_COLUMNS, get_msb(cfg.g_threads)); vpx_codec_control(&codec, VP9E_SET_DISABLE_LOOPFILTER, 0); -#if ROI_MAP - set_roi_map(encoder->name, &cfg, &roi); - if (vpx_codec_control(&codec, VP9E_SET_ROI_MAP, &roi)) - die_codec(&codec, "Failed to set ROI map"); - vpx_codec_control(&codec, VP9E_SET_AQ_MODE, 0); -#endif + if (cfg.g_threads > 1) vpx_codec_control(&codec, VP9E_SET_ROW_MT, 1); else @@ -881,6 +944,9 @@ int main(int argc, char **argv) { struct vpx_usec_timer timer; vpx_codec_iter_t iter = NULL; const vpx_codec_cx_pkt_t *pkt; +#if ROI_MAP + char mask_file_name[255]; +#endif // Update the temporal layer_id. No spatial layers in this test. layer_id.spatial_layer_id = 0; layer_id.temporal_layer_id = @@ -894,6 +960,19 @@ int main(int argc, char **argv) { } flags = layer_flags[frame_cnt % flag_periodicity]; if (layering_mode == 0) flags = 0; +#if ROI_MAP + snprintf(mask_file_name, sizeof(mask_file_name), "%s%05d.txt", + argv[argc - 1], frame_cnt); + mask_file = fopen(mask_file_name, "r"); + if (mask_file != NULL) { + read_mask(mask_file, mask_map); + fclose(mask_file); + // set_roi_map(encoder->name, &cfg, &roi); + set_roi_skip_map(&cfg, &roi, mask_map, prev_mask_map, frame_cnt); + if (vpx_codec_control(&codec, VP9E_SET_ROI_MAP, &roi)) + die_codec(&codec, "Failed to set ROI map"); + } +#endif frame_avail = read_frame(&input_ctx, &raw); if (frame_avail) ++rc.layer_input_frames[layer_id.temporal_layer_id]; vpx_usec_timer_start(&timer); @@ -963,6 +1042,10 @@ int main(int argc, char **argv) { ++frame_cnt; pts += frame_duration; } +#if ROI_MAP + free(mask_map); + free(prev_mask_map); +#endif close_input_file(&input_ctx); printout_rate_control_summary(&rc, &cfg, frame_cnt); printf("\n"); diff --git a/test/test-data.mk b/test/test-data.mk index 46fe359898..62a9d6ef14 100644 --- a/test/test-data.mk +++ b/test/test-data.mk @@ -6,6 +6,7 @@ LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += hantro_collage_w352h288_nv12.yuv LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += hantro_odd.yuv LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += desktop_office1.1280_720-020.yuv LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += slides_code_term_web_plot.1920_1080.yuv +LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += desktopqvga.320_240.yuv LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_10_420_20f.y4m LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_10_422_20f.y4m diff --git a/test/test-data.sha1 b/test/test-data.sha1 index 668992fba2..55f92a25df 100644 --- a/test/test-data.sha1 +++ b/test/test-data.sha1 @@ -869,3 +869,4 @@ bac455906360b45338a16dd626ac5f19bc36a307 *desktop_office1.1280_720-020.yuv 518a0be998afece76d3df76047d51e256c591ff2 *invalid-bug-148271109.ivf d3964f9dad9f60363c81b688324d95b4ec7c8038 *invalid-bug-148271109.ivf.res ad18ca16f0a249fb3b7c38de0d9b327fed273f96 *hantro_collage_w352h288_nv12.yuv +8a0b2c350539859463d3546a67876c83ff6ff0ac *desktopqvga.320_240.yuv diff --git a/test/test.mk b/test/test.mk index 41dfd5d835..6df4572904 100644 --- a/test/test.mk +++ b/test/test.mk @@ -156,6 +156,7 @@ LIBVPX_TEST_SRCS-yes += superframe_test.cc LIBVPX_TEST_SRCS-yes += tile_independence_test.cc LIBVPX_TEST_SRCS-yes += vp9_boolcoder_test.cc LIBVPX_TEST_SRCS-yes += vp9_encoder_parms_get_to_decoder.cc +LIBVPX_TEST_SRCS-yes += vp9_roi_test.cc endif LIBVPX_TEST_SRCS-yes += convolve_test.cc diff --git a/test/vp9_roi_test.cc b/test/vp9_roi_test.cc new file mode 100644 index 0000000000..52dfd9e029 --- /dev/null +++ b/test/vp9_roi_test.cc @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "memory" + +#include "third_party/googletest/src/include/gtest/gtest.h" + +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/i420_video_source.h" +#include "test/util.h" +#include "test/y4m_video_source.h" +#include "test/yuv_video_source.h" + +#define MASK_WIDTH 40 +#define MASK_HEIGHT 30 +#define MASK_SIZE MASK_WIDTH *MASK_HEIGHT + +namespace { + +const int mask[MASK_SIZE] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, + 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0 +}; + +class RoiMaskBackgroundSkip : public ::libvpx_test::EncoderTest, + public ::testing::Test { + protected: + RoiMaskBackgroundSkip() : EncoderTest(&::libvpx_test::kVP9) {} + virtual ~RoiMaskBackgroundSkip() { free(roi_.roi_map); } + + virtual void SetUp() { + InitializeConfig(); + SetMode(::libvpx_test::kRealTime); + SetRoi(); + } + + void SetRoi() { + const int block_size = 8; + unsigned int i, j; + roi_.rows = (cfg_.g_h + block_size - 1) / block_size; + roi_.cols = (cfg_.g_w + block_size - 1) / block_size; + memset(&roi_.skip, 0, sizeof(roi_.skip)); + memset(&roi_.delta_q, 0, sizeof(roi_.delta_q)); + memset(&roi_.delta_lf, 0, sizeof(roi_.delta_lf)); + memset(roi_.ref_frame, -1, sizeof(roi_.ref_frame)); + roi_.ref_frame[1] = 1; + // Use segment 3 for skip. + roi_.skip[3] = 1; + roi_.roi_map = + (uint8_t *)calloc(roi_.rows * roi_.cols, sizeof(*roi_.roi_map)); + for (i = 0; i < roi_.rows; ++i) { + for (j = 0; j < roi_.cols; ++j) { + const int idx = i * roi_.cols + j; + if (mask[idx] == 1) roi_.roi_map[idx] = 3; + } + } + } + + virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) { + if (video->frame() == 0) { + encoder->Control(VP8E_SET_CPUUSED, 7); + encoder->Control(VP9E_SET_AQ_MODE, 3); + } + encoder->Control(VP9E_SET_ROI_MAP, &roi_); + } + + private: + vpx_roi_map_t roi_; +}; + +TEST_F(RoiMaskBackgroundSkip, RoiMaskNoMismatch) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_undershoot_pct = 20; + cfg_.rc_undershoot_pct = 20; + cfg_.rc_dropframe_thresh = 10; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 50; + cfg_.rc_end_usage = VPX_CBR; + cfg_.rc_target_bitrate = 200; + cfg_.g_lag_in_frames = 0; + cfg_.kf_max_dist = 9999; + + ::libvpx_test::I420VideoSource video("desktopqvga.320_240.yuv", 320, 240, 30, + 1, 0, 150); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +} +} // namespace diff --git a/vp9/common/vp9_seg_common.h b/vp9/common/vp9_seg_common.h index b63e4f4999..5e71c2fca5 100644 --- a/vp9/common/vp9_seg_common.h +++ b/vp9/common/vp9_seg_common.h @@ -25,6 +25,11 @@ extern "C" { #define PREDICTION_PROBS 3 +// Segment ID used to skip background encoding +#define BACKGROUND_SEG_SKIP_ID 3 +// Number of frames that don't skip after a key frame +#define FRAMES_NO_SKIPPING_AFTER_KEY 20 + // Segment level features. typedef enum { SEG_LVL_ALT_Q = 0, // Use alternate Quantizer .... diff --git a/vp9/encoder/vp9_aq_cyclicrefresh.c b/vp9/encoder/vp9_aq_cyclicrefresh.c index f06fe47268..e336179e90 100644 --- a/vp9/encoder/vp9_aq_cyclicrefresh.c +++ b/vp9/encoder/vp9_aq_cyclicrefresh.c @@ -497,7 +497,9 @@ void vp9_cyclic_refresh_update_parameters(VP9_COMP *const cpi) { rc->avg_frame_low_motion < thresh_low_motion && rc->frames_since_key > 40) || (!cpi->use_svc && rc->avg_frame_qindex[INTER_FRAME] > qp_max_thresh && - rc->frames_since_key > 20)) { + rc->frames_since_key > 20) || + (cpi->roi.enabled && cpi->roi.skip[BACKGROUND_SEG_SKIP_ID] && + rc->frames_since_key > FRAMES_NO_SKIPPING_AFTER_KEY)) { cr->apply_cyclic_refresh = 0; return; } diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index 131c4887f2..fc4089865d 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -5513,16 +5513,6 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td, x->arf_frame_usage = 0; x->lastgolden_frame_usage = 0; - if (seg->enabled) { - const uint8_t *const map = - seg->update_map ? cpi->segmentation_map : cm->last_frame_seg_map; - int segment_id = get_segment_id(cm, map, BLOCK_64X64, mi_row, mi_col); - seg_skip = segfeature_active(seg, segment_id, SEG_LVL_SKIP); - if (seg_skip) { - partition_search_type = FIXED_PARTITION; - } - } - if (cpi->compute_source_sad_onepass && cpi->sf.use_source_sad) { int shift = cpi->Source->y_stride * (mi_row << 3) + (mi_col << 3); int sb_offset2 = ((cm->mi_cols + 7) >> 3) * (mi_row >> 3) + (mi_col >> 3); @@ -5534,6 +5524,38 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td, partition_search_type = REFERENCE_PARTITION; } + if (seg->enabled) { + const uint8_t *const map = + seg->update_map ? cpi->segmentation_map : cm->last_frame_seg_map; + int segment_id = get_segment_id(cm, map, BLOCK_64X64, mi_row, mi_col); + seg_skip = segfeature_active(seg, segment_id, SEG_LVL_SKIP); + + if (cpi->roi.enabled && cpi->roi.skip[BACKGROUND_SEG_SKIP_ID] && + cpi->rc.frames_since_key > FRAMES_NO_SKIPPING_AFTER_KEY && + x->content_state_sb > kLowSadLowSumdiff) { + // For ROI with skip, force segment = 0 (no skip) over whole + // superblock to avoid artifacts if temporal change in source_sad is + // not 0. + int xi, yi; + const int bw = num_8x8_blocks_wide_lookup[BLOCK_64X64]; + const int bh = num_8x8_blocks_high_lookup[BLOCK_64X64]; + const int xmis = VPXMIN(cm->mi_cols - mi_col, bw); + const int ymis = VPXMIN(cm->mi_rows - mi_row, bh); + const int block_index = mi_row * cm->mi_cols + mi_col; + set_mode_info_offsets(cm, x, xd, mi_row, mi_col); + for (yi = 0; yi < ymis; yi++) + for (xi = 0; xi < xmis; xi++) { + int map_offset = block_index + yi * cm->mi_cols + xi; + cpi->segmentation_map[map_offset] = 0; + } + set_segment_index(cpi, x, mi_row, mi_col, BLOCK_64X64, 0); + seg_skip = 0; + } + if (seg_skip) { + partition_search_type = FIXED_PARTITION; + } + } + // Set the partition type of the 64X64 block switch (partition_search_type) { case VAR_BASED_PARTITION: diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 7e80835f6c..ac0efced7f 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -618,7 +618,7 @@ static void apply_roi_map(VP9_COMP *cpi) { } if (skip[i] != 0) { vp9_enable_segfeature(seg, i, SEG_LVL_SKIP); - vp9_set_segdata(seg, i, SEG_LVL_SKIP, skip[i]); + vp9_set_segdata(seg, i, SEG_LVL_SKIP, 0); } if (ref_frame[i] >= 0) { int valid_ref = 1; @@ -4137,11 +4137,22 @@ static int encode_without_recode_loop(VP9_COMP *cpi, size_t *size, vp9_alt_ref_aq_setup_map(cpi->alt_ref_aq, cpi); } else { #endif - if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) { + // If ROI is enabled and skip feature is used for segmentation, apply cyclic + // refresh but not apply ROI for skip for the first 20 frames (defined by + // FRAMES_NO_SKIPPING_AFTER_KEY) after key frame to improve quality. + if (cpi->roi.enabled && !frame_is_intra_only(cm)) { + if (cpi->roi.skip[BACKGROUND_SEG_SKIP_ID]) { + if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) + vp9_cyclic_refresh_setup(cpi); + if (cpi->rc.frames_since_key > FRAMES_NO_SKIPPING_AFTER_KEY) + apply_roi_map(cpi); + } else { + apply_roi_map(cpi); + } + } else if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) { vp9_cyclic_refresh_setup(cpi); - } else if (cpi->roi.enabled && !frame_is_intra_only(cm)) { - apply_roi_map(cpi); } + #if !CONFIG_REALTIME_ONLY } #endif From 479758aeb15c82c8faf6d7a905999c7512284c64 Mon Sep 17 00:00:00 2001 From: Jin Bo Date: Tue, 6 Jul 2021 17:18:48 +0800 Subject: [PATCH 0276/1000] libvpx[loongarch]: Add loongarch support. LSX and LASX are enabled by default if compiler supports them. Bug: webm:1754 Change-Id: Ic36b113bc4313c50e9d2bbab91199b3aa46d00dc --- build/make/Makefile | 6 ++++ build/make/configure.sh | 47 ++++++++++++++++++++++++++++ build/make/rtcd.pl | 47 ++++++++++++++++++++++++++++ configure | 5 +++ vp8/common/generic/systemdependent.c | 4 +++ vpx_ports/loongarch.h | 29 +++++++++++++++++ vpx_ports/loongarch_cpudetect.c | 40 +++++++++++++++++++++++ vpx_ports/vpx_ports.mk | 3 ++ 8 files changed, 181 insertions(+) create mode 100644 vpx_ports/loongarch.h create mode 100644 vpx_ports/loongarch_cpudetect.c diff --git a/build/make/Makefile b/build/make/Makefile index 9ca97c8c64..b7a873cc81 100644 --- a/build/make/Makefile +++ b/build/make/Makefile @@ -151,6 +151,12 @@ $(BUILD_PFX)%_vsx.c.o: CFLAGS += -maltivec -mvsx $(BUILD_PFX)%_msa.c.d: CFLAGS += -mmsa $(BUILD_PFX)%_msa.c.o: CFLAGS += -mmsa +# LOONGARCH +$(BUILD_PFX)%_lsx.c.d: CFLAGS += -mlsx +$(BUILD_PFX)%_lsx.c.o: CFLAGS += -mlsx +$(BUILD_PFX)%_lasx.c.d: CFLAGS += -mlasx +$(BUILD_PFX)%_lasx.c.o: CFLAGS += -mlasx + $(BUILD_PFX)%.c.d: %.c $(if $(quiet),@echo " [DEP] $@") $(qexec)mkdir -p $(dir $@) diff --git a/build/make/configure.sh b/build/make/configure.sh index b24e79a0d2..581042e38e 100644 --- a/build/make/configure.sh +++ b/build/make/configure.sh @@ -449,6 +449,17 @@ EOF fi } +check_inline_asm() { + log check_inline_asm "$@" + name="$1" + code="$2" + shift 2 + disable_feature $name + check_cc "$@" <> config.mk @@ -766,6 +777,12 @@ process_common_toolchain() { *mips32el*) tgt_isa=mips32 ;; + loongarch32*) + tgt_isa=loongarch32 + ;; + loongarch64*) + tgt_isa=loongarch64 + ;; esac # detect tgt_os @@ -834,6 +851,11 @@ process_common_toolchain() { ppc*) enable_feature ppc ;; + loongarch*) + soft_enable lsx + soft_enable lasx + enable_feature loongarch + ;; esac # PIC is probably what we want when building shared libs @@ -1419,6 +1441,15 @@ EOF ;; esac ;; + loongarch*) + link_with_cc=gcc + setup_gnu_toolchain + + enabled lsx && check_inline_asm lsx '"vadd.b $vr0, $vr1, $vr1"' + enabled lsx && soft_enable runtime_cpu_detect + enabled lasx && check_inline_asm lasx '"xvadd.b $xr0, $xr1, $xr1"' + enabled lasx && soft_enable runtime_cpu_detect + ;; *-gcc|generic-gnu) link_with_cc=gcc enable_feature gcc @@ -1521,6 +1552,22 @@ EOF ;; esac + # only for LOONGARCH platforms + case ${toolchain} in + loongarch*) + if enabled big_endian; then + if enabled lsx; then + echo "lsx optimizations are available only for little endian platforms" + disable_feature lsx + fi + if enabled lasx; then + echo "lasx optimizations are available only for little endian platforms" + disable_feature lasx + fi + fi + ;; + esac + # glibc needs these if enabled linux; then add_cflags -D_LARGEFILE_SOURCE diff --git a/build/make/rtcd.pl b/build/make/rtcd.pl index acb9f6e466..8ed776add8 100755 --- a/build/make/rtcd.pl +++ b/build/make/rtcd.pl @@ -387,6 +387,37 @@ () common_bottom; } +sub loongarch() { + determine_indirection("c", @ALL_ARCHS); + + # Assign the helper variable for each enabled extension + foreach my $opt (@ALL_ARCHS) { + my $opt_uc = uc $opt; + eval "\$have_${opt}=\"flags & HAS_${opt_uc}\""; + } + + common_top; + print <) { + if (/HAVE_LSX=yes/) { + @ALL_ARCHS = filter("$opts{arch}", qw/lsx/); + last; + } + if (/HAVE_LASX=yes/) { + @ALL_ARCHS = filter("$opts{arch}", qw/lasx/); + last; + } + } + close CONFIG_FILE; + loongarch; } else { unoptimized; } diff --git a/configure b/configure index b68f9fd781..434ebbe366 100755 --- a/configure +++ b/configure @@ -114,6 +114,8 @@ all_platforms="${all_platforms} armv7-win32-vs14" all_platforms="${all_platforms} armv7-win32-vs15" all_platforms="${all_platforms} armv7s-darwin-gcc" all_platforms="${all_platforms} armv8-linux-gcc" +all_platforms="${all_platforms} loongarch32-linux-gcc" +all_platforms="${all_platforms} loongarch64-linux-gcc" all_platforms="${all_platforms} mips32-linux-gcc" all_platforms="${all_platforms} mips64-linux-gcc" all_platforms="${all_platforms} ppc64le-linux-gcc" @@ -237,6 +239,7 @@ ARCH_LIST=" x86 x86_64 ppc + loongarch " ARCH_EXT_LIST_X86=" mmx @@ -252,6 +255,8 @@ ARCH_EXT_LIST_X86=" ARCH_EXT_LIST_LOONGSON=" mmi + lsx + lasx " ARCH_EXT_LIST=" diff --git a/vp8/common/generic/systemdependent.c b/vp8/common/generic/systemdependent.c index cd1b02c9cc..71529bdfd8 100644 --- a/vp8/common/generic/systemdependent.c +++ b/vp8/common/generic/systemdependent.c @@ -18,6 +18,8 @@ #include "vpx_ports/ppc.h" #elif VPX_ARCH_MIPS #include "vpx_ports/mips.h" +#elif VPX_ARCH_LOONGARCH +#include "vpx_ports/loongarch.h" #endif #include "vp8/common/onyxc_int.h" #include "vp8/common/systemdependent.h" @@ -100,6 +102,8 @@ void vp8_machine_specific_config(VP8_COMMON *ctx) { ctx->cpu_caps = ppc_simd_caps(); #elif VPX_ARCH_MIPS ctx->cpu_caps = mips_cpu_caps(); +#elif VPX_ARCH_LOONGARCH + ctx->cpu_caps = loongarch_cpu_caps(); #else // generic-gnu targets. ctx->cpu_caps = 0; diff --git a/vpx_ports/loongarch.h b/vpx_ports/loongarch.h new file mode 100644 index 0000000000..d93ff9f5f0 --- /dev/null +++ b/vpx_ports/loongarch.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2021 Loongson Technology Corporation Limited + * Contributed by Jin Bo + * Contributed by Lu Wang + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_PORTS_LOONGARCH_H_ +#define VPX_VPX_PORTS_LOONGARCH_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#define HAS_LSX 0x01 +#define HAS_LASX 0x02 + +int loongarch_cpu_caps(void); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VPX_PORTS_LOONGARCH_H_ diff --git a/vpx_ports/loongarch_cpudetect.c b/vpx_ports/loongarch_cpudetect.c new file mode 100644 index 0000000000..7b4322d35e --- /dev/null +++ b/vpx_ports/loongarch_cpudetect.c @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2021 Loongson Technology Corporation Limited + * Contributed by Jin Bo + * Contributed by Lu Wang + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_config.h" +#include "vpx_ports/loongarch.h" + +#define LOONGARCH_CFG2 0x02 +#define LOONGARCH_CFG2_LSX (1 << 6) +#define LOONGARCH_CFG2_LASX (1 << 7) + +#if CONFIG_RUNTIME_CPU_DETECT +#if defined(__loongarch__) && defined(__linux__) +int loongarch_cpu_caps(void) { + int reg = 0; + int flag = 0; + + __asm__ volatile("cpucfg %0, %1 \n\t" : "+&r"(reg) : "r"(LOONGARCH_CFG2)); + if (reg & LOONGARCH_CFG2_LSX) flag |= HAS_LSX; + + if (reg & LOONGARCH_CFG2_LASX) flag |= HAS_LASX; + + return flag; +} +#else /* end __loongarch__ && __linux__ */ +#error \ + "--enable-runtime-cpu-detect selected, but no CPU detection method " \ +"available for your platform. Reconfigure with --disable-runtime-cpu-detect." +#endif +#else /* end CONFIG_RUNTIME_CPU_DETECT */ +int loongarch_cpu_caps(void) { return 0; } +#endif diff --git a/vpx_ports/vpx_ports.mk b/vpx_ports/vpx_ports.mk index e5001be496..e30e87cefb 100644 --- a/vpx_ports/vpx_ports.mk +++ b/vpx_ports/vpx_ports.mk @@ -45,6 +45,9 @@ PORTS_SRCS-$(VPX_ARCH_PPC) += ppc.h PORTS_SRCS-$(VPX_ARCH_MIPS) += mips_cpudetect.c PORTS_SRCS-$(VPX_ARCH_MIPS) += mips.h +PORTS_SRCS-$(VPX_ARCH_LOONGARCH) += loongarch_cpudetect.c +PORTS_SRCS-$(VPX_ARCH_LOONGARCH) += loongarch.h + ifeq ($(VPX_ARCH_MIPS), yes) PORTS_SRCS-yes += asmdefs_mmi.h endif From 0494625b7b386d6634c19b47d39e1608c3a5bcec Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 28 Jan 2022 11:47:08 -0800 Subject: [PATCH 0277/1000] vpx/vp8[cd]x.h,cosmetics: normalize ctrls to enum order Change-Id: I49bbd956b3a64008d1abe54de87d7831bc3eede6 --- vpx/vp8cx.h | 121 ++++++++++++++++++---------------------------------- vpx/vp8dx.h | 12 +++--- 2 files changed, 47 insertions(+), 86 deletions(-) diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h index 47c38d3b5e..6b02aa8657 100644 --- a/vpx/vp8cx.h +++ b/vpx/vp8cx.h @@ -952,28 +952,12 @@ typedef struct vpx_svc_spatial_layer_sync { * */ -VPX_CTRL_USE_TYPE(VP8E_SET_FRAME_FLAGS, int) -#define VPX_CTRL_VP8E_SET_FRAME_FLAGS -VPX_CTRL_USE_TYPE(VP8E_SET_TEMPORAL_LAYER_ID, int) -#define VPX_CTRL_VP8E_SET_TEMPORAL_LAYER_ID VPX_CTRL_USE_TYPE(VP8E_SET_ROI_MAP, vpx_roi_map_t *) #define VPX_CTRL_VP8E_SET_ROI_MAP -VPX_CTRL_USE_TYPE(VP9E_SET_ROI_MAP, vpx_roi_map_t *) -#define VPX_CTRL_VP9E_SET_ROI_MAP VPX_CTRL_USE_TYPE(VP8E_SET_ACTIVEMAP, vpx_active_map_t *) #define VPX_CTRL_VP8E_SET_ACTIVEMAP VPX_CTRL_USE_TYPE(VP8E_SET_SCALEMODE, vpx_scaling_mode_t *) #define VPX_CTRL_VP8E_SET_SCALEMODE - -VPX_CTRL_USE_TYPE(VP9E_SET_SVC, int) -#define VPX_CTRL_VP9E_SET_SVC -VPX_CTRL_USE_TYPE(VP9E_SET_SVC_PARAMETERS, void *) -#define VPX_CTRL_VP9E_SET_SVC_PARAMETERS -VPX_CTRL_USE_TYPE(VP9E_REGISTER_CX_CALLBACK, void *) -#define VPX_CTRL_VP9E_REGISTER_CX_CALLBACK -VPX_CTRL_USE_TYPE(VP9E_SET_SVC_LAYER_ID, vpx_svc_layer_id_t *) -#define VPX_CTRL_VP9E_SET_SVC_LAYER_ID - VPX_CTRL_USE_TYPE(VP8E_SET_CPUUSED, int) #define VPX_CTRL_VP8E_SET_CPUUSED VPX_CTRL_USE_TYPE(VP8E_SET_ENABLEAUTOALTREF, unsigned int) @@ -986,7 +970,10 @@ VPX_CTRL_USE_TYPE(VP8E_SET_STATIC_THRESHOLD, unsigned int) #define VPX_CTRL_VP8E_SET_STATIC_THRESHOLD VPX_CTRL_USE_TYPE(VP8E_SET_TOKEN_PARTITIONS, int) /* vp8e_token_partitions */ #define VPX_CTRL_VP8E_SET_TOKEN_PARTITIONS - +VPX_CTRL_USE_TYPE(VP8E_GET_LAST_QUANTIZER, int *) +#define VPX_CTRL_VP8E_GET_LAST_QUANTIZER +VPX_CTRL_USE_TYPE(VP8E_GET_LAST_QUANTIZER_64, int *) +#define VPX_CTRL_VP8E_GET_LAST_QUANTIZER_64 VPX_CTRL_USE_TYPE(VP8E_SET_ARNR_MAXFRAMES, unsigned int) #define VPX_CTRL_VP8E_SET_ARNR_MAXFRAMES VPX_CTRL_USE_TYPE(VP8E_SET_ARNR_STRENGTH, unsigned int) @@ -997,133 +984,107 @@ VPX_CTRL_USE_TYPE(VP8E_SET_TUNING, int) /* vp8e_tuning */ #define VPX_CTRL_VP8E_SET_TUNING VPX_CTRL_USE_TYPE(VP8E_SET_CQ_LEVEL, unsigned int) #define VPX_CTRL_VP8E_SET_CQ_LEVEL - -VPX_CTRL_USE_TYPE(VP9E_SET_TILE_COLUMNS, int) -#define VPX_CTRL_VP9E_SET_TILE_COLUMNS -VPX_CTRL_USE_TYPE(VP9E_SET_TILE_ROWS, int) -#define VPX_CTRL_VP9E_SET_TILE_ROWS - -VPX_CTRL_USE_TYPE(VP9E_SET_TPL, int) -#define VPX_CTRL_VP9E_SET_TPL - -VPX_CTRL_USE_TYPE(VP8E_GET_LAST_QUANTIZER, int *) -#define VPX_CTRL_VP8E_GET_LAST_QUANTIZER -VPX_CTRL_USE_TYPE(VP8E_GET_LAST_QUANTIZER_64, int *) -#define VPX_CTRL_VP8E_GET_LAST_QUANTIZER_64 -VPX_CTRL_USE_TYPE(VP9E_GET_LAST_QUANTIZER_SVC_LAYERS, int *) -#define VPX_CTRL_VP9E_GET_LAST_QUANTIZER_SVC_LAYERS - -VPX_CTRL_USE_TYPE(VP9E_GET_SVC_LAYER_ID, vpx_svc_layer_id_t *) -#define VPX_CTRL_VP9E_GET_SVC_LAYER_ID - VPX_CTRL_USE_TYPE(VP8E_SET_MAX_INTRA_BITRATE_PCT, unsigned int) #define VPX_CTRL_VP8E_SET_MAX_INTRA_BITRATE_PCT +VPX_CTRL_USE_TYPE(VP8E_SET_FRAME_FLAGS, int) +#define VPX_CTRL_VP8E_SET_FRAME_FLAGS VPX_CTRL_USE_TYPE(VP9E_SET_MAX_INTER_BITRATE_PCT, unsigned int) #define VPX_CTRL_VP9E_SET_MAX_INTER_BITRATE_PCT - -VPX_CTRL_USE_TYPE(VP8E_SET_GF_CBR_BOOST_PCT, unsigned int) -#define VPX_CTRL_VP8E_SET_GF_CBR_BOOST_PCT - -VPX_CTRL_USE_TYPE(VP8E_SET_SCREEN_CONTENT_MODE, unsigned int) -#define VPX_CTRL_VP8E_SET_SCREEN_CONTENT_MODE - VPX_CTRL_USE_TYPE(VP9E_SET_GF_CBR_BOOST_PCT, unsigned int) #define VPX_CTRL_VP9E_SET_GF_CBR_BOOST_PCT - +VPX_CTRL_USE_TYPE(VP8E_SET_TEMPORAL_LAYER_ID, int) +#define VPX_CTRL_VP8E_SET_TEMPORAL_LAYER_ID +VPX_CTRL_USE_TYPE(VP8E_SET_SCREEN_CONTENT_MODE, unsigned int) +#define VPX_CTRL_VP8E_SET_SCREEN_CONTENT_MODE VPX_CTRL_USE_TYPE(VP9E_SET_LOSSLESS, unsigned int) #define VPX_CTRL_VP9E_SET_LOSSLESS - +VPX_CTRL_USE_TYPE(VP9E_SET_TILE_COLUMNS, int) +#define VPX_CTRL_VP9E_SET_TILE_COLUMNS +VPX_CTRL_USE_TYPE(VP9E_SET_TILE_ROWS, int) +#define VPX_CTRL_VP9E_SET_TILE_ROWS VPX_CTRL_USE_TYPE(VP9E_SET_FRAME_PARALLEL_DECODING, unsigned int) #define VPX_CTRL_VP9E_SET_FRAME_PARALLEL_DECODING - VPX_CTRL_USE_TYPE(VP9E_SET_AQ_MODE, unsigned int) #define VPX_CTRL_VP9E_SET_AQ_MODE - -VPX_CTRL_USE_TYPE(VP9E_SET_ALT_REF_AQ, int) -#define VPX_CTRL_VP9E_SET_ALT_REF_AQ - VPX_CTRL_USE_TYPE(VP9E_SET_FRAME_PERIODIC_BOOST, unsigned int) #define VPX_CTRL_VP9E_SET_FRAME_PERIODIC_BOOST - VPX_CTRL_USE_TYPE(VP9E_SET_NOISE_SENSITIVITY, unsigned int) #define VPX_CTRL_VP9E_SET_NOISE_SENSITIVITY - +VPX_CTRL_USE_TYPE(VP9E_SET_SVC, int) +#define VPX_CTRL_VP9E_SET_SVC +VPX_CTRL_USE_TYPE(VP9E_SET_ROI_MAP, vpx_roi_map_t *) +#define VPX_CTRL_VP9E_SET_ROI_MAP +VPX_CTRL_USE_TYPE(VP9E_SET_SVC_PARAMETERS, void *) +#define VPX_CTRL_VP9E_SET_SVC_PARAMETERS +VPX_CTRL_USE_TYPE(VP9E_SET_SVC_LAYER_ID, vpx_svc_layer_id_t *) +#define VPX_CTRL_VP9E_SET_SVC_LAYER_ID VPX_CTRL_USE_TYPE(VP9E_SET_TUNE_CONTENT, int) /* vp9e_tune_content */ #define VPX_CTRL_VP9E_SET_TUNE_CONTENT - +VPX_CTRL_USE_TYPE(VP9E_GET_SVC_LAYER_ID, vpx_svc_layer_id_t *) +#define VPX_CTRL_VP9E_GET_SVC_LAYER_ID +VPX_CTRL_USE_TYPE(VP9E_REGISTER_CX_CALLBACK, void *) +#define VPX_CTRL_VP9E_REGISTER_CX_CALLBACK VPX_CTRL_USE_TYPE(VP9E_SET_COLOR_SPACE, int) #define VPX_CTRL_VP9E_SET_COLOR_SPACE - +VPX_CTRL_USE_TYPE(VP9E_SET_TEMPORAL_LAYERING_MODE, + int) /* VP9E_TEMPORAL_LAYERING_MODE */ +#define VPX_CTRL_VP9E_SET_TEMPORAL_LAYERING_MODE VPX_CTRL_USE_TYPE(VP9E_SET_MIN_GF_INTERVAL, unsigned int) #define VPX_CTRL_VP9E_SET_MIN_GF_INTERVAL - VPX_CTRL_USE_TYPE(VP9E_SET_MAX_GF_INTERVAL, unsigned int) #define VPX_CTRL_VP9E_SET_MAX_GF_INTERVAL - VPX_CTRL_USE_TYPE(VP9E_GET_ACTIVEMAP, vpx_active_map_t *) #define VPX_CTRL_VP9E_GET_ACTIVEMAP - VPX_CTRL_USE_TYPE(VP9E_SET_COLOR_RANGE, int) #define VPX_CTRL_VP9E_SET_COLOR_RANGE - VPX_CTRL_USE_TYPE(VP9E_SET_SVC_REF_FRAME_CONFIG, vpx_svc_ref_frame_config_t *) #define VPX_CTRL_VP9E_SET_SVC_REF_FRAME_CONFIG - VPX_CTRL_USE_TYPE(VP9E_SET_RENDER_SIZE, int *) #define VPX_CTRL_VP9E_SET_RENDER_SIZE - VPX_CTRL_USE_TYPE(VP9E_SET_TARGET_LEVEL, unsigned int) #define VPX_CTRL_VP9E_SET_TARGET_LEVEL - VPX_CTRL_USE_TYPE(VP9E_SET_ROW_MT, unsigned int) #define VPX_CTRL_VP9E_SET_ROW_MT - VPX_CTRL_USE_TYPE(VP9E_GET_LEVEL, int *) #define VPX_CTRL_VP9E_GET_LEVEL - -VPX_CTRL_USE_TYPE(VP9E_GET_LOOPFILTER_LEVEL, int *) -#define VPX_CTRL_VP9E_GET_LOOPFILTER_LEVEL - +VPX_CTRL_USE_TYPE(VP9E_SET_ALT_REF_AQ, int) +#define VPX_CTRL_VP9E_SET_ALT_REF_AQ +VPX_CTRL_USE_TYPE(VP8E_SET_GF_CBR_BOOST_PCT, unsigned int) +#define VPX_CTRL_VP8E_SET_GF_CBR_BOOST_PCT VPX_CTRL_USE_TYPE(VP9E_ENABLE_MOTION_VECTOR_UNIT_TEST, unsigned int) #define VPX_CTRL_VP9E_ENABLE_MOTION_VECTOR_UNIT_TEST - VPX_CTRL_USE_TYPE(VP9E_SET_SVC_INTER_LAYER_PRED, unsigned int) #define VPX_CTRL_VP9E_SET_SVC_INTER_LAYER_PRED - VPX_CTRL_USE_TYPE(VP9E_SET_SVC_FRAME_DROP_LAYER, vpx_svc_frame_drop_t *) #define VPX_CTRL_VP9E_SET_SVC_FRAME_DROP_LAYER - VPX_CTRL_USE_TYPE(VP9E_GET_SVC_REF_FRAME_CONFIG, vpx_svc_ref_frame_config_t *) #define VPX_CTRL_VP9E_GET_SVC_REF_FRAME_CONFIG - VPX_CTRL_USE_TYPE(VP9E_SET_SVC_GF_TEMPORAL_REF, unsigned int) #define VPX_CTRL_VP9E_SET_SVC_GF_TEMPORAL_REF - VPX_CTRL_USE_TYPE(VP9E_SET_SVC_SPATIAL_LAYER_SYNC, vpx_svc_spatial_layer_sync_t *) #define VPX_CTRL_VP9E_SET_SVC_SPATIAL_LAYER_SYNC - +VPX_CTRL_USE_TYPE(VP9E_SET_TPL, int) +#define VPX_CTRL_VP9E_SET_TPL VPX_CTRL_USE_TYPE(VP9E_SET_POSTENCODE_DROP, unsigned int) #define VPX_CTRL_VP9E_SET_POSTENCODE_DROP - VPX_CTRL_USE_TYPE(VP9E_SET_DELTA_Q_UV, int) #define VPX_CTRL_VP9E_SET_DELTA_Q_UV - VPX_CTRL_USE_TYPE(VP9E_SET_DISABLE_OVERSHOOT_MAXQ_CBR, int) #define VPX_CTRL_VP9E_SET_DISABLE_OVERSHOOT_MAXQ_CBR - VPX_CTRL_USE_TYPE(VP9E_SET_DISABLE_LOOPFILTER, int) #define VPX_CTRL_VP9E_SET_DISABLE_LOOPFILTER - +VPX_CTRL_USE_TYPE(VP9E_SET_EXTERNAL_RATE_CONTROL, vpx_rc_funcs_t *) +#define VPX_CTRL_VP9E_SET_EXTERNAL_RATE_CONTROL VPX_CTRL_USE_TYPE(VP9E_SET_RTC_EXTERNAL_RATECTRL, int) #define VPX_CTRL_VP9E_SET_RTC_EXTERNAL_RATECTRL - +VPX_CTRL_USE_TYPE(VP9E_GET_LOOPFILTER_LEVEL, int *) +#define VPX_CTRL_VP9E_GET_LOOPFILTER_LEVEL +VPX_CTRL_USE_TYPE(VP9E_GET_LAST_QUANTIZER_SVC_LAYERS, int *) +#define VPX_CTRL_VP9E_GET_LAST_QUANTIZER_SVC_LAYERS VPX_CTRL_USE_TYPE(VP8E_SET_RTC_EXTERNAL_RATECTRL, int) #define VPX_CTRL_VP8E_SET_RTC_EXTERNAL_RATECTRL -VPX_CTRL_USE_TYPE(VP9E_SET_EXTERNAL_RATE_CONTROL, vpx_rc_funcs_t *) -#define VPX_CTRL_VP9E_SET_EXTERNAL_RATE_CONTROL - /*!\endcond */ /*! @} - end defgroup vp8_encoder */ #ifdef __cplusplus diff --git a/vpx/vp8dx.h b/vpx/vp8dx.h index dcf7a62860..506a8936be 100644 --- a/vpx/vp8dx.h +++ b/vpx/vp8dx.h @@ -177,26 +177,26 @@ VPX_CTRL_USE_TYPE(VP8D_GET_FRAME_CORRUPTED, int *) #define VPX_CTRL_VP8D_GET_FRAME_CORRUPTED VPX_CTRL_USE_TYPE(VP8D_GET_LAST_REF_USED, int *) #define VPX_CTRL_VP8D_GET_LAST_REF_USED -VPX_CTRL_USE_TYPE(VPXD_GET_LAST_QUANTIZER, int *) -#define VPX_CTRL_VPXD_GET_LAST_QUANTIZER VPX_CTRL_USE_TYPE(VPXD_SET_DECRYPTOR, vpx_decrypt_init *) #define VPX_CTRL_VPXD_SET_DECRYPTOR VPX_CTRL_USE_TYPE(VP8D_SET_DECRYPTOR, vpx_decrypt_init *) #define VPX_CTRL_VP8D_SET_DECRYPTOR +VPX_CTRL_USE_TYPE(VP9D_GET_FRAME_SIZE, int *) +#define VPX_CTRL_VP9D_GET_FRAME_SIZE VPX_CTRL_USE_TYPE(VP9D_GET_DISPLAY_SIZE, int *) #define VPX_CTRL_VP9D_GET_DISPLAY_SIZE VPX_CTRL_USE_TYPE(VP9D_GET_BIT_DEPTH, unsigned int *) #define VPX_CTRL_VP9D_GET_BIT_DEPTH -VPX_CTRL_USE_TYPE(VP9D_GET_FRAME_SIZE, int *) -#define VPX_CTRL_VP9D_GET_FRAME_SIZE VPX_CTRL_USE_TYPE(VP9_SET_BYTE_ALIGNMENT, int) #define VPX_CTRL_VP9_SET_BYTE_ALIGNMENT VPX_CTRL_USE_TYPE(VP9_INVERT_TILE_DECODE_ORDER, int) #define VPX_CTRL_VP9_INVERT_TILE_DECODE_ORDER -VPX_CTRL_USE_TYPE(VP9_DECODE_SVC_SPATIAL_LAYER, int) -#define VPX_CTRL_VP9_DECODE_SVC_SPATIAL_LAYER VPX_CTRL_USE_TYPE(VP9_SET_SKIP_LOOP_FILTER, int) #define VPX_CTRL_VP9_SET_SKIP_LOOP_FILTER +VPX_CTRL_USE_TYPE(VP9_DECODE_SVC_SPATIAL_LAYER, int) +#define VPX_CTRL_VP9_DECODE_SVC_SPATIAL_LAYER +VPX_CTRL_USE_TYPE(VPXD_GET_LAST_QUANTIZER, int *) +#define VPX_CTRL_VPXD_GET_LAST_QUANTIZER VPX_CTRL_USE_TYPE(VP9D_SET_ROW_MT, int) #define VPX_CTRL_VP9_DECODE_SET_ROW_MT VPX_CTRL_USE_TYPE(VP9D_SET_LOOP_FILTER_OPT, int) From fc2a31cfb901e8988dc382f094b843bf9fcd4433 Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 1 Feb 2022 11:57:05 -0800 Subject: [PATCH 0278/1000] vp9_thread_test: parameterize VP9DecodeMultiThreadedTest on a per-file basis; this will make sharding more effective Change-Id: Ib797681a7cc3bd7ec835bb0c1c7a8d9f23512a0d --- test/vp9_thread_test.cc | 158 +++++++++++++++++----------------------- 1 file changed, 68 insertions(+), 90 deletions(-) diff --git a/test/vp9_thread_test.cc b/test/vp9_thread_test.cc index 5cac9ea0ee..1ceef8185c 100644 --- a/test/vp9_thread_test.cc +++ b/test/vp9_thread_test.cc @@ -148,11 +148,6 @@ TEST(VPxWorkerThreadTest, TestInterfaceAPI) { // ----------------------------------------------------------------------------- // Multi-threaded decode tests #if CONFIG_WEBM_IO -struct FileList { - const char *name; - const char *expected_md5; -}; - // Decodes |filename| with |num_threads|. Returns the md5 of the decoded frames. string DecodeFile(const string &filename, int num_threads) { libvpx_test::WebMVideoSource video(filename); @@ -182,16 +177,6 @@ string DecodeFile(const string &filename, int num_threads) { return string(md5.Get()); } -void DecodeFiles(const FileList files[]) { - for (const FileList *iter = files; iter->name != nullptr; ++iter) { - SCOPED_TRACE(iter->name); - for (int t = 1; t <= 8; ++t) { - EXPECT_EQ(iter->expected_md5, DecodeFile(iter->name, t)) - << "threads = " << t; - } - } -} - // Trivial serialized thread worker interface implementation. // Note any worker that requires synchronization between other workers will // hang. @@ -228,88 +213,81 @@ TEST(VPxWorkerThreadTest, TestSerialInterface) { EXPECT_EQ(expected_md5, DecodeFile(filename, 2)); } -TEST(VP9DecodeMultiThreadedTest, NoTilesNonFrameParallel) { - // no tiles or frame parallel; this exercises loop filter threading. - EXPECT_EQ("b35a1b707b28e82be025d960aba039bc", - DecodeFile("vp90-2-03-size-226x226.webm", 2)); -} +struct FileParam { + const char *name; + const char *expected_md5; + friend std::ostream &operator<<(std::ostream &os, const FileParam ¶m) { + return os << "file name: " << param.name + << " digest: " << param.expected_md5; + } +}; -TEST(VP9DecodeMultiThreadedTest, FrameParallel) { - static const FileList files[] = { { "vp90-2-08-tile_1x2_frame_parallel.webm", - "68ede6abd66bae0a2edf2eb9232241b6" }, - { "vp90-2-08-tile_1x4_frame_parallel.webm", - "368ebc6ebf3a5e478d85b2c3149b2848" }, - { "vp90-2-08-tile_1x8_frame_parallel.webm", - "17e439da2388aff3a0f69cb22579c6c1" }, - { nullptr, nullptr } }; +class VP9DecodeMultiThreadedTest : public ::testing::TestWithParam { +}; - DecodeFiles(files); +TEST_P(VP9DecodeMultiThreadedTest, Decode) { + for (int t = 1; t <= 8; ++t) { + EXPECT_EQ(GetParam().expected_md5, DecodeFile(GetParam().name, t)) + << "threads = " << t; + } } -TEST(VP9DecodeMultiThreadedTest, FrameParallelResize) { - static const FileList files[] = { - { "vp90-2-14-resize-fp-tiles-1-16.webm", - "0cd5e632c326297e975f38949c31ea94" }, - { "vp90-2-14-resize-fp-tiles-1-2-4-8-16.webm", - "5c78a96a42e7f4a4f6b2edcdb791e44c" }, - { "vp90-2-14-resize-fp-tiles-1-2.webm", - "e030450ae85c3277be2a418769df98e2" }, - { "vp90-2-14-resize-fp-tiles-1-4.webm", - "312eed4e2b64eb7a4e7f18916606a430" }, - { "vp90-2-14-resize-fp-tiles-16-1.webm", - "1755c16d8af16a9cb3fe7338d90abe52" }, - { "vp90-2-14-resize-fp-tiles-16-2.webm", - "500300592d3fcb6f12fab25e48aaf4df" }, - { "vp90-2-14-resize-fp-tiles-16-4.webm", - "47c48379fa6331215d91c67648e1af6e" }, - { "vp90-2-14-resize-fp-tiles-16-8-4-2-1.webm", - "eecf17290739bc708506fa4827665989" }, - { "vp90-2-14-resize-fp-tiles-16-8.webm", - "29b6bb54e4c26b5ca85d5de5fed94e76" }, - { "vp90-2-14-resize-fp-tiles-1-8.webm", - "1b6f175e08cd82cf84bb800ac6d1caa3" }, - { "vp90-2-14-resize-fp-tiles-2-16.webm", - "ca3b03e4197995d8d5444ede7a6c0804" }, - { "vp90-2-14-resize-fp-tiles-2-1.webm", - "99aec065369d70bbb78ccdff65afed3f" }, - { "vp90-2-14-resize-fp-tiles-2-4.webm", - "22d0ebdb49b87d2920a85aea32e1afd5" }, - { "vp90-2-14-resize-fp-tiles-2-8.webm", - "c2115cf051c62e0f7db1d4a783831541" }, - { "vp90-2-14-resize-fp-tiles-4-16.webm", - "c690d7e1719b31367564cac0af0939cb" }, - { "vp90-2-14-resize-fp-tiles-4-1.webm", - "a926020b2cc3e15ad4cc271853a0ff26" }, - { "vp90-2-14-resize-fp-tiles-4-2.webm", - "42699063d9e581f1993d0cf890c2be78" }, - { "vp90-2-14-resize-fp-tiles-4-8.webm", - "7f76d96036382f45121e3d5aa6f8ec52" }, - { "vp90-2-14-resize-fp-tiles-8-16.webm", - "76a43fcdd7e658542913ea43216ec55d" }, - { "vp90-2-14-resize-fp-tiles-8-1.webm", - "8e3fbe89486ca60a59299dea9da91378" }, - { "vp90-2-14-resize-fp-tiles-8-2.webm", - "ae96f21f21b6370cc0125621b441fc52" }, - { "vp90-2-14-resize-fp-tiles-8-4.webm", - "3eb4f24f10640d42218f7fd7b9fd30d4" }, - { nullptr, nullptr } - }; +const FileParam kNoTilesNonFrameParallelFiles[] = { + { "vp90-2-03-size-226x226.webm", "b35a1b707b28e82be025d960aba039bc" } +}; - DecodeFiles(files); -} +const FileParam kFrameParallelFiles[] = { + { "vp90-2-08-tile_1x2_frame_parallel.webm", + "68ede6abd66bae0a2edf2eb9232241b6" }, + { "vp90-2-08-tile_1x4_frame_parallel.webm", + "368ebc6ebf3a5e478d85b2c3149b2848" }, + { "vp90-2-08-tile_1x8_frame_parallel.webm", + "17e439da2388aff3a0f69cb22579c6c1" }, +}; -TEST(VP9DecodeMultiThreadedTest, NonFrameParallel) { - static const FileList files[] = { - { "vp90-2-08-tile_1x2.webm", "570b4a5d5a70d58b5359671668328a16" }, - { "vp90-2-08-tile_1x4.webm", "988d86049e884c66909d2d163a09841a" }, - { "vp90-2-08-tile_1x8.webm", "0941902a52e9092cb010905eab16364c" }, - { "vp90-2-08-tile-4x1.webm", "06505aade6647c583c8e00a2f582266f" }, - { "vp90-2-08-tile-4x4.webm", "85c2299892460d76e2c600502d52bfe2" }, - { nullptr, nullptr } - }; +const FileParam kFrameParallelResizeFiles[] = { + { "vp90-2-14-resize-fp-tiles-1-16.webm", "0cd5e632c326297e975f38949c31ea94" }, + { "vp90-2-14-resize-fp-tiles-1-2-4-8-16.webm", + "5c78a96a42e7f4a4f6b2edcdb791e44c" }, + { "vp90-2-14-resize-fp-tiles-1-2.webm", "e030450ae85c3277be2a418769df98e2" }, + { "vp90-2-14-resize-fp-tiles-1-4.webm", "312eed4e2b64eb7a4e7f18916606a430" }, + { "vp90-2-14-resize-fp-tiles-16-1.webm", "1755c16d8af16a9cb3fe7338d90abe52" }, + { "vp90-2-14-resize-fp-tiles-16-2.webm", "500300592d3fcb6f12fab25e48aaf4df" }, + { "vp90-2-14-resize-fp-tiles-16-4.webm", "47c48379fa6331215d91c67648e1af6e" }, + { "vp90-2-14-resize-fp-tiles-16-8-4-2-1.webm", + "eecf17290739bc708506fa4827665989" }, + { "vp90-2-14-resize-fp-tiles-16-8.webm", "29b6bb54e4c26b5ca85d5de5fed94e76" }, + { "vp90-2-14-resize-fp-tiles-1-8.webm", "1b6f175e08cd82cf84bb800ac6d1caa3" }, + { "vp90-2-14-resize-fp-tiles-2-16.webm", "ca3b03e4197995d8d5444ede7a6c0804" }, + { "vp90-2-14-resize-fp-tiles-2-1.webm", "99aec065369d70bbb78ccdff65afed3f" }, + { "vp90-2-14-resize-fp-tiles-2-4.webm", "22d0ebdb49b87d2920a85aea32e1afd5" }, + { "vp90-2-14-resize-fp-tiles-2-8.webm", "c2115cf051c62e0f7db1d4a783831541" }, + { "vp90-2-14-resize-fp-tiles-4-16.webm", "c690d7e1719b31367564cac0af0939cb" }, + { "vp90-2-14-resize-fp-tiles-4-1.webm", "a926020b2cc3e15ad4cc271853a0ff26" }, + { "vp90-2-14-resize-fp-tiles-4-2.webm", "42699063d9e581f1993d0cf890c2be78" }, + { "vp90-2-14-resize-fp-tiles-4-8.webm", "7f76d96036382f45121e3d5aa6f8ec52" }, + { "vp90-2-14-resize-fp-tiles-8-16.webm", "76a43fcdd7e658542913ea43216ec55d" }, + { "vp90-2-14-resize-fp-tiles-8-1.webm", "8e3fbe89486ca60a59299dea9da91378" }, + { "vp90-2-14-resize-fp-tiles-8-2.webm", "ae96f21f21b6370cc0125621b441fc52" }, + { "vp90-2-14-resize-fp-tiles-8-4.webm", "3eb4f24f10640d42218f7fd7b9fd30d4" }, +}; - DecodeFiles(files); -} +const FileParam kNonFrameParallelFiles[] = { + { "vp90-2-08-tile_1x2.webm", "570b4a5d5a70d58b5359671668328a16" }, + { "vp90-2-08-tile_1x4.webm", "988d86049e884c66909d2d163a09841a" }, + { "vp90-2-08-tile_1x8.webm", "0941902a52e9092cb010905eab16364c" }, + { "vp90-2-08-tile-4x1.webm", "06505aade6647c583c8e00a2f582266f" }, + { "vp90-2-08-tile-4x4.webm", "85c2299892460d76e2c600502d52bfe2" }, +}; + +INSTANTIATE_TEST_SUITE_P(NoTilesNonFrameParallel, VP9DecodeMultiThreadedTest, + ::testing::ValuesIn(kNoTilesNonFrameParallelFiles)); +INSTANTIATE_TEST_SUITE_P(FrameParallel, VP9DecodeMultiThreadedTest, + ::testing::ValuesIn(kFrameParallelFiles)); +INSTANTIATE_TEST_SUITE_P(FrameParallelResize, VP9DecodeMultiThreadedTest, + ::testing::ValuesIn(kFrameParallelResizeFiles)); +INSTANTIATE_TEST_SUITE_P(NonFrameParallel, VP9DecodeMultiThreadedTest, + ::testing::ValuesIn(kNonFrameParallelFiles)); #endif // CONFIG_WEBM_IO INSTANTIATE_TEST_SUITE_P(Synchronous, VPxWorkerThreadTest, ::testing::Bool()); From 847a0ef84f4d5bc3c28124742fe47bb277a1f5fe Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 1 Feb 2022 16:24:07 -0800 Subject: [PATCH 0279/1000] vp9_roi_test: apply iwyu Change-Id: I715c27e329495940d989f95df65ac10e021261d2 --- test/vp9_roi_test.cc | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/test/vp9_roi_test.cc b/test/vp9_roi_test.cc index 52dfd9e029..e8373c4c0b 100644 --- a/test/vp9_roi_test.cc +++ b/test/vp9_roi_test.cc @@ -8,7 +8,9 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include "memory" +#include +#include +#include #include "third_party/googletest/src/include/gtest/gtest.h" @@ -16,8 +18,11 @@ #include "test/encode_test_driver.h" #include "test/i420_video_source.h" #include "test/util.h" +#include "test/video_source.h" #include "test/y4m_video_source.h" #include "test/yuv_video_source.h" +#include "vpx/vp8cx.h" +#include "vpx/vpx_encoder.h" #define MASK_WIDTH 40 #define MASK_HEIGHT 30 From 74c0f504c4187fd0f923209767c530776588728d Mon Sep 17 00:00:00 2001 From: Marco Paniconi Date: Wed, 2 Feb 2022 16:17:58 -0800 Subject: [PATCH 0280/1000] rtc-vp9: Fix to tests for intra-only frame. Fix some issues with the test, and add new test that verifies that we can decode base stream startinig at middle of sequence where intra-only frame is inserted. Change-Id: I398d23927113eb58ef64694feca25e60ce60a5f7 --- test/svc_end_to_end_test.cc | 54 +++++++++++++++++++++++++++++++------ 1 file changed, 46 insertions(+), 8 deletions(-) diff --git a/test/svc_end_to_end_test.cc b/test/svc_end_to_end_test.cc index 518824d03f..e59e337f1b 100644 --- a/test/svc_end_to_end_test.cc +++ b/test/svc_end_to_end_test.cc @@ -171,9 +171,14 @@ class SyncFrameOnePassCbrSvc : public OnePassCbrSvc, decoder->Control(VP9_DECODE_SVC_SPATIAL_LAYER, decode_to_layer_before_sync_); } else { - if (decode_to_layer_after_sync_ >= 0) - decoder->Control(VP9_DECODE_SVC_SPATIAL_LAYER, - decode_to_layer_after_sync_); + if (decode_to_layer_after_sync_ >= 0) { + int decode_to_layer = decode_to_layer_after_sync_; + // Overlay frame is additional layer for intra-only. + if (video->frame() == frame_to_sync_ && intra_only_test_ && + decode_to_layer_after_sync_ == 0 && number_spatial_layers_ > 1) + decode_to_layer += 1; + decoder->Control(VP9_DECODE_SVC_SPATIAL_LAYER, decode_to_layer); + } } } #endif @@ -246,7 +251,7 @@ class SyncFrameOnePassCbrSvc : public OnePassCbrSvc, cfg_.temporal_layering_mode = 2; } else if (num_temporal_layer == 1) { cfg_.ts_rate_decimator[0] = 1; - cfg_.temporal_layering_mode = 1; + cfg_.temporal_layering_mode = 0; } } }; @@ -390,6 +395,37 @@ TEST_P(SyncFrameOnePassCbrSvc, OnePassCbrSvc2SL3TLSyncFrameVGADenoise) { } #endif +// Encode 3 spatial, 3 temporal layer but don't start decoding. +// During the sequence insert intra-only on base/qvga layer at frame 20 +// and start decoding only QVGA layer from there. +TEST_P(SyncFrameOnePassCbrSvc, + OnePassCbrSvc3SL3TLSyncFrameStartDecodeOnIntraOnlyQVGA) { + SetSvcConfig(3, 3); + frame_to_start_decode_ = 20; + frame_to_sync_ = 20; + decode_to_layer_before_sync_ = 2; + decode_to_layer_after_sync_ = 0; + intra_only_test_ = true; + + // Set up svc layer sync structure. + svc_layer_sync_.base_layer_intra_only = 1; + svc_layer_sync_.spatial_layer_sync[0] = 1; + svc_layer_sync_.spatial_layer_sync[1] = 0; + svc_layer_sync_.spatial_layer_sync[2] = 0; + + ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); + cfg_.rc_target_bitrate = 600; + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + if (0 && decode_to_layer_before_sync_ == decode_to_layer_after_sync_) { + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); + } +#endif +} + // Start decoding from beginning of sequence, during sequence insert intra-only // on base/qvga layer. Decode all layers. TEST_P(SyncFrameOnePassCbrSvc, OnePassCbrSvc3SL3TLSyncFrameIntraOnlyQVGA) { @@ -397,8 +433,9 @@ TEST_P(SyncFrameOnePassCbrSvc, OnePassCbrSvc3SL3TLSyncFrameIntraOnlyQVGA) { frame_to_start_decode_ = 0; frame_to_sync_ = 20; decode_to_layer_before_sync_ = 2; - // The superframe containing intra-only layer will have 4 frames. Thus set the - // layer to decode after sync frame to 3. + // The superframe containing intra-only layer will have +1 frames. Thus set + // the layer to decode after sync frame to +1 from + // decode_to_layer_before_sync. decode_to_layer_after_sync_ = 3; intra_only_test_ = true; @@ -426,8 +463,9 @@ TEST_P(SyncFrameOnePassCbrSvc, OnePassCbrSvc3SL3TLSyncFrameIntraOnlyVGA) { frame_to_start_decode_ = 0; frame_to_sync_ = 20; decode_to_layer_before_sync_ = 2; - // The superframe containing intra-only layer will have 4 frames. Thus set the - // layer to decode after sync frame to 3. + // The superframe containing intra-only layer will have +1 frames. Thus set + // the layer to decode after sync frame to +1 from + // decode_to_layer_before_sync. decode_to_layer_after_sync_ = 3; intra_only_test_ = true; From e2cc35cb673a65ffa14bd36f62390aa221c54393 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Sat, 5 Feb 2022 12:14:37 -0800 Subject: [PATCH 0281/1000] Update error messages in validate_img() Change-Id: I4aa6d2e16e077d29e4e9eabfc7056fcfed6786d6 --- vp9/vp9_cx_iface.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index 9f03ed1728..76274437c6 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -380,8 +380,8 @@ static vpx_codec_err_t validate_img(vpx_codec_alg_priv_t *ctx, case VPX_IMG_FMT_I440: if (ctx->cfg.g_profile != (unsigned int)PROFILE_1) { ERROR( - "Invalid image format. I422, I444, I440, NV12 images are " - "not supported in profile."); + "Invalid image format. I422, I444, I440 images are not supported " + "in profile."); } break; case VPX_IMG_FMT_I42216: @@ -396,8 +396,8 @@ static vpx_codec_err_t validate_img(vpx_codec_alg_priv_t *ctx, break; default: ERROR( - "Invalid image format. Only YV12, I420, I422, I444 images are " - "supported."); + "Invalid image format. Only YV12, I420, I422, I444, I440, NV12 " + "images are supported."); break; } From b22edeb26b8b47155ee94f2d9093cf7cf108bf07 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Sat, 5 Feb 2022 14:32:49 -0800 Subject: [PATCH 0282/1000] Handle NV12 in vpx_img_chroma_subsampling() Change-Id: Ibac9f6f8dcdcae0d0c10ae1a118d13baf2407270 --- vp9/encoder/vp9_encoder.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 8711fa2c07..4609a6bb26 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -2219,6 +2219,7 @@ static INLINE void vpx_img_chroma_subsampling(vpx_img_fmt_t fmt, switch (fmt) { case VPX_IMG_FMT_I420: case VPX_IMG_FMT_YV12: + case VPX_IMG_FMT_NV12: case VPX_IMG_FMT_I422: case VPX_IMG_FMT_I42016: case VPX_IMG_FMT_I42216: *subsampling_x = 1; break; @@ -2229,6 +2230,7 @@ static INLINE void vpx_img_chroma_subsampling(vpx_img_fmt_t fmt, case VPX_IMG_FMT_I420: case VPX_IMG_FMT_I440: case VPX_IMG_FMT_YV12: + case VPX_IMG_FMT_NV12: case VPX_IMG_FMT_I42016: case VPX_IMG_FMT_I44016: *subsampling_y = 1; break; default: *subsampling_y = 0; break; From 85a9bdc6cc0ab6be4a2fb2c93f9e1551688489f6 Mon Sep 17 00:00:00 2001 From: Lu Wang Date: Wed, 10 Nov 2021 15:05:42 +0800 Subject: [PATCH 0283/1000] vpx_util[loongarch]: Add loongson_intrinsics.h v1.0.5. Bug: webm:1755 Change-Id: Id2fa999bdb8788bd4285114c748c547fa262a95e --- vpx_util/loongson_intrinsics.h | 1869 ++++++++++++++++++++++++++++++++ 1 file changed, 1869 insertions(+) create mode 100644 vpx_util/loongson_intrinsics.h diff --git a/vpx_util/loongson_intrinsics.h b/vpx_util/loongson_intrinsics.h new file mode 100644 index 0000000000..a34b6e8b44 --- /dev/null +++ b/vpx_util/loongson_intrinsics.h @@ -0,0 +1,1869 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + * + */ + +#ifndef VPX_VPX_UTIL_LOONGSON_INTRINSICS_H_ +#define VPX_VPX_UTIL_LOONGSON_INTRINSICS_H_ + +/* + * Copyright (c) 2021 Loongson Technology Corporation Limited + * All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + * + * Contributed by Shiyou Yin + * Xiwei Gu + * Lu Wang + * + * This file is a header file for loongarch builtin extension. + * + */ + +#ifndef LOONGSON_INTRINSICS_H +#define LOONGSON_INTRINSICS_H + +/** + * MAJOR version: Macro usage changes. + * MINOR version: Add new functions, or bug fixes. + * MICRO version: Comment changes or implementation changes. + */ +#define LSOM_VERSION_MAJOR 1 +#define LSOM_VERSION_MINOR 0 +#define LSOM_VERSION_MICRO 5 + +#define DUP2_ARG1(_INS, _IN0, _IN1, _OUT0, _OUT1) \ + { \ + _OUT0 = _INS(_IN0); \ + _OUT1 = _INS(_IN1); \ + } + +#define DUP2_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1) \ + { \ + _OUT0 = _INS(_IN0, _IN1); \ + _OUT1 = _INS(_IN2, _IN3); \ + } + +#define DUP2_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _OUT0, _OUT1) \ + { \ + _OUT0 = _INS(_IN0, _IN1, _IN2); \ + _OUT1 = _INS(_IN3, _IN4, _IN5); \ + } + +#define DUP4_ARG1(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1, _OUT2, _OUT3) \ + { \ + DUP2_ARG1(_INS, _IN0, _IN1, _OUT0, _OUT1); \ + DUP2_ARG1(_INS, _IN2, _IN3, _OUT2, _OUT3); \ + } + +#define DUP4_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _IN6, _IN7, _OUT0, \ + _OUT1, _OUT2, _OUT3) \ + { \ + DUP2_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1); \ + DUP2_ARG2(_INS, _IN4, _IN5, _IN6, _IN7, _OUT2, _OUT3); \ + } + +#define DUP4_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _IN6, _IN7, _IN8, \ + _IN9, _IN10, _IN11, _OUT0, _OUT1, _OUT2, _OUT3) \ + { \ + DUP2_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _OUT0, _OUT1); \ + DUP2_ARG3(_INS, _IN6, _IN7, _IN8, _IN9, _IN10, _IN11, _OUT2, _OUT3); \ + } + +#ifdef __loongarch_sx +#include +/* + * ============================================================================= + * Description : Dot product & addition of byte vector elements + * Arguments : Inputs - in_c, in_h, in_l + * Outputs - out + * Return Type - halfword + * Details : Signed byte elements from in_h are multiplied by + * signed byte elements from in_l, and then added adjacent to + * each other to get results with the twice size of input. + * Then the results plus to signed half-word elements from in_c. + * Example : out = __lsx_vdp2add_h_b(in_c, in_h, in_l) + * in_c : 1,2,3,4, 1,2,3,4 + * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 + * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1 + * out : 23,40,41,26, 23,40,41,26 + * ============================================================================= + */ +static inline __m128i __lsx_vdp2add_h_b(__m128i in_c, __m128i in_h, + __m128i in_l) { + __m128i out; + + out = __lsx_vmaddwev_h_b(in_c, in_h, in_l); + out = __lsx_vmaddwod_h_b(out, in_h, in_l); + return out; +} + +/* + * ============================================================================= + * Description : Dot product & addition of byte vector elements + * Arguments : Inputs - in_c, in_h, in_l + * Outputs - out + * Return Type - halfword + * Details : Unsigned byte elements from in_h are multiplied by + * unsigned byte elements from in_l, and then added adjacent to + * each other to get results with the twice size of input. + * The results plus to signed half-word elements from in_c. + * Example : out = __lsx_vdp2add_h_b(in_c, in_h, in_l) + * in_c : 1,2,3,4, 1,2,3,4 + * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 + * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1 + * out : 23,40,41,26, 23,40,41,26 + * ============================================================================= + */ +static inline __m128i __lsx_vdp2add_h_bu(__m128i in_c, __m128i in_h, + __m128i in_l) { + __m128i out; + + out = __lsx_vmaddwev_h_bu(in_c, in_h, in_l); + out = __lsx_vmaddwod_h_bu(out, in_h, in_l); + return out; +} + +/* + * ============================================================================= + * Description : Dot product & addition of half-word vector elements + * Arguments : Inputs - in_c, in_h, in_l + * Outputs - out + * Return Type - __m128i + * Details : Signed half-word elements from in_h are multiplied by + * signed half-word elements from in_l, and then added adjacent to + * each other to get results with the twice size of input. + * Then the results plus to signed word elements from in_c. + * Example : out = __lsx_vdp2add_h_b(in_c, in_h, in_l) + * in_c : 1,2,3,4 + * in_h : 1,2,3,4, 5,6,7,8 + * in_l : 8,7,6,5, 4,3,2,1 + * out : 23,40,41,26 + * ============================================================================= + */ +static inline __m128i __lsx_vdp2add_w_h(__m128i in_c, __m128i in_h, + __m128i in_l) { + __m128i out; + + out = __lsx_vmaddwev_w_h(in_c, in_h, in_l); + out = __lsx_vmaddwod_w_h(out, in_h, in_l); + return out; +} + +/* + * ============================================================================= + * Description : Dot product of byte vector elements + * Arguments : Inputs - in_h, in_l + * Outputs - out + * Return Type - halfword + * Details : Signed byte elements from in_h are multiplied by + * signed byte elements from in_l, and then added adjacent to + * each other to get results with the twice size of input. + * Example : out = __lsx_vdp2_h_b(in_h, in_l) + * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 + * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1 + * out : 22,38,38,22, 22,38,38,22 + * ============================================================================= + */ +static inline __m128i __lsx_vdp2_h_b(__m128i in_h, __m128i in_l) { + __m128i out; + + out = __lsx_vmulwev_h_b(in_h, in_l); + out = __lsx_vmaddwod_h_b(out, in_h, in_l); + return out; +} + +/* + * ============================================================================= + * Description : Dot product of byte vector elements + * Arguments : Inputs - in_h, in_l + * Outputs - out + * Return Type - halfword + * Details : Unsigned byte elements from in_h are multiplied by + * unsigned byte elements from in_l, and then added adjacent to + * each other to get results with the twice size of input. + * Example : out = __lsx_vdp2_h_bu(in_h, in_l) + * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 + * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1 + * out : 22,38,38,22, 22,38,38,22 + * ============================================================================= + */ +static inline __m128i __lsx_vdp2_h_bu(__m128i in_h, __m128i in_l) { + __m128i out; + + out = __lsx_vmulwev_h_bu(in_h, in_l); + out = __lsx_vmaddwod_h_bu(out, in_h, in_l); + return out; +} + +/* + * ============================================================================= + * Description : Dot product of byte vector elements + * Arguments : Inputs - in_h, in_l + * Outputs - out + * Return Type - halfword + * Details : Unsigned byte elements from in_h are multiplied by + * signed byte elements from in_l, and then added adjacent to + * each other to get results with the twice size of input. + * Example : out = __lsx_vdp2_h_bu_b(in_h, in_l) + * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 + * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,-1 + * out : 22,38,38,22, 22,38,38,6 + * ============================================================================= + */ +static inline __m128i __lsx_vdp2_h_bu_b(__m128i in_h, __m128i in_l) { + __m128i out; + + out = __lsx_vmulwev_h_bu_b(in_h, in_l); + out = __lsx_vmaddwod_h_bu_b(out, in_h, in_l); + return out; +} + +/* + * ============================================================================= + * Description : Dot product of byte vector elements + * Arguments : Inputs - in_h, in_l + * Outputs - out + * Return Type - halfword + * Details : Signed byte elements from in_h are multiplied by + * signed byte elements from in_l, and then added adjacent to + * each other to get results with the twice size of input. + * Example : out = __lsx_vdp2_w_h(in_h, in_l) + * in_h : 1,2,3,4, 5,6,7,8 + * in_l : 8,7,6,5, 4,3,2,1 + * out : 22,38,38,22 + * ============================================================================= + */ +static inline __m128i __lsx_vdp2_w_h(__m128i in_h, __m128i in_l) { + __m128i out; + + out = __lsx_vmulwev_w_h(in_h, in_l); + out = __lsx_vmaddwod_w_h(out, in_h, in_l); + return out; +} + +/* + * ============================================================================= + * Description : Clip all halfword elements of input vector between min & max + * out = ((_in) < (min)) ? (min) : (((_in) > (max)) ? (max) : + * (_in)) + * Arguments : Inputs - _in (input vector) + * - min (min threshold) + * - max (max threshold) + * Outputs - out (output vector with clipped elements) + * Return Type - signed halfword + * Example : out = __lsx_vclip_h(_in) + * _in : -8,2,280,249, -8,255,280,249 + * min : 1,1,1,1, 1,1,1,1 + * max : 9,9,9,9, 9,9,9,9 + * out : 1,2,9,9, 1,9,9,9 + * ============================================================================= + */ +static inline __m128i __lsx_vclip_h(__m128i _in, __m128i min, __m128i max) { + __m128i out; + + out = __lsx_vmax_h(min, _in); + out = __lsx_vmin_h(max, out); + return out; +} + +/* + * ============================================================================= + * Description : Set each element of vector between 0 and 255 + * Arguments : Inputs - _in + * Outputs - out + * Return Type - halfword + * Details : Signed byte elements from _in are clamped between 0 and 255. + * Example : out = __lsx_vclip255_h(_in) + * _in : -8,255,280,249, -8,255,280,249 + * out : 0,255,255,249, 0,255,255,249 + * ============================================================================= + */ +static inline __m128i __lsx_vclip255_h(__m128i _in) { + __m128i out; + + out = __lsx_vmaxi_h(_in, 0); + out = __lsx_vsat_hu(out, 7); + return out; +} + +/* + * ============================================================================= + * Description : Set each element of vector between 0 and 255 + * Arguments : Inputs - _in + * Outputs - out + * Return Type - word + * Details : Signed byte elements from _in are clamped between 0 and 255. + * Example : out = __lsx_vclip255_w(_in) + * _in : -8,255,280,249 + * out : 0,255,255,249 + * ============================================================================= + */ +static inline __m128i __lsx_vclip255_w(__m128i _in) { + __m128i out; + + out = __lsx_vmaxi_w(_in, 0); + out = __lsx_vsat_wu(out, 7); + return out; +} + +/* + * ============================================================================= + * Description : Swap two variables + * Arguments : Inputs - _in0, _in1 + * Outputs - _in0, _in1 (in-place) + * Details : Swapping of two input variables using xor + * Example : LSX_SWAP(_in0, _in1) + * _in0 : 1,2,3,4 + * _in1 : 5,6,7,8 + * _in0(out) : 5,6,7,8 + * _in1(out) : 1,2,3,4 + * ============================================================================= + */ +#define LSX_SWAP(_in0, _in1) \ + { \ + _in0 = __lsx_vxor_v(_in0, _in1); \ + _in1 = __lsx_vxor_v(_in0, _in1); \ + _in0 = __lsx_vxor_v(_in0, _in1); \ + } + +/* + * ============================================================================= + * Description : Transpose 4x4 block with word elements in vectors + * Arguments : Inputs - in0, in1, in2, in3 + * Outputs - out0, out1, out2, out3 + * Details : + * Example : + * 1, 2, 3, 4 1, 5, 9,13 + * 5, 6, 7, 8 to 2, 6,10,14 + * 9,10,11,12 =====> 3, 7,11,15 + * 13,14,15,16 4, 8,12,16 + * ============================================================================= + */ +#define LSX_TRANSPOSE4x4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \ + { \ + __m128i _t0, _t1, _t2, _t3; \ + \ + _t0 = __lsx_vilvl_w(_in1, _in0); \ + _t1 = __lsx_vilvh_w(_in1, _in0); \ + _t2 = __lsx_vilvl_w(_in3, _in2); \ + _t3 = __lsx_vilvh_w(_in3, _in2); \ + _out0 = __lsx_vilvl_d(_t2, _t0); \ + _out1 = __lsx_vilvh_d(_t2, _t0); \ + _out2 = __lsx_vilvl_d(_t3, _t1); \ + _out3 = __lsx_vilvh_d(_t3, _t1); \ + } + +/* + * ============================================================================= + * Description : Transpose 8x8 block with byte elements in vectors + * Arguments : Inputs - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7 + * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, + * _out7 + * Details : The rows of the matrix become columns, and the columns + * become rows. + * Example : LSX_TRANSPOSE8x8_B + * _in0 : 00,01,02,03,04,05,06,07, 00,00,00,00,00,00,00,00 + * _in1 : 10,11,12,13,14,15,16,17, 00,00,00,00,00,00,00,00 + * _in2 : 20,21,22,23,24,25,26,27, 00,00,00,00,00,00,00,00 + * _in3 : 30,31,32,33,34,35,36,37, 00,00,00,00,00,00,00,00 + * _in4 : 40,41,42,43,44,45,46,47, 00,00,00,00,00,00,00,00 + * _in5 : 50,51,52,53,54,55,56,57, 00,00,00,00,00,00,00,00 + * _in6 : 60,61,62,63,64,65,66,67, 00,00,00,00,00,00,00,00 + * _in7 : 70,71,72,73,74,75,76,77, 00,00,00,00,00,00,00,00 + * + * _ out0 : 00,10,20,30,40,50,60,70, 00,00,00,00,00,00,00,00 + * _ out1 : 01,11,21,31,41,51,61,71, 00,00,00,00,00,00,00,00 + * _ out2 : 02,12,22,32,42,52,62,72, 00,00,00,00,00,00,00,00 + * _ out3 : 03,13,23,33,43,53,63,73, 00,00,00,00,00,00,00,00 + * _ out4 : 04,14,24,34,44,54,64,74, 00,00,00,00,00,00,00,00 + * _ out5 : 05,15,25,35,45,55,65,75, 00,00,00,00,00,00,00,00 + * _ out6 : 06,16,26,36,46,56,66,76, 00,00,00,00,00,00,00,00 + * _ out7 : 07,17,27,37,47,57,67,77, 00,00,00,00,00,00,00,00 + * ============================================================================= + */ +#define LSX_TRANSPOSE8x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ + _out0, _out1, _out2, _out3, _out4, _out5, _out6, \ + _out7) \ + { \ + __m128i zero = { 0 }; \ + __m128i shuf8 = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; \ + __m128i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \ + \ + _t0 = __lsx_vilvl_b(_in2, _in0); \ + _t1 = __lsx_vilvl_b(_in3, _in1); \ + _t2 = __lsx_vilvl_b(_in6, _in4); \ + _t3 = __lsx_vilvl_b(_in7, _in5); \ + _t4 = __lsx_vilvl_b(_t1, _t0); \ + _t5 = __lsx_vilvh_b(_t1, _t0); \ + _t6 = __lsx_vilvl_b(_t3, _t2); \ + _t7 = __lsx_vilvh_b(_t3, _t2); \ + _out0 = __lsx_vilvl_w(_t6, _t4); \ + _out2 = __lsx_vilvh_w(_t6, _t4); \ + _out4 = __lsx_vilvl_w(_t7, _t5); \ + _out6 = __lsx_vilvh_w(_t7, _t5); \ + _out1 = __lsx_vshuf_b(zero, _out0, shuf8); \ + _out3 = __lsx_vshuf_b(zero, _out2, shuf8); \ + _out5 = __lsx_vshuf_b(zero, _out4, shuf8); \ + _out7 = __lsx_vshuf_b(zero, _out6, shuf8); \ + } + +/* + * ============================================================================= + * Description : Transpose 8x8 block with half-word elements in vectors + * Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 + * Outputs - out0, out1, out2, out3, out4, out5, out6, out7 + * Details : + * Example : + * 00,01,02,03,04,05,06,07 00,10,20,30,40,50,60,70 + * 10,11,12,13,14,15,16,17 01,11,21,31,41,51,61,71 + * 20,21,22,23,24,25,26,27 02,12,22,32,42,52,62,72 + * 30,31,32,33,34,35,36,37 to 03,13,23,33,43,53,63,73 + * 40,41,42,43,44,45,46,47 ======> 04,14,24,34,44,54,64,74 + * 50,51,52,53,54,55,56,57 05,15,25,35,45,55,65,75 + * 60,61,62,63,64,65,66,67 06,16,26,36,46,56,66,76 + * 70,71,72,73,74,75,76,77 07,17,27,37,47,57,67,77 + * ============================================================================= + */ +#define LSX_TRANSPOSE8x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ + _out0, _out1, _out2, _out3, _out4, _out5, _out6, \ + _out7) \ + { \ + __m128i _s0, _s1, _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \ + \ + _s0 = __lsx_vilvl_h(_in6, _in4); \ + _s1 = __lsx_vilvl_h(_in7, _in5); \ + _t0 = __lsx_vilvl_h(_s1, _s0); \ + _t1 = __lsx_vilvh_h(_s1, _s0); \ + _s0 = __lsx_vilvh_h(_in6, _in4); \ + _s1 = __lsx_vilvh_h(_in7, _in5); \ + _t2 = __lsx_vilvl_h(_s1, _s0); \ + _t3 = __lsx_vilvh_h(_s1, _s0); \ + _s0 = __lsx_vilvl_h(_in2, _in0); \ + _s1 = __lsx_vilvl_h(_in3, _in1); \ + _t4 = __lsx_vilvl_h(_s1, _s0); \ + _t5 = __lsx_vilvh_h(_s1, _s0); \ + _s0 = __lsx_vilvh_h(_in2, _in0); \ + _s1 = __lsx_vilvh_h(_in3, _in1); \ + _t6 = __lsx_vilvl_h(_s1, _s0); \ + _t7 = __lsx_vilvh_h(_s1, _s0); \ + \ + _out0 = __lsx_vpickev_d(_t0, _t4); \ + _out2 = __lsx_vpickev_d(_t1, _t5); \ + _out4 = __lsx_vpickev_d(_t2, _t6); \ + _out6 = __lsx_vpickev_d(_t3, _t7); \ + _out1 = __lsx_vpickod_d(_t0, _t4); \ + _out3 = __lsx_vpickod_d(_t1, _t5); \ + _out5 = __lsx_vpickod_d(_t2, _t6); \ + _out7 = __lsx_vpickod_d(_t3, _t7); \ + } + +/* + * ============================================================================= + * Description : Transpose input 8x4 byte block into 4x8 + * Arguments : Inputs - _in0, _in1, _in2, _in3 (input 8x4 byte block) + * Outputs - _out0, _out1, _out2, _out3 (output 4x8 byte block) + * Return Type - as per RTYPE + * Details : The rows of the matrix become columns, and the columns become + * rows. + * Example : LSX_TRANSPOSE8x4_B + * _in0 : 00,01,02,03,00,00,00,00, 00,00,00,00,00,00,00,00 + * _in1 : 10,11,12,13,00,00,00,00, 00,00,00,00,00,00,00,00 + * _in2 : 20,21,22,23,00,00,00,00, 00,00,00,00,00,00,00,00 + * _in3 : 30,31,32,33,00,00,00,00, 00,00,00,00,00,00,00,00 + * _in4 : 40,41,42,43,00,00,00,00, 00,00,00,00,00,00,00,00 + * _in5 : 50,51,52,53,00,00,00,00, 00,00,00,00,00,00,00,00 + * _in6 : 60,61,62,63,00,00,00,00, 00,00,00,00,00,00,00,00 + * _in7 : 70,71,72,73,00,00,00,00, 00,00,00,00,00,00,00,00 + * + * _out0 : 00,10,20,30,40,50,60,70, 00,00,00,00,00,00,00,00 + * _out1 : 01,11,21,31,41,51,61,71, 00,00,00,00,00,00,00,00 + * _out2 : 02,12,22,32,42,52,62,72, 00,00,00,00,00,00,00,00 + * _out3 : 03,13,23,33,43,53,63,73, 00,00,00,00,00,00,00,00 + * ============================================================================= + */ +#define LSX_TRANSPOSE8x4_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ + _out0, _out1, _out2, _out3) \ + { \ + __m128i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \ + \ + _tmp0_m = __lsx_vpackev_w(_in4, _in0); \ + _tmp1_m = __lsx_vpackev_w(_in5, _in1); \ + _tmp2_m = __lsx_vilvl_b(_tmp1_m, _tmp0_m); \ + _tmp0_m = __lsx_vpackev_w(_in6, _in2); \ + _tmp1_m = __lsx_vpackev_w(_in7, _in3); \ + \ + _tmp3_m = __lsx_vilvl_b(_tmp1_m, _tmp0_m); \ + _tmp0_m = __lsx_vilvl_h(_tmp3_m, _tmp2_m); \ + _tmp1_m = __lsx_vilvh_h(_tmp3_m, _tmp2_m); \ + \ + _out0 = __lsx_vilvl_w(_tmp1_m, _tmp0_m); \ + _out2 = __lsx_vilvh_w(_tmp1_m, _tmp0_m); \ + _out1 = __lsx_vilvh_d(_out2, _out0); \ + _out3 = __lsx_vilvh_d(_out0, _out2); \ + } + +/* + * ============================================================================= + * Description : Transpose 16x8 block with byte elements in vectors + * Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, in8 + * in9, in10, in11, in12, in13, in14, in15 + * Outputs - out0, out1, out2, out3, out4, out5, out6, out7 + * Details : + * Example : + * 000,001,002,003,004,005,006,007 + * 008,009,010,011,012,013,014,015 + * 016,017,018,019,020,021,022,023 + * 024,025,026,027,028,029,030,031 + * 032,033,034,035,036,037,038,039 + * 040,041,042,043,044,045,046,047 000,008,...,112,120 + * 048,049,050,051,052,053,054,055 001,009,...,113,121 + * 056,057,058,059,060,061,062,063 to 002,010,...,114,122 + * 064,068,066,067,068,069,070,071 =====> 003,011,...,115,123 + * 072,073,074,075,076,077,078,079 004,012,...,116,124 + * 080,081,082,083,084,085,086,087 005,013,...,117,125 + * 088,089,090,091,092,093,094,095 006,014,...,118,126 + * 096,097,098,099,100,101,102,103 007,015,...,119,127 + * 104,105,106,107,108,109,110,111 + * 112,113,114,115,116,117,118,119 + * 120,121,122,123,124,125,126,127 + * ============================================================================= + */ +#define LSX_TRANSPOSE16x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ + _in8, _in9, _in10, _in11, _in12, _in13, _in14, \ + _in15, _out0, _out1, _out2, _out3, _out4, _out5, \ + _out6, _out7) \ + { \ + __m128i _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7; \ + __m128i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \ + DUP4_ARG2(__lsx_vilvl_b, _in2, _in0, _in3, _in1, _in6, _in4, _in7, _in5, \ + _tmp0, _tmp1, _tmp2, _tmp3); \ + DUP4_ARG2(__lsx_vilvl_b, _in10, _in8, _in11, _in9, _in14, _in12, _in15, \ + _in13, _tmp4, _tmp5, _tmp6, _tmp7); \ + DUP2_ARG2(__lsx_vilvl_b, _tmp1, _tmp0, _tmp3, _tmp2, _t0, _t2); \ + DUP2_ARG2(__lsx_vilvh_b, _tmp1, _tmp0, _tmp3, _tmp2, _t1, _t3); \ + DUP2_ARG2(__lsx_vilvl_b, _tmp5, _tmp4, _tmp7, _tmp6, _t4, _t6); \ + DUP2_ARG2(__lsx_vilvh_b, _tmp5, _tmp4, _tmp7, _tmp6, _t5, _t7); \ + DUP2_ARG2(__lsx_vilvl_w, _t2, _t0, _t3, _t1, _tmp0, _tmp4); \ + DUP2_ARG2(__lsx_vilvh_w, _t2, _t0, _t3, _t1, _tmp2, _tmp6); \ + DUP2_ARG2(__lsx_vilvl_w, _t6, _t4, _t7, _t5, _tmp1, _tmp5); \ + DUP2_ARG2(__lsx_vilvh_w, _t6, _t4, _t7, _t5, _tmp3, _tmp7); \ + DUP2_ARG2(__lsx_vilvl_d, _tmp1, _tmp0, _tmp3, _tmp2, _out0, _out2); \ + DUP2_ARG2(__lsx_vilvh_d, _tmp1, _tmp0, _tmp3, _tmp2, _out1, _out3); \ + DUP2_ARG2(__lsx_vilvl_d, _tmp5, _tmp4, _tmp7, _tmp6, _out4, _out6); \ + DUP2_ARG2(__lsx_vilvh_d, _tmp5, _tmp4, _tmp7, _tmp6, _out5, _out7); \ + } + +/* + * ============================================================================= + * Description : Butterfly of 4 input vectors + * Arguments : Inputs - in0, in1, in2, in3 + * Outputs - out0, out1, out2, out3 + * Details : Butterfly operation + * Example : + * out0 = in0 + in3; + * out1 = in1 + in2; + * out2 = in1 - in2; + * out3 = in0 - in3; + * ============================================================================= + */ +#define LSX_BUTTERFLY_4_B(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \ + { \ + _out0 = __lsx_vadd_b(_in0, _in3); \ + _out1 = __lsx_vadd_b(_in1, _in2); \ + _out2 = __lsx_vsub_b(_in1, _in2); \ + _out3 = __lsx_vsub_b(_in0, _in3); \ + } +#define LSX_BUTTERFLY_4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \ + { \ + _out0 = __lsx_vadd_h(_in0, _in3); \ + _out1 = __lsx_vadd_h(_in1, _in2); \ + _out2 = __lsx_vsub_h(_in1, _in2); \ + _out3 = __lsx_vsub_h(_in0, _in3); \ + } +#define LSX_BUTTERFLY_4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \ + { \ + _out0 = __lsx_vadd_w(_in0, _in3); \ + _out1 = __lsx_vadd_w(_in1, _in2); \ + _out2 = __lsx_vsub_w(_in1, _in2); \ + _out3 = __lsx_vsub_w(_in0, _in3); \ + } +#define LSX_BUTTERFLY_4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \ + { \ + _out0 = __lsx_vadd_d(_in0, _in3); \ + _out1 = __lsx_vadd_d(_in1, _in2); \ + _out2 = __lsx_vsub_d(_in1, _in2); \ + _out3 = __lsx_vsub_d(_in0, _in3); \ + } + +/* + * ============================================================================= + * Description : Butterfly of 8 input vectors + * Arguments : Inputs - _in0, _in1, _in2, _in3, ~ + * Outputs - _out0, _out1, _out2, _out3, ~ + * Details : Butterfly operation + * Example : + * _out0 = _in0 + _in7; + * _out1 = _in1 + _in6; + * _out2 = _in2 + _in5; + * _out3 = _in3 + _in4; + * _out4 = _in3 - _in4; + * _out5 = _in2 - _in5; + * _out6 = _in1 - _in6; + * _out7 = _in0 - _in7; + * ============================================================================= + */ +#define LSX_BUTTERFLY_8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ + _out0, _out1, _out2, _out3, _out4, _out5, _out6, \ + _out7) \ + { \ + _out0 = __lsx_vadd_b(_in0, _in7); \ + _out1 = __lsx_vadd_b(_in1, _in6); \ + _out2 = __lsx_vadd_b(_in2, _in5); \ + _out3 = __lsx_vadd_b(_in3, _in4); \ + _out4 = __lsx_vsub_b(_in3, _in4); \ + _out5 = __lsx_vsub_b(_in2, _in5); \ + _out6 = __lsx_vsub_b(_in1, _in6); \ + _out7 = __lsx_vsub_b(_in0, _in7); \ + } + +#define LSX_BUTTERFLY_8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ + _out0, _out1, _out2, _out3, _out4, _out5, _out6, \ + _out7) \ + { \ + _out0 = __lsx_vadd_h(_in0, _in7); \ + _out1 = __lsx_vadd_h(_in1, _in6); \ + _out2 = __lsx_vadd_h(_in2, _in5); \ + _out3 = __lsx_vadd_h(_in3, _in4); \ + _out4 = __lsx_vsub_h(_in3, _in4); \ + _out5 = __lsx_vsub_h(_in2, _in5); \ + _out6 = __lsx_vsub_h(_in1, _in6); \ + _out7 = __lsx_vsub_h(_in0, _in7); \ + } + +#define LSX_BUTTERFLY_8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ + _out0, _out1, _out2, _out3, _out4, _out5, _out6, \ + _out7) \ + { \ + _out0 = __lsx_vadd_w(_in0, _in7); \ + _out1 = __lsx_vadd_w(_in1, _in6); \ + _out2 = __lsx_vadd_w(_in2, _in5); \ + _out3 = __lsx_vadd_w(_in3, _in4); \ + _out4 = __lsx_vsub_w(_in3, _in4); \ + _out5 = __lsx_vsub_w(_in2, _in5); \ + _out6 = __lsx_vsub_w(_in1, _in6); \ + _out7 = __lsx_vsub_w(_in0, _in7); \ + } + +#define LSX_BUTTERFLY_8_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ + _out0, _out1, _out2, _out3, _out4, _out5, _out6, \ + _out7) \ + { \ + _out0 = __lsx_vadd_d(_in0, _in7); \ + _out1 = __lsx_vadd_d(_in1, _in6); \ + _out2 = __lsx_vadd_d(_in2, _in5); \ + _out3 = __lsx_vadd_d(_in3, _in4); \ + _out4 = __lsx_vsub_d(_in3, _in4); \ + _out5 = __lsx_vsub_d(_in2, _in5); \ + _out6 = __lsx_vsub_d(_in1, _in6); \ + _out7 = __lsx_vsub_d(_in0, _in7); \ + } + +#endif // LSX + +#ifdef __loongarch_asx +#include +/* + * ============================================================================= + * Description : Dot product of byte vector elements + * Arguments : Inputs - in_h, in_l + * Output - out + * Return Type - signed halfword + * Details : Unsigned byte elements from in_h are multiplied with + * unsigned byte elements from in_l producing a result + * twice the size of input i.e. signed halfword. + * Then this multiplied results of adjacent odd-even elements + * are added to the out vector + * Example : See out = __lasx_xvdp2_w_h(in_h, in_l) + * ============================================================================= + */ +static inline __m256i __lasx_xvdp2_h_bu(__m256i in_h, __m256i in_l) { + __m256i out; + + out = __lasx_xvmulwev_h_bu(in_h, in_l); + out = __lasx_xvmaddwod_h_bu(out, in_h, in_l); + return out; +} + +/* + * ============================================================================= + * Description : Dot product of byte vector elements + * Arguments : Inputs - in_h, in_l + * Output - out + * Return Type - signed halfword + * Details : Signed byte elements from in_h are multiplied with + * signed byte elements from in_l producing a result + * twice the size of input i.e. signed halfword. + * Then this multiplication results of adjacent odd-even elements + * are added to the out vector + * Example : See out = __lasx_xvdp2_w_h(in_h, in_l) + * ============================================================================= + */ +static inline __m256i __lasx_xvdp2_h_b(__m256i in_h, __m256i in_l) { + __m256i out; + + out = __lasx_xvmulwev_h_b(in_h, in_l); + out = __lasx_xvmaddwod_h_b(out, in_h, in_l); + return out; +} + +/* + * ============================================================================= + * Description : Dot product of halfword vector elements + * Arguments : Inputs - in_h, in_l + * Output - out + * Return Type - signed word + * Details : Signed halfword elements from in_h are multiplied with + * signed halfword elements from in_l producing a result + * twice the size of input i.e. signed word. + * Then this multiplied results of adjacent odd-even elements + * are added to the out vector. + * Example : out = __lasx_xvdp2_w_h(in_h, in_l) + * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 + * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1 + * out : 22,38,38,22, 22,38,38,22 + * ============================================================================= + */ +static inline __m256i __lasx_xvdp2_w_h(__m256i in_h, __m256i in_l) { + __m256i out; + + out = __lasx_xvmulwev_w_h(in_h, in_l); + out = __lasx_xvmaddwod_w_h(out, in_h, in_l); + return out; +} + +/* + * ============================================================================= + * Description : Dot product of word vector elements + * Arguments : Inputs - in_h, in_l + * Output - out + * Return Type - signed double + * Details : Signed word elements from in_h are multiplied with + * signed word elements from in_l producing a result + * twice the size of input i.e. signed double-word. + * Then this multiplied results of adjacent odd-even elements + * are added to the out vector. + * Example : See out = __lasx_xvdp2_w_h(in_h, in_l) + * ============================================================================= + */ +static inline __m256i __lasx_xvdp2_d_w(__m256i in_h, __m256i in_l) { + __m256i out; + + out = __lasx_xvmulwev_d_w(in_h, in_l); + out = __lasx_xvmaddwod_d_w(out, in_h, in_l); + return out; +} + +/* + * ============================================================================= + * Description : Dot product of halfword vector elements + * Arguments : Inputs - in_h, in_l + * Output - out + * Return Type - signed word + * Details : Unsigned halfword elements from in_h are multiplied with + * signed halfword elements from in_l producing a result + * twice the size of input i.e. unsigned word. + * Multiplication result of adjacent odd-even elements + * are added to the out vector + * Example : See out = __lasx_xvdp2_w_h(in_h, in_l) + * ============================================================================= + */ +static inline __m256i __lasx_xvdp2_w_hu_h(__m256i in_h, __m256i in_l) { + __m256i out; + + out = __lasx_xvmulwev_w_hu_h(in_h, in_l); + out = __lasx_xvmaddwod_w_hu_h(out, in_h, in_l); + return out; +} + +/* + * ============================================================================= + * Description : Dot product & addition of byte vector elements + * Arguments : Inputs - in_h, in_l + * Output - out + * Return Type - halfword + * Details : Signed byte elements from in_h are multiplied with + * signed byte elements from in_l producing a result + * twice the size of input i.e. signed halfword. + * Then this multiplied results of adjacent odd-even elements + * are added to the in_c vector. + * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l) + * ============================================================================= + */ +static inline __m256i __lasx_xvdp2add_h_b(__m256i in_c, __m256i in_h, + __m256i in_l) { + __m256i out; + + out = __lasx_xvmaddwev_h_b(in_c, in_h, in_l); + out = __lasx_xvmaddwod_h_b(out, in_h, in_l); + return out; +} + +/* + * ============================================================================= + * Description : Dot product of halfword vector elements + * Arguments : Inputs - in_c, in_h, in_l + * Output - out + * Return Type - per RTYPE + * Details : Signed halfword elements from in_h are multiplied with + * signed halfword elements from in_l producing a result + * twice the size of input i.e. signed word. + * Multiplication result of adjacent odd-even elements + * are added to the in_c vector. + * Example : out = __lasx_xvdp2add_w_h(in_c, in_h, in_l) + * in_c : 1,2,3,4, 1,2,3,4 + * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8, + * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1, + * out : 23,40,41,26, 23,40,41,26 + * ============================================================================= + */ +static inline __m256i __lasx_xvdp2add_w_h(__m256i in_c, __m256i in_h, + __m256i in_l) { + __m256i out; + + out = __lasx_xvmaddwev_w_h(in_c, in_h, in_l); + out = __lasx_xvmaddwod_w_h(out, in_h, in_l); + return out; +} + +/* + * ============================================================================= + * Description : Dot product of halfword vector elements + * Arguments : Inputs - in_c, in_h, in_l + * Output - out + * Return Type - signed word + * Details : Unsigned halfword elements from in_h are multiplied with + * unsigned halfword elements from in_l producing a result + * twice the size of input i.e. signed word. + * Multiplication result of adjacent odd-even elements + * are added to the in_c vector. + * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l) + * ============================================================================= + */ +static inline __m256i __lasx_xvdp2add_w_hu(__m256i in_c, __m256i in_h, + __m256i in_l) { + __m256i out; + + out = __lasx_xvmaddwev_w_hu(in_c, in_h, in_l); + out = __lasx_xvmaddwod_w_hu(out, in_h, in_l); + return out; +} + +/* + * ============================================================================= + * Description : Dot product of halfword vector elements + * Arguments : Inputs - in_c, in_h, in_l + * Output - out + * Return Type - signed word + * Details : Unsigned halfword elements from in_h are multiplied with + * signed halfword elements from in_l producing a result + * twice the size of input i.e. signed word. + * Multiplication result of adjacent odd-even elements + * are added to the in_c vector + * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l) + * ============================================================================= + */ +static inline __m256i __lasx_xvdp2add_w_hu_h(__m256i in_c, __m256i in_h, + __m256i in_l) { + __m256i out; + + out = __lasx_xvmaddwev_w_hu_h(in_c, in_h, in_l); + out = __lasx_xvmaddwod_w_hu_h(out, in_h, in_l); + return out; +} + +/* + * ============================================================================= + * Description : Vector Unsigned Dot Product and Subtract + * Arguments : Inputs - in_c, in_h, in_l + * Output - out + * Return Type - signed halfword + * Details : Unsigned byte elements from in_h are multiplied with + * unsigned byte elements from in_l producing a result + * twice the size of input i.e. signed halfword. + * Multiplication result of adjacent odd-even elements + * are added together and subtracted from double width elements + * in_c vector. + * Example : See out = __lasx_xvdp2sub_w_h(in_c, in_h, in_l) + * ============================================================================= + */ +static inline __m256i __lasx_xvdp2sub_h_bu(__m256i in_c, __m256i in_h, + __m256i in_l) { + __m256i out; + + out = __lasx_xvmulwev_h_bu(in_h, in_l); + out = __lasx_xvmaddwod_h_bu(out, in_h, in_l); + out = __lasx_xvsub_h(in_c, out); + return out; +} + +/* + * ============================================================================= + * Description : Vector Signed Dot Product and Subtract + * Arguments : Inputs - in_c, in_h, in_l + * Output - out + * Return Type - signed word + * Details : Signed halfword elements from in_h are multiplied with + * Signed halfword elements from in_l producing a result + * twice the size of input i.e. signed word. + * Multiplication result of adjacent odd-even elements + * are added together and subtracted from double width elements + * in_c vector. + * Example : out = __lasx_xvdp2sub_w_h(in_c, in_h, in_l) + * in_c : 0,0,0,0, 0,0,0,0 + * in_h : 3,1,3,0, 0,0,0,1, 0,0,1,1, 0,0,0,1 + * in_l : 2,1,1,0, 1,0,0,0, 0,0,1,0, 1,0,0,1 + * out : -7,-3,0,0, 0,-1,0,-1 + * ============================================================================= + */ +static inline __m256i __lasx_xvdp2sub_w_h(__m256i in_c, __m256i in_h, + __m256i in_l) { + __m256i out; + + out = __lasx_xvmulwev_w_h(in_h, in_l); + out = __lasx_xvmaddwod_w_h(out, in_h, in_l); + out = __lasx_xvsub_w(in_c, out); + return out; +} + +/* + * ============================================================================= + * Description : Dot product of halfword vector elements + * Arguments : Inputs - in_h, in_l + * Output - out + * Return Type - signed word + * Details : Signed halfword elements from in_h are multiplied with + * signed halfword elements from in_l producing a result + * four times the size of input i.e. signed doubleword. + * Then this multiplication results of four adjacent elements + * are added together and stored to the out vector. + * Example : out = __lasx_xvdp4_d_h(in_h, in_l) + * in_h : 3,1,3,0, 0,0,0,1, 0,0,1,-1, 0,0,0,1 + * in_l : -2,1,1,0, 1,0,0,0, 0,0,1, 0, 1,0,0,1 + * out : -2,0,1,1 + * ============================================================================= + */ +static inline __m256i __lasx_xvdp4_d_h(__m256i in_h, __m256i in_l) { + __m256i out; + + out = __lasx_xvmulwev_w_h(in_h, in_l); + out = __lasx_xvmaddwod_w_h(out, in_h, in_l); + out = __lasx_xvhaddw_d_w(out, out); + return out; +} + +/* + * ============================================================================= + * Description : The high half of the vector elements are expanded and + * added after being doubled. + * Arguments : Inputs - in_h, in_l + * Output - out + * Details : The in_h vector and the in_l vector are added after the + * higher half of the two-fold sign extension (signed byte + * to signed halfword) and stored to the out vector. + * Example : See out = __lasx_xvaddwh_w_h(in_h, in_l) + * ============================================================================= + */ +static inline __m256i __lasx_xvaddwh_h_b(__m256i in_h, __m256i in_l) { + __m256i out; + + out = __lasx_xvilvh_b(in_h, in_l); + out = __lasx_xvhaddw_h_b(out, out); + return out; +} + +/* + * ============================================================================= + * Description : The high half of the vector elements are expanded and + * added after being doubled. + * Arguments : Inputs - in_h, in_l + * Output - out + * Details : The in_h vector and the in_l vector are added after the + * higher half of the two-fold sign extension (signed halfword + * to signed word) and stored to the out vector. + * Example : out = __lasx_xvaddwh_w_h(in_h, in_l) + * in_h : 3, 0,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1 + * in_l : 2,-1,1,2, 1,0,0, 0, 1,0,1, 0, 1,0,0,1 + * out : 1,0,0,-1, 1,0,0, 2 + * ============================================================================= + */ +static inline __m256i __lasx_xvaddwh_w_h(__m256i in_h, __m256i in_l) { + __m256i out; + + out = __lasx_xvilvh_h(in_h, in_l); + out = __lasx_xvhaddw_w_h(out, out); + return out; +} + +/* + * ============================================================================= + * Description : The low half of the vector elements are expanded and + * added after being doubled. + * Arguments : Inputs - in_h, in_l + * Output - out + * Details : The in_h vector and the in_l vector are added after the + * lower half of the two-fold sign extension (signed byte + * to signed halfword) and stored to the out vector. + * Example : See out = __lasx_xvaddwl_w_h(in_h, in_l) + * ============================================================================= + */ +static inline __m256i __lasx_xvaddwl_h_b(__m256i in_h, __m256i in_l) { + __m256i out; + + out = __lasx_xvilvl_b(in_h, in_l); + out = __lasx_xvhaddw_h_b(out, out); + return out; +} + +/* + * ============================================================================= + * Description : The low half of the vector elements are expanded and + * added after being doubled. + * Arguments : Inputs - in_h, in_l + * Output - out + * Details : The in_h vector and the in_l vector are added after the + * lower half of the two-fold sign extension (signed halfword + * to signed word) and stored to the out vector. + * Example : out = __lasx_xvaddwl_w_h(in_h, in_l) + * in_h : 3, 0,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1 + * in_l : 2,-1,1,2, 1,0,0, 0, 1,0,1, 0, 1,0,0,1 + * out : 5,-1,4,2, 1,0,2,-1 + * ============================================================================= + */ +static inline __m256i __lasx_xvaddwl_w_h(__m256i in_h, __m256i in_l) { + __m256i out; + + out = __lasx_xvilvl_h(in_h, in_l); + out = __lasx_xvhaddw_w_h(out, out); + return out; +} + +/* + * ============================================================================= + * Description : The low half of the vector elements are expanded and + * added after being doubled. + * Arguments : Inputs - in_h, in_l + * Output - out + * Details : The out vector and the out vector are added after the + * lower half of the two-fold zero extension (unsigned byte + * to unsigned halfword) and stored to the out vector. + * Example : See out = __lasx_xvaddwl_w_h(in_h, in_l) + * ============================================================================= + */ +static inline __m256i __lasx_xvaddwl_h_bu(__m256i in_h, __m256i in_l) { + __m256i out; + + out = __lasx_xvilvl_b(in_h, in_l); + out = __lasx_xvhaddw_hu_bu(out, out); + return out; +} + +/* + * ============================================================================= + * Description : The low half of the vector elements are expanded and + * added after being doubled. + * Arguments : Inputs - in_h, in_l + * Output - out + * Details : The in_l vector after double zero extension (unsigned byte to + * signed halfword),added to the in_h vector. + * Example : See out = __lasx_xvaddw_w_w_h(in_h, in_l) + * ============================================================================= + */ +static inline __m256i __lasx_xvaddw_h_h_bu(__m256i in_h, __m256i in_l) { + __m256i out; + + out = __lasx_xvsllwil_hu_bu(in_l, 0); + out = __lasx_xvadd_h(in_h, out); + return out; +} + +/* + * ============================================================================= + * Description : The low half of the vector elements are expanded and + * added after being doubled. + * Arguments : Inputs - in_h, in_l + * Output - out + * Details : The in_l vector after double sign extension (signed halfword to + * signed word), added to the in_h vector. + * Example : out = __lasx_xvaddw_w_w_h(in_h, in_l) + * in_h : 0, 1,0,0, -1,0,0,1, + * in_l : 2,-1,1,2, 1,0,0,0, 0,0,1,0, 1,0,0,1, + * out : 2, 0,1,2, -1,0,1,1, + * ============================================================================= + */ +static inline __m256i __lasx_xvaddw_w_w_h(__m256i in_h, __m256i in_l) { + __m256i out; + + out = __lasx_xvsllwil_w_h(in_l, 0); + out = __lasx_xvadd_w(in_h, out); + return out; +} + +/* + * ============================================================================= + * Description : Multiplication and addition calculation after expansion + * of the lower half of the vector. + * Arguments : Inputs - in_c, in_h, in_l + * Output - out + * Details : The in_h vector and the in_l vector are multiplied after + * the lower half of the two-fold sign extension (signed halfword + * to signed word), and the result is added to the vector in_c, + * then stored to the out vector. + * Example : out = __lasx_xvmaddwl_w_h(in_c, in_h, in_l) + * in_c : 1,2,3,4, 5,6,7,8 + * in_h : 1,2,3,4, 1,2,3,4, 5,6,7,8, 5,6,7,8 + * in_l : 200, 300, 400, 500, 2000, 3000, 4000, 5000, + * -200,-300,-400,-500, -2000,-3000,-4000,-5000 + * out : 201, 602,1203,2004, -995, -1794,-2793,-3992 + * ============================================================================= + */ +static inline __m256i __lasx_xvmaddwl_w_h(__m256i in_c, __m256i in_h, + __m256i in_l) { + __m256i tmp0, tmp1, out; + + tmp0 = __lasx_xvsllwil_w_h(in_h, 0); + tmp1 = __lasx_xvsllwil_w_h(in_l, 0); + tmp0 = __lasx_xvmul_w(tmp0, tmp1); + out = __lasx_xvadd_w(tmp0, in_c); + return out; +} + +/* + * ============================================================================= + * Description : Multiplication and addition calculation after expansion + * of the higher half of the vector. + * Arguments : Inputs - in_c, in_h, in_l + * Output - out + * Details : The in_h vector and the in_l vector are multiplied after + * the higher half of the two-fold sign extension (signed + * halfword to signed word), and the result is added to + * the vector in_c, then stored to the out vector. + * Example : See out = __lasx_xvmaddwl_w_h(in_c, in_h, in_l) + * ============================================================================= + */ +static inline __m256i __lasx_xvmaddwh_w_h(__m256i in_c, __m256i in_h, + __m256i in_l) { + __m256i tmp0, tmp1, out; + + tmp0 = __lasx_xvilvh_h(in_h, in_h); + tmp1 = __lasx_xvilvh_h(in_l, in_l); + tmp0 = __lasx_xvmulwev_w_h(tmp0, tmp1); + out = __lasx_xvadd_w(tmp0, in_c); + return out; +} + +/* + * ============================================================================= + * Description : Multiplication calculation after expansion of the lower + * half of the vector. + * Arguments : Inputs - in_h, in_l + * Output - out + * Details : The in_h vector and the in_l vector are multiplied after + * the lower half of the two-fold sign extension (signed + * halfword to signed word), then stored to the out vector. + * Example : out = __lasx_xvmulwl_w_h(in_h, in_l) + * in_h : 3,-1,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1 + * in_l : 2,-1,1,2, 1,0,0, 0, 0,0,1, 0, 1,0,0,1 + * out : 6,1,3,0, 0,0,1,0 + * ============================================================================= + */ +static inline __m256i __lasx_xvmulwl_w_h(__m256i in_h, __m256i in_l) { + __m256i tmp0, tmp1, out; + + tmp0 = __lasx_xvsllwil_w_h(in_h, 0); + tmp1 = __lasx_xvsllwil_w_h(in_l, 0); + out = __lasx_xvmul_w(tmp0, tmp1); + return out; +} + +/* + * ============================================================================= + * Description : Multiplication calculation after expansion of the lower + * half of the vector. + * Arguments : Inputs - in_h, in_l + * Output - out + * Details : The in_h vector and the in_l vector are multiplied after + * the lower half of the two-fold sign extension (signed + * halfword to signed word), then stored to the out vector. + * Example : out = __lasx_xvmulwh_w_h(in_h, in_l) + * in_h : 3,-1,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1 + * in_l : 2,-1,1,2, 1,0,0, 0, 0,0,1, 0, 1,0,0,1 + * out : 0,0,0,0, 0,0,0,1 + * ============================================================================= + */ +static inline __m256i __lasx_xvmulwh_w_h(__m256i in_h, __m256i in_l) { + __m256i tmp0, tmp1, out; + + tmp0 = __lasx_xvilvh_h(in_h, in_h); + tmp1 = __lasx_xvilvh_h(in_l, in_l); + out = __lasx_xvmulwev_w_h(tmp0, tmp1); + return out; +} + +/* + * ============================================================================= + * Description : The low half of the vector elements are added to the high half + * after being doubled, then saturated. + * Arguments : Inputs - in_h, in_l + * Output - out + * Details : The in_h vector adds the in_l vector after the lower half of + * the two-fold zero extension (unsigned byte to unsigned + * halfword) and then saturated. The results are stored to the out + * vector. + * Example : out = __lasx_xvsaddw_hu_hu_bu(in_h, in_l) + * in_h : 2,65532,1,2, 1,0,0,0, 0,0,1,0, 1,0,0,1 + * in_l : 3,6,3,0, 0,0,0,1, 0,0,1,1, 0,0,0,1, 3,18,3,0, 0,0,0,1, 0,0,1,1, + * 0,0,0,1 + * out : 5,65535,4,2, 1,0,0,1, 3,18,4,0, 1,0,0,2, + * ============================================================================= + */ +static inline __m256i __lasx_xvsaddw_hu_hu_bu(__m256i in_h, __m256i in_l) { + __m256i tmp1, out; + __m256i zero = { 0 }; + + tmp1 = __lasx_xvilvl_b(zero, in_l); + out = __lasx_xvsadd_hu(in_h, tmp1); + return out; +} + +/* + * ============================================================================= + * Description : Clip all halfword elements of input vector between min & max + * out = ((in) < (min)) ? (min) : (((in) > (max)) ? (max) : (in)) + * Arguments : Inputs - in (input vector) + * - min (min threshold) + * - max (max threshold) + * Outputs - in (output vector with clipped elements) + * Return Type - signed halfword + * Example : out = __lasx_xvclip_h(in, min, max) + * in : -8,2,280,249, -8,255,280,249, 4,4,4,4, 5,5,5,5 + * min : 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1 + * max : 9,9,9,9, 9,9,9,9, 9,9,9,9, 9,9,9,9 + * out : 1,2,9,9, 1,9,9,9, 4,4,4,4, 5,5,5,5 + * ============================================================================= + */ +static inline __m256i __lasx_xvclip_h(__m256i in, __m256i min, __m256i max) { + __m256i out; + + out = __lasx_xvmax_h(min, in); + out = __lasx_xvmin_h(max, out); + return out; +} + +/* + * ============================================================================= + * Description : Clip all signed halfword elements of input vector + * between 0 & 255 + * Arguments : Inputs - in (input vector) + * Outputs - out (output vector with clipped elements) + * Return Type - signed halfword + * Example : See out = __lasx_xvclip255_w(in) + * ============================================================================= + */ +static inline __m256i __lasx_xvclip255_h(__m256i in) { + __m256i out; + + out = __lasx_xvmaxi_h(in, 0); + out = __lasx_xvsat_hu(out, 7); + return out; +} + +/* + * ============================================================================= + * Description : Clip all signed word elements of input vector + * between 0 & 255 + * Arguments : Inputs - in (input vector) + * Output - out (output vector with clipped elements) + * Return Type - signed word + * Example : out = __lasx_xvclip255_w(in) + * in : -8,255,280,249, -8,255,280,249 + * out : 0,255,255,249, 0,255,255,249 + * ============================================================================= + */ +static inline __m256i __lasx_xvclip255_w(__m256i in) { + __m256i out; + + out = __lasx_xvmaxi_w(in, 0); + out = __lasx_xvsat_wu(out, 7); + return out; +} + +/* + * ============================================================================= + * Description : Indexed halfword element values are replicated to all + * elements in output vector. If 'idx < 8' use xvsplati_l_*, + * if 'idx >= 8' use xvsplati_h_*. + * Arguments : Inputs - in, idx + * Output - out + * Details : Idx element value from in vector is replicated to all + * elements in out vector. + * Valid index range for halfword operation is 0-7 + * Example : out = __lasx_xvsplati_l_h(in, idx) + * in : 20,10,11,12, 13,14,15,16, 0,0,2,0, 0,0,0,0 + * idx : 0x02 + * out : 11,11,11,11, 11,11,11,11, 11,11,11,11, 11,11,11,11 + * ============================================================================= + */ +static inline __m256i __lasx_xvsplati_l_h(__m256i in, int idx) { + __m256i out; + + out = __lasx_xvpermi_q(in, in, 0x02); + out = __lasx_xvreplve_h(out, idx); + return out; +} + +/* + * ============================================================================= + * Description : Indexed halfword element values are replicated to all + * elements in output vector. If 'idx < 8' use xvsplati_l_*, + * if 'idx >= 8' use xvsplati_h_*. + * Arguments : Inputs - in, idx + * Output - out + * Details : Idx element value from in vector is replicated to all + * elements in out vector. + * Valid index range for halfword operation is 0-7 + * Example : out = __lasx_xvsplati_h_h(in, idx) + * in : 20,10,11,12, 13,14,15,16, 0,2,0,0, 0,0,0,0 + * idx : 0x09 + * out : 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2 + * ============================================================================= + */ +static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx) { + __m256i out; + + out = __lasx_xvpermi_q(in, in, 0x13); + out = __lasx_xvreplve_h(out, idx); + return out; +} + +/* + * ============================================================================= + * Description : Transpose 4x4 block with double-word elements in vectors + * Arguments : Inputs - _in0, _in1, _in2, _in3 + * Outputs - _out0, _out1, _out2, _out3 + * Example : LASX_TRANSPOSE4x4_D + * _in0 : 1,2,3,4 + * _in1 : 1,2,3,4 + * _in2 : 1,2,3,4 + * _in3 : 1,2,3,4 + * + * _out0 : 1,1,1,1 + * _out1 : 2,2,2,2 + * _out2 : 3,3,3,3 + * _out3 : 4,4,4,4 + * ============================================================================= + */ +#define LASX_TRANSPOSE4x4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, \ + _out3) \ + { \ + __m256i _tmp0, _tmp1, _tmp2, _tmp3; \ + _tmp0 = __lasx_xvilvl_d(_in1, _in0); \ + _tmp1 = __lasx_xvilvh_d(_in1, _in0); \ + _tmp2 = __lasx_xvilvl_d(_in3, _in2); \ + _tmp3 = __lasx_xvilvh_d(_in3, _in2); \ + _out0 = __lasx_xvpermi_q(_tmp2, _tmp0, 0x20); \ + _out2 = __lasx_xvpermi_q(_tmp2, _tmp0, 0x31); \ + _out1 = __lasx_xvpermi_q(_tmp3, _tmp1, 0x20); \ + _out3 = __lasx_xvpermi_q(_tmp3, _tmp1, 0x31); \ + } + +/* + * ============================================================================= + * Description : Transpose 8x8 block with word elements in vectors + * Arguments : Inputs - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7 + * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, + * _out7 + * Example : LASX_TRANSPOSE8x8_W + * _in0 : 1,2,3,4,5,6,7,8 + * _in1 : 2,2,3,4,5,6,7,8 + * _in2 : 3,2,3,4,5,6,7,8 + * _in3 : 4,2,3,4,5,6,7,8 + * _in4 : 5,2,3,4,5,6,7,8 + * _in5 : 6,2,3,4,5,6,7,8 + * _in6 : 7,2,3,4,5,6,7,8 + * _in7 : 8,2,3,4,5,6,7,8 + * + * _out0 : 1,2,3,4,5,6,7,8 + * _out1 : 2,2,2,2,2,2,2,2 + * _out2 : 3,3,3,3,3,3,3,3 + * _out3 : 4,4,4,4,4,4,4,4 + * _out4 : 5,5,5,5,5,5,5,5 + * _out5 : 6,6,6,6,6,6,6,6 + * _out6 : 7,7,7,7,7,7,7,7 + * _out7 : 8,8,8,8,8,8,8,8 + * ============================================================================= + */ +#define LASX_TRANSPOSE8x8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ + _out0, _out1, _out2, _out3, _out4, _out5, _out6, \ + _out7) \ + { \ + __m256i _s0_m, _s1_m; \ + __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \ + __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \ + \ + _s0_m = __lasx_xvilvl_w(_in2, _in0); \ + _s1_m = __lasx_xvilvl_w(_in3, _in1); \ + _tmp0_m = __lasx_xvilvl_w(_s1_m, _s0_m); \ + _tmp1_m = __lasx_xvilvh_w(_s1_m, _s0_m); \ + _s0_m = __lasx_xvilvh_w(_in2, _in0); \ + _s1_m = __lasx_xvilvh_w(_in3, _in1); \ + _tmp2_m = __lasx_xvilvl_w(_s1_m, _s0_m); \ + _tmp3_m = __lasx_xvilvh_w(_s1_m, _s0_m); \ + _s0_m = __lasx_xvilvl_w(_in6, _in4); \ + _s1_m = __lasx_xvilvl_w(_in7, _in5); \ + _tmp4_m = __lasx_xvilvl_w(_s1_m, _s0_m); \ + _tmp5_m = __lasx_xvilvh_w(_s1_m, _s0_m); \ + _s0_m = __lasx_xvilvh_w(_in6, _in4); \ + _s1_m = __lasx_xvilvh_w(_in7, _in5); \ + _tmp6_m = __lasx_xvilvl_w(_s1_m, _s0_m); \ + _tmp7_m = __lasx_xvilvh_w(_s1_m, _s0_m); \ + _out0 = __lasx_xvpermi_q(_tmp4_m, _tmp0_m, 0x20); \ + _out1 = __lasx_xvpermi_q(_tmp5_m, _tmp1_m, 0x20); \ + _out2 = __lasx_xvpermi_q(_tmp6_m, _tmp2_m, 0x20); \ + _out3 = __lasx_xvpermi_q(_tmp7_m, _tmp3_m, 0x20); \ + _out4 = __lasx_xvpermi_q(_tmp4_m, _tmp0_m, 0x31); \ + _out5 = __lasx_xvpermi_q(_tmp5_m, _tmp1_m, 0x31); \ + _out6 = __lasx_xvpermi_q(_tmp6_m, _tmp2_m, 0x31); \ + _out7 = __lasx_xvpermi_q(_tmp7_m, _tmp3_m, 0x31); \ + } + +/* + * ============================================================================= + * Description : Transpose input 16x8 byte block + * Arguments : Inputs - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, + * _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15 + * (input 16x8 byte block) + * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, + * _out7 (output 8x16 byte block) + * Details : The rows of the matrix become columns, and the columns become + * rows. + * Example : See LASX_TRANSPOSE16x8_H + * ============================================================================= + */ +#define LASX_TRANSPOSE16x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ + _in8, _in9, _in10, _in11, _in12, _in13, _in14, \ + _in15, _out0, _out1, _out2, _out3, _out4, _out5, \ + _out6, _out7) \ + { \ + __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \ + __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \ + \ + _tmp0_m = __lasx_xvilvl_b(_in2, _in0); \ + _tmp1_m = __lasx_xvilvl_b(_in3, _in1); \ + _tmp2_m = __lasx_xvilvl_b(_in6, _in4); \ + _tmp3_m = __lasx_xvilvl_b(_in7, _in5); \ + _tmp4_m = __lasx_xvilvl_b(_in10, _in8); \ + _tmp5_m = __lasx_xvilvl_b(_in11, _in9); \ + _tmp6_m = __lasx_xvilvl_b(_in14, _in12); \ + _tmp7_m = __lasx_xvilvl_b(_in15, _in13); \ + _out0 = __lasx_xvilvl_b(_tmp1_m, _tmp0_m); \ + _out1 = __lasx_xvilvh_b(_tmp1_m, _tmp0_m); \ + _out2 = __lasx_xvilvl_b(_tmp3_m, _tmp2_m); \ + _out3 = __lasx_xvilvh_b(_tmp3_m, _tmp2_m); \ + _out4 = __lasx_xvilvl_b(_tmp5_m, _tmp4_m); \ + _out5 = __lasx_xvilvh_b(_tmp5_m, _tmp4_m); \ + _out6 = __lasx_xvilvl_b(_tmp7_m, _tmp6_m); \ + _out7 = __lasx_xvilvh_b(_tmp7_m, _tmp6_m); \ + _tmp0_m = __lasx_xvilvl_w(_out2, _out0); \ + _tmp2_m = __lasx_xvilvh_w(_out2, _out0); \ + _tmp4_m = __lasx_xvilvl_w(_out3, _out1); \ + _tmp6_m = __lasx_xvilvh_w(_out3, _out1); \ + _tmp1_m = __lasx_xvilvl_w(_out6, _out4); \ + _tmp3_m = __lasx_xvilvh_w(_out6, _out4); \ + _tmp5_m = __lasx_xvilvl_w(_out7, _out5); \ + _tmp7_m = __lasx_xvilvh_w(_out7, _out5); \ + _out0 = __lasx_xvilvl_d(_tmp1_m, _tmp0_m); \ + _out1 = __lasx_xvilvh_d(_tmp1_m, _tmp0_m); \ + _out2 = __lasx_xvilvl_d(_tmp3_m, _tmp2_m); \ + _out3 = __lasx_xvilvh_d(_tmp3_m, _tmp2_m); \ + _out4 = __lasx_xvilvl_d(_tmp5_m, _tmp4_m); \ + _out5 = __lasx_xvilvh_d(_tmp5_m, _tmp4_m); \ + _out6 = __lasx_xvilvl_d(_tmp7_m, _tmp6_m); \ + _out7 = __lasx_xvilvh_d(_tmp7_m, _tmp6_m); \ + } + +/* + * ============================================================================= + * Description : Transpose input 16x8 byte block + * Arguments : Inputs - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, + * _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15 + * (input 16x8 byte block) + * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, + * _out7 (output 8x16 byte block) + * Details : The rows of the matrix become columns, and the columns become + * rows. + * Example : LASX_TRANSPOSE16x8_H + * _in0 : 1,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 + * _in1 : 2,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 + * _in2 : 3,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 + * _in3 : 4,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 + * _in4 : 5,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 + * _in5 : 6,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 + * _in6 : 7,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 + * _in7 : 8,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 + * _in8 : 9,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 + * _in9 : 1,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 + * _in10 : 0,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 + * _in11 : 2,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 + * _in12 : 3,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 + * _in13 : 7,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 + * _in14 : 5,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 + * _in15 : 6,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 + * + * _out0 : 1,2,3,4,5,6,7,8,9,1,0,2,3,7,5,6 + * _out1 : 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2 + * _out2 : 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3 + * _out3 : 4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4 + * _out4 : 5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5 + * _out5 : 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6 + * _out6 : 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7 + * _out7 : 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8 + * ============================================================================= + */ +#define LASX_TRANSPOSE16x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ + _in8, _in9, _in10, _in11, _in12, _in13, _in14, \ + _in15, _out0, _out1, _out2, _out3, _out4, _out5, \ + _out6, _out7) \ + { \ + __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \ + __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \ + __m256i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \ + \ + _tmp0_m = __lasx_xvilvl_h(_in2, _in0); \ + _tmp1_m = __lasx_xvilvl_h(_in3, _in1); \ + _tmp2_m = __lasx_xvilvl_h(_in6, _in4); \ + _tmp3_m = __lasx_xvilvl_h(_in7, _in5); \ + _tmp4_m = __lasx_xvilvl_h(_in10, _in8); \ + _tmp5_m = __lasx_xvilvl_h(_in11, _in9); \ + _tmp6_m = __lasx_xvilvl_h(_in14, _in12); \ + _tmp7_m = __lasx_xvilvl_h(_in15, _in13); \ + _t0 = __lasx_xvilvl_h(_tmp1_m, _tmp0_m); \ + _t1 = __lasx_xvilvh_h(_tmp1_m, _tmp0_m); \ + _t2 = __lasx_xvilvl_h(_tmp3_m, _tmp2_m); \ + _t3 = __lasx_xvilvh_h(_tmp3_m, _tmp2_m); \ + _t4 = __lasx_xvilvl_h(_tmp5_m, _tmp4_m); \ + _t5 = __lasx_xvilvh_h(_tmp5_m, _tmp4_m); \ + _t6 = __lasx_xvilvl_h(_tmp7_m, _tmp6_m); \ + _t7 = __lasx_xvilvh_h(_tmp7_m, _tmp6_m); \ + _tmp0_m = __lasx_xvilvl_d(_t2, _t0); \ + _tmp2_m = __lasx_xvilvh_d(_t2, _t0); \ + _tmp4_m = __lasx_xvilvl_d(_t3, _t1); \ + _tmp6_m = __lasx_xvilvh_d(_t3, _t1); \ + _tmp1_m = __lasx_xvilvl_d(_t6, _t4); \ + _tmp3_m = __lasx_xvilvh_d(_t6, _t4); \ + _tmp5_m = __lasx_xvilvl_d(_t7, _t5); \ + _tmp7_m = __lasx_xvilvh_d(_t7, _t5); \ + _out0 = __lasx_xvpermi_q(_tmp1_m, _tmp0_m, 0x20); \ + _out1 = __lasx_xvpermi_q(_tmp3_m, _tmp2_m, 0x20); \ + _out2 = __lasx_xvpermi_q(_tmp5_m, _tmp4_m, 0x20); \ + _out3 = __lasx_xvpermi_q(_tmp7_m, _tmp6_m, 0x20); \ + \ + _tmp0_m = __lasx_xvilvh_h(_in2, _in0); \ + _tmp1_m = __lasx_xvilvh_h(_in3, _in1); \ + _tmp2_m = __lasx_xvilvh_h(_in6, _in4); \ + _tmp3_m = __lasx_xvilvh_h(_in7, _in5); \ + _tmp4_m = __lasx_xvilvh_h(_in10, _in8); \ + _tmp5_m = __lasx_xvilvh_h(_in11, _in9); \ + _tmp6_m = __lasx_xvilvh_h(_in14, _in12); \ + _tmp7_m = __lasx_xvilvh_h(_in15, _in13); \ + _t0 = __lasx_xvilvl_h(_tmp1_m, _tmp0_m); \ + _t1 = __lasx_xvilvh_h(_tmp1_m, _tmp0_m); \ + _t2 = __lasx_xvilvl_h(_tmp3_m, _tmp2_m); \ + _t3 = __lasx_xvilvh_h(_tmp3_m, _tmp2_m); \ + _t4 = __lasx_xvilvl_h(_tmp5_m, _tmp4_m); \ + _t5 = __lasx_xvilvh_h(_tmp5_m, _tmp4_m); \ + _t6 = __lasx_xvilvl_h(_tmp7_m, _tmp6_m); \ + _t7 = __lasx_xvilvh_h(_tmp7_m, _tmp6_m); \ + _tmp0_m = __lasx_xvilvl_d(_t2, _t0); \ + _tmp2_m = __lasx_xvilvh_d(_t2, _t0); \ + _tmp4_m = __lasx_xvilvl_d(_t3, _t1); \ + _tmp6_m = __lasx_xvilvh_d(_t3, _t1); \ + _tmp1_m = __lasx_xvilvl_d(_t6, _t4); \ + _tmp3_m = __lasx_xvilvh_d(_t6, _t4); \ + _tmp5_m = __lasx_xvilvl_d(_t7, _t5); \ + _tmp7_m = __lasx_xvilvh_d(_t7, _t5); \ + _out4 = __lasx_xvpermi_q(_tmp1_m, _tmp0_m, 0x20); \ + _out5 = __lasx_xvpermi_q(_tmp3_m, _tmp2_m, 0x20); \ + _out6 = __lasx_xvpermi_q(_tmp5_m, _tmp4_m, 0x20); \ + _out7 = __lasx_xvpermi_q(_tmp7_m, _tmp6_m, 0x20); \ + } + +/* + * ============================================================================= + * Description : Transpose 4x4 block with halfword elements in vectors + * Arguments : Inputs - _in0, _in1, _in2, _in3 + * Outputs - _out0, _out1, _out2, _out3 + * Return Type - signed halfword + * Details : The rows of the matrix become columns, and the columns become + * rows. + * Example : See LASX_TRANSPOSE8x8_H + * ============================================================================= + */ +#define LASX_TRANSPOSE4x4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, \ + _out3) \ + { \ + __m256i _s0_m, _s1_m; \ + \ + _s0_m = __lasx_xvilvl_h(_in1, _in0); \ + _s1_m = __lasx_xvilvl_h(_in3, _in2); \ + _out0 = __lasx_xvilvl_w(_s1_m, _s0_m); \ + _out2 = __lasx_xvilvh_w(_s1_m, _s0_m); \ + _out1 = __lasx_xvilvh_d(_out0, _out0); \ + _out3 = __lasx_xvilvh_d(_out2, _out2); \ + } + +/* + * ============================================================================= + * Description : Transpose input 8x8 byte block + * Arguments : Inputs - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7 + * (input 8x8 byte block) + * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, + * _out7 (output 8x8 byte block) + * Example : See LASX_TRANSPOSE8x8_H + * ============================================================================= + */ +#define LASX_TRANSPOSE8x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ + _out0, _out1, _out2, _out3, _out4, _out5, _out6, \ + _out7) \ + { \ + __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \ + __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \ + _tmp0_m = __lasx_xvilvl_b(_in2, _in0); \ + _tmp1_m = __lasx_xvilvl_b(_in3, _in1); \ + _tmp2_m = __lasx_xvilvl_b(_in6, _in4); \ + _tmp3_m = __lasx_xvilvl_b(_in7, _in5); \ + _tmp4_m = __lasx_xvilvl_b(_tmp1_m, _tmp0_m); \ + _tmp5_m = __lasx_xvilvh_b(_tmp1_m, _tmp0_m); \ + _tmp6_m = __lasx_xvilvl_b(_tmp3_m, _tmp2_m); \ + _tmp7_m = __lasx_xvilvh_b(_tmp3_m, _tmp2_m); \ + _out0 = __lasx_xvilvl_w(_tmp6_m, _tmp4_m); \ + _out2 = __lasx_xvilvh_w(_tmp6_m, _tmp4_m); \ + _out4 = __lasx_xvilvl_w(_tmp7_m, _tmp5_m); \ + _out6 = __lasx_xvilvh_w(_tmp7_m, _tmp5_m); \ + _out1 = __lasx_xvbsrl_v(_out0, 8); \ + _out3 = __lasx_xvbsrl_v(_out2, 8); \ + _out5 = __lasx_xvbsrl_v(_out4, 8); \ + _out7 = __lasx_xvbsrl_v(_out6, 8); \ + } + +/* + * ============================================================================= + * Description : Transpose 8x8 block with halfword elements in vectors. + * Arguments : Inputs - _in0, _in1, ~ + * Outputs - _out0, _out1, ~ + * Details : The rows of the matrix become columns, and the columns become + * rows. + * Example : LASX_TRANSPOSE8x8_H + * _in0 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 + * _in1 : 8,2,3,4, 5,6,7,8, 8,2,3,4, 5,6,7,8 + * _in2 : 8,2,3,4, 5,6,7,8, 8,2,3,4, 5,6,7,8 + * _in3 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 + * _in4 : 9,2,3,4, 5,6,7,8, 9,2,3,4, 5,6,7,8 + * _in5 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 + * _in6 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 + * _in7 : 9,2,3,4, 5,6,7,8, 9,2,3,4, 5,6,7,8 + * + * _out0 : 1,8,8,1, 9,1,1,9, 1,8,8,1, 9,1,1,9 + * _out1 : 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2 + * _out2 : 3,3,3,3, 3,3,3,3, 3,3,3,3, 3,3,3,3 + * _out3 : 4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4 + * _out4 : 5,5,5,5, 5,5,5,5, 5,5,5,5, 5,5,5,5 + * _out5 : 6,6,6,6, 6,6,6,6, 6,6,6,6, 6,6,6,6 + * _out6 : 7,7,7,7, 7,7,7,7, 7,7,7,7, 7,7,7,7 + * _out7 : 8,8,8,8, 8,8,8,8, 8,8,8,8, 8,8,8,8 + * ============================================================================= + */ +#define LASX_TRANSPOSE8x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ + _out0, _out1, _out2, _out3, _out4, _out5, _out6, \ + _out7) \ + { \ + __m256i _s0_m, _s1_m; \ + __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \ + __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \ + \ + _s0_m = __lasx_xvilvl_h(_in6, _in4); \ + _s1_m = __lasx_xvilvl_h(_in7, _in5); \ + _tmp0_m = __lasx_xvilvl_h(_s1_m, _s0_m); \ + _tmp1_m = __lasx_xvilvh_h(_s1_m, _s0_m); \ + _s0_m = __lasx_xvilvh_h(_in6, _in4); \ + _s1_m = __lasx_xvilvh_h(_in7, _in5); \ + _tmp2_m = __lasx_xvilvl_h(_s1_m, _s0_m); \ + _tmp3_m = __lasx_xvilvh_h(_s1_m, _s0_m); \ + \ + _s0_m = __lasx_xvilvl_h(_in2, _in0); \ + _s1_m = __lasx_xvilvl_h(_in3, _in1); \ + _tmp4_m = __lasx_xvilvl_h(_s1_m, _s0_m); \ + _tmp5_m = __lasx_xvilvh_h(_s1_m, _s0_m); \ + _s0_m = __lasx_xvilvh_h(_in2, _in0); \ + _s1_m = __lasx_xvilvh_h(_in3, _in1); \ + _tmp6_m = __lasx_xvilvl_h(_s1_m, _s0_m); \ + _tmp7_m = __lasx_xvilvh_h(_s1_m, _s0_m); \ + \ + _out0 = __lasx_xvpickev_d(_tmp0_m, _tmp4_m); \ + _out2 = __lasx_xvpickev_d(_tmp1_m, _tmp5_m); \ + _out4 = __lasx_xvpickev_d(_tmp2_m, _tmp6_m); \ + _out6 = __lasx_xvpickev_d(_tmp3_m, _tmp7_m); \ + _out1 = __lasx_xvpickod_d(_tmp0_m, _tmp4_m); \ + _out3 = __lasx_xvpickod_d(_tmp1_m, _tmp5_m); \ + _out5 = __lasx_xvpickod_d(_tmp2_m, _tmp6_m); \ + _out7 = __lasx_xvpickod_d(_tmp3_m, _tmp7_m); \ + } + +/* + * ============================================================================= + * Description : Butterfly of 4 input vectors + * Arguments : Inputs - _in0, _in1, _in2, _in3 + * Outputs - _out0, _out1, _out2, _out3 + * Details : Butterfly operation + * Example : LASX_BUTTERFLY_4 + * _out0 = _in0 + _in3; + * _out1 = _in1 + _in2; + * _out2 = _in1 - _in2; + * _out3 = _in0 - _in3; + * ============================================================================= + */ +#define LASX_BUTTERFLY_4_B(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \ + { \ + _out0 = __lasx_xvadd_b(_in0, _in3); \ + _out1 = __lasx_xvadd_b(_in1, _in2); \ + _out2 = __lasx_xvsub_b(_in1, _in2); \ + _out3 = __lasx_xvsub_b(_in0, _in3); \ + } +#define LASX_BUTTERFLY_4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \ + { \ + _out0 = __lasx_xvadd_h(_in0, _in3); \ + _out1 = __lasx_xvadd_h(_in1, _in2); \ + _out2 = __lasx_xvsub_h(_in1, _in2); \ + _out3 = __lasx_xvsub_h(_in0, _in3); \ + } +#define LASX_BUTTERFLY_4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \ + { \ + _out0 = __lasx_xvadd_w(_in0, _in3); \ + _out1 = __lasx_xvadd_w(_in1, _in2); \ + _out2 = __lasx_xvsub_w(_in1, _in2); \ + _out3 = __lasx_xvsub_w(_in0, _in3); \ + } +#define LASX_BUTTERFLY_4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \ + { \ + _out0 = __lasx_xvadd_d(_in0, _in3); \ + _out1 = __lasx_xvadd_d(_in1, _in2); \ + _out2 = __lasx_xvsub_d(_in1, _in2); \ + _out3 = __lasx_xvsub_d(_in0, _in3); \ + } + +/* + * ============================================================================= + * Description : Butterfly of 8 input vectors + * Arguments : Inputs - _in0, _in1, _in2, _in3, ~ + * Outputs - _out0, _out1, _out2, _out3, ~ + * Details : Butterfly operation + * Example : LASX_BUTTERFLY_8 + * _out0 = _in0 + _in7; + * _out1 = _in1 + _in6; + * _out2 = _in2 + _in5; + * _out3 = _in3 + _in4; + * _out4 = _in3 - _in4; + * _out5 = _in2 - _in5; + * _out6 = _in1 - _in6; + * _out7 = _in0 - _in7; + * ============================================================================= + */ +#define LASX_BUTTERFLY_8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ + _out0, _out1, _out2, _out3, _out4, _out5, _out6, \ + _out7) \ + { \ + _out0 = __lasx_xvadd_b(_in0, _in7); \ + _out1 = __lasx_xvadd_b(_in1, _in6); \ + _out2 = __lasx_xvadd_b(_in2, _in5); \ + _out3 = __lasx_xvadd_b(_in3, _in4); \ + _out4 = __lasx_xvsub_b(_in3, _in4); \ + _out5 = __lasx_xvsub_b(_in2, _in5); \ + _out6 = __lasx_xvsub_b(_in1, _in6); \ + _out7 = __lasx_xvsub_b(_in0, _in7); \ + } + +#define LASX_BUTTERFLY_8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ + _out0, _out1, _out2, _out3, _out4, _out5, _out6, \ + _out7) \ + { \ + _out0 = __lasx_xvadd_h(_in0, _in7); \ + _out1 = __lasx_xvadd_h(_in1, _in6); \ + _out2 = __lasx_xvadd_h(_in2, _in5); \ + _out3 = __lasx_xvadd_h(_in3, _in4); \ + _out4 = __lasx_xvsub_h(_in3, _in4); \ + _out5 = __lasx_xvsub_h(_in2, _in5); \ + _out6 = __lasx_xvsub_h(_in1, _in6); \ + _out7 = __lasx_xvsub_h(_in0, _in7); \ + } + +#define LASX_BUTTERFLY_8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ + _out0, _out1, _out2, _out3, _out4, _out5, _out6, \ + _out7) \ + { \ + _out0 = __lasx_xvadd_w(_in0, _in7); \ + _out1 = __lasx_xvadd_w(_in1, _in6); \ + _out2 = __lasx_xvadd_w(_in2, _in5); \ + _out3 = __lasx_xvadd_w(_in3, _in4); \ + _out4 = __lasx_xvsub_w(_in3, _in4); \ + _out5 = __lasx_xvsub_w(_in2, _in5); \ + _out6 = __lasx_xvsub_w(_in1, _in6); \ + _out7 = __lasx_xvsub_w(_in0, _in7); \ + } + +#define LASX_BUTTERFLY_8_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ + _out0, _out1, _out2, _out3, _out4, _out5, _out6, \ + _out7) \ + { \ + _out0 = __lasx_xvadd_d(_in0, _in7); \ + _out1 = __lasx_xvadd_d(_in1, _in6); \ + _out2 = __lasx_xvadd_d(_in2, _in5); \ + _out3 = __lasx_xvadd_d(_in3, _in4); \ + _out4 = __lasx_xvsub_d(_in3, _in4); \ + _out5 = __lasx_xvsub_d(_in2, _in5); \ + _out6 = __lasx_xvsub_d(_in1, _in6); \ + _out7 = __lasx_xvsub_d(_in0, _in7); \ + } + +#endif // LASX + +/* + * ============================================================================= + * Description : Print out elements in vector. + * Arguments : Inputs - RTYPE, _element_num, _in0, _enter + * Outputs - + * Details : Print out '_element_num' elements in 'RTYPE' vector '_in0', if + * '_enter' is TRUE, prefix "\nVP:" will be added first. + * Example : VECT_PRINT(v4i32,4,in0,1); // in0: 1,2,3,4 + * VP:1,2,3,4, + * ============================================================================= + */ +#define VECT_PRINT(RTYPE, element_num, in0, enter) \ + { \ + RTYPE _tmp0 = (RTYPE)in0; \ + int _i = 0; \ + if (enter) printf("\nVP:"); \ + for (_i = 0; _i < element_num; _i++) printf("%d,", _tmp0[_i]); \ + } + +#endif /* LOONGSON_INTRINSICS_H */ +#endif /* VPX_VPX_UTIL_LOONGSON_INTRINSICS_H_ */ From b3cc4b625d1d2c9a0913dcfbda97dd3bf845f998 Mon Sep 17 00:00:00 2001 From: Lu Wang Date: Wed, 10 Nov 2021 15:21:17 +0800 Subject: [PATCH 0284/1000] vp8[loongarch]: Optimize vp8_loop/sixtap, vpx_dc with LSX. 1. vp8_loop_filter_mbh, vp8_loop_filter_mbv 2. vp8_sixtap_predict16x16, vp8_sixtap_predict8x8 3. vpx_dc_predictor_16x16, vpx_dc_predictor_8x8 ./vpxdec --progress -o YUV_1920X1080.yuv original_1200f/VP8_1920X1080.webm before: 37.77fps after : 220.90fps Bug: webm:1755 Change-Id: I1a3ce16f0c872261d813b6531cfdf25bd59bb774 --- vp8/common/loongarch/loopfilter_filters_lsx.c | 393 ++++++ vp8/common/loongarch/sixtap_filter_lsx.c | 1164 +++++++++++++++++ vp8/common/rtcd_defs.pl | 8 +- vp8/vp8_common.mk | 4 + vpx_dsp/loongarch/intrapred_lsx.c | 98 ++ vpx_dsp/vpx_dsp.mk | 1 + vpx_dsp/vpx_dsp_rtcd_defs.pl | 4 +- 7 files changed, 1666 insertions(+), 6 deletions(-) create mode 100644 vp8/common/loongarch/loopfilter_filters_lsx.c create mode 100644 vp8/common/loongarch/sixtap_filter_lsx.c create mode 100644 vpx_dsp/loongarch/intrapred_lsx.c diff --git a/vp8/common/loongarch/loopfilter_filters_lsx.c b/vp8/common/loongarch/loopfilter_filters_lsx.c new file mode 100644 index 0000000000..484b3d6ad0 --- /dev/null +++ b/vp8/common/loongarch/loopfilter_filters_lsx.c @@ -0,0 +1,393 @@ +/* + * Copyright (c) 2021 Loongson Technology Corporation Limited + * Contributed by Lu Wang + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp8_rtcd.h" +#include "vp8/common/loopfilter.h" +#include "vpx_util/loongson_intrinsics.h" + +#define VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev) \ + { \ + __m128i p2_m, p1_m, p0_m, q2_m, q1_m, q0_m; \ + __m128i u, filt, t1, t2, filt_sign, q0_sub_p0; \ + __m128i filt_r, filt_l; \ + __m128i temp0, temp1, temp2, temp3; \ + const __m128i cnst4b = __lsx_vldi(4); \ + const __m128i cnst3b = __lsx_vldi(3); \ + const __m128i cnst9h = __lsx_vldi(1033); \ + const __m128i cnst63h = __lsx_vldi(1087); \ + \ + p2_m = __lsx_vxori_b(p2, 0x80); \ + p1_m = __lsx_vxori_b(p1, 0x80); \ + p0_m = __lsx_vxori_b(p0, 0x80); \ + q0_m = __lsx_vxori_b(q0, 0x80); \ + q1_m = __lsx_vxori_b(q1, 0x80); \ + q2_m = __lsx_vxori_b(q2, 0x80); \ + \ + filt = __lsx_vssub_b(p1_m, q1_m); \ + q0_sub_p0 = __lsx_vssub_b(q0_m, p0_m); \ + filt = __lsx_vsadd_b(filt, q0_sub_p0); \ + filt = __lsx_vsadd_b(filt, q0_sub_p0); \ + filt = __lsx_vsadd_b(filt, q0_sub_p0); \ + filt = __lsx_vand_v(filt, mask); \ + \ + t2 = __lsx_vand_v(filt, hev); \ + hev = __lsx_vxori_b(hev, 0xff); \ + filt = __lsx_vand_v(hev, filt); \ + t1 = __lsx_vsadd_b(t2, cnst4b); \ + t1 = __lsx_vsra_b(t1, cnst3b); \ + t2 = __lsx_vsadd_b(t2, cnst3b); \ + t2 = __lsx_vsra_b(t2, cnst3b); \ + q0_m = __lsx_vssub_b(q0_m, t1); \ + p0_m = __lsx_vsadd_b(p0_m, t2); \ + filt_sign = __lsx_vslti_b(filt, 0); \ + filt_r = __lsx_vilvl_b(filt_sign, filt); \ + filt_l = __lsx_vilvh_b(filt_sign, filt); \ + temp0 = __lsx_vmul_h(filt_r, cnst9h); \ + temp1 = __lsx_vadd_h(temp0, cnst63h); \ + temp2 = __lsx_vmul_h(filt_l, cnst9h); \ + temp3 = __lsx_vadd_h(temp2, cnst63h); \ + \ + u = __lsx_vssrani_b_h(temp3, temp1, 7); \ + q2_m = __lsx_vssub_b(q2_m, u); \ + p2_m = __lsx_vsadd_b(p2_m, u); \ + q2 = __lsx_vxori_b(q2_m, 0x80); \ + p2 = __lsx_vxori_b(p2_m, 0x80); \ + \ + temp1 = __lsx_vadd_h(temp1, temp0); \ + temp3 = __lsx_vadd_h(temp3, temp2); \ + \ + u = __lsx_vssrani_b_h(temp3, temp1, 7); \ + q1_m = __lsx_vssub_b(q1_m, u); \ + p1_m = __lsx_vsadd_b(p1_m, u); \ + q1 = __lsx_vxori_b(q1_m, 0x80); \ + p1 = __lsx_vxori_b(p1_m, 0x80); \ + \ + temp1 = __lsx_vadd_h(temp1, temp0); \ + temp3 = __lsx_vadd_h(temp3, temp2); \ + \ + u = __lsx_vssrani_b_h(temp3, temp1, 7); \ + q0_m = __lsx_vssub_b(q0_m, u); \ + p0_m = __lsx_vsadd_b(p0_m, u); \ + q0 = __lsx_vxori_b(q0_m, 0x80); \ + p0 = __lsx_vxori_b(p0_m, 0x80); \ + } + +#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \ + limit_in, b_limit_in, thresh_in, hev_out, mask_out, \ + flat_out) \ + { \ + __m128i p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m; \ + __m128i p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m; \ + \ + p3_asub_p2_m = __lsx_vabsd_bu(p3_in, p2_in); \ + p2_asub_p1_m = __lsx_vabsd_bu(p2_in, p1_in); \ + p1_asub_p0_m = __lsx_vabsd_bu(p1_in, p0_in); \ + q1_asub_q0_m = __lsx_vabsd_bu(q1_in, q0_in); \ + q2_asub_q1_m = __lsx_vabsd_bu(q2_in, q1_in); \ + q3_asub_q2_m = __lsx_vabsd_bu(q3_in, q2_in); \ + p0_asub_q0_m = __lsx_vabsd_bu(p0_in, q0_in); \ + p1_asub_q1_m = __lsx_vabsd_bu(p1_in, q1_in); \ + flat_out = __lsx_vmax_bu(p1_asub_p0_m, q1_asub_q0_m); \ + hev_out = __lsx_vslt_bu(thresh_in, flat_out); \ + p0_asub_q0_m = __lsx_vsadd_bu(p0_asub_q0_m, p0_asub_q0_m); \ + p1_asub_q1_m = __lsx_vsrli_b(p1_asub_q1_m, 1); \ + p0_asub_q0_m = __lsx_vsadd_bu(p0_asub_q0_m, p1_asub_q1_m); \ + mask_out = __lsx_vslt_bu(b_limit_in, p0_asub_q0_m); \ + mask_out = __lsx_vmax_bu(flat_out, mask_out); \ + p3_asub_p2_m = __lsx_vmax_bu(p3_asub_p2_m, p2_asub_p1_m); \ + mask_out = __lsx_vmax_bu(p3_asub_p2_m, mask_out); \ + q2_asub_q1_m = __lsx_vmax_bu(q2_asub_q1_m, q3_asub_q2_m); \ + mask_out = __lsx_vmax_bu(q2_asub_q1_m, mask_out); \ + mask_out = __lsx_vslt_bu(limit_in, mask_out); \ + mask_out = __lsx_vxori_b(mask_out, 0xff); \ + } + +#define VP8_ST6x1_B(in0, in0_idx, in1, in1_idx, pdst, stride) \ + { \ + __lsx_vstelm_w(in0, pdst, 0, in0_idx); \ + __lsx_vstelm_h(in1, pdst + stride, 0, in1_idx); \ + } + +static inline void mbloop_filter_horizontal_edge_y_lsx( + uint8_t *src, int32_t pitch, const uint8_t b_limit_in, + const uint8_t limit_in, const uint8_t thresh_in) { + uint8_t *temp_src; + int32_t pitch_x2 = pitch << 1; + int32_t pitch_x3 = pitch_x2 + pitch; + int32_t pitch_x4 = pitch << 2; + + __m128i p3, p2, p1, p0, q3, q2, q1, q0; + __m128i mask, hev, flat, thresh, limit, b_limit; + + DUP2_ARG2(__lsx_vldrepl_b, &b_limit_in, 0, &limit_in, 0, b_limit, limit); + thresh = __lsx_vldrepl_b(&thresh_in, 0); + + temp_src = src - pitch_x4; + DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2, + temp_src, pitch_x3, p3, p2, p1, p0); + temp_src += pitch_x4; + DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2, + temp_src, pitch_x3, q0, q1, q2, q3); + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev); + + temp_src = src - pitch_x3; + __lsx_vstx(p2, temp_src, 0); + __lsx_vstx(p1, temp_src, pitch); + __lsx_vstx(p0, temp_src, pitch_x2); + __lsx_vstx(q0, temp_src, pitch_x3); + temp_src += pitch_x4; + __lsx_vstx(q1, temp_src, 0); + __lsx_vstx(q2, temp_src, pitch); +} + +static inline void mbloop_filter_horizontal_edge_uv_lsx( + uint8_t *src_u, uint8_t *src_v, int32_t pitch, const uint8_t b_limit_in, + const uint8_t limit_in, const uint8_t thresh_in) { + uint8_t *temp_src; + int32_t pitch_x2 = pitch << 1; + int32_t pitch_x3 = pitch_x2 + pitch; + int32_t pitch_x4 = pitch << 2; + __m128i p3, p2, p1, p0, q3, q2, q1, q0; + __m128i mask, hev, flat, thresh, limit, b_limit; + __m128i p3_u, p2_u, p1_u, p0_u, q3_u, q2_u, q1_u, q0_u; + __m128i p3_v, p2_v, p1_v, p0_v, q3_v, q2_v, q1_v, q0_v; + + DUP2_ARG2(__lsx_vldrepl_b, &b_limit_in, 0, &limit_in, 0, b_limit, limit); + thresh = __lsx_vldrepl_b(&thresh_in, 0); + + temp_src = src_u - pitch_x4; + DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2, + temp_src, pitch_x3, p3_u, p2_u, p1_u, p0_u); + temp_src += pitch_x4; + DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2, + temp_src, pitch_x3, q0_u, q1_u, q2_u, q3_u); + temp_src = src_v - pitch_x4; + DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2, + temp_src, pitch_x3, p3_v, p2_v, p1_v, p0_v); + temp_src += pitch_x4; + DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2, + temp_src, pitch_x3, q0_v, q1_v, q2_v, q3_v); + + DUP4_ARG2(__lsx_vilvl_d, p3_v, p3_u, p2_v, p2_u, p1_v, p1_u, p0_v, p0_u, p3, + p2, p1, p0); + DUP4_ARG2(__lsx_vilvl_d, q0_v, q0_u, q1_v, q1_u, q2_v, q2_u, q3_v, q3_u, q0, + q1, q2, q3); + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev); + + src_u -= pitch_x3; + __lsx_vstelm_d(p2, src_u, 0, 0); + __lsx_vstelm_d(p1, src_u + pitch, 0, 0); + __lsx_vstelm_d(p0, src_u + pitch_x2, 0, 0); + __lsx_vstelm_d(q0, src_u + pitch_x3, 0, 0); + src_u += pitch_x4; + __lsx_vstelm_d(q1, src_u, 0, 0); + src_u += pitch; + __lsx_vstelm_d(q2, src_u, 0, 0); + + src_v -= pitch_x3; + __lsx_vstelm_d(p2, src_v, 0, 1); + __lsx_vstelm_d(p1, src_v + pitch, 0, 1); + __lsx_vstelm_d(p0, src_v + pitch_x2, 0, 1); + __lsx_vstelm_d(q0, src_v + pitch_x3, 0, 1); + src_v += pitch_x4; + __lsx_vstelm_d(q1, src_v, 0, 1); + src_v += pitch; + __lsx_vstelm_d(q2, src_v, 0, 1); +} + +static inline void mbloop_filter_vertical_edge_y_lsx(uint8_t *src, + int32_t pitch, + const uint8_t b_limit_in, + const uint8_t limit_in, + const uint8_t thresh_in) { + uint8_t *temp_src; + int32_t pitch_x2 = pitch << 1; + int32_t pitch_x3 = pitch_x2 + pitch; + int32_t pitch_x4 = pitch << 2; + + __m128i p3, p2, p1, p0, q3, q2, q1, q0; + __m128i mask, hev, flat, thresh, limit, b_limit; + __m128i row0, row1, row2, row3, row4, row5, row6, row7, row8; + __m128i row9, row10, row11, row12, row13, row14, row15; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + + DUP2_ARG2(__lsx_vldrepl_b, &b_limit_in, 0, &limit_in, 0, b_limit, limit); + thresh = __lsx_vldrepl_b(&thresh_in, 0); + temp_src = src - 4; + DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2, + temp_src, pitch_x3, row0, row1, row2, row3); + temp_src += pitch_x4; + DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2, + temp_src, pitch_x3, row4, row5, row6, row7); + temp_src += pitch_x4; + DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2, + temp_src, pitch_x3, row8, row9, row10, row11); + temp_src += pitch_x4; + DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2, + temp_src, pitch_x3, row12, row13, row14, row15); + temp_src -= pitch_x4; + LSX_TRANSPOSE16x8_B(row0, row1, row2, row3, row4, row5, row6, row7, row8, + row9, row10, row11, row12, row13, row14, row15, p3, p2, + p1, p0, q0, q1, q2, q3); + + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev); + DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, tmp0, tmp1); + tmp3 = __lsx_vilvl_h(tmp1, tmp0); + tmp4 = __lsx_vilvh_h(tmp1, tmp0); + DUP2_ARG2(__lsx_vilvh_b, p1, p2, q0, p0, tmp0, tmp1); + tmp6 = __lsx_vilvl_h(tmp1, tmp0); + tmp7 = __lsx_vilvh_h(tmp1, tmp0); + tmp2 = __lsx_vilvl_b(q2, q1); + tmp5 = __lsx_vilvh_b(q2, q1); + + temp_src = src - 3; + VP8_ST6x1_B(tmp3, 0, tmp2, 0, temp_src, 4); + temp_src += pitch; + VP8_ST6x1_B(tmp3, 1, tmp2, 1, temp_src, 4); + temp_src += pitch; + VP8_ST6x1_B(tmp3, 2, tmp2, 2, temp_src, 4); + temp_src += pitch; + VP8_ST6x1_B(tmp3, 3, tmp2, 3, temp_src, 4); + temp_src += pitch; + VP8_ST6x1_B(tmp4, 0, tmp2, 4, temp_src, 4); + temp_src += pitch; + VP8_ST6x1_B(tmp4, 1, tmp2, 5, temp_src, 4); + temp_src += pitch; + VP8_ST6x1_B(tmp4, 2, tmp2, 6, temp_src, 4); + temp_src += pitch; + VP8_ST6x1_B(tmp4, 3, tmp2, 7, temp_src, 4); + temp_src += pitch; + VP8_ST6x1_B(tmp6, 0, tmp5, 0, temp_src, 4); + temp_src += pitch; + VP8_ST6x1_B(tmp6, 1, tmp5, 1, temp_src, 4); + temp_src += pitch; + VP8_ST6x1_B(tmp6, 2, tmp5, 2, temp_src, 4); + temp_src += pitch; + VP8_ST6x1_B(tmp6, 3, tmp5, 3, temp_src, 4); + temp_src += pitch; + VP8_ST6x1_B(tmp7, 0, tmp5, 4, temp_src, 4); + temp_src += pitch; + VP8_ST6x1_B(tmp7, 1, tmp5, 5, temp_src, 4); + temp_src += pitch; + VP8_ST6x1_B(tmp7, 2, tmp5, 6, temp_src, 4); + temp_src += pitch; + VP8_ST6x1_B(tmp7, 3, tmp5, 7, temp_src, 4); +} + +static inline void mbloop_filter_vertical_edge_uv_lsx( + uint8_t *src_u, uint8_t *src_v, int32_t pitch, const uint8_t b_limit_in, + const uint8_t limit_in, const uint8_t thresh_in) { + int32_t pitch_x2 = pitch << 1; + int32_t pitch_x3 = pitch_x2 + pitch; + int32_t pitch_x4 = pitch << 2; + __m128i p3, p2, p1, p0, q3, q2, q1, q0; + __m128i mask, hev, flat, thresh, limit, b_limit; + __m128i row0, row1, row2, row3, row4, row5, row6, row7, row8; + __m128i row9, row10, row11, row12, row13, row14, row15; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + + DUP2_ARG2(__lsx_vldrepl_b, &b_limit_in, 0, &limit_in, 0, b_limit, limit); + thresh = __lsx_vldrepl_b(&thresh_in, 0); + + src_u -= 4; + DUP4_ARG2(__lsx_vldx, src_u, 0, src_u, pitch, src_u, pitch_x2, src_u, + pitch_x3, row0, row1, row2, row3); + src_u += pitch_x4; + DUP4_ARG2(__lsx_vldx, src_u, 0, src_u, pitch, src_u, pitch_x2, src_u, + pitch_x3, row4, row5, row6, row7); + src_v -= 4; + DUP4_ARG2(__lsx_vldx, src_v, 0, src_v, pitch, src_v, pitch_x2, src_v, + pitch_x3, row8, row9, row10, row11); + src_v += pitch_x4; + DUP4_ARG2(__lsx_vldx, src_v, 0, src_v, pitch, src_v, pitch_x2, src_v, + pitch_x3, row12, row13, row14, row15); + LSX_TRANSPOSE16x8_B(row0, row1, row2, row3, row4, row5, row6, row7, row8, + row9, row10, row11, row12, row13, row14, row15, p3, p2, + p1, p0, q0, q1, q2, q3); + + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev); + + DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, tmp0, tmp1); + tmp3 = __lsx_vilvl_h(tmp1, tmp0); + tmp4 = __lsx_vilvh_h(tmp1, tmp0); + DUP2_ARG2(__lsx_vilvh_b, p1, p2, q0, p0, tmp0, tmp1); + tmp6 = __lsx_vilvl_h(tmp1, tmp0); + tmp7 = __lsx_vilvh_h(tmp1, tmp0); + tmp2 = __lsx_vilvl_b(q2, q1); + tmp5 = __lsx_vilvh_b(q2, q1); + + src_u += 1 - pitch_x4; + VP8_ST6x1_B(tmp3, 0, tmp2, 0, src_u, 4); + src_u += pitch; + VP8_ST6x1_B(tmp3, 1, tmp2, 1, src_u, 4); + src_u += pitch; + VP8_ST6x1_B(tmp3, 2, tmp2, 2, src_u, 4); + src_u += pitch; + VP8_ST6x1_B(tmp3, 3, tmp2, 3, src_u, 4); + src_u += pitch; + VP8_ST6x1_B(tmp4, 0, tmp2, 4, src_u, 4); + src_u += pitch; + VP8_ST6x1_B(tmp4, 1, tmp2, 5, src_u, 4); + src_u += pitch; + VP8_ST6x1_B(tmp4, 2, tmp2, 6, src_u, 4); + src_u += pitch; + VP8_ST6x1_B(tmp4, 3, tmp2, 7, src_u, 4); + + src_v += 1 - pitch_x4; + VP8_ST6x1_B(tmp6, 0, tmp5, 0, src_v, 4); + src_v += pitch; + VP8_ST6x1_B(tmp6, 1, tmp5, 1, src_v, 4); + src_v += pitch; + VP8_ST6x1_B(tmp6, 2, tmp5, 2, src_v, 4); + src_v += pitch; + VP8_ST6x1_B(tmp6, 3, tmp5, 3, src_v, 4); + src_v += pitch; + VP8_ST6x1_B(tmp7, 0, tmp5, 4, src_v, 4); + src_v += pitch; + VP8_ST6x1_B(tmp7, 1, tmp5, 5, src_v, 4); + src_v += pitch; + VP8_ST6x1_B(tmp7, 2, tmp5, 6, src_v, 4); + src_v += pitch; + VP8_ST6x1_B(tmp7, 3, tmp5, 7, src_v, 4); +} + +void vp8_loop_filter_mbh_lsx(uint8_t *src_y, uint8_t *src_u, uint8_t *src_v, + int32_t pitch_y, int32_t pitch_u_v, + loop_filter_info *lpf_info_ptr) { + mbloop_filter_horizontal_edge_y_lsx(src_y, pitch_y, *lpf_info_ptr->mblim, + *lpf_info_ptr->lim, + *lpf_info_ptr->hev_thr); + if (src_u) { + mbloop_filter_horizontal_edge_uv_lsx( + src_u, src_v, pitch_u_v, *lpf_info_ptr->mblim, *lpf_info_ptr->lim, + *lpf_info_ptr->hev_thr); + } +} + +void vp8_loop_filter_mbv_lsx(uint8_t *src_y, uint8_t *src_u, uint8_t *src_v, + int32_t pitch_y, int32_t pitch_u_v, + loop_filter_info *lpf_info_ptr) { + mbloop_filter_vertical_edge_y_lsx(src_y, pitch_y, *lpf_info_ptr->mblim, + *lpf_info_ptr->lim, *lpf_info_ptr->hev_thr); + if (src_u) { + mbloop_filter_vertical_edge_uv_lsx(src_u, src_v, pitch_u_v, + *lpf_info_ptr->mblim, *lpf_info_ptr->lim, + *lpf_info_ptr->hev_thr); + } +} diff --git a/vp8/common/loongarch/sixtap_filter_lsx.c b/vp8/common/loongarch/sixtap_filter_lsx.c new file mode 100644 index 0000000000..75fe533d98 --- /dev/null +++ b/vp8/common/loongarch/sixtap_filter_lsx.c @@ -0,0 +1,1164 @@ +/* + * Copyright (c) 2021 Loongson Technology Corporation Limited + * Contributed by Lu Wang + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp8_rtcd.h" +#include "vp8/common/filter.h" +#include "vpx_ports/mem.h" +#include "vpx_util/loongson_intrinsics.h" + +DECLARE_ALIGNED(16, static const int8_t, vp8_subpel_filters_lsx[7][8]) = { + { 0, -6, 123, 12, -1, 0, 0, 0 }, + { 2, -11, 108, 36, -8, 1, 0, 0 }, /* New 1/4 pel 6 tap filter */ + { 0, -9, 93, 50, -6, 0, 0, 0 }, + { 3, -16, 77, 77, -16, 3, 0, 0 }, /* New 1/2 pel 6 tap filter */ + { 0, -6, 50, 93, -9, 0, 0, 0 }, + { 1, -8, 36, 108, -11, 2, 0, 0 }, /* New 1/4 pel 6 tap filter */ + { 0, -1, 12, 123, -6, 0, 0, 0 }, +}; + +static const uint8_t vp8_mc_filt_mask_arr[16 * 3] = { + /* 8 width cases */ + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, + /* 4 width cases */ + 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20, + /* 4 width cases */ + 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28 +}; + +#define DPADD_H3(in0, in1, in2, coeff0, coeff1, coeff2) \ + ({ \ + __m128i out0_m; \ + \ + out0_m = __lsx_vdp2_h_b(in0, coeff0); \ + out0_m = __lsx_vdp2add_h_b(out0_m, in1, coeff1); \ + out0_m = __lsx_vdp2add_h_b(out0_m, in2, coeff2); \ + \ + out0_m; \ + }) + +#define HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2, filt_h0, filt_h1, \ + filt_h2) \ + ({ \ + __m128i vec0_m, vec1_m, vec2_m; \ + __m128i hz_out_m; \ + \ + DUP2_ARG3(__lsx_vshuf_b, src0, src1, mask0, src0, src1, mask1, vec0_m, \ + vec1_m); \ + vec2_m = __lsx_vshuf_b(src0, src1, mask2); \ + hz_out_m = DPADD_H3(vec0_m, vec1_m, vec2_m, filt_h0, filt_h1, filt_h2); \ + \ + hz_out_m = __lsx_vsrari_h(hz_out_m, VP8_FILTER_SHIFT); \ + hz_out_m = __lsx_vsat_h(hz_out_m, 7); \ + \ + hz_out_m; \ + }) + +#define HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ + mask2, filt0, filt1, filt2, out0, out1, \ + out2, out3) \ + ({ \ + __m128i vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ + \ + DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src1, mask0, vec0_m, \ + vec1_m); \ + DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src3, src3, mask0, vec2_m, \ + vec3_m); \ + DUP4_ARG2(__lsx_vdp2_h_b, vec0_m, filt0, vec1_m, filt0, vec2_m, filt0, \ + vec3_m, filt0, out0, out1, out2, out3); \ + DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src1, mask1, vec0_m, \ + vec1_m); \ + DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask1, src3, src3, mask1, vec2_m, \ + vec3_m); \ + DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask2, src1, src1, mask2, vec4_m, \ + vec5_m); \ + DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask2, src3, src3, mask2, vec6_m, \ + vec7_m); \ + DUP4_ARG3(__lsx_vdp2add_h_b, out0, vec0_m, filt1, out1, vec1_m, filt1, \ + out2, vec2_m, filt1, out3, vec3_m, filt1, out0, out1, out2, \ + out3); \ + DUP4_ARG3(__lsx_vdp2add_h_b, out0, vec4_m, filt2, out1, vec5_m, filt2, \ + out2, vec6_m, filt2, out3, vec7_m, filt2, out0, out1, out2, \ + out3); \ + }) + +#define FILT_4TAP_DPADD_H(vec0, vec1, filt0, filt1) \ + ({ \ + __m128i tmp0; \ + \ + tmp0 = __lsx_vdp2_h_b(vec0, filt0); \ + tmp0 = __lsx_vdp2add_h_b(tmp0, vec1, filt1); \ + \ + tmp0; \ + }) + +#define HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_h0, filt_h1) \ + ({ \ + __m128i vec0_m, vec1_m; \ + __m128i hz_out_m; \ + \ + DUP2_ARG3(__lsx_vshuf_b, src0, src1, mask0, src0, src1, mask1, vec0_m, \ + vec1_m); \ + hz_out_m = FILT_4TAP_DPADD_H(vec0_m, vec1_m, filt_h0, filt_h1); \ + hz_out_m = __lsx_vsrari_h(hz_out_m, VP8_FILTER_SHIFT); \ + hz_out_m = __lsx_vsat_h(hz_out_m, 7); \ + \ + hz_out_m; \ + }) + +#define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ + filt0, filt1, out0, out1, out2, out3) \ + ({ \ + __m128i vec0_m, vec1_m, vec2_m, vec3_m; \ + \ + DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src1, mask0, vec0_m, \ + vec1_m); \ + DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src3, src3, mask0, vec2_m, \ + vec3_m); \ + DUP4_ARG2(__lsx_vdp2_h_b, vec0_m, filt0, vec1_m, filt0, vec2_m, filt0, \ + vec3_m, filt0, out0, out1, out2, out3); \ + DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src1, mask1, vec0_m, \ + vec1_m); \ + DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask1, src3, src3, mask1, vec2_m, \ + vec3_m); \ + DUP4_ARG3(__lsx_vdp2add_h_b, out0, vec0_m, filt1, out1, vec1_m, filt1, \ + out2, vec2_m, filt1, out3, vec3_m, filt1, out0, out1, out2, \ + out3); \ + }) + +static void common_hz_6t_8w_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + uint32_t loop_cnt; + int32_t src_stride_x2 = src_stride << 1; + int32_t src_stride_x3 = src_stride_x2 + src_stride; + int32_t src_stride_x4 = src_stride << 2; + int32_t dst_stride_x2 = dst_stride << 1; + int32_t dst_stride_x3 = dst_stride_x2 + dst_stride; + int32_t dst_stride_x4 = dst_stride << 2; + __m128i src0, src1, src2, src3, filt0, filt1, filt2; + __m128i mask0, mask1, mask2, tmp0, tmp1; + __m128i filt, out0, out1, out2, out3; + + mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 0); + src -= 2; + + filt = __lsx_vld(filter, 0); + DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt0, filt1); + filt2 = __lsx_vreplvei_h(filt, 2); + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + + DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src, + src_stride_x3, src0, src1, src2, src3); + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + src += src_stride_x4; + HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, filt0, + filt1, filt2, out0, out1, out2, out3); + DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, VP8_FILTER_SHIFT, out3, out2, + VP8_FILTER_SHIFT, tmp0, tmp1); + DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); + __lsx_vstelm_d(tmp0, dst, 0, 0); + __lsx_vstelm_d(tmp0, dst + dst_stride, 0, 1); + __lsx_vstelm_d(tmp1, dst + dst_stride_x2, 0, 0); + __lsx_vstelm_d(tmp1, dst + dst_stride_x3, 0, 1); + dst += dst_stride_x4; + + for (loop_cnt = (height >> 2) - 1; loop_cnt--;) { + DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src, + src_stride_x3, src0, src1, src2, src3); + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + src += src_stride_x4; + HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + filt0, filt1, filt2, out0, out1, out2, out3); + DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, VP8_FILTER_SHIFT, out3, out2, + VP8_FILTER_SHIFT, tmp0, tmp1); + DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); + __lsx_vstelm_d(tmp0, dst, 0, 0); + __lsx_vstelm_d(tmp0, dst + dst_stride, 0, 1); + __lsx_vstelm_d(tmp1, dst + dst_stride_x2, 0, 0); + __lsx_vstelm_d(tmp1, dst + dst_stride_x3, 0, 1); + dst += dst_stride_x4; + } +} + +static void common_hz_6t_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + uint32_t loop_cnt; + int32_t src_stride_x2 = src_stride << 1; + int32_t src_stride_x3 = src_stride_x2 + src_stride; + int32_t src_stride_x4 = src_stride << 2; + int32_t dst_stride_x2 = dst_stride << 1; + int32_t dst_stride_x3 = dst_stride_x2 + dst_stride; + int32_t dst_stride_x4 = dst_stride << 2; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, filt0, filt1, filt2; + __m128i mask0, mask1, mask2, out; + __m128i filt, out0, out1, out2, out3, out4, out5, out6, out7; + + mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 0); + src -= 2; + + filt = __lsx_vld(filter, 0); + DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt0, filt1); + filt2 = __lsx_vreplvei_h(filt, 2); + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src, + src_stride_x3, src0, src2, src4, src6); + src += 8; + DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src, + src_stride_x3, src1, src3, src5, src7); + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + DUP4_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src6, 128, src7, 128, src4, + src5, src6, src7); + src += src_stride_x4 - 8; + + HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + filt0, filt1, filt2, out0, out1, out2, out3); + HORIZ_6TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, mask2, + filt0, filt1, filt2, out4, out5, out6, out7); + DUP4_ARG2(__lsx_vsrari_h, out0, VP8_FILTER_SHIFT, out1, VP8_FILTER_SHIFT, + out2, VP8_FILTER_SHIFT, out3, VP8_FILTER_SHIFT, out0, out1, out2, + out3); + DUP4_ARG2(__lsx_vsrari_h, out4, VP8_FILTER_SHIFT, out5, VP8_FILTER_SHIFT, + out6, VP8_FILTER_SHIFT, out7, VP8_FILTER_SHIFT, out4, out5, out6, + out7); + DUP4_ARG2(__lsx_vsat_h, out0, 7, out1, 7, out2, 7, out3, 7, out0, out1, + out2, out3); + DUP4_ARG2(__lsx_vsat_h, out4, 7, out5, 7, out6, 7, out7, 7, out4, out5, + out6, out7); + out = __lsx_vpickev_b(out1, out0); + out = __lsx_vxori_b(out, 128); + __lsx_vst(out, dst, 0); + out = __lsx_vpickev_b(out3, out2); + out = __lsx_vxori_b(out, 128); + __lsx_vstx(out, dst, dst_stride); + out = __lsx_vpickev_b(out5, out4); + out = __lsx_vxori_b(out, 128); + __lsx_vstx(out, dst, dst_stride_x2); + out = __lsx_vpickev_b(out7, out6); + out = __lsx_vxori_b(out, 128); + __lsx_vstx(out, dst, dst_stride_x3); + dst += dst_stride_x4; + } +} + +static void common_vt_6t_8w_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + uint32_t loop_cnt; + int32_t src_stride_x2 = src_stride << 1; + int32_t src_stride_x3 = src_stride_x2 + src_stride; + int32_t src_stride_x4 = src_stride << 2; + int32_t dst_stride_x2 = dst_stride << 1; + int32_t dst_stride_x3 = dst_stride_x2 + dst_stride; + int32_t dst_stride_x4 = dst_stride << 2; + __m128i src0, src1, src2, src3, src4, src7, src8, src9, src10; + __m128i src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r; + __m128i src109_r, filt0, filt1, filt2; + __m128i tmp0, tmp1; + __m128i filt, out0_r, out1_r, out2_r, out3_r; + + src -= src_stride_x2; + filt = __lsx_vld(filter, 0); + DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt0, filt1); + filt2 = __lsx_vreplvei_h(filt, 2); + + DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src, + src_stride_x3, src0, src1, src2, src3); + src += src_stride_x4; + src4 = __lsx_vld(src, 0); + src += src_stride; + + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + src4 = __lsx_vxori_b(src4, 128); + DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src2, src1, src4, src3, + src10_r, src32_r, src21_r, src43_r); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src, + src_stride_x3, src7, src8, src9, src10); + DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7, + src8, src9, src10); + src += src_stride_x4; + + DUP4_ARG2(__lsx_vilvl_b, src7, src4, src8, src7, src9, src8, src10, src9, + src76_r, src87_r, src98_r, src109_r); + out0_r = DPADD_H3(src10_r, src32_r, src76_r, filt0, filt1, filt2); + out1_r = DPADD_H3(src21_r, src43_r, src87_r, filt0, filt1, filt2); + out2_r = DPADD_H3(src32_r, src76_r, src98_r, filt0, filt1, filt2); + out3_r = DPADD_H3(src43_r, src87_r, src109_r, filt0, filt1, filt2); + DUP2_ARG3(__lsx_vssrarni_b_h, out1_r, out0_r, VP8_FILTER_SHIFT, out3_r, + out2_r, VP8_FILTER_SHIFT, tmp0, tmp1); + DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); + __lsx_vstelm_d(tmp0, dst, 0, 0); + __lsx_vstelm_d(tmp0, dst + dst_stride, 0, 1); + __lsx_vstelm_d(tmp1, dst + dst_stride_x2, 0, 0); + __lsx_vstelm_d(tmp1, dst + dst_stride_x3, 0, 1); + dst += dst_stride_x4; + + src10_r = src76_r; + src32_r = src98_r; + src21_r = src87_r; + src43_r = src109_r; + src4 = src10; + } +} + +static void common_vt_6t_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + uint32_t loop_cnt; + int32_t src_stride_x2 = src_stride << 1; + int32_t src_stride_x3 = src_stride_x2 + src_stride; + int32_t src_stride_x4 = src_stride << 2; + int32_t dst_stride_x2 = dst_stride << 1; + int32_t dst_stride_x3 = dst_stride_x2 + dst_stride; + int32_t dst_stride_x4 = dst_stride << 2; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8; + __m128i src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r; + __m128i src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l; + __m128i src65_l, src87_l, filt0, filt1, filt2; + __m128i tmp0, tmp1, tmp2, tmp3; + __m128i filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l; + + src -= src_stride_x2; + filt = __lsx_vld(filter, 0); + DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt0, filt1); + filt2 = __lsx_vreplvei_h(filt, 2); + + DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src, + src_stride_x3, src0, src1, src2, src3); + src += src_stride_x4; + src4 = __lsx_vldx(src, 0); + src += src_stride; + + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + src4 = __lsx_vxori_b(src4, 128); + DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src4, src3, src2, src1, + src10_r, src32_r, src43_r, src21_r); + DUP4_ARG2(__lsx_vilvh_b, src1, src0, src3, src2, src4, src3, src2, src1, + src10_l, src32_l, src43_l, src21_l); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src, + src_stride_x3, src5, src6, src7, src8); + src += src_stride_x4; + + DUP4_ARG2(__lsx_vxori_b, src5, 128, src6, 128, src7, 128, src8, 128, src5, + src6, src7, src8); + DUP4_ARG2(__lsx_vilvl_b, src5, src4, src6, src5, src7, src6, src8, src7, + src54_r, src65_r, src76_r, src87_r); + DUP4_ARG2(__lsx_vilvh_b, src5, src4, src6, src5, src7, src6, src8, src7, + src54_l, src65_l, src76_l, src87_l); + out0_r = DPADD_H3(src10_r, src32_r, src54_r, filt0, filt1, filt2); + out1_r = DPADD_H3(src21_r, src43_r, src65_r, filt0, filt1, filt2); + out2_r = DPADD_H3(src32_r, src54_r, src76_r, filt0, filt1, filt2); + out3_r = DPADD_H3(src43_r, src65_r, src87_r, filt0, filt1, filt2); + out0_l = DPADD_H3(src10_l, src32_l, src54_l, filt0, filt1, filt2); + out1_l = DPADD_H3(src21_l, src43_l, src65_l, filt0, filt1, filt2); + out2_l = DPADD_H3(src32_l, src54_l, src76_l, filt0, filt1, filt2); + out3_l = DPADD_H3(src43_l, src65_l, src87_l, filt0, filt1, filt2); + DUP4_ARG3(__lsx_vssrarni_b_h, out0_l, out0_r, VP8_FILTER_SHIFT, out1_l, + out1_r, VP8_FILTER_SHIFT, out2_l, out2_r, VP8_FILTER_SHIFT, + out3_l, out3_r, VP8_FILTER_SHIFT, tmp0, tmp1, tmp2, tmp3); + DUP4_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp2, 128, tmp3, 128, tmp0, + tmp1, tmp2, tmp3); + __lsx_vstx(tmp0, dst, 0); + __lsx_vstx(tmp1, dst, dst_stride); + __lsx_vstx(tmp2, dst, dst_stride_x2); + __lsx_vstx(tmp3, dst, dst_stride_x3); + dst += dst_stride_x4; + + src10_r = src54_r; + src32_r = src76_r; + src21_r = src65_r; + src43_r = src87_r; + src10_l = src54_l; + src32_l = src76_l; + src21_l = src65_l; + src43_l = src87_l; + src4 = src8; + } +} + +static void common_hv_6ht_6vt_8w_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter_horiz, + const int8_t *filter_vert, + int32_t height) { + uint32_t loop_cnt; + int32_t src_stride_x2 = src_stride << 1; + int32_t src_stride_x3 = src_stride_x2 + src_stride; + int32_t src_stride_x4 = src_stride << 2; + int32_t dst_stride_x2 = dst_stride << 1; + int32_t dst_stride_x3 = dst_stride_x2 + dst_stride; + int32_t dst_stride_x4 = dst_stride << 2; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8; + __m128i filt_hz0, filt_hz1, filt_hz2; + __m128i mask0, mask1, mask2, vec0, vec1; + __m128i filt, filt_vt0, filt_vt1, filt_vt2; + __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; + __m128i hz_out7, hz_out8, out0, out1, out2, out3, out4, out5, out6, out7; + __m128i tmp0, tmp1, tmp2, tmp3; + + mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 0); + src -= (2 + src_stride_x2); + + filt = __lsx_vld(filter_horiz, 0); + DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt_hz0, filt_hz1); + filt_hz2 = __lsx_vreplvei_h(filt, 2); + + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src, + src_stride_x3, src0, src1, src2, src3); + src += src_stride_x4; + src4 = __lsx_vldx(src, 0); + src += src_stride; + + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + src4 = __lsx_vxori_b(src4, 128); + + hz_out0 = HORIZ_6TAP_FILT(src0, src0, mask0, mask1, mask2, filt_hz0, filt_hz1, + filt_hz2); + hz_out1 = HORIZ_6TAP_FILT(src1, src1, mask0, mask1, mask2, filt_hz0, filt_hz1, + filt_hz2); + hz_out2 = HORIZ_6TAP_FILT(src2, src2, mask0, mask1, mask2, filt_hz0, filt_hz1, + filt_hz2); + hz_out3 = HORIZ_6TAP_FILT(src3, src3, mask0, mask1, mask2, filt_hz0, filt_hz1, + filt_hz2); + hz_out4 = HORIZ_6TAP_FILT(src4, src4, mask0, mask1, mask2, filt_hz0, filt_hz1, + filt_hz2); + filt = __lsx_vld(filter_vert, 0); + DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt_vt0, filt_vt1); + filt_vt2 = __lsx_vreplvei_h(filt, 2); + + DUP4_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, hz_out2, + hz_out1, hz_out4, hz_out3, out0, out1, out3, out4); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src, + src_stride_x3, src5, src6, src7, src8); + src += src_stride_x4; + + DUP4_ARG2(__lsx_vxori_b, src5, 128, src6, 128, src7, 128, src8, 128, src5, + src6, src7, src8); + hz_out5 = HORIZ_6TAP_FILT(src5, src5, mask0, mask1, mask2, filt_hz0, + filt_hz1, filt_hz2); + out2 = __lsx_vpackev_b(hz_out5, hz_out4); + tmp0 = DPADD_H3(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2); + + hz_out6 = HORIZ_6TAP_FILT(src6, src6, mask0, mask1, mask2, filt_hz0, + filt_hz1, filt_hz2); + out5 = __lsx_vpackev_b(hz_out6, hz_out5); + tmp1 = DPADD_H3(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2); + + hz_out7 = HORIZ_6TAP_FILT(src7, src7, mask0, mask1, mask2, filt_hz0, + filt_hz1, filt_hz2); + out7 = __lsx_vpackev_b(hz_out7, hz_out6); + tmp2 = DPADD_H3(out1, out2, out7, filt_vt0, filt_vt1, filt_vt2); + + hz_out8 = HORIZ_6TAP_FILT(src8, src8, mask0, mask1, mask2, filt_hz0, + filt_hz1, filt_hz2); + out6 = __lsx_vpackev_b(hz_out8, hz_out7); + tmp3 = DPADD_H3(out4, out5, out6, filt_vt0, filt_vt1, filt_vt2); + + DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, VP8_FILTER_SHIFT, tmp3, tmp2, + VP8_FILTER_SHIFT, vec0, vec1); + DUP2_ARG2(__lsx_vxori_b, vec0, 128, vec1, 128, vec0, vec1); + + __lsx_vstelm_d(vec0, dst, 0, 0); + __lsx_vstelm_d(vec0, dst + dst_stride, 0, 1); + __lsx_vstelm_d(vec1, dst + dst_stride_x2, 0, 0); + __lsx_vstelm_d(vec1, dst + dst_stride_x3, 0, 1); + dst += dst_stride_x4; + + hz_out4 = hz_out8; + out0 = out2; + out1 = out7; + out3 = out5; + out4 = out6; + } +} + +static void common_hv_6ht_6vt_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter_horiz, + const int8_t *filter_vert, + int32_t height) { + common_hv_6ht_6vt_8w_lsx(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert, height); + common_hv_6ht_6vt_8w_lsx(src + 8, src_stride, dst + 8, dst_stride, + filter_horiz, filter_vert, height); +} + +static void common_hz_4t_8w_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + uint32_t loop_cnt; + int32_t src_stride_x2 = src_stride << 1; + int32_t src_stride_x3 = src_stride_x2 + src_stride; + int32_t src_stride_x4 = src_stride << 2; + int32_t dst_stride_x2 = dst_stride << 1; + int32_t dst_stride_x3 = dst_stride_x2 + dst_stride; + int32_t dst_stride_x4 = dst_stride << 2; + __m128i src0, src1, src2, src3, filt0, filt1, mask0, mask1; + __m128i tmp0, tmp1; + __m128i filt, out0, out1, out2, out3; + + mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 0); + src -= 1; + + filt = __lsx_vld(filter, 0); + DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt0, filt1); + mask1 = __lsx_vaddi_bu(mask0, 2); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src, + src_stride_x3, src0, src1, src2, src3); + src += src_stride_x4; + + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0, + filt1, out0, out1, out2, out3); + DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, VP8_FILTER_SHIFT, out3, out2, + VP8_FILTER_SHIFT, tmp0, tmp1); + DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); + __lsx_vstelm_d(tmp0, dst, 0, 0); + __lsx_vstelm_d(tmp0, dst + dst_stride, 0, 1); + __lsx_vstelm_d(tmp1, dst + dst_stride_x2, 0, 0); + __lsx_vstelm_d(tmp1, dst + dst_stride_x3, 0, 1); + dst += dst_stride_x4; + } +} + +static void common_hz_4t_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + uint32_t loop_cnt; + int32_t src_stride_x2 = src_stride << 1; + int32_t src_stride_x3 = src_stride_x2 + src_stride; + int32_t src_stride_x4 = src_stride << 2; + int32_t dst_stride_x2 = dst_stride << 1; + int32_t dst_stride_x3 = dst_stride_x2 + dst_stride; + int32_t dst_stride_x4 = dst_stride << 2; + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + __m128i filt0, filt1, mask0, mask1; + __m128i filt, out0, out1, out2, out3, out4, out5, out6, out7; + + mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 0); + src -= 1; + + filt = __lsx_vld(filter, 0); + DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt0, filt1); + mask1 = __lsx_vaddi_bu(mask0, 2); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src, + src_stride_x3, src0, src2, src4, src6); + src += 8; + DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src, + src_stride_x3, src1, src3, src5, src7); + src += src_stride_x4 - 8; + + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + DUP4_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src6, 128, src7, 128, src4, + src5, src6, src7); + HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0, + filt1, out0, out1, out2, out3); + HORIZ_4TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, filt0, + filt1, out4, out5, out6, out7); + DUP4_ARG3(__lsx_vssrarni_b_h, out1, out0, VP8_FILTER_SHIFT, out3, out2, + VP8_FILTER_SHIFT, out5, out4, VP8_FILTER_SHIFT, out7, out6, + VP8_FILTER_SHIFT, out0, out1, out2, out3); + DUP4_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out2, 128, out3, 128, out0, + out1, out2, out3); + __lsx_vstx(out0, dst, 0); + __lsx_vstx(out1, dst, dst_stride); + __lsx_vstx(out2, dst, dst_stride_x2); + __lsx_vstx(out3, dst, dst_stride_x3); + dst += dst_stride_x4; + } +} + +static void common_vt_4t_8w_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + uint32_t loop_cnt; + int32_t src_stride_x2 = src_stride << 1; + int32_t src_stride_x3 = src_stride_x2 + src_stride; + int32_t src_stride_x4 = src_stride << 2; + int32_t dst_stride_x2 = dst_stride << 1; + int32_t dst_stride_x3 = dst_stride_x2 + dst_stride; + int32_t dst_stride_x4 = dst_stride << 2; + __m128i src0, src1, src2, src7, src8, src9, src10; + __m128i src10_r, src72_r, src98_r, src21_r, src87_r, src109_r, filt0, filt1; + __m128i tmp0, tmp1; + __m128i filt, out0_r, out1_r, out2_r, out3_r; + + src -= src_stride; + filt = __lsx_vld(filter, 0); + DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt0, filt1); + + DUP2_ARG2(__lsx_vldx, src, 0, src, src_stride, src0, src1); + src2 = __lsx_vldx(src, src_stride_x2); + src += src_stride_x3; + + DUP2_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src0, src1); + src2 = __lsx_vxori_b(src2, 128); + DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src10_r, src21_r); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src, + src_stride_x3, src7, src8, src9, src10); + src += src_stride_x4; + + DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7, + src8, src9, src10); + DUP4_ARG2(__lsx_vilvl_b, src7, src2, src8, src7, src9, src8, src10, src9, + src72_r, src87_r, src98_r, src109_r); + out0_r = FILT_4TAP_DPADD_H(src10_r, src72_r, filt0, filt1); + out1_r = FILT_4TAP_DPADD_H(src21_r, src87_r, filt0, filt1); + out2_r = FILT_4TAP_DPADD_H(src72_r, src98_r, filt0, filt1); + out3_r = FILT_4TAP_DPADD_H(src87_r, src109_r, filt0, filt1); + DUP2_ARG3(__lsx_vssrarni_b_h, out1_r, out0_r, VP8_FILTER_SHIFT, out3_r, + out2_r, VP8_FILTER_SHIFT, tmp0, tmp1); + DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); + __lsx_vstelm_d(tmp0, dst, 0, 0); + __lsx_vstelm_d(tmp0, dst + dst_stride, 0, 1); + __lsx_vstelm_d(tmp1, dst + dst_stride_x2, 0, 0); + __lsx_vstelm_d(tmp1, dst + dst_stride_x3, 0, 1); + dst += dst_stride_x4; + + src10_r = src98_r; + src21_r = src109_r; + src2 = src10; + } +} + +static void common_vt_4t_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + uint32_t loop_cnt; + int32_t src_stride_x2 = src_stride << 1; + int32_t src_stride_x3 = src_stride_x2 + src_stride; + int32_t src_stride_x4 = src_stride << 2; + int32_t dst_stride_x2 = dst_stride << 1; + int32_t dst_stride_x3 = dst_stride_x2 + dst_stride; + int32_t dst_stride_x4 = dst_stride << 2; + __m128i src0, src1, src2, src3, src4, src5, src6; + __m128i src10_r, src32_r, src54_r, src21_r, src43_r, src65_r, src10_l; + __m128i src32_l, src54_l, src21_l, src43_l, src65_l, filt0, filt1; + __m128i tmp0, tmp1, tmp2, tmp3; + __m128i filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l; + + src -= src_stride; + filt = __lsx_vld(filter, 0); + DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt0, filt1); + + DUP2_ARG2(__lsx_vldx, src, 0, src, src_stride, src0, src1); + src2 = __lsx_vldx(src, src_stride_x2); + src += src_stride_x3; + + DUP2_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src0, src1); + src2 = __lsx_vxori_b(src2, 128); + DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src10_r, src21_r); + DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, src10_l, src21_l); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src, + src_stride_x3, src3, src4, src5, src6); + src += src_stride_x4; + + DUP4_ARG2(__lsx_vxori_b, src3, 128, src4, 128, src5, 128, src6, 128, src3, + src4, src5, src6); + DUP4_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, src5, src4, src6, src5, + src32_r, src43_r, src54_r, src65_r); + DUP4_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, src5, src4, src6, src5, + src32_l, src43_l, src54_l, src65_l); + out0_r = FILT_4TAP_DPADD_H(src10_r, src32_r, filt0, filt1); + out1_r = FILT_4TAP_DPADD_H(src21_r, src43_r, filt0, filt1); + out2_r = FILT_4TAP_DPADD_H(src32_r, src54_r, filt0, filt1); + out3_r = FILT_4TAP_DPADD_H(src43_r, src65_r, filt0, filt1); + out0_l = FILT_4TAP_DPADD_H(src10_l, src32_l, filt0, filt1); + out1_l = FILT_4TAP_DPADD_H(src21_l, src43_l, filt0, filt1); + out2_l = FILT_4TAP_DPADD_H(src32_l, src54_l, filt0, filt1); + out3_l = FILT_4TAP_DPADD_H(src43_l, src65_l, filt0, filt1); + DUP4_ARG3(__lsx_vssrarni_b_h, out0_l, out0_r, VP8_FILTER_SHIFT, out1_l, + out1_r, VP8_FILTER_SHIFT, out2_l, out2_r, VP8_FILTER_SHIFT, + out3_l, out3_r, VP8_FILTER_SHIFT, tmp0, tmp1, tmp2, tmp3); + DUP4_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp2, 128, tmp3, 128, tmp0, + tmp1, tmp2, tmp3); + __lsx_vstx(tmp0, dst, 0); + __lsx_vstx(tmp1, dst, dst_stride); + __lsx_vstx(tmp2, dst, dst_stride_x2); + __lsx_vstx(tmp3, dst, dst_stride_x3); + dst += dst_stride_x4; + + src10_r = src54_r; + src21_r = src65_r; + src10_l = src54_l; + src21_l = src65_l; + src2 = src6; + } +} + +static inline void common_hv_4ht_4vt_8w_lsx( + uint8_t *RESTRICT src, int32_t src_stride, uint8_t *RESTRICT dst, + int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert, + int32_t height) { + uint32_t loop_cnt; + int32_t src_stride_x2 = src_stride << 1; + int32_t src_stride_x3 = src_stride_x2 + src_stride; + int32_t src_stride_x4 = src_stride << 2; + int32_t dst_stride_x2 = dst_stride << 1; + int32_t dst_stride_x3 = dst_stride_x2 + dst_stride; + int32_t dst_stride_x4 = dst_stride << 2; + __m128i src0, src1, src2, src3, src4, src5, src6, filt_hz0, filt_hz1; + __m128i mask0, mask1, out0, out1; + __m128i filt, filt_vt0, filt_vt1, tmp0, tmp1, tmp2, tmp3; + __m128i hz_out0, hz_out1, hz_out2, hz_out3; + __m128i vec0, vec1, vec2, vec3, vec4; + + mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 0); + src -= 1 + src_stride; + + filt = __lsx_vld(filter_horiz, 0); + DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt_hz0, filt_hz1); + mask1 = __lsx_vaddi_bu(mask0, 2); + + DUP2_ARG2(__lsx_vldx, src, 0, src, src_stride, src0, src1); + src2 = __lsx_vldx(src, src_stride_x2); + src += src_stride_x3; + + DUP2_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src0, src1); + src2 = __lsx_vxori_b(src2, 128); + hz_out0 = HORIZ_4TAP_FILT(src0, src0, mask0, mask1, filt_hz0, filt_hz1); + hz_out1 = HORIZ_4TAP_FILT(src1, src1, mask0, mask1, filt_hz0, filt_hz1); + hz_out2 = HORIZ_4TAP_FILT(src2, src2, mask0, mask1, filt_hz0, filt_hz1); + DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out2, hz_out1, vec0, vec2); + + filt = __lsx_vld(filter_vert, 0); + DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt_vt0, filt_vt1); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src, + src_stride_x3, src3, src4, src5, src6); + src += src_stride_x4; + + DUP4_ARG2(__lsx_vxori_b, src3, 128, src4, 128, src5, 128, src6, 128, src3, + src4, src5, src6); + hz_out3 = HORIZ_4TAP_FILT(src3, src3, mask0, mask1, filt_hz0, filt_hz1); + vec1 = __lsx_vpackev_b(hz_out3, hz_out2); + tmp0 = FILT_4TAP_DPADD_H(vec0, vec1, filt_vt0, filt_vt1); + + hz_out0 = HORIZ_4TAP_FILT(src4, src4, mask0, mask1, filt_hz0, filt_hz1); + vec3 = __lsx_vpackev_b(hz_out0, hz_out3); + tmp1 = FILT_4TAP_DPADD_H(vec2, vec3, filt_vt0, filt_vt1); + + hz_out1 = HORIZ_4TAP_FILT(src5, src5, mask0, mask1, filt_hz0, filt_hz1); + vec4 = __lsx_vpackev_b(hz_out1, hz_out0); + tmp2 = FILT_4TAP_DPADD_H(vec1, vec4, filt_vt0, filt_vt1); + + hz_out2 = HORIZ_4TAP_FILT(src6, src6, mask0, mask1, filt_hz0, filt_hz1); + DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out3, hz_out2, hz_out1, vec0, vec1); + tmp3 = FILT_4TAP_DPADD_H(vec0, vec1, filt_vt0, filt_vt1); + + DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, out0, out1); + DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); + __lsx_vstelm_d(out0, dst, 0, 0); + __lsx_vstelm_d(out0, dst + dst_stride, 0, 1); + __lsx_vstelm_d(out1, dst + dst_stride_x2, 0, 0); + __lsx_vstelm_d(out1, dst + dst_stride_x3, 0, 1); + dst += dst_stride_x4; + + vec0 = vec4; + vec2 = vec1; + } +} + +static void common_hv_4ht_4vt_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter_horiz, + const int8_t *filter_vert, + int32_t height) { + common_hv_4ht_4vt_8w_lsx(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert, height); + common_hv_4ht_4vt_8w_lsx(src + 8, src_stride, dst + 8, dst_stride, + filter_horiz, filter_vert, height); +} + +static inline void common_hv_6ht_4vt_8w_lsx( + uint8_t *RESTRICT src, int32_t src_stride, uint8_t *RESTRICT dst, + int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert, + int32_t height) { + uint32_t loop_cnt; + int32_t src_stride_x2 = src_stride << 1; + int32_t src_stride_x3 = src_stride_x2 + src_stride; + int32_t src_stride_x4 = src_stride << 2; + int32_t dst_stride_x2 = dst_stride << 1; + int32_t dst_stride_x3 = dst_stride_x2 + dst_stride; + int32_t dst_stride_x4 = dst_stride << 2; + + __m128i src0, src1, src2, src3, src4, src5, src6; + __m128i filt_hz0, filt_hz1, filt_hz2, mask0, mask1, mask2; + __m128i filt, filt_vt0, filt_vt1, hz_out0, hz_out1, hz_out2, hz_out3; + __m128i tmp0, tmp1, tmp2, tmp3, vec0, vec1, vec2, vec3; + __m128i out0, out1; + + mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 0); + src -= (2 + src_stride); + + filt = __lsx_vld(filter_horiz, 0); + DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt_hz0, filt_hz1); + filt_hz2 = __lsx_vreplvei_h(filt, 2); + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + + DUP2_ARG2(__lsx_vldx, src, 0, src, src_stride, src0, src1); + src2 = __lsx_vldx(src, src_stride_x2); + src += src_stride_x3; + + DUP2_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src0, src1); + src2 = __lsx_vxori_b(src2, 128); + hz_out0 = HORIZ_6TAP_FILT(src0, src0, mask0, mask1, mask2, filt_hz0, filt_hz1, + filt_hz2); + hz_out1 = HORIZ_6TAP_FILT(src1, src1, mask0, mask1, mask2, filt_hz0, filt_hz1, + filt_hz2); + hz_out2 = HORIZ_6TAP_FILT(src2, src2, mask0, mask1, mask2, filt_hz0, filt_hz1, + filt_hz2); + DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out2, hz_out1, vec0, vec2); + + filt = __lsx_vld(filter_vert, 0); + DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt_vt0, filt_vt1); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src, + src_stride_x3, src3, src4, src5, src6); + src += src_stride_x4; + DUP4_ARG2(__lsx_vxori_b, src3, 128, src4, 128, src5, 128, src6, 128, src3, + src4, src5, src6); + + hz_out3 = HORIZ_6TAP_FILT(src3, src3, mask0, mask1, mask2, filt_hz0, + filt_hz1, filt_hz2); + vec1 = __lsx_vpackev_b(hz_out3, hz_out2); + tmp0 = FILT_4TAP_DPADD_H(vec0, vec1, filt_vt0, filt_vt1); + + hz_out0 = HORIZ_6TAP_FILT(src4, src4, mask0, mask1, mask2, filt_hz0, + filt_hz1, filt_hz2); + vec3 = __lsx_vpackev_b(hz_out0, hz_out3); + tmp1 = FILT_4TAP_DPADD_H(vec2, vec3, filt_vt0, filt_vt1); + + hz_out1 = HORIZ_6TAP_FILT(src5, src5, mask0, mask1, mask2, filt_hz0, + filt_hz1, filt_hz2); + vec0 = __lsx_vpackev_b(hz_out1, hz_out0); + tmp2 = FILT_4TAP_DPADD_H(vec1, vec0, filt_vt0, filt_vt1); + + hz_out2 = HORIZ_6TAP_FILT(src6, src6, mask0, mask1, mask2, filt_hz0, + filt_hz1, filt_hz2); + DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out3, hz_out2, hz_out1, vec1, vec2); + tmp3 = FILT_4TAP_DPADD_H(vec1, vec2, filt_vt0, filt_vt1); + + DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, out0, out1); + DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); + __lsx_vstelm_d(out0, dst, 0, 0); + __lsx_vstelm_d(out0, dst + dst_stride, 0, 1); + __lsx_vstelm_d(out1, dst + dst_stride_x2, 0, 0); + __lsx_vstelm_d(out1, dst + dst_stride_x3, 0, 1); + dst += dst_stride_x4; + } +} + +static void common_hv_6ht_4vt_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter_horiz, + const int8_t *filter_vert, + int32_t height) { + common_hv_6ht_4vt_8w_lsx(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert, height); + common_hv_6ht_4vt_8w_lsx(src + 8, src_stride, dst + 8, dst_stride, + filter_horiz, filter_vert, height); +} + +static inline void common_hv_4ht_6vt_8w_lsx( + uint8_t *RESTRICT src, int32_t src_stride, uint8_t *RESTRICT dst, + int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert, + int32_t height) { + uint32_t loop_cnt; + int32_t src_stride_x2 = src_stride << 1; + int32_t src_stride_x3 = src_stride_x2 + src_stride; + int32_t src_stride_x4 = src_stride << 2; + int32_t dst_stride_x2 = dst_stride << 1; + int32_t dst_stride_x3 = dst_stride_x2 + dst_stride; + int32_t dst_stride_x4 = dst_stride << 2; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8; + __m128i filt_hz0, filt_hz1, mask0, mask1; + __m128i filt, filt_vt0, filt_vt1, filt_vt2, tmp0, tmp1, tmp2, tmp3; + __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; + __m128i hz_out7, hz_out8, out0, out1, out2, out3, out4, out5, out6, out7; + __m128i vec0, vec1; + + mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 0); + src -= 1 + src_stride_x2; + + filt = __lsx_vld(filter_horiz, 0); + DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt_hz0, filt_hz1); + mask1 = __lsx_vaddi_bu(mask0, 2); + + DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src, + src_stride_x3, src0, src1, src2, src3); + src += src_stride_x4; + src4 = __lsx_vld(src, 0); + src += src_stride; + + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + src4 = __lsx_vxori_b(src4, 128); + hz_out0 = HORIZ_4TAP_FILT(src0, src0, mask0, mask1, filt_hz0, filt_hz1); + hz_out1 = HORIZ_4TAP_FILT(src1, src1, mask0, mask1, filt_hz0, filt_hz1); + hz_out2 = HORIZ_4TAP_FILT(src2, src2, mask0, mask1, filt_hz0, filt_hz1); + hz_out3 = HORIZ_4TAP_FILT(src3, src3, mask0, mask1, filt_hz0, filt_hz1); + hz_out4 = HORIZ_4TAP_FILT(src4, src4, mask0, mask1, filt_hz0, filt_hz1); + DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, out0, out1); + DUP2_ARG2(__lsx_vpackev_b, hz_out2, hz_out1, hz_out4, hz_out3, out3, out4); + + filt = __lsx_vld(filter_vert, 0); + DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt_vt0, filt_vt1); + filt_vt2 = __lsx_vreplvei_h(filt, 2); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src, + src_stride_x3, src5, src6, src7, src8); + src += src_stride_x4; + + DUP4_ARG2(__lsx_vxori_b, src5, 128, src6, 128, src7, 128, src8, 128, src5, + src6, src7, src8); + hz_out5 = HORIZ_4TAP_FILT(src5, src5, mask0, mask1, filt_hz0, filt_hz1); + out2 = __lsx_vpackev_b(hz_out5, hz_out4); + tmp0 = DPADD_H3(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2); + + hz_out6 = HORIZ_4TAP_FILT(src6, src6, mask0, mask1, filt_hz0, filt_hz1); + out5 = __lsx_vpackev_b(hz_out6, hz_out5); + tmp1 = DPADD_H3(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2); + + hz_out7 = HORIZ_4TAP_FILT(src7, src7, mask0, mask1, filt_hz0, filt_hz1); + out6 = __lsx_vpackev_b(hz_out7, hz_out6); + tmp2 = DPADD_H3(out1, out2, out6, filt_vt0, filt_vt1, filt_vt2); + + hz_out8 = HORIZ_4TAP_FILT(src8, src8, mask0, mask1, filt_hz0, filt_hz1); + out7 = __lsx_vpackev_b(hz_out8, hz_out7); + tmp3 = DPADD_H3(out4, out5, out7, filt_vt0, filt_vt1, filt_vt2); + DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, vec0, vec1); + DUP2_ARG2(__lsx_vxori_b, vec0, 128, vec1, 128, vec0, vec1); + __lsx_vstelm_d(vec0, dst, 0, 0); + __lsx_vstelm_d(vec0, dst + dst_stride, 0, 1); + __lsx_vstelm_d(vec1, dst + dst_stride_x2, 0, 0); + __lsx_vstelm_d(vec1, dst + dst_stride_x3, 0, 1); + dst += dst_stride_x4; + hz_out4 = hz_out8; + out0 = out2; + out1 = out6; + out3 = out5; + out4 = out7; + } +} + +static void common_hv_4ht_6vt_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter_horiz, + const int8_t *filter_vert, + int32_t height) { + common_hv_4ht_6vt_8w_lsx(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert, height); + common_hv_4ht_6vt_8w_lsx(src + 8, src_stride, dst + 8, dst_stride, + filter_horiz, filter_vert, height); +} + +typedef void (*PVp8SixtapPredictFunc1)( + uint8_t *RESTRICT src, int32_t src_stride, uint8_t *RESTRICT dst, + int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert, + int32_t height); + +typedef void (*PVp8SixtapPredictFunc2)(uint8_t *RESTRICT src, + int32_t src_stride, + uint8_t *RESTRICT dst, + int32_t dst_stride, const int8_t *filter, + int32_t height); + +void vp8_sixtap_predict8x8_lsx(uint8_t *RESTRICT src, int32_t src_stride, + int32_t xoffset, int32_t yoffset, + uint8_t *RESTRICT dst, int32_t dst_stride) { + const int8_t *h_filter = vp8_subpel_filters_lsx[xoffset - 1]; + const int8_t *v_filter = vp8_subpel_filters_lsx[yoffset - 1]; + + static PVp8SixtapPredictFunc1 Predict8x8Funcs1[4] = { + common_hv_6ht_6vt_8w_lsx, + common_hv_6ht_4vt_8w_lsx, + common_hv_4ht_6vt_8w_lsx, + common_hv_4ht_4vt_8w_lsx, + }; + + static PVp8SixtapPredictFunc2 Predict8x8Funcs2[4] = { common_vt_6t_8w_lsx, + common_vt_4t_8w_lsx, + common_hz_6t_8w_lsx, + common_hz_4t_8w_lsx }; + + if (yoffset < 8 && xoffset < 8) { + if (yoffset) { + if (xoffset) { + switch (xoffset & 1) { + case 0: + switch (yoffset & 1) { + case 0: + Predict8x8Funcs1[0](src, src_stride, dst, dst_stride, h_filter, + v_filter, 8); + break; + + case 1: + Predict8x8Funcs1[1](src, src_stride, dst, dst_stride, h_filter, + v_filter + 1, 8); + break; + } + break; + + case 1: + switch (yoffset & 1) { + case 0: + Predict8x8Funcs1[2](src, src_stride, dst, dst_stride, + h_filter + 1, v_filter, 8); + break; + + case 1: + Predict8x8Funcs1[3](src, src_stride, dst, dst_stride, + h_filter + 1, v_filter + 1, 8); + break; + } + break; + } + } else { + switch (yoffset & 1) { + case 0: + Predict8x8Funcs2[0](src, src_stride, dst, dst_stride, v_filter, 8); + break; + + case 1: + Predict8x8Funcs2[1](src, src_stride, dst, dst_stride, v_filter + 1, + 8); + break; + } + } + } else { + switch (xoffset & 1) { + case 1: + Predict8x8Funcs2[3](src, src_stride, dst, dst_stride, h_filter + 1, + 8); + break; + } + switch (xoffset) { + case 0: vp8_copy_mem8x8(src, src_stride, dst, dst_stride); break; + case 2: + case 4: + case 6: + Predict8x8Funcs2[2](src, src_stride, dst, dst_stride, h_filter, 8); + break; + } + } + } +} + +void vp8_sixtap_predict16x16_lsx(uint8_t *RESTRICT src, int32_t src_stride, + int32_t xoffset, int32_t yoffset, + uint8_t *RESTRICT dst, int32_t dst_stride) { + const int8_t *h_filter = vp8_subpel_filters_lsx[xoffset - 1]; + const int8_t *v_filter = vp8_subpel_filters_lsx[yoffset - 1]; + + static PVp8SixtapPredictFunc1 Predict16x16Funcs1[4] = { + common_hv_6ht_6vt_16w_lsx, + common_hv_6ht_4vt_16w_lsx, + common_hv_4ht_6vt_16w_lsx, + common_hv_4ht_4vt_16w_lsx, + }; + + static PVp8SixtapPredictFunc2 Predict16x16Funcs2[4] = { + common_vt_6t_16w_lsx, common_vt_4t_16w_lsx, common_hz_6t_16w_lsx, + common_hz_4t_16w_lsx + }; + + if (yoffset < 8 && xoffset < 8) { + if (yoffset) { + if (xoffset) { + switch (xoffset & 1) { + case 0: + switch (yoffset & 1) { + case 0: + Predict16x16Funcs1[0](src, src_stride, dst, dst_stride, + h_filter, v_filter, 16); + break; + + case 1: + Predict16x16Funcs1[1](src, src_stride, dst, dst_stride, + h_filter, v_filter + 1, 16); + break; + } + break; + + case 1: + switch (yoffset & 1) { + case 0: + Predict16x16Funcs1[2](src, src_stride, dst, dst_stride, + h_filter + 1, v_filter, 16); + break; + + case 1: + Predict16x16Funcs1[3](src, src_stride, dst, dst_stride, + h_filter, v_filter + 1, 16); + break; + } + break; + } + } else { + switch (yoffset & 1) { + case 0: + Predict16x16Funcs2[0](src, src_stride, dst, dst_stride, v_filter, + 16); + break; + + case 1: + Predict16x16Funcs2[1](src, src_stride, dst, dst_stride, + v_filter + 1, 16); + break; + } + } + } else { + switch (xoffset & 1) { + case 1: + Predict16x16Funcs2[3](src, src_stride, dst, dst_stride, h_filter + 1, + 16); + break; + } + switch (xoffset) { + case 0: vp8_copy_mem16x16(src, src_stride, dst, dst_stride); break; + case 2: + case 4: + case 6: + Predict16x16Funcs2[2](src, src_stride, dst, dst_stride, h_filter, 16); + break; + } + } + } +} diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl index 8452b5e854..40117e3677 100644 --- a/vp8/common/rtcd_defs.pl +++ b/vp8/common/rtcd_defs.pl @@ -47,13 +47,13 @@ () # Loopfilter # add_proto qw/void vp8_loop_filter_mbv/, "unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi"; -specialize qw/vp8_loop_filter_mbv sse2 neon dspr2 msa mmi/; +specialize qw/vp8_loop_filter_mbv sse2 neon dspr2 msa mmi lsx/; add_proto qw/void vp8_loop_filter_bv/, "unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi"; specialize qw/vp8_loop_filter_bv sse2 neon dspr2 msa mmi/; add_proto qw/void vp8_loop_filter_mbh/, "unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi"; -specialize qw/vp8_loop_filter_mbh sse2 neon dspr2 msa mmi/; +specialize qw/vp8_loop_filter_mbh sse2 neon dspr2 msa mmi lsx/; add_proto qw/void vp8_loop_filter_bh/, "unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi"; specialize qw/vp8_loop_filter_bh sse2 neon dspr2 msa mmi/; @@ -146,10 +146,10 @@ () # Subpixel # add_proto qw/void vp8_sixtap_predict16x16/, "unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch"; -specialize qw/vp8_sixtap_predict16x16 sse2 ssse3 neon dspr2 msa mmi/; +specialize qw/vp8_sixtap_predict16x16 sse2 ssse3 neon dspr2 msa mmi lsx/; add_proto qw/void vp8_sixtap_predict8x8/, "unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch"; -specialize qw/vp8_sixtap_predict8x8 sse2 ssse3 neon dspr2 msa mmi/; +specialize qw/vp8_sixtap_predict8x8 sse2 ssse3 neon dspr2 msa mmi lsx/; add_proto qw/void vp8_sixtap_predict8x4/, "unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch"; specialize qw/vp8_sixtap_predict8x4 sse2 ssse3 neon dspr2 msa mmi/; diff --git a/vp8/vp8_common.mk b/vp8/vp8_common.mk index 286a93a056..909924ce8d 100644 --- a/vp8/vp8_common.mk +++ b/vp8/vp8_common.mk @@ -124,6 +124,10 @@ ifeq ($(CONFIG_POSTPROC),yes) VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/mfqe_msa.c endif +# common (loongarch LSX intrinsics) +VP8_COMMON_SRCS-$(HAVE_LSX) += common/loongarch/loopfilter_filters_lsx.c +VP8_COMMON_SRCS-$(HAVE_LSX) += common/loongarch/sixtap_filter_lsx.c + # common (neon intrinsics) VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/loopfilter_arm.c VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/loopfilter_arm.h diff --git a/vpx_dsp/loongarch/intrapred_lsx.c b/vpx_dsp/loongarch/intrapred_lsx.c new file mode 100644 index 0000000000..f990211791 --- /dev/null +++ b/vpx_dsp/loongarch/intrapred_lsx.c @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2021 Loongson Technology Corporation Limited + * Contributed by Lu Wang + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_util/loongson_intrinsics.h" + +static inline void intra_predict_dc_8x8_lsx(const uint8_t *src_top, + const uint8_t *src_left, + uint8_t *dst, int32_t dst_stride) { + uint64_t val0, val1; + int32_t dst_stride_x2 = dst_stride << 1; + int32_t dst_stride_x3 = dst_stride_x2 + dst_stride; + int32_t dst_stride_x4 = dst_stride << 2; + __m128i store, sum_h, sum_w, sum_d; + __m128i src = { 0 }; + + val0 = *(const uint64_t *)src_top; + val1 = *(const uint64_t *)src_left; + DUP2_ARG3(__lsx_vinsgr2vr_d, src, val0, 0, src, val1, 1, src, src); + sum_h = __lsx_vhaddw_hu_bu(src, src); + sum_w = __lsx_vhaddw_wu_hu(sum_h, sum_h); + sum_d = __lsx_vhaddw_du_wu(sum_w, sum_w); + sum_w = __lsx_vpickev_w(sum_d, sum_d); + sum_d = __lsx_vhaddw_du_wu(sum_w, sum_w); + sum_w = __lsx_vsrari_w(sum_d, 4); + store = __lsx_vreplvei_b(sum_w, 0); + + __lsx_vstelm_d(store, dst, 0, 0); + __lsx_vstelm_d(store, dst + dst_stride, 0, 0); + __lsx_vstelm_d(store, dst + dst_stride_x2, 0, 0); + __lsx_vstelm_d(store, dst + dst_stride_x3, 0, 0); + dst += dst_stride_x4; + __lsx_vstelm_d(store, dst, 0, 0); + __lsx_vstelm_d(store, dst + dst_stride, 0, 0); + __lsx_vstelm_d(store, dst + dst_stride_x2, 0, 0); + __lsx_vstelm_d(store, dst + dst_stride_x3, 0, 0); +} + +static inline void intra_predict_dc_16x16_lsx(const uint8_t *src_top, + const uint8_t *src_left, + uint8_t *dst, + int32_t dst_stride) { + int32_t dst_stride_x2 = dst_stride << 1; + int32_t dst_stride_x3 = dst_stride_x2 + dst_stride; + int32_t dst_stride_x4 = dst_stride << 2; + __m128i top, left, out; + __m128i sum_h, sum_top, sum_left; + __m128i sum_w; + __m128i sum_d; + + DUP2_ARG2(__lsx_vld, src_top, 0, src_left, 0, top, left); + DUP2_ARG2(__lsx_vhaddw_hu_bu, top, top, left, left, sum_top, sum_left); + sum_h = __lsx_vadd_h(sum_top, sum_left); + sum_w = __lsx_vhaddw_wu_hu(sum_h, sum_h); + sum_d = __lsx_vhaddw_du_wu(sum_w, sum_w); + sum_w = __lsx_vpickev_w(sum_d, sum_d); + sum_d = __lsx_vhaddw_du_wu(sum_w, sum_w); + sum_w = __lsx_vsrari_w(sum_d, 5); + out = __lsx_vreplvei_b(sum_w, 0); + + __lsx_vstx(out, dst, 0); + __lsx_vstx(out, dst, dst_stride); + __lsx_vstx(out, dst, dst_stride_x2); + __lsx_vstx(out, dst, dst_stride_x3); + dst += dst_stride_x4; + __lsx_vstx(out, dst, 0); + __lsx_vstx(out, dst, dst_stride); + __lsx_vstx(out, dst, dst_stride_x2); + __lsx_vstx(out, dst, dst_stride_x3); + dst += dst_stride_x4; + __lsx_vstx(out, dst, 0); + __lsx_vstx(out, dst, dst_stride); + __lsx_vstx(out, dst, dst_stride_x2); + __lsx_vstx(out, dst, dst_stride_x3); + dst += dst_stride_x4; + __lsx_vstx(out, dst, 0); + __lsx_vstx(out, dst, dst_stride); + __lsx_vstx(out, dst, dst_stride_x2); + __lsx_vstx(out, dst, dst_stride_x3); +} + +void vpx_dc_predictor_8x8_lsx(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + intra_predict_dc_8x8_lsx(above, left, dst, y_stride); +} + +void vpx_dc_predictor_16x16_lsx(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + intra_predict_dc_16x16_lsx(above, left, dst, y_stride); +} diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index 01653102ef..c948e12a39 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -74,6 +74,7 @@ endif # CONFIG_POSTPROC DSP_SRCS-$(HAVE_NEON_ASM) += arm/intrapred_neon_asm$(ASM) DSP_SRCS-$(HAVE_NEON) += arm/intrapred_neon.c DSP_SRCS-$(HAVE_MSA) += mips/intrapred_msa.c +DSP_SRCS-$(HAVE_LSX) += loongarch/intrapred_lsx.c DSP_SRCS-$(HAVE_DSPR2) += mips/intrapred4_dspr2.c DSP_SRCS-$(HAVE_DSPR2) += mips/intrapred8_dspr2.c DSP_SRCS-$(HAVE_DSPR2) += mips/intrapred16_dspr2.c diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 0144b90c26..c721e190b0 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -117,7 +117,7 @@ () add_proto qw/void vpx_dc_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; # TODO(crbug.com/webm/1522): Re-enable vsx implementation. -specialize qw/vpx_dc_predictor_8x8 dspr2 neon msa sse2/; +specialize qw/vpx_dc_predictor_8x8 dspr2 neon msa sse2 lsx/; add_proto qw/void vpx_dc_top_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_dc_top_predictor_8x8 neon msa sse2/; @@ -155,7 +155,7 @@ () specialize qw/vpx_tm_predictor_16x16 neon msa sse2 vsx/; add_proto qw/void vpx_dc_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_dc_predictor_16x16 dspr2 neon msa sse2 vsx/; +specialize qw/vpx_dc_predictor_16x16 dspr2 neon msa sse2 vsx lsx/; add_proto qw/void vpx_dc_top_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_dc_top_predictor_16x16 neon msa sse2 vsx/; From 232ad814de14e5941857c62a023b6fd66e967bb6 Mon Sep 17 00:00:00 2001 From: Marco Paniconi Date: Tue, 8 Feb 2022 12:36:39 -0800 Subject: [PATCH 0285/1000] rtc-vp9: Fix intra-only for bypass mode Allow intra-only frame in svc to also work in bypass (flexible-svc) mode. Added unittest for the flexible svc case. And fix the gld_fb_idx for (SL0, TL1) in bypass/flexible mode pattern in the sample encoder: force it to be 0 (same as lst_fb_idx), since the slot is unused on SL0. Change-Id: Iada9d1b052e470a0d5d25220809ad0c87cd46268 --- examples/vp9_spatial_svc_encoder.c | 3 +- test/svc_end_to_end_test.cc | 148 +++++++++++++++++++++++++++-- vp9/encoder/vp9_ratectrl.c | 16 +++- vp9/encoder/vp9_svc_layercontext.c | 6 ++ 4 files changed, 163 insertions(+), 10 deletions(-) diff --git a/examples/vp9_spatial_svc_encoder.c b/examples/vp9_spatial_svc_encoder.c index c37e608d17..455f6c9036 100644 --- a/examples/vp9_spatial_svc_encoder.c +++ b/examples/vp9_spatial_svc_encoder.c @@ -579,7 +579,8 @@ static void set_frame_flags_bypass_mode_ex0( ref_frame_config->alt_fb_idx[sl] = 0; } else if (tl == 1) { ref_frame_config->lst_fb_idx[sl] = sl; - ref_frame_config->gld_fb_idx[sl] = num_spatial_layers + sl - 1; + ref_frame_config->gld_fb_idx[sl] = + (sl == 0) ? 0 : num_spatial_layers + sl - 1; ref_frame_config->alt_fb_idx[sl] = num_spatial_layers + sl; } // Set the reference and update flags. diff --git a/test/svc_end_to_end_test.cc b/test/svc_end_to_end_test.cc index e59e337f1b..c0556d8b7d 100644 --- a/test/svc_end_to_end_test.cc +++ b/test/svc_end_to_end_test.cc @@ -15,6 +15,7 @@ #include "test/svc_test.h" #include "test/util.h" #include "test/y4m_video_source.h" +#include "vp9/common/vp9_onyxc_int.h" #include "vpx/vpx_codec.h" #include "vpx_ports/bitops.h" @@ -139,6 +140,91 @@ class SyncFrameOnePassCbrSvc : public OnePassCbrSvc, return current_video_frame_ >= frame_to_start_decode_; } + // Example pattern for spatial layers and 2 temporal layers used in the + // bypass/flexible mode. The pattern corresponds to the pattern + // VP9E_TEMPORAL_LAYERING_MODE_0101 (temporal_layering_mode == 2) used in + // non-flexible mode. + void set_frame_flags_bypass_mode( + int tl, int num_spatial_layers, int is_key_frame, + vpx_svc_ref_frame_config_t *ref_frame_config) { + int sl; + for (sl = 0; sl < num_spatial_layers; ++sl) + ref_frame_config->update_buffer_slot[sl] = 0; + + for (sl = 0; sl < num_spatial_layers; ++sl) { + // Set the buffer idx. + if (tl == 0) { + ref_frame_config->lst_fb_idx[sl] = sl; + if (sl) { + if (is_key_frame) { + ref_frame_config->lst_fb_idx[sl] = sl - 1; + ref_frame_config->gld_fb_idx[sl] = sl; + } else { + ref_frame_config->gld_fb_idx[sl] = sl - 1; + } + } else { + ref_frame_config->gld_fb_idx[sl] = 0; + } + ref_frame_config->alt_fb_idx[sl] = 0; + } else if (tl == 1) { + ref_frame_config->lst_fb_idx[sl] = sl; + ref_frame_config->gld_fb_idx[sl] = + (sl == 0) ? 0 : num_spatial_layers + sl - 1; + ref_frame_config->alt_fb_idx[sl] = num_spatial_layers + sl; + } + // Set the reference and update flags. + if (!tl) { + if (!sl) { + // Base spatial and base temporal (sl = 0, tl = 0) + ref_frame_config->reference_last[sl] = 1; + ref_frame_config->reference_golden[sl] = 0; + ref_frame_config->reference_alt_ref[sl] = 0; + ref_frame_config->update_buffer_slot[sl] |= + 1 << ref_frame_config->lst_fb_idx[sl]; + } else { + if (is_key_frame) { + ref_frame_config->reference_last[sl] = 1; + ref_frame_config->reference_golden[sl] = 0; + ref_frame_config->reference_alt_ref[sl] = 0; + ref_frame_config->update_buffer_slot[sl] |= + 1 << ref_frame_config->gld_fb_idx[sl]; + } else { + // Non-zero spatiall layer. + ref_frame_config->reference_last[sl] = 1; + ref_frame_config->reference_golden[sl] = 1; + ref_frame_config->reference_alt_ref[sl] = 1; + ref_frame_config->update_buffer_slot[sl] |= + 1 << ref_frame_config->lst_fb_idx[sl]; + } + } + } else if (tl == 1) { + if (!sl) { + // Base spatial and top temporal (tl = 1) + ref_frame_config->reference_last[sl] = 1; + ref_frame_config->reference_golden[sl] = 0; + ref_frame_config->reference_alt_ref[sl] = 0; + ref_frame_config->update_buffer_slot[sl] |= + 1 << ref_frame_config->alt_fb_idx[sl]; + } else { + // Non-zero spatial. + if (sl < num_spatial_layers - 1) { + ref_frame_config->reference_last[sl] = 1; + ref_frame_config->reference_golden[sl] = 1; + ref_frame_config->reference_alt_ref[sl] = 0; + ref_frame_config->update_buffer_slot[sl] |= + 1 << ref_frame_config->alt_fb_idx[sl]; + } else if (sl == num_spatial_layers - 1) { + // Top spatial and top temporal (non-reference -- doesn't + // update any reference buffers). + ref_frame_config->reference_last[sl] = 1; + ref_frame_config->reference_golden[sl] = 1; + ref_frame_config->reference_alt_ref[sl] = 0; + } + } + } + } + } + virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, ::libvpx_test::Encoder *encoder) { current_video_frame_ = video->frame(); @@ -158,6 +244,20 @@ class SyncFrameOnePassCbrSvc : public OnePassCbrSvc, encoder->Control(VP9E_SET_DISABLE_LOOPFILTER, loopfilter_off_); } + if (flexible_mode_) { + vpx_svc_layer_id_t layer_id; + layer_id.spatial_layer_id = 0; + layer_id.temporal_layer_id = (video->frame() % 2 != 0); + temporal_layer_id_ = layer_id.temporal_layer_id; + for (int i = 0; i < number_spatial_layers_; i++) { + layer_id.temporal_layer_id_per_spatial[i] = temporal_layer_id_; + ref_frame_config.duration[i] = 1; + } + encoder->Control(VP9E_SET_SVC_LAYER_ID, &layer_id); + set_frame_flags_bypass_mode(layer_id.temporal_layer_id, + number_spatial_layers_, 0, &ref_frame_config); + encoder->Control(VP9E_SET_SVC_REF_FRAME_CONFIG, &ref_frame_config); + } if (video->frame() == frame_to_sync_) { encoder->Control(VP9E_SET_SVC_SPATIAL_LAYER_SYNC, &svc_layer_sync_); } @@ -226,6 +326,8 @@ class SyncFrameOnePassCbrSvc : public OnePassCbrSvc, vpx_svc_spatial_layer_sync_t svc_layer_sync_; unsigned int mismatch_nframes_; unsigned int num_nonref_frames_; + bool flexible_mode_; + vpx_svc_ref_frame_config_t ref_frame_config; private: virtual void SetConfig(const int num_temporal_layer) { @@ -275,6 +377,7 @@ TEST_P(SyncFrameOnePassCbrSvc, OnePassCbrSvc3SL3TLFullSync) { ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); cfg_.rc_target_bitrate = 600; + flexible_mode_ = false; AssignLayerBitrates(); ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); #if CONFIG_VP9_DECODER @@ -302,6 +405,7 @@ TEST_P(SyncFrameOnePassCbrSvc, OnePassCbrSvc2SL3TLSyncToVGA) { ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, 0, 400); cfg_.rc_target_bitrate = 400; + flexible_mode_ = false; AssignLayerBitrates(); ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); #if CONFIG_VP9_DECODER @@ -329,6 +433,7 @@ TEST_P(SyncFrameOnePassCbrSvc, OnePassCbrSvc3SL3TLSyncToHD) { ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); cfg_.rc_target_bitrate = 600; + flexible_mode_ = false; AssignLayerBitrates(); ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); #if CONFIG_VP9_DECODER @@ -356,6 +461,7 @@ TEST_P(SyncFrameOnePassCbrSvc, OnePassCbrSvc3SL3TLSyncToVGAHD) { ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); cfg_.rc_target_bitrate = 600; + flexible_mode_ = false; AssignLayerBitrates(); ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); #if CONFIG_VP9_DECODER @@ -385,6 +491,7 @@ TEST_P(SyncFrameOnePassCbrSvc, OnePassCbrSvc2SL3TLSyncFrameVGADenoise) { ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, 0, 400); cfg_.rc_target_bitrate = 400; + flexible_mode_ = false; AssignLayerBitrates(); ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); #if CONFIG_VP9_DECODER @@ -395,6 +502,34 @@ TEST_P(SyncFrameOnePassCbrSvc, OnePassCbrSvc2SL3TLSyncFrameVGADenoise) { } #endif +// Encode 3 spatial, 2 temporal layer in flexible mode but don't +// start decoding. During the sequence insert intra-only on base/qvga +// layer at frame 20 and start decoding only QVGA layer from there. +TEST_P(SyncFrameOnePassCbrSvc, + OnePassCbrSvc3SL3TLSyncFrameStartDecodeOnIntraOnlyQVGAFlex) { + SetSvcConfig(3, 2); + frame_to_start_decode_ = 20; + frame_to_sync_ = 20; + decode_to_layer_before_sync_ = 2; + decode_to_layer_after_sync_ = 0; + intra_only_test_ = true; + + // Set up svc layer sync structure. + svc_layer_sync_.base_layer_intra_only = 1; + svc_layer_sync_.spatial_layer_sync[0] = 1; + svc_layer_sync_.spatial_layer_sync[1] = 0; + svc_layer_sync_.spatial_layer_sync[2] = 0; + + ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); + cfg_.rc_target_bitrate = 600; + flexible_mode_ = true; + AssignLayerBitrates(); + cfg_.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_BYPASS; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + // Can't check mismatch here because only base is decoded at + // frame sync, whereas encoder continues encoding all layers. +} + // Encode 3 spatial, 3 temporal layer but don't start decoding. // During the sequence insert intra-only on base/qvga layer at frame 20 // and start decoding only QVGA layer from there. @@ -415,15 +550,11 @@ TEST_P(SyncFrameOnePassCbrSvc, ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); cfg_.rc_target_bitrate = 600; + flexible_mode_ = false; AssignLayerBitrates(); ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); -#if CONFIG_VP9_DECODER - // The non-reference frames are expected to be mismatched frames as the - // encoder will avoid loopfilter on these frames. - if (0 && decode_to_layer_before_sync_ == decode_to_layer_after_sync_) { - EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); - } -#endif + // Can't check mismatch here because only base is decoded at + // frame sync, whereas encoder continues encoding all layers. } // Start decoding from beginning of sequence, during sequence insert intra-only @@ -447,6 +578,7 @@ TEST_P(SyncFrameOnePassCbrSvc, OnePassCbrSvc3SL3TLSyncFrameIntraOnlyQVGA) { ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); cfg_.rc_target_bitrate = 600; + flexible_mode_ = false; AssignLayerBitrates(); ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); #if CONFIG_VP9_DECODER @@ -477,6 +609,7 @@ TEST_P(SyncFrameOnePassCbrSvc, OnePassCbrSvc3SL3TLSyncFrameIntraOnlyVGA) { ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); cfg_.rc_target_bitrate = 600; + flexible_mode_ = false; AssignLayerBitrates(); ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); #if CONFIG_VP9_DECODER @@ -502,6 +635,7 @@ TEST_P(SyncFrameOnePassCbrSvc, OnePassCbrSvc1SL3TLSyncFrameIntraOnlyQVGA) { ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); cfg_.rc_target_bitrate = 600; + flexible_mode_ = false; AssignLayerBitrates(); ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); #if CONFIG_VP9_DECODER diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index ac346115fb..0852973914 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -2214,7 +2214,6 @@ static void set_intra_only_frame(VP9_COMP *cpi) { // only 3 reference buffers can be updated, but for temporal layers > 1 // we generally need to use buffer slots 4 and 5. if ((cm->current_video_frame == 0 && svc->number_temporal_layers > 1) || - svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS || svc->number_spatial_layers > 3 || svc->number_temporal_layers > 3 || svc->number_spatial_layers == 1) return; @@ -2235,11 +2234,15 @@ static void set_intra_only_frame(VP9_COMP *cpi) { cpi->lst_fb_idx = -1; cpi->gld_fb_idx = -1; cpi->alt_fb_idx = -1; + svc->update_buffer_slot[0] = 0; // For intra-only frame we need to refresh all slots that were // being used for the base layer (fb_idx_base[i] == 1). // Start with assigning last first, then golden and then alt. for (i = 0; i < REF_FRAMES; ++i) { - if (svc->fb_idx_base[i] == 1) count++; + if (svc->fb_idx_base[i] == 1) { + svc->update_buffer_slot[0] |= 1 << i; + count++; + } if (count == 1 && cpi->lst_fb_idx == -1) cpi->lst_fb_idx = i; if (count == 2 && cpi->gld_fb_idx == -1) cpi->gld_fb_idx = i; if (count == 3 && cpi->alt_fb_idx == -1) cpi->alt_fb_idx = i; @@ -2248,6 +2251,12 @@ static void set_intra_only_frame(VP9_COMP *cpi) { // to the lst_fb_idx. if (cpi->gld_fb_idx == -1) cpi->gld_fb_idx = cpi->lst_fb_idx; if (cpi->alt_fb_idx == -1) cpi->alt_fb_idx = cpi->lst_fb_idx; + if (svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS) { + cpi->ext_refresh_last_frame = 0; + cpi->ext_refresh_golden_frame = 0; + cpi->ext_refresh_alt_ref_frame = 0; + cpi->ref_frame_flags = 0; + } } } @@ -2390,6 +2399,9 @@ void vp9_rc_get_svc_params(VP9_COMP *cpi) { set_intra_only_frame(cpi); target = vp9_calc_iframe_target_size_one_pass_cbr(cpi); } + // Overlay frame predicts from LAST (intra-only) + if (svc->previous_frame_is_intra_only) cpi->ref_frame_flags |= VP9_LAST_FLAG; + // Any update/change of global cyclic refresh parameters (amount/delta-qp) // should be done here, before the frame qp is selected. if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c index 6655643654..a57a70ab16 100644 --- a/vp9/encoder/vp9_svc_layercontext.c +++ b/vp9/encoder/vp9_svc_layercontext.c @@ -1234,6 +1234,7 @@ void vp9_svc_check_spatial_layer_sync(VP9_COMP *const cpi) { void vp9_svc_update_ref_frame_buffer_idx(VP9_COMP *const cpi) { SVC *const svc = &cpi->svc; + int i = 0; // Update the usage of frame buffer index for base spatial layers. if (svc->spatial_layer_id == 0) { if ((cpi->ref_frame_flags & VP9_LAST_FLAG) || cpi->refresh_last_frame) @@ -1242,6 +1243,11 @@ void vp9_svc_update_ref_frame_buffer_idx(VP9_COMP *const cpi) { svc->fb_idx_base[cpi->gld_fb_idx] = 1; if ((cpi->ref_frame_flags & VP9_ALT_FLAG) || cpi->refresh_alt_ref_frame) svc->fb_idx_base[cpi->alt_fb_idx] = 1; + // For bypass/flexible mode: check for refresh slots. + if (svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS) { + for (i = 0; i < REF_FRAMES; ++i) + if (svc->update_buffer_slot[0] & (1 << i)) svc->fb_idx_base[i] = 1; + } } } From cafe7cc1f10cfea74edb2ded7c3df2d69fcf1eee Mon Sep 17 00:00:00 2001 From: Gregor Jasny Date: Thu, 10 Feb 2022 09:01:49 +0100 Subject: [PATCH 0286/1000] support visual studio 2022 (vs17) Change-Id: I8380283d09b0c90183f224399f953dcc527181c5 --- README | 2 ++ build/make/gen_msvs_sln.sh | 5 +++-- build/make/gen_msvs_vcxproj.sh | 5 ++++- configure | 2 ++ 4 files changed, 11 insertions(+), 3 deletions(-) diff --git a/README b/README index ddbcb9f695..a083ebf90e 100644 --- a/README +++ b/README @@ -103,6 +103,7 @@ COMPILING THE APPLICATIONS/LIBRARIES: x86-win32-vs14 x86-win32-vs15 x86-win32-vs16 + x86-win32-vs17 x86_64-android-gcc x86_64-darwin9-gcc x86_64-darwin10-gcc @@ -124,6 +125,7 @@ COMPILING THE APPLICATIONS/LIBRARIES: x86_64-win64-vs14 x86_64-win64-vs15 x86_64-win64-vs16 + x86_64-win64-vs17 generic-gnu The generic-gnu target, in conjunction with the CROSS environment variable, diff --git a/build/make/gen_msvs_sln.sh b/build/make/gen_msvs_sln.sh index d1adfd749c..0b312850fe 100755 --- a/build/make/gen_msvs_sln.sh +++ b/build/make/gen_msvs_sln.sh @@ -25,7 +25,7 @@ files. Options: --help Print this message --out=outfile Redirect output to a file - --ver=version Version (14-16) of visual studio to generate for + --ver=version Version (14-17) of visual studio to generate for --target=isa-os-cc Target specifier EOF exit 1 @@ -219,6 +219,7 @@ for opt in "$@"; do 14) vs_year=2015 ;; 15) vs_year=2017 ;; 16) vs_year=2019 ;; + 17) vs_year=2022 ;; *) die Unrecognized Visual Studio Version in $opt ;; esac ;; @@ -232,7 +233,7 @@ done outfile=${outfile:-/dev/stdout} mkoutfile=${mkoutfile:-/dev/stdout} case "${vs_ver}" in - 1[4-6]) + 1[4-7]) # VS has used Format Version 12.00 continuously since vs11. sln_vers="12.00" sln_vers_str="Visual Studio ${vs_year}" diff --git a/build/make/gen_msvs_vcxproj.sh b/build/make/gen_msvs_vcxproj.sh index 6f91ad4781..58bb66b9e3 100755 --- a/build/make/gen_msvs_vcxproj.sh +++ b/build/make/gen_msvs_vcxproj.sh @@ -170,7 +170,7 @@ for opt in "$@"; do --ver=*) vs_ver="$optval" case "$optval" in - 1[4-6]) + 1[4-7]) ;; *) die Unrecognized Visual Studio Version in $opt ;; @@ -344,6 +344,9 @@ generate_vcxproj() { if [ "$vs_ver" = "16" ]; then tag_content PlatformToolset v142 fi + if [ "$vs_ver" = "17" ]; then + tag_content PlatformToolset v143 + fi tag_content CharacterSet Unicode if [ "$config" = "Release" ]; then tag_content WholeProgramOptimization true diff --git a/configure b/configure index 434ebbe366..beea650329 100755 --- a/configure +++ b/configure @@ -142,6 +142,7 @@ all_platforms="${all_platforms} x86-win32-gcc" all_platforms="${all_platforms} x86-win32-vs14" all_platforms="${all_platforms} x86-win32-vs15" all_platforms="${all_platforms} x86-win32-vs16" +all_platforms="${all_platforms} x86-win32-vs17" all_platforms="${all_platforms} x86_64-android-gcc" all_platforms="${all_platforms} x86_64-darwin9-gcc" all_platforms="${all_platforms} x86_64-darwin10-gcc" @@ -164,6 +165,7 @@ all_platforms="${all_platforms} x86_64-win64-gcc" all_platforms="${all_platforms} x86_64-win64-vs14" all_platforms="${all_platforms} x86_64-win64-vs15" all_platforms="${all_platforms} x86_64-win64-vs16" +all_platforms="${all_platforms} x86_64-win64-vs17" all_platforms="${all_platforms} generic-gnu" # all_targets is a list of all targets that can be configured From 2da19ac0332f0b60cd2e4e2c7fa1748eb3ac85a7 Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 11 Feb 2022 12:43:29 -0800 Subject: [PATCH 0287/1000] svc_datarate_test.cc: remove stale TODO Bug: webm:1554 Change-Id: I547223763b86c6a24fa32851f7b30ebab4b7472a --- test/svc_datarate_test.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/test/svc_datarate_test.cc b/test/svc_datarate_test.cc index 95d82ce54e..291cb01280 100644 --- a/test/svc_datarate_test.cc +++ b/test/svc_datarate_test.cc @@ -1354,7 +1354,6 @@ TEST_P(DatarateOnePassCbrSvcSmallKF, OnePassCbrSvc3SL3TLSmallKf) { ResetModel(); AssignLayerBitrates(); ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - // TODO(jianj): webm:1554 CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.70, 1.15); #if CONFIG_VP9_DECODER From 3b21aeac8b7d5a52b6360d878cb4df593e87113e Mon Sep 17 00:00:00 2001 From: yuanhecai Date: Wed, 23 Feb 2022 11:23:52 +0800 Subject: [PATCH 0288/1000] vp9[loongarch]: Optimize lpf_horizontal/vertical_16_dual with LSX Change-Id: I82c6bc16ea57c3f7ac5f4d212a12a5f70cb55ffc --- vpx_dsp/loongarch/loopfilter_16_lsx.c | 1330 +++++++++++++++++++++++++ vpx_dsp/loongarch/loopfilter_lsx.h | 167 ++++ vpx_dsp/vpx_dsp.mk | 2 + vpx_dsp/vpx_dsp_rtcd_defs.pl | 4 +- 4 files changed, 1501 insertions(+), 2 deletions(-) create mode 100644 vpx_dsp/loongarch/loopfilter_16_lsx.c create mode 100644 vpx_dsp/loongarch/loopfilter_lsx.h diff --git a/vpx_dsp/loongarch/loopfilter_16_lsx.c b/vpx_dsp/loongarch/loopfilter_16_lsx.c new file mode 100644 index 0000000000..cbaefcd6e0 --- /dev/null +++ b/vpx_dsp/loongarch/loopfilter_16_lsx.c @@ -0,0 +1,1330 @@ +/* + * Copyright (c) 2022 Loongson Technology Corporation Limited + * Contributed by Hecai Yuan + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/loongarch/loopfilter_lsx.h" +#include "vpx_ports/mem.h" + +#define LSX_LD_8(_src, _stride, _stride2, _stride3, _stride4, _in0, _in1, \ + _in2, _in3, _in4, _in5, _in6, _in7) \ + { \ + _in0 = __lsx_vld(_src, 0); \ + _in1 = __lsx_vldx(_src, _stride); \ + _in2 = __lsx_vldx(_src, _stride2); \ + _in3 = __lsx_vldx(_src, _stride3); \ + _src += _stride4; \ + _in4 = __lsx_vld(_src, 0); \ + _in5 = __lsx_vldx(_src, _stride); \ + _in6 = __lsx_vldx(_src, _stride2); \ + _in7 = __lsx_vldx(_src, _stride3); \ + } + +#define LSX_ST_8(_dst0, _dst1, _dst2, _dst3, _dst4, _dst5, _dst6, _dst7, _dst, \ + _stride, _stride2, _stride3, _stride4) \ + { \ + __lsx_vst(_dst0, _dst, 0); \ + __lsx_vstx(_dst1, _dst, _stride); \ + __lsx_vstx(_dst2, _dst, _stride2); \ + __lsx_vstx(_dst3, _dst, _stride3); \ + _dst += _stride4; \ + __lsx_vst(_dst4, _dst, 0); \ + __lsx_vstx(_dst5, _dst, _stride); \ + __lsx_vstx(_dst6, _dst, _stride2); \ + __lsx_vstx(_dst7, _dst, _stride3); \ + } + +static int32_t hz_lpf_t4_and_t8_16w(uint8_t *dst, int32_t stride, + uint8_t *filter48, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + __m128i p3, p2, p1, p0, q3, q2, q1, q0; + __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; + __m128i flat, mask, hev, thresh, b_limit, limit; + __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l; + __m128i p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h; + __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l; + __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l; + __m128i p2_filt8_h, p1_filt8_h, p0_filt8_h; + __m128i q0_filt8_h, q1_filt8_h, q2_filt8_h; + __m128i zero = __lsx_vldi(0); + + int32_t stride2 = stride << 1; + int32_t stride3 = stride2 + stride; + int32_t stride4 = stride2 << 1; + + /* load vector elements */ + DUP4_ARG2(__lsx_vldx, dst, -stride4, dst, -stride3, dst, -stride2, dst, + -stride, p3, p2, p1, p0); + + q0 = __lsx_vld(dst, 0); + DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2); + q3 = __lsx_vldx(dst, stride3); + + thresh = __lsx_vreplgr2vr_b(*thresh_ptr); + b_limit = __lsx_vreplgr2vr_b(*b_limit_ptr); + limit = __lsx_vreplgr2vr_b(*limit_ptr); + /* mask and hev */ + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + + if (__lsx_bz_v(flat)) { + __lsx_vstx(p1_out, dst, -stride2); + __lsx_vstx(p0_out, dst, -stride); + __lsx_vst(q0_out, dst, 0); + __lsx_vstx(q1_out, dst, stride); + + return 1; + } + + DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, + p1_l, p0_l); + DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, + q2_l, q3_l); + + VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l, + p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l); + + DUP4_ARG2(__lsx_vilvh_b, zero, p3, zero, p2, zero, p1, zero, p0, p3_h, p2_h, + p1_h, p0_h); + DUP4_ARG2(__lsx_vilvh_b, zero, q0, zero, q1, zero, q2, zero, q3, q0_h, q1_h, + q2_h, q3_h); + VP9_FILTER8(p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h, p2_filt8_h, + p1_filt8_h, p0_filt8_h, q0_filt8_h, q1_filt8_h, q2_filt8_h); + + /* convert 16 bit output data into 8 bit */ + DUP4_ARG2(__lsx_vpickev_b, p2_filt8_h, p2_filt8_l, p1_filt8_h, p1_filt8_l, + p0_filt8_h, p0_filt8_l, q0_filt8_h, q0_filt8_l, p2_filt8_l, + p1_filt8_l, p0_filt8_l, q0_filt8_l); + DUP2_ARG2(__lsx_vpickev_b, q1_filt8_h, q1_filt8_l, q2_filt8_h, q2_filt8_l, + q1_filt8_l, q2_filt8_l); + + /* store pixel values */ + DUP4_ARG3(__lsx_vbitsel_v, p2, p2_filt8_l, flat, p1_out, p1_filt8_l, flat, + p0_out, p0_filt8_l, flat, q0_out, q0_filt8_l, flat, p2_out, p1_out, + p0_out, q0_out); + DUP2_ARG3(__lsx_vbitsel_v, q1_out, q1_filt8_l, flat, q2, q2_filt8_l, flat, + q1_out, q2_out); + + __lsx_vst(p2_out, filter48, 0); + __lsx_vst(p1_out, filter48, 16); + __lsx_vst(p0_out, filter48, 32); + __lsx_vst(q0_out, filter48, 48); + __lsx_vst(q1_out, filter48, 64); + __lsx_vst(q2_out, filter48, 80); + __lsx_vst(flat, filter48, 96); + + return 0; +} + +static void hz_lpf_t16_16w(uint8_t *dst, int32_t stride, uint8_t *filter48) { + int32_t stride2 = stride << 1; + int32_t stride3 = stride2 + stride; + int32_t stride4 = stride2 << 1; + uint8_t *dst_tmp0 = dst - stride4; + uint8_t *dst_tmp1 = dst + stride4; + + __m128i flat, flat2, filter8; + __m128i zero = __lsx_vldi(0); + __m128i p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; + __m128i out_h, out_l; + v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in; + v8u16 p3_l_in, p2_l_in, p1_l_in, p0_l_in; + v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in; + v8u16 q3_l_in, q2_l_in, q1_l_in, q0_l_in; + v8u16 p7_h_in, p6_h_in, p5_h_in, p4_h_in; + v8u16 p3_h_in, p2_h_in, p1_h_in, p0_h_in; + v8u16 q7_h_in, q6_h_in, q5_h_in, q4_h_in; + v8u16 q3_h_in, q2_h_in, q1_h_in, q0_h_in; + v8u16 tmp0_l, tmp1_l, tmp0_h, tmp1_h; + + flat = __lsx_vld(filter48, 96); + + DUP4_ARG2(__lsx_vldx, dst_tmp0, -stride4, dst_tmp0, -stride3, dst_tmp0, + -stride2, dst_tmp0, -stride, p7, p6, p5, p4); + + p3 = __lsx_vld(dst_tmp0, 0); + DUP2_ARG2(__lsx_vldx, dst_tmp0, stride, dst_tmp0, stride2, p2, p1); + p0 = __lsx_vldx(dst_tmp0, stride3); + + q0 = __lsx_vld(dst, 0); + DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2); + q3 = __lsx_vldx(dst, stride3); + + q4 = __lsx_vld(dst_tmp1, 0); + DUP2_ARG2(__lsx_vldx, dst_tmp1, stride, dst_tmp1, stride2, q5, q6); + q7 = __lsx_vldx(dst_tmp1, stride3); + + VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2); + + if (__lsx_bz_v(flat2)) { + DUP4_ARG2(__lsx_vld, filter48, 0, filter48, 16, filter48, 32, filter48, 48, + p2, p1, p0, q0); + DUP2_ARG2(__lsx_vld, filter48, 64, filter48, 80, q1, q2); + __lsx_vstx(p2, dst, -stride3); + __lsx_vstx(p1, dst, -stride2); + __lsx_vstx(p0, dst, -stride); + __lsx_vst(q0, dst, 0); + __lsx_vstx(q1, dst, stride); + __lsx_vstx(q2, dst, stride2); + } else { + dst = dst_tmp0 - stride3; + + p7_l_in = (v8u16)__lsx_vilvl_b(zero, p7); + p6_l_in = (v8u16)__lsx_vilvl_b(zero, p6); + p5_l_in = (v8u16)__lsx_vilvl_b(zero, p5); + p4_l_in = (v8u16)__lsx_vilvl_b(zero, p4); + p3_l_in = (v8u16)__lsx_vilvl_b(zero, p3); + p2_l_in = (v8u16)__lsx_vilvl_b(zero, p2); + p1_l_in = (v8u16)__lsx_vilvl_b(zero, p1); + p0_l_in = (v8u16)__lsx_vilvl_b(zero, p0); + + q0_l_in = (v8u16)__lsx_vilvl_b(zero, q0); + + tmp0_l = p7_l_in << 3; + tmp0_l -= p7_l_in; + tmp0_l += p6_l_in; + tmp0_l += q0_l_in; + tmp1_l = p6_l_in + p5_l_in; + tmp1_l += p4_l_in; + tmp1_l += p3_l_in; + tmp1_l += p2_l_in; + tmp1_l += p1_l_in; + tmp1_l += p0_l_in; + tmp1_l += tmp0_l; + + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + + p7_h_in = (v8u16)__lsx_vilvh_b(zero, p7); + p6_h_in = (v8u16)__lsx_vilvh_b(zero, p6); + p5_h_in = (v8u16)__lsx_vilvh_b(zero, p5); + p4_h_in = (v8u16)__lsx_vilvh_b(zero, p4); + + p3_h_in = (v8u16)__lsx_vilvh_b(zero, p3); + p2_h_in = (v8u16)__lsx_vilvh_b(zero, p2); + p1_h_in = (v8u16)__lsx_vilvh_b(zero, p1); + p0_h_in = (v8u16)__lsx_vilvh_b(zero, p0); + q0_h_in = (v8u16)__lsx_vilvh_b(zero, q0); + + tmp0_h = p7_h_in << 3; + tmp0_h -= p7_h_in; + tmp0_h += p6_h_in; + tmp0_h += q0_h_in; + tmp1_h = p6_h_in + p5_h_in; + tmp1_h += p4_h_in; + tmp1_h += p3_h_in; + tmp1_h += p2_h_in; + tmp1_h += p1_h_in; + tmp1_h += p0_h_in; + tmp1_h += tmp0_h; + + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + + out_l = __lsx_vpickev_b(out_h, out_l); + p6 = __lsx_vbitsel_v(p6, out_l, flat2); + __lsx_vst(p6, dst, 0); + dst += stride; + + /* p5 */ + q1_l_in = (v8u16)__lsx_vilvl_b(zero, q1); + tmp0_l = p5_l_in - p6_l_in; + tmp0_l += q1_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + + q1_h_in = (v8u16)__lsx_vilvh_b(zero, q1); + tmp0_h = p5_h_in - p6_h_in; + tmp0_h += q1_h_in; + tmp0_h -= p7_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + + out_l = __lsx_vpickev_b(out_h, out_l); + p5 = __lsx_vbitsel_v(p5, out_l, flat2); + __lsx_vst(p5, dst, 0); + dst += stride; + + /* p4 */ + q2_l_in = (v8u16)__lsx_vilvl_b(zero, q2); + tmp0_l = p4_l_in - p5_l_in; + tmp0_l += q2_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + + q2_h_in = (v8u16)__lsx_vilvh_b(zero, q2); + tmp0_h = p4_h_in - p5_h_in; + tmp0_h += q2_h_in; + tmp0_h -= p7_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + + out_l = __lsx_vpickev_b(out_h, out_l); + p4 = __lsx_vbitsel_v(p4, out_l, flat2); + __lsx_vst(p4, dst, 0); + dst += stride; + + /* p3 */ + q3_l_in = (v8u16)__lsx_vilvl_b(zero, q3); + tmp0_l = p3_l_in - p4_l_in; + tmp0_l += q3_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + + q3_h_in = (v8u16)__lsx_vilvh_b(zero, q3); + tmp0_h = p3_h_in - p4_h_in; + tmp0_h += q3_h_in; + tmp0_h -= p7_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + + out_l = __lsx_vpickev_b(out_h, out_l); + p3 = __lsx_vbitsel_v(p3, out_l, flat2); + __lsx_vst(p3, dst, 0); + dst += stride; + + /* p2 */ + q4_l_in = (v8u16)__lsx_vilvl_b(zero, q4); + filter8 = __lsx_vld(filter48, 0); + tmp0_l = p2_l_in - p3_l_in; + tmp0_l += q4_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + + q4_h_in = (v8u16)__lsx_vilvh_b(zero, q4); + tmp0_h = p2_h_in - p3_h_in; + tmp0_h += q4_h_in; + tmp0_h -= p7_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + + out_l = __lsx_vpickev_b(out_h, out_l); + filter8 = __lsx_vbitsel_v(filter8, out_l, flat2); + __lsx_vst(filter8, dst, 0); + dst += stride; + + /* p1 */ + q5_l_in = (v8u16)__lsx_vilvl_b(zero, q5); + filter8 = __lsx_vld(filter48, 16); + tmp0_l = p1_l_in - p2_l_in; + tmp0_l += q5_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + + q5_h_in = (v8u16)__lsx_vilvh_b(zero, q5); + tmp0_h = p1_h_in - p2_h_in; + tmp0_h += q5_h_in; + tmp0_h -= p7_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + + out_l = __lsx_vpickev_b(out_h, out_l); + filter8 = __lsx_vbitsel_v(filter8, out_l, flat2); + __lsx_vst(filter8, dst, 0); + dst += stride; + + /* p0 */ + q6_l_in = (v8u16)__lsx_vilvl_b(zero, q6); + filter8 = __lsx_vld(filter48, 32); + tmp0_l = p0_l_in - p1_l_in; + tmp0_l += q6_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + + q6_h_in = (v8u16)__lsx_vilvh_b(zero, q6); + tmp0_h = p0_h_in - p1_h_in; + tmp0_h += q6_h_in; + tmp0_h -= p7_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + + out_l = __lsx_vpickev_b(out_h, out_l); + filter8 = __lsx_vbitsel_v(filter8, out_l, flat2); + __lsx_vst(filter8, dst, 0); + dst += stride; + + /* q0 */ + q7_l_in = (v8u16)__lsx_vilvl_b(zero, q7); + filter8 = __lsx_vld(filter48, 48); + tmp0_l = q7_l_in - p0_l_in; + tmp0_l += q0_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + + q7_h_in = (v8u16)__lsx_vilvh_b(zero, q7); + tmp0_h = q7_h_in - p0_h_in; + tmp0_h += q0_h_in; + tmp0_h -= p7_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + + out_l = __lsx_vpickev_b(out_h, out_l); + filter8 = __lsx_vbitsel_v(filter8, out_l, flat2); + __lsx_vst(filter8, dst, 0); + dst += stride; + + /* q1 */ + filter8 = __lsx_vld(filter48, 64); + tmp0_l = q7_l_in - q0_l_in; + tmp0_l += q1_l_in; + tmp0_l -= p6_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + + tmp0_h = q7_h_in - q0_h_in; + tmp0_h += q1_h_in; + tmp0_h -= p6_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + + out_l = __lsx_vpickev_b(out_h, out_l); + filter8 = __lsx_vbitsel_v(filter8, out_l, flat2); + __lsx_vst(filter8, dst, 0); + dst += stride; + + /* q2 */ + filter8 = __lsx_vld(filter48, 80); + tmp0_l = q7_l_in - q1_l_in; + tmp0_l += q2_l_in; + tmp0_l -= p5_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + + tmp0_h = q7_h_in - q1_h_in; + tmp0_h += q2_h_in; + tmp0_h -= p5_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + + out_l = __lsx_vpickev_b(out_h, out_l); + filter8 = __lsx_vbitsel_v(filter8, out_l, flat2); + __lsx_vst(filter8, dst, 0); + dst += stride; + + /* q3 */ + tmp0_l = q7_l_in - q2_l_in; + tmp0_l += q3_l_in; + tmp0_l -= p4_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + + tmp0_h = q7_h_in - q2_h_in; + tmp0_h += q3_h_in; + tmp0_h -= p4_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + + out_l = __lsx_vpickev_b(out_h, out_l); + q3 = __lsx_vbitsel_v(q3, out_l, flat2); + __lsx_vst(q3, dst, 0); + dst += stride; + + /* q4 */ + tmp0_l = q7_l_in - q3_l_in; + tmp0_l += q4_l_in; + tmp0_l -= p3_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + + tmp0_h = q7_h_in - q3_h_in; + tmp0_h += q4_h_in; + tmp0_h -= p3_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + + out_l = __lsx_vpickev_b(out_h, out_l); + q4 = __lsx_vbitsel_v(q4, out_l, flat2); + __lsx_vst(q4, dst, 0); + dst += stride; + + /* q5 */ + tmp0_l = q7_l_in - q4_l_in; + tmp0_l += q5_l_in; + tmp0_l -= p2_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + + tmp0_h = q7_h_in - q4_h_in; + tmp0_h += q5_h_in; + tmp0_h -= p2_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + + out_l = __lsx_vpickev_b(out_h, out_l); + q5 = __lsx_vbitsel_v(q5, out_l, flat2); + __lsx_vst(q5, dst, 0); + dst += stride; + + /* q6 */ + tmp0_l = q7_l_in - q5_l_in; + tmp0_l += q6_l_in; + tmp0_l -= p1_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + + tmp0_h = q7_h_in - q5_h_in; + tmp0_h += q6_h_in; + tmp0_h -= p1_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + + out_l = __lsx_vpickev_b(out_h, out_l); + q6 = __lsx_vbitsel_v(q6, out_l, flat2); + __lsx_vst(q6, dst, 0); + } +} + +static void mb_lpf_horizontal_edge_dual(uint8_t *dst, int32_t stride, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + DECLARE_ALIGNED(16, uint8_t, filter48[16 * 8]); + uint8_t early_exit = 0; + + early_exit = hz_lpf_t4_and_t8_16w(dst, stride, &filter48[0], b_limit_ptr, + limit_ptr, thresh_ptr); + + if (early_exit == 0) { + hz_lpf_t16_16w(dst, stride, filter48); + } +} + +static void mb_lpf_horizontal_edge(uint8_t *dst, int32_t stride, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr, int32_t count) { + if (count == 1) { + __m128i flat2, mask, hev, flat, thresh, b_limit, limit; + __m128i p3, p2, p1, p0, q3, q2, q1, q0, p7, p6, p5, p4, q4, q5, q6, q7; + __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; + __m128i p0_filter16, p1_filter16; + __m128i p2_filter8, p1_filter8, p0_filter8; + __m128i q0_filter8, q1_filter8, q2_filter8; + __m128i p7_l, p6_l, p5_l, p4_l, q7_l, q6_l, q5_l, q4_l; + __m128i p3_l, p2_l, p1_l, p0_l, q3_l, q2_l, q1_l, q0_l; + __m128i zero = __lsx_vldi(0); + __m128i tmp0, tmp1, tmp2; + + int32_t stride2 = stride << 1; + int32_t stride3 = 2 + stride; + int32_t stride4 = stride << 2; + uint8_t *dst_tmp0 = dst - stride4; + uint8_t *dst_tmp1 = dst + stride4; + + /* load vector elements */ + DUP4_ARG2(__lsx_vldx, dst, -stride4, dst, -stride3, dst, -stride2, dst, + -stride, p3, p2, p1, p0); + q0 = __lsx_vld(dst, 0); + DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2); + q3 = __lsx_vldx(dst, stride3); + + thresh = __lsx_vreplgr2vr_b(*thresh_ptr); + b_limit = __lsx_vreplgr2vr_b(*b_limit_ptr); + limit = __lsx_vreplgr2vr_b(*limit_ptr); + + /* filter_mask* */ + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, + q1_out); + flat = __lsx_vilvl_d(zero, flat); + if (__lsx_bz_v(flat)) { + __lsx_vstelm_d(p1_out, dst - stride2, 0, 0); + __lsx_vstelm_d(p0_out, dst - stride, 0, 0); + __lsx_vstelm_d(q0_out, dst, 0, 0); + __lsx_vstelm_d(q1_out, dst + stride, 0, 0); + } else { + /* convert 8 bit input data into 16 bit */ + DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0, p3_l, + p2_l, p1_l, p0_l); + DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3, q0_l, + q1_l, q2_l, q3_l); + VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filter8, + p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8); + + /* convert 16 bit output data into 8 bit */ + DUP4_ARG2(__lsx_vpickev_b, zero, p2_filter8, zero, p1_filter8, zero, + p0_filter8, zero, q0_filter8, p2_filter8, p1_filter8, + p0_filter8, q0_filter8); + DUP2_ARG2(__lsx_vpickev_b, zero, q1_filter8, zero, q2_filter8, q1_filter8, + q2_filter8); + + /* store pixel values */ + p2_out = __lsx_vbitsel_v(p2, p2_filter8, flat); + p1_out = __lsx_vbitsel_v(p1_out, p1_filter8, flat); + p0_out = __lsx_vbitsel_v(p0_out, p0_filter8, flat); + q0_out = __lsx_vbitsel_v(q0_out, q0_filter8, flat); + q1_out = __lsx_vbitsel_v(q1_out, q1_filter8, flat); + q2_out = __lsx_vbitsel_v(q2, q2_filter8, flat); + + /* load 16 vector elements */ + DUP4_ARG2(__lsx_vldx, dst_tmp0, -stride4, dst_tmp0, -stride3, dst_tmp0, + -stride2, dst_tmp0, -stride, p7, p6, p5, p4); + q4 = __lsx_vld(dst_tmp1, 0); + DUP2_ARG2(__lsx_vldx, dst_tmp1, stride, dst_tmp1, stride2, q5, q6); + q7 = __lsx_vldx(dst_tmp1, stride3); + + VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2); + + if (__lsx_bz_v(flat2)) { + dst -= stride3; + __lsx_vstelm_d(p2_out, dst, 0, 0); + dst += stride; + __lsx_vstelm_d(p1_out, dst, 0, 0); + dst += stride; + __lsx_vstelm_d(p0_out, dst, 0, 0); + dst += stride; + __lsx_vstelm_d(q0_out, dst, 0, 0); + dst += stride; + __lsx_vstelm_d(q1_out, dst, 0, 0); + dst += stride; + __lsx_vstelm_d(q2_out, dst, 0, 0); + } else { + /* LSB(right) 8 pixel operation */ + DUP4_ARG2(__lsx_vilvl_b, zero, p7, zero, p6, zero, p5, zero, p4, p7_l, + p6_l, p5_l, p4_l); + DUP4_ARG2(__lsx_vilvl_b, zero, q4, zero, q5, zero, q6, zero, q7, q4_l, + q5_l, q6_l, q7_l); + + tmp0 = __lsx_vslli_h(p7_l, 3); + tmp0 = __lsx_vsub_h(tmp0, p7_l); + tmp0 = __lsx_vadd_h(tmp0, p6_l); + tmp0 = __lsx_vadd_h(tmp0, q0_l); + + dst = dst_tmp0 - stride3; + + /* calculation of p6 and p5 */ + tmp1 = __lsx_vadd_h(p6_l, p5_l); + tmp1 = __lsx_vadd_h(tmp1, p4_l); + tmp1 = __lsx_vadd_h(tmp1, p3_l); + tmp1 = __lsx_vadd_h(tmp1, p2_l); + tmp1 = __lsx_vadd_h(tmp1, p1_l); + tmp1 = __lsx_vadd_h(tmp1, p0_l); + tmp1 = __lsx_vadd_h(tmp1, tmp0); + p0_filter16 = __lsx_vsrari_h(tmp1, 4); + tmp0 = __lsx_vsub_h(p5_l, p6_l); + tmp0 = __lsx_vadd_h(tmp0, q1_l); + tmp0 = __lsx_vsub_h(tmp0, p7_l); + tmp1 = __lsx_vadd_h(tmp1, tmp0); + p1_filter16 = __lsx_vsrari_h(tmp1, 4); + DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero, p1_filter16, + p0_filter16, p1_filter16); + DUP2_ARG3(__lsx_vbitsel_v, p6, p0_filter16, flat2, p5, p1_filter16, + flat2, p0_filter16, p1_filter16); + __lsx_vstelm_d(p0_filter16, dst, 0, 0); + dst += stride; + __lsx_vstelm_d(p1_filter16, dst, 0, 0); + dst += stride; + + /* calculation of p4 and p3 */ + tmp0 = __lsx_vsub_h(p4_l, p5_l); + tmp0 = __lsx_vadd_h(tmp0, q2_l); + tmp0 = __lsx_vsub_h(tmp0, p7_l); + tmp2 = __lsx_vsub_h(p3_l, p4_l); + tmp2 = __lsx_vadd_h(tmp2, q3_l); + tmp2 = __lsx_vsub_h(tmp2, p7_l); + tmp1 = __lsx_vadd_h(tmp1, tmp0); + p0_filter16 = __lsx_vsrari_h(tmp1, 4); + tmp1 = __lsx_vadd_h(tmp1, tmp2); + p1_filter16 = __lsx_vsrari_h(tmp1, 4); + DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero, p1_filter16, + p0_filter16, p1_filter16); + DUP2_ARG3(__lsx_vbitsel_v, p4, p0_filter16, flat2, p3, p1_filter16, + flat2, p0_filter16, p1_filter16); + __lsx_vstelm_d(p0_filter16, dst, 0, 0); + dst += stride; + __lsx_vstelm_d(p1_filter16, dst, 0, 0); + dst += stride; + + /* calculation of p2 and p1 */ + tmp0 = __lsx_vsub_h(p2_l, p3_l); + tmp0 = __lsx_vadd_h(tmp0, q4_l); + tmp0 = __lsx_vsub_h(tmp0, p7_l); + tmp2 = __lsx_vsub_h(p1_l, p2_l); + tmp2 = __lsx_vadd_h(tmp2, q5_l); + tmp2 = __lsx_vsub_h(tmp2, p7_l); + tmp1 = __lsx_vadd_h(tmp1, tmp0); + p0_filter16 = __lsx_vsrari_h(tmp1, 4); + tmp1 = __lsx_vadd_h(tmp1, tmp2); + p1_filter16 = __lsx_vsrari_h(tmp1, 4); + DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero, p1_filter16, + p0_filter16, p1_filter16); + DUP2_ARG3(__lsx_vbitsel_v, p2_out, p0_filter16, flat2, p1_out, + p1_filter16, flat2, p0_filter16, p1_filter16); + __lsx_vstelm_d(p0_filter16, dst, 0, 0); + dst += stride; + __lsx_vstelm_d(p1_filter16, dst, 0, 0); + dst += stride; + + /* calculation of p0 and q0 */ + tmp0 = __lsx_vsub_h(p0_l, p1_l); + tmp0 = __lsx_vadd_h(tmp0, q6_l); + tmp0 = __lsx_vsub_h(tmp0, p7_l); + tmp2 = __lsx_vsub_h(q7_l, p0_l); + tmp2 = __lsx_vadd_h(tmp2, q0_l); + tmp2 = __lsx_vsub_h(tmp2, p7_l); + tmp1 = __lsx_vadd_h(tmp1, tmp0); + p0_filter16 = __lsx_vsrari_h(tmp1, 4); + tmp1 = __lsx_vadd_h(tmp1, tmp2); + p1_filter16 = __lsx_vsrari_h(tmp1, 4); + DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero, p1_filter16, + p0_filter16, p1_filter16); + DUP2_ARG3(__lsx_vbitsel_v, p0_out, p0_filter16, flat2, q0_out, + p1_filter16, flat2, p0_filter16, p1_filter16); + __lsx_vstelm_d(p0_filter16, dst, 0, 0); + dst += stride; + __lsx_vstelm_d(p1_filter16, dst, 0, 0); + dst += stride; + + /* calculation of q1 and q2 */ + tmp0 = __lsx_vsub_h(q7_l, q0_l); + tmp0 = __lsx_vadd_h(tmp0, q1_l); + tmp0 = __lsx_vsub_h(tmp0, p6_l); + tmp2 = __lsx_vsub_h(q7_l, q1_l); + tmp2 = __lsx_vadd_h(tmp2, q2_l); + tmp2 = __lsx_vsub_h(tmp2, p5_l); + tmp1 = __lsx_vadd_h(tmp1, tmp0); + p0_filter16 = __lsx_vsrari_h(tmp1, 4); + tmp1 = __lsx_vadd_h(tmp1, tmp2); + p1_filter16 = __lsx_vsrari_h(tmp1, 4); + DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero, p1_filter16, + p0_filter16, p1_filter16); + DUP2_ARG3(__lsx_vbitsel_v, q1_out, p0_filter16, flat2, q2_out, + p1_filter16, flat2, p0_filter16, p1_filter16); + __lsx_vstelm_d(p0_filter16, dst, 0, 0); + dst += stride; + __lsx_vstelm_d(p1_filter16, dst, 0, 0); + dst += stride; + + /* calculation of q3 and q4 */ + tmp0 = __lsx_vsub_h(q7_l, q2_l); + tmp0 = __lsx_vadd_h(tmp0, q3_l); + tmp0 = __lsx_vsub_h(tmp0, p4_l); + tmp2 = __lsx_vsub_h(q7_l, q3_l); + tmp2 = __lsx_vadd_h(tmp2, q4_l); + tmp2 = __lsx_vsub_h(tmp2, p3_l); + tmp1 = __lsx_vadd_h(tmp1, tmp0); + p0_filter16 = __lsx_vsrari_h(tmp1, 4); + tmp1 = __lsx_vadd_h(tmp1, tmp2); + p1_filter16 = __lsx_vsrari_h(tmp1, 4); + DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero, p1_filter16, + p0_filter16, p1_filter16); + DUP2_ARG3(__lsx_vbitsel_v, q3, p0_filter16, flat2, q4, p1_filter16, + flat2, p0_filter16, p1_filter16); + __lsx_vstelm_d(p0_filter16, dst, 0, 0); + dst += stride; + __lsx_vstelm_d(p1_filter16, dst, 0, 0); + dst += stride; + + /* calculation of q5 and q6 */ + tmp0 = __lsx_vsub_h(q7_l, q4_l); + tmp0 = __lsx_vadd_h(tmp0, q5_l); + tmp0 = __lsx_vsub_h(tmp0, p2_l); + tmp2 = __lsx_vsub_h(q7_l, q5_l); + tmp2 = __lsx_vadd_h(tmp2, q6_l); + tmp2 = __lsx_vsub_h(tmp2, p1_l); + tmp1 = __lsx_vadd_h(tmp1, tmp0); + p0_filter16 = __lsx_vsrari_h(tmp1, 4); + tmp1 = __lsx_vadd_h(tmp1, tmp2); + p1_filter16 = __lsx_vsrari_h(tmp1, 4); + DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero, p1_filter16, + p0_filter16, p1_filter16); + DUP2_ARG3(__lsx_vbitsel_v, q5, p0_filter16, flat2, q6, p1_filter16, + flat2, p0_filter16, p1_filter16); + __lsx_vstelm_d(p0_filter16, dst, 0, 0); + dst += stride; + __lsx_vstelm_d(p1_filter16, dst, 0, 0); + } + } + } else { + mb_lpf_horizontal_edge_dual(dst, stride, b_limit_ptr, limit_ptr, + thresh_ptr); + } +} + +void vpx_lpf_horizontal_16_dual_lsx(uint8_t *dst, int32_t stride, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + mb_lpf_horizontal_edge(dst, stride, b_limit_ptr, limit_ptr, thresh_ptr, 2); +} + +static void transpose_16x16(uint8_t *input, int32_t in_stride, uint8_t *output, + int32_t out_stride) { + __m128i row0, row1, row2, row3, row4, row5, row6, row7; + __m128i row8, row9, row10, row11, row12, row13, row14, row15; + __m128i tmp0, tmp1, tmp4, tmp5, tmp6, tmp7; + __m128i tmp2, tmp3; + __m128i p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; + int32_t in_stride2 = in_stride << 1; + int32_t in_stride3 = in_stride2 + in_stride; + int32_t in_stride4 = in_stride2 << 1; + int32_t out_stride2 = out_stride << 1; + int32_t out_stride3 = out_stride2 + out_stride; + int32_t out_stride4 = out_stride2 << 1; + + LSX_LD_8(input, in_stride, in_stride2, in_stride3, in_stride4, row0, row1, + row2, row3, row4, row5, row6, row7); + input += in_stride4; + LSX_LD_8(input, in_stride, in_stride2, in_stride3, in_stride4, row8, row9, + row10, row11, row12, row13, row14, row15); + + LSX_TRANSPOSE16x8_B(row0, row1, row2, row3, row4, row5, row6, row7, row8, + row9, row10, row11, row12, row13, row14, row15, p7, p6, + p5, p4, p3, p2, p1, p0); + + /* transpose 16x8 matrix into 8x16 */ + /* total 8 intermediate register and 32 instructions */ + q7 = __lsx_vpackod_d(row8, row0); + q6 = __lsx_vpackod_d(row9, row1); + q5 = __lsx_vpackod_d(row10, row2); + q4 = __lsx_vpackod_d(row11, row3); + q3 = __lsx_vpackod_d(row12, row4); + q2 = __lsx_vpackod_d(row13, row5); + q1 = __lsx_vpackod_d(row14, row6); + q0 = __lsx_vpackod_d(row15, row7); + + DUP2_ARG2(__lsx_vpackev_b, q6, q7, q4, q5, tmp0, tmp1); + DUP2_ARG2(__lsx_vpackod_b, q6, q7, q4, q5, tmp4, tmp5); + + DUP2_ARG2(__lsx_vpackev_b, q2, q3, q0, q1, q5, q7); + DUP2_ARG2(__lsx_vpackod_b, q2, q3, q0, q1, tmp6, tmp7); + + DUP2_ARG2(__lsx_vpackev_h, tmp1, tmp0, q7, q5, tmp2, tmp3); + q0 = __lsx_vpackev_w(tmp3, tmp2); + q4 = __lsx_vpackod_w(tmp3, tmp2); + + tmp2 = __lsx_vpackod_h(tmp1, tmp0); + tmp3 = __lsx_vpackod_h(q7, q5); + q2 = __lsx_vpackev_w(tmp3, tmp2); + q6 = __lsx_vpackod_w(tmp3, tmp2); + + DUP2_ARG2(__lsx_vpackev_h, tmp5, tmp4, tmp7, tmp6, tmp2, tmp3); + q1 = __lsx_vpackev_w(tmp3, tmp2); + q5 = __lsx_vpackod_w(tmp3, tmp2); + + tmp2 = __lsx_vpackod_h(tmp5, tmp4); + tmp3 = __lsx_vpackod_h(tmp7, tmp6); + q3 = __lsx_vpackev_w(tmp3, tmp2); + q7 = __lsx_vpackod_w(tmp3, tmp2); + + LSX_ST_8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_stride, out_stride2, + out_stride3, out_stride4); + output += out_stride4; + LSX_ST_8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_stride, out_stride2, + out_stride3, out_stride4); +} + +static int32_t vt_lpf_t4_and_t8_16w(uint8_t *dst, uint8_t *filter48, + uint8_t *dst_org, int32_t stride, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + int32_t stride2 = stride << 1; + int32_t stride3 = stride2 + stride; + int32_t stride4 = stride2 << 1; + __m128i p3, p2, p1, p0, q3, q2, q1, q0; + __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; + __m128i flat, mask, hev, thresh, b_limit, limit; + __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l; + __m128i p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h; + __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l; + __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l; + __m128i p2_filt8_h, p1_filt8_h, p0_filt8_h; + __m128i q0_filt8_h, q1_filt8_h, q2_filt8_h; + __m128i vec0, vec1, vec2, vec3, vec4, vec5; + __m128i zero = __lsx_vldi(0); + + /* load vector elements */ + DUP4_ARG2(__lsx_vld, dst, -64, dst, -48, dst, -32, dst, -16, p3, p2, p1, p0); + DUP4_ARG2(__lsx_vld, dst, 0, dst, 16, dst, 32, dst, 48, q0, q1, q2, q3); + + thresh = __lsx_vreplgr2vr_b(*thresh_ptr); + b_limit = __lsx_vreplgr2vr_b(*b_limit_ptr); + limit = __lsx_vreplgr2vr_b(*limit_ptr); + + /* mask and hev */ + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + /* flat4 */ + VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); + /* filter4 */ + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + + /* if flat is zero for all pixels, then no need to calculate other filter */ + if (__lsx_bz_v(flat)) { + DUP2_ARG2(__lsx_vilvl_b, p0_out, p1_out, q1_out, q0_out, vec0, vec1); + vec2 = __lsx_vilvl_h(vec1, vec0); + vec3 = __lsx_vilvh_h(vec1, vec0); + DUP2_ARG2(__lsx_vilvh_b, p0_out, p1_out, q1_out, q0_out, vec0, vec1); + vec4 = __lsx_vilvl_h(vec1, vec0); + vec5 = __lsx_vilvh_h(vec1, vec0); + + dst_org -= 2; + __lsx_vstelm_w(vec2, dst_org, 0, 0); + __lsx_vstelm_w(vec2, dst_org + stride, 0, 1); + __lsx_vstelm_w(vec2, dst_org + stride2, 0, 2); + __lsx_vstelm_w(vec2, dst_org + stride3, 0, 3); + dst_org += stride4; + __lsx_vstelm_w(vec3, dst_org, 0, 0); + __lsx_vstelm_w(vec3, dst_org + stride, 0, 1); + __lsx_vstelm_w(vec3, dst_org + stride2, 0, 2); + __lsx_vstelm_w(vec3, dst_org + stride3, 0, 3); + dst_org += stride4; + __lsx_vstelm_w(vec4, dst_org, 0, 0); + __lsx_vstelm_w(vec4, dst_org + stride, 0, 1); + __lsx_vstelm_w(vec4, dst_org + stride2, 0, 2); + __lsx_vstelm_w(vec4, dst_org + stride3, 0, 3); + dst_org += stride4; + __lsx_vstelm_w(vec5, dst_org, 0, 0); + __lsx_vstelm_w(vec5, dst_org + stride, 0, 1); + __lsx_vstelm_w(vec5, dst_org + stride2, 0, 2); + __lsx_vstelm_w(vec5, dst_org + stride3, 0, 3); + + return 1; + } + + DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, + p1_l, p0_l); + DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, + q2_l, q3_l); + VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l, + p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l); + DUP4_ARG2(__lsx_vilvh_b, zero, p3, zero, p2, zero, p1, zero, p0, p3_h, p2_h, + p1_h, p0_h); + DUP4_ARG2(__lsx_vilvh_b, zero, q0, zero, q1, zero, q2, zero, q3, q0_h, q1_h, + q2_h, q3_h); + VP9_FILTER8(p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h, p2_filt8_h, + p1_filt8_h, p0_filt8_h, q0_filt8_h, q1_filt8_h, q2_filt8_h); + + /* convert 16 bit output data into 8 bit */ + DUP4_ARG2(__lsx_vpickev_b, p2_filt8_h, p2_filt8_l, p1_filt8_h, p1_filt8_l, + p0_filt8_h, p0_filt8_l, q0_filt8_h, q0_filt8_l, p2_filt8_l, + p1_filt8_l, p0_filt8_l, q0_filt8_l); + DUP2_ARG2(__lsx_vpickev_b, q1_filt8_h, q1_filt8_l, q2_filt8_h, q2_filt8_l, + q1_filt8_l, q2_filt8_l); + + /* store pixel values */ + p2_out = __lsx_vbitsel_v(p2, p2_filt8_l, flat); + p1_out = __lsx_vbitsel_v(p1_out, p1_filt8_l, flat); + p0_out = __lsx_vbitsel_v(p0_out, p0_filt8_l, flat); + q0_out = __lsx_vbitsel_v(q0_out, q0_filt8_l, flat); + q1_out = __lsx_vbitsel_v(q1_out, q1_filt8_l, flat); + q2_out = __lsx_vbitsel_v(q2, q2_filt8_l, flat); + + __lsx_vst(p2_out, filter48, 0); + __lsx_vst(p1_out, filter48, 16); + __lsx_vst(p0_out, filter48, 32); + __lsx_vst(q0_out, filter48, 48); + __lsx_vst(q1_out, filter48, 64); + __lsx_vst(q2_out, filter48, 80); + __lsx_vst(flat, filter48, 96); + + return 0; +} + +static int32_t vt_lpf_t16_16w(uint8_t *dst, uint8_t *dst_org, int32_t stride, + uint8_t *filter48) { + __m128i zero = __lsx_vldi(0); + __m128i flat, flat2, filter8; + __m128i p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; + __m128i out_l, out_h; + v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in; + v8u16 p3_l_in, p2_l_in, p1_l_in, p0_l_in; + v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in; + v8u16 q3_l_in, q2_l_in, q1_l_in, q0_l_in; + v8u16 p7_h_in, p6_h_in, p5_h_in, p4_h_in; + v8u16 p3_h_in, p2_h_in, p1_h_in, p0_h_in; + v8u16 q7_h_in, q6_h_in, q5_h_in, q4_h_in; + v8u16 q3_h_in, q2_h_in, q1_h_in, q0_h_in; + v8u16 tmp0_l, tmp1_l, tmp0_h, tmp1_h; + uint8_t *dst_tmp = dst - 128; + + flat = __lsx_vld(filter48, 96); + + DUP4_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp, 16, dst_tmp, 32, dst_tmp, 48, p7, + p6, p5, p4); + DUP4_ARG2(__lsx_vld, dst_tmp, 64, dst_tmp, 80, dst_tmp, 96, dst_tmp, 112, p3, + p2, p1, p0); + DUP4_ARG2(__lsx_vld, dst, 0, dst, 16, dst, 32, dst, 48, q0, q1, q2, q3); + DUP4_ARG2(__lsx_vld, dst, 64, dst, 80, dst, 96, dst, 112, q4, q5, q6, q7); + + VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2); + /* if flat2 is zero for all pixels, then no need to calculate other filter */ + if (__lsx_bz_v(flat2)) { + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + + DUP4_ARG2(__lsx_vld, filter48, 0, filter48, 16, filter48, 32, filter48, 48, + p2, p1, p0, q0); + DUP2_ARG2(__lsx_vld, filter48, 64, filter48, 80, q1, q2); + + DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, vec0, vec1); + vec3 = __lsx_vilvl_h(vec1, vec0); + vec4 = __lsx_vilvh_h(vec1, vec0); + DUP2_ARG2(__lsx_vilvh_b, p1, p2, q0, p0, vec0, vec1); + vec6 = __lsx_vilvl_h(vec1, vec0); + vec7 = __lsx_vilvh_h(vec1, vec0); + vec2 = __lsx_vilvl_b(q2, q1); + vec5 = __lsx_vilvh_b(q2, q1); + + dst_org -= 3; + __lsx_vstelm_w(vec3, dst_org, 0, 0); + __lsx_vstelm_h(vec2, dst_org, 4, 0); + dst_org += stride; + __lsx_vstelm_w(vec3, dst_org, 0, 1); + __lsx_vstelm_h(vec2, dst_org, 4, 1); + dst_org += stride; + __lsx_vstelm_w(vec3, dst_org, 0, 2); + __lsx_vstelm_h(vec2, dst_org, 4, 2); + dst_org += stride; + __lsx_vstelm_w(vec3, dst_org, 0, 3); + __lsx_vstelm_h(vec2, dst_org, 4, 3); + dst_org += stride; + __lsx_vstelm_w(vec4, dst_org, 0, 0); + __lsx_vstelm_h(vec2, dst_org, 4, 4); + dst_org += stride; + __lsx_vstelm_w(vec4, dst_org, 0, 1); + __lsx_vstelm_h(vec2, dst_org, 4, 5); + dst_org += stride; + __lsx_vstelm_w(vec4, dst_org, 0, 2); + __lsx_vstelm_h(vec2, dst_org, 4, 6); + dst_org += stride; + __lsx_vstelm_w(vec4, dst_org, 0, 3); + __lsx_vstelm_h(vec2, dst_org, 4, 7); + dst_org += stride; + __lsx_vstelm_w(vec6, dst_org, 0, 0); + __lsx_vstelm_h(vec5, dst_org, 4, 0); + dst_org += stride; + __lsx_vstelm_w(vec6, dst_org, 0, 1); + __lsx_vstelm_h(vec5, dst_org, 4, 1); + dst_org += stride; + __lsx_vstelm_w(vec6, dst_org, 0, 2); + __lsx_vstelm_h(vec5, dst_org, 4, 2); + dst_org += stride; + __lsx_vstelm_w(vec6, dst_org, 0, 3); + __lsx_vstelm_h(vec5, dst_org, 4, 3); + dst_org += stride; + __lsx_vstelm_w(vec7, dst_org, 0, 0); + __lsx_vstelm_h(vec5, dst_org, 4, 4); + dst_org += stride; + __lsx_vstelm_w(vec7, dst_org, 0, 1); + __lsx_vstelm_h(vec5, dst_org, 4, 5); + dst_org += stride; + __lsx_vstelm_w(vec7, dst_org, 0, 2); + __lsx_vstelm_h(vec5, dst_org, 4, 6); + dst_org += stride; + __lsx_vstelm_w(vec7, dst_org, 0, 3); + __lsx_vstelm_h(vec5, dst_org, 4, 7); + + return 1; + } + + dst -= 7 * 16; + + p7_l_in = (v8u16)__lsx_vilvl_b(zero, p7); + p6_l_in = (v8u16)__lsx_vilvl_b(zero, p6); + p5_l_in = (v8u16)__lsx_vilvl_b(zero, p5); + p4_l_in = (v8u16)__lsx_vilvl_b(zero, p4); + p3_l_in = (v8u16)__lsx_vilvl_b(zero, p3); + p2_l_in = (v8u16)__lsx_vilvl_b(zero, p2); + p1_l_in = (v8u16)__lsx_vilvl_b(zero, p1); + p0_l_in = (v8u16)__lsx_vilvl_b(zero, p0); + q0_l_in = (v8u16)__lsx_vilvl_b(zero, q0); + + tmp0_l = p7_l_in << 3; + tmp0_l -= p7_l_in; + tmp0_l += p6_l_in; + tmp0_l += q0_l_in; + tmp1_l = p6_l_in + p5_l_in; + tmp1_l += p4_l_in; + tmp1_l += p3_l_in; + tmp1_l += p2_l_in; + tmp1_l += p1_l_in; + tmp1_l += p0_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + p7_h_in = (v8u16)__lsx_vilvh_b(zero, p7); + p6_h_in = (v8u16)__lsx_vilvh_b(zero, p6); + p5_h_in = (v8u16)__lsx_vilvh_b(zero, p5); + p4_h_in = (v8u16)__lsx_vilvh_b(zero, p4); + p3_h_in = (v8u16)__lsx_vilvh_b(zero, p3); + p2_h_in = (v8u16)__lsx_vilvh_b(zero, p2); + p1_h_in = (v8u16)__lsx_vilvh_b(zero, p1); + p0_h_in = (v8u16)__lsx_vilvh_b(zero, p0); + q0_h_in = (v8u16)__lsx_vilvh_b(zero, q0); + + tmp0_h = p7_h_in << 3; + tmp0_h -= p7_h_in; + tmp0_h += p6_h_in; + tmp0_h += q0_h_in; + tmp1_h = p6_h_in + p5_h_in; + tmp1_h += p4_h_in; + tmp1_h += p3_h_in; + tmp1_h += p2_h_in; + tmp1_h += p1_h_in; + tmp1_h += p0_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + + out_l = __lsx_vpickev_b(out_h, out_l); + p6 = __lsx_vbitsel_v(p6, out_l, flat2); + __lsx_vst(p6, dst, 0); + + /* p5 */ + q1_l_in = (v8u16)__lsx_vilvl_b(zero, q1); + tmp0_l = p5_l_in - p6_l_in; + tmp0_l += q1_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + q1_h_in = (v8u16)__lsx_vilvh_b(zero, q1); + tmp0_h = p5_h_in - p6_h_in; + tmp0_h += q1_h_in; + tmp0_h -= p7_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + out_l = __lsx_vpickev_b(out_h, out_l); + p5 = __lsx_vbitsel_v(p5, out_l, flat2); + __lsx_vst(p5, dst, 16); + + /* p4 */ + q2_l_in = (v8u16)__lsx_vilvl_b(zero, q2); + tmp0_l = p4_l_in - p5_l_in; + tmp0_l += q2_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + q2_h_in = (v8u16)__lsx_vilvh_b(zero, q2); + tmp0_h = p4_h_in - p5_h_in; + tmp0_h += q2_h_in; + tmp0_h -= p7_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + out_l = __lsx_vpickev_b(out_h, out_l); + p4 = __lsx_vbitsel_v(p4, out_l, flat2); + __lsx_vst(p4, dst, 16 * 2); + + /* p3 */ + q3_l_in = (v8u16)__lsx_vilvl_b(zero, q3); + tmp0_l = p3_l_in - p4_l_in; + tmp0_l += q3_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + q3_h_in = (v8u16)__lsx_vilvh_b(zero, q3); + tmp0_h = p3_h_in - p4_h_in; + tmp0_h += q3_h_in; + tmp0_h -= p7_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + out_l = __lsx_vpickev_b(out_h, out_l); + p3 = __lsx_vbitsel_v(p3, out_l, flat2); + __lsx_vst(p3, dst, 16 * 3); + + /* p2 */ + q4_l_in = (v8u16)__lsx_vilvl_b(zero, q4); + filter8 = __lsx_vld(filter48, 0); + tmp0_l = p2_l_in - p3_l_in; + tmp0_l += q4_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + q4_h_in = (v8u16)__lsx_vilvh_b(zero, q4); + tmp0_h = p2_h_in - p3_h_in; + tmp0_h += q4_h_in; + tmp0_h -= p7_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + out_l = __lsx_vpickev_b(out_h, out_l); + filter8 = __lsx_vbitsel_v(filter8, out_l, flat2); + __lsx_vst(filter8, dst, 16 * 4); + + /* p1 */ + q5_l_in = (v8u16)__lsx_vilvl_b(zero, q5); + filter8 = __lsx_vld(filter48, 16); + tmp0_l = p1_l_in - p2_l_in; + tmp0_l += q5_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + q5_h_in = (v8u16)__lsx_vilvh_b(zero, q5); + tmp0_h = p1_h_in - p2_h_in; + tmp0_h += q5_h_in; + tmp0_h -= p7_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)(tmp1_h), 4); + out_l = __lsx_vpickev_b(out_h, out_l); + filter8 = __lsx_vbitsel_v(filter8, out_l, flat2); + __lsx_vst(filter8, dst, 16 * 5); + + /* p0 */ + q6_l_in = (v8u16)__lsx_vilvl_b(zero, q6); + filter8 = __lsx_vld(filter48, 32); + tmp0_l = p0_l_in - p1_l_in; + tmp0_l += q6_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + q6_h_in = (v8u16)__lsx_vilvh_b(zero, q6); + tmp0_h = p0_h_in - p1_h_in; + tmp0_h += q6_h_in; + tmp0_h -= p7_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + out_l = __lsx_vpickev_b(out_h, out_l); + filter8 = __lsx_vbitsel_v(filter8, out_l, flat2); + __lsx_vst(filter8, dst, 16 * 6); + + /* q0 */ + q7_l_in = (v8u16)__lsx_vilvl_b(zero, q7); + filter8 = __lsx_vld(filter48, 48); + tmp0_l = q7_l_in - p0_l_in; + tmp0_l += q0_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + q7_h_in = (v8u16)__lsx_vilvh_b(zero, q7); + tmp0_h = q7_h_in - p0_h_in; + tmp0_h += q0_h_in; + tmp0_h -= p7_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + out_l = __lsx_vpickev_b(out_h, out_l); + filter8 = __lsx_vbitsel_v(filter8, out_l, flat2); + __lsx_vst(filter8, dst, 16 * 7); + + /* q1 */ + filter8 = __lsx_vld(filter48, 64); + tmp0_l = q7_l_in - q0_l_in; + tmp0_l += q1_l_in; + tmp0_l -= p6_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + tmp0_h = q7_h_in - q0_h_in; + tmp0_h += q1_h_in; + tmp0_h -= p6_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + out_l = __lsx_vpickev_b(out_h, out_l); + filter8 = __lsx_vbitsel_v(filter8, out_l, flat2); + __lsx_vst(filter8, dst, 16 * 8); + + /* q2 */ + filter8 = __lsx_vld(filter48, 80); + tmp0_l = q7_l_in - q1_l_in; + tmp0_l += q2_l_in; + tmp0_l -= p5_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + tmp0_h = q7_h_in - q1_h_in; + tmp0_h += q2_h_in; + tmp0_h -= p5_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + out_l = __lsx_vpickev_b(out_h, out_l); + filter8 = __lsx_vbitsel_v(filter8, out_l, flat2); + __lsx_vst(filter8, dst, 16 * 9); + + /* q3 */ + tmp0_l = q7_l_in - q2_l_in; + tmp0_l += q3_l_in; + tmp0_l -= p4_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + tmp0_h = q7_h_in - q2_h_in; + tmp0_h += q3_h_in; + tmp0_h -= p4_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + out_l = __lsx_vpickev_b(out_h, out_l); + q3 = __lsx_vbitsel_v(q3, out_l, flat2); + __lsx_vst(q3, dst, 16 * 10); + + /* q4 */ + tmp0_l = q7_l_in - q3_l_in; + tmp0_l += q4_l_in; + tmp0_l -= p3_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + tmp0_h = q7_h_in - q3_h_in; + tmp0_h += q4_h_in; + tmp0_h -= p3_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + out_l = __lsx_vpickev_b(out_h, out_l); + q4 = __lsx_vbitsel_v(q4, out_l, flat2); + __lsx_vst(q4, dst, 16 * 11); + + /* q5 */ + tmp0_l = q7_l_in - q4_l_in; + tmp0_l += q5_l_in; + tmp0_l -= p2_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + tmp0_h = q7_h_in - q4_h_in; + tmp0_h += q5_h_in; + tmp0_h -= p2_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + out_l = __lsx_vpickev_b(out_h, out_l); + q5 = __lsx_vbitsel_v(q5, out_l, flat2); + __lsx_vst(q5, dst, 16 * 12); + + /* q6 */ + tmp0_l = q7_l_in - q5_l_in; + tmp0_l += q6_l_in; + tmp0_l -= p1_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + tmp0_h = q7_h_in - q5_h_in; + tmp0_h += q6_h_in; + tmp0_h -= p1_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + out_l = __lsx_vpickev_b(out_h, out_l); + q6 = __lsx_vbitsel_v(q6, out_l, flat2); + __lsx_vst(q6, dst, 16 * 13); + + return 0; +} + +void vpx_lpf_vertical_16_dual_lsx(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + uint8_t early_exit = 0; + DECLARE_ALIGNED(16, uint8_t, transposed_input[16 * 24]); + uint8_t *filter48 = &transposed_input[16 * 16]; + + transpose_16x16((src - 8), pitch, &transposed_input[0], 16); + + early_exit = + vt_lpf_t4_and_t8_16w((transposed_input + 16 * 8), &filter48[0], src, + pitch, b_limit_ptr, limit_ptr, thresh_ptr); + + if (early_exit == 0) { + early_exit = + vt_lpf_t16_16w((transposed_input + 16 * 8), src, pitch, &filter48[0]); + + if (early_exit == 0) { + transpose_16x16(transposed_input, 16, (src - 8), pitch); + } + } +} diff --git a/vpx_dsp/loongarch/loopfilter_lsx.h b/vpx_dsp/loongarch/loopfilter_lsx.h new file mode 100644 index 0000000000..53e15fe6d5 --- /dev/null +++ b/vpx_dsp/loongarch/loopfilter_lsx.h @@ -0,0 +1,167 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_LOONGARCH_LOOPFILTER_LSX_H_ +#define VPX_VPX_DSP_LOONGARCH_LOOPFILTER_LSX_H_ + +#include "vpx_util/loongson_intrinsics.h" + +#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \ + limit_in, b_limit_in, thresh_in, hev_out, mask_out, \ + flat_out) \ + { \ + __m128i p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m; \ + __m128i p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m; \ + \ + /* absolute subtraction of pixel values */ \ + p3_asub_p2_m = __lsx_vabsd_bu(p3_in, p2_in); \ + p2_asub_p1_m = __lsx_vabsd_bu(p2_in, p1_in); \ + p1_asub_p0_m = __lsx_vabsd_bu(p1_in, p0_in); \ + q1_asub_q0_m = __lsx_vabsd_bu(q1_in, q0_in); \ + q2_asub_q1_m = __lsx_vabsd_bu(q2_in, q1_in); \ + q3_asub_q2_m = __lsx_vabsd_bu(q3_in, q2_in); \ + p0_asub_q0_m = __lsx_vabsd_bu(p0_in, q0_in); \ + p1_asub_q1_m = __lsx_vabsd_bu(p1_in, q1_in); \ + \ + /* calculation of hev */ \ + flat_out = __lsx_vmax_bu(p1_asub_p0_m, q1_asub_q0_m); \ + hev_out = __lsx_vslt_bu(thresh_in, flat_out); \ + \ + /* calculation of mask */ \ + p0_asub_q0_m = __lsx_vsadd_bu(p0_asub_q0_m, p0_asub_q0_m); \ + p1_asub_q1_m = __lsx_vsrli_b(p1_asub_q1_m, 1); \ + p0_asub_q0_m = __lsx_vsadd_bu(p0_asub_q0_m, p1_asub_q1_m); \ + mask_out = __lsx_vslt_bu(b_limit_in, p0_asub_q0_m); \ + mask_out = __lsx_vmax_bu(flat_out, mask_out); \ + p3_asub_p2_m = __lsx_vmax_bu(p3_asub_p2_m, p2_asub_p1_m); \ + mask_out = __lsx_vmax_bu(p3_asub_p2_m, mask_out); \ + q2_asub_q1_m = __lsx_vmax_bu(q2_asub_q1_m, q3_asub_q2_m); \ + mask_out = __lsx_vmax_bu(q2_asub_q1_m, mask_out); \ + \ + mask_out = __lsx_vslt_bu(limit_in, mask_out); \ + mask_out = __lsx_vxori_b(mask_out, 0xff); \ + } + +#define VP9_FLAT4(p3_in, p2_in, p0_in, q0_in, q2_in, q3_in, flat_out) \ + { \ + __m128i p2_asub_p0, q2_asub_q0, p3_asub_p0, q3_asub_q0; \ + __m128i flat4_tmp = __lsx_vldi(1); \ + \ + DUP4_ARG2(__lsx_vabsd_bu, p2_in, p0_in, q2_in, q0_in, p3_in, p0_in, q3_in, \ + q0_in, p2_asub_p0, q2_asub_q0, p3_asub_p0, q3_asub_q0); \ + p2_asub_p0 = __lsx_vmax_bu(p2_asub_p0, q2_asub_q0); \ + flat_out = __lsx_vmax_bu(p2_asub_p0, flat_out); \ + p3_asub_p0 = __lsx_vmax_bu(p3_asub_p0, q3_asub_q0); \ + flat_out = __lsx_vmax_bu(p3_asub_p0, flat_out); \ + \ + flat_out = __lsx_vslt_bu(flat4_tmp, flat_out); \ + flat_out = __lsx_vxori_b(flat_out, 0xff); \ + flat_out = flat_out & (mask); \ + } + +#define VP9_FLAT5(p7_in, p6_in, p5_in, p4_in, p0_in, q0_in, q4_in, q5_in, \ + q6_in, q7_in, flat_in, flat2_out) \ + { \ + __m128i flat5_tmp = __lsx_vldi(1); \ + __m128i p4_asub_p0, q4_asub_q0, p5_asub_p0, q5_asub_q0; \ + __m128i p6_asub_p0, q6_asub_q0, p7_asub_p0, q7_asub_q0; \ + DUP4_ARG2(__lsx_vabsd_bu, p4_in, p0_in, q4_in, q0_in, p5_in, p0_in, q5_in, \ + q0_in, p4_asub_p0, q4_asub_q0, p5_asub_p0, q5_asub_q0); \ + DUP4_ARG2(__lsx_vabsd_bu, p6_in, p0_in, q6_in, q0_in, p7_in, p0_in, q7_in, \ + q0_in, p6_asub_p0, q6_asub_q0, p7_asub_p0, q7_asub_q0); \ + \ + DUP2_ARG2(__lsx_vmax_bu, p4_asub_p0, q4_asub_q0, p5_asub_p0, q5_asub_q0, \ + p4_asub_p0, flat2_out); \ + flat2_out = __lsx_vmax_bu(p4_asub_p0, flat2_out); \ + p6_asub_p0 = __lsx_vmax_bu(p6_asub_p0, q6_asub_q0); \ + flat2_out = __lsx_vmax_bu(p6_asub_p0, flat2_out); \ + p7_asub_p0 = __lsx_vmax_bu(p7_asub_p0, q7_asub_q0); \ + flat2_out = __lsx_vmax_bu(p7_asub_p0, flat2_out); \ + flat2_out = __lsx_vslt_bu(flat5_tmp, flat2_out); \ + flat2_out = __lsx_vxori_b(flat2_out, 0xff); \ + flat2_out = flat2_out & flat_in; \ + } + +#define VP9_LPF_FILTER4_4W(p1_in, p0_in, q0_in, q1_in, mask, hev, p1_out, \ + p0_out, q0_out, q1_out) \ + { \ + __m128i p1_m, p0_m, q0_m, q1_m, filt, q0_sub_p0, t1, t2; \ + const __m128i cnst4b = __lsx_vldi(4); \ + const __m128i cnst3b = __lsx_vldi(3); \ + DUP4_ARG2(__lsx_vxori_b, p1_in, 0x80, p0_in, 0x80, q0_in, 0x80, q1_in, \ + 0x80, p1_m, p0_m, q0_m, q1_m); \ + filt = __lsx_vssub_b(p1_m, q1_m); \ + filt &= hev; \ + \ + q0_sub_p0 = __lsx_vssub_b(q0_m, p0_m); \ + filt = __lsx_vsadd_b(filt, q0_sub_p0); \ + filt = __lsx_vsadd_b(filt, q0_sub_p0); \ + filt = __lsx_vsadd_b(filt, q0_sub_p0); \ + filt &= mask; \ + DUP2_ARG2(__lsx_vsadd_b, filt, cnst4b, filt, cnst3b, t1, t2); \ + DUP2_ARG2(__lsx_vsrai_b, t1, 3, t2, 3, t1, t2); \ + \ + q0_m = __lsx_vssub_b(q0_m, t1); \ + p0_m = __lsx_vsadd_b(p0_m, t2); \ + DUP2_ARG2(__lsx_vxori_b, q0_m, 0x80, p0_m, 0x80, q0_out, p0_out); \ + \ + filt = __lsx_vsrari_b(t1, 1); \ + hev = __lsx_vxori_b(hev, 0xff); \ + filt &= hev; \ + q1_m = __lsx_vssub_b(q1_m, filt); \ + p1_m = __lsx_vsadd_b(p1_m, filt); \ + DUP2_ARG2(__lsx_vxori_b, q1_m, 0x80, p1_m, 0x80, q1_out, p1_out); \ + } + +#define VP9_FILTER8(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \ + p2_filt8_out, p1_filt8_out, p0_filt8_out, q0_filt8_out, \ + q1_filt8_out, q2_filt8_out) \ + { \ + __m128i tmp_filt8_0, tmp_filt8_1, tmp_filt8_2; \ + \ + tmp_filt8_2 = __lsx_vadd_h(p2_in, p1_in); \ + tmp_filt8_2 = __lsx_vadd_h(tmp_filt8_2, p0_in); \ + tmp_filt8_0 = __lsx_vslli_h(p3_in, 1); \ + \ + tmp_filt8_0 = __lsx_vadd_h(tmp_filt8_0, tmp_filt8_2); \ + tmp_filt8_0 = __lsx_vadd_h(tmp_filt8_0, q0_in); \ + tmp_filt8_1 = __lsx_vadd_h(tmp_filt8_0, p3_in); \ + tmp_filt8_1 = __lsx_vadd_h(tmp_filt8_1, p2_in); \ + p2_filt8_out = __lsx_vsrari_h(tmp_filt8_1, 3); \ + \ + tmp_filt8_1 = __lsx_vadd_h(tmp_filt8_0, p1_in); \ + tmp_filt8_1 = __lsx_vadd_h(tmp_filt8_1, q1_in); \ + p1_filt8_out = __lsx_vsrari_h(tmp_filt8_1, 3); \ + \ + tmp_filt8_1 = __lsx_vadd_h(q2_in, q1_in); \ + tmp_filt8_1 = __lsx_vadd_h(tmp_filt8_1, q0_in); \ + tmp_filt8_2 = __lsx_vadd_h(tmp_filt8_2, tmp_filt8_1); \ + tmp_filt8_0 = __lsx_vadd_h(tmp_filt8_2, p0_in); \ + tmp_filt8_0 = __lsx_vadd_h(tmp_filt8_0, p3_in); \ + p0_filt8_out = __lsx_vsrari_h(tmp_filt8_0, 3); \ + \ + tmp_filt8_0 = __lsx_vadd_h(q2_in, q3_in); \ + tmp_filt8_0 = __lsx_vadd_h(p0_in, tmp_filt8_0); \ + tmp_filt8_0 = __lsx_vadd_h(tmp_filt8_0, tmp_filt8_1); \ + tmp_filt8_1 = __lsx_vadd_h(q3_in, q3_in); \ + tmp_filt8_1 = __lsx_vadd_h(tmp_filt8_1, tmp_filt8_0); \ + q2_filt8_out = __lsx_vsrari_h(tmp_filt8_1, 3); \ + \ + tmp_filt8_0 = __lsx_vadd_h(tmp_filt8_2, q3_in); \ + tmp_filt8_1 = __lsx_vadd_h(tmp_filt8_0, q0_in); \ + q0_filt8_out = __lsx_vsrari_h(tmp_filt8_1, 3); \ + \ + tmp_filt8_1 = __lsx_vsub_h(tmp_filt8_0, p2_in); \ + tmp_filt8_0 = __lsx_vadd_h(q1_in, q3_in); \ + tmp_filt8_1 = __lsx_vadd_h(tmp_filt8_0, tmp_filt8_1); \ + q1_filt8_out = __lsx_vsrari_h(tmp_filt8_1, 3); \ + } + +#endif // VPX_VPX_DSP_LOONGARCH_LOOPFILTER_LSX_H_ diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index c948e12a39..eb530db5a1 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -189,6 +189,8 @@ DSP_SRCS-$(HAVE_DSPR2) += mips/loopfilter_mb_dspr2.c DSP_SRCS-$(HAVE_DSPR2) += mips/loopfilter_mb_horiz_dspr2.c DSP_SRCS-$(HAVE_DSPR2) += mips/loopfilter_mb_vert_dspr2.c +DSP_SRCS-$(HAVE_LSX) += loongarch/loopfilter_16_lsx.c + ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_NEON) += arm/highbd_loopfilter_neon.c DSP_SRCS-$(HAVE_SSE2) += x86/highbd_loopfilter_sse2.c diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index c721e190b0..9cd58d3b80 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -442,7 +442,7 @@ () specialize qw/vpx_lpf_vertical_16 sse2 neon dspr2 msa/; add_proto qw/void vpx_lpf_vertical_16_dual/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; -specialize qw/vpx_lpf_vertical_16_dual sse2 neon dspr2 msa/; +specialize qw/vpx_lpf_vertical_16_dual sse2 neon dspr2 msa lsx/; add_proto qw/void vpx_lpf_vertical_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; specialize qw/vpx_lpf_vertical_8 sse2 neon dspr2 msa/; @@ -460,7 +460,7 @@ () specialize qw/vpx_lpf_horizontal_16 sse2 avx2 neon dspr2 msa/; add_proto qw/void vpx_lpf_horizontal_16_dual/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; -specialize qw/vpx_lpf_horizontal_16_dual sse2 avx2 neon dspr2 msa/; +specialize qw/vpx_lpf_horizontal_16_dual sse2 avx2 neon dspr2 msa lsx/; add_proto qw/void vpx_lpf_horizontal_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; specialize qw/vpx_lpf_horizontal_8 sse2 neon dspr2 msa/; From 1365e7e1a56f0a9af5fbd247a973206484bc8e2b Mon Sep 17 00:00:00 2001 From: Marco Paniconi Date: Tue, 1 Mar 2022 23:03:27 -0800 Subject: [PATCH 0289/1000] vp9-svc: Remove VP9E_SET_TEMPORAL_LAYERING_MODE The control was never implemented, no need to keep this. temporal_layering_mode is set in the config. Bug: webm:1753 Change-Id: I9a6eb50e82344605ab62775911783af82ac2d401 --- vpx/vp8cx.h | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h index 6b02aa8657..5665a5f036 100644 --- a/vpx/vp8cx.h +++ b/vpx/vp8cx.h @@ -494,18 +494,6 @@ enum vp8e_enc_control_id { */ VP9E_SET_COLOR_SPACE, - /*!\brief Codec control function to set temporal layering mode. - * \note Valid ranges: 0..3, default is "0" - * (VP9E_TEMPORAL_LAYERING_MODE_NOLAYERING). - * 0 = VP9E_TEMPORAL_LAYERING_MODE_NOLAYERING - * 1 = VP9E_TEMPORAL_LAYERING_MODE_BYPASS - * 2 = VP9E_TEMPORAL_LAYERING_MODE_0101 - * 3 = VP9E_TEMPORAL_LAYERING_MODE_0212 - * - * Supported in codecs: VP9 - */ - VP9E_SET_TEMPORAL_LAYERING_MODE, - /*!\brief Codec control function to set minimum interval between GF/ARF frames * * By default the value is set as 4. @@ -1026,9 +1014,6 @@ VPX_CTRL_USE_TYPE(VP9E_REGISTER_CX_CALLBACK, void *) #define VPX_CTRL_VP9E_REGISTER_CX_CALLBACK VPX_CTRL_USE_TYPE(VP9E_SET_COLOR_SPACE, int) #define VPX_CTRL_VP9E_SET_COLOR_SPACE -VPX_CTRL_USE_TYPE(VP9E_SET_TEMPORAL_LAYERING_MODE, - int) /* VP9E_TEMPORAL_LAYERING_MODE */ -#define VPX_CTRL_VP9E_SET_TEMPORAL_LAYERING_MODE VPX_CTRL_USE_TYPE(VP9E_SET_MIN_GF_INTERVAL, unsigned int) #define VPX_CTRL_VP9E_SET_MIN_GF_INTERVAL VPX_CTRL_USE_TYPE(VP9E_SET_MAX_GF_INTERVAL, unsigned int) From 624b1367004801639ed35759d5f1759a092c8410 Mon Sep 17 00:00:00 2001 From: yuanhecai Date: Tue, 1 Mar 2022 09:48:13 +0800 Subject: [PATCH 0290/1000] vp9[loongarch]: Optimize horizontal/vertical_8_c 1. vpx_lpf_vertical_8_lsx 2. vpx_lpf_horizontal_8_lsx Bug: webm:1755 Change-Id: I6b05d6b1b2ac4d2a75beb9c9ca9700976fc3af55 --- test/lpf_test.cc | 12 ++ vpx_dsp/loongarch/loopfilter_8_lsx.c | 199 +++++++++++++++++++++++++++ vpx_dsp/vpx_dsp.mk | 2 + vpx_dsp/vpx_dsp_rtcd_defs.pl | 4 +- 4 files changed, 215 insertions(+), 2 deletions(-) create mode 100644 vpx_dsp/loongarch/loopfilter_8_lsx.c diff --git a/test/lpf_test.cc b/test/lpf_test.cc index 62c6f30a07..833dfb9a89 100644 --- a/test/lpf_test.cc +++ b/test/lpf_test.cc @@ -692,4 +692,16 @@ INSTANTIATE_TEST_SUITE_P( &vpx_lpf_vertical_8_dual_c, 8))); #endif // HAVE_MSA && (!CONFIG_VP9_HIGHBITDEPTH) +#if HAVE_LSX && (!CONFIG_VP9_HIGHBITDEPTH) +INSTANTIATE_TEST_SUITE_P( + LSX, Loop8Test6Param, + ::testing::Values( + make_tuple(&vpx_lpf_horizontal_8_lsx, &vpx_lpf_horizontal_8_c, 8), + make_tuple(&vpx_lpf_horizontal_16_dual_lsx, + &vpx_lpf_horizontal_16_dual_c, 8), + make_tuple(&vpx_lpf_vertical_8_lsx, &vpx_lpf_vertical_8_c, 8), + make_tuple(&vpx_lpf_vertical_16_dual_lsx, &vpx_lpf_vertical_16_dual_c, + 8))); +#endif // HAVE_LSX && (!CONFIG_VP9_HIGHBITDEPTH) + } // namespace diff --git a/vpx_dsp/loongarch/loopfilter_8_lsx.c b/vpx_dsp/loongarch/loopfilter_8_lsx.c new file mode 100644 index 0000000000..facf6f30ec --- /dev/null +++ b/vpx_dsp/loongarch/loopfilter_8_lsx.c @@ -0,0 +1,199 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/loongarch/loopfilter_lsx.h" + +void vpx_lpf_horizontal_8_lsx(uint8_t *dst, int32_t stride, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + __m128i mask, hev, flat, thresh, b_limit, limit; + __m128i p3, p2, p1, p0, q3, q2, q1, q0; + __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; + __m128i p2_filter8, p1_filter8, p0_filter8; + __m128i q0_filter8, q1_filter8, q2_filter8; + __m128i p3_l, p2_l, p1_l, p0_l, q3_l, q2_l, q1_l, q0_l; + __m128i zero = __lsx_vldi(0); + + int32_t stride2 = stride << 1; + int32_t stride3 = stride2 + stride; + int32_t stride4 = stride2 << 1; + + /* load vector elements */ + DUP4_ARG2(__lsx_vldx, dst, -stride4, dst, -stride3, dst, -stride2, dst, + -stride, p3, p2, p1, p0); + q0 = __lsx_vld(dst, 0); + DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2); + q3 = __lsx_vldx(dst, stride3); + + thresh = __lsx_vreplgr2vr_b(*thresh_ptr); + b_limit = __lsx_vreplgr2vr_b(*b_limit_ptr); + limit = __lsx_vreplgr2vr_b(*limit_ptr); + + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + + flat = __lsx_vilvl_d(zero, flat); + + if (__lsx_bz_v(flat)) { + __lsx_vstelm_d(p1_out, dst - stride2, 0, 0); + __lsx_vstelm_d(p0_out, dst - stride, 0, 0); + __lsx_vstelm_d(q0_out, dst, 0, 0); + __lsx_vstelm_d(q1_out, dst + stride, 0, 0); + } else { + DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, + p1_l, p0_l); + DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, + q2_l, q3_l); + VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filter8, + p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8); + + /* convert 16 bit output data into 8 bit */ + DUP4_ARG2(__lsx_vpickev_b, zero, p2_filter8, zero, p1_filter8, zero, + p0_filter8, zero, q0_filter8, p2_filter8, p1_filter8, p0_filter8, + q0_filter8); + DUP2_ARG2(__lsx_vpickev_b, zero, q1_filter8, zero, q2_filter8, q1_filter8, + q2_filter8); + DUP4_ARG3(__lsx_vbitsel_v, p2, p2_filter8, flat, p1_out, p1_filter8, flat, + p0_out, p0_filter8, flat, q0_out, q0_filter8, flat, p2_out, + p1_out, p0_out, q0_out); + DUP2_ARG3(__lsx_vbitsel_v, q1_out, q1_filter8, flat, q2, q2_filter8, flat, + q1_out, q2_out); + dst -= stride3; + + __lsx_vstelm_d(p2_out, dst, 0, 0); + __lsx_vstelm_d(p1_out, dst + stride, 0, 0); + __lsx_vstelm_d(p0_out, dst + stride2, 0, 0); + __lsx_vstelm_d(q0_out, dst + stride3, 0, 0); + + dst += stride4; + __lsx_vstelm_d(q1_out, dst, 0, 0); + dst += stride; + __lsx_vstelm_d(q2_out, dst, 0, 0); + } +} + +void vpx_lpf_vertical_8_lsx(uint8_t *dst, int32_t stride, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + __m128i p3, p2, p1, p0, q3, q2, q1, q0; + __m128i p1_out, p0_out, q0_out, q1_out; + __m128i flat, mask, hev, thresh, b_limit, limit; + __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l; + __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l; + __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l; + __m128i vec0, vec1, vec2, vec3, vec4; + __m128i zero = __lsx_vldi(0); + + int32_t stride2 = stride << 1; + int32_t stride3 = stride2 + stride; + int32_t stride4 = stride2 << 1; + uint8_t *dst_tmp = dst - 4; + + /* load vector elements */ + p3 = __lsx_vld(dst_tmp, 0); + DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, p2, p1); + p0 = __lsx_vldx(dst_tmp, stride3); + dst_tmp += stride4; + q0 = __lsx_vld(dst_tmp, 0); + DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, q1, q2); + q3 = __lsx_vldx(dst_tmp, stride3); + + LSX_TRANSPOSE8x8_B(p3, p2, p1, p0, q0, q1, q2, q3, p3, p2, p1, p0, q0, q1, q2, + q3); + + thresh = __lsx_vreplgr2vr_b(*thresh_ptr); + b_limit = __lsx_vreplgr2vr_b(*b_limit_ptr); + limit = __lsx_vreplgr2vr_b(*limit_ptr); + + /* mask and hev */ + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + /* flat4 */ + VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); + /* filter4 */ + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + + flat = __lsx_vilvl_d(zero, flat); + + /* if flat is zero for all pixels, then no need to calculate other filter */ + if (__lsx_bz_v(flat)) { + /* Store 4 pixels p1-_q1 */ + DUP2_ARG2(__lsx_vilvl_b, p0_out, p1_out, q1_out, q0_out, vec0, vec1); + vec2 = __lsx_vilvl_h(vec1, vec0); + vec3 = __lsx_vilvh_h(vec1, vec0); + + dst -= 2; + __lsx_vstelm_w(vec2, dst, 0, 0); + __lsx_vstelm_w(vec2, dst + stride, 0, 1); + __lsx_vstelm_w(vec2, dst + stride2, 0, 2); + __lsx_vstelm_w(vec2, dst + stride3, 0, 3); + dst += stride4; + __lsx_vstelm_w(vec3, dst, 0, 0); + __lsx_vstelm_w(vec3, dst + stride, 0, 1); + __lsx_vstelm_w(vec3, dst + stride2, 0, 2); + __lsx_vstelm_w(vec3, dst + stride3, 0, 3); + } else { + DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, + p1_l, p0_l); + DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, + q2_l, q3_l); + VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l, + p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l); + /* convert 16 bit output data into 8 bit */ + DUP4_ARG2(__lsx_vpickev_b, p2_filt8_l, p2_filt8_l, p1_filt8_l, p1_filt8_l, + p0_filt8_l, p0_filt8_l, q0_filt8_l, q0_filt8_l, p2_filt8_l, + p1_filt8_l, p0_filt8_l, q0_filt8_l); + DUP2_ARG2(__lsx_vpickev_b, q1_filt8_l, q1_filt8_l, q2_filt8_l, q2_filt8_l, + q1_filt8_l, q2_filt8_l); + /* store pixel values */ + p2 = __lsx_vbitsel_v(p2, p2_filt8_l, flat); + p1 = __lsx_vbitsel_v(p1_out, p1_filt8_l, flat); + p0 = __lsx_vbitsel_v(p0_out, p0_filt8_l, flat); + q0 = __lsx_vbitsel_v(q0_out, q0_filt8_l, flat); + q1 = __lsx_vbitsel_v(q1_out, q1_filt8_l, flat); + q2 = __lsx_vbitsel_v(q2, q2_filt8_l, flat); + + /* Store 6 pixels p2-_q2 */ + DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, vec0, vec1); + vec2 = __lsx_vilvl_h(vec1, vec0); + vec3 = __lsx_vilvh_h(vec1, vec0); + vec4 = __lsx_vilvl_b(q2, q1); + + dst -= 3; + __lsx_vstelm_w(vec2, dst, 0, 0); + __lsx_vstelm_h(vec4, dst, 4, 0); + dst += stride; + __lsx_vstelm_w(vec2, dst, 0, 1); + __lsx_vstelm_h(vec4, dst, 4, 1); + dst += stride; + __lsx_vstelm_w(vec2, dst, 0, 2); + __lsx_vstelm_h(vec4, dst, 4, 2); + dst += stride; + __lsx_vstelm_w(vec2, dst, 0, 3); + __lsx_vstelm_h(vec4, dst, 4, 3); + dst += stride; + __lsx_vstelm_w(vec3, dst, 0, 0); + __lsx_vstelm_h(vec4, dst, 4, 4); + dst += stride; + __lsx_vstelm_w(vec3, dst, 0, 1); + __lsx_vstelm_h(vec4, dst, 4, 5); + dst += stride; + __lsx_vstelm_w(vec3, dst, 0, 2); + __lsx_vstelm_h(vec4, dst, 4, 6); + dst += stride; + __lsx_vstelm_w(vec3, dst, 0, 3); + __lsx_vstelm_h(vec4, dst, 4, 7); + } +} diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index eb530db5a1..976c652729 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -189,7 +189,9 @@ DSP_SRCS-$(HAVE_DSPR2) += mips/loopfilter_mb_dspr2.c DSP_SRCS-$(HAVE_DSPR2) += mips/loopfilter_mb_horiz_dspr2.c DSP_SRCS-$(HAVE_DSPR2) += mips/loopfilter_mb_vert_dspr2.c +DSP_SRCS-$(HAVE_LSX) += loongarch/loopfilter_lsx.h DSP_SRCS-$(HAVE_LSX) += loongarch/loopfilter_16_lsx.c +DSP_SRCS-$(HAVE_LSX) += loongarch/loopfilter_8_lsx.c ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_NEON) += arm/highbd_loopfilter_neon.c diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 9cd58d3b80..ce0780fdab 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -445,7 +445,7 @@ () specialize qw/vpx_lpf_vertical_16_dual sse2 neon dspr2 msa lsx/; add_proto qw/void vpx_lpf_vertical_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; -specialize qw/vpx_lpf_vertical_8 sse2 neon dspr2 msa/; +specialize qw/vpx_lpf_vertical_8 sse2 neon dspr2 msa lsx/; add_proto qw/void vpx_lpf_vertical_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; specialize qw/vpx_lpf_vertical_8_dual sse2 neon dspr2 msa/; @@ -463,7 +463,7 @@ () specialize qw/vpx_lpf_horizontal_16_dual sse2 avx2 neon dspr2 msa lsx/; add_proto qw/void vpx_lpf_horizontal_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; -specialize qw/vpx_lpf_horizontal_8 sse2 neon dspr2 msa/; +specialize qw/vpx_lpf_horizontal_8 sse2 neon dspr2 msa lsx/; add_proto qw/void vpx_lpf_horizontal_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; specialize qw/vpx_lpf_horizontal_8_dual sse2 neon dspr2 msa/; From 642529248f873d9da8b86e368d9e3af85a2a77a3 Mon Sep 17 00:00:00 2001 From: Johann Date: Sun, 13 Mar 2022 06:28:16 +0900 Subject: [PATCH 0291/1000] ads2gas[_apple].pl: remove unused stanzas Many of the features in ads2gas are no longer used. Remove all patterns which are no longer used in libvpx. Simplify between the two to minimize differences. Change-Id: Ia1151eb8b694cbe51845a1374a876cc7b798899c --- build/make/ads2gas.pl | 122 +++++-------------------------- build/make/ads2gas_apple.pl | 113 ++++++---------------------- build/make/thumb.pm | 5 +- vpx_dsp/arm/idct4x4_add_neon.asm | 2 +- 4 files changed, 42 insertions(+), 200 deletions(-) diff --git a/build/make/ads2gas.pl b/build/make/ads2gas.pl index b6a8f53eae..4b7a906d26 100755 --- a/build/make/ads2gas.pl +++ b/build/make/ads2gas.pl @@ -42,39 +42,11 @@ while () { - undef $comment; - undef $line; - $comment_char = ";"; - $comment_sub = "@"; - - # Handle comments. - if (/$comment_char/) - { - $comment = ""; - ($line, $comment) = /(.*?)$comment_char(.*)/; - $_ = $line; - } - # Load and store alignment s/@/,:/g; - # Hexadecimal constants prefaced by 0x - s/#&/#0x/g; - - # Convert :OR: to | - s/:OR:/ | /g; - - # Convert :AND: to & - s/:AND:/ & /g; - - # Convert :NOT: to ~ - s/:NOT:/ ~ /g; - - # Convert :SHL: to << - s/:SHL:/ << /g; - - # Convert :SHR: to >> - s/:SHR:/ >> /g; + # Comment character + s/;/@/; # Convert ELSE to .else s/\bELSE\b/.else/g; @@ -82,82 +54,31 @@ # Convert ENDIF to .endif s/\bENDIF\b/.endif/g; - # Convert ELSEIF to .elseif - s/\bELSEIF\b/.elseif/g; - - # Convert LTORG to .ltorg - s/\bLTORG\b/.ltorg/g; - - # Convert endfunc to nothing. - s/\bendfunc\b//ig; - - # Convert FUNCTION to nothing. - s/\bFUNCTION\b//g; - s/\bfunction\b//g; - - s/\bENTRY\b//g; - s/\bMSARMASM\b/0/g; - s/^\s+end\s+$//g; - - # Convert IF :DEF:to .if - # gcc doesn't have the ability to do a conditional - # if defined variable that is set by IF :DEF: on - # armasm, so convert it to a normal .if and then - # make sure to define a value elesewhere - if (s/\bIF :DEF:\b/.if /g) - { - s/=/==/g; - } - # Convert IF to .if - if (s/\bIF\b/.if/g) - { + if (s/\bIF\b/.if/g) { s/=+/==/g; } # Convert INCLUDE to .INCLUDE "file" s/INCLUDE(\s*)(.*)$/.include $1\"$2\"/; - # Code directive (ARM vs Thumb) - s/CODE([0-9][0-9])/.code $1/; - # No AREA required # But ALIGNs in AREA must be obeyed s/^\s*AREA.*ALIGN=([0-9])$/.text\n.p2align $1/; # If no ALIGN, strip the AREA and align to 4 bytes s/^\s*AREA.*$/.text\n.p2align 2/; - # DCD to .word - # This one is for incoming symbols - s/DCD\s+\|(\w*)\|/.long $1/; - - # DCW to .short - s/DCW\s+\|(\w*)\|/.short $1/; - s/DCW(.*)/.short $1/; - - # Constants defined in scope - s/DCD(.*)/.long $1/; - s/DCB(.*)/.byte $1/; - - # Make function visible to linker, and make additional symbol with - # prepended underscore + # Make function visible to linker. if ($elf) { s/EXPORT\s+\|([\$\w]*)\|/.global $1 \n\t.type $1, function/; } else { s/EXPORT\s+\|([\$\w]*)\|/.global $1/; } - s/IMPORT\s+\|([\$\w]*)\|/.global $1/; - - s/EXPORT\s+([\$\w]*)/.global $1/; - s/export\s+([\$\w]*)/.global $1/; - # No vertical bars required; make additional symbol with prepended - # underscore - s/^\|(\$?\w+)\|/_$1\n\t$1:/g; + # No vertical bars on function names + s/^\|(\$?\w+)\|/$1/g; # Labels need trailing colon -# s/^(\w+)/$1:/ if !/EQU/; - # put the colon at the end of the line in the macro s/^([a-zA-Z_0-9\$]+)/$1:/ if !/EQU/; # ALIGN directive @@ -165,7 +86,7 @@ if ($thumb) { # ARM code - we force everything to thumb with the declaration in the header - s/\sARM//g; + s/\s+ARM//g; } else { # ARM code s/\sARM/.arm/g; @@ -175,12 +96,8 @@ s/(push\s+)(r\d+)/stmdb sp\!, \{$2\}/g; s/(pop\s+)(r\d+)/ldmia sp\!, \{$2\}/g; - # NEON code - s/(vld1.\d+\s+)(q\d+)/$1\{$2\}/g; - s/(vtbl.\d+\s+[^,]+),([^,]+)/$1,\{$2\}/g; - if ($thumb) { - thumb::FixThumbInstructions($_, 0); + thumb::FixThumbInstructions($_); } # eabi_attributes numerical equivalents can be found in the @@ -193,22 +110,21 @@ # PRESERVE8 Stack 8-byte align is preserved s/\sPRESERVE8/.eabi_attribute 25, 1 \@Tag_ABI_align_preserved/g; } else { - s/\sREQUIRE8//; - s/\sPRESERVE8//; + s/\s+REQUIRE8//; + s/\s+PRESERVE8//; } # Use PROC and ENDP to give the symbols a .size directive. # This makes them show up properly in debugging tools like gdb and valgrind. - if (/\bPROC\b/) - { + if (/\bPROC\b/) { my $proc; /^_([\.0-9A-Z_a-z]\w+)\b/; $proc = $1; push(@proc_stack, $proc) if ($proc); s/\bPROC\b/@ $&/; } - if (/\bENDP\b/) - { + + if (/\bENDP\b/) { my $proc; s/\bENDP\b/@ $&/; $proc = pop(@proc_stack); @@ -220,18 +136,18 @@ # Begin macro definition if (/\bMACRO\b/) { + # Process next line down, which will be the macro definition $_ = ; s/^/.macro/; - s/\$//g; # remove formal param reference - s/;/@/g; # change comment characters + s/\$//g; # Remove $ from the variables in the declaration } - # For macros, use \ to reference formal params - s/\$/\\/g; # End macro definition - s/\bMEND\b/.endm/; # No need to tell it where to stop assembling + s/\$/\\/g; # Use \ to reference formal parameters + # End macro definition + + s/\bMEND\b/.endm/; # No need to tell it where to stop assembling next if /^\s*END\s*$/; print; - print "$comment_sub$comment\n" if defined $comment; } # Mark that this object doesn't need an executable stack. diff --git a/build/make/ads2gas_apple.pl b/build/make/ads2gas_apple.pl index 848872fa7d..0a3fccc4b0 100755 --- a/build/make/ads2gas_apple.pl +++ b/build/make/ads2gas_apple.pl @@ -22,15 +22,12 @@ print "@ using the ads2gas_apple.pl script.\n\n"; print "\t.syntax unified\n"; -my %register_aliases; my %macro_aliases; my @mapping_list = ("\$0", "\$1", "\$2", "\$3", "\$4", "\$5", "\$6", "\$7", "\$8", "\$9"); my @incoming_array; -my @imported_functions; - # Perl trim function to remove whitespace from the start and end of the string sub trim($) { @@ -46,25 +43,7 @@ ($) s/@/,:/g; # Comment character - s/;/ @/g; - - # Hexadecimal constants prefaced by 0x - s/#&/#0x/g; - - # Convert :OR: to | - s/:OR:/ | /g; - - # Convert :AND: to & - s/:AND:/ & /g; - - # Convert :NOT: to ~ - s/:NOT:/ ~ /g; - - # Convert :SHL: to << - s/:SHL:/ << /g; - - # Convert :SHR: to >> - s/:SHR:/ >> /g; + s/;/@/; # Convert ELSE to .else s/\bELSE\b/.else/g; @@ -72,100 +51,53 @@ ($) # Convert ENDIF to .endif s/\bENDIF\b/.endif/g; - # Convert ELSEIF to .elseif - s/\bELSEIF\b/.elseif/g; - - # Convert LTORG to .ltorg - s/\bLTORG\b/.ltorg/g; - - # Convert IF :DEF:to .if - # gcc doesn't have the ability to do a conditional - # if defined variable that is set by IF :DEF: on - # armasm, so convert it to a normal .if and then - # make sure to define a value elesewhere - if (s/\bIF :DEF:\b/.if /g) - { - s/=/==/g; - } - # Convert IF to .if - if (s/\bIF\b/.if/g) - { - s/=/==/g; + if (s/\bIF\b/.if/g) { + s/=+/==/g; } # Convert INCLUDE to .INCLUDE "file" s/INCLUDE(\s*)(.*)$/.include $1\"$2\"/; - # Code directive (ARM vs Thumb) - s/CODE([0-9][0-9])/.code $1/; - # No AREA required # But ALIGNs in AREA must be obeyed s/^\s*AREA.*ALIGN=([0-9])$/.text\n.p2align $1/; # If no ALIGN, strip the AREA and align to 4 bytes s/^\s*AREA.*$/.text\n.p2align 2/; - # DCD to .word - # This one is for incoming symbols - s/DCD\s+\|(\w*)\|/.long $1/; + # Make function visible to linker. + s/EXPORT\s+\|([\$\w]*)\|/.globl _$1/; - # DCW to .short - s/DCW\s+\|(\w*)\|/.short $1/; - s/DCW(.*)/.short $1/; + # No vertical bars on function names + s/^\|(\$?\w+)\|/$1/g; - # Constants defined in scope - s/DCD(.*)/.long $1/; - s/DCB(.*)/.byte $1/; + # Labels and functions need a leading underscore and trailing colon + s/^([a-zA-Z_0-9\$]+)/_$1:/ if !/EQU/; - # Make function visible to linker, and make additional symbol with - # prepended underscore - s/EXPORT\s+\|([\$\w]*)\|/.globl _$1\n\t.globl $1/; - - # Prepend imported functions with _ - if (s/IMPORT\s+\|([\$\w]*)\|/.globl $1/) - { - $function = trim($1); - push(@imported_functions, $function); - } - - foreach $function (@imported_functions) - { - s/$function/_$function/; - } - - # No vertical bars required; make additional symbol with prepended - # underscore - s/^\|(\$?\w+)\|/_$1\n\t$1:/g; - - # Labels need trailing colon -# s/^(\w+)/$1:/ if !/EQU/; - # put the colon at the end of the line in the macro - s/^([a-zA-Z_0-9\$]+)/$1:/ if !/EQU/; + # Branches need to call the correct, underscored, function + s/^(\s+b[egln]?[teq]?\s+)([a-zA-Z_0-9\$]+)/$1 _$2/ if !/EQU/; # ALIGN directive s/\bALIGN\b/.balign/g; # Strip ARM - s/\sARM/@ ARM/g; + s/\s+ARM//; # Strip REQUIRE8 - #s/\sREQUIRE8/@ REQUIRE8/g; - s/\sREQUIRE8/@ /g; + s/\s+REQUIRE8//; # Strip PRESERVE8 - s/\sPRESERVE8/@ PRESERVE8/g; + s/\s+PRESERVE8//; # Strip PROC and ENDPROC - s/\bPROC\b/@/g; - s/\bENDP\b/@/g; + s/\bPROC\b//g; + s/\bENDP\b//g; # EQU directive - s/(.*)EQU(.*)/.set $1, $2/; + s/(\S+\s+)EQU(\s+\S+)/.set $1, $2/; # Begin macro definition - if (/\bMACRO\b/) - { + if (/\bMACRO\b/) { # Process next line down, which will be the macro definition $_ = ; @@ -187,16 +119,13 @@ ($) next; } - while (($key, $value) = each(%macro_aliases)) - { + while (($key, $value) = each(%macro_aliases)) { $key =~ s/\$/\\\$/; s/$key\b/$value/g; } + # End macro definition - # For macros, use \ to reference formal params -# s/\$/\\/g; # End macro definition - s/\bMEND\b/.endm/; # No need to tell it where to stop assembling + s/\bMEND\b/.endm/; # No need to tell it where to stop assembling next if /^\s*END\s*$/; - print; } diff --git a/build/make/thumb.pm b/build/make/thumb.pm index 9c49e2d8b7..ef4b316771 100644 --- a/build/make/thumb.pm +++ b/build/make/thumb.pm @@ -11,11 +11,8 @@ package thumb; -sub FixThumbInstructions($$) +sub FixThumbInstructions($) { - my $short_branches = $_[1]; - my $branch_shift_offset = $short_branches ? 1 : 0; - # Write additions with shifts, such as "add r10, r11, lsl #8", # in three operand form, "add r10, r10, r11, lsl #8". s/(add\s+)(r\d+),\s*(r\d+),\s*(lsl #\d+)/$1$2, $2, $3, $4/g; diff --git a/vpx_dsp/arm/idct4x4_add_neon.asm b/vpx_dsp/arm/idct4x4_add_neon.asm index 184d218941..175ba7fbc2 100644 --- a/vpx_dsp/arm/idct4x4_add_neon.asm +++ b/vpx_dsp/arm/idct4x4_add_neon.asm @@ -17,7 +17,7 @@ INCLUDE vpx_dsp/arm/idct_neon.asm.S - AREA Block, CODE, READONLY ; name this block of code + AREA Block, CODE, READONLY ;void vpx_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int stride) ; ; r0 int16_t input From 4ee32be84be7dfa2b0c00ba04f4d85503d46e3f3 Mon Sep 17 00:00:00 2001 From: Johann Date: Sun, 13 Mar 2022 07:02:03 +0900 Subject: [PATCH 0292/1000] ads2gas_apple.pl: remove gcc-isms The gcc assembler was incompatible for a long time. It is now based on clang and accepts more modern syntax, although not enough to remove the script entirely. Change-Id: I667d29dca005ea02a995c1025c45eb844081f64b --- build/make/ads2gas_apple.pl | 26 ++++---------------------- 1 file changed, 4 insertions(+), 22 deletions(-) diff --git a/build/make/ads2gas_apple.pl b/build/make/ads2gas_apple.pl index 0a3fccc4b0..af10b436a9 100755 --- a/build/make/ads2gas_apple.pl +++ b/build/make/ads2gas_apple.pl @@ -94,35 +94,17 @@ ($) s/\bENDP\b//g; # EQU directive - s/(\S+\s+)EQU(\s+\S+)/.set $1, $2/; + s/(\S+\s+)EQU(\s+\S+)/.equ $1, $2/; # Begin macro definition if (/\bMACRO\b/) { # Process next line down, which will be the macro definition $_ = ; - - $trimmed = trim($_); - - # remove commas that are separating list - $trimmed =~ s/,//g; - - # string to array - @incoming_array = split(/\s+/, $trimmed); - - print ".macro @incoming_array[0]\n"; - - # remove the first element, as that is the name of the macro - shift (@incoming_array); - - @macro_aliases{@incoming_array} = @mapping_list; - - next; + s/^/.macro/; + s/\$//g; # Remove $ from the variables in the declaration } - while (($key, $value) = each(%macro_aliases)) { - $key =~ s/\$/\\\$/; - s/$key\b/$value/g; - } + s/\$/\\/g; # Use \ to reference formal parameters # End macro definition s/\bMEND\b/.endm/; # No need to tell it where to stop assembling From 220643c8627d158f75acbf7e1b7dcd9ae642261c Mon Sep 17 00:00:00 2001 From: yuanhecai Date: Tue, 1 Mar 2022 16:33:47 +0800 Subject: [PATCH 0293/1000] vp9[loongarch]: Optimize convolve8_horiz/vert/c 1. vpx_convolve8_lsx 2. vpx_convolve8_vert_lsx 3. vpx_convolve8_horiz_lsx Bug: webm:1755 Change-Id: I9897e1ed6a904ac74d1078bd22b275af44db142d --- test/convolve_test.cc | 13 + vpx_dsp/loongarch/vpx_convolve8_horiz_lsx.c | 852 +++++++++++++++++++ vpx_dsp/loongarch/vpx_convolve8_lsx.c | 718 ++++++++++++++++ vpx_dsp/loongarch/vpx_convolve8_vert_lsx.c | 870 ++++++++++++++++++++ vpx_dsp/loongarch/vpx_convolve_lsx.h | 117 +++ vpx_dsp/vpx_dsp.mk | 6 + vpx_dsp/vpx_dsp_rtcd_defs.pl | 6 +- 7 files changed, 2579 insertions(+), 3 deletions(-) create mode 100644 vpx_dsp/loongarch/vpx_convolve8_horiz_lsx.c create mode 100644 vpx_dsp/loongarch/vpx_convolve8_lsx.c create mode 100644 vpx_dsp/loongarch/vpx_convolve8_vert_lsx.c create mode 100644 vpx_dsp/loongarch/vpx_convolve_lsx.h diff --git a/test/convolve_test.cc b/test/convolve_test.cc index 4b2dadefac..94b2814842 100644 --- a/test/convolve_test.cc +++ b/test/convolve_test.cc @@ -1449,6 +1449,19 @@ INSTANTIATE_TEST_SUITE_P(MSA, ConvolveTest, ::testing::ValuesIn(kArrayConvolve8_msa)); #endif // HAVE_MSA +#if HAVE_LSX +const ConvolveFunctions convolve8_lsx( + vpx_convolve_copy_c, vpx_convolve_avg_c, vpx_convolve8_horiz_lsx, + vpx_convolve8_avg_horiz_c, vpx_convolve8_vert_lsx, vpx_convolve8_avg_vert_c, + vpx_convolve8_lsx, vpx_convolve8_avg_c, vpx_scaled_horiz_c, + vpx_scaled_avg_horiz_c, vpx_scaled_vert_c, vpx_scaled_avg_vert_c, + vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0); + +const ConvolveParam kArrayConvolve8_lsx[] = { ALL_SIZES(convolve8_lsx) }; +INSTANTIATE_TEST_SUITE_P(LSX, ConvolveTest, + ::testing::ValuesIn(kArrayConvolve8_lsx)); +#endif // HAVE_LSX + #if HAVE_VSX const ConvolveFunctions convolve8_vsx( vpx_convolve_copy_vsx, vpx_convolve_avg_vsx, vpx_convolve8_horiz_vsx, diff --git a/vpx_dsp/loongarch/vpx_convolve8_horiz_lsx.c b/vpx_dsp/loongarch/vpx_convolve8_horiz_lsx.c new file mode 100644 index 0000000000..3608fe326c --- /dev/null +++ b/vpx_dsp/loongarch/vpx_convolve8_horiz_lsx.c @@ -0,0 +1,852 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/loongarch/vpx_convolve_lsx.h" + +#define LSX_LD_4(_src, _stride, _src0, _src1, _src2, _src3) \ + { \ + _src0 = __lsx_vld(_src, 0); \ + _src += _stride; \ + _src1 = __lsx_vld(_src, 0); \ + _src += _stride; \ + _src2 = __lsx_vld(_src, 0); \ + _src += _stride; \ + _src3 = __lsx_vld(_src, 0); \ + } + +static const uint8_t mc_filt_mask_arr[16 * 3] = { + /* 8 width cases */ + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, + /* 4 width cases */ + 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20, + /* 4 width cases */ + 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28 +}; + +static void common_hz_8t_4x4_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter) { + __m128i src0, src1, src2, src3; + __m128i filter0, filter1, filter2, filter3; + __m128i mask0, mask1, mask2, mask3; + __m128i out, out0, out1; + + mask0 = __lsx_vld(mc_filt_mask_arr, 16); + src -= 3; + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + mask3 = __lsx_vaddi_bu(mask0, 6); + + LSX_LD_4(src, src_stride, src0, src1, src2, src3); + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, + filter0, filter1, filter2, filter3, out0, out1); + out = __lsx_vssrarni_b_h(out1, out0, 7); + out = __lsx_vxori_b(out, 128); + __lsx_vstelm_w(out, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(out, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(out, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(out, dst, 0, 3); +} + +static void common_hz_8t_4x8_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter) { + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride + src_stride2; + int32_t src_stride4 = src_stride2 << 1; + __m128i src0, src1, src2, src3; + __m128i filter0, filter1, filter2, filter3; + __m128i mask0, mask1, mask2, mask3; + __m128i out0, out1, out2, out3; + uint8_t *_src = (uint8_t *)src - 3; + + mask0 = __lsx_vld(mc_filt_mask_arr, 16); + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + mask3 = __lsx_vaddi_bu(mask0, 6); + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + + src0 = __lsx_vld(_src, 0); + DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2); + src3 = __lsx_vldx(_src, src_stride3); + _src += src_stride4; + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, + filter0, filter1, filter2, filter3, out0, out1); + src0 = __lsx_vld(_src, 0); + DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2); + src3 = __lsx_vldx(_src, src_stride3); + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, + filter0, filter1, filter2, filter3, out2, out3); + DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1); + DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); + + __lsx_vstelm_w(out0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 3); + dst += dst_stride; + __lsx_vstelm_w(out1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(out1, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(out1, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(out1, dst, 0, 3); +} + +static void common_hz_8t_4w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + if (height == 4) { + common_hz_8t_4x4_lsx(src, src_stride, dst, dst_stride, filter); + } else if (height == 8) { + common_hz_8t_4x8_lsx(src, src_stride, dst, dst_stride, filter); + } +} + +static void common_hz_8t_8x4_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter) { + __m128i src0, src1, src2, src3; + __m128i filter0, filter1, filter2, filter3; + __m128i mask0, mask1, mask2, mask3; + __m128i out0, out1, out2, out3; + + mask0 = __lsx_vld(mc_filt_mask_arr, 0); + src -= 3; + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + mask3 = __lsx_vaddi_bu(mask0, 6); + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + + LSX_LD_4(src, src_stride, src0, src1, src2, src3); + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, + filter0, filter1, filter2, filter3, out0, out1, + out2, out3); + DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1); + DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); + __lsx_vstelm_d(out0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(out0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(out1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(out1, dst, 0, 1); +} + +static void common_hz_8t_8x8mult_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + uint32_t loop_cnt = height >> 2; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride + src_stride2; + int32_t src_stride4 = src_stride2 << 1; + __m128i src0, src1, src2, src3; + __m128i filter0, filter1, filter2, filter3; + __m128i mask0, mask1, mask2, mask3; + __m128i out0, out1, out2, out3; + uint8_t *_src = (uint8_t *)src - 3; + + mask0 = __lsx_vld(mc_filt_mask_arr, 0); + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + mask3 = __lsx_vaddi_bu(mask0, 6); + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + + for (; loop_cnt--;) { + src0 = __lsx_vld(_src, 0); + DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2); + src3 = __lsx_vldx(_src, src_stride3); + _src += src_stride4; + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filter0, filter1, filter2, filter3, out0, + out1, out2, out3); + DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1); + DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); + __lsx_vstelm_d(out0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(out0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(out1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(out1, dst, 0, 1); + dst += dst_stride; + } +} + +static void common_hz_8t_8w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + if (height == 4) { + common_hz_8t_8x4_lsx(src, src_stride, dst, dst_stride, filter); + } else { + common_hz_8t_8x8mult_lsx(src, src_stride, dst, dst_stride, filter, height); + } +} + +static void common_hz_8t_16w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + uint32_t loop_cnt = height >> 1; + int32_t stride = src_stride << 1; + __m128i src0, src1, src2, src3; + __m128i filter0, filter1, filter2, filter3; + __m128i mask0, mask1, mask2, mask3; + __m128i out0, out1, out2, out3; + + mask0 = __lsx_vld(mc_filt_mask_arr, 0); + src -= 3; + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + mask3 = __lsx_vaddi_bu(mask0, 6); + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + + for (; loop_cnt--;) { + const uint8_t *_src = src + src_stride; + DUP2_ARG2(__lsx_vld, src, 0, _src, 0, src0, src2); + DUP2_ARG2(__lsx_vld, src, 8, _src, 8, src1, src3); + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filter0, filter1, filter2, filter3, out0, + out1, out2, out3); + DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1); + DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); + __lsx_vst(out0, dst, 0); + dst += dst_stride; + __lsx_vst(out1, dst, 0); + dst += dst_stride; + src += stride; + } +} + +static void common_hz_8t_32w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + uint32_t loop_cnt = height >> 1; + __m128i src0, src1, src2, src3; + __m128i filter0, filter1, filter2, filter3; + __m128i mask0, mask1, mask2, mask3; + __m128i out0, out1, out2, out3; + __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; + + mask0 = __lsx_vld(mc_filt_mask_arr, 0); + src -= 3; + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + mask3 = __lsx_vaddi_bu(mask0, 6); + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + + for (; loop_cnt--;) { + DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src2); + src3 = __lsx_vld(src, 24); + src1 = __lsx_vshuf_b(src2, src0, shuff); + src += src_stride; + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filter0, filter1, filter2, filter3, out0, + out1, out2, out3); + DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1); + DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); + __lsx_vst(out0, dst, 0); + __lsx_vst(out1, dst, 16); + + DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src2); + src3 = __lsx_vld(src, 24); + src1 = __lsx_vshuf_b(src2, src0, shuff); + src += src_stride; + + dst += dst_stride; + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filter0, filter1, filter2, filter3, out0, + out1, out2, out3); + DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1); + DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); + __lsx_vst(out0, dst, 0); + __lsx_vst(out1, dst, 16); + dst += dst_stride; + } +} + +static void common_hz_8t_64w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + int32_t loop_cnt = height; + __m128i src0, src1, src2, src3; + __m128i filter0, filter1, filter2, filter3; + __m128i mask0, mask1, mask2, mask3; + __m128i out0, out1, out2, out3; + __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; + + mask0 = __lsx_vld(mc_filt_mask_arr, 0); + src -= 3; + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + mask3 = __lsx_vaddi_bu(mask0, 6); + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + + for (; loop_cnt--;) { + DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src2); + src3 = __lsx_vld(src, 24); + src1 = __lsx_vshuf_b(src2, src0, shuff); + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filter0, filter1, filter2, filter3, out0, + out1, out2, out3); + DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1); + DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); + __lsx_vst(out0, dst, 0); + __lsx_vst(out1, dst, 16); + + DUP2_ARG2(__lsx_vld, src, 32, src, 48, src0, src2); + src3 = __lsx_vld(src, 56); + src1 = __lsx_vshuf_b(src2, src0, shuff); + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filter0, filter1, filter2, filter3, out0, + out1, out2, out3); + DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1); + DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); + __lsx_vst(out0, dst, 32); + __lsx_vst(out1, dst, 48); + src += src_stride; + dst += dst_stride; + } +} + +static void common_hz_2t_4x4_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter) { + __m128i src0, src1, src2, src3, mask; + __m128i filt0, vec0, vec1, res0, res1; + __m128i vec2, vec3; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride + src_stride2; + + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride + dst_stride2; + + mask = __lsx_vld(mc_filt_mask_arr, 16); + /* rearranging filter */ + filt0 = __lsx_vldrepl_h(filter, 0); + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask, src3, src2, mask, vec0, vec1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, vec3); + DUP2_ARG2(__lsx_vsrari_h, vec2, FILTER_BITS, vec3, FILTER_BITS, vec2, vec3); + DUP2_ARG2(__lsx_vpickev_b, vec2, vec2, vec3, vec3, res0, res1); + + __lsx_vstelm_w(res0, dst, 0, 0); + __lsx_vstelm_w(res0, dst + dst_stride, 0, 1); + __lsx_vstelm_w(res1, dst + dst_stride2, 0, 0); + __lsx_vstelm_w(res1, dst + dst_stride3, 0, 1); +} + +static void common_hz_2t_4x8_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter) { + __m128i vec0, vec1, vec2, vec3, filt0; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask; + __m128i res0, res1, res2, res3; + __m128i vec4, vec5, vec6, vec7; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride + src_stride2; + int32_t src_stride4 = src_stride2 << 1; + + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride + dst_stride2; + + uint8_t *src_tmp1 = src + src_stride4; + + mask = __lsx_vld(mc_filt_mask_arr, 16); + + /* rearranging filter */ + filt0 = __lsx_vldrepl_h(filter, 0); + + src0 = __lsx_vld(src, 0); + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src1, src2, src3, src4); + DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2, src5, + src6); + src7 = __lsx_vldx(src_tmp1, src_stride3); + + DUP4_ARG3(__lsx_vshuf_b, src1, src0, mask, src3, src2, mask, src5, src4, mask, + src7, src6, mask, vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0, + vec4, vec5, vec6, vec7); + DUP4_ARG2(__lsx_vsrari_h, vec4, FILTER_BITS, vec5, FILTER_BITS, vec6, + FILTER_BITS, vec7, FILTER_BITS, vec4, vec5, vec6, vec7); + DUP4_ARG2(__lsx_vpickev_b, vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, + res0, res1, res2, res3); + __lsx_vstelm_w(res0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(res0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(res1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(res1, dst, 0, 1); + dst += dst_stride; + + __lsx_vstelm_w(res2, dst, 0, 0); + __lsx_vstelm_w(res2, dst + dst_stride, 0, 1); + __lsx_vstelm_w(res3, dst + dst_stride2, 0, 0); + __lsx_vstelm_w(res3, dst + dst_stride3, 0, 1); +} + +static void common_hz_2t_4w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + if (height == 4) { + common_hz_2t_4x4_lsx(src, src_stride, dst, dst_stride, filter); + } else if (height == 8) { + common_hz_2t_4x8_lsx(src, src_stride, dst, dst_stride, filter); + } +} + +static void common_hz_2t_8x4_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter) { + __m128i filt0, mask; + __m128i src0, src1, src2, src3; + __m128i vec0, vec1, vec2, vec3; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + + mask = __lsx_vld(mc_filt_mask_arr, 0); + + /* rearranging filter */ + filt0 = __lsx_vldrepl_h(filter, 0); + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + + DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2, mask, + src3, src3, mask, vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0, + vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vsrari_h, vec0, FILTER_BITS, vec1, FILTER_BITS, vec2, + FILTER_BITS, vec3, FILTER_BITS, vec0, vec1, vec2, vec3); + DUP2_ARG2(__lsx_vpickev_b, vec1, vec0, vec3, vec2, src0, src1); + + __lsx_vstelm_d(src0, dst, 0, 0); + __lsx_vstelm_d(src0, dst + dst_stride, 0, 1); + __lsx_vstelm_d(src1, dst + dst_stride2, 0, 0); + __lsx_vstelm_d(src1, dst + dst_stride3, 0, 1); +} + +static void common_hz_2t_8x8mult_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + __m128i filt0, mask; + __m128i src0, src1, src2, src3, out0, out1; + __m128i vec0, vec1, vec2, vec3; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + int32_t dst_stride4 = dst_stride2 << 1; + + mask = __lsx_vld(mc_filt_mask_arr, 0); + + /* rearranging filter */ + filt0 = __lsx_vldrepl_h(filter, 0); + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + src += src_stride4; + + DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2, mask, + src3, src3, mask, vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0, + vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vsrari_h, vec0, FILTER_BITS, vec1, FILTER_BITS, vec2, + FILTER_BITS, vec3, FILTER_BITS, vec0, vec1, vec2, vec3); + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + src += src_stride4; + + DUP2_ARG2(__lsx_vpickev_b, vec1, vec0, vec3, vec2, out0, out1); + __lsx_vstelm_d(out0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(out0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(out1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(out1, dst, 0, 1); + dst += dst_stride; + + DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2, mask, + src3, src3, mask, vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0, + vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vsrari_h, vec0, FILTER_BITS, vec1, FILTER_BITS, vec2, + FILTER_BITS, vec3, FILTER_BITS, vec0, vec1, vec2, vec3); + DUP2_ARG2(__lsx_vpickev_b, vec1, vec0, vec3, vec2, out0, out1); + + __lsx_vstelm_d(out0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(out0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(out1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(out1, dst, 0, 1); + dst += dst_stride; + + if (height == 16) { + uint8_t *dst_tmp1 = dst + dst_stride4; + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + src += src_stride4; + + DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2, + mask, src3, src3, mask, vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, + filt0, vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vsrari_h, vec0, FILTER_BITS, vec1, FILTER_BITS, vec2, + FILTER_BITS, vec3, FILTER_BITS, vec0, vec1, vec2, vec3); + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + src += src_stride4; + + DUP2_ARG2(__lsx_vpickev_b, vec1, vec0, vec3, vec2, out0, out1); + __lsx_vstelm_d(out0, dst, 0, 0); + __lsx_vstelm_d(out0, dst + dst_stride, 0, 1); + __lsx_vstelm_d(out1, dst + dst_stride2, 0, 0); + __lsx_vstelm_d(out1, dst + dst_stride3, 0, 1); + + DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2, + mask, src3, src3, mask, vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, + filt0, vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vsrari_h, vec0, FILTER_BITS, vec1, FILTER_BITS, vec2, + FILTER_BITS, vec3, FILTER_BITS, vec0, vec1, vec2, vec3); + DUP2_ARG2(__lsx_vpickev_b, vec1, vec0, vec3, vec2, out0, out1); + + __lsx_vstelm_d(out0, dst_tmp1, 0, 0); + __lsx_vstelm_d(out0, dst_tmp1 + dst_stride, 0, 1); + __lsx_vstelm_d(out1, dst_tmp1 + dst_stride2, 0, 0); + __lsx_vstelm_d(out1, dst_tmp1 + dst_stride3, 0, 1); + } +} + +static void common_hz_2t_8w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + if (height == 4) { + common_hz_2t_8x4_lsx(src, src_stride, dst, dst_stride, filter); + } else { + common_hz_2t_8x8mult_lsx(src, src_stride, dst, dst_stride, filter, height); + } +} + +static void common_hz_2t_16w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt = (height >> 2) - 1; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask; + __m128i filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + __m128i out0, out1, out2, out3, out4, out5, out6, out7, tmp; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + uint8_t *src_tmp1 = src + 8; + mask = __lsx_vld(mc_filt_mask_arr, 0); + filt0 = __lsx_vldrepl_h(filter, 0); + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src4); + src6 = __lsx_vldx(src, src_stride3); + src1 = __lsx_vld(src_tmp1, 0); + DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2, src3, + src5); + src7 = __lsx_vldx(src_tmp1, src_stride3); + src += src_stride4; + + DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2, mask, + src3, src3, mask, vec0, vec1, vec2, vec3); + DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, src6, src6, mask, + src7, src7, mask, vec4, vec5, vec6, vec7); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0, + out0, out1, out2, out3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7, filt0, + out4, out5, out6, out7); + DUP4_ARG2(__lsx_vsrari_h, out0, FILTER_BITS, out1, FILTER_BITS, out2, + FILTER_BITS, out3, FILTER_BITS, out0, out1, out2, out3); + DUP4_ARG2(__lsx_vsrari_h, out4, FILTER_BITS, out5, FILTER_BITS, out6, + FILTER_BITS, out7, FILTER_BITS, out4, out5, out6, out7); + + tmp = __lsx_vpickev_b(out1, out0); + __lsx_vst(tmp, dst, 0); + dst += dst_stride; + tmp = __lsx_vpickev_b(out3, out2); + __lsx_vst(tmp, dst, 0); + dst += dst_stride; + tmp = __lsx_vpickev_b(out5, out4); + __lsx_vst(tmp, dst, 0); + dst += dst_stride; + tmp = __lsx_vpickev_b(out7, out6); + __lsx_vst(tmp, dst, 0); + dst += dst_stride; + + for (; loop_cnt--;) { + src_tmp1 += src_stride4; + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src4); + src6 = __lsx_vldx(src, src_stride3); + + src1 = __lsx_vld(src_tmp1, 0); + DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2, src3, + src5); + src7 = __lsx_vldx(src_tmp1, src_stride3); + src += src_stride4; + + DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2, + mask, src3, src3, mask, vec0, vec1, vec2, vec3); + DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, src6, src6, + mask, src7, src7, mask, vec4, vec5, vec6, vec7); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, + filt0, out0, out1, out2, out3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7, + filt0, out4, out5, out6, out7); + DUP4_ARG2(__lsx_vsrari_h, out0, FILTER_BITS, out1, FILTER_BITS, out2, + FILTER_BITS, out3, FILTER_BITS, out0, out1, out2, out3); + DUP4_ARG2(__lsx_vsrari_h, out4, FILTER_BITS, out5, FILTER_BITS, out6, + FILTER_BITS, out7, FILTER_BITS, out4, out5, out6, out7); + + tmp = __lsx_vpickev_b(out1, out0); + __lsx_vst(tmp, dst, 0); + dst += dst_stride; + tmp = __lsx_vpickev_b(out3, out2); + __lsx_vst(tmp, dst, 0); + dst += dst_stride; + tmp = __lsx_vpickev_b(out5, out4); + __lsx_vst(tmp, dst, 0); + dst += dst_stride; + tmp = __lsx_vpickev_b(out7, out6); + __lsx_vst(tmp, dst, 0); + dst += dst_stride; + } +} + +static void common_hz_2t_32w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt = (height >> 1); + __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask; + __m128i filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + __m128i out0, out1, out2, out3, out4, out5, out6, out7, tmp; + __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; + + mask = __lsx_vld(mc_filt_mask_arr, 0); + /* rearranging filter */ + filt0 = __lsx_vldrepl_h(filter, 0); + + for (; loop_cnt--;) { + DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src2); + src3 = __lsx_vld(src, 24); + src1 = __lsx_vshuf_b(src2, src0, shuff); + src += src_stride; + DUP2_ARG2(__lsx_vld, src, 0, src, 16, src4, src6); + src7 = __lsx_vld(src, 24); + src5 = __lsx_vshuf_b(src6, src4, shuff); + src += src_stride; + + DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2, + mask, src3, src3, mask, vec0, vec1, vec2, vec3); + DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, src6, src6, + mask, src7, src7, mask, vec4, vec5, vec6, vec7); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, + filt0, out0, out1, out2, out3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7, + filt0, out4, out5, out6, out7); + DUP4_ARG2(__lsx_vsrari_h, out0, FILTER_BITS, out1, FILTER_BITS, out2, + FILTER_BITS, out3, FILTER_BITS, out0, out1, out2, out3); + DUP4_ARG2(__lsx_vsrari_h, out4, FILTER_BITS, out5, FILTER_BITS, out6, + FILTER_BITS, out7, FILTER_BITS, out4, out5, out6, out7); + + tmp = __lsx_vpickev_b(out1, out0); + __lsx_vst(tmp, dst, 0); + tmp = __lsx_vpickev_b(out3, out2); + __lsx_vst(tmp, dst, 16); + dst += dst_stride; + + tmp = __lsx_vpickev_b(out5, out4); + __lsx_vst(tmp, dst, 0); + tmp = __lsx_vpickev_b(out7, out6); + __lsx_vst(tmp, dst, 16); + dst += dst_stride; + } +} + +static void common_hz_2t_64w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt = height; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask; + __m128i filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + __m128i out0, out1, out2, out3, out4, out5, out6, out7, tmp; + __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; + + mask = __lsx_vld(mc_filt_mask_arr, 0); + + /* rearranging filter */ + filt0 = __lsx_vldrepl_h(filter, 0); + + for (; loop_cnt--;) { + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src2, src4, + src6); + src7 = __lsx_vld(src, 56); + DUP2_ARG3(__lsx_vshuf_b, src2, src0, shuff, src4, src2, shuff, src1, src3); + src5 = __lsx_vshuf_b(src6, src4, shuff); + src += src_stride; + + DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2, + mask, src3, src3, mask, vec0, vec1, vec2, vec3); + DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, src6, src6, + mask, src7, src7, mask, vec4, vec5, vec6, vec7); + + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, + filt0, out0, out1, out2, out3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7, + filt0, out4, out5, out6, out7); + DUP4_ARG2(__lsx_vsrari_h, out0, FILTER_BITS, out1, FILTER_BITS, out2, + FILTER_BITS, out3, FILTER_BITS, out0, out1, out2, out3); + DUP4_ARG2(__lsx_vsrari_h, out4, FILTER_BITS, out5, FILTER_BITS, out6, + FILTER_BITS, out7, FILTER_BITS, out4, out5, out6, out7); + + tmp = __lsx_vpickev_b(out1, out0); + __lsx_vst(tmp, dst, 0); + tmp = __lsx_vpickev_b(out3, out2); + __lsx_vst(tmp, dst, 16); + tmp = __lsx_vpickev_b(out5, out4); + __lsx_vst(tmp, dst, 32); + tmp = __lsx_vpickev_b(out7, out6); + __lsx_vst(tmp, dst, 48); + dst += dst_stride; + } +} + +void vpx_convolve8_horiz_lsx(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + const int16_t *const filter_x = filter[x0_q4]; + int8_t cnt, filt_hor[8]; + + assert(x_step_q4 == 16); + assert(((const int32_t *)filter_x)[1] != 0x800000); + + for (cnt = 0; cnt < 8; ++cnt) { + filt_hor[cnt] = filter_x[cnt]; + } + if (vpx_get_filter_taps(filter_x) == 2) { + switch (w) { + case 4: + common_hz_2t_4w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + &filt_hor[3], h); + break; + case 8: + common_hz_2t_8w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + &filt_hor[3], h); + break; + case 16: + common_hz_2t_16w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + &filt_hor[3], h); + break; + case 32: + common_hz_2t_32w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + &filt_hor[3], h); + break; + case 64: + common_hz_2t_64w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + &filt_hor[3], h); + break; + default: + vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); + break; + } + } else { + switch (w) { + case 4: + common_hz_8t_4w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_hor, h); + break; + case 8: + common_hz_8t_8w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_hor, h); + break; + + case 16: + common_hz_8t_16w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_hor, h); + break; + + case 32: + common_hz_8t_32w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_hor, h); + break; + + case 64: + common_hz_8t_64w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_hor, h); + break; + default: + vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); + break; + } + } +} diff --git a/vpx_dsp/loongarch/vpx_convolve8_lsx.c b/vpx_dsp/loongarch/vpx_convolve8_lsx.c new file mode 100644 index 0000000000..51a162bf3e --- /dev/null +++ b/vpx_dsp/loongarch/vpx_convolve8_lsx.c @@ -0,0 +1,718 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/loongarch/vpx_convolve_lsx.h" + +#define LSX_LD_4(_src, _stride, _src0, _src1, _src2, _src3) \ + { \ + _src0 = __lsx_vld(_src, 0); \ + _src += _stride; \ + _src1 = __lsx_vld(_src, 0); \ + _src += _stride; \ + _src2 = __lsx_vld(_src, 0); \ + _src += _stride; \ + _src3 = __lsx_vld(_src, 0); \ + } + +static const uint8_t mc_filt_mask_arr[16 * 3] = { + /* 8 width cases */ + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, + /* 4 width cases */ + 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20, + /* 4 width cases */ + 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28 +}; + +static void common_hv_8ht_8vt_4w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, + int32_t height) { + uint32_t loop_cnt = (height >> 2); + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + __m128i filt_hz0, filt_hz1, filt_hz2, filt_hz3; + __m128i filt_vt0, filt_vt1, filt_vt2, filt_vt3; + __m128i mask0, mask1, mask2, mask3; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; + __m128i out0, out1; + __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; + + mask0 = __lsx_vld(mc_filt_mask_arr, 16); + src -= (3 + 3 * src_stride); + DUP4_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filter_horiz, 4, + filter_horiz, 6, filt_hz0, filt_hz1, filt_hz2, filt_hz3); + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + mask3 = __lsx_vaddi_bu(mask0, 6); + + LSX_LD_4(src, src_stride, src0, src1, src2, src3); + src += src_stride; + src4 = __lsx_vld(src, 0); + src += src_stride; + src5 = __lsx_vld(src, 0); + src += src_stride; + src6 = __lsx_vld(src, 0); + src += src_stride; + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5); + src6 = __lsx_vxori_b(src6, 128); + + tmp0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + tmp2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + tmp4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + tmp5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + DUP2_ARG3(__lsx_vshuf_b, tmp2, tmp0, shuff, tmp4, tmp2, shuff, tmp1, tmp3); + DUP4_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filter_vert, 4, + filter_vert, 6, filt_vt0, filt_vt1, filt_vt2, filt_vt3); + DUP2_ARG2(__lsx_vpackev_b, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1); + tmp2 = __lsx_vpackev_b(tmp5, tmp4); + + for (; loop_cnt--;) { + LSX_LD_4(src, src_stride, src7, src8, src9, src10); + src += src_stride; + DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7, + src8, src9, src10); + tmp3 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + tmp4 = __lsx_vshuf_b(tmp3, tmp5, shuff); + tmp4 = __lsx_vpackev_b(tmp3, tmp4); + out0 = FILT_8TAP_DPADD_S_H(tmp0, tmp1, tmp2, tmp4, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + src1 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src0 = __lsx_vshuf_b(src1, tmp3, shuff); + src0 = __lsx_vpackev_b(src1, src0); + out1 = FILT_8TAP_DPADD_S_H(tmp1, tmp2, tmp4, src0, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + out0 = __lsx_vssrarni_b_h(out1, out0, 7); + out0 = __lsx_vxori_b(out0, 128); + __lsx_vstelm_w(out0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 3); + dst += dst_stride; + + tmp5 = src1; + tmp0 = tmp2; + tmp1 = tmp4; + tmp2 = src0; + } +} + +static void common_hv_8ht_8vt_8w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, + int32_t height) { + uint32_t loop_cnt = (height >> 2); + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + __m128i filt_hz0, filt_hz1, filt_hz2, filt_hz3; + __m128i filt_vt0, filt_vt1, filt_vt2, filt_vt3; + __m128i mask0, mask1, mask2, mask3; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6; + __m128i out0, out1; + + mask0 = __lsx_vld(mc_filt_mask_arr, 0); + src -= (3 + 3 * src_stride); + DUP4_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filter_horiz, 4, + filter_horiz, 6, filt_hz0, filt_hz1, filt_hz2, filt_hz3); + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + mask3 = __lsx_vaddi_bu(mask0, 6); + + LSX_LD_4(src, src_stride, src0, src1, src2, src3); + src += src_stride; + src4 = __lsx_vld(src, 0); + src += src_stride; + src5 = __lsx_vld(src, 0); + src += src_stride; + src6 = __lsx_vld(src, 0); + src += src_stride; + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5); + src6 = __lsx_vxori_b(src6, 128); + + src0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + + DUP4_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filter_vert, 4, + filter_vert, 6, filt_vt0, filt_vt1, filt_vt2, filt_vt3); + DUP4_ARG2(__lsx_vpackev_b, src1, src0, src3, src2, src5, src4, src2, src1, + tmp0, tmp1, tmp2, tmp4); + DUP2_ARG2(__lsx_vpackev_b, src4, src3, src6, src5, tmp5, tmp6); + + for (; loop_cnt--;) { + LSX_LD_4(src, src_stride, src7, src8, src9, src10); + src += src_stride; + DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7, + src8, src9, src10); + src7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + tmp3 = __lsx_vpackev_b(src7, src6); + out0 = FILT_8TAP_DPADD_S_H(tmp0, tmp1, tmp2, tmp3, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + src8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src0 = __lsx_vpackev_b(src8, src7); + out1 = FILT_8TAP_DPADD_S_H(tmp4, tmp5, tmp6, src0, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + src9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src1 = __lsx_vpackev_b(src9, src8); + src3 = FILT_8TAP_DPADD_S_H(tmp1, tmp2, tmp3, src1, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + src10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src2 = __lsx_vpackev_b(src10, src9); + src4 = FILT_8TAP_DPADD_S_H(tmp5, tmp6, src0, src2, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, src4, src3, 7, out0, out1); + DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); + __lsx_vstelm_d(out0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(out0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(out1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(out1, dst, 0, 1); + dst += dst_stride; + + src6 = src10; + tmp0 = tmp2; + tmp1 = tmp3; + tmp2 = src1; + tmp4 = tmp6; + tmp5 = src0; + tmp6 = src2; + } +} + +static void common_hv_8ht_8vt_16w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, + int32_t height) { + common_hv_8ht_8vt_8w_lsx(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert, height); + src += 8; + dst += 8; + + common_hv_8ht_8vt_8w_lsx(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert, height); + src += 8; + dst += 8; +} + +static void common_hv_8ht_8vt_32w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, + int32_t height) { + int32_t multiple8_cnt; + for (multiple8_cnt = 4; multiple8_cnt--;) { + common_hv_8ht_8vt_8w_lsx(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert, height); + src += 8; + dst += 8; + } +} + +static void common_hv_8ht_8vt_64w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, + int32_t height) { + int32_t multiple8_cnt; + for (multiple8_cnt = 8; multiple8_cnt--;) { + common_hv_8ht_8vt_8w_lsx(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert, height); + src += 8; + dst += 8; + } +} + +static void common_hv_2ht_2vt_4x4_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, + int8_t *filter_vert) { + __m128i src0, src1, src2, src3, src4, mask; + __m128i filt_vt, filt_hz, vec0, vec1, res0, res1; + __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1; + __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride + src_stride2; + int32_t src_stride4 = src_stride2 << 1; + + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + mask = __lsx_vld(mc_filt_mask_arr, 16); + + /* rearranging filter */ + filt_hz = __lsx_vldrepl_h(filter_horiz, 0); + filt_vt = __lsx_vldrepl_h(filter_vert, 0); + + src0 = __lsx_vld(src, 0); + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src1, src2, src3, src4); + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS); + hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + + hz_out1 = __lsx_vshuf_b(hz_out2, hz_out0, shuff); + hz_out3 = __lsx_vpickod_d(hz_out4, hz_out2); + + DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); + DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1); + DUP2_ARG2(__lsx_vpickev_b, tmp0, tmp0, tmp1, tmp1, res0, res1); + + __lsx_vstelm_w(res0, dst, 0, 0); + __lsx_vstelm_w(res0, dst + dst_stride, 0, 1); + __lsx_vstelm_w(res1, dst + dst_stride2, 0, 0); + __lsx_vstelm_w(res1, dst + dst_stride3, 0, 1); +} + +static void common_hv_2ht_2vt_4x8_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, + int8_t *filter_vert) { + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, mask; + __m128i res0, res1, res2, res3; + __m128i filt_hz, filt_vt, vec0, vec1, vec2, vec3; + __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; + __m128i hz_out7, hz_out8, vec4, vec5, vec6, vec7; + __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + int32_t dst_stride4 = dst_stride2 << 1; + + mask = __lsx_vld(mc_filt_mask_arr, 16); + + /* rearranging filter */ + DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_vert, 0, filt_hz, filt_vt); + + src0 = __lsx_vld(src, 0); + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src1, src2, src3, src4); + src += src_stride4; + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src5, src6, src7, src8); + src += src_stride4; + + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS); + hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, FILTER_BITS); + hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, FILTER_BITS); + hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, FILTER_BITS); + + DUP2_ARG3(__lsx_vshuf_b, hz_out2, hz_out0, shuff, hz_out4, hz_out2, shuff, + hz_out1, hz_out3); + hz_out5 = __lsx_vshuf_b(hz_out6, hz_out4, shuff); + hz_out7 = __lsx_vpickod_d(hz_out8, hz_out6); + DUP4_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, hz_out5, + hz_out4, hz_out7, hz_out6, vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, vec2, filt_vt, vec3, + filt_vt, vec4, vec5, vec6, vec7); + DUP4_ARG2(__lsx_vsrari_h, vec4, FILTER_BITS, vec5, FILTER_BITS, vec6, + FILTER_BITS, vec7, FILTER_BITS, vec4, vec5, vec6, vec7); + DUP4_ARG2(__lsx_vpickev_b, vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, + res0, res1, res2, res3); + + __lsx_vstelm_w(res0, dst, 0, 0); + __lsx_vstelm_w(res0, dst + dst_stride, 0, 1); + __lsx_vstelm_w(res1, dst + dst_stride2, 0, 0); + __lsx_vstelm_w(res1, dst + dst_stride3, 0, 1); + dst += dst_stride4; + __lsx_vstelm_w(res2, dst, 0, 0); + __lsx_vstelm_w(res2, dst + dst_stride, 0, 1); + __lsx_vstelm_w(res3, dst + dst_stride2, 0, 0); + __lsx_vstelm_w(res3, dst + dst_stride3, 0, 1); +} + +static void common_hv_2ht_2vt_4w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, + int32_t height) { + if (height == 4) { + common_hv_2ht_2vt_4x4_lsx(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert); + } else if (height == 8) { + common_hv_2ht_2vt_4x8_lsx(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert); + } +} + +static void common_hv_2ht_2vt_8x4_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, + int8_t *filter_vert) { + __m128i src0, src1, src2, src3, src4, mask, out0, out1; + __m128i filt_hz, filt_vt, vec0, vec1, vec2, vec3; + __m128i hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + + mask = __lsx_vld(mc_filt_mask_arr, 0); + + /* rearranging filter */ + DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_vert, 0, filt_hz, filt_vt); + + src0 = __lsx_vld(src, 0); + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src1, src2, src3, src4); + + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); + hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + vec0 = __lsx_vpackev_b(hz_out1, hz_out0); + tmp0 = __lsx_vdp2_h_bu(vec0, filt_vt); + + hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); + vec1 = __lsx_vpackev_b(hz_out0, hz_out1); + tmp1 = __lsx_vdp2_h_bu(vec1, filt_vt); + + hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); + vec2 = __lsx_vpackev_b(hz_out1, hz_out0); + tmp2 = __lsx_vdp2_h_bu(vec2, filt_vt); + + hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + vec3 = __lsx_vpackev_b(hz_out0, hz_out1); + tmp3 = __lsx_vdp2_h_bu(vec3, filt_vt); + + DUP4_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp2, + FILTER_BITS, tmp3, FILTER_BITS, tmp0, tmp1, tmp2, tmp3); + DUP2_ARG2(__lsx_vpickev_b, tmp1, tmp0, tmp3, tmp2, out0, out1); + + __lsx_vstelm_d(out0, dst, 0, 0); + __lsx_vstelm_d(out0, dst + dst_stride, 0, 1); + __lsx_vstelm_d(out1, dst + dst_stride2, 0, 0); + __lsx_vstelm_d(out1, dst + dst_stride3, 0, 1); +} + +static void common_hv_2ht_2vt_8x8mult_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter_horiz, + int8_t *filter_vert, int32_t height) { + uint32_t loop_cnt = (height >> 3); + __m128i src0, src1, src2, src3, src4, mask, out0, out1; + __m128i filt_hz, filt_vt, vec0; + __m128i hz_out0, hz_out1, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + mask = __lsx_vld(mc_filt_mask_arr, 0); + + /* rearranging filter */ + DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_vert, 0, filt_hz, filt_vt); + + src0 = __lsx_vld(src, 0); + src += src_stride; + + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); + + for (; loop_cnt--;) { + src1 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3); + src4 = __lsx_vldx(src, src_stride3); + src += src_stride4; + + hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + vec0 = __lsx_vpackev_b(hz_out1, hz_out0); + tmp1 = __lsx_vdp2_h_bu(vec0, filt_vt); + + hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); + vec0 = __lsx_vpackev_b(hz_out0, hz_out1); + tmp2 = __lsx_vdp2_h_bu(vec0, filt_vt); + + DUP2_ARG2(__lsx_vsrari_h, tmp1, FILTER_BITS, tmp2, FILTER_BITS, tmp1, tmp2); + + hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); + vec0 = __lsx_vpackev_b(hz_out1, hz_out0); + tmp3 = __lsx_vdp2_h_bu(vec0, filt_vt); + + hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + src1 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3); + src4 = __lsx_vldx(src, src_stride3); + src += src_stride4; + vec0 = __lsx_vpackev_b(hz_out0, hz_out1); + tmp4 = __lsx_vdp2_h_bu(vec0, filt_vt); + + DUP2_ARG2(__lsx_vsrari_h, tmp3, FILTER_BITS, tmp4, FILTER_BITS, tmp3, tmp4); + DUP2_ARG2(__lsx_vpickev_b, tmp2, tmp1, tmp4, tmp3, out0, out1); + __lsx_vstelm_d(out0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(out0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(out1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(out1, dst, 0, 1); + dst += dst_stride; + + hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + vec0 = __lsx_vpackev_b(hz_out1, hz_out0); + tmp5 = __lsx_vdp2_h_bu(vec0, filt_vt); + + hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); + vec0 = __lsx_vpackev_b(hz_out0, hz_out1); + tmp6 = __lsx_vdp2_h_bu(vec0, filt_vt); + + hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); + vec0 = __lsx_vpackev_b(hz_out1, hz_out0); + tmp7 = __lsx_vdp2_h_bu(vec0, filt_vt); + + hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + vec0 = __lsx_vpackev_b(hz_out0, hz_out1); + tmp8 = __lsx_vdp2_h_bu(vec0, filt_vt); + + DUP4_ARG2(__lsx_vsrari_h, tmp5, FILTER_BITS, tmp6, FILTER_BITS, tmp7, + FILTER_BITS, tmp8, FILTER_BITS, tmp5, tmp6, tmp7, tmp8); + DUP2_ARG2(__lsx_vpickev_b, tmp6, tmp5, tmp8, tmp7, out0, out1); + __lsx_vstelm_d(out0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(out0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(out1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(out1, dst, 0, 1); + dst += dst_stride; + } +} + +static void common_hv_2ht_2vt_8w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, + int32_t height) { + if (height == 4) { + common_hv_2ht_2vt_8x4_lsx(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert); + } else { + common_hv_2ht_2vt_8x8mult_lsx(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert, height); + } +} + +static void common_hv_2ht_2vt_16w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, + int32_t height) { + uint32_t loop_cnt = (height >> 2); + __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask; + __m128i filt_hz, filt_vt, vec0, vec1; + __m128i tmp, tmp1, tmp2, hz_out0, hz_out1, hz_out2, hz_out3; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + mask = __lsx_vld(mc_filt_mask_arr, 0); + + /* rearranging filter */ + DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_vert, 0, filt_hz, filt_vt); + + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1); + src += src_stride; + + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + + for (; loop_cnt--;) { + uint8_t *src_tmp0 = src + 8; + + DUP2_ARG2(__lsx_vld, src, 0, src_tmp0, 0, src0, src1); + DUP4_ARG2(__lsx_vldx, src, src_stride, src_tmp0, src_stride, src, + src_stride2, src_tmp0, src_stride2, src2, src3, src4, src5); + DUP2_ARG2(__lsx_vldx, src, src_stride3, src_tmp0, src_stride3, src6, src7); + src += src_stride4; + + hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); + hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp1, tmp2); + DUP2_ARG2(__lsx_vsrari_h, tmp1, FILTER_BITS, tmp2, FILTER_BITS, tmp1, tmp2); + tmp = __lsx_vpickev_b(tmp2, tmp1); + __lsx_vst(tmp, dst, 0); + dst += dst_stride; + + hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); + DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp1, tmp2); + DUP2_ARG2(__lsx_vsrari_h, tmp1, FILTER_BITS, tmp2, FILTER_BITS, tmp1, tmp2); + tmp = __lsx_vpickev_b(tmp2, tmp1); + __lsx_vst(tmp, dst, 0); + dst += dst_stride; + + hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS); + DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp1, tmp2); + DUP2_ARG2(__lsx_vsrari_h, tmp1, FILTER_BITS, tmp2, FILTER_BITS, tmp1, tmp2); + tmp = __lsx_vpickev_b(tmp2, tmp1); + __lsx_vst(tmp, dst, 0); + dst += dst_stride; + + hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS); + DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp1, tmp2); + DUP2_ARG2(__lsx_vsrari_h, tmp1, FILTER_BITS, tmp2, FILTER_BITS, tmp1, tmp2); + tmp = __lsx_vpickev_b(tmp2, tmp1); + __lsx_vst(tmp, dst, 0); + dst += dst_stride; + } +} + +static void common_hv_2ht_2vt_32w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, + int32_t height) { + common_hv_2ht_2vt_16w_lsx(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert, height); + src += 16; + dst += 16; + + common_hv_2ht_2vt_16w_lsx(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert, height); + src += 16; + dst += 16; +} + +static void common_hv_2ht_2vt_64w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, + int32_t height) { + int32_t multiple8_cnt; + for (multiple8_cnt = 4; multiple8_cnt--;) { + common_hv_2ht_2vt_16w_lsx(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert, height); + src += 16; + dst += 16; + } +} + +void vpx_convolve8_lsx(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int32_t x_step_q4, int y0_q4, + int32_t y_step_q4, int32_t w, int32_t h) { + const int16_t *const filter_x = filter[x0_q4]; + const int16_t *const filter_y = filter[y0_q4]; + int8_t cnt, filt_hor[8], filt_ver[8]; + + assert(x_step_q4 == 16); + assert(y_step_q4 == 16); + assert(((const int32_t *)filter_x)[1] != 0x800000); + assert(((const int32_t *)filter_y)[1] != 0x800000); + + for (cnt = 0; cnt < 8; ++cnt) { + filt_hor[cnt] = filter_x[cnt]; + filt_ver[cnt] = filter_y[cnt]; + } + + if (vpx_get_filter_taps(filter_x) == 2 && + vpx_get_filter_taps(filter_y) == 2) { + switch (w) { + case 4: + common_hv_2ht_2vt_4w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], + &filt_ver[3], (int32_t)h); + break; + case 8: + common_hv_2ht_2vt_8w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], + &filt_ver[3], (int32_t)h); + break; + case 16: + common_hv_2ht_2vt_16w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], + &filt_ver[3], (int32_t)h); + break; + case 32: + common_hv_2ht_2vt_32w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], + &filt_ver[3], (int32_t)h); + break; + case 64: + common_hv_2ht_2vt_64w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], + &filt_ver[3], (int32_t)h); + break; + default: + vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); + break; + } + } else if (vpx_get_filter_taps(filter_x) == 2 || + vpx_get_filter_taps(filter_y) == 2) { + vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, + y0_q4, y_step_q4, w, h); + } else { + switch (w) { + case 4: + common_hv_8ht_8vt_4w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, filt_ver, + (int32_t)h); + break; + case 8: + common_hv_8ht_8vt_8w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, filt_ver, + (int32_t)h); + break; + case 16: + common_hv_8ht_8vt_16w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, filt_ver, + (int32_t)h); + break; + case 32: + common_hv_8ht_8vt_32w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, filt_ver, + (int32_t)h); + break; + case 64: + common_hv_8ht_8vt_64w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, filt_ver, + (int32_t)h); + break; + default: + vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); + break; + } + } +} diff --git a/vpx_dsp/loongarch/vpx_convolve8_vert_lsx.c b/vpx_dsp/loongarch/vpx_convolve8_vert_lsx.c new file mode 100644 index 0000000000..c0bb10f3b7 --- /dev/null +++ b/vpx_dsp/loongarch/vpx_convolve8_vert_lsx.c @@ -0,0 +1,870 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/loongarch/vpx_convolve_lsx.h" + +static void common_vt_8t_4w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt = height >> 2; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride + src_stride2; + int32_t src_stride4 = src_stride2 << 1; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; + __m128i reg0, reg1, reg2, reg3, reg4; + __m128i filter0, filter1, filter2, filter3; + __m128i out0, out1; + uint8_t *_src = (uint8_t *)src - src_stride3; + + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + src0 = __lsx_vld(_src, 0); + DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2); + src3 = __lsx_vldx(_src, src_stride3); + _src += src_stride4; + src4 = __lsx_vld(_src, 0); + DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src5, src6); + _src += src_stride3; + DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1, tmp0, + tmp1, tmp2, tmp3); + DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, tmp4, tmp5); + DUP2_ARG2(__lsx_vilvl_d, tmp3, tmp0, tmp4, tmp1, reg0, reg1); + reg2 = __lsx_vilvl_d(tmp5, tmp2); + DUP2_ARG2(__lsx_vxori_b, reg0, 128, reg1, 128, reg0, reg1); + reg2 = __lsx_vxori_b(reg2, 128); + + for (; loop_cnt--;) { + src7 = __lsx_vld(_src, 0); + DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src8, src9); + src10 = __lsx_vldx(_src, src_stride3); + _src += src_stride4; + DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9, + tmp0, tmp1, tmp2, tmp3); + DUP2_ARG2(__lsx_vilvl_d, tmp1, tmp0, tmp3, tmp2, reg3, reg4); + DUP2_ARG2(__lsx_vxori_b, reg3, 128, reg4, 128, reg3, reg4); + out0 = FILT_8TAP_DPADD_S_H(reg0, reg1, reg2, reg3, filter0, filter1, + filter2, filter3); + out1 = FILT_8TAP_DPADD_S_H(reg1, reg2, reg3, reg4, filter0, filter1, + filter2, filter3); + out0 = __lsx_vssrarni_b_h(out1, out0, 7); + out0 = __lsx_vxori_b(out0, 128); + __lsx_vstelm_w(out0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 3); + dst += dst_stride; + + reg0 = reg2; + reg1 = reg3; + reg2 = reg4; + src6 = src10; + } +} + +static void common_vt_8t_8w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt = height >> 2; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + __m128i tmp0, tmp1, tmp2, tmp3; + __m128i reg0, reg1, reg2, reg3, reg4, reg5; + __m128i filter0, filter1, filter2, filter3; + __m128i out0, out1, out2, out3; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride + src_stride2; + int32_t src_stride4 = src_stride2 << 1; + src = src - src_stride3; + + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + src += src_stride4; + src4 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src5, src6); + src += src_stride3; + + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5); + src6 = __lsx_vxori_b(src6, 128); + DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1, reg0, + reg1, reg2, reg3); + DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, reg4, reg5); + + for (; loop_cnt--;) { + src7 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src8, src9); + src10 = __lsx_vldx(src, src_stride3); + src += src_stride4; + DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7, + src8, src9, src10); + DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9, + tmp0, tmp1, tmp2, tmp3); + out0 = FILT_8TAP_DPADD_S_H(reg0, reg1, reg2, tmp0, filter0, filter1, + filter2, filter3); + out1 = FILT_8TAP_DPADD_S_H(reg3, reg4, reg5, tmp1, filter0, filter1, + filter2, filter3); + out2 = FILT_8TAP_DPADD_S_H(reg1, reg2, tmp0, tmp2, filter0, filter1, + filter2, filter3); + out3 = FILT_8TAP_DPADD_S_H(reg4, reg5, tmp1, tmp3, filter0, filter1, + filter2, filter3); + DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1); + DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); + __lsx_vstelm_d(out0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(out0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(out1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(out1, dst, 0, 1); + dst += dst_stride; + + reg0 = reg2; + reg1 = tmp0; + reg2 = tmp2; + reg3 = reg5; + reg4 = tmp1; + reg5 = tmp3; + src6 = src10; + } +} + +static void common_vt_8t_16w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt = height >> 2; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + __m128i filter0, filter1, filter2, filter3; + __m128i reg0, reg1, reg2, reg3, reg4, reg5; + __m128i reg6, reg7, reg8, reg9, reg10, reg11; + __m128i tmp0, tmp1, tmp2, tmp3; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride + src_stride2; + int32_t src_stride4 = src_stride2 << 1; + // uint8_t *_src = (uint8_t *)src - src_stride3; + src -= src_stride3; + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + + src0 = __lsx_vld(src, 0); + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src1, src2, src3, src4); + src += src_stride4; + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src5, src6); + src += src_stride3; + + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5); + src6 = __lsx_vxori_b(src6, 128); + DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1, reg0, + reg1, reg2, reg3); + DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, reg4, reg5); + DUP4_ARG2(__lsx_vilvh_b, src1, src0, src3, src2, src5, src4, src2, src1, reg6, + reg7, reg8, reg9); + DUP2_ARG2(__lsx_vilvh_b, src4, src3, src6, src5, reg10, reg11); + + for (; loop_cnt--;) { + src7 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src8, src9); + src10 = __lsx_vldx(src, src_stride3); + src += src_stride4; + + DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7, + src8, src9, src10); + DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9, + src0, src1, src2, src3); + DUP4_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src9, src8, src10, src9, + src4, src5, src7, src8); + tmp0 = FILT_8TAP_DPADD_S_H(reg0, reg1, reg2, src0, filter0, filter1, + filter2, filter3); + tmp1 = FILT_8TAP_DPADD_S_H(reg3, reg4, reg5, src1, filter0, filter1, + filter2, filter3); + tmp2 = FILT_8TAP_DPADD_S_H(reg6, reg7, reg8, src4, filter0, filter1, + filter2, filter3); + tmp3 = FILT_8TAP_DPADD_S_H(reg9, reg10, reg11, src5, filter0, filter1, + filter2, filter3); + DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1); + DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); + __lsx_vst(tmp0, dst, 0); + dst += dst_stride; + __lsx_vst(tmp1, dst, 0); + dst += dst_stride; + tmp0 = FILT_8TAP_DPADD_S_H(reg1, reg2, src0, src2, filter0, filter1, + filter2, filter3); + tmp1 = FILT_8TAP_DPADD_S_H(reg4, reg5, src1, src3, filter0, filter1, + filter2, filter3); + tmp2 = FILT_8TAP_DPADD_S_H(reg7, reg8, src4, src7, filter0, filter1, + filter2, filter3); + tmp3 = FILT_8TAP_DPADD_S_H(reg10, reg11, src5, src8, filter0, filter1, + filter2, filter3); + DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1); + DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); + __lsx_vst(tmp0, dst, 0); + dst += dst_stride; + __lsx_vst(tmp1, dst, 0); + dst += dst_stride; + + reg0 = reg2; + reg1 = src0; + reg2 = src2; + reg3 = reg5; + reg4 = src1; + reg5 = src3; + reg6 = reg8; + reg7 = src4; + reg8 = src7; + reg9 = reg11; + reg10 = src5; + reg11 = src8; + src6 = src10; + } +} + +static void common_vt_8t_16w_mult_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height, + int32_t width) { + uint8_t *src_tmp; + uint8_t *dst_tmp; + uint32_t cnt = width >> 4; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + __m128i filter0, filter1, filter2, filter3; + __m128i reg0, reg1, reg2, reg3, reg4, reg5; + __m128i reg6, reg7, reg8, reg9, reg10, reg11; + __m128i tmp0, tmp1, tmp2, tmp3; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride + src_stride2; + int32_t src_stride4 = src_stride2 << 1; + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + int32_t dst_stride4 = dst_stride2 << 1; + src -= src_stride3; + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + + for (; cnt--;) { + uint32_t loop_cnt = height >> 2; + + src_tmp = src; + dst_tmp = dst; + + src0 = __lsx_vld(src_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src1, + src2); + src3 = __lsx_vldx(src_tmp, src_stride3); + src_tmp += src_stride4; + src4 = __lsx_vld(src_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src5, + src6); + src_tmp += src_stride3; + + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5); + src6 = __lsx_vxori_b(src6, 128); + DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1, + reg0, reg1, reg2, reg3); + DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, reg4, reg5); + DUP4_ARG2(__lsx_vilvh_b, src1, src0, src3, src2, src5, src4, src2, src1, + reg6, reg7, reg8, reg9); + DUP2_ARG2(__lsx_vilvh_b, src4, src3, src6, src5, reg10, reg11); + + for (; loop_cnt--;) { + src7 = __lsx_vld(src_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src8, + src9); + src10 = __lsx_vldx(src_tmp, src_stride3); + src_tmp += src_stride4; + DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, + src7, src8, src9, src10); + DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9, + src0, src1, src2, src3); + DUP4_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src9, src8, src10, src9, + src4, src5, src7, src8); + tmp0 = FILT_8TAP_DPADD_S_H(reg0, reg1, reg2, src0, filter0, filter1, + filter2, filter3); + tmp1 = FILT_8TAP_DPADD_S_H(reg3, reg4, reg5, src1, filter0, filter1, + filter2, filter3); + tmp2 = FILT_8TAP_DPADD_S_H(reg6, reg7, reg8, src4, filter0, filter1, + filter2, filter3); + tmp3 = FILT_8TAP_DPADD_S_H(reg9, reg10, reg11, src5, filter0, filter1, + filter2, filter3); + DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1); + DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); + __lsx_vst(tmp0, dst_tmp, 0); + __lsx_vstx(tmp1, dst_tmp, dst_stride); + tmp0 = FILT_8TAP_DPADD_S_H(reg1, reg2, src0, src2, filter0, filter1, + filter2, filter3); + tmp1 = FILT_8TAP_DPADD_S_H(reg4, reg5, src1, src3, filter0, filter1, + filter2, filter3); + tmp2 = FILT_8TAP_DPADD_S_H(reg7, reg8, src4, src7, filter0, filter1, + filter2, filter3); + tmp3 = FILT_8TAP_DPADD_S_H(reg10, reg11, src5, src8, filter0, filter1, + filter2, filter3); + DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1); + DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); + __lsx_vstx(tmp0, dst_tmp, dst_stride2); + __lsx_vstx(tmp1, dst_tmp, dst_stride3); + dst_tmp += dst_stride4; + + reg0 = reg2; + reg1 = src0; + reg2 = src2; + reg3 = reg5; + reg4 = src1; + reg5 = src3; + reg6 = reg8; + reg7 = src4; + reg8 = src7; + reg9 = reg11; + reg10 = src5; + reg11 = src8; + src6 = src10; + } + src += 16; + dst += 16; + } +} + +static void common_vt_8t_32w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + common_vt_8t_16w_mult_lsx(src, src_stride, dst, dst_stride, filter, height, + 32); +} + +static void common_vt_8t_64w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + common_vt_8t_16w_mult_lsx(src, src_stride, dst, dst_stride, filter, height, + 64); +} + +static void common_vt_2t_4x4_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter) { + __m128i src0, src1, src2, src3, src4; + __m128i src10_l, src32_l, src21_l, src43_l, src2110, src4332; + __m128i filt0, tmp0, tmp1; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + + filt0 = __lsx_vldrepl_h(filter, 0); + + src0 = __lsx_vld(src, 0); + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src1, src2, src3, src4); + src += (src_stride4 + src_stride); + + DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3, + src10_l, src21_l, src32_l, src43_l); + DUP2_ARG2(__lsx_vilvl_d, src21_l, src10_l, src43_l, src32_l, src2110, + src4332); + DUP2_ARG2(__lsx_vdp2_h_bu, src2110, filt0, src4332, filt0, tmp0, tmp1); + DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1); + src2110 = __lsx_vpickev_b(tmp1, tmp0); + + __lsx_vstelm_w(src2110, dst, 0, 0); + __lsx_vstelm_w(src2110, dst + dst_stride, 0, 1); + __lsx_vstelm_w(src2110, dst + dst_stride2, 0, 2); + __lsx_vstelm_w(src2110, dst + dst_stride3, 0, 3); +} + +static void common_vt_2t_4x8_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter) { + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8; + __m128i src10_l, src32_l, src54_l, src76_l, src21_l, src43_l; + __m128i src65_l, src87_l, src2110, src4332, src6554, src8776; + __m128i tmp0, tmp1, tmp2, tmp3; + __m128i filt0; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + int32_t dst_stride4 = dst_stride2 << 1; + + uint8_t *dst_tmp1 = dst + dst_stride4; + + filt0 = __lsx_vldrepl_h(filter, 0); + src0 = __lsx_vld(src, 0); + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src1, src2, src3, src4); + src += src_stride4; + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src5, src6, src7, src8); + src += (src_stride4 + src_stride); + + DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3, + src10_l, src21_l, src32_l, src43_l); + DUP4_ARG2(__lsx_vilvl_b, src5, src4, src6, src5, src7, src6, src8, src7, + src54_l, src65_l, src76_l, src87_l); + DUP4_ARG2(__lsx_vilvl_d, src21_l, src10_l, src43_l, src32_l, src65_l, src54_l, + src87_l, src76_l, src2110, src4332, src6554, src8776); + DUP4_ARG2(__lsx_vdp2_h_bu, src2110, filt0, src4332, filt0, src6554, filt0, + src8776, filt0, tmp0, tmp1, tmp2, tmp3); + DUP4_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp2, + FILTER_BITS, tmp3, FILTER_BITS, tmp0, tmp1, tmp2, tmp3); + DUP2_ARG2(__lsx_vpickev_b, tmp1, tmp0, tmp3, tmp2, src2110, src4332); + + __lsx_vstelm_w(src2110, dst, 0, 0); + __lsx_vstelm_w(src2110, dst + dst_stride, 0, 1); + __lsx_vstelm_w(src2110, dst + dst_stride2, 0, 2); + __lsx_vstelm_w(src2110, dst + dst_stride3, 0, 3); + + __lsx_vstelm_w(src4332, dst_tmp1, 0, 0); + __lsx_vstelm_w(src4332, dst_tmp1 + dst_stride, 0, 1); + __lsx_vstelm_w(src4332, dst_tmp1 + dst_stride2, 0, 2); + __lsx_vstelm_w(src4332, dst_tmp1 + dst_stride3, 0, 3); +} + +static void common_vt_2t_4w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + if (height == 4) { + common_vt_2t_4x4_lsx(src, src_stride, dst, dst_stride, filter); + } else if (height == 8) { + common_vt_2t_4x8_lsx(src, src_stride, dst, dst_stride, filter); + } +} + +static void common_vt_2t_8x4_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter) { + __m128i src0, src1, src2, src3, src4, vec0, vec1, vec2, vec3, filt0; + __m128i out0, out1; + __m128i tmp0, tmp1, tmp2, tmp3; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + + /* rearranging filter_y */ + filt0 = __lsx_vldrepl_h(filter, 0); + + src0 = __lsx_vld(src, 0); + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src1, src2, src3, src4); + + DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3, vec0, + vec1, vec2, vec3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0, + tmp0, tmp1, tmp2, tmp3); + DUP4_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp2, + FILTER_BITS, tmp3, FILTER_BITS, tmp0, tmp1, tmp2, tmp3); + DUP2_ARG2(__lsx_vpickev_b, tmp1, tmp0, tmp3, tmp2, out0, out1); + + __lsx_vstelm_d(out0, dst, 0, 0); + __lsx_vstelm_d(out0, dst + dst_stride, 0, 1); + __lsx_vstelm_d(out1, dst + dst_stride2, 0, 0); + __lsx_vstelm_d(out1, dst + dst_stride3, 0, 1); +} + +static void common_vt_2t_8x8mult_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt = (height >> 3); + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; + __m128i out0, out1; + __m128i tmp0, tmp1, tmp2, tmp3; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + int32_t dst_stride4 = dst_stride2 << 1; + + filt0 = __lsx_vldrepl_h(filter, 0); + src0 = __lsx_vld(src, 0); + src += src_stride; + + for (; loop_cnt--;) { + src1 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3); + src4 = __lsx_vldx(src, src_stride3); + src += src_stride4; + src5 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src6, src7) + src8 = __lsx_vldx(src, src_stride3); + src += src_stride4; + + DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3, + vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vilvl_b, src5, src4, src6, src5, src7, src6, src8, src7, + vec4, vec5, vec6, vec7); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, + filt0, tmp0, tmp1, tmp2, tmp3); + DUP4_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp2, + FILTER_BITS, tmp3, FILTER_BITS, tmp0, tmp1, tmp2, tmp3); + DUP2_ARG2(__lsx_vpickev_b, tmp1, tmp0, tmp3, tmp2, out0, out1); + __lsx_vstelm_d(out0, dst, 0, 0); + __lsx_vstelm_d(out0, dst + dst_stride, 0, 1); + __lsx_vstelm_d(out1, dst + dst_stride2, 0, 0); + __lsx_vstelm_d(out1, dst + dst_stride3, 0, 1); + dst += dst_stride4; + + DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7, + filt0, tmp0, tmp1, tmp2, tmp3); + DUP4_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp2, + FILTER_BITS, tmp3, FILTER_BITS, tmp0, tmp1, tmp2, tmp3); + DUP2_ARG2(__lsx_vpickev_b, tmp1, tmp0, tmp3, tmp2, out0, out1); + __lsx_vstelm_d(out0, dst, 0, 0); + __lsx_vstelm_d(out0, dst + dst_stride, 0, 1); + __lsx_vstelm_d(out1, dst + dst_stride2, 0, 0); + __lsx_vstelm_d(out1, dst + dst_stride3, 0, 1); + dst += dst_stride4; + + src0 = src8; + } +} + +static void common_vt_2t_8w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + if (height == 4) { + common_vt_2t_8x4_lsx(src, src_stride, dst, dst_stride, filter); + } else { + common_vt_2t_8x8mult_lsx(src, src_stride, dst, dst_stride, filter, height); + } +} + +static void common_vt_2t_16w_lsx(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int y0_q4, + int y_step_q4, int w, int height) { + uint32_t loop_cnt = (height >> 2); + __m128i src0, src1, src2, src3, src4; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + const int16_t *const filter_y = filter[y0_q4]; + int8_t cnt, filt_ver[8]; + + assert(y_step_q4 == 16); + assert(((const int32_t *)filter_y)[1] != 0x800000); + + for (cnt = 8; cnt--;) { + filt_ver[cnt] = filter_y[cnt]; + } + + filt0 = __lsx_vldrepl_h(&filt_ver[3], 0); + + src0 = __lsx_vld(src, 0); + src += src_stride; + + for (; loop_cnt--;) { + src1 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3); + src4 = __lsx_vldx(src, src_stride3); + src += src_stride4; + + DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec2); + DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, vec1, vec3); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1); + DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1); + tmp4 = __lsx_vpickev_b(tmp1, tmp0); + __lsx_vst(tmp4, dst, 0); + dst += dst_stride; + + DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, vec4, vec6); + DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, vec5, vec7); + DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp2, tmp3); + DUP2_ARG2(__lsx_vsrari_h, tmp2, FILTER_BITS, tmp3, FILTER_BITS, tmp2, tmp3); + tmp4 = __lsx_vpickev_b(tmp3, tmp2); + __lsx_vst(tmp4, dst, 0); + dst += dst_stride; + + DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1); + DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1); + tmp4 = __lsx_vpickev_b(tmp1, tmp0); + __lsx_vst(tmp4, dst, 0); + dst += dst_stride; + + DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp2, tmp3); + DUP2_ARG2(__lsx_vsrari_h, tmp2, FILTER_BITS, tmp3, FILTER_BITS, tmp2, tmp3); + tmp4 = __lsx_vpickev_b(tmp3, tmp2); + __lsx_vst(tmp4, dst, 0); + dst += dst_stride; + + src0 = src4; + } +} + +static void common_vt_2t_32w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt = (height >> 2); + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + + uint8_t *src_tmp; + filt0 = __lsx_vldrepl_h(filter, 0); + + src0 = __lsx_vld(src, 0); + src5 = __lsx_vld(src, 16); + src += src_stride; + src_tmp = src + 16; + + for (; loop_cnt--;) { + DUP2_ARG2(__lsx_vld, src, 0, src_tmp, 0, src1, src6); + DUP4_ARG2(__lsx_vldx, src, src_stride, src_tmp, src_stride, src, + src_stride2, src_tmp, src_stride2, src2, src7, src3, src8); + DUP2_ARG2(__lsx_vldx, src, src_stride3, src_tmp, src_stride3, src4, src9); + DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec2); + DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, vec1, vec3); + src += src_stride4; + src_tmp += src_stride4; + + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1); + DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1); + tmp4 = __lsx_vpickev_b(tmp1, tmp0); + __lsx_vst(tmp4, dst, 0); + + DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp2, tmp3); + DUP2_ARG2(__lsx_vsrari_h, tmp2, FILTER_BITS, tmp3, FILTER_BITS, tmp2, tmp3); + tmp4 = __lsx_vpickev_b(tmp3, tmp2); + __lsx_vstx(tmp4, dst, dst_stride); + + DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, vec4, vec6); + DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, vec5, vec7); + DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1); + DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1); + tmp4 = __lsx_vpickev_b(tmp1, tmp0); + __lsx_vstx(tmp4, dst, dst_stride2); + + DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp2, tmp3); + DUP2_ARG2(__lsx_vsrari_h, tmp2, FILTER_BITS, tmp3, FILTER_BITS, tmp2, tmp3); + tmp4 = __lsx_vpickev_b(tmp3, tmp2); + __lsx_vstx(tmp4, dst, dst_stride3); + + DUP2_ARG2(__lsx_vilvl_b, src6, src5, src7, src6, vec0, vec2); + DUP2_ARG2(__lsx_vilvh_b, src6, src5, src7, src6, vec1, vec3); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1); + DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1); + tmp4 = __lsx_vpickev_b(tmp1, tmp0); + __lsx_vst(tmp4, dst, 16); + + DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp2, tmp3); + DUP2_ARG2(__lsx_vsrari_h, tmp2, FILTER_BITS, tmp3, FILTER_BITS, tmp2, tmp3); + dst += dst_stride; + tmp4 = __lsx_vpickev_b(tmp3, tmp2); + __lsx_vst(tmp4, dst, 16); + + DUP2_ARG2(__lsx_vilvl_b, src8, src7, src9, src8, vec4, vec6); + DUP2_ARG2(__lsx_vilvh_b, src8, src7, src9, src8, vec5, vec7); + DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1); + DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1); + dst += dst_stride; + tmp4 = __lsx_vpickev_b(tmp1, tmp0); + __lsx_vst(tmp4, dst, 16); + + DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp2, tmp3); + DUP2_ARG2(__lsx_vsrari_h, tmp2, FILTER_BITS, tmp3, FILTER_BITS, tmp2, tmp3); + dst += dst_stride; + tmp4 = __lsx_vpickev_b(tmp3, tmp2); + __lsx_vst(tmp4, dst, 16); + + dst += dst_stride; + + src0 = src4; + src5 = src9; + } +} + +static void common_vt_2t_64w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt = (height >> 1); + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + __m128i src11, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; + __m128i tmp, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + + int32_t src_stride2 = src_stride << 1; + int32_t dst_stride2 = dst_stride << 1; + uint8_t *dst_tmp1 = dst + dst_stride; + + filt0 = __lsx_vldrepl_h(filter, 0); + + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src3, src6, + src9); + src += src_stride; + + for (; loop_cnt--;) { + uint8_t *src_tmp0 = src + src_stride; + + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src1, src4, src7, + src10); + DUP4_ARG2(__lsx_vld, src_tmp0, 0, src_tmp0, 16, src_tmp0, 32, src_tmp0, 48, + src2, src5, src8, src11); + src += src_stride2; + + DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec2); + DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, vec1, vec3); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1); + DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1); + tmp = __lsx_vpickev_b(tmp1, tmp0); + __lsx_vst(tmp, dst, 0); + + DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp2, tmp3); + DUP2_ARG2(__lsx_vsrari_h, tmp2, FILTER_BITS, tmp3, FILTER_BITS, tmp2, tmp3); + tmp = __lsx_vpickev_b(tmp3, tmp2); + __lsx_vst(tmp, dst_tmp1, 0); + + DUP2_ARG2(__lsx_vilvl_b, src4, src3, src5, src4, vec4, vec6); + DUP2_ARG2(__lsx_vilvh_b, src4, src3, src5, src4, vec5, vec7); + DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp4, tmp5); + DUP2_ARG2(__lsx_vsrari_h, tmp4, FILTER_BITS, tmp5, FILTER_BITS, tmp4, tmp5); + tmp = __lsx_vpickev_b(tmp5, tmp4); + __lsx_vst(tmp, dst, 16); + + DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp6, tmp7); + DUP2_ARG2(__lsx_vsrari_h, tmp6, FILTER_BITS, tmp7, FILTER_BITS, tmp6, tmp7); + tmp = __lsx_vpickev_b(tmp7, tmp6); + __lsx_vst(tmp, dst_tmp1, 16); + + DUP2_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, vec0, vec2); + DUP2_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, vec1, vec3); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1); + DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1); + tmp = __lsx_vpickev_b(tmp1, tmp0); + __lsx_vst(tmp, dst, 32); + + DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp2, tmp3); + DUP2_ARG2(__lsx_vsrari_h, tmp2, FILTER_BITS, tmp3, FILTER_BITS, tmp2, tmp3); + tmp = __lsx_vpickev_b(tmp3, tmp2); + __lsx_vst(tmp, dst_tmp1, 32); + + DUP2_ARG2(__lsx_vilvl_b, src10, src9, src11, src10, vec4, vec6); + DUP2_ARG2(__lsx_vilvh_b, src10, src9, src11, src10, vec5, vec7); + DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp4, tmp5); + DUP2_ARG2(__lsx_vsrari_h, tmp4, FILTER_BITS, tmp5, FILTER_BITS, tmp4, tmp5); + tmp = __lsx_vpickev_b(tmp5, tmp4); + __lsx_vst(tmp, dst, 48); + + DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp6, tmp7); + DUP2_ARG2(__lsx_vsrari_h, tmp6, FILTER_BITS, tmp7, FILTER_BITS, tmp6, tmp7); + tmp = __lsx_vpickev_b(tmp7, tmp6); + __lsx_vst(tmp, dst_tmp1, 48); + dst += dst_stride2; + dst_tmp1 += dst_stride2; + + src0 = src2; + src3 = src5; + src6 = src8; + src9 = src11; + } +} + +void vpx_convolve8_vert_lsx(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + const int16_t *const filter_y = filter[y0_q4]; + int8_t cnt, filt_ver[8]; + + assert(y_step_q4 == 16); + assert(((const int32_t *)filter_y)[1] != 0x800000); + + for (cnt = 8; cnt--;) { + filt_ver[cnt] = filter_y[cnt]; + } + + if (vpx_get_filter_taps(filter_y) == 2) { + switch (w) { + case 4: + common_vt_2t_4w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + &filt_ver[3], h); + break; + case 8: + common_vt_2t_8w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + &filt_ver[3], h); + break; + case 16: + common_vt_2t_16w_lsx(src, src_stride, dst, dst_stride, filter, y0_q4, + y_step_q4, w, h); + break; + case 32: + common_vt_2t_32w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + &filt_ver[3], h); + break; + case 64: + common_vt_2t_64w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + &filt_ver[3], h); + break; + default: + vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); + break; + } + } else { + switch (w) { + case 4: + common_vt_8t_4w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_ver, h); + break; + case 8: + common_vt_8t_8w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_ver, h); + break; + case 16: + common_vt_8t_16w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_ver, h); + break; + case 32: + common_vt_8t_32w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_ver, h); + break; + case 64: + common_vt_8t_64w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_ver, h); + break; + default: + vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); + break; + } + } +} diff --git a/vpx_dsp/loongarch/vpx_convolve_lsx.h b/vpx_dsp/loongarch/vpx_convolve_lsx.h new file mode 100644 index 0000000000..2fdb93db84 --- /dev/null +++ b/vpx_dsp/loongarch/vpx_convolve_lsx.h @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_LOONGARCH_VPX_CONVOLVE_LSX_H_ +#define VPX_VPX_DSP_LOONGARCH_VPX_CONVOLVE_LSX_H_ + +#include "vpx_util/loongson_intrinsics.h" +#include "vpx_dsp/vpx_filter.h" + +#define FILT_8TAP_DPADD_S_H(_reg0, _reg1, _reg2, _reg3, _filter0, _filter1, \ + _filter2, _filter3) \ + ({ \ + __m128i _vec0, _vec1; \ + \ + _vec0 = __lsx_vdp2_h_b(_reg0, _filter0); \ + _vec0 = __lsx_vdp2add_h_b(_vec0, _reg1, _filter1); \ + _vec1 = __lsx_vdp2_h_b(_reg2, _filter2); \ + _vec1 = __lsx_vdp2add_h_b(_vec1, _reg3, _filter3); \ + _vec0 = __lsx_vsadd_h(_vec0, _vec1); \ + \ + _vec0; \ + }) + +#define HORIZ_8TAP_FILT(_src0, _src1, _mask0, _mask1, _mask2, _mask3, \ + _filt_h0, _filt_h1, _filt_h2, _filt_h3) \ + ({ \ + __m128i _tmp0, _tmp1, _tmp2, _tmp3; \ + __m128i _out; \ + \ + DUP4_ARG3(__lsx_vshuf_b, _src1, _src0, _mask0, _src1, _src0, _mask1, \ + _src1, _src0, _mask2, _src1, _src0, _mask3, _tmp0, _tmp1, _tmp2, \ + _tmp3); \ + _out = FILT_8TAP_DPADD_S_H(_tmp0, _tmp1, _tmp2, _tmp3, _filt_h0, _filt_h1, \ + _filt_h2, _filt_h3); \ + _out = __lsx_vsrari_h(_out, FILTER_BITS); \ + _out = __lsx_vsat_h(_out, 7); \ + \ + _out; \ + }) + +#define HORIZ_8TAP_4WID_4VECS_FILT(_src0, _src1, _src2, _src3, _mask0, _mask1, \ + _mask2, _mask3, _filter0, _filter1, \ + _filter2, _filter3, _out0, _out1) \ + { \ + __m128i _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7; \ + __m128i _reg0, _reg1, _reg2, _reg3; \ + \ + DUP2_ARG3(__lsx_vshuf_b, _src1, _src0, _mask0, _src3, _src2, _mask0, \ + _tmp0, _tmp1); \ + DUP2_ARG2(__lsx_vdp2_h_b, _tmp0, _filter0, _tmp1, _filter0, _reg0, _reg1); \ + DUP2_ARG3(__lsx_vshuf_b, _src1, _src0, _mask1, _src3, _src2, _mask1, \ + _tmp2, _tmp3); \ + DUP2_ARG3(__lsx_vdp2add_h_b, _reg0, _tmp2, _filter1, _reg1, _tmp3, \ + _filter1, _reg0, _reg1); \ + DUP2_ARG3(__lsx_vshuf_b, _src1, _src0, _mask2, _src3, _src2, _mask2, \ + _tmp4, _tmp5); \ + DUP2_ARG2(__lsx_vdp2_h_b, _tmp4, _filter2, _tmp5, _filter2, _reg2, _reg3); \ + DUP2_ARG3(__lsx_vshuf_b, _src1, _src0, _mask3, _src3, _src2, _mask3, \ + _tmp6, _tmp7); \ + DUP2_ARG3(__lsx_vdp2add_h_b, _reg2, _tmp6, _filter3, _reg3, _tmp7, \ + _filter3, _reg2, _reg3); \ + DUP2_ARG2(__lsx_vsadd_h, _reg0, _reg2, _reg1, _reg3, _out0, _out1); \ + } + +#define HORIZ_8TAP_8WID_4VECS_FILT( \ + _src0, _src1, _src2, _src3, _mask0, _mask1, _mask2, _mask3, _filter0, \ + _filter1, _filter2, _filter3, _out0, _out1, _out2, _out3) \ + { \ + __m128i _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7; \ + __m128i _reg0, _reg1, _reg2, _reg3, _reg4, _reg5, _reg6, _reg7; \ + \ + DUP4_ARG3(__lsx_vshuf_b, _src0, _src0, _mask0, _src1, _src1, _mask0, \ + _src2, _src2, _mask0, _src3, _src3, _mask0, _tmp0, _tmp1, _tmp2, \ + _tmp3); \ + DUP4_ARG2(__lsx_vdp2_h_b, _tmp0, _filter0, _tmp1, _filter0, _tmp2, \ + _filter0, _tmp3, _filter0, _reg0, _reg1, _reg2, _reg3); \ + DUP4_ARG3(__lsx_vshuf_b, _src0, _src0, _mask2, _src1, _src1, _mask2, \ + _src2, _src2, _mask2, _src3, _src3, _mask2, _tmp0, _tmp1, _tmp2, \ + _tmp3); \ + DUP4_ARG2(__lsx_vdp2_h_b, _tmp0, _filter2, _tmp1, _filter2, _tmp2, \ + _filter2, _tmp3, _filter2, _reg4, _reg5, _reg6, _reg7); \ + DUP4_ARG3(__lsx_vshuf_b, _src0, _src0, _mask1, _src1, _src1, _mask1, \ + _src2, _src2, _mask1, _src3, _src3, _mask1, _tmp4, _tmp5, _tmp6, \ + _tmp7); \ + DUP4_ARG3(__lsx_vdp2add_h_b, _reg0, _tmp4, _filter1, _reg1, _tmp5, \ + _filter1, _reg2, _tmp6, _filter1, _reg3, _tmp7, _filter1, _reg0, \ + _reg1, _reg2, _reg3); \ + DUP4_ARG3(__lsx_vshuf_b, _src0, _src0, _mask3, _src1, _src1, _mask3, \ + _src2, _src2, _mask3, _src3, _src3, _mask3, _tmp4, _tmp5, _tmp6, \ + _tmp7); \ + DUP4_ARG3(__lsx_vdp2add_h_b, _reg4, _tmp4, _filter3, _reg5, _tmp5, \ + _filter3, _reg6, _tmp6, _filter3, _reg7, _tmp7, _filter3, _reg4, \ + _reg5, _reg6, _reg7); \ + DUP4_ARG2(__lsx_vsadd_h, _reg0, _reg4, _reg1, _reg5, _reg2, _reg6, _reg3, \ + _reg7, _out0, _out1, _out2, _out3); \ + } + +#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift) \ + ({ \ + __m128i tmp0_m; \ + __m128i tmp1_m; \ + \ + tmp0_m = __lsx_vshuf_b(in1, in0, mask); \ + tmp1_m = __lsx_vdp2_h_bu(tmp0_m, coeff); \ + tmp1_m = __lsx_vsrari_h(tmp1_m, shift); \ + \ + tmp1_m; \ + }) + +#endif // VPX_VPX_DSP_LOONGARCH_VPX_CONVOLVE_LSX_H_ diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index 976c652729..91a983d097 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -163,6 +163,12 @@ DSP_SRCS-$(HAVE_DSPR2) += mips/convolve8_vert_dspr2.c DSP_SRCS-$(HAVE_VSX) += ppc/vpx_convolve_vsx.c +# common (lsx) +DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve8_horiz_lsx.c +DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve8_vert_lsx.c +DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve8_lsx.c +DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve_lsx.h + # loop filters DSP_SRCS-yes += loopfilter.c diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index ce0780fdab..49eea44389 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -374,13 +374,13 @@ () specialize qw/vpx_convolve_avg neon dspr2 msa sse2 vsx mmi/; add_proto qw/void vpx_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; -specialize qw/vpx_convolve8 sse2 ssse3 avx2 neon dspr2 msa vsx mmi/; +specialize qw/vpx_convolve8 sse2 ssse3 avx2 neon dspr2 msa vsx mmi lsx/; add_proto qw/void vpx_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; -specialize qw/vpx_convolve8_horiz sse2 ssse3 avx2 neon dspr2 msa vsx mmi/; +specialize qw/vpx_convolve8_horiz sse2 ssse3 avx2 neon dspr2 msa vsx mmi lsx/; add_proto qw/void vpx_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; -specialize qw/vpx_convolve8_vert sse2 ssse3 avx2 neon dspr2 msa vsx mmi/; +specialize qw/vpx_convolve8_vert sse2 ssse3 avx2 neon dspr2 msa vsx mmi lsx/; add_proto qw/void vpx_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; specialize qw/vpx_convolve8_avg sse2 ssse3 avx2 neon dspr2 msa vsx mmi/; From 31441d45f76819bb80dfc76f0a0f59f2501239e8 Mon Sep 17 00:00:00 2001 From: yuanhecai Date: Thu, 10 Mar 2022 14:56:42 +0800 Subject: [PATCH 0294/1000] vp9[loongarch]: Optimize convolve/convolve8_avg_c 1. vpx_convolve8_avg_lsx 2. vpx_convolve_avg_lsx Bug: webm:1755 Change-Id: I4af5c362a94f11d0b5d1760e18326660bdbc0559 --- test/convolve_test.cc | 4 +- vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c | 748 ++++++++++++++++++++++ vpx_dsp/loongarch/vpx_convolve_avg_lsx.c | 321 ++++++++++ vpx_dsp/vpx_dsp.mk | 2 + vpx_dsp/vpx_dsp_rtcd_defs.pl | 4 +- 5 files changed, 1075 insertions(+), 4 deletions(-) create mode 100644 vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c create mode 100644 vpx_dsp/loongarch/vpx_convolve_avg_lsx.c diff --git a/test/convolve_test.cc b/test/convolve_test.cc index 94b2814842..a631ec77f7 100644 --- a/test/convolve_test.cc +++ b/test/convolve_test.cc @@ -1451,9 +1451,9 @@ INSTANTIATE_TEST_SUITE_P(MSA, ConvolveTest, #if HAVE_LSX const ConvolveFunctions convolve8_lsx( - vpx_convolve_copy_c, vpx_convolve_avg_c, vpx_convolve8_horiz_lsx, + vpx_convolve_copy_c, vpx_convolve_avg_lsx, vpx_convolve8_horiz_lsx, vpx_convolve8_avg_horiz_c, vpx_convolve8_vert_lsx, vpx_convolve8_avg_vert_c, - vpx_convolve8_lsx, vpx_convolve8_avg_c, vpx_scaled_horiz_c, + vpx_convolve8_lsx, vpx_convolve8_avg_lsx, vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c, vpx_scaled_vert_c, vpx_scaled_avg_vert_c, vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0); diff --git a/vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c b/vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c new file mode 100644 index 0000000000..27f5b5ca4f --- /dev/null +++ b/vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c @@ -0,0 +1,748 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/loongarch/vpx_convolve_lsx.h" + +static void common_hv_8ht_8vt_and_aver_dst_4w_lsx( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height) { + uint32_t loop_cnt = height >> 2; + uint8_t *dst_tmp = dst; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + __m128i filt_hz0, filt_hz1, filt_hz2, filt_hz3; + __m128i filt_vt0, filt_vt1, filt_vt2, filt_vt3; + __m128i mask0, mask1, mask2, mask3; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; + __m128i out0, out1; + __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride + src_stride2; + int32_t src_stride4 = src_stride2 << 1; + uint8_t *_src = (uint8_t *)src - 3 - src_stride3; + + mask0 = __lsx_vld(mc_filt_mask_arr, 16); + DUP4_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filter_horiz, 4, + filter_horiz, 6, filt_hz0, filt_hz1, filt_hz2, filt_hz3); + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + mask3 = __lsx_vaddi_bu(mask0, 6); + + src0 = __lsx_vld(_src, 0); + DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2); + src3 = __lsx_vldx(_src, src_stride3); + _src += src_stride4; + src4 = __lsx_vld(_src, 0); + DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src5, src6); + _src += src_stride3; + + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5); + src6 = __lsx_vxori_b(src6, 128); + + tmp0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + tmp2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + tmp4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + tmp5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + DUP2_ARG3(__lsx_vshuf_b, tmp2, tmp0, shuff, tmp4, tmp2, shuff, tmp1, tmp3); + DUP4_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filter_vert, 4, + filter_vert, 6, filt_vt0, filt_vt1, filt_vt2, filt_vt3); + DUP2_ARG2(__lsx_vpackev_b, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1); + tmp2 = __lsx_vpackev_b(tmp5, tmp4); + for (; loop_cnt--;) { + src7 = __lsx_vld(_src, 0); + DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src8, src9); + src10 = __lsx_vldx(_src, src_stride3); + _src += src_stride4; + src2 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + src3 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + src4 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + src5 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + DUP2_ARG2(__lsx_vilvl_w, src3, src2, src5, src4, src2, src3); + src2 = __lsx_vilvl_d(src3, src2); + DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7, + src8, src9, src10); + tmp3 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + tmp4 = __lsx_vshuf_b(tmp3, tmp5, shuff); + tmp4 = __lsx_vpackev_b(tmp3, tmp4); + out0 = FILT_8TAP_DPADD_S_H(tmp0, tmp1, tmp2, tmp4, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + src1 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src0 = __lsx_vshuf_b(src1, tmp3, shuff); + src0 = __lsx_vpackev_b(src1, src0); + out1 = FILT_8TAP_DPADD_S_H(tmp1, tmp2, tmp4, src0, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + out0 = __lsx_vssrarni_b_h(out1, out0, 7); + out0 = __lsx_vxori_b(out0, 128); + out0 = __lsx_vavgr_bu(out0, src2); + __lsx_vstelm_w(out0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 3); + dst += dst_stride; + + tmp5 = src1; + tmp0 = tmp2; + tmp1 = tmp4; + tmp2 = src0; + } +} + +static void common_hv_8ht_8vt_and_aver_dst_8w_lsx( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height) { + uint32_t loop_cnt = height >> 2; + uint8_t *dst_tmp = dst; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + __m128i filt_hz0, filt_hz1, filt_hz2, filt_hz3; + __m128i filt_vt0, filt_vt1, filt_vt2, filt_vt3; + __m128i mask0, mask1, mask2, mask3; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6; + __m128i out0, out1; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride + src_stride2; + int32_t src_stride4 = src_stride2 << 1; + uint8_t *_src = (uint8_t *)src - 3 - src_stride3; + + mask0 = __lsx_vld(mc_filt_mask_arr, 0); + DUP4_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filter_horiz, 4, + filter_horiz, 6, filt_hz0, filt_hz1, filt_hz2, filt_hz3); + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + mask3 = __lsx_vaddi_bu(mask0, 6); + + src0 = __lsx_vld(_src, 0); + DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2); + src3 = __lsx_vldx(_src, src_stride3); + _src += src_stride4; + src4 = __lsx_vld(_src, 0); + DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src5, src6); + _src += src_stride3; + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5); + src6 = __lsx_vxori_b(src6, 128); + + src0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + + DUP4_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filter_vert, 4, + filter_vert, 6, filt_vt0, filt_vt1, filt_vt2, filt_vt3); + DUP4_ARG2(__lsx_vpackev_b, src1, src0, src3, src2, src5, src4, src2, src1, + tmp0, tmp1, tmp2, tmp4); + DUP2_ARG2(__lsx_vpackev_b, src4, src3, src6, src5, tmp5, tmp6); + + for (; loop_cnt--;) { + src7 = __lsx_vld(_src, 0); + DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src8, src9); + src10 = __lsx_vldx(_src, src_stride3); + _src += src_stride4; + + DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7, + src8, src9, src10); + src7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + tmp3 = __lsx_vpackev_b(src7, src6); + out0 = FILT_8TAP_DPADD_S_H(tmp0, tmp1, tmp2, tmp3, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + src8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src0 = __lsx_vpackev_b(src8, src7); + out1 = FILT_8TAP_DPADD_S_H(tmp4, tmp5, tmp6, src0, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + src9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src1 = __lsx_vpackev_b(src9, src8); + src3 = FILT_8TAP_DPADD_S_H(tmp1, tmp2, tmp3, src1, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + src10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src2 = __lsx_vpackev_b(src10, src9); + src4 = FILT_8TAP_DPADD_S_H(tmp5, tmp6, src0, src2, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, src4, src3, 7, out0, out1); + DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); + src5 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + src7 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + src8 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + src9 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + DUP2_ARG2(__lsx_vilvl_d, src7, src5, src9, src8, src5, src7); + DUP2_ARG2(__lsx_vavgr_bu, out0, src5, out1, src7, out0, out1); + __lsx_vstelm_d(out0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(out0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(out1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(out1, dst, 0, 1); + dst += dst_stride; + + src6 = src10; + tmp0 = tmp2; + tmp1 = tmp3; + tmp2 = src1; + tmp4 = tmp6; + tmp5 = src0; + tmp6 = src2; + } +} + +static void common_hv_8ht_8vt_and_aver_dst_16w_lsx( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height) { + common_hv_8ht_8vt_and_aver_dst_8w_lsx(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert, height); + src += 8; + dst += 8; + + common_hv_8ht_8vt_and_aver_dst_8w_lsx(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert, height); + src += 8; + dst += 8; +} + +static void common_hv_8ht_8vt_and_aver_dst_32w_lsx( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height) { + int32_t multiple8_cnt; + + for (multiple8_cnt = 4; multiple8_cnt--;) { + common_hv_8ht_8vt_and_aver_dst_8w_lsx(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert, height); + src += 8; + dst += 8; + } +} + +static void common_hv_8ht_8vt_and_aver_dst_64w_lsx( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height) { + int32_t multiple8_cnt; + + for (multiple8_cnt = 8; multiple8_cnt--;) { + common_hv_8ht_8vt_and_aver_dst_8w_lsx(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert, height); + + src += 8; + dst += 8; + } +} + +static void common_hv_2ht_2vt_and_aver_dst_4x4_lsx( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert) { + __m128i src0, src1, src2, src3, src4, mask; + __m128i filt_hz, filt_vt, vec0, vec1; + __m128i dst0, dst1, dst2, dst3; + __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1, out; + __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + + mask = __lsx_vld(mc_filt_mask_arr, 16); + /* rearranging filter */ + DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_vert, 0, filt_hz, filt_vt); + + src0 = __lsx_vld(src, 0); + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src1, src2, src3, src4); + + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS); + hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + hz_out1 = __lsx_vshuf_b(hz_out2, hz_out0, shuff); + hz_out3 = __lsx_vpickod_d(hz_out4, hz_out2); + DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + + dst0 = __lsx_vldrepl_w(dst, 0); + dst1 = __lsx_vldrepl_w(dst + dst_stride, 0); + dst2 = __lsx_vldrepl_w(dst + dst_stride2, 0); + dst3 = __lsx_vldrepl_w(dst + dst_stride3, 0); + dst0 = __lsx_vilvl_w(dst1, dst0); + dst1 = __lsx_vilvl_w(dst3, dst2); + dst0 = __lsx_vilvl_d(dst1, dst0); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); + DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1); + out = __lsx_vpickev_b(tmp1, tmp0); + out = __lsx_vavgr_bu(out, dst0); + __lsx_vstelm_w(out, dst, 0, 0); + __lsx_vstelm_w(out, dst + dst_stride, 0, 1); + __lsx_vstelm_w(out, dst + dst_stride2, 0, 2); + __lsx_vstelm_w(out, dst + dst_stride3, 0, 3); +} + +static void common_hv_2ht_2vt_and_aver_dst_4x8_lsx( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert) { + uint8_t *dst_tmp1; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, mask; + __m128i filt_hz, filt_vt, vec0, vec1, vec2, vec3, res0, res1; + __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; + __m128i hz_out7, hz_out8, tmp0, tmp1, tmp2, tmp3; + __m128i dst0, dst1, dst2, dst3, dst4; + __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + mask = __lsx_vld(mc_filt_mask_arr, 16); + + /* rearranging filter */ + filt_hz = __lsx_vldrepl_h(filter_horiz, 0); + filt_vt = __lsx_vldrepl_h(filter_vert, 0); + + src0 = __lsx_vld(src, 0); + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src1, src2, src3, src4); + src += src_stride4; + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src5, src6, src7, src8); + src += src_stride4; + + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS); + hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, FILTER_BITS); + hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, FILTER_BITS); + hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, FILTER_BITS); + DUP2_ARG3(__lsx_vshuf_b, hz_out2, hz_out0, shuff, hz_out4, hz_out2, shuff, + hz_out1, hz_out3); + hz_out5 = __lsx_vshuf_b(hz_out6, hz_out4, shuff); + hz_out7 = __lsx_vpickod_d(hz_out8, hz_out6); + + dst0 = __lsx_vldrepl_w(dst, 0); + dst += dst_stride; + dst1 = __lsx_vldrepl_w(dst, 0); + dst += dst_stride; + dst2 = __lsx_vldrepl_w(dst, 0); + dst += dst_stride; + dst3 = __lsx_vldrepl_w(dst, 0); + dst += dst_stride; + dst0 = __lsx_vilvl_w(dst1, dst0); + dst1 = __lsx_vilvl_w(dst3, dst2); + dst0 = __lsx_vilvl_d(dst1, dst0); + + dst1 = __lsx_vldrepl_w(dst, 0); + dst += dst_stride; + dst2 = __lsx_vldrepl_w(dst, 0); + dst += dst_stride; + dst3 = __lsx_vldrepl_w(dst, 0); + dst += dst_stride; + dst4 = __lsx_vldrepl_w(dst, 0); + dst += dst_stride; + dst1 = __lsx_vilvl_w(dst2, dst1); + dst2 = __lsx_vilvl_w(dst4, dst3); + dst1 = __lsx_vilvl_d(dst2, dst1); + + DUP4_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, hz_out5, + hz_out4, hz_out7, hz_out6, vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, vec2, filt_vt, vec3, + filt_vt, tmp0, tmp1, tmp2, tmp3); + DUP4_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp2, + FILTER_BITS, tmp3, FILTER_BITS, tmp0, tmp1, tmp2, tmp3); + DUP2_ARG2(__lsx_vpickev_b, tmp1, tmp0, tmp3, tmp2, res0, res1); + DUP2_ARG2(__lsx_vavgr_bu, res0, dst0, res1, dst1, res0, res1); + + dst_tmp1 = dst; + __lsx_vstelm_w(res0, dst_tmp1, 0, 0); + dst_tmp1 += dst_stride; + __lsx_vstelm_w(res0, dst_tmp1, 0, 1); + dst_tmp1 += dst_stride; + __lsx_vstelm_w(res0, dst_tmp1, 0, 2); + dst_tmp1 += dst_stride; + __lsx_vstelm_w(res0, dst_tmp1, 0, 3); + dst_tmp1 += dst_stride; + + __lsx_vstelm_w(res1, dst_tmp1, 0, 0); + dst_tmp1 += dst_stride; + __lsx_vstelm_w(res1, dst_tmp1, 0, 1); + dst_tmp1 += dst_stride; + __lsx_vstelm_w(res1, dst_tmp1, 0, 2); + dst_tmp1 += dst_stride; + __lsx_vstelm_w(res1, dst_tmp1, 0, 3); +} + +static void common_hv_2ht_2vt_and_aver_dst_4w_lsx( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { + if (height == 4) { + common_hv_2ht_2vt_and_aver_dst_4x4_lsx(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert); + } else if (height == 8) { + common_hv_2ht_2vt_and_aver_dst_4x8_lsx(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert); + } +} + +static void common_hv_2ht_2vt_and_aver_dst_8x4_lsx( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert) { + __m128i src0, src1, src2, src3, src4, mask; + __m128i filt_hz, filt_vt, vec0, vec1, vec2, vec3; + __m128i hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3; + __m128i dst0, dst1, dst2, dst3; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + uint8_t *dst_tmp = dst; + mask = __lsx_vld(mc_filt_mask_arr, 0); + /* rearranging filter */ + filt_hz = __lsx_vldrepl_h(filter_horiz, 0); + filt_vt = __lsx_vldrepl_h(filtrt_ver, 0); + + src0 = __lsx_vld(src, 0); + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src1, src2, src3, src4); + src += (src_stride4 + src_stride); + + dst0 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst1 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst2 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1); + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); + hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + vec0 = __lsx_vpackev_b(hz_out1, hz_out0); + tmp0 = __lsx_vdp2_h_bu(vec0, filt_vt); + + hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); + vec1 = __lsx_vpackev_b(hz_out0, hz_out1); + tmp1 = __lsx_vdp2_h_bu(vec1, filt_vt); + + hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); + vec2 = __lsx_vpackev_b(hz_out1, hz_out0); + tmp2 = __lsx_vdp2_h_bu(vec2, filt_vt); + + hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + vec3 = __lsx_vpackev_b(hz_out0, hz_out1); + tmp3 = __lsx_vdp2_h_bu(vec1, filt_vt); + + DUP4_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp2, + FILTER_BITS, tmp3, FILTER_BITS, tmp0, tmp1, tmp2, tmp3); + PCKEV_AVG_ST4_D(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride); + dst -= dst_stride * 3; +} + +static void common_hv_2ht_2vt_and_aver_dst_8x8mult_lsx( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { + uint32_t loop_cnt = (height >> 2); + __m128i src0, src1, src2, src3, src4, mask; + __m128i filt_hz, filt_vt, vec0; + __m128i hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3; + __m128i dst0, dst1, dst2, dst3; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + uint8_t *dst_tmp = dst; + + /* rearranging filter */ + mask = __lsx_vld(mc_filt_mask_arr, 0); + + filt_hz = __lsx_vldrepl_h(filter_horiz, 0); + filt_vt = __lsx_vldrepl_h(filter_vert, 0); + + src0 = __lsx_vld(src, 0); + src += src_stride; + + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); + + for (; loop_cnt--;) { + src1 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3); + src4 = __lsx_vlds(src, src_stride3); + src += src_stride4; + + hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + vec0 = __lsx_vpackev_b(hz_out1, hz_out0); + tmp0 = __lsx_vavgr_bu(vec0, filt_vt); + + hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); + vec0 = __lsx_vpackev_b(hz_out0, hz_out1); + tmp1 = __lsx_vavgr_bu(vec0, filt_vt); + + DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1); + + hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); + vec0 = __lsx_vpackev_b(hz_out1, hz_out0); + tmp2 = __lsx_vavgr_bu(vec0, filt_vt); + + hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + vec0 = __lsx_vpackev_b(hz_out0, hz_out1); + tmp3 = __lsx_vavgr_bu(vec0, filt_vt); + + DUP2_ARG2(__lsx_vsrari_h, tmp2, FILTER_BITS, tmp3, FILTER_BITS, tmp0, tmp1); + + dst0 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst1 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst2 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1); + PCKEV_AVG_ST4_D(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride); + dst += dst_stride; + } +} + +static void common_hv_2ht_2vt_and_aver_dst_8w_lsx( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { + if (height == 4) { + common_hv_2ht_2vt_and_aver_dst_8x4_lsx(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert); + } else { + common_hv_2ht_2vt_and_aver_dst_8x8mult_lsx( + src, src_stride, dst, dst_stride, filter_horiz, filter_vert, height); + } +} + +static void common_hv_2ht_2vt_and_aver_dst_16w_lsx( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { + uint8_t *src_tmp1; + uint32_t loop_cnt = (height >> 2); + __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask; + __m128i filt_hz, filt_vt, vec0, vec1, dst0, dst1, dst2, dst3; + __m128i hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1, tmp3; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + int32_t dst_stride4 = dst_stride2 << 1; + + mask = __lsx_vld(mc_filt_mask_arr, 0); + /* rearranging filter */ + filt_hz = __lsx_vldrepl_h(filter_horiz, 0); + filt_vt = __lsx_vldrepl_h(filter_vert, 0); + + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1); + src += src_stride; + + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + + for (; loop_cnt--;) { + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src4); + src6 = __lsx_vldx(src, src_stride3); + src_tmp1 = (uint8_t *)(src + 8); + src1 = __lsx_vld(src_tmp1, 0); + DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2, src3, + src5); + src5 = __lsx_vldx(src_tmp1, src_stride3); + src += src_stride4; + dst0 = __lsx_vld(dst, 0); + DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, dst1, dst2); + dst3 = __lsx_vldx(dst, dst_stride3); + + hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); + hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DUP2_ARG2(__lsx_vavgr_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); + DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1); + tmp3 = __lsx_vpickev_b(tmp1, tmp0); + tmp3 = __lsx_vavgr_bu(tmp3, dst0); + __lsx_vst(tmp3, dst, 0); + dst += dst_stride; + + hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); + DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DUP2_ARG2(__lsx_vavgr_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); + DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1); + tmp3 = __lsx_vpickev_b(tmp1, tmp0); + tmp3 = __lsx_vavgr_bu(tmp3, dst1); + __lsx_vst(tmp3, dst, 0); + dst += dst_stride; + + hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS); + DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DUP2_ARG2(__lsx_vavgr_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); + DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1); + tmp3 = __lsx_vpickev_b(tmp1, tmp0); + tmp3 = __lsx_vavgr_bu(tmp3, dst2); + __lsx_vst(tmp3, dst, 0); + dst += dst_stride; + + hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); + DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DUP2_ARG2(__lsx_vavgr_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); + DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1); + tmp3 = __lsx_vpickev_b(tmp1, tmp0); + tmp3 = __lsx_vavgr_bu(tmp3, dst3); + __lsx_vst(tmp3, dst, 0); + dst += dst_stride; + } +} + +static void common_hv_2ht_2vt_and_aver_dst_32w_lsx( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { + common_hv_2ht_2vt_and_aver_dst_16w_lsx(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert, height); + src += 16; + dst += 16; + + common_hv_2ht_2vt_and_aver_dst_16w_lsx(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert, height); + src += 16; + dst += 16; +} + +static void common_hv_2ht_2vt_and_aver_dst_64w_lsx( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { + int32_t multiple8_cnt; + for (multiple8_cnt = 4; multiple8_cnt--;) { + common_hv_2ht_2vt_and_aver_dst_16w_lsx(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert, height); + src += 16; + dst += 16; + } +} + +void vpx_convolve8_avg_lsx(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, int x_step_q4, + int y0_q4, int y_step_q4, int w, int h) { + const int16_t *const filter_x = filter[x0_q4]; + const int16_t *const filter_y = filter[y0_q4]; + int8_t cnt, filt_hor[8], filt_ver[8]; + + assert(x_step_q4 == 16); + assert(y_step_q4 == 16); + assert(((const int32_t *)filter_x)[1] != 0x800000); + assert(((const int32_t *)filter_y)[1] != 0x800000); + + for (cnt = 0; cnt < 8; ++cnt) { + filt_hor[cnt] = filter_x[cnt]; + filt_ver[cnt] = filter_y[cnt]; + } + if (vpx_get_filter_taps(filter_x) == 2 && + vpx_get_filter_taps(filter_y) == 2) { + switch (w) { + case 4: + common_hv_2ht_2vt_and_aver_dst_4w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], + &filt_ver[3], h); + break; + case 8: + common_hv_2ht_2vt_and_aver_dst_8w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], + &filt_ver[3], h); + break; + case 16: + common_hv_2ht_2vt_and_aver_dst_16w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, + &filt_hor[3], &filt_ver[3], h); + break; + case 32: + common_hv_2ht_2vt_and_aver_dst_32w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, + &filt_hor[3], &filt_ver[3], h); + break; + case 64: + common_hv_2ht_2vt_and_aver_dst_64w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, + &filt_hor[3], &filt_ver[3], h); + break; + default: + vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); + break; + } + } else if (vpx_get_filter_taps(filter_x) == 2 || + vpx_get_filter_taps(filter_y) == 2) { + vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); + } else { + switch (w) { + case 4: + common_hv_8ht_8vt_and_aver_dst_4w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, + filt_ver, h); + break; + case 8: + common_hv_8ht_8vt_and_aver_dst_8w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, + filt_ver, h); + break; + case 16: + common_hv_8ht_8vt_and_aver_dst_16w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, + filt_ver, h); + break; + case 32: + common_hv_8ht_8vt_and_aver_dst_32w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, + filt_ver, h); + break; + case 64: + common_hv_8ht_8vt_and_aver_dst_64w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, + filt_ver, h); + break; + default: + vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); + break; + } + } +} diff --git a/vpx_dsp/loongarch/vpx_convolve_avg_lsx.c b/vpx_dsp/loongarch/vpx_convolve_avg_lsx.c new file mode 100644 index 0000000000..1dad29eeed --- /dev/null +++ b/vpx_dsp/loongarch/vpx_convolve_avg_lsx.c @@ -0,0 +1,321 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_util/loongson_intrinsics.h" + +static void avg_width4_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int32_t height) { + int32_t cnt; + __m128i src0, src1; + __m128i dst0, dst1; + + int32_t src_stride2 = src_stride << 1; + + if ((height % 2) == 0) { + for (cnt = (height / 2); cnt--;) { + src0 = __lsx_vld(src, 0); + src1 = __lsx_vldx(src, src_stride); + src += src_stride2; + + dst0 = __lsx_vld(dst, 0); + dst1 = __lsx_vldx(dst, dst_stride); + DUP2_ARG2(__lsx_vavgr_bu, src0, dst0, src1, dst1, dst0, dst1); + + __lsx_vstelm_w(dst0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(dst1, dst, 0, 0); + dst += dst_stride; + } + } +} + +static void avg_width8_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int32_t height) { + int32_t cnt = (height / 4); + __m128i src0, src1, src2, src3; + __m128i dst0, dst1, dst2, dst3; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + + for (; cnt--;) { + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + src += src_stride4; + + dst0 = __lsx_vld(dst, 0); + DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, dst1, dst2); + dst3 = __lsx_vldx(dst, dst_stride3); + + DUP4_ARG2(__lsx_vavgr_bu, src0, dst0, src1, dst1, src2, dst2, src3, dst3, + dst0, dst1, dst2, dst3); + + __lsx_vstelm_d(dst0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(dst1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(dst2, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(dst3, dst, 0, 0); + dst += dst_stride; + } +} + +static void avg_width16_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, int32_t height) { + int32_t cnt = (height / 8); + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + int32_t dst_stride4 = dst_stride2 << 1; + + for (; cnt--;) { + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + src += src_stride4; + src4 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src5, src6); + src7 = __lsx_vldx(src, src_stride3); + src += src_stride4; + + dst0 = __lsx_vld(dst, 0); + DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, dst1, dst2); + dst3 = __lsx_vldx(dst, dst_stride3); + dst += dst_stride4; + dst4 = __lsx_vld(dst, 0); + DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, dst5, dst6); + dst7 = __lsx_vldx(dst, dst_stride3); + dst -= dst_stride4; + + DUP4_ARG2(__lsx_vavgr_bu, src0, dst0, src1, dst1, src2, dst2, src3, dst3, + dst0, dst1, dst2, dst3); + DUP4_ARG2(__lsx_vavgr_bu, src4, dst4, src5, dst5, src6, dst6, src7, dst7, + dst4, dst5, dst6, dst7); + + __lsx_vst(dst0, dst, 0); + __lsx_vstx(dst1, dst, dst_stride); + __lsx_vstx(dst2, dst, dst_stride2); + __lsx_vstx(dst3, dst, dst_stride3); + dst += dst_stride4; + __lsx_vst(dst4, dst, 0); + __lsx_vstx(dst5, dst, dst_stride); + __lsx_vstx(dst6, dst, dst_stride2); + __lsx_vstx(dst7, dst, dst_stride3); + dst += dst_stride4; + } +} + +static void avg_width32_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, int32_t height) { + int32_t cnt = (height / 8); + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + __m128i src8, src9, src10, src11, src12, src13, src14, src15; + __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + __m128i dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + int32_t dst_stride4 = dst_stride2 << 1; + + for (; cnt--;) { + uint8_t *dst_tmp = dst; + uint8_t *dst_tmp1 = dst_tmp + 16; + uint8_t *src_tmp = src + 16; + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vld, src, 0, src_tmp, 0, src0, src1); + DUP4_ARG2(__lsx_vldx, src, src_stride, src_tmp, src_stride, src, + src_stride2, src_tmp, src_stride2, src2, src3, src4, src5); + DUP2_ARG2(__lsx_vldx, src, src_stride3, src_tmp, src_stride3, src6, src7); + src += src_stride4; + + DUP2_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp1, 0, dst0, dst1); + DUP4_ARG2(__lsx_vldx, dst_tmp, dst_stride, dst_tmp1, dst_stride, dst_tmp, + dst_stride2, dst_tmp1, dst_stride2, dst2, dst3, dst4, dst5); + DUP2_ARG2(__lsx_vldx, dst_tmp, dst_stride3, dst_tmp1, dst_stride3, dst6, + dst7); + dst_tmp += dst_stride4; + dst_tmp1 += dst_stride4; + + src_tmp = src + 16; + DUP2_ARG2(__lsx_vld, src, 0, src_tmp, 0, src8, src9); + DUP4_ARG2(__lsx_vldx, src, src_stride, src_tmp, src_stride, src, + src_stride2, src_tmp, src_stride2, src10, src11, src12, src13); + DUP2_ARG2(__lsx_vldx, src, src_stride3, src_tmp, src_stride3, src14, src15); + src += src_stride4; + + DUP2_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp1, 0, dst8, dst9); + DUP4_ARG2(__lsx_vldx, dst_tmp, dst_stride, dst_tmp1, dst_stride, dst_tmp, + dst_stride2, dst_tmp1, dst_stride2, dst10, dst11, dst12, dst13); + DUP2_ARG2(__lsx_vldx, dst_tmp, dst_stride3, dst_tmp1, dst_stride3, dst14, + dst15); + DUP4_ARG2(__lsx_vavgr_bu, src0, dst0, src1, dst1, src2, dst2, src3, dst3, + dst0, dst1, dst2, dst3); + DUP4_ARG2(__lsx_vavgr_bu, src4, dst4, src5, dst5, src6, dst6, src7, dst7, + dst4, dst5, dst6, dst7); + DUP4_ARG2(__lsx_vavgr_bu, src8, dst8, src9, dst9, src10, dst10, src11, + dst11, dst8, dst9, dst10, dst11); + DUP4_ARG2(__lsx_vavgr_bu, src12, dst12, src13, dst13, src14, dst14, src15, + dst15, dst12, dst13, dst14, dst15); + + dst_tmp = dst + 16; + __lsx_vst(dst0, dst, 0); + __lsx_vstx(dst2, dst, dst_stride); + __lsx_vstx(dst4, dst, dst_stride2); + __lsx_vstx(dst6, dst, dst_stride3); + __lsx_vst(dst1, dst_tmp, 0); + __lsx_vstx(dst3, dst_tmp, dst_stride); + __lsx_vstx(dst5, dst_tmp, dst_stride2); + __lsx_vstx(dst7, dst_tmp, dst_stride3); + dst += dst_stride4; + + __lsx_vst(dst8, dst, 0); + __lsx_vstx(dst10, dst, dst_stride); + __lsx_vstx(dst12, dst, dst_stride2); + __lsx_vstx(dst14, dst, dst_stride3); + __lsx_vst(dst9, dst_tmp1, 0); + __lsx_vstx(dst11, dst_tmp1, dst_stride); + __lsx_vstx(dst13, dst_tmp1, dst_stride2); + __lsx_vstx(dst15, dst_tmp1, dst_stride3); + dst += dst_stride4; + } +} + +static void avg_width64_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, int32_t height) { + int32_t cnt = (height / 4); + uint8_t *dst_tmp = dst; + + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + __m128i src8, src9, src10, src11, src12, src13, src14, src15; + __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + __m128i dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15; + + for (; cnt--;) { + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2, + src3); + src += src_stride; + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src4, src5, src6, + src7); + src += src_stride; + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src8, src9, src10, + src11); + src += src_stride; + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src12, src13, src14, + src15); + src += src_stride; + + DUP4_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp, 16, dst_tmp, 32, dst_tmp, 48, + dst0, dst1, dst2, dst3); + dst_tmp += dst_stride; + DUP4_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp, 16, dst_tmp, 32, dst_tmp, 48, + dst4, dst5, dst6, dst7); + dst_tmp += dst_stride; + DUP4_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp, 16, dst_tmp, 32, dst_tmp, 48, + dst8, dst9, dst10, dst11); + dst_tmp += dst_stride; + DUP4_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp, 16, dst_tmp, 32, dst_tmp, 48, + dst12, dst13, dst14, dst15); + dst_tmp += dst_stride; + + DUP4_ARG2(__lsx_vavgr_bu, src0, dst0, src1, dst1, src2, dst2, src3, dst3, + dst0, dst1, dst2, dst3); + DUP4_ARG2(__lsx_vavgr_bu, src4, dst4, src5, dst5, src6, dst6, src7, dst7, + dst4, dst5, dst6, dst7); + DUP4_ARG2(__lsx_vavgr_bu, src8, dst8, src9, dst9, src10, dst10, src11, + dst11, dst8, dst9, dst10, dst11); + DUP4_ARG2(__lsx_vavgr_bu, src12, dst12, src13, dst13, src14, dst14, src15, + dst15, dst12, dst13, dst14, dst15); + + __lsx_vst(dst0, dst, 0); + __lsx_vst(dst1, dst, 16); + __lsx_vst(dst2, dst, 32); + __lsx_vst(dst3, dst, 48); + dst += dst_stride; + __lsx_vst(dst4, dst, 0); + __lsx_vst(dst5, dst, 16); + __lsx_vst(dst6, dst, 32); + __lsx_vst(dst7, dst, 48); + dst += dst_stride; + __lsx_vst(dst8, dst, 0); + __lsx_vst(dst9, dst, 16); + __lsx_vst(dst10, dst, 32); + __lsx_vst(dst11, dst, 48); + dst += dst_stride; + __lsx_vst(dst12, dst, 0); + __lsx_vst(dst13, dst, 16); + __lsx_vst(dst14, dst, 32); + __lsx_vst(dst15, dst, 48); + dst += dst_stride; + } +} + +void vpx_convolve_avg_lsx(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int32_t y_step_q4, + int32_t w, int32_t h) { + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + switch (w) { + case 4: { + avg_width4_lsx(src, src_stride, dst, dst_stride, h); + break; + } + + case 8: { + avg_width8_lsx(src, src_stride, dst, dst_stride, h); + break; + } + case 16: { + avg_width16_lsx(src, src_stride, dst, dst_stride, h); + break; + } + case 32: { + avg_width32_lsx(src, src_stride, dst, dst_stride, h); + break; + } + case 64: { + avg_width64_lsx(src, src_stride, dst, dst_stride, h); + break; + } + default: { + int32_t lp, cnt; + for (cnt = h; cnt--;) { + for (lp = 0; lp < w; ++lp) { + dst[lp] = (((dst[lp] + src[lp]) + 1) >> 1); + } + src += src_stride; + dst += dst_stride; + } + break; + } + } +} diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index 91a983d097..a880e1d285 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -167,6 +167,8 @@ DSP_SRCS-$(HAVE_VSX) += ppc/vpx_convolve_vsx.c DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve8_horiz_lsx.c DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve8_vert_lsx.c DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve8_lsx.c +DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve8_avg_lsx.c +DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve_avg_lsx.c DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve_lsx.h # loop filters diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 49eea44389..8b66722481 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -371,7 +371,7 @@ () specialize qw/vpx_convolve_copy neon dspr2 msa sse2 vsx/; add_proto qw/void vpx_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; -specialize qw/vpx_convolve_avg neon dspr2 msa sse2 vsx mmi/; +specialize qw/vpx_convolve_avg neon dspr2 msa sse2 vsx mmi lsx/; add_proto qw/void vpx_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; specialize qw/vpx_convolve8 sse2 ssse3 avx2 neon dspr2 msa vsx mmi lsx/; @@ -383,7 +383,7 @@ () specialize qw/vpx_convolve8_vert sse2 ssse3 avx2 neon dspr2 msa vsx mmi lsx/; add_proto qw/void vpx_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; -specialize qw/vpx_convolve8_avg sse2 ssse3 avx2 neon dspr2 msa vsx mmi/; +specialize qw/vpx_convolve8_avg sse2 ssse3 avx2 neon dspr2 msa vsx mmi lsx/; add_proto qw/void vpx_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; specialize qw/vpx_convolve8_avg_horiz sse2 ssse3 avx2 neon dspr2 msa vsx mmi/; From bf672f23a5336cb54dbcb2e4417142139f44cc3e Mon Sep 17 00:00:00 2001 From: yuanhecai Date: Fri, 11 Mar 2022 10:56:07 +0800 Subject: [PATCH 0295/1000] vp8[loongarch]: Optimize idct_add, filter_bv/bh 1. vp8_dc_only_idct_add_lsx 2. vp8_loop_filter_bh_lsx 3. vp8_loop_filter_bv_lsx Bug: webm:1755 Change-Id: I9b629767e2a4e9db8cbb3ee2369186502dc6eb00 --- vp8/common/loongarch/idct_lsx.c | 54 +++ vp8/common/loongarch/loopfilter_filters_lsx.c | 352 ++++++++++++++++++ vp8/common/rtcd_defs.pl | 6 +- vp8/vp8_common.mk | 1 + 4 files changed, 410 insertions(+), 3 deletions(-) create mode 100644 vp8/common/loongarch/idct_lsx.c diff --git a/vp8/common/loongarch/idct_lsx.c b/vp8/common/loongarch/idct_lsx.c new file mode 100644 index 0000000000..fb0b0384c4 --- /dev/null +++ b/vp8/common/loongarch/idct_lsx.c @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp8_rtcd.h" +#include "vp8/common/blockd.h" +#include "vpx_util/loongson_intrinsics.h" + +static void idct4x4_addconst_lsx(int16_t in_dc, uint8_t *pred, + int32_t pred_stride, uint8_t *dest, + int32_t dest_stride) { + __m128i vec, res0, res1, res2, res3, dst0, dst1; + __m128i pred0, pred1, pred2, pred3; + __m128i zero = __lsx_vldi(0); + + int32_t pred_stride2 = pred_stride << 1; + int32_t pred_stride3 = pred_stride2 + pred_stride; + + vec = __lsx_vreplgr2vr_h(in_dc); + vec = __lsx_vsrari_h(vec, 3); + pred0 = __lsx_vld(pred, 0); + DUP2_ARG2(__lsx_vldx, pred, pred_stride, pred, pred_stride2, pred1, pred2); + pred3 = __lsx_vldx(pred, pred_stride3); + DUP4_ARG2(__lsx_vilvl_b, zero, pred0, zero, pred1, zero, pred2, zero, pred3, + res0, res1, res2, res3); + DUP4_ARG2(__lsx_vadd_h, res0, vec, res1, vec, res2, vec, res3, vec, res0, + res1, res2, res3); + res0 = __lsx_vclip255_h(res0); + res1 = __lsx_vclip255_h(res1); + res2 = __lsx_vclip255_h(res2); + res3 = __lsx_vclip255_h(res3); + + DUP2_ARG2(__lsx_vpickev_b, res1, res0, res3, res2, dst0, dst1); + dst0 = __lsx_vpickev_w(dst1, dst0); + __lsx_vstelm_w(dst0, dest, 0, 0); + dest += dest_stride; + __lsx_vstelm_w(dst0, dest, 0, 1); + dest += dest_stride; + __lsx_vstelm_w(dst0, dest, 0, 2); + dest += dest_stride; + __lsx_vstelm_w(dst0, dest, 0, 3); +} + +void vp8_dc_only_idct_add_lsx(int16_t input_dc, uint8_t *pred_ptr, + int32_t pred_stride, uint8_t *dst_ptr, + int32_t dst_stride) { + idct4x4_addconst_lsx(input_dc, pred_ptr, pred_stride, dst_ptr, dst_stride); +} diff --git a/vp8/common/loongarch/loopfilter_filters_lsx.c b/vp8/common/loongarch/loopfilter_filters_lsx.c index 484b3d6ad0..c48f794840 100644 --- a/vp8/common/loongarch/loopfilter_filters_lsx.c +++ b/vp8/common/loongarch/loopfilter_filters_lsx.c @@ -13,6 +13,41 @@ #include "vp8/common/loopfilter.h" #include "vpx_util/loongson_intrinsics.h" +#define VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev) \ + { \ + __m128i p1_m, p0_m, q0_m, q1_m, filt, q0_sub_p0, t1, t2; \ + const __m128i cnst4b = __lsx_vldi(4); \ + const __m128i cnst3b = __lsx_vldi(3); \ + \ + p1_m = __lsx_vxori_b(p1, 0x80); \ + p0_m = __lsx_vxori_b(p0, 0x80); \ + q0_m = __lsx_vxori_b(q0, 0x80); \ + q1_m = __lsx_vxori_b(q1, 0x80); \ + \ + filt = __lsx_vssub_b(p1_m, q1_m); \ + filt = __lsx_vand_v(filt, hev); \ + q0_sub_p0 = __lsx_vssub_b(q0_m, p0_m); \ + filt = __lsx_vsadd_b(filt, q0_sub_p0); \ + filt = __lsx_vsadd_b(filt, q0_sub_p0); \ + filt = __lsx_vsadd_b(filt, q0_sub_p0); \ + filt = __lsx_vand_v(filt, mask); \ + t1 = __lsx_vsadd_b(filt, cnst4b); \ + t1 = __lsx_vsra_b(filt, cnst3b); \ + t2 = __lsx_vsadd_b(filt, cnst3b); \ + t2 = __lsx_vsra_b(filt, cnst3b); \ + q0_m = __lsx_vssub_b(q0_m, t1); \ + q0 = __lsx_vxori_b(q0_m, 0x80); \ + p0_m = __lsx_vsadd_b(p0_m, t2); \ + p0 = __lsx_vxori_b(p0_m, 0x80); \ + filt = __lsx_vsrari_b(t1, 1); \ + hev = __lsx_vxori_b(hev, 0xff); \ + filt = __lsx_vand_v(filt, hev); \ + q1_m = __lsx_vssub_b(q1_m, filt); \ + q1 = __lsx_vxori_b(q1_m, 0x80); \ + p1_m = __lsx_vsadd_b(p1_m, filt); \ + p1 = __lsx_vxori_b(p1_m, 0x80); \ + } + #define VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev) \ { \ __m128i p2_m, p1_m, p0_m, q2_m, q1_m, q0_m; \ @@ -116,6 +151,279 @@ __lsx_vstelm_h(in1, pdst + stride, 0, in1_idx); \ } +static void loop_filter_horizontal_4_dual_lsx(uint8_t *src, int32_t pitch, + const uint8_t *b_limit0_ptr, + const uint8_t *limit0_ptr, + const uint8_t *thresh0_ptr, + const uint8_t *b_limit1_ptr, + const uint8_t *limit1_ptr, + const uint8_t *thresh1_ptr) { + uint8_t *temp_src; + int32_t pitch_x2 = pitch << 1; + int32_t pitch_x3 = pitch_x2 + pitch; + int32_t pitch_x4 = pitch << 2; + + __m128i mask, hev, flat; + __m128i thresh0, b_limit0, limit0, thresh1, b_limit1, limit1; + __m128i p3, p2, p1, p0, q3, q2, q1, q0; + + temp_src = src - pitch_x4; + DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2, + temp_src, pitch_x3, p3, p2, p1, p0); + temp_src += pitch_x4; + DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2, + temp_src, pitch_x3, q0, q1, q2, q3); + + thresh0 = __lsx_vreplgr2vr_b(*thresh0_ptr); + thresh1 = __lsx_vreplgr2vr_b(*thresh1_ptr); + thresh0 = __lsx_vilvl_d(thresh1, thresh0); + + b_limit0 = __lsx_vreplgr2vr_b(*b_limit0_ptr); + b_limit1 = __lsx_vreplgr2vr_b(*b_limit1_ptr); + b_limit0 = __lsx_vilvl_d(b_limit1, b_limit0); + + limit0 = __lsx_vreplgr2vr_b(*limit0_ptr); + limit1 = __lsx_vreplgr2vr_b(*limit1_ptr); + limit0 = __lsx_vilvl_d(limit1, limit0); + + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev, + mask, flat); + VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev); + + __lsx_vstx(p1, src, -pitch_x2); + __lsx_vstx(p0, src, -pitch); + __lsx_vst(q0, src, 0); + __lsx_vstx(q1, src, pitch); +} + +static void loop_filter_vertical_4_dual_lsx(uint8_t *src, int32_t pitch, + const uint8_t *b_limit0_ptr, + const uint8_t *limit0_ptr, + const uint8_t *thresh0_ptr, + const uint8_t *b_limit1_ptr, + const uint8_t *limit1_ptr, + const uint8_t *thresh1_ptr) { + uint8_t *src_tmp0 = src - 4; + int32_t pitch_x2 = pitch << 1; + int32_t pitch_x3 = pitch_x2 + pitch; + int32_t pitch_x4 = pitch << 2; + __m128i mask, hev, flat; + __m128i thresh0, b_limit0, limit0, thresh1, b_limit1, limit1; + __m128i p3, p2, p1, p0, q3, q2, q1, q0; + __m128i row0, row1, row2, row3, row4, row5, row6, row7; + __m128i row8, row9, row10, row11, row12, row13, row14, row15; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; + + row0 = __lsx_vld(src_tmp0, 0); + DUP2_ARG2(__lsx_vldx, src_tmp0, pitch, src_tmp0, pitch_x2, row1, row2); + row3 = __lsx_vldx(src_tmp0, pitch_x3); + src_tmp0 += pitch_x4; + row4 = __lsx_vld(src_tmp0, 0); + DUP2_ARG2(__lsx_vldx, src_tmp0, pitch, src_tmp0, pitch_x2, row5, row6); + row7 = __lsx_vldx(src_tmp0, pitch_x3); + src_tmp0 += pitch_x4; + + row8 = __lsx_vld(src_tmp0, 0); + DUP2_ARG2(__lsx_vldx, src_tmp0, pitch, src_tmp0, pitch_x2, row9, row10); + row11 = __lsx_vldx(src_tmp0, pitch_x3); + src_tmp0 += pitch_x4; + row12 = __lsx_vld(src_tmp0, 0); + DUP2_ARG2(__lsx_vldx, src_tmp0, pitch, src_tmp0, pitch_x2, row13, row14); + row15 = __lsx_vldx(src_tmp0, pitch_x3); + + LSX_TRANSPOSE16x8_B(row0, row1, row2, row3, row4, row5, row6, row7, row8, + row9, row10, row11, row12, row13, row14, row15, p3, p2, + p1, p0, q0, q1, q2, q3); + + thresh0 = __lsx_vldrepl_b(thresh0_ptr, 0); + thresh1 = __lsx_vldrepl_b(thresh1_ptr, 0); + thresh0 = __lsx_vilvl_d(thresh1, thresh0); + + b_limit0 = __lsx_vldrepl_b(b_limit0_ptr, 0); + b_limit1 = __lsx_vldrepl_b(b_limit1_ptr, 0); + b_limit0 = __lsx_vilvl_d(b_limit1, b_limit0); + + limit0 = __lsx_vldrepl_b(limit0_ptr, 0); + limit1 = __lsx_vldrepl_b(limit1_ptr, 0); + limit0 = __lsx_vilvl_d(limit1, limit0); + + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev, + mask, flat); + VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev); + + DUP2_ARG2(__lsx_vilvl_b, p0, p1, q1, q0, tmp0, tmp1); + tmp2 = __lsx_vilvl_h(tmp1, tmp0); + tmp3 = __lsx_vilvh_h(tmp1, tmp0); + DUP2_ARG2(__lsx_vilvh_b, p0, p1, q1, q0, tmp0, tmp1); + tmp4 = __lsx_vilvl_h(tmp1, tmp0); + tmp5 = __lsx_vilvh_h(tmp1, tmp0); + + src -= 2; + __lsx_vstelm_w(tmp2, src, 0, 0); + src += pitch; + __lsx_vstelm_w(tmp2, src, 0, 1); + src += pitch; + __lsx_vstelm_w(tmp2, src, 0, 2); + src += pitch; + __lsx_vstelm_w(tmp2, src, 0, 3); + src += pitch; + + __lsx_vstelm_w(tmp3, src, 0, 0); + src += pitch; + __lsx_vstelm_w(tmp3, src, 0, 1); + src += pitch; + __lsx_vstelm_w(tmp3, src, 0, 2); + src += pitch; + __lsx_vstelm_w(tmp3, src, 0, 3); + src += pitch; + + __lsx_vstelm_w(tmp4, src, 0, 0); + src += pitch; + __lsx_vstelm_w(tmp4, src, 0, 1); + src += pitch; + __lsx_vstelm_w(tmp4, src, 0, 2); + src += pitch; + __lsx_vstelm_w(tmp4, src, 0, 3); + src += pitch; + + __lsx_vstelm_w(tmp5, src, 0, 0); + src += pitch; + __lsx_vstelm_w(tmp5, src, 0, 1); + src += pitch; + __lsx_vstelm_w(tmp5, src, 0, 2); + src += pitch; + __lsx_vstelm_w(tmp5, src, 0, 3); +} + +static void loop_filter_horizontal_edge_uv_lsx(uint8_t *src_u, uint8_t *src_v, + int32_t pitch, + const uint8_t b_limit_in, + const uint8_t limit_in, + const uint8_t thresh_in) { + int32_t pitch_x2 = pitch << 1; + int32_t pitch_x3 = pitch_x2 + pitch; + int32_t pitch_x4 = pitch << 2; + + __m128i p3, p2, p1, p0, q3, q2, q1, q0; + __m128i mask, hev, flat, thresh, limit, b_limit; + __m128i p3_u, p2_u, p1_u, p0_u, q3_u, q2_u, q1_u, q0_u; + __m128i p3_v, p2_v, p1_v, p0_v, q3_v, q2_v, q1_v, q0_v; + + thresh = __lsx_vreplgr2vr_b(thresh_in); + limit = __lsx_vreplgr2vr_b(limit_in); + b_limit = __lsx_vreplgr2vr_b(b_limit_in); + + DUP4_ARG2(__lsx_vldx, src_u, -pitch_x4, src_u, -pitch_x3, src_u, -pitch_x2, + src_u, -pitch, p3_u, p2_u, p1_u, p0_u); + q0_u = __lsx_vld(src_u, 0); + DUP2_ARG2(__lsx_vldx, src_u, pitch, src_u, pitch_x2, q1_u, q2_u); + q3_u = __lsx_vldx(src_u, pitch_x3); + + DUP4_ARG2(__lsx_vldx, src_v, -pitch_x4, src_v, -pitch_x3, src_v, -pitch_x2, + src_v, -pitch, p3_v, p2_v, p1_v, p0_v); + q0_v = __lsx_vld(src_v, 0); + DUP2_ARG2(__lsx_vldx, src_v, pitch, src_v, pitch_x2, q1_v, q2_v); + q3_v = __lsx_vldx(src_v, pitch_x3); + + /* right 8 element of p3 are u pixel and + left 8 element of p3 are v pixel */ + DUP4_ARG2(__lsx_vilvl_d, p3_v, p3_u, p2_v, p2_u, p1_v, p1_u, p0_v, p0_u, p3, + p2, p1, p0); + DUP4_ARG2(__lsx_vilvl_d, q0_v, q0_u, q1_v, q1_u, q2_v, q2_u, q3_v, q3_u, q0, + q1, q2, q3); + + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev); + + __lsx_vstelm_d(p1, src_u, 0, 0); + __lsx_vstelm_d(p0, src_u + pitch, 0, 0); + __lsx_vstelm_d(q0, src_u + pitch_x2, 0, 0); + __lsx_vstelm_d(q1, src_u + pitch_x3, 0, 0); + + __lsx_vstelm_d(p1, src_v, 0, 1); + __lsx_vstelm_d(p0, src_v + pitch, 0, 1); + __lsx_vstelm_d(q0, src_v + pitch_x2, 0, 1); + __lsx_vstelm_d(q1, src_v + pitch_x3, 0, 1); +} + +static void loop_filter_vertical_edge_uv_lsx(uint8_t *src_u, uint8_t *src_v, + int32_t pitch, + const uint8_t b_limit_in, + const uint8_t limit_in, + const uint8_t thresh_in) { + uint8_t *src_u_tmp, *src_v_tmp; + int32_t pitch_x2 = pitch << 1; + int32_t pitch_x3 = pitch_x2 + pitch; + int32_t pitch_x4 = pitch << 2; + + __m128i p3, p2, p1, p0, q3, q2, q1, q0; + __m128i mask, hev, flat, thresh, limit, b_limit; + __m128i row0, row1, row2, row3, row4, row5, row6, row7, row8; + __m128i row9, row10, row11, row12, row13, row14, row15; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; + + thresh = __lsx_vreplgr2vr_b(thresh_in); + limit = __lsx_vreplgr2vr_b(limit_in); + b_limit = __lsx_vreplgr2vr_b(b_limit_in); + + src_u_tmp = src_u - 4; + row0 = __lsx_vld(src_u_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_u_tmp, pitch, src_u_tmp, pitch_x2, row1, row2); + row3 = __lsx_vldx(src_u_tmp, pitch_x3); + src_u_tmp += pitch_x4; + row4 = __lsx_vld(src_u_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_u_tmp, pitch, src_u_tmp, pitch_x2, row5, row6); + row7 = __lsx_vldx(src_u_tmp, pitch_x3); + + src_v_tmp = src_v - 4; + row8 = __lsx_vld(src_v_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_v_tmp, pitch, src_v_tmp, pitch_x2, row9, row10); + row11 = __lsx_vldx(src_v_tmp, pitch_x3); + src_v_tmp += pitch_x4; + row12 = __lsx_vld(src_v_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_v_tmp, pitch, src_v_tmp, pitch_x2, row13, row14); + row15 = __lsx_vldx(src_v_tmp, pitch_x3); + + LSX_TRANSPOSE16x8_B(row0, row1, row2, row3, row4, row5, row6, row7, row8, + row9, row10, row11, row12, row13, row14, row15, p3, p2, + p1, p0, q0, q1, q2, q3); + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev); + + DUP2_ARG2(__lsx_vilvl_b, p0, p1, q1, q0, tmp0, tmp1); + tmp2 = __lsx_vilvl_h(tmp1, tmp0); + tmp3 = __lsx_vilvh_h(tmp1, tmp0); + + tmp0 = __lsx_vilvl_b(p0, q1); + tmp1 = __lsx_vilvl_b(q1, q0); + tmp4 = __lsx_vilvl_h(tmp1, tmp0); + tmp5 = __lsx_vilvh_h(tmp1, tmp0); + + src_u_tmp += 2; + __lsx_vstelm_w(tmp2, src_u_tmp - pitch_x4, 0, 0); + __lsx_vstelm_w(tmp2, src_u_tmp - pitch_x3, 0, 1); + __lsx_vstelm_w(tmp2, src_u_tmp - pitch_x2, 0, 2); + __lsx_vstelm_w(tmp2, src_u_tmp - pitch, 0, 3); + + __lsx_vstelm_w(tmp3, src_u_tmp, 0, 0); + __lsx_vstelm_w(tmp3, src_u_tmp + pitch, 0, 1); + __lsx_vstelm_w(tmp3, src_u_tmp + pitch_x2, 0, 2); + __lsx_vstelm_w(tmp3, src_u_tmp + pitch_x3, 0, 3); + + src_v_tmp += 2; + __lsx_vstelm_w(tmp4, src_v_tmp - pitch_x4, 0, 0); + __lsx_vstelm_w(tmp4, src_v_tmp - pitch_x3, 0, 1); + __lsx_vstelm_w(tmp4, src_v_tmp - pitch_x2, 0, 2); + __lsx_vstelm_w(tmp4, src_v_tmp - pitch, 0, 3); + + __lsx_vstelm_w(tmp5, src_v_tmp, 0, 0); + __lsx_vstelm_w(tmp5, src_v_tmp + pitch, 0, 1); + __lsx_vstelm_w(tmp5, src_v_tmp + pitch_x2, 0, 2); + __lsx_vstelm_w(tmp5, src_v_tmp + pitch_x3, 0, 3); +} + static inline void mbloop_filter_horizontal_edge_y_lsx( uint8_t *src, int32_t pitch, const uint8_t b_limit_in, const uint8_t limit_in, const uint8_t thresh_in) { @@ -391,3 +699,47 @@ void vp8_loop_filter_mbv_lsx(uint8_t *src_y, uint8_t *src_u, uint8_t *src_v, *lpf_info_ptr->hev_thr); } } + +void vp8_loop_filter_bh_lsx(uint8_t *src_y, uint8_t *src_u, uint8_t *src_v, + int32_t pitch_y, int32_t pitch_u_v, + loop_filter_info *lpf_info_ptr) { + loop_filter_horizontal_4_dual_lsx(src_y + 4 * pitch_y, pitch_y, + lpf_info_ptr->blim, lpf_info_ptr->lim, + lpf_info_ptr->hev_thr, lpf_info_ptr->blim, + lpf_info_ptr->lim, lpf_info_ptr->hev_thr); + loop_filter_horizontal_4_dual_lsx(src_y + 8 * pitch_y, pitch_y, + lpf_info_ptr->blim, lpf_info_ptr->lim, + lpf_info_ptr->hev_thr, lpf_info_ptr->blim, + lpf_info_ptr->lim, lpf_info_ptr->hev_thr); + loop_filter_horizontal_4_dual_lsx(src_y + 12 * pitch_y, pitch_y, + lpf_info_ptr->blim, lpf_info_ptr->lim, + lpf_info_ptr->hev_thr, lpf_info_ptr->blim, + lpf_info_ptr->lim, lpf_info_ptr->hev_thr); + if (src_u) { + loop_filter_horizontal_edge_uv_lsx( + src_u + (4 * pitch_u_v), src_v + (4 * pitch_u_v), pitch_u_v, + *lpf_info_ptr->blim, *lpf_info_ptr->lim, *lpf_info_ptr->hev_thr); + } +} + +void vp8_loop_filter_bv_lsx(uint8_t *src_y, uint8_t *src_u, uint8_t *src_v, + int32_t pitch_y, int32_t pitch_u_v, + loop_filter_info *lpf_info_ptr) { + loop_filter_vertical_4_dual_lsx(src_y + 4, pitch_y, lpf_info_ptr->blim, + lpf_info_ptr->lim, lpf_info_ptr->hev_thr, + lpf_info_ptr->blim, lpf_info_ptr->lim, + lpf_info_ptr->hev_thr); + loop_filter_vertical_4_dual_lsx(src_y + 8, pitch_y, lpf_info_ptr->blim, + lpf_info_ptr->lim, lpf_info_ptr->hev_thr, + lpf_info_ptr->blim, lpf_info_ptr->lim, + lpf_info_ptr->hev_thr); + loop_filter_vertical_4_dual_lsx(src_y + 12, pitch_y, lpf_info_ptr->blim, + lpf_info_ptr->lim, lpf_info_ptr->hev_thr, + lpf_info_ptr->blim, lpf_info_ptr->lim, + lpf_info_ptr->hev_thr); + if (src_u) { + loop_filter_vertical_edge_uv_lsx(src_u + 4, src_v + 4, pitch_u_v, + *lpf_info_ptr->blim, *lpf_info_ptr->lim, + *lpf_info_ptr->hev_thr); + } +} diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl index 40117e3677..32601b4eba 100644 --- a/vp8/common/rtcd_defs.pl +++ b/vp8/common/rtcd_defs.pl @@ -50,13 +50,13 @@ () specialize qw/vp8_loop_filter_mbv sse2 neon dspr2 msa mmi lsx/; add_proto qw/void vp8_loop_filter_bv/, "unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi"; -specialize qw/vp8_loop_filter_bv sse2 neon dspr2 msa mmi/; +specialize qw/vp8_loop_filter_bv sse2 neon dspr2 msa mmi lsx/; add_proto qw/void vp8_loop_filter_mbh/, "unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi"; specialize qw/vp8_loop_filter_mbh sse2 neon dspr2 msa mmi lsx/; add_proto qw/void vp8_loop_filter_bh/, "unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi"; -specialize qw/vp8_loop_filter_bh sse2 neon dspr2 msa mmi/; +specialize qw/vp8_loop_filter_bh sse2 neon dspr2 msa mmi lsx/; add_proto qw/void vp8_loop_filter_simple_mbv/, "unsigned char *y_ptr, int y_stride, const unsigned char *blimit"; @@ -108,7 +108,7 @@ () #idct1_scalar_add add_proto qw/void vp8_dc_only_idct_add/, "short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride"; -specialize qw/vp8_dc_only_idct_add mmx neon dspr2 msa mmi/; +specialize qw/vp8_dc_only_idct_add mmx neon dspr2 msa mmi lsx/; # # RECON diff --git a/vp8/vp8_common.mk b/vp8/vp8_common.mk index 909924ce8d..d485965d3d 100644 --- a/vp8/vp8_common.mk +++ b/vp8/vp8_common.mk @@ -127,6 +127,7 @@ endif # common (loongarch LSX intrinsics) VP8_COMMON_SRCS-$(HAVE_LSX) += common/loongarch/loopfilter_filters_lsx.c VP8_COMMON_SRCS-$(HAVE_LSX) += common/loongarch/sixtap_filter_lsx.c +VP8_COMMON_SRCS-$(HAVE_LSX) += common/loongarch/idct_lsx.c # common (neon intrinsics) VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/loopfilter_arm.c From f79d256cb28a4228df66a7a6d1cebbd9071e0639 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Fri, 11 Mar 2022 20:19:25 +0200 Subject: [PATCH 0296/1000] Make sure only NEON FDCT functions are called. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [NEON] Added vpx_fdct4x4_pass1_neon(), Added vpx_fdct8x8_pass1_notranspose_neon(), Added vpx_fdct8x8_pass1_neon() to avoid code duplication Refactored vpx_fdct4x4_neon() and vpx_dct8x8_neon() to use the above Rename dct_body to vpx_fdct16x16_body to reuse later Add transpose_s16_16x16() I have run make test and all tests/configurations seem to pass. Profiled using this command on an Ampere Altra VM: sudo perf record -g ./vpxenc --codec=vp9 --height=1080 --width=1920 \ --fps=25/1 --limit=20 -o output.mkv \ ../original_videos_Sports_1080P_Sports_1080P-0063.mkv --debug –rt Before this optimization: 1.32% 1.32% vpxenc vpxenc [.] vpx_fdct4x4_neon 0.16% 0.16% vpxenc vpxenc [.] vpx_fdct4x4_c 0.79% 0.79% vpxenc vpxenc [.] vpx_fdct8x8_c 0.52% 0.52% vpxenc vpxenc [.] vpx_fdct8x8_neon 1.23% 1.23% vpxenc vpxenc [.] vpx_fdct16x16_c 0.54% 0.54% vpxenc vpxenc [.] vpx_fdct16x16_neon So, even though a _neon() version exists, the C version was called \ as well. After this patch: 1.42% 1.36% vpxenc vpxenc [.] vpx_fdct4x4_neon 0.87% 0.82% vpxenc vpxenc [.] vpx_fdct8x8_neon 0.74% 0.74% vpxenc vpxenc [.] vpx_fdct16x16_neon Change-Id: Id4e1dd315c67b4355fe4e5a1b59e181a349f16d0 --- vpx_dsp/arm/fdct16x16_neon.c | 319 +--------------------------------- vpx_dsp/arm/fdct16x16_neon.h | 327 +++++++++++++++++++++++++++++++++++ vpx_dsp/arm/fdct_neon.c | 61 ++----- vpx_dsp/arm/fdct_neon.h | 213 +++++++++++++++++++++++ vpx_dsp/arm/fwd_txfm_neon.c | 212 ++++------------------- vpx_dsp/arm/transpose_neon.h | 39 +++++ 6 files changed, 629 insertions(+), 542 deletions(-) create mode 100644 vpx_dsp/arm/fdct16x16_neon.h create mode 100644 vpx_dsp/arm/fdct_neon.h diff --git a/vpx_dsp/arm/fdct16x16_neon.c b/vpx_dsp/arm/fdct16x16_neon.c index 6b2bebd097..67f43246aa 100644 --- a/vpx_dsp/arm/fdct16x16_neon.c +++ b/vpx_dsp/arm/fdct16x16_neon.c @@ -15,6 +15,7 @@ #include "vpx_dsp/txfm_common.h" #include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/arm/fdct16x16_neon.h" // Some builds of gcc 4.9.2 and .3 have trouble with some of the inline // functions. @@ -27,316 +28,6 @@ void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) { #else -static INLINE void load(const int16_t *a, int stride, int16x8_t *b /*[16]*/) { - b[0] = vld1q_s16(a); - a += stride; - b[1] = vld1q_s16(a); - a += stride; - b[2] = vld1q_s16(a); - a += stride; - b[3] = vld1q_s16(a); - a += stride; - b[4] = vld1q_s16(a); - a += stride; - b[5] = vld1q_s16(a); - a += stride; - b[6] = vld1q_s16(a); - a += stride; - b[7] = vld1q_s16(a); - a += stride; - b[8] = vld1q_s16(a); - a += stride; - b[9] = vld1q_s16(a); - a += stride; - b[10] = vld1q_s16(a); - a += stride; - b[11] = vld1q_s16(a); - a += stride; - b[12] = vld1q_s16(a); - a += stride; - b[13] = vld1q_s16(a); - a += stride; - b[14] = vld1q_s16(a); - a += stride; - b[15] = vld1q_s16(a); -} - -// Store 8 16x8 values, assuming stride == 16. -static INLINE void store(tran_low_t *a, const int16x8_t *b /*[8]*/) { - store_s16q_to_tran_low(a, b[0]); - a += 16; - store_s16q_to_tran_low(a, b[1]); - a += 16; - store_s16q_to_tran_low(a, b[2]); - a += 16; - store_s16q_to_tran_low(a, b[3]); - a += 16; - store_s16q_to_tran_low(a, b[4]); - a += 16; - store_s16q_to_tran_low(a, b[5]); - a += 16; - store_s16q_to_tran_low(a, b[6]); - a += 16; - store_s16q_to_tran_low(a, b[7]); -} - -// Load step of each pass. Add and subtract clear across the input, requiring -// all 16 values to be loaded. For the first pass it also multiplies by 4. - -// To maybe reduce register usage this could be combined with the load() step to -// get the first 4 and last 4 values, cross those, then load the middle 8 values -// and cross them. -static INLINE void cross_input(const int16x8_t *a /*[16]*/, - int16x8_t *b /*[16]*/, const int pass) { - if (pass == 0) { - b[0] = vshlq_n_s16(vaddq_s16(a[0], a[15]), 2); - b[1] = vshlq_n_s16(vaddq_s16(a[1], a[14]), 2); - b[2] = vshlq_n_s16(vaddq_s16(a[2], a[13]), 2); - b[3] = vshlq_n_s16(vaddq_s16(a[3], a[12]), 2); - b[4] = vshlq_n_s16(vaddq_s16(a[4], a[11]), 2); - b[5] = vshlq_n_s16(vaddq_s16(a[5], a[10]), 2); - b[6] = vshlq_n_s16(vaddq_s16(a[6], a[9]), 2); - b[7] = vshlq_n_s16(vaddq_s16(a[7], a[8]), 2); - - b[8] = vshlq_n_s16(vsubq_s16(a[7], a[8]), 2); - b[9] = vshlq_n_s16(vsubq_s16(a[6], a[9]), 2); - b[10] = vshlq_n_s16(vsubq_s16(a[5], a[10]), 2); - b[11] = vshlq_n_s16(vsubq_s16(a[4], a[11]), 2); - b[12] = vshlq_n_s16(vsubq_s16(a[3], a[12]), 2); - b[13] = vshlq_n_s16(vsubq_s16(a[2], a[13]), 2); - b[14] = vshlq_n_s16(vsubq_s16(a[1], a[14]), 2); - b[15] = vshlq_n_s16(vsubq_s16(a[0], a[15]), 2); - } else { - b[0] = vaddq_s16(a[0], a[15]); - b[1] = vaddq_s16(a[1], a[14]); - b[2] = vaddq_s16(a[2], a[13]); - b[3] = vaddq_s16(a[3], a[12]); - b[4] = vaddq_s16(a[4], a[11]); - b[5] = vaddq_s16(a[5], a[10]); - b[6] = vaddq_s16(a[6], a[9]); - b[7] = vaddq_s16(a[7], a[8]); - - b[8] = vsubq_s16(a[7], a[8]); - b[9] = vsubq_s16(a[6], a[9]); - b[10] = vsubq_s16(a[5], a[10]); - b[11] = vsubq_s16(a[4], a[11]); - b[12] = vsubq_s16(a[3], a[12]); - b[13] = vsubq_s16(a[2], a[13]); - b[14] = vsubq_s16(a[1], a[14]); - b[15] = vsubq_s16(a[0], a[15]); - } -} - -// Quarter round at the beginning of the second pass. Can't use vrshr (rounding) -// because this only adds 1, not 1 << 2. -static INLINE void partial_round_shift(int16x8_t *a /*[16]*/) { - const int16x8_t one = vdupq_n_s16(1); - a[0] = vshrq_n_s16(vaddq_s16(a[0], one), 2); - a[1] = vshrq_n_s16(vaddq_s16(a[1], one), 2); - a[2] = vshrq_n_s16(vaddq_s16(a[2], one), 2); - a[3] = vshrq_n_s16(vaddq_s16(a[3], one), 2); - a[4] = vshrq_n_s16(vaddq_s16(a[4], one), 2); - a[5] = vshrq_n_s16(vaddq_s16(a[5], one), 2); - a[6] = vshrq_n_s16(vaddq_s16(a[6], one), 2); - a[7] = vshrq_n_s16(vaddq_s16(a[7], one), 2); - a[8] = vshrq_n_s16(vaddq_s16(a[8], one), 2); - a[9] = vshrq_n_s16(vaddq_s16(a[9], one), 2); - a[10] = vshrq_n_s16(vaddq_s16(a[10], one), 2); - a[11] = vshrq_n_s16(vaddq_s16(a[11], one), 2); - a[12] = vshrq_n_s16(vaddq_s16(a[12], one), 2); - a[13] = vshrq_n_s16(vaddq_s16(a[13], one), 2); - a[14] = vshrq_n_s16(vaddq_s16(a[14], one), 2); - a[15] = vshrq_n_s16(vaddq_s16(a[15], one), 2); -} - -// fdct_round_shift((a +/- b) * c) -static INLINE void butterfly_one_coeff(const int16x8_t a, const int16x8_t b, - const tran_high_t c, int16x8_t *add, - int16x8_t *sub) { - const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), c); - const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), c); - const int32x4_t sum0 = vmlal_n_s16(a0, vget_low_s16(b), c); - const int32x4_t sum1 = vmlal_n_s16(a1, vget_high_s16(b), c); - const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), c); - const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), c); - const int16x4_t rounded0 = vqrshrn_n_s32(sum0, 14); - const int16x4_t rounded1 = vqrshrn_n_s32(sum1, 14); - const int16x4_t rounded2 = vqrshrn_n_s32(diff0, 14); - const int16x4_t rounded3 = vqrshrn_n_s32(diff1, 14); - *add = vcombine_s16(rounded0, rounded1); - *sub = vcombine_s16(rounded2, rounded3); -} - -// fdct_round_shift(a * c0 +/- b * c1) -static INLINE void butterfly_two_coeff(const int16x8_t a, const int16x8_t b, - const tran_coef_t c0, - const tran_coef_t c1, int16x8_t *add, - int16x8_t *sub) { - const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), c0); - const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), c0); - const int32x4_t a2 = vmull_n_s16(vget_low_s16(a), c1); - const int32x4_t a3 = vmull_n_s16(vget_high_s16(a), c1); - const int32x4_t sum0 = vmlal_n_s16(a2, vget_low_s16(b), c0); - const int32x4_t sum1 = vmlal_n_s16(a3, vget_high_s16(b), c0); - const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), c1); - const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), c1); - const int16x4_t rounded0 = vqrshrn_n_s32(sum0, 14); - const int16x4_t rounded1 = vqrshrn_n_s32(sum1, 14); - const int16x4_t rounded2 = vqrshrn_n_s32(diff0, 14); - const int16x4_t rounded3 = vqrshrn_n_s32(diff1, 14); - *add = vcombine_s16(rounded0, rounded1); - *sub = vcombine_s16(rounded2, rounded3); -} - -// Transpose 8x8 to a new location. Don't use transpose_neon.h because those -// are all in-place. -static INLINE void transpose_8x8(const int16x8_t *a /*[8]*/, - int16x8_t *b /*[8]*/) { - // Swap 16 bit elements. - const int16x8x2_t c0 = vtrnq_s16(a[0], a[1]); - const int16x8x2_t c1 = vtrnq_s16(a[2], a[3]); - const int16x8x2_t c2 = vtrnq_s16(a[4], a[5]); - const int16x8x2_t c3 = vtrnq_s16(a[6], a[7]); - - // Swap 32 bit elements. - const int32x4x2_t d0 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[0]), - vreinterpretq_s32_s16(c1.val[0])); - const int32x4x2_t d1 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[1]), - vreinterpretq_s32_s16(c1.val[1])); - const int32x4x2_t d2 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[0]), - vreinterpretq_s32_s16(c3.val[0])); - const int32x4x2_t d3 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[1]), - vreinterpretq_s32_s16(c3.val[1])); - - // Swap 64 bit elements - const int16x8x2_t e0 = vpx_vtrnq_s64_to_s16(d0.val[0], d2.val[0]); - const int16x8x2_t e1 = vpx_vtrnq_s64_to_s16(d1.val[0], d3.val[0]); - const int16x8x2_t e2 = vpx_vtrnq_s64_to_s16(d0.val[1], d2.val[1]); - const int16x8x2_t e3 = vpx_vtrnq_s64_to_s16(d1.val[1], d3.val[1]); - - b[0] = e0.val[0]; - b[1] = e1.val[0]; - b[2] = e2.val[0]; - b[3] = e3.val[0]; - b[4] = e0.val[1]; - b[5] = e1.val[1]; - b[6] = e2.val[1]; - b[7] = e3.val[1]; -} - -// Main body of fdct16x16. -static void dct_body(const int16x8_t *in /*[16]*/, int16x8_t *out /*[16]*/) { - int16x8_t s[8]; - int16x8_t x[4]; - int16x8_t step[8]; - - // stage 1 - // From fwd_txfm.c: Work on the first eight values; fdct8(input, - // even_results);" - s[0] = vaddq_s16(in[0], in[7]); - s[1] = vaddq_s16(in[1], in[6]); - s[2] = vaddq_s16(in[2], in[5]); - s[3] = vaddq_s16(in[3], in[4]); - s[4] = vsubq_s16(in[3], in[4]); - s[5] = vsubq_s16(in[2], in[5]); - s[6] = vsubq_s16(in[1], in[6]); - s[7] = vsubq_s16(in[0], in[7]); - - // fdct4(step, step); - x[0] = vaddq_s16(s[0], s[3]); - x[1] = vaddq_s16(s[1], s[2]); - x[2] = vsubq_s16(s[1], s[2]); - x[3] = vsubq_s16(s[0], s[3]); - - // out[0] = fdct_round_shift((x0 + x1) * cospi_16_64) - // out[8] = fdct_round_shift((x0 - x1) * cospi_16_64) - butterfly_one_coeff(x[0], x[1], cospi_16_64, &out[0], &out[8]); - // out[4] = fdct_round_shift(x3 * cospi_8_64 + x2 * cospi_24_64); - // out[12] = fdct_round_shift(x3 * cospi_24_64 - x2 * cospi_8_64); - butterfly_two_coeff(x[3], x[2], cospi_24_64, cospi_8_64, &out[4], &out[12]); - - // Stage 2 - // Re-using source s5/s6 - // s5 = fdct_round_shift((s6 - s5) * cospi_16_64) - // s6 = fdct_round_shift((s6 + s5) * cospi_16_64) - butterfly_one_coeff(s[6], s[5], cospi_16_64, &s[6], &s[5]); - - // Stage 3 - x[0] = vaddq_s16(s[4], s[5]); - x[1] = vsubq_s16(s[4], s[5]); - x[2] = vsubq_s16(s[7], s[6]); - x[3] = vaddq_s16(s[7], s[6]); - - // Stage 4 - // out[2] = fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64) - // out[14] = fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64) - butterfly_two_coeff(x[3], x[0], cospi_28_64, cospi_4_64, &out[2], &out[14]); - // out[6] = fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64) - // out[10] = fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64) - butterfly_two_coeff(x[2], x[1], cospi_12_64, cospi_20_64, &out[10], &out[6]); - - // step 2 - // From fwd_txfm.c: Work on the next eight values; step1 -> odd_results" - // That file distinguished between "in_high" and "step1" but the only - // difference is that "in_high" is the first 8 values and "step 1" is the - // second. Here, since they are all in one array, "step1" values are += 8. - - // step2[2] = fdct_round_shift((step1[5] - step1[2]) * cospi_16_64) - // step2[3] = fdct_round_shift((step1[4] - step1[3]) * cospi_16_64) - // step2[4] = fdct_round_shift((step1[4] + step1[3]) * cospi_16_64) - // step2[5] = fdct_round_shift((step1[5] + step1[2]) * cospi_16_64) - butterfly_one_coeff(in[13], in[10], cospi_16_64, &s[5], &s[2]); - butterfly_one_coeff(in[12], in[11], cospi_16_64, &s[4], &s[3]); - - // step 3 - s[0] = vaddq_s16(in[8], s[3]); - s[1] = vaddq_s16(in[9], s[2]); - x[0] = vsubq_s16(in[9], s[2]); - x[1] = vsubq_s16(in[8], s[3]); - x[2] = vsubq_s16(in[15], s[4]); - x[3] = vsubq_s16(in[14], s[5]); - s[6] = vaddq_s16(in[14], s[5]); - s[7] = vaddq_s16(in[15], s[4]); - - // step 4 - // step2[1] = fdct_round_shift(step3[1] *-cospi_8_64 + step3[6] * cospi_24_64) - // step2[6] = fdct_round_shift(step3[1] * cospi_24_64 + step3[6] * cospi_8_64) - butterfly_two_coeff(s[6], s[1], cospi_24_64, cospi_8_64, &s[6], &s[1]); - - // step2[2] = fdct_round_shift(step3[2] * cospi_24_64 + step3[5] * cospi_8_64) - // step2[5] = fdct_round_shift(step3[2] * cospi_8_64 - step3[5] * cospi_24_64) - butterfly_two_coeff(x[0], x[3], cospi_8_64, cospi_24_64, &s[2], &s[5]); - - // step 5 - step[0] = vaddq_s16(s[0], s[1]); - step[1] = vsubq_s16(s[0], s[1]); - step[2] = vaddq_s16(x[1], s[2]); - step[3] = vsubq_s16(x[1], s[2]); - step[4] = vsubq_s16(x[2], s[5]); - step[5] = vaddq_s16(x[2], s[5]); - step[6] = vsubq_s16(s[7], s[6]); - step[7] = vaddq_s16(s[7], s[6]); - - // step 6 - // out[1] = fdct_round_shift(step1[0] * cospi_30_64 + step1[7] * cospi_2_64) - // out[9] = fdct_round_shift(step1[1] * cospi_14_64 + step1[6] * cospi_18_64) - // out[5] = fdct_round_shift(step1[2] * cospi_22_64 + step1[5] * cospi_10_64) - // out[13] = fdct_round_shift(step1[3] * cospi_6_64 + step1[4] * cospi_26_64) - // out[3] = fdct_round_shift(step1[3] * -cospi_26_64 + step1[4] * cospi_6_64) - // out[11] = fdct_round_shift(step1[2] * -cospi_10_64 + step1[5] * - // cospi_22_64) - // out[7] = fdct_round_shift(step1[1] * -cospi_18_64 + step1[6] * cospi_14_64) - // out[15] = fdct_round_shift(step1[0] * -cospi_2_64 + step1[7] * cospi_30_64) - butterfly_two_coeff(step[6], step[1], cospi_14_64, cospi_18_64, &out[9], - &out[7]); - butterfly_two_coeff(step[7], step[0], cospi_30_64, cospi_2_64, &out[1], - &out[15]); - butterfly_two_coeff(step[4], step[3], cospi_6_64, cospi_26_64, &out[13], - &out[3]); - butterfly_two_coeff(step[5], step[2], cospi_22_64, cospi_10_64, &out[5], - &out[11]); -} - void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) { int16x8_t temp0[16]; int16x8_t temp1[16]; @@ -346,12 +37,12 @@ void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) { // Left half. load(input, stride, temp0); cross_input(temp0, temp1, 0); - dct_body(temp1, temp0); + vpx_fdct16x16_body(temp1, temp0); // Right half. load(input + 8, stride, temp1); cross_input(temp1, temp2, 0); - dct_body(temp2, temp1); + vpx_fdct16x16_body(temp2, temp1); // Transpose top left and top right quarters into one contiguous location to // process to the top half. @@ -359,7 +50,7 @@ void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) { transpose_8x8(&temp1[0], &temp2[8]); partial_round_shift(temp2); cross_input(temp2, temp3, 1); - dct_body(temp3, temp2); + vpx_fdct16x16_body(temp3, temp2); transpose_s16_8x8(&temp2[0], &temp2[1], &temp2[2], &temp2[3], &temp2[4], &temp2[5], &temp2[6], &temp2[7]); transpose_s16_8x8(&temp2[8], &temp2[9], &temp2[10], &temp2[11], &temp2[12], @@ -375,7 +66,7 @@ void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) { &temp1[13], &temp1[14], &temp1[15]); partial_round_shift(temp1); cross_input(temp1, temp0, 1); - dct_body(temp0, temp1); + vpx_fdct16x16_body(temp0, temp1); transpose_s16_8x8(&temp1[0], &temp1[1], &temp1[2], &temp1[3], &temp1[4], &temp1[5], &temp1[6], &temp1[7]); transpose_s16_8x8(&temp1[8], &temp1[9], &temp1[10], &temp1[11], &temp1[12], diff --git a/vpx_dsp/arm/fdct16x16_neon.h b/vpx_dsp/arm/fdct16x16_neon.h new file mode 100644 index 0000000000..8391238991 --- /dev/null +++ b/vpx_dsp/arm/fdct16x16_neon.h @@ -0,0 +1,327 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_ARM_FDCT16x16_NEON_H_ +#define VPX_VPX_DSP_ARM_FDCT16x16_NEON_H_ + +#include + +static INLINE void load(const int16_t *a, int stride, int16x8_t *b /*[16]*/) { + b[0] = vld1q_s16(a); + a += stride; + b[1] = vld1q_s16(a); + a += stride; + b[2] = vld1q_s16(a); + a += stride; + b[3] = vld1q_s16(a); + a += stride; + b[4] = vld1q_s16(a); + a += stride; + b[5] = vld1q_s16(a); + a += stride; + b[6] = vld1q_s16(a); + a += stride; + b[7] = vld1q_s16(a); + a += stride; + b[8] = vld1q_s16(a); + a += stride; + b[9] = vld1q_s16(a); + a += stride; + b[10] = vld1q_s16(a); + a += stride; + b[11] = vld1q_s16(a); + a += stride; + b[12] = vld1q_s16(a); + a += stride; + b[13] = vld1q_s16(a); + a += stride; + b[14] = vld1q_s16(a); + a += stride; + b[15] = vld1q_s16(a); +} + +// Store 8 16x8 values, assuming stride == 16. +static INLINE void store(tran_low_t *a, const int16x8_t *b /*[8]*/) { + store_s16q_to_tran_low(a, b[0]); + a += 16; + store_s16q_to_tran_low(a, b[1]); + a += 16; + store_s16q_to_tran_low(a, b[2]); + a += 16; + store_s16q_to_tran_low(a, b[3]); + a += 16; + store_s16q_to_tran_low(a, b[4]); + a += 16; + store_s16q_to_tran_low(a, b[5]); + a += 16; + store_s16q_to_tran_low(a, b[6]); + a += 16; + store_s16q_to_tran_low(a, b[7]); +} + +// Load step of each pass. Add and subtract clear across the input, requiring +// all 16 values to be loaded. For the first pass it also multiplies by 4. + +// To maybe reduce register usage this could be combined with the load() step to +// get the first 4 and last 4 values, cross those, then load the middle 8 values +// and cross them. +static INLINE void cross_input(const int16x8_t *a /*[16]*/, + int16x8_t *b /*[16]*/, const int pass) { + if (pass == 0) { + b[0] = vshlq_n_s16(vaddq_s16(a[0], a[15]), 2); + b[1] = vshlq_n_s16(vaddq_s16(a[1], a[14]), 2); + b[2] = vshlq_n_s16(vaddq_s16(a[2], a[13]), 2); + b[3] = vshlq_n_s16(vaddq_s16(a[3], a[12]), 2); + b[4] = vshlq_n_s16(vaddq_s16(a[4], a[11]), 2); + b[5] = vshlq_n_s16(vaddq_s16(a[5], a[10]), 2); + b[6] = vshlq_n_s16(vaddq_s16(a[6], a[9]), 2); + b[7] = vshlq_n_s16(vaddq_s16(a[7], a[8]), 2); + + b[8] = vshlq_n_s16(vsubq_s16(a[7], a[8]), 2); + b[9] = vshlq_n_s16(vsubq_s16(a[6], a[9]), 2); + b[10] = vshlq_n_s16(vsubq_s16(a[5], a[10]), 2); + b[11] = vshlq_n_s16(vsubq_s16(a[4], a[11]), 2); + b[12] = vshlq_n_s16(vsubq_s16(a[3], a[12]), 2); + b[13] = vshlq_n_s16(vsubq_s16(a[2], a[13]), 2); + b[14] = vshlq_n_s16(vsubq_s16(a[1], a[14]), 2); + b[15] = vshlq_n_s16(vsubq_s16(a[0], a[15]), 2); + } else { + b[0] = vaddq_s16(a[0], a[15]); + b[1] = vaddq_s16(a[1], a[14]); + b[2] = vaddq_s16(a[2], a[13]); + b[3] = vaddq_s16(a[3], a[12]); + b[4] = vaddq_s16(a[4], a[11]); + b[5] = vaddq_s16(a[5], a[10]); + b[6] = vaddq_s16(a[6], a[9]); + b[7] = vaddq_s16(a[7], a[8]); + + b[8] = vsubq_s16(a[7], a[8]); + b[9] = vsubq_s16(a[6], a[9]); + b[10] = vsubq_s16(a[5], a[10]); + b[11] = vsubq_s16(a[4], a[11]); + b[12] = vsubq_s16(a[3], a[12]); + b[13] = vsubq_s16(a[2], a[13]); + b[14] = vsubq_s16(a[1], a[14]); + b[15] = vsubq_s16(a[0], a[15]); + } +} + +// Quarter round at the beginning of the second pass. Can't use vrshr (rounding) +// because this only adds 1, not 1 << 2. +static INLINE void partial_round_shift(int16x8_t *a /*[16]*/) { + const int16x8_t one = vdupq_n_s16(1); + a[0] = vshrq_n_s16(vaddq_s16(a[0], one), 2); + a[1] = vshrq_n_s16(vaddq_s16(a[1], one), 2); + a[2] = vshrq_n_s16(vaddq_s16(a[2], one), 2); + a[3] = vshrq_n_s16(vaddq_s16(a[3], one), 2); + a[4] = vshrq_n_s16(vaddq_s16(a[4], one), 2); + a[5] = vshrq_n_s16(vaddq_s16(a[5], one), 2); + a[6] = vshrq_n_s16(vaddq_s16(a[6], one), 2); + a[7] = vshrq_n_s16(vaddq_s16(a[7], one), 2); + a[8] = vshrq_n_s16(vaddq_s16(a[8], one), 2); + a[9] = vshrq_n_s16(vaddq_s16(a[9], one), 2); + a[10] = vshrq_n_s16(vaddq_s16(a[10], one), 2); + a[11] = vshrq_n_s16(vaddq_s16(a[11], one), 2); + a[12] = vshrq_n_s16(vaddq_s16(a[12], one), 2); + a[13] = vshrq_n_s16(vaddq_s16(a[13], one), 2); + a[14] = vshrq_n_s16(vaddq_s16(a[14], one), 2); + a[15] = vshrq_n_s16(vaddq_s16(a[15], one), 2); +} + +// fdct_round_shift((a +/- b) * c) +static INLINE void butterfly_one_coeff(const int16x8_t a, const int16x8_t b, + const tran_high_t c, int16x8_t *add, + int16x8_t *sub) { + const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), c); + const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), c); + const int32x4_t sum0 = vmlal_n_s16(a0, vget_low_s16(b), c); + const int32x4_t sum1 = vmlal_n_s16(a1, vget_high_s16(b), c); + const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), c); + const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), c); + const int16x4_t rounded0 = vqrshrn_n_s32(sum0, 14); + const int16x4_t rounded1 = vqrshrn_n_s32(sum1, 14); + const int16x4_t rounded2 = vqrshrn_n_s32(diff0, 14); + const int16x4_t rounded3 = vqrshrn_n_s32(diff1, 14); + *add = vcombine_s16(rounded0, rounded1); + *sub = vcombine_s16(rounded2, rounded3); +} + +// fdct_round_shift(a * c0 +/- b * c1) +static INLINE void butterfly_two_coeff(const int16x8_t a, const int16x8_t b, + const tran_coef_t c0, + const tran_coef_t c1, int16x8_t *add, + int16x8_t *sub) { + const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), c0); + const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), c0); + const int32x4_t a2 = vmull_n_s16(vget_low_s16(a), c1); + const int32x4_t a3 = vmull_n_s16(vget_high_s16(a), c1); + const int32x4_t sum0 = vmlal_n_s16(a2, vget_low_s16(b), c0); + const int32x4_t sum1 = vmlal_n_s16(a3, vget_high_s16(b), c0); + const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), c1); + const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), c1); + const int16x4_t rounded0 = vqrshrn_n_s32(sum0, 14); + const int16x4_t rounded1 = vqrshrn_n_s32(sum1, 14); + const int16x4_t rounded2 = vqrshrn_n_s32(diff0, 14); + const int16x4_t rounded3 = vqrshrn_n_s32(diff1, 14); + *add = vcombine_s16(rounded0, rounded1); + *sub = vcombine_s16(rounded2, rounded3); +} + +// Transpose 8x8 to a new location. Don't use transpose_neon.h because those +// are all in-place. +static INLINE void transpose_8x8(const int16x8_t *a /*[8]*/, + int16x8_t *b /*[8]*/) { + // Swap 16 bit elements. + const int16x8x2_t c0 = vtrnq_s16(a[0], a[1]); + const int16x8x2_t c1 = vtrnq_s16(a[2], a[3]); + const int16x8x2_t c2 = vtrnq_s16(a[4], a[5]); + const int16x8x2_t c3 = vtrnq_s16(a[6], a[7]); + + // Swap 32 bit elements. + const int32x4x2_t d0 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[0]), + vreinterpretq_s32_s16(c1.val[0])); + const int32x4x2_t d1 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[1]), + vreinterpretq_s32_s16(c1.val[1])); + const int32x4x2_t d2 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[0]), + vreinterpretq_s32_s16(c3.val[0])); + const int32x4x2_t d3 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[1]), + vreinterpretq_s32_s16(c3.val[1])); + + // Swap 64 bit elements + const int16x8x2_t e0 = vpx_vtrnq_s64_to_s16(d0.val[0], d2.val[0]); + const int16x8x2_t e1 = vpx_vtrnq_s64_to_s16(d1.val[0], d3.val[0]); + const int16x8x2_t e2 = vpx_vtrnq_s64_to_s16(d0.val[1], d2.val[1]); + const int16x8x2_t e3 = vpx_vtrnq_s64_to_s16(d1.val[1], d3.val[1]); + + b[0] = e0.val[0]; + b[1] = e1.val[0]; + b[2] = e2.val[0]; + b[3] = e3.val[0]; + b[4] = e0.val[1]; + b[5] = e1.val[1]; + b[6] = e2.val[1]; + b[7] = e3.val[1]; +} + +// Main body of fdct16x16. +static void vpx_fdct16x16_body(const int16x8_t *in /*[16]*/, + int16x8_t *out /*[16]*/) { + int16x8_t s[8]; + int16x8_t x[4]; + int16x8_t step[8]; + + // stage 1 + // From fwd_txfm.c: Work on the first eight values; fdct8(input, + // even_results);" + s[0] = vaddq_s16(in[0], in[7]); + s[1] = vaddq_s16(in[1], in[6]); + s[2] = vaddq_s16(in[2], in[5]); + s[3] = vaddq_s16(in[3], in[4]); + s[4] = vsubq_s16(in[3], in[4]); + s[5] = vsubq_s16(in[2], in[5]); + s[6] = vsubq_s16(in[1], in[6]); + s[7] = vsubq_s16(in[0], in[7]); + + // fdct4(step, step); + x[0] = vaddq_s16(s[0], s[3]); + x[1] = vaddq_s16(s[1], s[2]); + x[2] = vsubq_s16(s[1], s[2]); + x[3] = vsubq_s16(s[0], s[3]); + + // out[0] = fdct_round_shift((x0 + x1) * cospi_16_64) + // out[8] = fdct_round_shift((x0 - x1) * cospi_16_64) + butterfly_one_coeff(x[0], x[1], cospi_16_64, &out[0], &out[8]); + // out[4] = fdct_round_shift(x3 * cospi_8_64 + x2 * cospi_24_64); + // out[12] = fdct_round_shift(x3 * cospi_24_64 - x2 * cospi_8_64); + butterfly_two_coeff(x[3], x[2], cospi_24_64, cospi_8_64, &out[4], &out[12]); + + // Stage 2 + // Re-using source s5/s6 + // s5 = fdct_round_shift((s6 - s5) * cospi_16_64) + // s6 = fdct_round_shift((s6 + s5) * cospi_16_64) + butterfly_one_coeff(s[6], s[5], cospi_16_64, &s[6], &s[5]); + + // Stage 3 + x[0] = vaddq_s16(s[4], s[5]); + x[1] = vsubq_s16(s[4], s[5]); + x[2] = vsubq_s16(s[7], s[6]); + x[3] = vaddq_s16(s[7], s[6]); + + // Stage 4 + // out[2] = fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64) + // out[14] = fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64) + butterfly_two_coeff(x[3], x[0], cospi_28_64, cospi_4_64, &out[2], &out[14]); + // out[6] = fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64) + // out[10] = fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64) + butterfly_two_coeff(x[2], x[1], cospi_12_64, cospi_20_64, &out[10], &out[6]); + + // step 2 + // From fwd_txfm.c: Work on the next eight values; step1 -> odd_results" + // That file distinguished between "in_high" and "step1" but the only + // difference is that "in_high" is the first 8 values and "step 1" is the + // second. Here, since they are all in one array, "step1" values are += 8. + + // step2[2] = fdct_round_shift((step1[5] - step1[2]) * cospi_16_64) + // step2[3] = fdct_round_shift((step1[4] - step1[3]) * cospi_16_64) + // step2[4] = fdct_round_shift((step1[4] + step1[3]) * cospi_16_64) + // step2[5] = fdct_round_shift((step1[5] + step1[2]) * cospi_16_64) + butterfly_one_coeff(in[13], in[10], cospi_16_64, &s[5], &s[2]); + butterfly_one_coeff(in[12], in[11], cospi_16_64, &s[4], &s[3]); + + // step 3 + s[0] = vaddq_s16(in[8], s[3]); + s[1] = vaddq_s16(in[9], s[2]); + x[0] = vsubq_s16(in[9], s[2]); + x[1] = vsubq_s16(in[8], s[3]); + x[2] = vsubq_s16(in[15], s[4]); + x[3] = vsubq_s16(in[14], s[5]); + s[6] = vaddq_s16(in[14], s[5]); + s[7] = vaddq_s16(in[15], s[4]); + + // step 4 + // step2[1] = fdct_round_shift(step3[1] *-cospi_8_64 + step3[6] * cospi_24_64) + // step2[6] = fdct_round_shift(step3[1] * cospi_24_64 + step3[6] * cospi_8_64) + butterfly_two_coeff(s[6], s[1], cospi_24_64, cospi_8_64, &s[6], &s[1]); + + // step2[2] = fdct_round_shift(step3[2] * cospi_24_64 + step3[5] * cospi_8_64) + // step2[5] = fdct_round_shift(step3[2] * cospi_8_64 - step3[5] * cospi_24_64) + butterfly_two_coeff(x[0], x[3], cospi_8_64, cospi_24_64, &s[2], &s[5]); + + // step 5 + step[0] = vaddq_s16(s[0], s[1]); + step[1] = vsubq_s16(s[0], s[1]); + step[2] = vaddq_s16(x[1], s[2]); + step[3] = vsubq_s16(x[1], s[2]); + step[4] = vsubq_s16(x[2], s[5]); + step[5] = vaddq_s16(x[2], s[5]); + step[6] = vsubq_s16(s[7], s[6]); + step[7] = vaddq_s16(s[7], s[6]); + + // step 6 + // out[1] = fdct_round_shift(step1[0] * cospi_30_64 + step1[7] * cospi_2_64) + // out[9] = fdct_round_shift(step1[1] * cospi_14_64 + step1[6] * cospi_18_64) + // out[5] = fdct_round_shift(step1[2] * cospi_22_64 + step1[5] * cospi_10_64) + // out[13] = fdct_round_shift(step1[3] * cospi_6_64 + step1[4] * cospi_26_64) + // out[3] = fdct_round_shift(step1[3] * -cospi_26_64 + step1[4] * cospi_6_64) + // out[11] = fdct_round_shift(step1[2] * -cospi_10_64 + step1[5] * + // cospi_22_64) + // out[7] = fdct_round_shift(step1[1] * -cospi_18_64 + step1[6] * cospi_14_64) + // out[15] = fdct_round_shift(step1[0] * -cospi_2_64 + step1[7] * cospi_30_64) + butterfly_two_coeff(step[6], step[1], cospi_14_64, cospi_18_64, &out[9], + &out[7]); + butterfly_two_coeff(step[7], step[0], cospi_30_64, cospi_2_64, &out[1], + &out[15]); + butterfly_two_coeff(step[4], step[3], cospi_6_64, cospi_26_64, &out[13], + &out[3]); + butterfly_two_coeff(step[5], step[2], cospi_22_64, cospi_10_64, &out[5], + &out[11]); +} + +#endif // VPX_VPX_DSP_ARM_FDCT16x16_NEON_H_ diff --git a/vpx_dsp/arm/fdct_neon.c b/vpx_dsp/arm/fdct_neon.c index 3708cbb11f..2827791f1e 100644 --- a/vpx_dsp/arm/fdct_neon.c +++ b/vpx_dsp/arm/fdct_neon.c @@ -15,6 +15,7 @@ #include "vpx_dsp/txfm_common.h" #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/fdct_neon.h" #include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/transpose_neon.h" @@ -22,67 +23,25 @@ void vpx_fdct4x4_neon(const int16_t *input, tran_low_t *final_output, int stride) { int i; // input[M * stride] * 16 - int16x4_t input_0 = vshl_n_s16(vld1_s16(input + 0 * stride), 4); - int16x4_t input_1 = vshl_n_s16(vld1_s16(input + 1 * stride), 4); - int16x4_t input_2 = vshl_n_s16(vld1_s16(input + 2 * stride), 4); - int16x4_t input_3 = vshl_n_s16(vld1_s16(input + 3 * stride), 4); + int16x4_t in[4]; + in[0] = vshl_n_s16(vld1_s16(input + 0 * stride), 4); + in[1] = vshl_n_s16(vld1_s16(input + 1 * stride), 4); + in[2] = vshl_n_s16(vld1_s16(input + 2 * stride), 4); + in[3] = vshl_n_s16(vld1_s16(input + 3 * stride), 4); // If the very first value != 0, then add 1. if (input[0] != 0) { const int16x4_t one = vreinterpret_s16_s64(vdup_n_s64(1)); - input_0 = vadd_s16(input_0, one); + in[0] = vadd_s16(in[0], one); } - for (i = 0; i < 2; ++i) { - const int16x8_t input_01 = vcombine_s16(input_0, input_1); - const int16x8_t input_32 = vcombine_s16(input_3, input_2); - - // in_0 +/- in_3, in_1 +/- in_2 - const int16x8_t s_01 = vaddq_s16(input_01, input_32); - const int16x8_t s_32 = vsubq_s16(input_01, input_32); - - // step_0 +/- step_1, step_2 +/- step_3 - const int16x4_t s_0 = vget_low_s16(s_01); - const int16x4_t s_1 = vget_high_s16(s_01); - const int16x4_t s_2 = vget_high_s16(s_32); - const int16x4_t s_3 = vget_low_s16(s_32); - - // (s_0 +/- s_1) * cospi_16_64 - // Must expand all elements to s32. See 'needs32' comment in fwd_txfm.c. - const int32x4_t s_0_p_s_1 = vaddl_s16(s_0, s_1); - const int32x4_t s_0_m_s_1 = vsubl_s16(s_0, s_1); - const int32x4_t temp1 = vmulq_n_s32(s_0_p_s_1, cospi_16_64); - const int32x4_t temp2 = vmulq_n_s32(s_0_m_s_1, cospi_16_64); - - // fdct_round_shift - int16x4_t out_0 = vrshrn_n_s32(temp1, DCT_CONST_BITS); - int16x4_t out_2 = vrshrn_n_s32(temp2, DCT_CONST_BITS); - - // s_3 * cospi_8_64 + s_2 * cospi_24_64 - // s_3 * cospi_24_64 - s_2 * cospi_8_64 - const int32x4_t s_3_cospi_8_64 = vmull_n_s16(s_3, cospi_8_64); - const int32x4_t s_3_cospi_24_64 = vmull_n_s16(s_3, cospi_24_64); - - const int32x4_t temp3 = vmlal_n_s16(s_3_cospi_8_64, s_2, cospi_24_64); - const int32x4_t temp4 = vmlsl_n_s16(s_3_cospi_24_64, s_2, cospi_8_64); - - // fdct_round_shift - int16x4_t out_1 = vrshrn_n_s32(temp3, DCT_CONST_BITS); - int16x4_t out_3 = vrshrn_n_s32(temp4, DCT_CONST_BITS); - - transpose_s16_4x4d(&out_0, &out_1, &out_2, &out_3); - - input_0 = out_0; - input_1 = out_1; - input_2 = out_2; - input_3 = out_3; + vpx_fdct4x4_pass1_neon(in); } - { // Not quite a rounding shift. Only add 1 despite shifting by 2. const int16x8_t one = vdupq_n_s16(1); - int16x8_t out_01 = vcombine_s16(input_0, input_1); - int16x8_t out_23 = vcombine_s16(input_2, input_3); + int16x8_t out_01 = vcombine_s16(in[0], in[1]); + int16x8_t out_23 = vcombine_s16(in[2], in[3]); out_01 = vshrq_n_s16(vaddq_s16(out_01, one), 2); out_23 = vshrq_n_s16(vaddq_s16(out_23, one), 2); store_s16q_to_tran_low(final_output + 0 * 8, out_01); diff --git a/vpx_dsp/arm/fdct_neon.h b/vpx_dsp/arm/fdct_neon.h new file mode 100644 index 0000000000..28d7d86bf8 --- /dev/null +++ b/vpx_dsp/arm/fdct_neon.h @@ -0,0 +1,213 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_ARM_FDCT_NEON_H_ +#define VPX_VPX_DSP_ARM_FDCT_NEON_H_ + +#include + +static INLINE void vpx_fdct4x4_pass1_neon(int16x4_t *in) { + const int16x8_t input_01 = vcombine_s16(in[0], in[1]); + const int16x8_t input_32 = vcombine_s16(in[3], in[2]); + + // in_0 +/- in_3, in_1 +/- in_2 + const int16x8_t s_01 = vaddq_s16(input_01, input_32); + const int16x8_t s_32 = vsubq_s16(input_01, input_32); + + // step_0 +/- step_1, step_2 +/- step_3 + const int16x4_t s_0 = vget_low_s16(s_01); + const int16x4_t s_1 = vget_high_s16(s_01); + const int16x4_t s_2 = vget_high_s16(s_32); + const int16x4_t s_3 = vget_low_s16(s_32); + + // (s_0 +/- s_1) * cospi_16_64 + // Must expand all elements to s32. See 'needs32' comment in fwd_txfm.c. + const int32x4_t s_0_p_s_1 = vaddl_s16(s_0, s_1); + const int32x4_t s_0_m_s_1 = vsubl_s16(s_0, s_1); + const int32x4_t temp1 = vmulq_n_s32(s_0_p_s_1, cospi_16_64); + const int32x4_t temp2 = vmulq_n_s32(s_0_m_s_1, cospi_16_64); + + // fdct_round_shift + int16x4_t out_0 = vrshrn_n_s32(temp1, DCT_CONST_BITS); + int16x4_t out_2 = vrshrn_n_s32(temp2, DCT_CONST_BITS); + + // s_3 * cospi_8_64 + s_2 * cospi_24_64 + // s_3 * cospi_24_64 - s_2 * cospi_8_64 + const int32x4_t s_3_cospi_8_64 = vmull_n_s16(s_3, cospi_8_64); + const int32x4_t s_3_cospi_24_64 = vmull_n_s16(s_3, cospi_24_64); + + const int32x4_t temp3 = vmlal_n_s16(s_3_cospi_8_64, s_2, cospi_24_64); + const int32x4_t temp4 = vmlsl_n_s16(s_3_cospi_24_64, s_2, cospi_8_64); + + // fdct_round_shift + int16x4_t out_1 = vrshrn_n_s32(temp3, DCT_CONST_BITS); + int16x4_t out_3 = vrshrn_n_s32(temp4, DCT_CONST_BITS); + + transpose_s16_4x4d(&out_0, &out_1, &out_2, &out_3); + + in[0] = out_0; + in[1] = out_1; + in[2] = out_2; + in[3] = out_3; +} + +static INLINE void vpx_fdct8x8_pass1_notranspose_neon(int16x8_t *in, + int16x8_t *out) { + const int16x8_t v_s0 = vaddq_s16(in[0], in[7]); + const int16x8_t v_s1 = vaddq_s16(in[1], in[6]); + const int16x8_t v_s2 = vaddq_s16(in[2], in[5]); + const int16x8_t v_s3 = vaddq_s16(in[3], in[4]); + const int16x8_t v_s4 = vsubq_s16(in[3], in[4]); + const int16x8_t v_s5 = vsubq_s16(in[2], in[5]); + const int16x8_t v_s6 = vsubq_s16(in[1], in[6]); + const int16x8_t v_s7 = vsubq_s16(in[0], in[7]); + // fdct4(step, step); + int16x8_t v_x0 = vaddq_s16(v_s0, v_s3); + int16x8_t v_x1 = vaddq_s16(v_s1, v_s2); + int16x8_t v_x2 = vsubq_s16(v_s1, v_s2); + int16x8_t v_x3 = vsubq_s16(v_s0, v_s3); + // fdct4(step, step); + int32x4_t v_t0_lo = vaddl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1)); + int32x4_t v_t0_hi = vaddl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1)); + int32x4_t v_t1_lo = vsubl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1)); + int32x4_t v_t1_hi = vsubl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1)); + int32x4_t v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), cospi_24_64); + int32x4_t v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), cospi_24_64); + int32x4_t v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), cospi_24_64); + int32x4_t v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), cospi_24_64); + v_t2_lo = vmlal_n_s16(v_t2_lo, vget_low_s16(v_x3), cospi_8_64); + v_t2_hi = vmlal_n_s16(v_t2_hi, vget_high_s16(v_x3), cospi_8_64); + v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x2), cospi_8_64); + v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x2), cospi_8_64); + v_t0_lo = vmulq_n_s32(v_t0_lo, cospi_16_64); + v_t0_hi = vmulq_n_s32(v_t0_hi, cospi_16_64); + v_t1_lo = vmulq_n_s32(v_t1_lo, cospi_16_64); + v_t1_hi = vmulq_n_s32(v_t1_hi, cospi_16_64); + { + const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS); + const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS); + const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS); + const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS); + const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS); + const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS); + const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS); + const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS); + out[0] = vcombine_s16(a, c); // 00 01 02 03 40 41 42 43 + out[2] = vcombine_s16(e, g); // 20 21 22 23 60 61 62 63 + out[4] = vcombine_s16(b, d); // 04 05 06 07 44 45 46 47 + out[6] = vcombine_s16(f, h); // 24 25 26 27 64 65 66 67 + } + // Stage 2 + v_x0 = vsubq_s16(v_s6, v_s5); + v_x1 = vaddq_s16(v_s6, v_s5); + v_t0_lo = vmull_n_s16(vget_low_s16(v_x0), cospi_16_64); + v_t0_hi = vmull_n_s16(vget_high_s16(v_x0), cospi_16_64); + v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), cospi_16_64); + v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), cospi_16_64); + { + const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS); + const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS); + const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS); + const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS); + const int16x8_t ab = vcombine_s16(a, b); + const int16x8_t cd = vcombine_s16(c, d); + // Stage 3 + v_x0 = vaddq_s16(v_s4, ab); + v_x1 = vsubq_s16(v_s4, ab); + v_x2 = vsubq_s16(v_s7, cd); + v_x3 = vaddq_s16(v_s7, cd); + } + // Stage 4 + v_t0_lo = vmull_n_s16(vget_low_s16(v_x3), cospi_4_64); + v_t0_hi = vmull_n_s16(vget_high_s16(v_x3), cospi_4_64); + v_t0_lo = vmlal_n_s16(v_t0_lo, vget_low_s16(v_x0), cospi_28_64); + v_t0_hi = vmlal_n_s16(v_t0_hi, vget_high_s16(v_x0), cospi_28_64); + v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), cospi_12_64); + v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), cospi_12_64); + v_t1_lo = vmlal_n_s16(v_t1_lo, vget_low_s16(v_x2), cospi_20_64); + v_t1_hi = vmlal_n_s16(v_t1_hi, vget_high_s16(v_x2), cospi_20_64); + v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), cospi_12_64); + v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), cospi_12_64); + v_t2_lo = vmlsl_n_s16(v_t2_lo, vget_low_s16(v_x1), cospi_20_64); + v_t2_hi = vmlsl_n_s16(v_t2_hi, vget_high_s16(v_x1), cospi_20_64); + v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), cospi_28_64); + v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), cospi_28_64); + v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x0), cospi_4_64); + v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x0), cospi_4_64); + { + const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS); + const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS); + const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS); + const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS); + const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS); + const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS); + const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS); + const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS); + out[1] = vcombine_s16(a, c); // 10 11 12 13 50 51 52 53 + out[3] = vcombine_s16(e, g); // 30 31 32 33 70 71 72 73 + out[5] = vcombine_s16(b, d); // 14 15 16 17 54 55 56 57 + out[7] = vcombine_s16(f, h); // 34 35 36 37 74 75 76 77 + } +} + +static INLINE void vpx_fdct8x8_pass1_neon(int16x8_t *in) { + int16x8_t out[8]; + vpx_fdct8x8_pass1_notranspose_neon(in, out); + // transpose 8x8 + // Can't use transpose_s16_8x8() because the values are arranged in two 4x8 + // columns. + { + // 00 01 02 03 40 41 42 43 + // 10 11 12 13 50 51 52 53 + // 20 21 22 23 60 61 62 63 + // 30 31 32 33 70 71 72 73 + // 04 05 06 07 44 45 46 47 + // 14 15 16 17 54 55 56 57 + // 24 25 26 27 64 65 66 67 + // 34 35 36 37 74 75 76 77 + const int32x4x2_t r02_s32 = + vtrnq_s32(vreinterpretq_s32_s16(out[0]), vreinterpretq_s32_s16(out[2])); + const int32x4x2_t r13_s32 = + vtrnq_s32(vreinterpretq_s32_s16(out[1]), vreinterpretq_s32_s16(out[3])); + const int32x4x2_t r46_s32 = + vtrnq_s32(vreinterpretq_s32_s16(out[4]), vreinterpretq_s32_s16(out[6])); + const int32x4x2_t r57_s32 = + vtrnq_s32(vreinterpretq_s32_s16(out[5]), vreinterpretq_s32_s16(out[7])); + const int16x8x2_t r01_s16 = + vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[0]), + vreinterpretq_s16_s32(r13_s32.val[0])); + const int16x8x2_t r23_s16 = + vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[1]), + vreinterpretq_s16_s32(r13_s32.val[1])); + const int16x8x2_t r45_s16 = + vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[0]), + vreinterpretq_s16_s32(r57_s32.val[0])); + const int16x8x2_t r67_s16 = + vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[1]), + vreinterpretq_s16_s32(r57_s32.val[1])); + in[0] = r01_s16.val[0]; + in[1] = r01_s16.val[1]; + in[2] = r23_s16.val[0]; + in[3] = r23_s16.val[1]; + in[4] = r45_s16.val[0]; + in[5] = r45_s16.val[1]; + in[6] = r67_s16.val[0]; + in[7] = r67_s16.val[1]; + // 00 10 20 30 40 50 60 70 + // 01 11 21 31 41 51 61 71 + // 02 12 22 32 42 52 62 72 + // 03 13 23 33 43 53 63 73 + // 04 14 24 34 44 54 64 74 + // 05 15 25 35 45 55 65 75 + // 06 16 26 36 46 56 66 76 + // 07 17 27 37 47 57 67 77 + } +} +#endif // VPX_VPX_DSP_ARM_FDCT_NEON_H_ diff --git a/vpx_dsp/arm/fwd_txfm_neon.c b/vpx_dsp/arm/fwd_txfm_neon.c index 374a262b93..d9161c6d38 100644 --- a/vpx_dsp/arm/fwd_txfm_neon.c +++ b/vpx_dsp/arm/fwd_txfm_neon.c @@ -15,196 +15,54 @@ #include "vpx_dsp/txfm_common.h" #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/fdct_neon.h" #include "vpx_dsp/arm/mem_neon.h" void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *final_output, int stride) { int i; // stage 1 - int16x8_t input_0 = vshlq_n_s16(vld1q_s16(&input[0 * stride]), 2); - int16x8_t input_1 = vshlq_n_s16(vld1q_s16(&input[1 * stride]), 2); - int16x8_t input_2 = vshlq_n_s16(vld1q_s16(&input[2 * stride]), 2); - int16x8_t input_3 = vshlq_n_s16(vld1q_s16(&input[3 * stride]), 2); - int16x8_t input_4 = vshlq_n_s16(vld1q_s16(&input[4 * stride]), 2); - int16x8_t input_5 = vshlq_n_s16(vld1q_s16(&input[5 * stride]), 2); - int16x8_t input_6 = vshlq_n_s16(vld1q_s16(&input[6 * stride]), 2); - int16x8_t input_7 = vshlq_n_s16(vld1q_s16(&input[7 * stride]), 2); + int16x8_t in[8]; + in[0] = vshlq_n_s16(vld1q_s16(&input[0 * stride]), 2); + in[1] = vshlq_n_s16(vld1q_s16(&input[1 * stride]), 2); + in[2] = vshlq_n_s16(vld1q_s16(&input[2 * stride]), 2); + in[3] = vshlq_n_s16(vld1q_s16(&input[3 * stride]), 2); + in[4] = vshlq_n_s16(vld1q_s16(&input[4 * stride]), 2); + in[5] = vshlq_n_s16(vld1q_s16(&input[5 * stride]), 2); + in[6] = vshlq_n_s16(vld1q_s16(&input[6 * stride]), 2); + in[7] = vshlq_n_s16(vld1q_s16(&input[7 * stride]), 2); for (i = 0; i < 2; ++i) { - int16x8_t out_0, out_1, out_2, out_3, out_4, out_5, out_6, out_7; - const int16x8_t v_s0 = vaddq_s16(input_0, input_7); - const int16x8_t v_s1 = vaddq_s16(input_1, input_6); - const int16x8_t v_s2 = vaddq_s16(input_2, input_5); - const int16x8_t v_s3 = vaddq_s16(input_3, input_4); - const int16x8_t v_s4 = vsubq_s16(input_3, input_4); - const int16x8_t v_s5 = vsubq_s16(input_2, input_5); - const int16x8_t v_s6 = vsubq_s16(input_1, input_6); - const int16x8_t v_s7 = vsubq_s16(input_0, input_7); - // fdct4(step, step); - int16x8_t v_x0 = vaddq_s16(v_s0, v_s3); - int16x8_t v_x1 = vaddq_s16(v_s1, v_s2); - int16x8_t v_x2 = vsubq_s16(v_s1, v_s2); - int16x8_t v_x3 = vsubq_s16(v_s0, v_s3); - // fdct4(step, step); - int32x4_t v_t0_lo = vaddl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1)); - int32x4_t v_t0_hi = vaddl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1)); - int32x4_t v_t1_lo = vsubl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1)); - int32x4_t v_t1_hi = vsubl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1)); - int32x4_t v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), cospi_24_64); - int32x4_t v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), cospi_24_64); - int32x4_t v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), cospi_24_64); - int32x4_t v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), cospi_24_64); - v_t2_lo = vmlal_n_s16(v_t2_lo, vget_low_s16(v_x3), cospi_8_64); - v_t2_hi = vmlal_n_s16(v_t2_hi, vget_high_s16(v_x3), cospi_8_64); - v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x2), cospi_8_64); - v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x2), cospi_8_64); - v_t0_lo = vmulq_n_s32(v_t0_lo, cospi_16_64); - v_t0_hi = vmulq_n_s32(v_t0_hi, cospi_16_64); - v_t1_lo = vmulq_n_s32(v_t1_lo, cospi_16_64); - v_t1_hi = vmulq_n_s32(v_t1_hi, cospi_16_64); - { - const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS); - const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS); - const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS); - const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS); - const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS); - const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS); - const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS); - const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS); - out_0 = vcombine_s16(a, c); // 00 01 02 03 40 41 42 43 - out_2 = vcombine_s16(e, g); // 20 21 22 23 60 61 62 63 - out_4 = vcombine_s16(b, d); // 04 05 06 07 44 45 46 47 - out_6 = vcombine_s16(f, h); // 24 25 26 27 64 65 66 67 - } - // Stage 2 - v_x0 = vsubq_s16(v_s6, v_s5); - v_x1 = vaddq_s16(v_s6, v_s5); - v_t0_lo = vmull_n_s16(vget_low_s16(v_x0), cospi_16_64); - v_t0_hi = vmull_n_s16(vget_high_s16(v_x0), cospi_16_64); - v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), cospi_16_64); - v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), cospi_16_64); - { - const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS); - const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS); - const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS); - const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS); - const int16x8_t ab = vcombine_s16(a, b); - const int16x8_t cd = vcombine_s16(c, d); - // Stage 3 - v_x0 = vaddq_s16(v_s4, ab); - v_x1 = vsubq_s16(v_s4, ab); - v_x2 = vsubq_s16(v_s7, cd); - v_x3 = vaddq_s16(v_s7, cd); - } - // Stage 4 - v_t0_lo = vmull_n_s16(vget_low_s16(v_x3), cospi_4_64); - v_t0_hi = vmull_n_s16(vget_high_s16(v_x3), cospi_4_64); - v_t0_lo = vmlal_n_s16(v_t0_lo, vget_low_s16(v_x0), cospi_28_64); - v_t0_hi = vmlal_n_s16(v_t0_hi, vget_high_s16(v_x0), cospi_28_64); - v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), cospi_12_64); - v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), cospi_12_64); - v_t1_lo = vmlal_n_s16(v_t1_lo, vget_low_s16(v_x2), cospi_20_64); - v_t1_hi = vmlal_n_s16(v_t1_hi, vget_high_s16(v_x2), cospi_20_64); - v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), cospi_12_64); - v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), cospi_12_64); - v_t2_lo = vmlsl_n_s16(v_t2_lo, vget_low_s16(v_x1), cospi_20_64); - v_t2_hi = vmlsl_n_s16(v_t2_hi, vget_high_s16(v_x1), cospi_20_64); - v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), cospi_28_64); - v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), cospi_28_64); - v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x0), cospi_4_64); - v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x0), cospi_4_64); - { - const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS); - const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS); - const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS); - const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS); - const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS); - const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS); - const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS); - const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS); - out_1 = vcombine_s16(a, c); // 10 11 12 13 50 51 52 53 - out_3 = vcombine_s16(e, g); // 30 31 32 33 70 71 72 73 - out_5 = vcombine_s16(b, d); // 14 15 16 17 54 55 56 57 - out_7 = vcombine_s16(f, h); // 34 35 36 37 74 75 76 77 - } - // transpose 8x8 - // Can't use transpose_s16_8x8() because the values are arranged in two 4x8 - // columns. - { - // 00 01 02 03 40 41 42 43 - // 10 11 12 13 50 51 52 53 - // 20 21 22 23 60 61 62 63 - // 30 31 32 33 70 71 72 73 - // 04 05 06 07 44 45 46 47 - // 14 15 16 17 54 55 56 57 - // 24 25 26 27 64 65 66 67 - // 34 35 36 37 74 75 76 77 - const int32x4x2_t r02_s32 = - vtrnq_s32(vreinterpretq_s32_s16(out_0), vreinterpretq_s32_s16(out_2)); - const int32x4x2_t r13_s32 = - vtrnq_s32(vreinterpretq_s32_s16(out_1), vreinterpretq_s32_s16(out_3)); - const int32x4x2_t r46_s32 = - vtrnq_s32(vreinterpretq_s32_s16(out_4), vreinterpretq_s32_s16(out_6)); - const int32x4x2_t r57_s32 = - vtrnq_s32(vreinterpretq_s32_s16(out_5), vreinterpretq_s32_s16(out_7)); - const int16x8x2_t r01_s16 = - vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[0]), - vreinterpretq_s16_s32(r13_s32.val[0])); - const int16x8x2_t r23_s16 = - vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[1]), - vreinterpretq_s16_s32(r13_s32.val[1])); - const int16x8x2_t r45_s16 = - vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[0]), - vreinterpretq_s16_s32(r57_s32.val[0])); - const int16x8x2_t r67_s16 = - vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[1]), - vreinterpretq_s16_s32(r57_s32.val[1])); - input_0 = r01_s16.val[0]; - input_1 = r01_s16.val[1]; - input_2 = r23_s16.val[0]; - input_3 = r23_s16.val[1]; - input_4 = r45_s16.val[0]; - input_5 = r45_s16.val[1]; - input_6 = r67_s16.val[0]; - input_7 = r67_s16.val[1]; - // 00 10 20 30 40 50 60 70 - // 01 11 21 31 41 51 61 71 - // 02 12 22 32 42 52 62 72 - // 03 13 23 33 43 53 63 73 - // 04 14 24 34 44 54 64 74 - // 05 15 25 35 45 55 65 75 - // 06 16 26 36 46 56 66 76 - // 07 17 27 37 47 57 67 77 - } + vpx_fdct8x8_pass1_neon(in); } // for { // from vpx_dct_sse2.c // Post-condition (division by two) // division of two 16 bits signed numbers using shifts // n / 2 = (n - (n >> 15)) >> 1 - const int16x8_t sign_in0 = vshrq_n_s16(input_0, 15); - const int16x8_t sign_in1 = vshrq_n_s16(input_1, 15); - const int16x8_t sign_in2 = vshrq_n_s16(input_2, 15); - const int16x8_t sign_in3 = vshrq_n_s16(input_3, 15); - const int16x8_t sign_in4 = vshrq_n_s16(input_4, 15); - const int16x8_t sign_in5 = vshrq_n_s16(input_5, 15); - const int16x8_t sign_in6 = vshrq_n_s16(input_6, 15); - const int16x8_t sign_in7 = vshrq_n_s16(input_7, 15); - input_0 = vhsubq_s16(input_0, sign_in0); - input_1 = vhsubq_s16(input_1, sign_in1); - input_2 = vhsubq_s16(input_2, sign_in2); - input_3 = vhsubq_s16(input_3, sign_in3); - input_4 = vhsubq_s16(input_4, sign_in4); - input_5 = vhsubq_s16(input_5, sign_in5); - input_6 = vhsubq_s16(input_6, sign_in6); - input_7 = vhsubq_s16(input_7, sign_in7); + const int16x8_t sign_in0 = vshrq_n_s16(in[0], 15); + const int16x8_t sign_in1 = vshrq_n_s16(in[1], 15); + const int16x8_t sign_in2 = vshrq_n_s16(in[2], 15); + const int16x8_t sign_in3 = vshrq_n_s16(in[3], 15); + const int16x8_t sign_in4 = vshrq_n_s16(in[4], 15); + const int16x8_t sign_in5 = vshrq_n_s16(in[5], 15); + const int16x8_t sign_in6 = vshrq_n_s16(in[6], 15); + const int16x8_t sign_in7 = vshrq_n_s16(in[7], 15); + in[0] = vhsubq_s16(in[0], sign_in0); + in[1] = vhsubq_s16(in[1], sign_in1); + in[2] = vhsubq_s16(in[2], sign_in2); + in[3] = vhsubq_s16(in[3], sign_in3); + in[4] = vhsubq_s16(in[4], sign_in4); + in[5] = vhsubq_s16(in[5], sign_in5); + in[6] = vhsubq_s16(in[6], sign_in6); + in[7] = vhsubq_s16(in[7], sign_in7); // store results - store_s16q_to_tran_low(final_output + 0 * 8, input_0); - store_s16q_to_tran_low(final_output + 1 * 8, input_1); - store_s16q_to_tran_low(final_output + 2 * 8, input_2); - store_s16q_to_tran_low(final_output + 3 * 8, input_3); - store_s16q_to_tran_low(final_output + 4 * 8, input_4); - store_s16q_to_tran_low(final_output + 5 * 8, input_5); - store_s16q_to_tran_low(final_output + 6 * 8, input_6); - store_s16q_to_tran_low(final_output + 7 * 8, input_7); + store_s16q_to_tran_low(final_output + 0 * 8, in[0]); + store_s16q_to_tran_low(final_output + 1 * 8, in[1]); + store_s16q_to_tran_low(final_output + 2 * 8, in[2]); + store_s16q_to_tran_low(final_output + 3 * 8, in[3]); + store_s16q_to_tran_low(final_output + 4 * 8, in[4]); + store_s16q_to_tran_low(final_output + 5 * 8, in[5]); + store_s16q_to_tran_low(final_output + 6 * 8, in[6]); + store_s16q_to_tran_low(final_output + 7 * 8, in[7]); } } diff --git a/vpx_dsp/arm/transpose_neon.h b/vpx_dsp/arm/transpose_neon.h index 752308160d..c098ad31b6 100644 --- a/vpx_dsp/arm/transpose_neon.h +++ b/vpx_dsp/arm/transpose_neon.h @@ -1184,6 +1184,45 @@ static INLINE void transpose_u8_16x16( *o15 = e7.val[1]; } +static INLINE void transpose_s16_16x16(int16x8_t *in0, int16x8_t *in1) { + int16x8_t t[8]; + + // transpose the 4 8x8 quadrants separately but first swap quadrants 2 and 3. + t[0] = in0[8]; + t[1] = in0[9]; + t[2] = in0[10]; + t[3] = in0[11]; + t[4] = in0[12]; + t[5] = in0[13]; + t[6] = in0[14]; + t[7] = in0[15]; + in0[8] = in1[0]; + in0[9] = in1[1]; + in0[10] = in1[2]; + in0[11] = in1[3]; + in0[12] = in1[4]; + in0[13] = in1[5]; + in0[14] = in1[6]; + in0[15] = in1[7]; + in1[0] = t[0]; + in1[1] = t[1]; + in1[2] = t[2]; + in1[3] = t[3]; + in1[4] = t[4]; + in1[5] = t[5]; + in1[6] = t[6]; + in1[7] = t[7]; + + transpose_s16_8x8(&in0[0], &in0[1], &in0[2], &in0[3], &in0[4], &in0[5], + &in0[6], &in0[7]); + transpose_s16_8x8(&in0[8], &in0[9], &in0[10], &in0[11], &in0[12], &in0[13], + &in0[14], &in0[15]); + transpose_s16_8x8(&in1[0], &in1[1], &in1[2], &in1[3], &in1[4], &in1[5], + &in1[6], &in1[7]); + transpose_s16_8x8(&in1[8], &in1[9], &in1[10], &in1[11], &in1[12], &in1[13], + &in1[14], &in1[15]); +} + static INLINE void load_and_transpose_u8_4x8(const uint8_t *a, const int a_stride, uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2, From f3711cae5a4cf5d673f7f27ea3a5975ab3f2982a Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Tue, 22 Mar 2022 13:07:31 -0700 Subject: [PATCH 0297/1000] Fix ClangTidy style warning Change-Id: I6c4711e488cda6b97af96d5e1b6b249786e709de --- test/svc_end_to_end_test.cc | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/test/svc_end_to_end_test.cc b/test/svc_end_to_end_test.cc index c0556d8b7d..7300ce6679 100644 --- a/test/svc_end_to_end_test.cc +++ b/test/svc_end_to_end_test.cc @@ -251,12 +251,13 @@ class SyncFrameOnePassCbrSvc : public OnePassCbrSvc, temporal_layer_id_ = layer_id.temporal_layer_id; for (int i = 0; i < number_spatial_layers_; i++) { layer_id.temporal_layer_id_per_spatial[i] = temporal_layer_id_; - ref_frame_config.duration[i] = 1; + ref_frame_config_.duration[i] = 1; } encoder->Control(VP9E_SET_SVC_LAYER_ID, &layer_id); set_frame_flags_bypass_mode(layer_id.temporal_layer_id, - number_spatial_layers_, 0, &ref_frame_config); - encoder->Control(VP9E_SET_SVC_REF_FRAME_CONFIG, &ref_frame_config); + number_spatial_layers_, 0, + &ref_frame_config_); + encoder->Control(VP9E_SET_SVC_REF_FRAME_CONFIG, &ref_frame_config_); } if (video->frame() == frame_to_sync_) { encoder->Control(VP9E_SET_SVC_SPATIAL_LAYER_SYNC, &svc_layer_sync_); @@ -327,7 +328,7 @@ class SyncFrameOnePassCbrSvc : public OnePassCbrSvc, unsigned int mismatch_nframes_; unsigned int num_nonref_frames_; bool flexible_mode_; - vpx_svc_ref_frame_config_t ref_frame_config; + vpx_svc_ref_frame_config_t ref_frame_config_; private: virtual void SetConfig(const int num_temporal_layer) { From f6344745d9887adb38a62a878a3b794b84240851 Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 22 Mar 2022 13:51:27 -0700 Subject: [PATCH 0298/1000] ads2gas*.pl: strip trailing whitespace after transforms Change-Id: I0bea977b256e464231706c72cc14a5c8b6e90775 --- build/make/ads2gas.pl | 1 + build/make/ads2gas_apple.pl | 1 + 2 files changed, 2 insertions(+) diff --git a/build/make/ads2gas.pl b/build/make/ads2gas.pl index 4b7a906d26..c17c4114a2 100755 --- a/build/make/ads2gas.pl +++ b/build/make/ads2gas.pl @@ -147,6 +147,7 @@ s/\bMEND\b/.endm/; # No need to tell it where to stop assembling next if /^\s*END\s*$/; + s/[ \t]+$//; print; } diff --git a/build/make/ads2gas_apple.pl b/build/make/ads2gas_apple.pl index af10b436a9..34254f4abe 100755 --- a/build/make/ads2gas_apple.pl +++ b/build/make/ads2gas_apple.pl @@ -109,5 +109,6 @@ ($) s/\bMEND\b/.endm/; # No need to tell it where to stop assembling next if /^\s*END\s*$/; + s/[ \t]+$//; print; } From da0cfd3d592713a66863484f1c19aba2b775306f Mon Sep 17 00:00:00 2001 From: Johann Date: Wed, 23 Mar 2022 13:58:46 +0900 Subject: [PATCH 0299/1000] ads2gas: fix .size measurement The distance between PROC and END is used to generate .size information for debugging. When the leading underscore was removed the pattern used to match the function name broke. Change-Id: I90bf67d95ecdc2d214606e663773f88d2a2d6b9c --- build/make/ads2gas.pl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/build/make/ads2gas.pl b/build/make/ads2gas.pl index c17c4114a2..fcbd59b89b 100755 --- a/build/make/ads2gas.pl +++ b/build/make/ads2gas.pl @@ -118,7 +118,8 @@ # This makes them show up properly in debugging tools like gdb and valgrind. if (/\bPROC\b/) { my $proc; - /^_([\.0-9A-Z_a-z]\w+)\b/; + # Match the function name so it can be stored in $proc + /^([\.0-9A-Z_a-z]\w+)\b/; $proc = $1; push(@proc_stack, $proc) if ($proc); s/\bPROC\b/@ $&/; From 29cde7ec1a1d466e30c33b939616a4903a8abb3f Mon Sep 17 00:00:00 2001 From: Johann Date: Wed, 23 Mar 2022 14:18:58 +0900 Subject: [PATCH 0300/1000] ads2gas: maintain whitespace Don't use tabs during conversion. Save and restore existing spacing. Change-Id: Ib8f443db542c091d36e9ab9836e3e3e292d711f7 --- build/make/ads2gas.pl | 31 ++++++++++++++++--------------- build/make/ads2gas_apple.pl | 8 ++++---- 2 files changed, 20 insertions(+), 19 deletions(-) diff --git a/build/make/ads2gas.pl b/build/make/ads2gas.pl index fcbd59b89b..c301b7f829 100755 --- a/build/make/ads2gas.pl +++ b/build/make/ads2gas.pl @@ -32,7 +32,7 @@ print "@ This file was created from a .asm file\n"; print "@ using the ads2gas.pl script.\n"; -print "\t.syntax unified\n"; +print ".syntax unified\n"; if ($thumb) { print "\t.thumb\n"; } @@ -60,19 +60,19 @@ } # Convert INCLUDE to .INCLUDE "file" - s/INCLUDE(\s*)(.*)$/.include $1\"$2\"/; + s/INCLUDE\s?(.*)$/.include \"$1\"/; # No AREA required # But ALIGNs in AREA must be obeyed - s/^\s*AREA.*ALIGN=([0-9])$/.text\n.p2align $1/; + s/^(\s*)\bAREA\b.*ALIGN=([0-9])$/$1.text\n$1.p2align $2/; # If no ALIGN, strip the AREA and align to 4 bytes - s/^\s*AREA.*$/.text\n.p2align 2/; + s/^(\s*)\bAREA\b.*$/$1.text\n$1.p2align 2/; # Make function visible to linker. if ($elf) { - s/EXPORT\s+\|([\$\w]*)\|/.global $1 \n\t.type $1, function/; + s/(\s*)EXPORT\s+\|([\$\w]*)\|/$1.global $2\n$1.type $2, function/; } else { - s/EXPORT\s+\|([\$\w]*)\|/.global $1/; + s/(\s*)EXPORT\s+\|([\$\w]*)\|/$1.global $2/; } # No vertical bars on function names @@ -85,11 +85,12 @@ s/\bALIGN\b/.balign/g; if ($thumb) { - # ARM code - we force everything to thumb with the declaration in the header - s/\s+ARM//g; + # ARM code - we force everything to thumb with the declaration in the + # header + s/\bARM\b//g; } else { # ARM code - s/\sARM/.arm/g; + s/\bARM\b/.arm/g; } # push/pop @@ -105,13 +106,13 @@ if ($elf) { # REQUIRE8 Stack is required to be 8-byte aligned - s/\sREQUIRE8/.eabi_attribute 24, 1 \@Tag_ABI_align_needed/g; + s/\bREQUIRE8\b/.eabi_attribute 24, 1 \@Tag_ABI_align_needed/g; # PRESERVE8 Stack 8-byte align is preserved - s/\sPRESERVE8/.eabi_attribute 25, 1 \@Tag_ABI_align_preserved/g; + s/\bPRESERVE8\b/.eabi_attribute 25, 1 \@Tag_ABI_align_preserved/g; } else { - s/\s+REQUIRE8//; - s/\s+PRESERVE8//; + s/\bREQUIRE8\b//; + s/\bPRESERVE8\b//; } # Use PROC and ENDP to give the symbols a .size directive. @@ -129,7 +130,7 @@ my $proc; s/\bENDP\b/@ $&/; $proc = pop(@proc_stack); - $_ = "\t.size $proc, .-$proc".$_ if ($proc and $elf); + $_ = ".size $proc, .-$proc".$_ if ($proc and $elf); } # EQU directive @@ -153,4 +154,4 @@ } # Mark that this object doesn't need an executable stack. -printf ("\t.section\t.note.GNU-stack,\"\",\%\%progbits\n") if $elf; +printf (" .section .note.GNU-stack,\"\",\%\%progbits\n") if $elf; diff --git a/build/make/ads2gas_apple.pl b/build/make/ads2gas_apple.pl index 34254f4abe..62491c1918 100755 --- a/build/make/ads2gas_apple.pl +++ b/build/make/ads2gas_apple.pl @@ -20,7 +20,7 @@ print "@ This file was created from a .asm file\n"; print "@ using the ads2gas_apple.pl script.\n\n"; -print "\t.syntax unified\n"; +print ".syntax unified\n"; my %macro_aliases; @@ -57,13 +57,13 @@ ($) } # Convert INCLUDE to .INCLUDE "file" - s/INCLUDE(\s*)(.*)$/.include $1\"$2\"/; + s/INCLUDE\s?(.*)$/.include \"$1\"/; # No AREA required # But ALIGNs in AREA must be obeyed - s/^\s*AREA.*ALIGN=([0-9])$/.text\n.p2align $1/; + s/^(\s*)\bAREA\b.*ALIGN=([0-9])$/$1.text\n$1.p2align $2/; # If no ALIGN, strip the AREA and align to 4 bytes - s/^\s*AREA.*$/.text\n.p2align 2/; + s/^(\s*)\bAREA\b.*$/$1.text\n$1.p2align 2/; # Make function visible to linker. s/EXPORT\s+\|([\$\w]*)\|/.globl _$1/; From 9c424b7556ee44df2335332e079c59f0a8d3559b Mon Sep 17 00:00:00 2001 From: James Zern Date: Sat, 26 Mar 2022 10:25:18 -0700 Subject: [PATCH 0301/1000] ads2armasm_ms.pl: fix thumb::FixThumbInstructions call broken since: 642529248 ads2gas[_apple].pl: remove unused stanzas Change-Id: I1eac77e2fe23cc3f162251e9e0102a4909f7b997 --- build/make/ads2armasm_ms.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build/make/ads2armasm_ms.pl b/build/make/ads2armasm_ms.pl index 2a2c470ff8..dd4e0318c4 100755 --- a/build/make/ads2armasm_ms.pl +++ b/build/make/ads2armasm_ms.pl @@ -28,7 +28,7 @@ s/qsubaddx/qsax/i; s/qaddsubx/qasx/i; - thumb::FixThumbInstructions($_, 1); + thumb::FixThumbInstructions($_); s/ldrneb/ldrbne/i; s/ldrneh/ldrhne/i; From d60b671a73a4c8ebc2324ccb248a713652d6506b Mon Sep 17 00:00:00 2001 From: Johann Date: Wed, 23 Mar 2022 14:28:29 +0900 Subject: [PATCH 0302/1000] gcc 11 warning: mismatched bound MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Clean up a new build warning with gcc11: argument 3 of type ‘const uint8_t * const[]’ with mismatched bound [-Warray-parameter=] Standardize sad functions with array sizes. Change-Id: Iea4144e61368f6a8279e2f3ae96c78aff06c8b41 --- vpx_dsp/arm/sad4d_neon.c | 122 +++++++++++++++++------------------ vpx_dsp/mips/sad_msa.c | 32 ++++----- vpx_dsp/sad.c | 48 +++++++++----- vpx_dsp/vpx_dsp_rtcd_defs.pl | 74 ++++++++++----------- vpx_dsp/x86/sad4d_avx2.c | 13 ++-- vpx_dsp/x86/sad4d_avx512.c | 6 +- 6 files changed, 155 insertions(+), 140 deletions(-) diff --git a/vpx_dsp/arm/sad4d_neon.c b/vpx_dsp/arm/sad4d_neon.c index 5c7a0fcaf0..03f716c3d5 100644 --- a/vpx_dsp/arm/sad4d_neon.c +++ b/vpx_dsp/arm/sad4d_neon.c @@ -31,7 +31,7 @@ static INLINE uint8x8_t load_unaligned_2_buffers(const void *const buf0, static INLINE void sad4x_4d(const uint8_t *const src_ptr, const int src_stride, const uint8_t *const ref_array[4], const int ref_stride, const int height, - uint32_t *const res) { + uint32_t sad_array[4]) { int i; uint16x8_t abs[2] = { vdupq_n_u16(0), vdupq_n_u16(0) }; #if !defined(__aarch64__) @@ -61,26 +61,26 @@ static INLINE void sad4x_4d(const uint8_t *const src_ptr, const int src_stride, a[1] = vpadd_u16(vget_low_u16(abs[1]), vget_high_u16(abs[1])); r = vpaddlq_u16(vcombine_u16(a[0], a[1])); #endif - vst1q_u32(res, r); + vst1q_u32(sad_array, r); } void vpx_sad4x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, - uint32_t *res) { - sad4x_4d(src_ptr, src_stride, ref_array, ref_stride, 4, res); + uint32_t sad_array[4]) { + sad4x_4d(src_ptr, src_stride, ref_array, ref_stride, 4, sad_array); } void vpx_sad4x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, - uint32_t *res) { - sad4x_4d(src_ptr, src_stride, ref_array, ref_stride, 8, res); + uint32_t sad_array[4]) { + sad4x_4d(src_ptr, src_stride, ref_array, ref_stride, 8, sad_array); } //////////////////////////////////////////////////////////////////////////////// // Can handle 512 pixels' sad sum (such as 16x32 or 32x16) -static INLINE void sad_512_pel_final_neon(const uint16x8_t *sum /*[4]*/, - uint32_t *const res) { +static INLINE void sad_512_pel_final_neon(const uint16x8_t sum[4], + uint32_t sad_array[4]) { #if defined(__aarch64__) const uint16x8_t a0 = vpaddq_u16(sum[0], sum[1]); const uint16x8_t a1 = vpaddq_u16(sum[2], sum[3]); @@ -95,21 +95,21 @@ static INLINE void sad_512_pel_final_neon(const uint16x8_t *sum /*[4]*/, const uint16x4_t b1 = vpadd_u16(a2, a3); const uint32x4_t r = vpaddlq_u16(vcombine_u16(b0, b1)); #endif - vst1q_u32(res, r); + vst1q_u32(sad_array, r); } #if defined(__arm__) || !defined(__ARM_FEATURE_DOTPROD) // Can handle 1024 pixels' sad sum (such as 32x32) -static INLINE void sad_1024_pel_final_neon(const uint16x8_t *sum /*[4]*/, - uint32_t *const res) { +static INLINE void sad_1024_pel_final_neon(const uint16x8_t sum[4], + uint32_t sad_array[4]) { #if defined(__aarch64__) const uint16x8_t a0 = vpaddq_u16(sum[0], sum[1]); const uint16x8_t a1 = vpaddq_u16(sum[2], sum[3]); const uint32x4_t b0 = vpaddlq_u16(a0); const uint32x4_t b1 = vpaddlq_u16(a1); const uint32x4_t r = vpaddq_u32(b0, b1); - vst1q_u32(res, r); + vst1q_u32(sad_array, r); #else const uint16x4_t a0 = vpadd_u16(vget_low_u16(sum[0]), vget_high_u16(sum[0])); const uint16x4_t a1 = vpadd_u16(vget_low_u16(sum[1]), vget_high_u16(sum[1])); @@ -119,13 +119,13 @@ static INLINE void sad_1024_pel_final_neon(const uint16x8_t *sum /*[4]*/, const uint32x4_t b1 = vpaddlq_u16(vcombine_u16(a2, a3)); const uint32x2_t c0 = vpadd_u32(vget_low_u32(b0), vget_high_u32(b0)); const uint32x2_t c1 = vpadd_u32(vget_low_u32(b1), vget_high_u32(b1)); - vst1q_u32(res, vcombine_u32(c0, c1)); + vst1q_u32(sad_array, vcombine_u32(c0, c1)); #endif } // Can handle 2048 pixels' sad sum (such as 32x64 or 64x32) -static INLINE void sad_2048_pel_final_neon(const uint16x8_t *sum /*[4]*/, - uint32_t *const res) { +static INLINE void sad_2048_pel_final_neon(const uint16x8_t sum[4], + uint32_t sad_array[4]) { #if defined(__aarch64__) const uint32x4_t a0 = vpaddlq_u16(sum[0]); const uint32x4_t a1 = vpaddlq_u16(sum[1]); @@ -134,7 +134,7 @@ static INLINE void sad_2048_pel_final_neon(const uint16x8_t *sum /*[4]*/, const uint32x4_t b0 = vpaddq_u32(a0, a1); const uint32x4_t b1 = vpaddq_u32(a2, a3); const uint32x4_t r = vpaddq_u32(b0, b1); - vst1q_u32(res, r); + vst1q_u32(sad_array, r); #else const uint32x4_t a0 = vpaddlq_u16(sum[0]); const uint32x4_t a1 = vpaddlq_u16(sum[1]); @@ -146,13 +146,13 @@ static INLINE void sad_2048_pel_final_neon(const uint16x8_t *sum /*[4]*/, const uint32x2_t b3 = vadd_u32(vget_low_u32(a3), vget_high_u32(a3)); const uint32x2_t c0 = vpadd_u32(b0, b1); const uint32x2_t c1 = vpadd_u32(b2, b3); - vst1q_u32(res, vcombine_u32(c0, c1)); + vst1q_u32(sad_array, vcombine_u32(c0, c1)); #endif } // Can handle 4096 pixels' sad sum (such as 64x64) -static INLINE void sad_4096_pel_final_neon(const uint16x8_t *sum /*[8]*/, - uint32_t *const res) { +static INLINE void sad_4096_pel_final_neon(const uint16x8_t sum[8], + uint32_t sad_array[4]) { #if defined(__aarch64__) const uint32x4_t a0 = vpaddlq_u16(sum[0]); const uint32x4_t a1 = vpaddlq_u16(sum[1]); @@ -169,7 +169,7 @@ static INLINE void sad_4096_pel_final_neon(const uint16x8_t *sum /*[8]*/, const uint32x4_t c0 = vpaddq_u32(b0, b1); const uint32x4_t c1 = vpaddq_u32(b2, b3); const uint32x4_t r = vpaddq_u32(c0, c1); - vst1q_u32(res, r); + vst1q_u32(sad_array, r); #else const uint32x4_t a0 = vpaddlq_u16(sum[0]); const uint32x4_t a1 = vpaddlq_u16(sum[1]); @@ -189,7 +189,7 @@ static INLINE void sad_4096_pel_final_neon(const uint16x8_t *sum /*[8]*/, const uint32x2_t c3 = vadd_u32(vget_low_u32(b3), vget_high_u32(b3)); const uint32x2_t d0 = vpadd_u32(c0, c1); const uint32x2_t d1 = vpadd_u32(c2, c3); - vst1q_u32(res, vcombine_u32(d0, d1)); + vst1q_u32(sad_array, vcombine_u32(d0, d1)); #endif } @@ -197,7 +197,7 @@ static INLINE void sad_4096_pel_final_neon(const uint16x8_t *sum /*[8]*/, static INLINE void sad8x_4d(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, - uint32_t *res, const int height) { + uint32_t sad_array[4], const int height) { int i, j; const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], ref_array[3] }; @@ -214,25 +214,25 @@ static INLINE void sad8x_4d(const uint8_t *src_ptr, int src_stride, } } - sad_512_pel_final_neon(sum, res); + sad_512_pel_final_neon(sum, sad_array); } void vpx_sad8x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, - uint32_t *res) { - sad8x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 4); + uint32_t sad_array[4]) { + sad8x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 4); } void vpx_sad8x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, - uint32_t *res) { - sad8x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 8); + uint32_t sad_array[4]) { + sad8x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 8); } void vpx_sad8x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, - uint32_t *res) { - sad8x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 16); + uint32_t sad_array[4]) { + sad8x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 16); } //////////////////////////////////////////////////////////////////////////////// @@ -249,7 +249,7 @@ static INLINE void sad16_neon(const uint8_t *ref_ptr, const uint8x16_t src_ptr, static INLINE void sad16x_4d(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, - uint32_t *res, const int height) { + uint32_t sad_array[4], const int height) { int i; uint32x4_t r0, r1; const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], @@ -267,7 +267,7 @@ static INLINE void sad16x_4d(const uint8_t *src_ptr, int src_stride, r0 = vpaddq_u32(sum[0], sum[1]); r1 = vpaddq_u32(sum[2], sum[3]); - vst1q_u32(res, vpaddq_u32(r0, r1)); + vst1q_u32(sad_array, vpaddq_u32(r0, r1)); } #else @@ -281,7 +281,7 @@ static INLINE void sad16_neon(const uint8_t *ref_ptr, const uint8x16_t src_ptr, static INLINE void sad16x_4d(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, - uint32_t *res, const int height) { + uint32_t sad_array[4], const int height) { int i; const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], ref_array[3] }; @@ -302,27 +302,27 @@ static INLINE void sad16x_4d(const uint8_t *src_ptr, int src_stride, ref_loop[3] += ref_stride; } - sad_512_pel_final_neon(sum, res); + sad_512_pel_final_neon(sum, sad_array); } #endif void vpx_sad16x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, - uint32_t *res) { - sad16x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 8); + uint32_t sad_array[4]) { + sad16x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 8); } void vpx_sad16x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, - uint32_t *res) { - sad16x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 16); + uint32_t sad_array[4]) { + sad16x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 16); } void vpx_sad16x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, - uint32_t *res) { - sad16x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 32); + uint32_t sad_array[4]) { + sad16x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 32); } //////////////////////////////////////////////////////////////////////////////// @@ -332,7 +332,7 @@ void vpx_sad16x32x4d_neon(const uint8_t *src_ptr, int src_stride, static INLINE void sad32x_4d(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, - uint32_t *res, const int height) { + uint32_t sad_array[4], const int height) { int i; uint32x4_t r0, r1; const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], @@ -365,25 +365,25 @@ static INLINE void sad32x_4d(const uint8_t *src_ptr, int src_stride, r0 = vpaddq_u32(sum[0], sum[1]); r1 = vpaddq_u32(sum[2], sum[3]); - vst1q_u32(res, vpaddq_u32(r0, r1)); + vst1q_u32(sad_array, vpaddq_u32(r0, r1)); } void vpx_sad32x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, - uint32_t *res) { - sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 16); + uint32_t sad_array[4]) { + sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 16); } void vpx_sad32x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, - uint32_t *res) { - sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 32); + uint32_t sad_array[4]) { + sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 32); } void vpx_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, - uint32_t *res) { - sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 64); + uint32_t sad_array[4]) { + sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 64); } #else @@ -422,26 +422,26 @@ static INLINE void sad32x_4d(const uint8_t *src_ptr, int src_stride, void vpx_sad32x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, - uint32_t *res) { + uint32_t sad_array[4]) { uint16x8_t sum[4]; sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, 16, sum); - sad_512_pel_final_neon(sum, res); + sad_512_pel_final_neon(sum, sad_array); } void vpx_sad32x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, - uint32_t *res) { + uint32_t sad_array[4]) { uint16x8_t sum[4]; sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, 32, sum); - sad_1024_pel_final_neon(sum, res); + sad_1024_pel_final_neon(sum, sad_array); } void vpx_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, - uint32_t *res) { + uint32_t sad_array[4]) { uint16x8_t sum[4]; sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, 64, sum); - sad_2048_pel_final_neon(sum, res); + sad_2048_pel_final_neon(sum, sad_array); } #endif @@ -453,7 +453,7 @@ void vpx_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride, void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, - uint32_t *res) { + uint32_t sad_array[4]) { int i; uint32x4_t r0, r1; const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], @@ -497,12 +497,12 @@ void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride, r0 = vpaddq_u32(sum[0], sum[1]); r1 = vpaddq_u32(sum[2], sum[3]); - vst1q_u32(res, vpaddq_u32(r0, r1)); + vst1q_u32(sad_array, vpaddq_u32(r0, r1)); } void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, - uint32_t *res) { + uint32_t sad_array[4]) { int i; uint32x4_t r0, r1, r2, r3; const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], @@ -551,14 +551,14 @@ void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride, r3 = vpaddq_u32(sum[6], sum[7]); r0 = vpaddq_u32(r0, r1); r1 = vpaddq_u32(r2, r3); - vst1q_u32(res, vpaddq_u32(r0, r1)); + vst1q_u32(sad_array, vpaddq_u32(r0, r1)); } #else void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, - uint32_t *res) { + uint32_t sad_array[4]) { int i; const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], ref_array[3] }; @@ -599,12 +599,12 @@ void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride, ref_loop[3] += ref_stride; } - sad_2048_pel_final_neon(sum, res); + sad_2048_pel_final_neon(sum, sad_array); } void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, - uint32_t *res) { + uint32_t sad_array[4]) { int i; const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], ref_array[3] }; @@ -646,7 +646,7 @@ void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride, ref_loop[3] += ref_stride; } - sad_4096_pel_final_neon(sum, res); + sad_4096_pel_final_neon(sum, sad_array); } #endif diff --git a/vpx_dsp/mips/sad_msa.c b/vpx_dsp/mips/sad_msa.c index ab681ae9f8..e3e91c4330 100644 --- a/vpx_dsp/mips/sad_msa.c +++ b/vpx_dsp/mips/sad_msa.c @@ -1040,77 +1040,77 @@ static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride, #define VPX_SAD_4xHEIGHTx3_MSA(height) \ void vpx_sad4x##height##x3_msa(const uint8_t *src, int32_t src_stride, \ const uint8_t *ref, int32_t ref_stride, \ - uint32_t *sads) { \ + uint32_t sads[3]) { \ sad_4width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \ } #define VPX_SAD_8xHEIGHTx3_MSA(height) \ void vpx_sad8x##height##x3_msa(const uint8_t *src, int32_t src_stride, \ const uint8_t *ref, int32_t ref_stride, \ - uint32_t *sads) { \ + uint32_t sads[3]) { \ sad_8width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \ } #define VPX_SAD_16xHEIGHTx3_MSA(height) \ void vpx_sad16x##height##x3_msa(const uint8_t *src, int32_t src_stride, \ const uint8_t *ref, int32_t ref_stride, \ - uint32_t *sads) { \ + uint32_t sads[3]) { \ sad_16width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \ } #define VPX_SAD_4xHEIGHTx8_MSA(height) \ void vpx_sad4x##height##x8_msa(const uint8_t *src, int32_t src_stride, \ const uint8_t *ref, int32_t ref_stride, \ - uint32_t *sads) { \ + uint32_t sads[8]) { \ sad_4width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \ } #define VPX_SAD_8xHEIGHTx8_MSA(height) \ void vpx_sad8x##height##x8_msa(const uint8_t *src, int32_t src_stride, \ const uint8_t *ref, int32_t ref_stride, \ - uint32_t *sads) { \ + uint32_t sads[8]) { \ sad_8width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \ } #define VPX_SAD_16xHEIGHTx8_MSA(height) \ void vpx_sad16x##height##x8_msa(const uint8_t *src, int32_t src_stride, \ const uint8_t *ref, int32_t ref_stride, \ - uint32_t *sads) { \ + uint32_t sads[8]) { \ sad_16width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \ } #define VPX_SAD_4xHEIGHTx4D_MSA(height) \ void vpx_sad4x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *const refs[], \ - int32_t ref_stride, uint32_t *sads) { \ + const uint8_t *const refs[4], \ + int32_t ref_stride, uint32_t sads[4]) { \ sad_4width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \ } #define VPX_SAD_8xHEIGHTx4D_MSA(height) \ void vpx_sad8x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *const refs[], \ - int32_t ref_stride, uint32_t *sads) { \ + const uint8_t *const refs[4], \ + int32_t ref_stride, uint32_t sads[4]) { \ sad_8width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \ } #define VPX_SAD_16xHEIGHTx4D_MSA(height) \ void vpx_sad16x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *const refs[], \ - int32_t ref_stride, uint32_t *sads) { \ + const uint8_t *const refs[4], \ + int32_t ref_stride, uint32_t sads[4]) { \ sad_16width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \ } #define VPX_SAD_32xHEIGHTx4D_MSA(height) \ void vpx_sad32x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *const refs[], \ - int32_t ref_stride, uint32_t *sads) { \ + const uint8_t *const refs[4], \ + int32_t ref_stride, uint32_t sads[4]) { \ sad_32width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \ } #define VPX_SAD_64xHEIGHTx4D_MSA(height) \ void vpx_sad64x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *const refs[], \ - int32_t ref_stride, uint32_t *sads) { \ + const uint8_t *const refs[4], \ + int32_t ref_stride, uint32_t sads[4]) { \ sad_64width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \ } diff --git a/vpx_dsp/sad.c b/vpx_dsp/sad.c index 769322019e..46d513b686 100644 --- a/vpx_dsp/sad.c +++ b/vpx_dsp/sad.c @@ -45,23 +45,39 @@ static INLINE unsigned int sad(const uint8_t *src_ptr, int src_stride, return sad(src_ptr, src_stride, comp_pred, m, m, n); \ } -// depending on call sites, pass **ref_array to avoid & in subsequent call and -// de-dup with 4D below. +// Compare |src_ptr| to |k| adjacent blocks starting at |ref_ptr|. +// |k| == {3,8}. Used in vp8 for an exhaustive search. +// src: ref: +// 0 1 2 3 0 1 2 3 x x +// 4 5 6 7 6 7 8 9 x x +// 8 9 10 11 12 13 14 15 x x +// 12 13 14 15 18 19 20 21 x x +// +// x 1 2 3 4 x +// x 7 8 9 10 x +// x 13 14 15 16 x +// x 19 20 21 22 x +// +// x x 2 3 4 5 +// x x 8 9 10 11 +// x x 14 15 16 17 +// x x 20 21 22 23 +// #define sadMxNxK(m, n, k) \ void vpx_sad##m##x##n##x##k##_c(const uint8_t *src_ptr, int src_stride, \ const uint8_t *ref_ptr, int ref_stride, \ - uint32_t *sad_array) { \ + uint32_t sad_array[k]) { \ int i; \ for (i = 0; i < k; ++i) \ sad_array[i] = \ - vpx_sad##m##x##n##_c(src_ptr, src_stride, &ref_ptr[i], ref_stride); \ + vpx_sad##m##x##n##_c(src_ptr, src_stride, ref_ptr + i, ref_stride); \ } -// This appears to be equivalent to the above when k == 4 and refs is const +// Compare |src_ptr| to 4 distinct references in |ref_array[]| #define sadMxNx4D(m, n) \ void vpx_sad##m##x##n##x4d_c(const uint8_t *src_ptr, int src_stride, \ - const uint8_t *const ref_array[], \ - int ref_stride, uint32_t *sad_array) { \ + const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ int i; \ for (i = 0; i < 4; ++i) \ sad_array[i] = \ @@ -181,15 +197,15 @@ static INLINE unsigned int highbd_sadb(const uint8_t *src8_ptr, int src_stride, return highbd_sadb(src_ptr, src_stride, comp_pred, m, m, n); \ } -#define highbd_sadMxNx4D(m, n) \ - void vpx_highbd_sad##m##x##n##x4d_c(const uint8_t *src_ptr, int src_stride, \ - const uint8_t *const ref_array[], \ - int ref_stride, uint32_t *sad_array) { \ - int i; \ - for (i = 0; i < 4; ++i) { \ - sad_array[i] = vpx_highbd_sad##m##x##n##_c(src_ptr, src_stride, \ - ref_array[i], ref_stride); \ - } \ +#define highbd_sadMxNx4D(m, n) \ + void vpx_highbd_sad##m##x##n##x4d_c(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + int i; \ + for (i = 0; i < 4; ++i) { \ + sad_array[i] = vpx_highbd_sad##m##x##n##_c(src_ptr, src_stride, \ + ref_array[i], ref_stride); \ + } \ } /* clang-format off */ diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 8b66722481..06a8febb29 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -877,80 +877,80 @@ () # Multi-block SAD, comparing a reference to N blocks 1 pixel apart horizontally # # Blocks of 3 -add_proto qw/void vpx_sad16x16x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad16x16x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t sad_array[3]"; specialize qw/vpx_sad16x16x3 sse3 ssse3 msa mmi/; -add_proto qw/void vpx_sad16x8x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad16x8x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t sad_array[3]"; specialize qw/vpx_sad16x8x3 sse3 ssse3 msa mmi/; -add_proto qw/void vpx_sad8x16x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad8x16x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t sad_array[3]"; specialize qw/vpx_sad8x16x3 sse3 msa mmi/; -add_proto qw/void vpx_sad8x8x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad8x8x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t sad_array[3]"; specialize qw/vpx_sad8x8x3 sse3 msa mmi/; -add_proto qw/void vpx_sad4x4x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad4x4x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t sad_array[3]"; specialize qw/vpx_sad4x4x3 sse3 msa mmi/; # Blocks of 8 -add_proto qw/void vpx_sad32x32x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad32x32x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t sad_array[8]"; specialize qw/vpx_sad32x32x8 avx2/; -add_proto qw/void vpx_sad16x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad16x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t sad_array[8]"; specialize qw/vpx_sad16x16x8 sse4_1 msa mmi/; -add_proto qw/void vpx_sad16x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad16x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t sad_array[8]"; specialize qw/vpx_sad16x8x8 sse4_1 msa mmi/; -add_proto qw/void vpx_sad8x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad8x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t sad_array[8]"; specialize qw/vpx_sad8x16x8 sse4_1 msa mmi/; -add_proto qw/void vpx_sad8x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad8x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t sad_array[8]"; specialize qw/vpx_sad8x8x8 sse4_1 msa mmi/; -add_proto qw/void vpx_sad4x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad4x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t sad_array[8]"; specialize qw/vpx_sad4x4x8 sse4_1 msa mmi/; # # Multi-block SAD, comparing a reference to N independent blocks # -add_proto qw/void vpx_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad64x64x4d avx512 avx2 neon msa sse2 vsx mmi/; -add_proto qw/void vpx_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad64x32x4d neon msa sse2 vsx mmi/; -add_proto qw/void vpx_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad32x64x4d neon msa sse2 vsx mmi/; -add_proto qw/void vpx_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad32x32x4d avx2 neon msa sse2 vsx mmi/; -add_proto qw/void vpx_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad32x16x4d neon msa sse2 vsx mmi/; -add_proto qw/void vpx_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad16x32x4d neon msa sse2 vsx mmi/; -add_proto qw/void vpx_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad16x16x4d neon msa sse2 vsx mmi/; -add_proto qw/void vpx_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad16x8x4d neon msa sse2 vsx mmi/; -add_proto qw/void vpx_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad8x16x4d neon msa sse2 mmi/; -add_proto qw/void vpx_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad8x8x4d neon msa sse2 mmi/; -add_proto qw/void vpx_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad8x4x4d neon msa sse2 mmi/; -add_proto qw/void vpx_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad4x8x4d neon msa sse2 mmi/; -add_proto qw/void vpx_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad4x4x4d neon msa sse2 mmi/; add_proto qw/uint64_t vpx_sum_squares_2d_i16/, "const int16_t *src, int stride, int size"; @@ -1064,43 +1064,43 @@ () # # Multi-block SAD, comparing a reference to N independent blocks # - add_proto qw/void vpx_highbd_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array"; + add_proto qw/void vpx_highbd_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad64x64x4d sse2/; - add_proto qw/void vpx_highbd_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array"; + add_proto qw/void vpx_highbd_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad64x32x4d sse2/; - add_proto qw/void vpx_highbd_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array"; + add_proto qw/void vpx_highbd_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad32x64x4d sse2/; - add_proto qw/void vpx_highbd_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array"; + add_proto qw/void vpx_highbd_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad32x32x4d sse2/; - add_proto qw/void vpx_highbd_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array"; + add_proto qw/void vpx_highbd_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad32x16x4d sse2/; - add_proto qw/void vpx_highbd_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array"; + add_proto qw/void vpx_highbd_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad16x32x4d sse2/; - add_proto qw/void vpx_highbd_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array"; + add_proto qw/void vpx_highbd_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad16x16x4d sse2/; - add_proto qw/void vpx_highbd_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array"; + add_proto qw/void vpx_highbd_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad16x8x4d sse2/; - add_proto qw/void vpx_highbd_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array"; + add_proto qw/void vpx_highbd_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad8x16x4d sse2/; - add_proto qw/void vpx_highbd_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array"; + add_proto qw/void vpx_highbd_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad8x8x4d sse2/; - add_proto qw/void vpx_highbd_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array"; + add_proto qw/void vpx_highbd_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad8x4x4d sse2/; - add_proto qw/void vpx_highbd_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array"; + add_proto qw/void vpx_highbd_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad4x8x4d sse2/; - add_proto qw/void vpx_highbd_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array"; + add_proto qw/void vpx_highbd_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad4x4x4d sse2/; # diff --git a/vpx_dsp/x86/sad4d_avx2.c b/vpx_dsp/x86/sad4d_avx2.c index 9dd0666918..5f1f757e25 100644 --- a/vpx_dsp/x86/sad4d_avx2.c +++ b/vpx_dsp/x86/sad4d_avx2.c @@ -11,8 +11,7 @@ #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" -static INLINE void calc_final_4(const __m256i *const sums /*[4]*/, - uint32_t *sad_array) { +static INLINE void calc_final_4(const __m256i sums[4], uint32_t sad_array[4]) { const __m256i t0 = _mm256_hadd_epi32(sums[0], sums[1]); const __m256i t1 = _mm256_hadd_epi32(sums[2], sums[3]); const __m256i t2 = _mm256_hadd_epi32(t0, t1); @@ -22,8 +21,8 @@ static INLINE void calc_final_4(const __m256i *const sums /*[4]*/, } void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[/*4*/], int ref_stride, - uint32_t *sad_array /*[4]*/) { + const uint8_t *const ref_array[4], int ref_stride, + uint32_t sad_array[4]) { int i; const uint8_t *refs[4]; __m256i sums[4]; @@ -71,7 +70,7 @@ void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, void vpx_sad32x32x8_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, - uint32_t *sad_array) { + uint32_t sad_array[8]) { int i; __m256i sums[8]; @@ -127,8 +126,8 @@ void vpx_sad32x32x8_avx2(const uint8_t *src_ptr, int src_stride, } void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[/*4*/], int ref_stride, - uint32_t *sad_array /*[4]*/) { + const uint8_t *const ref_array[4], int ref_stride, + uint32_t sad_array[4]) { __m256i sums[4]; int i; const uint8_t *refs[4]; diff --git a/vpx_dsp/x86/sad4d_avx512.c b/vpx_dsp/x86/sad4d_avx512.c index 2fa9108718..cfd23fedd9 100644 --- a/vpx_dsp/x86/sad4d_avx512.c +++ b/vpx_dsp/x86/sad4d_avx512.c @@ -12,8 +12,8 @@ #include "vpx/vpx_integer.h" void vpx_sad64x64x4d_avx512(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[/*4*/], - int ref_stride, uint32_t *res /*[4]*/) { + const uint8_t *const ref_array[4], int ref_stride, + uint32_t sad_array[4]) { __m512i src_reg, ref0_reg, ref1_reg, ref2_reg, ref3_reg; __m512i sum_ref0, sum_ref1, sum_ref2, sum_ref3; __m512i sum_mlow, sum_mhigh; @@ -78,6 +78,6 @@ void vpx_sad64x64x4d_avx512(const uint8_t *src_ptr, int src_stride, sum128 = _mm_add_epi32(_mm256_castsi256_si128(sum256), _mm256_extractf128_si256(sum256, 1)); - _mm_storeu_si128((__m128i *)(res), sum128); + _mm_storeu_si128((__m128i *)(sad_array), sum128); } } From afd60bd07d41e5d20a0b11eeeb104846d9517c65 Mon Sep 17 00:00:00 2001 From: Johann Date: Mon, 28 Mar 2022 15:27:46 +0900 Subject: [PATCH 0303/1000] remove sad x3,x8 specializations These would compute the sum of absolute differences (sad) for a group of 3 or 8 references. This was used as part of an exhaustive search. vp8 only uses these functions in speed 0 and best quality. For vp9 this is only used with the --enable-non-greedy-mv experiment. This removes the 3- and 8-at-a-time optimized functions and uses the fall back code which will process 1 or 4 (vpx_sadMxNx4d) at a time. For configure --target=x86_64-linux-gcc --enable-realtime-only: libvpx.a before: 3002424 after: 2937622 delta: 64802 after 'strip libvpx.a' before: 2116998 after: 2073090 delta: 43908 Change-Id: I566d06e027c327b3bede68649dd551bba81a848e --- test/sad_test.cc | 68 ------ vp8/common/rtcd_defs.pl | 5 - vp8/encoder/mcomp.c | 276 +---------------------- vp8/encoder/mcomp.h | 8 +- vp8/encoder/onyx_if.c | 11 - vp8/encoder/onyx_int.h | 1 - vp8/encoder/rdopt.c | 4 +- vp9/encoder/vp9_encoder.c | 50 ++-- vp9/encoder/vp9_mcomp.c | 23 -- vp9/encoder/vp9_mcomp.h | 10 - vpx_dsp/mips/sad_mmi.c | 23 -- vpx_dsp/mips/sad_msa.c | 426 ----------------------------------- vpx_dsp/sad.c | 41 +--- vpx_dsp/variance.h | 3 - vpx_dsp/vpx_dsp.mk | 3 - vpx_dsp/vpx_dsp_rtcd_defs.pl | 38 ---- vpx_dsp/x86/sad4d_avx2.c | 57 ----- vpx_dsp/x86/sad_sse3.asm | 376 ------------------------------- vpx_dsp/x86/sad_sse4.asm | 361 ----------------------------- vpx_dsp/x86/sad_ssse3.asm | 372 ------------------------------ 20 files changed, 33 insertions(+), 2123 deletions(-) delete mode 100644 vpx_dsp/x86/sad_sse3.asm delete mode 100644 vpx_dsp/x86/sad_sse4.asm delete mode 100644 vpx_dsp/x86/sad_ssse3.asm diff --git a/test/sad_test.cc b/test/sad_test.cc index ee10a46389..560c5f3823 100644 --- a/test/sad_test.cc +++ b/test/sad_test.cc @@ -56,8 +56,6 @@ typedef void (*SadMxNx8Func)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); -typedef TestParams SadMxNx8Param; - using libvpx_test::ACMRandom; namespace { @@ -266,30 +264,6 @@ class SADTestBase : public ::testing::TestWithParam { ParamType params_; }; -class SADx8Test : public SADTestBase { - public: - SADx8Test() : SADTestBase(GetParam()) {} - - protected: - void SADs(unsigned int *results) const { - const uint8_t *reference = GetReferenceFromOffset(0); - - ASM_REGISTER_STATE_CHECK(params_.func( - source_data_, source_stride_, reference, reference_stride_, results)); - } - - void CheckSADs() const { - uint32_t reference_sad; - DECLARE_ALIGNED(kDataAlignment, uint32_t, exp_sad[8]); - - SADs(exp_sad); - for (int offset = 0; offset < 8; ++offset) { - reference_sad = ReferenceSAD(offset); - EXPECT_EQ(reference_sad, exp_sad[offset]) << "offset " << offset; - } - } -}; - class SADx4Test : public SADTestBase { public: SADx4Test() : SADTestBase(GetParam()) {} @@ -564,13 +538,6 @@ TEST_P(SADx4Test, DISABLED_Speed) { reference_stride_ = tmp_stride; } -TEST_P(SADx8Test, Regular) { - FillRandomWH(source_data_, source_stride_, params_.width, params_.height); - FillRandomWH(GetReferenceFromOffset(0), reference_stride_, params_.width + 8, - params_.height); - CheckSADs(); -} - //------------------------------------------------------------------------------ // C functions const SadMxNParam c_tests[] = { @@ -747,24 +714,6 @@ const SadMxNx4Param x4d_c_tests[] = { }; INSTANTIATE_TEST_SUITE_P(C, SADx4Test, ::testing::ValuesIn(x4d_c_tests)); -// TODO(angiebird): implement the marked-down sad functions -const SadMxNx8Param x8_c_tests[] = { - // SadMxNx8Param(64, 64, &vpx_sad64x64x8_c), - // SadMxNx8Param(64, 32, &vpx_sad64x32x8_c), - // SadMxNx8Param(32, 64, &vpx_sad32x64x8_c), - SadMxNx8Param(32, 32, &vpx_sad32x32x8_c), - // SadMxNx8Param(32, 16, &vpx_sad32x16x8_c), - // SadMxNx8Param(16, 32, &vpx_sad16x32x8_c), - SadMxNx8Param(16, 16, &vpx_sad16x16x8_c), - SadMxNx8Param(16, 8, &vpx_sad16x8x8_c), - SadMxNx8Param(8, 16, &vpx_sad8x16x8_c), - SadMxNx8Param(8, 8, &vpx_sad8x8x8_c), - // SadMxNx8Param(8, 4, &vpx_sad8x4x8_c), - // SadMxNx8Param(4, 8, &vpx_sad4x8x8_c), - SadMxNx8Param(4, 4, &vpx_sad4x4x8_c), -}; -INSTANTIATE_TEST_SUITE_P(C, SADx8Test, ::testing::ValuesIn(x8_c_tests)); - //------------------------------------------------------------------------------ // ARM functions #if HAVE_NEON @@ -992,18 +941,6 @@ INSTANTIATE_TEST_SUITE_P(SSE2, SADx4Test, ::testing::ValuesIn(x4d_sse2_tests)); // Only functions are x3, which do not have tests. #endif // HAVE_SSSE3 -#if HAVE_SSE4_1 -const SadMxNx8Param x8_sse4_1_tests[] = { - SadMxNx8Param(16, 16, &vpx_sad16x16x8_sse4_1), - SadMxNx8Param(16, 8, &vpx_sad16x8x8_sse4_1), - SadMxNx8Param(8, 16, &vpx_sad8x16x8_sse4_1), - SadMxNx8Param(8, 8, &vpx_sad8x8x8_sse4_1), - SadMxNx8Param(4, 4, &vpx_sad4x4x8_sse4_1), -}; -INSTANTIATE_TEST_SUITE_P(SSE4_1, SADx8Test, - ::testing::ValuesIn(x8_sse4_1_tests)); -#endif // HAVE_SSE4_1 - #if HAVE_AVX2 const SadMxNParam avx2_tests[] = { SadMxNParam(64, 64, &vpx_sad64x64_avx2), @@ -1029,11 +966,6 @@ const SadMxNx4Param x4d_avx2_tests[] = { }; INSTANTIATE_TEST_SUITE_P(AVX2, SADx4Test, ::testing::ValuesIn(x4d_avx2_tests)); -const SadMxNx8Param x8_avx2_tests[] = { - // SadMxNx8Param(64, 64, &vpx_sad64x64x8_c), - SadMxNx8Param(32, 32, &vpx_sad32x32x8_avx2), -}; -INSTANTIATE_TEST_SUITE_P(AVX2, SADx8Test, ::testing::ValuesIn(x8_avx2_tests)); #endif // HAVE_AVX2 #if HAVE_AVX512 diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl index 32601b4eba..c7911032f6 100644 --- a/vp8/common/rtcd_defs.pl +++ b/vp8/common/rtcd_defs.pl @@ -216,11 +216,6 @@ () # # Motion search # -add_proto qw/int vp8_full_search_sad/, "struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv"; -specialize qw/vp8_full_search_sad sse3 sse4_1/; -$vp8_full_search_sad_sse3=vp8_full_search_sadx3; -$vp8_full_search_sad_sse4_1=vp8_full_search_sadx8; - add_proto qw/int vp8_refining_search_sad/, "struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int error_per_bit, int search_range, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv"; specialize qw/vp8_refining_search_sad sse2 msa/; $vp8_refining_search_sad_sse2=vp8_refining_search_sadx4; diff --git a/vp8/encoder/mcomp.c b/vp8/encoder/mcomp.c index 4ab6c7b3d0..769c2f5589 100644 --- a/vp8/encoder/mcomp.c +++ b/vp8/encoder/mcomp.c @@ -1280,10 +1280,10 @@ int vp8_diamond_search_sadx4(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, } #endif // HAVE_SSE2 || HAVE_MSA -int vp8_full_search_sad_c(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, - int sad_per_bit, int distance, - vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2], - int_mv *center_mv) { +int vp8_full_search_sad(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, + int sad_per_bit, int distance, + vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2], + int_mv *center_mv) { unsigned char *what = (*(b->base_src) + b->src); int what_stride = b->src_stride; unsigned char *in_what; @@ -1323,217 +1323,6 @@ int vp8_full_search_sad_c(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, best_mv->as_mv.row = ref_row; best_mv->as_mv.col = ref_col; - /* Baseline value at the centre */ - bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride) + - mvsad_err_cost(best_mv, &fcenter_mv, mvsadcost, sad_per_bit); - - /* Apply further limits to prevent us looking using vectors that - * stretch beyiond the UMV border - */ - if (col_min < x->mv_col_min) col_min = x->mv_col_min; - - if (col_max > x->mv_col_max) col_max = x->mv_col_max; - - if (row_min < x->mv_row_min) row_min = x->mv_row_min; - - if (row_max > x->mv_row_max) row_max = x->mv_row_max; - - for (r = row_min; r < row_max; ++r) { - this_mv.as_mv.row = r; - check_here = r * mv_stride + in_what + col_min; - - for (c = col_min; c < col_max; ++c) { - thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride); - - this_mv.as_mv.col = c; - thissad += mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, sad_per_bit); - - if (thissad < bestsad) { - bestsad = thissad; - best_mv->as_mv.row = r; - best_mv->as_mv.col = c; - bestaddress = check_here; - } - - check_here++; - } - } - - this_mv.as_mv.row = best_mv->as_mv.row << 3; - this_mv.as_mv.col = best_mv->as_mv.col << 3; - - return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, &thissad) + - mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit); -} - -#if HAVE_SSSE3 -int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, - int sad_per_bit, int distance, - vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2], - int_mv *center_mv) { - unsigned char *what = (*(b->base_src) + b->src); - int what_stride = b->src_stride; - unsigned char *in_what; - int pre_stride = x->e_mbd.pre.y_stride; - unsigned char *base_pre = x->e_mbd.pre.y_buffer; - int in_what_stride = pre_stride; - int mv_stride = pre_stride; - unsigned char *bestaddress; - int_mv *best_mv = &d->bmi.mv; - int_mv this_mv; - unsigned int bestsad; - unsigned int thissad; - int r, c; - - unsigned char *check_here; - - int ref_row = ref_mv->as_mv.row; - int ref_col = ref_mv->as_mv.col; - - int row_min = ref_row - distance; - int row_max = ref_row + distance; - int col_min = ref_col - distance; - int col_max = ref_col + distance; - - unsigned int sad_array[3]; - - int *mvsadcost[2]; - int_mv fcenter_mv; - - mvsadcost[0] = x->mvsadcost[0]; - mvsadcost[1] = x->mvsadcost[1]; - fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3; - fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3; - - /* Work out the mid point for the search */ - in_what = base_pre + d->offset; - bestaddress = in_what + (ref_row * pre_stride) + ref_col; - - best_mv->as_mv.row = ref_row; - best_mv->as_mv.col = ref_col; - - /* Baseline value at the centre */ - bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride) + - mvsad_err_cost(best_mv, &fcenter_mv, mvsadcost, sad_per_bit); - - /* Apply further limits to prevent us looking using vectors that stretch - * beyond the UMV border - */ - if (col_min < x->mv_col_min) col_min = x->mv_col_min; - - if (col_max > x->mv_col_max) col_max = x->mv_col_max; - - if (row_min < x->mv_row_min) row_min = x->mv_row_min; - - if (row_max > x->mv_row_max) row_max = x->mv_row_max; - - for (r = row_min; r < row_max; ++r) { - this_mv.as_mv.row = r; - check_here = r * mv_stride + in_what + col_min; - c = col_min; - - while ((c + 2) < col_max) { - int i; - - fn_ptr->sdx3f(what, what_stride, check_here, in_what_stride, sad_array); - - for (i = 0; i < 3; ++i) { - thissad = sad_array[i]; - - if (thissad < bestsad) { - this_mv.as_mv.col = c; - thissad += - mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, sad_per_bit); - - if (thissad < bestsad) { - bestsad = thissad; - best_mv->as_mv.row = r; - best_mv->as_mv.col = c; - bestaddress = check_here; - } - } - - check_here++; - c++; - } - } - - while (c < col_max) { - thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride); - - if (thissad < bestsad) { - this_mv.as_mv.col = c; - thissad += - mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, sad_per_bit); - - if (thissad < bestsad) { - bestsad = thissad; - best_mv->as_mv.row = r; - best_mv->as_mv.col = c; - bestaddress = check_here; - } - } - - check_here++; - c++; - } - } - - this_mv.as_mv.row = best_mv->as_mv.row << 3; - this_mv.as_mv.col = best_mv->as_mv.col << 3; - - return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, &thissad) + - mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit); -} -#endif // HAVE_SSSE3 - -#if HAVE_SSE4_1 -int vp8_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, - int sad_per_bit, int distance, - vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2], - int_mv *center_mv) { - unsigned char *what = (*(b->base_src) + b->src); - int what_stride = b->src_stride; - int pre_stride = x->e_mbd.pre.y_stride; - unsigned char *base_pre = x->e_mbd.pre.y_buffer; - unsigned char *in_what; - int in_what_stride = pre_stride; - int mv_stride = pre_stride; - unsigned char *bestaddress; - int_mv *best_mv = &d->bmi.mv; - int_mv this_mv; - unsigned int bestsad; - unsigned int thissad; - int r, c; - - unsigned char *check_here; - - int ref_row = ref_mv->as_mv.row; - int ref_col = ref_mv->as_mv.col; - - int row_min = ref_row - distance; - int row_max = ref_row + distance; - int col_min = ref_col - distance; - int col_max = ref_col + distance; - - DECLARE_ALIGNED(16, unsigned int, sad_array8[8]); - unsigned int sad_array[3]; - - int *mvsadcost[2]; - int_mv fcenter_mv; - - mvsadcost[0] = x->mvsadcost[0]; - mvsadcost[1] = x->mvsadcost[1]; - fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3; - fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3; - - /* Work out the mid point for the search */ - in_what = base_pre + d->offset; - bestaddress = in_what + (ref_row * pre_stride) + ref_col; - - best_mv->as_mv.row = ref_row; - best_mv->as_mv.col = ref_col; - /* Baseline value at the centre */ bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride) + mvsad_err_cost(best_mv, &fcenter_mv, mvsadcost, sad_per_bit); @@ -1552,61 +1341,8 @@ int vp8_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, for (r = row_min; r < row_max; ++r) { this_mv.as_mv.row = r; check_here = r * mv_stride + in_what + col_min; - c = col_min; - - while ((c + 7) < col_max) { - int i; - - fn_ptr->sdx8f(what, what_stride, check_here, in_what_stride, sad_array8); - - for (i = 0; i < 8; ++i) { - thissad = sad_array8[i]; - - if (thissad < bestsad) { - this_mv.as_mv.col = c; - thissad += - mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, sad_per_bit); - - if (thissad < bestsad) { - bestsad = thissad; - best_mv->as_mv.row = r; - best_mv->as_mv.col = c; - bestaddress = check_here; - } - } - - check_here++; - c++; - } - } - - while ((c + 2) < col_max) { - int i; - - fn_ptr->sdx3f(what, what_stride, check_here, in_what_stride, sad_array); - - for (i = 0; i < 3; ++i) { - thissad = sad_array[i]; - if (thissad < bestsad) { - this_mv.as_mv.col = c; - thissad += - mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, sad_per_bit); - - if (thissad < bestsad) { - bestsad = thissad; - best_mv->as_mv.row = r; - best_mv->as_mv.col = c; - bestaddress = check_here; - } - } - - check_here++; - c++; - } - } - - while (c < col_max) { + for (c = col_min; c < col_max; ++c) { thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride); if (thissad < bestsad) { @@ -1623,7 +1359,6 @@ int vp8_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, } check_here++; - c++; } } @@ -1633,7 +1368,6 @@ int vp8_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, &thissad) + mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit); } -#endif // HAVE_SSE4_1 int vp8_refining_search_sad_c(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, int error_per_bit, diff --git a/vp8/encoder/mcomp.h b/vp8/encoder/mcomp.h index 57c18f523f..1ee6fe5dd6 100644 --- a/vp8/encoder/mcomp.h +++ b/vp8/encoder/mcomp.h @@ -50,10 +50,10 @@ fractional_mv_step_fp vp8_find_best_sub_pixel_step; fractional_mv_step_fp vp8_find_best_half_pixel_step; fractional_mv_step_fp vp8_skip_fractional_mv_step; -typedef int (*vp8_full_search_fn_t)(MACROBLOCK *x, BLOCK *b, BLOCKD *d, - int_mv *ref_mv, int sad_per_bit, - int distance, vp8_variance_fn_ptr_t *fn_ptr, - int *mvcost[2], int_mv *center_mv); +int vp8_full_search_sad(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, + int sad_per_bit, int distance, + vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2], + int_mv *center_mv); typedef int (*vp8_refining_search_fn_t)(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, int sad_per_bit, diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index f09177c7f5..ffb3867dd1 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -2012,36 +2012,26 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) { cpi->fn_ptr[BLOCK_16X16].sdf = vpx_sad16x16; cpi->fn_ptr[BLOCK_16X16].vf = vpx_variance16x16; cpi->fn_ptr[BLOCK_16X16].svf = vpx_sub_pixel_variance16x16; - cpi->fn_ptr[BLOCK_16X16].sdx3f = vpx_sad16x16x3; - cpi->fn_ptr[BLOCK_16X16].sdx8f = vpx_sad16x16x8; cpi->fn_ptr[BLOCK_16X16].sdx4df = vpx_sad16x16x4d; cpi->fn_ptr[BLOCK_16X8].sdf = vpx_sad16x8; cpi->fn_ptr[BLOCK_16X8].vf = vpx_variance16x8; cpi->fn_ptr[BLOCK_16X8].svf = vpx_sub_pixel_variance16x8; - cpi->fn_ptr[BLOCK_16X8].sdx3f = vpx_sad16x8x3; - cpi->fn_ptr[BLOCK_16X8].sdx8f = vpx_sad16x8x8; cpi->fn_ptr[BLOCK_16X8].sdx4df = vpx_sad16x8x4d; cpi->fn_ptr[BLOCK_8X16].sdf = vpx_sad8x16; cpi->fn_ptr[BLOCK_8X16].vf = vpx_variance8x16; cpi->fn_ptr[BLOCK_8X16].svf = vpx_sub_pixel_variance8x16; - cpi->fn_ptr[BLOCK_8X16].sdx3f = vpx_sad8x16x3; - cpi->fn_ptr[BLOCK_8X16].sdx8f = vpx_sad8x16x8; cpi->fn_ptr[BLOCK_8X16].sdx4df = vpx_sad8x16x4d; cpi->fn_ptr[BLOCK_8X8].sdf = vpx_sad8x8; cpi->fn_ptr[BLOCK_8X8].vf = vpx_variance8x8; cpi->fn_ptr[BLOCK_8X8].svf = vpx_sub_pixel_variance8x8; - cpi->fn_ptr[BLOCK_8X8].sdx3f = vpx_sad8x8x3; - cpi->fn_ptr[BLOCK_8X8].sdx8f = vpx_sad8x8x8; cpi->fn_ptr[BLOCK_8X8].sdx4df = vpx_sad8x8x4d; cpi->fn_ptr[BLOCK_4X4].sdf = vpx_sad4x4; cpi->fn_ptr[BLOCK_4X4].vf = vpx_variance4x4; cpi->fn_ptr[BLOCK_4X4].svf = vpx_sub_pixel_variance4x4; - cpi->fn_ptr[BLOCK_4X4].sdx3f = vpx_sad4x4x3; - cpi->fn_ptr[BLOCK_4X4].sdx8f = vpx_sad4x4x8; cpi->fn_ptr[BLOCK_4X4].sdx4df = vpx_sad4x4x4d; #if VPX_ARCH_X86 || VPX_ARCH_X86_64 @@ -2052,7 +2042,6 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) { cpi->fn_ptr[BLOCK_4X4].copymem = vp8_copy32xn; #endif - cpi->full_search_sad = vp8_full_search_sad; cpi->diamond_search_sad = vp8_diamond_search_sad; cpi->refining_search_sad = vp8_refining_search_sad; diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h index 7f8298e44a..424f51b180 100644 --- a/vp8/encoder/onyx_int.h +++ b/vp8/encoder/onyx_int.h @@ -549,7 +549,6 @@ typedef struct VP8_COMP { unsigned char *partition_d_end[MAX_PARTITIONS]; fractional_mv_step_fp *find_fractional_mv_step; - vp8_full_search_fn_t full_search_sad; vp8_refining_search_fn_t refining_search_sad; vp8_diamond_search_fn_t diamond_search_sad; vp8_variance_fn_ptr_t fn_ptr[BLOCK_MAX_SEGMENTS]; diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c index 79a858e437..5821fc7346 100644 --- a/vp8/encoder/rdopt.c +++ b/vp8/encoder/rdopt.c @@ -1097,8 +1097,8 @@ static void rd_check_segment(VP8_COMP *cpi, MACROBLOCK *x, BEST_SEG_INFO *bsi, vp8_clamp_mv(&mvp_full, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max); - thissme = cpi->full_search_sad(x, c, e, &mvp_full, sadpb, 16, - v_fn_ptr, x->mvcost, bsi->ref_mv); + thissme = vp8_full_search_sad(x, c, e, &mvp_full, sadpb, 16, + v_fn_ptr, x->mvcost, bsi->ref_mv); if (thissme < bestsme) { bestsme = thissme; diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 4609a6bb26..df73190426 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -1569,15 +1569,13 @@ void vp9_set_rc_buffer_sizes(VP9_COMP *cpi) { } #if CONFIG_VP9_HIGHBITDEPTH -// TODO(angiebird): make sdx8f available for highbitdepth if needed #define HIGHBD_BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF) \ cpi->fn_ptr[BT].sdf = SDF; \ cpi->fn_ptr[BT].sdaf = SDAF; \ cpi->fn_ptr[BT].vf = VF; \ cpi->fn_ptr[BT].svf = SVF; \ cpi->fn_ptr[BT].svaf = SVAF; \ - cpi->fn_ptr[BT].sdx4df = SDX4DF; \ - cpi->fn_ptr[BT].sdx8f = NULL; + cpi->fn_ptr[BT].sdx4df = SDX4DF; #define MAKE_BFP_SAD_WRAPPER(fnname) \ static unsigned int fnname##_bits8(const uint8_t *src_ptr, \ @@ -2561,67 +2559,61 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf, CHECK_MEM_ERROR(cm, cpi->source_diff_var, vpx_calloc(cm->MBs, sizeof(diff))); cpi->source_var_thresh = 0; cpi->frames_till_next_var_check = 0; -#define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF, SDX8F) \ - cpi->fn_ptr[BT].sdf = SDF; \ - cpi->fn_ptr[BT].sdaf = SDAF; \ - cpi->fn_ptr[BT].vf = VF; \ - cpi->fn_ptr[BT].svf = SVF; \ - cpi->fn_ptr[BT].svaf = SVAF; \ - cpi->fn_ptr[BT].sdx4df = SDX4DF; \ - cpi->fn_ptr[BT].sdx8f = SDX8F; +#define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF) \ + cpi->fn_ptr[BT].sdf = SDF; \ + cpi->fn_ptr[BT].sdaf = SDAF; \ + cpi->fn_ptr[BT].vf = VF; \ + cpi->fn_ptr[BT].svf = SVF; \ + cpi->fn_ptr[BT].svaf = SVAF; \ + cpi->fn_ptr[BT].sdx4df = SDX4DF; - // TODO(angiebird): make sdx8f available for every block size BFP(BLOCK_32X16, vpx_sad32x16, vpx_sad32x16_avg, vpx_variance32x16, vpx_sub_pixel_variance32x16, vpx_sub_pixel_avg_variance32x16, - vpx_sad32x16x4d, NULL) + vpx_sad32x16x4d) BFP(BLOCK_16X32, vpx_sad16x32, vpx_sad16x32_avg, vpx_variance16x32, vpx_sub_pixel_variance16x32, vpx_sub_pixel_avg_variance16x32, - vpx_sad16x32x4d, NULL) + vpx_sad16x32x4d) BFP(BLOCK_64X32, vpx_sad64x32, vpx_sad64x32_avg, vpx_variance64x32, vpx_sub_pixel_variance64x32, vpx_sub_pixel_avg_variance64x32, - vpx_sad64x32x4d, NULL) + vpx_sad64x32x4d) BFP(BLOCK_32X64, vpx_sad32x64, vpx_sad32x64_avg, vpx_variance32x64, vpx_sub_pixel_variance32x64, vpx_sub_pixel_avg_variance32x64, - vpx_sad32x64x4d, NULL) + vpx_sad32x64x4d) BFP(BLOCK_32X32, vpx_sad32x32, vpx_sad32x32_avg, vpx_variance32x32, vpx_sub_pixel_variance32x32, vpx_sub_pixel_avg_variance32x32, - vpx_sad32x32x4d, vpx_sad32x32x8) + vpx_sad32x32x4d) BFP(BLOCK_64X64, vpx_sad64x64, vpx_sad64x64_avg, vpx_variance64x64, vpx_sub_pixel_variance64x64, vpx_sub_pixel_avg_variance64x64, - vpx_sad64x64x4d, NULL) + vpx_sad64x64x4d) BFP(BLOCK_16X16, vpx_sad16x16, vpx_sad16x16_avg, vpx_variance16x16, vpx_sub_pixel_variance16x16, vpx_sub_pixel_avg_variance16x16, - vpx_sad16x16x4d, vpx_sad16x16x8) + vpx_sad16x16x4d) BFP(BLOCK_16X8, vpx_sad16x8, vpx_sad16x8_avg, vpx_variance16x8, vpx_sub_pixel_variance16x8, vpx_sub_pixel_avg_variance16x8, - vpx_sad16x8x4d, vpx_sad16x8x8) + vpx_sad16x8x4d) BFP(BLOCK_8X16, vpx_sad8x16, vpx_sad8x16_avg, vpx_variance8x16, vpx_sub_pixel_variance8x16, vpx_sub_pixel_avg_variance8x16, - vpx_sad8x16x4d, vpx_sad8x16x8) + vpx_sad8x16x4d) BFP(BLOCK_8X8, vpx_sad8x8, vpx_sad8x8_avg, vpx_variance8x8, - vpx_sub_pixel_variance8x8, vpx_sub_pixel_avg_variance8x8, vpx_sad8x8x4d, - vpx_sad8x8x8) + vpx_sub_pixel_variance8x8, vpx_sub_pixel_avg_variance8x8, vpx_sad8x8x4d) BFP(BLOCK_8X4, vpx_sad8x4, vpx_sad8x4_avg, vpx_variance8x4, - vpx_sub_pixel_variance8x4, vpx_sub_pixel_avg_variance8x4, vpx_sad8x4x4d, - NULL) + vpx_sub_pixel_variance8x4, vpx_sub_pixel_avg_variance8x4, vpx_sad8x4x4d) BFP(BLOCK_4X8, vpx_sad4x8, vpx_sad4x8_avg, vpx_variance4x8, - vpx_sub_pixel_variance4x8, vpx_sub_pixel_avg_variance4x8, vpx_sad4x8x4d, - NULL) + vpx_sub_pixel_variance4x8, vpx_sub_pixel_avg_variance4x8, vpx_sad4x8x4d) BFP(BLOCK_4X4, vpx_sad4x4, vpx_sad4x4_avg, vpx_variance4x4, - vpx_sub_pixel_variance4x4, vpx_sub_pixel_avg_variance4x4, vpx_sad4x4x4d, - vpx_sad4x4x8) + vpx_sub_pixel_variance4x4, vpx_sub_pixel_avg_variance4x4, vpx_sad4x4x4d) #if CONFIG_VP9_HIGHBITDEPTH highbd_set_var_fns(cpi); diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c index cd67064203..1f08aa5de7 100644 --- a/vp9/encoder/vp9_mcomp.c +++ b/vp9/encoder/vp9_mcomp.c @@ -1796,29 +1796,6 @@ static int64_t exhaustive_mesh_search_single_step( end_col = VPXMIN(center_mv->col + range, mv_limits->col_max); for (r = start_row; r <= end_row; r += 1) { c = start_col; - // sdx8f may not be available some block size - if (fn_ptr->sdx8f) { - while (c + 7 <= end_col) { - unsigned int sads[8]; - const MV mv = { r, c }; - const uint8_t *buf = get_buf_from_mv(pre, &mv); - fn_ptr->sdx8f(src->buf, src->stride, buf, pre->stride, sads); - - for (i = 0; i < 8; ++i) { - int64_t sad = (int64_t)sads[i] << LOG2_PRECISION; - if (sad < best_sad) { - const MV mv = { r, c + i }; - sad += lambda * - vp9_nb_mvs_inconsistency(&mv, nb_full_mvs, full_mv_num); - if (sad < best_sad) { - best_sad = sad; - *best_mv = mv; - } - } - } - c += 8; - } - } while (c + 3 <= end_col) { unsigned int sads[4]; const uint8_t *addrs[4]; diff --git a/vp9/encoder/vp9_mcomp.h b/vp9/encoder/vp9_mcomp.h index 0c4d8f23c6..bdaf2ce77d 100644 --- a/vp9/encoder/vp9_mcomp.h +++ b/vp9/encoder/vp9_mcomp.h @@ -93,16 +93,6 @@ extern fractional_mv_step_fp vp9_skip_sub_pixel_tree; extern fractional_mv_step_fp vp9_return_max_sub_pixel_mv; extern fractional_mv_step_fp vp9_return_min_sub_pixel_mv; -typedef int (*vp9_full_search_fn_t)(const MACROBLOCK *x, const MV *ref_mv, - int sad_per_bit, int distance, - const vp9_variance_fn_ptr_t *fn_ptr, - const MV *center_mv, MV *best_mv); - -typedef int (*vp9_refining_search_fn_t)(const MACROBLOCK *x, MV *ref_mv, - int sad_per_bit, int distance, - const vp9_variance_fn_ptr_t *fn_ptr, - const MV *center_mv); - typedef int (*vp9_diamond_search_fn_t)( const MACROBLOCK *x, const search_site_config *cfg, MV *ref_mv, MV *best_mv, int search_param, int sad_per_bit, int *num00, diff --git a/vpx_dsp/mips/sad_mmi.c b/vpx_dsp/mips/sad_mmi.c index eaca4773f2..7f5882bca3 100644 --- a/vpx_dsp/mips/sad_mmi.c +++ b/vpx_dsp/mips/sad_mmi.c @@ -334,19 +334,6 @@ "paddw %[ftmp3], %[ftmp3], %[ftmp1] \n\t" #endif /* _MIPS_SIM == _ABIO32 */ -// depending on call sites, pass **ref_array to avoid & in subsequent call and -// de-dup with 4D below. -#define sadMxNxK_mmi(m, n, k) \ - void vpx_sad##m##x##n##x##k##_mmi(const uint8_t *src, int src_stride, \ - const uint8_t *ref_array, int ref_stride, \ - uint32_t *sad_array) { \ - int i; \ - for (i = 0; i < (k); ++i) \ - sad_array[i] = \ - vpx_sad##m##x##n##_mmi(src, src_stride, &ref_array[i], ref_stride); \ - } - -// This appears to be equivalent to the above when k == 4 and refs is const #define sadMxNx4D_mmi(m, n) \ void vpx_sad##m##x##n##x4d_mmi(const uint8_t *src, int src_stride, \ const uint8_t *const ref_array[], \ @@ -583,10 +570,6 @@ static inline unsigned int vpx_sad16x(const uint8_t *src, int src_stride, vpx_sad16xN(32); vpx_sad16xN(16); vpx_sad16xN(8); -sadMxNxK_mmi(16, 16, 3); -sadMxNxK_mmi(16, 16, 8); -sadMxNxK_mmi(16, 8, 3); -sadMxNxK_mmi(16, 8, 8); sadMxNx4D_mmi(16, 32); sadMxNx4D_mmi(16, 16); sadMxNx4D_mmi(16, 8); @@ -681,10 +664,6 @@ static inline unsigned int vpx_sad8x(const uint8_t *src, int src_stride, vpx_sad8xN(16); vpx_sad8xN(8); vpx_sad8xN(4); -sadMxNxK_mmi(8, 16, 3); -sadMxNxK_mmi(8, 16, 8); -sadMxNxK_mmi(8, 8, 3); -sadMxNxK_mmi(8, 8, 8); sadMxNx4D_mmi(8, 16); sadMxNx4D_mmi(8, 8); sadMxNx4D_mmi(8, 4); @@ -777,8 +756,6 @@ static inline unsigned int vpx_sad4x(const uint8_t *src, int src_stride, vpx_sad4xN(8); vpx_sad4xN(4); -sadMxNxK_mmi(4, 4, 3); -sadMxNxK_mmi(4, 4, 8); sadMxNx4D_mmi(4, 8); sadMxNx4D_mmi(4, 4); diff --git a/vpx_dsp/mips/sad_msa.c b/vpx_dsp/mips/sad_msa.c index e3e91c4330..b0f8ff1fd9 100644 --- a/vpx_dsp/mips/sad_msa.c +++ b/vpx_dsp/mips/sad_msa.c @@ -159,380 +159,6 @@ static uint32_t sad_64width_msa(const uint8_t *src, int32_t src_stride, return sad; } -static void sad_4width_x3_msa(const uint8_t *src_ptr, int32_t src_stride, - const uint8_t *ref_ptr, int32_t ref_stride, - int32_t height, uint32_t *sad_array) { - int32_t ht_cnt; - uint32_t src0, src1, src2, src3; - v16u8 src = { 0 }; - v16u8 ref = { 0 }; - v16u8 ref0, ref1, ref2, ref3, diff; - v8u16 sad0 = { 0 }; - v8u16 sad1 = { 0 }; - v8u16 sad2 = { 0 }; - - for (ht_cnt = (height >> 2); ht_cnt--;) { - LW4(src_ptr, src_stride, src0, src1, src2, src3); - src_ptr += (4 * src_stride); - INSERT_W4_UB(src0, src1, src2, src3, src); - - LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); - ref_ptr += (4 * ref_stride); - SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); - diff = __msa_asub_u_b(src, ref); - sad0 += __msa_hadd_u_h(diff, diff); - - SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1); - SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1); - SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); - diff = __msa_asub_u_b(src, ref); - sad1 += __msa_hadd_u_h(diff, diff); - - SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1); - SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1); - SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); - diff = __msa_asub_u_b(src, ref); - sad2 += __msa_hadd_u_h(diff, diff); - } - - sad_array[0] = HADD_UH_U32(sad0); - sad_array[1] = HADD_UH_U32(sad1); - sad_array[2] = HADD_UH_U32(sad2); -} - -static void sad_8width_x3_msa(const uint8_t *src, int32_t src_stride, - const uint8_t *ref, int32_t ref_stride, - int32_t height, uint32_t *sad_array) { - int32_t ht_cnt; - v16u8 src0, src1, src2, src3; - v16u8 ref0, ref1, ref00, ref11, ref22, ref33; - v8u16 sad0 = { 0 }; - v8u16 sad1 = { 0 }; - v8u16 sad2 = { 0 }; - - for (ht_cnt = (height >> 2); ht_cnt--;) { - LD_UB4(src, src_stride, src0, src1, src2, src3); - src += (4 * src_stride); - LD_UB4(ref, ref_stride, ref00, ref11, ref22, ref33); - ref += (4 * ref_stride); - PCKEV_D4_UB(src1, src0, src3, src2, ref11, ref00, ref33, ref22, src0, src1, - ref0, ref1); - sad0 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1); - SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1); - PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1); - sad1 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1); - SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1); - PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1); - sad2 += SAD_UB2_UH(src0, src1, ref0, ref1); - } - - sad_array[0] = HADD_UH_U32(sad0); - sad_array[1] = HADD_UH_U32(sad1); - sad_array[2] = HADD_UH_U32(sad2); -} - -static void sad_16width_x3_msa(const uint8_t *src_ptr, int32_t src_stride, - const uint8_t *ref_ptr, int32_t ref_stride, - int32_t height, uint32_t *sad_array) { - int32_t ht_cnt; - v16u8 src, ref, ref0, ref1, diff; - v8u16 sad0 = { 0 }; - v8u16 sad1 = { 0 }; - v8u16 sad2 = { 0 }; - - for (ht_cnt = (height >> 1); ht_cnt--;) { - src = LD_UB(src_ptr); - src_ptr += src_stride; - LD_UB2(ref_ptr, 16, ref0, ref1); - ref_ptr += ref_stride; - - diff = __msa_asub_u_b(src, ref0); - sad0 += __msa_hadd_u_h(diff, diff); - - ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1); - diff = __msa_asub_u_b(src, ref); - sad1 += __msa_hadd_u_h(diff, diff); - - ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2); - diff = __msa_asub_u_b(src, ref); - sad2 += __msa_hadd_u_h(diff, diff); - - src = LD_UB(src_ptr); - src_ptr += src_stride; - LD_UB2(ref_ptr, 16, ref0, ref1); - ref_ptr += ref_stride; - - diff = __msa_asub_u_b(src, ref0); - sad0 += __msa_hadd_u_h(diff, diff); - - ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1); - diff = __msa_asub_u_b(src, ref); - sad1 += __msa_hadd_u_h(diff, diff); - - ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2); - diff = __msa_asub_u_b(src, ref); - sad2 += __msa_hadd_u_h(diff, diff); - } - - sad_array[0] = HADD_UH_U32(sad0); - sad_array[1] = HADD_UH_U32(sad1); - sad_array[2] = HADD_UH_U32(sad2); -} - -static void sad_4width_x8_msa(const uint8_t *src_ptr, int32_t src_stride, - const uint8_t *ref_ptr, int32_t ref_stride, - int32_t height, uint32_t *sad_array) { - int32_t ht_cnt; - uint32_t src0, src1, src2, src3; - v16u8 ref0, ref1, ref2, ref3, diff; - v16u8 src = { 0 }; - v16u8 ref = { 0 }; - v8u16 sad0 = { 0 }; - v8u16 sad1 = { 0 }; - v8u16 sad2 = { 0 }; - v8u16 sad3 = { 0 }; - v8u16 sad4 = { 0 }; - v8u16 sad5 = { 0 }; - v8u16 sad6 = { 0 }; - v8u16 sad7 = { 0 }; - - for (ht_cnt = (height >> 2); ht_cnt--;) { - LW4(src_ptr, src_stride, src0, src1, src2, src3); - INSERT_W4_UB(src0, src1, src2, src3, src); - src_ptr += (4 * src_stride); - LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); - ref_ptr += (4 * ref_stride); - - SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); - diff = __msa_asub_u_b(src, ref); - sad0 += __msa_hadd_u_h(diff, diff); - - SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1); - SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1); - SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); - diff = __msa_asub_u_b(src, ref); - sad1 += __msa_hadd_u_h(diff, diff); - - SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1); - SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1); - SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); - diff = __msa_asub_u_b(src, ref); - sad2 += __msa_hadd_u_h(diff, diff); - - SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1); - SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1); - SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); - diff = __msa_asub_u_b(src, ref); - sad3 += __msa_hadd_u_h(diff, diff); - - SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1); - SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1); - SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); - diff = __msa_asub_u_b(src, ref); - sad4 += __msa_hadd_u_h(diff, diff); - - SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1); - SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1); - SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); - diff = __msa_asub_u_b(src, ref); - sad5 += __msa_hadd_u_h(diff, diff); - - SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1); - SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1); - SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); - diff = __msa_asub_u_b(src, ref); - sad6 += __msa_hadd_u_h(diff, diff); - - SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1); - SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1); - SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); - diff = __msa_asub_u_b(src, ref); - sad7 += __msa_hadd_u_h(diff, diff); - } - - sad_array[0] = HADD_UH_U32(sad0); - sad_array[1] = HADD_UH_U32(sad1); - sad_array[2] = HADD_UH_U32(sad2); - sad_array[3] = HADD_UH_U32(sad3); - sad_array[4] = HADD_UH_U32(sad4); - sad_array[5] = HADD_UH_U32(sad5); - sad_array[6] = HADD_UH_U32(sad6); - sad_array[7] = HADD_UH_U32(sad7); -} - -static void sad_8width_x8_msa(const uint8_t *src, int32_t src_stride, - const uint8_t *ref, int32_t ref_stride, - int32_t height, uint32_t *sad_array) { - int32_t ht_cnt; - v16u8 src0, src1, src2, src3; - v16u8 ref0, ref1, ref00, ref11, ref22, ref33; - v8u16 sad0 = { 0 }; - v8u16 sad1 = { 0 }; - v8u16 sad2 = { 0 }; - v8u16 sad3 = { 0 }; - v8u16 sad4 = { 0 }; - v8u16 sad5 = { 0 }; - v8u16 sad6 = { 0 }; - v8u16 sad7 = { 0 }; - - for (ht_cnt = (height >> 2); ht_cnt--;) { - LD_UB4(src, src_stride, src0, src1, src2, src3); - src += (4 * src_stride); - LD_UB4(ref, ref_stride, ref00, ref11, ref22, ref33); - ref += (4 * ref_stride); - PCKEV_D4_UB(src1, src0, src3, src2, ref11, ref00, ref33, ref22, src0, src1, - ref0, ref1); - sad0 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1); - SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1); - PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1); - sad1 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1); - SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1); - PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1); - sad2 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1); - SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1); - PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1); - sad3 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1); - SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1); - PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1); - sad4 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1); - SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1); - PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1); - sad5 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1); - SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1); - PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1); - sad6 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1); - SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1); - PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1); - sad7 += SAD_UB2_UH(src0, src1, ref0, ref1); - } - - sad_array[0] = HADD_UH_U32(sad0); - sad_array[1] = HADD_UH_U32(sad1); - sad_array[2] = HADD_UH_U32(sad2); - sad_array[3] = HADD_UH_U32(sad3); - sad_array[4] = HADD_UH_U32(sad4); - sad_array[5] = HADD_UH_U32(sad5); - sad_array[6] = HADD_UH_U32(sad6); - sad_array[7] = HADD_UH_U32(sad7); -} - -static void sad_16width_x8_msa(const uint8_t *src_ptr, int32_t src_stride, - const uint8_t *ref_ptr, int32_t ref_stride, - int32_t height, uint32_t *sad_array) { - int32_t ht_cnt; - v16u8 src, ref0, ref1, ref; - v16u8 diff; - v8u16 sad0 = { 0 }; - v8u16 sad1 = { 0 }; - v8u16 sad2 = { 0 }; - v8u16 sad3 = { 0 }; - v8u16 sad4 = { 0 }; - v8u16 sad5 = { 0 }; - v8u16 sad6 = { 0 }; - v8u16 sad7 = { 0 }; - - for (ht_cnt = (height >> 1); ht_cnt--;) { - src = LD_UB(src_ptr); - src_ptr += src_stride; - LD_UB2(ref_ptr, 16, ref0, ref1); - ref_ptr += ref_stride; - - diff = __msa_asub_u_b(src, ref0); - sad0 += __msa_hadd_u_h(diff, diff); - - ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1); - diff = __msa_asub_u_b(src, ref); - sad1 += __msa_hadd_u_h(diff, diff); - - ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2); - diff = __msa_asub_u_b(src, ref); - sad2 += __msa_hadd_u_h(diff, diff); - - ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 3); - diff = __msa_asub_u_b(src, ref); - sad3 += __msa_hadd_u_h(diff, diff); - - ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 4); - diff = __msa_asub_u_b(src, ref); - sad4 += __msa_hadd_u_h(diff, diff); - - ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 5); - diff = __msa_asub_u_b(src, ref); - sad5 += __msa_hadd_u_h(diff, diff); - - ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 6); - diff = __msa_asub_u_b(src, ref); - sad6 += __msa_hadd_u_h(diff, diff); - - ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 7); - diff = __msa_asub_u_b(src, ref); - sad7 += __msa_hadd_u_h(diff, diff); - - src = LD_UB(src_ptr); - src_ptr += src_stride; - LD_UB2(ref_ptr, 16, ref0, ref1); - ref_ptr += ref_stride; - - diff = __msa_asub_u_b(src, ref0); - sad0 += __msa_hadd_u_h(diff, diff); - - ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1); - diff = __msa_asub_u_b(src, ref); - sad1 += __msa_hadd_u_h(diff, diff); - - ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2); - diff = __msa_asub_u_b(src, ref); - sad2 += __msa_hadd_u_h(diff, diff); - - ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 3); - diff = __msa_asub_u_b(src, ref); - sad3 += __msa_hadd_u_h(diff, diff); - - ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 4); - diff = __msa_asub_u_b(src, ref); - sad4 += __msa_hadd_u_h(diff, diff); - - ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 5); - diff = __msa_asub_u_b(src, ref); - sad5 += __msa_hadd_u_h(diff, diff); - - ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 6); - diff = __msa_asub_u_b(src, ref); - sad6 += __msa_hadd_u_h(diff, diff); - - ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 7); - diff = __msa_asub_u_b(src, ref); - sad7 += __msa_hadd_u_h(diff, diff); - } - - sad_array[0] = HADD_UH_U32(sad0); - sad_array[1] = HADD_UH_U32(sad1); - sad_array[2] = HADD_UH_U32(sad2); - sad_array[3] = HADD_UH_U32(sad3); - sad_array[4] = HADD_UH_U32(sad4); - sad_array[5] = HADD_UH_U32(sad5); - sad_array[6] = HADD_UH_U32(sad6); - sad_array[7] = HADD_UH_U32(sad7); -} - static void sad_4width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride, const uint8_t *const aref_ptr[], int32_t ref_stride, int32_t height, @@ -1037,48 +663,6 @@ static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride, return sad_64width_msa(src, src_stride, ref, ref_stride, height); \ } -#define VPX_SAD_4xHEIGHTx3_MSA(height) \ - void vpx_sad4x##height##x3_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride, \ - uint32_t sads[3]) { \ - sad_4width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \ - } - -#define VPX_SAD_8xHEIGHTx3_MSA(height) \ - void vpx_sad8x##height##x3_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride, \ - uint32_t sads[3]) { \ - sad_8width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \ - } - -#define VPX_SAD_16xHEIGHTx3_MSA(height) \ - void vpx_sad16x##height##x3_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride, \ - uint32_t sads[3]) { \ - sad_16width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \ - } - -#define VPX_SAD_4xHEIGHTx8_MSA(height) \ - void vpx_sad4x##height##x8_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride, \ - uint32_t sads[8]) { \ - sad_4width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \ - } - -#define VPX_SAD_8xHEIGHTx8_MSA(height) \ - void vpx_sad8x##height##x8_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride, \ - uint32_t sads[8]) { \ - sad_8width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \ - } - -#define VPX_SAD_16xHEIGHTx8_MSA(height) \ - void vpx_sad16x##height##x8_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride, \ - uint32_t sads[8]) { \ - sad_16width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \ - } - #define VPX_SAD_4xHEIGHTx4D_MSA(height) \ void vpx_sad4x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \ const uint8_t *const refs[4], \ @@ -1186,29 +770,21 @@ VPX_AVGSAD_16xHEIGHT_MSA(32); // 16x16 VPX_SAD_16xHEIGHT_MSA(16); -VPX_SAD_16xHEIGHTx3_MSA(16); -VPX_SAD_16xHEIGHTx8_MSA(16); VPX_SAD_16xHEIGHTx4D_MSA(16); VPX_AVGSAD_16xHEIGHT_MSA(16); // 16x8 VPX_SAD_16xHEIGHT_MSA(8); -VPX_SAD_16xHEIGHTx3_MSA(8); -VPX_SAD_16xHEIGHTx8_MSA(8); VPX_SAD_16xHEIGHTx4D_MSA(8); VPX_AVGSAD_16xHEIGHT_MSA(8); // 8x16 VPX_SAD_8xHEIGHT_MSA(16); -VPX_SAD_8xHEIGHTx3_MSA(16); -VPX_SAD_8xHEIGHTx8_MSA(16); VPX_SAD_8xHEIGHTx4D_MSA(16); VPX_AVGSAD_8xHEIGHT_MSA(16); // 8x8 VPX_SAD_8xHEIGHT_MSA(8); -VPX_SAD_8xHEIGHTx3_MSA(8); -VPX_SAD_8xHEIGHTx8_MSA(8); VPX_SAD_8xHEIGHTx4D_MSA(8); VPX_AVGSAD_8xHEIGHT_MSA(8); @@ -1224,7 +800,5 @@ VPX_AVGSAD_4xHEIGHT_MSA(8); // 4x4 VPX_SAD_4xHEIGHT_MSA(4); -VPX_SAD_4xHEIGHTx3_MSA(4); -VPX_SAD_4xHEIGHTx8_MSA(4); VPX_SAD_4xHEIGHTx4D_MSA(4); VPX_AVGSAD_4xHEIGHT_MSA(4); diff --git a/vpx_dsp/sad.c b/vpx_dsp/sad.c index 46d513b686..b47c43430d 100644 --- a/vpx_dsp/sad.c +++ b/vpx_dsp/sad.c @@ -45,35 +45,7 @@ static INLINE unsigned int sad(const uint8_t *src_ptr, int src_stride, return sad(src_ptr, src_stride, comp_pred, m, m, n); \ } -// Compare |src_ptr| to |k| adjacent blocks starting at |ref_ptr|. -// |k| == {3,8}. Used in vp8 for an exhaustive search. -// src: ref: -// 0 1 2 3 0 1 2 3 x x -// 4 5 6 7 6 7 8 9 x x -// 8 9 10 11 12 13 14 15 x x -// 12 13 14 15 18 19 20 21 x x -// -// x 1 2 3 4 x -// x 7 8 9 10 x -// x 13 14 15 16 x -// x 19 20 21 22 x -// -// x x 2 3 4 5 -// x x 8 9 10 11 -// x x 14 15 16 17 -// x x 20 21 22 23 -// -#define sadMxNxK(m, n, k) \ - void vpx_sad##m##x##n##x##k##_c(const uint8_t *src_ptr, int src_stride, \ - const uint8_t *ref_ptr, int ref_stride, \ - uint32_t sad_array[k]) { \ - int i; \ - for (i = 0; i < k; ++i) \ - sad_array[i] = \ - vpx_sad##m##x##n##_c(src_ptr, src_stride, ref_ptr + i, ref_stride); \ - } - -// Compare |src_ptr| to 4 distinct references in |ref_array[]| +// Compare |src_ptr| to 4 distinct references in |ref_array[4]| #define sadMxNx4D(m, n) \ void vpx_sad##m##x##n##x4d_c(const uint8_t *src_ptr, int src_stride, \ const uint8_t *const ref_array[4], \ @@ -99,7 +71,6 @@ sadMxNx4D(32, 64) // 32x32 sadMxN(32, 32) -sadMxNxK(32, 32, 8) sadMxNx4D(32, 32) // 32x16 @@ -112,26 +83,18 @@ sadMxNx4D(16, 32) // 16x16 sadMxN(16, 16) -sadMxNxK(16, 16, 3) -sadMxNxK(16, 16, 8) sadMxNx4D(16, 16) // 16x8 sadMxN(16, 8) -sadMxNxK(16, 8, 3) -sadMxNxK(16, 8, 8) sadMxNx4D(16, 8) // 8x16 sadMxN(8, 16) -sadMxNxK(8, 16, 3) -sadMxNxK(8, 16, 8) sadMxNx4D(8, 16) // 8x8 sadMxN(8, 8) -sadMxNxK(8, 8, 3) -sadMxNxK(8, 8, 8) sadMxNx4D(8, 8) // 8x4 @@ -144,8 +107,6 @@ sadMxNx4D(4, 8) // 4x4 sadMxN(4, 4) -sadMxNxK(4, 4, 3) -sadMxNxK(4, 4, 8) sadMxNx4D(4, 4) /* clang-format on */ diff --git a/vpx_dsp/variance.h b/vpx_dsp/variance.h index f8b44f03d1..755cb907d2 100644 --- a/vpx_dsp/variance.h +++ b/vpx_dsp/variance.h @@ -59,8 +59,6 @@ typedef struct variance_vtable { vpx_sad_fn_t sdf; vpx_variance_fn_t vf; vpx_subpixvariance_fn_t svf; - vpx_sad_multi_fn_t sdx3f; - vpx_sad_multi_fn_t sdx8f; vpx_sad_multi_d_fn_t sdx4df; #if VPX_ARCH_X86 || VPX_ARCH_X86_64 vp8_copy32xn_fn_t copymem; @@ -76,7 +74,6 @@ typedef struct vp9_variance_vtable { vpx_subpixvariance_fn_t svf; vpx_subp_avg_variance_fn_t svaf; vpx_sad_multi_d_fn_t sdx4df; - vpx_sad_multi_fn_t sdx8f; } vp9_variance_fn_ptr_t; #endif // CONFIG_VP9 diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index a880e1d285..a1e511cce0 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -358,9 +358,6 @@ DSP_SRCS-$(HAVE_MSA) += mips/subtract_msa.c DSP_SRCS-$(HAVE_MMI) += mips/sad_mmi.c DSP_SRCS-$(HAVE_MMI) += mips/subtract_mmi.c -DSP_SRCS-$(HAVE_SSE3) += x86/sad_sse3.asm -DSP_SRCS-$(HAVE_SSSE3) += x86/sad_ssse3.asm -DSP_SRCS-$(HAVE_SSE4_1) += x86/sad_sse4.asm DSP_SRCS-$(HAVE_AVX2) += x86/sad4d_avx2.c DSP_SRCS-$(HAVE_AVX2) += x86/sad_avx2.c DSP_SRCS-$(HAVE_AVX512) += x86/sad4d_avx512.c diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 06a8febb29..91ff884a62 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -873,44 +873,6 @@ () add_proto qw/unsigned int vpx_sad4x4_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; specialize qw/vpx_sad4x4_avg neon msa sse2 mmi/; -# -# Multi-block SAD, comparing a reference to N blocks 1 pixel apart horizontally -# -# Blocks of 3 -add_proto qw/void vpx_sad16x16x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t sad_array[3]"; -specialize qw/vpx_sad16x16x3 sse3 ssse3 msa mmi/; - -add_proto qw/void vpx_sad16x8x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t sad_array[3]"; -specialize qw/vpx_sad16x8x3 sse3 ssse3 msa mmi/; - -add_proto qw/void vpx_sad8x16x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t sad_array[3]"; -specialize qw/vpx_sad8x16x3 sse3 msa mmi/; - -add_proto qw/void vpx_sad8x8x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t sad_array[3]"; -specialize qw/vpx_sad8x8x3 sse3 msa mmi/; - -add_proto qw/void vpx_sad4x4x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t sad_array[3]"; -specialize qw/vpx_sad4x4x3 sse3 msa mmi/; - -# Blocks of 8 -add_proto qw/void vpx_sad32x32x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t sad_array[8]"; -specialize qw/vpx_sad32x32x8 avx2/; - -add_proto qw/void vpx_sad16x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t sad_array[8]"; -specialize qw/vpx_sad16x16x8 sse4_1 msa mmi/; - -add_proto qw/void vpx_sad16x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t sad_array[8]"; -specialize qw/vpx_sad16x8x8 sse4_1 msa mmi/; - -add_proto qw/void vpx_sad8x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t sad_array[8]"; -specialize qw/vpx_sad8x16x8 sse4_1 msa mmi/; - -add_proto qw/void vpx_sad8x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t sad_array[8]"; -specialize qw/vpx_sad8x8x8 sse4_1 msa mmi/; - -add_proto qw/void vpx_sad4x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t sad_array[8]"; -specialize qw/vpx_sad4x4x8 sse4_1 msa mmi/; - # # Multi-block SAD, comparing a reference to N independent blocks # diff --git a/vpx_dsp/x86/sad4d_avx2.c b/vpx_dsp/x86/sad4d_avx2.c index 5f1f757e25..6c5c8ebc87 100644 --- a/vpx_dsp/x86/sad4d_avx2.c +++ b/vpx_dsp/x86/sad4d_avx2.c @@ -68,63 +68,6 @@ void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, calc_final_4(sums, sad_array); } -void vpx_sad32x32x8_avx2(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - uint32_t sad_array[8]) { - int i; - __m256i sums[8]; - - sums[0] = _mm256_setzero_si256(); - sums[1] = _mm256_setzero_si256(); - sums[2] = _mm256_setzero_si256(); - sums[3] = _mm256_setzero_si256(); - sums[4] = _mm256_setzero_si256(); - sums[5] = _mm256_setzero_si256(); - sums[6] = _mm256_setzero_si256(); - sums[7] = _mm256_setzero_si256(); - - for (i = 0; i < 32; i++) { - __m256i r[8]; - - // load src and all ref[] - const __m256i s = _mm256_load_si256((const __m256i *)src_ptr); - r[0] = _mm256_loadu_si256((const __m256i *)&ref_ptr[0]); - r[1] = _mm256_loadu_si256((const __m256i *)&ref_ptr[1]); - r[2] = _mm256_loadu_si256((const __m256i *)&ref_ptr[2]); - r[3] = _mm256_loadu_si256((const __m256i *)&ref_ptr[3]); - r[4] = _mm256_loadu_si256((const __m256i *)&ref_ptr[4]); - r[5] = _mm256_loadu_si256((const __m256i *)&ref_ptr[5]); - r[6] = _mm256_loadu_si256((const __m256i *)&ref_ptr[6]); - r[7] = _mm256_loadu_si256((const __m256i *)&ref_ptr[7]); - - // sum of the absolute differences between every ref[] to src - r[0] = _mm256_sad_epu8(r[0], s); - r[1] = _mm256_sad_epu8(r[1], s); - r[2] = _mm256_sad_epu8(r[2], s); - r[3] = _mm256_sad_epu8(r[3], s); - r[4] = _mm256_sad_epu8(r[4], s); - r[5] = _mm256_sad_epu8(r[5], s); - r[6] = _mm256_sad_epu8(r[6], s); - r[7] = _mm256_sad_epu8(r[7], s); - - // sum every ref[] - sums[0] = _mm256_add_epi32(sums[0], r[0]); - sums[1] = _mm256_add_epi32(sums[1], r[1]); - sums[2] = _mm256_add_epi32(sums[2], r[2]); - sums[3] = _mm256_add_epi32(sums[3], r[3]); - sums[4] = _mm256_add_epi32(sums[4], r[4]); - sums[5] = _mm256_add_epi32(sums[5], r[5]); - sums[6] = _mm256_add_epi32(sums[6], r[6]); - sums[7] = _mm256_add_epi32(sums[7], r[7]); - - src_ptr += src_stride; - ref_ptr += ref_stride; - } - - calc_final_4(sums, sad_array); - calc_final_4(sums + 4, sad_array + 4); -} - void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]) { diff --git a/vpx_dsp/x86/sad_sse3.asm b/vpx_dsp/x86/sad_sse3.asm deleted file mode 100644 index acbd2e4fa1..0000000000 --- a/vpx_dsp/x86/sad_sse3.asm +++ /dev/null @@ -1,376 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - -%include "vpx_ports/x86_abi_support.asm" - -%macro STACK_FRAME_CREATE_X3 0 -%if ABI_IS_32BIT - %define src_ptr rsi - %define src_stride rax - %define ref_ptr rdi - %define ref_stride rdx - %define end_ptr rcx - %define ret_var rbx - %define result_ptr arg(4) - %define height dword ptr arg(4) - push rbp - mov rbp, rsp - push rsi - push rdi - push rbx - - mov rsi, arg(0) ; src_ptr - mov rdi, arg(2) ; ref_ptr - - movsxd rax, dword ptr arg(1) ; src_stride - movsxd rdx, dword ptr arg(3) ; ref_stride -%else - %if LIBVPX_YASM_WIN64 - SAVE_XMM 7, u - %define src_ptr rcx - %define src_stride rdx - %define ref_ptr r8 - %define ref_stride r9 - %define end_ptr r10 - %define ret_var r11 - %define result_ptr [rsp+xmm_stack_space+8+4*8] - %define height dword ptr [rsp+xmm_stack_space+8+4*8] - %else - %define src_ptr rdi - %define src_stride rsi - %define ref_ptr rdx - %define ref_stride rcx - %define end_ptr r9 - %define ret_var r10 - %define result_ptr r8 - %define height r8 - %endif -%endif - -%endmacro - -%macro STACK_FRAME_DESTROY_X3 0 - %define src_ptr - %define src_stride - %define ref_ptr - %define ref_stride - %define end_ptr - %define ret_var - %define result_ptr - %define height - -%if ABI_IS_32BIT - pop rbx - pop rdi - pop rsi - pop rbp -%else - %if LIBVPX_YASM_WIN64 - RESTORE_XMM - %endif -%endif - ret -%endmacro - -%macro PROCESS_16X2X3 5 -%if %1==0 - movdqa xmm0, XMMWORD PTR [%2] - lddqu xmm5, XMMWORD PTR [%3] - lddqu xmm6, XMMWORD PTR [%3+1] - lddqu xmm7, XMMWORD PTR [%3+2] - - psadbw xmm5, xmm0 - psadbw xmm6, xmm0 - psadbw xmm7, xmm0 -%else - movdqa xmm0, XMMWORD PTR [%2] - lddqu xmm1, XMMWORD PTR [%3] - lddqu xmm2, XMMWORD PTR [%3+1] - lddqu xmm3, XMMWORD PTR [%3+2] - - psadbw xmm1, xmm0 - psadbw xmm2, xmm0 - psadbw xmm3, xmm0 - - paddw xmm5, xmm1 - paddw xmm6, xmm2 - paddw xmm7, xmm3 -%endif - movdqa xmm0, XMMWORD PTR [%2+%4] - lddqu xmm1, XMMWORD PTR [%3+%5] - lddqu xmm2, XMMWORD PTR [%3+%5+1] - lddqu xmm3, XMMWORD PTR [%3+%5+2] - -%if %1==0 || %1==1 - lea %2, [%2+%4*2] - lea %3, [%3+%5*2] -%endif - - psadbw xmm1, xmm0 - psadbw xmm2, xmm0 - psadbw xmm3, xmm0 - - paddw xmm5, xmm1 - paddw xmm6, xmm2 - paddw xmm7, xmm3 -%endmacro - -%macro PROCESS_8X2X3 5 -%if %1==0 - movq mm0, QWORD PTR [%2] - movq mm5, QWORD PTR [%3] - movq mm6, QWORD PTR [%3+1] - movq mm7, QWORD PTR [%3+2] - - psadbw mm5, mm0 - psadbw mm6, mm0 - psadbw mm7, mm0 -%else - movq mm0, QWORD PTR [%2] - movq mm1, QWORD PTR [%3] - movq mm2, QWORD PTR [%3+1] - movq mm3, QWORD PTR [%3+2] - - psadbw mm1, mm0 - psadbw mm2, mm0 - psadbw mm3, mm0 - - paddw mm5, mm1 - paddw mm6, mm2 - paddw mm7, mm3 -%endif - movq mm0, QWORD PTR [%2+%4] - movq mm1, QWORD PTR [%3+%5] - movq mm2, QWORD PTR [%3+%5+1] - movq mm3, QWORD PTR [%3+%5+2] - -%if %1==0 || %1==1 - lea %2, [%2+%4*2] - lea %3, [%3+%5*2] -%endif - - psadbw mm1, mm0 - psadbw mm2, mm0 - psadbw mm3, mm0 - - paddw mm5, mm1 - paddw mm6, mm2 - paddw mm7, mm3 -%endmacro - -SECTION .text - -;void int vpx_sad16x16x3_sse3( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride, -; int *results) -globalsym(vpx_sad16x16x3_sse3) -sym(vpx_sad16x16x3_sse3): - - STACK_FRAME_CREATE_X3 - - PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride - - mov rcx, result_ptr - - movq xmm0, xmm5 - psrldq xmm5, 8 - - paddw xmm0, xmm5 - movd [rcx], xmm0 -;- - movq xmm0, xmm6 - psrldq xmm6, 8 - - paddw xmm0, xmm6 - movd [rcx+4], xmm0 -;- - movq xmm0, xmm7 - psrldq xmm7, 8 - - paddw xmm0, xmm7 - movd [rcx+8], xmm0 - - STACK_FRAME_DESTROY_X3 - -;void int vpx_sad16x8x3_sse3( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride, -; int *results) -globalsym(vpx_sad16x8x3_sse3) -sym(vpx_sad16x8x3_sse3): - - STACK_FRAME_CREATE_X3 - - PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride - - mov rcx, result_ptr - - movq xmm0, xmm5 - psrldq xmm5, 8 - - paddw xmm0, xmm5 - movd [rcx], xmm0 -;- - movq xmm0, xmm6 - psrldq xmm6, 8 - - paddw xmm0, xmm6 - movd [rcx+4], xmm0 -;- - movq xmm0, xmm7 - psrldq xmm7, 8 - - paddw xmm0, xmm7 - movd [rcx+8], xmm0 - - STACK_FRAME_DESTROY_X3 - -;void int vpx_sad8x16x3_sse3( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride, -; int *results) -globalsym(vpx_sad8x16x3_sse3) -sym(vpx_sad8x16x3_sse3): - - STACK_FRAME_CREATE_X3 - - PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride - - mov rcx, result_ptr - - punpckldq mm5, mm6 - - movq [rcx], mm5 - movd [rcx+8], mm7 - - STACK_FRAME_DESTROY_X3 - -;void int vpx_sad8x8x3_sse3( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride, -; int *results) -globalsym(vpx_sad8x8x3_sse3) -sym(vpx_sad8x8x3_sse3): - - STACK_FRAME_CREATE_X3 - - PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride - - mov rcx, result_ptr - - punpckldq mm5, mm6 - - movq [rcx], mm5 - movd [rcx+8], mm7 - - STACK_FRAME_DESTROY_X3 - -;void int vpx_sad4x4x3_sse3( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride, -; int *results) -globalsym(vpx_sad4x4x3_sse3) -sym(vpx_sad4x4x3_sse3): - - STACK_FRAME_CREATE_X3 - - movd mm0, DWORD PTR [src_ptr] - movd mm1, DWORD PTR [ref_ptr] - - movd mm2, DWORD PTR [src_ptr+src_stride] - movd mm3, DWORD PTR [ref_ptr+ref_stride] - - punpcklbw mm0, mm2 - punpcklbw mm1, mm3 - - movd mm4, DWORD PTR [ref_ptr+1] - movd mm5, DWORD PTR [ref_ptr+2] - - movd mm2, DWORD PTR [ref_ptr+ref_stride+1] - movd mm3, DWORD PTR [ref_ptr+ref_stride+2] - - psadbw mm1, mm0 - - punpcklbw mm4, mm2 - punpcklbw mm5, mm3 - - psadbw mm4, mm0 - psadbw mm5, mm0 - - lea src_ptr, [src_ptr+src_stride*2] - lea ref_ptr, [ref_ptr+ref_stride*2] - - movd mm0, DWORD PTR [src_ptr] - movd mm2, DWORD PTR [ref_ptr] - - movd mm3, DWORD PTR [src_ptr+src_stride] - movd mm6, DWORD PTR [ref_ptr+ref_stride] - - punpcklbw mm0, mm3 - punpcklbw mm2, mm6 - - movd mm3, DWORD PTR [ref_ptr+1] - movd mm7, DWORD PTR [ref_ptr+2] - - psadbw mm2, mm0 - - paddw mm1, mm2 - - movd mm2, DWORD PTR [ref_ptr+ref_stride+1] - movd mm6, DWORD PTR [ref_ptr+ref_stride+2] - - punpcklbw mm3, mm2 - punpcklbw mm7, mm6 - - psadbw mm3, mm0 - psadbw mm7, mm0 - - paddw mm3, mm4 - paddw mm7, mm5 - - mov rcx, result_ptr - - punpckldq mm1, mm3 - - movq [rcx], mm1 - movd [rcx+8], mm7 - - STACK_FRAME_DESTROY_X3 diff --git a/vpx_dsp/x86/sad_sse4.asm b/vpx_dsp/x86/sad_sse4.asm deleted file mode 100644 index 0818ed5f0a..0000000000 --- a/vpx_dsp/x86/sad_sse4.asm +++ /dev/null @@ -1,361 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" - -%macro PROCESS_16X2X8 1 -%if %1 - movdqa xmm0, XMMWORD PTR [rsi] - movq xmm1, MMWORD PTR [rdi] - movq xmm3, MMWORD PTR [rdi+8] - movq xmm2, MMWORD PTR [rdi+16] - punpcklqdq xmm1, xmm3 - punpcklqdq xmm3, xmm2 - - movdqa xmm2, xmm1 - mpsadbw xmm1, xmm0, 0x0 - mpsadbw xmm2, xmm0, 0x5 - - psrldq xmm0, 8 - - movdqa xmm4, xmm3 - mpsadbw xmm3, xmm0, 0x0 - mpsadbw xmm4, xmm0, 0x5 - - paddw xmm1, xmm2 - paddw xmm1, xmm3 - paddw xmm1, xmm4 -%else - movdqa xmm0, XMMWORD PTR [rsi] - movq xmm5, MMWORD PTR [rdi] - movq xmm3, MMWORD PTR [rdi+8] - movq xmm2, MMWORD PTR [rdi+16] - punpcklqdq xmm5, xmm3 - punpcklqdq xmm3, xmm2 - - movdqa xmm2, xmm5 - mpsadbw xmm5, xmm0, 0x0 - mpsadbw xmm2, xmm0, 0x5 - - psrldq xmm0, 8 - - movdqa xmm4, xmm3 - mpsadbw xmm3, xmm0, 0x0 - mpsadbw xmm4, xmm0, 0x5 - - paddw xmm5, xmm2 - paddw xmm5, xmm3 - paddw xmm5, xmm4 - - paddw xmm1, xmm5 -%endif - movdqa xmm0, XMMWORD PTR [rsi + rax] - movq xmm5, MMWORD PTR [rdi+ rdx] - movq xmm3, MMWORD PTR [rdi+ rdx+8] - movq xmm2, MMWORD PTR [rdi+ rdx+16] - punpcklqdq xmm5, xmm3 - punpcklqdq xmm3, xmm2 - - lea rsi, [rsi+rax*2] - lea rdi, [rdi+rdx*2] - - movdqa xmm2, xmm5 - mpsadbw xmm5, xmm0, 0x0 - mpsadbw xmm2, xmm0, 0x5 - - psrldq xmm0, 8 - movdqa xmm4, xmm3 - mpsadbw xmm3, xmm0, 0x0 - mpsadbw xmm4, xmm0, 0x5 - - paddw xmm5, xmm2 - paddw xmm5, xmm3 - paddw xmm5, xmm4 - - paddw xmm1, xmm5 -%endmacro - -%macro PROCESS_8X2X8 1 -%if %1 - movq xmm0, MMWORD PTR [rsi] - movq xmm1, MMWORD PTR [rdi] - movq xmm3, MMWORD PTR [rdi+8] - punpcklqdq xmm1, xmm3 - - movdqa xmm2, xmm1 - mpsadbw xmm1, xmm0, 0x0 - mpsadbw xmm2, xmm0, 0x5 - paddw xmm1, xmm2 -%else - movq xmm0, MMWORD PTR [rsi] - movq xmm5, MMWORD PTR [rdi] - movq xmm3, MMWORD PTR [rdi+8] - punpcklqdq xmm5, xmm3 - - movdqa xmm2, xmm5 - mpsadbw xmm5, xmm0, 0x0 - mpsadbw xmm2, xmm0, 0x5 - paddw xmm5, xmm2 - - paddw xmm1, xmm5 -%endif - movq xmm0, MMWORD PTR [rsi + rax] - movq xmm5, MMWORD PTR [rdi+ rdx] - movq xmm3, MMWORD PTR [rdi+ rdx+8] - punpcklqdq xmm5, xmm3 - - lea rsi, [rsi+rax*2] - lea rdi, [rdi+rdx*2] - - movdqa xmm2, xmm5 - mpsadbw xmm5, xmm0, 0x0 - mpsadbw xmm2, xmm0, 0x5 - paddw xmm5, xmm2 - - paddw xmm1, xmm5 -%endmacro - -%macro PROCESS_4X2X8 1 -%if %1 - movd xmm0, [rsi] - movq xmm1, MMWORD PTR [rdi] - movq xmm3, MMWORD PTR [rdi+8] - punpcklqdq xmm1, xmm3 - - mpsadbw xmm1, xmm0, 0x0 -%else - movd xmm0, [rsi] - movq xmm5, MMWORD PTR [rdi] - movq xmm3, MMWORD PTR [rdi+8] - punpcklqdq xmm5, xmm3 - - mpsadbw xmm5, xmm0, 0x0 - - paddw xmm1, xmm5 -%endif - movd xmm0, [rsi + rax] - movq xmm5, MMWORD PTR [rdi+ rdx] - movq xmm3, MMWORD PTR [rdi+ rdx+8] - punpcklqdq xmm5, xmm3 - - lea rsi, [rsi+rax*2] - lea rdi, [rdi+rdx*2] - - mpsadbw xmm5, xmm0, 0x0 - - paddw xmm1, xmm5 -%endmacro - -%macro WRITE_AS_INTS 0 - mov rdi, arg(4) ;Results - pxor xmm0, xmm0 - movdqa xmm2, xmm1 - punpcklwd xmm1, xmm0 - punpckhwd xmm2, xmm0 - - movdqa [rdi], xmm1 - movdqa [rdi + 16], xmm2 -%endmacro - -SECTION .text - -;void vpx_sad16x16x8_sse4_1( -; const unsigned char *src_ptr, -; int src_stride, -; const unsigned char *ref_ptr, -; int ref_stride, -; unsigned short *sad_array); -globalsym(vpx_sad16x16x8_sse4_1) -sym(vpx_sad16x16x8_sse4_1): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr - - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride - - PROCESS_16X2X8 1 - PROCESS_16X2X8 0 - PROCESS_16X2X8 0 - PROCESS_16X2X8 0 - PROCESS_16X2X8 0 - PROCESS_16X2X8 0 - PROCESS_16X2X8 0 - PROCESS_16X2X8 0 - - WRITE_AS_INTS - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - - -;void vpx_sad16x8x8_sse4_1( -; const unsigned char *src_ptr, -; int src_stride, -; const unsigned char *ref_ptr, -; int ref_stride, -; unsigned short *sad_array -;); -globalsym(vpx_sad16x8x8_sse4_1) -sym(vpx_sad16x8x8_sse4_1): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr - - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride - - PROCESS_16X2X8 1 - PROCESS_16X2X8 0 - PROCESS_16X2X8 0 - PROCESS_16X2X8 0 - - WRITE_AS_INTS - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - - -;void vpx_sad8x8x8_sse4_1( -; const unsigned char *src_ptr, -; int src_stride, -; const unsigned char *ref_ptr, -; int ref_stride, -; unsigned short *sad_array -;); -globalsym(vpx_sad8x8x8_sse4_1) -sym(vpx_sad8x8x8_sse4_1): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr - - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride - - PROCESS_8X2X8 1 - PROCESS_8X2X8 0 - PROCESS_8X2X8 0 - PROCESS_8X2X8 0 - - WRITE_AS_INTS - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - - -;void vpx_sad8x16x8_sse4_1( -; const unsigned char *src_ptr, -; int src_stride, -; const unsigned char *ref_ptr, -; int ref_stride, -; unsigned short *sad_array -;); -globalsym(vpx_sad8x16x8_sse4_1) -sym(vpx_sad8x16x8_sse4_1): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr - - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride - - PROCESS_8X2X8 1 - PROCESS_8X2X8 0 - PROCESS_8X2X8 0 - PROCESS_8X2X8 0 - PROCESS_8X2X8 0 - PROCESS_8X2X8 0 - PROCESS_8X2X8 0 - PROCESS_8X2X8 0 - - WRITE_AS_INTS - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - - -;void vpx_sad4x4x8_sse4_1( -; const unsigned char *src_ptr, -; int src_stride, -; const unsigned char *ref_ptr, -; int ref_stride, -; unsigned short *sad_array -;); -globalsym(vpx_sad4x4x8_sse4_1) -sym(vpx_sad4x4x8_sse4_1): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr - - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride - - PROCESS_4X2X8 1 - PROCESS_4X2X8 0 - - WRITE_AS_INTS - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - - - - diff --git a/vpx_dsp/x86/sad_ssse3.asm b/vpx_dsp/x86/sad_ssse3.asm deleted file mode 100644 index a5bc6d7306..0000000000 --- a/vpx_dsp/x86/sad_ssse3.asm +++ /dev/null @@ -1,372 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" - -%macro PROCESS_16X2X3 1 -%if %1 - movdqa xmm0, XMMWORD PTR [rsi] - lddqu xmm5, XMMWORD PTR [rdi] - lddqu xmm6, XMMWORD PTR [rdi+1] - lddqu xmm7, XMMWORD PTR [rdi+2] - - psadbw xmm5, xmm0 - psadbw xmm6, xmm0 - psadbw xmm7, xmm0 -%else - movdqa xmm0, XMMWORD PTR [rsi] - lddqu xmm1, XMMWORD PTR [rdi] - lddqu xmm2, XMMWORD PTR [rdi+1] - lddqu xmm3, XMMWORD PTR [rdi+2] - - psadbw xmm1, xmm0 - psadbw xmm2, xmm0 - psadbw xmm3, xmm0 - - paddw xmm5, xmm1 - paddw xmm6, xmm2 - paddw xmm7, xmm3 -%endif - movdqa xmm0, XMMWORD PTR [rsi+rax] - lddqu xmm1, XMMWORD PTR [rdi+rdx] - lddqu xmm2, XMMWORD PTR [rdi+rdx+1] - lddqu xmm3, XMMWORD PTR [rdi+rdx+2] - - lea rsi, [rsi+rax*2] - lea rdi, [rdi+rdx*2] - - psadbw xmm1, xmm0 - psadbw xmm2, xmm0 - psadbw xmm3, xmm0 - - paddw xmm5, xmm1 - paddw xmm6, xmm2 - paddw xmm7, xmm3 -%endmacro - -%macro PROCESS_16X2X3_OFFSET 2 -%if %1 - movdqa xmm0, XMMWORD PTR [rsi] - movdqa xmm4, XMMWORD PTR [rdi] - movdqa xmm7, XMMWORD PTR [rdi+16] - - movdqa xmm5, xmm7 - palignr xmm5, xmm4, %2 - - movdqa xmm6, xmm7 - palignr xmm6, xmm4, (%2+1) - - palignr xmm7, xmm4, (%2+2) - - psadbw xmm5, xmm0 - psadbw xmm6, xmm0 - psadbw xmm7, xmm0 -%else - movdqa xmm0, XMMWORD PTR [rsi] - movdqa xmm4, XMMWORD PTR [rdi] - movdqa xmm3, XMMWORD PTR [rdi+16] - - movdqa xmm1, xmm3 - palignr xmm1, xmm4, %2 - - movdqa xmm2, xmm3 - palignr xmm2, xmm4, (%2+1) - - palignr xmm3, xmm4, (%2+2) - - psadbw xmm1, xmm0 - psadbw xmm2, xmm0 - psadbw xmm3, xmm0 - - paddw xmm5, xmm1 - paddw xmm6, xmm2 - paddw xmm7, xmm3 -%endif - movdqa xmm0, XMMWORD PTR [rsi+rax] - movdqa xmm4, XMMWORD PTR [rdi+rdx] - movdqa xmm3, XMMWORD PTR [rdi+rdx+16] - - movdqa xmm1, xmm3 - palignr xmm1, xmm4, %2 - - movdqa xmm2, xmm3 - palignr xmm2, xmm4, (%2+1) - - palignr xmm3, xmm4, (%2+2) - - lea rsi, [rsi+rax*2] - lea rdi, [rdi+rdx*2] - - psadbw xmm1, xmm0 - psadbw xmm2, xmm0 - psadbw xmm3, xmm0 - - paddw xmm5, xmm1 - paddw xmm6, xmm2 - paddw xmm7, xmm3 -%endmacro - -%macro PROCESS_16X16X3_OFFSET 2 -%2_aligned_by_%1: - - sub rdi, %1 - - PROCESS_16X2X3_OFFSET 1, %1 - PROCESS_16X2X3_OFFSET 0, %1 - PROCESS_16X2X3_OFFSET 0, %1 - PROCESS_16X2X3_OFFSET 0, %1 - PROCESS_16X2X3_OFFSET 0, %1 - PROCESS_16X2X3_OFFSET 0, %1 - PROCESS_16X2X3_OFFSET 0, %1 - PROCESS_16X2X3_OFFSET 0, %1 - - jmp %2_store_off - -%endmacro - -%macro PROCESS_16X8X3_OFFSET 2 -%2_aligned_by_%1: - - sub rdi, %1 - - PROCESS_16X2X3_OFFSET 1, %1 - PROCESS_16X2X3_OFFSET 0, %1 - PROCESS_16X2X3_OFFSET 0, %1 - PROCESS_16X2X3_OFFSET 0, %1 - - jmp %2_store_off - -%endmacro - -SECTION .text - -;void int vpx_sad16x16x3_ssse3( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride, -; int *results) -globalsym(vpx_sad16x16x3_ssse3) -sym(vpx_sad16x16x3_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - SAVE_XMM 7 - push rsi - push rdi - push rcx - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr - - mov rdx, 0xf - and rdx, rdi - - jmp .vpx_sad16x16x3_ssse3_skiptable -.vpx_sad16x16x3_ssse3_jumptable: - dd .vpx_sad16x16x3_ssse3_aligned_by_0 - .vpx_sad16x16x3_ssse3_do_jump - dd .vpx_sad16x16x3_ssse3_aligned_by_1 - .vpx_sad16x16x3_ssse3_do_jump - dd .vpx_sad16x16x3_ssse3_aligned_by_2 - .vpx_sad16x16x3_ssse3_do_jump - dd .vpx_sad16x16x3_ssse3_aligned_by_3 - .vpx_sad16x16x3_ssse3_do_jump - dd .vpx_sad16x16x3_ssse3_aligned_by_4 - .vpx_sad16x16x3_ssse3_do_jump - dd .vpx_sad16x16x3_ssse3_aligned_by_5 - .vpx_sad16x16x3_ssse3_do_jump - dd .vpx_sad16x16x3_ssse3_aligned_by_6 - .vpx_sad16x16x3_ssse3_do_jump - dd .vpx_sad16x16x3_ssse3_aligned_by_7 - .vpx_sad16x16x3_ssse3_do_jump - dd .vpx_sad16x16x3_ssse3_aligned_by_8 - .vpx_sad16x16x3_ssse3_do_jump - dd .vpx_sad16x16x3_ssse3_aligned_by_9 - .vpx_sad16x16x3_ssse3_do_jump - dd .vpx_sad16x16x3_ssse3_aligned_by_10 - .vpx_sad16x16x3_ssse3_do_jump - dd .vpx_sad16x16x3_ssse3_aligned_by_11 - .vpx_sad16x16x3_ssse3_do_jump - dd .vpx_sad16x16x3_ssse3_aligned_by_12 - .vpx_sad16x16x3_ssse3_do_jump - dd .vpx_sad16x16x3_ssse3_aligned_by_13 - .vpx_sad16x16x3_ssse3_do_jump - dd .vpx_sad16x16x3_ssse3_aligned_by_14 - .vpx_sad16x16x3_ssse3_do_jump - dd .vpx_sad16x16x3_ssse3_aligned_by_15 - .vpx_sad16x16x3_ssse3_do_jump -.vpx_sad16x16x3_ssse3_skiptable: - - call .vpx_sad16x16x3_ssse3_do_jump -.vpx_sad16x16x3_ssse3_do_jump: - pop rcx ; get the address of do_jump - mov rax, .vpx_sad16x16x3_ssse3_jumptable - .vpx_sad16x16x3_ssse3_do_jump - add rax, rcx ; get the absolute address of vpx_sad16x16x3_ssse3_jumptable - - movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable - add rcx, rax - - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride - - jmp rcx - - PROCESS_16X16X3_OFFSET 0, .vpx_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 1, .vpx_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 2, .vpx_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 3, .vpx_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 4, .vpx_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 5, .vpx_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 6, .vpx_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 7, .vpx_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 8, .vpx_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 9, .vpx_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 10, .vpx_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 11, .vpx_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 12, .vpx_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 13, .vpx_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 14, .vpx_sad16x16x3_ssse3 - -.vpx_sad16x16x3_ssse3_aligned_by_15: - PROCESS_16X2X3 1 - PROCESS_16X2X3 0 - PROCESS_16X2X3 0 - PROCESS_16X2X3 0 - PROCESS_16X2X3 0 - PROCESS_16X2X3 0 - PROCESS_16X2X3 0 - PROCESS_16X2X3 0 - -.vpx_sad16x16x3_ssse3_store_off: - mov rdi, arg(4) ;Results - - movq xmm0, xmm5 - psrldq xmm5, 8 - - paddw xmm0, xmm5 - movd [rdi], xmm0 -;- - movq xmm0, xmm6 - psrldq xmm6, 8 - - paddw xmm0, xmm6 - movd [rdi+4], xmm0 -;- - movq xmm0, xmm7 - psrldq xmm7, 8 - - paddw xmm0, xmm7 - movd [rdi+8], xmm0 - - ; begin epilog - pop rcx - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void int vpx_sad16x8x3_ssse3( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride, -; int *results) -globalsym(vpx_sad16x8x3_ssse3) -sym(vpx_sad16x8x3_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - SAVE_XMM 7 - push rsi - push rdi - push rcx - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr - - mov rdx, 0xf - and rdx, rdi - - jmp .vpx_sad16x8x3_ssse3_skiptable -.vpx_sad16x8x3_ssse3_jumptable: - dd .vpx_sad16x8x3_ssse3_aligned_by_0 - .vpx_sad16x8x3_ssse3_do_jump - dd .vpx_sad16x8x3_ssse3_aligned_by_1 - .vpx_sad16x8x3_ssse3_do_jump - dd .vpx_sad16x8x3_ssse3_aligned_by_2 - .vpx_sad16x8x3_ssse3_do_jump - dd .vpx_sad16x8x3_ssse3_aligned_by_3 - .vpx_sad16x8x3_ssse3_do_jump - dd .vpx_sad16x8x3_ssse3_aligned_by_4 - .vpx_sad16x8x3_ssse3_do_jump - dd .vpx_sad16x8x3_ssse3_aligned_by_5 - .vpx_sad16x8x3_ssse3_do_jump - dd .vpx_sad16x8x3_ssse3_aligned_by_6 - .vpx_sad16x8x3_ssse3_do_jump - dd .vpx_sad16x8x3_ssse3_aligned_by_7 - .vpx_sad16x8x3_ssse3_do_jump - dd .vpx_sad16x8x3_ssse3_aligned_by_8 - .vpx_sad16x8x3_ssse3_do_jump - dd .vpx_sad16x8x3_ssse3_aligned_by_9 - .vpx_sad16x8x3_ssse3_do_jump - dd .vpx_sad16x8x3_ssse3_aligned_by_10 - .vpx_sad16x8x3_ssse3_do_jump - dd .vpx_sad16x8x3_ssse3_aligned_by_11 - .vpx_sad16x8x3_ssse3_do_jump - dd .vpx_sad16x8x3_ssse3_aligned_by_12 - .vpx_sad16x8x3_ssse3_do_jump - dd .vpx_sad16x8x3_ssse3_aligned_by_13 - .vpx_sad16x8x3_ssse3_do_jump - dd .vpx_sad16x8x3_ssse3_aligned_by_14 - .vpx_sad16x8x3_ssse3_do_jump - dd .vpx_sad16x8x3_ssse3_aligned_by_15 - .vpx_sad16x8x3_ssse3_do_jump -.vpx_sad16x8x3_ssse3_skiptable: - - call .vpx_sad16x8x3_ssse3_do_jump -.vpx_sad16x8x3_ssse3_do_jump: - pop rcx ; get the address of do_jump - mov rax, .vpx_sad16x8x3_ssse3_jumptable - .vpx_sad16x8x3_ssse3_do_jump - add rax, rcx ; get the absolute address of vpx_sad16x8x3_ssse3_jumptable - - movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable - add rcx, rax - - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride - - jmp rcx - - PROCESS_16X8X3_OFFSET 0, .vpx_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 1, .vpx_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 2, .vpx_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 3, .vpx_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 4, .vpx_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 5, .vpx_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 6, .vpx_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 7, .vpx_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 8, .vpx_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 9, .vpx_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 10, .vpx_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 11, .vpx_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 12, .vpx_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 13, .vpx_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 14, .vpx_sad16x8x3_ssse3 - -.vpx_sad16x8x3_ssse3_aligned_by_15: - - PROCESS_16X2X3 1 - PROCESS_16X2X3 0 - PROCESS_16X2X3 0 - PROCESS_16X2X3 0 - -.vpx_sad16x8x3_ssse3_store_off: - mov rdi, arg(4) ;Results - - movq xmm0, xmm5 - psrldq xmm5, 8 - - paddw xmm0, xmm5 - movd [rdi], xmm0 -;- - movq xmm0, xmm6 - psrldq xmm6, 8 - - paddw xmm0, xmm6 - movd [rdi+4], xmm0 -;- - movq xmm0, xmm7 - psrldq xmm7, 8 - - paddw xmm0, xmm7 - movd [rdi+8], xmm0 - - ; begin epilog - pop rcx - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret From 02808ecbccf4fa385a700cffdd1aac796f6f37ca Mon Sep 17 00:00:00 2001 From: Johann Date: Tue, 29 Mar 2022 12:40:12 +0900 Subject: [PATCH 0304/1000] remove skip_block from quantize Whether a block is skipped is handled by mi->skip. x->skip_block is kept exclusively to verify that the quantize functions are not called for skip blocks. Finishes the cleanup in 13eed991f Bug: libvpx:1612 Change-Id: I1598c3b682d3c5e6c57a15fa4cb5df2c65b3a58a --- test/vp9_quantize_test.cc | 89 ++++----- vp9/common/vp9_rtcd_defs.pl | 8 +- vp9/encoder/arm/neon/vp9_quantize_neon.c | 15 +- vp9/encoder/ppc/vp9_quantize_vsx.c | 15 +- vp9/encoder/vp9_encodemb.c | 187 ++++++++---------- vp9/encoder/vp9_encoder.c | 19 +- vp9/encoder/vp9_pickmode.c | 15 +- vp9/encoder/vp9_quantize.c | 45 ++--- vp9/encoder/x86/vp9_quantize_avx2.c | 11 +- vp9/encoder/x86/vp9_quantize_sse2.c | 11 +- vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm | 4 +- vpx_dsp/arm/quantize_neon.c | 10 +- vpx_dsp/ppc/quantize_vsx.c | 22 +-- vpx_dsp/quantize.c | 78 ++++---- vpx_dsp/quantize.h | 13 +- vpx_dsp/vpx_dsp_rtcd_defs.pl | 8 +- vpx_dsp/x86/highbd_quantize_intrin_sse2.c | 10 +- vpx_dsp/x86/quantize_avx.c | 11 +- vpx_dsp/x86/quantize_sse2.c | 6 +- vpx_dsp/x86/quantize_ssse3.c | 10 +- 20 files changed, 252 insertions(+), 335 deletions(-) diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc index cb4481b103..d54f1bc9cd 100644 --- a/test/vp9_quantize_test.cc +++ b/test/vp9_quantize_test.cc @@ -38,26 +38,24 @@ namespace { const int number_of_iterations = 100; typedef void (*QuantizeFunc)(const tran_low_t *coeff, intptr_t count, - int skip_block, const int16_t *zbin, - const int16_t *round, const int16_t *quant, - const int16_t *quant_shift, tran_low_t *qcoeff, - tran_low_t *dqcoeff, const int16_t *dequant, - uint16_t *eob, const int16_t *scan, - const int16_t *iscan); + const int16_t *zbin, const int16_t *round, + const int16_t *quant, const int16_t *quant_shift, + tran_low_t *qcoeff, tran_low_t *dqcoeff, + const int16_t *dequant, uint16_t *eob, + const int16_t *scan, const int16_t *iscan); typedef std::tuple QuantizeParam; // Wrapper for FP version which does not use zbin or quant_shift. typedef void (*QuantizeFPFunc)(const tran_low_t *coeff, intptr_t count, - int skip_block, const int16_t *round, - const int16_t *quant, tran_low_t *qcoeff, - tran_low_t *dqcoeff, const int16_t *dequant, - uint16_t *eob, const int16_t *scan, - const int16_t *iscan); + const int16_t *round, const int16_t *quant, + tran_low_t *qcoeff, tran_low_t *dqcoeff, + const int16_t *dequant, uint16_t *eob, + const int16_t *scan, const int16_t *iscan); template -void QuantFPWrapper(const tran_low_t *coeff, intptr_t count, int skip_block, +void QuantFPWrapper(const tran_low_t *coeff, intptr_t count, const int16_t *zbin, const int16_t *round, const int16_t *quant, const int16_t *quant_shift, tran_low_t *qcoeff, tran_low_t *dqcoeff, @@ -66,8 +64,7 @@ void QuantFPWrapper(const tran_low_t *coeff, intptr_t count, int skip_block, (void)zbin; (void)quant_shift; - fn(coeff, count, skip_block, round, quant, qcoeff, dqcoeff, dequant, eob, - scan, iscan); + fn(coeff, count, round, quant, qcoeff, dqcoeff, dequant, eob, scan, iscan); } class VP9QuantizeBase : public AbstractBench { @@ -138,7 +135,6 @@ class VP9QuantizeBase : public AbstractBench { int16_t *r_ptr_; int16_t *q_ptr_; int count_; - int skip_block_; const scan_order *scan_; uint16_t eob_; }; @@ -157,8 +153,8 @@ class VP9QuantizeTest : public VP9QuantizeBase, }; void VP9QuantizeTest::Run() { - quantize_op_(coeff_.TopLeftPixel(), count_, skip_block_, zbin_ptr_, r_ptr_, - q_ptr_, quant_shift_ptr_, qcoeff_.TopLeftPixel(), + quantize_op_(coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_, + quant_shift_ptr_, qcoeff_.TopLeftPixel(), dqcoeff_.TopLeftPixel(), dequant_ptr_, &eob_, scan_->scan, scan_->iscan); } @@ -167,16 +163,14 @@ void VP9QuantizeTest::Run() { // determine if further multiplication operations are needed. // Based on vp9_quantize_fp_sse2(). inline void quant_fp_nz(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *round_ptr, - const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan, - const int16_t *iscan, int is_32x32) { + const int16_t *round_ptr, const int16_t *quant_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan, + int is_32x32) { int i, eob = -1; const int thr = dequant_ptr[1] >> (1 + is_32x32); (void)iscan; - (void)skip_block; - assert(!skip_block); // Quantization pass: All coefficients with index >= zero_flag are // skippable. Note: zero_flag can be zero. @@ -243,22 +237,20 @@ inline void quant_fp_nz(const tran_low_t *coeff_ptr, intptr_t n_coeffs, } void quantize_fp_nz_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *round_ptr, - const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan, - const int16_t *iscan) { - quant_fp_nz(coeff_ptr, n_coeffs, skip_block, round_ptr, quant_ptr, qcoeff_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + quant_fp_nz(coeff_ptr, n_coeffs, round_ptr, quant_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, 0); } void quantize_fp_32x32_nz_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *round_ptr, - const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan, - const int16_t *iscan) { - quant_fp_nz(coeff_ptr, n_coeffs, skip_block, round_ptr, quant_ptr, qcoeff_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + quant_fp_nz(coeff_ptr, n_coeffs, round_ptr, quant_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, 1); } @@ -316,9 +308,6 @@ TEST_P(VP9QuantizeTest, OperationCheck) { eob_ = 0; for (int i = 0; i < number_of_iterations; ++i) { - // Test skip block for the first three iterations to catch all the different - // sizes. - const int skip_block = 0; TX_SIZE sz; if (max_size_ == 16) { sz = static_cast(i % 3); // TX_4X4, TX_8X8 TX_16X16 @@ -332,13 +321,13 @@ TEST_P(VP9QuantizeTest, OperationCheck) { GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_, quant_shift_ptr_, dequant_ptr_, round_fp_ptr_, quant_fp_ptr_); - ref_quantize_op_(coeff_.TopLeftPixel(), count_, skip_block, zbin_ptr_, - r_ptr_, q_ptr_, quant_shift_ptr_, - ref_qcoeff.TopLeftPixel(), ref_dqcoeff.TopLeftPixel(), - dequant_ptr_, &ref_eob, scan_->scan, scan_->iscan); + ref_quantize_op_(coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_, + quant_shift_ptr_, ref_qcoeff.TopLeftPixel(), + ref_dqcoeff.TopLeftPixel(), dequant_ptr_, &ref_eob, + scan_->scan, scan_->iscan); ASM_REGISTER_STATE_CHECK(quantize_op_( - coeff_.TopLeftPixel(), count_, skip_block, zbin_ptr_, r_ptr_, q_ptr_, + coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_, quant_shift_ptr_, qcoeff_.TopLeftPixel(), dqcoeff_.TopLeftPixel(), dequant_ptr_, &eob_, scan_->scan, scan_->iscan)); @@ -372,7 +361,6 @@ TEST_P(VP9QuantizeTest, EOBCheck) { const uint32_t max_index = max_size_ * max_size_ - 1; for (int i = 0; i < number_of_iterations; ++i) { - skip_block_ = 0; TX_SIZE sz; if (max_size_ == 16) { sz = static_cast(i % 3); // TX_4X4, TX_8X8 TX_16X16 @@ -391,13 +379,13 @@ TEST_P(VP9QuantizeTest, EOBCheck) { GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_, quant_shift_ptr_, dequant_ptr_, round_fp_ptr_, quant_fp_ptr_); - ref_quantize_op_(coeff_.TopLeftPixel(), count_, skip_block_, zbin_ptr_, - r_ptr_, q_ptr_, quant_shift_ptr_, - ref_qcoeff.TopLeftPixel(), ref_dqcoeff.TopLeftPixel(), - dequant_ptr_, &ref_eob, scan_->scan, scan_->iscan); + ref_quantize_op_(coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_, + quant_shift_ptr_, ref_qcoeff.TopLeftPixel(), + ref_dqcoeff.TopLeftPixel(), dequant_ptr_, &ref_eob, + scan_->scan, scan_->iscan); ASM_REGISTER_STATE_CHECK(quantize_op_( - coeff_.TopLeftPixel(), count_, skip_block_, zbin_ptr_, r_ptr_, q_ptr_, + coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_, quant_shift_ptr_, qcoeff_.TopLeftPixel(), dqcoeff_.TopLeftPixel(), dequant_ptr_, &eob_, scan_->scan, scan_->iscan)); @@ -433,7 +421,6 @@ TEST_P(VP9QuantizeTest, DISABLED_Speed) { for (TX_SIZE sz = starting_sz; sz <= ending_sz; ++sz) { // zbin > coeff, zbin < coeff. for (int i = 0; i < 2; ++i) { - skip_block_ = 0; // TX_TYPE defines the scan order. That is not relevant to the speed test. // Pick the first one. const TX_TYPE tx_type = DCT_DCT; diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index 6980b9b7fb..5146121a8d 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -128,10 +128,10 @@ () add_proto qw/int64_t vp9_block_error_fp/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size"; -add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; +add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vp9_quantize_fp neon sse2 avx2 vsx/, "$ssse3_x86_64"; -add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; +add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vp9_quantize_fp_32x32 neon vsx/, "$ssse3_x86_64"; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { @@ -195,9 +195,9 @@ () # ENCODEMB INVOKE - add_proto qw/void vp9_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + add_proto qw/void vp9_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; - add_proto qw/void vp9_highbd_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan" ; + add_proto qw/void vp9_highbd_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan" ; # fdct functions add_proto qw/void vp9_highbd_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; diff --git a/vp9/encoder/arm/neon/vp9_quantize_neon.c b/vp9/encoder/arm/neon/vp9_quantize_neon.c index d75a481796..236c3176c7 100644 --- a/vp9/encoder/arm/neon/vp9_quantize_neon.c +++ b/vp9/encoder/arm/neon/vp9_quantize_neon.c @@ -43,11 +43,10 @@ static INLINE void calculate_dqcoeff_and_store(const int16x8_t qcoeff, } void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t count, - int skip_block, const int16_t *round_ptr, - const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan, - const int16_t *iscan) { + const int16_t *round_ptr, const int16_t *quant_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { // Quantization pass: All coefficients with index >= zero_flag are // skippable. Note: zero_flag can be zero. int i; @@ -59,8 +58,6 @@ void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t count, int16x8_t v_dequant = vmovq_n_s16(dequant_ptr[1]); (void)scan; - (void)skip_block; - assert(!skip_block); // adjust for dc v_round = vsetq_lane_s16(round_ptr[0], v_round, 0); @@ -138,7 +135,7 @@ static INLINE int32x4_t extract_sign_bit(int32x4_t a) { } void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t count, - int skip_block, const int16_t *round_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, @@ -167,8 +164,6 @@ void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t count, uint16x8_t eob_max; (void)scan; (void)count; - (void)skip_block; - assert(!skip_block); // coeff * quant_ptr[]) >> 15 qcoeff = vqdmulhq_s16(qcoeff, quant); diff --git a/vp9/encoder/ppc/vp9_quantize_vsx.c b/vp9/encoder/ppc/vp9_quantize_vsx.c index 4f88b8fff6..4d31558471 100644 --- a/vp9/encoder/ppc/vp9_quantize_vsx.c +++ b/vp9/encoder/ppc/vp9_quantize_vsx.c @@ -39,11 +39,10 @@ static INLINE int16x8_t vec_max_across(int16x8_t a) { } void vp9_quantize_fp_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *round_ptr, - const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan, - const int16_t *iscan) { + const int16_t *round_ptr, const int16_t *quant_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { int16x8_t qcoeff0, qcoeff1, dqcoeff0, dqcoeff1, eob; bool16x8_t zero_coeff0, zero_coeff1; @@ -56,8 +55,6 @@ void vp9_quantize_fp_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int16x8_t scan1 = vec_vsx_ld(16, iscan); (void)scan; - (void)skip_block; - assert(!skip_block); // First set of 8 coeff starts with DC + 7 AC qcoeff0 = vec_mulhi(vec_vaddshs(vec_abs(coeff0), round), quant); @@ -165,7 +162,7 @@ static INLINE int16x8_t dequantize_coeff_32(int16x8_t qcoeff, } void vp9_quantize_fp_32x32_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *round_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, @@ -194,9 +191,7 @@ void vp9_quantize_fp_32x32_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int16x8_t abs_coeff1 = vec_abs(coeff1); (void)scan; - (void)skip_block; (void)n_coeffs; - assert(!skip_block); mask0 = vec_cmpge(abs_coeff0, thres); round = vec_sra(vec_add(round, vec_ones_s16), vec_ones_u16); diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c index 7630a81103..fa222f9dcf 100644 --- a/vp9/encoder/vp9_encodemb.c +++ b/vp9/encoder/vp9_encodemb.c @@ -366,28 +366,28 @@ void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block, int row, int col, switch (tx_size) { case TX_32X32: highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); - vp9_highbd_quantize_fp_32x32(coeff, 1024, x->skip_block, p->round_fp, - p->quant_fp, qcoeff, dqcoeff, pd->dequant, - eob, scan_order->scan, scan_order->iscan); + vp9_highbd_quantize_fp_32x32(coeff, 1024, p->round_fp, p->quant_fp, + qcoeff, dqcoeff, pd->dequant, eob, + scan_order->scan, scan_order->iscan); break; case TX_16X16: vpx_highbd_fdct16x16(src_diff, coeff, diff_stride); - vp9_highbd_quantize_fp(coeff, 256, x->skip_block, p->round_fp, - p->quant_fp, qcoeff, dqcoeff, pd->dequant, eob, - scan_order->scan, scan_order->iscan); + vp9_highbd_quantize_fp(coeff, 256, p->round_fp, p->quant_fp, qcoeff, + dqcoeff, pd->dequant, eob, scan_order->scan, + scan_order->iscan); break; case TX_8X8: vpx_highbd_fdct8x8(src_diff, coeff, diff_stride); - vp9_highbd_quantize_fp(coeff, 64, x->skip_block, p->round_fp, - p->quant_fp, qcoeff, dqcoeff, pd->dequant, eob, - scan_order->scan, scan_order->iscan); + vp9_highbd_quantize_fp(coeff, 64, p->round_fp, p->quant_fp, qcoeff, + dqcoeff, pd->dequant, eob, scan_order->scan, + scan_order->iscan); break; default: assert(tx_size == TX_4X4); x->fwd_txfm4x4(src_diff, coeff, diff_stride); - vp9_highbd_quantize_fp(coeff, 16, x->skip_block, p->round_fp, - p->quant_fp, qcoeff, dqcoeff, pd->dequant, eob, - scan_order->scan, scan_order->iscan); + vp9_highbd_quantize_fp(coeff, 16, p->round_fp, p->quant_fp, qcoeff, + dqcoeff, pd->dequant, eob, scan_order->scan, + scan_order->iscan); break; } return; @@ -397,29 +397,26 @@ void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block, int row, int col, switch (tx_size) { case TX_32X32: fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); - vp9_quantize_fp_32x32(coeff, 1024, x->skip_block, p->round_fp, - p->quant_fp, qcoeff, dqcoeff, pd->dequant, eob, - scan_order->scan, scan_order->iscan); + vp9_quantize_fp_32x32(coeff, 1024, p->round_fp, p->quant_fp, qcoeff, + dqcoeff, pd->dequant, eob, scan_order->scan, + scan_order->iscan); break; case TX_16X16: vpx_fdct16x16(src_diff, coeff, diff_stride); - vp9_quantize_fp(coeff, 256, x->skip_block, p->round_fp, p->quant_fp, - qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vp9_quantize_fp(coeff, 256, p->round_fp, p->quant_fp, qcoeff, dqcoeff, + pd->dequant, eob, scan_order->scan, scan_order->iscan); break; case TX_8X8: vpx_fdct8x8(src_diff, coeff, diff_stride); - vp9_quantize_fp(coeff, 64, x->skip_block, p->round_fp, p->quant_fp, - qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vp9_quantize_fp(coeff, 64, p->round_fp, p->quant_fp, qcoeff, dqcoeff, + pd->dequant, eob, scan_order->scan, scan_order->iscan); break; default: assert(tx_size == TX_4X4); x->fwd_txfm4x4(src_diff, coeff, diff_stride); - vp9_quantize_fp(coeff, 16, x->skip_block, p->round_fp, p->quant_fp, - qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vp9_quantize_fp(coeff, 16, p->round_fp, p->quant_fp, qcoeff, dqcoeff, + pd->dequant, eob, scan_order->scan, scan_order->iscan); break; } } @@ -444,28 +441,24 @@ void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block, int row, int col, switch (tx_size) { case TX_32X32: vpx_highbd_fdct32x32_1(src_diff, coeff, diff_stride); - vpx_highbd_quantize_dc_32x32(coeff, x->skip_block, p->round, - p->quant_fp[0], qcoeff, dqcoeff, - pd->dequant[0], eob); + vpx_highbd_quantize_dc_32x32(coeff, p->round, p->quant_fp[0], qcoeff, + dqcoeff, pd->dequant[0], eob); break; case TX_16X16: vpx_highbd_fdct16x16_1(src_diff, coeff, diff_stride); - vpx_highbd_quantize_dc(coeff, 256, x->skip_block, p->round, - p->quant_fp[0], qcoeff, dqcoeff, pd->dequant[0], - eob); + vpx_highbd_quantize_dc(coeff, 256, p->round, p->quant_fp[0], qcoeff, + dqcoeff, pd->dequant[0], eob); break; case TX_8X8: vpx_highbd_fdct8x8_1(src_diff, coeff, diff_stride); - vpx_highbd_quantize_dc(coeff, 64, x->skip_block, p->round, - p->quant_fp[0], qcoeff, dqcoeff, pd->dequant[0], - eob); + vpx_highbd_quantize_dc(coeff, 64, p->round, p->quant_fp[0], qcoeff, + dqcoeff, pd->dequant[0], eob); break; default: assert(tx_size == TX_4X4); x->fwd_txfm4x4(src_diff, coeff, diff_stride); - vpx_highbd_quantize_dc(coeff, 16, x->skip_block, p->round, - p->quant_fp[0], qcoeff, dqcoeff, pd->dequant[0], - eob); + vpx_highbd_quantize_dc(coeff, 16, p->round, p->quant_fp[0], qcoeff, + dqcoeff, pd->dequant[0], eob); break; } return; @@ -475,24 +468,24 @@ void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block, int row, int col, switch (tx_size) { case TX_32X32: vpx_fdct32x32_1(src_diff, coeff, diff_stride); - vpx_quantize_dc_32x32(coeff, x->skip_block, p->round, p->quant_fp[0], - qcoeff, dqcoeff, pd->dequant[0], eob); + vpx_quantize_dc_32x32(coeff, p->round, p->quant_fp[0], qcoeff, dqcoeff, + pd->dequant[0], eob); break; case TX_16X16: vpx_fdct16x16_1(src_diff, coeff, diff_stride); - vpx_quantize_dc(coeff, 256, x->skip_block, p->round, p->quant_fp[0], - qcoeff, dqcoeff, pd->dequant[0], eob); + vpx_quantize_dc(coeff, 256, p->round, p->quant_fp[0], qcoeff, dqcoeff, + pd->dequant[0], eob); break; case TX_8X8: vpx_fdct8x8_1(src_diff, coeff, diff_stride); - vpx_quantize_dc(coeff, 64, x->skip_block, p->round, p->quant_fp[0], - qcoeff, dqcoeff, pd->dequant[0], eob); + vpx_quantize_dc(coeff, 64, p->round, p->quant_fp[0], qcoeff, dqcoeff, + pd->dequant[0], eob); break; default: assert(tx_size == TX_4X4); x->fwd_txfm4x4(src_diff, coeff, diff_stride); - vpx_quantize_dc(coeff, 16, x->skip_block, p->round, p->quant_fp[0], - qcoeff, dqcoeff, pd->dequant[0], eob); + vpx_quantize_dc(coeff, 16, p->round, p->quant_fp[0], qcoeff, dqcoeff, + pd->dequant[0], eob); break; } } @@ -518,32 +511,28 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col, switch (tx_size) { case TX_32X32: highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); - vpx_highbd_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, - p->round, p->quant, p->quant_shift, qcoeff, - dqcoeff, pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vpx_highbd_quantize_b_32x32( + coeff, 1024, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, + dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); break; case TX_16X16: vpx_highbd_fdct16x16(src_diff, coeff, diff_stride); - vpx_highbd_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round, - p->quant, p->quant_shift, qcoeff, dqcoeff, - pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vpx_highbd_quantize_b(coeff, 256, p->zbin, p->round, p->quant, + p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, + scan_order->scan, scan_order->iscan); break; case TX_8X8: vpx_highbd_fdct8x8(src_diff, coeff, diff_stride); - vpx_highbd_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, - p->quant, p->quant_shift, qcoeff, dqcoeff, - pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vpx_highbd_quantize_b(coeff, 64, p->zbin, p->round, p->quant, + p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, + scan_order->scan, scan_order->iscan); break; default: assert(tx_size == TX_4X4); x->fwd_txfm4x4(src_diff, coeff, diff_stride); - vpx_highbd_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, - p->quant, p->quant_shift, qcoeff, dqcoeff, - pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vpx_highbd_quantize_b(coeff, 16, p->zbin, p->round, p->quant, + p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, + scan_order->scan, scan_order->iscan); break; } return; @@ -553,29 +542,28 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col, switch (tx_size) { case TX_32X32: fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); - vpx_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round, - p->quant, p->quant_shift, qcoeff, dqcoeff, - pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vpx_quantize_b_32x32(coeff, 1024, p->zbin, p->round, p->quant, + p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, + scan_order->scan, scan_order->iscan); break; case TX_16X16: vpx_fdct16x16(src_diff, coeff, diff_stride); - vpx_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, - scan_order->scan, scan_order->iscan); + vpx_quantize_b(coeff, 256, p->zbin, p->round, p->quant, p->quant_shift, + qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, + scan_order->iscan); break; case TX_8X8: vpx_fdct8x8(src_diff, coeff, diff_stride); - vpx_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, - scan_order->scan, scan_order->iscan); + vpx_quantize_b(coeff, 64, p->zbin, p->round, p->quant, p->quant_shift, + qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, + scan_order->iscan); break; default: assert(tx_size == TX_4X4); x->fwd_txfm4x4(src_diff, coeff, diff_stride); - vpx_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, - scan_order->scan, scan_order->iscan); + vpx_quantize_b(coeff, 16, p->zbin, p->round, p->quant, p->quant_shift, + qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, + scan_order->iscan); break; } } @@ -869,10 +857,9 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, vpx_highbd_subtract_block(32, 32, src_diff, diff_stride, src, src_stride, dst, dst_stride, xd->bd); highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); - vpx_highbd_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, - p->round, p->quant, p->quant_shift, - qcoeff, dqcoeff, pd->dequant, eob, - scan_order->scan, scan_order->iscan); + vpx_highbd_quantize_b_32x32( + coeff, 1024, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, + dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); } if (args->enable_coeff_opt && !x->skip_recode) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; @@ -889,10 +876,9 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, vpx_highbd_fdct16x16(src_diff, coeff, diff_stride); else vp9_highbd_fht16x16(src_diff, coeff, diff_stride, tx_type); - vpx_highbd_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round, - p->quant, p->quant_shift, qcoeff, dqcoeff, - pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vpx_highbd_quantize_b(coeff, 256, p->zbin, p->round, p->quant, + p->quant_shift, qcoeff, dqcoeff, pd->dequant, + eob, scan_order->scan, scan_order->iscan); } if (args->enable_coeff_opt && !x->skip_recode) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; @@ -910,10 +896,9 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, vpx_highbd_fdct8x8(src_diff, coeff, diff_stride); else vp9_highbd_fht8x8(src_diff, coeff, diff_stride, tx_type); - vpx_highbd_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, - p->quant, p->quant_shift, qcoeff, dqcoeff, - pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vpx_highbd_quantize_b(coeff, 64, p->zbin, p->round, p->quant, + p->quant_shift, qcoeff, dqcoeff, pd->dequant, + eob, scan_order->scan, scan_order->iscan); } if (args->enable_coeff_opt && !x->skip_recode) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; @@ -932,10 +917,9 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, vp9_highbd_fht4x4(src_diff, coeff, diff_stride, tx_type); else x->fwd_txfm4x4(src_diff, coeff, diff_stride); - vpx_highbd_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, - p->quant, p->quant_shift, qcoeff, dqcoeff, - pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vpx_highbd_quantize_b(coeff, 16, p->zbin, p->round, p->quant, + p->quant_shift, qcoeff, dqcoeff, pd->dequant, + eob, scan_order->scan, scan_order->iscan); } if (args->enable_coeff_opt && !x->skip_recode) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; @@ -964,10 +948,9 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, vpx_subtract_block(32, 32, src_diff, diff_stride, src, src_stride, dst, dst_stride); fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); - vpx_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round, - p->quant, p->quant_shift, qcoeff, dqcoeff, - pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vpx_quantize_b_32x32(coeff, 1024, p->zbin, p->round, p->quant, + p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, + scan_order->scan, scan_order->iscan); } if (args->enable_coeff_opt && !x->skip_recode) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; @@ -980,9 +963,9 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, vpx_subtract_block(16, 16, src_diff, diff_stride, src, src_stride, dst, dst_stride); vp9_fht16x16(src_diff, coeff, diff_stride, tx_type); - vpx_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, - scan_order->scan, scan_order->iscan); + vpx_quantize_b(coeff, 256, p->zbin, p->round, p->quant, p->quant_shift, + qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, + scan_order->iscan); } if (args->enable_coeff_opt && !x->skip_recode) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; @@ -995,9 +978,9 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, vpx_subtract_block(8, 8, src_diff, diff_stride, src, src_stride, dst, dst_stride); vp9_fht8x8(src_diff, coeff, diff_stride, tx_type); - vpx_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, - scan_order->scan, scan_order->iscan); + vpx_quantize_b(coeff, 64, p->zbin, p->round, p->quant, p->quant_shift, + qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, + scan_order->iscan); } if (args->enable_coeff_opt && !x->skip_recode) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; @@ -1014,9 +997,9 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, vp9_fht4x4(src_diff, coeff, diff_stride, tx_type); else x->fwd_txfm4x4(src_diff, coeff, diff_stride); - vpx_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, - scan_order->scan, scan_order->iscan); + vpx_quantize_b(coeff, 16, p->zbin, p->round, p->quant, p->quant_shift, + qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, + scan_order->iscan); } if (args->enable_coeff_opt && !x->skip_recode) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 4609a6bb26..97805fc164 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -6620,19 +6620,22 @@ static void get_quantize_error(MACROBLOCK *x, int plane, tran_low_t *coeff, int pix_num = 1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]]; const int shift = tx_size == TX_32X32 ? 0 : 2; + // skip block condition should be handled before this is called. + assert(!x->skip_block); + #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - vp9_highbd_quantize_fp_32x32(coeff, pix_num, x->skip_block, p->round_fp, - p->quant_fp, qcoeff, dqcoeff, pd->dequant, - &eob, scan_order->scan, scan_order->iscan); + vp9_highbd_quantize_fp_32x32(coeff, pix_num, p->round_fp, p->quant_fp, + qcoeff, dqcoeff, pd->dequant, &eob, + scan_order->scan, scan_order->iscan); } else { - vp9_quantize_fp_32x32(coeff, pix_num, x->skip_block, p->round_fp, - p->quant_fp, qcoeff, dqcoeff, pd->dequant, &eob, - scan_order->scan, scan_order->iscan); + vp9_quantize_fp_32x32(coeff, pix_num, p->round_fp, p->quant_fp, qcoeff, + dqcoeff, pd->dequant, &eob, scan_order->scan, + scan_order->iscan); } #else - vp9_quantize_fp_32x32(coeff, pix_num, x->skip_block, p->round_fp, p->quant_fp, - qcoeff, dqcoeff, pd->dequant, &eob, scan_order->scan, + vp9_quantize_fp_32x32(coeff, pix_num, p->round_fp, p->quant_fp, qcoeff, + dqcoeff, pd->dequant, &eob, scan_order->scan, scan_order->iscan); #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c index c8e167f25b..697c589ab3 100644 --- a/vp9/encoder/vp9_pickmode.c +++ b/vp9/encoder/vp9_pickmode.c @@ -771,24 +771,27 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *this_rdc, const int16_t *src_diff; src_diff = &p->src_diff[(r * diff_stride + c) << 2]; + // skip block condition should be handled before this is called. + assert(!x->skip_block); + switch (tx_size) { case TX_16X16: vpx_hadamard_16x16(src_diff, diff_stride, coeff); - vp9_quantize_fp(coeff, 256, x->skip_block, p->round_fp, p->quant_fp, - qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, + vp9_quantize_fp(coeff, 256, p->round_fp, p->quant_fp, qcoeff, + dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); break; case TX_8X8: vpx_hadamard_8x8(src_diff, diff_stride, coeff); - vp9_quantize_fp(coeff, 64, x->skip_block, p->round_fp, p->quant_fp, - qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, + vp9_quantize_fp(coeff, 64, p->round_fp, p->quant_fp, qcoeff, + dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); break; default: assert(tx_size == TX_4X4); x->fwd_txfm4x4(src_diff, coeff, diff_stride); - vp9_quantize_fp(coeff, 16, x->skip_block, p->round_fp, p->quant_fp, - qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, + vp9_quantize_fp(coeff, 16, p->round_fp, p->quant_fp, qcoeff, + dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); break; } diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c index c996b75167..9058997b0f 100644 --- a/vp9/encoder/vp9_quantize.c +++ b/vp9/encoder/vp9_quantize.c @@ -22,15 +22,12 @@ #include "vp9/encoder/vp9_rd.h" void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *round_ptr, - const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan, - const int16_t *iscan) { + const int16_t *round_ptr, const int16_t *quant_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { int i, eob = -1; (void)iscan; - (void)skip_block; - assert(!skip_block); memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); @@ -56,7 +53,7 @@ void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, #if CONFIG_VP9_HIGHBITDEPTH void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *round_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, @@ -65,8 +62,6 @@ void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int eob = -1; (void)iscan; - (void)skip_block; - assert(!skip_block); memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); @@ -91,15 +86,12 @@ void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, // TODO(jingning) Refactor this file and combine functions with similar // operations. void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *round_ptr, - const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { int i, eob = -1; (void)iscan; - (void)skip_block; - assert(!skip_block); memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); @@ -126,15 +118,13 @@ void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, #if CONFIG_VP9_HIGHBITDEPTH void vp9_highbd_quantize_fp_32x32_c( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, - const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, + const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, + const int16_t *iscan) { int i, eob = -1; (void)iscan; - (void)skip_block; - assert(!skip_block); memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); @@ -176,16 +166,15 @@ void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block, #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - vpx_highbd_quantize_b(BLOCK_OFFSET(p->coeff, block), n_coeffs, - x->skip_block, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, - &p->eobs[block], scan, iscan); + vpx_highbd_quantize_b(BLOCK_OFFSET(p->coeff, block), n_coeffs, p->zbin, + p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, + pd->dequant, &p->eobs[block], scan, iscan); return; } #endif - vpx_quantize_b(BLOCK_OFFSET(p->coeff, block), n_coeffs, x->skip_block, - p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, - pd->dequant, &p->eobs[block], scan, iscan); + vpx_quantize_b(BLOCK_OFFSET(p->coeff, block), n_coeffs, p->zbin, p->round, + p->quant, p->quant_shift, qcoeff, dqcoeff, pd->dequant, + &p->eobs[block], scan, iscan); } static void invert_quant(int16_t *quant, int16_t *shift, int d) { diff --git a/vp9/encoder/x86/vp9_quantize_avx2.c b/vp9/encoder/x86/vp9_quantize_avx2.c index 8dfdbd50f6..db18b1a7a4 100644 --- a/vp9/encoder/x86/vp9_quantize_avx2.c +++ b/vp9/encoder/x86/vp9_quantize_avx2.c @@ -47,18 +47,15 @@ static INLINE __m256i scan_eob_256(const __m256i *iscan_ptr, } void vp9_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *round_ptr, - const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan, - const int16_t *iscan) { + const int16_t *round_ptr, const int16_t *quant_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { __m128i eob; __m256i round256, quant256, dequant256; __m256i eob256, thr256; (void)scan; - (void)skip_block; - assert(!skip_block); coeff_ptr += n_coeffs; iscan += n_coeffs; diff --git a/vp9/encoder/x86/vp9_quantize_sse2.c b/vp9/encoder/x86/vp9_quantize_sse2.c index e3d803b8f0..4bcadaa6a1 100644 --- a/vp9/encoder/x86/vp9_quantize_sse2.c +++ b/vp9/encoder/x86/vp9_quantize_sse2.c @@ -18,11 +18,10 @@ #include "vpx_dsp/x86/bitdepth_conversion_sse2.h" void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *round_ptr, - const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan, - const int16_t *iscan) { + const int16_t *round_ptr, const int16_t *quant_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { __m128i zero; __m128i thr; int nzflag; @@ -30,8 +29,6 @@ void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, __m128i round, quant, dequant; (void)scan; - (void)skip_block; - assert(!skip_block); coeff_ptr += n_coeffs; iscan += n_coeffs; diff --git a/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm b/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm index 5703aa3bb6..680acfec69 100644 --- a/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm +++ b/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm @@ -19,18 +19,18 @@ pw_1: times 8 dw 1 SECTION .text %macro QUANTIZE_FP 2 -cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, round, quant, \ +cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, round, quant, \ qcoeff, dqcoeff, dequant, \ eob, scan, iscan ; actual quantize loop - setup pointers, rounders, etc. movifnidn coeffq, coeffmp movifnidn ncoeffq, ncoeffmp - mov r2, dequantmp movifnidn roundq, roundmp movifnidn quantq, quantmp mova m1, [roundq] ; m1 = round mova m2, [quantq] ; m2 = quant + mov r2, dequantmp %ifidn %1, fp_32x32 pcmpeqw m5, m5 psrlw m5, 15 diff --git a/vpx_dsp/arm/quantize_neon.c b/vpx_dsp/arm/quantize_neon.c index adef5f6e15..bd7818a074 100644 --- a/vpx_dsp/arm/quantize_neon.c +++ b/vpx_dsp/arm/quantize_neon.c @@ -32,8 +32,8 @@ static INLINE void calculate_dqcoeff_and_store(const int16x8_t qcoeff, } void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, @@ -42,8 +42,6 @@ void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16x8_t neg_one = vdupq_n_s16(-1); uint16x8_t eob_max; (void)scan; - (void)skip_block; - assert(!skip_block); // Process first 8 values which include a dc component. { @@ -189,7 +187,7 @@ static INLINE void calculate_dqcoeff_and_store_32x32(const int16x8_t qcoeff, // Main difference is that zbin values are halved before comparison and dqcoeff // values are divided by 2. zbin is rounded but dqcoeff is not. void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, + const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, @@ -202,8 +200,6 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int i; (void)scan; (void)n_coeffs; // Because we will always calculate 32*32. - (void)skip_block; - assert(!skip_block); // Process first 8 values which include a dc component. { diff --git a/vpx_dsp/ppc/quantize_vsx.c b/vpx_dsp/ppc/quantize_vsx.c index d85e63bd14..7cdcbeb405 100644 --- a/vpx_dsp/ppc/quantize_vsx.c +++ b/vpx_dsp/ppc/quantize_vsx.c @@ -95,8 +95,8 @@ static INLINE int16x8_t vec_max_across(int16x8_t a) { } void vpx_quantize_b_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan_ptr, @@ -122,8 +122,6 @@ void vpx_quantize_b_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, zero_mask1 = vec_cmpge(coeff1_abs, zbin); (void)scan_ptr; - (void)skip_block; - assert(!skip_block); qcoeff0 = quantize_coeff(coeff0, coeff0_abs, round, quant, quant_shift, zero_mask0); @@ -196,12 +194,14 @@ void vpx_quantize_b_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, *eob_ptr = eob[0]; } -void vpx_quantize_b_32x32_vsx( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, - const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan_ptr, const int16_t *iscan_ptr) { +void vpx_quantize_b_32x32_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan_ptr, + const int16_t *iscan_ptr) { // In stage 1, we quantize 16 coeffs (DC + 15 AC) // In stage 2, we loop 42 times and quantize 24 coeffs per iteration // (32 * 32 - 16) / 24 = 42 @@ -227,9 +227,7 @@ void vpx_quantize_b_32x32_vsx( int16x8_t coeff1_abs = vec_abs(coeff1); (void)scan_ptr; - (void)skip_block; (void)n_coeffs; - assert(!skip_block); // 32x32 quantization requires that zbin and round be divided by 2 zbin = vec_sra(vec_add(zbin, vec_ones_s16), vec_ones_u16); diff --git a/vpx_dsp/quantize.c b/vpx_dsp/quantize.c index 61818f692e..5d6ba64a8a 100644 --- a/vpx_dsp/quantize.c +++ b/vpx_dsp/quantize.c @@ -15,7 +15,7 @@ #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_mem/vpx_mem.h" -void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block, +void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, const int16_t *round_ptr, const int16_t quant, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t dequant, uint16_t *eob_ptr) { @@ -28,28 +28,26 @@ void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block, memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX); - tmp = (tmp * quant) >> 16; - qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant; - if (tmp) eob = 0; - } + tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX); + tmp = (tmp * quant) >> 16; + qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant; + if (tmp) eob = 0; + *eob_ptr = eob + 1; } #if CONFIG_VP9_HIGHBITDEPTH void vpx_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, - int skip_block, const int16_t *round_ptr, - const int16_t quant, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t dequant, - uint16_t *eob_ptr) { + const int16_t *round_ptr, const int16_t quant, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t dequant, uint16_t *eob_ptr) { int eob = -1; memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { + { const int coeff = coeff_ptr[0]; const int coeff_sign = (coeff >> 31); const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; @@ -59,11 +57,12 @@ void vpx_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant; if (abs_qcoeff) eob = 0; } + *eob_ptr = eob + 1; } #endif -void vpx_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block, +void vpx_quantize_dc_32x32(const tran_low_t *coeff_ptr, const int16_t *round_ptr, const int16_t quant, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t dequant, uint16_t *eob_ptr) { @@ -77,19 +76,18 @@ void vpx_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block, memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1), - INT16_MIN, INT16_MAX); - tmp = (tmp * quant) >> 15; - qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant / 2; - if (tmp) eob = 0; - } + tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1), INT16_MIN, + INT16_MAX); + tmp = (tmp * quant) >> 15; + qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant / 2; + if (tmp) eob = 0; + *eob_ptr = eob + 1; } #if CONFIG_VP9_HIGHBITDEPTH -void vpx_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block, +void vpx_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, const int16_t *round_ptr, const int16_t quant, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, @@ -100,7 +98,7 @@ void vpx_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block, memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { + { const int coeff = coeff_ptr[0]; const int coeff_sign = (coeff >> 31); const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; @@ -110,23 +108,21 @@ void vpx_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block, dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant / 2; if (abs_qcoeff) eob = 0; } + *eob_ptr = eob + 1; } #endif void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan, - const int16_t *iscan) { + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { int i, non_zero_count = (int)n_coeffs, eob = -1; const int zbins[2] = { zbin_ptr[0], zbin_ptr[1] }; const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; (void)iscan; - (void)skip_block; - assert(!skip_block); memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); @@ -166,8 +162,8 @@ void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, #if CONFIG_VP9_HIGHBITDEPTH void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, @@ -176,8 +172,6 @@ void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int zbins[2] = { zbin_ptr[0], zbin_ptr[1] }; const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; (void)iscan; - (void)skip_block; - assert(!skip_block); memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); @@ -215,8 +209,8 @@ void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, #endif void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, @@ -229,8 +223,6 @@ void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int idx_arr[1024]; int i, eob = -1; (void)iscan; - (void)skip_block; - assert(!skip_block); memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); @@ -277,8 +269,8 @@ void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, #if CONFIG_VP9_HIGHBITDEPTH void vpx_highbd_quantize_b_32x32_c( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, - const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { @@ -290,8 +282,6 @@ void vpx_highbd_quantize_b_32x32_c( int idx_arr[1024]; int i, eob = -1; (void)iscan; - (void)skip_block; - assert(!skip_block); memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); diff --git a/vpx_dsp/quantize.h b/vpx_dsp/quantize.h index 7cac140e9d..8e138445e2 100644 --- a/vpx_dsp/quantize.h +++ b/vpx_dsp/quantize.h @@ -18,22 +18,21 @@ extern "C" { #endif -void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block, +void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, const int16_t *round_ptr, const int16_t quant, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t dequant, uint16_t *eob_ptr); -void vpx_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block, +void vpx_quantize_dc_32x32(const tran_low_t *coeff_ptr, const int16_t *round_ptr, const int16_t quant, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t dequant, uint16_t *eob_ptr); #if CONFIG_VP9_HIGHBITDEPTH void vpx_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, - int skip_block, const int16_t *round_ptr, - const int16_t quant, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t dequant, - uint16_t *eob_ptr); -void vpx_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block, + const int16_t *round_ptr, const int16_t quant, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t dequant, uint16_t *eob_ptr); +void vpx_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, const int16_t *round_ptr, const int16_t quant, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 06a8febb29..372903aff2 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -709,17 +709,17 @@ () # Quantization # if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") { - add_proto qw/void vpx_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + add_proto qw/void vpx_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vpx_quantize_b neon sse2 ssse3 avx vsx/; - add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vpx_quantize_b_32x32 neon ssse3 avx vsx/; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { - add_proto qw/void vpx_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + add_proto qw/void vpx_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vpx_highbd_quantize_b sse2/; - add_proto qw/void vpx_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + add_proto qw/void vpx_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vpx_highbd_quantize_b_32x32 sse2/; } # CONFIG_VP9_HIGHBITDEPTH } # CONFIG_VP9_ENCODER diff --git a/vpx_dsp/x86/highbd_quantize_intrin_sse2.c b/vpx_dsp/x86/highbd_quantize_intrin_sse2.c index 7149e4fb74..4535a0f7a2 100644 --- a/vpx_dsp/x86/highbd_quantize_intrin_sse2.c +++ b/vpx_dsp/x86/highbd_quantize_intrin_sse2.c @@ -18,7 +18,7 @@ #if CONFIG_VP9_HIGHBITDEPTH void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count, - int skip_block, const int16_t *zbin_ptr, + const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, @@ -39,8 +39,6 @@ void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count, nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]); (void)scan; - (void)skip_block; - assert(!skip_block); memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr)); @@ -94,8 +92,8 @@ void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count, } void vpx_highbd_quantize_b_32x32_sse2( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, - const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { @@ -107,8 +105,6 @@ void vpx_highbd_quantize_b_32x32_sse2( const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 1); const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 1); (void)scan; - (void)skip_block; - assert(!skip_block); zbins[0] = _mm_set_epi32(zbin1_tmp, zbin1_tmp, zbin1_tmp, zbin0_tmp); zbins[1] = _mm_set1_epi32(zbin1_tmp); diff --git a/vpx_dsp/x86/quantize_avx.c b/vpx_dsp/x86/quantize_avx.c index 0a91d36eaf..706e4e6413 100644 --- a/vpx_dsp/x86/quantize_avx.c +++ b/vpx_dsp/x86/quantize_avx.c @@ -21,8 +21,8 @@ #include "vpx_dsp/x86/quantize_ssse3.h" void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, @@ -39,8 +39,6 @@ void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, __m128i eob = zero, eob0; (void)scan; - (void)skip_block; - assert(!skip_block); *eob_ptr = 0; @@ -145,8 +143,7 @@ void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, } void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, + const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, @@ -166,8 +163,6 @@ void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, (void)scan; (void)n_coeffs; - (void)skip_block; - assert(!skip_block); // Setup global values. // The 32x32 halves zbin and round. diff --git a/vpx_dsp/x86/quantize_sse2.c b/vpx_dsp/x86/quantize_sse2.c index e38a4059ab..459d95f28b 100644 --- a/vpx_dsp/x86/quantize_sse2.c +++ b/vpx_dsp/x86/quantize_sse2.c @@ -18,8 +18,8 @@ #include "vpx_dsp/x86/quantize_sse2.h" void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, @@ -34,8 +34,6 @@ void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, __m128i eob, eob0; (void)scan; - (void)skip_block; - assert(!skip_block); // Setup global values. load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant, diff --git a/vpx_dsp/x86/quantize_ssse3.c b/vpx_dsp/x86/quantize_ssse3.c index fc1d91959f..9d2a88b7bc 100644 --- a/vpx_dsp/x86/quantize_ssse3.c +++ b/vpx_dsp/x86/quantize_ssse3.c @@ -18,8 +18,8 @@ #include "vpx_dsp/x86/quantize_ssse3.h" void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, @@ -34,8 +34,6 @@ void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, __m128i eob, eob0; (void)scan; - (void)skip_block; - assert(!skip_block); load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant, dequant_ptr, &dequant, quant_shift_ptr, &shift); @@ -111,7 +109,7 @@ void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, } void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, + const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, @@ -131,8 +129,6 @@ void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, (void)scan; (void)n_coeffs; - (void)skip_block; - assert(!skip_block); // Setup global values. // The 32x32 halves zbin and round. From 1239be9e5faf3f7c1603548ed669af35639b74f8 Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 29 Mar 2022 20:00:00 -0700 Subject: [PATCH 0305/1000] sad4d_avx2: fix VS 2014 build error after: d60b671a7 gcc 11 warning: mismatched bound error C2719: 'sums': formal parameter with requested alignment of 32 won't be aligned Change-Id: Iaba46d00ef2334a5e2d9ee69b5d03478fdc73a60 --- vpx_dsp/x86/sad4d_avx2.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/vpx_dsp/x86/sad4d_avx2.c b/vpx_dsp/x86/sad4d_avx2.c index 5f1f757e25..81f1a916f0 100644 --- a/vpx_dsp/x86/sad4d_avx2.c +++ b/vpx_dsp/x86/sad4d_avx2.c @@ -11,7 +11,12 @@ #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" -static INLINE void calc_final_4(const __m256i sums[4], uint32_t sad_array[4]) { +// Note with sums[4] some versions of Visual Studio may fail due to parameter +// alignment, though the functions should be equivalent: +// error C2719: 'sums': formal parameter with requested alignment of 32 won't be +// aligned +static INLINE void calc_final_4(const __m256i *const sums /*[4]*/, + uint32_t sad_array[4]) { const __m256i t0 = _mm256_hadd_epi32(sums[0], sums[1]); const __m256i t1 = _mm256_hadd_epi32(sums[2], sums[3]); const __m256i t2 = _mm256_hadd_epi32(t0, t1); From 247658efb0a29fbcd66a84ef67aab2794e517380 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Wed, 16 Mar 2022 16:27:27 +0200 Subject: [PATCH 0306/1000] Optimize FHT functions for NEON [NEON] Optimize vp9_fht4x4, vp9_fht8x8, vp9_fht16x16 for NEON Following change #3516278, the improvement for these functions is: Before: 4.10% 0.75% vpxenc vpxenc [.] vp9_fht16x16_c 2.93% 0.65% vpxenc vpxenc [.] vp9_fht8x8_c 0.93% 0.77% vpxenc vpxenc [.] vp9_fht4x4_c And after the patch: 0.69% 0.16% vpxenc vpxenc [.] vp9_fht16x16_neon 0.28% 0.28% vpxenc vpxenc [.] vp9_fht8x8_neon 0.54% 0.53% vpxenc vpxenc [.] vp9_fht4x4_neon Bug: webm:1634 Change-Id: I6748a0c4e0cfaafa3eefdd4848d0ac3aab6900e4 --- test/dct_test.cc | 5 +- vp9/common/vp9_rtcd_defs.pl | 6 +- vp9/encoder/arm/neon/vp9_dct_neon.c | 1460 +++++++++++++++++++++++++++ vp9/vp9cx.mk | 1 + 4 files changed, 1468 insertions(+), 4 deletions(-) create mode 100644 vp9/encoder/arm/neon/vp9_dct_neon.c diff --git a/test/dct_test.cc b/test/dct_test.cc index 9541869535..20e081a24c 100644 --- a/test/dct_test.cc +++ b/test/dct_test.cc @@ -641,8 +641,11 @@ static const FuncInfo ht_neon_func_info[] = { &highbd_iht_wrapper, 16, 2 }, #endif { &vp9_fht4x4_c, &iht_wrapper, 4, 1 }, + { &vp9_fht4x4_neon, &iht_wrapper, 4, 1 }, { &vp9_fht8x8_c, &iht_wrapper, 8, 1 }, - { &vp9_fht16x16_c, &iht_wrapper, 16, 1 } + { &vp9_fht8x8_neon, &iht_wrapper, 8, 1 }, + { &vp9_fht16x16_c, &iht_wrapper, 16, 1 }, + { &vp9_fht16x16_neon, &iht_wrapper, 16, 1 } }; INSTANTIATE_TEST_SUITE_P( diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index 6980b9b7fb..951da7dbea 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -159,9 +159,9 @@ () # Note that there are more specializations appended when CONFIG_VP9_HIGHBITDEPTH # is off. -specialize qw/vp9_fht4x4 sse2/; -specialize qw/vp9_fht8x8 sse2/; -specialize qw/vp9_fht16x16 sse2/; +specialize qw/vp9_fht4x4 sse2 neon/; +specialize qw/vp9_fht8x8 sse2 neon/; +specialize qw/vp9_fht16x16 sse2 neon/; specialize qw/vp9_fwht4x4 sse2/; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") ne "yes") { # Note that these specializations are appended to the above ones. diff --git a/vp9/encoder/arm/neon/vp9_dct_neon.c b/vp9/encoder/arm/neon/vp9_dct_neon.c new file mode 100644 index 0000000000..a07a1608d7 --- /dev/null +++ b/vp9/encoder/arm/neon/vp9_dct_neon.c @@ -0,0 +1,1460 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_config.h" +#include "./vp9_rtcd.h" +#include "./vpx_dsp_rtcd.h" + +#include "vpx_dsp/txfm_common.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/arm/fdct_neon.h" + +static INLINE void load_buffer_4x4(const int16_t *input, int16x8_t *in, + int stride) { + // { 0, 1, 1, 1, 1, 1, 1, 1 }; + const int16x8_t nonzero_bias_a = vextq_s16(vdupq_n_s16(0), vdupq_n_s16(1), 7); + // { 1, 0, 0, 0, 0, 0, 0, 0 }; + const int16x8_t nonzero_bias_b = vextq_s16(vdupq_n_s16(1), vdupq_n_s16(0), 7); + int16x8_t mask; + + int16x4_t input_0 = vshl_n_s16(vld1_s16(input + 0 * stride), 4); + int16x4_t input_1 = vshl_n_s16(vld1_s16(input + 1 * stride), 4); + int16x4_t input_2 = vshl_n_s16(vld1_s16(input + 2 * stride), 4); + int16x4_t input_3 = vshl_n_s16(vld1_s16(input + 3 * stride), 4); + + in[0] = vcombine_s16(input_0, input_1); + in[1] = vcombine_s16(input_2, input_3); + + // Copy the SSE method, use a mask to avoid an 'if' branch here to increase by + // one non-zero first elements + mask = vreinterpretq_s16_u16(vceqq_s16(in[0], nonzero_bias_a)); + in[0] = vaddq_s16(in[0], mask); + in[0] = vaddq_s16(in[0], nonzero_bias_b); +} + +static INLINE void write_buffer_4x4(tran_low_t *output, int16x8_t *res) { + const int16x8_t one_s16 = vdupq_n_s16(1); + res[0] = vaddq_s16(res[0], one_s16); + res[1] = vaddq_s16(res[1], one_s16); + res[0] = vshrq_n_s16(res[0], 2); + res[1] = vshrq_n_s16(res[1], 2); + store_s16q_to_tran_low(output + 0 * 8, res[0]); + store_s16q_to_tran_low(output + 1 * 8, res[1]); +} + +static INLINE void fadst4x4_neon(int16x8_t *in) { + int32x4_t u0, u1, u2, u3; + int16x4_t out_0, out_1, out_2, out_3; + const int32x4_t k__DCT_CONST_ROUNDING = vdupq_n_s32(DCT_CONST_ROUNDING); + + const int16x4_t s0 = vget_low_s16(in[0]); // | x_00 | x_01 | x_02 | x_03 | + const int16x4_t s1 = vget_high_s16(in[0]); // | x_10 | x_11 | x_12 | x_13 | + const int16x4_t s2 = vget_low_s16(in[1]); // | x_20 | x_21 | x_22 | x_23 | + const int16x4_t s3 = vget_high_s16(in[1]); // | x_30 | x_31 | x_32 | x_33 | + + // s0 * sinpi_1_9, s0 * sinpi_4_9 + // Must expand all elements to s32. See 'needs32' comment in fwd_txfm.c. + const int32x4_t s0s1_9 = vmull_n_s16(s0, sinpi_1_9); + const int32x4_t s0s4_9 = vmull_n_s16(s0, sinpi_4_9); + // s1 * sinpi_1_9, s1 * sinpi_2_9 + const int32x4_t s1s1_9 = vmull_n_s16(s1, sinpi_1_9); + const int32x4_t s1s2_9 = vmull_n_s16(s1, sinpi_2_9); + // s2 * sinpi_3_9 + const int32x4_t s2s3_9 = vmull_n_s16(s2, sinpi_3_9); + // s3 * sinpi_2_9, s3 * sinpi_4_9 + const int32x4_t s3s2_9 = vmull_n_s16(s3, sinpi_2_9); + const int32x4_t s3s4_9 = vmull_n_s16(s3, sinpi_4_9); + + // (s0 + s1) * sinpi_3_9 + const int32x4_t s0_p_s1 = vaddl_s16(s0, s1); + const int32x4_t s0_p_s1_m_s3 = vsubw_s16(s0_p_s1, s3); + + // s_0 * sinpi_1_9 + s_1 * sinpi_2_9 + // s_0 * sinpi_4_9 - s_1 * sinpi_1_9 + const int32x4_t s0s1_9_p_s1s2_9 = vaddq_s32(s0s1_9, s1s2_9); + const int32x4_t s0s4_9_m_s1s1_9 = vsubq_s32(s0s4_9, s1s1_9); + /* + * t0 = s0s1_9 + s1s2_9 + s3s4_9 + * t1 = (s0 + s1) * sinpi_3_9 - s3 * sinpi_3_9 + * t2 = s0s4_9 - s1s1_9 + s3s2_9 + * t3 = s2s3_9 + */ + const int32x4_t t0 = vaddq_s32(s0s1_9_p_s1s2_9, s3s4_9); + const int32x4_t t1 = vmulq_n_s32(s0_p_s1_m_s3, sinpi_3_9); + const int32x4_t t2 = vaddq_s32(s0s4_9_m_s1s1_9, s3s2_9); + const int32x4_t t3 = s2s3_9; + /* + * u0 = t0 + t3 + * u1 = t1 + * u2 = t2 - t3 + * u3 = t2 - t0 + t3 + */ + u0 = vaddq_s32(t0, t3); + u1 = t1; + u2 = vsubq_s32(t2, t3); + u3 = vaddq_s32(vsubq_s32(t2, t0), t3); + + // fdct_round_shift + u0 = vaddq_s32(u0, k__DCT_CONST_ROUNDING); + u1 = vaddq_s32(u1, k__DCT_CONST_ROUNDING); + u2 = vaddq_s32(u2, k__DCT_CONST_ROUNDING); + u3 = vaddq_s32(u3, k__DCT_CONST_ROUNDING); + + out_0 = vshrn_n_s32(u0, DCT_CONST_BITS); + out_1 = vshrn_n_s32(u1, DCT_CONST_BITS); + out_2 = vshrn_n_s32(u2, DCT_CONST_BITS); + out_3 = vshrn_n_s32(u3, DCT_CONST_BITS); + + transpose_s16_4x4d(&out_0, &out_1, &out_2, &out_3); + + in[0] = vcombine_s16(out_0, out_1); + in[1] = vcombine_s16(out_2, out_3); +} + +void vp9_fht4x4_neon(const int16_t *input, tran_low_t *output, int stride, + int tx_type) { + int16x8_t in[2]; + + switch (tx_type) { + case DCT_DCT: vpx_fdct4x4_neon(input, output, stride); break; + case ADST_DCT: + load_buffer_4x4(input, in, stride); + fadst4x4_neon(in); + vpx_fdct4x4_pass1_neon((int16x4_t *)in); + write_buffer_4x4(output, in); + break; + case DCT_ADST: + load_buffer_4x4(input, in, stride); + vpx_fdct4x4_pass1_neon((int16x4_t *)in); + fadst4x4_neon(in); + write_buffer_4x4(output, in); + break; + default: + assert(tx_type == ADST_ADST); + load_buffer_4x4(input, in, stride); + fadst4x4_neon(in); + fadst4x4_neon(in); + write_buffer_4x4(output, in); + break; + } +} + +static INLINE void load_buffer_8x8(const int16_t *input, int16x8_t *in, + int stride) { + in[0] = vshlq_n_s16(vld1q_s16(input + 0 * stride), 2); + in[1] = vshlq_n_s16(vld1q_s16(input + 1 * stride), 2); + in[2] = vshlq_n_s16(vld1q_s16(input + 2 * stride), 2); + in[3] = vshlq_n_s16(vld1q_s16(input + 3 * stride), 2); + in[4] = vshlq_n_s16(vld1q_s16(input + 4 * stride), 2); + in[5] = vshlq_n_s16(vld1q_s16(input + 5 * stride), 2); + in[6] = vshlq_n_s16(vld1q_s16(input + 6 * stride), 2); + in[7] = vshlq_n_s16(vld1q_s16(input + 7 * stride), 2); +} + +/* right shift and rounding + * first get the sign bit (bit 15). + * If bit == 1, it's the simple case of shifting right by one bit. + * If bit == 2, it essentially computes the expression: + * + * out[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2; + * + * for each row. + */ +static INLINE void right_shift_8x8(int16x8_t *res, const int bit) { + int16x8_t sign0 = vshrq_n_s16(res[0], 15); + int16x8_t sign1 = vshrq_n_s16(res[1], 15); + int16x8_t sign2 = vshrq_n_s16(res[2], 15); + int16x8_t sign3 = vshrq_n_s16(res[3], 15); + int16x8_t sign4 = vshrq_n_s16(res[4], 15); + int16x8_t sign5 = vshrq_n_s16(res[5], 15); + int16x8_t sign6 = vshrq_n_s16(res[6], 15); + int16x8_t sign7 = vshrq_n_s16(res[7], 15); + + if (bit == 2) { + const int16x8_t const_rounding = vdupq_n_s16(1); + res[0] = vaddq_s16(res[0], const_rounding); + res[1] = vaddq_s16(res[1], const_rounding); + res[2] = vaddq_s16(res[2], const_rounding); + res[3] = vaddq_s16(res[3], const_rounding); + res[4] = vaddq_s16(res[4], const_rounding); + res[5] = vaddq_s16(res[5], const_rounding); + res[6] = vaddq_s16(res[6], const_rounding); + res[7] = vaddq_s16(res[7], const_rounding); + } + + res[0] = vsubq_s16(res[0], sign0); + res[1] = vsubq_s16(res[1], sign1); + res[2] = vsubq_s16(res[2], sign2); + res[3] = vsubq_s16(res[3], sign3); + res[4] = vsubq_s16(res[4], sign4); + res[5] = vsubq_s16(res[5], sign5); + res[6] = vsubq_s16(res[6], sign6); + res[7] = vsubq_s16(res[7], sign7); + + if (bit == 1) { + res[0] = vshrq_n_s16(res[0], 1); + res[1] = vshrq_n_s16(res[1], 1); + res[2] = vshrq_n_s16(res[2], 1); + res[3] = vshrq_n_s16(res[3], 1); + res[4] = vshrq_n_s16(res[4], 1); + res[5] = vshrq_n_s16(res[5], 1); + res[6] = vshrq_n_s16(res[6], 1); + res[7] = vshrq_n_s16(res[7], 1); + } else { + res[0] = vshrq_n_s16(res[0], 2); + res[1] = vshrq_n_s16(res[1], 2); + res[2] = vshrq_n_s16(res[2], 2); + res[3] = vshrq_n_s16(res[3], 2); + res[4] = vshrq_n_s16(res[4], 2); + res[5] = vshrq_n_s16(res[5], 2); + res[6] = vshrq_n_s16(res[6], 2); + res[7] = vshrq_n_s16(res[7], 2); + } +} + +static INLINE void write_buffer_8x8(tran_low_t *output, int16x8_t *res, + int stride) { + store_s16q_to_tran_low(output + 0 * stride, res[0]); + store_s16q_to_tran_low(output + 1 * stride, res[1]); + store_s16q_to_tran_low(output + 2 * stride, res[2]); + store_s16q_to_tran_low(output + 3 * stride, res[3]); + store_s16q_to_tran_low(output + 4 * stride, res[4]); + store_s16q_to_tran_low(output + 5 * stride, res[5]); + store_s16q_to_tran_low(output + 6 * stride, res[6]); + store_s16q_to_tran_low(output + 7 * stride, res[7]); +} + +static INLINE void fadst8x8_neon(int16x8_t *in) { + int16x4_t x0_lo, x0_hi, x1_lo, x1_hi, x2_lo, x2_hi, x3_lo, x3_hi, x4_lo, + x4_hi, x5_lo, x5_hi, x6_lo, x6_hi, x7_lo, x7_hi; + int32x4_t s0_lo, s0_hi, s1_lo, s1_hi, s2_lo, s2_hi, s3_lo, s3_hi, s4_lo, + s4_hi, s5_lo, s5_hi, s6_lo, s6_hi, s7_lo, s7_hi; + int32x4_t t0_lo, t0_hi, t1_lo, t1_hi, t2_lo, t2_hi, t3_lo, t3_hi, t4_lo, + t4_hi, t5_lo, t5_hi, t6_lo, t6_hi, t7_lo, t7_hi; + const int32x4_t k__DCT_CONST_ROUNDING = vdupq_n_s32(DCT_CONST_ROUNDING); + + x0_lo = vget_low_s16(in[7]); + x0_hi = vget_high_s16(in[7]); + x1_lo = vget_low_s16(in[0]); + x1_hi = vget_high_s16(in[0]); + x2_lo = vget_low_s16(in[5]); + x2_hi = vget_high_s16(in[5]); + x3_lo = vget_low_s16(in[2]); + x3_hi = vget_high_s16(in[2]); + x4_lo = vget_low_s16(in[3]); + x4_hi = vget_high_s16(in[3]); + x5_lo = vget_low_s16(in[4]); + x5_hi = vget_high_s16(in[4]); + x6_lo = vget_low_s16(in[1]); + x6_hi = vget_high_s16(in[1]); + x7_lo = vget_low_s16(in[6]); + x7_hi = vget_high_s16(in[6]); + + // stage 1 + // s0 = cospi_2_64 * x0 + cospi_30_64 * x1; + s0_lo = vaddq_s32(vmull_n_s16(x0_lo, cospi_2_64), + vmull_n_s16(x1_lo, cospi_30_64)); + s0_hi = vaddq_s32(vmull_n_s16(x0_hi, cospi_2_64), + vmull_n_s16(x1_hi, cospi_30_64)); + // s1 = cospi_30_64 * x0 - cospi_2_64 * x1; + s1_lo = vsubq_s32(vmull_n_s16(x0_lo, cospi_30_64), + vmull_n_s16(x1_lo, cospi_2_64)); + s1_hi = vsubq_s32(vmull_n_s16(x0_hi, cospi_30_64), + vmull_n_s16(x1_hi, cospi_2_64)); + // s2 = cospi_10_64 * x2 + cospi_22_64 * x3; + s2_lo = vaddq_s32(vmull_n_s16(x2_lo, cospi_10_64), + vmull_n_s16(x3_lo, cospi_22_64)); + s2_hi = vaddq_s32(vmull_n_s16(x2_hi, cospi_10_64), + vmull_n_s16(x3_hi, cospi_22_64)); + // s3 = cospi_22_64 * x2 - cospi_10_64 * x3; + s3_lo = vsubq_s32(vmull_n_s16(x2_lo, cospi_22_64), + vmull_n_s16(x3_lo, cospi_10_64)); + s3_hi = vsubq_s32(vmull_n_s16(x2_hi, cospi_22_64), + vmull_n_s16(x3_hi, cospi_10_64)); + // s4 = cospi_18_64 * x4 + cospi_14_64 * x5; + s4_lo = vaddq_s32(vmull_n_s16(x4_lo, cospi_18_64), + vmull_n_s16(x5_lo, cospi_14_64)); + s4_hi = vaddq_s32(vmull_n_s16(x4_hi, cospi_18_64), + vmull_n_s16(x5_hi, cospi_14_64)); + // s5 = cospi_14_64 * x4 - cospi_18_64 * x5; + s5_lo = vsubq_s32(vmull_n_s16(x4_lo, cospi_14_64), + vmull_n_s16(x5_lo, cospi_18_64)); + s5_hi = vsubq_s32(vmull_n_s16(x4_hi, cospi_14_64), + vmull_n_s16(x5_hi, cospi_18_64)); + // s6 = cospi_26_64 * x6 + cospi_6_64 * x7; + s6_lo = vaddq_s32(vmull_n_s16(x6_lo, cospi_26_64), + vmull_n_s16(x7_lo, cospi_6_64)); + s6_hi = vaddq_s32(vmull_n_s16(x6_hi, cospi_26_64), + vmull_n_s16(x7_hi, cospi_6_64)); + // s7 = cospi_6_64 * x6 - cospi_26_64 * x7; + s7_lo = vsubq_s32(vmull_n_s16(x6_lo, cospi_6_64), + vmull_n_s16(x7_lo, cospi_26_64)); + s7_hi = vsubq_s32(vmull_n_s16(x6_hi, cospi_6_64), + vmull_n_s16(x7_hi, cospi_26_64)); + + // fdct_round_shift + t0_lo = vaddq_s32(s0_lo, s4_lo); + t0_hi = vaddq_s32(s0_hi, s4_hi); + t1_lo = vaddq_s32(s1_lo, s5_lo); + t1_hi = vaddq_s32(s1_hi, s5_hi); + t2_lo = vaddq_s32(s2_lo, s6_lo); + t2_hi = vaddq_s32(s2_hi, s6_hi); + t3_lo = vaddq_s32(s3_lo, s7_lo); + t3_hi = vaddq_s32(s3_hi, s7_hi); + t4_lo = vsubq_s32(s0_lo, s4_lo); + t4_hi = vsubq_s32(s0_hi, s4_hi); + t5_lo = vsubq_s32(s1_lo, s5_lo); + t5_hi = vsubq_s32(s1_hi, s5_hi); + t6_lo = vsubq_s32(s2_lo, s6_lo); + t6_hi = vsubq_s32(s2_hi, s6_hi); + t7_lo = vsubq_s32(s3_lo, s7_lo); + t7_hi = vsubq_s32(s3_hi, s7_hi); + + t0_lo = vaddq_s32(t0_lo, k__DCT_CONST_ROUNDING); + t0_hi = vaddq_s32(t0_hi, k__DCT_CONST_ROUNDING); + t1_lo = vaddq_s32(t1_lo, k__DCT_CONST_ROUNDING); + t1_hi = vaddq_s32(t1_hi, k__DCT_CONST_ROUNDING); + t2_lo = vaddq_s32(t2_lo, k__DCT_CONST_ROUNDING); + t2_hi = vaddq_s32(t2_hi, k__DCT_CONST_ROUNDING); + t3_lo = vaddq_s32(t3_lo, k__DCT_CONST_ROUNDING); + t3_hi = vaddq_s32(t3_hi, k__DCT_CONST_ROUNDING); + t4_lo = vaddq_s32(t4_lo, k__DCT_CONST_ROUNDING); + t4_hi = vaddq_s32(t4_hi, k__DCT_CONST_ROUNDING); + t5_lo = vaddq_s32(t5_lo, k__DCT_CONST_ROUNDING); + t5_hi = vaddq_s32(t5_hi, k__DCT_CONST_ROUNDING); + t6_lo = vaddq_s32(t6_lo, k__DCT_CONST_ROUNDING); + t6_hi = vaddq_s32(t6_hi, k__DCT_CONST_ROUNDING); + t7_lo = vaddq_s32(t7_lo, k__DCT_CONST_ROUNDING); + t7_hi = vaddq_s32(t7_hi, k__DCT_CONST_ROUNDING); + + t0_lo = vshrq_n_s32(t0_lo, DCT_CONST_BITS); + t0_hi = vshrq_n_s32(t0_hi, DCT_CONST_BITS); + t1_lo = vshrq_n_s32(t1_lo, DCT_CONST_BITS); + t1_hi = vshrq_n_s32(t1_hi, DCT_CONST_BITS); + t2_lo = vshrq_n_s32(t2_lo, DCT_CONST_BITS); + t2_hi = vshrq_n_s32(t2_hi, DCT_CONST_BITS); + t3_lo = vshrq_n_s32(t3_lo, DCT_CONST_BITS); + t3_hi = vshrq_n_s32(t3_hi, DCT_CONST_BITS); + t4_lo = vshrq_n_s32(t4_lo, DCT_CONST_BITS); + t4_hi = vshrq_n_s32(t4_hi, DCT_CONST_BITS); + t5_lo = vshrq_n_s32(t5_lo, DCT_CONST_BITS); + t5_hi = vshrq_n_s32(t5_hi, DCT_CONST_BITS); + t6_lo = vshrq_n_s32(t6_lo, DCT_CONST_BITS); + t6_hi = vshrq_n_s32(t6_hi, DCT_CONST_BITS); + t7_lo = vshrq_n_s32(t7_lo, DCT_CONST_BITS); + t7_hi = vshrq_n_s32(t7_hi, DCT_CONST_BITS); + + // stage 2 + s0_lo = t0_lo; + s0_hi = t0_hi; + s1_lo = t1_lo; + s1_hi = t1_hi; + s2_lo = t2_lo; + s2_hi = t2_hi; + s3_lo = t3_lo; + s3_hi = t3_hi; + s4_lo = vaddq_s32(vmulq_n_s32(t4_lo, cospi_8_64), + vmulq_n_s32(t5_lo, cospi_24_64)); + s4_hi = vaddq_s32(vmulq_n_s32(t4_hi, cospi_8_64), + vmulq_n_s32(t5_hi, cospi_24_64)); + s5_lo = vsubq_s32(vmulq_n_s32(t4_lo, cospi_24_64), + vmulq_n_s32(t5_lo, cospi_8_64)); + s5_hi = vsubq_s32(vmulq_n_s32(t4_hi, cospi_24_64), + vmulq_n_s32(t5_hi, cospi_8_64)); + s6_lo = vaddq_s32(vmulq_n_s32(t6_lo, -cospi_24_64), + vmulq_n_s32(t7_lo, cospi_8_64)); + s6_hi = vaddq_s32(vmulq_n_s32(t6_hi, -cospi_24_64), + vmulq_n_s32(t7_hi, cospi_8_64)); + s7_lo = vaddq_s32(vmulq_n_s32(t6_lo, cospi_8_64), + vmulq_n_s32(t7_lo, cospi_24_64)); + s7_hi = vaddq_s32(vmulq_n_s32(t6_hi, cospi_8_64), + vmulq_n_s32(t7_hi, cospi_24_64)); + + // s0 + s2 + t0_lo = vaddq_s32(s0_lo, s2_lo); + t0_hi = vaddq_s32(s0_hi, s2_hi); + // s1 + s3 + t1_lo = vaddq_s32(s1_lo, s3_lo); + t1_hi = vaddq_s32(s1_hi, s3_hi); + // s0 - s2 + t2_lo = vsubq_s32(s0_lo, s2_lo); + t2_hi = vsubq_s32(s0_hi, s2_hi); + // s1 - s3 + t3_lo = vsubq_s32(s1_lo, s3_lo); + t3_hi = vsubq_s32(s1_hi, s3_hi); + // s4 + s6 + t4_lo = vaddq_s32(s4_lo, s6_lo); + t4_hi = vaddq_s32(s4_hi, s6_hi); + // s5 + s7 + t5_lo = vaddq_s32(s5_lo, s7_lo); + t5_hi = vaddq_s32(s5_hi, s7_hi); + // s4 - s6 + t6_lo = vsubq_s32(s4_lo, s6_lo); + t6_hi = vsubq_s32(s4_hi, s6_hi); + // s5 - s7 + t7_lo = vsubq_s32(s5_lo, s7_lo); + t7_hi = vsubq_s32(s5_hi, s7_hi); + + // fdct_round_shift + t4_lo = vaddq_s32(t4_lo, k__DCT_CONST_ROUNDING); + t4_hi = vaddq_s32(t4_hi, k__DCT_CONST_ROUNDING); + t5_lo = vaddq_s32(t5_lo, k__DCT_CONST_ROUNDING); + t5_hi = vaddq_s32(t5_hi, k__DCT_CONST_ROUNDING); + t6_lo = vaddq_s32(t6_lo, k__DCT_CONST_ROUNDING); + t6_hi = vaddq_s32(t6_hi, k__DCT_CONST_ROUNDING); + t7_lo = vaddq_s32(t7_lo, k__DCT_CONST_ROUNDING); + t7_hi = vaddq_s32(t7_hi, k__DCT_CONST_ROUNDING); + t4_lo = vshrq_n_s32(t4_lo, DCT_CONST_BITS); + t4_hi = vshrq_n_s32(t4_hi, DCT_CONST_BITS); + t5_lo = vshrq_n_s32(t5_lo, DCT_CONST_BITS); + t5_hi = vshrq_n_s32(t5_hi, DCT_CONST_BITS); + t6_lo = vshrq_n_s32(t6_lo, DCT_CONST_BITS); + t6_hi = vshrq_n_s32(t6_hi, DCT_CONST_BITS); + t7_lo = vshrq_n_s32(t7_lo, DCT_CONST_BITS); + t7_hi = vshrq_n_s32(t7_hi, DCT_CONST_BITS); + + // stage 3 + // cospi_16_64 * (x2 + x3) + s2_lo = vmulq_n_s32(vaddq_s32(t2_lo, t3_lo), cospi_16_64); + s2_hi = vmulq_n_s32(vaddq_s32(t2_hi, t3_hi), cospi_16_64); + // cospi_16_64 * (x2 - x3) + s3_lo = vmulq_n_s32(vsubq_s32(t2_lo, t3_lo), cospi_16_64); + s3_hi = vmulq_n_s32(vsubq_s32(t2_hi, t3_hi), cospi_16_64); + // cospi_16_64 * (x6 + x7) + s6_lo = vmulq_n_s32(vaddq_s32(t6_lo, t7_lo), cospi_16_64); + s6_hi = vmulq_n_s32(vaddq_s32(t6_hi, t7_hi), cospi_16_64); + // cospi_16_64 * (x2 - x3) + s7_lo = vmulq_n_s32(vsubq_s32(t6_lo, t7_lo), cospi_16_64); + s7_hi = vmulq_n_s32(vsubq_s32(t6_hi, t7_hi), cospi_16_64); + + // final fdct_round_shift + t2_lo = vaddq_s32(s2_lo, k__DCT_CONST_ROUNDING); + t2_hi = vaddq_s32(s2_hi, k__DCT_CONST_ROUNDING); + t3_lo = vaddq_s32(s3_lo, k__DCT_CONST_ROUNDING); + t3_hi = vaddq_s32(s3_hi, k__DCT_CONST_ROUNDING); + t6_lo = vaddq_s32(s6_lo, k__DCT_CONST_ROUNDING); + t6_hi = vaddq_s32(s6_hi, k__DCT_CONST_ROUNDING); + t7_lo = vaddq_s32(s7_lo, k__DCT_CONST_ROUNDING); + t7_hi = vaddq_s32(s7_hi, k__DCT_CONST_ROUNDING); + + x2_lo = vshrn_n_s32(t2_lo, DCT_CONST_BITS); + x2_hi = vshrn_n_s32(t2_hi, DCT_CONST_BITS); + x3_lo = vshrn_n_s32(t3_lo, DCT_CONST_BITS); + x3_hi = vshrn_n_s32(t3_hi, DCT_CONST_BITS); + x6_lo = vshrn_n_s32(t6_lo, DCT_CONST_BITS); + x6_hi = vshrn_n_s32(t6_hi, DCT_CONST_BITS); + x7_lo = vshrn_n_s32(t7_lo, DCT_CONST_BITS); + x7_hi = vshrn_n_s32(t7_hi, DCT_CONST_BITS); + + // x0, x1, x4, x5 narrow down to 16-bits directly + x0_lo = vmovn_s32(t0_lo); + x0_hi = vmovn_s32(t0_hi); + x1_lo = vmovn_s32(t1_lo); + x1_hi = vmovn_s32(t1_hi); + x4_lo = vmovn_s32(t4_lo); + x4_hi = vmovn_s32(t4_hi); + x5_lo = vmovn_s32(t5_lo); + x5_hi = vmovn_s32(t5_hi); + + in[0] = vcombine_s16(x0_lo, x0_hi); + in[1] = vnegq_s16(vcombine_s16(x4_lo, x4_hi)); + in[2] = vcombine_s16(x6_lo, x6_hi); + in[3] = vnegq_s16(vcombine_s16(x2_lo, x2_hi)); + in[4] = vcombine_s16(x3_lo, x3_hi); + in[5] = vnegq_s16(vcombine_s16(x7_lo, x7_hi)); + in[6] = vcombine_s16(x5_lo, x5_hi); + in[7] = vnegq_s16(vcombine_s16(x1_lo, x1_hi)); + + transpose_s16_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6], + &in[7]); +} + +void vp9_fht8x8_neon(const int16_t *input, tran_low_t *output, int stride, + int tx_type) { + int16x8_t in[8]; + + switch (tx_type) { + case DCT_DCT: vpx_fdct8x8_neon(input, output, stride); break; + case ADST_DCT: + load_buffer_8x8(input, in, stride); + fadst8x8_neon(in); + vpx_fdct8x8_pass1_neon(in); + right_shift_8x8(in, 1); + write_buffer_8x8(output, in, 8); + break; + case DCT_ADST: + load_buffer_8x8(input, in, stride); + vpx_fdct8x8_pass1_neon(in); + fadst8x8_neon(in); + right_shift_8x8(in, 1); + write_buffer_8x8(output, in, 8); + break; + default: + assert(tx_type == ADST_ADST); + load_buffer_8x8(input, in, stride); + fadst8x8_neon(in); + fadst8x8_neon(in); + right_shift_8x8(in, 1); + write_buffer_8x8(output, in, 8); + break; + } +} + +static INLINE void load_buffer_16x16(const int16_t *input, int16x8_t *in0, + int16x8_t *in1, int stride) { + // load first 8 columns + load_buffer_8x8(input, in0, stride); + load_buffer_8x8(input + 8 * stride, in0 + 8, stride); + + input += 8; + // load second 8 columns + load_buffer_8x8(input, in1, stride); + load_buffer_8x8(input + 8 * stride, in1 + 8, stride); +} + +static INLINE void write_buffer_16x16(tran_low_t *output, int16x8_t *in0, + int16x8_t *in1, int stride) { + // write first 8 columns + write_buffer_8x8(output, in0, stride); + write_buffer_8x8(output + 8 * stride, in0 + 8, stride); + + // write second 8 columns + output += 8; + write_buffer_8x8(output, in1, stride); + write_buffer_8x8(output + 8 * stride, in1 + 8, stride); +} + +static INLINE void right_shift_16x16(int16x8_t *res0, int16x8_t *res1) { + // perform rounding operations + right_shift_8x8(res0, 2); + right_shift_8x8(res0 + 8, 2); + right_shift_8x8(res1, 2); + right_shift_8x8(res1 + 8, 2); +} + +static void fdct16_8col(int16x8_t *in) { + // perform 16x16 1-D DCT for 8 columns + int16x8_t i[8], s1[8], s2[8], s3[8], t[8]; + int16x4_t t_lo[8], t_hi[8]; + int32x4_t u_lo[8], u_hi[8]; + const int32x4_t k__DCT_CONST_ROUNDING = vdupq_n_s32(DCT_CONST_ROUNDING); + + // stage 1 + i[0] = vaddq_s16(in[0], in[15]); + i[1] = vaddq_s16(in[1], in[14]); + i[2] = vaddq_s16(in[2], in[13]); + i[3] = vaddq_s16(in[3], in[12]); + i[4] = vaddq_s16(in[4], in[11]); + i[5] = vaddq_s16(in[5], in[10]); + i[6] = vaddq_s16(in[6], in[9]); + i[7] = vaddq_s16(in[7], in[8]); + + vpx_fdct8x8_pass1_neon(i); + transpose_s16_8x8(&i[0], &i[1], &i[2], &i[3], &i[4], &i[5], &i[6], &i[7]); + + // step 2 + s1[0] = vsubq_s16(in[7], in[8]); + s1[1] = vsubq_s16(in[6], in[9]); + s1[2] = vsubq_s16(in[5], in[10]); + s1[3] = vsubq_s16(in[4], in[11]); + s1[4] = vsubq_s16(in[3], in[12]); + s1[5] = vsubq_s16(in[2], in[13]); + s1[6] = vsubq_s16(in[1], in[14]); + s1[7] = vsubq_s16(in[0], in[15]); + + t[2] = vsubq_s16(s1[5], s1[2]); + t[3] = vsubq_s16(s1[4], s1[3]); + t[4] = vaddq_s16(s1[4], s1[3]); + t[5] = vaddq_s16(s1[5], s1[2]); + + t_lo[2] = vget_low_s16(t[2]); + t_hi[2] = vget_high_s16(t[2]); + t_lo[3] = vget_low_s16(t[3]); + t_hi[3] = vget_high_s16(t[3]); + t_lo[4] = vget_low_s16(t[4]); + t_hi[4] = vget_high_s16(t[4]); + t_lo[5] = vget_low_s16(t[5]); + t_hi[5] = vget_high_s16(t[5]); + + u_lo[2] = vmull_n_s16(t_lo[2], cospi_16_64); + u_hi[2] = vmull_n_s16(t_hi[2], cospi_16_64); + u_lo[3] = vmull_n_s16(t_lo[3], cospi_16_64); + u_hi[3] = vmull_n_s16(t_hi[3], cospi_16_64); + u_lo[4] = vmull_n_s16(t_lo[4], cospi_16_64); + u_hi[4] = vmull_n_s16(t_hi[4], cospi_16_64); + u_lo[5] = vmull_n_s16(t_lo[5], cospi_16_64); + u_hi[5] = vmull_n_s16(t_hi[5], cospi_16_64); + + u_lo[2] = vaddq_s32(u_lo[2], k__DCT_CONST_ROUNDING); + u_hi[2] = vaddq_s32(u_hi[2], k__DCT_CONST_ROUNDING); + u_lo[3] = vaddq_s32(u_lo[3], k__DCT_CONST_ROUNDING); + u_hi[3] = vaddq_s32(u_hi[3], k__DCT_CONST_ROUNDING); + u_lo[4] = vaddq_s32(u_lo[4], k__DCT_CONST_ROUNDING); + u_hi[4] = vaddq_s32(u_hi[4], k__DCT_CONST_ROUNDING); + u_lo[5] = vaddq_s32(u_lo[5], k__DCT_CONST_ROUNDING); + u_hi[5] = vaddq_s32(u_hi[5], k__DCT_CONST_ROUNDING); + + t_lo[2] = vshrn_n_s32(u_lo[2], DCT_CONST_BITS); + t_hi[2] = vshrn_n_s32(u_hi[2], DCT_CONST_BITS); + t_lo[3] = vshrn_n_s32(u_lo[3], DCT_CONST_BITS); + t_hi[3] = vshrn_n_s32(u_hi[3], DCT_CONST_BITS); + t_lo[4] = vshrn_n_s32(u_lo[4], DCT_CONST_BITS); + t_hi[4] = vshrn_n_s32(u_hi[4], DCT_CONST_BITS); + t_lo[5] = vshrn_n_s32(u_lo[5], DCT_CONST_BITS); + t_hi[5] = vshrn_n_s32(u_hi[5], DCT_CONST_BITS); + + s2[2] = vcombine_s16(t_lo[2], t_hi[2]); + s2[3] = vcombine_s16(t_lo[3], t_hi[3]); + s2[4] = vcombine_s16(t_lo[4], t_hi[4]); + s2[5] = vcombine_s16(t_lo[5], t_hi[5]); + + // step 3 + s3[0] = vaddq_s16(s1[0], s2[3]); + s3[1] = vaddq_s16(s1[1], s2[2]); + s3[2] = vsubq_s16(s1[1], s2[2]); + s3[3] = vsubq_s16(s1[0], s2[3]); + s3[4] = vsubq_s16(s1[7], s2[4]); + s3[5] = vsubq_s16(s1[6], s2[5]); + s3[6] = vaddq_s16(s1[6], s2[5]); + s3[7] = vaddq_s16(s1[7], s2[4]); + + // step 4 + t_lo[0] = vget_low_s16(s3[0]); + t_hi[0] = vget_high_s16(s3[0]); + t_lo[1] = vget_low_s16(s3[1]); + t_hi[1] = vget_high_s16(s3[1]); + t_lo[2] = vget_low_s16(s3[2]); + t_hi[2] = vget_high_s16(s3[2]); + t_lo[3] = vget_low_s16(s3[3]); + t_hi[3] = vget_high_s16(s3[3]); + t_lo[4] = vget_low_s16(s3[4]); + t_hi[4] = vget_high_s16(s3[4]); + t_lo[5] = vget_low_s16(s3[5]); + t_hi[5] = vget_high_s16(s3[5]); + t_lo[6] = vget_low_s16(s3[6]); + t_hi[6] = vget_high_s16(s3[6]); + t_lo[7] = vget_low_s16(s3[7]); + t_hi[7] = vget_high_s16(s3[7]); + + u_lo[1] = vaddq_s32(vmull_n_s16(t_lo[1], -cospi_8_64), + vmull_n_s16(t_lo[6], cospi_24_64)); + u_hi[1] = vaddq_s32(vmull_n_s16(t_hi[1], -cospi_8_64), + vmull_n_s16(t_hi[6], cospi_24_64)); + u_lo[2] = vaddq_s32(vmull_n_s16(t_lo[2], cospi_24_64), + vmull_n_s16(t_lo[5], cospi_8_64)); + u_hi[2] = vaddq_s32(vmull_n_s16(t_hi[2], cospi_24_64), + vmull_n_s16(t_hi[5], cospi_8_64)); + u_lo[5] = vaddq_s32(vmull_n_s16(t_lo[2], cospi_8_64), + vmull_n_s16(t_lo[5], -cospi_24_64)); + u_hi[5] = vaddq_s32(vmull_n_s16(t_hi[2], cospi_8_64), + vmull_n_s16(t_hi[5], -cospi_24_64)); + u_lo[6] = vaddq_s32(vmull_n_s16(t_lo[1], cospi_24_64), + vmull_n_s16(t_lo[6], cospi_8_64)); + u_hi[6] = vaddq_s32(vmull_n_s16(t_hi[1], cospi_24_64), + vmull_n_s16(t_hi[6], cospi_8_64)); + + u_lo[1] = vaddq_s32(u_lo[1], k__DCT_CONST_ROUNDING); + u_hi[1] = vaddq_s32(u_hi[1], k__DCT_CONST_ROUNDING); + u_lo[2] = vaddq_s32(u_lo[2], k__DCT_CONST_ROUNDING); + u_hi[2] = vaddq_s32(u_hi[2], k__DCT_CONST_ROUNDING); + u_lo[5] = vaddq_s32(u_lo[5], k__DCT_CONST_ROUNDING); + u_hi[5] = vaddq_s32(u_hi[5], k__DCT_CONST_ROUNDING); + u_lo[6] = vaddq_s32(u_lo[6], k__DCT_CONST_ROUNDING); + u_hi[6] = vaddq_s32(u_hi[6], k__DCT_CONST_ROUNDING); + + t_lo[1] = vshrn_n_s32(u_lo[1], DCT_CONST_BITS); + t_hi[1] = vshrn_n_s32(u_hi[1], DCT_CONST_BITS); + t_lo[2] = vshrn_n_s32(u_lo[2], DCT_CONST_BITS); + t_hi[2] = vshrn_n_s32(u_hi[2], DCT_CONST_BITS); + t_lo[5] = vshrn_n_s32(u_lo[5], DCT_CONST_BITS); + t_hi[5] = vshrn_n_s32(u_hi[5], DCT_CONST_BITS); + t_lo[6] = vshrn_n_s32(u_lo[6], DCT_CONST_BITS); + t_hi[6] = vshrn_n_s32(u_hi[6], DCT_CONST_BITS); + + s2[1] = vcombine_s16(t_lo[1], t_hi[1]); + s2[2] = vcombine_s16(t_lo[2], t_hi[2]); + s2[5] = vcombine_s16(t_lo[5], t_hi[5]); + s2[6] = vcombine_s16(t_lo[6], t_hi[6]); + + // step 5 + s1[0] = vaddq_s16(s3[0], s2[1]); + s1[1] = vsubq_s16(s3[0], s2[1]); + s1[2] = vaddq_s16(s3[3], s2[2]); + s1[3] = vsubq_s16(s3[3], s2[2]); + s1[4] = vsubq_s16(s3[4], s2[5]); + s1[5] = vaddq_s16(s3[4], s2[5]); + s1[6] = vsubq_s16(s3[7], s2[6]); + s1[7] = vaddq_s16(s3[7], s2[6]); + + // step 6 + t_lo[0] = vget_low_s16(s1[0]); + t_hi[0] = vget_high_s16(s1[0]); + t_lo[1] = vget_low_s16(s1[1]); + t_hi[1] = vget_high_s16(s1[1]); + t_lo[2] = vget_low_s16(s1[2]); + t_hi[2] = vget_high_s16(s1[2]); + t_lo[3] = vget_low_s16(s1[3]); + t_hi[3] = vget_high_s16(s1[3]); + t_lo[4] = vget_low_s16(s1[4]); + t_hi[4] = vget_high_s16(s1[4]); + t_lo[5] = vget_low_s16(s1[5]); + t_hi[5] = vget_high_s16(s1[5]); + t_lo[6] = vget_low_s16(s1[6]); + t_hi[6] = vget_high_s16(s1[6]); + t_lo[7] = vget_low_s16(s1[7]); + t_hi[7] = vget_high_s16(s1[7]); + + // step1[0] * cospi_30_64 + step1[7] * cospi_2_64; + u_lo[0] = vaddq_s32(vmull_n_s16(t_lo[0], cospi_30_64), + vmull_n_s16(t_lo[7], cospi_2_64)); + u_hi[0] = vaddq_s32(vmull_n_s16(t_hi[0], cospi_30_64), + vmull_n_s16(t_hi[7], cospi_2_64)); + + // step1[1] * cospi_14_64 + step1[6] * cospi_18_64; + u_lo[1] = vaddq_s32(vmull_n_s16(t_lo[1], cospi_14_64), + vmull_n_s16(t_lo[6], cospi_18_64)); + u_hi[1] = vaddq_s32(vmull_n_s16(t_hi[1], cospi_14_64), + vmull_n_s16(t_hi[6], cospi_18_64)); + + // step1[2] * cospi_22_64 + step1[5] * cospi_10_64; + u_lo[2] = vaddq_s32(vmull_n_s16(t_lo[2], cospi_22_64), + vmull_n_s16(t_lo[5], cospi_10_64)); + u_hi[2] = vaddq_s32(vmull_n_s16(t_hi[2], cospi_22_64), + vmull_n_s16(t_hi[5], cospi_10_64)); + + // step1[3] * cospi_6_64 + step1[4] * cospi_26_64; + u_lo[3] = vaddq_s32(vmull_n_s16(t_lo[3], cospi_6_64), + vmull_n_s16(t_lo[4], cospi_26_64)); + u_hi[3] = vaddq_s32(vmull_n_s16(t_hi[3], cospi_6_64), + vmull_n_s16(t_hi[4], cospi_26_64)); + + // step1[3] * -cospi_26_64 + step1[4] * cospi_6_64; + u_lo[4] = vaddq_s32(vmull_n_s16(t_lo[3], -cospi_26_64), + vmull_n_s16(t_lo[4], cospi_6_64)); + u_hi[4] = vaddq_s32(vmull_n_s16(t_hi[3], -cospi_26_64), + vmull_n_s16(t_hi[4], cospi_6_64)); + + // step1[2] * -cospi_10_64 + step1[5] * cospi_22_64; + u_lo[5] = vaddq_s32(vmull_n_s16(t_lo[2], -cospi_10_64), + vmull_n_s16(t_lo[5], cospi_22_64)); + u_hi[5] = vaddq_s32(vmull_n_s16(t_hi[2], -cospi_10_64), + vmull_n_s16(t_hi[5], cospi_22_64)); + + // step1[1] * -cospi_18_64 + step1[6] * cospi_14_64; + u_lo[6] = vaddq_s32(vmull_n_s16(t_lo[1], -cospi_18_64), + vmull_n_s16(t_lo[6], cospi_14_64)); + u_hi[6] = vaddq_s32(vmull_n_s16(t_hi[1], -cospi_18_64), + vmull_n_s16(t_hi[6], cospi_14_64)); + + // step1[0] * -cospi_2_64 + step1[7] * cospi_30_64; + u_lo[7] = vaddq_s32(vmull_n_s16(t_lo[0], -cospi_2_64), + vmull_n_s16(t_lo[7], cospi_30_64)); + u_hi[7] = vaddq_s32(vmull_n_s16(t_hi[0], -cospi_2_64), + vmull_n_s16(t_hi[7], cospi_30_64)); + + // final fdct_round_shift + u_lo[0] = vaddq_s32(u_lo[0], k__DCT_CONST_ROUNDING); + u_hi[0] = vaddq_s32(u_hi[0], k__DCT_CONST_ROUNDING); + u_lo[1] = vaddq_s32(u_lo[1], k__DCT_CONST_ROUNDING); + u_hi[1] = vaddq_s32(u_hi[1], k__DCT_CONST_ROUNDING); + u_lo[2] = vaddq_s32(u_lo[2], k__DCT_CONST_ROUNDING); + u_hi[2] = vaddq_s32(u_hi[2], k__DCT_CONST_ROUNDING); + u_lo[3] = vaddq_s32(u_lo[3], k__DCT_CONST_ROUNDING); + u_hi[3] = vaddq_s32(u_hi[3], k__DCT_CONST_ROUNDING); + u_lo[4] = vaddq_s32(u_lo[4], k__DCT_CONST_ROUNDING); + u_hi[4] = vaddq_s32(u_hi[4], k__DCT_CONST_ROUNDING); + u_lo[5] = vaddq_s32(u_lo[5], k__DCT_CONST_ROUNDING); + u_hi[5] = vaddq_s32(u_hi[5], k__DCT_CONST_ROUNDING); + u_lo[6] = vaddq_s32(u_lo[6], k__DCT_CONST_ROUNDING); + u_hi[6] = vaddq_s32(u_hi[6], k__DCT_CONST_ROUNDING); + u_lo[7] = vaddq_s32(u_lo[7], k__DCT_CONST_ROUNDING); + u_hi[7] = vaddq_s32(u_hi[7], k__DCT_CONST_ROUNDING); + + t_lo[0] = vshrn_n_s32(u_lo[0], DCT_CONST_BITS); + t_hi[0] = vshrn_n_s32(u_hi[0], DCT_CONST_BITS); + t_lo[1] = vshrn_n_s32(u_lo[1], DCT_CONST_BITS); + t_hi[1] = vshrn_n_s32(u_hi[1], DCT_CONST_BITS); + t_lo[2] = vshrn_n_s32(u_lo[2], DCT_CONST_BITS); + t_hi[2] = vshrn_n_s32(u_hi[2], DCT_CONST_BITS); + t_lo[3] = vshrn_n_s32(u_lo[3], DCT_CONST_BITS); + t_hi[3] = vshrn_n_s32(u_hi[3], DCT_CONST_BITS); + t_lo[4] = vshrn_n_s32(u_lo[4], DCT_CONST_BITS); + t_hi[4] = vshrn_n_s32(u_hi[4], DCT_CONST_BITS); + t_lo[5] = vshrn_n_s32(u_lo[5], DCT_CONST_BITS); + t_hi[5] = vshrn_n_s32(u_hi[5], DCT_CONST_BITS); + t_lo[6] = vshrn_n_s32(u_lo[6], DCT_CONST_BITS); + t_hi[6] = vshrn_n_s32(u_hi[6], DCT_CONST_BITS); + t_lo[7] = vshrn_n_s32(u_lo[7], DCT_CONST_BITS); + t_hi[7] = vshrn_n_s32(u_hi[7], DCT_CONST_BITS); + + in[0] = i[0]; + in[2] = i[1]; + in[4] = i[2]; + in[6] = i[3]; + in[8] = i[4]; + in[10] = i[5]; + in[12] = i[6]; + in[14] = i[7]; + in[1] = vcombine_s16(t_lo[0], t_hi[0]); + in[3] = vcombine_s16(t_lo[4], t_hi[4]); + in[5] = vcombine_s16(t_lo[2], t_hi[2]); + in[7] = vcombine_s16(t_lo[6], t_hi[6]); + in[9] = vcombine_s16(t_lo[1], t_hi[1]); + in[11] = vcombine_s16(t_lo[5], t_hi[5]); + in[13] = vcombine_s16(t_lo[3], t_hi[3]); + in[15] = vcombine_s16(t_lo[7], t_hi[7]); +} + +static void fadst16_8col(int16x8_t *in) { + // perform 16x16 1-D ADST for 8 columns + int16x4_t x_lo[16], x_hi[16]; + int32x4_t s_lo[16], s_hi[16]; + int32x4_t t_lo[16], t_hi[16]; + const int32x4_t k__DCT_CONST_ROUNDING = vdupq_n_s32(DCT_CONST_ROUNDING); + + x_lo[0] = vget_low_s16(in[15]); + x_hi[0] = vget_high_s16(in[15]); + x_lo[1] = vget_low_s16(in[0]); + x_hi[1] = vget_high_s16(in[0]); + x_lo[2] = vget_low_s16(in[13]); + x_hi[2] = vget_high_s16(in[13]); + x_lo[3] = vget_low_s16(in[2]); + x_hi[3] = vget_high_s16(in[2]); + x_lo[4] = vget_low_s16(in[11]); + x_hi[4] = vget_high_s16(in[11]); + x_lo[5] = vget_low_s16(in[4]); + x_hi[5] = vget_high_s16(in[4]); + x_lo[6] = vget_low_s16(in[9]); + x_hi[6] = vget_high_s16(in[9]); + x_lo[7] = vget_low_s16(in[6]); + x_hi[7] = vget_high_s16(in[6]); + x_lo[8] = vget_low_s16(in[7]); + x_hi[8] = vget_high_s16(in[7]); + x_lo[9] = vget_low_s16(in[8]); + x_hi[9] = vget_high_s16(in[8]); + x_lo[10] = vget_low_s16(in[5]); + x_hi[10] = vget_high_s16(in[5]); + x_lo[11] = vget_low_s16(in[10]); + x_hi[11] = vget_high_s16(in[10]); + x_lo[12] = vget_low_s16(in[3]); + x_hi[12] = vget_high_s16(in[3]); + x_lo[13] = vget_low_s16(in[12]); + x_hi[13] = vget_high_s16(in[12]); + x_lo[14] = vget_low_s16(in[1]); + x_hi[14] = vget_high_s16(in[1]); + x_lo[15] = vget_low_s16(in[14]); + x_hi[15] = vget_high_s16(in[14]); + + // stage 1 + // s0 = cospi_1_64 * x0 + cospi_31_64 * x1; + s_lo[0] = vaddq_s32(vmull_n_s16(x_lo[0], cospi_1_64), + vmull_n_s16(x_lo[1], cospi_31_64)); + s_hi[0] = vaddq_s32(vmull_n_s16(x_hi[0], cospi_1_64), + vmull_n_s16(x_hi[1], cospi_31_64)); + // s1 = cospi_31_64 * x0 - cospi_1_64 * x1; + s_lo[1] = vsubq_s32(vmull_n_s16(x_lo[0], cospi_31_64), + vmull_n_s16(x_lo[1], cospi_1_64)); + s_hi[1] = vsubq_s32(vmull_n_s16(x_hi[0], cospi_31_64), + vmull_n_s16(x_hi[1], cospi_1_64)); + // s2 = cospi_5_64 * x2 + cospi_27_64 * x3; + s_lo[2] = vaddq_s32(vmull_n_s16(x_lo[2], cospi_5_64), + vmull_n_s16(x_lo[3], cospi_27_64)); + s_hi[2] = vaddq_s32(vmull_n_s16(x_hi[2], cospi_5_64), + vmull_n_s16(x_hi[3], cospi_27_64)); + // s3 = cospi_27_64 * x2 - cospi_5_64 * x3; + s_lo[3] = vsubq_s32(vmull_n_s16(x_lo[2], cospi_27_64), + vmull_n_s16(x_lo[3], cospi_5_64)); + s_hi[3] = vsubq_s32(vmull_n_s16(x_hi[2], cospi_27_64), + vmull_n_s16(x_hi[3], cospi_5_64)); + // s4 = cospi_9_64 * x4 + cospi_23_64 * x5; + s_lo[4] = vaddq_s32(vmull_n_s16(x_lo[4], cospi_9_64), + vmull_n_s16(x_lo[5], cospi_23_64)); + s_hi[4] = vaddq_s32(vmull_n_s16(x_hi[4], cospi_9_64), + vmull_n_s16(x_hi[5], cospi_23_64)); + // s5 = cospi_23_64 * x4 - cospi_9_64 * x5; + s_lo[5] = vsubq_s32(vmull_n_s16(x_lo[4], cospi_23_64), + vmull_n_s16(x_lo[5], cospi_9_64)); + s_hi[5] = vsubq_s32(vmull_n_s16(x_hi[4], cospi_23_64), + vmull_n_s16(x_hi[5], cospi_9_64)); + // s6 = cospi_13_64 * x6 + cospi_19_64 * x7; + s_lo[6] = vaddq_s32(vmull_n_s16(x_lo[6], cospi_13_64), + vmull_n_s16(x_lo[7], cospi_19_64)); + s_hi[6] = vaddq_s32(vmull_n_s16(x_hi[6], cospi_13_64), + vmull_n_s16(x_hi[7], cospi_19_64)); + // s7 = cospi_19_64 * x6 - cospi_13_64 * x7; + s_lo[7] = vsubq_s32(vmull_n_s16(x_lo[6], cospi_19_64), + vmull_n_s16(x_lo[7], cospi_13_64)); + s_hi[7] = vsubq_s32(vmull_n_s16(x_hi[6], cospi_19_64), + vmull_n_s16(x_hi[7], cospi_13_64)); + // s8 = cospi_17_64 * x8 + cospi_15_64 * x9; + s_lo[8] = vaddq_s32(vmull_n_s16(x_lo[8], cospi_17_64), + vmull_n_s16(x_lo[9], cospi_15_64)); + s_hi[8] = vaddq_s32(vmull_n_s16(x_hi[8], cospi_17_64), + vmull_n_s16(x_hi[9], cospi_15_64)); + // s9 = cospi_15_64 * x8 - cospi_17_64 * x9; + s_lo[9] = vsubq_s32(vmull_n_s16(x_lo[8], cospi_15_64), + vmull_n_s16(x_lo[9], cospi_17_64)); + s_hi[9] = vsubq_s32(vmull_n_s16(x_hi[8], cospi_15_64), + vmull_n_s16(x_hi[9], cospi_17_64)); + // s10 = cospi_21_64 * x10 + cospi_11_64 * x11; + s_lo[10] = vaddq_s32(vmull_n_s16(x_lo[10], cospi_21_64), + vmull_n_s16(x_lo[11], cospi_11_64)); + s_hi[10] = vaddq_s32(vmull_n_s16(x_hi[10], cospi_21_64), + vmull_n_s16(x_hi[11], cospi_11_64)); + // s11 = cospi_11_64 * x10 - cospi_21_64 * x11; + s_lo[11] = vsubq_s32(vmull_n_s16(x_lo[10], cospi_11_64), + vmull_n_s16(x_lo[11], cospi_21_64)); + s_hi[11] = vsubq_s32(vmull_n_s16(x_hi[10], cospi_11_64), + vmull_n_s16(x_hi[11], cospi_21_64)); + // s12 = cospi_25_64 * x12 + cospi_7_64 * x13; + s_lo[12] = vaddq_s32(vmull_n_s16(x_lo[12], cospi_25_64), + vmull_n_s16(x_lo[13], cospi_7_64)); + s_hi[12] = vaddq_s32(vmull_n_s16(x_hi[12], cospi_25_64), + vmull_n_s16(x_hi[13], cospi_7_64)); + // s13 = cospi_7_64 * x12 - cospi_25_64 * x13; + s_lo[13] = vsubq_s32(vmull_n_s16(x_lo[12], cospi_7_64), + vmull_n_s16(x_lo[13], cospi_25_64)); + s_hi[13] = vsubq_s32(vmull_n_s16(x_hi[12], cospi_7_64), + vmull_n_s16(x_hi[13], cospi_25_64)); + // s14 = cospi_29_64 * x14 + cospi_3_64 * x15; + s_lo[14] = vaddq_s32(vmull_n_s16(x_lo[14], cospi_29_64), + vmull_n_s16(x_lo[15], cospi_3_64)); + s_hi[14] = vaddq_s32(vmull_n_s16(x_hi[14], cospi_29_64), + vmull_n_s16(x_hi[15], cospi_3_64)); + // s15 = cospi_3_64 * x14 - cospi_29_64 * x15; + s_lo[15] = vsubq_s32(vmull_n_s16(x_lo[14], cospi_3_64), + vmull_n_s16(x_lo[15], cospi_29_64)); + s_hi[15] = vsubq_s32(vmull_n_s16(x_hi[14], cospi_3_64), + vmull_n_s16(x_hi[15], cospi_29_64)); + + // fdct_round_shift + t_lo[0] = vaddq_s32(s_lo[0], s_lo[8]); + t_hi[0] = vaddq_s32(s_hi[0], s_hi[8]); + t_lo[1] = vaddq_s32(s_lo[1], s_lo[9]); + t_hi[1] = vaddq_s32(s_hi[1], s_hi[9]); + t_lo[2] = vaddq_s32(s_lo[2], s_lo[10]); + t_hi[2] = vaddq_s32(s_hi[2], s_hi[10]); + t_lo[3] = vaddq_s32(s_lo[3], s_lo[11]); + t_hi[3] = vaddq_s32(s_hi[3], s_hi[11]); + t_lo[4] = vaddq_s32(s_lo[4], s_lo[12]); + t_hi[4] = vaddq_s32(s_hi[4], s_hi[12]); + t_lo[5] = vaddq_s32(s_lo[5], s_lo[13]); + t_hi[5] = vaddq_s32(s_hi[5], s_hi[13]); + t_lo[6] = vaddq_s32(s_lo[6], s_lo[14]); + t_hi[6] = vaddq_s32(s_hi[6], s_hi[14]); + t_lo[7] = vaddq_s32(s_lo[7], s_lo[15]); + t_hi[7] = vaddq_s32(s_hi[7], s_hi[15]); + t_lo[8] = vsubq_s32(s_lo[0], s_lo[8]); + t_hi[8] = vsubq_s32(s_hi[0], s_hi[8]); + t_lo[9] = vsubq_s32(s_lo[1], s_lo[9]); + t_hi[9] = vsubq_s32(s_hi[1], s_hi[9]); + t_lo[10] = vsubq_s32(s_lo[2], s_lo[10]); + t_hi[10] = vsubq_s32(s_hi[2], s_hi[10]); + t_lo[11] = vsubq_s32(s_lo[3], s_lo[11]); + t_hi[11] = vsubq_s32(s_hi[3], s_hi[11]); + t_lo[12] = vsubq_s32(s_lo[4], s_lo[12]); + t_hi[12] = vsubq_s32(s_hi[4], s_hi[12]); + t_lo[13] = vsubq_s32(s_lo[5], s_lo[13]); + t_hi[13] = vsubq_s32(s_hi[5], s_hi[13]); + t_lo[14] = vsubq_s32(s_lo[6], s_lo[14]); + t_hi[14] = vsubq_s32(s_hi[6], s_hi[14]); + t_lo[15] = vsubq_s32(s_lo[7], s_lo[15]); + t_hi[15] = vsubq_s32(s_hi[7], s_hi[15]); + + t_lo[0] = vaddq_s32(t_lo[0], k__DCT_CONST_ROUNDING); + t_hi[0] = vaddq_s32(t_hi[0], k__DCT_CONST_ROUNDING); + t_lo[1] = vaddq_s32(t_lo[1], k__DCT_CONST_ROUNDING); + t_hi[1] = vaddq_s32(t_hi[1], k__DCT_CONST_ROUNDING); + t_lo[2] = vaddq_s32(t_lo[2], k__DCT_CONST_ROUNDING); + t_hi[2] = vaddq_s32(t_hi[2], k__DCT_CONST_ROUNDING); + t_lo[3] = vaddq_s32(t_lo[3], k__DCT_CONST_ROUNDING); + t_hi[3] = vaddq_s32(t_hi[3], k__DCT_CONST_ROUNDING); + t_lo[4] = vaddq_s32(t_lo[4], k__DCT_CONST_ROUNDING); + t_hi[4] = vaddq_s32(t_hi[4], k__DCT_CONST_ROUNDING); + t_lo[5] = vaddq_s32(t_lo[5], k__DCT_CONST_ROUNDING); + t_hi[5] = vaddq_s32(t_hi[5], k__DCT_CONST_ROUNDING); + t_lo[6] = vaddq_s32(t_lo[6], k__DCT_CONST_ROUNDING); + t_hi[6] = vaddq_s32(t_hi[6], k__DCT_CONST_ROUNDING); + t_lo[7] = vaddq_s32(t_lo[7], k__DCT_CONST_ROUNDING); + t_hi[7] = vaddq_s32(t_hi[7], k__DCT_CONST_ROUNDING); + t_lo[8] = vaddq_s32(t_lo[8], k__DCT_CONST_ROUNDING); + t_hi[8] = vaddq_s32(t_hi[8], k__DCT_CONST_ROUNDING); + t_lo[9] = vaddq_s32(t_lo[9], k__DCT_CONST_ROUNDING); + t_hi[9] = vaddq_s32(t_hi[9], k__DCT_CONST_ROUNDING); + t_lo[10] = vaddq_s32(t_lo[10], k__DCT_CONST_ROUNDING); + t_hi[10] = vaddq_s32(t_hi[10], k__DCT_CONST_ROUNDING); + t_lo[11] = vaddq_s32(t_lo[11], k__DCT_CONST_ROUNDING); + t_hi[11] = vaddq_s32(t_hi[11], k__DCT_CONST_ROUNDING); + t_lo[12] = vaddq_s32(t_lo[12], k__DCT_CONST_ROUNDING); + t_hi[12] = vaddq_s32(t_hi[12], k__DCT_CONST_ROUNDING); + t_lo[13] = vaddq_s32(t_lo[13], k__DCT_CONST_ROUNDING); + t_hi[13] = vaddq_s32(t_hi[13], k__DCT_CONST_ROUNDING); + t_lo[14] = vaddq_s32(t_lo[14], k__DCT_CONST_ROUNDING); + t_hi[14] = vaddq_s32(t_hi[14], k__DCT_CONST_ROUNDING); + t_lo[15] = vaddq_s32(t_lo[15], k__DCT_CONST_ROUNDING); + t_hi[15] = vaddq_s32(t_hi[15], k__DCT_CONST_ROUNDING); + + t_lo[0] = vshrq_n_s32(t_lo[0], DCT_CONST_BITS); + t_hi[0] = vshrq_n_s32(t_hi[0], DCT_CONST_BITS); + t_lo[1] = vshrq_n_s32(t_lo[1], DCT_CONST_BITS); + t_hi[1] = vshrq_n_s32(t_hi[1], DCT_CONST_BITS); + t_lo[2] = vshrq_n_s32(t_lo[2], DCT_CONST_BITS); + t_hi[2] = vshrq_n_s32(t_hi[2], DCT_CONST_BITS); + t_lo[3] = vshrq_n_s32(t_lo[3], DCT_CONST_BITS); + t_hi[3] = vshrq_n_s32(t_hi[3], DCT_CONST_BITS); + t_lo[4] = vshrq_n_s32(t_lo[4], DCT_CONST_BITS); + t_hi[4] = vshrq_n_s32(t_hi[4], DCT_CONST_BITS); + t_lo[5] = vshrq_n_s32(t_lo[5], DCT_CONST_BITS); + t_hi[5] = vshrq_n_s32(t_hi[5], DCT_CONST_BITS); + t_lo[6] = vshrq_n_s32(t_lo[6], DCT_CONST_BITS); + t_hi[6] = vshrq_n_s32(t_hi[6], DCT_CONST_BITS); + t_lo[7] = vshrq_n_s32(t_lo[7], DCT_CONST_BITS); + t_hi[7] = vshrq_n_s32(t_hi[7], DCT_CONST_BITS); + t_lo[8] = vshrq_n_s32(t_lo[8], DCT_CONST_BITS); + t_hi[8] = vshrq_n_s32(t_hi[8], DCT_CONST_BITS); + t_lo[9] = vshrq_n_s32(t_lo[9], DCT_CONST_BITS); + t_hi[9] = vshrq_n_s32(t_hi[9], DCT_CONST_BITS); + t_lo[10] = vshrq_n_s32(t_lo[10], DCT_CONST_BITS); + t_hi[10] = vshrq_n_s32(t_hi[10], DCT_CONST_BITS); + t_lo[11] = vshrq_n_s32(t_lo[11], DCT_CONST_BITS); + t_hi[11] = vshrq_n_s32(t_hi[11], DCT_CONST_BITS); + t_lo[12] = vshrq_n_s32(t_lo[12], DCT_CONST_BITS); + t_hi[12] = vshrq_n_s32(t_hi[12], DCT_CONST_BITS); + t_lo[13] = vshrq_n_s32(t_lo[13], DCT_CONST_BITS); + t_hi[13] = vshrq_n_s32(t_hi[13], DCT_CONST_BITS); + t_lo[14] = vshrq_n_s32(t_lo[14], DCT_CONST_BITS); + t_hi[14] = vshrq_n_s32(t_hi[14], DCT_CONST_BITS); + t_lo[15] = vshrq_n_s32(t_lo[15], DCT_CONST_BITS); + t_hi[15] = vshrq_n_s32(t_hi[15], DCT_CONST_BITS); + + // stage 2 + s_lo[0] = t_lo[0]; + s_hi[0] = t_hi[0]; + s_lo[1] = t_lo[1]; + s_hi[1] = t_hi[1]; + s_lo[2] = t_lo[2]; + s_hi[2] = t_hi[2]; + s_lo[3] = t_lo[3]; + s_hi[3] = t_hi[3]; + s_lo[4] = t_lo[4]; + s_hi[4] = t_hi[4]; + s_lo[5] = t_lo[5]; + s_hi[5] = t_hi[5]; + s_lo[6] = t_lo[6]; + s_hi[6] = t_hi[6]; + s_lo[7] = t_lo[7]; + s_hi[7] = t_hi[7]; + // s8 = x8 * cospi_4_64 + x9 * cospi_28_64; + s_lo[8] = vaddq_s32(vmulq_n_s32(t_lo[8], cospi_4_64), + vmulq_n_s32(t_lo[9], cospi_28_64)); + s_hi[8] = vaddq_s32(vmulq_n_s32(t_hi[8], cospi_4_64), + vmulq_n_s32(t_hi[9], cospi_28_64)); + // s9 = x8 * cospi_28_64 - x9 * cospi_4_64; + s_lo[9] = vsubq_s32(vmulq_n_s32(t_lo[8], cospi_28_64), + vmulq_n_s32(t_lo[9], cospi_4_64)); + s_hi[9] = vsubq_s32(vmulq_n_s32(t_hi[8], cospi_28_64), + vmulq_n_s32(t_hi[9], cospi_4_64)); + // s10 = x10 * cospi_20_64 + x11 * cospi_12_64; + s_lo[10] = vaddq_s32(vmulq_n_s32(t_lo[10], cospi_20_64), + vmulq_n_s32(t_lo[11], cospi_12_64)); + s_hi[10] = vaddq_s32(vmulq_n_s32(t_hi[10], cospi_20_64), + vmulq_n_s32(t_hi[11], cospi_12_64)); + // s11 = x10 * cospi_12_64 - x11 * cospi_20_64; + s_lo[11] = vsubq_s32(vmulq_n_s32(t_lo[10], cospi_12_64), + vmulq_n_s32(t_lo[11], cospi_20_64)); + s_hi[11] = vsubq_s32(vmulq_n_s32(t_hi[10], cospi_12_64), + vmulq_n_s32(t_hi[11], cospi_20_64)); + // s12 = -x12 * cospi_28_64 + x13 * cospi_4_64; + s_lo[12] = vaddq_s32(vmulq_n_s32(t_lo[12], -cospi_28_64), + vmulq_n_s32(t_lo[13], cospi_4_64)); + s_hi[12] = vaddq_s32(vmulq_n_s32(t_hi[12], -cospi_28_64), + vmulq_n_s32(t_hi[13], cospi_4_64)); + // s13 = x12 * cospi_4_64 + x13 * cospi_28_64; + s_lo[13] = vaddq_s32(vmulq_n_s32(t_lo[12], cospi_4_64), + vmulq_n_s32(t_lo[13], cospi_28_64)); + s_hi[13] = vaddq_s32(vmulq_n_s32(t_hi[12], cospi_4_64), + vmulq_n_s32(t_hi[13], cospi_28_64)); + // s14 = -x14 * cospi_12_64 + x15 * cospi_20_64; + s_lo[14] = vaddq_s32(vmulq_n_s32(t_lo[14], -cospi_12_64), + vmulq_n_s32(t_lo[15], cospi_20_64)); + s_hi[14] = vaddq_s32(vmulq_n_s32(t_hi[14], -cospi_12_64), + vmulq_n_s32(t_hi[15], cospi_20_64)); + // s15 = x14 * cospi_20_64 + x15 * cospi_12_64; + s_lo[15] = vaddq_s32(vmulq_n_s32(t_lo[14], cospi_20_64), + vmulq_n_s32(t_lo[15], cospi_12_64)); + s_hi[15] = vaddq_s32(vmulq_n_s32(t_hi[14], cospi_20_64), + vmulq_n_s32(t_hi[15], cospi_12_64)); + + // s0 + s4 + t_lo[0] = vaddq_s32(s_lo[0], s_lo[4]); + t_hi[0] = vaddq_s32(s_hi[0], s_hi[4]); + // s1 + s5 + t_lo[1] = vaddq_s32(s_lo[1], s_lo[5]); + t_hi[1] = vaddq_s32(s_hi[1], s_hi[5]); + // s2 + s6 + t_lo[2] = vaddq_s32(s_lo[2], s_lo[6]); + t_hi[2] = vaddq_s32(s_hi[2], s_hi[6]); + // s3 + s7 + t_lo[3] = vaddq_s32(s_lo[3], s_lo[7]); + t_hi[3] = vaddq_s32(s_hi[3], s_hi[7]); + // s0 - s4 + t_lo[4] = vsubq_s32(s_lo[0], s_lo[4]); + t_hi[4] = vsubq_s32(s_hi[0], s_hi[4]); + // s1 - s7 + t_lo[5] = vsubq_s32(s_lo[1], s_lo[5]); + t_hi[5] = vsubq_s32(s_hi[1], s_hi[5]); + // s2 - s6 + t_lo[6] = vsubq_s32(s_lo[2], s_lo[6]); + t_hi[6] = vsubq_s32(s_hi[2], s_hi[6]); + // s3 - s7 + t_lo[7] = vsubq_s32(s_lo[3], s_lo[7]); + t_hi[7] = vsubq_s32(s_hi[3], s_hi[7]); + // s8 + s12 + t_lo[8] = vaddq_s32(s_lo[8], s_lo[12]); + t_hi[8] = vaddq_s32(s_hi[8], s_hi[12]); + // s9 + s13 + t_lo[9] = vaddq_s32(s_lo[9], s_lo[13]); + t_hi[9] = vaddq_s32(s_hi[9], s_hi[13]); + // s10 + s14 + t_lo[10] = vaddq_s32(s_lo[10], s_lo[14]); + t_hi[10] = vaddq_s32(s_hi[10], s_hi[14]); + // s11 + s15 + t_lo[11] = vaddq_s32(s_lo[11], s_lo[15]); + t_hi[11] = vaddq_s32(s_hi[11], s_hi[15]); + // s8 + s12 + t_lo[12] = vsubq_s32(s_lo[8], s_lo[12]); + t_hi[12] = vsubq_s32(s_hi[8], s_hi[12]); + // s9 + s13 + t_lo[13] = vsubq_s32(s_lo[9], s_lo[13]); + t_hi[13] = vsubq_s32(s_hi[9], s_hi[13]); + // s10 + s14 + t_lo[14] = vsubq_s32(s_lo[10], s_lo[14]); + t_hi[14] = vsubq_s32(s_hi[10], s_hi[14]); + // s11 + s15 + t_lo[15] = vsubq_s32(s_lo[11], s_lo[15]); + t_hi[15] = vsubq_s32(s_hi[11], s_hi[15]); + + t_lo[8] = vaddq_s32(t_lo[8], k__DCT_CONST_ROUNDING); + t_hi[8] = vaddq_s32(t_hi[8], k__DCT_CONST_ROUNDING); + t_lo[9] = vaddq_s32(t_lo[9], k__DCT_CONST_ROUNDING); + t_hi[9] = vaddq_s32(t_hi[9], k__DCT_CONST_ROUNDING); + t_lo[10] = vaddq_s32(t_lo[10], k__DCT_CONST_ROUNDING); + t_hi[10] = vaddq_s32(t_hi[10], k__DCT_CONST_ROUNDING); + t_lo[11] = vaddq_s32(t_lo[11], k__DCT_CONST_ROUNDING); + t_hi[11] = vaddq_s32(t_hi[11], k__DCT_CONST_ROUNDING); + t_lo[12] = vaddq_s32(t_lo[12], k__DCT_CONST_ROUNDING); + t_hi[12] = vaddq_s32(t_hi[12], k__DCT_CONST_ROUNDING); + t_lo[13] = vaddq_s32(t_lo[13], k__DCT_CONST_ROUNDING); + t_hi[13] = vaddq_s32(t_hi[13], k__DCT_CONST_ROUNDING); + t_lo[14] = vaddq_s32(t_lo[14], k__DCT_CONST_ROUNDING); + t_hi[14] = vaddq_s32(t_hi[14], k__DCT_CONST_ROUNDING); + t_lo[15] = vaddq_s32(t_lo[15], k__DCT_CONST_ROUNDING); + t_hi[15] = vaddq_s32(t_hi[15], k__DCT_CONST_ROUNDING); + t_lo[8] = vshrq_n_s32(t_lo[8], DCT_CONST_BITS); + t_hi[8] = vshrq_n_s32(t_hi[8], DCT_CONST_BITS); + t_lo[9] = vshrq_n_s32(t_lo[9], DCT_CONST_BITS); + t_hi[9] = vshrq_n_s32(t_hi[9], DCT_CONST_BITS); + t_lo[10] = vshrq_n_s32(t_lo[10], DCT_CONST_BITS); + t_hi[10] = vshrq_n_s32(t_hi[10], DCT_CONST_BITS); + t_lo[11] = vshrq_n_s32(t_lo[11], DCT_CONST_BITS); + t_hi[11] = vshrq_n_s32(t_hi[11], DCT_CONST_BITS); + t_lo[12] = vshrq_n_s32(t_lo[12], DCT_CONST_BITS); + t_hi[12] = vshrq_n_s32(t_hi[12], DCT_CONST_BITS); + t_lo[13] = vshrq_n_s32(t_lo[13], DCT_CONST_BITS); + t_hi[13] = vshrq_n_s32(t_hi[13], DCT_CONST_BITS); + t_lo[14] = vshrq_n_s32(t_lo[14], DCT_CONST_BITS); + t_hi[14] = vshrq_n_s32(t_hi[14], DCT_CONST_BITS); + t_lo[15] = vshrq_n_s32(t_lo[15], DCT_CONST_BITS); + t_hi[15] = vshrq_n_s32(t_hi[15], DCT_CONST_BITS); + + // stage 3 + s_lo[0] = t_lo[0]; + s_hi[0] = t_hi[0]; + s_lo[1] = t_lo[1]; + s_hi[1] = t_hi[1]; + s_lo[2] = t_lo[2]; + s_hi[2] = t_hi[2]; + s_lo[3] = t_lo[3]; + s_hi[3] = t_hi[3]; + // s4 = x4 * cospi_8_64 + x5 * cospi_24_64; + s_lo[4] = vaddq_s32(vmulq_n_s32(t_lo[4], cospi_8_64), + vmulq_n_s32(t_lo[5], cospi_24_64)); + s_hi[4] = vaddq_s32(vmulq_n_s32(t_hi[4], cospi_8_64), + vmulq_n_s32(t_hi[5], cospi_24_64)); + // s5 = x4 * cospi_24_64 - x5 * cospi_8_64; + s_lo[5] = vaddq_s32(vmulq_n_s32(t_lo[4], cospi_24_64), + vmulq_n_s32(t_lo[5], -cospi_8_64)); + s_hi[5] = vaddq_s32(vmulq_n_s32(t_hi[4], cospi_24_64), + vmulq_n_s32(t_hi[5], -cospi_8_64)); + // s6 = -x6 * cospi_24_64 + x7 * cospi_8_64; + s_lo[6] = vaddq_s32(vmulq_n_s32(t_lo[6], -cospi_24_64), + vmulq_n_s32(t_lo[7], cospi_8_64)); + s_hi[6] = vaddq_s32(vmulq_n_s32(t_hi[6], -cospi_24_64), + vmulq_n_s32(t_hi[7], cospi_8_64)); + // s7 = x6 * cospi_8_64 + x7 * cospi_24_64; + s_lo[7] = vaddq_s32(vmulq_n_s32(t_lo[6], cospi_8_64), + vmulq_n_s32(t_lo[7], cospi_24_64)); + s_hi[7] = vaddq_s32(vmulq_n_s32(t_hi[6], cospi_8_64), + vmulq_n_s32(t_hi[7], cospi_24_64)); + s_lo[8] = t_lo[8]; + s_hi[8] = t_hi[8]; + s_lo[9] = t_lo[9]; + s_hi[9] = t_hi[9]; + s_lo[10] = t_lo[10]; + s_hi[10] = t_hi[10]; + s_lo[11] = t_lo[11]; + s_hi[11] = t_hi[11]; + // s12 = x12 * cospi_8_64 + x13 * cospi_24_64; + s_lo[12] = vaddq_s32(vmulq_n_s32(t_lo[12], cospi_8_64), + vmulq_n_s32(t_lo[13], cospi_24_64)); + s_hi[12] = vaddq_s32(vmulq_n_s32(t_hi[12], cospi_8_64), + vmulq_n_s32(t_hi[13], cospi_24_64)); + // s13 = x12 * cospi_24_64 - x13 * cospi_8_64; + s_lo[13] = vaddq_s32(vmulq_n_s32(t_lo[12], cospi_24_64), + vmulq_n_s32(t_lo[13], -cospi_8_64)); + s_hi[13] = vaddq_s32(vmulq_n_s32(t_hi[12], cospi_24_64), + vmulq_n_s32(t_hi[13], -cospi_8_64)); + // s14 = -x14 * cospi_24_64 + x15 * cospi_8_64; + s_lo[14] = vaddq_s32(vmulq_n_s32(t_lo[14], -cospi_24_64), + vmulq_n_s32(t_lo[15], cospi_8_64)); + s_hi[14] = vaddq_s32(vmulq_n_s32(t_hi[14], -cospi_24_64), + vmulq_n_s32(t_hi[15], cospi_8_64)); + // s15 = x14 * cospi_8_64 + x15 * cospi_24_64; + s_lo[15] = vaddq_s32(vmulq_n_s32(t_lo[14], cospi_8_64), + vmulq_n_s32(t_lo[15], cospi_24_64)); + s_hi[15] = vaddq_s32(vmulq_n_s32(t_hi[14], cospi_8_64), + vmulq_n_s32(t_hi[15], cospi_24_64)); + + // s0 + s4 + t_lo[0] = vaddq_s32(s_lo[0], s_lo[2]); + t_hi[0] = vaddq_s32(s_hi[0], s_hi[2]); + // s1 + s3 + t_lo[1] = vaddq_s32(s_lo[1], s_lo[3]); + t_hi[1] = vaddq_s32(s_hi[1], s_hi[3]); + // s0 - s4 + t_lo[2] = vsubq_s32(s_lo[0], s_lo[2]); + t_hi[2] = vsubq_s32(s_hi[0], s_hi[2]); + // s1 - s3 + t_lo[3] = vsubq_s32(s_lo[1], s_lo[3]); + t_hi[3] = vsubq_s32(s_hi[1], s_hi[3]); + // s4 + s6 + t_lo[4] = vaddq_s32(s_lo[4], s_lo[6]); + t_hi[4] = vaddq_s32(s_hi[4], s_hi[6]); + // s5 + s7 + t_lo[5] = vaddq_s32(s_lo[5], s_lo[7]); + t_hi[5] = vaddq_s32(s_hi[5], s_hi[7]); + // s4 - s6 + t_lo[6] = vsubq_s32(s_lo[4], s_lo[6]); + t_hi[6] = vsubq_s32(s_hi[4], s_hi[6]); + // s5 - s7 + t_lo[7] = vsubq_s32(s_lo[5], s_lo[7]); + t_hi[7] = vsubq_s32(s_hi[5], s_hi[7]); + // s8 + s10 + t_lo[8] = vaddq_s32(s_lo[8], s_lo[10]); + t_hi[8] = vaddq_s32(s_hi[8], s_hi[10]); + // s9 + s11 + t_lo[9] = vaddq_s32(s_lo[9], s_lo[11]); + t_hi[9] = vaddq_s32(s_hi[9], s_hi[11]); + // s8 - s10 + t_lo[10] = vsubq_s32(s_lo[8], s_lo[10]); + t_hi[10] = vsubq_s32(s_hi[8], s_hi[10]); + // s9 - s11 + t_lo[11] = vsubq_s32(s_lo[9], s_lo[11]); + t_hi[11] = vsubq_s32(s_hi[9], s_hi[11]); + // s12 + s14 + t_lo[12] = vaddq_s32(s_lo[12], s_lo[14]); + t_hi[12] = vaddq_s32(s_hi[12], s_hi[14]); + // s13 + s15 + t_lo[13] = vaddq_s32(s_lo[13], s_lo[15]); + t_hi[13] = vaddq_s32(s_hi[13], s_hi[15]); + // s12 - s14 + t_lo[14] = vsubq_s32(s_lo[12], s_lo[14]); + t_hi[14] = vsubq_s32(s_hi[12], s_hi[14]); + // s13 - s15 + t_lo[15] = vsubq_s32(s_lo[13], s_lo[15]); + t_hi[15] = vsubq_s32(s_hi[13], s_hi[15]); + + t_lo[4] = vaddq_s32(t_lo[4], k__DCT_CONST_ROUNDING); + t_hi[4] = vaddq_s32(t_hi[4], k__DCT_CONST_ROUNDING); + t_lo[5] = vaddq_s32(t_lo[5], k__DCT_CONST_ROUNDING); + t_hi[5] = vaddq_s32(t_hi[5], k__DCT_CONST_ROUNDING); + t_lo[6] = vaddq_s32(t_lo[6], k__DCT_CONST_ROUNDING); + t_hi[6] = vaddq_s32(t_hi[6], k__DCT_CONST_ROUNDING); + t_lo[7] = vaddq_s32(t_lo[7], k__DCT_CONST_ROUNDING); + t_hi[7] = vaddq_s32(t_hi[7], k__DCT_CONST_ROUNDING); + t_lo[12] = vaddq_s32(t_lo[12], k__DCT_CONST_ROUNDING); + t_hi[12] = vaddq_s32(t_hi[12], k__DCT_CONST_ROUNDING); + t_lo[13] = vaddq_s32(t_lo[13], k__DCT_CONST_ROUNDING); + t_hi[13] = vaddq_s32(t_hi[13], k__DCT_CONST_ROUNDING); + t_lo[14] = vaddq_s32(t_lo[14], k__DCT_CONST_ROUNDING); + t_hi[14] = vaddq_s32(t_hi[14], k__DCT_CONST_ROUNDING); + t_lo[15] = vaddq_s32(t_lo[15], k__DCT_CONST_ROUNDING); + t_hi[15] = vaddq_s32(t_hi[15], k__DCT_CONST_ROUNDING); + t_lo[4] = vshrq_n_s32(t_lo[4], DCT_CONST_BITS); + t_hi[4] = vshrq_n_s32(t_hi[4], DCT_CONST_BITS); + t_lo[5] = vshrq_n_s32(t_lo[5], DCT_CONST_BITS); + t_hi[5] = vshrq_n_s32(t_hi[5], DCT_CONST_BITS); + t_lo[6] = vshrq_n_s32(t_lo[6], DCT_CONST_BITS); + t_hi[6] = vshrq_n_s32(t_hi[6], DCT_CONST_BITS); + t_lo[7] = vshrq_n_s32(t_lo[7], DCT_CONST_BITS); + t_hi[7] = vshrq_n_s32(t_hi[7], DCT_CONST_BITS); + t_lo[12] = vshrq_n_s32(t_lo[12], DCT_CONST_BITS); + t_hi[12] = vshrq_n_s32(t_hi[12], DCT_CONST_BITS); + t_lo[13] = vshrq_n_s32(t_lo[13], DCT_CONST_BITS); + t_hi[13] = vshrq_n_s32(t_hi[13], DCT_CONST_BITS); + t_lo[14] = vshrq_n_s32(t_lo[14], DCT_CONST_BITS); + t_hi[14] = vshrq_n_s32(t_hi[14], DCT_CONST_BITS); + t_lo[15] = vshrq_n_s32(t_lo[15], DCT_CONST_BITS); + t_hi[15] = vshrq_n_s32(t_hi[15], DCT_CONST_BITS); + + // stage 4 + // s2 = (-cospi_16_64) * (x2 + x3); + s_lo[2] = vmulq_n_s32(vaddq_s32(t_lo[2], t_lo[3]), -cospi_16_64); + s_hi[2] = vmulq_n_s32(vaddq_s32(t_hi[2], t_hi[3]), -cospi_16_64); + // s3 = cospi_16_64 * (x2 - x3); + s_lo[3] = vmulq_n_s32(vsubq_s32(t_lo[2], t_lo[3]), cospi_16_64); + s_hi[3] = vmulq_n_s32(vsubq_s32(t_hi[2], t_hi[3]), cospi_16_64); + // s6 = cospi_16_64 * (x6 + x7); + s_lo[6] = vmulq_n_s32(vaddq_s32(t_lo[6], t_lo[7]), cospi_16_64); + s_hi[6] = vmulq_n_s32(vaddq_s32(t_hi[6], t_hi[7]), cospi_16_64); + // s7 = cospi_16_64 * (-x6 + x7); + s_lo[7] = vmulq_n_s32(vsubq_s32(t_lo[7], t_lo[6]), cospi_16_64); + s_hi[7] = vmulq_n_s32(vsubq_s32(t_hi[7], t_hi[6]), cospi_16_64); + // s10 = cospi_16_64 * (x10 + x11); + s_lo[10] = vmulq_n_s32(vaddq_s32(t_lo[10], t_lo[11]), cospi_16_64); + s_hi[10] = vmulq_n_s32(vaddq_s32(t_hi[10], t_hi[11]), cospi_16_64); + // s11 = cospi_16_64 * (-x10 + x11); + s_lo[11] = vmulq_n_s32(vsubq_s32(t_lo[11], t_lo[10]), cospi_16_64); + s_hi[11] = vmulq_n_s32(vsubq_s32(t_hi[11], t_hi[10]), cospi_16_64); + // s14 = (-cospi_16_64) * (x14 + x15); + s_lo[14] = vmulq_n_s32(vaddq_s32(t_lo[14], t_lo[15]), -cospi_16_64); + s_hi[14] = vmulq_n_s32(vaddq_s32(t_hi[14], t_hi[15]), -cospi_16_64); + // s15 = cospi_16_64 * (x14 - x15); + s_lo[15] = vmulq_n_s32(vsubq_s32(t_lo[14], t_lo[15]), cospi_16_64); + s_hi[15] = vmulq_n_s32(vsubq_s32(t_hi[14], t_hi[15]), cospi_16_64); + + // final fdct_round_shift + t_lo[2] = vaddq_s32(s_lo[2], k__DCT_CONST_ROUNDING); + t_hi[2] = vaddq_s32(s_hi[2], k__DCT_CONST_ROUNDING); + t_lo[3] = vaddq_s32(s_lo[3], k__DCT_CONST_ROUNDING); + t_hi[3] = vaddq_s32(s_hi[3], k__DCT_CONST_ROUNDING); + t_lo[6] = vaddq_s32(s_lo[6], k__DCT_CONST_ROUNDING); + t_hi[6] = vaddq_s32(s_hi[6], k__DCT_CONST_ROUNDING); + t_lo[7] = vaddq_s32(s_lo[7], k__DCT_CONST_ROUNDING); + t_hi[7] = vaddq_s32(s_hi[7], k__DCT_CONST_ROUNDING); + t_lo[10] = vaddq_s32(s_lo[10], k__DCT_CONST_ROUNDING); + t_hi[10] = vaddq_s32(s_hi[10], k__DCT_CONST_ROUNDING); + t_lo[11] = vaddq_s32(s_lo[11], k__DCT_CONST_ROUNDING); + t_hi[11] = vaddq_s32(s_hi[11], k__DCT_CONST_ROUNDING); + t_lo[14] = vaddq_s32(s_lo[14], k__DCT_CONST_ROUNDING); + t_hi[14] = vaddq_s32(s_hi[14], k__DCT_CONST_ROUNDING); + t_lo[15] = vaddq_s32(s_lo[15], k__DCT_CONST_ROUNDING); + t_hi[15] = vaddq_s32(s_hi[15], k__DCT_CONST_ROUNDING); + + x_lo[2] = vshrn_n_s32(t_lo[2], DCT_CONST_BITS); + x_hi[2] = vshrn_n_s32(t_hi[2], DCT_CONST_BITS); + x_lo[3] = vshrn_n_s32(t_lo[3], DCT_CONST_BITS); + x_hi[3] = vshrn_n_s32(t_hi[3], DCT_CONST_BITS); + x_lo[6] = vshrn_n_s32(t_lo[6], DCT_CONST_BITS); + x_hi[6] = vshrn_n_s32(t_hi[6], DCT_CONST_BITS); + x_lo[7] = vshrn_n_s32(t_lo[7], DCT_CONST_BITS); + x_hi[7] = vshrn_n_s32(t_hi[7], DCT_CONST_BITS); + x_lo[10] = vshrn_n_s32(t_lo[10], DCT_CONST_BITS); + x_hi[10] = vshrn_n_s32(t_hi[10], DCT_CONST_BITS); + x_lo[11] = vshrn_n_s32(t_lo[11], DCT_CONST_BITS); + x_hi[11] = vshrn_n_s32(t_hi[11], DCT_CONST_BITS); + x_lo[14] = vshrn_n_s32(t_lo[14], DCT_CONST_BITS); + x_hi[14] = vshrn_n_s32(t_hi[14], DCT_CONST_BITS); + x_lo[15] = vshrn_n_s32(t_lo[15], DCT_CONST_BITS); + x_hi[15] = vshrn_n_s32(t_hi[15], DCT_CONST_BITS); + + // x0, x1, x4, x5, x8, x9, x12, x13 narrow down to 16-bits directly + x_lo[0] = vmovn_s32(t_lo[0]); + x_hi[0] = vmovn_s32(t_hi[0]); + x_lo[1] = vmovn_s32(t_lo[1]); + x_hi[1] = vmovn_s32(t_hi[1]); + x_lo[4] = vmovn_s32(t_lo[4]); + x_hi[4] = vmovn_s32(t_hi[4]); + x_lo[5] = vmovn_s32(t_lo[5]); + x_hi[5] = vmovn_s32(t_hi[5]); + x_lo[8] = vmovn_s32(t_lo[8]); + x_hi[8] = vmovn_s32(t_hi[8]); + x_lo[9] = vmovn_s32(t_lo[9]); + x_hi[9] = vmovn_s32(t_hi[9]); + x_lo[12] = vmovn_s32(t_lo[12]); + x_hi[12] = vmovn_s32(t_hi[12]); + x_lo[13] = vmovn_s32(t_lo[13]); + x_hi[13] = vmovn_s32(t_hi[13]); + + in[0] = vcombine_s16(x_lo[0], x_hi[0]); + in[1] = vnegq_s16(vcombine_s16(x_lo[8], x_hi[8])); + in[2] = vcombine_s16(x_lo[12], x_hi[12]); + in[3] = vnegq_s16(vcombine_s16(x_lo[4], x_hi[4])); + in[4] = vcombine_s16(x_lo[6], x_hi[6]); + in[5] = vcombine_s16(x_lo[14], x_hi[14]); + in[6] = vcombine_s16(x_lo[10], x_hi[10]); + in[7] = vcombine_s16(x_lo[2], x_hi[2]); + in[8] = vcombine_s16(x_lo[3], x_hi[3]); + in[9] = vcombine_s16(x_lo[11], x_hi[11]); + in[10] = vcombine_s16(x_lo[15], x_hi[15]); + in[11] = vcombine_s16(x_lo[7], x_hi[7]); + in[12] = vcombine_s16(x_lo[5], x_hi[5]); + in[13] = vnegq_s16(vcombine_s16(x_lo[13], x_hi[13])); + in[14] = vcombine_s16(x_lo[9], x_hi[9]); + in[15] = vnegq_s16(vcombine_s16(x_lo[1], x_hi[1])); +} + +static void fdct16x16_neon(int16x8_t *in0, int16x8_t *in1) { + // Left half. + fdct16_8col(in0); + // Right half. + fdct16_8col(in1); + transpose_s16_16x16(in0, in1); +} + +static void fadst16x16_neon(int16x8_t *in0, int16x8_t *in1) { + fadst16_8col(in0); + fadst16_8col(in1); + transpose_s16_16x16(in0, in1); +} + +void vp9_fht16x16_neon(const int16_t *input, tran_low_t *output, int stride, + int tx_type) { + int16x8_t in0[16], in1[16]; + + switch (tx_type) { + case DCT_DCT: vpx_fdct16x16_neon(input, output, stride); break; + case ADST_DCT: + load_buffer_16x16(input, in0, in1, stride); + fadst16x16_neon(in0, in1); + right_shift_16x16(in0, in1); + fdct16x16_neon(in0, in1); + write_buffer_16x16(output, in0, in1, 16); + break; + case DCT_ADST: + load_buffer_16x16(input, in0, in1, stride); + fdct16x16_neon(in0, in1); + right_shift_16x16(in0, in1); + fadst16x16_neon(in0, in1); + write_buffer_16x16(output, in0, in1, 16); + break; + default: + assert(tx_type == ADST_ADST); + load_buffer_16x16(input, in0, in1, stride); + fadst16x16_neon(in0, in1); + right_shift_16x16(in0, in1); + fadst16x16_neon(in0, in1); + write_buffer_16x16(output, in0, in1, 16); + break; + } +} diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk index 38e99165af..92a7fddb9d 100644 --- a/vp9/vp9cx.mk +++ b/vp9/vp9cx.mk @@ -127,6 +127,7 @@ endif VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_intrin_sse2.c VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_frame_scale_ssse3.c +VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_dct_neon.c ifeq ($(CONFIG_VP9_TEMPORAL_DENOISING),yes) VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_denoiser_sse2.c From 2c32425851cb89a1623ac7f3cf3d7bbba7aa32c6 Mon Sep 17 00:00:00 2001 From: Cheng Chen Date: Wed, 30 Mar 2022 15:47:17 -0700 Subject: [PATCH 0307/1000] L2E: Make SimpleEncode take vp9 level as an input Level conformance is standadized in vp9. If a specific target level is set, the vp9 encoder is required to produce conformant bitstream with limit on frame size, rate, min alt-ref distance, etc. This change makes the SimpleEncode environment take the target level as an input. To make existing tests pass, we set the level to 0. Change-Id: Ia35224f75c2fe50338b5b86a50c84355f5daf6fd --- test/simple_encode_test.cc | 37 +++++++++++++++++++------------------ vp9/simple_encode.cc | 26 ++++++++++++++------------ vp9/simple_encode.h | 24 +++++++++++++++++++++++- vp9/vp9_cx_iface.c | 6 +++++- vp9/vp9_cx_iface.h | 1 + 5 files changed, 62 insertions(+), 32 deletions(-) diff --git a/test/simple_encode_test.cc b/test/simple_encode_test.cc index 03e28e3387..01fc258566 100644 --- a/test/simple_encode_test.cc +++ b/test/simple_encode_test.cc @@ -37,13 +37,14 @@ class SimpleEncodeTest : public ::testing::Test { const int frame_rate_den_ = 1; const int target_bitrate_ = 1000; const int num_frames_ = 17; + const int target_level_ = LEVEL_UNKNOWN; const std::string in_file_path_str_ = libvpx_test::GetDataPath() + "/bus_352x288_420_f20_b8.yuv"; }; TEST_F(SimpleEncodeTest, ComputeFirstPassStats) { SimpleEncode simple_encode(width_, height_, frame_rate_num_, frame_rate_den_, - target_bitrate_, num_frames_, + target_bitrate_, num_frames_, target_level_, in_file_path_str_.c_str()); simple_encode.ComputeFirstPassStats(); std::vector> frame_stats = @@ -64,7 +65,7 @@ TEST_F(SimpleEncodeTest, ComputeFirstPassStats) { TEST_F(SimpleEncodeTest, ObserveFirstPassMotionVectors) { SimpleEncode simple_encode(width_, height_, frame_rate_num_, frame_rate_den_, - target_bitrate_, num_frames_, + target_bitrate_, num_frames_, target_level_, in_file_path_str_.c_str()); simple_encode.ComputeFirstPassStats(); std::vector> fps_motion_vectors = @@ -86,7 +87,7 @@ TEST_F(SimpleEncodeTest, ObserveFirstPassMotionVectors) { TEST_F(SimpleEncodeTest, GetCodingFrameNum) { SimpleEncode simple_encode(width_, height_, frame_rate_num_, frame_rate_den_, - target_bitrate_, num_frames_, + target_bitrate_, num_frames_, target_level_, in_file_path_str_.c_str()); simple_encode.ComputeFirstPassStats(); const int num_coding_frames = simple_encode.GetCodingFrameNum(); @@ -95,7 +96,7 @@ TEST_F(SimpleEncodeTest, GetCodingFrameNum) { TEST_F(SimpleEncodeTest, EncodeFrame) { SimpleEncode simple_encode(width_, height_, frame_rate_num_, frame_rate_den_, - target_bitrate_, num_frames_, + target_bitrate_, num_frames_, target_level_, in_file_path_str_.c_str()); simple_encode.ComputeFirstPassStats(); int num_coding_frames = simple_encode.GetCodingFrameNum(); @@ -138,7 +139,7 @@ TEST_F(SimpleEncodeTest, EncodeFrame) { TEST_F(SimpleEncodeTest, ObserveKeyFrameMap) { SimpleEncode simple_encode(width_, height_, frame_rate_num_, frame_rate_den_, - target_bitrate_, num_frames_, + target_bitrate_, num_frames_, target_level_, in_file_path_str_.c_str()); simple_encode.ComputeFirstPassStats(); std::vector key_frame_map = simple_encode.ObserveKeyFrameMap(); @@ -167,7 +168,7 @@ TEST_F(SimpleEncodeTest, ObserveKeyFrameMap) { TEST_F(SimpleEncodeTest, EncodeFrameWithTargetFrameBits) { SimpleEncode simple_encode(width_, height_, frame_rate_num_, frame_rate_den_, - target_bitrate_, num_frames_, + target_bitrate_, num_frames_, target_level_, in_file_path_str_.c_str()); simple_encode.ComputeFirstPassStats(); const int num_coding_frames = simple_encode.GetCodingFrameNum(); @@ -205,7 +206,7 @@ TEST_F(SimpleEncodeTest, EncodeFrameWithTargetFrameBits) { TEST_F(SimpleEncodeTest, EncodeFrameWithQuantizeIndex) { SimpleEncode simple_encode(width_, height_, frame_rate_num_, frame_rate_den_, - target_bitrate_, num_frames_, + target_bitrate_, num_frames_, target_level_, in_file_path_str_.c_str()); simple_encode.ComputeFirstPassStats(); const int num_coding_frames = simple_encode.GetCodingFrameNum(); @@ -237,7 +238,7 @@ TEST_F(SimpleEncodeTest, EncodeConsistencyTest) { // The first encode. SimpleEncode simple_encode(width_, height_, frame_rate_num_, frame_rate_den_, target_bitrate_, num_frames_, - in_file_path_str_.c_str()); + target_level_, in_file_path_str_.c_str()); simple_encode.ComputeFirstPassStats(); const int num_coding_frames = simple_encode.GetCodingFrameNum(); simple_encode.StartEncode(); @@ -257,7 +258,7 @@ TEST_F(SimpleEncodeTest, EncodeConsistencyTest) { // The second encode with quantize index got from the first encode. SimpleEncode simple_encode(width_, height_, frame_rate_num_, frame_rate_den_, target_bitrate_, num_frames_, - in_file_path_str_.c_str()); + target_level_, in_file_path_str_.c_str()); simple_encode.ComputeFirstPassStats(); const int num_coding_frames = simple_encode.GetCodingFrameNum(); EXPECT_EQ(static_cast(num_coding_frames), @@ -286,7 +287,7 @@ TEST_F(SimpleEncodeTest, EncodeConsistencyTest2) { const int num_units_4x4 = num_rows_4x4 * num_cols_4x4; // The first encode. SimpleEncode simple_encode(width_, height_, frame_rate_num_, frame_rate_den_, - target_bitrate_, num_frames_, + target_bitrate_, num_frames_, target_level_, in_file_path_str_.c_str()); simple_encode.ComputeFirstPassStats(); const int num_coding_frames = simple_encode.GetCodingFrameNum(); @@ -309,7 +310,7 @@ TEST_F(SimpleEncodeTest, EncodeConsistencyTest2) { // The second encode. SimpleEncode simple_encode_2(width_, height_, frame_rate_num_, frame_rate_den_, target_bitrate_, num_frames_, - in_file_path_str_.c_str()); + target_level_, in_file_path_str_.c_str()); simple_encode_2.ComputeFirstPassStats(); const int num_coding_frames_2 = simple_encode_2.GetCodingFrameNum(); simple_encode_2.StartEncode(); @@ -357,7 +358,7 @@ TEST_F(SimpleEncodeTest, EncodeConsistencyTest3) { const int num_units_4x4 = num_rows_4x4 * num_cols_4x4; // The first encode. SimpleEncode simple_encode(width_, height_, frame_rate_num_, frame_rate_den_, - target_bitrate_, num_frames_, + target_bitrate_, num_frames_, target_level_, in_file_path_str_.c_str()); simple_encode.ComputeFirstPassStats(); const int num_coding_frames = simple_encode.GetCodingFrameNum(); @@ -377,7 +378,7 @@ TEST_F(SimpleEncodeTest, EncodeConsistencyTest3) { // The second encode. SimpleEncode simple_encode_2(width_, height_, frame_rate_num_, frame_rate_den_, target_bitrate_, num_frames_, - in_file_path_str_.c_str()); + target_level_, in_file_path_str_.c_str()); simple_encode_2.ComputeFirstPassStats(); const int num_coding_frames_2 = simple_encode_2.GetCodingFrameNum(); simple_encode_2.StartEncode(); @@ -417,7 +418,7 @@ TEST_F(SimpleEncodeTest, EncodeConsistencySetExternalGroupOfPicturesMap) { // The first encode. SimpleEncode simple_encode(width_, height_, frame_rate_num_, frame_rate_den_, target_bitrate_, num_frames_, - in_file_path_str_.c_str()); + target_level_, in_file_path_str_.c_str()); simple_encode.ComputeFirstPassStats(); simple_encode.StartEncode(); @@ -449,7 +450,7 @@ TEST_F(SimpleEncodeTest, EncodeConsistencySetExternalGroupOfPicturesMap) { // The external arfs are the same as the first encode. SimpleEncode simple_encode(width_, height_, frame_rate_num_, frame_rate_den_, target_bitrate_, num_frames_, - in_file_path_str_.c_str()); + target_level_, in_file_path_str_.c_str()); simple_encode.ComputeFirstPassStats(); simple_encode.SetExternalGroupOfPicturesMap(gop_map.data(), gop_map.size()); const int num_coding_frames = simple_encode.GetCodingFrameNum(); @@ -471,7 +472,7 @@ TEST_F(SimpleEncodeTest, EncodeConsistencySetExternalGroupOfPicturesMap) { TEST_F(SimpleEncodeTest, SetExternalGroupOfPicturesMap) { SimpleEncode simple_encode(width_, height_, frame_rate_num_, frame_rate_den_, - target_bitrate_, num_frames_, + target_bitrate_, num_frames_, target_level_, in_file_path_str_.c_str()); simple_encode.ComputeFirstPassStats(); @@ -541,7 +542,7 @@ TEST_F(SimpleEncodeTest, GetEncodeFrameInfo) { // Makes sure that the encode_frame_info obtained from GetEncodeFrameInfo() // matches the counterpart in encode_frame_result obtained from EncodeFrame() SimpleEncode simple_encode(width_, height_, frame_rate_num_, frame_rate_den_, - target_bitrate_, num_frames_, + target_bitrate_, num_frames_, target_level_, in_file_path_str_.c_str()); simple_encode.ComputeFirstPassStats(); const int num_coding_frames = simple_encode.GetCodingFrameNum(); @@ -558,7 +559,7 @@ TEST_F(SimpleEncodeTest, GetEncodeFrameInfo) { TEST_F(SimpleEncodeTest, GetFramePixelCount) { SimpleEncode simple_encode(width_, height_, frame_rate_num_, frame_rate_den_, - target_bitrate_, num_frames_, + target_bitrate_, num_frames_, target_level_, in_file_path_str_.c_str()); EXPECT_EQ(simple_encode.GetFramePixelCount(), static_cast(width_ * height_ * 3 / 2)); diff --git a/vp9/simple_encode.cc b/vp9/simple_encode.cc index 6ba37a321c..1a0ada119f 100644 --- a/vp9/simple_encode.cc +++ b/vp9/simple_encode.cc @@ -782,11 +782,12 @@ static void UpdateEncodeConfig(const EncodeConfig &config, static VP9EncoderConfig GetEncodeConfig( int frame_width, int frame_height, vpx_rational_t frame_rate, - int target_bitrate, int encode_speed, vpx_enc_pass enc_pass, + int target_bitrate, int encode_speed, int target_level, + vpx_enc_pass enc_pass, const std::vector &encode_config_list) { - VP9EncoderConfig oxcf = - vp9_get_encoder_config(frame_width, frame_height, frame_rate, - target_bitrate, encode_speed, enc_pass); + VP9EncoderConfig oxcf = vp9_get_encoder_config( + frame_width, frame_height, frame_rate, target_bitrate, encode_speed, + target_level, enc_pass); for (const auto &config : encode_config_list) { UpdateEncodeConfig(config, &oxcf); } @@ -799,7 +800,7 @@ static VP9EncoderConfig GetEncodeConfig( SimpleEncode::SimpleEncode(int frame_width, int frame_height, int frame_rate_num, int frame_rate_den, - int target_bitrate, int num_frames, + int target_bitrate, int num_frames, int target_level, const char *infile_path, const char *outfile_path) { impl_ptr_ = std::unique_ptr(new EncodeImpl()); frame_width_ = frame_width; @@ -809,6 +810,7 @@ SimpleEncode::SimpleEncode(int frame_width, int frame_height, target_bitrate_ = target_bitrate; num_frames_ = num_frames; encode_speed_ = 0; + target_level_ = target_level; frame_coding_index_ = 0; show_frame_count_ = 0; @@ -860,9 +862,9 @@ StatusCode SimpleEncode::DumpEncodeConfigs(int pass, FILE *fp) { } const vpx_rational_t frame_rate = make_vpx_rational(frame_rate_num_, frame_rate_den_); - const VP9EncoderConfig oxcf = - GetEncodeConfig(frame_width_, frame_height_, frame_rate, target_bitrate_, - encode_speed_, enc_pass, impl_ptr_->encode_config_list); + const VP9EncoderConfig oxcf = GetEncodeConfig( + frame_width_, frame_height_, frame_rate, target_bitrate_, encode_speed_, + target_level_, enc_pass, impl_ptr_->encode_config_list); vp9_dump_encoder_config(&oxcf, fp); return StatusOk; } @@ -872,7 +874,7 @@ void SimpleEncode::ComputeFirstPassStats() { make_vpx_rational(frame_rate_num_, frame_rate_den_); const VP9EncoderConfig oxcf = GetEncodeConfig( frame_width_, frame_height_, frame_rate, target_bitrate_, encode_speed_, - VPX_RC_FIRST_PASS, impl_ptr_->encode_config_list); + target_level_, VPX_RC_FIRST_PASS, impl_ptr_->encode_config_list); impl_ptr_->cpi = init_encoder(&oxcf, impl_ptr_->img_fmt); struct lookahead_ctx *lookahead = impl_ptr_->cpi->lookahead; int i; @@ -1038,7 +1040,7 @@ void SimpleEncode::StartEncode() { make_vpx_rational(frame_rate_num_, frame_rate_den_); VP9EncoderConfig oxcf = GetEncodeConfig( frame_width_, frame_height_, frame_rate, target_bitrate_, encode_speed_, - VPX_RC_LAST_PASS, impl_ptr_->encode_config_list); + target_level_, VPX_RC_LAST_PASS, impl_ptr_->encode_config_list); vpx_fixed_buf_t stats; stats.buf = GetVectorData(impl_ptr_->first_pass_stats); @@ -1266,7 +1268,7 @@ int SimpleEncode::GetCodingFrameNum() const { make_vpx_rational(frame_rate_num_, frame_rate_den_); const VP9EncoderConfig oxcf = GetEncodeConfig( frame_width_, frame_height_, frame_rate, target_bitrate_, encode_speed_, - VPX_RC_LAST_PASS, impl_ptr_->encode_config_list); + target_level_, VPX_RC_LAST_PASS, impl_ptr_->encode_config_list); FRAME_INFO frame_info = vp9_get_frame_info(&oxcf); fps_init_first_pass_info(&twopass.first_pass_info, GetVectorData(impl_ptr_->first_pass_stats), @@ -1285,7 +1287,7 @@ std::vector SimpleEncode::ComputeKeyFrameMap() const { make_vpx_rational(frame_rate_num_, frame_rate_den_); const VP9EncoderConfig oxcf = GetEncodeConfig( frame_width_, frame_height_, frame_rate, target_bitrate_, encode_speed_, - VPX_RC_LAST_PASS, impl_ptr_->encode_config_list); + target_level_, VPX_RC_LAST_PASS, impl_ptr_->encode_config_list); TWO_PASS twopass; fps_init_first_pass_info(&twopass.first_pass_info, GetVectorData(impl_ptr_->first_pass_stats), diff --git a/vp9/simple_encode.h b/vp9/simple_encode.h index 8ec7069e83..7920e95ee9 100644 --- a/vp9/simple_encode.h +++ b/vp9/simple_encode.h @@ -44,6 +44,26 @@ enum RefFrameType { kRefFrameTypeNone = -1, }; +enum VP9_LEVEL { + LEVEL_UNKNOWN = 0, + LEVEL_AUTO = 1, + LEVEL_1 = 10, + LEVEL_1_1 = 11, + LEVEL_2 = 20, + LEVEL_2_1 = 21, + LEVEL_3 = 30, + LEVEL_3_1 = 31, + LEVEL_4 = 40, + LEVEL_4_1 = 41, + LEVEL_5 = 50, + LEVEL_5_1 = 51, + LEVEL_5_2 = 52, + LEVEL_6 = 60, + LEVEL_6_1 = 61, + LEVEL_6_2 = 62, + LEVEL_MAX = 255 +}; + enum GopMapFlag { kGopMapFlagStart = 1 << 0, // Indicate this location is the start of a group of pictures. @@ -343,7 +363,8 @@ class SimpleEncode { // format. SimpleEncode(int frame_width, int frame_height, int frame_rate_num, int frame_rate_den, int target_bitrate, int num_frames, - const char *infile_path, const char *outfile_path = nullptr); + int target_level, const char *infile_path, + const char *outfile_path = nullptr); ~SimpleEncode(); SimpleEncode(SimpleEncode &) = delete; SimpleEncode &operator=(const SimpleEncode &) = delete; @@ -513,6 +534,7 @@ class SimpleEncode { int target_bitrate_; int num_frames_; int encode_speed_; + int target_level_; std::FILE *in_file_; std::FILE *out_file_; diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index 76274437c6..b809ab3e6f 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -2143,6 +2143,7 @@ static vp9_extracfg get_extra_cfg() { VP9EncoderConfig vp9_get_encoder_config(int frame_width, int frame_height, vpx_rational_t frame_rate, int target_bitrate, int encode_speed, + int target_level, vpx_enc_pass enc_pass) { /* This function will generate the same VP9EncoderConfig used by the * vpxenc command given below. @@ -2154,6 +2155,7 @@ VP9EncoderConfig vp9_get_encoder_config(int frame_width, int frame_height, * FPS: frame_rate * BITRATE: target_bitrate * CPU_USED:encode_speed + * TARGET_LEVEL: target_level * * INPUT, OUTPUT, LIMIT will not affect VP9EncoderConfig * @@ -2166,6 +2168,7 @@ VP9EncoderConfig vp9_get_encoder_config(int frame_width, int frame_height, * FPS=30/1 * LIMIT=150 * CPU_USED=0 + * TARGET_LEVEL=0 * ./vpxenc --limit=$LIMIT --width=$WIDTH --height=$HEIGHT --fps=$FPS * --lag-in-frames=25 \ * --codec=vp9 --good --cpu-used=CPU_USED --threads=0 --profile=0 \ @@ -2174,7 +2177,7 @@ VP9EncoderConfig vp9_get_encoder_config(int frame_width, int frame_height, * --minsection-pct=0 --maxsection-pct=150 --arnr-maxframes=7 --psnr \ * --arnr-strength=5 --sharpness=0 --undershoot-pct=100 --overshoot-pct=100 \ * --frame-parallel=0 --tile-columns=0 --cpu-used=0 --end-usage=vbr \ - * --target-bitrate=$BITRATE -o $OUTPUT $INPUT + * --target-bitrate=$BITRATE --target-level=0 -o $OUTPUT $INPUT */ VP9EncoderConfig oxcf; @@ -2192,6 +2195,7 @@ VP9EncoderConfig vp9_get_encoder_config(int frame_width, int frame_height, oxcf.frame_parallel_decoding_mode = 0; oxcf.two_pass_vbrmax_section = 150; oxcf.speed = abs(encode_speed); + oxcf.target_level = target_level; return oxcf; } diff --git a/vp9/vp9_cx_iface.h b/vp9/vp9_cx_iface.h index 01338adb4e..f2de8507ff 100644 --- a/vp9/vp9_cx_iface.h +++ b/vp9/vp9_cx_iface.h @@ -20,6 +20,7 @@ extern "C" { VP9EncoderConfig vp9_get_encoder_config(int frame_width, int frame_height, vpx_rational_t frame_rate, int target_bitrate, int encode_speed, + int target_level, vpx_enc_pass enc_pass); void vp9_dump_encoder_config(const VP9EncoderConfig *oxcf, FILE *fp); From 2200039d33c49a9f7a5c438656df143755b022c4 Mon Sep 17 00:00:00 2001 From: Johann Date: Wed, 30 Mar 2022 14:57:46 +0900 Subject: [PATCH 0308/1000] quantize: replace highbd versions The optimized quantize functions were already built to handle highbd values. The only difference is the clamping. All highbd functions expand to 32bits when running in highbd mode. Removes vpx_highbd_quantize_32x32_sse2 as it is slower than the C version in the worst case. Bug: webm:1586 Change-Id: I49bf8a6a2041f78450bf43a4f655c67656b0f8d9 --- test/vp9_quantize_test.cc | 42 +++--- vp9/encoder/vp9_encodemb.c | 48 +++---- vp9/encoder/vp9_quantize.c | 8 -- vpx_dsp/quantize.h | 16 +++ vpx_dsp/vpx_dsp.mk | 3 - vpx_dsp/vpx_dsp_rtcd_defs.pl | 8 -- vpx_dsp/x86/highbd_quantize_intrin_sse2.c | 152 ---------------------- 7 files changed, 66 insertions(+), 211 deletions(-) delete mode 100644 vpx_dsp/x86/highbd_quantize_intrin_sse2.c diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc index d54f1bc9cd..5773cd9835 100644 --- a/test/vp9_quantize_test.cc +++ b/test/vp9_quantize_test.cc @@ -30,6 +30,7 @@ #include "vpx/vpx_integer.h" #include "vpx_ports/msvc.h" #include "vpx_ports/vpx_timer.h" +#include "vpx_dsp/quantize.h" using libvpx_test::ACMRandom; using libvpx_test::Buffer; @@ -464,22 +465,12 @@ using std::make_tuple; #if CONFIG_VP9_HIGHBITDEPTH INSTANTIATE_TEST_SUITE_P( SSE2, VP9QuantizeTest, - ::testing::Values( - make_tuple(&vpx_quantize_b_sse2, &vpx_quantize_b_c, VPX_BITS_8, 16, - false), - make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c, - VPX_BITS_8, 16, false), - make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c, - VPX_BITS_10, 16, false), - make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c, - VPX_BITS_12, 16, false), - make_tuple(&vpx_highbd_quantize_b_32x32_sse2, - &vpx_highbd_quantize_b_32x32_c, VPX_BITS_8, 32, false), - make_tuple(&vpx_highbd_quantize_b_32x32_sse2, - &vpx_highbd_quantize_b_32x32_c, VPX_BITS_10, 32, false), - make_tuple(&vpx_highbd_quantize_b_32x32_sse2, - &vpx_highbd_quantize_b_32x32_c, VPX_BITS_12, 32, false))); - + ::testing::Values(make_tuple(&vpx_quantize_b_sse2, &vpx_quantize_b_c, + VPX_BITS_8, 16, false), + make_tuple(&vpx_quantize_b_sse2, &vpx_highbd_quantize_b_c, + VPX_BITS_10, 16, false), + make_tuple(&vpx_quantize_b_sse2, &vpx_highbd_quantize_b_c, + VPX_BITS_12, 16, false))); #else INSTANTIATE_TEST_SUITE_P( SSE2, VP9QuantizeTest, @@ -519,6 +510,24 @@ INSTANTIATE_TEST_SUITE_P( #endif // HAVE_SSSE3 #if HAVE_AVX +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_SUITE_P( + AVX, VP9QuantizeTest, + ::testing::Values( + make_tuple(&vpx_quantize_b_avx, &vpx_quantize_b_c, VPX_BITS_8, 16, + false), + make_tuple(&vpx_quantize_b_avx, &vpx_highbd_quantize_b_c, VPX_BITS_10, + 16, false), + make_tuple(&vpx_quantize_b_avx, &vpx_highbd_quantize_b_c, VPX_BITS_12, + 16, false), + make_tuple(&vpx_quantize_b_32x32_avx, &vpx_highbd_quantize_b_32x32_c, + VPX_BITS_8, 32, false), + make_tuple(&vpx_quantize_b_32x32_avx, &vpx_highbd_quantize_b_32x32_c, + VPX_BITS_10, 32, false), + make_tuple(&vpx_quantize_b_32x32_avx, &vpx_highbd_quantize_b_32x32_c, + VPX_BITS_12, 32, false))); + +#else INSTANTIATE_TEST_SUITE_P(AVX, VP9QuantizeTest, ::testing::Values(make_tuple(&vpx_quantize_b_avx, &vpx_quantize_b_c, @@ -526,6 +535,7 @@ INSTANTIATE_TEST_SUITE_P(AVX, VP9QuantizeTest, make_tuple(&vpx_quantize_b_32x32_avx, &vpx_quantize_b_32x32_c, VPX_BITS_8, 32, false))); +#endif // CONFIG_VP9_HIGHBITDEPTH #endif // HAVE_AVX #if VPX_ARCH_X86_64 && HAVE_AVX2 diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c index fa222f9dcf..e708555f89 100644 --- a/vp9/encoder/vp9_encodemb.c +++ b/vp9/encoder/vp9_encodemb.c @@ -511,28 +511,28 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col, switch (tx_size) { case TX_32X32: highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); - vpx_highbd_quantize_b_32x32( - coeff, 1024, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, - dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); + vpx_quantize_b_32x32(coeff, 1024, p->zbin, p->round, p->quant, + p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, + scan_order->scan, scan_order->iscan); break; case TX_16X16: vpx_highbd_fdct16x16(src_diff, coeff, diff_stride); - vpx_highbd_quantize_b(coeff, 256, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, - scan_order->scan, scan_order->iscan); + vpx_quantize_b(coeff, 256, p->zbin, p->round, p->quant, p->quant_shift, + qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, + scan_order->iscan); break; case TX_8X8: vpx_highbd_fdct8x8(src_diff, coeff, diff_stride); - vpx_highbd_quantize_b(coeff, 64, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, - scan_order->scan, scan_order->iscan); + vpx_quantize_b(coeff, 64, p->zbin, p->round, p->quant, p->quant_shift, + qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, + scan_order->iscan); break; default: assert(tx_size == TX_4X4); x->fwd_txfm4x4(src_diff, coeff, diff_stride); - vpx_highbd_quantize_b(coeff, 16, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, - scan_order->scan, scan_order->iscan); + vpx_quantize_b(coeff, 16, p->zbin, p->round, p->quant, p->quant_shift, + qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, + scan_order->iscan); break; } return; @@ -857,9 +857,9 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, vpx_highbd_subtract_block(32, 32, src_diff, diff_stride, src, src_stride, dst, dst_stride, xd->bd); highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); - vpx_highbd_quantize_b_32x32( - coeff, 1024, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, - dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); + vpx_quantize_b_32x32(coeff, 1024, p->zbin, p->round, p->quant, + p->quant_shift, qcoeff, dqcoeff, pd->dequant, + eob, scan_order->scan, scan_order->iscan); } if (args->enable_coeff_opt && !x->skip_recode) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; @@ -876,9 +876,9 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, vpx_highbd_fdct16x16(src_diff, coeff, diff_stride); else vp9_highbd_fht16x16(src_diff, coeff, diff_stride, tx_type); - vpx_highbd_quantize_b(coeff, 256, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, - eob, scan_order->scan, scan_order->iscan); + vpx_quantize_b(coeff, 256, p->zbin, p->round, p->quant, + p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, + scan_order->scan, scan_order->iscan); } if (args->enable_coeff_opt && !x->skip_recode) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; @@ -896,9 +896,9 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, vpx_highbd_fdct8x8(src_diff, coeff, diff_stride); else vp9_highbd_fht8x8(src_diff, coeff, diff_stride, tx_type); - vpx_highbd_quantize_b(coeff, 64, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, - eob, scan_order->scan, scan_order->iscan); + vpx_quantize_b(coeff, 64, p->zbin, p->round, p->quant, p->quant_shift, + qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, + scan_order->iscan); } if (args->enable_coeff_opt && !x->skip_recode) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; @@ -917,9 +917,9 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, vp9_highbd_fht4x4(src_diff, coeff, diff_stride, tx_type); else x->fwd_txfm4x4(src_diff, coeff, diff_stride); - vpx_highbd_quantize_b(coeff, 16, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, - eob, scan_order->scan, scan_order->iscan); + vpx_quantize_b(coeff, 16, p->zbin, p->round, p->quant, p->quant_shift, + qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, + scan_order->iscan); } if (args->enable_coeff_opt && !x->skip_recode) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c index 9058997b0f..1c401e96b4 100644 --- a/vp9/encoder/vp9_quantize.c +++ b/vp9/encoder/vp9_quantize.c @@ -164,14 +164,6 @@ void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block, return; } -#if CONFIG_VP9_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - vpx_highbd_quantize_b(BLOCK_OFFSET(p->coeff, block), n_coeffs, p->zbin, - p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, - pd->dequant, &p->eobs[block], scan, iscan); - return; - } -#endif vpx_quantize_b(BLOCK_OFFSET(p->coeff, block), n_coeffs, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, pd->dequant, &p->eobs[block], scan, iscan); diff --git a/vpx_dsp/quantize.h b/vpx_dsp/quantize.h index 8e138445e2..0fcd77941b 100644 --- a/vpx_dsp/quantize.h +++ b/vpx_dsp/quantize.h @@ -37,6 +37,22 @@ void vpx_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t dequant, uint16_t *eob_ptr); + +// Only used for reference. The optimized versions can handle HBD. +void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan); + +void vpx_highbd_quantize_b_32x32_c( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan); #endif #ifdef __cplusplus diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index a880e1d285..b930fbd0a3 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -318,9 +318,6 @@ DSP_SRCS-$(HAVE_SSSE3) += x86/quantize_ssse3.h DSP_SRCS-$(HAVE_AVX) += x86/quantize_avx.c DSP_SRCS-$(HAVE_NEON) += arm/quantize_neon.c DSP_SRCS-$(HAVE_VSX) += ppc/quantize_vsx.c -ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) -DSP_SRCS-$(HAVE_SSE2) += x86/highbd_quantize_intrin_sse2.c -endif # avg DSP_SRCS-yes += avg.c diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 372903aff2..63097b0b6c 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -714,14 +714,6 @@ () add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vpx_quantize_b_32x32 neon ssse3 avx vsx/; - - if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { - add_proto qw/void vpx_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; - specialize qw/vpx_highbd_quantize_b sse2/; - - add_proto qw/void vpx_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; - specialize qw/vpx_highbd_quantize_b_32x32 sse2/; - } # CONFIG_VP9_HIGHBITDEPTH } # CONFIG_VP9_ENCODER if (vpx_config("CONFIG_ENCODERS") eq "yes") { diff --git a/vpx_dsp/x86/highbd_quantize_intrin_sse2.c b/vpx_dsp/x86/highbd_quantize_intrin_sse2.c deleted file mode 100644 index 4535a0f7a2..0000000000 --- a/vpx_dsp/x86/highbd_quantize_intrin_sse2.c +++ /dev/null @@ -1,152 +0,0 @@ -/* - * Copyright (c) 2015 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include -#include - -#include "./vpx_dsp_rtcd.h" -#include "vpx_dsp/vpx_dsp_common.h" -#include "vpx_mem/vpx_mem.h" -#include "vpx_ports/mem.h" - -#if CONFIG_VP9_HIGHBITDEPTH -void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count, - const int16_t *zbin_ptr, - const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { - int i, j, non_zero_regs = (int)count / 4, eob_i = -1; - __m128i zbins[2]; - __m128i nzbins[2]; - - zbins[0] = _mm_set_epi32((int)zbin_ptr[1], (int)zbin_ptr[1], (int)zbin_ptr[1], - (int)zbin_ptr[0]); - zbins[1] = _mm_set1_epi32((int)zbin_ptr[1]); - - nzbins[0] = _mm_setzero_si128(); - nzbins[1] = _mm_setzero_si128(); - nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]); - nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]); - - (void)scan; - - memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr)); - - // Pre-scan pass - for (i = ((int)count / 4) - 1; i >= 0; i--) { - __m128i coeffs, cmp1, cmp2; - int test; - coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); - cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]); - cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]); - cmp1 = _mm_and_si128(cmp1, cmp2); - test = _mm_movemask_epi8(cmp1); - if (test == 0xffff) - non_zero_regs--; - else - break; - } - - // Quantization pass: - for (i = 0; i < non_zero_regs; i++) { - __m128i coeffs, coeffs_sign, tmp1, tmp2; - int test; - int abs_coeff[4]; - int coeff_sign[4]; - - coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); - coeffs_sign = _mm_srai_epi32(coeffs, 31); - coeffs = _mm_sub_epi32(_mm_xor_si128(coeffs, coeffs_sign), coeffs_sign); - tmp1 = _mm_cmpgt_epi32(coeffs, zbins[i != 0]); - tmp2 = _mm_cmpeq_epi32(coeffs, zbins[i != 0]); - tmp1 = _mm_or_si128(tmp1, tmp2); - test = _mm_movemask_epi8(tmp1); - _mm_storeu_si128((__m128i *)abs_coeff, coeffs); - _mm_storeu_si128((__m128i *)coeff_sign, coeffs_sign); - - for (j = 0; j < 4; j++) { - if (test & (1 << (4 * j))) { - int k = 4 * i + j; - const int64_t tmp3 = abs_coeff[j] + round_ptr[k != 0]; - const int64_t tmp4 = ((tmp3 * quant_ptr[k != 0]) >> 16) + tmp3; - const uint32_t abs_qcoeff = - (uint32_t)((tmp4 * quant_shift_ptr[k != 0]) >> 16); - qcoeff_ptr[k] = (int)(abs_qcoeff ^ coeff_sign[j]) - coeff_sign[j]; - dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0]; - if (abs_qcoeff) eob_i = iscan[k] > eob_i ? iscan[k] : eob_i; - } - } - } - *eob_ptr = eob_i + 1; -} - -void vpx_highbd_quantize_b_32x32_sse2( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { - __m128i zbins[2]; - __m128i nzbins[2]; - int idx = 0; - int idx_arr[1024]; - int i, eob = -1; - const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 1); - const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 1); - (void)scan; - - zbins[0] = _mm_set_epi32(zbin1_tmp, zbin1_tmp, zbin1_tmp, zbin0_tmp); - zbins[1] = _mm_set1_epi32(zbin1_tmp); - - nzbins[0] = _mm_setzero_si128(); - nzbins[1] = _mm_setzero_si128(); - nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]); - nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]); - - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - - // Pre-scan pass - for (i = 0; i < n_coeffs / 4; i++) { - __m128i coeffs, cmp1, cmp2; - int test; - coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); - cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]); - cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]); - cmp1 = _mm_and_si128(cmp1, cmp2); - test = _mm_movemask_epi8(cmp1); - if (!(test & 0xf)) idx_arr[idx++] = i * 4; - if (!(test & 0xf0)) idx_arr[idx++] = i * 4 + 1; - if (!(test & 0xf00)) idx_arr[idx++] = i * 4 + 2; - if (!(test & 0xf000)) idx_arr[idx++] = i * 4 + 3; - } - - // Quantization pass: only process the coefficients selected in - // pre-scan pass. Note: idx can be zero. - for (i = 0; i < idx; i++) { - const int rc = idx_arr[i]; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - const int64_t tmp1 = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); - const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; - const uint32_t abs_qcoeff = - (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15); - qcoeff_ptr[rc] = (int)(abs_qcoeff ^ coeff_sign) - coeff_sign; - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; - if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob; - } - *eob_ptr = eob + 1; -} -#endif From 3c98caa6a4eea59da3c5b37b128e18f16b722080 Mon Sep 17 00:00:00 2001 From: Johann Date: Thu, 31 Mar 2022 10:43:29 +0900 Subject: [PATCH 0309/1000] subpel variance: add speed test Was used to verify assembly speed versus an attempt to rewrite in intrinsics. Change-Id: I011fe5494334b8fcda04b9d54c6093dbcfc55710 --- test/variance_test.cc | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/test/variance_test.cc b/test/variance_test.cc index 1b76b20419..660bbd0ed7 100644 --- a/test/variance_test.cc +++ b/test/variance_test.cc @@ -596,6 +596,7 @@ class SubpelVarianceTest protected: void RefTest(); void ExtremeRefTest(); + void SpeedTest(); ACMRandom rnd_; uint8_t *src_; @@ -681,6 +682,37 @@ void SubpelVarianceTest::ExtremeRefTest() { } } +template +void SubpelVarianceTest::SpeedTest() { + // The only interesting points are 0, 4, and anything else. To make the loops + // simple we will use 0, 2 and 4. + for (int x = 0; x <= 4; x += 2) { + for (int y = 0; y <= 4; y += 2) { + if (!use_high_bit_depth()) { + memset(src_, 25, block_size()); + memset(ref_, 50, block_size()); +#if CONFIG_VP9_HIGHBITDEPTH + } else { + vpx_memset16(CONVERT_TO_SHORTPTR(src_), 25, block_size()); + vpx_memset16(CONVERT_TO_SHORTPTR(ref_), 50, block_size()); +#endif // CONFIG_VP9_HIGHBITDEPTH + } + unsigned int sse; + vpx_usec_timer timer; + vpx_usec_timer_start(&timer); + for (int i = 0; i < 1000000000 / block_size(); ++i) { + const uint32_t variance = + params_.func(ref_, width() + 1, x, y, src_, width(), &sse); + (void)variance; + } + vpx_usec_timer_mark(&timer); + const int elapsed_time = static_cast(vpx_usec_timer_elapsed(&timer)); + printf("SubpelVariance %dx%d xoffset: %d yoffset: %d time: %5d ms\n", + width(), height(), x, y, elapsed_time / 1000); + } + } +} + template <> void SubpelVarianceTest::RefTest() { for (int x = 0; x < 8; ++x) { @@ -736,6 +768,7 @@ TEST_P(SumOfSquaresTest, Const) { ConstTest(); } TEST_P(SumOfSquaresTest, Ref) { RefTest(); } TEST_P(VpxSubpelVarianceTest, Ref) { RefTest(); } TEST_P(VpxSubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); } +TEST_P(VpxSubpelVarianceTest, DISABLED_Speed) { SpeedTest(); } TEST_P(VpxSubpelAvgVarianceTest, Ref) { RefTest(); } INSTANTIATE_TEST_SUITE_P(C, SumOfSquaresTest, From e6ede58a5a6cd34b82321c1b2c36ec14984e6ecd Mon Sep 17 00:00:00 2001 From: Johann Date: Mon, 4 Nov 2019 15:58:07 -0600 Subject: [PATCH 0310/1000] remove unused vp8_encode_intra parameter Follow it up and also remove it from other functions. BUG=webm:1612 Change-Id: I9d3cb785ab0d68c6fcae185043c896d8a135e284 --- vp8/encoder/encodeframe.c | 20 ++++++-------------- vp8/encoder/encodeintra.c | 3 +-- vp8/encoder/encodeintra.h | 2 +- vp8/encoder/firstpass.c | 2 +- 4 files changed, 9 insertions(+), 18 deletions(-) diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c index 4df35f6edb..620107500a 100644 --- a/vp8/encoder/encodeframe.c +++ b/vp8/encoder/encodeframe.c @@ -69,10 +69,9 @@ static const unsigned char VP8_VAR_OFFS[16] = { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; /* Original activity measure from Tim T's code. */ -static unsigned int tt_activity_measure(VP8_COMP *cpi, MACROBLOCK *x) { +static unsigned int tt_activity_measure(MACROBLOCK *x) { unsigned int act; unsigned int sse; - (void)cpi; /* TODO: This could also be done over smaller areas (8x8), but that would * require extensive changes elsewhere, as lambda is assumed to be fixed * over an entire MB in most of the code. @@ -90,28 +89,21 @@ static unsigned int tt_activity_measure(VP8_COMP *cpi, MACROBLOCK *x) { return act; } -/* Stub for alternative experimental activity measures. */ -static unsigned int alt_activity_measure(VP8_COMP *cpi, MACROBLOCK *x, - int use_dc_pred) { - return vp8_encode_intra(cpi, x, use_dc_pred); -} - /* Measure the activity of the current macroblock * What we measure here is TBD so abstracted to this function */ #define ALT_ACT_MEASURE 1 -static unsigned int mb_activity_measure(VP8_COMP *cpi, MACROBLOCK *x, - int mb_row, int mb_col) { +static unsigned int mb_activity_measure(MACROBLOCK *x, int mb_row, int mb_col) { unsigned int mb_activity; if (ALT_ACT_MEASURE) { int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row); - /* Or use and alternative. */ - mb_activity = alt_activity_measure(cpi, x, use_dc_pred); + /* Or use an alternative. */ + mb_activity = vp8_encode_intra(x, use_dc_pred); } else { /* Original activity measure from Tim T's code. */ - mb_activity = tt_activity_measure(cpi, x); + mb_activity = tt_activity_measure(x); } if (mb_activity < VP8_ACTIVITY_AVG_MIN) mb_activity = VP8_ACTIVITY_AVG_MIN; @@ -264,7 +256,7 @@ static void build_activity_map(VP8_COMP *cpi) { vp8_copy_mem16x16(x->src.y_buffer, x->src.y_stride, x->thismb, 16); /* measure activity */ - mb_activity = mb_activity_measure(cpi, x, mb_row, mb_col); + mb_activity = mb_activity_measure(x, mb_row, mb_col); /* Keep frame sum */ activity_sum += mb_activity; diff --git a/vp8/encoder/encodeintra.c b/vp8/encoder/encodeintra.c index f89e7cb1fa..7d448c0ea0 100644 --- a/vp8/encoder/encodeintra.c +++ b/vp8/encoder/encodeintra.c @@ -18,10 +18,9 @@ #include "vp8/common/invtrans.h" #include "encodeintra.h" -int vp8_encode_intra(VP8_COMP *cpi, MACROBLOCK *x, int use_dc_pred) { +int vp8_encode_intra(MACROBLOCK *x, int use_dc_pred) { int i; int intra_pred_var = 0; - (void)cpi; if (use_dc_pred) { x->e_mbd.mode_info_context->mbmi.mode = DC_PRED; diff --git a/vp8/encoder/encodeintra.h b/vp8/encoder/encodeintra.h index 021dc5ed76..9a378abf49 100644 --- a/vp8/encoder/encodeintra.h +++ b/vp8/encoder/encodeintra.h @@ -16,7 +16,7 @@ extern "C" { #endif -int vp8_encode_intra(VP8_COMP *cpi, MACROBLOCK *x, int use_dc_pred); +int vp8_encode_intra(MACROBLOCK *x, int use_dc_pred); void vp8_encode_intra16x16mby(MACROBLOCK *x); void vp8_encode_intra16x16mbuv(MACROBLOCK *x); void vp8_encode_intra4x4mby(MACROBLOCK *mb); diff --git a/vp8/encoder/firstpass.c b/vp8/encoder/firstpass.c index 981c0fde35..14164ebc51 100644 --- a/vp8/encoder/firstpass.c +++ b/vp8/encoder/firstpass.c @@ -567,7 +567,7 @@ void vp8_first_pass(VP8_COMP *cpi) { vp8_copy_mem16x16(x->src.y_buffer, x->src.y_stride, x->thismb, 16); /* do intra 16x16 prediction */ - this_error = vp8_encode_intra(cpi, x, use_dc_pred); + this_error = vp8_encode_intra(x, use_dc_pred); /* "intrapenalty" below deals with situations where the intra * and inter error scores are very low (eg a plain black frame) From 89cfe3835c47dabf77d38edb3af190155984fa9a Mon Sep 17 00:00:00 2001 From: Johann Date: Wed, 30 Mar 2022 15:33:40 +0900 Subject: [PATCH 0311/1000] quantize: remove highbd version The only difference between the code is the clamp. For 8 bit it is purely an optimization. The values outside this range will still saturate. Change-Id: I2a770b140690d99e151b00957789bd72f7a11e13 --- test/vp9_quantize_test.cc | 89 +++++++++++++++++++++---- vpx_dsp/quantize.c | 132 ++++++++------------------------------ vpx_dsp/quantize.h | 17 +---- 3 files changed, 104 insertions(+), 134 deletions(-) diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc index 5773cd9835..fcc7ff99ec 100644 --- a/test/vp9_quantize_test.cc +++ b/test/vp9_quantize_test.cc @@ -30,7 +30,6 @@ #include "vpx/vpx_integer.h" #include "vpx_ports/msvc.h" #include "vpx_ports/vpx_timer.h" -#include "vpx_dsp/quantize.h" using libvpx_test::ACMRandom; using libvpx_test::Buffer; @@ -467,10 +466,13 @@ INSTANTIATE_TEST_SUITE_P( SSE2, VP9QuantizeTest, ::testing::Values(make_tuple(&vpx_quantize_b_sse2, &vpx_quantize_b_c, VPX_BITS_8, 16, false), - make_tuple(&vpx_quantize_b_sse2, &vpx_highbd_quantize_b_c, + make_tuple(&vpx_quantize_b_sse2, &vpx_quantize_b_c, VPX_BITS_10, 16, false), - make_tuple(&vpx_quantize_b_sse2, &vpx_highbd_quantize_b_c, - VPX_BITS_12, 16, false))); + make_tuple(&vpx_quantize_b_sse2, &vpx_quantize_b_c, + VPX_BITS_12, 16, false), + make_tuple(&QuantFPWrapper, + &QuantFPWrapper, VPX_BITS_8, + 16, true))); #else INSTANTIATE_TEST_SUITE_P( SSE2, VP9QuantizeTest, @@ -484,6 +486,28 @@ INSTANTIATE_TEST_SUITE_P( #if HAVE_SSSE3 #if VPX_ARCH_X86_64 +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_SUITE_P( + SSSE3, VP9QuantizeTest, + ::testing::Values( + make_tuple(&vpx_quantize_b_ssse3, &vpx_quantize_b_c, VPX_BITS_8, 16, + false), + make_tuple(&vpx_quantize_b_ssse3, &vpx_quantize_b_c, VPX_BITS_10, 16, + false), + make_tuple(&vpx_quantize_b_ssse3, &vpx_quantize_b_c, VPX_BITS_12, 16, + false), + make_tuple(&vpx_quantize_b_32x32_ssse3, &vpx_quantize_b_32x32_c, + VPX_BITS_8, 32, false), + make_tuple(&vpx_quantize_b_32x32_ssse3, &vpx_quantize_b_32x32_c, + VPX_BITS_10, 32, false), + make_tuple(&vpx_quantize_b_32x32_ssse3, &vpx_quantize_b_32x32_c, + VPX_BITS_12, 32, false), + make_tuple(&QuantFPWrapper, + &QuantFPWrapper, VPX_BITS_8, 16, true), + make_tuple(&QuantFPWrapper, + &QuantFPWrapper, VPX_BITS_8, 32, + true))); +#else INSTANTIATE_TEST_SUITE_P( SSSE3, VP9QuantizeTest, ::testing::Values(make_tuple(&vpx_quantize_b_ssse3, &vpx_quantize_b_c, @@ -497,6 +521,24 @@ INSTANTIATE_TEST_SUITE_P( make_tuple(&QuantFPWrapper, &QuantFPWrapper, VPX_BITS_8, 32, true))); +#endif // #CONFIG_VP9_HIGHBITDEPTH +#else +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_SUITE_P( + SSSE3, VP9QuantizeTest, + ::testing::Values( + make_tuple(&vpx_quantize_b_ssse3, &vpx_quantize_b_c, VPX_BITS_8, 16, + false), + make_tuple(&vpx_quantize_b_ssse3, &vpx_quantize_b_c, VPX_BITS_10, 16, + false), + make_tuple(&vpx_quantize_b_ssse3, &vpx_quantize_b_c, VPX_BITS_12, 16, + false), + make_tuple(&vpx_quantize_b_32x32_ssse3, &vpx_quantize_b_32x32_c, + VPX_BITS_8, 32, false), + make_tuple(&vpx_quantize_b_32x32_ssse3, &vpx_quantize_b_32x32_c, + VPX_BITS_10, 32, false), + make_tuple(&vpx_quantize_b_32x32_ssse3, &vpx_quantize_b_32x32_c, + VPX_BITS_12, 32, false))); #else INSTANTIATE_TEST_SUITE_P( SSSE3, VP9QuantizeTest, @@ -505,7 +547,7 @@ INSTANTIATE_TEST_SUITE_P( make_tuple(&vpx_quantize_b_32x32_ssse3, &vpx_quantize_b_32x32_c, VPX_BITS_8, 32, false))); - +#endif // CONFIG_VP9_HIGHBITDEPTH #endif // VPX_ARCH_X86_64 #endif // HAVE_SSSE3 @@ -516,15 +558,15 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values( make_tuple(&vpx_quantize_b_avx, &vpx_quantize_b_c, VPX_BITS_8, 16, false), - make_tuple(&vpx_quantize_b_avx, &vpx_highbd_quantize_b_c, VPX_BITS_10, - 16, false), - make_tuple(&vpx_quantize_b_avx, &vpx_highbd_quantize_b_c, VPX_BITS_12, - 16, false), - make_tuple(&vpx_quantize_b_32x32_avx, &vpx_highbd_quantize_b_32x32_c, + make_tuple(&vpx_quantize_b_avx, &vpx_quantize_b_c, VPX_BITS_10, 16, + false), + make_tuple(&vpx_quantize_b_avx, &vpx_quantize_b_c, VPX_BITS_12, 16, + false), + make_tuple(&vpx_quantize_b_32x32_avx, &vpx_quantize_b_32x32_c, VPX_BITS_8, 32, false), - make_tuple(&vpx_quantize_b_32x32_avx, &vpx_highbd_quantize_b_32x32_c, + make_tuple(&vpx_quantize_b_32x32_avx, &vpx_quantize_b_32x32_c, VPX_BITS_10, 32, false), - make_tuple(&vpx_quantize_b_32x32_avx, &vpx_highbd_quantize_b_32x32_c, + make_tuple(&vpx_quantize_b_32x32_avx, &vpx_quantize_b_32x32_c, VPX_BITS_12, 32, false))); #else @@ -547,6 +589,28 @@ INSTANTIATE_TEST_SUITE_P( #endif // HAVE_AVX2 #if HAVE_NEON +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_SUITE_P( + NEON, VP9QuantizeTest, + ::testing::Values( + make_tuple(&vpx_quantize_b_neon, &vpx_quantize_b_c, VPX_BITS_8, 16, + false), + make_tuple(&vpx_quantize_b_neon, &vpx_quantize_b_c, VPX_BITS_10, 16, + false), + make_tuple(&vpx_quantize_b_neon, &vpx_quantize_b_c, VPX_BITS_12, 16, + false), + make_tuple(&vpx_quantize_b_32x32_neon, &vpx_quantize_b_32x32_c, + VPX_BITS_8, 32, false), + make_tuple(&vpx_quantize_b_32x32_neon, &vpx_quantize_b_32x32_c, + VPX_BITS_10, 32, false), + make_tuple(&vpx_quantize_b_32x32_neon, &vpx_quantize_b_32x32_c, + VPX_BITS_12, 32, false), + make_tuple(&QuantFPWrapper, + &QuantFPWrapper, VPX_BITS_8, 16, true), + make_tuple(&QuantFPWrapper, + &QuantFPWrapper, VPX_BITS_8, 32, + true))); +#else INSTANTIATE_TEST_SUITE_P( NEON, VP9QuantizeTest, ::testing::Values(make_tuple(&vpx_quantize_b_neon, &vpx_quantize_b_c, @@ -560,6 +624,7 @@ INSTANTIATE_TEST_SUITE_P( make_tuple(&QuantFPWrapper, &QuantFPWrapper, VPX_BITS_8, 32, true))); +#endif // CONFIG_VP9_HIGHBITDEPTH #endif // HAVE_NEON #if HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/quantize.c b/vpx_dsp/quantize.c index 5d6ba64a8a..c29b99bb0c 100644 --- a/vpx_dsp/quantize.c +++ b/vpx_dsp/quantize.c @@ -147,66 +147,25 @@ void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; if (abs_coeff >= zbins[rc != 0]) { - int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX); - tmp = ((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) * - quant_shift_ptr[rc != 0]) >> - 16; // quantization - qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; - dqcoeff_ptr[rc] = (tran_low_t)(qcoeff_ptr[rc] * dequant_ptr[rc != 0]); - - if (tmp) eob = i; - } - } - *eob_ptr = eob + 1; -} - #if CONFIG_VP9_HIGHBITDEPTH -void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { - int i, non_zero_count = (int)n_coeffs, eob = -1; - const int zbins[2] = { zbin_ptr[0], zbin_ptr[1] }; - const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; - (void)iscan; - - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - - // Pre-scan pass - for (i = (int)n_coeffs - 1; i >= 0; i--) { - const int rc = scan[i]; - const int coeff = coeff_ptr[rc]; - - if (coeff < zbins[rc != 0] && coeff > nzbins[rc != 0]) - non_zero_count--; - else - break; - } - - // Quantization pass: All coefficients with index >= zero_flag are - // skippable. Note: zero_flag can be zero. - for (i = 0; i < non_zero_count; i++) { - const int rc = scan[i]; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - - if (abs_coeff >= zbins[rc != 0]) { + // High bit depth configurations do not clamp to INT16. const int64_t tmp1 = abs_coeff + round_ptr[rc != 0]; const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; const int abs_qcoeff = (int)((tmp2 * quant_shift_ptr[rc != 0]) >> 16); +#else + const int tmp = + clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX); + const int abs_qcoeff = ((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) * + quant_shift_ptr[rc != 0]) >> + 16; // quantization +#endif // CONFIG_VP9_HIGHBITDEPTH qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0]; + dqcoeff_ptr[rc] = (tran_low_t)(qcoeff_ptr[rc] * dequant_ptr[rc != 0]); if (abs_qcoeff) eob = i; } } *eob_ptr = eob + 1; } -#endif void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, @@ -243,15 +202,23 @@ void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int rc = scan[idx_arr[i]]; const int coeff = coeff_ptr[rc]; const int coeff_sign = (coeff >> 31); - int tmp; - int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - abs_coeff += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); - abs_coeff = clamp(abs_coeff, INT16_MIN, INT16_MAX); - tmp = ((((abs_coeff * quant_ptr[rc != 0]) >> 16) + abs_coeff) * - quant_shift_ptr[rc != 0]) >> - 15; - - qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + int abs_qcoeff = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); +#if CONFIG_VP9_HIGHBITDEPTH + // High bit depth configurations do not clamp to INT16. + { + const int64_t tmp = + ((abs_qcoeff * quant_ptr[rc != 0]) >> 16) + abs_qcoeff; + abs_qcoeff = (int)((tmp * quant_shift_ptr[rc != 0]) >> 15); + } +#else + abs_qcoeff = clamp(abs_qcoeff, INT16_MIN, INT16_MAX); + abs_qcoeff = ((((abs_qcoeff * quant_ptr[rc != 0]) >> 16) + abs_qcoeff) * + quant_shift_ptr[rc != 0]) >> + 15; +#endif // CONFIG_VP9_HIGHBITDEPTH + + qcoeff_ptr[rc] = (abs_qcoeff ^ coeff_sign) - coeff_sign; #if (VPX_ARCH_X86 || VPX_ARCH_X86_64) && !CONFIG_VP9_HIGHBITDEPTH // When tran_low_t is only 16 bits dqcoeff can outrange it. Rather than // truncating with a cast, saturate the value. This is easier to implement @@ -262,54 +229,7 @@ void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; #endif // VPX_ARCH_X86 && CONFIG_VP9_HIGHBITDEPTH - if (tmp) eob = idx_arr[i]; - } - *eob_ptr = eob + 1; -} - -#if CONFIG_VP9_HIGHBITDEPTH -void vpx_highbd_quantize_b_32x32_c( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { - const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], 1), - ROUND_POWER_OF_TWO(zbin_ptr[1], 1) }; - const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; - - int idx = 0; - int idx_arr[1024]; - int i, eob = -1; - (void)iscan; - - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - - // Pre-scan pass - for (i = 0; i < n_coeffs; i++) { - const int rc = scan[i]; - const int coeff = coeff_ptr[rc]; - - // If the coefficient is out of the base ZBIN range, keep it for - // quantization. - if (coeff >= zbins[rc != 0] || coeff <= nzbins[rc != 0]) idx_arr[idx++] = i; - } - - // Quantization pass: only process the coefficients selected in - // pre-scan pass. Note: idx can be zero. - for (i = 0; i < idx; i++) { - const int rc = scan[idx_arr[i]]; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - const int64_t tmp1 = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); - const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; - const int abs_qcoeff = (int)((tmp2 * quant_shift_ptr[rc != 0]) >> 15); - qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; if (abs_qcoeff) eob = idx_arr[i]; } *eob_ptr = eob + 1; } -#endif diff --git a/vpx_dsp/quantize.h b/vpx_dsp/quantize.h index 0fcd77941b..9ac1a47418 100644 --- a/vpx_dsp/quantize.h +++ b/vpx_dsp/quantize.h @@ -38,22 +38,7 @@ void vpx_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t dequant, uint16_t *eob_ptr); -// Only used for reference. The optimized versions can handle HBD. -void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan); - -void vpx_highbd_quantize_b_32x32_c( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan); -#endif +#endif // CONFIG_VP9_HIGHBITDEPTH #ifdef __cplusplus } // extern "C" From 176acaf9f6efb3603e920eb35630a16f8a88ad5e Mon Sep 17 00:00:00 2001 From: yuanhecai Date: Tue, 22 Mar 2022 13:58:50 +0800 Subject: [PATCH 0312/1000] loongarch: Fix bugs Fix bugs from loopfilter_filters_lsx.c, vpx_convolve8_avg_lsx.c Bug: webm:1755 Change-Id: I7ee8e367d66a49f3be10d7e417837d3b6ef50bdb --- vp8/common/loongarch/loopfilter_filters_lsx.c | 36 +++-- vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c | 132 +++++++++--------- vpx_dsp/loongarch/vpx_convolve_lsx.h | 15 ++ 3 files changed, 98 insertions(+), 85 deletions(-) diff --git a/vp8/common/loongarch/loopfilter_filters_lsx.c b/vp8/common/loongarch/loopfilter_filters_lsx.c index c48f794840..a3ac76d258 100644 --- a/vp8/common/loongarch/loopfilter_filters_lsx.c +++ b/vp8/common/loongarch/loopfilter_filters_lsx.c @@ -32,9 +32,9 @@ filt = __lsx_vsadd_b(filt, q0_sub_p0); \ filt = __lsx_vand_v(filt, mask); \ t1 = __lsx_vsadd_b(filt, cnst4b); \ - t1 = __lsx_vsra_b(filt, cnst3b); \ + t1 = __lsx_vsra_b(t1, cnst3b); \ t2 = __lsx_vsadd_b(filt, cnst3b); \ - t2 = __lsx_vsra_b(filt, cnst3b); \ + t2 = __lsx_vsra_b(t2, cnst3b); \ q0_m = __lsx_vssub_b(q0_m, t1); \ q0 = __lsx_vxori_b(q0_m, 0x80); \ p0_m = __lsx_vsadd_b(p0_m, t2); \ @@ -158,7 +158,6 @@ static void loop_filter_horizontal_4_dual_lsx(uint8_t *src, int32_t pitch, const uint8_t *b_limit1_ptr, const uint8_t *limit1_ptr, const uint8_t *thresh1_ptr) { - uint8_t *temp_src; int32_t pitch_x2 = pitch << 1; int32_t pitch_x3 = pitch_x2 + pitch; int32_t pitch_x4 = pitch << 2; @@ -167,12 +166,11 @@ static void loop_filter_horizontal_4_dual_lsx(uint8_t *src, int32_t pitch, __m128i thresh0, b_limit0, limit0, thresh1, b_limit1, limit1; __m128i p3, p2, p1, p0, q3, q2, q1, q0; - temp_src = src - pitch_x4; - DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2, - temp_src, pitch_x3, p3, p2, p1, p0); - temp_src += pitch_x4; - DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2, - temp_src, pitch_x3, q0, q1, q2, q3); + DUP4_ARG2(__lsx_vldx, src, -pitch_x4, src, -pitch_x3, src, -pitch_x2, src, + -pitch, p3, p2, p1, p0); + q0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, pitch, src, pitch_x2, q1, q2); + q3 = __lsx_vldx(src, pitch_x3); thresh0 = __lsx_vreplgr2vr_b(*thresh0_ptr); thresh1 = __lsx_vreplgr2vr_b(*thresh1_ptr); @@ -336,15 +334,15 @@ static void loop_filter_horizontal_edge_uv_lsx(uint8_t *src_u, uint8_t *src_v, mask, flat); VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev); - __lsx_vstelm_d(p1, src_u, 0, 0); - __lsx_vstelm_d(p0, src_u + pitch, 0, 0); - __lsx_vstelm_d(q0, src_u + pitch_x2, 0, 0); - __lsx_vstelm_d(q1, src_u + pitch_x3, 0, 0); + __lsx_vstelm_d(q1, src_u + pitch, 0, 0); + __lsx_vstelm_d(q0, src_u, 0, 0); + __lsx_vstelm_d(p0, src_u - pitch, 0, 0); + __lsx_vstelm_d(p1, src_u - pitch_x2, 0, 0); - __lsx_vstelm_d(p1, src_v, 0, 1); - __lsx_vstelm_d(p0, src_v + pitch, 0, 1); - __lsx_vstelm_d(q0, src_v + pitch_x2, 0, 1); - __lsx_vstelm_d(q1, src_v + pitch_x3, 0, 1); + __lsx_vstelm_d(q1, src_v + pitch, 0, 1); + __lsx_vstelm_d(q0, src_v, 0, 1); + __lsx_vstelm_d(p0, src_v - pitch, 0, 1); + __lsx_vstelm_d(p1, src_v - pitch_x2, 0, 1); } static void loop_filter_vertical_edge_uv_lsx(uint8_t *src_u, uint8_t *src_v, @@ -396,8 +394,8 @@ static void loop_filter_vertical_edge_uv_lsx(uint8_t *src_u, uint8_t *src_v, tmp2 = __lsx_vilvl_h(tmp1, tmp0); tmp3 = __lsx_vilvh_h(tmp1, tmp0); - tmp0 = __lsx_vilvl_b(p0, q1); - tmp1 = __lsx_vilvl_b(q1, q0); + tmp0 = __lsx_vilvh_b(p0, p1); + tmp1 = __lsx_vilvh_b(q1, q0); tmp4 = __lsx_vilvl_h(tmp1, tmp0); tmp5 = __lsx_vilvh_h(tmp1, tmp0); diff --git a/vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c b/vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c index 27f5b5ca4f..2b983552b6 100644 --- a/vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c +++ b/vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c @@ -12,6 +12,15 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/loongarch/vpx_convolve_lsx.h" +static const uint8_t mc_filt_mask_arr[16 * 3] = { + /* 8 width cases */ + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, + /* 4 width cases */ + 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20, + /* 4 width cases */ + 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28 +}; + static void common_hv_8ht_8vt_and_aver_dst_4w_lsx( const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height) { @@ -90,7 +99,7 @@ static void common_hv_8ht_8vt_and_aver_dst_4w_lsx( src0 = __lsx_vpackev_b(src1, src0); out1 = FILT_8TAP_DPADD_S_H(tmp1, tmp2, tmp4, src0, filt_vt0, filt_vt1, filt_vt2, filt_vt3); - out0 = __lsx_vssrarni_b_h(out1, out0, 7); + out0 = __lsx_vssrarni_b_h(out1, out0, FILTER_BITS); out0 = __lsx_vxori_b(out0, 128); out0 = __lsx_vavgr_bu(out0, src2); __lsx_vstelm_w(out0, dst, 0, 0); @@ -192,7 +201,8 @@ static void common_hv_8ht_8vt_and_aver_dst_8w_lsx( src2 = __lsx_vpackev_b(src10, src9); src4 = FILT_8TAP_DPADD_S_H(tmp5, tmp6, src0, src2, filt_vt0, filt_vt1, filt_vt2, filt_vt3); - DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, src4, src3, 7, out0, out1); + DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, FILTER_BITS, src4, src3, + FILTER_BITS, out0, out1); DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); src5 = __lsx_vldrepl_d(dst_tmp, 0); dst_tmp += dst_stride; @@ -233,8 +243,6 @@ static void common_hv_8ht_8vt_and_aver_dst_16w_lsx( common_hv_8ht_8vt_and_aver_dst_8w_lsx(src, src_stride, dst, dst_stride, filter_horiz, filter_vert, height); - src += 8; - dst += 8; } static void common_hv_8ht_8vt_and_aver_dst_32w_lsx( @@ -315,7 +323,7 @@ static void common_hv_2ht_2vt_and_aver_dst_4x4_lsx( static void common_hv_2ht_2vt_and_aver_dst_4x8_lsx( const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter_horiz, int8_t *filter_vert) { - uint8_t *dst_tmp1; + uint8_t *dst_tmp = dst; __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, mask; __m128i filt_hz, filt_vt, vec0, vec1, vec2, vec3, res0, res1; __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; @@ -351,26 +359,25 @@ static void common_hv_2ht_2vt_and_aver_dst_4x8_lsx( hz_out5 = __lsx_vshuf_b(hz_out6, hz_out4, shuff); hz_out7 = __lsx_vpickod_d(hz_out8, hz_out6); - dst0 = __lsx_vldrepl_w(dst, 0); - dst += dst_stride; - dst1 = __lsx_vldrepl_w(dst, 0); - dst += dst_stride; - dst2 = __lsx_vldrepl_w(dst, 0); - dst += dst_stride; - dst3 = __lsx_vldrepl_w(dst, 0); - dst += dst_stride; + dst0 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst1 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst2 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; dst0 = __lsx_vilvl_w(dst1, dst0); dst1 = __lsx_vilvl_w(dst3, dst2); dst0 = __lsx_vilvl_d(dst1, dst0); - dst1 = __lsx_vldrepl_w(dst, 0); - dst += dst_stride; - dst2 = __lsx_vldrepl_w(dst, 0); - dst += dst_stride; - dst3 = __lsx_vldrepl_w(dst, 0); - dst += dst_stride; - dst4 = __lsx_vldrepl_w(dst, 0); - dst += dst_stride; + dst1 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst2 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst4 = __lsx_vldrepl_w(dst_tmp, 0); dst1 = __lsx_vilvl_w(dst2, dst1); dst2 = __lsx_vilvl_w(dst4, dst3); dst1 = __lsx_vilvl_d(dst2, dst1); @@ -384,23 +391,22 @@ static void common_hv_2ht_2vt_and_aver_dst_4x8_lsx( DUP2_ARG2(__lsx_vpickev_b, tmp1, tmp0, tmp3, tmp2, res0, res1); DUP2_ARG2(__lsx_vavgr_bu, res0, dst0, res1, dst1, res0, res1); - dst_tmp1 = dst; - __lsx_vstelm_w(res0, dst_tmp1, 0, 0); - dst_tmp1 += dst_stride; - __lsx_vstelm_w(res0, dst_tmp1, 0, 1); - dst_tmp1 += dst_stride; - __lsx_vstelm_w(res0, dst_tmp1, 0, 2); - dst_tmp1 += dst_stride; - __lsx_vstelm_w(res0, dst_tmp1, 0, 3); - dst_tmp1 += dst_stride; - - __lsx_vstelm_w(res1, dst_tmp1, 0, 0); - dst_tmp1 += dst_stride; - __lsx_vstelm_w(res1, dst_tmp1, 0, 1); - dst_tmp1 += dst_stride; - __lsx_vstelm_w(res1, dst_tmp1, 0, 2); - dst_tmp1 += dst_stride; - __lsx_vstelm_w(res1, dst_tmp1, 0, 3); + __lsx_vstelm_w(res0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(res0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(res0, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(res0, dst, 0, 3); + dst += dst_stride; + + __lsx_vstelm_w(res1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(res1, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(res1, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(res1, dst, 0, 3); } static void common_hv_2ht_2vt_and_aver_dst_4w_lsx( @@ -431,12 +437,11 @@ static void common_hv_2ht_2vt_and_aver_dst_8x4_lsx( mask = __lsx_vld(mc_filt_mask_arr, 0); /* rearranging filter */ filt_hz = __lsx_vldrepl_h(filter_horiz, 0); - filt_vt = __lsx_vldrepl_h(filtrt_ver, 0); + filt_vt = __lsx_vldrepl_h(filter_vert, 0); src0 = __lsx_vld(src, 0); DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, src, src_stride4, src1, src2, src3, src4); - src += (src_stride4 + src_stride); dst0 = __lsx_vldrepl_d(dst_tmp, 0); dst_tmp += dst_stride; @@ -445,7 +450,6 @@ static void common_hv_2ht_2vt_and_aver_dst_8x4_lsx( dst2 = __lsx_vldrepl_d(dst_tmp, 0); dst_tmp += dst_stride; dst3 = __lsx_vldrepl_d(dst_tmp, 0); - dst_tmp += dst_stride; DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1); hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); @@ -462,12 +466,11 @@ static void common_hv_2ht_2vt_and_aver_dst_8x4_lsx( hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); vec3 = __lsx_vpackev_b(hz_out0, hz_out1); - tmp3 = __lsx_vdp2_h_bu(vec1, filt_vt); + tmp3 = __lsx_vdp2_h_bu(vec3, filt_vt); DUP4_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp2, FILTER_BITS, tmp3, FILTER_BITS, tmp0, tmp1, tmp2, tmp3); PCKEV_AVG_ST4_D(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride); - dst -= dst_stride * 3; } static void common_hv_2ht_2vt_and_aver_dst_8x8mult_lsx( @@ -499,28 +502,28 @@ static void common_hv_2ht_2vt_and_aver_dst_8x8mult_lsx( for (; loop_cnt--;) { src1 = __lsx_vld(src, 0); DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3); - src4 = __lsx_vlds(src, src_stride3); + src4 = __lsx_vldx(src, src_stride3); src += src_stride4; hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); vec0 = __lsx_vpackev_b(hz_out1, hz_out0); - tmp0 = __lsx_vavgr_bu(vec0, filt_vt); + tmp0 = __lsx_vdp2_h_bu(vec0, filt_vt); hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); vec0 = __lsx_vpackev_b(hz_out0, hz_out1); - tmp1 = __lsx_vavgr_bu(vec0, filt_vt); + tmp1 = __lsx_vdp2_h_bu(vec0, filt_vt); DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1); hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); vec0 = __lsx_vpackev_b(hz_out1, hz_out0); - tmp2 = __lsx_vavgr_bu(vec0, filt_vt); + tmp2 = __lsx_vdp2_h_bu(vec0, filt_vt); hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); vec0 = __lsx_vpackev_b(hz_out0, hz_out1); - tmp3 = __lsx_vavgr_bu(vec0, filt_vt); + tmp3 = __lsx_vdp2_h_bu(vec0, filt_vt); - DUP2_ARG2(__lsx_vsrari_h, tmp2, FILTER_BITS, tmp3, FILTER_BITS, tmp0, tmp1); + DUP2_ARG2(__lsx_vsrari_h, tmp2, FILTER_BITS, tmp3, FILTER_BITS, tmp2, tmp3); dst0 = __lsx_vldrepl_d(dst_tmp, 0); dst_tmp += dst_stride; @@ -563,7 +566,7 @@ static void common_hv_2ht_2vt_and_aver_dst_16w_lsx( int32_t dst_stride2 = dst_stride << 1; int32_t dst_stride3 = dst_stride2 + dst_stride; - int32_t dst_stride4 = dst_stride2 << 1; + int32_t dst_stride4 = dst_stride << 2; mask = __lsx_vld(mc_filt_mask_arr, 0); /* rearranging filter */ @@ -584,7 +587,7 @@ static void common_hv_2ht_2vt_and_aver_dst_16w_lsx( src1 = __lsx_vld(src_tmp1, 0); DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2, src3, src5); - src5 = __lsx_vldx(src_tmp1, src_stride3); + src7 = __lsx_vldx(src_tmp1, src_stride3); src += src_stride4; dst0 = __lsx_vld(dst, 0); DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, dst1, dst2); @@ -593,42 +596,39 @@ static void common_hv_2ht_2vt_and_aver_dst_16w_lsx( hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); - DUP2_ARG2(__lsx_vavgr_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1); tmp3 = __lsx_vpickev_b(tmp1, tmp0); tmp3 = __lsx_vavgr_bu(tmp3, dst0); __lsx_vst(tmp3, dst, 0); - dst += dst_stride; hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); - DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); - DUP2_ARG2(__lsx_vavgr_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); + DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1); tmp3 = __lsx_vpickev_b(tmp1, tmp0); tmp3 = __lsx_vavgr_bu(tmp3, dst1); - __lsx_vst(tmp3, dst, 0); - dst += dst_stride; + __lsx_vstx(tmp3, dst, dst_stride); hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS); DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); - DUP2_ARG2(__lsx_vavgr_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1); tmp3 = __lsx_vpickev_b(tmp1, tmp0); tmp3 = __lsx_vavgr_bu(tmp3, dst2); - __lsx_vst(tmp3, dst, 0); - dst += dst_stride; + __lsx_vstx(tmp3, dst, dst_stride2); - hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); - hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); - DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); - DUP2_ARG2(__lsx_vavgr_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); + hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS); + DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1); tmp3 = __lsx_vpickev_b(tmp1, tmp0); tmp3 = __lsx_vavgr_bu(tmp3, dst3); - __lsx_vst(tmp3, dst, 0); - dst += dst_stride; + __lsx_vstx(tmp3, dst, dst_stride3); + dst += dst_stride4; } } diff --git a/vpx_dsp/loongarch/vpx_convolve_lsx.h b/vpx_dsp/loongarch/vpx_convolve_lsx.h index 2fdb93db84..0e3dcae006 100644 --- a/vpx_dsp/loongarch/vpx_convolve_lsx.h +++ b/vpx_dsp/loongarch/vpx_convolve_lsx.h @@ -114,4 +114,19 @@ tmp1_m; \ }) +#define PCKEV_AVG_ST4_D(in0, in1, in2, in3, dst0, dst1, pdst, stride) \ + { \ + __m128i tmp0_m, tmp1_m; \ + \ + DUP2_ARG2(__lsx_vpickev_b, in1, in0, in3, in2, tmp0_m, tmp1_m); \ + DUP2_ARG2(__lsx_vavgr_bu, tmp0_m, dst0, tmp1_m, dst1, tmp0_m, tmp1_m); \ + __lsx_vstelm_d(tmp0_m, pdst, 0, 0); \ + pdst += stride; \ + __lsx_vstelm_d(tmp0_m, pdst, 0, 1); \ + pdst += stride; \ + __lsx_vstelm_d(tmp1_m, pdst, 0, 0); \ + pdst += stride; \ + __lsx_vstelm_d(tmp1_m, pdst, 0, 1); \ + } + #endif // VPX_VPX_DSP_LOONGARCH_VPX_CONVOLVE_LSX_H_ From d4060647213d51125457ae151a2402bf95ebdf71 Mon Sep 17 00:00:00 2001 From: yuanhecai Date: Mon, 28 Mar 2022 17:12:57 +0800 Subject: [PATCH 0313/1000] vp8[loongarch]: Optimize dequant_idct_add_y/uv_block 1. vp8_dequant_idct_add_uv_block_lsx 2. vp8_dequant_idct_add_y_block_lsx Bug: webm:1755 Change-Id: I1f006daaefb2075b422bc72a3f69c5abee776e2e --- vp8/common/loongarch/idct_lsx.c | 271 ++++++++++++++++++++++++++++++++ vp8/common/rtcd_defs.pl | 4 +- 2 files changed, 273 insertions(+), 2 deletions(-) diff --git a/vp8/common/loongarch/idct_lsx.c b/vp8/common/loongarch/idct_lsx.c index fb0b0384c4..679019ff63 100644 --- a/vp8/common/loongarch/idct_lsx.c +++ b/vp8/common/loongarch/idct_lsx.c @@ -12,6 +12,107 @@ #include "vp8/common/blockd.h" #include "vpx_util/loongson_intrinsics.h" +static const int32_t cospi8sqrt2minus1 = 20091; +static const int32_t sinpi8sqrt2 = 35468; + +#define TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + __m128i tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + \ + DUP2_ARG2(__lsx_vilvl_h, in1, in0, in3, in2, tmp0_m, tmp1_m); \ + DUP2_ARG2(__lsx_vilvh_h, in1, in0, in3, in2, tmp2_m, tmp3_m); \ + DUP2_ARG2(__lsx_vilvl_w, tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out2); \ + DUP2_ARG2(__lsx_vilvh_w, tmp1_m, tmp0_m, tmp3_m, tmp2_m, out1, out3); \ + } + +#define TRANSPOSE_TWO_4x4_H(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + __m128i s4_m, s5_m, s6_m, s7_m; \ + \ + TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, s4_m, s5_m, s6_m, s7_m); \ + DUP2_ARG2(__lsx_vilvl_d, s6_m, s4_m, s7_m, s5_m, out0, out2); \ + out1 = __lsx_vilvh_d(s6_m, s4_m); \ + out3 = __lsx_vilvh_d(s7_m, s5_m); \ + } + +#define EXPAND_TO_H_MULTIPLY_SINPI8SQRT2_PCK_TO_W(in) \ + ({ \ + __m128i out_m; \ + __m128i zero_m = __lsx_vldi(0); \ + __m128i tmp1_m, tmp2_m; \ + __m128i sinpi8_sqrt2_m = __lsx_vreplgr2vr_w(sinpi8sqrt2); \ + \ + tmp1_m = __lsx_vilvl_h(in, zero_m); \ + tmp2_m = __lsx_vilvh_h(in, zero_m); \ + tmp1_m = __lsx_vsrai_w(tmp1_m, 16); \ + tmp2_m = __lsx_vsrai_w(tmp2_m, 16); \ + tmp1_m = __lsx_vmul_w(tmp1_m, sinpi8_sqrt2_m); \ + tmp1_m = __lsx_vsrai_w(tmp1_m, 16); \ + tmp2_m = __lsx_vmul_w(tmp2_m, sinpi8_sqrt2_m); \ + tmp2_m = __lsx_vsrai_w(tmp2_m, 16); \ + out_m = __lsx_vpickev_h(tmp2_m, tmp1_m); \ + \ + out_m; \ + }) + +#define VP8_IDCT_1D_H(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + __m128i a1_m, b1_m, c1_m, d1_m; \ + __m128i c_tmp1_m, c_tmp2_m; \ + __m128i d_tmp1_m, d_tmp2_m; \ + __m128i const_cospi8sqrt2minus1_m; \ + \ + const_cospi8sqrt2minus1_m = __lsx_vreplgr2vr_h(cospi8sqrt2minus1); \ + a1_m = __lsx_vadd_h(in0, in2); \ + b1_m = __lsx_vsub_h(in0, in2); \ + c_tmp1_m = EXPAND_TO_H_MULTIPLY_SINPI8SQRT2_PCK_TO_W(in1); \ + \ + c_tmp2_m = __lsx_vmuh_h(in3, const_cospi8sqrt2minus1_m); \ + c_tmp2_m = __lsx_vslli_h(c_tmp2_m, 1); \ + c_tmp2_m = __lsx_vsrai_h(c_tmp2_m, 1); \ + c_tmp2_m = __lsx_vadd_h(in3, c_tmp2_m); \ + c1_m = __lsx_vsub_h(c_tmp1_m, c_tmp2_m); \ + \ + d_tmp1_m = __lsx_vmuh_h(in1, const_cospi8sqrt2minus1_m); \ + d_tmp1_m = __lsx_vslli_h(d_tmp1_m, 1); \ + d_tmp1_m = __lsx_vsrai_h(d_tmp1_m, 1); \ + d_tmp1_m = __lsx_vadd_h(in1, d_tmp1_m); \ + d_tmp2_m = EXPAND_TO_H_MULTIPLY_SINPI8SQRT2_PCK_TO_W(in3); \ + d1_m = __lsx_vadd_h(d_tmp1_m, d_tmp2_m); \ + LSX_BUTTERFLY_4_H(a1_m, b1_m, c1_m, d1_m, out0, out1, out2, out3); \ + } + +#define VP8_IDCT_1D_W(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + __m128i a1_m, b1_m, c1_m, d1_m; \ + __m128i c_tmp1_m, c_tmp2_m, d_tmp1_m, d_tmp2_m; \ + __m128i const_cospi8sqrt2minus1_m, sinpi8_sqrt2_m; \ + \ + const_cospi8sqrt2minus1_m = __lsx_vreplgr2vr_w(cospi8sqrt2minus1); \ + sinpi8_sqrt2_m = __lsx_vreplgr2vr_w(sinpi8sqrt2); \ + a1_m = __lsx_vadd_w(in0, in2); \ + b1_m = __lsx_vsub_w(in0, in2); \ + c_tmp1_m = __lsx_vmul_w(in1, sinpi8_sqrt2_m); \ + c_tmp1_m = __lsx_vsrai_w(c_tmp1_m, 16); \ + c_tmp2_m = __lsx_vmul_w(in3, const_cospi8sqrt2minus1_m); \ + c_tmp2_m = __lsx_vsrai_w(c_tmp2_m, 16); \ + c_tmp2_m = __lsx_vadd_w(in3, c_tmp2_m); \ + c1_m = __lsx_vsub_w(c_tmp1_m, c_tmp2_m); \ + d_tmp1_m = __lsx_vmul_w(in1, const_cospi8sqrt2minus1_m); \ + d_tmp1_m = __lsx_vsrai_w(d_tmp1_m, 16); \ + d_tmp1_m = __lsx_vadd_w(in1, d_tmp1_m); \ + d_tmp2_m = __lsx_vmul_w(in3, sinpi8_sqrt2_m); \ + d_tmp2_m = __lsx_vsrai_w(d_tmp2_m, 16); \ + d1_m = __lsx_vadd_w(d_tmp1_m, d_tmp2_m); \ + LSX_BUTTERFLY_4_W(a1_m, b1_m, c1_m, d1_m, out0, out1, out2, out3); \ + } + +#define UNPCK_SH_SW(in, out0, out1) \ + { \ + out0 = __lsx_vsllwil_w_h(in, 0); \ + out1 = __lsx_vexth_w_h(in); \ + } + static void idct4x4_addconst_lsx(int16_t in_dc, uint8_t *pred, int32_t pred_stride, uint8_t *dest, int32_t dest_stride) { @@ -52,3 +153,173 @@ void vp8_dc_only_idct_add_lsx(int16_t input_dc, uint8_t *pred_ptr, int32_t dst_stride) { idct4x4_addconst_lsx(input_dc, pred_ptr, pred_stride, dst_ptr, dst_stride); } + +static void dequant_idct4x4_addblk_2x_lsx(int16_t *input, + int16_t *dequant_input, uint8_t *dest, + int32_t dest_stride) { + __m128i dest0, dest1, dest2, dest3; + __m128i in0, in1, in2, in3, mul0, mul1, mul2, mul3, dequant_in0, dequant_in1; + __m128i hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3, res0, res1, res2, res3; + __m128i hz0l, hz1l, hz2l, hz3l, hz0r, hz1r, hz2r, hz3r; + __m128i vt0l, vt1l, vt2l, vt3l, vt0r, vt1r, vt2r, vt3r; + __m128i zero = __lsx_vldi(0); + + int32_t dest_stride2 = dest_stride << 1; + int32_t dest_stride3 = dest_stride2 + dest_stride; + + DUP4_ARG2(__lsx_vld, input, 0, input, 16, input, 32, input, 48, in0, in1, in2, + in3); + DUP2_ARG2(__lsx_vld, dequant_input, 0, dequant_input, 16, dequant_in0, + dequant_in1); + + DUP4_ARG2(__lsx_vmul_h, in0, dequant_in0, in1, dequant_in1, in2, dequant_in0, + in3, dequant_in1, mul0, mul1, mul2, mul3); + DUP2_ARG2(__lsx_vpickev_d, mul2, mul0, mul3, mul1, in0, in2); + DUP2_ARG2(__lsx_vpickod_d, mul2, mul0, mul3, mul1, in1, in3); + + VP8_IDCT_1D_H(in0, in1, in2, in3, hz0, hz1, hz2, hz3); + TRANSPOSE_TWO_4x4_H(hz0, hz1, hz2, hz3, hz0, hz1, hz2, hz3); + UNPCK_SH_SW(hz0, hz0r, hz0l); + UNPCK_SH_SW(hz1, hz1r, hz1l); + UNPCK_SH_SW(hz2, hz2r, hz2l); + UNPCK_SH_SW(hz3, hz3r, hz3l); + VP8_IDCT_1D_W(hz0l, hz1l, hz2l, hz3l, vt0l, vt1l, vt2l, vt3l); + DUP4_ARG2(__lsx_vsrari_w, vt0l, 3, vt1l, 3, vt2l, 3, vt3l, 3, vt0l, vt1l, + vt2l, vt3l); + VP8_IDCT_1D_W(hz0r, hz1r, hz2r, hz3r, vt0r, vt1r, vt2r, vt3r); + DUP4_ARG2(__lsx_vsrari_w, vt0r, 3, vt1r, 3, vt2r, 3, vt3r, 3, vt0r, vt1r, + vt2r, vt3r); + DUP4_ARG2(__lsx_vpickev_h, vt0l, vt0r, vt1l, vt1r, vt2l, vt2r, vt3l, vt3r, + vt0, vt1, vt2, vt3); + TRANSPOSE_TWO_4x4_H(vt0, vt1, vt2, vt3, vt0, vt1, vt2, vt3); + dest0 = __lsx_vld(dest, 0); + DUP2_ARG2(__lsx_vldx, dest, dest_stride, dest, dest_stride2, dest1, dest2); + dest3 = __lsx_vldx(dest, dest_stride3); + DUP4_ARG2(__lsx_vilvl_b, zero, dest0, zero, dest1, zero, dest2, zero, dest3, + res0, res1, res2, res3); + DUP4_ARG2(__lsx_vadd_h, res0, vt0, res1, vt1, res2, vt2, res3, vt3, res0, + res1, res2, res3); + + res0 = __lsx_vclip255_h(res0); + res1 = __lsx_vclip255_h(res1); + res2 = __lsx_vclip255_h(res2); + res3 = __lsx_vclip255_h(res3); + DUP2_ARG2(__lsx_vpickev_b, res1, res0, res3, res2, vt0l, vt1l); + + __lsx_vstelm_d(vt0l, dest, 0, 0); + __lsx_vstelm_d(vt0l, dest + dest_stride, 0, 1); + __lsx_vstelm_d(vt1l, dest + dest_stride2, 0, 0); + __lsx_vstelm_d(vt1l, dest + dest_stride3, 0, 1); + + __lsx_vst(zero, input, 0); + __lsx_vst(zero, input, 16); + __lsx_vst(zero, input, 32); + __lsx_vst(zero, input, 48); +} + +static void dequant_idct_addconst_2x_lsx(int16_t *input, int16_t *dequant_input, + uint8_t *dest, int32_t dest_stride) { + __m128i input_dc0, input_dc1, vec, res0, res1, res2, res3; + __m128i dest0, dest1, dest2, dest3; + __m128i zero = __lsx_vldi(0); + int32_t dest_stride2 = dest_stride << 1; + int32_t dest_stride3 = dest_stride2 + dest_stride; + + input_dc0 = __lsx_vreplgr2vr_h(input[0] * dequant_input[0]); + input_dc1 = __lsx_vreplgr2vr_h(input[16] * dequant_input[0]); + DUP2_ARG2(__lsx_vsrari_h, input_dc0, 3, input_dc1, 3, input_dc0, input_dc1); + vec = __lsx_vpickev_d(input_dc1, input_dc0); + input[0] = 0; + input[16] = 0; + dest0 = __lsx_vld(dest, 0); + DUP2_ARG2(__lsx_vldx, dest, dest_stride, dest, dest_stride2, dest1, dest2); + dest3 = __lsx_vldx(dest, dest_stride3); + DUP4_ARG2(__lsx_vilvl_b, zero, dest0, zero, dest1, zero, dest2, zero, dest3, + res0, res1, res2, res3); + DUP4_ARG2(__lsx_vadd_h, res0, vec, res1, vec, res2, vec, res3, vec, res0, + res1, res2, res3); + res0 = __lsx_vclip255_h(res0); + res1 = __lsx_vclip255_h(res1); + res2 = __lsx_vclip255_h(res2); + res3 = __lsx_vclip255_h(res3); + + DUP2_ARG2(__lsx_vpickev_b, res1, res0, res3, res2, res0, res1); + __lsx_vstelm_d(res0, dest, 0, 0); + __lsx_vstelm_d(res0, dest + dest_stride, 0, 1); + __lsx_vstelm_d(res1, dest + dest_stride2, 0, 0); + __lsx_vstelm_d(res1, dest + dest_stride3, 0, 1); +} + +void vp8_dequant_idct_add_y_block_lsx(int16_t *q, int16_t *dq, uint8_t *dst, + int32_t stride, char *eobs) { + int16_t *eobs_h = (int16_t *)eobs; + uint8_t i; + + for (i = 4; i--;) { + if (eobs_h[0]) { + if (eobs_h[0] & 0xfefe) { + dequant_idct4x4_addblk_2x_lsx(q, dq, dst, stride); + } else { + dequant_idct_addconst_2x_lsx(q, dq, dst, stride); + } + } + + q += 32; + + if (eobs_h[1]) { + if (eobs_h[1] & 0xfefe) { + dequant_idct4x4_addblk_2x_lsx(q, dq, dst + 8, stride); + } else { + dequant_idct_addconst_2x_lsx(q, dq, dst + 8, stride); + } + } + + q += 32; + dst += (4 * stride); + eobs_h += 2; + } +} + +void vp8_dequant_idct_add_uv_block_lsx(int16_t *q, int16_t *dq, uint8_t *dst_u, + uint8_t *dst_v, int32_t stride, + char *eobs) { + int16_t *eobs_h = (int16_t *)eobs; + if (eobs_h[0]) { + if (eobs_h[0] & 0xfefe) { + dequant_idct4x4_addblk_2x_lsx(q, dq, dst_u, stride); + } else { + dequant_idct_addconst_2x_lsx(q, dq, dst_u, stride); + } + } + + q += 32; + dst_u += (stride * 4); + + if (eobs_h[1]) { + if (eobs_h[1] & 0xfefe) { + dequant_idct4x4_addblk_2x_lsx(q, dq, dst_u, stride); + } else { + dequant_idct_addconst_2x_lsx(q, dq, dst_u, stride); + } + } + + q += 32; + + if (eobs_h[2]) { + if (eobs_h[2] & 0xfefe) { + dequant_idct4x4_addblk_2x_lsx(q, dq, dst_v, stride); + } else { + dequant_idct_addconst_2x_lsx(q, dq, dst_v, stride); + } + } + q += 32; + dst_v += (stride * 4); + + if (eobs_h[3]) { + if (eobs_h[3] & 0xfefe) { + dequant_idct4x4_addblk_2x_lsx(q, dq, dst_v, stride); + } else { + dequant_idct_addconst_2x_lsx(q, dq, dst_v, stride); + } + } +} diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl index c7911032f6..e4b40fa9ed 100644 --- a/vp8/common/rtcd_defs.pl +++ b/vp8/common/rtcd_defs.pl @@ -38,10 +38,10 @@ () specialize qw/vp8_dequant_idct_add mmx neon dspr2 msa mmi/; add_proto qw/void vp8_dequant_idct_add_y_block/, "short *q, short *dq, unsigned char *dst, int stride, char *eobs"; -specialize qw/vp8_dequant_idct_add_y_block sse2 neon dspr2 msa mmi/; +specialize qw/vp8_dequant_idct_add_y_block sse2 neon dspr2 msa mmi lsx/; add_proto qw/void vp8_dequant_idct_add_uv_block/, "short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs"; -specialize qw/vp8_dequant_idct_add_uv_block sse2 neon dspr2 msa mmi/; +specialize qw/vp8_dequant_idct_add_uv_block sse2 neon dspr2 msa mmi lsx/; # # Loopfilter From 8ff9f66b8de7bcec70296c1f304ab409330c3525 Mon Sep 17 00:00:00 2001 From: yuanhecai Date: Wed, 30 Mar 2022 14:04:10 +0800 Subject: [PATCH 0314/1000] vp9[loongarch]: Optimize vpx_convolve8_avg_horiz_c 1. vpx_convolve8_avg_horiz_lsx Bug: webm:1755 Change-Id: I0b6520be0afa1689da329f56ec6cd95c1730250c --- test/convolve_test.cc | 8 +- .../loongarch/vpx_convolve8_avg_horiz_lsx.c | 972 ++++++++++++++++++ vpx_dsp/loongarch/vpx_convolve8_horiz_lsx.c | 11 - vpx_dsp/loongarch/vpx_convolve8_lsx.c | 11 - vpx_dsp/loongarch/vpx_convolve_lsx.h | 11 + vpx_dsp/vpx_dsp.mk | 1 + vpx_dsp/vpx_dsp_rtcd_defs.pl | 2 +- 7 files changed, 989 insertions(+), 27 deletions(-) create mode 100644 vpx_dsp/loongarch/vpx_convolve8_avg_horiz_lsx.c diff --git a/test/convolve_test.cc b/test/convolve_test.cc index a631ec77f7..5189be647a 100644 --- a/test/convolve_test.cc +++ b/test/convolve_test.cc @@ -1452,10 +1452,10 @@ INSTANTIATE_TEST_SUITE_P(MSA, ConvolveTest, #if HAVE_LSX const ConvolveFunctions convolve8_lsx( vpx_convolve_copy_c, vpx_convolve_avg_lsx, vpx_convolve8_horiz_lsx, - vpx_convolve8_avg_horiz_c, vpx_convolve8_vert_lsx, vpx_convolve8_avg_vert_c, - vpx_convolve8_lsx, vpx_convolve8_avg_lsx, vpx_scaled_horiz_c, - vpx_scaled_avg_horiz_c, vpx_scaled_vert_c, vpx_scaled_avg_vert_c, - vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0); + vpx_convolve8_avg_horiz_lsx, vpx_convolve8_vert_lsx, + vpx_convolve8_avg_vert_c, vpx_convolve8_lsx, vpx_convolve8_avg_lsx, + vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c, vpx_scaled_vert_c, + vpx_scaled_avg_vert_c, vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0); const ConvolveParam kArrayConvolve8_lsx[] = { ALL_SIZES(convolve8_lsx) }; INSTANTIATE_TEST_SUITE_P(LSX, ConvolveTest, diff --git a/vpx_dsp/loongarch/vpx_convolve8_avg_horiz_lsx.c b/vpx_dsp/loongarch/vpx_convolve8_avg_horiz_lsx.c new file mode 100644 index 0000000000..1c59228813 --- /dev/null +++ b/vpx_dsp/loongarch/vpx_convolve8_avg_horiz_lsx.c @@ -0,0 +1,972 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/loongarch/vpx_convolve_lsx.h" + +static const uint8_t mc_filt_mask_arr[16 * 3] = { + /* 8 width cases */ + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, + /* 4 width cases */ + 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20, + /* 4 width cases */ + 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28 +}; + +static void common_hz_8t_and_aver_dst_4x4_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter) { + uint8_t *dst_tmp = dst; + __m128i src0, src1, src2, src3; + __m128i filter0, filter1, filter2, filter3; + __m128i mask0, mask1, mask2, mask3; + __m128i tmp0, tmp1; + __m128i dst0, dst1, dst2, dst3; + + mask0 = __lsx_vld(mc_filt_mask_arr, 16); + src -= 3; + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + mask3 = __lsx_vaddi_bu(mask0, 6); + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + LSX_LD_4(src, src_stride, src0, src1, src2, src3); + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, + filter0, filter1, filter2, filter3, tmp0, tmp1); + dst0 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst1 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst2 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_w(dst_tmp, 0); + dst0 = __lsx_vilvl_w(dst1, dst0); + dst1 = __lsx_vilvl_w(dst3, dst2); + dst0 = __lsx_vilvl_d(dst1, dst0); + tmp0 = __lsx_vssrarni_b_h(tmp1, tmp0, 7); + tmp0 = __lsx_vxori_b(tmp0, 128); + dst0 = __lsx_vavgr_bu(tmp0, dst0); + __lsx_vstelm_w(dst0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(dst0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(dst0, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(dst0, dst, 0, 3); +} + +static void common_hz_8t_and_aver_dst_4x8_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter) { + uint8_t *dst_tmp = dst; + __m128i src0, src1, src2, src3, filter0, filter1, filter2, filter3; + __m128i mask0, mask1, mask2, mask3, tmp0, tmp1, tmp2, tmp3; + __m128i dst0, dst1; + + mask0 = __lsx_vld(mc_filt_mask_arr, 16); + src -= 3; + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + mask3 = __lsx_vaddi_bu(mask0, 6); + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + + LSX_LD_4(src, src_stride, src0, src1, src2, src3); + src += src_stride; + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + tmp0 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + tmp1 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + tmp2 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + tmp3 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + tmp0 = __lsx_vilvl_w(tmp1, tmp0); + tmp1 = __lsx_vilvl_w(tmp3, tmp2); + dst0 = __lsx_vilvl_d(tmp1, tmp0); + + tmp0 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + tmp1 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + tmp2 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + tmp3 = __lsx_vldrepl_w(dst_tmp, 0); + tmp0 = __lsx_vilvl_w(tmp1, tmp0); + tmp1 = __lsx_vilvl_w(tmp3, tmp2); + dst1 = __lsx_vilvl_d(tmp1, tmp0); + + HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, + filter0, filter1, filter2, filter3, tmp0, tmp1); + LSX_LD_4(src, src_stride, src0, src1, src2, src3); + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, + filter0, filter1, filter2, filter3, tmp2, tmp3); + DUP4_ARG3(__lsx_vssrarni_b_h, tmp0, tmp0, 7, tmp1, tmp1, 7, tmp2, tmp2, 7, + tmp3, tmp3, 7, tmp0, tmp1, tmp2, tmp3); + DUP2_ARG2(__lsx_vilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1); + DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); + DUP2_ARG2(__lsx_vavgr_bu, tmp0, dst0, tmp1, dst1, dst0, dst1); + __lsx_vstelm_w(dst0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(dst0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(dst0, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(dst0, dst, 0, 3); + dst += dst_stride; + __lsx_vstelm_w(dst1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(dst1, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(dst1, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(dst1, dst, 0, 3); +} + +static void common_hz_8t_and_aver_dst_4w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int8_t *filter, + int32_t height) { + if (height == 4) { + common_hz_8t_and_aver_dst_4x4_lsx(src, src_stride, dst, dst_stride, filter); + } else if (height == 8) { + common_hz_8t_and_aver_dst_4x8_lsx(src, src_stride, dst, dst_stride, filter); + } +} + +static void common_hz_8t_and_aver_dst_8w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int8_t *filter, + int32_t height) { + int32_t loop_cnt = height >> 2; + uint8_t *dst_tmp = dst; + __m128i src0, src1, src2, src3, filter0, filter1, filter2, filter3; + __m128i mask0, mask1, mask2, mask3; + __m128i tmp0, tmp1, tmp2, tmp3; + __m128i dst0, dst1, dst2, dst3; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + uint8_t *_src = (uint8_t *)src - 3; + + mask0 = __lsx_vld(mc_filt_mask_arr, 0); + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + mask3 = __lsx_vaddi_bu(mask0, 6); + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + for (; loop_cnt--;) { + src0 = __lsx_vld(_src, 0); + DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2); + src3 = __lsx_vldx(_src, src_stride3); + _src += src_stride4; + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filter0, filter1, filter2, filter3, tmp0, + tmp1, tmp2, tmp3); + dst0 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst1 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst2 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1); + DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, tmp0, tmp1); + DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); + DUP2_ARG2(__lsx_vavgr_bu, tmp0, dst0, tmp1, dst1, dst0, dst1); + __lsx_vstelm_d(dst0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(dst0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(dst1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(dst1, dst, 0, 1); + dst += dst_stride; + } +} + +static void common_hz_8t_and_aver_dst_16w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter, int32_t height) { + int32_t loop_cnt = height >> 1; + int32_t dst_stride2 = dst_stride << 1; + uint8_t *dst_tmp = dst; + __m128i src0, src1, src2, src3, filter0, filter1, filter2, filter3; + __m128i mask0, mask1, mask2, mask3, dst0, dst1, dst2, dst3; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + __m128i tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15; + + mask0 = __lsx_vld(mc_filt_mask_arr, 0); + src -= 3; + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + mask3 = __lsx_vaddi_bu(mask0, 6); + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + + for (; loop_cnt--;) { + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1); + src += src_stride; + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src2, src3); + src += src_stride; + dst0 = __lsx_vld(dst_tmp, 0); + dst1 = __lsx_vldx(dst_tmp, dst_stride); + dst_tmp += dst_stride2; + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src1, mask0, src2, src2, + mask0, src3, src3, mask0, tmp0, tmp1, tmp2, tmp3); + DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src1, mask1, src2, src2, + mask1, src3, src3, mask1, tmp4, tmp5, tmp6, tmp7); + DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask2, src1, src1, mask2, src2, src2, + mask2, src3, src3, mask2, tmp8, tmp9, tmp10, tmp11); + DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask3, src1, src1, mask3, src2, src2, + mask3, src3, src3, mask3, tmp12, tmp13, tmp14, tmp15); + DUP4_ARG2(__lsx_vdp2_h_b, tmp0, filter0, tmp1, filter0, tmp2, filter0, tmp3, + filter0, tmp0, tmp1, tmp2, tmp3); + DUP4_ARG2(__lsx_vdp2_h_b, tmp8, filter2, tmp9, filter2, tmp10, filter2, + tmp11, filter2, tmp8, tmp9, tmp10, tmp11); + DUP4_ARG3(__lsx_vdp2add_h_b, tmp0, tmp4, filter1, tmp1, tmp5, filter1, tmp2, + tmp6, filter1, tmp3, tmp7, filter1, tmp0, tmp1, tmp2, tmp3); + DUP4_ARG3(__lsx_vdp2add_h_b, tmp8, tmp12, filter3, tmp9, tmp13, filter3, + tmp10, tmp14, filter3, tmp11, tmp15, filter3, tmp4, tmp5, tmp6, + tmp7); + DUP4_ARG2(__lsx_vsadd_h, tmp0, tmp4, tmp1, tmp5, tmp2, tmp6, tmp3, tmp7, + tmp0, tmp1, tmp2, tmp3); + DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, dst2, dst3); + DUP2_ARG2(__lsx_vxori_b, dst2, 128, dst3, 128, dst2, dst3); + DUP2_ARG2(__lsx_vavgr_bu, dst0, dst2, dst1, dst3, dst0, dst1); + __lsx_vst(dst0, dst, 0); + __lsx_vstx(dst1, dst, dst_stride); + dst += dst_stride2; + } +} + +static void common_hz_8t_and_aver_dst_32w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt = height; + uint8_t *dst_tmp = dst; + __m128i src0, src1, src2, src3, filter0, filter1, filter2, filter3; + __m128i mask0, mask1, mask2, mask3, dst0, dst1; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + __m128i tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15; + __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; + + mask0 = __lsx_vld(mc_filt_mask_arr, 0); + src -= 3; + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + mask3 = __lsx_vaddi_bu(mask0, 6); + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + for (; loop_cnt--;) { + DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src2); + src3 = __lsx_vld(src, 24); + src1 = __lsx_vshuf_b(src2, src0, shuff); + src += src_stride; + DUP2_ARG2(__lsx_vld, dst_tmp, 0, dst, 16, dst0, dst1); + dst_tmp += dst_stride; + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src1, mask0, src2, src2, + mask0, src3, src3, mask0, tmp0, tmp1, tmp2, tmp3); + DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src1, mask1, src2, src2, + mask1, src3, src3, mask1, tmp4, tmp5, tmp6, tmp7); + DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask2, src1, src1, mask2, src2, src2, + mask2, src3, src3, mask2, tmp8, tmp9, tmp10, tmp11); + DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask3, src1, src1, mask3, src2, src2, + mask3, src3, src3, mask3, tmp12, tmp13, tmp14, tmp15); + DUP4_ARG2(__lsx_vdp2_h_b, tmp0, filter0, tmp1, filter0, tmp2, filter0, tmp3, + filter0, tmp0, tmp1, tmp2, tmp3); + DUP4_ARG2(__lsx_vdp2_h_b, tmp8, filter2, tmp9, filter2, tmp10, filter2, + tmp11, filter2, tmp8, tmp9, tmp10, tmp11); + DUP4_ARG3(__lsx_vdp2add_h_b, tmp0, tmp4, filter1, tmp1, tmp5, filter1, tmp2, + tmp6, filter1, tmp3, tmp7, filter1, tmp0, tmp1, tmp2, tmp3); + DUP4_ARG3(__lsx_vdp2add_h_b, tmp8, tmp12, filter3, tmp9, tmp13, filter3, + tmp10, tmp14, filter3, tmp11, tmp15, filter3, tmp4, tmp5, tmp6, + tmp7); + DUP4_ARG2(__lsx_vsadd_h, tmp0, tmp4, tmp1, tmp5, tmp2, tmp6, tmp3, tmp7, + tmp0, tmp1, tmp2, tmp3); + DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, tmp0, tmp1); + DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); + DUP2_ARG2(__lsx_vavgr_bu, dst0, tmp0, dst1, tmp1, dst0, dst1); + __lsx_vst(dst0, dst, 0); + __lsx_vst(dst1, dst, 16); + dst += dst_stride; + } +} + +static void common_hz_8t_and_aver_dst_64w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter, int32_t height) { + int32_t loop_cnt = height; + __m128i src0, src1, src2, src3; + __m128i filter0, filter1, filter2, filter3; + __m128i mask0, mask1, mask2, mask3; + __m128i out0, out1, out2, out3, dst0, dst1; + __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; + + mask0 = __lsx_vld(mc_filt_mask_arr, 0); + src -= 3; + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + mask3 = __lsx_vaddi_bu(mask0, 6); + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + for (; loop_cnt--;) { + DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src2); + src3 = __lsx_vld(src, 24); + src1 = __lsx_vshuf_b(src2, src0, shuff); + DUP2_ARG2(__lsx_vld, dst, 0, dst, 16, dst0, dst1); + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filter0, filter1, filter2, filter3, out0, + out1, out2, out3); + DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1); + DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); + DUP2_ARG2(__lsx_vavgr_bu, out0, dst0, out1, dst1, out0, out1); + __lsx_vst(out0, dst, 0); + __lsx_vst(out1, dst, 16); + + DUP2_ARG2(__lsx_vld, src, 32, src, 48, src0, src2); + src3 = __lsx_vld(src, 56); + src1 = __lsx_vshuf_b(src2, src0, shuff); + DUP2_ARG2(__lsx_vld, dst, 32, dst, 48, dst0, dst1); + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filter0, filter1, filter2, filter3, out0, + out1, out2, out3); + DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1); + DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); + DUP2_ARG2(__lsx_vavgr_bu, out0, dst0, out1, dst1, out0, out1); + __lsx_vst(out0, dst, 32); + __lsx_vst(out1, dst, 48); + src += src_stride; + dst += dst_stride; + } +} + +static void common_hz_2t_and_aver_dst_4x4_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter) { + __m128i src0, src1, src2, src3, mask; + __m128i dst0, dst1, dst2, dst3, vec0, vec1, filt0; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + uint8_t *dst_tmp = dst; + + mask = __lsx_vld(mc_filt_mask_arr, 16); + + /* rearranging filter */ + filt0 = __lsx_vldrepl_h(filter, 0); + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + dst0 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst1 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst2 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + DUP2_ARG2(__lsx_vilvl_w, dst1, dst0, dst3, dst2, dst0, dst1); + dst0 = __lsx_vilvl_d(dst1, dst0); + DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask, src3, src2, mask, vec0, vec1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec0, vec1); + vec0 = __lsx_vssrarni_bu_h(vec1, vec0, FILTER_BITS); + vec0 = __lsx_vavgr_bu(vec0, dst0); + __lsx_vstelm_w(vec0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(vec0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(vec0, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(vec0, dst, 0, 3); +} + +static void common_hz_2t_and_aver_dst_4x8_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter) { + __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask; + __m128i filt0, vec0, vec1, vec2, vec3, res0, res1, res2, res3; + __m128i dst0, dst1, dst2, dst3, dst4; + __m128i vec4, vec5, vec6, vec7; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + uint8_t *src_tmp1 = (uint8_t *)src + src_stride4; + uint8_t *dst_tmp = dst; + + mask = __lsx_vld(mc_filt_mask_arr, 16); + + /* rearranging filter */ + filt0 = __lsx_vldrepl_h(filter, 0); + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + + src4 = __lsx_vld(src_tmp1, 0); + DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2, src5, + src6); + src7 = __lsx_vldx(src_tmp1, src_stride3); + + dst0 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst1 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst2 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + DUP2_ARG2(__lsx_vilvl_w, dst1, dst0, dst3, dst2, dst0, dst1); + dst0 = __lsx_vilvl_d(dst1, dst0); + + dst1 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst2 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst4 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + DUP2_ARG2(__lsx_vilvl_w, dst2, dst1, dst4, dst3, dst1, dst2); + dst1 = __lsx_vilvl_d(dst2, dst1); + + DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask, src3, src2, mask, vec0, vec1); + DUP2_ARG3(__lsx_vshuf_b, src5, src4, mask, src7, src6, mask, vec2, vec3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0, + vec4, vec5, vec6, vec7); + DUP4_ARG3(__lsx_vssrarni_bu_h, vec4, vec4, FILTER_BITS, vec5, vec5, + FILTER_BITS, vec6, vec6, FILTER_BITS, vec7, vec7, FILTER_BITS, res0, + res1, res2, res3); + DUP2_ARG2(__lsx_vilvl_d, res1, res0, res3, res2, res0, res2); + DUP2_ARG2(__lsx_vavgr_bu, res0, dst0, res2, dst1, res0, res2); + + __lsx_vstelm_w(res0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(res0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(res0, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(res0, dst, 0, 3); + dst += dst_stride; + + __lsx_vstelm_w(res2, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(res2, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(res2, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(res2, dst, 0, 3); + dst += dst_stride; +} + +static void common_hz_2t_and_aver_dst_4w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int8_t *filter, + int32_t height) { + if (height == 4) { + common_hz_2t_and_aver_dst_4x4_lsx(src, src_stride, dst, dst_stride, filter); + } else if (height == 8) { + common_hz_2t_and_aver_dst_4x8_lsx(src, src_stride, dst, dst_stride, filter); + } +} + +static void common_hz_2t_and_aver_dst_8x4_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter) { + __m128i src0, src1, src2, src3, mask; + __m128i filt0, dst0, dst1, dst2, dst3; + __m128i vec0, vec1, vec2, vec3; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + uint8_t *dst_tmp = dst; + + mask = __lsx_vld(mc_filt_mask_arr, 0); + + /* rearranging filter */ + filt0 = __lsx_vldrepl_h(filter, 0); + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1); + DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0, + vec0, vec1, vec2, vec3); + DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2, + FILTER_BITS, vec0, vec1); + dst0 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst1 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst2 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + + DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1); + DUP2_ARG2(__lsx_vavgr_bu, vec0, dst0, vec1, dst1, vec0, vec1); + __lsx_vstelm_d(vec0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(vec0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(vec1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(vec1, dst, 0, 1); +} + +static void common_hz_2t_and_aver_dst_8x8mult_lsx( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + __m128i src0, src1, src2, src3, mask; + __m128i filt0, dst0, dst1, dst2, dst3; + __m128i vec0, vec1, vec2, vec3; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + uint8_t *dst_tmp = dst; + + mask = __lsx_vld(mc_filt_mask_arr, 0); + + /* rearranging filter */ + filt0 = __lsx_vldrepl_h(filter, 0); + + LSX_LD_4(src, src_stride, src0, src1, src2, src3); + src += src_stride; + + DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1); + DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0, + vec0, vec1, vec2, vec3); + DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2, + FILTER_BITS, vec0, vec2); + dst0 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst1 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst2 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1); + + DUP2_ARG2(__lsx_vavgr_bu, vec0, dst0, vec2, dst1, vec0, vec2); + __lsx_vstelm_d(vec0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(vec0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(vec2, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(vec2, dst, 0, 1); + dst += dst_stride; + + LSX_LD_4(src, src_stride, src0, src1, src2, src3); + src += src_stride; + DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1); + DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3); + + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0, + vec0, vec1, vec2, vec3); + DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2, + FILTER_BITS, vec0, vec2); + dst0 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst1 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst2 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1); + DUP2_ARG2(__lsx_vavgr_bu, vec0, dst0, vec2, dst1, vec0, vec2); + __lsx_vstelm_d(vec0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(vec0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(vec2, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(vec2, dst, 0, 1); + dst += dst_stride; + + if (height == 16) { + LSX_LD_4(src, src_stride, src0, src1, src2, src3); + src += src_stride; + + DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1); + DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, + filt0, vec0, vec1, vec2, vec3); + DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2, + FILTER_BITS, vec0, vec2); + dst0 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst1 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst2 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1); + DUP2_ARG2(__lsx_vavgr_bu, vec0, dst0, vec2, dst1, vec0, vec2); + __lsx_vstelm_d(vec0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(vec0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(vec2, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(vec2, dst, 0, 1); + dst += dst_stride; + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1); + DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, + filt0, vec0, vec1, vec2, vec3); + DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2, + FILTER_BITS, vec0, vec2); + dst0 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst1 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst2 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1); + DUP2_ARG2(__lsx_vavgr_bu, vec0, dst0, vec2, dst1, vec0, vec2); + __lsx_vstelm_d(vec0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(vec0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(vec2, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(vec2, dst, 0, 1); + dst += dst_stride; + } +} + +static void common_hz_2t_and_aver_dst_8w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int8_t *filter, + int32_t height) { + if (height == 4) { + common_hz_2t_and_aver_dst_8x4_lsx(src, src_stride, dst, dst_stride, filter); + } else { + common_hz_2t_and_aver_dst_8x8mult_lsx(src, src_stride, dst, dst_stride, + filter, height); + } +} + +static void common_hz_2t_and_aver_dst_16w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt = (height >> 2) - 1; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask; + __m128i filt0, dst0; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + __m128i res0, res1, res2, res3, res4, res5, res6, res7; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + uint8_t *src_tmp1 = (uint8_t *)src + 8; + + mask = __lsx_vld(mc_filt_mask_arr, 0); + + /* rearranging filter */ + filt0 = __lsx_vldrepl_h(filter, 0); + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src4); + src6 = __lsx_vldx(src, src_stride3); + src += src_stride4; + + src1 = __lsx_vld(src_tmp1, 0); + DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2, src3, + src5); + src7 = __lsx_vldx(src_tmp1, src_stride3); + src_tmp1 += src_stride4; + + DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1); + DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3); + DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, vec4, vec5); + DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask, src7, src7, mask, vec6, vec7); + + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0, + res0, res1, res2, res3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7, filt0, + res4, res5, res6, res7); + DUP4_ARG3(__lsx_vssrarni_bu_h, res1, res0, FILTER_BITS, res3, res2, + FILTER_BITS, res5, res4, FILTER_BITS, res7, res6, FILTER_BITS, res0, + res2, res4, res6); + dst0 = __lsx_vld(dst, 0); + res0 = __lsx_vavgr_bu(res0, dst0); + __lsx_vst(res0, dst, 0); + dst += dst_stride; + + dst0 = __lsx_vld(dst, 0); + res2 = __lsx_vavgr_bu(res2, dst0); + __lsx_vst(res2, dst, 0); + dst += dst_stride; + + dst0 = __lsx_vld(dst, 0); + res4 = __lsx_vavgr_bu(res4, dst0); + __lsx_vst(res4, dst, 0); + dst += dst_stride; + + dst0 = __lsx_vld(dst, 0); + res6 = __lsx_vavgr_bu(res6, dst0); + __lsx_vst(res6, dst, 0); + dst += dst_stride; + + for (; loop_cnt--;) { + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src4); + src6 = __lsx_vldx(src, src_stride3); + src += src_stride4; + + src1 = __lsx_vld(src_tmp1, 0); + DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2, src3, + src5); + src7 = __lsx_vldx(src_tmp1, src_stride3); + src_tmp1 += src_stride4; + + DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1); + DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3); + DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, vec4, vec5); + DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask, src7, src7, mask, vec6, vec7); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, + filt0, res0, res1, res2, res3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7, + filt0, res4, res5, res6, res7); + + DUP4_ARG3(__lsx_vssrarni_bu_h, res1, res0, FILTER_BITS, res3, res2, + FILTER_BITS, res5, res4, FILTER_BITS, res7, res6, FILTER_BITS, + res0, res2, res4, res6); + dst0 = __lsx_vld(dst, 0); + res0 = __lsx_vavgr_bu(res0, dst0); + __lsx_vst(res0, dst, 0); + dst += dst_stride; + + dst0 = __lsx_vld(dst, 0); + res2 = __lsx_vavgr_bu(res2, dst0); + __lsx_vst(res2, dst, 0); + dst += dst_stride; + + dst0 = __lsx_vld(dst, 0); + res4 = __lsx_vavgr_bu(res4, dst0); + __lsx_vst(res4, dst, 0); + dst += dst_stride; + + dst0 = __lsx_vld(dst, 0); + res6 = __lsx_vavgr_bu(res6, dst0); + __lsx_vst(res6, dst, 0); + dst += dst_stride; + } +} + +static void common_hz_2t_and_aver_dst_32w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt = (height >> 1); + __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask; + __m128i filt0, dst0, dst1; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + __m128i res0, res1, res2, res3, res4, res5, res6, res7; + __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; + + mask = __lsx_vld(mc_filt_mask_arr, 0); + + /* rearranging filter */ + filt0 = __lsx_vldrepl_h(filter, 0); + + for (; loop_cnt--;) { + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vld, src, 16, src, 24, src2, src3); + src1 = __lsx_vshuf_b(src2, src0, shuff); + src += src_stride; + src4 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vld, src, 16, src, 24, src6, src7); + src5 = __lsx_vshuf_b(src6, src4, shuff); + src += src_stride; + + DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1); + DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3); + DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, vec4, vec5); + DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask, src7, src7, mask, vec6, vec7); + + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, + filt0, res0, res1, res2, res3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7, + filt0, res4, res5, res6, res7); + DUP4_ARG3(__lsx_vssrarni_bu_h, res1, res0, FILTER_BITS, res3, res2, + FILTER_BITS, res5, res4, FILTER_BITS, res7, res6, FILTER_BITS, + res0, res2, res4, res6); + + DUP2_ARG2(__lsx_vld, dst, 0, dst, 16, dst0, dst1); + res0 = __lsx_vavgr_bu(res0, dst0); + __lsx_vst(res0, dst, 0); + res2 = __lsx_vavgr_bu(res2, dst1); + __lsx_vst(res2, dst, 16); + dst += dst_stride; + + DUP2_ARG2(__lsx_vld, dst, 0, dst, 16, dst0, dst1); + res4 = __lsx_vavgr_bu(res4, dst0); + __lsx_vst(res4, dst, 0); + res6 = __lsx_vavgr_bu(res6, dst1); + __lsx_vst(res6, dst, 16); + dst += dst_stride; + } +} + +static void common_hz_2t_and_aver_dst_64w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt = height; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask; + __m128i filt0, dst0, dst1, dst2, dst3; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + __m128i out0, out1, out2, out3, out4, out5, out6, out7; + __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; + + mask = __lsx_vld(mc_filt_mask_arr, 0); + + /* rearranging filter */ + filt0 = __lsx_vldrepl_h(filter, 0); + + for (; loop_cnt--;) { + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src2, src4, + src6); + src7 = __lsx_vld(src, 56); + DUP2_ARG3(__lsx_vshuf_b, src2, src0, shuff, src4, src2, shuff, src1, src3); + src5 = __lsx_vshuf_b(src6, src4, shuff); + src += src_stride; + + DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1); + DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3); + DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, vec4, vec5); + DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask, src7, src7, mask, vec6, vec7); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, + filt0, out0, out1, out2, out3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7, + filt0, out4, out5, out6, out7); + + DUP4_ARG3(__lsx_vssrarni_bu_h, out1, out0, FILTER_BITS, out3, out2, + FILTER_BITS, out5, out4, FILTER_BITS, out7, out6, FILTER_BITS, + out0, out2, out4, out6); + + DUP4_ARG2(__lsx_vld, dst, 0, dst, 16, dst, 32, dst, 48, dst0, dst1, dst2, + dst3); + out0 = __lsx_vavgr_bu(out0, dst0); + __lsx_vst(out0, dst, 0); + out2 = __lsx_vavgr_bu(out2, dst1); + __lsx_vst(out2, dst, 16); + out4 = __lsx_vavgr_bu(out4, dst2); + __lsx_vst(out4, dst, 32); + out6 = __lsx_vavgr_bu(out6, dst3); + __lsx_vst(out6, dst, 48); + dst += dst_stride; + } +} + +void vpx_convolve8_avg_horiz_lsx(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + const int16_t *const filter_x = filter[x0_q4]; + int8_t cnt, filt_hor[8]; + + assert(x_step_q4 == 16); + assert(((const int32_t *)filter_x)[1] != 0x800000); + + for (cnt = 0; cnt < 8; ++cnt) { + filt_hor[cnt] = filter_x[cnt]; + } + + if (vpx_get_filter_taps(filter_x) == 2) { + switch (w) { + case 4: + common_hz_2t_and_aver_dst_4w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], h); + break; + case 8: + common_hz_2t_and_aver_dst_8w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], h); + break; + case 16: + common_hz_2t_and_aver_dst_16w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], h); + break; + + case 32: + common_hz_2t_and_aver_dst_32w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], h); + break; + case 64: + common_hz_2t_and_aver_dst_64w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], h); + break; + default: + vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); + break; + } + } else { + switch (w) { + case 4: + common_hz_8t_and_aver_dst_4w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, h); + break; + case 8: + common_hz_8t_and_aver_dst_8w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, h); + break; + case 16: + common_hz_8t_and_aver_dst_16w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, h); + break; + case 32: + common_hz_8t_and_aver_dst_32w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, h); + break; + case 64: + common_hz_8t_and_aver_dst_64w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, h); + break; + default: + vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); + break; + } + } +} diff --git a/vpx_dsp/loongarch/vpx_convolve8_horiz_lsx.c b/vpx_dsp/loongarch/vpx_convolve8_horiz_lsx.c index 3608fe326c..5d67d65274 100644 --- a/vpx_dsp/loongarch/vpx_convolve8_horiz_lsx.c +++ b/vpx_dsp/loongarch/vpx_convolve8_horiz_lsx.c @@ -12,17 +12,6 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/loongarch/vpx_convolve_lsx.h" -#define LSX_LD_4(_src, _stride, _src0, _src1, _src2, _src3) \ - { \ - _src0 = __lsx_vld(_src, 0); \ - _src += _stride; \ - _src1 = __lsx_vld(_src, 0); \ - _src += _stride; \ - _src2 = __lsx_vld(_src, 0); \ - _src += _stride; \ - _src3 = __lsx_vld(_src, 0); \ - } - static const uint8_t mc_filt_mask_arr[16 * 3] = { /* 8 width cases */ 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, diff --git a/vpx_dsp/loongarch/vpx_convolve8_lsx.c b/vpx_dsp/loongarch/vpx_convolve8_lsx.c index 51a162bf3e..894c137203 100644 --- a/vpx_dsp/loongarch/vpx_convolve8_lsx.c +++ b/vpx_dsp/loongarch/vpx_convolve8_lsx.c @@ -12,17 +12,6 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/loongarch/vpx_convolve_lsx.h" -#define LSX_LD_4(_src, _stride, _src0, _src1, _src2, _src3) \ - { \ - _src0 = __lsx_vld(_src, 0); \ - _src += _stride; \ - _src1 = __lsx_vld(_src, 0); \ - _src += _stride; \ - _src2 = __lsx_vld(_src, 0); \ - _src += _stride; \ - _src3 = __lsx_vld(_src, 0); \ - } - static const uint8_t mc_filt_mask_arr[16 * 3] = { /* 8 width cases */ 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, diff --git a/vpx_dsp/loongarch/vpx_convolve_lsx.h b/vpx_dsp/loongarch/vpx_convolve_lsx.h index 0e3dcae006..d319bc4f7d 100644 --- a/vpx_dsp/loongarch/vpx_convolve_lsx.h +++ b/vpx_dsp/loongarch/vpx_convolve_lsx.h @@ -14,6 +14,17 @@ #include "vpx_util/loongson_intrinsics.h" #include "vpx_dsp/vpx_filter.h" +#define LSX_LD_4(_src, _stride, _src0, _src1, _src2, _src3) \ + { \ + _src0 = __lsx_vld(_src, 0); \ + _src += _stride; \ + _src1 = __lsx_vld(_src, 0); \ + _src += _stride; \ + _src2 = __lsx_vld(_src, 0); \ + _src += _stride; \ + _src3 = __lsx_vld(_src, 0); \ + } + #define FILT_8TAP_DPADD_S_H(_reg0, _reg1, _reg2, _reg3, _filter0, _filter1, \ _filter2, _filter3) \ ({ \ diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index bf348c1126..c844379364 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -164,6 +164,7 @@ DSP_SRCS-$(HAVE_DSPR2) += mips/convolve8_vert_dspr2.c DSP_SRCS-$(HAVE_VSX) += ppc/vpx_convolve_vsx.c # common (lsx) +DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve8_avg_horiz_lsx.c DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve8_horiz_lsx.c DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve8_vert_lsx.c DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve8_lsx.c diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 73f28ff927..649aa17d1d 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -386,7 +386,7 @@ () specialize qw/vpx_convolve8_avg sse2 ssse3 avx2 neon dspr2 msa vsx mmi lsx/; add_proto qw/void vpx_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; -specialize qw/vpx_convolve8_avg_horiz sse2 ssse3 avx2 neon dspr2 msa vsx mmi/; +specialize qw/vpx_convolve8_avg_horiz sse2 ssse3 avx2 neon dspr2 msa vsx mmi lsx/; add_proto qw/void vpx_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; specialize qw/vpx_convolve8_avg_vert sse2 ssse3 avx2 neon dspr2 msa vsx mmi/; From 6ac395ed771a4fa986637197a71ac6fe58d57965 Mon Sep 17 00:00:00 2001 From: James Zern Date: Thu, 31 Mar 2022 12:10:03 -0700 Subject: [PATCH 0315/1000] Revert "quantize: remove highbd version" This reverts commit 89cfe3835c47dabf77d38edb3af190155984fa9a. This is a prerequisite for reverting 2200039d33c49a9f7a5c438656df143755b022c4 which causes high bitdepth test failures Bug: webm:1586 Change-Id: I28f3b98f3339f3573b1492b88bf733dade133fc0 --- test/vp9_quantize_test.cc | 89 ++++--------------------- vpx_dsp/quantize.c | 132 ++++++++++++++++++++++++++++++-------- vpx_dsp/quantize.h | 17 ++++- 3 files changed, 134 insertions(+), 104 deletions(-) diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc index fcc7ff99ec..5773cd9835 100644 --- a/test/vp9_quantize_test.cc +++ b/test/vp9_quantize_test.cc @@ -30,6 +30,7 @@ #include "vpx/vpx_integer.h" #include "vpx_ports/msvc.h" #include "vpx_ports/vpx_timer.h" +#include "vpx_dsp/quantize.h" using libvpx_test::ACMRandom; using libvpx_test::Buffer; @@ -466,13 +467,10 @@ INSTANTIATE_TEST_SUITE_P( SSE2, VP9QuantizeTest, ::testing::Values(make_tuple(&vpx_quantize_b_sse2, &vpx_quantize_b_c, VPX_BITS_8, 16, false), - make_tuple(&vpx_quantize_b_sse2, &vpx_quantize_b_c, + make_tuple(&vpx_quantize_b_sse2, &vpx_highbd_quantize_b_c, VPX_BITS_10, 16, false), - make_tuple(&vpx_quantize_b_sse2, &vpx_quantize_b_c, - VPX_BITS_12, 16, false), - make_tuple(&QuantFPWrapper, - &QuantFPWrapper, VPX_BITS_8, - 16, true))); + make_tuple(&vpx_quantize_b_sse2, &vpx_highbd_quantize_b_c, + VPX_BITS_12, 16, false))); #else INSTANTIATE_TEST_SUITE_P( SSE2, VP9QuantizeTest, @@ -486,28 +484,6 @@ INSTANTIATE_TEST_SUITE_P( #if HAVE_SSSE3 #if VPX_ARCH_X86_64 -#if CONFIG_VP9_HIGHBITDEPTH -INSTANTIATE_TEST_SUITE_P( - SSSE3, VP9QuantizeTest, - ::testing::Values( - make_tuple(&vpx_quantize_b_ssse3, &vpx_quantize_b_c, VPX_BITS_8, 16, - false), - make_tuple(&vpx_quantize_b_ssse3, &vpx_quantize_b_c, VPX_BITS_10, 16, - false), - make_tuple(&vpx_quantize_b_ssse3, &vpx_quantize_b_c, VPX_BITS_12, 16, - false), - make_tuple(&vpx_quantize_b_32x32_ssse3, &vpx_quantize_b_32x32_c, - VPX_BITS_8, 32, false), - make_tuple(&vpx_quantize_b_32x32_ssse3, &vpx_quantize_b_32x32_c, - VPX_BITS_10, 32, false), - make_tuple(&vpx_quantize_b_32x32_ssse3, &vpx_quantize_b_32x32_c, - VPX_BITS_12, 32, false), - make_tuple(&QuantFPWrapper, - &QuantFPWrapper, VPX_BITS_8, 16, true), - make_tuple(&QuantFPWrapper, - &QuantFPWrapper, VPX_BITS_8, 32, - true))); -#else INSTANTIATE_TEST_SUITE_P( SSSE3, VP9QuantizeTest, ::testing::Values(make_tuple(&vpx_quantize_b_ssse3, &vpx_quantize_b_c, @@ -521,24 +497,6 @@ INSTANTIATE_TEST_SUITE_P( make_tuple(&QuantFPWrapper, &QuantFPWrapper, VPX_BITS_8, 32, true))); -#endif // #CONFIG_VP9_HIGHBITDEPTH -#else -#if CONFIG_VP9_HIGHBITDEPTH -INSTANTIATE_TEST_SUITE_P( - SSSE3, VP9QuantizeTest, - ::testing::Values( - make_tuple(&vpx_quantize_b_ssse3, &vpx_quantize_b_c, VPX_BITS_8, 16, - false), - make_tuple(&vpx_quantize_b_ssse3, &vpx_quantize_b_c, VPX_BITS_10, 16, - false), - make_tuple(&vpx_quantize_b_ssse3, &vpx_quantize_b_c, VPX_BITS_12, 16, - false), - make_tuple(&vpx_quantize_b_32x32_ssse3, &vpx_quantize_b_32x32_c, - VPX_BITS_8, 32, false), - make_tuple(&vpx_quantize_b_32x32_ssse3, &vpx_quantize_b_32x32_c, - VPX_BITS_10, 32, false), - make_tuple(&vpx_quantize_b_32x32_ssse3, &vpx_quantize_b_32x32_c, - VPX_BITS_12, 32, false))); #else INSTANTIATE_TEST_SUITE_P( SSSE3, VP9QuantizeTest, @@ -547,7 +505,7 @@ INSTANTIATE_TEST_SUITE_P( make_tuple(&vpx_quantize_b_32x32_ssse3, &vpx_quantize_b_32x32_c, VPX_BITS_8, 32, false))); -#endif // CONFIG_VP9_HIGHBITDEPTH + #endif // VPX_ARCH_X86_64 #endif // HAVE_SSSE3 @@ -558,15 +516,15 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values( make_tuple(&vpx_quantize_b_avx, &vpx_quantize_b_c, VPX_BITS_8, 16, false), - make_tuple(&vpx_quantize_b_avx, &vpx_quantize_b_c, VPX_BITS_10, 16, - false), - make_tuple(&vpx_quantize_b_avx, &vpx_quantize_b_c, VPX_BITS_12, 16, - false), - make_tuple(&vpx_quantize_b_32x32_avx, &vpx_quantize_b_32x32_c, + make_tuple(&vpx_quantize_b_avx, &vpx_highbd_quantize_b_c, VPX_BITS_10, + 16, false), + make_tuple(&vpx_quantize_b_avx, &vpx_highbd_quantize_b_c, VPX_BITS_12, + 16, false), + make_tuple(&vpx_quantize_b_32x32_avx, &vpx_highbd_quantize_b_32x32_c, VPX_BITS_8, 32, false), - make_tuple(&vpx_quantize_b_32x32_avx, &vpx_quantize_b_32x32_c, + make_tuple(&vpx_quantize_b_32x32_avx, &vpx_highbd_quantize_b_32x32_c, VPX_BITS_10, 32, false), - make_tuple(&vpx_quantize_b_32x32_avx, &vpx_quantize_b_32x32_c, + make_tuple(&vpx_quantize_b_32x32_avx, &vpx_highbd_quantize_b_32x32_c, VPX_BITS_12, 32, false))); #else @@ -589,28 +547,6 @@ INSTANTIATE_TEST_SUITE_P( #endif // HAVE_AVX2 #if HAVE_NEON -#if CONFIG_VP9_HIGHBITDEPTH -INSTANTIATE_TEST_SUITE_P( - NEON, VP9QuantizeTest, - ::testing::Values( - make_tuple(&vpx_quantize_b_neon, &vpx_quantize_b_c, VPX_BITS_8, 16, - false), - make_tuple(&vpx_quantize_b_neon, &vpx_quantize_b_c, VPX_BITS_10, 16, - false), - make_tuple(&vpx_quantize_b_neon, &vpx_quantize_b_c, VPX_BITS_12, 16, - false), - make_tuple(&vpx_quantize_b_32x32_neon, &vpx_quantize_b_32x32_c, - VPX_BITS_8, 32, false), - make_tuple(&vpx_quantize_b_32x32_neon, &vpx_quantize_b_32x32_c, - VPX_BITS_10, 32, false), - make_tuple(&vpx_quantize_b_32x32_neon, &vpx_quantize_b_32x32_c, - VPX_BITS_12, 32, false), - make_tuple(&QuantFPWrapper, - &QuantFPWrapper, VPX_BITS_8, 16, true), - make_tuple(&QuantFPWrapper, - &QuantFPWrapper, VPX_BITS_8, 32, - true))); -#else INSTANTIATE_TEST_SUITE_P( NEON, VP9QuantizeTest, ::testing::Values(make_tuple(&vpx_quantize_b_neon, &vpx_quantize_b_c, @@ -624,7 +560,6 @@ INSTANTIATE_TEST_SUITE_P( make_tuple(&QuantFPWrapper, &QuantFPWrapper, VPX_BITS_8, 32, true))); -#endif // CONFIG_VP9_HIGHBITDEPTH #endif // HAVE_NEON #if HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/quantize.c b/vpx_dsp/quantize.c index c29b99bb0c..5d6ba64a8a 100644 --- a/vpx_dsp/quantize.c +++ b/vpx_dsp/quantize.c @@ -147,25 +147,66 @@ void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; if (abs_coeff >= zbins[rc != 0]) { + int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX); + tmp = ((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) * + quant_shift_ptr[rc != 0]) >> + 16; // quantization + qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = (tran_low_t)(qcoeff_ptr[rc] * dequant_ptr[rc != 0]); + + if (tmp) eob = i; + } + } + *eob_ptr = eob + 1; +} + #if CONFIG_VP9_HIGHBITDEPTH - // High bit depth configurations do not clamp to INT16. +void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + int i, non_zero_count = (int)n_coeffs, eob = -1; + const int zbins[2] = { zbin_ptr[0], zbin_ptr[1] }; + const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; + (void)iscan; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + // Pre-scan pass + for (i = (int)n_coeffs - 1; i >= 0; i--) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + + if (coeff < zbins[rc != 0] && coeff > nzbins[rc != 0]) + non_zero_count--; + else + break; + } + + // Quantization pass: All coefficients with index >= zero_flag are + // skippable. Note: zero_flag can be zero. + for (i = 0; i < non_zero_count; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + + if (abs_coeff >= zbins[rc != 0]) { const int64_t tmp1 = abs_coeff + round_ptr[rc != 0]; const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; const int abs_qcoeff = (int)((tmp2 * quant_shift_ptr[rc != 0]) >> 16); -#else - const int tmp = - clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX); - const int abs_qcoeff = ((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) * - quant_shift_ptr[rc != 0]) >> - 16; // quantization -#endif // CONFIG_VP9_HIGHBITDEPTH qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); - dqcoeff_ptr[rc] = (tran_low_t)(qcoeff_ptr[rc] * dequant_ptr[rc != 0]); + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0]; if (abs_qcoeff) eob = i; } } *eob_ptr = eob + 1; } +#endif void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, @@ -202,23 +243,15 @@ void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int rc = scan[idx_arr[i]]; const int coeff = coeff_ptr[rc]; const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - int abs_qcoeff = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); -#if CONFIG_VP9_HIGHBITDEPTH - // High bit depth configurations do not clamp to INT16. - { - const int64_t tmp = - ((abs_qcoeff * quant_ptr[rc != 0]) >> 16) + abs_qcoeff; - abs_qcoeff = (int)((tmp * quant_shift_ptr[rc != 0]) >> 15); - } -#else - abs_qcoeff = clamp(abs_qcoeff, INT16_MIN, INT16_MAX); - abs_qcoeff = ((((abs_qcoeff * quant_ptr[rc != 0]) >> 16) + abs_qcoeff) * - quant_shift_ptr[rc != 0]) >> - 15; -#endif // CONFIG_VP9_HIGHBITDEPTH - - qcoeff_ptr[rc] = (abs_qcoeff ^ coeff_sign) - coeff_sign; + int tmp; + int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + abs_coeff += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); + abs_coeff = clamp(abs_coeff, INT16_MIN, INT16_MAX); + tmp = ((((abs_coeff * quant_ptr[rc != 0]) >> 16) + abs_coeff) * + quant_shift_ptr[rc != 0]) >> + 15; + + qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; #if (VPX_ARCH_X86 || VPX_ARCH_X86_64) && !CONFIG_VP9_HIGHBITDEPTH // When tran_low_t is only 16 bits dqcoeff can outrange it. Rather than // truncating with a cast, saturate the value. This is easier to implement @@ -229,7 +262,54 @@ void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; #endif // VPX_ARCH_X86 && CONFIG_VP9_HIGHBITDEPTH + if (tmp) eob = idx_arr[i]; + } + *eob_ptr = eob + 1; +} + +#if CONFIG_VP9_HIGHBITDEPTH +void vpx_highbd_quantize_b_32x32_c( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], 1), + ROUND_POWER_OF_TWO(zbin_ptr[1], 1) }; + const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; + + int idx = 0; + int idx_arr[1024]; + int i, eob = -1; + (void)iscan; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + // Pre-scan pass + for (i = 0; i < n_coeffs; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + + // If the coefficient is out of the base ZBIN range, keep it for + // quantization. + if (coeff >= zbins[rc != 0] || coeff <= nzbins[rc != 0]) idx_arr[idx++] = i; + } + + // Quantization pass: only process the coefficients selected in + // pre-scan pass. Note: idx can be zero. + for (i = 0; i < idx; i++) { + const int rc = scan[idx_arr[i]]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int64_t tmp1 = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); + const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; + const int abs_qcoeff = (int)((tmp2 * quant_shift_ptr[rc != 0]) >> 15); + qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; if (abs_qcoeff) eob = idx_arr[i]; } *eob_ptr = eob + 1; } +#endif diff --git a/vpx_dsp/quantize.h b/vpx_dsp/quantize.h index 9ac1a47418..0fcd77941b 100644 --- a/vpx_dsp/quantize.h +++ b/vpx_dsp/quantize.h @@ -38,7 +38,22 @@ void vpx_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t dequant, uint16_t *eob_ptr); -#endif // CONFIG_VP9_HIGHBITDEPTH +// Only used for reference. The optimized versions can handle HBD. +void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan); + +void vpx_highbd_quantize_b_32x32_c( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan); +#endif #ifdef __cplusplus } // extern "C" From d00fd066e85176df0a21de3e99bad92ac2bacb00 Mon Sep 17 00:00:00 2001 From: James Zern Date: Thu, 31 Mar 2022 12:11:01 -0700 Subject: [PATCH 0316/1000] Revert "quantize: replace highbd versions" This reverts commit 2200039d33c49a9f7a5c438656df143755b022c4. This causes failures with VP9/EndToEndTestLarge.EndtoEndPSNRTest/*; it seems the assembly does not match the C code. Bug: webm:1586 Change-Id: I4c63beebf88d4c12789d681b0d38014510b147fe --- test/vp9_quantize_test.cc | 42 +++--- vp9/encoder/vp9_encodemb.c | 48 +++---- vp9/encoder/vp9_quantize.c | 8 ++ vpx_dsp/quantize.h | 16 --- vpx_dsp/vpx_dsp.mk | 3 + vpx_dsp/vpx_dsp_rtcd_defs.pl | 8 ++ vpx_dsp/x86/highbd_quantize_intrin_sse2.c | 152 ++++++++++++++++++++++ 7 files changed, 211 insertions(+), 66 deletions(-) create mode 100644 vpx_dsp/x86/highbd_quantize_intrin_sse2.c diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc index 5773cd9835..d54f1bc9cd 100644 --- a/test/vp9_quantize_test.cc +++ b/test/vp9_quantize_test.cc @@ -30,7 +30,6 @@ #include "vpx/vpx_integer.h" #include "vpx_ports/msvc.h" #include "vpx_ports/vpx_timer.h" -#include "vpx_dsp/quantize.h" using libvpx_test::ACMRandom; using libvpx_test::Buffer; @@ -465,12 +464,22 @@ using std::make_tuple; #if CONFIG_VP9_HIGHBITDEPTH INSTANTIATE_TEST_SUITE_P( SSE2, VP9QuantizeTest, - ::testing::Values(make_tuple(&vpx_quantize_b_sse2, &vpx_quantize_b_c, - VPX_BITS_8, 16, false), - make_tuple(&vpx_quantize_b_sse2, &vpx_highbd_quantize_b_c, - VPX_BITS_10, 16, false), - make_tuple(&vpx_quantize_b_sse2, &vpx_highbd_quantize_b_c, - VPX_BITS_12, 16, false))); + ::testing::Values( + make_tuple(&vpx_quantize_b_sse2, &vpx_quantize_b_c, VPX_BITS_8, 16, + false), + make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c, + VPX_BITS_8, 16, false), + make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c, + VPX_BITS_10, 16, false), + make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c, + VPX_BITS_12, 16, false), + make_tuple(&vpx_highbd_quantize_b_32x32_sse2, + &vpx_highbd_quantize_b_32x32_c, VPX_BITS_8, 32, false), + make_tuple(&vpx_highbd_quantize_b_32x32_sse2, + &vpx_highbd_quantize_b_32x32_c, VPX_BITS_10, 32, false), + make_tuple(&vpx_highbd_quantize_b_32x32_sse2, + &vpx_highbd_quantize_b_32x32_c, VPX_BITS_12, 32, false))); + #else INSTANTIATE_TEST_SUITE_P( SSE2, VP9QuantizeTest, @@ -510,24 +519,6 @@ INSTANTIATE_TEST_SUITE_P( #endif // HAVE_SSSE3 #if HAVE_AVX -#if CONFIG_VP9_HIGHBITDEPTH -INSTANTIATE_TEST_SUITE_P( - AVX, VP9QuantizeTest, - ::testing::Values( - make_tuple(&vpx_quantize_b_avx, &vpx_quantize_b_c, VPX_BITS_8, 16, - false), - make_tuple(&vpx_quantize_b_avx, &vpx_highbd_quantize_b_c, VPX_BITS_10, - 16, false), - make_tuple(&vpx_quantize_b_avx, &vpx_highbd_quantize_b_c, VPX_BITS_12, - 16, false), - make_tuple(&vpx_quantize_b_32x32_avx, &vpx_highbd_quantize_b_32x32_c, - VPX_BITS_8, 32, false), - make_tuple(&vpx_quantize_b_32x32_avx, &vpx_highbd_quantize_b_32x32_c, - VPX_BITS_10, 32, false), - make_tuple(&vpx_quantize_b_32x32_avx, &vpx_highbd_quantize_b_32x32_c, - VPX_BITS_12, 32, false))); - -#else INSTANTIATE_TEST_SUITE_P(AVX, VP9QuantizeTest, ::testing::Values(make_tuple(&vpx_quantize_b_avx, &vpx_quantize_b_c, @@ -535,7 +526,6 @@ INSTANTIATE_TEST_SUITE_P(AVX, VP9QuantizeTest, make_tuple(&vpx_quantize_b_32x32_avx, &vpx_quantize_b_32x32_c, VPX_BITS_8, 32, false))); -#endif // CONFIG_VP9_HIGHBITDEPTH #endif // HAVE_AVX #if VPX_ARCH_X86_64 && HAVE_AVX2 diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c index e708555f89..fa222f9dcf 100644 --- a/vp9/encoder/vp9_encodemb.c +++ b/vp9/encoder/vp9_encodemb.c @@ -511,28 +511,28 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col, switch (tx_size) { case TX_32X32: highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); - vpx_quantize_b_32x32(coeff, 1024, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, - scan_order->scan, scan_order->iscan); + vpx_highbd_quantize_b_32x32( + coeff, 1024, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, + dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); break; case TX_16X16: vpx_highbd_fdct16x16(src_diff, coeff, diff_stride); - vpx_quantize_b(coeff, 256, p->zbin, p->round, p->quant, p->quant_shift, - qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vpx_highbd_quantize_b(coeff, 256, p->zbin, p->round, p->quant, + p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, + scan_order->scan, scan_order->iscan); break; case TX_8X8: vpx_highbd_fdct8x8(src_diff, coeff, diff_stride); - vpx_quantize_b(coeff, 64, p->zbin, p->round, p->quant, p->quant_shift, - qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vpx_highbd_quantize_b(coeff, 64, p->zbin, p->round, p->quant, + p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, + scan_order->scan, scan_order->iscan); break; default: assert(tx_size == TX_4X4); x->fwd_txfm4x4(src_diff, coeff, diff_stride); - vpx_quantize_b(coeff, 16, p->zbin, p->round, p->quant, p->quant_shift, - qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vpx_highbd_quantize_b(coeff, 16, p->zbin, p->round, p->quant, + p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, + scan_order->scan, scan_order->iscan); break; } return; @@ -857,9 +857,9 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, vpx_highbd_subtract_block(32, 32, src_diff, diff_stride, src, src_stride, dst, dst_stride, xd->bd); highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); - vpx_quantize_b_32x32(coeff, 1024, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, - eob, scan_order->scan, scan_order->iscan); + vpx_highbd_quantize_b_32x32( + coeff, 1024, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, + dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); } if (args->enable_coeff_opt && !x->skip_recode) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; @@ -876,9 +876,9 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, vpx_highbd_fdct16x16(src_diff, coeff, diff_stride); else vp9_highbd_fht16x16(src_diff, coeff, diff_stride, tx_type); - vpx_quantize_b(coeff, 256, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, - scan_order->scan, scan_order->iscan); + vpx_highbd_quantize_b(coeff, 256, p->zbin, p->round, p->quant, + p->quant_shift, qcoeff, dqcoeff, pd->dequant, + eob, scan_order->scan, scan_order->iscan); } if (args->enable_coeff_opt && !x->skip_recode) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; @@ -896,9 +896,9 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, vpx_highbd_fdct8x8(src_diff, coeff, diff_stride); else vp9_highbd_fht8x8(src_diff, coeff, diff_stride, tx_type); - vpx_quantize_b(coeff, 64, p->zbin, p->round, p->quant, p->quant_shift, - qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vpx_highbd_quantize_b(coeff, 64, p->zbin, p->round, p->quant, + p->quant_shift, qcoeff, dqcoeff, pd->dequant, + eob, scan_order->scan, scan_order->iscan); } if (args->enable_coeff_opt && !x->skip_recode) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; @@ -917,9 +917,9 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, vp9_highbd_fht4x4(src_diff, coeff, diff_stride, tx_type); else x->fwd_txfm4x4(src_diff, coeff, diff_stride); - vpx_quantize_b(coeff, 16, p->zbin, p->round, p->quant, p->quant_shift, - qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vpx_highbd_quantize_b(coeff, 16, p->zbin, p->round, p->quant, + p->quant_shift, qcoeff, dqcoeff, pd->dequant, + eob, scan_order->scan, scan_order->iscan); } if (args->enable_coeff_opt && !x->skip_recode) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c index 1c401e96b4..9058997b0f 100644 --- a/vp9/encoder/vp9_quantize.c +++ b/vp9/encoder/vp9_quantize.c @@ -164,6 +164,14 @@ void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block, return; } +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + vpx_highbd_quantize_b(BLOCK_OFFSET(p->coeff, block), n_coeffs, p->zbin, + p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, + pd->dequant, &p->eobs[block], scan, iscan); + return; + } +#endif vpx_quantize_b(BLOCK_OFFSET(p->coeff, block), n_coeffs, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, pd->dequant, &p->eobs[block], scan, iscan); diff --git a/vpx_dsp/quantize.h b/vpx_dsp/quantize.h index 0fcd77941b..8e138445e2 100644 --- a/vpx_dsp/quantize.h +++ b/vpx_dsp/quantize.h @@ -37,22 +37,6 @@ void vpx_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t dequant, uint16_t *eob_ptr); - -// Only used for reference. The optimized versions can handle HBD. -void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan); - -void vpx_highbd_quantize_b_32x32_c( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan); #endif #ifdef __cplusplus diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index bf348c1126..a1e511cce0 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -318,6 +318,9 @@ DSP_SRCS-$(HAVE_SSSE3) += x86/quantize_ssse3.h DSP_SRCS-$(HAVE_AVX) += x86/quantize_avx.c DSP_SRCS-$(HAVE_NEON) += arm/quantize_neon.c DSP_SRCS-$(HAVE_VSX) += ppc/quantize_vsx.c +ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) +DSP_SRCS-$(HAVE_SSE2) += x86/highbd_quantize_intrin_sse2.c +endif # avg DSP_SRCS-yes += avg.c diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 73f28ff927..83dbcfdda2 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -714,6 +714,14 @@ () add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vpx_quantize_b_32x32 neon ssse3 avx vsx/; + + if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { + add_proto qw/void vpx_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + specialize qw/vpx_highbd_quantize_b sse2/; + + add_proto qw/void vpx_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + specialize qw/vpx_highbd_quantize_b_32x32 sse2/; + } # CONFIG_VP9_HIGHBITDEPTH } # CONFIG_VP9_ENCODER if (vpx_config("CONFIG_ENCODERS") eq "yes") { diff --git a/vpx_dsp/x86/highbd_quantize_intrin_sse2.c b/vpx_dsp/x86/highbd_quantize_intrin_sse2.c new file mode 100644 index 0000000000..4535a0f7a2 --- /dev/null +++ b/vpx_dsp/x86/highbd_quantize_intrin_sse2.c @@ -0,0 +1,152 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_mem/vpx_mem.h" +#include "vpx_ports/mem.h" + +#if CONFIG_VP9_HIGHBITDEPTH +void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count, + const int16_t *zbin_ptr, + const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + int i, j, non_zero_regs = (int)count / 4, eob_i = -1; + __m128i zbins[2]; + __m128i nzbins[2]; + + zbins[0] = _mm_set_epi32((int)zbin_ptr[1], (int)zbin_ptr[1], (int)zbin_ptr[1], + (int)zbin_ptr[0]); + zbins[1] = _mm_set1_epi32((int)zbin_ptr[1]); + + nzbins[0] = _mm_setzero_si128(); + nzbins[1] = _mm_setzero_si128(); + nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]); + nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]); + + (void)scan; + + memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr)); + + // Pre-scan pass + for (i = ((int)count / 4) - 1; i >= 0; i--) { + __m128i coeffs, cmp1, cmp2; + int test; + coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); + cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]); + cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]); + cmp1 = _mm_and_si128(cmp1, cmp2); + test = _mm_movemask_epi8(cmp1); + if (test == 0xffff) + non_zero_regs--; + else + break; + } + + // Quantization pass: + for (i = 0; i < non_zero_regs; i++) { + __m128i coeffs, coeffs_sign, tmp1, tmp2; + int test; + int abs_coeff[4]; + int coeff_sign[4]; + + coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); + coeffs_sign = _mm_srai_epi32(coeffs, 31); + coeffs = _mm_sub_epi32(_mm_xor_si128(coeffs, coeffs_sign), coeffs_sign); + tmp1 = _mm_cmpgt_epi32(coeffs, zbins[i != 0]); + tmp2 = _mm_cmpeq_epi32(coeffs, zbins[i != 0]); + tmp1 = _mm_or_si128(tmp1, tmp2); + test = _mm_movemask_epi8(tmp1); + _mm_storeu_si128((__m128i *)abs_coeff, coeffs); + _mm_storeu_si128((__m128i *)coeff_sign, coeffs_sign); + + for (j = 0; j < 4; j++) { + if (test & (1 << (4 * j))) { + int k = 4 * i + j; + const int64_t tmp3 = abs_coeff[j] + round_ptr[k != 0]; + const int64_t tmp4 = ((tmp3 * quant_ptr[k != 0]) >> 16) + tmp3; + const uint32_t abs_qcoeff = + (uint32_t)((tmp4 * quant_shift_ptr[k != 0]) >> 16); + qcoeff_ptr[k] = (int)(abs_qcoeff ^ coeff_sign[j]) - coeff_sign[j]; + dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0]; + if (abs_qcoeff) eob_i = iscan[k] > eob_i ? iscan[k] : eob_i; + } + } + } + *eob_ptr = eob_i + 1; +} + +void vpx_highbd_quantize_b_32x32_sse2( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + __m128i zbins[2]; + __m128i nzbins[2]; + int idx = 0; + int idx_arr[1024]; + int i, eob = -1; + const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 1); + const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 1); + (void)scan; + + zbins[0] = _mm_set_epi32(zbin1_tmp, zbin1_tmp, zbin1_tmp, zbin0_tmp); + zbins[1] = _mm_set1_epi32(zbin1_tmp); + + nzbins[0] = _mm_setzero_si128(); + nzbins[1] = _mm_setzero_si128(); + nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]); + nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]); + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + // Pre-scan pass + for (i = 0; i < n_coeffs / 4; i++) { + __m128i coeffs, cmp1, cmp2; + int test; + coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); + cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]); + cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]); + cmp1 = _mm_and_si128(cmp1, cmp2); + test = _mm_movemask_epi8(cmp1); + if (!(test & 0xf)) idx_arr[idx++] = i * 4; + if (!(test & 0xf0)) idx_arr[idx++] = i * 4 + 1; + if (!(test & 0xf00)) idx_arr[idx++] = i * 4 + 2; + if (!(test & 0xf000)) idx_arr[idx++] = i * 4 + 3; + } + + // Quantization pass: only process the coefficients selected in + // pre-scan pass. Note: idx can be zero. + for (i = 0; i < idx; i++) { + const int rc = idx_arr[i]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int64_t tmp1 = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); + const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; + const uint32_t abs_qcoeff = + (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15); + qcoeff_ptr[rc] = (int)(abs_qcoeff ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; + if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob; + } + *eob_ptr = eob + 1; +} +#endif From d04f78b5635e84a9099f2b0105562b87ba75f2cd Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 11 Apr 2022 11:08:12 -0700 Subject: [PATCH 0317/1000] rate_hist,show_histogram: fix crash w/0 buckets this can occur if 0 frames are encoded, e.g., due to --skip see also: https://crbug.com/aomedia/3243 Change-Id: I791d5ad6611dbcb60d790e6b705298328ec48126 --- rate_hist.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/rate_hist.c b/rate_hist.c index d10e754fee..947950d481 100644 --- a/rate_hist.c +++ b/rate_hist.c @@ -196,7 +196,9 @@ static void show_histogram(const struct hist_bucket *bucket, int buckets, int width1, width2; int i; + if (!buckets) return; assert(bucket != NULL); + assert(buckets > 0); switch ((int)(log(bucket[buckets - 1].high) / log(10)) + 1) { case 1: From a3cd75e29bdc1ca9df81f944a6c873220509fda8 Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 11 Apr 2022 11:41:19 -0700 Subject: [PATCH 0318/1000] vpxdec: add some allocation checks see also: https://crbug.com/aomedia/3244 Change-Id: I7d151e63a91b8c1a5ee4e861f0b8461eeece6a2f --- vpxdec.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/vpxdec.c b/vpxdec.c index ad368a230b..363eb1a24b 100644 --- a/vpxdec.c +++ b/vpxdec.c @@ -815,6 +815,10 @@ static int main_loop(int argc, const char **argv_) { ext_fb_list.num_external_frame_buffers = num_external_frame_buffers; ext_fb_list.ext_fb = (struct ExternalFrameBuffer *)calloc( num_external_frame_buffers, sizeof(*ext_fb_list.ext_fb)); + if (!ext_fb_list.ext_fb) { + fprintf(stderr, "Failed to allocate ExternalFrameBuffer\n"); + goto fail; + } if (vpx_codec_set_frame_buffer_functions(&decoder, get_vp9_frame_buffer, release_vp9_frame_buffer, &ext_fb_list)) { @@ -930,6 +934,11 @@ static int main_loop(int argc, const char **argv_) { } scaled_img = vpx_img_alloc(NULL, img->fmt, render_width, render_height, 16); + if (!scaled_img) { + fprintf(stderr, "Failed to allocate scaled image (%d x %d)\n", + render_width, render_height); + goto fail; + } scaled_img->bit_depth = img->bit_depth; } @@ -966,6 +975,10 @@ static int main_loop(int argc, const char **argv_) { if (!img_shifted) { img_shifted = vpx_img_alloc(NULL, shifted_fmt, img->d_w, img->d_h, 16); + if (!img_shifted) { + fprintf(stderr, "Failed to allocate image\n"); + goto fail; + } img_shifted->bit_depth = output_bit_depth; } if (output_bit_depth > img->bit_depth) { From 3a3645dbdc7822f52f7b136861e4447ea8d551f9 Mon Sep 17 00:00:00 2001 From: yuanhecai Date: Tue, 15 Mar 2022 16:15:36 +0800 Subject: [PATCH 0319/1000] vp9[loongarch]: Optimize sad64x64/32x32/16x16 1. vpx_sad64x64x4d_lsx 2. vpx_sad32x32x4d_lsx 3. vpx_sad16x16x4d_lsx 4. vpx_sad64x64_lsx 5. vpx_sad32x32_lsx Bug: webm:1755 Change-Id: Ief71c2216f697b261d7c1fc481c89c9f1a6098e6 --- test/sad_test.cc | 18 ++ vpx_dsp/loongarch/sad_lsx.c | 378 +++++++++++++++++++++++++++++++++++ vpx_dsp/vpx_dsp.mk | 2 + vpx_dsp/vpx_dsp_rtcd_defs.pl | 10 +- 4 files changed, 403 insertions(+), 5 deletions(-) create mode 100644 vpx_dsp/loongarch/sad_lsx.c diff --git a/test/sad_test.cc b/test/sad_test.cc index 560c5f3823..aec4cbc380 100644 --- a/test/sad_test.cc +++ b/test/sad_test.cc @@ -1128,4 +1128,22 @@ const SadMxNx4Param x4d_mmi_tests[] = { }; INSTANTIATE_TEST_SUITE_P(MMI, SADx4Test, ::testing::ValuesIn(x4d_mmi_tests)); #endif // HAVE_MMI + +//------------------------------------------------------------------------------ +// loongarch functions +#if HAVE_LSX +const SadMxNParam lsx_tests[] = { + SadMxNParam(64, 64, &vpx_sad64x64_lsx), + SadMxNParam(32, 32, &vpx_sad32x32_lsx), +}; +INSTANTIATE_TEST_SUITE_P(LSX, SADTest, ::testing::ValuesIn(lsx_tests)); + +const SadMxNx4Param x4d_lsx_tests[] = { + SadMxNx4Param(64, 64, &vpx_sad64x64x4d_lsx), + SadMxNx4Param(32, 32, &vpx_sad32x32x4d_lsx), + SadMxNx4Param(16, 16, &vpx_sad16x16x4d_lsx), +}; +INSTANTIATE_TEST_SUITE_P(LSX, SADx4Test, ::testing::ValuesIn(x4d_lsx_tests)); +#endif // HAVE_LSX + } // namespace diff --git a/vpx_dsp/loongarch/sad_lsx.c b/vpx_dsp/loongarch/sad_lsx.c new file mode 100644 index 0000000000..59b268ca1f --- /dev/null +++ b/vpx_dsp/loongarch/sad_lsx.c @@ -0,0 +1,378 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_util/loongson_intrinsics.h" + +#define SAD_UB2_UH(in0, in1, ref0, ref1) \ + ({ \ + __m128i diff0_m, diff1_m, sad_m0; \ + __m128i sad_m = __lsx_vldi(0); \ + \ + diff0_m = __lsx_vabsd_bu(in0, ref0); \ + diff1_m = __lsx_vabsd_bu(in1, ref1); \ + \ + sad_m0 = __lsx_vhaddw_hu_bu(diff0_m, diff0_m); \ + sad_m = __lsx_vadd_h(sad_m, sad_m0); \ + sad_m0 = __lsx_vhaddw_hu_bu(diff1_m, diff1_m); \ + sad_m = __lsx_vadd_h(sad_m, sad_m0); \ + \ + sad_m; \ + }) + +#define HADD_UW_U32(in) \ + ({ \ + __m128i res0_m; \ + uint32_t sum_m; \ + res0_m = __lsx_vhaddw_du_wu(in, in); \ + res0_m = __lsx_vhaddw_qu_du(res0_m, res0_m); \ + sum_m = __lsx_vpickve2gr_w(res0_m, 0); \ + sum_m; \ + }) + +#define HADD_UH_U32(in) \ + ({ \ + __m128i res_m; \ + uint32_t sum_m; \ + res_m = __lsx_vhaddw_wu_hu(in, in); \ + sum_m = HADD_UW_U32(res_m); \ + sum_m; \ + }) + +static uint32_t sad_32width_lsx(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height) { + int32_t ht_cnt = (height >> 2); + __m128i src0, src1, ref0, ref1; + __m128i sad_tmp; + __m128i sad = __lsx_vldi(0); + + for (; ht_cnt--;) { + DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1); + src += src_stride; + DUP2_ARG2(__lsx_vld, ref, 0, ref, 16, ref0, ref1); + ref += ref_stride; + sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad = __lsx_vadd_h(sad, sad_tmp); + + DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1); + src += src_stride; + DUP2_ARG2(__lsx_vld, ref, 0, ref, 16, ref0, ref1); + ref += ref_stride; + sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad = __lsx_vadd_h(sad, sad_tmp); + + DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1); + src += src_stride; + DUP2_ARG2(__lsx_vld, ref, 0, ref, 16, ref0, ref1); + ref += ref_stride; + sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad = __lsx_vadd_h(sad, sad_tmp); + + DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1); + src += src_stride; + DUP2_ARG2(__lsx_vld, ref, 0, ref, 16, ref0, ref1); + ref += ref_stride; + sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad = __lsx_vadd_h(sad, sad_tmp); + } + return HADD_UH_U32(sad); +} + +static uint32_t sad_64width_lsx(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height) { + int32_t ht_cnt = (height >> 1); + uint32_t sad = 0; + __m128i src0, src1, src2, src3; + __m128i ref0, ref1, ref2, ref3; + __m128i sad_tmp; + __m128i sad0 = __lsx_vldi(0); + __m128i sad1 = sad0; + + for (; ht_cnt--;) { + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2, + src3); + src += src_stride; + DUP4_ARG2(__lsx_vld, ref, 0, ref, 16, ref, 32, ref, 48, ref0, ref1, ref2, + ref3); + ref += ref_stride; + sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad0 = __lsx_vadd_h(sad0, sad_tmp); + sad_tmp = SAD_UB2_UH(src2, src3, ref2, ref3); + sad1 = __lsx_vadd_h(sad1, sad_tmp); + + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2, + src3); + src += src_stride; + DUP4_ARG2(__lsx_vld, ref, 0, ref, 16, ref, 32, ref, 48, ref0, ref1, ref2, + ref3); + ref += ref_stride; + sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad0 = __lsx_vadd_h(sad0, sad_tmp); + sad_tmp = SAD_UB2_UH(src2, src3, ref2, ref3); + sad1 = __lsx_vadd_h(sad1, sad_tmp); + } + + sad = HADD_UH_U32(sad0); + sad += HADD_UH_U32(sad1); + + return sad; +} + +static void sad_16width_x4d_lsx(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *const aref_ptr[], + int32_t ref_stride, int32_t height, + uint32_t *sad_array) { + int32_t ht_cnt = (height >> 1); + const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr; + __m128i src, ref0, ref1, ref2, ref3, diff, sad_tmp; + __m128i sad0 = __lsx_vldi(0); + __m128i sad1 = sad0; + __m128i sad2 = sad0; + __m128i sad3 = sad0; + + ref0_ptr = aref_ptr[0]; + ref1_ptr = aref_ptr[1]; + ref2_ptr = aref_ptr[2]; + ref3_ptr = aref_ptr[3]; + + for (; ht_cnt--;) { + src = __lsx_vld(src_ptr, 0); + src_ptr += src_stride; + ref0 = __lsx_vld(ref0_ptr, 0); + ref0_ptr += ref_stride; + ref1 = __lsx_vld(ref1_ptr, 0); + ref1_ptr += ref_stride; + ref2 = __lsx_vld(ref2_ptr, 0); + ref2_ptr += ref_stride; + ref3 = __lsx_vld(ref3_ptr, 0); + ref3_ptr += ref_stride; + + diff = __lsx_vabsd_bu(src, ref0); + sad_tmp = __lsx_vhaddw_hu_bu(diff, diff); + sad0 = __lsx_vadd_h(sad0, sad_tmp); + diff = __lsx_vabsd_bu(src, ref1); + sad_tmp = __lsx_vhaddw_hu_bu(diff, diff); + sad1 = __lsx_vadd_h(sad1, sad_tmp); + diff = __lsx_vabsd_bu(src, ref2); + sad_tmp = __lsx_vhaddw_hu_bu(diff, diff); + sad2 = __lsx_vadd_h(sad2, sad_tmp); + diff = __lsx_vabsd_bu(src, ref3); + sad_tmp = __lsx_vhaddw_hu_bu(diff, diff); + sad3 = __lsx_vadd_h(sad3, sad_tmp); + + src = __lsx_vld(src_ptr, 0); + src_ptr += src_stride; + ref0 = __lsx_vld(ref0_ptr, 0); + ref0_ptr += ref_stride; + ref1 = __lsx_vld(ref1_ptr, 0); + ref1_ptr += ref_stride; + ref2 = __lsx_vld(ref2_ptr, 0); + ref2_ptr += ref_stride; + ref3 = __lsx_vld(ref3_ptr, 0); + ref3_ptr += ref_stride; + + diff = __lsx_vabsd_bu(src, ref0); + sad_tmp = __lsx_vhaddw_hu_bu(diff, diff); + sad0 = __lsx_vadd_h(sad0, sad_tmp); + diff = __lsx_vabsd_bu(src, ref1); + sad_tmp = __lsx_vhaddw_hu_bu(diff, diff); + sad1 = __lsx_vadd_h(sad1, sad_tmp); + diff = __lsx_vabsd_bu(src, ref2); + sad_tmp = __lsx_vhaddw_hu_bu(diff, diff); + sad2 = __lsx_vadd_h(sad2, sad_tmp); + diff = __lsx_vabsd_bu(src, ref3); + sad_tmp = __lsx_vhaddw_hu_bu(diff, diff); + sad3 = __lsx_vadd_h(sad3, sad_tmp); + } + sad_array[0] = HADD_UH_U32(sad0); + sad_array[1] = HADD_UH_U32(sad1); + sad_array[2] = HADD_UH_U32(sad2); + sad_array[3] = HADD_UH_U32(sad3); +} + +static void sad_32width_x4d_lsx(const uint8_t *src, int32_t src_stride, + const uint8_t *const aref_ptr[], + int32_t ref_stride, int32_t height, + uint32_t *sad_array) { + const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr; + int32_t ht_cnt = height; + __m128i src0, src1, ref0, ref1, sad_tmp; + __m128i sad0 = __lsx_vldi(0); + __m128i sad1 = sad0; + __m128i sad2 = sad0; + __m128i sad3 = sad0; + + ref0_ptr = aref_ptr[0]; + ref1_ptr = aref_ptr[1]; + ref2_ptr = aref_ptr[2]; + ref3_ptr = aref_ptr[3]; + + for (; ht_cnt--;) { + DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1); + src += src_stride; + + DUP2_ARG2(__lsx_vld, ref0_ptr, 0, ref0_ptr, 16, ref0, ref1); + ref0_ptr += ref_stride; + sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad0 = __lsx_vadd_h(sad0, sad_tmp); + + DUP2_ARG2(__lsx_vld, ref1_ptr, 0, ref1_ptr, 16, ref0, ref1); + ref1_ptr += ref_stride; + sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad1 = __lsx_vadd_h(sad1, sad_tmp); + + DUP2_ARG2(__lsx_vld, ref2_ptr, 0, ref2_ptr, 16, ref0, ref1); + ref2_ptr += ref_stride; + sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad2 = __lsx_vadd_h(sad2, sad_tmp); + + DUP2_ARG2(__lsx_vld, ref3_ptr, 0, ref3_ptr, 16, ref0, ref1); + ref3_ptr += ref_stride; + sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad3 = __lsx_vadd_h(sad3, sad_tmp); + } + sad_array[0] = HADD_UH_U32(sad0); + sad_array[1] = HADD_UH_U32(sad1); + sad_array[2] = HADD_UH_U32(sad2); + sad_array[3] = HADD_UH_U32(sad3); +} + +static void sad_64width_x4d_lsx(const uint8_t *src, int32_t src_stride, + const uint8_t *const aref_ptr[], + int32_t ref_stride, int32_t height, + uint32_t *sad_array) { + const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr; + int32_t ht_cnt = height; + __m128i src0, src1, src2, src3; + __m128i ref0, ref1, ref2, ref3; + __m128i sad, sad_tmp; + + __m128i sad0_0 = __lsx_vldi(0); + __m128i sad0_1 = sad0_0; + __m128i sad1_0 = sad0_0; + __m128i sad1_1 = sad0_0; + __m128i sad2_0 = sad0_0; + __m128i sad2_1 = sad0_0; + __m128i sad3_0 = sad0_0; + __m128i sad3_1 = sad0_0; + + ref0_ptr = aref_ptr[0]; + ref1_ptr = aref_ptr[1]; + ref2_ptr = aref_ptr[2]; + ref3_ptr = aref_ptr[3]; + + for (; ht_cnt--;) { + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2, + src3); + src += src_stride; + + DUP4_ARG2(__lsx_vld, ref0_ptr, 0, ref0_ptr, 16, ref0_ptr, 32, ref0_ptr, 48, + ref0, ref1, ref2, ref3); + ref0_ptr += ref_stride; + sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad0_0 = __lsx_vadd_h(sad0_0, sad_tmp); + sad_tmp = SAD_UB2_UH(src2, src3, ref2, ref3); + sad0_1 = __lsx_vadd_h(sad0_1, sad_tmp); + + DUP4_ARG2(__lsx_vld, ref1_ptr, 0, ref1_ptr, 16, ref1_ptr, 32, ref1_ptr, 48, + ref0, ref1, ref2, ref3); + ref1_ptr += ref_stride; + sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad1_0 = __lsx_vadd_h(sad1_0, sad_tmp); + sad_tmp = SAD_UB2_UH(src2, src3, ref2, ref3); + sad1_1 = __lsx_vadd_h(sad1_1, sad_tmp); + + DUP4_ARG2(__lsx_vld, ref2_ptr, 0, ref2_ptr, 16, ref2_ptr, 32, ref2_ptr, 48, + ref0, ref1, ref2, ref3); + ref2_ptr += ref_stride; + sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad2_0 = __lsx_vadd_h(sad2_0, sad_tmp); + sad_tmp = SAD_UB2_UH(src2, src3, ref2, ref3); + sad2_1 = __lsx_vadd_h(sad2_1, sad_tmp); + + DUP4_ARG2(__lsx_vld, ref3_ptr, 0, ref3_ptr, 16, ref3_ptr, 32, ref3_ptr, 48, + ref0, ref1, ref2, ref3); + ref3_ptr += ref_stride; + sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad3_0 = __lsx_vadd_h(sad3_0, sad_tmp); + sad_tmp = SAD_UB2_UH(src2, src3, ref2, ref3); + sad3_1 = __lsx_vadd_h(sad3_1, sad_tmp); + } + sad = __lsx_vhaddw_wu_hu(sad0_0, sad0_0); + sad_tmp = __lsx_vhaddw_wu_hu(sad0_1, sad0_1); + sad = __lsx_vadd_w(sad, sad_tmp); + sad_array[0] = HADD_UW_U32(sad); + + sad = __lsx_vhaddw_wu_hu(sad1_0, sad1_0); + sad_tmp = __lsx_vhaddw_wu_hu(sad1_1, sad1_1); + sad = __lsx_vadd_w(sad, sad_tmp); + sad_array[1] = HADD_UW_U32(sad); + + sad = __lsx_vhaddw_wu_hu(sad2_0, sad2_0); + sad_tmp = __lsx_vhaddw_wu_hu(sad2_1, sad2_1); + sad = __lsx_vadd_w(sad, sad_tmp); + sad_array[2] = HADD_UW_U32(sad); + + sad = __lsx_vhaddw_wu_hu(sad3_0, sad3_0); + sad_tmp = __lsx_vhaddw_wu_hu(sad3_1, sad3_1); + sad = __lsx_vadd_w(sad, sad_tmp); + sad_array[3] = HADD_UW_U32(sad); +} + +#define VPX_SAD_32xHT_LSX(height) \ + uint32_t vpx_sad32x##height##_lsx(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride) { \ + return sad_32width_lsx(src, src_stride, ref, ref_stride, height); \ + } + +#define VPX_SAD_64xHT_LSX(height) \ + uint32_t vpx_sad64x##height##_lsx(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride) { \ + return sad_64width_lsx(src, src_stride, ref, ref_stride, height); \ + } + +#define VPX_SAD_16xHTx4D_LSX(height) \ + void vpx_sad16x##height##x4d_lsx(const uint8_t *src, int32_t src_stride, \ + const uint8_t *const refs[], \ + int32_t ref_stride, uint32_t *sads) { \ + sad_16width_x4d_lsx(src, src_stride, refs, ref_stride, height, sads); \ + } + +#define VPX_SAD_32xHTx4D_LSX(height) \ + void vpx_sad32x##height##x4d_lsx(const uint8_t *src, int32_t src_stride, \ + const uint8_t *const refs[], \ + int32_t ref_stride, uint32_t *sads) { \ + sad_32width_x4d_lsx(src, src_stride, refs, ref_stride, height, sads); \ + } + +#define VPX_SAD_64xHTx4D_LSX(height) \ + void vpx_sad64x##height##x4d_lsx(const uint8_t *src, int32_t src_stride, \ + const uint8_t *const refs[], \ + int32_t ref_stride, uint32_t *sads) { \ + sad_64width_x4d_lsx(src, src_stride, refs, ref_stride, height, sads); \ + } + +#define SAD64 VPX_SAD_64xHT_LSX(64) VPX_SAD_64xHTx4D_LSX(64) + +SAD64 + +#define SAD32 VPX_SAD_32xHT_LSX(32) VPX_SAD_32xHTx4D_LSX(32) + +SAD32 + +#define SAD16 VPX_SAD_16xHTx4D_LSX(16) + +SAD16 + +#undef SAD64 +#undef SAD32 +#undef SAD16 diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index 62fed9e2ee..2289f6f225 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -356,6 +356,8 @@ DSP_SRCS-$(HAVE_NEON) += arm/subtract_neon.c DSP_SRCS-$(HAVE_MSA) += mips/sad_msa.c DSP_SRCS-$(HAVE_MSA) += mips/subtract_msa.c +DSP_SRCS-$(HAVE_LSX) += loongarch/sad_lsx.c + DSP_SRCS-$(HAVE_MMI) += mips/sad_mmi.c DSP_SRCS-$(HAVE_MMI) += mips/subtract_mmi.c diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index af9be16a77..7513af5f77 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -735,7 +735,7 @@ () # Single block SAD # add_proto qw/unsigned int vpx_sad64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad64x64 neon avx2 msa sse2 vsx mmi/; +specialize qw/vpx_sad64x64 neon avx2 msa sse2 vsx mmi lsx/; add_proto qw/unsigned int vpx_sad64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; specialize qw/vpx_sad64x32 neon avx2 msa sse2 vsx mmi/; @@ -744,7 +744,7 @@ () specialize qw/vpx_sad32x64 neon avx2 msa sse2 vsx mmi/; add_proto qw/unsigned int vpx_sad32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad32x32 neon avx2 msa sse2 vsx mmi/; +specialize qw/vpx_sad32x32 neon avx2 msa sse2 vsx mmi lsx/; add_proto qw/unsigned int vpx_sad32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; specialize qw/vpx_sad32x16 neon avx2 msa sse2 vsx mmi/; @@ -877,7 +877,7 @@ () # Multi-block SAD, comparing a reference to N independent blocks # add_proto qw/void vpx_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; -specialize qw/vpx_sad64x64x4d avx512 avx2 neon msa sse2 vsx mmi/; +specialize qw/vpx_sad64x64x4d avx512 avx2 neon msa sse2 vsx mmi lsx/; add_proto qw/void vpx_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad64x32x4d neon msa sse2 vsx mmi/; @@ -886,7 +886,7 @@ () specialize qw/vpx_sad32x64x4d neon msa sse2 vsx mmi/; add_proto qw/void vpx_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; -specialize qw/vpx_sad32x32x4d avx2 neon msa sse2 vsx mmi/; +specialize qw/vpx_sad32x32x4d avx2 neon msa sse2 vsx mmi lsx/; add_proto qw/void vpx_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad32x16x4d neon msa sse2 vsx mmi/; @@ -895,7 +895,7 @@ () specialize qw/vpx_sad16x32x4d neon msa sse2 vsx mmi/; add_proto qw/void vpx_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; -specialize qw/vpx_sad16x16x4d neon msa sse2 vsx mmi/; +specialize qw/vpx_sad16x16x4d neon msa sse2 vsx mmi lsx/; add_proto qw/void vpx_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad16x8x4d neon msa sse2 vsx mmi/; From caf65c14a82e8d66af9d016738d210b0b307d7eb Mon Sep 17 00:00:00 2001 From: yuanhecai Date: Sat, 19 Mar 2022 10:44:27 +0800 Subject: [PATCH 0320/1000] vp9[loongarch]: Optimize vpx_variance64x64/32x32 1. vpx_variance64x64_lsx 2. vpx_variance32x32_lsx Bug: webm:1755 Change-Id: I45c5aa94cbbf7128473894a990d931acaa40e102 --- test/variance_test.cc | 7 ++ vpx_dsp/loongarch/variance_lsx.c | 161 +++++++++++++++++++++++++++++++ vpx_dsp/vpx_dsp.mk | 2 + vpx_dsp/vpx_dsp_rtcd_defs.pl | 4 +- 4 files changed, 172 insertions(+), 2 deletions(-) create mode 100644 vpx_dsp/loongarch/variance_lsx.c diff --git a/test/variance_test.cc b/test/variance_test.cc index 660bbd0ed7..8060875197 100644 --- a/test/variance_test.cc +++ b/test/variance_test.cc @@ -1649,4 +1649,11 @@ INSTANTIATE_TEST_SUITE_P( SubpelAvgVarianceParams(2, 3, &vpx_sub_pixel_avg_variance4x8_mmi, 0), SubpelAvgVarianceParams(2, 2, &vpx_sub_pixel_avg_variance4x4_mmi, 0))); #endif // HAVE_MMI + +#if HAVE_LSX +INSTANTIATE_TEST_SUITE_P( + LSX, VpxVarianceTest, + ::testing::Values(VarianceParams(6, 6, &vpx_variance64x64_lsx), + VarianceParams(5, 5, &vpx_variance32x32_lsx))); +#endif } // namespace diff --git a/vpx_dsp/loongarch/variance_lsx.c b/vpx_dsp/loongarch/variance_lsx.c new file mode 100644 index 0000000000..8164e98189 --- /dev/null +++ b/vpx_dsp/loongarch/variance_lsx.c @@ -0,0 +1,161 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_util/loongson_intrinsics.h" + +#define HADD_SW_S32(in) \ + ({ \ + __m128i res0_m; \ + int32_t sum_m; \ + \ + res0_m = __lsx_vhaddw_d_w(in, in); \ + res0_m = __lsx_vhaddw_q_d(res0_m, res0_m); \ + sum_m = __lsx_vpickve2gr_w(res0_m, 0); \ + sum_m; \ + }) + +#define CALC_MSE_AVG_B(src, ref, var, sub) \ + { \ + __m128i src_l0_m, src_l1_m; \ + __m128i res_l0_m, res_l1_m; \ + \ + src_l0_m = __lsx_vilvl_b(src, ref); \ + src_l1_m = __lsx_vilvh_b(src, ref); \ + DUP2_ARG2(__lsx_vhsubw_hu_bu, src_l0_m, src_l0_m, src_l1_m, src_l1_m, \ + res_l0_m, res_l1_m); \ + var = __lsx_vdp2add_w_h(var, res_l0_m, res_l0_m); \ + var = __lsx_vdp2add_w_h(var, res_l1_m, res_l1_m); \ + sub = __lsx_vadd_h(sub, res_l0_m); \ + sub = __lsx_vadd_h(sub, res_l1_m); \ + } + +#define VARIANCE_LARGE_WxH(sse, diff, shift) \ + (sse) - (((int64_t)(diff) * (diff)) >> (shift)) + +static uint32_t sse_diff_32width_lsx(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height, int32_t *diff) { + int32_t ht_cnt = (height >> 2); + __m128i avg = __lsx_vldi(0); + __m128i src0, src1, ref0, ref1; + __m128i vec; + __m128i var = avg; + + for (; ht_cnt--;) { + DUP2_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src0, src1); + src_ptr += src_stride; + DUP2_ARG2(__lsx_vld, ref_ptr, 0, ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + + DUP2_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src0, src1); + src_ptr += src_stride; + DUP2_ARG2(__lsx_vld, ref_ptr, 0, ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + + DUP2_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src0, src1); + src_ptr += src_stride; + DUP2_ARG2(__lsx_vld, ref_ptr, 0, ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + + DUP2_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src0, src1); + src_ptr += src_stride; + DUP2_ARG2(__lsx_vld, ref_ptr, 0, ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + } + + vec = __lsx_vhaddw_w_h(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sse_diff_64x64_lsx(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t *diff) { + int32_t ht_cnt = 32; + __m128i avg0 = __lsx_vldi(0); + __m128i src0, src1, src2, src3; + __m128i ref0, ref1, ref2, ref3; + __m128i vec0, vec1; + __m128i avg1 = avg0; + __m128i avg2 = avg0; + __m128i avg3 = avg0; + __m128i var = avg0; + + for (; ht_cnt--;) { + DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48, + src0, src1, src2, src3); + src_ptr += src_stride; + DUP4_ARG2(__lsx_vld, ref_ptr, 0, ref_ptr, 16, ref_ptr, 32, ref_ptr, 48, + ref0, ref1, ref2, ref3); + ref_ptr += ref_stride; + + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + CALC_MSE_AVG_B(src2, ref2, var, avg2); + CALC_MSE_AVG_B(src3, ref3, var, avg3); + DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48, + src0, src1, src2, src3); + src_ptr += src_stride; + DUP4_ARG2(__lsx_vld, ref_ptr, 0, ref_ptr, 16, ref_ptr, 32, ref_ptr, 48, + ref0, ref1, ref2, ref3); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + CALC_MSE_AVG_B(src2, ref2, var, avg2); + CALC_MSE_AVG_B(src3, ref3, var, avg3); + } + vec0 = __lsx_vhaddw_w_h(avg0, avg0); + vec1 = __lsx_vhaddw_w_h(avg1, avg1); + vec0 = __lsx_vadd_w(vec0, vec1); + vec1 = __lsx_vhaddw_w_h(avg2, avg2); + vec0 = __lsx_vadd_w(vec0, vec1); + vec1 = __lsx_vhaddw_w_h(avg3, avg3); + vec0 = __lsx_vadd_w(vec0, vec1); + *diff = HADD_SW_S32(vec0); + + return HADD_SW_S32(var); +} + +#define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10); +#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12); + +#define VPX_VARIANCE_WDXHT_LSX(wd, ht) \ + uint32_t vpx_variance##wd##x##ht##_lsx( \ + const uint8_t *src, int32_t src_stride, const uint8_t *ref, \ + int32_t ref_stride, uint32_t *sse) { \ + int32_t diff; \ + \ + *sse = \ + sse_diff_##wd##width_lsx(src, src_stride, ref, ref_stride, ht, &diff); \ + \ + return VARIANCE_##wd##Wx##ht##H(*sse, diff); \ + } + +VPX_VARIANCE_WDXHT_LSX(32, 32) + +uint32_t vpx_variance64x64_lsx(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + uint32_t *sse) { + int32_t diff; + + *sse = sse_diff_64x64_lsx(src, src_stride, ref, ref_stride, &diff); + + return VARIANCE_64Wx64H(*sse, diff); +} diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index 2289f6f225..198e0060f7 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -390,6 +390,8 @@ DSP_SRCS-$(HAVE_NEON) += arm/variance_neon.c DSP_SRCS-$(HAVE_MSA) += mips/variance_msa.c DSP_SRCS-$(HAVE_MSA) += mips/sub_pixel_variance_msa.c +DSP_SRCS-$(HAVE_LSX) += loongarch/variance_lsx.c + DSP_SRCS-$(HAVE_MMI) += mips/variance_mmi.c DSP_SRCS-$(HAVE_SSE2) += x86/avg_pred_sse2.c diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 7513af5f77..bcc1b916cc 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -1080,7 +1080,7 @@ () # Variance # add_proto qw/unsigned int vpx_variance64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance64x64 sse2 avx2 neon msa mmi vsx/; + specialize qw/vpx_variance64x64 sse2 avx2 neon msa mmi vsx lsx/; add_proto qw/unsigned int vpx_variance64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_variance64x32 sse2 avx2 neon msa mmi vsx/; @@ -1089,7 +1089,7 @@ () specialize qw/vpx_variance32x64 sse2 avx2 neon msa mmi vsx/; add_proto qw/unsigned int vpx_variance32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance32x32 sse2 avx2 neon msa mmi vsx/; + specialize qw/vpx_variance32x32 sse2 avx2 neon msa mmi vsx lsx/; add_proto qw/unsigned int vpx_variance32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_variance32x16 sse2 avx2 neon msa mmi vsx/; From d387c89e86de35fe3f12b5c9db2919bc82f90157 Mon Sep 17 00:00:00 2001 From: yuanhecai Date: Thu, 31 Mar 2022 15:01:30 +0800 Subject: [PATCH 0321/1000] Update loongson_intrinsics.h from v1.0.5 to v1.2.1 Bug: webm:1755 Change-Id: Ib636d2aa521332b76b6aa1b0aa0a9005aafbf32b --- vpx_util/loongson_intrinsics.h | 259 ++++++++++++++++++++++++++++++--- 1 file changed, 240 insertions(+), 19 deletions(-) diff --git a/vpx_util/loongson_intrinsics.h b/vpx_util/loongson_intrinsics.h index a34b6e8b44..b8b9e6db02 100644 --- a/vpx_util/loongson_intrinsics.h +++ b/vpx_util/loongson_intrinsics.h @@ -39,8 +39,8 @@ * MICRO version: Comment changes or implementation changes. */ #define LSOM_VERSION_MAJOR 1 -#define LSOM_VERSION_MINOR 0 -#define LSOM_VERSION_MICRO 5 +#define LSOM_VERSION_MINOR 2 +#define LSOM_VERSION_MICRO 1 #define DUP2_ARG1(_INS, _IN0, _IN1, _OUT0, _OUT1) \ { \ @@ -90,8 +90,8 @@ * Return Type - halfword * Details : Signed byte elements from in_h are multiplied by * signed byte elements from in_l, and then added adjacent to - * each other to get results with the twice size of input. - * Then the results plus to signed half-word elements from in_c. + * each other to get a result twice the size of input. Then + * the results are added to signed half-word elements from in_c. * Example : out = __lsx_vdp2add_h_b(in_c, in_h, in_l) * in_c : 1,2,3,4, 1,2,3,4 * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 @@ -116,9 +116,9 @@ static inline __m128i __lsx_vdp2add_h_b(__m128i in_c, __m128i in_h, * Return Type - halfword * Details : Unsigned byte elements from in_h are multiplied by * unsigned byte elements from in_l, and then added adjacent to - * each other to get results with the twice size of input. - * The results plus to signed half-word elements from in_c. - * Example : out = __lsx_vdp2add_h_b(in_c, in_h, in_l) + * each other to get a result twice the size of input. + * The results are added to signed half-word elements from in_c. + * Example : out = __lsx_vdp2add_h_bu(in_c, in_h, in_l) * in_c : 1,2,3,4, 1,2,3,4 * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1 @@ -134,6 +134,32 @@ static inline __m128i __lsx_vdp2add_h_bu(__m128i in_c, __m128i in_h, return out; } +/* + * ============================================================================= + * Description : Dot product & addition of byte vector elements + * Arguments : Inputs - in_c, in_h, in_l + * Outputs - out + * Return Type - halfword + * Details : Unsigned byte elements from in_h are multiplied by + * signed byte elements from in_l, and then added adjacent to + * each other to get a result twice the size of input. + * The results are added to signed half-word elements from in_c. + * Example : out = __lsx_vdp2add_h_bu_b(in_c, in_h, in_l) + * in_c : 1,1,1,1, 1,1,1,1 + * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 + * in_l : -1,-2,-3,-4, -5,-6,-7,-8, 1,2,3,4, 5,6,7,8 + * out : -4,-24,-60,-112, 6,26,62,114 + * ============================================================================= + */ +static inline __m128i __lsx_vdp2add_h_bu_b(__m128i in_c, __m128i in_h, + __m128i in_l) { + __m128i out; + + out = __lsx_vmaddwev_h_bu_b(in_c, in_h, in_l); + out = __lsx_vmaddwod_h_bu_b(out, in_h, in_l); + return out; +} + /* * ============================================================================= * Description : Dot product & addition of half-word vector elements @@ -142,8 +168,8 @@ static inline __m128i __lsx_vdp2add_h_bu(__m128i in_c, __m128i in_h, * Return Type - __m128i * Details : Signed half-word elements from in_h are multiplied by * signed half-word elements from in_l, and then added adjacent to - * each other to get results with the twice size of input. - * Then the results plus to signed word elements from in_c. + * each other to get a result twice the size of input. + * Then the results are added to signed word elements from in_c. * Example : out = __lsx_vdp2add_h_b(in_c, in_h, in_l) * in_c : 1,2,3,4 * in_h : 1,2,3,4, 5,6,7,8 @@ -168,7 +194,7 @@ static inline __m128i __lsx_vdp2add_w_h(__m128i in_c, __m128i in_h, * Return Type - halfword * Details : Signed byte elements from in_h are multiplied by * signed byte elements from in_l, and then added adjacent to - * each other to get results with the twice size of input. + * each other to get a result twice the size of input. * Example : out = __lsx_vdp2_h_b(in_h, in_l) * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1 @@ -191,7 +217,7 @@ static inline __m128i __lsx_vdp2_h_b(__m128i in_h, __m128i in_l) { * Return Type - halfword * Details : Unsigned byte elements from in_h are multiplied by * unsigned byte elements from in_l, and then added adjacent to - * each other to get results with the twice size of input. + * each other to get a result twice the size of input. * Example : out = __lsx_vdp2_h_bu(in_h, in_l) * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1 @@ -214,7 +240,7 @@ static inline __m128i __lsx_vdp2_h_bu(__m128i in_h, __m128i in_l) { * Return Type - halfword * Details : Unsigned byte elements from in_h are multiplied by * signed byte elements from in_l, and then added adjacent to - * each other to get results with the twice size of input. + * each other to get a result twice the size of input. * Example : out = __lsx_vdp2_h_bu_b(in_h, in_l) * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,-1 @@ -237,7 +263,7 @@ static inline __m128i __lsx_vdp2_h_bu_b(__m128i in_h, __m128i in_l) { * Return Type - halfword * Details : Signed byte elements from in_h are multiplied by * signed byte elements from in_l, and then added adjacent to - * each other to get results with the twice size of input. + * each other to get a result twice the size of input. * Example : out = __lsx_vdp2_w_h(in_h, in_l) * in_h : 1,2,3,4, 5,6,7,8 * in_l : 8,7,6,5, 4,3,2,1 @@ -252,6 +278,29 @@ static inline __m128i __lsx_vdp2_w_h(__m128i in_h, __m128i in_l) { return out; } +/* + * ============================================================================= + * Description : Dot product of byte vector elements + * Arguments : Inputs - in_h, in_l + * Outputs - out + * Return Type - double + * Details : Signed byte elements from in_h are multiplied by + * signed byte elements from in_l, and then added adjacent to + * each other to get a result twice the size of input. + * Example : out = __lsx_vdp2_d_w(in_h, in_l) + * in_h : 1,2,3,4 + * in_l : 8,7,6,5 + * out : 22,38 + * ============================================================================= + */ +static inline __m128i __lsx_vdp2_d_w(__m128i in_h, __m128i in_l) { + __m128i out; + + out = __lsx_vmulwev_d_w(in_h, in_l); + out = __lsx_vmaddwod_d_w(out, in_h, in_l); + return out; +} + /* * ============================================================================= * Description : Clip all halfword elements of input vector between min & max @@ -679,6 +728,132 @@ static inline __m128i __lsx_vclip255_w(__m128i _in) { _out7 = __lsx_vsub_d(_in0, _in7); \ } +/* + * ============================================================================= + * Description : Butterfly of 16 input vectors + * Arguments : Inputs - _in0, _in1, _in2, _in3, ~ + * Outputs - _out0, _out1, _out2, _out3, ~ + * Details : Butterfly operation + * Example : + * _out0 = _in0 + _in15; + * _out1 = _in1 + _in14; + * _out2 = _in2 + _in13; + * _out3 = _in3 + _in12; + * _out4 = _in4 + _in11; + * _out5 = _in5 + _in10; + * _out6 = _in6 + _in9; + * _out7 = _in7 + _in8; + * _out8 = _in7 - _in8; + * _out9 = _in6 - _in9; + * _out10 = _in5 - _in10; + * _out11 = _in4 - _in11; + * _out12 = _in3 - _in12; + * _out13 = _in2 - _in13; + * _out14 = _in1 - _in14; + * _out15 = _in0 - _in15; + * ============================================================================= + */ + +#define LSX_BUTTERFLY_16_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ + _in8, _in9, _in10, _in11, _in12, _in13, _in14, \ + _in15, _out0, _out1, _out2, _out3, _out4, _out5, \ + _out6, _out7, _out8, _out9, _out10, _out11, _out12, \ + _out13, _out14, _out15) \ + { \ + _out0 = __lsx_vadd_b(_in0, _in15); \ + _out1 = __lsx_vadd_b(_in1, _in14); \ + _out2 = __lsx_vadd_b(_in2, _in13); \ + _out3 = __lsx_vadd_b(_in3, _in12); \ + _out4 = __lsx_vadd_b(_in4, _in11); \ + _out5 = __lsx_vadd_b(_in5, _in10); \ + _out6 = __lsx_vadd_b(_in6, _in9); \ + _out7 = __lsx_vadd_b(_in7, _in8); \ + \ + _out8 = __lsx_vsub_b(_in7, _in8); \ + _out9 = __lsx_vsub_b(_in6, _in9); \ + _out10 = __lsx_vsub_b(_in5, _in10); \ + _out11 = __lsx_vsub_b(_in4, _in11); \ + _out12 = __lsx_vsub_b(_in3, _in12); \ + _out13 = __lsx_vsub_b(_in2, _in13); \ + _out14 = __lsx_vsub_b(_in1, _in14); \ + _out15 = __lsx_vsub_b(_in0, _in15); \ + } + +#define LSX_BUTTERFLY_16_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ + _in8, _in9, _in10, _in11, _in12, _in13, _in14, \ + _in15, _out0, _out1, _out2, _out3, _out4, _out5, \ + _out6, _out7, _out8, _out9, _out10, _out11, _out12, \ + _out13, _out14, _out15) \ + { \ + _out0 = __lsx_vadd_h(_in0, _in15); \ + _out1 = __lsx_vadd_h(_in1, _in14); \ + _out2 = __lsx_vadd_h(_in2, _in13); \ + _out3 = __lsx_vadd_h(_in3, _in12); \ + _out4 = __lsx_vadd_h(_in4, _in11); \ + _out5 = __lsx_vadd_h(_in5, _in10); \ + _out6 = __lsx_vadd_h(_in6, _in9); \ + _out7 = __lsx_vadd_h(_in7, _in8); \ + \ + _out8 = __lsx_vsub_h(_in7, _in8); \ + _out9 = __lsx_vsub_h(_in6, _in9); \ + _out10 = __lsx_vsub_h(_in5, _in10); \ + _out11 = __lsx_vsub_h(_in4, _in11); \ + _out12 = __lsx_vsub_h(_in3, _in12); \ + _out13 = __lsx_vsub_h(_in2, _in13); \ + _out14 = __lsx_vsub_h(_in1, _in14); \ + _out15 = __lsx_vsub_h(_in0, _in15); \ + } + +#define LSX_BUTTERFLY_16_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ + _in8, _in9, _in10, _in11, _in12, _in13, _in14, \ + _in15, _out0, _out1, _out2, _out3, _out4, _out5, \ + _out6, _out7, _out8, _out9, _out10, _out11, _out12, \ + _out13, _out14, _out15) \ + { \ + _out0 = __lsx_vadd_w(_in0, _in15); \ + _out1 = __lsx_vadd_w(_in1, _in14); \ + _out2 = __lsx_vadd_w(_in2, _in13); \ + _out3 = __lsx_vadd_w(_in3, _in12); \ + _out4 = __lsx_vadd_w(_in4, _in11); \ + _out5 = __lsx_vadd_w(_in5, _in10); \ + _out6 = __lsx_vadd_w(_in6, _in9); \ + _out7 = __lsx_vadd_w(_in7, _in8); \ + \ + _out8 = __lsx_vsub_w(_in7, _in8); \ + _out9 = __lsx_vsub_w(_in6, _in9); \ + _out10 = __lsx_vsub_w(_in5, _in10); \ + _out11 = __lsx_vsub_w(_in4, _in11); \ + _out12 = __lsx_vsub_w(_in3, _in12); \ + _out13 = __lsx_vsub_w(_in2, _in13); \ + _out14 = __lsx_vsub_w(_in1, _in14); \ + _out15 = __lsx_vsub_w(_in0, _in15); \ + } + +#define LSX_BUTTERFLY_16_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ + _in8, _in9, _in10, _in11, _in12, _in13, _in14, \ + _in15, _out0, _out1, _out2, _out3, _out4, _out5, \ + _out6, _out7, _out8, _out9, _out10, _out11, _out12, \ + _out13, _out14, _out15) \ + { \ + _out0 = __lsx_vadd_d(_in0, _in15); \ + _out1 = __lsx_vadd_d(_in1, _in14); \ + _out2 = __lsx_vadd_d(_in2, _in13); \ + _out3 = __lsx_vadd_d(_in3, _in12); \ + _out4 = __lsx_vadd_d(_in4, _in11); \ + _out5 = __lsx_vadd_d(_in5, _in10); \ + _out6 = __lsx_vadd_d(_in6, _in9); \ + _out7 = __lsx_vadd_d(_in7, _in8); \ + \ + _out8 = __lsx_vsub_d(_in7, _in8); \ + _out9 = __lsx_vsub_d(_in6, _in9); \ + _out10 = __lsx_vsub_d(_in5, _in10); \ + _out11 = __lsx_vsub_d(_in4, _in11); \ + _out12 = __lsx_vsub_d(_in3, _in12); \ + _out13 = __lsx_vsub_d(_in2, _in13); \ + _out14 = __lsx_vsub_d(_in1, _in14); \ + _out15 = __lsx_vsub_d(_in0, _in15); \ + } + #endif // LSX #ifdef __loongarch_asx @@ -692,7 +867,7 @@ static inline __m128i __lsx_vclip255_w(__m128i _in) { * Details : Unsigned byte elements from in_h are multiplied with * unsigned byte elements from in_l producing a result * twice the size of input i.e. signed halfword. - * Then this multiplied results of adjacent odd-even elements + * Then these multiplied results of adjacent odd-even elements * are added to the out vector * Example : See out = __lasx_xvdp2_w_h(in_h, in_l) * ============================================================================= @@ -714,7 +889,7 @@ static inline __m256i __lasx_xvdp2_h_bu(__m256i in_h, __m256i in_l) { * Details : Signed byte elements from in_h are multiplied with * signed byte elements from in_l producing a result * twice the size of input i.e. signed halfword. - * Then this multiplication results of adjacent odd-even elements + * Then these multiplication results of adjacent odd-even elements * are added to the out vector * Example : See out = __lasx_xvdp2_w_h(in_h, in_l) * ============================================================================= @@ -736,7 +911,7 @@ static inline __m256i __lasx_xvdp2_h_b(__m256i in_h, __m256i in_l) { * Details : Signed halfword elements from in_h are multiplied with * signed halfword elements from in_l producing a result * twice the size of input i.e. signed word. - * Then this multiplied results of adjacent odd-even elements + * Then these multiplied results of adjacent odd-even elements * are added to the out vector. * Example : out = __lasx_xvdp2_w_h(in_h, in_l) * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 @@ -761,7 +936,7 @@ static inline __m256i __lasx_xvdp2_w_h(__m256i in_h, __m256i in_l) { * Details : Signed word elements from in_h are multiplied with * signed word elements from in_l producing a result * twice the size of input i.e. signed double-word. - * Then this multiplied results of adjacent odd-even elements + * Then these multiplied results of adjacent odd-even elements * are added to the out vector. * Example : See out = __lasx_xvdp2_w_h(in_h, in_l) * ============================================================================= @@ -805,7 +980,7 @@ static inline __m256i __lasx_xvdp2_w_hu_h(__m256i in_h, __m256i in_l) { * Details : Signed byte elements from in_h are multiplied with * signed byte elements from in_l producing a result * twice the size of input i.e. signed halfword. - * Then this multiplied results of adjacent odd-even elements + * Then these multiplied results of adjacent odd-even elements * are added to the in_c vector. * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l) * ============================================================================= @@ -819,6 +994,52 @@ static inline __m256i __lasx_xvdp2add_h_b(__m256i in_c, __m256i in_h, return out; } +/* + * ============================================================================= + * Description : Dot product & addition of byte vector elements + * Arguments : Inputs - in_h, in_l + * Output - out + * Return Type - halfword + * Details : Unsigned byte elements from in_h are multiplied with + * unsigned byte elements from in_l producing a result + * twice the size of input i.e. signed halfword. + * Then these multiplied results of adjacent odd-even elements + * are added to the in_c vector. + * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l) + * ============================================================================= + */ +static inline __m256i __lasx_xvdp2add_h_bu(__m256i in_c, __m256i in_h, + __m256i in_l) { + __m256i out; + + out = __lasx_xvmaddwev_h_bu(in_c, in_h, in_l); + out = __lasx_xvmaddwod_h_bu(out, in_h, in_l); + return out; +} + +/* + * ============================================================================= + * Description : Dot product & addition of byte vector elements + * Arguments : Inputs - in_h, in_l + * Output - out + * Return Type - halfword + * Details : Unsigned byte elements from in_h are multiplied with + * signed byte elements from in_l producing a result + * twice the size of input i.e. signed halfword. + * Then these multiplied results of adjacent odd-even elements + * are added to the in_c vector. + * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l) + * ============================================================================= + */ +static inline __m256i __lasx_xvdp2add_h_bu_b(__m256i in_c, __m256i in_h, + __m256i in_l) { + __m256i out; + + out = __lasx_xvmaddwev_h_bu_b(in_c, in_h, in_l); + out = __lasx_xvmaddwod_h_bu_b(out, in_h, in_l); + return out; +} + /* * ============================================================================= * Description : Dot product of halfword vector elements @@ -955,7 +1176,7 @@ static inline __m256i __lasx_xvdp2sub_w_h(__m256i in_c, __m256i in_h, * Details : Signed halfword elements from in_h are multiplied with * signed halfword elements from in_l producing a result * four times the size of input i.e. signed doubleword. - * Then this multiplication results of four adjacent elements + * Then these multiplication results of four adjacent elements * are added together and stored to the out vector. * Example : out = __lasx_xvdp4_d_h(in_h, in_l) * in_h : 3,1,3,0, 0,0,0,1, 0,0,1,-1, 0,0,0,1 From e87f6d0a2c9fa5dd9267f9e930b5ab5921bdaae6 Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 13 Apr 2022 22:14:33 -0700 Subject: [PATCH 0322/1000] vp8,define_gf_group: remove unused variable this clears a warning under clang-13: vp8/encoder/firstpass.c:1634:10: warning: variable 'mod_err_per_mb_accumulator' set but not used [-Wunused-but-set-variable] Change-Id: I694a99d56724be89090e01c45559237c0fda147a --- vp8/encoder/firstpass.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/vp8/encoder/firstpass.c b/vp8/encoder/firstpass.c index 14164ebc51..ed177e3cb6 100644 --- a/vp8/encoder/firstpass.c +++ b/vp8/encoder/firstpass.c @@ -1631,7 +1631,6 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { double this_frame_mv_in_out = 0.0; double mv_in_out_accumulator = 0.0; double abs_mv_in_out_accumulator = 0.0; - double mod_err_per_mb_accumulator = 0.0; int max_bits = frame_max_bits(cpi); /* Max for a single frame */ @@ -1682,9 +1681,6 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { gf_group_err += mod_frame_err; - mod_err_per_mb_accumulator += - mod_frame_err / DOUBLE_DIVIDE_CHECK((double)cpi->common.MBs); - if (EOF == input_stats(cpi, &next_frame)) break; /* Test for the case where there is a brief flash but the prediction From a165f4ba64ec8c992ca57a1b4444cd4a19527dde Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 13 Apr 2022 22:16:30 -0700 Subject: [PATCH 0323/1000] vp9,update_mbgraph_frame_stats: rm unused variables this quiets warnings under clang-13 of the form: ../vp9/encoder/vp9_mbgraph.c:222:42: warning: variable 'gld_y_offset' set but not used [-Wunused-but-set-variable] Change-Id: I32170b90c07058f780b4e8100ee5217232149db8 --- vp9/encoder/vp9_mbgraph.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/vp9/encoder/vp9_mbgraph.c b/vp9/encoder/vp9_mbgraph.c index 831c79c175..7c2790cb98 100644 --- a/vp9/encoder/vp9_mbgraph.c +++ b/vp9/encoder/vp9_mbgraph.c @@ -219,7 +219,7 @@ static void update_mbgraph_frame_stats(VP9_COMP *cpi, VP9_COMMON *const cm = &cpi->common; int mb_col, mb_row, offset = 0; - int mb_y_offset = 0, arf_y_offset = 0, gld_y_offset = 0; + int mb_y_offset = 0; MV gld_top_mv = { 0, 0 }; MODE_INFO mi_local; MODE_INFO mi_above, mi_left; @@ -243,8 +243,6 @@ static void update_mbgraph_frame_stats(VP9_COMP *cpi, for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) { MV gld_left_mv = gld_top_mv; int mb_y_in_offset = mb_y_offset; - int arf_y_in_offset = arf_y_offset; - int gld_y_in_offset = gld_y_offset; // Set up limit values for motion vectors to prevent them extending outside // the UMV borders. @@ -266,8 +264,6 @@ static void update_mbgraph_frame_stats(VP9_COMP *cpi, xd->left_mi = &mi_left; mb_y_in_offset += 16; - gld_y_in_offset += 16; - arf_y_in_offset += 16; x->mv_limits.col_min -= 16; x->mv_limits.col_max -= 16; } @@ -276,8 +272,6 @@ static void update_mbgraph_frame_stats(VP9_COMP *cpi, xd->above_mi = &mi_above; mb_y_offset += buf->y_stride * 16; - gld_y_offset += golden_ref->y_stride * 16; - if (alt_ref) arf_y_offset += alt_ref->y_stride * 16; x->mv_limits.row_min -= 16; x->mv_limits.row_max -= 16; offset += cm->mb_cols; From 474a50c64837d05c68e6aa3d24ae096c53f2757d Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Wed, 6 Apr 2022 09:46:57 -0700 Subject: [PATCH 0324/1000] Fix int overflow in intermediate calculation This is not a complete fix to webm:1751. Bug: webm:1751 Change-Id: Ieed6c823744f5f0625d529db3746cfe4f549c8c0 --- vp8/encoder/bitstream.c | 6 ++---- vp8/encoder/boolhuff.h | 3 +-- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/vp8/encoder/bitstream.c b/vp8/encoder/bitstream.c index 87825fa6fe..0e97af5f2e 100644 --- a/vp8/encoder/bitstream.c +++ b/vp8/encoder/bitstream.c @@ -172,9 +172,8 @@ void vp8_pack_tokens(vp8_writer *w, const TOKENEXTRA *p, int xcount) { validate_buffer(w->buffer + w->pos, 1, w->buffer_end, w->error); w->buffer[w->pos++] = (lowvalue >> (24 - offset)) & 0xff; - lowvalue <<= offset; shift = count; - lowvalue &= 0xffffff; + lowvalue = (int)(((uint64_t)lowvalue << offset) & 0xffffff); count -= 8; } @@ -223,9 +222,8 @@ void vp8_pack_tokens(vp8_writer *w, const TOKENEXTRA *p, int xcount) { validate_buffer(w->buffer + w->pos, 1, w->buffer_end, w->error); w->buffer[w->pos++] = (lowvalue >> (24 - offset)) & 0xff; - lowvalue <<= offset; shift = count; - lowvalue &= 0xffffff; + lowvalue = (int)(((uint64_t)lowvalue << offset) & 0xffffff); count -= 8; } diff --git a/vp8/encoder/boolhuff.h b/vp8/encoder/boolhuff.h index 8cc61bdd44..a8c536b99c 100644 --- a/vp8/encoder/boolhuff.h +++ b/vp8/encoder/boolhuff.h @@ -94,9 +94,8 @@ static void vp8_encode_bool(BOOL_CODER *bc, int bit, int probability) { validate_buffer(bc->buffer + bc->pos, 1, bc->buffer_end, bc->error); bc->buffer[bc->pos++] = (lowvalue >> (24 - offset) & 0xff); - lowvalue <<= offset; shift = count; - lowvalue &= 0xffffff; + lowvalue = (int)(((uint64_t)lowvalue << offset) & 0xffffff); count -= 8; } From 73b8aade83cd7d0fffe29254b931a34ad4621510 Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 13 Apr 2022 22:24:08 -0700 Subject: [PATCH 0325/1000] temporal_filter_sse4: remove unused function params this clears warnings under clang-13 of the form: ../vp9/encoder/x86/temporal_filter_sse4.c:275:39: warning: parameter 'u_pre' set but not used [-Wunused-but-set-parameter] Change-Id: I21519b5b0b9c21b04b174327415e0e73b56bdfda --- vp9/encoder/x86/temporal_filter_sse4.c | 179 ++++++++----------------- 1 file changed, 59 insertions(+), 120 deletions(-) diff --git a/vp9/encoder/x86/temporal_filter_sse4.c b/vp9/encoder/x86/temporal_filter_sse4.c index 437f49f5a0..bdbd66051d 100644 --- a/vp9/encoder/x86/temporal_filter_sse4.c +++ b/vp9/encoder/x86/temporal_filter_sse4.c @@ -270,13 +270,11 @@ static INLINE void add_luma_dist_to_8_chroma_mod(const uint16_t *y_dist, // size 4 for the weights for each of the 4 subblocks if blk_fw is not NULL, // else use top_weight for top half, and bottom weight for bottom half. static void vp9_apply_temporal_filter_luma_16( - const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, - int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, - int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, - int uv_pre_stride, unsigned int block_width, unsigned int block_height, - int ss_x, int ss_y, int strength, int use_whole_blk, uint32_t *y_accum, - uint16_t *y_count, const uint16_t *y_dist, const uint16_t *u_dist, - const uint16_t *v_dist, const int16_t *const *neighbors_first, + const uint8_t *y_pre, int y_pre_stride, unsigned int block_width, + unsigned int block_height, int ss_x, int ss_y, int strength, + int use_whole_blk, uint32_t *y_accum, uint16_t *y_count, + const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist, + const int16_t *const *neighbors_first, const int16_t *const *neighbors_second, int top_weight, int bottom_weight, const int *blk_fw) { const int rounding = (1 << strength) >> 1; @@ -301,7 +299,6 @@ static void vp9_apply_temporal_filter_luma_16( assert(strength <= 6); assert(block_width == 16); - (void)block_width; // Initialize the weights @@ -342,17 +339,12 @@ static void vp9_apply_temporal_filter_luma_16( accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count, y_accum); - y_src += y_src_stride; y_pre += y_pre_stride; y_count += y_pre_stride; y_accum += y_pre_stride; y_dist += DIST_STRIDE; - u_src += uv_src_stride; - u_pre += uv_pre_stride; u_dist += DIST_STRIDE; - v_src += uv_src_stride; - v_pre += uv_pre_stride; v_dist += DIST_STRIDE; // Then all the rows except the last one @@ -392,11 +384,7 @@ static void vp9_apply_temporal_filter_luma_16( read_chroma_dist_row_16(ss_x, u_dist, v_dist, &u_first, &u_second, &v_first, &v_second); - u_src += uv_src_stride; - u_pre += uv_pre_stride; u_dist += DIST_STRIDE; - v_src += uv_src_stride; - v_pre += uv_pre_stride; v_dist += DIST_STRIDE; } @@ -413,7 +401,6 @@ static void vp9_apply_temporal_filter_luma_16( accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count, y_accum); - y_src += y_src_stride; y_pre += y_pre_stride; y_count += y_pre_stride; y_accum += y_pre_stride; @@ -458,13 +445,10 @@ static void vp9_apply_temporal_filter_luma_16( // Perform temporal filter for the luma component. static void vp9_apply_temporal_filter_luma( - const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, - int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, - int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, - int uv_pre_stride, unsigned int block_width, unsigned int block_height, - int ss_x, int ss_y, int strength, const int *blk_fw, int use_whole_blk, - uint32_t *y_accum, uint16_t *y_count, const uint16_t *y_dist, - const uint16_t *u_dist, const uint16_t *v_dist) { + const uint8_t *y_pre, int y_pre_stride, unsigned int block_width, + unsigned int block_height, int ss_x, int ss_y, int strength, + const int *blk_fw, int use_whole_blk, uint32_t *y_accum, uint16_t *y_count, + const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist) { unsigned int blk_col = 0, uv_blk_col = 0; const unsigned int blk_col_step = 16, uv_blk_col_step = 16 >> ss_x; const unsigned int mid_width = block_width >> 1, @@ -482,21 +466,16 @@ static void vp9_apply_temporal_filter_luma( neighbors_second = LUMA_RIGHT_COLUMN_NEIGHBORS; if (use_whole_blk) { vp9_apply_temporal_filter_luma_16( - y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, - u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, - u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, 16, - block_height, ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col, - y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col, - v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight, - bottom_weight, NULL); + y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength, + use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first, + neighbors_second, top_weight, bottom_weight, NULL); } else { vp9_apply_temporal_filter_luma_16( - y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, - u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, - u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, 16, - block_height, ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col, - y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col, - v_dist + uv_blk_col, neighbors_first, neighbors_second, 0, 0, blk_fw); + y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength, + use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first, + neighbors_second, 0, 0, blk_fw); } return; @@ -506,9 +485,7 @@ static void vp9_apply_temporal_filter_luma( neighbors_first = LUMA_LEFT_COLUMN_NEIGHBORS; neighbors_second = LUMA_MIDDLE_COLUMN_NEIGHBORS; vp9_apply_temporal_filter_luma_16( - y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, - u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col, - v_pre + uv_blk_col, uv_pre_stride, 16, block_height, ss_x, ss_y, strength, + y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight, bottom_weight, NULL); @@ -521,13 +498,10 @@ static void vp9_apply_temporal_filter_luma( for (; blk_col < mid_width; blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { vp9_apply_temporal_filter_luma_16( - y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, - u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, - u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, 16, block_height, - ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col, - y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col, - v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight, - bottom_weight, NULL); + y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength, + use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first, + neighbors_second, top_weight, bottom_weight, NULL); } if (!use_whole_blk) { @@ -539,21 +513,16 @@ static void vp9_apply_temporal_filter_luma( for (; blk_col < last_width; blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { vp9_apply_temporal_filter_luma_16( - y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, - u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, - u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, 16, block_height, - ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col, - y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col, - v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight, - bottom_weight, NULL); + y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength, + use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first, + neighbors_second, top_weight, bottom_weight, NULL); } // Right neighbors_second = LUMA_RIGHT_COLUMN_NEIGHBORS; vp9_apply_temporal_filter_luma_16( - y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, - u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col, - v_pre + uv_blk_col, uv_pre_stride, 16, block_height, ss_x, ss_y, strength, + y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight, bottom_weight, NULL); @@ -564,10 +533,7 @@ static void vp9_apply_temporal_filter_luma( // blk_fw as an array of size 4 for the weights for each of the 4 subblocks, // else use top_weight for top half, and bottom weight for bottom half. static void vp9_apply_temporal_filter_chroma_8( - const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, - int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, - int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, - int uv_pre_stride, unsigned int uv_block_width, + const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int uv_block_height, int ss_x, int ss_y, int strength, uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count, const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist, @@ -587,8 +553,6 @@ static void vp9_apply_temporal_filter_chroma_8( // Loop variable unsigned int h; - (void)uv_block_width; - // Initilize weight if (blk_fw) { weight = _mm_setr_epi16(blk_fw[0], blk_fw[0], blk_fw[0], blk_fw[0], @@ -621,10 +585,8 @@ static void vp9_apply_temporal_filter_chroma_8( accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum); accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum); - u_src += uv_src_stride; u_pre += uv_pre_stride; u_dist += DIST_STRIDE; - v_src += uv_src_stride; v_pre += uv_pre_stride; v_dist += DIST_STRIDE; u_count += uv_pre_stride; @@ -632,8 +594,6 @@ static void vp9_apply_temporal_filter_chroma_8( v_count += uv_pre_stride; v_accum += uv_pre_stride; - y_src += y_src_stride * (1 + ss_y); - y_pre += y_pre_stride * (1 + ss_y); y_dist += DIST_STRIDE * (1 + ss_y); // Then all the rows except the last one @@ -676,10 +636,8 @@ static void vp9_apply_temporal_filter_chroma_8( accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum); accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum); - u_src += uv_src_stride; u_pre += uv_pre_stride; u_dist += DIST_STRIDE; - v_src += uv_src_stride; v_pre += uv_pre_stride; v_dist += DIST_STRIDE; u_count += uv_pre_stride; @@ -687,8 +645,6 @@ static void vp9_apply_temporal_filter_chroma_8( v_count += uv_pre_stride; v_accum += uv_pre_stride; - y_src += y_src_stride * (1 + ss_y); - y_pre += y_pre_stride * (1 + ss_y); y_dist += DIST_STRIDE * (1 + ss_y); } @@ -719,12 +675,10 @@ static void vp9_apply_temporal_filter_chroma_8( // Perform temporal filter for the chroma components. static void vp9_apply_temporal_filter_chroma( - const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, - int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, - int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, - int uv_pre_stride, unsigned int block_width, unsigned int block_height, - int ss_x, int ss_y, int strength, const int *blk_fw, int use_whole_blk, - uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count, + const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, + unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, + int strength, const int *blk_fw, int use_whole_blk, uint32_t *u_accum, + uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count, const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist) { const unsigned int uv_width = block_width >> ss_x, uv_height = block_height >> ss_y; @@ -751,22 +705,17 @@ static void vp9_apply_temporal_filter_chroma( if (use_whole_blk) { vp9_apply_temporal_filter_chroma_8( - y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, - u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, - u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, - uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, - u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, - y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, - top_weight, bottom_weight, NULL); + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, + ss_x, ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col, + v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight, + bottom_weight, NULL); } else { vp9_apply_temporal_filter_chroma_8( - y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, - u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, - u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, - uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, - u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, - y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, - 0, 0, blk_fw); + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, + ss_x, ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col, + v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, 0, 0, blk_fw); } return; @@ -782,10 +731,8 @@ static void vp9_apply_temporal_filter_chroma( } vp9_apply_temporal_filter_chroma_8( - y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, - u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col, - v_pre + uv_blk_col, uv_pre_stride, uv_width, uv_height, ss_x, ss_y, - strength, u_accum + uv_blk_col, u_count + uv_blk_col, + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, ss_x, + ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight, bottom_weight, NULL); @@ -805,13 +752,11 @@ static void vp9_apply_temporal_filter_chroma( for (; uv_blk_col < uv_mid_width; blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { vp9_apply_temporal_filter_chroma_8( - y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, - u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, - u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, - uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, - u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, - y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, - top_weight, bottom_weight, NULL); + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, ss_x, + ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col, + v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight, + bottom_weight, NULL); } if (!use_whole_blk) { @@ -823,13 +768,11 @@ static void vp9_apply_temporal_filter_chroma( for (; uv_blk_col < uv_last_width; blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { vp9_apply_temporal_filter_chroma_8( - y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, - u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, - u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, - uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, - u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, - y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, - top_weight, bottom_weight, NULL); + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, ss_x, + ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col, + v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight, + bottom_weight, NULL); } // Right @@ -842,10 +785,8 @@ static void vp9_apply_temporal_filter_chroma( } vp9_apply_temporal_filter_chroma_8( - y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, - u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col, - v_pre + uv_blk_col, uv_pre_stride, uv_width, uv_height, ss_x, ss_y, - strength, u_accum + uv_blk_col, u_count + uv_blk_col, + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, ss_x, + ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight, bottom_weight, NULL); @@ -922,14 +863,12 @@ void vp9_apply_temporal_filter_sse4_1( u_dist_ptr = u_dist + 1; v_dist_ptr = v_dist + 1; - vp9_apply_temporal_filter_luma( - y_src, y_src_stride, y_pre, y_pre_stride, u_src, v_src, uv_src_stride, - u_pre, v_pre, uv_pre_stride, block_width, block_height, ss_x, ss_y, - strength, blk_fw_ptr, use_whole_blk, y_accum, y_count, y_dist_ptr, - u_dist_ptr, v_dist_ptr); + vp9_apply_temporal_filter_luma(y_pre, y_pre_stride, block_width, block_height, + ss_x, ss_y, strength, blk_fw_ptr, + use_whole_blk, y_accum, y_count, y_dist_ptr, + u_dist_ptr, v_dist_ptr); vp9_apply_temporal_filter_chroma( - y_src, y_src_stride, y_pre, y_pre_stride, u_src, v_src, uv_src_stride, u_pre, v_pre, uv_pre_stride, block_width, block_height, ss_x, ss_y, strength, blk_fw_ptr, use_whole_blk, u_accum, u_count, v_accum, v_count, y_dist_ptr, u_dist_ptr, v_dist_ptr); From a067d8a5bcfda7ed8b967477c06dab3151be0d24 Mon Sep 17 00:00:00 2001 From: yuanhecai Date: Wed, 16 Mar 2022 14:16:33 +0800 Subject: [PATCH 0326/1000] vp9[loongarch]: Optimize vpx_fdct32x32/32x32_rd 1. vpx_fdct32x32_lsx 2. vpx_fdct32x32_rd_lsx Bug: webm:1755 Change-Id: I83bce11c0d905cf137545a46cd756aef9cedce47 --- test/dct32x32_test.cc | 9 + vpx_dsp/loongarch/fwd_dct32x32_lsx.c | 1176 ++++++++++++++++++++++++++ vpx_dsp/loongarch/fwd_txfm_lsx.h | 94 ++ vpx_dsp/loongarch/txfm_macros_lsx.h | 47 + vpx_dsp/vpx_dsp.mk | 3 + vpx_dsp/vpx_dsp_rtcd_defs.pl | 4 +- 6 files changed, 1331 insertions(+), 2 deletions(-) create mode 100644 vpx_dsp/loongarch/fwd_dct32x32_lsx.c create mode 100644 vpx_dsp/loongarch/fwd_txfm_lsx.h create mode 100644 vpx_dsp/loongarch/txfm_macros_lsx.h diff --git a/test/dct32x32_test.cc b/test/dct32x32_test.cc index 8398e17e81..a764d187a3 100644 --- a/test/dct32x32_test.cc +++ b/test/dct32x32_test.cc @@ -396,4 +396,13 @@ INSTANTIATE_TEST_SUITE_P( make_tuple(&vpx_fdct32x32_rd_vsx, &vpx_idct32x32_1024_add_vsx, 1, VPX_BITS_8))); #endif // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE + +#if HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE +INSTANTIATE_TEST_SUITE_P( + LSX, Trans32x32Test, + ::testing::Values(make_tuple(&vpx_fdct32x32_lsx, &vpx_idct32x32_1024_add_c, + 0, VPX_BITS_8), + make_tuple(&vpx_fdct32x32_rd_lsx, + &vpx_idct32x32_1024_add_c, 1, VPX_BITS_8))); +#endif // HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE } // namespace diff --git a/vpx_dsp/loongarch/fwd_dct32x32_lsx.c b/vpx_dsp/loongarch/fwd_dct32x32_lsx.c new file mode 100644 index 0000000000..e5c301b2c1 --- /dev/null +++ b/vpx_dsp/loongarch/fwd_dct32x32_lsx.c @@ -0,0 +1,1176 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/loongarch/fwd_txfm_lsx.h" +#include "vpx_dsp/fwd_txfm.h" + +#define UNPCK_SH_SW(in, out0, out1) \ + { \ + out0 = __lsx_vsllwil_w_h(in, 0); \ + out1 = __lsx_vexth_w_h(in); \ + } + +static void fdct8x32_1d_column_load_butterfly(const int16_t *input, + int32_t src_stride, + int16_t *temp_buff) { + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + __m128i step0, step1, step2, step3; + __m128i in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1; + __m128i step0_1, step1_1, step2_1, step3_1; + + int32_t stride = src_stride << 1; + int32_t stride2 = stride << 1; + int32_t stride3 = stride2 + stride; + const int16_t *input_tmp = (int16_t *)input; + + in0 = __lsx_vld(input_tmp, 0); + DUP2_ARG2(__lsx_vldx, input_tmp, stride, input_tmp, stride2, in1, in2); + in3 = __lsx_vldx(input_tmp, stride3); + + input_tmp += stride2; + in0_1 = __lsx_vld(input_tmp, 0); + DUP2_ARG2(__lsx_vldx, input_tmp, stride, input_tmp, stride2, in1_1, in2_1); + in3_1 = __lsx_vldx(input_tmp, stride3); + + input_tmp = input + (src_stride * 24); + in4_1 = __lsx_vld(input_tmp, 0); + DUP2_ARG2(__lsx_vldx, input_tmp, stride, input_tmp, stride2, in5_1, in6_1); + in7_1 = __lsx_vldx(input_tmp, stride3); + + input_tmp += stride2; + in4 = __lsx_vld(input_tmp, 0); + DUP2_ARG2(__lsx_vldx, input_tmp, stride, input_tmp, stride2, in5, in6); + in7 = __lsx_vldx(input_tmp, stride3); + + DUP4_ARG2(__lsx_vslli_h, in0, 2, in1, 2, in2, 2, in3, 2, in0, in1, in2, in3); + DUP4_ARG2(__lsx_vslli_h, in4, 2, in5, 2, in6, 2, in7, 2, in4, in5, in6, in7); + DUP4_ARG2(__lsx_vslli_h, in0_1, 2, in1_1, 2, in2_1, 2, in3_1, 2, in0_1, in1_1, + in2_1, in3_1); + DUP4_ARG2(__lsx_vslli_h, in4_1, 2, in5_1, 2, in6_1, 2, in7_1, 2, in4_1, in5_1, + in6_1, in7_1); + LSX_BUTTERFLY_8_H(in0, in1, in2, in3, in4, in5, in6, in7, step0, step1, step2, + step3, in4, in5, in6, in7); + LSX_BUTTERFLY_8_H(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, + step0_1, step1_1, step2_1, step3_1, in4_1, in5_1, in6_1, + in7_1); + + __lsx_vst(step0, temp_buff, 0); + __lsx_vst(step1, temp_buff, 16); + __lsx_vst(step2, temp_buff, 32); + __lsx_vst(step3, temp_buff, 48); + + __lsx_vst(in4, temp_buff, 448); + __lsx_vst(in5, temp_buff, 464); + __lsx_vst(in6, temp_buff, 480); + __lsx_vst(in7, temp_buff, 496); + + __lsx_vst(step0_1, temp_buff, 64); + __lsx_vst(step1_1, temp_buff, 80); + __lsx_vst(step2_1, temp_buff, 96); + __lsx_vst(step3_1, temp_buff, 112); + + __lsx_vst(in4_1, temp_buff, 384); + __lsx_vst(in5_1, temp_buff, 400); + __lsx_vst(in6_1, temp_buff, 416); + __lsx_vst(in7_1, temp_buff, 432); + + /* 3rd and 4th set */ + input_tmp = input + (src_stride * 8); + in0 = __lsx_vld(input_tmp, 0); + DUP2_ARG2(__lsx_vldx, input_tmp, stride, input_tmp, stride2, in1, in2); + in3 = __lsx_vldx(input_tmp, stride3); + + input_tmp += stride2; + in0_1 = __lsx_vld(input_tmp, 0); + DUP2_ARG2(__lsx_vldx, input_tmp, stride, input_tmp, stride2, in1_1, in2_1); + in3_1 = __lsx_vldx(input_tmp, stride3); + + input_tmp += stride2; + in4_1 = __lsx_vld(input_tmp, 0); + DUP2_ARG2(__lsx_vldx, input_tmp, stride, input_tmp, stride2, in5_1, in6_1); + in7_1 = __lsx_vldx(input_tmp, stride3); + + input_tmp += stride2; + in4 = __lsx_vld(input_tmp, 0); + DUP2_ARG2(__lsx_vldx, input_tmp, stride, input_tmp, stride2, in5, in6); + in7 = __lsx_vldx(input_tmp, stride3); + DUP4_ARG2(__lsx_vslli_h, in0, 2, in1, 2, in2, 2, in3, 2, in0, in1, in2, in3); + DUP4_ARG2(__lsx_vslli_h, in4, 2, in5, 2, in6, 2, in7, 2, in4, in5, in6, in7); + DUP4_ARG2(__lsx_vslli_h, in0_1, 2, in1_1, 2, in2_1, 2, in3_1, 2, in0_1, in1_1, + in2_1, in3_1); + DUP4_ARG2(__lsx_vslli_h, in4_1, 2, in5_1, 2, in6_1, 2, in7_1, 2, in4_1, in5_1, + in6_1, in7_1); + + LSX_BUTTERFLY_8_H(in0, in1, in2, in3, in4, in5, in6, in7, step0, step1, step2, + step3, in4, in5, in6, in7); + LSX_BUTTERFLY_8_H(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, + step0_1, step1_1, step2_1, step3_1, in4_1, in5_1, in6_1, + in7_1); + + __lsx_vst(step0, temp_buff, 128); + __lsx_vst(step1, temp_buff, 144); + __lsx_vst(step2, temp_buff, 160); + __lsx_vst(step3, temp_buff, 176); + + __lsx_vst(in4, temp_buff, 320); + __lsx_vst(in5, temp_buff, 336); + __lsx_vst(in6, temp_buff, 352); + __lsx_vst(in7, temp_buff, 368); + + __lsx_vst(step0_1, temp_buff, 192); + __lsx_vst(step1_1, temp_buff, 208); + __lsx_vst(step2_1, temp_buff, 224); + __lsx_vst(step3_1, temp_buff, 240); + + __lsx_vst(in4_1, temp_buff, 256); + __lsx_vst(in5_1, temp_buff, 272); + __lsx_vst(in6_1, temp_buff, 288); + __lsx_vst(in7_1, temp_buff, 304); +} + +static void fdct8x32_1d_column_even_store(int16_t *input, int16_t *temp) { + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + __m128i in8, in9, in10, in11, in12, in13, in14, in15; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + __m128i temp0, temp1; + + /* fdct even */ + DUP4_ARG2(__lsx_vld, input, 0, input, 16, input, 32, input, 48, in0, in1, in2, + in3); + DUP4_ARG2(__lsx_vld, input, 192, input, 208, input, 224, input, 240, in12, + in13, in14, in15); + LSX_BUTTERFLY_8_H(in0, in1, in2, in3, in12, in13, in14, in15, vec0, vec1, + vec2, vec3, in12, in13, in14, in15); + DUP4_ARG2(__lsx_vld, input, 64, input, 80, input, 96, input, 112, in4, in5, + in6, in7); + DUP4_ARG2(__lsx_vld, input, 128, input, 144, input, 160, input, 176, in8, in9, + in10, in11); + LSX_BUTTERFLY_8_H(in4, in5, in6, in7, in8, in9, in10, in11, vec4, vec5, vec6, + vec7, in8, in9, in10, in11); + + /* Stage 3 */ + DUP4_ARG2(__lsx_vadd_h, vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, + in1, in2, in3); + LSX_BUTTERFLY_4_H(in0, in1, in2, in3, temp0, in4, in1, in0); + DOTP_CONST_PAIR(temp0, in4, cospi_16_64, cospi_16_64, temp1, temp0); + FDCT32_POSTPROC_2V_POS_H(temp0, temp1); + __lsx_vst(temp0, temp, 0); + __lsx_vst(temp1, temp, 1024); + + DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0); + FDCT32_POSTPROC_2V_POS_H(temp0, temp1); + __lsx_vst(temp0, temp, 512); + __lsx_vst(temp1, temp, 1536); + + DUP4_ARG2(__lsx_vsub_h, vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, vec7, + vec6, vec5, vec4); + DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6); + DUP2_ARG2(__lsx_vadd_h, vec4, vec5, vec7, vec6, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0); + FDCT32_POSTPROC_2V_POS_H(temp0, temp1); + __lsx_vst(temp0, temp, 256); + __lsx_vst(temp1, temp, 1792); + + DUP2_ARG2(__lsx_vsub_h, vec4, vec5, vec7, vec6, vec4, vec7); + DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0); + FDCT32_POSTPROC_2V_POS_H(temp0, temp1); + __lsx_vst(temp0, temp, 1280); + __lsx_vst(temp1, temp, 768); + + DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5); + DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4); + DUP4_ARG2(__lsx_vadd_h, in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, + vec1, vec6, in2); + DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3); + DUP2_ARG2(__lsx_vadd_h, in0, in1, in2, in3, vec0, vec7); + DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0); + FDCT32_POSTPROC_2V_POS_H(temp0, temp1); + __lsx_vst(temp0, temp, 128); + __lsx_vst(temp1, temp, 1920); + + DUP2_ARG2(__lsx_vsub_h, in0, in1, in2, in3, in0, in2); + DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0); + FDCT32_POSTPROC_2V_POS_H(temp0, temp1); + __lsx_vst(temp0, temp, 1152); + __lsx_vst(temp1, temp, 896); + + DUP2_ARG2(__lsx_vsub_h, in9, vec2, in14, vec5, vec2, vec5); + temp0 = __lsx_vneg_h(vec2); + DOTP_CONST_PAIR(temp0, vec5, cospi_24_64, cospi_8_64, in2, in1); + DUP4_ARG2(__lsx_vsub_h, in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, + vec2, vec5); + DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0); + FDCT32_POSTPROC_2V_POS_H(temp0, temp1); + __lsx_vst(temp0, temp, 640); + __lsx_vst(temp1, temp, 1408); + + DUP2_ARG2(__lsx_vadd_h, in3, in2, in0, in1, vec3, vec4); + DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1); + FDCT32_POSTPROC_2V_POS_H(temp0, temp1); + __lsx_vst(temp0, temp, 384); + __lsx_vst(temp1, temp, 1664); +} + +static void fdct8x32_1d_column_odd_store(int16_t *input, int16_t *temp_ptr) { + __m128i in16, in17, in18, in19, in20, in21, in22, in23; + __m128i in24, in25, in26, in27, in28, in29, in30, in31, vec4, vec5; + __m128i tmp0, tmp1; + + DUP4_ARG2(__lsx_vld, input, 64, input, 80, input, 160, input, 176, in20, in21, + in26, in27); + + DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27); + DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26); + + DUP4_ARG2(__lsx_vld, input, 32, input, 48, input, 192, input, 208, in18, in19, + in28, in29); + + vec4 = __lsx_vsub_h(in19, in20); + __lsx_vst(vec4, input, 64); + vec4 = __lsx_vsub_h(in18, in21); + __lsx_vst(vec4, input, 80); + vec4 = __lsx_vsub_h(in29, in26); + __lsx_vst(vec4, input, 160); + vec4 = __lsx_vsub_h(in28, in27); + __lsx_vst(vec4, input, 176); + + in21 = __lsx_vadd_h(in18, in21); + in20 = __lsx_vadd_h(in19, in20); + in27 = __lsx_vadd_h(in28, in27); + in26 = __lsx_vadd_h(in29, in26); + + DUP4_ARG2(__lsx_vld, input, 96, input, 112, input, 128, input, 144, in22, + in23, in24, in25); + DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25); + DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24); + + DUP4_ARG2(__lsx_vld, input, 0, input, 16, input, 224, input, 240, in16, in17, + in30, in31); + + vec4 = __lsx_vsub_h(in17, in22); + __lsx_vst(vec4, input, 32); + vec4 = __lsx_vsub_h(in16, in23); + __lsx_vst(vec4, input, 48); + vec4 = __lsx_vsub_h(in31, in24); + __lsx_vst(vec4, input, 192); + vec4 = __lsx_vsub_h(in30, in25); + __lsx_vst(vec4, input, 208); + + DUP4_ARG2(__lsx_vadd_h, in16, in23, in17, in22, in30, in25, in31, in24, in16, + in17, in30, in31); + DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29); + DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28); + DUP4_ARG2(__lsx_vadd_h, in16, in19, in17, in18, in30, in29, in31, in28, in27, + in22, in21, in25); + DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24); + DUP2_ARG2(__lsx_vadd_h, in27, in26, in25, in24, in23, in20); + DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5); + FDCT32_POSTPROC_2V_POS_H(vec5, vec4); + __lsx_vst(vec5, temp_ptr, 0); + __lsx_vst(vec4, temp_ptr, 1920); + + DUP2_ARG2(__lsx_vsub_h, in27, in26, in25, in24, in22, in21); + DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4); + FDCT32_POSTPROC_2V_POS_H(vec5, vec4); + __lsx_vst(vec5, temp_ptr, 896); + __lsx_vst(vec4, temp_ptr, 1024); + + DUP4_ARG2(__lsx_vsub_h, in17, in18, in16, in19, in31, in28, in30, in29, in23, + in26, in24, in20); + tmp0 = __lsx_vneg_h(in23); + DOTP_CONST_PAIR(tmp0, in20, cospi_28_64, cospi_4_64, in27, in25); + DUP2_ARG2(__lsx_vsub_h, in26, in27, in24, in25, in23, in20); + DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5); + FDCT32_POSTPROC_2V_POS_H(vec5, vec4); + __lsx_vst(vec4, temp_ptr, 1408); + __lsx_vst(vec5, temp_ptr, 512); + + DUP2_ARG2(__lsx_vadd_h, in26, in27, in24, in25, in22, in21); + DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5); + FDCT32_POSTPROC_2V_POS_H(vec5, vec4); + __lsx_vst(vec4, temp_ptr, 384); + __lsx_vst(vec5, temp_ptr, 1536); + + DUP4_ARG2(__lsx_vld, input, 32, input, 48, input, 64, input, 80, in22, in23, + in20, in21); + DUP4_ARG2(__lsx_vld, input, 160, input, 176, input, 192, input, 208, in26, + in27, in24, in25); + in16 = in20; + in17 = in21; + DUP2_ARG1(__lsx_vneg_h, in16, in17, tmp0, tmp1); + DOTP_CONST_PAIR(tmp0, in27, cospi_24_64, cospi_8_64, in20, in27); + DOTP_CONST_PAIR(tmp1, in26, cospi_24_64, cospi_8_64, in21, in26); + DUP4_ARG2(__lsx_vsub_h, in23, in20, in22, in21, in25, in26, in24, in27, in28, + in17, in18, in31); + DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30); + DUP2_ARG2(__lsx_vadd_h, in28, in29, in31, in30, in16, in19); + DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4); + FDCT32_POSTPROC_2V_POS_H(vec5, vec4); + __lsx_vst(vec5, temp_ptr, 1664); + __lsx_vst(vec4, temp_ptr, 256); + + DUP2_ARG2(__lsx_vsub_h, in28, in29, in31, in30, in17, in18); + DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4); + FDCT32_POSTPROC_2V_POS_H(vec5, vec4); + __lsx_vst(vec5, temp_ptr, 640); + __lsx_vst(vec4, temp_ptr, 1280); + + DUP4_ARG2(__lsx_vadd_h, in22, in21, in23, in20, in24, in27, in25, in26, in16, + in29, in30, in19); + tmp0 = __lsx_vneg_h(in16); + DOTP_CONST_PAIR(tmp0, in19, cospi_12_64, cospi_20_64, in28, in31); + DUP2_ARG2(__lsx_vsub_h, in29, in28, in30, in31, in16, in19); + DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4); + FDCT32_POSTPROC_2V_POS_H(vec5, vec4); + __lsx_vst(vec5, temp_ptr, 1152); + __lsx_vst(vec4, temp_ptr, 768); + + DUP2_ARG2(__lsx_vadd_h, in29, in28, in30, in31, in17, in18); + DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4); + FDCT32_POSTPROC_2V_POS_H(vec5, vec4); + __lsx_vst(vec5, temp_ptr, 128); + __lsx_vst(vec4, temp_ptr, 1792); +} + +static void fdct8x32_1d_column(const int16_t *input, int32_t src_stride, + int16_t *tmp_buf, int16_t *tmp_buf_big) { + fdct8x32_1d_column_load_butterfly(input, src_stride, tmp_buf); + fdct8x32_1d_column_even_store(tmp_buf, tmp_buf_big); + fdct8x32_1d_column_odd_store(tmp_buf + 128, (tmp_buf_big + 32)); +} + +static void fdct8x32_1d_row_load_butterfly(int16_t *temp_buff, + int16_t *output) { + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + __m128i in8, in9, in10, in11, in12, in13, in14, in15; + __m128i step0, step1, step2, step3, step4, step5, step6, step7; + + DUP4_ARG2(__lsx_vld, temp_buff, 0, temp_buff, 64, temp_buff, 128, temp_buff, + 192, in0, in1, in2, in3); + DUP4_ARG2(__lsx_vld, temp_buff, 256, temp_buff, 320, temp_buff, 384, + temp_buff, 448, in4, in5, in6, in7); + DUP4_ARG2(__lsx_vld, temp_buff, 48, temp_buff, 112, temp_buff, 176, temp_buff, + 240, in8, in9, in10, in11); + DUP4_ARG2(__lsx_vld, temp_buff, 304, temp_buff, 368, temp_buff, 432, + temp_buff, 496, in12, in13, in14, in15); + LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + LSX_TRANSPOSE8x8_H(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, + in10, in11, in12, in13, in14, in15); + LSX_BUTTERFLY_16_H(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, + in11, in12, in13, in14, in15, step0, step1, step2, step3, + step4, step5, step6, step7, in8, in9, in10, in11, in12, + in13, in14, in15); + + __lsx_vst(step0, output, 0); + __lsx_vst(step1, output, 16); + __lsx_vst(step2, output, 32); + __lsx_vst(step3, output, 48); + __lsx_vst(step4, output, 64); + __lsx_vst(step5, output, 80); + __lsx_vst(step6, output, 96); + __lsx_vst(step7, output, 112); + + __lsx_vst(in8, output, 384); + __lsx_vst(in9, output, 400); + __lsx_vst(in10, output, 416); + __lsx_vst(in11, output, 432); + __lsx_vst(in12, output, 448); + __lsx_vst(in13, output, 464); + __lsx_vst(in14, output, 480); + __lsx_vst(in15, output, 496); + + /* 2nd set */ + DUP4_ARG2(__lsx_vld, temp_buff, 16, temp_buff, 80, temp_buff, 144, temp_buff, + 208, in0, in1, in2, in3); + DUP4_ARG2(__lsx_vld, temp_buff, 272, temp_buff, 336, temp_buff, 400, + temp_buff, 464, in4, in5, in6, in7); + DUP4_ARG2(__lsx_vld, temp_buff, 32, temp_buff, 96, temp_buff, 160, temp_buff, + 224, in8, in9, in10, in11); + DUP4_ARG2(__lsx_vld, temp_buff, 288, temp_buff, 352, temp_buff, 416, + temp_buff, 480, in12, in13, in14, in15); + LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + LSX_TRANSPOSE8x8_H(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, + in10, in11, in12, in13, in14, in15); + LSX_BUTTERFLY_16_H(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, + in11, in12, in13, in14, in15, step0, step1, step2, step3, + step4, step5, step6, step7, in8, in9, in10, in11, in12, + in13, in14, in15); + + __lsx_vst(step0, output, 128); + __lsx_vst(step1, output, 144); + __lsx_vst(step2, output, 160); + __lsx_vst(step3, output, 176); + __lsx_vst(step4, output, 192); + __lsx_vst(step5, output, 208); + __lsx_vst(step6, output, 224); + __lsx_vst(step7, output, 240); + + __lsx_vst(in8, output, 256); + __lsx_vst(in9, output, 272); + __lsx_vst(in10, output, 288); + __lsx_vst(in11, output, 304); + __lsx_vst(in12, output, 320); + __lsx_vst(in13, output, 336); + __lsx_vst(in14, output, 352); + __lsx_vst(in15, output, 368); +} + +static void fdct8x32_1d_row_even_4x(int16_t *input, int16_t *interm_ptr, + int16_t *out) { + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + __m128i in8, in9, in10, in11, in12, in13, in14, in15; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + __m128i vec0_l, vec1_l, vec2_l, vec3_l, vec4_l, vec5_l, vec6_l, vec7_l; + __m128i vec0_r, vec1_r, vec2_r, vec3_r, vec4_r, vec5_r, vec6_r, vec7_r; + __m128i tmp0_w, tmp1_w, tmp2_w, tmp3_w; + + /* fdct32 even */ + /* stage 2 */ + DUP4_ARG2(__lsx_vld, input, 0, input, 16, input, 32, input, 48, in0, in1, in2, + in3); + DUP4_ARG2(__lsx_vld, input, 64, input, 80, input, 96, input, 112, in4, in5, + in6, in7); + DUP4_ARG2(__lsx_vld, input, 128, input, 144, input, 160, input, 176, in8, in9, + in10, in11); + DUP4_ARG2(__lsx_vld, input, 192, input, 208, input, 224, input, 240, in12, + in13, in14, in15); + + LSX_BUTTERFLY_16_H(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, + in11, in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4, + vec5, vec6, vec7, in8, in9, in10, in11, in12, in13, in14, + in15); + + __lsx_vst(vec0, interm_ptr, 0); + __lsx_vst(vec1, interm_ptr, 16); + __lsx_vst(vec2, interm_ptr, 32); + __lsx_vst(vec3, interm_ptr, 48); + __lsx_vst(vec4, interm_ptr, 64); + __lsx_vst(vec5, interm_ptr, 80); + __lsx_vst(vec6, interm_ptr, 96); + __lsx_vst(vec7, interm_ptr, 112); + + __lsx_vst(in8, interm_ptr, 128); + __lsx_vst(in9, interm_ptr, 144); + __lsx_vst(in10, interm_ptr, 160); + __lsx_vst(in11, interm_ptr, 176); + __lsx_vst(in12, interm_ptr, 192); + __lsx_vst(in13, interm_ptr, 208); + __lsx_vst(in14, interm_ptr, 224); + __lsx_vst(in15, interm_ptr, 240); + + /* Stage 3 */ + UNPCK_SH_SW(vec0, vec0_l, vec0_r); + UNPCK_SH_SW(vec1, vec1_l, vec1_r); + UNPCK_SH_SW(vec2, vec2_l, vec2_r); + UNPCK_SH_SW(vec3, vec3_l, vec3_r); + UNPCK_SH_SW(vec4, vec4_l, vec4_r); + UNPCK_SH_SW(vec5, vec5_l, vec5_r); + UNPCK_SH_SW(vec6, vec6_l, vec6_r); + UNPCK_SH_SW(vec7, vec7_l, vec7_r); + DUP4_ARG2(__lsx_vadd_w, vec0_r, vec7_r, vec1_r, vec6_r, vec2_r, vec5_r, + vec3_r, vec4_r, tmp0_w, tmp1_w, tmp2_w, tmp3_w); + LSX_BUTTERFLY_4_W(tmp0_w, tmp1_w, tmp2_w, tmp3_w, vec4_r, vec6_r, vec7_r, + vec5_r); + DUP4_ARG2(__lsx_vadd_w, vec0_l, vec7_l, vec1_l, vec6_l, vec2_l, vec5_l, + vec3_l, vec4_l, vec0_r, vec1_r, vec2_r, vec3_r); + + tmp3_w = __lsx_vadd_w(vec0_r, vec3_r); + vec0_r = __lsx_vsub_w(vec0_r, vec3_r); + vec3_r = __lsx_vadd_w(vec1_r, vec2_r); + vec1_r = __lsx_vsub_w(vec1_r, vec2_r); + + DOTP_CONST_PAIR_W(vec4_r, vec6_r, tmp3_w, vec3_r, cospi_16_64, cospi_16_64, + vec4_r, tmp3_w, vec6_r, vec3_r); + FDCT32_POSTPROC_NEG_W(vec4_r); + FDCT32_POSTPROC_NEG_W(tmp3_w); + FDCT32_POSTPROC_NEG_W(vec6_r); + FDCT32_POSTPROC_NEG_W(vec3_r); + DUP2_ARG2(__lsx_vpickev_h, vec4_r, tmp3_w, vec6_r, vec3_r, vec4, vec5); + __lsx_vst(vec5, out, 0); + __lsx_vst(vec4, out, 16); + + DOTP_CONST_PAIR_W(vec5_r, vec7_r, vec0_r, vec1_r, cospi_24_64, cospi_8_64, + vec4_r, tmp3_w, vec6_r, vec3_r); + FDCT32_POSTPROC_NEG_W(vec4_r); + FDCT32_POSTPROC_NEG_W(tmp3_w); + FDCT32_POSTPROC_NEG_W(vec6_r); + FDCT32_POSTPROC_NEG_W(vec3_r); + DUP2_ARG2(__lsx_vpickev_h, vec4_r, tmp3_w, vec6_r, vec3_r, vec4, vec5); + __lsx_vst(vec5, out, 32); + __lsx_vst(vec4, out, 48); + + DUP4_ARG2(__lsx_vld, interm_ptr, 0, interm_ptr, 16, interm_ptr, 32, + interm_ptr, 48, vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vld, interm_ptr, 64, interm_ptr, 80, interm_ptr, 96, + interm_ptr, 112, vec4, vec5, vec6, vec7); + DUP4_ARG2(__lsx_vsub_h, vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, + vec5, vec6, vec7); + DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6); + DUP2_ARG2(__lsx_vadd_h, vec4, vec5, vec7, vec6, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, in5, in4); + FDCT_POSTPROC_2V_NEG_H(in4, in5); + __lsx_vst(in4, out, 64); + __lsx_vst(in5, out, 112); + + DUP2_ARG2(__lsx_vsub_h, vec4, vec5, vec7, vec6, vec4, vec7); + DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, in5, in4); + FDCT_POSTPROC_2V_NEG_H(in4, in5); + __lsx_vst(in4, out, 80); + __lsx_vst(in5, out, 96); + + DUP4_ARG2(__lsx_vld, interm_ptr, 128, interm_ptr, 144, interm_ptr, 160, + interm_ptr, 176, in8, in9, in10, in11); + DUP4_ARG2(__lsx_vld, interm_ptr, 192, interm_ptr, 208, interm_ptr, 224, + interm_ptr, 240, in12, in13, in14, in15); + DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5); + DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4); + DUP4_ARG2(__lsx_vadd_h, in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, + vec1, vec6, in2); + DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3); + DUP2_ARG2(__lsx_vadd_h, in0, in1, in2, in3, vec0, vec7); + DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, in5, in4); + FDCT_POSTPROC_2V_NEG_H(in4, in5); + __lsx_vst(in4, out, 128); + __lsx_vst(in5, out, 240); + + DUP2_ARG2(__lsx_vsub_h, in0, in1, in2, in3, in0, in2); + DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, in5, in4); + FDCT_POSTPROC_2V_NEG_H(in4, in5); + __lsx_vst(in4, out, 144); + __lsx_vst(in5, out, 224); + + DUP2_ARG2(__lsx_vsub_h, in9, vec2, in14, vec5, vec2, vec5); + tmp0_w = __lsx_vneg_h(vec2); + DOTP_CONST_PAIR(tmp0_w, vec5, cospi_24_64, cospi_8_64, in2, in1); + DUP4_ARG2(__lsx_vsub_h, in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, + vec2, vec5); + DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, in5, in4); + FDCT_POSTPROC_2V_NEG_H(in4, in5); + __lsx_vst(in4, out, 160); + __lsx_vst(in5, out, 208); + + DUP2_ARG2(__lsx_vadd_h, in3, in2, in0, in1, vec3, vec4); + DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, in4, in5); + FDCT_POSTPROC_2V_NEG_H(in4, in5); + __lsx_vst(in4, out, 192); + __lsx_vst(in5, out, 176); +} + +static void fdct8x32_1d_row_even(int16_t *temp, int16_t *out) { + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + __m128i in8, in9, in10, in11, in12, in13, in14, in15; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, temp0, temp1; + + /* fdct32 even */ + /* stage 2 */ + DUP4_ARG2(__lsx_vld, temp, 0, temp, 16, temp, 32, temp, 48, in0, in1, in2, + in3); + DUP4_ARG2(__lsx_vld, temp, 64, temp, 80, temp, 96, temp, 112, in4, in5, in6, + in7); + DUP4_ARG2(__lsx_vld, temp, 128, temp, 144, temp, 160, temp, 176, in8, in9, + in10, in11); + DUP4_ARG2(__lsx_vld, temp, 192, temp, 208, temp, 224, temp, 240, in12, in13, + in14, in15); + + LSX_BUTTERFLY_16_H(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, + in11, in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4, + vec5, vec6, vec7, in8, in9, in10, in11, in12, in13, in14, + in15); + /* Stage 3 */ + DUP4_ARG2(__lsx_vadd_h, vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, + in1, in2, in3); + LSX_BUTTERFLY_4_H(in0, in1, in2, in3, temp0, in4, in1, in0); + DOTP_CONST_PAIR(temp0, in4, cospi_16_64, cospi_16_64, temp1, temp0); + FDCT_POSTPROC_2V_NEG_H(temp0, temp1); + __lsx_vst(temp0, out, 0); + __lsx_vst(temp1, out, 16); + + DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0); + FDCT_POSTPROC_2V_NEG_H(temp0, temp1); + __lsx_vst(temp0, out, 32); + __lsx_vst(temp1, out, 48); + + DUP4_ARG2(__lsx_vsub_h, vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, + vec5, vec6, vec7); + DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6); + DUP2_ARG2(__lsx_vadd_h, vec4, vec5, vec7, vec6, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0); + FDCT_POSTPROC_2V_NEG_H(temp0, temp1); + __lsx_vst(temp0, out, 64); + __lsx_vst(temp1, out, 112); + + DUP2_ARG2(__lsx_vsub_h, vec4, vec5, vec7, vec6, vec4, vec7); + DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0); + FDCT_POSTPROC_2V_NEG_H(temp0, temp1); + __lsx_vst(temp0, out, 80); + __lsx_vst(temp1, out, 96); + + DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5); + DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4); + DUP4_ARG2(__lsx_vadd_h, in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, + vec1, vec6, in2); + DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3); + DUP2_ARG2(__lsx_vadd_h, in0, in1, in2, in3, vec0, vec7); + DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0); + FDCT_POSTPROC_2V_NEG_H(temp0, temp1); + __lsx_vst(temp0, out, 128); + __lsx_vst(temp1, out, 240); + + DUP2_ARG2(__lsx_vsub_h, in0, in1, in2, in3, in0, in2); + DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0); + FDCT_POSTPROC_2V_NEG_H(temp0, temp1); + __lsx_vst(temp0, out, 144); + __lsx_vst(temp1, out, 224); + + DUP2_ARG2(__lsx_vsub_h, in9, vec2, in14, vec5, vec2, vec5); + temp0 = __lsx_vneg_h(vec2); + DOTP_CONST_PAIR(temp0, vec5, cospi_24_64, cospi_8_64, in2, in1); + DUP4_ARG2(__lsx_vsub_h, in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, + vec2, vec5) + DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0); + FDCT_POSTPROC_2V_NEG_H(temp0, temp1); + __lsx_vst(temp0, out, 160); + __lsx_vst(temp1, out, 208); + + DUP2_ARG2(__lsx_vadd_h, in3, in2, in0, in1, vec3, vec4); + DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1); + FDCT_POSTPROC_2V_NEG_H(temp0, temp1); + __lsx_vst(temp0, out, 192); + __lsx_vst(temp1, out, 176); +} + +static void fdct8x32_1d_row_odd(int16_t *temp, int16_t *interm_ptr, + int16_t *out) { + __m128i in16, in17, in18, in19, in20, in21, in22, in23; + __m128i in24, in25, in26, in27, in28, in29, in30, in31, vec4, vec5; + __m128i tmp0, tmp1; + + in20 = __lsx_vld(temp, 64); + in21 = __lsx_vld(temp, 80); + in26 = __lsx_vld(temp, 160); + in27 = __lsx_vld(temp, 176); + + DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27); + DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26); + + in18 = __lsx_vld(temp, 32); + in19 = __lsx_vld(temp, 48); + in28 = __lsx_vld(temp, 192); + in29 = __lsx_vld(temp, 208); + + vec4 = __lsx_vsub_h(in19, in20); + __lsx_vst(vec4, interm_ptr, 64); + vec4 = __lsx_vsub_h(in18, in21); + __lsx_vst(vec4, interm_ptr, 176); + vec4 = __lsx_vsub_h(in28, in27); + __lsx_vst(vec4, interm_ptr, 112); + vec4 = __lsx_vsub_h(in29, in26); + __lsx_vst(vec4, interm_ptr, 128); + + DUP4_ARG2(__lsx_vadd_h, in18, in21, in19, in20, in28, in27, in29, in26, in21, + in20, in27, in26); + + in22 = __lsx_vld(temp, 96); + in23 = __lsx_vld(temp, 112); + in24 = __lsx_vld(temp, 128); + in25 = __lsx_vld(temp, 144); + + DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25); + DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24); + + in16 = __lsx_vld(temp, 0); + in17 = __lsx_vld(temp, 16); + in30 = __lsx_vld(temp, 224); + in31 = __lsx_vld(temp, 240); + + vec4 = __lsx_vsub_h(in17, in22); + __lsx_vst(vec4, interm_ptr, 80); + vec4 = __lsx_vsub_h(in30, in25); + __lsx_vst(vec4, interm_ptr, 96); + vec4 = __lsx_vsub_h(in31, in24); + __lsx_vst(vec4, interm_ptr, 144); + vec4 = __lsx_vsub_h(in16, in23); + __lsx_vst(vec4, interm_ptr, 160); + + DUP4_ARG2(__lsx_vadd_h, in16, in23, in17, in22, in30, in25, in31, in24, in16, + in17, in30, in31); + DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29); + DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28); + + DUP4_ARG2(__lsx_vadd_h, in16, in19, in17, in18, in30, in29, in31, in28, in27, + in22, in21, in25); + DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24); + DUP2_ARG2(__lsx_vadd_h, in27, in26, in25, in24, in23, in20); + + DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5); + FDCT_POSTPROC_2V_NEG_H(vec5, vec4); + __lsx_vst(vec5, out, 0); + __lsx_vst(vec4, out, 240); + + DUP2_ARG2(__lsx_vsub_h, in27, in26, in25, in24, in22, in21); + + DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4); + FDCT_POSTPROC_2V_NEG_H(vec5, vec4); + __lsx_vst(vec5, out, 224); + __lsx_vst(vec4, out, 16); + + DUP4_ARG2(__lsx_vsub_h, in17, in18, in16, in19, in31, in28, in30, in29, in23, + in26, in24, in20); + tmp0 = __lsx_vneg_h(in23); + DOTP_CONST_PAIR(tmp0, in20, cospi_28_64, cospi_4_64, in27, in25); + DUP2_ARG2(__lsx_vsub_h, in26, in27, in24, in25, in23, in20); + + DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5); + FDCT_POSTPROC_2V_NEG_H(vec5, vec4); + __lsx_vst(vec4, out, 32); + __lsx_vst(vec5, out, 208); + + DUP2_ARG2(__lsx_vadd_h, in26, in27, in24, in25, in22, in21); + DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5); + FDCT_POSTPROC_2V_NEG_H(vec5, vec4); + __lsx_vst(vec4, out, 48); + __lsx_vst(vec5, out, 192); + + in20 = __lsx_vld(interm_ptr, 64); + in21 = __lsx_vld(interm_ptr, 176); + in27 = __lsx_vld(interm_ptr, 112); + in26 = __lsx_vld(interm_ptr, 128); + + in16 = in20; + in17 = in21; + DUP2_ARG1(__lsx_vneg_h, in16, in17, tmp0, tmp1); + DOTP_CONST_PAIR(tmp0, in27, cospi_24_64, cospi_8_64, in20, in27); + DOTP_CONST_PAIR(tmp1, in26, cospi_24_64, cospi_8_64, in21, in26); + + in22 = __lsx_vld(interm_ptr, 80); + in25 = __lsx_vld(interm_ptr, 96); + in24 = __lsx_vld(interm_ptr, 144); + in23 = __lsx_vld(interm_ptr, 160); + + DUP4_ARG2(__lsx_vsub_h, in23, in20, in22, in21, in25, in26, in24, in27, in28, + in17, in18, in31); + DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30); + DUP2_ARG2(__lsx_vadd_h, in28, in29, in31, in30, in16, in19); + DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4); + FDCT_POSTPROC_2V_NEG_H(vec5, vec4); + __lsx_vst(vec5, out, 64); + __lsx_vst(vec4, out, 176); + + DUP2_ARG2(__lsx_vsub_h, in28, in29, in31, in30, in17, in18); + DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4); + FDCT_POSTPROC_2V_NEG_H(vec5, vec4); + __lsx_vst(vec5, out, 80); + __lsx_vst(vec4, out, 160); + + DUP4_ARG2(__lsx_vadd_h, in22, in21, in23, in20, in24, in27, in25, in26, in16, + in29, in30, in19); + tmp0 = __lsx_vneg_h(in16); + DOTP_CONST_PAIR(tmp0, in19, cospi_12_64, cospi_20_64, in28, in31); + DUP2_ARG2(__lsx_vsub_h, in29, in28, in30, in31, in16, in19); + + DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4); + FDCT_POSTPROC_2V_NEG_H(vec5, vec4); + __lsx_vst(vec5, out, 144); + __lsx_vst(vec4, out, 96); + + DUP2_ARG2(__lsx_vadd_h, in29, in28, in30, in31, in17, in18); + + DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4); + FDCT_POSTPROC_2V_NEG_H(vec5, vec4); + __lsx_vst(vec4, out, 112); + __lsx_vst(vec5, out, 128); +} + +static void fdct8x32_1d_row_transpose_store(int16_t *temp, int16_t *output) { + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + __m128i in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1; + + /* 1st set */ + in0 = __lsx_vld(temp, 0); + in4 = __lsx_vld(temp, 64); + in2 = __lsx_vld(temp, 128); + in6 = __lsx_vld(temp, 192); + in1 = __lsx_vld(temp, 256); + in7 = __lsx_vld(temp, 304); + in3 = __lsx_vld(temp, 384); + in5 = __lsx_vld(temp, 432); + + LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + + /* 2nd set */ + in0_1 = __lsx_vld(temp, 32); + in1_1 = __lsx_vld(temp, 464); + in2_1 = __lsx_vld(temp, 160); + in3_1 = __lsx_vld(temp, 336); + in4_1 = __lsx_vld(temp, 96); + in5_1 = __lsx_vld(temp, 352); + in6_1 = __lsx_vld(temp, 224); + in7_1 = __lsx_vld(temp, 480); + + __lsx_vst(in0, output, 0); + __lsx_vst(in1, output, 64); + __lsx_vst(in2, output, 128); + __lsx_vst(in3, output, 192); + __lsx_vst(in4, output, 256); + __lsx_vst(in5, output, 320); + __lsx_vst(in6, output, 384); + __lsx_vst(in7, output, 448); + + LSX_TRANSPOSE8x8_H(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, + in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1); + + /* 3rd set */ + in0 = __lsx_vld(temp, 16); + in1 = __lsx_vld(temp, 272); + in2 = __lsx_vld(temp, 144); + in3 = __lsx_vld(temp, 400); + in4 = __lsx_vld(temp, 80); + in5 = __lsx_vld(temp, 416); + in6 = __lsx_vld(temp, 208); + in7 = __lsx_vld(temp, 288); + + __lsx_vst(in0_1, output, 16); + __lsx_vst(in1_1, output, 80); + __lsx_vst(in2_1, output, 144); + __lsx_vst(in3_1, output, 208); + __lsx_vst(in4_1, output, 272); + __lsx_vst(in5_1, output, 336); + __lsx_vst(in6_1, output, 400); + __lsx_vst(in7_1, output, 464); + + LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + + __lsx_vst(in0, output, 32); + __lsx_vst(in1, output, 96); + __lsx_vst(in2, output, 160); + __lsx_vst(in3, output, 224); + __lsx_vst(in4, output, 288); + __lsx_vst(in5, output, 352); + __lsx_vst(in6, output, 416); + __lsx_vst(in7, output, 480); + + /* 4th set */ + in0_1 = __lsx_vld(temp, 48); + in1_1 = __lsx_vld(temp, 448); + in2_1 = __lsx_vld(temp, 176); + in3_1 = __lsx_vld(temp, 320); + in4_1 = __lsx_vld(temp, 112); + in5_1 = __lsx_vld(temp, 368); + in6_1 = __lsx_vld(temp, 240); + in7_1 = __lsx_vld(temp, 496); + + LSX_TRANSPOSE8x8_H(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, + in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1); + + __lsx_vst(in0_1, output, 48); + __lsx_vst(in1_1, output, 112); + __lsx_vst(in2_1, output, 176); + __lsx_vst(in3_1, output, 240); + __lsx_vst(in4_1, output, 304); + __lsx_vst(in5_1, output, 368); + __lsx_vst(in6_1, output, 432); + __lsx_vst(in7_1, output, 496); +} + +static void fdct32x8_1d_row(int16_t *temp, int16_t *temp_buf, int16_t *output) { + fdct8x32_1d_row_load_butterfly(temp, temp_buf); + fdct8x32_1d_row_even(temp_buf, temp_buf); + fdct8x32_1d_row_odd(temp_buf + 128, temp, temp_buf + 128); + fdct8x32_1d_row_transpose_store(temp_buf, output); +} + +static void fdct32x8_1d_row_4x(int16_t *tmp_buf_big, int16_t *tmp_buf, + int16_t *output) { + fdct8x32_1d_row_load_butterfly(tmp_buf_big, tmp_buf); + fdct8x32_1d_row_even_4x(tmp_buf, tmp_buf_big, tmp_buf); + fdct8x32_1d_row_odd(tmp_buf + 128, tmp_buf_big, tmp_buf + 128); + fdct8x32_1d_row_transpose_store(tmp_buf, output); +} + +void vpx_fdct32x32_lsx(const int16_t *input, int16_t *output, + int32_t src_stride) { + int i; + DECLARE_ALIGNED(32, int16_t, tmp_buf_big[1024]); + DECLARE_ALIGNED(32, int16_t, tmp_buf[256]); + + /* column transform */ + for (i = 0; i < 4; ++i) { + fdct8x32_1d_column(input + (8 * i), src_stride, tmp_buf, + tmp_buf_big + (8 * i)); + } + + /* row transform */ + fdct32x8_1d_row_4x(tmp_buf_big, tmp_buf, output); + + /* row transform */ + for (i = 1; i < 4; ++i) { + fdct32x8_1d_row(tmp_buf_big + (i * 256), tmp_buf, output + (i * 256)); + } +} + +static void fdct8x32_1d_row_even_rd(int16_t *temp, int16_t *out) { + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + __m128i in8, in9, in10, in11, in12, in13, in14, in15; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, temp0, temp1; + + /* fdct32 even */ + /* stage 2 */ + DUP4_ARG2(__lsx_vld, temp, 0, temp, 16, temp, 32, temp, 48, in0, in1, in2, + in3); + DUP4_ARG2(__lsx_vld, temp, 64, temp, 80, temp, 96, temp, 112, in4, in5, in6, + in7); + DUP4_ARG2(__lsx_vld, temp, 128, temp, 144, temp, 160, temp, 176, in8, in9, + in10, in11); + DUP4_ARG2(__lsx_vld, temp, 192, temp, 208, temp, 224, temp, 240, in12, in13, + in14, in15); + LSX_BUTTERFLY_16_H(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, + in11, in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4, + vec5, vec6, vec7, in8, in9, in10, in11, in12, in13, in14, + in15); + + FDCT_POSTPROC_2V_NEG_H(vec0, vec1); + FDCT_POSTPROC_2V_NEG_H(vec2, vec3); + FDCT_POSTPROC_2V_NEG_H(vec4, vec5); + FDCT_POSTPROC_2V_NEG_H(vec6, vec7); + FDCT_POSTPROC_2V_NEG_H(in8, in9); + FDCT_POSTPROC_2V_NEG_H(in10, in11); + FDCT_POSTPROC_2V_NEG_H(in12, in13); + FDCT_POSTPROC_2V_NEG_H(in14, in15); + + /* Stage 3 */ + DUP4_ARG2(__lsx_vadd_h, vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, + in1, in2, in3); + + temp0 = __lsx_vadd_h(in0, in3); + in0 = __lsx_vsub_h(in0, in3); + in3 = __lsx_vadd_h(in1, in2); + in1 = __lsx_vsub_h(in1, in2); + + DOTP_CONST_PAIR(temp0, in3, cospi_16_64, cospi_16_64, temp1, temp0); + __lsx_vst(temp0, out, 0); + __lsx_vst(temp1, out, 16); + + DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0); + __lsx_vst(temp0, out, 32); + __lsx_vst(temp1, out, 48); + + DUP4_ARG2(__lsx_vsub_h, vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, + vec5, vec6, vec7); + DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6); + DUP2_ARG2(__lsx_vadd_h, vec4, vec5, vec7, vec6, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0); + __lsx_vst(temp0, out, 64); + __lsx_vst(temp1, out, 112); + + DUP2_ARG2(__lsx_vsub_h, vec4, vec5, vec7, vec6, vec4, vec7); + DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0); + __lsx_vst(temp0, out, 80); + __lsx_vst(temp1, out, 96); + + DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5); + DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4); + DUP4_ARG2(__lsx_vadd_h, in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, + vec1, vec6, in2); + DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3); + DUP2_ARG2(__lsx_vadd_h, in0, in1, in2, in3, vec0, vec7); + DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0); + __lsx_vst(temp0, out, 128); + __lsx_vst(temp1, out, 240); + + DUP2_ARG2(__lsx_vsub_h, in0, in1, in2, in3, in0, in2); + DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0); + __lsx_vst(temp0, out, 144); + __lsx_vst(temp1, out, 224); + + DUP2_ARG2(__lsx_vsub_h, in9, vec2, in14, vec5, vec2, vec5); + temp0 = __lsx_vneg_h(vec2); + DOTP_CONST_PAIR(temp0, vec5, cospi_24_64, cospi_8_64, in2, in1); + DUP4_ARG2(__lsx_vsub_h, in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, + vec2, vec5); + DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0); + __lsx_vst(temp0, out, 160); + __lsx_vst(temp1, out, 208); + + DUP2_ARG2(__lsx_vadd_h, in3, in2, in0, in1, vec3, vec4); + DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1); + __lsx_vst(temp0, out, 192); + __lsx_vst(temp1, out, 176); +} + +static void fdct8x32_1d_row_odd_rd(int16_t *temp, int16_t *interm_ptr, + int16_t *out) { + __m128i in16, in17, in18, in19, in20, in21, in22, in23; + __m128i in24, in25, in26, in27, in28, in29, in30, in31; + __m128i vec4, vec5, tmp0, tmp1; + + in20 = __lsx_vld(temp, 64); + in21 = __lsx_vld(temp, 80); + in26 = __lsx_vld(temp, 160); + in27 = __lsx_vld(temp, 176); + + DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27); + DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26); + + FDCT_POSTPROC_2V_NEG_H(in20, in21); + FDCT_POSTPROC_2V_NEG_H(in26, in27); + + in18 = __lsx_vld(temp, 32); + in19 = __lsx_vld(temp, 48); + in28 = __lsx_vld(temp, 192); + in29 = __lsx_vld(temp, 208); + + FDCT_POSTPROC_2V_NEG_H(in18, in19); + FDCT_POSTPROC_2V_NEG_H(in28, in29); + + vec4 = __lsx_vsub_h(in19, in20); + __lsx_vst(vec4, interm_ptr, 64); + vec4 = __lsx_vsub_h(in18, in21); + __lsx_vst(vec4, interm_ptr, 176); + vec4 = __lsx_vsub_h(in29, in26); + __lsx_vst(vec4, interm_ptr, 128); + vec4 = __lsx_vsub_h(in28, in27); + __lsx_vst(vec4, interm_ptr, 112); + + DUP4_ARG2(__lsx_vadd_h, in18, in21, in19, in20, in28, in27, in29, in26, in21, + in20, in27, in26); + + in22 = __lsx_vld(temp, 96); + in23 = __lsx_vld(temp, 112); + in24 = __lsx_vld(temp, 128); + in25 = __lsx_vld(temp, 144); + + DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25); + DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24); + FDCT_POSTPROC_2V_NEG_H(in22, in23); + FDCT_POSTPROC_2V_NEG_H(in24, in25); + + in16 = __lsx_vld(temp, 0); + in17 = __lsx_vld(temp, 16); + in30 = __lsx_vld(temp, 224); + in31 = __lsx_vld(temp, 240); + + FDCT_POSTPROC_2V_NEG_H(in16, in17); + FDCT_POSTPROC_2V_NEG_H(in30, in31); + + vec4 = __lsx_vsub_h(in17, in22); + __lsx_vst(vec4, interm_ptr, 80); + vec4 = __lsx_vsub_h(in30, in25); + __lsx_vst(vec4, interm_ptr, 96); + vec4 = __lsx_vsub_h(in31, in24); + __lsx_vst(vec4, interm_ptr, 144); + vec4 = __lsx_vsub_h(in16, in23); + __lsx_vst(vec4, interm_ptr, 160); + + DUP4_ARG2(__lsx_vadd_h, in16, in23, in17, in22, in30, in25, in31, in24, in16, + in17, in30, in31); + DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29); + DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28); + DUP4_ARG2(__lsx_vadd_h, in16, in19, in17, in18, in30, in29, in31, in28, in27, + in22, in21, in25); + DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24); + DUP2_ARG2(__lsx_vadd_h, in27, in26, in25, in24, in23, in20); + DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5); + __lsx_vst(vec5, out, 0); + __lsx_vst(vec4, out, 240); + + DUP2_ARG2(__lsx_vsub_h, in27, in26, in25, in24, in22, in21); + DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4); + __lsx_vst(vec5, out, 224); + __lsx_vst(vec4, out, 16); + + DUP4_ARG2(__lsx_vsub_h, in17, in18, in16, in19, in31, in28, in30, in29, in23, + in26, in24, in20); + tmp0 = __lsx_vneg_h(in23); + DOTP_CONST_PAIR(tmp0, in20, cospi_28_64, cospi_4_64, in27, in25); + DUP2_ARG2(__lsx_vsub_h, in26, in27, in24, in25, in23, in20); + DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5); + __lsx_vst(vec4, out, 32); + __lsx_vst(vec5, out, 208); + + DUP2_ARG2(__lsx_vadd_h, in26, in27, in24, in25, in22, in21); + DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5); + __lsx_vst(vec4, out, 48); + __lsx_vst(vec5, out, 192); + + in20 = __lsx_vld(interm_ptr, 64); + in21 = __lsx_vld(interm_ptr, 176); + in27 = __lsx_vld(interm_ptr, 112); + in26 = __lsx_vld(interm_ptr, 128); + + in16 = in20; + in17 = in21; + DUP2_ARG1(__lsx_vneg_h, in16, in17, tmp0, tmp1); + DOTP_CONST_PAIR(tmp0, in27, cospi_24_64, cospi_8_64, in20, in27); + DOTP_CONST_PAIR(tmp1, in26, cospi_24_64, cospi_8_64, in21, in26); + + in22 = __lsx_vld(interm_ptr, 80); + in25 = __lsx_vld(interm_ptr, 96); + in24 = __lsx_vld(interm_ptr, 144); + in23 = __lsx_vld(interm_ptr, 160); + + DUP4_ARG2(__lsx_vsub_h, in23, in20, in22, in21, in25, in26, in24, in27, in28, + in17, in18, in31); + DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30); + in16 = __lsx_vadd_h(in28, in29); + in19 = __lsx_vadd_h(in31, in30); + DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4); + __lsx_vst(vec5, out, 64); + __lsx_vst(vec4, out, 176); + + DUP2_ARG2(__lsx_vsub_h, in28, in29, in31, in30, in17, in18); + DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4); + __lsx_vst(vec5, out, 80); + __lsx_vst(vec4, out, 160); + + DUP4_ARG2(__lsx_vadd_h, in22, in21, in23, in20, in24, in27, in25, in26, in16, + in29, in30, in19); + tmp0 = __lsx_vneg_h(in16); + DOTP_CONST_PAIR(tmp0, in19, cospi_12_64, cospi_20_64, in28, in31); + DUP2_ARG2(__lsx_vsub_h, in29, in28, in30, in31, in16, in19); + DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4); + __lsx_vst(vec5, out, 144); + __lsx_vst(vec4, out, 96); + + DUP2_ARG2(__lsx_vadd_h, in29, in28, in30, in31, in17, in18); + DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4); + __lsx_vst(vec4, out, 112); + __lsx_vst(vec5, out, 128); +} + +static void fdct32x8_1d_row_rd(int16_t *tmp_buf_big, int16_t *tmp_buf, + int16_t *output) { + fdct8x32_1d_row_load_butterfly(tmp_buf_big, tmp_buf); + fdct8x32_1d_row_even_rd(tmp_buf, tmp_buf); + fdct8x32_1d_row_odd_rd((tmp_buf + 128), tmp_buf_big, (tmp_buf + 128)); + fdct8x32_1d_row_transpose_store(tmp_buf, output); +} + +void vpx_fdct32x32_rd_lsx(const int16_t *input, int16_t *out, + int32_t src_stride) { + int32_t i; + DECLARE_ALIGNED(32, int16_t, tmp_buf_big[1024]); + DECLARE_ALIGNED(32, int16_t, tmp_buf[256]); + + /* column transform */ + for (i = 0; i < 4; ++i) { + fdct8x32_1d_column(input + (8 * i), src_stride, &tmp_buf[0], + &tmp_buf_big[0] + (8 * i)); + } + /* row transform */ + for (i = 0; i < 4; ++i) { + fdct32x8_1d_row_rd(&tmp_buf_big[0] + (8 * i * 32), &tmp_buf[0], + out + (8 * i * 32)); + } +} diff --git a/vpx_dsp/loongarch/fwd_txfm_lsx.h b/vpx_dsp/loongarch/fwd_txfm_lsx.h new file mode 100644 index 0000000000..0e59852c42 --- /dev/null +++ b/vpx_dsp/loongarch/fwd_txfm_lsx.h @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_LOONGARCH_FWD_TXFM_LSX_H_ +#define VPX_VPX_DSP_LOONGARCH_FWD_TXFM_LSX_H_ + +#include "vpx_dsp/loongarch/txfm_macros_lsx.h" +#include "vpx_dsp/txfm_common.h" + +#define FDCT32_POSTPROC_2V_POS_H(vec0, vec1) \ + { \ + __m128i tp0_m, tp1_m; \ + __m128i one = __lsx_vreplgr2vr_h(1); \ + \ + tp0_m = __lsx_vslei_h(vec0, 0); \ + tp1_m = __lsx_vslei_h(vec1, 0); \ + tp0_m = __lsx_vxori_b(tp0_m, 255); \ + tp1_m = __lsx_vxori_b(tp1_m, 255); \ + vec0 = __lsx_vadd_h(vec0, one); \ + vec1 = __lsx_vadd_h(vec1, one); \ + tp0_m = __lsx_vand_v(one, tp0_m); \ + tp1_m = __lsx_vand_v(one, tp1_m); \ + vec0 = __lsx_vadd_h(vec0, tp0_m); \ + vec1 = __lsx_vadd_h(vec1, tp1_m); \ + vec0 = __lsx_vsrai_h(vec0, 2); \ + vec1 = __lsx_vsrai_h(vec1, 2); \ + } + +#define FDCT_POSTPROC_2V_NEG_H(vec0, vec1) \ + { \ + __m128i tp0_m, tp1_m; \ + __m128i one_m = __lsx_vldi(0x401); \ + \ + tp0_m = __lsx_vslti_h(vec0, 0); \ + tp1_m = __lsx_vslti_h(vec1, 0); \ + vec0 = __lsx_vadd_h(vec0, one_m); \ + vec1 = __lsx_vadd_h(vec1, one_m); \ + tp0_m = __lsx_vand_v(one_m, tp0_m); \ + tp1_m = __lsx_vand_v(one_m, tp1_m); \ + vec0 = __lsx_vadd_h(vec0, tp0_m); \ + vec1 = __lsx_vadd_h(vec1, tp1_m); \ + vec0 = __lsx_vsrai_h(vec0, 2); \ + vec1 = __lsx_vsrai_h(vec1, 2); \ + } + +#define FDCT32_POSTPROC_NEG_W(vec) \ + { \ + __m128i temp_m; \ + __m128i one_m = __lsx_vreplgr2vr_w(1); \ + \ + temp_m = __lsx_vslti_w(vec, 0); \ + vec = __lsx_vadd_w(vec, one_m); \ + temp_m = __lsx_vand_v(one_m, temp_m); \ + vec = __lsx_vadd_w(vec, temp_m); \ + vec = __lsx_vsrai_w(vec, 2); \ + } + +#define DOTP_CONST_PAIR_W(reg0_left, reg1_left, reg0_right, reg1_right, \ + const0, const1, out0, out1, out2, out3) \ + { \ + __m128i s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m; \ + __m128i tp0_m, tp1_m, tp2_m, tp3_m, _tmp0, _tmp1; \ + __m128i k0_m = __lsx_vreplgr2vr_w((int32_t)const0); \ + \ + s0_m = __lsx_vreplgr2vr_w((int32_t)const1); \ + k0_m = __lsx_vpackev_w(s0_m, k0_m); \ + \ + DUP2_ARG1(__lsx_vneg_w, reg1_left, reg1_right, _tmp0, _tmp1); \ + s1_m = __lsx_vilvl_w(_tmp0, reg0_left); \ + s0_m = __lsx_vilvh_w(_tmp0, reg0_left); \ + s3_m = __lsx_vilvl_w(reg0_left, reg1_left); \ + s2_m = __lsx_vilvh_w(reg0_left, reg1_left); \ + s5_m = __lsx_vilvl_w(_tmp1, reg0_right); \ + s4_m = __lsx_vilvh_w(_tmp1, reg0_right); \ + s7_m = __lsx_vilvl_w(reg0_right, reg1_right); \ + s6_m = __lsx_vilvh_w(reg0_right, reg1_right); \ + DUP2_ARG2(__lsx_vdp2_d_w, s0_m, k0_m, s1_m, k0_m, tp0_m, tp1_m); \ + DUP2_ARG2(__lsx_vdp2_d_w, s4_m, k0_m, s5_m, k0_m, tp2_m, tp3_m); \ + DUP2_ARG3(__lsx_vssrarni_w_d, tp0_m, tp1_m, DCT_CONST_BITS, tp2_m, tp3_m, \ + DCT_CONST_BITS, out0, out1); \ + DUP2_ARG2(__lsx_vdp2_d_w, s2_m, k0_m, s3_m, k0_m, tp0_m, tp1_m); \ + DUP2_ARG2(__lsx_vdp2_d_w, s6_m, k0_m, s7_m, k0_m, tp2_m, tp3_m); \ + DUP2_ARG3(__lsx_vssrarni_w_d, tp0_m, tp1_m, DCT_CONST_BITS, tp2_m, tp3_m, \ + DCT_CONST_BITS, out2, out3); \ + } + +#endif // VPX_VPX_DSP_LOONGARCH_FWD_TXFM_LSX_H_ diff --git a/vpx_dsp/loongarch/txfm_macros_lsx.h b/vpx_dsp/loongarch/txfm_macros_lsx.h new file mode 100644 index 0000000000..bc6f7dacc9 --- /dev/null +++ b/vpx_dsp/loongarch/txfm_macros_lsx.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_LOONGARCH_TXFM_MACROS_LSX_H_ +#define VPX_VPX_DSP_LOONGARCH_TXFM_MACROS_LSX_H_ + +#include "vpx_util/loongson_intrinsics.h" + +#define DOTP_CONST_PAIR(reg0, reg1, cnst0, cnst1, out0, out1) \ + { \ + __m128i s0_m, s1_m, s2_m, s3_m, s4_m, s5_m; \ + __m128i k0_m, k1_m, k2_m, k3_m; \ + __m128i zero = __lsx_vldi(0); \ + \ + k0_m = __lsx_vreplgr2vr_h(cnst0); \ + k1_m = __lsx_vreplgr2vr_h(cnst1); \ + k2_m = __lsx_vpackev_h(k1_m, k0_m); \ + k0_m = __lsx_vpackev_h(zero, k0_m); \ + k1_m = __lsx_vpackev_h(k1_m, zero); \ + \ + s5_m = __lsx_vilvl_h(reg1, reg0); \ + s4_m = __lsx_vilvh_h(reg1, reg0); \ + s3_m = __lsx_vilvl_h(reg0, reg1); \ + s2_m = __lsx_vilvh_h(reg0, reg1); \ + \ + s1_m = __lsx_vdp2_w_h(s5_m, k0_m); \ + s0_m = __lsx_vdp2_w_h(s4_m, k0_m); \ + k3_m = __lsx_vdp2_w_h(s5_m, k1_m); \ + s1_m = __lsx_vsub_w(s1_m, k3_m); \ + k3_m = __lsx_vdp2_w_h(s4_m, k1_m); \ + s0_m = __lsx_vsub_w(s0_m, k3_m); \ + \ + out0 = __lsx_vssrarni_h_w(s0_m, s1_m, DCT_CONST_BITS); \ + \ + s1_m = __lsx_vdp2_w_h(s3_m, k2_m); \ + s0_m = __lsx_vdp2_w_h(s2_m, k2_m); \ + out1 = __lsx_vssrarni_h_w(s0_m, s1_m, DCT_CONST_BITS); \ + } + +#endif // VPX_VPX_DSP_LOONGARCH_TXFM_MACROS_LSX_H_ diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index 198e0060f7..6da133e383 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -211,6 +211,7 @@ endif # CONFIG_VP9 DSP_SRCS-yes += txfm_common.h DSP_SRCS-$(HAVE_SSE2) += x86/txfm_common_sse2.h DSP_SRCS-$(HAVE_MSA) += mips/txfm_macros_msa.h +DSP_SRCS-$(HAVE_LSX) += loongarch/txfm_macros_lsx.h # forward transform ifeq ($(CONFIG_VP9_ENCODER),yes) DSP_SRCS-yes += fwd_txfm.c @@ -231,9 +232,11 @@ DSP_SRCS-$(HAVE_NEON) += arm/fdct_partial_neon.c DSP_SRCS-$(HAVE_NEON) += arm/fwd_txfm_neon.c DSP_SRCS-$(HAVE_MSA) += mips/fwd_txfm_msa.h DSP_SRCS-$(HAVE_MSA) += mips/fwd_txfm_msa.c +DSP_SRCS-$(HAVE_LSX) += loongarch/fwd_txfm_lsx.h ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_MSA) += mips/fwd_dct32x32_msa.c +DSP_SRCS-$(HAVE_LSX) += loongarch/fwd_dct32x32_lsx.c endif # !CONFIG_VP9_HIGHBITDEPTH DSP_SRCS-$(HAVE_VSX) += ppc/fdct32x32_vsx.c diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index bcc1b916cc..436bee94dd 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -591,10 +591,10 @@ () specialize qw/vpx_fdct16x16_1 sse2 neon msa/; add_proto qw/void vpx_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vpx_fdct32x32 neon sse2 avx2 msa/; + specialize qw/vpx_fdct32x32 neon sse2 avx2 msa lsx/; add_proto qw/void vpx_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vpx_fdct32x32_rd sse2 avx2 neon msa vsx/; + specialize qw/vpx_fdct32x32_rd sse2 avx2 neon msa vsx lsx/; add_proto qw/void vpx_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/vpx_fdct32x32_1 sse2 neon msa/; From 81e5841a167e351613ce8ffc068d7ebe83e666a1 Mon Sep 17 00:00:00 2001 From: yuanhecai Date: Thu, 14 Apr 2022 11:30:55 +0800 Subject: [PATCH 0327/1000] vp9[loongarch]: Optimize idct32x32_1024/1/34_add 1. vpx_idct32x32_1024_add_lsx 2. vpx_idct32x32_34_add_lsx 3. vpx_idct32x32_1_add_lsx Bug: webm:1755 Change-Id: I9c24f75e0d93613754d8e30da7e007b8d1374e60 --- test/dct32x32_test.cc | 6 +- test/partial_idct_test.cc | 14 + vpx_dsp/loongarch/fwd_txfm_lsx.h | 22 + vpx_dsp/loongarch/idct32x32_lsx.c | 834 ++++++++++++++++++++++++++++++ vpx_dsp/vpx_dsp.mk | 2 + vpx_dsp/vpx_dsp_rtcd_defs.pl | 6 +- 6 files changed, 878 insertions(+), 6 deletions(-) create mode 100644 vpx_dsp/loongarch/idct32x32_lsx.c diff --git a/test/dct32x32_test.cc b/test/dct32x32_test.cc index a764d187a3..91bb8e01ea 100644 --- a/test/dct32x32_test.cc +++ b/test/dct32x32_test.cc @@ -400,9 +400,9 @@ INSTANTIATE_TEST_SUITE_P( #if HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE INSTANTIATE_TEST_SUITE_P( LSX, Trans32x32Test, - ::testing::Values(make_tuple(&vpx_fdct32x32_lsx, &vpx_idct32x32_1024_add_c, - 0, VPX_BITS_8), + ::testing::Values(make_tuple(&vpx_fdct32x32_lsx, + &vpx_idct32x32_1024_add_lsx, 0, VPX_BITS_8), make_tuple(&vpx_fdct32x32_rd_lsx, - &vpx_idct32x32_1024_add_c, 1, VPX_BITS_8))); + &vpx_idct32x32_1024_add_lsx, 1, VPX_BITS_8))); #endif // HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE } // namespace diff --git a/test/partial_idct_test.cc b/test/partial_idct_test.cc index a160120de1..7eb888a586 100644 --- a/test/partial_idct_test.cc +++ b/test/partial_idct_test.cc @@ -954,6 +954,20 @@ INSTANTIATE_TEST_SUITE_P(MSA, PartialIDctTest, ::testing::ValuesIn(msa_partial_idct_tests)); #endif // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH +#if HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH +const PartialInvTxfmParam lsx_partial_idct_tests[] = { + make_tuple(&vpx_fdct32x32_c, &wrapper, + &wrapper, TX_32X32, 1024, 8, 1), + make_tuple(&vpx_fdct32x32_c, &wrapper, + &wrapper, TX_32X32, 34, 8, 1), + make_tuple(&vpx_fdct32x32_c, &wrapper, + &wrapper, TX_32X32, 1, 8, 1), +}; + +INSTANTIATE_TEST_SUITE_P(LSX, PartialIDctTest, + ::testing::ValuesIn(lsx_partial_idct_tests)); +#endif // HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH + #endif // !CONFIG_EMULATE_HARDWARE } // namespace diff --git a/vpx_dsp/loongarch/fwd_txfm_lsx.h b/vpx_dsp/loongarch/fwd_txfm_lsx.h index 0e59852c42..a6f62dbc81 100644 --- a/vpx_dsp/loongarch/fwd_txfm_lsx.h +++ b/vpx_dsp/loongarch/fwd_txfm_lsx.h @@ -91,4 +91,26 @@ DCT_CONST_BITS, out2, out3); \ } +#define VP9_ADDBLK_ST8x4_UB(dst, _stride, _stride2, _stride3, in0, in1, in2, \ + in3) \ + { \ + __m128i dst0_m, dst1_m, dst2_m, dst3_m; \ + __m128i tmp0_m, tmp1_m; \ + __m128i res0_m, res1_m, res2_m, res3_m; \ + \ + dst0_m = __lsx_vld(dst, 0); \ + DUP2_ARG2(__lsx_vldx, dst, _stride, dst, _stride2, dst1_m, dst2_m); \ + dst3_m = __lsx_vldx(dst, _stride3); \ + DUP4_ARG2(__lsx_vsllwil_hu_bu, dst0_m, 0, dst1_m, 0, dst2_m, 0, dst3_m, 0, \ + res0_m, res1_m, res2_m, res3_m); \ + DUP4_ARG2(__lsx_vadd_h, res0_m, in0, res1_m, in1, res2_m, in2, res3_m, \ + in3, res0_m, res1_m, res2_m, res3_m); \ + DUP2_ARG3(__lsx_vssrarni_bu_h, res1_m, res0_m, 0, res3_m, res2_m, 0, \ + tmp0_m, tmp1_m); \ + __lsx_vstelm_d(tmp0_m, dst, 0, 0); \ + __lsx_vstelm_d(tmp0_m, dst + _stride, 0, 1); \ + __lsx_vstelm_d(tmp1_m, dst + _stride2, 0, 0); \ + __lsx_vstelm_d(tmp1_m, dst + _stride3, 0, 1); \ + } + #endif // VPX_VPX_DSP_LOONGARCH_FWD_TXFM_LSX_H_ diff --git a/vpx_dsp/loongarch/idct32x32_lsx.c b/vpx_dsp/loongarch/idct32x32_lsx.c new file mode 100644 index 0000000000..d6890c28e1 --- /dev/null +++ b/vpx_dsp/loongarch/idct32x32_lsx.c @@ -0,0 +1,834 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/loongarch/fwd_txfm_lsx.h" + +#define UNPCK_UB_SH(_in, _out0, _out1) \ + { \ + _out0 = __lsx_vsllwil_hu_bu(_in, 0); \ + _out1 = __lsx_vexth_hu_bu(_in); \ + } + +static void idct32x8_row_transpose_store(const int16_t *input, + int16_t *tmp_buf) { + __m128i m0, m1, m2, m3, m4, m5, m6, m7; + __m128i n0, n1, n2, n3, n4, n5, n6, n7; + + /* 1st & 2nd 8x8 */ + DUP4_ARG2(__lsx_vld, input, 0, input, 64, input, 128, input, 192, m0, n0, m1, + n1); + DUP4_ARG2(__lsx_vld, input, 256, input, 320, input, 384, input, 448, m2, n2, + m3, n3); + DUP4_ARG2(__lsx_vld, input, 16, input, 80, input, 144, input, 208, m4, n4, m5, + n5); + DUP4_ARG2(__lsx_vld, input, 272, input, 336, input, 400, input, 464, m6, n6, + m7, n7); + + LSX_TRANSPOSE8x8_H(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3, + n3); + LSX_TRANSPOSE8x8_H(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7, + n7); + + __lsx_vst(m0, tmp_buf, 0); + __lsx_vst(n0, tmp_buf, 16); + __lsx_vst(m1, tmp_buf, 32); + __lsx_vst(n1, tmp_buf, 48); + __lsx_vst(m2, tmp_buf, 64); + __lsx_vst(n2, tmp_buf, 80); + __lsx_vst(m3, tmp_buf, 96); + __lsx_vst(n3, tmp_buf, 112); + __lsx_vst(m4, tmp_buf, 128); + __lsx_vst(n4, tmp_buf, 144); + __lsx_vst(m5, tmp_buf, 160); + __lsx_vst(n5, tmp_buf, 176); + __lsx_vst(m6, tmp_buf, 192); + __lsx_vst(n6, tmp_buf, 208); + __lsx_vst(m7, tmp_buf, 224); + __lsx_vst(n7, tmp_buf, 240); + + /* 3rd & 4th 8x8 */ + DUP4_ARG2(__lsx_vld, input, 32, input, 96, input, 160, input, 224, m0, n0, m1, + n1); + DUP4_ARG2(__lsx_vld, input, 288, input, 352, input, 416, input, 480, m2, n2, + m3, n3); + DUP4_ARG2(__lsx_vld, input, 48, input, 112, input, 176, input, 240, m4, n4, + m5, n5); + DUP4_ARG2(__lsx_vld, input, 304, input, 368, input, 432, input, 496, m6, n6, + m7, n7); + + LSX_TRANSPOSE8x8_H(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3, + n3); + LSX_TRANSPOSE8x8_H(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7, + n7); + + __lsx_vst(m0, tmp_buf, 256); + __lsx_vst(n0, tmp_buf, 272); + __lsx_vst(m1, tmp_buf, 288); + __lsx_vst(n1, tmp_buf, 304); + __lsx_vst(m2, tmp_buf, 320); + __lsx_vst(n2, tmp_buf, 336); + __lsx_vst(m3, tmp_buf, 352); + __lsx_vst(n3, tmp_buf, 368); + __lsx_vst(m4, tmp_buf, 384); + __lsx_vst(n4, tmp_buf, 400); + __lsx_vst(m5, tmp_buf, 416); + __lsx_vst(n5, tmp_buf, 432); + __lsx_vst(m6, tmp_buf, 448); + __lsx_vst(n6, tmp_buf, 464); + __lsx_vst(m7, tmp_buf, 480); + __lsx_vst(n7, tmp_buf, 496); +} + +static void idct32x8_row_even_process_store(int16_t *tmp_buf, + int16_t *tmp_eve_buf) { + __m128i vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; + __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + __m128i stp0, stp1, stp2, stp3, stp4, stp5, stp6, stp7; + __m128i tmp0; + + /* Even stage 1 */ + DUP4_ARG2(__lsx_vld, tmp_buf, 0, tmp_buf, 64, tmp_buf, 128, tmp_buf, 192, + reg0, reg1, reg2, reg3); + DUP4_ARG2(__lsx_vld, tmp_buf, 256, tmp_buf, 320, tmp_buf, 384, tmp_buf, 448, + reg4, reg5, reg6, reg7); + + DOTP_CONST_PAIR(reg1, reg7, cospi_28_64, cospi_4_64, reg1, reg7); + DOTP_CONST_PAIR(reg5, reg3, cospi_12_64, cospi_20_64, reg5, reg3); + LSX_BUTTERFLY_4_H(reg1, reg7, reg3, reg5, vec1, vec3, vec2, vec0); + DOTP_CONST_PAIR(vec2, vec0, cospi_16_64, cospi_16_64, loc2, loc3); + + loc1 = vec3; + loc0 = vec1; + + DOTP_CONST_PAIR(reg0, reg4, cospi_16_64, cospi_16_64, reg0, reg4); + DOTP_CONST_PAIR(reg2, reg6, cospi_24_64, cospi_8_64, reg2, reg6); + LSX_BUTTERFLY_4_H(reg4, reg0, reg2, reg6, vec1, vec3, vec2, vec0); + LSX_BUTTERFLY_4_H(vec0, vec1, loc1, loc0, stp3, stp0, stp7, stp4); + LSX_BUTTERFLY_4_H(vec2, vec3, loc3, loc2, stp2, stp1, stp6, stp5); + + /* Even stage 2 */ + DUP4_ARG2(__lsx_vld, tmp_buf, 32, tmp_buf, 96, tmp_buf, 160, tmp_buf, 224, + reg0, reg1, reg2, reg3); + DUP4_ARG2(__lsx_vld, tmp_buf, 288, tmp_buf, 352, tmp_buf, 416, tmp_buf, 480, + reg4, reg5, reg6, reg7); + DOTP_CONST_PAIR(reg0, reg7, cospi_30_64, cospi_2_64, reg0, reg7); + DOTP_CONST_PAIR(reg4, reg3, cospi_14_64, cospi_18_64, reg4, reg3); + DOTP_CONST_PAIR(reg2, reg5, cospi_22_64, cospi_10_64, reg2, reg5); + DOTP_CONST_PAIR(reg6, reg1, cospi_6_64, cospi_26_64, reg6, reg1); + + vec0 = __lsx_vadd_h(reg0, reg4); + reg0 = __lsx_vsub_h(reg0, reg4); + reg4 = __lsx_vadd_h(reg6, reg2); + reg6 = __lsx_vsub_h(reg6, reg2); + reg2 = __lsx_vadd_h(reg1, reg5); + reg1 = __lsx_vsub_h(reg1, reg5); + reg5 = __lsx_vadd_h(reg7, reg3); + reg7 = __lsx_vsub_h(reg7, reg3); + reg3 = vec0; + + vec1 = reg2; + reg2 = __lsx_vadd_h(reg3, reg4); + reg3 = __lsx_vsub_h(reg3, reg4); + reg4 = __lsx_vsub_h(reg5, vec1); + reg5 = __lsx_vadd_h(reg5, vec1); + + tmp0 = __lsx_vneg_h(reg6); + DOTP_CONST_PAIR(reg7, reg0, cospi_24_64, cospi_8_64, reg0, reg7); + DOTP_CONST_PAIR(tmp0, reg1, cospi_24_64, cospi_8_64, reg6, reg1); + + vec0 = __lsx_vsub_h(reg0, reg6); + reg0 = __lsx_vadd_h(reg0, reg6); + vec1 = __lsx_vsub_h(reg7, reg1); + reg7 = __lsx_vadd_h(reg7, reg1); + + DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, reg6, reg1); + DOTP_CONST_PAIR(reg4, reg3, cospi_16_64, cospi_16_64, reg3, reg4); + + /* Even stage 3 : Dependency on Even stage 1 & Even stage 2 */ + LSX_BUTTERFLY_4_H(stp0, stp1, reg7, reg5, loc1, loc3, loc2, loc0); + __lsx_vst(loc0, tmp_eve_buf, 240); + __lsx_vst(loc1, tmp_eve_buf, 0); + __lsx_vst(loc2, tmp_eve_buf, 224); + __lsx_vst(loc3, tmp_eve_buf, 16); + + LSX_BUTTERFLY_4_H(stp2, stp3, reg4, reg1, loc1, loc3, loc2, loc0); + __lsx_vst(loc0, tmp_eve_buf, 208); + __lsx_vst(loc1, tmp_eve_buf, 32); + __lsx_vst(loc2, tmp_eve_buf, 192); + __lsx_vst(loc3, tmp_eve_buf, 48); + + /* Store 8 */ + LSX_BUTTERFLY_4_H(stp4, stp5, reg6, reg3, loc1, loc3, loc2, loc0); + __lsx_vst(loc0, tmp_eve_buf, 176); + __lsx_vst(loc1, tmp_eve_buf, 64); + __lsx_vst(loc2, tmp_eve_buf, 160); + __lsx_vst(loc3, tmp_eve_buf, 80); + + LSX_BUTTERFLY_4_H(stp6, stp7, reg2, reg0, loc1, loc3, loc2, loc0); + __lsx_vst(loc0, tmp_eve_buf, 144); + __lsx_vst(loc1, tmp_eve_buf, 96); + __lsx_vst(loc2, tmp_eve_buf, 128); + __lsx_vst(loc3, tmp_eve_buf, 112); +} + +static void idct32x8_row_odd_process_store(int16_t *tmp_buf, + int16_t *tmp_odd_buf) { + __m128i vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; + __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + + /* Odd stage 1 */ + DUP4_ARG2(__lsx_vld, tmp_buf, 16, tmp_buf, 112, tmp_buf, 144, tmp_buf, 240, + reg0, reg1, reg2, reg3); + DUP4_ARG2(__lsx_vld, tmp_buf, 272, tmp_buf, 368, tmp_buf, 400, tmp_buf, 496, + reg4, reg5, reg6, reg7); + + DOTP_CONST_PAIR(reg0, reg7, cospi_31_64, cospi_1_64, reg0, reg7); + DOTP_CONST_PAIR(reg4, reg3, cospi_15_64, cospi_17_64, reg3, reg4); + DOTP_CONST_PAIR(reg2, reg5, cospi_23_64, cospi_9_64, reg2, reg5); + DOTP_CONST_PAIR(reg6, reg1, cospi_7_64, cospi_25_64, reg1, reg6); + + vec0 = __lsx_vadd_h(reg0, reg3); + reg0 = __lsx_vsub_h(reg0, reg3); + reg3 = __lsx_vadd_h(reg7, reg4); + reg7 = __lsx_vsub_h(reg7, reg4); + reg4 = __lsx_vadd_h(reg1, reg2); + reg1 = __lsx_vsub_h(reg1, reg2); + reg2 = __lsx_vadd_h(reg6, reg5); + reg6 = __lsx_vsub_h(reg6, reg5); + reg5 = vec0; + + /* 4 Stores */ + DUP2_ARG2(__lsx_vadd_h, reg5, reg4, reg3, reg2, vec0, vec1); + __lsx_vst(vec0, tmp_odd_buf, 64); + __lsx_vst(vec1, tmp_odd_buf, 80); + + DUP2_ARG2(__lsx_vsub_h, reg5, reg4, reg3, reg2, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_24_64, cospi_8_64, vec0, vec1); + __lsx_vst(vec0, tmp_odd_buf, 0); + __lsx_vst(vec1, tmp_odd_buf, 16); + + /* 4 Stores */ + DOTP_CONST_PAIR(reg7, reg0, cospi_28_64, cospi_4_64, reg0, reg7); + DOTP_CONST_PAIR(reg6, reg1, -cospi_4_64, cospi_28_64, reg1, reg6); + LSX_BUTTERFLY_4_H(reg0, reg7, reg6, reg1, vec0, vec1, vec2, vec3); + __lsx_vst(vec0, tmp_odd_buf, 96); + __lsx_vst(vec1, tmp_odd_buf, 112); + + DOTP_CONST_PAIR(vec2, vec3, cospi_24_64, cospi_8_64, vec2, vec3); + __lsx_vst(vec2, tmp_odd_buf, 32); + __lsx_vst(vec3, tmp_odd_buf, 48); + + /* Odd stage 2 */ + /* 8 loads */ + DUP4_ARG2(__lsx_vld, tmp_buf, 48, tmp_buf, 80, tmp_buf, 176, tmp_buf, 208, + reg0, reg1, reg2, reg3); + DUP4_ARG2(__lsx_vld, tmp_buf, 304, tmp_buf, 336, tmp_buf, 432, tmp_buf, 464, + reg4, reg5, reg6, reg7); + + DOTP_CONST_PAIR(reg1, reg6, cospi_27_64, cospi_5_64, reg1, reg6); + DOTP_CONST_PAIR(reg5, reg2, cospi_11_64, cospi_21_64, reg2, reg5); + DOTP_CONST_PAIR(reg3, reg4, cospi_19_64, cospi_13_64, reg3, reg4); + DOTP_CONST_PAIR(reg7, reg0, cospi_3_64, cospi_29_64, reg0, reg7); + + /* 4 Stores */ + DUP4_ARG2(__lsx_vsub_h, reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, vec0, + vec1, vec2, vec3); + DOTP_CONST_PAIR(vec1, vec0, cospi_12_64, cospi_20_64, loc0, loc1); + DOTP_CONST_PAIR(vec3, vec2, -cospi_20_64, cospi_12_64, loc2, loc3); + + LSX_BUTTERFLY_4_H(loc3, loc2, loc0, loc1, vec1, vec0, vec2, vec3); + __lsx_vst(vec0, tmp_odd_buf, 192); + __lsx_vst(vec1, tmp_odd_buf, 240); + + DOTP_CONST_PAIR(vec3, vec2, -cospi_8_64, cospi_24_64, vec0, vec1); + __lsx_vst(vec0, tmp_odd_buf, 160); + __lsx_vst(vec1, tmp_odd_buf, 176); + + /* 4 Stores */ + DUP4_ARG2(__lsx_vadd_h, reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, vec1, + vec2, vec0, vec3); + LSX_BUTTERFLY_4_H(vec0, vec3, vec2, vec1, reg0, reg1, reg3, reg2); + __lsx_vst(reg0, tmp_odd_buf, 208); + __lsx_vst(reg1, tmp_odd_buf, 224); + + DOTP_CONST_PAIR(reg3, reg2, -cospi_8_64, cospi_24_64, reg0, reg1); + __lsx_vst(reg0, tmp_odd_buf, 128); + __lsx_vst(reg1, tmp_odd_buf, 144); + + /* Odd stage 3 : Dependency on Odd stage 1 & Odd stage 2 */ + + /* Load 8 & Store 8 */ + DUP4_ARG2(__lsx_vld, tmp_odd_buf, 0, tmp_odd_buf, 16, tmp_odd_buf, 32, + tmp_odd_buf, 48, reg0, reg1, reg2, reg3); + DUP4_ARG2(__lsx_vld, tmp_odd_buf, 128, tmp_odd_buf, 144, tmp_odd_buf, 160, + tmp_odd_buf, 176, reg4, reg5, reg6, reg7); + DUP4_ARG2(__lsx_vadd_h, reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0, + loc1, loc2, loc3); + __lsx_vst(loc0, tmp_odd_buf, 0); + __lsx_vst(loc1, tmp_odd_buf, 16); + __lsx_vst(loc2, tmp_odd_buf, 32); + __lsx_vst(loc3, tmp_odd_buf, 48); + + DUP2_ARG2(__lsx_vsub_h, reg0, reg4, reg1, reg5, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1); + + DUP2_ARG2(__lsx_vsub_h, reg2, reg6, reg3, reg7, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3); + __lsx_vst(loc0, tmp_odd_buf, 128); + __lsx_vst(loc1, tmp_odd_buf, 144); + __lsx_vst(loc2, tmp_odd_buf, 160); + __lsx_vst(loc3, tmp_odd_buf, 176); + + /* Load 8 & Store 8 */ + DUP4_ARG2(__lsx_vld, tmp_odd_buf, 64, tmp_odd_buf, 80, tmp_odd_buf, 96, + tmp_odd_buf, 112, reg1, reg2, reg0, reg3); + DUP4_ARG2(__lsx_vld, tmp_odd_buf, 192, tmp_odd_buf, 208, tmp_odd_buf, 224, + tmp_odd_buf, 240, reg4, reg5, reg6, reg7); + + DUP4_ARG2(__lsx_vadd_h, reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0, + loc1, loc2, loc3); + __lsx_vst(loc0, tmp_odd_buf, 64); + __lsx_vst(loc1, tmp_odd_buf, 80); + __lsx_vst(loc2, tmp_odd_buf, 96); + __lsx_vst(loc3, tmp_odd_buf, 112); + + DUP2_ARG2(__lsx_vsub_h, reg0, reg4, reg3, reg7, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1); + DUP2_ARG2(__lsx_vsub_h, reg1, reg5, reg2, reg6, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3); + __lsx_vst(loc0, tmp_odd_buf, 192); + __lsx_vst(loc1, tmp_odd_buf, 208); + __lsx_vst(loc2, tmp_odd_buf, 224); + __lsx_vst(loc3, tmp_odd_buf, 240); +} + +static void idct_butterfly_transpose_store(int16_t *tmp_buf, + int16_t *tmp_eve_buf, + int16_t *tmp_odd_buf, int16_t *dst) { + __m128i vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; + __m128i m0, m1, m2, m3, m4, m5, m6, m7; + __m128i n0, n1, n2, n3, n4, n5, n6, n7; + __m128i reg0, reg1, reg2, reg3; + + /* FINAL BUTTERFLY : Dependency on Even & Odd */ + DUP4_ARG2(__lsx_vld, tmp_odd_buf, 0, tmp_odd_buf, 144, tmp_odd_buf, 224, + tmp_odd_buf, 96, vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vld, tmp_eve_buf, 0, tmp_eve_buf, 128, tmp_eve_buf, 64, + tmp_eve_buf, 192, loc0, loc1, loc2, loc3); + + DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m0, + m4, m2, m6); + DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, reg0, + reg1, reg2, reg3); + __lsx_vst(reg0, tmp_buf, 496); + __lsx_vst(reg1, tmp_buf, 368); + __lsx_vst(reg2, tmp_buf, 432); + __lsx_vst(reg3, tmp_buf, 304); + + /* Load 8 & Store 8 */ + DUP4_ARG2(__lsx_vld, tmp_odd_buf, 64, tmp_odd_buf, 208, tmp_odd_buf, 160, + tmp_odd_buf, 48, vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vld, tmp_eve_buf, 32, tmp_eve_buf, 160, tmp_eve_buf, 96, + tmp_eve_buf, 224, loc0, loc1, loc2, loc3); + + DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m1, + m5, m3, m7); + DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, reg0, + reg1, reg2, reg3); + __lsx_vst(reg0, tmp_buf, 464); + __lsx_vst(reg1, tmp_buf, 336); + __lsx_vst(reg2, tmp_buf, 400); + __lsx_vst(reg3, tmp_buf, 272); + + /* Load 8 & Store 8 */ + DUP4_ARG2(__lsx_vld, tmp_odd_buf, 32, tmp_odd_buf, 176, tmp_odd_buf, 192, + tmp_odd_buf, 112, vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vld, tmp_eve_buf, 16, tmp_eve_buf, 144, tmp_eve_buf, 80, + tmp_eve_buf, 208, loc0, loc1, loc2, loc3); + + DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n0, + n4, n2, n6); + DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, reg0, + reg1, reg2, reg3); + __lsx_vst(reg0, tmp_buf, 480); + __lsx_vst(reg1, tmp_buf, 352); + __lsx_vst(reg2, tmp_buf, 416); + __lsx_vst(reg3, tmp_buf, 288); + + /* Load 8 & Store 8 */ + DUP4_ARG2(__lsx_vld, tmp_odd_buf, 80, tmp_odd_buf, 240, tmp_odd_buf, 128, + tmp_odd_buf, 16, vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vld, tmp_eve_buf, 48, tmp_eve_buf, 176, tmp_eve_buf, 112, + tmp_eve_buf, 240, loc0, loc1, loc2, loc3); + DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n1, + n5, n3, n7); + DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, reg0, + reg1, reg2, reg3); + __lsx_vst(reg0, tmp_buf, 448); + __lsx_vst(reg1, tmp_buf, 320); + __lsx_vst(reg2, tmp_buf, 384); + __lsx_vst(reg3, tmp_buf, 256); + + /* Transpose : 16 vectors */ + /* 1st & 2nd 8x8 */ + LSX_TRANSPOSE8x8_H(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3, + n3); + __lsx_vst(m0, dst, 0); + __lsx_vst(n0, dst, 64); + __lsx_vst(m1, dst, 128); + __lsx_vst(n1, dst, 192); + __lsx_vst(m2, dst, 256); + __lsx_vst(n2, dst, 320); + __lsx_vst(m3, dst, 384); + __lsx_vst(n3, dst, 448); + + LSX_TRANSPOSE8x8_H(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7, + n7); + __lsx_vst(m4, dst, 16); + __lsx_vst(n4, dst, 80); + __lsx_vst(m5, dst, 144); + __lsx_vst(n5, dst, 208); + __lsx_vst(m6, dst, 272); + __lsx_vst(n6, dst, 336); + __lsx_vst(m7, dst, 400); + __lsx_vst(n7, dst, 464); + + /* 3rd & 4th 8x8 */ + DUP4_ARG2(__lsx_vld, tmp_buf, 256, tmp_buf, 272, tmp_buf, 288, tmp_buf, 304, + m0, n0, m1, n1); + DUP4_ARG2(__lsx_vld, tmp_buf, 320, tmp_buf, 336, tmp_buf, 352, tmp_buf, 368, + m2, n2, m3, n3); + DUP4_ARG2(__lsx_vld, tmp_buf, 384, tmp_buf, 400, tmp_buf, 416, tmp_buf, 432, + m4, n4, m5, n5); + DUP4_ARG2(__lsx_vld, tmp_buf, 448, tmp_buf, 464, tmp_buf, 480, tmp_buf, 496, + m6, n6, m7, n7); + LSX_TRANSPOSE8x8_H(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3, + n3); + LSX_TRANSPOSE8x8_H(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7, + n7); + __lsx_vst(m0, dst, 32); + __lsx_vst(n0, dst, 96); + __lsx_vst(m1, dst, 160); + __lsx_vst(n1, dst, 224); + __lsx_vst(m2, dst, 288); + __lsx_vst(n2, dst, 352); + __lsx_vst(m3, dst, 416); + __lsx_vst(n3, dst, 480); + __lsx_vst(m4, dst, 48); + __lsx_vst(n4, dst, 112); + __lsx_vst(m5, dst, 176); + __lsx_vst(n5, dst, 240); + __lsx_vst(m6, dst, 304); + __lsx_vst(n6, dst, 368); + __lsx_vst(m7, dst, 432); + __lsx_vst(n7, dst, 496); +} + +static void idct32x8_1d_rows_lsx(const int16_t *input, int16_t *output) { + DECLARE_ALIGNED(32, int16_t, tmp_buf[8 * 32]); + DECLARE_ALIGNED(32, int16_t, tmp_odd_buf[16 * 8]); + DECLARE_ALIGNED(32, int16_t, tmp_eve_buf[16 * 8]); + + idct32x8_row_transpose_store(input, &tmp_buf[0]); + idct32x8_row_even_process_store(&tmp_buf[0], &tmp_eve_buf[0]); + idct32x8_row_odd_process_store(&tmp_buf[0], &tmp_odd_buf[0]); + idct_butterfly_transpose_store(&tmp_buf[0], &tmp_eve_buf[0], &tmp_odd_buf[0], + output); +} + +static void idct8x32_column_even_process_store(int16_t *tmp_buf, + int16_t *tmp_eve_buf) { + __m128i vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; + __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + __m128i stp0, stp1, stp2, stp3, stp4, stp5, stp6, stp7; + __m128i tmp0; + + /* Even stage 1 */ + DUP4_ARG2(__lsx_vld, tmp_buf, 0, tmp_buf, 256, tmp_buf, 512, tmp_buf, 768, + reg0, reg1, reg2, reg3); + DUP4_ARG2(__lsx_vld, tmp_buf, 1024, tmp_buf, 1280, tmp_buf, 1536, tmp_buf, + 1792, reg4, reg5, reg6, reg7); + tmp_buf += 64; + + DOTP_CONST_PAIR(reg1, reg7, cospi_28_64, cospi_4_64, reg1, reg7); + DOTP_CONST_PAIR(reg5, reg3, cospi_12_64, cospi_20_64, reg5, reg3); + LSX_BUTTERFLY_4_H(reg1, reg7, reg3, reg5, vec1, vec3, vec2, vec0); + DOTP_CONST_PAIR(vec2, vec0, cospi_16_64, cospi_16_64, loc2, loc3); + + loc1 = vec3; + loc0 = vec1; + + DOTP_CONST_PAIR(reg0, reg4, cospi_16_64, cospi_16_64, reg0, reg4); + DOTP_CONST_PAIR(reg2, reg6, cospi_24_64, cospi_8_64, reg2, reg6); + LSX_BUTTERFLY_4_H(reg4, reg0, reg2, reg6, vec1, vec3, vec2, vec0); + LSX_BUTTERFLY_4_H(vec0, vec1, loc1, loc0, stp3, stp0, stp7, stp4); + LSX_BUTTERFLY_4_H(vec2, vec3, loc3, loc2, stp2, stp1, stp6, stp5); + + /* Even stage 2 */ + /* Load 8 */ + DUP4_ARG2(__lsx_vld, tmp_buf, 0, tmp_buf, 256, tmp_buf, 512, tmp_buf, 768, + reg0, reg1, reg2, reg3); + DUP4_ARG2(__lsx_vld, tmp_buf, 1024, tmp_buf, 1280, tmp_buf, 1536, tmp_buf, + 1792, reg4, reg5, reg6, reg7); + DOTP_CONST_PAIR(reg0, reg7, cospi_30_64, cospi_2_64, reg0, reg7); + DOTP_CONST_PAIR(reg4, reg3, cospi_14_64, cospi_18_64, reg4, reg3); + DOTP_CONST_PAIR(reg2, reg5, cospi_22_64, cospi_10_64, reg2, reg5); + DOTP_CONST_PAIR(reg6, reg1, cospi_6_64, cospi_26_64, reg6, reg1); + + vec0 = __lsx_vadd_h(reg0, reg4); + reg0 = __lsx_vsub_h(reg0, reg4); + reg4 = __lsx_vadd_h(reg6, reg2); + reg6 = __lsx_vsub_h(reg6, reg2); + reg2 = __lsx_vadd_h(reg1, reg5); + reg1 = __lsx_vsub_h(reg1, reg5); + reg5 = __lsx_vadd_h(reg7, reg3); + reg7 = __lsx_vsub_h(reg7, reg3); + reg3 = vec0; + + vec1 = reg2; + reg2 = __lsx_vadd_h(reg3, reg4); + reg3 = __lsx_vsub_h(reg3, reg4); + reg4 = __lsx_vsub_h(reg5, vec1); + reg5 = __lsx_vadd_h(reg5, vec1); + + tmp0 = __lsx_vneg_h(reg6); + DOTP_CONST_PAIR(reg7, reg0, cospi_24_64, cospi_8_64, reg0, reg7); + DOTP_CONST_PAIR(tmp0, reg1, cospi_24_64, cospi_8_64, reg6, reg1); + + vec0 = __lsx_vsub_h(reg0, reg6); + reg0 = __lsx_vadd_h(reg0, reg6); + vec1 = __lsx_vsub_h(reg7, reg1); + reg7 = __lsx_vadd_h(reg7, reg1); + + DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, reg6, reg1); + DOTP_CONST_PAIR(reg4, reg3, cospi_16_64, cospi_16_64, reg3, reg4); + + /* Even stage 3 : Dependency on Even stage 1 & Even stage 2 */ + /* Store 8 */ + LSX_BUTTERFLY_4_H(stp0, stp1, reg7, reg5, loc1, loc3, loc2, loc0); + __lsx_vst(loc1, tmp_eve_buf, 0); + __lsx_vst(loc3, tmp_eve_buf, 16); + __lsx_vst(loc2, tmp_eve_buf, 224); + __lsx_vst(loc0, tmp_eve_buf, 240); + + LSX_BUTTERFLY_4_H(stp2, stp3, reg4, reg1, loc1, loc3, loc2, loc0); + __lsx_vst(loc1, tmp_eve_buf, 32); + __lsx_vst(loc3, tmp_eve_buf, 48); + __lsx_vst(loc2, tmp_eve_buf, 192); + __lsx_vst(loc0, tmp_eve_buf, 208); + + /* Store 8 */ + LSX_BUTTERFLY_4_H(stp4, stp5, reg6, reg3, loc1, loc3, loc2, loc0); + __lsx_vst(loc1, tmp_eve_buf, 64); + __lsx_vst(loc3, tmp_eve_buf, 80); + __lsx_vst(loc2, tmp_eve_buf, 160); + __lsx_vst(loc0, tmp_eve_buf, 176); + + LSX_BUTTERFLY_4_H(stp6, stp7, reg2, reg0, loc1, loc3, loc2, loc0); + __lsx_vst(loc1, tmp_eve_buf, 96); + __lsx_vst(loc3, tmp_eve_buf, 112); + __lsx_vst(loc2, tmp_eve_buf, 128); + __lsx_vst(loc0, tmp_eve_buf, 144); +} + +static void idct8x32_column_odd_process_store(int16_t *tmp_buf, + int16_t *tmp_odd_buf) { + __m128i vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; + __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + + /* Odd stage 1 */ + DUP4_ARG2(__lsx_vld, tmp_buf, 64, tmp_buf, 448, tmp_buf, 576, tmp_buf, 960, + reg0, reg1, reg2, reg3); + DUP4_ARG2(__lsx_vld, tmp_buf, 1088, tmp_buf, 1472, tmp_buf, 1600, tmp_buf, + 1984, reg4, reg5, reg6, reg7); + + DOTP_CONST_PAIR(reg0, reg7, cospi_31_64, cospi_1_64, reg0, reg7); + DOTP_CONST_PAIR(reg4, reg3, cospi_15_64, cospi_17_64, reg3, reg4); + DOTP_CONST_PAIR(reg2, reg5, cospi_23_64, cospi_9_64, reg2, reg5); + DOTP_CONST_PAIR(reg6, reg1, cospi_7_64, cospi_25_64, reg1, reg6); + + vec0 = __lsx_vadd_h(reg0, reg3); + reg0 = __lsx_vsub_h(reg0, reg3); + reg3 = __lsx_vadd_h(reg7, reg4); + reg7 = __lsx_vsub_h(reg7, reg4); + reg4 = __lsx_vadd_h(reg1, reg2); + reg1 = __lsx_vsub_h(reg1, reg2); + reg2 = __lsx_vadd_h(reg6, reg5); + reg6 = __lsx_vsub_h(reg6, reg5); + reg5 = vec0; + + /* 4 Stores */ + DUP2_ARG2(__lsx_vadd_h, reg5, reg4, reg3, reg2, vec0, vec1); + __lsx_vst(vec0, tmp_odd_buf, 64); + __lsx_vst(vec1, tmp_odd_buf, 80); + DUP2_ARG2(__lsx_vsub_h, reg5, reg4, reg3, reg2, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_24_64, cospi_8_64, vec0, vec1); + __lsx_vst(vec0, tmp_odd_buf, 0); + __lsx_vst(vec1, tmp_odd_buf, 16); + + /* 4 Stores */ + DOTP_CONST_PAIR(reg7, reg0, cospi_28_64, cospi_4_64, reg0, reg7); + DOTP_CONST_PAIR(reg6, reg1, -cospi_4_64, cospi_28_64, reg1, reg6); + LSX_BUTTERFLY_4_H(reg0, reg7, reg6, reg1, vec0, vec1, vec2, vec3); + DOTP_CONST_PAIR(vec2, vec3, cospi_24_64, cospi_8_64, vec2, vec3); + __lsx_vst(vec0, tmp_odd_buf, 96); + __lsx_vst(vec1, tmp_odd_buf, 112); + __lsx_vst(vec2, tmp_odd_buf, 32); + __lsx_vst(vec3, tmp_odd_buf, 48); + + /* Odd stage 2 */ + /* 8 loads */ + DUP4_ARG2(__lsx_vld, tmp_buf, 192, tmp_buf, 320, tmp_buf, 704, tmp_buf, 832, + reg0, reg1, reg2, reg3); + DUP4_ARG2(__lsx_vld, tmp_buf, 1216, tmp_buf, 1344, tmp_buf, 1728, tmp_buf, + 1856, reg4, reg5, reg6, reg7); + DOTP_CONST_PAIR(reg1, reg6, cospi_27_64, cospi_5_64, reg1, reg6); + DOTP_CONST_PAIR(reg5, reg2, cospi_11_64, cospi_21_64, reg2, reg5); + DOTP_CONST_PAIR(reg3, reg4, cospi_19_64, cospi_13_64, reg3, reg4); + DOTP_CONST_PAIR(reg7, reg0, cospi_3_64, cospi_29_64, reg0, reg7); + + /* 4 Stores */ + DUP4_ARG2(__lsx_vsub_h, reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, vec0, + vec1, vec2, vec3); + DOTP_CONST_PAIR(vec1, vec0, cospi_12_64, cospi_20_64, loc0, loc1); + DOTP_CONST_PAIR(vec3, vec2, -cospi_20_64, cospi_12_64, loc2, loc3); + LSX_BUTTERFLY_4_H(loc2, loc3, loc1, loc0, vec0, vec1, vec3, vec2); + __lsx_vst(vec0, tmp_odd_buf, 192); + __lsx_vst(vec1, tmp_odd_buf, 240); + DOTP_CONST_PAIR(vec3, vec2, -cospi_8_64, cospi_24_64, vec0, vec1); + __lsx_vst(vec0, tmp_odd_buf, 160); + __lsx_vst(vec1, tmp_odd_buf, 176); + + /* 4 Stores */ + DUP4_ARG2(__lsx_vadd_h, reg0, reg3, reg1, reg2, reg5, reg6, reg4, reg7, vec0, + vec1, vec2, vec3); + LSX_BUTTERFLY_4_H(vec0, vec3, vec2, vec1, reg0, reg1, reg3, reg2); + __lsx_vst(reg0, tmp_odd_buf, 208); + __lsx_vst(reg1, tmp_odd_buf, 224); + DOTP_CONST_PAIR(reg3, reg2, -cospi_8_64, cospi_24_64, reg0, reg1); + __lsx_vst(reg0, tmp_odd_buf, 128); + __lsx_vst(reg1, tmp_odd_buf, 144); + + /* Odd stage 3 : Dependency on Odd stage 1 & Odd stage 2 */ + /* Load 8 & Store 8 */ + DUP4_ARG2(__lsx_vld, tmp_odd_buf, 0, tmp_odd_buf, 16, tmp_odd_buf, 32, + tmp_odd_buf, 48, reg0, reg1, reg2, reg3); + DUP4_ARG2(__lsx_vld, tmp_odd_buf, 128, tmp_odd_buf, 144, tmp_odd_buf, 160, + tmp_odd_buf, 176, reg4, reg5, reg6, reg7); + DUP4_ARG2(__lsx_vadd_h, reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0, + loc1, loc2, loc3); + __lsx_vst(loc0, tmp_odd_buf, 0); + __lsx_vst(loc1, tmp_odd_buf, 16); + __lsx_vst(loc2, tmp_odd_buf, 32); + __lsx_vst(loc3, tmp_odd_buf, 48); + + DUP2_ARG2(__lsx_vsub_h, reg0, reg4, reg1, reg5, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1); + DUP2_ARG2(__lsx_vsub_h, reg2, reg6, reg3, reg7, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3); + __lsx_vst(loc0, tmp_odd_buf, 128); + __lsx_vst(loc1, tmp_odd_buf, 144); + __lsx_vst(loc2, tmp_odd_buf, 160); + __lsx_vst(loc3, tmp_odd_buf, 176); + + /* Load 8 & Store 8 */ + DUP4_ARG2(__lsx_vld, tmp_odd_buf, 64, tmp_odd_buf, 80, tmp_odd_buf, 96, + tmp_odd_buf, 112, reg1, reg2, reg0, reg3); + DUP4_ARG2(__lsx_vld, tmp_odd_buf, 192, tmp_odd_buf, 208, tmp_odd_buf, 224, + tmp_odd_buf, 240, reg4, reg5, reg6, reg7); + DUP4_ARG2(__lsx_vadd_h, reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0, + loc1, loc2, loc3); + __lsx_vst(loc0, tmp_odd_buf, 64); + __lsx_vst(loc1, tmp_odd_buf, 80); + __lsx_vst(loc2, tmp_odd_buf, 96); + __lsx_vst(loc3, tmp_odd_buf, 112); + + DUP2_ARG2(__lsx_vsub_h, reg0, reg4, reg3, reg7, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1); + DUP2_ARG2(__lsx_vsub_h, reg1, reg5, reg2, reg6, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3); + __lsx_vst(loc0, tmp_odd_buf, 192); + __lsx_vst(loc1, tmp_odd_buf, 208); + __lsx_vst(loc2, tmp_odd_buf, 224); + __lsx_vst(loc3, tmp_odd_buf, 240); +} + +static void idct8x32_column_butterfly_addblk(int16_t *tmp_eve_buf, + int16_t *tmp_odd_buf, uint8_t *dst, + int32_t dst_stride) { + __m128i vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; + __m128i m0, m1, m2, m3, m4, m5, m6, m7; + __m128i n0, n1, n2, n3, n4, n5, n6, n7; + int32_t stride = dst_stride << 2; + int32_t stride2 = stride << 1; + int32_t stride3 = stride + stride2; + + /* FINAL BUTTERFLY : Dependency on Even & Odd */ + DUP4_ARG2(__lsx_vld, tmp_odd_buf, 0, tmp_odd_buf, 144, tmp_odd_buf, 224, + tmp_odd_buf, 96, vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vld, tmp_eve_buf, 0, tmp_eve_buf, 128, tmp_eve_buf, 64, + tmp_eve_buf, 192, loc0, loc1, loc2, loc3); + + DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m0, + m4, m2, m6); + DUP4_ARG2(__lsx_vsrari_h, m0, 6, m2, 6, m4, 6, m6, 6, m0, m2, m4, m6); + VP9_ADDBLK_ST8x4_UB(dst, stride, stride2, stride3, m0, m2, m4, m6); + DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m6, + m2, m4, m0); + DUP4_ARG2(__lsx_vsrari_h, m0, 6, m2, 6, m4, 6, m6, 6, m0, m2, m4, m6); + VP9_ADDBLK_ST8x4_UB((dst + 19 * dst_stride), stride, stride2, stride3, m0, m2, + m4, m6); + + /* Load 8 & Store 8 */ + DUP4_ARG2(__lsx_vld, tmp_odd_buf, 64, tmp_odd_buf, 208, tmp_odd_buf, 160, + tmp_odd_buf, 48, vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vld, tmp_eve_buf, 32, tmp_eve_buf, 160, tmp_eve_buf, 96, + tmp_eve_buf, 224, loc0, loc1, loc2, loc3); + + DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m1, + m5, m3, m7); + DUP4_ARG2(__lsx_vsrari_h, m1, 6, m3, 6, m5, 6, m7, 6, m1, m3, m5, m7); + VP9_ADDBLK_ST8x4_UB((dst + 2 * dst_stride), stride, stride2, stride3, m1, m3, + m5, m7); + DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m7, + m3, m5, m1); + DUP4_ARG2(__lsx_vsrari_h, m1, 6, m3, 6, m5, 6, m7, 6, m1, m3, m5, m7); + VP9_ADDBLK_ST8x4_UB((dst + 17 * dst_stride), stride, stride2, stride3, m1, m3, + m5, m7); + + /* Load 8 & Store 8 */ + DUP4_ARG2(__lsx_vld, tmp_odd_buf, 32, tmp_odd_buf, 176, tmp_odd_buf, 192, + tmp_odd_buf, 112, vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vld, tmp_eve_buf, 16, tmp_eve_buf, 144, tmp_eve_buf, 80, + tmp_eve_buf, 208, loc0, loc1, loc2, loc3); + DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n0, + n4, n2, n6); + DUP4_ARG2(__lsx_vsrari_h, n0, 6, n2, 6, n4, 6, n6, 6, n0, n2, n4, n6); + VP9_ADDBLK_ST8x4_UB((dst + dst_stride), stride, stride2, stride3, n0, n2, n4, + n6); + DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n6, + n2, n4, n0); + DUP4_ARG2(__lsx_vsrari_h, n0, 6, n2, 6, n4, 6, n6, 6, n0, n2, n4, n6); + VP9_ADDBLK_ST8x4_UB((dst + 18 * dst_stride), stride, stride2, stride3, n0, n2, + n4, n6); + + /* Load 8 & Store 8 */ + DUP4_ARG2(__lsx_vld, tmp_odd_buf, 80, tmp_odd_buf, 240, tmp_odd_buf, 128, + tmp_odd_buf, 16, vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vld, tmp_eve_buf, 48, tmp_eve_buf, 176, tmp_eve_buf, 112, + tmp_eve_buf, 240, loc0, loc1, loc2, loc3); + DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n1, + n5, n3, n7); + DUP4_ARG2(__lsx_vsrari_h, n1, 6, n3, 6, n5, 6, n7, 6, n1, n3, n5, n7); + VP9_ADDBLK_ST8x4_UB((dst + 3 * dst_stride), stride, stride2, stride3, n1, n3, + n5, n7); + DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n7, + n3, n5, n1); + DUP4_ARG2(__lsx_vsrari_h, n1, 6, n3, 6, n5, 6, n7, 6, n1, n3, n5, n7); + VP9_ADDBLK_ST8x4_UB((dst + 16 * dst_stride), stride, stride2, stride3, n1, n3, + n5, n7); +} + +static void idct8x32_1d_columns_addblk_lsx(int16_t *input, uint8_t *dst, + int32_t dst_stride) { + DECLARE_ALIGNED(32, int16_t, tmp_odd_buf[16 * 8]); + DECLARE_ALIGNED(32, int16_t, tmp_eve_buf[16 * 8]); + + idct8x32_column_even_process_store(input, &tmp_eve_buf[0]); + idct8x32_column_odd_process_store(input, &tmp_odd_buf[0]); + idct8x32_column_butterfly_addblk(&tmp_eve_buf[0], &tmp_odd_buf[0], dst, + dst_stride); +} + +void vpx_idct32x32_1024_add_lsx(const int16_t *input, uint8_t *dst, + int32_t dst_stride) { + int32_t i; + DECLARE_ALIGNED(32, int16_t, out_arr[32 * 32]); + int16_t *out_ptr = out_arr; + + /* transform rows */ + for (i = 0; i < 4; ++i) { + /* process 32 * 8 block */ + idct32x8_1d_rows_lsx((input + (i << 8)), (out_ptr + (i << 8))); + } + + for (i = 0; i < 4; ++i) { + /* process 8 * 32 block */ + idct8x32_1d_columns_addblk_lsx((out_ptr + (i << 3)), (dst + (i << 3)), + dst_stride); + } +} + +void vpx_idct32x32_34_add_lsx(const int16_t *input, uint8_t *dst, + int32_t dst_stride) { + int32_t i; + DECLARE_ALIGNED(32, int16_t, out_arr[32 * 32]); + int16_t *out_ptr = out_arr; + __m128i zero = __lsx_vldi(0); + + for (i = 32; i--;) { + __lsx_vst(zero, out_ptr, 0); + __lsx_vst(zero, out_ptr, 16); + __lsx_vst(zero, out_ptr, 32); + __lsx_vst(zero, out_ptr, 48); + out_ptr += 32; + } + + out_ptr = out_arr; + + /* rows: only upper-left 8x8 has non-zero coeff */ + idct32x8_1d_rows_lsx(input, out_ptr); + + /* transform columns */ + for (i = 0; i < 4; ++i) { + /* process 8 * 32 block */ + idct8x32_1d_columns_addblk_lsx((out_ptr + (i << 3)), (dst + (i << 3)), + dst_stride); + } +} + +void vpx_idct32x32_1_add_lsx(const int16_t *input, uint8_t *dst, + int32_t dst_stride) { + int32_t i; + int16_t out; + __m128i dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3; + __m128i res0, res1, res2, res3, res4, res5, res6, res7, vec; + + out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), DCT_CONST_BITS); + out = ROUND_POWER_OF_TWO((out * cospi_16_64), DCT_CONST_BITS); + out = ROUND_POWER_OF_TWO(out, 6); + + vec = __lsx_vreplgr2vr_h(out); + + for (i = 16; i--;) { + DUP2_ARG2(__lsx_vld, dst, 0, dst, 16, dst0, dst1); + dst2 = __lsx_vldx(dst, dst_stride); + dst3 = __lsx_vldx(dst + 16, dst_stride); + + UNPCK_UB_SH(dst0, res0, res4); + UNPCK_UB_SH(dst1, res1, res5); + UNPCK_UB_SH(dst2, res2, res6); + UNPCK_UB_SH(dst3, res3, res7); + + DUP4_ARG2(__lsx_vadd_h, res0, vec, res1, vec, res2, vec, res3, vec, res0, + res1, res2, res3); + DUP4_ARG2(__lsx_vadd_h, res4, vec, res5, vec, res6, vec, res7, vec, res4, + res5, res6, res7); + DUP4_ARG3(__lsx_vssrarni_bu_h, res4, res0, 0, res5, res1, 0, res6, res2, 0, + res7, res3, 0, tmp0, tmp1, tmp2, tmp3); + __lsx_vst(tmp0, dst, 0); + __lsx_vst(tmp1, dst, 16); + dst += dst_stride; + __lsx_vst(tmp2, dst, 0); + __lsx_vst(tmp3, dst, 16); + dst += dst_stride; + } +} diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index 6da133e383..3eba23c0af 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -269,6 +269,8 @@ DSP_SRCS-$(HAVE_DSPR2) += mips/itrans8_dspr2.c DSP_SRCS-$(HAVE_DSPR2) += mips/itrans16_dspr2.c DSP_SRCS-$(HAVE_DSPR2) += mips/itrans32_dspr2.c DSP_SRCS-$(HAVE_DSPR2) += mips/itrans32_cols_dspr2.c + +DSP_SRCS-$(HAVE_LSX) += loongarch/idct32x32_lsx.c else # CONFIG_VP9_HIGHBITDEPTH DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct4x4_add_neon.c DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct8x8_add_neon.c diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 436bee94dd..7d78dc72ac 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -652,12 +652,12 @@ () $vpx_idct16x16_38_add_msa=vpx_idct16x16_256_add_msa; specialize qw/vpx_idct16x16_10_add dspr2 msa/; specialize qw/vpx_idct16x16_1_add dspr2 msa/; - specialize qw/vpx_idct32x32_1024_add dspr2 msa/; + specialize qw/vpx_idct32x32_1024_add dspr2 msa lsx/; specialize qw/vpx_idct32x32_135_add dspr2 msa/; $vpx_idct32x32_135_add_dspr2=vpx_idct32x32_1024_add_dspr2; $vpx_idct32x32_135_add_msa=vpx_idct32x32_1024_add_msa; - specialize qw/vpx_idct32x32_34_add dspr2 msa/; - specialize qw/vpx_idct32x32_1_add dspr2 msa/; + specialize qw/vpx_idct32x32_34_add dspr2 msa lsx/; + specialize qw/vpx_idct32x32_1_add dspr2 msa lsx/; specialize qw/vpx_iwht4x4_16_add msa/; specialize qw/vpx_iwht4x4_1_add msa/; } # !CONFIG_VP9_HIGHBITDEPTH From c8b9bf2b289a5755c0cc1187ebf07e7af75ef37d Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 15 Apr 2022 22:23:05 -0700 Subject: [PATCH 0328/1000] vp8: fix some implicit unsigned -> int conversions fixes some warnings with clang-13 -fsanitize=integer: vp8/decoder/threading.c:77:27: runtime error: implicit conversion from type 'unsigned int' of value 4294967295 (32-bit, unsigned) to type 'int' changed the value to -1 (32-bit, signed) these bitmask constants were missed in: 1676cddaa vp8: fix some implicit signed -> unsigned conv warnings Bug: webm:1759 Change-Id: I5d894d08fd41e32b91b56a4d91276837b3415ee4 --- vp8/decoder/threading.c | 4 ++-- vp8/encoder/ethreading.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/vp8/decoder/threading.c b/vp8/decoder/threading.c index 491e2ce4c1..490f62d1b3 100644 --- a/vp8/decoder/threading.c +++ b/vp8/decoder/threading.c @@ -74,9 +74,9 @@ static void setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd, memcpy(mbd->dequant_y2, xd->dequant_y2, sizeof(xd->dequant_y2)); memcpy(mbd->dequant_uv, xd->dequant_uv, sizeof(xd->dequant_uv)); - mbd->fullpixel_mask = 0xffffffff; + mbd->fullpixel_mask = ~0; - if (pc->full_pixel) mbd->fullpixel_mask = 0xfffffff8; + if (pc->full_pixel) mbd->fullpixel_mask = ~7; } for (i = 0; i < pc->mb_rows; ++i) diff --git a/vp8/encoder/ethreading.c b/vp8/encoder/ethreading.c index 55a1528b14..cb35f4f491 100644 --- a/vp8/encoder/ethreading.c +++ b/vp8/encoder/ethreading.c @@ -470,8 +470,8 @@ void vp8cx_init_mbrthread_data(VP8_COMP *cpi, MACROBLOCK *x, setup_mbby_copy(&mbr_ei[i].mb, x); - mbd->fullpixel_mask = 0xffffffff; - if (cm->full_pixel) mbd->fullpixel_mask = 0xfffffff8; + mbd->fullpixel_mask = ~0; + if (cm->full_pixel) mbd->fullpixel_mask = ~7; vp8_zero(mb->coef_counts); vp8_zero(x->ymode_count); From 9750257826bea4c73557f0612b24e9b85baf7031 Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 15 Apr 2022 22:29:31 -0700 Subject: [PATCH 0329/1000] vp8,get_sub_mv_ref_prob: change arguments to uint32_t this matches the call with int_mv::as_int and fixes a warning with clang-13 -fsanitize=integer: vp8/decoder/decodemv.c:240:32: runtime error: implicit conversion from type 'uint32_t' (aka 'unsigned int') of value 4282515456 (32-bit, unsigned) to type 'int' changed the value to -12451840 (32-bit, signed) Bug: webm:1759 Change-Id: I7c0aa72baa45421929afac26566e149adc6669d7 --- vp8/decoder/decodemv.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vp8/decoder/decodemv.c b/vp8/decoder/decodemv.c index 94373852dc..51817a2cb9 100644 --- a/vp8/decoder/decodemv.c +++ b/vp8/decoder/decodemv.c @@ -173,7 +173,8 @@ const vp8_prob vp8_sub_mv_ref_prob3[8][VP8_SUBMVREFS - 1] = { { 208, 1, 1 } /* SUBMVREF_LEFT_ABOVE_ZED */ }; -static const vp8_prob *get_sub_mv_ref_prob(const int left, const int above) { +static const vp8_prob *get_sub_mv_ref_prob(const uint32_t left, + const uint32_t above) { int lez = (left == 0); int aez = (above == 0); int lea = (left == above); From 946bcdf9069c980edd4edad5721262efe26f75ba Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 15 Apr 2022 16:45:26 -0700 Subject: [PATCH 0330/1000] Upgrade GoogleTest to v1.11.0 The release tag is release-1.11.0. Ref: https://aomedia-review.googlesource.com/c/aom/+/156641 79c98a122 Upgrade GoogleTest to v1.11.0 Note the tree structure differs from libaom, but is left untouched to avoid breaking test include paths in this commit. Change-Id: Ia3c6861d45a3befc2decb1da5b1018bcfd38f95a --- third_party/googletest/README.libvpx | 8 +- third_party/googletest/src/CONTRIBUTORS | 25 + third_party/googletest/src/README.md | 169 ++- .../src/include/gtest/gtest-death-test.h | 45 +- .../src/include/gtest/gtest-matchers.h | 372 +++++-- .../src/include/gtest/gtest-message.h | 6 +- .../src/include/gtest/gtest-param-test.h | 14 +- .../src/include/gtest/gtest-printers.h | 654 +++++++----- .../googletest/src/include/gtest/gtest-spi.h | 6 +- .../src/include/gtest/gtest-test-part.h | 6 +- .../src/include/gtest/gtest-typed-test.h | 14 +- .../googletest/src/include/gtest/gtest.h | 90 +- .../src/include/gtest/gtest_pred_impl.h | 6 +- .../googletest/src/include/gtest/gtest_prod.h | 6 +- .../gtest/internal/custom/gtest-port.h | 6 +- .../gtest/internal/custom/gtest-printers.h | 6 +- .../src/include/gtest/internal/custom/gtest.h | 6 +- .../internal/gtest-death-test-internal.h | 6 +- .../include/gtest/internal/gtest-filepath.h | 8 +- .../include/gtest/internal/gtest-internal.h | 266 +++-- .../include/gtest/internal/gtest-param-util.h | 43 +- .../include/gtest/internal/gtest-port-arch.h | 9 +- .../src/include/gtest/internal/gtest-port.h | 175 +++- .../src/include/gtest/internal/gtest-string.h | 9 +- .../include/gtest/internal/gtest-type-util.h | 48 +- .../googletest/src/src/gtest-death-test.cc | 63 +- .../googletest/src/src/gtest-filepath.cc | 45 +- .../googletest/src/src/gtest-internal-inl.h | 35 +- third_party/googletest/src/src/gtest-port.cc | 56 +- .../googletest/src/src/gtest-printers.cc | 197 +++- .../googletest/src/src/gtest-typed-test.cc | 16 +- third_party/googletest/src/src/gtest.cc | 989 ++++++++++++------ 32 files changed, 2220 insertions(+), 1184 deletions(-) diff --git a/third_party/googletest/README.libvpx b/third_party/googletest/README.libvpx index ed55fb09f9..b9a74922f0 100644 --- a/third_party/googletest/README.libvpx +++ b/third_party/googletest/README.libvpx @@ -1,5 +1,5 @@ URL: https://github.com/google/googletest.git -Version: release-1.10.0-224-g23b2a3b1 +Version: release-1.11.0 License: BSD License File: LICENSE @@ -13,11 +13,9 @@ generation. Local Modifications: - Remove everything but: + CONTRIBUTORS googletest/ - CONTRIBUTORS include - LICENSE README.md src -- Enable kErrorOnUninstantiatedParameterizedTest and - kErrorOnUninstantiatedTypeParameterizedTest in gtest.cc + LICENSE diff --git a/third_party/googletest/src/CONTRIBUTORS b/third_party/googletest/src/CONTRIBUTORS index 1e4afe2182..76db0b40ff 100644 --- a/third_party/googletest/src/CONTRIBUTORS +++ b/third_party/googletest/src/CONTRIBUTORS @@ -5,34 +5,59 @@ Ajay Joshi Balázs Dán +Benoit Sigoure Bharat Mediratta +Bogdan Piloca Chandler Carruth Chris Prince Chris Taylor Dan Egnor +Dave MacLachlan +David Anderson +Dean Sturtevant Eric Roman +Gene Volovich Hady Zalek +Hal Burch Jeffrey Yasskin +Jim Keller +Joe Walnes +Jon Wray Jói Sigurðsson Keir Mierle Keith Ray Kenton Varda +Kostya Serebryany Krystian Kuzniarek +Lev Makhlis Manuel Klimek +Mario Tanev +Mark Paskin Markus Heule +Matthew Simmons Mika Raento +Mike Bland Miklós Fazekas +Neal Norwitz +Nermin Ozkiranartli +Owen Carlsen +Paneendra Ba Pasi Valminen Patrick Hanna Patrick Riley +Paul Menage Peter Kaminski +Piotr Kaminski Preston Jackson Rainer Klaffenboeck Russ Cox Russ Rufer Sean Mcafee Sigurður Ásgeirsson +Sverre Sundsdal +Takeshi Yoshino Tracy Bialik Vadim Berman Vlad Losev +Wolfgang Klier Zhanyong Wan diff --git a/third_party/googletest/src/README.md b/third_party/googletest/src/README.md index 904048f484..1f8b349ae7 100644 --- a/third_party/googletest/src/README.md +++ b/third_party/googletest/src/README.md @@ -2,39 +2,51 @@ #### Setup -To build Google Test and your tests that use it, you need to tell your build +To build GoogleTest and your tests that use it, you need to tell your build system where to find its headers and source files. The exact way to do it depends on which build system you use, and is usually straightforward. ### Build with CMake -Google Test comes with a CMake build script +GoogleTest comes with a CMake build script ([CMakeLists.txt](https://github.com/google/googletest/blob/master/CMakeLists.txt)) that can be used on a wide range of platforms ("C" stands for cross-platform.). If you don't have CMake installed already, you can download it for free from . CMake works by generating native makefiles or build projects that can be used in -the compiler environment of your choice. You can either build Google Test as a +the compiler environment of your choice. You can either build GoogleTest as a standalone project or it can be incorporated into an existing CMake build for another project. #### Standalone CMake Project -When building Google Test as a standalone project, the typical workflow starts -with: +When building GoogleTest as a standalone project, the typical workflow starts +with - mkdir mybuild # Create a directory to hold the build output. - cd mybuild - cmake ${GTEST_DIR} # Generate native build scripts. +``` +git clone https://github.com/google/googletest.git -b release-1.10.0 +cd googletest # Main directory of the cloned repository. +mkdir build # Create a directory to hold the build output. +cd build +cmake .. # Generate native build scripts for GoogleTest. +``` -If you want to build Google Test's samples, you should replace the last command -with +The above command also includes GoogleMock by default. And so, if you want to +build only GoogleTest, you should replace the last command with - cmake -Dgtest_build_samples=ON ${GTEST_DIR} +``` +cmake .. -DBUILD_GMOCK=OFF +``` If you are on a \*nix system, you should now see a Makefile in the current -directory. Just type 'make' to build gtest. +directory. Just type `make` to build GoogleTest. And then you can simply install +GoogleTest if you are a system administrator. + +``` +make +sudo make install # Install in /usr/local/ by default +``` If you use Windows and have Visual Studio installed, a `gtest.sln` file and several `.vcproj` files will be created. You can then build them using Visual @@ -44,13 +56,19 @@ On Mac OS X with Xcode installed, a `.xcodeproj` file will be generated. #### Incorporating Into An Existing CMake Project -If you want to use gtest in a project which already uses CMake, then a more -robust and flexible approach is to build gtest as part of that project directly. -This is done by making the GoogleTest source code available to the main build -and adding it using CMake's `add_subdirectory()` command. This has the -significant advantage that the same compiler and linker settings are used -between gtest and the rest of your project, so issues associated with using -incompatible libraries (eg debug/release), etc. are avoided. This is +If you want to use GoogleTest in a project which already uses CMake, the easiest +way is to get installed libraries and headers. + +* Import GoogleTest by using `find_package` (or `pkg_check_modules`). For + example, if `find_package(GTest CONFIG REQUIRED)` succeeds, you can use the + libraries as `GTest::gtest`, `GTest::gmock`. + +And a more robust and flexible approach is to build GoogleTest as part of that +project directly. This is done by making the GoogleTest source code available to +the main build and adding it using CMake's `add_subdirectory()` command. This +has the significant advantage that the same compiler and linker settings are +used between GoogleTest and the rest of your project, so issues associated with +using incompatible libraries (eg debug/release), etc. are avoided. This is particularly useful on Windows. Making GoogleTest's source code available to the main build can be done a few different ways: @@ -64,68 +82,23 @@ main build can be done a few different ways: possible or appropriate. Git submodules, for example, have their own set of advantages and drawbacks. * Use CMake to download GoogleTest as part of the build's configure step. This - is just a little more complex, but doesn't have the limitations of the other - methods. + approach doesn't have the limitations of the other methods. -The last of the above methods is implemented with a small piece of CMake code in -a separate file (e.g. `CMakeLists.txt.in`) which is copied to the build area and -then invoked as a sub-build _during the CMake stage_. That directory is then -pulled into the main build with `add_subdirectory()`. For example: +The last of the above methods is implemented with a small piece of CMake code +that downloads and pulls the GoogleTest code into the main build. -New file `CMakeLists.txt.in`: +Just add to your `CMakeLists.txt`: ```cmake -cmake_minimum_required(VERSION 2.8.2) - -project(googletest-download NONE) - -include(ExternalProject) -ExternalProject_Add(googletest - GIT_REPOSITORY https://github.com/google/googletest.git - GIT_TAG master - SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/googletest-src" - BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/googletest-build" - CONFIGURE_COMMAND "" - BUILD_COMMAND "" - INSTALL_COMMAND "" - TEST_COMMAND "" +include(FetchContent) +FetchContent_Declare( + googletest + # Specify the commit you depend on and update it regularly. + URL https://github.com/google/googletest/archive/609281088cfefc76f9d0ce82e1ff6c30cc3591e5.zip ) -``` - -Existing build's `CMakeLists.txt`: - -```cmake -# Download and unpack googletest at configure time -configure_file(CMakeLists.txt.in googletest-download/CMakeLists.txt) -execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" . - RESULT_VARIABLE result - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/googletest-download ) -if(result) - message(FATAL_ERROR "CMake step for googletest failed: ${result}") -endif() -execute_process(COMMAND ${CMAKE_COMMAND} --build . - RESULT_VARIABLE result - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/googletest-download ) -if(result) - message(FATAL_ERROR "Build step for googletest failed: ${result}") -endif() - -# Prevent overriding the parent project's compiler/linker -# settings on Windows +# For Windows: Prevent overriding the parent project's compiler/linker settings set(gtest_force_shared_crt ON CACHE BOOL "" FORCE) - -# Add googletest directly to our build. This defines -# the gtest and gtest_main targets. -add_subdirectory(${CMAKE_CURRENT_BINARY_DIR}/googletest-src - ${CMAKE_CURRENT_BINARY_DIR}/googletest-build - EXCLUDE_FROM_ALL) - -# The gtest/gtest_main targets carry header search path -# dependencies automatically when using CMake 2.8.11 or -# later. Otherwise we have to add them here ourselves. -if (CMAKE_VERSION VERSION_LESS 2.8.11) - include_directories("${gtest_SOURCE_DIR}/include") -endif() +FetchContent_MakeAvailable(googletest) # Now simply link against gtest or gtest_main as needed. Eg add_executable(example example.cpp) @@ -133,20 +106,18 @@ target_link_libraries(example gtest_main) add_test(NAME example_test COMMAND example) ``` -Note that this approach requires CMake 2.8.2 or later due to its use of the -`ExternalProject_Add()` command. The above technique is discussed in more detail -in [this separate article](http://crascit.com/2015/07/25/cmake-gtest/) which -also contains a link to a fully generalized implementation of the technique. +Note that this approach requires CMake 3.14 or later due to its use of the +`FetchContent_MakeAvailable()` command. ##### Visual Studio Dynamic vs Static Runtimes By default, new Visual Studio projects link the C runtimes dynamically but -Google Test links them statically. This will generate an error that looks +GoogleTest links them statically. This will generate an error that looks something like the following: gtest.lib(gtest-all.obj) : error LNK2038: mismatch detected for 'RuntimeLibrary': value 'MTd_StaticDebug' doesn't match value 'MDd_DynamicDebug' in main.obj -Google Test already has a CMake option for this: `gtest_force_shared_crt` +GoogleTest already has a CMake option for this: `gtest_force_shared_crt` Enabling this option will make gtest link the runtimes dynamically too, and match the project in which it is included. @@ -154,17 +125,17 @@ match the project in which it is included. #### C++ Standard Version An environment that supports C++11 is required in order to successfully build -Google Test. One way to ensure this is to specify the standard in the top-level +GoogleTest. One way to ensure this is to specify the standard in the top-level project, for example by using the `set(CMAKE_CXX_STANDARD 11)` command. If this -is not feasible, for example in a C project using Google Test for validation, +is not feasible, for example in a C project using GoogleTest for validation, then it can be specified by adding it to the options for cmake via the `DCMAKE_CXX_FLAGS` option. -### Tweaking Google Test +### Tweaking GoogleTest -Google Test can be used in diverse environments. The default configuration may +GoogleTest can be used in diverse environments. The default configuration may not work (or may not work well) out of the box in some environments. However, -you can easily tweak Google Test by defining control macros on the compiler +you can easily tweak GoogleTest by defining control macros on the compiler command line. Generally, these macros are named like `GTEST_XYZ` and you define them to either 1 or 0 to enable or disable a certain feature. @@ -173,12 +144,12 @@ We list the most frequently used macros below. For a complete list, see file ### Multi-threaded Tests -Google Test is thread-safe where the pthread library is available. After +GoogleTest is thread-safe where the pthread library is available. After `#include "gtest/gtest.h"`, you can check the `GTEST_IS_THREADSAFE` macro to see whether this is the case (yes if the macro is `#defined` to 1, no if it's undefined.). -If Google Test doesn't correctly detect whether pthread is available in your +If GoogleTest doesn't correctly detect whether pthread is available in your environment, you can force it with -DGTEST_HAS_PTHREAD=1 @@ -187,16 +158,16 @@ or -DGTEST_HAS_PTHREAD=0 -When Google Test uses pthread, you may need to add flags to your compiler and/or +When GoogleTest uses pthread, you may need to add flags to your compiler and/or linker to select the pthread library, or you'll get link errors. If you use the -CMake script or the deprecated Autotools script, this is taken care of for you. -If you use your own build script, you'll need to read your compiler and linker's -manual to figure out what flags to add. +CMake script, this is taken care of for you. If you use your own build script, +you'll need to read your compiler and linker's manual to figure out what flags +to add. ### As a Shared Library (DLL) -Google Test is compact, so most users can build and link it as a static library -for the simplicity. You can choose to use Google Test as a shared library (known +GoogleTest is compact, so most users can build and link it as a static library +for the simplicity. You can choose to use GoogleTest as a shared library (known as a DLL on Windows) if you prefer. To compile *gtest* as a shared library, add @@ -216,22 +187,22 @@ Note: while the above steps aren't technically necessary today when using some compilers (e.g. GCC), they may become necessary in the future, if we decide to improve the speed of loading the library (see for details). Therefore you are recommended -to always add the above flags when using Google Test as a shared library. -Otherwise a future release of Google Test may break your build script. +to always add the above flags when using GoogleTest as a shared library. +Otherwise a future release of GoogleTest may break your build script. ### Avoiding Macro Name Clashes In C++, macros don't obey namespaces. Therefore two libraries that both define a macro of the same name will clash if you `#include` both definitions. In case a -Google Test macro clashes with another library, you can force Google Test to +GoogleTest macro clashes with another library, you can force GoogleTest to rename its macro to avoid the conflict. -Specifically, if both Google Test and some other code define macro FOO, you can +Specifically, if both GoogleTest and some other code define macro FOO, you can add -DGTEST_DONT_DEFINE_FOO=1 -to the compiler flags to tell Google Test to change the macro's name from `FOO` +to the compiler flags to tell GoogleTest to change the macro's name from `FOO` to `GTEST_FOO`. Currently `FOO` can be `FAIL`, `SUCCEED`, or `TEST`. For example, with `-DGTEST_DONT_DEFINE_TEST=1`, you'll need to write diff --git a/third_party/googletest/src/include/gtest/gtest-death-test.h b/third_party/googletest/src/include/gtest/gtest-death-test.h index dc878ffbb3..9b4d4d1337 100644 --- a/third_party/googletest/src/include/gtest/gtest-death-test.h +++ b/third_party/googletest/src/include/gtest/gtest-death-test.h @@ -35,8 +35,8 @@ // directly. // GOOGLETEST_CM0001 DO NOT DELETE -#ifndef GTEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_ -#define GTEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_ +#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_ +#define GOOGLETEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_ #include "gtest/internal/gtest-death-test-internal.h" @@ -97,6 +97,10 @@ GTEST_API_ bool InDeathTestChild(); // // ASSERT_EXIT(client.HangUpServer(), KilledBySIGHUP, "Hanging up!"); // +// The final parameter to each of these macros is a matcher applied to any data +// the sub-process wrote to stderr. For compatibility with existing tests, a +// bare string is interpreted as a regular expression matcher. +// // On the regular expressions used in death tests: // // GOOGLETEST_CM0005 DO NOT DELETE @@ -162,27 +166,27 @@ GTEST_API_ bool InDeathTestChild(); // directory in PATH. // -// Asserts that a given statement causes the program to exit, with an -// integer exit status that satisfies predicate, and emitting error output -// that matches regex. -# define ASSERT_EXIT(statement, predicate, regex) \ - GTEST_DEATH_TEST_(statement, predicate, regex, GTEST_FATAL_FAILURE_) +// Asserts that a given `statement` causes the program to exit, with an +// integer exit status that satisfies `predicate`, and emitting error output +// that matches `matcher`. +# define ASSERT_EXIT(statement, predicate, matcher) \ + GTEST_DEATH_TEST_(statement, predicate, matcher, GTEST_FATAL_FAILURE_) -// Like ASSERT_EXIT, but continues on to successive tests in the +// Like `ASSERT_EXIT`, but continues on to successive tests in the // test suite, if any: -# define EXPECT_EXIT(statement, predicate, regex) \ - GTEST_DEATH_TEST_(statement, predicate, regex, GTEST_NONFATAL_FAILURE_) +# define EXPECT_EXIT(statement, predicate, matcher) \ + GTEST_DEATH_TEST_(statement, predicate, matcher, GTEST_NONFATAL_FAILURE_) -// Asserts that a given statement causes the program to exit, either by +// Asserts that a given `statement` causes the program to exit, either by // explicitly exiting with a nonzero exit code or being killed by a -// signal, and emitting error output that matches regex. -# define ASSERT_DEATH(statement, regex) \ - ASSERT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, regex) +// signal, and emitting error output that matches `matcher`. +# define ASSERT_DEATH(statement, matcher) \ + ASSERT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, matcher) -// Like ASSERT_DEATH, but continues on to successive tests in the +// Like `ASSERT_DEATH`, but continues on to successive tests in the // test suite, if any: -# define EXPECT_DEATH(statement, regex) \ - EXPECT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, regex) +# define EXPECT_DEATH(statement, matcher) \ + EXPECT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, matcher) // Two predicate classes that can be used in {ASSERT,EXPECT}_EXIT*: @@ -190,11 +194,10 @@ GTEST_API_ bool InDeathTestChild(); class GTEST_API_ ExitedWithCode { public: explicit ExitedWithCode(int exit_code); + ExitedWithCode(const ExitedWithCode&) = default; + void operator=(const ExitedWithCode& other) = delete; bool operator()(int exit_status) const; private: - // No implementation - assignment is unsupported. - void operator=(const ExitedWithCode& other); - const int exit_code_; }; @@ -340,4 +343,4 @@ class GTEST_API_ KilledBySignal { } // namespace testing -#endif // GTEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_ +#endif // GOOGLETEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_ diff --git a/third_party/googletest/src/include/gtest/gtest-matchers.h b/third_party/googletest/src/include/gtest/gtest-matchers.h index a61cef4093..9fa34a05ba 100644 --- a/third_party/googletest/src/include/gtest/gtest-matchers.h +++ b/third_party/googletest/src/include/gtest/gtest-matchers.h @@ -32,13 +32,10 @@ // This file implements just enough of the matcher interface to allow // EXPECT_DEATH and friends to accept a matcher argument. -// IWYU pragma: private, include "testing/base/public/gunit.h" -// IWYU pragma: friend third_party/googletest/googlemock/.* -// IWYU pragma: friend third_party/googletest/googletest/.* - -#ifndef GTEST_INCLUDE_GTEST_GTEST_MATCHERS_H_ -#define GTEST_INCLUDE_GTEST_GTEST_MATCHERS_H_ +#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_MATCHERS_H_ +#define GOOGLETEST_INCLUDE_GTEST_GTEST_MATCHERS_H_ +#include #include #include #include @@ -63,20 +60,16 @@ GTEST_DISABLE_MSC_WARNINGS_PUSH_( namespace testing { // To implement a matcher Foo for type T, define: -// 1. a class FooMatcherImpl that implements the -// MatcherInterface interface, and -// 2. a factory function that creates a Matcher object from a -// FooMatcherImpl*. -// -// The two-level delegation design makes it possible to allow a user -// to write "v" instead of "Eq(v)" where a Matcher is expected, which -// is impossible if we pass matchers by pointers. It also eases -// ownership management as Matcher objects can now be copied like -// plain values. - -// MatchResultListener is an abstract class. Its << operator can be -// used by a matcher to explain why a value matches or doesn't match. +// 1. a class FooMatcherMatcher that implements the matcher interface: +// using is_gtest_matcher = void; +// bool MatchAndExplain(const T&, std::ostream*); +// (MatchResultListener* can also be used instead of std::ostream*) +// void DescribeTo(std::ostream*); +// void DescribeNegationTo(std::ostream*); // +// 2. a factory function that creates a Matcher object from a +// FooMatcherMatcher. + class MatchResultListener { public: // Creates a listener object with the given underlying ostream. The @@ -113,7 +106,7 @@ inline MatchResultListener::~MatchResultListener() { // An instance of a subclass of this knows how to describe itself as a // matcher. -class MatcherDescriberInterface { +class GTEST_API_ MatcherDescriberInterface { public: virtual ~MatcherDescriberInterface() {} @@ -181,31 +174,6 @@ class MatcherInterface : public MatcherDescriberInterface { namespace internal { -// Converts a MatcherInterface to a MatcherInterface. -template -class MatcherInterfaceAdapter : public MatcherInterface { - public: - explicit MatcherInterfaceAdapter(const MatcherInterface* impl) - : impl_(impl) {} - ~MatcherInterfaceAdapter() override { delete impl_; } - - void DescribeTo(::std::ostream* os) const override { impl_->DescribeTo(os); } - - void DescribeNegationTo(::std::ostream* os) const override { - impl_->DescribeNegationTo(os); - } - - bool MatchAndExplain(const T& x, - MatchResultListener* listener) const override { - return impl_->MatchAndExplain(x, listener); - } - - private: - const MatcherInterface* const impl_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(MatcherInterfaceAdapter); -}; - struct AnyEq { template bool operator()(const A& a, const B& b) const { return a == b; } @@ -252,16 +220,35 @@ class StreamMatchResultListener : public MatchResultListener { GTEST_DISALLOW_COPY_AND_ASSIGN_(StreamMatchResultListener); }; +struct SharedPayloadBase { + std::atomic ref{1}; + void Ref() { ref.fetch_add(1, std::memory_order_relaxed); } + bool Unref() { return ref.fetch_sub(1, std::memory_order_acq_rel) == 1; } +}; + +template +struct SharedPayload : SharedPayloadBase { + explicit SharedPayload(const T& v) : value(v) {} + explicit SharedPayload(T&& v) : value(std::move(v)) {} + + static void Destroy(SharedPayloadBase* shared) { + delete static_cast(shared); + } + + T value; +}; + // An internal class for implementing Matcher, which will derive // from it. We put functionalities common to all Matcher // specializations here to avoid code duplication. template -class MatcherBase { +class MatcherBase : private MatcherDescriberInterface { public: // Returns true if and only if the matcher matches x; also explains the // match result to 'listener'. bool MatchAndExplain(const T& x, MatchResultListener* listener) const { - return impl_->MatchAndExplain(x, listener); + GTEST_CHECK_(vtable_ != nullptr); + return vtable_->match_and_explain(*this, x, listener); } // Returns true if and only if this matcher matches x. @@ -271,11 +258,15 @@ class MatcherBase { } // Describes this matcher to an ostream. - void DescribeTo(::std::ostream* os) const { impl_->DescribeTo(os); } + void DescribeTo(::std::ostream* os) const final { + GTEST_CHECK_(vtable_ != nullptr); + vtable_->describe(*this, os, false); + } // Describes the negation of this matcher to an ostream. - void DescribeNegationTo(::std::ostream* os) const { - impl_->DescribeNegationTo(os); + void DescribeNegationTo(::std::ostream* os) const final { + GTEST_CHECK_(vtable_ != nullptr); + vtable_->describe(*this, os, true); } // Explains why x matches, or doesn't match, the matcher. @@ -288,31 +279,194 @@ class MatcherBase { // of the describer, which is only guaranteed to be alive when // this matcher object is alive. const MatcherDescriberInterface* GetDescriber() const { - return impl_.get(); + if (vtable_ == nullptr) return nullptr; + return vtable_->get_describer(*this); } protected: - MatcherBase() {} + MatcherBase() : vtable_(nullptr) {} // Constructs a matcher from its implementation. - explicit MatcherBase(const MatcherInterface* impl) : impl_(impl) {} - template - explicit MatcherBase( - const MatcherInterface* impl, - typename std::enable_if::value>::type* = - nullptr) - : impl_(new internal::MatcherInterfaceAdapter(impl)) {} + explicit MatcherBase(const MatcherInterface* impl) { + Init(impl); + } + + template ::type::is_gtest_matcher> + MatcherBase(M&& m) { // NOLINT + Init(std::forward(m)); + } + + MatcherBase(const MatcherBase& other) + : vtable_(other.vtable_), buffer_(other.buffer_) { + if (IsShared()) buffer_.shared->Ref(); + } + + MatcherBase& operator=(const MatcherBase& other) { + if (this == &other) return *this; + Destroy(); + vtable_ = other.vtable_; + buffer_ = other.buffer_; + if (IsShared()) buffer_.shared->Ref(); + return *this; + } - MatcherBase(const MatcherBase&) = default; - MatcherBase& operator=(const MatcherBase&) = default; - MatcherBase(MatcherBase&&) = default; - MatcherBase& operator=(MatcherBase&&) = default; + MatcherBase(MatcherBase&& other) + : vtable_(other.vtable_), buffer_(other.buffer_) { + other.vtable_ = nullptr; + } + + MatcherBase& operator=(MatcherBase&& other) { + if (this == &other) return *this; + Destroy(); + vtable_ = other.vtable_; + buffer_ = other.buffer_; + other.vtable_ = nullptr; + return *this; + } - virtual ~MatcherBase() {} + ~MatcherBase() override { Destroy(); } private: - std::shared_ptr> impl_; + struct VTable { + bool (*match_and_explain)(const MatcherBase&, const T&, + MatchResultListener*); + void (*describe)(const MatcherBase&, std::ostream*, bool negation); + // Returns the captured object if it implements the interface, otherwise + // returns the MatcherBase itself. + const MatcherDescriberInterface* (*get_describer)(const MatcherBase&); + // Called on shared instances when the reference count reaches 0. + void (*shared_destroy)(SharedPayloadBase*); + }; + + bool IsShared() const { + return vtable_ != nullptr && vtable_->shared_destroy != nullptr; + } + + // If the implementation uses a listener, call that. + template + static auto MatchAndExplainImpl(const MatcherBase& m, const T& value, + MatchResultListener* listener) + -> decltype(P::Get(m).MatchAndExplain(value, listener->stream())) { + return P::Get(m).MatchAndExplain(value, listener->stream()); + } + + template + static auto MatchAndExplainImpl(const MatcherBase& m, const T& value, + MatchResultListener* listener) + -> decltype(P::Get(m).MatchAndExplain(value, listener)) { + return P::Get(m).MatchAndExplain(value, listener); + } + + template + static void DescribeImpl(const MatcherBase& m, std::ostream* os, + bool negation) { + if (negation) { + P::Get(m).DescribeNegationTo(os); + } else { + P::Get(m).DescribeTo(os); + } + } + + template + static const MatcherDescriberInterface* GetDescriberImpl( + const MatcherBase& m) { + // If the impl is a MatcherDescriberInterface, then return it. + // Otherwise use MatcherBase itself. + // This allows us to implement the GetDescriber() function without support + // from the impl, but some users really want to get their impl back when + // they call GetDescriber(). + // We use std::get on a tuple as a workaround of not having `if constexpr`. + return std::get<( + std::is_convertible::value + ? 1 + : 0)>(std::make_tuple(&m, &P::Get(m))); + } + + template + const VTable* GetVTable() { + static constexpr VTable kVTable = {&MatchAndExplainImpl

, + &DescribeImpl

, &GetDescriberImpl

, + P::shared_destroy}; + return &kVTable; + } + + union Buffer { + // Add some types to give Buffer some common alignment/size use cases. + void* ptr; + double d; + int64_t i; + // And add one for the out-of-line cases. + SharedPayloadBase* shared; + }; + + void Destroy() { + if (IsShared() && buffer_.shared->Unref()) { + vtable_->shared_destroy(buffer_.shared); + } + } + + template + static constexpr bool IsInlined() { + return sizeof(M) <= sizeof(Buffer) && alignof(M) <= alignof(Buffer) && + std::is_trivially_copy_constructible::value && + std::is_trivially_destructible::value; + } + + template ()> + struct ValuePolicy { + static const M& Get(const MatcherBase& m) { + // When inlined along with Init, need to be explicit to avoid violating + // strict aliasing rules. + const M *ptr = static_cast( + static_cast(&m.buffer_)); + return *ptr; + } + static void Init(MatcherBase& m, M impl) { + ::new (static_cast(&m.buffer_)) M(impl); + } + static constexpr auto shared_destroy = nullptr; + }; + + template + struct ValuePolicy { + using Shared = SharedPayload; + static const M& Get(const MatcherBase& m) { + return static_cast(m.buffer_.shared)->value; + } + template + static void Init(MatcherBase& m, Arg&& arg) { + m.buffer_.shared = new Shared(std::forward(arg)); + } + static constexpr auto shared_destroy = &Shared::Destroy; + }; + + template + struct ValuePolicy*, B> { + using M = const MatcherInterface; + using Shared = SharedPayload>; + static const M& Get(const MatcherBase& m) { + return *static_cast(m.buffer_.shared)->value; + } + static void Init(MatcherBase& m, M* impl) { + m.buffer_.shared = new Shared(std::unique_ptr(impl)); + } + + static constexpr auto shared_destroy = &Shared::Destroy; + }; + + template + void Init(M&& m) { + using MM = typename std::decay::type; + using Policy = ValuePolicy; + vtable_ = GetVTable(); + Policy::Init(*this, std::forward(m)); + } + + const VTable* vtable_; + Buffer buffer_; }; } // namespace internal @@ -340,6 +494,10 @@ class Matcher : public internal::MatcherBase { nullptr) : internal::MatcherBase(impl) {} + template ::type::is_gtest_matcher> + Matcher(M&& m) : internal::MatcherBase(std::forward(m)) {} // NOLINT + // Implicit constructor here allows people to write // EXPECT_CALL(foo, Bar(5)) instead of EXPECT_CALL(foo, Bar(Eq(5))) sometimes Matcher(T value); // NOLINT @@ -357,6 +515,11 @@ class GTEST_API_ Matcher explicit Matcher(const MatcherInterface* impl) : internal::MatcherBase(impl) {} + template ::type::is_gtest_matcher> + Matcher(M&& m) // NOLINT + : internal::MatcherBase(std::forward(m)) {} + // Allows the user to write str instead of Eq(str) sometimes, where // str is a std::string object. Matcher(const std::string& s); // NOLINT @@ -376,6 +539,11 @@ class GTEST_API_ Matcher explicit Matcher(const MatcherInterface* impl) : internal::MatcherBase(impl) {} + template ::type::is_gtest_matcher> + Matcher(M&& m) // NOLINT + : internal::MatcherBase(std::forward(m)) {} + // Allows the user to write str instead of Eq(str) sometimes, where // str is a string object. Matcher(const std::string& s); // NOLINT @@ -397,6 +565,12 @@ class GTEST_API_ Matcher explicit Matcher(const MatcherInterface* impl) : internal::MatcherBase(impl) {} + template ::type::is_gtest_matcher> + Matcher(M&& m) // NOLINT + : internal::MatcherBase(std::forward(m)) { + } + // Allows the user to write str instead of Eq(str) sometimes, where // str is a std::string object. Matcher(const std::string& s); // NOLINT @@ -419,6 +593,11 @@ class GTEST_API_ Matcher explicit Matcher(const MatcherInterface* impl) : internal::MatcherBase(impl) {} + template ::type::is_gtest_matcher> + Matcher(M&& m) // NOLINT + : internal::MatcherBase(std::forward(m)) {} + // Allows the user to write str instead of Eq(str) sometimes, where // str is a std::string object. Matcher(const std::string& s); // NOLINT @@ -529,37 +708,32 @@ template class ComparisonBase { public: explicit ComparisonBase(const Rhs& rhs) : rhs_(rhs) {} + + using is_gtest_matcher = void; + template - operator Matcher() const { - return Matcher(new Impl(rhs_)); + bool MatchAndExplain(const Lhs& lhs, std::ostream*) const { + return Op()(lhs, Unwrap(rhs_)); + } + void DescribeTo(std::ostream* os) const { + *os << D::Desc() << " "; + UniversalPrint(Unwrap(rhs_), os); + } + void DescribeNegationTo(std::ostream* os) const { + *os << D::NegatedDesc() << " "; + UniversalPrint(Unwrap(rhs_), os); } private: template - static const T& Unwrap(const T& v) { return v; } + static const T& Unwrap(const T& v) { + return v; + } template - static const T& Unwrap(std::reference_wrapper v) { return v; } - - template - class Impl : public MatcherInterface { - public: - explicit Impl(const Rhs& rhs) : rhs_(rhs) {} - bool MatchAndExplain(Lhs lhs, - MatchResultListener* /* listener */) const override { - return Op()(lhs, Unwrap(rhs_)); - } - void DescribeTo(::std::ostream* os) const override { - *os << D::Desc() << " "; - UniversalPrint(Unwrap(rhs_), os); - } - void DescribeNegationTo(::std::ostream* os) const override { - *os << D::NegatedDesc() << " "; - UniversalPrint(Unwrap(rhs_), os); - } + static const T& Unwrap(std::reference_wrapper v) { + return v; + } - private: - Rhs rhs_; - }; Rhs rhs_; }; @@ -612,6 +786,10 @@ class GeMatcher : public ComparisonBase, Rhs, AnyGe> { static const char* NegatedDesc() { return "isn't >="; } }; +template ::value>::type> +using StringLike = T; + // Implements polymorphic matchers MatchesRegex(regex) and // ContainsRegex(regex), which can be used as a Matcher as long as // T can be converted to a string. @@ -672,9 +850,10 @@ inline PolymorphicMatcher MatchesRegex( const internal::RE* regex) { return MakePolymorphicMatcher(internal::MatchesRegexMatcher(regex, true)); } -inline PolymorphicMatcher MatchesRegex( - const std::string& regex) { - return MatchesRegex(new internal::RE(regex)); +template +PolymorphicMatcher MatchesRegex( + const internal::StringLike& regex) { + return MatchesRegex(new internal::RE(std::string(regex))); } // Matches a string that contains regular expression 'regex'. @@ -683,9 +862,10 @@ inline PolymorphicMatcher ContainsRegex( const internal::RE* regex) { return MakePolymorphicMatcher(internal::MatchesRegexMatcher(regex, false)); } -inline PolymorphicMatcher ContainsRegex( - const std::string& regex) { - return ContainsRegex(new internal::RE(regex)); +template +PolymorphicMatcher ContainsRegex( + const internal::StringLike& regex) { + return ContainsRegex(new internal::RE(std::string(regex))); } // Creates a polymorphic matcher that matches anything equal to x. @@ -747,4 +927,4 @@ inline internal::NeMatcher Ne(Rhs x) { GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 5046 -#endif // GTEST_INCLUDE_GTEST_GTEST_MATCHERS_H_ +#endif // GOOGLETEST_INCLUDE_GTEST_GTEST_MATCHERS_H_ diff --git a/third_party/googletest/src/include/gtest/gtest-message.h b/third_party/googletest/src/include/gtest/gtest-message.h index 21899232a2..becfd49fcb 100644 --- a/third_party/googletest/src/include/gtest/gtest-message.h +++ b/third_party/googletest/src/include/gtest/gtest-message.h @@ -44,8 +44,8 @@ // GOOGLETEST_CM0001 DO NOT DELETE -#ifndef GTEST_INCLUDE_GTEST_GTEST_MESSAGE_H_ -#define GTEST_INCLUDE_GTEST_GTEST_MESSAGE_H_ +#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_MESSAGE_H_ +#define GOOGLETEST_INCLUDE_GTEST_GTEST_MESSAGE_H_ #include #include @@ -216,4 +216,4 @@ std::string StreamableToString(const T& streamable) { GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 -#endif // GTEST_INCLUDE_GTEST_GTEST_MESSAGE_H_ +#endif // GOOGLETEST_INCLUDE_GTEST_GTEST_MESSAGE_H_ diff --git a/third_party/googletest/src/include/gtest/gtest-param-test.h b/third_party/googletest/src/include/gtest/gtest-param-test.h index 5b039df9f6..804e702817 100644 --- a/third_party/googletest/src/include/gtest/gtest-param-test.h +++ b/third_party/googletest/src/include/gtest/gtest-param-test.h @@ -30,12 +30,9 @@ // Macros and functions for implementing parameterized tests // in Google C++ Testing and Mocking Framework (Google Test) // -// This file is generated by a SCRIPT. DO NOT EDIT BY HAND! -// // GOOGLETEST_CM0001 DO NOT DELETE -#ifndef GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_ -#define GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_ - +#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_ +#define GOOGLETEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_ // Value-parameterized tests allow you to test your code with different // parameters without writing multiple copies of the same test. @@ -371,8 +368,6 @@ inline internal::ParamGenerator Bool() { // std::tuple where T1, T2, ..., TN are the types // of elements from sequences produces by gen1, gen2, ..., genN. // -// Combine can have up to 10 arguments. -// // Example: // // This will instantiate tests in test suite AnimalTest each one with @@ -428,7 +423,8 @@ internal::CartesianProductHolder Combine(const Generator&... g) { ->AddTestPattern( \ GTEST_STRINGIFY_(test_suite_name), GTEST_STRINGIFY_(test_name), \ new ::testing::internal::TestMetaFactory()); \ + test_suite_name, test_name)>(), \ + ::testing::internal::CodeLocation(__FILE__, __LINE__)); \ return 0; \ } \ static int gtest_registering_dummy_ GTEST_ATTRIBUTE_UNUSED_; \ @@ -508,4 +504,4 @@ internal::CartesianProductHolder Combine(const Generator&... g) { } // namespace testing -#endif // GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_ +#endif // GOOGLETEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_ diff --git a/third_party/googletest/src/include/gtest/gtest-printers.h b/third_party/googletest/src/include/gtest/gtest-printers.h index 407d1f1859..076c9de1f4 100644 --- a/third_party/googletest/src/include/gtest/gtest-printers.h +++ b/third_party/googletest/src/include/gtest/gtest-printers.h @@ -97,10 +97,11 @@ // GOOGLETEST_CM0001 DO NOT DELETE -#ifndef GTEST_INCLUDE_GTEST_GTEST_PRINTERS_H_ -#define GTEST_INCLUDE_GTEST_GTEST_PRINTERS_H_ +#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_PRINTERS_H_ +#define GOOGLETEST_INCLUDE_GTEST_GTEST_PRINTERS_H_ #include +#include #include // NOLINT #include #include @@ -108,64 +109,124 @@ #include #include #include + #include "gtest/internal/gtest-internal.h" #include "gtest/internal/gtest-port.h" -#if GTEST_HAS_ABSL -#include "absl/strings/string_view.h" -#include "absl/types/optional.h" -#include "absl/types/variant.h" -#endif // GTEST_HAS_ABSL - namespace testing { -// Definitions in the 'internal' and 'internal2' name spaces are -// subject to change without notice. DO NOT USE THEM IN USER CODE! -namespace internal2 { +// Definitions in the internal* namespaces are subject to change without notice. +// DO NOT USE THEM IN USER CODE! +namespace internal { -// Prints the given number of bytes in the given object to the given -// ostream. -GTEST_API_ void PrintBytesInObjectTo(const unsigned char* obj_bytes, - size_t count, - ::std::ostream* os); +template +void UniversalPrint(const T& value, ::std::ostream* os); -// For selecting which printer to use when a given type has neither << -// nor PrintTo(). -enum TypeKind { - kProtobuf, // a protobuf type - kConvertibleToInteger, // a type implicitly convertible to BiggestInt - // (e.g. a named or unnamed enum type) -#if GTEST_INTERNAL_HAS_STRING_VIEW - kConvertibleToStringView, // a type implicitly convertible to - // absl::string_view or std::string_view -#endif - kOtherType // anything else +// Used to print an STL-style container when the user doesn't define +// a PrintTo() for it. +struct ContainerPrinter { + template (0)) == sizeof(IsContainer)) && + !IsRecursiveContainer::value>::type> + static void PrintValue(const T& container, std::ostream* os) { + const size_t kMaxCount = 32; // The maximum number of elements to print. + *os << '{'; + size_t count = 0; + for (auto&& elem : container) { + if (count > 0) { + *os << ','; + if (count == kMaxCount) { // Enough has been printed. + *os << " ..."; + break; + } + } + *os << ' '; + // We cannot call PrintTo(elem, os) here as PrintTo() doesn't + // handle `elem` being a native array. + internal::UniversalPrint(elem, os); + ++count; + } + + if (count > 0) { + *os << ' '; + } + *os << '}'; + } }; -// TypeWithoutFormatter::PrintValue(value, os) is called -// by the universal printer to print a value of type T when neither -// operator<< nor PrintTo() is defined for T, where kTypeKind is the -// "kind" of T as defined by enum TypeKind. -template -class TypeWithoutFormatter { - public: - // This default version is called when kTypeKind is kOtherType. +// Used to print a pointer that is neither a char pointer nor a member +// pointer, when the user doesn't define PrintTo() for it. (A member +// variable pointer or member function pointer doesn't really point to +// a location in the address space. Their representation is +// implementation-defined. Therefore they will be printed as raw +// bytes.) +struct FunctionPointerPrinter { + template ::value>::type> + static void PrintValue(T* p, ::std::ostream* os) { + if (p == nullptr) { + *os << "NULL"; + } else { + // T is a function type, so '*os << p' doesn't do what we want + // (it just prints p as bool). We want to print p as a const + // void*. + *os << reinterpret_cast(p); + } + } +}; + +struct PointerPrinter { + template + static void PrintValue(T* p, ::std::ostream* os) { + if (p == nullptr) { + *os << "NULL"; + } else { + // T is not a function type. We just call << to print p, + // relying on ADL to pick up user-defined << for their pointer + // types, if any. + *os << p; + } + } +}; + +namespace internal_stream_operator_without_lexical_name_lookup { + +// The presence of an operator<< here will terminate lexical scope lookup +// straight away (even though it cannot be a match because of its argument +// types). Thus, the two operator<< calls in StreamPrinter will find only ADL +// candidates. +struct LookupBlocker {}; +void operator<<(LookupBlocker, LookupBlocker); + +struct StreamPrinter { + template ::value>::type, + // Only accept types for which we can find a streaming operator via + // ADL (possibly involving implicit conversions). + typename = decltype(std::declval() + << std::declval())> static void PrintValue(const T& value, ::std::ostream* os) { - PrintBytesInObjectTo( - static_cast( - reinterpret_cast(std::addressof(value))), - sizeof(value), os); + // Call streaming operator found by ADL, possibly with implicit conversions + // of the arguments. + *os << value; } }; -// We print a protobuf using its ShortDebugString() when the string -// doesn't exceed this many characters; otherwise we print it using -// DebugString() for better readability. -const size_t kProtobufOneLinerMaxLength = 50; +} // namespace internal_stream_operator_without_lexical_name_lookup -template -class TypeWithoutFormatter { - public: +struct ProtobufPrinter { + // We print a protobuf using its ShortDebugString() when the string + // doesn't exceed this many characters; otherwise we print it using + // DebugString() for better readability. + static const size_t kProtobufOneLinerMaxLength = 50; + + template ::value>::type> static void PrintValue(const T& value, ::std::ostream* os) { std::string pretty_str = value.ShortDebugString(); if (pretty_str.length() > kProtobufOneLinerMaxLength) { @@ -175,9 +236,7 @@ class TypeWithoutFormatter { } }; -template -class TypeWithoutFormatter { - public: +struct ConvertibleToIntegerPrinter { // Since T has no << operator or PrintTo() but can be implicitly // converted to BiggestInt, we print it as a BiggestInt. // @@ -185,111 +244,73 @@ class TypeWithoutFormatter { // case printing it as an integer is the desired behavior. In case // T is not an enum, printing it as an integer is the best we can do // given that it has no user-defined printer. - static void PrintValue(const T& value, ::std::ostream* os) { - const internal::BiggestInt kBigInt = value; - *os << kBigInt; + static void PrintValue(internal::BiggestInt value, ::std::ostream* os) { + *os << value; } }; +struct ConvertibleToStringViewPrinter { #if GTEST_INTERNAL_HAS_STRING_VIEW -template -class TypeWithoutFormatter { - public: - // Since T has neither operator<< nor PrintTo() but can be implicitly - // converted to absl::string_view, we print it as a absl::string_view - // (or std::string_view). - // - // Note: the implementation is further below, as it depends on - // internal::PrintTo symbol which is defined later in the file. - static void PrintValue(const T& value, ::std::ostream* os); -}; + static void PrintValue(internal::StringView value, ::std::ostream* os) { + internal::UniversalPrint(value, os); + } #endif +}; -// Prints the given value to the given ostream. If the value is a -// protocol message, its debug string is printed; if it's an enum or -// of a type implicitly convertible to BiggestInt, it's printed as an -// integer; otherwise the bytes in the value are printed. This is -// what UniversalPrinter::Print() does when it knows nothing about -// type T and T has neither << operator nor PrintTo(). -// -// A user can override this behavior for a class type Foo by defining -// a << operator in the namespace where Foo is defined. -// -// We put this operator in namespace 'internal2' instead of 'internal' -// to simplify the implementation, as much code in 'internal' needs to -// use << in STL, which would conflict with our own << were it defined -// in 'internal'. -// -// Note that this operator<< takes a generic std::basic_ostream type instead of the more restricted std::ostream. If -// we define it to take an std::ostream instead, we'll get an -// "ambiguous overloads" compiler error when trying to print a type -// Foo that supports streaming to std::basic_ostream, as the compiler cannot tell whether -// operator<<(std::ostream&, const T&) or -// operator<<(std::basic_stream, const Foo&) is more -// specific. -template -::std::basic_ostream& operator<<( - ::std::basic_ostream& os, const T& x) { - TypeWithoutFormatter::value - ? kProtobuf - : std::is_convertible< - const T&, internal::BiggestInt>::value - ? kConvertibleToInteger - : -#if GTEST_INTERNAL_HAS_STRING_VIEW - std::is_convertible< - const T&, internal::StringView>::value - ? kConvertibleToStringView - : -#endif - kOtherType)>::PrintValue(x, &os); - return os; -} -} // namespace internal2 -} // namespace testing +// Prints the given number of bytes in the given object to the given +// ostream. +GTEST_API_ void PrintBytesInObjectTo(const unsigned char* obj_bytes, + size_t count, + ::std::ostream* os); +struct RawBytesPrinter { + // SFINAE on `sizeof` to make sure we have a complete type. + template + static void PrintValue(const T& value, ::std::ostream* os) { + PrintBytesInObjectTo( + static_cast( + // Load bearing cast to void* to support iOS + reinterpret_cast(std::addressof(value))), + sizeof(value), os); + } +}; -// This namespace MUST NOT BE NESTED IN ::testing, or the name look-up -// magic needed for implementing UniversalPrinter won't work. -namespace testing_internal { +struct FallbackPrinter { + template + static void PrintValue(const T&, ::std::ostream* os) { + *os << "(incomplete type)"; + } +}; -// Used to print a value that is not an STL-style container when the -// user doesn't define PrintTo() for it. -template -void DefaultPrintNonContainerTo(const T& value, ::std::ostream* os) { - // With the following statement, during unqualified name lookup, - // testing::internal2::operator<< appears as if it was declared in - // the nearest enclosing namespace that contains both - // ::testing_internal and ::testing::internal2, i.e. the global - // namespace. For more details, refer to the C++ Standard section - // 7.3.4-1 [namespace.udir]. This allows us to fall back onto - // testing::internal2::operator<< in case T doesn't come with a << - // operator. - - using ::testing::internal2::operator<<; - - // Assuming T is defined in namespace foo, in the next statement, - // the compiler will consider all of: - // - // 1. foo::operator<< (thanks to Koenig look-up), - // 2. ::operator<< (as the current namespace is enclosed in ::), - // 3. testing::internal2::operator<< (thanks to the using statement above). - // - // The operator<< whose type matches T best will be picked. - // - // We deliberately allow #2 to be a candidate, as sometimes it's - // impossible to define #1 (e.g. when foo is ::std, defining - // anything in it is undefined behavior unless you are a compiler - // vendor.). - *os << value; -} +// Try every printer in order and return the first one that works. +template +struct FindFirstPrinter : FindFirstPrinter {}; -} // namespace testing_internal +template +struct FindFirstPrinter< + T, decltype(Printer::PrintValue(std::declval(), nullptr)), + Printer, Printers...> { + using type = Printer; +}; -namespace testing { -namespace internal { +// Select the best printer in the following order: +// - Print containers (they have begin/end/etc). +// - Print function pointers. +// - Print object pointers. +// - Use the stream operator, if available. +// - Print protocol buffers. +// - Print types convertible to BiggestInt. +// - Print types convertible to StringView, if available. +// - Fallback to printing the raw bytes of the object. +template +void PrintWithFallback(const T& value, ::std::ostream* os) { + using Printer = typename FindFirstPrinter< + T, void, ContainerPrinter, FunctionPointerPrinter, PointerPrinter, + internal_stream_operator_without_lexical_name_lookup::StreamPrinter, + ProtobufPrinter, ConvertibleToIntegerPrinter, + ConvertibleToStringViewPrinter, RawBytesPrinter, FallbackPrinter>::type; + Printer::PrintValue(value, os); +} // FormatForComparison::Format(value) formats a // value of type ToPrint that is an operand of a comparison assertion @@ -339,6 +360,14 @@ GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(char); GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const char); GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(wchar_t); GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const wchar_t); +#ifdef __cpp_char8_t +GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(char8_t); +GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const char8_t); +#endif +GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(char16_t); +GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const char16_t); +GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(char32_t); +GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const char32_t); #undef GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_ @@ -356,6 +385,14 @@ GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const wchar_t); GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(char, ::std::string); GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const char, ::std::string); +#ifdef __cpp_char8_t +GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(char8_t, ::std::u8string); +GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const char8_t, ::std::u8string); +#endif +GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(char16_t, ::std::u16string); +GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const char16_t, ::std::u16string); +GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(char32_t, ::std::u32string); +GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const char32_t, ::std::u32string); #if GTEST_HAS_STD_WSTRING GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(wchar_t, ::std::wstring); @@ -388,85 +425,6 @@ std::string FormatForComparisonFailureMessage( template class UniversalPrinter; -template -void UniversalPrint(const T& value, ::std::ostream* os); - -enum DefaultPrinterType { - kPrintContainer, - kPrintPointer, - kPrintFunctionPointer, - kPrintOther, -}; -template struct WrapPrinterType {}; - -// Used to print an STL-style container when the user doesn't define -// a PrintTo() for it. -template -void DefaultPrintTo(WrapPrinterType /* dummy */, - const C& container, ::std::ostream* os) { - const size_t kMaxCount = 32; // The maximum number of elements to print. - *os << '{'; - size_t count = 0; - for (typename C::const_iterator it = container.begin(); - it != container.end(); ++it, ++count) { - if (count > 0) { - *os << ','; - if (count == kMaxCount) { // Enough has been printed. - *os << " ..."; - break; - } - } - *os << ' '; - // We cannot call PrintTo(*it, os) here as PrintTo() doesn't - // handle *it being a native array. - internal::UniversalPrint(*it, os); - } - - if (count > 0) { - *os << ' '; - } - *os << '}'; -} - -// Used to print a pointer that is neither a char pointer nor a member -// pointer, when the user doesn't define PrintTo() for it. (A member -// variable pointer or member function pointer doesn't really point to -// a location in the address space. Their representation is -// implementation-defined. Therefore they will be printed as raw -// bytes.) -template -void DefaultPrintTo(WrapPrinterType /* dummy */, - T* p, ::std::ostream* os) { - if (p == nullptr) { - *os << "NULL"; - } else { - // T is not a function type. We just call << to print p, - // relying on ADL to pick up user-defined << for their pointer - // types, if any. - *os << p; - } -} -template -void DefaultPrintTo(WrapPrinterType /* dummy */, - T* p, ::std::ostream* os) { - if (p == nullptr) { - *os << "NULL"; - } else { - // T is a function type, so '*os << p' doesn't do what we want - // (it just prints p as bool). We want to print p as a const - // void*. - *os << reinterpret_cast(p); - } -} - -// Used to print a non-container, non-pointer value when the user -// doesn't define PrintTo() for it. -template -void DefaultPrintTo(WrapPrinterType /* dummy */, - const T& value, ::std::ostream* os) { - ::testing_internal::DefaultPrintNonContainerTo(value, os); -} - // Prints the given value using the << operator if it has one; // otherwise prints the bytes in it. This is what // UniversalPrinter::Print() does when PrintTo() is not specialized @@ -480,36 +438,7 @@ void DefaultPrintTo(WrapPrinterType /* dummy */, // wants). template void PrintTo(const T& value, ::std::ostream* os) { - // DefaultPrintTo() is overloaded. The type of its first argument - // determines which version will be picked. - // - // Note that we check for container types here, prior to we check - // for protocol message types in our operator<<. The rationale is: - // - // For protocol messages, we want to give people a chance to - // override Google Mock's format by defining a PrintTo() or - // operator<<. For STL containers, other formats can be - // incompatible with Google Mock's format for the container - // elements; therefore we check for container types here to ensure - // that our format is used. - // - // Note that MSVC and clang-cl do allow an implicit conversion from - // pointer-to-function to pointer-to-object, but clang-cl warns on it. - // So don't use ImplicitlyConvertible if it can be helped since it will - // cause this warning, and use a separate overload of DefaultPrintTo for - // function pointers so that the `*os << p` in the object pointer overload - // doesn't cause that warning either. - DefaultPrintTo( - WrapPrinterType < - (sizeof(IsContainerTest(0)) == sizeof(IsContainer)) && - !IsRecursiveContainer::value - ? kPrintContainer - : !std::is_pointer::value - ? kPrintOther - : std::is_function::type>::value - ? kPrintFunctionPointer - : kPrintPointer > (), - value, os); + internal::PrintWithFallback(value, os); } // The following list of PrintTo() overloads tells @@ -540,6 +469,16 @@ inline void PrintTo(bool x, ::std::ostream* os) { // is implemented as an unsigned type. GTEST_API_ void PrintTo(wchar_t wc, ::std::ostream* os); +GTEST_API_ void PrintTo(char32_t c, ::std::ostream* os); +inline void PrintTo(char16_t c, ::std::ostream* os) { + PrintTo(ImplicitCast_(c), os); +} +#ifdef __cpp_char8_t +inline void PrintTo(char8_t c, ::std::ostream* os) { + PrintTo(ImplicitCast_(c), os); +} +#endif + // Overloads for C strings. GTEST_API_ void PrintTo(const char* s, ::std::ostream* os); inline void PrintTo(char* s, ::std::ostream* os) { @@ -560,6 +499,23 @@ inline void PrintTo(const unsigned char* s, ::std::ostream* os) { inline void PrintTo(unsigned char* s, ::std::ostream* os) { PrintTo(ImplicitCast_(s), os); } +#ifdef __cpp_char8_t +// Overloads for u8 strings. +GTEST_API_ void PrintTo(const char8_t* s, ::std::ostream* os); +inline void PrintTo(char8_t* s, ::std::ostream* os) { + PrintTo(ImplicitCast_(s), os); +} +#endif +// Overloads for u16 strings. +GTEST_API_ void PrintTo(const char16_t* s, ::std::ostream* os); +inline void PrintTo(char16_t* s, ::std::ostream* os) { + PrintTo(ImplicitCast_(s), os); +} +// Overloads for u32 strings. +GTEST_API_ void PrintTo(const char32_t* s, ::std::ostream* os); +inline void PrintTo(char32_t* s, ::std::ostream* os) { + PrintTo(ImplicitCast_(s), os); +} // MSVC can be configured to define wchar_t as a typedef of unsigned // short. It defines _NATIVE_WCHAR_T_DEFINED when wchar_t is a native @@ -594,6 +550,26 @@ inline void PrintTo(const ::std::string& s, ::std::ostream* os) { PrintStringTo(s, os); } +// Overloads for ::std::u8string +#ifdef __cpp_char8_t +GTEST_API_ void PrintU8StringTo(const ::std::u8string& s, ::std::ostream* os); +inline void PrintTo(const ::std::u8string& s, ::std::ostream* os) { + PrintU8StringTo(s, os); +} +#endif + +// Overloads for ::std::u16string +GTEST_API_ void PrintU16StringTo(const ::std::u16string& s, ::std::ostream* os); +inline void PrintTo(const ::std::u16string& s, ::std::ostream* os) { + PrintU16StringTo(s, os); +} + +// Overloads for ::std::u32string +GTEST_API_ void PrintU32StringTo(const ::std::u32string& s, ::std::ostream* os); +inline void PrintTo(const ::std::u32string& s, ::std::ostream* os) { + PrintU32StringTo(s, os); +} + // Overloads for ::std::wstring. #if GTEST_HAS_STD_WSTRING GTEST_API_ void PrintWideStringTo(const ::std::wstring&s, ::std::ostream* os); @@ -616,6 +592,43 @@ void PrintTo(std::reference_wrapper ref, ::std::ostream* os) { UniversalPrinter::Print(ref.get(), os); } +inline const void* VoidifyPointer(const void* p) { return p; } +inline const void* VoidifyPointer(volatile const void* p) { + return const_cast(p); +} + +template +void PrintSmartPointer(const Ptr& ptr, std::ostream* os, char) { + if (ptr == nullptr) { + *os << "(nullptr)"; + } else { + // We can't print the value. Just print the pointer.. + *os << "(" << (VoidifyPointer)(ptr.get()) << ")"; + } +} +template ::value && + !std::is_array::value>::type> +void PrintSmartPointer(const Ptr& ptr, std::ostream* os, int) { + if (ptr == nullptr) { + *os << "(nullptr)"; + } else { + *os << "(ptr = " << (VoidifyPointer)(ptr.get()) << ", value = "; + UniversalPrinter::Print(*ptr, os); + *os << ")"; + } +} + +template +void PrintTo(const std::unique_ptr& ptr, std::ostream* os) { + (PrintSmartPointer)(ptr, os, 0); +} + +template +void PrintTo(const std::shared_ptr& ptr, std::ostream* os) { + (PrintSmartPointer)(ptr, os, 0); +} + // Helper function for printing a tuple. T must be instantiated with // a tuple type. template @@ -681,14 +694,46 @@ class UniversalPrinter { GTEST_DISABLE_MSC_WARNINGS_POP_() }; -#if GTEST_HAS_ABSL +// Remove any const-qualifiers before passing a type to UniversalPrinter. +template +class UniversalPrinter : public UniversalPrinter {}; + +#if GTEST_INTERNAL_HAS_ANY -// Printer for absl::optional +// Printer for std::any / absl::any + +template <> +class UniversalPrinter { + public: + static void Print(const Any& value, ::std::ostream* os) { + if (value.has_value()) { + *os << "value of type " << GetTypeName(value); + } else { + *os << "no value"; + } + } + + private: + static std::string GetTypeName(const Any& value) { +#if GTEST_HAS_RTTI + return internal::GetTypeName(value.type()); +#else + static_cast(value); // possibly unused + return ""; +#endif // GTEST_HAS_RTTI + } +}; + +#endif // GTEST_INTERNAL_HAS_ANY + +#if GTEST_INTERNAL_HAS_OPTIONAL + +// Printer for std::optional / absl::optional template -class UniversalPrinter<::absl::optional> { +class UniversalPrinter> { public: - static void Print(const ::absl::optional& value, ::std::ostream* os) { + static void Print(const Optional& value, ::std::ostream* os) { *os << '('; if (!value) { *os << "nullopt"; @@ -699,14 +744,22 @@ class UniversalPrinter<::absl::optional> { } }; -// Printer for absl::variant +#endif // GTEST_INTERNAL_HAS_OPTIONAL + +#if GTEST_INTERNAL_HAS_VARIANT + +// Printer for std::variant / absl::variant template -class UniversalPrinter<::absl::variant> { +class UniversalPrinter> { public: - static void Print(const ::absl::variant& value, ::std::ostream* os) { + static void Print(const Variant& value, ::std::ostream* os) { *os << '('; - absl::visit(Visitor{os}, value); +#if GTEST_HAS_ABSL + absl::visit(Visitor{os, value.index()}, value); +#else + std::visit(Visitor{os, value.index()}, value); +#endif // GTEST_HAS_ABSL *os << ')'; } @@ -714,14 +767,16 @@ class UniversalPrinter<::absl::variant> { struct Visitor { template void operator()(const U& u) const { - *os << "'" << GetTypeName() << "' with value "; + *os << "'" << GetTypeName() << "(index = " << index + << ")' with value "; UniversalPrint(u, os); } ::std::ostream* os; + std::size_t index; }; }; -#endif // GTEST_HAS_ABSL +#endif // GTEST_INTERNAL_HAS_VARIANT // UniversalPrintArray(begin, len, os) prints an array of 'len' // elements, starting at address 'begin'. @@ -750,6 +805,20 @@ void UniversalPrintArray(const T* begin, size_t len, ::std::ostream* os) { GTEST_API_ void UniversalPrintArray( const char* begin, size_t len, ::std::ostream* os); +#ifdef __cpp_char8_t +// This overload prints a (const) char8_t array compactly. +GTEST_API_ void UniversalPrintArray(const char8_t* begin, size_t len, + ::std::ostream* os); +#endif + +// This overload prints a (const) char16_t array compactly. +GTEST_API_ void UniversalPrintArray(const char16_t* begin, size_t len, + ::std::ostream* os); + +// This overload prints a (const) char32_t array compactly. +GTEST_API_ void UniversalPrintArray(const char32_t* begin, size_t len, + ::std::ostream* os); + // This overload prints a (const) wchar_t array compactly. GTEST_API_ void UniversalPrintArray( const wchar_t* begin, size_t len, ::std::ostream* os); @@ -822,12 +891,55 @@ class UniversalTersePrinter { } }; template <> -class UniversalTersePrinter { +class UniversalTersePrinter : public UniversalTersePrinter { +}; + +#ifdef __cpp_char8_t +template <> +class UniversalTersePrinter { public: - static void Print(char* str, ::std::ostream* os) { - UniversalTersePrinter::Print(str, os); + static void Print(const char8_t* str, ::std::ostream* os) { + if (str == nullptr) { + *os << "NULL"; + } else { + UniversalPrint(::std::u8string(str), os); + } } }; +template <> +class UniversalTersePrinter + : public UniversalTersePrinter {}; +#endif + +template <> +class UniversalTersePrinter { + public: + static void Print(const char16_t* str, ::std::ostream* os) { + if (str == nullptr) { + *os << "NULL"; + } else { + UniversalPrint(::std::u16string(str), os); + } + } +}; +template <> +class UniversalTersePrinter + : public UniversalTersePrinter {}; + +template <> +class UniversalTersePrinter { + public: + static void Print(const char32_t* str, ::std::ostream* os) { + if (str == nullptr) { + *os << "NULL"; + } else { + UniversalPrint(::std::u32string(str), os); + } + } +}; +template <> +class UniversalTersePrinter + : public UniversalTersePrinter {}; #if GTEST_HAS_STD_WSTRING template <> @@ -900,16 +1012,6 @@ Strings UniversalTersePrintTupleFieldsToStrings(const Tuple& value) { } // namespace internal -#if GTEST_INTERNAL_HAS_STRING_VIEW -namespace internal2 { -template -void TypeWithoutFormatter::PrintValue( - const T& value, ::std::ostream* os) { - internal::PrintTo(internal::StringView(value), os); -} -} // namespace internal2 -#endif - template ::std::string PrintToString(const T& value) { ::std::stringstream ss; @@ -924,4 +1026,4 @@ ::std::string PrintToString(const T& value) { // declarations from this file. #include "gtest/internal/custom/gtest-printers.h" -#endif // GTEST_INCLUDE_GTEST_GTEST_PRINTERS_H_ +#endif // GOOGLETEST_INCLUDE_GTEST_GTEST_PRINTERS_H_ diff --git a/third_party/googletest/src/include/gtest/gtest-spi.h b/third_party/googletest/src/include/gtest/gtest-spi.h index aa38870e8e..eacef44669 100644 --- a/third_party/googletest/src/include/gtest/gtest-spi.h +++ b/third_party/googletest/src/include/gtest/gtest-spi.h @@ -33,8 +33,8 @@ // GOOGLETEST_CM0004 DO NOT DELETE -#ifndef GTEST_INCLUDE_GTEST_GTEST_SPI_H_ -#define GTEST_INCLUDE_GTEST_GTEST_SPI_H_ +#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_SPI_H_ +#define GOOGLETEST_INCLUDE_GTEST_GTEST_SPI_H_ #include "gtest/gtest.h" @@ -235,4 +235,4 @@ GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 }\ } while (::testing::internal::AlwaysFalse()) -#endif // GTEST_INCLUDE_GTEST_GTEST_SPI_H_ +#endif // GOOGLETEST_INCLUDE_GTEST_GTEST_SPI_H_ diff --git a/third_party/googletest/src/include/gtest/gtest-test-part.h b/third_party/googletest/src/include/gtest/gtest-test-part.h index 05a7985358..203fdf98c6 100644 --- a/third_party/googletest/src/include/gtest/gtest-test-part.h +++ b/third_party/googletest/src/include/gtest/gtest-test-part.h @@ -29,8 +29,8 @@ // // GOOGLETEST_CM0001 DO NOT DELETE -#ifndef GTEST_INCLUDE_GTEST_GTEST_TEST_PART_H_ -#define GTEST_INCLUDE_GTEST_GTEST_TEST_PART_H_ +#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_TEST_PART_H_ +#define GOOGLETEST_INCLUDE_GTEST_GTEST_TEST_PART_H_ #include #include @@ -181,4 +181,4 @@ class GTEST_API_ HasNewFatalFailureHelper GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 -#endif // GTEST_INCLUDE_GTEST_GTEST_TEST_PART_H_ +#endif // GOOGLETEST_INCLUDE_GTEST_GTEST_TEST_PART_H_ diff --git a/third_party/googletest/src/include/gtest/gtest-typed-test.h b/third_party/googletest/src/include/gtest/gtest-typed-test.h index 3ffa50b739..9fdc6be10d 100644 --- a/third_party/googletest/src/include/gtest/gtest-typed-test.h +++ b/third_party/googletest/src/include/gtest/gtest-typed-test.h @@ -29,8 +29,8 @@ // GOOGLETEST_CM0001 DO NOT DELETE -#ifndef GTEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_ -#define GTEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_ +#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_ +#define GOOGLETEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_ // This header implements typed tests and type-parameterized tests. @@ -175,8 +175,6 @@ INSTANTIATE_TYPED_TEST_SUITE_P(My, FooTest, MyTypes); // Implements typed tests. -#if GTEST_HAS_TYPED_TEST - // INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. // // Expands to the name of the typedef for the type parameters of the @@ -230,12 +228,8 @@ INSTANTIATE_TYPED_TEST_SUITE_P(My, FooTest, MyTypes); TYPED_TEST_SUITE #endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ -#endif // GTEST_HAS_TYPED_TEST - // Implements type-parameterized tests. -#if GTEST_HAS_TYPED_TEST_P - // INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. // // Expands to the namespace name that the type-parameterized tests for @@ -332,6 +326,4 @@ INSTANTIATE_TYPED_TEST_SUITE_P(My, FooTest, MyTypes); INSTANTIATE_TYPED_TEST_SUITE_P #endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ -#endif // GTEST_HAS_TYPED_TEST_P - -#endif // GTEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_ +#endif // GOOGLETEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_ diff --git a/third_party/googletest/src/include/gtest/gtest.h b/third_party/googletest/src/include/gtest/gtest.h index 39cff08d65..7a5d057c4a 100644 --- a/third_party/googletest/src/include/gtest/gtest.h +++ b/third_party/googletest/src/include/gtest/gtest.h @@ -49,8 +49,8 @@ // GOOGLETEST_CM0001 DO NOT DELETE -#ifndef GTEST_INCLUDE_GTEST_GTEST_H_ -#define GTEST_INCLUDE_GTEST_GTEST_H_ +#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_H_ +#define GOOGLETEST_INCLUDE_GTEST_GTEST_H_ #include #include @@ -101,6 +101,10 @@ GTEST_DECLARE_bool_(catch_exceptions); // to let Google Test decide. GTEST_DECLARE_string_(color); +// This flag controls whether the test runner should continue execution past +// first failure. +GTEST_DECLARE_bool_(fail_fast); + // This flag sets up the filter to select by name using a glob pattern // the tests to run. If the filter is not given all tests are executed. GTEST_DECLARE_string_(filter); @@ -117,6 +121,9 @@ GTEST_DECLARE_bool_(list_tests); // in addition to its normal textual output. GTEST_DECLARE_string_(output); +// This flags control whether Google Test prints only test failures. +GTEST_DECLARE_bool_(brief); + // This flags control whether Google Test prints the elapsed time for each // test. GTEST_DECLARE_bool_(print_time); @@ -411,10 +418,10 @@ class GTEST_API_ Test { // The d'tor is virtual as we intend to inherit from Test. virtual ~Test(); - // Sets up the stuff shared by all tests in this test case. + // Sets up the stuff shared by all tests in this test suite. // // Google Test will call Foo::SetUpTestSuite() before running the first - // test in test case Foo. Hence a sub-class can define its own + // test in test suite Foo. Hence a sub-class can define its own // SetUpTestSuite() method to shadow the one defined in the super // class. static void SetUpTestSuite() {} @@ -422,12 +429,13 @@ class GTEST_API_ Test { // Tears down the stuff shared by all tests in this test suite. // // Google Test will call Foo::TearDownTestSuite() after running the last - // test in test case Foo. Hence a sub-class can define its own + // test in test suite Foo. Hence a sub-class can define its own // TearDownTestSuite() method to shadow the one defined in the super // class. static void TearDownTestSuite() {} - // Legacy API is deprecated but still available + // Legacy API is deprecated but still available. Use SetUpTestSuite and + // TearDownTestSuite instead. #ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ static void TearDownTestCase() {} static void SetUpTestCase() {} @@ -665,7 +673,7 @@ class GTEST_API_ TestResult { // Protects mutable state of the property vector and of owned // properties, whose values may be updated. - internal::Mutex test_properites_mutex_; + internal::Mutex test_properties_mutex_; // The vector of TestPartResults std::vector test_part_results_; @@ -795,6 +803,9 @@ class GTEST_API_ TestInfo { // deletes it. void Run(); + // Skip and records the test result for this object. + void Skip(); + static void ClearTestResult(TestInfo* test_info) { test_info->result_.Clear(); } @@ -943,6 +954,9 @@ class GTEST_API_ TestSuite { // Runs every test in this TestSuite. void Run(); + // Skips the execution of tests under this TestSuite + void Skip(); + // Runs SetUpTestSuite() for this TestSuite. This wrapper is needed // for catching exceptions thrown from SetUpTestSuite(). void RunSetUpTestSuite() { @@ -1535,14 +1549,6 @@ AssertionResult CmpHelperEQ(const char* lhs_expression, return CmpHelperEQFailure(lhs_expression, rhs_expression, lhs, rhs); } -// With this overloaded version, we allow anonymous enums to be used -// in {ASSERT|EXPECT}_EQ when compiled with gcc 4, as anonymous enums -// can be implicitly cast to BiggestInt. -GTEST_API_ AssertionResult CmpHelperEQ(const char* lhs_expression, - const char* rhs_expression, - BiggestInt lhs, - BiggestInt rhs); - class EqHelper { public: // This templatized version is for the general case. @@ -1599,11 +1605,6 @@ AssertionResult CmpHelperOpFailure(const char* expr1, const char* expr2, // ASSERT_?? and EXPECT_??. It is here just to avoid copy-and-paste // of similar code. // -// For each templatized helper function, we also define an overloaded -// version for BiggestInt in order to reduce code bloat and allow -// anonymous enums to be used with {ASSERT|EXPECT}_?? when compiled -// with gcc 4. -// // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. #define GTEST_IMPL_CMP_HELPER_(op_name, op)\ @@ -1615,22 +1616,20 @@ AssertionResult CmpHelper##op_name(const char* expr1, const char* expr2, \ } else {\ return CmpHelperOpFailure(expr1, expr2, val1, val2, #op);\ }\ -}\ -GTEST_API_ AssertionResult CmpHelper##op_name(\ - const char* expr1, const char* expr2, BiggestInt val1, BiggestInt val2) +} // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. // Implements the helper function for {ASSERT|EXPECT}_NE -GTEST_IMPL_CMP_HELPER_(NE, !=); +GTEST_IMPL_CMP_HELPER_(NE, !=) // Implements the helper function for {ASSERT|EXPECT}_LE -GTEST_IMPL_CMP_HELPER_(LE, <=); +GTEST_IMPL_CMP_HELPER_(LE, <=) // Implements the helper function for {ASSERT|EXPECT}_LT -GTEST_IMPL_CMP_HELPER_(LT, <); +GTEST_IMPL_CMP_HELPER_(LT, <) // Implements the helper function for {ASSERT|EXPECT}_GE -GTEST_IMPL_CMP_HELPER_(GE, >=); +GTEST_IMPL_CMP_HELPER_(GE, >=) // Implements the helper function for {ASSERT|EXPECT}_GT -GTEST_IMPL_CMP_HELPER_(GT, >); +GTEST_IMPL_CMP_HELPER_(GT, >) #undef GTEST_IMPL_CMP_HELPER_ @@ -1807,12 +1806,6 @@ class GTEST_API_ AssertHelper { GTEST_DISALLOW_COPY_AND_ASSIGN_(AssertHelper); }; -enum GTestColor { COLOR_DEFAULT, COLOR_RED, COLOR_GREEN, COLOR_YELLOW }; - -GTEST_API_ GTEST_ATTRIBUTE_PRINTF_(2, 3) void ColoredPrintf(GTestColor color, - const char* fmt, - ...); - } // namespace internal // The pure interface class that all value-parameterized tests inherit from. @@ -1969,19 +1962,38 @@ class TestWithParam : public Test, public WithParamInterface { // Boolean assertions. Condition can be either a Boolean expression or an // AssertionResult. For more information on how to use AssertionResult with // these macros see comments on that class. -#define EXPECT_TRUE(condition) \ +#define GTEST_EXPECT_TRUE(condition) \ GTEST_TEST_BOOLEAN_(condition, #condition, false, true, \ GTEST_NONFATAL_FAILURE_) -#define EXPECT_FALSE(condition) \ +#define GTEST_EXPECT_FALSE(condition) \ GTEST_TEST_BOOLEAN_(!(condition), #condition, true, false, \ GTEST_NONFATAL_FAILURE_) -#define ASSERT_TRUE(condition) \ +#define GTEST_ASSERT_TRUE(condition) \ GTEST_TEST_BOOLEAN_(condition, #condition, false, true, \ GTEST_FATAL_FAILURE_) -#define ASSERT_FALSE(condition) \ +#define GTEST_ASSERT_FALSE(condition) \ GTEST_TEST_BOOLEAN_(!(condition), #condition, true, false, \ GTEST_FATAL_FAILURE_) +// Define these macros to 1 to omit the definition of the corresponding +// EXPECT or ASSERT, which clashes with some users' own code. + +#if !GTEST_DONT_DEFINE_EXPECT_TRUE +#define EXPECT_TRUE(condition) GTEST_EXPECT_TRUE(condition) +#endif + +#if !GTEST_DONT_DEFINE_EXPECT_FALSE +#define EXPECT_FALSE(condition) GTEST_EXPECT_FALSE(condition) +#endif + +#if !GTEST_DONT_DEFINE_ASSERT_TRUE +#define ASSERT_TRUE(condition) GTEST_ASSERT_TRUE(condition) +#endif + +#if !GTEST_DONT_DEFINE_ASSERT_FALSE +#define ASSERT_FALSE(condition) GTEST_ASSERT_FALSE(condition) +#endif + // Macros for testing equalities and inequalities. // // * {ASSERT|EXPECT}_EQ(v1, v2): Tests that v1 == v2 @@ -2480,4 +2492,4 @@ inline int RUN_ALL_TESTS() { GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 -#endif // GTEST_INCLUDE_GTEST_GTEST_H_ +#endif // GOOGLETEST_INCLUDE_GTEST_GTEST_H_ diff --git a/third_party/googletest/src/include/gtest/gtest_pred_impl.h b/third_party/googletest/src/include/gtest/gtest_pred_impl.h index d514255c73..5029a9bb02 100644 --- a/third_party/googletest/src/include/gtest/gtest_pred_impl.h +++ b/third_party/googletest/src/include/gtest/gtest_pred_impl.h @@ -33,8 +33,8 @@ // Implements a family of generic predicate assertion macros. // GOOGLETEST_CM0001 DO NOT DELETE -#ifndef GTEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_ -#define GTEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_ +#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_ +#define GOOGLETEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_ #include "gtest/gtest.h" @@ -356,4 +356,4 @@ AssertionResult AssertPred5Helper(const char* pred_text, } // namespace testing -#endif // GTEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_ +#endif // GOOGLETEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_ diff --git a/third_party/googletest/src/include/gtest/gtest_prod.h b/third_party/googletest/src/include/gtest/gtest_prod.h index e651671ebd..38b9d85a51 100644 --- a/third_party/googletest/src/include/gtest/gtest_prod.h +++ b/third_party/googletest/src/include/gtest/gtest_prod.h @@ -31,8 +31,8 @@ // Google C++ Testing and Mocking Framework definitions useful in production code. // GOOGLETEST_CM0003 DO NOT DELETE -#ifndef GTEST_INCLUDE_GTEST_GTEST_PROD_H_ -#define GTEST_INCLUDE_GTEST_GTEST_PROD_H_ +#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_PROD_H_ +#define GOOGLETEST_INCLUDE_GTEST_GTEST_PROD_H_ // When you need to test the private or protected members of a class, // use the FRIEND_TEST macro to declare your tests as friends of the @@ -58,4 +58,4 @@ #define FRIEND_TEST(test_case_name, test_name)\ friend class test_case_name##_##test_name##_Test -#endif // GTEST_INCLUDE_GTEST_GTEST_PROD_H_ +#endif // GOOGLETEST_INCLUDE_GTEST_GTEST_PROD_H_ diff --git a/third_party/googletest/src/include/gtest/internal/custom/gtest-port.h b/third_party/googletest/src/include/gtest/internal/custom/gtest-port.h index cd85d956d2..db02881c0c 100644 --- a/third_party/googletest/src/include/gtest/internal/custom/gtest-port.h +++ b/third_party/googletest/src/include/gtest/internal/custom/gtest-port.h @@ -31,7 +31,7 @@ // // ** Custom implementation starts here ** -#ifndef GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PORT_H_ -#define GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PORT_H_ +#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PORT_H_ +#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PORT_H_ -#endif // GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PORT_H_ +#endif // GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PORT_H_ diff --git a/third_party/googletest/src/include/gtest/internal/custom/gtest-printers.h b/third_party/googletest/src/include/gtest/internal/custom/gtest-printers.h index eb4467abca..b9495d8378 100644 --- a/third_party/googletest/src/include/gtest/internal/custom/gtest-printers.h +++ b/third_party/googletest/src/include/gtest/internal/custom/gtest-printers.h @@ -36,7 +36,7 @@ // // ** Custom implementation starts here ** -#ifndef GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PRINTERS_H_ -#define GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PRINTERS_H_ +#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PRINTERS_H_ +#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PRINTERS_H_ -#endif // GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PRINTERS_H_ +#endif // GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PRINTERS_H_ diff --git a/third_party/googletest/src/include/gtest/internal/custom/gtest.h b/third_party/googletest/src/include/gtest/internal/custom/gtest.h index 4c8e07be23..afaaf17ba2 100644 --- a/third_party/googletest/src/include/gtest/internal/custom/gtest.h +++ b/third_party/googletest/src/include/gtest/internal/custom/gtest.h @@ -31,7 +31,7 @@ // // ** Custom implementation starts here ** -#ifndef GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_H_ -#define GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_H_ +#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_H_ +#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_H_ -#endif // GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_H_ +#endif // GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_H_ diff --git a/third_party/googletest/src/include/gtest/internal/gtest-death-test-internal.h b/third_party/googletest/src/include/gtest/internal/gtest-death-test-internal.h index 68bd353061..490296dfad 100644 --- a/third_party/googletest/src/include/gtest/internal/gtest-death-test-internal.h +++ b/third_party/googletest/src/include/gtest/internal/gtest-death-test-internal.h @@ -33,8 +33,8 @@ // death tests. They are subject to change without notice. // GOOGLETEST_CM0001 DO NOT DELETE -#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_ -#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_ +#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_ +#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_ #include "gtest/gtest-matchers.h" #include "gtest/internal/gtest-internal.h" @@ -301,4 +301,4 @@ InternalRunDeathTestFlag* ParseInternalRunDeathTestFlag(); } // namespace internal } // namespace testing -#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_ +#endif // GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_ diff --git a/third_party/googletest/src/include/gtest/internal/gtest-filepath.h b/third_party/googletest/src/include/gtest/internal/gtest-filepath.h index c11b101516..0c033abc34 100644 --- a/third_party/googletest/src/include/gtest/internal/gtest-filepath.h +++ b/third_party/googletest/src/include/gtest/internal/gtest-filepath.h @@ -37,8 +37,8 @@ // GOOGLETEST_CM0001 DO NOT DELETE -#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_ -#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_ +#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_ +#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_ #include "gtest/internal/gtest-string.h" @@ -195,7 +195,7 @@ class GTEST_API_ FilePath { void Normalize(); - // Returns a pointer to the last occurence of a valid path separator in + // Returns a pointer to the last occurrence of a valid path separator in // the FilePath. On Windows, for example, both '/' and '\' are valid path // separators. Returns NULL if no path separator was found. const char* FindLastPathSeparator() const; @@ -208,4 +208,4 @@ class GTEST_API_ FilePath { GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 -#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_ +#endif // GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_ diff --git a/third_party/googletest/src/include/gtest/internal/gtest-internal.h b/third_party/googletest/src/include/gtest/internal/gtest-internal.h index 6bad8780b5..f8cbdbd81d 100644 --- a/third_party/googletest/src/include/gtest/internal/gtest-internal.h +++ b/third_party/googletest/src/include/gtest/internal/gtest-internal.h @@ -34,8 +34,8 @@ // GOOGLETEST_CM0001 DO NOT DELETE -#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_ -#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_ +#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_ +#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_ #include "gtest/internal/gtest-port.h" @@ -90,7 +90,9 @@ #define GTEST_STRINGIFY_HELPER_(name, ...) #name #define GTEST_STRINGIFY_(...) GTEST_STRINGIFY_HELPER_(__VA_ARGS__, ) -namespace proto2 { class Message; } +namespace proto2 { +class MessageLite; +} namespace testing { @@ -285,7 +287,7 @@ class FloatingPoint { // // See the following article for more details on ULP: // http://randomascii.wordpress.com/2012/02/25/comparing-floating-point-numbers-2012-edition/ - static const size_t kMaxUlps = 4; + static const uint32_t kMaxUlps = 4; // Constructs a FloatingPoint from a raw floating-point number. // @@ -518,6 +520,7 @@ struct SuiteApiResolver : T { static SetUpTearDownSuiteFuncType GetSetUpCaseOrSuite(const char* filename, int line_num) { +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ SetUpTearDownSuiteFuncType test_case_fp = GetNotDefaultOrNull(&T::SetUpTestCase, &Test::SetUpTestCase); SetUpTearDownSuiteFuncType test_suite_fp = @@ -529,10 +532,16 @@ struct SuiteApiResolver : T { << filename << ":" << line_num; return test_case_fp != nullptr ? test_case_fp : test_suite_fp; +#else + (void)(filename); + (void)(line_num); + return &T::SetUpTestSuite; +#endif } static SetUpTearDownSuiteFuncType GetTearDownCaseOrSuite(const char* filename, int line_num) { +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ SetUpTearDownSuiteFuncType test_case_fp = GetNotDefaultOrNull(&T::TearDownTestCase, &Test::TearDownTestCase); SetUpTearDownSuiteFuncType test_suite_fp = @@ -544,6 +553,11 @@ struct SuiteApiResolver : T { << filename << ":" << line_num; return test_case_fp != nullptr ? test_case_fp : test_suite_fp; +#else + (void)(filename); + (void)(line_num); + return &T::TearDownTestSuite; +#endif } }; @@ -552,11 +566,11 @@ struct SuiteApiResolver : T { // // Arguments: // -// test_suite_name: name of the test suite +// test_suite_name: name of the test suite // name: name of the test -// type_param the name of the test's type parameter, or NULL if +// type_param: the name of the test's type parameter, or NULL if // this is not a typed or a type-parameterized test. -// value_param text representation of the test's value parameter, +// value_param: text representation of the test's value parameter, // or NULL if this is not a type-parameterized test. // code_location: code location where the test is defined // fixture_class_id: ID of the test fixture class @@ -576,8 +590,6 @@ GTEST_API_ TestInfo* MakeAndRegisterTestInfo( // and returns false. None of pstr, *pstr, and prefix can be NULL. GTEST_API_ bool SkipPrefix(const char* prefix, const char** pstr); -#if GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P - GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \ /* class A needs to have dll-interface to be used by clients of class B */) @@ -809,8 +821,6 @@ class TypeParameterizedTestSuite { } }; -#endif // GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P - // Returns the current OS stack trace as an std::string. // // The maximum number of stack frames to be included is specified by @@ -878,11 +888,34 @@ class GTEST_API_ Random { #define GTEST_REMOVE_REFERENCE_AND_CONST_(T) \ typename std::remove_const::type>::type -// IsAProtocolMessage::value is a compile-time bool constant that's -// true if and only if T is type proto2::Message or a subclass of it. +// HasDebugStringAndShortDebugString::value is a compile-time bool constant +// that's true if and only if T has methods DebugString() and ShortDebugString() +// that return std::string. template -struct IsAProtocolMessage - : public std::is_convertible {}; +class HasDebugStringAndShortDebugString { + private: + template + static auto CheckDebugString(C*) -> typename std::is_same< + std::string, decltype(std::declval().DebugString())>::type; + template + static std::false_type CheckDebugString(...); + + template + static auto CheckShortDebugString(C*) -> typename std::is_same< + std::string, decltype(std::declval().ShortDebugString())>::type; + template + static std::false_type CheckShortDebugString(...); + + using HasDebugStringType = decltype(CheckDebugString(nullptr)); + using HasShortDebugStringType = decltype(CheckShortDebugString(nullptr)); + + public: + static constexpr bool value = + HasDebugStringType::value && HasShortDebugStringType::value; +}; + +template +constexpr bool HasDebugStringAndShortDebugString::value; // When the compiler sees expression IsContainerTest(0), if C is an // STL-style container class, the first overload of IsContainerTest @@ -1118,8 +1151,6 @@ class NativeArray { const Element* array_; size_t size_; void (NativeArray::*clone_)(const Element*, size_t); - - GTEST_DISALLOW_ASSIGN_(NativeArray); }; // Backport of std::index_sequence. @@ -1143,12 +1174,18 @@ struct DoubleSequence, sizeofT> { // Backport of std::make_index_sequence. // It uses O(ln(N)) instantiation depth. template -struct MakeIndexSequence - : DoubleSequence::type, +struct MakeIndexSequenceImpl + : DoubleSequence::type, N / 2>::type {}; template <> -struct MakeIndexSequence<0> : IndexSequence<> {}; +struct MakeIndexSequenceImpl<0> : IndexSequence<> {}; + +template +using MakeIndexSequence = typename MakeIndexSequenceImpl::type; + +template +using IndexSequenceFor = typename MakeIndexSequence::type; template struct Ignore { @@ -1174,6 +1211,8 @@ struct ElemFromList { static_cast(nullptr)...)); }; +struct FlatTupleConstructTag {}; + template class FlatTuple; @@ -1184,7 +1223,9 @@ template struct FlatTupleElemBase, I> { using value_type = typename ElemFromList::type; FlatTupleElemBase() = default; - explicit FlatTupleElemBase(value_type t) : value(std::move(t)) {} + template + explicit FlatTupleElemBase(FlatTupleConstructTag, Arg&& t) + : value(std::forward(t)) {} value_type value; }; @@ -1196,8 +1237,30 @@ struct FlatTupleBase, IndexSequence> : FlatTupleElemBase, Idx>... { using Indices = IndexSequence; FlatTupleBase() = default; - explicit FlatTupleBase(T... t) - : FlatTupleElemBase, Idx>(std::move(t))... {} + template + explicit FlatTupleBase(FlatTupleConstructTag, Args&&... args) + : FlatTupleElemBase, Idx>(FlatTupleConstructTag{}, + std::forward(args))... {} + + template + const typename ElemFromList::type& Get() const { + return FlatTupleElemBase, I>::value; + } + + template + typename ElemFromList::type& Get() { + return FlatTupleElemBase, I>::value; + } + + template + auto Apply(F&& f) -> decltype(std::forward(f)(this->Get()...)) { + return std::forward(f)(Get()...); + } + + template + auto Apply(F&& f) const -> decltype(std::forward(f)(this->Get()...)) { + return std::forward(f)(Get()...); + } }; // Analog to std::tuple but with different tradeoffs. @@ -1218,17 +1281,12 @@ class FlatTuple public: FlatTuple() = default; - explicit FlatTuple(T... t) : FlatTuple::FlatTupleBase(std::move(t)...) {} + template + explicit FlatTuple(FlatTupleConstructTag tag, Args&&... args) + : FlatTuple::FlatTupleBase(tag, std::forward(args)...) {} - template - const typename ElemFromList::type& Get() const { - return static_cast*>(this)->value; - } - - template - typename ElemFromList::type& Get() { - return static_cast*>(this)->value; - } + using FlatTuple::FlatTupleBase::Apply; + using FlatTuple::FlatTupleBase::Get; }; // Utility functions to be called with static_assert to induce deprecation @@ -1261,6 +1319,22 @@ constexpr bool InstantiateTypedTestCase_P_IsDeprecated() { return true; } } // namespace internal } // namespace testing +namespace std { +// Some standard library implementations use `struct tuple_size` and some use +// `class tuple_size`. Clang warns about the mismatch. +// https://reviews.llvm.org/D55466 +#ifdef __clang__ +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wmismatched-tags" +#endif +template +struct tuple_size> + : std::integral_constant {}; +#ifdef __clang__ +#pragma clang diagnostic pop +#endif +} // namespace std + #define GTEST_MESSAGE_AT_(file, line, message, result_type) \ ::testing::internal::AssertHelper(result_type, file, line, message) \ = ::testing::Message() @@ -1283,44 +1357,98 @@ constexpr bool InstantiateTypedTestCase_P_IsDeprecated() { return true; } // Suppress MSVC warning 4072 (unreachable code) for the code following // statement if it returns or throws (or doesn't return or throw in some // situations). +// NOTE: The "else" is important to keep this expansion to prevent a top-level +// "else" from attaching to our "if". #define GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement) \ - if (::testing::internal::AlwaysTrue()) { statement; } + if (::testing::internal::AlwaysTrue()) { \ + statement; \ + } else /* NOLINT */ \ + static_assert(true, "") // User must have a semicolon after expansion. -#define GTEST_TEST_THROW_(statement, expected_exception, fail) \ - GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ - if (::testing::internal::ConstCharPtr gtest_msg = "") { \ - bool gtest_caught_expected = false; \ - try { \ - GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ - } \ - catch (expected_exception const&) { \ - gtest_caught_expected = true; \ - } \ - catch (...) { \ - gtest_msg.value = \ - "Expected: " #statement " throws an exception of type " \ - #expected_exception ".\n Actual: it throws a different type."; \ - goto GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__); \ - } \ - if (!gtest_caught_expected) { \ - gtest_msg.value = \ - "Expected: " #statement " throws an exception of type " \ - #expected_exception ".\n Actual: it throws nothing."; \ - goto GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__); \ - } \ - } else \ - GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__): \ - fail(gtest_msg.value) +#if GTEST_HAS_EXCEPTIONS + +namespace testing { +namespace internal { + +class NeverThrown { + public: + const char* what() const noexcept { + return "this exception should never be thrown"; + } +}; + +} // namespace internal +} // namespace testing + +#if GTEST_HAS_RTTI + +#define GTEST_EXCEPTION_TYPE_(e) ::testing::internal::GetTypeName(typeid(e)) + +#else // GTEST_HAS_RTTI + +#define GTEST_EXCEPTION_TYPE_(e) \ + std::string { "an std::exception-derived error" } + +#endif // GTEST_HAS_RTTI + +#define GTEST_TEST_THROW_CATCH_STD_EXCEPTION_(statement, expected_exception) \ + catch (typename std::conditional< \ + std::is_same::type>::type, \ + std::exception>::value, \ + const ::testing::internal::NeverThrown&, const std::exception&>::type \ + e) { \ + gtest_msg.value = "Expected: " #statement \ + " throws an exception of type " #expected_exception \ + ".\n Actual: it throws "; \ + gtest_msg.value += GTEST_EXCEPTION_TYPE_(e); \ + gtest_msg.value += " with description \""; \ + gtest_msg.value += e.what(); \ + gtest_msg.value += "\"."; \ + goto GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__); \ + } + +#else // GTEST_HAS_EXCEPTIONS + +#define GTEST_TEST_THROW_CATCH_STD_EXCEPTION_(statement, expected_exception) + +#endif // GTEST_HAS_EXCEPTIONS + +#define GTEST_TEST_THROW_(statement, expected_exception, fail) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (::testing::internal::TrueWithString gtest_msg{}) { \ + bool gtest_caught_expected = false; \ + try { \ + GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ + } catch (expected_exception const&) { \ + gtest_caught_expected = true; \ + } \ + GTEST_TEST_THROW_CATCH_STD_EXCEPTION_(statement, expected_exception) \ + catch (...) { \ + gtest_msg.value = "Expected: " #statement \ + " throws an exception of type " #expected_exception \ + ".\n Actual: it throws a different type."; \ + goto GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__); \ + } \ + if (!gtest_caught_expected) { \ + gtest_msg.value = "Expected: " #statement \ + " throws an exception of type " #expected_exception \ + ".\n Actual: it throws nothing."; \ + goto GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__); \ + } \ + } else /*NOLINT*/ \ + GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__) \ + : fail(gtest_msg.value.c_str()) #if GTEST_HAS_EXCEPTIONS -#define GTEST_TEST_NO_THROW_CATCH_STD_EXCEPTION_() \ - catch (std::exception const& e) { \ - gtest_msg.value = ( \ - "it throws std::exception-derived exception with description: \"" \ - ); \ - gtest_msg.value += e.what(); \ - gtest_msg.value += "\"."; \ +#define GTEST_TEST_NO_THROW_CATCH_STD_EXCEPTION_() \ + catch (std::exception const& e) { \ + gtest_msg.value = "it throws "; \ + gtest_msg.value += GTEST_EXCEPTION_TYPE_(e); \ + gtest_msg.value += " with description \""; \ + gtest_msg.value += e.what(); \ + gtest_msg.value += "\"."; \ goto GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__); \ } @@ -1367,7 +1495,7 @@ constexpr bool InstantiateTypedTestCase_P_IsDeprecated() { return true; } // Implements Boolean test assertions such as EXPECT_TRUE. expression can be // either a boolean expression or an AssertionResult. text is a textual -// represenation of expression as it was passed into the EXPECT_TRUE. +// representation of expression as it was passed into the EXPECT_TRUE. #define GTEST_TEST_BOOLEAN_(expression, text, actual, expected, fail) \ GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ if (const ::testing::AssertionResult gtest_ar_ = \ @@ -1404,7 +1532,7 @@ constexpr bool InstantiateTypedTestCase_P_IsDeprecated() { return true; } class GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) \ : public parent_class { \ public: \ - GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)() {} \ + GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)() = default; \ ~GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)() override = default; \ GTEST_DISALLOW_COPY_AND_ASSIGN_(GTEST_TEST_CLASS_NAME_(test_suite_name, \ test_name)); \ @@ -1429,4 +1557,4 @@ constexpr bool InstantiateTypedTestCase_P_IsDeprecated() { return true; } test_suite_name, test_name)>); \ void GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)::TestBody() -#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_ +#endif // GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_ diff --git a/third_party/googletest/src/include/gtest/internal/gtest-param-util.h b/third_party/googletest/src/include/gtest/internal/gtest-param-util.h index 7f7a13bf84..c2ef6e3124 100644 --- a/third_party/googletest/src/include/gtest/internal/gtest-param-util.h +++ b/third_party/googletest/src/include/gtest/internal/gtest-param-util.h @@ -32,8 +32,8 @@ // GOOGLETEST_CM0001 DO NOT DELETE -#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_ -#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_ +#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_ +#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_ #include @@ -459,7 +459,7 @@ class ParameterizedTestSuiteInfoBase { // Base part of test suite name for display purposes. virtual const std::string& GetTestSuiteName() const = 0; - // Test case id to verify identity. + // Test suite id to verify identity. virtual TypeId GetTestSuiteTypeId() const = 0; // UnitTest class invokes this method to register tests in this // test suite right before running them in RUN_ALL_TESTS macro. @@ -478,7 +478,7 @@ class ParameterizedTestSuiteInfoBase { // // Report a the name of a test_suit as safe to ignore // as the side effect of construction of this type. -struct MarkAsIgnored { +struct GTEST_API_ MarkAsIgnored { explicit MarkAsIgnored(const char* test_suite); }; @@ -507,11 +507,11 @@ class ParameterizedTestSuiteInfo : public ParameterizedTestSuiteInfoBase { CodeLocation code_location) : test_suite_name_(name), code_location_(code_location) {} - // Test case base name for display purposes. + // Test suite base name for display purposes. const std::string& GetTestSuiteName() const override { return test_suite_name_; } - // Test case id to verify identity. + // Test suite id to verify identity. TypeId GetTestSuiteTypeId() const override { return GetTypeId(); } // TEST_P macro uses AddTestPattern() to record information // about a single test in a LocalTestInfo structure. @@ -520,9 +520,10 @@ class ParameterizedTestSuiteInfo : public ParameterizedTestSuiteInfoBase { // parameter index. For the test SequenceA/FooTest.DoBar/1 FooTest is // test suite base name and DoBar is test base name. void AddTestPattern(const char* test_suite_name, const char* test_base_name, - TestMetaFactoryBase* meta_factory) { - tests_.push_back(std::shared_ptr( - new TestInfo(test_suite_name, test_base_name, meta_factory))); + TestMetaFactoryBase* meta_factory, + CodeLocation code_location) { + tests_.push_back(std::shared_ptr(new TestInfo( + test_suite_name, test_base_name, meta_factory, code_location))); } // INSTANTIATE_TEST_SUITE_P macro uses AddGenerator() to record information // about a generator. @@ -589,7 +590,7 @@ class ParameterizedTestSuiteInfo : public ParameterizedTestSuiteInfoBase { MakeAndRegisterTestInfo( test_suite_name.c_str(), test_name_stream.GetString().c_str(), nullptr, // No type parameter. - PrintToString(*param_it).c_str(), code_location_, + PrintToString(*param_it).c_str(), test_info->code_location, GetTestSuiteTypeId(), SuiteApiResolver::GetSetUpCaseOrSuite(file, line), SuiteApiResolver::GetTearDownCaseOrSuite(file, line), @@ -610,14 +611,17 @@ class ParameterizedTestSuiteInfo : public ParameterizedTestSuiteInfoBase { // with TEST_P macro. struct TestInfo { TestInfo(const char* a_test_suite_base_name, const char* a_test_base_name, - TestMetaFactoryBase* a_test_meta_factory) + TestMetaFactoryBase* a_test_meta_factory, + CodeLocation a_code_location) : test_suite_base_name(a_test_suite_base_name), test_base_name(a_test_base_name), - test_meta_factory(a_test_meta_factory) {} + test_meta_factory(a_test_meta_factory), + code_location(a_code_location) {} const std::string test_suite_base_name; const std::string test_base_name; const std::unique_ptr > test_meta_factory; + const CodeLocation code_location; }; using TestInfoContainer = ::std::vector >; // Records data received from INSTANTIATE_TEST_SUITE_P macros: @@ -650,7 +654,7 @@ class ParameterizedTestSuiteInfo : public ParameterizedTestSuiteInfoBase { // Check for invalid characters for (std::string::size_type index = 0; index < name.size(); ++index) { - if (!isalnum(name[index]) && name[index] != '_') + if (!IsAlNum(name[index]) && name[index] != '_') return false; } @@ -779,10 +783,15 @@ internal::ParamGenerator ValuesIn( namespace internal { // Used in the Values() function to provide polymorphic capabilities. +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 4100) +#endif + template class ValueArray { public: - ValueArray(Ts... v) : v_{std::move(v)...} {} + explicit ValueArray(Ts... v) : v_(FlatTupleConstructTag{}, std::move(v)...) {} template operator ParamGenerator() const { // NOLINT @@ -798,6 +807,10 @@ class ValueArray { FlatTuple v_; }; +#ifdef _MSC_VER +#pragma warning(pop) +#endif + template class CartesianProductGenerator : public ParamGeneratorInterface<::std::tuple> { @@ -931,4 +944,4 @@ class CartesianProductHolder { } // namespace internal } // namespace testing -#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_ +#endif // GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_ diff --git a/third_party/googletest/src/include/gtest/internal/gtest-port-arch.h b/third_party/googletest/src/include/gtest/internal/gtest-port-arch.h index d3239b25ba..dd845915e3 100644 --- a/third_party/googletest/src/include/gtest/internal/gtest-port-arch.h +++ b/third_party/googletest/src/include/gtest/internal/gtest-port-arch.h @@ -32,8 +32,8 @@ // This header file defines the GTEST_OS_* macro. // It is separate from gtest-port.h so that custom/gtest-port.h can include it. -#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_ARCH_H_ -#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_ARCH_H_ +#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_ARCH_H_ +#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_ARCH_H_ // Determines the platform on which Google Test is compiled. #ifdef __CYGWIN__ @@ -68,6 +68,7 @@ # define GTEST_OS_OS2 1 #elif defined __APPLE__ # define GTEST_OS_MAC 1 +# include # if TARGET_OS_IPHONE # define GTEST_OS_IOS 1 # endif @@ -106,6 +107,8 @@ #define GTEST_OS_ESP8266 1 #elif defined ESP32 #define GTEST_OS_ESP32 1 +#elif defined(__XTENSA__) +#define GTEST_OS_XTENSA 1 #endif // __CYGWIN__ -#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_ARCH_H_ +#endif // GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_ARCH_H_ diff --git a/third_party/googletest/src/include/gtest/internal/gtest-port.h b/third_party/googletest/src/include/gtest/internal/gtest-port.h index 60ff47164f..0953a781c0 100644 --- a/third_party/googletest/src/include/gtest/internal/gtest-port.h +++ b/third_party/googletest/src/include/gtest/internal/gtest-port.h @@ -40,8 +40,8 @@ // GOOGLETEST_CM0001 DO NOT DELETE -#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_ -#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_ +#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_ +#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_ // Environment-describing macros // ----------------------------- @@ -199,9 +199,18 @@ // suppressed (constant conditional). // GTEST_INTENTIONAL_CONST_COND_POP_ - finish code section where MSVC C4127 // is suppressed. +// GTEST_INTERNAL_HAS_ANY - for enabling UniversalPrinter or +// UniversalPrinter specializations. +// GTEST_INTERNAL_HAS_OPTIONAL - for enabling UniversalPrinter +// or +// UniversalPrinter +// specializations. // GTEST_INTERNAL_HAS_STRING_VIEW - for enabling Matcher or // Matcher // specializations. +// GTEST_INTERNAL_HAS_VARIANT - for enabling UniversalPrinter or +// UniversalPrinter +// specializations. // // Synchronization: // Mutex, MutexLock, ThreadLocal, GetThreadCount() @@ -252,6 +261,8 @@ #include #include #include + +#include #include #include #include @@ -267,6 +278,7 @@ #endif #include // NOLINT +#include #include #include // NOLINT #include @@ -347,6 +359,10 @@ typedef struct _CRITICAL_SECTION GTEST_CRITICAL_SECTION; // WindowsTypesTest.CRITICAL_SECTIONIs_RTL_CRITICAL_SECTION. typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION; #endif +#elif GTEST_OS_XTENSA +#include +// Xtensa toolchains define strcasecmp in the string.h header instead of +// strings.h. string.h is already included. #else // This assumes that non-Windows OSes provide unistd.h. For OSes where this // is not the case, we need to include headers that provide the functions @@ -367,7 +383,7 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION; // On Android, is only available starting with Gingerbread. # define GTEST_HAS_POSIX_RE (__ANDROID_API__ >= 9) # else -# define GTEST_HAS_POSIX_RE (!GTEST_OS_WINDOWS) +#define GTEST_HAS_POSIX_RE (!GTEST_OS_WINDOWS && !GTEST_OS_XTENSA) # endif #endif @@ -452,7 +468,7 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION; // no support for it at least as recent as Froyo (2.2). #define GTEST_HAS_STD_WSTRING \ (!(GTEST_OS_LINUX_ANDROID || GTEST_OS_CYGWIN || GTEST_OS_SOLARIS || \ - GTEST_OS_HAIKU || GTEST_OS_ESP32 || GTEST_OS_ESP8266)) + GTEST_OS_HAIKU || GTEST_OS_ESP32 || GTEST_OS_ESP8266 || GTEST_OS_XTENSA)) #endif // GTEST_HAS_STD_WSTRING @@ -577,7 +593,7 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION; // By default, we assume that stream redirection is supported on all // platforms except known mobile ones. #if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_PHONE || \ - GTEST_OS_WINDOWS_RT || GTEST_OS_ESP8266 + GTEST_OS_WINDOWS_RT || GTEST_OS_ESP8266 || GTEST_OS_XTENSA # define GTEST_HAS_STREAM_REDIRECTION 0 # else # define GTEST_HAS_STREAM_REDIRECTION 1 @@ -679,8 +695,8 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION; // A macro to disallow copy constructor and operator= // This should be used in the private: declarations for a class. #define GTEST_DISALLOW_COPY_AND_ASSIGN_(type) \ - type(type const &) = delete; \ - GTEST_DISALLOW_ASSIGN_(type) + type(type const&) = delete; \ + type& operator=(type const&) = delete // A macro to disallow move operator= // This should be used in the private: declarations for a class. @@ -690,8 +706,8 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION; // A macro to disallow move constructor and operator= // This should be used in the private: declarations for a class. #define GTEST_DISALLOW_MOVE_AND_ASSIGN_(type) \ - type(type &&) noexcept = delete; \ - GTEST_DISALLOW_MOVE_ASSIGN_(type) + type(type&&) noexcept = delete; \ + type& operator=(type&&) noexcept = delete // Tell the compiler to warn about unused return values for functions declared // with this macro. The macro should be used on function declarations @@ -918,8 +934,6 @@ class GTEST_API_ RE { const char* full_pattern_; // For FullMatch(); # endif - - GTEST_DISALLOW_ASSIGN_(RE); }; #endif // GTEST_USES_PCRE @@ -1926,6 +1940,19 @@ inline bool IsUpper(char ch) { inline bool IsXDigit(char ch) { return isxdigit(static_cast(ch)) != 0; } +#ifdef __cpp_char8_t +inline bool IsXDigit(char8_t ch) { + return isxdigit(static_cast(ch)) != 0; +} +#endif +inline bool IsXDigit(char16_t ch) { + const unsigned char low_byte = static_cast(ch); + return ch == low_byte && isxdigit(low_byte) != 0; +} +inline bool IsXDigit(char32_t ch) { + const unsigned char low_byte = static_cast(ch); + return ch == low_byte && isxdigit(low_byte) != 0; +} inline bool IsXDigit(wchar_t ch) { const unsigned char low_byte = static_cast(ch); return ch == low_byte && isxdigit(low_byte) != 0; @@ -1960,16 +1987,16 @@ namespace posix { typedef struct _stat StatStruct; # ifdef __BORLANDC__ -inline int IsATTY(int fd) { return isatty(fd); } +inline int DoIsATTY(int fd) { return isatty(fd); } inline int StrCaseCmp(const char* s1, const char* s2) { return stricmp(s1, s2); } inline char* StrDup(const char* src) { return strdup(src); } # else // !__BORLANDC__ # if GTEST_OS_WINDOWS_MOBILE -inline int IsATTY(int /* fd */) { return 0; } +inline int DoIsATTY(int /* fd */) { return 0; } # else -inline int IsATTY(int fd) { return _isatty(fd); } +inline int DoIsATTY(int fd) { return _isatty(fd); } # endif // GTEST_OS_WINDOWS_MOBILE inline int StrCaseCmp(const char* s1, const char* s2) { return _stricmp(s1, s2); @@ -1994,7 +2021,7 @@ inline bool IsDir(const StatStruct& st) { typedef struct stat StatStruct; inline int FileNo(FILE* file) { return fileno(file); } -inline int IsATTY(int fd) { return isatty(fd); } +inline int DoIsATTY(int fd) { return isatty(fd); } inline int Stat(const char* path, StatStruct* buf) { // stat function not implemented on ESP8266 return 0; @@ -2011,7 +2038,7 @@ inline bool IsDir(const StatStruct& st) { return S_ISDIR(st.st_mode); } typedef struct stat StatStruct; inline int FileNo(FILE* file) { return fileno(file); } -inline int IsATTY(int fd) { return isatty(fd); } +inline int DoIsATTY(int fd) { return isatty(fd); } inline int Stat(const char* path, StatStruct* buf) { return stat(path, buf); } inline int StrCaseCmp(const char* s1, const char* s2) { return strcasecmp(s1, s2); @@ -2022,6 +2049,17 @@ inline bool IsDir(const StatStruct& st) { return S_ISDIR(st.st_mode); } #endif // GTEST_OS_WINDOWS +inline int IsATTY(int fd) { + // DoIsATTY might change errno (for example ENOTTY in case you redirect stdout + // to a file on Linux), which is unexpected, so save the previous value, and + // restore it after the call. + int savedErrno = errno; + int isAttyValue = DoIsATTY(fd); + errno = savedErrno; + + return isAttyValue; +} + // Functions deprecated by MSVC 8.0. GTEST_DISABLE_MSC_DEPRECATED_PUSH_() @@ -2030,11 +2068,20 @@ GTEST_DISABLE_MSC_DEPRECATED_PUSH_() // StrError() aren't needed on Windows CE at this time and thus not // defined there. -#if !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT +#if !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_WINDOWS_PHONE && \ + !GTEST_OS_WINDOWS_RT && !GTEST_OS_ESP8266 && !GTEST_OS_XTENSA inline int ChDir(const char* dir) { return chdir(dir); } #endif inline FILE* FOpen(const char* path, const char* mode) { +#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MINGW + struct wchar_codecvt : public std::codecvt {}; + std::wstring_convert converter; + std::wstring wide_path = converter.from_bytes(path); + std::wstring wide_mode = converter.from_bytes(mode); + return _wfopen(wide_path.c_str(), wide_mode.c_str()); +#else // GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MINGW return fopen(path, mode); +#endif // GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MINGW } #if !GTEST_OS_WINDOWS_MOBILE inline FILE *FReopen(const char* path, const char* mode, FILE* stream) { @@ -2055,7 +2102,7 @@ inline const char* StrError(int errnum) { return strerror(errnum); } #endif inline const char* GetEnv(const char* name) { #if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_PHONE || \ - GTEST_OS_WINDOWS_RT || GTEST_OS_ESP8266 + GTEST_OS_WINDOWS_RT || GTEST_OS_ESP8266 || GTEST_OS_XTENSA // We are on an embedded platform, which has no environment variables. static_cast(name); // To prevent 'unused argument' warning. return nullptr; @@ -2191,7 +2238,8 @@ using TimeInMillis = int64_t; // Represents time in milliseconds. // Parses 'str' for a 32-bit signed integer. If successful, writes the result // to *value and returns true; otherwise leaves *value unchanged and returns // false. -bool ParseInt32(const Message& src_text, const char* str, int32_t* value); +GTEST_API_ bool ParseInt32(const Message& src_text, const char* str, + int32_t* value); // Parses a bool/int32_t/string from the environment variable // corresponding to the given Google Test flag. @@ -2223,6 +2271,64 @@ const char* StringFromGTestEnv(const char* flag, const char* default_val); #endif // !defined(GTEST_INTERNAL_DEPRECATED) +#if GTEST_HAS_ABSL +// Always use absl::any for UniversalPrinter<> specializations if googletest +// is built with absl support. +#define GTEST_INTERNAL_HAS_ANY 1 +#include "absl/types/any.h" +namespace testing { +namespace internal { +using Any = ::absl::any; +} // namespace internal +} // namespace testing +#else +#ifdef __has_include +#if __has_include() && __cplusplus >= 201703L +// Otherwise for C++17 and higher use std::any for UniversalPrinter<> +// specializations. +#define GTEST_INTERNAL_HAS_ANY 1 +#include +namespace testing { +namespace internal { +using Any = ::std::any; +} // namespace internal +} // namespace testing +// The case where absl is configured NOT to alias std::any is not +// supported. +#endif // __has_include() && __cplusplus >= 201703L +#endif // __has_include +#endif // GTEST_HAS_ABSL + +#if GTEST_HAS_ABSL +// Always use absl::optional for UniversalPrinter<> specializations if +// googletest is built with absl support. +#define GTEST_INTERNAL_HAS_OPTIONAL 1 +#include "absl/types/optional.h" +namespace testing { +namespace internal { +template +using Optional = ::absl::optional; +} // namespace internal +} // namespace testing +#else +#ifdef __has_include +#if __has_include() && __cplusplus >= 201703L +// Otherwise for C++17 and higher use std::optional for UniversalPrinter<> +// specializations. +#define GTEST_INTERNAL_HAS_OPTIONAL 1 +#include +namespace testing { +namespace internal { +template +using Optional = ::std::optional; +} // namespace internal +} // namespace testing +// The case where absl is configured NOT to alias std::optional is not +// supported. +#endif // __has_include() && __cplusplus >= 201703L +#endif // __has_include +#endif // GTEST_HAS_ABSL + #if GTEST_HAS_ABSL // Always use absl::string_view for Matcher<> specializations if googletest // is built with absl support. @@ -2251,4 +2357,33 @@ using StringView = ::std::string_view; # endif // __has_include #endif // GTEST_HAS_ABSL -#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_ +#if GTEST_HAS_ABSL +// Always use absl::variant for UniversalPrinter<> specializations if googletest +// is built with absl support. +#define GTEST_INTERNAL_HAS_VARIANT 1 +#include "absl/types/variant.h" +namespace testing { +namespace internal { +template +using Variant = ::absl::variant; +} // namespace internal +} // namespace testing +#else +#ifdef __has_include +#if __has_include() && __cplusplus >= 201703L +// Otherwise for C++17 and higher use std::variant for UniversalPrinter<> +// specializations. +#define GTEST_INTERNAL_HAS_VARIANT 1 +#include +namespace testing { +namespace internal { +template +using Variant = ::std::variant; +} // namespace internal +} // namespace testing +// The case where absl is configured NOT to alias std::variant is not supported. +#endif // __has_include() && __cplusplus >= 201703L +#endif // __has_include +#endif // GTEST_HAS_ABSL + +#endif // GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_ diff --git a/third_party/googletest/src/include/gtest/internal/gtest-string.h b/third_party/googletest/src/include/gtest/internal/gtest-string.h index 0b2a91a5dc..10f774f966 100644 --- a/third_party/googletest/src/include/gtest/internal/gtest-string.h +++ b/third_party/googletest/src/include/gtest/internal/gtest-string.h @@ -38,8 +38,8 @@ // GOOGLETEST_CM0001 DO NOT DELETE -#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_ -#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_ +#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_ +#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_ #ifdef __BORLANDC__ // string.h is not guaranteed to provide strcpy on C++ Builder. @@ -149,6 +149,9 @@ class GTEST_API_ String { // Formats an int value as "%02d". static std::string FormatIntWidth2(int value); // "%02d" for width == 2 + // Formats an int value to given width with leading zeros. + static std::string FormatIntWidthN(int value, int width); + // Formats an int value as "%X". static std::string FormatHexInt(int value); @@ -169,4 +172,4 @@ GTEST_API_ std::string StringStreamToString(::std::stringstream* stream); } // namespace internal } // namespace testing -#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_ +#endif // GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_ diff --git a/third_party/googletest/src/include/gtest/internal/gtest-type-util.h b/third_party/googletest/src/include/gtest/internal/gtest-type-util.h index 082fdad12c..b87a2e2cac 100644 --- a/third_party/googletest/src/include/gtest/internal/gtest-type-util.h +++ b/third_party/googletest/src/include/gtest/internal/gtest-type-util.h @@ -32,8 +32,8 @@ // GOOGLETEST_CM0001 DO NOT DELETE -#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_ -#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_ +#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_ +#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_ #include "gtest/internal/gtest-port.h" @@ -64,38 +64,40 @@ inline std::string CanonicalizeForStdLibVersioning(std::string s) { return s; } -// GetTypeName() returns a human-readable name of type T. -// NB: This function is also used in Google Mock, so don't move it inside of -// the typed-test-only section below. -template -std::string GetTypeName() { -# if GTEST_HAS_RTTI - - const char* const name = typeid(T).name(); -# if GTEST_HAS_CXXABI_H_ || defined(__HP_aCC) +#if GTEST_HAS_RTTI +// GetTypeName(const std::type_info&) returns a human-readable name of type T. +inline std::string GetTypeName(const std::type_info& type) { + const char* const name = type.name(); +#if GTEST_HAS_CXXABI_H_ || defined(__HP_aCC) int status = 0; // gcc's implementation of typeid(T).name() mangles the type name, // so we have to demangle it. -# if GTEST_HAS_CXXABI_H_ +#if GTEST_HAS_CXXABI_H_ using abi::__cxa_demangle; -# endif // GTEST_HAS_CXXABI_H_ +#endif // GTEST_HAS_CXXABI_H_ char* const readable_name = __cxa_demangle(name, nullptr, nullptr, &status); const std::string name_str(status == 0 ? readable_name : name); free(readable_name); return CanonicalizeForStdLibVersioning(name_str); -# else +#else return name; -# endif // GTEST_HAS_CXXABI_H_ || __HP_aCC - -# else +#endif // GTEST_HAS_CXXABI_H_ || __HP_aCC +} +#endif // GTEST_HAS_RTTI +// GetTypeName() returns a human-readable name of type T if and only if +// RTTI is enabled, otherwise it returns a dummy type name. +// NB: This function is also used in Google Mock, so don't move it inside of +// the typed-test-only section below. +template +std::string GetTypeName() { +#if GTEST_HAS_RTTI + return GetTypeName(typeid(T)); +#else return ""; - -# endif // GTEST_HAS_RTTI +#endif // GTEST_HAS_RTTI } -#if GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P - // A unique type indicating an empty node struct None {}; @@ -171,8 +173,6 @@ struct GenerateTypeList { using type = typename proxy::type; }; -#endif // GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P - } // namespace internal template @@ -180,4 +180,4 @@ using Types = internal::ProxyTypeList; } // namespace testing -#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_ +#endif // GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_ diff --git a/third_party/googletest/src/src/gtest-death-test.cc b/third_party/googletest/src/src/gtest-death-test.cc index 5d1031bea2..bf4f6331da 100644 --- a/third_party/googletest/src/src/gtest-death-test.cc +++ b/third_party/googletest/src/src/gtest-death-test.cc @@ -32,6 +32,7 @@ #include "gtest/gtest-death-test.h" +#include #include #include "gtest/internal/gtest-port.h" @@ -247,7 +248,7 @@ static std::string DeathTestThreadWarning(size_t thread_count) { msg << "detected " << thread_count << " threads."; } msg << " See " - "https://github.com/google/googletest/blob/master/googletest/docs/" + "https://github.com/google/googletest/blob/master/docs/" "advanced.md#death-tests-and-threads" << " for more explanation and suggested solutions, especially if" << " this is the last message you see before your test times out."; @@ -864,7 +865,7 @@ class Arguments { } int size() { - return args_.size() - 1; + return static_cast(args_.size()) - 1; } private: @@ -890,18 +891,17 @@ int FuchsiaDeathTest::Wait() { // Register to wait for the child process to terminate. status_zx = child_process_.wait_async( - port, kProcessKey, ZX_PROCESS_TERMINATED, ZX_WAIT_ASYNC_ONCE); + port, kProcessKey, ZX_PROCESS_TERMINATED, 0); GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK); // Register to wait for the socket to be readable or closed. status_zx = stderr_socket_.wait_async( - port, kSocketKey, ZX_SOCKET_READABLE | ZX_SOCKET_PEER_CLOSED, - ZX_WAIT_ASYNC_ONCE); + port, kSocketKey, ZX_SOCKET_READABLE | ZX_SOCKET_PEER_CLOSED, 0); GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK); // Register to wait for an exception. status_zx = exception_channel_.wait_async( - port, kExceptionKey, ZX_CHANNEL_READABLE, ZX_WAIT_ASYNC_ONCE); + port, kExceptionKey, ZX_CHANNEL_READABLE, 0); GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK); bool process_terminated = false; @@ -941,8 +941,7 @@ int FuchsiaDeathTest::Wait() { } else { GTEST_DEATH_TEST_CHECK_(status_zx == ZX_ERR_SHOULD_WAIT); status_zx = stderr_socket_.wait_async( - port, kSocketKey, ZX_SOCKET_READABLE | ZX_SOCKET_PEER_CLOSED, - ZX_WAIT_ASYNC_ONCE); + port, kSocketKey, ZX_SOCKET_READABLE | ZX_SOCKET_PEER_CLOSED, 0); GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK); } } else { @@ -955,12 +954,12 @@ int FuchsiaDeathTest::Wait() { ReadAndInterpretStatusByte(); zx_info_process_t buffer; - status_zx = child_process_.get_info( - ZX_INFO_PROCESS, &buffer, sizeof(buffer), nullptr, nullptr); + status_zx = child_process_.get_info(ZX_INFO_PROCESS, &buffer, sizeof(buffer), + nullptr, nullptr); GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK); - GTEST_DEATH_TEST_CHECK_(buffer.exited); - set_status(buffer.return_code); + GTEST_DEATH_TEST_CHECK_(buffer.flags & ZX_INFO_PROCESS_FLAG_EXITED); + set_status(static_cast(buffer.return_code)); return status(); } @@ -1225,21 +1224,9 @@ struct ExecDeathTestArgs { int close_fd; // File descriptor to close; the read end of a pipe }; -# if GTEST_OS_MAC -inline char** GetEnviron() { - // When Google Test is built as a framework on MacOS X, the environ variable - // is unavailable. Apple's documentation (man environ) recommends using - // _NSGetEnviron() instead. - return *_NSGetEnviron(); -} -# else -// Some POSIX platforms expect you to declare environ. extern "C" makes -// it reside in the global namespace. +# if GTEST_OS_QNX extern "C" char** environ; -inline char** GetEnviron() { return environ; } -# endif // GTEST_OS_MAC - -# if !GTEST_OS_QNX +# else // GTEST_OS_QNX // The main function for a threadsafe-style death test child process. // This function is called in a clone()-ed process and thus must avoid // any potentially unsafe operations like malloc or libc functions. @@ -1259,18 +1246,18 @@ static int ExecDeathTestChildMain(void* child_arg) { return EXIT_FAILURE; } - // We can safely call execve() as it's a direct system call. We + // We can safely call execv() as it's almost a direct system call. We // cannot use execvp() as it's a libc function and thus potentially - // unsafe. Since execve() doesn't search the PATH, the user must + // unsafe. Since execv() doesn't search the PATH, the user must // invoke the test program via a valid path that contains at least // one path separator. - execve(args->argv[0], args->argv, GetEnviron()); - DeathTestAbort(std::string("execve(") + args->argv[0] + ", ...) in " + + execv(args->argv[0], args->argv); + DeathTestAbort(std::string("execv(") + args->argv[0] + ", ...) in " + original_dir + " failed: " + GetLastErrnoDescription()); return EXIT_FAILURE; } -# endif // !GTEST_OS_QNX +# endif // GTEST_OS_QNX # if GTEST_HAS_CLONE // Two utility routines that together determine the direction the stack @@ -1284,19 +1271,24 @@ static int ExecDeathTestChildMain(void* child_arg) { // correct answer. static void StackLowerThanAddress(const void* ptr, bool* result) GTEST_NO_INLINE_; +// Make sure sanitizers do not tamper with the stack here. +// Ideally, we want to use `__builtin_frame_address` instead of a local variable +// address with sanitizer disabled, but it does not work when the +// compiler optimizes the stack frame out, which happens on PowerPC targets. // HWAddressSanitizer add a random tag to the MSB of the local variable address, // making comparison result unpredictable. +GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_ static void StackLowerThanAddress(const void* ptr, bool* result) { - int dummy; - *result = (&dummy < ptr); + int dummy = 0; + *result = std::less()(&dummy, ptr); } // Make sure AddressSanitizer does not tamper with the stack here. GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_ static bool StackGrowsDown() { - int dummy; + int dummy = 0; bool result; StackLowerThanAddress(&dummy, &result); return result; @@ -1339,8 +1331,7 @@ static pid_t ExecDeathTestSpawnChild(char* const* argv, int close_fd) { fd_flags | FD_CLOEXEC)); struct inheritance inherit = {0}; // spawn is a system call. - child_pid = - spawn(args.argv[0], 0, nullptr, &inherit, args.argv, GetEnviron()); + child_pid = spawn(args.argv[0], 0, nullptr, &inherit, args.argv, environ); // Restores the current working directory. GTEST_DEATH_TEST_CHECK_(fchdir(cwd_fd) != -1); GTEST_DEATH_TEST_CHECK_SYSCALL_(close(cwd_fd)); diff --git a/third_party/googletest/src/src/gtest-filepath.cc b/third_party/googletest/src/src/gtest-filepath.cc index 9aad12fbd1..0b5629401b 100644 --- a/third_party/googletest/src/src/gtest-filepath.cc +++ b/third_party/googletest/src/src/gtest-filepath.cc @@ -92,8 +92,9 @@ static bool IsPathSeparator(char c) { // Returns the current working directory, or "" if unsuccessful. FilePath FilePath::GetCurrentDir() { -#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_PHONE || \ - GTEST_OS_WINDOWS_RT || GTEST_OS_ESP8266 || GTEST_OS_ESP32 +#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_PHONE || \ + GTEST_OS_WINDOWS_RT || GTEST_OS_ESP8266 || GTEST_OS_ESP32 || \ + GTEST_OS_XTENSA // These platforms do not have a current directory, so we just return // something reasonable. return FilePath(kCurrentDirectoryString); @@ -209,7 +210,7 @@ bool FilePath::FileOrDirectoryExists() const { delete [] unicode; return attributes != kInvalidFileAttributes; #else - posix::StatStruct file_stat; + posix::StatStruct file_stat{}; return posix::Stat(pathname_.c_str(), &file_stat) == 0; #endif // GTEST_OS_WINDOWS_MOBILE } @@ -236,7 +237,7 @@ bool FilePath::DirectoryExists() const { result = true; } #else - posix::StatStruct file_stat; + posix::StatStruct file_stat{}; result = posix::Stat(path.c_str(), &file_stat) == 0 && posix::IsDir(file_stat); #endif // GTEST_OS_WINDOWS_MOBILE @@ -323,7 +324,7 @@ bool FilePath::CreateFolder() const { delete [] unicode; #elif GTEST_OS_WINDOWS int result = _mkdir(pathname_.c_str()); -#elif GTEST_OS_ESP8266 +#elif GTEST_OS_ESP8266 || GTEST_OS_XTENSA // do nothing int result = 0; #else @@ -349,33 +350,19 @@ FilePath FilePath::RemoveTrailingPathSeparator() const { // For example, "bar///foo" becomes "bar/foo". Does not eliminate other // redundancies that might be in a pathname involving "." or "..". void FilePath::Normalize() { - if (pathname_.c_str() == nullptr) { - pathname_ = ""; - return; - } - const char* src = pathname_.c_str(); - char* const dest = new char[pathname_.length() + 1]; - char* dest_ptr = dest; - memset(dest_ptr, 0, pathname_.length() + 1); - - while (*src != '\0') { - *dest_ptr = *src; - if (!IsPathSeparator(*src)) { - src++; + auto out = pathname_.begin(); + + for (const char character : pathname_) { + if (!IsPathSeparator(character)) { + *(out++) = character; + } else if (out == pathname_.begin() || *std::prev(out) != kPathSeparator) { + *(out++) = kPathSeparator; } else { -#if GTEST_HAS_ALT_PATH_SEP_ - if (*dest_ptr == kAlternatePathSeparator) { - *dest_ptr = kPathSeparator; - } -#endif - while (IsPathSeparator(*src)) - src++; + continue; } - dest_ptr++; } - *dest_ptr = '\0'; - pathname_ = dest; - delete[] dest; + + pathname_.erase(out, pathname_.end()); } } // namespace internal diff --git a/third_party/googletest/src/src/gtest-internal-inl.h b/third_party/googletest/src/src/gtest-internal-inl.h index e42ff47539..6d8cecbbb3 100644 --- a/third_party/googletest/src/src/gtest-internal-inl.h +++ b/third_party/googletest/src/src/gtest-internal-inl.h @@ -31,8 +31,8 @@ // This file contains purely Google Test's internal implementation. Please // DO NOT #INCLUDE IT IN A USER PROGRAM. -#ifndef GTEST_SRC_GTEST_INTERNAL_INL_H_ -#define GTEST_SRC_GTEST_INTERNAL_INL_H_ +#ifndef GOOGLETEST_SRC_GTEST_INTERNAL_INL_H_ +#define GOOGLETEST_SRC_GTEST_INTERNAL_INL_H_ #ifndef _WIN32_WCE # include @@ -84,9 +84,11 @@ const char kAlsoRunDisabledTestsFlag[] = "also_run_disabled_tests"; const char kBreakOnFailureFlag[] = "break_on_failure"; const char kCatchExceptionsFlag[] = "catch_exceptions"; const char kColorFlag[] = "color"; +const char kFailFast[] = "fail_fast"; const char kFilterFlag[] = "filter"; const char kListTestsFlag[] = "list_tests"; const char kOutputFlag[] = "output"; +const char kBriefFlag[] = "brief"; const char kPrintTimeFlag[] = "print_time"; const char kPrintUTF8Flag[] = "print_utf8"; const char kRandomSeedFlag[] = "random_seed"; @@ -164,10 +166,12 @@ class GTestFlagSaver { color_ = GTEST_FLAG(color); death_test_style_ = GTEST_FLAG(death_test_style); death_test_use_fork_ = GTEST_FLAG(death_test_use_fork); + fail_fast_ = GTEST_FLAG(fail_fast); filter_ = GTEST_FLAG(filter); internal_run_death_test_ = GTEST_FLAG(internal_run_death_test); list_tests_ = GTEST_FLAG(list_tests); output_ = GTEST_FLAG(output); + brief_ = GTEST_FLAG(brief); print_time_ = GTEST_FLAG(print_time); print_utf8_ = GTEST_FLAG(print_utf8); random_seed_ = GTEST_FLAG(random_seed); @@ -187,9 +191,11 @@ class GTestFlagSaver { GTEST_FLAG(death_test_style) = death_test_style_; GTEST_FLAG(death_test_use_fork) = death_test_use_fork_; GTEST_FLAG(filter) = filter_; + GTEST_FLAG(fail_fast) = fail_fast_; GTEST_FLAG(internal_run_death_test) = internal_run_death_test_; GTEST_FLAG(list_tests) = list_tests_; GTEST_FLAG(output) = output_; + GTEST_FLAG(brief) = brief_; GTEST_FLAG(print_time) = print_time_; GTEST_FLAG(print_utf8) = print_utf8_; GTEST_FLAG(random_seed) = random_seed_; @@ -208,10 +214,12 @@ class GTestFlagSaver { std::string color_; std::string death_test_style_; bool death_test_use_fork_; + bool fail_fast_; std::string filter_; std::string internal_run_death_test_; bool list_tests_; std::string output_; + bool brief_; bool print_time_; bool print_utf8_; int32_t random_seed_; @@ -386,13 +394,6 @@ class GTEST_API_ UnitTestOptions { // Functions for processing the gtest_filter flag. - // Returns true if and only if the wildcard pattern matches the string. - // The first ':' or '\0' character in pattern marks the end of it. - // - // This recursive algorithm isn't very efficient, but is clear and - // works well enough for matching test names, which are short. - static bool PatternMatchesString(const char *pattern, const char *str); - // Returns true if and only if the user-specified filter matches the test // suite name and the test name. static bool FilterMatchesTest(const std::string& test_suite_name, @@ -647,10 +648,10 @@ class GTEST_API_ UnitTestImpl { // Arguments: // // test_suite_name: name of the test suite - // type_param: the name of the test's type parameter, or NULL if - // this is not a typed or a type-parameterized test. - // set_up_tc: pointer to the function that sets up the test suite - // tear_down_tc: pointer to the function that tears down the test suite + // type_param: the name of the test's type parameter, or NULL if + // this is not a typed or a type-parameterized test. + // set_up_tc: pointer to the function that sets up the test suite + // tear_down_tc: pointer to the function that tears down the test suite TestSuite* GetTestSuite(const char* test_suite_name, const char* type_param, internal::SetUpTestSuiteFunc set_up_tc, internal::TearDownTestSuiteFunc tear_down_tc); @@ -674,6 +675,7 @@ class GTEST_API_ UnitTestImpl { void AddTestInfo(internal::SetUpTestSuiteFunc set_up_tc, internal::TearDownTestSuiteFunc tear_down_tc, TestInfo* test_info) { +#if GTEST_HAS_DEATH_TEST // In order to support thread-safe death tests, we need to // remember the original working directory when the test program // was first invoked. We cannot do this in RUN_ALL_TESTS(), as @@ -686,6 +688,7 @@ class GTEST_API_ UnitTestImpl { GTEST_CHECK_(!original_working_dir_.IsEmpty()) << "Failed to get the current working directory."; } +#endif // GTEST_HAS_DEATH_TEST GetTestSuite(test_info->test_suite_name(), test_info->type_param(), set_up_tc, tear_down_tc) @@ -1161,13 +1164,13 @@ class StreamingListener : public EmptyTestEventListener { } // Note that "event=TestCaseStart" is a wire format and has to remain - // "case" for compatibilty + // "case" for compatibility void OnTestCaseStart(const TestCase& test_case) override { SendLn(std::string("event=TestCaseStart&name=") + test_case.name()); } // Note that "event=TestCaseEnd" is a wire format and has to remain - // "case" for compatibilty + // "case" for compatibility void OnTestCaseEnd(const TestCase& test_case) override { SendLn("event=TestCaseEnd&passed=" + FormatBool(test_case.Passed()) + "&elapsed_time=" + StreamableToString(test_case.elapsed_time()) + @@ -1215,4 +1218,4 @@ class StreamingListener : public EmptyTestEventListener { GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 -#endif // GTEST_SRC_GTEST_INTERNAL_INL_H_ +#endif // GOOGLETEST_SRC_GTEST_INTERNAL_INL_H_ diff --git a/third_party/googletest/src/src/gtest-port.cc b/third_party/googletest/src/src/gtest-port.cc index a05c50a39b..53a4d37f97 100644 --- a/third_party/googletest/src/src/gtest-port.cc +++ b/third_party/googletest/src/src/gtest-port.cc @@ -198,7 +198,8 @@ size_t GetThreadCount() { if (sysctl(mib, miblen, NULL, &size, NULL, 0)) { return 0; } - mib[5] = size / mib[4]; + + mib[5] = static_cast(size / static_cast(mib[4])); // populate array of structs struct kinfo_proc info[mib[5]]; @@ -207,8 +208,8 @@ size_t GetThreadCount() { } // exclude empty members - int nthreads = 0; - for (int i = 0; i < size / mib[4]; i++) { + size_t nthreads = 0; + for (size_t i = 0; i < size / static_cast(mib[4]); i++) { if (info[i].p_tid != -1) nthreads++; } @@ -687,8 +688,8 @@ class ThreadLocalRegistryImpl { static Mutex thread_map_mutex_; }; -Mutex ThreadLocalRegistryImpl::mutex_(Mutex::kStaticMutex); -Mutex ThreadLocalRegistryImpl::thread_map_mutex_(Mutex::kStaticMutex); +Mutex ThreadLocalRegistryImpl::mutex_(Mutex::kStaticMutex); // NOLINT +Mutex ThreadLocalRegistryImpl::thread_map_mutex_(Mutex::kStaticMutex); // NOLINT ThreadLocalValueHolderBase* ThreadLocalRegistry::GetValueOnCurrentThread( const ThreadLocalBase* thread_local_instance) { @@ -1094,9 +1095,9 @@ class CapturedStream { filename_ = temp_file_path; # else // There's no guarantee that a test has write access to the current - // directory, so we create the temporary file in the /tmp directory - // instead. We use /tmp on most systems, and /sdcard on Android. - // That's because Android doesn't have /tmp. + // directory, so we create the temporary file in a temporary directory. + std::string name_template; + # if GTEST_OS_LINUX_ANDROID // Note: Android applications are expected to call the framework's // Context.getExternalStorageDirectory() method through JNI to get @@ -1109,17 +1110,46 @@ class CapturedStream { // The location /data/local/tmp is directly accessible from native code. // '/sdcard' and other variants cannot be relied on, as they are not // guaranteed to be mounted, or may have a delay in mounting. - char name_template[] = "/data/local/tmp/gtest_captured_stream.XXXXXX"; + name_template = "/data/local/tmp/"; +# elif GTEST_OS_IOS + char user_temp_dir[PATH_MAX + 1]; + + // Documented alternative to NSTemporaryDirectory() (for obtaining creating + // a temporary directory) at + // https://developer.apple.com/library/archive/documentation/Security/Conceptual/SecureCodingGuide/Articles/RaceConditions.html#//apple_ref/doc/uid/TP40002585-SW10 + // + // _CS_DARWIN_USER_TEMP_DIR (as well as _CS_DARWIN_USER_CACHE_DIR) is not + // documented in the confstr() man page at + // https://developer.apple.com/library/archive/documentation/System/Conceptual/ManPages_iPhoneOS/man3/confstr.3.html#//apple_ref/doc/man/3/confstr + // but are still available, according to the WebKit patches at + // https://trac.webkit.org/changeset/262004/webkit + // https://trac.webkit.org/changeset/263705/webkit + // + // The confstr() implementation falls back to getenv("TMPDIR"). See + // https://opensource.apple.com/source/Libc/Libc-1439.100.3/gen/confstr.c.auto.html + ::confstr(_CS_DARWIN_USER_TEMP_DIR, user_temp_dir, sizeof(user_temp_dir)); + + name_template = user_temp_dir; + if (name_template.back() != GTEST_PATH_SEP_[0]) + name_template.push_back(GTEST_PATH_SEP_[0]); # else - char name_template[] = "/tmp/captured_stream.XXXXXX"; -# endif // GTEST_OS_LINUX_ANDROID - const int captured_fd = mkstemp(name_template); + name_template = "/tmp/"; +# endif + name_template.append("gtest_captured_stream.XXXXXX"); + + // mkstemp() modifies the string bytes in place, and does not go beyond the + // string's length. This results in well-defined behavior in C++17. + // + // The const_cast is needed below C++17. The constraints on std::string + // implementations in C++11 and above make assumption behind the const_cast + // fairly safe. + const int captured_fd = ::mkstemp(const_cast(name_template.data())); if (captured_fd == -1) { GTEST_LOG_(WARNING) << "Failed to create tmp file " << name_template << " for test; does the test have access to the /tmp directory?"; } - filename_ = name_template; + filename_ = std::move(name_template); # endif // GTEST_OS_WINDOWS fflush(nullptr); dup2(captured_fd, fd_); diff --git a/third_party/googletest/src/src/gtest-printers.cc b/third_party/googletest/src/src/gtest-printers.cc index 3337be312e..1b68fcb500 100644 --- a/third_party/googletest/src/src/gtest-printers.cc +++ b/third_party/googletest/src/src/gtest-printers.cc @@ -42,11 +42,16 @@ // defines Foo. #include "gtest/gtest-printers.h" + #include + #include +#include #include #include // NOLINT #include +#include + #include "gtest/internal/gtest-port.h" #include "src/gtest-internal-inl.h" @@ -102,9 +107,19 @@ void PrintBytesInObjectToImpl(const unsigned char* obj_bytes, size_t count, *os << ">"; } +// Helpers for widening a character to char32_t. Since the standard does not +// specify if char / wchar_t is signed or unsigned, it is important to first +// convert it to the unsigned type of the same width before widening it to +// char32_t. +template +char32_t ToChar32(CharType in) { + return static_cast( + static_cast::type>(in)); +} + } // namespace -namespace internal2 { +namespace internal { // Delegates to PrintBytesInObjectToImpl() to print the bytes in the // given object. The delegation simplifies the implementation, which @@ -116,10 +131,6 @@ void PrintBytesInObjectTo(const unsigned char* obj_bytes, size_t count, PrintBytesInObjectToImpl(obj_bytes, count, os); } -} // namespace internal2 - -namespace internal { - // Depending on the value of a char (or wchar_t), we print it in one // of three formats: // - as is if it's a printable ASCII (e.g. 'a', '2', ' '), @@ -134,18 +145,15 @@ enum CharFormat { // Returns true if c is a printable ASCII character. We test the // value of c directly instead of calling isprint(), which is buggy on // Windows Mobile. -inline bool IsPrintableAscii(wchar_t c) { - return 0x20 <= c && c <= 0x7E; -} +inline bool IsPrintableAscii(char32_t c) { return 0x20 <= c && c <= 0x7E; } -// Prints a wide or narrow char c as a character literal without the -// quotes, escaping it when necessary; returns how c was formatted. -// The template argument UnsignedChar is the unsigned version of Char, -// which is the type of c. -template +// Prints c (of type char, char8_t, char16_t, char32_t, or wchar_t) as a +// character literal without the quotes, escaping it when necessary; returns how +// c was formatted. +template static CharFormat PrintAsCharLiteralTo(Char c, ostream* os) { - wchar_t w_c = static_cast(c); - switch (w_c) { + const char32_t u_c = ToChar32(c); + switch (u_c) { case L'\0': *os << "\\0"; break; @@ -177,13 +185,12 @@ static CharFormat PrintAsCharLiteralTo(Char c, ostream* os) { *os << "\\v"; break; default: - if (IsPrintableAscii(w_c)) { + if (IsPrintableAscii(u_c)) { *os << static_cast(c); return kAsIs; } else { ostream::fmtflags flags = os->flags(); - *os << "\\x" << std::hex << std::uppercase - << static_cast(static_cast(c)); + *os << "\\x" << std::hex << std::uppercase << static_cast(u_c); os->flags(flags); return kHexEscape; } @@ -191,9 +198,9 @@ static CharFormat PrintAsCharLiteralTo(Char c, ostream* os) { return kSpecialEscape; } -// Prints a wchar_t c as if it's part of a string literal, escaping it when +// Prints a char32_t c as if it's part of a string literal, escaping it when // necessary; returns how c was formatted. -static CharFormat PrintAsStringLiteralTo(wchar_t c, ostream* os) { +static CharFormat PrintAsStringLiteralTo(char32_t c, ostream* os) { switch (c) { case L'\'': *os << "'"; @@ -202,26 +209,68 @@ static CharFormat PrintAsStringLiteralTo(wchar_t c, ostream* os) { *os << "\\\""; return kSpecialEscape; default: - return PrintAsCharLiteralTo(c, os); + return PrintAsCharLiteralTo(c, os); } } +static const char* GetCharWidthPrefix(char) { + return ""; +} + +static const char* GetCharWidthPrefix(signed char) { + return ""; +} + +static const char* GetCharWidthPrefix(unsigned char) { + return ""; +} + +#ifdef __cpp_char8_t +static const char* GetCharWidthPrefix(char8_t) { + return "u8"; +} +#endif + +static const char* GetCharWidthPrefix(char16_t) { + return "u"; +} + +static const char* GetCharWidthPrefix(char32_t) { + return "U"; +} + +static const char* GetCharWidthPrefix(wchar_t) { + return "L"; +} + // Prints a char c as if it's part of a string literal, escaping it when // necessary; returns how c was formatted. static CharFormat PrintAsStringLiteralTo(char c, ostream* os) { - return PrintAsStringLiteralTo( - static_cast(static_cast(c)), os); + return PrintAsStringLiteralTo(ToChar32(c), os); +} + +#ifdef __cpp_char8_t +static CharFormat PrintAsStringLiteralTo(char8_t c, ostream* os) { + return PrintAsStringLiteralTo(ToChar32(c), os); +} +#endif + +static CharFormat PrintAsStringLiteralTo(char16_t c, ostream* os) { + return PrintAsStringLiteralTo(ToChar32(c), os); } -// Prints a wide or narrow character c and its code. '\0' is printed -// as "'\\0'", other unprintable characters are also properly escaped -// using the standard C++ escape sequence. The template argument -// UnsignedChar is the unsigned version of Char, which is the type of c. -template +static CharFormat PrintAsStringLiteralTo(wchar_t c, ostream* os) { + return PrintAsStringLiteralTo(ToChar32(c), os); +} + +// Prints a character c (of type char, char8_t, char16_t, char32_t, or wchar_t) +// and its code. '\0' is printed as "'\\0'", other unprintable characters are +// also properly escaped using the standard C++ escape sequence. +template void PrintCharAndCodeTo(Char c, ostream* os) { // First, print c as a literal in the most readable form we can find. - *os << ((sizeof(c) > 1) ? "L'" : "'"); - const CharFormat format = PrintAsCharLiteralTo(c, os); + *os << GetCharWidthPrefix(c) << "'"; + const CharFormat format = PrintAsCharLiteralTo(c, os); *os << "'"; // To aid user debugging, we also print c's code in decimal, unless @@ -242,21 +291,21 @@ void PrintCharAndCodeTo(Char c, ostream* os) { *os << ")"; } -void PrintTo(unsigned char c, ::std::ostream* os) { - PrintCharAndCodeTo(c, os); -} -void PrintTo(signed char c, ::std::ostream* os) { - PrintCharAndCodeTo(c, os); -} +void PrintTo(unsigned char c, ::std::ostream* os) { PrintCharAndCodeTo(c, os); } +void PrintTo(signed char c, ::std::ostream* os) { PrintCharAndCodeTo(c, os); } // Prints a wchar_t as a symbol if it is printable or as its internal // code otherwise and also as its code. L'\0' is printed as "L'\\0'". -void PrintTo(wchar_t wc, ostream* os) { - PrintCharAndCodeTo(wc, os); +void PrintTo(wchar_t wc, ostream* os) { PrintCharAndCodeTo(wc, os); } + +// TODO(dcheng): Consider making this delegate to PrintCharAndCodeTo() as well. +void PrintTo(char32_t c, ::std::ostream* os) { + *os << std::hex << "U+" << std::uppercase << std::setfill('0') << std::setw(4) + << static_cast(c); } // Prints the given array of characters to the ostream. CharType must be either -// char or wchar_t. +// char, char8_t, char16_t, char32_t, or wchar_t. // The array starts at begin, the length is len, it may include '\0' characters // and may not be NUL-terminated. template @@ -266,8 +315,8 @@ GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_ GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ static CharFormat PrintCharsAsStringTo( const CharType* begin, size_t len, ostream* os) { - const char* const kQuoteBegin = sizeof(CharType) == 1 ? "\"" : "L\""; - *os << kQuoteBegin; + const char* const quote_prefix = GetCharWidthPrefix(*begin); + *os << quote_prefix << "\""; bool is_previous_hex = false; CharFormat print_format = kAsIs; for (size_t index = 0; index < len; ++index) { @@ -276,7 +325,7 @@ static CharFormat PrintCharsAsStringTo( // Previous character is of '\x..' form and this character can be // interpreted as another hexadecimal digit in its number. Break string to // disambiguate. - *os << "\" " << kQuoteBegin; + *os << "\" " << quote_prefix << "\""; } is_previous_hex = PrintAsStringLiteralTo(cur, os) == kHexEscape; // Remember if any characters required hex escaping. @@ -322,22 +371,57 @@ void UniversalPrintArray(const char* begin, size_t len, ostream* os) { UniversalPrintCharArray(begin, len, os); } +#ifdef __cpp_char8_t +// Prints a (const) char8_t array of 'len' elements, starting at address +// 'begin'. +void UniversalPrintArray(const char8_t* begin, size_t len, ostream* os) { + UniversalPrintCharArray(begin, len, os); +} +#endif + +// Prints a (const) char16_t array of 'len' elements, starting at address +// 'begin'. +void UniversalPrintArray(const char16_t* begin, size_t len, ostream* os) { + UniversalPrintCharArray(begin, len, os); +} + +// Prints a (const) char32_t array of 'len' elements, starting at address +// 'begin'. +void UniversalPrintArray(const char32_t* begin, size_t len, ostream* os) { + UniversalPrintCharArray(begin, len, os); +} + // Prints a (const) wchar_t array of 'len' elements, starting at address // 'begin'. void UniversalPrintArray(const wchar_t* begin, size_t len, ostream* os) { UniversalPrintCharArray(begin, len, os); } -// Prints the given C string to the ostream. -void PrintTo(const char* s, ostream* os) { +namespace { + +// Prints a null-terminated C-style string to the ostream. +template +void PrintCStringTo(const Char* s, ostream* os) { if (s == nullptr) { *os << "NULL"; } else { *os << ImplicitCast_(s) << " pointing to "; - PrintCharsAsStringTo(s, strlen(s), os); + PrintCharsAsStringTo(s, std::char_traits::length(s), os); } } +} // anonymous namespace + +void PrintTo(const char* s, ostream* os) { PrintCStringTo(s, os); } + +#ifdef __cpp_char8_t +void PrintTo(const char8_t* s, ostream* os) { PrintCStringTo(s, os); } +#endif + +void PrintTo(const char16_t* s, ostream* os) { PrintCStringTo(s, os); } + +void PrintTo(const char32_t* s, ostream* os) { PrintCStringTo(s, os); } + // MSVC compiler can be configured to define whar_t as a typedef // of unsigned short. Defining an overload for const wchar_t* in that case // would cause pointers to unsigned shorts be printed as wide strings, @@ -346,14 +430,7 @@ void PrintTo(const char* s, ostream* os) { // wchar_t is implemented as a native type. #if !defined(_MSC_VER) || defined(_NATIVE_WCHAR_T_DEFINED) // Prints the given wide C string to the ostream. -void PrintTo(const wchar_t* s, ostream* os) { - if (s == nullptr) { - *os << "NULL"; - } else { - *os << ImplicitCast_(s) << " pointing to "; - PrintCharsAsStringTo(s, wcslen(s), os); - } -} +void PrintTo(const wchar_t* s, ostream* os) { PrintCStringTo(s, os); } #endif // wchar_t is native namespace { @@ -431,6 +508,20 @@ void PrintStringTo(const ::std::string& s, ostream* os) { } } +#ifdef __cpp_char8_t +void PrintU8StringTo(const ::std::u8string& s, ostream* os) { + PrintCharsAsStringTo(s.data(), s.size(), os); +} +#endif + +void PrintU16StringTo(const ::std::u16string& s, ostream* os) { + PrintCharsAsStringTo(s.data(), s.size(), os); +} + +void PrintU32StringTo(const ::std::u32string& s, ostream* os) { + PrintCharsAsStringTo(s.data(), s.size(), os); +} + #if GTEST_HAS_STD_WSTRING void PrintWideStringTo(const ::std::wstring& s, ostream* os) { PrintCharsAsStringTo(s.data(), s.size(), os); diff --git a/third_party/googletest/src/src/gtest-typed-test.cc b/third_party/googletest/src/src/gtest-typed-test.cc index 1b1cfb0dc1..c02c3df659 100644 --- a/third_party/googletest/src/src/gtest-typed-test.cc +++ b/third_party/googletest/src/src/gtest-typed-test.cc @@ -35,8 +35,6 @@ namespace testing { namespace internal { -#if GTEST_HAS_TYPED_TEST_P - // Skips to the first non-space char in str. Returns an empty string if str // contains only whitespace characters. static const char* SkipSpaces(const char* str) { @@ -78,17 +76,7 @@ const char* TypedTestSuitePState::VerifyRegisteredTestNames( continue; } - bool found = false; - for (RegisteredTestIter it = registered_tests_.begin(); - it != registered_tests_.end(); - ++it) { - if (name == it->first) { - found = true; - break; - } - } - - if (found) { + if (registered_tests_.count(name) != 0) { tests.insert(name); } else { errors << "No test named " << name @@ -115,7 +103,5 @@ const char* TypedTestSuitePState::VerifyRegisteredTestNames( return registered_tests; } -#endif // GTEST_HAS_TYPED_TEST_P - } // namespace internal } // namespace testing diff --git a/third_party/googletest/src/src/gtest.cc b/third_party/googletest/src/src/gtest.cc index b8f6a5c31c..21c611aff1 100644 --- a/third_party/googletest/src/src/gtest.cc +++ b/third_party/googletest/src/src/gtest.cc @@ -35,7 +35,6 @@ #include "gtest/gtest-spi.h" #include -#include #include #include #include @@ -44,6 +43,8 @@ #include #include +#include // NOLINT +#include #include #include #include @@ -55,8 +56,6 @@ #if GTEST_OS_LINUX -# define GTEST_HAS_GETTIMEOFDAY_ 1 - # include // NOLINT # include // NOLINT # include // NOLINT @@ -68,7 +67,6 @@ # include #elif GTEST_OS_ZOS -# define GTEST_HAS_GETTIMEOFDAY_ 1 # include // NOLINT // On z/OS we additionally need strings.h for strcasecmp. @@ -86,7 +84,6 @@ #ifdef _MSC_VER # include // NOLINT -# include // NOLINT #endif # include // NOLINT @@ -95,16 +92,11 @@ # include // NOLINT # if GTEST_OS_WINDOWS_MINGW -// MinGW has gettimeofday() but not _ftime64(). -# define GTEST_HAS_GETTIMEOFDAY_ 1 # include // NOLINT # endif // GTEST_OS_WINDOWS_MINGW #else -// Assume other platforms have gettimeofday(). -# define GTEST_HAS_GETTIMEOFDAY_ 1 - // cpplint thinks that the header is already included, so we want to // silence it. # include // NOLINT @@ -213,6 +205,21 @@ static const char* GetDefaultFilter() { return kUniversalFilter; } +// Bazel passes in the argument to '--test_runner_fail_fast' via the +// TESTBRIDGE_TEST_RUNNER_FAIL_FAST environment variable. +static bool GetDefaultFailFast() { + const char* const testbridge_test_runner_fail_fast = + internal::posix::GetEnv("TESTBRIDGE_TEST_RUNNER_FAIL_FAST"); + if (testbridge_test_runner_fail_fast != nullptr) { + return strcmp(testbridge_test_runner_fail_fast, "1") == 0; + } + return false; +} + +GTEST_DEFINE_bool_( + fail_fast, internal::BoolFromGTestEnv("fail_fast", GetDefaultFailFast()), + "True if and only if a test failure should stop further test execution."); + GTEST_DEFINE_bool_( also_run_disabled_tests, internal::BoolFromGTestEnv("also_run_disabled_tests", false), @@ -273,6 +280,10 @@ GTEST_DEFINE_string_( "executable's name and, if necessary, made unique by adding " "digits."); +GTEST_DEFINE_bool_( + brief, internal::BoolFromGTestEnv("brief", false), + "True if only test failures should be displayed in text output."); + GTEST_DEFINE_bool_(print_time, internal::BoolFromGTestEnv("print_time", true), "True if and only if " GTEST_NAME_ " should display elapsed time in text output."); @@ -479,7 +490,7 @@ void InsertSyntheticTestCase(const std::string& name, CodeLocation location, "removed but the rest got left behind."; std::string message = - "Paramaterized test suite " + name + + "Parameterized test suite " + name + (has_test_p ? kMissingInstantiation : kMissingTestCase) + "\n\n" "To suppress this error for this test suite, insert the following line " @@ -487,7 +498,7 @@ void InsertSyntheticTestCase(const std::string& name, CodeLocation location, "\n\n" "GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(" + name + ");"; - std::string full_name = "UninstantiatedParamaterizedTestSuite<" + name + ">"; + std::string full_name = "UninstantiatedParameterizedTestSuite<" + name + ">"; RegisterTest( // "GoogleTestVerification", full_name.c_str(), nullptr, // No type parameter. @@ -534,7 +545,7 @@ void TypeParameterizedTestSuiteRegistry::CheckForInstantiations() { if (ignored.find(testcase.first) != ignored.end()) continue; std::string message = - "Type paramaterized test suite " + testcase.first + + "Type parameterized test suite " + testcase.first + " is defined via REGISTER_TYPED_TEST_SUITE_P, but never instantiated " "via INSTANTIATE_TYPED_TEST_SUITE_P. None of the test cases will run." "\n\n" @@ -544,13 +555,13 @@ void TypeParameterizedTestSuiteRegistry::CheckForInstantiations() { "utilities.)" "\n\n" "To suppress this error for this test suite, insert the following line " - "(in a non-header) in the namespace it is definedin in:" + "(in a non-header) in the namespace it is defined in:" "\n\n" "GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(" + testcase.first + ");"; std::string full_name = - "UninstantiatedTypeParamaterizedTestSuite<" + testcase.first + ">"; + "UninstantiatedTypeParameterizedTestSuite<" + testcase.first + ">"; RegisterTest( // "GoogleTestVerification", full_name.c_str(), nullptr, // No type parameter. @@ -635,47 +646,82 @@ std::string UnitTestOptions::GetAbsolutePathToOutputFile() { return result.string(); } -// Returns true if and only if the wildcard pattern matches the string. -// The first ':' or '\0' character in pattern marks the end of it. +// Returns true if and only if the wildcard pattern matches the string. Each +// pattern consists of regular characters, single-character wildcards (?), and +// multi-character wildcards (*). // -// This recursive algorithm isn't very efficient, but is clear and -// works well enough for matching test names, which are short. -bool UnitTestOptions::PatternMatchesString(const char *pattern, - const char *str) { - switch (*pattern) { - case '\0': - case ':': // Either ':' or '\0' marks the end of the pattern. - return *str == '\0'; - case '?': // Matches any single character. - return *str != '\0' && PatternMatchesString(pattern + 1, str + 1); - case '*': // Matches any string (possibly empty) of characters. - return (*str != '\0' && PatternMatchesString(pattern, str + 1)) || - PatternMatchesString(pattern + 1, str); - default: // Non-special character. Matches itself. - return *pattern == *str && - PatternMatchesString(pattern + 1, str + 1); - } -} - -bool UnitTestOptions::MatchesFilter( - const std::string& name, const char* filter) { - const char *cur_pattern = filter; - for (;;) { - if (PatternMatchesString(cur_pattern, name.c_str())) { - return true; +// This function implements a linear-time string globbing algorithm based on +// https://research.swtch.com/glob. +static bool PatternMatchesString(const std::string& name_str, + const char* pattern, const char* pattern_end) { + const char* name = name_str.c_str(); + const char* const name_begin = name; + const char* const name_end = name + name_str.size(); + + const char* pattern_next = pattern; + const char* name_next = name; + + while (pattern < pattern_end || name < name_end) { + if (pattern < pattern_end) { + switch (*pattern) { + default: // Match an ordinary character. + if (name < name_end && *name == *pattern) { + ++pattern; + ++name; + continue; + } + break; + case '?': // Match any single character. + if (name < name_end) { + ++pattern; + ++name; + continue; + } + break; + case '*': + // Match zero or more characters. Start by skipping over the wildcard + // and matching zero characters from name. If that fails, restart and + // match one more character than the last attempt. + pattern_next = pattern; + name_next = name + 1; + ++pattern; + continue; + } + } + // Failed to match a character. Restart if possible. + if (name_begin < name_next && name_next <= name_end) { + pattern = pattern_next; + name = name_next; + continue; } + return false; + } + return true; +} - // Finds the next pattern in the filter. - cur_pattern = strchr(cur_pattern, ':'); +bool UnitTestOptions::MatchesFilter(const std::string& name_str, + const char* filter) { + // The filter is a list of patterns separated by colons (:). + const char* pattern = filter; + while (true) { + // Find the bounds of this pattern. + const char* const next_sep = strchr(pattern, ':'); + const char* const pattern_end = + next_sep != nullptr ? next_sep : pattern + strlen(pattern); - // Returns if no more pattern can be found. - if (cur_pattern == nullptr) { - return false; + // Check if this pattern matches name_str. + if (PatternMatchesString(name_str, pattern, pattern_end)) { + return true; } - // Skips the pattern separater (the ':' character). - cur_pattern++; + // Give up on this pattern. However, if we found a pattern separator (:), + // advance to the next pattern (skipping over the separator) and restart. + if (next_sep == nullptr) { + return false; + } + pattern = next_sep + 1; } + return true; } // Returns true if and only if the user-specified filter matches the test @@ -985,44 +1031,30 @@ std::string UnitTestImpl::CurrentOsStackTraceExceptTop(int skip_count) { ); // NOLINT } -// Returns the current time in milliseconds. +// A helper class for measuring elapsed times. +class Timer { + public: + Timer() : start_(std::chrono::steady_clock::now()) {} + + // Return time elapsed in milliseconds since the timer was created. + TimeInMillis Elapsed() { + return std::chrono::duration_cast( + std::chrono::steady_clock::now() - start_) + .count(); + } + + private: + std::chrono::steady_clock::time_point start_; +}; + +// Returns a timestamp as milliseconds since the epoch. Note this time may jump +// around subject to adjustments by the system, to measure elapsed time use +// Timer instead. TimeInMillis GetTimeInMillis() { -#if GTEST_OS_WINDOWS_MOBILE || defined(__BORLANDC__) - // Difference between 1970-01-01 and 1601-01-01 in milliseconds. - // http://analogous.blogspot.com/2005/04/epoch.html - const TimeInMillis kJavaEpochToWinFileTimeDelta = - static_cast(116444736UL) * 100000UL; - const DWORD kTenthMicrosInMilliSecond = 10000; - - SYSTEMTIME now_systime; - FILETIME now_filetime; - ULARGE_INTEGER now_int64; - GetSystemTime(&now_systime); - if (SystemTimeToFileTime(&now_systime, &now_filetime)) { - now_int64.LowPart = now_filetime.dwLowDateTime; - now_int64.HighPart = now_filetime.dwHighDateTime; - now_int64.QuadPart = (now_int64.QuadPart / kTenthMicrosInMilliSecond) - - kJavaEpochToWinFileTimeDelta; - return now_int64.QuadPart; - } - return 0; -#elif GTEST_OS_WINDOWS && !GTEST_HAS_GETTIMEOFDAY_ - __timeb64 now; - - // MSVC 8 deprecates _ftime64(), so we want to suppress warning 4996 - // (deprecated function) there. - GTEST_DISABLE_MSC_DEPRECATED_PUSH_() - _ftime64(&now); - GTEST_DISABLE_MSC_DEPRECATED_POP_() - - return static_cast(now.time) * 1000 + now.millitm; -#elif GTEST_HAS_GETTIMEOFDAY_ - struct timeval now; - gettimeofday(&now, nullptr); - return static_cast(now.tv_sec) * 1000 + now.tv_usec / 1000; -#else -# error "Don't know how to get the current time on your system." -#endif + return std::chrono::duration_cast( + std::chrono::system_clock::now() - + std::chrono::system_clock::from_time_t(0)) + .count(); } // Utilities @@ -1537,6 +1569,31 @@ AssertionResult DoubleNearPredFormat(const char* expr1, const double diff = fabs(val1 - val2); if (diff <= abs_error) return AssertionSuccess(); + // Find the value which is closest to zero. + const double min_abs = std::min(fabs(val1), fabs(val2)); + // Find the distance to the next double from that value. + const double epsilon = + nextafter(min_abs, std::numeric_limits::infinity()) - min_abs; + // Detect the case where abs_error is so small that EXPECT_NEAR is + // effectively the same as EXPECT_EQUAL, and give an informative error + // message so that the situation can be more easily understood without + // requiring exotic floating-point knowledge. + // Don't do an epsilon check if abs_error is zero because that implies + // that an equality check was actually intended. + if (!(std::isnan)(val1) && !(std::isnan)(val2) && abs_error > 0 && + abs_error < epsilon) { + return AssertionFailure() + << "The difference between " << expr1 << " and " << expr2 << " is " + << diff << ", where\n" + << expr1 << " evaluates to " << val1 << ",\n" + << expr2 << " evaluates to " << val2 << ".\nThe abs_error parameter " + << abs_error_expr << " evaluates to " << abs_error + << " which is smaller than the minimum distance between doubles for " + "numbers of this magnitude which is " + << epsilon + << ", thus making this EXPECT_NEAR check equivalent to " + "EXPECT_EQUAL. Consider using EXPECT_DOUBLE_EQ instead."; + } return AssertionFailure() << "The difference between " << expr1 << " and " << expr2 << " is " << diff << ", which exceeds " << abs_error_expr << ", where\n" @@ -1599,57 +1656,6 @@ AssertionResult DoubleLE(const char* expr1, const char* expr2, namespace internal { -// The helper function for {ASSERT|EXPECT}_EQ with int or enum -// arguments. -AssertionResult CmpHelperEQ(const char* lhs_expression, - const char* rhs_expression, - BiggestInt lhs, - BiggestInt rhs) { - if (lhs == rhs) { - return AssertionSuccess(); - } - - return EqFailure(lhs_expression, - rhs_expression, - FormatForComparisonFailureMessage(lhs, rhs), - FormatForComparisonFailureMessage(rhs, lhs), - false); -} - -// A macro for implementing the helper functions needed to implement -// ASSERT_?? and EXPECT_?? with integer or enum arguments. It is here -// just to avoid copy-and-paste of similar code. -#define GTEST_IMPL_CMP_HELPER_(op_name, op)\ -AssertionResult CmpHelper##op_name(const char* expr1, const char* expr2, \ - BiggestInt val1, BiggestInt val2) {\ - if (val1 op val2) {\ - return AssertionSuccess();\ - } else {\ - return AssertionFailure() \ - << "Expected: (" << expr1 << ") " #op " (" << expr2\ - << "), actual: " << FormatForComparisonFailureMessage(val1, val2)\ - << " vs " << FormatForComparisonFailureMessage(val2, val1);\ - }\ -} - -// Implements the helper function for {ASSERT|EXPECT}_NE with int or -// enum arguments. -GTEST_IMPL_CMP_HELPER_(NE, !=) -// Implements the helper function for {ASSERT|EXPECT}_LE with int or -// enum arguments. -GTEST_IMPL_CMP_HELPER_(LE, <=) -// Implements the helper function for {ASSERT|EXPECT}_LT with int or -// enum arguments. -GTEST_IMPL_CMP_HELPER_(LT, < ) -// Implements the helper function for {ASSERT|EXPECT}_GE with int or -// enum arguments. -GTEST_IMPL_CMP_HELPER_(GE, >=) -// Implements the helper function for {ASSERT|EXPECT}_GT with int or -// enum arguments. -GTEST_IMPL_CMP_HELPER_(GT, > ) - -#undef GTEST_IMPL_CMP_HELPER_ - // The helper function for {ASSERT|EXPECT}_STREQ. AssertionResult CmpHelperSTREQ(const char* lhs_expression, const char* rhs_expression, @@ -2123,8 +2129,13 @@ bool String::EndsWithCaseInsensitive( // Formats an int value as "%02d". std::string String::FormatIntWidth2(int value) { + return FormatIntWidthN(value, 2); +} + +// Formats an int value to given width with leading zeros. +std::string String::FormatIntWidthN(int value, int width) { std::stringstream ss; - ss << std::setfill('0') << std::setw(2) << value; + ss << std::setfill('0') << std::setw(width) << value; return ss.str(); } @@ -2176,7 +2187,9 @@ std::string AppendUserMessage(const std::string& gtest_msg, if (user_msg_string.empty()) { return gtest_msg; } - + if (gtest_msg.empty()) { + return user_msg_string; + } return gtest_msg + "\n" + user_msg_string; } @@ -2228,7 +2241,7 @@ void TestResult::RecordProperty(const std::string& xml_element, if (!ValidateTestProperty(xml_element, test_property)) { return; } - internal::MutexLock lock(&test_properites_mutex_); + internal::MutexLock lock(&test_properties_mutex_); const std::vector::iterator property_with_matching_key = std::find_if(test_properties_.begin(), test_properties_.end(), internal::TestPropertyKeyIs(test_property.key())); @@ -2255,7 +2268,8 @@ static const char* const kReservedTestSuitesAttributes[] = { // The list of reserved attributes used in the element of XML // output. static const char* const kReservedTestSuiteAttributes[] = { - "disabled", "errors", "failures", "name", "tests", "time", "timestamp"}; + "disabled", "errors", "failures", "name", + "tests", "time", "timestamp", "skipped"}; // The list of reserved attributes used in the element of XML output. static const char* const kReservedTestCaseAttributes[] = { @@ -2268,7 +2282,7 @@ static const char* const kReservedOutputTestCaseAttributes[] = { "classname", "name", "status", "time", "type_param", "value_param", "file", "line", "result", "timestamp"}; -template +template std::vector ArrayAsVector(const char* const (&array)[kSize]) { return std::vector(array, array + kSize); } @@ -2712,6 +2726,7 @@ TestInfo::TestInfo(const std::string& a_test_suite_name, should_run_(false), is_disabled_(false), matches_filter_(false), + is_in_another_shard_(false), factory_(factory), result_() {} @@ -2725,7 +2740,7 @@ namespace internal { // // Arguments: // -// test_suite_name: name of the test suite +// test_suite_name: name of the test suite // name: name of the test // type_param: the name of the test's type parameter, or NULL if // this is not a typed or a type-parameterized test. @@ -2827,7 +2842,8 @@ void TestInfo::Run() { // Notifies the unit test event listeners that a test is about to start. repeater->OnTestStart(*this); - const TimeInMillis start = internal::GetTimeInMillis(); + result_.set_start_timestamp(internal::GetTimeInMillis()); + internal::Timer timer; impl->os_stack_trace_getter()->UponLeavingGTest(); @@ -2852,8 +2868,7 @@ void TestInfo::Run() { test, &Test::DeleteSelf_, "the test fixture's destructor"); } - result_.set_start_timestamp(start); - result_.set_elapsed_time(internal::GetTimeInMillis() - start); + result_.set_elapsed_time(timer.Elapsed()); // Notifies the unit test event listener that a test has just finished. repeater->OnTestEnd(*this); @@ -2863,6 +2878,28 @@ void TestInfo::Run() { impl->set_current_test_info(nullptr); } +// Skip and records a skipped test result for this object. +void TestInfo::Skip() { + if (!should_run_) return; + + internal::UnitTestImpl* const impl = internal::GetUnitTestImpl(); + impl->set_current_test_info(this); + + TestEventListener* repeater = UnitTest::GetInstance()->listeners().repeater(); + + // Notifies the unit test event listeners that a test is about to start. + repeater->OnTestStart(*this); + + const TestPartResult test_part_result = + TestPartResult(TestPartResult::kSkip, this->file(), this->line(), ""); + impl->GetTestPartResultReporterForCurrentThread()->ReportTestPartResult( + test_part_result); + + // Notifies the unit test event listener that a test has just finished. + repeater->OnTestEnd(*this); + impl->set_current_test_info(nullptr); +} + // class TestSuite // Gets the number of successful tests in this test suite. @@ -2909,7 +2946,7 @@ int TestSuite::total_test_count() const { // // Arguments: // -// name: name of the test suite +// a_name: name of the test suite // a_type_param: the name of the test suite's type parameter, or NULL if // this is not a typed or a type-parameterized test suite. // set_up_tc: pointer to the function that sets up the test suite @@ -2964,19 +3001,26 @@ void TestSuite::Run() { // Call both legacy and the new API repeater->OnTestSuiteStart(*this); // Legacy API is deprecated but still available -#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ repeater->OnTestCaseStart(*this); -#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ impl->os_stack_trace_getter()->UponLeavingGTest(); internal::HandleExceptionsInMethodIfSupported( this, &TestSuite::RunSetUpTestSuite, "SetUpTestSuite()"); start_timestamp_ = internal::GetTimeInMillis(); + internal::Timer timer; for (int i = 0; i < total_test_count(); i++) { GetMutableTestInfo(i)->Run(); + if (GTEST_FLAG(fail_fast) && GetMutableTestInfo(i)->result()->Failed()) { + for (int j = i + 1; j < total_test_count(); j++) { + GetMutableTestInfo(j)->Skip(); + } + break; + } } - elapsed_time_ = internal::GetTimeInMillis() - start_timestamp_; + elapsed_time_ = timer.Elapsed(); impl->os_stack_trace_getter()->UponLeavingGTest(); internal::HandleExceptionsInMethodIfSupported( @@ -2985,9 +3029,39 @@ void TestSuite::Run() { // Call both legacy and the new API repeater->OnTestSuiteEnd(*this); // Legacy API is deprecated but still available -#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ repeater->OnTestCaseEnd(*this); -#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + + impl->set_current_test_suite(nullptr); +} + +// Skips all tests under this TestSuite. +void TestSuite::Skip() { + if (!should_run_) return; + + internal::UnitTestImpl* const impl = internal::GetUnitTestImpl(); + impl->set_current_test_suite(this); + + TestEventListener* repeater = UnitTest::GetInstance()->listeners().repeater(); + + // Call both legacy and the new API + repeater->OnTestSuiteStart(*this); +// Legacy API is deprecated but still available +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + repeater->OnTestCaseStart(*this); +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + + for (int i = 0; i < total_test_count(); i++) { + GetMutableTestInfo(i)->Skip(); + } + + // Call both legacy and the new API + repeater->OnTestSuiteEnd(*this); + // Legacy API is deprecated but still available +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + repeater->OnTestCaseEnd(*this); +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ impl->set_current_test_suite(nullptr); } @@ -3039,7 +3113,7 @@ static std::string FormatTestSuiteCount(int test_suite_count) { static const char * TestPartResultTypeToString(TestPartResult::Type type) { switch (type) { case TestPartResult::kSkip: - return "Skipped"; + return "Skipped\n"; case TestPartResult::kSuccess: return "Success"; @@ -3056,6 +3130,9 @@ static const char * TestPartResultTypeToString(TestPartResult::Type type) { } namespace internal { +namespace { +enum class GTestColor { kDefault, kRed, kGreen, kYellow }; +} // namespace // Prints a TestPartResult to an std::string. static std::string PrintTestPartResultToString( @@ -3093,9 +3170,12 @@ static void PrintTestPartResult(const TestPartResult& test_part_result) { // Returns the character attribute for the given color. static WORD GetColorAttribute(GTestColor color) { switch (color) { - case COLOR_RED: return FOREGROUND_RED; - case COLOR_GREEN: return FOREGROUND_GREEN; - case COLOR_YELLOW: return FOREGROUND_RED | FOREGROUND_GREEN; + case GTestColor::kRed: + return FOREGROUND_RED; + case GTestColor::kGreen: + return FOREGROUND_GREEN; + case GTestColor::kYellow: + return FOREGROUND_RED | FOREGROUND_GREEN; default: return 0; } } @@ -3133,13 +3213,16 @@ static WORD GetNewColor(GTestColor color, WORD old_color_attrs) { #else -// Returns the ANSI color code for the given color. COLOR_DEFAULT is +// Returns the ANSI color code for the given color. GTestColor::kDefault is // an invalid input. static const char* GetAnsiColorCode(GTestColor color) { switch (color) { - case COLOR_RED: return "1"; - case COLOR_GREEN: return "2"; - case COLOR_YELLOW: return "3"; + case GTestColor::kRed: + return "1"; + case GTestColor::kGreen: + return "2"; + case GTestColor::kYellow: + return "3"; default: return nullptr; } @@ -3188,7 +3271,9 @@ bool ShouldUseColor(bool stdout_is_tty) { // cannot simply emit special characters and have the terminal change colors. // This routine must actually emit the characters rather than return a string // that would be colored when printed, as can be done on Linux. -void ColoredPrintf(GTestColor color, const char* fmt, ...) { + +GTEST_ATTRIBUTE_PRINTF_(2, 3) +static void ColoredPrintf(GTestColor color, const char *fmt, ...) { va_list args; va_start(args, fmt); @@ -3198,7 +3283,7 @@ void ColoredPrintf(GTestColor color, const char* fmt, ...) { #else static const bool in_color_mode = ShouldUseColor(posix::IsATTY(posix::FileNo(stdout)) != 0); - const bool use_color = in_color_mode && (color != COLOR_DEFAULT); + const bool use_color = in_color_mode && (color != GTestColor::kDefault); #endif // GTEST_OS_WINDOWS_MOBILE || GTEST_OS_ZOS if (!use_color) { @@ -3310,25 +3395,24 @@ void PrettyUnitTestResultPrinter::OnTestIterationStart( // Prints the filter if it's not *. This reminds the user that some // tests may be skipped. if (!String::CStringEquals(filter, kUniversalFilter)) { - ColoredPrintf(COLOR_YELLOW, - "Note: %s filter = %s\n", GTEST_NAME_, filter); + ColoredPrintf(GTestColor::kYellow, "Note: %s filter = %s\n", GTEST_NAME_, + filter); } if (internal::ShouldShard(kTestTotalShards, kTestShardIndex, false)) { const int32_t shard_index = Int32FromEnvOrDie(kTestShardIndex, -1); - ColoredPrintf(COLOR_YELLOW, - "Note: This is test shard %d of %s.\n", + ColoredPrintf(GTestColor::kYellow, "Note: This is test shard %d of %s.\n", static_cast(shard_index) + 1, internal::posix::GetEnv(kTestTotalShards)); } if (GTEST_FLAG(shuffle)) { - ColoredPrintf(COLOR_YELLOW, + ColoredPrintf(GTestColor::kYellow, "Note: Randomizing tests' orders with a seed of %d .\n", unit_test.random_seed()); } - ColoredPrintf(COLOR_GREEN, "[==========] "); + ColoredPrintf(GTestColor::kGreen, "[==========] "); printf("Running %s from %s.\n", FormatTestCount(unit_test.test_to_run_count()).c_str(), FormatTestSuiteCount(unit_test.test_suite_to_run_count()).c_str()); @@ -3337,7 +3421,7 @@ void PrettyUnitTestResultPrinter::OnTestIterationStart( void PrettyUnitTestResultPrinter::OnEnvironmentsSetUpStart( const UnitTest& /*unit_test*/) { - ColoredPrintf(COLOR_GREEN, "[----------] "); + ColoredPrintf(GTestColor::kGreen, "[----------] "); printf("Global test environment set-up.\n"); fflush(stdout); } @@ -3346,7 +3430,7 @@ void PrettyUnitTestResultPrinter::OnEnvironmentsSetUpStart( void PrettyUnitTestResultPrinter::OnTestCaseStart(const TestCase& test_case) { const std::string counts = FormatCountableNoun(test_case.test_to_run_count(), "test", "tests"); - ColoredPrintf(COLOR_GREEN, "[----------] "); + ColoredPrintf(GTestColor::kGreen, "[----------] "); printf("%s from %s", counts.c_str(), test_case.name()); if (test_case.type_param() == nullptr) { printf("\n"); @@ -3360,7 +3444,7 @@ void PrettyUnitTestResultPrinter::OnTestSuiteStart( const TestSuite& test_suite) { const std::string counts = FormatCountableNoun(test_suite.test_to_run_count(), "test", "tests"); - ColoredPrintf(COLOR_GREEN, "[----------] "); + ColoredPrintf(GTestColor::kGreen, "[----------] "); printf("%s from %s", counts.c_str(), test_suite.name()); if (test_suite.type_param() == nullptr) { printf("\n"); @@ -3372,7 +3456,7 @@ void PrettyUnitTestResultPrinter::OnTestSuiteStart( #endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ void PrettyUnitTestResultPrinter::OnTestStart(const TestInfo& test_info) { - ColoredPrintf(COLOR_GREEN, "[ RUN ] "); + ColoredPrintf(GTestColor::kGreen, "[ RUN ] "); PrintTestName(test_info.test_suite_name(), test_info.name()); printf("\n"); fflush(stdout); @@ -3395,11 +3479,11 @@ void PrettyUnitTestResultPrinter::OnTestPartResult( void PrettyUnitTestResultPrinter::OnTestEnd(const TestInfo& test_info) { if (test_info.result()->Passed()) { - ColoredPrintf(COLOR_GREEN, "[ OK ] "); + ColoredPrintf(GTestColor::kGreen, "[ OK ] "); } else if (test_info.result()->Skipped()) { - ColoredPrintf(COLOR_GREEN, "[ SKIPPED ] "); + ColoredPrintf(GTestColor::kGreen, "[ SKIPPED ] "); } else { - ColoredPrintf(COLOR_RED, "[ FAILED ] "); + ColoredPrintf(GTestColor::kRed, "[ FAILED ] "); } PrintTestName(test_info.test_suite_name(), test_info.name()); if (test_info.result()->Failed()) @@ -3420,7 +3504,7 @@ void PrettyUnitTestResultPrinter::OnTestCaseEnd(const TestCase& test_case) { const std::string counts = FormatCountableNoun(test_case.test_to_run_count(), "test", "tests"); - ColoredPrintf(COLOR_GREEN, "[----------] "); + ColoredPrintf(GTestColor::kGreen, "[----------] "); printf("%s from %s (%s ms total)\n\n", counts.c_str(), test_case.name(), internal::StreamableToString(test_case.elapsed_time()).c_str()); fflush(stdout); @@ -3431,7 +3515,7 @@ void PrettyUnitTestResultPrinter::OnTestSuiteEnd(const TestSuite& test_suite) { const std::string counts = FormatCountableNoun(test_suite.test_to_run_count(), "test", "tests"); - ColoredPrintf(COLOR_GREEN, "[----------] "); + ColoredPrintf(GTestColor::kGreen, "[----------] "); printf("%s from %s (%s ms total)\n\n", counts.c_str(), test_suite.name(), internal::StreamableToString(test_suite.elapsed_time()).c_str()); fflush(stdout); @@ -3440,7 +3524,7 @@ void PrettyUnitTestResultPrinter::OnTestSuiteEnd(const TestSuite& test_suite) { void PrettyUnitTestResultPrinter::OnEnvironmentsTearDownStart( const UnitTest& /*unit_test*/) { - ColoredPrintf(COLOR_GREEN, "[----------] "); + ColoredPrintf(GTestColor::kGreen, "[----------] "); printf("Global test environment tear-down\n"); fflush(stdout); } @@ -3448,7 +3532,7 @@ void PrettyUnitTestResultPrinter::OnEnvironmentsTearDownStart( // Internal helper for printing the list of failed tests. void PrettyUnitTestResultPrinter::PrintFailedTests(const UnitTest& unit_test) { const int failed_test_count = unit_test.failed_test_count(); - ColoredPrintf(COLOR_RED, "[ FAILED ] "); + ColoredPrintf(GTestColor::kRed, "[ FAILED ] "); printf("%s, listed below:\n", FormatTestCount(failed_test_count).c_str()); for (int i = 0; i < unit_test.total_test_suite_count(); ++i) { @@ -3461,7 +3545,7 @@ void PrettyUnitTestResultPrinter::PrintFailedTests(const UnitTest& unit_test) { if (!test_info.should_run() || !test_info.result()->Failed()) { continue; } - ColoredPrintf(COLOR_RED, "[ FAILED ] "); + ColoredPrintf(GTestColor::kRed, "[ FAILED ] "); printf("%s.%s", test_suite.name(), test_info.name()); PrintFullTestCommentIfPresent(test_info); printf("\n"); @@ -3482,7 +3566,7 @@ void PrettyUnitTestResultPrinter::PrintFailedTestSuites( continue; } if (test_suite.ad_hoc_test_result().Failed()) { - ColoredPrintf(COLOR_RED, "[ FAILED ] "); + ColoredPrintf(GTestColor::kRed, "[ FAILED ] "); printf("%s: SetUpTestSuite or TearDownTestSuite\n", test_suite.name()); ++suite_failure_count; } @@ -3510,7 +3594,7 @@ void PrettyUnitTestResultPrinter::PrintSkippedTests(const UnitTest& unit_test) { if (!test_info.should_run() || !test_info.result()->Skipped()) { continue; } - ColoredPrintf(COLOR_GREEN, "[ SKIPPED ] "); + ColoredPrintf(GTestColor::kGreen, "[ SKIPPED ] "); printf("%s.%s", test_suite.name(), test_info.name()); printf("\n"); } @@ -3519,7 +3603,7 @@ void PrettyUnitTestResultPrinter::PrintSkippedTests(const UnitTest& unit_test) { void PrettyUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test, int /*iteration*/) { - ColoredPrintf(COLOR_GREEN, "[==========] "); + ColoredPrintf(GTestColor::kGreen, "[==========] "); printf("%s from %s ran.", FormatTestCount(unit_test.test_to_run_count()).c_str(), FormatTestSuiteCount(unit_test.test_suite_to_run_count()).c_str()); @@ -3528,12 +3612,12 @@ void PrettyUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test, internal::StreamableToString(unit_test.elapsed_time()).c_str()); } printf("\n"); - ColoredPrintf(COLOR_GREEN, "[ PASSED ] "); + ColoredPrintf(GTestColor::kGreen, "[ PASSED ] "); printf("%s.\n", FormatTestCount(unit_test.successful_test_count()).c_str()); const int skipped_test_count = unit_test.skipped_test_count(); if (skipped_test_count > 0) { - ColoredPrintf(COLOR_GREEN, "[ SKIPPED ] "); + ColoredPrintf(GTestColor::kGreen, "[ SKIPPED ] "); printf("%s, listed below:\n", FormatTestCount(skipped_test_count).c_str()); PrintSkippedTests(unit_test); } @@ -3548,10 +3632,8 @@ void PrettyUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test, if (unit_test.Passed()) { printf("\n"); // Add a spacer if no FAILURE banner is displayed. } - ColoredPrintf(COLOR_YELLOW, - " YOU HAVE %d DISABLED %s\n\n", - num_disabled, - num_disabled == 1 ? "TEST" : "TESTS"); + ColoredPrintf(GTestColor::kYellow, " YOU HAVE %d DISABLED %s\n\n", + num_disabled, num_disabled == 1 ? "TEST" : "TESTS"); } // Ensure that Google Test output is printed before, e.g., heapchecker output. fflush(stdout); @@ -3559,6 +3641,110 @@ void PrettyUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test, // End PrettyUnitTestResultPrinter +// This class implements the TestEventListener interface. +// +// Class BriefUnitTestResultPrinter is copyable. +class BriefUnitTestResultPrinter : public TestEventListener { + public: + BriefUnitTestResultPrinter() {} + static void PrintTestName(const char* test_suite, const char* test) { + printf("%s.%s", test_suite, test); + } + + // The following methods override what's in the TestEventListener class. + void OnTestProgramStart(const UnitTest& /*unit_test*/) override {} + void OnTestIterationStart(const UnitTest& /*unit_test*/, + int /*iteration*/) override {} + void OnEnvironmentsSetUpStart(const UnitTest& /*unit_test*/) override {} + void OnEnvironmentsSetUpEnd(const UnitTest& /*unit_test*/) override {} +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + void OnTestCaseStart(const TestCase& /*test_case*/) override {} +#else + void OnTestSuiteStart(const TestSuite& /*test_suite*/) override {} +#endif // OnTestCaseStart + + void OnTestStart(const TestInfo& /*test_info*/) override {} + + void OnTestPartResult(const TestPartResult& result) override; + void OnTestEnd(const TestInfo& test_info) override; +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + void OnTestCaseEnd(const TestCase& /*test_case*/) override {} +#else + void OnTestSuiteEnd(const TestSuite& /*test_suite*/) override {} +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + + void OnEnvironmentsTearDownStart(const UnitTest& /*unit_test*/) override {} + void OnEnvironmentsTearDownEnd(const UnitTest& /*unit_test*/) override {} + void OnTestIterationEnd(const UnitTest& unit_test, int iteration) override; + void OnTestProgramEnd(const UnitTest& /*unit_test*/) override {} +}; + +// Called after an assertion failure. +void BriefUnitTestResultPrinter::OnTestPartResult( + const TestPartResult& result) { + switch (result.type()) { + // If the test part succeeded, we don't need to do anything. + case TestPartResult::kSuccess: + return; + default: + // Print failure message from the assertion + // (e.g. expected this and got that). + PrintTestPartResult(result); + fflush(stdout); + } +} + +void BriefUnitTestResultPrinter::OnTestEnd(const TestInfo& test_info) { + if (test_info.result()->Failed()) { + ColoredPrintf(GTestColor::kRed, "[ FAILED ] "); + PrintTestName(test_info.test_suite_name(), test_info.name()); + PrintFullTestCommentIfPresent(test_info); + + if (GTEST_FLAG(print_time)) { + printf(" (%s ms)\n", + internal::StreamableToString(test_info.result()->elapsed_time()) + .c_str()); + } else { + printf("\n"); + } + fflush(stdout); + } +} + +void BriefUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test, + int /*iteration*/) { + ColoredPrintf(GTestColor::kGreen, "[==========] "); + printf("%s from %s ran.", + FormatTestCount(unit_test.test_to_run_count()).c_str(), + FormatTestSuiteCount(unit_test.test_suite_to_run_count()).c_str()); + if (GTEST_FLAG(print_time)) { + printf(" (%s ms total)", + internal::StreamableToString(unit_test.elapsed_time()).c_str()); + } + printf("\n"); + ColoredPrintf(GTestColor::kGreen, "[ PASSED ] "); + printf("%s.\n", FormatTestCount(unit_test.successful_test_count()).c_str()); + + const int skipped_test_count = unit_test.skipped_test_count(); + if (skipped_test_count > 0) { + ColoredPrintf(GTestColor::kGreen, "[ SKIPPED ] "); + printf("%s.\n", FormatTestCount(skipped_test_count).c_str()); + } + + int num_disabled = unit_test.reportable_disabled_test_count(); + if (num_disabled && !GTEST_FLAG(also_run_disabled_tests)) { + if (unit_test.Passed()) { + printf("\n"); // Add a spacer if no FAILURE banner is displayed. + } + ColoredPrintf(GTestColor::kYellow, " YOU HAVE %d DISABLED %s\n\n", + num_disabled, num_disabled == 1 ? "TEST" : "TESTS"); + } + // Ensure that Google Test output is printed before, e.g., heapchecker output. + fflush(stdout); +} + +// End BriefUnitTestResultPrinter + // class TestEventRepeater // // This class forwards events to other event listeners. @@ -3742,6 +3928,16 @@ class XmlUnitTestResultPrinter : public EmptyTestEventListener { // Streams an XML CDATA section, escaping invalid CDATA sequences as needed. static void OutputXmlCDataSection(::std::ostream* stream, const char* data); + // Streams a test suite XML stanza containing the given test result. + // + // Requires: result.Failed() + static void OutputXmlTestSuiteForTestResult(::std::ostream* stream, + const TestResult& result); + + // Streams an XML representation of a TestResult object. + static void OutputXmlTestResult(::std::ostream* stream, + const TestResult& result); + // Streams an XML representation of a TestInfo object. static void OutputXmlTestInfo(::std::ostream* stream, const char* test_suite_name, @@ -3900,6 +4096,10 @@ static bool PortableLocaltime(time_t seconds, struct tm* out) { if (tm_ptr == nullptr) return false; *out = *tm_ptr; return true; +#elif defined(__STDC_LIB_EXT1__) + // Uses localtime_s when available as localtime_r is only available from + // C23 standard. + return localtime_s(&seconds, out) != nullptr; #else return localtime_r(&seconds, out) != nullptr; #endif @@ -3911,13 +4111,14 @@ std::string FormatEpochTimeInMillisAsIso8601(TimeInMillis ms) { struct tm time_struct; if (!PortableLocaltime(static_cast(ms / 1000), &time_struct)) return ""; - // YYYY-MM-DDThh:mm:ss + // YYYY-MM-DDThh:mm:ss.sss return StreamableToString(time_struct.tm_year + 1900) + "-" + String::FormatIntWidth2(time_struct.tm_mon + 1) + "-" + String::FormatIntWidth2(time_struct.tm_mday) + "T" + String::FormatIntWidth2(time_struct.tm_hour) + ":" + String::FormatIntWidth2(time_struct.tm_min) + ":" + - String::FormatIntWidth2(time_struct.tm_sec); + String::FormatIntWidth2(time_struct.tm_sec) + "." + + String::FormatIntWidthN(static_cast(ms % 1000), 3); } // Streams an XML CDATA section, escaping invalid CDATA sequences as needed. @@ -3956,6 +4157,43 @@ void XmlUnitTestResultPrinter::OutputXmlAttribute( *stream << " " << name << "=\"" << EscapeXmlAttribute(value) << "\""; } +// Streams a test suite XML stanza containing the given test result. +void XmlUnitTestResultPrinter::OutputXmlTestSuiteForTestResult( + ::std::ostream* stream, const TestResult& result) { + // Output the boilerplate for a minimal test suite with one test. + *stream << " "; + + // Output the boilerplate for a minimal test case with a single test. + *stream << " \n"; +} + // Prints an XML representation of a TestInfo object. void XmlUnitTestResultPrinter::OutputXmlTestInfo(::std::ostream* stream, const char* test_suite_name, @@ -3999,11 +4237,17 @@ void XmlUnitTestResultPrinter::OutputXmlTestInfo(::std::ostream* stream, FormatEpochTimeInMillisAsIso8601(result.start_timestamp())); OutputXmlAttribute(stream, kTestsuite, "classname", test_suite_name); + OutputXmlTestResult(stream, result); +} + +void XmlUnitTestResultPrinter::OutputXmlTestResult(::std::ostream* stream, + const TestResult& result) { int failures = 0; + int skips = 0; for (int i = 0; i < result.total_part_count(); ++i) { const TestPartResult& part = result.GetTestPartResult(i); if (part.failed()) { - if (++failures == 1) { + if (++failures == 1 && skips == 0) { *stream << ">\n"; } const std::string location = @@ -4011,18 +4255,31 @@ void XmlUnitTestResultPrinter::OutputXmlTestInfo(::std::ostream* stream, part.line_number()); const std::string summary = location + "\n" + part.summary(); *stream << " "; const std::string detail = location + "\n" + part.message(); OutputXmlCDataSection(stream, RemoveInvalidXmlCharacters(detail).c_str()); *stream << "\n"; + } else if (part.skipped()) { + if (++skips == 1 && failures == 0) { + *stream << ">\n"; + } + const std::string location = + internal::FormatCompilerIndependentFileLocation(part.file_name(), + part.line_number()); + const std::string summary = location + "\n" + part.summary(); + *stream << " "; + const std::string detail = location + "\n" + part.message(); + OutputXmlCDataSection(stream, RemoveInvalidXmlCharacters(detail).c_str()); + *stream << "\n"; } } - if (failures == 0 && result.test_property_count() == 0) { + if (failures == 0 && skips == 0 && result.test_property_count() == 0) { *stream << " />\n"; } else { - if (failures == 0) { + if (failures == 0 && skips == 0) { *stream << ">\n"; } OutputXmlTestProperties(stream, result); @@ -4044,7 +4301,11 @@ void XmlUnitTestResultPrinter::PrintXmlTestSuite(std::ostream* stream, OutputXmlAttribute( stream, kTestsuite, "disabled", StreamableToString(test_suite.reportable_disabled_test_count())); + OutputXmlAttribute(stream, kTestsuite, "skipped", + StreamableToString(test_suite.skipped_test_count())); + OutputXmlAttribute(stream, kTestsuite, "errors", "0"); + OutputXmlAttribute(stream, kTestsuite, "time", FormatTimeInMillisAsSeconds(test_suite.elapsed_time())); OutputXmlAttribute( @@ -4095,6 +4356,13 @@ void XmlUnitTestResultPrinter::PrintXmlUnitTest(std::ostream* stream, if (unit_test.GetTestSuite(i)->reportable_test_count() > 0) PrintXmlTestSuite(stream, *unit_test.GetTestSuite(i)); } + + // If there was a test failure outside of one of the test suites (like in a + // test environment) include that in the output. + if (unit_test.ad_hoc_test_result().Failed()) { + OutputXmlTestSuiteForTestResult(stream, unit_test.ad_hoc_test_result()); + } + *stream << "\n"; } @@ -4185,6 +4453,16 @@ class JsonUnitTestResultPrinter : public EmptyTestEventListener { const std::string& indent, bool comma = true); + // Streams a test suite JSON stanza containing the given test result. + // + // Requires: result.Failed() + static void OutputJsonTestSuiteForTestResult(::std::ostream* stream, + const TestResult& result); + + // Streams a JSON representation of a TestResult object. + static void OutputJsonTestResult(::std::ostream* stream, + const TestResult& result); + // Streams a JSON representation of a TestInfo object. static void OutputJsonTestInfo(::std::ostream* stream, const char* test_suite_name, @@ -4335,6 +4613,48 @@ void JsonUnitTestResultPrinter::OutputJsonKey( *stream << ",\n"; } +// Streams a test suite JSON stanza containing the given test result. +void JsonUnitTestResultPrinter::OutputJsonTestSuiteForTestResult( + ::std::ostream* stream, const TestResult& result) { + // Output the boilerplate for a new test suite. + *stream << Indent(4) << "{\n"; + OutputJsonKey(stream, "testsuite", "name", "NonTestSuiteFailure", Indent(6)); + OutputJsonKey(stream, "testsuite", "tests", 1, Indent(6)); + if (!GTEST_FLAG(list_tests)) { + OutputJsonKey(stream, "testsuite", "failures", 1, Indent(6)); + OutputJsonKey(stream, "testsuite", "disabled", 0, Indent(6)); + OutputJsonKey(stream, "testsuite", "skipped", 0, Indent(6)); + OutputJsonKey(stream, "testsuite", "errors", 0, Indent(6)); + OutputJsonKey(stream, "testsuite", "time", + FormatTimeInMillisAsDuration(result.elapsed_time()), + Indent(6)); + OutputJsonKey(stream, "testsuite", "timestamp", + FormatEpochTimeInMillisAsRFC3339(result.start_timestamp()), + Indent(6)); + } + *stream << Indent(6) << "\"testsuite\": [\n"; + + // Output the boilerplate for a new test case. + *stream << Indent(8) << "{\n"; + OutputJsonKey(stream, "testcase", "name", "", Indent(10)); + OutputJsonKey(stream, "testcase", "status", "RUN", Indent(10)); + OutputJsonKey(stream, "testcase", "result", "COMPLETED", Indent(10)); + OutputJsonKey(stream, "testcase", "timestamp", + FormatEpochTimeInMillisAsRFC3339(result.start_timestamp()), + Indent(10)); + OutputJsonKey(stream, "testcase", "time", + FormatTimeInMillisAsDuration(result.elapsed_time()), + Indent(10)); + OutputJsonKey(stream, "testcase", "classname", "", Indent(10), false); + *stream << TestPropertiesAsJson(result, Indent(10)); + + // Output the actual test result. + OutputJsonTestResult(stream, result); + + // Finish the test suite. + *stream << "\n" << Indent(6) << "]\n" << Indent(4) << "}"; +} + // Prints a JSON representation of a TestInfo object. void JsonUnitTestResultPrinter::OutputJsonTestInfo(::std::ostream* stream, const char* test_suite_name, @@ -4377,6 +4697,13 @@ void JsonUnitTestResultPrinter::OutputJsonTestInfo(::std::ostream* stream, false); *stream << TestPropertiesAsJson(result, kIndent); + OutputJsonTestResult(stream, result); +} + +void JsonUnitTestResultPrinter::OutputJsonTestResult(::std::ostream* stream, + const TestResult& result) { + const std::string kIndent = Indent(10); + int failures = 0; for (int i = 0; i < result.total_part_count(); ++i) { const TestPartResult& part = result.GetTestPartResult(i); @@ -4487,6 +4814,12 @@ void JsonUnitTestResultPrinter::PrintJsonUnitTest(std::ostream* stream, } } + // If there was a test failure outside of one of the test suites (like in a + // test environment) include that in the output. + if (unit_test.ad_hoc_test_result().Failed()) { + OutputJsonTestSuiteForTestResult(stream, unit_test.ad_hoc_test_result()); + } + *stream << "\n" << kIndent << "]\n" << "}\n"; } @@ -5309,6 +5642,10 @@ void UnitTestImpl::PostFlagParsingInit() { // to shut down the default XML output before invoking RUN_ALL_TESTS. ConfigureXmlOutput(); + if (GTEST_FLAG(brief)) { + listeners()->SetDefaultResultPrinter(new BriefUnitTestResultPrinter); + } + #if GTEST_CAN_STREAM_RESULTS_ // Configures listeners for streaming test results to the specified server. ConfigureStreamingOutput(); @@ -5354,10 +5691,10 @@ class TestSuiteNameIs { // Arguments: // // test_suite_name: name of the test suite -// type_param: the name of the test suite's type parameter, or NULL if -// this is not a typed or a type-parameterized test suite. -// set_up_tc: pointer to the function that sets up the test suite -// tear_down_tc: pointer to the function that tears down the test suite +// type_param: the name of the test suite's type parameter, or NULL if +// this is not a typed or a type-parameterized test suite. +// set_up_tc: pointer to the function that sets up the test suite +// tear_down_tc: pointer to the function that tears down the test suite TestSuite* UnitTestImpl::GetTestSuite( const char* test_suite_name, const char* type_param, internal::SetUpTestSuiteFunc set_up_tc, @@ -5475,7 +5812,7 @@ bool UnitTestImpl::RunAllTests() { // assertions executed before RUN_ALL_TESTS(). ClearNonAdHocTestResult(); - const TimeInMillis start = GetTimeInMillis(); + Timer timer; // Shuffles test suites and tests if requested. if (has_tests_to_run && GTEST_FLAG(shuffle)) { @@ -5516,6 +5853,21 @@ bool UnitTestImpl::RunAllTests() { for (int test_index = 0; test_index < total_test_suite_count(); test_index++) { GetMutableSuiteCase(test_index)->Run(); + if (GTEST_FLAG(fail_fast) && + GetMutableSuiteCase(test_index)->Failed()) { + for (int j = test_index + 1; j < total_test_suite_count(); j++) { + GetMutableSuiteCase(j)->Skip(); + } + break; + } + } + } else if (Test::HasFatalFailure()) { + // If there was a fatal failure during the global setup then we know we + // aren't going to run any tests. Explicitly mark all of the tests as + // skipped to make this obvious in the output. + for (int test_index = 0; test_index < total_test_suite_count(); + test_index++) { + GetMutableSuiteCase(test_index)->Skip(); } } @@ -5526,7 +5878,7 @@ bool UnitTestImpl::RunAllTests() { repeater->OnEnvironmentsTearDownEnd(*parent_); } - elapsed_time_ = GetTimeInMillis() - start; + elapsed_time_ = timer.Elapsed(); // Tells the unit test event listener that the tests have just finished. repeater->OnTestIterationEnd(*parent_, i); @@ -5554,14 +5906,14 @@ bool UnitTestImpl::RunAllTests() { if (!gtest_is_initialized_before_run_all_tests) { ColoredPrintf( - COLOR_RED, + GTestColor::kRed, "\nIMPORTANT NOTICE - DO NOT IGNORE:\n" "This test program did NOT call " GTEST_INIT_GOOGLE_TEST_NAME_ "() before calling RUN_ALL_TESTS(). This is INVALID. Soon " GTEST_NAME_ " will start to enforce the valid usage. " "Please fix it ASAP, or IT WILL START TO FAIL.\n"); // NOLINT #if GTEST_FOR_GOOGLE_ - ColoredPrintf(COLOR_RED, + ColoredPrintf(GTestColor::kRed, "For more details, see http://wiki/Main/ValidGUnitMain.\n"); #endif // GTEST_FOR_GOOGLE_ } @@ -5578,7 +5930,7 @@ void WriteToShardStatusFileIfNeeded() { if (test_shard_file != nullptr) { FILE* const file = posix::FOpen(test_shard_file, "w"); if (file == nullptr) { - ColoredPrintf(COLOR_RED, + ColoredPrintf(GTestColor::kRed, "Could not write to the test shard status file \"%s\" " "specified by the %s environment variable.\n", test_shard_file, kTestShardStatusFile); @@ -5612,7 +5964,7 @@ bool ShouldShard(const char* total_shards_env, << "Invalid environment variables: you have " << kTestShardIndex << " = " << shard_index << ", but have left " << kTestTotalShards << " unset.\n"; - ColoredPrintf(COLOR_RED, "%s", msg.GetString().c_str()); + ColoredPrintf(GTestColor::kRed, "%s", msg.GetString().c_str()); fflush(stdout); exit(EXIT_FAILURE); } else if (total_shards != -1 && shard_index == -1) { @@ -5620,7 +5972,7 @@ bool ShouldShard(const char* total_shards_env, << "Invalid environment variables: you have " << kTestTotalShards << " = " << total_shards << ", but have left " << kTestShardIndex << " unset.\n"; - ColoredPrintf(COLOR_RED, "%s", msg.GetString().c_str()); + ColoredPrintf(GTestColor::kRed, "%s", msg.GetString().c_str()); fflush(stdout); exit(EXIT_FAILURE); } else if (shard_index < 0 || shard_index >= total_shards) { @@ -5629,7 +5981,7 @@ bool ShouldShard(const char* total_shards_env, << kTestShardIndex << " < " << kTestTotalShards << ", but you have " << kTestShardIndex << "=" << shard_index << ", " << kTestTotalShards << "=" << total_shards << ".\n"; - ColoredPrintf(COLOR_RED, "%s", msg.GetString().c_str()); + ColoredPrintf(GTestColor::kRed, "%s", msg.GetString().c_str()); fflush(stdout); exit(EXIT_FAILURE); } @@ -6019,7 +6371,7 @@ static bool HasGoogleTestFlagPrefix(const char* str) { // @D changes to the default terminal text color. // static void PrintColorEncoded(const char* str) { - GTestColor color = COLOR_DEFAULT; // The current color. + GTestColor color = GTestColor::kDefault; // The current color. // Conceptually, we split the string into segments divided by escape // sequences. Then we print one segment at a time. At the end of @@ -6039,13 +6391,13 @@ static void PrintColorEncoded(const char* str) { if (ch == '@') { ColoredPrintf(color, "@"); } else if (ch == 'D') { - color = COLOR_DEFAULT; + color = GTestColor::kDefault; } else if (ch == 'R') { - color = COLOR_RED; + color = GTestColor::kRed; } else if (ch == 'G') { - color = COLOR_GREEN; + color = GTestColor::kGreen; } else if (ch == 'Y') { - color = COLOR_YELLOW; + color = GTestColor::kYellow; } else { --str; } @@ -6053,98 +6405,126 @@ static void PrintColorEncoded(const char* str) { } static const char kColorEncodedHelpMessage[] = -"This program contains tests written using " GTEST_NAME_ ". You can use the\n" -"following command line flags to control its behavior:\n" -"\n" -"Test Selection:\n" -" @G--" GTEST_FLAG_PREFIX_ "list_tests@D\n" -" List the names of all tests instead of running them. The name of\n" -" TEST(Foo, Bar) is \"Foo.Bar\".\n" -" @G--" GTEST_FLAG_PREFIX_ "filter=@YPOSTIVE_PATTERNS" + "This program contains tests written using " GTEST_NAME_ + ". You can use the\n" + "following command line flags to control its behavior:\n" + "\n" + "Test Selection:\n" + " @G--" GTEST_FLAG_PREFIX_ + "list_tests@D\n" + " List the names of all tests instead of running them. The name of\n" + " TEST(Foo, Bar) is \"Foo.Bar\".\n" + " @G--" GTEST_FLAG_PREFIX_ + "filter=@YPOSITIVE_PATTERNS" "[@G-@YNEGATIVE_PATTERNS]@D\n" -" Run only the tests whose name matches one of the positive patterns but\n" -" none of the negative patterns. '?' matches any single character; '*'\n" -" matches any substring; ':' separates two patterns.\n" -" @G--" GTEST_FLAG_PREFIX_ "also_run_disabled_tests@D\n" -" Run all disabled tests too.\n" -"\n" -"Test Execution:\n" -" @G--" GTEST_FLAG_PREFIX_ "repeat=@Y[COUNT]@D\n" -" Run the tests repeatedly; use a negative count to repeat forever.\n" -" @G--" GTEST_FLAG_PREFIX_ "shuffle@D\n" -" Randomize tests' orders on every iteration.\n" -" @G--" GTEST_FLAG_PREFIX_ "random_seed=@Y[NUMBER]@D\n" -" Random number seed to use for shuffling test orders (between 1 and\n" -" 99999, or 0 to use a seed based on the current time).\n" -"\n" -"Test Output:\n" -" @G--" GTEST_FLAG_PREFIX_ "color=@Y(@Gyes@Y|@Gno@Y|@Gauto@Y)@D\n" -" Enable/disable colored output. The default is @Gauto@D.\n" -" -@G-" GTEST_FLAG_PREFIX_ "print_time=0@D\n" -" Don't print the elapsed time of each test.\n" -" @G--" GTEST_FLAG_PREFIX_ "output=@Y(@Gjson@Y|@Gxml@Y)[@G:@YDIRECTORY_PATH@G" - GTEST_PATH_SEP_ "@Y|@G:@YFILE_PATH]@D\n" -" Generate a JSON or XML report in the given directory or with the given\n" -" file name. @YFILE_PATH@D defaults to @Gtest_detail.xml@D.\n" + " Run only the tests whose name matches one of the positive patterns " + "but\n" + " none of the negative patterns. '?' matches any single character; " + "'*'\n" + " matches any substring; ':' separates two patterns.\n" + " @G--" GTEST_FLAG_PREFIX_ + "also_run_disabled_tests@D\n" + " Run all disabled tests too.\n" + "\n" + "Test Execution:\n" + " @G--" GTEST_FLAG_PREFIX_ + "repeat=@Y[COUNT]@D\n" + " Run the tests repeatedly; use a negative count to repeat forever.\n" + " @G--" GTEST_FLAG_PREFIX_ + "shuffle@D\n" + " Randomize tests' orders on every iteration.\n" + " @G--" GTEST_FLAG_PREFIX_ + "random_seed=@Y[NUMBER]@D\n" + " Random number seed to use for shuffling test orders (between 1 and\n" + " 99999, or 0 to use a seed based on the current time).\n" + "\n" + "Test Output:\n" + " @G--" GTEST_FLAG_PREFIX_ + "color=@Y(@Gyes@Y|@Gno@Y|@Gauto@Y)@D\n" + " Enable/disable colored output. The default is @Gauto@D.\n" + " @G--" GTEST_FLAG_PREFIX_ + "brief=1@D\n" + " Only print test failures.\n" + " @G--" GTEST_FLAG_PREFIX_ + "print_time=0@D\n" + " Don't print the elapsed time of each test.\n" + " @G--" GTEST_FLAG_PREFIX_ + "output=@Y(@Gjson@Y|@Gxml@Y)[@G:@YDIRECTORY_PATH@G" GTEST_PATH_SEP_ + "@Y|@G:@YFILE_PATH]@D\n" + " Generate a JSON or XML report in the given directory or with the " + "given\n" + " file name. @YFILE_PATH@D defaults to @Gtest_detail.xml@D.\n" # if GTEST_CAN_STREAM_RESULTS_ -" @G--" GTEST_FLAG_PREFIX_ "stream_result_to=@YHOST@G:@YPORT@D\n" -" Stream test results to the given server.\n" + " @G--" GTEST_FLAG_PREFIX_ + "stream_result_to=@YHOST@G:@YPORT@D\n" + " Stream test results to the given server.\n" # endif // GTEST_CAN_STREAM_RESULTS_ -"\n" -"Assertion Behavior:\n" + "\n" + "Assertion Behavior:\n" # if GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS -" @G--" GTEST_FLAG_PREFIX_ "death_test_style=@Y(@Gfast@Y|@Gthreadsafe@Y)@D\n" -" Set the default death test style.\n" + " @G--" GTEST_FLAG_PREFIX_ + "death_test_style=@Y(@Gfast@Y|@Gthreadsafe@Y)@D\n" + " Set the default death test style.\n" # endif // GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS -" @G--" GTEST_FLAG_PREFIX_ "break_on_failure@D\n" -" Turn assertion failures into debugger break-points.\n" -" @G--" GTEST_FLAG_PREFIX_ "throw_on_failure@D\n" -" Turn assertion failures into C++ exceptions for use by an external\n" -" test framework.\n" -" @G--" GTEST_FLAG_PREFIX_ "catch_exceptions=0@D\n" -" Do not report exceptions as test failures. Instead, allow them\n" -" to crash the program or throw a pop-up (on Windows).\n" -"\n" -"Except for @G--" GTEST_FLAG_PREFIX_ "list_tests@D, you can alternatively set " + " @G--" GTEST_FLAG_PREFIX_ + "break_on_failure@D\n" + " Turn assertion failures into debugger break-points.\n" + " @G--" GTEST_FLAG_PREFIX_ + "throw_on_failure@D\n" + " Turn assertion failures into C++ exceptions for use by an external\n" + " test framework.\n" + " @G--" GTEST_FLAG_PREFIX_ + "catch_exceptions=0@D\n" + " Do not report exceptions as test failures. Instead, allow them\n" + " to crash the program or throw a pop-up (on Windows).\n" + "\n" + "Except for @G--" GTEST_FLAG_PREFIX_ + "list_tests@D, you can alternatively set " "the corresponding\n" -"environment variable of a flag (all letters in upper-case). For example, to\n" -"disable colored text output, you can either specify @G--" GTEST_FLAG_PREFIX_ + "environment variable of a flag (all letters in upper-case). For example, " + "to\n" + "disable colored text output, you can either specify " + "@G--" GTEST_FLAG_PREFIX_ "color=no@D or set\n" -"the @G" GTEST_FLAG_PREFIX_UPPER_ "COLOR@D environment variable to @Gno@D.\n" -"\n" -"For more information, please read the " GTEST_NAME_ " documentation at\n" -"@G" GTEST_PROJECT_URL_ "@D. If you find a bug in " GTEST_NAME_ "\n" -"(not one in your own code or tests), please report it to\n" -"@G<" GTEST_DEV_EMAIL_ ">@D.\n"; + "the @G" GTEST_FLAG_PREFIX_UPPER_ + "COLOR@D environment variable to @Gno@D.\n" + "\n" + "For more information, please read the " GTEST_NAME_ + " documentation at\n" + "@G" GTEST_PROJECT_URL_ "@D. If you find a bug in " GTEST_NAME_ + "\n" + "(not one in your own code or tests), please report it to\n" + "@G<" GTEST_DEV_EMAIL_ ">@D.\n"; static bool ParseGoogleTestFlag(const char* const arg) { return ParseBoolFlag(arg, kAlsoRunDisabledTestsFlag, >EST_FLAG(also_run_disabled_tests)) || - ParseBoolFlag(arg, kBreakOnFailureFlag, - >EST_FLAG(break_on_failure)) || - ParseBoolFlag(arg, kCatchExceptionsFlag, - >EST_FLAG(catch_exceptions)) || - ParseStringFlag(arg, kColorFlag, >EST_FLAG(color)) || - ParseStringFlag(arg, kDeathTestStyleFlag, - >EST_FLAG(death_test_style)) || - ParseBoolFlag(arg, kDeathTestUseFork, - >EST_FLAG(death_test_use_fork)) || - ParseStringFlag(arg, kFilterFlag, >EST_FLAG(filter)) || - ParseStringFlag(arg, kInternalRunDeathTestFlag, - >EST_FLAG(internal_run_death_test)) || - ParseBoolFlag(arg, kListTestsFlag, >EST_FLAG(list_tests)) || - ParseStringFlag(arg, kOutputFlag, >EST_FLAG(output)) || - ParseBoolFlag(arg, kPrintTimeFlag, >EST_FLAG(print_time)) || - ParseBoolFlag(arg, kPrintUTF8Flag, >EST_FLAG(print_utf8)) || - ParseInt32Flag(arg, kRandomSeedFlag, >EST_FLAG(random_seed)) || - ParseInt32Flag(arg, kRepeatFlag, >EST_FLAG(repeat)) || - ParseBoolFlag(arg, kShuffleFlag, >EST_FLAG(shuffle)) || - ParseInt32Flag(arg, kStackTraceDepthFlag, - >EST_FLAG(stack_trace_depth)) || - ParseStringFlag(arg, kStreamResultToFlag, - >EST_FLAG(stream_result_to)) || - ParseBoolFlag(arg, kThrowOnFailureFlag, - >EST_FLAG(throw_on_failure)); + ParseBoolFlag(arg, kBreakOnFailureFlag, + >EST_FLAG(break_on_failure)) || + ParseBoolFlag(arg, kCatchExceptionsFlag, + >EST_FLAG(catch_exceptions)) || + ParseStringFlag(arg, kColorFlag, >EST_FLAG(color)) || + ParseStringFlag(arg, kDeathTestStyleFlag, + >EST_FLAG(death_test_style)) || + ParseBoolFlag(arg, kDeathTestUseFork, + >EST_FLAG(death_test_use_fork)) || + ParseBoolFlag(arg, kFailFast, >EST_FLAG(fail_fast)) || + ParseStringFlag(arg, kFilterFlag, >EST_FLAG(filter)) || + ParseStringFlag(arg, kInternalRunDeathTestFlag, + >EST_FLAG(internal_run_death_test)) || + ParseBoolFlag(arg, kListTestsFlag, >EST_FLAG(list_tests)) || + ParseStringFlag(arg, kOutputFlag, >EST_FLAG(output)) || + ParseBoolFlag(arg, kBriefFlag, >EST_FLAG(brief)) || + ParseBoolFlag(arg, kPrintTimeFlag, >EST_FLAG(print_time)) || + ParseBoolFlag(arg, kPrintUTF8Flag, >EST_FLAG(print_utf8)) || + ParseInt32Flag(arg, kRandomSeedFlag, >EST_FLAG(random_seed)) || + ParseInt32Flag(arg, kRepeatFlag, >EST_FLAG(repeat)) || + ParseBoolFlag(arg, kShuffleFlag, >EST_FLAG(shuffle)) || + ParseInt32Flag(arg, kStackTraceDepthFlag, + >EST_FLAG(stack_trace_depth)) || + ParseStringFlag(arg, kStreamResultToFlag, + >EST_FLAG(stream_result_to)) || + ParseBoolFlag(arg, kThrowOnFailureFlag, >EST_FLAG(throw_on_failure)); } #if GTEST_USE_OWN_FLAGFILE_FLAG_ @@ -6314,24 +6694,31 @@ void InitGoogleTest() { std::string TempDir() { #if defined(GTEST_CUSTOM_TEMPDIR_FUNCTION_) return GTEST_CUSTOM_TEMPDIR_FUNCTION_(); -#endif - -#if GTEST_OS_WINDOWS_MOBILE +#elif GTEST_OS_WINDOWS_MOBILE return "\\temp\\"; #elif GTEST_OS_WINDOWS const char* temp_dir = internal::posix::GetEnv("TEMP"); - if (temp_dir == nullptr || temp_dir[0] == '\0') + if (temp_dir == nullptr || temp_dir[0] == '\0') { return "\\temp\\"; - else if (temp_dir[strlen(temp_dir) - 1] == '\\') + } else if (temp_dir[strlen(temp_dir) - 1] == '\\') { return temp_dir; - else + } else { return std::string(temp_dir) + "\\"; + } #elif GTEST_OS_LINUX_ANDROID const char* temp_dir = internal::posix::GetEnv("TEST_TMPDIR"); - if (temp_dir == nullptr || temp_dir[0] == '\0') + if (temp_dir == nullptr || temp_dir[0] == '\0') { return "/data/local/tmp/"; - else + } else { return temp_dir; + } +#elif GTEST_OS_LINUX + const char* temp_dir = internal::posix::GetEnv("TEST_TMPDIR"); + if (temp_dir == nullptr || temp_dir[0] == '\0') { + return "/tmp/"; + } else { + return temp_dir; + } #else return "/tmp/"; #endif // GTEST_OS_WINDOWS_MOBILE From 90749e866308a0667c0d9afcf90244e5c6c95c0a Mon Sep 17 00:00:00 2001 From: James Zern Date: Thu, 14 Apr 2022 13:08:58 -0700 Subject: [PATCH 0331/1000] temporal_filter_sse4,cosmetics: fix some typos Change-Id: If8318068a32da52d15c0ba595f80092611f4c847 --- vp9/encoder/x86/temporal_filter_sse4.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vp9/encoder/x86/temporal_filter_sse4.c b/vp9/encoder/x86/temporal_filter_sse4.c index bdbd66051d..87e68fb438 100644 --- a/vp9/encoder/x86/temporal_filter_sse4.c +++ b/vp9/encoder/x86/temporal_filter_sse4.c @@ -460,7 +460,7 @@ static void vp9_apply_temporal_filter_luma( if (block_width == 16) { // Special Case: The blockwidth is 16 and we are operating on a row of 16 - // chroma pixels. In this case, we can't use the usualy left-midle-right + // chroma pixels. In this case, we can't use the usual left-middle-right // pattern. We also don't support splitting now. neighbors_first = LUMA_LEFT_COLUMN_NEIGHBORS; neighbors_second = LUMA_RIGHT_COLUMN_NEIGHBORS; @@ -553,7 +553,7 @@ static void vp9_apply_temporal_filter_chroma_8( // Loop variable unsigned int h; - // Initilize weight + // Initialize weight if (blk_fw) { weight = _mm_setr_epi16(blk_fw[0], blk_fw[0], blk_fw[0], blk_fw[0], blk_fw[1], blk_fw[1], blk_fw[1], blk_fw[1]); @@ -827,12 +827,12 @@ void vp9_apply_temporal_filter_sse4_1( assert( (use_whole_blk || (blk_fw[1] >= 0 && blk_fw[2] >= 0 && blk_fw[3] >= 0)) && "subblock filter weight must be positive"); - assert(blk_fw[0] <= 2 && "sublock filter weight must be less than 2"); + assert(blk_fw[0] <= 2 && "subblock filter weight must be less than 2"); assert( (use_whole_blk || (blk_fw[1] <= 2 && blk_fw[2] <= 2 && blk_fw[3] <= 2)) && "subblock filter weight must be less than 2"); - // Precompute the difference sqaured + // Precompute the difference squared for (row = 0; row < block_height; row++) { for (blk_col = 0; blk_col < block_width; blk_col += 16) { store_dist_16(y_src_ptr + blk_col, y_pre_ptr + blk_col, From 45fb0161b0bce849f2c38aba0777b702740ccc92 Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 18 Apr 2022 18:56:49 -0700 Subject: [PATCH 0332/1000] vp9_alloccommon: add missing pointer checks in vp9_free_ref_frame_buffers() and vp9_free_context_buffers(); pool and free_mi may be NULL due to earlier allocation failures Change-Id: I3bd26ea29b3aea6c58f33d5b7f5a280eb6250ec7 --- vp9/common/vp9_alloccommon.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vp9/common/vp9_alloccommon.c b/vp9/common/vp9_alloccommon.c index 5702dca718..faad657a08 100644 --- a/vp9/common/vp9_alloccommon.c +++ b/vp9/common/vp9_alloccommon.c @@ -73,6 +73,8 @@ static void free_seg_map(VP9_COMMON *cm) { void vp9_free_ref_frame_buffers(BufferPool *pool) { int i; + if (!pool) return; + for (i = 0; i < FRAME_BUFFERS; ++i) { if (!pool->frame_bufs[i].released && pool->frame_bufs[i].raw_frame_buffer.data != NULL) { @@ -100,7 +102,7 @@ void vp9_free_postproc_buffers(VP9_COMMON *cm) { } void vp9_free_context_buffers(VP9_COMMON *cm) { - cm->free_mi(cm); + if (cm->free_mi) cm->free_mi(cm); free_seg_map(cm); vpx_free(cm->above_context); cm->above_context = NULL; From 0ca5af7e24f5a8927016d6932a665acc762639a6 Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 18 Apr 2022 18:57:51 -0700 Subject: [PATCH 0333/1000] vp9_alloc_internal_frame_buffers: fix num buffers assignment avoid setting num_internal_frame_buffers until the allocation is checked, avoiding an invalid access in vp9_free_internal_frame_buffers() Change-Id: I28a544a2553d62a6b5cb7c45bf10591caa4ebab6 --- vp9/common/vp9_frame_buffers.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/vp9/common/vp9_frame_buffers.c b/vp9/common/vp9_frame_buffers.c index a254e79d20..889b809e50 100644 --- a/vp9/common/vp9_frame_buffers.c +++ b/vp9/common/vp9_frame_buffers.c @@ -14,14 +14,17 @@ #include "vpx_mem/vpx_mem.h" int vp9_alloc_internal_frame_buffers(InternalFrameBufferList *list) { + const int num_buffers = VP9_MAXIMUM_REF_BUFFERS + VPX_MAXIMUM_WORK_BUFFERS; assert(list != NULL); vp9_free_internal_frame_buffers(list); - list->num_internal_frame_buffers = - VP9_MAXIMUM_REF_BUFFERS + VPX_MAXIMUM_WORK_BUFFERS; - list->int_fb = (InternalFrameBuffer *)vpx_calloc( - list->num_internal_frame_buffers, sizeof(*list->int_fb)); - return (list->int_fb == NULL); + list->int_fb = + (InternalFrameBuffer *)vpx_calloc(num_buffers, sizeof(*list->int_fb)); + if (list->int_fb) { + list->num_internal_frame_buffers = num_buffers; + return 0; + } + return -1; } void vp9_free_internal_frame_buffers(InternalFrameBufferList *list) { @@ -35,6 +38,7 @@ void vp9_free_internal_frame_buffers(InternalFrameBufferList *list) { } vpx_free(list->int_fb); list->int_fb = NULL; + list->num_internal_frame_buffers = 0; } int vp9_get_frame_buffer(void *cb_priv, size_t min_size, From f1d42a92bbb98ab4481f85716339a96914369e6a Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 18 Apr 2022 18:59:57 -0700 Subject: [PATCH 0334/1000] vp9_encoder: check context buffer allocations previously the returns for alloc_context_buffers_ext() and vp9_alloc_context_buffers() were ignored which would result in a NULL access during encoding should they fail Change-Id: Icd76576f3d5f8d57697adc9ae926a3a5be731327 --- vp9/encoder/vp9_encoder.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 84ab80fe3f..ec6a756197 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -1379,21 +1379,22 @@ static void alloc_util_frame_buffers(VP9_COMP *cpi) { #endif } -static int alloc_context_buffers_ext(VP9_COMP *cpi) { +static void alloc_context_buffers_ext(VP9_COMP *cpi) { VP9_COMMON *cm = &cpi->common; int mi_size = cm->mi_cols * cm->mi_rows; - cpi->mbmi_ext_base = vpx_calloc(mi_size, sizeof(*cpi->mbmi_ext_base)); - if (!cpi->mbmi_ext_base) return 1; - - return 0; + CHECK_MEM_ERROR(cm, cpi->mbmi_ext_base, + vpx_calloc(mi_size, sizeof(*cpi->mbmi_ext_base))); } static void alloc_compressor_data(VP9_COMP *cpi) { VP9_COMMON *cm = &cpi->common; int sb_rows; - vp9_alloc_context_buffers(cm, cm->width, cm->height); + if (vp9_alloc_context_buffers(cm, cm->width, cm->height)) { + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to allocate context buffers"); + } alloc_context_buffers_ext(cpi); From 6ea4ef1d24f84d131e0a4398bf358bfd79bc88c3 Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 18 Apr 2022 19:07:39 -0700 Subject: [PATCH 0335/1000] vp9_dx_iface,init_buffer_callbacks: return on alloc failure use an error code as a jmp target is not currently set in init_decoder() Change-Id: If7798039439f13c739298a8a92a55aaa24e2210c --- vp9/vp9_dx_iface.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c index 35ecbaff37..3c42c7dfed 100644 --- a/vp9/vp9_dx_iface.c +++ b/vp9/vp9_dx_iface.c @@ -201,7 +201,7 @@ static vpx_codec_err_t update_error_state( return error->error_code; } -static void init_buffer_callbacks(vpx_codec_alg_priv_t *ctx) { +static vpx_codec_err_t init_buffer_callbacks(vpx_codec_alg_priv_t *ctx) { VP9_COMMON *const cm = &ctx->pbi->common; BufferPool *const pool = cm->buffer_pool; @@ -217,12 +217,16 @@ static void init_buffer_callbacks(vpx_codec_alg_priv_t *ctx) { pool->get_fb_cb = vp9_get_frame_buffer; pool->release_fb_cb = vp9_release_frame_buffer; - if (vp9_alloc_internal_frame_buffers(&pool->int_frame_buffers)) + if (vp9_alloc_internal_frame_buffers(&pool->int_frame_buffers)) { vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, "Failed to initialize internal frame buffers"); + return VPX_CODEC_MEM_ERROR; + } pool->cb_priv = &pool->int_frame_buffers; } + + return VPX_CODEC_OK; } static void set_default_ppflags(vp8_postproc_cfg_t *cfg) { @@ -278,9 +282,7 @@ static vpx_codec_err_t init_decoder(vpx_codec_alg_priv_t *ctx) { if (!ctx->postproc_cfg_set && (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC)) set_default_ppflags(&ctx->postproc_cfg); - init_buffer_callbacks(ctx); - - return VPX_CODEC_OK; + return init_buffer_callbacks(ctx); } static INLINE void check_resync(vpx_codec_alg_priv_t *const ctx, From 665f6a3065555317e75915af2561cf9b776e50b6 Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 18 Apr 2022 19:10:11 -0700 Subject: [PATCH 0336/1000] webmdec: fix double free when no frames were decoded, for example due to a decoder initialization failure, an orphan buffer pointer from webm_guess_framerate() via webm_read_frame() would have been freed during cleanup Change-Id: I6ea3defdd13dd75427f79c516e207b682391e4fa --- webmdec.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/webmdec.cc b/webmdec.cc index 68c6f4782d..f7671bb641 100644 --- a/webmdec.cc +++ b/webmdec.cc @@ -210,6 +210,8 @@ int webm_guess_framerate(struct WebmInputContext *webm_ctx, vpx_ctx->framerate.denominator = static_cast(webm_ctx->timestamp_ns / 1000); delete[] buffer; + // webm_ctx->buffer is assigned to the buffer pointer in webm_read_frame(). + webm_ctx->buffer = nullptr; get_first_cluster(webm_ctx); webm_ctx->block = nullptr; From 8da05d39b94ab09ca1993e5c1cb6430f0d3000db Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 19 Apr 2022 17:24:26 -0700 Subject: [PATCH 0337/1000] vp8_decode: free mt buffers early on resolution change this avoids a desynchronization of mb_rows if an allocation prior to vp8mt_alloc_temp_buffers() fails and the decoder is then destroyed Bug: webm:1759 Change-Id: I75457ef9ceb24c8a8fd213c3690e7c1cf0ec425f --- vp8/vp8_dx_iface.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/vp8/vp8_dx_iface.c b/vp8/vp8_dx_iface.c index ba0714abe5..6d88e5154f 100644 --- a/vp8/vp8_dx_iface.c +++ b/vp8/vp8_dx_iface.c @@ -371,8 +371,6 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx, pc->Width = ctx->si.w; pc->Height = ctx->si.h; { - int prev_mb_rows = pc->mb_rows; - if (setjmp(pbi->common.error.jmp)) { pbi->common.error.setjmp = 0; /* on failure clear the cached resolution to ensure a full @@ -398,6 +396,12 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx, "Invalid frame height"); } +#if CONFIG_MULTITHREAD + if (vpx_atomic_load_acquire(&pbi->b_multithreaded_rd)) { + vp8mt_de_alloc_temp_buffers(pbi, pc->mb_rows); + } +#endif + if (vp8_alloc_frame_buffers(pc, pc->Width, pc->Height)) { vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR, "Failed to allocate frame buffers"); @@ -442,10 +446,8 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx, #if CONFIG_MULTITHREAD if (vpx_atomic_load_acquire(&pbi->b_multithreaded_rd)) { - vp8mt_alloc_temp_buffers(pbi, pc->Width, prev_mb_rows); + vp8mt_alloc_temp_buffers(pbi, pc->Width, 0); } -#else - (void)prev_mb_rows; #endif } From f2ef29f746c4cad7a41b3bba5daefba2726eda3a Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 19 Apr 2022 19:26:37 -0700 Subject: [PATCH 0338/1000] fdct16x16_neon.h,cosmetics: fix include-guard case Change-Id: I593735bb7f88d63f2ddab57484099479c8759a3d --- vpx_dsp/arm/fdct16x16_neon.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vpx_dsp/arm/fdct16x16_neon.h b/vpx_dsp/arm/fdct16x16_neon.h index 8391238991..0dd21153fc 100644 --- a/vpx_dsp/arm/fdct16x16_neon.h +++ b/vpx_dsp/arm/fdct16x16_neon.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_VPX_DSP_ARM_FDCT16x16_NEON_H_ -#define VPX_VPX_DSP_ARM_FDCT16x16_NEON_H_ +#ifndef VPX_VPX_DSP_ARM_FDCT16X16_NEON_H_ +#define VPX_VPX_DSP_ARM_FDCT16X16_NEON_H_ #include @@ -324,4 +324,4 @@ static void vpx_fdct16x16_body(const int16x8_t *in /*[16]*/, &out[11]); } -#endif // VPX_VPX_DSP_ARM_FDCT16x16_NEON_H_ +#endif // VPX_VPX_DSP_ARM_FDCT16X16_NEON_H_ From 2651113a64d2a6892431b843ce35b57621369765 Mon Sep 17 00:00:00 2001 From: yuanhecai Date: Mon, 18 Apr 2022 16:17:17 +0800 Subject: [PATCH 0339/1000] vp9[loongarch]: Optimize vertical/horizontal_8_dual 1. vpx_lpf_vertical_8_dual_lsx 2. vpx_lpf_horizontal_8_dual_lsx Bug: webm:1755 Change-Id: I354df02cc215f36b4edf6558af0ff7fd6909deac --- test/lpf_test.cc | 14 +- vpx_dsp/loongarch/loopfilter_8_lsx.c | 270 +++++++++++++++++++++++++++ vpx_dsp/vpx_dsp_rtcd_defs.pl | 4 +- 3 files changed, 283 insertions(+), 5 deletions(-) diff --git a/test/lpf_test.cc b/test/lpf_test.cc index 833dfb9a89..0bdec77e54 100644 --- a/test/lpf_test.cc +++ b/test/lpf_test.cc @@ -147,7 +147,7 @@ class Loop8Test6Param : public ::testing::TestWithParam { }; GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(Loop8Test6Param); -#if HAVE_NEON || HAVE_SSE2 || \ +#if HAVE_NEON || HAVE_SSE2 || (HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH) || \ (HAVE_DSPR2 || HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH) class Loop8Test9Param : public ::testing::TestWithParam { public: @@ -169,7 +169,7 @@ class Loop8Test9Param : public ::testing::TestWithParam { }; GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(Loop8Test9Param); #endif // HAVE_NEON || HAVE_SSE2 || (HAVE_DSPR2 || HAVE_MSA && - // (!CONFIG_VP9_HIGHBITDEPTH)) + // (!CONFIG_VP9_HIGHBITDEPTH) || (HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH)) TEST_P(Loop8Test6Param, OperationCheck) { ACMRandom rnd(ACMRandom::DeterministicSeed()); @@ -281,7 +281,7 @@ TEST_P(Loop8Test6Param, ValueCheck) { << "First failed at test case " << first_failure; } -#if HAVE_NEON || HAVE_SSE2 || \ +#if HAVE_NEON || HAVE_SSE2 || (HAVE_LSX && (!CONFIG_VP9_HIGHBITDEPTH)) || \ (HAVE_DSPR2 || HAVE_MSA && (!CONFIG_VP9_HIGHBITDEPTH)) TEST_P(Loop8Test9Param, OperationCheck) { ACMRandom rnd(ACMRandom::DeterministicSeed()); @@ -411,6 +411,7 @@ TEST_P(Loop8Test9Param, ValueCheck) { << "First failed at test case " << first_failure; } #endif // HAVE_NEON || HAVE_SSE2 || (HAVE_DSPR2 || HAVE_MSA && + // (!CONFIG_VP9_HIGHBITDEPTH)) || (HAVE_LSX && // (!CONFIG_VP9_HIGHBITDEPTH)) using std::make_tuple; @@ -702,6 +703,13 @@ INSTANTIATE_TEST_SUITE_P( make_tuple(&vpx_lpf_vertical_8_lsx, &vpx_lpf_vertical_8_c, 8), make_tuple(&vpx_lpf_vertical_16_dual_lsx, &vpx_lpf_vertical_16_dual_c, 8))); + +INSTANTIATE_TEST_SUITE_P( + LSX, Loop8Test9Param, + ::testing::Values(make_tuple(&vpx_lpf_horizontal_8_dual_lsx, + &vpx_lpf_horizontal_8_dual_c, 8), + make_tuple(&vpx_lpf_vertical_8_dual_lsx, + &vpx_lpf_vertical_8_dual_c, 8))); #endif // HAVE_LSX && (!CONFIG_VP9_HIGHBITDEPTH) } // namespace diff --git a/vpx_dsp/loongarch/loopfilter_8_lsx.c b/vpx_dsp/loongarch/loopfilter_8_lsx.c index facf6f30ec..358e221662 100644 --- a/vpx_dsp/loongarch/loopfilter_8_lsx.c +++ b/vpx_dsp/loongarch/loopfilter_8_lsx.c @@ -83,6 +83,93 @@ void vpx_lpf_horizontal_8_lsx(uint8_t *dst, int32_t stride, } } +void vpx_lpf_horizontal_8_dual_lsx( + uint8_t *dst, int32_t stride, const uint8_t *b_limit0, + const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *b_limit1, + const uint8_t *limit1, const uint8_t *thresh1) { + __m128i p3, p2, p1, p0, q3, q2, q1, q0; + __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; + __m128i flat, mask, hev, tmp, thresh, b_limit, limit; + __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l; + __m128i p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h; + __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l; + __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l; + __m128i p2_filt8_h, p1_filt8_h, p0_filt8_h; + __m128i q0_filt8_h, q1_filt8_h, q2_filt8_h; + __m128i zero = __lsx_vldi(0); + + int32_t stride2 = stride << 1; + int32_t stride3 = stride2 + stride; + int32_t stride4 = stride2 << 1; + + DUP4_ARG2(__lsx_vldx, dst, -stride4, dst, -stride3, dst, -stride2, dst, + -stride, p3, p2, p1, p0); + q0 = __lsx_vld(dst, 0); + DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2); + q3 = __lsx_vldx(dst, stride3); + + thresh = __lsx_vreplgr2vr_b(*thresh0); + tmp = __lsx_vreplgr2vr_b(*thresh1); + thresh = __lsx_vilvl_d(tmp, thresh); + + b_limit = __lsx_vreplgr2vr_b(*b_limit0); + tmp = __lsx_vreplgr2vr_b(*b_limit1); + b_limit = __lsx_vilvl_d(tmp, b_limit); + + limit = __lsx_vreplgr2vr_b(*limit0); + tmp = __lsx_vreplgr2vr_b(*limit1); + limit = __lsx_vilvl_d(tmp, limit); + + /* mask and hev */ + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + + if (__lsx_bz_v(flat)) { + __lsx_vst(p1_out, dst - stride2, 0); + __lsx_vst(p0_out, dst - stride, 0); + __lsx_vst(q0_out, dst, 0); + __lsx_vst(q1_out, dst + stride, 0); + } else { + DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, + p1_l, p0_l); + DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, + q2_l, q3_l); + VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l, + p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l); + + DUP4_ARG2(__lsx_vilvh_b, zero, p3, zero, p2, zero, p1, zero, p0, p3_h, p2_h, + p1_h, p0_h); + DUP4_ARG2(__lsx_vilvh_b, zero, q0, zero, q1, zero, q2, zero, q3, q0_h, q1_h, + q2_h, q3_h); + VP9_FILTER8(p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h, p2_filt8_h, + p1_filt8_h, p0_filt8_h, q0_filt8_h, q1_filt8_h, q2_filt8_h); + + /* convert 16 bit output data into 8 bit */ + DUP4_ARG2(__lsx_vpickev_b, p2_filt8_h, p2_filt8_l, p1_filt8_h, p1_filt8_l, + p0_filt8_h, p0_filt8_l, q0_filt8_h, q0_filt8_l, p2_filt8_l, + p1_filt8_l, p0_filt8_l, q0_filt8_l); + DUP2_ARG2(__lsx_vpickev_b, q1_filt8_h, q1_filt8_l, q2_filt8_h, q2_filt8_l, + q1_filt8_l, q2_filt8_l); + + /* store pixel values */ + p2_out = __lsx_vbitsel_v(p2, p2_filt8_l, flat); + p1_out = __lsx_vbitsel_v(p1_out, p1_filt8_l, flat); + p0_out = __lsx_vbitsel_v(p0_out, p0_filt8_l, flat); + q0_out = __lsx_vbitsel_v(q0_out, q0_filt8_l, flat); + q1_out = __lsx_vbitsel_v(q1_out, q1_filt8_l, flat); + q2_out = __lsx_vbitsel_v(q2, q2_filt8_l, flat); + + __lsx_vst(p2_out, dst - stride3, 0); + __lsx_vst(p1_out, dst - stride2, 0); + __lsx_vst(p0_out, dst - stride, 0); + __lsx_vst(q0_out, dst, 0); + __lsx_vst(q1_out, dst + stride, 0); + __lsx_vst(q2_out, dst + stride2, 0); + } +} + void vpx_lpf_vertical_8_lsx(uint8_t *dst, int32_t stride, const uint8_t *b_limit_ptr, const uint8_t *limit_ptr, @@ -197,3 +284,186 @@ void vpx_lpf_vertical_8_lsx(uint8_t *dst, int32_t stride, __lsx_vstelm_h(vec4, dst, 4, 7); } } + +void vpx_lpf_vertical_8_dual_lsx(uint8_t *dst, int32_t stride, + const uint8_t *b_limit0, const uint8_t *limit0, + const uint8_t *thresh0, + const uint8_t *b_limit1, const uint8_t *limit1, + const uint8_t *thresh1) { + uint8_t *dst_tmp = dst - 4; + __m128i p3, p2, p1, p0, q3, q2, q1, q0; + __m128i p1_out, p0_out, q0_out, q1_out; + __m128i flat, mask, hev, thresh, b_limit, limit; + __m128i row4, row5, row6, row7, row12, row13, row14, row15; + __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l; + __m128i p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h; + __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l; + __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l; + __m128i p2_filt8_h, p1_filt8_h, p0_filt8_h; + __m128i q0_filt8_h, q1_filt8_h, q2_filt8_h; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + __m128i zero = __lsx_vldi(0); + int32_t stride2 = stride << 1; + int32_t stride3 = stride2 + stride; + int32_t stride4 = stride2 << 1; + + p0 = __lsx_vld(dst_tmp, 0); + DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, p1, p2); + p3 = __lsx_vldx(dst_tmp, stride3); + dst_tmp += stride4; + row4 = __lsx_vld(dst_tmp, 0); + DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, row5, row6); + row7 = __lsx_vldx(dst_tmp, stride3); + dst_tmp += stride4; + + q3 = __lsx_vld(dst_tmp, 0); + DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, q2, q1); + q0 = __lsx_vldx(dst_tmp, stride3); + dst_tmp += stride4; + row12 = __lsx_vld(dst_tmp, 0); + DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, row13, row14); + row15 = __lsx_vldx(dst_tmp, stride3); + + /* transpose 16x8 matrix into 8x16 */ + LSX_TRANSPOSE16x8_B(p0, p1, p2, p3, row4, row5, row6, row7, q3, q2, q1, q0, + row12, row13, row14, row15, p3, p2, p1, p0, q0, q1, q2, + q3); + + thresh = __lsx_vreplgr2vr_b(*thresh0); + vec0 = __lsx_vreplgr2vr_b(*thresh1); + thresh = __lsx_vilvl_d(vec0, thresh); + + b_limit = __lsx_vreplgr2vr_b(*b_limit0); + vec0 = __lsx_vreplgr2vr_b(*b_limit1); + b_limit = __lsx_vilvl_d(vec0, b_limit); + + limit = __lsx_vreplgr2vr_b(*limit0); + vec0 = __lsx_vreplgr2vr_b(*limit1); + limit = __lsx_vilvl_d(vec0, limit); + + /* mask and hev */ + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + /* flat4 */ + VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); + /* filter4 */ + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + /* if flat is zero for all pixels, then no need to calculate other filter */ + if (__lsx_bz_v(flat)) { + DUP2_ARG2(__lsx_vilvl_b, p0_out, p1_out, q1_out, q0_out, vec0, vec1); + vec2 = __lsx_vilvl_h(vec1, vec0); + vec3 = __lsx_vilvh_h(vec1, vec0); + DUP2_ARG2(__lsx_vilvh_b, p0_out, p1_out, q1_out, q0_out, vec0, vec1); + vec4 = __lsx_vilvl_h(vec1, vec0); + vec5 = __lsx_vilvh_h(vec1, vec0); + + dst -= 2; + __lsx_vstelm_w(vec2, dst, 0, 0); + __lsx_vstelm_w(vec2, dst + stride, 0, 1); + __lsx_vstelm_w(vec2, dst + stride2, 0, 2); + __lsx_vstelm_w(vec2, dst + stride3, 0, 3); + dst += stride4; + __lsx_vstelm_w(vec3, dst, 0, 0); + __lsx_vstelm_w(vec3, dst + stride, 0, 1); + __lsx_vstelm_w(vec3, dst + stride2, 0, 2); + __lsx_vstelm_w(vec3, dst + stride3, 0, 3); + dst += stride4; + __lsx_vstelm_w(vec4, dst, 0, 0); + __lsx_vstelm_w(vec4, dst + stride, 0, 1); + __lsx_vstelm_w(vec4, dst + stride2, 0, 2); + __lsx_vstelm_w(vec4, dst + stride3, 0, 3); + dst += stride4; + __lsx_vstelm_w(vec5, dst, 0, 0); + __lsx_vstelm_w(vec5, dst + stride, 0, 1); + __lsx_vstelm_w(vec5, dst + stride2, 0, 2); + __lsx_vstelm_w(vec5, dst + stride3, 0, 3); + } else { + DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, + p1_l, p0_l); + DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, + q2_l, q3_l); + VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l, + p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l); + DUP4_ARG2(__lsx_vilvh_b, zero, p3, zero, p2, zero, p1, zero, p0, p3_h, p2_h, + p1_h, p0_h); + DUP4_ARG2(__lsx_vilvh_b, zero, q0, zero, q1, zero, q2, zero, q3, q0_h, q1_h, + q2_h, q3_h); + + /* filter8 */ + VP9_FILTER8(p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h, p2_filt8_h, + p1_filt8_h, p0_filt8_h, q0_filt8_h, q1_filt8_h, q2_filt8_h); + + /* convert 16 bit output data into 8 bit */ + DUP4_ARG2(__lsx_vpickev_b, p2_filt8_h, p2_filt8_l, p1_filt8_h, p1_filt8_l, + p0_filt8_h, p0_filt8_l, q0_filt8_h, q0_filt8_l, p2_filt8_l, + p1_filt8_l, p0_filt8_l, q0_filt8_l); + DUP2_ARG2(__lsx_vpickev_b, q1_filt8_h, q1_filt8_l, q2_filt8_h, q2_filt8_l, + q1_filt8_l, q2_filt8_l); + + /* store pixel values */ + p2 = __lsx_vbitsel_v(p2, p2_filt8_l, flat); + p1 = __lsx_vbitsel_v(p1_out, p1_filt8_l, flat); + p0 = __lsx_vbitsel_v(p0_out, p0_filt8_l, flat); + q0 = __lsx_vbitsel_v(q0_out, q0_filt8_l, flat); + q1 = __lsx_vbitsel_v(q1_out, q1_filt8_l, flat); + q2 = __lsx_vbitsel_v(q2, q2_filt8_l, flat); + + DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, vec0, vec1); + vec3 = __lsx_vilvl_h(vec1, vec0); + vec4 = __lsx_vilvh_h(vec1, vec0); + DUP2_ARG2(__lsx_vilvh_b, p1, p2, q0, p0, vec0, vec1); + vec6 = __lsx_vilvl_h(vec1, vec0); + vec7 = __lsx_vilvh_h(vec1, vec0); + vec2 = __lsx_vilvl_b(q2, q1); + vec5 = __lsx_vilvh_b(q2, q1); + + dst -= 3; + __lsx_vstelm_w(vec3, dst, 0, 0); + __lsx_vstelm_h(vec2, dst, 4, 0); + dst += stride; + __lsx_vstelm_w(vec3, dst, 0, 1); + __lsx_vstelm_h(vec2, dst, 4, 1); + dst += stride; + __lsx_vstelm_w(vec3, dst, 0, 2); + __lsx_vstelm_h(vec2, dst, 4, 2); + dst += stride; + __lsx_vstelm_w(vec3, dst, 0, 3); + __lsx_vstelm_h(vec2, dst, 4, 3); + dst += stride; + __lsx_vstelm_w(vec4, dst, 0, 0); + __lsx_vstelm_h(vec2, dst, 4, 4); + dst += stride; + __lsx_vstelm_w(vec4, dst, 0, 1); + __lsx_vstelm_h(vec2, dst, 4, 5); + dst += stride; + __lsx_vstelm_w(vec4, dst, 0, 2); + __lsx_vstelm_h(vec2, dst, 4, 6); + dst += stride; + __lsx_vstelm_w(vec4, dst, 0, 3); + __lsx_vstelm_h(vec2, dst, 4, 7); + dst += stride; + __lsx_vstelm_w(vec6, dst, 0, 0); + __lsx_vstelm_h(vec5, dst, 4, 0); + dst += stride; + __lsx_vstelm_w(vec6, dst, 0, 1); + __lsx_vstelm_h(vec5, dst, 4, 1); + dst += stride; + __lsx_vstelm_w(vec6, dst, 0, 2); + __lsx_vstelm_h(vec5, dst, 4, 2); + dst += stride; + __lsx_vstelm_w(vec6, dst, 0, 3); + __lsx_vstelm_h(vec5, dst, 4, 3); + dst += stride; + __lsx_vstelm_w(vec7, dst, 0, 0); + __lsx_vstelm_h(vec5, dst, 4, 4); + dst += stride; + __lsx_vstelm_w(vec7, dst, 0, 1); + __lsx_vstelm_h(vec5, dst, 4, 5); + dst += stride; + __lsx_vstelm_w(vec7, dst, 0, 2); + __lsx_vstelm_h(vec5, dst, 4, 6); + dst += stride; + __lsx_vstelm_w(vec7, dst, 0, 3); + __lsx_vstelm_h(vec5, dst, 4, 7); + } +} diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 7d78dc72ac..d10f3a1408 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -448,7 +448,7 @@ () specialize qw/vpx_lpf_vertical_8 sse2 neon dspr2 msa lsx/; add_proto qw/void vpx_lpf_vertical_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; -specialize qw/vpx_lpf_vertical_8_dual sse2 neon dspr2 msa/; +specialize qw/vpx_lpf_vertical_8_dual sse2 neon dspr2 msa lsx/; add_proto qw/void vpx_lpf_vertical_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; specialize qw/vpx_lpf_vertical_4 sse2 neon dspr2 msa/; @@ -466,7 +466,7 @@ () specialize qw/vpx_lpf_horizontal_8 sse2 neon dspr2 msa lsx/; add_proto qw/void vpx_lpf_horizontal_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; -specialize qw/vpx_lpf_horizontal_8_dual sse2 neon dspr2 msa/; +specialize qw/vpx_lpf_horizontal_8_dual sse2 neon dspr2 msa lsx/; add_proto qw/void vpx_lpf_horizontal_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; specialize qw/vpx_lpf_horizontal_4 sse2 neon dspr2 msa/; From 608a28e30b7abc62ed415af3dbb3d981e22b8a1c Mon Sep 17 00:00:00 2001 From: yuanhecai Date: Mon, 18 Apr 2022 16:21:04 +0800 Subject: [PATCH 0340/1000] vp9[loongarch]: Optimize convolve8_avg_vert/convolve_copy 1. vpx_convolve8_avg_vert_lsx 2. vpx_convolve_copy_lsx 3. vpx_idct32x32_135_add_lsx Bug: webm:1755 Change-Id: I6bdfe5836a91a5e361ab869b26641e86c5ebb68d --- test/convolve_test.cc | 4 +- .../loongarch/vpx_convolve8_avg_vert_lsx.c | 918 ++++++++++++++++++ vpx_dsp/loongarch/vpx_convolve_copy_lsx.c | 438 +++++++++ vpx_dsp/vpx_dsp.mk | 2 + vpx_dsp/vpx_dsp_rtcd_defs.pl | 5 +- 5 files changed, 1363 insertions(+), 4 deletions(-) create mode 100644 vpx_dsp/loongarch/vpx_convolve8_avg_vert_lsx.c create mode 100644 vpx_dsp/loongarch/vpx_convolve_copy_lsx.c diff --git a/test/convolve_test.cc b/test/convolve_test.cc index 5189be647a..d569048691 100644 --- a/test/convolve_test.cc +++ b/test/convolve_test.cc @@ -1451,9 +1451,9 @@ INSTANTIATE_TEST_SUITE_P(MSA, ConvolveTest, #if HAVE_LSX const ConvolveFunctions convolve8_lsx( - vpx_convolve_copy_c, vpx_convolve_avg_lsx, vpx_convolve8_horiz_lsx, + vpx_convolve_copy_lsx, vpx_convolve_avg_lsx, vpx_convolve8_horiz_lsx, vpx_convolve8_avg_horiz_lsx, vpx_convolve8_vert_lsx, - vpx_convolve8_avg_vert_c, vpx_convolve8_lsx, vpx_convolve8_avg_lsx, + vpx_convolve8_avg_vert_lsx, vpx_convolve8_lsx, vpx_convolve8_avg_lsx, vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c, vpx_scaled_vert_c, vpx_scaled_avg_vert_c, vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0); diff --git a/vpx_dsp/loongarch/vpx_convolve8_avg_vert_lsx.c b/vpx_dsp/loongarch/vpx_convolve8_avg_vert_lsx.c new file mode 100644 index 0000000000..584f241838 --- /dev/null +++ b/vpx_dsp/loongarch/vpx_convolve8_avg_vert_lsx.c @@ -0,0 +1,918 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/loongarch/vpx_convolve_lsx.h" + +static void common_vt_8t_and_aver_dst_4w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height) { + uint32_t loop_cnt = (height >> 2); + uint8_t *dst_tmp = dst; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; + __m128i reg0, reg1, reg2, reg3, reg4; + __m128i filter0, filter1, filter2, filter3; + __m128i out0, out1; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride + src_stride2; + int32_t src_stride4 = src_stride2 << 1; + uint8_t *src_tmp0 = (uint8_t *)src - src_stride3; + + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + src0 = __lsx_vld(src_tmp0, 0); + DUP2_ARG2(__lsx_vldx, src_tmp0, src_stride, src_tmp0, src_stride2, src1, + src2); + src3 = __lsx_vldx(src_tmp0, src_stride3); + src_tmp0 += src_stride4; + src4 = __lsx_vld(src_tmp0, 0); + DUP2_ARG2(__lsx_vldx, src_tmp0, src_stride, src_tmp0, src_stride2, src5, + src6); + src_tmp0 += src_stride3; + DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1, tmp0, + tmp1, tmp2, tmp3); + DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, tmp4, tmp5); + DUP2_ARG2(__lsx_vilvl_d, tmp3, tmp0, tmp4, tmp1, reg0, reg1); + reg2 = __lsx_vilvl_d(tmp5, tmp2); + DUP2_ARG2(__lsx_vxori_b, reg0, 128, reg1, 128, reg0, reg1); + reg2 = __lsx_vxori_b(reg2, 128); + + for (; loop_cnt--;) { + src7 = __lsx_vld(src_tmp0, 0); + DUP2_ARG2(__lsx_vldx, src_tmp0, src_stride, src_tmp0, src_stride2, src8, + src9); + src10 = __lsx_vldx(src_tmp0, src_stride3); + src_tmp0 += src_stride4; + src0 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + src1 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + src2 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + src3 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + DUP2_ARG2(__lsx_vilvl_w, src1, src0, src3, src2, src0, src1); + src0 = __lsx_vilvl_d(src1, src0); + DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9, + tmp0, tmp1, tmp2, tmp3); + DUP2_ARG2(__lsx_vilvl_d, tmp1, tmp0, tmp3, tmp2, reg3, reg4); + DUP2_ARG2(__lsx_vxori_b, reg3, 128, reg4, 128, reg3, reg4); + out0 = FILT_8TAP_DPADD_S_H(reg0, reg1, reg2, reg3, filter0, filter1, + filter2, filter3); + out1 = FILT_8TAP_DPADD_S_H(reg1, reg2, reg3, reg4, filter0, filter1, + filter2, filter3); + out0 = __lsx_vssrarni_b_h(out1, out0, 7); + out0 = __lsx_vxori_b(out0, 128); + out0 = __lsx_vavgr_bu(out0, src0); + __lsx_vstelm_w(out0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 3); + dst += dst_stride; + reg0 = reg2; + reg1 = reg3; + reg2 = reg4; + src6 = src10; + } +} + +static void common_vt_8t_and_aver_dst_8w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height) { + uint32_t loop_cnt = height >> 2; + uint8_t *dst_tmp = dst; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + __m128i tmp0, tmp1, tmp2, tmp3; + __m128i reg0, reg1, reg2, reg3, reg4, reg5; + __m128i filter0, filter1, filter2, filter3; + __m128i out0, out1, out2, out3; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride + src_stride2; + int32_t src_stride4 = src_stride2 << 1; + uint8_t *src_tmp0 = (uint8_t *)src - src_stride3; + + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + + src0 = __lsx_vld(src_tmp0, 0); + DUP2_ARG2(__lsx_vldx, src_tmp0, src_stride, src_tmp0, src_stride2, src1, + src2); + src3 = __lsx_vldx(src_tmp0, src_stride3); + src_tmp0 += src_stride4; + src4 = __lsx_vld(src_tmp0, 0); + DUP2_ARG2(__lsx_vldx, src_tmp0, src_stride, src_tmp0, src_stride2, src5, + src6); + src_tmp0 += src_stride3; + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5); + src6 = __lsx_vxori_b(src6, 128); + DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1, reg0, + reg1, reg2, reg3); + DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, reg4, reg5); + + for (; loop_cnt--;) { + src7 = __lsx_vld(src_tmp0, 0); + DUP2_ARG2(__lsx_vldx, src_tmp0, src_stride, src_tmp0, src_stride2, src8, + src9); + src10 = __lsx_vldx(src_tmp0, src_stride3); + src_tmp0 += src_stride4; + src0 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + src1 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + src2 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + src3 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + DUP2_ARG2(__lsx_vilvl_d, src1, src0, src3, src2, src0, src1); + DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7, + src8, src9, src10); + DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9, + tmp0, tmp1, tmp2, tmp3); + out0 = FILT_8TAP_DPADD_S_H(reg0, reg1, reg2, tmp0, filter0, filter1, + filter2, filter3); + out1 = FILT_8TAP_DPADD_S_H(reg3, reg4, reg5, tmp1, filter0, filter1, + filter2, filter3); + out2 = FILT_8TAP_DPADD_S_H(reg1, reg2, tmp0, tmp2, filter0, filter1, + filter2, filter3); + out3 = FILT_8TAP_DPADD_S_H(reg4, reg5, tmp1, tmp3, filter0, filter1, + filter2, filter3); + DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1); + DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); + DUP2_ARG2(__lsx_vavgr_bu, out0, src0, out1, src1, out0, out1); + __lsx_vstelm_d(out0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(out0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(out1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(out1, dst, 0, 1); + dst += dst_stride; + reg0 = reg2; + reg1 = tmp0; + reg2 = tmp2; + reg3 = reg5; + reg4 = tmp1; + reg5 = tmp3; + src6 = src10; + } +} + +static void common_vt_8t_and_aver_dst_16w_mult_lsx( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height, int32_t width) { + uint8_t *src_tmp; + uint32_t cnt = width >> 4; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + __m128i filter0, filter1, filter2, filter3; + __m128i reg0, reg1, reg2, reg3, reg4, reg5; + __m128i reg6, reg7, reg8, reg9, reg10, reg11; + __m128i tmp0, tmp1, tmp2, tmp3; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride + src_stride2; + int32_t src_stride4 = src_stride2 << 1; + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + int32_t dst_stride4 = dst_stride2 << 1; + uint8_t *src_tmp0 = (uint8_t *)src - src_stride3; + + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + for (; cnt--;) { + uint32_t loop_cnt = height >> 2; + uint8_t *dst_reg = dst; + + src_tmp = src_tmp0; + src0 = __lsx_vld(src_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src1, + src2); + src3 = __lsx_vldx(src_tmp, src_stride3); + src_tmp += src_stride4; + src4 = __lsx_vld(src_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src5, + src6); + src_tmp += src_stride3; + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5); + src6 = __lsx_vxori_b(src6, 128); + DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1, + reg0, reg1, reg2, reg3); + DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, reg4, reg5); + DUP4_ARG2(__lsx_vilvh_b, src1, src0, src3, src2, src5, src4, src2, src1, + reg6, reg7, reg8, reg9); + DUP2_ARG2(__lsx_vilvh_b, src4, src3, src6, src5, reg10, reg11); + for (; loop_cnt--;) { + src7 = __lsx_vld(src_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src8, + src9); + src10 = __lsx_vldx(src_tmp, src_stride3); + src_tmp += src_stride4; + DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, + src7, src8, src9, src10); + DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9, + src0, src1, src2, src3); + DUP4_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src9, src8, src10, src9, + src4, src5, src7, src8); + tmp0 = FILT_8TAP_DPADD_S_H(reg0, reg1, reg2, src0, filter0, filter1, + filter2, filter3); + tmp1 = FILT_8TAP_DPADD_S_H(reg3, reg4, reg5, src1, filter0, filter1, + filter2, filter3); + tmp2 = FILT_8TAP_DPADD_S_H(reg6, reg7, reg8, src4, filter0, filter1, + filter2, filter3); + tmp3 = FILT_8TAP_DPADD_S_H(reg9, reg10, reg11, src5, filter0, filter1, + filter2, filter3); + DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1); + DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); + tmp2 = __lsx_vld(dst_reg, 0); + tmp3 = __lsx_vldx(dst_reg, dst_stride); + DUP2_ARG2(__lsx_vavgr_bu, tmp0, tmp2, tmp1, tmp3, tmp0, tmp1); + __lsx_vst(tmp0, dst_reg, 0); + __lsx_vstx(tmp1, dst_reg, dst_stride); + tmp0 = FILT_8TAP_DPADD_S_H(reg1, reg2, src0, src2, filter0, filter1, + filter2, filter3); + tmp1 = FILT_8TAP_DPADD_S_H(reg4, reg5, src1, src3, filter0, filter1, + filter2, filter3); + tmp2 = FILT_8TAP_DPADD_S_H(reg7, reg8, src4, src7, filter0, filter1, + filter2, filter3); + tmp3 = FILT_8TAP_DPADD_S_H(reg10, reg11, src5, src8, filter0, filter1, + filter2, filter3); + DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1); + DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); + tmp2 = __lsx_vldx(dst_reg, dst_stride2); + tmp3 = __lsx_vldx(dst_reg, dst_stride3); + DUP2_ARG2(__lsx_vavgr_bu, tmp0, tmp2, tmp1, tmp3, tmp0, tmp1); + __lsx_vstx(tmp0, dst_reg, dst_stride2); + __lsx_vstx(tmp1, dst_reg, dst_stride3); + dst_reg += dst_stride4; + + reg0 = reg2; + reg1 = src0; + reg2 = src2; + reg3 = reg5; + reg4 = src1; + reg5 = src3; + reg6 = reg8; + reg7 = src4; + reg8 = src7; + reg9 = reg11; + reg10 = src5; + reg11 = src8; + src6 = src10; + } + src_tmp0 += 16; + dst += 16; + } +} + +static void common_vt_8t_and_aver_dst_16w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height) { + common_vt_8t_and_aver_dst_16w_mult_lsx(src, src_stride, dst, dst_stride, + filter, height, 16); +} + +static void common_vt_8t_and_aver_dst_32w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height) { + common_vt_8t_and_aver_dst_16w_mult_lsx(src, src_stride, dst, dst_stride, + filter, height, 32); +} + +static void common_vt_8t_and_aver_dst_64w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height) { + common_vt_8t_and_aver_dst_16w_mult_lsx(src, src_stride, dst, dst_stride, + filter, height, 64); +} + +static void common_vt_2t_and_aver_dst_4x4_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter) { + __m128i src0, src1, src2, src3, src4; + __m128i dst0, dst1, dst2, dst3, out, filt0, src2110, src4332; + __m128i src10_r, src32_r, src21_r, src43_r; + __m128i tmp0, tmp1; + uint8_t *dst_tmp = dst; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + filt0 = __lsx_vldrepl_h(filter, 0); + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + src += src_stride4; + src4 = __lsx_vld(src, 0); + src += src_stride; + + dst0 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst1 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst2 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_w(dst_tmp, 0); + dst0 = __lsx_vilvl_w(dst1, dst0); + dst1 = __lsx_vilvl_w(dst3, dst2); + dst0 = __lsx_vilvl_d(dst1, dst0); + DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3, + src10_r, src21_r, src32_r, src43_r); + DUP2_ARG2(__lsx_vilvl_d, src21_r, src10_r, src43_r, src32_r, src2110, + src4332); + DUP2_ARG2(__lsx_vdp2_h_bu, src2110, filt0, src4332, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + out = __lsx_vavgr_bu(tmp0, dst0); + __lsx_vstelm_w(out, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(out, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(out, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(out, dst, 0, 3); + dst += dst_stride; +} + +static void common_vt_2t_and_aver_dst_4x8_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter) { + __m128i dst0, dst1, dst2, dst3, dst4; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src87_r; + __m128i src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r; + __m128i src2110, src4332, src6554, src8776, filt0; + __m128i tmp0, tmp1, tmp2, tmp3; + uint8_t *dst_tmp = dst; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + filt0 = __lsx_vldrepl_h(filter, 0); + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + src += src_stride4; + src4 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src5, src6); + src7 = __lsx_vldx(src, src_stride3); + src += src_stride4; + src8 = __lsx_vld(src, 0); + + dst0 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst1 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst2 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst0 = __lsx_vilvl_w(dst1, dst0); + dst1 = __lsx_vilvl_w(dst3, dst2); + dst0 = __lsx_vilvl_d(dst1, dst0); + + dst1 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst2 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst4 = __lsx_vldrepl_w(dst_tmp, 0); + dst1 = __lsx_vilvl_w(dst2, dst1); + dst2 = __lsx_vilvl_w(dst4, dst3); + dst1 = __lsx_vilvl_d(dst2, dst1); + + DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3, + src10_r, src21_r, src32_r, src43_r); + DUP4_ARG2(__lsx_vilvl_b, src5, src4, src6, src5, src7, src6, src8, src7, + src54_r, src65_r, src76_r, src87_r); + DUP4_ARG2(__lsx_vilvl_d, src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, + src87_r, src76_r, src2110, src4332, src6554, src8776); + DUP4_ARG2(__lsx_vdp2_h_bu, src2110, filt0, src4332, filt0, src6554, filt0, + src8776, filt0, tmp0, tmp1, tmp2, tmp3); + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2, + FILTER_BITS, tmp0, tmp2); + DUP2_ARG2(__lsx_vavgr_bu, tmp0, dst0, tmp2, dst1, tmp0, tmp2); + __lsx_vstelm_w(tmp0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(tmp0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(tmp0, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(tmp0, dst, 0, 3); + dst += dst_stride; + + __lsx_vstelm_w(tmp2, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(tmp2, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(tmp2, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(tmp2, dst, 0, 3); +} + +static void common_vt_2t_and_aver_dst_4w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int8_t *filter, + int32_t height) { + if (height == 4) { + common_vt_2t_and_aver_dst_4x4_lsx(src, src_stride, dst, dst_stride, filter); + } else if (height == 8) { + common_vt_2t_and_aver_dst_4x8_lsx(src, src_stride, dst, dst_stride, filter); + } +} + +static void common_vt_2t_and_aver_dst_8x4_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter) { + __m128i src0, src1, src2, src3, src4; + __m128i dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3, filt0; + __m128i tmp0, tmp1, tmp2, tmp3; + uint8_t *dst_tmp = dst; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + filt0 = __lsx_vldrepl_h(filter, 0); + src0 = __lsx_vld(src, 0); + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src1, src2, src3, src4); + dst0 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst1 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst2 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1); + DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec1); + DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, vec2, vec3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0, + tmp0, tmp1, tmp2, tmp3); + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2, + FILTER_BITS, tmp0, tmp2); + DUP2_ARG2(__lsx_vavgr_bu, tmp0, dst0, tmp2, dst1, tmp0, tmp2); + __lsx_vstelm_d(tmp0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(tmp0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(tmp2, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(tmp2, dst, 0, 1); +} + +static void common_vt_2t_and_aver_dst_8x8mult_lsx( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt = (height >> 3); + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8; + __m128i dst0, dst1, dst2, dst3, dst4, dst5; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; + __m128i tmp0, tmp1, tmp2, tmp3; + uint8_t *dst_tmp = dst; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + filt0 = __lsx_vldrepl_h(filter, 0); + src0 = __lsx_vld(src, 0); + src += src_stride; + + for (; loop_cnt--;) { + src1 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3); + src4 = __lsx_vldx(src, src_stride3); + src += src_stride4; + src5 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src6, src7); + src8 = __lsx_vldx(src, src_stride3); + src += src_stride4; + + dst0 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst1 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst2 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1); + + dst2 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst4 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst5 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + DUP2_ARG2(__lsx_vilvl_d, dst3, dst2, dst5, dst4, dst2, dst3); + + DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3, + vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vilvl_b, src5, src4, src6, src5, src7, src6, src8, src7, + vec4, vec5, vec6, vec7); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, + filt0, tmp0, tmp1, tmp2, tmp3); + + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2, + FILTER_BITS, tmp0, tmp2); + DUP2_ARG2(__lsx_vavgr_bu, tmp0, dst0, tmp2, dst1, tmp0, tmp2); + __lsx_vstelm_d(tmp0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(tmp0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(tmp2, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(tmp2, dst, 0, 1); + dst += dst_stride; + + DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7, + filt0, tmp0, tmp1, tmp2, tmp3); + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2, + FILTER_BITS, tmp0, tmp2); + DUP2_ARG2(__lsx_vavgr_bu, tmp0, dst2, tmp2, dst3, tmp0, tmp2); + __lsx_vstelm_d(tmp0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(tmp0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(tmp2, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(tmp2, dst, 0, 1); + dst += dst_stride; + + src0 = src8; + } +} + +static void common_vt_2t_and_aver_dst_8w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int8_t *filter, + int32_t height) { + if (height == 4) { + common_vt_2t_and_aver_dst_8x4_lsx(src, src_stride, dst, dst_stride, filter); + } else { + common_vt_2t_and_aver_dst_8x8mult_lsx(src, src_stride, dst, dst_stride, + filter, height); + } +} + +static void common_vt_2t_and_aver_dst_16w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt = (height >> 2); + __m128i src0, src1, src2, src3, src4, dst0, dst1, dst2, dst3, filt0; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + __m128i tmp0, tmp1; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + + filt0 = __lsx_vldrepl_h(filter, 0); + src0 = __lsx_vld(src, 0); + src += src_stride; + + for (; loop_cnt--;) { + src1 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3); + src4 = __lsx_vldx(src, src_stride3); + src += src_stride4; + + dst0 = __lsx_vld(dst, 0); + DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, dst1, dst2); + dst3 = __lsx_vldx(dst, dst_stride3); + + DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec2); + DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, vec1, vec3); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst0); + __lsx_vst(tmp0, dst, 0); + dst += dst_stride; + + DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, vec4, vec6); + DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, vec5, vec7); + DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst1); + __lsx_vst(tmp0, dst, 0); + dst += dst_stride; + + DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst2); + __lsx_vst(tmp0, dst, 0); + dst += dst_stride; + + DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst3); + __lsx_vst(tmp0, dst, 0); + dst += dst_stride; + + src0 = src4; + } +} + +static void common_vt_2t_and_aver_dst_32w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt = (height >> 2); + uint8_t *src_tmp1; + uint8_t *dst_tmp1; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9; + __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; + __m128i tmp0, tmp1; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + + filt0 = __lsx_vldrepl_h(filter, 0); + DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src5); + src += src_stride; + + for (; loop_cnt--;) { + src1 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3); + src4 = __lsx_vldx(src, src_stride3); + + dst0 = __lsx_vld(dst, 0); + DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, dst1, dst2); + dst3 = __lsx_vldx(dst, dst_stride3); + + DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec2); + DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, vec1, vec3); + + src_tmp1 = src + 16; + src6 = __lsx_vld(src_tmp1, 0); + DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2, src7, + src8); + src9 = __lsx_vldx(src_tmp1, src_stride3); + + dst_tmp1 = dst + 16; + dst4 = __lsx_vld(dst_tmp1, 0); + DUP2_ARG2(__lsx_vldx, dst_tmp1, dst_stride, dst_tmp1, dst_stride2, dst5, + dst6); + dst7 = __lsx_vldx(dst_tmp1, dst_stride3); + src += src_stride4; + + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst0); + __lsx_vst(tmp0, dst, 0); + + DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst1); + __lsx_vstx(tmp0, dst, dst_stride); + + DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, vec4, vec6); + DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, vec5, vec7); + DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst2); + __lsx_vstx(tmp0, dst, dst_stride2); + + DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst3); + __lsx_vstx(tmp0, dst, dst_stride3); + + DUP2_ARG2(__lsx_vilvl_b, src6, src5, src7, src6, vec0, vec2); + DUP2_ARG2(__lsx_vilvh_b, src6, src5, src7, src6, vec1, vec3); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst4); + __lsx_vst(tmp0, dst, 16); + + DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst5); + dst += dst_stride; + __lsx_vst(tmp0, dst, 16); + + DUP2_ARG2(__lsx_vilvl_b, src8, src7, src9, src8, vec4, vec6); + DUP2_ARG2(__lsx_vilvh_b, src8, src7, src9, src8, vec5, vec7); + DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst6); + dst += dst_stride; + __lsx_vst(tmp0, dst, 16); + + DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst7); + dst += dst_stride; + __lsx_vst(tmp0, dst, 16); + dst += dst_stride; + + src0 = src4; + src5 = src9; + } +} + +static void common_vt_2t_and_aver_dst_64w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt = (height >> 1); + int32_t src_stride2 = src_stride << 1; + int32_t dst_stride2 = dst_stride << 1; + uint8_t *src_tmp1; + uint8_t *dst_tmp1; + __m128i src0, src1, src2, src3, src4, src5; + __m128i src6, src7, src8, src9, src10, src11, filt0; + __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + __m128i tmp0, tmp1; + + filt0 = __lsx_vldrepl_h(filter, 0); + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src3, src6, + src9); + src += src_stride; + + for (; loop_cnt--;) { + src2 = __lsx_vldx(src, src_stride); + dst1 = __lsx_vldx(dst, dst_stride); + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src1, src4, src7, + src10); + DUP4_ARG2(__lsx_vld, dst, 0, dst, 16, dst, 32, dst, 48, dst0, dst2, dst4, + dst6); + src_tmp1 = (uint8_t *)src + 16; + src5 = __lsx_vldx(src_tmp1, src_stride); + src_tmp1 = src_tmp1 + 16; + src8 = __lsx_vldx(src_tmp1, src_stride); + src_tmp1 = src_tmp1 + 16; + src11 = __lsx_vldx(src_tmp1, src_stride); + + dst_tmp1 = dst + 16; + dst3 = __lsx_vldx(dst_tmp1, dst_stride); + dst_tmp1 = dst + 32; + dst5 = __lsx_vldx(dst_tmp1, dst_stride); + dst_tmp1 = dst + 48; + dst7 = __lsx_vldx(dst_tmp1, dst_stride); + src += src_stride2; + + DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec2); + DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, vec1, vec3); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst0); + __lsx_vst(tmp0, dst, 0); + + DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst1); + __lsx_vstx(tmp0, dst, dst_stride); + + DUP2_ARG2(__lsx_vilvl_b, src4, src3, src5, src4, vec4, vec6); + DUP2_ARG2(__lsx_vilvh_b, src4, src3, src5, src4, vec5, vec7); + DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst2); + __lsx_vst(tmp0, dst, 16); + + dst_tmp1 = dst + 16; + DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst3); + __lsx_vstx(tmp0, dst_tmp1, dst_stride); + + DUP2_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, vec0, vec2); + DUP2_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, vec1, vec3); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst4); + __lsx_vst(tmp0, dst, 32); + + dst_tmp1 = dst_tmp1 + 16; + DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst5); + __lsx_vstx(tmp0, dst_tmp1, dst_stride); + + DUP2_ARG2(__lsx_vilvl_b, src10, src9, src11, src10, vec4, vec6); + DUP2_ARG2(__lsx_vilvh_b, src10, src9, src11, src10, vec5, vec7); + DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst6); + __lsx_vst(tmp0, dst, 48); + + dst_tmp1 = dst_tmp1 + 16; + DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst7); + __lsx_vstx(tmp0, dst_tmp1, dst_stride); + dst += dst_stride2; + + src0 = src2; + src3 = src5; + src6 = src8; + src9 = src11; + } +} + +void vpx_convolve8_avg_vert_lsx(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + const int16_t *const filter_y = filter[y0_q4]; + int8_t cnt, filt_ver[8]; + + assert(y_step_q4 == 16); + assert(((const int32_t *)filter_y)[1] != 0x800000); + + for (cnt = 0; cnt < 8; ++cnt) { + filt_ver[cnt] = filter_y[cnt]; + } + + if (vpx_get_filter_taps(filter_y) == 2) { + switch (w) { + case 4: + common_vt_2t_and_aver_dst_4w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_ver[3], h); + break; + case 8: + common_vt_2t_and_aver_dst_8w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_ver[3], h); + break; + case 16: + common_vt_2t_and_aver_dst_16w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_ver[3], h); + break; + case 32: + common_vt_2t_and_aver_dst_32w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_ver[3], h); + break; + case 64: + common_vt_2t_and_aver_dst_64w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_ver[3], h); + break; + default: + vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); + break; + } + } else { + switch (w) { + case 4: + common_vt_8t_and_aver_dst_4w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_ver, h); + break; + case 8: + common_vt_8t_and_aver_dst_8w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_ver, h); + break; + case 16: + common_vt_8t_and_aver_dst_16w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_ver, h); + + break; + case 32: + common_vt_8t_and_aver_dst_32w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_ver, h); + break; + case 64: + common_vt_8t_and_aver_dst_64w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_ver, h); + break; + default: + vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); + break; + } + } +} diff --git a/vpx_dsp/loongarch/vpx_convolve_copy_lsx.c b/vpx_dsp/loongarch/vpx_convolve_copy_lsx.c new file mode 100644 index 0000000000..398788a43e --- /dev/null +++ b/vpx_dsp/loongarch/vpx_convolve_copy_lsx.c @@ -0,0 +1,438 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include "./vpx_dsp_rtcd.h" +#include "vpx_util/loongson_intrinsics.h" + +static void copy_width8_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, int32_t height) { + int32_t cnt; + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + if ((height % 12) == 0) { + for (cnt = (height / 12); cnt--;) { + src0 = __lsx_vld(src, 0); + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src1, src2, src3, src4); + src += src_stride4; + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src5, src6); + src += src_stride2; + src7 = __lsx_vldx(src, src_stride); + src += src_stride2; + + __lsx_vstelm_d(src0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(src1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(src2, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(src3, dst, 0, 0); + dst += dst_stride; + + __lsx_vstelm_d(src4, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(src5, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(src6, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(src7, dst, 0, 0); + dst += dst_stride; + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + src += src_stride4; + + __lsx_vstelm_d(src0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(src1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(src2, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(src3, dst, 0, 0); + dst += dst_stride; + } + } else if ((height % 8) == 0) { + for (cnt = height >> 3; cnt--;) { + src0 = __lsx_vld(src, 0); + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src1, src2, src3, src4); + src += src_stride4; + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src5, src6); + src += src_stride2; + src7 = __lsx_vldx(src, src_stride); + src += src_stride2; + + __lsx_vstelm_d(src0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(src1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(src2, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(src3, dst, 0, 0); + dst += dst_stride; + + __lsx_vstelm_d(src4, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(src5, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(src6, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(src7, dst, 0, 0); + dst += dst_stride; + } + } else if ((height % 4) == 0) { + for (cnt = (height / 4); cnt--;) { + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + src += src_stride4; + + __lsx_vstelm_d(src0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(src1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(src2, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(src3, dst, 0, 0); + dst += dst_stride; + } + } else if ((height % 2) == 0) { + for (cnt = (height / 2); cnt--;) { + src0 = __lsx_vld(src, 0); + src1 = __lsx_vldx(src, src_stride); + src += src_stride2; + + __lsx_vstelm_d(src0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(src1, dst, 0, 0); + dst += dst_stride; + } + } +} + +static void copy_16multx8mult_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int32_t height, int32_t width) { + int32_t cnt, loop_cnt; + uint8_t *src_tmp; + uint8_t *dst_tmp; + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + for (cnt = (width >> 4); cnt--;) { + src_tmp = (uint8_t *)src; + dst_tmp = dst; + + for (loop_cnt = (height >> 3); loop_cnt--;) { + src0 = __lsx_vld(src_tmp, 0); + DUP4_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src_tmp, + src_stride3, src_tmp, src_stride4, src1, src2, src3, src4); + src_tmp += src_stride4; + DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src5, + src6); + src_tmp += src_stride2; + src7 = __lsx_vldx(src_tmp, src_stride); + src_tmp += src_stride2; + + __lsx_vst(src0, dst_tmp, 0); + dst_tmp += dst_stride; + __lsx_vst(src1, dst_tmp, 0); + dst_tmp += dst_stride; + __lsx_vst(src2, dst_tmp, 0); + dst_tmp += dst_stride; + __lsx_vst(src3, dst_tmp, 0); + dst_tmp += dst_stride; + __lsx_vst(src4, dst_tmp, 0); + dst_tmp += dst_stride; + __lsx_vst(src5, dst_tmp, 0); + dst_tmp += dst_stride; + __lsx_vst(src6, dst_tmp, 0); + dst_tmp += dst_stride; + __lsx_vst(src7, dst_tmp, 0); + dst_tmp += dst_stride; + } + src += 16; + dst += 16; + } +} + +static void copy_width16_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, int32_t height) { + int32_t cnt; + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + if ((height % 12) == 0) { + for (cnt = (height / 12); cnt--;) { + src0 = __lsx_vld(src, 0); + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src1, src2, src3, src4); + src += src_stride4; + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src5, src6); + src += src_stride2; + src7 = __lsx_vldx(src, src_stride); + src += src_stride2; + + __lsx_vst(src0, dst, 0); + dst += dst_stride; + __lsx_vst(src1, dst, 0); + dst += dst_stride; + __lsx_vst(src2, dst, 0); + dst += dst_stride; + __lsx_vst(src3, dst, 0); + dst += dst_stride; + __lsx_vst(src4, dst, 0); + dst += dst_stride; + __lsx_vst(src5, dst, 0); + dst += dst_stride; + __lsx_vst(src6, dst, 0); + dst += dst_stride; + __lsx_vst(src7, dst, 0); + dst += dst_stride; + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + src += src_stride4; + + __lsx_vst(src0, dst, 0); + dst += dst_stride; + __lsx_vst(src1, dst, 0); + dst += dst_stride; + __lsx_vst(src2, dst, 0); + dst += dst_stride; + __lsx_vst(src3, dst, 0); + dst += dst_stride; + } + } else if ((height % 8) == 0) { + copy_16multx8mult_lsx(src, src_stride, dst, dst_stride, height, 16); + } else if ((height % 4) == 0) { + for (cnt = (height >> 2); cnt--;) { + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + src += src_stride4; + + __lsx_vst(src0, dst, 0); + dst += dst_stride; + __lsx_vst(src1, dst, 0); + dst += dst_stride; + __lsx_vst(src2, dst, 0); + dst += dst_stride; + __lsx_vst(src3, dst, 0); + dst += dst_stride; + } + } +} + +static void copy_width32_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, int32_t height) { + int32_t cnt; + uint8_t *src_tmp; + uint8_t *dst_tmp; + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + if ((height % 12) == 0) { + for (cnt = (height / 12); cnt--;) { + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + + src_tmp = (uint8_t *)src + 16; + src4 = __lsx_vld(src_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src5, + src6); + src7 = __lsx_vldx(src_tmp, src_stride3); + src += src_stride4; + + __lsx_vst(src0, dst, 0); + dst += dst_stride; + __lsx_vst(src1, dst, 0); + dst += dst_stride; + __lsx_vst(src2, dst, 0); + dst += dst_stride; + __lsx_vst(src3, dst, 0); + dst += dst_stride; + + dst_tmp = dst + 16; + __lsx_vst(src4, dst_tmp, 0); + dst_tmp += dst_stride; + __lsx_vst(src5, dst_tmp, 0); + dst_tmp += dst_stride; + __lsx_vst(src6, dst_tmp, 0); + dst_tmp += dst_stride; + __lsx_vst(src7, dst_tmp, 0); + dst_tmp += dst_stride; + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + + src_tmp = (uint8_t *)src + 16; + src4 = __lsx_vld(src_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src5, + src6); + src7 = __lsx_vldx(src_tmp, src_stride3); + src += src_stride4; + + __lsx_vst(src0, dst, 0); + dst += dst_stride; + __lsx_vst(src1, dst, 0); + dst += dst_stride; + __lsx_vst(src2, dst, 0); + dst += dst_stride; + __lsx_vst(src3, dst, 0); + dst += dst_stride; + + dst_tmp = dst + 16; + __lsx_vst(src4, dst_tmp, 0); + dst_tmp += dst_stride; + __lsx_vst(src5, dst_tmp, 0); + dst_tmp += dst_stride; + __lsx_vst(src6, dst_tmp, 0); + dst_tmp += dst_stride; + __lsx_vst(src7, dst_tmp, 0); + dst_tmp += dst_stride; + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + + src_tmp = (uint8_t *)src + 16; + src4 = __lsx_vld(src_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src5, + src6); + src7 = __lsx_vldx(src_tmp, src_stride3); + src += src_stride4; + + __lsx_vst(src0, dst, 0); + dst += dst_stride; + __lsx_vst(src1, dst, 0); + dst += dst_stride; + __lsx_vst(src2, dst, 0); + dst += dst_stride; + __lsx_vst(src3, dst, 0); + dst += dst_stride; + + dst_tmp = dst + 16; + __lsx_vst(src4, dst_tmp, 0); + dst_tmp += dst_stride; + __lsx_vst(src5, dst_tmp, 0); + dst_tmp += dst_stride; + __lsx_vst(src6, dst_tmp, 0); + dst_tmp += dst_stride; + __lsx_vst(src7, dst_tmp, 0); + dst_tmp += dst_stride; + } + } else if ((height % 8) == 0) { + copy_16multx8mult_lsx(src, src_stride, dst, dst_stride, height, 32); + } else if ((height % 4) == 0) { + for (cnt = (height >> 2); cnt--;) { + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + + src_tmp = (uint8_t *)src + 16; + src4 = __lsx_vld(src_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src5, + src6); + src7 = __lsx_vldx(src_tmp, src_stride3); + src += src_stride4; + + __lsx_vst(src0, dst, 0); + dst += dst_stride; + __lsx_vst(src1, dst, 0); + dst += dst_stride; + __lsx_vst(src2, dst, 0); + dst += dst_stride; + __lsx_vst(src3, dst, 0); + dst += dst_stride; + + dst_tmp = dst + 16; + __lsx_vst(src4, dst_tmp, 0); + dst_tmp += dst_stride; + __lsx_vst(src5, dst_tmp, 0); + dst_tmp += dst_stride; + __lsx_vst(src6, dst_tmp, 0); + dst_tmp += dst_stride; + __lsx_vst(src7, dst_tmp, 0); + dst_tmp += dst_stride; + } + } +} + +static void copy_width64_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, int32_t height) { + copy_16multx8mult_lsx(src, src_stride, dst, dst_stride, height, 64); +} + +void vpx_convolve_copy_lsx(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int32_t y_step_q4, + int32_t w, int32_t h) { + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + + switch (w) { + case 4: { + uint32_t cnt; + __m128i tmp; + for (cnt = h; cnt--;) { + tmp = __lsx_vldrepl_w(src, 0); + __lsx_vstelm_w(tmp, dst, 0, 0); + src += src_stride; + dst += dst_stride; + } + break; + } + case 8: { + copy_width8_lsx(src, src_stride, dst, dst_stride, h); + break; + } + case 16: { + copy_width16_lsx(src, src_stride, dst, dst_stride, h); + break; + } + case 32: { + copy_width32_lsx(src, src_stride, dst, dst_stride, h); + break; + } + case 64: { + copy_width64_lsx(src, src_stride, dst, dst_stride, h); + break; + } + default: { + uint32_t cnt; + for (cnt = h; cnt--;) { + memcpy(dst, src, w); + src += src_stride; + dst += dst_stride; + } + break; + } + } +} diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index 3eba23c0af..01886b8821 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -165,11 +165,13 @@ DSP_SRCS-$(HAVE_VSX) += ppc/vpx_convolve_vsx.c # common (lsx) DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve8_avg_horiz_lsx.c +DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve8_avg_vert_lsx.c DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve8_horiz_lsx.c DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve8_vert_lsx.c DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve8_lsx.c DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve8_avg_lsx.c DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve_avg_lsx.c +DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve_copy_lsx.c DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve_lsx.h # loop filters diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index d10f3a1408..e5617eea31 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -368,7 +368,7 @@ () # Sub Pixel Filters # add_proto qw/void vpx_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; -specialize qw/vpx_convolve_copy neon dspr2 msa sse2 vsx/; +specialize qw/vpx_convolve_copy neon dspr2 msa sse2 vsx lsx/; add_proto qw/void vpx_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; specialize qw/vpx_convolve_avg neon dspr2 msa sse2 vsx mmi lsx/; @@ -389,7 +389,7 @@ () specialize qw/vpx_convolve8_avg_horiz sse2 ssse3 avx2 neon dspr2 msa vsx mmi lsx/; add_proto qw/void vpx_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; -specialize qw/vpx_convolve8_avg_vert sse2 ssse3 avx2 neon dspr2 msa vsx mmi/; +specialize qw/vpx_convolve8_avg_vert sse2 ssse3 avx2 neon dspr2 msa vsx mmi lsx/; add_proto qw/void vpx_scaled_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; specialize qw/vpx_scaled_2d ssse3 neon msa/; @@ -656,6 +656,7 @@ () specialize qw/vpx_idct32x32_135_add dspr2 msa/; $vpx_idct32x32_135_add_dspr2=vpx_idct32x32_1024_add_dspr2; $vpx_idct32x32_135_add_msa=vpx_idct32x32_1024_add_msa; + $vpx_idct32x32_135_add_lsx=vpx_idct32x32_1024_add_lsx; specialize qw/vpx_idct32x32_34_add dspr2 msa lsx/; specialize qw/vpx_idct32x32_1_add dspr2 msa lsx/; specialize qw/vpx_iwht4x4_16_add msa/; From 618739f59f5d3505ff76a1a82eb198bed4ec989d Mon Sep 17 00:00:00 2001 From: yuanhecai Date: Sun, 3 Apr 2022 18:49:52 +0800 Subject: [PATCH 0341/1000] vp9[loongarch]: Optimize horizontal/vertical_4/dual 1. vpx_lpf_horizontal_4_lsx 2. vpx_lpf_vertical_4_lsx 3. vpx_lpf_horizontal_4_dual_lsx 3. vpx_lpf_vertical_4_dual_lsx Bug: webm:1755 Change-Id: I12e9f27cafd9514b24cfbf2354cc66c7d1238687 --- test/lpf_test.cc | 8 +- vpx_dsp/loongarch/loopfilter_4_lsx.c | 214 +++++++++++++++++++++++++++ vpx_dsp/vpx_dsp.mk | 1 + vpx_dsp/vpx_dsp_rtcd_defs.pl | 8 +- 4 files changed, 226 insertions(+), 5 deletions(-) create mode 100644 vpx_dsp/loongarch/loopfilter_4_lsx.c diff --git a/test/lpf_test.cc b/test/lpf_test.cc index 0bdec77e54..4cc99a6db4 100644 --- a/test/lpf_test.cc +++ b/test/lpf_test.cc @@ -697,17 +697,23 @@ INSTANTIATE_TEST_SUITE_P( INSTANTIATE_TEST_SUITE_P( LSX, Loop8Test6Param, ::testing::Values( + make_tuple(&vpx_lpf_horizontal_4_lsx, &vpx_lpf_horizontal_4_c, 8), make_tuple(&vpx_lpf_horizontal_8_lsx, &vpx_lpf_horizontal_8_c, 8), make_tuple(&vpx_lpf_horizontal_16_dual_lsx, &vpx_lpf_horizontal_16_dual_c, 8), + make_tuple(&vpx_lpf_vertical_4_lsx, &vpx_lpf_vertical_4_c, 8), make_tuple(&vpx_lpf_vertical_8_lsx, &vpx_lpf_vertical_8_c, 8), make_tuple(&vpx_lpf_vertical_16_dual_lsx, &vpx_lpf_vertical_16_dual_c, 8))); INSTANTIATE_TEST_SUITE_P( LSX, Loop8Test9Param, - ::testing::Values(make_tuple(&vpx_lpf_horizontal_8_dual_lsx, + ::testing::Values(make_tuple(&vpx_lpf_horizontal_4_dual_lsx, + &vpx_lpf_horizontal_4_dual_c, 8), + make_tuple(&vpx_lpf_horizontal_8_dual_lsx, &vpx_lpf_horizontal_8_dual_c, 8), + make_tuple(&vpx_lpf_vertical_4_dual_lsx, + &vpx_lpf_vertical_4_dual_c, 8), make_tuple(&vpx_lpf_vertical_8_dual_lsx, &vpx_lpf_vertical_8_dual_c, 8))); #endif // HAVE_LSX && (!CONFIG_VP9_HIGHBITDEPTH) diff --git a/vpx_dsp/loongarch/loopfilter_4_lsx.c b/vpx_dsp/loongarch/loopfilter_4_lsx.c new file mode 100644 index 0000000000..e8abf0523f --- /dev/null +++ b/vpx_dsp/loongarch/loopfilter_4_lsx.c @@ -0,0 +1,214 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/loongarch/loopfilter_lsx.h" + +void vpx_lpf_horizontal_4_lsx(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + __m128i mask, hev, flat, thresh, b_limit, limit; + __m128i p3, p2, p1, p0, q3, q2, q1, q0, p1_out, p0_out, q0_out, q1_out; + int32_t pitch2 = pitch << 1; + int32_t pitch3 = pitch2 + pitch; + int32_t pitch4 = pitch2 << 1; + + DUP4_ARG2(__lsx_vldx, src, -pitch4, src, -pitch3, src, -pitch2, src, -pitch, + p3, p2, p1, p0); + q0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, pitch, src, pitch2, q1, q2); + q3 = __lsx_vldx(src, pitch3); + + thresh = __lsx_vreplgr2vr_b(*thresh_ptr); + b_limit = __lsx_vreplgr2vr_b(*b_limit_ptr); + limit = __lsx_vreplgr2vr_b(*limit_ptr); + + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + + __lsx_vstelm_d(p1_out, src - pitch2, 0, 0); + __lsx_vstelm_d(p0_out, src - pitch, 0, 0); + __lsx_vstelm_d(q0_out, src, 0, 0); + __lsx_vstelm_d(q1_out, src + pitch, 0, 0); +} + +void vpx_lpf_horizontal_4_dual_lsx(uint8_t *src, int32_t pitch, + const uint8_t *b_limit0_ptr, + const uint8_t *limit0_ptr, + const uint8_t *thresh0_ptr, + const uint8_t *b_limit1_ptr, + const uint8_t *limit1_ptr, + const uint8_t *thresh1_ptr) { + __m128i mask, hev, flat, thresh0, b_limit0, limit0, thresh1, b_limit1, limit1; + __m128i p3, p2, p1, p0, q3, q2, q1, q0; + int32_t pitch2 = pitch << 1; + int32_t pitch3 = pitch2 + pitch; + int32_t pitch4 = pitch2 << 1; + + DUP4_ARG2(__lsx_vldx, src, -pitch4, src, -pitch3, src, -pitch2, src, -pitch, + p3, p2, p1, p0); + q0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, pitch, src, pitch2, q1, q2); + q3 = __lsx_vldx(src, pitch3); + + thresh0 = __lsx_vreplgr2vr_b(*thresh0_ptr); + thresh1 = __lsx_vreplgr2vr_b(*thresh1_ptr); + thresh0 = __lsx_vilvl_d(thresh1, thresh0); + + b_limit0 = __lsx_vreplgr2vr_b(*b_limit0_ptr); + b_limit1 = __lsx_vreplgr2vr_b(*b_limit1_ptr); + b_limit0 = __lsx_vilvl_d(b_limit1, b_limit0); + + limit0 = __lsx_vreplgr2vr_b(*limit0_ptr); + limit1 = __lsx_vreplgr2vr_b(*limit1_ptr); + limit0 = __lsx_vilvl_d(limit1, limit0); + + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev, + mask, flat); + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1); + + __lsx_vstx(p1, src, -pitch2); + __lsx_vstx(p0, src, -pitch); + __lsx_vst(q0, src, 0); + __lsx_vstx(q1, src, pitch); +} + +void vpx_lpf_vertical_4_lsx(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + __m128i mask, hev, flat, limit, thresh, b_limit; + __m128i p3, p2, p1, p0, q3, q2, q1, q0; + __m128i vec0, vec1, vec2, vec3; + int32_t pitch2 = pitch << 1; + int32_t pitch3 = pitch2 + pitch; + int32_t pitch4 = pitch2 << 1; + uint8_t *src_tmp = src - 4; + + p3 = __lsx_vld(src_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_tmp, pitch, src_tmp, pitch2, p2, p1); + p0 = __lsx_vldx(src_tmp, pitch3); + src_tmp += pitch4; + q0 = __lsx_vld(src_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_tmp, pitch, src_tmp, pitch2, q1, q2); + q3 = __lsx_vldx(src_tmp, pitch3); + + thresh = __lsx_vreplgr2vr_b(*thresh_ptr); + b_limit = __lsx_vreplgr2vr_b(*b_limit_ptr); + limit = __lsx_vreplgr2vr_b(*limit_ptr); + + LSX_TRANSPOSE8x8_B(p3, p2, p1, p0, q0, q1, q2, q3, p3, p2, p1, p0, q0, q1, q2, + q3); + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1); + DUP2_ARG2(__lsx_vilvl_b, p0, p1, q1, q0, vec0, vec1); + vec2 = __lsx_vilvl_h(vec1, vec0); + vec3 = __lsx_vilvh_h(vec1, vec0); + + src -= 2; + __lsx_vstelm_w(vec2, src, 0, 0); + src += pitch; + __lsx_vstelm_w(vec2, src, 0, 1); + src += pitch; + __lsx_vstelm_w(vec2, src, 0, 2); + src += pitch; + __lsx_vstelm_w(vec2, src, 0, 3); + src += pitch; + + __lsx_vstelm_w(vec3, src, 0, 0); + __lsx_vstelm_w(vec3, src + pitch, 0, 1); + __lsx_vstelm_w(vec3, src + pitch2, 0, 2); + __lsx_vstelm_w(vec3, src + pitch3, 0, 3); +} + +void vpx_lpf_vertical_4_dual_lsx(uint8_t *src, int32_t pitch, + const uint8_t *b_limit0_ptr, + const uint8_t *limit0_ptr, + const uint8_t *thresh0_ptr, + const uint8_t *b_limit1_ptr, + const uint8_t *limit1_ptr, + const uint8_t *thresh1_ptr) { + __m128i mask, hev, flat; + __m128i thresh0, b_limit0, limit0, thresh1, b_limit1, limit1; + __m128i p3, p2, p1, p0, q3, q2, q1, q0; + __m128i row0, row1, row2, row3, row4, row5, row6, row7; + __m128i row8, row9, row10, row11, row12, row13, row14, row15; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; + int32_t pitch2 = pitch << 1; + int32_t pitch3 = pitch2 + pitch; + int32_t pitch4 = pitch2 << 1; + uint8_t *src_tmp = src - 4; + + row0 = __lsx_vld(src_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_tmp, pitch, src_tmp, pitch2, row1, row2); + row3 = __lsx_vldx(src_tmp, pitch3); + src_tmp += pitch4; + row4 = __lsx_vld(src_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_tmp, pitch, src_tmp, pitch2, row5, row6); + row7 = __lsx_vldx(src_tmp, pitch3); + src_tmp += pitch4; + row8 = __lsx_vld(src_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_tmp, pitch, src_tmp, pitch2, row9, row10); + row11 = __lsx_vldx(src_tmp, pitch3); + src_tmp += pitch4; + row12 = __lsx_vld(src_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_tmp, pitch, src_tmp, pitch2, row13, row14); + row15 = __lsx_vldx(src_tmp, pitch3); + + LSX_TRANSPOSE16x8_B(row0, row1, row2, row3, row4, row5, row6, row7, row8, + row9, row10, row11, row12, row13, row14, row15, p3, p2, + p1, p0, q0, q1, q2, q3); + + thresh0 = __lsx_vreplgr2vr_b(*thresh0_ptr); + thresh1 = __lsx_vreplgr2vr_b(*thresh1_ptr); + thresh0 = __lsx_vilvl_d(thresh1, thresh0); + + b_limit0 = __lsx_vreplgr2vr_b(*b_limit0_ptr); + b_limit1 = __lsx_vreplgr2vr_b(*b_limit1_ptr); + b_limit0 = __lsx_vilvl_d(b_limit1, b_limit0); + + limit0 = __lsx_vreplgr2vr_b(*limit0_ptr); + limit1 = __lsx_vreplgr2vr_b(*limit1_ptr); + limit0 = __lsx_vilvl_d(limit1, limit0); + + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev, + mask, flat); + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1); + DUP2_ARG2(__lsx_vilvl_b, p0, p1, q1, q0, tmp0, tmp1); + tmp2 = __lsx_vilvl_h(tmp1, tmp0); + tmp3 = __lsx_vilvh_h(tmp1, tmp0); + DUP2_ARG2(__lsx_vilvh_b, p0, p1, q1, q0, tmp0, tmp1); + tmp4 = __lsx_vilvl_h(tmp1, tmp0); + tmp5 = __lsx_vilvh_h(tmp1, tmp0); + + src -= 2; + __lsx_vstelm_w(tmp2, src, 0, 0); + __lsx_vstelm_w(tmp2, src + pitch, 0, 1); + __lsx_vstelm_w(tmp2, src + pitch2, 0, 2); + __lsx_vstelm_w(tmp2, src + pitch3, 0, 3); + src += pitch4; + __lsx_vstelm_w(tmp3, src, 0, 0); + __lsx_vstelm_w(tmp3, src + pitch, 0, 1); + __lsx_vstelm_w(tmp3, src + pitch2, 0, 2); + __lsx_vstelm_w(tmp3, src + pitch3, 0, 3); + src += pitch4; + __lsx_vstelm_w(tmp4, src, 0, 0); + __lsx_vstelm_w(tmp4, src + pitch, 0, 1); + __lsx_vstelm_w(tmp4, src + pitch2, 0, 2); + __lsx_vstelm_w(tmp4, src + pitch3, 0, 3); + src += pitch4; + __lsx_vstelm_w(tmp5, src, 0, 0); + __lsx_vstelm_w(tmp5, src + pitch, 0, 1); + __lsx_vstelm_w(tmp5, src + pitch2, 0, 2); + __lsx_vstelm_w(tmp5, src + pitch3, 0, 3); +} diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index 01886b8821..ec0c598031 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -203,6 +203,7 @@ DSP_SRCS-$(HAVE_DSPR2) += mips/loopfilter_mb_vert_dspr2.c DSP_SRCS-$(HAVE_LSX) += loongarch/loopfilter_lsx.h DSP_SRCS-$(HAVE_LSX) += loongarch/loopfilter_16_lsx.c DSP_SRCS-$(HAVE_LSX) += loongarch/loopfilter_8_lsx.c +DSP_SRCS-$(HAVE_LSX) += loongarch/loopfilter_4_lsx.c ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_NEON) += arm/highbd_loopfilter_neon.c diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index e5617eea31..76b00e136c 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -451,10 +451,10 @@ () specialize qw/vpx_lpf_vertical_8_dual sse2 neon dspr2 msa lsx/; add_proto qw/void vpx_lpf_vertical_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; -specialize qw/vpx_lpf_vertical_4 sse2 neon dspr2 msa/; +specialize qw/vpx_lpf_vertical_4 sse2 neon dspr2 msa lsx/; add_proto qw/void vpx_lpf_vertical_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; -specialize qw/vpx_lpf_vertical_4_dual sse2 neon dspr2 msa/; +specialize qw/vpx_lpf_vertical_4_dual sse2 neon dspr2 msa lsx/; add_proto qw/void vpx_lpf_horizontal_16/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; specialize qw/vpx_lpf_horizontal_16 sse2 avx2 neon dspr2 msa/; @@ -469,10 +469,10 @@ () specialize qw/vpx_lpf_horizontal_8_dual sse2 neon dspr2 msa lsx/; add_proto qw/void vpx_lpf_horizontal_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; -specialize qw/vpx_lpf_horizontal_4 sse2 neon dspr2 msa/; +specialize qw/vpx_lpf_horizontal_4 sse2 neon dspr2 msa lsx/; add_proto qw/void vpx_lpf_horizontal_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; -specialize qw/vpx_lpf_horizontal_4_dual sse2 neon dspr2 msa/; +specialize qw/vpx_lpf_horizontal_4_dual sse2 neon dspr2 msa lsx/; } #CONFIG_VP9 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { From 76b7350cee4a4f047c813134dba33594d0b2785b Mon Sep 17 00:00:00 2001 From: yuanhecai Date: Fri, 8 Apr 2022 15:00:33 +0800 Subject: [PATCH 0342/1000] vp9[loongarch]: Optimize sub_pixel_variance32x32/sad16x16 1. vpx_sad16x16_lsx 2. vpx_sub_pixel_variance32x32_lsx Bug: webm:1755 Change-Id: I9926ace710903993ccbb42caef320fa895e90127 --- test/sad_test.cc | 1 + test/variance_test.cc | 3 + vpx_dsp/loongarch/sad_lsx.c | 35 ++- vpx_dsp/loongarch/sub_pixel_variance_lsx.c | 348 +++++++++++++++++++++ vpx_dsp/vpx_dsp.mk | 1 + vpx_dsp/vpx_dsp_rtcd_defs.pl | 4 +- 6 files changed, 389 insertions(+), 3 deletions(-) create mode 100644 vpx_dsp/loongarch/sub_pixel_variance_lsx.c diff --git a/test/sad_test.cc b/test/sad_test.cc index aec4cbc380..e4952ba9f7 100644 --- a/test/sad_test.cc +++ b/test/sad_test.cc @@ -1135,6 +1135,7 @@ INSTANTIATE_TEST_SUITE_P(MMI, SADx4Test, ::testing::ValuesIn(x4d_mmi_tests)); const SadMxNParam lsx_tests[] = { SadMxNParam(64, 64, &vpx_sad64x64_lsx), SadMxNParam(32, 32, &vpx_sad32x32_lsx), + SadMxNParam(16, 16, &vpx_sad16x16_lsx), }; INSTANTIATE_TEST_SUITE_P(LSX, SADTest, ::testing::ValuesIn(lsx_tests)); diff --git a/test/variance_test.cc b/test/variance_test.cc index 8060875197..6872ca2710 100644 --- a/test/variance_test.cc +++ b/test/variance_test.cc @@ -1655,5 +1655,8 @@ INSTANTIATE_TEST_SUITE_P( LSX, VpxVarianceTest, ::testing::Values(VarianceParams(6, 6, &vpx_variance64x64_lsx), VarianceParams(5, 5, &vpx_variance32x32_lsx))); +INSTANTIATE_TEST_SUITE_P(LSX, VpxSubpelVarianceTest, + ::testing::Values(SubpelVarianceParams( + 5, 5, &vpx_sub_pixel_variance32x32_lsx, 0))); #endif } // namespace diff --git a/vpx_dsp/loongarch/sad_lsx.c b/vpx_dsp/loongarch/sad_lsx.c index 59b268ca1f..cd3f2d46bb 100644 --- a/vpx_dsp/loongarch/sad_lsx.c +++ b/vpx_dsp/loongarch/sad_lsx.c @@ -46,6 +46,33 @@ sum_m; \ }) +static uint32_t sad_16width_lsx(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height) { + int32_t ht_cnt = (height >> 2); + __m128i src0, src1, ref0, ref1, sad_tmp; + __m128i sad = __lsx_vldi(0); + int32_t src_stride2 = src_stride << 1; + int32_t ref_stride2 = ref_stride << 1; + + for (; ht_cnt--;) { + DUP2_ARG2(__lsx_vld, src, 0, ref, 0, src0, ref0); + DUP2_ARG2(__lsx_vldx, src, src_stride, ref, ref_stride, src1, ref1); + src += src_stride2; + ref += ref_stride2; + sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad = __lsx_vadd_h(sad, sad_tmp); + + DUP2_ARG2(__lsx_vld, src, 0, ref, 0, src0, ref0); + DUP2_ARG2(__lsx_vldx, src, src_stride, ref, ref_stride, src1, ref1); + src += src_stride2; + ref += ref_stride2; + sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad = __lsx_vadd_h(sad, sad_tmp); + } + return HADD_UH_U32(sad); +} + static uint32_t sad_32width_lsx(const uint8_t *src, int32_t src_stride, const uint8_t *ref, int32_t ref_stride, int32_t height) { @@ -328,6 +355,12 @@ static void sad_64width_x4d_lsx(const uint8_t *src, int32_t src_stride, sad_array[3] = HADD_UW_U32(sad); } +#define VPX_SAD_16xHEIGHT_LSX(height) \ + uint32_t vpx_sad16x##height##_lsx(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride) { \ + return sad_16width_lsx(src, src_stride, ref, ref_stride, height); \ + } + #define VPX_SAD_32xHT_LSX(height) \ uint32_t vpx_sad32x##height##_lsx(const uint8_t *src, int32_t src_stride, \ const uint8_t *ref, int32_t ref_stride) { \ @@ -369,7 +402,7 @@ SAD64 SAD32 -#define SAD16 VPX_SAD_16xHTx4D_LSX(16) +#define SAD16 VPX_SAD_16xHEIGHT_LSX(16) VPX_SAD_16xHTx4D_LSX(16) SAD16 diff --git a/vpx_dsp/loongarch/sub_pixel_variance_lsx.c b/vpx_dsp/loongarch/sub_pixel_variance_lsx.c new file mode 100644 index 0000000000..0a0486479a --- /dev/null +++ b/vpx_dsp/loongarch/sub_pixel_variance_lsx.c @@ -0,0 +1,348 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_ports/mem.h" +#include "vpx_util/loongson_intrinsics.h" +#include "vpx_dsp/variance.h" + +#define HADD_SW_S32(in0, in1) \ + do { \ + __m128i res0_m; \ + \ + res0_m = __lsx_vhaddw_d_w(in0, in0); \ + res0_m = __lsx_vhaddw_q_d(res0_m, res0_m); \ + in1 = __lsx_vpickve2gr_w(res0_m, 0); \ + } while (0) + +#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift, in2) \ + do { \ + __m128i tmp0_m, tmp1_m; \ + \ + tmp0_m = __lsx_vshuf_b(in1, in0, mask); \ + tmp1_m = __lsx_vdp2_h_bu(tmp0_m, coeff); \ + in2 = __lsx_vsrari_h(tmp1_m, shift); \ + } while (0) + +#define CALC_MSE_AVG_B(src, ref, var, sub) \ + { \ + __m128i src_l0_m, src_l1_m; \ + __m128i res_l0_m, res_l1_m; \ + \ + src_l0_m = __lsx_vilvl_b(src, ref); \ + src_l1_m = __lsx_vilvh_b(src, ref); \ + DUP2_ARG2(__lsx_vhsubw_hu_bu, src_l0_m, src_l0_m, src_l1_m, src_l1_m, \ + res_l0_m, res_l1_m); \ + var = __lsx_vdp2add_w_h(var, res_l0_m, res_l0_m); \ + var = __lsx_vdp2add_w_h(var, res_l1_m, res_l1_m); \ + sub = __lsx_vadd_h(sub, res_l0_m); \ + sub = __lsx_vadd_h(sub, res_l1_m); \ + } + +static const uint8_t bilinear_filters_lsx[8][2] = { + { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 }, + { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 }, +}; + +#define VARIANCE_LARGE_WxH(sse, diff, shift) \ + (sse) - (((int64_t)(diff) * (diff)) >> (shift)) + +static uint32_t sub_pixel_sse_diff_16width_h_lsx( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { + uint32_t loop_cnt = (height >> 2); + int32_t res; + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + __m128i dst0, dst1, dst2, dst3, filt0; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + __m128i out0, out1, out2, out3, out4, out5, out6, out7; + __m128i vec, var = __lsx_vldi(0); + __m128i avg = var; + __m128i mask = { 0x0403030202010100, 0x0807070606050504 }; + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + int32_t dst_stride4 = dst_stride2 << 1; + + filt0 = __lsx_vldrepl_h(filter, 0); + + for (; loop_cnt--;) { + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1); + src += src_stride; + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src2, src3); + src += src_stride; + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src4, src5); + src += src_stride; + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src6, src7); + src += src_stride; + + dst0 = __lsx_vld(dst, 0); + DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, dst1, dst2); + dst3 = __lsx_vldx(dst, dst_stride3); + dst += dst_stride4; + + DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1); + DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3); + DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, vec4, vec5); + DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask, src7, src7, mask, vec6, vec7); + + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, + filt0, out0, out1, out2, out3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7, + filt0, out4, out5, out6, out7); + DUP4_ARG3(__lsx_vssrarni_bu_h, out1, out0, FILTER_BITS, out3, out2, + FILTER_BITS, out5, out4, FILTER_BITS, out7, out6, FILTER_BITS, + src0, src1, src2, src3); + CALC_MSE_AVG_B(src0, dst0, var, avg); + CALC_MSE_AVG_B(src1, dst1, var, avg); + CALC_MSE_AVG_B(src2, dst2, var, avg); + CALC_MSE_AVG_B(src3, dst3, var, avg); + } + vec = __lsx_vhaddw_w_h(avg, avg); + HADD_SW_S32(vec, *diff); + HADD_SW_S32(var, res); + + return res; +} + +static uint32_t sub_pixel_sse_diff_32width_h_lsx( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { + uint32_t sse = 0; + int32_t diff0[2]; + + sse += sub_pixel_sse_diff_16width_h_lsx(src, src_stride, dst, dst_stride, + filter, height, &diff0[0]); + src += 16; + dst += 16; + + sse += sub_pixel_sse_diff_16width_h_lsx(src, src_stride, dst, dst_stride, + filter, height, &diff0[1]); + + *diff = diff0[0] + diff0[1]; + + return sse; +} + +static uint32_t sub_pixel_sse_diff_16width_v_lsx( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { + uint32_t loop_cnt = (height >> 2); + int32_t res; + __m128i ref0, ref1, ref2, ref3, src0, src1, src2, src3, src4; + __m128i out0, out1, out2, out3, tmp0, tmp1, filt0, vec; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + __m128i var = __lsx_vldi(0); + __m128i avg = var; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + int32_t dst_stride4 = dst_stride2 << 1; + + filt0 = __lsx_vldrepl_h(filter, 0); + + src0 = __lsx_vld(src, 0); + src += src_stride; + + for (; loop_cnt--;) { + src1 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3); + src4 = __lsx_vldx(src, src_stride3); + src += src_stride4; + ref0 = __lsx_vld(dst, 0); + DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, ref1, ref2); + ref3 = __lsx_vldx(dst, dst_stride3); + dst += dst_stride4; + + DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec2); + DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, vec1, vec3); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1); + out0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + + DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, vec4, vec6); + DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, vec5, vec7); + DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1); + out1 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + + DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1); + out2 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1); + out3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + + src0 = src4; + + CALC_MSE_AVG_B(out0, ref0, var, avg); + CALC_MSE_AVG_B(out1, ref1, var, avg); + CALC_MSE_AVG_B(out2, ref2, var, avg); + CALC_MSE_AVG_B(out3, ref3, var, avg); + } + vec = __lsx_vhaddw_w_h(avg, avg); + HADD_SW_S32(vec, *diff); + HADD_SW_S32(var, res); + + return res; +} + +static uint32_t sub_pixel_sse_diff_32width_v_lsx( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { + uint32_t sse = 0; + int32_t diff0[2]; + + sse += sub_pixel_sse_diff_16width_v_lsx(src, src_stride, dst, dst_stride, + filter, height, &diff0[0]); + src += 16; + dst += 16; + + sse += sub_pixel_sse_diff_16width_v_lsx(src, src_stride, dst, dst_stride, + filter, height, &diff0[1]); + + *diff = diff0[0] + diff0[1]; + + return sse; +} + +static uint32_t sub_pixel_sse_diff_16width_hv_lsx( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert, + int32_t height, int32_t *diff) { + uint32_t loop_cnt = (height >> 2); + int32_t res; + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + __m128i ref0, ref1, ref2, ref3, filt_hz, filt_vt, vec0, vec1; + __m128i hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1, vec; + __m128i var = __lsx_vldi(0); + __m128i avg = var; + __m128i mask = { 0x0403030202010100, 0x0807070606050504 }; + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + int32_t dst_stride4 = dst_stride2 << 1; + + filt_hz = __lsx_vldrepl_h(filter_horiz, 0); + filt_vt = __lsx_vldrepl_h(filter_vert, 0); + + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1); + src += src_stride; + + HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS, hz_out0); + HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS, hz_out2); + + for (; loop_cnt--;) { + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1); + src += src_stride; + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src2, src3); + src += src_stride; + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src4, src5); + src += src_stride; + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src6, src7); + src += src_stride; + + ref0 = __lsx_vld(dst, 0); + DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, ref1, ref2); + ref3 = __lsx_vldx(dst, dst_stride3); + dst += dst_stride4; + + HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS, hz_out1); + HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS, hz_out3); + DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); + src0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + + HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS, hz_out0); + HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS, hz_out2); + DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); + src1 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + + HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS, hz_out1); + HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS, hz_out3); + DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); + src2 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + + HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS, hz_out0); + HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS, hz_out2); + DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); + src3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + CALC_MSE_AVG_B(src2, ref2, var, avg); + CALC_MSE_AVG_B(src3, ref3, var, avg); + } + vec = __lsx_vhaddw_w_h(avg, avg); + HADD_SW_S32(vec, *diff); + HADD_SW_S32(var, res); + + return res; +} + +static uint32_t sub_pixel_sse_diff_32width_hv_lsx( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert, + int32_t height, int32_t *diff) { + uint32_t sse = 0; + int32_t diff0[2]; + + sse += sub_pixel_sse_diff_16width_hv_lsx(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert, height, + &diff0[0]); + src += 16; + dst += 16; + + sse += sub_pixel_sse_diff_16width_hv_lsx(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert, height, + &diff0[1]); + + *diff = diff0[0] + diff0[1]; + + return sse; +} + +#define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10) + +#define VPX_SUB_PIXEL_VARIANCE_WDXHT_LSX(wd, ht) \ + uint32_t vpx_sub_pixel_variance##wd##x##ht##_lsx( \ + const uint8_t *src, int32_t src_stride, int32_t x_offset, \ + int32_t y_offset, const uint8_t *ref, int32_t ref_stride, \ + uint32_t *sse) { \ + int32_t diff; \ + uint32_t var; \ + const uint8_t *h_filter = bilinear_filters_lsx[x_offset]; \ + const uint8_t *v_filter = bilinear_filters_lsx[y_offset]; \ + \ + if (y_offset) { \ + if (x_offset) { \ + *sse = sub_pixel_sse_diff_##wd##width_hv_lsx( \ + src, src_stride, ref, ref_stride, h_filter, v_filter, ht, &diff); \ + } else { \ + *sse = sub_pixel_sse_diff_##wd##width_v_lsx( \ + src, src_stride, ref, ref_stride, v_filter, ht, &diff); \ + } \ + \ + var = VARIANCE_##wd##Wx##ht##H(*sse, diff); \ + } else { \ + if (x_offset) { \ + *sse = sub_pixel_sse_diff_##wd##width_h_lsx( \ + src, src_stride, ref, ref_stride, h_filter, ht, &diff); \ + \ + var = VARIANCE_##wd##Wx##ht##H(*sse, diff); \ + } else { \ + var = vpx_variance##wd##x##ht##_lsx(src, src_stride, ref, ref_stride, \ + sse); \ + } \ + } \ + \ + return var; \ + } + +VPX_SUB_PIXEL_VARIANCE_WDXHT_LSX(32, 32) diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index ec0c598031..5c3ffe97d4 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -399,6 +399,7 @@ DSP_SRCS-$(HAVE_MSA) += mips/variance_msa.c DSP_SRCS-$(HAVE_MSA) += mips/sub_pixel_variance_msa.c DSP_SRCS-$(HAVE_LSX) += loongarch/variance_lsx.c +DSP_SRCS-$(HAVE_LSX) += loongarch/sub_pixel_variance_lsx.c DSP_SRCS-$(HAVE_MMI) += mips/variance_mmi.c diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 76b00e136c..932099243f 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -754,7 +754,7 @@ () specialize qw/vpx_sad16x32 neon msa sse2 vsx mmi/; add_proto qw/unsigned int vpx_sad16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad16x16 neon msa sse2 vsx mmi/; +specialize qw/vpx_sad16x16 neon msa sse2 vsx mmi lsx/; add_proto qw/unsigned int vpx_sad16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; specialize qw/vpx_sad16x8 neon msa sse2 vsx mmi/; @@ -1162,7 +1162,7 @@ () specialize qw/vpx_sub_pixel_variance32x64 neon msa mmi sse2 ssse3/; add_proto qw/uint32_t vpx_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_sub_pixel_variance32x32 avx2 neon msa mmi sse2 ssse3/; + specialize qw/vpx_sub_pixel_variance32x32 avx2 neon msa mmi sse2 ssse3 lsx/; add_proto qw/uint32_t vpx_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_sub_pixel_variance32x16 neon msa mmi sse2 ssse3/; From 192c85c4312f84eefc2bcc92b7fa7e8a685c5700 Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 25 Apr 2022 11:20:48 -0700 Subject: [PATCH 0343/1000] add_noise_test.cc: remove stale TODO this was completed in: 0dc69c70f postproc : fix function parameters for noise functions. Change-Id: I84f789ca333e9690e70e696d44475dd59339593b --- test/add_noise_test.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/test/add_noise_test.cc b/test/add_noise_test.cc index 25de4279c2..7dc86e3eb6 100644 --- a/test/add_noise_test.cc +++ b/test/add_noise_test.cc @@ -23,7 +23,6 @@ namespace { static const int kNoiseSize = 3072; -// TODO(jimbankoski): make width and height integers not unsigned. typedef void (*AddNoiseFunc)(uint8_t *start, const int8_t *noise, int blackclamp, int whiteclamp, int width, int height, int pitch); From d18407a171ef7a0108f961c12794ddb32ad5c9ab Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 25 Apr 2022 15:12:02 -0700 Subject: [PATCH 0344/1000] register_state_check.h: add compiler barrier around ASM_REGISTER_STATE_CHECK() this helps keep the call ordering consistent avoiding some code reordering which may affect the registers being checked fixes issue with armv7 and multiple versions of gcc: [ RUN ] C/AddNoiseTest.CheckNoiseAdded/0 test/register_state_check.h:116: Failure Expected equality of these values: pre_store_[i] Which is: 0 post_store[i] Which is: 4618441417868443648 Bug: webm:1760 Change-Id: Ib8bcefd2c4d263f9fc4d4b4d4ffb853fe89d1152 Fixed: webm:1760 --- test/register_state_check.h | 34 ++++++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/test/register_state_check.h b/test/register_state_check.h index 4366466378..1746240c61 100644 --- a/test/register_state_check.h +++ b/test/register_state_check.h @@ -35,6 +35,7 @@ #ifndef WIN32_LEAN_AND_MEAN #define WIN32_LEAN_AND_MEAN #endif +#include #include #include @@ -81,10 +82,13 @@ class RegisterStateCheck { CONTEXT pre_context_; }; -#define ASM_REGISTER_STATE_CHECK(statement) \ - do { \ - libvpx_test::RegisterStateCheck reg_check; \ - statement; \ +#define ASM_REGISTER_STATE_CHECK(statement) \ + do { \ + { \ + libvpx_test::RegisterStateCheck reg_check; \ + statement; \ + } \ + _ReadWriteBarrier(); \ } while (false) } // namespace libvpx_test @@ -121,11 +125,22 @@ class RegisterStateCheck { int64_t pre_store_[8]; }; +#if defined(__GNUC__) +#define ASM_REGISTER_STATE_CHECK(statement) \ + do { \ + { \ + libvpx_test::RegisterStateCheck reg_check; \ + statement; \ + } \ + __asm__ volatile("" ::: "memory"); \ + } while (false) +#else #define ASM_REGISTER_STATE_CHECK(statement) \ do { \ libvpx_test::RegisterStateCheck reg_check; \ statement; \ } while (false) +#endif } // namespace libvpx_test @@ -169,10 +184,13 @@ class RegisterStateCheckMMX { uint16_t pre_fpu_env_[14]; }; -#define API_REGISTER_STATE_CHECK(statement) \ - do { \ - libvpx_test::RegisterStateCheckMMX reg_check; \ - ASM_REGISTER_STATE_CHECK(statement); \ +#define API_REGISTER_STATE_CHECK(statement) \ + do { \ + { \ + libvpx_test::RegisterStateCheckMMX reg_check; \ + ASM_REGISTER_STATE_CHECK(statement); \ + } \ + __asm__ volatile("" ::: "memory"); \ } while (false) } // namespace libvpx_test From f6de5b51b8338ebd743a465e84d2c4b73cc29082 Mon Sep 17 00:00:00 2001 From: yuanhecai Date: Tue, 5 Apr 2022 18:17:19 +0800 Subject: [PATCH 0345/1000] vp9[loongarch]: Optimize fdct/get/variance16x16 1. vpx_fdct16x16_lsx 2. vpx_get16x16var_lsx 3. vpx_variance16x16_lsx Bug: webm:1755 Change-Id: I27090406dc28cfdca64760fea4bc16ae11b74628 --- test/dct16x16_test.cc | 7 + test/dct_test.cc | 16 ++ test/variance_test.cc | 4 +- vpx_dsp/loongarch/fwd_txfm_lsx.c | 258 ++++++++++++++++++++++++++++ vpx_dsp/loongarch/fwd_txfm_lsx.h | 166 ++++++++++++++++++ vpx_dsp/loongarch/txfm_macros_lsx.h | 8 + vpx_dsp/loongarch/variance_lsx.c | 54 +++++- vpx_dsp/vpx_dsp.mk | 1 + vpx_dsp/vpx_dsp_rtcd_defs.pl | 6 +- 9 files changed, 514 insertions(+), 6 deletions(-) create mode 100644 vpx_dsp/loongarch/fwd_txfm_lsx.c diff --git a/test/dct16x16_test.cc b/test/dct16x16_test.cc index c04880ec95..06837d809d 100644 --- a/test/dct16x16_test.cc +++ b/test/dct16x16_test.cc @@ -868,4 +868,11 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(make_tuple(&vpx_fdct16x16_c, &vpx_idct16x16_256_add_vsx, 0, VPX_BITS_8))); #endif // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE + +#if HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE +INSTANTIATE_TEST_SUITE_P(LSX, Trans16x16DCT, + ::testing::Values(make_tuple(&vpx_fdct16x16_lsx, + &vpx_idct16x16_256_add_c, + 0, VPX_BITS_8))); +#endif // HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE } // namespace diff --git a/test/dct_test.cc b/test/dct_test.cc index 20e081a24c..6178f8e2cf 100644 --- a/test/dct_test.cc +++ b/test/dct_test.cc @@ -586,6 +586,21 @@ INSTANTIATE_TEST_SUITE_P(VSX, TransDCT, VPX_BITS_8))); #endif // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && +#if HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH +static const FuncInfo dct_lsx_func_info[2] = { + { &fdct_wrapper, &idct_wrapper, + 16, 1 }, + { &fdct_wrapper, &idct_wrapper, + 32, 1 } +}; + +INSTANTIATE_TEST_SUITE_P( + LSX, TransDCT, + ::testing::Combine(::testing::Range(0, 2), + ::testing::Values(dct_lsx_func_info), + ::testing::Values(0), ::testing::Values(VPX_BITS_8))); +#endif // HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH + #endif // !CONFIG_EMULATE_HARDWARE /* -------------------------------------------------------------------------- */ @@ -756,4 +771,5 @@ INSTANTIATE_TEST_SUITE_P(VSX, TransWHT, ::testing::Values(make_tuple(0, &wht_vsx_func_info, 0, VPX_BITS_8))); #endif // HAVE_VSX && !CONFIG_EMULATE_HARDWARE + } // namespace diff --git a/test/variance_test.cc b/test/variance_test.cc index 6872ca2710..11983bb8ac 100644 --- a/test/variance_test.cc +++ b/test/variance_test.cc @@ -1654,7 +1654,9 @@ INSTANTIATE_TEST_SUITE_P( INSTANTIATE_TEST_SUITE_P( LSX, VpxVarianceTest, ::testing::Values(VarianceParams(6, 6, &vpx_variance64x64_lsx), - VarianceParams(5, 5, &vpx_variance32x32_lsx))); + VarianceParams(5, 5, &vpx_variance32x32_lsx), + VarianceParams(4, 4, &vpx_variance16x16_lsx))); + INSTANTIATE_TEST_SUITE_P(LSX, VpxSubpelVarianceTest, ::testing::Values(SubpelVarianceParams( 5, 5, &vpx_sub_pixel_variance32x32_lsx, 0))); diff --git a/vpx_dsp/loongarch/fwd_txfm_lsx.c b/vpx_dsp/loongarch/fwd_txfm_lsx.c new file mode 100644 index 0000000000..03f194b433 --- /dev/null +++ b/vpx_dsp/loongarch/fwd_txfm_lsx.c @@ -0,0 +1,258 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/loongarch/fwd_txfm_lsx.h" + +#if !CONFIG_VP9_HIGHBITDEPTH +void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr, + int32_t src_stride) { + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + __m128i in8, in9, in10, in11, in12, in13, in14, in15; + __m128i stp21, stp22, stp23, stp24, stp25, stp26, stp30; + __m128i stp31, stp32, stp33, stp34, stp35, stp36, stp37; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, cnst0, cnst1, cnst4, cnst5; + __m128i coeff = { 0x187e3b21d2bf2d41, 0x238e3537e782c4df }; + __m128i coeff1 = { 0x289a317906463fb1, 0x12943d3f1e2b3871 }; + __m128i coeff2 = { 0xed6cd766c78fc04f, 0x0 }; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride4 = src_stride2 << 1; + int32_t src_stride6 = src_stride4 + src_stride2; + int32_t src_stride8 = src_stride4 << 1; + int16_t *input_tmp = (int16_t *)input; + in0 = __lsx_vld(input_tmp, 0); + DUP4_ARG2(__lsx_vldx, input_tmp, src_stride2, input_tmp, src_stride4, + input_tmp, src_stride6, input_tmp, src_stride8, in1, in2, in3, in4); + input_tmp += src_stride4; + DUP4_ARG2(__lsx_vldx, input_tmp, src_stride2, input_tmp, src_stride4, + input_tmp, src_stride6, input_tmp, src_stride8, in5, in6, in7, in8); + input_tmp += src_stride4; + DUP4_ARG2(__lsx_vldx, input_tmp, src_stride2, input_tmp, src_stride4, + input_tmp, src_stride6, input_tmp, src_stride8, in9, in10, in11, + in12); + input_tmp += src_stride4; + DUP2_ARG2(__lsx_vldx, input_tmp, src_stride2, input_tmp, src_stride4, in13, + in14); + input_tmp += src_stride2; + in15 = __lsx_vldx(input_tmp, src_stride2); + + DUP4_ARG2(__lsx_vslli_h, in0, 2, in1, 2, in2, 2, in3, 2, in0, in1, in2, in3); + DUP4_ARG2(__lsx_vslli_h, in4, 2, in5, 2, in6, 2, in7, 2, in4, in5, in6, in7); + DUP4_ARG2(__lsx_vslli_h, in8, 2, in9, 2, in10, 2, in11, 2, in8, in9, in10, + in11); + DUP4_ARG2(__lsx_vslli_h, in12, 2, in13, 2, in14, 2, in15, 2, in12, in13, in14, + in15); + DUP4_ARG2(__lsx_vadd_h, in0, in15, in1, in14, in2, in13, in3, in12, tmp0, + tmp1, tmp2, tmp3); + DUP4_ARG2(__lsx_vadd_h, in4, in11, in5, in10, in6, in9, in7, in8, tmp4, tmp5, + tmp6, tmp7); + FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp0, tmp1, + tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); + __lsx_vst(tmp0, tmp_ptr, 0); + __lsx_vst(tmp1, tmp_ptr, 64); + __lsx_vst(tmp2, tmp_ptr, 128); + __lsx_vst(tmp3, tmp_ptr, 192); + __lsx_vst(tmp4, tmp_ptr, 256); + __lsx_vst(tmp5, tmp_ptr, 320); + __lsx_vst(tmp6, tmp_ptr, 384); + __lsx_vst(tmp7, tmp_ptr, 448); + DUP4_ARG2(__lsx_vsub_h, in0, in15, in1, in14, in2, in13, in3, in12, in15, + in14, in13, in12); + DUP4_ARG2(__lsx_vsub_h, in4, in11, in5, in10, in6, in9, in7, in8, in11, in10, + in9, in8); + + tmp_ptr += 16; + + /* stp 1 */ + DUP2_ARG2(__lsx_vilvh_h, in10, in13, in11, in12, vec2, vec4); + DUP2_ARG2(__lsx_vilvl_h, in10, in13, in11, in12, vec3, vec5); + + cnst4 = __lsx_vreplvei_h(coeff, 0); + DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst4, stp25); + + cnst5 = __lsx_vreplvei_h(coeff, 1); + cnst5 = __lsx_vpackev_h(cnst5, cnst4); + DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst5, stp22); + DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst4, stp24); + DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst5, stp23); + + /* stp2 */ + LSX_BUTTERFLY_4_H(in8, in9, stp22, stp23, stp30, stp31, stp32, stp33); + LSX_BUTTERFLY_4_H(in15, in14, stp25, stp24, stp37, stp36, stp35, stp34); + DUP2_ARG2(__lsx_vilvh_h, stp36, stp31, stp35, stp32, vec2, vec4); + DUP2_ARG2(__lsx_vilvl_h, stp36, stp31, stp35, stp32, vec3, vec5); + DUP2_ARG2(__lsx_vreplvei_h, coeff, 2, coeff, 3, cnst0, cnst1); + cnst0 = __lsx_vpackev_h(cnst0, cnst1); + DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst0, stp26); + + cnst0 = __lsx_vreplvei_h(coeff, 4); + cnst1 = __lsx_vpackev_h(cnst1, cnst0); + DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst1, stp21); + + LSX_BUTTERFLY_4_H(stp30, stp37, stp26, stp21, in8, in15, in14, in9); + vec1 = __lsx_vilvl_h(in15, in8); + vec0 = __lsx_vilvh_h(in15, in8); + + DUP2_ARG2(__lsx_vreplvei_h, coeff1, 0, coeff1, 1, cnst0, cnst1); + cnst0 = __lsx_vpackev_h(cnst0, cnst1); + + DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0, in8); + __lsx_vst(in8, tmp_ptr, 0); + + cnst0 = __lsx_vreplvei_h(coeff2, 0); + cnst0 = __lsx_vpackev_h(cnst1, cnst0); + DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0, in8); + __lsx_vst(in8, tmp_ptr, 448); + + vec1 = __lsx_vilvl_h(in14, in9); + vec0 = __lsx_vilvh_h(in14, in9); + DUP2_ARG2(__lsx_vreplvei_h, coeff1, 2, coeff1, 3, cnst0, cnst1); + cnst1 = __lsx_vpackev_h(cnst1, cnst0); + + DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst1, in8); + __lsx_vst(in8, tmp_ptr, 256); + + cnst1 = __lsx_vreplvei_h(coeff2, 2); + cnst0 = __lsx_vpackev_h(cnst0, cnst1); + DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0, in8); + __lsx_vst(in8, tmp_ptr, 192); + + DUP2_ARG2(__lsx_vreplvei_h, coeff, 2, coeff, 5, cnst0, cnst1); + cnst1 = __lsx_vpackev_h(cnst1, cnst0); + DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst1, stp25); + + cnst1 = __lsx_vreplvei_h(coeff, 3); + cnst1 = __lsx_vpackev_h(cnst0, cnst1); + DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst1, stp22); + + /* stp4 */ + DUP2_ARG2(__lsx_vadd_h, stp34, stp25, stp33, stp22, in13, in10); + + vec1 = __lsx_vilvl_h(in13, in10); + vec0 = __lsx_vilvh_h(in13, in10); + DUP2_ARG2(__lsx_vreplvei_h, coeff1, 4, coeff1, 5, cnst0, cnst1); + cnst0 = __lsx_vpackev_h(cnst0, cnst1); + DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0, in8); + __lsx_vst(in8, tmp_ptr, 128); + + cnst0 = __lsx_vreplvei_h(coeff2, 1); + cnst0 = __lsx_vpackev_h(cnst1, cnst0); + DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0, in8); + __lsx_vst(in8, tmp_ptr, 320); + + DUP2_ARG2(__lsx_vsub_h, stp34, stp25, stp33, stp22, in12, in11); + vec1 = __lsx_vilvl_h(in12, in11); + vec0 = __lsx_vilvh_h(in12, in11); + DUP2_ARG2(__lsx_vreplvei_h, coeff1, 6, coeff1, 7, cnst0, cnst1); + cnst1 = __lsx_vpackev_h(cnst1, cnst0); + + DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst1, in8); + __lsx_vst(in8, tmp_ptr, 384); + + cnst1 = __lsx_vreplvei_h(coeff2, 3); + cnst0 = __lsx_vpackev_h(cnst0, cnst1); + DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0, in8); + __lsx_vst(in8, tmp_ptr, 64); +} + +void fdct16x8_1d_row(int16_t *input, int16_t *output) { + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + __m128i in8, in9, in10, in11, in12, in13, in14, in15; + int16_t *input_tmp = input; + + DUP4_ARG2(__lsx_vld, input, 0, input, 32, input, 64, input, 96, in0, in1, in2, + in3); + DUP4_ARG2(__lsx_vld, input, 128, input, 160, input, 192, input, 224, in4, in5, + in6, in7); + DUP4_ARG2(__lsx_vld, input_tmp, 16, input_tmp, 48, input_tmp, 80, input_tmp, + 112, in8, in9, in10, in11); + DUP4_ARG2(__lsx_vld, input_tmp, 144, input_tmp, 176, input_tmp, 208, + input_tmp, 240, in12, in13, in14, in15); + + LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + LSX_TRANSPOSE8x8_H(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, + in10, in11, in12, in13, in14, in15); + DUP4_ARG2(__lsx_vaddi_hu, in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3); + DUP4_ARG2(__lsx_vaddi_hu, in4, 1, in5, 1, in6, 1, in7, 1, in4, in5, in6, in7); + DUP4_ARG2(__lsx_vaddi_hu, in8, 1, in9, 1, in10, 1, in11, 1, in8, in9, in10, + in11); + DUP4_ARG2(__lsx_vaddi_hu, in12, 1, in13, 1, in14, 1, in15, 1, in12, in13, + in14, in15); + + DUP4_ARG2(__lsx_vsrai_h, in0, 2, in1, 2, in2, 2, in3, 2, in0, in1, in2, in3); + DUP4_ARG2(__lsx_vsrai_h, in4, 2, in5, 2, in6, 2, in7, 2, in4, in5, in6, in7); + DUP4_ARG2(__lsx_vsrai_h, in8, 2, in9, 2, in10, 2, in11, 2, in8, in9, in10, + in11); + DUP4_ARG2(__lsx_vsrai_h, in12, 2, in13, 2, in14, 2, in15, 2, in12, in13, in14, + in15); + LSX_BUTTERFLY_16_H(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, + in11, in12, in13, in14, in15, tmp0, tmp1, tmp2, tmp3, tmp4, + tmp5, tmp6, tmp7, in8, in9, in10, in11, in12, in13, in14, + in15); + __lsx_vst(in8, input, 0); + __lsx_vst(in9, input, 32); + __lsx_vst(in10, input, 64); + __lsx_vst(in11, input, 96); + __lsx_vst(in12, input, 128); + __lsx_vst(in13, input, 160); + __lsx_vst(in14, input, 192); + __lsx_vst(in15, input, 224); + + FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp0, tmp1, + tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); + DUP4_ARG2(__lsx_vld, input, 0, input, 32, input, 64, input, 96, in8, in9, + in10, in11); + DUP4_ARG2(__lsx_vld, input, 128, input, 160, input, 192, input, 224, in12, + in13, in14, in15); + FDCT8x16_ODD(in8, in9, in10, in11, in12, in13, in14, in15, in0, in1, in2, in3, + in4, in5, in6, in7); + LSX_TRANSPOSE8x8_H(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, tmp0, in0, + tmp1, in1, tmp2, in2, tmp3, in3); + __lsx_vst(tmp0, output, 0); + __lsx_vst(in0, output, 32); + __lsx_vst(tmp1, output, 64); + __lsx_vst(in1, output, 96); + __lsx_vst(tmp2, output, 128); + __lsx_vst(in2, output, 160); + __lsx_vst(tmp3, output, 192); + __lsx_vst(in3, output, 224); + + LSX_TRANSPOSE8x8_H(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, tmp4, in4, + tmp5, in5, tmp6, in6, tmp7, in7); + __lsx_vst(tmp4, output, 16); + __lsx_vst(in4, output, 48); + __lsx_vst(tmp5, output, 80); + __lsx_vst(in5, output, 112); + __lsx_vst(tmp6, output, 144); + __lsx_vst(in6, output, 176); + __lsx_vst(tmp7, output, 208); + __lsx_vst(in7, output, 240); +} + +void vpx_fdct16x16_lsx(const int16_t *input, int16_t *output, + int32_t src_stride) { + int32_t i; + DECLARE_ALIGNED(32, int16_t, tmp_buf[16 * 16]); + + /* column transform */ + for (i = 0; i < 2; ++i) { + fdct8x16_1d_column((input + 8 * i), (&tmp_buf[0] + 8 * i), src_stride); + } + + /* row transform */ + for (i = 0; i < 2; ++i) { + fdct16x8_1d_row((&tmp_buf[0] + (128 * i)), (output + (128 * i))); + } +} +#endif // !CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/loongarch/fwd_txfm_lsx.h b/vpx_dsp/loongarch/fwd_txfm_lsx.h index a6f62dbc81..9ed8102269 100644 --- a/vpx_dsp/loongarch/fwd_txfm_lsx.h +++ b/vpx_dsp/loongarch/fwd_txfm_lsx.h @@ -113,4 +113,170 @@ __lsx_vstelm_d(tmp1_m, dst + _stride3, 0, 1); \ } +#define FDCT8x16_EVEN(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3, out4, out5, out6, out7) \ + { \ + __m128i s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m; \ + __m128i x0_m, x1_m, x2_m, x3_m; \ + __m128i coeff_m = { 0x187e3b21d2bf2d41, 0x238e35370c7c3ec5 }; \ + \ + /* FDCT stage1 */ \ + LSX_BUTTERFLY_8_H(in0, in1, in2, in3, in4, in5, in6, in7, s0_m, s1_m, \ + s2_m, s3_m, s4_m, s5_m, s6_m, s7_m); \ + LSX_BUTTERFLY_4_H(s0_m, s1_m, s2_m, s3_m, x0_m, x1_m, x2_m, x3_m); \ + DUP2_ARG2(__lsx_vilvh_h, x1_m, x0_m, x3_m, x2_m, s0_m, s2_m); \ + DUP2_ARG2(__lsx_vilvl_h, x1_m, x0_m, x3_m, x2_m, s1_m, s3_m); \ + DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 0, coeff_m, 1, x0_m, x1_m); \ + x1_m = __lsx_vpackev_h(x1_m, x0_m); \ + DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m, out4); \ + \ + DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 2, coeff_m, 3, x2_m, x3_m); \ + x2_m = __lsx_vneg_h(x2_m); \ + x2_m = __lsx_vpackev_h(x3_m, x2_m); \ + DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m, out6); \ + \ + DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m, out0); \ + x2_m = __lsx_vreplvei_h(coeff_m, 2); \ + x2_m = __lsx_vpackev_h(x2_m, x3_m); \ + DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m, out2); \ + \ + /* stage2 */ \ + s1_m = __lsx_vilvl_h(s5_m, s6_m); \ + s0_m = __lsx_vilvh_h(s5_m, s6_m); \ + \ + DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m, s6_m); \ + DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m, s5_m); \ + \ + /* stage3 */ \ + LSX_BUTTERFLY_4_H(s4_m, s7_m, s6_m, s5_m, x0_m, x3_m, x2_m, x1_m); \ + \ + /* stage4 */ \ + DUP2_ARG2(__lsx_vilvh_h, x3_m, x0_m, x2_m, x1_m, s4_m, s6_m); \ + DUP2_ARG2(__lsx_vilvl_h, x3_m, x0_m, x2_m, x1_m, s5_m, s7_m); \ + \ + DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 4, coeff_m, 5, x0_m, x1_m); \ + x1_m = __lsx_vpackev_h(x0_m, x1_m); \ + DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x1_m, out1); \ + \ + DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 6, coeff_m, 7, x2_m, x3_m); \ + x2_m = __lsx_vpackev_h(x3_m, x2_m); \ + DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m, out5); \ + \ + x1_m = __lsx_vreplvei_h(coeff_m, 5); \ + x0_m = __lsx_vneg_h(x0_m); \ + x0_m = __lsx_vpackev_h(x1_m, x0_m); \ + DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x0_m, out7); \ + \ + x2_m = __lsx_vreplvei_h(coeff_m, 6); \ + x3_m = __lsx_vneg_h(x3_m); \ + x2_m = __lsx_vpackev_h(x2_m, x3_m); \ + DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m, out3); \ + } + +#define FDCT8x16_ODD(input0, input1, input2, input3, input4, input5, input6, \ + input7, out1, out3, out5, out7, out9, out11, out13, \ + out15) \ + { \ + __m128i stp21_m, stp22_m, stp23_m, stp24_m, stp25_m, stp26_m; \ + __m128i stp30_m, stp31_m, stp32_m, stp33_m, stp34_m, stp35_m; \ + __m128i stp36_m, stp37_m, vec0_m, vec1_m; \ + __m128i vec2_m, vec3_m, vec4_m, vec5_m, vec6_m; \ + __m128i cnst0_m, cnst1_m, cnst4_m, cnst5_m; \ + __m128i coeff_m = { 0x187e3b21d2bf2d41, 0x238e3537e782c4df }; \ + __m128i coeff1_m = { 0x289a317906463fb1, 0x12943d3f1e2b3871 }; \ + __m128i coeff2_m = { 0xed6cd766c78fc04f, 0x0 }; \ + \ + /* stp 1 */ \ + DUP2_ARG2(__lsx_vilvh_h, input2, input5, input3, input4, vec2_m, vec4_m); \ + DUP2_ARG2(__lsx_vilvl_h, input2, input5, input3, input4, vec3_m, vec5_m); \ + \ + cnst4_m = __lsx_vreplvei_h(coeff_m, 0); \ + DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst4_m, stp25_m); \ + \ + cnst5_m = __lsx_vreplvei_h(coeff_m, 1); \ + cnst5_m = __lsx_vpackev_h(cnst5_m, cnst4_m); \ + DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst5_m, stp22_m); \ + DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst4_m, stp24_m); \ + DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst5_m, stp23_m); \ + \ + /* stp2 */ \ + LSX_BUTTERFLY_4_H(input0, input1, stp22_m, stp23_m, stp30_m, stp31_m, \ + stp32_m, stp33_m); \ + LSX_BUTTERFLY_4_H(input7, input6, stp25_m, stp24_m, stp37_m, stp36_m, \ + stp35_m, stp34_m); \ + \ + DUP2_ARG2(__lsx_vilvh_h, stp36_m, stp31_m, stp35_m, stp32_m, vec2_m, \ + vec4_m); \ + DUP2_ARG2(__lsx_vilvl_h, stp36_m, stp31_m, stp35_m, stp32_m, vec3_m, \ + vec5_m); \ + \ + DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 2, coeff_m, 3, cnst0_m, cnst1_m); \ + cnst0_m = __lsx_vpackev_h(cnst0_m, cnst1_m); \ + DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst0_m, stp26_m); \ + \ + cnst0_m = __lsx_vreplvei_h(coeff_m, 4); \ + cnst1_m = __lsx_vpackev_h(cnst1_m, cnst0_m); \ + DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst1_m, stp21_m); \ + \ + DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 5, coeff_m, 2, cnst0_m, cnst1_m); \ + cnst1_m = __lsx_vpackev_h(cnst0_m, cnst1_m); \ + DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst1_m, stp25_m); \ + \ + cnst0_m = __lsx_vreplvei_h(coeff_m, 3); \ + cnst1_m = __lsx_vpackev_h(cnst1_m, cnst0_m); \ + DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst1_m, stp22_m); \ + \ + /* stp4 */ \ + LSX_BUTTERFLY_4_H(stp30_m, stp37_m, stp26_m, stp21_m, vec6_m, vec2_m, \ + vec4_m, vec5_m); \ + LSX_BUTTERFLY_4_H(stp33_m, stp34_m, stp25_m, stp22_m, stp21_m, stp23_m, \ + stp24_m, stp31_m); \ + \ + vec1_m = __lsx_vilvl_h(vec2_m, vec6_m); \ + vec0_m = __lsx_vilvh_h(vec2_m, vec6_m); \ + DUP2_ARG2(__lsx_vreplvei_h, coeff1_m, 0, coeff1_m, 1, cnst0_m, cnst1_m); \ + cnst0_m = __lsx_vpackev_h(cnst0_m, cnst1_m); \ + \ + DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m, out1); \ + \ + cnst0_m = __lsx_vreplvei_h(coeff2_m, 0); \ + cnst0_m = __lsx_vpackev_h(cnst1_m, cnst0_m); \ + DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m, out15); \ + \ + vec1_m = __lsx_vilvl_h(vec4_m, vec5_m); \ + vec0_m = __lsx_vilvh_h(vec4_m, vec5_m); \ + DUP2_ARG2(__lsx_vreplvei_h, coeff1_m, 2, coeff1_m, 3, cnst0_m, cnst1_m); \ + cnst1_m = __lsx_vpackev_h(cnst1_m, cnst0_m); \ + \ + DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m, out9); \ + \ + cnst1_m = __lsx_vreplvei_h(coeff2_m, 2); \ + cnst0_m = __lsx_vpackev_h(cnst0_m, cnst1_m); \ + DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m, out7); \ + \ + vec1_m = __lsx_vilvl_h(stp23_m, stp21_m); \ + vec0_m = __lsx_vilvh_h(stp23_m, stp21_m); \ + DUP2_ARG2(__lsx_vreplvei_h, coeff1_m, 4, coeff1_m, 5, cnst0_m, cnst1_m); \ + cnst0_m = __lsx_vpackev_h(cnst0_m, cnst1_m); \ + DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m, out5); \ + \ + cnst0_m = __lsx_vreplvei_h(coeff2_m, 1); \ + cnst0_m = __lsx_vpackev_h(cnst1_m, cnst0_m); \ + DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m, out11); \ + \ + vec1_m = __lsx_vilvl_h(stp24_m, stp31_m); \ + vec0_m = __lsx_vilvh_h(stp24_m, stp31_m); \ + DUP2_ARG2(__lsx_vreplvei_h, coeff1_m, 6, coeff1_m, 7, cnst0_m, cnst1_m); \ + cnst1_m = __lsx_vpackev_h(cnst1_m, cnst0_m); \ + \ + DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m, out13); \ + \ + cnst1_m = __lsx_vreplvei_h(coeff2_m, 3); \ + cnst0_m = __lsx_vpackev_h(cnst0_m, cnst1_m); \ + DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m, out3); \ + } + +void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr, + int32_t src_stride); +void fdct16x8_1d_row(int16_t *input, int16_t *output); #endif // VPX_VPX_DSP_LOONGARCH_FWD_TXFM_LSX_H_ diff --git a/vpx_dsp/loongarch/txfm_macros_lsx.h b/vpx_dsp/loongarch/txfm_macros_lsx.h index bc6f7dacc9..977f1c2dd0 100644 --- a/vpx_dsp/loongarch/txfm_macros_lsx.h +++ b/vpx_dsp/loongarch/txfm_macros_lsx.h @@ -44,4 +44,12 @@ out1 = __lsx_vssrarni_h_w(s0_m, s1_m, DCT_CONST_BITS); \ } +#define DOT_SHIFT_RIGHT_PCK_H(in0, in1, in2, in3) \ + do { \ + __m128i tp0_m, tp1_m; \ + \ + DUP2_ARG2(__lsx_vdp2_w_h, in0, in2, in1, in2, tp1_m, tp0_m); \ + in3 = __lsx_vssrarni_h_w(tp1_m, tp0_m, DCT_CONST_BITS); \ + } while (0) + #endif // VPX_VPX_DSP_LOONGARCH_TXFM_MACROS_LSX_H_ diff --git a/vpx_dsp/loongarch/variance_lsx.c b/vpx_dsp/loongarch/variance_lsx.c index 8164e98189..8f2ec0563f 100644 --- a/vpx_dsp/loongarch/variance_lsx.c +++ b/vpx_dsp/loongarch/variance_lsx.c @@ -37,9 +37,50 @@ sub = __lsx_vadd_h(sub, res_l1_m); \ } +#define VARIANCE_WxH(sse, diff, shift) \ + (sse) - (((uint32_t)(diff) * (diff)) >> (shift)) + #define VARIANCE_LARGE_WxH(sse, diff, shift) \ (sse) - (((int64_t)(diff) * (diff)) >> (shift)) +static uint32_t sse_diff_16width_lsx(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height, int32_t *diff) { + int32_t ht_cnt = (height >> 2); + __m128i src, ref, vec; + __m128i avg = __lsx_vldi(0); + __m128i var = avg; + + for (; ht_cnt--;) { + src = __lsx_vld(src_ptr, 0); + src_ptr += src_stride; + ref = __lsx_vld(ref_ptr, 0); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src, ref, var, avg); + + src = __lsx_vld(src_ptr, 0); + src_ptr += src_stride; + ref = __lsx_vld(ref_ptr, 0); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src, ref, var, avg); + src = __lsx_vld(src_ptr, 0); + src_ptr += src_stride; + ref = __lsx_vld(ref_ptr, 0); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src, ref, var, avg); + + src = __lsx_vld(src_ptr, 0); + src_ptr += src_stride; + ref = __lsx_vld(ref_ptr, 0); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src, ref, var, avg); + } + vec = __lsx_vhaddw_w_h(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + static uint32_t sse_diff_32width_lsx(const uint8_t *src_ptr, int32_t src_stride, const uint8_t *ref_ptr, int32_t ref_stride, int32_t height, int32_t *diff) { @@ -133,8 +174,10 @@ static uint32_t sse_diff_64x64_lsx(const uint8_t *src_ptr, int32_t src_stride, return HADD_SW_S32(var); } -#define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10); -#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12); +#define VARIANCE_16Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 8) + +#define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10) +#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12) #define VPX_VARIANCE_WDXHT_LSX(wd, ht) \ uint32_t vpx_variance##wd##x##ht##_lsx( \ @@ -148,6 +191,7 @@ static uint32_t sse_diff_64x64_lsx(const uint8_t *src_ptr, int32_t src_stride, return VARIANCE_##wd##Wx##ht##H(*sse, diff); \ } +VPX_VARIANCE_WDXHT_LSX(16, 16) VPX_VARIANCE_WDXHT_LSX(32, 32) uint32_t vpx_variance64x64_lsx(const uint8_t *src, int32_t src_stride, @@ -159,3 +203,9 @@ uint32_t vpx_variance64x64_lsx(const uint8_t *src, int32_t src_stride, return VARIANCE_64Wx64H(*sse, diff); } + +void vpx_get16x16var_lsx(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, uint32_t *sse, + int32_t *sum) { + *sse = sse_diff_16width_lsx(src, src_stride, ref, ref_stride, 16, sum); +} diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index 5c3ffe97d4..efb253c68d 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -236,6 +236,7 @@ DSP_SRCS-$(HAVE_NEON) += arm/fwd_txfm_neon.c DSP_SRCS-$(HAVE_MSA) += mips/fwd_txfm_msa.h DSP_SRCS-$(HAVE_MSA) += mips/fwd_txfm_msa.c DSP_SRCS-$(HAVE_LSX) += loongarch/fwd_txfm_lsx.h +DSP_SRCS-$(HAVE_LSX) += loongarch/fwd_txfm_lsx.c ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_MSA) += mips/fwd_dct32x32_msa.c diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 932099243f..4ad698cabe 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -585,7 +585,7 @@ () specialize qw/vpx_fdct8x8_1 sse2 neon msa/; add_proto qw/void vpx_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vpx_fdct16x16 neon sse2 msa/; + specialize qw/vpx_fdct16x16 neon sse2 msa lsx/; add_proto qw/void vpx_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/vpx_fdct16x16_1 sse2 neon msa/; @@ -1099,7 +1099,7 @@ () specialize qw/vpx_variance16x32 sse2 avx2 neon msa mmi vsx/; add_proto qw/unsigned int vpx_variance16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance16x16 sse2 avx2 neon msa mmi vsx/; + specialize qw/vpx_variance16x16 sse2 avx2 neon msa mmi vsx lsx/; add_proto qw/unsigned int vpx_variance16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_variance16x8 sse2 avx2 neon msa mmi vsx/; @@ -1123,7 +1123,7 @@ () # Specialty Variance # add_proto qw/void vpx_get16x16var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - specialize qw/vpx_get16x16var sse2 avx2 neon msa vsx/; + specialize qw/vpx_get16x16var sse2 avx2 neon msa vsx lsx/; add_proto qw/void vpx_get8x8var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; specialize qw/vpx_get8x8var sse2 neon msa vsx/; From 19b45a26c62170c1fb0dfd18a083ddb84ef7e4a4 Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 26 Apr 2022 18:42:24 -0700 Subject: [PATCH 0346/1000] vp9,encode_tiles_buffer_alloc: fix allocation check previously vp9_bitstream_worker_data was checked after it was memset(); this change uses CHECK_MEM_ERROR for consistency to ensure the pointer is checked first Change-Id: I532d0eb0e746dc6b8d694b616eba693c5c0053ac --- vp9/encoder/vp9_bitstream.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c index 99cc2ee831..75bd097f24 100644 --- a/vp9/encoder/vp9_bitstream.c +++ b/vp9/encoder/vp9_bitstream.c @@ -963,21 +963,20 @@ void vp9_bitstream_encode_tiles_buffer_dealloc(VP9_COMP *const cpi) { } } -static int encode_tiles_buffer_alloc(VP9_COMP *const cpi) { +static void encode_tiles_buffer_alloc(VP9_COMP *const cpi) { + VP9_COMMON *const cm = &cpi->common; int i; const size_t worker_data_size = cpi->num_workers * sizeof(*cpi->vp9_bitstream_worker_data); - cpi->vp9_bitstream_worker_data = vpx_memalign(16, worker_data_size); + CHECK_MEM_ERROR(cm, cpi->vp9_bitstream_worker_data, + vpx_memalign(16, worker_data_size)); memset(cpi->vp9_bitstream_worker_data, 0, worker_data_size); - if (!cpi->vp9_bitstream_worker_data) return 1; for (i = 1; i < cpi->num_workers; ++i) { cpi->vp9_bitstream_worker_data[i].dest_size = cpi->oxcf.width * cpi->oxcf.height; - cpi->vp9_bitstream_worker_data[i].dest = - vpx_malloc(cpi->vp9_bitstream_worker_data[i].dest_size); - if (!cpi->vp9_bitstream_worker_data[i].dest) return 1; + CHECK_MEM_ERROR(cm, cpi->vp9_bitstream_worker_data[i].dest, + vpx_malloc(cpi->vp9_bitstream_worker_data[i].dest_size)); } - return 0; } static size_t encode_tiles_mt(VP9_COMP *cpi, uint8_t *data_ptr) { @@ -992,7 +991,7 @@ static size_t encode_tiles_mt(VP9_COMP *cpi, uint8_t *data_ptr) { cpi->vp9_bitstream_worker_data[1].dest_size > (cpi->oxcf.width * cpi->oxcf.height)) { vp9_bitstream_encode_tiles_buffer_dealloc(cpi); - if (encode_tiles_buffer_alloc(cpi)) return 0; + encode_tiles_buffer_alloc(cpi); } while (tile_col < tile_cols) { From 1b70db4be90e66fdd0473f34ad7bec69f269edeb Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 26 Apr 2022 19:05:56 -0700 Subject: [PATCH 0347/1000] vp9: check postproc_state.limits allocs Change-Id: I9d5df96580074375e4847d2e2f60a6a6d56eeea5 --- vp9/common/vp9_postproc.c | 1 + vp9/encoder/vp9_encoder.c | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/vp9/common/vp9_postproc.c b/vp9/common/vp9_postproc.c index d2c8535b01..96519f0051 100644 --- a/vp9/common/vp9_postproc.c +++ b/vp9/common/vp9_postproc.c @@ -360,6 +360,7 @@ int vp9_post_proc_frame(struct VP9Common *cm, YV12_BUFFER_CONFIG *dest, if (!cm->postproc_state.limits) { cm->postproc_state.limits = vpx_calloc(unscaled_width, sizeof(*cm->postproc_state.limits)); + if (!cm->postproc_state.limits) return 1; } } diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index ec6a756197..89b7c8e246 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -3684,9 +3684,9 @@ static void set_size_dependent_vars(VP9_COMP *cpi, int *q, int *bottom_index, case 6: l = 150; break; } if (!cpi->common.postproc_state.limits) { - cpi->common.postproc_state.limits = - vpx_calloc(cpi->un_scaled_source->y_width, - sizeof(*cpi->common.postproc_state.limits)); + CHECK_MEM_ERROR(cm, cpi->common.postproc_state.limits, + vpx_calloc(cpi->un_scaled_source->y_width, + sizeof(*cpi->common.postproc_state.limits))); } vp9_denoise(&cpi->common, cpi->Source, cpi->Source, l, cpi->common.postproc_state.limits); From e93e2ca0e33fb04f54724c4df6526727e7399841 Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 26 Apr 2022 19:07:08 -0700 Subject: [PATCH 0348/1000] vp9_enc_grp_get_next_job: check job queue alloc + reverse conditional order; var == constant is more readable Change-Id: I9f2b4394024c262fd5fe9576a8bf33afe197c050 --- vp9/encoder/vp9_multi_thread.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vp9/encoder/vp9_multi_thread.c b/vp9/encoder/vp9_multi_thread.c index c66c035492..6078f8975c 100644 --- a/vp9/encoder/vp9_multi_thread.c +++ b/vp9/encoder/vp9_multi_thread.c @@ -36,7 +36,7 @@ void *vp9_enc_grp_get_next_job(MultiThreadHandle *multi_thread_ctxt, pthread_mutex_lock(mutex_handle); #endif next = job_queue_hdl->next; - if (NULL != next) { + if (next != NULL) { JobQueue *job_queue = (JobQueue *)next; job_info = &job_queue->job_info; // Update the next job in the queue @@ -84,8 +84,8 @@ void vp9_row_mt_mem_alloc(VP9_COMP *cpi) { multi_thread_ctxt->allocated_tile_rows = tile_rows; multi_thread_ctxt->allocated_vert_unit_rows = jobs_per_tile_col; - multi_thread_ctxt->job_queue = - (JobQueue *)vpx_memalign(32, total_jobs * sizeof(JobQueue)); + CHECK_MEM_ERROR(cm, multi_thread_ctxt->job_queue, + (JobQueue *)vpx_memalign(32, total_jobs * sizeof(JobQueue))); #if CONFIG_MULTITHREAD // Create mutex for each tile From 72fa1d505ed4eaf2660e35347aba768502f268c4 Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 26 Apr 2022 19:13:48 -0700 Subject: [PATCH 0349/1000] vp9_alloc_motion_field_info: check motion_field_array alloc Change-Id: I4ae11242e645feb3b85eaea186f14b3676ae40a8 --- vp9/encoder/vp9_non_greedy_mv.c | 1 + 1 file changed, 1 insertion(+) diff --git a/vp9/encoder/vp9_non_greedy_mv.c b/vp9/encoder/vp9_non_greedy_mv.c index 4679d6c49c..1c0d281495 100644 --- a/vp9/encoder/vp9_non_greedy_mv.c +++ b/vp9/encoder/vp9_non_greedy_mv.c @@ -178,6 +178,7 @@ Status vp9_alloc_motion_field_info(MotionFieldInfo *motion_field_info, motion_field_info->frame_num = frame_num; motion_field_info->motion_field_array = vpx_calloc(frame_num, sizeof(*motion_field_info->motion_field_array)); + if (!motion_field_info->motion_field_array) return STATUS_FAILED; for (frame_idx = 0; frame_idx < frame_num; ++frame_idx) { for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) { for (square_block_idx = 0; square_block_idx < SQUARE_BLOCK_SIZES; From 58fff2f9ef25c13da150cdfb366a351705db5776 Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 26 Apr 2022 19:14:44 -0700 Subject: [PATCH 0350/1000] vp9_speed_features.c: check allocations Change-Id: If3b319c1ce7036c2259440f4eeb2e645bf559f4c --- vp9/encoder/vp9_speed_features.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c index 7d7b2c3fb4..0431d8a452 100644 --- a/vp9/encoder/vp9_speed_features.c +++ b/vp9/encoder/vp9_speed_features.c @@ -652,8 +652,10 @@ static void set_rt_speed_feature_framesize_independent( if (cpi->content_state_sb_fd == NULL && (!cpi->use_svc || svc->spatial_layer_id == svc->number_spatial_layers - 1)) { - cpi->content_state_sb_fd = (uint8_t *)vpx_calloc( - (cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1), sizeof(uint8_t)); + CHECK_MEM_ERROR(cm, cpi->content_state_sb_fd, + (uint8_t *)vpx_calloc( + (cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1), + sizeof(uint8_t))); } } if (cpi->oxcf.rc_mode == VPX_CBR && content != VP9E_CONTENT_SCREEN) { @@ -804,14 +806,17 @@ static void set_rt_speed_feature_framesize_independent( sf->partition_search_type = FIXED_PARTITION; sf->always_this_block_size = BLOCK_64X64; } - if (cpi->count_arf_frame_usage == NULL) - cpi->count_arf_frame_usage = + if (cpi->count_arf_frame_usage == NULL) { + CHECK_MEM_ERROR( + cm, cpi->count_arf_frame_usage, (uint8_t *)vpx_calloc((cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1), - sizeof(*cpi->count_arf_frame_usage)); + sizeof(*cpi->count_arf_frame_usage))); + } if (cpi->count_lastgolden_frame_usage == NULL) - cpi->count_lastgolden_frame_usage = + CHECK_MEM_ERROR( + cm, cpi->count_lastgolden_frame_usage, (uint8_t *)vpx_calloc((cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1), - sizeof(*cpi->count_lastgolden_frame_usage)); + sizeof(*cpi->count_lastgolden_frame_usage))); } if (svc->previous_frame_is_intra_only) { sf->partition_search_type = FIXED_PARTITION; From a5ad89018eecef202d4ae5853ecdde843c0a9880 Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 26 Apr 2022 19:15:40 -0700 Subject: [PATCH 0351/1000] VP9RateControlRTC::Create: check segmentation_map alloc Change-Id: I17b23915c32accf834def5ab26a8e4e188f9993a --- vp9/ratectrl_rtc.cc | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/vp9/ratectrl_rtc.cc b/vp9/ratectrl_rtc.cc index 76ff367c06..f4d7f7e9e7 100644 --- a/vp9/ratectrl_rtc.cc +++ b/vp9/ratectrl_rtc.cc @@ -25,7 +25,10 @@ std::unique_ptr VP9RateControlRTC::Create( VP9RateControlRTC()); if (!rc_api) return nullptr; rc_api->cpi_ = static_cast(vpx_memalign(32, sizeof(*cpi_))); - if (!rc_api->cpi_) return nullptr; + if (!rc_api->cpi_) { + rc_api.reset(); + return nullptr; + } vp9_zero(*rc_api->cpi_); rc_api->InitRateControl(cfg); @@ -34,6 +37,10 @@ std::unique_ptr VP9RateControlRTC::Create( cpi->segmentation_map = static_cast( vpx_calloc(cpi->common.mi_rows * cpi->common.mi_cols, sizeof(*cpi->segmentation_map))); + if (!cpi->segmentation_map) { + rc_api.reset(); + return nullptr; + } cpi->cyclic_refresh = vp9_cyclic_refresh_alloc(cpi->common.mi_rows, cpi->common.mi_cols); cpi->cyclic_refresh->content_mode = 0; From b2d57a88086410c8beb3696374764c4e836fe332 Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 26 Apr 2022 19:16:11 -0700 Subject: [PATCH 0352/1000] simple_encode,init_encoder: check buffer_pool alloc Change-Id: I54f83733260abf828166400c5fd0c4c7e3ccec2f --- vp9/simple_encode.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/vp9/simple_encode.cc b/vp9/simple_encode.cc index 1a0ada119f..654699e1b2 100644 --- a/vp9/simple_encode.cc +++ b/vp9/simple_encode.cc @@ -110,6 +110,7 @@ static VP9_COMP *init_encoder(const VP9EncoderConfig *oxcf, vpx_img_fmt_t img_fmt) { VP9_COMP *cpi; BufferPool *buffer_pool = (BufferPool *)vpx_calloc(1, sizeof(*buffer_pool)); + if (!buffer_pool) return NULL; vp9_initialize_enc(); cpi = vp9_create_compressor(oxcf, buffer_pool); vp9_update_compressor_with_img_fmt(cpi, img_fmt); From e82c5a85c9fcb727a591ffa63fc08bcaf52f9da3 Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 26 Apr 2022 22:18:21 -0700 Subject: [PATCH 0353/1000] vp9_row_mt_alloc_rd_thresh: check alloc Change-Id: I6fb7771d9fa6ec54d81f24a02a289e8b852e7332 --- vp9/encoder/vp9_multi_thread.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vp9/encoder/vp9_multi_thread.c b/vp9/encoder/vp9_multi_thread.c index 6078f8975c..45659f2a9a 100644 --- a/vp9/encoder/vp9_multi_thread.c +++ b/vp9/encoder/vp9_multi_thread.c @@ -58,9 +58,10 @@ void vp9_row_mt_alloc_rd_thresh(VP9_COMP *const cpi, (mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2) + 1; int i; - this_tile->row_base_thresh_freq_fact = + CHECK_MEM_ERROR( + cm, this_tile->row_base_thresh_freq_fact, (int *)vpx_calloc(sb_rows * BLOCK_SIZES * MAX_MODES, - sizeof(*(this_tile->row_base_thresh_freq_fact))); + sizeof(*(this_tile->row_base_thresh_freq_fact)))); for (i = 0; i < sb_rows * BLOCK_SIZES * MAX_MODES; i++) this_tile->row_base_thresh_freq_fact[i] = RD_THRESH_INIT_FACT; } From c152584107a18a0e240a23279f1f1dfcd80a3acf Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 26 Apr 2022 22:19:05 -0700 Subject: [PATCH 0354/1000] vp9_get_smooth_motion_field: check alloc Change-Id: I6b19d0169d127f622abf97b3b8590eee957bdc51 --- vp9/encoder/vp9_non_greedy_mv.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vp9/encoder/vp9_non_greedy_mv.c b/vp9/encoder/vp9_non_greedy_mv.c index 1c0d281495..d52801c845 100644 --- a/vp9/encoder/vp9_non_greedy_mv.c +++ b/vp9/encoder/vp9_non_greedy_mv.c @@ -423,6 +423,7 @@ void vp9_get_smooth_motion_field(const MV *search_mf, int row, col; int bw = 4 << b_width_log2_lookup[bsize]; int bh = 4 << b_height_log2_lookup[bsize]; + if (!(input && output)) goto fail; // copy search results to input buffer for (idx = 0; idx < rows * cols; ++idx) { input[idx].row = (float)search_mf[idx].row / bh; @@ -451,6 +452,7 @@ void vp9_get_smooth_motion_field(const MV *search_mf, smooth_mf[idx].row = (int)(input[idx].row * bh); smooth_mf[idx].col = (int)(input[idx].col * bw); } +fail: free(input); free(output); } From c3d2df2f2f810b1c1bd9bd6bf0a54d20b4e6dacc Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 26 Apr 2022 22:19:34 -0700 Subject: [PATCH 0355/1000] fastssim,fs_ctx_init: check alloc Change-Id: Ie087e8be1e943b94327ed520db447a0e3a927738 --- vpx_dsp/fastssim.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/vpx_dsp/fastssim.c b/vpx_dsp/fastssim.c index 6ab6f557e2..4d32a02a55 100644 --- a/vpx_dsp/fastssim.c +++ b/vpx_dsp/fastssim.c @@ -47,7 +47,7 @@ struct fs_ctx { unsigned *col_buf; }; -static void fs_ctx_init(fs_ctx *_ctx, int _w, int _h, int _nlevels) { +static int fs_ctx_init(fs_ctx *_ctx, int _w, int _h, int _nlevels) { unsigned char *data; size_t data_size; int lw; @@ -71,6 +71,7 @@ static void fs_ctx_init(fs_ctx *_ctx, int _w, int _h, int _nlevels) { lh = (lh + 1) >> 1; } data = (unsigned char *)malloc(data_size); + if (!data) return -1; _ctx->level = (fs_level *)data; _ctx->nlevels = _nlevels; data += _nlevels * sizeof(*_ctx->level); @@ -95,6 +96,7 @@ static void fs_ctx_init(fs_ctx *_ctx, int _w, int _h, int _nlevels) { lh = (lh + 1) >> 1; } _ctx->col_buf = (unsigned *)data; + return 0; } static void fs_ctx_clear(fs_ctx *_ctx) { free(_ctx->level); } @@ -456,7 +458,7 @@ static double calc_ssim(const uint8_t *_src, int _systride, const uint8_t *_dst, double ret; int l; ret = 1; - fs_ctx_init(&ctx, _w, _h, FS_NLEVELS); + if (fs_ctx_init(&ctx, _w, _h, FS_NLEVELS)) return 99.0; fs_downsample_level0(&ctx, _src, _systride, _dst, _dystride, _w, _h, _bd, _shift); for (l = 0; l < FS_NLEVELS - 1; l++) { From 8baaa7b5a3fcea958261bda667a871c163e93bf9 Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 26 Apr 2022 22:20:00 -0700 Subject: [PATCH 0356/1000] y4m_input_open: check allocs Change-Id: I99ee0ef3ab28a22923cb413ccf5935fdc38862be --- y4minput.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/y4minput.c b/y4minput.c index 9a4bdbd7b5..7d3c03a7fc 100644 --- a/y4minput.c +++ b/y4minput.c @@ -1087,9 +1087,15 @@ int y4m_input_open(y4m_input *y4m_ctx, FILE *file, char *skip_buffer, y4m_ctx->dst_buf = (unsigned char *)malloc(y4m_ctx->dst_buf_sz); else y4m_ctx->dst_buf = (unsigned char *)malloc(2 * y4m_ctx->dst_buf_sz); + if (!y4m_ctx->dst_buf) return -1; - if (y4m_ctx->aux_buf_sz > 0) + if (y4m_ctx->aux_buf_sz > 0) { y4m_ctx->aux_buf = (unsigned char *)malloc(y4m_ctx->aux_buf_sz); + if (!y4m_ctx->aux_buf) { + free(y4m_ctx->dst_buf); + return -1; + } + } return 0; } From b1ed8e08a21b33c0f5039559113004bee7943dc4 Mon Sep 17 00:00:00 2001 From: yuanhecai Date: Thu, 7 Apr 2022 17:51:51 +0800 Subject: [PATCH 0357/1000] vp9[loongarch]: Optimize sad64x64/32x32_avg,comp_avg_pred 1. vpx_sad64x64_avg_lsx 2. vpx_sad32x32_avg_lsx 3. comp_avg_pred_lsx Bug: webm:1755 Change-Id: I58dabdcdd4265bd6ebd5670db8a132d2e838683f --- test/comp_avg_pred_test.cc | 5 + test/sad_test.cc | 6 ++ vpx_dsp/loongarch/avg_pred_lsx.c | 83 ++++++++++++++ vpx_dsp/loongarch/sad_lsx.c | 180 ++++++++++++++++++++++++++++++- vpx_dsp/vpx_dsp.mk | 1 + vpx_dsp/vpx_dsp_rtcd_defs.pl | 6 +- 6 files changed, 274 insertions(+), 7 deletions(-) create mode 100644 vpx_dsp/loongarch/avg_pred_lsx.c diff --git a/test/comp_avg_pred_test.cc b/test/comp_avg_pred_test.cc index b9201a20f9..3977a2d0b5 100644 --- a/test/comp_avg_pred_test.cc +++ b/test/comp_avg_pred_test.cc @@ -183,4 +183,9 @@ INSTANTIATE_TEST_SUITE_P(NEON, AvgPredTest, INSTANTIATE_TEST_SUITE_P(VSX, AvgPredTest, ::testing::Values(&vpx_comp_avg_pred_vsx)); #endif // HAVE_VSX + +#if HAVE_LSX +INSTANTIATE_TEST_SUITE_P(LSX, AvgPredTest, + ::testing::Values(&vpx_comp_avg_pred_lsx)); +#endif // HAVE_LSX } // namespace diff --git a/test/sad_test.cc b/test/sad_test.cc index e4952ba9f7..12a6206b95 100644 --- a/test/sad_test.cc +++ b/test/sad_test.cc @@ -1139,6 +1139,12 @@ const SadMxNParam lsx_tests[] = { }; INSTANTIATE_TEST_SUITE_P(LSX, SADTest, ::testing::ValuesIn(lsx_tests)); +const SadMxNAvgParam avg_lsx_tests[] = { + SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_lsx), + SadMxNAvgParam(32, 32, &vpx_sad32x32_avg_lsx), +}; +INSTANTIATE_TEST_SUITE_P(LSX, SADavgTest, ::testing::ValuesIn(avg_lsx_tests)); + const SadMxNx4Param x4d_lsx_tests[] = { SadMxNx4Param(64, 64, &vpx_sad64x64x4d_lsx), SadMxNx4Param(32, 32, &vpx_sad32x32x4d_lsx), diff --git a/vpx_dsp/loongarch/avg_pred_lsx.c b/vpx_dsp/loongarch/avg_pred_lsx.c new file mode 100644 index 0000000000..482626080a --- /dev/null +++ b/vpx_dsp/loongarch/avg_pred_lsx.c @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_util/loongson_intrinsics.h" + +void vpx_comp_avg_pred_lsx(uint8_t *comp_pred, const uint8_t *pred, int width, + int height, const uint8_t *ref, int ref_stride) { + // width > 8 || width == 8 || width == 4 + if (width > 8) { + int i, j; + for (i = 0; i < height; ++i) { + for (j = 0; j < width; j += 16) { + __m128i p, r, avg; + + p = __lsx_vld(pred + j, 0); + r = __lsx_vld(ref + j, 0); + avg = __lsx_vavgr_bu(p, r); + __lsx_vst(avg, comp_pred + j, 0); + } + comp_pred += width; + pred += width; + ref += ref_stride; + } + } else if (width == 8) { + int i = height * width; + do { + __m128i p, r, r_0, r_1; + + p = __lsx_vld(pred, 0); + r_0 = __lsx_vld(ref, 0); + ref += ref_stride; + r_1 = __lsx_vld(ref, 0); + ref += ref_stride; + r = __lsx_vilvl_d(r_1, r_0); + r = __lsx_vavgr_bu(p, r); + + __lsx_vst(r, comp_pred, 0); + + pred += 16; + comp_pred += 16; + i -= 16; + } while (i); + } else { // width = 4 + int i = height * width; + assert(width == 4); + do { + __m128i p, r, r_0, r_1, r_2, r_3; + p = __lsx_vld(pred, 0); + + if (width == ref_stride) { + r = __lsx_vld(ref, 0); + ref += 16; + } else { + r_0 = __lsx_vld(ref, 0); + ref += ref_stride; + r_1 = __lsx_vld(ref, 0); + ref += ref_stride; + r_2 = __lsx_vld(ref, 0); + ref += ref_stride; + r_3 = __lsx_vld(ref, 0); + ref += ref_stride; + DUP2_ARG2(__lsx_vilvl_w, r_1, r_0, r_3, r_2, r_0, r_2); + r = __lsx_vilvl_d(r_2, r_0); + } + r = __lsx_vavgr_bu(p, r); + + __lsx_vst(r, comp_pred, 0); + comp_pred += 16; + pred += 16; + i -= 16; + } while (i); + } +} diff --git a/vpx_dsp/loongarch/sad_lsx.c b/vpx_dsp/loongarch/sad_lsx.c index cd3f2d46bb..30464b3661 100644 --- a/vpx_dsp/loongarch/sad_lsx.c +++ b/vpx_dsp/loongarch/sad_lsx.c @@ -46,6 +46,17 @@ sum_m; \ }) +#define HADD_SW_S32(in) \ + ({ \ + __m128i res0_m; \ + int32_t sum_m; \ + \ + res0_m = __lsx_vhaddw_d_w(in, in); \ + res0_m = __lsx_vhaddw_q_d(res0_m, res0_m); \ + sum_m = __lsx_vpickve2gr_w(res0_m, 0); \ + sum_m; \ + }) + static uint32_t sad_16width_lsx(const uint8_t *src, int32_t src_stride, const uint8_t *ref, int32_t ref_stride, int32_t height) { @@ -355,7 +366,150 @@ static void sad_64width_x4d_lsx(const uint8_t *src, int32_t src_stride, sad_array[3] = HADD_UW_U32(sad); } -#define VPX_SAD_16xHEIGHT_LSX(height) \ +static uint32_t avgsad_32width_lsx(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height, const uint8_t *sec_pred) { + int32_t ht_cnt = (height >> 2); + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + __m128i ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7; + __m128i pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7; + __m128i comp0, comp1, sad_tmp; + __m128i sad = __lsx_vldi(0); + uint8_t *src_tmp, *ref_tmp; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + int32_t ref_stride2 = ref_stride << 1; + int32_t ref_stride3 = ref_stride2 + ref_stride; + int32_t ref_stride4 = ref_stride2 << 1; + + for (; ht_cnt--;) { + src_tmp = (uint8_t *)src + 16; + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src4); + src6 = __lsx_vldx(src, src_stride3); + src1 = __lsx_vld(src_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src3, + src5); + src7 = __lsx_vldx(src_tmp, src_stride3); + src += src_stride4; + + ref_tmp = (uint8_t *)ref + 16; + ref0 = __lsx_vld(ref, 0); + DUP2_ARG2(__lsx_vldx, ref, ref_stride, ref, ref_stride2, ref2, ref4); + ref6 = __lsx_vldx(ref, ref_stride3); + ref1 = __lsx_vld(ref_tmp, 0); + DUP2_ARG2(__lsx_vldx, ref_tmp, ref_stride, ref_tmp, ref_stride2, ref3, + ref5); + ref7 = __lsx_vldx(ref_tmp, ref_stride3); + ref += ref_stride4; + + DUP4_ARG2(__lsx_vld, sec_pred, 0, sec_pred, 32, sec_pred, 64, sec_pred, 96, + pred0, pred2, pred4, pred6); + DUP4_ARG2(__lsx_vld, sec_pred, 16, sec_pred, 48, sec_pred, 80, sec_pred, + 112, pred1, pred3, pred5, pred7); + sec_pred += 128; + + DUP2_ARG2(__lsx_vavgr_bu, pred0, ref0, pred1, ref1, comp0, comp1); + sad_tmp = SAD_UB2_UH(src0, src1, comp0, comp1); + sad = __lsx_vadd_h(sad, sad_tmp); + DUP2_ARG2(__lsx_vavgr_bu, pred2, ref2, pred3, ref3, comp0, comp1); + sad_tmp = SAD_UB2_UH(src2, src3, comp0, comp1); + sad = __lsx_vadd_h(sad, sad_tmp); + DUP2_ARG2(__lsx_vavgr_bu, pred4, ref4, pred5, ref5, comp0, comp1); + sad_tmp = SAD_UB2_UH(src4, src5, comp0, comp1); + sad = __lsx_vadd_h(sad, sad_tmp); + DUP2_ARG2(__lsx_vavgr_bu, pred6, ref6, pred7, ref7, comp0, comp1); + sad_tmp = SAD_UB2_UH(src6, src7, comp0, comp1); + sad = __lsx_vadd_h(sad, sad_tmp); + } + + return HADD_UH_U32(sad); +} + +static uint32_t avgsad_64width_lsx(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height, const uint8_t *sec_pred) { + int32_t ht_cnt = (height >> 2); + __m128i src0, src1, src2, src3, ref0, ref1, ref2, ref3; + __m128i comp0, comp1, comp2, comp3, pred0, pred1, pred2, pred3; + __m128i sad, sad_tmp; + __m128i sad0 = __lsx_vldi(0); + __m128i sad1 = sad0; + + for (; ht_cnt--;) { + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2, + src3); + src += src_stride; + DUP4_ARG2(__lsx_vld, ref, 0, ref, 16, ref, 32, ref, 48, ref0, ref1, ref2, + ref3); + ref += ref_stride; + DUP4_ARG2(__lsx_vld, sec_pred, 0, sec_pred, 16, sec_pred, 32, sec_pred, 48, + pred0, pred1, pred2, pred3); + sec_pred += 64; + DUP4_ARG2(__lsx_vavgr_bu, pred0, ref0, pred1, ref1, pred2, ref2, pred3, + ref3, comp0, comp1, comp2, comp3); + sad_tmp = SAD_UB2_UH(src0, src1, comp0, comp1); + sad0 = __lsx_vadd_h(sad0, sad_tmp); + sad_tmp = SAD_UB2_UH(src2, src3, comp2, comp3); + sad1 = __lsx_vadd_h(sad1, sad_tmp); + + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2, + src3); + src += src_stride; + DUP4_ARG2(__lsx_vld, ref, 0, ref, 16, ref, 32, ref, 48, ref0, ref1, ref2, + ref3); + ref += ref_stride; + DUP4_ARG2(__lsx_vld, sec_pred, 0, sec_pred, 16, sec_pred, 32, sec_pred, 48, + pred0, pred1, pred2, pred3); + sec_pred += 64; + DUP4_ARG2(__lsx_vavgr_bu, pred0, ref0, pred1, ref1, pred2, ref2, pred3, + ref3, comp0, comp1, comp2, comp3); + sad_tmp = SAD_UB2_UH(src0, src1, comp0, comp1); + sad0 = __lsx_vadd_h(sad0, sad_tmp); + sad_tmp = SAD_UB2_UH(src2, src3, comp2, comp3); + sad1 = __lsx_vadd_h(sad1, sad_tmp); + + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2, + src3); + src += src_stride; + DUP4_ARG2(__lsx_vld, ref, 0, ref, 16, ref, 32, ref, 48, ref0, ref1, ref2, + ref3); + ref += ref_stride; + DUP4_ARG2(__lsx_vld, sec_pred, 0, sec_pred, 16, sec_pred, 32, sec_pred, 48, + pred0, pred1, pred2, pred3); + sec_pred += 64; + DUP4_ARG2(__lsx_vavgr_bu, pred0, ref0, pred1, ref1, pred2, ref2, pred3, + ref3, comp0, comp1, comp2, comp3); + sad_tmp = SAD_UB2_UH(src0, src1, comp0, comp1); + sad0 = __lsx_vadd_h(sad0, sad_tmp); + sad_tmp = SAD_UB2_UH(src2, src3, comp2, comp3); + sad1 = __lsx_vadd_h(sad1, sad_tmp); + + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2, + src3); + src += src_stride; + DUP4_ARG2(__lsx_vld, ref, 0, ref, 16, ref, 32, ref, 48, ref0, ref1, ref2, + ref3); + ref += ref_stride; + DUP4_ARG2(__lsx_vld, sec_pred, 0, sec_pred, 16, sec_pred, 32, sec_pred, 48, + pred0, pred1, pred2, pred3); + sec_pred += 64; + DUP4_ARG2(__lsx_vavgr_bu, pred0, ref0, pred1, ref1, pred2, ref2, pred3, + ref3, comp0, comp1, comp2, comp3); + sad_tmp = SAD_UB2_UH(src0, src1, comp0, comp1); + sad0 = __lsx_vadd_h(sad0, sad_tmp); + sad_tmp = SAD_UB2_UH(src2, src3, comp2, comp3); + sad1 = __lsx_vadd_h(sad1, sad_tmp); + } + sad = __lsx_vhaddw_wu_hu(sad0, sad0); + sad_tmp = __lsx_vhaddw_wu_hu(sad1, sad1); + sad = __lsx_vadd_w(sad, sad_tmp); + + return HADD_SW_S32(sad); +} + +#define VPX_SAD_16xHT_LSX(height) \ uint32_t vpx_sad16x##height##_lsx(const uint8_t *src, int32_t src_stride, \ const uint8_t *ref, int32_t ref_stride) { \ return sad_16width_lsx(src, src_stride, ref, ref_stride, height); \ @@ -394,15 +548,33 @@ static void sad_64width_x4d_lsx(const uint8_t *src, int32_t src_stride, sad_64width_x4d_lsx(src, src_stride, refs, ref_stride, height, sads); \ } -#define SAD64 VPX_SAD_64xHT_LSX(64) VPX_SAD_64xHTx4D_LSX(64) +#define VPX_AVGSAD_32xHT_LSX(height) \ + uint32_t vpx_sad32x##height##_avg_lsx( \ + const uint8_t *src, int32_t src_stride, const uint8_t *ref, \ + int32_t ref_stride, const uint8_t *second_pred) { \ + return avgsad_32width_lsx(src, src_stride, ref, ref_stride, height, \ + second_pred); \ + } + +#define VPX_AVGSAD_64xHT_LSX(height) \ + uint32_t vpx_sad64x##height##_avg_lsx( \ + const uint8_t *src, int32_t src_stride, const uint8_t *ref, \ + int32_t ref_stride, const uint8_t *second_pred) { \ + return avgsad_64width_lsx(src, src_stride, ref, ref_stride, height, \ + second_pred); \ + } + +#define SAD64 \ + VPX_SAD_64xHT_LSX(64) VPX_SAD_64xHTx4D_LSX(64) VPX_AVGSAD_64xHT_LSX(64) SAD64 -#define SAD32 VPX_SAD_32xHT_LSX(32) VPX_SAD_32xHTx4D_LSX(32) +#define SAD32 \ + VPX_SAD_32xHT_LSX(32) VPX_SAD_32xHTx4D_LSX(32) VPX_AVGSAD_32xHT_LSX(32) SAD32 -#define SAD16 VPX_SAD_16xHEIGHT_LSX(16) VPX_SAD_16xHTx4D_LSX(16) +#define SAD16 VPX_SAD_16xHT_LSX(16) VPX_SAD_16xHTx4D_LSX(16) SAD16 diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index efb253c68d..ddccfc1f4a 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -401,6 +401,7 @@ DSP_SRCS-$(HAVE_MSA) += mips/sub_pixel_variance_msa.c DSP_SRCS-$(HAVE_LSX) += loongarch/variance_lsx.c DSP_SRCS-$(HAVE_LSX) += loongarch/sub_pixel_variance_lsx.c +DSP_SRCS-$(HAVE_LSX) += loongarch/avg_pred_lsx.c DSP_SRCS-$(HAVE_MMI) += mips/variance_mmi.c diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 4ad698cabe..68d4f86f2f 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -836,7 +836,7 @@ () } # CONFIG_VP9_ENCODER add_proto qw/unsigned int vpx_sad64x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; -specialize qw/vpx_sad64x64_avg neon avx2 msa sse2 vsx mmi/; +specialize qw/vpx_sad64x64_avg neon avx2 msa sse2 vsx mmi lsx/; add_proto qw/unsigned int vpx_sad64x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; specialize qw/vpx_sad64x32_avg neon avx2 msa sse2 vsx mmi/; @@ -845,7 +845,7 @@ () specialize qw/vpx_sad32x64_avg neon avx2 msa sse2 vsx mmi/; add_proto qw/unsigned int vpx_sad32x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; -specialize qw/vpx_sad32x32_avg neon avx2 msa sse2 vsx mmi/; +specialize qw/vpx_sad32x32_avg neon avx2 msa sse2 vsx mmi lsx/; add_proto qw/unsigned int vpx_sad32x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; specialize qw/vpx_sad32x16_avg neon avx2 msa sse2 vsx mmi/; @@ -1147,7 +1147,7 @@ () specialize qw/vpx_get4x4sse_cs neon msa vsx/; add_proto qw/void vpx_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride"; - specialize qw/vpx_comp_avg_pred neon sse2 vsx/; + specialize qw/vpx_comp_avg_pred neon sse2 vsx lsx/; # # Subpixel Variance From 1b00ad52630a0379d2df16a4fc7351f4e3d0896e Mon Sep 17 00:00:00 2001 From: yuanhecai Date: Tue, 12 Apr 2022 09:10:27 +0800 Subject: [PATCH 0358/1000] vp9[loongarch]: Optimize sad8x8/32x64/64x32x4d 1. vpx_sad8x8x4d_lsx 2. vpx_sad32x64x4d_lsx 3. vpx_sad64x32x4d_lsx Bug: webm:1755 Change-Id: I08a2b8717ec8623ffdd4451a04e68fa3a7228668 --- test/sad_test.cc | 3 ++ vpx_dsp/loongarch/sad_lsx.c | 97 ++++++++++++++++++++++++++++++++++-- vpx_dsp/vpx_dsp_rtcd_defs.pl | 6 +-- 3 files changed, 99 insertions(+), 7 deletions(-) diff --git a/test/sad_test.cc b/test/sad_test.cc index 12a6206b95..7ce25343f6 100644 --- a/test/sad_test.cc +++ b/test/sad_test.cc @@ -1147,8 +1147,11 @@ INSTANTIATE_TEST_SUITE_P(LSX, SADavgTest, ::testing::ValuesIn(avg_lsx_tests)); const SadMxNx4Param x4d_lsx_tests[] = { SadMxNx4Param(64, 64, &vpx_sad64x64x4d_lsx), + SadMxNx4Param(64, 32, &vpx_sad64x32x4d_lsx), + SadMxNx4Param(32, 64, &vpx_sad32x64x4d_lsx), SadMxNx4Param(32, 32, &vpx_sad32x32x4d_lsx), SadMxNx4Param(16, 16, &vpx_sad16x16x4d_lsx), + SadMxNx4Param(8, 8, &vpx_sad8x8x4d_lsx), }; INSTANTIATE_TEST_SUITE_P(LSX, SADx4Test, ::testing::ValuesIn(x4d_lsx_tests)); #endif // HAVE_LSX diff --git a/vpx_dsp/loongarch/sad_lsx.c b/vpx_dsp/loongarch/sad_lsx.c index 30464b3661..4764acbf88 100644 --- a/vpx_dsp/loongarch/sad_lsx.c +++ b/vpx_dsp/loongarch/sad_lsx.c @@ -165,6 +165,81 @@ static uint32_t sad_64width_lsx(const uint8_t *src, int32_t src_stride, return sad; } +static void sad_8width_x4d_lsx(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *const aref_ptr[], + int32_t ref_stride, int32_t height, + uint32_t *sad_array) { + int32_t ht_cnt = (height >> 2); + uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr; + __m128i src0, src1, src2, src3, sad_tmp; + __m128i ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7; + __m128i ref8, ref9, ref10, ref11, ref12, ref13, ref14, ref15; + __m128i sad0 = __lsx_vldi(0); + __m128i sad1 = sad0; + __m128i sad2 = sad0; + __m128i sad3 = sad0; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + int32_t ref_stride2 = ref_stride << 1; + int32_t ref_stride3 = ref_stride2 + ref_stride; + int32_t ref_stride4 = ref_stride2 << 1; + + ref0_ptr = aref_ptr[0]; + ref1_ptr = aref_ptr[1]; + ref2_ptr = aref_ptr[2]; + ref3_ptr = aref_ptr[3]; + + for (; ht_cnt--;) { + src0 = __lsx_vld(src_ptr, 0); + DUP2_ARG2(__lsx_vldx, src_ptr, src_stride, src_ptr, src_stride2, src1, + src2); + src3 = __lsx_vldx(src_ptr, src_stride3); + src_ptr += src_stride4; + ref0 = __lsx_vld(ref0_ptr, 0); + DUP2_ARG2(__lsx_vldx, ref0_ptr, ref_stride, ref0_ptr, ref_stride2, ref1, + ref2); + ref3 = __lsx_vldx(ref0_ptr, ref_stride3); + ref0_ptr += ref_stride4; + ref4 = __lsx_vld(ref1_ptr, 0); + DUP2_ARG2(__lsx_vldx, ref1_ptr, ref_stride, ref1_ptr, ref_stride2, ref5, + ref6); + ref7 = __lsx_vldx(ref1_ptr, ref_stride3); + ref1_ptr += ref_stride4; + ref8 = __lsx_vld(ref2_ptr, 0); + DUP2_ARG2(__lsx_vldx, ref2_ptr, ref_stride, ref2_ptr, ref_stride2, ref9, + ref10); + ref11 = __lsx_vldx(ref2_ptr, ref_stride3); + ref2_ptr += ref_stride4; + ref12 = __lsx_vld(ref3_ptr, 0); + DUP2_ARG2(__lsx_vldx, ref3_ptr, ref_stride, ref3_ptr, ref_stride2, ref13, + ref14); + ref15 = __lsx_vldx(ref3_ptr, ref_stride3); + ref3_ptr += ref_stride4; + + DUP2_ARG2(__lsx_vpickev_d, src1, src0, src3, src2, src0, src1); + DUP2_ARG2(__lsx_vpickev_d, ref1, ref0, ref3, ref2, ref0, ref1); + sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad0 = __lsx_vadd_h(sad0, sad_tmp); + + DUP2_ARG2(__lsx_vpickev_d, ref5, ref4, ref7, ref6, ref0, ref1); + sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad1 = __lsx_vadd_h(sad1, sad_tmp); + + DUP2_ARG2(__lsx_vpickev_d, ref9, ref8, ref11, ref10, ref0, ref1); + sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad2 = __lsx_vadd_h(sad2, sad_tmp); + + DUP2_ARG2(__lsx_vpickev_d, ref13, ref12, ref15, ref14, ref0, ref1); + sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad3 = __lsx_vadd_h(sad3, sad_tmp); + } + sad_array[0] = HADD_UH_U32(sad0); + sad_array[1] = HADD_UH_U32(sad1); + sad_array[2] = HADD_UH_U32(sad2); + sad_array[3] = HADD_UH_U32(sad3); +} + static void sad_16width_x4d_lsx(const uint8_t *src_ptr, int32_t src_stride, const uint8_t *const aref_ptr[], int32_t ref_stride, int32_t height, @@ -527,6 +602,13 @@ static uint32_t avgsad_64width_lsx(const uint8_t *src, int32_t src_stride, return sad_64width_lsx(src, src_stride, ref, ref_stride, height); \ } +#define VPX_SAD_8xHTx4D_LSX(height) \ + void vpx_sad8x##height##x4d_lsx(const uint8_t *src, int32_t src_stride, \ + const uint8_t *const refs[4], \ + int32_t ref_stride, uint32_t sads[4]) { \ + sad_8width_x4d_lsx(src, src_stride, refs, ref_stride, height, sads); \ + } + #define VPX_SAD_16xHTx4D_LSX(height) \ void vpx_sad16x##height##x4d_lsx(const uint8_t *src, int32_t src_stride, \ const uint8_t *const refs[], \ @@ -564,13 +646,15 @@ static uint32_t avgsad_64width_lsx(const uint8_t *src, int32_t src_stride, second_pred); \ } -#define SAD64 \ - VPX_SAD_64xHT_LSX(64) VPX_SAD_64xHTx4D_LSX(64) VPX_AVGSAD_64xHT_LSX(64) +#define SAD64 \ + VPX_SAD_64xHT_LSX(64) VPX_SAD_64xHTx4D_LSX(64) VPX_SAD_64xHTx4D_LSX(32) \ + VPX_AVGSAD_64xHT_LSX(64) SAD64 -#define SAD32 \ - VPX_SAD_32xHT_LSX(32) VPX_SAD_32xHTx4D_LSX(32) VPX_AVGSAD_32xHT_LSX(32) +#define SAD32 \ + VPX_SAD_32xHT_LSX(32) VPX_SAD_32xHTx4D_LSX(32) VPX_SAD_32xHTx4D_LSX(64) \ + VPX_AVGSAD_32xHT_LSX(32) SAD32 @@ -578,6 +662,11 @@ SAD32 SAD16 +#define SAD8 VPX_SAD_8xHTx4D_LSX(8) + +SAD8 + #undef SAD64 #undef SAD32 #undef SAD16 +#undef SAD8 diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 68d4f86f2f..b441b337b4 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -881,10 +881,10 @@ () specialize qw/vpx_sad64x64x4d avx512 avx2 neon msa sse2 vsx mmi lsx/; add_proto qw/void vpx_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; -specialize qw/vpx_sad64x32x4d neon msa sse2 vsx mmi/; +specialize qw/vpx_sad64x32x4d neon msa sse2 vsx mmi lsx/; add_proto qw/void vpx_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; -specialize qw/vpx_sad32x64x4d neon msa sse2 vsx mmi/; +specialize qw/vpx_sad32x64x4d neon msa sse2 vsx mmi lsx/; add_proto qw/void vpx_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad32x32x4d avx2 neon msa sse2 vsx mmi lsx/; @@ -905,7 +905,7 @@ () specialize qw/vpx_sad8x16x4d neon msa sse2 mmi/; add_proto qw/void vpx_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; -specialize qw/vpx_sad8x8x4d neon msa sse2 mmi/; +specialize qw/vpx_sad8x8x4d neon msa sse2 mmi lsx/; add_proto qw/void vpx_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad8x4x4d neon msa sse2 mmi/; From 872732b2c90eda09f6db1a21b5eee6dc36e813f3 Mon Sep 17 00:00:00 2001 From: James Zern Date: Thu, 28 Apr 2022 17:45:47 -0700 Subject: [PATCH 0359/1000] examples: add missing argv_dup alloc checks Change-Id: Ia3080cbf50071d599c7168a20466392a963f101a --- args.c | 1 + examples/vp9_spatial_svc_encoder.c | 4 ++++ vpxdec.c | 9 ++++++++- vpxenc.c | 4 ++++ 4 files changed, 17 insertions(+), 1 deletion(-) diff --git a/args.c b/args.c index 17b615584e..4afb9c021a 100644 --- a/args.c +++ b/args.c @@ -83,6 +83,7 @@ const char *arg_next(struct arg *arg) { char **argv_dup(int argc, const char **argv) { char **new_argv = malloc((argc + 1) * sizeof(*argv)); + if (!new_argv) return NULL; memcpy(new_argv, argv, argc * sizeof(*argv)); new_argv[argc] = NULL; diff --git a/examples/vp9_spatial_svc_encoder.c b/examples/vp9_spatial_svc_encoder.c index 455f6c9036..c45edb9ae0 100644 --- a/examples/vp9_spatial_svc_encoder.c +++ b/examples/vp9_spatial_svc_encoder.c @@ -222,6 +222,10 @@ static void parse_command_line(int argc, const char **argv_, // process command line options argv = argv_dup(argc - 1, argv_ + 1); + if (!argv) { + fprintf(stderr, "Error allocating argument list\n"); + exit(EXIT_FAILURE); + } for (argi = argj = argv; (*argj = *argi); argi += arg.argv_step) { arg.argv_step = 1; diff --git a/vpxdec.c b/vpxdec.c index 363eb1a24b..84cef7dfd4 100644 --- a/vpxdec.c +++ b/vpxdec.c @@ -581,7 +581,10 @@ static int main_loop(int argc, const char **argv_) { /* Parse command line */ exec_name = argv_[0]; argv = argv_dup(argc - 1, argv_ + 1); - + if (!argv) { + fprintf(stderr, "Error allocating argument list\n"); + return EXIT_FAILURE; + } for (argi = argj = argv; (*argj = *argi); argi += arg.argv_step) { memset(&arg, 0, sizeof(arg)); arg.argv_step = 1; @@ -1123,6 +1126,10 @@ int main(int argc, const char **argv_) { int error = 0; argv = argv_dup(argc - 1, argv_ + 1); + if (!argv) { + fprintf(stderr, "Error allocating argument list\n"); + return EXIT_FAILURE; + } for (argi = argj = argv; (*argj = *argi); argi += arg.argv_step) { memset(&arg, 0, sizeof(arg)); arg.argv_step = 1; diff --git a/vpxenc.c b/vpxenc.c index b64b6cf441..7eff97b132 100644 --- a/vpxenc.c +++ b/vpxenc.c @@ -1703,6 +1703,10 @@ int main(int argc, const char **argv_) { * codec. */ argv = argv_dup(argc - 1, argv_ + 1); + if (!argv) { + fprintf(stderr, "Error allocating argument list\n"); + return EXIT_FAILURE; + } parse_global_config(&global, argv); if (argc < 3) usage_exit(); From 8ac72859e16934df6e598283997d141e9493fc05 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Tue, 3 May 2022 10:44:49 -0400 Subject: [PATCH 0360/1000] vp9 svc sample: set fps from y4m file Change-Id: I082c0409910da4cda5bf852b20ffa11ba5c2ebd6 --- examples/vp9_spatial_svc_encoder.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/vp9_spatial_svc_encoder.c b/examples/vp9_spatial_svc_encoder.c index c45edb9ae0..e85dbf8e71 100644 --- a/examples/vp9_spatial_svc_encoder.c +++ b/examples/vp9_spatial_svc_encoder.c @@ -361,6 +361,8 @@ static void parse_command_line(int argc, const char **argv_, if (app_input->input_ctx.file_type == FILE_TYPE_Y4M) { enc_cfg->g_w = app_input->input_ctx.width; enc_cfg->g_h = app_input->input_ctx.height; + enc_cfg->g_timebase.den = app_input->input_ctx.framerate.numerator; + enc_cfg->g_timebase.num = app_input->input_ctx.framerate.denominator; } if (enc_cfg->g_w < 16 || enc_cfg->g_w % 2 || enc_cfg->g_h < 16 || From f3b4c9a8f65fb8f35d0e77d2fa62bcd075bbd738 Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 6 May 2022 11:47:06 -0700 Subject: [PATCH 0361/1000] vp8[cd]x.h: document vpx_codec_vp[89]_[cd]x* + mark the _algo variables as deprecated. this quiets some doxygen warnings Bug: webm:1752 Change-Id: I53b9b796c3d8fef5c713ee4278641198f95b5864 --- vpx/vp8cx.h | 16 ++++++++++++++++ vpx/vp8dx.h | 16 ++++++++++++++++ 2 files changed, 32 insertions(+) diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h index 5665a5f036..f5dc6d1188 100644 --- a/vpx/vp8cx.h +++ b/vpx/vp8cx.h @@ -33,7 +33,15 @@ extern "C" { * This interface provides the capability to encode raw VP8 streams. * @{ */ + +/*!\brief A single instance of the VP8 encoder. + *\deprecated This access mechanism is provided for backwards compatibility; + * prefer vpx_codec_vp8_cx(). + */ extern vpx_codec_iface_t vpx_codec_vp8_cx_algo; + +/*!\brief The interface to the VP8 encoder. + */ extern vpx_codec_iface_t *vpx_codec_vp8_cx(void); /*!@} - end algorithm interface member group*/ @@ -42,7 +50,15 @@ extern vpx_codec_iface_t *vpx_codec_vp8_cx(void); * This interface provides the capability to encode raw VP9 streams. * @{ */ + +/*!\brief A single instance of the VP9 encoder. + *\deprecated This access mechanism is provided for backwards compatibility; + * prefer vpx_codec_vp9_cx(). + */ extern vpx_codec_iface_t vpx_codec_vp9_cx_algo; + +/*!\brief The interface to the VP9 encoder. + */ extern vpx_codec_iface_t *vpx_codec_vp9_cx(void); /*!@} - end algorithm interface member group*/ diff --git a/vpx/vp8dx.h b/vpx/vp8dx.h index 506a8936be..8c13649f4a 100644 --- a/vpx/vp8dx.h +++ b/vpx/vp8dx.h @@ -32,7 +32,15 @@ extern "C" { * This interface provides the capability to decode VP8 streams. * @{ */ + +/*!\brief A single instance of the VP8 decoder. + *\deprecated This access mechanism is provided for backwards compatibility; + * prefer vpx_codec_vp8_dx(). + */ extern vpx_codec_iface_t vpx_codec_vp8_dx_algo; + +/*!\brief The interface to the VP8 decoder. + */ extern vpx_codec_iface_t *vpx_codec_vp8_dx(void); /*!@} - end algorithm interface member group*/ @@ -41,7 +49,15 @@ extern vpx_codec_iface_t *vpx_codec_vp8_dx(void); * This interface provides the capability to decode VP9 streams. * @{ */ + +/*!\brief A single instance of the VP9 decoder. + *\deprecated This access mechanism is provided for backwards compatibility; + * prefer vpx_codec_vp9_dx(). + */ extern vpx_codec_iface_t vpx_codec_vp9_dx_algo; + +/*!\brief The interface to the VP9 decoder. + */ extern vpx_codec_iface_t *vpx_codec_vp9_dx(void); /*!@} - end algorithm interface member group*/ From cb1abee1455ac7e552da271ac64c71d117caaa77 Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 6 May 2022 11:55:56 -0700 Subject: [PATCH 0362/1000] add some missing realloc checks Change-Id: I0fd1e094085c18b1d9a32333e876c2affeb6de23 --- examples/twopass_encoder.c | 1 + test/vp9_ethread_test.cc | 1 + tools/tiny_ssim.c | 4 ++++ 3 files changed, 6 insertions(+) diff --git a/examples/twopass_encoder.c b/examples/twopass_encoder.c index 07ba37dfd0..07a10d9cf3 100644 --- a/examples/twopass_encoder.c +++ b/examples/twopass_encoder.c @@ -84,6 +84,7 @@ static int get_frame_stats(vpx_codec_ctx_t *ctx, const vpx_image_t *img, const uint8_t *const pkt_buf = pkt->data.twopass_stats.buf; const size_t pkt_size = pkt->data.twopass_stats.sz; stats->buf = realloc(stats->buf, stats->sz + pkt_size); + if (!stats->buf) die("Failed to reallocate stats buffer."); memcpy((uint8_t *)stats->buf + stats->sz, pkt_buf, pkt_size); stats->sz += pkt_size; } diff --git a/test/vp9_ethread_test.cc b/test/vp9_ethread_test.cc index 21caf7918c..238366cb60 100644 --- a/test/vp9_ethread_test.cc +++ b/test/vp9_ethread_test.cc @@ -98,6 +98,7 @@ class VPxFirstPassEncoderThreadTest firstpass_stats_.buf = realloc(firstpass_stats_.buf, firstpass_stats_.sz + pkt_size); + ASSERT_NE(firstpass_stats_.buf, nullptr); memcpy((uint8_t *)firstpass_stats_.buf + firstpass_stats_.sz, pkt_buf, pkt_size); firstpass_stats_.sz += pkt_size; diff --git a/tools/tiny_ssim.c b/tools/tiny_ssim.c index 1577970488..8fba814621 100644 --- a/tools/tiny_ssim.c +++ b/tools/tiny_ssim.c @@ -453,6 +453,10 @@ int main(int argc, char *argv[]) { psnry = realloc(psnry, allocated_frames * sizeof(*psnry)); psnru = realloc(psnru, allocated_frames * sizeof(*psnru)); psnrv = realloc(psnrv, allocated_frames * sizeof(*psnrv)); + if (!(ssimy && ssimu && ssimv && psnry && psnru && psnrv)) { + fprintf(stderr, "Error allocating SSIM/PSNR data.\n"); + exit(EXIT_FAILURE); + } } psnr_and_ssim(ssimy[n_frames], psnry[n_frames], y[0], y[1], w, h); psnr_and_ssim(ssimu[n_frames], psnru[n_frames], u[0], u[1], (w + 1) / 2, From 258affdeab68ed59e181368baa46e2f1d077b0ab Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Tue, 3 May 2022 12:24:44 +0300 Subject: [PATCH 0363/1000] [NEON] Optimize vp9_diamond_search_sad() for NEON About 50% improvement in comparison to the C function. I have followed the AVX version with some simplifications. Change-Id: I72ddbdb2fbc5ed8a7f0210703fe05523a37db1c9 --- vp9/common/vp9_rtcd_defs.pl | 2 +- .../arm/neon/vp9_diamond_search_sad_neon.c | 322 ++++++++++++++++++ vp9/vp9cx.mk | 1 + 3 files changed, 324 insertions(+), 1 deletion(-) create mode 100644 vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index 4da0b6675b..e6b65c96f0 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -175,7 +175,7 @@ () # Motion search # add_proto qw/int vp9_diamond_search_sad/, "const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv"; -specialize qw/vp9_diamond_search_sad avx/; +specialize qw/vp9_diamond_search_sad avx neon/; # # Apply temporal filter diff --git a/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c b/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c new file mode 100644 index 0000000000..e56733d43e --- /dev/null +++ b/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c @@ -0,0 +1,322 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "vpx_dsp/vpx_dsp_common.h" +#include "vp9/encoder/vp9_encoder.h" +#include "vpx_ports/mem.h" + +#ifdef __GNUC__ +#define LIKELY(v) __builtin_expect(v, 1) +#define UNLIKELY(v) __builtin_expect(v, 0) +#else +#define LIKELY(v) (v) +#define UNLIKELY(v) (v) +#endif + +static INLINE int_mv pack_int_mv(int16_t row, int16_t col) { + int_mv result; + result.as_mv.row = row; + result.as_mv.col = col; + return result; +} + +static INLINE MV_JOINT_TYPE get_mv_joint(const int_mv mv) { + // This is simplified from the C implementation to utilise that + // x->nmvjointsadcost[1] == x->nmvjointsadcost[2] and + // x->nmvjointsadcost[1] == x->nmvjointsadcost[3] + return mv.as_int == 0 ? 0 : 1; +} + +static INLINE int mv_cost(const int_mv mv, const int *joint_cost, + int *const comp_cost[2]) { + assert(mv.as_mv.row >= -MV_MAX && mv.as_mv.row < MV_MAX); + assert(mv.as_mv.col >= -MV_MAX && mv.as_mv.col < MV_MAX); + return joint_cost[get_mv_joint(mv)] + comp_cost[0][mv.as_mv.row] + + comp_cost[1][mv.as_mv.col]; +} + +static int mvsad_err_cost(const MACROBLOCK *x, const int_mv mv, const MV *ref, + int sad_per_bit) { + const int_mv diff = + pack_int_mv(mv.as_mv.row - ref->row, mv.as_mv.col - ref->col); + return ROUND_POWER_OF_TWO( + (unsigned)mv_cost(diff, x->nmvjointsadcost, x->nmvsadcost) * sad_per_bit, + VP9_PROB_COST_SHIFT); +} + +/***************************************************************************** + * This function utilizes 3 properties of the cost function lookup tables, * + * constructed in using 'cal_nmvjointsadcost' and 'cal_nmvsadcosts' in * + * vp9_encoder.c. * + * For the joint cost: * + * - mvjointsadcost[1] == mvjointsadcost[2] == mvjointsadcost[3] * + * For the component costs: * + * - For all i: mvsadcost[0][i] == mvsadcost[1][i] * + * (Equal costs for both components) * + * - For all i: mvsadcost[0][i] == mvsadcost[0][-i] * + * (Cost function is even) * + * If these do not hold, then this function cannot be used without * + * modification, in which case you can revert to using the C implementation, * + * which does not rely on these properties. * + *****************************************************************************/ +int vp9_diamond_search_sad_neon(const MACROBLOCK *x, + const search_site_config *cfg, MV *ref_mv, + MV *best_mv, int search_param, int sad_per_bit, + int *num00, const vp9_variance_fn_ptr_t *fn_ptr, + const MV *center_mv) { + static const uint32_t data[4] = { 0, 1, 2, 3 }; + const uint32x4_t v_idx_d = vld1q_u32((const uint32_t *)data); + + const int32x4_t zero_s32 = vdupq_n_s32(0); + const int_mv maxmv = pack_int_mv(x->mv_limits.row_max, x->mv_limits.col_max); + const int16x8_t v_max_mv_w = vreinterpretq_s16_s32(vdupq_n_s32(maxmv.as_int)); + const int_mv minmv = pack_int_mv(x->mv_limits.row_min, x->mv_limits.col_min); + const int16x8_t v_min_mv_w = vreinterpretq_s16_s32(vdupq_n_s32(minmv.as_int)); + + const int32x4_t v_spb_d = vdupq_n_s32(sad_per_bit); + + const int32x4_t v_joint_cost_0_d = vdupq_n_s32(x->nmvjointsadcost[0]); + const int32x4_t v_joint_cost_1_d = vdupq_n_s32(x->nmvjointsadcost[1]); + + // search_param determines the length of the initial step and hence the number + // of iterations. + // 0 = initial step (MAX_FIRST_STEP) pel + // 1 = (MAX_FIRST_STEP/2) pel, + // 2 = (MAX_FIRST_STEP/4) pel... + const MV *ss_mv = &cfg->ss_mv[cfg->searches_per_step * search_param]; + const intptr_t *ss_os = &cfg->ss_os[cfg->searches_per_step * search_param]; + const int tot_steps = cfg->total_steps - search_param; + + const int_mv fcenter_mv = + pack_int_mv(center_mv->row >> 3, center_mv->col >> 3); + const int16x8_t vfcmv = vdupq_n_s16(fcenter_mv.as_int); + + const int ref_row = clamp(ref_mv->row, minmv.as_mv.row, maxmv.as_mv.row); + const int ref_col = clamp(ref_mv->col, minmv.as_mv.col, maxmv.as_mv.col); + + int_mv bmv = pack_int_mv(ref_row, ref_col); + int_mv new_bmv = bmv; + int16x8_t v_bmv_w = vreinterpretq_s16_s32(vdupq_n_s32(bmv.as_int)); + + const int what_stride = x->plane[0].src.stride; + const int in_what_stride = x->e_mbd.plane[0].pre[0].stride; + const uint8_t *const what = x->plane[0].src.buf; + const uint8_t *const in_what = + x->e_mbd.plane[0].pre[0].buf + ref_row * in_what_stride + ref_col; + + // Work out the start point for the search + const uint8_t *best_address = in_what; + const uint8_t *new_best_address = best_address; +#if defined(__aarch64__) + int64x2_t v_ba_q = vdupq_n_s64((intptr_t)best_address); +#else + int32x4_t v_ba_d = vdupq_n_s32((intptr_t)best_address); +#endif + unsigned int best_sad = INT_MAX; + int i, j, step; + + // Check the prerequisite cost function properties that are easy to check + // in an assert. See the function-level documentation for details on all + // prerequisites. + assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[2]); + assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[3]); + + // Check the starting position + best_sad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride); + best_sad += mvsad_err_cost(x, bmv, &fcenter_mv.as_mv, sad_per_bit); + + *num00 = 0; + + for (i = 0, step = 0; step < tot_steps; step++) { + for (j = 0; j < cfg->searches_per_step; j += 4, i += 4) { + int16x8_t v_diff_mv_w; + int8x16_t v_inside_d; + uint32x4_t v_outside_d; + int32x4_t v_cost_d, v_sad_d; +#if defined(__aarch64__) + int64x2_t v_blocka[2]; +#else + int32x4_t v_blocka[1]; + uint32x2_t horiz_max_0, horiz_max_1; +#endif + + uint32_t horiz_max; + // Compute the candidate motion vectors + const int16x8_t v_ss_mv_w = vld1q_s16((const int16_t *)&ss_mv[i]); + const int16x8_t v_these_mv_w = vaddq_s16(v_bmv_w, v_ss_mv_w); + // Clamp them to the search bounds + int16x8_t v_these_mv_clamp_w = v_these_mv_w; + v_these_mv_clamp_w = vminq_s16(v_these_mv_clamp_w, v_max_mv_w); + v_these_mv_clamp_w = vmaxq_s16(v_these_mv_clamp_w, v_min_mv_w); + // The ones that did not change are inside the search area + v_inside_d = vreinterpretq_s8_u32( + vceqq_s32(vreinterpretq_s32_s16(v_these_mv_clamp_w), + vreinterpretq_s32_s16(v_these_mv_w))); + + // If none of them are inside, then move on +#if defined(__aarch64__) + horiz_max = vmaxvq_u32(vreinterpretq_u32_s8(v_inside_d)); +#else + horiz_max_0 = vmax_u32(vget_low_u32(vreinterpretq_u32_s8(v_inside_d)), + vget_high_u32(vreinterpretq_u32_s8(v_inside_d))); + horiz_max_1 = vpmax_u32(horiz_max_0, horiz_max_0); + vst1_lane_u32(&horiz_max, horiz_max_1, 0); +#endif + if (LIKELY(horiz_max == 0)) { + continue; + } + + // The inverse mask indicates which of the MVs are outside + v_outside_d = + vreinterpretq_u32_s8(veorq_s8(v_inside_d, vdupq_n_s8((int8_t)0xff))); + // Shift right to keep the sign bit clear, we will use this later + // to set the cost to the maximum value. + v_outside_d = vshrq_n_u32(v_outside_d, 1); + + // Compute the difference MV + v_diff_mv_w = vsubq_s16(v_these_mv_clamp_w, vfcmv); + // We utilise the fact that the cost function is even, and use the + // absolute difference. This allows us to use unsigned indexes later + // and reduces cache pressure somewhat as only a half of the table + // is ever referenced. + v_diff_mv_w = vabsq_s16(v_diff_mv_w); + + // Compute the SIMD pointer offsets. + { +#if defined(__aarch64__) // sizeof(intptr_t) == 8 + // Load the offsets + int64x2_t v_bo10_q = vld1q_s64((const int64_t *)&ss_os[i + 0]); + int64x2_t v_bo32_q = vld1q_s64((const int64_t *)&ss_os[i + 2]); + // Set the ones falling outside to zero + v_bo10_q = vandq_s64( + v_bo10_q, + vmovl_s32(vget_low_s32(vreinterpretq_s32_s8(v_inside_d)))); + v_bo32_q = vandq_s64( + v_bo32_q, + vmovl_s32(vget_high_s32(vreinterpretq_s32_s8(v_inside_d)))); + // Compute the candidate addresses + v_blocka[0] = vaddq_s64(v_ba_q, v_bo10_q); + v_blocka[1] = vaddq_s64(v_ba_q, v_bo32_q); +#else // sizeof(intptr_t) == 4 + int32x4_t v_bo_d = vld1q_s32((const int32_t *)&ss_os[i]); + v_bo_d = vandq_s32(v_bo_d, vreinterpretq_s32_s8(v_inside_d)); + v_blocka[0] = vaddq_s32(v_ba_d, v_bo_d); +#endif + } + + fn_ptr->sdx4df(what, what_stride, (const uint8_t **)&v_blocka[0], + in_what_stride, (uint32_t *)&v_sad_d); + + // Look up the component cost of the residual motion vector + { + uint32_t cost[4]; + int16_t __attribute__((aligned(16))) rowcol[8]; + vst1q_s16(rowcol, v_diff_mv_w); + + // Note: This is a use case for gather instruction + cost[0] = x->nmvsadcost[0][rowcol[0]] + x->nmvsadcost[0][rowcol[1]]; + cost[1] = x->nmvsadcost[0][rowcol[2]] + x->nmvsadcost[0][rowcol[3]]; + cost[2] = x->nmvsadcost[0][rowcol[4]] + x->nmvsadcost[0][rowcol[5]]; + cost[3] = x->nmvsadcost[0][rowcol[6]] + x->nmvsadcost[0][rowcol[7]]; + + v_cost_d = vld1q_s32((int32_t *)cost); + } + + // Now add in the joint cost + { + const uint32x4_t v_sel_d = + vceqq_s32(vreinterpretq_s32_s16(v_diff_mv_w), zero_s32); + const int32x4_t v_joint_cost_d = vreinterpretq_s32_u8( + vbslq_u8(vreinterpretq_u8_u32(v_sel_d), + vreinterpretq_u8_s32(v_joint_cost_0_d), + vreinterpretq_u8_s32(v_joint_cost_1_d))); + v_cost_d = vaddq_s32(v_cost_d, v_joint_cost_d); + } + + // Multiply by sad_per_bit + v_cost_d = vmulq_s32(v_cost_d, v_spb_d); + // ROUND_POWER_OF_TWO(v_cost_d, VP9_PROB_COST_SHIFT) + v_cost_d = + vaddq_s32(v_cost_d, vdupq_n_s32(1 << (VP9_PROB_COST_SHIFT - 1))); + v_cost_d = vshrq_n_s32(v_cost_d, VP9_PROB_COST_SHIFT); + // Add the cost to the sad + v_sad_d = vaddq_s32(v_sad_d, v_cost_d); + + // Make the motion vectors outside the search area have max cost + // by or'ing in the comparison mask, this way the minimum search won't + // pick them. + v_sad_d = vorrq_s32(v_sad_d, vreinterpretq_s32_u32(v_outside_d)); + + // Find the minimum value and index horizontally in v_sad_d + { + uint32_t local_best_sad; +#if defined(__aarch64__) + local_best_sad = vminvq_u32(vreinterpretq_u32_s32(v_sad_d)); +#else + uint32x2_t horiz_min_0 = + vmin_u32(vget_low_u32(vreinterpretq_u32_s32(v_sad_d)), + vget_high_u32(vreinterpretq_u32_s32(v_sad_d))); + uint32x2_t horiz_min_1 = vpmin_u32(horiz_min_0, horiz_min_0); + vst1_lane_u32(&local_best_sad, horiz_min_1, 0); +#endif + + // Update the global minimum if the local minimum is smaller + if (LIKELY(local_best_sad < best_sad)) { +#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#endif + uint32_t local_best_idx; + const uint32x4_t v_sel_d = + vceqq_s32(v_sad_d, vdupq_n_s32(local_best_sad)); + uint32x4_t v_mask_d = vandq_u32(v_sel_d, v_idx_d); + v_mask_d = vbslq_u32(v_sel_d, v_mask_d, vdupq_n_u32(0xffffffff)); + +#if defined(__aarch64__) + local_best_idx = vminvq_u32(v_mask_d); +#else + horiz_min_0 = + vmin_u32(vget_low_u32(v_mask_d), vget_high_u32(v_mask_d)); + horiz_min_1 = vpmin_u32(horiz_min_0, horiz_min_0); + vst1_lane_u32(&local_best_idx, horiz_min_1, 0); +#endif + + new_bmv = ((const int_mv *)&v_these_mv_w)[local_best_idx]; +#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(__clang__) +#pragma GCC diagnostic pop +#endif + new_best_address = ((const uint8_t **)v_blocka)[local_best_idx]; + + best_sad = local_best_sad; + } + } + } + + bmv = new_bmv; + best_address = new_best_address; + + v_bmv_w = vreinterpretq_s16_s32(vdupq_n_s32(bmv.as_int)); +#if defined(__aarch64__) + v_ba_q = vdupq_n_s64((intptr_t)best_address); +#else + v_ba_d = vdupq_n_s32((intptr_t)best_address); +#endif + + if (UNLIKELY(best_address == in_what)) { + (*num00)++; + } + } + + *best_mv = bmv.as_mv; + return best_sad; +} diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk index 92a7fddb9d..c9afd9a347 100644 --- a/vp9/vp9cx.mk +++ b/vp9/vp9cx.mk @@ -113,6 +113,7 @@ VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/temporal_filter_constants.h VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.c VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_quantize_avx2.c VP9_CX_SRCS-$(HAVE_AVX) += encoder/x86/vp9_diamond_search_sad_avx.c +VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_diamond_search_sad_neon.c ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_block_error_intrin_sse2.c VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/highbd_temporal_filter_sse4.c From a6bff83a603affa2799bbacedc24f9ca8632a5c6 Mon Sep 17 00:00:00 2001 From: Marco Paniconi Date: Thu, 12 May 2022 11:18:19 -0700 Subject: [PATCH 0364/1000] vp9-rtc: Fix to interp_filter for segment skip For segment skip feature: allow for setting the mi->interp_filter to BILINEAR, if cm->interp_filter is set BILIENAR. This can happen at speed 9 when the segment skip feature is used (e.g., active_maps) Without this fix the assert can be triggered with the active_map_test.cc for speed 9 included. Updated the test. Fixes the assert triggered in the issue: Bug: webm:1762 Change-Id: I462e0bdd966e4f3cb5b7bc746685916ac8808358 --- test/active_map_test.cc | 3 ++- vp9/encoder/vp9_encodeframe.c | 8 ++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/test/active_map_test.cc b/test/active_map_test.cc index 9c55f9a8b1..7f41009e0f 100644 --- a/test/active_map_test.cc +++ b/test/active_map_test.cc @@ -37,6 +37,7 @@ class ActiveMapTest ::libvpx_test::Encoder *encoder) { if (video->frame() == 0) { encoder->Control(VP8E_SET_CPUUSED, cpu_used_); + encoder->Control(VP9E_SET_AQ_MODE, 3); } else if (video->frame() == 3) { vpx_active_map_t map = vpx_active_map_t(); /* clang-format off */ @@ -87,5 +88,5 @@ TEST_P(ActiveMapTest, Test) { VP9_INSTANTIATE_TEST_SUITE(ActiveMapTest, ::testing::Values(::libvpx_test::kRealTime), - ::testing::Range(0, 9)); + ::testing::Range(0, 10)); } // namespace diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index fc4089865d..9da8f61e32 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -1905,13 +1905,17 @@ void vp9_setup_src_planes(MACROBLOCK *x, const YV12_BUFFER_CONFIG *src, } static void set_mode_info_seg_skip(MACROBLOCK *x, TX_MODE tx_mode, + INTERP_FILTER interp_filter, RD_COST *rd_cost, BLOCK_SIZE bsize) { MACROBLOCKD *const xd = &x->e_mbd; MODE_INFO *const mi = xd->mi[0]; INTERP_FILTER filter_ref; filter_ref = get_pred_context_switchable_interp(xd); - if (filter_ref == SWITCHABLE_FILTERS) filter_ref = EIGHTTAP; + if (interp_filter == BILINEAR) + filter_ref = BILINEAR; + else if (filter_ref == SWITCHABLE_FILTERS) + filter_ref = EIGHTTAP; mi->sb_type = bsize; mi->mode = ZEROMV; @@ -4682,7 +4686,7 @@ static void nonrd_pick_sb_modes(VP9_COMP *cpi, TileDataEnc *tile_data, hybrid_search_svc_baseiskey(cpi, x, rd_cost, bsize, ctx, tile_data, mi_row, mi_col); else if (segfeature_active(&cm->seg, mi->segment_id, SEG_LVL_SKIP)) - set_mode_info_seg_skip(x, cm->tx_mode, rd_cost, bsize); + set_mode_info_seg_skip(x, cm->tx_mode, cm->interp_filter, rd_cost, bsize); else if (bsize >= BLOCK_8X8) { if (cpi->rc.hybrid_intra_scene_change) hybrid_search_scene_change(cpi, x, rd_cost, bsize, ctx, tile_data, mi_row, From 617698706ccff16b43091edb9cf94d2d3eda7c5f Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Thu, 12 May 2022 21:05:24 -0400 Subject: [PATCH 0365/1000] Add aq mode 0 and 3 to active map test Bug: webm:1762 Change-Id: Ia827f6686e8d0cdc09f3d07d07dacaa4fcd801ab --- test/active_map_test.cc | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/test/active_map_test.cc b/test/active_map_test.cc index 7f41009e0f..543ec0d358 100644 --- a/test/active_map_test.cc +++ b/test/active_map_test.cc @@ -19,7 +19,8 @@ namespace { class ActiveMapTest : public ::libvpx_test::EncoderTest, - public ::libvpx_test::CodecTestWith2Params { + public ::libvpx_test::CodecTestWith3Params { protected: static const int kWidth = 208; static const int kHeight = 144; @@ -37,7 +38,7 @@ class ActiveMapTest ::libvpx_test::Encoder *encoder) { if (video->frame() == 0) { encoder->Control(VP8E_SET_CPUUSED, cpu_used_); - encoder->Control(VP9E_SET_AQ_MODE, 3); + encoder->Control(VP9E_SET_AQ_MODE, GET_PARAM(3)); } else if (video->frame() == 3) { vpx_active_map_t map = vpx_active_map_t(); /* clang-format off */ @@ -88,5 +89,5 @@ TEST_P(ActiveMapTest, Test) { VP9_INSTANTIATE_TEST_SUITE(ActiveMapTest, ::testing::Values(::libvpx_test::kRealTime), - ::testing::Range(0, 10)); + ::testing::Range(5, 10), ::testing::Values(0, 3)); } // namespace From 0d51bb2fc5e1e5581d8d378aad3ac61b3205b3b7 Mon Sep 17 00:00:00 2001 From: yuanhecai Date: Tue, 12 Apr 2022 16:02:55 +0800 Subject: [PATCH 0366/1000] vp9[loongarch]: Optimize vpx_hadamard_16x16/8x8 1. vpx_hadamard_16x16_lsx 2. vpx_hadamard_8x8_lsx Bug: webm:1755 Change-Id: I3b1e0a2c026c3806b7bbbd191d0edf0e78912af7 --- test/hadamard_test.cc | 7 ++ vpx_dsp/loongarch/avg_lsx.c | 90 +++++++++++++++++++++ vpx_dsp/loongarch/bitdepth_conversion_lsx.h | 48 +++++++++++ vpx_dsp/vpx_dsp.mk | 4 + vpx_dsp/vpx_dsp_rtcd_defs.pl | 8 +- 5 files changed, 153 insertions(+), 4 deletions(-) create mode 100644 vpx_dsp/loongarch/avg_lsx.c create mode 100644 vpx_dsp/loongarch/bitdepth_conversion_lsx.h diff --git a/test/hadamard_test.cc b/test/hadamard_test.cc index dab945a561..10b1e79c10 100644 --- a/test/hadamard_test.cc +++ b/test/hadamard_test.cc @@ -285,6 +285,13 @@ INSTANTIATE_TEST_SUITE_P( HadamardFuncWithSize(&vpx_hadamard_16x16_vsx, 16))); #endif // HAVE_VSX +#if HAVE_LSX +INSTANTIATE_TEST_SUITE_P( + LSX, HadamardLowbdTest, + ::testing::Values(HadamardFuncWithSize(&vpx_hadamard_8x8_lsx, 8), + HadamardFuncWithSize(&vpx_hadamard_16x16_lsx, 16))); +#endif // HAVE_LSX + #if CONFIG_VP9_HIGHBITDEPTH class HadamardHighbdTest : public HadamardTestBase { protected: diff --git a/vpx_dsp/loongarch/avg_lsx.c b/vpx_dsp/loongarch/avg_lsx.c new file mode 100644 index 0000000000..750c9de29f --- /dev/null +++ b/vpx_dsp/loongarch/avg_lsx.c @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/loongarch/bitdepth_conversion_lsx.h" + +void vpx_hadamard_8x8_lsx(const int16_t *src, ptrdiff_t src_stride, + tran_low_t *dst) { + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + ptrdiff_t src_stride2 = src_stride << 1; + ptrdiff_t src_stride3 = src_stride2 + src_stride; + ptrdiff_t src_stride4 = src_stride2 << 1; + ptrdiff_t src_stride6 = src_stride3 << 1; + + int16_t *src_tmp = (int16_t *)src; + src0 = __lsx_vld(src_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_tmp, src_stride2, src_tmp, src_stride4, src1, src2); + src3 = __lsx_vldx(src_tmp, src_stride6); + src_tmp += src_stride4; + src4 = __lsx_vld(src_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_tmp, src_stride2, src_tmp, src_stride4, src5, src6); + src7 = __lsx_vldx(src_tmp, src_stride6); + + LSX_BUTTERFLY_8_H(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, + tmp4, tmp6, tmp7, tmp5, tmp3, tmp1); + LSX_BUTTERFLY_8_H(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, + src4, src5, src7, src6, src3, src2); + LSX_BUTTERFLY_8_H(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, + tmp3, tmp4, tmp5, tmp1, tmp6, tmp2); + LSX_TRANSPOSE8x8_H(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1, + src2, src3, src4, src5, src6, src7); + LSX_BUTTERFLY_8_H(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, + tmp4, tmp6, tmp7, tmp5, tmp3, tmp1); + LSX_BUTTERFLY_8_H(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, + src4, src5, src7, src6, src3, src2); + LSX_BUTTERFLY_8_H(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, + tmp3, tmp4, tmp5, tmp1, tmp6, tmp2); + store_tran_low(tmp0, dst, 0); + store_tran_low(tmp1, dst, 8); + store_tran_low(tmp2, dst, 16); + store_tran_low(tmp3, dst, 24); + store_tran_low(tmp4, dst, 32); + store_tran_low(tmp5, dst, 40); + store_tran_low(tmp6, dst, 48); + store_tran_low(tmp7, dst, 56); +} + +void vpx_hadamard_16x16_lsx(const int16_t *src, ptrdiff_t src_stride, + tran_low_t *dst) { + int i; + __m128i a0, a1, a2, a3, b0, b1, b2, b3; + + /* Rearrange 16x16 to 8x32 and remove stride. + * Top left first. */ + vpx_hadamard_8x8_lsx(src + 0 + 0 * src_stride, src_stride, dst + 0); + /* Top right. */ + vpx_hadamard_8x8_lsx(src + 8 + 0 * src_stride, src_stride, dst + 64); + /* Bottom left. */ + vpx_hadamard_8x8_lsx(src + 0 + 8 * src_stride, src_stride, dst + 128); + /* Bottom right. */ + vpx_hadamard_8x8_lsx(src + 8 + 8 * src_stride, src_stride, dst + 192); + + for (i = 0; i < 64; i += 8) { + a0 = load_tran_low(dst); + a1 = load_tran_low(dst + 64); + a2 = load_tran_low(dst + 128); + a3 = load_tran_low(dst + 192); + + LSX_BUTTERFLY_4_H(a0, a2, a3, a1, b0, b2, b3, b1); + DUP4_ARG2(__lsx_vsrai_h, b0, 1, b1, 1, b2, 1, b3, 1, b0, b1, b2, b3); + LSX_BUTTERFLY_4_H(b0, b1, b3, b2, a0, a1, a3, a2); + + store_tran_low(a0, dst, 0); + store_tran_low(a1, dst, 64); + store_tran_low(a2, dst, 128); + store_tran_low(a3, dst, 192); + + dst += 8; + } +} diff --git a/vpx_dsp/loongarch/bitdepth_conversion_lsx.h b/vpx_dsp/loongarch/bitdepth_conversion_lsx.h new file mode 100644 index 0000000000..4834f18fc0 --- /dev/null +++ b/vpx_dsp/loongarch/bitdepth_conversion_lsx.h @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_LOONGARCH_BITDEPTH_CONVERSION_LSX_H_ +#define VPX_VPX_DSP_LOONGARCH_BITDEPTH_CONVERSION_LSX_H_ + +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_util/loongson_intrinsics.h" + +#if CONFIG_VP9_HIGHBITDEPTH +#define load_tran_low(s) \ + ({ \ + __m128i res0_m; \ + __m128i v0_m = __lsx_vld(s, 0); \ + __m128i v1_m = __lsx_vld(s + 4, 0); \ + res0_m = __lsx_vsrlni_h_w(v0_m, v1_m, 0); \ + res0_m; \ + }) + +#define store_tran_low(v, s, c) \ + { \ + __m128i v0_m, v1_m; \ + v1_m = __lsx_vexth_w_h(v); \ + v0_m = __lsx_vsllwil_w_h(v, 0); \ + __lsx_vst(v0_m, s + c, 0); \ + __lsx_vst(v1_m, s + c + 4, 0); \ + } +#else +#define load_tran_low(s) \ + ({ \ + __m128i res0_m; \ + res0_m = __lsx_vld(s, 0); \ + res0_m; \ + }) + +#define store_tran_low(v, s, c) __lsx_vst(v, s + c, 0) +#endif // CONFIG_VP9_HIGHBITDEPTH + +#endif // VPX_VPX_DSP_LOONGARCH_BITDEPTH_CONVERSION_LSX_H_ diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index ddccfc1f4a..7de8b02055 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -339,6 +339,7 @@ DSP_SRCS-$(HAVE_AVX2) += x86/avg_intrin_avx2.c DSP_SRCS-$(HAVE_NEON) += arm/avg_neon.c DSP_SRCS-$(HAVE_NEON) += arm/hadamard_neon.c DSP_SRCS-$(HAVE_MSA) += mips/avg_msa.c +DSP_SRCS-$(HAVE_LSX) += loongarch/avg_lsx.c ifeq ($(VPX_ARCH_X86_64),yes) DSP_SRCS-$(HAVE_SSSE3) += x86/avg_ssse3_x86_64.asm endif @@ -439,6 +440,9 @@ DSP_SRCS-$(HAVE_VSX) += ppc/bitdepth_conversion_vsx.h DSP_SRCS-$(HAVE_SSE2) += x86/mem_sse2.h DSP_SRCS-$(HAVE_SSE2) += x86/transpose_sse2.h +# LSX utilities +DSP_SRCS-$(HAVE_LSX) += loongarch/bitdepth_conversion_lsx.h + DSP_SRCS-no += $(DSP_SRCS_REMOVE-yes) DSP_SRCS-yes += vpx_dsp_rtcd.c diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index b441b337b4..1c88dcdfa8 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -789,10 +789,10 @@ () if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_hadamard_8x8/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff"; - specialize qw/vpx_hadamard_8x8 sse2 neon vsx/, "$ssse3_x86_64"; + specialize qw/vpx_hadamard_8x8 sse2 neon vsx lsx/, "$ssse3_x86_64"; add_proto qw/void vpx_hadamard_16x16/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff"; - specialize qw/vpx_hadamard_16x16 avx2 sse2 neon vsx/; + specialize qw/vpx_hadamard_16x16 avx2 sse2 neon vsx lsx/; add_proto qw/void vpx_hadamard_32x32/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff"; specialize qw/vpx_hadamard_32x32 sse2 avx2/; @@ -813,10 +813,10 @@ () specialize qw/vpx_highbd_satd avx2/; } else { add_proto qw/void vpx_hadamard_8x8/, "const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff"; - specialize qw/vpx_hadamard_8x8 sse2 neon msa vsx/, "$ssse3_x86_64"; + specialize qw/vpx_hadamard_8x8 sse2 neon msa vsx lsx/, "$ssse3_x86_64"; add_proto qw/void vpx_hadamard_16x16/, "const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff"; - specialize qw/vpx_hadamard_16x16 avx2 sse2 neon msa vsx/; + specialize qw/vpx_hadamard_16x16 avx2 sse2 neon msa vsx lsx/; add_proto qw/void vpx_hadamard_32x32/, "const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff"; specialize qw/vpx_hadamard_32x32 sse2 avx2/; From 65d9ac5b5a3dd1c72c15a1fc5bcc004a43ad4c90 Mon Sep 17 00:00:00 2001 From: yuanhecai Date: Tue, 12 Apr 2022 21:01:53 +0800 Subject: [PATCH 0367/1000] vp9[loongarch]: Optimize fdct4x4/8x8_lsx 1. vpx_fdct4x4_lsx 2. vpx_fdct8x8_lsx Bug: webm:1755 Change-Id: If283fc08f9bedcbecd2c4052adb210f8fe00d4f0 --- test/dct_test.cc | 6 +- test/fdct8x8_test.cc | 7 +++ vpx_dsp/loongarch/fwd_txfm_lsx.c | 92 +++++++++++++++++++++++++++++ vpx_dsp/loongarch/fwd_txfm_lsx.h | 99 ++++++++++++++++++++++++++++++++ vpx_dsp/vpx_dsp_rtcd_defs.pl | 4 +- 5 files changed, 204 insertions(+), 4 deletions(-) diff --git a/test/dct_test.cc b/test/dct_test.cc index 6178f8e2cf..2182f87e5e 100644 --- a/test/dct_test.cc +++ b/test/dct_test.cc @@ -587,7 +587,9 @@ INSTANTIATE_TEST_SUITE_P(VSX, TransDCT, #endif // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && #if HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH -static const FuncInfo dct_lsx_func_info[2] = { +static const FuncInfo dct_lsx_func_info[4] = { + { &fdct_wrapper, &idct_wrapper, 4, 1 }, + { &fdct_wrapper, &idct_wrapper, 8, 1 }, { &fdct_wrapper, &idct_wrapper, 16, 1 }, { &fdct_wrapper, &idct_wrapper, @@ -596,7 +598,7 @@ static const FuncInfo dct_lsx_func_info[2] = { INSTANTIATE_TEST_SUITE_P( LSX, TransDCT, - ::testing::Combine(::testing::Range(0, 2), + ::testing::Combine(::testing::Range(0, 4), ::testing::Values(dct_lsx_func_info), ::testing::Values(0), ::testing::Values(VPX_BITS_8))); #endif // HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH diff --git a/test/fdct8x8_test.cc b/test/fdct8x8_test.cc index 0822666e70..83d1ff1429 100644 --- a/test/fdct8x8_test.cc +++ b/test/fdct8x8_test.cc @@ -768,4 +768,11 @@ INSTANTIATE_TEST_SUITE_P(VSX, FwdTrans8x8DCT, &vpx_idct8x8_64_add_vsx, 0, VPX_BITS_8))); #endif // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE + +#if HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE +INSTANTIATE_TEST_SUITE_P(LSX, FwdTrans8x8DCT, + ::testing::Values(make_tuple(&vpx_fdct8x8_lsx, + &vpx_idct8x8_64_add_c, 0, + VPX_BITS_8))); +#endif // HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE } // namespace diff --git a/vpx_dsp/loongarch/fwd_txfm_lsx.c b/vpx_dsp/loongarch/fwd_txfm_lsx.c index 03f194b433..6f2d4d6fee 100644 --- a/vpx_dsp/loongarch/fwd_txfm_lsx.c +++ b/vpx_dsp/loongarch/fwd_txfm_lsx.c @@ -11,6 +11,20 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/loongarch/fwd_txfm_lsx.h" +#define LSX_TRANSPOSE4x4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \ + { \ + __m128i _s0, _s1, _s2, _s3, _t0, _t1, _t2, _t3; \ + \ + DUP2_ARG2(__lsx_vilvl_h, _in2, _in0, _in3, _in1, _s0, _s1); \ + DUP2_ARG2(__lsx_vilvh_h, _in2, _in0, _in3, _in1, _s2, _s3); \ + _t0 = __lsx_vilvl_h(_s1, _s0); \ + _t1 = __lsx_vilvh_h(_s1, _s0); \ + _t2 = __lsx_vilvl_h(_s3, _s2); \ + _t3 = __lsx_vilvh_h(_s3, _s2); \ + DUP2_ARG2(__lsx_vpickev_d, _t2, _t0, _t3, _t1, _out0, _out2); \ + DUP2_ARG2(__lsx_vpickod_d, _t2, _t0, _t3, _t1, _out1, _out3); \ + } + #if !CONFIG_VP9_HIGHBITDEPTH void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr, int32_t src_stride) { @@ -240,6 +254,84 @@ void fdct16x8_1d_row(int16_t *input, int16_t *output) { __lsx_vst(in7, output, 240); } +void vpx_fdct4x4_lsx(const int16_t *input, int16_t *output, + int32_t src_stride) { + __m128i in0, in1, in2, in3; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride4 = src_stride2 << 1; + int32_t src_stride6 = src_stride4 + src_stride2; + + in0 = __lsx_vld(input, 0); + DUP2_ARG2(__lsx_vldx, input, src_stride2, input, src_stride4, in1, in2); + in3 = __lsx_vldx(input, src_stride6); + + /* fdct4 pre-process */ + { + __m128i vec, mask; + __m128i zero = __lsx_vldi(0); + + mask = __lsx_vinsgr2vr_b(zero, 1, 0); + DUP4_ARG2(__lsx_vslli_h, in0, 4, in1, 4, in2, 4, in3, 4, in0, in1, in2, + in3); + vec = __lsx_vseqi_h(in0, 0); + vec = __lsx_vxori_b(vec, 255); + vec = __lsx_vand_v(mask, vec); + in0 = __lsx_vadd_h(in0, vec); + } + + VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3); + LSX_TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3); + VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3); + LSX_TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3); + DUP4_ARG2(__lsx_vaddi_hu, in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3); + DUP4_ARG2(__lsx_vsrai_h, in0, 2, in1, 2, in2, 2, in3, 2, in0, in1, in2, in3); + DUP2_ARG2(__lsx_vpickev_d, in1, in0, in3, in2, in0, in2); + __lsx_vst(in0, output, 0); + __lsx_vst(in2, output, 16); +} + +void vpx_fdct8x8_lsx(const int16_t *input, int16_t *output, + int32_t src_stride) { + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride4 = src_stride2 << 1; + int32_t src_stride6 = src_stride4 + src_stride2; + int16_t *input_tmp = (int16_t *)input; + + in0 = __lsx_vld(input_tmp, 0); + DUP2_ARG2(__lsx_vldx, input_tmp, src_stride2, input_tmp, src_stride4, in1, + in2); + in3 = __lsx_vldx(input_tmp, src_stride6); + input_tmp += src_stride4; + in4 = __lsx_vld(input_tmp, 0); + DUP2_ARG2(__lsx_vldx, input_tmp, src_stride2, input_tmp, src_stride4, in5, + in6); + in7 = __lsx_vldx(input_tmp, src_stride6); + + DUP4_ARG2(__lsx_vslli_h, in0, 2, in1, 2, in2, 2, in3, 2, in0, in1, in2, in3); + DUP4_ARG2(__lsx_vslli_h, in4, 2, in5, 2, in6, 2, in7, 2, in4, in5, in6, in7); + + VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, + in5, in6, in7); + LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, + in5, in6, in7); + LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7); + + __lsx_vst(in0, output, 0); + __lsx_vst(in1, output, 16); + __lsx_vst(in2, output, 32); + __lsx_vst(in3, output, 48); + __lsx_vst(in4, output, 64); + __lsx_vst(in5, output, 80); + __lsx_vst(in6, output, 96); + __lsx_vst(in7, output, 112); +} + void vpx_fdct16x16_lsx(const int16_t *input, int16_t *output, int32_t src_stride) { int32_t i; diff --git a/vpx_dsp/loongarch/fwd_txfm_lsx.h b/vpx_dsp/loongarch/fwd_txfm_lsx.h index 9ed8102269..d04427a6ea 100644 --- a/vpx_dsp/loongarch/fwd_txfm_lsx.h +++ b/vpx_dsp/loongarch/fwd_txfm_lsx.h @@ -14,6 +14,105 @@ #include "vpx_dsp/loongarch/txfm_macros_lsx.h" #include "vpx_dsp/txfm_common.h" +#define VP9_FDCT4(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + __m128i cnst0_m, cnst1_m, cnst2_m, cnst3_m; \ + __m128i vec0_m, vec1_m, vec2_m, vec3_m; \ + __m128i vec4_m, vec5_m, vec6_m, vec7_m; \ + __m128i coeff_m = { 0x187e3b21d2bf2d41, 0x000000000000c4df }; \ + \ + LSX_BUTTERFLY_4_H(in0, in1, in2, in3, vec0_m, vec1_m, vec2_m, vec3_m); \ + DUP2_ARG2(__lsx_vilvl_h, vec1_m, vec0_m, vec3_m, vec2_m, vec0_m, vec2_m); \ + DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 0, coeff_m, 1, cnst0_m, cnst1_m); \ + cnst1_m = __lsx_vpackev_h(cnst1_m, cnst0_m); \ + vec5_m = __lsx_vdp2_w_h(vec0_m, cnst1_m); \ + DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 4, coeff_m, 3, cnst2_m, cnst3_m); \ + cnst2_m = __lsx_vpackev_h(cnst3_m, cnst2_m); \ + vec7_m = __lsx_vdp2_w_h(vec2_m, cnst2_m); \ + \ + vec4_m = __lsx_vdp2_w_h(vec0_m, cnst0_m); \ + cnst2_m = __lsx_vreplvei_h(coeff_m, 2); \ + cnst2_m = __lsx_vpackev_h(cnst2_m, cnst3_m); \ + vec6_m = __lsx_vdp2_w_h(vec2_m, cnst2_m); \ + \ + DUP4_ARG3(__lsx_vssrarni_h_w, vec4_m, vec4_m, DCT_CONST_BITS, vec5_m, \ + vec5_m, DCT_CONST_BITS, vec6_m, vec6_m, DCT_CONST_BITS, vec7_m, \ + vec7_m, DCT_CONST_BITS, out0, out2, out1, out3); \ + } + +#define VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \ + out3, out4, out5, out6, out7) \ + { \ + __m128i s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m; \ + __m128i s7_m, x0_m, x1_m, x2_m, x3_m; \ + __m128i coeff_m = { 0x187e3b21d2bf2d41, 0x238e35370c7c3ec5 }; \ + \ + /* FDCT stage1 */ \ + LSX_BUTTERFLY_8_H(in0, in1, in2, in3, in4, in5, in6, in7, s0_m, s1_m, \ + s2_m, s3_m, s4_m, s5_m, s6_m, s7_m); \ + LSX_BUTTERFLY_4_H(s0_m, s1_m, s2_m, s3_m, x0_m, x1_m, x2_m, x3_m); \ + DUP2_ARG2(__lsx_vilvh_h, x1_m, x0_m, x3_m, x2_m, s0_m, s2_m); \ + DUP2_ARG2(__lsx_vilvl_h, x1_m, x0_m, x3_m, x2_m, s1_m, s3_m); \ + DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 0, coeff_m, 1, x0_m, x1_m); \ + x1_m = __lsx_vpackev_h(x1_m, x0_m); \ + DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m, out4); \ + \ + DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 2, coeff_m, 3, x2_m, x3_m); \ + x2_m = __lsx_vneg_h(x2_m); \ + x2_m = __lsx_vpackev_h(x3_m, x2_m); \ + DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m, out6); \ + \ + DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m, out0); \ + x2_m = __lsx_vreplvei_h(coeff_m, 2); \ + x2_m = __lsx_vpackev_h(x2_m, x3_m); \ + DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m, out2); \ + \ + /* stage2 */ \ + s1_m = __lsx_vilvl_h(s5_m, s6_m); \ + s0_m = __lsx_vilvh_h(s5_m, s6_m); \ + \ + DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m, s6_m); \ + DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m, s5_m); \ + \ + /* stage3 */ \ + LSX_BUTTERFLY_4_H(s4_m, s7_m, s6_m, s5_m, x0_m, x3_m, x2_m, x1_m); \ + \ + /* stage4 */ \ + DUP2_ARG2(__lsx_vilvh_h, x3_m, x0_m, x2_m, x1_m, s4_m, s6_m); \ + DUP2_ARG2(__lsx_vilvl_h, x3_m, x0_m, x2_m, x1_m, s5_m, s7_m); \ + \ + DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 4, coeff_m, 5, x0_m, x1_m); \ + x1_m = __lsx_vpackev_h(x0_m, x1_m); \ + DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x1_m, out1); \ + \ + DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 6, coeff_m, 7, x2_m, x3_m); \ + x2_m = __lsx_vpackev_h(x3_m, x2_m); \ + DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m, out5); \ + \ + x1_m = __lsx_vreplvei_h(coeff_m, 5); \ + x0_m = __lsx_vneg_h(x0_m); \ + x0_m = __lsx_vpackev_h(x1_m, x0_m); \ + DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x0_m, out7); \ + x2_m = __lsx_vreplvei_h(coeff_m, 6); \ + x3_m = __lsx_vneg_h(x3_m); \ + x2_m = __lsx_vpackev_h(x2_m, x3_m); \ + DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m, out3); \ + } + +#define SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7) \ + { \ + __m128i vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ + \ + DUP4_ARG2(__lsx_vsrli_h, in0, 15, in1, 15, in2, 15, in3, 15, vec0_m, \ + vec1_m, vec2_m, vec3_m); \ + DUP4_ARG2(__lsx_vsrli_h, in4, 15, in5, 15, in6, 15, in7, 15, vec4_m, \ + vec5_m, vec6_m, vec7_m); \ + DUP4_ARG2(__lsx_vavg_h, vec0_m, in0, vec1_m, in1, vec2_m, in2, vec3_m, \ + in3, in0, in1, in2, in3); \ + DUP4_ARG2(__lsx_vavg_h, vec4_m, in4, vec5_m, in5, vec6_m, in6, vec7_m, \ + in7, in4, in5, in6, in7); \ + } + #define FDCT32_POSTPROC_2V_POS_H(vec0, vec1) \ { \ __m128i tp0_m, tp1_m; \ diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 1c88dcdfa8..f17fc3b496 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -573,13 +573,13 @@ () add_proto qw/void vpx_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride"; } else { add_proto qw/void vpx_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vpx_fdct4x4 neon sse2 msa/; + specialize qw/vpx_fdct4x4 neon sse2 msa lsx/; add_proto qw/void vpx_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/vpx_fdct4x4_1 sse2 neon/; add_proto qw/void vpx_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vpx_fdct8x8 sse2 neon msa/, "$ssse3_x86_64"; + specialize qw/vpx_fdct8x8 sse2 neon msa lsx/, "$ssse3_x86_64"; add_proto qw/void vpx_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/vpx_fdct8x8_1 sse2 neon msa/; From a44e61db29e510afe35448a0401c436bcfba3ec5 Mon Sep 17 00:00:00 2001 From: yuanhecai Date: Wed, 13 Apr 2022 16:46:22 +0800 Subject: [PATCH 0368/1000] vp9[loongarch]: Optimize avg_variance64x64/variance8x8 1. vpx_variance8x8_lsx 2. vpx_sub_pixel_avg_variance64x64_lsx Bug: webm:1755 Change-Id: I7d68c7f2f5c8d27dc31cfd32298aeefb68f5d560 --- test/variance_test.cc | 7 +- vpx_dsp/loongarch/sub_pixel_variance_lsx.c | 408 ++++++++++++++++++++- vpx_dsp/loongarch/variance_lsx.c | 38 ++ vpx_dsp/vpx_dsp_rtcd_defs.pl | 4 +- 4 files changed, 451 insertions(+), 6 deletions(-) diff --git a/test/variance_test.cc b/test/variance_test.cc index 11983bb8ac..a11ce25a63 100644 --- a/test/variance_test.cc +++ b/test/variance_test.cc @@ -1655,10 +1655,15 @@ INSTANTIATE_TEST_SUITE_P( LSX, VpxVarianceTest, ::testing::Values(VarianceParams(6, 6, &vpx_variance64x64_lsx), VarianceParams(5, 5, &vpx_variance32x32_lsx), - VarianceParams(4, 4, &vpx_variance16x16_lsx))); + VarianceParams(4, 4, &vpx_variance16x16_lsx), + VarianceParams(3, 3, &vpx_variance8x8_lsx))); INSTANTIATE_TEST_SUITE_P(LSX, VpxSubpelVarianceTest, ::testing::Values(SubpelVarianceParams( 5, 5, &vpx_sub_pixel_variance32x32_lsx, 0))); + +INSTANTIATE_TEST_SUITE_P(LSX, VpxSubpelAvgVarianceTest, + ::testing::Values(SubpelAvgVarianceParams( + 6, 6, &vpx_sub_pixel_avg_variance64x64_lsx, 0))); #endif } // namespace diff --git a/vpx_dsp/loongarch/sub_pixel_variance_lsx.c b/vpx_dsp/loongarch/sub_pixel_variance_lsx.c index 0a0486479a..c7d233af82 100644 --- a/vpx_dsp/loongarch/sub_pixel_variance_lsx.c +++ b/vpx_dsp/loongarch/sub_pixel_variance_lsx.c @@ -54,11 +54,76 @@ static const uint8_t bilinear_filters_lsx[8][2] = { #define VARIANCE_LARGE_WxH(sse, diff, shift) \ (sse) - (((int64_t)(diff) * (diff)) >> (shift)) +static uint32_t avg_sse_diff_64x64_lsx(const uint8_t *src_ptr, + int32_t src_stride, + const uint8_t *ref_ptr, + int32_t ref_stride, + const uint8_t *sec_pred, int32_t *diff) { + int32_t ht_cnt = 32; + uint32_t res; + __m128i src0, src1, src2, src3, ref0, ref1, ref2, ref3; + __m128i pred0, pred1, pred2, pred3, vec, vec_tmp; + __m128i avg0, avg1, avg2, avg3; + __m128i var = __lsx_vldi(0); + + avg0 = var; + avg1 = var; + avg2 = var; + avg3 = var; + + for (; ht_cnt--;) { + DUP4_ARG2(__lsx_vld, sec_pred, 0, sec_pred, 16, sec_pred, 32, sec_pred, 48, + pred0, pred1, pred2, pred3); + sec_pred += 64; + DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48, + src0, src1, src2, src3); + src_ptr += src_stride; + DUP4_ARG2(__lsx_vld, ref_ptr, 0, ref_ptr, 16, ref_ptr, 32, ref_ptr, 48, + ref0, ref1, ref2, ref3); + ref_ptr += ref_stride; + + DUP4_ARG2(__lsx_vavgr_bu, src0, pred0, src1, pred1, src2, pred2, src3, + pred3, src0, src1, src2, src3); + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + CALC_MSE_AVG_B(src2, ref2, var, avg2); + CALC_MSE_AVG_B(src3, ref3, var, avg3); + + DUP4_ARG2(__lsx_vld, sec_pred, 0, sec_pred, 16, sec_pred, 32, sec_pred, 48, + pred0, pred1, pred2, pred3); + sec_pred += 64; + DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48, + src0, src1, src2, src3); + src_ptr += src_stride; + DUP4_ARG2(__lsx_vld, ref_ptr, 0, ref_ptr, 16, ref_ptr, 32, ref_ptr, 48, + ref0, ref1, ref2, ref3); + ref_ptr += ref_stride; + + DUP4_ARG2(__lsx_vavgr_bu, src0, pred0, src1, pred1, src2, pred2, src3, + pred3, src0, src1, src2, src3); + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + CALC_MSE_AVG_B(src2, ref2, var, avg2); + CALC_MSE_AVG_B(src3, ref3, var, avg3); + } + vec = __lsx_vhaddw_w_h(avg0, avg0); + vec_tmp = __lsx_vhaddw_w_h(avg1, avg1); + vec = __lsx_vadd_w(vec, vec_tmp); + vec_tmp = __lsx_vhaddw_w_h(avg2, avg2); + vec = __lsx_vadd_w(vec, vec_tmp); + vec_tmp = __lsx_vhaddw_w_h(avg3, avg3); + vec = __lsx_vadd_w(vec, vec_tmp); + HADD_SW_S32(vec, *diff); + HADD_SW_S32(var, res); + + return res; +} + static uint32_t sub_pixel_sse_diff_16width_h_lsx( const uint8_t *src, int32_t src_stride, const uint8_t *dst, int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { uint32_t loop_cnt = (height >> 2); - int32_t res; + uint32_t res; __m128i src0, src1, src2, src3, src4, src5, src6, src7; __m128i dst0, dst1, dst2, dst3, filt0; __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; @@ -134,7 +199,7 @@ static uint32_t sub_pixel_sse_diff_16width_v_lsx( const uint8_t *src, int32_t src_stride, const uint8_t *dst, int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { uint32_t loop_cnt = (height >> 2); - int32_t res; + uint32_t res; __m128i ref0, ref1, ref2, ref3, src0, src1, src2, src3, src4; __m128i out0, out1, out2, out3, tmp0, tmp1, filt0, vec; __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; @@ -215,7 +280,7 @@ static uint32_t sub_pixel_sse_diff_16width_hv_lsx( int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert, int32_t height, int32_t *diff) { uint32_t loop_cnt = (height >> 2); - int32_t res; + uint32_t res; __m128i src0, src1, src2, src3, src4, src5, src6, src7; __m128i ref0, ref1, ref2, ref3, filt_hz, filt_vt, vec0, vec1; __m128i hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1, vec; @@ -308,7 +373,309 @@ static uint32_t sub_pixel_sse_diff_32width_hv_lsx( return sse; } +static uint32_t subpel_avg_ssediff_16w_h_lsx( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff, int32_t width) { + uint32_t loop_cnt = (height >> 2); + uint32_t res; + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + __m128i dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3; + __m128i pred0, pred1, pred2, pred3, filt0, vec; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + __m128i out0, out1, out2, out3, out4, out5, out6, out7; + __m128i mask = { 0x403030202010100, 0x807070606050504 }; + __m128i avg = __lsx_vldi(0); + __m128i var = avg; + + filt0 = __lsx_vldrepl_h(filter, 0); + + for (; loop_cnt--;) { + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1); + src += src_stride; + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src2, src3); + src += src_stride; + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src4, src5); + src += src_stride; + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src6, src7); + src += src_stride; + + dst0 = __lsx_vld(dst, 0); + dst += dst_stride; + dst1 = __lsx_vld(dst, 0); + dst += dst_stride; + dst2 = __lsx_vld(dst, 0); + dst += dst_stride; + dst3 = __lsx_vld(dst, 0); + dst += dst_stride; + + pred0 = __lsx_vld(sec_pred, 0); + sec_pred += width; + pred1 = __lsx_vld(sec_pred, 0); + sec_pred += width; + pred2 = __lsx_vld(sec_pred, 0); + sec_pred += width; + pred3 = __lsx_vld(sec_pred, 0); + sec_pred += width; + + DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1); + DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3); + DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, vec4, vec5); + DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask, src7, src7, mask, vec6, vec7); + + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, + filt0, out0, out1, out2, out3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7, + filt0, out4, out5, out6, out7); + DUP4_ARG3(__lsx_vssrarni_bu_h, out1, out0, FILTER_BITS, out3, out2, + FILTER_BITS, out5, out4, FILTER_BITS, out7, out6, FILTER_BITS, + tmp0, tmp1, tmp2, tmp3); + DUP4_ARG2(__lsx_vavgr_bu, tmp0, pred0, tmp1, pred1, tmp2, pred2, tmp3, + pred3, tmp0, tmp1, tmp2, tmp3); + + CALC_MSE_AVG_B(tmp0, dst0, var, avg); + CALC_MSE_AVG_B(tmp1, dst1, var, avg); + CALC_MSE_AVG_B(tmp2, dst2, var, avg); + CALC_MSE_AVG_B(tmp3, dst3, var, avg); + } + vec = __lsx_vhaddw_w_h(avg, avg); + HADD_SW_S32(vec, *diff); + HADD_SW_S32(var, res); + + return res; +} + +static uint32_t subpel_avg_ssediff_16w_v_lsx( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff, int32_t width) { + uint32_t loop_cnt = (height >> 2); + uint32_t res; + __m128i ref0, ref1, ref2, ref3, pred0, pred1, pred2, pred3; + __m128i src0, src1, src2, src3, src4, out0, out1, out2, out3; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + __m128i tmp0, tmp1, vec, filt0; + __m128i avg = __lsx_vldi(0); + __m128i var = avg; + + filt0 = __lsx_vldrepl_h(filter, 0); + + src0 = __lsx_vld(src, 0); + src += src_stride; + + for (; loop_cnt--;) { + src1 = __lsx_vld(src, 0); + src += src_stride; + src2 = __lsx_vld(src, 0); + src += src_stride; + src3 = __lsx_vld(src, 0); + src += src_stride; + src4 = __lsx_vld(src, 0); + src += src_stride; + + pred0 = __lsx_vld(sec_pred, 0); + sec_pred += width; + pred1 = __lsx_vld(sec_pred, 0); + sec_pred += width; + pred2 = __lsx_vld(sec_pred, 0); + sec_pred += width; + pred3 = __lsx_vld(sec_pred, 0); + sec_pred += width; + + DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec2); + DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, vec1, vec3); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1); + out0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + + DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, vec4, vec6); + DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, vec5, vec7); + DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1); + out1 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + + DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1); + out2 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + + DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1); + out3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + + src0 = src4; + ref0 = __lsx_vld(dst, 0); + dst += dst_stride; + ref1 = __lsx_vld(dst, 0); + dst += dst_stride; + ref2 = __lsx_vld(dst, 0); + dst += dst_stride; + ref3 = __lsx_vld(dst, 0); + dst += dst_stride; + + DUP4_ARG2(__lsx_vavgr_bu, out0, pred0, out1, pred1, out2, pred2, out3, + pred3, out0, out1, out2, out3); + + CALC_MSE_AVG_B(out0, ref0, var, avg); + CALC_MSE_AVG_B(out1, ref1, var, avg); + CALC_MSE_AVG_B(out2, ref2, var, avg); + CALC_MSE_AVG_B(out3, ref3, var, avg); + } + vec = __lsx_vhaddw_w_h(avg, avg); + HADD_SW_S32(vec, *diff); + HADD_SW_S32(var, res); + return res; +} + +static uint32_t subpel_avg_ssediff_16w_hv_lsx( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz, + const uint8_t *filter_vert, int32_t height, int32_t *diff, int32_t width) { + uint32_t loop_cnt = (height >> 2); + uint32_t res; + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + __m128i ref0, ref1, ref2, ref3, pred0, pred1, pred2, pred3; + __m128i hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1; + __m128i out0, out1, out2, out3, filt_hz, filt_vt, vec, vec0, vec1; + __m128i mask = { 0x403030202010100, 0x807070606050504 }; + __m128i avg = __lsx_vldi(0); + __m128i var = avg; + + filt_hz = __lsx_vldrepl_h(filter_horiz, 0); + filt_vt = __lsx_vldrepl_h(filter_vert, 0); + + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1); + src += src_stride; + + HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS, hz_out0); + HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS, hz_out2); + + for (; loop_cnt--;) { + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1); + src += src_stride; + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src2, src3); + src += src_stride; + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src4, src5); + src += src_stride; + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src6, src7); + src += src_stride; + + pred0 = __lsx_vld(sec_pred, 0); + sec_pred += width; + pred1 = __lsx_vld(sec_pred, 0); + sec_pred += width; + pred2 = __lsx_vld(sec_pred, 0); + sec_pred += width; + pred3 = __lsx_vld(sec_pred, 0); + sec_pred += width; + + HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS, hz_out1); + HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS, hz_out3); + DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); + out0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + + HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS, hz_out0); + HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS, hz_out2); + DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); + out1 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + + HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS, hz_out1); + HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS, hz_out3); + DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); + out2 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + + HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS, hz_out0); + HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS, hz_out2); + DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); + out3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + + ref0 = __lsx_vld(dst, 0); + dst += dst_stride; + ref1 = __lsx_vld(dst, 0); + dst += dst_stride; + ref2 = __lsx_vld(dst, 0); + dst += dst_stride; + ref3 = __lsx_vld(dst, 0); + dst += dst_stride; + + DUP4_ARG2(__lsx_vavgr_bu, out0, pred0, out1, pred1, out2, pred2, out3, + pred3, out0, out1, out2, out3); + + CALC_MSE_AVG_B(out0, ref0, var, avg); + CALC_MSE_AVG_B(out1, ref1, var, avg); + CALC_MSE_AVG_B(out2, ref2, var, avg); + CALC_MSE_AVG_B(out3, ref3, var, avg); + } + vec = __lsx_vhaddw_w_h(avg, avg); + HADD_SW_S32(vec, *diff); + HADD_SW_S32(var, res); + return res; +} + +static uint32_t sub_pixel_avg_sse_diff_64width_h_lsx( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff) { + uint32_t loop_cnt, sse = 0; + int32_t diff0[4]; + + for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) { + sse += + subpel_avg_ssediff_16w_h_lsx(src, src_stride, dst, dst_stride, sec_pred, + filter, height, &diff0[loop_cnt], 64); + src += 16; + dst += 16; + sec_pred += 16; + } + + *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3]; + + return sse; +} + +static uint32_t sub_pixel_avg_sse_diff_64width_v_lsx( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff) { + uint32_t loop_cnt, sse = 0; + int32_t diff0[4]; + + for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) { + sse += + subpel_avg_ssediff_16w_v_lsx(src, src_stride, dst, dst_stride, sec_pred, + filter, height, &diff0[loop_cnt], 64); + src += 16; + dst += 16; + sec_pred += 16; + } + + *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3]; + + return sse; +} + +static uint32_t sub_pixel_avg_sse_diff_64width_hv_lsx( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz, + const uint8_t *filter_vert, int32_t height, int32_t *diff) { + uint32_t loop_cnt, sse = 0; + int32_t diff0[4]; + + for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) { + sse += subpel_avg_ssediff_16w_hv_lsx(src, src_stride, dst, dst_stride, + sec_pred, filter_horiz, filter_vert, + height, &diff0[loop_cnt], 64); + src += 16; + dst += 16; + sec_pred += 16; + } + + *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3]; + + return sse; +} + #define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10) +#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12) #define VPX_SUB_PIXEL_VARIANCE_WDXHT_LSX(wd, ht) \ uint32_t vpx_sub_pixel_variance##wd##x##ht##_lsx( \ @@ -346,3 +713,38 @@ static uint32_t sub_pixel_sse_diff_32width_hv_lsx( } VPX_SUB_PIXEL_VARIANCE_WDXHT_LSX(32, 32) + +#define VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_LSX(ht) \ + uint32_t vpx_sub_pixel_avg_variance64x##ht##_lsx( \ + const uint8_t *src_ptr, int32_t src_stride, int32_t x_offset, \ + int32_t y_offset, const uint8_t *ref_ptr, int32_t ref_stride, \ + uint32_t *sse, const uint8_t *sec_pred) { \ + int32_t diff; \ + const uint8_t *h_filter = bilinear_filters_lsx[x_offset]; \ + const uint8_t *v_filter = bilinear_filters_lsx[y_offset]; \ + \ + if (y_offset) { \ + if (x_offset) { \ + *sse = sub_pixel_avg_sse_diff_64width_hv_lsx( \ + src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, \ + v_filter, ht, &diff); \ + } else { \ + *sse = sub_pixel_avg_sse_diff_64width_v_lsx( \ + src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, v_filter, ht, \ + &diff); \ + } \ + } else { \ + if (x_offset) { \ + *sse = sub_pixel_avg_sse_diff_64width_h_lsx( \ + src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, ht, \ + &diff); \ + } else { \ + *sse = avg_sse_diff_64x##ht##_lsx(src_ptr, src_stride, ref_ptr, \ + ref_stride, sec_pred, &diff); \ + } \ + } \ + \ + return VARIANCE_64Wx##ht##H(*sse, diff); \ + } + +VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_LSX(64) diff --git a/vpx_dsp/loongarch/variance_lsx.c b/vpx_dsp/loongarch/variance_lsx.c index 8f2ec0563f..5223e0f169 100644 --- a/vpx_dsp/loongarch/variance_lsx.c +++ b/vpx_dsp/loongarch/variance_lsx.c @@ -43,6 +43,42 @@ #define VARIANCE_LARGE_WxH(sse, diff, shift) \ (sse) - (((int64_t)(diff) * (diff)) >> (shift)) +static uint32_t sse_diff_8width_lsx(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height, int32_t *diff) { + int32_t ht_cnt = (height >> 2); + __m128i src0, src1, src2, src3, ref0, ref1, ref2, ref3, vec; + __m128i avg = __lsx_vldi(0); + __m128i var = avg; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + int32_t ref_stride2 = ref_stride << 1; + int32_t ref_stride3 = ref_stride2 + ref_stride; + int32_t ref_stride4 = ref_stride2 << 1; + + for (; ht_cnt--;) { + DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr + src_stride, 0, + src_ptr + src_stride2, 0, src_ptr + src_stride3, 0, src0, src1, + src2, src3); + src_ptr += src_stride4; + DUP4_ARG2(__lsx_vld, ref_ptr, 0, ref_ptr + ref_stride, 0, + ref_ptr + ref_stride2, 0, ref_ptr + ref_stride3, 0, ref0, ref1, + ref2, ref3); + ref_ptr += ref_stride4; + + DUP4_ARG2(__lsx_vpickev_d, src1, src0, src3, src2, ref1, ref0, ref3, ref2, + src0, src1, ref0, ref1); + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + } + + vec = __lsx_vhaddw_w_h(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + static uint32_t sse_diff_16width_lsx(const uint8_t *src_ptr, int32_t src_stride, const uint8_t *ref_ptr, int32_t ref_stride, int32_t height, int32_t *diff) { @@ -174,6 +210,7 @@ static uint32_t sse_diff_64x64_lsx(const uint8_t *src_ptr, int32_t src_stride, return HADD_SW_S32(var); } +#define VARIANCE_8Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 6); #define VARIANCE_16Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 8) #define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10) @@ -191,6 +228,7 @@ static uint32_t sse_diff_64x64_lsx(const uint8_t *src_ptr, int32_t src_stride, return VARIANCE_##wd##Wx##ht##H(*sse, diff); \ } +VPX_VARIANCE_WDXHT_LSX(8, 8) VPX_VARIANCE_WDXHT_LSX(16, 16) VPX_VARIANCE_WDXHT_LSX(32, 32) diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index f17fc3b496..706af97e50 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -1108,7 +1108,7 @@ () specialize qw/vpx_variance8x16 sse2 neon msa mmi vsx/; add_proto qw/unsigned int vpx_variance8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance8x8 sse2 neon msa mmi vsx/; + specialize qw/vpx_variance8x8 sse2 neon msa mmi vsx lsx/; add_proto qw/unsigned int vpx_variance8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_variance8x4 sse2 neon msa mmi vsx/; @@ -1192,7 +1192,7 @@ () specialize qw/vpx_sub_pixel_variance4x4 neon msa mmi sse2 ssse3/; add_proto qw/uint32_t vpx_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_sub_pixel_avg_variance64x64 neon avx2 msa mmi sse2 ssse3/; + specialize qw/vpx_sub_pixel_avg_variance64x64 neon avx2 msa mmi sse2 ssse3 lsx/; add_proto qw/uint32_t vpx_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_sub_pixel_avg_variance64x32 neon msa mmi sse2 ssse3/; From ca89bed50dbc5fe2abef50c5f36924bb1da6d1f6 Mon Sep 17 00:00:00 2001 From: Marco Paniconi Date: Sun, 15 May 2022 22:15:29 -0700 Subject: [PATCH 0369/1000] vp9-rtc: Fix to usage of active_maps when aq_mode=0 If aq_mode=0 the segmentation feature may still be used for active_maps, so the condition active_maps.enabled needs to be added in two places regarding segmentation logic in encodeframe.c. Otherwise the active_maps would have no effect. This also resolves why the assert in bug webm:1762 was not triggered when aq_mode=0. Change-Id: Ibd68e9b5c3f81728241a168d3fb3567d6845633d --- vp9/encoder/vp9_encodeframe.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index 9da8f61e32..5f08fa6f60 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -217,8 +217,8 @@ static void set_segment_index(VP9_COMP *cpi, MACROBLOCK *const x, int mi_row, break; } - // Set segment index from ROI map if it's enabled. - if (cpi->roi.enabled) + // Set segment index if ROI map or active_map is enabled. + if (cpi->roi.enabled || cpi->active_map.enabled) mi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col); vp9_init_plane_quantizers(cpi, x); @@ -2499,7 +2499,8 @@ static void update_state_rt(VP9_COMP *cpi, ThreadData *td, *(xd->mi[0]) = ctx->mic; *(x->mbmi_ext) = ctx->mbmi_ext; - if (seg->enabled && (cpi->oxcf.aq_mode != NO_AQ || cpi->roi.enabled)) { + if (seg->enabled && (cpi->oxcf.aq_mode != NO_AQ || cpi->roi.enabled || + cpi->active_map.enabled)) { // Setting segmentation map for cyclic_refresh. if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cpi->cyclic_refresh->content_mode) { From 8486953e5e0de3cec0332b787aa05a7405e3c207 Mon Sep 17 00:00:00 2001 From: Hao Chen Date: Fri, 18 Mar 2022 09:33:53 +0800 Subject: [PATCH 0370/1000] vp8[loongarch]: Optimize vp8 encoding partial function 1. vp8_short_fdct4x4 2. vp8_regular_quantize_b 3. vp8_block_error 4. vp8_mbblock_error 5. vpx_subtract_block Bug: webm:1755 Change-Id: I3dbfc7e3937af74090fc53fb4c9664e6cdda29ef --- test/quantize_test.cc | 7 + test/vp8_fdct4x4_test.cc | 5 + test/vp9_subtract_test.cc | 5 + vp8/common/rtcd_defs.pl | 8 +- vp8/encoder/loongarch/dct_lsx.c | 99 +++++++ vp8/encoder/loongarch/encodeopt_lsx.c | 82 ++++++ vp8/encoder/loongarch/quantize_lsx.c | 145 ++++++++++ vp8/vp8cx.mk | 5 + vpx_dsp/loongarch/subtract_lsx.c | 371 ++++++++++++++++++++++++++ vpx_dsp/vpx_dsp.mk | 2 + vpx_dsp/vpx_dsp_rtcd_defs.pl | 2 +- 11 files changed, 726 insertions(+), 5 deletions(-) create mode 100644 vp8/encoder/loongarch/dct_lsx.c create mode 100644 vp8/encoder/loongarch/encodeopt_lsx.c create mode 100644 vp8/encoder/loongarch/quantize_lsx.c create mode 100644 vpx_dsp/loongarch/subtract_lsx.c diff --git a/test/quantize_test.cc b/test/quantize_test.cc index 792b21432e..57309e8102 100644 --- a/test/quantize_test.cc +++ b/test/quantize_test.cc @@ -224,4 +224,11 @@ INSTANTIATE_TEST_SUITE_P( make_tuple(&vp8_fast_quantize_b_mmi, &vp8_fast_quantize_b_c), make_tuple(&vp8_regular_quantize_b_mmi, &vp8_regular_quantize_b_c))); #endif // HAVE_MMI + +#if HAVE_LSX +INSTANTIATE_TEST_SUITE_P( + LSX, QuantizeTest, + ::testing::Values(make_tuple(&vp8_regular_quantize_b_lsx, + &vp8_regular_quantize_b_c))); +#endif // HAVE_LSX } // namespace diff --git a/test/vp8_fdct4x4_test.cc b/test/vp8_fdct4x4_test.cc index 3e4305be73..1b73a72a01 100644 --- a/test/vp8_fdct4x4_test.cc +++ b/test/vp8_fdct4x4_test.cc @@ -203,4 +203,9 @@ INSTANTIATE_TEST_SUITE_P(MSA, FdctTest, INSTANTIATE_TEST_SUITE_P(MMI, FdctTest, ::testing::Values(vp8_short_fdct4x4_mmi)); #endif // HAVE_MMI + +#if HAVE_LSX +INSTANTIATE_TEST_SUITE_P(LSX, FdctTest, + ::testing::Values(vp8_short_fdct4x4_lsx)); +#endif // HAVE_LSX } // namespace diff --git a/test/vp9_subtract_test.cc b/test/vp9_subtract_test.cc index ef8cc207d6..211cc6c7ad 100644 --- a/test/vp9_subtract_test.cc +++ b/test/vp9_subtract_test.cc @@ -152,4 +152,9 @@ INSTANTIATE_TEST_SUITE_P(VSX, VP9SubtractBlockTest, ::testing::Values(vpx_subtract_block_vsx)); #endif +#if HAVE_LSX +INSTANTIATE_TEST_SUITE_P(LSX, VP9SubtractBlockTest, + ::testing::Values(vpx_subtract_block_lsx)); +#endif + } // namespace vp9 diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl index e4b40fa9ed..4f45d2ab9a 100644 --- a/vp8/common/rtcd_defs.pl +++ b/vp8/common/rtcd_defs.pl @@ -184,7 +184,7 @@ () # Forward DCT # add_proto qw/void vp8_short_fdct4x4/, "short *input, short *output, int pitch"; -specialize qw/vp8_short_fdct4x4 sse2 neon msa mmi/; +specialize qw/vp8_short_fdct4x4 sse2 neon msa mmi lsx/; add_proto qw/void vp8_short_fdct8x4/, "short *input, short *output, int pitch"; specialize qw/vp8_short_fdct8x4 sse2 neon msa mmi/; @@ -196,7 +196,7 @@ () # Quantizer # add_proto qw/void vp8_regular_quantize_b/, "struct block *, struct blockd *"; -specialize qw/vp8_regular_quantize_b sse2 sse4_1 msa mmi/; +specialize qw/vp8_regular_quantize_b sse2 sse4_1 msa mmi lsx/; add_proto qw/void vp8_fast_quantize_b/, "struct block *, struct blockd *"; specialize qw/vp8_fast_quantize_b sse2 ssse3 neon msa mmi/; @@ -205,10 +205,10 @@ () # Block subtraction # add_proto qw/int vp8_block_error/, "short *coeff, short *dqcoeff"; -specialize qw/vp8_block_error sse2 msa/; +specialize qw/vp8_block_error sse2 msa lsx/; add_proto qw/int vp8_mbblock_error/, "struct macroblock *mb, int dc"; -specialize qw/vp8_mbblock_error sse2 msa/; +specialize qw/vp8_mbblock_error sse2 msa lsx/; add_proto qw/int vp8_mbuverror/, "struct macroblock *mb"; specialize qw/vp8_mbuverror sse2 msa/; diff --git a/vp8/encoder/loongarch/dct_lsx.c b/vp8/encoder/loongarch/dct_lsx.c new file mode 100644 index 0000000000..e090d2360f --- /dev/null +++ b/vp8/encoder/loongarch/dct_lsx.c @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include "./vp8_rtcd.h" +#include "vpx_util/loongson_intrinsics.h" + +#define LSX_TRANSPOSE4x4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \ + { \ + __m128i _s0, _s1, _s2, _s3, _t0, _t1, _t2, _t3; \ + \ + DUP2_ARG2(__lsx_vilvl_h, _in2, _in0, _in3, _in1, _s0, _s1); \ + DUP2_ARG2(__lsx_vilvh_h, _in2, _in0, _in3, _in1, _s2, _s3); \ + _t0 = __lsx_vilvl_h(_s1, _s0); \ + _t1 = __lsx_vilvh_h(_s1, _s0); \ + _t2 = __lsx_vilvl_h(_s3, _s2); \ + _t3 = __lsx_vilvh_h(_s3, _s2); \ + DUP2_ARG2(__lsx_vpickev_d, _t2, _t0, _t3, _t1, _out0, _out2); \ + DUP2_ARG2(__lsx_vpickod_d, _t2, _t0, _t3, _t1, _out1, _out3); \ + } + +#define SET_DOTP_VALUES(coeff, val0, val1, val2, const1, const2) \ + { \ + __m128i tmp0_m, tmp1_m, tmp2_m; \ + \ + tmp0_m = __lsx_vreplvei_h(coeff, val0); \ + DUP2_ARG2(__lsx_vreplvei_h, coeff, val1, coeff, val2, tmp1_m, tmp2_m); \ + DUP2_ARG2(__lsx_vpackev_h, tmp1_m, tmp0_m, tmp0_m, tmp2_m, const1, \ + const2); \ + } + +#define RET_1_IF_NZERO_H(_in) \ + ({ \ + __m128i tmp_m; \ + __m128i one_m = __lsx_vldi(0x401); \ + __m128i max_m = __lsx_vldi(0xFF); \ + \ + tmp_m = __lsx_vseqi_h(_in, 0); \ + tmp_m = __lsx_vxor_v(tmp_m, max_m); \ + tmp_m = __lsx_vand_v(tmp_m, one_m); \ + \ + tmp_m; \ + }) + +void vp8_short_fdct4x4_lsx(int16_t *input, int16_t *output, int32_t pitch) { + __m128i in0, in1, in2, in3; + __m128i tmp0, tmp1, tmp2, tmp3, const0, const1; + __m128i coeff = { 0x38a4eb1814e808a9, 0x659061a82ee01d4c }; + __m128i out0, out1, out2, out3; + __m128i zero = __lsx_vldi(0); + int32_t pitch2 = pitch << 1; + int32_t pitch3 = pitch2 + pitch; + + in0 = __lsx_vld(input, 0); + DUP2_ARG2(__lsx_vldx, input, pitch, input, pitch2, in1, in2); + in3 = __lsx_vldx(input, pitch3); + + LSX_TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3); + LSX_BUTTERFLY_4_H(in0, in1, in2, in3, tmp0, tmp1, in1, in3); + DUP4_ARG2(__lsx_vslli_h, tmp0, 3, tmp1, 3, in1, 3, in3, 3, tmp0, tmp1, in1, + in3); + in0 = __lsx_vadd_h(tmp0, tmp1); + in2 = __lsx_vsub_h(tmp0, tmp1); + SET_DOTP_VALUES(coeff, 0, 1, 2, const0, const1); + tmp0 = __lsx_vilvl_h(in3, in1); + in1 = __lsx_vreplvei_h(coeff, 3); + out0 = __lsx_vpackev_h(zero, in1); + coeff = __lsx_vilvl_h(zero, coeff); + out1 = __lsx_vreplvei_w(coeff, 0); + DUP2_ARG3(__lsx_vdp2add_w_h, out0, tmp0, const0, out1, tmp0, const1, out0, + out1); + DUP2_ARG3(__lsx_vsrani_h_w, out0, out0, 12, out1, out1, 12, in1, in3); + LSX_TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3); + LSX_BUTTERFLY_4_H(in0, in1, in2, in3, tmp0, tmp1, in1, in3); + tmp2 = __lsx_vadd_h(tmp0, tmp1); + tmp3 = __lsx_vsub_h(tmp0, tmp1); + DUP2_ARG2(__lsx_vaddi_hu, tmp2, 7, tmp3, 7, in0, in2); + DUP2_ARG2(__lsx_vsrai_h, in0, 4, in2, 4, in0, in2); + DUP2_ARG2(__lsx_vilvl_h, zero, in0, zero, in2, out0, out2); + tmp1 = RET_1_IF_NZERO_H(in3); + DUP2_ARG2(__lsx_vilvl_h, zero, tmp1, in3, in1, tmp1, tmp0); + DUP2_ARG2(__lsx_vreplvei_w, coeff, 2, coeff, 3, out3, out1); + out3 = __lsx_vadd_w(out3, out1); + out1 = __lsx_vreplvei_w(coeff, 1); + DUP2_ARG3(__lsx_vdp2add_w_h, out1, tmp0, const0, out3, tmp0, const1, out1, + out3); + DUP2_ARG2(__lsx_vsrai_w, out1, 16, out3, 16, out1, out3); + out1 = __lsx_vadd_w(out1, tmp1); + DUP2_ARG2(__lsx_vpickev_h, out1, out0, out3, out2, in0, in2); + __lsx_vst(in0, output, 0); + __lsx_vst(in2, output, 16); +} diff --git a/vp8/encoder/loongarch/encodeopt_lsx.c b/vp8/encoder/loongarch/encodeopt_lsx.c new file mode 100644 index 0000000000..4ad4caba60 --- /dev/null +++ b/vp8/encoder/loongarch/encodeopt_lsx.c @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp8_rtcd.h" +#include "vpx_util/loongson_intrinsics.h" +#include "vp8/encoder/block.h" + +int32_t vp8_block_error_lsx(int16_t *coeff_ptr, int16_t *dq_coeff_ptr) { + int32_t err = 0; + __m128i dq_coeff0, dq_coeff1, coeff0, coeff1; + __m128i reg0, reg1, reg2, reg3, error; + + DUP4_ARG2(__lsx_vld, coeff_ptr, 0, coeff_ptr, 16, dq_coeff_ptr, 0, + dq_coeff_ptr, 16, coeff0, coeff1, dq_coeff0, dq_coeff1); + DUP2_ARG2(__lsx_vsubwev_w_h, coeff0, dq_coeff0, coeff1, dq_coeff1, reg0, + reg2); + DUP2_ARG2(__lsx_vsubwod_w_h, coeff0, dq_coeff0, coeff1, dq_coeff1, reg1, + reg3); + error = __lsx_vmul_w(reg0, reg0); + DUP2_ARG3(__lsx_vmadd_w, error, reg1, reg1, error, reg2, reg2, error, error); + error = __lsx_vmadd_w(error, reg3, reg3); + error = __lsx_vhaddw_d_w(error, error); + err = __lsx_vpickve2gr_w(error, 0); + err += __lsx_vpickve2gr_w(error, 2); + return err; +} + +int32_t vp8_mbblock_error_lsx(MACROBLOCK *mb, int32_t dc) { + BLOCK *be; + BLOCKD *bd; + int16_t *coeff, *dq_coeff; + int32_t err = 0; + uint32_t loop_cnt; + __m128i src0, src1, src2, src3; + __m128i tmp0, tmp1, tmp2, tmp3; + __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, error; + __m128i mask0 = __lsx_vldi(0xFF); + __m128i zero = __lsx_vldi(0); + + if (dc == 1) { + mask0 = __lsx_vinsgr2vr_w(mask0, 0, 0); + } + + for (loop_cnt = 0; loop_cnt < 8; loop_cnt++) { + int32_t loop_tmp = loop_cnt << 1; + be = &mb->block[loop_tmp]; + bd = &mb->e_mbd.block[loop_tmp]; + coeff = be->coeff; + dq_coeff = bd->dqcoeff; + DUP4_ARG2(__lsx_vld, coeff, 0, coeff, 16, dq_coeff, 0, dq_coeff, 16, src0, + src1, tmp0, tmp1); + be = &mb->block[loop_tmp + 1]; + bd = &mb->e_mbd.block[loop_tmp + 1]; + coeff = be->coeff; + dq_coeff = bd->dqcoeff; + DUP4_ARG2(__lsx_vld, coeff, 0, coeff, 16, dq_coeff, 0, dq_coeff, 16, src2, + src3, tmp2, tmp3); + DUP4_ARG2(__lsx_vsubwev_w_h, src0, tmp0, src1, tmp1, src2, tmp2, src3, tmp3, + reg0, reg2, reg4, reg6); + DUP4_ARG2(__lsx_vsubwod_w_h, src0, tmp0, src1, tmp1, src2, tmp2, src3, tmp3, + reg1, reg3, reg5, reg7); + DUP2_ARG3(__lsx_vbitsel_v, zero, reg0, mask0, zero, reg4, mask0, reg0, + reg4); + error = __lsx_vmul_w(reg0, reg0); + DUP4_ARG3(__lsx_vmadd_w, error, reg1, reg1, error, reg2, reg2, error, reg3, + reg3, error, reg4, reg4, error, error, error, error); + DUP2_ARG3(__lsx_vmadd_w, error, reg5, reg5, error, reg6, reg6, error, + error); + error = __lsx_vmadd_w(error, reg7, reg7); + error = __lsx_vhaddw_d_w(error, error); + error = __lsx_vhaddw_q_d(error, error); + err += __lsx_vpickve2gr_w(error, 0); + } + return err; +} diff --git a/vp8/encoder/loongarch/quantize_lsx.c b/vp8/encoder/loongarch/quantize_lsx.c new file mode 100644 index 0000000000..75889192a7 --- /dev/null +++ b/vp8/encoder/loongarch/quantize_lsx.c @@ -0,0 +1,145 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include "./vp8_rtcd.h" +#include "vpx_util/loongson_intrinsics.h" +#include "vp8/encoder/block.h" + +#define BOOST_QUANT1(_in0, _in1, _in2, _ui) \ + { \ + if (boost_temp[0] <= __lsx_vpickve2gr_h(_in0, _ui)) { \ + if (__lsx_vpickve2gr_h(_in1, _ui)) { \ + eob = _ui; \ + boost_temp = zbin_boost; \ + } else { \ + boost_temp++; \ + } \ + } else { \ + _in2 = __lsx_vinsgr2vr_h(_in2, 0, _ui); \ + boost_temp++; \ + } \ + } + +#define BOOST_QUANT2(_in0, _in1, _in2, _ui) \ + { \ + if (boost_temp[0] <= __lsx_vpickve2gr_h(_in0, _ui)) { \ + if (__lsx_vpickve2gr_h(_in1, _ui)) { \ + eob = _ui + 8; \ + boost_temp = zbin_boost; \ + } else { \ + boost_temp++; \ + } \ + } else { \ + _in2 = __lsx_vinsgr2vr_h(_in2, 0, _ui); \ + boost_temp++; \ + } \ + } + +static int8_t exact_regular_quantize_b_lsx( + int16_t *zbin_boost, int16_t *coeff_ptr, int16_t *zbin, int16_t *round, + int16_t *quant, int16_t *quant_shift, int16_t *de_quant, int16_t zbin_oq_in, + int16_t *q_coeff, int16_t *dq_coeff) { + int32_t eob; + int16_t *boost_temp = zbin_boost; + __m128i inv_zig_zag = { 0x0C07040206050100, 0x0F0E0A090D0B0803 }; + __m128i sign_z0, sign_z1, q_coeff0, q_coeff1; + __m128i z_bin0, z_bin1, zbin_o_q, x0, x1, sign_x0, sign_x1, de_quant0, + de_quant1; + __m128i z0, z1, round0, round1, quant0, quant2; + __m128i inv_zig_zag0, inv_zig_zag1; + __m128i zigzag_mask0 = { 0x0008000400010000, 0x0006000300020005 }; + __m128i zigzag_mask1 = { 0x000A000D000C0009, 0X000F000E000B0007 }; + __m128i tmp0, tmp1, tmp2, tmp3; + __m128i zero = __lsx_vldi(0); + + zbin_o_q = __lsx_vreplgr2vr_h(zbin_oq_in); + inv_zig_zag0 = __lsx_vilvl_b(zero, inv_zig_zag); + inv_zig_zag1 = __lsx_vilvh_b(zero, inv_zig_zag); + eob = -1; + DUP4_ARG2(__lsx_vld, coeff_ptr, 0, coeff_ptr, 16, round, 0, round, 16, tmp0, + tmp1, tmp2, tmp3); + DUP4_ARG3(__lsx_vshuf_h, zigzag_mask0, tmp1, tmp0, zigzag_mask1, tmp1, tmp0, + zigzag_mask0, tmp3, tmp2, zigzag_mask1, tmp3, tmp2, z0, z1, round0, + round1); + DUP4_ARG2(__lsx_vld, quant, 0, quant, 16, zbin, 0, zbin, 16, tmp0, tmp1, tmp2, + tmp3); + DUP4_ARG3(__lsx_vshuf_h, zigzag_mask0, tmp1, tmp0, zigzag_mask1, tmp1, tmp0, + zigzag_mask0, tmp3, tmp2, zigzag_mask1, tmp3, tmp2, quant0, quant2, + z_bin0, z_bin1); + DUP2_ARG2(__lsx_vsrai_h, z0, 15, z1, 15, sign_z0, sign_z1); + DUP2_ARG2(__lsx_vadda_h, z0, zero, z1, zero, x0, x1); + DUP2_ARG2(__lsx_vsub_h, x0, z_bin0, x1, z_bin1, z_bin0, z_bin1); + DUP2_ARG2(__lsx_vsub_h, z_bin0, zbin_o_q, z_bin1, zbin_o_q, z_bin0, z_bin1); + DUP2_ARG2(__lsx_vmulwev_w_h, quant0, round0, quant2, round1, tmp0, tmp2); + DUP2_ARG2(__lsx_vmulwod_w_h, quant0, round0, quant2, round1, tmp1, tmp3); + DUP2_ARG3(__lsx_vmaddwev_w_h, tmp0, quant0, x0, tmp2, quant2, x1, tmp0, tmp2); + DUP2_ARG3(__lsx_vmaddwod_w_h, tmp1, quant0, x0, tmp3, quant2, x1, tmp1, tmp3); + DUP2_ARG2(__lsx_vpackod_h, tmp1, tmp0, tmp3, tmp2, q_coeff0, q_coeff1); + + DUP2_ARG2(__lsx_vld, quant_shift, 0, quant_shift, 16, tmp1, tmp3); + DUP2_ARG3(__lsx_vshuf_h, zigzag_mask0, tmp3, tmp1, zigzag_mask1, tmp3, tmp1, + quant0, quant2); + DUP2_ARG2(__lsx_vadd_h, x0, round0, x1, round1, x0, x1); + DUP2_ARG2(__lsx_vmulwev_w_h, quant0, q_coeff0, quant2, q_coeff1, tmp0, tmp2); + DUP2_ARG2(__lsx_vmulwod_w_h, quant0, q_coeff0, quant2, q_coeff1, tmp1, tmp3); + DUP2_ARG3(__lsx_vmaddwev_w_h, tmp0, quant0, x0, tmp2, quant2, x1, tmp0, tmp2); + DUP2_ARG3(__lsx_vmaddwod_w_h, tmp1, quant0, x0, tmp3, quant2, x1, tmp1, tmp3); + DUP2_ARG2(__lsx_vpackod_h, tmp1, tmp0, tmp3, tmp2, x0, x1); + DUP2_ARG2(__lsx_vxor_v, x0, sign_z0, x1, sign_z1, sign_x0, sign_x1); + DUP2_ARG2(__lsx_vsub_h, sign_x0, sign_z0, sign_x1, sign_z1, sign_x0, sign_x1); + + BOOST_QUANT1(z_bin0, x0, sign_x0, 0); + BOOST_QUANT1(z_bin0, x0, sign_x0, 1); + BOOST_QUANT1(z_bin0, x0, sign_x0, 2); + BOOST_QUANT1(z_bin0, x0, sign_x0, 3); + BOOST_QUANT1(z_bin0, x0, sign_x0, 4); + BOOST_QUANT1(z_bin0, x0, sign_x0, 5); + BOOST_QUANT1(z_bin0, x0, sign_x0, 6); + BOOST_QUANT1(z_bin0, x0, sign_x0, 7); + + BOOST_QUANT2(z_bin1, x1, sign_x1, 0); + BOOST_QUANT2(z_bin1, x1, sign_x1, 1); + BOOST_QUANT2(z_bin1, x1, sign_x1, 2); + BOOST_QUANT2(z_bin1, x1, sign_x1, 3); + BOOST_QUANT2(z_bin1, x1, sign_x1, 4); + BOOST_QUANT2(z_bin1, x1, sign_x1, 5); + BOOST_QUANT2(z_bin1, x1, sign_x1, 6); + BOOST_QUANT2(z_bin1, x1, sign_x1, 7); + + DUP2_ARG2(__lsx_vld, de_quant, 0, de_quant, 16, de_quant0, de_quant1); + DUP2_ARG3(__lsx_vshuf_h, inv_zig_zag0, sign_x1, sign_x0, inv_zig_zag1, + sign_x1, sign_x0, q_coeff0, q_coeff1); + DUP2_ARG2(__lsx_vmul_h, de_quant0, q_coeff0, de_quant1, q_coeff1, de_quant0, + de_quant1); + __lsx_vst(q_coeff0, q_coeff, 0); + __lsx_vst(q_coeff1, q_coeff, 16); + __lsx_vst(de_quant0, dq_coeff, 0); + __lsx_vst(de_quant1, dq_coeff, 16); + + return (int8_t)(eob + 1); +} + +void vp8_regular_quantize_b_lsx(BLOCK *b, BLOCKD *d) { + int16_t *zbin_boost_ptr = b->zrun_zbin_boost; + int16_t *coeff_ptr = b->coeff; + int16_t *zbin_ptr = b->zbin; + int16_t *round_ptr = b->round; + int16_t *quant_ptr = b->quant; + int16_t *quant_shift_ptr = b->quant_shift; + int16_t *qcoeff_ptr = d->qcoeff; + int16_t *dqcoeff_ptr = d->dqcoeff; + int16_t *dequant_ptr = d->dequant; + int16_t zbin_oq_value = b->zbin_extra; + + *d->eob = exact_regular_quantize_b_lsx( + zbin_boost_ptr, coeff_ptr, zbin_ptr, round_ptr, quant_ptr, + quant_shift_ptr, dequant_ptr, zbin_oq_value, qcoeff_ptr, dqcoeff_ptr); +} diff --git a/vp8/vp8cx.mk b/vp8/vp8cx.mk index 3a8f8ea45a..5744cbabcc 100644 --- a/vp8/vp8cx.mk +++ b/vp8/vp8cx.mk @@ -124,4 +124,9 @@ ifeq ($(CONFIG_REALTIME_ONLY),yes) VP8_CX_SRCS_REMOVE-$(HAVE_MSA) += encoder/mips/msa/temporal_filter_msa.c endif +# common (loongarch LSX intrinsics) +VP8_CX_SRCS-$(HAVE_LSX) += encoder/loongarch/quantize_lsx.c +VP8_CX_SRCS-$(HAVE_LSX) += encoder/loongarch/dct_lsx.c +VP8_CX_SRCS-$(HAVE_LSX) += encoder/loongarch/encodeopt_lsx.c + VP8_CX_SRCS-yes := $(filter-out $(VP8_CX_SRCS_REMOVE-yes),$(VP8_CX_SRCS-yes)) diff --git a/vpx_dsp/loongarch/subtract_lsx.c b/vpx_dsp/loongarch/subtract_lsx.c new file mode 100644 index 0000000000..943a5c5a9b --- /dev/null +++ b/vpx_dsp/loongarch/subtract_lsx.c @@ -0,0 +1,371 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_util/loongson_intrinsics.h" + +static void sub_blk_4x4_lsx(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *pred_ptr, int32_t pred_stride, + int16_t *diff_ptr, int32_t diff_stride) { + __m128i src0, src1, src2, src3; + __m128i pred0, pred1, pred2, pred3; + __m128i diff0, diff1; + __m128i reg0, reg1; + int32_t src_stride2 = src_stride << 1; + int32_t pred_stride2 = pred_stride << 1; + int32_t diff_stride2 = diff_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t pred_stride3 = pred_stride2 + pred_stride; + int32_t diff_stride3 = diff_stride2 + diff_stride; + + DUP4_ARG2(__lsx_vldrepl_w, src_ptr, 0, src_ptr + src_stride, 0, + src_ptr + src_stride2, 0, src_ptr + src_stride3, 0, src0, src1, + src2, src3); + DUP4_ARG2(__lsx_vldrepl_w, pred_ptr, 0, pred_ptr + pred_stride, 0, + pred_ptr + pred_stride2, 0, pred_ptr + pred_stride3, 0, pred0, + pred1, pred2, pred3); + DUP4_ARG2(__lsx_vilvl_w, src1, src0, src3, src2, pred1, pred0, pred3, pred2, + src0, src2, pred0, pred2); + DUP2_ARG2(__lsx_vilvl_d, src2, src0, pred2, pred0, src0, pred0); + reg0 = __lsx_vilvl_b(src0, pred0); + reg1 = __lsx_vilvh_b(src0, pred0); + DUP2_ARG2(__lsx_vhsubw_hu_bu, reg0, reg0, reg1, reg1, diff0, diff1); + __lsx_vstelm_d(diff0, diff_ptr, 0, 0); + __lsx_vstelm_d(diff0, diff_ptr + diff_stride, 0, 1); + __lsx_vstelm_d(diff1, diff_ptr + diff_stride2, 0, 0); + __lsx_vstelm_d(diff1, diff_ptr + diff_stride3, 0, 1); +} + +static void sub_blk_8x8_lsx(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *pred_ptr, int32_t pred_stride, + int16_t *diff_ptr, int32_t diff_stride) { + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + __m128i pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7; + __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + int32_t src_stride2 = src_stride << 1; + int32_t pred_stride2 = pred_stride << 1; + int32_t dst_stride = diff_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t pred_stride3 = pred_stride2 + pred_stride; + int32_t dst_stride2 = dst_stride << 1; + int32_t src_stride4 = src_stride2 << 1; + int32_t pred_stride4 = pred_stride2 << 1; + int32_t dst_stride3 = dst_stride + dst_stride2; + + DUP4_ARG2(__lsx_vldrepl_d, src_ptr, 0, src_ptr + src_stride, 0, + src_ptr + src_stride2, 0, src_ptr + src_stride3, 0, src0, src1, + src2, src3); + DUP4_ARG2(__lsx_vldrepl_d, pred_ptr, 0, pred_ptr + pred_stride, 0, + pred_ptr + pred_stride2, 0, pred_ptr + pred_stride3, 0, pred0, + pred1, pred2, pred3); + src_ptr += src_stride4; + pred_ptr += pred_stride4; + + DUP4_ARG2(__lsx_vldrepl_d, src_ptr, 0, src_ptr + src_stride, 0, + src_ptr + src_stride2, 0, src_ptr + src_stride3, 0, src4, src5, + src6, src7); + DUP4_ARG2(__lsx_vldrepl_d, pred_ptr, 0, pred_ptr + pred_stride, 0, + pred_ptr + pred_stride2, 0, pred_ptr + pred_stride3, 0, pred4, + pred5, pred6, pred7); + + DUP4_ARG2(__lsx_vilvl_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3, + reg0, reg1, reg2, reg3); + DUP4_ARG2(__lsx_vilvl_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7, + reg4, reg5, reg6, reg7); + DUP4_ARG2(__lsx_vhsubw_hu_bu, reg0, reg0, reg1, reg1, reg2, reg2, reg3, reg3, + src0, src1, src2, src3); + DUP4_ARG2(__lsx_vhsubw_hu_bu, reg4, reg4, reg5, reg5, reg6, reg6, reg7, reg7, + src4, src5, src6, src7); + __lsx_vst(src0, diff_ptr, 0); + __lsx_vstx(src1, diff_ptr, dst_stride); + __lsx_vstx(src2, diff_ptr, dst_stride2); + __lsx_vstx(src3, diff_ptr, dst_stride3); + diff_ptr += dst_stride2; + __lsx_vst(src4, diff_ptr, 0); + __lsx_vstx(src5, diff_ptr, dst_stride); + __lsx_vstx(src6, diff_ptr, dst_stride2); + __lsx_vstx(src7, diff_ptr, dst_stride3); +} + +static void sub_blk_16x16_lsx(const uint8_t *src, int32_t src_stride, + const uint8_t *pred, int32_t pred_stride, + int16_t *diff, int32_t diff_stride) { + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + __m128i pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7; + __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + int32_t src_stride2 = src_stride << 1; + int32_t pred_stride2 = pred_stride << 1; + int32_t dst_stride = diff_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t pred_stride3 = pred_stride2 + pred_stride; + int32_t dst_stride2 = dst_stride << 1; + int32_t src_stride4 = src_stride2 << 1; + int32_t pred_stride4 = pred_stride2 << 1; + int32_t dst_stride3 = dst_stride + dst_stride2; + int16_t *diff_tmp = diff + 8; + + DUP2_ARG2(__lsx_vld, src, 0, pred, 0, src0, pred0); + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src1, src2, src3, src4); + DUP4_ARG2(__lsx_vldx, pred, pred_stride, pred, pred_stride2, pred, + pred_stride3, pred, pred_stride4, pred1, pred2, pred3, pred4); + src += src_stride4; + pred += pred_stride4; + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + pred, pred_stride, src5, src6, src7, pred5); + DUP2_ARG2(__lsx_vldx, pred, pred_stride2, pred, pred_stride3, pred6, pred7); + src += src_stride4; + pred += pred_stride4; + DUP4_ARG2(__lsx_vilvl_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3, + reg0, reg2, reg4, reg6); + DUP4_ARG2(__lsx_vilvh_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3, + reg1, reg3, reg5, reg7); + DUP4_ARG2(__lsx_vilvl_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7, + tmp0, tmp2, tmp4, tmp6); + DUP4_ARG2(__lsx_vilvh_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7, + tmp1, tmp3, tmp5, tmp7); + DUP4_ARG2(__lsx_vhsubw_hu_bu, reg0, reg0, reg1, reg1, reg2, reg2, reg3, reg3, + src0, src1, src2, src3); + DUP4_ARG2(__lsx_vhsubw_hu_bu, reg4, reg4, reg5, reg5, reg6, reg6, reg7, reg7, + src4, src5, src6, src7); + DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3, tmp3, + pred0, pred1, pred2, pred3); + DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp4, tmp4, tmp5, tmp5, tmp6, tmp6, tmp7, tmp7, + pred4, pred5, pred6, pred7); + __lsx_vst(src0, diff, 0); + __lsx_vstx(src2, diff, dst_stride); + __lsx_vstx(src4, diff, dst_stride2); + __lsx_vstx(src6, diff, dst_stride3); + __lsx_vst(src1, diff_tmp, 0); + __lsx_vstx(src3, diff_tmp, dst_stride); + __lsx_vstx(src5, diff_tmp, dst_stride2); + __lsx_vstx(src7, diff_tmp, dst_stride3); + diff += dst_stride2; + diff_tmp += dst_stride2; + __lsx_vst(pred0, diff, 0); + __lsx_vstx(pred2, diff, dst_stride); + __lsx_vstx(pred4, diff, dst_stride2); + __lsx_vstx(pred6, diff, dst_stride3); + __lsx_vst(pred1, diff_tmp, 0); + __lsx_vstx(pred3, diff_tmp, dst_stride); + __lsx_vstx(pred5, diff_tmp, dst_stride2); + __lsx_vstx(pred7, diff_tmp, dst_stride3); + diff += dst_stride2; + diff_tmp += dst_stride2; + DUP2_ARG2(__lsx_vld, src, 0, pred, 0, src0, pred0); + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src1, src2, src3, src4); + DUP4_ARG2(__lsx_vldx, pred, pred_stride, pred, pred_stride2, pred, + pred_stride3, pred, pred_stride4, pred1, pred2, pred3, pred4); + src += src_stride4; + pred += pred_stride4; + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + pred, pred_stride, src5, src6, src7, pred5); + DUP2_ARG2(__lsx_vldx, pred, pred_stride2, pred, pred_stride3, pred6, pred7); + DUP4_ARG2(__lsx_vilvl_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3, + reg0, reg2, reg4, reg6); + DUP4_ARG2(__lsx_vilvh_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3, + reg1, reg3, reg5, reg7); + DUP4_ARG2(__lsx_vilvl_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7, + tmp0, tmp2, tmp4, tmp6); + DUP4_ARG2(__lsx_vilvh_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7, + tmp1, tmp3, tmp5, tmp7); + DUP4_ARG2(__lsx_vhsubw_hu_bu, reg0, reg0, reg1, reg1, reg2, reg2, reg3, reg3, + src0, src1, src2, src3); + DUP4_ARG2(__lsx_vhsubw_hu_bu, reg4, reg4, reg5, reg5, reg6, reg6, reg7, reg7, + src4, src5, src6, src7); + DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3, tmp3, + pred0, pred1, pred2, pred3); + DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp4, tmp4, tmp5, tmp5, tmp6, tmp6, tmp7, tmp7, + pred4, pred5, pred6, pred7); + __lsx_vst(src0, diff, 0); + __lsx_vstx(src2, diff, dst_stride); + __lsx_vstx(src4, diff, dst_stride2); + __lsx_vstx(src6, diff, dst_stride3); + __lsx_vst(src1, diff_tmp, 0); + __lsx_vstx(src3, diff_tmp, dst_stride); + __lsx_vstx(src5, diff_tmp, dst_stride2); + __lsx_vstx(src7, diff_tmp, dst_stride3); + diff += dst_stride2; + diff_tmp += dst_stride2; + __lsx_vst(pred0, diff, 0); + __lsx_vstx(pred2, diff, dst_stride); + __lsx_vstx(pred4, diff, dst_stride2); + __lsx_vstx(pred6, diff, dst_stride3); + __lsx_vst(pred1, diff_tmp, 0); + __lsx_vstx(pred3, diff_tmp, dst_stride); + __lsx_vstx(pred5, diff_tmp, dst_stride2); + __lsx_vstx(pred7, diff_tmp, dst_stride3); +} + +static void sub_blk_32x32_lsx(const uint8_t *src, int32_t src_stride, + const uint8_t *pred, int32_t pred_stride, + int16_t *diff, int32_t diff_stride) { + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + __m128i pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7; + __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + uint32_t loop_cnt; + int32_t src_stride2 = src_stride << 1; + int32_t pred_stride2 = pred_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t pred_stride3 = pred_stride2 + pred_stride; + int32_t src_stride4 = src_stride2 << 1; + int32_t pred_stride4 = pred_stride2 << 1; + + for (loop_cnt = 8; loop_cnt--;) { + const uint8_t *src_tmp = src + 16; + const uint8_t *pred_tmp = pred + 16; + DUP4_ARG2(__lsx_vld, src, 0, src_tmp, 0, pred, 0, pred_tmp, 0, src0, src1, + pred0, pred1); + DUP4_ARG2(__lsx_vldx, src, src_stride, src_tmp, src_stride, src, + src_stride2, src_tmp, src_stride2, src2, src3, src4, src5); + DUP4_ARG2(__lsx_vldx, src, src_stride3, src_tmp, src_stride3, pred, + pred_stride, pred_tmp, pred_stride, src6, src7, pred2, pred3); + DUP4_ARG2(__lsx_vldx, pred, pred_stride2, pred_tmp, pred_stride2, pred, + pred_stride3, pred_tmp, pred_stride3, pred4, pred5, pred6, pred7); + DUP4_ARG2(__lsx_vilvl_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3, + reg0, reg2, reg4, reg6); + DUP4_ARG2(__lsx_vilvh_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3, + reg1, reg3, reg5, reg7); + DUP4_ARG2(__lsx_vilvl_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7, + tmp0, tmp2, tmp4, tmp6); + DUP4_ARG2(__lsx_vilvh_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7, + tmp1, tmp3, tmp5, tmp7); + DUP4_ARG2(__lsx_vhsubw_hu_bu, reg0, reg0, reg1, reg1, reg2, reg2, reg3, + reg3, src0, src1, src2, src3); + DUP4_ARG2(__lsx_vhsubw_hu_bu, reg4, reg4, reg5, reg5, reg6, reg6, reg7, + reg7, src4, src5, src6, src7); + DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3, + tmp3, pred0, pred1, pred2, pred3); + DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp4, tmp4, tmp5, tmp5, tmp6, tmp6, tmp7, + tmp7, pred4, pred5, pred6, pred7); + src += src_stride4; + pred += pred_stride4; + __lsx_vst(src0, diff, 0); + __lsx_vst(src1, diff, 16); + __lsx_vst(src2, diff, 32); + __lsx_vst(src3, diff, 48); + diff += diff_stride; + __lsx_vst(src4, diff, 0); + __lsx_vst(src5, diff, 16); + __lsx_vst(src6, diff, 32); + __lsx_vst(src7, diff, 48); + diff += diff_stride; + __lsx_vst(pred0, diff, 0); + __lsx_vst(pred1, diff, 16); + __lsx_vst(pred2, diff, 32); + __lsx_vst(pred3, diff, 48); + diff += diff_stride; + __lsx_vst(pred4, diff, 0); + __lsx_vst(pred5, diff, 16); + __lsx_vst(pred6, diff, 32); + __lsx_vst(pred7, diff, 48); + diff += diff_stride; + } +} + +static void sub_blk_64x64_lsx(const uint8_t *src, int32_t src_stride, + const uint8_t *pred, int32_t pred_stride, + int16_t *diff, int32_t diff_stride) { + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + __m128i pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7; + __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + uint32_t loop_cnt; + + for (loop_cnt = 32; loop_cnt--;) { + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2, + src3); + DUP4_ARG2(__lsx_vld, pred, 0, pred, 16, pred, 32, pred, 48, pred0, pred1, + pred2, pred3); + src += src_stride; + pred += pred_stride; + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src4, src5, src6, + src7); + DUP4_ARG2(__lsx_vld, pred, 0, pred, 16, pred, 32, pred, 48, pred4, pred5, + pred6, pred7); + src += src_stride; + pred += pred_stride; + + DUP4_ARG2(__lsx_vilvl_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3, + reg0, reg2, reg4, reg6); + DUP4_ARG2(__lsx_vilvh_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3, + reg1, reg3, reg5, reg7); + DUP4_ARG2(__lsx_vilvl_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7, + tmp0, tmp2, tmp4, tmp6); + DUP4_ARG2(__lsx_vilvh_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7, + tmp1, tmp3, tmp5, tmp7); + DUP4_ARG2(__lsx_vhsubw_hu_bu, reg0, reg0, reg1, reg1, reg2, reg2, reg3, + reg3, src0, src1, src2, src3); + DUP4_ARG2(__lsx_vhsubw_hu_bu, reg4, reg4, reg5, reg5, reg6, reg6, reg7, + reg7, src4, src5, src6, src7); + DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3, + tmp3, pred0, pred1, pred2, pred3); + DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp4, tmp4, tmp5, tmp5, tmp6, tmp6, tmp7, + tmp7, pred4, pred5, pred6, pred7); + __lsx_vst(src0, diff, 0); + __lsx_vst(src1, diff, 16); + __lsx_vst(src2, diff, 32); + __lsx_vst(src3, diff, 48); + __lsx_vst(src4, diff, 64); + __lsx_vst(src5, diff, 80); + __lsx_vst(src6, diff, 96); + __lsx_vst(src7, diff, 112); + diff += diff_stride; + __lsx_vst(pred0, diff, 0); + __lsx_vst(pred1, diff, 16); + __lsx_vst(pred2, diff, 32); + __lsx_vst(pred3, diff, 48); + __lsx_vst(pred4, diff, 64); + __lsx_vst(pred5, diff, 80); + __lsx_vst(pred6, diff, 96); + __lsx_vst(pred7, diff, 112); + diff += diff_stride; + } +} + +void vpx_subtract_block_lsx(int32_t rows, int32_t cols, int16_t *diff_ptr, + ptrdiff_t diff_stride, const uint8_t *src_ptr, + ptrdiff_t src_stride, const uint8_t *pred_ptr, + ptrdiff_t pred_stride) { + if (rows == cols) { + switch (rows) { + case 4: + sub_blk_4x4_lsx(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr, + diff_stride); + break; + case 8: + sub_blk_8x8_lsx(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr, + diff_stride); + break; + case 16: + sub_blk_16x16_lsx(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr, + diff_stride); + break; + case 32: + sub_blk_32x32_lsx(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr, + diff_stride); + break; + case 64: + sub_blk_64x64_lsx(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr, + diff_stride); + break; + default: + vpx_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr, + src_stride, pred_ptr, pred_stride); + break; + } + } else { + vpx_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr, src_stride, + pred_ptr, pred_stride); + } +} diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index 7de8b02055..9d8c94545d 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -382,6 +382,8 @@ DSP_SRCS-$(HAVE_SSE2) += x86/subtract_sse2.asm DSP_SRCS-$(HAVE_VSX) += ppc/sad_vsx.c DSP_SRCS-$(HAVE_VSX) += ppc/subtract_vsx.c +DSP_SRCS-$(HAVE_LSX) += loongarch/subtract_lsx.c + ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad4d_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad_sse2.asm diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 706af97e50..1ef99e641d 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -730,7 +730,7 @@ () # Block subtraction # add_proto qw/void vpx_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride"; -specialize qw/vpx_subtract_block neon msa mmi sse2 vsx/; +specialize qw/vpx_subtract_block neon msa mmi sse2 vsx lsx/; # # Single block SAD From bfbb79e252b9102ca5ae3ad5ab605254ce6681d2 Mon Sep 17 00:00:00 2001 From: yuanhecai Date: Wed, 20 Apr 2022 11:13:13 +0800 Subject: [PATCH 0371/1000] vp8[loongarch]: Optimize sub_pixel_variance8x8/16x16 1. vpx_sub_pixel_variance8x8_lsx 1. vpx_sub_pixel_variance16x16_lsx 2. vpx_mse16x16_lsx Bug: webm:1755 Change-Id: Iaedd8393c950c13042a0597d0d47b534a2723317 --- test/variance_test.cc | 12 +- vpx_dsp/loongarch/sub_pixel_variance_lsx.c | 212 ++++++++++++++++----- vpx_dsp/loongarch/variance_lsx.c | 102 +++++----- vpx_dsp/loongarch/variance_lsx.h | 62 ++++++ vpx_dsp/vpx_dsp.mk | 1 + vpx_dsp/vpx_dsp_rtcd_defs.pl | 6 +- 6 files changed, 301 insertions(+), 94 deletions(-) create mode 100644 vpx_dsp/loongarch/variance_lsx.h diff --git a/test/variance_test.cc b/test/variance_test.cc index a11ce25a63..80855052dc 100644 --- a/test/variance_test.cc +++ b/test/variance_test.cc @@ -1651,6 +1651,9 @@ INSTANTIATE_TEST_SUITE_P( #endif // HAVE_MMI #if HAVE_LSX +INSTANTIATE_TEST_SUITE_P(LSX, VpxMseTest, + ::testing::Values(MseParams(4, 4, &vpx_mse16x16_lsx))); + INSTANTIATE_TEST_SUITE_P( LSX, VpxVarianceTest, ::testing::Values(VarianceParams(6, 6, &vpx_variance64x64_lsx), @@ -1658,9 +1661,12 @@ INSTANTIATE_TEST_SUITE_P( VarianceParams(4, 4, &vpx_variance16x16_lsx), VarianceParams(3, 3, &vpx_variance8x8_lsx))); -INSTANTIATE_TEST_SUITE_P(LSX, VpxSubpelVarianceTest, - ::testing::Values(SubpelVarianceParams( - 5, 5, &vpx_sub_pixel_variance32x32_lsx, 0))); +INSTANTIATE_TEST_SUITE_P( + LSX, VpxSubpelVarianceTest, + ::testing::Values( + SubpelVarianceParams(3, 3, &vpx_sub_pixel_variance8x8_lsx, 0), + SubpelVarianceParams(4, 4, &vpx_sub_pixel_variance16x16_lsx, 0), + SubpelVarianceParams(5, 5, &vpx_sub_pixel_variance32x32_lsx, 0))); INSTANTIATE_TEST_SUITE_P(LSX, VpxSubpelAvgVarianceTest, ::testing::Values(SubpelAvgVarianceParams( diff --git a/vpx_dsp/loongarch/sub_pixel_variance_lsx.c b/vpx_dsp/loongarch/sub_pixel_variance_lsx.c index c7d233af82..700793531c 100644 --- a/vpx_dsp/loongarch/sub_pixel_variance_lsx.c +++ b/vpx_dsp/loongarch/sub_pixel_variance_lsx.c @@ -10,47 +10,17 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_ports/mem.h" -#include "vpx_util/loongson_intrinsics.h" +#include "vpx_dsp/loongarch/variance_lsx.h" #include "vpx_dsp/variance.h" -#define HADD_SW_S32(in0, in1) \ - do { \ - __m128i res0_m; \ - \ - res0_m = __lsx_vhaddw_d_w(in0, in0); \ - res0_m = __lsx_vhaddw_q_d(res0_m, res0_m); \ - in1 = __lsx_vpickve2gr_w(res0_m, 0); \ - } while (0) - -#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift, in2) \ - do { \ - __m128i tmp0_m, tmp1_m; \ - \ - tmp0_m = __lsx_vshuf_b(in1, in0, mask); \ - tmp1_m = __lsx_vdp2_h_bu(tmp0_m, coeff); \ - in2 = __lsx_vsrari_h(tmp1_m, shift); \ - } while (0) - -#define CALC_MSE_AVG_B(src, ref, var, sub) \ - { \ - __m128i src_l0_m, src_l1_m; \ - __m128i res_l0_m, res_l1_m; \ - \ - src_l0_m = __lsx_vilvl_b(src, ref); \ - src_l1_m = __lsx_vilvh_b(src, ref); \ - DUP2_ARG2(__lsx_vhsubw_hu_bu, src_l0_m, src_l0_m, src_l1_m, src_l1_m, \ - res_l0_m, res_l1_m); \ - var = __lsx_vdp2add_w_h(var, res_l0_m, res_l0_m); \ - var = __lsx_vdp2add_w_h(var, res_l1_m, res_l1_m); \ - sub = __lsx_vadd_h(sub, res_l0_m); \ - sub = __lsx_vadd_h(sub, res_l1_m); \ - } - static const uint8_t bilinear_filters_lsx[8][2] = { { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 }, { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 }, }; +#define VARIANCE_WxH(sse, diff, shift) \ + (sse) - (((uint32_t)(diff) * (diff)) >> (shift)) + #define VARIANCE_LARGE_WxH(sse, diff, shift) \ (sse) - (((int64_t)(diff) * (diff)) >> (shift)) @@ -59,8 +29,7 @@ static uint32_t avg_sse_diff_64x64_lsx(const uint8_t *src_ptr, const uint8_t *ref_ptr, int32_t ref_stride, const uint8_t *sec_pred, int32_t *diff) { - int32_t ht_cnt = 32; - uint32_t res; + int32_t res, ht_cnt = 32; __m128i src0, src1, src2, src3, ref0, ref1, ref2, ref3; __m128i pred0, pred1, pred2, pred3, vec, vec_tmp; __m128i avg0, avg1, avg2, avg3; @@ -119,11 +88,58 @@ static uint32_t avg_sse_diff_64x64_lsx(const uint8_t *src_ptr, return res; } +static uint32_t sub_pixel_sse_diff_8width_h_lsx( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { + uint32_t loop_cnt = (height >> 2); + int32_t res; + __m128i src0, src1, src2, src3, ref0, ref1, ref2, ref3; + __m128i vec0, vec1, vec2, vec3, filt0, out, vec; + __m128i mask = { 0x0403030202010100, 0x0807070606050504 }; + __m128i avg = __lsx_vldi(0); + __m128i var = avg; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + int32_t dst_stride4 = dst_stride2 << 1; + + filt0 = __lsx_vldrepl_h(filter, 0); + for (; loop_cnt--;) { + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + src += src_stride4; + ref0 = __lsx_vld(dst, 0); + DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, ref1, ref2); + ref3 = __lsx_vldx(dst, dst_stride3); + dst += dst_stride4; + + DUP2_ARG2(__lsx_vpickev_d, ref1, ref0, ref3, ref2, ref0, ref1); + DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1); + DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, + filt0, vec0, vec1, vec2, vec3); + DUP4_ARG3(__lsx_vssrarni_bu_h, vec0, vec0, FILTER_BITS, vec1, vec1, + FILTER_BITS, vec2, vec2, FILTER_BITS, vec3, vec3, FILTER_BITS, + src0, src1, src2, src3); + out = __lsx_vpackev_d(src1, src0); + CALC_MSE_AVG_B(out, ref0, var, avg); + out = __lsx_vpackev_d(src3, src2); + CALC_MSE_AVG_B(out, ref1, var, avg); + } + vec = __lsx_vhaddw_w_h(avg, avg); + HADD_SW_S32(vec, *diff); + HADD_SW_S32(var, res); + return res; +} + static uint32_t sub_pixel_sse_diff_16width_h_lsx( const uint8_t *src, int32_t src_stride, const uint8_t *dst, int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { uint32_t loop_cnt = (height >> 2); - uint32_t res; + int32_t res; __m128i src0, src1, src2, src3, src4, src5, src6, src7; __m128i dst0, dst1, dst2, dst3, filt0; __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; @@ -172,7 +188,6 @@ static uint32_t sub_pixel_sse_diff_16width_h_lsx( vec = __lsx_vhaddw_w_h(avg, avg); HADD_SW_S32(vec, *diff); HADD_SW_S32(var, res); - return res; } @@ -195,11 +210,59 @@ static uint32_t sub_pixel_sse_diff_32width_h_lsx( return sse; } +static uint32_t sub_pixel_sse_diff_8width_v_lsx( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { + uint32_t loop_cnt = (height >> 2); + int32_t res; + __m128i ref0, ref1, ref2, ref3, src0, src1, src2, src3, src4; + __m128i vec, vec0, vec1, vec2, vec3, tmp0, tmp1, tmp2, tmp3, filt0; + __m128i avg = __lsx_vldi(0); + __m128i var = avg; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + int32_t dst_stride4 = dst_stride2 << 1; + + filt0 = __lsx_vldrepl_h(filter, 0); + src0 = __lsx_vld(src, 0); + src += src_stride; + + for (; loop_cnt--;) { + src1 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3); + src4 = __lsx_vldx(src, src_stride3); + src += src_stride4; + ref0 = __lsx_vld(dst, 0); + DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, ref1, ref2); + ref3 = __lsx_vldx(dst, dst_stride3); + dst += dst_stride4; + + DUP2_ARG2(__lsx_vpickev_d, ref1, ref0, ref3, ref2, ref0, ref1); + DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3, + vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, + filt0, tmp0, tmp1, tmp2, tmp3); + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2, + FILTER_BITS, src0, src1); + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + + src0 = src4; + } + vec = __lsx_vhaddw_w_h(avg, avg); + HADD_SW_S32(vec, *diff); + HADD_SW_S32(var, res); + return res; +} + static uint32_t sub_pixel_sse_diff_16width_v_lsx( const uint8_t *src, int32_t src_stride, const uint8_t *dst, int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { uint32_t loop_cnt = (height >> 2); - uint32_t res; + int32_t res; __m128i ref0, ref1, ref2, ref3, src0, src1, src2, src3, src4; __m128i out0, out1, out2, out3, tmp0, tmp1, filt0, vec; __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; @@ -252,7 +315,6 @@ static uint32_t sub_pixel_sse_diff_16width_v_lsx( vec = __lsx_vhaddw_w_h(avg, avg); HADD_SW_S32(vec, *diff); HADD_SW_S32(var, res); - return res; } @@ -275,12 +337,70 @@ static uint32_t sub_pixel_sse_diff_32width_v_lsx( return sse; } +static uint32_t sub_pixel_sse_diff_8width_hv_lsx( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert, + int32_t height, int32_t *diff) { + uint32_t loop_cnt = (height >> 2); + int32_t res; + __m128i ref0, ref1, ref2, ref3, src0, src1, src2, src3, src4, out0, out1; + __m128i hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3, vec, vec0, filt_hz, filt_vt; + __m128i mask = { 0x0403030202010100, 0x0807070606050504 }; + __m128i avg = __lsx_vldi(0); + __m128i var = avg; + + filt_hz = __lsx_vldrepl_h(filter_horiz, 0); + filt_vt = __lsx_vldrepl_h(filter_vert, 0); + + src0 = __lsx_vld(src, 0); + src += src_stride; + HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS, hz_out0); + + for (; loop_cnt--;) { + DUP2_ARG2(__lsx_vld, src, 0, dst, 0, src1, ref0); + src += src_stride; + dst += dst_stride; + DUP2_ARG2(__lsx_vld, src, 0, dst, 0, src2, ref1); + src += src_stride; + dst += dst_stride; + DUP2_ARG2(__lsx_vld, src, 0, dst, 0, src3, ref2); + src += src_stride; + dst += dst_stride; + DUP2_ARG2(__lsx_vld, src, 0, dst, 0, src4, ref3); + src += src_stride; + dst += dst_stride; + + DUP2_ARG2(__lsx_vpickev_d, ref1, ref0, ref3, ref2, ref0, ref1); + HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS, hz_out1); + vec0 = __lsx_vpackev_b(hz_out1, hz_out0); + tmp0 = __lsx_vdp2_h_bu(vec0, filt_vt); + HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS, hz_out0); + vec0 = __lsx_vpackev_b(hz_out0, hz_out1); + tmp1 = __lsx_vdp2_h_bu(vec0, filt_vt); + + HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS, hz_out1); + vec0 = __lsx_vpackev_b(hz_out1, hz_out0); + tmp2 = __lsx_vdp2_h_bu(vec0, filt_vt); + HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS, hz_out0); + vec0 = __lsx_vpackev_b(hz_out0, hz_out1); + tmp3 = __lsx_vdp2_h_bu(vec0, filt_vt); + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2, + FILTER_BITS, out0, out1); + CALC_MSE_AVG_B(out0, ref0, var, avg); + CALC_MSE_AVG_B(out1, ref1, var, avg); + } + vec = __lsx_vhaddw_w_h(avg, avg); + HADD_SW_S32(vec, *diff); + HADD_SW_S32(var, res); + return res; +} + static uint32_t sub_pixel_sse_diff_16width_hv_lsx( const uint8_t *src, int32_t src_stride, const uint8_t *dst, int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert, int32_t height, int32_t *diff) { uint32_t loop_cnt = (height >> 2); - uint32_t res; + int32_t res; __m128i src0, src1, src2, src3, src4, src5, src6, src7; __m128i ref0, ref1, ref2, ref3, filt_hz, filt_vt, vec0, vec1; __m128i hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1, vec; @@ -378,7 +498,7 @@ static uint32_t subpel_avg_ssediff_16w_h_lsx( int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, int32_t height, int32_t *diff, int32_t width) { uint32_t loop_cnt = (height >> 2); - uint32_t res; + int32_t res; __m128i src0, src1, src2, src3, src4, src5, src6, src7; __m128i dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3; __m128i pred0, pred1, pred2, pred3, filt0, vec; @@ -450,7 +570,7 @@ static uint32_t subpel_avg_ssediff_16w_v_lsx( int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, int32_t height, int32_t *diff, int32_t width) { uint32_t loop_cnt = (height >> 2); - uint32_t res; + int32_t res; __m128i ref0, ref1, ref2, ref3, pred0, pred1, pred2, pred3; __m128i src0, src1, src2, src3, src4, out0, out1, out2, out3; __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; @@ -527,7 +647,7 @@ static uint32_t subpel_avg_ssediff_16w_hv_lsx( int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz, const uint8_t *filter_vert, int32_t height, int32_t *diff, int32_t width) { uint32_t loop_cnt = (height >> 2); - uint32_t res; + int32_t res; __m128i src0, src1, src2, src3, src4, src5, src6, src7; __m128i ref0, ref1, ref2, ref3, pred0, pred1, pred2, pred3; __m128i hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1; @@ -674,6 +794,8 @@ static uint32_t sub_pixel_avg_sse_diff_64width_hv_lsx( return sse; } +#define VARIANCE_8Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 6) +#define VARIANCE_16Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 8) #define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10) #define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12) @@ -712,6 +834,8 @@ static uint32_t sub_pixel_avg_sse_diff_64width_hv_lsx( return var; \ } +VPX_SUB_PIXEL_VARIANCE_WDXHT_LSX(8, 8) +VPX_SUB_PIXEL_VARIANCE_WDXHT_LSX(16, 16) VPX_SUB_PIXEL_VARIANCE_WDXHT_LSX(32, 32) #define VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_LSX(ht) \ diff --git a/vpx_dsp/loongarch/variance_lsx.c b/vpx_dsp/loongarch/variance_lsx.c index 5223e0f169..8fad342c71 100644 --- a/vpx_dsp/loongarch/variance_lsx.c +++ b/vpx_dsp/loongarch/variance_lsx.c @@ -9,33 +9,7 @@ */ #include "./vpx_dsp_rtcd.h" -#include "vpx_util/loongson_intrinsics.h" - -#define HADD_SW_S32(in) \ - ({ \ - __m128i res0_m; \ - int32_t sum_m; \ - \ - res0_m = __lsx_vhaddw_d_w(in, in); \ - res0_m = __lsx_vhaddw_q_d(res0_m, res0_m); \ - sum_m = __lsx_vpickve2gr_w(res0_m, 0); \ - sum_m; \ - }) - -#define CALC_MSE_AVG_B(src, ref, var, sub) \ - { \ - __m128i src_l0_m, src_l1_m; \ - __m128i res_l0_m, res_l1_m; \ - \ - src_l0_m = __lsx_vilvl_b(src, ref); \ - src_l1_m = __lsx_vilvh_b(src, ref); \ - DUP2_ARG2(__lsx_vhsubw_hu_bu, src_l0_m, src_l0_m, src_l1_m, src_l1_m, \ - res_l0_m, res_l1_m); \ - var = __lsx_vdp2add_w_h(var, res_l0_m, res_l0_m); \ - var = __lsx_vdp2add_w_h(var, res_l1_m, res_l1_m); \ - sub = __lsx_vadd_h(sub, res_l0_m); \ - sub = __lsx_vadd_h(sub, res_l1_m); \ - } +#include "vpx_dsp/loongarch/variance_lsx.h" #define VARIANCE_WxH(sse, diff, shift) \ (sse) - (((uint32_t)(diff) * (diff)) >> (shift)) @@ -46,7 +20,7 @@ static uint32_t sse_diff_8width_lsx(const uint8_t *src_ptr, int32_t src_stride, const uint8_t *ref_ptr, int32_t ref_stride, int32_t height, int32_t *diff) { - int32_t ht_cnt = (height >> 2); + int32_t res, ht_cnt = (height >> 2); __m128i src0, src1, src2, src3, ref0, ref1, ref2, ref3, vec; __m128i avg = __lsx_vldi(0); __m128i var = avg; @@ -74,15 +48,15 @@ static uint32_t sse_diff_8width_lsx(const uint8_t *src_ptr, int32_t src_stride, } vec = __lsx_vhaddw_w_h(avg, avg); - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); + HADD_SW_S32(vec, *diff); + HADD_SW_S32(var, res); + return res; } static uint32_t sse_diff_16width_lsx(const uint8_t *src_ptr, int32_t src_stride, const uint8_t *ref_ptr, int32_t ref_stride, int32_t height, int32_t *diff) { - int32_t ht_cnt = (height >> 2); + int32_t res, ht_cnt = (height >> 2); __m128i src, ref, vec; __m128i avg = __lsx_vldi(0); __m128i var = avg; @@ -112,15 +86,15 @@ static uint32_t sse_diff_16width_lsx(const uint8_t *src_ptr, int32_t src_stride, CALC_MSE_AVG_B(src, ref, var, avg); } vec = __lsx_vhaddw_w_h(avg, avg); - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); + HADD_SW_S32(vec, *diff); + HADD_SW_S32(var, res); + return res; } static uint32_t sse_diff_32width_lsx(const uint8_t *src_ptr, int32_t src_stride, const uint8_t *ref_ptr, int32_t ref_stride, int32_t height, int32_t *diff) { - int32_t ht_cnt = (height >> 2); + int32_t res, ht_cnt = (height >> 2); __m128i avg = __lsx_vldi(0); __m128i src0, src1, ref0, ref1; __m128i vec; @@ -157,15 +131,15 @@ static uint32_t sse_diff_32width_lsx(const uint8_t *src_ptr, int32_t src_stride, } vec = __lsx_vhaddw_w_h(avg, avg); - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); + HADD_SW_S32(vec, *diff); + HADD_SW_S32(var, res); + return res; } static uint32_t sse_diff_64x64_lsx(const uint8_t *src_ptr, int32_t src_stride, const uint8_t *ref_ptr, int32_t ref_stride, int32_t *diff) { - int32_t ht_cnt = 32; + int32_t res, ht_cnt = 32; __m128i avg0 = __lsx_vldi(0); __m128i src0, src1, src2, src3; __m128i ref0, ref1, ref2, ref3; @@ -205,12 +179,12 @@ static uint32_t sse_diff_64x64_lsx(const uint8_t *src_ptr, int32_t src_stride, vec0 = __lsx_vadd_w(vec0, vec1); vec1 = __lsx_vhaddw_w_h(avg3, avg3); vec0 = __lsx_vadd_w(vec0, vec1); - *diff = HADD_SW_S32(vec0); - - return HADD_SW_S32(var); + HADD_SW_S32(vec0, *diff); + HADD_SW_S32(var, res); + return res; } -#define VARIANCE_8Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 6); +#define VARIANCE_8Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 6) #define VARIANCE_16Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 8) #define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10) @@ -228,6 +202,38 @@ static uint32_t sse_diff_64x64_lsx(const uint8_t *src_ptr, int32_t src_stride, return VARIANCE_##wd##Wx##ht##H(*sse, diff); \ } +static uint32_t sse_16width_lsx(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height) { + int32_t res, ht_cnt = (height >> 2); + __m128i src, ref; + __m128i var = __lsx_vldi(0); + + for (; ht_cnt--;) { + DUP2_ARG2(__lsx_vld, src_ptr, 0, ref_ptr, 0, src, ref); + src_ptr += src_stride; + ref_ptr += ref_stride; + CALC_MSE_B(src, ref, var); + + DUP2_ARG2(__lsx_vld, src_ptr, 0, ref_ptr, 0, src, ref); + src_ptr += src_stride; + ref_ptr += ref_stride; + CALC_MSE_B(src, ref, var); + + DUP2_ARG2(__lsx_vld, src_ptr, 0, ref_ptr, 0, src, ref); + src_ptr += src_stride; + ref_ptr += ref_stride; + CALC_MSE_B(src, ref, var); + + DUP2_ARG2(__lsx_vld, src_ptr, 0, ref_ptr, 0, src, ref); + src_ptr += src_stride; + ref_ptr += ref_stride; + CALC_MSE_B(src, ref, var); + } + HADD_SW_S32(var, res); + return res; +} + VPX_VARIANCE_WDXHT_LSX(8, 8) VPX_VARIANCE_WDXHT_LSX(16, 16) VPX_VARIANCE_WDXHT_LSX(32, 32) @@ -242,6 +248,14 @@ uint32_t vpx_variance64x64_lsx(const uint8_t *src, int32_t src_stride, return VARIANCE_64Wx64H(*sse, diff); } +uint32_t vpx_mse16x16_lsx(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + uint32_t *sse) { + *sse = sse_16width_lsx(src, src_stride, ref, ref_stride, 16); + + return *sse; +} + void vpx_get16x16var_lsx(const uint8_t *src, int32_t src_stride, const uint8_t *ref, int32_t ref_stride, uint32_t *sse, int32_t *sum) { diff --git a/vpx_dsp/loongarch/variance_lsx.h b/vpx_dsp/loongarch/variance_lsx.h new file mode 100644 index 0000000000..cf9e9890ff --- /dev/null +++ b/vpx_dsp/loongarch/variance_lsx.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_LOONGARCH_VARIANCE_LSX_H_ +#define VPX_VPX_DSP_LOONGARCH_VARIANCE_LSX_H_ + +#include "vpx_util/loongson_intrinsics.h" + +#define HADD_SW_S32(in0, in1) \ + do { \ + __m128i res0_m; \ + \ + res0_m = __lsx_vhaddw_d_w(in0, in0); \ + res0_m = __lsx_vhaddw_q_d(res0_m, res0_m); \ + in1 = __lsx_vpickve2gr_w(res0_m, 0); \ + } while (0) + +#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift, in2) \ + do { \ + __m128i tmp0_m, tmp1_m; \ + \ + tmp0_m = __lsx_vshuf_b(in1, in0, mask); \ + tmp1_m = __lsx_vdp2_h_bu(tmp0_m, coeff); \ + in2 = __lsx_vsrari_h(tmp1_m, shift); \ + } while (0) + +#define CALC_MSE_B(src, ref, var) \ + do { \ + __m128i src_l0_m, src_l1_m; \ + __m128i res_l0_m, res_l1_m; \ + \ + src_l0_m = __lsx_vilvl_b(src, ref); \ + src_l1_m = __lsx_vilvh_b(src, ref); \ + DUP2_ARG2(__lsx_vhsubw_hu_bu, src_l0_m, src_l0_m, src_l1_m, src_l1_m, \ + res_l0_m, res_l1_m); \ + var = __lsx_vdp2add_w_h(var, res_l0_m, res_l0_m); \ + var = __lsx_vdp2add_w_h(var, res_l1_m, res_l1_m); \ + } while (0) + +#define CALC_MSE_AVG_B(src, ref, var, sub) \ + do { \ + __m128i src_l0_m, src_l1_m; \ + __m128i res_l0_m, res_l1_m; \ + \ + src_l0_m = __lsx_vilvl_b(src, ref); \ + src_l1_m = __lsx_vilvh_b(src, ref); \ + DUP2_ARG2(__lsx_vhsubw_hu_bu, src_l0_m, src_l0_m, src_l1_m, src_l1_m, \ + res_l0_m, res_l1_m); \ + var = __lsx_vdp2add_w_h(var, res_l0_m, res_l0_m); \ + var = __lsx_vdp2add_w_h(var, res_l1_m, res_l1_m); \ + sub = __lsx_vadd_h(sub, res_l0_m); \ + sub = __lsx_vadd_h(sub, res_l1_m); \ + } while (0) + +#endif // VPX_VPX_DSP_LOONGARCH_VARIANCE_LSX_H_ diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index 9d8c94545d..4f5a7a1908 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -402,6 +402,7 @@ DSP_SRCS-$(HAVE_NEON) += arm/variance_neon.c DSP_SRCS-$(HAVE_MSA) += mips/variance_msa.c DSP_SRCS-$(HAVE_MSA) += mips/sub_pixel_variance_msa.c +DSP_SRCS-$(HAVE_LSX) += loongarch/variance_lsx.h DSP_SRCS-$(HAVE_LSX) += loongarch/variance_lsx.c DSP_SRCS-$(HAVE_LSX) += loongarch/sub_pixel_variance_lsx.c DSP_SRCS-$(HAVE_LSX) += loongarch/avg_pred_lsx.c diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 1ef99e641d..23925a4793 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -1129,7 +1129,7 @@ () specialize qw/vpx_get8x8var sse2 neon msa vsx/; add_proto qw/unsigned int vpx_mse16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_mse16x16 sse2 avx2 neon msa mmi vsx/; + specialize qw/vpx_mse16x16 sse2 avx2 neon msa mmi vsx lsx/; add_proto qw/unsigned int vpx_mse16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_mse16x8 sse2 avx2 msa mmi vsx/; @@ -1171,7 +1171,7 @@ () specialize qw/vpx_sub_pixel_variance16x32 neon msa mmi sse2 ssse3/; add_proto qw/uint32_t vpx_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_sub_pixel_variance16x16 neon msa mmi sse2 ssse3/; + specialize qw/vpx_sub_pixel_variance16x16 neon msa mmi sse2 ssse3 lsx/; add_proto qw/uint32_t vpx_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_sub_pixel_variance16x8 neon msa mmi sse2 ssse3/; @@ -1180,7 +1180,7 @@ () specialize qw/vpx_sub_pixel_variance8x16 neon msa mmi sse2 ssse3/; add_proto qw/uint32_t vpx_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_sub_pixel_variance8x8 neon msa mmi sse2 ssse3/; + specialize qw/vpx_sub_pixel_variance8x8 neon msa mmi sse2 ssse3 lsx/; add_proto qw/uint32_t vpx_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_sub_pixel_variance8x4 neon msa mmi sse2 ssse3/; From 508c0aff89b511d04cbd1e782cc24313fd6ae06b Mon Sep 17 00:00:00 2001 From: yuanhecai Date: Wed, 20 Apr 2022 11:16:55 +0800 Subject: [PATCH 0372/1000] vp8[loongarch]: Optimize fdct8x4/diamond_search_sad 1. vp8_short_fdct8x4_lsx 2. vp8_diamond_search_sad_lsx 3. vpx_sad8x8_lsx Bug: webm:1755 Change-Id: Ic9df84ead2d4fc07ec58e9730d6a12ac2b2d31c1 --- test/sad_test.cc | 1 + vp8/common/rtcd_defs.pl | 5 +-- vp8/encoder/loongarch/dct_lsx.c | 62 +++++++++++++++++++++++++++++++++ vp8/encoder/mcomp.c | 4 +-- vpx_dsp/loongarch/sad_lsx.c | 36 ++++++++++++++++++- vpx_dsp/vpx_dsp_rtcd_defs.pl | 2 +- 6 files changed, 104 insertions(+), 6 deletions(-) diff --git a/test/sad_test.cc b/test/sad_test.cc index 7ce25343f6..2506f1adbc 100644 --- a/test/sad_test.cc +++ b/test/sad_test.cc @@ -1136,6 +1136,7 @@ const SadMxNParam lsx_tests[] = { SadMxNParam(64, 64, &vpx_sad64x64_lsx), SadMxNParam(32, 32, &vpx_sad32x32_lsx), SadMxNParam(16, 16, &vpx_sad16x16_lsx), + SadMxNParam(8, 8, &vpx_sad8x8_lsx), }; INSTANTIATE_TEST_SUITE_P(LSX, SADTest, ::testing::ValuesIn(lsx_tests)); diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl index 4f45d2ab9a..7bc866faaa 100644 --- a/vp8/common/rtcd_defs.pl +++ b/vp8/common/rtcd_defs.pl @@ -187,7 +187,7 @@ () specialize qw/vp8_short_fdct4x4 sse2 neon msa mmi lsx/; add_proto qw/void vp8_short_fdct8x4/, "short *input, short *output, int pitch"; -specialize qw/vp8_short_fdct8x4 sse2 neon msa mmi/; +specialize qw/vp8_short_fdct8x4 sse2 neon msa mmi lsx/; add_proto qw/void vp8_short_walsh4x4/, "short *input, short *output, int pitch"; specialize qw/vp8_short_walsh4x4 sse2 neon msa mmi/; @@ -222,9 +222,10 @@ () $vp8_refining_search_sad_msa=vp8_refining_search_sadx4; add_proto qw/int vp8_diamond_search_sad/, "struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv"; -specialize qw/vp8_diamond_search_sad sse2 msa/; +specialize qw/vp8_diamond_search_sad sse2 msa lsx/; $vp8_diamond_search_sad_sse2=vp8_diamond_search_sadx4; $vp8_diamond_search_sad_msa=vp8_diamond_search_sadx4; +$vp8_diamond_search_sad_lsx=vp8_diamond_search_sadx4; # # Alt-ref Noise Reduction (ARNR) diff --git a/vp8/encoder/loongarch/dct_lsx.c b/vp8/encoder/loongarch/dct_lsx.c index e090d2360f..a08d4d3f63 100644 --- a/vp8/encoder/loongarch/dct_lsx.c +++ b/vp8/encoder/loongarch/dct_lsx.c @@ -97,3 +97,65 @@ void vp8_short_fdct4x4_lsx(int16_t *input, int16_t *output, int32_t pitch) { __lsx_vst(in0, output, 0); __lsx_vst(in2, output, 16); } + +void vp8_short_fdct8x4_lsx(int16_t *input, int16_t *output, int32_t pitch) { + __m128i in0, in1, in2, in3, temp0, temp1, tmp0, tmp1; + __m128i const0, const1, const2, vec0_w, vec1_w, vec2_w, vec3_w; + __m128i coeff = { 0x38a4eb1814e808a9, 0x659061a82ee01d4c }; + __m128i zero = __lsx_vldi(0); + int32_t pitch2 = pitch << 1; + int32_t pitch3 = pitch2 + pitch; + + in0 = __lsx_vld(input, 0); + DUP2_ARG2(__lsx_vldx, input, pitch, input, pitch2, in1, in2); + in3 = __lsx_vldx(input, pitch3); + LSX_TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3); + + LSX_BUTTERFLY_4_H(in0, in1, in2, in3, temp0, temp1, in1, in3); + DUP4_ARG2(__lsx_vslli_h, temp0, 3, temp1, 3, in1, 3, in3, 3, temp0, temp1, + in1, in3); + in0 = __lsx_vadd_h(temp0, temp1); + in2 = __lsx_vsub_h(temp0, temp1); + SET_DOTP_VALUES(coeff, 0, 1, 2, const1, const2); + temp0 = __lsx_vreplvei_h(coeff, 3); + vec1_w = __lsx_vpackev_h(zero, temp0); + coeff = __lsx_vilvh_h(zero, coeff); + vec3_w = __lsx_vreplvei_w(coeff, 0); + tmp1 = __lsx_vilvl_h(in3, in1); + tmp0 = __lsx_vilvh_h(in3, in1); + vec0_w = vec1_w; + vec2_w = vec3_w; + DUP4_ARG3(__lsx_vdp2add_w_h, vec0_w, tmp1, const1, vec1_w, tmp0, const1, + vec2_w, tmp1, const2, vec3_w, tmp0, const2, vec0_w, vec1_w, vec2_w, + vec3_w); + DUP2_ARG3(__lsx_vsrani_h_w, vec1_w, vec0_w, 12, vec3_w, vec2_w, 12, in1, in3); + LSX_TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3); + + LSX_BUTTERFLY_4_H(in0, in1, in2, in3, temp0, temp1, in1, in3); + in0 = __lsx_vadd_h(temp0, temp1); + in0 = __lsx_vaddi_hu(in0, 7); + in2 = __lsx_vsub_h(temp0, temp1); + in2 = __lsx_vaddi_hu(in2, 7); + in0 = __lsx_vsrai_h(in0, 4); + in2 = __lsx_vsrai_h(in2, 4); + DUP2_ARG2(__lsx_vreplvei_w, coeff, 2, coeff, 3, vec3_w, vec1_w); + vec3_w = __lsx_vadd_w(vec3_w, vec1_w); + vec1_w = __lsx_vreplvei_w(coeff, 1); + const0 = RET_1_IF_NZERO_H(in3); + tmp1 = __lsx_vilvl_h(in3, in1); + tmp0 = __lsx_vilvh_h(in3, in1); + vec0_w = vec1_w; + vec2_w = vec3_w; + DUP4_ARG3(__lsx_vdp2add_w_h, vec0_w, tmp1, const1, vec1_w, tmp0, const1, + vec2_w, tmp1, const2, vec3_w, tmp0, const2, vec0_w, vec1_w, vec2_w, + vec3_w); + DUP2_ARG3(__lsx_vsrani_h_w, vec1_w, vec0_w, 16, vec3_w, vec2_w, 16, in1, in3); + in1 = __lsx_vadd_h(in1, const0); + DUP2_ARG2(__lsx_vpickev_d, in1, in0, in3, in2, temp0, temp1); + __lsx_vst(temp0, output, 0); + __lsx_vst(temp1, output, 16); + + DUP2_ARG2(__lsx_vpickod_d, in1, in0, in3, in2, in0, in2); + __lsx_vst(in0, output, 32); + __lsx_vst(in2, output, 48); +} diff --git a/vp8/encoder/mcomp.c b/vp8/encoder/mcomp.c index 769c2f5589..ae092c66e1 100644 --- a/vp8/encoder/mcomp.c +++ b/vp8/encoder/mcomp.c @@ -1129,7 +1129,7 @@ int vp8_diamond_search_sad_c(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit); } -#if HAVE_SSE2 || HAVE_MSA +#if HAVE_SSE2 || HAVE_MSA || HAVE_LSX int vp8_diamond_search_sadx4(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, int_mv *best_mv, int search_param, int sad_per_bit, int *num00, vp8_variance_fn_ptr_t *fn_ptr, @@ -1278,7 +1278,7 @@ int vp8_diamond_search_sadx4(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, return fn_ptr->vf(what, what_stride, best_address, in_what_stride, &thissad) + mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit); } -#endif // HAVE_SSE2 || HAVE_MSA +#endif // HAVE_SSE2 || HAVE_MSA || HAVE_LSX int vp8_full_search_sad(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, int sad_per_bit, int distance, diff --git a/vpx_dsp/loongarch/sad_lsx.c b/vpx_dsp/loongarch/sad_lsx.c index 4764acbf88..46ee557df5 100644 --- a/vpx_dsp/loongarch/sad_lsx.c +++ b/vpx_dsp/loongarch/sad_lsx.c @@ -57,6 +57,34 @@ sum_m; \ }) +static uint32_t sad_8width_lsx(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height) { + int32_t ht_cnt; + __m128i src0, src1, src2, src3, ref0, ref1, ref2, ref3, sad_tmp; + __m128i sad = __lsx_vldi(0); + + for (ht_cnt = (height >> 2); ht_cnt--;) { + DUP2_ARG2(__lsx_vld, src, 0, ref, 0, src0, ref0); + src += src_stride; + ref += ref_stride; + DUP2_ARG2(__lsx_vld, src, 0, ref, 0, src1, ref1); + src += src_stride; + ref += ref_stride; + DUP2_ARG2(__lsx_vld, src, 0, ref, 0, src2, ref2); + src += src_stride; + ref += ref_stride; + DUP2_ARG2(__lsx_vld, src, 0, ref, 0, src3, ref3); + src += src_stride; + ref += ref_stride; + DUP4_ARG2(__lsx_vpickev_d, src1, src0, src3, src2, ref1, ref0, ref3, ref2, + src0, src1, ref0, ref1); + sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad = __lsx_vadd_h(sad, sad_tmp); + } + return HADD_UH_U32(sad); +} + static uint32_t sad_16width_lsx(const uint8_t *src, int32_t src_stride, const uint8_t *ref, int32_t ref_stride, int32_t height) { @@ -584,6 +612,12 @@ static uint32_t avgsad_64width_lsx(const uint8_t *src, int32_t src_stride, return HADD_SW_S32(sad); } +#define VPX_SAD_8xHT_LSX(height) \ + uint32_t vpx_sad8x##height##_lsx(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride) { \ + return sad_8width_lsx(src, src_stride, ref, ref_stride, height); \ + } + #define VPX_SAD_16xHT_LSX(height) \ uint32_t vpx_sad16x##height##_lsx(const uint8_t *src, int32_t src_stride, \ const uint8_t *ref, int32_t ref_stride) { \ @@ -662,7 +696,7 @@ SAD32 SAD16 -#define SAD8 VPX_SAD_8xHTx4D_LSX(8) +#define SAD8 VPX_SAD_8xHT_LSX(8) VPX_SAD_8xHTx4D_LSX(8) SAD8 diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 23925a4793..e82b487f13 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -763,7 +763,7 @@ () specialize qw/vpx_sad8x16 neon msa sse2 vsx mmi/; add_proto qw/unsigned int vpx_sad8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad8x8 neon msa sse2 vsx mmi/; +specialize qw/vpx_sad8x8 neon msa sse2 vsx mmi lsx/; add_proto qw/unsigned int vpx_sad8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; specialize qw/vpx_sad8x4 neon msa sse2 vsx mmi/; From 1c39c625264fa64db1c573cbac1f3a4f24c660d3 Mon Sep 17 00:00:00 2001 From: yuanhecai Date: Sun, 24 Apr 2022 10:34:21 +0800 Subject: [PATCH 0373/1000] vp8[loongarch]: Optimize vp8_sixtap_predict4x4 1. vp8_sixtap_predict4x4 Bug: webm:1755 Change-Id: If7d844496ef2cfe2252f2ef12bb7cded63ad03dd --- vp8/common/loongarch/sixtap_filter_lsx.c | 745 ++++++++++++++++++++++- vp8/common/rtcd_defs.pl | 2 +- 2 files changed, 743 insertions(+), 4 deletions(-) diff --git a/vp8/common/loongarch/sixtap_filter_lsx.c b/vp8/common/loongarch/sixtap_filter_lsx.c index 75fe533d98..a23ed16d2b 100644 --- a/vp8/common/loongarch/sixtap_filter_lsx.c +++ b/vp8/common/loongarch/sixtap_filter_lsx.c @@ -50,9 +50,9 @@ static const uint8_t vp8_mc_filt_mask_arr[16 * 3] = { __m128i vec0_m, vec1_m, vec2_m; \ __m128i hz_out_m; \ \ - DUP2_ARG3(__lsx_vshuf_b, src0, src1, mask0, src0, src1, mask1, vec0_m, \ + DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask0, src1, src0, mask1, vec0_m, \ vec1_m); \ - vec2_m = __lsx_vshuf_b(src0, src1, mask2); \ + vec2_m = __lsx_vshuf_b(src1, src0, mask2); \ hz_out_m = DPADD_H3(vec0_m, vec1_m, vec2_m, filt_h0, filt_h1, filt_h2); \ \ hz_out_m = __lsx_vsrari_h(hz_out_m, VP8_FILTER_SHIFT); \ @@ -61,6 +61,24 @@ static const uint8_t vp8_mc_filt_mask_arr[16 * 3] = { hz_out_m; \ }) +#define HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ + mask2, filt0, filt1, filt2, out0, out1) \ + { \ + __m128i vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m; \ + \ + DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask0, src3, src2, mask0, vec0_m, \ + vec1_m); \ + DUP2_ARG2(__lsx_vdp2_h_b, vec0_m, filt0, vec1_m, filt0, out0, out1); \ + DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask1, src3, src2, mask1, vec2_m, \ + vec3_m); \ + DUP2_ARG3(__lsx_vdp2add_h_b, out0, vec2_m, filt1, out1, vec3_m, filt1, \ + out0, out1); \ + DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask2, src3, src2, mask2, vec4_m, \ + vec5_m); \ + DUP2_ARG3(__lsx_vdp2add_h_b, out0, vec4_m, filt2, out1, vec5_m, filt2, \ + out0, out1); \ + } + #define HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ mask2, filt0, filt1, filt2, out0, out1, \ out2, out3) \ @@ -104,7 +122,7 @@ static const uint8_t vp8_mc_filt_mask_arr[16 * 3] = { __m128i vec0_m, vec1_m; \ __m128i hz_out_m; \ \ - DUP2_ARG3(__lsx_vshuf_b, src0, src1, mask0, src0, src1, mask1, vec0_m, \ + DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask0, src1, src0, mask1, vec0_m, \ vec1_m); \ hz_out_m = FILT_4TAP_DPADD_H(vec0_m, vec1_m, filt_h0, filt_h1); \ hz_out_m = __lsx_vsrari_h(hz_out_m, VP8_FILTER_SHIFT); \ @@ -113,6 +131,20 @@ static const uint8_t vp8_mc_filt_mask_arr[16 * 3] = { hz_out_m; \ }) +#define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ + filt0, filt1, out0, out1) \ + { \ + __m128i vec0_m, vec1_m, vec2_m, vec3_m; \ + \ + DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask0, src3, src2, mask0, vec0_m, \ + vec1_m); \ + DUP2_ARG2(__lsx_vdp2_h_b, vec0_m, filt0, vec1_m, filt0, out0, out1); \ + DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask1, src3, src2, mask1, vec2_m, \ + vec3_m); \ + DUP2_ARG3(__lsx_vdp2add_h_b, out0, vec2_m, filt1, out1, vec3_m, filt1, \ + out0, out1); \ + } + #define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ filt0, filt1, out0, out1, out2, out3) \ ({ \ @@ -133,6 +165,107 @@ static const uint8_t vp8_mc_filt_mask_arr[16 * 3] = { out3); \ }) +static inline void common_hz_6t_4x4_lsx(uint8_t *RESTRICT src, + int32_t src_stride, + uint8_t *RESTRICT dst, + int32_t dst_stride, + const int8_t *filter) { + __m128i src0, src1, src2, src3, filt0, filt1, filt2; + __m128i mask0, mask1, mask2, out0, out1; + int32_t src_stride_x2 = src_stride << 1; + int32_t src_stride_x3 = src_stride_x2 + src_stride; + + mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 16); + src -= 2; + + DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1); + filt2 = __lsx_vldrepl_h(filter, 4); + + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src1, src2); + src3 = __lsx_vldx(src, src_stride_x3); + + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, filt0, + filt1, filt2, out0, out1); + out0 = __lsx_vssrarni_b_h(out1, out0, VP8_FILTER_SHIFT); + out0 = __lsx_vxori_b(out0, 128); + + __lsx_vstelm_w(out0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 3); +} + +static void common_hz_6t_4x8_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter) { + __m128i src0, src1, src2, src3, filt0, filt1, filt2; + __m128i mask0, mask1, mask2, out0, out1, out2, out3; + int32_t src_stride_x2 = src_stride << 1; + int32_t src_stride_x3 = src_stride_x2 + src_stride; + int32_t src_stride_x4 = src_stride_x2 << 1; + + mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 16); + src -= 2; + + DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1); + filt2 = __lsx_vldrepl_h(filter, 4); + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src1, src2); + src3 = __lsx_vldx(src, src_stride_x3); + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + src += src_stride_x4; + HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, filt0, + filt1, filt2, out0, out1); + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src1, src2); + src3 = __lsx_vldx(src, src_stride_x3); + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, filt0, + filt1, filt2, out2, out3); + + DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, VP8_FILTER_SHIFT, out3, out2, + VP8_FILTER_SHIFT, out0, out1); + DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); + __lsx_vstelm_w(out0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 3); + dst += dst_stride; + + __lsx_vstelm_w(out1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(out1, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(out1, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(out1, dst, 0, 3); +} + +static void common_hz_6t_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + if (height == 4) { + common_hz_6t_4x4_lsx(src, src_stride, dst, dst_stride, filter); + } else if (height == 8) { + common_hz_6t_4x8_lsx(src, src_stride, dst, dst_stride, filter); + } +} + static void common_hz_6t_8w_lsx(uint8_t *RESTRICT src, int32_t src_stride, uint8_t *RESTRICT dst, int32_t dst_stride, const int8_t *filter, int32_t height) { @@ -254,6 +387,64 @@ static void common_hz_6t_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride, } } +static void common_vt_6t_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + uint32_t loop_cnt; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8; + __m128i src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r; + __m128i src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2; + __m128i out0, out1; + int32_t src_stride_x2 = src_stride << 1; + int32_t src_stride_x3 = src_stride_x2 + src_stride; + int32_t src_stride_x4 = src_stride << 2; + + DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1); + filt2 = __lsx_vldrepl_h(filter, 4); + + DUP2_ARG2(__lsx_vldx, src, -src_stride_x2, src, -src_stride, src0, src1); + src2 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src3, src4); + src += src_stride_x3; + + DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3, + src10_r, src21_r, src32_r, src43_r); + DUP2_ARG2(__lsx_vilvl_d, src21_r, src10_r, src43_r, src32_r, src2110, + src4332); + DUP2_ARG2(__lsx_vxori_b, src2110, 128, src4332, 128, src2110, src4332); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + src5 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src6, src7); + src8 = __lsx_vldx(src, src_stride_x3); + src += src_stride_x4; + + DUP4_ARG2(__lsx_vilvl_b, src5, src4, src6, src5, src7, src6, src8, src7, + src54_r, src65_r, src76_r, src87_r); + DUP2_ARG2(__lsx_vilvl_d, src65_r, src54_r, src87_r, src76_r, src6554, + src8776); + DUP2_ARG2(__lsx_vxori_b, src6554, 128, src8776, 128, src6554, src8776); + out0 = DPADD_H3(src2110, src4332, src6554, filt0, filt1, filt2); + out1 = DPADD_H3(src4332, src6554, src8776, filt0, filt1, filt2); + + out0 = __lsx_vssrarni_b_h(out1, out0, VP8_FILTER_SHIFT); + out0 = __lsx_vxori_b(out0, 128); + + __lsx_vstelm_w(out0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 3); + dst += dst_stride; + + src2110 = src6554; + src4332 = src8776; + src4 = src8; + } +} + static void common_vt_6t_8w_lsx(uint8_t *RESTRICT src, int32_t src_stride, uint8_t *RESTRICT dst, int32_t dst_stride, const int8_t *filter, int32_t height) { @@ -395,6 +586,92 @@ static void common_vt_6t_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride, } } +static void common_hv_6ht_6vt_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter_horiz, + const int8_t *filter_vert, + int32_t height) { + uint32_t loop_cnt; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, tmp0, tmp1; + __m128i filt_hz0, filt_hz1, filt_hz2, mask0, mask1, mask2; + __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; + __m128i hz_out7, filt_vt0, filt_vt1, filt_vt2, out0, out1, out2, out3; + __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; + int32_t src_stride_x2 = src_stride << 1; + int32_t src_stride_x3 = src_stride_x2 + src_stride; + + mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 16); + src -= 2; + + DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filt_hz0, + filt_hz1); + filt_hz2 = __lsx_vldrepl_h(filter_horiz, 4); + DUP2_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filt_vt0, + filt_vt1); + filt_vt2 = __lsx_vldrepl_h(filter_vert, 4); + + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + + DUP2_ARG2(__lsx_vldx, src, -src_stride_x2, src, -src_stride, src0, src1); + src2 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src3, src4); + src += src_stride_x3; + + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + src4 = __lsx_vxori_b(src4, 128); + + hz_out0 = HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2, filt_hz0, filt_hz1, + filt_hz2); + hz_out2 = HORIZ_6TAP_FILT(src2, src3, mask0, mask1, mask2, filt_hz0, filt_hz1, + filt_hz2); + hz_out1 = __lsx_vshuf_b(hz_out2, hz_out0, shuff); + hz_out3 = HORIZ_6TAP_FILT(src3, src4, mask0, mask1, mask2, filt_hz0, filt_hz1, + filt_hz2); + DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, out0, out1); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + src5 = __lsx_vld(src, 0); + src6 = __lsx_vldx(src, src_stride); + src += src_stride_x2; + + DUP2_ARG2(__lsx_vxori_b, src5, 128, src6, 128, src5, src6); + hz_out5 = HORIZ_6TAP_FILT(src5, src6, mask0, mask1, mask2, filt_hz0, + filt_hz1, filt_hz2); + hz_out4 = __lsx_vshuf_b(hz_out5, hz_out3, shuff); + + src7 = __lsx_vld(src, 0); + src8 = __lsx_vldx(src, src_stride); + src += src_stride_x2; + + DUP2_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src7, src8); + hz_out7 = HORIZ_6TAP_FILT(src7, src8, mask0, mask1, mask2, filt_hz0, + filt_hz1, filt_hz2); + hz_out6 = __lsx_vshuf_b(hz_out7, hz_out5, shuff); + + out2 = __lsx_vpackev_b(hz_out5, hz_out4); + tmp0 = DPADD_H3(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2); + + out3 = __lsx_vpackev_b(hz_out7, hz_out6); + tmp1 = DPADD_H3(out1, out2, out3, filt_vt0, filt_vt1, filt_vt2); + + tmp0 = __lsx_vssrarni_b_h(tmp1, tmp0, 7); + tmp0 = __lsx_vxori_b(tmp0, 128); + __lsx_vstelm_w(tmp0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(tmp0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(tmp0, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(tmp0, dst, 0, 3); + dst += dst_stride; + + hz_out3 = hz_out7; + out0 = out2; + out1 = out3; + } +} + static void common_hv_6ht_6vt_8w_lsx(uint8_t *RESTRICT src, int32_t src_stride, uint8_t *RESTRICT dst, int32_t dst_stride, const int8_t *filter_horiz, @@ -506,6 +783,102 @@ static void common_hv_6ht_6vt_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride, filter_horiz, filter_vert, height); } +static void common_hz_4t_4x4_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter) { + __m128i src0, src1, src2, src3, filt0, filt1, mask0, mask1; + __m128i out0, out1; + int32_t src_stride_x2 = src_stride << 1; + int32_t src_stride_x3 = src_stride_x2 + src_stride; + + mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 16); + src -= 1; + + DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1); + mask1 = __lsx_vaddi_bu(mask0, 2); + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src1, src2); + src3 = __lsx_vldx(src, src_stride_x3); + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0, filt1, + out0, out1); + + out0 = __lsx_vssrarni_b_h(out1, out0, VP8_FILTER_SHIFT); + out0 = __lsx_vxori_b(out0, 128); + + __lsx_vstelm_w(out0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 3); +} + +static void common_hz_4t_4x8_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter) { + __m128i src0, src1, src2, src3, filt0, filt1, mask0, mask1; + __m128i out0, out1, out2, out3; + int32_t src_stride_x2 = src_stride << 1; + int32_t src_stride_x3 = src_stride_x2 + src_stride; + int32_t src_stride_x4 = src_stride << 2; + + mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 16); + src -= 1; + + DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1); + mask1 = __lsx_vaddi_bu(mask0, 2); + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src1, src2); + src3 = __lsx_vldx(src, src_stride_x3); + src += src_stride_x4; + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0, filt1, + out0, out1); + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src1, src2); + src3 = __lsx_vldx(src, src_stride_x3); + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0, filt1, + out2, out3); + DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, VP8_FILTER_SHIFT, out3, out2, + VP8_FILTER_SHIFT, out0, out1); + DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); + __lsx_vstelm_w(out0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 3); + dst += dst_stride; + + __lsx_vstelm_w(out1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(out1, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(out1, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(out1, dst, 0, 3); +} + +static void common_hz_4t_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + if (height == 4) { + common_hz_4t_4x4_lsx(src, src_stride, dst, dst_stride, filter); + } else if (height == 8) { + common_hz_4t_4x8_lsx(src, src_stride, dst, dst_stride, filter); + } +} + static void common_hz_4t_8w_lsx(uint8_t *RESTRICT src, int32_t src_stride, uint8_t *RESTRICT dst, int32_t dst_stride, const int8_t *filter, int32_t height) { @@ -597,6 +970,55 @@ static void common_hz_4t_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride, } } +static void common_vt_4t_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + uint32_t loop_cnt; + __m128i src0, src1, src2, src3, src4, src5; + __m128i src10_r, src32_r, src54_r, src21_r, src43_r, src65_r; + __m128i src2110, src4332, filt0, filt1, out0, out1; + int32_t src_stride_x2 = src_stride << 1; + int32_t src_stride_x3 = src_stride_x2 + src_stride; + + DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1); + DUP2_ARG2(__lsx_vldx, src, -src_stride, src, src_stride, src0, src2); + src1 = __lsx_vld(src, 0); + src += src_stride_x2; + + DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src10_r, src21_r); + + src2110 = __lsx_vilvl_d(src21_r, src10_r); + src2110 = __lsx_vxori_b(src2110, 128); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + src3 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src4, src5); + src += src_stride_x3; + DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, src32_r, src43_r); + src4332 = __lsx_vilvl_d(src43_r, src32_r); + src4332 = __lsx_vxori_b(src4332, 128); + out0 = FILT_4TAP_DPADD_H(src2110, src4332, filt0, filt1); + + src2 = __lsx_vld(src, 0); + src += src_stride; + DUP2_ARG2(__lsx_vilvl_b, src5, src4, src2, src5, src54_r, src65_r); + src2110 = __lsx_vilvl_d(src65_r, src54_r); + src2110 = __lsx_vxori_b(src2110, 128); + out1 = FILT_4TAP_DPADD_H(src4332, src2110, filt0, filt1); + out0 = __lsx_vssrarni_b_h(out1, out0, VP8_FILTER_SHIFT); + out0 = __lsx_vxori_b(out0, 128); + + __lsx_vstelm_w(out0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 3); + dst += dst_stride; + } +} + static void common_vt_4t_8w_lsx(uint8_t *RESTRICT src, int32_t src_stride, uint8_t *RESTRICT dst, int32_t dst_stride, const int8_t *filter, int32_t height) { @@ -719,6 +1141,74 @@ static void common_vt_4t_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride, } } +static void common_hv_4ht_4vt_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter_horiz, + const int8_t *filter_vert, + int32_t height) { + uint32_t loop_cnt; + __m128i src0, src1, src2, src3, src4, src5, src6, filt_hz0, filt_hz1; + __m128i mask0, mask1, filt_vt0, filt_vt1, tmp0, tmp1, vec0, vec1, vec2; + __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5; + __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; + int32_t src_stride_x2 = src_stride << 1; + int32_t src_stride_x3 = src_stride_x2 + src_stride; + int32_t src_stride_x4 = src_stride << 2; + + mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 16); + src -= 1; + + DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filt_hz0, + filt_hz1); + mask1 = __lsx_vaddi_bu(mask0, 2); + + src1 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, -src_stride, src, src_stride, src0, src2); + src += src_stride_x2; + + DUP2_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src0, src1); + src2 = __lsx_vxori_b(src2, 128); + hz_out0 = HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_hz0, filt_hz1); + hz_out1 = HORIZ_4TAP_FILT(src1, src2, mask0, mask1, filt_hz0, filt_hz1); + vec0 = __lsx_vpackev_b(hz_out1, hz_out0); + + DUP2_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filt_vt0, + filt_vt1); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + src3 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src4, src5); + src6 = __lsx_vldx(src, src_stride_x3); + src += src_stride_x4; + + DUP2_ARG2(__lsx_vxori_b, src3, 128, src4, 128, src3, src4); + hz_out3 = HORIZ_4TAP_FILT(src3, src4, mask0, mask1, filt_hz0, filt_hz1); + hz_out2 = __lsx_vshuf_b(hz_out3, hz_out1, shuff); + vec1 = __lsx_vpackev_b(hz_out3, hz_out2); + tmp0 = FILT_4TAP_DPADD_H(vec0, vec1, filt_vt0, filt_vt1); + + DUP2_ARG2(__lsx_vxori_b, src5, 128, src6, 128, src5, src6); + hz_out5 = HORIZ_4TAP_FILT(src5, src6, mask0, mask1, filt_hz0, filt_hz1); + hz_out4 = __lsx_vshuf_b(hz_out5, hz_out3, shuff); + vec2 = __lsx_vpackev_b(hz_out5, hz_out4); + tmp1 = FILT_4TAP_DPADD_H(vec1, vec2, filt_vt0, filt_vt1); + + tmp0 = __lsx_vssrarni_b_h(tmp1, tmp0, 7); + tmp0 = __lsx_vxori_b(tmp0, 128); + __lsx_vstelm_w(tmp0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(tmp0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(tmp0, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(tmp0, dst, 0, 3); + dst += dst_stride; + + hz_out1 = hz_out5; + vec0 = vec2; + } +} + static inline void common_hv_4ht_4vt_8w_lsx( uint8_t *RESTRICT src, int32_t src_stride, uint8_t *RESTRICT dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert, @@ -804,6 +1294,82 @@ static void common_hv_4ht_4vt_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride, filter_horiz, filter_vert, height); } +static void common_hv_6ht_4vt_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter_horiz, + const int8_t *filter_vert, + int32_t height) { + uint32_t loop_cnt; + __m128i src0, src1, src2, src3, src4, src5, src6; + __m128i filt_hz0, filt_hz1, filt_hz2, mask0, mask1, mask2; + __m128i filt_vt0, filt_vt1, tmp0, tmp1, vec0, vec1, vec2; + __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5; + __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; + int32_t src_stride_x2 = src_stride << 1; + int32_t src_stride_x3 = src_stride_x2 + src_stride; + int32_t src_stride_x4 = src_stride << 2; + + mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 16); + src -= 2; + + DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filt_hz0, + filt_hz1); + filt_hz2 = __lsx_vldrepl_h(filter_horiz, 4); + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + + src1 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, -src_stride, src, src_stride, src0, src2); + src += src_stride_x2; + + DUP2_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src0, src1); + src2 = __lsx_vxori_b(src2, 128); + + hz_out0 = HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2, filt_hz0, filt_hz1, + filt_hz2); + hz_out1 = HORIZ_6TAP_FILT(src1, src2, mask0, mask1, mask2, filt_hz0, filt_hz1, + filt_hz2); + vec0 = __lsx_vpackev_b(hz_out1, hz_out0); + + DUP2_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filt_vt0, + filt_vt1); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + src3 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src4, src5); + src6 = __lsx_vldx(src, src_stride_x3); + src += src_stride_x4; + DUP4_ARG2(__lsx_vxori_b, src3, 128, src4, 128, src5, 128, src6, 128, src3, + src4, src5, src6); + + hz_out3 = HORIZ_6TAP_FILT(src3, src4, mask0, mask1, mask2, filt_hz0, + filt_hz1, filt_hz2); + hz_out2 = __lsx_vshuf_b(hz_out3, hz_out1, shuff); + vec1 = __lsx_vpackev_b(hz_out3, hz_out2); + tmp0 = FILT_4TAP_DPADD_H(vec0, vec1, filt_vt0, filt_vt1); + + hz_out5 = HORIZ_6TAP_FILT(src5, src6, mask0, mask1, mask2, filt_hz0, + filt_hz1, filt_hz2); + hz_out4 = __lsx_vshuf_b(hz_out5, hz_out3, shuff); + vec2 = __lsx_vpackev_b(hz_out5, hz_out4); + tmp1 = FILT_4TAP_DPADD_H(vec1, vec2, filt_vt0, filt_vt1); + + DUP2_ARG3(__lsx_vssrarni_b_h, tmp0, tmp0, 7, tmp1, tmp1, 7, tmp0, tmp1); + DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); + + __lsx_vstelm_w(tmp0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(tmp0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(tmp1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(tmp1, dst, 0, 1); + dst += dst_stride; + + hz_out1 = hz_out5; + vec0 = vec2; + } +} + static inline void common_hv_6ht_4vt_8w_lsx( uint8_t *RESTRICT src, int32_t src_stride, uint8_t *RESTRICT dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert, @@ -895,6 +1461,82 @@ static void common_hv_6ht_4vt_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride, filter_horiz, filter_vert, height); } +static void common_hv_4ht_6vt_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter_horiz, + const int8_t *filter_vert, + int32_t height) { + uint32_t loop_cnt; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8; + __m128i filt_hz0, filt_hz1, filt_vt0, filt_vt1, filt_vt2, mask0, mask1; + __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; + __m128i hz_out7, tmp0, tmp1, out0, out1, out2, out3; + __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; + int32_t src_stride_x2 = src_stride << 1; + int32_t src_stride_x3 = src_stride_x2 + src_stride; + int32_t src_stride_x4 = src_stride << 2; + + mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 16); + + src -= 1; + + DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filt_hz0, + filt_hz1); + mask1 = __lsx_vaddi_bu(mask0, 2); + + DUP4_ARG2(__lsx_vldx, src, -src_stride_x2, src, -src_stride, src, src_stride, + src, src_stride_x2, src0, src1, src3, src4); + src2 = __lsx_vld(src, 0); + src += src_stride_x3; + + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + src4 = __lsx_vxori_b(src4, 128); + hz_out0 = HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_hz0, filt_hz1); + hz_out2 = HORIZ_4TAP_FILT(src2, src3, mask0, mask1, filt_hz0, filt_hz1); + hz_out3 = HORIZ_4TAP_FILT(src3, src4, mask0, mask1, filt_hz0, filt_hz1); + hz_out1 = __lsx_vshuf_b(hz_out2, hz_out0, shuff); + DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, out0, out1); + + DUP2_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filt_vt0, + filt_vt1); + filt_vt2 = __lsx_vldrepl_h(filter_vert, 4); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + src5 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src6, src7); + src8 = __lsx_vldx(src, src_stride_x3); + DUP4_ARG2(__lsx_vxori_b, src5, 128, src6, 128, src7, 128, src8, 128, src5, + src6, src7, src8); + src += src_stride_x4; + + hz_out5 = HORIZ_4TAP_FILT(src5, src6, mask0, mask1, filt_hz0, filt_hz1); + hz_out4 = __lsx_vshuf_b(hz_out5, hz_out3, shuff); + out2 = __lsx_vpackev_b(hz_out5, hz_out4); + tmp0 = DPADD_H3(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2); + + hz_out7 = HORIZ_4TAP_FILT(src7, src8, mask0, mask1, filt_hz0, filt_hz1); + hz_out6 = __lsx_vshuf_b(hz_out7, hz_out5, shuff); + out3 = __lsx_vpackev_b(hz_out7, hz_out6); + tmp1 = DPADD_H3(out1, out2, out3, filt_vt0, filt_vt1, filt_vt2); + + tmp0 = __lsx_vssrarni_b_h(tmp1, tmp0, 7); + tmp0 = __lsx_vxori_b(tmp0, 128); + __lsx_vstelm_w(tmp0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(tmp0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(tmp0, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(tmp0, dst, 0, 3); + dst += dst_stride; + + hz_out3 = hz_out7; + out0 = out2; + out1 = out3; + } +} + static inline void common_hv_4ht_6vt_8w_lsx( uint8_t *RESTRICT src, int32_t src_stride, uint8_t *RESTRICT dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert, @@ -1000,6 +1642,103 @@ typedef void (*PVp8SixtapPredictFunc2)(uint8_t *RESTRICT src, int32_t dst_stride, const int8_t *filter, int32_t height); +void vp8_sixtap_predict4x4_lsx(uint8_t *RESTRICT src, int32_t src_stride, + int32_t xoffset, int32_t yoffset, + uint8_t *RESTRICT dst, int32_t dst_stride) { + const int8_t *h_filter = vp8_subpel_filters_lsx[xoffset - 1]; + const int8_t *v_filter = vp8_subpel_filters_lsx[yoffset - 1]; + + static PVp8SixtapPredictFunc1 Predict4x4Funcs1[4] = { + common_hv_6ht_6vt_4w_lsx, + common_hv_6ht_4vt_4w_lsx, + common_hv_4ht_6vt_4w_lsx, + common_hv_4ht_4vt_4w_lsx, + }; + + static PVp8SixtapPredictFunc2 Predict4x4Funcs2[4] = { common_vt_6t_4w_lsx, + common_vt_4t_4w_lsx, + common_hz_6t_4w_lsx, + common_hz_4t_4w_lsx }; + if (yoffset < 8 && xoffset < 8) { + if (yoffset) { + if (xoffset) { + switch (xoffset & 1) { + case 0: + switch (yoffset & 1) { + case 0: + Predict4x4Funcs1[0](src, src_stride, dst, dst_stride, h_filter, + v_filter, 4); + break; + case 1: + Predict4x4Funcs1[1](src, src_stride, dst, dst_stride, h_filter, + v_filter + 1, 4); + break; + } + break; + + case 1: + switch (yoffset & 1) { + case 0: + Predict4x4Funcs1[2](src, src_stride, dst, dst_stride, + h_filter + 1, v_filter, 4); + break; + + case 1: + Predict4x4Funcs1[3](src, src_stride, dst, dst_stride, + h_filter + 1, v_filter + 1, 4); + break; + } + break; + } + } else { + switch (yoffset & 1) { + case 0: + Predict4x4Funcs2[0](src, src_stride, dst, dst_stride, v_filter, 4); + break; + + case 1: + Predict4x4Funcs2[1](src, src_stride, dst, dst_stride, v_filter + 1, + 4); + break; + } + } + } else { + switch (xoffset) { + case 0: { + __m128i tp0; + tp0 = __lsx_vinsgr2vr_w(tp0, src, 0); + src += src_stride; + tp0 = __lsx_vinsgr2vr_w(tp0, src, 0); + src += src_stride; + tp0 = __lsx_vinsgr2vr_w(tp0, src, 0); + src += src_stride; + tp0 = __lsx_vinsgr2vr_w(tp0, src, 0); + + __lsx_vstelm_w(tp0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(tp0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(tp0, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(tp0, dst, 0, 3); + break; + } + case 2: + case 4: + case 6: + Predict4x4Funcs2[2](src, src_stride, dst, dst_stride, h_filter, 4); + break; + } + switch (xoffset & 1) { + case 1: + Predict4x4Funcs2[3](src, src_stride, dst, dst_stride, h_filter + 1, + 4); + break; + } + } + } +} + void vp8_sixtap_predict8x8_lsx(uint8_t *RESTRICT src, int32_t src_stride, int32_t xoffset, int32_t yoffset, uint8_t *RESTRICT dst, int32_t dst_stride) { diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl index 7bc866faaa..739a612847 100644 --- a/vp8/common/rtcd_defs.pl +++ b/vp8/common/rtcd_defs.pl @@ -155,7 +155,7 @@ () specialize qw/vp8_sixtap_predict8x4 sse2 ssse3 neon dspr2 msa mmi/; add_proto qw/void vp8_sixtap_predict4x4/, "unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch"; -specialize qw/vp8_sixtap_predict4x4 mmx ssse3 neon dspr2 msa mmi/; +specialize qw/vp8_sixtap_predict4x4 mmx ssse3 neon dspr2 msa mmi lsx/; add_proto qw/void vp8_bilinear_predict16x16/, "unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch"; specialize qw/vp8_bilinear_predict16x16 sse2 ssse3 neon msa/; From 17959f9c94be43de57972052ebfdc40870170b0e Mon Sep 17 00:00:00 2001 From: yuanhecai Date: Fri, 6 May 2022 12:17:39 +0800 Subject: [PATCH 0374/1000] vp9[loongarch]: Optimize vpx_quantize_b/b_32x32 1. vpx_quantize_b_lsx 2. vpx_quantize_b_32x32_lsx Bug: webm:1755 Change-Id: I476c8677a2c2aed7248e088e62c3777c9bed2adb --- test/vp9_quantize_test.cc | 10 ++ vpx_dsp/loongarch/quantize_lsx.c | 249 +++++++++++++++++++++++++++++++ vpx_dsp/vpx_dsp.mk | 1 + vpx_dsp/vpx_dsp_rtcd_defs.pl | 4 +- 4 files changed, 262 insertions(+), 2 deletions(-) create mode 100644 vpx_dsp/loongarch/quantize_lsx.c diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc index d54f1bc9cd..ca1062a76f 100644 --- a/test/vp9_quantize_test.cc +++ b/test/vp9_quantize_test.cc @@ -568,6 +568,16 @@ INSTANTIATE_TEST_SUITE_P( VPX_BITS_8, 32, true))); #endif // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH +#if HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_SUITE_P(LSX, VP9QuantizeTest, + ::testing::Values(make_tuple(&vpx_quantize_b_lsx, + &vpx_quantize_b_c, + VPX_BITS_8, 16, false), + make_tuple(&vpx_quantize_b_32x32_lsx, + &vpx_quantize_b_32x32_c, + VPX_BITS_8, 32, false))); +#endif // HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH + // Only useful to compare "Speed" test results. INSTANTIATE_TEST_SUITE_P( DISABLED_C, VP9QuantizeTest, diff --git a/vpx_dsp/loongarch/quantize_lsx.c b/vpx_dsp/loongarch/quantize_lsx.c new file mode 100644 index 0000000000..e3fbb9e9e0 --- /dev/null +++ b/vpx_dsp/loongarch/quantize_lsx.c @@ -0,0 +1,249 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_util/loongson_intrinsics.h" + +#define CALCULATE_QCOEFF(coeff, coeff_abs, round, quant, shift, cmp_mask) \ + ({ \ + __m128i rounded, qcoeff; \ + \ + rounded = __lsx_vsadd_h(coeff_abs, round); \ + qcoeff = __lsx_vmuh_h(rounded, quant); \ + qcoeff = __lsx_vadd_h(rounded, qcoeff); \ + qcoeff = __lsx_vmuh_h(qcoeff, shift); \ + qcoeff = __lsx_vsigncov_h(coeff, qcoeff); \ + qcoeff = __lsx_vand_v(qcoeff, cmp_mask); \ + \ + qcoeff; \ + }) + +#define CALCULATE_DQCOEFF_AND_STORE(qcoeff, dequant, dqcoeff) \ + { \ + __m128i dqcoeff16 = __lsx_vmul_h(qcoeff, dequant); \ + __lsx_vst(dqcoeff16, dqcoeff, 0); \ + } + +#define CALCULATE_DQCOEFF_AND_STORE_32x32(qcoeff, dequant, dqcoeff) \ + { \ + __m128i low, high, dqcoeff32_0, dqcoeff32_1, res; \ + __m128i zero = __lsx_vldi(0); \ + __m128i coeff = __lsx_vabsd_h(qcoeff, zero); \ + \ + __m128i sign_0 = __lsx_vilvl_h(qcoeff, zero); \ + __m128i sign_1 = __lsx_vilvh_h(qcoeff, zero); \ + \ + low = __lsx_vmul_h(coeff, dequant); \ + high = __lsx_vmuh_h(coeff, dequant); \ + dqcoeff32_0 = __lsx_vilvl_h(high, low); \ + dqcoeff32_1 = __lsx_vilvh_h(high, low); \ + \ + dqcoeff32_0 = __lsx_vsrai_w(dqcoeff32_0, 1); \ + dqcoeff32_1 = __lsx_vsrai_w(dqcoeff32_1, 1); \ + dqcoeff32_0 = __lsx_vsigncov_w(sign_0, dqcoeff32_0); \ + dqcoeff32_1 = __lsx_vsigncov_w(sign_1, dqcoeff32_1); \ + res = __lsx_vpickev_h(dqcoeff32_1, dqcoeff32_0); \ + __lsx_vst(res, dqcoeff, 0); \ + } + +#define SCAN_FOR_EOB(coeff0, coeff1, zbin_mask0, zbin_mask1, scan, index, \ + zero) \ + ({ \ + __m128i zero_coeff0 = __lsx_vseq_h(coeff0, zero); \ + __m128i zero_coeff1 = __lsx_vseq_h(coeff1, zero); \ + __m128i scan0 = __lsx_vld(scan + index, 0); \ + __m128i scan1 = __lsx_vld(scan + index + 8, 0); \ + __m128i eob0, eob1, eob_max; \ + \ + scan0 = __lsx_vsub_h(scan0, zbin_mask0); \ + scan1 = __lsx_vsub_h(scan1, zbin_mask1); \ + eob0 = __lsx_vandn_v(zero_coeff0, scan0); \ + eob1 = __lsx_vandn_v(zero_coeff1, scan1); \ + eob_max = __lsx_vmax_h(eob0, eob1); \ + eob_max; \ + }) + +#define ACCUMULATE_EOB(eob) \ + ({ \ + __m128i eob_shuffled; \ + int16_t res_m; \ + \ + eob_shuffled = __lsx_vshuf4i_w(eob, 0xe); \ + eob = __lsx_vmax_h(eob, eob_shuffled); \ + eob_shuffled = __lsx_vshuf4i_h(eob, 0xe); \ + eob = __lsx_vmax_h(eob, eob_shuffled); \ + eob_shuffled = __lsx_vshuf4i_h(eob, 0x1); \ + eob = __lsx_vmax_h(eob, eob_shuffled); \ + res_m = __lsx_vpickve2gr_h(eob, 1); \ + res_m; \ + }) + +#if !CONFIG_VP9_HIGHBITDEPTH +void vpx_quantize_b_lsx(const int16_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, + int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const int16_t *scan, + const int16_t *iscan) { + __m128i zero = __lsx_vldi(0); + int index = 16; + + __m128i zbin, round, quant, dequant, quant_shift; + __m128i coeff0, coeff1; + __m128i qcoeff0, qcoeff1; + __m128i cmp_mask0, cmp_mask1; + __m128i eob, eob0; + + (void)scan; + + zbin = __lsx_vld(zbin_ptr, 0); + round = __lsx_vld(round_ptr, 0); + quant = __lsx_vld(quant_ptr, 0); + dequant = __lsx_vld(dequant_ptr, 0); + quant_shift = __lsx_vld(quant_shift_ptr, 0); + // Handle one DC and first 15 AC. + DUP2_ARG2(__lsx_vld, coeff_ptr, 0, coeff_ptr, 16, coeff0, coeff1); + qcoeff0 = __lsx_vabsd_h(coeff0, zero); + qcoeff1 = __lsx_vabsd_h(coeff1, zero); + + cmp_mask0 = __lsx_vsle_h(zbin, qcoeff0); + zbin = __lsx_vilvh_d(zbin, zbin); + cmp_mask1 = __lsx_vsle_h(zbin, qcoeff1); + + qcoeff0 = + CALCULATE_QCOEFF(coeff0, qcoeff0, round, quant, quant_shift, cmp_mask0); + round = __lsx_vilvh_d(round, round); + quant = __lsx_vilvh_d(quant, quant); + quant_shift = __lsx_vilvh_d(quant_shift, quant_shift); + qcoeff1 = + CALCULATE_QCOEFF(coeff1, qcoeff1, round, quant, quant_shift, cmp_mask1); + + __lsx_vst(qcoeff0, qcoeff_ptr, 0); + __lsx_vst(qcoeff1, qcoeff_ptr, 16); + + CALCULATE_DQCOEFF_AND_STORE(qcoeff0, dequant, dqcoeff_ptr); + dequant = __lsx_vilvh_d(dequant, dequant); + CALCULATE_DQCOEFF_AND_STORE(qcoeff1, dequant, dqcoeff_ptr + 8); + + eob = SCAN_FOR_EOB(qcoeff0, qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); + // AC only loop. + while (index < n_coeffs) { + coeff0 = __lsx_vld(coeff_ptr + index, 0); + coeff1 = __lsx_vld(coeff_ptr + index + 8, 0); + + qcoeff0 = __lsx_vabsd_h(coeff0, zero); + qcoeff1 = __lsx_vabsd_h(coeff1, zero); + + cmp_mask0 = __lsx_vsle_h(zbin, qcoeff0); + cmp_mask1 = __lsx_vsle_h(zbin, qcoeff1); + + qcoeff0 = + CALCULATE_QCOEFF(coeff0, qcoeff0, round, quant, quant_shift, cmp_mask0); + qcoeff1 = + CALCULATE_QCOEFF(coeff1, qcoeff1, round, quant, quant_shift, cmp_mask1); + + __lsx_vst(qcoeff0, qcoeff_ptr + index, 0); + __lsx_vst(qcoeff1, qcoeff_ptr + index + 8, 0); + + CALCULATE_DQCOEFF_AND_STORE(qcoeff0, dequant, dqcoeff_ptr + index); + CALCULATE_DQCOEFF_AND_STORE(qcoeff1, dequant, dqcoeff_ptr + index + 8); + + eob0 = SCAN_FOR_EOB(qcoeff0, qcoeff1, cmp_mask0, cmp_mask1, iscan, index, + zero); + eob = __lsx_vmax_h(eob, eob0); + + index += 16; + } + + *eob_ptr = ACCUMULATE_EOB(eob); +} + +void vpx_quantize_b_32x32_lsx(const int16_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + __m128i zero = __lsx_vldi(0); + int index; + + __m128i zbin, round, quant, dequant, quant_shift; + __m128i coeff0, coeff1, qcoeff0, qcoeff1, cmp_mask0, cmp_mask1; + __m128i eob = zero, eob0; + + (void)scan; + (void)n_coeffs; + + zbin = __lsx_vld(zbin_ptr, 0); + zbin = __lsx_vsrari_h(zbin, 1); + round = __lsx_vld(round_ptr, 0); + round = __lsx_vsrari_h(round, 1); + + quant = __lsx_vld(quant_ptr, 0); + dequant = __lsx_vld(dequant_ptr, 0); + quant_shift = __lsx_vld(quant_shift_ptr, 0); + quant_shift = __lsx_vslli_h(quant_shift, 1); + // Handle one DC and first 15 AC. + DUP2_ARG2(__lsx_vld, coeff_ptr, 0, coeff_ptr, 16, coeff0, coeff1); + qcoeff0 = __lsx_vabsd_h(coeff0, zero); + qcoeff1 = __lsx_vabsd_h(coeff1, zero); + + cmp_mask0 = __lsx_vsle_h(zbin, qcoeff0); + // remove DC from zbin + zbin = __lsx_vilvh_d(zbin, zbin); + cmp_mask1 = __lsx_vsle_h(zbin, qcoeff1); + + qcoeff0 = + CALCULATE_QCOEFF(coeff0, qcoeff0, round, quant, quant_shift, cmp_mask0); + // remove DC in quant_shift, quant, quant_shift + round = __lsx_vilvh_d(round, round); + quant = __lsx_vilvh_d(quant, quant); + quant_shift = __lsx_vilvh_d(quant_shift, quant_shift); + qcoeff1 = + CALCULATE_QCOEFF(coeff1, qcoeff1, round, quant, quant_shift, cmp_mask1); + __lsx_vst(qcoeff0, qcoeff_ptr, 0); + __lsx_vst(qcoeff1, qcoeff_ptr, 16); + + CALCULATE_DQCOEFF_AND_STORE_32x32(qcoeff0, dequant, dqcoeff_ptr); + dequant = __lsx_vilvh_d(dequant, dequant); + CALCULATE_DQCOEFF_AND_STORE_32x32(qcoeff1, dequant, dqcoeff_ptr + 8); + eob = SCAN_FOR_EOB(qcoeff0, qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); + // AC only loop. + for (index = 16; index < 32 * 32; index += 16) { + coeff0 = __lsx_vld(coeff_ptr + index, 0); + coeff1 = __lsx_vld(coeff_ptr + index + 8, 0); + + qcoeff0 = __lsx_vabsd_h(coeff0, zero); + qcoeff1 = __lsx_vabsd_h(coeff1, zero); + + cmp_mask0 = __lsx_vsle_h(zbin, qcoeff0); + cmp_mask1 = __lsx_vsle_h(zbin, qcoeff1); + + qcoeff0 = + CALCULATE_QCOEFF(coeff0, qcoeff0, round, quant, quant_shift, cmp_mask0); + qcoeff1 = + CALCULATE_QCOEFF(coeff1, qcoeff1, round, quant, quant_shift, cmp_mask1); + __lsx_vst(qcoeff0, qcoeff_ptr + index, 0); + __lsx_vst(qcoeff1, qcoeff_ptr + index + 8, 0); + + CALCULATE_DQCOEFF_AND_STORE_32x32(qcoeff0, dequant, dqcoeff_ptr + index); + CALCULATE_DQCOEFF_AND_STORE_32x32(qcoeff1, dequant, + dqcoeff_ptr + 8 + index); + eob0 = SCAN_FOR_EOB(qcoeff0, qcoeff1, cmp_mask0, cmp_mask1, iscan, index, + zero); + eob = __lsx_vmax_h(eob, eob0); + } + + *eob_ptr = ACCUMULATE_EOB(eob); +} +#endif // !CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index 4f5a7a1908..13999af04d 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -328,6 +328,7 @@ DSP_SRCS-$(HAVE_SSSE3) += x86/quantize_ssse3.h DSP_SRCS-$(HAVE_AVX) += x86/quantize_avx.c DSP_SRCS-$(HAVE_NEON) += arm/quantize_neon.c DSP_SRCS-$(HAVE_VSX) += ppc/quantize_vsx.c +DSP_SRCS-$(HAVE_LSX) += loongarch/quantize_lsx.c ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_SSE2) += x86/highbd_quantize_intrin_sse2.c endif diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index e82b487f13..d3c668f9ae 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -711,10 +711,10 @@ () # if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") { add_proto qw/void vpx_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; - specialize qw/vpx_quantize_b neon sse2 ssse3 avx vsx/; + specialize qw/vpx_quantize_b neon sse2 ssse3 avx vsx lsx/; add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; - specialize qw/vpx_quantize_b_32x32 neon ssse3 avx vsx/; + specialize qw/vpx_quantize_b_32x32 neon ssse3 avx vsx lsx/; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; From 63378a94f996304e2784ecd6584e70cf487991e9 Mon Sep 17 00:00:00 2001 From: yuanhecai Date: Mon, 9 May 2022 14:39:05 +0800 Subject: [PATCH 0375/1000] loongarch: Reduce the number of instructions Replace some redundant instructions to improve the efficiency of the program. 1. txfm_macros_lsx.h 2. vpx_convolve8_avg_lsx.c 3. vpx_convolve8_horiz_lsx.c 4. vpx_convolve8_lsx.c 5. vpx_convolve8_vert_lsx.c 6. vpx_convolve_copy_lsx.c 7. vpx_convolve_lsx.h Bug: webm:1755 Change-Id: I9b7fdf6900338a26f9b1775609ad387648684f3d --- vpx_dsp/loongarch/sad_lsx.c | 2 +- vpx_dsp/loongarch/txfm_macros_lsx.h | 53 ++--- vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c | 49 ++-- vpx_dsp/loongarch/vpx_convolve8_horiz_lsx.c | 161 ++++++------- vpx_dsp/loongarch/vpx_convolve8_lsx.c | 110 ++++----- vpx_dsp/loongarch/vpx_convolve8_vert_lsx.c | 243 ++++++++------------ vpx_dsp/loongarch/vpx_convolve_copy_lsx.c | 1 - vpx_dsp/loongarch/vpx_convolve_lsx.h | 25 +- 8 files changed, 271 insertions(+), 373 deletions(-) diff --git a/vpx_dsp/loongarch/sad_lsx.c b/vpx_dsp/loongarch/sad_lsx.c index 46ee557df5..5eaebfb518 100644 --- a/vpx_dsp/loongarch/sad_lsx.c +++ b/vpx_dsp/loongarch/sad_lsx.c @@ -198,7 +198,7 @@ static void sad_8width_x4d_lsx(const uint8_t *src_ptr, int32_t src_stride, int32_t ref_stride, int32_t height, uint32_t *sad_array) { int32_t ht_cnt = (height >> 2); - uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr; + const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr; __m128i src0, src1, src2, src3, sad_tmp; __m128i ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7; __m128i ref8, ref9, ref10, ref11, ref12, ref13, ref14, ref15; diff --git a/vpx_dsp/loongarch/txfm_macros_lsx.h b/vpx_dsp/loongarch/txfm_macros_lsx.h index 977f1c2dd0..bd514831bf 100644 --- a/vpx_dsp/loongarch/txfm_macros_lsx.h +++ b/vpx_dsp/loongarch/txfm_macros_lsx.h @@ -13,36 +13,29 @@ #include "vpx_util/loongson_intrinsics.h" -#define DOTP_CONST_PAIR(reg0, reg1, cnst0, cnst1, out0, out1) \ - { \ - __m128i s0_m, s1_m, s2_m, s3_m, s4_m, s5_m; \ - __m128i k0_m, k1_m, k2_m, k3_m; \ - __m128i zero = __lsx_vldi(0); \ - \ - k0_m = __lsx_vreplgr2vr_h(cnst0); \ - k1_m = __lsx_vreplgr2vr_h(cnst1); \ - k2_m = __lsx_vpackev_h(k1_m, k0_m); \ - k0_m = __lsx_vpackev_h(zero, k0_m); \ - k1_m = __lsx_vpackev_h(k1_m, zero); \ - \ - s5_m = __lsx_vilvl_h(reg1, reg0); \ - s4_m = __lsx_vilvh_h(reg1, reg0); \ - s3_m = __lsx_vilvl_h(reg0, reg1); \ - s2_m = __lsx_vilvh_h(reg0, reg1); \ - \ - s1_m = __lsx_vdp2_w_h(s5_m, k0_m); \ - s0_m = __lsx_vdp2_w_h(s4_m, k0_m); \ - k3_m = __lsx_vdp2_w_h(s5_m, k1_m); \ - s1_m = __lsx_vsub_w(s1_m, k3_m); \ - k3_m = __lsx_vdp2_w_h(s4_m, k1_m); \ - s0_m = __lsx_vsub_w(s0_m, k3_m); \ - \ - out0 = __lsx_vssrarni_h_w(s0_m, s1_m, DCT_CONST_BITS); \ - \ - s1_m = __lsx_vdp2_w_h(s3_m, k2_m); \ - s0_m = __lsx_vdp2_w_h(s2_m, k2_m); \ - out1 = __lsx_vssrarni_h_w(s0_m, s1_m, DCT_CONST_BITS); \ - } +#define DOTP_CONST_PAIR(reg0, reg1, cnst0, cnst1, out0, out1) \ + do { \ + __m128i s0_m, s1_m, s2_m, s3_m, s4_m, s5_m; \ + __m128i k0_m, k1_m, k2_m, k3_m; \ + \ + k0_m = __lsx_vreplgr2vr_h(cnst0); \ + k1_m = __lsx_vreplgr2vr_h(cnst1); \ + k2_m = __lsx_vpackev_h(k1_m, k0_m); \ + \ + DUP2_ARG2(__lsx_vilvl_h, reg1, reg0, reg0, reg1, s5_m, s3_m); \ + DUP2_ARG2(__lsx_vilvh_h, reg1, reg0, reg0, reg1, s4_m, s2_m); \ + \ + DUP2_ARG2(__lsx_vmulwev_w_h, s5_m, k0_m, s4_m, k0_m, s1_m, s0_m); \ + k3_m = __lsx_vmulwod_w_h(s5_m, k1_m); \ + s1_m = __lsx_vsub_w(s1_m, k3_m); \ + k3_m = __lsx_vmulwod_w_h(s4_m, k1_m); \ + s0_m = __lsx_vsub_w(s0_m, k3_m); \ + \ + out0 = __lsx_vssrarni_h_w(s0_m, s1_m, DCT_CONST_BITS); \ + \ + DUP2_ARG2(__lsx_vdp2_w_h, s3_m, k2_m, s2_m, k2_m, s1_m, s0_m); \ + out1 = __lsx_vssrarni_h_w(s0_m, s1_m, DCT_CONST_BITS); \ + } while (0) #define DOT_SHIFT_RIGHT_PCK_H(in0, in1, in2, in3) \ do { \ diff --git a/vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c b/vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c index 2b983552b6..54fcd6c571 100644 --- a/vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c +++ b/vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c @@ -278,7 +278,7 @@ static void common_hv_2ht_2vt_and_aver_dst_4x4_lsx( __m128i src0, src1, src2, src3, src4, mask; __m128i filt_hz, filt_vt, vec0, vec1; __m128i dst0, dst1, dst2, dst3; - __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1, out; + __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1; __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; int32_t src_stride2 = src_stride << 1; @@ -311,13 +311,12 @@ static void common_hv_2ht_2vt_and_aver_dst_4x4_lsx( dst1 = __lsx_vilvl_w(dst3, dst2); dst0 = __lsx_vilvl_d(dst1, dst0); DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); - DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1); - out = __lsx_vpickev_b(tmp1, tmp0); - out = __lsx_vavgr_bu(out, dst0); - __lsx_vstelm_w(out, dst, 0, 0); - __lsx_vstelm_w(out, dst + dst_stride, 0, 1); - __lsx_vstelm_w(out, dst + dst_stride2, 0, 2); - __lsx_vstelm_w(out, dst + dst_stride3, 0, 3); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst0); + __lsx_vstelm_w(tmp0, dst, 0, 0); + __lsx_vstelm_w(tmp0, dst + dst_stride, 0, 1); + __lsx_vstelm_w(tmp0, dst + dst_stride2, 0, 2); + __lsx_vstelm_w(tmp0, dst + dst_stride3, 0, 3); } static void common_hv_2ht_2vt_and_aver_dst_4x8_lsx( @@ -386,9 +385,8 @@ static void common_hv_2ht_2vt_and_aver_dst_4x8_lsx( hz_out4, hz_out7, hz_out6, vec0, vec1, vec2, vec3); DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, vec2, filt_vt, vec3, filt_vt, tmp0, tmp1, tmp2, tmp3); - DUP4_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp2, - FILTER_BITS, tmp3, FILTER_BITS, tmp0, tmp1, tmp2, tmp3); - DUP2_ARG2(__lsx_vpickev_b, tmp1, tmp0, tmp3, tmp2, res0, res1); + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2, + FILTER_BITS, res0, res1); DUP2_ARG2(__lsx_vavgr_bu, res0, dst0, res1, dst1, res0, res1); __lsx_vstelm_w(res0, dst, 0, 0); @@ -467,10 +465,9 @@ static void common_hv_2ht_2vt_and_aver_dst_8x4_lsx( hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); vec3 = __lsx_vpackev_b(hz_out0, hz_out1); tmp3 = __lsx_vdp2_h_bu(vec3, filt_vt); - - DUP4_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp2, - FILTER_BITS, tmp3, FILTER_BITS, tmp0, tmp1, tmp2, tmp3); - PCKEV_AVG_ST4_D(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride); + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2, + FILTER_BITS, tmp0, tmp1); + AVG_ST4_D(tmp0, tmp1, dst0, dst1, dst, dst_stride); } static void common_hv_2ht_2vt_and_aver_dst_8x8mult_lsx( @@ -513,8 +510,6 @@ static void common_hv_2ht_2vt_and_aver_dst_8x8mult_lsx( vec0 = __lsx_vpackev_b(hz_out0, hz_out1); tmp1 = __lsx_vdp2_h_bu(vec0, filt_vt); - DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1); - hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); vec0 = __lsx_vpackev_b(hz_out1, hz_out0); tmp2 = __lsx_vdp2_h_bu(vec0, filt_vt); @@ -522,8 +517,8 @@ static void common_hv_2ht_2vt_and_aver_dst_8x8mult_lsx( hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); vec0 = __lsx_vpackev_b(hz_out0, hz_out1); tmp3 = __lsx_vdp2_h_bu(vec0, filt_vt); - - DUP2_ARG2(__lsx_vsrari_h, tmp2, FILTER_BITS, tmp3, FILTER_BITS, tmp2, tmp3); + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2, + FILTER_BITS, tmp0, tmp1); dst0 = __lsx_vldrepl_d(dst_tmp, 0); dst_tmp += dst_stride; @@ -534,7 +529,7 @@ static void common_hv_2ht_2vt_and_aver_dst_8x8mult_lsx( dst3 = __lsx_vldrepl_d(dst_tmp, 0); dst_tmp += dst_stride; DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1); - PCKEV_AVG_ST4_D(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride); + AVG_ST4_D(tmp0, tmp1, dst0, dst1, dst, dst_stride); dst += dst_stride; } } @@ -597,8 +592,7 @@ static void common_hv_2ht_2vt_and_aver_dst_16w_lsx( hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); - DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1); - tmp3 = __lsx_vpickev_b(tmp1, tmp0); + tmp3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); tmp3 = __lsx_vavgr_bu(tmp3, dst0); __lsx_vst(tmp3, dst, 0); @@ -606,8 +600,7 @@ static void common_hv_2ht_2vt_and_aver_dst_16w_lsx( hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); - DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1); - tmp3 = __lsx_vpickev_b(tmp1, tmp0); + tmp3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); tmp3 = __lsx_vavgr_bu(tmp3, dst1); __lsx_vstx(tmp3, dst, dst_stride); @@ -615,8 +608,7 @@ static void common_hv_2ht_2vt_and_aver_dst_16w_lsx( hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS); DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); - DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1); - tmp3 = __lsx_vpickev_b(tmp1, tmp0); + tmp3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); tmp3 = __lsx_vavgr_bu(tmp3, dst2); __lsx_vstx(tmp3, dst, dst_stride2); @@ -624,8 +616,7 @@ static void common_hv_2ht_2vt_and_aver_dst_16w_lsx( hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS); DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); - DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1); - tmp3 = __lsx_vpickev_b(tmp1, tmp0); + tmp3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); tmp3 = __lsx_vavgr_bu(tmp3, dst3); __lsx_vstx(tmp3, dst, dst_stride3); dst += dst_stride4; @@ -642,8 +633,6 @@ static void common_hv_2ht_2vt_and_aver_dst_32w_lsx( common_hv_2ht_2vt_and_aver_dst_16w_lsx(src, src_stride, dst, dst_stride, filter_horiz, filter_vert, height); - src += 16; - dst += 16; } static void common_hv_2ht_2vt_and_aver_dst_64w_lsx( diff --git a/vpx_dsp/loongarch/vpx_convolve8_horiz_lsx.c b/vpx_dsp/loongarch/vpx_convolve8_horiz_lsx.c index 5d67d65274..2c6459a978 100644 --- a/vpx_dsp/loongarch/vpx_convolve8_horiz_lsx.c +++ b/vpx_dsp/loongarch/vpx_convolve8_horiz_lsx.c @@ -338,8 +338,7 @@ static void common_hz_2t_4x4_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter) { __m128i src0, src1, src2, src3, mask; - __m128i filt0, vec0, vec1, res0, res1; - __m128i vec2, vec3; + __m128i filt0, vec0, vec1, vec2, vec3, res0, res1; int32_t src_stride2 = src_stride << 1; int32_t src_stride3 = src_stride + src_stride2; @@ -355,8 +354,8 @@ static void common_hz_2t_4x4_lsx(const uint8_t *src, int32_t src_stride, src3 = __lsx_vldx(src, src_stride3); DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask, src3, src2, mask, vec0, vec1); DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, vec3); - DUP2_ARG2(__lsx_vsrari_h, vec2, FILTER_BITS, vec3, FILTER_BITS, vec2, vec3); - DUP2_ARG2(__lsx_vpickev_b, vec2, vec2, vec3, vec3, res0, res1); + DUP2_ARG3(__lsx_vssrarni_bu_h, vec2, vec2, FILTER_BITS, vec3, vec3, + FILTER_BITS, res0, res1); __lsx_vstelm_w(res0, dst, 0, 0); __lsx_vstelm_w(res0, dst + dst_stride, 0, 1); @@ -367,10 +366,9 @@ static void common_hz_2t_4x4_lsx(const uint8_t *src, int32_t src_stride, static void common_hz_2t_4x8_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter) { - __m128i vec0, vec1, vec2, vec3, filt0; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask; - __m128i res0, res1, res2, res3; - __m128i vec4, vec5, vec6, vec7; + __m128i res0, res1, res2, res3, filt0; int32_t src_stride2 = src_stride << 1; int32_t src_stride3 = src_stride + src_stride2; int32_t src_stride4 = src_stride2 << 1; @@ -396,10 +394,10 @@ static void common_hz_2t_4x8_lsx(const uint8_t *src, int32_t src_stride, src7, src6, mask, vec0, vec1, vec2, vec3); DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0, vec4, vec5, vec6, vec7); - DUP4_ARG2(__lsx_vsrari_h, vec4, FILTER_BITS, vec5, FILTER_BITS, vec6, - FILTER_BITS, vec7, FILTER_BITS, vec4, vec5, vec6, vec7); - DUP4_ARG2(__lsx_vpickev_b, vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, - res0, res1, res2, res3); + DUP4_ARG3(__lsx_vssrarni_bu_h, vec4, vec4, FILTER_BITS, vec5, vec5, + FILTER_BITS, vec6, vec6, FILTER_BITS, vec7, vec7, FILTER_BITS, res0, + res1, res2, res3); + __lsx_vstelm_w(res0, dst, 0, 0); dst += dst_stride; __lsx_vstelm_w(res0, dst, 0, 1); @@ -451,14 +449,13 @@ static void common_hz_2t_8x4_lsx(const uint8_t *src, int32_t src_stride, src3, src3, mask, vec0, vec1, vec2, vec3); DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0, vec0, vec1, vec2, vec3); - DUP4_ARG2(__lsx_vsrari_h, vec0, FILTER_BITS, vec1, FILTER_BITS, vec2, - FILTER_BITS, vec3, FILTER_BITS, vec0, vec1, vec2, vec3); - DUP2_ARG2(__lsx_vpickev_b, vec1, vec0, vec3, vec2, src0, src1); - - __lsx_vstelm_d(src0, dst, 0, 0); - __lsx_vstelm_d(src0, dst + dst_stride, 0, 1); - __lsx_vstelm_d(src1, dst + dst_stride2, 0, 0); - __lsx_vstelm_d(src1, dst + dst_stride3, 0, 1); + DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2, + FILTER_BITS, vec0, vec1); + + __lsx_vstelm_d(vec0, dst, 0, 0); + __lsx_vstelm_d(vec0, dst + dst_stride, 0, 1); + __lsx_vstelm_d(vec1, dst + dst_stride2, 0, 0); + __lsx_vstelm_d(vec1, dst + dst_stride3, 0, 1); } static void common_hz_2t_8x8mult_lsx(const uint8_t *src, int32_t src_stride, @@ -490,15 +487,9 @@ static void common_hz_2t_8x8mult_lsx(const uint8_t *src, int32_t src_stride, src3, src3, mask, vec0, vec1, vec2, vec3); DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0, vec0, vec1, vec2, vec3); - DUP4_ARG2(__lsx_vsrari_h, vec0, FILTER_BITS, vec1, FILTER_BITS, vec2, - FILTER_BITS, vec3, FILTER_BITS, vec0, vec1, vec2, vec3); - - src0 = __lsx_vld(src, 0); - DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); - src3 = __lsx_vldx(src, src_stride3); - src += src_stride4; + DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2, + FILTER_BITS, out0, out1); - DUP2_ARG2(__lsx_vpickev_b, vec1, vec0, vec3, vec2, out0, out1); __lsx_vstelm_d(out0, dst, 0, 0); dst += dst_stride; __lsx_vstelm_d(out0, dst, 0, 1); @@ -508,13 +499,17 @@ static void common_hz_2t_8x8mult_lsx(const uint8_t *src, int32_t src_stride, __lsx_vstelm_d(out1, dst, 0, 1); dst += dst_stride; + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + src += src_stride4; + DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2, mask, src3, src3, mask, vec0, vec1, vec2, vec3); DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0, vec0, vec1, vec2, vec3); - DUP4_ARG2(__lsx_vsrari_h, vec0, FILTER_BITS, vec1, FILTER_BITS, vec2, - FILTER_BITS, vec3, FILTER_BITS, vec0, vec1, vec2, vec3); - DUP2_ARG2(__lsx_vpickev_b, vec1, vec0, vec3, vec2, out0, out1); + DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2, + FILTER_BITS, out0, out1); __lsx_vstelm_d(out0, dst, 0, 0); dst += dst_stride; @@ -537,27 +532,25 @@ static void common_hz_2t_8x8mult_lsx(const uint8_t *src, int32_t src_stride, mask, src3, src3, mask, vec0, vec1, vec2, vec3); DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0, vec0, vec1, vec2, vec3); - DUP4_ARG2(__lsx_vsrari_h, vec0, FILTER_BITS, vec1, FILTER_BITS, vec2, - FILTER_BITS, vec3, FILTER_BITS, vec0, vec1, vec2, vec3); - - src0 = __lsx_vld(src, 0); - DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); - src3 = __lsx_vldx(src, src_stride3); - src += src_stride4; + DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2, + FILTER_BITS, out0, out1); - DUP2_ARG2(__lsx_vpickev_b, vec1, vec0, vec3, vec2, out0, out1); __lsx_vstelm_d(out0, dst, 0, 0); __lsx_vstelm_d(out0, dst + dst_stride, 0, 1); __lsx_vstelm_d(out1, dst + dst_stride2, 0, 0); __lsx_vstelm_d(out1, dst + dst_stride3, 0, 1); + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + src += src_stride4; + DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2, mask, src3, src3, mask, vec0, vec1, vec2, vec3); DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0, vec0, vec1, vec2, vec3); - DUP4_ARG2(__lsx_vsrari_h, vec0, FILTER_BITS, vec1, FILTER_BITS, vec2, - FILTER_BITS, vec3, FILTER_BITS, vec0, vec1, vec2, vec3); - DUP2_ARG2(__lsx_vpickev_b, vec1, vec0, vec3, vec2, out0, out1); + DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2, + FILTER_BITS, out0, out1); __lsx_vstelm_d(out0, dst_tmp1, 0, 0); __lsx_vstelm_d(out0, dst_tmp1 + dst_stride, 0, 1); @@ -582,7 +575,7 @@ static void common_hz_2t_16w_lsx(const uint8_t *src, int32_t src_stride, uint32_t loop_cnt = (height >> 2) - 1; __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask; __m128i filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; - __m128i out0, out1, out2, out3, out4, out5, out6, out7, tmp; + __m128i out0, out1, out2, out3, out4, out5, out6, out7; int32_t src_stride2 = src_stride << 1; int32_t src_stride3 = src_stride2 + src_stride; @@ -609,22 +602,17 @@ static void common_hz_2t_16w_lsx(const uint8_t *src, int32_t src_stride, out0, out1, out2, out3); DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7, filt0, out4, out5, out6, out7); - DUP4_ARG2(__lsx_vsrari_h, out0, FILTER_BITS, out1, FILTER_BITS, out2, - FILTER_BITS, out3, FILTER_BITS, out0, out1, out2, out3); - DUP4_ARG2(__lsx_vsrari_h, out4, FILTER_BITS, out5, FILTER_BITS, out6, - FILTER_BITS, out7, FILTER_BITS, out4, out5, out6, out7); + DUP4_ARG3(__lsx_vssrarni_bu_h, out1, out0, FILTER_BITS, out3, out2, + FILTER_BITS, out5, out4, FILTER_BITS, out7, out6, FILTER_BITS, out0, + out1, out2, out3); - tmp = __lsx_vpickev_b(out1, out0); - __lsx_vst(tmp, dst, 0); + __lsx_vst(out0, dst, 0); dst += dst_stride; - tmp = __lsx_vpickev_b(out3, out2); - __lsx_vst(tmp, dst, 0); + __lsx_vst(out1, dst, 0); dst += dst_stride; - tmp = __lsx_vpickev_b(out5, out4); - __lsx_vst(tmp, dst, 0); + __lsx_vst(out2, dst, 0); dst += dst_stride; - tmp = __lsx_vpickev_b(out7, out6); - __lsx_vst(tmp, dst, 0); + __lsx_vst(out3, dst, 0); dst += dst_stride; for (; loop_cnt--;) { @@ -648,22 +636,17 @@ static void common_hz_2t_16w_lsx(const uint8_t *src, int32_t src_stride, filt0, out0, out1, out2, out3); DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7, filt0, out4, out5, out6, out7); - DUP4_ARG2(__lsx_vsrari_h, out0, FILTER_BITS, out1, FILTER_BITS, out2, - FILTER_BITS, out3, FILTER_BITS, out0, out1, out2, out3); - DUP4_ARG2(__lsx_vsrari_h, out4, FILTER_BITS, out5, FILTER_BITS, out6, - FILTER_BITS, out7, FILTER_BITS, out4, out5, out6, out7); + DUP4_ARG3(__lsx_vssrarni_bu_h, out1, out0, FILTER_BITS, out3, out2, + FILTER_BITS, out5, out4, FILTER_BITS, out7, out6, FILTER_BITS, + out0, out1, out2, out3); - tmp = __lsx_vpickev_b(out1, out0); - __lsx_vst(tmp, dst, 0); + __lsx_vst(out0, dst, 0); dst += dst_stride; - tmp = __lsx_vpickev_b(out3, out2); - __lsx_vst(tmp, dst, 0); + __lsx_vst(out1, dst, 0); dst += dst_stride; - tmp = __lsx_vpickev_b(out5, out4); - __lsx_vst(tmp, dst, 0); + __lsx_vst(out2, dst, 0); dst += dst_stride; - tmp = __lsx_vpickev_b(out7, out6); - __lsx_vst(tmp, dst, 0); + __lsx_vst(out3, dst, 0); dst += dst_stride; } } @@ -674,7 +657,7 @@ static void common_hz_2t_32w_lsx(const uint8_t *src, int32_t src_stride, uint32_t loop_cnt = (height >> 1); __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask; __m128i filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; - __m128i out0, out1, out2, out3, out4, out5, out6, out7, tmp; + __m128i out0, out1, out2, out3, out4, out5, out6, out7; __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; mask = __lsx_vld(mc_filt_mask_arr, 0); @@ -699,21 +682,16 @@ static void common_hz_2t_32w_lsx(const uint8_t *src, int32_t src_stride, filt0, out0, out1, out2, out3); DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7, filt0, out4, out5, out6, out7); - DUP4_ARG2(__lsx_vsrari_h, out0, FILTER_BITS, out1, FILTER_BITS, out2, - FILTER_BITS, out3, FILTER_BITS, out0, out1, out2, out3); - DUP4_ARG2(__lsx_vsrari_h, out4, FILTER_BITS, out5, FILTER_BITS, out6, - FILTER_BITS, out7, FILTER_BITS, out4, out5, out6, out7); - - tmp = __lsx_vpickev_b(out1, out0); - __lsx_vst(tmp, dst, 0); - tmp = __lsx_vpickev_b(out3, out2); - __lsx_vst(tmp, dst, 16); + DUP4_ARG3(__lsx_vssrarni_bu_h, out1, out0, FILTER_BITS, out3, out2, + FILTER_BITS, out5, out4, FILTER_BITS, out7, out6, FILTER_BITS, + out0, out1, out2, out3); + + __lsx_vst(out0, dst, 0); + __lsx_vst(out1, dst, 16); dst += dst_stride; - tmp = __lsx_vpickev_b(out5, out4); - __lsx_vst(tmp, dst, 0); - tmp = __lsx_vpickev_b(out7, out6); - __lsx_vst(tmp, dst, 16); + __lsx_vst(out2, dst, 0); + __lsx_vst(out3, dst, 16); dst += dst_stride; } } @@ -724,7 +702,7 @@ static void common_hz_2t_64w_lsx(const uint8_t *src, int32_t src_stride, uint32_t loop_cnt = height; __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask; __m128i filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; - __m128i out0, out1, out2, out3, out4, out5, out6, out7, tmp; + __m128i out0, out1, out2, out3, out4, out5, out6, out7; __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; mask = __lsx_vld(mc_filt_mask_arr, 0); @@ -749,19 +727,14 @@ static void common_hz_2t_64w_lsx(const uint8_t *src, int32_t src_stride, filt0, out0, out1, out2, out3); DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7, filt0, out4, out5, out6, out7); - DUP4_ARG2(__lsx_vsrari_h, out0, FILTER_BITS, out1, FILTER_BITS, out2, - FILTER_BITS, out3, FILTER_BITS, out0, out1, out2, out3); - DUP4_ARG2(__lsx_vsrari_h, out4, FILTER_BITS, out5, FILTER_BITS, out6, - FILTER_BITS, out7, FILTER_BITS, out4, out5, out6, out7); - - tmp = __lsx_vpickev_b(out1, out0); - __lsx_vst(tmp, dst, 0); - tmp = __lsx_vpickev_b(out3, out2); - __lsx_vst(tmp, dst, 16); - tmp = __lsx_vpickev_b(out5, out4); - __lsx_vst(tmp, dst, 32); - tmp = __lsx_vpickev_b(out7, out6); - __lsx_vst(tmp, dst, 48); + DUP4_ARG3(__lsx_vssrarni_bu_h, out1, out0, FILTER_BITS, out3, out2, + FILTER_BITS, out5, out4, FILTER_BITS, out7, out6, FILTER_BITS, + out0, out1, out2, out3); + + __lsx_vst(out0, dst, 0); + __lsx_vst(out1, dst, 16); + __lsx_vst(out2, dst, 32); + __lsx_vst(out3, dst, 48); dst += dst_stride; } } diff --git a/vpx_dsp/loongarch/vpx_convolve8_lsx.c b/vpx_dsp/loongarch/vpx_convolve8_lsx.c index 894c137203..73583abb98 100644 --- a/vpx_dsp/loongarch/vpx_convolve8_lsx.c +++ b/vpx_dsp/loongarch/vpx_convolve8_lsx.c @@ -248,7 +248,7 @@ static void common_hv_2ht_2vt_4x4_lsx(const uint8_t *src, int32_t src_stride, int8_t *filter_horiz, int8_t *filter_vert) { __m128i src0, src1, src2, src3, src4, mask; - __m128i filt_vt, filt_hz, vec0, vec1, res0, res1; + __m128i filt_vt, filt_hz, vec0, vec1; __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1; __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; @@ -276,13 +276,13 @@ static void common_hv_2ht_2vt_4x4_lsx(const uint8_t *src, int32_t src_stride, DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); - DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1); - DUP2_ARG2(__lsx_vpickev_b, tmp0, tmp0, tmp1, tmp1, res0, res1); + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp0, tmp0, FILTER_BITS, tmp1, tmp1, + FILTER_BITS, tmp0, tmp1); - __lsx_vstelm_w(res0, dst, 0, 0); - __lsx_vstelm_w(res0, dst + dst_stride, 0, 1); - __lsx_vstelm_w(res1, dst + dst_stride2, 0, 0); - __lsx_vstelm_w(res1, dst + dst_stride3, 0, 1); + __lsx_vstelm_w(tmp0, dst, 0, 0); + __lsx_vstelm_w(tmp0, dst + dst_stride, 0, 1); + __lsx_vstelm_w(tmp1, dst + dst_stride2, 0, 0); + __lsx_vstelm_w(tmp1, dst + dst_stride3, 0, 1); } static void common_hv_2ht_2vt_4x8_lsx(const uint8_t *src, int32_t src_stride, @@ -290,7 +290,6 @@ static void common_hv_2ht_2vt_4x8_lsx(const uint8_t *src, int32_t src_stride, int8_t *filter_horiz, int8_t *filter_vert) { __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, mask; - __m128i res0, res1, res2, res3; __m128i filt_hz, filt_vt, vec0, vec1, vec2, vec3; __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; __m128i hz_out7, hz_out8, vec4, vec5, vec6, vec7; @@ -331,20 +330,19 @@ static void common_hv_2ht_2vt_4x8_lsx(const uint8_t *src, int32_t src_stride, hz_out4, hz_out7, hz_out6, vec0, vec1, vec2, vec3); DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, vec2, filt_vt, vec3, filt_vt, vec4, vec5, vec6, vec7); - DUP4_ARG2(__lsx_vsrari_h, vec4, FILTER_BITS, vec5, FILTER_BITS, vec6, - FILTER_BITS, vec7, FILTER_BITS, vec4, vec5, vec6, vec7); - DUP4_ARG2(__lsx_vpickev_b, vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, - res0, res1, res2, res3); - - __lsx_vstelm_w(res0, dst, 0, 0); - __lsx_vstelm_w(res0, dst + dst_stride, 0, 1); - __lsx_vstelm_w(res1, dst + dst_stride2, 0, 0); - __lsx_vstelm_w(res1, dst + dst_stride3, 0, 1); + DUP4_ARG3(__lsx_vssrarni_bu_h, vec4, vec4, FILTER_BITS, vec5, vec5, + FILTER_BITS, vec6, vec6, FILTER_BITS, vec7, vec7, FILTER_BITS, vec4, + vec5, vec6, vec7); + + __lsx_vstelm_w(vec4, dst, 0, 0); + __lsx_vstelm_w(vec4, dst + dst_stride, 0, 1); + __lsx_vstelm_w(vec5, dst + dst_stride2, 0, 0); + __lsx_vstelm_w(vec5, dst + dst_stride3, 0, 1); dst += dst_stride4; - __lsx_vstelm_w(res2, dst, 0, 0); - __lsx_vstelm_w(res2, dst + dst_stride, 0, 1); - __lsx_vstelm_w(res3, dst + dst_stride2, 0, 0); - __lsx_vstelm_w(res3, dst + dst_stride3, 0, 1); + __lsx_vstelm_w(vec6, dst, 0, 0); + __lsx_vstelm_w(vec6, dst + dst_stride, 0, 1); + __lsx_vstelm_w(vec7, dst + dst_stride2, 0, 0); + __lsx_vstelm_w(vec7, dst + dst_stride3, 0, 1); } static void common_hv_2ht_2vt_4w_lsx(const uint8_t *src, int32_t src_stride, @@ -364,7 +362,7 @@ static void common_hv_2ht_2vt_8x4_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter_horiz, int8_t *filter_vert) { - __m128i src0, src1, src2, src3, src4, mask, out0, out1; + __m128i src0, src1, src2, src3, src4, mask; __m128i filt_hz, filt_vt, vec0, vec1, vec2, vec3; __m128i hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3; @@ -401,14 +399,13 @@ static void common_hv_2ht_2vt_8x4_lsx(const uint8_t *src, int32_t src_stride, vec3 = __lsx_vpackev_b(hz_out0, hz_out1); tmp3 = __lsx_vdp2_h_bu(vec3, filt_vt); - DUP4_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp2, - FILTER_BITS, tmp3, FILTER_BITS, tmp0, tmp1, tmp2, tmp3); - DUP2_ARG2(__lsx_vpickev_b, tmp1, tmp0, tmp3, tmp2, out0, out1); + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2, + FILTER_BITS, tmp0, tmp1); - __lsx_vstelm_d(out0, dst, 0, 0); - __lsx_vstelm_d(out0, dst + dst_stride, 0, 1); - __lsx_vstelm_d(out1, dst + dst_stride2, 0, 0); - __lsx_vstelm_d(out1, dst + dst_stride3, 0, 1); + __lsx_vstelm_d(tmp0, dst, 0, 0); + __lsx_vstelm_d(tmp0, dst + dst_stride, 0, 1); + __lsx_vstelm_d(tmp1, dst + dst_stride2, 0, 0); + __lsx_vstelm_d(tmp1, dst + dst_stride3, 0, 1); } static void common_hv_2ht_2vt_8x8mult_lsx(const uint8_t *src, @@ -417,9 +414,9 @@ static void common_hv_2ht_2vt_8x8mult_lsx(const uint8_t *src, int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { uint32_t loop_cnt = (height >> 3); - __m128i src0, src1, src2, src3, src4, mask, out0, out1; + __m128i src0, src1, src2, src3, src4, mask; __m128i filt_hz, filt_vt, vec0; - __m128i hz_out0, hz_out1, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8; + __m128i hz_out0, hz_out1, tmp1, tmp2, tmp3, tmp4; int32_t src_stride2 = src_stride << 1; int32_t src_stride3 = src_stride2 + src_stride; @@ -449,8 +446,6 @@ static void common_hv_2ht_2vt_8x8mult_lsx(const uint8_t *src, vec0 = __lsx_vpackev_b(hz_out0, hz_out1); tmp2 = __lsx_vdp2_h_bu(vec0, filt_vt); - DUP2_ARG2(__lsx_vsrari_h, tmp1, FILTER_BITS, tmp2, FILTER_BITS, tmp1, tmp2); - hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); vec0 = __lsx_vpackev_b(hz_out1, hz_out0); tmp3 = __lsx_vdp2_h_bu(vec0, filt_vt); @@ -463,43 +458,44 @@ static void common_hv_2ht_2vt_8x8mult_lsx(const uint8_t *src, vec0 = __lsx_vpackev_b(hz_out0, hz_out1); tmp4 = __lsx_vdp2_h_bu(vec0, filt_vt); - DUP2_ARG2(__lsx_vsrari_h, tmp3, FILTER_BITS, tmp4, FILTER_BITS, tmp3, tmp4); - DUP2_ARG2(__lsx_vpickev_b, tmp2, tmp1, tmp4, tmp3, out0, out1); - __lsx_vstelm_d(out0, dst, 0, 0); + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp2, tmp1, FILTER_BITS, tmp4, tmp3, + FILTER_BITS, tmp1, tmp2); + + __lsx_vstelm_d(tmp1, dst, 0, 0); dst += dst_stride; - __lsx_vstelm_d(out0, dst, 0, 1); + __lsx_vstelm_d(tmp1, dst, 0, 1); dst += dst_stride; - __lsx_vstelm_d(out1, dst, 0, 0); + __lsx_vstelm_d(tmp2, dst, 0, 0); dst += dst_stride; - __lsx_vstelm_d(out1, dst, 0, 1); + __lsx_vstelm_d(tmp2, dst, 0, 1); dst += dst_stride; hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); vec0 = __lsx_vpackev_b(hz_out1, hz_out0); - tmp5 = __lsx_vdp2_h_bu(vec0, filt_vt); + tmp1 = __lsx_vdp2_h_bu(vec0, filt_vt); hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); vec0 = __lsx_vpackev_b(hz_out0, hz_out1); - tmp6 = __lsx_vdp2_h_bu(vec0, filt_vt); + tmp2 = __lsx_vdp2_h_bu(vec0, filt_vt); hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); vec0 = __lsx_vpackev_b(hz_out1, hz_out0); - tmp7 = __lsx_vdp2_h_bu(vec0, filt_vt); + tmp3 = __lsx_vdp2_h_bu(vec0, filt_vt); hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); vec0 = __lsx_vpackev_b(hz_out0, hz_out1); - tmp8 = __lsx_vdp2_h_bu(vec0, filt_vt); + tmp4 = __lsx_vdp2_h_bu(vec0, filt_vt); - DUP4_ARG2(__lsx_vsrari_h, tmp5, FILTER_BITS, tmp6, FILTER_BITS, tmp7, - FILTER_BITS, tmp8, FILTER_BITS, tmp5, tmp6, tmp7, tmp8); - DUP2_ARG2(__lsx_vpickev_b, tmp6, tmp5, tmp8, tmp7, out0, out1); - __lsx_vstelm_d(out0, dst, 0, 0); + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp2, tmp1, FILTER_BITS, tmp4, tmp3, + FILTER_BITS, tmp1, tmp2); + + __lsx_vstelm_d(tmp1, dst, 0, 0); dst += dst_stride; - __lsx_vstelm_d(out0, dst, 0, 1); + __lsx_vstelm_d(tmp1, dst, 0, 1); dst += dst_stride; - __lsx_vstelm_d(out1, dst, 0, 0); + __lsx_vstelm_d(tmp2, dst, 0, 0); dst += dst_stride; - __lsx_vstelm_d(out1, dst, 0, 1); + __lsx_vstelm_d(tmp2, dst, 0, 1); dst += dst_stride; } } @@ -554,8 +550,7 @@ static void common_hv_2ht_2vt_16w_lsx(const uint8_t *src, int32_t src_stride, hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp1, tmp2); - DUP2_ARG2(__lsx_vsrari_h, tmp1, FILTER_BITS, tmp2, FILTER_BITS, tmp1, tmp2); - tmp = __lsx_vpickev_b(tmp2, tmp1); + tmp = __lsx_vssrarni_bu_h(tmp2, tmp1, FILTER_BITS); __lsx_vst(tmp, dst, 0); dst += dst_stride; @@ -563,8 +558,7 @@ static void common_hv_2ht_2vt_16w_lsx(const uint8_t *src, int32_t src_stride, hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp1, tmp2); - DUP2_ARG2(__lsx_vsrari_h, tmp1, FILTER_BITS, tmp2, FILTER_BITS, tmp1, tmp2); - tmp = __lsx_vpickev_b(tmp2, tmp1); + tmp = __lsx_vssrarni_bu_h(tmp2, tmp1, FILTER_BITS); __lsx_vst(tmp, dst, 0); dst += dst_stride; @@ -572,8 +566,7 @@ static void common_hv_2ht_2vt_16w_lsx(const uint8_t *src, int32_t src_stride, hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS); DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp1, tmp2); - DUP2_ARG2(__lsx_vsrari_h, tmp1, FILTER_BITS, tmp2, FILTER_BITS, tmp1, tmp2); - tmp = __lsx_vpickev_b(tmp2, tmp1); + tmp = __lsx_vssrarni_bu_h(tmp2, tmp1, FILTER_BITS); __lsx_vst(tmp, dst, 0); dst += dst_stride; @@ -581,8 +574,7 @@ static void common_hv_2ht_2vt_16w_lsx(const uint8_t *src, int32_t src_stride, hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS); DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp1, tmp2); - DUP2_ARG2(__lsx_vsrari_h, tmp1, FILTER_BITS, tmp2, FILTER_BITS, tmp1, tmp2); - tmp = __lsx_vpickev_b(tmp2, tmp1); + tmp = __lsx_vssrarni_bu_h(tmp2, tmp1, FILTER_BITS); __lsx_vst(tmp, dst, 0); dst += dst_stride; } @@ -599,8 +591,6 @@ static void common_hv_2ht_2vt_32w_lsx(const uint8_t *src, int32_t src_stride, common_hv_2ht_2vt_16w_lsx(src, src_stride, dst, dst_stride, filter_horiz, filter_vert, height); - src += 16; - dst += 16; } static void common_hv_2ht_2vt_64w_lsx(const uint8_t *src, int32_t src_stride, diff --git a/vpx_dsp/loongarch/vpx_convolve8_vert_lsx.c b/vpx_dsp/loongarch/vpx_convolve8_vert_lsx.c index c0bb10f3b7..7e3a95b2fd 100644 --- a/vpx_dsp/loongarch/vpx_convolve8_vert_lsx.c +++ b/vpx_dsp/loongarch/vpx_convolve8_vert_lsx.c @@ -361,13 +361,12 @@ static void common_vt_2t_4x4_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter) { __m128i src0, src1, src2, src3, src4; - __m128i src10_l, src32_l, src21_l, src43_l, src2110, src4332; + __m128i vec0, vec1, vec2, vec3, vec4, vec5; __m128i filt0, tmp0, tmp1; int32_t src_stride2 = src_stride << 1; int32_t src_stride3 = src_stride2 + src_stride; int32_t src_stride4 = src_stride2 << 1; - int32_t dst_stride2 = dst_stride << 1; int32_t dst_stride3 = dst_stride2 + dst_stride; @@ -378,37 +377,33 @@ static void common_vt_2t_4x4_lsx(const uint8_t *src, int32_t src_stride, src, src_stride4, src1, src2, src3, src4); src += (src_stride4 + src_stride); - DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3, - src10_l, src21_l, src32_l, src43_l); - DUP2_ARG2(__lsx_vilvl_d, src21_l, src10_l, src43_l, src32_l, src2110, - src4332); - DUP2_ARG2(__lsx_vdp2_h_bu, src2110, filt0, src4332, filt0, tmp0, tmp1); - DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1); - src2110 = __lsx_vpickev_b(tmp1, tmp0); - - __lsx_vstelm_w(src2110, dst, 0, 0); - __lsx_vstelm_w(src2110, dst + dst_stride, 0, 1); - __lsx_vstelm_w(src2110, dst + dst_stride2, 0, 2); - __lsx_vstelm_w(src2110, dst + dst_stride3, 0, 3); + DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3, vec0, + vec1, vec2, vec3); + DUP2_ARG2(__lsx_vilvl_d, vec1, vec0, vec3, vec2, vec4, vec5); + DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + + __lsx_vstelm_w(tmp0, dst, 0, 0); + __lsx_vstelm_w(tmp0, dst + dst_stride, 0, 1); + __lsx_vstelm_w(tmp0, dst + dst_stride2, 0, 2); + __lsx_vstelm_w(tmp0, dst + dst_stride3, 0, 3); } static void common_vt_2t_4x8_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter) { __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8; - __m128i src10_l, src32_l, src54_l, src76_l, src21_l, src43_l; - __m128i src65_l, src87_l, src2110, src4332, src6554, src8776; + __m128i vec0, vec1, vec2, vec3, vec4, vec5; + __m128i vec6, vec7, vec8, vec9, vec10, vec11; __m128i tmp0, tmp1, tmp2, tmp3; __m128i filt0; int32_t src_stride2 = src_stride << 1; int32_t src_stride3 = src_stride2 + src_stride; int32_t src_stride4 = src_stride2 << 1; - int32_t dst_stride2 = dst_stride << 1; int32_t dst_stride3 = dst_stride2 + dst_stride; int32_t dst_stride4 = dst_stride2 << 1; - uint8_t *dst_tmp1 = dst + dst_stride4; filt0 = __lsx_vldrepl_h(filter, 0); @@ -420,27 +415,27 @@ static void common_vt_2t_4x8_lsx(const uint8_t *src, int32_t src_stride, src, src_stride4, src5, src6, src7, src8); src += (src_stride4 + src_stride); - DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3, - src10_l, src21_l, src32_l, src43_l); - DUP4_ARG2(__lsx_vilvl_b, src5, src4, src6, src5, src7, src6, src8, src7, - src54_l, src65_l, src76_l, src87_l); - DUP4_ARG2(__lsx_vilvl_d, src21_l, src10_l, src43_l, src32_l, src65_l, src54_l, - src87_l, src76_l, src2110, src4332, src6554, src8776); - DUP4_ARG2(__lsx_vdp2_h_bu, src2110, filt0, src4332, filt0, src6554, filt0, - src8776, filt0, tmp0, tmp1, tmp2, tmp3); - DUP4_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp2, - FILTER_BITS, tmp3, FILTER_BITS, tmp0, tmp1, tmp2, tmp3); - DUP2_ARG2(__lsx_vpickev_b, tmp1, tmp0, tmp3, tmp2, src2110, src4332); - - __lsx_vstelm_w(src2110, dst, 0, 0); - __lsx_vstelm_w(src2110, dst + dst_stride, 0, 1); - __lsx_vstelm_w(src2110, dst + dst_stride2, 0, 2); - __lsx_vstelm_w(src2110, dst + dst_stride3, 0, 3); - - __lsx_vstelm_w(src4332, dst_tmp1, 0, 0); - __lsx_vstelm_w(src4332, dst_tmp1 + dst_stride, 0, 1); - __lsx_vstelm_w(src4332, dst_tmp1 + dst_stride2, 0, 2); - __lsx_vstelm_w(src4332, dst_tmp1 + dst_stride3, 0, 3); + DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3, vec0, + vec1, vec2, vec3); + DUP4_ARG2(__lsx_vilvl_b, src5, src4, src6, src5, src7, src6, src8, src7, vec4, + vec5, vec6, vec7); + DUP4_ARG2(__lsx_vilvl_d, vec1, vec0, vec3, vec2, vec5, vec4, vec7, vec6, vec8, + vec9, vec10, vec11); + + DUP4_ARG2(__lsx_vdp2_h_bu, vec8, filt0, vec9, filt0, vec10, filt0, vec11, + filt0, tmp0, tmp1, tmp2, tmp3); + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2, + FILTER_BITS, tmp0, tmp1); + + __lsx_vstelm_w(tmp0, dst, 0, 0); + __lsx_vstelm_w(tmp0, dst + dst_stride, 0, 1); + __lsx_vstelm_w(tmp0, dst + dst_stride2, 0, 2); + __lsx_vstelm_w(tmp0, dst + dst_stride3, 0, 3); + + __lsx_vstelm_w(tmp1, dst_tmp1, 0, 0); + __lsx_vstelm_w(tmp1, dst_tmp1 + dst_stride, 0, 1); + __lsx_vstelm_w(tmp1, dst_tmp1 + dst_stride2, 0, 2); + __lsx_vstelm_w(tmp1, dst_tmp1 + dst_stride3, 0, 3); } static void common_vt_2t_4w_lsx(const uint8_t *src, int32_t src_stride, @@ -457,17 +452,14 @@ static void common_vt_2t_8x4_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter) { __m128i src0, src1, src2, src3, src4, vec0, vec1, vec2, vec3, filt0; - __m128i out0, out1; - __m128i tmp0, tmp1, tmp2, tmp3; + __m128i out0, out1, tmp0, tmp1, tmp2, tmp3; int32_t src_stride2 = src_stride << 1; int32_t src_stride3 = src_stride2 + src_stride; int32_t src_stride4 = src_stride2 << 1; - int32_t dst_stride2 = dst_stride << 1; int32_t dst_stride3 = dst_stride2 + dst_stride; - /* rearranging filter_y */ filt0 = __lsx_vldrepl_h(filter, 0); src0 = __lsx_vld(src, 0); @@ -478,9 +470,8 @@ static void common_vt_2t_8x4_lsx(const uint8_t *src, int32_t src_stride, vec1, vec2, vec3); DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0, tmp0, tmp1, tmp2, tmp3); - DUP4_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp2, - FILTER_BITS, tmp3, FILTER_BITS, tmp0, tmp1, tmp2, tmp3); - DUP2_ARG2(__lsx_vpickev_b, tmp1, tmp0, tmp3, tmp2, out0, out1); + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2, + FILTER_BITS, out0, out1); __lsx_vstelm_d(out0, dst, 0, 0); __lsx_vstelm_d(out0, dst + dst_stride, 0, 1); @@ -494,13 +485,11 @@ static void common_vt_2t_8x8mult_lsx(const uint8_t *src, int32_t src_stride, uint32_t loop_cnt = (height >> 3); __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8; __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; - __m128i out0, out1; - __m128i tmp0, tmp1, tmp2, tmp3; + __m128i out0, out1, tmp0, tmp1, tmp2, tmp3; int32_t src_stride2 = src_stride << 1; int32_t src_stride3 = src_stride2 + src_stride; int32_t src_stride4 = src_stride2 << 1; - int32_t dst_stride2 = dst_stride << 1; int32_t dst_stride3 = dst_stride2 + dst_stride; int32_t dst_stride4 = dst_stride2 << 1; @@ -525,9 +514,9 @@ static void common_vt_2t_8x8mult_lsx(const uint8_t *src, int32_t src_stride, vec4, vec5, vec6, vec7); DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0, tmp0, tmp1, tmp2, tmp3); - DUP4_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp2, - FILTER_BITS, tmp3, FILTER_BITS, tmp0, tmp1, tmp2, tmp3); - DUP2_ARG2(__lsx_vpickev_b, tmp1, tmp0, tmp3, tmp2, out0, out1); + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2, + FILTER_BITS, out0, out1); + __lsx_vstelm_d(out0, dst, 0, 0); __lsx_vstelm_d(out0, dst + dst_stride, 0, 1); __lsx_vstelm_d(out1, dst + dst_stride2, 0, 0); @@ -536,9 +525,9 @@ static void common_vt_2t_8x8mult_lsx(const uint8_t *src, int32_t src_stride, DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7, filt0, tmp0, tmp1, tmp2, tmp3); - DUP4_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp2, - FILTER_BITS, tmp3, FILTER_BITS, tmp0, tmp1, tmp2, tmp3); - DUP2_ARG2(__lsx_vpickev_b, tmp1, tmp0, tmp3, tmp2, out0, out1); + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2, + FILTER_BITS, out0, out1); + __lsx_vstelm_d(out0, dst, 0, 0); __lsx_vstelm_d(out0, dst + dst_stride, 0, 1); __lsx_vstelm_d(out1, dst + dst_stride2, 0, 0); @@ -559,29 +548,17 @@ static void common_vt_2t_8w_lsx(const uint8_t *src, int32_t src_stride, } } -static void common_vt_2t_16w_lsx(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *filter, int y0_q4, - int y_step_q4, int w, int height) { +static void common_vt_2t_16w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { uint32_t loop_cnt = (height >> 2); - __m128i src0, src1, src2, src3, src4; + __m128i src0, src1, src2, src3, src4, tmp, tmp0, tmp1; __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4; int32_t src_stride2 = src_stride << 1; int32_t src_stride3 = src_stride2 + src_stride; int32_t src_stride4 = src_stride2 << 1; - const int16_t *const filter_y = filter[y0_q4]; - int8_t cnt, filt_ver[8]; - - assert(y_step_q4 == 16); - assert(((const int32_t *)filter_y)[1] != 0x800000); - - for (cnt = 8; cnt--;) { - filt_ver[cnt] = filter_y[cnt]; - } - - filt0 = __lsx_vldrepl_h(&filt_ver[3], 0); + filt0 = __lsx_vldrepl_h(filter, 0); src0 = __lsx_vld(src, 0); src += src_stride; @@ -595,29 +572,25 @@ static void common_vt_2t_16w_lsx(const uint8_t *src, ptrdiff_t src_stride, DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec2); DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, vec1, vec3); DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1); - DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1); - tmp4 = __lsx_vpickev_b(tmp1, tmp0); - __lsx_vst(tmp4, dst, 0); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + __lsx_vst(tmp, dst, 0); dst += dst_stride; DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, vec4, vec6); DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, vec5, vec7); - DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp2, tmp3); - DUP2_ARG2(__lsx_vsrari_h, tmp2, FILTER_BITS, tmp3, FILTER_BITS, tmp2, tmp3); - tmp4 = __lsx_vpickev_b(tmp3, tmp2); - __lsx_vst(tmp4, dst, 0); + DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + __lsx_vst(tmp, dst, 0); dst += dst_stride; DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1); - DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1); - tmp4 = __lsx_vpickev_b(tmp1, tmp0); - __lsx_vst(tmp4, dst, 0); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + __lsx_vst(tmp, dst, 0); dst += dst_stride; - DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp2, tmp3); - DUP2_ARG2(__lsx_vsrari_h, tmp2, FILTER_BITS, tmp3, FILTER_BITS, tmp2, tmp3); - tmp4 = __lsx_vpickev_b(tmp3, tmp2); - __lsx_vst(tmp4, dst, 0); + DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + __lsx_vst(tmp, dst, 0); dst += dst_stride; src0 = src4; @@ -630,20 +603,18 @@ static void common_vt_2t_32w_lsx(const uint8_t *src, int32_t src_stride, uint32_t loop_cnt = (height >> 2); __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9; __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4; + __m128i tmp, tmp0, tmp1; int32_t src_stride2 = src_stride << 1; int32_t src_stride3 = src_stride2 + src_stride; int32_t src_stride4 = src_stride2 << 1; - int32_t dst_stride2 = dst_stride << 1; int32_t dst_stride3 = dst_stride2 + dst_stride; - uint8_t *src_tmp; + filt0 = __lsx_vldrepl_h(filter, 0); - src0 = __lsx_vld(src, 0); - src5 = __lsx_vld(src, 16); + DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src5); src += src_stride; src_tmp = src + 16; @@ -658,53 +629,45 @@ static void common_vt_2t_32w_lsx(const uint8_t *src, int32_t src_stride, src_tmp += src_stride4; DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1); - DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1); - tmp4 = __lsx_vpickev_b(tmp1, tmp0); - __lsx_vst(tmp4, dst, 0); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + __lsx_vst(tmp, dst, 0); - DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp2, tmp3); - DUP2_ARG2(__lsx_vsrari_h, tmp2, FILTER_BITS, tmp3, FILTER_BITS, tmp2, tmp3); - tmp4 = __lsx_vpickev_b(tmp3, tmp2); - __lsx_vstx(tmp4, dst, dst_stride); + DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + __lsx_vstx(tmp, dst, dst_stride); DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, vec4, vec6); DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, vec5, vec7); DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1); - DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1); - tmp4 = __lsx_vpickev_b(tmp1, tmp0); - __lsx_vstx(tmp4, dst, dst_stride2); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + __lsx_vstx(tmp, dst, dst_stride2); - DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp2, tmp3); - DUP2_ARG2(__lsx_vsrari_h, tmp2, FILTER_BITS, tmp3, FILTER_BITS, tmp2, tmp3); - tmp4 = __lsx_vpickev_b(tmp3, tmp2); - __lsx_vstx(tmp4, dst, dst_stride3); + DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + __lsx_vstx(tmp, dst, dst_stride3); DUP2_ARG2(__lsx_vilvl_b, src6, src5, src7, src6, vec0, vec2); DUP2_ARG2(__lsx_vilvh_b, src6, src5, src7, src6, vec1, vec3); DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1); - DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1); - tmp4 = __lsx_vpickev_b(tmp1, tmp0); - __lsx_vst(tmp4, dst, 16); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + __lsx_vst(tmp, dst, 16); - DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp2, tmp3); - DUP2_ARG2(__lsx_vsrari_h, tmp2, FILTER_BITS, tmp3, FILTER_BITS, tmp2, tmp3); + DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); dst += dst_stride; - tmp4 = __lsx_vpickev_b(tmp3, tmp2); - __lsx_vst(tmp4, dst, 16); + __lsx_vst(tmp, dst, 16); DUP2_ARG2(__lsx_vilvl_b, src8, src7, src9, src8, vec4, vec6); DUP2_ARG2(__lsx_vilvh_b, src8, src7, src9, src8, vec5, vec7); DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1); - DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); dst += dst_stride; - tmp4 = __lsx_vpickev_b(tmp1, tmp0); - __lsx_vst(tmp4, dst, 16); + __lsx_vst(tmp, dst, 16); - DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp2, tmp3); - DUP2_ARG2(__lsx_vsrari_h, tmp2, FILTER_BITS, tmp3, FILTER_BITS, tmp2, tmp3); + DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); dst += dst_stride; - tmp4 = __lsx_vpickev_b(tmp3, tmp2); - __lsx_vst(tmp4, dst, 16); + __lsx_vst(tmp, dst, 16); dst += dst_stride; @@ -719,7 +682,7 @@ static void common_vt_2t_64w_lsx(const uint8_t *src, int32_t src_stride, uint32_t loop_cnt = (height >> 1); __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; __m128i src11, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; - __m128i tmp, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + __m128i tmp, tmp0, tmp1; int32_t src_stride2 = src_stride << 1; int32_t dst_stride2 = dst_stride << 1; @@ -743,49 +706,41 @@ static void common_vt_2t_64w_lsx(const uint8_t *src, int32_t src_stride, DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec2); DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, vec1, vec3); DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1); - DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1); - tmp = __lsx_vpickev_b(tmp1, tmp0); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); __lsx_vst(tmp, dst, 0); - DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp2, tmp3); - DUP2_ARG2(__lsx_vsrari_h, tmp2, FILTER_BITS, tmp3, FILTER_BITS, tmp2, tmp3); - tmp = __lsx_vpickev_b(tmp3, tmp2); + DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); __lsx_vst(tmp, dst_tmp1, 0); DUP2_ARG2(__lsx_vilvl_b, src4, src3, src5, src4, vec4, vec6); DUP2_ARG2(__lsx_vilvh_b, src4, src3, src5, src4, vec5, vec7); - DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp4, tmp5); - DUP2_ARG2(__lsx_vsrari_h, tmp4, FILTER_BITS, tmp5, FILTER_BITS, tmp4, tmp5); - tmp = __lsx_vpickev_b(tmp5, tmp4); + DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); __lsx_vst(tmp, dst, 16); - DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp6, tmp7); - DUP2_ARG2(__lsx_vsrari_h, tmp6, FILTER_BITS, tmp7, FILTER_BITS, tmp6, tmp7); - tmp = __lsx_vpickev_b(tmp7, tmp6); + DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); __lsx_vst(tmp, dst_tmp1, 16); DUP2_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, vec0, vec2); DUP2_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, vec1, vec3); DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1); - DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1); - tmp = __lsx_vpickev_b(tmp1, tmp0); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); __lsx_vst(tmp, dst, 32); - DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp2, tmp3); - DUP2_ARG2(__lsx_vsrari_h, tmp2, FILTER_BITS, tmp3, FILTER_BITS, tmp2, tmp3); - tmp = __lsx_vpickev_b(tmp3, tmp2); + DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); __lsx_vst(tmp, dst_tmp1, 32); DUP2_ARG2(__lsx_vilvl_b, src10, src9, src11, src10, vec4, vec6); DUP2_ARG2(__lsx_vilvh_b, src10, src9, src11, src10, vec5, vec7); - DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp4, tmp5); - DUP2_ARG2(__lsx_vsrari_h, tmp4, FILTER_BITS, tmp5, FILTER_BITS, tmp4, tmp5); - tmp = __lsx_vpickev_b(tmp5, tmp4); + DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); __lsx_vst(tmp, dst, 48); - DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp6, tmp7); - DUP2_ARG2(__lsx_vsrari_h, tmp6, FILTER_BITS, tmp7, FILTER_BITS, tmp6, tmp7); - tmp = __lsx_vpickev_b(tmp7, tmp6); + DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); __lsx_vst(tmp, dst_tmp1, 48); dst += dst_stride2; dst_tmp1 += dst_stride2; @@ -823,8 +778,8 @@ void vpx_convolve8_vert_lsx(const uint8_t *src, ptrdiff_t src_stride, &filt_ver[3], h); break; case 16: - common_vt_2t_16w_lsx(src, src_stride, dst, dst_stride, filter, y0_q4, - y_step_q4, w, h); + common_vt_2t_16w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + &filt_ver[3], h); break; case 32: common_vt_2t_32w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, diff --git a/vpx_dsp/loongarch/vpx_convolve_copy_lsx.c b/vpx_dsp/loongarch/vpx_convolve_copy_lsx.c index 398788a43e..53dc7097ed 100644 --- a/vpx_dsp/loongarch/vpx_convolve_copy_lsx.c +++ b/vpx_dsp/loongarch/vpx_convolve_copy_lsx.c @@ -15,7 +15,6 @@ static void copy_width8_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height) { int32_t cnt; - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; __m128i src0, src1, src2, src3, src4, src5, src6, src7; int32_t src_stride2 = src_stride << 1; int32_t src_stride3 = src_stride2 + src_stride; diff --git a/vpx_dsp/loongarch/vpx_convolve_lsx.h b/vpx_dsp/loongarch/vpx_convolve_lsx.h index d319bc4f7d..2428407f2b 100644 --- a/vpx_dsp/loongarch/vpx_convolve_lsx.h +++ b/vpx_dsp/loongarch/vpx_convolve_lsx.h @@ -125,19 +125,18 @@ tmp1_m; \ }) -#define PCKEV_AVG_ST4_D(in0, in1, in2, in3, dst0, dst1, pdst, stride) \ - { \ - __m128i tmp0_m, tmp1_m; \ - \ - DUP2_ARG2(__lsx_vpickev_b, in1, in0, in3, in2, tmp0_m, tmp1_m); \ - DUP2_ARG2(__lsx_vavgr_bu, tmp0_m, dst0, tmp1_m, dst1, tmp0_m, tmp1_m); \ - __lsx_vstelm_d(tmp0_m, pdst, 0, 0); \ - pdst += stride; \ - __lsx_vstelm_d(tmp0_m, pdst, 0, 1); \ - pdst += stride; \ - __lsx_vstelm_d(tmp1_m, pdst, 0, 0); \ - pdst += stride; \ - __lsx_vstelm_d(tmp1_m, pdst, 0, 1); \ +#define AVG_ST4_D(in0, in1, dst0, dst1, pdst, stride) \ + { \ + __m128i tmp0_m, tmp1_m; \ + \ + DUP2_ARG2(__lsx_vavgr_bu, in0, dst0, in1, dst1, tmp0_m, tmp1_m); \ + __lsx_vstelm_d(tmp0_m, pdst, 0, 0); \ + pdst += stride; \ + __lsx_vstelm_d(tmp0_m, pdst, 0, 1); \ + pdst += stride; \ + __lsx_vstelm_d(tmp1_m, pdst, 0, 0); \ + pdst += stride; \ + __lsx_vstelm_d(tmp1_m, pdst, 0, 1); \ } #endif // VPX_VPX_DSP_LOONGARCH_VPX_CONVOLVE_LSX_H_ From f92c451e6c03685e28217f2080cc52a994938664 Mon Sep 17 00:00:00 2001 From: yuanhecai Date: Tue, 17 May 2022 19:06:04 +0800 Subject: [PATCH 0376/1000] loongarch: Modify the representation of macros Some macros have been changed to "#define do {...} While (0)", change the rest to "static INLINE ..." Bug: webm:1755 Change-Id: I445ac0c543f12df38f086b479394b111058367d0 --- vp8/common/loongarch/idct_lsx.c | 39 +-- vp8/common/loongarch/loopfilter_filters_lsx.c | 16 +- vp8/common/loongarch/sixtap_filter_lsx.c | 322 +++++++++--------- vpx_dsp/loongarch/bitdepth_conversion_lsx.h | 43 +-- vpx_dsp/loongarch/fwd_dct32x32_lsx.c | 4 +- vpx_dsp/loongarch/fwd_txfm_lsx.c | 4 +- vpx_dsp/loongarch/fwd_txfm_lsx.h | 40 +-- vpx_dsp/loongarch/idct32x32_lsx.c | 4 +- vpx_dsp/loongarch/loopfilter_16_lsx.c | 8 +- vpx_dsp/loongarch/loopfilter_lsx.h | 20 +- vpx_dsp/loongarch/quantize_lsx.c | 192 ++++++----- vpx_dsp/loongarch/sad_lsx.c | 231 +++++++------ vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c | 102 +++--- .../loongarch/vpx_convolve8_avg_vert_lsx.c | 28 +- vpx_dsp/loongarch/vpx_convolve8_lsx.c | 110 +++--- vpx_dsp/loongarch/vpx_convolve8_vert_lsx.c | 44 +-- vpx_dsp/loongarch/vpx_convolve_lsx.h | 100 +++--- 17 files changed, 654 insertions(+), 653 deletions(-) diff --git a/vp8/common/loongarch/idct_lsx.c b/vp8/common/loongarch/idct_lsx.c index 679019ff63..eee871eec4 100644 --- a/vp8/common/loongarch/idct_lsx.c +++ b/vp8/common/loongarch/idct_lsx.c @@ -16,47 +16,44 @@ static const int32_t cospi8sqrt2minus1 = 20091; static const int32_t sinpi8sqrt2 = 35468; #define TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \ - { \ + do { \ __m128i tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ \ DUP2_ARG2(__lsx_vilvl_h, in1, in0, in3, in2, tmp0_m, tmp1_m); \ DUP2_ARG2(__lsx_vilvh_h, in1, in0, in3, in2, tmp2_m, tmp3_m); \ DUP2_ARG2(__lsx_vilvl_w, tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out2); \ DUP2_ARG2(__lsx_vilvh_w, tmp1_m, tmp0_m, tmp3_m, tmp2_m, out1, out3); \ - } + } while (0) #define TRANSPOSE_TWO_4x4_H(in0, in1, in2, in3, out0, out1, out2, out3) \ - { \ + do { \ __m128i s4_m, s5_m, s6_m, s7_m; \ \ TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, s4_m, s5_m, s6_m, s7_m); \ DUP2_ARG2(__lsx_vilvl_d, s6_m, s4_m, s7_m, s5_m, out0, out2); \ out1 = __lsx_vilvh_d(s6_m, s4_m); \ out3 = __lsx_vilvh_d(s7_m, s5_m); \ - } + } while (0) -#define EXPAND_TO_H_MULTIPLY_SINPI8SQRT2_PCK_TO_W(in) \ - ({ \ - __m128i out_m; \ +#define EXPAND_TO_H_MULTIPLY_SINPI8SQRT2_PCK_TO_W(in0, in1) \ + do { \ __m128i zero_m = __lsx_vldi(0); \ __m128i tmp1_m, tmp2_m; \ __m128i sinpi8_sqrt2_m = __lsx_vreplgr2vr_w(sinpi8sqrt2); \ \ - tmp1_m = __lsx_vilvl_h(in, zero_m); \ - tmp2_m = __lsx_vilvh_h(in, zero_m); \ + tmp1_m = __lsx_vilvl_h(in0, zero_m); \ + tmp2_m = __lsx_vilvh_h(in0, zero_m); \ tmp1_m = __lsx_vsrai_w(tmp1_m, 16); \ tmp2_m = __lsx_vsrai_w(tmp2_m, 16); \ tmp1_m = __lsx_vmul_w(tmp1_m, sinpi8_sqrt2_m); \ tmp1_m = __lsx_vsrai_w(tmp1_m, 16); \ tmp2_m = __lsx_vmul_w(tmp2_m, sinpi8_sqrt2_m); \ tmp2_m = __lsx_vsrai_w(tmp2_m, 16); \ - out_m = __lsx_vpickev_h(tmp2_m, tmp1_m); \ - \ - out_m; \ - }) + in1 = __lsx_vpickev_h(tmp2_m, tmp1_m); \ + } while (0) #define VP8_IDCT_1D_H(in0, in1, in2, in3, out0, out1, out2, out3) \ - { \ + do { \ __m128i a1_m, b1_m, c1_m, d1_m; \ __m128i c_tmp1_m, c_tmp2_m; \ __m128i d_tmp1_m, d_tmp2_m; \ @@ -65,7 +62,7 @@ static const int32_t sinpi8sqrt2 = 35468; const_cospi8sqrt2minus1_m = __lsx_vreplgr2vr_h(cospi8sqrt2minus1); \ a1_m = __lsx_vadd_h(in0, in2); \ b1_m = __lsx_vsub_h(in0, in2); \ - c_tmp1_m = EXPAND_TO_H_MULTIPLY_SINPI8SQRT2_PCK_TO_W(in1); \ + EXPAND_TO_H_MULTIPLY_SINPI8SQRT2_PCK_TO_W(in1, c_tmp1_m); \ \ c_tmp2_m = __lsx_vmuh_h(in3, const_cospi8sqrt2minus1_m); \ c_tmp2_m = __lsx_vslli_h(c_tmp2_m, 1); \ @@ -77,13 +74,13 @@ static const int32_t sinpi8sqrt2 = 35468; d_tmp1_m = __lsx_vslli_h(d_tmp1_m, 1); \ d_tmp1_m = __lsx_vsrai_h(d_tmp1_m, 1); \ d_tmp1_m = __lsx_vadd_h(in1, d_tmp1_m); \ - d_tmp2_m = EXPAND_TO_H_MULTIPLY_SINPI8SQRT2_PCK_TO_W(in3); \ + EXPAND_TO_H_MULTIPLY_SINPI8SQRT2_PCK_TO_W(in3, d_tmp2_m); \ d1_m = __lsx_vadd_h(d_tmp1_m, d_tmp2_m); \ LSX_BUTTERFLY_4_H(a1_m, b1_m, c1_m, d1_m, out0, out1, out2, out3); \ - } + } while (0) #define VP8_IDCT_1D_W(in0, in1, in2, in3, out0, out1, out2, out3) \ - { \ + do { \ __m128i a1_m, b1_m, c1_m, d1_m; \ __m128i c_tmp1_m, c_tmp2_m, d_tmp1_m, d_tmp2_m; \ __m128i const_cospi8sqrt2minus1_m, sinpi8_sqrt2_m; \ @@ -105,13 +102,13 @@ static const int32_t sinpi8sqrt2 = 35468; d_tmp2_m = __lsx_vsrai_w(d_tmp2_m, 16); \ d1_m = __lsx_vadd_w(d_tmp1_m, d_tmp2_m); \ LSX_BUTTERFLY_4_W(a1_m, b1_m, c1_m, d1_m, out0, out1, out2, out3); \ - } + } while (0) #define UNPCK_SH_SW(in, out0, out1) \ - { \ + do { \ out0 = __lsx_vsllwil_w_h(in, 0); \ out1 = __lsx_vexth_w_h(in); \ - } + } while (0) static void idct4x4_addconst_lsx(int16_t in_dc, uint8_t *pred, int32_t pred_stride, uint8_t *dest, diff --git a/vp8/common/loongarch/loopfilter_filters_lsx.c b/vp8/common/loongarch/loopfilter_filters_lsx.c index a3ac76d258..f743ec0c50 100644 --- a/vp8/common/loongarch/loopfilter_filters_lsx.c +++ b/vp8/common/loongarch/loopfilter_filters_lsx.c @@ -14,7 +14,7 @@ #include "vpx_util/loongson_intrinsics.h" #define VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev) \ - { \ + do { \ __m128i p1_m, p0_m, q0_m, q1_m, filt, q0_sub_p0, t1, t2; \ const __m128i cnst4b = __lsx_vldi(4); \ const __m128i cnst3b = __lsx_vldi(3); \ @@ -46,10 +46,10 @@ q1 = __lsx_vxori_b(q1_m, 0x80); \ p1_m = __lsx_vsadd_b(p1_m, filt); \ p1 = __lsx_vxori_b(p1_m, 0x80); \ - } + } while (0) #define VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev) \ - { \ + do { \ __m128i p2_m, p1_m, p0_m, q2_m, q1_m, q0_m; \ __m128i u, filt, t1, t2, filt_sign, q0_sub_p0; \ __m128i filt_r, filt_l; \ @@ -113,12 +113,12 @@ p0_m = __lsx_vsadd_b(p0_m, u); \ q0 = __lsx_vxori_b(q0_m, 0x80); \ p0 = __lsx_vxori_b(p0_m, 0x80); \ - } + } while (0) #define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \ limit_in, b_limit_in, thresh_in, hev_out, mask_out, \ flat_out) \ - { \ + do { \ __m128i p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m; \ __m128i p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m; \ \ @@ -143,13 +143,13 @@ mask_out = __lsx_vmax_bu(q2_asub_q1_m, mask_out); \ mask_out = __lsx_vslt_bu(limit_in, mask_out); \ mask_out = __lsx_vxori_b(mask_out, 0xff); \ - } + } while (0) #define VP8_ST6x1_B(in0, in0_idx, in1, in1_idx, pdst, stride) \ - { \ + do { \ __lsx_vstelm_w(in0, pdst, 0, in0_idx); \ __lsx_vstelm_h(in1, pdst + stride, 0, in1_idx); \ - } + } while (0) static void loop_filter_horizontal_4_dual_lsx(uint8_t *src, int32_t pitch, const uint8_t *b_limit0_ptr, diff --git a/vp8/common/loongarch/sixtap_filter_lsx.c b/vp8/common/loongarch/sixtap_filter_lsx.c index a23ed16d2b..cd7ba54746 100644 --- a/vp8/common/loongarch/sixtap_filter_lsx.c +++ b/vp8/common/loongarch/sixtap_filter_lsx.c @@ -33,37 +33,61 @@ static const uint8_t vp8_mc_filt_mask_arr[16 * 3] = { 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28 }; -#define DPADD_H3(in0, in1, in2, coeff0, coeff1, coeff2) \ - ({ \ - __m128i out0_m; \ - \ - out0_m = __lsx_vdp2_h_b(in0, coeff0); \ - out0_m = __lsx_vdp2add_h_b(out0_m, in1, coeff1); \ - out0_m = __lsx_vdp2add_h_b(out0_m, in2, coeff2); \ - \ - out0_m; \ - }) - -#define HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2, filt_h0, filt_h1, \ - filt_h2) \ - ({ \ - __m128i vec0_m, vec1_m, vec2_m; \ - __m128i hz_out_m; \ - \ - DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask0, src1, src0, mask1, vec0_m, \ - vec1_m); \ - vec2_m = __lsx_vshuf_b(src1, src0, mask2); \ - hz_out_m = DPADD_H3(vec0_m, vec1_m, vec2_m, filt_h0, filt_h1, filt_h2); \ - \ - hz_out_m = __lsx_vsrari_h(hz_out_m, VP8_FILTER_SHIFT); \ - hz_out_m = __lsx_vsat_h(hz_out_m, 7); \ - \ - hz_out_m; \ - }) +static INLINE __m128i dpadd_h3(__m128i in0, __m128i in1, __m128i in2, + __m128i coeff0, __m128i coeff1, __m128i coeff2) { + __m128i out0_m; + + out0_m = __lsx_vdp2_h_b(in0, coeff0); + out0_m = __lsx_vdp2add_h_b(out0_m, in1, coeff1); + out0_m = __lsx_vdp2add_h_b(out0_m, in2, coeff2); + + return out0_m; +} + +static INLINE __m128i horiz_6tap_filt(__m128i src0, __m128i src1, __m128i mask0, + __m128i mask1, __m128i mask2, + __m128i filt_h0, __m128i filt_h1, + __m128i filt_h2) { + __m128i vec0_m, vec1_m, vec2_m; + __m128i hz_out_m; + + DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask0, src1, src0, mask1, vec0_m, + vec1_m); + vec2_m = __lsx_vshuf_b(src1, src0, mask2); + hz_out_m = dpadd_h3(vec0_m, vec1_m, vec2_m, filt_h0, filt_h1, filt_h2); + hz_out_m = __lsx_vsrari_h(hz_out_m, VP8_FILTER_SHIFT); + hz_out_m = __lsx_vsat_h(hz_out_m, 7); + + return hz_out_m; +} + +static INLINE __m128i filt_4tap_dpadd_h(__m128i vec0, __m128i vec1, + __m128i filt0, __m128i filt1) { + __m128i tmp_m; + + tmp_m = __lsx_vdp2_h_b(vec0, filt0); + tmp_m = __lsx_vdp2add_h_b(tmp_m, vec1, filt1); + + return tmp_m; +} + +static INLINE __m128i horiz_4tap_filt(__m128i src0, __m128i src1, __m128i mask0, + __m128i mask1, __m128i filt_h0, + __m128i filt_h1) { + __m128i vec0_m, vec1_m, hz_out_m; + + DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask0, src1, src0, mask1, vec0_m, + vec1_m); + hz_out_m = filt_4tap_dpadd_h(vec0_m, vec1_m, filt_h0, filt_h1); + hz_out_m = __lsx_vsrari_h(hz_out_m, VP8_FILTER_SHIFT); + hz_out_m = __lsx_vsat_h(hz_out_m, 7); + + return hz_out_m; +} #define HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ mask2, filt0, filt1, filt2, out0, out1) \ - { \ + do { \ __m128i vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m; \ \ DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask0, src3, src2, mask0, vec0_m, \ @@ -77,12 +101,12 @@ static const uint8_t vp8_mc_filt_mask_arr[16 * 3] = { vec5_m); \ DUP2_ARG3(__lsx_vdp2add_h_b, out0, vec4_m, filt2, out1, vec5_m, filt2, \ out0, out1); \ - } + } while (0) #define HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ mask2, filt0, filt1, filt2, out0, out1, \ out2, out3) \ - ({ \ + do { \ __m128i vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ \ DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src1, mask0, vec0_m, \ @@ -105,35 +129,11 @@ static const uint8_t vp8_mc_filt_mask_arr[16 * 3] = { DUP4_ARG3(__lsx_vdp2add_h_b, out0, vec4_m, filt2, out1, vec5_m, filt2, \ out2, vec6_m, filt2, out3, vec7_m, filt2, out0, out1, out2, \ out3); \ - }) - -#define FILT_4TAP_DPADD_H(vec0, vec1, filt0, filt1) \ - ({ \ - __m128i tmp0; \ - \ - tmp0 = __lsx_vdp2_h_b(vec0, filt0); \ - tmp0 = __lsx_vdp2add_h_b(tmp0, vec1, filt1); \ - \ - tmp0; \ - }) - -#define HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_h0, filt_h1) \ - ({ \ - __m128i vec0_m, vec1_m; \ - __m128i hz_out_m; \ - \ - DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask0, src1, src0, mask1, vec0_m, \ - vec1_m); \ - hz_out_m = FILT_4TAP_DPADD_H(vec0_m, vec1_m, filt_h0, filt_h1); \ - hz_out_m = __lsx_vsrari_h(hz_out_m, VP8_FILTER_SHIFT); \ - hz_out_m = __lsx_vsat_h(hz_out_m, 7); \ - \ - hz_out_m; \ - }) + } while (0) #define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ filt0, filt1, out0, out1) \ - { \ + do { \ __m128i vec0_m, vec1_m, vec2_m, vec3_m; \ \ DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask0, src3, src2, mask0, vec0_m, \ @@ -143,11 +143,11 @@ static const uint8_t vp8_mc_filt_mask_arr[16 * 3] = { vec3_m); \ DUP2_ARG3(__lsx_vdp2add_h_b, out0, vec2_m, filt1, out1, vec3_m, filt1, \ out0, out1); \ - } + } while (0) #define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ filt0, filt1, out0, out1, out2, out3) \ - ({ \ + do { \ __m128i vec0_m, vec1_m, vec2_m, vec3_m; \ \ DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src1, mask0, vec0_m, \ @@ -163,7 +163,7 @@ static const uint8_t vp8_mc_filt_mask_arr[16 * 3] = { DUP4_ARG3(__lsx_vdp2add_h_b, out0, vec0_m, filt1, out1, vec1_m, filt1, \ out2, vec2_m, filt1, out3, vec3_m, filt1, out0, out1, out2, \ out3); \ - }) + } while (0) static inline void common_hz_6t_4x4_lsx(uint8_t *RESTRICT src, int32_t src_stride, @@ -424,8 +424,8 @@ static void common_vt_6t_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride, DUP2_ARG2(__lsx_vilvl_d, src65_r, src54_r, src87_r, src76_r, src6554, src8776); DUP2_ARG2(__lsx_vxori_b, src6554, 128, src8776, 128, src6554, src8776); - out0 = DPADD_H3(src2110, src4332, src6554, filt0, filt1, filt2); - out1 = DPADD_H3(src4332, src6554, src8776, filt0, filt1, filt2); + out0 = dpadd_h3(src2110, src4332, src6554, filt0, filt1, filt2); + out1 = dpadd_h3(src4332, src6554, src8776, filt0, filt1, filt2); out0 = __lsx_vssrarni_b_h(out1, out0, VP8_FILTER_SHIFT); out0 = __lsx_vxori_b(out0, 128); @@ -487,10 +487,10 @@ static void common_vt_6t_8w_lsx(uint8_t *RESTRICT src, int32_t src_stride, DUP4_ARG2(__lsx_vilvl_b, src7, src4, src8, src7, src9, src8, src10, src9, src76_r, src87_r, src98_r, src109_r); - out0_r = DPADD_H3(src10_r, src32_r, src76_r, filt0, filt1, filt2); - out1_r = DPADD_H3(src21_r, src43_r, src87_r, filt0, filt1, filt2); - out2_r = DPADD_H3(src32_r, src76_r, src98_r, filt0, filt1, filt2); - out3_r = DPADD_H3(src43_r, src87_r, src109_r, filt0, filt1, filt2); + out0_r = dpadd_h3(src10_r, src32_r, src76_r, filt0, filt1, filt2); + out1_r = dpadd_h3(src21_r, src43_r, src87_r, filt0, filt1, filt2); + out2_r = dpadd_h3(src32_r, src76_r, src98_r, filt0, filt1, filt2); + out3_r = dpadd_h3(src43_r, src87_r, src109_r, filt0, filt1, filt2); DUP2_ARG3(__lsx_vssrarni_b_h, out1_r, out0_r, VP8_FILTER_SHIFT, out3_r, out2_r, VP8_FILTER_SHIFT, tmp0, tmp1); DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); @@ -555,14 +555,14 @@ static void common_vt_6t_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride, src54_r, src65_r, src76_r, src87_r); DUP4_ARG2(__lsx_vilvh_b, src5, src4, src6, src5, src7, src6, src8, src7, src54_l, src65_l, src76_l, src87_l); - out0_r = DPADD_H3(src10_r, src32_r, src54_r, filt0, filt1, filt2); - out1_r = DPADD_H3(src21_r, src43_r, src65_r, filt0, filt1, filt2); - out2_r = DPADD_H3(src32_r, src54_r, src76_r, filt0, filt1, filt2); - out3_r = DPADD_H3(src43_r, src65_r, src87_r, filt0, filt1, filt2); - out0_l = DPADD_H3(src10_l, src32_l, src54_l, filt0, filt1, filt2); - out1_l = DPADD_H3(src21_l, src43_l, src65_l, filt0, filt1, filt2); - out2_l = DPADD_H3(src32_l, src54_l, src76_l, filt0, filt1, filt2); - out3_l = DPADD_H3(src43_l, src65_l, src87_l, filt0, filt1, filt2); + out0_r = dpadd_h3(src10_r, src32_r, src54_r, filt0, filt1, filt2); + out1_r = dpadd_h3(src21_r, src43_r, src65_r, filt0, filt1, filt2); + out2_r = dpadd_h3(src32_r, src54_r, src76_r, filt0, filt1, filt2); + out3_r = dpadd_h3(src43_r, src65_r, src87_r, filt0, filt1, filt2); + out0_l = dpadd_h3(src10_l, src32_l, src54_l, filt0, filt1, filt2); + out1_l = dpadd_h3(src21_l, src43_l, src65_l, filt0, filt1, filt2); + out2_l = dpadd_h3(src32_l, src54_l, src76_l, filt0, filt1, filt2); + out3_l = dpadd_h3(src43_l, src65_l, src87_l, filt0, filt1, filt2); DUP4_ARG3(__lsx_vssrarni_b_h, out0_l, out0_r, VP8_FILTER_SHIFT, out1_l, out1_r, VP8_FILTER_SHIFT, out2_l, out2_r, VP8_FILTER_SHIFT, out3_l, out3_r, VP8_FILTER_SHIFT, tmp0, tmp1, tmp2, tmp3); @@ -621,12 +621,12 @@ static void common_hv_6ht_6vt_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride, src1, src2, src3); src4 = __lsx_vxori_b(src4, 128); - hz_out0 = HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2, filt_hz0, filt_hz1, + hz_out0 = horiz_6tap_filt(src0, src1, mask0, mask1, mask2, filt_hz0, filt_hz1, filt_hz2); - hz_out2 = HORIZ_6TAP_FILT(src2, src3, mask0, mask1, mask2, filt_hz0, filt_hz1, + hz_out2 = horiz_6tap_filt(src2, src3, mask0, mask1, mask2, filt_hz0, filt_hz1, filt_hz2); hz_out1 = __lsx_vshuf_b(hz_out2, hz_out0, shuff); - hz_out3 = HORIZ_6TAP_FILT(src3, src4, mask0, mask1, mask2, filt_hz0, filt_hz1, + hz_out3 = horiz_6tap_filt(src3, src4, mask0, mask1, mask2, filt_hz0, filt_hz1, filt_hz2); DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, out0, out1); @@ -636,7 +636,7 @@ static void common_hv_6ht_6vt_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride, src += src_stride_x2; DUP2_ARG2(__lsx_vxori_b, src5, 128, src6, 128, src5, src6); - hz_out5 = HORIZ_6TAP_FILT(src5, src6, mask0, mask1, mask2, filt_hz0, + hz_out5 = horiz_6tap_filt(src5, src6, mask0, mask1, mask2, filt_hz0, filt_hz1, filt_hz2); hz_out4 = __lsx_vshuf_b(hz_out5, hz_out3, shuff); @@ -645,15 +645,15 @@ static void common_hv_6ht_6vt_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride, src += src_stride_x2; DUP2_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src7, src8); - hz_out7 = HORIZ_6TAP_FILT(src7, src8, mask0, mask1, mask2, filt_hz0, + hz_out7 = horiz_6tap_filt(src7, src8, mask0, mask1, mask2, filt_hz0, filt_hz1, filt_hz2); hz_out6 = __lsx_vshuf_b(hz_out7, hz_out5, shuff); out2 = __lsx_vpackev_b(hz_out5, hz_out4); - tmp0 = DPADD_H3(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2); + tmp0 = dpadd_h3(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2); out3 = __lsx_vpackev_b(hz_out7, hz_out6); - tmp1 = DPADD_H3(out1, out2, out3, filt_vt0, filt_vt1, filt_vt2); + tmp1 = dpadd_h3(out1, out2, out3, filt_vt0, filt_vt1, filt_vt2); tmp0 = __lsx_vssrarni_b_h(tmp1, tmp0, 7); tmp0 = __lsx_vxori_b(tmp0, 128); @@ -710,15 +710,15 @@ static void common_hv_6ht_6vt_8w_lsx(uint8_t *RESTRICT src, int32_t src_stride, src1, src2, src3); src4 = __lsx_vxori_b(src4, 128); - hz_out0 = HORIZ_6TAP_FILT(src0, src0, mask0, mask1, mask2, filt_hz0, filt_hz1, + hz_out0 = horiz_6tap_filt(src0, src0, mask0, mask1, mask2, filt_hz0, filt_hz1, filt_hz2); - hz_out1 = HORIZ_6TAP_FILT(src1, src1, mask0, mask1, mask2, filt_hz0, filt_hz1, + hz_out1 = horiz_6tap_filt(src1, src1, mask0, mask1, mask2, filt_hz0, filt_hz1, filt_hz2); - hz_out2 = HORIZ_6TAP_FILT(src2, src2, mask0, mask1, mask2, filt_hz0, filt_hz1, + hz_out2 = horiz_6tap_filt(src2, src2, mask0, mask1, mask2, filt_hz0, filt_hz1, filt_hz2); - hz_out3 = HORIZ_6TAP_FILT(src3, src3, mask0, mask1, mask2, filt_hz0, filt_hz1, + hz_out3 = horiz_6tap_filt(src3, src3, mask0, mask1, mask2, filt_hz0, filt_hz1, filt_hz2); - hz_out4 = HORIZ_6TAP_FILT(src4, src4, mask0, mask1, mask2, filt_hz0, filt_hz1, + hz_out4 = horiz_6tap_filt(src4, src4, mask0, mask1, mask2, filt_hz0, filt_hz1, filt_hz2); filt = __lsx_vld(filter_vert, 0); DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt_vt0, filt_vt1); @@ -734,25 +734,25 @@ static void common_hv_6ht_6vt_8w_lsx(uint8_t *RESTRICT src, int32_t src_stride, DUP4_ARG2(__lsx_vxori_b, src5, 128, src6, 128, src7, 128, src8, 128, src5, src6, src7, src8); - hz_out5 = HORIZ_6TAP_FILT(src5, src5, mask0, mask1, mask2, filt_hz0, + hz_out5 = horiz_6tap_filt(src5, src5, mask0, mask1, mask2, filt_hz0, filt_hz1, filt_hz2); out2 = __lsx_vpackev_b(hz_out5, hz_out4); - tmp0 = DPADD_H3(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2); + tmp0 = dpadd_h3(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2); - hz_out6 = HORIZ_6TAP_FILT(src6, src6, mask0, mask1, mask2, filt_hz0, + hz_out6 = horiz_6tap_filt(src6, src6, mask0, mask1, mask2, filt_hz0, filt_hz1, filt_hz2); out5 = __lsx_vpackev_b(hz_out6, hz_out5); - tmp1 = DPADD_H3(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2); + tmp1 = dpadd_h3(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2); - hz_out7 = HORIZ_6TAP_FILT(src7, src7, mask0, mask1, mask2, filt_hz0, + hz_out7 = horiz_6tap_filt(src7, src7, mask0, mask1, mask2, filt_hz0, filt_hz1, filt_hz2); out7 = __lsx_vpackev_b(hz_out7, hz_out6); - tmp2 = DPADD_H3(out1, out2, out7, filt_vt0, filt_vt1, filt_vt2); + tmp2 = dpadd_h3(out1, out2, out7, filt_vt0, filt_vt1, filt_vt2); - hz_out8 = HORIZ_6TAP_FILT(src8, src8, mask0, mask1, mask2, filt_hz0, + hz_out8 = horiz_6tap_filt(src8, src8, mask0, mask1, mask2, filt_hz0, filt_hz1, filt_hz2); out6 = __lsx_vpackev_b(hz_out8, hz_out7); - tmp3 = DPADD_H3(out4, out5, out6, filt_vt0, filt_vt1, filt_vt2); + tmp3 = dpadd_h3(out4, out5, out6, filt_vt0, filt_vt1, filt_vt2); DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, VP8_FILTER_SHIFT, tmp3, tmp2, VP8_FILTER_SHIFT, vec0, vec1); @@ -997,14 +997,14 @@ static void common_vt_4t_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride, DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, src32_r, src43_r); src4332 = __lsx_vilvl_d(src43_r, src32_r); src4332 = __lsx_vxori_b(src4332, 128); - out0 = FILT_4TAP_DPADD_H(src2110, src4332, filt0, filt1); + out0 = filt_4tap_dpadd_h(src2110, src4332, filt0, filt1); src2 = __lsx_vld(src, 0); src += src_stride; DUP2_ARG2(__lsx_vilvl_b, src5, src4, src2, src5, src54_r, src65_r); src2110 = __lsx_vilvl_d(src65_r, src54_r); src2110 = __lsx_vxori_b(src2110, 128); - out1 = FILT_4TAP_DPADD_H(src4332, src2110, filt0, filt1); + out1 = filt_4tap_dpadd_h(src4332, src2110, filt0, filt1); out0 = __lsx_vssrarni_b_h(out1, out0, VP8_FILTER_SHIFT); out0 = __lsx_vxori_b(out0, 128); @@ -1055,10 +1055,10 @@ static void common_vt_4t_8w_lsx(uint8_t *RESTRICT src, int32_t src_stride, src8, src9, src10); DUP4_ARG2(__lsx_vilvl_b, src7, src2, src8, src7, src9, src8, src10, src9, src72_r, src87_r, src98_r, src109_r); - out0_r = FILT_4TAP_DPADD_H(src10_r, src72_r, filt0, filt1); - out1_r = FILT_4TAP_DPADD_H(src21_r, src87_r, filt0, filt1); - out2_r = FILT_4TAP_DPADD_H(src72_r, src98_r, filt0, filt1); - out3_r = FILT_4TAP_DPADD_H(src87_r, src109_r, filt0, filt1); + out0_r = filt_4tap_dpadd_h(src10_r, src72_r, filt0, filt1); + out1_r = filt_4tap_dpadd_h(src21_r, src87_r, filt0, filt1); + out2_r = filt_4tap_dpadd_h(src72_r, src98_r, filt0, filt1); + out3_r = filt_4tap_dpadd_h(src87_r, src109_r, filt0, filt1); DUP2_ARG3(__lsx_vssrarni_b_h, out1_r, out0_r, VP8_FILTER_SHIFT, out3_r, out2_r, VP8_FILTER_SHIFT, tmp0, tmp1); DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); @@ -1114,14 +1114,14 @@ static void common_vt_4t_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride, src32_r, src43_r, src54_r, src65_r); DUP4_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, src5, src4, src6, src5, src32_l, src43_l, src54_l, src65_l); - out0_r = FILT_4TAP_DPADD_H(src10_r, src32_r, filt0, filt1); - out1_r = FILT_4TAP_DPADD_H(src21_r, src43_r, filt0, filt1); - out2_r = FILT_4TAP_DPADD_H(src32_r, src54_r, filt0, filt1); - out3_r = FILT_4TAP_DPADD_H(src43_r, src65_r, filt0, filt1); - out0_l = FILT_4TAP_DPADD_H(src10_l, src32_l, filt0, filt1); - out1_l = FILT_4TAP_DPADD_H(src21_l, src43_l, filt0, filt1); - out2_l = FILT_4TAP_DPADD_H(src32_l, src54_l, filt0, filt1); - out3_l = FILT_4TAP_DPADD_H(src43_l, src65_l, filt0, filt1); + out0_r = filt_4tap_dpadd_h(src10_r, src32_r, filt0, filt1); + out1_r = filt_4tap_dpadd_h(src21_r, src43_r, filt0, filt1); + out2_r = filt_4tap_dpadd_h(src32_r, src54_r, filt0, filt1); + out3_r = filt_4tap_dpadd_h(src43_r, src65_r, filt0, filt1); + out0_l = filt_4tap_dpadd_h(src10_l, src32_l, filt0, filt1); + out1_l = filt_4tap_dpadd_h(src21_l, src43_l, filt0, filt1); + out2_l = filt_4tap_dpadd_h(src32_l, src54_l, filt0, filt1); + out3_l = filt_4tap_dpadd_h(src43_l, src65_l, filt0, filt1); DUP4_ARG3(__lsx_vssrarni_b_h, out0_l, out0_r, VP8_FILTER_SHIFT, out1_l, out1_r, VP8_FILTER_SHIFT, out2_l, out2_r, VP8_FILTER_SHIFT, out3_l, out3_r, VP8_FILTER_SHIFT, tmp0, tmp1, tmp2, tmp3); @@ -1168,8 +1168,8 @@ static void common_hv_4ht_4vt_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride, DUP2_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src0, src1); src2 = __lsx_vxori_b(src2, 128); - hz_out0 = HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_hz0, filt_hz1); - hz_out1 = HORIZ_4TAP_FILT(src1, src2, mask0, mask1, filt_hz0, filt_hz1); + hz_out0 = horiz_4tap_filt(src0, src1, mask0, mask1, filt_hz0, filt_hz1); + hz_out1 = horiz_4tap_filt(src1, src2, mask0, mask1, filt_hz0, filt_hz1); vec0 = __lsx_vpackev_b(hz_out1, hz_out0); DUP2_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filt_vt0, @@ -1182,16 +1182,16 @@ static void common_hv_4ht_4vt_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride, src += src_stride_x4; DUP2_ARG2(__lsx_vxori_b, src3, 128, src4, 128, src3, src4); - hz_out3 = HORIZ_4TAP_FILT(src3, src4, mask0, mask1, filt_hz0, filt_hz1); + hz_out3 = horiz_4tap_filt(src3, src4, mask0, mask1, filt_hz0, filt_hz1); hz_out2 = __lsx_vshuf_b(hz_out3, hz_out1, shuff); vec1 = __lsx_vpackev_b(hz_out3, hz_out2); - tmp0 = FILT_4TAP_DPADD_H(vec0, vec1, filt_vt0, filt_vt1); + tmp0 = filt_4tap_dpadd_h(vec0, vec1, filt_vt0, filt_vt1); DUP2_ARG2(__lsx_vxori_b, src5, 128, src6, 128, src5, src6); - hz_out5 = HORIZ_4TAP_FILT(src5, src6, mask0, mask1, filt_hz0, filt_hz1); + hz_out5 = horiz_4tap_filt(src5, src6, mask0, mask1, filt_hz0, filt_hz1); hz_out4 = __lsx_vshuf_b(hz_out5, hz_out3, shuff); vec2 = __lsx_vpackev_b(hz_out5, hz_out4); - tmp1 = FILT_4TAP_DPADD_H(vec1, vec2, filt_vt0, filt_vt1); + tmp1 = filt_4tap_dpadd_h(vec1, vec2, filt_vt0, filt_vt1); tmp0 = __lsx_vssrarni_b_h(tmp1, tmp0, 7); tmp0 = __lsx_vxori_b(tmp0, 128); @@ -1239,9 +1239,9 @@ static inline void common_hv_4ht_4vt_8w_lsx( DUP2_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src0, src1); src2 = __lsx_vxori_b(src2, 128); - hz_out0 = HORIZ_4TAP_FILT(src0, src0, mask0, mask1, filt_hz0, filt_hz1); - hz_out1 = HORIZ_4TAP_FILT(src1, src1, mask0, mask1, filt_hz0, filt_hz1); - hz_out2 = HORIZ_4TAP_FILT(src2, src2, mask0, mask1, filt_hz0, filt_hz1); + hz_out0 = horiz_4tap_filt(src0, src0, mask0, mask1, filt_hz0, filt_hz1); + hz_out1 = horiz_4tap_filt(src1, src1, mask0, mask1, filt_hz0, filt_hz1); + hz_out2 = horiz_4tap_filt(src2, src2, mask0, mask1, filt_hz0, filt_hz1); DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out2, hz_out1, vec0, vec2); filt = __lsx_vld(filter_vert, 0); @@ -1254,21 +1254,21 @@ static inline void common_hv_4ht_4vt_8w_lsx( DUP4_ARG2(__lsx_vxori_b, src3, 128, src4, 128, src5, 128, src6, 128, src3, src4, src5, src6); - hz_out3 = HORIZ_4TAP_FILT(src3, src3, mask0, mask1, filt_hz0, filt_hz1); + hz_out3 = horiz_4tap_filt(src3, src3, mask0, mask1, filt_hz0, filt_hz1); vec1 = __lsx_vpackev_b(hz_out3, hz_out2); - tmp0 = FILT_4TAP_DPADD_H(vec0, vec1, filt_vt0, filt_vt1); + tmp0 = filt_4tap_dpadd_h(vec0, vec1, filt_vt0, filt_vt1); - hz_out0 = HORIZ_4TAP_FILT(src4, src4, mask0, mask1, filt_hz0, filt_hz1); + hz_out0 = horiz_4tap_filt(src4, src4, mask0, mask1, filt_hz0, filt_hz1); vec3 = __lsx_vpackev_b(hz_out0, hz_out3); - tmp1 = FILT_4TAP_DPADD_H(vec2, vec3, filt_vt0, filt_vt1); + tmp1 = filt_4tap_dpadd_h(vec2, vec3, filt_vt0, filt_vt1); - hz_out1 = HORIZ_4TAP_FILT(src5, src5, mask0, mask1, filt_hz0, filt_hz1); + hz_out1 = horiz_4tap_filt(src5, src5, mask0, mask1, filt_hz0, filt_hz1); vec4 = __lsx_vpackev_b(hz_out1, hz_out0); - tmp2 = FILT_4TAP_DPADD_H(vec1, vec4, filt_vt0, filt_vt1); + tmp2 = filt_4tap_dpadd_h(vec1, vec4, filt_vt0, filt_vt1); - hz_out2 = HORIZ_4TAP_FILT(src6, src6, mask0, mask1, filt_hz0, filt_hz1); + hz_out2 = horiz_4tap_filt(src6, src6, mask0, mask1, filt_hz0, filt_hz1); DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out3, hz_out2, hz_out1, vec0, vec1); - tmp3 = FILT_4TAP_DPADD_H(vec0, vec1, filt_vt0, filt_vt1); + tmp3 = filt_4tap_dpadd_h(vec0, vec1, filt_vt0, filt_vt1); DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, out0, out1); DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); @@ -1324,9 +1324,9 @@ static void common_hv_6ht_4vt_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride, DUP2_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src0, src1); src2 = __lsx_vxori_b(src2, 128); - hz_out0 = HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2, filt_hz0, filt_hz1, + hz_out0 = horiz_6tap_filt(src0, src1, mask0, mask1, mask2, filt_hz0, filt_hz1, filt_hz2); - hz_out1 = HORIZ_6TAP_FILT(src1, src2, mask0, mask1, mask2, filt_hz0, filt_hz1, + hz_out1 = horiz_6tap_filt(src1, src2, mask0, mask1, mask2, filt_hz0, filt_hz1, filt_hz2); vec0 = __lsx_vpackev_b(hz_out1, hz_out0); @@ -1341,17 +1341,17 @@ static void common_hv_6ht_4vt_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride, DUP4_ARG2(__lsx_vxori_b, src3, 128, src4, 128, src5, 128, src6, 128, src3, src4, src5, src6); - hz_out3 = HORIZ_6TAP_FILT(src3, src4, mask0, mask1, mask2, filt_hz0, + hz_out3 = horiz_6tap_filt(src3, src4, mask0, mask1, mask2, filt_hz0, filt_hz1, filt_hz2); hz_out2 = __lsx_vshuf_b(hz_out3, hz_out1, shuff); vec1 = __lsx_vpackev_b(hz_out3, hz_out2); - tmp0 = FILT_4TAP_DPADD_H(vec0, vec1, filt_vt0, filt_vt1); + tmp0 = filt_4tap_dpadd_h(vec0, vec1, filt_vt0, filt_vt1); - hz_out5 = HORIZ_6TAP_FILT(src5, src6, mask0, mask1, mask2, filt_hz0, + hz_out5 = horiz_6tap_filt(src5, src6, mask0, mask1, mask2, filt_hz0, filt_hz1, filt_hz2); hz_out4 = __lsx_vshuf_b(hz_out5, hz_out3, shuff); vec2 = __lsx_vpackev_b(hz_out5, hz_out4); - tmp1 = FILT_4TAP_DPADD_H(vec1, vec2, filt_vt0, filt_vt1); + tmp1 = filt_4tap_dpadd_h(vec1, vec2, filt_vt0, filt_vt1); DUP2_ARG3(__lsx_vssrarni_b_h, tmp0, tmp0, 7, tmp1, tmp1, 7, tmp0, tmp1); DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); @@ -1402,11 +1402,11 @@ static inline void common_hv_6ht_4vt_8w_lsx( DUP2_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src0, src1); src2 = __lsx_vxori_b(src2, 128); - hz_out0 = HORIZ_6TAP_FILT(src0, src0, mask0, mask1, mask2, filt_hz0, filt_hz1, + hz_out0 = horiz_6tap_filt(src0, src0, mask0, mask1, mask2, filt_hz0, filt_hz1, filt_hz2); - hz_out1 = HORIZ_6TAP_FILT(src1, src1, mask0, mask1, mask2, filt_hz0, filt_hz1, + hz_out1 = horiz_6tap_filt(src1, src1, mask0, mask1, mask2, filt_hz0, filt_hz1, filt_hz2); - hz_out2 = HORIZ_6TAP_FILT(src2, src2, mask0, mask1, mask2, filt_hz0, filt_hz1, + hz_out2 = horiz_6tap_filt(src2, src2, mask0, mask1, mask2, filt_hz0, filt_hz1, filt_hz2); DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out2, hz_out1, vec0, vec2); @@ -1420,25 +1420,25 @@ static inline void common_hv_6ht_4vt_8w_lsx( DUP4_ARG2(__lsx_vxori_b, src3, 128, src4, 128, src5, 128, src6, 128, src3, src4, src5, src6); - hz_out3 = HORIZ_6TAP_FILT(src3, src3, mask0, mask1, mask2, filt_hz0, + hz_out3 = horiz_6tap_filt(src3, src3, mask0, mask1, mask2, filt_hz0, filt_hz1, filt_hz2); vec1 = __lsx_vpackev_b(hz_out3, hz_out2); - tmp0 = FILT_4TAP_DPADD_H(vec0, vec1, filt_vt0, filt_vt1); + tmp0 = filt_4tap_dpadd_h(vec0, vec1, filt_vt0, filt_vt1); - hz_out0 = HORIZ_6TAP_FILT(src4, src4, mask0, mask1, mask2, filt_hz0, + hz_out0 = horiz_6tap_filt(src4, src4, mask0, mask1, mask2, filt_hz0, filt_hz1, filt_hz2); vec3 = __lsx_vpackev_b(hz_out0, hz_out3); - tmp1 = FILT_4TAP_DPADD_H(vec2, vec3, filt_vt0, filt_vt1); + tmp1 = filt_4tap_dpadd_h(vec2, vec3, filt_vt0, filt_vt1); - hz_out1 = HORIZ_6TAP_FILT(src5, src5, mask0, mask1, mask2, filt_hz0, + hz_out1 = horiz_6tap_filt(src5, src5, mask0, mask1, mask2, filt_hz0, filt_hz1, filt_hz2); vec0 = __lsx_vpackev_b(hz_out1, hz_out0); - tmp2 = FILT_4TAP_DPADD_H(vec1, vec0, filt_vt0, filt_vt1); + tmp2 = filt_4tap_dpadd_h(vec1, vec0, filt_vt0, filt_vt1); - hz_out2 = HORIZ_6TAP_FILT(src6, src6, mask0, mask1, mask2, filt_hz0, + hz_out2 = horiz_6tap_filt(src6, src6, mask0, mask1, mask2, filt_hz0, filt_hz1, filt_hz2); DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out3, hz_out2, hz_out1, vec1, vec2); - tmp3 = FILT_4TAP_DPADD_H(vec1, vec2, filt_vt0, filt_vt1); + tmp3 = filt_4tap_dpadd_h(vec1, vec2, filt_vt0, filt_vt1); DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, out0, out1); DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); @@ -1492,9 +1492,9 @@ static void common_hv_4ht_6vt_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride, DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, src1, src2, src3); src4 = __lsx_vxori_b(src4, 128); - hz_out0 = HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_hz0, filt_hz1); - hz_out2 = HORIZ_4TAP_FILT(src2, src3, mask0, mask1, filt_hz0, filt_hz1); - hz_out3 = HORIZ_4TAP_FILT(src3, src4, mask0, mask1, filt_hz0, filt_hz1); + hz_out0 = horiz_4tap_filt(src0, src1, mask0, mask1, filt_hz0, filt_hz1); + hz_out2 = horiz_4tap_filt(src2, src3, mask0, mask1, filt_hz0, filt_hz1); + hz_out3 = horiz_4tap_filt(src3, src4, mask0, mask1, filt_hz0, filt_hz1); hz_out1 = __lsx_vshuf_b(hz_out2, hz_out0, shuff); DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, out0, out1); @@ -1510,15 +1510,15 @@ static void common_hv_4ht_6vt_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride, src6, src7, src8); src += src_stride_x4; - hz_out5 = HORIZ_4TAP_FILT(src5, src6, mask0, mask1, filt_hz0, filt_hz1); + hz_out5 = horiz_4tap_filt(src5, src6, mask0, mask1, filt_hz0, filt_hz1); hz_out4 = __lsx_vshuf_b(hz_out5, hz_out3, shuff); out2 = __lsx_vpackev_b(hz_out5, hz_out4); - tmp0 = DPADD_H3(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2); + tmp0 = dpadd_h3(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2); - hz_out7 = HORIZ_4TAP_FILT(src7, src8, mask0, mask1, filt_hz0, filt_hz1); + hz_out7 = horiz_4tap_filt(src7, src8, mask0, mask1, filt_hz0, filt_hz1); hz_out6 = __lsx_vshuf_b(hz_out7, hz_out5, shuff); out3 = __lsx_vpackev_b(hz_out7, hz_out6); - tmp1 = DPADD_H3(out1, out2, out3, filt_vt0, filt_vt1, filt_vt2); + tmp1 = dpadd_h3(out1, out2, out3, filt_vt0, filt_vt1, filt_vt2); tmp0 = __lsx_vssrarni_b_h(tmp1, tmp0, 7); tmp0 = __lsx_vxori_b(tmp0, 128); @@ -1571,11 +1571,11 @@ static inline void common_hv_4ht_6vt_8w_lsx( DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, src1, src2, src3); src4 = __lsx_vxori_b(src4, 128); - hz_out0 = HORIZ_4TAP_FILT(src0, src0, mask0, mask1, filt_hz0, filt_hz1); - hz_out1 = HORIZ_4TAP_FILT(src1, src1, mask0, mask1, filt_hz0, filt_hz1); - hz_out2 = HORIZ_4TAP_FILT(src2, src2, mask0, mask1, filt_hz0, filt_hz1); - hz_out3 = HORIZ_4TAP_FILT(src3, src3, mask0, mask1, filt_hz0, filt_hz1); - hz_out4 = HORIZ_4TAP_FILT(src4, src4, mask0, mask1, filt_hz0, filt_hz1); + hz_out0 = horiz_4tap_filt(src0, src0, mask0, mask1, filt_hz0, filt_hz1); + hz_out1 = horiz_4tap_filt(src1, src1, mask0, mask1, filt_hz0, filt_hz1); + hz_out2 = horiz_4tap_filt(src2, src2, mask0, mask1, filt_hz0, filt_hz1); + hz_out3 = horiz_4tap_filt(src3, src3, mask0, mask1, filt_hz0, filt_hz1); + hz_out4 = horiz_4tap_filt(src4, src4, mask0, mask1, filt_hz0, filt_hz1); DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, out0, out1); DUP2_ARG2(__lsx_vpackev_b, hz_out2, hz_out1, hz_out4, hz_out3, out3, out4); @@ -1590,21 +1590,21 @@ static inline void common_hv_4ht_6vt_8w_lsx( DUP4_ARG2(__lsx_vxori_b, src5, 128, src6, 128, src7, 128, src8, 128, src5, src6, src7, src8); - hz_out5 = HORIZ_4TAP_FILT(src5, src5, mask0, mask1, filt_hz0, filt_hz1); + hz_out5 = horiz_4tap_filt(src5, src5, mask0, mask1, filt_hz0, filt_hz1); out2 = __lsx_vpackev_b(hz_out5, hz_out4); - tmp0 = DPADD_H3(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2); + tmp0 = dpadd_h3(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2); - hz_out6 = HORIZ_4TAP_FILT(src6, src6, mask0, mask1, filt_hz0, filt_hz1); + hz_out6 = horiz_4tap_filt(src6, src6, mask0, mask1, filt_hz0, filt_hz1); out5 = __lsx_vpackev_b(hz_out6, hz_out5); - tmp1 = DPADD_H3(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2); + tmp1 = dpadd_h3(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2); - hz_out7 = HORIZ_4TAP_FILT(src7, src7, mask0, mask1, filt_hz0, filt_hz1); + hz_out7 = horiz_4tap_filt(src7, src7, mask0, mask1, filt_hz0, filt_hz1); out6 = __lsx_vpackev_b(hz_out7, hz_out6); - tmp2 = DPADD_H3(out1, out2, out6, filt_vt0, filt_vt1, filt_vt2); + tmp2 = dpadd_h3(out1, out2, out6, filt_vt0, filt_vt1, filt_vt2); - hz_out8 = HORIZ_4TAP_FILT(src8, src8, mask0, mask1, filt_hz0, filt_hz1); + hz_out8 = horiz_4tap_filt(src8, src8, mask0, mask1, filt_hz0, filt_hz1); out7 = __lsx_vpackev_b(hz_out8, hz_out7); - tmp3 = DPADD_H3(out4, out5, out7, filt_vt0, filt_vt1, filt_vt2); + tmp3 = dpadd_h3(out4, out5, out7, filt_vt0, filt_vt1, filt_vt2); DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, vec0, vec1); DUP2_ARG2(__lsx_vxori_b, vec0, 128, vec1, 128, vec0, vec1); __lsx_vstelm_d(vec0, dst, 0, 0); diff --git a/vpx_dsp/loongarch/bitdepth_conversion_lsx.h b/vpx_dsp/loongarch/bitdepth_conversion_lsx.h index 4834f18fc0..b0db1e99c5 100644 --- a/vpx_dsp/loongarch/bitdepth_conversion_lsx.h +++ b/vpx_dsp/loongarch/bitdepth_conversion_lsx.h @@ -16,33 +16,26 @@ #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_util/loongson_intrinsics.h" +static INLINE __m128i load_tran_low(const tran_low_t *s) { #if CONFIG_VP9_HIGHBITDEPTH -#define load_tran_low(s) \ - ({ \ - __m128i res0_m; \ - __m128i v0_m = __lsx_vld(s, 0); \ - __m128i v1_m = __lsx_vld(s + 4, 0); \ - res0_m = __lsx_vsrlni_h_w(v0_m, v1_m, 0); \ - res0_m; \ - }) - -#define store_tran_low(v, s, c) \ - { \ - __m128i v0_m, v1_m; \ - v1_m = __lsx_vexth_w_h(v); \ - v0_m = __lsx_vsllwil_w_h(v, 0); \ - __lsx_vst(v0_m, s + c, 0); \ - __lsx_vst(v1_m, s + c + 4, 0); \ - } + __m128i v0_m = __lsx_vld(s, 0); + __m128i v1_m = __lsx_vld(s + 4, 0); + return __lsx_vsrlni_h_w(v0_m, v1_m, 0); #else -#define load_tran_low(s) \ - ({ \ - __m128i res0_m; \ - res0_m = __lsx_vld(s, 0); \ - res0_m; \ - }) + return __lsx_vld(s, 0); +#endif +} -#define store_tran_low(v, s, c) __lsx_vst(v, s + c, 0) -#endif // CONFIG_VP9_HIGHBITDEPTH +static INLINE void store_tran_low(__m128i v, tran_low_t *s, int32_t c) { +#if CONFIG_VP9_HIGHBITDEPTH + __m128i v0_m, v1_m; + v1_m = __lsx_vexth_w_h(v); + v0_m = __lsx_vsllwil_w_h(v, 0); + __lsx_vst(v0_m, s + c, 0); + __lsx_vst(v1_m, s + c + 4, 0); +#else + __lsx_vst(v, s + c, 0); +#endif +} #endif // VPX_VPX_DSP_LOONGARCH_BITDEPTH_CONVERSION_LSX_H_ diff --git a/vpx_dsp/loongarch/fwd_dct32x32_lsx.c b/vpx_dsp/loongarch/fwd_dct32x32_lsx.c index e5c301b2c1..9bb3877212 100644 --- a/vpx_dsp/loongarch/fwd_dct32x32_lsx.c +++ b/vpx_dsp/loongarch/fwd_dct32x32_lsx.c @@ -13,10 +13,10 @@ #include "vpx_dsp/fwd_txfm.h" #define UNPCK_SH_SW(in, out0, out1) \ - { \ + do { \ out0 = __lsx_vsllwil_w_h(in, 0); \ out1 = __lsx_vexth_w_h(in); \ - } + } while (0) static void fdct8x32_1d_column_load_butterfly(const int16_t *input, int32_t src_stride, diff --git a/vpx_dsp/loongarch/fwd_txfm_lsx.c b/vpx_dsp/loongarch/fwd_txfm_lsx.c index 6f2d4d6fee..508532b9d8 100644 --- a/vpx_dsp/loongarch/fwd_txfm_lsx.c +++ b/vpx_dsp/loongarch/fwd_txfm_lsx.c @@ -12,7 +12,7 @@ #include "vpx_dsp/loongarch/fwd_txfm_lsx.h" #define LSX_TRANSPOSE4x4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \ - { \ + do { \ __m128i _s0, _s1, _s2, _s3, _t0, _t1, _t2, _t3; \ \ DUP2_ARG2(__lsx_vilvl_h, _in2, _in0, _in3, _in1, _s0, _s1); \ @@ -23,7 +23,7 @@ _t3 = __lsx_vilvh_h(_s3, _s2); \ DUP2_ARG2(__lsx_vpickev_d, _t2, _t0, _t3, _t1, _out0, _out2); \ DUP2_ARG2(__lsx_vpickod_d, _t2, _t0, _t3, _t1, _out1, _out3); \ - } + } while (0) #if !CONFIG_VP9_HIGHBITDEPTH void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr, diff --git a/vpx_dsp/loongarch/fwd_txfm_lsx.h b/vpx_dsp/loongarch/fwd_txfm_lsx.h index d04427a6ea..4a9fce9a3d 100644 --- a/vpx_dsp/loongarch/fwd_txfm_lsx.h +++ b/vpx_dsp/loongarch/fwd_txfm_lsx.h @@ -15,7 +15,7 @@ #include "vpx_dsp/txfm_common.h" #define VP9_FDCT4(in0, in1, in2, in3, out0, out1, out2, out3) \ - { \ + do { \ __m128i cnst0_m, cnst1_m, cnst2_m, cnst3_m; \ __m128i vec0_m, vec1_m, vec2_m, vec3_m; \ __m128i vec4_m, vec5_m, vec6_m, vec7_m; \ @@ -38,11 +38,11 @@ DUP4_ARG3(__lsx_vssrarni_h_w, vec4_m, vec4_m, DCT_CONST_BITS, vec5_m, \ vec5_m, DCT_CONST_BITS, vec6_m, vec6_m, DCT_CONST_BITS, vec7_m, \ vec7_m, DCT_CONST_BITS, out0, out2, out1, out3); \ - } + } while (0) #define VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \ out3, out4, out5, out6, out7) \ - { \ + do { \ __m128i s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m; \ __m128i s7_m, x0_m, x1_m, x2_m, x3_m; \ __m128i coeff_m = { 0x187e3b21d2bf2d41, 0x238e35370c7c3ec5 }; \ @@ -97,10 +97,10 @@ x3_m = __lsx_vneg_h(x3_m); \ x2_m = __lsx_vpackev_h(x2_m, x3_m); \ DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m, out3); \ - } + } while (0) #define SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7) \ - { \ + do { \ __m128i vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ \ DUP4_ARG2(__lsx_vsrli_h, in0, 15, in1, 15, in2, 15, in3, 15, vec0_m, \ @@ -111,10 +111,10 @@ in3, in0, in1, in2, in3); \ DUP4_ARG2(__lsx_vavg_h, vec4_m, in4, vec5_m, in5, vec6_m, in6, vec7_m, \ in7, in4, in5, in6, in7); \ - } + } while (0) #define FDCT32_POSTPROC_2V_POS_H(vec0, vec1) \ - { \ + do { \ __m128i tp0_m, tp1_m; \ __m128i one = __lsx_vreplgr2vr_h(1); \ \ @@ -130,10 +130,10 @@ vec1 = __lsx_vadd_h(vec1, tp1_m); \ vec0 = __lsx_vsrai_h(vec0, 2); \ vec1 = __lsx_vsrai_h(vec1, 2); \ - } + } while (0) #define FDCT_POSTPROC_2V_NEG_H(vec0, vec1) \ - { \ + do { \ __m128i tp0_m, tp1_m; \ __m128i one_m = __lsx_vldi(0x401); \ \ @@ -147,10 +147,10 @@ vec1 = __lsx_vadd_h(vec1, tp1_m); \ vec0 = __lsx_vsrai_h(vec0, 2); \ vec1 = __lsx_vsrai_h(vec1, 2); \ - } + } while (0) #define FDCT32_POSTPROC_NEG_W(vec) \ - { \ + do { \ __m128i temp_m; \ __m128i one_m = __lsx_vreplgr2vr_w(1); \ \ @@ -159,11 +159,11 @@ temp_m = __lsx_vand_v(one_m, temp_m); \ vec = __lsx_vadd_w(vec, temp_m); \ vec = __lsx_vsrai_w(vec, 2); \ - } + } while (0) #define DOTP_CONST_PAIR_W(reg0_left, reg1_left, reg0_right, reg1_right, \ const0, const1, out0, out1, out2, out3) \ - { \ + do { \ __m128i s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m; \ __m128i tp0_m, tp1_m, tp2_m, tp3_m, _tmp0, _tmp1; \ __m128i k0_m = __lsx_vreplgr2vr_w((int32_t)const0); \ @@ -188,11 +188,11 @@ DUP2_ARG2(__lsx_vdp2_d_w, s6_m, k0_m, s7_m, k0_m, tp2_m, tp3_m); \ DUP2_ARG3(__lsx_vssrarni_w_d, tp0_m, tp1_m, DCT_CONST_BITS, tp2_m, tp3_m, \ DCT_CONST_BITS, out2, out3); \ - } + } while (0) #define VP9_ADDBLK_ST8x4_UB(dst, _stride, _stride2, _stride3, in0, in1, in2, \ in3) \ - { \ + do { \ __m128i dst0_m, dst1_m, dst2_m, dst3_m; \ __m128i tmp0_m, tmp1_m; \ __m128i res0_m, res1_m, res2_m, res3_m; \ @@ -210,11 +210,11 @@ __lsx_vstelm_d(tmp0_m, dst + _stride, 0, 1); \ __lsx_vstelm_d(tmp1_m, dst + _stride2, 0, 0); \ __lsx_vstelm_d(tmp1_m, dst + _stride3, 0, 1); \ - } + } while (0) #define FDCT8x16_EVEN(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ out2, out3, out4, out5, out6, out7) \ - { \ + do { \ __m128i s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m; \ __m128i x0_m, x1_m, x2_m, x3_m; \ __m128i coeff_m = { 0x187e3b21d2bf2d41, 0x238e35370c7c3ec5 }; \ @@ -270,12 +270,12 @@ x3_m = __lsx_vneg_h(x3_m); \ x2_m = __lsx_vpackev_h(x2_m, x3_m); \ DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m, out3); \ - } + } while (0) #define FDCT8x16_ODD(input0, input1, input2, input3, input4, input5, input6, \ input7, out1, out3, out5, out7, out9, out11, out13, \ out15) \ - { \ + do { \ __m128i stp21_m, stp22_m, stp23_m, stp24_m, stp25_m, stp26_m; \ __m128i stp30_m, stp31_m, stp32_m, stp33_m, stp34_m, stp35_m; \ __m128i stp36_m, stp37_m, vec0_m, vec1_m; \ @@ -373,7 +373,7 @@ cnst1_m = __lsx_vreplvei_h(coeff2_m, 3); \ cnst0_m = __lsx_vpackev_h(cnst0_m, cnst1_m); \ DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m, out3); \ - } + } while (0) void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr, int32_t src_stride); diff --git a/vpx_dsp/loongarch/idct32x32_lsx.c b/vpx_dsp/loongarch/idct32x32_lsx.c index d6890c28e1..ec07f57d90 100644 --- a/vpx_dsp/loongarch/idct32x32_lsx.c +++ b/vpx_dsp/loongarch/idct32x32_lsx.c @@ -12,10 +12,10 @@ #include "vpx_dsp/loongarch/fwd_txfm_lsx.h" #define UNPCK_UB_SH(_in, _out0, _out1) \ - { \ + do { \ _out0 = __lsx_vsllwil_hu_bu(_in, 0); \ _out1 = __lsx_vexth_hu_bu(_in); \ - } + } while (0) static void idct32x8_row_transpose_store(const int16_t *input, int16_t *tmp_buf) { diff --git a/vpx_dsp/loongarch/loopfilter_16_lsx.c b/vpx_dsp/loongarch/loopfilter_16_lsx.c index cbaefcd6e0..539817777d 100644 --- a/vpx_dsp/loongarch/loopfilter_16_lsx.c +++ b/vpx_dsp/loongarch/loopfilter_16_lsx.c @@ -15,7 +15,7 @@ #define LSX_LD_8(_src, _stride, _stride2, _stride3, _stride4, _in0, _in1, \ _in2, _in3, _in4, _in5, _in6, _in7) \ - { \ + do { \ _in0 = __lsx_vld(_src, 0); \ _in1 = __lsx_vldx(_src, _stride); \ _in2 = __lsx_vldx(_src, _stride2); \ @@ -25,11 +25,11 @@ _in5 = __lsx_vldx(_src, _stride); \ _in6 = __lsx_vldx(_src, _stride2); \ _in7 = __lsx_vldx(_src, _stride3); \ - } + } while (0) #define LSX_ST_8(_dst0, _dst1, _dst2, _dst3, _dst4, _dst5, _dst6, _dst7, _dst, \ _stride, _stride2, _stride3, _stride4) \ - { \ + do { \ __lsx_vst(_dst0, _dst, 0); \ __lsx_vstx(_dst1, _dst, _stride); \ __lsx_vstx(_dst2, _dst, _stride2); \ @@ -39,7 +39,7 @@ __lsx_vstx(_dst5, _dst, _stride); \ __lsx_vstx(_dst6, _dst, _stride2); \ __lsx_vstx(_dst7, _dst, _stride3); \ - } + } while (0) static int32_t hz_lpf_t4_and_t8_16w(uint8_t *dst, int32_t stride, uint8_t *filter48, diff --git a/vpx_dsp/loongarch/loopfilter_lsx.h b/vpx_dsp/loongarch/loopfilter_lsx.h index 53e15fe6d5..1c43836503 100644 --- a/vpx_dsp/loongarch/loopfilter_lsx.h +++ b/vpx_dsp/loongarch/loopfilter_lsx.h @@ -16,7 +16,7 @@ #define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \ limit_in, b_limit_in, thresh_in, hev_out, mask_out, \ flat_out) \ - { \ + do { \ __m128i p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m; \ __m128i p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m; \ \ @@ -47,10 +47,10 @@ \ mask_out = __lsx_vslt_bu(limit_in, mask_out); \ mask_out = __lsx_vxori_b(mask_out, 0xff); \ - } + } while (0) #define VP9_FLAT4(p3_in, p2_in, p0_in, q0_in, q2_in, q3_in, flat_out) \ - { \ + do { \ __m128i p2_asub_p0, q2_asub_q0, p3_asub_p0, q3_asub_q0; \ __m128i flat4_tmp = __lsx_vldi(1); \ \ @@ -64,11 +64,11 @@ flat_out = __lsx_vslt_bu(flat4_tmp, flat_out); \ flat_out = __lsx_vxori_b(flat_out, 0xff); \ flat_out = flat_out & (mask); \ - } + } while (0) #define VP9_FLAT5(p7_in, p6_in, p5_in, p4_in, p0_in, q0_in, q4_in, q5_in, \ q6_in, q7_in, flat_in, flat2_out) \ - { \ + do { \ __m128i flat5_tmp = __lsx_vldi(1); \ __m128i p4_asub_p0, q4_asub_q0, p5_asub_p0, q5_asub_q0; \ __m128i p6_asub_p0, q6_asub_q0, p7_asub_p0, q7_asub_q0; \ @@ -87,11 +87,11 @@ flat2_out = __lsx_vslt_bu(flat5_tmp, flat2_out); \ flat2_out = __lsx_vxori_b(flat2_out, 0xff); \ flat2_out = flat2_out & flat_in; \ - } + } while (0) #define VP9_LPF_FILTER4_4W(p1_in, p0_in, q0_in, q1_in, mask, hev, p1_out, \ p0_out, q0_out, q1_out) \ - { \ + do { \ __m128i p1_m, p0_m, q0_m, q1_m, filt, q0_sub_p0, t1, t2; \ const __m128i cnst4b = __lsx_vldi(4); \ const __m128i cnst3b = __lsx_vldi(3); \ @@ -118,12 +118,12 @@ q1_m = __lsx_vssub_b(q1_m, filt); \ p1_m = __lsx_vsadd_b(p1_m, filt); \ DUP2_ARG2(__lsx_vxori_b, q1_m, 0x80, p1_m, 0x80, q1_out, p1_out); \ - } + } while (0) #define VP9_FILTER8(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \ p2_filt8_out, p1_filt8_out, p0_filt8_out, q0_filt8_out, \ q1_filt8_out, q2_filt8_out) \ - { \ + do { \ __m128i tmp_filt8_0, tmp_filt8_1, tmp_filt8_2; \ \ tmp_filt8_2 = __lsx_vadd_h(p2_in, p1_in); \ @@ -162,6 +162,6 @@ tmp_filt8_0 = __lsx_vadd_h(q1_in, q3_in); \ tmp_filt8_1 = __lsx_vadd_h(tmp_filt8_0, tmp_filt8_1); \ q1_filt8_out = __lsx_vsrari_h(tmp_filt8_1, 3); \ - } + } while (0) #endif // VPX_VPX_DSP_LOONGARCH_LOOPFILTER_LSX_H_ diff --git a/vpx_dsp/loongarch/quantize_lsx.c b/vpx_dsp/loongarch/quantize_lsx.c index e3fbb9e9e0..2fc33b06b7 100644 --- a/vpx_dsp/loongarch/quantize_lsx.c +++ b/vpx_dsp/loongarch/quantize_lsx.c @@ -12,79 +12,83 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_util/loongson_intrinsics.h" -#define CALCULATE_QCOEFF(coeff, coeff_abs, round, quant, shift, cmp_mask) \ - ({ \ - __m128i rounded, qcoeff; \ - \ - rounded = __lsx_vsadd_h(coeff_abs, round); \ - qcoeff = __lsx_vmuh_h(rounded, quant); \ - qcoeff = __lsx_vadd_h(rounded, qcoeff); \ - qcoeff = __lsx_vmuh_h(qcoeff, shift); \ - qcoeff = __lsx_vsigncov_h(coeff, qcoeff); \ - qcoeff = __lsx_vand_v(qcoeff, cmp_mask); \ - \ - qcoeff; \ - }) - -#define CALCULATE_DQCOEFF_AND_STORE(qcoeff, dequant, dqcoeff) \ - { \ - __m128i dqcoeff16 = __lsx_vmul_h(qcoeff, dequant); \ - __lsx_vst(dqcoeff16, dqcoeff, 0); \ - } +static INLINE __m128i calculate_qcoeff(__m128i coeff, __m128i coeff_abs, + __m128i round, __m128i quant, + __m128i shift, __m128i cmp_mask) { + __m128i rounded, qcoeff; + + rounded = __lsx_vsadd_h(coeff_abs, round); + qcoeff = __lsx_vmuh_h(rounded, quant); + qcoeff = __lsx_vadd_h(rounded, qcoeff); + qcoeff = __lsx_vmuh_h(qcoeff, shift); + qcoeff = __lsx_vsigncov_h(coeff, qcoeff); + qcoeff = __lsx_vand_v(qcoeff, cmp_mask); + + return qcoeff; +} -#define CALCULATE_DQCOEFF_AND_STORE_32x32(qcoeff, dequant, dqcoeff) \ - { \ - __m128i low, high, dqcoeff32_0, dqcoeff32_1, res; \ - __m128i zero = __lsx_vldi(0); \ - __m128i coeff = __lsx_vabsd_h(qcoeff, zero); \ - \ - __m128i sign_0 = __lsx_vilvl_h(qcoeff, zero); \ - __m128i sign_1 = __lsx_vilvh_h(qcoeff, zero); \ - \ - low = __lsx_vmul_h(coeff, dequant); \ - high = __lsx_vmuh_h(coeff, dequant); \ - dqcoeff32_0 = __lsx_vilvl_h(high, low); \ - dqcoeff32_1 = __lsx_vilvh_h(high, low); \ - \ - dqcoeff32_0 = __lsx_vsrai_w(dqcoeff32_0, 1); \ - dqcoeff32_1 = __lsx_vsrai_w(dqcoeff32_1, 1); \ - dqcoeff32_0 = __lsx_vsigncov_w(sign_0, dqcoeff32_0); \ - dqcoeff32_1 = __lsx_vsigncov_w(sign_1, dqcoeff32_1); \ - res = __lsx_vpickev_h(dqcoeff32_1, dqcoeff32_0); \ - __lsx_vst(res, dqcoeff, 0); \ - } +static INLINE void calculate_dqcoeff_and_store(__m128i qcoeff, __m128i dequant, + int16_t *dqcoeff) { + __m128i dqcoeff16 = __lsx_vmul_h(qcoeff, dequant); + __lsx_vst(dqcoeff16, dqcoeff, 0); +} + +static INLINE void calculate_dqcoeff_and_store_32x32(__m128i qcoeff, + __m128i dequant, + int16_t *dqcoeff) { + // Un-sign to bias rounding like C. + __m128i low, high, dqcoeff32_0, dqcoeff32_1, res; + __m128i zero = __lsx_vldi(0); + __m128i coeff = __lsx_vabsd_h(qcoeff, zero); + + const __m128i sign_0 = __lsx_vilvl_h(qcoeff, zero); + const __m128i sign_1 = __lsx_vilvh_h(qcoeff, zero); + + low = __lsx_vmul_h(coeff, dequant); + high = __lsx_vmuh_h(coeff, dequant); + dqcoeff32_0 = __lsx_vilvl_h(high, low); + dqcoeff32_1 = __lsx_vilvh_h(high, low); + + // "Divide" by 2. + dqcoeff32_0 = __lsx_vsrai_w(dqcoeff32_0, 1); + dqcoeff32_1 = __lsx_vsrai_w(dqcoeff32_1, 1); + dqcoeff32_0 = __lsx_vsigncov_w(sign_0, dqcoeff32_0); + dqcoeff32_1 = __lsx_vsigncov_w(sign_1, dqcoeff32_1); + res = __lsx_vpickev_h(dqcoeff32_1, dqcoeff32_0); + __lsx_vst(res, dqcoeff, 0); +} + +static INLINE __m128i scan_for_eob(__m128i coeff0, __m128i coeff1, + __m128i zbin_mask0, __m128i zbin_mask1, + const int16_t *scan, int index, + __m128i zero) { + const __m128i zero_coeff0 = __lsx_vseq_h(coeff0, zero); + const __m128i zero_coeff1 = __lsx_vseq_h(coeff1, zero); + __m128i scan0 = __lsx_vld(scan + index, 0); + __m128i scan1 = __lsx_vld(scan + index + 8, 0); + __m128i eob0, eob1; + + scan0 = __lsx_vsub_h(scan0, zbin_mask0); + scan1 = __lsx_vsub_h(scan1, zbin_mask1); + eob0 = __lsx_vandn_v(zero_coeff0, scan0); + eob1 = __lsx_vandn_v(zero_coeff1, scan1); + return __lsx_vmax_h(eob0, eob1); +} -#define SCAN_FOR_EOB(coeff0, coeff1, zbin_mask0, zbin_mask1, scan, index, \ - zero) \ - ({ \ - __m128i zero_coeff0 = __lsx_vseq_h(coeff0, zero); \ - __m128i zero_coeff1 = __lsx_vseq_h(coeff1, zero); \ - __m128i scan0 = __lsx_vld(scan + index, 0); \ - __m128i scan1 = __lsx_vld(scan + index + 8, 0); \ - __m128i eob0, eob1, eob_max; \ - \ - scan0 = __lsx_vsub_h(scan0, zbin_mask0); \ - scan1 = __lsx_vsub_h(scan1, zbin_mask1); \ - eob0 = __lsx_vandn_v(zero_coeff0, scan0); \ - eob1 = __lsx_vandn_v(zero_coeff1, scan1); \ - eob_max = __lsx_vmax_h(eob0, eob1); \ - eob_max; \ - }) - -#define ACCUMULATE_EOB(eob) \ - ({ \ - __m128i eob_shuffled; \ - int16_t res_m; \ - \ - eob_shuffled = __lsx_vshuf4i_w(eob, 0xe); \ - eob = __lsx_vmax_h(eob, eob_shuffled); \ - eob_shuffled = __lsx_vshuf4i_h(eob, 0xe); \ - eob = __lsx_vmax_h(eob, eob_shuffled); \ - eob_shuffled = __lsx_vshuf4i_h(eob, 0x1); \ - eob = __lsx_vmax_h(eob, eob_shuffled); \ - res_m = __lsx_vpickve2gr_h(eob, 1); \ - res_m; \ - }) +static INLINE int16_t accumulate_eob(__m128i eob) { + __m128i eob_shuffled; + int16_t res_m; + + eob_shuffled = __lsx_vshuf4i_w(eob, 0xe); + eob = __lsx_vmax_h(eob, eob_shuffled); + eob_shuffled = __lsx_vshuf4i_h(eob, 0xe); + eob = __lsx_vmax_h(eob, eob_shuffled); + eob_shuffled = __lsx_vshuf4i_h(eob, 0x1); + eob = __lsx_vmax_h(eob, eob_shuffled); + res_m = __lsx_vpickve2gr_h(eob, 1); + + return res_m; +} #if !CONFIG_VP9_HIGHBITDEPTH void vpx_quantize_b_lsx(const int16_t *coeff_ptr, intptr_t n_coeffs, @@ -120,21 +124,21 @@ void vpx_quantize_b_lsx(const int16_t *coeff_ptr, intptr_t n_coeffs, cmp_mask1 = __lsx_vsle_h(zbin, qcoeff1); qcoeff0 = - CALCULATE_QCOEFF(coeff0, qcoeff0, round, quant, quant_shift, cmp_mask0); + calculate_qcoeff(coeff0, qcoeff0, round, quant, quant_shift, cmp_mask0); round = __lsx_vilvh_d(round, round); quant = __lsx_vilvh_d(quant, quant); quant_shift = __lsx_vilvh_d(quant_shift, quant_shift); qcoeff1 = - CALCULATE_QCOEFF(coeff1, qcoeff1, round, quant, quant_shift, cmp_mask1); + calculate_qcoeff(coeff1, qcoeff1, round, quant, quant_shift, cmp_mask1); __lsx_vst(qcoeff0, qcoeff_ptr, 0); __lsx_vst(qcoeff1, qcoeff_ptr, 16); - CALCULATE_DQCOEFF_AND_STORE(qcoeff0, dequant, dqcoeff_ptr); + calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr); dequant = __lsx_vilvh_d(dequant, dequant); - CALCULATE_DQCOEFF_AND_STORE(qcoeff1, dequant, dqcoeff_ptr + 8); + calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + 8); - eob = SCAN_FOR_EOB(qcoeff0, qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); + eob = scan_for_eob(qcoeff0, qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); // AC only loop. while (index < n_coeffs) { coeff0 = __lsx_vld(coeff_ptr + index, 0); @@ -147,24 +151,24 @@ void vpx_quantize_b_lsx(const int16_t *coeff_ptr, intptr_t n_coeffs, cmp_mask1 = __lsx_vsle_h(zbin, qcoeff1); qcoeff0 = - CALCULATE_QCOEFF(coeff0, qcoeff0, round, quant, quant_shift, cmp_mask0); + calculate_qcoeff(coeff0, qcoeff0, round, quant, quant_shift, cmp_mask0); qcoeff1 = - CALCULATE_QCOEFF(coeff1, qcoeff1, round, quant, quant_shift, cmp_mask1); + calculate_qcoeff(coeff1, qcoeff1, round, quant, quant_shift, cmp_mask1); __lsx_vst(qcoeff0, qcoeff_ptr + index, 0); __lsx_vst(qcoeff1, qcoeff_ptr + index + 8, 0); - CALCULATE_DQCOEFF_AND_STORE(qcoeff0, dequant, dqcoeff_ptr + index); - CALCULATE_DQCOEFF_AND_STORE(qcoeff1, dequant, dqcoeff_ptr + index + 8); + calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr + index); + calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + index + 8); - eob0 = SCAN_FOR_EOB(qcoeff0, qcoeff1, cmp_mask0, cmp_mask1, iscan, index, + eob0 = scan_for_eob(qcoeff0, qcoeff1, cmp_mask0, cmp_mask1, iscan, index, zero); eob = __lsx_vmax_h(eob, eob0); index += 16; } - *eob_ptr = ACCUMULATE_EOB(eob); + *eob_ptr = accumulate_eob(eob); } void vpx_quantize_b_32x32_lsx(const int16_t *coeff_ptr, intptr_t n_coeffs, @@ -204,20 +208,20 @@ void vpx_quantize_b_32x32_lsx(const int16_t *coeff_ptr, intptr_t n_coeffs, cmp_mask1 = __lsx_vsle_h(zbin, qcoeff1); qcoeff0 = - CALCULATE_QCOEFF(coeff0, qcoeff0, round, quant, quant_shift, cmp_mask0); + calculate_qcoeff(coeff0, qcoeff0, round, quant, quant_shift, cmp_mask0); // remove DC in quant_shift, quant, quant_shift round = __lsx_vilvh_d(round, round); quant = __lsx_vilvh_d(quant, quant); quant_shift = __lsx_vilvh_d(quant_shift, quant_shift); qcoeff1 = - CALCULATE_QCOEFF(coeff1, qcoeff1, round, quant, quant_shift, cmp_mask1); + calculate_qcoeff(coeff1, qcoeff1, round, quant, quant_shift, cmp_mask1); __lsx_vst(qcoeff0, qcoeff_ptr, 0); __lsx_vst(qcoeff1, qcoeff_ptr, 16); - CALCULATE_DQCOEFF_AND_STORE_32x32(qcoeff0, dequant, dqcoeff_ptr); + calculate_dqcoeff_and_store_32x32(qcoeff0, dequant, dqcoeff_ptr); dequant = __lsx_vilvh_d(dequant, dequant); - CALCULATE_DQCOEFF_AND_STORE_32x32(qcoeff1, dequant, dqcoeff_ptr + 8); - eob = SCAN_FOR_EOB(qcoeff0, qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); + calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, dqcoeff_ptr + 8); + eob = scan_for_eob(qcoeff0, qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); // AC only loop. for (index = 16; index < 32 * 32; index += 16) { coeff0 = __lsx_vld(coeff_ptr + index, 0); @@ -230,20 +234,20 @@ void vpx_quantize_b_32x32_lsx(const int16_t *coeff_ptr, intptr_t n_coeffs, cmp_mask1 = __lsx_vsle_h(zbin, qcoeff1); qcoeff0 = - CALCULATE_QCOEFF(coeff0, qcoeff0, round, quant, quant_shift, cmp_mask0); + calculate_qcoeff(coeff0, qcoeff0, round, quant, quant_shift, cmp_mask0); qcoeff1 = - CALCULATE_QCOEFF(coeff1, qcoeff1, round, quant, quant_shift, cmp_mask1); + calculate_qcoeff(coeff1, qcoeff1, round, quant, quant_shift, cmp_mask1); __lsx_vst(qcoeff0, qcoeff_ptr + index, 0); __lsx_vst(qcoeff1, qcoeff_ptr + index + 8, 0); - CALCULATE_DQCOEFF_AND_STORE_32x32(qcoeff0, dequant, dqcoeff_ptr + index); - CALCULATE_DQCOEFF_AND_STORE_32x32(qcoeff1, dequant, + calculate_dqcoeff_and_store_32x32(qcoeff0, dequant, dqcoeff_ptr + index); + calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, dqcoeff_ptr + 8 + index); - eob0 = SCAN_FOR_EOB(qcoeff0, qcoeff1, cmp_mask0, cmp_mask1, iscan, index, + eob0 = scan_for_eob(qcoeff0, qcoeff1, cmp_mask0, cmp_mask1, iscan, index, zero); eob = __lsx_vmax_h(eob, eob0); } - *eob_ptr = ACCUMULATE_EOB(eob); + *eob_ptr = accumulate_eob(eob); } -#endif // !CONFIG_VP9_HIGHBITDEPTH +#endif diff --git a/vpx_dsp/loongarch/sad_lsx.c b/vpx_dsp/loongarch/sad_lsx.c index 5eaebfb518..b6fbedb0d0 100644 --- a/vpx_dsp/loongarch/sad_lsx.c +++ b/vpx_dsp/loongarch/sad_lsx.c @@ -8,59 +8,63 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "vpx_util/loongson_intrinsics.h" -#define SAD_UB2_UH(in0, in1, ref0, ref1) \ - ({ \ - __m128i diff0_m, diff1_m, sad_m0; \ - __m128i sad_m = __lsx_vldi(0); \ - \ - diff0_m = __lsx_vabsd_bu(in0, ref0); \ - diff1_m = __lsx_vabsd_bu(in1, ref1); \ - \ - sad_m0 = __lsx_vhaddw_hu_bu(diff0_m, diff0_m); \ - sad_m = __lsx_vadd_h(sad_m, sad_m0); \ - sad_m0 = __lsx_vhaddw_hu_bu(diff1_m, diff1_m); \ - sad_m = __lsx_vadd_h(sad_m, sad_m0); \ - \ - sad_m; \ - }) - -#define HADD_UW_U32(in) \ - ({ \ - __m128i res0_m; \ - uint32_t sum_m; \ - res0_m = __lsx_vhaddw_du_wu(in, in); \ - res0_m = __lsx_vhaddw_qu_du(res0_m, res0_m); \ - sum_m = __lsx_vpickve2gr_w(res0_m, 0); \ - sum_m; \ - }) - -#define HADD_UH_U32(in) \ - ({ \ - __m128i res_m; \ - uint32_t sum_m; \ - res_m = __lsx_vhaddw_wu_hu(in, in); \ - sum_m = HADD_UW_U32(res_m); \ - sum_m; \ - }) - -#define HADD_SW_S32(in) \ - ({ \ - __m128i res0_m; \ - int32_t sum_m; \ - \ - res0_m = __lsx_vhaddw_d_w(in, in); \ - res0_m = __lsx_vhaddw_q_d(res0_m, res0_m); \ - sum_m = __lsx_vpickve2gr_w(res0_m, 0); \ - sum_m; \ - }) +static INLINE __m128i sad_ub2_uh(__m128i in0, __m128i in1, __m128i ref0, + __m128i ref1) { + __m128i diff0_m, diff1_m, sad_m0; + __m128i sad_m = __lsx_vldi(0); + + diff0_m = __lsx_vabsd_bu(in0, ref0); + diff1_m = __lsx_vabsd_bu(in1, ref1); + + sad_m0 = __lsx_vhaddw_hu_bu(diff0_m, diff0_m); + sad_m = __lsx_vadd_h(sad_m, sad_m0); + sad_m0 = __lsx_vhaddw_hu_bu(diff1_m, diff1_m); + sad_m = __lsx_vadd_h(sad_m, sad_m0); + + return sad_m; +} + +static INLINE uint32_t hadd_uw_u32(__m128i in) { + __m128i res0_m; + uint32_t sum_m; + + res0_m = __lsx_vhaddw_du_wu(in, in); + res0_m = __lsx_vhaddw_qu_du(res0_m, res0_m); + sum_m = __lsx_vpickve2gr_w(res0_m, 0); + + return sum_m; +} + +static INLINE uint32_t hadd_uh_u32(__m128i in) { + __m128i res_m; + uint32_t sum_m; + + res_m = __lsx_vhaddw_wu_hu(in, in); + sum_m = hadd_uw_u32(res_m); + + return sum_m; +} + +static INLINE int32_t hadd_sw_s32(__m128i in) { + __m128i res0_m; + int32_t sum_m; + + res0_m = __lsx_vhaddw_d_w(in, in); + res0_m = __lsx_vhaddw_q_d(res0_m, res0_m); + sum_m = __lsx_vpickve2gr_w(res0_m, 0); + + return sum_m; +} static uint32_t sad_8width_lsx(const uint8_t *src, int32_t src_stride, const uint8_t *ref, int32_t ref_stride, int32_t height) { int32_t ht_cnt; + uint32_t res; __m128i src0, src1, src2, src3, ref0, ref1, ref2, ref3, sad_tmp; __m128i sad = __lsx_vldi(0); @@ -79,16 +83,18 @@ static uint32_t sad_8width_lsx(const uint8_t *src, int32_t src_stride, ref += ref_stride; DUP4_ARG2(__lsx_vpickev_d, src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1, ref0, ref1); - sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); sad = __lsx_vadd_h(sad, sad_tmp); } - return HADD_UH_U32(sad); + res = hadd_uh_u32(sad); + return res; } static uint32_t sad_16width_lsx(const uint8_t *src, int32_t src_stride, const uint8_t *ref, int32_t ref_stride, int32_t height) { int32_t ht_cnt = (height >> 2); + uint32_t res; __m128i src0, src1, ref0, ref1, sad_tmp; __m128i sad = __lsx_vldi(0); int32_t src_stride2 = src_stride << 1; @@ -99,23 +105,26 @@ static uint32_t sad_16width_lsx(const uint8_t *src, int32_t src_stride, DUP2_ARG2(__lsx_vldx, src, src_stride, ref, ref_stride, src1, ref1); src += src_stride2; ref += ref_stride2; - sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); sad = __lsx_vadd_h(sad, sad_tmp); DUP2_ARG2(__lsx_vld, src, 0, ref, 0, src0, ref0); DUP2_ARG2(__lsx_vldx, src, src_stride, ref, ref_stride, src1, ref1); src += src_stride2; ref += ref_stride2; - sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); sad = __lsx_vadd_h(sad, sad_tmp); } - return HADD_UH_U32(sad); + + res = hadd_uh_u32(sad); + return res; } static uint32_t sad_32width_lsx(const uint8_t *src, int32_t src_stride, const uint8_t *ref, int32_t ref_stride, int32_t height) { int32_t ht_cnt = (height >> 2); + uint32_t res; __m128i src0, src1, ref0, ref1; __m128i sad_tmp; __m128i sad = __lsx_vldi(0); @@ -125,31 +134,32 @@ static uint32_t sad_32width_lsx(const uint8_t *src, int32_t src_stride, src += src_stride; DUP2_ARG2(__lsx_vld, ref, 0, ref, 16, ref0, ref1); ref += ref_stride; - sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); sad = __lsx_vadd_h(sad, sad_tmp); DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1); src += src_stride; DUP2_ARG2(__lsx_vld, ref, 0, ref, 16, ref0, ref1); ref += ref_stride; - sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); sad = __lsx_vadd_h(sad, sad_tmp); DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1); src += src_stride; DUP2_ARG2(__lsx_vld, ref, 0, ref, 16, ref0, ref1); ref += ref_stride; - sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); sad = __lsx_vadd_h(sad, sad_tmp); DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1); src += src_stride; DUP2_ARG2(__lsx_vld, ref, 0, ref, 16, ref0, ref1); ref += ref_stride; - sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); sad = __lsx_vadd_h(sad, sad_tmp); } - return HADD_UH_U32(sad); + res = hadd_uh_u32(sad); + return res; } static uint32_t sad_64width_lsx(const uint8_t *src, int32_t src_stride, @@ -170,9 +180,9 @@ static uint32_t sad_64width_lsx(const uint8_t *src, int32_t src_stride, DUP4_ARG2(__lsx_vld, ref, 0, ref, 16, ref, 32, ref, 48, ref0, ref1, ref2, ref3); ref += ref_stride; - sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); sad0 = __lsx_vadd_h(sad0, sad_tmp); - sad_tmp = SAD_UB2_UH(src2, src3, ref2, ref3); + sad_tmp = sad_ub2_uh(src2, src3, ref2, ref3); sad1 = __lsx_vadd_h(sad1, sad_tmp); DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2, @@ -181,14 +191,14 @@ static uint32_t sad_64width_lsx(const uint8_t *src, int32_t src_stride, DUP4_ARG2(__lsx_vld, ref, 0, ref, 16, ref, 32, ref, 48, ref0, ref1, ref2, ref3); ref += ref_stride; - sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); sad0 = __lsx_vadd_h(sad0, sad_tmp); - sad_tmp = SAD_UB2_UH(src2, src3, ref2, ref3); + sad_tmp = sad_ub2_uh(src2, src3, ref2, ref3); sad1 = __lsx_vadd_h(sad1, sad_tmp); } - sad = HADD_UH_U32(sad0); - sad += HADD_UH_U32(sad1); + sad = hadd_uh_u32(sad0); + sad += hadd_uh_u32(sad1); return sad; } @@ -247,25 +257,25 @@ static void sad_8width_x4d_lsx(const uint8_t *src_ptr, int32_t src_stride, DUP2_ARG2(__lsx_vpickev_d, src1, src0, src3, src2, src0, src1); DUP2_ARG2(__lsx_vpickev_d, ref1, ref0, ref3, ref2, ref0, ref1); - sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); sad0 = __lsx_vadd_h(sad0, sad_tmp); DUP2_ARG2(__lsx_vpickev_d, ref5, ref4, ref7, ref6, ref0, ref1); - sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); sad1 = __lsx_vadd_h(sad1, sad_tmp); DUP2_ARG2(__lsx_vpickev_d, ref9, ref8, ref11, ref10, ref0, ref1); - sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); sad2 = __lsx_vadd_h(sad2, sad_tmp); DUP2_ARG2(__lsx_vpickev_d, ref13, ref12, ref15, ref14, ref0, ref1); - sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); sad3 = __lsx_vadd_h(sad3, sad_tmp); } - sad_array[0] = HADD_UH_U32(sad0); - sad_array[1] = HADD_UH_U32(sad1); - sad_array[2] = HADD_UH_U32(sad2); - sad_array[3] = HADD_UH_U32(sad3); + sad_array[0] = hadd_uh_u32(sad0); + sad_array[1] = hadd_uh_u32(sad1); + sad_array[2] = hadd_uh_u32(sad2); + sad_array[3] = hadd_uh_u32(sad3); } static void sad_16width_x4d_lsx(const uint8_t *src_ptr, int32_t src_stride, @@ -334,10 +344,10 @@ static void sad_16width_x4d_lsx(const uint8_t *src_ptr, int32_t src_stride, sad_tmp = __lsx_vhaddw_hu_bu(diff, diff); sad3 = __lsx_vadd_h(sad3, sad_tmp); } - sad_array[0] = HADD_UH_U32(sad0); - sad_array[1] = HADD_UH_U32(sad1); - sad_array[2] = HADD_UH_U32(sad2); - sad_array[3] = HADD_UH_U32(sad3); + sad_array[0] = hadd_uh_u32(sad0); + sad_array[1] = hadd_uh_u32(sad1); + sad_array[2] = hadd_uh_u32(sad2); + sad_array[3] = hadd_uh_u32(sad3); } static void sad_32width_x4d_lsx(const uint8_t *src, int32_t src_stride, @@ -363,28 +373,28 @@ static void sad_32width_x4d_lsx(const uint8_t *src, int32_t src_stride, DUP2_ARG2(__lsx_vld, ref0_ptr, 0, ref0_ptr, 16, ref0, ref1); ref0_ptr += ref_stride; - sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); sad0 = __lsx_vadd_h(sad0, sad_tmp); DUP2_ARG2(__lsx_vld, ref1_ptr, 0, ref1_ptr, 16, ref0, ref1); ref1_ptr += ref_stride; - sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); sad1 = __lsx_vadd_h(sad1, sad_tmp); DUP2_ARG2(__lsx_vld, ref2_ptr, 0, ref2_ptr, 16, ref0, ref1); ref2_ptr += ref_stride; - sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); sad2 = __lsx_vadd_h(sad2, sad_tmp); DUP2_ARG2(__lsx_vld, ref3_ptr, 0, ref3_ptr, 16, ref0, ref1); ref3_ptr += ref_stride; - sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); sad3 = __lsx_vadd_h(sad3, sad_tmp); } - sad_array[0] = HADD_UH_U32(sad0); - sad_array[1] = HADD_UH_U32(sad1); - sad_array[2] = HADD_UH_U32(sad2); - sad_array[3] = HADD_UH_U32(sad3); + sad_array[0] = hadd_uh_u32(sad0); + sad_array[1] = hadd_uh_u32(sad1); + sad_array[2] = hadd_uh_u32(sad2); + sad_array[3] = hadd_uh_u32(sad3); } static void sad_64width_x4d_lsx(const uint8_t *src, int32_t src_stride, @@ -419,60 +429,60 @@ static void sad_64width_x4d_lsx(const uint8_t *src, int32_t src_stride, DUP4_ARG2(__lsx_vld, ref0_ptr, 0, ref0_ptr, 16, ref0_ptr, 32, ref0_ptr, 48, ref0, ref1, ref2, ref3); ref0_ptr += ref_stride; - sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); sad0_0 = __lsx_vadd_h(sad0_0, sad_tmp); - sad_tmp = SAD_UB2_UH(src2, src3, ref2, ref3); + sad_tmp = sad_ub2_uh(src2, src3, ref2, ref3); sad0_1 = __lsx_vadd_h(sad0_1, sad_tmp); DUP4_ARG2(__lsx_vld, ref1_ptr, 0, ref1_ptr, 16, ref1_ptr, 32, ref1_ptr, 48, ref0, ref1, ref2, ref3); ref1_ptr += ref_stride; - sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); sad1_0 = __lsx_vadd_h(sad1_0, sad_tmp); - sad_tmp = SAD_UB2_UH(src2, src3, ref2, ref3); + sad_tmp = sad_ub2_uh(src2, src3, ref2, ref3); sad1_1 = __lsx_vadd_h(sad1_1, sad_tmp); DUP4_ARG2(__lsx_vld, ref2_ptr, 0, ref2_ptr, 16, ref2_ptr, 32, ref2_ptr, 48, ref0, ref1, ref2, ref3); ref2_ptr += ref_stride; - sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); sad2_0 = __lsx_vadd_h(sad2_0, sad_tmp); - sad_tmp = SAD_UB2_UH(src2, src3, ref2, ref3); + sad_tmp = sad_ub2_uh(src2, src3, ref2, ref3); sad2_1 = __lsx_vadd_h(sad2_1, sad_tmp); DUP4_ARG2(__lsx_vld, ref3_ptr, 0, ref3_ptr, 16, ref3_ptr, 32, ref3_ptr, 48, ref0, ref1, ref2, ref3); ref3_ptr += ref_stride; - sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); sad3_0 = __lsx_vadd_h(sad3_0, sad_tmp); - sad_tmp = SAD_UB2_UH(src2, src3, ref2, ref3); + sad_tmp = sad_ub2_uh(src2, src3, ref2, ref3); sad3_1 = __lsx_vadd_h(sad3_1, sad_tmp); } sad = __lsx_vhaddw_wu_hu(sad0_0, sad0_0); sad_tmp = __lsx_vhaddw_wu_hu(sad0_1, sad0_1); sad = __lsx_vadd_w(sad, sad_tmp); - sad_array[0] = HADD_UW_U32(sad); + sad_array[0] = hadd_uw_u32(sad); sad = __lsx_vhaddw_wu_hu(sad1_0, sad1_0); sad_tmp = __lsx_vhaddw_wu_hu(sad1_1, sad1_1); sad = __lsx_vadd_w(sad, sad_tmp); - sad_array[1] = HADD_UW_U32(sad); + sad_array[1] = hadd_uw_u32(sad); sad = __lsx_vhaddw_wu_hu(sad2_0, sad2_0); sad_tmp = __lsx_vhaddw_wu_hu(sad2_1, sad2_1); sad = __lsx_vadd_w(sad, sad_tmp); - sad_array[2] = HADD_UW_U32(sad); + sad_array[2] = hadd_uw_u32(sad); sad = __lsx_vhaddw_wu_hu(sad3_0, sad3_0); sad_tmp = __lsx_vhaddw_wu_hu(sad3_1, sad3_1); sad = __lsx_vadd_w(sad, sad_tmp); - sad_array[3] = HADD_UW_U32(sad); + sad_array[3] = hadd_uw_u32(sad); } static uint32_t avgsad_32width_lsx(const uint8_t *src, int32_t src_stride, const uint8_t *ref, int32_t ref_stride, int32_t height, const uint8_t *sec_pred) { - int32_t ht_cnt = (height >> 2); + int32_t res, ht_cnt = (height >> 2); __m128i src0, src1, src2, src3, src4, src5, src6, src7; __m128i ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7; __m128i pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7; @@ -514,26 +524,26 @@ static uint32_t avgsad_32width_lsx(const uint8_t *src, int32_t src_stride, sec_pred += 128; DUP2_ARG2(__lsx_vavgr_bu, pred0, ref0, pred1, ref1, comp0, comp1); - sad_tmp = SAD_UB2_UH(src0, src1, comp0, comp1); + sad_tmp = sad_ub2_uh(src0, src1, comp0, comp1); sad = __lsx_vadd_h(sad, sad_tmp); DUP2_ARG2(__lsx_vavgr_bu, pred2, ref2, pred3, ref3, comp0, comp1); - sad_tmp = SAD_UB2_UH(src2, src3, comp0, comp1); + sad_tmp = sad_ub2_uh(src2, src3, comp0, comp1); sad = __lsx_vadd_h(sad, sad_tmp); DUP2_ARG2(__lsx_vavgr_bu, pred4, ref4, pred5, ref5, comp0, comp1); - sad_tmp = SAD_UB2_UH(src4, src5, comp0, comp1); + sad_tmp = sad_ub2_uh(src4, src5, comp0, comp1); sad = __lsx_vadd_h(sad, sad_tmp); DUP2_ARG2(__lsx_vavgr_bu, pred6, ref6, pred7, ref7, comp0, comp1); - sad_tmp = SAD_UB2_UH(src6, src7, comp0, comp1); + sad_tmp = sad_ub2_uh(src6, src7, comp0, comp1); sad = __lsx_vadd_h(sad, sad_tmp); } - - return HADD_UH_U32(sad); + res = hadd_uh_u32(sad); + return res; } static uint32_t avgsad_64width_lsx(const uint8_t *src, int32_t src_stride, const uint8_t *ref, int32_t ref_stride, int32_t height, const uint8_t *sec_pred) { - int32_t ht_cnt = (height >> 2); + int32_t res, ht_cnt = (height >> 2); __m128i src0, src1, src2, src3, ref0, ref1, ref2, ref3; __m128i comp0, comp1, comp2, comp3, pred0, pred1, pred2, pred3; __m128i sad, sad_tmp; @@ -552,9 +562,9 @@ static uint32_t avgsad_64width_lsx(const uint8_t *src, int32_t src_stride, sec_pred += 64; DUP4_ARG2(__lsx_vavgr_bu, pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0, comp1, comp2, comp3); - sad_tmp = SAD_UB2_UH(src0, src1, comp0, comp1); + sad_tmp = sad_ub2_uh(src0, src1, comp0, comp1); sad0 = __lsx_vadd_h(sad0, sad_tmp); - sad_tmp = SAD_UB2_UH(src2, src3, comp2, comp3); + sad_tmp = sad_ub2_uh(src2, src3, comp2, comp3); sad1 = __lsx_vadd_h(sad1, sad_tmp); DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2, @@ -568,9 +578,9 @@ static uint32_t avgsad_64width_lsx(const uint8_t *src, int32_t src_stride, sec_pred += 64; DUP4_ARG2(__lsx_vavgr_bu, pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0, comp1, comp2, comp3); - sad_tmp = SAD_UB2_UH(src0, src1, comp0, comp1); + sad_tmp = sad_ub2_uh(src0, src1, comp0, comp1); sad0 = __lsx_vadd_h(sad0, sad_tmp); - sad_tmp = SAD_UB2_UH(src2, src3, comp2, comp3); + sad_tmp = sad_ub2_uh(src2, src3, comp2, comp3); sad1 = __lsx_vadd_h(sad1, sad_tmp); DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2, @@ -584,9 +594,9 @@ static uint32_t avgsad_64width_lsx(const uint8_t *src, int32_t src_stride, sec_pred += 64; DUP4_ARG2(__lsx_vavgr_bu, pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0, comp1, comp2, comp3); - sad_tmp = SAD_UB2_UH(src0, src1, comp0, comp1); + sad_tmp = sad_ub2_uh(src0, src1, comp0, comp1); sad0 = __lsx_vadd_h(sad0, sad_tmp); - sad_tmp = SAD_UB2_UH(src2, src3, comp2, comp3); + sad_tmp = sad_ub2_uh(src2, src3, comp2, comp3); sad1 = __lsx_vadd_h(sad1, sad_tmp); DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2, @@ -600,16 +610,17 @@ static uint32_t avgsad_64width_lsx(const uint8_t *src, int32_t src_stride, sec_pred += 64; DUP4_ARG2(__lsx_vavgr_bu, pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0, comp1, comp2, comp3); - sad_tmp = SAD_UB2_UH(src0, src1, comp0, comp1); + sad_tmp = sad_ub2_uh(src0, src1, comp0, comp1); sad0 = __lsx_vadd_h(sad0, sad_tmp); - sad_tmp = SAD_UB2_UH(src2, src3, comp2, comp3); + sad_tmp = sad_ub2_uh(src2, src3, comp2, comp3); sad1 = __lsx_vadd_h(sad1, sad_tmp); } sad = __lsx_vhaddw_wu_hu(sad0, sad0); sad_tmp = __lsx_vhaddw_wu_hu(sad1, sad1); sad = __lsx_vadd_w(sad, sad_tmp); - return HADD_SW_S32(sad); + res = hadd_sw_s32(sad); + return res; } #define VPX_SAD_8xHT_LSX(height) \ diff --git a/vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c b/vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c index 54fcd6c571..d1abf622ad 100644 --- a/vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c +++ b/vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c @@ -57,13 +57,13 @@ static void common_hv_8ht_8vt_and_aver_dst_4w_lsx( DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5); src6 = __lsx_vxori_b(src6, 128); - tmp0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0, + tmp0 = horiz_8tap_filt(src0, src1, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); - tmp2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0, + tmp2 = horiz_8tap_filt(src2, src3, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); - tmp4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0, + tmp4 = horiz_8tap_filt(src4, src5, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); - tmp5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0, + tmp5 = horiz_8tap_filt(src5, src6, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); DUP2_ARG3(__lsx_vshuf_b, tmp2, tmp0, shuff, tmp4, tmp2, shuff, tmp1, tmp3); DUP4_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filter_vert, 4, @@ -87,17 +87,17 @@ static void common_hv_8ht_8vt_and_aver_dst_4w_lsx( src2 = __lsx_vilvl_d(src3, src2); DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7, src8, src9, src10); - tmp3 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3, filt_hz0, + tmp3 = horiz_8tap_filt(src7, src8, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); tmp4 = __lsx_vshuf_b(tmp3, tmp5, shuff); tmp4 = __lsx_vpackev_b(tmp3, tmp4); - out0 = FILT_8TAP_DPADD_S_H(tmp0, tmp1, tmp2, tmp4, filt_vt0, filt_vt1, + out0 = filt_8tap_dpadd_s_h(tmp0, tmp1, tmp2, tmp4, filt_vt0, filt_vt1, filt_vt2, filt_vt3); - src1 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3, filt_hz0, + src1 = horiz_8tap_filt(src9, src10, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); src0 = __lsx_vshuf_b(src1, tmp3, shuff); src0 = __lsx_vpackev_b(src1, src0); - out1 = FILT_8TAP_DPADD_S_H(tmp1, tmp2, tmp4, src0, filt_vt0, filt_vt1, + out1 = filt_8tap_dpadd_s_h(tmp1, tmp2, tmp4, src0, filt_vt0, filt_vt1, filt_vt2, filt_vt3); out0 = __lsx_vssrarni_b_h(out1, out0, FILTER_BITS); out0 = __lsx_vxori_b(out0, 128); @@ -152,19 +152,19 @@ static void common_hv_8ht_8vt_and_aver_dst_8w_lsx( DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5); src6 = __lsx_vxori_b(src6, 128); - src0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0, + src0 = horiz_8tap_filt(src0, src0, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); - src1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0, + src1 = horiz_8tap_filt(src1, src1, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); - src2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0, + src2 = horiz_8tap_filt(src2, src2, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); - src3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0, + src3 = horiz_8tap_filt(src3, src3, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); - src4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0, + src4 = horiz_8tap_filt(src4, src4, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); - src5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0, + src5 = horiz_8tap_filt(src5, src5, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); - src6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0, + src6 = horiz_8tap_filt(src6, src6, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); DUP4_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filter_vert, 4, @@ -181,25 +181,25 @@ static void common_hv_8ht_8vt_and_aver_dst_8w_lsx( DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7, src8, src9, src10); - src7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3, filt_hz0, + src7 = horiz_8tap_filt(src7, src7, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); tmp3 = __lsx_vpackev_b(src7, src6); - out0 = FILT_8TAP_DPADD_S_H(tmp0, tmp1, tmp2, tmp3, filt_vt0, filt_vt1, + out0 = filt_8tap_dpadd_s_h(tmp0, tmp1, tmp2, tmp3, filt_vt0, filt_vt1, filt_vt2, filt_vt3); - src8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3, filt_hz0, + src8 = horiz_8tap_filt(src8, src8, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); src0 = __lsx_vpackev_b(src8, src7); - out1 = FILT_8TAP_DPADD_S_H(tmp4, tmp5, tmp6, src0, filt_vt0, filt_vt1, + out1 = filt_8tap_dpadd_s_h(tmp4, tmp5, tmp6, src0, filt_vt0, filt_vt1, filt_vt2, filt_vt3); - src9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3, filt_hz0, + src9 = horiz_8tap_filt(src9, src9, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); src1 = __lsx_vpackev_b(src9, src8); - src3 = FILT_8TAP_DPADD_S_H(tmp1, tmp2, tmp3, src1, filt_vt0, filt_vt1, + src3 = filt_8tap_dpadd_s_h(tmp1, tmp2, tmp3, src1, filt_vt0, filt_vt1, filt_vt2, filt_vt3); - src10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3, filt_hz0, + src10 = horiz_8tap_filt(src10, src10, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); src2 = __lsx_vpackev_b(src10, src9); - src4 = FILT_8TAP_DPADD_S_H(tmp5, tmp6, src0, src2, filt_vt0, filt_vt1, + src4 = filt_8tap_dpadd_s_h(tmp5, tmp6, src0, src2, filt_vt0, filt_vt1, filt_vt2, filt_vt3); DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, FILTER_BITS, src4, src3, FILTER_BITS, out0, out1); @@ -296,9 +296,9 @@ static void common_hv_2ht_2vt_and_aver_dst_4x4_lsx( DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, src, src_stride4, src1, src2, src3, src4); - hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS); - hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS); - hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + hz_out0 = horiz_2tap_filt_uh(src0, src1, mask, filt_hz); + hz_out2 = horiz_2tap_filt_uh(src2, src3, mask, filt_hz); + hz_out4 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz); hz_out1 = __lsx_vshuf_b(hz_out2, hz_out0, shuff); hz_out3 = __lsx_vpickod_d(hz_out4, hz_out2); DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); @@ -348,11 +348,11 @@ static void common_hv_2ht_2vt_and_aver_dst_4x8_lsx( src, src_stride4, src5, src6, src7, src8); src += src_stride4; - hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS); - hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS); - hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, FILTER_BITS); - hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, FILTER_BITS); - hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, FILTER_BITS); + hz_out0 = horiz_2tap_filt_uh(src0, src1, mask, filt_hz); + hz_out2 = horiz_2tap_filt_uh(src2, src3, mask, filt_hz); + hz_out4 = horiz_2tap_filt_uh(src4, src5, mask, filt_hz); + hz_out6 = horiz_2tap_filt_uh(src6, src7, mask, filt_hz); + hz_out8 = horiz_2tap_filt_uh(src8, src8, mask, filt_hz); DUP2_ARG3(__lsx_vshuf_b, hz_out2, hz_out0, shuff, hz_out4, hz_out2, shuff, hz_out1, hz_out3); hz_out5 = __lsx_vshuf_b(hz_out6, hz_out4, shuff); @@ -449,20 +449,20 @@ static void common_hv_2ht_2vt_and_aver_dst_8x4_lsx( dst_tmp += dst_stride; dst3 = __lsx_vldrepl_d(dst_tmp, 0); DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1); - hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); - hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + hz_out0 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz); + hz_out1 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz); vec0 = __lsx_vpackev_b(hz_out1, hz_out0); tmp0 = __lsx_vdp2_h_bu(vec0, filt_vt); - hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); + hz_out0 = horiz_2tap_filt_uh(src2, src2, mask, filt_hz); vec1 = __lsx_vpackev_b(hz_out0, hz_out1); tmp1 = __lsx_vdp2_h_bu(vec1, filt_vt); - hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); + hz_out1 = horiz_2tap_filt_uh(src3, src3, mask, filt_hz); vec2 = __lsx_vpackev_b(hz_out1, hz_out0); tmp2 = __lsx_vdp2_h_bu(vec2, filt_vt); - hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + hz_out0 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz); vec3 = __lsx_vpackev_b(hz_out0, hz_out1); tmp3 = __lsx_vdp2_h_bu(vec3, filt_vt); DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2, @@ -494,7 +494,7 @@ static void common_hv_2ht_2vt_and_aver_dst_8x8mult_lsx( src0 = __lsx_vld(src, 0); src += src_stride; - hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); + hz_out0 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz); for (; loop_cnt--;) { src1 = __lsx_vld(src, 0); @@ -502,19 +502,19 @@ static void common_hv_2ht_2vt_and_aver_dst_8x8mult_lsx( src4 = __lsx_vldx(src, src_stride3); src += src_stride4; - hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + hz_out1 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz); vec0 = __lsx_vpackev_b(hz_out1, hz_out0); tmp0 = __lsx_vdp2_h_bu(vec0, filt_vt); - hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); + hz_out0 = horiz_2tap_filt_uh(src2, src2, mask, filt_hz); vec0 = __lsx_vpackev_b(hz_out0, hz_out1); tmp1 = __lsx_vdp2_h_bu(vec0, filt_vt); - hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); + hz_out1 = horiz_2tap_filt_uh(src3, src3, mask, filt_hz); vec0 = __lsx_vpackev_b(hz_out1, hz_out0); tmp2 = __lsx_vdp2_h_bu(vec0, filt_vt); - hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + hz_out0 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz); vec0 = __lsx_vpackev_b(hz_out0, hz_out1); tmp3 = __lsx_vdp2_h_bu(vec0, filt_vt); DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2, @@ -571,8 +571,8 @@ static void common_hv_2ht_2vt_and_aver_dst_16w_lsx( DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1); src += src_stride; - hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); - hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + hz_out0 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz); + hz_out2 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz); for (; loop_cnt--;) { src0 = __lsx_vld(src, 0); @@ -588,32 +588,32 @@ static void common_hv_2ht_2vt_and_aver_dst_16w_lsx( DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, dst1, dst2); dst3 = __lsx_vldx(dst, dst_stride3); - hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); - hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + hz_out1 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz); + hz_out3 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz); DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); tmp3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); tmp3 = __lsx_vavgr_bu(tmp3, dst0); __lsx_vst(tmp3, dst, 0); - hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); - hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); + hz_out0 = horiz_2tap_filt_uh(src2, src2, mask, filt_hz); + hz_out2 = horiz_2tap_filt_uh(src3, src3, mask, filt_hz); DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); tmp3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); tmp3 = __lsx_vavgr_bu(tmp3, dst1); __lsx_vstx(tmp3, dst, dst_stride); - hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); - hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS); + hz_out1 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz); + hz_out3 = horiz_2tap_filt_uh(src5, src5, mask, filt_hz); DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); tmp3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); tmp3 = __lsx_vavgr_bu(tmp3, dst2); __lsx_vstx(tmp3, dst, dst_stride2); - hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS); - hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS); + hz_out0 = horiz_2tap_filt_uh(src6, src6, mask, filt_hz); + hz_out2 = horiz_2tap_filt_uh(src7, src7, mask, filt_hz); DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); tmp3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); diff --git a/vpx_dsp/loongarch/vpx_convolve8_avg_vert_lsx.c b/vpx_dsp/loongarch/vpx_convolve8_avg_vert_lsx.c index 584f241838..5c6413df44 100644 --- a/vpx_dsp/loongarch/vpx_convolve8_avg_vert_lsx.c +++ b/vpx_dsp/loongarch/vpx_convolve8_avg_vert_lsx.c @@ -68,9 +68,9 @@ static void common_vt_8t_and_aver_dst_4w_lsx(const uint8_t *src, tmp0, tmp1, tmp2, tmp3); DUP2_ARG2(__lsx_vilvl_d, tmp1, tmp0, tmp3, tmp2, reg3, reg4); DUP2_ARG2(__lsx_vxori_b, reg3, 128, reg4, 128, reg3, reg4); - out0 = FILT_8TAP_DPADD_S_H(reg0, reg1, reg2, reg3, filter0, filter1, + out0 = filt_8tap_dpadd_s_h(reg0, reg1, reg2, reg3, filter0, filter1, filter2, filter3); - out1 = FILT_8TAP_DPADD_S_H(reg1, reg2, reg3, reg4, filter0, filter1, + out1 = filt_8tap_dpadd_s_h(reg1, reg2, reg3, reg4, filter0, filter1, filter2, filter3); out0 = __lsx_vssrarni_b_h(out1, out0, 7); out0 = __lsx_vxori_b(out0, 128); @@ -146,13 +146,13 @@ static void common_vt_8t_and_aver_dst_8w_lsx(const uint8_t *src, src8, src9, src10); DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9, tmp0, tmp1, tmp2, tmp3); - out0 = FILT_8TAP_DPADD_S_H(reg0, reg1, reg2, tmp0, filter0, filter1, + out0 = filt_8tap_dpadd_s_h(reg0, reg1, reg2, tmp0, filter0, filter1, filter2, filter3); - out1 = FILT_8TAP_DPADD_S_H(reg3, reg4, reg5, tmp1, filter0, filter1, + out1 = filt_8tap_dpadd_s_h(reg3, reg4, reg5, tmp1, filter0, filter1, filter2, filter3); - out2 = FILT_8TAP_DPADD_S_H(reg1, reg2, tmp0, tmp2, filter0, filter1, + out2 = filt_8tap_dpadd_s_h(reg1, reg2, tmp0, tmp2, filter0, filter1, filter2, filter3); - out3 = FILT_8TAP_DPADD_S_H(reg4, reg5, tmp1, tmp3, filter0, filter1, + out3 = filt_8tap_dpadd_s_h(reg4, reg5, tmp1, tmp3, filter0, filter1, filter2, filter3); DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1); DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); @@ -231,13 +231,13 @@ static void common_vt_8t_and_aver_dst_16w_mult_lsx( src0, src1, src2, src3); DUP4_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src9, src8, src10, src9, src4, src5, src7, src8); - tmp0 = FILT_8TAP_DPADD_S_H(reg0, reg1, reg2, src0, filter0, filter1, + tmp0 = filt_8tap_dpadd_s_h(reg0, reg1, reg2, src0, filter0, filter1, filter2, filter3); - tmp1 = FILT_8TAP_DPADD_S_H(reg3, reg4, reg5, src1, filter0, filter1, + tmp1 = filt_8tap_dpadd_s_h(reg3, reg4, reg5, src1, filter0, filter1, filter2, filter3); - tmp2 = FILT_8TAP_DPADD_S_H(reg6, reg7, reg8, src4, filter0, filter1, + tmp2 = filt_8tap_dpadd_s_h(reg6, reg7, reg8, src4, filter0, filter1, filter2, filter3); - tmp3 = FILT_8TAP_DPADD_S_H(reg9, reg10, reg11, src5, filter0, filter1, + tmp3 = filt_8tap_dpadd_s_h(reg9, reg10, reg11, src5, filter0, filter1, filter2, filter3); DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1); DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); @@ -246,13 +246,13 @@ static void common_vt_8t_and_aver_dst_16w_mult_lsx( DUP2_ARG2(__lsx_vavgr_bu, tmp0, tmp2, tmp1, tmp3, tmp0, tmp1); __lsx_vst(tmp0, dst_reg, 0); __lsx_vstx(tmp1, dst_reg, dst_stride); - tmp0 = FILT_8TAP_DPADD_S_H(reg1, reg2, src0, src2, filter0, filter1, + tmp0 = filt_8tap_dpadd_s_h(reg1, reg2, src0, src2, filter0, filter1, filter2, filter3); - tmp1 = FILT_8TAP_DPADD_S_H(reg4, reg5, src1, src3, filter0, filter1, + tmp1 = filt_8tap_dpadd_s_h(reg4, reg5, src1, src3, filter0, filter1, filter2, filter3); - tmp2 = FILT_8TAP_DPADD_S_H(reg7, reg8, src4, src7, filter0, filter1, + tmp2 = filt_8tap_dpadd_s_h(reg7, reg8, src4, src7, filter0, filter1, filter2, filter3); - tmp3 = FILT_8TAP_DPADD_S_H(reg10, reg11, src5, src8, filter0, filter1, + tmp3 = filt_8tap_dpadd_s_h(reg10, reg11, src5, src8, filter0, filter1, filter2, filter3); DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1); DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); diff --git a/vpx_dsp/loongarch/vpx_convolve8_lsx.c b/vpx_dsp/loongarch/vpx_convolve8_lsx.c index 73583abb98..9f5cd6cfe9 100644 --- a/vpx_dsp/loongarch/vpx_convolve8_lsx.c +++ b/vpx_dsp/loongarch/vpx_convolve8_lsx.c @@ -54,13 +54,13 @@ static void common_hv_8ht_8vt_4w_lsx(const uint8_t *src, int32_t src_stride, DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5); src6 = __lsx_vxori_b(src6, 128); - tmp0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0, + tmp0 = horiz_8tap_filt(src0, src1, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); - tmp2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0, + tmp2 = horiz_8tap_filt(src2, src3, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); - tmp4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0, + tmp4 = horiz_8tap_filt(src4, src5, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); - tmp5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0, + tmp5 = horiz_8tap_filt(src5, src6, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); DUP2_ARG3(__lsx_vshuf_b, tmp2, tmp0, shuff, tmp4, tmp2, shuff, tmp1, tmp3); DUP4_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filter_vert, 4, @@ -73,17 +73,17 @@ static void common_hv_8ht_8vt_4w_lsx(const uint8_t *src, int32_t src_stride, src += src_stride; DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7, src8, src9, src10); - tmp3 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3, filt_hz0, + tmp3 = horiz_8tap_filt(src7, src8, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); tmp4 = __lsx_vshuf_b(tmp3, tmp5, shuff); tmp4 = __lsx_vpackev_b(tmp3, tmp4); - out0 = FILT_8TAP_DPADD_S_H(tmp0, tmp1, tmp2, tmp4, filt_vt0, filt_vt1, + out0 = filt_8tap_dpadd_s_h(tmp0, tmp1, tmp2, tmp4, filt_vt0, filt_vt1, filt_vt2, filt_vt3); - src1 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3, filt_hz0, + src1 = horiz_8tap_filt(src9, src10, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); src0 = __lsx_vshuf_b(src1, tmp3, shuff); src0 = __lsx_vpackev_b(src1, src0); - out1 = FILT_8TAP_DPADD_S_H(tmp1, tmp2, tmp4, src0, filt_vt0, filt_vt1, + out1 = filt_8tap_dpadd_s_h(tmp1, tmp2, tmp4, src0, filt_vt0, filt_vt1, filt_vt2, filt_vt3); out0 = __lsx_vssrarni_b_h(out1, out0, 7); out0 = __lsx_vxori_b(out0, 128); @@ -135,19 +135,19 @@ static void common_hv_8ht_8vt_8w_lsx(const uint8_t *src, int32_t src_stride, DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5); src6 = __lsx_vxori_b(src6, 128); - src0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0, + src0 = horiz_8tap_filt(src0, src0, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); - src1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0, + src1 = horiz_8tap_filt(src1, src1, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); - src2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0, + src2 = horiz_8tap_filt(src2, src2, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); - src3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0, + src3 = horiz_8tap_filt(src3, src3, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); - src4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0, + src4 = horiz_8tap_filt(src4, src4, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); - src5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0, + src5 = horiz_8tap_filt(src5, src5, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); - src6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0, + src6 = horiz_8tap_filt(src6, src6, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); DUP4_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filter_vert, 4, @@ -161,25 +161,25 @@ static void common_hv_8ht_8vt_8w_lsx(const uint8_t *src, int32_t src_stride, src += src_stride; DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7, src8, src9, src10); - src7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3, filt_hz0, + src7 = horiz_8tap_filt(src7, src7, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); tmp3 = __lsx_vpackev_b(src7, src6); - out0 = FILT_8TAP_DPADD_S_H(tmp0, tmp1, tmp2, tmp3, filt_vt0, filt_vt1, + out0 = filt_8tap_dpadd_s_h(tmp0, tmp1, tmp2, tmp3, filt_vt0, filt_vt1, filt_vt2, filt_vt3); - src8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3, filt_hz0, + src8 = horiz_8tap_filt(src8, src8, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); src0 = __lsx_vpackev_b(src8, src7); - out1 = FILT_8TAP_DPADD_S_H(tmp4, tmp5, tmp6, src0, filt_vt0, filt_vt1, + out1 = filt_8tap_dpadd_s_h(tmp4, tmp5, tmp6, src0, filt_vt0, filt_vt1, filt_vt2, filt_vt3); - src9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3, filt_hz0, + src9 = horiz_8tap_filt(src9, src9, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); src1 = __lsx_vpackev_b(src9, src8); - src3 = FILT_8TAP_DPADD_S_H(tmp1, tmp2, tmp3, src1, filt_vt0, filt_vt1, + src3 = filt_8tap_dpadd_s_h(tmp1, tmp2, tmp3, src1, filt_vt0, filt_vt1, filt_vt2, filt_vt3); - src10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3, filt_hz0, + src10 = horiz_8tap_filt(src10, src10, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); src2 = __lsx_vpackev_b(src10, src9); - src4 = FILT_8TAP_DPADD_S_H(tmp5, tmp6, src0, src2, filt_vt0, filt_vt1, + src4 = filt_8tap_dpadd_s_h(tmp5, tmp6, src0, src2, filt_vt0, filt_vt1, filt_vt2, filt_vt3); DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, src4, src3, 7, out0, out1); DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); @@ -267,9 +267,9 @@ static void common_hv_2ht_2vt_4x4_lsx(const uint8_t *src, int32_t src_stride, src0 = __lsx_vld(src, 0); DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, src, src_stride4, src1, src2, src3, src4); - hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS); - hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS); - hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + hz_out0 = horiz_2tap_filt_uh(src0, src1, mask, filt_hz); + hz_out2 = horiz_2tap_filt_uh(src2, src3, mask, filt_hz); + hz_out4 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz); hz_out1 = __lsx_vshuf_b(hz_out2, hz_out0, shuff); hz_out3 = __lsx_vpickod_d(hz_out4, hz_out2); @@ -316,11 +316,11 @@ static void common_hv_2ht_2vt_4x8_lsx(const uint8_t *src, int32_t src_stride, src, src_stride4, src5, src6, src7, src8); src += src_stride4; - hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS); - hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS); - hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, FILTER_BITS); - hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, FILTER_BITS); - hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, FILTER_BITS); + hz_out0 = horiz_2tap_filt_uh(src0, src1, mask, filt_hz); + hz_out2 = horiz_2tap_filt_uh(src2, src3, mask, filt_hz); + hz_out4 = horiz_2tap_filt_uh(src4, src5, mask, filt_hz); + hz_out6 = horiz_2tap_filt_uh(src6, src7, mask, filt_hz); + hz_out8 = horiz_2tap_filt_uh(src8, src8, mask, filt_hz); DUP2_ARG3(__lsx_vshuf_b, hz_out2, hz_out0, shuff, hz_out4, hz_out2, shuff, hz_out1, hz_out3); @@ -382,20 +382,20 @@ static void common_hv_2ht_2vt_8x4_lsx(const uint8_t *src, int32_t src_stride, DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, src, src_stride4, src1, src2, src3, src4); - hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); - hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + hz_out0 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz); + hz_out1 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz); vec0 = __lsx_vpackev_b(hz_out1, hz_out0); tmp0 = __lsx_vdp2_h_bu(vec0, filt_vt); - hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); + hz_out0 = horiz_2tap_filt_uh(src2, src2, mask, filt_hz); vec1 = __lsx_vpackev_b(hz_out0, hz_out1); tmp1 = __lsx_vdp2_h_bu(vec1, filt_vt); - hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); + hz_out1 = horiz_2tap_filt_uh(src3, src3, mask, filt_hz); vec2 = __lsx_vpackev_b(hz_out1, hz_out0); tmp2 = __lsx_vdp2_h_bu(vec2, filt_vt); - hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + hz_out0 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz); vec3 = __lsx_vpackev_b(hz_out0, hz_out1); tmp3 = __lsx_vdp2_h_bu(vec3, filt_vt); @@ -430,7 +430,7 @@ static void common_hv_2ht_2vt_8x8mult_lsx(const uint8_t *src, src0 = __lsx_vld(src, 0); src += src_stride; - hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); + hz_out0 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz); for (; loop_cnt--;) { src1 = __lsx_vld(src, 0); @@ -438,19 +438,19 @@ static void common_hv_2ht_2vt_8x8mult_lsx(const uint8_t *src, src4 = __lsx_vldx(src, src_stride3); src += src_stride4; - hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + hz_out1 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz); vec0 = __lsx_vpackev_b(hz_out1, hz_out0); tmp1 = __lsx_vdp2_h_bu(vec0, filt_vt); - hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); + hz_out0 = horiz_2tap_filt_uh(src2, src2, mask, filt_hz); vec0 = __lsx_vpackev_b(hz_out0, hz_out1); tmp2 = __lsx_vdp2_h_bu(vec0, filt_vt); - hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); + hz_out1 = horiz_2tap_filt_uh(src3, src3, mask, filt_hz); vec0 = __lsx_vpackev_b(hz_out1, hz_out0); tmp3 = __lsx_vdp2_h_bu(vec0, filt_vt); - hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + hz_out0 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz); src1 = __lsx_vld(src, 0); DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3); src4 = __lsx_vldx(src, src_stride3); @@ -470,19 +470,19 @@ static void common_hv_2ht_2vt_8x8mult_lsx(const uint8_t *src, __lsx_vstelm_d(tmp2, dst, 0, 1); dst += dst_stride; - hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + hz_out1 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz); vec0 = __lsx_vpackev_b(hz_out1, hz_out0); tmp1 = __lsx_vdp2_h_bu(vec0, filt_vt); - hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); + hz_out0 = horiz_2tap_filt_uh(src2, src2, mask, filt_hz); vec0 = __lsx_vpackev_b(hz_out0, hz_out1); tmp2 = __lsx_vdp2_h_bu(vec0, filt_vt); - hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); + hz_out1 = horiz_2tap_filt_uh(src3, src3, mask, filt_hz); vec0 = __lsx_vpackev_b(hz_out1, hz_out0); tmp3 = __lsx_vdp2_h_bu(vec0, filt_vt); - hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + hz_out0 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz); vec0 = __lsx_vpackev_b(hz_out0, hz_out1); tmp4 = __lsx_vdp2_h_bu(vec0, filt_vt); @@ -534,8 +534,8 @@ static void common_hv_2ht_2vt_16w_lsx(const uint8_t *src, int32_t src_stride, DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1); src += src_stride; - hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); - hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + hz_out0 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz); + hz_out2 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz); for (; loop_cnt--;) { uint8_t *src_tmp0 = src + 8; @@ -546,32 +546,32 @@ static void common_hv_2ht_2vt_16w_lsx(const uint8_t *src, int32_t src_stride, DUP2_ARG2(__lsx_vldx, src, src_stride3, src_tmp0, src_stride3, src6, src7); src += src_stride4; - hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); - hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + hz_out1 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz); + hz_out3 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz); DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp1, tmp2); tmp = __lsx_vssrarni_bu_h(tmp2, tmp1, FILTER_BITS); __lsx_vst(tmp, dst, 0); dst += dst_stride; - hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); - hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); + hz_out0 = horiz_2tap_filt_uh(src2, src2, mask, filt_hz); + hz_out2 = horiz_2tap_filt_uh(src3, src3, mask, filt_hz); DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp1, tmp2); tmp = __lsx_vssrarni_bu_h(tmp2, tmp1, FILTER_BITS); __lsx_vst(tmp, dst, 0); dst += dst_stride; - hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); - hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS); + hz_out1 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz); + hz_out3 = horiz_2tap_filt_uh(src5, src5, mask, filt_hz); DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp1, tmp2); tmp = __lsx_vssrarni_bu_h(tmp2, tmp1, FILTER_BITS); __lsx_vst(tmp, dst, 0); dst += dst_stride; - hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS); - hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS); + hz_out0 = horiz_2tap_filt_uh(src6, src6, mask, filt_hz); + hz_out2 = horiz_2tap_filt_uh(src7, src7, mask, filt_hz); DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp1, tmp2); tmp = __lsx_vssrarni_bu_h(tmp2, tmp1, FILTER_BITS); diff --git a/vpx_dsp/loongarch/vpx_convolve8_vert_lsx.c b/vpx_dsp/loongarch/vpx_convolve8_vert_lsx.c index 7e3a95b2fd..6022e43c83 100644 --- a/vpx_dsp/loongarch/vpx_convolve8_vert_lsx.c +++ b/vpx_dsp/loongarch/vpx_convolve8_vert_lsx.c @@ -52,9 +52,9 @@ static void common_vt_8t_4w_lsx(const uint8_t *src, int32_t src_stride, tmp0, tmp1, tmp2, tmp3); DUP2_ARG2(__lsx_vilvl_d, tmp1, tmp0, tmp3, tmp2, reg3, reg4); DUP2_ARG2(__lsx_vxori_b, reg3, 128, reg4, 128, reg3, reg4); - out0 = FILT_8TAP_DPADD_S_H(reg0, reg1, reg2, reg3, filter0, filter1, + out0 = filt_8tap_dpadd_s_h(reg0, reg1, reg2, reg3, filter0, filter1, filter2, filter3); - out1 = FILT_8TAP_DPADD_S_H(reg1, reg2, reg3, reg4, filter0, filter1, + out1 = filt_8tap_dpadd_s_h(reg1, reg2, reg3, reg4, filter0, filter1, filter2, filter3); out0 = __lsx_vssrarni_b_h(out1, out0, 7); out0 = __lsx_vxori_b(out0, 128); @@ -116,13 +116,13 @@ static void common_vt_8t_8w_lsx(const uint8_t *src, int32_t src_stride, src8, src9, src10); DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9, tmp0, tmp1, tmp2, tmp3); - out0 = FILT_8TAP_DPADD_S_H(reg0, reg1, reg2, tmp0, filter0, filter1, + out0 = filt_8tap_dpadd_s_h(reg0, reg1, reg2, tmp0, filter0, filter1, filter2, filter3); - out1 = FILT_8TAP_DPADD_S_H(reg3, reg4, reg5, tmp1, filter0, filter1, + out1 = filt_8tap_dpadd_s_h(reg3, reg4, reg5, tmp1, filter0, filter1, filter2, filter3); - out2 = FILT_8TAP_DPADD_S_H(reg1, reg2, tmp0, tmp2, filter0, filter1, + out2 = filt_8tap_dpadd_s_h(reg1, reg2, tmp0, tmp2, filter0, filter1, filter2, filter3); - out3 = FILT_8TAP_DPADD_S_H(reg4, reg5, tmp1, tmp3, filter0, filter1, + out3 = filt_8tap_dpadd_s_h(reg4, reg5, tmp1, tmp3, filter0, filter1, filter2, filter3); DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1); DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); @@ -192,13 +192,13 @@ static void common_vt_8t_16w_lsx(const uint8_t *src, int32_t src_stride, src0, src1, src2, src3); DUP4_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src9, src8, src10, src9, src4, src5, src7, src8); - tmp0 = FILT_8TAP_DPADD_S_H(reg0, reg1, reg2, src0, filter0, filter1, + tmp0 = filt_8tap_dpadd_s_h(reg0, reg1, reg2, src0, filter0, filter1, filter2, filter3); - tmp1 = FILT_8TAP_DPADD_S_H(reg3, reg4, reg5, src1, filter0, filter1, + tmp1 = filt_8tap_dpadd_s_h(reg3, reg4, reg5, src1, filter0, filter1, filter2, filter3); - tmp2 = FILT_8TAP_DPADD_S_H(reg6, reg7, reg8, src4, filter0, filter1, + tmp2 = filt_8tap_dpadd_s_h(reg6, reg7, reg8, src4, filter0, filter1, filter2, filter3); - tmp3 = FILT_8TAP_DPADD_S_H(reg9, reg10, reg11, src5, filter0, filter1, + tmp3 = filt_8tap_dpadd_s_h(reg9, reg10, reg11, src5, filter0, filter1, filter2, filter3); DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1); DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); @@ -206,13 +206,13 @@ static void common_vt_8t_16w_lsx(const uint8_t *src, int32_t src_stride, dst += dst_stride; __lsx_vst(tmp1, dst, 0); dst += dst_stride; - tmp0 = FILT_8TAP_DPADD_S_H(reg1, reg2, src0, src2, filter0, filter1, + tmp0 = filt_8tap_dpadd_s_h(reg1, reg2, src0, src2, filter0, filter1, filter2, filter3); - tmp1 = FILT_8TAP_DPADD_S_H(reg4, reg5, src1, src3, filter0, filter1, + tmp1 = filt_8tap_dpadd_s_h(reg4, reg5, src1, src3, filter0, filter1, filter2, filter3); - tmp2 = FILT_8TAP_DPADD_S_H(reg7, reg8, src4, src7, filter0, filter1, + tmp2 = filt_8tap_dpadd_s_h(reg7, reg8, src4, src7, filter0, filter1, filter2, filter3); - tmp3 = FILT_8TAP_DPADD_S_H(reg10, reg11, src5, src8, filter0, filter1, + tmp3 = filt_8tap_dpadd_s_h(reg10, reg11, src5, src8, filter0, filter1, filter2, filter3); DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1); DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); @@ -298,25 +298,25 @@ static void common_vt_8t_16w_mult_lsx(const uint8_t *src, int32_t src_stride, src0, src1, src2, src3); DUP4_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src9, src8, src10, src9, src4, src5, src7, src8); - tmp0 = FILT_8TAP_DPADD_S_H(reg0, reg1, reg2, src0, filter0, filter1, + tmp0 = filt_8tap_dpadd_s_h(reg0, reg1, reg2, src0, filter0, filter1, filter2, filter3); - tmp1 = FILT_8TAP_DPADD_S_H(reg3, reg4, reg5, src1, filter0, filter1, + tmp1 = filt_8tap_dpadd_s_h(reg3, reg4, reg5, src1, filter0, filter1, filter2, filter3); - tmp2 = FILT_8TAP_DPADD_S_H(reg6, reg7, reg8, src4, filter0, filter1, + tmp2 = filt_8tap_dpadd_s_h(reg6, reg7, reg8, src4, filter0, filter1, filter2, filter3); - tmp3 = FILT_8TAP_DPADD_S_H(reg9, reg10, reg11, src5, filter0, filter1, + tmp3 = filt_8tap_dpadd_s_h(reg9, reg10, reg11, src5, filter0, filter1, filter2, filter3); DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1); DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); __lsx_vst(tmp0, dst_tmp, 0); __lsx_vstx(tmp1, dst_tmp, dst_stride); - tmp0 = FILT_8TAP_DPADD_S_H(reg1, reg2, src0, src2, filter0, filter1, + tmp0 = filt_8tap_dpadd_s_h(reg1, reg2, src0, src2, filter0, filter1, filter2, filter3); - tmp1 = FILT_8TAP_DPADD_S_H(reg4, reg5, src1, src3, filter0, filter1, + tmp1 = filt_8tap_dpadd_s_h(reg4, reg5, src1, src3, filter0, filter1, filter2, filter3); - tmp2 = FILT_8TAP_DPADD_S_H(reg7, reg8, src4, src7, filter0, filter1, + tmp2 = filt_8tap_dpadd_s_h(reg7, reg8, src4, src7, filter0, filter1, filter2, filter3); - tmp3 = FILT_8TAP_DPADD_S_H(reg10, reg11, src5, src8, filter0, filter1, + tmp3 = filt_8tap_dpadd_s_h(reg10, reg11, src5, src8, filter0, filter1, filter2, filter3); DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1); DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); diff --git a/vpx_dsp/loongarch/vpx_convolve_lsx.h b/vpx_dsp/loongarch/vpx_convolve_lsx.h index 2428407f2b..d886b00198 100644 --- a/vpx_dsp/loongarch/vpx_convolve_lsx.h +++ b/vpx_dsp/loongarch/vpx_convolve_lsx.h @@ -11,11 +11,50 @@ #ifndef VPX_VPX_DSP_LOONGARCH_VPX_CONVOLVE_LSX_H_ #define VPX_VPX_DSP_LOONGARCH_VPX_CONVOLVE_LSX_H_ -#include "vpx_util/loongson_intrinsics.h" +#include "./vpx_config.h" #include "vpx_dsp/vpx_filter.h" +#include "vpx_util/loongson_intrinsics.h" + +static INLINE __m128i filt_8tap_dpadd_s_h(__m128i _reg0, __m128i _reg1, + __m128i _reg2, __m128i _reg3, + __m128i _filter0, __m128i _filter1, + __m128i _filter2, __m128i _filter3) { + __m128i _vec0, _vec1; + + _vec0 = __lsx_vdp2_h_b(_reg0, _filter0); + _vec0 = __lsx_vdp2add_h_b(_vec0, _reg1, _filter1); + _vec1 = __lsx_vdp2_h_b(_reg2, _filter2); + _vec1 = __lsx_vdp2add_h_b(_vec1, _reg3, _filter3); + return __lsx_vsadd_h(_vec0, _vec1); +} + +static INLINE __m128i horiz_8tap_filt(__m128i _src0, __m128i _src1, + __m128i _mask0, __m128i _mask1, + __m128i _mask2, __m128i _mask3, + __m128i _filt_h0, __m128i _filt_h1, + __m128i _filt_h2, __m128i _filt_h3) { + __m128i _tmp0, _tmp1, _tmp2, _tmp3; + __m128i _out; + + DUP4_ARG3(__lsx_vshuf_b, _src1, _src0, _mask0, _src1, _src0, _mask1, _src1, + _src0, _mask2, _src1, _src0, _mask3, _tmp0, _tmp1, _tmp2, _tmp3); + _out = filt_8tap_dpadd_s_h(_tmp0, _tmp1, _tmp2, _tmp3, _filt_h0, _filt_h1, + _filt_h2, _filt_h3); + _out = __lsx_vsrari_h(_out, FILTER_BITS); + return __lsx_vsat_h(_out, 7); +} + +static INLINE __m128i horiz_2tap_filt_uh(__m128i in0, __m128i in1, __m128i mask, + __m128i coeff) { + __m128i tmp0_m, tmp1_m; + + tmp0_m = __lsx_vshuf_b(in1, in0, mask); + tmp1_m = __lsx_vdp2_h_bu(tmp0_m, coeff); + return __lsx_vsrari_h(tmp1_m, FILTER_BITS); +} #define LSX_LD_4(_src, _stride, _src0, _src1, _src2, _src3) \ - { \ + do { \ _src0 = __lsx_vld(_src, 0); \ _src += _stride; \ _src1 = __lsx_vld(_src, 0); \ @@ -23,43 +62,12 @@ _src2 = __lsx_vld(_src, 0); \ _src += _stride; \ _src3 = __lsx_vld(_src, 0); \ - } - -#define FILT_8TAP_DPADD_S_H(_reg0, _reg1, _reg2, _reg3, _filter0, _filter1, \ - _filter2, _filter3) \ - ({ \ - __m128i _vec0, _vec1; \ - \ - _vec0 = __lsx_vdp2_h_b(_reg0, _filter0); \ - _vec0 = __lsx_vdp2add_h_b(_vec0, _reg1, _filter1); \ - _vec1 = __lsx_vdp2_h_b(_reg2, _filter2); \ - _vec1 = __lsx_vdp2add_h_b(_vec1, _reg3, _filter3); \ - _vec0 = __lsx_vsadd_h(_vec0, _vec1); \ - \ - _vec0; \ - }) - -#define HORIZ_8TAP_FILT(_src0, _src1, _mask0, _mask1, _mask2, _mask3, \ - _filt_h0, _filt_h1, _filt_h2, _filt_h3) \ - ({ \ - __m128i _tmp0, _tmp1, _tmp2, _tmp3; \ - __m128i _out; \ - \ - DUP4_ARG3(__lsx_vshuf_b, _src1, _src0, _mask0, _src1, _src0, _mask1, \ - _src1, _src0, _mask2, _src1, _src0, _mask3, _tmp0, _tmp1, _tmp2, \ - _tmp3); \ - _out = FILT_8TAP_DPADD_S_H(_tmp0, _tmp1, _tmp2, _tmp3, _filt_h0, _filt_h1, \ - _filt_h2, _filt_h3); \ - _out = __lsx_vsrari_h(_out, FILTER_BITS); \ - _out = __lsx_vsat_h(_out, 7); \ - \ - _out; \ - }) + } while (0) #define HORIZ_8TAP_4WID_4VECS_FILT(_src0, _src1, _src2, _src3, _mask0, _mask1, \ _mask2, _mask3, _filter0, _filter1, \ _filter2, _filter3, _out0, _out1) \ - { \ + do { \ __m128i _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7; \ __m128i _reg0, _reg1, _reg2, _reg3; \ \ @@ -78,12 +86,12 @@ DUP2_ARG3(__lsx_vdp2add_h_b, _reg2, _tmp6, _filter3, _reg3, _tmp7, \ _filter3, _reg2, _reg3); \ DUP2_ARG2(__lsx_vsadd_h, _reg0, _reg2, _reg1, _reg3, _out0, _out1); \ - } + } while (0) #define HORIZ_8TAP_8WID_4VECS_FILT( \ _src0, _src1, _src2, _src3, _mask0, _mask1, _mask2, _mask3, _filter0, \ _filter1, _filter2, _filter3, _out0, _out1, _out2, _out3) \ - { \ + do { \ __m128i _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7; \ __m128i _reg0, _reg1, _reg2, _reg3, _reg4, _reg5, _reg6, _reg7; \ \ @@ -111,22 +119,10 @@ _reg5, _reg6, _reg7); \ DUP4_ARG2(__lsx_vsadd_h, _reg0, _reg4, _reg1, _reg5, _reg2, _reg6, _reg3, \ _reg7, _out0, _out1, _out2, _out3); \ - } - -#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift) \ - ({ \ - __m128i tmp0_m; \ - __m128i tmp1_m; \ - \ - tmp0_m = __lsx_vshuf_b(in1, in0, mask); \ - tmp1_m = __lsx_vdp2_h_bu(tmp0_m, coeff); \ - tmp1_m = __lsx_vsrari_h(tmp1_m, shift); \ - \ - tmp1_m; \ - }) + } while (0) #define AVG_ST4_D(in0, in1, dst0, dst1, pdst, stride) \ - { \ + do { \ __m128i tmp0_m, tmp1_m; \ \ DUP2_ARG2(__lsx_vavgr_bu, in0, dst0, in1, dst1, tmp0_m, tmp1_m); \ @@ -137,6 +133,6 @@ __lsx_vstelm_d(tmp1_m, pdst, 0, 0); \ pdst += stride; \ __lsx_vstelm_d(tmp1_m, pdst, 0, 1); \ - } + } while (0) #endif // VPX_VPX_DSP_LOONGARCH_VPX_CONVOLVE_LSX_H_ From b163db1a6a3e9c5e544a2c8e43b7a0d299d2b8c6 Mon Sep 17 00:00:00 2001 From: James Zern Date: Sun, 15 May 2022 15:39:15 -0700 Subject: [PATCH 0377/1000] tools/*.py: update to python3 only lint-hunks.py is tested as part of the presubmit; the rest may need further changes as they're used. Bug: b/229626362 Change-Id: I2fd6e96deab8d892d34527e484ea65e3df86d162 --- tools/diff.py | 2 +- tools/intersect-diffs.py | 4 ++-- tools/lint-hunks.py | 36 +++++++++++++++++++----------------- tools/wrap-commit-msg.py | 2 +- 4 files changed, 23 insertions(+), 21 deletions(-) diff --git a/tools/diff.py b/tools/diff.py index a96c7db851..860a6b051b 100644 --- a/tools/diff.py +++ b/tools/diff.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 ## Copyright (c) 2012 The WebM project authors. All Rights Reserved. ## ## Use of this source code is governed by a BSD-style license diff --git a/tools/intersect-diffs.py b/tools/intersect-diffs.py index 4dbafa90b7..590e687b47 100755 --- a/tools/intersect-diffs.py +++ b/tools/intersect-diffs.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 ## Copyright (c) 2012 The WebM project authors. All Rights Reserved. ## ## Use of this source code is governed by a BSD-style license @@ -69,7 +69,7 @@ def main(): break if out_hunks: - print FormatDiffHunks(out_hunks) + print(FormatDiffHunks(out_hunks)) sys.exit(1) if __name__ == "__main__": diff --git a/tools/lint-hunks.py b/tools/lint-hunks.py index 30d3249193..0a94afebb9 100755 --- a/tools/lint-hunks.py +++ b/tools/lint-hunks.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/env python3 ## Copyright (c) 2012 The WebM project authors. All Rights Reserved. ## ## Use of this source code is governed by a BSD-style license @@ -10,7 +10,7 @@ """Performs style checking on each diff hunk.""" import getopt import os -import StringIO +import io import subprocess import sys @@ -63,21 +63,21 @@ def main(argv=None): try: try: opts, args = getopt.getopt(argv[1:], SHORT_OPTIONS, LONG_OPTIONS) - except getopt.error, msg: + except getopt.error as msg: raise Usage(msg) # process options for o, _ in opts: if o in ("-h", "--help"): - print __doc__ + print(__doc__) sys.exit(0) if args and len(args) > 1: - print __doc__ + print(__doc__) sys.exit(0) # Find the fully qualified path to the root of the tree - tl = Subprocess(TOPLEVEL_CMD, stdout=subprocess.PIPE) + tl = Subprocess(TOPLEVEL_CMD, stdout=subprocess.PIPE, text=True) tl = tl.communicate()[0].strip() # See if we're working on the index or not. @@ -93,9 +93,9 @@ def main(argv=None): # Get a list of all affected lines file_affected_line_map = {} - p = Subprocess(diff_cmd, stdout=subprocess.PIPE) + p = Subprocess(diff_cmd, stdout=subprocess.PIPE, text=True) stdout = p.communicate()[0] - for hunk in diff.ParseDiffHunks(StringIO.StringIO(stdout)): + for hunk in diff.ParseDiffHunks(io.StringIO(stdout)): filename = hunk.right.filename[2:] if filename not in file_affected_line_map: file_affected_line_map[filename] = set() @@ -103,7 +103,7 @@ def main(argv=None): # Run each affected file through cpplint lint_failed = False - for filename, affected_lines in file_affected_line_map.iteritems(): + for filename, affected_lines in file_affected_line_map.items(): if filename.split(".")[-1] not in ("c", "h", "cc"): continue if filename.startswith("third_party"): @@ -112,14 +112,16 @@ def main(argv=None): if args: # File contents come from git show_cmd = SHOW_CMD + [args[0] + ":" + filename] - show = Subprocess(show_cmd, stdout=subprocess.PIPE) + show = Subprocess(show_cmd, stdout=subprocess.PIPE, text=True) lint = Subprocess(cpplint_cmd, expected_returncode=(0, 1), - stdin=show.stdout, stderr=subprocess.PIPE) + stdin=show.stdout, stderr=subprocess.PIPE, + text=True) lint_out = lint.communicate()[1] else: # File contents come from the working tree lint = Subprocess(cpplint_cmd, expected_returncode=(0, 1), - stdin=subprocess.PIPE, stderr=subprocess.PIPE) + stdin=subprocess.PIPE, stderr=subprocess.PIPE, + text=True) stdin = open(os.path.join(tl, filename)).read() lint_out = lint.communicate(stdin)[1] @@ -129,17 +131,17 @@ def main(argv=None): continue warning_line_num = int(fields[1]) if warning_line_num in affected_lines: - print "%s:%d:%s"%(filename, warning_line_num, - ":".join(fields[2:])) + print("%s:%d:%s"%(filename, warning_line_num, + ":".join(fields[2:]))) lint_failed = True # Set exit code if any relevant lint errors seen if lint_failed: return 1 - except Usage, err: - print >>sys.stderr, err - print >>sys.stderr, "for help use --help" + except Usage as err: + print(err, file=sys.stderr) + print("for help use --help", file=sys.stderr) return 2 if __name__ == "__main__": diff --git a/tools/wrap-commit-msg.py b/tools/wrap-commit-msg.py index d5b4b046b1..ba3fa58732 100755 --- a/tools/wrap-commit-msg.py +++ b/tools/wrap-commit-msg.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 ## Copyright (c) 2012 The WebM project authors. All Rights Reserved. ## ## Use of this source code is governed by a BSD-style license From c0cee345a36592d162a8c55349a517b84ef5810c Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 18 May 2022 19:00:56 -0700 Subject: [PATCH 0378/1000] y4m_test: check temp file ptr GetTempOutFile() and TempOutFile::file() may return null if the open fails Change-Id: Ib3ee9b592140d30d12aecefa7dfc5f569fa28a34 --- test/y4m_test.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/y4m_test.cc b/test/y4m_test.cc index 8272263f66..89c6552c5d 100644 --- a/test/y4m_test.cc +++ b/test/y4m_test.cc @@ -196,6 +196,7 @@ static const char kY4MRegularHeader[] = TEST(Y4MHeaderTest, RegularHeader) { libvpx_test::TempOutFile f; + ASSERT_NE(f.file(), nullptr); fwrite(kY4MRegularHeader, 1, sizeof(kY4MRegularHeader), f.file()); fflush(f.file()); EXPECT_EQ(0, fseek(f.file(), 0, 0)); @@ -222,6 +223,7 @@ static const char kY4MLongHeader[] = TEST(Y4MHeaderTest, LongHeader) { libvpx_test::TempOutFile f; + ASSERT_NE(f.file(), nullptr); fwrite(kY4MLongHeader, 1, sizeof(kY4MLongHeader), f.file()); fflush(f.file()); EXPECT_EQ(fseek(f.file(), 0, 0), 0); From 44874ab879455941c977910daba1b80788f243da Mon Sep 17 00:00:00 2001 From: yuanhecai Date: Wed, 25 May 2022 09:42:35 +0800 Subject: [PATCH 0379/1000] loongarch: Remove redundant code Simplify architecture support code and remove redundant code to improve efficiency. Bug: webm:1755 Change-Id: I03bc251aca115b0379fe19907abd165e0876355b --- build/make/rtcd.pl | 15 +- vp8/common/loongarch/loopfilter_filters_lsx.c | 12 +- vpx_dsp/loongarch/loopfilter_16_lsx.c | 180 +++++----- vpx_dsp/loongarch/loopfilter_4_lsx.c | 36 +- vpx_dsp/loongarch/loopfilter_8_lsx.c | 315 +++++++++--------- 5 files changed, 262 insertions(+), 296 deletions(-) diff --git a/build/make/rtcd.pl b/build/make/rtcd.pl index 8ed776add8..9c97268426 100755 --- a/build/make/rtcd.pl +++ b/build/make/rtcd.pl @@ -494,20 +494,7 @@ () @ALL_ARCHS = filter(qw/vsx/); ppc; } elsif ($opts{arch} =~ /loongarch/ ) { - @ALL_ARCHS = filter("$opts{arch}"); - open CONFIG_FILE, $opts{config} or - die "Error opening config file '$opts{config}': $!\n"; - while () { - if (/HAVE_LSX=yes/) { - @ALL_ARCHS = filter("$opts{arch}", qw/lsx/); - last; - } - if (/HAVE_LASX=yes/) { - @ALL_ARCHS = filter("$opts{arch}", qw/lasx/); - last; - } - } - close CONFIG_FILE; + @ALL_ARCHS = filter(qw/lsx lasx/); loongarch; } else { unoptimized; diff --git a/vp8/common/loongarch/loopfilter_filters_lsx.c b/vp8/common/loongarch/loopfilter_filters_lsx.c index f743ec0c50..79c3ea6dbb 100644 --- a/vp8/common/loongarch/loopfilter_filters_lsx.c +++ b/vp8/common/loongarch/loopfilter_filters_lsx.c @@ -172,16 +172,16 @@ static void loop_filter_horizontal_4_dual_lsx(uint8_t *src, int32_t pitch, DUP2_ARG2(__lsx_vldx, src, pitch, src, pitch_x2, q1, q2); q3 = __lsx_vldx(src, pitch_x3); - thresh0 = __lsx_vreplgr2vr_b(*thresh0_ptr); - thresh1 = __lsx_vreplgr2vr_b(*thresh1_ptr); + thresh0 = __lsx_vldrepl_b(thresh0_ptr, 0); + thresh1 = __lsx_vldrepl_b(thresh1_ptr, 0); thresh0 = __lsx_vilvl_d(thresh1, thresh0); - b_limit0 = __lsx_vreplgr2vr_b(*b_limit0_ptr); - b_limit1 = __lsx_vreplgr2vr_b(*b_limit1_ptr); + b_limit0 = __lsx_vldrepl_b(b_limit0_ptr, 0); + b_limit1 = __lsx_vldrepl_b(b_limit1_ptr, 0); b_limit0 = __lsx_vilvl_d(b_limit1, b_limit0); - limit0 = __lsx_vreplgr2vr_b(*limit0_ptr); - limit1 = __lsx_vreplgr2vr_b(*limit1_ptr); + limit0 = __lsx_vldrepl_b(limit0_ptr, 0); + limit1 = __lsx_vldrepl_b(limit1_ptr, 0); limit0 = __lsx_vilvl_d(limit1, limit0); LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev, diff --git a/vpx_dsp/loongarch/loopfilter_16_lsx.c b/vpx_dsp/loongarch/loopfilter_16_lsx.c index 539817777d..0503df9966 100644 --- a/vpx_dsp/loongarch/loopfilter_16_lsx.c +++ b/vpx_dsp/loongarch/loopfilter_16_lsx.c @@ -55,7 +55,6 @@ static int32_t hz_lpf_t4_and_t8_16w(uint8_t *dst, int32_t stride, __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l; __m128i p2_filt8_h, p1_filt8_h, p0_filt8_h; __m128i q0_filt8_h, q1_filt8_h, q2_filt8_h; - __m128i zero = __lsx_vldi(0); int32_t stride2 = stride << 1; int32_t stride3 = stride2 + stride; @@ -69,9 +68,9 @@ static int32_t hz_lpf_t4_and_t8_16w(uint8_t *dst, int32_t stride, DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2); q3 = __lsx_vldx(dst, stride3); - thresh = __lsx_vreplgr2vr_b(*thresh_ptr); - b_limit = __lsx_vreplgr2vr_b(*b_limit_ptr); - limit = __lsx_vreplgr2vr_b(*limit_ptr); + thresh = __lsx_vldrepl_b(thresh_ptr, 0); + b_limit = __lsx_vldrepl_b(b_limit_ptr, 0); + limit = __lsx_vldrepl_b(limit_ptr, 0); /* mask and hev */ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, mask, flat); @@ -87,18 +86,16 @@ static int32_t hz_lpf_t4_and_t8_16w(uint8_t *dst, int32_t stride, return 1; } - DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, - p1_l, p0_l); - DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, - q2_l, q3_l); + DUP4_ARG2(__lsx_vsllwil_hu_bu, p3, 0, p2, 0, p1, 0, p0, 0, p3_l, p2_l, p1_l, + p0_l); + DUP4_ARG2(__lsx_vsllwil_hu_bu, q0, 0, q1, 0, q2, 0, q3, 0, q0_l, q1_l, q2_l, + q3_l); VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l); - DUP4_ARG2(__lsx_vilvh_b, zero, p3, zero, p2, zero, p1, zero, p0, p3_h, p2_h, - p1_h, p0_h); - DUP4_ARG2(__lsx_vilvh_b, zero, q0, zero, q1, zero, q2, zero, q3, q0_h, q1_h, - q2_h, q3_h); + DUP4_ARG1(__lsx_vexth_hu_bu, p3, p2, p1, p0, p3_h, p2_h, p1_h, p0_h); + DUP4_ARG1(__lsx_vexth_hu_bu, q0, q1, q2, q3, q0_h, q1_h, q2_h, q3_h); VP9_FILTER8(p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h, p2_filt8_h, p1_filt8_h, p0_filt8_h, q0_filt8_h, q1_filt8_h, q2_filt8_h); @@ -135,7 +132,6 @@ static void hz_lpf_t16_16w(uint8_t *dst, int32_t stride, uint8_t *filter48) { uint8_t *dst_tmp1 = dst + stride4; __m128i flat, flat2, filter8; - __m128i zero = __lsx_vldi(0); __m128i p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; __m128i out_h, out_l; v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in; @@ -180,16 +176,15 @@ static void hz_lpf_t16_16w(uint8_t *dst, int32_t stride, uint8_t *filter48) { } else { dst = dst_tmp0 - stride3; - p7_l_in = (v8u16)__lsx_vilvl_b(zero, p7); - p6_l_in = (v8u16)__lsx_vilvl_b(zero, p6); - p5_l_in = (v8u16)__lsx_vilvl_b(zero, p5); - p4_l_in = (v8u16)__lsx_vilvl_b(zero, p4); - p3_l_in = (v8u16)__lsx_vilvl_b(zero, p3); - p2_l_in = (v8u16)__lsx_vilvl_b(zero, p2); - p1_l_in = (v8u16)__lsx_vilvl_b(zero, p1); - p0_l_in = (v8u16)__lsx_vilvl_b(zero, p0); - - q0_l_in = (v8u16)__lsx_vilvl_b(zero, q0); + p7_l_in = (v8u16)__lsx_vsllwil_hu_bu(p7, 0); + p6_l_in = (v8u16)__lsx_vsllwil_hu_bu(p6, 0); + p5_l_in = (v8u16)__lsx_vsllwil_hu_bu(p5, 0); + p4_l_in = (v8u16)__lsx_vsllwil_hu_bu(p4, 0); + p3_l_in = (v8u16)__lsx_vsllwil_hu_bu(p3, 0); + p2_l_in = (v8u16)__lsx_vsllwil_hu_bu(p2, 0); + p1_l_in = (v8u16)__lsx_vsllwil_hu_bu(p1, 0); + p0_l_in = (v8u16)__lsx_vsllwil_hu_bu(p0, 0); + q0_l_in = (v8u16)__lsx_vsllwil_hu_bu(q0, 0); tmp0_l = p7_l_in << 3; tmp0_l -= p7_l_in; @@ -205,16 +200,15 @@ static void hz_lpf_t16_16w(uint8_t *dst, int32_t stride, uint8_t *filter48) { out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); - p7_h_in = (v8u16)__lsx_vilvh_b(zero, p7); - p6_h_in = (v8u16)__lsx_vilvh_b(zero, p6); - p5_h_in = (v8u16)__lsx_vilvh_b(zero, p5); - p4_h_in = (v8u16)__lsx_vilvh_b(zero, p4); - - p3_h_in = (v8u16)__lsx_vilvh_b(zero, p3); - p2_h_in = (v8u16)__lsx_vilvh_b(zero, p2); - p1_h_in = (v8u16)__lsx_vilvh_b(zero, p1); - p0_h_in = (v8u16)__lsx_vilvh_b(zero, p0); - q0_h_in = (v8u16)__lsx_vilvh_b(zero, q0); + p7_h_in = (v8u16)__lsx_vexth_hu_bu(p7); + p6_h_in = (v8u16)__lsx_vexth_hu_bu(p6); + p5_h_in = (v8u16)__lsx_vexth_hu_bu(p5); + p4_h_in = (v8u16)__lsx_vexth_hu_bu(p4); + p3_h_in = (v8u16)__lsx_vexth_hu_bu(p3); + p2_h_in = (v8u16)__lsx_vexth_hu_bu(p2); + p1_h_in = (v8u16)__lsx_vexth_hu_bu(p1); + p0_h_in = (v8u16)__lsx_vexth_hu_bu(p0); + q0_h_in = (v8u16)__lsx_vexth_hu_bu(q0); tmp0_h = p7_h_in << 3; tmp0_h -= p7_h_in; @@ -236,14 +230,14 @@ static void hz_lpf_t16_16w(uint8_t *dst, int32_t stride, uint8_t *filter48) { dst += stride; /* p5 */ - q1_l_in = (v8u16)__lsx_vilvl_b(zero, q1); + q1_l_in = (v8u16)__lsx_vsllwil_hu_bu(q1, 0); tmp0_l = p5_l_in - p6_l_in; tmp0_l += q1_l_in; tmp0_l -= p7_l_in; tmp1_l += tmp0_l; out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); - q1_h_in = (v8u16)__lsx_vilvh_b(zero, q1); + q1_h_in = (v8u16)__lsx_vexth_hu_bu(q1); tmp0_h = p5_h_in - p6_h_in; tmp0_h += q1_h_in; tmp0_h -= p7_h_in; @@ -256,14 +250,14 @@ static void hz_lpf_t16_16w(uint8_t *dst, int32_t stride, uint8_t *filter48) { dst += stride; /* p4 */ - q2_l_in = (v8u16)__lsx_vilvl_b(zero, q2); + q2_l_in = (v8u16)__lsx_vsllwil_hu_bu(q2, 0); tmp0_l = p4_l_in - p5_l_in; tmp0_l += q2_l_in; tmp0_l -= p7_l_in; tmp1_l += tmp0_l; out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); - q2_h_in = (v8u16)__lsx_vilvh_b(zero, q2); + q2_h_in = (v8u16)__lsx_vexth_hu_bu(q2); tmp0_h = p4_h_in - p5_h_in; tmp0_h += q2_h_in; tmp0_h -= p7_h_in; @@ -276,14 +270,14 @@ static void hz_lpf_t16_16w(uint8_t *dst, int32_t stride, uint8_t *filter48) { dst += stride; /* p3 */ - q3_l_in = (v8u16)__lsx_vilvl_b(zero, q3); + q3_l_in = (v8u16)__lsx_vsllwil_hu_bu(q3, 0); tmp0_l = p3_l_in - p4_l_in; tmp0_l += q3_l_in; tmp0_l -= p7_l_in; tmp1_l += tmp0_l; out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); - q3_h_in = (v8u16)__lsx_vilvh_b(zero, q3); + q3_h_in = (v8u16)__lsx_vexth_hu_bu(q3); tmp0_h = p3_h_in - p4_h_in; tmp0_h += q3_h_in; tmp0_h -= p7_h_in; @@ -296,7 +290,7 @@ static void hz_lpf_t16_16w(uint8_t *dst, int32_t stride, uint8_t *filter48) { dst += stride; /* p2 */ - q4_l_in = (v8u16)__lsx_vilvl_b(zero, q4); + q4_l_in = (v8u16)__lsx_vsllwil_hu_bu(q4, 0); filter8 = __lsx_vld(filter48, 0); tmp0_l = p2_l_in - p3_l_in; tmp0_l += q4_l_in; @@ -304,7 +298,7 @@ static void hz_lpf_t16_16w(uint8_t *dst, int32_t stride, uint8_t *filter48) { tmp1_l += tmp0_l; out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); - q4_h_in = (v8u16)__lsx_vilvh_b(zero, q4); + q4_h_in = (v8u16)__lsx_vexth_hu_bu(q4); tmp0_h = p2_h_in - p3_h_in; tmp0_h += q4_h_in; tmp0_h -= p7_h_in; @@ -317,7 +311,7 @@ static void hz_lpf_t16_16w(uint8_t *dst, int32_t stride, uint8_t *filter48) { dst += stride; /* p1 */ - q5_l_in = (v8u16)__lsx_vilvl_b(zero, q5); + q5_l_in = (v8u16)__lsx_vsllwil_hu_bu(q5, 0); filter8 = __lsx_vld(filter48, 16); tmp0_l = p1_l_in - p2_l_in; tmp0_l += q5_l_in; @@ -325,7 +319,7 @@ static void hz_lpf_t16_16w(uint8_t *dst, int32_t stride, uint8_t *filter48) { tmp1_l += tmp0_l; out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); - q5_h_in = (v8u16)__lsx_vilvh_b(zero, q5); + q5_h_in = (v8u16)__lsx_vexth_hu_bu(q5); tmp0_h = p1_h_in - p2_h_in; tmp0_h += q5_h_in; tmp0_h -= p7_h_in; @@ -338,7 +332,7 @@ static void hz_lpf_t16_16w(uint8_t *dst, int32_t stride, uint8_t *filter48) { dst += stride; /* p0 */ - q6_l_in = (v8u16)__lsx_vilvl_b(zero, q6); + q6_l_in = (v8u16)__lsx_vsllwil_hu_bu(q6, 0); filter8 = __lsx_vld(filter48, 32); tmp0_l = p0_l_in - p1_l_in; tmp0_l += q6_l_in; @@ -346,7 +340,7 @@ static void hz_lpf_t16_16w(uint8_t *dst, int32_t stride, uint8_t *filter48) { tmp1_l += tmp0_l; out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); - q6_h_in = (v8u16)__lsx_vilvh_b(zero, q6); + q6_h_in = (v8u16)__lsx_vexth_hu_bu(q6); tmp0_h = p0_h_in - p1_h_in; tmp0_h += q6_h_in; tmp0_h -= p7_h_in; @@ -359,7 +353,7 @@ static void hz_lpf_t16_16w(uint8_t *dst, int32_t stride, uint8_t *filter48) { dst += stride; /* q0 */ - q7_l_in = (v8u16)__lsx_vilvl_b(zero, q7); + q7_l_in = (v8u16)__lsx_vsllwil_hu_bu(q7, 0); filter8 = __lsx_vld(filter48, 48); tmp0_l = q7_l_in - p0_l_in; tmp0_l += q0_l_in; @@ -367,7 +361,7 @@ static void hz_lpf_t16_16w(uint8_t *dst, int32_t stride, uint8_t *filter48) { tmp1_l += tmp0_l; out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); - q7_h_in = (v8u16)__lsx_vilvh_b(zero, q7); + q7_h_in = (v8u16)__lsx_vexth_hu_bu(q7); tmp0_h = q7_h_in - p0_h_in; tmp0_h += q0_h_in; tmp0_h -= p7_h_in; @@ -534,9 +528,9 @@ static void mb_lpf_horizontal_edge(uint8_t *dst, int32_t stride, DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2); q3 = __lsx_vldx(dst, stride3); - thresh = __lsx_vreplgr2vr_b(*thresh_ptr); - b_limit = __lsx_vreplgr2vr_b(*b_limit_ptr); - limit = __lsx_vreplgr2vr_b(*limit_ptr); + thresh = __lsx_vldrepl_b(thresh_ptr, 0); + b_limit = __lsx_vldrepl_b(b_limit_ptr, 0); + limit = __lsx_vldrepl_b(limit_ptr, 0); /* filter_mask* */ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, @@ -850,15 +844,14 @@ static int32_t vt_lpf_t4_and_t8_16w(uint8_t *dst, uint8_t *filter48, __m128i p2_filt8_h, p1_filt8_h, p0_filt8_h; __m128i q0_filt8_h, q1_filt8_h, q2_filt8_h; __m128i vec0, vec1, vec2, vec3, vec4, vec5; - __m128i zero = __lsx_vldi(0); /* load vector elements */ DUP4_ARG2(__lsx_vld, dst, -64, dst, -48, dst, -32, dst, -16, p3, p2, p1, p0); DUP4_ARG2(__lsx_vld, dst, 0, dst, 16, dst, 32, dst, 48, q0, q1, q2, q3); - thresh = __lsx_vreplgr2vr_b(*thresh_ptr); - b_limit = __lsx_vreplgr2vr_b(*b_limit_ptr); - limit = __lsx_vreplgr2vr_b(*limit_ptr); + thresh = __lsx_vldrepl_b(thresh_ptr, 0); + b_limit = __lsx_vldrepl_b(b_limit_ptr, 0); + limit = __lsx_vldrepl_b(limit_ptr, 0); /* mask and hev */ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, @@ -901,16 +894,14 @@ static int32_t vt_lpf_t4_and_t8_16w(uint8_t *dst, uint8_t *filter48, return 1; } - DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, - p1_l, p0_l); - DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, - q2_l, q3_l); + DUP4_ARG2(__lsx_vsllwil_hu_bu, p3, 0, p2, 0, p1, 0, p0, 0, p3_l, p2_l, p1_l, + p0_l); + DUP4_ARG2(__lsx_vsllwil_hu_bu, q0, 0, q1, 0, q2, 0, q3, 0, q0_l, q1_l, q2_l, + q3_l); VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l); - DUP4_ARG2(__lsx_vilvh_b, zero, p3, zero, p2, zero, p1, zero, p0, p3_h, p2_h, - p1_h, p0_h); - DUP4_ARG2(__lsx_vilvh_b, zero, q0, zero, q1, zero, q2, zero, q3, q0_h, q1_h, - q2_h, q3_h); + DUP4_ARG1(__lsx_vexth_hu_bu, p3, p2, p1, p0, p3_h, p2_h, p1_h, p0_h); + DUP4_ARG1(__lsx_vexth_hu_bu, q0, q1, q2, q3, q0_h, q1_h, q2_h, q3_h); VP9_FILTER8(p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h, p2_filt8_h, p1_filt8_h, p0_filt8_h, q0_filt8_h, q1_filt8_h, q2_filt8_h); @@ -942,7 +933,6 @@ static int32_t vt_lpf_t4_and_t8_16w(uint8_t *dst, uint8_t *filter48, static int32_t vt_lpf_t16_16w(uint8_t *dst, uint8_t *dst_org, int32_t stride, uint8_t *filter48) { - __m128i zero = __lsx_vldi(0); __m128i flat, flat2, filter8; __m128i p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; __m128i out_l, out_h; @@ -1038,15 +1028,15 @@ static int32_t vt_lpf_t16_16w(uint8_t *dst, uint8_t *dst_org, int32_t stride, dst -= 7 * 16; - p7_l_in = (v8u16)__lsx_vilvl_b(zero, p7); - p6_l_in = (v8u16)__lsx_vilvl_b(zero, p6); - p5_l_in = (v8u16)__lsx_vilvl_b(zero, p5); - p4_l_in = (v8u16)__lsx_vilvl_b(zero, p4); - p3_l_in = (v8u16)__lsx_vilvl_b(zero, p3); - p2_l_in = (v8u16)__lsx_vilvl_b(zero, p2); - p1_l_in = (v8u16)__lsx_vilvl_b(zero, p1); - p0_l_in = (v8u16)__lsx_vilvl_b(zero, p0); - q0_l_in = (v8u16)__lsx_vilvl_b(zero, q0); + p7_l_in = (v8u16)__lsx_vsllwil_hu_bu(p7, 0); + p6_l_in = (v8u16)__lsx_vsllwil_hu_bu(p6, 0); + p5_l_in = (v8u16)__lsx_vsllwil_hu_bu(p5, 0); + p4_l_in = (v8u16)__lsx_vsllwil_hu_bu(p4, 0); + p3_l_in = (v8u16)__lsx_vsllwil_hu_bu(p3, 0); + p2_l_in = (v8u16)__lsx_vsllwil_hu_bu(p2, 0); + p1_l_in = (v8u16)__lsx_vsllwil_hu_bu(p1, 0); + p0_l_in = (v8u16)__lsx_vsllwil_hu_bu(p0, 0); + q0_l_in = (v8u16)__lsx_vsllwil_hu_bu(q0, 0); tmp0_l = p7_l_in << 3; tmp0_l -= p7_l_in; @@ -1060,15 +1050,15 @@ static int32_t vt_lpf_t16_16w(uint8_t *dst, uint8_t *dst_org, int32_t stride, tmp1_l += p0_l_in; tmp1_l += tmp0_l; out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); - p7_h_in = (v8u16)__lsx_vilvh_b(zero, p7); - p6_h_in = (v8u16)__lsx_vilvh_b(zero, p6); - p5_h_in = (v8u16)__lsx_vilvh_b(zero, p5); - p4_h_in = (v8u16)__lsx_vilvh_b(zero, p4); - p3_h_in = (v8u16)__lsx_vilvh_b(zero, p3); - p2_h_in = (v8u16)__lsx_vilvh_b(zero, p2); - p1_h_in = (v8u16)__lsx_vilvh_b(zero, p1); - p0_h_in = (v8u16)__lsx_vilvh_b(zero, p0); - q0_h_in = (v8u16)__lsx_vilvh_b(zero, q0); + p7_h_in = (v8u16)__lsx_vexth_hu_bu(p7); + p6_h_in = (v8u16)__lsx_vexth_hu_bu(p6); + p5_h_in = (v8u16)__lsx_vexth_hu_bu(p5); + p4_h_in = (v8u16)__lsx_vexth_hu_bu(p4); + p3_h_in = (v8u16)__lsx_vexth_hu_bu(p3); + p2_h_in = (v8u16)__lsx_vexth_hu_bu(p2); + p1_h_in = (v8u16)__lsx_vexth_hu_bu(p1); + p0_h_in = (v8u16)__lsx_vexth_hu_bu(p0); + q0_h_in = (v8u16)__lsx_vexth_hu_bu(q0); tmp0_h = p7_h_in << 3; tmp0_h -= p7_h_in; @@ -1088,13 +1078,13 @@ static int32_t vt_lpf_t16_16w(uint8_t *dst, uint8_t *dst_org, int32_t stride, __lsx_vst(p6, dst, 0); /* p5 */ - q1_l_in = (v8u16)__lsx_vilvl_b(zero, q1); + q1_l_in = (v8u16)__lsx_vsllwil_hu_bu(q1, 0); tmp0_l = p5_l_in - p6_l_in; tmp0_l += q1_l_in; tmp0_l -= p7_l_in; tmp1_l += tmp0_l; out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); - q1_h_in = (v8u16)__lsx_vilvh_b(zero, q1); + q1_h_in = (v8u16)__lsx_vexth_hu_bu(q1); tmp0_h = p5_h_in - p6_h_in; tmp0_h += q1_h_in; tmp0_h -= p7_h_in; @@ -1105,13 +1095,13 @@ static int32_t vt_lpf_t16_16w(uint8_t *dst, uint8_t *dst_org, int32_t stride, __lsx_vst(p5, dst, 16); /* p4 */ - q2_l_in = (v8u16)__lsx_vilvl_b(zero, q2); + q2_l_in = (v8u16)__lsx_vsllwil_hu_bu(q2, 0); tmp0_l = p4_l_in - p5_l_in; tmp0_l += q2_l_in; tmp0_l -= p7_l_in; tmp1_l += tmp0_l; out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); - q2_h_in = (v8u16)__lsx_vilvh_b(zero, q2); + q2_h_in = (v8u16)__lsx_vexth_hu_bu(q2); tmp0_h = p4_h_in - p5_h_in; tmp0_h += q2_h_in; tmp0_h -= p7_h_in; @@ -1122,13 +1112,13 @@ static int32_t vt_lpf_t16_16w(uint8_t *dst, uint8_t *dst_org, int32_t stride, __lsx_vst(p4, dst, 16 * 2); /* p3 */ - q3_l_in = (v8u16)__lsx_vilvl_b(zero, q3); + q3_l_in = (v8u16)__lsx_vsllwil_hu_bu(q3, 0); tmp0_l = p3_l_in - p4_l_in; tmp0_l += q3_l_in; tmp0_l -= p7_l_in; tmp1_l += tmp0_l; out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); - q3_h_in = (v8u16)__lsx_vilvh_b(zero, q3); + q3_h_in = (v8u16)__lsx_vexth_hu_bu(q3); tmp0_h = p3_h_in - p4_h_in; tmp0_h += q3_h_in; tmp0_h -= p7_h_in; @@ -1139,14 +1129,14 @@ static int32_t vt_lpf_t16_16w(uint8_t *dst, uint8_t *dst_org, int32_t stride, __lsx_vst(p3, dst, 16 * 3); /* p2 */ - q4_l_in = (v8u16)__lsx_vilvl_b(zero, q4); + q4_l_in = (v8u16)__lsx_vsllwil_hu_bu(q4, 0); filter8 = __lsx_vld(filter48, 0); tmp0_l = p2_l_in - p3_l_in; tmp0_l += q4_l_in; tmp0_l -= p7_l_in; tmp1_l += tmp0_l; out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); - q4_h_in = (v8u16)__lsx_vilvh_b(zero, q4); + q4_h_in = (v8u16)__lsx_vexth_hu_bu(q4); tmp0_h = p2_h_in - p3_h_in; tmp0_h += q4_h_in; tmp0_h -= p7_h_in; @@ -1157,14 +1147,14 @@ static int32_t vt_lpf_t16_16w(uint8_t *dst, uint8_t *dst_org, int32_t stride, __lsx_vst(filter8, dst, 16 * 4); /* p1 */ - q5_l_in = (v8u16)__lsx_vilvl_b(zero, q5); + q5_l_in = (v8u16)__lsx_vsllwil_hu_bu(q5, 0); filter8 = __lsx_vld(filter48, 16); tmp0_l = p1_l_in - p2_l_in; tmp0_l += q5_l_in; tmp0_l -= p7_l_in; tmp1_l += tmp0_l; out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); - q5_h_in = (v8u16)__lsx_vilvh_b(zero, q5); + q5_h_in = (v8u16)__lsx_vexth_hu_bu(q5); tmp0_h = p1_h_in - p2_h_in; tmp0_h += q5_h_in; tmp0_h -= p7_h_in; @@ -1175,14 +1165,14 @@ static int32_t vt_lpf_t16_16w(uint8_t *dst, uint8_t *dst_org, int32_t stride, __lsx_vst(filter8, dst, 16 * 5); /* p0 */ - q6_l_in = (v8u16)__lsx_vilvl_b(zero, q6); + q6_l_in = (v8u16)__lsx_vsllwil_hu_bu(q6, 0); filter8 = __lsx_vld(filter48, 32); tmp0_l = p0_l_in - p1_l_in; tmp0_l += q6_l_in; tmp0_l -= p7_l_in; tmp1_l += tmp0_l; out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); - q6_h_in = (v8u16)__lsx_vilvh_b(zero, q6); + q6_h_in = (v8u16)__lsx_vexth_hu_bu(q6); tmp0_h = p0_h_in - p1_h_in; tmp0_h += q6_h_in; tmp0_h -= p7_h_in; @@ -1193,14 +1183,14 @@ static int32_t vt_lpf_t16_16w(uint8_t *dst, uint8_t *dst_org, int32_t stride, __lsx_vst(filter8, dst, 16 * 6); /* q0 */ - q7_l_in = (v8u16)__lsx_vilvl_b(zero, q7); + q7_l_in = (v8u16)__lsx_vsllwil_hu_bu(q7, 0); filter8 = __lsx_vld(filter48, 48); tmp0_l = q7_l_in - p0_l_in; tmp0_l += q0_l_in; tmp0_l -= p7_l_in; tmp1_l += tmp0_l; out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); - q7_h_in = (v8u16)__lsx_vilvh_b(zero, q7); + q7_h_in = (v8u16)__lsx_vexth_hu_bu(q7); tmp0_h = q7_h_in - p0_h_in; tmp0_h += q0_h_in; tmp0_h -= p7_h_in; diff --git a/vpx_dsp/loongarch/loopfilter_4_lsx.c b/vpx_dsp/loongarch/loopfilter_4_lsx.c index e8abf0523f..9300b5c5ae 100644 --- a/vpx_dsp/loongarch/loopfilter_4_lsx.c +++ b/vpx_dsp/loongarch/loopfilter_4_lsx.c @@ -27,9 +27,9 @@ void vpx_lpf_horizontal_4_lsx(uint8_t *src, int32_t pitch, DUP2_ARG2(__lsx_vldx, src, pitch, src, pitch2, q1, q2); q3 = __lsx_vldx(src, pitch3); - thresh = __lsx_vreplgr2vr_b(*thresh_ptr); - b_limit = __lsx_vreplgr2vr_b(*b_limit_ptr); - limit = __lsx_vreplgr2vr_b(*limit_ptr); + thresh = __lsx_vldrepl_b(thresh_ptr, 0); + b_limit = __lsx_vldrepl_b(b_limit_ptr, 0); + limit = __lsx_vldrepl_b(limit_ptr, 0); LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, mask, flat); @@ -60,16 +60,16 @@ void vpx_lpf_horizontal_4_dual_lsx(uint8_t *src, int32_t pitch, DUP2_ARG2(__lsx_vldx, src, pitch, src, pitch2, q1, q2); q3 = __lsx_vldx(src, pitch3); - thresh0 = __lsx_vreplgr2vr_b(*thresh0_ptr); - thresh1 = __lsx_vreplgr2vr_b(*thresh1_ptr); + thresh0 = __lsx_vldrepl_b(thresh0_ptr, 0); + thresh1 = __lsx_vldrepl_b(thresh1_ptr, 0); thresh0 = __lsx_vilvl_d(thresh1, thresh0); - b_limit0 = __lsx_vreplgr2vr_b(*b_limit0_ptr); - b_limit1 = __lsx_vreplgr2vr_b(*b_limit1_ptr); + b_limit0 = __lsx_vldrepl_b(b_limit0_ptr, 0); + b_limit1 = __lsx_vldrepl_b(b_limit1_ptr, 0); b_limit0 = __lsx_vilvl_d(b_limit1, b_limit0); - limit0 = __lsx_vreplgr2vr_b(*limit0_ptr); - limit1 = __lsx_vreplgr2vr_b(*limit1_ptr); + limit0 = __lsx_vldrepl_b(limit0_ptr, 0); + limit1 = __lsx_vldrepl_b(limit1_ptr, 0); limit0 = __lsx_vilvl_d(limit1, limit0); LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev, @@ -102,9 +102,9 @@ void vpx_lpf_vertical_4_lsx(uint8_t *src, int32_t pitch, DUP2_ARG2(__lsx_vldx, src_tmp, pitch, src_tmp, pitch2, q1, q2); q3 = __lsx_vldx(src_tmp, pitch3); - thresh = __lsx_vreplgr2vr_b(*thresh_ptr); - b_limit = __lsx_vreplgr2vr_b(*b_limit_ptr); - limit = __lsx_vreplgr2vr_b(*limit_ptr); + thresh = __lsx_vldrepl_b(thresh_ptr, 0); + b_limit = __lsx_vldrepl_b(b_limit_ptr, 0); + limit = __lsx_vldrepl_b(limit_ptr, 0); LSX_TRANSPOSE8x8_B(p3, p2, p1, p0, q0, q1, q2, q3, p3, p2, p1, p0, q0, q1, q2, q3); @@ -169,16 +169,16 @@ void vpx_lpf_vertical_4_dual_lsx(uint8_t *src, int32_t pitch, row9, row10, row11, row12, row13, row14, row15, p3, p2, p1, p0, q0, q1, q2, q3); - thresh0 = __lsx_vreplgr2vr_b(*thresh0_ptr); - thresh1 = __lsx_vreplgr2vr_b(*thresh1_ptr); + thresh0 = __lsx_vldrepl_b(thresh0_ptr, 0); + thresh1 = __lsx_vldrepl_b(thresh1_ptr, 0); thresh0 = __lsx_vilvl_d(thresh1, thresh0); - b_limit0 = __lsx_vreplgr2vr_b(*b_limit0_ptr); - b_limit1 = __lsx_vreplgr2vr_b(*b_limit1_ptr); + b_limit0 = __lsx_vldrepl_b(b_limit0_ptr, 0); + b_limit1 = __lsx_vldrepl_b(b_limit1_ptr, 0); b_limit0 = __lsx_vilvl_d(b_limit1, b_limit0); - limit0 = __lsx_vreplgr2vr_b(*limit0_ptr); - limit1 = __lsx_vreplgr2vr_b(*limit1_ptr); + limit0 = __lsx_vldrepl_b(limit0_ptr, 0); + limit1 = __lsx_vldrepl_b(limit1_ptr, 0); limit0 = __lsx_vilvl_d(limit1, limit0); LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev, diff --git a/vpx_dsp/loongarch/loopfilter_8_lsx.c b/vpx_dsp/loongarch/loopfilter_8_lsx.c index 358e221662..00219ba71d 100644 --- a/vpx_dsp/loongarch/loopfilter_8_lsx.c +++ b/vpx_dsp/loongarch/loopfilter_8_lsx.c @@ -17,11 +17,10 @@ void vpx_lpf_horizontal_8_lsx(uint8_t *dst, int32_t stride, const uint8_t *thresh_ptr) { __m128i mask, hev, flat, thresh, b_limit, limit; __m128i p3, p2, p1, p0, q3, q2, q1, q0; - __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; + __m128i p2_out, p1_out, p0_out, q0_out, q1_out; __m128i p2_filter8, p1_filter8, p0_filter8; __m128i q0_filter8, q1_filter8, q2_filter8; __m128i p3_l, p2_l, p1_l, p0_l, q3_l, q2_l, q1_l, q0_l; - __m128i zero = __lsx_vldi(0); int32_t stride2 = stride << 1; int32_t stride3 = stride2 + stride; @@ -34,16 +33,16 @@ void vpx_lpf_horizontal_8_lsx(uint8_t *dst, int32_t stride, DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2); q3 = __lsx_vldx(dst, stride3); - thresh = __lsx_vreplgr2vr_b(*thresh_ptr); - b_limit = __lsx_vreplgr2vr_b(*b_limit_ptr); - limit = __lsx_vreplgr2vr_b(*limit_ptr); + thresh = __lsx_vldrepl_b(thresh_ptr, 0); + b_limit = __lsx_vldrepl_b(b_limit_ptr, 0); + limit = __lsx_vldrepl_b(limit_ptr, 0); LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, mask, flat); VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); - flat = __lsx_vilvl_d(zero, flat); + flat = __lsx_vilvl_d(flat, flat); if (__lsx_bz_v(flat)) { __lsx_vstelm_d(p1_out, dst - stride2, 0, 0); @@ -51,35 +50,35 @@ void vpx_lpf_horizontal_8_lsx(uint8_t *dst, int32_t stride, __lsx_vstelm_d(q0_out, dst, 0, 0); __lsx_vstelm_d(q1_out, dst + stride, 0, 0); } else { - DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, - p1_l, p0_l); - DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, - q2_l, q3_l); + DUP4_ARG2(__lsx_vsllwil_hu_bu, p3, 0, p2, 0, p1, 0, p0, 0, p3_l, p2_l, p1_l, + p0_l); + DUP4_ARG2(__lsx_vsllwil_hu_bu, q0, 0, q1, 0, q2, 0, q3, 0, q0_l, q1_l, q2_l, + q3_l); VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filter8, p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8); - /* convert 16 bit output data into 8 bit */ - DUP4_ARG2(__lsx_vpickev_b, zero, p2_filter8, zero, p1_filter8, zero, - p0_filter8, zero, q0_filter8, p2_filter8, p1_filter8, p0_filter8, - q0_filter8); - DUP2_ARG2(__lsx_vpickev_b, zero, q1_filter8, zero, q2_filter8, q1_filter8, - q2_filter8); - DUP4_ARG3(__lsx_vbitsel_v, p2, p2_filter8, flat, p1_out, p1_filter8, flat, - p0_out, p0_filter8, flat, q0_out, q0_filter8, flat, p2_out, - p1_out, p0_out, q0_out); - DUP2_ARG3(__lsx_vbitsel_v, q1_out, q1_filter8, flat, q2, q2_filter8, flat, - q1_out, q2_out); + DUP2_ARG2(__lsx_vpickev_b, p1_filter8, p2_filter8, q0_filter8, p0_filter8, + p1_filter8, q0_filter8); + q2_filter8 = __lsx_vpickev_b(q2_filter8, q1_filter8); + + p2 = __lsx_vilvl_d(p1_out, p2); + p0_out = __lsx_vilvl_d(q0_out, p0_out); + q1_out = __lsx_vilvl_d(q2, q1_out); + + DUP2_ARG3(__lsx_vbitsel_v, p2, p1_filter8, flat, p0_out, q0_filter8, flat, + p2_out, p1_out); + p0_out = __lsx_vbitsel_v(q1_out, q2_filter8, flat); dst -= stride3; __lsx_vstelm_d(p2_out, dst, 0, 0); - __lsx_vstelm_d(p1_out, dst + stride, 0, 0); - __lsx_vstelm_d(p0_out, dst + stride2, 0, 0); - __lsx_vstelm_d(q0_out, dst + stride3, 0, 0); + __lsx_vstelm_d(p2_out, dst + stride, 0, 1); + __lsx_vstelm_d(p1_out, dst + stride2, 0, 0); + __lsx_vstelm_d(p1_out, dst + stride3, 0, 1); dst += stride4; - __lsx_vstelm_d(q1_out, dst, 0, 0); + __lsx_vstelm_d(p0_out, dst, 0, 0); dst += stride; - __lsx_vstelm_d(q2_out, dst, 0, 0); + __lsx_vstelm_d(p0_out, dst, 0, 1); } } @@ -89,14 +88,13 @@ void vpx_lpf_horizontal_8_dual_lsx( const uint8_t *limit1, const uint8_t *thresh1) { __m128i p3, p2, p1, p0, q3, q2, q1, q0; __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; - __m128i flat, mask, hev, tmp, thresh, b_limit, limit; + __m128i flat, mask, hev, thresh, b_limit, limit; __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l; __m128i p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h; __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l; __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l; __m128i p2_filt8_h, p1_filt8_h, p0_filt8_h; __m128i q0_filt8_h, q1_filt8_h, q2_filt8_h; - __m128i zero = __lsx_vldi(0); int32_t stride2 = stride << 1; int32_t stride3 = stride2 + stride; @@ -108,17 +106,17 @@ void vpx_lpf_horizontal_8_dual_lsx( DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2); q3 = __lsx_vldx(dst, stride3); - thresh = __lsx_vreplgr2vr_b(*thresh0); - tmp = __lsx_vreplgr2vr_b(*thresh1); - thresh = __lsx_vilvl_d(tmp, thresh); + thresh = __lsx_vldrepl_b(thresh0, 0); + p2_out = __lsx_vldrepl_b(thresh1, 0); + thresh = __lsx_vilvl_d(p2_out, thresh); - b_limit = __lsx_vreplgr2vr_b(*b_limit0); - tmp = __lsx_vreplgr2vr_b(*b_limit1); - b_limit = __lsx_vilvl_d(tmp, b_limit); + b_limit = __lsx_vldrepl_b(b_limit0, 0); + p2_out = __lsx_vldrepl_b(b_limit1, 0); + b_limit = __lsx_vilvl_d(p2_out, b_limit); - limit = __lsx_vreplgr2vr_b(*limit0); - tmp = __lsx_vreplgr2vr_b(*limit1); - limit = __lsx_vilvl_d(tmp, limit); + limit = __lsx_vldrepl_b(limit0, 0); + p2_out = __lsx_vldrepl_b(limit1, 0); + limit = __lsx_vilvl_d(p2_out, limit); /* mask and hev */ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, @@ -132,17 +130,15 @@ void vpx_lpf_horizontal_8_dual_lsx( __lsx_vst(q0_out, dst, 0); __lsx_vst(q1_out, dst + stride, 0); } else { - DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, - p1_l, p0_l); - DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, - q2_l, q3_l); + DUP4_ARG2(__lsx_vsllwil_hu_bu, p3, 0, p2, 0, p1, 0, p0, 0, p3_l, p2_l, p1_l, + p0_l); + DUP4_ARG2(__lsx_vsllwil_hu_bu, q0, 0, q1, 0, q2, 0, q3, 0, q0_l, q1_l, q2_l, + q3_l); VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l); - DUP4_ARG2(__lsx_vilvh_b, zero, p3, zero, p2, zero, p1, zero, p0, p3_h, p2_h, - p1_h, p0_h); - DUP4_ARG2(__lsx_vilvh_b, zero, q0, zero, q1, zero, q2, zero, q3, q0_h, q1_h, - q2_h, q3_h); + DUP4_ARG1(__lsx_vexth_hu_bu, p3, p2, p1, p0, p3_h, p2_h, p1_h, p0_h); + DUP4_ARG1(__lsx_vexth_hu_bu, q0, q1, q2, q3, q0_h, q1_h, q2_h, q3_h); VP9_FILTER8(p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h, p2_filt8_h, p1_filt8_h, p0_filt8_h, q0_filt8_h, q1_filt8_h, q2_filt8_h); @@ -180,7 +176,6 @@ void vpx_lpf_vertical_8_lsx(uint8_t *dst, int32_t stride, __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l; __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l; __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l; - __m128i vec0, vec1, vec2, vec3, vec4; __m128i zero = __lsx_vldi(0); int32_t stride2 = stride << 1; @@ -200,9 +195,9 @@ void vpx_lpf_vertical_8_lsx(uint8_t *dst, int32_t stride, LSX_TRANSPOSE8x8_B(p3, p2, p1, p0, q0, q1, q2, q3, p3, p2, p1, p0, q0, q1, q2, q3); - thresh = __lsx_vreplgr2vr_b(*thresh_ptr); - b_limit = __lsx_vreplgr2vr_b(*b_limit_ptr); - limit = __lsx_vreplgr2vr_b(*limit_ptr); + thresh = __lsx_vldrepl_b(thresh_ptr, 0); + b_limit = __lsx_vldrepl_b(b_limit_ptr, 0); + limit = __lsx_vldrepl_b(limit_ptr, 0); /* mask and hev */ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, @@ -217,20 +212,20 @@ void vpx_lpf_vertical_8_lsx(uint8_t *dst, int32_t stride, /* if flat is zero for all pixels, then no need to calculate other filter */ if (__lsx_bz_v(flat)) { /* Store 4 pixels p1-_q1 */ - DUP2_ARG2(__lsx_vilvl_b, p0_out, p1_out, q1_out, q0_out, vec0, vec1); - vec2 = __lsx_vilvl_h(vec1, vec0); - vec3 = __lsx_vilvh_h(vec1, vec0); + DUP2_ARG2(__lsx_vilvl_b, p0_out, p1_out, q1_out, q0_out, p0, p1); + p2 = __lsx_vilvl_h(p1, p0); + p3 = __lsx_vilvh_h(p1, p0); dst -= 2; - __lsx_vstelm_w(vec2, dst, 0, 0); - __lsx_vstelm_w(vec2, dst + stride, 0, 1); - __lsx_vstelm_w(vec2, dst + stride2, 0, 2); - __lsx_vstelm_w(vec2, dst + stride3, 0, 3); + __lsx_vstelm_w(p2, dst, 0, 0); + __lsx_vstelm_w(p2, dst + stride, 0, 1); + __lsx_vstelm_w(p2, dst + stride2, 0, 2); + __lsx_vstelm_w(p2, dst + stride3, 0, 3); dst += stride4; - __lsx_vstelm_w(vec3, dst, 0, 0); - __lsx_vstelm_w(vec3, dst + stride, 0, 1); - __lsx_vstelm_w(vec3, dst + stride2, 0, 2); - __lsx_vstelm_w(vec3, dst + stride3, 0, 3); + __lsx_vstelm_w(p3, dst, 0, 0); + __lsx_vstelm_w(p3, dst + stride, 0, 1); + __lsx_vstelm_w(p3, dst + stride2, 0, 2); + __lsx_vstelm_w(p3, dst + stride3, 0, 3); } else { DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l); @@ -253,35 +248,34 @@ void vpx_lpf_vertical_8_lsx(uint8_t *dst, int32_t stride, q2 = __lsx_vbitsel_v(q2, q2_filt8_l, flat); /* Store 6 pixels p2-_q2 */ - DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, vec0, vec1); - vec2 = __lsx_vilvl_h(vec1, vec0); - vec3 = __lsx_vilvh_h(vec1, vec0); - vec4 = __lsx_vilvl_b(q2, q1); - + DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, p3, q3); + p1 = __lsx_vilvl_h(q3, p3); + p2 = __lsx_vilvh_h(q3, p3); + p3 = __lsx_vilvl_b(q2, q1); dst -= 3; - __lsx_vstelm_w(vec2, dst, 0, 0); - __lsx_vstelm_h(vec4, dst, 4, 0); + __lsx_vstelm_w(p1, dst, 0, 0); + __lsx_vstelm_h(p3, dst, 4, 0); dst += stride; - __lsx_vstelm_w(vec2, dst, 0, 1); - __lsx_vstelm_h(vec4, dst, 4, 1); + __lsx_vstelm_w(p1, dst, 0, 1); + __lsx_vstelm_h(p3, dst, 4, 1); dst += stride; - __lsx_vstelm_w(vec2, dst, 0, 2); - __lsx_vstelm_h(vec4, dst, 4, 2); + __lsx_vstelm_w(p1, dst, 0, 2); + __lsx_vstelm_h(p3, dst, 4, 2); dst += stride; - __lsx_vstelm_w(vec2, dst, 0, 3); - __lsx_vstelm_h(vec4, dst, 4, 3); + __lsx_vstelm_w(p1, dst, 0, 3); + __lsx_vstelm_h(p3, dst, 4, 3); dst += stride; - __lsx_vstelm_w(vec3, dst, 0, 0); - __lsx_vstelm_h(vec4, dst, 4, 4); + __lsx_vstelm_w(p2, dst, 0, 0); + __lsx_vstelm_h(p3, dst, 4, 4); dst += stride; - __lsx_vstelm_w(vec3, dst, 0, 1); - __lsx_vstelm_h(vec4, dst, 4, 5); + __lsx_vstelm_w(p2, dst, 0, 1); + __lsx_vstelm_h(p3, dst, 4, 5); dst += stride; - __lsx_vstelm_w(vec3, dst, 0, 2); - __lsx_vstelm_h(vec4, dst, 4, 6); + __lsx_vstelm_w(p2, dst, 0, 2); + __lsx_vstelm_h(p3, dst, 4, 6); dst += stride; - __lsx_vstelm_w(vec3, dst, 0, 3); - __lsx_vstelm_h(vec4, dst, 4, 7); + __lsx_vstelm_w(p2, dst, 0, 3); + __lsx_vstelm_h(p3, dst, 4, 7); } } @@ -301,8 +295,6 @@ void vpx_lpf_vertical_8_dual_lsx(uint8_t *dst, int32_t stride, __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l; __m128i p2_filt8_h, p1_filt8_h, p0_filt8_h; __m128i q0_filt8_h, q1_filt8_h, q2_filt8_h; - __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; - __m128i zero = __lsx_vldi(0); int32_t stride2 = stride << 1; int32_t stride3 = stride2 + stride; int32_t stride4 = stride2 << 1; @@ -329,17 +321,17 @@ void vpx_lpf_vertical_8_dual_lsx(uint8_t *dst, int32_t stride, row12, row13, row14, row15, p3, p2, p1, p0, q0, q1, q2, q3); - thresh = __lsx_vreplgr2vr_b(*thresh0); - vec0 = __lsx_vreplgr2vr_b(*thresh1); - thresh = __lsx_vilvl_d(vec0, thresh); + thresh = __lsx_vldrepl_b(thresh0, 0); + p1_out = __lsx_vldrepl_b(thresh1, 0); + thresh = __lsx_vilvl_d(p1_out, thresh); - b_limit = __lsx_vreplgr2vr_b(*b_limit0); - vec0 = __lsx_vreplgr2vr_b(*b_limit1); - b_limit = __lsx_vilvl_d(vec0, b_limit); + b_limit = __lsx_vldrepl_b(b_limit0, 0); + p1_out = __lsx_vldrepl_b(b_limit1, 0); + b_limit = __lsx_vilvl_d(p1_out, b_limit); - limit = __lsx_vreplgr2vr_b(*limit0); - vec0 = __lsx_vreplgr2vr_b(*limit1); - limit = __lsx_vilvl_d(vec0, limit); + limit = __lsx_vldrepl_b(limit0, 0); + p1_out = __lsx_vldrepl_b(limit1, 0); + limit = __lsx_vilvl_d(p1_out, limit); /* mask and hev */ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, @@ -350,44 +342,41 @@ void vpx_lpf_vertical_8_dual_lsx(uint8_t *dst, int32_t stride, VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); /* if flat is zero for all pixels, then no need to calculate other filter */ if (__lsx_bz_v(flat)) { - DUP2_ARG2(__lsx_vilvl_b, p0_out, p1_out, q1_out, q0_out, vec0, vec1); - vec2 = __lsx_vilvl_h(vec1, vec0); - vec3 = __lsx_vilvh_h(vec1, vec0); - DUP2_ARG2(__lsx_vilvh_b, p0_out, p1_out, q1_out, q0_out, vec0, vec1); - vec4 = __lsx_vilvl_h(vec1, vec0); - vec5 = __lsx_vilvh_h(vec1, vec0); - + DUP2_ARG2(__lsx_vilvl_b, p0_out, p1_out, q1_out, q0_out, p0, p1); + p2 = __lsx_vilvl_h(p1, p0); + p3 = __lsx_vilvh_h(p1, p0); + DUP2_ARG2(__lsx_vilvh_b, p0_out, p1_out, q1_out, q0_out, p0, p1); + q2 = __lsx_vilvl_h(p1, p0); + q3 = __lsx_vilvh_h(p1, p0); dst -= 2; - __lsx_vstelm_w(vec2, dst, 0, 0); - __lsx_vstelm_w(vec2, dst + stride, 0, 1); - __lsx_vstelm_w(vec2, dst + stride2, 0, 2); - __lsx_vstelm_w(vec2, dst + stride3, 0, 3); + __lsx_vstelm_w(p2, dst, 0, 0); + __lsx_vstelm_w(p2, dst + stride, 0, 1); + __lsx_vstelm_w(p2, dst + stride2, 0, 2); + __lsx_vstelm_w(p2, dst + stride3, 0, 3); dst += stride4; - __lsx_vstelm_w(vec3, dst, 0, 0); - __lsx_vstelm_w(vec3, dst + stride, 0, 1); - __lsx_vstelm_w(vec3, dst + stride2, 0, 2); - __lsx_vstelm_w(vec3, dst + stride3, 0, 3); + __lsx_vstelm_w(p3, dst, 0, 0); + __lsx_vstelm_w(p3, dst + stride, 0, 1); + __lsx_vstelm_w(p3, dst + stride2, 0, 2); + __lsx_vstelm_w(p3, dst + stride3, 0, 3); dst += stride4; - __lsx_vstelm_w(vec4, dst, 0, 0); - __lsx_vstelm_w(vec4, dst + stride, 0, 1); - __lsx_vstelm_w(vec4, dst + stride2, 0, 2); - __lsx_vstelm_w(vec4, dst + stride3, 0, 3); + __lsx_vstelm_w(q2, dst, 0, 0); + __lsx_vstelm_w(q2, dst + stride, 0, 1); + __lsx_vstelm_w(q2, dst + stride2, 0, 2); + __lsx_vstelm_w(q2, dst + stride3, 0, 3); dst += stride4; - __lsx_vstelm_w(vec5, dst, 0, 0); - __lsx_vstelm_w(vec5, dst + stride, 0, 1); - __lsx_vstelm_w(vec5, dst + stride2, 0, 2); - __lsx_vstelm_w(vec5, dst + stride3, 0, 3); + __lsx_vstelm_w(q3, dst, 0, 0); + __lsx_vstelm_w(q3, dst + stride, 0, 1); + __lsx_vstelm_w(q3, dst + stride2, 0, 2); + __lsx_vstelm_w(q3, dst + stride3, 0, 3); } else { - DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, - p1_l, p0_l); - DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, - q2_l, q3_l); + DUP4_ARG2(__lsx_vsllwil_hu_bu, p3, 0, p2, 0, p1, 0, p0, 0, p3_l, p2_l, p1_l, + p0_l); + DUP4_ARG2(__lsx_vsllwil_hu_bu, q0, 0, q1, 0, q2, 0, q3, 0, q0_l, q1_l, q2_l, + q3_l); VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l); - DUP4_ARG2(__lsx_vilvh_b, zero, p3, zero, p2, zero, p1, zero, p0, p3_h, p2_h, - p1_h, p0_h); - DUP4_ARG2(__lsx_vilvh_b, zero, q0, zero, q1, zero, q2, zero, q3, q0_h, q1_h, - q2_h, q3_h); + DUP4_ARG1(__lsx_vexth_hu_bu, p3, p2, p1, p0, p3_h, p2_h, p1_h, p0_h); + DUP4_ARG1(__lsx_vexth_hu_bu, q0, q1, q2, q3, q0_h, q1_h, q2_h, q3_h); /* filter8 */ VP9_FILTER8(p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h, p2_filt8_h, @@ -408,62 +397,62 @@ void vpx_lpf_vertical_8_dual_lsx(uint8_t *dst, int32_t stride, q1 = __lsx_vbitsel_v(q1_out, q1_filt8_l, flat); q2 = __lsx_vbitsel_v(q2, q2_filt8_l, flat); - DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, vec0, vec1); - vec3 = __lsx_vilvl_h(vec1, vec0); - vec4 = __lsx_vilvh_h(vec1, vec0); - DUP2_ARG2(__lsx_vilvh_b, p1, p2, q0, p0, vec0, vec1); - vec6 = __lsx_vilvl_h(vec1, vec0); - vec7 = __lsx_vilvh_h(vec1, vec0); - vec2 = __lsx_vilvl_b(q2, q1); - vec5 = __lsx_vilvh_b(q2, q1); + DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, p3, q3); + p2_filt8_l = __lsx_vilvl_h(q3, p3); + p2_filt8_h = __lsx_vilvh_h(q3, p3); + DUP2_ARG2(__lsx_vilvh_b, p1, p2, q0, p0, p3, q3); + p0_filt8_l = __lsx_vilvl_h(q3, p3); + p0_filt8_h = __lsx_vilvh_h(q3, p3); + q1_filt8_l = __lsx_vilvl_b(q2, q1); + q1_filt8_h = __lsx_vilvh_b(q2, q1); dst -= 3; - __lsx_vstelm_w(vec3, dst, 0, 0); - __lsx_vstelm_h(vec2, dst, 4, 0); + __lsx_vstelm_w(p2_filt8_l, dst, 0, 0); + __lsx_vstelm_h(q1_filt8_l, dst, 4, 0); dst += stride; - __lsx_vstelm_w(vec3, dst, 0, 1); - __lsx_vstelm_h(vec2, dst, 4, 1); + __lsx_vstelm_w(p2_filt8_l, dst, 0, 1); + __lsx_vstelm_h(q1_filt8_l, dst, 4, 1); dst += stride; - __lsx_vstelm_w(vec3, dst, 0, 2); - __lsx_vstelm_h(vec2, dst, 4, 2); + __lsx_vstelm_w(p2_filt8_l, dst, 0, 2); + __lsx_vstelm_h(q1_filt8_l, dst, 4, 2); dst += stride; - __lsx_vstelm_w(vec3, dst, 0, 3); - __lsx_vstelm_h(vec2, dst, 4, 3); + __lsx_vstelm_w(p2_filt8_l, dst, 0, 3); + __lsx_vstelm_h(q1_filt8_l, dst, 4, 3); dst += stride; - __lsx_vstelm_w(vec4, dst, 0, 0); - __lsx_vstelm_h(vec2, dst, 4, 4); + __lsx_vstelm_w(p2_filt8_h, dst, 0, 0); + __lsx_vstelm_h(q1_filt8_l, dst, 4, 4); dst += stride; - __lsx_vstelm_w(vec4, dst, 0, 1); - __lsx_vstelm_h(vec2, dst, 4, 5); + __lsx_vstelm_w(p2_filt8_h, dst, 0, 1); + __lsx_vstelm_h(q1_filt8_l, dst, 4, 5); dst += stride; - __lsx_vstelm_w(vec4, dst, 0, 2); - __lsx_vstelm_h(vec2, dst, 4, 6); + __lsx_vstelm_w(p2_filt8_h, dst, 0, 2); + __lsx_vstelm_h(q1_filt8_l, dst, 4, 6); dst += stride; - __lsx_vstelm_w(vec4, dst, 0, 3); - __lsx_vstelm_h(vec2, dst, 4, 7); + __lsx_vstelm_w(p2_filt8_h, dst, 0, 3); + __lsx_vstelm_h(q1_filt8_l, dst, 4, 7); dst += stride; - __lsx_vstelm_w(vec6, dst, 0, 0); - __lsx_vstelm_h(vec5, dst, 4, 0); + __lsx_vstelm_w(p0_filt8_l, dst, 0, 0); + __lsx_vstelm_h(q1_filt8_h, dst, 4, 0); dst += stride; - __lsx_vstelm_w(vec6, dst, 0, 1); - __lsx_vstelm_h(vec5, dst, 4, 1); + __lsx_vstelm_w(p0_filt8_l, dst, 0, 1); + __lsx_vstelm_h(q1_filt8_h, dst, 4, 1); dst += stride; - __lsx_vstelm_w(vec6, dst, 0, 2); - __lsx_vstelm_h(vec5, dst, 4, 2); + __lsx_vstelm_w(p0_filt8_l, dst, 0, 2); + __lsx_vstelm_h(q1_filt8_h, dst, 4, 2); dst += stride; - __lsx_vstelm_w(vec6, dst, 0, 3); - __lsx_vstelm_h(vec5, dst, 4, 3); + __lsx_vstelm_w(p0_filt8_l, dst, 0, 3); + __lsx_vstelm_h(q1_filt8_h, dst, 4, 3); dst += stride; - __lsx_vstelm_w(vec7, dst, 0, 0); - __lsx_vstelm_h(vec5, dst, 4, 4); + __lsx_vstelm_w(p0_filt8_h, dst, 0, 0); + __lsx_vstelm_h(q1_filt8_h, dst, 4, 4); dst += stride; - __lsx_vstelm_w(vec7, dst, 0, 1); - __lsx_vstelm_h(vec5, dst, 4, 5); + __lsx_vstelm_w(p0_filt8_h, dst, 0, 1); + __lsx_vstelm_h(q1_filt8_h, dst, 4, 5); dst += stride; - __lsx_vstelm_w(vec7, dst, 0, 2); - __lsx_vstelm_h(vec5, dst, 4, 6); + __lsx_vstelm_w(p0_filt8_h, dst, 0, 2); + __lsx_vstelm_h(q1_filt8_h, dst, 4, 6); dst += stride; - __lsx_vstelm_w(vec7, dst, 0, 3); - __lsx_vstelm_h(vec5, dst, 4, 7); + __lsx_vstelm_w(p0_filt8_h, dst, 0, 3); + __lsx_vstelm_h(q1_filt8_h, dst, 4, 7); } } From 58919dd7f1c96faffc89f3875267690bc2bbd237 Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 20 May 2022 19:15:47 -0700 Subject: [PATCH 0380/1000] GetTempOutFile(): use testing::TempDir() rather than tmpfile(). this allows for setting the path with TEST_TMPDIR and provides a valid default for android. Change-Id: Iecb26f381b6a6ec97da62cfa0b7200f427440a2f --- test/video_source.h | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/test/video_source.h b/test/video_source.h index 7a2dbe7ef7..349e3de37c 100644 --- a/test/video_source.h +++ b/test/video_source.h @@ -20,8 +20,14 @@ #endif #include #include +#include +#include #include + #include "test/acm_random.h" +#if !defined(_WIN32) +#include "third_party/googletest/src/include/gtest/gtest.h" +#endif #include "vpx/vpx_encoder.h" namespace libvpx_test { @@ -72,7 +78,23 @@ static FILE *GetTempOutFile(std::string *file_name) { } return NULL; #else - return tmpfile(); + std::string temp_dir = testing::TempDir(); + if (temp_dir.empty()) return NULL; + // Versions of testing::TempDir() prior to release-1.11.0-214-g5e6a5336 may + // use the value of an environment variable without checking for a trailing + // path delimiter. + if (temp_dir[temp_dir.size() - 1] != '/') temp_dir += '/'; + const char name_template[] = "libvpxtest.XXXXXX"; + std::unique_ptr temp_file_name( + new char[temp_dir.size() + sizeof(name_template)]); + if (temp_file_name == nullptr) return NULL; + memcpy(temp_file_name.get(), temp_dir.data(), temp_dir.size()); + memcpy(temp_file_name.get() + temp_dir.size(), name_template, + sizeof(name_template)); + const int fd = mkstemp(temp_file_name.get()); + if (fd == -1) return NULL; + *file_name = temp_file_name.get(); + return fdopen(fd, "wb+"); #endif } From 9f1329f8ac88ea5d7c6ae5d6a57221c36cf85ac8 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Wed, 25 May 2022 23:39:42 +0000 Subject: [PATCH 0381/1000] Revert "[NEON] Optimize vp9_diamond_search_sad() for NEON" This reverts commit 258affdeab68ed59e181368baa46e2f1d077b0ab. Reason for revert: Not bitexact with C version Original change's description: > [NEON] Optimize vp9_diamond_search_sad() for NEON > > About 50% improvement in comparison to the C function. > I have followed the AVX version with some simplifications. > > Change-Id: I72ddbdb2fbc5ed8a7f0210703fe05523a37db1c9 Change-Id: I5c210b3dfe1f6dec525da857dd8c83946be566fc --- vp9/common/vp9_rtcd_defs.pl | 2 +- .../arm/neon/vp9_diamond_search_sad_neon.c | 322 ------------------ vp9/vp9cx.mk | 1 - 3 files changed, 1 insertion(+), 324 deletions(-) delete mode 100644 vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index e6b65c96f0..4da0b6675b 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -175,7 +175,7 @@ () # Motion search # add_proto qw/int vp9_diamond_search_sad/, "const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv"; -specialize qw/vp9_diamond_search_sad avx neon/; +specialize qw/vp9_diamond_search_sad avx/; # # Apply temporal filter diff --git a/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c b/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c deleted file mode 100644 index e56733d43e..0000000000 --- a/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c +++ /dev/null @@ -1,322 +0,0 @@ -/* - * Copyright (c) 2022 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include -#include - -#include "vpx_dsp/vpx_dsp_common.h" -#include "vp9/encoder/vp9_encoder.h" -#include "vpx_ports/mem.h" - -#ifdef __GNUC__ -#define LIKELY(v) __builtin_expect(v, 1) -#define UNLIKELY(v) __builtin_expect(v, 0) -#else -#define LIKELY(v) (v) -#define UNLIKELY(v) (v) -#endif - -static INLINE int_mv pack_int_mv(int16_t row, int16_t col) { - int_mv result; - result.as_mv.row = row; - result.as_mv.col = col; - return result; -} - -static INLINE MV_JOINT_TYPE get_mv_joint(const int_mv mv) { - // This is simplified from the C implementation to utilise that - // x->nmvjointsadcost[1] == x->nmvjointsadcost[2] and - // x->nmvjointsadcost[1] == x->nmvjointsadcost[3] - return mv.as_int == 0 ? 0 : 1; -} - -static INLINE int mv_cost(const int_mv mv, const int *joint_cost, - int *const comp_cost[2]) { - assert(mv.as_mv.row >= -MV_MAX && mv.as_mv.row < MV_MAX); - assert(mv.as_mv.col >= -MV_MAX && mv.as_mv.col < MV_MAX); - return joint_cost[get_mv_joint(mv)] + comp_cost[0][mv.as_mv.row] + - comp_cost[1][mv.as_mv.col]; -} - -static int mvsad_err_cost(const MACROBLOCK *x, const int_mv mv, const MV *ref, - int sad_per_bit) { - const int_mv diff = - pack_int_mv(mv.as_mv.row - ref->row, mv.as_mv.col - ref->col); - return ROUND_POWER_OF_TWO( - (unsigned)mv_cost(diff, x->nmvjointsadcost, x->nmvsadcost) * sad_per_bit, - VP9_PROB_COST_SHIFT); -} - -/***************************************************************************** - * This function utilizes 3 properties of the cost function lookup tables, * - * constructed in using 'cal_nmvjointsadcost' and 'cal_nmvsadcosts' in * - * vp9_encoder.c. * - * For the joint cost: * - * - mvjointsadcost[1] == mvjointsadcost[2] == mvjointsadcost[3] * - * For the component costs: * - * - For all i: mvsadcost[0][i] == mvsadcost[1][i] * - * (Equal costs for both components) * - * - For all i: mvsadcost[0][i] == mvsadcost[0][-i] * - * (Cost function is even) * - * If these do not hold, then this function cannot be used without * - * modification, in which case you can revert to using the C implementation, * - * which does not rely on these properties. * - *****************************************************************************/ -int vp9_diamond_search_sad_neon(const MACROBLOCK *x, - const search_site_config *cfg, MV *ref_mv, - MV *best_mv, int search_param, int sad_per_bit, - int *num00, const vp9_variance_fn_ptr_t *fn_ptr, - const MV *center_mv) { - static const uint32_t data[4] = { 0, 1, 2, 3 }; - const uint32x4_t v_idx_d = vld1q_u32((const uint32_t *)data); - - const int32x4_t zero_s32 = vdupq_n_s32(0); - const int_mv maxmv = pack_int_mv(x->mv_limits.row_max, x->mv_limits.col_max); - const int16x8_t v_max_mv_w = vreinterpretq_s16_s32(vdupq_n_s32(maxmv.as_int)); - const int_mv minmv = pack_int_mv(x->mv_limits.row_min, x->mv_limits.col_min); - const int16x8_t v_min_mv_w = vreinterpretq_s16_s32(vdupq_n_s32(minmv.as_int)); - - const int32x4_t v_spb_d = vdupq_n_s32(sad_per_bit); - - const int32x4_t v_joint_cost_0_d = vdupq_n_s32(x->nmvjointsadcost[0]); - const int32x4_t v_joint_cost_1_d = vdupq_n_s32(x->nmvjointsadcost[1]); - - // search_param determines the length of the initial step and hence the number - // of iterations. - // 0 = initial step (MAX_FIRST_STEP) pel - // 1 = (MAX_FIRST_STEP/2) pel, - // 2 = (MAX_FIRST_STEP/4) pel... - const MV *ss_mv = &cfg->ss_mv[cfg->searches_per_step * search_param]; - const intptr_t *ss_os = &cfg->ss_os[cfg->searches_per_step * search_param]; - const int tot_steps = cfg->total_steps - search_param; - - const int_mv fcenter_mv = - pack_int_mv(center_mv->row >> 3, center_mv->col >> 3); - const int16x8_t vfcmv = vdupq_n_s16(fcenter_mv.as_int); - - const int ref_row = clamp(ref_mv->row, minmv.as_mv.row, maxmv.as_mv.row); - const int ref_col = clamp(ref_mv->col, minmv.as_mv.col, maxmv.as_mv.col); - - int_mv bmv = pack_int_mv(ref_row, ref_col); - int_mv new_bmv = bmv; - int16x8_t v_bmv_w = vreinterpretq_s16_s32(vdupq_n_s32(bmv.as_int)); - - const int what_stride = x->plane[0].src.stride; - const int in_what_stride = x->e_mbd.plane[0].pre[0].stride; - const uint8_t *const what = x->plane[0].src.buf; - const uint8_t *const in_what = - x->e_mbd.plane[0].pre[0].buf + ref_row * in_what_stride + ref_col; - - // Work out the start point for the search - const uint8_t *best_address = in_what; - const uint8_t *new_best_address = best_address; -#if defined(__aarch64__) - int64x2_t v_ba_q = vdupq_n_s64((intptr_t)best_address); -#else - int32x4_t v_ba_d = vdupq_n_s32((intptr_t)best_address); -#endif - unsigned int best_sad = INT_MAX; - int i, j, step; - - // Check the prerequisite cost function properties that are easy to check - // in an assert. See the function-level documentation for details on all - // prerequisites. - assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[2]); - assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[3]); - - // Check the starting position - best_sad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride); - best_sad += mvsad_err_cost(x, bmv, &fcenter_mv.as_mv, sad_per_bit); - - *num00 = 0; - - for (i = 0, step = 0; step < tot_steps; step++) { - for (j = 0; j < cfg->searches_per_step; j += 4, i += 4) { - int16x8_t v_diff_mv_w; - int8x16_t v_inside_d; - uint32x4_t v_outside_d; - int32x4_t v_cost_d, v_sad_d; -#if defined(__aarch64__) - int64x2_t v_blocka[2]; -#else - int32x4_t v_blocka[1]; - uint32x2_t horiz_max_0, horiz_max_1; -#endif - - uint32_t horiz_max; - // Compute the candidate motion vectors - const int16x8_t v_ss_mv_w = vld1q_s16((const int16_t *)&ss_mv[i]); - const int16x8_t v_these_mv_w = vaddq_s16(v_bmv_w, v_ss_mv_w); - // Clamp them to the search bounds - int16x8_t v_these_mv_clamp_w = v_these_mv_w; - v_these_mv_clamp_w = vminq_s16(v_these_mv_clamp_w, v_max_mv_w); - v_these_mv_clamp_w = vmaxq_s16(v_these_mv_clamp_w, v_min_mv_w); - // The ones that did not change are inside the search area - v_inside_d = vreinterpretq_s8_u32( - vceqq_s32(vreinterpretq_s32_s16(v_these_mv_clamp_w), - vreinterpretq_s32_s16(v_these_mv_w))); - - // If none of them are inside, then move on -#if defined(__aarch64__) - horiz_max = vmaxvq_u32(vreinterpretq_u32_s8(v_inside_d)); -#else - horiz_max_0 = vmax_u32(vget_low_u32(vreinterpretq_u32_s8(v_inside_d)), - vget_high_u32(vreinterpretq_u32_s8(v_inside_d))); - horiz_max_1 = vpmax_u32(horiz_max_0, horiz_max_0); - vst1_lane_u32(&horiz_max, horiz_max_1, 0); -#endif - if (LIKELY(horiz_max == 0)) { - continue; - } - - // The inverse mask indicates which of the MVs are outside - v_outside_d = - vreinterpretq_u32_s8(veorq_s8(v_inside_d, vdupq_n_s8((int8_t)0xff))); - // Shift right to keep the sign bit clear, we will use this later - // to set the cost to the maximum value. - v_outside_d = vshrq_n_u32(v_outside_d, 1); - - // Compute the difference MV - v_diff_mv_w = vsubq_s16(v_these_mv_clamp_w, vfcmv); - // We utilise the fact that the cost function is even, and use the - // absolute difference. This allows us to use unsigned indexes later - // and reduces cache pressure somewhat as only a half of the table - // is ever referenced. - v_diff_mv_w = vabsq_s16(v_diff_mv_w); - - // Compute the SIMD pointer offsets. - { -#if defined(__aarch64__) // sizeof(intptr_t) == 8 - // Load the offsets - int64x2_t v_bo10_q = vld1q_s64((const int64_t *)&ss_os[i + 0]); - int64x2_t v_bo32_q = vld1q_s64((const int64_t *)&ss_os[i + 2]); - // Set the ones falling outside to zero - v_bo10_q = vandq_s64( - v_bo10_q, - vmovl_s32(vget_low_s32(vreinterpretq_s32_s8(v_inside_d)))); - v_bo32_q = vandq_s64( - v_bo32_q, - vmovl_s32(vget_high_s32(vreinterpretq_s32_s8(v_inside_d)))); - // Compute the candidate addresses - v_blocka[0] = vaddq_s64(v_ba_q, v_bo10_q); - v_blocka[1] = vaddq_s64(v_ba_q, v_bo32_q); -#else // sizeof(intptr_t) == 4 - int32x4_t v_bo_d = vld1q_s32((const int32_t *)&ss_os[i]); - v_bo_d = vandq_s32(v_bo_d, vreinterpretq_s32_s8(v_inside_d)); - v_blocka[0] = vaddq_s32(v_ba_d, v_bo_d); -#endif - } - - fn_ptr->sdx4df(what, what_stride, (const uint8_t **)&v_blocka[0], - in_what_stride, (uint32_t *)&v_sad_d); - - // Look up the component cost of the residual motion vector - { - uint32_t cost[4]; - int16_t __attribute__((aligned(16))) rowcol[8]; - vst1q_s16(rowcol, v_diff_mv_w); - - // Note: This is a use case for gather instruction - cost[0] = x->nmvsadcost[0][rowcol[0]] + x->nmvsadcost[0][rowcol[1]]; - cost[1] = x->nmvsadcost[0][rowcol[2]] + x->nmvsadcost[0][rowcol[3]]; - cost[2] = x->nmvsadcost[0][rowcol[4]] + x->nmvsadcost[0][rowcol[5]]; - cost[3] = x->nmvsadcost[0][rowcol[6]] + x->nmvsadcost[0][rowcol[7]]; - - v_cost_d = vld1q_s32((int32_t *)cost); - } - - // Now add in the joint cost - { - const uint32x4_t v_sel_d = - vceqq_s32(vreinterpretq_s32_s16(v_diff_mv_w), zero_s32); - const int32x4_t v_joint_cost_d = vreinterpretq_s32_u8( - vbslq_u8(vreinterpretq_u8_u32(v_sel_d), - vreinterpretq_u8_s32(v_joint_cost_0_d), - vreinterpretq_u8_s32(v_joint_cost_1_d))); - v_cost_d = vaddq_s32(v_cost_d, v_joint_cost_d); - } - - // Multiply by sad_per_bit - v_cost_d = vmulq_s32(v_cost_d, v_spb_d); - // ROUND_POWER_OF_TWO(v_cost_d, VP9_PROB_COST_SHIFT) - v_cost_d = - vaddq_s32(v_cost_d, vdupq_n_s32(1 << (VP9_PROB_COST_SHIFT - 1))); - v_cost_d = vshrq_n_s32(v_cost_d, VP9_PROB_COST_SHIFT); - // Add the cost to the sad - v_sad_d = vaddq_s32(v_sad_d, v_cost_d); - - // Make the motion vectors outside the search area have max cost - // by or'ing in the comparison mask, this way the minimum search won't - // pick them. - v_sad_d = vorrq_s32(v_sad_d, vreinterpretq_s32_u32(v_outside_d)); - - // Find the minimum value and index horizontally in v_sad_d - { - uint32_t local_best_sad; -#if defined(__aarch64__) - local_best_sad = vminvq_u32(vreinterpretq_u32_s32(v_sad_d)); -#else - uint32x2_t horiz_min_0 = - vmin_u32(vget_low_u32(vreinterpretq_u32_s32(v_sad_d)), - vget_high_u32(vreinterpretq_u32_s32(v_sad_d))); - uint32x2_t horiz_min_1 = vpmin_u32(horiz_min_0, horiz_min_0); - vst1_lane_u32(&local_best_sad, horiz_min_1, 0); -#endif - - // Update the global minimum if the local minimum is smaller - if (LIKELY(local_best_sad < best_sad)) { -#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(__clang__) -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#endif - uint32_t local_best_idx; - const uint32x4_t v_sel_d = - vceqq_s32(v_sad_d, vdupq_n_s32(local_best_sad)); - uint32x4_t v_mask_d = vandq_u32(v_sel_d, v_idx_d); - v_mask_d = vbslq_u32(v_sel_d, v_mask_d, vdupq_n_u32(0xffffffff)); - -#if defined(__aarch64__) - local_best_idx = vminvq_u32(v_mask_d); -#else - horiz_min_0 = - vmin_u32(vget_low_u32(v_mask_d), vget_high_u32(v_mask_d)); - horiz_min_1 = vpmin_u32(horiz_min_0, horiz_min_0); - vst1_lane_u32(&local_best_idx, horiz_min_1, 0); -#endif - - new_bmv = ((const int_mv *)&v_these_mv_w)[local_best_idx]; -#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(__clang__) -#pragma GCC diagnostic pop -#endif - new_best_address = ((const uint8_t **)v_blocka)[local_best_idx]; - - best_sad = local_best_sad; - } - } - } - - bmv = new_bmv; - best_address = new_best_address; - - v_bmv_w = vreinterpretq_s16_s32(vdupq_n_s32(bmv.as_int)); -#if defined(__aarch64__) - v_ba_q = vdupq_n_s64((intptr_t)best_address); -#else - v_ba_d = vdupq_n_s32((intptr_t)best_address); -#endif - - if (UNLIKELY(best_address == in_what)) { - (*num00)++; - } - } - - *best_mv = bmv.as_mv; - return best_sad; -} diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk index c9afd9a347..92a7fddb9d 100644 --- a/vp9/vp9cx.mk +++ b/vp9/vp9cx.mk @@ -113,7 +113,6 @@ VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/temporal_filter_constants.h VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.c VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_quantize_avx2.c VP9_CX_SRCS-$(HAVE_AVX) += encoder/x86/vp9_diamond_search_sad_avx.c -VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_diamond_search_sad_neon.c ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_block_error_intrin_sse2.c VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/highbd_temporal_filter_sse4.c From 4832bcff20d34ed25a771f3704cfe0a046dbd0f9 Mon Sep 17 00:00:00 2001 From: Cheng Chen Date: Fri, 13 May 2022 10:59:06 -0700 Subject: [PATCH 0382/1000] L2E: Add control type for the external rate control API Two control types are defined: QP and GOP control. Now the API only supports the QP model. Change-Id: Ib3a712964b9d2282c93993ee56e0558e4795fb46 --- test/vp9_ext_ratectrl_test.cc | 1 + vp9/encoder/vp9_ext_ratectrl.c | 4 ++-- vpx/vpx_ext_ratectrl.h | 14 +++++++++++++- 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/test/vp9_ext_ratectrl_test.cc b/test/vp9_ext_ratectrl_test.cc index 60a350b84e..f6ce778456 100644 --- a/test/vp9_ext_ratectrl_test.cc +++ b/test/vp9_ext_ratectrl_test.cc @@ -176,6 +176,7 @@ class ExtRateCtrlTest : public ::libvpx_test::EncoderTest, ::libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { vpx_rc_funcs_t rc_funcs; + rc_funcs.rc_type = VPX_RC_QP; rc_funcs.create_model = rc_create_model; rc_funcs.send_firstpass_stats = rc_send_firstpass_stats; rc_funcs.get_encodeframe_decision = rc_get_encodeframe_decision; diff --git a/vp9/encoder/vp9_ext_ratectrl.c b/vp9/encoder/vp9_ext_ratectrl.c index 9f0098ab5a..67f58329cc 100644 --- a/vp9/encoder/vp9_ext_ratectrl.c +++ b/vp9/encoder/vp9_ext_ratectrl.c @@ -143,7 +143,7 @@ vpx_codec_err_t vp9_extrc_get_encodeframe_decision( if (ext_ratectrl == NULL) { return VPX_CODEC_INVALID_PARAM; } - if (ext_ratectrl->ready) { + if (ext_ratectrl->ready && ext_ratectrl->funcs.rc_type == VPX_RC_QP) { vpx_rc_status_t rc_status; vpx_rc_encodeframe_info_t encode_frame_info; encode_frame_info.show_index = show_index; @@ -172,7 +172,7 @@ vpx_codec_err_t vp9_extrc_update_encodeframe_result( if (ext_ratectrl == NULL) { return VPX_CODEC_INVALID_PARAM; } - if (ext_ratectrl->ready) { + if (ext_ratectrl->ready && ext_ratectrl->funcs.rc_type == VPX_RC_QP) { PSNR_STATS psnr; vpx_rc_status_t rc_status; vpx_rc_encodeframe_result_t encode_frame_result; diff --git a/vpx/vpx_ext_ratectrl.h b/vpx/vpx_ext_ratectrl.h index a193e55953..5b426b6cf0 100644 --- a/vpx/vpx_ext_ratectrl.h +++ b/vpx/vpx_ext_ratectrl.h @@ -25,7 +25,15 @@ extern "C" { * types, removing or reassigning enums, adding/removing/rearranging * fields to structures. */ -#define VPX_EXT_RATECTRL_ABI_VERSION (1) +#define VPX_EXT_RATECTRL_ABI_VERSION (2) + +/*!\brief The control type of the inference API. + * In VPX_RC_QP mode, the external rate control model determines the + * quantization parameter (QP) for each frame. + * In VPX_RC_GOP mode, the external rate control model determines the + * group of picture (GOP) of the video sequence. + */ +typedef enum vpx_rc_type { VPX_RC_QP = 0, VPX_RC_GOP = 1 } vpx_rc_type_t; /*!\brief Abstract rate control model handler * @@ -327,6 +335,10 @@ typedef vpx_rc_status_t (*vpx_rc_delete_model_cb_fn_t)( * VP9E_SET_EXTERNAL_RATE_CONTROL. */ typedef struct vpx_rc_funcs { + /*! + * The rate control type of this API. + */ + vpx_rc_type_t rc_type; /*! * Create an external rate control model. */ From 3e7685cf621af3e876274f7be9fef83e3d35de3d Mon Sep 17 00:00:00 2001 From: Cheng Chen Date: Fri, 13 May 2022 13:42:28 -0700 Subject: [PATCH 0383/1000] L2E: Add vp9 GOP decision helper function Add a helper function to call the external rate control model. The helper function is placed in the function where vp9 determines GOP decisions. The helper function passes frame information, including current frame show index, coding index, etc to the external rate control model, and then receives GOP decisions. The received GOP decisions overwrites the default GOP decision, only when the external rate control model is set to be active via the codec control. The decision should satisfy a few constraints, for example, larger than min_gf_interval; smaller than max_gf_interval. Otherwise, return error. Unit tests are added to test the new functionality. Change-Id: Id129b4e1a91c844ee5c356a7801c862b1130a3d8 --- test/vp9_ext_ratectrl_test.cc | 147 ++++++++++++++++++++++++++++++++- vp9/encoder/vp9_encoder.c | 8 +- vp9/encoder/vp9_ext_ratectrl.c | 33 +++++++- vp9/encoder/vp9_ext_ratectrl.h | 4 + vp9/encoder/vp9_firstpass.c | 32 +++++++ vp9/encoder/vp9_ratectrl.h | 4 + vpx/vpx_ext_ratectrl.h | 45 +++++++++- 7 files changed, 267 insertions(+), 6 deletions(-) diff --git a/test/vp9_ext_ratectrl_test.cc b/test/vp9_ext_ratectrl_test.cc index f6ce778456..e3e7afbf42 100644 --- a/test/vp9_ext_ratectrl_test.cc +++ b/test/vp9_ext_ratectrl_test.cc @@ -17,24 +17,43 @@ #include "test/yuv_video_source.h" #include "third_party/googletest/src/include/gtest/gtest.h" #include "vpx/vpx_ext_ratectrl.h" +#include "vpx_dsp/vpx_dsp_common.h" namespace { constexpr int kModelMagicNumber = 51396; constexpr uintptr_t PrivMagicNumber = 5566; constexpr int kFrameNum = 5; +constexpr int kFrameNumGOP = 30; constexpr int kLosslessCodingIndex = 2; +constexpr int kFixedGOPSize = 9; +// The range check in vp9_cx_iface.c shows that the max +// lag in buffer is MAX_LAG_BUFFERS (25): +// RANGE_CHECK_HI(cfg, g_lag_in_frames, MAX_LAG_BUFFERS); +constexpr int kMaxLagInFrames = 25; +constexpr int kDefaultMinGfInterval = 4; +constexpr int kDefaultMaxGfInterval = 16; +// The two pass rate control does not respect the input +// min_gf_interval and max_gf_interval. +// See function "get_active_gf_inverval_range". +// The numbers below are from manual inspection. +constexpr int kReadMinGfInterval = 5; +constexpr int kReadMaxGfInterval = 13; struct ToyRateCtrl { int magic_number; int coding_index; + + int gop_id; + int frames_since_key; + int show_index; }; vpx_rc_status_t rc_create_model(void *priv, const vpx_rc_config_t *ratectrl_config, vpx_rc_model_t *rate_ctrl_model_pt) { ToyRateCtrl *toy_rate_ctrl = new (std::nothrow) ToyRateCtrl; - EXPECT_NE(toy_rate_ctrl, nullptr); + if (toy_rate_ctrl == nullptr) return VPX_RC_ERROR; toy_rate_ctrl->magic_number = kModelMagicNumber; toy_rate_ctrl->coding_index = -1; *rate_ctrl_model_pt = toy_rate_ctrl; @@ -48,6 +67,27 @@ vpx_rc_status_t rc_create_model(void *priv, return VPX_RC_OK; } +vpx_rc_status_t rc_create_model_gop(void *priv, + const vpx_rc_config_t *ratectrl_config, + vpx_rc_model_t *rate_ctrl_model_pt) { + ToyRateCtrl *toy_rate_ctrl = new (std::nothrow) ToyRateCtrl; + if (toy_rate_ctrl == nullptr) return VPX_RC_ERROR; + toy_rate_ctrl->magic_number = kModelMagicNumber; + toy_rate_ctrl->gop_id = 0; + toy_rate_ctrl->frames_since_key = 0; + toy_rate_ctrl->show_index = 0; + toy_rate_ctrl->coding_index = 0; + *rate_ctrl_model_pt = toy_rate_ctrl; + EXPECT_EQ(priv, reinterpret_cast(PrivMagicNumber)); + EXPECT_EQ(ratectrl_config->frame_width, 640); + EXPECT_EQ(ratectrl_config->frame_height, 360); + EXPECT_EQ(ratectrl_config->show_frame_count, kFrameNumGOP); + EXPECT_EQ(ratectrl_config->target_bitrate_kbps, 4000); + EXPECT_EQ(ratectrl_config->frame_rate_num, 30); + EXPECT_EQ(ratectrl_config->frame_rate_den, 1); + return VPX_RC_OK; +} + vpx_rc_status_t rc_send_firstpass_stats( vpx_rc_model_t rate_ctrl_model, const vpx_rc_firstpass_stats_t *first_pass_stats) { @@ -61,6 +101,19 @@ vpx_rc_status_t rc_send_firstpass_stats( return VPX_RC_OK; } +vpx_rc_status_t rc_send_firstpass_stats_gop( + vpx_rc_model_t rate_ctrl_model, + const vpx_rc_firstpass_stats_t *first_pass_stats) { + const ToyRateCtrl *toy_rate_ctrl = + static_cast(rate_ctrl_model); + EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); + EXPECT_EQ(first_pass_stats->num_frames, kFrameNumGOP); + for (int i = 0; i < first_pass_stats->num_frames; ++i) { + EXPECT_DOUBLE_EQ(first_pass_stats->frame_stats[i].frame, i); + } + return VPX_RC_OK; +} + vpx_rc_status_t rc_get_encodeframe_decision( vpx_rc_model_t rate_ctrl_model, const vpx_rc_encodeframe_info_t *encode_frame_info, @@ -133,6 +186,41 @@ vpx_rc_status_t rc_get_encodeframe_decision( return VPX_RC_OK; } +vpx_rc_status_t rc_get_gop_decision(vpx_rc_model_t rate_ctrl_model, + const vpx_rc_gop_info_t *gop_info, + vpx_rc_gop_decision_t *gop_decision) { + ToyRateCtrl *toy_rate_ctrl = static_cast(rate_ctrl_model); + EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); + EXPECT_EQ(gop_info->lag_in_frames, kMaxLagInFrames); + EXPECT_EQ(gop_info->min_gf_interval, kReadMinGfInterval); + EXPECT_EQ(gop_info->max_gf_interval, kReadMaxGfInterval); + EXPECT_EQ(gop_info->allow_alt_ref, 1); + if (gop_info->is_key_frame) { + EXPECT_EQ(gop_info->last_gop_use_alt_ref, 0); + EXPECT_EQ(gop_info->frames_since_key, 0); + EXPECT_EQ(gop_info->gop_id, 0); + toy_rate_ctrl->gop_id = 0; + toy_rate_ctrl->frames_since_key = 0; + } else { + EXPECT_EQ(gop_info->last_gop_use_alt_ref, 1); + } + EXPECT_EQ(gop_info->gop_id, toy_rate_ctrl->gop_id); + EXPECT_EQ(gop_info->frames_since_key, toy_rate_ctrl->frames_since_key); + EXPECT_EQ(gop_info->show_index, toy_rate_ctrl->show_index); + EXPECT_EQ(gop_info->coding_index, toy_rate_ctrl->coding_index); + + gop_decision->gop_coding_frames = + VPXMIN(kFixedGOPSize, gop_info->frames_to_key); + gop_decision->use_alt_ref = gop_decision->gop_coding_frames == kFixedGOPSize; + toy_rate_ctrl->frames_since_key += + gop_decision->gop_coding_frames - gop_decision->use_alt_ref; + toy_rate_ctrl->show_index += + gop_decision->gop_coding_frames - gop_decision->use_alt_ref; + toy_rate_ctrl->coding_index += gop_decision->gop_coding_frames; + ++toy_rate_ctrl->gop_id; + return VPX_RC_OK; +} + vpx_rc_status_t rc_update_encodeframe_result( vpx_rc_model_t rate_ctrl_model, const vpx_rc_encodeframe_result_t *encode_frame_result) { @@ -153,6 +241,18 @@ vpx_rc_status_t rc_update_encodeframe_result( return VPX_RC_OK; } +vpx_rc_status_t rc_update_encodeframe_result_gop( + vpx_rc_model_t rate_ctrl_model, + const vpx_rc_encodeframe_result_t *encode_frame_result) { + const ToyRateCtrl *toy_rate_ctrl = + static_cast(rate_ctrl_model); + EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); + + const int64_t ref_pixel_count = 640 * 360 * 3 / 2; + EXPECT_EQ(encode_frame_result->pixel_count, ref_pixel_count); + return VPX_RC_OK; +} + vpx_rc_status_t rc_delete_model(vpx_rc_model_t rate_ctrl_model) { ToyRateCtrl *toy_rate_ctrl = static_cast(rate_ctrl_model); EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); @@ -200,4 +300,49 @@ TEST_F(ExtRateCtrlTest, EncodeTest) { ASSERT_NO_FATAL_FAILURE(RunLoop(video.get())); } +class ExtRateCtrlTestGOP : public ::libvpx_test::EncoderTest, + public ::libvpx_test::CodecTestWithParam { + protected: + ExtRateCtrlTestGOP() : EncoderTest(&::libvpx_test::kVP9) {} + + ~ExtRateCtrlTestGOP() override = default; + + void SetUp() override { + InitializeConfig(); + SetMode(::libvpx_test::kTwoPassGood); + } + + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { + if (video->frame() == 0) { + encoder->Control(VP9E_SET_MIN_GF_INTERVAL, kDefaultMinGfInterval); + encoder->Control(VP9E_SET_MAX_GF_INTERVAL, kDefaultMaxGfInterval); + + vpx_rc_funcs_t rc_funcs; + rc_funcs.rc_type = VPX_RC_GOP; + rc_funcs.create_model = rc_create_model_gop; + rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop; + rc_funcs.get_gop_decision = rc_get_gop_decision; + rc_funcs.update_encodeframe_result = rc_update_encodeframe_result_gop; + rc_funcs.delete_model = rc_delete_model; + rc_funcs.priv = reinterpret_cast(PrivMagicNumber); + encoder->Control(VP9E_SET_EXTERNAL_RATE_CONTROL, &rc_funcs); + } + } +}; + +TEST_F(ExtRateCtrlTestGOP, EncodeTest) { + cfg_.rc_target_bitrate = 4000; + cfg_.g_lag_in_frames = kMaxLagInFrames; + cfg_.rc_end_usage = VPX_VBR; + + std::unique_ptr video; + video.reset(new (std::nothrow) libvpx_test::YUVVideoSource( + "noisy_clip_640_360.y4m", VPX_IMG_FMT_I420, 640, 360, 30, 1, 0, + kFrameNumGOP)); + + ASSERT_NE(video.get(), nullptr); + ASSERT_NO_FATAL_FAILURE(RunLoop(video.get())); +} + } // namespace diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 89b7c8e246..6d807b8abf 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -4488,7 +4488,8 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest } } #endif // CONFIG_RATE_CTRL - if (cpi->ext_ratectrl.ready && !ext_rc_recode) { + if (cpi->ext_ratectrl.ready && !ext_rc_recode && + cpi->ext_ratectrl.funcs.rc_type == VPX_RC_QP) { vpx_codec_err_t codec_status; const GF_GROUP *gf_group = &cpi->twopass.gf_group; vpx_rc_encodeframe_decision_t encode_frame_decision; @@ -4548,7 +4549,8 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest if (frame_over_shoot_limit == 0) frame_over_shoot_limit = 1; } - if (cpi->ext_ratectrl.ready) { + if (cpi->ext_ratectrl.ready && + cpi->ext_ratectrl.funcs.rc_type == VPX_RC_QP) { last_q_attempt = q; // In general, for the external rate control, we take the qindex provided // as input and encode the frame with this qindex faithfully. However, @@ -5590,7 +5592,7 @@ static void encode_frame_to_data_rate( // build the bitstream vp9_pack_bitstream(cpi, dest, size); - { + if (cpi->ext_ratectrl.ready) { const RefCntBuffer *coded_frame_buf = get_ref_cnt_buffer(cm, cm->new_fb_idx); vpx_codec_err_t codec_status = vp9_extrc_update_encodeframe_result( diff --git a/vp9/encoder/vp9_ext_ratectrl.c b/vp9/encoder/vp9_ext_ratectrl.c index 67f58329cc..48c90913ee 100644 --- a/vp9/encoder/vp9_ext_ratectrl.c +++ b/vp9/encoder/vp9_ext_ratectrl.c @@ -172,7 +172,7 @@ vpx_codec_err_t vp9_extrc_update_encodeframe_result( if (ext_ratectrl == NULL) { return VPX_CODEC_INVALID_PARAM; } - if (ext_ratectrl->ready && ext_ratectrl->funcs.rc_type == VPX_RC_QP) { + if (ext_ratectrl->ready) { PSNR_STATS psnr; vpx_rc_status_t rc_status; vpx_rc_encodeframe_result_t encode_frame_result; @@ -198,3 +198,34 @@ vpx_codec_err_t vp9_extrc_update_encodeframe_result( } return VPX_CODEC_OK; } + +vpx_codec_err_t vp9_extrc_get_gop_decision( + EXT_RATECTRL *ext_ratectrl, const vpx_rc_gop_info_t *const gop_info, + vpx_rc_gop_decision_t *gop_decision) { + if (ext_ratectrl == NULL) { + return VPX_CODEC_INVALID_PARAM; + } + if (ext_ratectrl->ready && ext_ratectrl->funcs.rc_type == VPX_RC_GOP) { + vpx_rc_status_t rc_status; + rc_status = ext_ratectrl->funcs.get_gop_decision(ext_ratectrl->model, + gop_info, gop_decision); + if (gop_decision->use_alt_ref) { + const int arf_constraint = + gop_decision->gop_coding_frames >= gop_info->min_gf_interval && + gop_decision->gop_coding_frames < gop_info->lag_in_frames; + if (!arf_constraint || !gop_info->allow_alt_ref) return VPX_CODEC_ERROR; + } + // TODO(chengchen): Take min and max gf interval from the model + // and overwrite libvpx's decision so that we can get rid + // of one of the checks here. + if (gop_decision->gop_coding_frames > gop_info->frames_to_key || + gop_decision->gop_coding_frames - gop_decision->use_alt_ref > + gop_info->max_gf_interval) { + return VPX_CODEC_ERROR; + } + if (rc_status == VPX_RC_ERROR) { + return VPX_CODEC_ERROR; + } + } + return VPX_CODEC_OK; +} diff --git a/vp9/encoder/vp9_ext_ratectrl.h b/vp9/encoder/vp9_ext_ratectrl.h index 74fd68b96d..b46b776b91 100644 --- a/vp9/encoder/vp9_ext_ratectrl.h +++ b/vp9/encoder/vp9_ext_ratectrl.h @@ -45,4 +45,8 @@ vpx_codec_err_t vp9_extrc_update_encodeframe_result( const YV12_BUFFER_CONFIG *coded_frame, uint32_t bit_depth, uint32_t input_bit_depth, const int actual_encoding_qindex); +vpx_codec_err_t vp9_extrc_get_gop_decision( + EXT_RATECTRL *ext_ratectrl, const vpx_rc_gop_info_t *const gop_info, + vpx_rc_gop_decision_t *gop_decision); + #endif // VPX_VP9_ENCODER_VP9_EXT_RATECTRL_H_ diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index 67302ed035..6e1f797f4f 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -2714,6 +2714,9 @@ static void define_gf_group(VP9_COMP *cpi, int gf_start_show_idx) { // frame in which case it will already have been done. if (is_key_frame == 0) { vp9_zero(twopass->gf_group); + ++rc->gop_id; + } else { + rc->gop_id = 0; } vpx_clear_system_state(); @@ -2751,6 +2754,35 @@ static void define_gf_group(VP9_COMP *cpi, int gf_start_show_idx) { } } #endif + // If the external rate control model for GOP is used, the gop decisions + // are overwritten. Specifically, |gop_coding_frames| and |use_alt_ref| + // will be overwritten. + if (cpi->ext_ratectrl.ready && + cpi->ext_ratectrl.funcs.rc_type == VPX_RC_GOP) { + vpx_codec_err_t codec_status; + vpx_rc_gop_decision_t gop_decision; + vpx_rc_gop_info_t gop_info; + gop_info.min_gf_interval = active_gf_interval.min; + gop_info.max_gf_interval = active_gf_interval.max; + gop_info.allow_alt_ref = allow_alt_ref; + gop_info.is_key_frame = is_key_frame; + gop_info.last_gop_use_alt_ref = rc->source_alt_ref_active; + gop_info.frames_since_key = rc->frames_since_key; + gop_info.frames_to_key = rc->frames_to_key; + gop_info.lag_in_frames = cpi->oxcf.lag_in_frames; + gop_info.show_index = cm->current_video_frame; + gop_info.coding_index = cm->current_frame_coding_index; + gop_info.gop_id = rc->gop_id; + + codec_status = vp9_extrc_get_gop_decision(&cpi->ext_ratectrl, &gop_info, + &gop_decision); + if (codec_status != VPX_CODEC_OK) { + vpx_internal_error(&cm->error, codec_status, + "vp9_extrc_get_gop_decision() failed"); + } + gop_coding_frames = gop_decision.gop_coding_frames; + use_alt_ref = gop_decision.use_alt_ref; + } // Was the group length constrained by the requirement for a new KF? rc->constrained_gf_group = (gop_coding_frames >= rc->frames_to_key) ? 1 : 0; diff --git a/vp9/encoder/vp9_ratectrl.h b/vp9/encoder/vp9_ratectrl.h index 83a12cde73..42547d1a60 100644 --- a/vp9/encoder/vp9_ratectrl.h +++ b/vp9/encoder/vp9_ratectrl.h @@ -211,6 +211,10 @@ typedef struct { // Flag to constrain golden frame interval on key frame frequency for 1 pass // VBR. int constrain_gf_key_freq_onepass_vbr; + + // The id of the current GOP. Start from zero. + // When a key frame is inserted, it resets to zero. + int gop_id; } RATE_CONTROL; struct VP9_COMP; diff --git a/vpx/vpx_ext_ratectrl.h b/vpx/vpx_ext_ratectrl.h index 5b426b6cf0..e2c475a591 100644 --- a/vpx/vpx_ext_ratectrl.h +++ b/vpx/vpx_ext_ratectrl.h @@ -25,7 +25,7 @@ extern "C" { * types, removing or reassigning enums, adding/removing/rearranging * fields to structures. */ -#define VPX_EXT_RATECTRL_ABI_VERSION (2) +#define VPX_EXT_RATECTRL_ABI_VERSION (3) /*!\brief The control type of the inference API. * In VPX_RC_QP mode, the external rate control model determines the @@ -266,6 +266,32 @@ typedef struct vpx_rc_config { int frame_rate_den; /**< denominator of frame rate */ } vpx_rc_config_t; +/*!\brief Information passed to the external rate control model to + * help make GOP decisions. + */ +typedef struct vpx_rc_gop_info { + int min_gf_interval; /**< mininum allowed gf interval */ + int max_gf_interval; /**< maximum allowed gf interval */ + int allow_alt_ref; /**< whether to allow the use of alt ref */ + int is_key_frame; /**< is the current frame a key frame */ + int last_gop_use_alt_ref; /**< does the last gop use alt ref or not */ + int frames_since_key; /**< current frame distance to the last keyframe */ + int frames_to_key; /**< current frame distance to the next keyframe */ + int lag_in_frames; /**< number of lookahead source frames */ + int show_index; /**< display index of this frame, starts from zero*/ + int coding_index; /**< coding index of this frame, starts from zero*/ + int gop_id; /**< the id of the current gop, starts from zero, resets to zero + when a keyframe is set*/ +} vpx_rc_gop_info_t; + +/*!\brief The decision made by the external rate control model to set the + * group of picture. + */ +typedef struct vpx_rc_gop_decision { + int gop_coding_frames; /**< The number of frames of this GOP */ + int use_alt_ref; /**< Whether to use alt ref for this GOP */ +} vpx_rc_gop_decision_t; + /*!\brief Create an external rate control model callback prototype * * This callback is invoked by the encoder to create an external rate control @@ -318,6 +344,19 @@ typedef vpx_rc_status_t (*vpx_rc_update_encodeframe_result_cb_fn_t)( vpx_rc_model_t rate_ctrl_model, const vpx_rc_encodeframe_result_t *encode_frame_result); +/*!\brief Get the GOP structure from the external rate control model. + * + * This callback is invoked by the encoder to get GOP decisions from + * the external rate control model. + * + * \param[in] rate_ctrl_model rate control model + * \param[in] gop_info information collected from the encoder + * \param[out] gop_decision GOP decision from the model + */ +typedef vpx_rc_status_t (*vpx_rc_get_gop_decision_cb_fn_t)( + vpx_rc_model_t rate_ctrl_model, const vpx_rc_gop_info_t *gop_info, + vpx_rc_gop_decision_t *gop_decision); + /*!\brief Delete the external rate control model callback prototype * * This callback is invoked by the encoder to delete the external rate control @@ -355,6 +394,10 @@ typedef struct vpx_rc_funcs { * Update encodeframe result to the external rate control model. */ vpx_rc_update_encodeframe_result_cb_fn_t update_encodeframe_result; + /*! + * Get GOP decisions from the external rate control model. + */ + vpx_rc_get_gop_decision_cb_fn_t get_gop_decision; /*! * Delete the external rate control model. */ From c304ec38d05040b74de4aacada62c4a336714341 Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 27 May 2022 19:36:47 -0700 Subject: [PATCH 0384/1000] test/*: normalize use of nullptr this is preferred over NULL in C++11 Change-Id: Ic48ddcc6dfb8975a57f6713549ad04d93db21415 --- test/buffer.h | 14 +++++++------- test/codec_factory.h | 16 ++++++++-------- test/decode_test_driver.h | 2 +- test/encode_test_driver.h | 2 +- test/ivf_video_source.h | 14 +++++++------- test/register_state_check.h | 2 +- test/video_source.h | 22 ++++++++++++---------- test/webm_video_source.h | 14 ++++++++------ test/y4m_test.cc | 4 ++-- test/y4m_video_source.h | 16 ++++++++-------- test/yuv_video_source.h | 16 +++++++++------- 11 files changed, 64 insertions(+), 58 deletions(-) diff --git a/test/buffer.h b/test/buffer.h index b003d2f0d0..023939cedf 100644 --- a/test/buffer.h +++ b/test/buffer.h @@ -31,7 +31,7 @@ class Buffer { : width_(width), height_(height), top_padding_(top_padding), left_padding_(left_padding), right_padding_(right_padding), bottom_padding_(bottom_padding), alignment_(0), padding_value_(0), - stride_(0), raw_size_(0), num_elements_(0), raw_buffer_(NULL) {} + stride_(0), raw_size_(0), num_elements_(0), raw_buffer_(nullptr) {} Buffer(int width, int height, int top_padding, int left_padding, int right_padding, int bottom_padding, unsigned int alignment) @@ -39,19 +39,19 @@ class Buffer { left_padding_(left_padding), right_padding_(right_padding), bottom_padding_(bottom_padding), alignment_(alignment), padding_value_(0), stride_(0), raw_size_(0), num_elements_(0), - raw_buffer_(NULL) {} + raw_buffer_(nullptr) {} Buffer(int width, int height, int padding) : width_(width), height_(height), top_padding_(padding), left_padding_(padding), right_padding_(padding), bottom_padding_(padding), alignment_(0), padding_value_(0), stride_(0), - raw_size_(0), num_elements_(0), raw_buffer_(NULL) {} + raw_size_(0), num_elements_(0), raw_buffer_(nullptr) {} Buffer(int width, int height, int padding, unsigned int alignment) : width_(width), height_(height), top_padding_(padding), left_padding_(padding), right_padding_(padding), bottom_padding_(padding), alignment_(alignment), padding_value_(0), - stride_(0), raw_size_(0), num_elements_(0), raw_buffer_(NULL) {} + stride_(0), raw_size_(0), num_elements_(0), raw_buffer_(nullptr) {} ~Buffer() { if (alignment_) { @@ -103,7 +103,7 @@ class Buffer { bool CheckValues(const Buffer &a) const; bool Init() { - if (raw_buffer_ != NULL) return false; + if (raw_buffer_ != nullptr) return false; EXPECT_GT(width_, 0); EXPECT_GT(height_, 0); EXPECT_GE(top_padding_, 0); @@ -126,7 +126,7 @@ class Buffer { } else { raw_buffer_ = new (std::nothrow) T[num_elements_]; } - EXPECT_TRUE(raw_buffer_ != NULL); + EXPECT_NE(raw_buffer_, nullptr); SetPadding(std::numeric_limits::max()); return !::testing::Test::HasFailure(); } @@ -150,7 +150,7 @@ class Buffer { template T *Buffer::TopLeftPixel() const { - if (!raw_buffer_) return NULL; + if (!raw_buffer_) return nullptr; return raw_buffer_ + (top_padding_ * stride_) + left_padding_; } diff --git a/test/codec_factory.h b/test/codec_factory.h index 77ce49de9f..96092610c6 100644 --- a/test/codec_factory.h +++ b/test/codec_factory.h @@ -88,7 +88,7 @@ class VP8Decoder : public Decoder { #if CONFIG_VP8_DECODER return &vpx_codec_vp8_dx_algo; #else - return NULL; + return nullptr; #endif } }; @@ -104,7 +104,7 @@ class VP8Encoder : public Encoder { #if CONFIG_VP8_ENCODER return &vpx_codec_vp8_cx_algo; #else - return NULL; + return nullptr; #endif } }; @@ -124,7 +124,7 @@ class VP8CodecFactory : public CodecFactory { #else (void)cfg; (void)flags; - return NULL; + return nullptr; #endif } @@ -139,7 +139,7 @@ class VP8CodecFactory : public CodecFactory { (void)deadline; (void)init_flags; (void)stats; - return NULL; + return nullptr; #endif } @@ -184,7 +184,7 @@ class VP9Decoder : public Decoder { #if CONFIG_VP9_DECODER return &vpx_codec_vp9_dx_algo; #else - return NULL; + return nullptr; #endif } }; @@ -200,7 +200,7 @@ class VP9Encoder : public Encoder { #if CONFIG_VP9_ENCODER return &vpx_codec_vp9_cx_algo; #else - return NULL; + return nullptr; #endif } }; @@ -220,7 +220,7 @@ class VP9CodecFactory : public CodecFactory { #else (void)cfg; (void)flags; - return NULL; + return nullptr; #endif } @@ -235,7 +235,7 @@ class VP9CodecFactory : public CodecFactory { (void)deadline; (void)init_flags; (void)stats; - return NULL; + return nullptr; #endif } diff --git a/test/decode_test_driver.h b/test/decode_test_driver.h index 04876cdd7c..f446ab4664 100644 --- a/test/decode_test_driver.h +++ b/test/decode_test_driver.h @@ -24,7 +24,7 @@ class CompressedVideoSource; class DxDataIterator { public: explicit DxDataIterator(vpx_codec_ctx_t *decoder) - : decoder_(decoder), iter_(NULL) {} + : decoder_(decoder), iter_(nullptr) {} const vpx_image_t *Next() { return vpx_codec_get_frame(decoder_, &iter_); } diff --git a/test/encode_test_driver.h b/test/encode_test_driver.h index 38c61952eb..7085945f6a 100644 --- a/test/encode_test_driver.h +++ b/test/encode_test_driver.h @@ -49,7 +49,7 @@ enum TestMode { class CxDataIterator { public: explicit CxDataIterator(vpx_codec_ctx_t *encoder) - : encoder_(encoder), iter_(NULL) {} + : encoder_(encoder), iter_(nullptr) {} const vpx_codec_cx_pkt_t *Next() { return vpx_codec_get_cx_data(encoder_, &iter_); diff --git a/test/ivf_video_source.h b/test/ivf_video_source.h index 22c05ecde9..a8ac4f154c 100644 --- a/test/ivf_video_source.h +++ b/test/ivf_video_source.h @@ -29,8 +29,9 @@ static unsigned int MemGetLe32(const uint8_t *mem) { class IVFVideoSource : public CompressedVideoSource { public: explicit IVFVideoSource(const std::string &file_name) - : file_name_(file_name), input_file_(NULL), compressed_frame_buf_(NULL), - frame_sz_(0), frame_(0), end_of_file_(false) {} + : file_name_(file_name), input_file_(nullptr), + compressed_frame_buf_(nullptr), frame_sz_(0), frame_(0), + end_of_file_(false) {} virtual ~IVFVideoSource() { delete[] compressed_frame_buf_; @@ -41,13 +42,12 @@ class IVFVideoSource : public CompressedVideoSource { virtual void Init() { // Allocate a buffer for read in the compressed video frame. compressed_frame_buf_ = new uint8_t[libvpx_test::kCodeBufferSize]; - ASSERT_TRUE(compressed_frame_buf_ != NULL) - << "Allocate frame buffer failed"; + ASSERT_NE(compressed_frame_buf_, nullptr) << "Allocate frame buffer failed"; } virtual void Begin() { input_file_ = OpenTestDataFile(file_name_); - ASSERT_TRUE(input_file_ != NULL) + ASSERT_NE(input_file_, nullptr) << "Input file open failed. Filename: " << file_name_; // Read file header @@ -68,7 +68,7 @@ class IVFVideoSource : public CompressedVideoSource { } void FillFrame() { - ASSERT_TRUE(input_file_ != NULL); + ASSERT_NE(input_file_, nullptr); uint8_t frame_hdr[kIvfFrameHdrSize]; // Check frame header and read a frame from input_file. if (fread(frame_hdr, 1, kIvfFrameHdrSize, input_file_) != @@ -87,7 +87,7 @@ class IVFVideoSource : public CompressedVideoSource { } virtual const uint8_t *cxdata() const { - return end_of_file_ ? NULL : compressed_frame_buf_; + return end_of_file_ ? nullptr : compressed_frame_buf_; } virtual size_t frame_size() const { return frame_sz_; } virtual unsigned int frame_number() const { return frame_; } diff --git a/test/register_state_check.h b/test/register_state_check.h index 1746240c61..0b837dd042 100644 --- a/test/register_state_check.h +++ b/test/register_state_check.h @@ -56,7 +56,7 @@ class RegisterStateCheck { private: static bool StoreRegisters(CONTEXT *const context) { const HANDLE this_thread = GetCurrentThread(); - EXPECT_TRUE(this_thread != NULL); + EXPECT_NE(this_thread, nullptr); context->ContextFlags = CONTEXT_FLOATING_POINT; const bool context_saved = GetThreadContext(this_thread, context) == TRUE; EXPECT_TRUE(context_saved) << "GetLastError: " << GetLastError(); diff --git a/test/video_source.h b/test/video_source.h index 349e3de37c..a10ff6fb09 100644 --- a/test/video_source.h +++ b/test/video_source.h @@ -42,7 +42,7 @@ namespace libvpx_test { // A simple function to encapsulate cross platform retrieval of test data path static std::string GetDataPath() { const char *const data_path = getenv("LIBVPX_TEST_DATA_PATH"); - if (data_path == NULL) { + if (data_path == nullptr) { #ifdef LIBVPX_TEST_DATA_PATH // In some environments, we cannot set environment variables // Instead, we set the data path by using a preprocessor symbol @@ -76,10 +76,10 @@ static FILE *GetTempOutFile(std::string *file_name) { return fopen(fname, "wb+"); } } - return NULL; + return nullptr; #else std::string temp_dir = testing::TempDir(); - if (temp_dir.empty()) return NULL; + if (temp_dir.empty()) return nullptr; // Versions of testing::TempDir() prior to release-1.11.0-214-g5e6a5336 may // use the value of an environment variable without checking for a trailing // path delimiter. @@ -87,12 +87,12 @@ static FILE *GetTempOutFile(std::string *file_name) { const char name_template[] = "libvpxtest.XXXXXX"; std::unique_ptr temp_file_name( new char[temp_dir.size() + sizeof(name_template)]); - if (temp_file_name == nullptr) return NULL; + if (temp_file_name == nullptr) return nullptr; memcpy(temp_file_name.get(), temp_dir.data(), temp_dir.size()); memcpy(temp_file_name.get() + temp_dir.size(), name_template, sizeof(name_template)); const int fd = mkstemp(temp_file_name.get()); - if (fd == -1) return NULL; + if (fd == -1) return nullptr; *file_name = temp_file_name.get(); return fdopen(fd, "wb+"); #endif @@ -114,7 +114,7 @@ class TempOutFile { void CloseFile() { if (file_) { fclose(file_); - file_ = NULL; + file_ = nullptr; } } FILE *file_; @@ -133,7 +133,7 @@ class VideoSource { // Advance the cursor to the next frame virtual void Next() = 0; - // Get the current video frame, or NULL on End-Of-Stream. + // Get the current video frame, or nullptr on End-Of-Stream. virtual vpx_image_t *img() const = 0; // Get the presentation timestamp of the current frame. @@ -155,7 +155,7 @@ class VideoSource { class DummyVideoSource : public VideoSource { public: DummyVideoSource() - : img_(NULL), limit_(100), width_(80), height_(64), + : img_(nullptr), limit_(100), width_(80), height_(64), format_(VPX_IMG_FMT_I420) { ReallocImage(); } @@ -172,7 +172,9 @@ class DummyVideoSource : public VideoSource { FillFrame(); } - virtual vpx_image_t *img() const { return (frame_ < limit_) ? img_ : NULL; } + virtual vpx_image_t *img() const { + return (frame_ < limit_) ? img_ : nullptr; + } // Models a stream where Timebase = 1/FPS, so pts == frame. virtual vpx_codec_pts_t pts() const { return frame_; } @@ -212,7 +214,7 @@ class DummyVideoSource : public VideoSource { void ReallocImage() { vpx_img_free(img_); - img_ = vpx_img_alloc(NULL, format_, width_, height_, 32); + img_ = vpx_img_alloc(nullptr, format_, width_, height_, 32); ASSERT_NE(img_, nullptr); raw_sz_ = ((img_->w + 31) & ~31u) * img_->h * img_->bps / 8; } diff --git a/test/webm_video_source.h b/test/webm_video_source.h index 6f55f7db7c..d245926298 100644 --- a/test/webm_video_source.h +++ b/test/webm_video_source.h @@ -26,11 +26,11 @@ class WebMVideoSource : public CompressedVideoSource { public: explicit WebMVideoSource(const std::string &file_name) : file_name_(file_name), vpx_ctx_(new VpxInputContext()), - webm_ctx_(new WebmInputContext()), buf_(NULL), buf_sz_(0), frame_(0), + webm_ctx_(new WebmInputContext()), buf_(nullptr), buf_sz_(0), frame_(0), end_of_file_(false) {} virtual ~WebMVideoSource() { - if (vpx_ctx_->file != NULL) fclose(vpx_ctx_->file); + if (vpx_ctx_->file != nullptr) fclose(vpx_ctx_->file); webm_free(webm_ctx_); delete vpx_ctx_; delete webm_ctx_; @@ -40,7 +40,7 @@ class WebMVideoSource : public CompressedVideoSource { virtual void Begin() { vpx_ctx_->file = OpenTestDataFile(file_name_); - ASSERT_TRUE(vpx_ctx_->file != NULL) + ASSERT_NE(vpx_ctx_->file, nullptr) << "Input file open failed. Filename: " << file_name_; ASSERT_EQ(file_is_webm(webm_ctx_, vpx_ctx_), 1) << "file is not WebM"; @@ -54,7 +54,7 @@ class WebMVideoSource : public CompressedVideoSource { } void FillFrame() { - ASSERT_TRUE(vpx_ctx_->file != NULL); + ASSERT_NE(vpx_ctx_->file, nullptr); const int status = webm_read_frame(webm_ctx_, &buf_, &buf_sz_); ASSERT_GE(status, 0) << "webm_read_frame failed"; if (status == 1) { @@ -63,7 +63,7 @@ class WebMVideoSource : public CompressedVideoSource { } void SeekToNextKeyFrame() { - ASSERT_TRUE(vpx_ctx_->file != NULL); + ASSERT_NE(vpx_ctx_->file, nullptr); do { const int status = webm_read_frame(webm_ctx_, &buf_, &buf_sz_); ASSERT_GE(status, 0) << "webm_read_frame failed"; @@ -74,7 +74,9 @@ class WebMVideoSource : public CompressedVideoSource { } while (!webm_ctx_->is_key_frame && !end_of_file_); } - virtual const uint8_t *cxdata() const { return end_of_file_ ? NULL : buf_; } + virtual const uint8_t *cxdata() const { + return end_of_file_ ? nullptr : buf_; + } virtual size_t frame_size() const { return buf_sz_; } virtual unsigned int frame_number() const { return frame_; } diff --git a/test/y4m_test.cc b/test/y4m_test.cc index 89c6552c5d..32f2cd51d3 100644 --- a/test/y4m_test.cc +++ b/test/y4m_test.cc @@ -202,7 +202,7 @@ TEST(Y4MHeaderTest, RegularHeader) { EXPECT_EQ(0, fseek(f.file(), 0, 0)); y4m_input y4m; - EXPECT_EQ(y4m_input_open(&y4m, f.file(), /*skip_buffer=*/NULL, + EXPECT_EQ(y4m_input_open(&y4m, f.file(), /*skip_buffer=*/nullptr, /*num_skip=*/0, /*only_420=*/0), 0); EXPECT_EQ(y4m.pic_w, 4); @@ -229,7 +229,7 @@ TEST(Y4MHeaderTest, LongHeader) { EXPECT_EQ(fseek(f.file(), 0, 0), 0); y4m_input y4m; - EXPECT_EQ(y4m_input_open(&y4m, f.file(), /*skip_buffer=*/NULL, + EXPECT_EQ(y4m_input_open(&y4m, f.file(), /*skip_buffer=*/nullptr, /*num_skip=*/0, /*only_420=*/0), 0); EXPECT_EQ(y4m.pic_w, 4); diff --git a/test/y4m_video_source.h b/test/y4m_video_source.h index 89aa2a44fc..71fbf31931 100644 --- a/test/y4m_video_source.h +++ b/test/y4m_video_source.h @@ -23,7 +23,7 @@ namespace libvpx_test { class Y4mVideoSource : public VideoSource { public: Y4mVideoSource(const std::string &file_name, unsigned int start, int limit) - : file_name_(file_name), input_file_(NULL), img_(new vpx_image_t()), + : file_name_(file_name), input_file_(nullptr), img_(new vpx_image_t()), start_(start), limit_(limit), frame_(0), framerate_numerator_(0), framerate_denominator_(0), y4m_() {} @@ -35,13 +35,13 @@ class Y4mVideoSource : public VideoSource { virtual void OpenSource() { CloseSource(); input_file_ = OpenTestDataFile(file_name_); - ASSERT_TRUE(input_file_ != NULL) + ASSERT_NE(input_file_, nullptr) << "Input file open failed. Filename: " << file_name_; } virtual void ReadSourceToStart() { - ASSERT_TRUE(input_file_ != NULL); - ASSERT_FALSE(y4m_input_open(&y4m_, input_file_, NULL, 0, 0)); + ASSERT_NE(input_file_, nullptr); + ASSERT_FALSE(y4m_input_open(&y4m_, input_file_, nullptr, 0, 0)); framerate_numerator_ = y4m_.fps_n; framerate_denominator_ = y4m_.fps_d; frame_ = 0; @@ -62,7 +62,7 @@ class Y4mVideoSource : public VideoSource { } virtual vpx_image_t *img() const { - return (frame_ < limit_) ? img_.get() : NULL; + return (frame_ < limit_) ? img_.get() : nullptr; } // Models a stream where Timebase = 1/FPS, so pts == frame. @@ -80,7 +80,7 @@ class Y4mVideoSource : public VideoSource { virtual unsigned int limit() const { return limit_; } virtual void FillFrame() { - ASSERT_TRUE(input_file_ != NULL); + ASSERT_NE(input_file_, nullptr); // Read a frame from input_file. y4m_input_fetch_frame(&y4m_, input_file_, img_.get()); } @@ -101,9 +101,9 @@ class Y4mVideoSource : public VideoSource { void CloseSource() { y4m_input_close(&y4m_); y4m_ = y4m_input(); - if (input_file_ != NULL) { + if (input_file_ != nullptr) { fclose(input_file_); - input_file_ = NULL; + input_file_ = nullptr; } } diff --git a/test/yuv_video_source.h b/test/yuv_video_source.h index 383ab8f1b1..51948c0efb 100644 --- a/test/yuv_video_source.h +++ b/test/yuv_video_source.h @@ -27,8 +27,8 @@ class YUVVideoSource : public VideoSource { YUVVideoSource(const std::string &file_name, vpx_img_fmt format, unsigned int width, unsigned int height, int rate_numerator, int rate_denominator, unsigned int start, int limit) - : file_name_(file_name), input_file_(NULL), img_(NULL), start_(start), - limit_(limit), frame_(0), width_(0), height_(0), + : file_name_(file_name), input_file_(nullptr), img_(nullptr), + start_(start), limit_(limit), frame_(0), width_(0), height_(0), format_(VPX_IMG_FMT_NONE), framerate_numerator_(rate_numerator), framerate_denominator_(rate_denominator) { // This initializes format_, raw_size_, width_, height_ and allocates img. @@ -43,7 +43,7 @@ class YUVVideoSource : public VideoSource { virtual void Begin() { if (input_file_) fclose(input_file_); input_file_ = OpenTestDataFile(file_name_); - ASSERT_TRUE(input_file_ != NULL) + ASSERT_NE(input_file_, nullptr) << "Input file open failed. Filename: " << file_name_; if (start_) { fseek(input_file_, static_cast(raw_size_) * start_, SEEK_SET); @@ -58,7 +58,9 @@ class YUVVideoSource : public VideoSource { FillFrame(); } - virtual vpx_image_t *img() const { return (frame_ < limit_) ? img_ : NULL; } + virtual vpx_image_t *img() const { + return (frame_ < limit_) ? img_ : nullptr; + } // Models a stream where Timebase = 1/FPS, so pts == frame. virtual vpx_codec_pts_t pts() const { return frame_; } @@ -78,8 +80,8 @@ class YUVVideoSource : public VideoSource { vpx_img_fmt format) { if (width != width_ || height != height_ || format != format_) { vpx_img_free(img_); - img_ = vpx_img_alloc(NULL, format, width, height, 1); - ASSERT_TRUE(img_ != NULL); + img_ = vpx_img_alloc(nullptr, format, width, height, 1); + ASSERT_NE(img_, nullptr); width_ = width; height_ = height; format_ = format; @@ -99,7 +101,7 @@ class YUVVideoSource : public VideoSource { } virtual void FillFrame() { - ASSERT_TRUE(input_file_ != NULL); + ASSERT_NE(input_file_, nullptr); // Read a frame from input_file. if (fread(img_->img_data, raw_size_, 1, input_file_) == 0) { limit_ = frame_; From d353916ab5e614595b6e816b2baa9d532e06c9bc Mon Sep 17 00:00:00 2001 From: James Zern Date: Thu, 26 May 2022 19:38:01 -0700 Subject: [PATCH 0385/1000] libs.doxy_template: remove some obsolete variables - COLS_IN_ALPHA_INDEX this was unused given ALPHABETICAL_INDEX = NO - PERL_PATH / MSCGEN_PATH these were unused quiets warnings with doxygen 1.9.1: warning: Tag 'COLS_IN_ALPHA_INDEX' at line 1110 of file 'doxyfile' has become obsolete. warning: Tag 'PERL_PATH' at line 1105 of file 'doxyfile' has become obsolete. warning: Tag 'MSCGEN_PATH' at line 1126 of file 'doxyfile' has become obsolete Change-Id: I6229311afaa3318a3f9bcaf40fafcc5ea71ae271 --- libs.doxy_template | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/libs.doxy_template b/libs.doxy_template index 1eacc8fe2d..73e1b43c72 100644 --- a/libs.doxy_template +++ b/libs.doxy_template @@ -654,12 +654,6 @@ VERBATIM_HEADERS = YES ALPHABETICAL_INDEX = NO -# If the alphabetical index is enabled (see ALPHABETICAL_INDEX) then -# the COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns -# in which this list will be split (can be a number in the range [1..20]) - -COLS_IN_ALPHA_INDEX = 5 - # In case all classes in a project start with a common prefix, all # classes will be put under the same header in the alphabetical index. # The IGNORE_PREFIX tag can be used to specify one or more prefixes that @@ -1099,11 +1093,6 @@ ALLEXTERNALS = NO EXTERNAL_GROUPS = YES -# The PERL_PATH should be the absolute path and name of the perl script -# interpreter (i.e. the result of `which perl'). - -PERL_PATH = /usr/bin/perl - #--------------------------------------------------------------------------- # Configuration options related to the dot tool #--------------------------------------------------------------------------- @@ -1117,14 +1106,6 @@ PERL_PATH = /usr/bin/perl CLASS_DIAGRAMS = YES -# You can define message sequence charts within doxygen comments using the \msc -# command. Doxygen will then run the mscgen tool (see http://www.mcternan.me.uk/mscgen/) to -# produce the chart and insert it in the documentation. The MSCGEN_PATH tag allows you to -# specify the directory where the mscgen tool resides. If left empty the tool is assumed to -# be found in the default search path. - -MSCGEN_PATH = - # If set to YES, the inheritance and collaboration graphs will hide # inheritance and usage relations if the target is undocumented # or is not a class. From 8f56e1c074712ffa937dc48a14d4b01e378a170f Mon Sep 17 00:00:00 2001 From: James Zern Date: Sat, 28 May 2022 15:26:29 -0700 Subject: [PATCH 0386/1000] resize_test: add TODO for test failure DISABLED_TestExternalResizeSmallerWidthBiggerSize was added for webm:1642, but never fixed Bug: webm:1642 Change-Id: I0fa368a44dda550241ea997068c58eaff551233c --- test/resize_test.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/resize_test.cc b/test/resize_test.cc index c57170ff9b..a71b2acb0c 100644 --- a/test/resize_test.cc +++ b/test/resize_test.cc @@ -578,6 +578,8 @@ TEST_P(ResizeRealtimeTest, TestExternalResizeWorks) { } } +// TODO(https://crbug.com/webm/1642): This causes a segfault in +// init_encode_frame_mb_context(). TEST_P(ResizeRealtimeTest, DISABLED_TestExternalResizeSmallerWidthBiggerSize) { ResizingVideoSource video; video.flag_codec_ = true; From 9d279c88c3e8873c114298d69e919bfef45a1dab Mon Sep 17 00:00:00 2001 From: James Zern Date: Sat, 28 May 2022 15:25:49 -0700 Subject: [PATCH 0387/1000] resize_test: add TODO for ResizeTest instantiation for VP9 this should match VP8 and use ONE_PASS_TEST_MODES, but currently the code will produce integer sanitizer warnings and may segfault under certain conditions Bug: webm:1767,webm:1768 Change-Id: I6482ff1862f19716fde3d57522591bc61d76a84f --- test/resize_test.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/resize_test.cc b/test/resize_test.cc index a71b2acb0c..ccee614070 100644 --- a/test/resize_test.cc +++ b/test/resize_test.cc @@ -796,6 +796,9 @@ TEST_P(ResizeCspTest, TestResizeCspWorks) { } VP8_INSTANTIATE_TEST_SUITE(ResizeTest, ONE_PASS_TEST_MODES); +// TODO(https://crbug.com/webm/1767,https://crbug.com/webm/1768): VP9 should +// use ONE_PASS_TEST_MODES for the ResizeTest instantiation after integer +// sanitizer warnings and segfault are fixed. VP9_INSTANTIATE_TEST_SUITE(ResizeTest, ::testing::Values(::libvpx_test::kRealTime)); VP9_INSTANTIATE_TEST_SUITE(ResizeInternalTest, From 365eebc147627ae83ec8b36077198d8cfb5e0128 Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 27 May 2022 21:50:11 -0700 Subject: [PATCH 0388/1000] vp8e_set_config: setjmp before calling vp8_change_config vp8_change_config may call vp8_alloc_compressor_data which expects failures detected by CHECK_MEM_ERROR to not return. Change-Id: Ib7fbf4af904bd9b539402bb61c8f87855eef2ad6 --- vp8/vp8_cx_iface.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c index 21fed0e8ed..340f3e6638 100644 --- a/vp8/vp8_cx_iface.c +++ b/vp8/vp8_cx_iface.c @@ -473,14 +473,23 @@ static vpx_codec_err_t vp8e_set_config(vpx_codec_alg_priv_t *ctx, ERROR("Cannot increase lag_in_frames"); res = validate_config(ctx, cfg, &ctx->vp8_cfg, 0); + if (res != VPX_CODEC_OK) return res; - if (!res) { - ctx->cfg = *cfg; - set_vp8e_config(&ctx->oxcf, ctx->cfg, ctx->vp8_cfg, NULL); - vp8_change_config(ctx->cpi, &ctx->oxcf); + if (setjmp(ctx->cpi->common.error.jmp)) { + const vpx_codec_err_t codec_err = + update_error_state(ctx, &ctx->cpi->common.error); + ctx->cpi->common.error.setjmp = 0; + vpx_clear_system_state(); + assert(codec_err != VPX_CODEC_OK); + return codec_err; } - return res; + ctx->cpi->common.error.setjmp = 1; + ctx->cfg = *cfg; + set_vp8e_config(&ctx->oxcf, ctx->cfg, ctx->vp8_cfg, NULL); + vp8_change_config(ctx->cpi, &ctx->oxcf); + ctx->cpi->common.error.setjmp = 0; + return VPX_CODEC_OK; } static vpx_codec_err_t get_quantizer(vpx_codec_alg_priv_t *ctx, va_list args) { From 3997d9bc6286ba075879353b87678986cdbfa347 Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 27 May 2022 21:50:11 -0700 Subject: [PATCH 0389/1000] vp9e_set_config: setjmp before calling vp9_change_config vp9_change_config may call functions that perform allocations which expect failures detected by CHECK_MEM_ERROR to not return. Change-Id: I1dd1eca9c661ed157d51b4a6a77fc9f88236d794 --- vp9/vp9_cx_iface.c | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index b809ab3e6f..63d8f44878 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -780,7 +780,7 @@ static vpx_codec_err_t set_twopass_params_from_config( static vpx_codec_err_t encoder_set_config(vpx_codec_alg_priv_t *ctx, const vpx_codec_enc_cfg_t *cfg) { vpx_codec_err_t res; - int force_key = 0; + volatile int force_key = 0; if (cfg->g_w != ctx->cfg.g_w || cfg->g_h != ctx->cfg.g_h) { if (cfg->g_lag_in_frames > 1 || cfg->g_pass != VPX_RC_ONE_PASS) @@ -799,19 +799,28 @@ static vpx_codec_err_t encoder_set_config(vpx_codec_alg_priv_t *ctx, ERROR("Cannot increase lag_in_frames"); res = validate_config(ctx, cfg, &ctx->extra_cfg); + if (res != VPX_CODEC_OK) return res; - if (res == VPX_CODEC_OK) { - ctx->cfg = *cfg; - set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg); - set_twopass_params_from_config(&ctx->cfg, ctx->cpi); - // On profile change, request a key frame - force_key |= ctx->cpi->common.profile != ctx->oxcf.profile; - vp9_change_config(ctx->cpi, &ctx->oxcf); + if (setjmp(ctx->cpi->common.error.jmp)) { + const vpx_codec_err_t codec_err = + update_error_state(ctx, &ctx->cpi->common.error); + ctx->cpi->common.error.setjmp = 0; + vpx_clear_system_state(); + assert(codec_err != VPX_CODEC_OK); + return codec_err; } + ctx->cfg = *cfg; + set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg); + set_twopass_params_from_config(&ctx->cfg, ctx->cpi); + // On profile change, request a key frame + force_key |= ctx->cpi->common.profile != ctx->oxcf.profile; + vp9_change_config(ctx->cpi, &ctx->oxcf); + if (force_key) ctx->next_frame_flags |= VPX_EFLAG_FORCE_KF; - return res; + ctx->cpi->common.error.setjmp = 0; + return VPX_CODEC_OK; } static vpx_codec_err_t ctrl_get_quantizer(vpx_codec_alg_priv_t *ctx, From 6549e76307631de7e37459fceb23b4eee4573620 Mon Sep 17 00:00:00 2001 From: James Zern Date: Sat, 28 May 2022 15:24:37 -0700 Subject: [PATCH 0390/1000] vp9_change_config: check vp9_alloc_loop_filter return Change-Id: I4cba67a5ab192d1cf1dbfb5c039a93a4952b071e --- vp9/encoder/vp9_encoder.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 6d807b8abf..a511aa7645 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -2057,7 +2057,10 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { cpi->external_resize = 0; } else if (cm->mi_alloc_size == new_mi_size && (cpi->oxcf.width > last_w || cpi->oxcf.height > last_h)) { - vp9_alloc_loop_filter(cm); + if (vp9_alloc_loop_filter(cm)) { + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to allocate loop filter data"); + } } } From 3dc6aa01bacc9818d4ccc0ee0f1b691ae0ec0315 Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 1 Jun 2022 18:55:10 -0700 Subject: [PATCH 0391/1000] vp9,encoder: fix some integer sanitizer warnings the issues fixed in this change are related to implicit conversions between int / unsigned int: vp9/encoder/vp9_segmentation.c:42:36: runtime error: implicit conversion from type 'int' of value -9 (32-bit, signed) to type 'unsigned int' changed the value to 4294967287 (32-bit, unsigned) vpx_dsp/x86/sum_squares_sse2.c:36:52: runtime error: implicit conversion from type 'unsigned int' of value 4294967295 (32-bit, unsigned) to type 'int' changed the value to -1 (32-bit, signed) vpx_dsp/x86/sum_squares_sse2.c:36:67: runtime error: implicit conversion from type 'unsigned int' of value 4294967295 (32-bit, unsigned) to type 'int' changed the value to -1 (32-bit, signed) vp9/encoder/x86/vp9_diamond_search_sad_avx.c:81:45: runtime error: implicit conversion from type 'uint32_t' (aka 'unsigned int') of value 4290576316 (32-bit, unsigned) to type 'int' changed the value to -4390980 (32-bit, signed) vp9/encoder/vp9_rdopt.c:3472:31: runtime error: implicit conversion from type 'int' of value -1024 (32-bit, signed) to type 'uint16_t' (aka 'unsigned short') changed the value to 64512 (16-bit, unsigned) unsigned is forced for masks and int is used with intel intrinsics Bug: webm:1767 Change-Id: Icfa4179e13bc98a36ac29586b60d65819d3ce9ee Fixed: webm:1767 --- test/resize_test.cc | 5 ++--- vp9/encoder/vp9_rdopt.c | 2 +- vp9/encoder/vp9_segmentation.c | 2 +- vp9/encoder/x86/vp9_diamond_search_sad_avx.c | 10 +++++----- vpx_dsp/x86/sum_squares_sse2.c | 2 +- 5 files changed, 10 insertions(+), 11 deletions(-) diff --git a/test/resize_test.cc b/test/resize_test.cc index ccee614070..212ff46975 100644 --- a/test/resize_test.cc +++ b/test/resize_test.cc @@ -796,9 +796,8 @@ TEST_P(ResizeCspTest, TestResizeCspWorks) { } VP8_INSTANTIATE_TEST_SUITE(ResizeTest, ONE_PASS_TEST_MODES); -// TODO(https://crbug.com/webm/1767,https://crbug.com/webm/1768): VP9 should -// use ONE_PASS_TEST_MODES for the ResizeTest instantiation after integer -// sanitizer warnings and segfault are fixed. +// TODO(https://crbug.com/webm/1768): VP9 should use ONE_PASS_TEST_MODES for +// the ResizeTest instantiation after segfault is fixed. VP9_INSTANTIATE_TEST_SUITE(ResizeTest, ::testing::Values(::libvpx_test::kRealTime)); VP9_INSTANTIATE_TEST_SUITE(ResizeInternalTest, diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index 0171a05720..3b574ef172 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -3470,7 +3470,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, } mode_skip_mask[INTRA_FRAME] |= - ~(sf->intra_y_mode_mask[max_txsize_lookup[bsize]]); + (uint16_t) ~(sf->intra_y_mode_mask[max_txsize_lookup[bsize]]); for (i = 0; i <= LAST_NEW_MV_INDEX; ++i) mode_threshold[i] = 0; diff --git a/vp9/encoder/vp9_segmentation.c b/vp9/encoder/vp9_segmentation.c index a163297e6e..d75488a8e6 100644 --- a/vp9/encoder/vp9_segmentation.c +++ b/vp9/encoder/vp9_segmentation.c @@ -39,7 +39,7 @@ void vp9_set_segment_data(struct segmentation *seg, signed char *feature_data, } void vp9_disable_segfeature(struct segmentation *seg, int segment_id, SEG_LVL_FEATURES feature_id) { - seg->feature_mask[segment_id] &= ~(1 << feature_id); + seg->feature_mask[segment_id] &= ~(1u << feature_id); } void vp9_clear_segdata(struct segmentation *seg, int segment_id, diff --git a/vp9/encoder/x86/vp9_diamond_search_sad_avx.c b/vp9/encoder/x86/vp9_diamond_search_sad_avx.c index fcf50eb2a7..0e04a2f41f 100644 --- a/vp9/encoder/x86/vp9_diamond_search_sad_avx.c +++ b/vp9/encoder/x86/vp9_diamond_search_sad_avx.c @@ -76,9 +76,9 @@ int vp9_diamond_search_sad_avx(const MACROBLOCK *x, int *num00, const vp9_variance_fn_ptr_t *fn_ptr, const MV *center_mv) { const int_mv maxmv = pack_int_mv(x->mv_limits.row_max, x->mv_limits.col_max); - const __m128i v_max_mv_w = _mm_set1_epi32(maxmv.as_int); + const __m128i v_max_mv_w = _mm_set1_epi32((int)maxmv.as_int); const int_mv minmv = pack_int_mv(x->mv_limits.row_min, x->mv_limits.col_min); - const __m128i v_min_mv_w = _mm_set1_epi32(minmv.as_int); + const __m128i v_min_mv_w = _mm_set1_epi32((int)minmv.as_int); const __m128i v_spb_d = _mm_set1_epi32(sad_per_bit); @@ -96,14 +96,14 @@ int vp9_diamond_search_sad_avx(const MACROBLOCK *x, const int_mv fcenter_mv = pack_int_mv(center_mv->row >> 3, center_mv->col >> 3); - const __m128i vfcmv = _mm_set1_epi32(fcenter_mv.as_int); + const __m128i vfcmv = _mm_set1_epi32((int)fcenter_mv.as_int); const int ref_row = clamp(ref_mv->row, minmv.as_mv.row, maxmv.as_mv.row); const int ref_col = clamp(ref_mv->col, minmv.as_mv.col, maxmv.as_mv.col); int_mv bmv = pack_int_mv(ref_row, ref_col); int_mv new_bmv = bmv; - __m128i v_bmv_w = _mm_set1_epi32(bmv.as_int); + __m128i v_bmv_w = _mm_set1_epi32((int)bmv.as_int); const int what_stride = x->plane[0].src.stride; const int in_what_stride = x->e_mbd.plane[0].pre[0].stride; @@ -300,7 +300,7 @@ int vp9_diamond_search_sad_avx(const MACROBLOCK *x, bmv = new_bmv; best_address = new_best_address; - v_bmv_w = _mm_set1_epi32(bmv.as_int); + v_bmv_w = _mm_set1_epi32((int)bmv.as_int); #if VPX_ARCH_X86_64 v_ba_q = _mm_set1_epi64x((intptr_t)best_address); #else diff --git a/vpx_dsp/x86/sum_squares_sse2.c b/vpx_dsp/x86/sum_squares_sse2.c index 14f3b35c01..df6514b2c4 100644 --- a/vpx_dsp/x86/sum_squares_sse2.c +++ b/vpx_dsp/x86/sum_squares_sse2.c @@ -33,7 +33,7 @@ uint64_t vpx_sum_squares_2d_i16_sse2(const int16_t *src, int stride, int size) { } else { // Generic case int r = size; - const __m128i v_zext_mask_q = _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff); + const __m128i v_zext_mask_q = _mm_set_epi32(0, -1, 0, -1); __m128i v_acc_q = _mm_setzero_si128(); assert(size % 8 == 0); From 4e7c56332de65eb191cef5644c010e309302a993 Mon Sep 17 00:00:00 2001 From: Cheng Chen Date: Tue, 31 May 2022 11:07:15 -0700 Subject: [PATCH 0392/1000] L2E: Return error when GOP model is not set - Return error instead of OK when GOP model is not set. - Update descriptions for a few variables. Change-Id: I213f6b7085c487507c3935e7ce615e807f4474cc --- test/vp9_ext_ratectrl_test.cc | 12 +++---- vp9/encoder/vp9_ext_ratectrl.c | 43 +++++++++++------------ vp9/encoder/vp9_firstpass.c | 6 ++-- vp9/encoder/vp9_ratectrl.h | 4 +-- vpx/vpx_ext_ratectrl.h | 64 ++++++++++++++++++++++++++-------- 5 files changed, 82 insertions(+), 47 deletions(-) diff --git a/test/vp9_ext_ratectrl_test.cc b/test/vp9_ext_ratectrl_test.cc index e3e7afbf42..1289e2db88 100644 --- a/test/vp9_ext_ratectrl_test.cc +++ b/test/vp9_ext_ratectrl_test.cc @@ -44,7 +44,7 @@ struct ToyRateCtrl { int magic_number; int coding_index; - int gop_id; + int gop_index; int frames_since_key; int show_index; }; @@ -73,7 +73,7 @@ vpx_rc_status_t rc_create_model_gop(void *priv, ToyRateCtrl *toy_rate_ctrl = new (std::nothrow) ToyRateCtrl; if (toy_rate_ctrl == nullptr) return VPX_RC_ERROR; toy_rate_ctrl->magic_number = kModelMagicNumber; - toy_rate_ctrl->gop_id = 0; + toy_rate_ctrl->gop_index = 0; toy_rate_ctrl->frames_since_key = 0; toy_rate_ctrl->show_index = 0; toy_rate_ctrl->coding_index = 0; @@ -198,13 +198,13 @@ vpx_rc_status_t rc_get_gop_decision(vpx_rc_model_t rate_ctrl_model, if (gop_info->is_key_frame) { EXPECT_EQ(gop_info->last_gop_use_alt_ref, 0); EXPECT_EQ(gop_info->frames_since_key, 0); - EXPECT_EQ(gop_info->gop_id, 0); - toy_rate_ctrl->gop_id = 0; + EXPECT_EQ(gop_info->gop_index, 0); + toy_rate_ctrl->gop_index = 0; toy_rate_ctrl->frames_since_key = 0; } else { EXPECT_EQ(gop_info->last_gop_use_alt_ref, 1); } - EXPECT_EQ(gop_info->gop_id, toy_rate_ctrl->gop_id); + EXPECT_EQ(gop_info->gop_index, toy_rate_ctrl->gop_index); EXPECT_EQ(gop_info->frames_since_key, toy_rate_ctrl->frames_since_key); EXPECT_EQ(gop_info->show_index, toy_rate_ctrl->show_index); EXPECT_EQ(gop_info->coding_index, toy_rate_ctrl->coding_index); @@ -217,7 +217,7 @@ vpx_rc_status_t rc_get_gop_decision(vpx_rc_model_t rate_ctrl_model, toy_rate_ctrl->show_index += gop_decision->gop_coding_frames - gop_decision->use_alt_ref; toy_rate_ctrl->coding_index += gop_decision->gop_coding_frames; - ++toy_rate_ctrl->gop_id; + ++toy_rate_ctrl->gop_index; return VPX_RC_OK; } diff --git a/vp9/encoder/vp9_ext_ratectrl.c b/vp9/encoder/vp9_ext_ratectrl.c index 48c90913ee..ba57b86f60 100644 --- a/vp9/encoder/vp9_ext_ratectrl.c +++ b/vp9/encoder/vp9_ext_ratectrl.c @@ -202,30 +202,29 @@ vpx_codec_err_t vp9_extrc_update_encodeframe_result( vpx_codec_err_t vp9_extrc_get_gop_decision( EXT_RATECTRL *ext_ratectrl, const vpx_rc_gop_info_t *const gop_info, vpx_rc_gop_decision_t *gop_decision) { - if (ext_ratectrl == NULL) { + vpx_rc_status_t rc_status; + if (ext_ratectrl == NULL || !ext_ratectrl->ready || + ext_ratectrl->funcs.rc_type != VPX_RC_GOP) { return VPX_CODEC_INVALID_PARAM; } - if (ext_ratectrl->ready && ext_ratectrl->funcs.rc_type == VPX_RC_GOP) { - vpx_rc_status_t rc_status; - rc_status = ext_ratectrl->funcs.get_gop_decision(ext_ratectrl->model, - gop_info, gop_decision); - if (gop_decision->use_alt_ref) { - const int arf_constraint = - gop_decision->gop_coding_frames >= gop_info->min_gf_interval && - gop_decision->gop_coding_frames < gop_info->lag_in_frames; - if (!arf_constraint || !gop_info->allow_alt_ref) return VPX_CODEC_ERROR; - } - // TODO(chengchen): Take min and max gf interval from the model - // and overwrite libvpx's decision so that we can get rid - // of one of the checks here. - if (gop_decision->gop_coding_frames > gop_info->frames_to_key || - gop_decision->gop_coding_frames - gop_decision->use_alt_ref > - gop_info->max_gf_interval) { - return VPX_CODEC_ERROR; - } - if (rc_status == VPX_RC_ERROR) { - return VPX_CODEC_ERROR; - } + rc_status = ext_ratectrl->funcs.get_gop_decision(ext_ratectrl->model, + gop_info, gop_decision); + if (gop_decision->use_alt_ref) { + const int arf_constraint = + gop_decision->gop_coding_frames >= gop_info->min_gf_interval && + gop_decision->gop_coding_frames < gop_info->lag_in_frames; + if (!arf_constraint || !gop_info->allow_alt_ref) return VPX_CODEC_ERROR; + } + // TODO(chengchen): Take min and max gf interval from the model + // and overwrite libvpx's decision so that we can get rid + // of one of the checks here. + if (gop_decision->gop_coding_frames > gop_info->frames_to_key || + gop_decision->gop_coding_frames - gop_decision->use_alt_ref > + gop_info->max_gf_interval) { + return VPX_CODEC_ERROR; + } + if (rc_status == VPX_RC_ERROR) { + return VPX_CODEC_ERROR; } return VPX_CODEC_OK; } diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index 6e1f797f4f..2b3c174693 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -2714,9 +2714,9 @@ static void define_gf_group(VP9_COMP *cpi, int gf_start_show_idx) { // frame in which case it will already have been done. if (is_key_frame == 0) { vp9_zero(twopass->gf_group); - ++rc->gop_id; + ++rc->gop_index; } else { - rc->gop_id = 0; + rc->gop_index = 0; } vpx_clear_system_state(); @@ -2772,7 +2772,7 @@ static void define_gf_group(VP9_COMP *cpi, int gf_start_show_idx) { gop_info.lag_in_frames = cpi->oxcf.lag_in_frames; gop_info.show_index = cm->current_video_frame; gop_info.coding_index = cm->current_frame_coding_index; - gop_info.gop_id = rc->gop_id; + gop_info.gop_index = rc->gop_index; codec_status = vp9_extrc_get_gop_decision(&cpi->ext_ratectrl, &gop_info, &gop_decision); diff --git a/vp9/encoder/vp9_ratectrl.h b/vp9/encoder/vp9_ratectrl.h index 42547d1a60..48a21bd1cd 100644 --- a/vp9/encoder/vp9_ratectrl.h +++ b/vp9/encoder/vp9_ratectrl.h @@ -212,9 +212,9 @@ typedef struct { // VBR. int constrain_gf_key_freq_onepass_vbr; - // The id of the current GOP. Start from zero. + // The index of the current GOP. Start from zero. // When a key frame is inserted, it resets to zero. - int gop_id; + int gop_index; } RATE_CONTROL; struct VP9_COMP; diff --git a/vpx/vpx_ext_ratectrl.h b/vpx/vpx_ext_ratectrl.h index e2c475a591..db9af444c4 100644 --- a/vpx/vpx_ext_ratectrl.h +++ b/vpx/vpx_ext_ratectrl.h @@ -25,7 +25,7 @@ extern "C" { * types, removing or reassigning enums, adding/removing/rearranging * fields to structures. */ -#define VPX_EXT_RATECTRL_ABI_VERSION (3) +#define VPX_EXT_RATECTRL_ABI_VERSION (4) /*!\brief The control type of the inference API. * In VPX_RC_QP mode, the external rate control model determines the @@ -33,7 +33,7 @@ extern "C" { * In VPX_RC_GOP mode, the external rate control model determines the * group of picture (GOP) of the video sequence. */ -typedef enum vpx_rc_type { VPX_RC_QP = 0, VPX_RC_GOP = 1 } vpx_rc_type_t; +typedef enum vpx_rc_type { VPX_RC_QP = 1, VPX_RC_GOP = 2 } vpx_rc_type_t; /*!\brief Abstract rate control model handler * @@ -270,18 +270,54 @@ typedef struct vpx_rc_config { * help make GOP decisions. */ typedef struct vpx_rc_gop_info { - int min_gf_interval; /**< mininum allowed gf interval */ - int max_gf_interval; /**< maximum allowed gf interval */ - int allow_alt_ref; /**< whether to allow the use of alt ref */ - int is_key_frame; /**< is the current frame a key frame */ - int last_gop_use_alt_ref; /**< does the last gop use alt ref or not */ - int frames_since_key; /**< current frame distance to the last keyframe */ - int frames_to_key; /**< current frame distance to the next keyframe */ - int lag_in_frames; /**< number of lookahead source frames */ - int show_index; /**< display index of this frame, starts from zero*/ - int coding_index; /**< coding index of this frame, starts from zero*/ - int gop_id; /**< the id of the current gop, starts from zero, resets to zero - when a keyframe is set*/ + /*! + * Minimum allowed gf interval, fixed for the whole clip. + */ + int min_gf_interval; + /*! + * Maximum allowed gf interval, fixed for the whole clip. + */ + int max_gf_interval; + /*! + * Whether to allow the use of alt ref, can be changed per gop. + */ + int allow_alt_ref; + /*! + * Is the current frame a key frame. + */ + int is_key_frame; + /*! + * Does the previous gop use alt ref or not. + */ + int last_gop_use_alt_ref; + /*! + * Current frame distance to the last keyframe, e.g., if Nth frame is a key, + * then the value of the N+1 th frame is 1. + */ + int frames_since_key; + /*! + * Current frame distance to the next keyframe, e.g. if Nth frame is a key, + * then the value of frame N - 1 is 1. + */ + int frames_to_key; + /*! + * Number of lookahead source frames. + */ + int lag_in_frames; + /*! + * Display index (temporal stamp) of this frame in the whole clip, + * starts from zero. + */ + int show_index; + /*! + * Coding index of this frame in the whole clip, starts from zero. + */ + int coding_index; + /*! + * The index of the current gop, starts from zero, resets to zero + * when a keyframe is set. + */ + int gop_index; } vpx_rc_gop_info_t; /*!\brief The decision made by the external rate control model to set the From 36c9b2d6900f7decbf0f6d775f01c23248b714b4 Mon Sep 17 00:00:00 2001 From: James Zern Date: Thu, 2 Jun 2022 16:34:21 -0700 Subject: [PATCH 0393/1000] .gitignore: add android studio / vscode folders Change-Id: I039a96bc33f55d9ba8bca9f9f6b69135659d2351 --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 5f26835386..99eeb92d0b 100644 --- a/.gitignore +++ b/.gitignore @@ -7,8 +7,10 @@ *.o *~ .cproject +.idea .project .settings +.vscode /*-*.mk /*.asm /*.doxy From abfca783ed175c037dc53ce62ada06e1f3319bd8 Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 27 May 2022 19:36:47 -0700 Subject: [PATCH 0394/1000] test/*: normalize use of nullptr this is preferred over NULL in C++11 Change-Id: Ic48ddcc6dfb8975a57f6713549ad04d93db21415 (cherry picked from commit c304ec38d05040b74de4aacada62c4a336714341) --- test/buffer.h | 14 +++++++------- test/codec_factory.h | 16 ++++++++-------- test/decode_test_driver.h | 2 +- test/encode_test_driver.h | 2 +- test/ivf_video_source.h | 14 +++++++------- test/register_state_check.h | 2 +- test/video_source.h | 22 ++++++++++++---------- test/webm_video_source.h | 14 ++++++++------ test/y4m_test.cc | 4 ++-- test/y4m_video_source.h | 16 ++++++++-------- test/yuv_video_source.h | 16 +++++++++------- 11 files changed, 64 insertions(+), 58 deletions(-) diff --git a/test/buffer.h b/test/buffer.h index b003d2f0d0..023939cedf 100644 --- a/test/buffer.h +++ b/test/buffer.h @@ -31,7 +31,7 @@ class Buffer { : width_(width), height_(height), top_padding_(top_padding), left_padding_(left_padding), right_padding_(right_padding), bottom_padding_(bottom_padding), alignment_(0), padding_value_(0), - stride_(0), raw_size_(0), num_elements_(0), raw_buffer_(NULL) {} + stride_(0), raw_size_(0), num_elements_(0), raw_buffer_(nullptr) {} Buffer(int width, int height, int top_padding, int left_padding, int right_padding, int bottom_padding, unsigned int alignment) @@ -39,19 +39,19 @@ class Buffer { left_padding_(left_padding), right_padding_(right_padding), bottom_padding_(bottom_padding), alignment_(alignment), padding_value_(0), stride_(0), raw_size_(0), num_elements_(0), - raw_buffer_(NULL) {} + raw_buffer_(nullptr) {} Buffer(int width, int height, int padding) : width_(width), height_(height), top_padding_(padding), left_padding_(padding), right_padding_(padding), bottom_padding_(padding), alignment_(0), padding_value_(0), stride_(0), - raw_size_(0), num_elements_(0), raw_buffer_(NULL) {} + raw_size_(0), num_elements_(0), raw_buffer_(nullptr) {} Buffer(int width, int height, int padding, unsigned int alignment) : width_(width), height_(height), top_padding_(padding), left_padding_(padding), right_padding_(padding), bottom_padding_(padding), alignment_(alignment), padding_value_(0), - stride_(0), raw_size_(0), num_elements_(0), raw_buffer_(NULL) {} + stride_(0), raw_size_(0), num_elements_(0), raw_buffer_(nullptr) {} ~Buffer() { if (alignment_) { @@ -103,7 +103,7 @@ class Buffer { bool CheckValues(const Buffer &a) const; bool Init() { - if (raw_buffer_ != NULL) return false; + if (raw_buffer_ != nullptr) return false; EXPECT_GT(width_, 0); EXPECT_GT(height_, 0); EXPECT_GE(top_padding_, 0); @@ -126,7 +126,7 @@ class Buffer { } else { raw_buffer_ = new (std::nothrow) T[num_elements_]; } - EXPECT_TRUE(raw_buffer_ != NULL); + EXPECT_NE(raw_buffer_, nullptr); SetPadding(std::numeric_limits::max()); return !::testing::Test::HasFailure(); } @@ -150,7 +150,7 @@ class Buffer { template T *Buffer::TopLeftPixel() const { - if (!raw_buffer_) return NULL; + if (!raw_buffer_) return nullptr; return raw_buffer_ + (top_padding_ * stride_) + left_padding_; } diff --git a/test/codec_factory.h b/test/codec_factory.h index 77ce49de9f..96092610c6 100644 --- a/test/codec_factory.h +++ b/test/codec_factory.h @@ -88,7 +88,7 @@ class VP8Decoder : public Decoder { #if CONFIG_VP8_DECODER return &vpx_codec_vp8_dx_algo; #else - return NULL; + return nullptr; #endif } }; @@ -104,7 +104,7 @@ class VP8Encoder : public Encoder { #if CONFIG_VP8_ENCODER return &vpx_codec_vp8_cx_algo; #else - return NULL; + return nullptr; #endif } }; @@ -124,7 +124,7 @@ class VP8CodecFactory : public CodecFactory { #else (void)cfg; (void)flags; - return NULL; + return nullptr; #endif } @@ -139,7 +139,7 @@ class VP8CodecFactory : public CodecFactory { (void)deadline; (void)init_flags; (void)stats; - return NULL; + return nullptr; #endif } @@ -184,7 +184,7 @@ class VP9Decoder : public Decoder { #if CONFIG_VP9_DECODER return &vpx_codec_vp9_dx_algo; #else - return NULL; + return nullptr; #endif } }; @@ -200,7 +200,7 @@ class VP9Encoder : public Encoder { #if CONFIG_VP9_ENCODER return &vpx_codec_vp9_cx_algo; #else - return NULL; + return nullptr; #endif } }; @@ -220,7 +220,7 @@ class VP9CodecFactory : public CodecFactory { #else (void)cfg; (void)flags; - return NULL; + return nullptr; #endif } @@ -235,7 +235,7 @@ class VP9CodecFactory : public CodecFactory { (void)deadline; (void)init_flags; (void)stats; - return NULL; + return nullptr; #endif } diff --git a/test/decode_test_driver.h b/test/decode_test_driver.h index 04876cdd7c..f446ab4664 100644 --- a/test/decode_test_driver.h +++ b/test/decode_test_driver.h @@ -24,7 +24,7 @@ class CompressedVideoSource; class DxDataIterator { public: explicit DxDataIterator(vpx_codec_ctx_t *decoder) - : decoder_(decoder), iter_(NULL) {} + : decoder_(decoder), iter_(nullptr) {} const vpx_image_t *Next() { return vpx_codec_get_frame(decoder_, &iter_); } diff --git a/test/encode_test_driver.h b/test/encode_test_driver.h index 38c61952eb..7085945f6a 100644 --- a/test/encode_test_driver.h +++ b/test/encode_test_driver.h @@ -49,7 +49,7 @@ enum TestMode { class CxDataIterator { public: explicit CxDataIterator(vpx_codec_ctx_t *encoder) - : encoder_(encoder), iter_(NULL) {} + : encoder_(encoder), iter_(nullptr) {} const vpx_codec_cx_pkt_t *Next() { return vpx_codec_get_cx_data(encoder_, &iter_); diff --git a/test/ivf_video_source.h b/test/ivf_video_source.h index 22c05ecde9..a8ac4f154c 100644 --- a/test/ivf_video_source.h +++ b/test/ivf_video_source.h @@ -29,8 +29,9 @@ static unsigned int MemGetLe32(const uint8_t *mem) { class IVFVideoSource : public CompressedVideoSource { public: explicit IVFVideoSource(const std::string &file_name) - : file_name_(file_name), input_file_(NULL), compressed_frame_buf_(NULL), - frame_sz_(0), frame_(0), end_of_file_(false) {} + : file_name_(file_name), input_file_(nullptr), + compressed_frame_buf_(nullptr), frame_sz_(0), frame_(0), + end_of_file_(false) {} virtual ~IVFVideoSource() { delete[] compressed_frame_buf_; @@ -41,13 +42,12 @@ class IVFVideoSource : public CompressedVideoSource { virtual void Init() { // Allocate a buffer for read in the compressed video frame. compressed_frame_buf_ = new uint8_t[libvpx_test::kCodeBufferSize]; - ASSERT_TRUE(compressed_frame_buf_ != NULL) - << "Allocate frame buffer failed"; + ASSERT_NE(compressed_frame_buf_, nullptr) << "Allocate frame buffer failed"; } virtual void Begin() { input_file_ = OpenTestDataFile(file_name_); - ASSERT_TRUE(input_file_ != NULL) + ASSERT_NE(input_file_, nullptr) << "Input file open failed. Filename: " << file_name_; // Read file header @@ -68,7 +68,7 @@ class IVFVideoSource : public CompressedVideoSource { } void FillFrame() { - ASSERT_TRUE(input_file_ != NULL); + ASSERT_NE(input_file_, nullptr); uint8_t frame_hdr[kIvfFrameHdrSize]; // Check frame header and read a frame from input_file. if (fread(frame_hdr, 1, kIvfFrameHdrSize, input_file_) != @@ -87,7 +87,7 @@ class IVFVideoSource : public CompressedVideoSource { } virtual const uint8_t *cxdata() const { - return end_of_file_ ? NULL : compressed_frame_buf_; + return end_of_file_ ? nullptr : compressed_frame_buf_; } virtual size_t frame_size() const { return frame_sz_; } virtual unsigned int frame_number() const { return frame_; } diff --git a/test/register_state_check.h b/test/register_state_check.h index 1746240c61..0b837dd042 100644 --- a/test/register_state_check.h +++ b/test/register_state_check.h @@ -56,7 +56,7 @@ class RegisterStateCheck { private: static bool StoreRegisters(CONTEXT *const context) { const HANDLE this_thread = GetCurrentThread(); - EXPECT_TRUE(this_thread != NULL); + EXPECT_NE(this_thread, nullptr); context->ContextFlags = CONTEXT_FLOATING_POINT; const bool context_saved = GetThreadContext(this_thread, context) == TRUE; EXPECT_TRUE(context_saved) << "GetLastError: " << GetLastError(); diff --git a/test/video_source.h b/test/video_source.h index 349e3de37c..a10ff6fb09 100644 --- a/test/video_source.h +++ b/test/video_source.h @@ -42,7 +42,7 @@ namespace libvpx_test { // A simple function to encapsulate cross platform retrieval of test data path static std::string GetDataPath() { const char *const data_path = getenv("LIBVPX_TEST_DATA_PATH"); - if (data_path == NULL) { + if (data_path == nullptr) { #ifdef LIBVPX_TEST_DATA_PATH // In some environments, we cannot set environment variables // Instead, we set the data path by using a preprocessor symbol @@ -76,10 +76,10 @@ static FILE *GetTempOutFile(std::string *file_name) { return fopen(fname, "wb+"); } } - return NULL; + return nullptr; #else std::string temp_dir = testing::TempDir(); - if (temp_dir.empty()) return NULL; + if (temp_dir.empty()) return nullptr; // Versions of testing::TempDir() prior to release-1.11.0-214-g5e6a5336 may // use the value of an environment variable without checking for a trailing // path delimiter. @@ -87,12 +87,12 @@ static FILE *GetTempOutFile(std::string *file_name) { const char name_template[] = "libvpxtest.XXXXXX"; std::unique_ptr temp_file_name( new char[temp_dir.size() + sizeof(name_template)]); - if (temp_file_name == nullptr) return NULL; + if (temp_file_name == nullptr) return nullptr; memcpy(temp_file_name.get(), temp_dir.data(), temp_dir.size()); memcpy(temp_file_name.get() + temp_dir.size(), name_template, sizeof(name_template)); const int fd = mkstemp(temp_file_name.get()); - if (fd == -1) return NULL; + if (fd == -1) return nullptr; *file_name = temp_file_name.get(); return fdopen(fd, "wb+"); #endif @@ -114,7 +114,7 @@ class TempOutFile { void CloseFile() { if (file_) { fclose(file_); - file_ = NULL; + file_ = nullptr; } } FILE *file_; @@ -133,7 +133,7 @@ class VideoSource { // Advance the cursor to the next frame virtual void Next() = 0; - // Get the current video frame, or NULL on End-Of-Stream. + // Get the current video frame, or nullptr on End-Of-Stream. virtual vpx_image_t *img() const = 0; // Get the presentation timestamp of the current frame. @@ -155,7 +155,7 @@ class VideoSource { class DummyVideoSource : public VideoSource { public: DummyVideoSource() - : img_(NULL), limit_(100), width_(80), height_(64), + : img_(nullptr), limit_(100), width_(80), height_(64), format_(VPX_IMG_FMT_I420) { ReallocImage(); } @@ -172,7 +172,9 @@ class DummyVideoSource : public VideoSource { FillFrame(); } - virtual vpx_image_t *img() const { return (frame_ < limit_) ? img_ : NULL; } + virtual vpx_image_t *img() const { + return (frame_ < limit_) ? img_ : nullptr; + } // Models a stream where Timebase = 1/FPS, so pts == frame. virtual vpx_codec_pts_t pts() const { return frame_; } @@ -212,7 +214,7 @@ class DummyVideoSource : public VideoSource { void ReallocImage() { vpx_img_free(img_); - img_ = vpx_img_alloc(NULL, format_, width_, height_, 32); + img_ = vpx_img_alloc(nullptr, format_, width_, height_, 32); ASSERT_NE(img_, nullptr); raw_sz_ = ((img_->w + 31) & ~31u) * img_->h * img_->bps / 8; } diff --git a/test/webm_video_source.h b/test/webm_video_source.h index 6f55f7db7c..d245926298 100644 --- a/test/webm_video_source.h +++ b/test/webm_video_source.h @@ -26,11 +26,11 @@ class WebMVideoSource : public CompressedVideoSource { public: explicit WebMVideoSource(const std::string &file_name) : file_name_(file_name), vpx_ctx_(new VpxInputContext()), - webm_ctx_(new WebmInputContext()), buf_(NULL), buf_sz_(0), frame_(0), + webm_ctx_(new WebmInputContext()), buf_(nullptr), buf_sz_(0), frame_(0), end_of_file_(false) {} virtual ~WebMVideoSource() { - if (vpx_ctx_->file != NULL) fclose(vpx_ctx_->file); + if (vpx_ctx_->file != nullptr) fclose(vpx_ctx_->file); webm_free(webm_ctx_); delete vpx_ctx_; delete webm_ctx_; @@ -40,7 +40,7 @@ class WebMVideoSource : public CompressedVideoSource { virtual void Begin() { vpx_ctx_->file = OpenTestDataFile(file_name_); - ASSERT_TRUE(vpx_ctx_->file != NULL) + ASSERT_NE(vpx_ctx_->file, nullptr) << "Input file open failed. Filename: " << file_name_; ASSERT_EQ(file_is_webm(webm_ctx_, vpx_ctx_), 1) << "file is not WebM"; @@ -54,7 +54,7 @@ class WebMVideoSource : public CompressedVideoSource { } void FillFrame() { - ASSERT_TRUE(vpx_ctx_->file != NULL); + ASSERT_NE(vpx_ctx_->file, nullptr); const int status = webm_read_frame(webm_ctx_, &buf_, &buf_sz_); ASSERT_GE(status, 0) << "webm_read_frame failed"; if (status == 1) { @@ -63,7 +63,7 @@ class WebMVideoSource : public CompressedVideoSource { } void SeekToNextKeyFrame() { - ASSERT_TRUE(vpx_ctx_->file != NULL); + ASSERT_NE(vpx_ctx_->file, nullptr); do { const int status = webm_read_frame(webm_ctx_, &buf_, &buf_sz_); ASSERT_GE(status, 0) << "webm_read_frame failed"; @@ -74,7 +74,9 @@ class WebMVideoSource : public CompressedVideoSource { } while (!webm_ctx_->is_key_frame && !end_of_file_); } - virtual const uint8_t *cxdata() const { return end_of_file_ ? NULL : buf_; } + virtual const uint8_t *cxdata() const { + return end_of_file_ ? nullptr : buf_; + } virtual size_t frame_size() const { return buf_sz_; } virtual unsigned int frame_number() const { return frame_; } diff --git a/test/y4m_test.cc b/test/y4m_test.cc index 89c6552c5d..32f2cd51d3 100644 --- a/test/y4m_test.cc +++ b/test/y4m_test.cc @@ -202,7 +202,7 @@ TEST(Y4MHeaderTest, RegularHeader) { EXPECT_EQ(0, fseek(f.file(), 0, 0)); y4m_input y4m; - EXPECT_EQ(y4m_input_open(&y4m, f.file(), /*skip_buffer=*/NULL, + EXPECT_EQ(y4m_input_open(&y4m, f.file(), /*skip_buffer=*/nullptr, /*num_skip=*/0, /*only_420=*/0), 0); EXPECT_EQ(y4m.pic_w, 4); @@ -229,7 +229,7 @@ TEST(Y4MHeaderTest, LongHeader) { EXPECT_EQ(fseek(f.file(), 0, 0), 0); y4m_input y4m; - EXPECT_EQ(y4m_input_open(&y4m, f.file(), /*skip_buffer=*/NULL, + EXPECT_EQ(y4m_input_open(&y4m, f.file(), /*skip_buffer=*/nullptr, /*num_skip=*/0, /*only_420=*/0), 0); EXPECT_EQ(y4m.pic_w, 4); diff --git a/test/y4m_video_source.h b/test/y4m_video_source.h index 89aa2a44fc..71fbf31931 100644 --- a/test/y4m_video_source.h +++ b/test/y4m_video_source.h @@ -23,7 +23,7 @@ namespace libvpx_test { class Y4mVideoSource : public VideoSource { public: Y4mVideoSource(const std::string &file_name, unsigned int start, int limit) - : file_name_(file_name), input_file_(NULL), img_(new vpx_image_t()), + : file_name_(file_name), input_file_(nullptr), img_(new vpx_image_t()), start_(start), limit_(limit), frame_(0), framerate_numerator_(0), framerate_denominator_(0), y4m_() {} @@ -35,13 +35,13 @@ class Y4mVideoSource : public VideoSource { virtual void OpenSource() { CloseSource(); input_file_ = OpenTestDataFile(file_name_); - ASSERT_TRUE(input_file_ != NULL) + ASSERT_NE(input_file_, nullptr) << "Input file open failed. Filename: " << file_name_; } virtual void ReadSourceToStart() { - ASSERT_TRUE(input_file_ != NULL); - ASSERT_FALSE(y4m_input_open(&y4m_, input_file_, NULL, 0, 0)); + ASSERT_NE(input_file_, nullptr); + ASSERT_FALSE(y4m_input_open(&y4m_, input_file_, nullptr, 0, 0)); framerate_numerator_ = y4m_.fps_n; framerate_denominator_ = y4m_.fps_d; frame_ = 0; @@ -62,7 +62,7 @@ class Y4mVideoSource : public VideoSource { } virtual vpx_image_t *img() const { - return (frame_ < limit_) ? img_.get() : NULL; + return (frame_ < limit_) ? img_.get() : nullptr; } // Models a stream where Timebase = 1/FPS, so pts == frame. @@ -80,7 +80,7 @@ class Y4mVideoSource : public VideoSource { virtual unsigned int limit() const { return limit_; } virtual void FillFrame() { - ASSERT_TRUE(input_file_ != NULL); + ASSERT_NE(input_file_, nullptr); // Read a frame from input_file. y4m_input_fetch_frame(&y4m_, input_file_, img_.get()); } @@ -101,9 +101,9 @@ class Y4mVideoSource : public VideoSource { void CloseSource() { y4m_input_close(&y4m_); y4m_ = y4m_input(); - if (input_file_ != NULL) { + if (input_file_ != nullptr) { fclose(input_file_); - input_file_ = NULL; + input_file_ = nullptr; } } diff --git a/test/yuv_video_source.h b/test/yuv_video_source.h index 383ab8f1b1..51948c0efb 100644 --- a/test/yuv_video_source.h +++ b/test/yuv_video_source.h @@ -27,8 +27,8 @@ class YUVVideoSource : public VideoSource { YUVVideoSource(const std::string &file_name, vpx_img_fmt format, unsigned int width, unsigned int height, int rate_numerator, int rate_denominator, unsigned int start, int limit) - : file_name_(file_name), input_file_(NULL), img_(NULL), start_(start), - limit_(limit), frame_(0), width_(0), height_(0), + : file_name_(file_name), input_file_(nullptr), img_(nullptr), + start_(start), limit_(limit), frame_(0), width_(0), height_(0), format_(VPX_IMG_FMT_NONE), framerate_numerator_(rate_numerator), framerate_denominator_(rate_denominator) { // This initializes format_, raw_size_, width_, height_ and allocates img. @@ -43,7 +43,7 @@ class YUVVideoSource : public VideoSource { virtual void Begin() { if (input_file_) fclose(input_file_); input_file_ = OpenTestDataFile(file_name_); - ASSERT_TRUE(input_file_ != NULL) + ASSERT_NE(input_file_, nullptr) << "Input file open failed. Filename: " << file_name_; if (start_) { fseek(input_file_, static_cast(raw_size_) * start_, SEEK_SET); @@ -58,7 +58,9 @@ class YUVVideoSource : public VideoSource { FillFrame(); } - virtual vpx_image_t *img() const { return (frame_ < limit_) ? img_ : NULL; } + virtual vpx_image_t *img() const { + return (frame_ < limit_) ? img_ : nullptr; + } // Models a stream where Timebase = 1/FPS, so pts == frame. virtual vpx_codec_pts_t pts() const { return frame_; } @@ -78,8 +80,8 @@ class YUVVideoSource : public VideoSource { vpx_img_fmt format) { if (width != width_ || height != height_ || format != format_) { vpx_img_free(img_); - img_ = vpx_img_alloc(NULL, format, width, height, 1); - ASSERT_TRUE(img_ != NULL); + img_ = vpx_img_alloc(nullptr, format, width, height, 1); + ASSERT_NE(img_, nullptr); width_ = width; height_ = height; format_ = format; @@ -99,7 +101,7 @@ class YUVVideoSource : public VideoSource { } virtual void FillFrame() { - ASSERT_TRUE(input_file_ != NULL); + ASSERT_NE(input_file_, nullptr); // Read a frame from input_file. if (fread(img_->img_data, raw_size_, 1, input_file_) == 0) { limit_ = frame_; From 9546c699fb06a8e0f0c92be92159034a28005ac2 Mon Sep 17 00:00:00 2001 From: James Zern Date: Thu, 2 Jun 2022 19:38:03 -0700 Subject: [PATCH 0395/1000] libs.mk,build/make/Makefile: make test targets ordinary rules this fixes a regression in make 4.2 and still present in 4.3 causing double colon rules to be serialized which breaks sharding done by the test and test-no-data-check rules. these targets only define one set of rules so ordinary rules work unlike clean. install may be another candidate, but that's left for a follow up. Change-Id: I9f074eca2ad266eeca6e31aae2e9f31eec8680e0 Tested: make 3.81, 4.1, 4.2, 4.2.1, 4.3 --- build/make/Makefile | 14 +++++++------- libs.mk | 8 ++++---- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/build/make/Makefile b/build/make/Makefile index b7a873cc81..5c38c18e57 100644 --- a/build/make/Makefile +++ b/build/make/Makefile @@ -21,9 +21,9 @@ all: .DEFAULT clean:: .DEFAULT exampletest: .DEFAULT install:: .DEFAULT -test:: .DEFAULT -test-no-data-check:: .DEFAULT -testdata:: .DEFAULT +test: .DEFAULT +test-no-data-check: .DEFAULT +testdata: .DEFAULT utiltest: .DEFAULT exampletest-no-data-check utiltest-no-data-check: .DEFAULT test_%: .DEFAULT ; @@ -111,13 +111,13 @@ exampletest: .PHONY: install install:: .PHONY: test -test:: +test: .PHONY: testdata -testdata:: +testdata: .PHONY: utiltest utiltest: .PHONY: test-no-data-check exampletest-no-data-check utiltest-no-data-check -test-no-data-check:: +test-no-data-check: exampletest-no-data-check utiltest-no-data-check: # Force to realign stack always on OS/2 @@ -465,6 +465,6 @@ INSTALL_TARGETS += .install-docs .install-srcs .install-libs .install-bins all: $(BUILD_TARGETS) install:: $(INSTALL_TARGETS) dist: $(INSTALL_TARGETS) -test:: +test: .SUFFIXES: # Delete default suffix rules diff --git a/libs.mk b/libs.mk index b59bb45e1e..b87f8bf8b0 100644 --- a/libs.mk +++ b/libs.mk @@ -536,7 +536,7 @@ $(LIBVPX_TEST_DATA): $(SRC_PATH_BARE)/test/test-data.sha1 esac \ ) -testdata:: $(LIBVPX_TEST_DATA) +testdata: $(LIBVPX_TEST_DATA) $(qexec)[ -x "$$(which sha1sum)" ] && sha1sum=sha1sum;\ [ -x "$$(which shasum)" ] && sha1sum=shasum;\ [ -x "$$(which sha1)" ] && sha1sum=sha1;\ @@ -709,15 +709,15 @@ INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(TEST_INTRA_PRED_SPEED_SRCS) INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(RC_INTERFACE_TEST_SRCS) define test_shard_template -test:: test_shard.$(1) -test-no-data-check:: test_shard_ndc.$(1) +test: test_shard.$(1) +test-no-data-check: test_shard_ndc.$(1) test_shard.$(1) test_shard_ndc.$(1): $(LIBVPX_TEST_BIN) @set -e; \ export GTEST_SHARD_INDEX=$(1); \ export GTEST_TOTAL_SHARDS=$(2); \ $(LIBVPX_TEST_BIN) test_shard.$(1): testdata -.PHONY: test_shard.$(1) +.PHONY: test_shard.$(1) test_shard_ndc.$(1) endef NUM_SHARDS := 10 From 386f25be5353978c28866e442f844f6bd2a1537e Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 27 May 2022 21:50:11 -0700 Subject: [PATCH 0396/1000] vp8e_set_config: setjmp before calling vp8_change_config vp8_change_config may call vp8_alloc_compressor_data which expects failures detected by CHECK_MEM_ERROR to not return. Change-Id: Ib7fbf4af904bd9b539402bb61c8f87855eef2ad6 (cherry picked from commit 365eebc147627ae83ec8b36077198d8cfb5e0128) --- vp8/vp8_cx_iface.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c index 21fed0e8ed..340f3e6638 100644 --- a/vp8/vp8_cx_iface.c +++ b/vp8/vp8_cx_iface.c @@ -473,14 +473,23 @@ static vpx_codec_err_t vp8e_set_config(vpx_codec_alg_priv_t *ctx, ERROR("Cannot increase lag_in_frames"); res = validate_config(ctx, cfg, &ctx->vp8_cfg, 0); + if (res != VPX_CODEC_OK) return res; - if (!res) { - ctx->cfg = *cfg; - set_vp8e_config(&ctx->oxcf, ctx->cfg, ctx->vp8_cfg, NULL); - vp8_change_config(ctx->cpi, &ctx->oxcf); + if (setjmp(ctx->cpi->common.error.jmp)) { + const vpx_codec_err_t codec_err = + update_error_state(ctx, &ctx->cpi->common.error); + ctx->cpi->common.error.setjmp = 0; + vpx_clear_system_state(); + assert(codec_err != VPX_CODEC_OK); + return codec_err; } - return res; + ctx->cpi->common.error.setjmp = 1; + ctx->cfg = *cfg; + set_vp8e_config(&ctx->oxcf, ctx->cfg, ctx->vp8_cfg, NULL); + vp8_change_config(ctx->cpi, &ctx->oxcf); + ctx->cpi->common.error.setjmp = 0; + return VPX_CODEC_OK; } static vpx_codec_err_t get_quantizer(vpx_codec_alg_priv_t *ctx, va_list args) { From 6e7f6363965b959f026956903fcd2aeb55698110 Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 27 May 2022 21:50:11 -0700 Subject: [PATCH 0397/1000] vp9e_set_config: setjmp before calling vp9_change_config vp9_change_config may call functions that perform allocations which expect failures detected by CHECK_MEM_ERROR to not return. Change-Id: I1dd1eca9c661ed157d51b4a6a77fc9f88236d794 (cherry picked from commit 3997d9bc6286ba075879353b87678986cdbfa347) --- vp9/vp9_cx_iface.c | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index b809ab3e6f..63d8f44878 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -780,7 +780,7 @@ static vpx_codec_err_t set_twopass_params_from_config( static vpx_codec_err_t encoder_set_config(vpx_codec_alg_priv_t *ctx, const vpx_codec_enc_cfg_t *cfg) { vpx_codec_err_t res; - int force_key = 0; + volatile int force_key = 0; if (cfg->g_w != ctx->cfg.g_w || cfg->g_h != ctx->cfg.g_h) { if (cfg->g_lag_in_frames > 1 || cfg->g_pass != VPX_RC_ONE_PASS) @@ -799,19 +799,28 @@ static vpx_codec_err_t encoder_set_config(vpx_codec_alg_priv_t *ctx, ERROR("Cannot increase lag_in_frames"); res = validate_config(ctx, cfg, &ctx->extra_cfg); + if (res != VPX_CODEC_OK) return res; - if (res == VPX_CODEC_OK) { - ctx->cfg = *cfg; - set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg); - set_twopass_params_from_config(&ctx->cfg, ctx->cpi); - // On profile change, request a key frame - force_key |= ctx->cpi->common.profile != ctx->oxcf.profile; - vp9_change_config(ctx->cpi, &ctx->oxcf); + if (setjmp(ctx->cpi->common.error.jmp)) { + const vpx_codec_err_t codec_err = + update_error_state(ctx, &ctx->cpi->common.error); + ctx->cpi->common.error.setjmp = 0; + vpx_clear_system_state(); + assert(codec_err != VPX_CODEC_OK); + return codec_err; } + ctx->cfg = *cfg; + set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg); + set_twopass_params_from_config(&ctx->cfg, ctx->cpi); + // On profile change, request a key frame + force_key |= ctx->cpi->common.profile != ctx->oxcf.profile; + vp9_change_config(ctx->cpi, &ctx->oxcf); + if (force_key) ctx->next_frame_flags |= VPX_EFLAG_FORCE_KF; - return res; + ctx->cpi->common.error.setjmp = 0; + return VPX_CODEC_OK; } static vpx_codec_err_t ctrl_get_quantizer(vpx_codec_alg_priv_t *ctx, From db754a45328c03dcb27414727b35d61294eb172c Mon Sep 17 00:00:00 2001 From: James Zern Date: Sat, 28 May 2022 15:24:37 -0700 Subject: [PATCH 0398/1000] vp9_change_config: check vp9_alloc_loop_filter return Change-Id: I4cba67a5ab192d1cf1dbfb5c039a93a4952b071e (cherry picked from commit 6549e76307631de7e37459fceb23b4eee4573620) --- vp9/encoder/vp9_encoder.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 89b7c8e246..d3f4d1ea81 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -2057,7 +2057,10 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { cpi->external_resize = 0; } else if (cm->mi_alloc_size == new_mi_size && (cpi->oxcf.width > last_w || cpi->oxcf.height > last_h)) { - vp9_alloc_loop_filter(cm); + if (vp9_alloc_loop_filter(cm)) { + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to allocate loop filter data"); + } } } From 3c5529e313b3a08cfdcd393efbab3df04aab6074 Mon Sep 17 00:00:00 2001 From: Cheng Chen Date: Fri, 3 Jun 2022 15:08:48 -0700 Subject: [PATCH 0399/1000] L2E: Use bit mask to represent control type The bit mask allows us to easily add an additional control mode which both the QP and GOP are controlled by an external model. Change-Id: I49f676f622a6e70feb2a39dc97a4e5050b7f4760 --- vp9/encoder/vp9_encoder.c | 4 ++-- vp9/encoder/vp9_ext_ratectrl.c | 4 ++-- vp9/encoder/vp9_firstpass.c | 2 +- vpx/vpx_ext_ratectrl.h | 8 +++++++- 4 files changed, 12 insertions(+), 6 deletions(-) diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index a511aa7645..2fb88a9f70 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -4492,7 +4492,7 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest } #endif // CONFIG_RATE_CTRL if (cpi->ext_ratectrl.ready && !ext_rc_recode && - cpi->ext_ratectrl.funcs.rc_type == VPX_RC_QP) { + (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_QP) != 0) { vpx_codec_err_t codec_status; const GF_GROUP *gf_group = &cpi->twopass.gf_group; vpx_rc_encodeframe_decision_t encode_frame_decision; @@ -4553,7 +4553,7 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest } if (cpi->ext_ratectrl.ready && - cpi->ext_ratectrl.funcs.rc_type == VPX_RC_QP) { + (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_QP) != 0) { last_q_attempt = q; // In general, for the external rate control, we take the qindex provided // as input and encode the frame with this qindex faithfully. However, diff --git a/vp9/encoder/vp9_ext_ratectrl.c b/vp9/encoder/vp9_ext_ratectrl.c index ba57b86f60..d5b60b02a6 100644 --- a/vp9/encoder/vp9_ext_ratectrl.c +++ b/vp9/encoder/vp9_ext_ratectrl.c @@ -143,7 +143,7 @@ vpx_codec_err_t vp9_extrc_get_encodeframe_decision( if (ext_ratectrl == NULL) { return VPX_CODEC_INVALID_PARAM; } - if (ext_ratectrl->ready && ext_ratectrl->funcs.rc_type == VPX_RC_QP) { + if (ext_ratectrl->ready && (ext_ratectrl->funcs.rc_type & VPX_RC_QP) != 0) { vpx_rc_status_t rc_status; vpx_rc_encodeframe_info_t encode_frame_info; encode_frame_info.show_index = show_index; @@ -204,7 +204,7 @@ vpx_codec_err_t vp9_extrc_get_gop_decision( vpx_rc_gop_decision_t *gop_decision) { vpx_rc_status_t rc_status; if (ext_ratectrl == NULL || !ext_ratectrl->ready || - ext_ratectrl->funcs.rc_type != VPX_RC_GOP) { + (ext_ratectrl->funcs.rc_type & VPX_RC_GOP) == 0) { return VPX_CODEC_INVALID_PARAM; } rc_status = ext_ratectrl->funcs.get_gop_decision(ext_ratectrl->model, diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index 2b3c174693..9ed59e8ead 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -2758,7 +2758,7 @@ static void define_gf_group(VP9_COMP *cpi, int gf_start_show_idx) { // are overwritten. Specifically, |gop_coding_frames| and |use_alt_ref| // will be overwritten. if (cpi->ext_ratectrl.ready && - cpi->ext_ratectrl.funcs.rc_type == VPX_RC_GOP) { + (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_GOP) != 0) { vpx_codec_err_t codec_status; vpx_rc_gop_decision_t gop_decision; vpx_rc_gop_info_t gop_info; diff --git a/vpx/vpx_ext_ratectrl.h b/vpx/vpx_ext_ratectrl.h index db9af444c4..c3feac55e4 100644 --- a/vpx/vpx_ext_ratectrl.h +++ b/vpx/vpx_ext_ratectrl.h @@ -32,8 +32,14 @@ extern "C" { * quantization parameter (QP) for each frame. * In VPX_RC_GOP mode, the external rate control model determines the * group of picture (GOP) of the video sequence. + * In VPX_RC_GOP_QP mode, the external rate control model determines + * both the QP and the GOP. */ -typedef enum vpx_rc_type { VPX_RC_QP = 1, VPX_RC_GOP = 2 } vpx_rc_type_t; +typedef enum vpx_rc_type { + VPX_RC_QP = 1 << 0, + VPX_RC_GOP = 1 << 1, + VPX_RC_GOP_QP = VPX_RC_QP | VPX_RC_GOP +} vpx_rc_type_t; /*!\brief Abstract rate control model handler * From b21634073652cb7e73e52168dc2fa9c1b9a30adf Mon Sep 17 00:00:00 2001 From: Cheng Chen Date: Mon, 6 Jun 2022 14:53:46 -0700 Subject: [PATCH 0400/1000] L2E: send first pass stats before gop decisions This change let the encoder send first pass stats before gop decisioins so that external models could make use of it. Change-Id: Iafc7eddab93aa77ceaf8e1f7663a52b27d94af80 --- vp9/encoder/vp9_encoder.c | 10 ---------- vp9/encoder/vp9_firstpass.c | 10 ++++++++++ 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 2fb88a9f70..87f369d3bb 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -5802,16 +5802,6 @@ static void Pass2Encode(VP9_COMP *cpi, size_t *size, uint8_t *dest, unsigned int *frame_flags, ENCODE_FRAME_RESULT *encode_frame_result) { cpi->allow_encode_breakout = ENCODE_BREAKOUT_ENABLED; - - if (cpi->common.current_frame_coding_index == 0) { - VP9_COMMON *cm = &cpi->common; - const vpx_codec_err_t codec_status = vp9_extrc_send_firstpass_stats( - &cpi->ext_ratectrl, &cpi->twopass.first_pass_info); - if (codec_status != VPX_CODEC_OK) { - vpx_internal_error(&cm->error, codec_status, - "vp9_extrc_send_firstpass_stats() failed"); - } - } #if CONFIG_MISMATCH_DEBUG mismatch_move_frame_idx_w(); #endif diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index 9ed59e8ead..44a4cec0f9 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -3493,6 +3493,16 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { FIRSTPASS_STATS this_frame; const int show_idx = cm->current_video_frame; + if (cpi->common.current_frame_coding_index == 0) { + VP9_COMMON *cm = &cpi->common; + const vpx_codec_err_t codec_status = vp9_extrc_send_firstpass_stats( + &cpi->ext_ratectrl, &cpi->twopass.first_pass_info); + if (codec_status != VPX_CODEC_OK) { + vpx_internal_error(&cm->error, codec_status, + "vp9_extrc_send_firstpass_stats() failed"); + } + } + if (!twopass->stats_in) return; // Configure image size specific vizier parameters From 7bb4bd36122e55236148190625507efc90ec5b75 Mon Sep 17 00:00:00 2001 From: Cheng Chen Date: Mon, 6 Jun 2022 15:46:41 -0700 Subject: [PATCH 0401/1000] L2E: rename 'gop_index' to 'gop_global_index' 'gop_index' has already been used in vpx_rc_encodeframe_info_t, which represents the frame index inside the current group of picture (gop). We therefore use 'gop_global_index' to represent the index of the current gop to avoid duplicate names. Change-Id: I3eb8987dd878f650649b013e0036e23d0846b5f0 --- test/vp9_ext_ratectrl_test.cc | 12 ++++++------ vp9/encoder/vp9_firstpass.c | 6 +++--- vp9/encoder/vp9_ratectrl.h | 2 +- vpx/vpx_ext_ratectrl.h | 4 ++-- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/test/vp9_ext_ratectrl_test.cc b/test/vp9_ext_ratectrl_test.cc index 1289e2db88..a3af4e98f0 100644 --- a/test/vp9_ext_ratectrl_test.cc +++ b/test/vp9_ext_ratectrl_test.cc @@ -44,7 +44,7 @@ struct ToyRateCtrl { int magic_number; int coding_index; - int gop_index; + int gop_global_index; int frames_since_key; int show_index; }; @@ -73,7 +73,7 @@ vpx_rc_status_t rc_create_model_gop(void *priv, ToyRateCtrl *toy_rate_ctrl = new (std::nothrow) ToyRateCtrl; if (toy_rate_ctrl == nullptr) return VPX_RC_ERROR; toy_rate_ctrl->magic_number = kModelMagicNumber; - toy_rate_ctrl->gop_index = 0; + toy_rate_ctrl->gop_global_index = 0; toy_rate_ctrl->frames_since_key = 0; toy_rate_ctrl->show_index = 0; toy_rate_ctrl->coding_index = 0; @@ -198,13 +198,13 @@ vpx_rc_status_t rc_get_gop_decision(vpx_rc_model_t rate_ctrl_model, if (gop_info->is_key_frame) { EXPECT_EQ(gop_info->last_gop_use_alt_ref, 0); EXPECT_EQ(gop_info->frames_since_key, 0); - EXPECT_EQ(gop_info->gop_index, 0); - toy_rate_ctrl->gop_index = 0; + EXPECT_EQ(gop_info->gop_global_index, 0); + toy_rate_ctrl->gop_global_index = 0; toy_rate_ctrl->frames_since_key = 0; } else { EXPECT_EQ(gop_info->last_gop_use_alt_ref, 1); } - EXPECT_EQ(gop_info->gop_index, toy_rate_ctrl->gop_index); + EXPECT_EQ(gop_info->gop_global_index, toy_rate_ctrl->gop_global_index); EXPECT_EQ(gop_info->frames_since_key, toy_rate_ctrl->frames_since_key); EXPECT_EQ(gop_info->show_index, toy_rate_ctrl->show_index); EXPECT_EQ(gop_info->coding_index, toy_rate_ctrl->coding_index); @@ -217,7 +217,7 @@ vpx_rc_status_t rc_get_gop_decision(vpx_rc_model_t rate_ctrl_model, toy_rate_ctrl->show_index += gop_decision->gop_coding_frames - gop_decision->use_alt_ref; toy_rate_ctrl->coding_index += gop_decision->gop_coding_frames; - ++toy_rate_ctrl->gop_index; + ++toy_rate_ctrl->gop_global_index; return VPX_RC_OK; } diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index 44a4cec0f9..e121ac80e1 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -2714,9 +2714,9 @@ static void define_gf_group(VP9_COMP *cpi, int gf_start_show_idx) { // frame in which case it will already have been done. if (is_key_frame == 0) { vp9_zero(twopass->gf_group); - ++rc->gop_index; + ++rc->gop_global_index; } else { - rc->gop_index = 0; + rc->gop_global_index = 0; } vpx_clear_system_state(); @@ -2772,7 +2772,7 @@ static void define_gf_group(VP9_COMP *cpi, int gf_start_show_idx) { gop_info.lag_in_frames = cpi->oxcf.lag_in_frames; gop_info.show_index = cm->current_video_frame; gop_info.coding_index = cm->current_frame_coding_index; - gop_info.gop_index = rc->gop_index; + gop_info.gop_global_index = rc->gop_global_index; codec_status = vp9_extrc_get_gop_decision(&cpi->ext_ratectrl, &gop_info, &gop_decision); diff --git a/vp9/encoder/vp9_ratectrl.h b/vp9/encoder/vp9_ratectrl.h index 48a21bd1cd..96a8fd3f1d 100644 --- a/vp9/encoder/vp9_ratectrl.h +++ b/vp9/encoder/vp9_ratectrl.h @@ -214,7 +214,7 @@ typedef struct { // The index of the current GOP. Start from zero. // When a key frame is inserted, it resets to zero. - int gop_index; + int gop_global_index; } RATE_CONTROL; struct VP9_COMP; diff --git a/vpx/vpx_ext_ratectrl.h b/vpx/vpx_ext_ratectrl.h index c3feac55e4..6e41abaf40 100644 --- a/vpx/vpx_ext_ratectrl.h +++ b/vpx/vpx_ext_ratectrl.h @@ -81,7 +81,7 @@ typedef struct vpx_rc_encodeframe_info { int show_index; /**< display index, starts from zero*/ int coding_index; /**< coding index, starts from zero*/ /*! - * index in group of picture, starts from zero. + * index of the current frame in this group of picture, starts from zero. */ int gop_index; int ref_frame_coding_indexes[3]; /**< three reference frames' coding indices*/ @@ -323,7 +323,7 @@ typedef struct vpx_rc_gop_info { * The index of the current gop, starts from zero, resets to zero * when a keyframe is set. */ - int gop_index; + int gop_global_index; } vpx_rc_gop_info_t; /*!\brief The decision made by the external rate control model to set the From 7b1b9f7cd23e085d97c26ed026d2c817d78a14d6 Mon Sep 17 00:00:00 2001 From: Cheng Chen Date: Mon, 6 Jun 2022 16:46:55 -0700 Subject: [PATCH 0402/1000] L2E: Use libvpx's default q in case of invalid external value If the external model recommends an invalid q value, we use the default q selected by libvpx's rate control strategy. We update the test so that when the external model wants to control GOP decision, it could get per frame information and just recommend an invalid q. Change-Id: I69be4b0ee0800e7ab0706d305242bb87f001b1f7 --- test/vp9_ext_ratectrl_test.cc | 79 ++++++++++++++++++++++++++++++++++- vp9/encoder/vp9_encoder.c | 8 +++- vpx/vpx_ext_ratectrl.h | 9 ++++ 3 files changed, 92 insertions(+), 4 deletions(-) diff --git a/test/vp9_ext_ratectrl_test.cc b/test/vp9_ext_ratectrl_test.cc index a3af4e98f0..66d4233766 100644 --- a/test/vp9_ext_ratectrl_test.cc +++ b/test/vp9_ext_ratectrl_test.cc @@ -186,6 +186,81 @@ vpx_rc_status_t rc_get_encodeframe_decision( return VPX_RC_OK; } +vpx_rc_status_t rc_get_encodeframe_decision_gop( + vpx_rc_model_t rate_ctrl_model, + const vpx_rc_encodeframe_info_t *encode_frame_info, + vpx_rc_encodeframe_decision_t *frame_decision) { + ToyRateCtrl *toy_rate_ctrl = static_cast(rate_ctrl_model); + EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); + EXPECT_LT(encode_frame_info->show_index, kFrameNumGOP); + EXPECT_EQ(encode_frame_info->coding_index, toy_rate_ctrl->coding_index); + + if (encode_frame_info->coding_index == 0) { + EXPECT_EQ(encode_frame_info->show_index, 0); + EXPECT_EQ(encode_frame_info->gop_index, 0); + EXPECT_EQ(encode_frame_info->frame_type, 0 /*kFrameTypeKey*/); + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0], + 0); // kRefFrameTypeLast + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1], + 0); // kRefFrameTypePast + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2], + 0); // kRefFrameTypeFuture + } + + if (encode_frame_info->coding_index == 1) { + EXPECT_EQ(encode_frame_info->show_index, 1); + EXPECT_EQ(encode_frame_info->gop_index, 1); + EXPECT_EQ(encode_frame_info->frame_type, 1 /*kFrameTypeInter*/); + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0], + 1); // kRefFrameTypeLast + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1], + 0); // kRefFrameTypePast + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2], + 0); // kRefFrameTypeFuture + EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[0], + 0); // kRefFrameTypeLast + } + + if (encode_frame_info->coding_index == 2) { + EXPECT_EQ(encode_frame_info->show_index, 2); + EXPECT_EQ(encode_frame_info->gop_index, 0); + EXPECT_EQ(encode_frame_info->frame_type, 0 /*kFrameTypeKey*/); + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0], + 0); // kRefFrameTypeLast + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1], + 0); // kRefFrameTypePast + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2], + 0); // kRefFrameTypeFuture + } + + if (encode_frame_info->coding_index == 3 || + encode_frame_info->coding_index == 12 || + encode_frame_info->coding_index == 21) { + EXPECT_EQ(encode_frame_info->frame_type, 2 /*kFrameTypeAltRef*/); + EXPECT_EQ(encode_frame_info->gop_index, 1); + } + + if (encode_frame_info->coding_index == 11 || + encode_frame_info->coding_index == 20 || + encode_frame_info->coding_index == 29) { + EXPECT_EQ(encode_frame_info->frame_type, 3 /*kFrameTypeOverlay*/); + EXPECT_EQ(encode_frame_info->gop_index, 0); + } + + if (encode_frame_info->coding_index >= 30) { + EXPECT_EQ(encode_frame_info->frame_type, 1 /*kFrameTypeInter*/); + } + + // When the model recommends an invalid q, valid range [0, 255], + // the encoder will ignore it and use the default q selected + // by libvpx rate control strategy. + frame_decision->q_index = VPX_DEFAULT_Q; + frame_decision->max_frame_size = 0; + + toy_rate_ctrl->coding_index += 1; + return VPX_RC_OK; +} + vpx_rc_status_t rc_get_gop_decision(vpx_rc_model_t rate_ctrl_model, const vpx_rc_gop_info_t *gop_info, vpx_rc_gop_decision_t *gop_decision) { @@ -216,7 +291,6 @@ vpx_rc_status_t rc_get_gop_decision(vpx_rc_model_t rate_ctrl_model, gop_decision->gop_coding_frames - gop_decision->use_alt_ref; toy_rate_ctrl->show_index += gop_decision->gop_coding_frames - gop_decision->use_alt_ref; - toy_rate_ctrl->coding_index += gop_decision->gop_coding_frames; ++toy_rate_ctrl->gop_global_index; return VPX_RC_OK; } @@ -319,9 +393,10 @@ class ExtRateCtrlTestGOP : public ::libvpx_test::EncoderTest, encoder->Control(VP9E_SET_MAX_GF_INTERVAL, kDefaultMaxGfInterval); vpx_rc_funcs_t rc_funcs; - rc_funcs.rc_type = VPX_RC_GOP; + rc_funcs.rc_type = VPX_RC_GOP_QP; rc_funcs.create_model = rc_create_model_gop; rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop; + rc_funcs.get_encodeframe_decision = rc_get_encodeframe_decision_gop; rc_funcs.get_gop_decision = rc_get_gop_decision; rc_funcs.update_encodeframe_result = rc_update_encodeframe_result_gop; rc_funcs.delete_model = rc_delete_model; diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 87f369d3bb..85bd706629 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -4510,8 +4510,12 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest vpx_internal_error(&cm->error, codec_status, "vp9_extrc_get_encodeframe_decision() failed"); } - q = encode_frame_decision.q_index; - ext_rc_max_frame_size = encode_frame_decision.max_frame_size; + // If the external model recommends a reserved value, we use + // libvpx's default q. + if (encode_frame_decision.q_index != VPX_DEFAULT_Q) { + q = encode_frame_decision.q_index; + ext_rc_max_frame_size = encode_frame_decision.max_frame_size; + } } vp9_set_quantizer(cpi, q); diff --git a/vpx/vpx_ext_ratectrl.h b/vpx/vpx_ext_ratectrl.h index 6e41abaf40..b57148c69b 100644 --- a/vpx/vpx_ext_ratectrl.h +++ b/vpx/vpx_ext_ratectrl.h @@ -48,11 +48,20 @@ typedef enum vpx_rc_type { */ typedef void *vpx_rc_model_t; +/*!\brief A reserved value for the q index. + * If the external rate control model returns this value, + * the encoder will use the default q selected by libvpx's rate control + * system. + */ +#define VPX_DEFAULT_Q -1 + /*!\brief Encode frame decision made by the external rate control model * * The encoder will receive the decision from the external rate control model * through get_encodeframe_decision() defined in vpx_rc_funcs_t. * + * If q_index = VPX_DEFAULT_Q, the encoder will use libvpx's default q. + * * If max_frame_size = 0, the encoding ignores max frame size limit. * If max_frame_size = -1, the encoding uses VP9's max frame size as the limit. * If the encoded frame size is larger than max_frame_size, the frame is From 46bfeed2c9a7e52c8d1624f9e388af137e02ff19 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Fri, 10 Jun 2022 13:52:31 -0700 Subject: [PATCH 0403/1000] Convert EncoderTest::last_pts_ to a local variable Convert the data member EncoderTest::last_pts_ to a local variable in the EncoderTest::RunLoop() and VP9FrameSizeTestsLarge::RunLoop() methods. EncoderTest::last_pts_ is only used in these two methods, and these two methods first set EncoderTest::last_pts_ to 0 before using it. So EncoderTest::last_pts_ is effectively a local variable in these two methods. Note that several subclasses of EncoderTest declare their own last_pts_ data member and use it to calculate the data rate. Apparently their own last_pts_ data member hides the same-named data member in the base class. Although this is allowed by C++, this is very confusing. Change-Id: I55ce1cf8cc62e07333d8a902d65b46343a3d5881 --- test/encode_test_driver.cc | 6 +++--- test/encode_test_driver.h | 4 +--- test/frame_size_tests.cc | 6 +++--- 3 files changed, 7 insertions(+), 9 deletions(-) diff --git a/test/encode_test_driver.cc b/test/encode_test_driver.cc index 1ce39eaeff..9ca15ae4d3 100644 --- a/test/encode_test_driver.cc +++ b/test/encode_test_driver.cc @@ -169,7 +169,7 @@ void EncoderTest::RunLoop(VideoSource *video) { ASSERT_TRUE(passes_ == 1 || passes_ == 2); for (unsigned int pass = 0; pass < passes_; pass++) { - last_pts_ = 0; + vpx_codec_pts_t last_pts = 0; if (passes_ == 1) { cfg_.g_pass = VPX_RC_ONE_PASS; @@ -225,8 +225,8 @@ void EncoderTest::RunLoop(VideoSource *video) { has_dxdata = true; } - ASSERT_GE(pkt->data.frame.pts, last_pts_); - last_pts_ = pkt->data.frame.pts; + ASSERT_GE(pkt->data.frame.pts, last_pts); + last_pts = pkt->data.frame.pts; FramePktHook(pkt); break; diff --git a/test/encode_test_driver.h b/test/encode_test_driver.h index 7085945f6a..f6bb841d8c 100644 --- a/test/encode_test_driver.h +++ b/test/encode_test_driver.h @@ -206,8 +206,7 @@ class Encoder { class EncoderTest { protected: explicit EncoderTest(const CodecFactory *codec) - : codec_(codec), abort_(false), init_flags_(0), frame_flags_(0), - last_pts_(0) { + : codec_(codec), abort_(false), init_flags_(0), frame_flags_(0) { // Default to 1 thread. cfg_.g_threads = 1; } @@ -291,7 +290,6 @@ class EncoderTest { TwopassStatsStore stats_; unsigned long init_flags_; unsigned long frame_flags_; - vpx_codec_pts_t last_pts_; }; } // namespace libvpx_test diff --git a/test/frame_size_tests.cc b/test/frame_size_tests.cc index d85c193e0b..8a0eb71ba0 100644 --- a/test/frame_size_tests.cc +++ b/test/frame_size_tests.cc @@ -111,7 +111,7 @@ class VP9FrameSizeTestsLarge : public ::libvpx_test::EncoderTest, ASSERT_TRUE(passes_ == 1 || passes_ == 2); for (unsigned int pass = 0; pass < passes_; pass++) { - last_pts_ = 0; + vpx_codec_pts_t last_pts = 0; if (passes_ == 1) { cfg_.g_pass = VPX_RC_ONE_PASS; @@ -144,8 +144,8 @@ class VP9FrameSizeTestsLarge : public ::libvpx_test::EncoderTest, again = true; switch (pkt->kind) { case VPX_CODEC_CX_FRAME_PKT: - ASSERT_GE(pkt->data.frame.pts, last_pts_); - last_pts_ = pkt->data.frame.pts; + ASSERT_GE(pkt->data.frame.pts, last_pts); + last_pts = pkt->data.frame.pts; FramePktHook(pkt); break; From 013ec5722ce88bebcdcf32b1496fcca413199336 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Mon, 13 Jun 2022 16:29:31 -0400 Subject: [PATCH 0404/1000] Restore backward compatibility This CL breaks the backward compatibility: 1365e7e1a vp9-svc: Remove VP9E_SET_TEMPORAL_LAYERING_MODE Forcing the value of the next element Bug: webm:1752 Change-Id: I83c774b3aa6cca25f2f14995590fb20c0a1668d4 --- vpx/vp8cx.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h index f5dc6d1188..a61238cb10 100644 --- a/vpx/vp8cx.h +++ b/vpx/vp8cx.h @@ -516,7 +516,7 @@ enum vp8e_enc_control_id { * * Supported in codecs: VP9 */ - VP9E_SET_MIN_GF_INTERVAL, + VP9E_SET_MIN_GF_INTERVAL = 48, /*!\brief Codec control function to set minimum interval between GF/ARF frames * From 878266136bfcd8a9132cd60091c7d35943348dbc Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Mon, 13 Jun 2022 16:36:07 -0400 Subject: [PATCH 0405/1000] Update AUTHORS Bug: webm:1752 Change-Id: I08b4100a0e8c003cd9a7bdaf72926c268e02d53c --- AUTHORS | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/AUTHORS b/AUTHORS index 174cc59ee7..fffda63360 100644 --- a/AUTHORS +++ b/AUTHORS @@ -68,11 +68,13 @@ Guillermo Ballester Valor Hangyu Kuang Hanno Böck Han Shen +Hao Chen Harish Mahendrakar Henrik Lundin Hien Ho Hirokazu Honda Hui Su +Ilya Kurdyukov Ivan Krasin Ivan Maltz Jacek Caban @@ -91,9 +93,11 @@ Jeff Petkau Jeremy Leconte Jerome Jiang Jia Jia +Jianhui Dai Jian Zhou Jim Bankoski jinbo +Jin Bo Jingning Han Joel Fernandes Joey Parrish @@ -111,6 +115,7 @@ Justin Clift Justin Lebar Kaustubh Raste KO Myung-Hun +Konstantinos Margaritis Kyle Siefring Lawrence Velázquez Linfeng Zhang @@ -118,6 +123,7 @@ Liu Peng Lou Quillio Luca Barbato Luc Trudeau +Lu Wang Makoto Kato Mans Rullgard Marco Paniconi @@ -131,6 +137,7 @@ Michael Kohler Mike Frysinger Mike Hommey Mikhal Shemer +Mikko Koivisto Min Chen Minghai Shang Min Ye @@ -206,6 +213,7 @@ xiwei gu Yaowu Xu Yi Luo Yongzhe Wang +yuanhecai Yue Chen Yun Liu Yunqing Wang From 027d710a6e6490e84b209985ba49ec01b02482a8 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Mon, 13 Jun 2022 16:29:31 -0400 Subject: [PATCH 0406/1000] Restore backward compatibility This CL breaks the backward compatibility: 1365e7e1a vp9-svc: Remove VP9E_SET_TEMPORAL_LAYERING_MODE Forcing the value of the next element Bug: webm:1752 Change-Id: I83c774b3aa6cca25f2f14995590fb20c0a1668d4 (cherry picked from commit 013ec5722ce88bebcdcf32b1496fcca413199336) --- vpx/vp8cx.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h index f5dc6d1188..a61238cb10 100644 --- a/vpx/vp8cx.h +++ b/vpx/vp8cx.h @@ -516,7 +516,7 @@ enum vp8e_enc_control_id { * * Supported in codecs: VP9 */ - VP9E_SET_MIN_GF_INTERVAL, + VP9E_SET_MIN_GF_INTERVAL = 48, /*!\brief Codec control function to set minimum interval between GF/ARF frames * From 95d196fdf45edf57015cc18635c52b30a93522fd Mon Sep 17 00:00:00 2001 From: James Zern Date: Thu, 16 Jun 2022 18:22:52 -0700 Subject: [PATCH 0407/1000] vp9_cx_iface: set default cpu_used=5 w/CONFIG_REALTIME_ONLY this avoids a crash if cpu-used is not explicitly set as there are some (unnecessary) checks against use_nonrd_pick_mode which would cause encoding to be skipped if the old default of 0 were used Bug: webm:1773 Change-Id: I62fba5fb51d8afa422689b7de3f03e8f7570e50b Fixed: webm:1773 --- test/realtime_test.cc | 24 +++++++++++++++++------- vp9/vp9_cx_iface.c | 6 +++++- 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/test/realtime_test.cc b/test/realtime_test.cc index 853b942824..c5de2dcb35 100644 --- a/test/realtime_test.cc +++ b/test/realtime_test.cc @@ -35,17 +35,19 @@ class RealtimeTest } void BeginPassHook(unsigned int /*pass*/) override { +#if !CONFIG_REALTIME_ONLY // TODO(tomfinegan): We're changing the pass value here to make sure // we get frames when real time mode is combined with |g_pass| set to // VPX_RC_FIRST_PASS. This is necessary because EncoderTest::RunLoop() sets // the pass value based on the mode passed into EncoderTest::SetMode(), // which overrides the one specified in SetUp() above. cfg_.g_pass = VPX_RC_FIRST_PASS; +#endif } void PreEncodeFrameHook(::libvpx_test::VideoSource *video, ::libvpx_test::Encoder *encoder) override { - if (video->frame() == 0) { + if (video->frame() == 0 && set_cpu_used_) { encoder->Control(VP8E_SET_CPUUSED, 8); } } @@ -70,15 +72,23 @@ class RealtimeTest ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); } + void TestEncode() { + ::libvpx_test::RandomVideoSource video; + video.SetSize(kVideoSourceWidth, kVideoSourceHeight); + video.set_limit(kFramesToEncode); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + EXPECT_EQ(kFramesToEncode, frame_packets_); + } + int frame_packets_; + bool set_cpu_used_ = true; }; -TEST_P(RealtimeTest, RealtimeFirstPassProducesFrames) { - ::libvpx_test::RandomVideoSource video; - video.SetSize(kVideoSourceWidth, kVideoSourceHeight); - video.set_limit(kFramesToEncode); - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - EXPECT_EQ(kFramesToEncode, frame_packets_); +TEST_P(RealtimeTest, RealtimeFirstPassProducesFrames) { TestEncode(); } + +TEST_P(RealtimeTest, RealtimeDefaultCpuUsed) { + set_cpu_used_ = false; + TestEncode(); } TEST_P(RealtimeTest, IntegerOverflow) { TestIntegerOverflow(2048, 2048); } diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index 63d8f44878..05ac9e1691 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -65,7 +65,11 @@ typedef struct vp9_extracfg { } vp9_extracfg; static struct vp9_extracfg default_extra_cfg = { - 0, // cpu_used +#if CONFIG_REALTIME_ONLY + 5, // cpu_used +#else + 0, // cpu_used +#endif 1, // enable_auto_alt_ref 0, // noise_sensitivity 0, // sharpness From 08b86d76224453ef9cbab4b10a48617715d9a14e Mon Sep 17 00:00:00 2001 From: James Zern Date: Thu, 16 Jun 2022 18:43:44 -0700 Subject: [PATCH 0408/1000] vp9_encode_sb_row: remove a branch w/CONFIG_REALTIME_ONLY replace the check on use_nonrd_pick_mode with an assert. this is only a start, there are many branches that could be removed that check mode == REALTIME, etc. with this configuration. Bug: webm:1773 Change-Id: I38cf9f83e7c085eb8e87d5cf6db7dc75359b611b --- vp9/encoder/vp9_encodeframe.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index 5f08fa6f60..a9f392bf51 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -5856,9 +5856,12 @@ void vp9_encode_sb_row(VP9_COMP *cpi, ThreadData *td, int tile_row, get_start_tok(cpi, tile_row, tile_col, mi_row, &tok); cpi->tplist[tile_row][tile_col][tile_sb_row].start = tok; +#if CONFIG_REALTIME_ONLY + assert(cpi->sf.use_nonrd_pick_mode); + encode_nonrd_sb_row(cpi, td, this_tile, mi_row, &tok); +#else if (cpi->sf.use_nonrd_pick_mode) encode_nonrd_sb_row(cpi, td, this_tile, mi_row, &tok); -#if !CONFIG_REALTIME_ONLY else encode_rd_sb_row(cpi, td, this_tile, mi_row, &tok); #endif From 03d4c6fed9ce058ff27cdf523275d301073f6651 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Fri, 17 Jun 2022 14:14:10 -0400 Subject: [PATCH 0409/1000] Update CHANGELOG and version info A stale codec control was removed, but compatibility was restored. New codec control was added. Bump *current* and *age*, and keep *revision* as 0. Bug: webm:1752 Bug: webm:1757 Change-Id: I76179f129a10c06d897b5c62462808ed9b9c2923 --- CHANGELOG | 29 +++++++++++++++++++++++++++++ libs.mk | 2 +- 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/CHANGELOG b/CHANGELOG index ea2fc9d81c..f061751ae3 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,3 +1,32 @@ +2022-06-17 v1.12.0 "Torrent Duck" + This release adds optimizations for Loongarch, adds support for vp8 in the + real-time rate control library, upgrades GoogleTest to v1.11.0, updates + libwebm to libwebm-1.0.0.28-20-g206d268, and includes numerous bug fixes. + + - Upgrading: + This release is ABI compatible with the previous release. + + vp8 support in the real-time rate control library. + New codec control VP8E_SET_RTC_EXTERNAL_RATECTRL is added. + + Configure support for darwin21 is added. + + GoogleTest is upgraded to v1.11.0. + + libwebm is updated to libwebm-1.0.0.28-20-g206d268. + + - Enhancement: + Numerous improvements on checking memory allocations. + Optimizations for Loongarch. + Code clean-up. + + - Bug fixes: + Fix to a crash related to {vp8/vp9}_set_roi_map. + Fix to compiling failure with -Wformat-nonliteral. + Fix to integer overflow with vp9 with high resolution content. + Fix to AddNoiseTest failure with ARMv7. + Fix to libvpx Null-dereference READ in vp8. + 2021-09-27 v1.11.0 "Smew Duck" This maintenance release adds support for VBR mode in VP9 rate control interface, new codec controls to get quantization parameters and loop filter diff --git a/libs.mk b/libs.mk index b59bb45e1e..00e49a19d7 100644 --- a/libs.mk +++ b/libs.mk @@ -313,7 +313,7 @@ $(BUILD_PFX)libvpx_g.a: $(LIBVPX_OBJS) # SO_VERSION_* then follow the rules in the link to detemine the new version # (c1, a1, r1) and set MAJOR to [c1-a1], MINOR to a1 and PATCH to r1 SO_VERSION_MAJOR := 7 -SO_VERSION_MINOR := 0 +SO_VERSION_MINOR := 1 SO_VERSION_PATCH := 0 ifeq ($(filter darwin%,$(TGT_OS)),$(TGT_OS)) LIBVPX_SO := libvpx.$(SO_VERSION_MAJOR).dylib From 5df4da402675f2d829acfb006d7cd27dd387d776 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Tue, 21 Jun 2022 14:54:16 -0400 Subject: [PATCH 0410/1000] Update CHANGELOG for L2E Bug: webm:1752 Change-Id: I5335e0360501503d5c162be4bbdef3ad73151e9f --- CHANGELOG | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG b/CHANGELOG index f061751ae3..cd4e8ba43a 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -15,6 +15,9 @@ libwebm is updated to libwebm-1.0.0.28-20-g206d268. + Allow SimpleEncode environment to take target level as input to match + the level conformance in vp9. + - Enhancement: Numerous improvements on checking memory allocations. Optimizations for Loongarch. From 158468202510a5d70ec1f0e9f4231c9ccacda8cd Mon Sep 17 00:00:00 2001 From: James Zern Date: Thu, 16 Jun 2022 18:22:52 -0700 Subject: [PATCH 0411/1000] vp9_cx_iface: set default cpu_used=5 w/CONFIG_REALTIME_ONLY this avoids a crash if cpu-used is not explicitly set as there are some (unnecessary) checks against use_nonrd_pick_mode which would cause encoding to be skipped if the old default of 0 were used Bug: webm:1773 Change-Id: I62fba5fb51d8afa422689b7de3f03e8f7570e50b Fixed: webm:1773 (cherry picked from commit 95d196fdf45edf57015cc18635c52b30a93522fd) --- test/realtime_test.cc | 24 +++++++++++++++++------- vp9/vp9_cx_iface.c | 6 +++++- 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/test/realtime_test.cc b/test/realtime_test.cc index 853b942824..c5de2dcb35 100644 --- a/test/realtime_test.cc +++ b/test/realtime_test.cc @@ -35,17 +35,19 @@ class RealtimeTest } void BeginPassHook(unsigned int /*pass*/) override { +#if !CONFIG_REALTIME_ONLY // TODO(tomfinegan): We're changing the pass value here to make sure // we get frames when real time mode is combined with |g_pass| set to // VPX_RC_FIRST_PASS. This is necessary because EncoderTest::RunLoop() sets // the pass value based on the mode passed into EncoderTest::SetMode(), // which overrides the one specified in SetUp() above. cfg_.g_pass = VPX_RC_FIRST_PASS; +#endif } void PreEncodeFrameHook(::libvpx_test::VideoSource *video, ::libvpx_test::Encoder *encoder) override { - if (video->frame() == 0) { + if (video->frame() == 0 && set_cpu_used_) { encoder->Control(VP8E_SET_CPUUSED, 8); } } @@ -70,15 +72,23 @@ class RealtimeTest ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); } + void TestEncode() { + ::libvpx_test::RandomVideoSource video; + video.SetSize(kVideoSourceWidth, kVideoSourceHeight); + video.set_limit(kFramesToEncode); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + EXPECT_EQ(kFramesToEncode, frame_packets_); + } + int frame_packets_; + bool set_cpu_used_ = true; }; -TEST_P(RealtimeTest, RealtimeFirstPassProducesFrames) { - ::libvpx_test::RandomVideoSource video; - video.SetSize(kVideoSourceWidth, kVideoSourceHeight); - video.set_limit(kFramesToEncode); - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - EXPECT_EQ(kFramesToEncode, frame_packets_); +TEST_P(RealtimeTest, RealtimeFirstPassProducesFrames) { TestEncode(); } + +TEST_P(RealtimeTest, RealtimeDefaultCpuUsed) { + set_cpu_used_ = false; + TestEncode(); } TEST_P(RealtimeTest, IntegerOverflow) { TestIntegerOverflow(2048, 2048); } diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index 63d8f44878..05ac9e1691 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -65,7 +65,11 @@ typedef struct vp9_extracfg { } vp9_extracfg; static struct vp9_extracfg default_extra_cfg = { - 0, // cpu_used +#if CONFIG_REALTIME_ONLY + 5, // cpu_used +#else + 0, // cpu_used +#endif 1, // enable_auto_alt_ref 0, // noise_sensitivity 0, // sharpness From 10178e6161c9126a6178eadad122309f8372fb0d Mon Sep 17 00:00:00 2001 From: James Zern Date: Thu, 16 Jun 2022 18:43:44 -0700 Subject: [PATCH 0412/1000] vp9_encode_sb_row: remove a branch w/CONFIG_REALTIME_ONLY replace the check on use_nonrd_pick_mode with an assert. this is only a start, there are many branches that could be removed that check mode == REALTIME, etc. with this configuration. Bug: webm:1773 Change-Id: I38cf9f83e7c085eb8e87d5cf6db7dc75359b611b (cherry picked from commit 08b86d76224453ef9cbab4b10a48617715d9a14e) --- vp9/encoder/vp9_encodeframe.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index 5f08fa6f60..a9f392bf51 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -5856,9 +5856,12 @@ void vp9_encode_sb_row(VP9_COMP *cpi, ThreadData *td, int tile_row, get_start_tok(cpi, tile_row, tile_col, mi_row, &tok); cpi->tplist[tile_row][tile_col][tile_sb_row].start = tok; +#if CONFIG_REALTIME_ONLY + assert(cpi->sf.use_nonrd_pick_mode); + encode_nonrd_sb_row(cpi, td, this_tile, mi_row, &tok); +#else if (cpi->sf.use_nonrd_pick_mode) encode_nonrd_sb_row(cpi, td, this_tile, mi_row, &tok); -#if !CONFIG_REALTIME_ONLY else encode_rd_sb_row(cpi, td, this_tile, mi_row, &tok); #endif From ec58d55c3af91f9db2511fb872bdc19868cbed92 Mon Sep 17 00:00:00 2001 From: Cheng Chen Date: Fri, 24 Jun 2022 11:57:42 -0700 Subject: [PATCH 0413/1000] L2E: Distinguish fixed and active gf_interval min/max_gf_interval is fixed and can be passed from the command line. It must satisfy the level constraints. active_min/max_gf_interval might be changing based on min/max_gf_interval. It is determined per GOP. Change-Id: If456c691c97a8b4c946859c05cedd39ca7defa9c --- test/vp9_ext_ratectrl_test.cc | 9 +++++---- vp9/encoder/vp9_firstpass.c | 6 ++++-- vpx/vpx_ext_ratectrl.h | 16 +++++++++++++++- 3 files changed, 24 insertions(+), 7 deletions(-) diff --git a/test/vp9_ext_ratectrl_test.cc b/test/vp9_ext_ratectrl_test.cc index 66d4233766..68703b7e94 100644 --- a/test/vp9_ext_ratectrl_test.cc +++ b/test/vp9_ext_ratectrl_test.cc @@ -33,8 +33,7 @@ constexpr int kFixedGOPSize = 9; constexpr int kMaxLagInFrames = 25; constexpr int kDefaultMinGfInterval = 4; constexpr int kDefaultMaxGfInterval = 16; -// The two pass rate control does not respect the input -// min_gf_interval and max_gf_interval. +// The active gf interval might change for each GOP // See function "get_active_gf_inverval_range". // The numbers below are from manual inspection. constexpr int kReadMinGfInterval = 5; @@ -267,8 +266,10 @@ vpx_rc_status_t rc_get_gop_decision(vpx_rc_model_t rate_ctrl_model, ToyRateCtrl *toy_rate_ctrl = static_cast(rate_ctrl_model); EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); EXPECT_EQ(gop_info->lag_in_frames, kMaxLagInFrames); - EXPECT_EQ(gop_info->min_gf_interval, kReadMinGfInterval); - EXPECT_EQ(gop_info->max_gf_interval, kReadMaxGfInterval); + EXPECT_EQ(gop_info->min_gf_interval, kDefaultMinGfInterval); + EXPECT_EQ(gop_info->max_gf_interval, kDefaultMaxGfInterval); + EXPECT_EQ(gop_info->active_min_gf_interval, kReadMinGfInterval); + EXPECT_EQ(gop_info->active_max_gf_interval, kReadMaxGfInterval); EXPECT_EQ(gop_info->allow_alt_ref, 1); if (gop_info->is_key_frame) { EXPECT_EQ(gop_info->last_gop_use_alt_ref, 0); diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index e121ac80e1..4682cc0030 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -2762,8 +2762,10 @@ static void define_gf_group(VP9_COMP *cpi, int gf_start_show_idx) { vpx_codec_err_t codec_status; vpx_rc_gop_decision_t gop_decision; vpx_rc_gop_info_t gop_info; - gop_info.min_gf_interval = active_gf_interval.min; - gop_info.max_gf_interval = active_gf_interval.max; + gop_info.min_gf_interval = rc->min_gf_interval; + gop_info.max_gf_interval = rc->max_gf_interval; + gop_info.active_min_gf_interval = active_gf_interval.min; + gop_info.active_max_gf_interval = active_gf_interval.max; gop_info.allow_alt_ref = allow_alt_ref; gop_info.is_key_frame = is_key_frame; gop_info.last_gop_use_alt_ref = rc->source_alt_ref_active; diff --git a/vpx/vpx_ext_ratectrl.h b/vpx/vpx_ext_ratectrl.h index b57148c69b..c3309b0f26 100644 --- a/vpx/vpx_ext_ratectrl.h +++ b/vpx/vpx_ext_ratectrl.h @@ -25,7 +25,7 @@ extern "C" { * types, removing or reassigning enums, adding/removing/rearranging * fields to structures. */ -#define VPX_EXT_RATECTRL_ABI_VERSION (4) +#define VPX_EXT_RATECTRL_ABI_VERSION (5) /*!\brief The control type of the inference API. * In VPX_RC_QP mode, the external rate control model determines the @@ -287,12 +287,26 @@ typedef struct vpx_rc_config { typedef struct vpx_rc_gop_info { /*! * Minimum allowed gf interval, fixed for the whole clip. + * Note that it will be modified to match vp9's level constraints + * in the encoder. + * The level constraint is defined in vp9_encoder.c: + * const Vp9LevelSpec vp9_level_defs[VP9_LEVELS]. */ int min_gf_interval; /*! * Maximum allowed gf interval, fixed for the whole clip. */ int max_gf_interval; + /*! + * Minimum allowed gf interval for the current GOP, determined + * by the encoder. + */ + int active_min_gf_interval; + /*! + * Maximum allowed gf interval for the current GOP, determined + * by the encoder. + */ + int active_max_gf_interval; /*! * Whether to allow the use of alt ref, can be changed per gop. */ From 03265cd42b3783532de72f2ded5436652e6f5ce3 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Tue, 28 Jun 2022 14:38:49 -0400 Subject: [PATCH 0414/1000] Replace date with version and release from README CHANGELOG has the date. Bug: webm:1752 Change-Id: I2888ce2afed8619f043eee1e9ca23bdf9d75e607 --- README | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README b/README index a083ebf90e..477a145ba3 100644 --- a/README +++ b/README @@ -1,4 +1,4 @@ -README - 08 March 2021 +v1.12.0 Torrent Duck Welcome to the WebM VP8/VP9 Codec SDK! From b355ab504667c352d96ab70bcb92165b8fc32813 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Tue, 28 Jun 2022 10:24:57 -0400 Subject: [PATCH 0415/1000] Add vp8_ prefix for quantize_lsx.c Duplicate name as vpx_dsp/loongarch/quantize_lsx.c Chromium update script fails. Bug: webm:1755 Change-Id: Ifb956c2292d909496eb2b9e1833993f1b021b07e --- vp8/encoder/loongarch/{quantize_lsx.c => vp8_quantize_lsx.c} | 0 vp8/vp8cx.mk | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) rename vp8/encoder/loongarch/{quantize_lsx.c => vp8_quantize_lsx.c} (100%) diff --git a/vp8/encoder/loongarch/quantize_lsx.c b/vp8/encoder/loongarch/vp8_quantize_lsx.c similarity index 100% rename from vp8/encoder/loongarch/quantize_lsx.c rename to vp8/encoder/loongarch/vp8_quantize_lsx.c diff --git a/vp8/vp8cx.mk b/vp8/vp8cx.mk index 5744cbabcc..b4b3fda9ea 100644 --- a/vp8/vp8cx.mk +++ b/vp8/vp8cx.mk @@ -125,8 +125,8 @@ VP8_CX_SRCS_REMOVE-$(HAVE_MSA) += encoder/mips/msa/temporal_filter_msa.c endif # common (loongarch LSX intrinsics) -VP8_CX_SRCS-$(HAVE_LSX) += encoder/loongarch/quantize_lsx.c VP8_CX_SRCS-$(HAVE_LSX) += encoder/loongarch/dct_lsx.c VP8_CX_SRCS-$(HAVE_LSX) += encoder/loongarch/encodeopt_lsx.c +VP8_CX_SRCS-$(HAVE_LSX) += encoder/loongarch/vp8_quantize_lsx.c VP8_CX_SRCS-yes := $(filter-out $(VP8_CX_SRCS_REMOVE-yes),$(VP8_CX_SRCS-yes)) From 896b59f44d63a789c9e34c394e9380323e538692 Mon Sep 17 00:00:00 2001 From: Marco Paniconi Date: Tue, 28 Jun 2022 11:22:48 -0700 Subject: [PATCH 0416/1000] rtc-svc: Fix to make SVC work for Profile 1 Added datarate unittest for 4:4:4 and 4:2:2 input, for spatial and temporal layers. Fix is needed in vp9_set_literal_size(): the sampling_x/y should be passed into update_inital_width(), othewise sampling_x/y = 1/1 (4:2:0) was forced. vp9_set_literal_size() is only called by the svc and on dynamic resize. Fix issue with the normative optimized scaler: UV width/height was assumed to be 1/2 of Y, for the ssse and neon code. Also fix to assert for the scaled width/height: in case scaled width/height is odd it should be incremented by 1 (make it even). Change-Id: I3a2e40effa53c505f44ef05aaa3132e1b7f57dd5 --- test/svc_datarate_test.cc | 81 +++++++++++++++++++-- vp9/encoder/arm/neon/vp9_frame_scale_neon.c | 4 +- vp9/encoder/vp9_encoder.c | 6 +- vp9/encoder/x86/vp9_frame_scale_ssse3.c | 4 +- 4 files changed, 82 insertions(+), 13 deletions(-) diff --git a/test/svc_datarate_test.cc b/test/svc_datarate_test.cc index 291cb01280..f3f76a0d33 100644 --- a/test/svc_datarate_test.cc +++ b/test/svc_datarate_test.cc @@ -548,13 +548,16 @@ class DatarateOnePassCbrSvc : public OnePassCbrSvc { } if (!single_layer_resize_) { - ASSERT_EQ(pkt->data.frame.width[sl], - top_sl_width_ * svc_params_.scaling_factor_num[sl] / - svc_params_.scaling_factor_den[sl]); - - ASSERT_EQ(pkt->data.frame.height[sl], - top_sl_height_ * svc_params_.scaling_factor_num[sl] / - svc_params_.scaling_factor_den[sl]); + unsigned int scaled_width = top_sl_width_ * + svc_params_.scaling_factor_num[sl] / + svc_params_.scaling_factor_den[sl]; + if (scaled_width % 2 != 0) scaled_width += 1; + ASSERT_EQ(pkt->data.frame.width[sl], scaled_width); + unsigned int scaled_height = top_sl_height_ * + svc_params_.scaling_factor_num[sl] / + svc_params_.scaling_factor_den[sl]; + if (scaled_height % 2 != 0) scaled_height += 1; + ASSERT_EQ(pkt->data.frame.height[sl], scaled_height); } else if (superframe_count_ > 0) { if (pkt->data.frame.width[sl] < prev_frame_width[sl] && pkt->data.frame.height[sl] < prev_frame_height[sl]) @@ -678,6 +681,70 @@ class DatarateOnePassCbrSvcSingleBR } }; +// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and 3 +// temporal layers, for 4:4:4 Profile 1. +TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc3SL3TL444Profile1) { + SetSvcConfig(3, 3); + ::libvpx_test::Y4mVideoSource video("rush_hour_444.y4m", 0, 140); + cfg_.g_profile = 1; + cfg_.g_bit_depth = VPX_BITS_8; + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 1; + cfg_.rc_dropframe_thresh = 0; + cfg_.kf_max_dist = 9999; + + top_sl_width_ = 352; + top_sl_height_ = 288; + cfg_.rc_target_bitrate = 500; + ResetModel(); + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.78, + 1.15); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and 3 +// temporal layers, for 4:2:2 Profile 1. +TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc2SL3TL422Profile1) { + SetSvcConfig(2, 3); + ::libvpx_test::Y4mVideoSource video("park_joy_90p_8_422.y4m", 0, 20); + cfg_.g_profile = 1; + cfg_.g_bit_depth = VPX_BITS_8; + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 1; + cfg_.rc_dropframe_thresh = 0; + cfg_.kf_max_dist = 9999; + + top_sl_width_ = 160; + top_sl_height_ = 90; + cfg_.rc_target_bitrate = 500; + ResetModel(); + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + // Use large under/over shoot thresholds as this is a very short clip, + // so not good for testing rate-targeting. + CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.5, + 1.7); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + // Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and 1 // temporal layer, with screen content mode on and same speed setting for all // layers. diff --git a/vp9/encoder/arm/neon/vp9_frame_scale_neon.c b/vp9/encoder/arm/neon/vp9_frame_scale_neon.c index e46f789bac..69b8cfffd7 100644 --- a/vp9/encoder/arm/neon/vp9_frame_scale_neon.c +++ b/vp9/encoder/arm/neon/vp9_frame_scale_neon.c @@ -710,8 +710,8 @@ void vp9_scale_and_extend_frame_neon(const YV12_BUFFER_CONFIG *src, const int src_h = src->y_crop_height; const int dst_w = dst->y_crop_width; const int dst_h = dst->y_crop_height; - const int dst_uv_w = dst_w / 2; - const int dst_uv_h = dst_h / 2; + const int dst_uv_w = dst->uv_crop_width; + const int dst_uv_h = dst->uv_crop_height; int scaled = 0; // phase_scaler is usually 0 or 8. diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 85bd706629..371779e772 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -8155,9 +8155,11 @@ int vp9_set_size_literal(VP9_COMP *cpi, unsigned int width, unsigned int height) { VP9_COMMON *cm = &cpi->common; #if CONFIG_VP9_HIGHBITDEPTH - update_initial_width(cpi, cm->use_highbitdepth, 1, 1); + update_initial_width(cpi, cm->use_highbitdepth, cpi->common.subsampling_x, + cpi->common.subsampling_y); #else - update_initial_width(cpi, 0, 1, 1); + update_initial_width(cpi, 0, cpi->common.subsampling_x, + cpi->common.subsampling_y); #endif // CONFIG_VP9_HIGHBITDEPTH #if CONFIG_VP9_TEMPORAL_DENOISING diff --git a/vp9/encoder/x86/vp9_frame_scale_ssse3.c b/vp9/encoder/x86/vp9_frame_scale_ssse3.c index 7685e7bc3e..bf0e8b121f 100644 --- a/vp9/encoder/x86/vp9_frame_scale_ssse3.c +++ b/vp9/encoder/x86/vp9_frame_scale_ssse3.c @@ -754,8 +754,8 @@ void vp9_scale_and_extend_frame_ssse3(const YV12_BUFFER_CONFIG *src, const int src_h = src->y_crop_height; const int dst_w = dst->y_crop_width; const int dst_h = dst->y_crop_height; - const int dst_uv_w = dst_w / 2; - const int dst_uv_h = dst_h / 2; + const int dst_uv_w = dst->uv_crop_width; + const int dst_uv_h = dst->uv_crop_height; int scaled = 0; // phase_scaler is usually 0 or 8. From 711bef67400f096416cb1ba7f6560e533871490f Mon Sep 17 00:00:00 2001 From: Marco Paniconi Date: Wed, 29 Jun 2022 10:35:36 -0700 Subject: [PATCH 0417/1000] rtc: Add svc test for profile 2 10/12 bit Add TODO to fix the superframe parser for 10/12 bit. Change-Id: Ib76c4daa0ff2f516510829ead6a397c89abba2f3 --- test/svc_datarate_test.cc | 82 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) diff --git a/test/svc_datarate_test.cc b/test/svc_datarate_test.cc index f3f76a0d33..51e90e776c 100644 --- a/test/svc_datarate_test.cc +++ b/test/svc_datarate_test.cc @@ -745,6 +745,88 @@ TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc2SL3TL422Profile1) { #endif } +#if CONFIG_VP9_HIGHBITDEPTH +// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and 3 +// temporal layers, for Profle 2 10bit. +TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc3SL3TL10bitProfile2) { + SetSvcConfig(3, 3); + ::libvpx_test::Y4mVideoSource video("park_joy_90p_10_420_20f.y4m", 0, 20); + cfg_.g_profile = 2; + cfg_.g_bit_depth = VPX_BITS_10; + cfg_.g_input_bit_depth = VPX_BITS_10; + if (cfg_.g_bit_depth > 8) init_flags_ |= VPX_CODEC_USE_HIGHBITDEPTH; + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 1; + cfg_.rc_dropframe_thresh = 0; + cfg_.kf_max_dist = 9999; + + top_sl_width_ = 160; + top_sl_height_ = 90; + cfg_.rc_target_bitrate = 500; + ResetModel(); + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + // TODO(marpan/jianj): Comment out the rate-target checking for now + // as superframe parsing to get frame size needs to be fixed for + // high bitdepth. + /* + // Use large under/over shoot thresholds as this is a very short clip, + // so not good for testing rate-targeting. + CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.5, + 1.7); + */ +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and 3 +// temporal layers, for Profle 2 12bit. +TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc3SL3TL12bitProfile2) { + SetSvcConfig(3, 3); + ::libvpx_test::Y4mVideoSource video("park_joy_90p_12_420_20f.y4m", 0, 20); + cfg_.g_profile = 2; + cfg_.g_bit_depth = VPX_BITS_12; + cfg_.g_input_bit_depth = VPX_BITS_12; + if (cfg_.g_bit_depth > 8) init_flags_ |= VPX_CODEC_USE_HIGHBITDEPTH; + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 1; + cfg_.rc_dropframe_thresh = 0; + cfg_.kf_max_dist = 9999; + + top_sl_width_ = 160; + top_sl_height_ = 90; + cfg_.rc_target_bitrate = 500; + ResetModel(); + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + // TODO(marpan/jianj): Comment out the rate-target checking for now + // as superframe parsing to get frame size needs to be fixed for + // high bitdepth. + /* + // Use large under/over shoot thresholds as this is a very short clip, + // so not good for testing rate-targeting. + CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.5, + 1.7); + */ +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} +#endif + // Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and 1 // temporal layer, with screen content mode on and same speed setting for all // layers. From dbac8e01e05ad3d1b47887b1ac864339115aa721 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Thu, 30 Jun 2022 10:53:15 -0400 Subject: [PATCH 0418/1000] ABI compatibility to CHANGELOG for prev releases. Bug: webm:1757 Change-Id: I19576aa0bc065045dcb0eaf770ae5b0d9ac9d684 --- CHANGELOG | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/CHANGELOG b/CHANGELOG index cd4e8ba43a..4f5dcbd44e 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -36,6 +36,7 @@ levels, and includes several improvements to NEON and numerous bug fixes. - Upgrading: + This release is ABI incompatible with the previous release. New codec control is added to get quantization parameters and loop filter levels. @@ -61,6 +62,7 @@ well as numerous bug fixes. - Upgrading: + This release is ABI incompatible with the previous release. New codec control is added to disable loopfilter for VP9. New encoder control is added to disable feature to increase Q on overshoot @@ -91,6 +93,7 @@ well as incremental improvements. - Upgrading: + This release is ABI compatible with the previous release. NV12 support is added to this release. A new interface is added for VP9 rate control. The new library libvp9rc.a must be linked by applications. @@ -114,12 +117,14 @@ This release collects incremental improvements to many aspects of the library. - Upgrading: + This release is ABI compatible with the previous release. ARCH_* defines have been removed in favor of VPX_ARCH_*. 2019-07-15 v1.8.1 "Orpington Duck" This release collects incremental improvements to many aspects of the library. - Upgrading: + This release is ABI incompatible with the previous release. VP8E_SET_CPUUSED now accepts values up to 9 for vp9. VPX_CTRL_VP9E_SET_MAX_INTER_BITRATE_PCT had a spelling fix (was VP8E). The --sdk-path option has been removed. If you were using it to build for @@ -138,7 +143,8 @@ This release focused on encoding performance for realtime and VOD use cases. - Upgrading: - This adds and improves several vp9 controls. Most are related to SVC: + This release is ABI incompatible with the previous release. This adds and + improves several vp9 controls. Most are related to SVC: VP9E_SET_SVC_FRAME_DROP_LAYER: - Frame dropping in SVC. VP9E_SET_SVC_INTER_LAYER_PRED: From 5b530fc962bcb8a51bbf03f5fbc2912f21b86e70 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Thu, 30 Jun 2022 13:48:56 -0400 Subject: [PATCH 0419/1000] Fix bug with smaller width bigger size Bug: webm:1642 Change-Id: I831b7701495eebeeff6bdc0b570f737bb6d536c6 --- test/resize_test.cc | 11 +++-------- vp9/common/vp9_alloccommon.c | 15 ++++++--------- vp9/encoder/vp9_encoder.c | 27 +++++++++++++++++++++++++-- 3 files changed, 34 insertions(+), 19 deletions(-) diff --git a/test/resize_test.cc b/test/resize_test.cc index 212ff46975..1e5e166f7c 100644 --- a/test/resize_test.cc +++ b/test/resize_test.cc @@ -101,11 +101,8 @@ void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w, *h = initial_h; return; } - if (frame < 100) { - *w = initial_w * 7 / 10; - *h = initial_h * 16 / 10; - return; - } + *w = initial_w * 7 / 10; + *h = initial_h * 16 / 10; return; } if (frame < 10) { @@ -578,9 +575,7 @@ TEST_P(ResizeRealtimeTest, TestExternalResizeWorks) { } } -// TODO(https://crbug.com/webm/1642): This causes a segfault in -// init_encode_frame_mb_context(). -TEST_P(ResizeRealtimeTest, DISABLED_TestExternalResizeSmallerWidthBiggerSize) { +TEST_P(ResizeRealtimeTest, TestExternalResizeSmallerWidthBiggerSize) { ResizingVideoSource video; video.flag_codec_ = true; video.smaller_width_larger_size_ = true; diff --git a/vp9/common/vp9_alloccommon.c b/vp9/common/vp9_alloccommon.c index faad657a08..c27fe6477b 100644 --- a/vp9/common/vp9_alloccommon.c +++ b/vp9/common/vp9_alloccommon.c @@ -132,15 +132,6 @@ int vp9_alloc_context_buffers(VP9_COMMON *cm, int width, int height) { if (cm->mi_alloc_size < new_mi_size) { cm->free_mi(cm); if (cm->alloc_mi(cm, new_mi_size)) goto fail; - } - - if (cm->seg_map_alloc_size < cm->mi_rows * cm->mi_cols) { - // Create the segmentation map structure and set to 0. - free_seg_map(cm); - if (alloc_seg_map(cm, cm->mi_rows * cm->mi_cols)) goto fail; - } - - if (cm->above_context_alloc_cols < cm->mi_cols) { vpx_free(cm->above_context); cm->above_context = (ENTROPY_CONTEXT *)vpx_calloc( 2 * mi_cols_aligned_to_sb(cm->mi_cols) * MAX_MB_PLANE, @@ -154,6 +145,12 @@ int vp9_alloc_context_buffers(VP9_COMMON *cm, int width, int height) { cm->above_context_alloc_cols = cm->mi_cols; } + if (cm->seg_map_alloc_size < cm->mi_rows * cm->mi_cols) { + // Create the segmentation map structure and set to 0. + free_seg_map(cm); + if (alloc_seg_map(cm, cm->mi_rows * cm->mi_cols)) goto fail; + } + if (vp9_alloc_loop_filter(cm)) goto fail; return 0; diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 371779e772..d58e8a3123 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -1972,6 +1972,17 @@ static void alloc_copy_partition_data(VP9_COMP *cpi) { } } +static void free_copy_partition_data(VP9_COMP *cpi) { + vpx_free(cpi->prev_partition); + cpi->prev_partition = NULL; + vpx_free(cpi->prev_segment_id); + cpi->prev_segment_id = NULL; + vpx_free(cpi->prev_variance_low); + cpi->prev_variance_low = NULL; + vpx_free(cpi->copied_frame_cnt); + cpi->copied_frame_cnt = NULL; +} + void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { VP9_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; @@ -2051,6 +2062,8 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { new_mi_size = cm->mi_stride * calc_mi_size(cm->mi_rows); if (cm->mi_alloc_size < new_mi_size) { vp9_free_context_buffers(cm); + vp9_free_pc_tree(&cpi->td); + vpx_free(cpi->mbmi_ext_base); alloc_compressor_data(cpi); realloc_segmentation_maps(cpi); cpi->initial_width = cpi->initial_height = 0; @@ -2069,8 +2082,18 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { update_frame_size(cpi); if (last_w != cpi->oxcf.width || last_h != cpi->oxcf.height) { - memset(cpi->consec_zero_mv, 0, - cm->mi_rows * cm->mi_cols * sizeof(*cpi->consec_zero_mv)); + vpx_free(cpi->consec_zero_mv); + CHECK_MEM_ERROR( + cm, cpi->consec_zero_mv, + vpx_calloc(cm->mi_rows * cm->mi_cols, sizeof(*cpi->consec_zero_mv))); + + vpx_free(cpi->skin_map); + CHECK_MEM_ERROR( + cm, cpi->skin_map, + vpx_calloc(cm->mi_rows * cm->mi_cols, sizeof(cpi->skin_map[0]))); + + free_copy_partition_data(cpi); + alloc_copy_partition_data(cpi); if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) vp9_cyclic_refresh_reset_resize(cpi); rc->rc_1_frame = 0; From 933b6b90a583b593efd8acb644603ab189226309 Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 6 Jul 2022 15:06:51 -0700 Subject: [PATCH 0420/1000] Revert "Fix bug with smaller width bigger size" This reverts commit 5b530fc962bcb8a51bbf03f5fbc2912f21b86e70. This fixes memory related fuzzer failures in the decoder. Bug: webm:1642 Bug: oss-fuzz:48609 Bug: oss-fuzz:48629 Bug: oss-fuzz:48632 Bug: oss-fuzz:48638 Bug: oss-fuzz:48639 Bug: oss-fuzz:48651 Bug: oss-fuzz:48657 Bug: oss-fuzz:48659 Bug: oss-fuzz:48660 Bug: oss-fuzz:48661 Bug: oss-fuzz:48680 Bug: oss-fuzz:48686 Bug: oss-fuzz:48697 Bug: oss-fuzz:48706 Bug: oss-fuzz:48712 Bug: oss-fuzz:48717 Bug: oss-fuzz:48728 Bug: oss-fuzz:48732 Bug: oss-fuzz:48780 Bug: oss-fuzz:48781 Bug: oss-fuzz:48782 Bug: oss-fuzz:48785 Change-Id: I67a8539a3083f00eec1164fef5c6a8bc209f91fc --- test/resize_test.cc | 11 ++++++++--- vp9/common/vp9_alloccommon.c | 15 +++++++++------ vp9/encoder/vp9_encoder.c | 27 ++------------------------- 3 files changed, 19 insertions(+), 34 deletions(-) diff --git a/test/resize_test.cc b/test/resize_test.cc index 1e5e166f7c..212ff46975 100644 --- a/test/resize_test.cc +++ b/test/resize_test.cc @@ -101,8 +101,11 @@ void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w, *h = initial_h; return; } - *w = initial_w * 7 / 10; - *h = initial_h * 16 / 10; + if (frame < 100) { + *w = initial_w * 7 / 10; + *h = initial_h * 16 / 10; + return; + } return; } if (frame < 10) { @@ -575,7 +578,9 @@ TEST_P(ResizeRealtimeTest, TestExternalResizeWorks) { } } -TEST_P(ResizeRealtimeTest, TestExternalResizeSmallerWidthBiggerSize) { +// TODO(https://crbug.com/webm/1642): This causes a segfault in +// init_encode_frame_mb_context(). +TEST_P(ResizeRealtimeTest, DISABLED_TestExternalResizeSmallerWidthBiggerSize) { ResizingVideoSource video; video.flag_codec_ = true; video.smaller_width_larger_size_ = true; diff --git a/vp9/common/vp9_alloccommon.c b/vp9/common/vp9_alloccommon.c index c27fe6477b..faad657a08 100644 --- a/vp9/common/vp9_alloccommon.c +++ b/vp9/common/vp9_alloccommon.c @@ -132,6 +132,15 @@ int vp9_alloc_context_buffers(VP9_COMMON *cm, int width, int height) { if (cm->mi_alloc_size < new_mi_size) { cm->free_mi(cm); if (cm->alloc_mi(cm, new_mi_size)) goto fail; + } + + if (cm->seg_map_alloc_size < cm->mi_rows * cm->mi_cols) { + // Create the segmentation map structure and set to 0. + free_seg_map(cm); + if (alloc_seg_map(cm, cm->mi_rows * cm->mi_cols)) goto fail; + } + + if (cm->above_context_alloc_cols < cm->mi_cols) { vpx_free(cm->above_context); cm->above_context = (ENTROPY_CONTEXT *)vpx_calloc( 2 * mi_cols_aligned_to_sb(cm->mi_cols) * MAX_MB_PLANE, @@ -145,12 +154,6 @@ int vp9_alloc_context_buffers(VP9_COMMON *cm, int width, int height) { cm->above_context_alloc_cols = cm->mi_cols; } - if (cm->seg_map_alloc_size < cm->mi_rows * cm->mi_cols) { - // Create the segmentation map structure and set to 0. - free_seg_map(cm); - if (alloc_seg_map(cm, cm->mi_rows * cm->mi_cols)) goto fail; - } - if (vp9_alloc_loop_filter(cm)) goto fail; return 0; diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index d58e8a3123..371779e772 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -1972,17 +1972,6 @@ static void alloc_copy_partition_data(VP9_COMP *cpi) { } } -static void free_copy_partition_data(VP9_COMP *cpi) { - vpx_free(cpi->prev_partition); - cpi->prev_partition = NULL; - vpx_free(cpi->prev_segment_id); - cpi->prev_segment_id = NULL; - vpx_free(cpi->prev_variance_low); - cpi->prev_variance_low = NULL; - vpx_free(cpi->copied_frame_cnt); - cpi->copied_frame_cnt = NULL; -} - void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { VP9_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; @@ -2062,8 +2051,6 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { new_mi_size = cm->mi_stride * calc_mi_size(cm->mi_rows); if (cm->mi_alloc_size < new_mi_size) { vp9_free_context_buffers(cm); - vp9_free_pc_tree(&cpi->td); - vpx_free(cpi->mbmi_ext_base); alloc_compressor_data(cpi); realloc_segmentation_maps(cpi); cpi->initial_width = cpi->initial_height = 0; @@ -2082,18 +2069,8 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { update_frame_size(cpi); if (last_w != cpi->oxcf.width || last_h != cpi->oxcf.height) { - vpx_free(cpi->consec_zero_mv); - CHECK_MEM_ERROR( - cm, cpi->consec_zero_mv, - vpx_calloc(cm->mi_rows * cm->mi_cols, sizeof(*cpi->consec_zero_mv))); - - vpx_free(cpi->skin_map); - CHECK_MEM_ERROR( - cm, cpi->skin_map, - vpx_calloc(cm->mi_rows * cm->mi_cols, sizeof(cpi->skin_map[0]))); - - free_copy_partition_data(cpi); - alloc_copy_partition_data(cpi); + memset(cpi->consec_zero_mv, 0, + cm->mi_rows * cm->mi_cols * sizeof(*cpi->consec_zero_mv)); if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) vp9_cyclic_refresh_reset_resize(cpi); rc->rc_1_frame = 0; From ba56eafb5742e5c28b7a99b5442698e9a3a61683 Mon Sep 17 00:00:00 2001 From: Scott LaVarnway Date: Thu, 7 Jul 2022 06:34:11 -0700 Subject: [PATCH 0421/1000] VPX: Add quantize speed test for ref vs opt. Bug: b/237714063 Change-Id: I4304ba8d976fed3613e28442983b04a9cfc15b79 --- test/vp9_quantize_test.cc | 226 ++++++++++++++++++++++---------------- 1 file changed, 134 insertions(+), 92 deletions(-) diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc index ca1062a76f..b14a20cfcc 100644 --- a/test/vp9_quantize_test.cc +++ b/test/vp9_quantize_test.cc @@ -67,6 +67,45 @@ void QuantFPWrapper(const tran_low_t *coeff, intptr_t count, fn(coeff, count, round, quant, qcoeff, dqcoeff, dequant, eob, scan, iscan); } +void GenerateHelperArrays(ACMRandom *rnd, int16_t *zbin, int16_t *round, + int16_t *quant, int16_t *quant_shift, + int16_t *dequant, int16_t *round_fp, + int16_t *quant_fp) { + // Max when q == 0. Otherwise, it is 48 for Y and 42 for U/V. + constexpr int kMaxQRoundingFactorFp = 64; + + for (int j = 0; j < 2; j++) { + // The range is 4 to 1828 in the VP9 tables. + const int qlookup = rnd->RandRange(1825) + 4; + round_fp[j] = (kMaxQRoundingFactorFp * qlookup) >> 7; + quant_fp[j] = (1 << 16) / qlookup; + + // Values determined by deconstructing vp9_init_quantizer(). + // zbin may be up to 1143 for 8 and 10 bit Y values, or 1200 for 12 bit Y + // values or U/V values of any bit depth. This is because y_delta is not + // factored into the vp9_ac_quant() call. + zbin[j] = rnd->RandRange(1200); + + // round may be up to 685 for Y values or 914 for U/V. + round[j] = rnd->RandRange(914); + // quant ranges from 1 to -32703 + quant[j] = static_cast(rnd->RandRange(32704)) - 32703; + // quant_shift goes up to 1 << 16. + quant_shift[j] = rnd->RandRange(16384); + // dequant maxes out at 1828 for all cases. + dequant[j] = rnd->RandRange(1828); + } + for (int j = 2; j < 8; j++) { + zbin[j] = zbin[1]; + round_fp[j] = round_fp[1]; + quant_fp[j] = quant_fp[1]; + round[j] = round[1]; + quant[j] = quant[1]; + quant_shift[j] = quant_shift[1]; + dequant[j] = dequant[1]; + } +} + class VP9QuantizeBase : public AbstractBench { public: VP9QuantizeBase(vpx_bit_depth_t bit_depth, int max_size, bool is_fp) @@ -148,6 +187,7 @@ class VP9QuantizeTest : public VP9QuantizeBase, protected: virtual void Run(); + void Speed(bool is_median); const QuantizeFunc quantize_op_; const QuantizeFunc ref_quantize_op_; }; @@ -159,6 +199,98 @@ void VP9QuantizeTest::Run() { scan_->iscan); } +void VP9QuantizeTest::Speed(bool is_median) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + ASSERT_TRUE(coeff_.Init()); + ASSERT_TRUE(qcoeff_.Init()); + ASSERT_TRUE(dqcoeff_.Init()); + TX_SIZE starting_sz, ending_sz; + + if (max_size_ == 16) { + starting_sz = TX_4X4; + ending_sz = TX_16X16; + } else { + starting_sz = TX_32X32; + ending_sz = TX_32X32; + } + + for (TX_SIZE sz = starting_sz; sz <= ending_sz; ++sz) { + // zbin > coeff, zbin < coeff. + for (int i = 0; i < 2; ++i) { + // TX_TYPE defines the scan order. That is not relevant to the speed test. + // Pick the first one. + const TX_TYPE tx_type = DCT_DCT; + count_ = (4 << sz) * (4 << sz); + scan_ = &vp9_scan_orders[sz][tx_type]; + + GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_, + quant_shift_ptr_, dequant_ptr_, round_fp_ptr_, + quant_fp_ptr_); + + if (i == 0) { + // When |coeff values| are less than zbin the results are 0. + int threshold = 100; + if (max_size_ == 32) { + // For 32x32, the threshold is halved. Double it to keep the values + // from clearing it. + threshold = 200; + } + for (int j = 0; j < 8; ++j) zbin_ptr_[j] = threshold; + coeff_.Set(&rnd, -99, 99); + } else if (i == 1) { + for (int j = 0; j < 8; ++j) zbin_ptr_[j] = 50; + coeff_.Set(&rnd, -500, 500); + } + if (is_median) { + RunNTimes(10000000 / count_); + const char *type = + (i == 0) ? "Bypass calculations " : "Full calculations "; + char block_size[16]; + snprintf(block_size, sizeof(block_size), "%dx%d", 4 << sz, 4 << sz); + char title[100]; + snprintf(title, sizeof(title), "%25s %8s ", type, block_size); + PrintMedian(title); + } else { + Buffer ref_qcoeff = + Buffer(max_size_, max_size_, 0, 32); + ASSERT_TRUE(ref_qcoeff.Init()); + Buffer ref_dqcoeff = + Buffer(max_size_, max_size_, 0, 32); + ASSERT_TRUE(ref_dqcoeff.Init()); + uint16_t ref_eob = 0; + + const int kNumTests = 5000000; + vpx_usec_timer timer, simd_timer; + + vpx_usec_timer_start(&timer); + for (int n = 0; n < kNumTests; ++n) { + ref_quantize_op_(coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, + q_ptr_, quant_shift_ptr_, ref_qcoeff.TopLeftPixel(), + ref_dqcoeff.TopLeftPixel(), dequant_ptr_, &ref_eob, + scan_->scan, scan_->iscan); + } + vpx_usec_timer_mark(&timer); + + vpx_usec_timer_start(&simd_timer); + for (int n = 0; n < kNumTests; ++n) { + quantize_op_(coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_, + quant_shift_ptr_, qcoeff_.TopLeftPixel(), + dqcoeff_.TopLeftPixel(), dequant_ptr_, &eob_, + scan_->scan, scan_->iscan); + } + vpx_usec_timer_mark(&simd_timer); + + const int elapsed_time = + static_cast(vpx_usec_timer_elapsed(&timer)); + const int simd_elapsed_time = + static_cast(vpx_usec_timer_elapsed(&simd_timer)); + printf("c_time = %d \t simd_time = %d \t Gain = %f \n", elapsed_time, + simd_elapsed_time, ((float)elapsed_time / simd_elapsed_time)); + } + } + } +} + // This quantizer compares the AC coefficients to the quantization step size to // determine if further multiplication operations are needed. // Based on vp9_quantize_fp_sse2(). @@ -254,45 +386,6 @@ void quantize_fp_32x32_nz_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, 1); } -void GenerateHelperArrays(ACMRandom *rnd, int16_t *zbin, int16_t *round, - int16_t *quant, int16_t *quant_shift, - int16_t *dequant, int16_t *round_fp, - int16_t *quant_fp) { - // Max when q == 0. Otherwise, it is 48 for Y and 42 for U/V. - const int max_qrounding_factor_fp = 64; - - for (int j = 0; j < 2; j++) { - // The range is 4 to 1828 in the VP9 tables. - const int qlookup = rnd->RandRange(1825) + 4; - round_fp[j] = (max_qrounding_factor_fp * qlookup) >> 7; - quant_fp[j] = (1 << 16) / qlookup; - - // Values determined by deconstructing vp9_init_quantizer(). - // zbin may be up to 1143 for 8 and 10 bit Y values, or 1200 for 12 bit Y - // values or U/V values of any bit depth. This is because y_delta is not - // factored into the vp9_ac_quant() call. - zbin[j] = rnd->RandRange(1200); - - // round may be up to 685 for Y values or 914 for U/V. - round[j] = rnd->RandRange(914); - // quant ranges from 1 to -32703 - quant[j] = static_cast(rnd->RandRange(32704)) - 32703; - // quant_shift goes up to 1 << 16. - quant_shift[j] = rnd->RandRange(16384); - // dequant maxes out at 1828 for all cases. - dequant[j] = rnd->RandRange(1828); - } - for (int j = 2; j < 8; j++) { - zbin[j] = zbin[1]; - round_fp[j] = round_fp[1]; - quant_fp[j] = quant_fp[1]; - round[j] = round[1]; - quant[j] = quant[1]; - quant_shift[j] = quant_shift[1]; - dequant[j] = dequant[1]; - } -} - TEST_P(VP9QuantizeTest, OperationCheck) { ACMRandom rnd(ACMRandom::DeterministicSeed()); ASSERT_TRUE(coeff_.Init()); @@ -403,60 +496,9 @@ TEST_P(VP9QuantizeTest, EOBCheck) { } } -TEST_P(VP9QuantizeTest, DISABLED_Speed) { - ACMRandom rnd(ACMRandom::DeterministicSeed()); - ASSERT_TRUE(coeff_.Init()); - ASSERT_TRUE(qcoeff_.Init()); - ASSERT_TRUE(dqcoeff_.Init()); - TX_SIZE starting_sz, ending_sz; - - if (max_size_ == 16) { - starting_sz = TX_4X4; - ending_sz = TX_16X16; - } else { - starting_sz = TX_32X32; - ending_sz = TX_32X32; - } +TEST_P(VP9QuantizeTest, DISABLED_Speed) { Speed(false); } - for (TX_SIZE sz = starting_sz; sz <= ending_sz; ++sz) { - // zbin > coeff, zbin < coeff. - for (int i = 0; i < 2; ++i) { - // TX_TYPE defines the scan order. That is not relevant to the speed test. - // Pick the first one. - const TX_TYPE tx_type = DCT_DCT; - count_ = (4 << sz) * (4 << sz); - scan_ = &vp9_scan_orders[sz][tx_type]; - - GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_, - quant_shift_ptr_, dequant_ptr_, round_fp_ptr_, - quant_fp_ptr_); - - if (i == 0) { - // When |coeff values| are less than zbin the results are 0. - int threshold = 100; - if (max_size_ == 32) { - // For 32x32, the threshold is halved. Double it to keep the values - // from clearing it. - threshold = 200; - } - for (int j = 0; j < 8; ++j) zbin_ptr_[j] = threshold; - coeff_.Set(&rnd, -99, 99); - } else if (i == 1) { - for (int j = 0; j < 8; ++j) zbin_ptr_[j] = 50; - coeff_.Set(&rnd, -500, 500); - } - - RunNTimes(10000000 / count_); - const char *type = - (i == 0) ? "Bypass calculations " : "Full calculations "; - char block_size[16]; - snprintf(block_size, sizeof(block_size), "%dx%d", 4 << sz, 4 << sz); - char title[100]; - snprintf(title, sizeof(title), "%25s %8s ", type, block_size); - PrintMedian(title); - } - } -} +TEST_P(VP9QuantizeTest, DISABLED_SpeedMedian) { Speed(true); } using std::make_tuple; From 8f4d1890cbecbbec47c8baeba53e89a2b37ae3a8 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Wed, 6 Jul 2022 08:42:54 +0000 Subject: [PATCH 0422/1000] Revert "Revert "[NEON] Optimize vp9_diamond_search_sad() for NEON"" This reverts commit 9f1329f8ac88ea5d7c6ae5d6a57221c36cf85ac8 and fixes a dumb mistake in evaluation of vfcmv. Used vdupq_n_s16, instead of vdupq_n_s32. Change-Id: Ie236c878c166405c49bc0f93f6d63a6715534a0a --- vp9/common/vp9_rtcd_defs.pl | 2 +- .../arm/neon/vp9_diamond_search_sad_neon.c | 322 ++++++++++++++++++ vp9/vp9cx.mk | 1 + 3 files changed, 324 insertions(+), 1 deletion(-) create mode 100644 vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index 4da0b6675b..e6b65c96f0 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -175,7 +175,7 @@ () # Motion search # add_proto qw/int vp9_diamond_search_sad/, "const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv"; -specialize qw/vp9_diamond_search_sad avx/; +specialize qw/vp9_diamond_search_sad avx neon/; # # Apply temporal filter diff --git a/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c b/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c new file mode 100644 index 0000000000..e56733d43e --- /dev/null +++ b/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c @@ -0,0 +1,322 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "vpx_dsp/vpx_dsp_common.h" +#include "vp9/encoder/vp9_encoder.h" +#include "vpx_ports/mem.h" + +#ifdef __GNUC__ +#define LIKELY(v) __builtin_expect(v, 1) +#define UNLIKELY(v) __builtin_expect(v, 0) +#else +#define LIKELY(v) (v) +#define UNLIKELY(v) (v) +#endif + +static INLINE int_mv pack_int_mv(int16_t row, int16_t col) { + int_mv result; + result.as_mv.row = row; + result.as_mv.col = col; + return result; +} + +static INLINE MV_JOINT_TYPE get_mv_joint(const int_mv mv) { + // This is simplified from the C implementation to utilise that + // x->nmvjointsadcost[1] == x->nmvjointsadcost[2] and + // x->nmvjointsadcost[1] == x->nmvjointsadcost[3] + return mv.as_int == 0 ? 0 : 1; +} + +static INLINE int mv_cost(const int_mv mv, const int *joint_cost, + int *const comp_cost[2]) { + assert(mv.as_mv.row >= -MV_MAX && mv.as_mv.row < MV_MAX); + assert(mv.as_mv.col >= -MV_MAX && mv.as_mv.col < MV_MAX); + return joint_cost[get_mv_joint(mv)] + comp_cost[0][mv.as_mv.row] + + comp_cost[1][mv.as_mv.col]; +} + +static int mvsad_err_cost(const MACROBLOCK *x, const int_mv mv, const MV *ref, + int sad_per_bit) { + const int_mv diff = + pack_int_mv(mv.as_mv.row - ref->row, mv.as_mv.col - ref->col); + return ROUND_POWER_OF_TWO( + (unsigned)mv_cost(diff, x->nmvjointsadcost, x->nmvsadcost) * sad_per_bit, + VP9_PROB_COST_SHIFT); +} + +/***************************************************************************** + * This function utilizes 3 properties of the cost function lookup tables, * + * constructed in using 'cal_nmvjointsadcost' and 'cal_nmvsadcosts' in * + * vp9_encoder.c. * + * For the joint cost: * + * - mvjointsadcost[1] == mvjointsadcost[2] == mvjointsadcost[3] * + * For the component costs: * + * - For all i: mvsadcost[0][i] == mvsadcost[1][i] * + * (Equal costs for both components) * + * - For all i: mvsadcost[0][i] == mvsadcost[0][-i] * + * (Cost function is even) * + * If these do not hold, then this function cannot be used without * + * modification, in which case you can revert to using the C implementation, * + * which does not rely on these properties. * + *****************************************************************************/ +int vp9_diamond_search_sad_neon(const MACROBLOCK *x, + const search_site_config *cfg, MV *ref_mv, + MV *best_mv, int search_param, int sad_per_bit, + int *num00, const vp9_variance_fn_ptr_t *fn_ptr, + const MV *center_mv) { + static const uint32_t data[4] = { 0, 1, 2, 3 }; + const uint32x4_t v_idx_d = vld1q_u32((const uint32_t *)data); + + const int32x4_t zero_s32 = vdupq_n_s32(0); + const int_mv maxmv = pack_int_mv(x->mv_limits.row_max, x->mv_limits.col_max); + const int16x8_t v_max_mv_w = vreinterpretq_s16_s32(vdupq_n_s32(maxmv.as_int)); + const int_mv minmv = pack_int_mv(x->mv_limits.row_min, x->mv_limits.col_min); + const int16x8_t v_min_mv_w = vreinterpretq_s16_s32(vdupq_n_s32(minmv.as_int)); + + const int32x4_t v_spb_d = vdupq_n_s32(sad_per_bit); + + const int32x4_t v_joint_cost_0_d = vdupq_n_s32(x->nmvjointsadcost[0]); + const int32x4_t v_joint_cost_1_d = vdupq_n_s32(x->nmvjointsadcost[1]); + + // search_param determines the length of the initial step and hence the number + // of iterations. + // 0 = initial step (MAX_FIRST_STEP) pel + // 1 = (MAX_FIRST_STEP/2) pel, + // 2 = (MAX_FIRST_STEP/4) pel... + const MV *ss_mv = &cfg->ss_mv[cfg->searches_per_step * search_param]; + const intptr_t *ss_os = &cfg->ss_os[cfg->searches_per_step * search_param]; + const int tot_steps = cfg->total_steps - search_param; + + const int_mv fcenter_mv = + pack_int_mv(center_mv->row >> 3, center_mv->col >> 3); + const int16x8_t vfcmv = vdupq_n_s16(fcenter_mv.as_int); + + const int ref_row = clamp(ref_mv->row, minmv.as_mv.row, maxmv.as_mv.row); + const int ref_col = clamp(ref_mv->col, minmv.as_mv.col, maxmv.as_mv.col); + + int_mv bmv = pack_int_mv(ref_row, ref_col); + int_mv new_bmv = bmv; + int16x8_t v_bmv_w = vreinterpretq_s16_s32(vdupq_n_s32(bmv.as_int)); + + const int what_stride = x->plane[0].src.stride; + const int in_what_stride = x->e_mbd.plane[0].pre[0].stride; + const uint8_t *const what = x->plane[0].src.buf; + const uint8_t *const in_what = + x->e_mbd.plane[0].pre[0].buf + ref_row * in_what_stride + ref_col; + + // Work out the start point for the search + const uint8_t *best_address = in_what; + const uint8_t *new_best_address = best_address; +#if defined(__aarch64__) + int64x2_t v_ba_q = vdupq_n_s64((intptr_t)best_address); +#else + int32x4_t v_ba_d = vdupq_n_s32((intptr_t)best_address); +#endif + unsigned int best_sad = INT_MAX; + int i, j, step; + + // Check the prerequisite cost function properties that are easy to check + // in an assert. See the function-level documentation for details on all + // prerequisites. + assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[2]); + assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[3]); + + // Check the starting position + best_sad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride); + best_sad += mvsad_err_cost(x, bmv, &fcenter_mv.as_mv, sad_per_bit); + + *num00 = 0; + + for (i = 0, step = 0; step < tot_steps; step++) { + for (j = 0; j < cfg->searches_per_step; j += 4, i += 4) { + int16x8_t v_diff_mv_w; + int8x16_t v_inside_d; + uint32x4_t v_outside_d; + int32x4_t v_cost_d, v_sad_d; +#if defined(__aarch64__) + int64x2_t v_blocka[2]; +#else + int32x4_t v_blocka[1]; + uint32x2_t horiz_max_0, horiz_max_1; +#endif + + uint32_t horiz_max; + // Compute the candidate motion vectors + const int16x8_t v_ss_mv_w = vld1q_s16((const int16_t *)&ss_mv[i]); + const int16x8_t v_these_mv_w = vaddq_s16(v_bmv_w, v_ss_mv_w); + // Clamp them to the search bounds + int16x8_t v_these_mv_clamp_w = v_these_mv_w; + v_these_mv_clamp_w = vminq_s16(v_these_mv_clamp_w, v_max_mv_w); + v_these_mv_clamp_w = vmaxq_s16(v_these_mv_clamp_w, v_min_mv_w); + // The ones that did not change are inside the search area + v_inside_d = vreinterpretq_s8_u32( + vceqq_s32(vreinterpretq_s32_s16(v_these_mv_clamp_w), + vreinterpretq_s32_s16(v_these_mv_w))); + + // If none of them are inside, then move on +#if defined(__aarch64__) + horiz_max = vmaxvq_u32(vreinterpretq_u32_s8(v_inside_d)); +#else + horiz_max_0 = vmax_u32(vget_low_u32(vreinterpretq_u32_s8(v_inside_d)), + vget_high_u32(vreinterpretq_u32_s8(v_inside_d))); + horiz_max_1 = vpmax_u32(horiz_max_0, horiz_max_0); + vst1_lane_u32(&horiz_max, horiz_max_1, 0); +#endif + if (LIKELY(horiz_max == 0)) { + continue; + } + + // The inverse mask indicates which of the MVs are outside + v_outside_d = + vreinterpretq_u32_s8(veorq_s8(v_inside_d, vdupq_n_s8((int8_t)0xff))); + // Shift right to keep the sign bit clear, we will use this later + // to set the cost to the maximum value. + v_outside_d = vshrq_n_u32(v_outside_d, 1); + + // Compute the difference MV + v_diff_mv_w = vsubq_s16(v_these_mv_clamp_w, vfcmv); + // We utilise the fact that the cost function is even, and use the + // absolute difference. This allows us to use unsigned indexes later + // and reduces cache pressure somewhat as only a half of the table + // is ever referenced. + v_diff_mv_w = vabsq_s16(v_diff_mv_w); + + // Compute the SIMD pointer offsets. + { +#if defined(__aarch64__) // sizeof(intptr_t) == 8 + // Load the offsets + int64x2_t v_bo10_q = vld1q_s64((const int64_t *)&ss_os[i + 0]); + int64x2_t v_bo32_q = vld1q_s64((const int64_t *)&ss_os[i + 2]); + // Set the ones falling outside to zero + v_bo10_q = vandq_s64( + v_bo10_q, + vmovl_s32(vget_low_s32(vreinterpretq_s32_s8(v_inside_d)))); + v_bo32_q = vandq_s64( + v_bo32_q, + vmovl_s32(vget_high_s32(vreinterpretq_s32_s8(v_inside_d)))); + // Compute the candidate addresses + v_blocka[0] = vaddq_s64(v_ba_q, v_bo10_q); + v_blocka[1] = vaddq_s64(v_ba_q, v_bo32_q); +#else // sizeof(intptr_t) == 4 + int32x4_t v_bo_d = vld1q_s32((const int32_t *)&ss_os[i]); + v_bo_d = vandq_s32(v_bo_d, vreinterpretq_s32_s8(v_inside_d)); + v_blocka[0] = vaddq_s32(v_ba_d, v_bo_d); +#endif + } + + fn_ptr->sdx4df(what, what_stride, (const uint8_t **)&v_blocka[0], + in_what_stride, (uint32_t *)&v_sad_d); + + // Look up the component cost of the residual motion vector + { + uint32_t cost[4]; + int16_t __attribute__((aligned(16))) rowcol[8]; + vst1q_s16(rowcol, v_diff_mv_w); + + // Note: This is a use case for gather instruction + cost[0] = x->nmvsadcost[0][rowcol[0]] + x->nmvsadcost[0][rowcol[1]]; + cost[1] = x->nmvsadcost[0][rowcol[2]] + x->nmvsadcost[0][rowcol[3]]; + cost[2] = x->nmvsadcost[0][rowcol[4]] + x->nmvsadcost[0][rowcol[5]]; + cost[3] = x->nmvsadcost[0][rowcol[6]] + x->nmvsadcost[0][rowcol[7]]; + + v_cost_d = vld1q_s32((int32_t *)cost); + } + + // Now add in the joint cost + { + const uint32x4_t v_sel_d = + vceqq_s32(vreinterpretq_s32_s16(v_diff_mv_w), zero_s32); + const int32x4_t v_joint_cost_d = vreinterpretq_s32_u8( + vbslq_u8(vreinterpretq_u8_u32(v_sel_d), + vreinterpretq_u8_s32(v_joint_cost_0_d), + vreinterpretq_u8_s32(v_joint_cost_1_d))); + v_cost_d = vaddq_s32(v_cost_d, v_joint_cost_d); + } + + // Multiply by sad_per_bit + v_cost_d = vmulq_s32(v_cost_d, v_spb_d); + // ROUND_POWER_OF_TWO(v_cost_d, VP9_PROB_COST_SHIFT) + v_cost_d = + vaddq_s32(v_cost_d, vdupq_n_s32(1 << (VP9_PROB_COST_SHIFT - 1))); + v_cost_d = vshrq_n_s32(v_cost_d, VP9_PROB_COST_SHIFT); + // Add the cost to the sad + v_sad_d = vaddq_s32(v_sad_d, v_cost_d); + + // Make the motion vectors outside the search area have max cost + // by or'ing in the comparison mask, this way the minimum search won't + // pick them. + v_sad_d = vorrq_s32(v_sad_d, vreinterpretq_s32_u32(v_outside_d)); + + // Find the minimum value and index horizontally in v_sad_d + { + uint32_t local_best_sad; +#if defined(__aarch64__) + local_best_sad = vminvq_u32(vreinterpretq_u32_s32(v_sad_d)); +#else + uint32x2_t horiz_min_0 = + vmin_u32(vget_low_u32(vreinterpretq_u32_s32(v_sad_d)), + vget_high_u32(vreinterpretq_u32_s32(v_sad_d))); + uint32x2_t horiz_min_1 = vpmin_u32(horiz_min_0, horiz_min_0); + vst1_lane_u32(&local_best_sad, horiz_min_1, 0); +#endif + + // Update the global minimum if the local minimum is smaller + if (LIKELY(local_best_sad < best_sad)) { +#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#endif + uint32_t local_best_idx; + const uint32x4_t v_sel_d = + vceqq_s32(v_sad_d, vdupq_n_s32(local_best_sad)); + uint32x4_t v_mask_d = vandq_u32(v_sel_d, v_idx_d); + v_mask_d = vbslq_u32(v_sel_d, v_mask_d, vdupq_n_u32(0xffffffff)); + +#if defined(__aarch64__) + local_best_idx = vminvq_u32(v_mask_d); +#else + horiz_min_0 = + vmin_u32(vget_low_u32(v_mask_d), vget_high_u32(v_mask_d)); + horiz_min_1 = vpmin_u32(horiz_min_0, horiz_min_0); + vst1_lane_u32(&local_best_idx, horiz_min_1, 0); +#endif + + new_bmv = ((const int_mv *)&v_these_mv_w)[local_best_idx]; +#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(__clang__) +#pragma GCC diagnostic pop +#endif + new_best_address = ((const uint8_t **)v_blocka)[local_best_idx]; + + best_sad = local_best_sad; + } + } + } + + bmv = new_bmv; + best_address = new_best_address; + + v_bmv_w = vreinterpretq_s16_s32(vdupq_n_s32(bmv.as_int)); +#if defined(__aarch64__) + v_ba_q = vdupq_n_s64((intptr_t)best_address); +#else + v_ba_d = vdupq_n_s32((intptr_t)best_address); +#endif + + if (UNLIKELY(best_address == in_what)) { + (*num00)++; + } + } + + *best_mv = bmv.as_mv; + return best_sad; +} diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk index 92a7fddb9d..c9afd9a347 100644 --- a/vp9/vp9cx.mk +++ b/vp9/vp9cx.mk @@ -113,6 +113,7 @@ VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/temporal_filter_constants.h VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.c VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_quantize_avx2.c VP9_CX_SRCS-$(HAVE_AVX) += encoder/x86/vp9_diamond_search_sad_avx.c +VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_diamond_search_sad_neon.c ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_block_error_intrin_sse2.c VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/highbd_temporal_filter_sse4.c From 873aab02adb54370c062726de056ea4f1888cb0e Mon Sep 17 00:00:00 2001 From: James Zern Date: Sat, 9 Jul 2022 14:49:54 -0700 Subject: [PATCH 0423/1000] vp8_macros_msa.h: avoid shadowing variables in defines this avoids a warning with certain versions of gcc; observed with: mipsisa32r6el-linux-gnu-gcc (Debian 10.2.1-6) 10.2.1 20210110 Change-Id: I8999f487a79a9d53133816d572054b2423330bcf --- vp8/common/mips/msa/vp8_macros_msa.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vp8/common/mips/msa/vp8_macros_msa.h b/vp8/common/mips/msa/vp8_macros_msa.h index ddc881a7fc..fde22f537c 100644 --- a/vp8/common/mips/msa/vp8_macros_msa.h +++ b/vp8/common/mips/msa/vp8_macros_msa.h @@ -69,12 +69,12 @@ #else // !(__mips == 64) #define LD(psrc) \ ({ \ - const uint8_t *psrc_m = (const uint8_t *)(psrc); \ + const uint8_t *psrc_ld = (const uint8_t *)(psrc); \ uint32_t val0_m, val1_m; \ uint64_t val_m = 0; \ \ - val0_m = LW(psrc_m); \ - val1_m = LW(psrc_m + 4); \ + val0_m = LW(psrc_ld); \ + val1_m = LW(psrc_ld + 4); \ \ val_m = (uint64_t)(val1_m); \ val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \ From e2603ead67947cd534e0a593422bae6427451ad6 Mon Sep 17 00:00:00 2001 From: Scott LaVarnway Date: Wed, 6 Jul 2022 08:51:52 -0700 Subject: [PATCH 0424/1000] VPX: Add vpx_quantize_b_avx2(). Up to 1.58x faster than vpx_quantize_b_avx() depending on the size. Bug: b/237714063 Change-Id: I595a6bb32ebee63f69f27b5a15322fdeae1bf70e --- test/vp9_quantize_test.cc | 23 +++-- vpx_dsp/vpx_dsp.mk | 1 + vpx_dsp/vpx_dsp_rtcd_defs.pl | 2 +- vpx_dsp/x86/quantize_avx2.c | 185 +++++++++++++++++++++++++++++++++++ 4 files changed, 201 insertions(+), 10 deletions(-) create mode 100644 vpx_dsp/x86/quantize_avx2.c diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc index b14a20cfcc..c7ce13ff22 100644 --- a/test/vp9_quantize_test.cc +++ b/test/vp9_quantize_test.cc @@ -241,14 +241,16 @@ void VP9QuantizeTest::Speed(bool is_median) { for (int j = 0; j < 8; ++j) zbin_ptr_[j] = 50; coeff_.Set(&rnd, -500, 500); } + + const char *type = + (i == 0) ? "Bypass calculations " : "Full calculations "; + char block_size[16]; + snprintf(block_size, sizeof(block_size), "%dx%d", 4 << sz, 4 << sz); + char title[100]; + snprintf(title, sizeof(title), "%25s %8s ", type, block_size); + if (is_median) { RunNTimes(10000000 / count_); - const char *type = - (i == 0) ? "Bypass calculations " : "Full calculations "; - char block_size[16]; - snprintf(block_size, sizeof(block_size), "%dx%d", 4 << sz, 4 << sz); - char title[100]; - snprintf(title, sizeof(title), "%25s %8s ", type, block_size); PrintMedian(title); } else { Buffer ref_qcoeff = @@ -284,8 +286,9 @@ void VP9QuantizeTest::Speed(bool is_median) { static_cast(vpx_usec_timer_elapsed(&timer)); const int simd_elapsed_time = static_cast(vpx_usec_timer_elapsed(&simd_timer)); - printf("c_time = %d \t simd_time = %d \t Gain = %f \n", elapsed_time, - simd_elapsed_time, ((float)elapsed_time / simd_elapsed_time)); + printf("%s c_time = %d \t simd_time = %d \t Gain = %f \n", title, + elapsed_time, simd_elapsed_time, + ((float)elapsed_time / simd_elapsed_time)); } } } @@ -575,7 +578,9 @@ INSTANTIATE_TEST_SUITE_P( AVX2, VP9QuantizeTest, ::testing::Values(make_tuple(&QuantFPWrapper, &QuantFPWrapper, VPX_BITS_8, - 16, true))); + 16, true), + make_tuple(&vpx_quantize_b_avx2, &vpx_quantize_b_c, + VPX_BITS_8, 16, false))); #endif // HAVE_AVX2 #if HAVE_NEON diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index 13999af04d..8e0100c310 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -326,6 +326,7 @@ DSP_SRCS-$(HAVE_SSE2) += x86/quantize_sse2.h DSP_SRCS-$(HAVE_SSSE3) += x86/quantize_ssse3.c DSP_SRCS-$(HAVE_SSSE3) += x86/quantize_ssse3.h DSP_SRCS-$(HAVE_AVX) += x86/quantize_avx.c +DSP_SRCS-$(HAVE_AVX2) += x86/quantize_avx2.c DSP_SRCS-$(HAVE_NEON) += arm/quantize_neon.c DSP_SRCS-$(HAVE_VSX) += ppc/quantize_vsx.c DSP_SRCS-$(HAVE_LSX) += loongarch/quantize_lsx.c diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index d3c668f9ae..7ecd3ac90f 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -711,7 +711,7 @@ () # if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") { add_proto qw/void vpx_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; - specialize qw/vpx_quantize_b neon sse2 ssse3 avx vsx lsx/; + specialize qw/vpx_quantize_b neon sse2 ssse3 avx avx2 vsx lsx/; add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vpx_quantize_b_32x32 neon ssse3 avx vsx lsx/; diff --git a/vpx_dsp/x86/quantize_avx2.c b/vpx_dsp/x86/quantize_avx2.c new file mode 100644 index 0000000000..e1c6e944ce --- /dev/null +++ b/vpx_dsp/x86/quantize_avx2.c @@ -0,0 +1,185 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" + +static VPX_FORCE_INLINE void load_b_values_avx2( + const int16_t *zbin_ptr, __m256i *zbin, const int16_t *round_ptr, + __m256i *round, const int16_t *quant_ptr, __m256i *quant, + const int16_t *dequant_ptr, __m256i *dequant, const int16_t *shift_ptr, + __m256i *shift) { + *zbin = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)zbin_ptr)); + *zbin = _mm256_permute4x64_epi64(*zbin, 0x54); + // Subtracting 1 here eliminates a _mm256_cmpeq_epi16() instruction when + // calculating the zbin mask. (See quantize_b_logscale{0,1,2}_16) + *zbin = _mm256_sub_epi16(*zbin, _mm256_set1_epi16(1)); + + *round = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)round_ptr)); + *round = _mm256_permute4x64_epi64(*round, 0x54); + + *quant = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)quant_ptr)); + *quant = _mm256_permute4x64_epi64(*quant, 0x54); + *dequant = + _mm256_castsi128_si256(_mm_load_si128((const __m128i *)dequant_ptr)); + *dequant = _mm256_permute4x64_epi64(*dequant, 0x54); + *shift = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)shift_ptr)); + *shift = _mm256_permute4x64_epi64(*shift, 0x54); +} + +static VPX_FORCE_INLINE __m256i +load_coefficients_avx2(const tran_low_t *coeff_ptr) { +#if CONFIG_VP9_HIGHBITDEPTH + // typedef int32_t tran_low_t; + const __m256i coeff1 = _mm256_loadu_si256((const __m256i *)coeff_ptr); + const __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(coeff_ptr + 8)); + return _mm256_packs_epi32(coeff1, coeff2); +#else + // typedef int16_t tran_low_t; + return _mm256_loadu_si256((const __m256i *)coeff_ptr); +#endif +} + +static VPX_FORCE_INLINE void store_coefficients_avx2(__m256i coeff_vals, + tran_low_t *coeff_ptr) { +#if CONFIG_VP9_HIGHBITDEPTH + // typedef int32_t tran_low_t; + __m256i coeff_sign = _mm256_srai_epi16(coeff_vals, 15); + __m256i coeff_vals_lo = _mm256_unpacklo_epi16(coeff_vals, coeff_sign); + __m256i coeff_vals_hi = _mm256_unpackhi_epi16(coeff_vals, coeff_sign); + _mm256_storeu_si256((__m256i *)coeff_ptr, coeff_vals_lo); + _mm256_storeu_si256((__m256i *)(coeff_ptr + 8), coeff_vals_hi); +#else + // typedef int16_t tran_low_t; + _mm256_storeu_si256((__m256i *)coeff_ptr, coeff_vals); +#endif +} + +static VPX_FORCE_INLINE __m256i +quantize_b_16(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, __m256i *v_quant, __m256i *v_dequant, + __m256i *v_round, __m256i *v_zbin, __m256i *v_quant_shift) { + const __m256i v_coeff = load_coefficients_avx2(coeff_ptr); + const __m256i v_abs_coeff = _mm256_abs_epi16(v_coeff); + const __m256i v_zbin_mask = _mm256_cmpgt_epi16(v_abs_coeff, *v_zbin); + + if (_mm256_movemask_epi8(v_zbin_mask) == 0) { + _mm256_storeu_si256((__m256i *)qcoeff_ptr, _mm256_setzero_si256()); + _mm256_storeu_si256((__m256i *)dqcoeff_ptr, _mm256_setzero_si256()); +#if CONFIG_VP9_HIGHBITDEPTH + _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), _mm256_setzero_si256()); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), _mm256_setzero_si256()); +#endif // CONFIG_VP9_HIGHBITDEPTH + return _mm256_setzero_si256(); + } + { + // tmp = v_zbin_mask ? (int64_t)abs_coeff + log_scaled_round : 0 + const __m256i v_tmp_rnd = + _mm256_and_si256(_mm256_adds_epi16(v_abs_coeff, *v_round), v_zbin_mask); + + const __m256i v_tmp32_a = _mm256_mulhi_epi16(v_tmp_rnd, *v_quant); + const __m256i v_tmp32_b = _mm256_add_epi16(v_tmp32_a, v_tmp_rnd); + const __m256i v_tmp32 = _mm256_mulhi_epi16(v_tmp32_b, *v_quant_shift); + const __m256i v_nz_mask = + _mm256_cmpgt_epi16(v_tmp32, _mm256_setzero_si256()); + const __m256i v_qcoeff = _mm256_sign_epi16(v_tmp32, v_coeff); +#if CONFIG_VP9_HIGHBITDEPTH + const __m256i low = _mm256_mullo_epi16(v_qcoeff, *v_dequant); + const __m256i high = _mm256_mulhi_epi16(v_qcoeff, *v_dequant); + + const __m256i v_dqcoeff_lo = _mm256_unpacklo_epi16(low, high); + const __m256i v_dqcoeff_hi = _mm256_unpackhi_epi16(low, high); +#else + const __m256i v_dqcoeff = _mm256_mullo_epi16(v_qcoeff, *v_dequant); +#endif + + store_coefficients_avx2(v_qcoeff, qcoeff_ptr); +#if CONFIG_VP9_HIGHBITDEPTH + _mm256_storeu_si256((__m256i *)(dqcoeff_ptr), v_dqcoeff_lo); + _mm256_storeu_si256((__m256i *)(dqcoeff_ptr + 8), v_dqcoeff_hi); +#else + store_coefficients_avx2(v_dqcoeff, dqcoeff_ptr); +#endif + return v_nz_mask; + } +} + +static VPX_FORCE_INLINE __m256i get_max_lane_eob(const int16_t *iscan, + __m256i v_eobmax, + __m256i v_mask) { + const __m256i v_iscan = _mm256_loadu_si256((const __m256i *)iscan); +#if CONFIG_VP9_HIGHBITDEPTH + // typedef int32_t tran_low_t; + const __m256i v_iscan_perm = _mm256_permute4x64_epi64(v_iscan, 0xD8); + const __m256i v_iscan_plus1 = _mm256_sub_epi16(v_iscan_perm, v_mask); +#else + // typedef int16_t tran_low_t; + const __m256i v_iscan_plus1 = _mm256_sub_epi16(v_iscan, v_mask); +#endif + const __m256i v_nz_iscan = _mm256_and_si256(v_iscan_plus1, v_mask); + return _mm256_max_epi16(v_eobmax, v_nz_iscan); +} + +static VPX_FORCE_INLINE int16_t accumulate_eob256(__m256i eob256) { + const __m128i eob_lo = _mm256_castsi256_si128(eob256); + const __m128i eob_hi = _mm256_extractf128_si256(eob256, 1); + __m128i eob = _mm_max_epi16(eob_lo, eob_hi); + __m128i eob_shuffled = _mm_shuffle_epi32(eob, 0xe); + eob = _mm_max_epi16(eob, eob_shuffled); + eob_shuffled = _mm_shufflelo_epi16(eob, 0xe); + eob = _mm_max_epi16(eob, eob_shuffled); + eob_shuffled = _mm_shufflelo_epi16(eob, 0x1); + eob = _mm_max_epi16(eob, eob_shuffled); + return _mm_extract_epi16(eob, 1); +} + +void vpx_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const int16_t *scan, + const int16_t *iscan) { + __m256i v_zbin, v_round, v_quant, v_dequant, v_quant_shift, v_nz_mask; + __m256i v_eobmax = _mm256_set1_epi16(0); + intptr_t count; + (void)scan; + + load_b_values_avx2(zbin_ptr, &v_zbin, round_ptr, &v_round, quant_ptr, + &v_quant, dequant_ptr, &v_dequant, quant_shift_ptr, + &v_quant_shift); + // Do DC and first 15 AC. + v_nz_mask = quantize_b_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, &v_quant, + &v_dequant, &v_round, &v_zbin, &v_quant_shift); + + v_eobmax = get_max_lane_eob(iscan, v_eobmax, v_nz_mask); + + v_round = _mm256_unpackhi_epi64(v_round, v_round); + v_quant = _mm256_unpackhi_epi64(v_quant, v_quant); + v_dequant = _mm256_unpackhi_epi64(v_dequant, v_dequant); + v_quant_shift = _mm256_unpackhi_epi64(v_quant_shift, v_quant_shift); + v_zbin = _mm256_unpackhi_epi64(v_zbin, v_zbin); + + for (count = n_coeffs - 16; count > 0; count -= 16) { + coeff_ptr += 16; + qcoeff_ptr += 16; + dqcoeff_ptr += 16; + iscan += 16; + v_nz_mask = quantize_b_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, &v_quant, + &v_dequant, &v_round, &v_zbin, &v_quant_shift); + + v_eobmax = get_max_lane_eob(iscan, v_eobmax, v_nz_mask); + } + + *eob_ptr = accumulate_eob256(v_eobmax); +} From cc8236f1d281701afbb05b298769dc6de41500d8 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Wed, 13 Jul 2022 16:54:30 +0000 Subject: [PATCH 0425/1000] Actually include the fix for commit 8f4d1890c. Change-Id: I6780f610151f2e092da525ff064d4b69f74fa61b --- vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c b/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c index e56733d43e..33753f77b0 100644 --- a/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c +++ b/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c @@ -99,7 +99,7 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x, const int_mv fcenter_mv = pack_int_mv(center_mv->row >> 3, center_mv->col >> 3); - const int16x8_t vfcmv = vdupq_n_s16(fcenter_mv.as_int); + const int16x8_t vfcmv = vreinterpretq_s16_s32(vdupq_n_s32(fcenter_mv.as_int)); const int ref_row = clamp(ref_mv->row, minmv.as_mv.row, maxmv.as_mv.row); const int ref_col = clamp(ref_mv->col, minmv.as_mv.col, maxmv.as_mv.col); From 168b312774166958897f727196a59ee8ad423e78 Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 13 Jul 2022 21:54:22 -0700 Subject: [PATCH 0426/1000] vpxenc: fix --disable-loopfilter help alignment Change-Id: I34444e6437ca0e735d6db07bf98bfa4741ad2c01 --- vpxenc.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/vpxenc.c b/vpxenc.c index 7eff97b132..61672acadd 100644 --- a/vpxenc.c +++ b/vpxenc.c @@ -524,9 +524,12 @@ static const arg_def_t row_mt = static const arg_def_t disable_loopfilter = ARG_DEF(NULL, "disable-loopfilter", 1, - "Control Loopfilter in VP9\n" + "Control Loopfilter in VP9:\n" + " " "0: Loopfilter on for all frames (default)\n" + " " "1: Loopfilter off for non reference frames\n" + " " "2: Loopfilter off for all frames"); #endif From 68d9e7aa2f281a4be6b8f3efb24b67b2bfd1f67d Mon Sep 17 00:00:00 2001 From: Cheng Chen Date: Thu, 14 Jul 2022 11:41:11 -0700 Subject: [PATCH 0427/1000] L2E: Update the description of allow_alt_ref It is fixed per each encoding and can not be changed per GOP. Change-Id: I5905b712437142f2274bfa674ceef6093495457f --- vpx/vpx_ext_ratectrl.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vpx/vpx_ext_ratectrl.h b/vpx/vpx_ext_ratectrl.h index c3309b0f26..b6c950d87e 100644 --- a/vpx/vpx_ext_ratectrl.h +++ b/vpx/vpx_ext_ratectrl.h @@ -308,7 +308,9 @@ typedef struct vpx_rc_gop_info { */ int active_max_gf_interval; /*! - * Whether to allow the use of alt ref, can be changed per gop. + * Whether to allow the use of alt ref, determined by the encoder. + * It is fixed for the entire encode. + * See function "is_altref_enabled" in vp9_encoder.h. */ int allow_alt_ref; /*! From a5ead0427c21ef15b12f9b582e860b77f714c622 Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 18 Jul 2022 18:56:03 -0700 Subject: [PATCH 0428/1000] vpx_int_pro_row_c: add an assert for height this quiets a static analysis warning with clang 11: vpx_dsp/avg.c:353:15: warning: Assigned value is garbage or undefined [core.uninitialized.Assign] hbuf[idx] /= norm_factor; ^ ~~~~~~~~~~~ the same fix was applied in libaom: 1ad0889bc aom_int_pro_row_c: add an assert for height Bug: b/229626362 Change-Id: Ic8a249f866b33b02ec9f378581e51ac104d97169 --- vpx_dsp/avg.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vpx_dsp/avg.c b/vpx_dsp/avg.c index 1c45e8a73d..9540154074 100644 --- a/vpx_dsp/avg.c +++ b/vpx_dsp/avg.c @@ -7,6 +7,8 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ + +#include #include #include "./vpx_dsp_rtcd.h" @@ -344,6 +346,7 @@ void vpx_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height) { int idx; const int norm_factor = height >> 1; + assert(height >= 2); for (idx = 0; idx < 16; ++idx) { int i; hbuf[idx] = 0; From 53dd1e8e785ea42fa88499dbfd0c2c9dcd055833 Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 18 Jul 2022 19:00:49 -0700 Subject: [PATCH 0429/1000] avg_intrin_{sse2,avg2}: rm dead store in hadamard_8x8 this quiets a couple static analysis warnings with clang 11: vpx_dsp/x86/avg_intrin_sse2.c:278:45: warning: Although the value stored to 'src_diff' is used in the enclosing expression, the value is never actually read from 'src_diff' [deadcode.DeadStores] src[7] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); ^ ~~~~~~~~~~ vpx_dsp/x86/avg_intrin_avx2.c:307:49: warning: Although the value stored to 'src_diff' is used in the enclosing expression, the value is never actually read from 'src_diff' [deadcode.DeadStores] src[7] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); ^ ~~~~~~~~~~ Bug: b/229626362 Change-Id: I4b0201bd39775885df0afc03fa5da70910b9dad6 --- vpx_dsp/x86/avg_intrin_avx2.c | 2 +- vpx_dsp/x86/avg_intrin_sse2.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vpx_dsp/x86/avg_intrin_avx2.c b/vpx_dsp/x86/avg_intrin_avx2.c index 3f4f577a21..d93e6ccae5 100644 --- a/vpx_dsp/x86/avg_intrin_avx2.c +++ b/vpx_dsp/x86/avg_intrin_avx2.c @@ -304,7 +304,7 @@ static void hadamard_8x8x2_avx2(const int16_t *src_diff, ptrdiff_t src_stride, src[4] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); src[5] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); src[6] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); - src[7] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); + src[7] = _mm256_loadu_si256((const __m256i *)(src_diff + src_stride)); hadamard_col8x2_avx2(src, 0); hadamard_col8x2_avx2(src, 1); diff --git a/vpx_dsp/x86/avg_intrin_sse2.c b/vpx_dsp/x86/avg_intrin_sse2.c index 9da2f34c9b..0c4919f6d8 100644 --- a/vpx_dsp/x86/avg_intrin_sse2.c +++ b/vpx_dsp/x86/avg_intrin_sse2.c @@ -275,7 +275,7 @@ static INLINE void hadamard_8x8_sse2(const int16_t *src_diff, src[4] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); src[5] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); src[6] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); - src[7] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); + src[7] = _mm_load_si128((const __m128i *)(src_diff + src_stride)); hadamard_col8_sse2(src, 0); hadamard_col8_sse2(src, 1); From 414b4f05124b27c512a63c78c3057934850bc941 Mon Sep 17 00:00:00 2001 From: Scott LaVarnway Date: Tue, 12 Jul 2022 13:22:35 -0700 Subject: [PATCH 0430/1000] VPX: Add vpx_quantize_b_32x32_avx2(). Up to 1.36x faster than vpx_quantize_b_32x32_avx() for full calculations. Up to 1.29x faster for VP9_HIGHBITDEPTH builds. Bug: b/237714063 Change-Id: I97aa6a18d4dc2f3187b76800f91bbba7be447ef1 --- test/vp9_quantize_test.cc | 5 +- vpx_dsp/vpx_dsp_rtcd_defs.pl | 2 +- vpx_dsp/x86/quantize_avx2.c | 117 ++++++++++++++++++++++++++++++++++- 3 files changed, 119 insertions(+), 5 deletions(-) diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc index c7ce13ff22..3e0dd77396 100644 --- a/test/vp9_quantize_test.cc +++ b/test/vp9_quantize_test.cc @@ -580,7 +580,10 @@ INSTANTIATE_TEST_SUITE_P( &QuantFPWrapper, VPX_BITS_8, 16, true), make_tuple(&vpx_quantize_b_avx2, &vpx_quantize_b_c, - VPX_BITS_8, 16, false))); + VPX_BITS_8, 16, false), + make_tuple(&vpx_quantize_b_32x32_avx2, + &vpx_quantize_b_32x32_c, VPX_BITS_8, 32, + false))); #endif // HAVE_AVX2 #if HAVE_NEON diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 7ecd3ac90f..beb594f9f3 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -714,7 +714,7 @@ () specialize qw/vpx_quantize_b neon sse2 ssse3 avx avx2 vsx lsx/; add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; - specialize qw/vpx_quantize_b_32x32 neon ssse3 avx vsx lsx/; + specialize qw/vpx_quantize_b_32x32 neon ssse3 avx avx2 vsx lsx/; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; diff --git a/vpx_dsp/x86/quantize_avx2.c b/vpx_dsp/x86/quantize_avx2.c index e1c6e944ce..6fd5174876 100644 --- a/vpx_dsp/x86/quantize_avx2.c +++ b/vpx_dsp/x86/quantize_avx2.c @@ -18,15 +18,25 @@ static VPX_FORCE_INLINE void load_b_values_avx2( const int16_t *zbin_ptr, __m256i *zbin, const int16_t *round_ptr, __m256i *round, const int16_t *quant_ptr, __m256i *quant, const int16_t *dequant_ptr, __m256i *dequant, const int16_t *shift_ptr, - __m256i *shift) { + __m256i *shift, int log_scale) { *zbin = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)zbin_ptr)); *zbin = _mm256_permute4x64_epi64(*zbin, 0x54); + if (log_scale > 0) { + const __m256i rnd = _mm256_set1_epi16((int16_t)(1 << (log_scale - 1))); + *zbin = _mm256_add_epi16(*zbin, rnd); + *zbin = _mm256_srai_epi16(*zbin, log_scale); + } // Subtracting 1 here eliminates a _mm256_cmpeq_epi16() instruction when // calculating the zbin mask. (See quantize_b_logscale{0,1,2}_16) *zbin = _mm256_sub_epi16(*zbin, _mm256_set1_epi16(1)); *round = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)round_ptr)); *round = _mm256_permute4x64_epi64(*round, 0x54); + if (log_scale > 0) { + const __m256i rnd = _mm256_set1_epi16((int16_t)(1 << (log_scale - 1))); + *round = _mm256_add_epi16(*round, rnd); + *round = _mm256_srai_epi16(*round, log_scale); + } *quant = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)quant_ptr)); *quant = _mm256_permute4x64_epi64(*quant, 0x54); @@ -151,13 +161,13 @@ void vpx_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { __m256i v_zbin, v_round, v_quant, v_dequant, v_quant_shift, v_nz_mask; - __m256i v_eobmax = _mm256_set1_epi16(0); + __m256i v_eobmax = _mm256_setzero_si256(); intptr_t count; (void)scan; load_b_values_avx2(zbin_ptr, &v_zbin, round_ptr, &v_round, quant_ptr, &v_quant, dequant_ptr, &v_dequant, quant_shift_ptr, - &v_quant_shift); + &v_quant_shift, 0); // Do DC and first 15 AC. v_nz_mask = quantize_b_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, &v_quant, &v_dequant, &v_round, &v_zbin, &v_quant_shift); @@ -183,3 +193,104 @@ void vpx_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, *eob_ptr = accumulate_eob256(v_eobmax); } + +static VPX_FORCE_INLINE __m256i quantize_b_32x32_16( + const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *iscan, __m256i *v_quant, + __m256i *v_dequant, __m256i *v_round, __m256i *v_zbin, + __m256i *v_quant_shift, __m256i *v_eobmax) { + const __m256i v_coeff = load_coefficients_avx2(coeff_ptr); + const __m256i v_abs_coeff = _mm256_abs_epi16(v_coeff); + const __m256i v_zbin_mask = _mm256_cmpgt_epi16(v_abs_coeff, *v_zbin); + + if (_mm256_movemask_epi8(v_zbin_mask) == 0) { + _mm256_store_si256((__m256i *)qcoeff_ptr, _mm256_setzero_si256()); + _mm256_store_si256((__m256i *)dqcoeff_ptr, _mm256_setzero_si256()); +#if CONFIG_VP9_HIGHBITDEPTH + _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), _mm256_setzero_si256()); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), _mm256_setzero_si256()); +#endif + return *v_eobmax; + } + { + // tmp = v_zbin_mask ? (int64_t)abs_coeff + round : 0 + const __m256i v_tmp_rnd = + _mm256_and_si256(_mm256_adds_epi16(v_abs_coeff, *v_round), v_zbin_mask); + // tmp32 = (int)(((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) * + // quant_shift_ptr[rc != 0]) >> 15); + const __m256i v_tmp32_a = _mm256_mulhi_epi16(v_tmp_rnd, *v_quant); + const __m256i v_tmp32_b = _mm256_add_epi16(v_tmp32_a, v_tmp_rnd); + const __m256i v_tmp32_hi = + _mm256_slli_epi16(_mm256_mulhi_epi16(v_tmp32_b, *v_quant_shift), 1); + const __m256i v_tmp32_lo = + _mm256_srli_epi16(_mm256_mullo_epi16(v_tmp32_b, *v_quant_shift), 15); + const __m256i v_tmp32 = _mm256_or_si256(v_tmp32_hi, v_tmp32_lo); + const __m256i v_qcoeff = _mm256_sign_epi16(v_tmp32, v_coeff); + const __m256i v_sign_lo = + _mm256_unpacklo_epi16(_mm256_setzero_si256(), v_coeff); + const __m256i v_sign_hi = + _mm256_unpackhi_epi16(_mm256_setzero_si256(), v_coeff); + const __m256i low = _mm256_mullo_epi16(v_tmp32, *v_dequant); + const __m256i high = _mm256_mulhi_epi16(v_tmp32, *v_dequant); + const __m256i v_dqcoeff_lo = _mm256_sign_epi32( + _mm256_srli_epi32(_mm256_unpacklo_epi16(low, high), 1), v_sign_lo); + const __m256i v_dqcoeff_hi = _mm256_sign_epi32( + _mm256_srli_epi32(_mm256_unpackhi_epi16(low, high), 1), v_sign_hi); + const __m256i v_nz_mask = + _mm256_cmpgt_epi16(v_tmp32, _mm256_setzero_si256()); + + store_coefficients_avx2(v_qcoeff, qcoeff_ptr); + +#if CONFIG_VP9_HIGHBITDEPTH + _mm256_storeu_si256((__m256i *)(dqcoeff_ptr), v_dqcoeff_lo); + _mm256_storeu_si256((__m256i *)(dqcoeff_ptr + 8), v_dqcoeff_hi); +#else + store_coefficients_avx2(_mm256_packs_epi32(v_dqcoeff_lo, v_dqcoeff_hi), + dqcoeff_ptr); +#endif + + return get_max_lane_eob(iscan, *v_eobmax, v_nz_mask); + } +} + +void vpx_quantize_b_32x32_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, + const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + __m256i v_zbin, v_round, v_quant, v_dequant, v_quant_shift; + __m256i v_eobmax = _mm256_setzero_si256(); + intptr_t count; + (void)n_coeffs; + (void)scan; + + load_b_values_avx2(zbin_ptr, &v_zbin, round_ptr, &v_round, quant_ptr, + &v_quant, dequant_ptr, &v_dequant, quant_shift_ptr, + &v_quant_shift, 1); + + // Do DC and first 15 AC. + v_eobmax = quantize_b_32x32_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, iscan, + &v_quant, &v_dequant, &v_round, &v_zbin, + &v_quant_shift, &v_eobmax); + + v_round = _mm256_unpackhi_epi64(v_round, v_round); + v_quant = _mm256_unpackhi_epi64(v_quant, v_quant); + v_dequant = _mm256_unpackhi_epi64(v_dequant, v_dequant); + v_quant_shift = _mm256_unpackhi_epi64(v_quant_shift, v_quant_shift); + v_zbin = _mm256_unpackhi_epi64(v_zbin, v_zbin); + + for (count = (32 * 32) - 16; count > 0; count -= 16) { + coeff_ptr += 16; + qcoeff_ptr += 16; + dqcoeff_ptr += 16; + iscan += 16; + v_eobmax = quantize_b_32x32_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, iscan, + &v_quant, &v_dequant, &v_round, &v_zbin, + &v_quant_shift, &v_eobmax); + } + + *eob_ptr = accumulate_eob256(v_eobmax); +} From cc8610e18917c5ef654684a2ae7c3a5dee26640e Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 19 Jul 2022 16:30:59 -0700 Subject: [PATCH 0431/1000] encode_api_test: quiet static analysis warning in ConfigChangeThreadCount(); initialize cfg as the static analyzer can assume AlwaysTrue() within EXPECT_NO_FATAL_FAILURE may return false causing InitCodec() not to be called. test/encode_api_test.cc|321 col 3| warning: 1st function call argument is an uninitialized value [core.CallAndMessage] video.SetSize(cfg.g_w, cfg.g_h); Bug: b/229626362 Change-Id: I54899ed0a207ca685416bed3a0e9c9644668e163 --- test/encode_api_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc index 6f61c77502..08159148bd 100644 --- a/test/encode_api_test.cc +++ b/test/encode_api_test.cc @@ -336,7 +336,7 @@ TEST(EncodeAPI, ConfigChangeThreadCount) { for (const auto *iface : kCodecIfaces) { SCOPED_TRACE(vpx_codec_iface_name(iface)); for (int i = 0; i < (IsVP9(iface) ? 2 : 1); ++i) { - vpx_codec_enc_cfg_t cfg; + vpx_codec_enc_cfg_t cfg = {}; struct Encoder { ~Encoder() { EXPECT_EQ(vpx_codec_destroy(&ctx), VPX_CODEC_OK); } vpx_codec_ctx_t ctx = {}; From a36d42f8bd5942de9b2ddf7855f6fac3369d5a7a Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 19 Jul 2022 16:32:30 -0700 Subject: [PATCH 0432/1000] pp_filter_test: quiet static analysis warning in CheckLowFilterOutput(); use std::unique_ptr to avoid spurious memory leak warning: test/pp_filter_test.cc|466 col 3| warning: Potential leak of memory pointed to by 'expected_output' [cplusplus.NewDeleteLeaks] ASSERT_NE(expected_output, nullptr); Bug: b/229626362 Change-Id: Ie9e06c9b9442ffa134e514d2aee70841d19c8ecb --- test/pp_filter_test.cc | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/test/pp_filter_test.cc b/test/pp_filter_test.cc index 775f7f36a3..27d5ffa907 100644 --- a/test/pp_filter_test.cc +++ b/test/pp_filter_test.cc @@ -7,7 +7,11 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ + #include + +#include + #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "test/acm_random.h" @@ -458,14 +462,13 @@ TEST_P(VpxMbPostProcDownTest, CheckLowFilterOutput) { SetRows(src_c_.TopLeftPixel(), rows_, cols_, src_c_.stride()); - unsigned char *expected_output = new unsigned char[rows_ * cols_]; + std::unique_ptr expected_output( + new unsigned char[rows_ * cols_]); ASSERT_NE(expected_output, nullptr); - SetRows(expected_output, rows_, cols_, cols_); + SetRows(expected_output.get(), rows_, cols_, cols_); RunFilterLevel(src_c_.TopLeftPixel(), rows_, cols_, src_c_.stride(), q2mbl(0), - expected_output); - - delete[] expected_output; + expected_output.get()); } TEST_P(VpxMbPostProcDownTest, CheckCvsAssembly) { From 59b27f758c137c2a2d4c11c9feb9a8875f58d096 Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 19 Jul 2022 17:16:05 -0700 Subject: [PATCH 0433/1000] avg_intrin_avx2: rm dead store in highbd_hadamard_8x8 missed in: 53dd1e8e7 avg_intrin_{sse2,avg2}: rm dead store in hadamard_8x8 Change-Id: I378e4a388ceb193a4cfee4d9d317fc62fcc4b39e --- vpx_dsp/x86/avg_intrin_avx2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vpx_dsp/x86/avg_intrin_avx2.c b/vpx_dsp/x86/avg_intrin_avx2.c index d93e6ccae5..b2e01319d3 100644 --- a/vpx_dsp/x86/avg_intrin_avx2.c +++ b/vpx_dsp/x86/avg_intrin_avx2.c @@ -104,7 +104,7 @@ void vpx_highbd_hadamard_8x8_avx2(const int16_t *src_diff, ptrdiff_t src_stride, src16[4] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride)); src16[5] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride)); src16[6] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride)); - src16[7] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride)); + src16[7] = _mm_loadu_si128((const __m128i *)(src_diff + src_stride)); src32[0] = _mm256_cvtepi16_epi32(src16[0]); src32[1] = _mm256_cvtepi16_epi32(src16[1]); From 4e504233f8e603abdd4b39395c8717668009a865 Mon Sep 17 00:00:00 2001 From: Cheng Chen Date: Wed, 20 Jul 2022 15:35:33 -0700 Subject: [PATCH 0434/1000] L2E: Add more unit tests for GOP API Add unit tests for a 4 frame video, which could be considered as a corner case. Three different GOP settings are tested and verified as valid. (1). The first GOP has 3 coding frames, no alt ref. The second GOP has 1 coding frame, no alt ref. The numer of coding frames is 4. Their frame types are: keyframe, inter_frame, inter_frame, golden_frame. (2). The first GOP has 4 coding frames, use alt ref. The second GOP has 1 coding frame, which is the overlay of the first GOP's alt ref frame. The numer of coding frames is 5. Their types are: keyframe, alt_ref, inter_frame, inter_frame, overlay_frame. (3). Only one GOP with 4 coding frames, do not use alt ref. The numer of coding frames is 4. Their types are: keyframe, inter_frame, inter_frame, inter_frame. Change-Id: I4079ff5065da79834b363b1e1976f65efed3f91f --- test/test.mk | 1 + test/vp9_ext_ratectrl_test.cc | 534 +++++++++++++++++++++++++++++++--- 2 files changed, 496 insertions(+), 39 deletions(-) diff --git a/test/test.mk b/test/test.mk index 6df4572904..f60d8f823f 100644 --- a/test/test.mk +++ b/test/test.mk @@ -59,6 +59,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += svc_test.h LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += svc_end_to_end_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += timestamp_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_ext_ratectrl_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += ../vp9/simple_encode.h LIBVPX_TEST_SRCS-yes += decode_test_driver.cc LIBVPX_TEST_SRCS-yes += decode_test_driver.h diff --git a/test/vp9_ext_ratectrl_test.cc b/test/vp9_ext_ratectrl_test.cc index 68703b7e94..c954495dff 100644 --- a/test/vp9_ext_ratectrl_test.cc +++ b/test/vp9_ext_ratectrl_test.cc @@ -16,6 +16,7 @@ #include "test/util.h" #include "test/yuv_video_source.h" #include "third_party/googletest/src/include/gtest/gtest.h" +#include "vp9/simple_encode.h" #include "vpx/vpx_ext_ratectrl.h" #include "vpx_dsp/vpx_dsp_common.h" @@ -25,6 +26,7 @@ constexpr int kModelMagicNumber = 51396; constexpr uintptr_t PrivMagicNumber = 5566; constexpr int kFrameNum = 5; constexpr int kFrameNumGOP = 30; +constexpr int kFrameNumGOPShort = 4; constexpr int kLosslessCodingIndex = 2; constexpr int kFixedGOPSize = 9; // The range check in vp9_cx_iface.c shows that the max @@ -38,6 +40,7 @@ constexpr int kDefaultMaxGfInterval = 16; // The numbers below are from manual inspection. constexpr int kReadMinGfInterval = 5; constexpr int kReadMaxGfInterval = 13; +const char kTestFileName[] = "bus_352x288_420_f20_b8.yuv"; struct ToyRateCtrl { int magic_number; @@ -50,12 +53,12 @@ struct ToyRateCtrl { vpx_rc_status_t rc_create_model(void *priv, const vpx_rc_config_t *ratectrl_config, - vpx_rc_model_t *rate_ctrl_model_pt) { + vpx_rc_model_t *rate_ctrl_model_ptr) { ToyRateCtrl *toy_rate_ctrl = new (std::nothrow) ToyRateCtrl; if (toy_rate_ctrl == nullptr) return VPX_RC_ERROR; toy_rate_ctrl->magic_number = kModelMagicNumber; toy_rate_ctrl->coding_index = -1; - *rate_ctrl_model_pt = toy_rate_ctrl; + *rate_ctrl_model_ptr = toy_rate_ctrl; EXPECT_EQ(priv, reinterpret_cast(PrivMagicNumber)); EXPECT_EQ(ratectrl_config->frame_width, 352); EXPECT_EQ(ratectrl_config->frame_height, 288); @@ -68,7 +71,7 @@ vpx_rc_status_t rc_create_model(void *priv, vpx_rc_status_t rc_create_model_gop(void *priv, const vpx_rc_config_t *ratectrl_config, - vpx_rc_model_t *rate_ctrl_model_pt) { + vpx_rc_model_t *rate_ctrl_model_ptr) { ToyRateCtrl *toy_rate_ctrl = new (std::nothrow) ToyRateCtrl; if (toy_rate_ctrl == nullptr) return VPX_RC_ERROR; toy_rate_ctrl->magic_number = kModelMagicNumber; @@ -76,7 +79,7 @@ vpx_rc_status_t rc_create_model_gop(void *priv, toy_rate_ctrl->frames_since_key = 0; toy_rate_ctrl->show_index = 0; toy_rate_ctrl->coding_index = 0; - *rate_ctrl_model_pt = toy_rate_ctrl; + *rate_ctrl_model_ptr = toy_rate_ctrl; EXPECT_EQ(priv, reinterpret_cast(PrivMagicNumber)); EXPECT_EQ(ratectrl_config->frame_width, 640); EXPECT_EQ(ratectrl_config->frame_height, 360); @@ -87,6 +90,27 @@ vpx_rc_status_t rc_create_model_gop(void *priv, return VPX_RC_OK; } +vpx_rc_status_t rc_create_model_gop_short( + void *priv, const vpx_rc_config_t *ratectrl_config, + vpx_rc_model_t *rate_ctrl_model_ptr) { + ToyRateCtrl *toy_rate_ctrl = new (std::nothrow) ToyRateCtrl; + if (toy_rate_ctrl == nullptr) return VPX_RC_ERROR; + toy_rate_ctrl->magic_number = kModelMagicNumber; + toy_rate_ctrl->gop_global_index = 0; + toy_rate_ctrl->frames_since_key = 0; + toy_rate_ctrl->show_index = 0; + toy_rate_ctrl->coding_index = 0; + *rate_ctrl_model_ptr = toy_rate_ctrl; + EXPECT_EQ(priv, reinterpret_cast(PrivMagicNumber)); + EXPECT_EQ(ratectrl_config->frame_width, 352); + EXPECT_EQ(ratectrl_config->frame_height, 288); + EXPECT_EQ(ratectrl_config->show_frame_count, kFrameNumGOPShort); + EXPECT_EQ(ratectrl_config->target_bitrate_kbps, 500); + EXPECT_EQ(ratectrl_config->frame_rate_num, 30); + EXPECT_EQ(ratectrl_config->frame_rate_den, 1); + return VPX_RC_OK; +} + vpx_rc_status_t rc_send_firstpass_stats( vpx_rc_model_t rate_ctrl_model, const vpx_rc_firstpass_stats_t *first_pass_stats) { @@ -113,6 +137,19 @@ vpx_rc_status_t rc_send_firstpass_stats_gop( return VPX_RC_OK; } +vpx_rc_status_t rc_send_firstpass_stats_gop_short( + vpx_rc_model_t rate_ctrl_model, + const vpx_rc_firstpass_stats_t *first_pass_stats) { + const ToyRateCtrl *toy_rate_ctrl = + static_cast(rate_ctrl_model); + EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); + EXPECT_EQ(first_pass_stats->num_frames, kFrameNumGOPShort); + for (int i = 0; i < first_pass_stats->num_frames; ++i) { + EXPECT_DOUBLE_EQ(first_pass_stats->frame_stats[i].frame, i); + } + return VPX_RC_OK; +} + vpx_rc_status_t rc_get_encodeframe_decision( vpx_rc_model_t rate_ctrl_model, const vpx_rc_encodeframe_info_t *encode_frame_info, @@ -128,19 +165,17 @@ vpx_rc_status_t rc_get_encodeframe_decision( if (encode_frame_info->coding_index == 0) { EXPECT_EQ(encode_frame_info->show_index, 0); EXPECT_EQ(encode_frame_info->gop_index, 0); - EXPECT_EQ(encode_frame_info->frame_type, 0 /*kFrameTypeKey*/); + EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeKey); EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0], 0); // kRefFrameTypeLast EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1], 0); // kRefFrameTypePast EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2], 0); // kRefFrameTypeFuture - } - - if (encode_frame_info->coding_index == 1) { + } else if (encode_frame_info->coding_index == 1) { EXPECT_EQ(encode_frame_info->show_index, 4); EXPECT_EQ(encode_frame_info->gop_index, 1); - EXPECT_EQ(encode_frame_info->frame_type, 2 /*kFrameTypeAltRef*/); + EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeAltRef); EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0], 1); // kRefFrameTypeLast EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1], @@ -149,19 +184,15 @@ vpx_rc_status_t rc_get_encodeframe_decision( 0); // kRefFrameTypeFuture EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[0], 0); // kRefFrameTypeLast - } - - if (encode_frame_info->coding_index >= 2 && - encode_frame_info->coding_index < 5) { + } else if (encode_frame_info->coding_index >= 2 && + encode_frame_info->coding_index < 5) { // In the first group of pictures, coding_index and gop_index are equal. EXPECT_EQ(encode_frame_info->gop_index, encode_frame_info->coding_index); - EXPECT_EQ(encode_frame_info->frame_type, 1 /*kFrameTypeInter*/); - } - - if (encode_frame_info->coding_index == 5) { + EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter); + } else if (encode_frame_info->coding_index == 5) { EXPECT_EQ(encode_frame_info->show_index, 4); EXPECT_EQ(encode_frame_info->gop_index, 0); - EXPECT_EQ(encode_frame_info->frame_type, 3 /*kFrameTypeOverlay*/); + EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeOverlay); EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0], 1); // kRefFrameTypeLast EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1], @@ -197,19 +228,17 @@ vpx_rc_status_t rc_get_encodeframe_decision_gop( if (encode_frame_info->coding_index == 0) { EXPECT_EQ(encode_frame_info->show_index, 0); EXPECT_EQ(encode_frame_info->gop_index, 0); - EXPECT_EQ(encode_frame_info->frame_type, 0 /*kFrameTypeKey*/); + EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeKey); EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0], 0); // kRefFrameTypeLast EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1], 0); // kRefFrameTypePast EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2], 0); // kRefFrameTypeFuture - } - - if (encode_frame_info->coding_index == 1) { + } else if (encode_frame_info->coding_index == 1) { EXPECT_EQ(encode_frame_info->show_index, 1); EXPECT_EQ(encode_frame_info->gop_index, 1); - EXPECT_EQ(encode_frame_info->frame_type, 1 /*kFrameTypeInter*/); + EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter); EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0], 1); // kRefFrameTypeLast EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1], @@ -218,36 +247,198 @@ vpx_rc_status_t rc_get_encodeframe_decision_gop( 0); // kRefFrameTypeFuture EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[0], 0); // kRefFrameTypeLast - } - - if (encode_frame_info->coding_index == 2) { + } else if (encode_frame_info->coding_index == 2) { EXPECT_EQ(encode_frame_info->show_index, 2); EXPECT_EQ(encode_frame_info->gop_index, 0); - EXPECT_EQ(encode_frame_info->frame_type, 0 /*kFrameTypeKey*/); + EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeKey); EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0], 0); // kRefFrameTypeLast EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1], 0); // kRefFrameTypePast EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2], 0); // kRefFrameTypeFuture + } else if (encode_frame_info->coding_index == 3 || + encode_frame_info->coding_index == 12 || + encode_frame_info->coding_index == 21) { + EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeAltRef); + EXPECT_EQ(encode_frame_info->gop_index, 1); + } else if (encode_frame_info->coding_index == 11 || + encode_frame_info->coding_index == 20 || + encode_frame_info->coding_index == 29) { + EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeOverlay); + EXPECT_EQ(encode_frame_info->gop_index, 0); + } else if (encode_frame_info->coding_index >= 30) { + EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter); } - if (encode_frame_info->coding_index == 3 || - encode_frame_info->coding_index == 12 || - encode_frame_info->coding_index == 21) { - EXPECT_EQ(encode_frame_info->frame_type, 2 /*kFrameTypeAltRef*/); + // When the model recommends an invalid q, valid range [0, 255], + // the encoder will ignore it and use the default q selected + // by libvpx rate control strategy. + frame_decision->q_index = VPX_DEFAULT_Q; + frame_decision->max_frame_size = 0; + + toy_rate_ctrl->coding_index += 1; + return VPX_RC_OK; +} + +vpx_rc_status_t rc_get_encodeframe_decision_gop_short( + vpx_rc_model_t rate_ctrl_model, + const vpx_rc_encodeframe_info_t *encode_frame_info, + vpx_rc_encodeframe_decision_t *frame_decision) { + ToyRateCtrl *toy_rate_ctrl = static_cast(rate_ctrl_model); + EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); + EXPECT_LT(encode_frame_info->show_index, kFrameNumGOPShort); + EXPECT_EQ(encode_frame_info->coding_index, toy_rate_ctrl->coding_index); + + if (encode_frame_info->coding_index == 0) { + EXPECT_EQ(encode_frame_info->show_index, 0); + EXPECT_EQ(encode_frame_info->gop_index, 0); + EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeKey); + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0], + 0); // kRefFrameTypeLast + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1], + 0); // kRefFrameTypePast + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2], + 0); // kRefFrameTypeFuture + EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1); + } else if (encode_frame_info->coding_index == 1) { + EXPECT_EQ(encode_frame_info->show_index, 1); EXPECT_EQ(encode_frame_info->gop_index, 1); + EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter); + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0], + 1); // kRefFrameTypeLast + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1], + 0); // kRefFrameTypePast + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2], + 0); // kRefFrameTypeFuture + EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[0], + 0); // kRefFrameTypeLast + EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1); + } else if (encode_frame_info->coding_index == 2) { + EXPECT_EQ(encode_frame_info->show_index, 2); + EXPECT_EQ(encode_frame_info->gop_index, 2); + EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter); + EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1); + } else if (encode_frame_info->coding_index == 3) { + EXPECT_EQ(encode_frame_info->show_index, 3); + EXPECT_EQ(encode_frame_info->gop_index, 0); + EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeGolden); + EXPECT_EQ(toy_rate_ctrl->gop_global_index, 2); } - if (encode_frame_info->coding_index == 11 || - encode_frame_info->coding_index == 20 || - encode_frame_info->coding_index == 29) { - EXPECT_EQ(encode_frame_info->frame_type, 3 /*kFrameTypeOverlay*/); + // When the model recommends an invalid q, valid range [0, 255], + // the encoder will ignore it and use the default q selected + // by libvpx rate control strategy. + frame_decision->q_index = VPX_DEFAULT_Q; + frame_decision->max_frame_size = 0; + + toy_rate_ctrl->coding_index += 1; + return VPX_RC_OK; +} + +vpx_rc_status_t rc_get_encodeframe_decision_gop_short_overlay( + vpx_rc_model_t rate_ctrl_model, + const vpx_rc_encodeframe_info_t *encode_frame_info, + vpx_rc_encodeframe_decision_t *frame_decision) { + ToyRateCtrl *toy_rate_ctrl = static_cast(rate_ctrl_model); + EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); + EXPECT_LT(encode_frame_info->show_index, kFrameNumGOPShort); + EXPECT_EQ(encode_frame_info->coding_index, toy_rate_ctrl->coding_index); + + if (encode_frame_info->coding_index == 0) { + EXPECT_EQ(encode_frame_info->show_index, 0); + EXPECT_EQ(encode_frame_info->gop_index, 0); + EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeKey); + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0], + 0); // kRefFrameTypeLast + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1], + 0); // kRefFrameTypePast + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2], + 0); // kRefFrameTypeFuture + EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1); + } else if (encode_frame_info->coding_index == 1) { + EXPECT_EQ(encode_frame_info->show_index, 3); + EXPECT_EQ(encode_frame_info->gop_index, 1); + EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeAltRef); + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0], + 1); // kRefFrameTypeLast + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1], + 0); // kRefFrameTypePast + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2], + 0); // kRefFrameTypeFuture + EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[0], + 0); // kRefFrameTypeLast + EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1); + } else if (encode_frame_info->coding_index == 2) { + EXPECT_EQ(encode_frame_info->show_index, 1); + EXPECT_EQ(encode_frame_info->gop_index, 2); + EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter); + EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1); + } else if (encode_frame_info->coding_index == 3) { + EXPECT_EQ(encode_frame_info->show_index, 2); + EXPECT_EQ(encode_frame_info->gop_index, 3); + EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter); + EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1); + } else if (encode_frame_info->coding_index == 4) { + EXPECT_EQ(encode_frame_info->show_index, 3); EXPECT_EQ(encode_frame_info->gop_index, 0); + EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeOverlay); + EXPECT_EQ(toy_rate_ctrl->gop_global_index, 2); } - if (encode_frame_info->coding_index >= 30) { - EXPECT_EQ(encode_frame_info->frame_type, 1 /*kFrameTypeInter*/); + // When the model recommends an invalid q, valid range [0, 255], + // the encoder will ignore it and use the default q selected + // by libvpx rate control strategy. + frame_decision->q_index = VPX_DEFAULT_Q; + frame_decision->max_frame_size = 0; + + toy_rate_ctrl->coding_index += 1; + return VPX_RC_OK; +} + +vpx_rc_status_t rc_get_encodeframe_decision_gop_short_no_arf( + vpx_rc_model_t rate_ctrl_model, + const vpx_rc_encodeframe_info_t *encode_frame_info, + vpx_rc_encodeframe_decision_t *frame_decision) { + ToyRateCtrl *toy_rate_ctrl = static_cast(rate_ctrl_model); + EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); + EXPECT_LT(encode_frame_info->show_index, kFrameNumGOPShort); + EXPECT_EQ(encode_frame_info->coding_index, toy_rate_ctrl->coding_index); + + if (encode_frame_info->coding_index == 0) { + EXPECT_EQ(encode_frame_info->show_index, 0); + EXPECT_EQ(encode_frame_info->gop_index, 0); + EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeKey); + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0], + 0); // kRefFrameTypeLast + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1], + 0); // kRefFrameTypePast + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2], + 0); // kRefFrameTypeFuture + EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1); + } else if (encode_frame_info->coding_index == 1) { + EXPECT_EQ(encode_frame_info->show_index, 1); + EXPECT_EQ(encode_frame_info->gop_index, 1); + EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter); + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0], + 1); // kRefFrameTypeLast + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1], + 0); // kRefFrameTypePast + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2], + 0); // kRefFrameTypeFuture + EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[0], + 0); // kRefFrameTypeLast + EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1); + } else if (encode_frame_info->coding_index == 2) { + EXPECT_EQ(encode_frame_info->show_index, 2); + EXPECT_EQ(encode_frame_info->gop_index, 2); + EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter); + EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1); + } else if (encode_frame_info->coding_index == 3) { + EXPECT_EQ(encode_frame_info->show_index, 3); + EXPECT_EQ(encode_frame_info->gop_index, 3); + EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter); + EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1); } // When the model recommends an invalid q, valid range [0, 255], @@ -296,6 +487,117 @@ vpx_rc_status_t rc_get_gop_decision(vpx_rc_model_t rate_ctrl_model, return VPX_RC_OK; } +// Test on a 4 frame video. +// Test a setting of 2 GOPs. +// The first GOP has 3 coding frames, no alt ref. +// The second GOP has 1 coding frame, no alt ref. +vpx_rc_status_t rc_get_gop_decision_short(vpx_rc_model_t rate_ctrl_model, + const vpx_rc_gop_info_t *gop_info, + vpx_rc_gop_decision_t *gop_decision) { + ToyRateCtrl *toy_rate_ctrl = static_cast(rate_ctrl_model); + EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); + EXPECT_EQ(gop_info->lag_in_frames, kMaxLagInFrames - 1); + EXPECT_EQ(gop_info->min_gf_interval, kDefaultMinGfInterval); + EXPECT_EQ(gop_info->max_gf_interval, kDefaultMaxGfInterval); + EXPECT_EQ(gop_info->allow_alt_ref, 1); + if (gop_info->is_key_frame) { + EXPECT_EQ(gop_info->last_gop_use_alt_ref, 0); + EXPECT_EQ(gop_info->frames_since_key, 0); + EXPECT_EQ(gop_info->gop_global_index, 0); + toy_rate_ctrl->gop_global_index = 0; + toy_rate_ctrl->frames_since_key = 0; + } else { + EXPECT_EQ(gop_info->last_gop_use_alt_ref, 0); + } + EXPECT_EQ(gop_info->gop_global_index, toy_rate_ctrl->gop_global_index); + EXPECT_EQ(gop_info->frames_since_key, toy_rate_ctrl->frames_since_key); + EXPECT_EQ(gop_info->show_index, toy_rate_ctrl->show_index); + EXPECT_EQ(gop_info->coding_index, toy_rate_ctrl->coding_index); + + gop_decision->gop_coding_frames = gop_info->gop_global_index == 0 ? 3 : 1; + gop_decision->use_alt_ref = 0; + toy_rate_ctrl->frames_since_key += + gop_decision->gop_coding_frames - gop_decision->use_alt_ref; + toy_rate_ctrl->show_index += + gop_decision->gop_coding_frames - gop_decision->use_alt_ref; + ++toy_rate_ctrl->gop_global_index; + return VPX_RC_OK; +} + +// Test on a 4 frame video. +// Test a setting of 2 GOPs. +// The first GOP has 4 coding frames. Use alt ref. +// The second GOP only contains the overlay frame of the first GOP's alt ref +// frame. +vpx_rc_status_t rc_get_gop_decision_short_overlay( + vpx_rc_model_t rate_ctrl_model, const vpx_rc_gop_info_t *gop_info, + vpx_rc_gop_decision_t *gop_decision) { + ToyRateCtrl *toy_rate_ctrl = static_cast(rate_ctrl_model); + EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); + EXPECT_EQ(gop_info->lag_in_frames, kMaxLagInFrames - 1); + EXPECT_EQ(gop_info->min_gf_interval, kDefaultMinGfInterval); + EXPECT_EQ(gop_info->max_gf_interval, kDefaultMaxGfInterval); + EXPECT_EQ(gop_info->allow_alt_ref, 1); + if (gop_info->is_key_frame) { + EXPECT_EQ(gop_info->last_gop_use_alt_ref, 0); + EXPECT_EQ(gop_info->frames_since_key, 0); + EXPECT_EQ(gop_info->gop_global_index, 0); + toy_rate_ctrl->gop_global_index = 0; + toy_rate_ctrl->frames_since_key = 0; + } else { + EXPECT_EQ(gop_info->last_gop_use_alt_ref, 1); + } + EXPECT_EQ(gop_info->gop_global_index, toy_rate_ctrl->gop_global_index); + EXPECT_EQ(gop_info->frames_since_key, toy_rate_ctrl->frames_since_key); + EXPECT_EQ(gop_info->show_index, toy_rate_ctrl->show_index); + EXPECT_EQ(gop_info->coding_index, toy_rate_ctrl->coding_index); + + gop_decision->gop_coding_frames = gop_info->gop_global_index == 0 ? 4 : 1; + gop_decision->use_alt_ref = gop_info->is_key_frame ? 1 : 0; + toy_rate_ctrl->frames_since_key += + gop_decision->gop_coding_frames - gop_decision->use_alt_ref; + toy_rate_ctrl->show_index += + gop_decision->gop_coding_frames - gop_decision->use_alt_ref; + ++toy_rate_ctrl->gop_global_index; + return VPX_RC_OK; +} + +// Test on a 4 frame video. +// Test a setting of 1 GOP. +// The GOP has 4 coding frames. Do not use alt ref. +vpx_rc_status_t rc_get_gop_decision_short_no_arf( + vpx_rc_model_t rate_ctrl_model, const vpx_rc_gop_info_t *gop_info, + vpx_rc_gop_decision_t *gop_decision) { + ToyRateCtrl *toy_rate_ctrl = static_cast(rate_ctrl_model); + EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); + EXPECT_EQ(gop_info->lag_in_frames, kMaxLagInFrames - 1); + EXPECT_EQ(gop_info->min_gf_interval, kDefaultMinGfInterval); + EXPECT_EQ(gop_info->max_gf_interval, kDefaultMaxGfInterval); + EXPECT_EQ(gop_info->allow_alt_ref, 1); + if (gop_info->is_key_frame) { + EXPECT_EQ(gop_info->last_gop_use_alt_ref, 0); + EXPECT_EQ(gop_info->frames_since_key, 0); + EXPECT_EQ(gop_info->gop_global_index, 0); + toy_rate_ctrl->gop_global_index = 0; + toy_rate_ctrl->frames_since_key = 0; + } else { + EXPECT_EQ(gop_info->last_gop_use_alt_ref, 0); + } + EXPECT_EQ(gop_info->gop_global_index, toy_rate_ctrl->gop_global_index); + EXPECT_EQ(gop_info->frames_since_key, toy_rate_ctrl->frames_since_key); + EXPECT_EQ(gop_info->show_index, toy_rate_ctrl->show_index); + EXPECT_EQ(gop_info->coding_index, toy_rate_ctrl->coding_index); + + gop_decision->gop_coding_frames = gop_info->gop_global_index == 0 ? 4 : 1; + gop_decision->use_alt_ref = 0; + toy_rate_ctrl->frames_since_key += + gop_decision->gop_coding_frames - gop_decision->use_alt_ref; + toy_rate_ctrl->show_index += + gop_decision->gop_coding_frames - gop_decision->use_alt_ref; + ++toy_rate_ctrl->gop_global_index; + return VPX_RC_OK; +} + vpx_rc_status_t rc_update_encodeframe_result( vpx_rc_model_t rate_ctrl_model, const vpx_rc_encodeframe_result_t *encode_frame_result) { @@ -328,6 +630,18 @@ vpx_rc_status_t rc_update_encodeframe_result_gop( return VPX_RC_OK; } +vpx_rc_status_t rc_update_encodeframe_result_gop_short( + vpx_rc_model_t rate_ctrl_model, + const vpx_rc_encodeframe_result_t *encode_frame_result) { + const ToyRateCtrl *toy_rate_ctrl = + static_cast(rate_ctrl_model); + EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); + + const int64_t ref_pixel_count = 352 * 288 * 3 / 2; + EXPECT_EQ(encode_frame_result->pixel_count, ref_pixel_count); + return VPX_RC_OK; +} + vpx_rc_status_t rc_delete_model(vpx_rc_model_t rate_ctrl_model) { ToyRateCtrl *toy_rate_ctrl = static_cast(rate_ctrl_model); EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); @@ -371,7 +685,7 @@ TEST_F(ExtRateCtrlTest, EncodeTest) { "bus_352x288_420_f20_b8.yuv", VPX_IMG_FMT_I420, 352, 288, 30, 1, 0, kFrameNum)); - ASSERT_NE(video.get(), nullptr); + ASSERT_NE(video, nullptr); ASSERT_NO_FATAL_FAILURE(RunLoop(video.get())); } @@ -417,7 +731,149 @@ TEST_F(ExtRateCtrlTestGOP, EncodeTest) { "noisy_clip_640_360.y4m", VPX_IMG_FMT_I420, 640, 360, 30, 1, 0, kFrameNumGOP)); - ASSERT_NE(video.get(), nullptr); + ASSERT_NE(video, nullptr); + ASSERT_NO_FATAL_FAILURE(RunLoop(video.get())); +} + +class ExtRateCtrlTestGOPShort : public ::libvpx_test::EncoderTest, + public ::libvpx_test::CodecTestWithParam { + protected: + ExtRateCtrlTestGOPShort() : EncoderTest(&::libvpx_test::kVP9) {} + + ~ExtRateCtrlTestGOPShort() override = default; + + void SetUp() override { + InitializeConfig(); + SetMode(::libvpx_test::kTwoPassGood); + } + + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { + if (video->frame() == 0) { + encoder->Control(VP9E_SET_MIN_GF_INTERVAL, kDefaultMinGfInterval); + encoder->Control(VP9E_SET_MAX_GF_INTERVAL, kDefaultMaxGfInterval); + + vpx_rc_funcs_t rc_funcs; + rc_funcs.rc_type = VPX_RC_GOP_QP; + rc_funcs.create_model = rc_create_model_gop_short; + rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop_short; + rc_funcs.get_encodeframe_decision = rc_get_encodeframe_decision_gop_short; + rc_funcs.get_gop_decision = rc_get_gop_decision_short; + rc_funcs.update_encodeframe_result = + rc_update_encodeframe_result_gop_short; + rc_funcs.delete_model = rc_delete_model; + rc_funcs.priv = reinterpret_cast(PrivMagicNumber); + encoder->Control(VP9E_SET_EXTERNAL_RATE_CONTROL, &rc_funcs); + } + } +}; + +TEST_F(ExtRateCtrlTestGOPShort, EncodeTest) { + cfg_.rc_target_bitrate = 500; + cfg_.g_lag_in_frames = kMaxLagInFrames - 1; + cfg_.rc_end_usage = VPX_VBR; + + std::unique_ptr video; + video.reset(new (std::nothrow) libvpx_test::YUVVideoSource( + kTestFileName, VPX_IMG_FMT_I420, 352, 288, 30, 1, 0, kFrameNumGOPShort)); + + ASSERT_NE(video, nullptr); + ASSERT_NO_FATAL_FAILURE(RunLoop(video.get())); +} + +class ExtRateCtrlTestGOPShortOverlay + : public ::libvpx_test::EncoderTest, + public ::libvpx_test::CodecTestWithParam { + protected: + ExtRateCtrlTestGOPShortOverlay() : EncoderTest(&::libvpx_test::kVP9) {} + + ~ExtRateCtrlTestGOPShortOverlay() override = default; + + void SetUp() override { + InitializeConfig(); + SetMode(::libvpx_test::kTwoPassGood); + } + + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { + if (video->frame() == 0) { + encoder->Control(VP9E_SET_MIN_GF_INTERVAL, kDefaultMinGfInterval); + encoder->Control(VP9E_SET_MAX_GF_INTERVAL, kDefaultMaxGfInterval); + + vpx_rc_funcs_t rc_funcs; + rc_funcs.rc_type = VPX_RC_GOP_QP; + rc_funcs.create_model = rc_create_model_gop_short; + rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop_short; + rc_funcs.get_encodeframe_decision = + rc_get_encodeframe_decision_gop_short_overlay; + rc_funcs.get_gop_decision = rc_get_gop_decision_short_overlay; + rc_funcs.update_encodeframe_result = + rc_update_encodeframe_result_gop_short; + rc_funcs.delete_model = rc_delete_model; + rc_funcs.priv = reinterpret_cast(PrivMagicNumber); + encoder->Control(VP9E_SET_EXTERNAL_RATE_CONTROL, &rc_funcs); + } + } +}; + +TEST_F(ExtRateCtrlTestGOPShortOverlay, EncodeTest) { + cfg_.rc_target_bitrate = 500; + cfg_.g_lag_in_frames = kMaxLagInFrames - 1; + cfg_.rc_end_usage = VPX_VBR; + + std::unique_ptr video; + video.reset(new (std::nothrow) libvpx_test::YUVVideoSource( + kTestFileName, VPX_IMG_FMT_I420, 352, 288, 30, 1, 0, kFrameNumGOPShort)); + + ASSERT_NE(video, nullptr); + ASSERT_NO_FATAL_FAILURE(RunLoop(video.get())); +} + +class ExtRateCtrlTestGOPShortNoARF + : public ::libvpx_test::EncoderTest, + public ::libvpx_test::CodecTestWithParam { + protected: + ExtRateCtrlTestGOPShortNoARF() : EncoderTest(&::libvpx_test::kVP9) {} + + ~ExtRateCtrlTestGOPShortNoARF() override = default; + + void SetUp() override { + InitializeConfig(); + SetMode(::libvpx_test::kTwoPassGood); + } + + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { + if (video->frame() == 0) { + encoder->Control(VP9E_SET_MIN_GF_INTERVAL, kDefaultMinGfInterval); + encoder->Control(VP9E_SET_MAX_GF_INTERVAL, kDefaultMaxGfInterval); + + vpx_rc_funcs_t rc_funcs; + rc_funcs.rc_type = VPX_RC_GOP_QP; + rc_funcs.create_model = rc_create_model_gop_short; + rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop_short; + rc_funcs.get_encodeframe_decision = + rc_get_encodeframe_decision_gop_short_no_arf; + rc_funcs.get_gop_decision = rc_get_gop_decision_short_no_arf; + rc_funcs.update_encodeframe_result = + rc_update_encodeframe_result_gop_short; + rc_funcs.delete_model = rc_delete_model; + rc_funcs.priv = reinterpret_cast(PrivMagicNumber); + encoder->Control(VP9E_SET_EXTERNAL_RATE_CONTROL, &rc_funcs); + } + } +}; + +TEST_F(ExtRateCtrlTestGOPShortNoARF, EncodeTest) { + cfg_.rc_target_bitrate = 500; + cfg_.g_lag_in_frames = kMaxLagInFrames - 1; + cfg_.rc_end_usage = VPX_VBR; + + std::unique_ptr video; + video.reset(new (std::nothrow) libvpx_test::YUVVideoSource( + kTestFileName, VPX_IMG_FMT_I420, 352, 288, 30, 1, 0, kFrameNumGOPShort)); + + ASSERT_NE(video, nullptr); ASSERT_NO_FATAL_FAILURE(RunLoop(video.get())); } From 90c5493ff5d805676233252be633d2eedd5ceb50 Mon Sep 17 00:00:00 2001 From: Scott LaVarnway Date: Wed, 20 Jul 2022 09:51:56 -0700 Subject: [PATCH 0435/1000] VPX: Add vpx_highbd_quantize_b_avx2(). Up to 3.61x faster than vpx_highbd_quantize_b_sse2() for full calculations. ~2.3% overall encoder improvement for the test clip used. Bug: b/237714063 Change-Id: I23f88d2a7f96aaa4103778372f4f552207f73cee --- test/vp9_quantize_test.cc | 12 ++ vpx_dsp/vpx_dsp.mk | 1 + vpx_dsp/vpx_dsp_rtcd_defs.pl | 2 +- vpx_dsp/x86/highbd_quantize_intrin_avx2.c | 169 ++++++++++++++++++++++ 4 files changed, 183 insertions(+), 1 deletion(-) create mode 100644 vpx_dsp/x86/highbd_quantize_intrin_avx2.c diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc index 3e0dd77396..ac33d17707 100644 --- a/test/vp9_quantize_test.cc +++ b/test/vp9_quantize_test.cc @@ -574,6 +574,17 @@ INSTANTIATE_TEST_SUITE_P(AVX, VP9QuantizeTest, #endif // HAVE_AVX #if VPX_ARCH_X86_64 && HAVE_AVX2 +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_SUITE_P( + AVX2, VP9QuantizeTest, + ::testing::Values( + make_tuple(&vpx_highbd_quantize_b_avx2, &vpx_highbd_quantize_b_c, + VPX_BITS_8, 16, false), + make_tuple(&vpx_highbd_quantize_b_avx2, &vpx_highbd_quantize_b_c, + VPX_BITS_10, 16, false), + make_tuple(&vpx_highbd_quantize_b_avx2, &vpx_highbd_quantize_b_c, + VPX_BITS_12, 16, false))); +#else INSTANTIATE_TEST_SUITE_P( AVX2, VP9QuantizeTest, ::testing::Values(make_tuple(&QuantFPWrapper, @@ -584,6 +595,7 @@ INSTANTIATE_TEST_SUITE_P( make_tuple(&vpx_quantize_b_32x32_avx2, &vpx_quantize_b_32x32_c, VPX_BITS_8, 32, false))); +#endif // CONFIG_VP9_HIGHBITDEPTH #endif // HAVE_AVX2 #if HAVE_NEON diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index 8e0100c310..dd667195f5 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -332,6 +332,7 @@ DSP_SRCS-$(HAVE_VSX) += ppc/quantize_vsx.c DSP_SRCS-$(HAVE_LSX) += loongarch/quantize_lsx.c ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_SSE2) += x86/highbd_quantize_intrin_sse2.c +DSP_SRCS-$(HAVE_AVX2) += x86/highbd_quantize_intrin_avx2.c endif # avg diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index beb594f9f3..45fcb559f1 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -718,7 +718,7 @@ () if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; - specialize qw/vpx_highbd_quantize_b sse2/; + specialize qw/vpx_highbd_quantize_b sse2 avx2/; add_proto qw/void vpx_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vpx_highbd_quantize_b_32x32 sse2/; diff --git a/vpx_dsp/x86/highbd_quantize_intrin_avx2.c b/vpx_dsp/x86/highbd_quantize_intrin_avx2.c new file mode 100644 index 0000000000..f4c288cc09 --- /dev/null +++ b/vpx_dsp/x86/highbd_quantize_intrin_avx2.c @@ -0,0 +1,169 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_dsp_rtcd.h" + +static VPX_FORCE_INLINE void init_one_qp(const __m128i *p, __m256i *qp) { + const __m128i sign = _mm_srai_epi16(*p, 15); + const __m128i dc = _mm_unpacklo_epi16(*p, sign); + const __m128i ac = _mm_unpackhi_epi16(*p, sign); + *qp = _mm256_insertf128_si256(_mm256_castsi128_si256(dc), ac, 1); +} + +static VPX_FORCE_INLINE void update_qp(__m256i *qp) { + int i; + for (i = 0; i < 5; ++i) { + qp[i] = _mm256_permute2x128_si256(qp[i], qp[i], 0x11); + } +} + +static VPX_FORCE_INLINE void init_qp(const int16_t *zbin_ptr, + const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *dequant_ptr, + const int16_t *quant_shift_ptr, + __m256i *qp, int log_scale) { + const __m128i zbin = _mm_loadu_si128((const __m128i *)zbin_ptr); + const __m128i round = _mm_loadu_si128((const __m128i *)round_ptr); + const __m128i quant = _mm_loadu_si128((const __m128i *)quant_ptr); + const __m128i dequant = _mm_loadu_si128((const __m128i *)dequant_ptr); + const __m128i quant_shift = _mm_loadu_si128((const __m128i *)quant_shift_ptr); + init_one_qp(&zbin, &qp[0]); + init_one_qp(&round, &qp[1]); + init_one_qp(&quant, &qp[2]); + init_one_qp(&dequant, &qp[3]); + init_one_qp(&quant_shift, &qp[4]); + if (log_scale > 0) { + const __m256i rnd = _mm256_set1_epi32((int16_t)(1 << (log_scale - 1))); + qp[0] = _mm256_add_epi32(qp[0], rnd); + qp[0] = _mm256_srai_epi32(qp[0], log_scale); + + qp[1] = _mm256_add_epi32(qp[1], rnd); + qp[1] = _mm256_srai_epi32(qp[1], log_scale); + } + // Subtracting 1 here eliminates a _mm256_cmpeq_epi32() instruction when + // calculating the zbin mask. + qp[0] = _mm256_sub_epi32(qp[0], _mm256_set1_epi32(1)); +} + +// Note: +// *x is vector multiplied by *y which is 16 int32_t parallel multiplication +// and right shift 16. The output, 16 int32_t is save in *p. +static VPX_FORCE_INLINE __m256i mm256_mul_shift_epi32(const __m256i *x, + const __m256i *y) { + __m256i prod_lo = _mm256_mul_epi32(*x, *y); + __m256i prod_hi = _mm256_srli_epi64(*x, 32); + const __m256i mult_hi = _mm256_srli_epi64(*y, 32); + const __m256i mask = _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1); + prod_hi = _mm256_mul_epi32(prod_hi, mult_hi); + prod_lo = _mm256_srli_epi64(prod_lo, 16); + prod_lo = _mm256_and_si256(prod_lo, mask); + prod_hi = _mm256_srli_epi64(prod_hi, 16); + prod_hi = _mm256_slli_epi64(prod_hi, 32); + return _mm256_or_si256(prod_lo, prod_hi); +} + +static VPX_FORCE_INLINE __m256i get_max_lane_eob(const int16_t *iscan_ptr, + __m256i eobmax, + __m256i nz_mask) { + const __m256i packed_nz_mask = _mm256_packs_epi32(nz_mask, nz_mask); + const __m256i packed_nz_mask_perm = + _mm256_permute4x64_epi64(packed_nz_mask, 0xD8); + const __m256i iscan = + _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)iscan_ptr)); + const __m256i iscan_plus1 = _mm256_sub_epi16(iscan, packed_nz_mask_perm); + const __m256i nz_iscan = _mm256_and_si256(iscan_plus1, packed_nz_mask_perm); + return _mm256_max_epi16(eobmax, nz_iscan); +} + +// Get the max eob from the lower 128 bits. +static VPX_FORCE_INLINE uint16_t get_max_eob(__m256i eob) { + __m256i eob_s; + eob_s = _mm256_shuffle_epi32(eob, 0xe); + eob = _mm256_max_epi16(eob, eob_s); + eob_s = _mm256_shufflelo_epi16(eob, 0xe); + eob = _mm256_max_epi16(eob, eob_s); + eob_s = _mm256_shufflelo_epi16(eob, 1); + eob = _mm256_max_epi16(eob, eob_s); + return (uint16_t)_mm256_extract_epi16(eob, 0); +} + +static VPX_FORCE_INLINE void quantize(const __m256i *qp, + const tran_low_t *coeff_ptr, + const int16_t *iscan_ptr, + tran_low_t *qcoeff, tran_low_t *dqcoeff, + __m256i *eob) { + const __m256i coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr); + const __m256i abs_coeff = _mm256_abs_epi32(coeff); + const __m256i zbin_mask = _mm256_cmpgt_epi32(abs_coeff, qp[0]); + + if (_mm256_movemask_epi8(zbin_mask) == 0) { + const __m256i zero = _mm256_setzero_si256(); + _mm256_storeu_si256((__m256i *)qcoeff, zero); + _mm256_storeu_si256((__m256i *)dqcoeff, zero); + return; + } + { + const __m256i tmp_rnd = + _mm256_and_si256(_mm256_add_epi32(abs_coeff, qp[1]), zbin_mask); + const __m256i tmp = mm256_mul_shift_epi32(&tmp_rnd, &qp[2]); + const __m256i tmp2 = _mm256_add_epi32(tmp, tmp_rnd); + const __m256i abs_q = mm256_mul_shift_epi32(&tmp2, &qp[4]); + const __m256i abs_dq = _mm256_mullo_epi32(abs_q, qp[3]); + const __m256i nz_mask = _mm256_cmpgt_epi32(abs_q, _mm256_setzero_si256()); + const __m256i q = _mm256_sign_epi32(abs_q, coeff); + const __m256i dq = _mm256_sign_epi32(abs_dq, coeff); + + _mm256_storeu_si256((__m256i *)qcoeff, q); + _mm256_storeu_si256((__m256i *)dqcoeff, dq); + + *eob = get_max_lane_eob(iscan_ptr, *eob, nz_mask); + } +} + +void vpx_highbd_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, + const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + const int step = 8; + __m256i eob = _mm256_setzero_si256(); + __m256i qp[5]; + (void)scan; + + init_qp(zbin_ptr, round_ptr, quant_ptr, dequant_ptr, quant_shift_ptr, qp, 0); + + quantize(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob); + + coeff_ptr += step; + qcoeff_ptr += step; + dqcoeff_ptr += step; + iscan += step; + n_coeffs -= step; + + update_qp(qp); + + while (n_coeffs > 0) { + quantize(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob); + + coeff_ptr += step; + qcoeff_ptr += step; + dqcoeff_ptr += step; + iscan += step; + n_coeffs -= step; + } + + *eob_ptr = get_max_eob(eob); +} From 90ef3906a2af36a0f973709302785de92a7e12a1 Mon Sep 17 00:00:00 2001 From: Scott LaVarnway Date: Mon, 25 Jul 2022 05:52:27 -0700 Subject: [PATCH 0436/1000] VPX: Add vpx_highbd_quantize_b_32x32_avx2(). Up to 11.78x faster than vpx_quantize_b_32x32_sse2() for full calculations. ~1.7% overall encoder improvement for the test clip used. Bug: b/237714063 Change-Id: Ib759056db94d3487239cb2748ffef1184a89ae18 --- test/vp9_quantize_test.cc | 8 ++- vpx_dsp/vpx_dsp_rtcd_defs.pl | 2 +- vpx_dsp/x86/highbd_quantize_intrin_avx2.c | 86 +++++++++++++++++++++++ 3 files changed, 94 insertions(+), 2 deletions(-) diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc index ac33d17707..705f3f0077 100644 --- a/test/vp9_quantize_test.cc +++ b/test/vp9_quantize_test.cc @@ -583,7 +583,13 @@ INSTANTIATE_TEST_SUITE_P( make_tuple(&vpx_highbd_quantize_b_avx2, &vpx_highbd_quantize_b_c, VPX_BITS_10, 16, false), make_tuple(&vpx_highbd_quantize_b_avx2, &vpx_highbd_quantize_b_c, - VPX_BITS_12, 16, false))); + VPX_BITS_12, 16, false), + make_tuple(&vpx_highbd_quantize_b_32x32_avx2, + &vpx_highbd_quantize_b_32x32_c, VPX_BITS_8, 32, false), + make_tuple(&vpx_highbd_quantize_b_32x32_avx2, + &vpx_highbd_quantize_b_32x32_c, VPX_BITS_10, 32, false), + make_tuple(&vpx_highbd_quantize_b_32x32_avx2, + &vpx_highbd_quantize_b_32x32_c, VPX_BITS_12, 32, false))); #else INSTANTIATE_TEST_SUITE_P( AVX2, VP9QuantizeTest, diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 45fcb559f1..e7ad640af8 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -721,7 +721,7 @@ () specialize qw/vpx_highbd_quantize_b sse2 avx2/; add_proto qw/void vpx_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; - specialize qw/vpx_highbd_quantize_b_32x32 sse2/; + specialize qw/vpx_highbd_quantize_b_32x32 sse2 avx2/; } # CONFIG_VP9_HIGHBITDEPTH } # CONFIG_VP9_ENCODER diff --git a/vpx_dsp/x86/highbd_quantize_intrin_avx2.c b/vpx_dsp/x86/highbd_quantize_intrin_avx2.c index f4c288cc09..ec1110ff8c 100644 --- a/vpx_dsp/x86/highbd_quantize_intrin_avx2.c +++ b/vpx_dsp/x86/highbd_quantize_intrin_avx2.c @@ -167,3 +167,89 @@ void vpx_highbd_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, *eob_ptr = get_max_eob(eob); } + +static VPX_FORCE_INLINE __m256i mm256_mul_shift_epi32_logscale(const __m256i *x, + const __m256i *y, + int log_scale) { + __m256i prod_lo = _mm256_mul_epi32(*x, *y); + __m256i prod_hi = _mm256_srli_epi64(*x, 32); + const __m256i mult_hi = _mm256_srli_epi64(*y, 32); + const __m256i mask = _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1); + prod_hi = _mm256_mul_epi32(prod_hi, mult_hi); + prod_lo = _mm256_srli_epi64(prod_lo, 16 - log_scale); + prod_lo = _mm256_and_si256(prod_lo, mask); + prod_hi = _mm256_srli_epi64(prod_hi, 16 - log_scale); + prod_hi = _mm256_slli_epi64(prod_hi, 32); + return _mm256_or_si256(prod_lo, prod_hi); +} + +static VPX_FORCE_INLINE void quantize_b_32x32( + const __m256i *qp, const tran_low_t *coeff_ptr, const int16_t *iscan_ptr, + tran_low_t *qcoeff, tran_low_t *dqcoeff, __m256i *eob) { + const __m256i coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr); + const __m256i abs_coeff = _mm256_abs_epi32(coeff); + const __m256i zbin_mask = _mm256_cmpgt_epi32(abs_coeff, qp[0]); + + if (_mm256_movemask_epi8(zbin_mask) == 0) { + const __m256i zero = _mm256_setzero_si256(); + _mm256_storeu_si256((__m256i *)qcoeff, zero); + _mm256_storeu_si256((__m256i *)dqcoeff, zero); + return; + } + + { + const __m256i tmp_rnd = + _mm256_and_si256(_mm256_add_epi32(abs_coeff, qp[1]), zbin_mask); + // const int64_t tmp2 = ((tmpw * quant_ptr[rc != 0]) >> 16) + tmpw; + const __m256i tmp = mm256_mul_shift_epi32_logscale(&tmp_rnd, &qp[2], 0); + const __m256i tmp2 = _mm256_add_epi32(tmp, tmp_rnd); + // const int abs_qcoeff = (int)((tmp2 * quant_shift_ptr[rc != 0]) >> 15); + const __m256i abs_q = mm256_mul_shift_epi32_logscale(&tmp2, &qp[4], 1); + const __m256i abs_dq = + _mm256_srli_epi32(_mm256_mullo_epi32(abs_q, qp[3]), 1); + const __m256i nz_mask = _mm256_cmpgt_epi32(abs_q, _mm256_setzero_si256()); + const __m256i q = _mm256_sign_epi32(abs_q, coeff); + const __m256i dq = _mm256_sign_epi32(abs_dq, coeff); + + _mm256_storeu_si256((__m256i *)qcoeff, q); + _mm256_storeu_si256((__m256i *)dqcoeff, dq); + + *eob = get_max_lane_eob(iscan_ptr, *eob, nz_mask); + } +} + +void vpx_highbd_quantize_b_32x32_avx2( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + const unsigned int step = 8; + __m256i eob = _mm256_setzero_si256(); + __m256i qp[5]; + (void)scan; + + init_qp(zbin_ptr, round_ptr, quant_ptr, dequant_ptr, quant_shift_ptr, qp, 1); + + quantize_b_32x32(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob); + + coeff_ptr += step; + qcoeff_ptr += step; + dqcoeff_ptr += step; + iscan += step; + n_coeffs -= step; + + update_qp(qp); + + while (n_coeffs > 0) { + quantize_b_32x32(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob); + + coeff_ptr += step; + qcoeff_ptr += step; + dqcoeff_ptr += step; + iscan += step; + n_coeffs -= step; + } + + *eob_ptr = get_max_eob(eob); +} \ No newline at end of file From ea13f315c9e98a908761fb29042728857d070216 Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 26 Jul 2022 18:17:22 -0700 Subject: [PATCH 0437/1000] highbd_temporal_filter_sse4: remove unused function params this clears warnings under clang-13 of the form: vp9/encoder/x86/highbd_temporal_filter_sse4.c|196 col 63| warning: parameter 'v_pre' set but not used [-Wunused-but-set-parameter] this is the high-bitdepth version of: 73b8aade8 temporal_filter_sse4: remove unused function params Change-Id: I9b2c9bf27c16975e4855df6a2c967da4c8c63a3a --- vp9/encoder/x86/highbd_temporal_filter_sse4.c | 135 ++++++------------ 1 file changed, 42 insertions(+), 93 deletions(-) diff --git a/vp9/encoder/x86/highbd_temporal_filter_sse4.c b/vp9/encoder/x86/highbd_temporal_filter_sse4.c index 4fa24512c5..a7f5117cff 100644 --- a/vp9/encoder/x86/highbd_temporal_filter_sse4.c +++ b/vp9/encoder/x86/highbd_temporal_filter_sse4.c @@ -191,13 +191,11 @@ static INLINE void highbd_read_chroma_dist_row_8( } static void vp9_highbd_apply_temporal_filter_luma_8( - const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre, - int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src, - int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre, - int uv_pre_stride, unsigned int block_width, unsigned int block_height, - int ss_x, int ss_y, int strength, int use_whole_blk, uint32_t *y_accum, - uint16_t *y_count, const uint32_t *y_dist, const uint32_t *u_dist, - const uint32_t *v_dist, const uint32_t *const *neighbors_first, + const uint16_t *y_pre, int y_pre_stride, unsigned int block_width, + unsigned int block_height, int ss_x, int ss_y, int strength, + int use_whole_blk, uint32_t *y_accum, uint16_t *y_count, + const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist, + const uint32_t *const *neighbors_first, const uint32_t *const *neighbors_second, int top_weight, int bottom_weight) { const int rounding = (1 << strength) >> 1; @@ -256,17 +254,12 @@ static void vp9_highbd_apply_temporal_filter_luma_8( highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count, y_accum); - y_src += y_src_stride; y_pre += y_pre_stride; y_count += y_pre_stride; y_accum += y_pre_stride; y_dist += DIST_STRIDE; - u_src += uv_src_stride; - u_pre += uv_pre_stride; u_dist += DIST_STRIDE; - v_src += uv_src_stride; - v_pre += uv_pre_stride; v_dist += DIST_STRIDE; // Then all the rows except the last one @@ -300,11 +293,7 @@ static void vp9_highbd_apply_temporal_filter_luma_8( highbd_read_chroma_dist_row_8(ss_x, u_dist, v_dist, &u_first, &u_second, &v_first, &v_second); - u_src += uv_src_stride; - u_pre += uv_pre_stride; u_dist += DIST_STRIDE; - v_src += uv_src_stride; - v_pre += uv_pre_stride; v_dist += DIST_STRIDE; } @@ -320,7 +309,6 @@ static void vp9_highbd_apply_temporal_filter_luma_8( highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count, y_accum); - y_src += y_src_stride; y_pre += y_pre_stride; y_count += y_pre_stride; y_accum += y_pre_stride; @@ -364,13 +352,10 @@ static void vp9_highbd_apply_temporal_filter_luma_8( // Perform temporal filter for the luma component. static void vp9_highbd_apply_temporal_filter_luma( - const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre, - int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src, - int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre, - int uv_pre_stride, unsigned int block_width, unsigned int block_height, - int ss_x, int ss_y, int strength, const int *blk_fw, int use_whole_blk, - uint32_t *y_accum, uint16_t *y_count, const uint32_t *y_dist, - const uint32_t *u_dist, const uint32_t *v_dist) { + const uint16_t *y_pre, int y_pre_stride, unsigned int block_width, + unsigned int block_height, int ss_x, int ss_y, int strength, + const int *blk_fw, int use_whole_blk, uint32_t *y_accum, uint16_t *y_count, + const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist) { unsigned int blk_col = 0, uv_blk_col = 0; const unsigned int blk_col_step = 8, uv_blk_col_step = 8 >> ss_x; const unsigned int mid_width = block_width >> 1, @@ -384,9 +369,7 @@ static void vp9_highbd_apply_temporal_filter_luma( neighbors_first = HIGHBD_LUMA_LEFT_COLUMN_NEIGHBORS; neighbors_second = HIGHBD_LUMA_MIDDLE_COLUMN_NEIGHBORS; vp9_highbd_apply_temporal_filter_luma_8( - y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, - u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col, - v_pre + uv_blk_col, uv_pre_stride, blk_col_step, block_height, ss_x, ss_y, + y_pre + blk_col, y_pre_stride, blk_col_step, block_height, ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight, bottom_weight); @@ -399,13 +382,10 @@ static void vp9_highbd_apply_temporal_filter_luma( for (; blk_col < mid_width; blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { vp9_highbd_apply_temporal_filter_luma_8( - y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, - u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, - u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, blk_col_step, - block_height, ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col, - y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col, - v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight, - bottom_weight); + y_pre + blk_col, y_pre_stride, blk_col_step, block_height, ss_x, ss_y, + strength, use_whole_blk, y_accum + blk_col, y_count + blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_first, neighbors_second, top_weight, bottom_weight); } if (!use_whole_blk) { @@ -417,21 +397,16 @@ static void vp9_highbd_apply_temporal_filter_luma( for (; blk_col < last_width; blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { vp9_highbd_apply_temporal_filter_luma_8( - y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, - u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, - u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, blk_col_step, - block_height, ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col, - y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col, - v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight, - bottom_weight); + y_pre + blk_col, y_pre_stride, blk_col_step, block_height, ss_x, ss_y, + strength, use_whole_blk, y_accum + blk_col, y_count + blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_first, neighbors_second, top_weight, bottom_weight); } // Right neighbors_second = HIGHBD_LUMA_RIGHT_COLUMN_NEIGHBORS; vp9_highbd_apply_temporal_filter_luma_8( - y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, - u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col, - v_pre + uv_blk_col, uv_pre_stride, blk_col_step, block_height, ss_x, ss_y, + y_pre + blk_col, y_pre_stride, blk_col_step, block_height, ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight, bottom_weight); @@ -491,13 +466,11 @@ static INLINE void highbd_add_luma_dist_to_8_chroma_mod( // blk_fw as an array of size 4 for the weights for each of the 4 subblocks, // else use top_weight for top half, and bottom weight for bottom half. static void vp9_highbd_apply_temporal_filter_chroma_8( - const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre, - int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src, - int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre, - int uv_pre_stride, unsigned int uv_block_width, - unsigned int uv_block_height, int ss_x, int ss_y, int strength, - uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count, - const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist, + const uint16_t *u_pre, const uint16_t *v_pre, int uv_pre_stride, + unsigned int uv_block_width, unsigned int uv_block_height, int ss_x, + int ss_y, int strength, uint32_t *u_accum, uint16_t *u_count, + uint32_t *v_accum, uint16_t *v_count, const uint32_t *y_dist, + const uint32_t *u_dist, const uint32_t *v_dist, const uint32_t *const *neighbors_fst, const uint32_t *const *neighbors_snd, int top_weight, int bottom_weight, const int *blk_fw) { const int rounding = (1 << strength) >> 1; @@ -565,10 +538,8 @@ static void vp9_highbd_apply_temporal_filter_chroma_8( highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count, v_accum); - u_src += uv_src_stride; u_pre += uv_pre_stride; u_dist += DIST_STRIDE; - v_src += uv_src_stride; v_pre += uv_pre_stride; v_dist += DIST_STRIDE; u_count += uv_pre_stride; @@ -576,8 +547,6 @@ static void vp9_highbd_apply_temporal_filter_chroma_8( v_count += uv_pre_stride; v_accum += uv_pre_stride; - y_src += y_src_stride * (1 + ss_y); - y_pre += y_pre_stride * (1 + ss_y); y_dist += DIST_STRIDE * (1 + ss_y); // Then all the rows except the last one @@ -649,10 +618,8 @@ static void vp9_highbd_apply_temporal_filter_chroma_8( highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count, v_accum); - u_src += uv_src_stride; u_pre += uv_pre_stride; u_dist += DIST_STRIDE; - v_src += uv_src_stride; v_pre += uv_pre_stride; v_dist += DIST_STRIDE; u_count += uv_pre_stride; @@ -660,8 +627,6 @@ static void vp9_highbd_apply_temporal_filter_chroma_8( v_count += uv_pre_stride; v_accum += uv_pre_stride; - y_src += y_src_stride * (1 + ss_y); - y_pre += y_pre_stride * (1 + ss_y); y_dist += DIST_STRIDE * (1 + ss_y); } @@ -720,12 +685,10 @@ static void vp9_highbd_apply_temporal_filter_chroma_8( // Perform temporal filter for the chroma components. static void vp9_highbd_apply_temporal_filter_chroma( - const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre, - int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src, - int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre, - int uv_pre_stride, unsigned int block_width, unsigned int block_height, - int ss_x, int ss_y, int strength, const int *blk_fw, int use_whole_blk, - uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count, + const uint16_t *u_pre, const uint16_t *v_pre, int uv_pre_stride, + unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, + int strength, const int *blk_fw, int use_whole_blk, uint32_t *u_accum, + uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count, const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist) { const unsigned int uv_width = block_width >> ss_x, uv_height = block_height >> ss_y; @@ -755,8 +718,6 @@ static void vp9_highbd_apply_temporal_filter_chroma( if (use_whole_blk) { vp9_highbd_apply_temporal_filter_chroma_8( - y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, - u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, @@ -764,8 +725,6 @@ static void vp9_highbd_apply_temporal_filter_chroma( neighbors_fst, neighbors_snd, top_weight, bottom_weight, NULL); } else { vp9_highbd_apply_temporal_filter_chroma_8( - y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, - u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, @@ -789,13 +748,11 @@ static void vp9_highbd_apply_temporal_filter_chroma( } vp9_highbd_apply_temporal_filter_chroma_8( - y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, - u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col, - v_pre + uv_blk_col, uv_pre_stride, uv_width, uv_height, ss_x, ss_y, - strength, u_accum + uv_blk_col, u_count + uv_blk_col, - v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, - u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_fst, neighbors_snd, - top_weight, bottom_weight, NULL); + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_fst, + neighbors_snd, top_weight, bottom_weight, NULL); blk_col += blk_col_step; uv_blk_col += uv_blk_col_step; @@ -812,8 +769,6 @@ static void vp9_highbd_apply_temporal_filter_chroma( for (; uv_blk_col < uv_mid_width; blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { vp9_highbd_apply_temporal_filter_chroma_8( - y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, - u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, @@ -830,8 +785,6 @@ static void vp9_highbd_apply_temporal_filter_chroma( for (; uv_blk_col < uv_last_width; blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { vp9_highbd_apply_temporal_filter_chroma_8( - y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, - u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, @@ -849,13 +802,11 @@ static void vp9_highbd_apply_temporal_filter_chroma( } vp9_highbd_apply_temporal_filter_chroma_8( - y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, - u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col, - v_pre + uv_blk_col, uv_pre_stride, uv_width, uv_height, ss_x, ss_y, - strength, u_accum + uv_blk_col, u_count + uv_blk_col, - v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, - u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_fst, neighbors_snd, - top_weight, bottom_weight, NULL); + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_fst, + neighbors_snd, top_weight, bottom_weight, NULL); } void vp9_highbd_apply_temporal_filter_sse4_1( @@ -929,14 +880,12 @@ void vp9_highbd_apply_temporal_filter_sse4_1( u_dist_ptr = u_dist + 1; v_dist_ptr = v_dist + 1; - vp9_highbd_apply_temporal_filter_luma( - y_src, y_src_stride, y_pre, y_pre_stride, u_src, v_src, uv_src_stride, - u_pre, v_pre, uv_pre_stride, block_width, block_height, ss_x, ss_y, - strength, blk_fw, use_whole_blk, y_accum, y_count, y_dist_ptr, u_dist_ptr, - v_dist_ptr); + vp9_highbd_apply_temporal_filter_luma(y_pre, y_pre_stride, block_width, + block_height, ss_x, ss_y, strength, + blk_fw, use_whole_blk, y_accum, y_count, + y_dist_ptr, u_dist_ptr, v_dist_ptr); vp9_highbd_apply_temporal_filter_chroma( - y_src, y_src_stride, y_pre, y_pre_stride, u_src, v_src, uv_src_stride, u_pre, v_pre, uv_pre_stride, block_width, block_height, ss_x, ss_y, strength, blk_fw, use_whole_blk, u_accum, u_count, v_accum, v_count, y_dist_ptr, u_dist_ptr, v_dist_ptr); From ce484db2110fc0a81f9d4e6c1a09d132e1e2b2b7 Mon Sep 17 00:00:00 2001 From: Scott LaVarnway Date: Mon, 25 Jul 2022 10:58:44 -0700 Subject: [PATCH 0438/1000] VPX: vp9_quantize_fp_avx2() cleanup. No change in performance. Bug: b/237714063 Change-Id: I8ea42759cc4dc57be6a29c23784997cb90ad4090 --- test/vp9_quantize_test.cc | 2 + vp9/encoder/x86/vp9_quantize_avx2.c | 164 ++++++++++++++-------------- 2 files changed, 86 insertions(+), 80 deletions(-) diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc index 705f3f0077..97998eb08b 100644 --- a/test/vp9_quantize_test.cc +++ b/test/vp9_quantize_test.cc @@ -578,6 +578,8 @@ INSTANTIATE_TEST_SUITE_P(AVX, VP9QuantizeTest, INSTANTIATE_TEST_SUITE_P( AVX2, VP9QuantizeTest, ::testing::Values( + make_tuple(&QuantFPWrapper, + &QuantFPWrapper, VPX_BITS_8, 16, true), make_tuple(&vpx_highbd_quantize_b_avx2, &vpx_highbd_quantize_b_c, VPX_BITS_8, 16, false), make_tuple(&vpx_highbd_quantize_b_avx2, &vpx_highbd_quantize_b_c, diff --git a/vp9/encoder/x86/vp9_quantize_avx2.c b/vp9/encoder/x86/vp9_quantize_avx2.c index db18b1a7a4..5d02f4fe85 100644 --- a/vp9/encoder/x86/vp9_quantize_avx2.c +++ b/vp9/encoder/x86/vp9_quantize_avx2.c @@ -18,7 +18,7 @@ #include "vpx_dsp/x86/quantize_sse2.h" // Zero fill 8 positions in the output buffer. -static INLINE void store_zero_tran_low(tran_low_t *a) { +static VPX_FORCE_INLINE void store_zero_tran_low(tran_low_t *a) { const __m256i zero = _mm256_setzero_si256(); #if CONFIG_VP9_HIGHBITDEPTH _mm256_storeu_si256((__m256i *)(a), zero); @@ -28,22 +28,72 @@ static INLINE void store_zero_tran_low(tran_low_t *a) { #endif } -static INLINE __m256i scan_eob_256(const __m256i *iscan_ptr, - __m256i *coeff256) { - const __m256i iscan = _mm256_loadu_si256(iscan_ptr); - const __m256i zero256 = _mm256_setzero_si256(); +static VPX_FORCE_INLINE void load_fp_values_avx2( + const int16_t *round_ptr, __m256i *round, const int16_t *quant_ptr, + __m256i *quant, const int16_t *dequant_ptr, __m256i *dequant) { + *round = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)round_ptr)); + *round = _mm256_permute4x64_epi64(*round, 0x54); + *quant = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)quant_ptr)); + *quant = _mm256_permute4x64_epi64(*quant, 0x54); + *dequant = + _mm256_castsi128_si256(_mm_load_si128((const __m128i *)dequant_ptr)); + *dequant = _mm256_permute4x64_epi64(*dequant, 0x54); +} + +static VPX_FORCE_INLINE __m256i get_max_lane_eob(const int16_t *iscan, + __m256i v_eobmax, + __m256i v_mask) { + const __m256i v_iscan = _mm256_loadu_si256((const __m256i *)iscan); #if CONFIG_VP9_HIGHBITDEPTH - // The _mm256_packs_epi32() in load_tran_low() packs the 64 bit coeff as - // B1 A1 B0 A0. Shuffle to B1 B0 A1 A0 in order to scan eob correctly. - const __m256i _coeff256 = _mm256_permute4x64_epi64(*coeff256, 0xd8); - const __m256i zero_coeff0 = _mm256_cmpeq_epi16(_coeff256, zero256); + // typedef int32_t tran_low_t; + const __m256i v_iscan_perm = _mm256_permute4x64_epi64(v_iscan, 0xD8); + const __m256i v_iscan_plus1 = _mm256_sub_epi16(v_iscan_perm, v_mask); #else - const __m256i zero_coeff0 = _mm256_cmpeq_epi16(*coeff256, zero256); + // typedef int16_t tran_low_t; + const __m256i v_iscan_plus1 = _mm256_sub_epi16(v_iscan, v_mask); #endif - const __m256i nzero_coeff0 = _mm256_cmpeq_epi16(zero_coeff0, zero256); - // Add one to convert from indices to counts - const __m256i iscan_plus_one = _mm256_sub_epi16(iscan, nzero_coeff0); - return _mm256_and_si256(iscan_plus_one, nzero_coeff0); + const __m256i v_nz_iscan = _mm256_and_si256(v_iscan_plus1, v_mask); + return _mm256_max_epi16(v_eobmax, v_nz_iscan); +} + +static VPX_FORCE_INLINE uint16_t get_max_eob(__m256i eob256) { + const __m256i eob_lo = eob256; + // Copy upper 128 to lower 128 + const __m256i eob_hi = _mm256_permute2x128_si256(eob256, eob256, 0X81); + __m256i eob = _mm256_max_epi16(eob_lo, eob_hi); + __m256i eob_s = _mm256_shuffle_epi32(eob, 0xe); + eob = _mm256_max_epi16(eob, eob_s); + eob_s = _mm256_shufflelo_epi16(eob, 0xe); + eob = _mm256_max_epi16(eob, eob_s); + eob_s = _mm256_shufflelo_epi16(eob, 1); + eob = _mm256_max_epi16(eob, eob_s); + return (uint16_t)_mm256_extract_epi16(eob, 0); +} + +static VPX_FORCE_INLINE void quantize_fp_16( + const __m256i *round, const __m256i *quant, const __m256i *dequant, + const __m256i *thr, const tran_low_t *coeff_ptr, const int16_t *iscan_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, __m256i *eob_max) { + const __m256i coeff = load_tran_low(coeff_ptr); + const __m256i abs_coeff = _mm256_abs_epi16(coeff); + const int32_t nzflag = + _mm256_movemask_epi8(_mm256_cmpgt_epi16(abs_coeff, *thr)); + + if (nzflag) { + const __m256i tmp_rnd = _mm256_adds_epi16(abs_coeff, *round); + const __m256i abs_qcoeff = _mm256_mulhi_epi16(tmp_rnd, *quant); + const __m256i qcoeff = _mm256_sign_epi16(abs_qcoeff, coeff); + const __m256i dqcoeff = _mm256_mullo_epi16(qcoeff, *dequant); + const __m256i nz_mask = + _mm256_cmpgt_epi16(abs_qcoeff, _mm256_setzero_si256()); + store_tran_low(qcoeff, qcoeff_ptr); + store_tran_low(dqcoeff, dqcoeff_ptr); + + *eob_max = get_max_lane_eob(iscan_ptr, *eob_max, nz_mask); + } else { + store_zero_tran_low(qcoeff_ptr); + store_zero_tran_low(dqcoeff_ptr); + } } void vp9_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, @@ -51,10 +101,8 @@ void vp9_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { - __m128i eob; - __m256i round256, quant256, dequant256; - __m256i eob256, thr256; - + __m256i round, quant, dequant, thr; + __m256i eob_max = _mm256_setzero_si256(); (void)scan; coeff_ptr += n_coeffs; @@ -63,74 +111,30 @@ void vp9_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, dqcoeff_ptr += n_coeffs; n_coeffs = -n_coeffs; - { - __m256i coeff256; - - // Setup global values - { - const __m128i round = _mm_load_si128((const __m128i *)round_ptr); - const __m128i quant = _mm_load_si128((const __m128i *)quant_ptr); - const __m128i dequant = _mm_load_si128((const __m128i *)dequant_ptr); - round256 = _mm256_castsi128_si256(round); - round256 = _mm256_permute4x64_epi64(round256, 0x54); - - quant256 = _mm256_castsi128_si256(quant); - quant256 = _mm256_permute4x64_epi64(quant256, 0x54); - - dequant256 = _mm256_castsi128_si256(dequant); - dequant256 = _mm256_permute4x64_epi64(dequant256, 0x54); - } - - { - __m256i qcoeff256; - __m256i qtmp256; - coeff256 = load_tran_low(coeff_ptr + n_coeffs); - qcoeff256 = _mm256_abs_epi16(coeff256); - qcoeff256 = _mm256_adds_epi16(qcoeff256, round256); - qtmp256 = _mm256_mulhi_epi16(qcoeff256, quant256); - qcoeff256 = _mm256_sign_epi16(qtmp256, coeff256); - store_tran_low(qcoeff256, qcoeff_ptr + n_coeffs); - coeff256 = _mm256_mullo_epi16(qcoeff256, dequant256); - store_tran_low(coeff256, dqcoeff_ptr + n_coeffs); - } - - eob256 = scan_eob_256((const __m256i *)(iscan + n_coeffs), &coeff256); - n_coeffs += 8 * 2; - } + // Setup global values + load_fp_values_avx2(round_ptr, &round, quant_ptr, &quant, dequant_ptr, + &dequant); + thr = _mm256_setzero_si256(); - // remove dc constants - dequant256 = _mm256_permute2x128_si256(dequant256, dequant256, 0x31); - quant256 = _mm256_permute2x128_si256(quant256, quant256, 0x31); - round256 = _mm256_permute2x128_si256(round256, round256, 0x31); + quantize_fp_16(&round, &quant, &dequant, &thr, coeff_ptr + n_coeffs, + iscan + n_coeffs, qcoeff_ptr + n_coeffs, + dqcoeff_ptr + n_coeffs, &eob_max); - thr256 = _mm256_srai_epi16(dequant256, 1); + n_coeffs += 8 * 2; + + // remove dc constants + dequant = _mm256_permute2x128_si256(dequant, dequant, 0x31); + quant = _mm256_permute2x128_si256(quant, quant, 0x31); + round = _mm256_permute2x128_si256(round, round, 0x31); + thr = _mm256_srai_epi16(dequant, 1); // AC only loop while (n_coeffs < 0) { - __m256i coeff256 = load_tran_low(coeff_ptr + n_coeffs); - __m256i qcoeff256 = _mm256_abs_epi16(coeff256); - int32_t nzflag = - _mm256_movemask_epi8(_mm256_cmpgt_epi16(qcoeff256, thr256)); - - if (nzflag) { - __m256i qtmp256; - qcoeff256 = _mm256_adds_epi16(qcoeff256, round256); - qtmp256 = _mm256_mulhi_epi16(qcoeff256, quant256); - qcoeff256 = _mm256_sign_epi16(qtmp256, coeff256); - store_tran_low(qcoeff256, qcoeff_ptr + n_coeffs); - coeff256 = _mm256_mullo_epi16(qcoeff256, dequant256); - store_tran_low(coeff256, dqcoeff_ptr + n_coeffs); - eob256 = _mm256_max_epi16( - eob256, scan_eob_256((const __m256i *)(iscan + n_coeffs), &coeff256)); - } else { - store_zero_tran_low(qcoeff_ptr + n_coeffs); - store_zero_tran_low(dqcoeff_ptr + n_coeffs); - } + quantize_fp_16(&round, &quant, &dequant, &thr, coeff_ptr + n_coeffs, + iscan + n_coeffs, qcoeff_ptr + n_coeffs, + dqcoeff_ptr + n_coeffs, &eob_max); n_coeffs += 8 * 2; } - eob = _mm_max_epi16(_mm256_castsi256_si128(eob256), - _mm256_extracti128_si256(eob256, 1)); - - *eob_ptr = accumulate_eob(eob); + *eob_ptr = get_max_eob(eob_max); } From 1c0c4d51b474585d05b36d2d70af6b20f507c931 Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 26 Jul 2022 19:08:40 -0700 Subject: [PATCH 0439/1000] y4m_input_fetch_frame: fix ubsan null/zero offset warning reported under clang-13; use a while loop in file_read() to force a size check before attempting to read. buf (aux_buf) may be may be null when no conversion is necessary. y4minput.c:29:43: runtime error: applying zero offset to null pointer Bug: b/229626362 Change-Id: Ia3250d6ff9c325faf48eaa31f4399e20837f8f7b --- y4minput.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/y4minput.c b/y4minput.c index 7d3c03a7fc..745e2f1cd6 100644 --- a/y4minput.c +++ b/y4minput.c @@ -21,12 +21,13 @@ // Reads 'size' bytes from 'file' into 'buf' with some fault tolerance. // Returns true on success. static int file_read(void *buf, size_t size, FILE *file) { - const int kMaxRetries = 5; - int retry_count = 0; - int file_error; + const int kMaxTries = 5; + int try_count = 0; + int file_error = 0; size_t len = 0; - do { + while (!feof(file) && len < size && try_count < kMaxTries) { const size_t n = fread((uint8_t *)buf + len, 1, size - len, file); + ++try_count; len += n; file_error = ferror(file); if (file_error) { @@ -39,13 +40,13 @@ static int file_read(void *buf, size_t size, FILE *file) { return 0; } } - } while (!feof(file) && len < size && ++retry_count < kMaxRetries); + } if (!feof(file) && len != size) { fprintf(stderr, "Error reading file: %u of %u bytes read," - " error: %d, retries: %d, %d: %s\n", - (uint32_t)len, (uint32_t)size, file_error, retry_count, errno, + " error: %d, tries: %d, %d: %s\n", + (uint32_t)len, (uint32_t)size, file_error, try_count, errno, strerror(errno)); } return len == size; From ed78231aa54c131018c0c9415cf416beac97a698 Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 26 Jul 2022 19:26:23 -0700 Subject: [PATCH 0440/1000] vp9,decoder_decode: fix ubsan null/zero offset warning reported under clang-13. null data may be passed as a flush; move data_end after that check vp9/vp9_dx_iface.c:337:40: runtime error: applying zero offset to null pointer Bug: b/229626362 Change-Id: I845726fd6eb6ac7a776e49272c6477a5ad30ffdf --- vp9/vp9_dx_iface.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c index 3c42c7dfed..bdfe217936 100644 --- a/vp9/vp9_dx_iface.c +++ b/vp9/vp9_dx_iface.c @@ -334,7 +334,6 @@ static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx, const uint8_t *data, unsigned int data_sz, void *user_priv, long deadline) { const uint8_t *data_start = data; - const uint8_t *const data_end = data + data_sz; vpx_codec_err_t res; uint32_t frame_sizes[8]; int frame_count; @@ -362,6 +361,7 @@ static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx, // Decode in serial mode. if (frame_count > 0) { + const uint8_t *const data_end = data + data_sz; int i; for (i = 0; i < frame_count; ++i) { @@ -379,6 +379,7 @@ static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx, data_start += frame_size; } } else { + const uint8_t *const data_end = data + data_sz; while (data_start < data_end) { const uint32_t frame_size = (uint32_t)(data_end - data_start); const vpx_codec_err_t res = From 7dab508cd9c8fc8f0ac44bfb380c31e25919d883 Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 27 Jul 2022 13:29:41 -0700 Subject: [PATCH 0441/1000] encode_test_driver: normalize frame_flags type use vpx_enc_frame_flags_t; this avoids int -> unsigned conversion warnings; reported w/clang -fsanitize=integer: test/error_resilience_test.cc:95:9: runtime error: implicit conversion from type 'int' of value -12845057 (32-bit, signed) to type 'unsigned long' changed the value to 4282122239 (32-bit, unsigned) Bug: b/229626362 Change-Id: I0fc1dbe44a258f397cf1a05347d8cb86ee70b1b8 --- test/encode_test_driver.cc | 5 +++-- test/encode_test_driver.h | 6 +++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/test/encode_test_driver.cc b/test/encode_test_driver.cc index 9ca15ae4d3..d3feeee34d 100644 --- a/test/encode_test_driver.cc +++ b/test/encode_test_driver.cc @@ -52,7 +52,8 @@ void Encoder::InitEncoder(VideoSource *video) { } } -void Encoder::EncodeFrame(VideoSource *video, const unsigned long frame_flags) { +void Encoder::EncodeFrame(VideoSource *video, + const vpx_enc_frame_flags_t frame_flags) { if (video->img()) { EncodeFrameInternal(*video, frame_flags); } else { @@ -70,7 +71,7 @@ void Encoder::EncodeFrame(VideoSource *video, const unsigned long frame_flags) { } void Encoder::EncodeFrameInternal(const VideoSource &video, - const unsigned long frame_flags) { + const vpx_enc_frame_flags_t frame_flags) { vpx_codec_err_t res; const vpx_image_t *img = video.img(); diff --git a/test/encode_test_driver.h b/test/encode_test_driver.h index f6bb841d8c..b57df85291 100644 --- a/test/encode_test_driver.h +++ b/test/encode_test_driver.h @@ -103,7 +103,7 @@ class Encoder { } // This is a thin wrapper around vpx_codec_encode(), so refer to // vpx_encoder.h for its semantics. - void EncodeFrame(VideoSource *video, const unsigned long frame_flags); + void EncodeFrame(VideoSource *video, vpx_enc_frame_flags_t frame_flags); // Convenience wrapper for EncodeFrame() void EncodeFrame(VideoSource *video) { EncodeFrame(video, 0); } @@ -184,7 +184,7 @@ class Encoder { // Encode an image void EncodeFrameInternal(const VideoSource &video, - const unsigned long frame_flags); + vpx_enc_frame_flags_t frame_flags); // Flush the encoder on EOS void Flush(); @@ -289,7 +289,7 @@ class EncoderTest { unsigned long deadline_; TwopassStatsStore stats_; unsigned long init_flags_; - unsigned long frame_flags_; + vpx_enc_frame_flags_t frame_flags_; }; } // namespace libvpx_test From 9763f3c549569df3bad776a02537a97b2d203a3c Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 27 Jul 2022 15:15:25 -0700 Subject: [PATCH 0442/1000] vp8_find_near_mvs: fix implicit conversion warnings unsigned -> int and vice versa reported by clang -fsanitize=integer vp8/common/findnearmv.c:108:11: runtime error: implicit conversion from type 'uint32_t' (aka 'unsigned int') of value 4294443008 (32-bit, unsigned) to type 'int' changed the value to -524288 (32-bit, signed) vp8/common/findnearmv.c:110:33: runtime error: implicit conversion from type 'int' of value -524288 (32-bit, signed) to type 'uint32_t' (aka 'unsigned int') changed the value to 4294443008 (32-bit, unsigned) Bug: b/229626362 Change-Id: Ic7ce0fd98255ccf9307ac73e9fb6a8189b268214 --- vp8/common/findnearmv.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vp8/common/findnearmv.c b/vp8/common/findnearmv.c index 6889fdedde..3b31923621 100644 --- a/vp8/common/findnearmv.c +++ b/vp8/common/findnearmv.c @@ -105,9 +105,9 @@ void vp8_find_near_mvs(MACROBLOCKD *xd, const MODE_INFO *here, int_mv *nearest, tmp = near_mv_ref_cnts[CNT_NEAREST]; near_mv_ref_cnts[CNT_NEAREST] = near_mv_ref_cnts[CNT_NEAR]; near_mv_ref_cnts[CNT_NEAR] = tmp; - tmp = near_mvs[CNT_NEAREST].as_int; + tmp = (int)near_mvs[CNT_NEAREST].as_int; near_mvs[CNT_NEAREST].as_int = near_mvs[CNT_NEAR].as_int; - near_mvs[CNT_NEAR].as_int = tmp; + near_mvs[CNT_NEAR].as_int = (uint32_t)tmp; } /* Use near_mvs[0] to store the "best" MV */ From b6d06a6e268d945b960660a91e77195c2612642c Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 27 Jul 2022 15:31:00 -0700 Subject: [PATCH 0443/1000] vp8,read_mb_modes_mv: fix implicit conversion warnings w/clang -fsanitize=integer fixes warnings of the form: implicit conversion from type 'uint32_t' (aka 'unsigned int') of value 4294443008 (32-bit, unsigned) to type 'int' changed the value to -524288 (32-bit, signed) Bug: b/229626362 Change-Id: Ic7c0a2e7b64a1dd6fd5cc64adcd5765318c2a956 --- vp8/decoder/decodemv.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vp8/decoder/decodemv.c b/vp8/decoder/decodemv.c index 51817a2cb9..3f459d623f 100644 --- a/vp8/decoder/decodemv.c +++ b/vp8/decoder/decodemv.c @@ -372,9 +372,9 @@ static void read_mb_modes_mv(VP8D_COMP *pbi, MODE_INFO *mi, tmp = cnt[CNT_NEAREST]; cnt[CNT_NEAREST] = cnt[CNT_NEAR]; cnt[CNT_NEAR] = tmp; - tmp = near_mvs[CNT_NEAREST].as_int; + tmp = (int)near_mvs[CNT_NEAREST].as_int; near_mvs[CNT_NEAREST].as_int = near_mvs[CNT_NEAR].as_int; - near_mvs[CNT_NEAR].as_int = tmp; + near_mvs[CNT_NEAR].as_int = (uint32_t)tmp; } if (vp8_read(bc, vp8_mode_contexts[cnt[CNT_NEAREST]][1])) { From aecf7ba51af515545d9f26ea9c66f347d6c40814 Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 27 Jul 2022 15:41:00 -0700 Subject: [PATCH 0444/1000] variance_avx2.c: fix implicit conversion warnings w/clang -fsanitize=integer fixes warnings of the form: implicit conversion from type 'int' of value -1323 (32-bit, signed) to type 'unsigned int' changed the value to 4294965973 (32-bit, unsigned) Bug: b/229626362 Change-Id: I7291d9bd5cacea0d88d9f4c4624c096764f4a472 --- vpx_dsp/x86/variance_avx2.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/vpx_dsp/x86/variance_avx2.c b/vpx_dsp/x86/variance_avx2.c index 9232acbfbb..35925d5908 100644 --- a/vpx_dsp/x86/variance_avx2.c +++ b/vpx_dsp/x86/variance_avx2.c @@ -590,17 +590,20 @@ static INLINE int sub_pix_var32xh(const uint8_t *src, int src_stride, return sum; } -static unsigned int sub_pixel_variance32xh_avx2( - const uint8_t *src, int src_stride, int x_offset, int y_offset, - const uint8_t *dst, int dst_stride, int height, unsigned int *sse) { +static int sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride, + int x_offset, int y_offset, + const uint8_t *dst, int dst_stride, + int height, unsigned int *sse) { return sub_pix_var32xh(src, src_stride, x_offset, y_offset, dst, dst_stride, NULL, 0, 0, height, sse); } -static unsigned int sub_pixel_avg_variance32xh_avx2( - const uint8_t *src, int src_stride, int x_offset, int y_offset, - const uint8_t *dst, int dst_stride, const uint8_t *second_pred, - int second_stride, int height, unsigned int *sse) { +static int sub_pixel_avg_variance32xh_avx2(const uint8_t *src, int src_stride, + int x_offset, int y_offset, + const uint8_t *dst, int dst_stride, + const uint8_t *second_pred, + int second_stride, int height, + unsigned int *sse) { return sub_pix_var32xh(src, src_stride, x_offset, y_offset, dst, dst_stride, second_pred, second_stride, 1, height, sse); } From e533d989ea93ee6cd980cc07e077475733694687 Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 27 Jul 2022 15:48:24 -0700 Subject: [PATCH 0445/1000] vp9_filter_block_plane_non420: fix implicit conversion warnings w/clang -fsanitize=integer fixes warnings of the form: implicit conversion from type 'int' of value -2 (32-bit, signed) to type 'unsigned int' changed the value to 4294967294 (32-bit, unsigned) Bug: b/229626362 Change-Id: Id7e13b3d494ccd1a2351db8fab6fdb6a9a771d51 --- vp9/common/vp9_loopfilter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c index 95d6029f3b..765cb11726 100644 --- a/vp9/common/vp9_loopfilter.c +++ b/vp9/common/vp9_loopfilter.c @@ -1180,7 +1180,7 @@ void vp9_filter_block_plane_non420(VP9_COMMON *cm, } // Disable filtering on the leftmost column - border_mask = ~(mi_col == 0 ? 1 : 0); + border_mask = ~(mi_col == 0 ? 1u : 0u); #if CONFIG_VP9_HIGHBITDEPTH if (cm->use_highbitdepth) { highbd_filter_selectively_vert( From 4667992d8ba51d60045d6fed705635a6455eb4f8 Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 27 Jul 2022 15:22:37 -0700 Subject: [PATCH 0446/1000] x86: normalize type with _mm_cvtsi128_si32 prefer int in most cases w/clang -fsanitize=integer fixes warnings of the form: implicit conversion from type 'int' of value -809931979 (32-bit, signed) to type 'uint32_t' (aka 'unsigned int') changed the value to 3485035317 (32-bit, unsigned) Bug: b/229626362 Change-Id: I0c6604efc188f2660c531eddfc7aa10060637813 --- vp8/encoder/x86/denoising_sse2.c | 2 +- vpx_dsp/x86/avg_intrin_sse2.c | 2 +- vpx_dsp/x86/convolve_avx2.h | 5 ++--- vpx_dsp/x86/mem_sse2.h | 2 +- vpx_dsp/x86/sad_avx2.c | 15 ++++++--------- vpx_dsp/x86/variance_sse2.c | 2 +- vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c | 6 +++--- vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c | 2 +- vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c | 6 +++--- 9 files changed, 19 insertions(+), 23 deletions(-) diff --git a/vp8/encoder/x86/denoising_sse2.c b/vp8/encoder/x86/denoising_sse2.c index 89cad53356..f35b930169 100644 --- a/vp8/encoder/x86/denoising_sse2.c +++ b/vp8/encoder/x86/denoising_sse2.c @@ -30,7 +30,7 @@ static INLINE unsigned int abs_sum_diff_16x1(__m128i acc_diff) { _mm_add_epi32(hg_fe_dc_ba, _mm_srli_si128(hg_fe_dc_ba, 8)); const __m128i hgfedcba = _mm_add_epi32(hgfe_dcba, _mm_srli_si128(hgfe_dcba, 4)); - unsigned int sum_diff = abs(_mm_cvtsi128_si32(hgfedcba)); + unsigned int sum_diff = (unsigned int)abs(_mm_cvtsi128_si32(hgfedcba)); return sum_diff; } diff --git a/vpx_dsp/x86/avg_intrin_sse2.c b/vpx_dsp/x86/avg_intrin_sse2.c index 0c4919f6d8..015c11a1f3 100644 --- a/vpx_dsp/x86/avg_intrin_sse2.c +++ b/vpx_dsp/x86/avg_intrin_sse2.c @@ -164,7 +164,7 @@ unsigned int vpx_highbd_avg_8x8_sse2(const uint8_t *s8, int p) { s0 = _mm_add_epi32(s0, s1); s0 = _mm_add_epi32(s0, _mm_srli_si128(s0, 8)); s0 = _mm_add_epi32(s0, _mm_srli_si128(s0, 4)); - avg = _mm_cvtsi128_si32(s0); + avg = (unsigned int)_mm_cvtsi128_si32(s0); return (avg + 32) >> 6; } diff --git a/vpx_dsp/x86/convolve_avx2.h b/vpx_dsp/x86/convolve_avx2.h index 99bc9637fc..ebee964b18 100644 --- a/vpx_dsp/x86/convolve_avx2.h +++ b/vpx_dsp/x86/convolve_avx2.h @@ -129,9 +129,8 @@ static INLINE void mm256_storeu2_epi64(__m128i *const dst_ptr_1, static INLINE void mm256_storeu2_epi32(__m128i *const dst_ptr_1, __m128i *const dst_ptr_2, const __m256i *const src) { - *((uint32_t *)(dst_ptr_1)) = _mm_cvtsi128_si32(_mm256_castsi256_si128(*src)); - *((uint32_t *)(dst_ptr_2)) = - _mm_cvtsi128_si32(_mm256_extractf128_si256(*src, 1)); + *((int *)(dst_ptr_1)) = _mm_cvtsi128_si32(_mm256_castsi256_si128(*src)); + *((int *)(dst_ptr_2)) = _mm_cvtsi128_si32(_mm256_extractf128_si256(*src, 1)); } static INLINE __m256i mm256_round_epi32(const __m256i *const src, diff --git a/vpx_dsp/x86/mem_sse2.h b/vpx_dsp/x86/mem_sse2.h index 8b6d4d1dd4..6df4773f73 100644 --- a/vpx_dsp/x86/mem_sse2.h +++ b/vpx_dsp/x86/mem_sse2.h @@ -33,7 +33,7 @@ static INLINE __m128i load_unaligned_u32(const void *a) { } static INLINE void store_unaligned_u32(void *const a, const __m128i v) { - const uint32_t val = _mm_cvtsi128_si32(v); + const int val = _mm_cvtsi128_si32(v); memcpy(a, &val, sizeof(val)); } diff --git a/vpx_dsp/x86/sad_avx2.c b/vpx_dsp/x86/sad_avx2.c index 3b48acd510..29bedb0e6e 100644 --- a/vpx_dsp/x86/sad_avx2.c +++ b/vpx_dsp/x86/sad_avx2.c @@ -14,7 +14,7 @@ #define FSAD64_H(h) \ unsigned int vpx_sad64x##h##_avx2(const uint8_t *src_ptr, int src_stride, \ const uint8_t *ref_ptr, int ref_stride) { \ - int i, res; \ + int i; \ __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \ __m256i sum_sad = _mm256_setzero_si256(); \ __m256i sum_sad_h; \ @@ -35,8 +35,7 @@ sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \ sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \ sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \ - res = _mm_cvtsi128_si32(sum_sad128); \ - return res; \ + return (unsigned int)_mm_cvtsi128_si32(sum_sad128); \ } #define FSAD32_H(h) \ @@ -92,7 +91,7 @@ FSAD32 unsigned int vpx_sad64x##h##_avg_avx2( \ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ int ref_stride, const uint8_t *second_pred) { \ - int i, res; \ + int i; \ __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \ __m256i sum_sad = _mm256_setzero_si256(); \ __m256i sum_sad_h; \ @@ -118,15 +117,14 @@ FSAD32 sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \ sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \ sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \ - res = _mm_cvtsi128_si32(sum_sad128); \ - return res; \ + return (unsigned int)_mm_cvtsi128_si32(sum_sad128); \ } #define FSADAVG32_H(h) \ unsigned int vpx_sad32x##h##_avg_avx2( \ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ int ref_stride, const uint8_t *second_pred) { \ - int i, res; \ + int i; \ __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \ __m256i sum_sad = _mm256_setzero_si256(); \ __m256i sum_sad_h; \ @@ -156,8 +154,7 @@ FSAD32 sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \ sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \ sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \ - res = _mm_cvtsi128_si32(sum_sad128); \ - return res; \ + return (unsigned int)_mm_cvtsi128_si32(sum_sad128); \ } #define FSADAVG64 \ diff --git a/vpx_dsp/x86/variance_sse2.c b/vpx_dsp/x86/variance_sse2.c index a67c92aadb..fedc8b84e5 100644 --- a/vpx_dsp/x86/variance_sse2.c +++ b/vpx_dsp/x86/variance_sse2.c @@ -19,7 +19,7 @@ static INLINE unsigned int add32x4_sse2(__m128i val) { val = _mm_add_epi32(val, _mm_srli_si128(val, 8)); val = _mm_add_epi32(val, _mm_srli_si128(val, 4)); - return _mm_cvtsi128_si32(val); + return (unsigned int)_mm_cvtsi128_si32(val); } unsigned int vpx_get_mb_ss_sse2(const int16_t *src_ptr) { diff --git a/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c b/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c index 0cbd151dc3..21a35ae3c3 100644 --- a/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c +++ b/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c @@ -485,7 +485,7 @@ static void vpx_filter_block1d4_h4_sse2(const uint8_t *src_ptr, // Saturate and convert to 8-bit words dst_first = _mm_packus_epi16(dst_first, _mm_setzero_si128()); - *((uint32_t *)(dst_ptr)) = _mm_cvtsi128_si32(dst_first); + *((int *)(dst_ptr)) = _mm_cvtsi128_si32(dst_first); src_ptr += src_stride; dst_ptr += dst_stride; @@ -589,8 +589,8 @@ static void vpx_filter_block1d4_v4_sse2(const uint8_t *src_ptr, res_reg_0123 = _mm_packus_epi16(res_reg_0123_lo, reg_zero); // Save only half of the register (8 words) - *((uint32_t *)(dst_ptr)) = _mm_cvtsi128_si32(res_reg_m1012); - *((uint32_t *)(dst_ptr + dst_stride)) = _mm_cvtsi128_si32(res_reg_0123); + *((int *)(dst_ptr)) = _mm_cvtsi128_si32(res_reg_m1012); + *((int *)(dst_ptr + dst_stride)) = _mm_cvtsi128_si32(res_reg_0123); // Update the source by two rows src_ptr += src_stride_unrolled; diff --git a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c index 6f2983a4b5..db3c39de0f 100644 --- a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c +++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c @@ -798,7 +798,7 @@ static void vpx_filter_block1d4_h4_avx2(const uint8_t *src_ptr, // Pack to 8-bits dst = _mm_packus_epi16(dst, _mm_setzero_si128()); - *((uint32_t *)(dst_ptr)) = _mm_cvtsi128_si32(dst); + *((int *)(dst_ptr)) = _mm_cvtsi128_si32(dst); } } diff --git a/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c b/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c index ed46d6245d..4ea2752d38 100644 --- a/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c +++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c @@ -580,7 +580,7 @@ static void vpx_filter_block1d4_h4_ssse3(const uint8_t *src_ptr, // Pack to 8-bits dst_first = _mm_packus_epi16(dst_first, _mm_setzero_si128()); - *((uint32_t *)(dst_ptr)) = _mm_cvtsi128_si32(dst_first); + *((int *)(dst_ptr)) = _mm_cvtsi128_si32(dst_first); src_ptr += src_stride; dst_ptr += dst_stride; @@ -666,8 +666,8 @@ static void vpx_filter_block1d4_v4_ssse3(const uint8_t *src_ptr, reg_1 = _mm_packus_epi16(reg_1, reg_1); // Save the result - *((uint32_t *)(dst_ptr)) = _mm_cvtsi128_si32(reg_0); - *((uint32_t *)(dst_ptr + dst_stride)) = _mm_cvtsi128_si32(reg_1); + *((int *)(dst_ptr)) = _mm_cvtsi128_si32(reg_0); + *((int *)(dst_ptr + dst_stride)) = _mm_cvtsi128_si32(reg_1); // Update the source by two rows src_ptr += src_stride_unrolled; From 1ce49998f751c711325d9fdc93d37d61a73465d5 Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 27 Jul 2022 18:56:22 -0700 Subject: [PATCH 0447/1000] vp9_active_[hv]_edge: add missing vpx_clear_system_state this fixes runtime errors with clang -fsanitize=integer in x86 builds: ../vp9/encoder/vp9_rdopt.c:3250:17: runtime error: signed integer overflow: 18 - -2147483648 cannot be represented in type 'int' ../vp9/encoder/vp9_rdopt.c:3277:16: runtime error: signed integer overflow: 26 - -2147483648 cannot be represented in type 'int' Bug: b/229626362 Change-Id: Ic9a5063c840b4fce7056f61362234721add056a6 --- vp9/encoder/vp9_rdopt.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index 3b574ef172..bfde5ab1a5 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -3242,6 +3242,7 @@ int vp9_active_h_edge(VP9_COMP *cpi, int mi_row, int mi_step) { // For two pass account for any formatting bars detected. if (cpi->oxcf.pass == 2) { TWO_PASS *twopass = &cpi->twopass; + vpx_clear_system_state(); // The inactive region is specified in MBs not mi units. // The image edge is in the following MB row. @@ -3269,6 +3270,7 @@ int vp9_active_v_edge(VP9_COMP *cpi, int mi_col, int mi_step) { // For two pass account for any formatting bars detected. if (cpi->oxcf.pass == 2) { TWO_PASS *twopass = &cpi->twopass; + vpx_clear_system_state(); // The inactive region is specified in MBs not mi units. // The image edge is in the following MB row. From b3536cfafe7c56fa24bfec35e9cacc2082065bd2 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Fri, 29 Jul 2022 09:40:53 +0000 Subject: [PATCH 0448/1000] Provide Arm SDOT optimizations for SAD functions Change-Id: I497ee1c45d1fc4d643cefad7d87e5aaacd77869c --- vpx_dsp/arm/sad_neon.c | 306 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 306 insertions(+) diff --git a/vpx_dsp/arm/sad_neon.c b/vpx_dsp/arm/sad_neon.c index b1509d883a..34870375a3 100644 --- a/vpx_dsp/arm/sad_neon.c +++ b/vpx_dsp/arm/sad_neon.c @@ -21,9 +21,16 @@ uint32_t vpx_sad4x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride) { const uint8x16_t src_u8 = load_unaligned_u8q(src_ptr, src_stride); const uint8x16_t ref_u8 = load_unaligned_u8q(ref_ptr, ref_stride); +#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \ + (__ARM_FEATURE_DOTPROD == 1) + const uint8x16_t sad_u8 = vabdq_u8(src_u8, ref_u8); + const uint32x4_t dp = vdotq_u32(vdupq_n_u32(0), sad_u8, vdupq_n_u8(1)); + return horizontal_add_uint32x4(dp); +#else uint16x8_t abs = vabdl_u8(vget_low_u8(src_u8), vget_low_u8(ref_u8)); abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(ref_u8)); return horizontal_add_uint16x8(abs); +#endif } uint32_t vpx_sad4x4_avg_neon(const uint8_t *src_ptr, int src_stride, @@ -33,13 +40,36 @@ uint32_t vpx_sad4x4_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8x16_t ref_u8 = load_unaligned_u8q(ref_ptr, ref_stride); const uint8x16_t second_pred_u8 = vld1q_u8(second_pred); const uint8x16_t avg = vrhaddq_u8(ref_u8, second_pred_u8); +#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \ + (__ARM_FEATURE_DOTPROD == 1) + const uint8x16_t sad_u8 = vabdq_u8(src_u8, avg); + const uint32x4_t prod = vdotq_u32(vdupq_n_u32(0), sad_u8, vdupq_n_u8(1)); + return horizontal_add_uint32x4(prod); +#else uint16x8_t abs = vabdl_u8(vget_low_u8(src_u8), vget_low_u8(avg)); abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(avg)); return horizontal_add_uint16x8(abs); +#endif } uint32_t vpx_sad4x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride) { +#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \ + (__ARM_FEATURE_DOTPROD == 1) + uint32x4_t prod = vdupq_n_u32(0); + const uint8x16_t ones = vdupq_n_u8(1); + const uint8x16_t src1_u8 = load_unaligned_u8q(src_ptr, src_stride); + const uint8x16_t ref1_u8 = load_unaligned_u8q(ref_ptr, ref_stride); + const uint8x16_t src2_u8 = + load_unaligned_u8q(src_ptr + 4 * src_stride, src_stride); + const uint8x16_t ref2_u8 = + load_unaligned_u8q(ref_ptr + 4 * ref_stride, ref_stride); + const uint8x16_t sad1_u8 = vabdq_u8(src1_u8, ref1_u8); + const uint8x16_t sad2_u8 = vabdq_u8(src2_u8, ref2_u8); + prod = vdotq_u32(prod, sad1_u8, ones); + prod = vdotq_u32(prod, sad2_u8, ones); + return horizontal_add_uint32x4(prod); +#else int i; uint16x8_t abs = vdupq_n_u16(0); for (i = 0; i < 8; i += 4) { @@ -52,11 +82,32 @@ uint32_t vpx_sad4x8_neon(const uint8_t *src_ptr, int src_stride, } return horizontal_add_uint16x8(abs); +#endif } uint32_t vpx_sad4x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred) { +#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \ + (__ARM_FEATURE_DOTPROD == 1) + uint32x4_t prod = vdupq_n_u32(0); + const uint8x16_t ones = vdupq_n_u8(1); + const uint8x16_t src1_u8 = load_unaligned_u8q(src_ptr, src_stride); + const uint8x16_t ref1_u8 = load_unaligned_u8q(ref_ptr, ref_stride); + const uint8x16_t src2_u8 = + load_unaligned_u8q(src_ptr + 4 * src_stride, src_stride); + const uint8x16_t ref2_u8 = + load_unaligned_u8q(ref_ptr + 4 * ref_stride, ref_stride); + const uint8x16_t second_pred1_u8 = vld1q_u8(second_pred); + const uint8x16_t second_pred2_u8 = vld1q_u8(second_pred + 16); + const uint8x16_t avg1 = vrhaddq_u8(ref1_u8, second_pred1_u8); + const uint8x16_t avg2 = vrhaddq_u8(ref2_u8, second_pred2_u8); + const uint8x16_t sad1_u8 = vabdq_u8(src1_u8, avg1); + const uint8x16_t sad2_u8 = vabdq_u8(src2_u8, avg2); + prod = vdotq_u32(prod, sad1_u8, ones); + prod = vdotq_u32(prod, sad2_u8, ones); + return horizontal_add_uint32x4(prod); +#else int i; uint16x8_t abs = vdupq_n_u16(0); for (i = 0; i < 8; i += 4) { @@ -72,8 +123,66 @@ uint32_t vpx_sad4x8_avg_neon(const uint8_t *src_ptr, int src_stride, } return horizontal_add_uint16x8(abs); +#endif } +#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \ + (__ARM_FEATURE_DOTPROD == 1) +static INLINE uint32x2_t sad8x(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + const int height) { + int i; + uint32x2_t prod = vdup_n_u32(0); + const uint8x8_t ones = vdup_n_u8(1); + for (i = 0; i < height; ++i) { + const uint8x8_t a_u8 = vld1_u8(src_ptr); + const uint8x8_t b_u8 = vld1_u8(ref_ptr); + const uint8x8_t sad_u8 = vabd_u8(a_u8, b_u8); + src_ptr += src_stride; + ref_ptr += ref_stride; + prod = vdot_u32(prod, sad_u8, ones); + } + return prod; +} + +static INLINE uint32x2_t sad8x_avg(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + const uint8_t *second_pred, + const int height) { + int i; + uint32x2_t prod = vdup_n_u32(0); + const uint8x8_t ones = vdup_n_u8(1); + for (i = 0; i < height; ++i) { + const uint8x8_t a_u8 = vld1_u8(src_ptr); + const uint8x8_t b_u8 = vld1_u8(ref_ptr); + const uint8x8_t c_u8 = vld1_u8(second_pred); + const uint8x8_t avg = vrhadd_u8(b_u8, c_u8); + const uint8x8_t sad_u8 = vabd_u8(a_u8, avg); + src_ptr += src_stride; + ref_ptr += ref_stride; + second_pred += 8; + prod = vdot_u32(prod, sad_u8, ones); + } + return prod; +} + +#define SAD8XN(n) \ + uint32_t vpx_sad8x##n##_neon(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride) { \ + const uint32x2_t prod = \ + sad8x(src_ptr, src_stride, ref_ptr, ref_stride, n); \ + return horizontal_add_uint32x2(prod); \ + } \ + \ + uint32_t vpx_sad8x##n##_avg_neon(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + const uint8_t *second_pred) { \ + const uint32x2_t prod = \ + sad8x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \ + return horizontal_add_uint32x2(prod); \ + } + +#else static INLINE uint16x8_t sad8x(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const int height) { @@ -124,11 +233,68 @@ static INLINE uint16x8_t sad8x_avg(const uint8_t *src_ptr, int src_stride, sad8x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \ return horizontal_add_uint16x8(abs); \ } +#endif SAD8XN(4) SAD8XN(8) SAD8XN(16) +#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \ + (__ARM_FEATURE_DOTPROD == 1) +static INLINE uint32x4_t sad16x(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + const int height) { + int i; + uint32x4_t prod = vdupq_n_u32(0); + const uint8x16_t ones = vdupq_n_u8(1); + for (i = 0; i < height; ++i) { + const uint8x16_t src_u8 = vld1q_u8(src_ptr); + const uint8x16_t ref_u8 = vld1q_u8(ref_ptr); + const uint8x16_t sad_u8 = vabdq_u8(src_u8, ref_u8); + src_ptr += src_stride; + ref_ptr += ref_stride; + prod = vdotq_u32(prod, sad_u8, ones); + } + return prod; +} + +static INLINE uint32x4_t sad16x_avg(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + const uint8_t *second_pred, + const int height) { + int i; + uint32x4_t prod = vdupq_n_u32(0); + const uint8x16_t ones = vdupq_n_u8(1); + for (i = 0; i < height; ++i) { + const uint8x16_t a_u8 = vld1q_u8(src_ptr); + const uint8x16_t b_u8 = vld1q_u8(ref_ptr); + const uint8x16_t c_u8 = vld1q_u8(second_pred); + const uint8x16_t avg = vrhaddq_u8(b_u8, c_u8); + const uint8x16_t sad_u8 = vabdq_u8(a_u8, avg); + src_ptr += src_stride; + ref_ptr += ref_stride; + second_pred += 16; + prod = vdotq_u32(prod, sad_u8, ones); + } + return prod; +} + +#define SAD16XN(n) \ + uint32_t vpx_sad16x##n##_neon(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride) { \ + const uint32x4_t prod = \ + sad16x(src_ptr, src_stride, ref_ptr, ref_stride, n); \ + return horizontal_add_uint32x4(prod); \ + } \ + \ + uint32_t vpx_sad16x##n##_avg_neon(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + const uint8_t *second_pred) { \ + const uint32x4_t prod = \ + sad16x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \ + return horizontal_add_uint32x4(prod); \ + } +#else static INLINE uint16x8_t sad16x(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const int height) { @@ -182,11 +348,79 @@ static INLINE uint16x8_t sad16x_avg(const uint8_t *src_ptr, int src_stride, sad16x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \ return horizontal_add_uint16x8(abs); \ } +#endif SAD16XN(8) SAD16XN(16) SAD16XN(32) +#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \ + (__ARM_FEATURE_DOTPROD == 1) +static INLINE uint32x4_t sad32x(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + const int height) { + int i; + uint32x4_t prod = vdupq_n_u32(0); + const uint8x16_t ones = vdupq_n_u8(1); + for (i = 0; i < height; ++i) { + const uint8x16_t a_lo = vld1q_u8(src_ptr); + const uint8x16_t a_hi = vld1q_u8(src_ptr + 16); + const uint8x16_t b_lo = vld1q_u8(ref_ptr); + const uint8x16_t b_hi = vld1q_u8(ref_ptr + 16); + const uint8x16_t sad_lo_u8 = vabdq_u8(a_lo, b_lo); + const uint8x16_t sad_hi_u8 = vabdq_u8(a_hi, b_hi); + src_ptr += src_stride; + ref_ptr += ref_stride; + prod = vdotq_u32(prod, sad_lo_u8, ones); + prod = vdotq_u32(prod, sad_hi_u8, ones); + } + return prod; +} + +static INLINE uint32x4_t sad32x_avg(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + const uint8_t *second_pred, + const int height) { + int i; + uint32x4_t prod = vdupq_n_u32(0); + const uint8x16_t ones = vdupq_n_u8(1); + for (i = 0; i < height; ++i) { + const uint8x16_t a_lo = vld1q_u8(src_ptr); + const uint8x16_t a_hi = vld1q_u8(src_ptr + 16); + const uint8x16_t b_lo = vld1q_u8(ref_ptr); + const uint8x16_t b_hi = vld1q_u8(ref_ptr + 16); + const uint8x16_t c_lo = vld1q_u8(second_pred); + const uint8x16_t c_hi = vld1q_u8(second_pred + 16); + const uint8x16_t avg_lo = vrhaddq_u8(b_lo, c_lo); + const uint8x16_t avg_hi = vrhaddq_u8(b_hi, c_hi); + const uint8x16_t sad_lo_u8 = vabdq_u8(a_lo, avg_lo); + const uint8x16_t sad_hi_u8 = vabdq_u8(a_hi, avg_hi); + src_ptr += src_stride; + ref_ptr += ref_stride; + second_pred += 32; + prod = vdotq_u32(prod, sad_lo_u8, ones); + prod = vdotq_u32(prod, sad_hi_u8, ones); + } + return prod; +} + +#define SAD32XN(n) \ + uint32_t vpx_sad32x##n##_neon(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride) { \ + const uint32x4_t prod = \ + sad32x(src_ptr, src_stride, ref_ptr, ref_stride, n); \ + return horizontal_add_uint32x4(prod); \ + } \ + \ + uint32_t vpx_sad32x##n##_avg_neon(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + const uint8_t *second_pred) { \ + const uint32x4_t prod = \ + sad32x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \ + return horizontal_add_uint32x4(prod); \ + } + +#else static INLINE uint16x8_t sad32x(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const int height) { @@ -250,11 +484,82 @@ static INLINE uint16x8_t sad32x_avg(const uint8_t *src_ptr, int src_stride, sad32x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \ return horizontal_add_uint16x8(abs); \ } +#endif SAD32XN(16) SAD32XN(32) SAD32XN(64) +#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \ + (__ARM_FEATURE_DOTPROD == 1) +static INLINE uint32x4_t sad64x(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + const int height) { + int i; + uint32x4_t prod = vdupq_n_u32(0); + const uint8x16_t ones = vdupq_n_u8(1); + for (i = 0; i < height; ++i) { + const uint8x16_t a_0 = vld1q_u8(src_ptr); + const uint8x16_t a_1 = vld1q_u8(src_ptr + 16); + const uint8x16_t a_2 = vld1q_u8(src_ptr + 32); + const uint8x16_t a_3 = vld1q_u8(src_ptr + 48); + const uint8x16_t b_0 = vld1q_u8(ref_ptr); + const uint8x16_t b_1 = vld1q_u8(ref_ptr + 16); + const uint8x16_t b_2 = vld1q_u8(ref_ptr + 32); + const uint8x16_t b_3 = vld1q_u8(ref_ptr + 48); + const uint8x16_t sad_0_u8 = vabdq_u8(a_0, b_0); + const uint8x16_t sad_1_u8 = vabdq_u8(a_1, b_1); + const uint8x16_t sad_2_u8 = vabdq_u8(a_2, b_2); + const uint8x16_t sad_3_u8 = vabdq_u8(a_3, b_3); + src_ptr += src_stride; + ref_ptr += ref_stride; + prod = vdotq_u32(prod, sad_0_u8, ones); + prod = vdotq_u32(prod, sad_1_u8, ones); + prod = vdotq_u32(prod, sad_2_u8, ones); + prod = vdotq_u32(prod, sad_3_u8, ones); + } + return prod; +} + +static INLINE uint32x4_t sad64x_avg(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + const uint8_t *second_pred, + const int height) { + int i; + uint32x4_t prod = vdupq_n_u32(0); + const uint8x16_t ones = vdupq_n_u8(1); + for (i = 0; i < height; ++i) { + const uint8x16_t a_0 = vld1q_u8(src_ptr); + const uint8x16_t a_1 = vld1q_u8(src_ptr + 16); + const uint8x16_t a_2 = vld1q_u8(src_ptr + 32); + const uint8x16_t a_3 = vld1q_u8(src_ptr + 48); + const uint8x16_t b_0 = vld1q_u8(ref_ptr); + const uint8x16_t b_1 = vld1q_u8(ref_ptr + 16); + const uint8x16_t b_2 = vld1q_u8(ref_ptr + 32); + const uint8x16_t b_3 = vld1q_u8(ref_ptr + 48); + const uint8x16_t c_0 = vld1q_u8(second_pred); + const uint8x16_t c_1 = vld1q_u8(second_pred + 16); + const uint8x16_t c_2 = vld1q_u8(second_pred + 32); + const uint8x16_t c_3 = vld1q_u8(second_pred + 48); + const uint8x16_t avg_0 = vrhaddq_u8(b_0, c_0); + const uint8x16_t avg_1 = vrhaddq_u8(b_1, c_1); + const uint8x16_t avg_2 = vrhaddq_u8(b_2, c_2); + const uint8x16_t avg_3 = vrhaddq_u8(b_3, c_3); + const uint8x16_t sad_0_u8 = vabdq_u8(a_0, avg_0); + const uint8x16_t sad_1_u8 = vabdq_u8(a_1, avg_1); + const uint8x16_t sad_2_u8 = vabdq_u8(a_2, avg_2); + const uint8x16_t sad_3_u8 = vabdq_u8(a_3, avg_3); + src_ptr += src_stride; + ref_ptr += ref_stride; + second_pred += 64; + prod = vdotq_u32(prod, sad_0_u8, ones); + prod = vdotq_u32(prod, sad_1_u8, ones); + prod = vdotq_u32(prod, sad_2_u8, ones); + prod = vdotq_u32(prod, sad_3_u8, ones); + } + return prod; +} +#else static INLINE uint32x4_t sad64x(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const int height) { @@ -332,6 +637,7 @@ static INLINE uint32x4_t sad64x_avg(const uint8_t *src_ptr, int src_stride, return vpadalq_u16(sum, abs_1); } } +#endif #define SAD64XN(n) \ uint32_t vpx_sad64x##n##_neon(const uint8_t *src_ptr, int src_stride, \ From 0b3d4114468e8b602ece9b28f2485c60394311ca Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Fri, 29 Jul 2022 15:42:15 -0700 Subject: [PATCH 0449/1000] Fix off-by-one error of max w/h in validate_config Fix the off-by-one errors of maximum g_w and g_h in validate_config(). Bug: webm:1774 Change-Id: I343783d06c1f53222be2366be79171b214486201 --- vp9/vp9_cx_iface.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index 05ac9e1691..588d9d502e 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -170,8 +170,8 @@ static vpx_codec_err_t update_error_state( static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, const vpx_codec_enc_cfg_t *cfg, const struct vp9_extracfg *extra_cfg) { - RANGE_CHECK(cfg, g_w, 1, 65535); // 16 bits available - RANGE_CHECK(cfg, g_h, 1, 65535); // 16 bits available + RANGE_CHECK(cfg, g_w, 1, 65536); // 16 bits available + RANGE_CHECK(cfg, g_h, 1, 65536); // 16 bits available RANGE_CHECK(cfg, g_timebase.den, 1, 1000000000); RANGE_CHECK(cfg, g_timebase.num, 1, 1000000000); RANGE_CHECK_HI(cfg, g_profile, 3); From 29db7fe9757549efa8df5cc6f3cff0ece7b4b0d1 Mon Sep 17 00:00:00 2001 From: Scott LaVarnway Date: Wed, 27 Jul 2022 07:37:44 -0700 Subject: [PATCH 0450/1000] VPX: Add vp9_quantize_fp_32x32_avx2(). Up to 1.80x faster than vp9_quantize_fp_32x32_ssse3() for full calculations. Bug: b/237714063 Change-Id: Ic4ae4724fce7ac85c7a089535b16a999e02f0a10 --- test/vp9_quantize_test.cc | 3 + vp9/common/vp9_rtcd_defs.pl | 2 +- vp9/encoder/x86/vp9_quantize_avx2.c | 121 ++++++++++++++++++++++++++++ 3 files changed, 125 insertions(+), 1 deletion(-) diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc index 97998eb08b..4bd573b4f3 100644 --- a/test/vp9_quantize_test.cc +++ b/test/vp9_quantize_test.cc @@ -598,6 +598,9 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(make_tuple(&QuantFPWrapper, &QuantFPWrapper, VPX_BITS_8, 16, true), + make_tuple(&QuantFPWrapper, + &QuantFPWrapper, + VPX_BITS_8, 32, true), make_tuple(&vpx_quantize_b_avx2, &vpx_quantize_b_c, VPX_BITS_8, 16, false), make_tuple(&vpx_quantize_b_32x32_avx2, diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index e6b65c96f0..a4d28f0ff5 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -132,7 +132,7 @@ () specialize qw/vp9_quantize_fp neon sse2 avx2 vsx/, "$ssse3_x86_64"; add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; -specialize qw/vp9_quantize_fp_32x32 neon vsx/, "$ssse3_x86_64"; +specialize qw/vp9_quantize_fp_32x32 neon avx2 vsx/, "$ssse3_x86_64"; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vp9_block_error avx2 sse2/; diff --git a/vp9/encoder/x86/vp9_quantize_avx2.c b/vp9/encoder/x86/vp9_quantize_avx2.c index 5d02f4fe85..6ded0913a7 100644 --- a/vp9/encoder/x86/vp9_quantize_avx2.c +++ b/vp9/encoder/x86/vp9_quantize_avx2.c @@ -138,3 +138,124 @@ void vp9_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, *eob_ptr = get_max_eob(eob_max); } + +// Enable this flag when matching the optimized code to +// vp9_quantize_fp_32x32_c(). Disabled, the optimized code will match the +// existing ssse3 code and quantize_fp_32x32_nz_c(). +// +// #define MATCH_VP9_QUANTIZE_FP_32X32_C + +#ifndef MATCH_VP9_QUANTIZE_FP_32X32_C +static VPX_FORCE_INLINE void quantize_fp_32x32_16_no_nzflag( + const __m256i *round, const __m256i *quant, const __m256i *dequant, + const __m256i *thr, const tran_low_t *coeff_ptr, const int16_t *iscan_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, __m256i *eob_max) { + const __m256i coeff = load_tran_low(coeff_ptr); + const __m256i abs_coeff = _mm256_abs_epi16(coeff); + const __m256i tmp_rnd = _mm256_adds_epi16(abs_coeff, *round); + const __m256i abs_qcoeff = _mm256_mulhi_epi16(tmp_rnd, *quant); + const __m256i qcoeff = _mm256_sign_epi16(abs_qcoeff, coeff); + const __m256i abs_dqcoeff = + _mm256_srli_epi16(_mm256_mullo_epi16(abs_qcoeff, *dequant), 1); + const __m256i dqcoeff = _mm256_sign_epi16(abs_dqcoeff, coeff); + const __m256i nz_mask = + _mm256_cmpgt_epi16(abs_qcoeff, _mm256_setzero_si256()); + store_tran_low(qcoeff, qcoeff_ptr); + store_tran_low(dqcoeff, dqcoeff_ptr); + + *eob_max = get_max_lane_eob(iscan_ptr, *eob_max, nz_mask); + (void)thr; +} +#endif + +static VPX_FORCE_INLINE void quantize_fp_32x32_16( + const __m256i *round, const __m256i *quant, const __m256i *dequant, + const __m256i *thr, const tran_low_t *coeff_ptr, const int16_t *iscan_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, __m256i *eob_max) { + const __m256i coeff = load_tran_low(coeff_ptr); + const __m256i abs_coeff = _mm256_abs_epi16(coeff); + const __m256i thr_mask = _mm256_cmpgt_epi16(abs_coeff, *thr); + const int32_t nzflag = _mm256_movemask_epi8(thr_mask); + + if (nzflag) { +#ifdef MATCH_VP9_QUANTIZE_FP_32X32_C + const __m256i tmp_rnd = + _mm256_and_si256(_mm256_adds_epi16(abs_coeff, *round), thr_mask); +#else + const __m256i tmp_rnd = _mm256_adds_epi16(abs_coeff, *round); +#endif + const __m256i abs_qcoeff = _mm256_mulhi_epi16(tmp_rnd, *quant); + const __m256i qcoeff = _mm256_sign_epi16(abs_qcoeff, coeff); + const __m256i abs_dqcoeff = + _mm256_srli_epi16(_mm256_mullo_epi16(abs_qcoeff, *dequant), 1); + const __m256i dqcoeff = _mm256_sign_epi16(abs_dqcoeff, coeff); + const __m256i nz_mask = + _mm256_cmpgt_epi16(abs_qcoeff, _mm256_setzero_si256()); + store_tran_low(qcoeff, qcoeff_ptr); + store_tran_low(dqcoeff, dqcoeff_ptr); + + *eob_max = get_max_lane_eob(iscan_ptr, *eob_max, nz_mask); + } else { + store_zero_tran_low(qcoeff_ptr); + store_zero_tran_low(dqcoeff_ptr); + } +} + +void vp9_quantize_fp_32x32_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *round_ptr, + const int16_t *quant_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + __m256i round, quant, dequant, thr; + __m256i eob_max = _mm256_setzero_si256(); + (void)scan; + + coeff_ptr += n_coeffs; + iscan += n_coeffs; + qcoeff_ptr += n_coeffs; + dqcoeff_ptr += n_coeffs; + n_coeffs = -n_coeffs; + + // Setup global values + load_fp_values_avx2(round_ptr, &round, quant_ptr, &quant, dequant_ptr, + &dequant); + thr = _mm256_srli_epi16(dequant, 2); + quant = _mm256_slli_epi16(quant, 1); + { + const __m256i rnd = _mm256_set1_epi16((int16_t)1); + round = _mm256_add_epi16(round, rnd); + round = _mm256_srai_epi16(round, 1); + } + +#ifdef MATCH_VP9_QUANTIZE_FP_32X32_C + // Subtracting 1 here eliminates a _mm256_cmpeq_epi16() instruction when + // calculating the zbin mask. + thr = _mm256_sub_epi16(thr, _mm256_set1_epi16(1)); + quantize_fp_32x32_16(&round, &quant, &dequant, &thr, coeff_ptr + n_coeffs, + iscan + n_coeffs, qcoeff_ptr + n_coeffs, + dqcoeff_ptr + n_coeffs, &eob_max); +#else + quantize_fp_32x32_16_no_nzflag( + &round, &quant, &dequant, &thr, coeff_ptr + n_coeffs, iscan + n_coeffs, + qcoeff_ptr + n_coeffs, dqcoeff_ptr + n_coeffs, &eob_max); +#endif + + n_coeffs += 8 * 2; + + // remove dc constants + dequant = _mm256_permute2x128_si256(dequant, dequant, 0x31); + quant = _mm256_permute2x128_si256(quant, quant, 0x31); + round = _mm256_permute2x128_si256(round, round, 0x31); + thr = _mm256_permute2x128_si256(thr, thr, 0x31); + + // AC only loop + while (n_coeffs < 0) { + quantize_fp_32x32_16(&round, &quant, &dequant, &thr, coeff_ptr + n_coeffs, + iscan + n_coeffs, qcoeff_ptr + n_coeffs, + dqcoeff_ptr + n_coeffs, &eob_max); + n_coeffs += 8 * 2; + } + + *eob_ptr = get_max_eob(eob_max); +} From a55e248349a1d549dbb3d988ced7ff93ac20ca0d Mon Sep 17 00:00:00 2001 From: Scott LaVarnway Date: Tue, 2 Aug 2022 11:22:04 -0700 Subject: [PATCH 0451/1000] VPX: Add vp9_highbd_quantize_fp_avx2(). Up to 5.37x faster than vp9_highbd_quantize_fp_c() for full calculations. ~1.6% overall encoder improvement for the test clip used. Bug: b/237714063 Change-Id: I584fd1f60a3e02f1ded092de98970725fc66c5b8 --- test/vp9_quantize_test.cc | 3 + vp9/common/vp9_rtcd_defs.pl | 1 + vp9/encoder/x86/vp9_quantize_avx2.c | 108 ++++++++++++++++++++++++++++ 3 files changed, 112 insertions(+) diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc index 4bd573b4f3..a2dbfc541d 100644 --- a/test/vp9_quantize_test.cc +++ b/test/vp9_quantize_test.cc @@ -580,6 +580,9 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values( make_tuple(&QuantFPWrapper, &QuantFPWrapper, VPX_BITS_8, 16, true), + make_tuple(&QuantFPWrapper, + &QuantFPWrapper, VPX_BITS_12, 16, + true), make_tuple(&vpx_highbd_quantize_b_avx2, &vpx_highbd_quantize_b_c, VPX_BITS_8, 16, false), make_tuple(&vpx_highbd_quantize_b_avx2, &vpx_highbd_quantize_b_c, diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index a4d28f0ff5..b956877d3c 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -196,6 +196,7 @@ () # ENCODEMB INVOKE add_proto qw/void vp9_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + specialize qw/vp9_highbd_quantize_fp avx2/; add_proto qw/void vp9_highbd_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan" ; diff --git a/vp9/encoder/x86/vp9_quantize_avx2.c b/vp9/encoder/x86/vp9_quantize_avx2.c index 6ded0913a7..bd93e71e8a 100644 --- a/vp9/encoder/x86/vp9_quantize_avx2.c +++ b/vp9/encoder/x86/vp9_quantize_avx2.c @@ -259,3 +259,111 @@ void vp9_quantize_fp_32x32_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, *eob_ptr = get_max_eob(eob_max); } + +#if CONFIG_VP9_HIGHBITDEPTH +static VPX_FORCE_INLINE __m256i mm256_mul_shift_epi32_logscale(const __m256i *x, + const __m256i *y, + int log_scale) { + __m256i prod_lo = _mm256_mul_epi32(*x, *y); + __m256i prod_hi = _mm256_srli_epi64(*x, 32); + const __m256i mult_hi = _mm256_srli_epi64(*y, 32); + const __m256i mask = _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1); + prod_hi = _mm256_mul_epi32(prod_hi, mult_hi); + prod_lo = _mm256_srli_epi64(prod_lo, 16 - log_scale); + prod_lo = _mm256_and_si256(prod_lo, mask); + prod_hi = _mm256_srli_epi64(prod_hi, 16 - log_scale); + prod_hi = _mm256_slli_epi64(prod_hi, 32); + return _mm256_or_si256(prod_lo, prod_hi); +} + +static VPX_FORCE_INLINE __m256i highbd_init_256(const int16_t *val_ptr) { + const __m128i v = _mm_load_si128((const __m128i *)val_ptr); + const __m128i zero = _mm_setzero_si128(); + const __m128i dc = _mm_unpacklo_epi16(v, zero); + const __m128i ac = _mm_unpackhi_epi16(v, zero); + return _mm256_insertf128_si256(_mm256_castsi128_si256(dc), ac, 1); +} + +static VPX_FORCE_INLINE void highbd_load_fp_values( + const int16_t *round_ptr, __m256i *round, const int16_t *quant_ptr, + __m256i *quant, const int16_t *dequant_ptr, __m256i *dequant) { + *round = highbd_init_256(round_ptr); + *quant = highbd_init_256(quant_ptr); + *dequant = highbd_init_256(dequant_ptr); +} + +static VPX_FORCE_INLINE __m256i highbd_get_max_lane_eob( + const int16_t *iscan_ptr, __m256i eobmax, __m256i nz_mask) { + const __m256i packed_nz_mask = _mm256_packs_epi32(nz_mask, nz_mask); + const __m256i packed_nz_mask_perm = + _mm256_permute4x64_epi64(packed_nz_mask, 0xD8); + const __m256i iscan = + _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)iscan_ptr)); + const __m256i iscan_plus1 = _mm256_sub_epi16(iscan, packed_nz_mask_perm); + const __m256i nz_iscan = _mm256_and_si256(iscan_plus1, packed_nz_mask_perm); + return _mm256_max_epi16(eobmax, nz_iscan); +} + +static VPX_FORCE_INLINE void highbd_quantize_fp( + const __m256i *round, const __m256i *quant, const __m256i *dequant, + const tran_low_t *coeff_ptr, const int16_t *iscan_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, __m256i *eob) { + const __m256i coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr); + const __m256i abs_coeff = _mm256_abs_epi32(coeff); + const __m256i tmp_rnd = _mm256_add_epi32(abs_coeff, *round); + const __m256i abs_q = mm256_mul_shift_epi32_logscale(&tmp_rnd, quant, 0); + const __m256i abs_dq = _mm256_mullo_epi32(abs_q, *dequant); + const __m256i q = _mm256_sign_epi32(abs_q, coeff); + const __m256i dq = _mm256_sign_epi32(abs_dq, coeff); + const __m256i nz_mask = _mm256_cmpgt_epi32(abs_q, _mm256_setzero_si256()); + + _mm256_storeu_si256((__m256i *)qcoeff_ptr, q); + _mm256_storeu_si256((__m256i *)dqcoeff_ptr, dq); + + *eob = highbd_get_max_lane_eob(iscan_ptr, *eob, nz_mask); +} + +void vp9_highbd_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *round_ptr, + const int16_t *quant_ptr, + tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + const int step = 8; + __m256i round, quant, dequant; + __m256i eob_max = _mm256_setzero_si256(); + (void)scan; + + coeff_ptr += n_coeffs; + iscan += n_coeffs; + qcoeff_ptr += n_coeffs; + dqcoeff_ptr += n_coeffs; + n_coeffs = -n_coeffs; + + // Setup global values + highbd_load_fp_values(round_ptr, &round, quant_ptr, &quant, dequant_ptr, + &dequant); + + highbd_quantize_fp(&round, &quant, &dequant, coeff_ptr + n_coeffs, + iscan + n_coeffs, qcoeff_ptr + n_coeffs, + dqcoeff_ptr + n_coeffs, &eob_max); + + n_coeffs += step; + + // remove dc constants + dequant = _mm256_permute2x128_si256(dequant, dequant, 0x31); + quant = _mm256_permute2x128_si256(quant, quant, 0x31); + round = _mm256_permute2x128_si256(round, round, 0x31); + + // AC only loop + while (n_coeffs < 0) { + highbd_quantize_fp(&round, &quant, &dequant, coeff_ptr + n_coeffs, + iscan + n_coeffs, qcoeff_ptr + n_coeffs, + dqcoeff_ptr + n_coeffs, &eob_max); + n_coeffs += step; + } + + *eob_ptr = get_max_eob(eob_max); +} +#endif // CONFIG_VP9_HIGHBITDEPTH From 2e61a623d4d5408c59f58e7bec713d789b27c3ef Mon Sep 17 00:00:00 2001 From: Scott LaVarnway Date: Wed, 3 Aug 2022 08:01:25 -0700 Subject: [PATCH 0452/1000] VPX: Add vp9_highbd_quantize_fp_32x32_avx2(). ~4x faster than vp9_highbd_quantize_fp_32x32_c() for full calculations. Bug: b/237714063 Change-Id: Iff2182b8e7b1ac79811e33080d1f6cac6679382d --- test/vp9_quantize_test.cc | 3 ++ vp9/common/vp9_rtcd_defs.pl | 1 + vp9/encoder/x86/vp9_quantize_avx2.c | 71 +++++++++++++++++++++++++++++ 3 files changed, 75 insertions(+) diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc index a2dbfc541d..b936350236 100644 --- a/test/vp9_quantize_test.cc +++ b/test/vp9_quantize_test.cc @@ -583,6 +583,9 @@ INSTANTIATE_TEST_SUITE_P( make_tuple(&QuantFPWrapper, &QuantFPWrapper, VPX_BITS_12, 16, true), + make_tuple(&QuantFPWrapper, + &QuantFPWrapper, VPX_BITS_12, + 32, true), make_tuple(&vpx_highbd_quantize_b_avx2, &vpx_highbd_quantize_b_c, VPX_BITS_8, 16, false), make_tuple(&vpx_highbd_quantize_b_avx2, &vpx_highbd_quantize_b_c, diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index b956877d3c..1b23e8a66f 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -199,6 +199,7 @@ () specialize qw/vp9_highbd_quantize_fp avx2/; add_proto qw/void vp9_highbd_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan" ; + specialize qw/vp9_highbd_quantize_fp_32x32 avx2/; # fdct functions add_proto qw/void vp9_highbd_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; diff --git a/vp9/encoder/x86/vp9_quantize_avx2.c b/vp9/encoder/x86/vp9_quantize_avx2.c index bd93e71e8a..7afeba32cd 100644 --- a/vp9/encoder/x86/vp9_quantize_avx2.c +++ b/vp9/encoder/x86/vp9_quantize_avx2.c @@ -366,4 +366,75 @@ void vp9_highbd_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, *eob_ptr = get_max_eob(eob_max); } + +static VPX_FORCE_INLINE void highbd_quantize_fp_32x32( + const __m256i *round, const __m256i *quant, const __m256i *dequant, + const __m256i *thr, const tran_low_t *coeff_ptr, const int16_t *iscan_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, __m256i *eob) { + const __m256i coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr); + const __m256i abs_coeff = _mm256_abs_epi32(coeff); + const __m256i thr_mask = _mm256_cmpgt_epi32(abs_coeff, *thr); + const __m256i tmp_rnd = + _mm256_and_si256(_mm256_add_epi32(abs_coeff, *round), thr_mask); + const __m256i abs_q = mm256_mul_shift_epi32_logscale(&tmp_rnd, quant, 0); + const __m256i abs_dq = + _mm256_srli_epi32(_mm256_mullo_epi32(abs_q, *dequant), 1); + const __m256i q = _mm256_sign_epi32(abs_q, coeff); + const __m256i dq = _mm256_sign_epi32(abs_dq, coeff); + const __m256i nz_mask = _mm256_cmpgt_epi32(abs_q, _mm256_setzero_si256()); + + _mm256_storeu_si256((__m256i *)qcoeff_ptr, q); + _mm256_storeu_si256((__m256i *)dqcoeff_ptr, dq); + + *eob = highbd_get_max_lane_eob(iscan_ptr, *eob, nz_mask); +} + +void vp9_highbd_quantize_fp_32x32_avx2( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, + const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, + const int16_t *iscan) { + const int step = 8; + __m256i round, quant, dequant, thr; + __m256i eob_max = _mm256_setzero_si256(); + (void)scan; + + coeff_ptr += n_coeffs; + iscan += n_coeffs; + qcoeff_ptr += n_coeffs; + dqcoeff_ptr += n_coeffs; + n_coeffs = -n_coeffs; + + // Setup global values + highbd_load_fp_values(round_ptr, &round, quant_ptr, &quant, dequant_ptr, + &dequant); + thr = _mm256_srli_epi32(dequant, 2); + // Subtracting 1 here eliminates a _mm256_cmpeq_epi32() instruction when + // calculating the zbin mask. + thr = _mm256_sub_epi32(thr, _mm256_set1_epi32(1)); + quant = _mm256_slli_epi32(quant, 1); + round = _mm256_srai_epi32(_mm256_add_epi32(round, _mm256_set1_epi32(1)), 1); + + highbd_quantize_fp_32x32(&round, &quant, &dequant, &thr, coeff_ptr + n_coeffs, + iscan + n_coeffs, qcoeff_ptr + n_coeffs, + dqcoeff_ptr + n_coeffs, &eob_max); + + n_coeffs += step; + + // remove dc constants + dequant = _mm256_permute2x128_si256(dequant, dequant, 0x31); + quant = _mm256_permute2x128_si256(quant, quant, 0x31); + round = _mm256_permute2x128_si256(round, round, 0x31); + thr = _mm256_permute2x128_si256(thr, thr, 0x31); + + // AC only loop + while (n_coeffs < 0) { + highbd_quantize_fp_32x32( + &round, &quant, &dequant, &thr, coeff_ptr + n_coeffs, iscan + n_coeffs, + qcoeff_ptr + n_coeffs, dqcoeff_ptr + n_coeffs, &eob_max); + n_coeffs += step; + } + + *eob_ptr = get_max_eob(eob_max); +} #endif // CONFIG_VP9_HIGHBITDEPTH From c9f049fd9164e0b5b950bdb8ac80186787b5b64c Mon Sep 17 00:00:00 2001 From: Scott LaVarnway Date: Fri, 5 Aug 2022 07:40:26 -0700 Subject: [PATCH 0453/1000] VPX: Add vpx_subtract_block_avx2(). ~1.3x faster than vpx_subtract_block_sse2(). Based on aom_subtract_block_avx2(). Bug: b/241580104 Change-Id: I17da036363f213d53c6546c3e858e4c3cba44a5b --- test/vp9_subtract_test.cc | 4 ++ vpx_dsp/vpx_dsp.mk | 1 + vpx_dsp/vpx_dsp_rtcd_defs.pl | 2 +- vpx_dsp/x86/subtract_avx2.c | 96 ++++++++++++++++++++++++++++++++++++ 4 files changed, 102 insertions(+), 1 deletion(-) create mode 100644 vpx_dsp/x86/subtract_avx2.c diff --git a/test/vp9_subtract_test.cc b/test/vp9_subtract_test.cc index 211cc6c7ad..f634a032d8 100644 --- a/test/vp9_subtract_test.cc +++ b/test/vp9_subtract_test.cc @@ -133,6 +133,10 @@ INSTANTIATE_TEST_SUITE_P(C, VP9SubtractBlockTest, INSTANTIATE_TEST_SUITE_P(SSE2, VP9SubtractBlockTest, ::testing::Values(vpx_subtract_block_sse2)); #endif +#if HAVE_AVX2 +INSTANTIATE_TEST_SUITE_P(AVX2, VP9SubtractBlockTest, + ::testing::Values(vpx_subtract_block_avx2)); +#endif #if HAVE_NEON INSTANTIATE_TEST_SUITE_P(NEON, VP9SubtractBlockTest, ::testing::Values(vpx_subtract_block_neon)); diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index dd667195f5..ffe954832d 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -376,6 +376,7 @@ DSP_SRCS-$(HAVE_MMI) += mips/subtract_mmi.c DSP_SRCS-$(HAVE_AVX2) += x86/sad4d_avx2.c DSP_SRCS-$(HAVE_AVX2) += x86/sad_avx2.c +DSP_SRCS-$(HAVE_AVX2) += x86/subtract_avx2.c DSP_SRCS-$(HAVE_AVX512) += x86/sad4d_avx512.c DSP_SRCS-$(HAVE_SSE2) += x86/sad4d_sse2.asm diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index e7ad640af8..db211ed8ce 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -730,7 +730,7 @@ () # Block subtraction # add_proto qw/void vpx_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride"; -specialize qw/vpx_subtract_block neon msa mmi sse2 vsx lsx/; +specialize qw/vpx_subtract_block neon msa mmi sse2 avx2 vsx lsx/; # # Single block SAD diff --git a/vpx_dsp/x86/subtract_avx2.c b/vpx_dsp/x86/subtract_avx2.c new file mode 100644 index 0000000000..4d259ef5c5 --- /dev/null +++ b/vpx_dsp/x86/subtract_avx2.c @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" + +static VPX_FORCE_INLINE void subtract32_avx2(int16_t *diff_ptr, + const uint8_t *src_ptr, + const uint8_t *pred_ptr) { + const __m256i s = _mm256_lddqu_si256((const __m256i *)src_ptr); + const __m256i p = _mm256_lddqu_si256((const __m256i *)pred_ptr); + const __m256i s_0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(s)); + const __m256i s_1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(s, 1)); + const __m256i p_0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(p)); + const __m256i p_1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(p, 1)); + const __m256i d_0 = _mm256_sub_epi16(s_0, p_0); + const __m256i d_1 = _mm256_sub_epi16(s_1, p_1); + _mm256_storeu_si256((__m256i *)diff_ptr, d_0); + _mm256_storeu_si256((__m256i *)(diff_ptr + 16), d_1); +} + +static VPX_FORCE_INLINE void subtract_block_16xn_avx2( + int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, + ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) { + int j; + for (j = 0; j < rows; ++j) { + const __m128i s = _mm_lddqu_si128((const __m128i *)src_ptr); + const __m128i p = _mm_lddqu_si128((const __m128i *)pred_ptr); + const __m256i s_0 = _mm256_cvtepu8_epi16(s); + const __m256i p_0 = _mm256_cvtepu8_epi16(p); + const __m256i d_0 = _mm256_sub_epi16(s_0, p_0); + _mm256_storeu_si256((__m256i *)diff_ptr, d_0); + src_ptr += src_stride; + pred_ptr += pred_stride; + diff_ptr += diff_stride; + } +} + +static VPX_FORCE_INLINE void subtract_block_32xn_avx2( + int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, + ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) { + int j; + for (j = 0; j < rows; ++j) { + subtract32_avx2(diff_ptr, src_ptr, pred_ptr); + src_ptr += src_stride; + pred_ptr += pred_stride; + diff_ptr += diff_stride; + } +} + +static VPX_FORCE_INLINE void subtract_block_64xn_avx2( + int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, + ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) { + int j; + for (j = 0; j < rows; ++j) { + subtract32_avx2(diff_ptr, src_ptr, pred_ptr); + subtract32_avx2(diff_ptr + 32, src_ptr + 32, pred_ptr + 32); + src_ptr += src_stride; + pred_ptr += pred_stride; + diff_ptr += diff_stride; + } +} + +void vpx_subtract_block_avx2(int rows, int cols, int16_t *diff_ptr, + ptrdiff_t diff_stride, const uint8_t *src_ptr, + ptrdiff_t src_stride, const uint8_t *pred_ptr, + ptrdiff_t pred_stride) { + switch (cols) { + case 16: + subtract_block_16xn_avx2(rows, diff_ptr, diff_stride, src_ptr, src_stride, + pred_ptr, pred_stride); + break; + case 32: + subtract_block_32xn_avx2(rows, diff_ptr, diff_stride, src_ptr, src_stride, + pred_ptr, pred_stride); + break; + case 64: + subtract_block_64xn_avx2(rows, diff_ptr, diff_stride, src_ptr, src_stride, + pred_ptr, pred_stride); + break; + default: + vpx_subtract_block_sse2(rows, cols, diff_ptr, diff_stride, src_ptr, + src_stride, pred_ptr, pred_stride); + break; + } +} From eaf0b5b47e3abe8a9c5e82bf2cb1528d2ed443fc Mon Sep 17 00:00:00 2001 From: Cheng Chen Date: Fri, 5 Aug 2022 17:32:12 -0700 Subject: [PATCH 0454/1000] Fix VP9 auto level The iteration index is wrong, causing the starting level to be chosen is "LEVEL_5_2", which is intended for videos of a large resolution. Change-Id: Id88836981bdcbd7494bd06193d6a433ac75a6d2e --- vp9/encoder/vp9_ratectrl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index 0852973914..4f356f9aed 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -2584,7 +2584,7 @@ void vp9_rc_set_gf_interval_range(const VP9_COMP *const cpi, const uint32_t pic_breadth = VPXMAX(cpi->common.width, cpi->common.height); int i; - for (i = LEVEL_1; i < LEVEL_MAX; ++i) { + for (i = 0; i < VP9_LEVELS; ++i) { if (vp9_level_defs[i].max_luma_picture_size >= pic_size && vp9_level_defs[i].max_luma_picture_breadth >= pic_breadth) { if (rc->min_gf_interval <= From 4355a392e6b2dec619dde616028b5a91c8917c0c Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 8 Aug 2022 11:28:27 -0700 Subject: [PATCH 0455/1000] vp9_cx_iface,encoder_encode: only calc ts when img!=NULL avoid calculating the end timestamp when performing a flush to prevent an implicit conversion warning when applying a non-zero offset to a 0 pts used in that case: vp9/vp9_cx_iface.c:1361:50: runtime error: implicit conversion from type 'vpx_codec_pts_t' (aka 'long') of value -15 (64-bit, signed) to type 'unsigned long' changed the value to 18446744073709551601 (64-bit, unsigned) Bug: b/229626362 Change-Id: I68ba19b7d6de35cc185707dfb6b43406b7165035 --- vp9/vp9_cx_iface.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index 588d9d502e..02bd2e579b 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -1357,8 +1357,6 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, unsigned int lib_flags = 0; YV12_BUFFER_CONFIG sd; int64_t dst_time_stamp = timebase_units_to_ticks(timestamp_ratio, pts); - int64_t dst_end_time_stamp = - timebase_units_to_ticks(timestamp_ratio, pts + duration); size_t size, cx_data_sz; unsigned char *cx_data; @@ -1369,6 +1367,8 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, if (ctx->base.init_flags & VPX_CODEC_USE_PSNR) cpi->b_calculate_psnr = 1; if (img != NULL) { + const int64_t dst_end_time_stamp = + timebase_units_to_ticks(timestamp_ratio, pts + duration); res = image2yuvconfig(img, &sd); // Store the original flags in to the frame buffer. Will extract the @@ -1405,6 +1405,7 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, // compute first pass stats if (img) { int ret; + int64_t dst_end_time_stamp; vpx_codec_cx_pkt_t fps_pkt; ENCODE_FRAME_RESULT encode_frame_result; vp9_init_encode_frame_result(&encode_frame_result); @@ -1430,6 +1431,7 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, #endif // !CONFIG_REALTIME_ONLY } else { ENCODE_FRAME_RESULT encode_frame_result; + int64_t dst_end_time_stamp; vp9_init_encode_frame_result(&encode_frame_result); while (cx_data_sz >= ctx->cx_data_sz / 2 && -1 != vp9_get_compressed_data(cpi, &lib_flags, &size, cx_data, From 3cf0a241569efd53fa9a9bd62d963278106816c6 Mon Sep 17 00:00:00 2001 From: Cheng Chen Date: Fri, 5 Aug 2022 18:06:08 -0700 Subject: [PATCH 0456/1000] L2E: Add target level in GOP unit tests Change-Id: Icecc3031e1052bb5a94f6c5957ec5190aae990ba --- test/vp9_ext_ratectrl_test.cc | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/test/vp9_ext_ratectrl_test.cc b/test/vp9_ext_ratectrl_test.cc index c954495dff..6687f7fec5 100644 --- a/test/vp9_ext_ratectrl_test.cc +++ b/test/vp9_ext_ratectrl_test.cc @@ -497,7 +497,7 @@ vpx_rc_status_t rc_get_gop_decision_short(vpx_rc_model_t rate_ctrl_model, ToyRateCtrl *toy_rate_ctrl = static_cast(rate_ctrl_model); EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); EXPECT_EQ(gop_info->lag_in_frames, kMaxLagInFrames - 1); - EXPECT_EQ(gop_info->min_gf_interval, kDefaultMinGfInterval); + EXPECT_EQ(gop_info->min_gf_interval, kDefaultMinGfInterval + 1); EXPECT_EQ(gop_info->max_gf_interval, kDefaultMaxGfInterval); EXPECT_EQ(gop_info->allow_alt_ref, 1); if (gop_info->is_key_frame) { @@ -571,7 +571,7 @@ vpx_rc_status_t rc_get_gop_decision_short_no_arf( ToyRateCtrl *toy_rate_ctrl = static_cast(rate_ctrl_model); EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); EXPECT_EQ(gop_info->lag_in_frames, kMaxLagInFrames - 1); - EXPECT_EQ(gop_info->min_gf_interval, kDefaultMinGfInterval); + EXPECT_EQ(gop_info->min_gf_interval, kDefaultMinGfInterval + 1); EXPECT_EQ(gop_info->max_gf_interval, kDefaultMaxGfInterval); EXPECT_EQ(gop_info->allow_alt_ref, 1); if (gop_info->is_key_frame) { @@ -752,6 +752,7 @@ class ExtRateCtrlTestGOPShort : public ::libvpx_test::EncoderTest, if (video->frame() == 0) { encoder->Control(VP9E_SET_MIN_GF_INTERVAL, kDefaultMinGfInterval); encoder->Control(VP9E_SET_MAX_GF_INTERVAL, kDefaultMaxGfInterval); + encoder->Control(VP9E_SET_TARGET_LEVEL, vp9::LEVEL_AUTO); vpx_rc_funcs_t rc_funcs; rc_funcs.rc_type = VPX_RC_GOP_QP; @@ -799,6 +800,7 @@ class ExtRateCtrlTestGOPShortOverlay if (video->frame() == 0) { encoder->Control(VP9E_SET_MIN_GF_INTERVAL, kDefaultMinGfInterval); encoder->Control(VP9E_SET_MAX_GF_INTERVAL, kDefaultMaxGfInterval); + encoder->Control(VP9E_SET_TARGET_LEVEL, vp9::LEVEL_UNKNOWN); vpx_rc_funcs_t rc_funcs; rc_funcs.rc_type = VPX_RC_GOP_QP; @@ -847,6 +849,7 @@ class ExtRateCtrlTestGOPShortNoARF if (video->frame() == 0) { encoder->Control(VP9E_SET_MIN_GF_INTERVAL, kDefaultMinGfInterval); encoder->Control(VP9E_SET_MAX_GF_INTERVAL, kDefaultMaxGfInterval); + encoder->Control(VP9E_SET_TARGET_LEVEL, vp9::LEVEL_AUTO); vpx_rc_funcs_t rc_funcs; rc_funcs.rc_type = VPX_RC_GOP_QP; From ec4aa6d1915a92172539f359ef9e1847ae9ff327 Mon Sep 17 00:00:00 2001 From: Cheng Chen Date: Mon, 8 Aug 2022 10:12:39 -0700 Subject: [PATCH 0457/1000] Use level defined min gf interval Assume the level definition of min_gf_interval is the minimum allowed gf_interval. We take this level comformant min_gf_interval instead of +1. Change-Id: I9c7e62f210c95b356e9716579ee4c19638de8e35 --- test/vp9_ext_ratectrl_test.cc | 6 +++--- vp9/encoder/vp9_ratectrl.c | 3 +-- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/test/vp9_ext_ratectrl_test.cc b/test/vp9_ext_ratectrl_test.cc index 6687f7fec5..16e3248f76 100644 --- a/test/vp9_ext_ratectrl_test.cc +++ b/test/vp9_ext_ratectrl_test.cc @@ -497,7 +497,7 @@ vpx_rc_status_t rc_get_gop_decision_short(vpx_rc_model_t rate_ctrl_model, ToyRateCtrl *toy_rate_ctrl = static_cast(rate_ctrl_model); EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); EXPECT_EQ(gop_info->lag_in_frames, kMaxLagInFrames - 1); - EXPECT_EQ(gop_info->min_gf_interval, kDefaultMinGfInterval + 1); + EXPECT_EQ(gop_info->min_gf_interval, kDefaultMinGfInterval); EXPECT_EQ(gop_info->max_gf_interval, kDefaultMaxGfInterval); EXPECT_EQ(gop_info->allow_alt_ref, 1); if (gop_info->is_key_frame) { @@ -571,7 +571,7 @@ vpx_rc_status_t rc_get_gop_decision_short_no_arf( ToyRateCtrl *toy_rate_ctrl = static_cast(rate_ctrl_model); EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); EXPECT_EQ(gop_info->lag_in_frames, kMaxLagInFrames - 1); - EXPECT_EQ(gop_info->min_gf_interval, kDefaultMinGfInterval + 1); + EXPECT_EQ(gop_info->min_gf_interval, kDefaultMinGfInterval); EXPECT_EQ(gop_info->max_gf_interval, kDefaultMaxGfInterval); EXPECT_EQ(gop_info->allow_alt_ref, 1); if (gop_info->is_key_frame) { @@ -800,7 +800,7 @@ class ExtRateCtrlTestGOPShortOverlay if (video->frame() == 0) { encoder->Control(VP9E_SET_MIN_GF_INTERVAL, kDefaultMinGfInterval); encoder->Control(VP9E_SET_MAX_GF_INTERVAL, kDefaultMaxGfInterval); - encoder->Control(VP9E_SET_TARGET_LEVEL, vp9::LEVEL_UNKNOWN); + encoder->Control(VP9E_SET_TARGET_LEVEL, vp9::LEVEL_AUTO); vpx_rc_funcs_t rc_funcs; rc_funcs.rc_type = VPX_RC_GOP_QP; diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index 4f356f9aed..1ddf64d41a 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -2589,8 +2589,7 @@ void vp9_rc_set_gf_interval_range(const VP9_COMP *const cpi, vp9_level_defs[i].max_luma_picture_breadth >= pic_breadth) { if (rc->min_gf_interval <= (int)vp9_level_defs[i].min_altref_distance) { - rc->min_gf_interval = - (int)vp9_level_defs[i].min_altref_distance + 1; + rc->min_gf_interval = (int)vp9_level_defs[i].min_altref_distance; rc->max_gf_interval = VPXMAX(rc->max_gf_interval, rc->min_gf_interval); } From 3c2b21c22e5d12fc790324ac2f78c029ccf694b3 Mon Sep 17 00:00:00 2001 From: Scott LaVarnway Date: Mon, 8 Aug 2022 15:09:32 -0700 Subject: [PATCH 0458/1000] VPX: Fix vp9_quantize_fp_avx2() VS build error. Add build fix for _mm256_extract_epi16() being undefined. Bug: b/237714063 Change-Id: I855b1828ce1b6b2b2f063fe097999481881bf074 --- vp9/encoder/x86/vp9_quantize_avx2.c | 4 ++++ vpx_dsp/x86/highbd_quantize_intrin_avx2.c | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/vp9/encoder/x86/vp9_quantize_avx2.c b/vp9/encoder/x86/vp9_quantize_avx2.c index 7afeba32cd..15ce71c5c6 100644 --- a/vp9/encoder/x86/vp9_quantize_avx2.c +++ b/vp9/encoder/x86/vp9_quantize_avx2.c @@ -67,7 +67,11 @@ static VPX_FORCE_INLINE uint16_t get_max_eob(__m256i eob256) { eob = _mm256_max_epi16(eob, eob_s); eob_s = _mm256_shufflelo_epi16(eob, 1); eob = _mm256_max_epi16(eob, eob_s); +#if defined(_MSC_VER) && (_MSC_VER < 1910) + return _mm_cvtsi128_si32(_mm256_extracti128_si256(eob, 0)) & 0xffff; +#else return (uint16_t)_mm256_extract_epi16(eob, 0); +#endif } static VPX_FORCE_INLINE void quantize_fp_16( diff --git a/vpx_dsp/x86/highbd_quantize_intrin_avx2.c b/vpx_dsp/x86/highbd_quantize_intrin_avx2.c index ec1110ff8c..cbc715c046 100644 --- a/vpx_dsp/x86/highbd_quantize_intrin_avx2.c +++ b/vpx_dsp/x86/highbd_quantize_intrin_avx2.c @@ -94,7 +94,11 @@ static VPX_FORCE_INLINE uint16_t get_max_eob(__m256i eob) { eob = _mm256_max_epi16(eob, eob_s); eob_s = _mm256_shufflelo_epi16(eob, 1); eob = _mm256_max_epi16(eob, eob_s); +#if defined(_MSC_VER) && (_MSC_VER < 1910) + return _mm_cvtsi128_si32(_mm256_extracti128_si256(eob, 0)) & 0xffff; +#else return (uint16_t)_mm256_extract_epi16(eob, 0); +#endif } static VPX_FORCE_INLINE void quantize(const __m256i *qp, From 1e07619a0afd63dd746c97766f63cf0a9d9c7e65 Mon Sep 17 00:00:00 2001 From: Scott LaVarnway Date: Wed, 10 Aug 2022 05:29:50 -0700 Subject: [PATCH 0459/1000] VPX: vp9_quantize_fp_neon() cleanup. No change in performance. Bug: b/237714063 Change-Id: I868cda7acb0de840fbc85b23f3e36c50b39c331b --- vp9/encoder/arm/neon/vp9_quantize_neon.c | 167 +++++++++++++---------- 1 file changed, 92 insertions(+), 75 deletions(-) diff --git a/vp9/encoder/arm/neon/vp9_quantize_neon.c b/vp9/encoder/arm/neon/vp9_quantize_neon.c index 236c3176c7..46c772da2e 100644 --- a/vp9/encoder/arm/neon/vp9_quantize_neon.c +++ b/vp9/encoder/arm/neon/vp9_quantize_neon.c @@ -26,9 +26,8 @@ #include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/vpx_dsp_common.h" -static INLINE void calculate_dqcoeff_and_store(const int16x8_t qcoeff, - const int16x8_t dequant, - tran_low_t *dqcoeff) { +static VPX_FORCE_INLINE void calculate_dqcoeff_and_store( + const int16x8_t qcoeff, const int16x8_t dequant, tran_low_t *dqcoeff) { const int32x4_t dqcoeff_0 = vmull_s16(vget_low_s16(qcoeff), vget_low_s16(dequant)); const int32x4_t dqcoeff_1 = @@ -42,6 +41,84 @@ static INLINE void calculate_dqcoeff_and_store(const int16x8_t qcoeff, #endif // CONFIG_VP9_HIGHBITDEPTH } +static VPX_FORCE_INLINE int16x8_t get_max_lane_eob(const int16_t *iscan_ptr, + int16x8_t v_eobmax, + uint16x8_t v_nz_mask) { + const int16x8_t v_iscan = vld1q_s16(&iscan_ptr[0]); + const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, vdupq_n_s16(1)); + const int16x8_t v_nz_iscan = + vbslq_s16(v_nz_mask, vdupq_n_s16(0), v_iscan_plus1); + return vmaxq_s16(v_eobmax, v_nz_iscan); +} + +static VPX_FORCE_INLINE uint16_t get_max_eob(int16x8_t v_eobmax) { +#ifdef __aarch64__ + return (uint16_t)vmaxvq_s16(v_eobmax); +#else + const int16x4_t v_eobmax_3210 = + vmax_s16(vget_low_s16(v_eobmax), vget_high_s16(v_eobmax)); + const int64x1_t v_eobmax_xx32 = + vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210), 32); + const int16x4_t v_eobmax_tmp = + vmax_s16(v_eobmax_3210, vreinterpret_s16_s64(v_eobmax_xx32)); + const int64x1_t v_eobmax_xxx3 = + vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16); + const int16x4_t v_eobmax_final = + vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3)); + + return (uint16_t)vget_lane_s16(v_eobmax_final, 0); +#endif // __aarch64__ +} + +static VPX_FORCE_INLINE void load_fp_values(const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *dequant_ptr, + int16x8_t *round, int16x8_t *quant, + int16x8_t *dequant) { + *round = vld1q_s16(round_ptr); + *quant = vld1q_s16(quant_ptr); + *dequant = vld1q_s16(dequant_ptr); +} + +static VPX_FORCE_INLINE void update_fp_values(int16x8_t *v_round, + int16x8_t *v_quant, + int16x8_t *v_dequant) { +#ifdef __aarch64__ + *v_round = vdupq_laneq_s16(*v_round, 1); + *v_quant = vdupq_laneq_s16(*v_quant, 1); + *v_dequant = vdupq_laneq_s16(*v_dequant, 1); +#else + *v_round = vdupq_lane_s16(vget_low_s16(*v_round), 1); + *v_quant = vdupq_lane_s16(vget_low_s16(*v_quant), 1); + *v_dequant = vdupq_lane_s16(vget_low_s16(*v_dequant), 1); +#endif +} + +static VPX_FORCE_INLINE void quantize_fp_8( + const int16x8_t *v_round, const int16x8_t *v_quant, + const int16x8_t *v_dequant, const tran_low_t *coeff_ptr, + const int16_t *iscan_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + int16x8_t *v_eobmax) { + const int16x8_t v_zero = vdupq_n_s16(0); + const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr); + const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15); + const int16x8_t v_abs = vabsq_s16(v_coeff); + const int16x8_t v_tmp = vqaddq_s16(v_abs, *v_round); + const int32x4_t v_tmp_lo = + vmull_s16(vget_low_s16(v_tmp), vget_low_s16(*v_quant)); + const int32x4_t v_tmp_hi = + vmull_s16(vget_high_s16(v_tmp), vget_high_s16(*v_quant)); + const int16x8_t v_tmp2 = + vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16)); + const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero); + const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign); + const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign); + calculate_dqcoeff_and_store(v_qcoeff, *v_dequant, dqcoeff_ptr); + store_s16q_to_tran_low(qcoeff_ptr, v_qcoeff); + + *v_eobmax = get_max_lane_eob(iscan_ptr, *v_eobmax, v_nz_mask); +} + void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t count, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, @@ -50,84 +127,24 @@ void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t count, // Quantization pass: All coefficients with index >= zero_flag are // skippable. Note: zero_flag can be zero. int i; - const int16x8_t v_zero = vdupq_n_s16(0); - const int16x8_t v_one = vdupq_n_s16(1); - int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1); - int16x8_t v_round = vmovq_n_s16(round_ptr[1]); - int16x8_t v_quant = vmovq_n_s16(quant_ptr[1]); - int16x8_t v_dequant = vmovq_n_s16(dequant_ptr[1]); - + int16x8_t v_eobmax = vdupq_n_s16(-1); + int16x8_t v_round, v_quant, v_dequant; (void)scan; - // adjust for dc - v_round = vsetq_lane_s16(round_ptr[0], v_round, 0); - v_quant = vsetq_lane_s16(quant_ptr[0], v_quant, 0); - v_dequant = vsetq_lane_s16(dequant_ptr[0], v_dequant, 0); + load_fp_values(round_ptr, quant_ptr, dequant_ptr, &v_round, &v_quant, + &v_dequant); // process dc and the first seven ac coeffs - { - const int16x8_t v_iscan = vld1q_s16(&iscan[0]); - const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr); - const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15); - const int16x8_t v_abs = vabsq_s16(v_coeff); - const int16x8_t v_tmp = vqaddq_s16(v_abs, v_round); - const int32x4_t v_tmp_lo = - vmull_s16(vget_low_s16(v_tmp), vget_low_s16(v_quant)); - const int32x4_t v_tmp_hi = - vmull_s16(vget_high_s16(v_tmp), vget_high_s16(v_quant)); - const int16x8_t v_tmp2 = - vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16)); - const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero); - const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one); - const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1); - const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign); - const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign); - calculate_dqcoeff_and_store(v_qcoeff, v_dequant, dqcoeff_ptr); - v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan); - store_s16q_to_tran_low(qcoeff_ptr, v_qcoeff); - v_round = vmovq_n_s16(round_ptr[1]); - v_quant = vmovq_n_s16(quant_ptr[1]); - v_dequant = vmovq_n_s16(dequant_ptr[1]); - } + quantize_fp_8(&v_round, &v_quant, &v_dequant, coeff_ptr, iscan, qcoeff_ptr, + dqcoeff_ptr, &v_eobmax); + // now process the rest of the ac coeffs + update_fp_values(&v_round, &v_quant, &v_dequant); for (i = 8; i < count; i += 8) { - const int16x8_t v_iscan = vld1q_s16(&iscan[i]); - const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr + i); - const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15); - const int16x8_t v_abs = vabsq_s16(v_coeff); - const int16x8_t v_tmp = vqaddq_s16(v_abs, v_round); - const int32x4_t v_tmp_lo = - vmull_s16(vget_low_s16(v_tmp), vget_low_s16(v_quant)); - const int32x4_t v_tmp_hi = - vmull_s16(vget_high_s16(v_tmp), vget_high_s16(v_quant)); - const int16x8_t v_tmp2 = - vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16)); - const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero); - const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one); - const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1); - const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign); - const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign); - calculate_dqcoeff_and_store(v_qcoeff, v_dequant, dqcoeff_ptr + i); - v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan); - store_s16q_to_tran_low(qcoeff_ptr + i, v_qcoeff); + quantize_fp_8(&v_round, &v_quant, &v_dequant, coeff_ptr + i, iscan + i, + qcoeff_ptr + i, dqcoeff_ptr + i, &v_eobmax); } -#ifdef __aarch64__ - *eob_ptr = vmaxvq_s16(v_eobmax_76543210); -#else - { - const int16x4_t v_eobmax_3210 = vmax_s16(vget_low_s16(v_eobmax_76543210), - vget_high_s16(v_eobmax_76543210)); - const int64x1_t v_eobmax_xx32 = - vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210), 32); - const int16x4_t v_eobmax_tmp = - vmax_s16(v_eobmax_3210, vreinterpret_s16_s64(v_eobmax_xx32)); - const int64x1_t v_eobmax_xxx3 = - vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16); - const int16x4_t v_eobmax_final = - vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3)); - - *eob_ptr = (uint16_t)vget_lane_s16(v_eobmax_final, 0); - } -#endif // __aarch64__ + + *eob_ptr = get_max_eob(v_eobmax); } static INLINE int32x4_t extract_sign_bit(int32x4_t a) { From 33b385ec4e850b2663a0048667c34289e38473d2 Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 12 Aug 2022 22:24:39 -0700 Subject: [PATCH 0460/1000] configure: add -Wc++{14,17,20}-extensions the snapshot of googletest and the test files themselves are targeting c++11 currently; these warnings are supported by recent versions of clang Change-Id: I5d36b3bd4058ba1610f0c8b27cad27aadee85939 --- configure | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/configure b/configure index beea650329..425ab3cb30 100755 --- a/configure +++ b/configure @@ -666,6 +666,11 @@ process_toolchain() { check_add_cxxflags -Wno-psabi fi + # Enforce C++11 compatibility. + check_add_cxxflags -Wc++14-extensions + check_add_cxxflags -Wc++17-extensions + check_add_cxxflags -Wc++20-extensions + # disable some warnings specific to libyuv. check_cxxflags -Wno-missing-declarations \ && LIBYUV_CXXFLAGS="${LIBYUV_CXXFLAGS} -Wno-missing-declarations" From 14b8eaf7da0ceb3dc802cae9cd344b8788154077 Mon Sep 17 00:00:00 2001 From: James Zern Date: Sun, 14 Aug 2022 15:43:36 -0700 Subject: [PATCH 0461/1000] examples/svc_encodeframe.c: rm empty {}s in switch these have been unnecessary since: 0e97e7049 remove spatial svc experiment Bug: b/229626362 Change-Id: I57528af4dcb9092b752161c8eaba2e2808c29c5f --- examples/svc_encodeframe.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/examples/svc_encodeframe.c b/examples/svc_encodeframe.c index 08bda0e5c9..003096e701 100644 --- a/examples/svc_encodeframe.c +++ b/examples/svc_encodeframe.c @@ -552,11 +552,8 @@ vpx_codec_err_t vpx_svc_encode(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx, iter = NULL; while ((cx_pkt = vpx_codec_get_cx_data(codec_ctx, &iter))) { switch (cx_pkt->kind) { - case VPX_CODEC_PSNR_PKT: { - } - ++si->psnr_pkt_received; - break; - default: { break; } + case VPX_CODEC_PSNR_PKT: ++si->psnr_pkt_received; break; + default: break; } } From 763167aac7e58452dd6db5ce4db6703b88a20dec Mon Sep 17 00:00:00 2001 From: Scott LaVarnway Date: Mon, 15 Aug 2022 06:14:49 -0700 Subject: [PATCH 0462/1000] VPX: Add vp9_highbd_quantize_fp_neon(). Up to 4.1x faster than vp9_highbd_quantize_fp_c() for full calculations. ~1.3% overall encoder improvement for the test clip used. Bug: b/237714063 Change-Id: I8c6466bdbcf1c398b1d8b03cab4165c1d8556b0c --- test/vp9_quantize_test.cc | 11 +++ vp9/common/vp9_rtcd_defs.pl | 2 +- vp9/encoder/arm/neon/vp9_quantize_neon.c | 85 ++++++++++++++++++++++++ 3 files changed, 97 insertions(+), 1 deletion(-) diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc index b936350236..5c75c4b082 100644 --- a/test/vp9_quantize_test.cc +++ b/test/vp9_quantize_test.cc @@ -616,6 +616,16 @@ INSTANTIATE_TEST_SUITE_P( #endif // HAVE_AVX2 #if HAVE_NEON +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_SUITE_P( + NEON, VP9QuantizeTest, + ::testing::Values(make_tuple(&QuantFPWrapper, + &QuantFPWrapper, VPX_BITS_8, + 16, true), + make_tuple(&QuantFPWrapper, + &QuantFPWrapper, + VPX_BITS_12, 16, true))); +#else INSTANTIATE_TEST_SUITE_P( NEON, VP9QuantizeTest, ::testing::Values(make_tuple(&vpx_quantize_b_neon, &vpx_quantize_b_c, @@ -629,6 +639,7 @@ INSTANTIATE_TEST_SUITE_P( make_tuple(&QuantFPWrapper, &QuantFPWrapper, VPX_BITS_8, 32, true))); +#endif // CONFIG_VP9_HIGHBITDEPTH #endif // HAVE_NEON #if HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index 1b23e8a66f..62a597af5a 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -196,7 +196,7 @@ () # ENCODEMB INVOKE add_proto qw/void vp9_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; - specialize qw/vp9_highbd_quantize_fp avx2/; + specialize qw/vp9_highbd_quantize_fp avx2 neon/; add_proto qw/void vp9_highbd_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan" ; specialize qw/vp9_highbd_quantize_fp_32x32 avx2/; diff --git a/vp9/encoder/arm/neon/vp9_quantize_neon.c b/vp9/encoder/arm/neon/vp9_quantize_neon.c index 46c772da2e..a085372868 100644 --- a/vp9/encoder/arm/neon/vp9_quantize_neon.c +++ b/vp9/encoder/arm/neon/vp9_quantize_neon.c @@ -278,3 +278,88 @@ void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t count, #endif // __aarch64__ } } + +#if CONFIG_VP9_HIGHBITDEPTH +static VPX_FORCE_INLINE uint16x4_t +highbd_quantize_fp_4(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, int32x4_t v_quant_s32, + int32x4_t v_dequant_s32, int32x4_t v_round_s32) { + const int32x4_t v_coeff = vld1q_s32(coeff_ptr); + const int32x4_t v_coeff_sign = + vreinterpretq_s32_u32(vcltq_s32(v_coeff, vdupq_n_s32(0))); + const int32x4_t v_abs_coeff = vabsq_s32(v_coeff); + const int32x4_t v_tmp = vaddq_s32(v_abs_coeff, v_round_s32); + // const int abs_qcoeff = (int)((tmp * quant) >> 16); + const int32x4_t v_abs_qcoeff = vqdmulhq_s32(v_tmp, v_quant_s32); + // qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); + const int32x4_t v_qcoeff = + vsubq_s32(veorq_s32(v_abs_qcoeff, v_coeff_sign), v_coeff_sign); + const int32x4_t v_abs_dqcoeff = vmulq_s32(v_abs_qcoeff, v_dequant_s32); + // dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign); + const int32x4_t v_dqcoeff = + vsubq_s32(veorq_s32(v_abs_dqcoeff, v_coeff_sign), v_coeff_sign); + + vst1q_s32(qcoeff_ptr, v_qcoeff); + vst1q_s32(dqcoeff_ptr, v_dqcoeff); + + // Packed nz_qcoeff_mask. Used to find eob. + return vmovn_u32(vceqq_s32(v_abs_qcoeff, vdupq_n_s32(0))); +} + +void vp9_highbd_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t count, + const int16_t *round_ptr, + const int16_t *quant_ptr, + tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + const int16x4_t v_zero = vdup_n_s16(0); + const int16x4_t v_quant = vld1_s16(quant_ptr); + const int16x4_t v_dequant = vld1_s16(dequant_ptr); + const int16x4_t v_round = vld1_s16(round_ptr); + int32x4_t v_round_s32 = vaddl_s16(v_round, v_zero); + int32x4_t v_quant_s32 = vshlq_n_s32(vaddl_s16(v_quant, v_zero), 15); + int32x4_t v_dequant_s32 = vaddl_s16(v_dequant, v_zero); + uint16x4_t v_mask_lo, v_mask_hi; + int16x8_t v_eobmax = vdupq_n_s16(-1); + + (void)scan; + + // DC and first 3 AC + v_mask_lo = highbd_quantize_fp_4(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, + v_quant_s32, v_dequant_s32, v_round_s32); + + // overwrite the DC constants with AC constants + v_round_s32 = vdupq_lane_s32(vget_low_s32(v_round_s32), 1); + v_quant_s32 = vdupq_lane_s32(vget_low_s32(v_quant_s32), 1); + v_dequant_s32 = vdupq_lane_s32(vget_low_s32(v_dequant_s32), 1); + + // 4 more AC + v_mask_hi = + highbd_quantize_fp_4(coeff_ptr + 4, qcoeff_ptr + 4, dqcoeff_ptr + 4, + v_quant_s32, v_dequant_s32, v_round_s32); + + // Find the max lane eob for the first 8 coeffs. + v_eobmax = + get_max_lane_eob(iscan, v_eobmax, vcombine_u16(v_mask_lo, v_mask_hi)); + + count -= 8; + do { + coeff_ptr += 8; + qcoeff_ptr += 8; + dqcoeff_ptr += 8; + iscan += 8; + v_mask_lo = highbd_quantize_fp_4(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, + v_quant_s32, v_dequant_s32, v_round_s32); + v_mask_hi = + highbd_quantize_fp_4(coeff_ptr + 4, qcoeff_ptr + 4, dqcoeff_ptr + 4, + v_quant_s32, v_dequant_s32, v_round_s32); + // Find the max lane eob for 8 coeffs. + v_eobmax = + get_max_lane_eob(iscan, v_eobmax, vcombine_u16(v_mask_lo, v_mask_hi)); + count -= 8; + } while (count); + + *eob_ptr = get_max_eob(v_eobmax); +} +#endif // CONFIG_VP9_HIGHBITDEPTH From c13788bae9f19fe307cef0df9e638ca84bdd4e50 Mon Sep 17 00:00:00 2001 From: Scott LaVarnway Date: Fri, 12 Aug 2022 07:15:33 -0700 Subject: [PATCH 0463/1000] vp9_quantize_fp_32x32_neon() cleanup. No change in performance. Bug: b/237714063 Change-Id: If6ad5fc27de4babe0bfff3fdbb4b7fd99a0544ab --- vp9/encoder/arm/neon/vp9_quantize_neon.c | 161 ++++++++--------------- 1 file changed, 57 insertions(+), 104 deletions(-) diff --git a/vp9/encoder/arm/neon/vp9_quantize_neon.c b/vp9/encoder/arm/neon/vp9_quantize_neon.c index 46c772da2e..ec749d5397 100644 --- a/vp9/encoder/arm/neon/vp9_quantize_neon.c +++ b/vp9/encoder/arm/neon/vp9_quantize_neon.c @@ -151,52 +151,30 @@ static INLINE int32x4_t extract_sign_bit(int32x4_t a) { return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 31)); } -void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t count, - const int16_t *round_ptr, - const int16_t *quant_ptr, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { - const int16x8_t one = vdupq_n_s16(1); - const int16x8_t neg_one = vdupq_n_s16(-1); - - // ROUND_POWER_OF_TWO(round_ptr[], 1) - const int16x8_t round = vrshrq_n_s16(vld1q_s16(round_ptr), 1); - const int16x8_t quant = vld1q_s16(quant_ptr); - const int16x4_t dequant = vld1_s16(dequant_ptr); - // dequant >> 2 is used similar to zbin as a threshold. - const int16x8_t dequant_thresh = vshrq_n_s16(vld1q_s16(dequant_ptr), 2); +static VPX_FORCE_INLINE void quantize_fp_32x32_8( + const int16x8_t *v_round, const int16x8_t *v_quant, + const int16x8_t *v_dequant, const int16x8_t *dequant_thresh, + const tran_low_t *coeff_ptr, const int16_t *iscan_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, int16x8_t *v_eobmax) { + const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr); + const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15); + const int16x8_t v_coeff_abs = vabsq_s16(v_coeff); + const int16x8_t v_thr_mask = + vreinterpretq_s16_u16(vcgeq_s16(v_coeff_abs, *dequant_thresh)); + const int16x8_t v_tmp_rnd = + vandq_s16(vqaddq_s16(v_coeff_abs, *v_round), v_thr_mask); + const int16x8_t v_abs_qcoeff = vqdmulhq_s16(v_tmp_rnd, *v_quant); + const int16x8_t v_qcoeff = + vsubq_s16(veorq_s16(v_abs_qcoeff, v_coeff_sign), v_coeff_sign); + const uint16x8_t v_nz_mask = vceqq_s16(v_abs_qcoeff, vdupq_n_s16(0)); - // Process dc and the first seven ac coeffs. - const uint16x8_t v_iscan = - vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one)); - const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr); - const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15); - const int16x8_t coeff_abs = vabsq_s16(coeff); - const int16x8_t dequant_mask = - vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, dequant_thresh)); - - int16x8_t qcoeff = vqaddq_s16(coeff_abs, round); int32x4_t dqcoeff_0, dqcoeff_1; - uint16x8_t eob_max; - (void)scan; - (void)count; - - // coeff * quant_ptr[]) >> 15 - qcoeff = vqdmulhq_s16(qcoeff, quant); - - // Restore sign. - qcoeff = veorq_s16(qcoeff, coeff_sign); - qcoeff = vsubq_s16(qcoeff, coeff_sign); - qcoeff = vandq_s16(qcoeff, dequant_mask); - - // qcoeff * dequant[] / 2 - dqcoeff_0 = vmull_s16(vget_low_s16(qcoeff), dequant); - dqcoeff_1 = vmull_n_s16(vget_high_s16(qcoeff), dequant_ptr[1]); - + dqcoeff_0 = vmull_s16(vget_low_s16(v_qcoeff), vget_low_s16(*v_dequant)); + dqcoeff_1 = vmull_s16(vget_high_s16(v_qcoeff), vget_high_s16(*v_dequant)); // Add 1 if negative to round towards zero because the C uses division. dqcoeff_0 = vaddq_s32(dqcoeff_0, extract_sign_bit(dqcoeff_0)); dqcoeff_1 = vaddq_s32(dqcoeff_1, extract_sign_bit(dqcoeff_1)); + #if CONFIG_VP9_HIGHBITDEPTH vst1q_s32(dqcoeff_ptr, vshrq_n_s32(dqcoeff_0, 1)); vst1q_s32(dqcoeff_ptr + 4, vshrq_n_s32(dqcoeff_1, 1)); @@ -205,76 +183,51 @@ void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t count, vshrn_n_s32(dqcoeff_1, 1))); #endif - eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan); + store_s16q_to_tran_low(qcoeff_ptr, v_qcoeff); - store_s16q_to_tran_low(qcoeff_ptr, qcoeff); + *v_eobmax = get_max_lane_eob(iscan_ptr, *v_eobmax, v_nz_mask); +} - iscan += 8; - coeff_ptr += 8; - qcoeff_ptr += 8; - dqcoeff_ptr += 8; +void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t count, + const int16_t *round_ptr, + const int16_t *quant_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + int16x8_t eob_max = vdupq_n_s16(-1); + // ROUND_POWER_OF_TWO(round_ptr[], 1) + int16x8_t round = vrshrq_n_s16(vld1q_s16(round_ptr), 1); + int16x8_t quant = vld1q_s16(quant_ptr); + int16x8_t dequant = vld1q_s16(dequant_ptr); + // dequant >> 2 is used similar to zbin as a threshold. + int16x8_t dequant_thresh = vshrq_n_s16(vld1q_s16(dequant_ptr), 2); + int i; - { - int i; - const int16x8_t round = vrshrq_n_s16(vmovq_n_s16(round_ptr[1]), 1); - const int16x8_t quant = vmovq_n_s16(quant_ptr[1]); - const int16x8_t dequant_thresh = - vshrq_n_s16(vmovq_n_s16(dequant_ptr[1]), 2); - - // Process the rest of the ac coeffs. - for (i = 8; i < 32 * 32; i += 8) { - const uint16x8_t v_iscan = - vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one)); - const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr); - const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15); - const int16x8_t coeff_abs = vabsq_s16(coeff); - const int16x8_t dequant_mask = - vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, dequant_thresh)); - - int16x8_t qcoeff = vqaddq_s16(coeff_abs, round); - int32x4_t dqcoeff_0, dqcoeff_1; - - qcoeff = vqdmulhq_s16(qcoeff, quant); - qcoeff = veorq_s16(qcoeff, coeff_sign); - qcoeff = vsubq_s16(qcoeff, coeff_sign); - qcoeff = vandq_s16(qcoeff, dequant_mask); - - dqcoeff_0 = vmull_n_s16(vget_low_s16(qcoeff), dequant_ptr[1]); - dqcoeff_1 = vmull_n_s16(vget_high_s16(qcoeff), dequant_ptr[1]); - - dqcoeff_0 = vaddq_s32(dqcoeff_0, extract_sign_bit(dqcoeff_0)); - dqcoeff_1 = vaddq_s32(dqcoeff_1, extract_sign_bit(dqcoeff_1)); + (void)scan; + (void)count; -#if CONFIG_VP9_HIGHBITDEPTH - vst1q_s32(dqcoeff_ptr, vshrq_n_s32(dqcoeff_0, 1)); - vst1q_s32(dqcoeff_ptr + 4, vshrq_n_s32(dqcoeff_1, 1)); -#else - store_s16q_to_tran_low( - dqcoeff_ptr, - vcombine_s16(vshrn_n_s32(dqcoeff_0, 1), vshrn_n_s32(dqcoeff_1, 1))); -#endif + // Process dc and the first seven ac coeffs. + quantize_fp_32x32_8(&round, &quant, &dequant, &dequant_thresh, coeff_ptr, + iscan, qcoeff_ptr, dqcoeff_ptr, &eob_max); - eob_max = - vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan)); + update_fp_values(&round, &quant, &dequant); + dequant_thresh = vdupq_lane_s16(vget_low_s16(dequant_thresh), 1); - store_s16q_to_tran_low(qcoeff_ptr, qcoeff); + iscan += 8; + coeff_ptr += 8; + qcoeff_ptr += 8; + dqcoeff_ptr += 8; - iscan += 8; - coeff_ptr += 8; - qcoeff_ptr += 8; - dqcoeff_ptr += 8; - } + // Process the rest of the ac coeffs. + for (i = 8; i < 32 * 32; i += 8) { + quantize_fp_32x32_8(&round, &quant, &dequant, &dequant_thresh, coeff_ptr, + iscan, qcoeff_ptr, dqcoeff_ptr, &eob_max); -#ifdef __aarch64__ - *eob_ptr = vmaxvq_u16(eob_max); -#else - { - const uint16x4_t eob_max_0 = - vmax_u16(vget_low_u16(eob_max), vget_high_u16(eob_max)); - const uint16x4_t eob_max_1 = vpmax_u16(eob_max_0, eob_max_0); - const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1); - vst1_lane_u16(eob_ptr, eob_max_2, 0); - } -#endif // __aarch64__ + iscan += 8; + coeff_ptr += 8; + qcoeff_ptr += 8; + dqcoeff_ptr += 8; } + + *eob_ptr = get_max_eob(eob_max); } From fecec6293f622c1df36dbfd131d34956b4b712d8 Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 15 Aug 2022 17:26:41 -0700 Subject: [PATCH 0464/1000] simple_encode.cc: clear -Wextra-semi-stmt warnings fixes warnings of the form: ../vp9/simple_encode.cc:755:48: warning: empty expression statement has no effect; remove unnecessary ';' to silence this warning [-Wextra-semi-stmt] SET_STRUCT_VALUE(config, oxcf, ret, key_freq); Bug: b/229626362 Change-Id: I1c9b0ae9927cdd7c31da000633bcb6e2b8242cd4 --- vp9/simple_encode.cc | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/vp9/simple_encode.cc b/vp9/simple_encode.cc index 654699e1b2..f42912d35b 100644 --- a/vp9/simple_encode.cc +++ b/vp9/simple_encode.cc @@ -744,10 +744,12 @@ static void UpdateGroupOfPicture(const VP9_COMP *cpi, int start_coding_index, } #define SET_STRUCT_VALUE(config, structure, ret, field) \ - if (strcmp(config.name, #field) == 0) { \ - structure->field = atoi(config.value); \ - ret = 1; \ - } + do { \ + if (strcmp(config.name, #field) == 0) { \ + structure->field = atoi(config.value); \ + ret = 1; \ + } \ + } while (false) static void UpdateEncodeConfig(const EncodeConfig &config, VP9EncoderConfig *oxcf) { From 37a3999f5a70e5e88c6f22030ef8bb106990a8d7 Mon Sep 17 00:00:00 2001 From: Scott LaVarnway Date: Tue, 16 Aug 2022 06:12:21 -0700 Subject: [PATCH 0465/1000] Add vp9_highbd_quantize_fp_32x32_neon(). Up to 2.6x faster than vp9_highbd_quantize_fp_32x32_c() for full calculations. Bug: b/237714063 Change-Id: Icfeff2ad4dcd57d0ceb47fe04789710807b9cbad --- test/vp9_quantize_test.cc | 15 ++-- vp9/common/vp9_rtcd_defs.pl | 2 +- vp9/encoder/arm/neon/vp9_quantize_neon.c | 92 ++++++++++++++++++++++++ 3 files changed, 102 insertions(+), 7 deletions(-) diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc index 5c75c4b082..48c8180366 100644 --- a/test/vp9_quantize_test.cc +++ b/test/vp9_quantize_test.cc @@ -619,12 +619,15 @@ INSTANTIATE_TEST_SUITE_P( #if CONFIG_VP9_HIGHBITDEPTH INSTANTIATE_TEST_SUITE_P( NEON, VP9QuantizeTest, - ::testing::Values(make_tuple(&QuantFPWrapper, - &QuantFPWrapper, VPX_BITS_8, - 16, true), - make_tuple(&QuantFPWrapper, - &QuantFPWrapper, - VPX_BITS_12, 16, true))); + ::testing::Values( + make_tuple(&QuantFPWrapper, + &QuantFPWrapper, VPX_BITS_8, 16, true), + make_tuple(&QuantFPWrapper, + &QuantFPWrapper, VPX_BITS_12, 16, + true), + make_tuple(&QuantFPWrapper, + &QuantFPWrapper, VPX_BITS_12, + 32, true))); #else INSTANTIATE_TEST_SUITE_P( NEON, VP9QuantizeTest, diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index 62a597af5a..4290c2380e 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -199,7 +199,7 @@ () specialize qw/vp9_highbd_quantize_fp avx2 neon/; add_proto qw/void vp9_highbd_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan" ; - specialize qw/vp9_highbd_quantize_fp_32x32 avx2/; + specialize qw/vp9_highbd_quantize_fp_32x32 avx2 neon/; # fdct functions add_proto qw/void vp9_highbd_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; diff --git a/vp9/encoder/arm/neon/vp9_quantize_neon.c b/vp9/encoder/arm/neon/vp9_quantize_neon.c index 96dee5c6c1..945fd522e8 100644 --- a/vp9/encoder/arm/neon/vp9_quantize_neon.c +++ b/vp9/encoder/arm/neon/vp9_quantize_neon.c @@ -315,4 +315,96 @@ void vp9_highbd_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t count, *eob_ptr = get_max_eob(v_eobmax); } + +static VPX_FORCE_INLINE uint16x4_t +highbd_quantize_fp_32x32_4(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, int32x4_t v_quant_s32, + int32x4_t v_dequant_s32, int32x4_t v_round_s32) { + const int32x4_t v_coeff = vld1q_s32(coeff_ptr); + const int32x4_t v_coeff_sign = + vreinterpretq_s32_u32(vcltq_s32(v_coeff, vdupq_n_s32(0))); + const int32x4_t v_abs_coeff = vabsq_s32(v_coeff); + // ((abs_coeff << (1 + log_scale)) >= dequant_ptr[rc01]) + const int32x4_t v_abs_coeff_scaled = vshlq_n_s32(v_abs_coeff, 2); + const uint32x4_t v_mask = vcgeq_s32(v_abs_coeff_scaled, v_dequant_s32); + // const int64_t tmp = vmask ? (int64_t)abs_coeff + log_scaled_round : 0 + const int32x4_t v_tmp = vandq_s32(vaddq_s32(v_abs_coeff, v_round_s32), + vreinterpretq_s32_u32(v_mask)); + // const int abs_qcoeff = (int)((tmp * quant) >> (16 - log_scale)); + const int32x4_t v_abs_qcoeff = + vqdmulhq_s32(vshlq_n_s32(v_tmp, 1), v_quant_s32); + // qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); + const int32x4_t v_qcoeff = + vsubq_s32(veorq_s32(v_abs_qcoeff, v_coeff_sign), v_coeff_sign); + // vshlq_s32 will shift right if shift value is negative. + const int32x4_t v_abs_dqcoeff = + vshrq_n_s32(vmulq_s32(v_abs_qcoeff, v_dequant_s32), 1); + // dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign); + const int32x4_t v_dqcoeff = + vsubq_s32(veorq_s32(v_abs_dqcoeff, v_coeff_sign), v_coeff_sign); + + vst1q_s32(qcoeff_ptr, v_qcoeff); + vst1q_s32(dqcoeff_ptr, v_dqcoeff); + + // Packed nz_qcoeff_mask. Used to find eob. + return vmovn_u32(vceqq_s32(v_abs_qcoeff, vdupq_n_s32(0))); +} + +void vp9_highbd_quantize_fp_32x32_neon( + const tran_low_t *coeff_ptr, intptr_t count, const int16_t *round_ptr, + const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, + const int16_t *iscan) { + const int16x4_t v_quant = vld1_s16(quant_ptr); + const int16x4_t v_dequant = vld1_s16(dequant_ptr); + const int16x4_t v_zero = vdup_n_s16(0); + const int16x4_t v_round = + vqrdmulh_n_s16(vld1_s16(round_ptr), (int16_t)(1 << 14)); + int32x4_t v_round_s32 = vaddl_s16(v_round, v_zero); + int32x4_t v_quant_s32 = vshlq_n_s32(vaddl_s16(v_quant, v_zero), 15); + int32x4_t v_dequant_s32 = vaddl_s16(v_dequant, v_zero); + uint16x4_t v_mask_lo, v_mask_hi; + int16x8_t v_eobmax = vdupq_n_s16(-1); + + (void)scan; + + // DC and first 3 AC + v_mask_lo = + highbd_quantize_fp_32x32_4(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, + v_quant_s32, v_dequant_s32, v_round_s32); + + // overwrite the DC constants with AC constants + v_round_s32 = vdupq_lane_s32(vget_low_s32(v_round_s32), 1); + v_quant_s32 = vdupq_lane_s32(vget_low_s32(v_quant_s32), 1); + v_dequant_s32 = vdupq_lane_s32(vget_low_s32(v_dequant_s32), 1); + + // 4 more AC + v_mask_hi = + highbd_quantize_fp_32x32_4(coeff_ptr + 4, qcoeff_ptr + 4, dqcoeff_ptr + 4, + v_quant_s32, v_dequant_s32, v_round_s32); + + // Find the max lane eob for the first 8 coeffs. + v_eobmax = + get_max_lane_eob(iscan, v_eobmax, vcombine_u16(v_mask_lo, v_mask_hi)); + + count -= 8; + do { + coeff_ptr += 8; + qcoeff_ptr += 8; + dqcoeff_ptr += 8; + iscan += 8; + v_mask_lo = + highbd_quantize_fp_32x32_4(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, + v_quant_s32, v_dequant_s32, v_round_s32); + v_mask_hi = highbd_quantize_fp_32x32_4(coeff_ptr + 4, qcoeff_ptr + 4, + dqcoeff_ptr + 4, v_quant_s32, + v_dequant_s32, v_round_s32); + // Find the max lane eob for 8 coeffs. + v_eobmax = + get_max_lane_eob(iscan, v_eobmax, vcombine_u16(v_mask_lo, v_mask_hi)); + count -= 8; + } while (count); + + *eob_ptr = get_max_eob(v_eobmax); +} #endif // CONFIG_VP9_HIGHBITDEPTH From d22e5a49e38a1749afdbff167dec81fb5061c5c6 Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 16 Aug 2022 13:57:25 -0700 Subject: [PATCH 0466/1000] configure: add -Wno-pass-failed for libyuv with certain optimization flags or sanitizers enabled some code may fail to vectorize: third_party/libyuv/source/row_common.cc:3178:7: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning] this was observed with integer/undefined sanitizers using clang 11/13 Bug: b/229626362 Change-Id: I01595c641763c4cd4242e02f2cc5cbabfe69d03e --- configure | 2 ++ 1 file changed, 2 insertions(+) diff --git a/configure b/configure index 425ab3cb30..1b850b5e04 100755 --- a/configure +++ b/configure @@ -676,6 +676,8 @@ process_toolchain() { && LIBYUV_CXXFLAGS="${LIBYUV_CXXFLAGS} -Wno-missing-declarations" check_cxxflags -Wno-missing-prototypes \ && LIBYUV_CXXFLAGS="${LIBYUV_CXXFLAGS} -Wno-missing-prototypes" + check_cxxflags -Wno-pass-failed \ + && LIBYUV_CXXFLAGS="${LIBYUV_CXXFLAGS} -Wno-pass-failed" check_cxxflags -Wno-unused-parameter \ && LIBYUV_CXXFLAGS="${LIBYUV_CXXFLAGS} -Wno-unused-parameter" fi From c1fb6c6624072cef4785689d4f3ec02bff52266d Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 16 Aug 2022 16:48:00 -0700 Subject: [PATCH 0467/1000] variance_sse2.c: add some missing casts quiets integer sanitizer warnings of the form: ../vpx_dsp/x86/variance_sse2.c:100:10: runtime error: implicit conversion from type 'unsigned int' of value 4294966272 (32-bit, unsigned) to type 'int' changed the value to -1024 (32-bit, signed) Bug: b/229626362 Change-Id: I150cc0a6a6b85143c3bf96886686fe3a40897db5 --- vpx_dsp/x86/variance_sse2.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/vpx_dsp/x86/variance_sse2.c b/vpx_dsp/x86/variance_sse2.c index fedc8b84e5..d6eb12da1a 100644 --- a/vpx_dsp/x86/variance_sse2.c +++ b/vpx_dsp/x86/variance_sse2.c @@ -85,7 +85,7 @@ static INLINE void variance_final_512_pel_sse2(__m128i vsse, __m128i vsum, vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8)); vsum = _mm_unpacklo_epi16(vsum, vsum); vsum = _mm_srai_epi32(vsum, 16); - *sum = add32x4_sse2(vsum); + *sum = (int)add32x4_sse2(vsum); } static INLINE __m128i sum_to_32bit_sse2(const __m128i sum) { @@ -97,7 +97,7 @@ static INLINE __m128i sum_to_32bit_sse2(const __m128i sum) { // Can handle 1024 pixels' diff sum (such as 32x32) static INLINE int sum_final_sse2(const __m128i sum) { const __m128i t = sum_to_32bit_sse2(sum); - return add32x4_sse2(t); + return (int)add32x4_sse2(t); } static INLINE void variance4_sse2(const uint8_t *src_ptr, const int src_stride, @@ -349,7 +349,7 @@ unsigned int vpx_variance32x64_sse2(const uint8_t *src_ptr, int src_stride, vsum = _mm_add_epi32(vsum, sum_to_32bit_sse2(vsum16)); } *sse = add32x4_sse2(vsse); - sum = add32x4_sse2(vsum); + sum = (int)add32x4_sse2(vsum); return *sse - (unsigned int)(((int64_t)sum * sum) >> 11); } @@ -369,7 +369,7 @@ unsigned int vpx_variance64x32_sse2(const uint8_t *src_ptr, int src_stride, vsum = _mm_add_epi32(vsum, sum_to_32bit_sse2(vsum16)); } *sse = add32x4_sse2(vsse); - sum = add32x4_sse2(vsum); + sum = (int)add32x4_sse2(vsum); return *sse - (unsigned int)(((int64_t)sum * sum) >> 11); } @@ -389,7 +389,7 @@ unsigned int vpx_variance64x64_sse2(const uint8_t *src_ptr, int src_stride, vsum = _mm_add_epi32(vsum, sum_to_32bit_sse2(vsum16)); } *sse = add32x4_sse2(vsse); - sum = add32x4_sse2(vsum); + sum = (int)add32x4_sse2(vsum); return *sse - (unsigned int)(((int64_t)sum * sum) >> 12); } From d939886809d1bf8428d7bfb515fa8ff544c44fe7 Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 16 Aug 2022 16:52:06 -0700 Subject: [PATCH 0468/1000] load_unaligned_u32: use an int w/_mm_cvtsi32_si128 this matches the type of the function parameter; quiets integer sanitizer warnings of the form: implicit conversion from type 'uint32_t' (aka 'unsigned int') of value 3215646151 (32-bit, unsigned) to type 'int' changed the value to -1079321145 (32-bit, signed) Bug: b/229626362 Change-Id: Ia9a5dc5e1f57cbf4f8f8fa457bb674ef43369d37 --- vpx_dsp/x86/mem_sse2.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vpx_dsp/x86/mem_sse2.h b/vpx_dsp/x86/mem_sse2.h index 6df4773f73..031f361a41 100644 --- a/vpx_dsp/x86/mem_sse2.h +++ b/vpx_dsp/x86/mem_sse2.h @@ -27,7 +27,7 @@ static INLINE int32_t loadu_int32(const void *src) { } static INLINE __m128i load_unaligned_u32(const void *a) { - uint32_t val; + int val; memcpy(&val, a, sizeof(val)); return _mm_cvtsi32_si128(val); } From b77b6b68d3e7bf54db7ac2996a6b103cafa7c32c Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 16 Aug 2022 17:28:08 -0700 Subject: [PATCH 0469/1000] highbd_quantize_intrin_sse2: quiet int sanitizer warnings add a missing cast in ^ operations; quiets warnings of the form: implicit conversion from type 'int' of value -1 (32-bit, signed) to type 'unsigned int' changed the value to 4294967295 (32-bit, unsigned) Bug: b/229626362 Change-Id: I56f74981050b2c9d00bad20e68f1b73ce7454729 --- vpx_dsp/x86/highbd_quantize_intrin_sse2.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vpx_dsp/x86/highbd_quantize_intrin_sse2.c b/vpx_dsp/x86/highbd_quantize_intrin_sse2.c index 4535a0f7a2..1264fbed22 100644 --- a/vpx_dsp/x86/highbd_quantize_intrin_sse2.c +++ b/vpx_dsp/x86/highbd_quantize_intrin_sse2.c @@ -82,7 +82,8 @@ void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count, const int64_t tmp4 = ((tmp3 * quant_ptr[k != 0]) >> 16) + tmp3; const uint32_t abs_qcoeff = (uint32_t)((tmp4 * quant_shift_ptr[k != 0]) >> 16); - qcoeff_ptr[k] = (int)(abs_qcoeff ^ coeff_sign[j]) - coeff_sign[j]; + qcoeff_ptr[k] = + (int)(abs_qcoeff ^ (uint32_t)coeff_sign[j]) - coeff_sign[j]; dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0]; if (abs_qcoeff) eob_i = iscan[k] > eob_i ? iscan[k] : eob_i; } @@ -143,7 +144,7 @@ void vpx_highbd_quantize_b_32x32_sse2( const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; const uint32_t abs_qcoeff = (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15); - qcoeff_ptr[rc] = (int)(abs_qcoeff ^ coeff_sign) - coeff_sign; + qcoeff_ptr[rc] = (int)(abs_qcoeff ^ (uint32_t)coeff_sign) - coeff_sign; dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob; } From cb18d72c306b8400279191940ec0861af338bb5a Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 16 Aug 2022 17:54:14 -0700 Subject: [PATCH 0470/1000] webmdec,WebmInputContext: make timestamp_ns signed this matches the type returned from libwebm, which uses -1 as an error; quiets integer sanitizer warnings of the form: implicit conversion from type 'long long' of value -1 (64-bit, signed) to type 'uint64_t' (aka 'unsigned long') changed the value to 18446744073709551615 (64-bit, unsigned) Bug: b/229626362 Change-Id: Id3966912f802aee3c0f7852225b55f3057c3e76a --- webmdec.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webmdec.h b/webmdec.h index d8618b07d6..6ae7ee16d0 100644 --- a/webmdec.h +++ b/webmdec.h @@ -27,7 +27,7 @@ struct WebmInputContext { const void *block; int block_frame_index; int video_track_index; - uint64_t timestamp_ns; + int64_t timestamp_ns; int is_key_frame; int reached_eos; }; From a76a0228359723fe8b3c522ea0e7c2e2acb26ca8 Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 16 Aug 2022 17:46:24 -0700 Subject: [PATCH 0471/1000] vp8,VP8_COMP: normalize segment_encode_breakout type use unsigned int as the API value is of this type; this quiets some integer sanitizer warnings of the form: implicit conversion from type 'unsigned int' of value 2147483648 (32-bit, unsigned) to type 'int' changed the value to -2147483648 (32-bit, signed) Bug: b/229626362 Change-Id: I3d1ca618bf1b3cd57a5dca65a3067f351c1473f8 --- test/encode_api_test.cc | 4 ++-- vp8/encoder/onyx_int.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc index 08159148bd..ecdf928343 100644 --- a/test/encode_api_test.cc +++ b/test/encode_api_test.cc @@ -233,8 +233,8 @@ TEST(EncodeAPI, SetRoi) { roi.roi_map = roi_map; // VP8 only. This value isn't range checked. roi.static_threshold[1] = 1000; - roi.static_threshold[2] = INT_MIN; - roi.static_threshold[3] = INT_MAX; + roi.static_threshold[2] = UINT_MAX / 2 + 1; + roi.static_threshold[3] = UINT_MAX; for (const auto delta : { -63, -1, 0, 1, 63 }) { for (int i = 0; i < 8; ++i) { diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h index 424f51b180..726dcc9466 100644 --- a/vp8/encoder/onyx_int.h +++ b/vp8/encoder/onyx_int.h @@ -483,7 +483,7 @@ typedef struct VP8_COMP { unsigned char *segmentation_map; signed char segment_feature_data[MB_LVL_MAX][MAX_MB_SEGMENTS]; - int segment_encode_breakout[MAX_MB_SEGMENTS]; + unsigned int segment_encode_breakout[MAX_MB_SEGMENTS]; unsigned char *active_map; unsigned int active_map_enabled; From 9db0ec67e33fae4cfd066a7cf0fce9f81442c90e Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 16 Aug 2022 21:59:20 -0700 Subject: [PATCH 0472/1000] vpx_encoder.h: make flag constants unsigned this matches the type for vpx_codec_frame_flags_t and vpx_codec_er_flags_t and quiets int sanitizer warnings of the form: implicit conversion from type 'int' of value -9 (32-bit, signed) to type 'unsigned int' changed the value to 4294967287 (32-bit, unsigned) Bug: b/229626362 Change-Id: Icfc5993250f37cedb300c7032cab28ce4bec1f86 --- vpx/vpx_encoder.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/vpx/vpx_encoder.h b/vpx/vpx_encoder.h index 21254bb547..e776ec8136 100644 --- a/vpx/vpx_encoder.h +++ b/vpx/vpx_encoder.h @@ -115,14 +115,14 @@ typedef int64_t vpx_codec_pts_t; * support frame types that are codec specific (MPEG-1 D-frames for example) */ typedef uint32_t vpx_codec_frame_flags_t; -#define VPX_FRAME_IS_KEY 0x1 /**< frame is the start of a GOP */ +#define VPX_FRAME_IS_KEY 0x1u /**< frame is the start of a GOP */ /*!\brief frame can be dropped without affecting the stream (no future frame * depends on this one) */ -#define VPX_FRAME_IS_DROPPABLE 0x2 +#define VPX_FRAME_IS_DROPPABLE 0x2u /*!\brief frame should be decoded but will not be shown */ -#define VPX_FRAME_IS_INVISIBLE 0x4 +#define VPX_FRAME_IS_INVISIBLE 0x4u /*!\brief this is a fragment of the encoded frame */ -#define VPX_FRAME_IS_FRAGMENT 0x8 +#define VPX_FRAME_IS_FRAGMENT 0x8u /*!\brief Error Resilient flags * @@ -132,12 +132,12 @@ typedef uint32_t vpx_codec_frame_flags_t; */ typedef uint32_t vpx_codec_er_flags_t; /*!\brief Improve resiliency against losses of whole frames */ -#define VPX_ERROR_RESILIENT_DEFAULT 0x1 +#define VPX_ERROR_RESILIENT_DEFAULT 0x1u /*!\brief The frame partitions are independently decodable by the bool decoder, * meaning that partitions can be decoded even though earlier partitions have * been lost. Note that intra prediction is still done over the partition * boundary. */ -#define VPX_ERROR_RESILIENT_PARTITIONS 0x2 +#define VPX_ERROR_RESILIENT_PARTITIONS 0x2u /*!\brief Encoder output packet variants * From 7a0e2bf1bc8c6e4154deaa29c9a3de882fee2000 Mon Sep 17 00:00:00 2001 From: Scott LaVarnway Date: Thu, 18 Aug 2022 12:19:08 -0700 Subject: [PATCH 0473/1000] Fix TEST_P(SADx4Test, DISABLED_Speed) The reference code was being timed instead of the optimized code. Change-Id: I67eb08dcda80e20eaa075dc2c91b7e8ef5c0cdfb --- test/sad_test.cc | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/test/sad_test.cc b/test/sad_test.cc index 2506f1adbc..960bd499ba 100644 --- a/test/sad_test.cc +++ b/test/sad_test.cc @@ -517,14 +517,12 @@ TEST_P(SADx4Test, DISABLED_Speed) { uint32_t reference_sad[4]; DECLARE_ALIGNED(kDataAlignment, uint32_t, exp_sad[4]); vpx_usec_timer timer; - - memset(reference_sad, 0, sizeof(reference_sad)); - SADs(exp_sad); + for (int block = 0; block < 4; ++block) { + reference_sad[block] = ReferenceSAD(GetBlockRefOffset(block)); + } vpx_usec_timer_start(&timer); for (int i = 0; i < kCountSpeedTestBlock; ++i) { - for (int block = 0; block < 4; ++block) { - reference_sad[block] = ReferenceSAD(GetBlockRefOffset(block)); - } + SADs(exp_sad); } vpx_usec_timer_mark(&timer); for (int block = 0; block < 4; ++block) { From df619cb823c99df68ba9cf4ce4cccb101b6080ac Mon Sep 17 00:00:00 2001 From: James Zern Date: Thu, 18 Aug 2022 11:45:49 -0700 Subject: [PATCH 0474/1000] loopfilter.c: normalize flat func param type flat/flat2 are stored as int8_t as returned by the filter_mask* functions. this quiets integer sanitizer warnings of the form: vpx_dsp/loopfilter.c:197:28: runtime error: implicit conversion from type 'int8_t' (aka 'signed char') of value -1 (8-bit, signed) to type 'uint8_t' (aka 'unsigned char') changed the value to 255 (8-bit, unsigned) Bug: b/229626362 Change-Id: Iacb6ae052d4cb2b6e0ebccbacf59ece9501d3b5f --- vpx_dsp/loopfilter.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/vpx_dsp/loopfilter.c b/vpx_dsp/loopfilter.c index 9956028317..d6504aab1f 100644 --- a/vpx_dsp/loopfilter.c +++ b/vpx_dsp/loopfilter.c @@ -159,7 +159,7 @@ void vpx_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, vpx_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1); } -static INLINE void filter8(int8_t mask, uint8_t thresh, uint8_t flat, +static INLINE void filter8(int8_t mask, uint8_t thresh, int8_t flat, uint8_t *op3, uint8_t *op2, uint8_t *op1, uint8_t *op0, uint8_t *oq0, uint8_t *oq1, uint8_t *oq2, uint8_t *oq3) { @@ -232,8 +232,8 @@ void vpx_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, vpx_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1); } -static INLINE void filter16(int8_t mask, uint8_t thresh, uint8_t flat, - uint8_t flat2, uint8_t *op7, uint8_t *op6, +static INLINE void filter16(int8_t mask, uint8_t thresh, int8_t flat, + int8_t flat2, uint8_t *op7, uint8_t *op6, uint8_t *op5, uint8_t *op4, uint8_t *op3, uint8_t *op2, uint8_t *op1, uint8_t *op0, uint8_t *oq0, uint8_t *oq1, uint8_t *oq2, @@ -505,7 +505,7 @@ void vpx_highbd_lpf_vertical_4_dual_c( bd); } -static INLINE void highbd_filter8(int8_t mask, uint8_t thresh, uint8_t flat, +static INLINE void highbd_filter8(int8_t mask, uint8_t thresh, int8_t flat, uint16_t *op3, uint16_t *op2, uint16_t *op1, uint16_t *op0, uint16_t *oq0, uint16_t *oq1, uint16_t *oq2, uint16_t *oq3, int bd) { @@ -584,8 +584,8 @@ void vpx_highbd_lpf_vertical_8_dual_c( bd); } -static INLINE void highbd_filter16(int8_t mask, uint8_t thresh, uint8_t flat, - uint8_t flat2, uint16_t *op7, uint16_t *op6, +static INLINE void highbd_filter16(int8_t mask, uint8_t thresh, int8_t flat, + int8_t flat2, uint16_t *op7, uint16_t *op6, uint16_t *op5, uint16_t *op4, uint16_t *op3, uint16_t *op2, uint16_t *op1, uint16_t *op0, uint16_t *oq0, uint16_t *oq1, uint16_t *oq2, From 2694c7bc92463520d55764566ffebf694a98704a Mon Sep 17 00:00:00 2001 From: James Zern Date: Thu, 18 Aug 2022 11:56:25 -0700 Subject: [PATCH 0475/1000] update_thresh_freq_fact_row_mt: normalize param types make source_variance unsigned; this matches update_thresh_freq_fact() and the type of the MACROBLOCK member. quiets integer sanitizer warnings of the form: vp9/encoder/vp9_pickmode.c:2710:58: runtime error: implicit conversion from type 'unsigned int' of value 4294967295 (32-bit, unsigned) to type 'int' changed the value to -1 (32-bit, signed) Bug: b/229626362 Change-Id: I812c6ca914507bf25cad323dea3d91a3a2ea4f1d --- vp9/encoder/vp9_pickmode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c index 697c589ab3..90dbc427e7 100644 --- a/vp9/encoder/vp9_pickmode.c +++ b/vp9/encoder/vp9_pickmode.c @@ -1112,7 +1112,7 @@ static INLINE int rd_less_than_thresh_row_mt(int64_t best_rd, int thresh, } static INLINE void update_thresh_freq_fact_row_mt( - VP9_COMP *cpi, TileDataEnc *tile_data, int source_variance, + VP9_COMP *cpi, TileDataEnc *tile_data, unsigned int source_variance, int thresh_freq_fact_idx, MV_REFERENCE_FRAME ref_frame, THR_MODES best_mode_idx, PREDICTION_MODE mode) { THR_MODES thr_mode_idx = mode_idx[ref_frame][mode_offset(mode)]; From c7358d801627ea7509a2a7c0d9fce9b72c14dfed Mon Sep 17 00:00:00 2001 From: James Zern Date: Thu, 18 Aug 2022 11:57:32 -0700 Subject: [PATCH 0476/1000] vp9,search_new_mv: descale rather than scale sse this changes from scaling best sse to downscaling base sse in comparisons. this quiets an integer sanitizer warning of the form: vp9/encoder/vp9_pickmode.c:1632:41: runtime error: left shift of 4294967295 by 1 places cannot be represented in type 'unsigned int' Bug: b/229626362 Change-Id: Iee2920474ba700a46177d4514ba6ef7691958069 --- vp9/encoder/vp9_pickmode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c index 697c589ab3..d8fffe23ee 100644 --- a/vp9/encoder/vp9_pickmode.c +++ b/vp9/encoder/vp9_pickmode.c @@ -1627,9 +1627,9 @@ static int search_new_mv(VP9_COMP *cpi, MACROBLOCK *x, return -1; // Exit NEWMV search if base_mv_sse is large. - if (sf->base_mv_aggressive && base_mv_sse > (best_sse_sofar << scale)) + if (sf->base_mv_aggressive && (base_mv_sse >> scale) > best_sse_sofar) return -1; - if (base_mv_sse < (best_sse_sofar << 1)) { + if ((base_mv_sse >> 1) < best_sse_sofar) { // Base layer mv is good. // Exit NEWMV search if the base_mv is (0, 0) and sse is low, since // (0, 0) mode is already tested. From 002b6b1ce05c2810cb858188b29aafe785bbc01a Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 17 Aug 2022 19:20:25 -0700 Subject: [PATCH 0477/1000] compiler_attributes.h: add VPX_NO_UNSIGNED_SHIFT_CHECK and use it on MD5Transform(); this behavior is well defined and is only a warning with -fsanitize=integer, not -fsanitize=undefined. quiets warnings of the form: md5_utils.c:163:3: runtime error: left shift of 143704723 by 7 places cannot be represented in type 'unsigned int' Bug: b/229626362 Change-Id: I60a384b2c2556f5ce71ad8ebce050329aba0b4e4 --- md5_utils.c | 4 ++-- vpx_ports/compiler_attributes.h | 12 +++++++++++- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/md5_utils.c b/md5_utils.c index c4106525f2..abd8d43c39 100644 --- a/md5_utils.c +++ b/md5_utils.c @@ -151,8 +151,8 @@ void MD5Final(md5byte digest[16], struct MD5Context *ctx) { * reflect the addition of 16 longwords of new data. MD5Update blocks * the data and converts bytes into longwords for this routine. */ -VPX_NO_UNSIGNED_OVERFLOW_CHECK void MD5Transform(UWORD32 buf[4], - UWORD32 const in[16]) { +VPX_NO_UNSIGNED_OVERFLOW_CHECK VPX_NO_UNSIGNED_SHIFT_CHECK void MD5Transform( + UWORD32 buf[4], UWORD32 const in[16]) { UWORD32 a, b, c, d; a = buf[0]; diff --git a/vpx_ports/compiler_attributes.h b/vpx_ports/compiler_attributes.h index 354352016c..4b468749b8 100644 --- a/vpx_ports/compiler_attributes.h +++ b/vpx_ports/compiler_attributes.h @@ -29,13 +29,23 @@ #endif // __has_feature(address_sanitizer) || defined(__SANITIZE_ADDRESS__) #if defined(__clang__) && __has_attribute(no_sanitize) +// Both of these have defined behavior and are used in certain operations or +// optimizations thereof. There are cases where an overflow may be unintended, +// however, so use of these attributes should be done with care. #define VPX_NO_UNSIGNED_OVERFLOW_CHECK \ __attribute__((no_sanitize("unsigned-integer-overflow"))) -#endif +#if __clang_major__ >= 12 +#define VPX_NO_UNSIGNED_SHIFT_CHECK \ + __attribute__((no_sanitize("unsigned-shift-base"))) +#endif // __clang__ >= 12 +#endif // __clang__ #ifndef VPX_NO_UNSIGNED_OVERFLOW_CHECK #define VPX_NO_UNSIGNED_OVERFLOW_CHECK #endif +#ifndef VPX_NO_UNSIGNED_SHIFT_CHECK +#define VPX_NO_UNSIGNED_SHIFT_CHECK +#endif //------------------------------------------------------------------------------ // Variable attributes. From b55ef982b0537b4c3d63486e55dcb8fff5fa1d78 Mon Sep 17 00:00:00 2001 From: James Zern Date: Thu, 18 Aug 2022 11:35:06 -0700 Subject: [PATCH 0478/1000] use VPX_NO_UNSIGNED_SHIFT_CHECK with entropy functions these shift values off the most significant bit as part of the process; vp8_regular_quantize_b_sse4_1 is included here for a special case of mask creation quiets warnings of the form: vp8/decoder/dboolhuff.h:81:11: runtime error: left shift of 2373679303235599696 by 3 places cannot be represented in type 'VP8_BD_VALUE' (aka 'unsigned long') vp8/encoder/bitstream.c:257:18: runtime error: left shift of 2147493041 by 1 places cannot be represented in type 'unsigned int' vp8/encoder/x86/quantize_sse4.c:114:18: runtime error: left shift of 4294967294 by 1 places cannot be represented in type 'unsigned int' vp9/encoder/vp9_pickmode.c:1632:41: runtime error: left shift of 4294967295 by 1 places cannot be represented in type 'unsigned int' Bug: b/229626362 Change-Id: Iabed118b2a094232783e5ad0e586596d874103ca --- vp8/decoder/dboolhuff.h | 4 +++- vp8/encoder/bitstream.c | 5 ++++- vp8/encoder/x86/quantize_sse4.c | 5 ++++- vpx_dsp/bitwriter.h | 5 ++++- 4 files changed, 15 insertions(+), 4 deletions(-) diff --git a/vp8/decoder/dboolhuff.h b/vp8/decoder/dboolhuff.h index f2a18f0d90..673b2fbd5d 100644 --- a/vp8/decoder/dboolhuff.h +++ b/vp8/decoder/dboolhuff.h @@ -15,6 +15,7 @@ #include #include "./vpx_config.h" +#include "vpx_ports/compiler_attributes.h" #include "vpx_ports/mem.h" #include "vpx/vp8dx.h" #include "vpx/vpx_integer.h" @@ -50,7 +51,8 @@ int vp8dx_start_decode(BOOL_DECODER *br, const unsigned char *source, void vp8dx_bool_decoder_fill(BOOL_DECODER *br); -static int vp8dx_decode_bool(BOOL_DECODER *br, int probability) { +static VPX_NO_UNSIGNED_SHIFT_CHECK int vp8dx_decode_bool(BOOL_DECODER *br, + int probability) { unsigned int bit = 0; VP8_BD_VALUE value; unsigned int split; diff --git a/vp8/encoder/bitstream.c b/vp8/encoder/bitstream.c index 0e97af5f2e..190b013afd 100644 --- a/vp8/encoder/bitstream.c +++ b/vp8/encoder/bitstream.c @@ -19,6 +19,7 @@ #include #include "vpx/vpx_encoder.h" #include "vpx_mem/vpx_mem.h" +#include "vpx_ports/compiler_attributes.h" #include "vpx_ports/system_state.h" #include "bitstream.h" @@ -117,7 +118,9 @@ static void write_split(vp8_writer *bc, int x) { vp8_mbsplit_encodings + x); } -void vp8_pack_tokens(vp8_writer *w, const TOKENEXTRA *p, int xcount) { +void VPX_NO_UNSIGNED_SHIFT_CHECK vp8_pack_tokens(vp8_writer *w, + const TOKENEXTRA *p, + int xcount) { const TOKENEXTRA *stop = p + xcount; unsigned int split; int shift; diff --git a/vp8/encoder/x86/quantize_sse4.c b/vp8/encoder/x86/quantize_sse4.c index 6d03365fcb..4c2d24cc27 100644 --- a/vp8/encoder/x86/quantize_sse4.c +++ b/vp8/encoder/x86/quantize_sse4.c @@ -13,8 +13,11 @@ #include "./vp8_rtcd.h" #include "vp8/encoder/block.h" #include "vpx_ports/bitops.h" /* get_lsb */ +#include "vpx_ports/compiler_attributes.h" -void vp8_regular_quantize_b_sse4_1(BLOCK *b, BLOCKD *d) { +// Unsigned shift overflow is disabled for the use of ~1U << eob with ymask. +VPX_NO_UNSIGNED_SHIFT_CHECK void vp8_regular_quantize_b_sse4_1(BLOCK *b, + BLOCKD *d) { int eob = -1; short *zbin_boost_ptr = b->zrun_zbin_boost; __m128i zbin_boost0 = _mm_load_si128((__m128i *)(zbin_boost_ptr)); diff --git a/vpx_dsp/bitwriter.h b/vpx_dsp/bitwriter.h index 04084af8f2..5f1ee69ec2 100644 --- a/vpx_dsp/bitwriter.h +++ b/vpx_dsp/bitwriter.h @@ -13,6 +13,7 @@ #include +#include "vpx_ports/compiler_attributes.h" #include "vpx_ports/mem.h" #include "vpx_dsp/prob.h" @@ -35,7 +36,9 @@ typedef struct vpx_writer { void vpx_start_encode(vpx_writer *br, uint8_t *source); void vpx_stop_encode(vpx_writer *br); -static INLINE void vpx_write(vpx_writer *br, int bit, int probability) { +static INLINE VPX_NO_UNSIGNED_SHIFT_CHECK void vpx_write(vpx_writer *br, + int bit, + int probability) { unsigned int split; int count = br->count; unsigned int range = br->range; From 595bf7022afd2e7116bbfc0e1ee38c482e6d3811 Mon Sep 17 00:00:00 2001 From: James Zern Date: Thu, 18 Aug 2022 17:51:19 -0700 Subject: [PATCH 0479/1000] vp9.read_inter_block_mode_info: return on corruption with block sizes < 8x8 previously only the inner loop was aborted. this could cause propagation of invalid motion vectors to scale_mv(). this quiets integer sanitizer warnings of the form: vp9/common/vp9_mvref_common.h:239:18: runtime error: implicit conversion from type 'int' of value 32768 (32-bit, signed) to type 'int16_t' (aka 'short') changed the value to -32768 (16-bit, signed) Bug: b/229626362 Change-Id: I58b5a425adf21542cbf4cc4dd5ab3cc5ed008264 --- vp9/decoder/vp9_decodemv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c index 8a8d2ad86e..f4bfb785f7 100644 --- a/vp9/decoder/vp9_decodemv.c +++ b/vp9/decoder/vp9_decodemv.c @@ -755,7 +755,7 @@ static void read_inter_block_mode_info(VP9Decoder *const pbi, if (!assign_mv(cm, xd, b_mode, mi->bmi[j].as_mv, best_ref_mvs, best_sub8x8, is_compound, allow_hp, r)) { xd->corrupted |= 1; - break; + return; } if (num_4x4_h == 2) mi->bmi[j + 2] = mi->bmi[j]; From ebf4caa85791f0fb35d7e7dc717d61652f53e6d8 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Fri, 12 Aug 2022 17:41:11 +0000 Subject: [PATCH 0480/1000] [NEON] Added vpx_highbd_quantize_b* functions Total gain for 12-bit encoding: * ~4.8% for best profile * ~6.2% for rt profile Change-Id: I61e646ab7aedf06a25db1365d6d1cf7b05101c21 --- test/vp9_quantize_test.cc | 23 ++- vpx_dsp/arm/highbd_quantize_neon.c | 318 +++++++++++++++++++++++++++++ vpx_dsp/vpx_dsp.mk | 1 + vpx_dsp/vpx_dsp_rtcd_defs.pl | 4 +- 4 files changed, 337 insertions(+), 9 deletions(-) create mode 100644 vpx_dsp/arm/highbd_quantize_neon.c diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc index 48c8180366..4ecdd91b06 100644 --- a/test/vp9_quantize_test.cc +++ b/test/vp9_quantize_test.cc @@ -620,14 +620,23 @@ INSTANTIATE_TEST_SUITE_P( INSTANTIATE_TEST_SUITE_P( NEON, VP9QuantizeTest, ::testing::Values( + make_tuple(&vpx_highbd_quantize_b_neon, &vpx_highbd_quantize_b_c, + VPX_BITS_8, 16, false), + make_tuple(&vpx_highbd_quantize_b_neon, &vpx_highbd_quantize_b_c, + VPX_BITS_10, 16, false), + make_tuple(&vpx_highbd_quantize_b_neon, &vpx_highbd_quantize_b_c, + VPX_BITS_12, 16, false), + make_tuple(&vpx_highbd_quantize_b_32x32_neon, + &vpx_highbd_quantize_b_32x32_c, VPX_BITS_8, 32, false), + make_tuple(&vpx_highbd_quantize_b_32x32_neon, + &vpx_highbd_quantize_b_32x32_c, VPX_BITS_10, 32, false), + make_tuple(&vpx_highbd_quantize_b_32x32_neon, + &vpx_highbd_quantize_b_32x32_c, VPX_BITS_12, 32, false), make_tuple(&QuantFPWrapper, - &QuantFPWrapper, VPX_BITS_8, 16, true), - make_tuple(&QuantFPWrapper, - &QuantFPWrapper, VPX_BITS_12, 16, - true), - make_tuple(&QuantFPWrapper, - &QuantFPWrapper, VPX_BITS_12, - 32, true))); + &QuantFPWrapper, VPX_BITS_8, 16, true), + make_tuple(&QuantFPWrapper, + &QuantFPWrapper, VPX_BITS_8, 32, + true))); #else INSTANTIATE_TEST_SUITE_P( NEON, VP9QuantizeTest, diff --git a/vpx_dsp/arm/highbd_quantize_neon.c b/vpx_dsp/arm/highbd_quantize_neon.c new file mode 100644 index 0000000000..4ce432e1f9 --- /dev/null +++ b/vpx_dsp/arm/highbd_quantize_neon.c @@ -0,0 +1,318 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/mem_neon.h" + +static VPX_FORCE_INLINE void highbd_calculate_dqcoeff_and_store( + const int32x4_t dqcoeff_0, const int32x4_t dqcoeff_1, + tran_low_t *dqcoeff_ptr) { + vst1q_s32(dqcoeff_ptr, dqcoeff_0); + vst1q_s32(dqcoeff_ptr + 4, dqcoeff_1); +} + +static VPX_FORCE_INLINE void highbd_quantize_8_neon( + const int32x4_t coeff_0, const int32x4_t coeff_1, const int32x4_t zbin, + const int32x4_t round, const int32x4_t quant, const int32x4_t quant_shift, + int32x4_t *qcoeff_0, int32x4_t *qcoeff_1) { + // Load coeffs as 2 vectors of 4 x 32-bit ints each, take sign and abs values + const int32x4_t coeff_0_sign = vshrq_n_s32(coeff_0, 31); + const int32x4_t coeff_1_sign = vshrq_n_s32(coeff_1, 31); + const int32x4_t coeff_0_abs = vabsq_s32(coeff_0); + const int32x4_t coeff_1_abs = vabsq_s32(coeff_1); + + // Calculate 2 masks of elements outside the bin + const int32x4_t zbin_mask_0 = + vreinterpretq_s32_u32(vcgeq_s32(coeff_0_abs, zbin)); + const int32x4_t zbin_mask_1 = vreinterpretq_s32_u32( + vcgeq_s32(coeff_1_abs, vdupq_lane_s32(vget_low_s32(zbin), 1))); + + // Get the rounded values + const int32x4_t rounded_0 = vaddq_s32(coeff_0_abs, round); + const int32x4_t rounded_1 = + vaddq_s32(coeff_1_abs, vdupq_lane_s32(vget_low_s32(round), 1)); + + // (round * (quant << 15) * 2) >> 16 == (round * quant) + int32x4_t qcoeff_tmp_0 = vqdmulhq_s32(rounded_0, quant); + int32x4_t qcoeff_tmp_1 = + vqdmulhq_s32(rounded_1, vdupq_lane_s32(vget_low_s32(quant), 1)); + + // Add rounded values + qcoeff_tmp_0 = vaddq_s32(qcoeff_tmp_0, rounded_0); + qcoeff_tmp_1 = vaddq_s32(qcoeff_tmp_1, rounded_1); + + // (round * (quant_shift << 15) * 2) >> 16 == (round * quant_shift) + qcoeff_tmp_0 = vqdmulhq_s32(qcoeff_tmp_0, quant_shift); + qcoeff_tmp_1 = + vqdmulhq_s32(qcoeff_tmp_1, vdupq_lane_s32(vget_low_s32(quant_shift), 1)); + + // Restore the sign bit. + qcoeff_tmp_0 = veorq_s32(qcoeff_tmp_0, coeff_0_sign); + qcoeff_tmp_1 = veorq_s32(qcoeff_tmp_1, coeff_1_sign); + qcoeff_tmp_0 = vsubq_s32(qcoeff_tmp_0, coeff_0_sign); + qcoeff_tmp_1 = vsubq_s32(qcoeff_tmp_1, coeff_1_sign); + + // Only keep the relevant coeffs + *qcoeff_0 = vandq_s32(qcoeff_tmp_0, zbin_mask_0); + *qcoeff_1 = vandq_s32(qcoeff_tmp_1, zbin_mask_1); +} + +static VPX_FORCE_INLINE int16x8_t +highbd_quantize_b_neon(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int32x4_t zbin, + const int32x4_t round, const int32x4_t quant, + const int32x4_t quant_shift, const int32x4_t dequant) { + int32x4_t qcoeff_0, qcoeff_1, dqcoeff_0, dqcoeff_1; + + // Load coeffs as 2 vectors of 4 x 32-bit ints each, take sign and abs values + const int32x4_t coeff_0 = vld1q_s32(coeff_ptr); + const int32x4_t coeff_1 = vld1q_s32(coeff_ptr + 4); + highbd_quantize_8_neon(coeff_0, coeff_1, zbin, round, quant, quant_shift, + &qcoeff_0, &qcoeff_1); + + // Store the 32-bit qcoeffs + vst1q_s32(qcoeff_ptr, qcoeff_0); + vst1q_s32(qcoeff_ptr + 4, qcoeff_1); + + // Calculate and store the dqcoeffs + dqcoeff_0 = vmulq_s32(qcoeff_0, dequant); + dqcoeff_1 = vmulq_s32(qcoeff_1, vdupq_lane_s32(vget_low_s32(dequant), 1)); + + highbd_calculate_dqcoeff_and_store(dqcoeff_0, dqcoeff_1, dqcoeff_ptr); + + return vcombine_s16(vmovn_s32(qcoeff_0), vmovn_s32(qcoeff_1)); +} + +void vpx_highbd_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, + const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + const int16x8_t one = vdupq_n_s16(1); + const int16x8_t neg_one = vdupq_n_s16(-1); + uint16x8_t eob_max; + + // Only the first element of each vector is DC. + // High half has identical elements, but we can reconstruct it from the low + // half by duplicating the 2nd element. So we only need to pass a 4x32-bit + // vector + int32x4_t zbin = vmovl_s16(vld1_s16(zbin_ptr)); + int32x4_t round = vmovl_s16(vld1_s16(round_ptr)); + // Extend the quant, quant_shift vectors to ones of 32-bit elements + // scale to high-half, so we can use vqdmulhq_s32 + int32x4_t quant = vshlq_n_s32(vmovl_s16(vld1_s16(quant_ptr)), 15); + int32x4_t quant_shift = vshlq_n_s32(vmovl_s16(vld1_s16(quant_shift_ptr)), 15); + int32x4_t dequant = vmovl_s16(vld1_s16(dequant_ptr)); + + // Process first 8 values which include a dc component. + { + // Add one because the eob does not index from 0. + const uint16x8_t v_iscan = + vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one)); + + const int16x8_t qcoeff = + highbd_quantize_b_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round, + quant, quant_shift, dequant); + + // Set non-zero elements to -1 and use that to extract values for eob. + eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan); + + __builtin_prefetch(coeff_ptr + 64); + + coeff_ptr += 8; + iscan += 8; + qcoeff_ptr += 8; + dqcoeff_ptr += 8; + } + + n_coeffs -= 8; + + { + zbin = vdupq_lane_s32(vget_low_s32(zbin), 1); + round = vdupq_lane_s32(vget_low_s32(round), 1); + quant = vdupq_lane_s32(vget_low_s32(quant), 1); + quant_shift = vdupq_lane_s32(vget_low_s32(quant_shift), 1); + dequant = vdupq_lane_s32(vget_low_s32(dequant), 1); + + do { + // Add one because the eob is not its index. + const uint16x8_t v_iscan = + vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one)); + + const int16x8_t qcoeff = + highbd_quantize_b_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, + round, quant, quant_shift, dequant); + + // Set non-zero elements to -1 and use that to extract values for eob. + eob_max = + vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan)); + + __builtin_prefetch(coeff_ptr + 64); + coeff_ptr += 8; + iscan += 8; + qcoeff_ptr += 8; + dqcoeff_ptr += 8; + n_coeffs -= 8; + } while (n_coeffs > 0); + } + +#ifdef __aarch64__ + *eob_ptr = vmaxvq_u16(eob_max); +#else + { + const uint16x4_t eob_max_0 = + vmax_u16(vget_low_u16(eob_max), vget_high_u16(eob_max)); + const uint16x4_t eob_max_1 = vpmax_u16(eob_max_0, eob_max_0); + const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1); + vst1_lane_u16(eob_ptr, eob_max_2, 0); + } +#endif // __aarch64__ + // Need these here, else the compiler complains about mixing declarations and + // code in C90 + (void)n_coeffs; + (void)scan; +} + +static VPX_FORCE_INLINE int32x4_t extract_sign_bit(int32x4_t a) { + return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 31)); +} + +static VPX_FORCE_INLINE void highbd_calculate_dqcoeff_and_store_32x32( + int32x4_t dqcoeff_0, int32x4_t dqcoeff_1, tran_low_t *dqcoeff_ptr) { + // Add 1 if negative to round towards zero because the C uses division. + dqcoeff_0 = vaddq_s32(dqcoeff_0, extract_sign_bit(dqcoeff_0)); + dqcoeff_1 = vaddq_s32(dqcoeff_1, extract_sign_bit(dqcoeff_1)); + + dqcoeff_0 = vshrq_n_s32(dqcoeff_0, 1); + dqcoeff_1 = vshrq_n_s32(dqcoeff_1, 1); + vst1q_s32(dqcoeff_ptr, dqcoeff_0); + vst1q_s32(dqcoeff_ptr + 4, dqcoeff_1); +} + +static VPX_FORCE_INLINE int16x8_t highbd_quantize_b_32x32_neon( + const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int32x4_t zbin, const int32x4_t round, + const int32x4_t quant, const int32x4_t quant_shift, + const int32x4_t dequant) { + int32x4_t qcoeff_0, qcoeff_1, dqcoeff_0, dqcoeff_1; + + // Load coeffs as 2 vectors of 4 x 32-bit ints each, take sign and abs values + const int32x4_t coeff_0 = vld1q_s32(coeff_ptr); + const int32x4_t coeff_1 = vld1q_s32(coeff_ptr + 4); + highbd_quantize_8_neon(coeff_0, coeff_1, zbin, round, quant, quant_shift, + &qcoeff_0, &qcoeff_1); + + // Store the 32-bit qcoeffs + vst1q_s32(qcoeff_ptr, qcoeff_0); + vst1q_s32(qcoeff_ptr + 4, qcoeff_1); + + // Calculate and store the dqcoeffs + dqcoeff_0 = vmulq_s32(qcoeff_0, dequant); + dqcoeff_1 = vmulq_s32(qcoeff_1, vdupq_lane_s32(vget_low_s32(dequant), 1)); + + highbd_calculate_dqcoeff_and_store_32x32(dqcoeff_0, dqcoeff_1, dqcoeff_ptr); + + return vcombine_s16(vmovn_s32(qcoeff_0), vmovn_s32(qcoeff_1)); +} + +void vpx_highbd_quantize_b_32x32_neon( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + const int16x8_t one = vdupq_n_s16(1); + const int16x8_t neg_one = vdupq_n_s16(-1); + uint16x8_t eob_max; + int i; + + // Only the first element of each vector is DC. + // High half has identical elements, but we can reconstruct it from the low + // half by duplicating the 2nd element. So we only need to pass a 4x32-bit + // vector + int32x4_t zbin = vrshrq_n_s32(vmovl_s16(vld1_s16(zbin_ptr)), 1); + int32x4_t round = vrshrq_n_s32(vmovl_s16(vld1_s16(round_ptr)), 1); + // Extend the quant, quant_shift vectors to ones of 32-bit elements + // scale to high-half, so we can use vqdmulhq_s32 + int32x4_t quant = vshlq_n_s32(vmovl_s16(vld1_s16(quant_ptr)), 15); + int32x4_t quant_shift = vshlq_n_s32(vmovl_s16(vld1_s16(quant_shift_ptr)), 16); + int32x4_t dequant = vmovl_s16(vld1_s16(dequant_ptr)); + + // Process first 8 values which include a dc component. + { + // Add one because the eob does not index from 0. + const uint16x8_t v_iscan = + vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one)); + + const int16x8_t qcoeff = + highbd_quantize_b_32x32_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, + round, quant, quant_shift, dequant); + + // Set non-zero elements to -1 and use that to extract values for eob. + eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan); + + __builtin_prefetch(coeff_ptr + 64); + coeff_ptr += 8; + iscan += 8; + qcoeff_ptr += 8; + dqcoeff_ptr += 8; + } + + { + zbin = vdupq_lane_s32(vget_low_s32(zbin), 1); + round = vdupq_lane_s32(vget_low_s32(round), 1); + quant = vdupq_lane_s32(vget_low_s32(quant), 1); + quant_shift = vdupq_lane_s32(vget_low_s32(quant_shift), 1); + dequant = vdupq_lane_s32(vget_low_s32(dequant), 1); + + for (i = 1; i < 32 * 32 / 8; ++i) { + // Add one because the eob is not its index. + const uint16x8_t v_iscan = + vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one)); + + const int16x8_t qcoeff = + highbd_quantize_b_32x32_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, + round, quant, quant_shift, dequant); + + // Set non-zero elements to -1 and use that to extract values for eob. + eob_max = + vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan)); + + __builtin_prefetch(coeff_ptr + 64); + coeff_ptr += 8; + iscan += 8; + qcoeff_ptr += 8; + dqcoeff_ptr += 8; + } + } + +#ifdef __aarch64__ + *eob_ptr = vmaxvq_u16(eob_max); +#else + { + const uint16x4_t eob_max_0 = + vmax_u16(vget_low_u16(eob_max), vget_high_u16(eob_max)); + const uint16x4_t eob_max_1 = vpmax_u16(eob_max_0, eob_max_0); + const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1); + vst1_lane_u16(eob_ptr, eob_max_2, 0); + } +#endif // __aarch64__ + // Need these here, else the compiler complains about mixing declarations and + // code in C90 + (void)n_coeffs; + (void)scan; +} diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index ffe954832d..4f17425ccd 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -333,6 +333,7 @@ DSP_SRCS-$(HAVE_LSX) += loongarch/quantize_lsx.c ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_SSE2) += x86/highbd_quantize_intrin_sse2.c DSP_SRCS-$(HAVE_AVX2) += x86/highbd_quantize_intrin_avx2.c +DSP_SRCS-$(HAVE_NEON) += arm/highbd_quantize_neon.c endif # avg diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index db211ed8ce..cab74f93e0 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -718,10 +718,10 @@ () if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; - specialize qw/vpx_highbd_quantize_b sse2 avx2/; + specialize qw/vpx_highbd_quantize_b neon sse2 avx2/; add_proto qw/void vpx_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; - specialize qw/vpx_highbd_quantize_b_32x32 sse2 avx2/; + specialize qw/vpx_highbd_quantize_b_32x32 neon sse2 avx2/; } # CONFIG_VP9_HIGHBITDEPTH } # CONFIG_VP9_ENCODER From a8980078a133d98319468866580cfc866b0cc5d7 Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 22 Aug 2022 10:48:40 -0700 Subject: [PATCH 0481/1000] highbd_quantize_neon.c: remove unneeded assert.h Change-Id: I041f5fb23b856a2b519669b5bf8a40d3772b4a6e --- vpx_dsp/arm/highbd_quantize_neon.c | 1 - 1 file changed, 1 deletion(-) diff --git a/vpx_dsp/arm/highbd_quantize_neon.c b/vpx_dsp/arm/highbd_quantize_neon.c index 4ce432e1f9..502a9c972d 100644 --- a/vpx_dsp/arm/highbd_quantize_neon.c +++ b/vpx_dsp/arm/highbd_quantize_neon.c @@ -9,7 +9,6 @@ */ #include -#include #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" From d050161f0d627ab118308ab17ce8d0e040116459 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Fri, 19 Aug 2022 22:00:42 +0000 Subject: [PATCH 0482/1000] [NEON] Added vpx_highbd_sad* functions Total gain for 12-bit encoding: * ~7.8% for best profile * ~10% for rt profile Change-Id: I89eda5c4372a5b628c9df84cdeb4c8486fc44789 --- test/sad_test.cc | 118 ++++++++++++++++++ vpx_dsp/arm/highbd_sad_neon.c | 225 ++++++++++++++++++++++++++++++++++ vpx_dsp/vpx_dsp.mk | 1 + vpx_dsp/vpx_dsp_rtcd_defs.pl | 74 +++++------ 4 files changed, 383 insertions(+), 35 deletions(-) create mode 100644 vpx_dsp/arm/highbd_sad_neon.c diff --git a/test/sad_test.cc b/test/sad_test.cc index 960bd499ba..4fb2af6244 100644 --- a/test/sad_test.cc +++ b/test/sad_test.cc @@ -727,6 +727,45 @@ const SadMxNParam neon_tests[] = { SadMxNParam(8, 4, &vpx_sad8x4_neon), SadMxNParam(4, 8, &vpx_sad4x8_neon), SadMxNParam(4, 4, &vpx_sad4x4_neon), +#if CONFIG_VP9_HIGHBITDEPTH + SadMxNParam(4, 4, &vpx_highbd_sad4x4_neon, 8), + SadMxNParam(4, 8, &vpx_highbd_sad4x8_neon, 8), + SadMxNParam(8, 4, &vpx_highbd_sad8x4_neon, 8), + SadMxNParam(8, 8, &vpx_highbd_sad8x8_neon, 8), + SadMxNParam(8, 16, &vpx_highbd_sad8x16_neon, 8), + SadMxNParam(16, 8, &vpx_highbd_sad16x8_neon, 8), + SadMxNParam(16, 16, &vpx_highbd_sad16x16_neon, 8), + SadMxNParam(16, 32, &vpx_highbd_sad16x32_neon, 8), + SadMxNParam(32, 32, &vpx_highbd_sad32x32_neon, 8), + SadMxNParam(32, 64, &vpx_highbd_sad32x64_neon, 8), + SadMxNParam(64, 32, &vpx_highbd_sad64x32_neon, 8), + SadMxNParam(64, 64, &vpx_highbd_sad64x64_neon, 8), + SadMxNParam(4, 4, &vpx_highbd_sad4x4_neon, 10), + SadMxNParam(4, 8, &vpx_highbd_sad4x8_neon, 10), + SadMxNParam(8, 4, &vpx_highbd_sad8x4_neon, 10), + SadMxNParam(8, 8, &vpx_highbd_sad8x8_neon, 10), + SadMxNParam(8, 16, &vpx_highbd_sad8x16_neon, 10), + SadMxNParam(16, 8, &vpx_highbd_sad16x8_neon, 10), + SadMxNParam(16, 16, &vpx_highbd_sad16x16_neon, 10), + SadMxNParam(16, 32, &vpx_highbd_sad16x32_neon, 10), + SadMxNParam(32, 32, &vpx_highbd_sad32x32_neon, 10), + SadMxNParam(32, 64, &vpx_highbd_sad32x64_neon, 10), + SadMxNParam(64, 32, &vpx_highbd_sad64x32_neon, 10), + SadMxNParam(64, 64, &vpx_highbd_sad64x64_neon, 10), + SadMxNParam(4, 4, &vpx_highbd_sad4x4_neon, 12), + SadMxNParam(4, 8, &vpx_highbd_sad4x8_neon, 12), + SadMxNParam(8, 4, &vpx_highbd_sad8x4_neon, 12), + SadMxNParam(8, 8, &vpx_highbd_sad8x8_neon, 12), + SadMxNParam(8, 16, &vpx_highbd_sad8x16_neon, 12), + SadMxNParam(16, 8, &vpx_highbd_sad16x8_neon, 12), + SadMxNParam(16, 16, &vpx_highbd_sad16x16_neon, 12), + SadMxNParam(16, 32, &vpx_highbd_sad16x32_neon, 12), + SadMxNParam(32, 32, &vpx_highbd_sad32x32_neon, 12), + SadMxNParam(32, 64, &vpx_highbd_sad32x64_neon, 12), + SadMxNParam(64, 32, &vpx_highbd_sad64x32_neon, 12), + SadMxNParam(64, 64, &vpx_highbd_sad64x64_neon, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH + }; INSTANTIATE_TEST_SUITE_P(NEON, SADTest, ::testing::ValuesIn(neon_tests)); @@ -744,6 +783,47 @@ const SadMxNAvgParam avg_neon_tests[] = { SadMxNAvgParam(8, 4, &vpx_sad8x4_avg_neon), SadMxNAvgParam(4, 8, &vpx_sad4x8_avg_neon), SadMxNAvgParam(4, 4, &vpx_sad4x4_avg_neon), +#if CONFIG_VP9_HIGHBITDEPTH + SadMxNAvgParam(4, 4, &vpx_highbd_sad4x4_avg_neon, 8), + SadMxNAvgParam(4, 8, &vpx_highbd_sad4x8_avg_neon, 8), + SadMxNAvgParam(8, 4, &vpx_highbd_sad8x4_avg_neon, 8), + SadMxNAvgParam(8, 8, &vpx_highbd_sad8x8_avg_neon, 8), + SadMxNAvgParam(8, 16, &vpx_highbd_sad8x16_avg_neon, 8), + SadMxNAvgParam(16, 8, &vpx_highbd_sad16x8_avg_neon, 8), + SadMxNAvgParam(16, 16, &vpx_highbd_sad16x16_avg_neon, 8), + SadMxNAvgParam(16, 32, &vpx_highbd_sad16x32_avg_neon, 8), + SadMxNAvgParam(32, 16, &vpx_highbd_sad32x16_avg_neon, 8), + SadMxNAvgParam(32, 32, &vpx_highbd_sad32x32_avg_neon, 8), + SadMxNAvgParam(32, 64, &vpx_highbd_sad32x64_avg_neon, 8), + SadMxNAvgParam(64, 32, &vpx_highbd_sad64x32_avg_neon, 8), + SadMxNAvgParam(64, 64, &vpx_highbd_sad64x64_avg_neon, 8), + SadMxNAvgParam(4, 4, &vpx_highbd_sad4x4_avg_neon, 10), + SadMxNAvgParam(4, 8, &vpx_highbd_sad4x8_avg_neon, 10), + SadMxNAvgParam(8, 4, &vpx_highbd_sad8x4_avg_neon, 10), + SadMxNAvgParam(8, 8, &vpx_highbd_sad8x8_avg_neon, 10), + SadMxNAvgParam(8, 16, &vpx_highbd_sad8x16_avg_neon, 10), + SadMxNAvgParam(16, 8, &vpx_highbd_sad16x8_avg_neon, 10), + SadMxNAvgParam(16, 16, &vpx_highbd_sad16x16_avg_neon, 10), + SadMxNAvgParam(16, 32, &vpx_highbd_sad16x32_avg_neon, 10), + SadMxNAvgParam(32, 16, &vpx_highbd_sad32x16_avg_neon, 10), + SadMxNAvgParam(32, 32, &vpx_highbd_sad32x32_avg_neon, 10), + SadMxNAvgParam(32, 64, &vpx_highbd_sad32x64_avg_neon, 10), + SadMxNAvgParam(64, 32, &vpx_highbd_sad64x32_avg_neon, 10), + SadMxNAvgParam(64, 64, &vpx_highbd_sad64x64_avg_neon, 10), + SadMxNAvgParam(4, 4, &vpx_highbd_sad4x4_avg_neon, 12), + SadMxNAvgParam(4, 8, &vpx_highbd_sad4x8_avg_neon, 12), + SadMxNAvgParam(8, 4, &vpx_highbd_sad8x4_avg_neon, 12), + SadMxNAvgParam(8, 8, &vpx_highbd_sad8x8_avg_neon, 12), + SadMxNAvgParam(8, 16, &vpx_highbd_sad8x16_avg_neon, 12), + SadMxNAvgParam(16, 8, &vpx_highbd_sad16x8_avg_neon, 12), + SadMxNAvgParam(16, 16, &vpx_highbd_sad16x16_avg_neon, 12), + SadMxNAvgParam(16, 32, &vpx_highbd_sad16x32_avg_neon, 12), + SadMxNAvgParam(32, 16, &vpx_highbd_sad32x16_avg_neon, 12), + SadMxNAvgParam(32, 32, &vpx_highbd_sad32x32_avg_neon, 12), + SadMxNAvgParam(32, 64, &vpx_highbd_sad32x64_avg_neon, 12), + SadMxNAvgParam(64, 32, &vpx_highbd_sad64x32_avg_neon, 12), + SadMxNAvgParam(64, 64, &vpx_highbd_sad64x64_avg_neon, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH }; INSTANTIATE_TEST_SUITE_P(NEON, SADavgTest, ::testing::ValuesIn(avg_neon_tests)); @@ -761,6 +841,44 @@ const SadMxNx4Param x4d_neon_tests[] = { SadMxNx4Param(8, 4, &vpx_sad8x4x4d_neon), SadMxNx4Param(4, 8, &vpx_sad4x8x4d_neon), SadMxNx4Param(4, 4, &vpx_sad4x4x4d_neon), +#if CONFIG_VP9_HIGHBITDEPTH + SadMxNx4Param(4, 4, &vpx_highbd_sad4x4x4d_neon, 8), + SadMxNx4Param(4, 8, &vpx_highbd_sad4x8x4d_neon, 8), + SadMxNx4Param(8, 4, &vpx_highbd_sad8x4x4d_neon, 8), + SadMxNx4Param(8, 8, &vpx_highbd_sad8x8x4d_neon, 8), + SadMxNx4Param(8, 16, &vpx_highbd_sad8x16x4d_neon, 8), + SadMxNx4Param(16, 8, &vpx_highbd_sad16x8x4d_neon, 8), + SadMxNx4Param(16, 16, &vpx_highbd_sad16x16x4d_neon, 8), + SadMxNx4Param(16, 32, &vpx_highbd_sad16x32x4d_neon, 8), + SadMxNx4Param(32, 32, &vpx_highbd_sad32x32x4d_neon, 8), + SadMxNx4Param(32, 64, &vpx_highbd_sad32x64x4d_neon, 8), + SadMxNx4Param(64, 32, &vpx_highbd_sad64x32x4d_neon, 8), + SadMxNx4Param(64, 64, &vpx_highbd_sad64x64x4d_neon, 8), + SadMxNx4Param(4, 4, &vpx_highbd_sad4x4x4d_neon, 10), + SadMxNx4Param(4, 8, &vpx_highbd_sad4x8x4d_neon, 10), + SadMxNx4Param(8, 4, &vpx_highbd_sad8x4x4d_neon, 10), + SadMxNx4Param(8, 8, &vpx_highbd_sad8x8x4d_neon, 10), + SadMxNx4Param(8, 16, &vpx_highbd_sad8x16x4d_neon, 10), + SadMxNx4Param(16, 8, &vpx_highbd_sad16x8x4d_neon, 10), + SadMxNx4Param(16, 16, &vpx_highbd_sad16x16x4d_neon, 10), + SadMxNx4Param(16, 32, &vpx_highbd_sad16x32x4d_neon, 10), + SadMxNx4Param(32, 32, &vpx_highbd_sad32x32x4d_neon, 10), + SadMxNx4Param(32, 64, &vpx_highbd_sad32x64x4d_neon, 10), + SadMxNx4Param(64, 32, &vpx_highbd_sad64x32x4d_neon, 10), + SadMxNx4Param(64, 64, &vpx_highbd_sad64x64x4d_neon, 10), + SadMxNx4Param(4, 4, &vpx_highbd_sad4x4x4d_neon, 12), + SadMxNx4Param(4, 8, &vpx_highbd_sad4x8x4d_neon, 12), + SadMxNx4Param(8, 4, &vpx_highbd_sad8x4x4d_neon, 12), + SadMxNx4Param(8, 8, &vpx_highbd_sad8x8x4d_neon, 12), + SadMxNx4Param(8, 16, &vpx_highbd_sad8x16x4d_neon, 12), + SadMxNx4Param(16, 8, &vpx_highbd_sad16x8x4d_neon, 12), + SadMxNx4Param(16, 16, &vpx_highbd_sad16x16x4d_neon, 12), + SadMxNx4Param(16, 32, &vpx_highbd_sad16x32x4d_neon, 12), + SadMxNx4Param(32, 32, &vpx_highbd_sad32x32x4d_neon, 12), + SadMxNx4Param(32, 64, &vpx_highbd_sad32x64x4d_neon, 12), + SadMxNx4Param(64, 32, &vpx_highbd_sad64x32x4d_neon, 12), + SadMxNx4Param(64, 64, &vpx_highbd_sad64x64x4d_neon, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH }; INSTANTIATE_TEST_SUITE_P(NEON, SADx4Test, ::testing::ValuesIn(x4d_neon_tests)); #endif // HAVE_NEON diff --git a/vpx_dsp/arm/highbd_sad_neon.c b/vpx_dsp/arm/highbd_sad_neon.c new file mode 100644 index 0000000000..ecb52ce5a5 --- /dev/null +++ b/vpx_dsp/arm/highbd_sad_neon.c @@ -0,0 +1,225 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" + +#include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" + +static VPX_FORCE_INLINE uint32_t highbd_sad4_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int width, + int height) { + int i, j; + uint32x4_t sum_abs_diff = vdupq_n_u32(0); + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr); + for (i = 0; i < height; i++) { + for (j = 0; j < width; j += 4) { + const uint16x4_t src_u16 = vld1_u16(src16_ptr + j); + const uint16x4_t ref_u16 = vld1_u16(ref16_ptr + j); + sum_abs_diff = vabal_u16(sum_abs_diff, src_u16, ref_u16); + } + src16_ptr += src_stride; + ref16_ptr += ref_stride; + } + + return horizontal_add_uint32x4(sum_abs_diff); +} + +static VPX_FORCE_INLINE uint32_t highbd_sad8_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int width, + int height) { + int i, j; + uint32x4_t sum_abs_diff = vdupq_n_u32(0); + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr); + for (i = 0; i < height; i++) { + for (j = 0; j < width; j += 8) { + const uint16x8_t src_u16 = vld1q_u16(src16_ptr + j); + const uint16x8_t ref_u16 = vld1q_u16(ref16_ptr + j); + sum_abs_diff = + vabal_u16(sum_abs_diff, vget_low_u16(src_u16), vget_low_u16(ref_u16)); + sum_abs_diff = vabal_u16(sum_abs_diff, vget_high_u16(src_u16), + vget_high_u16(ref_u16)); + } + src16_ptr += src_stride; + ref16_ptr += ref_stride; + } + + return horizontal_add_uint32x4(sum_abs_diff); +} + +static VPX_FORCE_INLINE uint32_t highbd_sad4_avg_neon( + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, + int ref_stride, const uint8_t *second_pred, int width, int height) { + int i, j; + uint32x4_t sum_abs_diff = vdupq_n_u32(0); + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr); + const uint16_t *pred_ptr = CONVERT_TO_SHORTPTR(second_pred); + for (i = 0; i < height; i++) { + for (j = 0; j < width; j += 4) { + const uint16x4_t a_u16 = vld1_u16(src16_ptr + j); + const uint16x4_t b_u16 = vld1_u16(ref16_ptr + j); + const uint16x4_t c_u16 = vld1_u16(pred_ptr + j); + const uint16x4_t avg = vrhadd_u16(b_u16, c_u16); + sum_abs_diff = vabal_u16(sum_abs_diff, a_u16, avg); + } + src16_ptr += src_stride; + ref16_ptr += ref_stride; + pred_ptr += width; + } + + return horizontal_add_uint32x4(sum_abs_diff); +} + +static VPX_FORCE_INLINE uint32_t highbd_sad8_avg_neon( + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, + int ref_stride, const uint8_t *second_pred, int width, int height) { + int i, j; + uint32x4_t sum_abs_diff = vdupq_n_u32(0); + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr); + const uint16_t *pred_ptr = CONVERT_TO_SHORTPTR(second_pred); + for (i = 0; i < height; i++) { + for (j = 0; j < width; j += 8) { + const uint16x8_t a_u16 = vld1q_u16(src16_ptr + j); + const uint16x8_t b_u16 = vld1q_u16(ref16_ptr + j); + const uint16x8_t c_u16 = vld1q_u16(pred_ptr + j); + const uint16x8_t avg = vrhaddq_u16(b_u16, c_u16); + sum_abs_diff = + vabal_u16(sum_abs_diff, vget_low_u16(a_u16), vget_low_u16(avg)); + sum_abs_diff = + vabal_u16(sum_abs_diff, vget_high_u16(a_u16), vget_high_u16(avg)); + } + src16_ptr += src_stride; + ref16_ptr += ref_stride; + pred_ptr += width; + } + + return horizontal_add_uint32x4(sum_abs_diff); +} + +#define highbd_sad4MxN(m, n) \ + unsigned int vpx_highbd_sad##m##x##n##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride) { \ + return highbd_sad4_neon(src_ptr, src_stride, ref_ptr, ref_stride, m, n); \ + } + +#define highbd_sadMxN(m, n) \ + unsigned int vpx_highbd_sad##m##x##n##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride) { \ + return highbd_sad8_neon(src_ptr, src_stride, ref_ptr, ref_stride, m, n); \ + } + +#define highbd_sad4MxN_avg(m, n) \ + unsigned int vpx_highbd_sad##m##x##n##_avg_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred) { \ + return highbd_sad4_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, \ + second_pred, m, n); \ + } + +#define highbd_sadMxN_avg(m, n) \ + unsigned int vpx_highbd_sad##m##x##n##_avg_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred) { \ + return highbd_sad8_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, \ + second_pred, m, n); \ + } + +#define highbd_sadMxNx4D(m, n) \ + void vpx_highbd_sad##m##x##n##x4d_neon( \ + const uint8_t *src_ptr, int src_stride, \ + const uint8_t *const ref_array[4], int ref_stride, \ + uint32_t sad_array[4]) { \ + int i; \ + for (i = 0; i < 4; ++i) { \ + sad_array[i] = vpx_highbd_sad##m##x##n##_neon(src_ptr, src_stride, \ + ref_array[i], ref_stride); \ + } \ + } + +/* clang-format off */ +// 4x4 +highbd_sad4MxN(4, 4) +highbd_sad4MxN_avg(4, 4) +highbd_sadMxNx4D(4, 4) + +// 4x8 +highbd_sad4MxN(4, 8) +highbd_sad4MxN_avg(4, 8) +highbd_sadMxNx4D(4, 8) + +// 8x4 +highbd_sadMxN(8, 4) +highbd_sadMxN_avg(8, 4) +highbd_sadMxNx4D(8, 4) + +// 8x8 +highbd_sadMxN(8, 8) +highbd_sadMxN_avg(8, 8) +highbd_sadMxNx4D(8, 8) + +// 8x16 +highbd_sadMxN(8, 16) +highbd_sadMxN_avg(8, 16) +highbd_sadMxNx4D(8, 16) + +// 16x8 +highbd_sadMxN(16, 8) +highbd_sadMxN_avg(16, 8) +highbd_sadMxNx4D(16, 8) + +// 16x16 +highbd_sadMxN(16, 16) +highbd_sadMxN_avg(16, 16) +highbd_sadMxNx4D(16, 16) + +// 16x32 +highbd_sadMxN(16, 32) +highbd_sadMxN_avg(16, 32) +highbd_sadMxNx4D(16, 32) + +// 32x16 +highbd_sadMxN(32, 16) +highbd_sadMxN_avg(32, 16) +highbd_sadMxNx4D(32, 16) + +// 32x32 +highbd_sadMxN(32, 32) +highbd_sadMxN_avg(32, 32) +highbd_sadMxNx4D(32, 32) + +// 32x64 +highbd_sadMxN(32, 64) +highbd_sadMxN_avg(32, 64) +highbd_sadMxNx4D(32, 64) + +// 64x32 +highbd_sadMxN(64, 32) +highbd_sadMxN_avg(64, 32) +highbd_sadMxNx4D(64, 32) + +// 64x64 +highbd_sadMxN(64, 64) +highbd_sadMxN_avg(64, 64) +highbd_sadMxNx4D(64, 64) + /* clang-format on */ diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index 4f17425ccd..1a03aed526 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -392,6 +392,7 @@ DSP_SRCS-$(HAVE_LSX) += loongarch/subtract_lsx.c ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad4d_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad_sse2.asm +DSP_SRCS-$(HAVE_NEON) += arm/highbd_sad_neon.c endif # CONFIG_VP9_HIGHBITDEPTH endif # CONFIG_ENCODERS diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index cab74f93e0..72442ed41f 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -940,41 +940,43 @@ () # Single block SAD # add_proto qw/unsigned int vpx_highbd_sad64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vpx_highbd_sad64x64 sse2/; + specialize qw/vpx_highbd_sad64x64 sse2 neon/; add_proto qw/unsigned int vpx_highbd_sad64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vpx_highbd_sad64x32 sse2/; + specialize qw/vpx_highbd_sad64x32 sse2 neon/; add_proto qw/unsigned int vpx_highbd_sad32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vpx_highbd_sad32x64 sse2/; + specialize qw/vpx_highbd_sad32x64 sse2 neon/; add_proto qw/unsigned int vpx_highbd_sad32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vpx_highbd_sad32x32 sse2/; + specialize qw/vpx_highbd_sad32x32 sse2 neon/; add_proto qw/unsigned int vpx_highbd_sad32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vpx_highbd_sad32x16 sse2/; + specialize qw/vpx_highbd_sad32x16 sse2 neon/; add_proto qw/unsigned int vpx_highbd_sad16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vpx_highbd_sad16x32 sse2/; + specialize qw/vpx_highbd_sad16x32 sse2 neon/; add_proto qw/unsigned int vpx_highbd_sad16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vpx_highbd_sad16x16 sse2/; + specialize qw/vpx_highbd_sad16x16 sse2 neon/; add_proto qw/unsigned int vpx_highbd_sad16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vpx_highbd_sad16x8 sse2/; + specialize qw/vpx_highbd_sad16x8 sse2 neon/; add_proto qw/unsigned int vpx_highbd_sad8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vpx_highbd_sad8x16 sse2/; + specialize qw/vpx_highbd_sad8x16 sse2 neon/; add_proto qw/unsigned int vpx_highbd_sad8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vpx_highbd_sad8x8 sse2/; + specialize qw/vpx_highbd_sad8x8 sse2 neon/; add_proto qw/unsigned int vpx_highbd_sad8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vpx_highbd_sad8x4 sse2/; + specialize qw/vpx_highbd_sad8x4 sse2 neon/; add_proto qw/unsigned int vpx_highbd_sad4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + specialize qw/vpx_highbd_sad4x8 neon/; add_proto qw/unsigned int vpx_highbd_sad4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + specialize qw/vpx_highbd_sad4x4 neon/; # # Avg @@ -988,83 +990,85 @@ () add_proto qw/void vpx_highbd_minmax_8x8/, "const uint8_t *s8, int p, const uint8_t *d8, int dp, int *min, int *max"; add_proto qw/unsigned int vpx_highbd_sad64x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; - specialize qw/vpx_highbd_sad64x64_avg sse2/; + specialize qw/vpx_highbd_sad64x64_avg sse2 neon/; add_proto qw/unsigned int vpx_highbd_sad64x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; - specialize qw/vpx_highbd_sad64x32_avg sse2/; + specialize qw/vpx_highbd_sad64x32_avg sse2 neon/; add_proto qw/unsigned int vpx_highbd_sad32x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; - specialize qw/vpx_highbd_sad32x64_avg sse2/; + specialize qw/vpx_highbd_sad32x64_avg sse2 neon/; add_proto qw/unsigned int vpx_highbd_sad32x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; - specialize qw/vpx_highbd_sad32x32_avg sse2/; + specialize qw/vpx_highbd_sad32x32_avg sse2 neon/; add_proto qw/unsigned int vpx_highbd_sad32x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; - specialize qw/vpx_highbd_sad32x16_avg sse2/; + specialize qw/vpx_highbd_sad32x16_avg sse2 neon/; add_proto qw/unsigned int vpx_highbd_sad16x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; - specialize qw/vpx_highbd_sad16x32_avg sse2/; + specialize qw/vpx_highbd_sad16x32_avg sse2 neon/; add_proto qw/unsigned int vpx_highbd_sad16x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; - specialize qw/vpx_highbd_sad16x16_avg sse2/; + specialize qw/vpx_highbd_sad16x16_avg sse2 neon/; add_proto qw/unsigned int vpx_highbd_sad16x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; - specialize qw/vpx_highbd_sad16x8_avg sse2/; + specialize qw/vpx_highbd_sad16x8_avg sse2 neon/; add_proto qw/unsigned int vpx_highbd_sad8x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; - specialize qw/vpx_highbd_sad8x16_avg sse2/; + specialize qw/vpx_highbd_sad8x16_avg sse2 neon/; add_proto qw/unsigned int vpx_highbd_sad8x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; - specialize qw/vpx_highbd_sad8x8_avg sse2/; + specialize qw/vpx_highbd_sad8x8_avg sse2 neon/; add_proto qw/unsigned int vpx_highbd_sad8x4_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; - specialize qw/vpx_highbd_sad8x4_avg sse2/; + specialize qw/vpx_highbd_sad8x4_avg sse2 neon/; add_proto qw/unsigned int vpx_highbd_sad4x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; + specialize qw/vpx_highbd_sad4x8_avg neon/; add_proto qw/unsigned int vpx_highbd_sad4x4_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; + specialize qw/vpx_highbd_sad4x4_avg neon/; # # Multi-block SAD, comparing a reference to N independent blocks # add_proto qw/void vpx_highbd_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; - specialize qw/vpx_highbd_sad64x64x4d sse2/; + specialize qw/vpx_highbd_sad64x64x4d sse2 neon/; add_proto qw/void vpx_highbd_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; - specialize qw/vpx_highbd_sad64x32x4d sse2/; + specialize qw/vpx_highbd_sad64x32x4d sse2 neon/; add_proto qw/void vpx_highbd_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; - specialize qw/vpx_highbd_sad32x64x4d sse2/; + specialize qw/vpx_highbd_sad32x64x4d sse2 neon/; add_proto qw/void vpx_highbd_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; - specialize qw/vpx_highbd_sad32x32x4d sse2/; + specialize qw/vpx_highbd_sad32x32x4d sse2 neon/; add_proto qw/void vpx_highbd_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; - specialize qw/vpx_highbd_sad32x16x4d sse2/; + specialize qw/vpx_highbd_sad32x16x4d sse2 neon/; add_proto qw/void vpx_highbd_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; - specialize qw/vpx_highbd_sad16x32x4d sse2/; + specialize qw/vpx_highbd_sad16x32x4d sse2 neon/; add_proto qw/void vpx_highbd_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; - specialize qw/vpx_highbd_sad16x16x4d sse2/; + specialize qw/vpx_highbd_sad16x16x4d sse2 neon/; add_proto qw/void vpx_highbd_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; - specialize qw/vpx_highbd_sad16x8x4d sse2/; + specialize qw/vpx_highbd_sad16x8x4d sse2 neon/; add_proto qw/void vpx_highbd_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; - specialize qw/vpx_highbd_sad8x16x4d sse2/; + specialize qw/vpx_highbd_sad8x16x4d sse2 neon/; add_proto qw/void vpx_highbd_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; - specialize qw/vpx_highbd_sad8x8x4d sse2/; + specialize qw/vpx_highbd_sad8x8x4d sse2 neon/; add_proto qw/void vpx_highbd_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; - specialize qw/vpx_highbd_sad8x4x4d sse2/; + specialize qw/vpx_highbd_sad8x4x4d sse2 neon/; add_proto qw/void vpx_highbd_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; - specialize qw/vpx_highbd_sad4x8x4d sse2/; + specialize qw/vpx_highbd_sad4x8x4d sse2 neon/; add_proto qw/void vpx_highbd_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; - specialize qw/vpx_highbd_sad4x4x4d sse2/; + specialize qw/vpx_highbd_sad4x4x4d sse2 neon/; # # Structured Similarity (SSIM) From a6d95698fe79e7427596392c632b78f961861e98 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Mon, 22 Aug 2022 19:46:50 +0000 Subject: [PATCH 0483/1000] [NEON] Add vpx_highbd_subtract_block function Total gain for 12-bit encoding: * ~1% for best and rt profile Change-Id: I4039120dc570baab1ae519a5e38b1acff38d81f0 --- vpx_dsp/arm/subtract_neon.c | 56 ++++++++++++++++++++++++++++++++++++ vpx_dsp/vpx_dsp_rtcd_defs.pl | 1 + 2 files changed, 57 insertions(+) diff --git a/vpx_dsp/arm/subtract_neon.c b/vpx_dsp/arm/subtract_neon.c index 612897e247..2c008e48ab 100644 --- a/vpx_dsp/arm/subtract_neon.c +++ b/vpx_dsp/arm/subtract_neon.c @@ -79,3 +79,59 @@ void vpx_subtract_block_neon(int rows, int cols, int16_t *diff, } while (r); } } + +#if CONFIG_VP9_HIGHBITDEPTH +void vpx_highbd_subtract_block_neon(int rows, int cols, int16_t *diff_ptr, + ptrdiff_t diff_stride, + const uint8_t *src8_ptr, + ptrdiff_t src_stride, + const uint8_t *pred8_ptr, + ptrdiff_t pred_stride, int bd) { + int r = rows, c; + uint16_t *src = CONVERT_TO_SHORTPTR(src8_ptr); + uint16_t *pred = CONVERT_TO_SHORTPTR(pred8_ptr); + (void)bd; + + if (cols >= 16) { + do { + for (c = 0; c < cols; c += 16) { + const uint16x8_t s0 = vld1q_u16(&src[c + 0]); + const uint16x8_t s1 = vld1q_u16(&src[c + 8]); + const uint16x8_t p0 = vld1q_u16(&pred[c + 0]); + const uint16x8_t p1 = vld1q_u16(&pred[c + 8]); + const uint16x8_t d0 = vsubq_u16(s0, p0); + const uint16x8_t d1 = vsubq_u16(s1, p1); + vst1q_s16(&diff_ptr[c + 0], vreinterpretq_s16_u16(d0)); + vst1q_s16(&diff_ptr[c + 8], vreinterpretq_s16_u16(d1)); + } + diff_ptr += diff_stride; + pred += pred_stride; + src += src_stride; + } while (--r); + } else if (cols >= 8) { + do { + for (c = 0; c < cols; c += 8) { + const uint16x8_t s = vld1q_u16(&src[c]); + const uint16x8_t p = vld1q_u16(&pred[c]); + const uint16x8_t d0 = vsubq_u16(s, p); + vst1q_s16(&diff_ptr[c], vreinterpretq_s16_u16(d0)); + } + diff_ptr += diff_stride; + pred += pred_stride; + src += src_stride; + } while (--r); + } else if (cols >= 4) { + do { + for (c = 0; c < cols; c += 4) { + const uint16x4_t s = vld1_u16(&src[c]); + const uint16x4_t p = vld1_u16(&pred[c]); + const uint16x4_t v_diff = vsub_u16(s, p); + vst1_s16(&diff_ptr[c], vreinterpret_s16_u16(v_diff)); + } + diff_ptr += diff_stride; + pred += pred_stride; + src += src_stride; + } while (--r); + } +} +#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 72442ed41f..6cd46129f0 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -935,6 +935,7 @@ () # Block subtraction # add_proto qw/void vpx_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src8_ptr, ptrdiff_t src_stride, const uint8_t *pred8_ptr, ptrdiff_t pred_stride, int bd"; + specialize qw/vpx_highbd_subtract_block neon/; # # Single block SAD From a689fe68a3e304df7f6ee3f711de108282da636f Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 22 Aug 2022 19:33:26 -0700 Subject: [PATCH 0484/1000] vp9_ratectrl_rtc_test: initialize loopfilter_ctrl[] this was added in: 7beafefd1 vp9: Allow for disabling loopfilter per spatial layer but the test doesn't zero initialize its svc_params_ member. fixes the use of an uninitialized value, reported by valgrind and integer sanitizer: [ RUN ] VP9/RcInterfaceSvcTest.Svc/0 ==1064682== Conditional jump or move depends on uninitialised value(s) ==1064682== at 0x1C5624: loopfilter_frame (vp9_encoder.c:3285) ==1064682== by 0x1C9B54: encode_frame_to_data_rate (vp9_encoder.c:5595) ==1064682== by 0x1CA2EE: SvcEncode (vp9_encoder.c:5789) ==1064682== by 0x1CEA01: vp9_get_compressed_data (vp9_encoder.c:7891) ==1064682== by 0x185F0E: encoder_encode (vp9_cx_iface.c:1437) ==1064682== by 0x1503BB: vpx_codec_encode (vpx_encoder.c:208) vp9/encoder/vp9_svc_layercontext.c:362:26: runtime error: implicit conversion from type 'int' of value -1 (32-bit, signed) to type 'LOOPFILTER_CONTROL' changed the value to 4294967295 (32-bit, unsigned) #0 0x558925f45377 in vp9_restore_layer_context vp9/encoder/vp9_svc_layercontext.c:362:26 #1 0x558925ef89fd in vp9_get_compressed_data vp9/encoder/vp9_encoder.c:7781:5 #2 0x558925e3ef3e in encoder_encode vp9/vp9_cx_iface.c:1437:20 Bug: b/229626362 Change-Id: I33d244be7752c68b71efa9c62ca45d6b202ec761 --- test/vp9_ratectrl_rtc_test.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/test/vp9_ratectrl_rtc_test.cc b/test/vp9_ratectrl_rtc_test.cc index b09a45bb76..03a58fa926 100644 --- a/test/vp9_ratectrl_rtc_test.cc +++ b/test/vp9_ratectrl_rtc_test.cc @@ -271,6 +271,7 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest, svc_params_.max_quantizers[i] = 56; svc_params_.min_quantizers[i] = 2; svc_params_.speed_per_layer[i] = 7; + svc_params_.loopfilter_ctrl[i] = LOOPFILTER_ALL; } cfg_.rc_end_usage = VPX_CBR; cfg_.g_lag_in_frames = 0; From daae445b2a467ba701e87aac06bc9c98af31dfdd Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Sat, 20 Aug 2022 19:02:15 +0000 Subject: [PATCH 0485/1000] [NEON] Improve vpx_quantize_b* functions Slight optimization, prefetch gives a 1% improvement in 1st pass Change-Id: Iba4664964664234666406ab53893e02d481fbe61 --- vpx_dsp/arm/quantize_neon.c | 269 +++++++++++++++++------------------- 1 file changed, 130 insertions(+), 139 deletions(-) diff --git a/vpx_dsp/arm/quantize_neon.c b/vpx_dsp/arm/quantize_neon.c index bd7818a074..dcdf588cbc 100644 --- a/vpx_dsp/arm/quantize_neon.c +++ b/vpx_dsp/arm/quantize_neon.c @@ -17,20 +17,57 @@ static INLINE void calculate_dqcoeff_and_store(const int16x8_t qcoeff, const int16x8_t dequant, - tran_low_t *dqcoeff) { + tran_low_t *dqcoeff_ptr) { +#if CONFIG_VP9_HIGHBITDEPTH const int32x4_t dqcoeff_0 = vmull_s16(vget_low_s16(qcoeff), vget_low_s16(dequant)); const int32x4_t dqcoeff_1 = vmull_s16(vget_high_s16(qcoeff), vget_high_s16(dequant)); -#if CONFIG_VP9_HIGHBITDEPTH - vst1q_s32(dqcoeff, dqcoeff_0); - vst1q_s32(dqcoeff + 4, dqcoeff_1); + vst1q_s32(dqcoeff_ptr, dqcoeff_0); + vst1q_s32(dqcoeff_ptr + 4, dqcoeff_1); #else - vst1q_s16(dqcoeff, vcombine_s16(vmovn_s32(dqcoeff_0), vmovn_s32(dqcoeff_1))); + vst1q_s16(dqcoeff_ptr, vmulq_s16(qcoeff, dequant)); #endif // CONFIG_VP9_HIGHBITDEPTH } +static INLINE int16x8_t +quantize_b_neon(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16x8_t zbin, + const int16x8_t round, const int16x8_t quant, + const int16x8_t quant_shift, const int16x8_t dequant) { + // Load coeffs as 8 x 16-bit ints, take sign and abs values + const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr); + const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15); + const int16x8_t coeff_abs = vabsq_s16(coeff); + + // Calculate mask of elements outside the bin + const int16x8_t zbin_mask = vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin)); + + // Get the rounded values + const int16x8_t rounded = vqaddq_s16(coeff_abs, round); + + // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16 + int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1); + + qcoeff = vaddq_s16(qcoeff, rounded); + + // (qcoeff * quant_shift * 2) >> 16 >> 1 == (qcoeff * quant_shift) >> 16 + qcoeff = vshrq_n_s16(vqdmulhq_s16(qcoeff, quant_shift), 1); + + // Restore the sign bit. + qcoeff = veorq_s16(qcoeff, coeff_sign); + qcoeff = vsubq_s16(qcoeff, coeff_sign); + + // Only keep the relevant coeffs + qcoeff = vandq_s16(qcoeff, zbin_mask); + store_s16q_to_tran_low(qcoeff_ptr, qcoeff); + + calculate_dqcoeff_and_store(qcoeff, dequant, dqcoeff_ptr); + + return qcoeff; +} + void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, @@ -41,106 +78,61 @@ void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16x8_t one = vdupq_n_s16(1); const int16x8_t neg_one = vdupq_n_s16(-1); uint16x8_t eob_max; - (void)scan; + + // Only the first element of each vector is DC. + int16x8_t zbin = vld1q_s16(zbin_ptr); + int16x8_t round = vld1q_s16(round_ptr); + int16x8_t quant = vld1q_s16(quant_ptr); + int16x8_t quant_shift = vld1q_s16(quant_shift_ptr); + int16x8_t dequant = vld1q_s16(dequant_ptr); // Process first 8 values which include a dc component. { - // Only the first element of each vector is DC. - const int16x8_t zbin = vld1q_s16(zbin_ptr); - const int16x8_t round = vld1q_s16(round_ptr); - const int16x8_t quant = vld1q_s16(quant_ptr); - const int16x8_t quant_shift = vld1q_s16(quant_shift_ptr); - const int16x8_t dequant = vld1q_s16(dequant_ptr); // Add one because the eob does not index from 0. const uint16x8_t v_iscan = vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one)); - const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr); - const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15); - const int16x8_t coeff_abs = vabsq_s16(coeff); - - const int16x8_t zbin_mask = - vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin)); - - const int16x8_t rounded = vqaddq_s16(coeff_abs, round); - - // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16 - int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1); - - qcoeff = vaddq_s16(qcoeff, rounded); - - // (qcoeff * quant_shift * 2) >> 16 >> 1 == (qcoeff * quant_shift) >> 16 - qcoeff = vshrq_n_s16(vqdmulhq_s16(qcoeff, quant_shift), 1); - - // Restore the sign bit. - qcoeff = veorq_s16(qcoeff, coeff_sign); - qcoeff = vsubq_s16(qcoeff, coeff_sign); - - qcoeff = vandq_s16(qcoeff, zbin_mask); + const int16x8_t qcoeff = + quantize_b_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round, quant, + quant_shift, dequant); // Set non-zero elements to -1 and use that to extract values for eob. eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan); + __builtin_prefetch(coeff_ptr + 64); coeff_ptr += 8; iscan += 8; - - store_s16q_to_tran_low(qcoeff_ptr, qcoeff); qcoeff_ptr += 8; - - calculate_dqcoeff_and_store(qcoeff, dequant, dqcoeff_ptr); dqcoeff_ptr += 8; } n_coeffs -= 8; { - const int16x8_t zbin = vdupq_n_s16(zbin_ptr[1]); - const int16x8_t round = vdupq_n_s16(round_ptr[1]); - const int16x8_t quant = vdupq_n_s16(quant_ptr[1]); - const int16x8_t quant_shift = vdupq_n_s16(quant_shift_ptr[1]); - const int16x8_t dequant = vdupq_n_s16(dequant_ptr[1]); + zbin = vdupq_lane_s16(vget_low_s16(zbin), 1); + round = vdupq_lane_s16(vget_low_s16(round), 1); + quant = vdupq_lane_s16(vget_low_s16(quant), 1); + quant_shift = vdupq_lane_s16(vget_low_s16(quant_shift), 1); + dequant = vdupq_lane_s16(vget_low_s16(dequant), 1); do { // Add one because the eob is not its index. const uint16x8_t v_iscan = vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one)); - const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr); - const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15); - const int16x8_t coeff_abs = vabsq_s16(coeff); - - const int16x8_t zbin_mask = - vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin)); - - const int16x8_t rounded = vqaddq_s16(coeff_abs, round); - - // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16 - int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1); - - qcoeff = vaddq_s16(qcoeff, rounded); - - // (qcoeff * quant_shift * 2) >> 16 >> 1 == (qcoeff * quant_shift) >> 16 - qcoeff = vshrq_n_s16(vqdmulhq_s16(qcoeff, quant_shift), 1); - - // Restore the sign bit. - qcoeff = veorq_s16(qcoeff, coeff_sign); - qcoeff = vsubq_s16(qcoeff, coeff_sign); - - qcoeff = vandq_s16(qcoeff, zbin_mask); + const int16x8_t qcoeff = + quantize_b_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round, + quant, quant_shift, dequant); // Set non-zero elements to -1 and use that to extract values for eob. eob_max = vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan)); + __builtin_prefetch(coeff_ptr + 64); coeff_ptr += 8; iscan += 8; - - store_s16q_to_tran_low(qcoeff_ptr, qcoeff); qcoeff_ptr += 8; - - calculate_dqcoeff_and_store(qcoeff, dequant, dqcoeff_ptr); dqcoeff_ptr += 8; - n_coeffs -= 8; } while (n_coeffs > 0); } @@ -156,6 +148,9 @@ void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, vst1_lane_u16(eob_ptr, eob_max_2, 0); } #endif // __aarch64__ + // Need these here, else the compiler complains about mixing declarations and + // code in C90 + (void)scan; } static INLINE int32x4_t extract_sign_bit(int32x4_t a) { @@ -164,7 +159,7 @@ static INLINE int32x4_t extract_sign_bit(int32x4_t a) { static INLINE void calculate_dqcoeff_and_store_32x32(const int16x8_t qcoeff, const int16x8_t dequant, - tran_low_t *dqcoeff) { + tran_low_t *dqcoeff_ptr) { int32x4_t dqcoeff_0 = vmull_s16(vget_low_s16(qcoeff), vget_low_s16(dequant)); int32x4_t dqcoeff_1 = vmull_s16(vget_high_s16(qcoeff), vget_high_s16(dequant)); @@ -176,14 +171,51 @@ static INLINE void calculate_dqcoeff_and_store_32x32(const int16x8_t qcoeff, #if CONFIG_VP9_HIGHBITDEPTH dqcoeff_0 = vshrq_n_s32(dqcoeff_0, 1); dqcoeff_1 = vshrq_n_s32(dqcoeff_1, 1); - vst1q_s32(dqcoeff, dqcoeff_0); - vst1q_s32(dqcoeff + 4, dqcoeff_1); + vst1q_s32(dqcoeff_ptr, dqcoeff_0); + vst1q_s32(dqcoeff_ptr + 4, dqcoeff_1); #else - vst1q_s16(dqcoeff, + vst1q_s16(dqcoeff_ptr, vcombine_s16(vshrn_n_s32(dqcoeff_0, 1), vshrn_n_s32(dqcoeff_1, 1))); #endif // CONFIG_VP9_HIGHBITDEPTH } +static INLINE int16x8_t +quantize_b_32x32_neon(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16x8_t zbin, + const int16x8_t round, const int16x8_t quant, + const int16x8_t quant_shift, const int16x8_t dequant) { + // Load coeffs as 8 x 16-bit ints, take sign and abs values + const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr); + const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15); + const int16x8_t coeff_abs = vabsq_s16(coeff); + + // Calculate mask of elements outside the bin + const int16x8_t zbin_mask = vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin)); + + // Get the rounded values + const int16x8_t rounded = vqaddq_s16(coeff_abs, round); + + // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16 + int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1); + + qcoeff = vaddq_s16(qcoeff, rounded); + + // (qcoeff * quant_shift * 2) >> 16 == (qcoeff * quant_shift) >> 15 + qcoeff = vqdmulhq_s16(qcoeff, quant_shift); + + // Restore the sign bit. + qcoeff = veorq_s16(qcoeff, coeff_sign); + qcoeff = vsubq_s16(qcoeff, coeff_sign); + + // Only keep the relevant coeffs + qcoeff = vandq_s16(qcoeff, zbin_mask); + store_s16q_to_tran_low(qcoeff_ptr, qcoeff); + + calculate_dqcoeff_and_store_32x32(qcoeff, dequant, dqcoeff_ptr); + + return qcoeff; +} + // Main difference is that zbin values are halved before comparison and dqcoeff // values are divided by 2. zbin is rounded but dqcoeff is not. void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, @@ -198,103 +230,58 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16x8_t neg_one = vdupq_n_s16(-1); uint16x8_t eob_max; int i; - (void)scan; - (void)n_coeffs; // Because we will always calculate 32*32. + + // Only the first element of each vector is DC. + int16x8_t zbin = vrshrq_n_s16(vld1q_s16(zbin_ptr), 1); + int16x8_t round = vrshrq_n_s16(vld1q_s16(round_ptr), 1); + int16x8_t quant = vld1q_s16(quant_ptr); + int16x8_t quant_shift = vld1q_s16(quant_shift_ptr); + int16x8_t dequant = vld1q_s16(dequant_ptr); // Process first 8 values which include a dc component. { - // Only the first element of each vector is DC. - const int16x8_t zbin = vrshrq_n_s16(vld1q_s16(zbin_ptr), 1); - const int16x8_t round = vrshrq_n_s16(vld1q_s16(round_ptr), 1); - const int16x8_t quant = vld1q_s16(quant_ptr); - const int16x8_t quant_shift = vld1q_s16(quant_shift_ptr); - const int16x8_t dequant = vld1q_s16(dequant_ptr); // Add one because the eob does not index from 0. const uint16x8_t v_iscan = vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one)); - const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr); - const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15); - const int16x8_t coeff_abs = vabsq_s16(coeff); - - const int16x8_t zbin_mask = - vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin)); - - const int16x8_t rounded = vqaddq_s16(coeff_abs, round); - - // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16 - int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1); - - qcoeff = vaddq_s16(qcoeff, rounded); - - // (qcoeff * quant_shift * 2) >> 16 == (qcoeff * quant_shift) >> 15 - qcoeff = vqdmulhq_s16(qcoeff, quant_shift); - - // Restore the sign bit. - qcoeff = veorq_s16(qcoeff, coeff_sign); - qcoeff = vsubq_s16(qcoeff, coeff_sign); - - qcoeff = vandq_s16(qcoeff, zbin_mask); + const int16x8_t qcoeff = + quantize_b_32x32_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round, + quant, quant_shift, dequant); // Set non-zero elements to -1 and use that to extract values for eob. eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan); + __builtin_prefetch(coeff_ptr + 64); coeff_ptr += 8; iscan += 8; - - store_s16q_to_tran_low(qcoeff_ptr, qcoeff); qcoeff_ptr += 8; - - calculate_dqcoeff_and_store_32x32(qcoeff, dequant, dqcoeff_ptr); dqcoeff_ptr += 8; } { - const int16x8_t zbin = vrshrq_n_s16(vdupq_n_s16(zbin_ptr[1]), 1); - const int16x8_t round = vrshrq_n_s16(vdupq_n_s16(round_ptr[1]), 1); - const int16x8_t quant = vdupq_n_s16(quant_ptr[1]); - const int16x8_t quant_shift = vdupq_n_s16(quant_shift_ptr[1]); - const int16x8_t dequant = vdupq_n_s16(dequant_ptr[1]); + zbin = vdupq_lane_s16(vget_low_s16(zbin), 1); + round = vdupq_lane_s16(vget_low_s16(round), 1); + quant = vdupq_lane_s16(vget_low_s16(quant), 1); + quant_shift = vdupq_lane_s16(vget_low_s16(quant_shift), 1); + dequant = vdupq_lane_s16(vget_low_s16(dequant), 1); for (i = 1; i < 32 * 32 / 8; ++i) { // Add one because the eob is not its index. const uint16x8_t v_iscan = vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one)); - const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr); - const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15); - const int16x8_t coeff_abs = vabsq_s16(coeff); - - const int16x8_t zbin_mask = - vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin)); - - const int16x8_t rounded = vqaddq_s16(coeff_abs, round); - - // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16 - int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1); - - qcoeff = vaddq_s16(qcoeff, rounded); - - // (qcoeff * quant_shift * 2) >> 16 == (qcoeff * quant_shift) >> 15 - qcoeff = vqdmulhq_s16(qcoeff, quant_shift); - - // Restore the sign bit. - qcoeff = veorq_s16(qcoeff, coeff_sign); - qcoeff = vsubq_s16(qcoeff, coeff_sign); - - qcoeff = vandq_s16(qcoeff, zbin_mask); + const int16x8_t qcoeff = + quantize_b_32x32_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round, + quant, quant_shift, dequant); // Set non-zero elements to -1 and use that to extract values for eob. eob_max = vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan)); + __builtin_prefetch(coeff_ptr + 64); coeff_ptr += 8; iscan += 8; - - store_s16q_to_tran_low(qcoeff_ptr, qcoeff); qcoeff_ptr += 8; - - calculate_dqcoeff_and_store_32x32(qcoeff, dequant, dqcoeff_ptr); dqcoeff_ptr += 8; } } @@ -310,4 +297,8 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, vst1_lane_u16(eob_ptr, eob_max_2, 0); } #endif // __aarch64__ + // Need these here, else the compiler complains about mixing declarations and + // code in C90 + (void)n_coeffs; + (void)scan; } From a3c9b9126decd2f9933005cf215107518a4413cb Mon Sep 17 00:00:00 2001 From: clang-format Date: Sat, 13 Aug 2022 10:33:56 -0700 Subject: [PATCH 0486/1000] .clang-format: update to clang-format-11 only store the deltas from --style Google in the file and reapply using Debian clang-format version 11.1.0-6+build1 Bug: b/229626362 Change-Id: I3e18a2e7c17a90a48405b3cf1b37ebc652aba0db --- .clang-format | 142 +-------------------------- examples/vp9_spatial_svc_encoder.c | 4 +- vp8/common/mips/dspr2/filter_dspr2.c | 12 +-- vp8/common/mips/msa/vp8_macros_msa.h | 18 ++-- vp8/encoder/encodemv.c | 10 +- vp8/encoder/firstpass.c | 31 +++--- vp8/encoder/mcomp.c | 29 +++--- vp8/encoder/onyx_if.c | 9 +- vp8/encoder/rdopt.c | 2 +- vp9/decoder/vp9_decodemv.c | 4 +- vp9/decoder/vp9_detokenize.c | 15 +-- vp9/encoder/vp9_aq_cyclicrefresh.c | 2 +- vp9/encoder/vp9_bitstream.c | 6 +- vp9/encoder/vp9_denoiser.c | 7 +- vp9/encoder/vp9_firstpass.c | 9 +- vp9/encoder/vp9_temporal_filter.c | 12 +-- vpx_dsp/mips/macros_msa.h | 46 ++++----- y4menc.c | 41 ++++---- 18 files changed, 129 insertions(+), 270 deletions(-) diff --git a/.clang-format b/.clang-format index 866b7e2117..a8bc4967c3 100644 --- a/.clang-format +++ b/.clang-format @@ -1,149 +1,9 @@ --- Language: Cpp -# BasedOnStyle: Google -# Generated with clang-format 7.0.1 -AccessModifierOffset: -1 -AlignAfterOpenBracket: Align -AlignConsecutiveAssignments: false -AlignConsecutiveDeclarations: false -AlignEscapedNewlines: Left -AlignOperands: true -AlignTrailingComments: true -AllowAllParametersOfDeclarationOnNextLine: true -AllowShortBlocksOnASingleLine: false +BasedOnStyle: Google AllowShortCaseLabelsOnASingleLine: true -AllowShortFunctionsOnASingleLine: All -AllowShortIfStatementsOnASingleLine: true -AllowShortLoopsOnASingleLine: true -AlwaysBreakAfterDefinitionReturnType: None -AlwaysBreakAfterReturnType: None -AlwaysBreakBeforeMultilineStrings: true -AlwaysBreakTemplateDeclarations: true -BinPackArguments: true -BinPackParameters: true -BraceWrapping: - AfterClass: false - AfterControlStatement: false - AfterEnum: false - AfterFunction: false - AfterNamespace: false - AfterObjCDeclaration: false - AfterStruct: false - AfterUnion: false - AfterExternBlock: false - BeforeCatch: false - BeforeElse: false - IndentBraces: false - SplitEmptyFunction: true - SplitEmptyRecord: true - SplitEmptyNamespace: true -BreakBeforeBinaryOperators: None -BreakBeforeBraces: Attach -BreakBeforeInheritanceComma: false -BreakInheritanceList: BeforeColon -BreakBeforeTernaryOperators: true -BreakConstructorInitializersBeforeComma: false -BreakConstructorInitializers: BeforeColon -BreakAfterJavaFieldAnnotations: false -BreakStringLiterals: true -ColumnLimit: 80 -CommentPragmas: '^ IWYU pragma:' -CompactNamespaces: false ConstructorInitializerAllOnOneLineOrOnePerLine: false -ConstructorInitializerIndentWidth: 4 -ContinuationIndentWidth: 4 Cpp11BracedListStyle: false DerivePointerAlignment: false -DisableFormat: false -ExperimentalAutoDetectBinPacking: false -FixNamespaceComments: true -ForEachMacros: - - foreach - - Q_FOREACH - - BOOST_FOREACH -IncludeBlocks: Preserve -IncludeCategories: - - Regex: '^' - Priority: 2 - - Regex: '^<.*\.h>' - Priority: 1 - - Regex: '^<.*' - Priority: 2 - - Regex: '.*' - Priority: 3 -IncludeIsMainRegex: '([-_](test|unittest))?$' -IndentCaseLabels: true -IndentPPDirectives: None -IndentWidth: 2 -IndentWrappedFunctionNames: false -JavaScriptQuotes: Leave -JavaScriptWrapImports: true -KeepEmptyLinesAtTheStartOfBlocks: false -MacroBlockBegin: '' -MacroBlockEnd: '' -MaxEmptyLinesToKeep: 1 -NamespaceIndentation: None -ObjCBinPackProtocolList: Never -ObjCBlockIndentWidth: 2 -ObjCSpaceAfterProperty: false -ObjCSpaceBeforeProtocolList: false -PenaltyBreakAssignment: 2 -PenaltyBreakBeforeFirstCallParameter: 1 -PenaltyBreakComment: 300 -PenaltyBreakFirstLessLess: 120 -PenaltyBreakTemplateDeclaration: 10 -PenaltyBreakString: 1000 -PenaltyExcessCharacter: 1000000 -PenaltyReturnTypeOnItsOwnLine: 200 PointerAlignment: Right -RawStringFormats: - - Language: Cpp - Delimiters: - - cc - - CC - - cpp - - Cpp - - CPP - - 'c++' - - 'C++' - CanonicalDelimiter: '' - BasedOnStyle: google - - Language: TextProto - Delimiters: - - pb - - PB - - proto - - PROTO - EnclosingFunctions: - - EqualsProto - - EquivToProto - - PARSE_PARTIAL_TEXT_PROTO - - PARSE_TEST_PROTO - - PARSE_TEXT_PROTO - - ParseTextOrDie - - ParseTextProtoOrDie - CanonicalDelimiter: '' - BasedOnStyle: google -ReflowComments: true SortIncludes: false -SortUsingDeclarations: true -SpaceAfterCStyleCast: false -SpaceAfterTemplateKeyword: true -SpaceBeforeAssignmentOperators: true -SpaceBeforeCpp11BracedList: false -SpaceBeforeCtorInitializerColon: true -SpaceBeforeInheritanceColon: true -SpaceBeforeParens: ControlStatements -SpaceBeforeRangeBasedForLoopColon: true -SpaceInEmptyParentheses: false -SpacesBeforeTrailingComments: 2 -SpacesInAngles: false -SpacesInContainerLiterals: true -SpacesInCStyleCastParentheses: false -SpacesInParentheses: false -SpacesInSquareBrackets: false -Standard: Auto -TabWidth: 8 -UseTab: Never -... - diff --git a/examples/vp9_spatial_svc_encoder.c b/examples/vp9_spatial_svc_encoder.c index e85dbf8e71..d287e58319 100644 --- a/examples/vp9_spatial_svc_encoder.c +++ b/examples/vp9_spatial_svc_encoder.c @@ -1146,7 +1146,9 @@ int main(int argc, const char **argv) { cx_pkt->data.twopass_stats.sz); break; } - default: { break; } + default: { + break; + } } #if CONFIG_VP9_DECODER && !SIMULCAST_MODE diff --git a/vp8/common/mips/dspr2/filter_dspr2.c b/vp8/common/mips/dspr2/filter_dspr2.c index e46827b0e4..b9da52084d 100644 --- a/vp8/common/mips/dspr2/filter_dspr2.c +++ b/vp8/common/mips/dspr2/filter_dspr2.c @@ -816,8 +816,8 @@ void vp8_filter_block2d_first_pass16_0(unsigned char *RESTRICT src_ptr, : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4), [src_ptr] "+r"(src_ptr) - : [src_pixels_per_line] "r"(src_pixels_per_line), - [output_ptr] "r"(output_ptr)); + : [src_pixels_per_line] "r"(src_pixels_per_line), [output_ptr] "r"( + output_ptr)); __asm__ __volatile__( "ulw %[Temp1], 0(%[src_ptr]) \n\t" @@ -832,8 +832,8 @@ void vp8_filter_block2d_first_pass16_0(unsigned char *RESTRICT src_ptr, : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4), [src_ptr] "+r"(src_ptr) - : [src_pixels_per_line] "r"(src_pixels_per_line), - [output_ptr] "r"(output_ptr)); + : [src_pixels_per_line] "r"(src_pixels_per_line), [output_ptr] "r"( + output_ptr)); __asm__ __volatile__( "ulw %[Temp1], 0(%[src_ptr]) \n\t" @@ -848,8 +848,8 @@ void vp8_filter_block2d_first_pass16_0(unsigned char *RESTRICT src_ptr, : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4), [src_ptr] "+r"(src_ptr) - : [src_pixels_per_line] "r"(src_pixels_per_line), - [output_ptr] "r"(output_ptr)); + : [src_pixels_per_line] "r"(src_pixels_per_line), [output_ptr] "r"( + output_ptr)); output_ptr += 48; } diff --git a/vp8/common/mips/msa/vp8_macros_msa.h b/vp8/common/mips/msa/vp8_macros_msa.h index fde22f537c..7cb3c98690 100644 --- a/vp8/common/mips/msa/vp8_macros_msa.h +++ b/vp8/common/mips/msa/vp8_macros_msa.h @@ -122,10 +122,11 @@ const uint8_t *psrc_m = (const uint8_t *)(psrc); \ uint32_t val_m; \ \ - asm volatile("lwr %[val_m], 0(%[psrc_m]) \n\t" \ - "lwl %[val_m], 3(%[psrc_m]) \n\t" \ - : [val_m] "=&r"(val_m) \ - : [psrc_m] "r"(psrc_m)); \ + asm volatile( \ + "lwr %[val_m], 0(%[psrc_m]) \n\t" \ + "lwl %[val_m], 3(%[psrc_m]) \n\t" \ + : [val_m] "=&r"(val_m) \ + : [psrc_m] "r"(psrc_m)); \ \ val_m; \ }) @@ -136,10 +137,11 @@ const uint8_t *psrc_m = (const uint8_t *)(psrc); \ uint64_t val_m = 0; \ \ - asm volatile("ldr %[val_m], 0(%[psrc_m]) \n\t" \ - "ldl %[val_m], 7(%[psrc_m]) \n\t" \ - : [val_m] "=&r"(val_m) \ - : [psrc_m] "r"(psrc_m)); \ + asm volatile( \ + "ldr %[val_m], 0(%[psrc_m]) \n\t" \ + "ldl %[val_m], 7(%[psrc_m]) \n\t" \ + : [val_m] "=&r"(val_m) \ + : [psrc_m] "r"(psrc_m)); \ \ val_m; \ }) diff --git a/vp8/encoder/encodemv.c b/vp8/encoder/encodemv.c index c88ea1653e..384bb29389 100644 --- a/vp8/encoder/encodemv.c +++ b/vp8/encoder/encodemv.c @@ -31,17 +31,15 @@ static void encode_mvcomponent(vp8_writer *const w, const int v, vp8_write(w, 1, p[mvpis_short]); - do + do { vp8_write(w, (x >> i) & 1, p[MVPbits + i]); - - while (++i < 3); + } while (++i < 3); i = mvlong_width - 1; /* Skip bit 3, which is sometimes implicit */ - do + do { vp8_write(w, (x >> i) & 1, p[MVPbits + i]); - - while (--i > 3); + } while (--i > 3); if (x & 0xFFF0) vp8_write(w, (x >> 3) & 1, p[MVPbits + 3]); } diff --git a/vp8/encoder/firstpass.c b/vp8/encoder/firstpass.c index ed177e3cb6..65d2681c91 100644 --- a/vp8/encoder/firstpass.c +++ b/vp8/encoder/firstpass.c @@ -903,9 +903,9 @@ static double calc_correction_factor(double err_per_mb, double err_devisor, correction_factor = pow(error_term, power_term); /* Clip range */ - correction_factor = (correction_factor < 0.05) - ? 0.05 - : (correction_factor > 5.0) ? 5.0 : correction_factor; + correction_factor = (correction_factor < 0.05) ? 0.05 + : (correction_factor > 5.0) ? 5.0 + : correction_factor; return correction_factor; } @@ -947,11 +947,10 @@ static int estimate_max_q(VP8_COMP *cpi, FIRSTPASS_STATS *fpstats, } cpi->twopass.est_max_qcorrection_factor = - (cpi->twopass.est_max_qcorrection_factor < 0.1) - ? 0.1 - : (cpi->twopass.est_max_qcorrection_factor > 10.0) - ? 10.0 - : cpi->twopass.est_max_qcorrection_factor; + (cpi->twopass.est_max_qcorrection_factor < 0.1) ? 0.1 + : (cpi->twopass.est_max_qcorrection_factor > 10.0) + ? 10.0 + : cpi->twopass.est_max_qcorrection_factor; } /* Corrections for higher compression speed settings @@ -1178,10 +1177,9 @@ static int estimate_kf_group_q(VP8_COMP *cpi, double section_err, } else { current_spend_ratio = (double)cpi->long_rolling_actual_bits / (double)cpi->long_rolling_target_bits; - current_spend_ratio = - (current_spend_ratio > 10.0) - ? 10.0 - : (current_spend_ratio < 0.1) ? 0.1 : current_spend_ratio; + current_spend_ratio = (current_spend_ratio > 10.0) ? 10.0 + : (current_spend_ratio < 0.1) ? 0.1 + : current_spend_ratio; } /* Calculate a correction factor based on the quality of prediction in @@ -1968,11 +1966,10 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { } cpi->twopass.gf_group_bits = - (cpi->twopass.gf_group_bits < 0) - ? 0 - : (cpi->twopass.gf_group_bits > cpi->twopass.kf_group_bits) - ? cpi->twopass.kf_group_bits - : cpi->twopass.gf_group_bits; + (cpi->twopass.gf_group_bits < 0) ? 0 + : (cpi->twopass.gf_group_bits > cpi->twopass.kf_group_bits) + ? cpi->twopass.kf_group_bits + : cpi->twopass.gf_group_bits; /* Clip cpi->twopass.gf_group_bits based on user supplied data rate * variability limit (cpi->oxcf.two_pass_vbrmax_section) diff --git a/vp8/encoder/mcomp.c b/vp8/encoder/mcomp.c index ae092c66e1..b92e2135e9 100644 --- a/vp8/encoder/mcomp.c +++ b/vp8/encoder/mcomp.c @@ -204,20 +204,21 @@ void vp8_init3smotion_compensation(MACROBLOCK *x, int stride) { /* returns distortion + motion vector cost */ #define ERR(r, c) (MVC(r, c) + DIST(r, c)) /* checks if (r,c) has better score than previous best */ -#define CHECK_BETTER(v, r, c) \ - do { \ - IFMVCV(r, c, \ - { \ - thismse = DIST(r, c); \ - if ((v = (MVC(r, c) + thismse)) < besterr) { \ - besterr = v; \ - br = r; \ - bc = c; \ - *distortion = thismse; \ - *sse1 = sse; \ - } \ - }, \ - v = UINT_MAX;) \ +#define CHECK_BETTER(v, r, c) \ + do { \ + IFMVCV( \ + r, c, \ + { \ + thismse = DIST(r, c); \ + if ((v = (MVC(r, c) + thismse)) < besterr) { \ + besterr = v; \ + br = r; \ + bc = c; \ + *distortion = thismse; \ + *sse1 = sse; \ + } \ + }, \ + v = UINT_MAX;) \ } while (0) int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d, diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index ffb3867dd1..94fb6e256e 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -4202,11 +4202,10 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, } /* Clamp cpi->zbin_over_quant */ - cpi->mb.zbin_over_quant = (cpi->mb.zbin_over_quant < zbin_oq_low) - ? zbin_oq_low - : (cpi->mb.zbin_over_quant > zbin_oq_high) - ? zbin_oq_high - : cpi->mb.zbin_over_quant; + cpi->mb.zbin_over_quant = + (cpi->mb.zbin_over_quant < zbin_oq_low) ? zbin_oq_low + : (cpi->mb.zbin_over_quant > zbin_oq_high) ? zbin_oq_high + : cpi->mb.zbin_over_quant; Loop = Q != last_q; } else { diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c index 5821fc7346..bbddacf8f0 100644 --- a/vp8/encoder/rdopt.c +++ b/vp8/encoder/rdopt.c @@ -1608,7 +1608,7 @@ static int evaluate_inter_mode_rd(int mdcounts[4], RATE_DISTORTION *rd, unsigned int q2dc = xd->block[24].dequant[0]; /* If theres is no codeable 2nd order dc or a very small uniform pixel change change */ - if ((sse - var> 4) || (sse / 2 > var && sse - var < 64)) { + if ((sse - var < q2dc * q2dc >> 4) || (sse / 2 > var && sse - var < 64)) { /* Check u and v to make sure skip is ok */ unsigned int sse2 = VP8_UVSSE(x); if (sse2 * 2 < threshold) { diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c index f4bfb785f7..db3e746639 100644 --- a/vp9/decoder/vp9_decodemv.c +++ b/vp9/decoder/vp9_decodemv.c @@ -426,7 +426,9 @@ static INLINE int assign_mv(VP9_COMMON *cm, MACROBLOCKD *xd, zero_mv_pair(mv); break; } - default: { return 0; } + default: { + return 0; + } } return ret; } diff --git a/vp9/decoder/vp9_detokenize.c b/vp9/decoder/vp9_detokenize.c index c2e6b3d545..3ed1bd6ffa 100644 --- a/vp9/decoder/vp9_detokenize.c +++ b/vp9/decoder/vp9_detokenize.c @@ -133,17 +133,18 @@ static int decode_coefs(const MACROBLOCKD *xd, PLANE_TYPE type, int16_t dqv = dq[0]; const uint8_t *const cat6_prob = #if CONFIG_VP9_HIGHBITDEPTH - (xd->bd == VPX_BITS_12) - ? vp9_cat6_prob_high12 - : (xd->bd == VPX_BITS_10) ? vp9_cat6_prob_high12 + 2 : + (xd->bd == VPX_BITS_12) ? vp9_cat6_prob_high12 + : (xd->bd == VPX_BITS_10) ? vp9_cat6_prob_high12 + 2 + : #endif // CONFIG_VP9_HIGHBITDEPTH - vp9_cat6_prob; + vp9_cat6_prob; const int cat6_bits = #if CONFIG_VP9_HIGHBITDEPTH - (xd->bd == VPX_BITS_12) ? 18 - : (xd->bd == VPX_BITS_10) ? 16 : + (xd->bd == VPX_BITS_12) ? 18 + : (xd->bd == VPX_BITS_10) ? 16 + : #endif // CONFIG_VP9_HIGHBITDEPTH - 14; + 14; // Keep value, range, and count as locals. The compiler produces better // results with the locals than using r directly. BD_VALUE value = r->value; diff --git a/vp9/encoder/vp9_aq_cyclicrefresh.c b/vp9/encoder/vp9_aq_cyclicrefresh.c index e336179e90..90792aebea 100644 --- a/vp9/encoder/vp9_aq_cyclicrefresh.c +++ b/vp9/encoder/vp9_aq_cyclicrefresh.c @@ -471,7 +471,7 @@ static void cyclic_refresh_update_map(VP9_COMP *const cpi) { cr->sb_index = i; cr->reduce_refresh = 0; if (cpi->oxcf.content != VP9E_CONTENT_SCREEN) - if (count_sel<(3 * count_tot)>> 2) cr->reduce_refresh = 1; + if (count_sel < (3 * count_tot) >> 2) cr->reduce_refresh = 1; } // Set cyclic refresh parameters. diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c index 75bd097f24..a84c8b524f 100644 --- a/vp9/encoder/vp9_bitstream.c +++ b/vp9/encoder/vp9_bitstream.c @@ -134,9 +134,9 @@ static void pack_mb_tokens(vpx_writer *w, TOKENEXTRA **tp, const TOKENEXTRA *p; const vp9_extra_bit *const extra_bits = #if CONFIG_VP9_HIGHBITDEPTH - (bit_depth == VPX_BITS_12) - ? vp9_extra_bits_high12 - : (bit_depth == VPX_BITS_10) ? vp9_extra_bits_high10 : vp9_extra_bits; + (bit_depth == VPX_BITS_12) ? vp9_extra_bits_high12 + : (bit_depth == VPX_BITS_10) ? vp9_extra_bits_high10 + : vp9_extra_bits; #else vp9_extra_bits; (void)bit_depth; diff --git a/vp9/encoder/vp9_denoiser.c b/vp9/encoder/vp9_denoiser.c index 2885223b59..77d72396ae 100644 --- a/vp9/encoder/vp9_denoiser.c +++ b/vp9/encoder/vp9_denoiser.c @@ -233,7 +233,7 @@ static VP9_DENOISER_DECISION perform_motion_compensation( frame == ALTREF_FRAME || (frame == GOLDEN_FRAME && use_gf_temporal_ref) || (frame != LAST_FRAME && - ((ctx->zeromv_lastref_sse<(5 * ctx->zeromv_sse)>> 2) || + ((ctx->zeromv_lastref_sse < (5 * ctx->zeromv_sse) >> 2) || denoiser->denoising_level >= kDenHigh))) { frame = LAST_FRAME; ctx->newmv_sse = ctx->zeromv_lastref_sse; @@ -764,8 +764,9 @@ int64_t vp9_scale_acskip_thresh(int64_t threshold, VP9_DENOISER_LEVEL noise_level, int abs_sumdiff, int temporal_layer_id) { if (noise_level >= kDenLow && abs_sumdiff < 5) - return threshold *= - (noise_level == kDenLow) ? 2 : (temporal_layer_id == 2) ? 10 : 6; + return threshold *= (noise_level == kDenLow) ? 2 + : (temporal_layer_id == 2) ? 10 + : 6; else return threshold; } diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index 4682cc0030..e9250e25c0 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -2113,11 +2113,10 @@ static int64_t calculate_total_gf_group_bits(VP9_COMP *cpi, } // Clamp odd edge cases. - total_group_bits = (total_group_bits < 0) - ? 0 - : (total_group_bits > twopass->kf_group_bits) - ? twopass->kf_group_bits - : total_group_bits; + total_group_bits = (total_group_bits < 0) ? 0 + : (total_group_bits > twopass->kf_group_bits) + ? twopass->kf_group_bits + : total_group_bits; // Clip based on user supplied data rate variability limit. if (total_group_bits > (int64_t)max_bits * gop_frames) diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c index 701bb89287..8af30c42aa 100644 --- a/vp9/encoder/vp9_temporal_filter.c +++ b/vp9/encoder/vp9_temporal_filter.c @@ -777,16 +777,16 @@ void vp9_temporal_filter_iterate_row_c(VP9_COMP *cpi, ThreadData *td, // Assign higher weight to matching MB if it's error // score is lower. If not applying MC default behavior // is to weight all MBs equal. - blk_fw[0] = err < (thresh_low << THR_SHIFT) - ? 2 - : err < (thresh_high << THR_SHIFT) ? 1 : 0; + blk_fw[0] = err < (thresh_low << THR_SHIFT) ? 2 + : err < (thresh_high << THR_SHIFT) ? 1 + : 0; blk_fw[1] = blk_fw[2] = blk_fw[3] = blk_fw[0]; } else { use_32x32 = 0; for (k = 0; k < 4; k++) - blk_fw[k] = blk_bestsme[k] < thresh_low - ? 2 - : blk_bestsme[k] < thresh_high ? 1 : 0; + blk_fw[k] = blk_bestsme[k] < thresh_low ? 2 + : blk_bestsme[k] < thresh_high ? 1 + : 0; } for (k = 0; k < 4; k++) { diff --git a/vpx_dsp/mips/macros_msa.h b/vpx_dsp/mips/macros_msa.h index 3c2f50c790..d54ce53684 100644 --- a/vpx_dsp/mips/macros_msa.h +++ b/vpx_dsp/mips/macros_msa.h @@ -83,31 +83,33 @@ val_lh_m; \ }) -#define LW(psrc) \ - ({ \ - const uint8_t *psrc_lw_m = (const uint8_t *)(psrc); \ - uint32_t val_lw_m; \ - \ - __asm__ __volatile__("lwr %[val_lw_m], 0(%[psrc_lw_m]) \n\t" \ - "lwl %[val_lw_m], 3(%[psrc_lw_m]) \n\t" \ - : [val_lw_m] "=&r"(val_lw_m) \ - : [psrc_lw_m] "r"(psrc_lw_m)); \ - \ - val_lw_m; \ +#define LW(psrc) \ + ({ \ + const uint8_t *psrc_lw_m = (const uint8_t *)(psrc); \ + uint32_t val_lw_m; \ + \ + __asm__ __volatile__( \ + "lwr %[val_lw_m], 0(%[psrc_lw_m]) \n\t" \ + "lwl %[val_lw_m], 3(%[psrc_lw_m]) \n\t" \ + : [val_lw_m] "=&r"(val_lw_m) \ + : [psrc_lw_m] "r"(psrc_lw_m)); \ + \ + val_lw_m; \ }) #if (__mips == 64) -#define LD(psrc) \ - ({ \ - const uint8_t *psrc_ld_m = (const uint8_t *)(psrc); \ - uint64_t val_ld_m = 0; \ - \ - __asm__ __volatile__("ldr %[val_ld_m], 0(%[psrc_ld_m]) \n\t" \ - "ldl %[val_ld_m], 7(%[psrc_ld_m]) \n\t" \ - : [val_ld_m] "=&r"(val_ld_m) \ - : [psrc_ld_m] "r"(psrc_ld_m)); \ - \ - val_ld_m; \ +#define LD(psrc) \ + ({ \ + const uint8_t *psrc_ld_m = (const uint8_t *)(psrc); \ + uint64_t val_ld_m = 0; \ + \ + __asm__ __volatile__( \ + "ldr %[val_ld_m], 0(%[psrc_ld_m]) \n\t" \ + "ldl %[val_ld_m], 7(%[psrc_ld_m]) \n\t" \ + : [val_ld_m] "=&r"(val_ld_m) \ + : [psrc_ld_m] "r"(psrc_ld_m)); \ + \ + val_ld_m; \ }) #else // !(__mips == 64) #define LD(psrc) \ diff --git a/y4menc.c b/y4menc.c index 02b729e5bb..1877981279 100644 --- a/y4menc.c +++ b/y4menc.c @@ -17,39 +17,34 @@ int y4m_write_file_header(char *buf, size_t len, int width, int height, const char *color; switch (bit_depth) { case 8: - color = fmt == VPX_IMG_FMT_I444 - ? "C444\n" - : fmt == VPX_IMG_FMT_I422 ? "C422\n" : "C420jpeg\n"; + color = fmt == VPX_IMG_FMT_I444 ? "C444\n" + : fmt == VPX_IMG_FMT_I422 ? "C422\n" + : "C420jpeg\n"; break; case 9: - color = fmt == VPX_IMG_FMT_I44416 - ? "C444p9 XYSCSS=444P9\n" - : fmt == VPX_IMG_FMT_I42216 ? "C422p9 XYSCSS=422P9\n" - : "C420p9 XYSCSS=420P9\n"; + color = fmt == VPX_IMG_FMT_I44416 ? "C444p9 XYSCSS=444P9\n" + : fmt == VPX_IMG_FMT_I42216 ? "C422p9 XYSCSS=422P9\n" + : "C420p9 XYSCSS=420P9\n"; break; case 10: - color = fmt == VPX_IMG_FMT_I44416 - ? "C444p10 XYSCSS=444P10\n" - : fmt == VPX_IMG_FMT_I42216 ? "C422p10 XYSCSS=422P10\n" - : "C420p10 XYSCSS=420P10\n"; + color = fmt == VPX_IMG_FMT_I44416 ? "C444p10 XYSCSS=444P10\n" + : fmt == VPX_IMG_FMT_I42216 ? "C422p10 XYSCSS=422P10\n" + : "C420p10 XYSCSS=420P10\n"; break; case 12: - color = fmt == VPX_IMG_FMT_I44416 - ? "C444p12 XYSCSS=444P12\n" - : fmt == VPX_IMG_FMT_I42216 ? "C422p12 XYSCSS=422P12\n" - : "C420p12 XYSCSS=420P12\n"; + color = fmt == VPX_IMG_FMT_I44416 ? "C444p12 XYSCSS=444P12\n" + : fmt == VPX_IMG_FMT_I42216 ? "C422p12 XYSCSS=422P12\n" + : "C420p12 XYSCSS=420P12\n"; break; case 14: - color = fmt == VPX_IMG_FMT_I44416 - ? "C444p14 XYSCSS=444P14\n" - : fmt == VPX_IMG_FMT_I42216 ? "C422p14 XYSCSS=422P14\n" - : "C420p14 XYSCSS=420P14\n"; + color = fmt == VPX_IMG_FMT_I44416 ? "C444p14 XYSCSS=444P14\n" + : fmt == VPX_IMG_FMT_I42216 ? "C422p14 XYSCSS=422P14\n" + : "C420p14 XYSCSS=420P14\n"; break; case 16: - color = fmt == VPX_IMG_FMT_I44416 - ? "C444p16 XYSCSS=444P16\n" - : fmt == VPX_IMG_FMT_I42216 ? "C422p16 XYSCSS=422P16\n" - : "C420p16 XYSCSS=420P16\n"; + color = fmt == VPX_IMG_FMT_I44416 ? "C444p16 XYSCSS=444P16\n" + : fmt == VPX_IMG_FMT_I42216 ? "C422p16 XYSCSS=422P16\n" + : "C420p16 XYSCSS=420P16\n"; break; default: color = NULL; assert(0); } From 2c7657202e839a661997f08c57554523901ad26d Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 24 Aug 2022 15:48:24 -0700 Subject: [PATCH 0487/1000] vp8_ratectrl_rtc_test.cc: ensure frame_type is initialized this fixes a valgrind failure: ==1095597== Conditional jump or move depends on uninitialised value(s) ==1095597== at 0x12E0CC: (anonymous namespace)::Vp8RcInterfaceTest::PreEncodeFrameHook(libvpx_test::VideoSource*, libvpx_test:: > Encoder*) (vp8_ratectrl_rtc_test.cc:131) ==1095597== by 0x1255A9: libvpx_test::EncoderTest::RunLoop(libvpx_test::VideoSource*) (encode_test_driver.cc:205) Bug: webm:1776 Change-Id: Id3b40f62573ee513e79c74b6315c71b6ecd22c9a Fixed: webm:1776 --- test/vp8_ratectrl_rtc_test.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/test/vp8_ratectrl_rtc_test.cc b/test/vp8_ratectrl_rtc_test.cc index ad310666e7..7410f3c01d 100644 --- a/test/vp8_ratectrl_rtc_test.cc +++ b/test/vp8_ratectrl_rtc_test.cc @@ -127,8 +127,7 @@ class Vp8RcInterfaceTest encoder->Control(VP8E_SET_CPUUSED, -6); encoder->Control(VP8E_SET_RTC_EXTERNAL_RATECTRL, 1); encoder->Control(VP8E_SET_MAX_INTRA_BITRATE_PCT, 1000); - } - if (frame_params_.frame_type == INTER_FRAME) { + } else if (frame_params_.frame_type == INTER_FRAME) { // Disable golden frame update. frame_flags_ |= VP8_EFLAG_NO_UPD_GF; frame_flags_ |= VP8_EFLAG_NO_UPD_ARF; From 7663fcb46733253f7ac6625aadc89cea08d942bf Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 24 Aug 2022 18:50:10 -0700 Subject: [PATCH 0488/1000] libs.doxy_template: remove obsolete CLASS_DIAGRAMS This was reported with doxygen 1.9.4. Also update the comment for CLASS_GRAPH by running "doxygen -u" because the original comment for CLASS_GRAPH mentions the obsolete tag 'CLASS_DIAGRAMS', Change-Id: I3bca547201f794d363bd814b7c7f7c9d7088797a --- libs.doxy_template | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/libs.doxy_template b/libs.doxy_template index 73e1b43c72..1ee442af3e 100644 --- a/libs.doxy_template +++ b/libs.doxy_template @@ -1097,15 +1097,6 @@ EXTERNAL_GROUPS = YES # Configuration options related to the dot tool #--------------------------------------------------------------------------- -# If the CLASS_DIAGRAMS tag is set to YES (the default) Doxygen will -# generate a inheritance diagram (in HTML, RTF and la_te_x) for classes with base -# or super classes. Setting the tag to NO turns the diagrams off. Note that -# this option is superseded by the HAVE_DOT option below. This is only a -# fallback. It is recommended to install and use dot, since it yields more -# powerful graphs. - -CLASS_DIAGRAMS = YES - # If set to YES, the inheritance and collaboration graphs will hide # inheritance and usage relations if the target is undocumented # or is not a class. @@ -1119,10 +1110,14 @@ HIDE_UNDOC_RELATIONS = YES HAVE_DOT = NO -# If the CLASS_GRAPH and HAVE_DOT tags are set to YES then doxygen -# will generate a graph for each documented class showing the direct and -# indirect inheritance relations. Setting this tag to YES will force the -# the CLASS_DIAGRAMS tag to NO. +# If the CLASS_GRAPH tag is set to YES (or GRAPH) then doxygen will generate a +# graph for each documented class showing the direct and indirect inheritance +# relations. In case HAVE_DOT is set as well dot will be used to draw the graph, +# otherwise the built-in generator will be used. If the CLASS_GRAPH tag is set +# to TEXT the direct and indirect inheritance relations will be shown as texts / +# links. +# Possible values are: NO, YES, TEXT and GRAPH. +# The default value is: YES. CLASS_GRAPH = YES From 722d4daf3581703c10c3ff79022d5b92991be832 Mon Sep 17 00:00:00 2001 From: James Zern Date: Thu, 25 Aug 2022 10:50:16 -0700 Subject: [PATCH 0489/1000] vpx_encoder.h: note VPX_ERROR_RESILIENT_PARTITIONS is VP8-only Change-Id: If71b2ec766f9f41253ce5a34987ffd208f9c8381 --- vpx/vpx_encoder.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vpx/vpx_encoder.h b/vpx/vpx_encoder.h index e776ec8136..efaf5ef366 100644 --- a/vpx/vpx_encoder.h +++ b/vpx/vpx_encoder.h @@ -136,7 +136,8 @@ typedef uint32_t vpx_codec_er_flags_t; /*!\brief The frame partitions are independently decodable by the bool decoder, * meaning that partitions can be decoded even though earlier partitions have * been lost. Note that intra prediction is still done over the partition - * boundary. */ + * boundary. + * \note This is only supported by VP8.*/ #define VPX_ERROR_RESILIENT_PARTITIONS 0x2u /*!\brief Encoder output packet variants From 13970b7ecaa539fcb311325520d981ef006db105 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Wed, 24 Aug 2022 12:28:43 +0000 Subject: [PATCH 0490/1000] [NEON] Add highbd *variance* functions Total gain for 12-bit encoding: * ~7.2% for best profile * ~5.8% for rt profile Change-Id: I5b70415fb89d1bbb02a0c139eb317ba6b08adede --- test/variance_test.cc | 218 +++++++++++++ vpx_dsp/arm/highbd_variance_neon.c | 502 +++++++++++++++++++++++++++++ vpx_dsp/variance.c | 6 +- vpx_dsp/vpx_dsp.mk | 1 + vpx_dsp/vpx_dsp_rtcd_defs.pl | 244 +++++++------- 5 files changed, 860 insertions(+), 111 deletions(-) create mode 100644 vpx_dsp/arm/highbd_variance_neon.c diff --git a/test/variance_test.cc b/test/variance_test.cc index 80855052dc..8aed5d2ed9 100644 --- a/test/variance_test.cc +++ b/test/variance_test.cc @@ -1495,6 +1495,224 @@ INSTANTIATE_TEST_SUITE_P( SubpelAvgVarianceParams(3, 2, &vpx_sub_pixel_avg_variance8x4_neon, 0), SubpelAvgVarianceParams(2, 3, &vpx_sub_pixel_avg_variance4x8_neon, 0), SubpelAvgVarianceParams(2, 2, &vpx_sub_pixel_avg_variance4x4_neon, 0))); + +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_SUITE_P( + NEON, VpxHBDVarianceTest, + ::testing::Values( + VarianceParams(6, 6, &vpx_highbd_12_variance64x64_neon, 12), + VarianceParams(6, 5, &vpx_highbd_12_variance64x32_neon, 12), + VarianceParams(5, 6, &vpx_highbd_12_variance32x64_neon, 12), + VarianceParams(5, 5, &vpx_highbd_12_variance32x32_neon, 12), + VarianceParams(5, 4, &vpx_highbd_12_variance32x16_neon, 12), + VarianceParams(4, 5, &vpx_highbd_12_variance16x32_neon, 12), + VarianceParams(4, 4, &vpx_highbd_12_variance16x16_neon, 12), + VarianceParams(4, 3, &vpx_highbd_12_variance16x8_neon, 12), + VarianceParams(3, 4, &vpx_highbd_12_variance8x16_neon, 12), + VarianceParams(3, 3, &vpx_highbd_12_variance8x8_neon, 12), + VarianceParams(3, 2, &vpx_highbd_12_variance8x4_neon, 12), + VarianceParams(2, 3, &vpx_highbd_12_variance4x8_neon, 12), + VarianceParams(2, 2, &vpx_highbd_12_variance4x4_neon, 12), + VarianceParams(6, 6, &vpx_highbd_10_variance64x64_neon, 10), + VarianceParams(6, 5, &vpx_highbd_10_variance64x32_neon, 10), + VarianceParams(5, 6, &vpx_highbd_10_variance32x64_neon, 10), + VarianceParams(5, 5, &vpx_highbd_10_variance32x32_neon, 10), + VarianceParams(5, 4, &vpx_highbd_10_variance32x16_neon, 10), + VarianceParams(4, 5, &vpx_highbd_10_variance16x32_neon, 10), + VarianceParams(4, 4, &vpx_highbd_10_variance16x16_neon, 10), + VarianceParams(4, 3, &vpx_highbd_10_variance16x8_neon, 10), + VarianceParams(3, 4, &vpx_highbd_10_variance8x16_neon, 10), + VarianceParams(3, 3, &vpx_highbd_10_variance8x8_neon, 10), + VarianceParams(3, 2, &vpx_highbd_10_variance8x4_neon, 10), + VarianceParams(2, 3, &vpx_highbd_10_variance4x8_neon, 10), + VarianceParams(2, 2, &vpx_highbd_10_variance4x4_neon, 10), + VarianceParams(6, 6, &vpx_highbd_8_variance64x64_neon, 8), + VarianceParams(6, 5, &vpx_highbd_8_variance64x32_neon, 8), + VarianceParams(5, 6, &vpx_highbd_8_variance32x64_neon, 8), + VarianceParams(5, 5, &vpx_highbd_8_variance32x32_neon, 8), + VarianceParams(5, 4, &vpx_highbd_8_variance32x16_neon, 8), + VarianceParams(4, 5, &vpx_highbd_8_variance16x32_neon, 8), + VarianceParams(4, 4, &vpx_highbd_8_variance16x16_neon, 8), + VarianceParams(4, 3, &vpx_highbd_8_variance16x8_neon, 8), + VarianceParams(3, 4, &vpx_highbd_8_variance8x16_neon, 8), + VarianceParams(3, 3, &vpx_highbd_8_variance8x8_neon, 8), + VarianceParams(3, 2, &vpx_highbd_8_variance8x4_neon, 8), + VarianceParams(2, 3, &vpx_highbd_8_variance4x8_neon, 8), + VarianceParams(2, 2, &vpx_highbd_8_variance4x4_neon, 8))); + +INSTANTIATE_TEST_SUITE_P( + NEON, VpxHBDSubpelVarianceTest, + ::testing::Values( + SubpelVarianceParams(6, 6, &vpx_highbd_12_sub_pixel_variance64x64_neon, + 12), + SubpelVarianceParams(6, 5, &vpx_highbd_12_sub_pixel_variance64x32_neon, + 12), + SubpelVarianceParams(5, 6, &vpx_highbd_12_sub_pixel_variance32x64_neon, + 12), + SubpelVarianceParams(5, 5, &vpx_highbd_12_sub_pixel_variance32x32_neon, + 12), + SubpelVarianceParams(5, 4, &vpx_highbd_12_sub_pixel_variance32x16_neon, + 12), + SubpelVarianceParams(4, 5, &vpx_highbd_12_sub_pixel_variance16x32_neon, + 12), + SubpelVarianceParams(4, 4, &vpx_highbd_12_sub_pixel_variance16x16_neon, + 12), + SubpelVarianceParams(4, 3, &vpx_highbd_12_sub_pixel_variance16x8_neon, + 12), + SubpelVarianceParams(3, 4, &vpx_highbd_12_sub_pixel_variance8x16_neon, + 12), + SubpelVarianceParams(3, 3, &vpx_highbd_12_sub_pixel_variance8x8_neon, + 12), + SubpelVarianceParams(3, 2, &vpx_highbd_12_sub_pixel_variance8x4_neon, + 12), + SubpelVarianceParams(6, 6, &vpx_highbd_10_sub_pixel_variance64x64_neon, + 10), + SubpelVarianceParams(6, 5, &vpx_highbd_10_sub_pixel_variance64x32_neon, + 10), + SubpelVarianceParams(5, 6, &vpx_highbd_10_sub_pixel_variance32x64_neon, + 10), + SubpelVarianceParams(5, 5, &vpx_highbd_10_sub_pixel_variance32x32_neon, + 10), + SubpelVarianceParams(5, 4, &vpx_highbd_10_sub_pixel_variance32x16_neon, + 10), + SubpelVarianceParams(4, 5, &vpx_highbd_10_sub_pixel_variance16x32_neon, + 10), + SubpelVarianceParams(4, 4, &vpx_highbd_10_sub_pixel_variance16x16_neon, + 10), + SubpelVarianceParams(4, 3, &vpx_highbd_10_sub_pixel_variance16x8_neon, + 10), + SubpelVarianceParams(3, 4, &vpx_highbd_10_sub_pixel_variance8x16_neon, + 10), + SubpelVarianceParams(3, 3, &vpx_highbd_10_sub_pixel_variance8x8_neon, + 10), + SubpelVarianceParams(3, 2, &vpx_highbd_10_sub_pixel_variance8x4_neon, + 10), + SubpelVarianceParams(6, 6, &vpx_highbd_8_sub_pixel_variance64x64_neon, + 8), + SubpelVarianceParams(6, 5, &vpx_highbd_8_sub_pixel_variance64x32_neon, + 8), + SubpelVarianceParams(5, 6, &vpx_highbd_8_sub_pixel_variance32x64_neon, + 8), + SubpelVarianceParams(5, 5, &vpx_highbd_8_sub_pixel_variance32x32_neon, + 8), + SubpelVarianceParams(5, 4, &vpx_highbd_8_sub_pixel_variance32x16_neon, + 8), + SubpelVarianceParams(4, 5, &vpx_highbd_8_sub_pixel_variance16x32_neon, + 8), + SubpelVarianceParams(4, 4, &vpx_highbd_8_sub_pixel_variance16x16_neon, + 8), + SubpelVarianceParams(4, 3, &vpx_highbd_8_sub_pixel_variance16x8_neon, + 8), + SubpelVarianceParams(3, 4, &vpx_highbd_8_sub_pixel_variance8x16_neon, + 8), + SubpelVarianceParams(3, 3, &vpx_highbd_8_sub_pixel_variance8x8_neon, 8), + SubpelVarianceParams(3, 2, &vpx_highbd_8_sub_pixel_variance8x4_neon, + 8))); + +INSTANTIATE_TEST_SUITE_P( + NEON, VpxHBDSubpelAvgVarianceTest, + ::testing::Values( + SubpelAvgVarianceParams(6, 6, + &vpx_highbd_12_sub_pixel_avg_variance64x64_neon, + 12), + SubpelAvgVarianceParams(6, 5, + &vpx_highbd_12_sub_pixel_avg_variance64x32_neon, + 12), + SubpelAvgVarianceParams(5, 6, + &vpx_highbd_12_sub_pixel_avg_variance32x64_neon, + 12), + SubpelAvgVarianceParams(5, 5, + &vpx_highbd_12_sub_pixel_avg_variance32x32_neon, + 12), + SubpelAvgVarianceParams(5, 4, + &vpx_highbd_12_sub_pixel_avg_variance32x16_neon, + 12), + SubpelAvgVarianceParams(4, 5, + &vpx_highbd_12_sub_pixel_avg_variance16x32_neon, + 12), + SubpelAvgVarianceParams(4, 4, + &vpx_highbd_12_sub_pixel_avg_variance16x16_neon, + 12), + SubpelAvgVarianceParams(4, 3, + &vpx_highbd_12_sub_pixel_avg_variance16x8_neon, + 12), + SubpelAvgVarianceParams(3, 4, + &vpx_highbd_12_sub_pixel_avg_variance8x16_neon, + 12), + SubpelAvgVarianceParams(3, 3, + &vpx_highbd_12_sub_pixel_avg_variance8x8_neon, + 12), + SubpelAvgVarianceParams(3, 2, + &vpx_highbd_12_sub_pixel_avg_variance8x4_neon, + 12), + SubpelAvgVarianceParams(6, 6, + &vpx_highbd_10_sub_pixel_avg_variance64x64_neon, + 10), + SubpelAvgVarianceParams(6, 5, + &vpx_highbd_10_sub_pixel_avg_variance64x32_neon, + 10), + SubpelAvgVarianceParams(5, 6, + &vpx_highbd_10_sub_pixel_avg_variance32x64_neon, + 10), + SubpelAvgVarianceParams(5, 5, + &vpx_highbd_10_sub_pixel_avg_variance32x32_neon, + 10), + SubpelAvgVarianceParams(5, 4, + &vpx_highbd_10_sub_pixel_avg_variance32x16_neon, + 10), + SubpelAvgVarianceParams(4, 5, + &vpx_highbd_10_sub_pixel_avg_variance16x32_neon, + 10), + SubpelAvgVarianceParams(4, 4, + &vpx_highbd_10_sub_pixel_avg_variance16x16_neon, + 10), + SubpelAvgVarianceParams(4, 3, + &vpx_highbd_10_sub_pixel_avg_variance16x8_neon, + 10), + SubpelAvgVarianceParams(3, 4, + &vpx_highbd_10_sub_pixel_avg_variance8x16_neon, + 10), + SubpelAvgVarianceParams(3, 3, + &vpx_highbd_10_sub_pixel_avg_variance8x8_neon, + 10), + SubpelAvgVarianceParams(3, 2, + &vpx_highbd_10_sub_pixel_avg_variance8x4_neon, + 10), + SubpelAvgVarianceParams(6, 6, + &vpx_highbd_8_sub_pixel_avg_variance64x64_neon, + 8), + SubpelAvgVarianceParams(6, 5, + &vpx_highbd_8_sub_pixel_avg_variance64x32_neon, + 8), + SubpelAvgVarianceParams(5, 6, + &vpx_highbd_8_sub_pixel_avg_variance32x64_neon, + 8), + SubpelAvgVarianceParams(5, 5, + &vpx_highbd_8_sub_pixel_avg_variance32x32_neon, + 8), + SubpelAvgVarianceParams(5, 4, + &vpx_highbd_8_sub_pixel_avg_variance32x16_neon, + 8), + SubpelAvgVarianceParams(4, 5, + &vpx_highbd_8_sub_pixel_avg_variance16x32_neon, + 8), + SubpelAvgVarianceParams(4, 4, + &vpx_highbd_8_sub_pixel_avg_variance16x16_neon, + 8), + SubpelAvgVarianceParams(4, 3, + &vpx_highbd_8_sub_pixel_avg_variance16x8_neon, + 8), + SubpelAvgVarianceParams(3, 4, + &vpx_highbd_8_sub_pixel_avg_variance8x16_neon, + 8), + SubpelAvgVarianceParams(3, 3, + &vpx_highbd_8_sub_pixel_avg_variance8x8_neon, + 8), + SubpelAvgVarianceParams(3, 2, + &vpx_highbd_8_sub_pixel_avg_variance8x4_neon, + 8))); + +#endif // CONFIG_VP9_HIGHBITDEPTH #endif // HAVE_NEON #if HAVE_MSA diff --git a/vpx_dsp/arm/highbd_variance_neon.c b/vpx_dsp/arm/highbd_variance_neon.c new file mode 100644 index 0000000000..3a60a14ab8 --- /dev/null +++ b/vpx_dsp/arm/highbd_variance_neon.c @@ -0,0 +1,502 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_dsp_rtcd.h" +#include "./vpx_config.h" + +#include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" +#include "vpx_ports/mem.h" + +static const uint8_t bilinear_filters[8][2] = { + { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 }, + { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 }, +}; + +static INLINE void highbd_variance16(const uint16_t *src_ptr, int src_stride, + const uint16_t *ref_ptr, int ref_stride, + int w, int h, uint64_t *sse, + int64_t *sum) { + int i, j; + + if (w >= 8) { + int32x4_t sum_s32 = vdupq_n_s32(0); + uint32x4_t sse_u32 = vdupq_n_u32(0); + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 8) { + const int16x8_t src_s16 = vreinterpretq_s16_u16(vld1q_u16(&src_ptr[j])); + const int16x8_t ref_s16 = vreinterpretq_s16_u16(vld1q_u16(&ref_ptr[j])); + const int32x4_t diff1_s32 = + vsubl_s16(vget_low_s16(src_s16), vget_low_s16(ref_s16)); + const int32x4_t diff2_s32 = + vsubl_s16(vget_high_s16(src_s16), vget_high_s16(ref_s16)); + const uint32x4_t diff1_u32 = vreinterpretq_u32_s32(diff1_s32); + const uint32x4_t diff2_u32 = vreinterpretq_u32_s32(diff2_s32); + sum_s32 = vaddq_s32(sum_s32, diff1_s32); + sum_s32 = vaddq_s32(sum_s32, diff2_s32); + sse_u32 = vmlaq_u32(sse_u32, diff1_u32, diff1_u32); + sse_u32 = vmlaq_u32(sse_u32, diff2_u32, diff2_u32); + } + src_ptr += src_stride; + ref_ptr += ref_stride; + } + *sum = horizontal_add_int32x4(sum_s32); + *sse = horizontal_add_uint32x4(sse_u32); + } else { + int32x4_t sum_s32 = vdupq_n_s32(0); + uint32x4_t sse_u32 = vdupq_n_u32(0); + assert(w >= 4); + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 4) { + const int16x4_t src_s16 = vreinterpret_s16_u16(vld1_u16(&src_ptr[j])); + const int16x4_t ref_s16 = vreinterpret_s16_u16(vld1_u16(&ref_ptr[j])); + const int32x4_t diff_s32 = vsubl_s16(src_s16, ref_s16); + const uint32x4_t diff_u32 = vreinterpretq_u32_s32(diff_s32); + sum_s32 = vaddq_s32(sum_s32, diff_s32); + sse_u32 = vmlaq_u32(sse_u32, diff_u32, diff_u32); + } + src_ptr += src_stride; + ref_ptr += ref_stride; + } + *sum = horizontal_add_int32x4(sum_s32); + *sse = horizontal_add_uint32x4(sse_u32); + } +} + +static INLINE void highbd_variance64(const uint8_t *src8_ptr, int src_stride, + const uint8_t *ref8_ptr, int ref_stride, + int w, int h, uint64_t *sse, + int64_t *sum) { + uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8_ptr); + uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref8_ptr); + + if (w < 32 && h < 32) { + highbd_variance16(src_ptr, src_stride, ref_ptr, ref_stride, w, h, sse, sum); + } else { + uint64_t sse_long = 0; + int64_t sum_long = 0; + int k, l; + for (k = 0; k + 16 <= h; k += 16) { + for (l = 0; l + 16 <= w; l += 16) { + uint64_t sse_tmp = 0; + int64_t sum_tmp = 0; + highbd_variance16(src_ptr + l, src_stride, ref_ptr + l, ref_stride, 16, + 16, &sse_tmp, &sum_tmp); + sum_long += sum_tmp; + sse_long += sse_tmp; + } + src_ptr += 16 * src_stride; + ref_ptr += 16 * ref_stride; + } + *sum = sum_long; + *sse = sse_long; + } +} + +static INLINE void highbd_8_variance(const uint8_t *src8_ptr, int src_stride, + const uint8_t *ref8_ptr, int ref_stride, + int w, int h, uint32_t *sse, int *sum) { + uint64_t sse_long = 0; + int64_t sum_long = 0; + highbd_variance64(src8_ptr, src_stride, ref8_ptr, ref_stride, w, h, &sse_long, + &sum_long); + *sse = (uint32_t)sse_long; + *sum = (int)sum_long; +} + +static INLINE void highbd_10_variance(const uint8_t *src8_ptr, int src_stride, + const uint8_t *ref8_ptr, int ref_stride, + int w, int h, uint32_t *sse, int *sum) { + uint64_t sse_long = 0; + int64_t sum_long = 0; + highbd_variance64(src8_ptr, src_stride, ref8_ptr, ref_stride, w, h, &sse_long, + &sum_long); + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4); + *sum = (int)ROUND_POWER_OF_TWO(sum_long, 2); +} + +static INLINE void highbd_12_variance(const uint8_t *src8_ptr, int src_stride, + const uint8_t *ref8_ptr, int ref_stride, + int w, int h, uint32_t *sse, int *sum) { + uint64_t sse_long = 0; + int64_t sum_long = 0; + highbd_variance64(src8_ptr, src_stride, ref8_ptr, ref_stride, w, h, &sse_long, + &sum_long); + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8); + *sum = (int)ROUND_POWER_OF_TWO(sum_long, 4); +} + +#define HIGHBD_VAR(W, H) \ + uint32_t vpx_highbd_8_variance##W##x##H##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + highbd_8_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \ + &sum); \ + return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \ + } \ + \ + uint32_t vpx_highbd_10_variance##W##x##H##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + int64_t var; \ + highbd_10_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \ + &sum); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } \ + \ + uint32_t vpx_highbd_12_variance##W##x##H##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + int64_t var; \ + highbd_12_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \ + &sum); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } + +#define HIGHBD_GET_VAR(S) \ + void vpx_highbd_8_get##S##x##S##var_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse, int *sum) { \ + highbd_8_variance(src_ptr, src_stride, ref_ptr, ref_stride, S, S, sse, \ + sum); \ + } \ + \ + void vpx_highbd_10_get##S##x##S##var_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse, int *sum) { \ + highbd_10_variance(src_ptr, src_stride, ref_ptr, ref_stride, S, S, sse, \ + sum); \ + } \ + \ + void vpx_highbd_12_get##S##x##S##var_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse, int *sum) { \ + highbd_12_variance(src_ptr, src_stride, ref_ptr, ref_stride, S, S, sse, \ + sum); \ + } + +#define HIGHBD_MSE(W, H) \ + uint32_t vpx_highbd_8_mse##W##x##H##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + highbd_8_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \ + &sum); \ + return *sse; \ + } \ + \ + uint32_t vpx_highbd_10_mse##W##x##H##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + highbd_10_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \ + &sum); \ + return *sse; \ + } \ + \ + uint32_t vpx_highbd_12_mse##W##x##H##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + highbd_12_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \ + &sum); \ + return *sse; \ + } + +static INLINE void highbd_var_filter_block2d_bil_first_pass( + const uint8_t *src_ptr8, uint16_t *output_ptr, + unsigned int src_pixels_per_line, int pixel_step, + unsigned int output_height, unsigned int output_width, + const uint8_t *filter) { + uint32_t i, j; + uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src_ptr8); + + uint32x4_t round_u32 = vshlq_n_u32(vdupq_n_u32(1), FILTER_BITS - 1); + uint16x4_t filter1_u16 = vdup_n_u16(filter[0]); + uint16x4_t filter2_u16 = vdup_n_u16(filter[1]); + + if (output_width >= 8) { + for (i = 0; i < output_height; ++i) { + for (j = 0; j < output_width; j += 8) { + uint32x4_t sum1_u32; + uint32x4_t sum2_u32; + uint16x4_t out1_u16; + uint16x4_t out2_u16; + const uint16x8_t src1_u16 = vld1q_u16(&src_ptr[j]); + const uint16x8_t src2_u16 = vld1q_u16(&src_ptr[j + pixel_step]); + sum1_u32 = vmull_u16(filter1_u16, vget_low_u16(src1_u16)); + sum2_u32 = vmull_u16(filter1_u16, vget_high_u16(src1_u16)); + sum1_u32 = vmlal_u16(sum1_u32, filter2_u16, vget_low_u16(src2_u16)); + sum2_u32 = vmlal_u16(sum2_u32, filter2_u16, vget_high_u16(src2_u16)); + out1_u16 = vshrn_n_u32(vaddq_u32(sum1_u32, round_u32), FILTER_BITS); + out2_u16 = vshrn_n_u32(vaddq_u32(sum2_u32, round_u32), FILTER_BITS); + vst1q_u16(&output_ptr[j], vcombine_u16(out1_u16, out2_u16)); + } + // Next row... + src_ptr += src_pixels_per_line; + output_ptr += output_width; + } + } else { + assert(output_width >= 4); + for (i = 0; i < output_height; ++i) { + for (j = 0; j < output_width; j += 4) { + uint32x4_t sum_u32; + uint16x4_t out_u16; + const uint16x4_t src1_u16 = vld1_u16(&src_ptr[j]); + const uint16x4_t src2_u16 = vld1_u16(&src_ptr[j + pixel_step]); + sum_u32 = vmull_u16(filter1_u16, src1_u16); + sum_u32 = vmlal_u16(sum_u32, filter2_u16, src2_u16); + out_u16 = vshrn_n_u32(vaddq_u32(sum_u32, round_u32), FILTER_BITS); + vst1_u16(&output_ptr[j], out_u16); + } + // Next row... + src_ptr += src_pixels_per_line; + output_ptr += output_width; + } + } +} + +static INLINE void highbd_var_filter_block2d_bil_second_pass( + const uint16_t *src_ptr, uint16_t *output_ptr, + unsigned int src_pixels_per_line, unsigned int pixel_step, + unsigned int output_height, unsigned int output_width, + const uint8_t *filter) { + uint32_t i, j; + + uint32x4_t round_u32 = vshlq_n_u32(vdupq_n_u32(1), FILTER_BITS - 1); + uint16x4_t filter1_u16 = vdup_n_u16(filter[0]); + uint16x4_t filter2_u16 = vdup_n_u16(filter[1]); + + if (output_width >= 8) { + for (i = 0; i < output_height; ++i) { + for (j = 0; j < output_width; j += 8) { + uint32x4_t sum1_u32; + uint32x4_t sum2_u32; + uint16x4_t out1_u16; + uint16x4_t out2_u16; + const uint16x8_t src1_u16 = vld1q_u16(&src_ptr[j]); + const uint16x8_t src2_u16 = vld1q_u16(&src_ptr[j + pixel_step]); + sum1_u32 = vmull_u16(filter1_u16, vget_low_u16(src1_u16)); + sum2_u32 = vmull_u16(filter1_u16, vget_high_u16(src1_u16)); + sum1_u32 = vmlal_u16(sum1_u32, filter2_u16, vget_low_u16(src2_u16)); + sum2_u32 = vmlal_u16(sum2_u32, filter2_u16, vget_high_u16(src2_u16)); + out1_u16 = vshrn_n_u32(vaddq_u32(sum1_u32, round_u32), FILTER_BITS); + out2_u16 = vshrn_n_u32(vaddq_u32(sum2_u32, round_u32), FILTER_BITS); + vst1q_u16(&output_ptr[j], vcombine_u16(out1_u16, out2_u16)); + } + // Next row... + src_ptr += src_pixels_per_line; + output_ptr += output_width; + } + } else { + assert(output_width >= 4); + for (i = 0; i < output_height; ++i) { + for (j = 0; j < output_width; j += 4) { + uint32x4_t sum_u32; + uint16x4_t out_u16; + const uint16x4_t src1_u16 = vld1_u16(&src_ptr[j]); + const uint16x4_t src2_u16 = vld1_u16(&src_ptr[j + pixel_step]); + sum_u32 = vmull_u16(filter1_u16, src1_u16); + sum_u32 = vmlal_u16(sum_u32, filter2_u16, src2_u16); + out_u16 = vshrn_n_u32(vaddq_u32(sum_u32, round_u32), FILTER_BITS); + vst1_u16(&output_ptr[j], out_u16); + } + // Next row... + src_ptr += src_pixels_per_line; + output_ptr += output_width; + } + } +} + +#define HIGHBD_SUBPIX_VAR(W, H) \ + uint32_t vpx_highbd_8_sub_pixel_variance##W##x##H##_neon( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + \ + highbd_var_filter_block2d_bil_first_pass( \ + src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \ + highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters[y_offset]); \ + \ + return vpx_highbd_8_variance##W##x##H##_neon(CONVERT_TO_BYTEPTR(temp2), W, \ + ref_ptr, ref_stride, sse); \ + } \ + \ + uint32_t vpx_highbd_10_sub_pixel_variance##W##x##H##_neon( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + \ + highbd_var_filter_block2d_bil_first_pass( \ + src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \ + highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters[y_offset]); \ + \ + return vpx_highbd_10_variance##W##x##H##_neon( \ + CONVERT_TO_BYTEPTR(temp2), W, ref_ptr, ref_stride, sse); \ + } \ + \ + uint32_t vpx_highbd_12_sub_pixel_variance##W##x##H##_neon( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + \ + highbd_var_filter_block2d_bil_first_pass( \ + src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \ + highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters[y_offset]); \ + \ + return vpx_highbd_12_variance##W##x##H##_neon( \ + CONVERT_TO_BYTEPTR(temp2), W, ref_ptr, ref_stride, sse); \ + } + +#define HIGHBD_SUBPIX_AVG_VAR(W, H) \ + uint32_t vpx_highbd_8_sub_pixel_avg_variance##W##x##H##_neon( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ + \ + highbd_var_filter_block2d_bil_first_pass( \ + src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \ + highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters[y_offset]); \ + \ + vpx_highbd_comp_avg_pred_neon(temp3, CONVERT_TO_SHORTPTR(second_pred), W, \ + H, temp2, W); \ + \ + return vpx_highbd_8_variance##W##x##H##_neon(CONVERT_TO_BYTEPTR(temp3), W, \ + ref_ptr, ref_stride, sse); \ + } \ + \ + uint32_t vpx_highbd_10_sub_pixel_avg_variance##W##x##H##_neon( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ + \ + highbd_var_filter_block2d_bil_first_pass( \ + src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \ + highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters[y_offset]); \ + \ + vpx_highbd_comp_avg_pred_neon(temp3, CONVERT_TO_SHORTPTR(second_pred), W, \ + H, temp2, W); \ + \ + return vpx_highbd_10_variance##W##x##H##_neon( \ + CONVERT_TO_BYTEPTR(temp3), W, ref_ptr, ref_stride, sse); \ + } \ + \ + uint32_t vpx_highbd_12_sub_pixel_avg_variance##W##x##H##_neon( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ + \ + highbd_var_filter_block2d_bil_first_pass( \ + src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \ + highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters[y_offset]); \ + \ + vpx_highbd_comp_avg_pred_neon(temp3, CONVERT_TO_SHORTPTR(second_pred), W, \ + H, temp2, W); \ + \ + return vpx_highbd_12_variance##W##x##H##_neon( \ + CONVERT_TO_BYTEPTR(temp3), W, ref_ptr, ref_stride, sse); \ + } + +void vpx_highbd_comp_avg_pred_neon(uint16_t *comp_pred, const uint16_t *pred, + int width, int height, const uint16_t *ref, + int ref_stride) { + int i, j; + uint32x4_t one_u32 = vdupq_n_u32(1); + if (width >= 8) { + for (i = 0; i < height; ++i) { + for (j = 0; j < width; j += 8) { + const uint16x8_t pred_u16 = vld1q_u16(&pred[j]); + const uint16x8_t ref_u16 = vld1q_u16(&ref[j]); + const uint32x4_t sum1_u32 = + vaddl_u16(vget_low_u16(pred_u16), vget_low_u16(ref_u16)); + const uint32x4_t sum2_u32 = + vaddl_u16(vget_high_u16(pred_u16), vget_high_u16(ref_u16)); + const uint16x4_t sum1_u16 = + vshrn_n_u32(vaddq_u32(sum1_u32, one_u32), 1); + const uint16x4_t sum2_u16 = + vshrn_n_u32(vaddq_u32(sum2_u32, one_u32), 1); + const uint16x8_t vcomp_pred = vcombine_u16(sum1_u16, sum2_u16); + vst1q_u16(&comp_pred[j], vcomp_pred); + } + comp_pred += width; + pred += width; + ref += ref_stride; + } + } else { + assert(width >= 4); + for (i = 0; i < height; ++i) { + for (j = 0; j < width; j += 4) { + const uint16x4_t pred_u16 = vld1_u16(&pred[j]); + const uint16x4_t ref_u16 = vld1_u16(&ref[j]); + const uint32x4_t sum_u32 = vaddl_u16(pred_u16, ref_u16); + const uint16x4_t vcomp_pred = + vshrn_n_u32(vaddq_u32(sum_u32, one_u32), 1); + vst1_u16(&comp_pred[j], vcomp_pred); + } + comp_pred += width; + pred += width; + ref += ref_stride; + } + } +} + +/* All three forms of the variance are available in the same sizes. */ +#define HIGHBD_VARIANCES(W, H) \ + HIGHBD_VAR(W, H) \ + HIGHBD_SUBPIX_VAR(W, H) \ + HIGHBD_SUBPIX_AVG_VAR(W, H) + +HIGHBD_VARIANCES(64, 64) +HIGHBD_VARIANCES(64, 32) +HIGHBD_VARIANCES(32, 64) +HIGHBD_VARIANCES(32, 32) +HIGHBD_VARIANCES(32, 16) +HIGHBD_VARIANCES(16, 32) +HIGHBD_VARIANCES(16, 16) +HIGHBD_VARIANCES(16, 8) +HIGHBD_VARIANCES(8, 16) +HIGHBD_VARIANCES(8, 8) +HIGHBD_VARIANCES(8, 4) +HIGHBD_VARIANCES(4, 8) +HIGHBD_VARIANCES(4, 4) + +HIGHBD_GET_VAR(8) +HIGHBD_GET_VAR(16) + +HIGHBD_MSE(16, 16) +HIGHBD_MSE(16, 8) +HIGHBD_MSE(8, 16) +HIGHBD_MSE(8, 8) diff --git a/vpx_dsp/variance.c b/vpx_dsp/variance.c index 30b55dcb40..ce1e8382b9 100644 --- a/vpx_dsp/variance.c +++ b/vpx_dsp/variance.c @@ -549,9 +549,9 @@ HIGHBD_MSE(16, 8) HIGHBD_MSE(8, 16) HIGHBD_MSE(8, 8) -void vpx_highbd_comp_avg_pred(uint16_t *comp_pred, const uint16_t *pred, - int width, int height, const uint16_t *ref, - int ref_stride) { +void vpx_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint16_t *pred, + int width, int height, const uint16_t *ref, + int ref_stride) { int i, j; for (i = 0; i < height; ++i) { for (j = 0; j < width; ++j) { diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index 1a03aed526..3019bff8f7 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -430,6 +430,7 @@ ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_sse2.c DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_impl_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/highbd_subpel_variance_impl_sse2.asm +DSP_SRCS-$(HAVE_NEON) += arm/highbd_variance_neon.c endif # CONFIG_VP9_HIGHBITDEPTH endif # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 6cd46129f0..44dee56780 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -1237,369 +1237,397 @@ () if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/unsigned int vpx_highbd_12_variance64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_12_variance64x64 sse2/; + specialize qw/vpx_highbd_12_variance64x64 sse2 neon/; add_proto qw/unsigned int vpx_highbd_12_variance64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_12_variance64x32 sse2/; + specialize qw/vpx_highbd_12_variance64x32 sse2 neon/; add_proto qw/unsigned int vpx_highbd_12_variance32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_12_variance32x64 sse2/; + specialize qw/vpx_highbd_12_variance32x64 sse2 neon/; add_proto qw/unsigned int vpx_highbd_12_variance32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_12_variance32x32 sse2/; + specialize qw/vpx_highbd_12_variance32x32 sse2 neon/; add_proto qw/unsigned int vpx_highbd_12_variance32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_12_variance32x16 sse2/; + specialize qw/vpx_highbd_12_variance32x16 sse2 neon/; add_proto qw/unsigned int vpx_highbd_12_variance16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_12_variance16x32 sse2/; + specialize qw/vpx_highbd_12_variance16x32 sse2 neon/; add_proto qw/unsigned int vpx_highbd_12_variance16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_12_variance16x16 sse2/; + specialize qw/vpx_highbd_12_variance16x16 sse2 neon/; add_proto qw/unsigned int vpx_highbd_12_variance16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_12_variance16x8 sse2/; + specialize qw/vpx_highbd_12_variance16x8 sse2 neon/; add_proto qw/unsigned int vpx_highbd_12_variance8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_12_variance8x16 sse2/; + specialize qw/vpx_highbd_12_variance8x16 sse2 neon/; add_proto qw/unsigned int vpx_highbd_12_variance8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_12_variance8x8 sse2/; + specialize qw/vpx_highbd_12_variance8x8 sse2 neon/; add_proto qw/unsigned int vpx_highbd_12_variance8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_12_variance8x4 neon/; add_proto qw/unsigned int vpx_highbd_12_variance4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_12_variance4x8 neon/; add_proto qw/unsigned int vpx_highbd_12_variance4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_12_variance4x4 neon/; add_proto qw/unsigned int vpx_highbd_10_variance64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_10_variance64x64 sse2/; + specialize qw/vpx_highbd_10_variance64x64 sse2 neon/; add_proto qw/unsigned int vpx_highbd_10_variance64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_10_variance64x32 sse2/; + specialize qw/vpx_highbd_10_variance64x32 sse2 neon/; add_proto qw/unsigned int vpx_highbd_10_variance32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_10_variance32x64 sse2/; + specialize qw/vpx_highbd_10_variance32x64 sse2 neon/; add_proto qw/unsigned int vpx_highbd_10_variance32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_10_variance32x32 sse2/; + specialize qw/vpx_highbd_10_variance32x32 sse2 neon/; add_proto qw/unsigned int vpx_highbd_10_variance32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_10_variance32x16 sse2/; + specialize qw/vpx_highbd_10_variance32x16 sse2 neon/; add_proto qw/unsigned int vpx_highbd_10_variance16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_10_variance16x32 sse2/; + specialize qw/vpx_highbd_10_variance16x32 sse2 neon/; add_proto qw/unsigned int vpx_highbd_10_variance16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_10_variance16x16 sse2/; + specialize qw/vpx_highbd_10_variance16x16 sse2 neon/; add_proto qw/unsigned int vpx_highbd_10_variance16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_10_variance16x8 sse2/; + specialize qw/vpx_highbd_10_variance16x8 sse2 neon/; add_proto qw/unsigned int vpx_highbd_10_variance8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_10_variance8x16 sse2/; + specialize qw/vpx_highbd_10_variance8x16 sse2 neon/; add_proto qw/unsigned int vpx_highbd_10_variance8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_10_variance8x8 sse2/; + specialize qw/vpx_highbd_10_variance8x8 sse2 neon/; add_proto qw/unsigned int vpx_highbd_10_variance8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_10_variance8x4 neon/; add_proto qw/unsigned int vpx_highbd_10_variance4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_10_variance4x8 neon/; add_proto qw/unsigned int vpx_highbd_10_variance4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_10_variance4x4 neon/; add_proto qw/unsigned int vpx_highbd_8_variance64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_8_variance64x64 sse2/; + specialize qw/vpx_highbd_8_variance64x64 sse2 neon/; add_proto qw/unsigned int vpx_highbd_8_variance64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_8_variance64x32 sse2/; + specialize qw/vpx_highbd_8_variance64x32 sse2 neon/; add_proto qw/unsigned int vpx_highbd_8_variance32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_8_variance32x64 sse2/; + specialize qw/vpx_highbd_8_variance32x64 sse2 neon/; add_proto qw/unsigned int vpx_highbd_8_variance32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_8_variance32x32 sse2/; + specialize qw/vpx_highbd_8_variance32x32 sse2 neon/; add_proto qw/unsigned int vpx_highbd_8_variance32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_8_variance32x16 sse2/; + specialize qw/vpx_highbd_8_variance32x16 sse2 neon/; add_proto qw/unsigned int vpx_highbd_8_variance16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_8_variance16x32 sse2/; + specialize qw/vpx_highbd_8_variance16x32 sse2 neon/; add_proto qw/unsigned int vpx_highbd_8_variance16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_8_variance16x16 sse2/; + specialize qw/vpx_highbd_8_variance16x16 sse2 neon/; add_proto qw/unsigned int vpx_highbd_8_variance16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_8_variance16x8 sse2/; + specialize qw/vpx_highbd_8_variance16x8 sse2 neon/; add_proto qw/unsigned int vpx_highbd_8_variance8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_8_variance8x16 sse2/; + specialize qw/vpx_highbd_8_variance8x16 sse2 neon/; add_proto qw/unsigned int vpx_highbd_8_variance8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_8_variance8x8 sse2/; + specialize qw/vpx_highbd_8_variance8x8 sse2 neon/; add_proto qw/unsigned int vpx_highbd_8_variance8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_8_variance8x4 neon/; add_proto qw/unsigned int vpx_highbd_8_variance4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_8_variance4x8 neon/; add_proto qw/unsigned int vpx_highbd_8_variance4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_8_variance4x4 neon/; add_proto qw/void vpx_highbd_8_get16x16var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - specialize qw/vpx_highbd_8_get16x16var sse2/; + specialize qw/vpx_highbd_8_get16x16var sse2 neon/; add_proto qw/void vpx_highbd_8_get8x8var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - specialize qw/vpx_highbd_8_get8x8var sse2/; + specialize qw/vpx_highbd_8_get8x8var sse2 neon/; add_proto qw/void vpx_highbd_10_get16x16var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - specialize qw/vpx_highbd_10_get16x16var sse2/; + specialize qw/vpx_highbd_10_get16x16var sse2 neon/; add_proto qw/void vpx_highbd_10_get8x8var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - specialize qw/vpx_highbd_10_get8x8var sse2/; + specialize qw/vpx_highbd_10_get8x8var sse2 neon/; add_proto qw/void vpx_highbd_12_get16x16var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - specialize qw/vpx_highbd_12_get16x16var sse2/; + specialize qw/vpx_highbd_12_get16x16var sse2 neon/; add_proto qw/void vpx_highbd_12_get8x8var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - specialize qw/vpx_highbd_12_get8x8var sse2/; + specialize qw/vpx_highbd_12_get8x8var sse2 neon/; add_proto qw/unsigned int vpx_highbd_8_mse16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_8_mse16x16 sse2/; + specialize qw/vpx_highbd_8_mse16x16 sse2 neon/; add_proto qw/unsigned int vpx_highbd_8_mse16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_8_mse16x8 neon/; add_proto qw/unsigned int vpx_highbd_8_mse8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_8_mse8x16 neon/; add_proto qw/unsigned int vpx_highbd_8_mse8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_8_mse8x8 sse2/; + specialize qw/vpx_highbd_8_mse8x8 sse2 neon/; add_proto qw/unsigned int vpx_highbd_10_mse16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_10_mse16x16 sse2/; + specialize qw/vpx_highbd_10_mse16x16 sse2 neon/; add_proto qw/unsigned int vpx_highbd_10_mse16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_10_mse16x8 neon/; add_proto qw/unsigned int vpx_highbd_10_mse8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_10_mse8x16 neon/; add_proto qw/unsigned int vpx_highbd_10_mse8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_10_mse8x8 sse2/; + specialize qw/vpx_highbd_10_mse8x8 sse2 neon/; add_proto qw/unsigned int vpx_highbd_12_mse16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_12_mse16x16 sse2/; + specialize qw/vpx_highbd_12_mse16x16 sse2 neon/; add_proto qw/unsigned int vpx_highbd_12_mse16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_12_mse16x8 neon/; add_proto qw/unsigned int vpx_highbd_12_mse8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_12_mse8x16 neon/; add_proto qw/unsigned int vpx_highbd_12_mse8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_12_mse8x8 sse2/; + specialize qw/vpx_highbd_12_mse8x8 sse2 neon/; add_proto qw/void vpx_highbd_comp_avg_pred/, "uint16_t *comp_pred, const uint16_t *pred, int width, int height, const uint16_t *ref, int ref_stride"; + specialize qw/vpx_highbd_comp_avg_pred neon/; # # Subpixel Variance # add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_12_sub_pixel_variance64x64 sse2/; + specialize qw/vpx_highbd_12_sub_pixel_variance64x64 sse2 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_12_sub_pixel_variance64x32 sse2/; + specialize qw/vpx_highbd_12_sub_pixel_variance64x32 sse2 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_12_sub_pixel_variance32x64 sse2/; + specialize qw/vpx_highbd_12_sub_pixel_variance32x64 sse2 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_12_sub_pixel_variance32x32 sse2/; + specialize qw/vpx_highbd_12_sub_pixel_variance32x32 sse2 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_12_sub_pixel_variance32x16 sse2/; + specialize qw/vpx_highbd_12_sub_pixel_variance32x16 sse2 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_12_sub_pixel_variance16x32 sse2/; + specialize qw/vpx_highbd_12_sub_pixel_variance16x32 sse2 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_12_sub_pixel_variance16x16 sse2/; + specialize qw/vpx_highbd_12_sub_pixel_variance16x16 sse2 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_12_sub_pixel_variance16x8 sse2/; + specialize qw/vpx_highbd_12_sub_pixel_variance16x8 sse2 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_12_sub_pixel_variance8x16 sse2/; + specialize qw/vpx_highbd_12_sub_pixel_variance8x16 sse2 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_12_sub_pixel_variance8x8 sse2/; + specialize qw/vpx_highbd_12_sub_pixel_variance8x8 sse2 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_12_sub_pixel_variance8x4 sse2/; + specialize qw/vpx_highbd_12_sub_pixel_variance8x4 sse2 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/vpx_highbd_12_sub_pixel_variance4x8 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/vpx_highbd_12_sub_pixel_variance4x4 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_10_sub_pixel_variance64x64 sse2/; + specialize qw/vpx_highbd_10_sub_pixel_variance64x64 sse2 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_10_sub_pixel_variance64x32 sse2/; + specialize qw/vpx_highbd_10_sub_pixel_variance64x32 sse2 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_10_sub_pixel_variance32x64 sse2/; + specialize qw/vpx_highbd_10_sub_pixel_variance32x64 sse2 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_10_sub_pixel_variance32x32 sse2/; + specialize qw/vpx_highbd_10_sub_pixel_variance32x32 sse2 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_10_sub_pixel_variance32x16 sse2/; + specialize qw/vpx_highbd_10_sub_pixel_variance32x16 sse2 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_10_sub_pixel_variance16x32 sse2/; + specialize qw/vpx_highbd_10_sub_pixel_variance16x32 sse2 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_10_sub_pixel_variance16x16 sse2/; + specialize qw/vpx_highbd_10_sub_pixel_variance16x16 sse2 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_10_sub_pixel_variance16x8 sse2/; + specialize qw/vpx_highbd_10_sub_pixel_variance16x8 sse2 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_10_sub_pixel_variance8x16 sse2/; + specialize qw/vpx_highbd_10_sub_pixel_variance8x16 sse2 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_10_sub_pixel_variance8x8 sse2/; + specialize qw/vpx_highbd_10_sub_pixel_variance8x8 sse2 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_10_sub_pixel_variance8x4 sse2/; + specialize qw/vpx_highbd_10_sub_pixel_variance8x4 sse2 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/vpx_highbd_10_sub_pixel_variance4x8 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/vpx_highbd_10_sub_pixel_variance4x4 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_8_sub_pixel_variance64x64 sse2/; + specialize qw/vpx_highbd_8_sub_pixel_variance64x64 sse2 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_8_sub_pixel_variance64x32 sse2/; + specialize qw/vpx_highbd_8_sub_pixel_variance64x32 sse2 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_8_sub_pixel_variance32x64 sse2/; + specialize qw/vpx_highbd_8_sub_pixel_variance32x64 sse2 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_8_sub_pixel_variance32x32 sse2/; + specialize qw/vpx_highbd_8_sub_pixel_variance32x32 sse2 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_8_sub_pixel_variance32x16 sse2/; + specialize qw/vpx_highbd_8_sub_pixel_variance32x16 sse2 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_8_sub_pixel_variance16x32 sse2/; + specialize qw/vpx_highbd_8_sub_pixel_variance16x32 sse2 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_8_sub_pixel_variance16x16 sse2/; + specialize qw/vpx_highbd_8_sub_pixel_variance16x16 sse2 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_8_sub_pixel_variance16x8 sse2/; + specialize qw/vpx_highbd_8_sub_pixel_variance16x8 sse2 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_8_sub_pixel_variance8x16 sse2/; + specialize qw/vpx_highbd_8_sub_pixel_variance8x16 sse2 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_8_sub_pixel_variance8x8 sse2/; + specialize qw/vpx_highbd_8_sub_pixel_variance8x8 sse2 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_8_sub_pixel_variance8x4 sse2/; + specialize qw/vpx_highbd_8_sub_pixel_variance8x4 sse2 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/vpx_highbd_8_sub_pixel_variance4x8 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/vpx_highbd_8_sub_pixel_variance4x4 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_12_sub_pixel_avg_variance64x64 sse2/; + specialize qw/vpx_highbd_12_sub_pixel_avg_variance64x64 sse2 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_12_sub_pixel_avg_variance64x32 sse2/; + specialize qw/vpx_highbd_12_sub_pixel_avg_variance64x32 sse2 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x64 sse2/; + specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x64 sse2 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x32 sse2/; + specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x32 sse2 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x16 sse2/; + specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x16 sse2 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x32 sse2/; + specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x32 sse2 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x16 sse2/; + specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x16 sse2 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x8 sse2/; + specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x8 sse2 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x16 sse2/; + specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x16 sse2 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x8 sse2/; + specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x8 sse2 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x4 sse2/; + specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x4 sse2 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/vpx_highbd_12_sub_pixel_avg_variance4x8 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/vpx_highbd_12_sub_pixel_avg_variance4x4 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_10_sub_pixel_avg_variance64x64 sse2/; + specialize qw/vpx_highbd_10_sub_pixel_avg_variance64x64 sse2 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_10_sub_pixel_avg_variance64x32 sse2/; + specialize qw/vpx_highbd_10_sub_pixel_avg_variance64x32 sse2 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x64 sse2/; + specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x64 sse2 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x32 sse2/; + specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x32 sse2 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x16 sse2/; + specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x16 sse2 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x32 sse2/; + specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x32 sse2 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x16 sse2/; + specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x16 sse2 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x8 sse2/; + specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x8 sse2 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x16 sse2/; + specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x16 sse2 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x8 sse2/; + specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x8 sse2 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x4 sse2/; + specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x4 sse2 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/vpx_highbd_10_sub_pixel_avg_variance4x8 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/vpx_highbd_10_sub_pixel_avg_variance4x4 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_8_sub_pixel_avg_variance64x64 sse2/; + specialize qw/vpx_highbd_8_sub_pixel_avg_variance64x64 sse2 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_8_sub_pixel_avg_variance64x32 sse2/; + specialize qw/vpx_highbd_8_sub_pixel_avg_variance64x32 sse2 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x64 sse2/; + specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x64 sse2 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x32 sse2/; + specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x32 sse2 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x16 sse2/; + specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x16 sse2 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x32 sse2/; + specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x32 sse2 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x16 sse2/; + specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x16 sse2 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x8 sse2/; + specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x8 sse2 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x16 sse2/; + specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x16 sse2 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x8 sse2/; + specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x8 sse2 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x4 sse2/; + specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x4 sse2 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/vpx_highbd_8_sub_pixel_avg_variance4x8 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/vpx_highbd_8_sub_pixel_avg_variance4x4 neon/; } # CONFIG_VP9_HIGHBITDEPTH From fd45d113807eb00fd5b9e58784e48e662a6797b9 Mon Sep 17 00:00:00 2001 From: Cheng Chen Date: Fri, 26 Aug 2022 14:29:32 -0700 Subject: [PATCH 0491/1000] L2E: Add gop size and ARF existence to frame info Pass the encode frame info to external ml model, with the information of gop size and whether alt ref is used. Change-Id: I55be2d3de83d7182c1a1a174e44ead7e19045c9d --- vp9/encoder/vp9_encoder.c | 8 +++++++- vp9/encoder/vp9_ext_ratectrl.c | 4 +++- vp9/encoder/vp9_ext_ratectrl.h | 2 +- vpx/vpx_ext_ratectrl.h | 8 ++++++++ 4 files changed, 19 insertions(+), 3 deletions(-) diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 371779e772..91b64e5d13 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -4501,11 +4501,17 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES]; const RefCntBuffer *curr_frame_buf = get_ref_cnt_buffer(cm, cm->new_fb_idx); + // index 0 of a gf group is always KEY/OVERLAY/GOLDEN. + // index 1 refers to the first encoding frame in a gf group. + // Therefore if it is ARF_UPDATE, it means this gf group uses alt ref. + // See function define_gf_group_structure(). + const int use_alt_ref = gf_group->update_type[1] == ARF_UPDATE; get_ref_frame_bufs(cpi, ref_frame_bufs); codec_status = vp9_extrc_get_encodeframe_decision( &cpi->ext_ratectrl, curr_frame_buf->frame_index, cm->current_frame_coding_index, gf_group->index, update_type, - ref_frame_bufs, ref_frame_flags, &encode_frame_decision); + gf_group->gf_group_size, use_alt_ref, ref_frame_bufs, ref_frame_flags, + &encode_frame_decision); if (codec_status != VPX_CODEC_OK) { vpx_internal_error(&cm->error, codec_status, "vp9_extrc_get_encodeframe_decision() failed"); diff --git a/vp9/encoder/vp9_ext_ratectrl.c b/vp9/encoder/vp9_ext_ratectrl.c index d5b60b02a6..7e38cc5247 100644 --- a/vp9/encoder/vp9_ext_ratectrl.c +++ b/vp9/encoder/vp9_ext_ratectrl.c @@ -137,7 +137,7 @@ static int extrc_get_frame_type(FRAME_UPDATE_TYPE update_type) { vpx_codec_err_t vp9_extrc_get_encodeframe_decision( EXT_RATECTRL *ext_ratectrl, int show_index, int coding_index, int gop_index, - FRAME_UPDATE_TYPE update_type, + FRAME_UPDATE_TYPE update_type, const int gop_size, const int use_alt_ref, RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES], int ref_frame_flags, vpx_rc_encodeframe_decision_t *encode_frame_decision) { if (ext_ratectrl == NULL) { @@ -150,6 +150,8 @@ vpx_codec_err_t vp9_extrc_get_encodeframe_decision( encode_frame_info.coding_index = coding_index; encode_frame_info.gop_index = gop_index; encode_frame_info.frame_type = extrc_get_frame_type(update_type); + encode_frame_info.gop_size = gop_size; + encode_frame_info.use_alt_ref = use_alt_ref; vp9_get_ref_frame_info(update_type, ref_frame_flags, ref_frame_bufs, encode_frame_info.ref_frame_coding_indexes, diff --git a/vp9/encoder/vp9_ext_ratectrl.h b/vp9/encoder/vp9_ext_ratectrl.h index b46b776b91..b8f3d0c834 100644 --- a/vp9/encoder/vp9_ext_ratectrl.h +++ b/vp9/encoder/vp9_ext_ratectrl.h @@ -35,7 +35,7 @@ vpx_codec_err_t vp9_extrc_send_firstpass_stats( vpx_codec_err_t vp9_extrc_get_encodeframe_decision( EXT_RATECTRL *ext_ratectrl, int show_index, int coding_index, int gop_index, - FRAME_UPDATE_TYPE update_type, + FRAME_UPDATE_TYPE update_type, int gop_size, int use_alt_ref, RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES], int ref_frame_flags, vpx_rc_encodeframe_decision_t *encode_frame_decision); diff --git a/vpx/vpx_ext_ratectrl.h b/vpx/vpx_ext_ratectrl.h index b6c950d87e..95b883413e 100644 --- a/vpx/vpx_ext_ratectrl.h +++ b/vpx/vpx_ext_ratectrl.h @@ -100,6 +100,14 @@ typedef struct vpx_rc_encodeframe_info { * 1: Valid */ int ref_frame_valid_list[3]; + /*! + * The length of the current GOP. + */ + int gop_size; + /*! + * Whether the current GOP uses an alt ref. + */ + int use_alt_ref; } vpx_rc_encodeframe_info_t; /*!\brief Frame coding result From 27fd546079a5566346b078754b51008ef46f5d2d Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 26 Aug 2022 22:12:44 -0700 Subject: [PATCH 0492/1000] highbd_variance_neon,cosmetics: reorder a few lines Change-Id: Ia6fa54652d7f94687e64108482bb0f28ca06cf49 --- vpx_dsp/arm/highbd_variance_neon.c | 30 ++++++++++++------------------ 1 file changed, 12 insertions(+), 18 deletions(-) diff --git a/vpx_dsp/arm/highbd_variance_neon.c b/vpx_dsp/arm/highbd_variance_neon.c index 3a60a14ab8..96a35af01c 100644 --- a/vpx_dsp/arm/highbd_variance_neon.c +++ b/vpx_dsp/arm/highbd_variance_neon.c @@ -233,14 +233,12 @@ static INLINE void highbd_var_filter_block2d_bil_first_pass( if (output_width >= 8) { for (i = 0; i < output_height; ++i) { for (j = 0; j < output_width; j += 8) { - uint32x4_t sum1_u32; - uint32x4_t sum2_u32; - uint16x4_t out1_u16; - uint16x4_t out2_u16; const uint16x8_t src1_u16 = vld1q_u16(&src_ptr[j]); const uint16x8_t src2_u16 = vld1q_u16(&src_ptr[j + pixel_step]); - sum1_u32 = vmull_u16(filter1_u16, vget_low_u16(src1_u16)); - sum2_u32 = vmull_u16(filter1_u16, vget_high_u16(src1_u16)); + uint32x4_t sum1_u32 = vmull_u16(filter1_u16, vget_low_u16(src1_u16)); + uint32x4_t sum2_u32 = vmull_u16(filter1_u16, vget_high_u16(src1_u16)); + uint16x4_t out1_u16; + uint16x4_t out2_u16; sum1_u32 = vmlal_u16(sum1_u32, filter2_u16, vget_low_u16(src2_u16)); sum2_u32 = vmlal_u16(sum2_u32, filter2_u16, vget_high_u16(src2_u16)); out1_u16 = vshrn_n_u32(vaddq_u32(sum1_u32, round_u32), FILTER_BITS); @@ -255,11 +253,10 @@ static INLINE void highbd_var_filter_block2d_bil_first_pass( assert(output_width >= 4); for (i = 0; i < output_height; ++i) { for (j = 0; j < output_width; j += 4) { - uint32x4_t sum_u32; - uint16x4_t out_u16; const uint16x4_t src1_u16 = vld1_u16(&src_ptr[j]); const uint16x4_t src2_u16 = vld1_u16(&src_ptr[j + pixel_step]); - sum_u32 = vmull_u16(filter1_u16, src1_u16); + uint32x4_t sum_u32 = vmull_u16(filter1_u16, src1_u16); + uint16x4_t out_u16; sum_u32 = vmlal_u16(sum_u32, filter2_u16, src2_u16); out_u16 = vshrn_n_u32(vaddq_u32(sum_u32, round_u32), FILTER_BITS); vst1_u16(&output_ptr[j], out_u16); @@ -285,14 +282,12 @@ static INLINE void highbd_var_filter_block2d_bil_second_pass( if (output_width >= 8) { for (i = 0; i < output_height; ++i) { for (j = 0; j < output_width; j += 8) { - uint32x4_t sum1_u32; - uint32x4_t sum2_u32; - uint16x4_t out1_u16; - uint16x4_t out2_u16; const uint16x8_t src1_u16 = vld1q_u16(&src_ptr[j]); const uint16x8_t src2_u16 = vld1q_u16(&src_ptr[j + pixel_step]); - sum1_u32 = vmull_u16(filter1_u16, vget_low_u16(src1_u16)); - sum2_u32 = vmull_u16(filter1_u16, vget_high_u16(src1_u16)); + uint32x4_t sum1_u32 = vmull_u16(filter1_u16, vget_low_u16(src1_u16)); + uint32x4_t sum2_u32 = vmull_u16(filter1_u16, vget_high_u16(src1_u16)); + uint16x4_t out1_u16; + uint16x4_t out2_u16; sum1_u32 = vmlal_u16(sum1_u32, filter2_u16, vget_low_u16(src2_u16)); sum2_u32 = vmlal_u16(sum2_u32, filter2_u16, vget_high_u16(src2_u16)); out1_u16 = vshrn_n_u32(vaddq_u32(sum1_u32, round_u32), FILTER_BITS); @@ -307,11 +302,10 @@ static INLINE void highbd_var_filter_block2d_bil_second_pass( assert(output_width >= 4); for (i = 0; i < output_height; ++i) { for (j = 0; j < output_width; j += 4) { - uint32x4_t sum_u32; - uint16x4_t out_u16; const uint16x4_t src1_u16 = vld1_u16(&src_ptr[j]); const uint16x4_t src2_u16 = vld1_u16(&src_ptr[j + pixel_step]); - sum_u32 = vmull_u16(filter1_u16, src1_u16); + uint32x4_t sum_u32 = vmull_u16(filter1_u16, src1_u16); + uint16x4_t out_u16; sum_u32 = vmlal_u16(sum_u32, filter2_u16, src2_u16); out_u16 = vshrn_n_u32(vaddq_u32(sum_u32, round_u32), FILTER_BITS); vst1_u16(&output_ptr[j], out_u16); From 9d6d0624d7943a09cc0be9df1a7402522989ac1a Mon Sep 17 00:00:00 2001 From: Yaowu Xu Date: Tue, 30 Aug 2022 09:04:58 -0700 Subject: [PATCH 0493/1000] Remove const for pass-by-value parameters This also fixes MSVC compiler warnings. Change-Id: I20dc9ac821275ba95598f3016fc6b23e884e13b7 --- vp9/encoder/vp9_ext_ratectrl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vp9/encoder/vp9_ext_ratectrl.c b/vp9/encoder/vp9_ext_ratectrl.c index 7e38cc5247..b4ee574ff1 100644 --- a/vp9/encoder/vp9_ext_ratectrl.c +++ b/vp9/encoder/vp9_ext_ratectrl.c @@ -137,7 +137,7 @@ static int extrc_get_frame_type(FRAME_UPDATE_TYPE update_type) { vpx_codec_err_t vp9_extrc_get_encodeframe_decision( EXT_RATECTRL *ext_ratectrl, int show_index, int coding_index, int gop_index, - FRAME_UPDATE_TYPE update_type, const int gop_size, const int use_alt_ref, + FRAME_UPDATE_TYPE update_type, int gop_size, int use_alt_ref, RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES], int ref_frame_flags, vpx_rc_encodeframe_decision_t *encode_frame_decision) { if (ext_ratectrl == NULL) { From 028fc1b50f196cab1ec93816654fbefe64f20cf3 Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 31 Aug 2022 16:35:08 -0700 Subject: [PATCH 0494/1000] test/*,cosmetics: normalize void parameter lists replace (void) with (); use of this synonym is more common in C++ code. Change-Id: I9813e82234dc9caa7115918a0491b0040f6afaf4 --- test/acm_random.h | 16 ++++++++-------- test/error_resilience_test.cc | 2 +- test/md5_helper.h | 2 +- test/svc_datarate_test.cc | 2 +- test/vp8_datarate_test.cc | 2 +- test/vp9_datarate_test.cc | 2 +- 6 files changed, 13 insertions(+), 13 deletions(-) diff --git a/test/acm_random.h b/test/acm_random.h index 3458340a12..c7122b9338 100644 --- a/test/acm_random.h +++ b/test/acm_random.h @@ -28,43 +28,43 @@ class ACMRandom { explicit ACMRandom(int seed) : random_(seed) {} void Reset(int seed) { random_.Reseed(seed); } - uint16_t Rand16(void) { + uint16_t Rand16() { const uint32_t value = random_.Generate(testing::internal::Random::kMaxRange); return (value >> 15) & 0xffff; } - int32_t Rand20Signed(void) { + int32_t Rand20Signed() { // Use 20 bits: values between 524287 and -524288. const uint32_t value = random_.Generate(1048576); return static_cast(value) - 524288; } - int16_t Rand16Signed(void) { + int16_t Rand16Signed() { // Use 16 bits: values between 32767 and -32768. return static_cast(random_.Generate(65536)); } - int16_t Rand13Signed(void) { + int16_t Rand13Signed() { // Use 13 bits: values between 4095 and -4096. const uint32_t value = random_.Generate(8192); return static_cast(value) - 4096; } - int16_t Rand9Signed(void) { + int16_t Rand9Signed() { // Use 9 bits: values between 255 (0x0FF) and -256 (0x100). const uint32_t value = random_.Generate(512); return static_cast(value) - 256; } - uint8_t Rand8(void) { + uint8_t Rand8() { const uint32_t value = random_.Generate(testing::internal::Random::kMaxRange); // There's a bit more entropy in the upper bits of this implementation. return (value >> 23) & 0xff; } - uint8_t Rand8Extremes(void) { + uint8_t Rand8Extremes() { // Returns a random value near 0 or near 255, to better exercise // saturation behavior. const uint8_t r = Rand8(); @@ -82,7 +82,7 @@ class ACMRandom { int operator()(int n) { return PseudoUniform(n); } - static int DeterministicSeed(void) { return 0xbaba; } + static int DeterministicSeed() { return 0xbaba; } private: testing::internal::Random random_; diff --git a/test/error_resilience_test.cc b/test/error_resilience_test.cc index 45a327ec2f..45138f14b9 100644 --- a/test/error_resilience_test.cc +++ b/test/error_resilience_test.cc @@ -496,7 +496,7 @@ class ErrorResilienceTestLargeCodecControls ++tot_frame_number_; } - virtual void EndPassHook(void) { + virtual void EndPassHook() { duration_ = (last_pts_ + 1) * timebase_; if (cfg_.ts_number_layers > 1) { for (int layer = 0; layer < static_cast(cfg_.ts_number_layers); diff --git a/test/md5_helper.h b/test/md5_helper.h index dc28dc6283..9095d96a8a 100644 --- a/test/md5_helper.h +++ b/test/md5_helper.h @@ -47,7 +47,7 @@ class MD5 { MD5Update(&md5_, data, static_cast(size)); } - const char *Get(void) { + const char *Get() { static const char hex[16] = { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f', diff --git a/test/svc_datarate_test.cc b/test/svc_datarate_test.cc index 51e90e776c..010c273421 100644 --- a/test/svc_datarate_test.cc +++ b/test/svc_datarate_test.cc @@ -571,7 +571,7 @@ class DatarateOnePassCbrSvc : public OnePassCbrSvc { } } - virtual void EndPassHook(void) { + virtual void EndPassHook() { if (change_bitrate_) last_pts_ = last_pts_ - last_pts_ref_; duration_ = (last_pts_ + 1) * timebase_; for (int sl = 0; sl < number_spatial_layers_; ++sl) { diff --git a/test/vp8_datarate_test.cc b/test/vp8_datarate_test.cc index dcd68a2d4c..64a861d15e 100644 --- a/test/vp8_datarate_test.cc +++ b/test/vp8_datarate_test.cc @@ -121,7 +121,7 @@ class DatarateTestLarge ++frame_number_; } - virtual void EndPassHook(void) { + virtual void EndPassHook() { if (bits_total_) { const double file_size_in_kb = bits_total_ / 1000.; // bits per kilobit diff --git a/test/vp9_datarate_test.cc b/test/vp9_datarate_test.cc index 9930c754c7..286fa335a1 100644 --- a/test/vp9_datarate_test.cc +++ b/test/vp9_datarate_test.cc @@ -199,7 +199,7 @@ class DatarateTestVP9 : public ::libvpx_test::EncoderTest { ++tot_frame_number_; } - virtual void EndPassHook(void) { + virtual void EndPassHook() { for (int layer = 0; layer < static_cast(cfg_.ts_number_layers); ++layer) { duration_ = (last_pts_ + 1) * timebase_; From 281dfae8353940fe380c73384607ec11a5c53f43 Mon Sep 17 00:00:00 2001 From: James Zern Date: Thu, 1 Sep 2022 18:47:50 -0700 Subject: [PATCH 0495/1000] neon,load_unaligned_*: use dup for lane 0 this produces better assembly with gcc (11.3.0-3); no change in assembly using clang from the r24 android sdk (Android (8075178, based on r437112b) clang version 14.0.1 (https://android.googlesource.com/toolchain/llvm-project 8671348b81b95fc603505dfc881b45103bee1731) Change-Id: Ifec252d4f499f23be1cd94aa8516caf6b3fbbc11 --- vpx_dsp/arm/mem_neon.h | 8 ++++---- vpx_dsp/arm/sad4d_neon.c | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/vpx_dsp/arm/mem_neon.h b/vpx_dsp/arm/mem_neon.h index 50aaa94fe0..84aae161b3 100644 --- a/vpx_dsp/arm/mem_neon.h +++ b/vpx_dsp/arm/mem_neon.h @@ -116,11 +116,11 @@ static INLINE void uint32_to_mem(uint8_t *buf, uint32_t a) { static INLINE uint8x8_t load_unaligned_u8(const uint8_t *buf, ptrdiff_t stride) { uint32_t a; - uint32x2_t a_u32 = vdup_n_u32(0); + uint32x2_t a_u32; if (stride == 4) return vld1_u8(buf); memcpy(&a, buf, 4); buf += stride; - a_u32 = vset_lane_u32(a, a_u32, 0); + a_u32 = vdup_n_u32(a); memcpy(&a, buf, 4); a_u32 = vset_lane_u32(a, a_u32, 1); return vreinterpret_u8_u32(a_u32); @@ -143,11 +143,11 @@ static INLINE void store_unaligned_u8(uint8_t *buf, ptrdiff_t stride, static INLINE uint8x16_t load_unaligned_u8q(const uint8_t *buf, ptrdiff_t stride) { uint32_t a; - uint32x4_t a_u32 = vdupq_n_u32(0); + uint32x4_t a_u32; if (stride == 4) return vld1q_u8(buf); memcpy(&a, buf, 4); buf += stride; - a_u32 = vsetq_lane_u32(a, a_u32, 0); + a_u32 = vdupq_n_u32(a); memcpy(&a, buf, 4); buf += stride; a_u32 = vsetq_lane_u32(a, a_u32, 1); diff --git a/vpx_dsp/arm/sad4d_neon.c b/vpx_dsp/arm/sad4d_neon.c index 03f716c3d5..53866296ce 100644 --- a/vpx_dsp/arm/sad4d_neon.c +++ b/vpx_dsp/arm/sad4d_neon.c @@ -20,9 +20,9 @@ static INLINE uint8x8_t load_unaligned_2_buffers(const void *const buf0, const void *const buf1) { uint32_t a; - uint32x2_t aa = vdup_n_u32(0); + uint32x2_t aa; memcpy(&a, buf0, 4); - aa = vset_lane_u32(a, aa, 0); + aa = vdup_n_u32(a); memcpy(&a, buf1, 4); aa = vset_lane_u32(a, aa, 1); return vreinterpret_u8_u32(aa); From 447e27588032064c02eeb864d086881806038c35 Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 2 Sep 2022 12:17:20 -0700 Subject: [PATCH 0496/1000] vpx_dsp,neon: simplify __ARM_FEATURE_DOTPROD check only check that the macro is defined, the value doesn't have any effect. from https://arm-software.github.io/acle/main/acle.html: 5.5.7.7. Dot Product extension __ARM_FEATURE_DOTPROD is defined if the dot product data manipulation instructions are supported and the vector intrinsics are available. Note that this implies: - __ARM_NEON == 1 Change-Id: I164fe121ccefda99050a9b6a99738a2b518520f3 --- vpx_dsp/arm/sad4d_neon.c | 21 +++++++---------- vpx_dsp/arm/sad_neon.c | 40 +++++++++++++------------------- vpx_dsp/arm/variance_neon.c | 8 +++---- vpx_dsp/arm/vpx_convolve8_neon.c | 7 +++--- vpx_dsp/arm/vpx_convolve8_neon.h | 5 ++-- 5 files changed, 34 insertions(+), 47 deletions(-) diff --git a/vpx_dsp/arm/sad4d_neon.c b/vpx_dsp/arm/sad4d_neon.c index 53866296ce..5fc621aee1 100644 --- a/vpx_dsp/arm/sad4d_neon.c +++ b/vpx_dsp/arm/sad4d_neon.c @@ -237,8 +237,7 @@ void vpx_sad8x16x4d_neon(const uint8_t *src_ptr, int src_stride, //////////////////////////////////////////////////////////////////////////////// -#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \ - (__ARM_FEATURE_DOTPROD == 1) +#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) static INLINE void sad16_neon(const uint8_t *ref_ptr, const uint8x16_t src_ptr, uint32x4_t *const sum) { @@ -270,7 +269,7 @@ static INLINE void sad16x_4d(const uint8_t *src_ptr, int src_stride, vst1q_u32(sad_array, vpaddq_u32(r0, r1)); } -#else +#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)) static INLINE void sad16_neon(const uint8_t *ref_ptr, const uint8x16_t src_ptr, uint16x8_t *const sum) { @@ -305,7 +304,7 @@ static INLINE void sad16x_4d(const uint8_t *src_ptr, int src_stride, sad_512_pel_final_neon(sum, sad_array); } -#endif +#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) void vpx_sad16x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, @@ -327,8 +326,7 @@ void vpx_sad16x32x4d_neon(const uint8_t *src_ptr, int src_stride, //////////////////////////////////////////////////////////////////////////////// -#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \ - (__ARM_FEATURE_DOTPROD == 1) +#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) static INLINE void sad32x_4d(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, @@ -386,7 +384,7 @@ void vpx_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride, sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 64); } -#else +#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)) static INLINE void sad32x_4d(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, @@ -444,12 +442,11 @@ void vpx_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride, sad_2048_pel_final_neon(sum, sad_array); } -#endif +#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) //////////////////////////////////////////////////////////////////////////////// -#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \ - (__ARM_FEATURE_DOTPROD == 1) +#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, @@ -554,7 +551,7 @@ void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride, vst1q_u32(sad_array, vpaddq_u32(r0, r1)); } -#else +#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)) void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, @@ -649,4 +646,4 @@ void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride, sad_4096_pel_final_neon(sum, sad_array); } -#endif +#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) diff --git a/vpx_dsp/arm/sad_neon.c b/vpx_dsp/arm/sad_neon.c index 34870375a3..4753aeaec6 100644 --- a/vpx_dsp/arm/sad_neon.c +++ b/vpx_dsp/arm/sad_neon.c @@ -21,8 +21,7 @@ uint32_t vpx_sad4x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride) { const uint8x16_t src_u8 = load_unaligned_u8q(src_ptr, src_stride); const uint8x16_t ref_u8 = load_unaligned_u8q(ref_ptr, ref_stride); -#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \ - (__ARM_FEATURE_DOTPROD == 1) +#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) const uint8x16_t sad_u8 = vabdq_u8(src_u8, ref_u8); const uint32x4_t dp = vdotq_u32(vdupq_n_u32(0), sad_u8, vdupq_n_u8(1)); return horizontal_add_uint32x4(dp); @@ -40,8 +39,7 @@ uint32_t vpx_sad4x4_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8x16_t ref_u8 = load_unaligned_u8q(ref_ptr, ref_stride); const uint8x16_t second_pred_u8 = vld1q_u8(second_pred); const uint8x16_t avg = vrhaddq_u8(ref_u8, second_pred_u8); -#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \ - (__ARM_FEATURE_DOTPROD == 1) +#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) const uint8x16_t sad_u8 = vabdq_u8(src_u8, avg); const uint32x4_t prod = vdotq_u32(vdupq_n_u32(0), sad_u8, vdupq_n_u8(1)); return horizontal_add_uint32x4(prod); @@ -54,8 +52,7 @@ uint32_t vpx_sad4x4_avg_neon(const uint8_t *src_ptr, int src_stride, uint32_t vpx_sad4x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride) { -#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \ - (__ARM_FEATURE_DOTPROD == 1) +#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) uint32x4_t prod = vdupq_n_u32(0); const uint8x16_t ones = vdupq_n_u8(1); const uint8x16_t src1_u8 = load_unaligned_u8q(src_ptr, src_stride); @@ -88,8 +85,7 @@ uint32_t vpx_sad4x8_neon(const uint8_t *src_ptr, int src_stride, uint32_t vpx_sad4x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred) { -#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \ - (__ARM_FEATURE_DOTPROD == 1) +#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) uint32x4_t prod = vdupq_n_u32(0); const uint8x16_t ones = vdupq_n_u8(1); const uint8x16_t src1_u8 = load_unaligned_u8q(src_ptr, src_stride); @@ -126,8 +122,7 @@ uint32_t vpx_sad4x8_avg_neon(const uint8_t *src_ptr, int src_stride, #endif } -#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \ - (__ARM_FEATURE_DOTPROD == 1) +#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) static INLINE uint32x2_t sad8x(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const int height) { @@ -182,7 +177,7 @@ static INLINE uint32x2_t sad8x_avg(const uint8_t *src_ptr, int src_stride, return horizontal_add_uint32x2(prod); \ } -#else +#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)) static INLINE uint16x8_t sad8x(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const int height) { @@ -233,14 +228,13 @@ static INLINE uint16x8_t sad8x_avg(const uint8_t *src_ptr, int src_stride, sad8x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \ return horizontal_add_uint16x8(abs); \ } -#endif +#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) SAD8XN(4) SAD8XN(8) SAD8XN(16) -#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \ - (__ARM_FEATURE_DOTPROD == 1) +#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) static INLINE uint32x4_t sad16x(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const int height) { @@ -294,7 +288,7 @@ static INLINE uint32x4_t sad16x_avg(const uint8_t *src_ptr, int src_stride, sad16x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \ return horizontal_add_uint32x4(prod); \ } -#else +#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)) static INLINE uint16x8_t sad16x(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const int height) { @@ -348,14 +342,13 @@ static INLINE uint16x8_t sad16x_avg(const uint8_t *src_ptr, int src_stride, sad16x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \ return horizontal_add_uint16x8(abs); \ } -#endif +#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) SAD16XN(8) SAD16XN(16) SAD16XN(32) -#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \ - (__ARM_FEATURE_DOTPROD == 1) +#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) static INLINE uint32x4_t sad32x(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const int height) { @@ -420,7 +413,7 @@ static INLINE uint32x4_t sad32x_avg(const uint8_t *src_ptr, int src_stride, return horizontal_add_uint32x4(prod); \ } -#else +#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)) static INLINE uint16x8_t sad32x(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const int height) { @@ -484,14 +477,13 @@ static INLINE uint16x8_t sad32x_avg(const uint8_t *src_ptr, int src_stride, sad32x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \ return horizontal_add_uint16x8(abs); \ } -#endif +#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) SAD32XN(16) SAD32XN(32) SAD32XN(64) -#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \ - (__ARM_FEATURE_DOTPROD == 1) +#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) static INLINE uint32x4_t sad64x(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const int height) { @@ -559,7 +551,7 @@ static INLINE uint32x4_t sad64x_avg(const uint8_t *src_ptr, int src_stride, } return prod; } -#else +#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)) static INLINE uint32x4_t sad64x(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const int height) { @@ -637,7 +629,7 @@ static INLINE uint32x4_t sad64x_avg(const uint8_t *src_ptr, int src_stride, return vpadalq_u16(sum, abs_1); } } -#endif +#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) #define SAD64XN(n) \ uint32_t vpx_sad64x##n##_neon(const uint8_t *src_ptr, int src_stride, \ diff --git a/vpx_dsp/arm/variance_neon.c b/vpx_dsp/arm/variance_neon.c index 7b93f142b1..1b5cbcc46e 100644 --- a/vpx_dsp/arm/variance_neon.c +++ b/vpx_dsp/arm/variance_neon.c @@ -111,7 +111,7 @@ static void variance_neon_w8x2(const uint8_t *src_ptr, int src_stride, *sse = horizontal_add_uint32x2(vadd_u32(sse_lo_u32, sse_hi_u32)); } -#else +#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)) // The variance helper functions use int16_t for sum. 8 values are accumulated // and then added (at which point they expand up to int32_t). To avoid overflow, @@ -254,7 +254,7 @@ static void variance_neon_w8x2(const uint8_t *src_ptr, int src_stride, vreinterpretq_u32_s32(vaddq_s32(sse_lo_s32, sse_hi_s32))); } -#endif +#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) void vpx_get8x8var_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, @@ -421,7 +421,7 @@ unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int src_stride, return vget_lane_u32(sse, 0); } -#else +#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)) unsigned int vpx_mse16x16_neon(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, @@ -518,4 +518,4 @@ unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int src_stride, return horizontal_add_uint32x4(vreinterpretq_u32_s32(sse)); } -#endif +#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) diff --git a/vpx_dsp/arm/vpx_convolve8_neon.c b/vpx_dsp/arm/vpx_convolve8_neon.c index 06b58c438f..ca5222fa07 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon.c +++ b/vpx_dsp/arm/vpx_convolve8_neon.c @@ -31,8 +31,7 @@ // instructions. This optimization is much faster in speed unit test, but slowed // down the whole decoder by 5%. -#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \ - (__ARM_FEATURE_DOTPROD == 1) +#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, @@ -764,7 +763,7 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, } } -#else +#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)) static INLINE void store_u8_8x8(uint8_t *s, const ptrdiff_t p, const uint8x8_t s0, const uint8x8_t s1, @@ -1694,4 +1693,4 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, } } -#endif +#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) diff --git a/vpx_dsp/arm/vpx_convolve8_neon.h b/vpx_dsp/arm/vpx_convolve8_neon.h index 857b6d54e2..b112cb249a 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon.h +++ b/vpx_dsp/arm/vpx_convolve8_neon.h @@ -72,8 +72,7 @@ static INLINE void load_u8_16x8(const uint8_t *s, const ptrdiff_t p, *s7 = vld1q_u8(s); } -#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \ - (__ARM_FEATURE_DOTPROD == 1) +#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) static INLINE int32x4_t convolve8_4_dot_partial(const int8x16_t samples_lo, const int8x16_t samples_hi, @@ -171,7 +170,7 @@ static INLINE uint8x8_t convolve8_8_dot(uint8x16_t samples, return vqrshrun_n_s16(sum, 7); } -#endif +#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) static INLINE int16x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, const int16x4_t s3, From 2faa4bfc5c28294e5080f328a5c5f10f6008d552 Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 2 Sep 2022 16:17:52 -0700 Subject: [PATCH 0497/1000] x86,cosmetics: prefer _mm_setzero_si128/_mm256_setzero_si256 over *_set1_*(0) Change-Id: I136e1798a2ce286480ebb9418db67a2f1e92b9a2 --- vp9/encoder/x86/vp9_dct_intrin_sse2.c | 6 +++--- vpx_dsp/x86/fwd_dct32x32_impl_avx2.h | 2 +- vpx_dsp/x86/fwd_dct32x32_impl_sse2.h | 2 +- vpx_dsp/x86/highbd_inv_txfm_sse2.h | 2 +- vpx_dsp/x86/highbd_loopfilter_sse2.c | 8 ++++---- vpx_dsp/x86/inv_txfm_sse2.c | 4 ++-- vpx_dsp/x86/loopfilter_avx2.c | 4 ++-- vpx_dsp/x86/loopfilter_sse2.c | 14 +++++++------- 8 files changed, 21 insertions(+), 21 deletions(-) diff --git a/vp9/encoder/x86/vp9_dct_intrin_sse2.c b/vp9/encoder/x86/vp9_dct_intrin_sse2.c index 2188903b17..e9943447fd 100644 --- a/vp9/encoder/x86/vp9_dct_intrin_sse2.c +++ b/vp9/encoder/x86/vp9_dct_intrin_sse2.c @@ -111,7 +111,7 @@ static void fadst4_sse2(__m128i *in) { const __m128i k__sinpi_p03_p04 = pair_set_epi16(sinpi_3_9, sinpi_4_9); const __m128i k__sinpi_m03_p02 = pair_set_epi16(-sinpi_3_9, sinpi_2_9); const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9); - const __m128i kZero = _mm_set1_epi16(0); + const __m128i kZero = _mm_setzero_si128(); const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); __m128i u[8], v[8]; __m128i in7 = _mm_add_epi16(in[0], in[1]); @@ -424,7 +424,7 @@ static void fadst8_sse2(__m128i *in) { const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); - const __m128i k__const_0 = _mm_set1_epi16(0); + const __m128i k__const_0 = _mm_setzero_si128(); const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15; @@ -1056,7 +1056,7 @@ static void fadst16_8col(__m128i *in) { const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i kZero = _mm_set1_epi16(0); + const __m128i kZero = _mm_setzero_si128(); u[0] = _mm_unpacklo_epi16(in[15], in[0]); u[1] = _mm_unpackhi_epi16(in[15], in[0]); diff --git a/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h b/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h index 3f158b5e4e..f3a8020292 100644 --- a/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h +++ b/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h @@ -89,7 +89,7 @@ void FDCT32x32_2D_AVX2(const int16_t *input, int16_t *output_org, int stride) { const __m256i k__cospi_m21_p11 = pair256_set_epi16(-cospi_21_64, cospi_11_64); const __m256i k__cospi_m05_p27 = pair256_set_epi16(-cospi_5_64, cospi_27_64); const __m256i k__DCT_CONST_ROUNDING = _mm256_set1_epi32(DCT_CONST_ROUNDING); - const __m256i kZero = _mm256_set1_epi16(0); + const __m256i kZero = _mm256_setzero_si256(); const __m256i kOne = _mm256_set1_epi16(1); // Do the two transform/transpose passes int pass; diff --git a/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h b/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h index ac1246faa5..bf350b6da0 100644 --- a/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h +++ b/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h @@ -100,7 +100,7 @@ void FDCT32x32_2D(const int16_t *input, tran_low_t *output_org, int stride) { const __m128i k__cospi_m21_p11 = pair_set_epi16(-cospi_21_64, cospi_11_64); const __m128i k__cospi_m05_p27 = pair_set_epi16(-cospi_5_64, cospi_27_64); const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i kZero = _mm_set1_epi16(0); + const __m128i kZero = _mm_setzero_si128(); const __m128i kOne = _mm_set1_epi16(1); // Do the two transform/transpose passes diff --git a/vpx_dsp/x86/highbd_inv_txfm_sse2.h b/vpx_dsp/x86/highbd_inv_txfm_sse2.h index 78cf9111d9..1d07391b02 100644 --- a/vpx_dsp/x86/highbd_inv_txfm_sse2.h +++ b/vpx_dsp/x86/highbd_inv_txfm_sse2.h @@ -249,7 +249,7 @@ static INLINE void highbd_idct16_4col_stage7(const __m128i *const in, static INLINE __m128i add_clamp(const __m128i in0, const __m128i in1, const int bd) { - const __m128i zero = _mm_set1_epi16(0); + const __m128i zero = _mm_setzero_si128(); // Faster than _mm_set1_epi16((1 << bd) - 1). const __m128i one = _mm_set1_epi16(1); const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one); diff --git a/vpx_dsp/x86/highbd_loopfilter_sse2.c b/vpx_dsp/x86/highbd_loopfilter_sse2.c index d265fc1a92..9f45623dee 100644 --- a/vpx_dsp/x86/highbd_loopfilter_sse2.c +++ b/vpx_dsp/x86/highbd_loopfilter_sse2.c @@ -18,7 +18,7 @@ static INLINE __m128i signed_char_clamp_bd_sse2(__m128i value, int bd) { __m128i lbounded; __m128i retval; - const __m128i zero = _mm_set1_epi16(0); + const __m128i zero = _mm_setzero_si128(); const __m128i one = _mm_set1_epi16(1); __m128i t80, max, min; @@ -51,7 +51,7 @@ void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { - const __m128i zero = _mm_set1_epi16(0); + const __m128i zero = _mm_setzero_si128(); const __m128i one = _mm_set1_epi16(1); __m128i blimit_v, limit_v, thresh_v; __m128i q7, p7, q6, p6, q5, p5, q4, p4, q3, p3, q2, p2, q1, p1, q0, p0; @@ -492,7 +492,7 @@ void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int pitch, DECLARE_ALIGNED(16, uint16_t, flat_oq2[16]); DECLARE_ALIGNED(16, uint16_t, flat_oq1[16]); DECLARE_ALIGNED(16, uint16_t, flat_oq0[16]); - const __m128i zero = _mm_set1_epi16(0); + const __m128i zero = _mm_setzero_si128(); __m128i blimit_v, limit_v, thresh_v; __m128i mask, hev, flat; __m128i p3 = _mm_load_si128((__m128i *)(s - 4 * pitch)); @@ -720,7 +720,7 @@ void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { - const __m128i zero = _mm_set1_epi16(0); + const __m128i zero = _mm_setzero_si128(); __m128i blimit_v, limit_v, thresh_v; __m128i mask, hev, flat; __m128i p3 = _mm_loadu_si128((__m128i *)(s - 4 * pitch)); diff --git a/vpx_dsp/x86/inv_txfm_sse2.c b/vpx_dsp/x86/inv_txfm_sse2.c index 4b02da9666..f42b3df849 100644 --- a/vpx_dsp/x86/inv_txfm_sse2.c +++ b/vpx_dsp/x86/inv_txfm_sse2.c @@ -243,7 +243,7 @@ void iadst8_sse2(__m128i *const in) { const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); - const __m128i kZero = _mm_set1_epi16(0); + const __m128i kZero = _mm_setzero_si128(); __m128i s[8], u[16], v[8], w[16]; // transpose @@ -546,7 +546,7 @@ void vpx_iadst16_8col_sse2(__m128i *const in) { const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); - const __m128i kZero = _mm_set1_epi16(0); + const __m128i kZero = _mm_setzero_si128(); u[0] = _mm_unpacklo_epi16(in[15], in[0]); u[1] = _mm_unpackhi_epi16(in[15], in[0]); diff --git a/vpx_dsp/x86/loopfilter_avx2.c b/vpx_dsp/x86/loopfilter_avx2.c index be391992af..a58fb65539 100644 --- a/vpx_dsp/x86/loopfilter_avx2.c +++ b/vpx_dsp/x86/loopfilter_avx2.c @@ -18,7 +18,7 @@ void vpx_lpf_horizontal_16_avx2(unsigned char *s, int pitch, const unsigned char *limit, const unsigned char *thresh) { __m128i mask, hev, flat, flat2; - const __m128i zero = _mm_set1_epi16(0); + const __m128i zero = _mm_setzero_si128(); const __m128i one = _mm_set1_epi8(1); __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1; __m128i abs_p1p0; @@ -372,7 +372,7 @@ void vpx_lpf_horizontal_16_dual_avx2(unsigned char *s, int pitch, const unsigned char *limit, const unsigned char *thresh) { __m128i mask, hev, flat, flat2; - const __m128i zero = _mm_set1_epi16(0); + const __m128i zero = _mm_setzero_si128(); const __m128i one = _mm_set1_epi8(1); __m128i p7, p6, p5; __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4; diff --git a/vpx_dsp/x86/loopfilter_sse2.c b/vpx_dsp/x86/loopfilter_sse2.c index 347c9fdbe9..6ea34cdd16 100644 --- a/vpx_dsp/x86/loopfilter_sse2.c +++ b/vpx_dsp/x86/loopfilter_sse2.c @@ -106,7 +106,7 @@ static INLINE __m128i abs_diff(__m128i a, __m128i b) { void vpx_lpf_horizontal_4_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { - const __m128i zero = _mm_set1_epi16(0); + const __m128i zero = _mm_setzero_si128(); const __m128i limit_v = _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)blimit), _mm_loadl_epi64((const __m128i *)limit)); @@ -140,7 +140,7 @@ void vpx_lpf_horizontal_4_sse2(uint8_t *s, int pitch, const uint8_t *blimit, void vpx_lpf_vertical_4_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { - const __m128i zero = _mm_set1_epi16(0); + const __m128i zero = _mm_setzero_si128(); const __m128i limit_v = _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)blimit), _mm_loadl_epi64((const __m128i *)limit)); @@ -232,7 +232,7 @@ void vpx_lpf_horizontal_16_sse2(unsigned char *s, int pitch, const unsigned char *blimit, const unsigned char *limit, const unsigned char *thresh) { - const __m128i zero = _mm_set1_epi16(0); + const __m128i zero = _mm_setzero_si128(); const __m128i one = _mm_set1_epi8(1); const __m128i blimit_v = _mm_load_si128((const __m128i *)blimit); const __m128i limit_v = _mm_load_si128((const __m128i *)limit); @@ -594,7 +594,7 @@ void vpx_lpf_horizontal_16_dual_sse2(unsigned char *s, int pitch, const unsigned char *blimit, const unsigned char *limit, const unsigned char *thresh) { - const __m128i zero = _mm_set1_epi16(0); + const __m128i zero = _mm_setzero_si128(); const __m128i one = _mm_set1_epi8(1); const __m128i blimit_v = _mm_load_si128((const __m128i *)blimit); const __m128i limit_v = _mm_load_si128((const __m128i *)limit); @@ -932,7 +932,7 @@ void vpx_lpf_horizontal_8_sse2(unsigned char *s, int pitch, DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]); DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]); DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]); - const __m128i zero = _mm_set1_epi16(0); + const __m128i zero = _mm_setzero_si128(); const __m128i blimit_v = _mm_load_si128((const __m128i *)blimit); const __m128i limit_v = _mm_load_si128((const __m128i *)limit); const __m128i thresh_v = _mm_load_si128((const __m128i *)thresh); @@ -1152,7 +1152,7 @@ void vpx_lpf_horizontal_8_dual_sse2( DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]); DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]); DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]); - const __m128i zero = _mm_set1_epi16(0); + const __m128i zero = _mm_setzero_si128(); const __m128i blimit = _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)blimit0), _mm_load_si128((const __m128i *)blimit1)); @@ -1406,7 +1406,7 @@ void vpx_lpf_horizontal_4_dual_sse2(unsigned char *s, int pitch, const __m128i thresh = _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)thresh0), _mm_load_si128((const __m128i *)thresh1)); - const __m128i zero = _mm_set1_epi16(0); + const __m128i zero = _mm_setzero_si128(); __m128i p3, p2, p1, p0, q0, q1, q2, q3; __m128i mask, hev, flat; From b3317970e702fe164240d71e73ca425efca72c46 Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 2 Sep 2022 16:44:14 -0700 Subject: [PATCH 0498/1000] variance_neon.cc: simplify __ARM_FEATURE_DOTPROD check missed in 447e27588 vpx_dsp,neon: simplify __ARM_FEATURE_DOTPROD check + fix #if comments only check that the macro is defined, the value doesn't have any effect. from https://arm-software.github.io/acle/main/acle.html: 5.5.7.7. Dot Product extension __ARM_FEATURE_DOTPROD is defined if the dot product data manipulation instructions are supported and the vector intrinsics are available. Note that this implies: - __ARM_NEON == 1 Change-Id: I098b96421b7de5928bb3b11612ca1f32e7b6cbc4 --- vpx_dsp/arm/variance_neon.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/vpx_dsp/arm/variance_neon.c b/vpx_dsp/arm/variance_neon.c index 1b5cbcc46e..f9969ed5a4 100644 --- a/vpx_dsp/arm/variance_neon.c +++ b/vpx_dsp/arm/variance_neon.c @@ -19,7 +19,7 @@ #include "vpx_dsp/arm/sum_neon.h" #include "vpx_ports/mem.h" -#if defined(__ARM_FEATURE_DOTPROD) && (__ARM_FEATURE_DOTPROD == 1) +#if defined(__ARM_FEATURE_DOTPROD) // Process a block of width 4 four rows at a time. static void variance_neon_w4x4(const uint8_t *src_ptr, int src_stride, @@ -111,7 +111,7 @@ static void variance_neon_w8x2(const uint8_t *src_ptr, int src_stride, *sse = horizontal_add_uint32x2(vadd_u32(sse_lo_u32, sse_hi_u32)); } -#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)) +#else // !defined(__ARM_FEATURE_DOTPROD) // The variance helper functions use int16_t for sum. 8 values are accumulated // and then added (at which point they expand up to int32_t). To avoid overflow, @@ -254,7 +254,7 @@ static void variance_neon_w8x2(const uint8_t *src_ptr, int src_stride, vreinterpretq_u32_s32(vaddq_s32(sse_lo_s32, sse_hi_s32))); } -#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) +#endif // defined(__ARM_FEATURE_DOTPROD) void vpx_get8x8var_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, @@ -357,7 +357,7 @@ unsigned int vpx_variance64x64_neon(const uint8_t *src_ptr, int src_stride, return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 12); } -#if defined(__ARM_FEATURE_DOTPROD) && (__ARM_FEATURE_DOTPROD == 1) +#if defined(__ARM_FEATURE_DOTPROD) unsigned int vpx_mse16x16_neon(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, @@ -421,7 +421,7 @@ unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int src_stride, return vget_lane_u32(sse, 0); } -#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)) +#else // !defined(__ARM_FEATURE_DOTPROD) unsigned int vpx_mse16x16_neon(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, @@ -518,4 +518,4 @@ unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int src_stride, return horizontal_add_uint32x4(vreinterpretq_u32_s32(sse)); } -#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) +#endif // defined(__ARM_FEATURE_DOTPROD) From a7527a26e84834ee9434dd3700c5faba0c839d6f Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 2 Sep 2022 16:55:43 -0700 Subject: [PATCH 0499/1000] sad_neon: enable UDOT implementation w/aarch32 Change-Id: Ia28305ec5c61518b732cbacbd102acd2cb7f9d82 --- vpx_dsp/arm/sad_neon.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/vpx_dsp/arm/sad_neon.c b/vpx_dsp/arm/sad_neon.c index 4753aeaec6..ad575d4aae 100644 --- a/vpx_dsp/arm/sad_neon.c +++ b/vpx_dsp/arm/sad_neon.c @@ -21,7 +21,7 @@ uint32_t vpx_sad4x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride) { const uint8x16_t src_u8 = load_unaligned_u8q(src_ptr, src_stride); const uint8x16_t ref_u8 = load_unaligned_u8q(ref_ptr, ref_stride); -#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) +#if defined(__ARM_FEATURE_DOTPROD) const uint8x16_t sad_u8 = vabdq_u8(src_u8, ref_u8); const uint32x4_t dp = vdotq_u32(vdupq_n_u32(0), sad_u8, vdupq_n_u8(1)); return horizontal_add_uint32x4(dp); @@ -39,7 +39,7 @@ uint32_t vpx_sad4x4_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8x16_t ref_u8 = load_unaligned_u8q(ref_ptr, ref_stride); const uint8x16_t second_pred_u8 = vld1q_u8(second_pred); const uint8x16_t avg = vrhaddq_u8(ref_u8, second_pred_u8); -#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) +#if defined(__ARM_FEATURE_DOTPROD) const uint8x16_t sad_u8 = vabdq_u8(src_u8, avg); const uint32x4_t prod = vdotq_u32(vdupq_n_u32(0), sad_u8, vdupq_n_u8(1)); return horizontal_add_uint32x4(prod); @@ -52,7 +52,7 @@ uint32_t vpx_sad4x4_avg_neon(const uint8_t *src_ptr, int src_stride, uint32_t vpx_sad4x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride) { -#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) +#if defined(__ARM_FEATURE_DOTPROD) uint32x4_t prod = vdupq_n_u32(0); const uint8x16_t ones = vdupq_n_u8(1); const uint8x16_t src1_u8 = load_unaligned_u8q(src_ptr, src_stride); @@ -85,7 +85,7 @@ uint32_t vpx_sad4x8_neon(const uint8_t *src_ptr, int src_stride, uint32_t vpx_sad4x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred) { -#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) +#if defined(__ARM_FEATURE_DOTPROD) uint32x4_t prod = vdupq_n_u32(0); const uint8x16_t ones = vdupq_n_u8(1); const uint8x16_t src1_u8 = load_unaligned_u8q(src_ptr, src_stride); @@ -122,7 +122,7 @@ uint32_t vpx_sad4x8_avg_neon(const uint8_t *src_ptr, int src_stride, #endif } -#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) +#if defined(__ARM_FEATURE_DOTPROD) static INLINE uint32x2_t sad8x(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const int height) { @@ -177,7 +177,7 @@ static INLINE uint32x2_t sad8x_avg(const uint8_t *src_ptr, int src_stride, return horizontal_add_uint32x2(prod); \ } -#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)) +#else // !defined(__ARM_FEATURE_DOTPROD) static INLINE uint16x8_t sad8x(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const int height) { @@ -228,13 +228,13 @@ static INLINE uint16x8_t sad8x_avg(const uint8_t *src_ptr, int src_stride, sad8x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \ return horizontal_add_uint16x8(abs); \ } -#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) +#endif // defined(__ARM_FEATURE_DOTPROD) SAD8XN(4) SAD8XN(8) SAD8XN(16) -#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) +#if defined(__ARM_FEATURE_DOTPROD) static INLINE uint32x4_t sad16x(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const int height) { @@ -288,7 +288,7 @@ static INLINE uint32x4_t sad16x_avg(const uint8_t *src_ptr, int src_stride, sad16x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \ return horizontal_add_uint32x4(prod); \ } -#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)) +#else // !defined(__ARM_FEATURE_DOTPROD) static INLINE uint16x8_t sad16x(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const int height) { @@ -342,13 +342,13 @@ static INLINE uint16x8_t sad16x_avg(const uint8_t *src_ptr, int src_stride, sad16x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \ return horizontal_add_uint16x8(abs); \ } -#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) +#endif // defined(__ARM_FEATURE_DOTPROD) SAD16XN(8) SAD16XN(16) SAD16XN(32) -#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) +#if defined(__ARM_FEATURE_DOTPROD) static INLINE uint32x4_t sad32x(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const int height) { @@ -413,7 +413,7 @@ static INLINE uint32x4_t sad32x_avg(const uint8_t *src_ptr, int src_stride, return horizontal_add_uint32x4(prod); \ } -#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)) +#else // defined(__ARM_FEATURE_DOTPROD) static INLINE uint16x8_t sad32x(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const int height) { @@ -477,13 +477,13 @@ static INLINE uint16x8_t sad32x_avg(const uint8_t *src_ptr, int src_stride, sad32x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \ return horizontal_add_uint16x8(abs); \ } -#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) +#endif // defined(__ARM_FEATURE_DOTPROD) SAD32XN(16) SAD32XN(32) SAD32XN(64) -#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) +#if defined(__ARM_FEATURE_DOTPROD) static INLINE uint32x4_t sad64x(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const int height) { @@ -551,7 +551,7 @@ static INLINE uint32x4_t sad64x_avg(const uint8_t *src_ptr, int src_stride, } return prod; } -#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)) +#else // !defined(__ARM_FEATURE_DOTPROD) static INLINE uint32x4_t sad64x(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const int height) { @@ -629,7 +629,7 @@ static INLINE uint32x4_t sad64x_avg(const uint8_t *src_ptr, int src_stride, return vpadalq_u16(sum, abs_1); } } -#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) +#endif // defined(__ARM_FEATURE_DOTPROD) #define SAD64XN(n) \ uint32_t vpx_sad64x##n##_neon(const uint8_t *src_ptr, int src_stride, \ From a46ca4b6bd8e5f360da74693bb43b400d29d91e8 Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 7 Sep 2022 18:41:13 -0700 Subject: [PATCH 0500/1000] vp8_decode: declare 2 variables volatile fixes -Wclobbered warnings with gcc 12.1.0: vp8/vp8_dx_iface.c|278 col 16| warning: variable 'w' might be clobbered by 'longjmp' or 'vfork' [-Wclobbered] vp8/vp8_dx_iface.c|278 col 19| warning: variable 'h' might be clobbered by 'longjmp' or 'vfork' [-Wclobbered] Change-Id: Ib2c606a3450188d7869c066cacaf5615d9746181 --- vp8/vp8_dx_iface.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vp8/vp8_dx_iface.c b/vp8/vp8_dx_iface.c index 6d88e5154f..55a77ba7e5 100644 --- a/vp8/vp8_dx_iface.c +++ b/vp8/vp8_dx_iface.c @@ -275,7 +275,7 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx, void *user_priv, long deadline) { volatile vpx_codec_err_t res; volatile unsigned int resolution_change = 0; - unsigned int w, h; + volatile unsigned int w, h; if (!ctx->fragments.enabled && (data == NULL && data_sz == 0)) { return 0; From 0d734728f6c2887a219de4bd118de8cd290f50c0 Mon Sep 17 00:00:00 2001 From: Scott LaVarnway Date: Thu, 8 Sep 2022 13:05:55 -0700 Subject: [PATCH 0501/1000] Add vpx_highbd_sad16x{32,16,8}x4d_avx2. 1.98x to 2.3x faster than the sse2 version. Bug: b/245917257 Change-Id: Ie4f9bb942ffaf4af7d395fb5a5978b41aabfc93c --- test/sad_test.cc | 11 ++ vpx_dsp/vpx_dsp.mk | 1 + vpx_dsp/vpx_dsp_rtcd_defs.pl | 6 +- vpx_dsp/x86/highbd_sad4d_avx2.c | 183 ++++++++++++++++++++++++++++++++ 4 files changed, 198 insertions(+), 3 deletions(-) create mode 100644 vpx_dsp/x86/highbd_sad4d_avx2.c diff --git a/test/sad_test.cc b/test/sad_test.cc index 4fb2af6244..92c9e6332a 100644 --- a/test/sad_test.cc +++ b/test/sad_test.cc @@ -1079,6 +1079,17 @@ INSTANTIATE_TEST_SUITE_P(AVX2, SADavgTest, ::testing::ValuesIn(avg_avx2_tests)); const SadMxNx4Param x4d_avx2_tests[] = { SadMxNx4Param(64, 64, &vpx_sad64x64x4d_avx2), SadMxNx4Param(32, 32, &vpx_sad32x32x4d_avx2), +#if CONFIG_VP9_HIGHBITDEPTH + SadMxNx4Param(16, 32, &vpx_highbd_sad16x32x4d_avx2, 8), + SadMxNx4Param(16, 16, &vpx_highbd_sad16x16x4d_avx2, 8), + SadMxNx4Param(16, 8, &vpx_highbd_sad16x8x4d_avx2, 8), + SadMxNx4Param(16, 32, &vpx_highbd_sad16x32x4d_avx2, 10), + SadMxNx4Param(16, 16, &vpx_highbd_sad16x16x4d_avx2, 10), + SadMxNx4Param(16, 8, &vpx_highbd_sad16x8x4d_avx2, 10), + SadMxNx4Param(16, 32, &vpx_highbd_sad16x32x4d_avx2, 12), + SadMxNx4Param(16, 16, &vpx_highbd_sad16x16x4d_avx2, 12), + SadMxNx4Param(16, 8, &vpx_highbd_sad16x8x4d_avx2, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH }; INSTANTIATE_TEST_SUITE_P(AVX2, SADx4Test, ::testing::ValuesIn(x4d_avx2_tests)); diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index 3019bff8f7..34e9d736db 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -393,6 +393,7 @@ ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad4d_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad_sse2.asm DSP_SRCS-$(HAVE_NEON) += arm/highbd_sad_neon.c +DSP_SRCS-$(HAVE_AVX2) += x86/highbd_sad4d_avx2.c endif # CONFIG_VP9_HIGHBITDEPTH endif # CONFIG_ENCODERS diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 44dee56780..df2c8da74e 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -1048,13 +1048,13 @@ () specialize qw/vpx_highbd_sad32x16x4d sse2 neon/; add_proto qw/void vpx_highbd_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; - specialize qw/vpx_highbd_sad16x32x4d sse2 neon/; + specialize qw/vpx_highbd_sad16x32x4d sse2 neon avx2/; add_proto qw/void vpx_highbd_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; - specialize qw/vpx_highbd_sad16x16x4d sse2 neon/; + specialize qw/vpx_highbd_sad16x16x4d sse2 neon avx2/; add_proto qw/void vpx_highbd_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; - specialize qw/vpx_highbd_sad16x8x4d sse2 neon/; + specialize qw/vpx_highbd_sad16x8x4d sse2 neon avx2/; add_proto qw/void vpx_highbd_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad8x16x4d sse2 neon/; diff --git a/vpx_dsp/x86/highbd_sad4d_avx2.c b/vpx_dsp/x86/highbd_sad4d_avx2.c new file mode 100644 index 0000000000..46c7e4fbc8 --- /dev/null +++ b/vpx_dsp/x86/highbd_sad4d_avx2.c @@ -0,0 +1,183 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include // AVX2 +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" + +static VPX_FORCE_INLINE void calc_final_4(const __m256i *const sums /*[4]*/, + uint32_t sad_array[4]) { + const __m256i t0 = _mm256_hadd_epi32(sums[0], sums[1]); + const __m256i t1 = _mm256_hadd_epi32(sums[2], sums[3]); + const __m256i t2 = _mm256_hadd_epi32(t0, t1); + const __m128i sum = _mm_add_epi32(_mm256_castsi256_si128(t2), + _mm256_extractf128_si256(t2, 1)); + _mm_storeu_si128((__m128i *)sad_array, sum); +} + +static VPX_FORCE_INLINE void highbd_sad16xHx4d(__m256i *sums_16 /*[4]*/, + const uint16_t *src, + int src_stride, + uint16_t *refs[4], + int ref_stride, int height) { + int i; + for (i = 0; i < height; i++) { + __m256i r[4]; + + // load src and all ref[] + const __m256i s = _mm256_load_si256((const __m256i *)src); + r[0] = _mm256_loadu_si256((const __m256i *)refs[0]); + r[1] = _mm256_loadu_si256((const __m256i *)refs[1]); + r[2] = _mm256_loadu_si256((const __m256i *)refs[2]); + r[3] = _mm256_loadu_si256((const __m256i *)refs[3]); + + // absolute differences between every ref[] to src + r[0] = _mm256_abs_epi16(_mm256_sub_epi16(r[0], s)); + r[1] = _mm256_abs_epi16(_mm256_sub_epi16(r[1], s)); + r[2] = _mm256_abs_epi16(_mm256_sub_epi16(r[2], s)); + r[3] = _mm256_abs_epi16(_mm256_sub_epi16(r[3], s)); + + // sum every abs diff + sums_16[0] = _mm256_add_epi16(sums_16[0], r[0]); + sums_16[1] = _mm256_add_epi16(sums_16[1], r[1]); + sums_16[2] = _mm256_add_epi16(sums_16[2], r[2]); + sums_16[3] = _mm256_add_epi16(sums_16[3], r[3]); + + src += src_stride; + refs[0] += ref_stride; + refs[1] += ref_stride; + refs[2] += ref_stride; + refs[3] += ref_stride; + } +} + +void vpx_highbd_sad16x32x4d_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], + int ref_stride, uint32_t sad_array[4]) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); + uint16_t *refs[4]; + __m256i sums_16[4]; + __m256i sums_32[4]; + int i; + + refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]); + refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]); + refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]); + refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]); + sums_32[0] = _mm256_setzero_si256(); + sums_32[1] = _mm256_setzero_si256(); + sums_32[2] = _mm256_setzero_si256(); + sums_32[3] = _mm256_setzero_si256(); + + for (i = 0; i < 2; ++i) { + sums_16[0] = _mm256_setzero_si256(); + sums_16[1] = _mm256_setzero_si256(); + sums_16[2] = _mm256_setzero_si256(); + sums_16[3] = _mm256_setzero_si256(); + + highbd_sad16xHx4d(sums_16, src, src_stride, refs, ref_stride, 16); + + // sums_16 will outrange after 16 rows, so add current sums_16 to sums_32 + sums_32[0] = _mm256_add_epi32( + sums_32[0], + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[0], 1)))); + sums_32[1] = _mm256_add_epi32( + sums_32[1], + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[1], 1)))); + sums_32[2] = _mm256_add_epi32( + sums_32[2], + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[2], 1)))); + sums_32[3] = _mm256_add_epi32( + sums_32[3], + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[3], 1)))); + + src += src_stride << 4; + } + calc_final_4(sums_32, sad_array); +} + +void vpx_highbd_sad16x16x4d_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], + int ref_stride, uint32_t sad_array[4]) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); + uint16_t *refs[4]; + __m256i sums_16[4]; + + refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]); + refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]); + refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]); + refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]); + sums_16[0] = _mm256_setzero_si256(); + sums_16[1] = _mm256_setzero_si256(); + sums_16[2] = _mm256_setzero_si256(); + sums_16[3] = _mm256_setzero_si256(); + + highbd_sad16xHx4d(sums_16, src, src_stride, refs, ref_stride, 16); + + { + __m256i sums_32[4]; + sums_32[0] = _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[0], 1))); + sums_32[1] = _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[1], 1))); + sums_32[2] = _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[2], 1))); + sums_32[3] = _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[3], 1))); + calc_final_4(sums_32, sad_array); + } +} + +void vpx_highbd_sad16x8x4d_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], + int ref_stride, uint32_t sad_array[4]) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); + uint16_t *refs[4]; + __m256i sums_16[4]; + + refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]); + refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]); + refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]); + refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]); + sums_16[0] = _mm256_setzero_si256(); + sums_16[1] = _mm256_setzero_si256(); + sums_16[2] = _mm256_setzero_si256(); + sums_16[3] = _mm256_setzero_si256(); + + highbd_sad16xHx4d(sums_16, src, src_stride, refs, ref_stride, 8); + + { + __m256i sums_32[4]; + sums_32[0] = _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[0], 1))); + sums_32[1] = _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[1], 1))); + sums_32[2] = _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[2], 1))); + sums_32[3] = _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[3], 1))); + calc_final_4(sums_32, sad_array); + } +} From 33c43c14ee04e6bb74ce037d44b98c5ad4ea9474 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Thu, 8 Sep 2022 15:35:13 -0700 Subject: [PATCH 0502/1000] Update third_party/googletest to v1.12.1 See https://github.com/google/googletest/releases/tag/release-1.12.1. Modeled after https://aomedia-review.googlesource.com/c/aom/+/162601. Change-Id: If0ced3097b4c8490985e3381aaac9b3266d52ae7 --- third_party/googletest/README.libvpx | 10 +- third_party/googletest/src/.clang-format | 4 + third_party/googletest/src/CONTRIBUTORS | 2 + third_party/googletest/src/README.md | 8 +- .../include/gtest/gtest-assertion-result.h | 237 +++ .../src/include/gtest/gtest-death-test.h | 89 +- .../src/include/gtest/gtest-matchers.h | 76 +- .../src/include/gtest/gtest-message.h | 27 +- .../src/include/gtest/gtest-param-test.h | 87 +- .../src/include/gtest/gtest-printers.h | 65 +- .../googletest/src/include/gtest/gtest-spi.h | 132 +- .../src/include/gtest/gtest-test-part.h | 14 +- .../src/include/gtest/gtest-typed-test.h | 38 +- .../googletest/src/include/gtest/gtest.h | 532 ++---- .../src/include/gtest/gtest_pred_impl.h | 200 +-- .../googletest/src/include/gtest/gtest_prod.h | 9 +- .../include/gtest/internal/custom/README.md | 12 - .../gtest/internal/custom/gtest-port.h | 31 + .../internal/gtest-death-test-internal.h | 74 +- .../include/gtest/internal/gtest-filepath.h | 17 +- .../include/gtest/internal/gtest-internal.h | 330 ++-- .../include/gtest/internal/gtest-param-util.h | 145 +- .../include/gtest/internal/gtest-port-arch.h | 100 +- .../src/include/gtest/internal/gtest-port.h | 982 +++++------ .../src/include/gtest/internal/gtest-string.h | 18 +- .../include/gtest/internal/gtest-type-util.h | 21 +- third_party/googletest/src/src/gtest-all.cc | 3 +- .../src/src/gtest-assertion-result.cc | 77 + .../googletest/src/src/gtest-death-test.cc | 520 +++--- .../googletest/src/src/gtest-filepath.cc | 78 +- .../googletest/src/src/gtest-internal-inl.h | 217 ++- .../googletest/src/src/gtest-matchers.cc | 5 +- third_party/googletest/src/src/gtest-port.cc | 349 ++-- .../googletest/src/src/gtest-printers.cc | 124 +- .../googletest/src/src/gtest-test-part.cc | 19 +- .../googletest/src/src/gtest-typed-test.cc | 7 +- third_party/googletest/src/src/gtest.cc | 1509 +++++++++-------- third_party/googletest/src/src/gtest_main.cc | 5 +- 38 files changed, 3174 insertions(+), 2999 deletions(-) create mode 100644 third_party/googletest/src/.clang-format create mode 100644 third_party/googletest/src/include/gtest/gtest-assertion-result.h create mode 100644 third_party/googletest/src/src/gtest-assertion-result.cc diff --git a/third_party/googletest/README.libvpx b/third_party/googletest/README.libvpx index b9a74922f0..5f6b01b0ec 100644 --- a/third_party/googletest/README.libvpx +++ b/third_party/googletest/README.libvpx @@ -1,5 +1,5 @@ URL: https://github.com/google/googletest.git -Version: release-1.11.0 +Version: release-1.12.1 License: BSD License File: LICENSE @@ -13,9 +13,17 @@ generation. Local Modifications: - Remove everything but: + .clang-format CONTRIBUTORS googletest/ include README.md src LICENSE +- Move .clang-format, CONTRIBUTORS, and LICENSE into googletest/ +- In googletest/include/gtest/internal/custom/gtest-port.h, define + GTEST_HAS_NOTIFICATION_ as 1 and use a stub Notification class to fix + the mingw32 g++ compilation errors caused by the lack of std::mutex + and std::condition_variable in the and + headers if mingw32 is configured with the win32 threads option. See + https://stackoverflow.com/questions/17242516/mingw-w64-threads-posix-vs-win32 diff --git a/third_party/googletest/src/.clang-format b/third_party/googletest/src/.clang-format new file mode 100644 index 0000000000..5b9bfe6d22 --- /dev/null +++ b/third_party/googletest/src/.clang-format @@ -0,0 +1,4 @@ +# Run manually to reformat a file: +# clang-format -i --style=file +Language: Cpp +BasedOnStyle: Google diff --git a/third_party/googletest/src/CONTRIBUTORS b/third_party/googletest/src/CONTRIBUTORS index 76db0b40ff..77397a5b53 100644 --- a/third_party/googletest/src/CONTRIBUTORS +++ b/third_party/googletest/src/CONTRIBUTORS @@ -34,6 +34,7 @@ Manuel Klimek Mario Tanev Mark Paskin Markus Heule +Martijn Vels Matthew Simmons Mika Raento Mike Bland @@ -55,6 +56,7 @@ Russ Rufer Sean Mcafee Sigurður Ásgeirsson Sverre Sundsdal +Szymon Sobik Takeshi Yoshino Tracy Bialik Vadim Berman diff --git a/third_party/googletest/src/README.md b/third_party/googletest/src/README.md index 1f8b349ae7..d26b309ed0 100644 --- a/third_party/googletest/src/README.md +++ b/third_party/googletest/src/README.md @@ -25,7 +25,7 @@ When building GoogleTest as a standalone project, the typical workflow starts with ``` -git clone https://github.com/google/googletest.git -b release-1.10.0 +git clone https://github.com/google/googletest.git -b release-1.11.0 cd googletest # Main directory of the cloned repository. mkdir build # Create a directory to hold the build output. cd build @@ -94,7 +94,7 @@ include(FetchContent) FetchContent_Declare( googletest # Specify the commit you depend on and update it regularly. - URL https://github.com/google/googletest/archive/609281088cfefc76f9d0ce82e1ff6c30cc3591e5.zip + URL https://github.com/google/googletest/archive/e2239ee6043f73722e7aa812a459f54a28552929.zip ) # For Windows: Prevent overriding the parent project's compiler/linker settings set(gtest_force_shared_crt ON CACHE BOOL "" FORCE) @@ -203,7 +203,9 @@ add -DGTEST_DONT_DEFINE_FOO=1 to the compiler flags to tell GoogleTest to change the macro's name from `FOO` -to `GTEST_FOO`. Currently `FOO` can be `FAIL`, `SUCCEED`, or `TEST`. For +to `GTEST_FOO`. Currently `FOO` can be `ASSERT_EQ`, `ASSERT_FALSE`, `ASSERT_GE`, +`ASSERT_GT`, `ASSERT_LE`, `ASSERT_LT`, `ASSERT_NE`, `ASSERT_TRUE`, +`EXPECT_FALSE`, `EXPECT_TRUE`, `FAIL`, `SUCCEED`, `TEST`, or `TEST_F`. For example, with `-DGTEST_DONT_DEFINE_TEST=1`, you'll need to write GTEST_TEST(SomeTest, DoesThis) { ... } diff --git a/third_party/googletest/src/include/gtest/gtest-assertion-result.h b/third_party/googletest/src/include/gtest/gtest-assertion-result.h new file mode 100644 index 0000000000..addbb59c64 --- /dev/null +++ b/third_party/googletest/src/include/gtest/gtest-assertion-result.h @@ -0,0 +1,237 @@ +// Copyright 2005, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// The Google C++ Testing and Mocking Framework (Google Test) +// +// This file implements the AssertionResult type. + +// IWYU pragma: private, include "gtest/gtest.h" +// IWYU pragma: friend gtest/.* +// IWYU pragma: friend gmock/.* + +#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_ASSERTION_RESULT_H_ +#define GOOGLETEST_INCLUDE_GTEST_GTEST_ASSERTION_RESULT_H_ + +#include +#include +#include +#include + +#include "gtest/gtest-message.h" +#include "gtest/internal/gtest-port.h" + +GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \ +/* class A needs to have dll-interface to be used by clients of class B */) + +namespace testing { + +// A class for indicating whether an assertion was successful. When +// the assertion wasn't successful, the AssertionResult object +// remembers a non-empty message that describes how it failed. +// +// To create an instance of this class, use one of the factory functions +// (AssertionSuccess() and AssertionFailure()). +// +// This class is useful for two purposes: +// 1. Defining predicate functions to be used with Boolean test assertions +// EXPECT_TRUE/EXPECT_FALSE and their ASSERT_ counterparts +// 2. Defining predicate-format functions to be +// used with predicate assertions (ASSERT_PRED_FORMAT*, etc). +// +// For example, if you define IsEven predicate: +// +// testing::AssertionResult IsEven(int n) { +// if ((n % 2) == 0) +// return testing::AssertionSuccess(); +// else +// return testing::AssertionFailure() << n << " is odd"; +// } +// +// Then the failed expectation EXPECT_TRUE(IsEven(Fib(5))) +// will print the message +// +// Value of: IsEven(Fib(5)) +// Actual: false (5 is odd) +// Expected: true +// +// instead of a more opaque +// +// Value of: IsEven(Fib(5)) +// Actual: false +// Expected: true +// +// in case IsEven is a simple Boolean predicate. +// +// If you expect your predicate to be reused and want to support informative +// messages in EXPECT_FALSE and ASSERT_FALSE (negative assertions show up +// about half as often as positive ones in our tests), supply messages for +// both success and failure cases: +// +// testing::AssertionResult IsEven(int n) { +// if ((n % 2) == 0) +// return testing::AssertionSuccess() << n << " is even"; +// else +// return testing::AssertionFailure() << n << " is odd"; +// } +// +// Then a statement EXPECT_FALSE(IsEven(Fib(6))) will print +// +// Value of: IsEven(Fib(6)) +// Actual: true (8 is even) +// Expected: false +// +// NB: Predicates that support negative Boolean assertions have reduced +// performance in positive ones so be careful not to use them in tests +// that have lots (tens of thousands) of positive Boolean assertions. +// +// To use this class with EXPECT_PRED_FORMAT assertions such as: +// +// // Verifies that Foo() returns an even number. +// EXPECT_PRED_FORMAT1(IsEven, Foo()); +// +// you need to define: +// +// testing::AssertionResult IsEven(const char* expr, int n) { +// if ((n % 2) == 0) +// return testing::AssertionSuccess(); +// else +// return testing::AssertionFailure() +// << "Expected: " << expr << " is even\n Actual: it's " << n; +// } +// +// If Foo() returns 5, you will see the following message: +// +// Expected: Foo() is even +// Actual: it's 5 +// +class GTEST_API_ AssertionResult { + public: + // Copy constructor. + // Used in EXPECT_TRUE/FALSE(assertion_result). + AssertionResult(const AssertionResult& other); + +// C4800 is a level 3 warning in Visual Studio 2015 and earlier. +// This warning is not emitted in Visual Studio 2017. +// This warning is off by default starting in Visual Studio 2019 but can be +// enabled with command-line options. +#if defined(_MSC_VER) && (_MSC_VER < 1910 || _MSC_VER >= 1920) + GTEST_DISABLE_MSC_WARNINGS_PUSH_(4800 /* forcing value to bool */) +#endif + + // Used in the EXPECT_TRUE/FALSE(bool_expression). + // + // T must be contextually convertible to bool. + // + // The second parameter prevents this overload from being considered if + // the argument is implicitly convertible to AssertionResult. In that case + // we want AssertionResult's copy constructor to be used. + template + explicit AssertionResult( + const T& success, + typename std::enable_if< + !std::is_convertible::value>::type* + /*enabler*/ + = nullptr) + : success_(success) {} + +#if defined(_MSC_VER) && (_MSC_VER < 1910 || _MSC_VER >= 1920) + GTEST_DISABLE_MSC_WARNINGS_POP_() +#endif + + // Assignment operator. + AssertionResult& operator=(AssertionResult other) { + swap(other); + return *this; + } + + // Returns true if and only if the assertion succeeded. + operator bool() const { return success_; } // NOLINT + + // Returns the assertion's negation. Used with EXPECT/ASSERT_FALSE. + AssertionResult operator!() const; + + // Returns the text streamed into this AssertionResult. Test assertions + // use it when they fail (i.e., the predicate's outcome doesn't match the + // assertion's expectation). When nothing has been streamed into the + // object, returns an empty string. + const char* message() const { + return message_.get() != nullptr ? message_->c_str() : ""; + } + // Deprecated; please use message() instead. + const char* failure_message() const { return message(); } + + // Streams a custom failure message into this object. + template + AssertionResult& operator<<(const T& value) { + AppendMessage(Message() << value); + return *this; + } + + // Allows streaming basic output manipulators such as endl or flush into + // this object. + AssertionResult& operator<<( + ::std::ostream& (*basic_manipulator)(::std::ostream& stream)) { + AppendMessage(Message() << basic_manipulator); + return *this; + } + + private: + // Appends the contents of message to message_. + void AppendMessage(const Message& a_message) { + if (message_.get() == nullptr) message_.reset(new ::std::string); + message_->append(a_message.GetString().c_str()); + } + + // Swap the contents of this AssertionResult with other. + void swap(AssertionResult& other); + + // Stores result of the assertion predicate. + bool success_; + // Stores the message describing the condition in case the expectation + // construct is not satisfied with the predicate's outcome. + // Referenced via a pointer to avoid taking too much stack frame space + // with test assertions. + std::unique_ptr< ::std::string> message_; +}; + +// Makes a successful assertion result. +GTEST_API_ AssertionResult AssertionSuccess(); + +// Makes a failed assertion result. +GTEST_API_ AssertionResult AssertionFailure(); + +// Makes a failed assertion result with the given failure message. +// Deprecated; use AssertionFailure() << msg. +GTEST_API_ AssertionResult AssertionFailure(const Message& msg); + +} // namespace testing + +GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 + +#endif // GOOGLETEST_INCLUDE_GTEST_GTEST_ASSERTION_RESULT_H_ diff --git a/third_party/googletest/src/include/gtest/gtest-death-test.h b/third_party/googletest/src/include/gtest/gtest-death-test.h index 9b4d4d1337..84e5a5bbd3 100644 --- a/third_party/googletest/src/include/gtest/gtest-death-test.h +++ b/third_party/googletest/src/include/gtest/gtest-death-test.h @@ -27,21 +27,21 @@ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// // The Google C++ Testing and Mocking Framework (Google Test) // // This header file defines the public API for death tests. It is // #included by gtest.h so a user doesn't need to include this // directly. -// GOOGLETEST_CM0001 DO NOT DELETE + +// IWYU pragma: private, include "gtest/gtest.h" +// IWYU pragma: friend gtest/.* +// IWYU pragma: friend gmock/.* #ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_ #define GOOGLETEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_ #include "gtest/internal/gtest-death-test-internal.h" -namespace testing { - // This flag controls the style of death tests. Valid values are "threadsafe", // meaning that the death test child process will re-execute the test binary // from the start, running only a single death test, or "fast", @@ -49,6 +49,8 @@ namespace testing { // after forking. GTEST_DECLARE_string_(death_test_style); +namespace testing { + #if GTEST_HAS_DEATH_TEST namespace internal { @@ -103,7 +105,6 @@ GTEST_API_ bool InDeathTestChild(); // // On the regular expressions used in death tests: // -// GOOGLETEST_CM0005 DO NOT DELETE // On POSIX-compliant systems (*nix), we use the library, // which uses the POSIX extended regex syntax. // @@ -169,24 +170,24 @@ GTEST_API_ bool InDeathTestChild(); // Asserts that a given `statement` causes the program to exit, with an // integer exit status that satisfies `predicate`, and emitting error output // that matches `matcher`. -# define ASSERT_EXIT(statement, predicate, matcher) \ - GTEST_DEATH_TEST_(statement, predicate, matcher, GTEST_FATAL_FAILURE_) +#define ASSERT_EXIT(statement, predicate, matcher) \ + GTEST_DEATH_TEST_(statement, predicate, matcher, GTEST_FATAL_FAILURE_) // Like `ASSERT_EXIT`, but continues on to successive tests in the // test suite, if any: -# define EXPECT_EXIT(statement, predicate, matcher) \ - GTEST_DEATH_TEST_(statement, predicate, matcher, GTEST_NONFATAL_FAILURE_) +#define EXPECT_EXIT(statement, predicate, matcher) \ + GTEST_DEATH_TEST_(statement, predicate, matcher, GTEST_NONFATAL_FAILURE_) // Asserts that a given `statement` causes the program to exit, either by // explicitly exiting with a nonzero exit code or being killed by a // signal, and emitting error output that matches `matcher`. -# define ASSERT_DEATH(statement, matcher) \ - ASSERT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, matcher) +#define ASSERT_DEATH(statement, matcher) \ + ASSERT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, matcher) // Like `ASSERT_DEATH`, but continues on to successive tests in the // test suite, if any: -# define EXPECT_DEATH(statement, matcher) \ - EXPECT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, matcher) +#define EXPECT_DEATH(statement, matcher) \ + EXPECT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, matcher) // Two predicate classes that can be used in {ASSERT,EXPECT}_EXIT*: @@ -197,22 +198,23 @@ class GTEST_API_ ExitedWithCode { ExitedWithCode(const ExitedWithCode&) = default; void operator=(const ExitedWithCode& other) = delete; bool operator()(int exit_status) const; + private: const int exit_code_; }; -# if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA +#if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA // Tests that an exit code describes an exit due to termination by a // given signal. -// GOOGLETEST_CM0006 DO NOT DELETE class GTEST_API_ KilledBySignal { public: explicit KilledBySignal(int signum); bool operator()(int exit_status) const; + private: const int signum_; }; -# endif // !GTEST_OS_WINDOWS +#endif // !GTEST_OS_WINDOWS // EXPECT_DEBUG_DEATH asserts that the given statements die in debug mode. // The death testing framework causes this to have interesting semantics, @@ -257,23 +259,21 @@ class GTEST_API_ KilledBySignal { // EXPECT_EQ(12, DieInDebugOr12(&sideeffect)); // }, "death"); // -# ifdef NDEBUG +#ifdef NDEBUG -# define EXPECT_DEBUG_DEATH(statement, regex) \ +#define EXPECT_DEBUG_DEATH(statement, regex) \ GTEST_EXECUTE_STATEMENT_(statement, regex) -# define ASSERT_DEBUG_DEATH(statement, regex) \ +#define ASSERT_DEBUG_DEATH(statement, regex) \ GTEST_EXECUTE_STATEMENT_(statement, regex) -# else +#else -# define EXPECT_DEBUG_DEATH(statement, regex) \ - EXPECT_DEATH(statement, regex) +#define EXPECT_DEBUG_DEATH(statement, regex) EXPECT_DEATH(statement, regex) -# define ASSERT_DEBUG_DEATH(statement, regex) \ - ASSERT_DEATH(statement, regex) +#define ASSERT_DEBUG_DEATH(statement, regex) ASSERT_DEATH(statement, regex) -# endif // NDEBUG for EXPECT_DEBUG_DEATH +#endif // NDEBUG for EXPECT_DEBUG_DEATH #endif // GTEST_HAS_DEATH_TEST // This macro is used for implementing macros such as @@ -311,18 +311,17 @@ class GTEST_API_ KilledBySignal { // statement unconditionally returns or throws. The Message constructor at // the end allows the syntax of streaming additional messages into the // macro, for compilational compatibility with EXPECT_DEATH/ASSERT_DEATH. -# define GTEST_UNSUPPORTED_DEATH_TEST(statement, regex, terminator) \ - GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ - if (::testing::internal::AlwaysTrue()) { \ - GTEST_LOG_(WARNING) \ - << "Death tests are not supported on this platform.\n" \ - << "Statement '" #statement "' cannot be verified."; \ - } else if (::testing::internal::AlwaysFalse()) { \ - ::testing::internal::RE::PartialMatch(".*", (regex)); \ - GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ - terminator; \ - } else \ - ::testing::Message() +#define GTEST_UNSUPPORTED_DEATH_TEST(statement, regex, terminator) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (::testing::internal::AlwaysTrue()) { \ + GTEST_LOG_(WARNING) << "Death tests are not supported on this platform.\n" \ + << "Statement '" #statement "' cannot be verified."; \ + } else if (::testing::internal::AlwaysFalse()) { \ + ::testing::internal::RE::PartialMatch(".*", (regex)); \ + GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ + terminator; \ + } else \ + ::testing::Message() // EXPECT_DEATH_IF_SUPPORTED(statement, regex) and // ASSERT_DEATH_IF_SUPPORTED(statement, regex) expand to real death tests if @@ -330,15 +329,15 @@ class GTEST_API_ KilledBySignal { // useful when you are combining death test assertions with normal test // assertions in one test. #if GTEST_HAS_DEATH_TEST -# define EXPECT_DEATH_IF_SUPPORTED(statement, regex) \ - EXPECT_DEATH(statement, regex) -# define ASSERT_DEATH_IF_SUPPORTED(statement, regex) \ - ASSERT_DEATH(statement, regex) +#define EXPECT_DEATH_IF_SUPPORTED(statement, regex) \ + EXPECT_DEATH(statement, regex) +#define ASSERT_DEATH_IF_SUPPORTED(statement, regex) \ + ASSERT_DEATH(statement, regex) #else -# define EXPECT_DEATH_IF_SUPPORTED(statement, regex) \ - GTEST_UNSUPPORTED_DEATH_TEST(statement, regex, ) -# define ASSERT_DEATH_IF_SUPPORTED(statement, regex) \ - GTEST_UNSUPPORTED_DEATH_TEST(statement, regex, return) +#define EXPECT_DEATH_IF_SUPPORTED(statement, regex) \ + GTEST_UNSUPPORTED_DEATH_TEST(statement, regex, ) +#define ASSERT_DEATH_IF_SUPPORTED(statement, regex) \ + GTEST_UNSUPPORTED_DEATH_TEST(statement, regex, return) #endif } // namespace testing diff --git a/third_party/googletest/src/include/gtest/gtest-matchers.h b/third_party/googletest/src/include/gtest/gtest-matchers.h index 9fa34a05ba..bffa00c533 100644 --- a/third_party/googletest/src/include/gtest/gtest-matchers.h +++ b/third_party/googletest/src/include/gtest/gtest-matchers.h @@ -32,6 +32,10 @@ // This file implements just enough of the matcher interface to allow // EXPECT_DEATH and friends to accept a matcher argument. +// IWYU pragma: private, include "gtest/gtest.h" +// IWYU pragma: friend gtest/.* +// IWYU pragma: friend gmock/.* + #ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_MATCHERS_H_ #define GOOGLETEST_INCLUDE_GTEST_GTEST_MATCHERS_H_ @@ -98,11 +102,11 @@ class MatchResultListener { private: ::std::ostream* const stream_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(MatchResultListener); + MatchResultListener(const MatchResultListener&) = delete; + MatchResultListener& operator=(const MatchResultListener&) = delete; }; -inline MatchResultListener::~MatchResultListener() { -} +inline MatchResultListener::~MatchResultListener() {} // An instance of a subclass of this knows how to describe itself as a // matcher. @@ -176,27 +180,39 @@ namespace internal { struct AnyEq { template - bool operator()(const A& a, const B& b) const { return a == b; } + bool operator()(const A& a, const B& b) const { + return a == b; + } }; struct AnyNe { template - bool operator()(const A& a, const B& b) const { return a != b; } + bool operator()(const A& a, const B& b) const { + return a != b; + } }; struct AnyLt { template - bool operator()(const A& a, const B& b) const { return a < b; } + bool operator()(const A& a, const B& b) const { + return a < b; + } }; struct AnyGt { template - bool operator()(const A& a, const B& b) const { return a > b; } + bool operator()(const A& a, const B& b) const { + return a > b; + } }; struct AnyLe { template - bool operator()(const A& a, const B& b) const { return a <= b; } + bool operator()(const A& a, const B& b) const { + return a <= b; + } }; struct AnyGe { template - bool operator()(const A& a, const B& b) const { return a >= b; } + bool operator()(const A& a, const B& b) const { + return a >= b; + } }; // A match result listener that ignores the explanation. @@ -205,7 +221,8 @@ class DummyMatchResultListener : public MatchResultListener { DummyMatchResultListener() : MatchResultListener(nullptr) {} private: - GTEST_DISALLOW_COPY_AND_ASSIGN_(DummyMatchResultListener); + DummyMatchResultListener(const DummyMatchResultListener&) = delete; + DummyMatchResultListener& operator=(const DummyMatchResultListener&) = delete; }; // A match result listener that forwards the explanation to a given @@ -217,7 +234,9 @@ class StreamMatchResultListener : public MatchResultListener { : MatchResultListener(os) {} private: - GTEST_DISALLOW_COPY_AND_ASSIGN_(StreamMatchResultListener); + StreamMatchResultListener(const StreamMatchResultListener&) = delete; + StreamMatchResultListener& operator=(const StreamMatchResultListener&) = + delete; }; struct SharedPayloadBase { @@ -284,17 +303,18 @@ class MatcherBase : private MatcherDescriberInterface { } protected: - MatcherBase() : vtable_(nullptr) {} + MatcherBase() : vtable_(nullptr), buffer_() {} // Constructs a matcher from its implementation. template - explicit MatcherBase(const MatcherInterface* impl) { + explicit MatcherBase(const MatcherInterface* impl) + : vtable_(nullptr), buffer_() { Init(impl); } template ::type::is_gtest_matcher> - MatcherBase(M&& m) { // NOLINT + MatcherBase(M&& m) : vtable_(nullptr), buffer_() { // NOLINT Init(std::forward(m)); } @@ -420,8 +440,8 @@ class MatcherBase : private MatcherDescriberInterface { static const M& Get(const MatcherBase& m) { // When inlined along with Init, need to be explicit to avoid violating // strict aliasing rules. - const M *ptr = static_cast( - static_cast(&m.buffer_)); + const M* ptr = + static_cast(static_cast(&m.buffer_)); return *ptr; } static void Init(MatcherBase& m, M impl) { @@ -741,7 +761,7 @@ template class EqMatcher : public ComparisonBase, Rhs, AnyEq> { public: explicit EqMatcher(const Rhs& rhs) - : ComparisonBase, Rhs, AnyEq>(rhs) { } + : ComparisonBase, Rhs, AnyEq>(rhs) {} static const char* Desc() { return "is equal to"; } static const char* NegatedDesc() { return "isn't equal to"; } }; @@ -749,7 +769,7 @@ template class NeMatcher : public ComparisonBase, Rhs, AnyNe> { public: explicit NeMatcher(const Rhs& rhs) - : ComparisonBase, Rhs, AnyNe>(rhs) { } + : ComparisonBase, Rhs, AnyNe>(rhs) {} static const char* Desc() { return "isn't equal to"; } static const char* NegatedDesc() { return "is equal to"; } }; @@ -757,7 +777,7 @@ template class LtMatcher : public ComparisonBase, Rhs, AnyLt> { public: explicit LtMatcher(const Rhs& rhs) - : ComparisonBase, Rhs, AnyLt>(rhs) { } + : ComparisonBase, Rhs, AnyLt>(rhs) {} static const char* Desc() { return "is <"; } static const char* NegatedDesc() { return "isn't <"; } }; @@ -765,7 +785,7 @@ template class GtMatcher : public ComparisonBase, Rhs, AnyGt> { public: explicit GtMatcher(const Rhs& rhs) - : ComparisonBase, Rhs, AnyGt>(rhs) { } + : ComparisonBase, Rhs, AnyGt>(rhs) {} static const char* Desc() { return "is >"; } static const char* NegatedDesc() { return "isn't >"; } }; @@ -773,7 +793,7 @@ template class LeMatcher : public ComparisonBase, Rhs, AnyLe> { public: explicit LeMatcher(const Rhs& rhs) - : ComparisonBase, Rhs, AnyLe>(rhs) { } + : ComparisonBase, Rhs, AnyLe>(rhs) {} static const char* Desc() { return "is <="; } static const char* NegatedDesc() { return "isn't <="; } }; @@ -781,7 +801,7 @@ template class GeMatcher : public ComparisonBase, Rhs, AnyGe> { public: explicit GeMatcher(const Rhs& rhs) - : ComparisonBase, Rhs, AnyGe>(rhs) { } + : ComparisonBase, Rhs, AnyGe>(rhs) {} static const char* Desc() { return "is >="; } static const char* NegatedDesc() { return "isn't >="; } }; @@ -872,12 +892,16 @@ PolymorphicMatcher ContainsRegex( // Note: if the parameter of Eq() were declared as const T&, Eq("foo") // wouldn't compile. template -inline internal::EqMatcher Eq(T x) { return internal::EqMatcher(x); } +inline internal::EqMatcher Eq(T x) { + return internal::EqMatcher(x); +} // Constructs a Matcher from a 'value' of type T. The constructed // matcher matches any value that's equal to 'value'. template -Matcher::Matcher(T value) { *this = Eq(value); } +Matcher::Matcher(T value) { + *this = Eq(value); +} // Creates a monomorphic matcher that matches anything with type Lhs // and equal to rhs. A user may need to use this instead of Eq(...) @@ -892,7 +916,9 @@ Matcher::Matcher(T value) { *this = Eq(value); } // can always write Matcher(Lt(5)) to be explicit about the type, // for example. template -inline Matcher TypedEq(const Rhs& rhs) { return Eq(rhs); } +inline Matcher TypedEq(const Rhs& rhs) { + return Eq(rhs); +} // Creates a polymorphic matcher that matches anything >= x. template diff --git a/third_party/googletest/src/include/gtest/gtest-message.h b/third_party/googletest/src/include/gtest/gtest-message.h index becfd49fcb..6c8bf90009 100644 --- a/third_party/googletest/src/include/gtest/gtest-message.h +++ b/third_party/googletest/src/include/gtest/gtest-message.h @@ -27,7 +27,6 @@ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// // The Google C++ Testing and Mocking Framework (Google Test) // // This header file defines the Message class. @@ -42,7 +41,9 @@ // to CHANGE WITHOUT NOTICE. Therefore DO NOT DEPEND ON IT in a user // program! -// GOOGLETEST_CM0001 DO NOT DELETE +// IWYU pragma: private, include "gtest/gtest.h" +// IWYU pragma: friend gtest/.* +// IWYU pragma: friend gmock/.* #ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_MESSAGE_H_ #define GOOGLETEST_INCLUDE_GTEST_GTEST_MESSAGE_H_ @@ -110,8 +111,8 @@ class GTEST_API_ Message { // Streams a non-pointer value to this object. template - inline Message& operator <<(const T& val) { - // Some libraries overload << for STL containers. These + inline Message& operator<<(const T& val) { + // Some libraries overload << for STL containers. These // overloads are defined in the global namespace instead of ::std. // // C++'s symbol lookup rule (i.e. Koenig lookup) says that these @@ -125,7 +126,7 @@ class GTEST_API_ Message { // from the global namespace. With this using declaration, // overloads of << defined in the global namespace and those // visible via Koenig lookup are both exposed in this function. - using ::operator <<; + using ::operator<<; *ss_ << val; return *this; } @@ -144,7 +145,7 @@ class GTEST_API_ Message { // ensure consistent result across compilers, we always treat NULL // as "(null)". template - inline Message& operator <<(T* const& pointer) { // NOLINT + inline Message& operator<<(T* const& pointer) { // NOLINT if (pointer == nullptr) { *ss_ << "(null)"; } else { @@ -159,25 +160,23 @@ class GTEST_API_ Message { // templatized version above. Without this definition, streaming // endl or other basic IO manipulators to Message will confuse the // compiler. - Message& operator <<(BasicNarrowIoManip val) { + Message& operator<<(BasicNarrowIoManip val) { *ss_ << val; return *this; } // Instead of 1/0, we want to see true/false for bool values. - Message& operator <<(bool b) { - return *this << (b ? "true" : "false"); - } + Message& operator<<(bool b) { return *this << (b ? "true" : "false"); } // These two overloads allow streaming a wide C string to a Message // using the UTF-8 encoding. - Message& operator <<(const wchar_t* wide_c_str); - Message& operator <<(wchar_t* wide_c_str); + Message& operator<<(const wchar_t* wide_c_str); + Message& operator<<(wchar_t* wide_c_str); #if GTEST_HAS_STD_WSTRING // Converts the given wide string to a narrow string using the UTF-8 // encoding, and streams the result to this Message object. - Message& operator <<(const ::std::wstring& wstr); + Message& operator<<(const ::std::wstring& wstr); #endif // GTEST_HAS_STD_WSTRING // Gets the text streamed to this object so far as an std::string. @@ -196,7 +195,7 @@ class GTEST_API_ Message { }; // Streams a Message to an ostream. -inline std::ostream& operator <<(std::ostream& os, const Message& sb) { +inline std::ostream& operator<<(std::ostream& os, const Message& sb) { return os << sb.GetString(); } diff --git a/third_party/googletest/src/include/gtest/gtest-param-test.h b/third_party/googletest/src/include/gtest/gtest-param-test.h index 804e702817..b55119ac62 100644 --- a/third_party/googletest/src/include/gtest/gtest-param-test.h +++ b/third_party/googletest/src/include/gtest/gtest-param-test.h @@ -26,11 +26,14 @@ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// + // Macros and functions for implementing parameterized tests // in Google C++ Testing and Mocking Framework (Google Test) -// -// GOOGLETEST_CM0001 DO NOT DELETE + +// IWYU pragma: private, include "gtest/gtest.h" +// IWYU pragma: friend gtest/.* +// IWYU pragma: friend gmock/.* + #ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_ #define GOOGLETEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_ @@ -353,9 +356,7 @@ internal::ValueArray Values(T... v) { // } // INSTANTIATE_TEST_SUITE_P(BoolSequence, FlagDependentTest, Bool()); // -inline internal::ParamGenerator Bool() { - return Values(false, true); -} +inline internal::ParamGenerator Bool() { return Values(false, true); } // Combine() allows the user to combine two or more sequences to produce // values of a Cartesian product of those sequences' elements. @@ -428,8 +429,11 @@ internal::CartesianProductHolder Combine(const Generator&... g) { return 0; \ } \ static int gtest_registering_dummy_ GTEST_ATTRIBUTE_UNUSED_; \ - GTEST_DISALLOW_COPY_AND_ASSIGN_(GTEST_TEST_CLASS_NAME_(test_suite_name, \ - test_name)); \ + GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) \ + (const GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) &) = delete; \ + GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) & operator=( \ + const GTEST_TEST_CLASS_NAME_(test_suite_name, \ + test_name) &) = delete; /* NOLINT */ \ }; \ int GTEST_TEST_CLASS_NAME_(test_suite_name, \ test_name)::gtest_registering_dummy_ = \ @@ -453,43 +457,42 @@ internal::CartesianProductHolder Combine(const Generator&... g) { #define GTEST_GET_FIRST_(first, ...) first #define GTEST_GET_SECOND_(first, second, ...) second -#define INSTANTIATE_TEST_SUITE_P(prefix, test_suite_name, ...) \ - static ::testing::internal::ParamGenerator \ - gtest_##prefix##test_suite_name##_EvalGenerator_() { \ - return GTEST_EXPAND_(GTEST_GET_FIRST_(__VA_ARGS__, DUMMY_PARAM_)); \ - } \ - static ::std::string gtest_##prefix##test_suite_name##_EvalGenerateName_( \ - const ::testing::TestParamInfo& info) { \ - if (::testing::internal::AlwaysFalse()) { \ - ::testing::internal::TestNotEmpty(GTEST_EXPAND_(GTEST_GET_SECOND_( \ - __VA_ARGS__, \ - ::testing::internal::DefaultParamName, \ - DUMMY_PARAM_))); \ - auto t = std::make_tuple(__VA_ARGS__); \ - static_assert(std::tuple_size::value <= 2, \ - "Too Many Args!"); \ - } \ - return ((GTEST_EXPAND_(GTEST_GET_SECOND_( \ - __VA_ARGS__, \ - ::testing::internal::DefaultParamName, \ - DUMMY_PARAM_))))(info); \ - } \ - static int gtest_##prefix##test_suite_name##_dummy_ \ - GTEST_ATTRIBUTE_UNUSED_ = \ - ::testing::UnitTest::GetInstance() \ - ->parameterized_test_registry() \ - .GetTestSuitePatternHolder( \ - GTEST_STRINGIFY_(test_suite_name), \ - ::testing::internal::CodeLocation(__FILE__, __LINE__)) \ - ->AddTestSuiteInstantiation( \ - GTEST_STRINGIFY_(prefix), \ - >est_##prefix##test_suite_name##_EvalGenerator_, \ - >est_##prefix##test_suite_name##_EvalGenerateName_, \ +#define INSTANTIATE_TEST_SUITE_P(prefix, test_suite_name, ...) \ + static ::testing::internal::ParamGenerator \ + gtest_##prefix##test_suite_name##_EvalGenerator_() { \ + return GTEST_EXPAND_(GTEST_GET_FIRST_(__VA_ARGS__, DUMMY_PARAM_)); \ + } \ + static ::std::string gtest_##prefix##test_suite_name##_EvalGenerateName_( \ + const ::testing::TestParamInfo& info) { \ + if (::testing::internal::AlwaysFalse()) { \ + ::testing::internal::TestNotEmpty(GTEST_EXPAND_(GTEST_GET_SECOND_( \ + __VA_ARGS__, \ + ::testing::internal::DefaultParamName, \ + DUMMY_PARAM_))); \ + auto t = std::make_tuple(__VA_ARGS__); \ + static_assert(std::tuple_size::value <= 2, \ + "Too Many Args!"); \ + } \ + return ((GTEST_EXPAND_(GTEST_GET_SECOND_( \ + __VA_ARGS__, \ + ::testing::internal::DefaultParamName, \ + DUMMY_PARAM_))))(info); \ + } \ + static int gtest_##prefix##test_suite_name##_dummy_ \ + GTEST_ATTRIBUTE_UNUSED_ = \ + ::testing::UnitTest::GetInstance() \ + ->parameterized_test_registry() \ + .GetTestSuitePatternHolder( \ + GTEST_STRINGIFY_(test_suite_name), \ + ::testing::internal::CodeLocation(__FILE__, __LINE__)) \ + ->AddTestSuiteInstantiation( \ + GTEST_STRINGIFY_(prefix), \ + >est_##prefix##test_suite_name##_EvalGenerator_, \ + >est_##prefix##test_suite_name##_EvalGenerateName_, \ __FILE__, __LINE__) - // Allow Marking a Parameterized test class as not needing to be instantiated. -#define GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(T) \ +#define GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(T) \ namespace gtest_do_not_use_outside_namespace_scope {} \ static const ::testing::internal::MarkAsIgnored gtest_allow_ignore_##T( \ GTEST_STRINGIFY_(T)) diff --git a/third_party/googletest/src/include/gtest/gtest-printers.h b/third_party/googletest/src/include/gtest/gtest-printers.h index 076c9de1f4..a91e8b8b10 100644 --- a/third_party/googletest/src/include/gtest/gtest-printers.h +++ b/third_party/googletest/src/include/gtest/gtest-printers.h @@ -27,7 +27,6 @@ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - // Google Test - The Google C++ Testing and Mocking Framework // // This file implements a universal value printer that can print a @@ -95,7 +94,9 @@ // being defined as many user-defined container types don't have // value_type. -// GOOGLETEST_CM0001 DO NOT DELETE +// IWYU pragma: private, include "gtest/gtest.h" +// IWYU pragma: friend gtest/.* +// IWYU pragma: friend gmock/.* #ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_PRINTERS_H_ #define GOOGLETEST_INCLUDE_GTEST_GTEST_PRINTERS_H_ @@ -257,12 +258,10 @@ struct ConvertibleToStringViewPrinter { #endif }; - // Prints the given number of bytes in the given object to the given // ostream. GTEST_API_ void PrintBytesInObjectTo(const unsigned char* obj_bytes, - size_t count, - ::std::ostream* os); + size_t count, ::std::ostream* os); struct RawBytesPrinter { // SFINAE on `sizeof` to make sure we have a complete type. template @@ -360,7 +359,7 @@ GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(char); GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const char); GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(wchar_t); GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const wchar_t); -#ifdef __cpp_char8_t +#ifdef __cpp_lib_char8_t GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(char8_t); GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const char8_t); #endif @@ -375,12 +374,12 @@ GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const char32_t); // to point to a NUL-terminated string, and thus can print it as a string. #define GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(CharType, OtherStringType) \ - template <> \ - class FormatForComparison { \ - public: \ - static ::std::string Format(CharType* value) { \ - return ::testing::PrintToString(value); \ - } \ + template <> \ + class FormatForComparison { \ + public: \ + static ::std::string Format(CharType* value) { \ + return ::testing::PrintToString(value); \ + } \ } GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(char, ::std::string); @@ -410,8 +409,8 @@ GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const wchar_t, ::std::wstring); // // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. template -std::string FormatForComparisonFailureMessage( - const T1& value, const T2& /* other_operand */) { +std::string FormatForComparisonFailureMessage(const T1& value, + const T2& /* other_operand */) { return FormatForComparison::Format(value); } @@ -479,6 +478,12 @@ inline void PrintTo(char8_t c, ::std::ostream* os) { } #endif +// gcc/clang __{u,}int128_t +#if defined(__SIZEOF_INT128__) +GTEST_API_ void PrintTo(__uint128_t v, ::std::ostream* os); +GTEST_API_ void PrintTo(__int128_t v, ::std::ostream* os); +#endif // __SIZEOF_INT128__ + // Overloads for C strings. GTEST_API_ void PrintTo(const char* s, ::std::ostream* os); inline void PrintTo(char* s, ::std::ostream* os) { @@ -545,7 +550,7 @@ void PrintRawArrayTo(const T a[], size_t count, ::std::ostream* os) { } // Overloads for ::std::string. -GTEST_API_ void PrintStringTo(const ::std::string&s, ::std::ostream* os); +GTEST_API_ void PrintStringTo(const ::std::string& s, ::std::ostream* os); inline void PrintTo(const ::std::string& s, ::std::ostream* os) { PrintStringTo(s, os); } @@ -572,7 +577,7 @@ inline void PrintTo(const ::std::u32string& s, ::std::ostream* os) { // Overloads for ::std::wstring. #if GTEST_HAS_STD_WSTRING -GTEST_API_ void PrintWideStringTo(const ::std::wstring&s, ::std::ostream* os); +GTEST_API_ void PrintWideStringTo(const ::std::wstring& s, ::std::ostream* os); inline void PrintTo(const ::std::wstring& s, ::std::ostream* os) { PrintWideStringTo(s, os); } @@ -587,6 +592,12 @@ inline void PrintTo(internal::StringView sp, ::std::ostream* os) { inline void PrintTo(std::nullptr_t, ::std::ostream* os) { *os << "(nullptr)"; } +#if GTEST_HAS_RTTI +inline void PrintTo(const std::type_info& info, std::ostream* os) { + *os << internal::GetTypeName(info); +} +#endif // GTEST_HAS_RTTI + template void PrintTo(std::reference_wrapper ref, ::std::ostream* os) { UniversalPrinter::Print(ref.get(), os); @@ -744,6 +755,14 @@ class UniversalPrinter> { } }; +template <> +class UniversalPrinter { + public: + static void Print(decltype(Nullopt()), ::std::ostream* os) { + *os << "(nullopt)"; + } +}; + #endif // GTEST_INTERNAL_HAS_OPTIONAL #if GTEST_INTERNAL_HAS_VARIANT @@ -802,8 +821,8 @@ void UniversalPrintArray(const T* begin, size_t len, ::std::ostream* os) { } } // This overload prints a (const) char array compactly. -GTEST_API_ void UniversalPrintArray( - const char* begin, size_t len, ::std::ostream* os); +GTEST_API_ void UniversalPrintArray(const char* begin, size_t len, + ::std::ostream* os); #ifdef __cpp_char8_t // This overload prints a (const) char8_t array compactly. @@ -820,8 +839,8 @@ GTEST_API_ void UniversalPrintArray(const char32_t* begin, size_t len, ::std::ostream* os); // This overload prints a (const) wchar_t array compactly. -GTEST_API_ void UniversalPrintArray( - const wchar_t* begin, size_t len, ::std::ostream* os); +GTEST_API_ void UniversalPrintArray(const wchar_t* begin, size_t len, + ::std::ostream* os); // Implements printing an array type T[N]. template @@ -980,10 +999,10 @@ void UniversalPrint(const T& value, ::std::ostream* os) { UniversalPrinter::Print(value, os); } -typedef ::std::vector< ::std::string> Strings; +typedef ::std::vector<::std::string> Strings; - // Tersely prints the first N fields of a tuple to a string vector, - // one element for each field. +// Tersely prints the first N fields of a tuple to a string vector, +// one element for each field. template void TersePrintPrefixToStrings(const Tuple&, std::integral_constant, Strings*) {} diff --git a/third_party/googletest/src/include/gtest/gtest-spi.h b/third_party/googletest/src/include/gtest/gtest-spi.h index eacef44669..bec8c4810b 100644 --- a/third_party/googletest/src/include/gtest/gtest-spi.h +++ b/third_party/googletest/src/include/gtest/gtest-spi.h @@ -27,12 +27,9 @@ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// // Utilities for testing Google Test itself and code that uses Google Test // (e.g. frameworks built on top of Google Test). -// GOOGLETEST_CM0004 DO NOT DELETE - #ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_SPI_H_ #define GOOGLETEST_INCLUDE_GTEST_GTEST_SPI_H_ @@ -88,7 +85,10 @@ class GTEST_API_ ScopedFakeTestPartResultReporter TestPartResultReporterInterface* old_reporter_; TestPartResultArray* const result_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(ScopedFakeTestPartResultReporter); + ScopedFakeTestPartResultReporter(const ScopedFakeTestPartResultReporter&) = + delete; + ScopedFakeTestPartResultReporter& operator=( + const ScopedFakeTestPartResultReporter&) = delete; }; namespace internal { @@ -104,12 +104,14 @@ class GTEST_API_ SingleFailureChecker { SingleFailureChecker(const TestPartResultArray* results, TestPartResult::Type type, const std::string& substr); ~SingleFailureChecker(); + private: const TestPartResultArray* const results_; const TestPartResult::Type type_; const std::string substr_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(SingleFailureChecker); + SingleFailureChecker(const SingleFailureChecker&) = delete; + SingleFailureChecker& operator=(const SingleFailureChecker&) = delete; }; } // namespace internal @@ -119,7 +121,8 @@ class GTEST_API_ SingleFailureChecker { GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 // A set of macros for testing Google Test assertions or code that's expected -// to generate Google Test fatal failures. It verifies that the given +// to generate Google Test fatal failures (e.g. a failure from an ASSERT_EQ, but +// not a non-fatal failure, as from EXPECT_EQ). It verifies that the given // statement will cause exactly one fatal Google Test failure with 'substr' // being part of the failure message. // @@ -141,44 +144,46 @@ GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 // helper macro, due to some peculiarity in how the preprocessor // works. The AcceptsMacroThatExpandsToUnprotectedComma test in // gtest_unittest.cc will fail to compile if we do that. -#define EXPECT_FATAL_FAILURE(statement, substr) \ - do { \ - class GTestExpectFatalFailureHelper {\ - public:\ - static void Execute() { statement; }\ - };\ - ::testing::TestPartResultArray gtest_failures;\ - ::testing::internal::SingleFailureChecker gtest_checker(\ - >est_failures, ::testing::TestPartResult::kFatalFailure, (substr));\ - {\ - ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\ - ::testing::ScopedFakeTestPartResultReporter:: \ - INTERCEPT_ONLY_CURRENT_THREAD, >est_failures);\ - GTestExpectFatalFailureHelper::Execute();\ - }\ +#define EXPECT_FATAL_FAILURE(statement, substr) \ + do { \ + class GTestExpectFatalFailureHelper { \ + public: \ + static void Execute() { statement; } \ + }; \ + ::testing::TestPartResultArray gtest_failures; \ + ::testing::internal::SingleFailureChecker gtest_checker( \ + >est_failures, ::testing::TestPartResult::kFatalFailure, (substr)); \ + { \ + ::testing::ScopedFakeTestPartResultReporter gtest_reporter( \ + ::testing::ScopedFakeTestPartResultReporter:: \ + INTERCEPT_ONLY_CURRENT_THREAD, \ + >est_failures); \ + GTestExpectFatalFailureHelper::Execute(); \ + } \ } while (::testing::internal::AlwaysFalse()) -#define EXPECT_FATAL_FAILURE_ON_ALL_THREADS(statement, substr) \ - do { \ - class GTestExpectFatalFailureHelper {\ - public:\ - static void Execute() { statement; }\ - };\ - ::testing::TestPartResultArray gtest_failures;\ - ::testing::internal::SingleFailureChecker gtest_checker(\ - >est_failures, ::testing::TestPartResult::kFatalFailure, (substr));\ - {\ - ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\ - ::testing::ScopedFakeTestPartResultReporter:: \ - INTERCEPT_ALL_THREADS, >est_failures);\ - GTestExpectFatalFailureHelper::Execute();\ - }\ +#define EXPECT_FATAL_FAILURE_ON_ALL_THREADS(statement, substr) \ + do { \ + class GTestExpectFatalFailureHelper { \ + public: \ + static void Execute() { statement; } \ + }; \ + ::testing::TestPartResultArray gtest_failures; \ + ::testing::internal::SingleFailureChecker gtest_checker( \ + >est_failures, ::testing::TestPartResult::kFatalFailure, (substr)); \ + { \ + ::testing::ScopedFakeTestPartResultReporter gtest_reporter( \ + ::testing::ScopedFakeTestPartResultReporter::INTERCEPT_ALL_THREADS, \ + >est_failures); \ + GTestExpectFatalFailureHelper::Execute(); \ + } \ } while (::testing::internal::AlwaysFalse()) // A macro for testing Google Test assertions or code that's expected to -// generate Google Test non-fatal failures. It asserts that the given -// statement will cause exactly one non-fatal Google Test failure with 'substr' -// being part of the failure message. +// generate Google Test non-fatal failures (e.g. a failure from an EXPECT_EQ, +// but not from an ASSERT_EQ). It asserts that the given statement will cause +// exactly one non-fatal Google Test failure with 'substr' being part of the +// failure message. // // There are two different versions of this macro. EXPECT_NONFATAL_FAILURE only // affects and considers failures generated in the current thread and @@ -207,32 +212,37 @@ GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 // instead of // GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement) // to avoid an MSVC warning on unreachable code. -#define EXPECT_NONFATAL_FAILURE(statement, substr) \ - do {\ - ::testing::TestPartResultArray gtest_failures;\ - ::testing::internal::SingleFailureChecker gtest_checker(\ +#define EXPECT_NONFATAL_FAILURE(statement, substr) \ + do { \ + ::testing::TestPartResultArray gtest_failures; \ + ::testing::internal::SingleFailureChecker gtest_checker( \ >est_failures, ::testing::TestPartResult::kNonFatalFailure, \ - (substr));\ - {\ - ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\ - ::testing::ScopedFakeTestPartResultReporter:: \ - INTERCEPT_ONLY_CURRENT_THREAD, >est_failures);\ - if (::testing::internal::AlwaysTrue()) { statement; }\ - }\ + (substr)); \ + { \ + ::testing::ScopedFakeTestPartResultReporter gtest_reporter( \ + ::testing::ScopedFakeTestPartResultReporter:: \ + INTERCEPT_ONLY_CURRENT_THREAD, \ + >est_failures); \ + if (::testing::internal::AlwaysTrue()) { \ + statement; \ + } \ + } \ } while (::testing::internal::AlwaysFalse()) -#define EXPECT_NONFATAL_FAILURE_ON_ALL_THREADS(statement, substr) \ - do {\ - ::testing::TestPartResultArray gtest_failures;\ - ::testing::internal::SingleFailureChecker gtest_checker(\ - >est_failures, ::testing::TestPartResult::kNonFatalFailure, \ - (substr));\ - {\ - ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\ +#define EXPECT_NONFATAL_FAILURE_ON_ALL_THREADS(statement, substr) \ + do { \ + ::testing::TestPartResultArray gtest_failures; \ + ::testing::internal::SingleFailureChecker gtest_checker( \ + >est_failures, ::testing::TestPartResult::kNonFatalFailure, \ + (substr)); \ + { \ + ::testing::ScopedFakeTestPartResultReporter gtest_reporter( \ ::testing::ScopedFakeTestPartResultReporter::INTERCEPT_ALL_THREADS, \ - >est_failures);\ - if (::testing::internal::AlwaysTrue()) { statement; }\ - }\ + >est_failures); \ + if (::testing::internal::AlwaysTrue()) { \ + statement; \ + } \ + } \ } while (::testing::internal::AlwaysFalse()) #endif // GOOGLETEST_INCLUDE_GTEST_GTEST_SPI_H_ diff --git a/third_party/googletest/src/include/gtest/gtest-test-part.h b/third_party/googletest/src/include/gtest/gtest-test-part.h index 203fdf98c6..09cc8c34f0 100644 --- a/third_party/googletest/src/include/gtest/gtest-test-part.h +++ b/third_party/googletest/src/include/gtest/gtest-test-part.h @@ -26,14 +26,17 @@ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// GOOGLETEST_CM0001 DO NOT DELETE + +// IWYU pragma: private, include "gtest/gtest.h" +// IWYU pragma: friend gtest/.* +// IWYU pragma: friend gmock/.* #ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_TEST_PART_H_ #define GOOGLETEST_INCLUDE_GTEST_GTEST_TEST_PART_H_ #include #include + #include "gtest/internal/gtest-internal.h" #include "gtest/internal/gtest-string.h" @@ -142,7 +145,8 @@ class GTEST_API_ TestPartResultArray { private: std::vector array_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(TestPartResultArray); + TestPartResultArray(const TestPartResultArray&) = delete; + TestPartResultArray& operator=(const TestPartResultArray&) = delete; }; // This interface knows how to report a test part result. @@ -168,11 +172,13 @@ class GTEST_API_ HasNewFatalFailureHelper ~HasNewFatalFailureHelper() override; void ReportTestPartResult(const TestPartResult& result) override; bool has_new_fatal_failure() const { return has_new_fatal_failure_; } + private: bool has_new_fatal_failure_; TestPartResultReporterInterface* original_reporter_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(HasNewFatalFailureHelper); + HasNewFatalFailureHelper(const HasNewFatalFailureHelper&) = delete; + HasNewFatalFailureHelper& operator=(const HasNewFatalFailureHelper&) = delete; }; } // namespace internal diff --git a/third_party/googletest/src/include/gtest/gtest-typed-test.h b/third_party/googletest/src/include/gtest/gtest-typed-test.h index 9fdc6be10d..bd35a32660 100644 --- a/third_party/googletest/src/include/gtest/gtest-typed-test.h +++ b/third_party/googletest/src/include/gtest/gtest-typed-test.h @@ -27,7 +27,9 @@ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// GOOGLETEST_CM0001 DO NOT DELETE +// IWYU pragma: private, include "gtest/gtest.h" +// IWYU pragma: friend gtest/.* +// IWYU pragma: friend gmock/.* #ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_ #define GOOGLETEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_ @@ -190,7 +192,7 @@ INSTANTIATE_TYPED_TEST_SUITE_P(My, FooTest, MyTypes); typedef ::testing::internal::GenerateTypeList::type \ GTEST_TYPE_PARAMS_(CaseName); \ typedef ::testing::internal::NameGeneratorSelector<__VA_ARGS__>::type \ - GTEST_NAME_GENERATOR_(CaseName) + GTEST_NAME_GENERATOR_(CaseName) #define TYPED_TEST(CaseName, TestName) \ static_assert(sizeof(GTEST_STRINGIFY_(TestName)) > 1, \ @@ -256,7 +258,7 @@ INSTANTIATE_TYPED_TEST_SUITE_P(My, FooTest, MyTypes); // #included in multiple translation units linked together. #define TYPED_TEST_SUITE_P(SuiteName) \ static ::testing::internal::TypedTestSuitePState \ - GTEST_TYPED_TEST_SUITE_P_STATE_(SuiteName) + GTEST_TYPED_TEST_SUITE_P_STATE_(SuiteName) // Legacy API is deprecated but still available #ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ @@ -301,21 +303,21 @@ INSTANTIATE_TYPED_TEST_SUITE_P(My, FooTest, MyTypes); REGISTER_TYPED_TEST_SUITE_P #endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ -#define INSTANTIATE_TYPED_TEST_SUITE_P(Prefix, SuiteName, Types, ...) \ - static_assert(sizeof(GTEST_STRINGIFY_(Prefix)) > 1, \ - "test-suit-prefix must not be empty"); \ - static bool gtest_##Prefix##_##SuiteName GTEST_ATTRIBUTE_UNUSED_ = \ - ::testing::internal::TypeParameterizedTestSuite< \ - SuiteName, GTEST_SUITE_NAMESPACE_(SuiteName)::gtest_AllTests_, \ - ::testing::internal::GenerateTypeList::type>:: \ - Register(GTEST_STRINGIFY_(Prefix), \ - ::testing::internal::CodeLocation(__FILE__, __LINE__), \ - >EST_TYPED_TEST_SUITE_P_STATE_(SuiteName), \ - GTEST_STRINGIFY_(SuiteName), \ - GTEST_REGISTERED_TEST_NAMES_(SuiteName), \ - ::testing::internal::GenerateNames< \ - ::testing::internal::NameGeneratorSelector< \ - __VA_ARGS__>::type, \ +#define INSTANTIATE_TYPED_TEST_SUITE_P(Prefix, SuiteName, Types, ...) \ + static_assert(sizeof(GTEST_STRINGIFY_(Prefix)) > 1, \ + "test-suit-prefix must not be empty"); \ + static bool gtest_##Prefix##_##SuiteName GTEST_ATTRIBUTE_UNUSED_ = \ + ::testing::internal::TypeParameterizedTestSuite< \ + SuiteName, GTEST_SUITE_NAMESPACE_(SuiteName)::gtest_AllTests_, \ + ::testing::internal::GenerateTypeList::type>:: \ + Register(GTEST_STRINGIFY_(Prefix), \ + ::testing::internal::CodeLocation(__FILE__, __LINE__), \ + >EST_TYPED_TEST_SUITE_P_STATE_(SuiteName), \ + GTEST_STRINGIFY_(SuiteName), \ + GTEST_REGISTERED_TEST_NAMES_(SuiteName), \ + ::testing::internal::GenerateNames< \ + ::testing::internal::NameGeneratorSelector< \ + __VA_ARGS__>::type, \ ::testing::internal::GenerateTypeList::type>()) // Legacy API is deprecated but still available diff --git a/third_party/googletest/src/include/gtest/gtest.h b/third_party/googletest/src/include/gtest/gtest.h index 7a5d057c4a..d19a587a18 100644 --- a/third_party/googletest/src/include/gtest/gtest.h +++ b/third_party/googletest/src/include/gtest/gtest.h @@ -27,7 +27,6 @@ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// // The Google C++ Testing and Mocking Framework (Google Test) // // This header file defines the public API for Google Test. It should be @@ -47,8 +46,6 @@ // registration from Barthelemy Dagenais' (barthelemy@prologique.com) // easyUnit framework. -// GOOGLETEST_CM0001 DO NOT DELETE - #ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_H_ #define GOOGLETEST_INCLUDE_GTEST_GTEST_H_ @@ -59,31 +56,22 @@ #include #include -#include "gtest/internal/gtest-internal.h" -#include "gtest/internal/gtest-string.h" +#include "gtest/gtest-assertion-result.h" #include "gtest/gtest-death-test.h" #include "gtest/gtest-matchers.h" #include "gtest/gtest-message.h" #include "gtest/gtest-param-test.h" #include "gtest/gtest-printers.h" -#include "gtest/gtest_prod.h" #include "gtest/gtest-test-part.h" #include "gtest/gtest-typed-test.h" +#include "gtest/gtest_pred_impl.h" +#include "gtest/gtest_prod.h" +#include "gtest/internal/gtest-internal.h" +#include "gtest/internal/gtest-string.h" GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \ /* class A needs to have dll-interface to be used by clients of class B */) -namespace testing { - -// Silence C4100 (unreferenced formal parameter) and 4805 -// unsafe mix of type 'const int' and type 'const bool' -#ifdef _MSC_VER -# pragma warning(push) -# pragma warning(disable:4805) -# pragma warning(disable:4100) -#endif - - // Declares the flags. // This flag temporary enables the disabled tests. @@ -138,6 +126,12 @@ GTEST_DECLARE_int32_(random_seed); // is 1. If the value is -1 the tests are repeating forever. GTEST_DECLARE_int32_(repeat); +// This flag controls whether Google Test Environments are recreated for each +// repeat of the tests. The default value is true. If set to false the global +// test Environment objects are only set up once, for the first iteration, and +// only torn down once, for the last. +GTEST_DECLARE_bool_(recreate_environments_when_repeating); + // This flag controls whether Google Test includes Google Test internal // stack frames in failure stack traces. GTEST_DECLARE_bool_(show_internal_stack_frames); @@ -163,6 +157,16 @@ GTEST_DECLARE_string_(stream_result_to); GTEST_DECLARE_string_(flagfile); #endif // GTEST_USE_OWN_FLAGFILE_FLAG_ +namespace testing { + +// Silence C4100 (unreferenced formal parameter) and 4805 +// unsafe mix of type 'const int' and type 'const bool' +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 4805) +#pragma warning(disable : 4100) +#endif + // The upper limit for valid stack trace depths. const int kMaxStackTraceDepth = 100; @@ -201,193 +205,6 @@ using TestCase = TestSuite; class TestInfo; class UnitTest; -// A class for indicating whether an assertion was successful. When -// the assertion wasn't successful, the AssertionResult object -// remembers a non-empty message that describes how it failed. -// -// To create an instance of this class, use one of the factory functions -// (AssertionSuccess() and AssertionFailure()). -// -// This class is useful for two purposes: -// 1. Defining predicate functions to be used with Boolean test assertions -// EXPECT_TRUE/EXPECT_FALSE and their ASSERT_ counterparts -// 2. Defining predicate-format functions to be -// used with predicate assertions (ASSERT_PRED_FORMAT*, etc). -// -// For example, if you define IsEven predicate: -// -// testing::AssertionResult IsEven(int n) { -// if ((n % 2) == 0) -// return testing::AssertionSuccess(); -// else -// return testing::AssertionFailure() << n << " is odd"; -// } -// -// Then the failed expectation EXPECT_TRUE(IsEven(Fib(5))) -// will print the message -// -// Value of: IsEven(Fib(5)) -// Actual: false (5 is odd) -// Expected: true -// -// instead of a more opaque -// -// Value of: IsEven(Fib(5)) -// Actual: false -// Expected: true -// -// in case IsEven is a simple Boolean predicate. -// -// If you expect your predicate to be reused and want to support informative -// messages in EXPECT_FALSE and ASSERT_FALSE (negative assertions show up -// about half as often as positive ones in our tests), supply messages for -// both success and failure cases: -// -// testing::AssertionResult IsEven(int n) { -// if ((n % 2) == 0) -// return testing::AssertionSuccess() << n << " is even"; -// else -// return testing::AssertionFailure() << n << " is odd"; -// } -// -// Then a statement EXPECT_FALSE(IsEven(Fib(6))) will print -// -// Value of: IsEven(Fib(6)) -// Actual: true (8 is even) -// Expected: false -// -// NB: Predicates that support negative Boolean assertions have reduced -// performance in positive ones so be careful not to use them in tests -// that have lots (tens of thousands) of positive Boolean assertions. -// -// To use this class with EXPECT_PRED_FORMAT assertions such as: -// -// // Verifies that Foo() returns an even number. -// EXPECT_PRED_FORMAT1(IsEven, Foo()); -// -// you need to define: -// -// testing::AssertionResult IsEven(const char* expr, int n) { -// if ((n % 2) == 0) -// return testing::AssertionSuccess(); -// else -// return testing::AssertionFailure() -// << "Expected: " << expr << " is even\n Actual: it's " << n; -// } -// -// If Foo() returns 5, you will see the following message: -// -// Expected: Foo() is even -// Actual: it's 5 -// -class GTEST_API_ AssertionResult { - public: - // Copy constructor. - // Used in EXPECT_TRUE/FALSE(assertion_result). - AssertionResult(const AssertionResult& other); - -// C4800 is a level 3 warning in Visual Studio 2015 and earlier. -// This warning is not emitted in Visual Studio 2017. -// This warning is off by default starting in Visual Studio 2019 but can be -// enabled with command-line options. -#if defined(_MSC_VER) && (_MSC_VER < 1910 || _MSC_VER >= 1920) - GTEST_DISABLE_MSC_WARNINGS_PUSH_(4800 /* forcing value to bool */) -#endif - - // Used in the EXPECT_TRUE/FALSE(bool_expression). - // - // T must be contextually convertible to bool. - // - // The second parameter prevents this overload from being considered if - // the argument is implicitly convertible to AssertionResult. In that case - // we want AssertionResult's copy constructor to be used. - template - explicit AssertionResult( - const T& success, - typename std::enable_if< - !std::is_convertible::value>::type* - /*enabler*/ - = nullptr) - : success_(success) {} - -#if defined(_MSC_VER) && (_MSC_VER < 1910 || _MSC_VER >= 1920) - GTEST_DISABLE_MSC_WARNINGS_POP_() -#endif - - // Assignment operator. - AssertionResult& operator=(AssertionResult other) { - swap(other); - return *this; - } - - // Returns true if and only if the assertion succeeded. - operator bool() const { return success_; } // NOLINT - - // Returns the assertion's negation. Used with EXPECT/ASSERT_FALSE. - AssertionResult operator!() const; - - // Returns the text streamed into this AssertionResult. Test assertions - // use it when they fail (i.e., the predicate's outcome doesn't match the - // assertion's expectation). When nothing has been streamed into the - // object, returns an empty string. - const char* message() const { - return message_.get() != nullptr ? message_->c_str() : ""; - } - // Deprecated; please use message() instead. - const char* failure_message() const { return message(); } - - // Streams a custom failure message into this object. - template AssertionResult& operator<<(const T& value) { - AppendMessage(Message() << value); - return *this; - } - - // Allows streaming basic output manipulators such as endl or flush into - // this object. - AssertionResult& operator<<( - ::std::ostream& (*basic_manipulator)(::std::ostream& stream)) { - AppendMessage(Message() << basic_manipulator); - return *this; - } - - private: - // Appends the contents of message to message_. - void AppendMessage(const Message& a_message) { - if (message_.get() == nullptr) message_.reset(new ::std::string); - message_->append(a_message.GetString().c_str()); - } - - // Swap the contents of this AssertionResult with other. - void swap(AssertionResult& other); - - // Stores result of the assertion predicate. - bool success_; - // Stores the message describing the condition in case the expectation - // construct is not satisfied with the predicate's outcome. - // Referenced via a pointer to avoid taking too much stack frame space - // with test assertions. - std::unique_ptr< ::std::string> message_; -}; - -// Makes a successful assertion result. -GTEST_API_ AssertionResult AssertionSuccess(); - -// Makes a failed assertion result. -GTEST_API_ AssertionResult AssertionFailure(); - -// Makes a failed assertion result with the given failure message. -// Deprecated; use AssertionFailure() << msg. -GTEST_API_ AssertionResult AssertionFailure(const Message& msg); - -} // namespace testing - -// Includes the auto-generated header that implements a family of generic -// predicate assertion macros. This include comes late because it relies on -// APIs declared above. -#include "gtest/gtest_pred_impl.h" - -namespace testing { - // The abstract class that all tests inherit from. // // In Google Test, a unit test program contains one or many TestSuites, and @@ -522,7 +339,8 @@ class GTEST_API_ Test { virtual Setup_should_be_spelled_SetUp* Setup() { return nullptr; } // We disallow copying Tests. - GTEST_DISALLOW_COPY_AND_ASSIGN_(Test); + Test(const Test&) = delete; + Test& operator=(const Test&) = delete; }; typedef internal::TimeInMillis TimeInMillis; @@ -536,24 +354,17 @@ class TestProperty { // C'tor. TestProperty does NOT have a default constructor. // Always use this constructor (with parameters) to create a // TestProperty object. - TestProperty(const std::string& a_key, const std::string& a_value) : - key_(a_key), value_(a_value) { - } + TestProperty(const std::string& a_key, const std::string& a_value) + : key_(a_key), value_(a_value) {} // Gets the user supplied key. - const char* key() const { - return key_.c_str(); - } + const char* key() const { return key_.c_str(); } // Gets the user supplied value. - const char* value() const { - return value_.c_str(); - } + const char* value() const { return value_.c_str(); } // Sets a new value, overriding the one supplied in the constructor. - void SetValue(const std::string& new_value) { - value_ = new_value; - } + void SetValue(const std::string& new_value) { value_ = new_value; } private: // The key supplied by the user. @@ -687,7 +498,8 @@ class GTEST_API_ TestResult { TimeInMillis elapsed_time_; // We disallow copying TestResult. - GTEST_DISALLOW_COPY_AND_ASSIGN_(TestResult); + TestResult(const TestResult&) = delete; + TestResult& operator=(const TestResult&) = delete; }; // class TestResult // A TestInfo object stores the following information about a test: @@ -811,8 +623,8 @@ class GTEST_API_ TestInfo { } // These fields are immutable properties of the test. - const std::string test_suite_name_; // test suite name - const std::string name_; // Test name + const std::string test_suite_name_; // test suite name + const std::string name_; // Test name // Name of the parameter type, or NULL if this is not a typed or a // type-parameterized test. const std::unique_ptr type_param_; @@ -833,7 +645,8 @@ class GTEST_API_ TestInfo { // test for the second time. TestResult result_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(TestInfo); + TestInfo(const TestInfo&) = delete; + TestInfo& operator=(const TestInfo&) = delete; }; // A test suite, which consists of a vector of TestInfos. @@ -941,7 +754,7 @@ class GTEST_API_ TestSuite { // Adds a TestInfo to this test suite. Will delete the TestInfo upon // destruction of the TestSuite object. - void AddTestInfo(TestInfo * test_info); + void AddTestInfo(TestInfo* test_info); // Clears the results of all tests in this test suite. void ClearResult(); @@ -1042,7 +855,8 @@ class GTEST_API_ TestSuite { TestResult ad_hoc_test_result_; // We disallow copying TestSuites. - GTEST_DISALLOW_COPY_AND_ASSIGN_(TestSuite); + TestSuite(const TestSuite&) = delete; + TestSuite& operator=(const TestSuite&) = delete; }; // An Environment object is capable of setting up and tearing down an @@ -1069,6 +883,7 @@ class Environment { // Override this to define how to tear down the environment. virtual void TearDown() {} + private: // If you see an error about overriding the following function or // about it being private, you have mis-spelled SetUp() as Setup(). @@ -1120,6 +935,9 @@ class TestEventListener { // Fired before the test starts. virtual void OnTestStart(const TestInfo& test_info) = 0; + // Fired when a test is disabled + virtual void OnTestDisabled(const TestInfo& /*test_info*/) {} + // Fired after a failed assertion or a SUCCEED() invocation. // If you want to throw an exception from this function to skip to the next // TEST, it must be AssertionException defined above, or inherited from it. @@ -1143,8 +961,7 @@ class TestEventListener { virtual void OnEnvironmentsTearDownEnd(const UnitTest& unit_test) = 0; // Fired after each iteration of tests finishes. - virtual void OnTestIterationEnd(const UnitTest& unit_test, - int iteration) = 0; + virtual void OnTestIterationEnd(const UnitTest& unit_test, int iteration) = 0; // Fired after all test activities have ended. virtual void OnTestProgramEnd(const UnitTest& unit_test) = 0; @@ -1169,6 +986,7 @@ class EmptyTestEventListener : public TestEventListener { #endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ void OnTestStart(const TestInfo& /*test_info*/) override {} + void OnTestDisabled(const TestInfo& /*test_info*/) override {} void OnTestPartResult(const TestPartResult& /*test_part_result*/) override {} void OnTestEnd(const TestInfo& /*test_info*/) override {} void OnTestSuiteEnd(const TestSuite& /*test_suite*/) override {} @@ -1258,7 +1076,8 @@ class GTEST_API_ TestEventListeners { TestEventListener* default_xml_generator_; // We disallow copying TestEventListeners. - GTEST_DISALLOW_COPY_AND_ASSIGN_(TestEventListeners); + TestEventListeners(const TestEventListeners&) = delete; + TestEventListeners& operator=(const TestEventListeners&) = delete; }; // A UnitTest consists of a vector of TestSuites. @@ -1301,8 +1120,7 @@ class GTEST_API_ UnitTest { // Returns the TestInfo object for the test that's currently running, // or NULL if no test is running. - const TestInfo* current_test_info() const - GTEST_LOCK_EXCLUDED_(mutex_); + const TestInfo* current_test_info() const GTEST_LOCK_EXCLUDED_(mutex_); // Returns the random seed used at the start of the current test run. int random_seed() const; @@ -1408,8 +1226,7 @@ class GTEST_API_ UnitTest { // eventually call this to report their results. The user code // should use the assertion macros instead of calling this directly. void AddTestPartResult(TestPartResult::Type result_type, - const char* file_name, - int line_number, + const char* file_name, int line_number, const std::string& message, const std::string& os_stack_trace) GTEST_LOCK_EXCLUDED_(mutex_); @@ -1440,8 +1257,7 @@ class GTEST_API_ UnitTest { friend std::set* internal::GetIgnoredParameterizedTestSuites(); friend internal::UnitTestImpl* internal::GetUnitTestImpl(); friend void internal::ReportFailureInUnknownLocation( - TestPartResult::Type result_type, - const std::string& message); + TestPartResult::Type result_type, const std::string& message); // Creates an empty UnitTest. UnitTest(); @@ -1455,8 +1271,7 @@ class GTEST_API_ UnitTest { GTEST_LOCK_EXCLUDED_(mutex_); // Pops a trace from the per-thread Google Test trace stack. - void PopGTestTrace() - GTEST_LOCK_EXCLUDED_(mutex_); + void PopGTestTrace() GTEST_LOCK_EXCLUDED_(mutex_); // Protects mutable state in *impl_. This is mutable as some const // methods need to lock it too. @@ -1469,7 +1284,8 @@ class GTEST_API_ UnitTest { internal::UnitTestImpl* impl_; // We disallow copying UnitTest. - GTEST_DISALLOW_COPY_AND_ASSIGN_(UnitTest); + UnitTest(const UnitTest&) = delete; + UnitTest& operator=(const UnitTest&) = delete; }; // A convenient wrapper for adding an environment for the test @@ -1520,13 +1336,11 @@ namespace internal { // when calling EXPECT_* in a tight loop. template AssertionResult CmpHelperEQFailure(const char* lhs_expression, - const char* rhs_expression, - const T1& lhs, const T2& rhs) { - return EqFailure(lhs_expression, - rhs_expression, + const char* rhs_expression, const T1& lhs, + const T2& rhs) { + return EqFailure(lhs_expression, rhs_expression, FormatForComparisonFailureMessage(lhs, rhs), - FormatForComparisonFailureMessage(rhs, lhs), - false); + FormatForComparisonFailureMessage(rhs, lhs), false); } // This block of code defines operator==/!= @@ -1539,8 +1353,7 @@ inline bool operator!=(faketype, faketype) { return false; } // The helper function for {ASSERT|EXPECT}_EQ. template AssertionResult CmpHelperEQ(const char* lhs_expression, - const char* rhs_expression, - const T1& lhs, + const char* rhs_expression, const T1& lhs, const T2& rhs) { if (lhs == rhs) { return AssertionSuccess(); @@ -1571,8 +1384,7 @@ class EqHelper { // Even though its body looks the same as the above version, we // cannot merge the two, as it will make anonymous enums unhappy. static AssertionResult Compare(const char* lhs_expression, - const char* rhs_expression, - BiggestInt lhs, + const char* rhs_expression, BiggestInt lhs, BiggestInt rhs) { return CmpHelperEQ(lhs_expression, rhs_expression, lhs, rhs); } @@ -1607,16 +1419,16 @@ AssertionResult CmpHelperOpFailure(const char* expr1, const char* expr2, // // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. -#define GTEST_IMPL_CMP_HELPER_(op_name, op)\ -template \ -AssertionResult CmpHelper##op_name(const char* expr1, const char* expr2, \ - const T1& val1, const T2& val2) {\ - if (val1 op val2) {\ - return AssertionSuccess();\ - } else {\ - return CmpHelperOpFailure(expr1, expr2, val1, val2, #op);\ - }\ -} +#define GTEST_IMPL_CMP_HELPER_(op_name, op) \ + template \ + AssertionResult CmpHelper##op_name(const char* expr1, const char* expr2, \ + const T1& val1, const T2& val2) { \ + if (val1 op val2) { \ + return AssertionSuccess(); \ + } else { \ + return CmpHelperOpFailure(expr1, expr2, val1, val2, #op); \ + } \ + } // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. @@ -1638,49 +1450,42 @@ GTEST_IMPL_CMP_HELPER_(GT, >) // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. GTEST_API_ AssertionResult CmpHelperSTREQ(const char* s1_expression, const char* s2_expression, - const char* s1, - const char* s2); + const char* s1, const char* s2); // The helper function for {ASSERT|EXPECT}_STRCASEEQ. // // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. GTEST_API_ AssertionResult CmpHelperSTRCASEEQ(const char* s1_expression, const char* s2_expression, - const char* s1, - const char* s2); + const char* s1, const char* s2); // The helper function for {ASSERT|EXPECT}_STRNE. // // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. GTEST_API_ AssertionResult CmpHelperSTRNE(const char* s1_expression, const char* s2_expression, - const char* s1, - const char* s2); + const char* s1, const char* s2); // The helper function for {ASSERT|EXPECT}_STRCASENE. // // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. GTEST_API_ AssertionResult CmpHelperSTRCASENE(const char* s1_expression, const char* s2_expression, - const char* s1, - const char* s2); - + const char* s1, const char* s2); // Helper function for *_STREQ on wide strings. // // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. GTEST_API_ AssertionResult CmpHelperSTREQ(const char* s1_expression, const char* s2_expression, - const wchar_t* s1, - const wchar_t* s2); + const wchar_t* s1, const wchar_t* s2); // Helper function for *_STRNE on wide strings. // // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. GTEST_API_ AssertionResult CmpHelperSTRNE(const char* s1_expression, const char* s2_expression, - const wchar_t* s1, - const wchar_t* s2); + const wchar_t* s1, const wchar_t* s2); } // namespace internal @@ -1692,32 +1497,40 @@ GTEST_API_ AssertionResult CmpHelperSTRNE(const char* s1_expression, // // The {needle,haystack}_expr arguments are the stringified // expressions that generated the two real arguments. -GTEST_API_ AssertionResult IsSubstring( - const char* needle_expr, const char* haystack_expr, - const char* needle, const char* haystack); -GTEST_API_ AssertionResult IsSubstring( - const char* needle_expr, const char* haystack_expr, - const wchar_t* needle, const wchar_t* haystack); -GTEST_API_ AssertionResult IsNotSubstring( - const char* needle_expr, const char* haystack_expr, - const char* needle, const char* haystack); -GTEST_API_ AssertionResult IsNotSubstring( - const char* needle_expr, const char* haystack_expr, - const wchar_t* needle, const wchar_t* haystack); -GTEST_API_ AssertionResult IsSubstring( - const char* needle_expr, const char* haystack_expr, - const ::std::string& needle, const ::std::string& haystack); -GTEST_API_ AssertionResult IsNotSubstring( - const char* needle_expr, const char* haystack_expr, - const ::std::string& needle, const ::std::string& haystack); +GTEST_API_ AssertionResult IsSubstring(const char* needle_expr, + const char* haystack_expr, + const char* needle, + const char* haystack); +GTEST_API_ AssertionResult IsSubstring(const char* needle_expr, + const char* haystack_expr, + const wchar_t* needle, + const wchar_t* haystack); +GTEST_API_ AssertionResult IsNotSubstring(const char* needle_expr, + const char* haystack_expr, + const char* needle, + const char* haystack); +GTEST_API_ AssertionResult IsNotSubstring(const char* needle_expr, + const char* haystack_expr, + const wchar_t* needle, + const wchar_t* haystack); +GTEST_API_ AssertionResult IsSubstring(const char* needle_expr, + const char* haystack_expr, + const ::std::string& needle, + const ::std::string& haystack); +GTEST_API_ AssertionResult IsNotSubstring(const char* needle_expr, + const char* haystack_expr, + const ::std::string& needle, + const ::std::string& haystack); #if GTEST_HAS_STD_WSTRING -GTEST_API_ AssertionResult IsSubstring( - const char* needle_expr, const char* haystack_expr, - const ::std::wstring& needle, const ::std::wstring& haystack); -GTEST_API_ AssertionResult IsNotSubstring( - const char* needle_expr, const char* haystack_expr, - const ::std::wstring& needle, const ::std::wstring& haystack); +GTEST_API_ AssertionResult IsSubstring(const char* needle_expr, + const char* haystack_expr, + const ::std::wstring& needle, + const ::std::wstring& haystack); +GTEST_API_ AssertionResult IsNotSubstring(const char* needle_expr, + const char* haystack_expr, + const ::std::wstring& needle, + const ::std::wstring& haystack); #endif // GTEST_HAS_STD_WSTRING namespace internal { @@ -1732,8 +1545,7 @@ namespace internal { template AssertionResult CmpHelperFloatingPointEQ(const char* lhs_expression, const char* rhs_expression, - RawType lhs_value, - RawType rhs_value) { + RawType lhs_value, RawType rhs_value) { const FloatingPoint lhs(lhs_value), rhs(rhs_value); if (lhs.AlmostEquals(rhs)) { @@ -1748,10 +1560,8 @@ AssertionResult CmpHelperFloatingPointEQ(const char* lhs_expression, rhs_ss << std::setprecision(std::numeric_limits::digits10 + 2) << rhs_value; - return EqFailure(lhs_expression, - rhs_expression, - StringStreamToString(&lhs_ss), - StringStreamToString(&rhs_ss), + return EqFailure(lhs_expression, rhs_expression, + StringStreamToString(&lhs_ss), StringStreamToString(&rhs_ss), false); } @@ -1761,8 +1571,7 @@ AssertionResult CmpHelperFloatingPointEQ(const char* lhs_expression, GTEST_API_ AssertionResult DoubleNearPredFormat(const char* expr1, const char* expr2, const char* abs_error_expr, - double val1, - double val2, + double val1, double val2, double abs_error); // INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. @@ -1770,9 +1579,7 @@ GTEST_API_ AssertionResult DoubleNearPredFormat(const char* expr1, class GTEST_API_ AssertHelper { public: // Constructor. - AssertHelper(TestPartResult::Type type, - const char* file, - int line, + AssertHelper(TestPartResult::Type type, const char* file, int line, const char* message); ~AssertHelper(); @@ -1786,11 +1593,9 @@ class GTEST_API_ AssertHelper { // re-using stack space even for temporary variables, so every EXPECT_EQ // reserves stack space for another AssertHelper. struct AssertHelperData { - AssertHelperData(TestPartResult::Type t, - const char* srcfile, - int line_num, + AssertHelperData(TestPartResult::Type t, const char* srcfile, int line_num, const char* msg) - : type(t), file(srcfile), line(line_num), message(msg) { } + : type(t), file(srcfile), line(line_num), message(msg) {} TestPartResult::Type const type; const char* const file; @@ -1798,12 +1603,14 @@ class GTEST_API_ AssertHelper { std::string const message; private: - GTEST_DISALLOW_COPY_AND_ASSIGN_(AssertHelperData); + AssertHelperData(const AssertHelperData&) = delete; + AssertHelperData& operator=(const AssertHelperData&) = delete; }; AssertHelperData* const data_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(AssertHelper); + AssertHelper(const AssertHelper&) = delete; + AssertHelper& operator=(const AssertHelper&) = delete; }; } // namespace internal @@ -1860,15 +1667,14 @@ class WithParamInterface { private: // Sets parameter value. The caller is responsible for making sure the value // remains alive and unchanged throughout the current test. - static void SetParam(const ParamType* parameter) { - parameter_ = parameter; - } + static void SetParam(const ParamType* parameter) { parameter_ = parameter; } // Static value used for accessing parameter during a test lifetime. static const ParamType* parameter_; // TestClass must be a subclass of WithParamInterface and Test. - template friend class internal::ParameterizedTestFactory; + template + friend class internal::ParameterizedTestFactory; }; template @@ -1878,8 +1684,7 @@ const T* WithParamInterface::parameter_ = nullptr; // WithParamInterface, and can just inherit from ::testing::TestWithParam. template -class TestWithParam : public Test, public WithParamInterface { -}; +class TestWithParam : public Test, public WithParamInterface {}; // Macros for indicating success/failure in test code. @@ -1910,7 +1715,7 @@ class TestWithParam : public Test, public WithParamInterface { // Generates a nonfatal failure at the given source file location with // a generic message. -#define ADD_FAILURE_AT(file, line) \ +#define ADD_FAILURE_AT(file, line) \ GTEST_MESSAGE_AT_(file, line, "Failed", \ ::testing::TestPartResult::kNonFatalFailure) @@ -1925,7 +1730,7 @@ class TestWithParam : public Test, public WithParamInterface { // Define this macro to 1 to omit the definition of FAIL(), which is a // generic name and clashes with some other libraries. #if !GTEST_DONT_DEFINE_FAIL -# define FAIL() GTEST_FAIL() +#define FAIL() GTEST_FAIL() #endif // Generates a success with a generic message. @@ -1934,7 +1739,7 @@ class TestWithParam : public Test, public WithParamInterface { // Define this macro to 1 to omit the definition of SUCCEED(), which // is a generic name and clashes with some other libraries. #if !GTEST_DONT_DEFINE_SUCCEED -# define SUCCEED() GTEST_SUCCEED() +#define SUCCEED() GTEST_SUCCEED() #endif // Macros for testing exceptions. @@ -1962,16 +1767,15 @@ class TestWithParam : public Test, public WithParamInterface { // Boolean assertions. Condition can be either a Boolean expression or an // AssertionResult. For more information on how to use AssertionResult with // these macros see comments on that class. -#define GTEST_EXPECT_TRUE(condition) \ +#define GTEST_EXPECT_TRUE(condition) \ GTEST_TEST_BOOLEAN_(condition, #condition, false, true, \ GTEST_NONFATAL_FAILURE_) -#define GTEST_EXPECT_FALSE(condition) \ +#define GTEST_EXPECT_FALSE(condition) \ GTEST_TEST_BOOLEAN_(!(condition), #condition, true, false, \ GTEST_NONFATAL_FAILURE_) #define GTEST_ASSERT_TRUE(condition) \ - GTEST_TEST_BOOLEAN_(condition, #condition, false, true, \ - GTEST_FATAL_FAILURE_) -#define GTEST_ASSERT_FALSE(condition) \ + GTEST_TEST_BOOLEAN_(condition, #condition, false, true, GTEST_FATAL_FAILURE_) +#define GTEST_ASSERT_FALSE(condition) \ GTEST_TEST_BOOLEAN_(!(condition), #condition, true, false, \ GTEST_FATAL_FAILURE_) @@ -2070,27 +1874,27 @@ class TestWithParam : public Test, public WithParamInterface { // ASSERT_XY(), which clashes with some users' own code. #if !GTEST_DONT_DEFINE_ASSERT_EQ -# define ASSERT_EQ(val1, val2) GTEST_ASSERT_EQ(val1, val2) +#define ASSERT_EQ(val1, val2) GTEST_ASSERT_EQ(val1, val2) #endif #if !GTEST_DONT_DEFINE_ASSERT_NE -# define ASSERT_NE(val1, val2) GTEST_ASSERT_NE(val1, val2) +#define ASSERT_NE(val1, val2) GTEST_ASSERT_NE(val1, val2) #endif #if !GTEST_DONT_DEFINE_ASSERT_LE -# define ASSERT_LE(val1, val2) GTEST_ASSERT_LE(val1, val2) +#define ASSERT_LE(val1, val2) GTEST_ASSERT_LE(val1, val2) #endif #if !GTEST_DONT_DEFINE_ASSERT_LT -# define ASSERT_LT(val1, val2) GTEST_ASSERT_LT(val1, val2) +#define ASSERT_LT(val1, val2) GTEST_ASSERT_LT(val1, val2) #endif #if !GTEST_DONT_DEFINE_ASSERT_GE -# define ASSERT_GE(val1, val2) GTEST_ASSERT_GE(val1, val2) +#define ASSERT_GE(val1, val2) GTEST_ASSERT_GE(val1, val2) #endif #if !GTEST_DONT_DEFINE_ASSERT_GT -# define ASSERT_GT(val1, val2) GTEST_ASSERT_GT(val1, val2) +#define ASSERT_GT(val1, val2) GTEST_ASSERT_GT(val1, val2) #endif // C-string Comparisons. All tests treat NULL and any non-NULL string @@ -2115,7 +1919,7 @@ class TestWithParam : public Test, public WithParamInterface { EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTRNE, s1, s2) #define EXPECT_STRCASEEQ(s1, s2) \ EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASEEQ, s1, s2) -#define EXPECT_STRCASENE(s1, s2)\ +#define EXPECT_STRCASENE(s1, s2) \ EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASENE, s1, s2) #define ASSERT_STREQ(s1, s2) \ @@ -2124,7 +1928,7 @@ class TestWithParam : public Test, public WithParamInterface { ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTRNE, s1, s2) #define ASSERT_STRCASEEQ(s1, s2) \ ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASEEQ, s1, s2) -#define ASSERT_STRCASENE(s1, s2)\ +#define ASSERT_STRCASENE(s1, s2) \ ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASENE, s1, s2) // Macros for comparing floating-point numbers. @@ -2141,29 +1945,29 @@ class TestWithParam : public Test, public WithParamInterface { // FloatingPoint template class in gtest-internal.h if you are // interested in the implementation details. -#define EXPECT_FLOAT_EQ(val1, val2)\ +#define EXPECT_FLOAT_EQ(val1, val2) \ EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ, \ val1, val2) -#define EXPECT_DOUBLE_EQ(val1, val2)\ +#define EXPECT_DOUBLE_EQ(val1, val2) \ EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ, \ val1, val2) -#define ASSERT_FLOAT_EQ(val1, val2)\ +#define ASSERT_FLOAT_EQ(val1, val2) \ ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ, \ val1, val2) -#define ASSERT_DOUBLE_EQ(val1, val2)\ +#define ASSERT_DOUBLE_EQ(val1, val2) \ ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ, \ val1, val2) -#define EXPECT_NEAR(val1, val2, abs_error)\ - EXPECT_PRED_FORMAT3(::testing::internal::DoubleNearPredFormat, \ - val1, val2, abs_error) +#define EXPECT_NEAR(val1, val2, abs_error) \ + EXPECT_PRED_FORMAT3(::testing::internal::DoubleNearPredFormat, val1, val2, \ + abs_error) -#define ASSERT_NEAR(val1, val2, abs_error)\ - ASSERT_PRED_FORMAT3(::testing::internal::DoubleNearPredFormat, \ - val1, val2, abs_error) +#define ASSERT_NEAR(val1, val2, abs_error) \ + ASSERT_PRED_FORMAT3(::testing::internal::DoubleNearPredFormat, val1, val2, \ + abs_error) // These predicate format functions work on floating-point values, and // can be used in {ASSERT|EXPECT}_PRED_FORMAT2*(), e.g. @@ -2177,7 +1981,6 @@ GTEST_API_ AssertionResult FloatLE(const char* expr1, const char* expr2, GTEST_API_ AssertionResult DoubleLE(const char* expr1, const char* expr2, double val1, double val2); - #if GTEST_OS_WINDOWS // Macros that test for HRESULT failure and success, these are only useful @@ -2189,17 +1992,17 @@ GTEST_API_ AssertionResult DoubleLE(const char* expr1, const char* expr2, // expected result and the actual result with both a human-readable // string representation of the error, if available, as well as the // hex result code. -# define EXPECT_HRESULT_SUCCEEDED(expr) \ - EXPECT_PRED_FORMAT1(::testing::internal::IsHRESULTSuccess, (expr)) +#define EXPECT_HRESULT_SUCCEEDED(expr) \ + EXPECT_PRED_FORMAT1(::testing::internal::IsHRESULTSuccess, (expr)) -# define ASSERT_HRESULT_SUCCEEDED(expr) \ - ASSERT_PRED_FORMAT1(::testing::internal::IsHRESULTSuccess, (expr)) +#define ASSERT_HRESULT_SUCCEEDED(expr) \ + ASSERT_PRED_FORMAT1(::testing::internal::IsHRESULTSuccess, (expr)) -# define EXPECT_HRESULT_FAILED(expr) \ - EXPECT_PRED_FORMAT1(::testing::internal::IsHRESULTFailure, (expr)) +#define EXPECT_HRESULT_FAILED(expr) \ + EXPECT_PRED_FORMAT1(::testing::internal::IsHRESULTFailure, (expr)) -# define ASSERT_HRESULT_FAILED(expr) \ - ASSERT_PRED_FORMAT1(::testing::internal::IsHRESULTFailure, (expr)) +#define ASSERT_HRESULT_FAILED(expr) \ + ASSERT_PRED_FORMAT1(::testing::internal::IsHRESULTFailure, (expr)) #endif // GTEST_OS_WINDOWS @@ -2214,9 +2017,9 @@ GTEST_API_ AssertionResult DoubleLE(const char* expr1, const char* expr2, // ASSERT_NO_FATAL_FAILURE(Process()) << "Process() failed"; // #define ASSERT_NO_FATAL_FAILURE(statement) \ - GTEST_TEST_NO_FATAL_FAILURE_(statement, GTEST_FATAL_FAILURE_) + GTEST_TEST_NO_FATAL_FAILURE_(statement, GTEST_FATAL_FAILURE_) #define EXPECT_NO_FATAL_FAILURE(statement) \ - GTEST_TEST_NO_FATAL_FAILURE_(statement, GTEST_NONFATAL_FAILURE_) + GTEST_TEST_NO_FATAL_FAILURE_(statement, GTEST_NONFATAL_FAILURE_) // Causes a trace (including the given source file path and line number, // and the given message) to be included in every test failure message generated @@ -2258,7 +2061,8 @@ class GTEST_API_ ScopedTrace { private: void PushTrace(const char* file, int line, std::string message); - GTEST_DISALLOW_COPY_AND_ASSIGN_(ScopedTrace); + ScopedTrace(const ScopedTrace&) = delete; + ScopedTrace& operator=(const ScopedTrace&) = delete; } GTEST_ATTRIBUTE_UNUSED_; // A ScopedTrace object does its job in its // c'tor and d'tor. Therefore it doesn't // need to be used otherwise. @@ -2278,9 +2082,9 @@ class GTEST_API_ ScopedTrace { // Assuming that each thread maintains its own stack of traces. // Therefore, a SCOPED_TRACE() would (correctly) only affect the // assertions in its own thread. -#define SCOPED_TRACE(message) \ - ::testing::ScopedTrace GTEST_CONCAT_TOKEN_(gtest_trace_, __LINE__)(\ - __FILE__, __LINE__, (message)) +#define SCOPED_TRACE(message) \ + ::testing::ScopedTrace GTEST_CONCAT_TOKEN_(gtest_trace_, __LINE__)( \ + __FILE__, __LINE__, (message)) // Compile-time assertion for type equality. // StaticAssertTypeEq() compiles if and only if type1 and type2 @@ -2378,20 +2182,19 @@ constexpr bool StaticAssertTypeEq() noexcept { // EXPECT_EQ(a_.size(), 0); // EXPECT_EQ(b_.size(), 1); // } -// -// GOOGLETEST_CM0011 DO NOT DELETE -#if !GTEST_DONT_DEFINE_TEST -#define TEST_F(test_fixture, test_name)\ +#define GTEST_TEST_F(test_fixture, test_name) \ GTEST_TEST_(test_fixture, test_name, test_fixture, \ ::testing::internal::GetTypeId()) -#endif // !GTEST_DONT_DEFINE_TEST +#if !GTEST_DONT_DEFINE_TEST_F +#define TEST_F(test_fixture, test_name) GTEST_TEST_F(test_fixture, test_name) +#endif // Returns a path to temporary directory. // Tries to determine an appropriate directory for the platform. GTEST_API_ std::string TempDir(); #ifdef _MSC_VER -# pragma warning(pop) +#pragma warning(pop) #endif // Dynamically registers a test with the framework. @@ -2445,6 +2248,7 @@ GTEST_API_ std::string TempDir(); // } // ... // int main(int argc, char** argv) { +// ::testing::InitGoogleTest(&argc, argv); // std::vector values_to_test = LoadValuesFromConfig(); // RegisterMyTests(values_to_test); // ... @@ -2486,9 +2290,7 @@ TestInfo* RegisterTest(const char* test_suite_name, const char* test_name, // namespace and has an all-caps name. int RUN_ALL_TESTS() GTEST_MUST_USE_RESULT_; -inline int RUN_ALL_TESTS() { - return ::testing::UnitTest::GetInstance()->Run(); -} +inline int RUN_ALL_TESTS() { return ::testing::UnitTest::GetInstance()->Run(); } GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 diff --git a/third_party/googletest/src/include/gtest/gtest_pred_impl.h b/third_party/googletest/src/include/gtest/gtest_pred_impl.h index 5029a9bb02..47a24aa687 100644 --- a/third_party/googletest/src/include/gtest/gtest_pred_impl.h +++ b/third_party/googletest/src/include/gtest/gtest_pred_impl.h @@ -26,17 +26,19 @@ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -// This file is AUTOMATICALLY GENERATED on 01/02/2019 by command -// 'gen_gtest_pred_impl.py 5'. DO NOT EDIT BY HAND! // // Implements a family of generic predicate assertion macros. -// GOOGLETEST_CM0001 DO NOT DELETE + +// IWYU pragma: private, include "gtest/gtest.h" +// IWYU pragma: friend gtest/.* +// IWYU pragma: friend gmock/.* #ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_ #define GOOGLETEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_ -#include "gtest/gtest.h" +#include "gtest/gtest-assertion-result.h" +#include "gtest/internal/gtest-internal.h" +#include "gtest/internal/gtest-port.h" namespace testing { @@ -72,22 +74,18 @@ namespace testing { // GTEST_ASSERT_ is the basic statement to which all of the assertions // in this file reduce. Don't use this in your code. -#define GTEST_ASSERT_(expression, on_failure) \ - GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ +#define GTEST_ASSERT_(expression, on_failure) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ if (const ::testing::AssertionResult gtest_ar = (expression)) \ - ; \ - else \ + ; \ + else \ on_failure(gtest_ar.failure_message()) - // Helper function for implementing {EXPECT|ASSERT}_PRED1. Don't use // this in your code. -template -AssertionResult AssertPred1Helper(const char* pred_text, - const char* e1, - Pred pred, - const T1& v1) { +template +AssertionResult AssertPred1Helper(const char* pred_text, const char* e1, + Pred pred, const T1& v1) { if (pred(v1)) return AssertionSuccess(); return AssertionFailure() @@ -98,40 +96,27 @@ AssertionResult AssertPred1Helper(const char* pred_text, // Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT1. // Don't use this in your code. -#define GTEST_PRED_FORMAT1_(pred_format, v1, on_failure)\ - GTEST_ASSERT_(pred_format(#v1, v1), \ - on_failure) +#define GTEST_PRED_FORMAT1_(pred_format, v1, on_failure) \ + GTEST_ASSERT_(pred_format(#v1, v1), on_failure) // Internal macro for implementing {EXPECT|ASSERT}_PRED1. Don't use // this in your code. -#define GTEST_PRED1_(pred, v1, on_failure)\ - GTEST_ASSERT_(::testing::AssertPred1Helper(#pred, \ - #v1, \ - pred, \ - v1), on_failure) +#define GTEST_PRED1_(pred, v1, on_failure) \ + GTEST_ASSERT_(::testing::AssertPred1Helper(#pred, #v1, pred, v1), on_failure) // Unary predicate assertion macros. #define EXPECT_PRED_FORMAT1(pred_format, v1) \ GTEST_PRED_FORMAT1_(pred_format, v1, GTEST_NONFATAL_FAILURE_) -#define EXPECT_PRED1(pred, v1) \ - GTEST_PRED1_(pred, v1, GTEST_NONFATAL_FAILURE_) +#define EXPECT_PRED1(pred, v1) GTEST_PRED1_(pred, v1, GTEST_NONFATAL_FAILURE_) #define ASSERT_PRED_FORMAT1(pred_format, v1) \ GTEST_PRED_FORMAT1_(pred_format, v1, GTEST_FATAL_FAILURE_) -#define ASSERT_PRED1(pred, v1) \ - GTEST_PRED1_(pred, v1, GTEST_FATAL_FAILURE_) - - +#define ASSERT_PRED1(pred, v1) GTEST_PRED1_(pred, v1, GTEST_FATAL_FAILURE_) // Helper function for implementing {EXPECT|ASSERT}_PRED2. Don't use // this in your code. -template -AssertionResult AssertPred2Helper(const char* pred_text, - const char* e1, - const char* e2, - Pred pred, - const T1& v1, +template +AssertionResult AssertPred2Helper(const char* pred_text, const char* e1, + const char* e2, Pred pred, const T1& v1, const T2& v2) { if (pred(v1, v2)) return AssertionSuccess(); @@ -145,19 +130,14 @@ AssertionResult AssertPred2Helper(const char* pred_text, // Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT2. // Don't use this in your code. -#define GTEST_PRED_FORMAT2_(pred_format, v1, v2, on_failure)\ - GTEST_ASSERT_(pred_format(#v1, #v2, v1, v2), \ - on_failure) +#define GTEST_PRED_FORMAT2_(pred_format, v1, v2, on_failure) \ + GTEST_ASSERT_(pred_format(#v1, #v2, v1, v2), on_failure) // Internal macro for implementing {EXPECT|ASSERT}_PRED2. Don't use // this in your code. -#define GTEST_PRED2_(pred, v1, v2, on_failure)\ - GTEST_ASSERT_(::testing::AssertPred2Helper(#pred, \ - #v1, \ - #v2, \ - pred, \ - v1, \ - v2), on_failure) +#define GTEST_PRED2_(pred, v1, v2, on_failure) \ + GTEST_ASSERT_(::testing::AssertPred2Helper(#pred, #v1, #v2, pred, v1, v2), \ + on_failure) // Binary predicate assertion macros. #define EXPECT_PRED_FORMAT2(pred_format, v1, v2) \ @@ -169,22 +149,12 @@ AssertionResult AssertPred2Helper(const char* pred_text, #define ASSERT_PRED2(pred, v1, v2) \ GTEST_PRED2_(pred, v1, v2, GTEST_FATAL_FAILURE_) - - // Helper function for implementing {EXPECT|ASSERT}_PRED3. Don't use // this in your code. -template -AssertionResult AssertPred3Helper(const char* pred_text, - const char* e1, - const char* e2, - const char* e3, - Pred pred, - const T1& v1, - const T2& v2, - const T3& v3) { +template +AssertionResult AssertPred3Helper(const char* pred_text, const char* e1, + const char* e2, const char* e3, Pred pred, + const T1& v1, const T2& v2, const T3& v3) { if (pred(v1, v2, v3)) return AssertionSuccess(); return AssertionFailure() @@ -198,21 +168,15 @@ AssertionResult AssertPred3Helper(const char* pred_text, // Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT3. // Don't use this in your code. -#define GTEST_PRED_FORMAT3_(pred_format, v1, v2, v3, on_failure)\ - GTEST_ASSERT_(pred_format(#v1, #v2, #v3, v1, v2, v3), \ - on_failure) +#define GTEST_PRED_FORMAT3_(pred_format, v1, v2, v3, on_failure) \ + GTEST_ASSERT_(pred_format(#v1, #v2, #v3, v1, v2, v3), on_failure) // Internal macro for implementing {EXPECT|ASSERT}_PRED3. Don't use // this in your code. -#define GTEST_PRED3_(pred, v1, v2, v3, on_failure)\ - GTEST_ASSERT_(::testing::AssertPred3Helper(#pred, \ - #v1, \ - #v2, \ - #v3, \ - pred, \ - v1, \ - v2, \ - v3), on_failure) +#define GTEST_PRED3_(pred, v1, v2, v3, on_failure) \ + GTEST_ASSERT_( \ + ::testing::AssertPred3Helper(#pred, #v1, #v2, #v3, pred, v1, v2, v3), \ + on_failure) // Ternary predicate assertion macros. #define EXPECT_PRED_FORMAT3(pred_format, v1, v2, v3) \ @@ -224,25 +188,13 @@ AssertionResult AssertPred3Helper(const char* pred_text, #define ASSERT_PRED3(pred, v1, v2, v3) \ GTEST_PRED3_(pred, v1, v2, v3, GTEST_FATAL_FAILURE_) - - // Helper function for implementing {EXPECT|ASSERT}_PRED4. Don't use // this in your code. -template -AssertionResult AssertPred4Helper(const char* pred_text, - const char* e1, - const char* e2, - const char* e3, - const char* e4, - Pred pred, - const T1& v1, - const T2& v2, - const T3& v3, - const T4& v4) { +template +AssertionResult AssertPred4Helper(const char* pred_text, const char* e1, + const char* e2, const char* e3, + const char* e4, Pred pred, const T1& v1, + const T2& v2, const T3& v3, const T4& v4) { if (pred(v1, v2, v3, v4)) return AssertionSuccess(); return AssertionFailure() @@ -257,23 +209,15 @@ AssertionResult AssertPred4Helper(const char* pred_text, // Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT4. // Don't use this in your code. -#define GTEST_PRED_FORMAT4_(pred_format, v1, v2, v3, v4, on_failure)\ - GTEST_ASSERT_(pred_format(#v1, #v2, #v3, #v4, v1, v2, v3, v4), \ - on_failure) +#define GTEST_PRED_FORMAT4_(pred_format, v1, v2, v3, v4, on_failure) \ + GTEST_ASSERT_(pred_format(#v1, #v2, #v3, #v4, v1, v2, v3, v4), on_failure) // Internal macro for implementing {EXPECT|ASSERT}_PRED4. Don't use // this in your code. -#define GTEST_PRED4_(pred, v1, v2, v3, v4, on_failure)\ - GTEST_ASSERT_(::testing::AssertPred4Helper(#pred, \ - #v1, \ - #v2, \ - #v3, \ - #v4, \ - pred, \ - v1, \ - v2, \ - v3, \ - v4), on_failure) +#define GTEST_PRED4_(pred, v1, v2, v3, v4, on_failure) \ + GTEST_ASSERT_(::testing::AssertPred4Helper(#pred, #v1, #v2, #v3, #v4, pred, \ + v1, v2, v3, v4), \ + on_failure) // 4-ary predicate assertion macros. #define EXPECT_PRED_FORMAT4(pred_format, v1, v2, v3, v4) \ @@ -285,28 +229,15 @@ AssertionResult AssertPred4Helper(const char* pred_text, #define ASSERT_PRED4(pred, v1, v2, v3, v4) \ GTEST_PRED4_(pred, v1, v2, v3, v4, GTEST_FATAL_FAILURE_) - - // Helper function for implementing {EXPECT|ASSERT}_PRED5. Don't use // this in your code. -template -AssertionResult AssertPred5Helper(const char* pred_text, - const char* e1, - const char* e2, - const char* e3, - const char* e4, - const char* e5, - Pred pred, - const T1& v1, - const T2& v2, - const T3& v3, - const T4& v4, - const T5& v5) { +AssertionResult AssertPred5Helper(const char* pred_text, const char* e1, + const char* e2, const char* e3, + const char* e4, const char* e5, Pred pred, + const T1& v1, const T2& v2, const T3& v3, + const T4& v4, const T5& v5) { if (pred(v1, v2, v3, v4, v5)) return AssertionSuccess(); return AssertionFailure() @@ -322,25 +253,16 @@ AssertionResult AssertPred5Helper(const char* pred_text, // Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT5. // Don't use this in your code. -#define GTEST_PRED_FORMAT5_(pred_format, v1, v2, v3, v4, v5, on_failure)\ +#define GTEST_PRED_FORMAT5_(pred_format, v1, v2, v3, v4, v5, on_failure) \ GTEST_ASSERT_(pred_format(#v1, #v2, #v3, #v4, #v5, v1, v2, v3, v4, v5), \ on_failure) // Internal macro for implementing {EXPECT|ASSERT}_PRED5. Don't use // this in your code. -#define GTEST_PRED5_(pred, v1, v2, v3, v4, v5, on_failure)\ - GTEST_ASSERT_(::testing::AssertPred5Helper(#pred, \ - #v1, \ - #v2, \ - #v3, \ - #v4, \ - #v5, \ - pred, \ - v1, \ - v2, \ - v3, \ - v4, \ - v5), on_failure) +#define GTEST_PRED5_(pred, v1, v2, v3, v4, v5, on_failure) \ + GTEST_ASSERT_(::testing::AssertPred5Helper(#pred, #v1, #v2, #v3, #v4, #v5, \ + pred, v1, v2, v3, v4, v5), \ + on_failure) // 5-ary predicate assertion macros. #define EXPECT_PRED_FORMAT5(pred_format, v1, v2, v3, v4, v5) \ @@ -352,8 +274,6 @@ AssertionResult AssertPred5Helper(const char* pred_text, #define ASSERT_PRED5(pred, v1, v2, v3, v4, v5) \ GTEST_PRED5_(pred, v1, v2, v3, v4, v5, GTEST_FATAL_FAILURE_) - - } // namespace testing #endif // GOOGLETEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_ diff --git a/third_party/googletest/src/include/gtest/gtest_prod.h b/third_party/googletest/src/include/gtest/gtest_prod.h index 38b9d85a51..1f37dc31c3 100644 --- a/third_party/googletest/src/include/gtest/gtest_prod.h +++ b/third_party/googletest/src/include/gtest/gtest_prod.h @@ -27,9 +27,8 @@ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Google C++ Testing and Mocking Framework definitions useful in production code. -// GOOGLETEST_CM0003 DO NOT DELETE +// Google C++ Testing and Mocking Framework definitions useful in production +// code. #ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_PROD_H_ #define GOOGLETEST_INCLUDE_GTEST_GTEST_PROD_H_ @@ -55,7 +54,7 @@ // Note: The test class must be in the same namespace as the class being tested. // For example, putting MyClassTest in an anonymous namespace will not work. -#define FRIEND_TEST(test_case_name, test_name)\ -friend class test_case_name##_##test_name##_Test +#define FRIEND_TEST(test_case_name, test_name) \ + friend class test_case_name##_##test_name##_Test #endif // GOOGLETEST_INCLUDE_GTEST_GTEST_PROD_H_ diff --git a/third_party/googletest/src/include/gtest/internal/custom/README.md b/third_party/googletest/src/include/gtest/internal/custom/README.md index ff391fb4e2..cb49e2c754 100644 --- a/third_party/googletest/src/include/gtest/internal/custom/README.md +++ b/third_party/googletest/src/include/gtest/internal/custom/README.md @@ -15,18 +15,6 @@ The custom directory is an injection point for custom user configurations. The following macros can be defined: -### Flag related macros: - -* `GTEST_FLAG(flag_name)` -* `GTEST_USE_OWN_FLAGFILE_FLAG_` - Define to 0 when the system provides its - own flagfile flag parsing. -* `GTEST_DECLARE_bool_(name)` -* `GTEST_DECLARE_int32_(name)` -* `GTEST_DECLARE_string_(name)` -* `GTEST_DEFINE_bool_(name, default_val, doc)` -* `GTEST_DEFINE_int32_(name, default_val, doc)` -* `GTEST_DEFINE_string_(name, default_val, doc)` - ### Logging: * `GTEST_LOG_(severity)` diff --git a/third_party/googletest/src/include/gtest/internal/custom/gtest-port.h b/third_party/googletest/src/include/gtest/internal/custom/gtest-port.h index db02881c0c..9b7fb4261a 100644 --- a/third_party/googletest/src/include/gtest/internal/custom/gtest-port.h +++ b/third_party/googletest/src/include/gtest/internal/custom/gtest-port.h @@ -34,4 +34,35 @@ #ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PORT_H_ #define GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PORT_H_ +// Use a stub Notification class. +// +// The built-in Notification class in GoogleTest v1.12.1 uses std::mutex and +// std::condition_variable. The and headers of +// mingw32 g++ (GNU 10.0.0) define std::mutex and std::condition_variable only +// when configured with the posix threads option but don't define them when +// configured with the win32 threads option. The Notification class is only +// used in GoogleTest's internal tests. Since we don't build GoogleTest's +// internal tests, we don't need a working Notification class. Although it's +// not hard to fix the mingw32 g++ compilation errors by implementing the +// Notification class using Windows CRITICAL_SECTION and CONDITION_VARIABLE, +// it's simpler to just use a stub Notification class on all platforms. +// +// The default constructor of the stub class is deleted and the declaration of +// the Notify() method is commented out, so that compilation will fail if any +// code actually uses the Notification class. + +#define GTEST_HAS_NOTIFICATION_ 1 +namespace testing { +namespace internal { +class Notification { + public: + Notification() = delete; + Notification(const Notification&) = delete; + Notification& operator=(const Notification&) = delete; + // void Notify(); + void WaitForNotification() {} +}; +} // namespace internal +} // namespace testing + #endif // GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PORT_H_ diff --git a/third_party/googletest/src/include/gtest/internal/gtest-death-test-internal.h b/third_party/googletest/src/include/gtest/internal/gtest-death-test-internal.h index 490296dfad..45580ae805 100644 --- a/third_party/googletest/src/include/gtest/internal/gtest-death-test-internal.h +++ b/third_party/googletest/src/include/gtest/internal/gtest-death-test-internal.h @@ -26,27 +26,31 @@ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// + // The Google C++ Testing and Mocking Framework (Google Test) // // This header file defines internal utilities needed for implementing // death tests. They are subject to change without notice. -// GOOGLETEST_CM0001 DO NOT DELETE + +// IWYU pragma: private, include "gtest/gtest.h" +// IWYU pragma: friend gtest/.* +// IWYU pragma: friend gmock/.* #ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_ #define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_ +#include + +#include + #include "gtest/gtest-matchers.h" #include "gtest/internal/gtest-internal.h" -#include -#include +GTEST_DECLARE_string_(internal_run_death_test); namespace testing { namespace internal { -GTEST_DECLARE_string_(internal_run_death_test); - // Names of the flags (needed for parsing Google Test flags). const char kDeathTestStyleFlag[] = "death_test_style"; const char kDeathTestUseFork[] = "death_test_use_fork"; @@ -83,16 +87,18 @@ class GTEST_API_ DeathTest { static bool Create(const char* statement, Matcher matcher, const char* file, int line, DeathTest** test); DeathTest(); - virtual ~DeathTest() { } + virtual ~DeathTest() {} // A helper class that aborts a death test when it's deleted. class ReturnSentinel { public: - explicit ReturnSentinel(DeathTest* test) : test_(test) { } + explicit ReturnSentinel(DeathTest* test) : test_(test) {} ~ReturnSentinel() { test_->Abort(TEST_ENCOUNTERED_RETURN_STATEMENT); } + private: DeathTest* const test_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(ReturnSentinel); + ReturnSentinel(const ReturnSentinel&) = delete; + ReturnSentinel& operator=(const ReturnSentinel&) = delete; } GTEST_ATTRIBUTE_UNUSED_; // An enumeration of possible roles that may be taken when a death @@ -137,7 +143,8 @@ class GTEST_API_ DeathTest { // A string containing a description of the outcome of the last death test. static std::string last_death_test_message_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(DeathTest); + DeathTest(const DeathTest&) = delete; + DeathTest& operator=(const DeathTest&) = delete; }; GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 @@ -145,7 +152,7 @@ GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 // Factory interface for death tests. May be mocked out for testing. class DeathTestFactory { public: - virtual ~DeathTestFactory() { } + virtual ~DeathTestFactory() {} virtual bool Create(const char* statement, Matcher matcher, const char* file, int line, DeathTest** test) = 0; @@ -186,28 +193,28 @@ inline Matcher MakeDeathTestMatcher( // Traps C++ exceptions escaping statement and reports them as test // failures. Note that trapping SEH exceptions is not implemented here. -# if GTEST_HAS_EXCEPTIONS -# define GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, death_test) \ - try { \ - GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ - } catch (const ::std::exception& gtest_exception) { \ - fprintf(\ - stderr, \ - "\n%s: Caught std::exception-derived exception escaping the " \ - "death test statement. Exception message: %s\n", \ +#if GTEST_HAS_EXCEPTIONS +#define GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, death_test) \ + try { \ + GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ + } catch (const ::std::exception& gtest_exception) { \ + fprintf( \ + stderr, \ + "\n%s: Caught std::exception-derived exception escaping the " \ + "death test statement. Exception message: %s\n", \ ::testing::internal::FormatFileLocation(__FILE__, __LINE__).c_str(), \ - gtest_exception.what()); \ - fflush(stderr); \ + gtest_exception.what()); \ + fflush(stderr); \ death_test->Abort(::testing::internal::DeathTest::TEST_THREW_EXCEPTION); \ - } catch (...) { \ + } catch (...) { \ death_test->Abort(::testing::internal::DeathTest::TEST_THREW_EXCEPTION); \ } -# else -# define GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, death_test) \ +#else +#define GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, death_test) \ GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement) -# endif +#endif // This macro is for implementing ASSERT_DEATH*, EXPECT_DEATH*, // ASSERT_EXIT*, and EXPECT_EXIT*. @@ -236,8 +243,6 @@ inline Matcher MakeDeathTestMatcher( gtest_dt->Abort(::testing::internal::DeathTest::TEST_DID_NOT_DIE); \ break; \ } \ - default: \ - break; \ } \ } \ } else \ @@ -265,16 +270,12 @@ inline Matcher MakeDeathTestMatcher( // RUN_ALL_TESTS was called. class InternalRunDeathTestFlag { public: - InternalRunDeathTestFlag(const std::string& a_file, - int a_line, - int an_index, + InternalRunDeathTestFlag(const std::string& a_file, int a_line, int an_index, int a_write_fd) - : file_(a_file), line_(a_line), index_(an_index), - write_fd_(a_write_fd) {} + : file_(a_file), line_(a_line), index_(an_index), write_fd_(a_write_fd) {} ~InternalRunDeathTestFlag() { - if (write_fd_ >= 0) - posix::Close(write_fd_); + if (write_fd_ >= 0) posix::Close(write_fd_); } const std::string& file() const { return file_; } @@ -288,7 +289,8 @@ class InternalRunDeathTestFlag { int index_; int write_fd_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(InternalRunDeathTestFlag); + InternalRunDeathTestFlag(const InternalRunDeathTestFlag&) = delete; + InternalRunDeathTestFlag& operator=(const InternalRunDeathTestFlag&) = delete; }; // Returns a newly created InternalRunDeathTestFlag object with fields diff --git a/third_party/googletest/src/include/gtest/internal/gtest-filepath.h b/third_party/googletest/src/include/gtest/internal/gtest-filepath.h index 0c033abc34..a2a60a962b 100644 --- a/third_party/googletest/src/include/gtest/internal/gtest-filepath.h +++ b/third_party/googletest/src/include/gtest/internal/gtest-filepath.h @@ -26,7 +26,7 @@ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// + // Google Test filepath utilities // // This header file declares classes and functions used internally by @@ -35,7 +35,9 @@ // This file is #included in gtest/internal/gtest-internal.h. // Do not include this header file separately! -// GOOGLETEST_CM0001 DO NOT DELETE +// IWYU pragma: private, include "gtest/gtest.h" +// IWYU pragma: friend gtest/.* +// IWYU pragma: friend gmock/.* #ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_ #define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_ @@ -61,8 +63,8 @@ namespace internal { class GTEST_API_ FilePath { public: - FilePath() : pathname_("") { } - FilePath(const FilePath& rhs) : pathname_(rhs.pathname_) { } + FilePath() : pathname_("") {} + FilePath(const FilePath& rhs) : pathname_(rhs.pathname_) {} explicit FilePath(const std::string& pathname) : pathname_(pathname) { Normalize(); @@ -73,9 +75,7 @@ class GTEST_API_ FilePath { return *this; } - void Set(const FilePath& rhs) { - pathname_ = rhs.pathname_; - } + void Set(const FilePath& rhs) { pathname_ = rhs.pathname_; } const std::string& string() const { return pathname_; } const char* c_str() const { return pathname_.c_str(); } @@ -88,8 +88,7 @@ class GTEST_API_ FilePath { // than zero (e.g., 12), returns "dir/test_12.xml". // On Windows platform, uses \ as the separator rather than /. static FilePath MakeFileName(const FilePath& directory, - const FilePath& base_name, - int number, + const FilePath& base_name, int number, const char* extension); // Given directory = "dir", relative_path = "test.xml", diff --git a/third_party/googletest/src/include/gtest/internal/gtest-internal.h b/third_party/googletest/src/include/gtest/internal/gtest-internal.h index f8cbdbd81d..9b04e4c85f 100644 --- a/third_party/googletest/src/include/gtest/internal/gtest-internal.h +++ b/third_party/googletest/src/include/gtest/internal/gtest-internal.h @@ -26,13 +26,15 @@ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// + // The Google C++ Testing and Mocking Framework (Google Test) // // This header file declares functions and macros used internally by // Google Test. They are subject to change without notice. -// GOOGLETEST_CM0001 DO NOT DELETE +// IWYU pragma: private, include "gtest/gtest.h" +// IWYU pragma: friend gtest/.* +// IWYU pragma: friend gmock/.* #ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_ #define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_ @@ -40,19 +42,20 @@ #include "gtest/internal/gtest-port.h" #if GTEST_OS_LINUX -# include -# include -# include -# include +#include +#include +#include +#include #endif // GTEST_OS_LINUX #if GTEST_HAS_EXCEPTIONS -# include +#include #endif #include #include #include + #include #include #include @@ -76,7 +79,7 @@ // the current line number. For more details, see // http://www.parashift.com/c++-faq-lite/misc-technical-issues.html#faq-39.6 #define GTEST_CONCAT_TOKEN_(foo, bar) GTEST_CONCAT_TOKEN_IMPL_(foo, bar) -#define GTEST_CONCAT_TOKEN_IMPL_(foo, bar) foo ## bar +#define GTEST_CONCAT_TOKEN_IMPL_(foo, bar) foo##bar // Stringifies its argument. // Work around a bug in visual studio which doesn't accept code like this: @@ -98,21 +101,21 @@ namespace testing { // Forward declarations. -class AssertionResult; // Result of an assertion. -class Message; // Represents a failure message. -class Test; // Represents a test. -class TestInfo; // Information about a test. -class TestPartResult; // Result of a test part. -class UnitTest; // A collection of test suites. +class AssertionResult; // Result of an assertion. +class Message; // Represents a failure message. +class Test; // Represents a test. +class TestInfo; // Information about a test. +class TestPartResult; // Result of a test part. +class UnitTest; // A collection of test suites. template ::std::string PrintToString(const T& value); namespace internal { -struct TraceInfo; // Information about a trace point. -class TestInfoImpl; // Opaque implementation of TestInfo -class UnitTestImpl; // Opaque implementation of UnitTest +struct TraceInfo; // Information about a trace point. +class TestInfoImpl; // Opaque implementation of TestInfo +class UnitTestImpl; // Opaque implementation of UnitTest // The text used in failure messages to indicate the start of the // stack trace. @@ -121,6 +124,7 @@ GTEST_API_ extern const char kStackTraceMarker[]; // An IgnoredValue object can be implicitly constructed from ANY value. class IgnoredValue { struct Sink {}; + public: // This constructor template allows any value to be implicitly // converted to IgnoredValue. The object has no data member and @@ -136,13 +140,13 @@ class IgnoredValue { }; // Appends the user-supplied message to the Google-Test-generated message. -GTEST_API_ std::string AppendUserMessage( - const std::string& gtest_msg, const Message& user_msg); +GTEST_API_ std::string AppendUserMessage(const std::string& gtest_msg, + const Message& user_msg); #if GTEST_HAS_EXCEPTIONS -GTEST_DISABLE_MSC_WARNINGS_PUSH_(4275 \ -/* an exported class was derived from a class that was not exported */) +GTEST_DISABLE_MSC_WARNINGS_PUSH_( + 4275 /* an exported class was derived from a class that was not exported */) // This exception is thrown by (and only by) a failed Google Test // assertion when GTEST_FLAG(throw_on_failure) is true (if exceptions @@ -181,14 +185,6 @@ GTEST_API_ std::string CreateUnifiedDiff(const std::vector& left, } // namespace edit_distance -// Calculate the diff between 'left' and 'right' and return it in unified diff -// format. -// If not null, stores in 'total_line_count' the total number of lines found -// in left + right. -GTEST_API_ std::string DiffStrings(const std::string& left, - const std::string& right, - size_t* total_line_count); - // Constructs and returns the message for an equality assertion // (e.g. ASSERT_EQ, EXPECT_STREQ, etc) failure. // @@ -212,10 +208,8 @@ GTEST_API_ AssertionResult EqFailure(const char* expected_expression, // Constructs a failure message for Boolean assertions such as EXPECT_TRUE. GTEST_API_ std::string GetBoolAssertionFailureMessage( - const AssertionResult& assertion_result, - const char* expression_text, - const char* actual_predicate_value, - const char* expected_predicate_value); + const AssertionResult& assertion_result, const char* expression_text, + const char* actual_predicate_value, const char* expected_predicate_value); // This template class represents an IEEE floating-point number // (either single-precision or double-precision, depending on the @@ -256,11 +250,11 @@ class FloatingPoint { // Constants. // # of bits in a number. - static const size_t kBitCount = 8*sizeof(RawType); + static const size_t kBitCount = 8 * sizeof(RawType); // # of fraction bits in a number. static const size_t kFractionBitCount = - std::numeric_limits::digits - 1; + std::numeric_limits::digits - 1; // # of exponent bits in a number. static const size_t kExponentBitCount = kBitCount - 1 - kFractionBitCount; @@ -269,8 +263,8 @@ class FloatingPoint { static const Bits kSignBitMask = static_cast(1) << (kBitCount - 1); // The mask for the fraction bits. - static const Bits kFractionBitMask = - ~static_cast(0) >> (kExponentBitCount + 1); + static const Bits kFractionBitMask = ~static_cast(0) >> + (kExponentBitCount + 1); // The mask for the exponent bits. static const Bits kExponentBitMask = ~(kSignBitMask | kFractionBitMask); @@ -309,9 +303,7 @@ class FloatingPoint { } // Returns the floating-point number that represent positive infinity. - static RawType Infinity() { - return ReinterpretBits(kExponentBitMask); - } + static RawType Infinity() { return ReinterpretBits(kExponentBitMask); } // Returns the maximum representable finite floating-point number. static RawType Max(); @@ -319,7 +311,7 @@ class FloatingPoint { // Non-static methods // Returns the bits that represents this number. - const Bits &bits() const { return u_.bits_; } + const Bits& bits() const { return u_.bits_; } // Returns the exponent bits of this number. Bits exponent_bits() const { return kExponentBitMask & u_.bits_; } @@ -348,8 +340,8 @@ class FloatingPoint { // a NAN must return false. if (is_nan() || rhs.is_nan()) return false; - return DistanceBetweenSignAndMagnitudeNumbers(u_.bits_, rhs.u_.bits_) - <= kMaxUlps; + return DistanceBetweenSignAndMagnitudeNumbers(u_.bits_, rhs.u_.bits_) <= + kMaxUlps; } private: @@ -374,7 +366,7 @@ class FloatingPoint { // // Read http://en.wikipedia.org/wiki/Signed_number_representations // for more details on signed number representations. - static Bits SignAndMagnitudeToBiased(const Bits &sam) { + static Bits SignAndMagnitudeToBiased(const Bits& sam) { if (kSignBitMask & sam) { // sam represents a negative number. return ~sam + 1; @@ -386,8 +378,8 @@ class FloatingPoint { // Given two numbers in the sign-and-magnitude representation, // returns the distance between them as an unsigned number. - static Bits DistanceBetweenSignAndMagnitudeNumbers(const Bits &sam1, - const Bits &sam2) { + static Bits DistanceBetweenSignAndMagnitudeNumbers(const Bits& sam1, + const Bits& sam2) { const Bits biased1 = SignAndMagnitudeToBiased(sam1); const Bits biased2 = SignAndMagnitudeToBiased(sam2); return (biased1 >= biased2) ? (biased1 - biased2) : (biased2 - biased1); @@ -399,9 +391,13 @@ class FloatingPoint { // We cannot use std::numeric_limits::max() as it clashes with the max() // macro defined by . template <> -inline float FloatingPoint::Max() { return FLT_MAX; } +inline float FloatingPoint::Max() { + return FLT_MAX; +} template <> -inline double FloatingPoint::Max() { return DBL_MAX; } +inline double FloatingPoint::Max() { + return DBL_MAX; +} // Typedefs the instances of the FloatingPoint template class that we // care to use. @@ -461,7 +457,8 @@ class TestFactoryBase { TestFactoryBase() {} private: - GTEST_DISALLOW_COPY_AND_ASSIGN_(TestFactoryBase); + TestFactoryBase(const TestFactoryBase&) = delete; + TestFactoryBase& operator=(const TestFactoryBase&) = delete; }; // This class provides implementation of TeastFactoryBase interface. @@ -510,11 +507,11 @@ inline SetUpTearDownSuiteFuncType GetNotDefaultOrNull( template // Note that SuiteApiResolver inherits from T because -// SetUpTestSuite()/TearDownTestSuite() could be protected. Ths way +// SetUpTestSuite()/TearDownTestSuite() could be protected. This way // SuiteApiResolver can access them. struct SuiteApiResolver : T { // testing::Test is only forward declared at this point. So we make it a - // dependend class for the compiler to be OK with it. + // dependent class for the compiler to be OK with it. using Test = typename std::conditional::type; @@ -654,7 +651,8 @@ inline const char* SkipComma(const char* str) { if (comma == nullptr) { return nullptr; } - while (IsSpace(*(++comma))) {} + while (IsSpace(*(++comma))) { + } return comma; } @@ -668,7 +666,7 @@ inline std::string GetPrefixUntilComma(const char* str) { // Splits a given string on a given delimiter, populating a given // vector with the fields. void SplitString(const ::std::string& str, char delimiter, - ::std::vector< ::std::string>* dest); + ::std::vector<::std::string>* dest); // The default argument to the template below for the case when the user does // not provide a name generator. @@ -781,13 +779,13 @@ class TypeParameterizedTestSuite { const std::vector& type_names = GenerateNames()) { RegisterTypeParameterizedTestSuiteInstantiation(case_name); - std::string test_name = StripTrailingSpaces( - GetPrefixUntilComma(test_names)); + std::string test_name = + StripTrailingSpaces(GetPrefixUntilComma(test_names)); if (!state->TestExists(test_name)) { fprintf(stderr, "Failed to get code location for test %s.%s at %s.", case_name, test_name.c_str(), - FormatFileLocation(code_location.file.c_str(), - code_location.line).c_str()); + FormatFileLocation(code_location.file.c_str(), code_location.line) + .c_str()); fflush(stderr); posix::Abort(); } @@ -831,8 +829,8 @@ class TypeParameterizedTestSuite { // For example, if Foo() calls Bar(), which in turn calls // GetCurrentOsStackTraceExceptTop(..., 1), Foo() will be included in // the trace but Bar() and GetCurrentOsStackTraceExceptTop() won't. -GTEST_API_ std::string GetCurrentOsStackTraceExceptTop( - UnitTest* unit_test, int skip_count); +GTEST_API_ std::string GetCurrentOsStackTraceExceptTop(UnitTest* unit_test, + int skip_count); // Helpers for suppressing warnings on unreachable code or constant // condition. @@ -881,7 +879,8 @@ class GTEST_API_ Random { private: uint32_t state_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(Random); + Random(const Random&) = delete; + Random& operator=(const Random&) = delete; }; // Turns const U&, U&, const U, and U all into U. @@ -954,7 +953,9 @@ IsContainer IsContainerTest(int /* dummy */) { typedef char IsNotContainer; template -IsNotContainer IsContainerTest(long /* dummy */) { return '\0'; } +IsNotContainer IsContainerTest(long /* dummy */) { + return '\0'; +} // Trait to detect whether a type T is a hash table. // The heuristic used is that the type contains an inner type `hasher` and does @@ -1017,11 +1018,13 @@ bool ArrayEq(const T* lhs, size_t size, const U* rhs); // This generic version is used when k is 0. template -inline bool ArrayEq(const T& lhs, const U& rhs) { return lhs == rhs; } +inline bool ArrayEq(const T& lhs, const U& rhs) { + return lhs == rhs; +} // This overload is used when k >= 1. template -inline bool ArrayEq(const T(&lhs)[N], const U(&rhs)[N]) { +inline bool ArrayEq(const T (&lhs)[N], const U (&rhs)[N]) { return internal::ArrayEq(lhs, N, rhs); } @@ -1031,8 +1034,7 @@ inline bool ArrayEq(const T(&lhs)[N], const U(&rhs)[N]) { template bool ArrayEq(const T* lhs, size_t size, const U* rhs) { for (size_t i = 0; i != size; i++) { - if (!internal::ArrayEq(lhs[i], rhs[i])) - return false; + if (!internal::ArrayEq(lhs[i], rhs[i])) return false; } return true; } @@ -1042,8 +1044,7 @@ bool ArrayEq(const T* lhs, size_t size, const U* rhs) { template Iter ArrayAwareFind(Iter begin, Iter end, const Element& elem) { for (Iter it = begin; it != end; ++it) { - if (internal::ArrayEq(*it, elem)) - return it; + if (internal::ArrayEq(*it, elem)) return it; } return end; } @@ -1057,11 +1058,13 @@ void CopyArray(const T* from, size_t size, U* to); // This generic version is used when k is 0. template -inline void CopyArray(const T& from, U* to) { *to = from; } +inline void CopyArray(const T& from, U* to) { + *to = from; +} // This overload is used when k >= 1. template -inline void CopyArray(const T(&from)[N], U(*to)[N]) { +inline void CopyArray(const T (&from)[N], U (*to)[N]) { internal::CopyArray(from, N, *to); } @@ -1114,8 +1117,7 @@ class NativeArray { } ~NativeArray() { - if (clone_ != &NativeArray::InitRef) - delete[] array_; + if (clone_ != &NativeArray::InitRef) delete[] array_; } // STL-style container methods. @@ -1123,8 +1125,7 @@ class NativeArray { const_iterator begin() const { return array_; } const_iterator end() const { return array_ + size_; } bool operator==(const NativeArray& rhs) const { - return size() == rhs.size() && - ArrayEq(begin(), size(), rhs.begin()); + return size() == rhs.size() && ArrayEq(begin(), size(), rhs.begin()); } private: @@ -1335,9 +1336,9 @@ struct tuple_size> #endif } // namespace std -#define GTEST_MESSAGE_AT_(file, line, message, result_type) \ - ::testing::internal::AssertHelper(result_type, file, line, message) \ - = ::testing::Message() +#define GTEST_MESSAGE_AT_(file, line, message, result_type) \ + ::testing::internal::AssertHelper(result_type, file, line, message) = \ + ::testing::Message() #define GTEST_MESSAGE_(message, result_type) \ GTEST_MESSAGE_AT_(__FILE__, __LINE__, message, result_type) @@ -1458,103 +1459,112 @@ class NeverThrown { #endif // GTEST_HAS_EXCEPTIONS -#define GTEST_TEST_NO_THROW_(statement, fail) \ - GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ - if (::testing::internal::TrueWithString gtest_msg{}) { \ - try { \ - GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ - } \ - GTEST_TEST_NO_THROW_CATCH_STD_EXCEPTION_() \ - catch (...) { \ - gtest_msg.value = "it throws."; \ - goto GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__); \ - } \ - } else \ - GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__): \ - fail(("Expected: " #statement " doesn't throw an exception.\n" \ - " Actual: " + gtest_msg.value).c_str()) - -#define GTEST_TEST_ANY_THROW_(statement, fail) \ - GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ - if (::testing::internal::AlwaysTrue()) { \ - bool gtest_caught_any = false; \ - try { \ - GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ - } \ - catch (...) { \ - gtest_caught_any = true; \ - } \ - if (!gtest_caught_any) { \ +#define GTEST_TEST_NO_THROW_(statement, fail) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (::testing::internal::TrueWithString gtest_msg{}) { \ + try { \ + GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ + } \ + GTEST_TEST_NO_THROW_CATCH_STD_EXCEPTION_() \ + catch (...) { \ + gtest_msg.value = "it throws."; \ + goto GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__); \ + } \ + } else \ + GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__) \ + : fail(("Expected: " #statement " doesn't throw an exception.\n" \ + " Actual: " + \ + gtest_msg.value) \ + .c_str()) + +#define GTEST_TEST_ANY_THROW_(statement, fail) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (::testing::internal::AlwaysTrue()) { \ + bool gtest_caught_any = false; \ + try { \ + GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ + } catch (...) { \ + gtest_caught_any = true; \ + } \ + if (!gtest_caught_any) { \ goto GTEST_CONCAT_TOKEN_(gtest_label_testanythrow_, __LINE__); \ - } \ - } else \ - GTEST_CONCAT_TOKEN_(gtest_label_testanythrow_, __LINE__): \ - fail("Expected: " #statement " throws an exception.\n" \ - " Actual: it doesn't.") - + } \ + } else \ + GTEST_CONCAT_TOKEN_(gtest_label_testanythrow_, __LINE__) \ + : fail("Expected: " #statement \ + " throws an exception.\n" \ + " Actual: it doesn't.") // Implements Boolean test assertions such as EXPECT_TRUE. expression can be // either a boolean expression or an AssertionResult. text is a textual // representation of expression as it was passed into the EXPECT_TRUE. #define GTEST_TEST_BOOLEAN_(expression, text, actual, expected, fail) \ - GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ - if (const ::testing::AssertionResult gtest_ar_ = \ - ::testing::AssertionResult(expression)) \ - ; \ - else \ - fail(::testing::internal::GetBoolAssertionFailureMessage(\ - gtest_ar_, text, #actual, #expected).c_str()) - -#define GTEST_TEST_NO_FATAL_FAILURE_(statement, fail) \ - GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ - if (::testing::internal::AlwaysTrue()) { \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (const ::testing::AssertionResult gtest_ar_ = \ + ::testing::AssertionResult(expression)) \ + ; \ + else \ + fail(::testing::internal::GetBoolAssertionFailureMessage( \ + gtest_ar_, text, #actual, #expected) \ + .c_str()) + +#define GTEST_TEST_NO_FATAL_FAILURE_(statement, fail) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (::testing::internal::AlwaysTrue()) { \ ::testing::internal::HasNewFatalFailureHelper gtest_fatal_failure_checker; \ - GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ - if (gtest_fatal_failure_checker.has_new_fatal_failure()) { \ - goto GTEST_CONCAT_TOKEN_(gtest_label_testnofatal_, __LINE__); \ - } \ - } else \ - GTEST_CONCAT_TOKEN_(gtest_label_testnofatal_, __LINE__): \ - fail("Expected: " #statement " doesn't generate new fatal " \ - "failures in the current thread.\n" \ - " Actual: it does.") + GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ + if (gtest_fatal_failure_checker.has_new_fatal_failure()) { \ + goto GTEST_CONCAT_TOKEN_(gtest_label_testnofatal_, __LINE__); \ + } \ + } else \ + GTEST_CONCAT_TOKEN_(gtest_label_testnofatal_, __LINE__) \ + : fail("Expected: " #statement \ + " doesn't generate new fatal " \ + "failures in the current thread.\n" \ + " Actual: it does.") // Expands to the name of the class that implements the given test. #define GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) \ test_suite_name##_##test_name##_Test // Helper macro for defining tests. -#define GTEST_TEST_(test_suite_name, test_name, parent_class, parent_id) \ - static_assert(sizeof(GTEST_STRINGIFY_(test_suite_name)) > 1, \ - "test_suite_name must not be empty"); \ - static_assert(sizeof(GTEST_STRINGIFY_(test_name)) > 1, \ - "test_name must not be empty"); \ - class GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) \ - : public parent_class { \ - public: \ - GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)() = default; \ - ~GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)() override = default; \ - GTEST_DISALLOW_COPY_AND_ASSIGN_(GTEST_TEST_CLASS_NAME_(test_suite_name, \ - test_name)); \ - GTEST_DISALLOW_MOVE_AND_ASSIGN_(GTEST_TEST_CLASS_NAME_(test_suite_name, \ - test_name)); \ - \ - private: \ - void TestBody() override; \ - static ::testing::TestInfo* const test_info_ GTEST_ATTRIBUTE_UNUSED_; \ - }; \ - \ - ::testing::TestInfo* const GTEST_TEST_CLASS_NAME_(test_suite_name, \ - test_name)::test_info_ = \ - ::testing::internal::MakeAndRegisterTestInfo( \ - #test_suite_name, #test_name, nullptr, nullptr, \ - ::testing::internal::CodeLocation(__FILE__, __LINE__), (parent_id), \ - ::testing::internal::SuiteApiResolver< \ - parent_class>::GetSetUpCaseOrSuite(__FILE__, __LINE__), \ - ::testing::internal::SuiteApiResolver< \ - parent_class>::GetTearDownCaseOrSuite(__FILE__, __LINE__), \ - new ::testing::internal::TestFactoryImpl); \ +#define GTEST_TEST_(test_suite_name, test_name, parent_class, parent_id) \ + static_assert(sizeof(GTEST_STRINGIFY_(test_suite_name)) > 1, \ + "test_suite_name must not be empty"); \ + static_assert(sizeof(GTEST_STRINGIFY_(test_name)) > 1, \ + "test_name must not be empty"); \ + class GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) \ + : public parent_class { \ + public: \ + GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)() = default; \ + ~GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)() override = default; \ + GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) \ + (const GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) &) = delete; \ + GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) & operator=( \ + const GTEST_TEST_CLASS_NAME_(test_suite_name, \ + test_name) &) = delete; /* NOLINT */ \ + GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) \ + (GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) &&) noexcept = delete; \ + GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) & operator=( \ + GTEST_TEST_CLASS_NAME_(test_suite_name, \ + test_name) &&) noexcept = delete; /* NOLINT */ \ + \ + private: \ + void TestBody() override; \ + static ::testing::TestInfo* const test_info_ GTEST_ATTRIBUTE_UNUSED_; \ + }; \ + \ + ::testing::TestInfo* const GTEST_TEST_CLASS_NAME_(test_suite_name, \ + test_name)::test_info_ = \ + ::testing::internal::MakeAndRegisterTestInfo( \ + #test_suite_name, #test_name, nullptr, nullptr, \ + ::testing::internal::CodeLocation(__FILE__, __LINE__), (parent_id), \ + ::testing::internal::SuiteApiResolver< \ + parent_class>::GetSetUpCaseOrSuite(__FILE__, __LINE__), \ + ::testing::internal::SuiteApiResolver< \ + parent_class>::GetTearDownCaseOrSuite(__FILE__, __LINE__), \ + new ::testing::internal::TestFactoryImpl); \ void GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)::TestBody() #endif // GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_ diff --git a/third_party/googletest/src/include/gtest/internal/gtest-param-util.h b/third_party/googletest/src/include/gtest/internal/gtest-param-util.h index c2ef6e3124..e7af2f904a 100644 --- a/third_party/googletest/src/include/gtest/internal/gtest-param-util.h +++ b/third_party/googletest/src/include/gtest/internal/gtest-param-util.h @@ -27,10 +27,11 @@ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - // Type and function utilities for implementing parameterized tests. -// GOOGLETEST_CM0001 DO NOT DELETE +// IWYU pragma: private, include "gtest/gtest.h" +// IWYU pragma: friend gtest/.* +// IWYU pragma: friend gmock/.* #ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_ #define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_ @@ -46,19 +47,18 @@ #include #include -#include "gtest/internal/gtest-internal.h" -#include "gtest/internal/gtest-port.h" #include "gtest/gtest-printers.h" #include "gtest/gtest-test-part.h" +#include "gtest/internal/gtest-internal.h" +#include "gtest/internal/gtest-port.h" namespace testing { // Input to a parameterized test name generator, describing a test parameter. // Consists of the parameter value and the integer parameter index. template struct TestParamInfo { - TestParamInfo(const ParamType& a_param, size_t an_index) : - param(a_param), - index(an_index) {} + TestParamInfo(const ParamType& a_param, size_t an_index) + : param(a_param), index(an_index) {} ParamType param; size_t index; }; @@ -84,8 +84,10 @@ namespace internal { GTEST_API_ void ReportInvalidTestSuiteType(const char* test_suite_name, CodeLocation code_location); -template class ParamGeneratorInterface; -template class ParamGenerator; +template +class ParamGeneratorInterface; +template +class ParamGenerator; // Interface for iterating over elements provided by an implementation // of ParamGeneratorInterface. @@ -129,8 +131,7 @@ class ParamIterator { // ParamIterator assumes ownership of the impl_ pointer. ParamIterator(const ParamIterator& other) : impl_(other.impl_->Clone()) {} ParamIterator& operator=(const ParamIterator& other) { - if (this != &other) - impl_.reset(other.impl_->Clone()); + if (this != &other) impl_.reset(other.impl_->Clone()); return *this; } @@ -157,7 +158,7 @@ class ParamIterator { private: friend class ParamGenerator; explicit ParamIterator(ParamIteratorInterface* impl) : impl_(impl) {} - std::unique_ptr > impl_; + std::unique_ptr> impl_; }; // ParamGeneratorInterface is the binary interface to access generators @@ -179,7 +180,7 @@ class ParamGeneratorInterface { // This class implements copy initialization semantics and the contained // ParamGeneratorInterface instance is shared among all copies // of the original object. This is possible because that instance is immutable. -template +template class ParamGenerator { public: typedef ParamIterator iterator; @@ -196,7 +197,7 @@ class ParamGenerator { iterator end() const { return iterator(impl_->End()); } private: - std::shared_ptr > impl_; + std::shared_ptr> impl_; }; // Generates values from a range of two comparable values. Can be used to @@ -207,8 +208,10 @@ template class RangeGenerator : public ParamGeneratorInterface { public: RangeGenerator(T begin, T end, IncrementT step) - : begin_(begin), end_(end), - step_(step), end_index_(CalculateEndIndex(begin, end, step)) {} + : begin_(begin), + end_(end), + step_(step), + end_index_(CalculateEndIndex(begin, end, step)) {} ~RangeGenerator() override {} ParamIteratorInterface* Begin() const override { @@ -251,7 +254,9 @@ class RangeGenerator : public ParamGeneratorInterface { private: Iterator(const Iterator& other) : ParamIteratorInterface(), - base_(other.base_), value_(other.value_), index_(other.index_), + base_(other.base_), + value_(other.value_), + index_(other.index_), step_(other.step_) {} // No implementation - assignment is unsupported. @@ -263,12 +268,10 @@ class RangeGenerator : public ParamGeneratorInterface { const IncrementT step_; }; // class RangeGenerator::Iterator - static int CalculateEndIndex(const T& begin, - const T& end, + static int CalculateEndIndex(const T& begin, const T& end, const IncrementT& step) { int end_index = 0; - for (T i = begin; i < end; i = static_cast(i + step)) - end_index++; + for (T i = begin; i < end; i = static_cast(i + step)) end_index++; return end_index; } @@ -283,7 +286,6 @@ class RangeGenerator : public ParamGeneratorInterface { const int end_index_; }; // class RangeGenerator - // Generates values from a pair of STL-style iterators. Used in the // ValuesIn() function. The elements are copied from the source range // since the source can be located on the stack, and the generator @@ -341,13 +343,13 @@ class ValuesInIteratorRangeGenerator : public ParamGeneratorInterface { << "The program attempted to compare iterators " << "from different generators." << std::endl; return iterator_ == - CheckedDowncastToActualType(&other)->iterator_; + CheckedDowncastToActualType(&other)->iterator_; } private: Iterator(const Iterator& other) - // The explicit constructor call suppresses a false warning - // emitted by gcc when supplied with the -Wextra option. + // The explicit constructor call suppresses a false warning + // emitted by gcc when supplied with the -Wextra option. : ParamIteratorInterface(), base_(other.base_), iterator_(other.iterator_) {} @@ -394,8 +396,8 @@ template class ParameterizedTestFactory : public TestFactoryBase { public: typedef typename TestClass::ParamType ParamType; - explicit ParameterizedTestFactory(ParamType parameter) : - parameter_(parameter) {} + explicit ParameterizedTestFactory(ParamType parameter) + : parameter_(parameter) {} Test* CreateTest() override { TestClass::SetParam(¶meter_); return new TestClass(); @@ -404,7 +406,8 @@ class ParameterizedTestFactory : public TestFactoryBase { private: const ParamType parameter_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestFactory); + ParameterizedTestFactory(const ParameterizedTestFactory&) = delete; + ParameterizedTestFactory& operator=(const ParameterizedTestFactory&) = delete; }; // INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. @@ -440,7 +443,8 @@ class TestMetaFactory } private: - GTEST_DISALLOW_COPY_AND_ASSIGN_(TestMetaFactory); + TestMetaFactory(const TestMetaFactory&) = delete; + TestMetaFactory& operator=(const TestMetaFactory&) = delete; }; // INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. @@ -471,7 +475,10 @@ class ParameterizedTestSuiteInfoBase { ParameterizedTestSuiteInfoBase() {} private: - GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestSuiteInfoBase); + ParameterizedTestSuiteInfoBase(const ParameterizedTestSuiteInfoBase&) = + delete; + ParameterizedTestSuiteInfoBase& operator=( + const ParameterizedTestSuiteInfoBase&) = delete; }; // INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. @@ -547,8 +554,8 @@ class ParameterizedTestSuiteInfo : public ParameterizedTestSuiteInfoBase { test_it != tests_.end(); ++test_it) { std::shared_ptr test_info = *test_it; for (typename InstantiationContainer::iterator gen_it = - instantiations_.begin(); gen_it != instantiations_.end(); - ++gen_it) { + instantiations_.begin(); + gen_it != instantiations_.end(); ++gen_it) { const std::string& instantiation_name = gen_it->name; ParamGenerator generator((*gen_it->generator)()); ParamNameGeneratorFunc* name_func = gen_it->name_func; @@ -556,7 +563,7 @@ class ParameterizedTestSuiteInfo : public ParameterizedTestSuiteInfoBase { int line = gen_it->line; std::string test_suite_name; - if ( !instantiation_name.empty() ) + if (!instantiation_name.empty()) test_suite_name = instantiation_name + "/"; test_suite_name += test_info->test_suite_base_name; @@ -569,17 +576,16 @@ class ParameterizedTestSuiteInfo : public ParameterizedTestSuiteInfoBase { Message test_name_stream; - std::string param_name = name_func( - TestParamInfo(*param_it, i)); + std::string param_name = + name_func(TestParamInfo(*param_it, i)); GTEST_CHECK_(IsValidParamName(param_name)) << "Parameterized test name '" << param_name - << "' is invalid, in " << file - << " line " << line << std::endl; + << "' is invalid, in " << file << " line " << line << std::endl; GTEST_CHECK_(test_param_names.count(param_name) == 0) - << "Duplicate parameterized test name '" << param_name - << "', in " << file << " line " << line << std::endl; + << "Duplicate parameterized test name '" << param_name << "', in " + << file << " line " << line << std::endl; test_param_names.insert(param_name); @@ -596,15 +602,15 @@ class ParameterizedTestSuiteInfo : public ParameterizedTestSuiteInfoBase { SuiteApiResolver::GetTearDownCaseOrSuite(file, line), test_info->test_meta_factory->CreateTestFactory(*param_it)); } // for param_it - } // for gen_it - } // for test_it + } // for gen_it + } // for test_it if (!generated_instantiations) { // There are no generaotrs, or they all generate nothing ... InsertSyntheticTestCase(GetTestSuiteName(), code_location_, !tests_.empty()); } - } // RegisterTests + } // RegisterTests private: // LocalTestInfo structure keeps information about a single test registered @@ -620,42 +626,39 @@ class ParameterizedTestSuiteInfo : public ParameterizedTestSuiteInfoBase { const std::string test_suite_base_name; const std::string test_base_name; - const std::unique_ptr > test_meta_factory; + const std::unique_ptr> test_meta_factory; const CodeLocation code_location; }; - using TestInfoContainer = ::std::vector >; + using TestInfoContainer = ::std::vector>; // Records data received from INSTANTIATE_TEST_SUITE_P macros: // struct InstantiationInfo { - InstantiationInfo(const std::string &name_in, - GeneratorCreationFunc* generator_in, - ParamNameGeneratorFunc* name_func_in, - const char* file_in, - int line_in) - : name(name_in), - generator(generator_in), - name_func(name_func_in), - file(file_in), - line(line_in) {} - - std::string name; - GeneratorCreationFunc* generator; - ParamNameGeneratorFunc* name_func; - const char* file; - int line; + InstantiationInfo(const std::string& name_in, + GeneratorCreationFunc* generator_in, + ParamNameGeneratorFunc* name_func_in, const char* file_in, + int line_in) + : name(name_in), + generator(generator_in), + name_func(name_func_in), + file(file_in), + line(line_in) {} + + std::string name; + GeneratorCreationFunc* generator; + ParamNameGeneratorFunc* name_func; + const char* file; + int line; }; typedef ::std::vector InstantiationContainer; static bool IsValidParamName(const std::string& name) { // Check for empty string - if (name.empty()) - return false; + if (name.empty()) return false; // Check for invalid characters for (std::string::size_type index = 0; index < name.size(); ++index) { - if (!IsAlNum(name[index]) && name[index] != '_') - return false; + if (!IsAlNum(name[index]) && name[index] != '_') return false; } return true; @@ -666,7 +669,9 @@ class ParameterizedTestSuiteInfo : public ParameterizedTestSuiteInfoBase { TestInfoContainer tests_; InstantiationContainer instantiations_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestSuiteInfo); + ParameterizedTestSuiteInfo(const ParameterizedTestSuiteInfo&) = delete; + ParameterizedTestSuiteInfo& operator=(const ParameterizedTestSuiteInfo&) = + delete; }; // class ParameterizedTestSuiteInfo // Legacy API is deprecated but still available @@ -709,7 +714,7 @@ class ParameterizedTestSuiteRegistry { // type we are looking for, so we downcast it to that type // without further checks. typed_test_info = CheckedDowncastToActualType< - ParameterizedTestSuiteInfo >(test_suite_info); + ParameterizedTestSuiteInfo>(test_suite_info); } break; } @@ -741,7 +746,10 @@ class ParameterizedTestSuiteRegistry { TestSuiteInfoContainer test_suite_infos_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestSuiteRegistry); + ParameterizedTestSuiteRegistry(const ParameterizedTestSuiteRegistry&) = + delete; + ParameterizedTestSuiteRegistry& operator=( + const ParameterizedTestSuiteRegistry&) = delete; }; // Keep track of what type-parameterized test suite are defined and @@ -836,7 +844,8 @@ class CartesianProductGenerator : public ParamIteratorInterface { public: IteratorImpl(const ParamGeneratorInterface* base, - const std::tuple...>& generators, bool is_end) + const std::tuple...>& generators, + bool is_end) : base_(base), begin_(std::get(generators).begin()...), end_(std::get(generators).end()...), diff --git a/third_party/googletest/src/include/gtest/internal/gtest-port-arch.h b/third_party/googletest/src/include/gtest/internal/gtest-port-arch.h index dd845915e3..f025db76ad 100644 --- a/third_party/googletest/src/include/gtest/internal/gtest-port-arch.h +++ b/third_party/googletest/src/include/gtest/internal/gtest-port-arch.h @@ -26,7 +26,7 @@ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// + // The Google C++ Testing and Mocking Framework (Google Test) // // This header file defines the GTEST_OS_* macro. @@ -37,70 +37,72 @@ // Determines the platform on which Google Test is compiled. #ifdef __CYGWIN__ -# define GTEST_OS_CYGWIN 1 -# elif defined(__MINGW__) || defined(__MINGW32__) || defined(__MINGW64__) -# define GTEST_OS_WINDOWS_MINGW 1 -# define GTEST_OS_WINDOWS 1 +#define GTEST_OS_CYGWIN 1 +#elif defined(__MINGW__) || defined(__MINGW32__) || defined(__MINGW64__) +#define GTEST_OS_WINDOWS_MINGW 1 +#define GTEST_OS_WINDOWS 1 #elif defined _WIN32 -# define GTEST_OS_WINDOWS 1 -# ifdef _WIN32_WCE -# define GTEST_OS_WINDOWS_MOBILE 1 -# elif defined(WINAPI_FAMILY) -# include -# if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP) -# define GTEST_OS_WINDOWS_DESKTOP 1 -# elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_PHONE_APP) -# define GTEST_OS_WINDOWS_PHONE 1 -# elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP) -# define GTEST_OS_WINDOWS_RT 1 -# elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_TV_TITLE) -# define GTEST_OS_WINDOWS_PHONE 1 -# define GTEST_OS_WINDOWS_TV_TITLE 1 -# else - // WINAPI_FAMILY defined but no known partition matched. - // Default to desktop. -# define GTEST_OS_WINDOWS_DESKTOP 1 -# endif -# else -# define GTEST_OS_WINDOWS_DESKTOP 1 -# endif // _WIN32_WCE +#define GTEST_OS_WINDOWS 1 +#ifdef _WIN32_WCE +#define GTEST_OS_WINDOWS_MOBILE 1 +#elif defined(WINAPI_FAMILY) +#include +#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP) +#define GTEST_OS_WINDOWS_DESKTOP 1 +#elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_PHONE_APP) +#define GTEST_OS_WINDOWS_PHONE 1 +#elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP) +#define GTEST_OS_WINDOWS_RT 1 +#elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_TV_TITLE) +#define GTEST_OS_WINDOWS_PHONE 1 +#define GTEST_OS_WINDOWS_TV_TITLE 1 +#else +// WINAPI_FAMILY defined but no known partition matched. +// Default to desktop. +#define GTEST_OS_WINDOWS_DESKTOP 1 +#endif +#else +#define GTEST_OS_WINDOWS_DESKTOP 1 +#endif // _WIN32_WCE #elif defined __OS2__ -# define GTEST_OS_OS2 1 +#define GTEST_OS_OS2 1 #elif defined __APPLE__ -# define GTEST_OS_MAC 1 -# include -# if TARGET_OS_IPHONE -# define GTEST_OS_IOS 1 -# endif +#define GTEST_OS_MAC 1 +#include +#if TARGET_OS_IPHONE +#define GTEST_OS_IOS 1 +#endif #elif defined __DragonFly__ -# define GTEST_OS_DRAGONFLY 1 +#define GTEST_OS_DRAGONFLY 1 #elif defined __FreeBSD__ -# define GTEST_OS_FREEBSD 1 +#define GTEST_OS_FREEBSD 1 #elif defined __Fuchsia__ -# define GTEST_OS_FUCHSIA 1 +#define GTEST_OS_FUCHSIA 1 +#elif defined(__GNU__) +#define GTEST_OS_GNU_HURD 1 #elif defined(__GLIBC__) && defined(__FreeBSD_kernel__) -# define GTEST_OS_GNU_KFREEBSD 1 +#define GTEST_OS_GNU_KFREEBSD 1 #elif defined __linux__ -# define GTEST_OS_LINUX 1 -# if defined __ANDROID__ -# define GTEST_OS_LINUX_ANDROID 1 -# endif +#define GTEST_OS_LINUX 1 +#if defined __ANDROID__ +#define GTEST_OS_LINUX_ANDROID 1 +#endif #elif defined __MVS__ -# define GTEST_OS_ZOS 1 +#define GTEST_OS_ZOS 1 #elif defined(__sun) && defined(__SVR4) -# define GTEST_OS_SOLARIS 1 +#define GTEST_OS_SOLARIS 1 #elif defined(_AIX) -# define GTEST_OS_AIX 1 +#define GTEST_OS_AIX 1 #elif defined(__hpux) -# define GTEST_OS_HPUX 1 +#define GTEST_OS_HPUX 1 #elif defined __native_client__ -# define GTEST_OS_NACL 1 +#define GTEST_OS_NACL 1 #elif defined __NetBSD__ -# define GTEST_OS_NETBSD 1 +#define GTEST_OS_NETBSD 1 #elif defined __OpenBSD__ -# define GTEST_OS_OPENBSD 1 +#define GTEST_OS_OPENBSD 1 #elif defined __QNX__ -# define GTEST_OS_QNX 1 +#define GTEST_OS_QNX 1 #elif defined(__HAIKU__) #define GTEST_OS_HAIKU 1 #elif defined ESP8266 diff --git a/third_party/googletest/src/include/gtest/internal/gtest-port.h b/third_party/googletest/src/include/gtest/internal/gtest-port.h index 0953a781c0..0003d27658 100644 --- a/third_party/googletest/src/include/gtest/internal/gtest-port.h +++ b/third_party/googletest/src/include/gtest/internal/gtest-port.h @@ -26,7 +26,7 @@ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// + // Low-level types and utilities for porting Google Test to various // platforms. All macros ending with _ and symbols defined in an // internal namespace are subject to change without notice. Code @@ -38,7 +38,9 @@ // files are expected to #include this. Therefore, it cannot #include // any other Google Test header. -// GOOGLETEST_CM0001 DO NOT DELETE +// IWYU pragma: private, include "gtest/gtest.h" +// IWYU pragma: friend gtest/.* +// IWYU pragma: friend gmock/.* #ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_ #define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_ @@ -116,6 +118,7 @@ // GTEST_OS_DRAGONFLY - DragonFlyBSD // GTEST_OS_FREEBSD - FreeBSD // GTEST_OS_FUCHSIA - Fuchsia +// GTEST_OS_GNU_HURD - GNU/Hurd // GTEST_OS_GNU_KFREEBSD - GNU/kFreeBSD // GTEST_OS_HAIKU - Haiku // GTEST_OS_HPUX - HP-UX @@ -167,7 +170,7 @@ // GTEST_HAS_TYPED_TEST - typed tests // GTEST_HAS_TYPED_TEST_P - type-parameterized tests // GTEST_IS_THREADSAFE - Google Test is thread-safe. -// GOOGLETEST_CM0007 DO NOT DELETE +// GTEST_USES_RE2 - the RE2 regular expression library is used // GTEST_USES_POSIX_RE - enhanced POSIX regex is used. Do not confuse with // GTEST_HAS_POSIX_RE (see above) which users can // define themselves. @@ -190,10 +193,6 @@ // GTEST_AMBIGUOUS_ELSE_BLOCKER_ - for disabling a gcc warning. // GTEST_ATTRIBUTE_UNUSED_ - declares that a class' instances or a // variable don't have to be used. -// GTEST_DISALLOW_ASSIGN_ - disables copy operator=. -// GTEST_DISALLOW_COPY_AND_ASSIGN_ - disables copy ctor and operator=. -// GTEST_DISALLOW_MOVE_ASSIGN_ - disables move operator=. -// GTEST_DISALLOW_MOVE_AND_ASSIGN_ - disables move ctor and operator=. // GTEST_MUST_USE_RESULT_ - declares that a function's result must be used. // GTEST_INTENTIONAL_CONST_COND_PUSH_ - start code section where MSVC C4127 is // suppressed (constant conditional). @@ -217,11 +216,13 @@ // - synchronization primitives. // // Regular expressions: -// RE - a simple regular expression class using the POSIX -// Extended Regular Expression syntax on UNIX-like platforms -// GOOGLETEST_CM0008 DO NOT DELETE -// or a reduced regular exception syntax on other -// platforms, including Windows. +// RE - a simple regular expression class using +// 1) the RE2 syntax on all platforms when built with RE2 +// and Abseil as dependencies +// 2) the POSIX Extended Regular Expression syntax on +// UNIX-like platforms, +// 3) A reduced regular exception syntax on other platforms, +// including Windows. // Logging: // GTEST_LOG_() - logs messages at the specified severity level. // LogToStderr() - directs all log messages to stderr. @@ -241,8 +242,6 @@ // BiggestInt - the biggest signed integer type. // // Command-line utilities: -// GTEST_DECLARE_*() - declares a flag. -// GTEST_DEFINE_*() - defines a flag. // GetInjectableArgvs() - returns the command line as a vector of strings. // // Environment variable utilities: @@ -263,48 +262,55 @@ #include #include +// #include // Guarded by GTEST_IS_THREADSAFE below #include +#include #include +#include +#include +#include +// #include // Guarded by GTEST_IS_THREADSAFE below +#include #include +#include #ifndef _WIN32_WCE -# include -# include +#include +#include #endif // !_WIN32_WCE #if defined __APPLE__ -# include -# include +#include +#include #endif -#include // NOLINT -#include -#include -#include // NOLINT -#include -#include // NOLINT - #include "gtest/internal/custom/gtest-port.h" #include "gtest/internal/gtest-port-arch.h" +#if GTEST_HAS_ABSL +#include "absl/flags/declare.h" +#include "absl/flags/flag.h" +#include "absl/flags/reflection.h" +#endif + #if !defined(GTEST_DEV_EMAIL_) -# define GTEST_DEV_EMAIL_ "googletestframework@@googlegroups.com" -# define GTEST_FLAG_PREFIX_ "gtest_" -# define GTEST_FLAG_PREFIX_DASH_ "gtest-" -# define GTEST_FLAG_PREFIX_UPPER_ "GTEST_" -# define GTEST_NAME_ "Google Test" -# define GTEST_PROJECT_URL_ "https://github.com/google/googletest/" +#define GTEST_DEV_EMAIL_ "googletestframework@@googlegroups.com" +#define GTEST_FLAG_PREFIX_ "gtest_" +#define GTEST_FLAG_PREFIX_DASH_ "gtest-" +#define GTEST_FLAG_PREFIX_UPPER_ "GTEST_" +#define GTEST_NAME_ "Google Test" +#define GTEST_PROJECT_URL_ "https://github.com/google/googletest/" #endif // !defined(GTEST_DEV_EMAIL_) #if !defined(GTEST_INIT_GOOGLE_TEST_NAME_) -# define GTEST_INIT_GOOGLE_TEST_NAME_ "testing::InitGoogleTest" +#define GTEST_INIT_GOOGLE_TEST_NAME_ "testing::InitGoogleTest" #endif // !defined(GTEST_INIT_GOOGLE_TEST_NAME_) // Determines the version of gcc that is used to compile this. #ifdef __GNUC__ // 40302 means version 4.3.2. -# define GTEST_GCC_VER_ \ - (__GNUC__*10000 + __GNUC_MINOR__*100 + __GNUC_PATCHLEVEL__) +#define GTEST_GCC_VER_ \ + (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) #endif // __GNUC__ // Macros for disabling Microsoft Visual C++ warnings. @@ -313,41 +319,37 @@ // /* code that triggers warnings C4800 and C4385 */ // GTEST_DISABLE_MSC_WARNINGS_POP_() #if defined(_MSC_VER) -# define GTEST_DISABLE_MSC_WARNINGS_PUSH_(warnings) \ - __pragma(warning(push)) \ - __pragma(warning(disable: warnings)) -# define GTEST_DISABLE_MSC_WARNINGS_POP_() \ - __pragma(warning(pop)) +#define GTEST_DISABLE_MSC_WARNINGS_PUSH_(warnings) \ + __pragma(warning(push)) __pragma(warning(disable : warnings)) +#define GTEST_DISABLE_MSC_WARNINGS_POP_() __pragma(warning(pop)) #else // Not all compilers are MSVC -# define GTEST_DISABLE_MSC_WARNINGS_PUSH_(warnings) -# define GTEST_DISABLE_MSC_WARNINGS_POP_() +#define GTEST_DISABLE_MSC_WARNINGS_PUSH_(warnings) +#define GTEST_DISABLE_MSC_WARNINGS_POP_() #endif // Clang on Windows does not understand MSVC's pragma warning. // We need clang-specific way to disable function deprecation warning. #ifdef __clang__ -# define GTEST_DISABLE_MSC_DEPRECATED_PUSH_() \ - _Pragma("clang diagnostic push") \ - _Pragma("clang diagnostic ignored \"-Wdeprecated-declarations\"") \ - _Pragma("clang diagnostic ignored \"-Wdeprecated-implementations\"") -#define GTEST_DISABLE_MSC_DEPRECATED_POP_() \ - _Pragma("clang diagnostic pop") +#define GTEST_DISABLE_MSC_DEPRECATED_PUSH_() \ + _Pragma("clang diagnostic push") \ + _Pragma("clang diagnostic ignored \"-Wdeprecated-declarations\"") \ + _Pragma("clang diagnostic ignored \"-Wdeprecated-implementations\"") +#define GTEST_DISABLE_MSC_DEPRECATED_POP_() _Pragma("clang diagnostic pop") #else -# define GTEST_DISABLE_MSC_DEPRECATED_PUSH_() \ - GTEST_DISABLE_MSC_WARNINGS_PUSH_(4996) -# define GTEST_DISABLE_MSC_DEPRECATED_POP_() \ - GTEST_DISABLE_MSC_WARNINGS_POP_() +#define GTEST_DISABLE_MSC_DEPRECATED_PUSH_() \ + GTEST_DISABLE_MSC_WARNINGS_PUSH_(4996) +#define GTEST_DISABLE_MSC_DEPRECATED_POP_() GTEST_DISABLE_MSC_WARNINGS_POP_() #endif // Brings in definitions for functions used in the testing::internal::posix // namespace (read, write, close, chdir, isatty, stat). We do not currently // use them on Windows Mobile. #if GTEST_OS_WINDOWS -# if !GTEST_OS_WINDOWS_MOBILE -# include -# include -# endif +#if !GTEST_OS_WINDOWS_MOBILE +#include +#include +#endif // In order to avoid having to include , use forward declaration #if GTEST_OS_WINDOWS_MINGW && !defined(__MINGW64_VERSION_MAJOR) // MinGW defined _CRITICAL_SECTION and _RTL_CRITICAL_SECTION as two @@ -367,68 +369,55 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION; // This assumes that non-Windows OSes provide unistd.h. For OSes where this // is not the case, we need to include headers that provide the functions // mentioned above. -# include -# include +#include +#include #endif // GTEST_OS_WINDOWS #if GTEST_OS_LINUX_ANDROID // Used to define __ANDROID_API__ matching the target NDK API level. -# include // NOLINT +#include // NOLINT #endif // Defines this to true if and only if Google Test can use POSIX regular // expressions. #ifndef GTEST_HAS_POSIX_RE -# if GTEST_OS_LINUX_ANDROID +#if GTEST_OS_LINUX_ANDROID // On Android, is only available starting with Gingerbread. -# define GTEST_HAS_POSIX_RE (__ANDROID_API__ >= 9) -# else +#define GTEST_HAS_POSIX_RE (__ANDROID_API__ >= 9) +#else #define GTEST_HAS_POSIX_RE (!GTEST_OS_WINDOWS && !GTEST_OS_XTENSA) -# endif +#endif #endif -#if GTEST_USES_PCRE -// The appropriate headers have already been included. - +// Select the regular expression implementation. +#if GTEST_HAS_ABSL +// When using Abseil, RE2 is required. +#include "absl/strings/string_view.h" +#include "re2/re2.h" +#define GTEST_USES_RE2 1 #elif GTEST_HAS_POSIX_RE - -// On some platforms, needs someone to define size_t, and -// won't compile otherwise. We can #include it here as we already -// included , which is guaranteed to define size_t through -// . -# include // NOLINT - -# define GTEST_USES_POSIX_RE 1 - -#elif GTEST_OS_WINDOWS - -// is not available on Windows. Use our own simple regex -// implementation instead. -# define GTEST_USES_SIMPLE_RE 1 - +#include // NOLINT +#define GTEST_USES_POSIX_RE 1 #else - -// may not be available on this platform. Use our own -// simple regex implementation instead. -# define GTEST_USES_SIMPLE_RE 1 - -#endif // GTEST_USES_PCRE +// Use our own simple regex implementation. +#define GTEST_USES_SIMPLE_RE 1 +#endif #ifndef GTEST_HAS_EXCEPTIONS // The user didn't tell us whether exceptions are enabled, so we need // to figure it out. -# if defined(_MSC_VER) && defined(_CPPUNWIND) +#if defined(_MSC_VER) && defined(_CPPUNWIND) // MSVC defines _CPPUNWIND to 1 if and only if exceptions are enabled. -# define GTEST_HAS_EXCEPTIONS 1 -# elif defined(__BORLANDC__) +#define GTEST_HAS_EXCEPTIONS 1 +#elif defined(__BORLANDC__) // C++Builder's implementation of the STL uses the _HAS_EXCEPTIONS // macro to enable exceptions, so we'll do the same. // Assumes that exceptions are enabled by default. -# ifndef _HAS_EXCEPTIONS -# define _HAS_EXCEPTIONS 1 -# endif // _HAS_EXCEPTIONS -# define GTEST_HAS_EXCEPTIONS _HAS_EXCEPTIONS -# elif defined(__clang__) +#ifndef _HAS_EXCEPTIONS +#define _HAS_EXCEPTIONS 1 +#endif // _HAS_EXCEPTIONS +#define GTEST_HAS_EXCEPTIONS _HAS_EXCEPTIONS +#elif defined(__clang__) // clang defines __EXCEPTIONS if and only if exceptions are enabled before clang // 220714, but if and only if cleanups are enabled after that. In Obj-C++ files, // there can be cleanups for ObjC exceptions which also need cleanups, even if @@ -437,27 +426,27 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION; // cleanups prior to that. To reliably check for C++ exception availability with // clang, check for // __EXCEPTIONS && __has_feature(cxx_exceptions). -# define GTEST_HAS_EXCEPTIONS (__EXCEPTIONS && __has_feature(cxx_exceptions)) -# elif defined(__GNUC__) && __EXCEPTIONS +#define GTEST_HAS_EXCEPTIONS (__EXCEPTIONS && __has_feature(cxx_exceptions)) +#elif defined(__GNUC__) && __EXCEPTIONS // gcc defines __EXCEPTIONS to 1 if and only if exceptions are enabled. -# define GTEST_HAS_EXCEPTIONS 1 -# elif defined(__SUNPRO_CC) +#define GTEST_HAS_EXCEPTIONS 1 +#elif defined(__SUNPRO_CC) // Sun Pro CC supports exceptions. However, there is no compile-time way of // detecting whether they are enabled or not. Therefore, we assume that // they are enabled unless the user tells us otherwise. -# define GTEST_HAS_EXCEPTIONS 1 -# elif defined(__IBMCPP__) && __EXCEPTIONS +#define GTEST_HAS_EXCEPTIONS 1 +#elif defined(__IBMCPP__) && __EXCEPTIONS // xlC defines __EXCEPTIONS to 1 if and only if exceptions are enabled. -# define GTEST_HAS_EXCEPTIONS 1 -# elif defined(__HP_aCC) +#define GTEST_HAS_EXCEPTIONS 1 +#elif defined(__HP_aCC) // Exception handling is in effect by default in HP aCC compiler. It has to // be turned of by +noeh compiler option if desired. -# define GTEST_HAS_EXCEPTIONS 1 -# else +#define GTEST_HAS_EXCEPTIONS 1 +#else // For other compilers, we assume exceptions are disabled to be // conservative. -# define GTEST_HAS_EXCEPTIONS 0 -# endif // defined(_MSC_VER) || defined(__BORLANDC__) +#define GTEST_HAS_EXCEPTIONS 0 +#endif // defined(_MSC_VER) || defined(__BORLANDC__) #endif // GTEST_HAS_EXCEPTIONS #ifndef GTEST_HAS_STD_WSTRING @@ -477,63 +466,62 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION; // The user didn't tell us whether RTTI is enabled, so we need to // figure it out. -# ifdef _MSC_VER +#ifdef _MSC_VER #ifdef _CPPRTTI // MSVC defines this macro if and only if RTTI is enabled. -# define GTEST_HAS_RTTI 1 -# else -# define GTEST_HAS_RTTI 0 -# endif +#define GTEST_HAS_RTTI 1 +#else +#define GTEST_HAS_RTTI 0 +#endif // Starting with version 4.3.2, gcc defines __GXX_RTTI if and only if RTTI is // enabled. -# elif defined(__GNUC__) +#elif defined(__GNUC__) -# ifdef __GXX_RTTI +#ifdef __GXX_RTTI // When building against STLport with the Android NDK and with // -frtti -fno-exceptions, the build fails at link time with undefined // references to __cxa_bad_typeid. Note sure if STL or toolchain bug, // so disable RTTI when detected. -# if GTEST_OS_LINUX_ANDROID && defined(_STLPORT_MAJOR) && \ - !defined(__EXCEPTIONS) -# define GTEST_HAS_RTTI 0 -# else -# define GTEST_HAS_RTTI 1 -# endif // GTEST_OS_LINUX_ANDROID && __STLPORT_MAJOR && !__EXCEPTIONS -# else -# define GTEST_HAS_RTTI 0 -# endif // __GXX_RTTI +#if GTEST_OS_LINUX_ANDROID && defined(_STLPORT_MAJOR) && !defined(__EXCEPTIONS) +#define GTEST_HAS_RTTI 0 +#else +#define GTEST_HAS_RTTI 1 +#endif // GTEST_OS_LINUX_ANDROID && __STLPORT_MAJOR && !__EXCEPTIONS +#else +#define GTEST_HAS_RTTI 0 +#endif // __GXX_RTTI // Clang defines __GXX_RTTI starting with version 3.0, but its manual recommends // using has_feature instead. has_feature(cxx_rtti) is supported since 2.7, the // first version with C++ support. -# elif defined(__clang__) +#elif defined(__clang__) -# define GTEST_HAS_RTTI __has_feature(cxx_rtti) +#define GTEST_HAS_RTTI __has_feature(cxx_rtti) // Starting with version 9.0 IBM Visual Age defines __RTTI_ALL__ to 1 if // both the typeid and dynamic_cast features are present. -# elif defined(__IBMCPP__) && (__IBMCPP__ >= 900) +#elif defined(__IBMCPP__) && (__IBMCPP__ >= 900) -# ifdef __RTTI_ALL__ -# define GTEST_HAS_RTTI 1 -# else -# define GTEST_HAS_RTTI 0 -# endif +#ifdef __RTTI_ALL__ +#define GTEST_HAS_RTTI 1 +#else +#define GTEST_HAS_RTTI 0 +#endif -# else +#else // For all other compilers, we assume RTTI is enabled. -# define GTEST_HAS_RTTI 1 +#define GTEST_HAS_RTTI 1 -# endif // _MSC_VER +#endif // _MSC_VER #endif // GTEST_HAS_RTTI // It's this header's responsibility to #include when RTTI // is enabled. #if GTEST_HAS_RTTI -# include +#include #endif // Determines whether Google Test can use the pthreads library. @@ -547,16 +535,16 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION; (GTEST_OS_LINUX || GTEST_OS_MAC || GTEST_OS_HPUX || GTEST_OS_QNX || \ GTEST_OS_FREEBSD || GTEST_OS_NACL || GTEST_OS_NETBSD || GTEST_OS_FUCHSIA || \ GTEST_OS_DRAGONFLY || GTEST_OS_GNU_KFREEBSD || GTEST_OS_OPENBSD || \ - GTEST_OS_HAIKU) + GTEST_OS_HAIKU || GTEST_OS_GNU_HURD) #endif // GTEST_HAS_PTHREAD #if GTEST_HAS_PTHREAD // gtest-port.h guarantees to #include when GTEST_HAS_PTHREAD is // true. -# include // NOLINT +#include // NOLINT // For timespec and nanosleep, used below. -# include // NOLINT +#include // NOLINT #endif // Determines whether clone(2) is supported. @@ -566,24 +554,23 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION; #ifndef GTEST_HAS_CLONE // The user didn't tell us, so we need to figure it out. -# if GTEST_OS_LINUX && !defined(__ia64__) -# if GTEST_OS_LINUX_ANDROID +#if GTEST_OS_LINUX && !defined(__ia64__) +#if GTEST_OS_LINUX_ANDROID // On Android, clone() became available at different API levels for each 32-bit // architecture. -# if defined(__LP64__) || \ - (defined(__arm__) && __ANDROID_API__ >= 9) || \ - (defined(__mips__) && __ANDROID_API__ >= 12) || \ - (defined(__i386__) && __ANDROID_API__ >= 17) -# define GTEST_HAS_CLONE 1 -# else -# define GTEST_HAS_CLONE 0 -# endif -# else -# define GTEST_HAS_CLONE 1 -# endif -# else -# define GTEST_HAS_CLONE 0 -# endif // GTEST_OS_LINUX && !defined(__ia64__) +#if defined(__LP64__) || (defined(__arm__) && __ANDROID_API__ >= 9) || \ + (defined(__mips__) && __ANDROID_API__ >= 12) || \ + (defined(__i386__) && __ANDROID_API__ >= 17) +#define GTEST_HAS_CLONE 1 +#else +#define GTEST_HAS_CLONE 0 +#endif +#else +#define GTEST_HAS_CLONE 1 +#endif +#else +#define GTEST_HAS_CLONE 0 +#endif // GTEST_OS_LINUX && !defined(__ia64__) #endif // GTEST_HAS_CLONE @@ -594,10 +581,10 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION; // platforms except known mobile ones. #if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_PHONE || \ GTEST_OS_WINDOWS_RT || GTEST_OS_ESP8266 || GTEST_OS_XTENSA -# define GTEST_HAS_STREAM_REDIRECTION 0 -# else -# define GTEST_HAS_STREAM_REDIRECTION 1 -# endif // !GTEST_OS_WINDOWS_MOBILE +#define GTEST_HAS_STREAM_REDIRECTION 0 +#else +#define GTEST_HAS_STREAM_REDIRECTION 1 +#endif // !GTEST_OS_WINDOWS_MOBILE #endif // GTEST_HAS_STREAM_REDIRECTION // Determines whether to support death tests. @@ -607,8 +594,9 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION; (GTEST_OS_WINDOWS_DESKTOP && _MSC_VER) || GTEST_OS_WINDOWS_MINGW || \ GTEST_OS_AIX || GTEST_OS_HPUX || GTEST_OS_OPENBSD || GTEST_OS_QNX || \ GTEST_OS_FREEBSD || GTEST_OS_NETBSD || GTEST_OS_FUCHSIA || \ - GTEST_OS_DRAGONFLY || GTEST_OS_GNU_KFREEBSD || GTEST_OS_HAIKU) -# define GTEST_HAS_DEATH_TEST 1 + GTEST_OS_DRAGONFLY || GTEST_OS_GNU_KFREEBSD || GTEST_OS_HAIKU || \ + GTEST_OS_GNU_HURD) +#define GTEST_HAS_DEATH_TEST 1 #endif // Determines whether to support type-driven tests. @@ -617,8 +605,8 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION; // Sun Pro CC, IBM Visual Age, and HP aCC support. #if defined(__GNUC__) || defined(_MSC_VER) || defined(__SUNPRO_CC) || \ defined(__IBMCPP__) || defined(__HP_aCC) -# define GTEST_HAS_TYPED_TEST 1 -# define GTEST_HAS_TYPED_TEST_P 1 +#define GTEST_HAS_TYPED_TEST 1 +#define GTEST_HAS_TYPED_TEST_P 1 #endif // Determines whether the system compiler uses UTF-16 for encoding wide strings. @@ -627,8 +615,9 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION; // Determines whether test results can be streamed to a socket. #if GTEST_OS_LINUX || GTEST_OS_GNU_KFREEBSD || GTEST_OS_DRAGONFLY || \ - GTEST_OS_FREEBSD || GTEST_OS_NETBSD || GTEST_OS_OPENBSD -# define GTEST_CAN_STREAM_RESULTS_ 1 + GTEST_OS_FREEBSD || GTEST_OS_NETBSD || GTEST_OS_OPENBSD || \ + GTEST_OS_GNU_HURD +#define GTEST_CAN_STREAM_RESULTS_ 1 #endif // Defines some utility macros. @@ -642,9 +631,12 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION; // // The "switch (0) case 0:" idiom is used to suppress this. #ifdef __INTEL_COMPILER -# define GTEST_AMBIGUOUS_ELSE_BLOCKER_ +#define GTEST_AMBIGUOUS_ELSE_BLOCKER_ #else -# define GTEST_AMBIGUOUS_ELSE_BLOCKER_ switch (0) case 0: default: // NOLINT +#define GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + switch (0) \ + case 0: \ + default: // NOLINT #endif // Use this annotation at the end of a struct/class definition to @@ -659,55 +651,32 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION; // Also use it after a variable or parameter declaration to tell the // compiler the variable/parameter does not have to be used. #if defined(__GNUC__) && !defined(COMPILER_ICC) -# define GTEST_ATTRIBUTE_UNUSED_ __attribute__ ((unused)) +#define GTEST_ATTRIBUTE_UNUSED_ __attribute__((unused)) #elif defined(__clang__) -# if __has_attribute(unused) -# define GTEST_ATTRIBUTE_UNUSED_ __attribute__ ((unused)) -# endif +#if __has_attribute(unused) +#define GTEST_ATTRIBUTE_UNUSED_ __attribute__((unused)) +#endif #endif #ifndef GTEST_ATTRIBUTE_UNUSED_ -# define GTEST_ATTRIBUTE_UNUSED_ +#define GTEST_ATTRIBUTE_UNUSED_ #endif // Use this annotation before a function that takes a printf format string. #if (defined(__GNUC__) || defined(__clang__)) && !defined(COMPILER_ICC) -# if defined(__MINGW_PRINTF_FORMAT) +#if defined(__MINGW_PRINTF_FORMAT) // MinGW has two different printf implementations. Ensure the format macro // matches the selected implementation. See // https://sourceforge.net/p/mingw-w64/wiki2/gnu%20printf/. -# define GTEST_ATTRIBUTE_PRINTF_(string_index, first_to_check) \ - __attribute__((__format__(__MINGW_PRINTF_FORMAT, string_index, \ - first_to_check))) -# else -# define GTEST_ATTRIBUTE_PRINTF_(string_index, first_to_check) \ - __attribute__((__format__(__printf__, string_index, first_to_check))) -# endif +#define GTEST_ATTRIBUTE_PRINTF_(string_index, first_to_check) \ + __attribute__(( \ + __format__(__MINGW_PRINTF_FORMAT, string_index, first_to_check))) #else -# define GTEST_ATTRIBUTE_PRINTF_(string_index, first_to_check) +#define GTEST_ATTRIBUTE_PRINTF_(string_index, first_to_check) \ + __attribute__((__format__(__printf__, string_index, first_to_check))) +#endif +#else +#define GTEST_ATTRIBUTE_PRINTF_(string_index, first_to_check) #endif - - -// A macro to disallow copy operator= -// This should be used in the private: declarations for a class. -#define GTEST_DISALLOW_ASSIGN_(type) \ - type& operator=(type const &) = delete - -// A macro to disallow copy constructor and operator= -// This should be used in the private: declarations for a class. -#define GTEST_DISALLOW_COPY_AND_ASSIGN_(type) \ - type(type const&) = delete; \ - type& operator=(type const&) = delete - -// A macro to disallow move operator= -// This should be used in the private: declarations for a class. -#define GTEST_DISALLOW_MOVE_ASSIGN_(type) \ - type& operator=(type &&) noexcept = delete - -// A macro to disallow move constructor and operator= -// This should be used in the private: declarations for a class. -#define GTEST_DISALLOW_MOVE_AND_ASSIGN_(type) \ - type(type&&) noexcept = delete; \ - type& operator=(type&&) noexcept = delete // Tell the compiler to warn about unused return values for functions declared // with this macro. The macro should be used on function declarations @@ -715,9 +684,9 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION; // // Sprocket* AllocateSprocket() GTEST_MUST_USE_RESULT_; #if defined(__GNUC__) && !defined(COMPILER_ICC) -# define GTEST_MUST_USE_RESULT_ __attribute__ ((warn_unused_result)) +#define GTEST_MUST_USE_RESULT_ __attribute__((warn_unused_result)) #else -# define GTEST_MUST_USE_RESULT_ +#define GTEST_MUST_USE_RESULT_ #endif // __GNUC__ && !COMPILER_ICC // MS C++ compiler emits warning when a conditional expression is compile time @@ -728,10 +697,9 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION; // while (true) { // GTEST_INTENTIONAL_CONST_COND_POP_() // } -# define GTEST_INTENTIONAL_CONST_COND_PUSH_() \ - GTEST_DISABLE_MSC_WARNINGS_PUSH_(4127) -# define GTEST_INTENTIONAL_CONST_COND_POP_() \ - GTEST_DISABLE_MSC_WARNINGS_POP_() +#define GTEST_INTENTIONAL_CONST_COND_PUSH_() \ + GTEST_DISABLE_MSC_WARNINGS_PUSH_(4127) +#define GTEST_INTENTIONAL_CONST_COND_POP_() GTEST_DISABLE_MSC_WARNINGS_POP_() // Determine whether the compiler supports Microsoft's Structured Exception // Handling. This is supported by several Windows compilers but generally @@ -739,13 +707,13 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION; #ifndef GTEST_HAS_SEH // The user didn't tell us, so we need to figure it out. -# if defined(_MSC_VER) || defined(__BORLANDC__) +#if defined(_MSC_VER) || defined(__BORLANDC__) // These two compilers are known to support SEH. -# define GTEST_HAS_SEH 1 -# else +#define GTEST_HAS_SEH 1 +#else // Assume no SEH. -# define GTEST_HAS_SEH 0 -# endif +#define GTEST_HAS_SEH 0 +#endif #endif // GTEST_HAS_SEH @@ -758,94 +726,112 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION; #endif // GTEST_IS_THREADSAFE +#if GTEST_IS_THREADSAFE +// Some platforms don't support including these threading related headers. +#include // NOLINT +#include // NOLINT +#endif // GTEST_IS_THREADSAFE + // GTEST_API_ qualifies all symbols that must be exported. The definitions below // are guarded by #ifndef to give embedders a chance to define GTEST_API_ in // gtest/internal/custom/gtest-port.h #ifndef GTEST_API_ #ifdef _MSC_VER -# if GTEST_LINKED_AS_SHARED_LIBRARY -# define GTEST_API_ __declspec(dllimport) -# elif GTEST_CREATE_SHARED_LIBRARY -# define GTEST_API_ __declspec(dllexport) -# endif +#if GTEST_LINKED_AS_SHARED_LIBRARY +#define GTEST_API_ __declspec(dllimport) +#elif GTEST_CREATE_SHARED_LIBRARY +#define GTEST_API_ __declspec(dllexport) +#endif #elif __GNUC__ >= 4 || defined(__clang__) -# define GTEST_API_ __attribute__((visibility ("default"))) +#define GTEST_API_ __attribute__((visibility("default"))) #endif // _MSC_VER #endif // GTEST_API_ #ifndef GTEST_API_ -# define GTEST_API_ +#define GTEST_API_ #endif // GTEST_API_ #ifndef GTEST_DEFAULT_DEATH_TEST_STYLE -# define GTEST_DEFAULT_DEATH_TEST_STYLE "fast" +#define GTEST_DEFAULT_DEATH_TEST_STYLE "fast" #endif // GTEST_DEFAULT_DEATH_TEST_STYLE #ifdef __GNUC__ // Ask the compiler to never inline a given function. -# define GTEST_NO_INLINE_ __attribute__((noinline)) +#define GTEST_NO_INLINE_ __attribute__((noinline)) #else -# define GTEST_NO_INLINE_ +#define GTEST_NO_INLINE_ +#endif + +#if defined(__clang__) +// Nested ifs to avoid triggering MSVC warning. +#if __has_attribute(disable_tail_calls) +// Ask the compiler not to perform tail call optimization inside +// the marked function. +#define GTEST_NO_TAIL_CALL_ __attribute__((disable_tail_calls)) +#endif +#elif __GNUC__ +#define GTEST_NO_TAIL_CALL_ \ + __attribute__((optimize("no-optimize-sibling-calls"))) +#else +#define GTEST_NO_TAIL_CALL_ #endif // _LIBCPP_VERSION is defined by the libc++ library from the LLVM project. #if !defined(GTEST_HAS_CXXABI_H_) -# if defined(__GLIBCXX__) || (defined(_LIBCPP_VERSION) && !defined(_MSC_VER)) -# define GTEST_HAS_CXXABI_H_ 1 -# else -# define GTEST_HAS_CXXABI_H_ 0 -# endif +#if defined(__GLIBCXX__) || (defined(_LIBCPP_VERSION) && !defined(_MSC_VER)) +#define GTEST_HAS_CXXABI_H_ 1 +#else +#define GTEST_HAS_CXXABI_H_ 0 +#endif #endif // A function level attribute to disable checking for use of uninitialized // memory when built with MemorySanitizer. #if defined(__clang__) -# if __has_feature(memory_sanitizer) -# define GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ \ - __attribute__((no_sanitize_memory)) -# else -# define GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ -# endif // __has_feature(memory_sanitizer) +#if __has_feature(memory_sanitizer) +#define GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ __attribute__((no_sanitize_memory)) +#else +#define GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ +#endif // __has_feature(memory_sanitizer) #else -# define GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ +#define GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ #endif // __clang__ // A function level attribute to disable AddressSanitizer instrumentation. #if defined(__clang__) -# if __has_feature(address_sanitizer) -# define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ \ - __attribute__((no_sanitize_address)) -# else -# define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ -# endif // __has_feature(address_sanitizer) +#if __has_feature(address_sanitizer) +#define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ \ + __attribute__((no_sanitize_address)) +#else +#define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ +#endif // __has_feature(address_sanitizer) #else -# define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ +#define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ #endif // __clang__ // A function level attribute to disable HWAddressSanitizer instrumentation. #if defined(__clang__) -# if __has_feature(hwaddress_sanitizer) -# define GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_ \ - __attribute__((no_sanitize("hwaddress"))) -# else -# define GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_ -# endif // __has_feature(hwaddress_sanitizer) +#if __has_feature(hwaddress_sanitizer) +#define GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_ \ + __attribute__((no_sanitize("hwaddress"))) #else -# define GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_ +#define GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_ +#endif // __has_feature(hwaddress_sanitizer) +#else +#define GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_ #endif // __clang__ // A function level attribute to disable ThreadSanitizer instrumentation. #if defined(__clang__) -# if __has_feature(thread_sanitizer) -# define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ \ - __attribute__((no_sanitize_thread)) -# else -# define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ -# endif // __has_feature(thread_sanitizer) +#if __has_feature(thread_sanitizer) +#define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ __attribute__((no_sanitize_thread)) +#else +#define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ +#endif // __has_feature(thread_sanitizer) #else -# define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ +#define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ #endif // __clang__ namespace testing { @@ -867,25 +853,37 @@ namespace internal { // Secret object, which is what we want. class Secret; -// The GTEST_COMPILE_ASSERT_ is a legacy macro used to verify that a compile -// time expression is true (in new code, use static_assert instead). For -// example, you could use it to verify the size of a static array: -// -// GTEST_COMPILE_ASSERT_(GTEST_ARRAY_SIZE_(names) == NUM_NAMES, -// names_incorrect_size); -// -// The second argument to the macro must be a valid C++ identifier. If the -// expression is false, compiler will issue an error containing this identifier. -#define GTEST_COMPILE_ASSERT_(expr, msg) static_assert(expr, #msg) - // A helper for suppressing warnings on constant condition. It just // returns 'condition'. GTEST_API_ bool IsTrue(bool condition); // Defines RE. -#if GTEST_USES_PCRE -// if used, PCRE is injected by custom/gtest-port.h +#if GTEST_USES_RE2 + +// This is almost `using RE = ::RE2`, except it is copy-constructible, and it +// needs to disambiguate the `std::string`, `absl::string_view`, and `const +// char*` constructors. +class GTEST_API_ RE { + public: + RE(absl::string_view regex) : regex_(regex) {} // NOLINT + RE(const char* regex) : RE(absl::string_view(regex)) {} // NOLINT + RE(const std::string& regex) : RE(absl::string_view(regex)) {} // NOLINT + RE(const RE& other) : RE(other.pattern()) {} + + const std::string& pattern() const { return regex_.pattern(); } + + static bool FullMatch(absl::string_view str, const RE& re) { + return RE2::FullMatch(str, re.regex_); + } + static bool PartialMatch(absl::string_view str, const RE& re) { + return RE2::PartialMatch(str, re.regex_); + } + + private: + RE2 regex_; +}; + #elif GTEST_USES_POSIX_RE || GTEST_USES_SIMPLE_RE // A simple C++ wrapper for . It uses the POSIX Extended @@ -924,19 +922,19 @@ class GTEST_API_ RE { const char* pattern_; bool is_valid_; -# if GTEST_USES_POSIX_RE +#if GTEST_USES_POSIX_RE regex_t full_regex_; // For FullMatch(). regex_t partial_regex_; // For PartialMatch(). -# else // GTEST_USES_SIMPLE_RE +#else // GTEST_USES_SIMPLE_RE const char* full_pattern_; // For FullMatch(); -# endif +#endif }; -#endif // GTEST_USES_PCRE +#endif // ::testing::internal::RE implementation // Formats a source file path and a line number as they would appear // in an error message from the compiler used to compile this code. @@ -954,12 +952,7 @@ GTEST_API_ ::std::string FormatCompilerIndependentFileLocation(const char* file, // LogToStderr() - directs all log messages to stderr. // FlushInfoLog() - flushes informational log messages. -enum GTestLogSeverity { - GTEST_INFO, - GTEST_WARNING, - GTEST_ERROR, - GTEST_FATAL -}; +enum GTestLogSeverity { GTEST_INFO, GTEST_WARNING, GTEST_ERROR, GTEST_FATAL }; // Formats log entry severity, provides a stream object for streaming the // log message, and terminates the message with a newline when going out of @@ -976,14 +969,16 @@ class GTEST_API_ GTestLog { private: const GTestLogSeverity severity_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(GTestLog); + GTestLog(const GTestLog&) = delete; + GTestLog& operator=(const GTestLog&) = delete; }; #if !defined(GTEST_LOG_) -# define GTEST_LOG_(severity) \ - ::testing::internal::GTestLog(::testing::internal::GTEST_##severity, \ - __FILE__, __LINE__).GetStream() +#define GTEST_LOG_(severity) \ + ::testing::internal::GTestLog(::testing::internal::GTEST_##severity, \ + __FILE__, __LINE__) \ + .GetStream() inline void LogToStderr() {} inline void FlushInfoLog() { fflush(nullptr); } @@ -995,7 +990,7 @@ inline void FlushInfoLog() { fflush(nullptr); } // // GTEST_CHECK_ is an all-mode assert. It aborts the program if the condition // is not satisfied. -// Synopsys: +// Synopsis: // GTEST_CHECK_(boolean_condition); // or // GTEST_CHECK_(boolean_condition) << "Additional message"; @@ -1005,12 +1000,12 @@ inline void FlushInfoLog() { fflush(nullptr); } // condition itself, plus additional message streamed into it, if any, // and then it aborts the program. It aborts the program irrespective of // whether it is built in the debug mode or not. -# define GTEST_CHECK_(condition) \ - GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ - if (::testing::internal::IsTrue(condition)) \ - ; \ - else \ - GTEST_LOG_(FATAL) << "Condition " #condition " failed. " +#define GTEST_CHECK_(condition) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (::testing::internal::IsTrue(condition)) \ + ; \ + else \ + GTEST_LOG_(FATAL) << "Condition " #condition " failed. " #endif // !defined(GTEST_CHECK_) // An all-mode assert to verify that the given POSIX-style function @@ -1019,9 +1014,8 @@ inline void FlushInfoLog() { fflush(nullptr); } // in {} if you need to use it as the only statement in an 'if' // branch. #define GTEST_CHECK_POSIX_SUCCESS_(posix_call) \ - if (const int gtest_error = (posix_call)) \ - GTEST_LOG_(FATAL) << #posix_call << "failed with error " \ - << gtest_error + if (const int gtest_error = (posix_call)) \ + GTEST_LOG_(FATAL) << #posix_call << "failed with error " << gtest_error // Transforms "T" into "const T&" according to standard reference collapsing // rules (this is only needed as a backport for C++98 compilers that do not @@ -1035,9 +1029,13 @@ inline void FlushInfoLog() { fflush(nullptr); } // Note that the non-const reference will not have "const" added. This is // standard, and necessary so that "T" can always bind to "const T&". template -struct ConstRef { typedef const T& type; }; +struct ConstRef { + typedef const T& type; +}; template -struct ConstRef { typedef T& type; }; +struct ConstRef { + typedef T& type; +}; // The argument T must depend on some template parameters. #define GTEST_REFERENCE_TO_CONST_(T) \ @@ -1050,7 +1048,7 @@ struct ConstRef { typedef T& type; }; // const Foo*). When you use ImplicitCast_, the compiler checks that // the cast is safe. Such explicit ImplicitCast_s are necessary in // surprisingly many situations where C++ demands an exact type match -// instead of an argument type convertable to a target type. +// instead of an argument type convertible to a target type. // // The syntax for using ImplicitCast_ is the same as for static_cast: // @@ -1063,8 +1061,10 @@ struct ConstRef { typedef T& type; }; // This relatively ugly name is intentional. It prevents clashes with // similar functions users may have (e.g., implicit_cast). The internal // namespace alone is not enough because the function can be found by ADL. -template -inline To ImplicitCast_(To x) { return x; } +template +inline To ImplicitCast_(To x) { + return x; +} // When you upcast (that is, cast a pointer from type Foo to type // SuperclassOfFoo), it's fine to use ImplicitCast_<>, since upcasts @@ -1087,17 +1087,17 @@ inline To ImplicitCast_(To x) { return x; } // This relatively ugly name is intentional. It prevents clashes with // similar functions users may have (e.g., down_cast). The internal // namespace alone is not enough because the function can be found by ADL. -template // use like this: DownCast_(foo); -inline To DownCast_(From* f) { // so we only accept pointers +template // use like this: DownCast_(foo); +inline To DownCast_(From* f) { // so we only accept pointers // Ensures that To is a sub-type of From *. This test is here only // for compile-time type checking, and has no overhead in an // optimized build at run-time, as it will be optimized away // completely. GTEST_INTENTIONAL_CONST_COND_PUSH_() if (false) { - GTEST_INTENTIONAL_CONST_COND_POP_() - const To to = nullptr; - ::testing::internal::ImplicitCast_(to); + GTEST_INTENTIONAL_CONST_COND_POP_() + const To to = nullptr; + ::testing::internal::ImplicitCast_(to); } #if GTEST_HAS_RTTI @@ -1162,71 +1162,8 @@ void ClearInjectableArgvs(); // Defines synchronization primitives. #if GTEST_IS_THREADSAFE -# if GTEST_HAS_PTHREAD -// Sleeps for (roughly) n milliseconds. This function is only for testing -// Google Test's own constructs. Don't use it in user tests, either -// directly or indirectly. -inline void SleepMilliseconds(int n) { - const timespec time = { - 0, // 0 seconds. - n * 1000L * 1000L, // And n ms. - }; - nanosleep(&time, nullptr); -} -# endif // GTEST_HAS_PTHREAD - -# if GTEST_HAS_NOTIFICATION_ -// Notification has already been imported into the namespace. -// Nothing to do here. - -# elif GTEST_HAS_PTHREAD -// Allows a controller thread to pause execution of newly created -// threads until notified. Instances of this class must be created -// and destroyed in the controller thread. -// -// This class is only for testing Google Test's own constructs. Do not -// use it in user tests, either directly or indirectly. -class Notification { - public: - Notification() : notified_(false) { - GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_init(&mutex_, nullptr)); - } - ~Notification() { - pthread_mutex_destroy(&mutex_); - } - - // Notifies all threads created with this notification to start. Must - // be called from the controller thread. - void Notify() { - pthread_mutex_lock(&mutex_); - notified_ = true; - pthread_mutex_unlock(&mutex_); - } - - // Blocks until the controller thread notifies. Must be called from a test - // thread. - void WaitForNotification() { - for (;;) { - pthread_mutex_lock(&mutex_); - const bool notified = notified_; - pthread_mutex_unlock(&mutex_); - if (notified) - break; - SleepMilliseconds(10); - } - } - - private: - pthread_mutex_t mutex_; - bool notified_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(Notification); -}; - -# elif GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT - -GTEST_API_ void SleepMilliseconds(int n); +#if GTEST_OS_WINDOWS // Provides leak-safe Windows kernel handle ownership. // Used in death tests and in threading support. class GTEST_API_ AutoHandle { @@ -1253,8 +1190,18 @@ class GTEST_API_ AutoHandle { Handle handle_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(AutoHandle); + AutoHandle(const AutoHandle&) = delete; + AutoHandle& operator=(const AutoHandle&) = delete; }; +#endif + +#if GTEST_HAS_NOTIFICATION_ +// Notification has already been imported into the namespace. +// Nothing to do here. + +#else +GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \ +/* class A needs to have dll-interface to be used by clients of class B */) // Allows a controller thread to pause execution of newly created // threads until notified. Instances of this class must be created @@ -1262,23 +1209,40 @@ class GTEST_API_ AutoHandle { // // This class is only for testing Google Test's own constructs. Do not // use it in user tests, either directly or indirectly. +// TODO(b/203539622): Replace unconditionally with absl::Notification. class GTEST_API_ Notification { public: - Notification(); - void Notify(); - void WaitForNotification(); + Notification() : notified_(false) {} + Notification(const Notification&) = delete; + Notification& operator=(const Notification&) = delete; - private: - AutoHandle event_; + // Notifies all threads created with this notification to start. Must + // be called from the controller thread. + void Notify() { + std::lock_guard lock(mu_); + notified_ = true; + cv_.notify_all(); + } - GTEST_DISALLOW_COPY_AND_ASSIGN_(Notification); + // Blocks until the controller thread notifies. Must be called from a test + // thread. + void WaitForNotification() { + std::unique_lock lock(mu_); + cv_.wait(lock, [this]() { return notified_; }); + } + + private: + std::mutex mu_; + std::condition_variable cv_; + bool notified_; }; -# endif // GTEST_HAS_NOTIFICATION_ +GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 +#endif // GTEST_HAS_NOTIFICATION_ // On MinGW, we can have both GTEST_OS_WINDOWS and GTEST_HAS_PTHREAD // defined, but we don't want to use MinGW's pthreads implementation, which // has conformance problems with some versions of the POSIX standard. -# if GTEST_HAS_PTHREAD && !GTEST_OS_WINDOWS_MINGW +#if GTEST_HAS_PTHREAD && !GTEST_OS_WINDOWS_MINGW // As a C-function, ThreadFuncWithCLinkage cannot be templated itself. // Consequently, it cannot select a correct instantiation of ThreadWithParam @@ -1354,16 +1318,17 @@ class ThreadWithParam : public ThreadWithParamBase { // finished. pthread_t thread_; // The native thread object. - GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadWithParam); + ThreadWithParam(const ThreadWithParam&) = delete; + ThreadWithParam& operator=(const ThreadWithParam&) = delete; }; -# endif // !GTEST_OS_WINDOWS && GTEST_HAS_PTHREAD || - // GTEST_HAS_MUTEX_AND_THREAD_LOCAL_ +#endif // !GTEST_OS_WINDOWS && GTEST_HAS_PTHREAD || + // GTEST_HAS_MUTEX_AND_THREAD_LOCAL_ -# if GTEST_HAS_MUTEX_AND_THREAD_LOCAL_ +#if GTEST_HAS_MUTEX_AND_THREAD_LOCAL_ // Mutex and ThreadLocal have already been imported into the namespace. // Nothing to do here. -# elif GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT +#elif GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT // Mutex implements mutex on Windows platforms. It is used in conjunction // with class MutexLock: @@ -1417,14 +1382,15 @@ class GTEST_API_ Mutex { long critical_section_init_phase_; // NOLINT GTEST_CRITICAL_SECTION* critical_section_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(Mutex); + Mutex(const Mutex&) = delete; + Mutex& operator=(const Mutex&) = delete; }; -# define GTEST_DECLARE_STATIC_MUTEX_(mutex) \ - extern ::testing::internal::Mutex mutex +#define GTEST_DECLARE_STATIC_MUTEX_(mutex) \ + extern ::testing::internal::Mutex mutex -# define GTEST_DEFINE_STATIC_MUTEX_(mutex) \ - ::testing::internal::Mutex mutex(::testing::internal::Mutex::kStaticMutex) +#define GTEST_DEFINE_STATIC_MUTEX_(mutex) \ + ::testing::internal::Mutex mutex(::testing::internal::Mutex::kStaticMutex) // We cannot name this class MutexLock because the ctor declaration would // conflict with a macro named MutexLock, which is defined on some @@ -1433,15 +1399,15 @@ class GTEST_API_ Mutex { // "MutexLock l(&mu)". Hence the typedef trick below. class GTestMutexLock { public: - explicit GTestMutexLock(Mutex* mutex) - : mutex_(mutex) { mutex_->Lock(); } + explicit GTestMutexLock(Mutex* mutex) : mutex_(mutex) { mutex_->Lock(); } ~GTestMutexLock() { mutex_->Unlock(); } private: Mutex* const mutex_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(GTestMutexLock); + GTestMutexLock(const GTestMutexLock&) = delete; + GTestMutexLock& operator=(const GTestMutexLock&) = delete; }; typedef GTestMutexLock MutexLock; @@ -1468,7 +1434,8 @@ class ThreadLocalBase { virtual ~ThreadLocalBase() {} private: - GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadLocalBase); + ThreadLocalBase(const ThreadLocalBase&) = delete; + ThreadLocalBase& operator=(const ThreadLocalBase&) = delete; }; // Maps a thread to a set of ThreadLocals that have values instantiated on that @@ -1497,7 +1464,7 @@ class GTEST_API_ ThreadWithParamBase { virtual void Run() = 0; }; - ThreadWithParamBase(Runnable *runnable, Notification* thread_can_start); + ThreadWithParamBase(Runnable* runnable, Notification* thread_can_start); virtual ~ThreadWithParamBase(); private: @@ -1511,30 +1478,26 @@ class ThreadWithParam : public ThreadWithParamBase { typedef void UserThreadFunc(T); ThreadWithParam(UserThreadFunc* func, T param, Notification* thread_can_start) - : ThreadWithParamBase(new RunnableImpl(func, param), thread_can_start) { - } + : ThreadWithParamBase(new RunnableImpl(func, param), thread_can_start) {} virtual ~ThreadWithParam() {} private: class RunnableImpl : public Runnable { public: - RunnableImpl(UserThreadFunc* func, T param) - : func_(func), - param_(param) { - } + RunnableImpl(UserThreadFunc* func, T param) : func_(func), param_(param) {} virtual ~RunnableImpl() {} - virtual void Run() { - func_(param_); - } + virtual void Run() { func_(param_); } private: UserThreadFunc* const func_; const T param_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(RunnableImpl); + RunnableImpl(const RunnableImpl&) = delete; + RunnableImpl& operator=(const RunnableImpl&) = delete; }; - GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadWithParam); + ThreadWithParam(const ThreadWithParam&) = delete; + ThreadWithParam& operator=(const ThreadWithParam&) = delete; }; // Implements thread-local storage on Windows systems. @@ -1571,7 +1534,7 @@ class ThreadLocal : public ThreadLocalBase { explicit ThreadLocal(const T& value) : default_factory_(new InstanceValueHolderFactory(value)) {} - ~ThreadLocal() { ThreadLocalRegistry::OnThreadLocalDestroyed(this); } + ~ThreadLocal() override { ThreadLocalRegistry::OnThreadLocalDestroyed(this); } T* pointer() { return GetOrCreateValue(); } const T* pointer() const { return GetOrCreateValue(); } @@ -1590,16 +1553,17 @@ class ThreadLocal : public ThreadLocalBase { private: T value_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(ValueHolder); + ValueHolder(const ValueHolder&) = delete; + ValueHolder& operator=(const ValueHolder&) = delete; }; - T* GetOrCreateValue() const { return static_cast( - ThreadLocalRegistry::GetValueOnCurrentThread(this))->pointer(); + ThreadLocalRegistry::GetValueOnCurrentThread(this)) + ->pointer(); } - virtual ThreadLocalValueHolderBase* NewValueForCurrentThread() const { + ThreadLocalValueHolderBase* NewValueForCurrentThread() const override { return default_factory_->MakeNewHolder(); } @@ -1610,7 +1574,8 @@ class ThreadLocal : public ThreadLocalBase { virtual ValueHolder* MakeNewHolder() const = 0; private: - GTEST_DISALLOW_COPY_AND_ASSIGN_(ValueHolderFactory); + ValueHolderFactory(const ValueHolderFactory&) = delete; + ValueHolderFactory& operator=(const ValueHolderFactory&) = delete; }; class DefaultValueHolderFactory : public ValueHolderFactory { @@ -1619,7 +1584,9 @@ class ThreadLocal : public ThreadLocalBase { ValueHolder* MakeNewHolder() const override { return new ValueHolder(); } private: - GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultValueHolderFactory); + DefaultValueHolderFactory(const DefaultValueHolderFactory&) = delete; + DefaultValueHolderFactory& operator=(const DefaultValueHolderFactory&) = + delete; }; class InstanceValueHolderFactory : public ValueHolderFactory { @@ -1632,15 +1599,18 @@ class ThreadLocal : public ThreadLocalBase { private: const T value_; // The value for each thread. - GTEST_DISALLOW_COPY_AND_ASSIGN_(InstanceValueHolderFactory); + InstanceValueHolderFactory(const InstanceValueHolderFactory&) = delete; + InstanceValueHolderFactory& operator=(const InstanceValueHolderFactory&) = + delete; }; std::unique_ptr default_factory_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadLocal); + ThreadLocal(const ThreadLocal&) = delete; + ThreadLocal& operator=(const ThreadLocal&) = delete; }; -# elif GTEST_HAS_PTHREAD +#elif GTEST_HAS_PTHREAD // MutexBase and Mutex implement mutex on pthreads-based platforms. class MutexBase { @@ -1687,8 +1657,8 @@ class MutexBase { }; // Forward-declares a static mutex. -# define GTEST_DECLARE_STATIC_MUTEX_(mutex) \ - extern ::testing::internal::MutexBase mutex +#define GTEST_DECLARE_STATIC_MUTEX_(mutex) \ + extern ::testing::internal::MutexBase mutex // Defines and statically (i.e. at link time) initializes a static mutex. // The initialization list here does not explicitly initialize each field, @@ -1707,12 +1677,11 @@ class Mutex : public MutexBase { GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_init(&mutex_, nullptr)); has_owner_ = false; } - ~Mutex() { - GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_destroy(&mutex_)); - } + ~Mutex() { GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_destroy(&mutex_)); } private: - GTEST_DISALLOW_COPY_AND_ASSIGN_(Mutex); + Mutex(const Mutex&) = delete; + Mutex& operator=(const Mutex&) = delete; }; // We cannot name this class MutexLock because the ctor declaration would @@ -1722,15 +1691,15 @@ class Mutex : public MutexBase { // "MutexLock l(&mu)". Hence the typedef trick below. class GTestMutexLock { public: - explicit GTestMutexLock(MutexBase* mutex) - : mutex_(mutex) { mutex_->Lock(); } + explicit GTestMutexLock(MutexBase* mutex) : mutex_(mutex) { mutex_->Lock(); } ~GTestMutexLock() { mutex_->Unlock(); } private: MutexBase* const mutex_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(GTestMutexLock); + GTestMutexLock(const GTestMutexLock&) = delete; + GTestMutexLock& operator=(const GTestMutexLock&) = delete; }; typedef GTestMutexLock MutexLock; @@ -1787,7 +1756,8 @@ class GTEST_API_ ThreadLocal { private: T value_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(ValueHolder); + ValueHolder(const ValueHolder&) = delete; + ValueHolder& operator=(const ValueHolder&) = delete; }; static pthread_key_t CreateKey() { @@ -1819,7 +1789,8 @@ class GTEST_API_ ThreadLocal { virtual ValueHolder* MakeNewHolder() const = 0; private: - GTEST_DISALLOW_COPY_AND_ASSIGN_(ValueHolderFactory); + ValueHolderFactory(const ValueHolderFactory&) = delete; + ValueHolderFactory& operator=(const ValueHolderFactory&) = delete; }; class DefaultValueHolderFactory : public ValueHolderFactory { @@ -1828,7 +1799,9 @@ class GTEST_API_ ThreadLocal { ValueHolder* MakeNewHolder() const override { return new ValueHolder(); } private: - GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultValueHolderFactory); + DefaultValueHolderFactory(const DefaultValueHolderFactory&) = delete; + DefaultValueHolderFactory& operator=(const DefaultValueHolderFactory&) = + delete; }; class InstanceValueHolderFactory : public ValueHolderFactory { @@ -1841,17 +1814,20 @@ class GTEST_API_ ThreadLocal { private: const T value_; // The value for each thread. - GTEST_DISALLOW_COPY_AND_ASSIGN_(InstanceValueHolderFactory); + InstanceValueHolderFactory(const InstanceValueHolderFactory&) = delete; + InstanceValueHolderFactory& operator=(const InstanceValueHolderFactory&) = + delete; }; // A key pthreads uses for looking up per-thread values. const pthread_key_t key_; std::unique_ptr default_factory_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadLocal); + ThreadLocal(const ThreadLocal&) = delete; + ThreadLocal& operator=(const ThreadLocal&) = delete; }; -# endif // GTEST_HAS_MUTEX_AND_THREAD_LOCAL_ +#endif // GTEST_HAS_MUTEX_AND_THREAD_LOCAL_ #else // GTEST_IS_THREADSAFE @@ -1868,10 +1844,10 @@ class Mutex { void AssertHeld() const {} }; -# define GTEST_DECLARE_STATIC_MUTEX_(mutex) \ +#define GTEST_DECLARE_STATIC_MUTEX_(mutex) \ extern ::testing::internal::Mutex mutex -# define GTEST_DEFINE_STATIC_MUTEX_(mutex) ::testing::internal::Mutex mutex +#define GTEST_DEFINE_STATIC_MUTEX_(mutex) ::testing::internal::Mutex mutex // We cannot name this class MutexLock because the ctor declaration would // conflict with a macro named MutexLock, which is defined on some @@ -1894,6 +1870,7 @@ class GTEST_API_ ThreadLocal { const T* pointer() const { return &value_; } const T& get() const { return value_; } void set(const T& value) { value_ = value; } + private: T value_; }; @@ -1905,11 +1882,11 @@ class GTEST_API_ ThreadLocal { GTEST_API_ size_t GetThreadCount(); #if GTEST_OS_WINDOWS -# define GTEST_PATH_SEP_ "\\" -# define GTEST_HAS_ALT_PATH_SEP_ 1 +#define GTEST_PATH_SEP_ "\\" +#define GTEST_HAS_ALT_PATH_SEP_ 1 #else -# define GTEST_PATH_SEP_ "/" -# define GTEST_HAS_ALT_PATH_SEP_ 0 +#define GTEST_PATH_SEP_ "/" +#define GTEST_HAS_ALT_PATH_SEP_ 0 #endif // GTEST_OS_WINDOWS // Utilities for char. @@ -1967,8 +1944,7 @@ inline char ToUpper(char ch) { inline std::string StripTrailingSpaces(std::string str) { std::string::iterator it = str.end(); - while (it != str.begin() && IsSpace(*--it)) - it = str.erase(it); + while (it != str.begin() && IsSpace(*--it)) it = str.erase(it); return str; } @@ -1986,36 +1962,35 @@ namespace posix { typedef struct _stat StatStruct; -# ifdef __BORLANDC__ +#ifdef __BORLANDC__ inline int DoIsATTY(int fd) { return isatty(fd); } inline int StrCaseCmp(const char* s1, const char* s2) { return stricmp(s1, s2); } inline char* StrDup(const char* src) { return strdup(src); } -# else // !__BORLANDC__ -# if GTEST_OS_WINDOWS_MOBILE +#else // !__BORLANDC__ +#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_ZOS || GTEST_OS_IOS || \ + GTEST_OS_WINDOWS_PHONE || GTEST_OS_WINDOWS_RT || defined(ESP_PLATFORM) inline int DoIsATTY(int /* fd */) { return 0; } -# else +#else inline int DoIsATTY(int fd) { return _isatty(fd); } -# endif // GTEST_OS_WINDOWS_MOBILE +#endif // GTEST_OS_WINDOWS_MOBILE inline int StrCaseCmp(const char* s1, const char* s2) { return _stricmp(s1, s2); } inline char* StrDup(const char* src) { return _strdup(src); } -# endif // __BORLANDC__ +#endif // __BORLANDC__ -# if GTEST_OS_WINDOWS_MOBILE +#if GTEST_OS_WINDOWS_MOBILE inline int FileNo(FILE* file) { return reinterpret_cast(_fileno(file)); } // Stat(), RmDir(), and IsDir() are not needed on Windows CE at this // time and thus not defined there. -# else +#else inline int FileNo(FILE* file) { return _fileno(file); } inline int Stat(const char* path, StatStruct* buf) { return _stat(path, buf); } inline int RmDir(const char* dir) { return _rmdir(dir); } -inline bool IsDir(const StatStruct& st) { - return (_S_IFDIR & st.st_mode) != 0; -} -# endif // GTEST_OS_WINDOWS_MOBILE +inline bool IsDir(const StatStruct& st) { return (_S_IFDIR & st.st_mode) != 0; } +#endif // GTEST_OS_WINDOWS_MOBILE #elif GTEST_OS_ESP8266 typedef struct stat StatStruct; @@ -2079,12 +2054,12 @@ inline FILE* FOpen(const char* path, const char* mode) { std::wstring wide_path = converter.from_bytes(path); std::wstring wide_mode = converter.from_bytes(mode); return _wfopen(wide_path.c_str(), wide_mode.c_str()); -#else // GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MINGW +#else // GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MINGW return fopen(path, mode); #endif // GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MINGW } #if !GTEST_OS_WINDOWS_MOBILE -inline FILE *FReopen(const char* path, const char* mode, FILE* stream) { +inline FILE* FReopen(const char* path, const char* mode, FILE* stream) { return freopen(path, mode, stream); } inline FILE* FDOpen(int fd, const char* mode) { return fdopen(fd, mode); } @@ -2136,13 +2111,13 @@ GTEST_DISABLE_MSC_DEPRECATED_POP_() // snprintf is a variadic function. #if _MSC_VER && !GTEST_OS_WINDOWS_MOBILE // MSVC 2005 and above support variadic macros. -# define GTEST_SNPRINTF_(buffer, size, format, ...) \ - _snprintf_s(buffer, size, size, format, __VA_ARGS__) +#define GTEST_SNPRINTF_(buffer, size, format, ...) \ + _snprintf_s(buffer, size, size, format, __VA_ARGS__) #elif defined(_MSC_VER) // Windows CE does not define _snprintf_s -# define GTEST_SNPRINTF_ _snprintf +#define GTEST_SNPRINTF_ _snprintf #else -# define GTEST_SNPRINTF_ snprintf +#define GTEST_SNPRINTF_ snprintf #endif // The biggest signed integer type the compiler supports. @@ -2202,37 +2177,84 @@ using TimeInMillis = int64_t; // Represents time in milliseconds. // Macro for referencing flags. #if !defined(GTEST_FLAG) -# define GTEST_FLAG(name) FLAGS_gtest_##name +#define GTEST_FLAG_NAME_(name) gtest_##name +#define GTEST_FLAG(name) FLAGS_gtest_##name #endif // !defined(GTEST_FLAG) -#if !defined(GTEST_USE_OWN_FLAGFILE_FLAG_) -# define GTEST_USE_OWN_FLAGFILE_FLAG_ 1 -#endif // !defined(GTEST_USE_OWN_FLAGFILE_FLAG_) +// Pick a command line flags implementation. +#if GTEST_HAS_ABSL -#if !defined(GTEST_DECLARE_bool_) -# define GTEST_FLAG_SAVER_ ::testing::internal::GTestFlagSaver +// Macros for defining flags. +#define GTEST_DEFINE_bool_(name, default_val, doc) \ + ABSL_FLAG(bool, GTEST_FLAG_NAME_(name), default_val, doc) +#define GTEST_DEFINE_int32_(name, default_val, doc) \ + ABSL_FLAG(int32_t, GTEST_FLAG_NAME_(name), default_val, doc) +#define GTEST_DEFINE_string_(name, default_val, doc) \ + ABSL_FLAG(std::string, GTEST_FLAG_NAME_(name), default_val, doc) // Macros for declaring flags. -# define GTEST_DECLARE_bool_(name) GTEST_API_ extern bool GTEST_FLAG(name) -# define GTEST_DECLARE_int32_(name) \ - GTEST_API_ extern std::int32_t GTEST_FLAG(name) -# define GTEST_DECLARE_string_(name) \ - GTEST_API_ extern ::std::string GTEST_FLAG(name) +#define GTEST_DECLARE_bool_(name) \ + ABSL_DECLARE_FLAG(bool, GTEST_FLAG_NAME_(name)) +#define GTEST_DECLARE_int32_(name) \ + ABSL_DECLARE_FLAG(int32_t, GTEST_FLAG_NAME_(name)) +#define GTEST_DECLARE_string_(name) \ + ABSL_DECLARE_FLAG(std::string, GTEST_FLAG_NAME_(name)) + +#define GTEST_FLAG_SAVER_ ::absl::FlagSaver + +#define GTEST_FLAG_GET(name) ::absl::GetFlag(GTEST_FLAG(name)) +#define GTEST_FLAG_SET(name, value) \ + (void)(::absl::SetFlag(>EST_FLAG(name), value)) +#define GTEST_USE_OWN_FLAGFILE_FLAG_ 0 + +#else // GTEST_HAS_ABSL // Macros for defining flags. -# define GTEST_DEFINE_bool_(name, default_val, doc) \ - GTEST_API_ bool GTEST_FLAG(name) = (default_val) -# define GTEST_DEFINE_int32_(name, default_val, doc) \ - GTEST_API_ std::int32_t GTEST_FLAG(name) = (default_val) -# define GTEST_DEFINE_string_(name, default_val, doc) \ - GTEST_API_ ::std::string GTEST_FLAG(name) = (default_val) +#define GTEST_DEFINE_bool_(name, default_val, doc) \ + namespace testing { \ + GTEST_API_ bool GTEST_FLAG(name) = (default_val); \ + } \ + static_assert(true, "no-op to require trailing semicolon") +#define GTEST_DEFINE_int32_(name, default_val, doc) \ + namespace testing { \ + GTEST_API_ std::int32_t GTEST_FLAG(name) = (default_val); \ + } \ + static_assert(true, "no-op to require trailing semicolon") +#define GTEST_DEFINE_string_(name, default_val, doc) \ + namespace testing { \ + GTEST_API_ ::std::string GTEST_FLAG(name) = (default_val); \ + } \ + static_assert(true, "no-op to require trailing semicolon") -#endif // !defined(GTEST_DECLARE_bool_) +// Macros for declaring flags. +#define GTEST_DECLARE_bool_(name) \ + namespace testing { \ + GTEST_API_ extern bool GTEST_FLAG(name); \ + } \ + static_assert(true, "no-op to require trailing semicolon") +#define GTEST_DECLARE_int32_(name) \ + namespace testing { \ + GTEST_API_ extern std::int32_t GTEST_FLAG(name); \ + } \ + static_assert(true, "no-op to require trailing semicolon") +#define GTEST_DECLARE_string_(name) \ + namespace testing { \ + GTEST_API_ extern ::std::string GTEST_FLAG(name); \ + } \ + static_assert(true, "no-op to require trailing semicolon") + +#define GTEST_FLAG_SAVER_ ::testing::internal::GTestFlagSaver + +#define GTEST_FLAG_GET(name) ::testing::GTEST_FLAG(name) +#define GTEST_FLAG_SET(name, value) (void)(::testing::GTEST_FLAG(name) = value) +#define GTEST_USE_OWN_FLAGFILE_FLAG_ 1 + +#endif // GTEST_HAS_ABSL // Thread annotations #if !defined(GTEST_EXCLUSIVE_LOCK_REQUIRED_) -# define GTEST_EXCLUSIVE_LOCK_REQUIRED_(locks) -# define GTEST_LOCK_EXCLUDED_(locks) +#define GTEST_EXCLUSIVE_LOCK_REQUIRED_(locks) +#define GTEST_LOCK_EXCLUDED_(locks) #endif // !defined(GTEST_EXCLUSIVE_LOCK_REQUIRED_) // Parses 'str' for a 32-bit signed integer. If successful, writes the result @@ -2308,6 +2330,7 @@ namespace testing { namespace internal { template using Optional = ::absl::optional; +inline ::absl::nullopt_t Nullopt() { return ::absl::nullopt; } } // namespace internal } // namespace testing #else @@ -2321,6 +2344,7 @@ namespace testing { namespace internal { template using Optional = ::std::optional; +inline ::std::nullopt_t Nullopt() { return ::std::nullopt; } } // namespace internal } // namespace testing // The case where absl is configured NOT to alias std::optional is not @@ -2332,7 +2356,7 @@ using Optional = ::std::optional; #if GTEST_HAS_ABSL // Always use absl::string_view for Matcher<> specializations if googletest // is built with absl support. -# define GTEST_INTERNAL_HAS_STRING_VIEW 1 +#define GTEST_INTERNAL_HAS_STRING_VIEW 1 #include "absl/strings/string_view.h" namespace testing { namespace internal { @@ -2340,11 +2364,11 @@ using StringView = ::absl::string_view; } // namespace internal } // namespace testing #else -# ifdef __has_include -# if __has_include() && __cplusplus >= 201703L +#ifdef __has_include +#if __has_include() && __cplusplus >= 201703L // Otherwise for C++17 and higher use std::string_view for Matcher<> // specializations. -# define GTEST_INTERNAL_HAS_STRING_VIEW 1 +#define GTEST_INTERNAL_HAS_STRING_VIEW 1 #include namespace testing { namespace internal { @@ -2353,8 +2377,8 @@ using StringView = ::std::string_view; } // namespace testing // The case where absl is configured NOT to alias std::string_view is not // supported. -# endif // __has_include() && __cplusplus >= 201703L -# endif // __has_include +#endif // __has_include() && __cplusplus >= 201703L +#endif // __has_include #endif // GTEST_HAS_ABSL #if GTEST_HAS_ABSL diff --git a/third_party/googletest/src/include/gtest/internal/gtest-string.h b/third_party/googletest/src/include/gtest/internal/gtest-string.h index 10f774f966..cca2e1f2ad 100644 --- a/third_party/googletest/src/include/gtest/internal/gtest-string.h +++ b/third_party/googletest/src/include/gtest/internal/gtest-string.h @@ -26,7 +26,7 @@ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// + // The Google C++ Testing and Mocking Framework (Google Test) // // This header file declares the String class and functions used internally by @@ -36,17 +36,20 @@ // This header file is #included by gtest-internal.h. // It should not be #included by other files. -// GOOGLETEST_CM0001 DO NOT DELETE +// IWYU pragma: private, include "gtest/gtest.h" +// IWYU pragma: friend gtest/.* +// IWYU pragma: friend gmock/.* #ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_ #define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_ #ifdef __BORLANDC__ // string.h is not guaranteed to provide strcpy on C++ Builder. -# include +#include #endif #include + #include #include @@ -123,8 +126,7 @@ class GTEST_API_ String { // Unlike strcasecmp(), this function can handle NULL argument(s). // A NULL C string is considered different to any non-NULL C string, // including the empty string. - static bool CaseInsensitiveCStringEquals(const char* lhs, - const char* rhs); + static bool CaseInsensitiveCStringEquals(const char* lhs, const char* rhs); // Compares two wide C strings, ignoring case. Returns true if and only if // they have the same content. @@ -143,8 +145,8 @@ class GTEST_API_ String { // Returns true if and only if the given string ends with the given suffix, // ignoring case. Any string is considered to end with an empty suffix. - static bool EndsWithCaseInsensitive( - const std::string& str, const std::string& suffix); + static bool EndsWithCaseInsensitive(const std::string& str, + const std::string& suffix); // Formats an int value as "%02d". static std::string FormatIntWidth2(int value); // "%02d" for width == 2 @@ -163,7 +165,7 @@ class GTEST_API_ String { private: String(); // Not meant to be instantiated. -}; // class String +}; // class String // Gets the content of the stringstream's buffer as an std::string. Each '\0' // character in the buffer is replaced with "\\0". diff --git a/third_party/googletest/src/include/gtest/internal/gtest-type-util.h b/third_party/googletest/src/include/gtest/internal/gtest-type-util.h index b87a2e2cac..6bc02a7de3 100644 --- a/third_party/googletest/src/include/gtest/internal/gtest-type-util.h +++ b/third_party/googletest/src/include/gtest/internal/gtest-type-util.h @@ -30,7 +30,9 @@ // Type utilities needed for implementing typed and type-parameterized // tests. -// GOOGLETEST_CM0001 DO NOT DELETE +// IWYU pragma: private, include "gtest/gtest.h" +// IWYU pragma: friend gtest/.* +// IWYU pragma: friend gmock/.* #ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_ #define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_ @@ -39,11 +41,11 @@ // #ifdef __GNUC__ is too general here. It is possible to use gcc without using // libstdc++ (which is where cxxabi.h comes from). -# if GTEST_HAS_CXXABI_H_ -# include -# elif defined(__HP_aCC) -# include -# endif // GTEST_HASH_CXXABI_H_ +#if GTEST_HAS_CXXABI_H_ +#include +#elif defined(__HP_aCC) +#include +#endif // GTEST_HASH_CXXABI_H_ namespace testing { namespace internal { @@ -101,7 +103,9 @@ std::string GetTypeName() { // A unique type indicating an empty node struct None {}; -# define GTEST_TEMPLATE_ template class +#define GTEST_TEMPLATE_ \ + template \ + class // The template "selector" struct TemplateSel is used to // represent Tmpl, which must be a class template with one type @@ -119,8 +123,7 @@ struct TemplateSel { }; }; -# define GTEST_BIND_(TmplSel, T) \ - TmplSel::template Bind::type +#define GTEST_BIND_(TmplSel, T) TmplSel::template Bind::type template struct Templates { diff --git a/third_party/googletest/src/src/gtest-all.cc b/third_party/googletest/src/src/gtest-all.cc index ad292905cf..2a70ed88c7 100644 --- a/third_party/googletest/src/src/gtest-all.cc +++ b/third_party/googletest/src/src/gtest-all.cc @@ -38,7 +38,7 @@ #include "gtest/gtest.h" // The following lines pull in the real gtest *.cc files. -#include "src/gtest.cc" +#include "src/gtest-assertion-result.cc" #include "src/gtest-death-test.cc" #include "src/gtest-filepath.cc" #include "src/gtest-matchers.cc" @@ -46,3 +46,4 @@ #include "src/gtest-printers.cc" #include "src/gtest-test-part.cc" #include "src/gtest-typed-test.cc" +#include "src/gtest.cc" diff --git a/third_party/googletest/src/src/gtest-assertion-result.cc b/third_party/googletest/src/src/gtest-assertion-result.cc new file mode 100644 index 0000000000..f1c0b10dc9 --- /dev/null +++ b/third_party/googletest/src/src/gtest-assertion-result.cc @@ -0,0 +1,77 @@ +// Copyright 2005, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// The Google C++ Testing and Mocking Framework (Google Test) +// +// This file defines the AssertionResult type. + +#include "gtest/gtest-assertion-result.h" + +#include +#include + +#include "gtest/gtest-message.h" + +namespace testing { + +// AssertionResult constructors. +// Used in EXPECT_TRUE/FALSE(assertion_result). +AssertionResult::AssertionResult(const AssertionResult& other) + : success_(other.success_), + message_(other.message_.get() != nullptr + ? new ::std::string(*other.message_) + : static_cast< ::std::string*>(nullptr)) {} + +// Swaps two AssertionResults. +void AssertionResult::swap(AssertionResult& other) { + using std::swap; + swap(success_, other.success_); + swap(message_, other.message_); +} + +// Returns the assertion's negation. Used with EXPECT/ASSERT_FALSE. +AssertionResult AssertionResult::operator!() const { + AssertionResult negation(!success_); + if (message_.get() != nullptr) negation << *message_; + return negation; +} + +// Makes a successful assertion result. +AssertionResult AssertionSuccess() { return AssertionResult(true); } + +// Makes a failed assertion result. +AssertionResult AssertionFailure() { return AssertionResult(false); } + +// Makes a failed assertion result with the given failure message. +// Deprecated; use AssertionFailure() << message. +AssertionResult AssertionFailure(const Message& message) { + return AssertionFailure() << message; +} + +} // namespace testing diff --git a/third_party/googletest/src/src/gtest-death-test.cc b/third_party/googletest/src/src/gtest-death-test.cc index bf4f6331da..e6abc6278a 100644 --- a/third_party/googletest/src/src/gtest-death-test.cc +++ b/third_party/googletest/src/src/gtest-death-test.cc @@ -35,49 +35,49 @@ #include #include -#include "gtest/internal/gtest-port.h" #include "gtest/internal/custom/gtest.h" +#include "gtest/internal/gtest-port.h" #if GTEST_HAS_DEATH_TEST -# if GTEST_OS_MAC -# include -# endif // GTEST_OS_MAC - -# include -# include -# include - -# if GTEST_OS_LINUX -# include -# endif // GTEST_OS_LINUX - -# include - -# if GTEST_OS_WINDOWS -# include -# else -# include -# include -# endif // GTEST_OS_WINDOWS - -# if GTEST_OS_QNX -# include -# endif // GTEST_OS_QNX - -# if GTEST_OS_FUCHSIA -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include -# endif // GTEST_OS_FUCHSIA +#if GTEST_OS_MAC +#include +#endif // GTEST_OS_MAC + +#include +#include +#include + +#if GTEST_OS_LINUX +#include +#endif // GTEST_OS_LINUX + +#include + +#if GTEST_OS_WINDOWS +#include +#else +#include +#include +#endif // GTEST_OS_WINDOWS + +#if GTEST_OS_QNX +#include +#endif // GTEST_OS_QNX + +#if GTEST_OS_FUCHSIA +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#endif // GTEST_OS_FUCHSIA #endif // GTEST_HAS_DEATH_TEST @@ -96,9 +96,12 @@ namespace testing { // used internally at Google, is "threadsafe". static const char kDefaultDeathTestStyle[] = GTEST_DEFAULT_DEATH_TEST_STYLE; +} // namespace testing + GTEST_DEFINE_string_( death_test_style, - internal::StringFromGTestEnv("death_test_style", kDefaultDeathTestStyle), + testing::internal::StringFromGTestEnv("death_test_style", + testing::kDefaultDeathTestStyle), "Indicates how to run a death test in a forked child process: " "\"threadsafe\" (child process re-executes the test binary " "from the beginning, running only the specific death test) or " @@ -107,7 +110,7 @@ GTEST_DEFINE_string_( GTEST_DEFINE_bool_( death_test_use_fork, - internal::BoolFromGTestEnv("death_test_use_fork", false), + testing::internal::BoolFromGTestEnv("death_test_use_fork", false), "Instructs to use fork()/_exit() instead of clone() in death tests. " "Ignored and always uses fork() on POSIX systems where clone() is not " "implemented. Useful when running under valgrind or similar tools if " @@ -117,7 +120,6 @@ GTEST_DEFINE_bool_( "work in 99% of the cases. Once valgrind is fixed, this flag will " "most likely be removed."); -namespace internal { GTEST_DEFINE_string_( internal_run_death_test, "", "Indicates the file, line number, temporal index of " @@ -126,7 +128,8 @@ GTEST_DEFINE_string_( "the '|' characters. This flag is specified if and only if the " "current process is a sub-process launched for running a thread-safe " "death test. FOR INTERNAL USE ONLY."); -} // namespace internal + +namespace testing { #if GTEST_HAS_DEATH_TEST @@ -134,9 +137,9 @@ namespace internal { // Valid only for fast death tests. Indicates the code is running in the // child process of a fast style death test. -# if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA +#if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA static bool g_in_fast_death_test_child = false; -# endif +#endif // Returns a Boolean value indicating whether the caller is currently // executing in the context of the death test child process. Tools such as @@ -144,16 +147,16 @@ static bool g_in_fast_death_test_child = false; // tests. IMPORTANT: This is an internal utility. Using it may break the // implementation of death tests. User code MUST NOT use it. bool InDeathTestChild() { -# if GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA +#if GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA // On Windows and Fuchsia, death tests are thread-safe regardless of the value // of the death_test_style flag. - return !GTEST_FLAG(internal_run_death_test).empty(); + return !GTEST_FLAG_GET(internal_run_death_test).empty(); -# else +#else - if (GTEST_FLAG(death_test_style) == "threadsafe") - return !GTEST_FLAG(internal_run_death_test).empty(); + if (GTEST_FLAG_GET(death_test_style) == "threadsafe") + return !GTEST_FLAG_GET(internal_run_death_test).empty(); else return g_in_fast_death_test_child; #endif @@ -162,40 +165,38 @@ bool InDeathTestChild() { } // namespace internal // ExitedWithCode constructor. -ExitedWithCode::ExitedWithCode(int exit_code) : exit_code_(exit_code) { -} +ExitedWithCode::ExitedWithCode(int exit_code) : exit_code_(exit_code) {} // ExitedWithCode function-call operator. bool ExitedWithCode::operator()(int exit_status) const { -# if GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA +#if GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA return exit_status == exit_code_; -# else +#else return WIFEXITED(exit_status) && WEXITSTATUS(exit_status) == exit_code_; -# endif // GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA +#endif // GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA } -# if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA +#if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA // KilledBySignal constructor. -KilledBySignal::KilledBySignal(int signum) : signum_(signum) { -} +KilledBySignal::KilledBySignal(int signum) : signum_(signum) {} // KilledBySignal function-call operator. bool KilledBySignal::operator()(int exit_status) const { -# if defined(GTEST_KILLED_BY_SIGNAL_OVERRIDE_) +#if defined(GTEST_KILLED_BY_SIGNAL_OVERRIDE_) { bool result; if (GTEST_KILLED_BY_SIGNAL_OVERRIDE_(signum_, exit_status, &result)) { return result; } } -# endif // defined(GTEST_KILLED_BY_SIGNAL_OVERRIDE_) +#endif // defined(GTEST_KILLED_BY_SIGNAL_OVERRIDE_) return WIFSIGNALED(exit_status) && WTERMSIG(exit_status) == signum_; } -# endif // !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA +#endif // !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA namespace internal { @@ -206,23 +207,23 @@ namespace internal { static std::string ExitSummary(int exit_code) { Message m; -# if GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA +#if GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA m << "Exited with exit status " << exit_code; -# else +#else if (WIFEXITED(exit_code)) { m << "Exited with exit status " << WEXITSTATUS(exit_code); } else if (WIFSIGNALED(exit_code)) { m << "Terminated by signal " << WTERMSIG(exit_code); } -# ifdef WCOREDUMP +#ifdef WCOREDUMP if (WCOREDUMP(exit_code)) { m << " (core dumped)"; } -# endif -# endif // GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA +#endif +#endif // GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA return m.GetString(); } @@ -233,7 +234,7 @@ bool ExitedUnsuccessfully(int exit_status) { return !ExitedWithCode(0)(exit_status); } -# if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA +#if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA // Generates a textual failure message when a death test finds more than // one thread running, or cannot determine the number of threads, prior // to executing the given statement. It is the responsibility of the @@ -254,7 +255,7 @@ static std::string DeathTestThreadWarning(size_t thread_count) { << " this is the last message you see before your test times out."; return msg.GetString(); } -# endif // !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA +#endif // !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA // Flag characters for reporting a death test that did not die. static const char kDeathTestLived = 'L'; @@ -304,14 +305,14 @@ static void DeathTestAbort(const std::string& message) { // A replacement for CHECK that calls DeathTestAbort if the assertion // fails. -# define GTEST_DEATH_TEST_CHECK_(expression) \ - do { \ - if (!::testing::internal::IsTrue(expression)) { \ - DeathTestAbort( \ - ::std::string("CHECK failed: File ") + __FILE__ + ", line " \ - + ::testing::internal::StreamableToString(__LINE__) + ": " \ - + #expression); \ - } \ +#define GTEST_DEATH_TEST_CHECK_(expression) \ + do { \ + if (!::testing::internal::IsTrue(expression)) { \ + DeathTestAbort(::std::string("CHECK failed: File ") + __FILE__ + \ + ", line " + \ + ::testing::internal::StreamableToString(__LINE__) + \ + ": " + #expression); \ + } \ } while (::testing::internal::AlwaysFalse()) // This macro is similar to GTEST_DEATH_TEST_CHECK_, but it is meant for @@ -321,23 +322,23 @@ static void DeathTestAbort(const std::string& message) { // evaluates the expression as long as it evaluates to -1 and sets // errno to EINTR. If the expression evaluates to -1 but errno is // something other than EINTR, DeathTestAbort is called. -# define GTEST_DEATH_TEST_CHECK_SYSCALL_(expression) \ - do { \ - int gtest_retval; \ - do { \ - gtest_retval = (expression); \ - } while (gtest_retval == -1 && errno == EINTR); \ - if (gtest_retval == -1) { \ - DeathTestAbort( \ - ::std::string("CHECK failed: File ") + __FILE__ + ", line " \ - + ::testing::internal::StreamableToString(__LINE__) + ": " \ - + #expression + " != -1"); \ - } \ +#define GTEST_DEATH_TEST_CHECK_SYSCALL_(expression) \ + do { \ + int gtest_retval; \ + do { \ + gtest_retval = (expression); \ + } while (gtest_retval == -1 && errno == EINTR); \ + if (gtest_retval == -1) { \ + DeathTestAbort(::std::string("CHECK failed: File ") + __FILE__ + \ + ", line " + \ + ::testing::internal::StreamableToString(__LINE__) + \ + ": " + #expression + " != -1"); \ + } \ } while (::testing::internal::AlwaysFalse()) // Returns the message describing the last system error in errno. std::string GetLastErrnoDescription() { - return errno == 0 ? "" : posix::StrError(errno); + return errno == 0 ? "" : posix::StrError(errno); } // This is called from a death test parent process to read a failure @@ -370,8 +371,9 @@ static void FailFromInternalError(int fd) { DeathTest::DeathTest() { TestInfo* const info = GetUnitTestImpl()->current_test_info(); if (info == nullptr) { - DeathTestAbort("Cannot run a death test outside of a TEST or " - "TEST_F construct"); + DeathTestAbort( + "Cannot run a death test outside of a TEST or " + "TEST_F construct"); } } @@ -500,9 +502,7 @@ void DeathTestImpl::ReadAndInterpretStatusByte() { set_read_fd(-1); } -std::string DeathTestImpl::GetErrorLogs() { - return GetCapturedStderr(); -} +std::string DeathTestImpl::GetErrorLogs() { return GetCapturedStderr(); } // Signals that the death test code which should have exited, didn't. // Should be called only in a death test child process. @@ -512,9 +512,9 @@ void DeathTestImpl::Abort(AbortReason reason) { // The parent process considers the death test to be a failure if // it finds any data in our pipe. So, here we write a single flag byte // to the pipe, then exit. - const char status_ch = - reason == TEST_DID_NOT_DIE ? kDeathTestLived : - reason == TEST_THREW_EXCEPTION ? kDeathTestThrew : kDeathTestReturned; + const char status_ch = reason == TEST_DID_NOT_DIE ? kDeathTestLived + : reason == TEST_THREW_EXCEPTION ? kDeathTestThrew + : kDeathTestReturned; GTEST_DEATH_TEST_CHECK_SYSCALL_(posix::Write(write_fd(), &status_ch, 1)); // We are leaking the descriptor here because on some platforms (i.e., @@ -533,7 +533,7 @@ void DeathTestImpl::Abort(AbortReason reason) { // much easier. static ::std::string FormatDeathTestOutput(const ::std::string& output) { ::std::string ret; - for (size_t at = 0; ; ) { + for (size_t at = 0;;) { const size_t line_end = output.find('\n', at); ret += "[ DEATH ] "; if (line_end == ::std::string::npos) { @@ -568,8 +568,7 @@ static ::std::string FormatDeathTestOutput(const ::std::string& output) { // the first failing condition, in the order given above, is the one that is // reported. Also sets the last death test message string. bool DeathTestImpl::Passed(bool status_ok) { - if (!spawned()) - return false; + if (!spawned()) return false; const std::string error_message = GetErrorLogs(); @@ -580,15 +579,18 @@ bool DeathTestImpl::Passed(bool status_ok) { switch (outcome()) { case LIVED: buffer << " Result: failed to die.\n" - << " Error msg:\n" << FormatDeathTestOutput(error_message); + << " Error msg:\n" + << FormatDeathTestOutput(error_message); break; case THREW: buffer << " Result: threw an exception.\n" - << " Error msg:\n" << FormatDeathTestOutput(error_message); + << " Error msg:\n" + << FormatDeathTestOutput(error_message); break; case RETURNED: buffer << " Result: illegal return in test statement.\n" - << " Error msg:\n" << FormatDeathTestOutput(error_message); + << " Error msg:\n" + << FormatDeathTestOutput(error_message); break; case DIED: if (status_ok) { @@ -605,7 +607,8 @@ bool DeathTestImpl::Passed(bool status_ok) { } else { buffer << " Result: died but not with expected exit code:\n" << " " << ExitSummary(status()) << "\n" - << "Actual msg:\n" << FormatDeathTestOutput(error_message); + << "Actual msg:\n" + << FormatDeathTestOutput(error_message); } break; case IN_PROGRESS: @@ -618,7 +621,7 @@ bool DeathTestImpl::Passed(bool status_ok) { return success; } -# if GTEST_OS_WINDOWS +#if GTEST_OS_WINDOWS // WindowsDeathTest implements death tests on Windows. Due to the // specifics of starting new processes on Windows, death tests there are // always threadsafe, and Google Test considers the @@ -679,14 +682,12 @@ class WindowsDeathTest : public DeathTestImpl { // status, or 0 if no child process exists. As a side effect, sets the // outcome data member. int WindowsDeathTest::Wait() { - if (!spawned()) - return 0; + if (!spawned()) return 0; // Wait until the child either signals that it has acquired the write end // of the pipe or it dies. - const HANDLE wait_handles[2] = { child_handle_.Get(), event_handle_.Get() }; - switch (::WaitForMultipleObjects(2, - wait_handles, + const HANDLE wait_handles[2] = {child_handle_.Get(), event_handle_.Get()}; + switch (::WaitForMultipleObjects(2, wait_handles, FALSE, // Waits for any of the handles. INFINITE)) { case WAIT_OBJECT_0: @@ -707,9 +708,8 @@ int WindowsDeathTest::Wait() { // returns immediately if the child has already exited, regardless of // whether previous calls to WaitForMultipleObjects synchronized on this // handle or not. - GTEST_DEATH_TEST_CHECK_( - WAIT_OBJECT_0 == ::WaitForSingleObject(child_handle_.Get(), - INFINITE)); + GTEST_DEATH_TEST_CHECK_(WAIT_OBJECT_0 == + ::WaitForSingleObject(child_handle_.Get(), INFINITE)); DWORD status_code; GTEST_DEATH_TEST_CHECK_( ::GetExitCodeProcess(child_handle_.Get(), &status_code) != FALSE); @@ -742,12 +742,12 @@ DeathTest::TestRole WindowsDeathTest::AssumeRole() { SECURITY_ATTRIBUTES handles_are_inheritable = {sizeof(SECURITY_ATTRIBUTES), nullptr, TRUE}; HANDLE read_handle, write_handle; - GTEST_DEATH_TEST_CHECK_( - ::CreatePipe(&read_handle, &write_handle, &handles_are_inheritable, - 0) // Default buffer size. - != FALSE); - set_read_fd(::_open_osfhandle(reinterpret_cast(read_handle), - O_RDONLY)); + GTEST_DEATH_TEST_CHECK_(::CreatePipe(&read_handle, &write_handle, + &handles_are_inheritable, + 0) // Default buffer size. + != FALSE); + set_read_fd( + ::_open_osfhandle(reinterpret_cast(read_handle), O_RDONLY)); write_handle_.Reset(write_handle); event_handle_.Reset(::CreateEvent( &handles_are_inheritable, @@ -756,27 +756,26 @@ DeathTest::TestRole WindowsDeathTest::AssumeRole() { nullptr)); // The even is unnamed. GTEST_DEATH_TEST_CHECK_(event_handle_.Get() != nullptr); const std::string filter_flag = std::string("--") + GTEST_FLAG_PREFIX_ + - kFilterFlag + "=" + info->test_suite_name() + - "." + info->name(); + "filter=" + info->test_suite_name() + "." + + info->name(); const std::string internal_flag = - std::string("--") + GTEST_FLAG_PREFIX_ + kInternalRunDeathTestFlag + - "=" + file_ + "|" + StreamableToString(line_) + "|" + - StreamableToString(death_test_index) + "|" + + std::string("--") + GTEST_FLAG_PREFIX_ + + "internal_run_death_test=" + file_ + "|" + StreamableToString(line_) + + "|" + StreamableToString(death_test_index) + "|" + StreamableToString(static_cast(::GetCurrentProcessId())) + // size_t has the same width as pointers on both 32-bit and 64-bit // Windows platforms. // See http://msdn.microsoft.com/en-us/library/tcxf1dw6.aspx. - "|" + StreamableToString(reinterpret_cast(write_handle)) + - "|" + StreamableToString(reinterpret_cast(event_handle_.Get())); + "|" + StreamableToString(reinterpret_cast(write_handle)) + "|" + + StreamableToString(reinterpret_cast(event_handle_.Get())); char executable_path[_MAX_PATH + 1]; // NOLINT GTEST_DEATH_TEST_CHECK_(_MAX_PATH + 1 != ::GetModuleFileNameA(nullptr, executable_path, _MAX_PATH)); - std::string command_line = - std::string(::GetCommandLineA()) + " " + filter_flag + " \"" + - internal_flag + "\""; + std::string command_line = std::string(::GetCommandLineA()) + " " + + filter_flag + " \"" + internal_flag + "\""; DeathTest::set_last_death_test_message(""); @@ -796,8 +795,8 @@ DeathTest::TestRole WindowsDeathTest::AssumeRole() { GTEST_DEATH_TEST_CHECK_( ::CreateProcessA( executable_path, const_cast(command_line.c_str()), - nullptr, // Retuned process handle is not inheritable. - nullptr, // Retuned thread handle is not inheritable. + nullptr, // Returned process handle is not inheritable. + nullptr, // Returned thread handle is not inheritable. TRUE, // Child inherits all inheritable handles (for write_handle_). 0x0, // Default creation flags. nullptr, // Inherit the parent's environment. @@ -809,7 +808,7 @@ DeathTest::TestRole WindowsDeathTest::AssumeRole() { return OVERSEE_TEST; } -# elif GTEST_OS_FUCHSIA +#elif GTEST_OS_FUCHSIA class FuchsiaDeathTest : public DeathTestImpl { public: @@ -855,18 +854,13 @@ class Arguments { template void AddArguments(const ::std::vector& arguments) { for (typename ::std::vector::const_iterator i = arguments.begin(); - i != arguments.end(); - ++i) { + i != arguments.end(); ++i) { args_.insert(args_.end() - 1, posix::StrDup(i->c_str())); } } - char* const* Argv() { - return &args_[0]; - } + char* const* Argv() { return &args_[0]; } - int size() { - return static_cast(args_.size()) - 1; - } + int size() { return static_cast(args_.size()) - 1; } private: std::vector args_; @@ -880,8 +874,7 @@ int FuchsiaDeathTest::Wait() { const int kSocketKey = 1; const int kExceptionKey = 2; - if (!spawned()) - return 0; + if (!spawned()) return 0; // Create a port to wait for socket/task/exception events. zx_status_t status_zx; @@ -890,8 +883,8 @@ int FuchsiaDeathTest::Wait() { GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK); // Register to wait for the child process to terminate. - status_zx = child_process_.wait_async( - port, kProcessKey, ZX_PROCESS_TERMINATED, 0); + status_zx = + child_process_.wait_async(port, kProcessKey, ZX_PROCESS_TERMINATED, 0); GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK); // Register to wait for the socket to be readable or closed. @@ -900,8 +893,8 @@ int FuchsiaDeathTest::Wait() { GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK); // Register to wait for an exception. - status_zx = exception_channel_.wait_async( - port, kExceptionKey, ZX_CHANNEL_READABLE, 0); + status_zx = exception_channel_.wait_async(port, kExceptionKey, + ZX_CHANNEL_READABLE, 0); GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK); bool process_terminated = false; @@ -931,9 +924,9 @@ int FuchsiaDeathTest::Wait() { size_t old_length = captured_stderr_.length(); size_t bytes_read = 0; captured_stderr_.resize(old_length + kBufferSize); - status_zx = stderr_socket_.read( - 0, &captured_stderr_.front() + old_length, kBufferSize, - &bytes_read); + status_zx = + stderr_socket_.read(0, &captured_stderr_.front() + old_length, + kBufferSize, &bytes_read); captured_stderr_.resize(old_length + bytes_read); } while (status_zx == ZX_OK); if (status_zx == ZX_ERR_PEER_CLOSED) { @@ -987,13 +980,12 @@ DeathTest::TestRole FuchsiaDeathTest::AssumeRole() { // Build the child process command line. const std::string filter_flag = std::string("--") + GTEST_FLAG_PREFIX_ + - kFilterFlag + "=" + info->test_suite_name() + - "." + info->name(); - const std::string internal_flag = - std::string("--") + GTEST_FLAG_PREFIX_ + kInternalRunDeathTestFlag + "=" - + file_ + "|" - + StreamableToString(line_) + "|" - + StreamableToString(death_test_index); + "filter=" + info->test_suite_name() + "." + + info->name(); + const std::string internal_flag = std::string("--") + GTEST_FLAG_PREFIX_ + + kInternalRunDeathTestFlag + "=" + file_ + + "|" + StreamableToString(line_) + "|" + + StreamableToString(death_test_index); Arguments args; args.AddArguments(GetInjectableArgvs()); args.AddArgument(filter_flag.c_str()); @@ -1016,8 +1008,7 @@ DeathTest::TestRole FuchsiaDeathTest::AssumeRole() { // Create a socket pair will be used to receive the child process' stderr. zx::socket stderr_producer_socket; - status = - zx::socket::create(0, &stderr_producer_socket, &stderr_socket_); + status = zx::socket::create(0, &stderr_producer_socket, &stderr_socket_); GTEST_DEATH_TEST_CHECK_(status >= 0); int stderr_producer_fd = -1; status = @@ -1034,35 +1025,32 @@ DeathTest::TestRole FuchsiaDeathTest::AssumeRole() { // Create a child job. zx_handle_t child_job = ZX_HANDLE_INVALID; - status = zx_job_create(zx_job_default(), 0, & child_job); + status = zx_job_create(zx_job_default(), 0, &child_job); GTEST_DEATH_TEST_CHECK_(status == ZX_OK); zx_policy_basic_t policy; policy.condition = ZX_POL_NEW_ANY; policy.policy = ZX_POL_ACTION_ALLOW; - status = zx_job_set_policy( - child_job, ZX_JOB_POL_RELATIVE, ZX_JOB_POL_BASIC, &policy, 1); + status = zx_job_set_policy(child_job, ZX_JOB_POL_RELATIVE, ZX_JOB_POL_BASIC, + &policy, 1); GTEST_DEATH_TEST_CHECK_(status == ZX_OK); // Create an exception channel attached to the |child_job|, to allow // us to suppress the system default exception handler from firing. - status = - zx_task_create_exception_channel( - child_job, 0, exception_channel_.reset_and_get_address()); + status = zx_task_create_exception_channel( + child_job, 0, exception_channel_.reset_and_get_address()); GTEST_DEATH_TEST_CHECK_(status == ZX_OK); // Spawn the child process. - status = fdio_spawn_etc( - child_job, FDIO_SPAWN_CLONE_ALL, args.Argv()[0], args.Argv(), nullptr, - 2, spawn_actions, child_process_.reset_and_get_address(), nullptr); + status = fdio_spawn_etc(child_job, FDIO_SPAWN_CLONE_ALL, args.Argv()[0], + args.Argv(), nullptr, 2, spawn_actions, + child_process_.reset_and_get_address(), nullptr); GTEST_DEATH_TEST_CHECK_(status == ZX_OK); set_spawned(true); return OVERSEE_TEST; } -std::string FuchsiaDeathTest::GetErrorLogs() { - return captured_stderr_; -} +std::string FuchsiaDeathTest::GetErrorLogs() { return captured_stderr_; } #else // We are neither on Windows, nor on Fuchsia. @@ -1093,8 +1081,7 @@ ForkingDeathTest::ForkingDeathTest(const char* a_statement, // status, or 0 if no child process exists. As a side effect, sets the // outcome data member. int ForkingDeathTest::Wait() { - if (!spawned()) - return 0; + if (!spawned()) return 0; ReadAndInterpretStatusByte(); @@ -1173,11 +1160,11 @@ class ExecDeathTest : public ForkingDeathTest { private: static ::std::vector GetArgvsForDeathTestChildProcess() { ::std::vector args = GetInjectableArgvs(); -# if defined(GTEST_EXTRA_DEATH_TEST_COMMAND_LINE_ARGS_) +#if defined(GTEST_EXTRA_DEATH_TEST_COMMAND_LINE_ARGS_) ::std::vector extra_args = GTEST_EXTRA_DEATH_TEST_COMMAND_LINE_ARGS_(); args.insert(args.end(), extra_args.begin(), extra_args.end()); -# endif // defined(GTEST_EXTRA_DEATH_TEST_COMMAND_LINE_ARGS_) +#endif // defined(GTEST_EXTRA_DEATH_TEST_COMMAND_LINE_ARGS_) return args; } // The name of the file in which the death test is located. @@ -1204,14 +1191,11 @@ class Arguments { template void AddArguments(const ::std::vector& arguments) { for (typename ::std::vector::const_iterator i = arguments.begin(); - i != arguments.end(); - ++i) { + i != arguments.end(); ++i) { args_.insert(args_.end() - 1, posix::StrDup(i->c_str())); } } - char* const* Argv() { - return &args_[0]; - } + char* const* Argv() { return &args_[0]; } private: std::vector args_; @@ -1224,9 +1208,9 @@ struct ExecDeathTestArgs { int close_fd; // File descriptor to close; the read end of a pipe }; -# if GTEST_OS_QNX +#if GTEST_OS_QNX extern "C" char** environ; -# else // GTEST_OS_QNX +#else // GTEST_OS_QNX // The main function for a threadsafe-style death test child process. // This function is called in a clone()-ed process and thus must avoid // any potentially unsafe operations like malloc or libc functions. @@ -1241,8 +1225,8 @@ static int ExecDeathTestChildMain(void* child_arg) { UnitTest::GetInstance()->original_working_dir(); // We can safely call chdir() as it's a direct system call. if (chdir(original_dir) != 0) { - DeathTestAbort(std::string("chdir(\"") + original_dir + "\") failed: " + - GetLastErrnoDescription()); + DeathTestAbort(std::string("chdir(\"") + original_dir + + "\") failed: " + GetLastErrnoDescription()); return EXIT_FAILURE; } @@ -1253,13 +1237,12 @@ static int ExecDeathTestChildMain(void* child_arg) { // one path separator. execv(args->argv[0], args->argv); DeathTestAbort(std::string("execv(") + args->argv[0] + ", ...) in " + - original_dir + " failed: " + - GetLastErrnoDescription()); + original_dir + " failed: " + GetLastErrnoDescription()); return EXIT_FAILURE; } -# endif // GTEST_OS_QNX +#endif // GTEST_OS_QNX -# if GTEST_HAS_CLONE +#if GTEST_HAS_CLONE // Two utility routines that together determine the direction the stack // grows. // This could be accomplished more elegantly by a single recursive @@ -1293,7 +1276,7 @@ static bool StackGrowsDown() { StackLowerThanAddress(&dummy, &result); return result; } -# endif // GTEST_HAS_CLONE +#endif // GTEST_HAS_CLONE // Spawns a child process with the same executable as the current process in // a thread-safe manner and instructs it to run the death test. The @@ -1303,10 +1286,10 @@ static bool StackGrowsDown() { // spawn(2) there instead. The function dies with an error message if // anything goes wrong. static pid_t ExecDeathTestSpawnChild(char* const* argv, int close_fd) { - ExecDeathTestArgs args = { argv, close_fd }; + ExecDeathTestArgs args = {argv, close_fd}; pid_t child_pid = -1; -# if GTEST_OS_QNX +#if GTEST_OS_QNX // Obtains the current directory and sets it to be closed in the child // process. const int cwd_fd = open(".", O_RDONLY); @@ -1319,16 +1302,16 @@ static pid_t ExecDeathTestSpawnChild(char* const* argv, int close_fd) { UnitTest::GetInstance()->original_working_dir(); // We can safely call chdir() as it's a direct system call. if (chdir(original_dir) != 0) { - DeathTestAbort(std::string("chdir(\"") + original_dir + "\") failed: " + - GetLastErrnoDescription()); + DeathTestAbort(std::string("chdir(\"") + original_dir + + "\") failed: " + GetLastErrnoDescription()); return EXIT_FAILURE; } int fd_flags; // Set close_fd to be closed after spawn. GTEST_DEATH_TEST_CHECK_SYSCALL_(fd_flags = fcntl(close_fd, F_GETFD)); - GTEST_DEATH_TEST_CHECK_SYSCALL_(fcntl(close_fd, F_SETFD, - fd_flags | FD_CLOEXEC)); + GTEST_DEATH_TEST_CHECK_SYSCALL_( + fcntl(close_fd, F_SETFD, fd_flags | FD_CLOEXEC)); struct inheritance inherit = {0}; // spawn is a system call. child_pid = spawn(args.argv[0], 0, nullptr, &inherit, args.argv, environ); @@ -1336,8 +1319,8 @@ static pid_t ExecDeathTestSpawnChild(char* const* argv, int close_fd) { GTEST_DEATH_TEST_CHECK_(fchdir(cwd_fd) != -1); GTEST_DEATH_TEST_CHECK_SYSCALL_(close(cwd_fd)); -# else // GTEST_OS_QNX -# if GTEST_OS_LINUX +#else // GTEST_OS_QNX +#if GTEST_OS_LINUX // When a SIGPROF signal is received while fork() or clone() are executing, // the process may hang. To avoid this, we ignore SIGPROF here and re-enable // it after the call to fork()/clone() is complete. @@ -1346,12 +1329,12 @@ static pid_t ExecDeathTestSpawnChild(char* const* argv, int close_fd) { memset(&ignore_sigprof_action, 0, sizeof(ignore_sigprof_action)); sigemptyset(&ignore_sigprof_action.sa_mask); ignore_sigprof_action.sa_handler = SIG_IGN; - GTEST_DEATH_TEST_CHECK_SYSCALL_(sigaction( - SIGPROF, &ignore_sigprof_action, &saved_sigprof_action)); -# endif // GTEST_OS_LINUX + GTEST_DEATH_TEST_CHECK_SYSCALL_( + sigaction(SIGPROF, &ignore_sigprof_action, &saved_sigprof_action)); +#endif // GTEST_OS_LINUX -# if GTEST_HAS_CLONE - const bool use_fork = GTEST_FLAG(death_test_use_fork); +#if GTEST_HAS_CLONE + const bool use_fork = GTEST_FLAG_GET(death_test_use_fork); if (!use_fork) { static const bool stack_grows_down = StackGrowsDown(); @@ -1370,7 +1353,7 @@ static pid_t ExecDeathTestSpawnChild(char* const* argv, int close_fd) { const size_t kMaxStackAlignment = 64; void* const stack_top = static_cast(stack) + - (stack_grows_down ? stack_size - kMaxStackAlignment : 0); + (stack_grows_down ? stack_size - kMaxStackAlignment : 0); GTEST_DEATH_TEST_CHECK_( static_cast(stack_size) > kMaxStackAlignment && reinterpret_cast(stack_top) % kMaxStackAlignment == 0); @@ -1379,19 +1362,19 @@ static pid_t ExecDeathTestSpawnChild(char* const* argv, int close_fd) { GTEST_DEATH_TEST_CHECK_(munmap(stack, stack_size) != -1); } -# else +#else const bool use_fork = true; -# endif // GTEST_HAS_CLONE +#endif // GTEST_HAS_CLONE if (use_fork && (child_pid = fork()) == 0) { - ExecDeathTestChildMain(&args); - _exit(0); + ExecDeathTestChildMain(&args); + _exit(0); } -# endif // GTEST_OS_QNX -# if GTEST_OS_LINUX +#endif // GTEST_OS_QNX +#if GTEST_OS_LINUX GTEST_DEATH_TEST_CHECK_SYSCALL_( sigaction(SIGPROF, &saved_sigprof_action, nullptr)); -# endif // GTEST_OS_LINUX +#endif // GTEST_OS_LINUX GTEST_DEATH_TEST_CHECK_(child_pid != -1); return child_pid; @@ -1420,13 +1403,13 @@ DeathTest::TestRole ExecDeathTest::AssumeRole() { GTEST_DEATH_TEST_CHECK_(fcntl(pipe_fd[1], F_SETFD, 0) != -1); const std::string filter_flag = std::string("--") + GTEST_FLAG_PREFIX_ + - kFilterFlag + "=" + info->test_suite_name() + - "." + info->name(); - const std::string internal_flag = - std::string("--") + GTEST_FLAG_PREFIX_ + kInternalRunDeathTestFlag + "=" - + file_ + "|" + StreamableToString(line_) + "|" - + StreamableToString(death_test_index) + "|" - + StreamableToString(pipe_fd[1]); + "filter=" + info->test_suite_name() + "." + + info->name(); + const std::string internal_flag = std::string("--") + GTEST_FLAG_PREFIX_ + + "internal_run_death_test=" + file_ + "|" + + StreamableToString(line_) + "|" + + StreamableToString(death_test_index) + "|" + + StreamableToString(pipe_fd[1]); Arguments args; args.AddArguments(GetArgvsForDeathTestChildProcess()); args.AddArgument(filter_flag.c_str()); @@ -1447,7 +1430,7 @@ DeathTest::TestRole ExecDeathTest::AssumeRole() { return OVERSEE_TEST; } -# endif // !GTEST_OS_WINDOWS +#endif // !GTEST_OS_WINDOWS // Creates a concrete DeathTest-derived class that depends on the // --gtest_death_test_style flag, and sets the pointer pointed to @@ -1461,15 +1444,15 @@ bool DefaultDeathTestFactory::Create(const char* statement, UnitTestImpl* const impl = GetUnitTestImpl(); const InternalRunDeathTestFlag* const flag = impl->internal_run_death_test_flag(); - const int death_test_index = impl->current_test_info() - ->increment_death_test_count(); + const int death_test_index = + impl->current_test_info()->increment_death_test_count(); if (flag != nullptr) { if (death_test_index > flag->index()) { DeathTest::set_last_death_test_message( - "Death test count (" + StreamableToString(death_test_index) - + ") somehow exceeded expected maximum (" - + StreamableToString(flag->index()) + ")"); + "Death test count (" + StreamableToString(death_test_index) + + ") somehow exceeded expected maximum (" + + StreamableToString(flag->index()) + ")"); return false; } @@ -1480,50 +1463,50 @@ bool DefaultDeathTestFactory::Create(const char* statement, } } -# if GTEST_OS_WINDOWS +#if GTEST_OS_WINDOWS - if (GTEST_FLAG(death_test_style) == "threadsafe" || - GTEST_FLAG(death_test_style) == "fast") { + if (GTEST_FLAG_GET(death_test_style) == "threadsafe" || + GTEST_FLAG_GET(death_test_style) == "fast") { *test = new WindowsDeathTest(statement, std::move(matcher), file, line); } -# elif GTEST_OS_FUCHSIA +#elif GTEST_OS_FUCHSIA - if (GTEST_FLAG(death_test_style) == "threadsafe" || - GTEST_FLAG(death_test_style) == "fast") { + if (GTEST_FLAG_GET(death_test_style) == "threadsafe" || + GTEST_FLAG_GET(death_test_style) == "fast") { *test = new FuchsiaDeathTest(statement, std::move(matcher), file, line); } -# else +#else - if (GTEST_FLAG(death_test_style) == "threadsafe") { + if (GTEST_FLAG_GET(death_test_style) == "threadsafe") { *test = new ExecDeathTest(statement, std::move(matcher), file, line); - } else if (GTEST_FLAG(death_test_style) == "fast") { + } else if (GTEST_FLAG_GET(death_test_style) == "fast") { *test = new NoExecDeathTest(statement, std::move(matcher)); } -# endif // GTEST_OS_WINDOWS +#endif // GTEST_OS_WINDOWS else { // NOLINT - this is more readable than unbalanced brackets inside #if. - DeathTest::set_last_death_test_message( - "Unknown death test style \"" + GTEST_FLAG(death_test_style) - + "\" encountered"); + DeathTest::set_last_death_test_message("Unknown death test style \"" + + GTEST_FLAG_GET(death_test_style) + + "\" encountered"); return false; } return true; } -# if GTEST_OS_WINDOWS +#if GTEST_OS_WINDOWS // Recreates the pipe and event handles from the provided parameters, // signals the event, and returns a file descriptor wrapped around the pipe // handle. This function is called in the child process only. static int GetStatusFileDescriptor(unsigned int parent_process_id, - size_t write_handle_as_size_t, - size_t event_handle_as_size_t) { + size_t write_handle_as_size_t, + size_t event_handle_as_size_t) { AutoHandle parent_process_handle(::OpenProcess(PROCESS_DUP_HANDLE, - FALSE, // Non-inheritable. - parent_process_id)); + FALSE, // Non-inheritable. + parent_process_id)); if (parent_process_handle.Get() == INVALID_HANDLE_VALUE) { DeathTestAbort("Unable to open parent process " + StreamableToString(parent_process_id)); @@ -1531,8 +1514,7 @@ static int GetStatusFileDescriptor(unsigned int parent_process_id, GTEST_CHECK_(sizeof(HANDLE) <= sizeof(size_t)); - const HANDLE write_handle = - reinterpret_cast(write_handle_as_size_t); + const HANDLE write_handle = reinterpret_cast(write_handle_as_size_t); HANDLE dup_write_handle; // The newly initialized handle is accessible only in the parent @@ -1554,9 +1536,7 @@ static int GetStatusFileDescriptor(unsigned int parent_process_id, HANDLE dup_event_handle; if (!::DuplicateHandle(parent_process_handle.Get(), event_handle, - ::GetCurrentProcess(), &dup_event_handle, - 0x0, - FALSE, + ::GetCurrentProcess(), &dup_event_handle, 0x0, FALSE, DUPLICATE_SAME_ACCESS)) { DeathTestAbort("Unable to duplicate the event handle " + StreamableToString(event_handle_as_size_t) + @@ -1578,61 +1558,57 @@ static int GetStatusFileDescriptor(unsigned int parent_process_id, return write_fd; } -# endif // GTEST_OS_WINDOWS +#endif // GTEST_OS_WINDOWS // Returns a newly created InternalRunDeathTestFlag object with fields // initialized from the GTEST_FLAG(internal_run_death_test) flag if // the flag is specified; otherwise returns NULL. InternalRunDeathTestFlag* ParseInternalRunDeathTestFlag() { - if (GTEST_FLAG(internal_run_death_test) == "") return nullptr; + if (GTEST_FLAG_GET(internal_run_death_test) == "") return nullptr; // GTEST_HAS_DEATH_TEST implies that we have ::std::string, so we // can use it here. int line = -1; int index = -1; ::std::vector< ::std::string> fields; - SplitString(GTEST_FLAG(internal_run_death_test).c_str(), '|', &fields); + SplitString(GTEST_FLAG_GET(internal_run_death_test), '|', &fields); int write_fd = -1; -# if GTEST_OS_WINDOWS +#if GTEST_OS_WINDOWS unsigned int parent_process_id = 0; size_t write_handle_as_size_t = 0; size_t event_handle_as_size_t = 0; - if (fields.size() != 6 - || !ParseNaturalNumber(fields[1], &line) - || !ParseNaturalNumber(fields[2], &index) - || !ParseNaturalNumber(fields[3], &parent_process_id) - || !ParseNaturalNumber(fields[4], &write_handle_as_size_t) - || !ParseNaturalNumber(fields[5], &event_handle_as_size_t)) { + if (fields.size() != 6 || !ParseNaturalNumber(fields[1], &line) || + !ParseNaturalNumber(fields[2], &index) || + !ParseNaturalNumber(fields[3], &parent_process_id) || + !ParseNaturalNumber(fields[4], &write_handle_as_size_t) || + !ParseNaturalNumber(fields[5], &event_handle_as_size_t)) { DeathTestAbort("Bad --gtest_internal_run_death_test flag: " + - GTEST_FLAG(internal_run_death_test)); + GTEST_FLAG_GET(internal_run_death_test)); } - write_fd = GetStatusFileDescriptor(parent_process_id, - write_handle_as_size_t, + write_fd = GetStatusFileDescriptor(parent_process_id, write_handle_as_size_t, event_handle_as_size_t); -# elif GTEST_OS_FUCHSIA +#elif GTEST_OS_FUCHSIA - if (fields.size() != 3 - || !ParseNaturalNumber(fields[1], &line) - || !ParseNaturalNumber(fields[2], &index)) { - DeathTestAbort("Bad --gtest_internal_run_death_test flag: " - + GTEST_FLAG(internal_run_death_test)); + if (fields.size() != 3 || !ParseNaturalNumber(fields[1], &line) || + !ParseNaturalNumber(fields[2], &index)) { + DeathTestAbort("Bad --gtest_internal_run_death_test flag: " + + GTEST_FLAG_GET(internal_run_death_test)); } -# else +#else - if (fields.size() != 4 - || !ParseNaturalNumber(fields[1], &line) - || !ParseNaturalNumber(fields[2], &index) - || !ParseNaturalNumber(fields[3], &write_fd)) { - DeathTestAbort("Bad --gtest_internal_run_death_test flag: " - + GTEST_FLAG(internal_run_death_test)); + if (fields.size() != 4 || !ParseNaturalNumber(fields[1], &line) || + !ParseNaturalNumber(fields[2], &index) || + !ParseNaturalNumber(fields[3], &write_fd)) { + DeathTestAbort("Bad --gtest_internal_run_death_test flag: " + + GTEST_FLAG_GET(internal_run_death_test)); } -# endif // GTEST_OS_WINDOWS +#endif // GTEST_OS_WINDOWS return new InternalRunDeathTestFlag(fields[0], line, index, write_fd); } diff --git a/third_party/googletest/src/src/gtest-filepath.cc b/third_party/googletest/src/src/gtest-filepath.cc index 0b5629401b..f6ee90cdb7 100644 --- a/third_party/googletest/src/src/gtest-filepath.cc +++ b/third_party/googletest/src/src/gtest-filepath.cc @@ -30,29 +30,31 @@ #include "gtest/internal/gtest-filepath.h" #include -#include "gtest/internal/gtest-port.h" + #include "gtest/gtest-message.h" +#include "gtest/internal/gtest-port.h" #if GTEST_OS_WINDOWS_MOBILE -# include +#include #elif GTEST_OS_WINDOWS -# include -# include +#include +#include #else -# include -# include // Some Linux distributions define PATH_MAX here. -#endif // GTEST_OS_WINDOWS_MOBILE +#include + +#include // Some Linux distributions define PATH_MAX here. +#endif // GTEST_OS_WINDOWS_MOBILE #include "gtest/internal/gtest-string.h" #if GTEST_OS_WINDOWS -# define GTEST_PATH_MAX_ _MAX_PATH +#define GTEST_PATH_MAX_ _MAX_PATH #elif defined(PATH_MAX) -# define GTEST_PATH_MAX_ PATH_MAX +#define GTEST_PATH_MAX_ PATH_MAX #elif defined(_XOPEN_PATH_MAX) -# define GTEST_PATH_MAX_ _XOPEN_PATH_MAX +#define GTEST_PATH_MAX_ _XOPEN_PATH_MAX #else -# define GTEST_PATH_MAX_ _POSIX_PATH_MAX +#define GTEST_PATH_MAX_ _POSIX_PATH_MAX #endif // GTEST_OS_WINDOWS namespace testing { @@ -66,16 +68,16 @@ namespace internal { const char kPathSeparator = '\\'; const char kAlternatePathSeparator = '/'; const char kAlternatePathSeparatorString[] = "/"; -# if GTEST_OS_WINDOWS_MOBILE +#if GTEST_OS_WINDOWS_MOBILE // Windows CE doesn't have a current directory. You should not use // the current directory in tests on Windows CE, but this at least // provides a reasonable fallback. const char kCurrentDirectoryString[] = "\\"; // Windows CE doesn't define INVALID_FILE_ATTRIBUTES const DWORD kInvalidFileAttributes = 0xffffffff; -# else +#else const char kCurrentDirectoryString[] = ".\\"; -# endif // GTEST_OS_WINDOWS_MOBILE +#endif // GTEST_OS_WINDOWS_MOBILE #else const char kPathSeparator = '/'; const char kCurrentDirectoryString[] = "./"; @@ -99,17 +101,17 @@ FilePath FilePath::GetCurrentDir() { // something reasonable. return FilePath(kCurrentDirectoryString); #elif GTEST_OS_WINDOWS - char cwd[GTEST_PATH_MAX_ + 1] = { '\0' }; + char cwd[GTEST_PATH_MAX_ + 1] = {'\0'}; return FilePath(_getcwd(cwd, sizeof(cwd)) == nullptr ? "" : cwd); #else - char cwd[GTEST_PATH_MAX_ + 1] = { '\0' }; + char cwd[GTEST_PATH_MAX_ + 1] = {'\0'}; char* result = getcwd(cwd, sizeof(cwd)); -# if GTEST_OS_NACL +#if GTEST_OS_NACL // getcwd will likely fail in NaCl due to the sandbox, so return something // reasonable. The user may have provided a shim implementation for getcwd, // however, so fallback only when failure is detected. return FilePath(result == nullptr ? kCurrentDirectoryString : cwd); -# endif // GTEST_OS_NACL +#endif // GTEST_OS_NACL return FilePath(result == nullptr ? "" : cwd); #endif // GTEST_OS_WINDOWS_MOBILE } @@ -121,8 +123,8 @@ FilePath FilePath::GetCurrentDir() { FilePath FilePath::RemoveExtension(const char* extension) const { const std::string dot_extension = std::string(".") + extension; if (String::EndsWithCaseInsensitive(pathname_, dot_extension)) { - return FilePath(pathname_.substr( - 0, pathname_.length() - dot_extension.length())); + return FilePath( + pathname_.substr(0, pathname_.length() - dot_extension.length())); } return *this; } @@ -178,15 +180,14 @@ FilePath FilePath::RemoveFileName() const { // than zero (e.g., 12), returns "dir/test_12.xml". // On Windows platform, uses \ as the separator rather than /. FilePath FilePath::MakeFileName(const FilePath& directory, - const FilePath& base_name, - int number, + const FilePath& base_name, int number, const char* extension) { std::string file; if (number == 0) { file = base_name.string() + "." + extension; } else { - file = base_name.string() + "_" + StreamableToString(number) - + "." + extension; + file = + base_name.string() + "_" + StreamableToString(number) + "." + extension; } return ConcatPaths(directory, FilePath(file)); } @@ -195,8 +196,7 @@ FilePath FilePath::MakeFileName(const FilePath& directory, // On Windows, uses \ as the separator rather than /. FilePath FilePath::ConcatPaths(const FilePath& directory, const FilePath& relative_path) { - if (directory.IsEmpty()) - return relative_path; + if (directory.IsEmpty()) return relative_path; const FilePath dir(directory.RemoveTrailingPathSeparator()); return FilePath(dir.string() + kPathSeparator + relative_path.string()); } @@ -207,7 +207,7 @@ bool FilePath::FileOrDirectoryExists() const { #if GTEST_OS_WINDOWS_MOBILE LPCWSTR unicode = String::AnsiToUtf16(pathname_.c_str()); const DWORD attributes = GetFileAttributes(unicode); - delete [] unicode; + delete[] unicode; return attributes != kInvalidFileAttributes; #else posix::StatStruct file_stat{}; @@ -222,8 +222,8 @@ bool FilePath::DirectoryExists() const { #if GTEST_OS_WINDOWS // Don't strip off trailing separator if path is a root directory on // Windows (like "C:\\"). - const FilePath& path(IsRootDirectory() ? *this : - RemoveTrailingPathSeparator()); + const FilePath& path(IsRootDirectory() ? *this + : RemoveTrailingPathSeparator()); #else const FilePath& path(*this); #endif @@ -231,15 +231,15 @@ bool FilePath::DirectoryExists() const { #if GTEST_OS_WINDOWS_MOBILE LPCWSTR unicode = String::AnsiToUtf16(path.c_str()); const DWORD attributes = GetFileAttributes(unicode); - delete [] unicode; + delete[] unicode; if ((attributes != kInvalidFileAttributes) && (attributes & FILE_ATTRIBUTE_DIRECTORY)) { result = true; } #else posix::StatStruct file_stat{}; - result = posix::Stat(path.c_str(), &file_stat) == 0 && - posix::IsDir(file_stat); + result = + posix::Stat(path.c_str(), &file_stat) == 0 && posix::IsDir(file_stat); #endif // GTEST_OS_WINDOWS_MOBILE return result; @@ -260,10 +260,9 @@ bool FilePath::IsAbsolutePath() const { const char* const name = pathname_.c_str(); #if GTEST_OS_WINDOWS return pathname_.length() >= 3 && - ((name[0] >= 'a' && name[0] <= 'z') || - (name[0] >= 'A' && name[0] <= 'Z')) && - name[1] == ':' && - IsPathSeparator(name[2]); + ((name[0] >= 'a' && name[0] <= 'z') || + (name[0] >= 'A' && name[0] <= 'Z')) && + name[1] == ':' && IsPathSeparator(name[2]); #else return IsPathSeparator(name[0]); #endif @@ -321,7 +320,7 @@ bool FilePath::CreateFolder() const { FilePath removed_sep(this->RemoveTrailingPathSeparator()); LPCWSTR unicode = String::AnsiToUtf16(removed_sep.c_str()); int result = CreateDirectory(unicode, nullptr) ? 0 : -1; - delete [] unicode; + delete[] unicode; #elif GTEST_OS_WINDOWS int result = _mkdir(pathname_.c_str()); #elif GTEST_OS_ESP8266 || GTEST_OS_XTENSA @@ -341,9 +340,8 @@ bool FilePath::CreateFolder() const { // name, otherwise return the name string unmodified. // On Windows platform, uses \ as the separator, other platforms use /. FilePath FilePath::RemoveTrailingPathSeparator() const { - return IsDirectory() - ? FilePath(pathname_.substr(0, pathname_.length() - 1)) - : *this; + return IsDirectory() ? FilePath(pathname_.substr(0, pathname_.length() - 1)) + : *this; } // Removes any redundant separators that might be in the pathname. diff --git a/third_party/googletest/src/src/gtest-internal-inl.h b/third_party/googletest/src/src/gtest-internal-inl.h index 6d8cecbbb3..0b9e929c68 100644 --- a/third_party/googletest/src/src/gtest-internal-inl.h +++ b/third_party/googletest/src/src/gtest-internal-inl.h @@ -35,7 +35,7 @@ #define GOOGLETEST_SRC_GTEST_INTERNAL_INL_H_ #ifndef _WIN32_WCE -# include +#include #endif // !_WIN32_WCE #include #include // For strtoll/_strtoul64/malloc/free. @@ -50,22 +50,20 @@ #include "gtest/internal/gtest-port.h" #if GTEST_CAN_STREAM_RESULTS_ -# include // NOLINT -# include // NOLINT +#include // NOLINT +#include // NOLINT #endif #if GTEST_OS_WINDOWS -# include // NOLINT -#endif // GTEST_OS_WINDOWS +#include // NOLINT +#endif // GTEST_OS_WINDOWS -#include "gtest/gtest.h" #include "gtest/gtest-spi.h" +#include "gtest/gtest.h" GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \ /* class A needs to have dll-interface to be used by clients of class B */) -namespace testing { - // Declares the flags. // // We don't want the users to modify this flag in the code, but want @@ -73,32 +71,13 @@ namespace testing { // declare it here as opposed to in gtest.h. GTEST_DECLARE_bool_(death_test_use_fork); +namespace testing { namespace internal { // The value of GetTestTypeId() as seen from within the Google Test // library. This is solely for testing GetTestTypeId(). GTEST_API_ extern const TypeId kTestTypeIdInGoogleTest; -// Names of the flags (needed for parsing Google Test flags). -const char kAlsoRunDisabledTestsFlag[] = "also_run_disabled_tests"; -const char kBreakOnFailureFlag[] = "break_on_failure"; -const char kCatchExceptionsFlag[] = "catch_exceptions"; -const char kColorFlag[] = "color"; -const char kFailFast[] = "fail_fast"; -const char kFilterFlag[] = "filter"; -const char kListTestsFlag[] = "list_tests"; -const char kOutputFlag[] = "output"; -const char kBriefFlag[] = "brief"; -const char kPrintTimeFlag[] = "print_time"; -const char kPrintUTF8Flag[] = "print_utf8"; -const char kRandomSeedFlag[] = "random_seed"; -const char kRepeatFlag[] = "repeat"; -const char kShuffleFlag[] = "shuffle"; -const char kStackTraceDepthFlag[] = "stack_trace_depth"; -const char kStreamResultToFlag[] = "stream_result_to"; -const char kThrowOnFailureFlag[] = "throw_on_failure"; -const char kFlagfileFlag[] = "flagfile"; - // A valid random seed must be in [1, kMaxRandomSeed]. const int kMaxRandomSeed = 99999; @@ -125,21 +104,21 @@ GTEST_API_ std::string FormatEpochTimeInMillisAsIso8601(TimeInMillis ms); // // On success, stores the value of the flag in *value, and returns // true. On failure, returns false without changing *value. -GTEST_API_ bool ParseInt32Flag( - const char* str, const char* flag, int32_t* value); +GTEST_API_ bool ParseFlag(const char* str, const char* flag, int32_t* value); // Returns a random seed in range [1, kMaxRandomSeed] based on the // given --gtest_random_seed flag value. inline int GetRandomSeedFromFlag(int32_t random_seed_flag) { - const unsigned int raw_seed = (random_seed_flag == 0) ? - static_cast(GetTimeInMillis()) : - static_cast(random_seed_flag); + const unsigned int raw_seed = + (random_seed_flag == 0) ? static_cast(GetTimeInMillis()) + : static_cast(random_seed_flag); // Normalizes the actual seed to range [1, kMaxRandomSeed] such that // it's easy to type. const int normalized_seed = static_cast((raw_seed - 1U) % - static_cast(kMaxRandomSeed)) + 1; + static_cast(kMaxRandomSeed)) + + 1; return normalized_seed; } @@ -160,50 +139,54 @@ class GTestFlagSaver { public: // The c'tor. GTestFlagSaver() { - also_run_disabled_tests_ = GTEST_FLAG(also_run_disabled_tests); - break_on_failure_ = GTEST_FLAG(break_on_failure); - catch_exceptions_ = GTEST_FLAG(catch_exceptions); - color_ = GTEST_FLAG(color); - death_test_style_ = GTEST_FLAG(death_test_style); - death_test_use_fork_ = GTEST_FLAG(death_test_use_fork); - fail_fast_ = GTEST_FLAG(fail_fast); - filter_ = GTEST_FLAG(filter); - internal_run_death_test_ = GTEST_FLAG(internal_run_death_test); - list_tests_ = GTEST_FLAG(list_tests); - output_ = GTEST_FLAG(output); - brief_ = GTEST_FLAG(brief); - print_time_ = GTEST_FLAG(print_time); - print_utf8_ = GTEST_FLAG(print_utf8); - random_seed_ = GTEST_FLAG(random_seed); - repeat_ = GTEST_FLAG(repeat); - shuffle_ = GTEST_FLAG(shuffle); - stack_trace_depth_ = GTEST_FLAG(stack_trace_depth); - stream_result_to_ = GTEST_FLAG(stream_result_to); - throw_on_failure_ = GTEST_FLAG(throw_on_failure); + also_run_disabled_tests_ = GTEST_FLAG_GET(also_run_disabled_tests); + break_on_failure_ = GTEST_FLAG_GET(break_on_failure); + catch_exceptions_ = GTEST_FLAG_GET(catch_exceptions); + color_ = GTEST_FLAG_GET(color); + death_test_style_ = GTEST_FLAG_GET(death_test_style); + death_test_use_fork_ = GTEST_FLAG_GET(death_test_use_fork); + fail_fast_ = GTEST_FLAG_GET(fail_fast); + filter_ = GTEST_FLAG_GET(filter); + internal_run_death_test_ = GTEST_FLAG_GET(internal_run_death_test); + list_tests_ = GTEST_FLAG_GET(list_tests); + output_ = GTEST_FLAG_GET(output); + brief_ = GTEST_FLAG_GET(brief); + print_time_ = GTEST_FLAG_GET(print_time); + print_utf8_ = GTEST_FLAG_GET(print_utf8); + random_seed_ = GTEST_FLAG_GET(random_seed); + repeat_ = GTEST_FLAG_GET(repeat); + recreate_environments_when_repeating_ = + GTEST_FLAG_GET(recreate_environments_when_repeating); + shuffle_ = GTEST_FLAG_GET(shuffle); + stack_trace_depth_ = GTEST_FLAG_GET(stack_trace_depth); + stream_result_to_ = GTEST_FLAG_GET(stream_result_to); + throw_on_failure_ = GTEST_FLAG_GET(throw_on_failure); } // The d'tor is not virtual. DO NOT INHERIT FROM THIS CLASS. ~GTestFlagSaver() { - GTEST_FLAG(also_run_disabled_tests) = also_run_disabled_tests_; - GTEST_FLAG(break_on_failure) = break_on_failure_; - GTEST_FLAG(catch_exceptions) = catch_exceptions_; - GTEST_FLAG(color) = color_; - GTEST_FLAG(death_test_style) = death_test_style_; - GTEST_FLAG(death_test_use_fork) = death_test_use_fork_; - GTEST_FLAG(filter) = filter_; - GTEST_FLAG(fail_fast) = fail_fast_; - GTEST_FLAG(internal_run_death_test) = internal_run_death_test_; - GTEST_FLAG(list_tests) = list_tests_; - GTEST_FLAG(output) = output_; - GTEST_FLAG(brief) = brief_; - GTEST_FLAG(print_time) = print_time_; - GTEST_FLAG(print_utf8) = print_utf8_; - GTEST_FLAG(random_seed) = random_seed_; - GTEST_FLAG(repeat) = repeat_; - GTEST_FLAG(shuffle) = shuffle_; - GTEST_FLAG(stack_trace_depth) = stack_trace_depth_; - GTEST_FLAG(stream_result_to) = stream_result_to_; - GTEST_FLAG(throw_on_failure) = throw_on_failure_; + GTEST_FLAG_SET(also_run_disabled_tests, also_run_disabled_tests_); + GTEST_FLAG_SET(break_on_failure, break_on_failure_); + GTEST_FLAG_SET(catch_exceptions, catch_exceptions_); + GTEST_FLAG_SET(color, color_); + GTEST_FLAG_SET(death_test_style, death_test_style_); + GTEST_FLAG_SET(death_test_use_fork, death_test_use_fork_); + GTEST_FLAG_SET(filter, filter_); + GTEST_FLAG_SET(fail_fast, fail_fast_); + GTEST_FLAG_SET(internal_run_death_test, internal_run_death_test_); + GTEST_FLAG_SET(list_tests, list_tests_); + GTEST_FLAG_SET(output, output_); + GTEST_FLAG_SET(brief, brief_); + GTEST_FLAG_SET(print_time, print_time_); + GTEST_FLAG_SET(print_utf8, print_utf8_); + GTEST_FLAG_SET(random_seed, random_seed_); + GTEST_FLAG_SET(repeat, repeat_); + GTEST_FLAG_SET(recreate_environments_when_repeating, + recreate_environments_when_repeating_); + GTEST_FLAG_SET(shuffle, shuffle_); + GTEST_FLAG_SET(stack_trace_depth, stack_trace_depth_); + GTEST_FLAG_SET(stream_result_to, stream_result_to_); + GTEST_FLAG_SET(throw_on_failure, throw_on_failure_); } private: @@ -224,6 +207,7 @@ class GTestFlagSaver { bool print_utf8_; int32_t random_seed_; int32_t repeat_; + bool recreate_environments_when_repeating_; bool shuffle_; int32_t stack_trace_depth_; std::string stream_result_to_; @@ -278,8 +262,8 @@ GTEST_API_ int32_t Int32FromEnvOrDie(const char* env_var, int32_t default_val); // returns true if and only if the test should be run on this shard. The test id // is some arbitrary but unique non-negative integer assigned to each test // method. Assumes that 0 <= shard_index < total_shards. -GTEST_API_ bool ShouldRunTestOnShard( - int total_shards, int shard_index, int test_id); +GTEST_API_ bool ShouldRunTestOnShard(int total_shards, int shard_index, + int test_id); // STL container utilities. @@ -290,9 +274,8 @@ inline int CountIf(const Container& c, Predicate predicate) { // Implemented as an explicit loop since std::count_if() in libCstd on // Solaris has a non-standard signature. int count = 0; - for (typename Container::const_iterator it = c.begin(); it != c.end(); ++it) { - if (predicate(*it)) - ++count; + for (auto it = c.begin(); it != c.end(); ++it) { + if (predicate(*it)) ++count; } return count; } @@ -441,7 +424,9 @@ class OsStackTraceGetterInterface { static const char* const kElidedFramesMarker; private: - GTEST_DISALLOW_COPY_AND_ASSIGN_(OsStackTraceGetterInterface); + OsStackTraceGetterInterface(const OsStackTraceGetterInterface&) = delete; + OsStackTraceGetterInterface& operator=(const OsStackTraceGetterInterface&) = + delete; }; // A working implementation of the OsStackTraceGetterInterface interface. @@ -463,7 +448,8 @@ class OsStackTraceGetter : public OsStackTraceGetterInterface { void* caller_frame_ = nullptr; #endif // GTEST_HAS_ABSL - GTEST_DISALLOW_COPY_AND_ASSIGN_(OsStackTraceGetter); + OsStackTraceGetter(const OsStackTraceGetter&) = delete; + OsStackTraceGetter& operator=(const OsStackTraceGetter&) = delete; }; // Information about a Google Test trace point. @@ -476,7 +462,7 @@ struct TraceInfo { // This is the default global test part result reporter used in UnitTestImpl. // This class should only be used by UnitTestImpl. class DefaultGlobalTestPartResultReporter - : public TestPartResultReporterInterface { + : public TestPartResultReporterInterface { public: explicit DefaultGlobalTestPartResultReporter(UnitTestImpl* unit_test); // Implements the TestPartResultReporterInterface. Reports the test part @@ -486,7 +472,10 @@ class DefaultGlobalTestPartResultReporter private: UnitTestImpl* const unit_test_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultGlobalTestPartResultReporter); + DefaultGlobalTestPartResultReporter( + const DefaultGlobalTestPartResultReporter&) = delete; + DefaultGlobalTestPartResultReporter& operator=( + const DefaultGlobalTestPartResultReporter&) = delete; }; // This is the default per thread test part result reporter used in @@ -502,7 +491,10 @@ class DefaultPerThreadTestPartResultReporter private: UnitTestImpl* const unit_test_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultPerThreadTestPartResultReporter); + DefaultPerThreadTestPartResultReporter( + const DefaultPerThreadTestPartResultReporter&) = delete; + DefaultPerThreadTestPartResultReporter& operator=( + const DefaultPerThreadTestPartResultReporter&) = delete; }; // The private implementation of the UnitTest class. We don't protect @@ -640,7 +632,8 @@ class GTEST_API_ UnitTestImpl { // For example, if Foo() calls Bar(), which in turn calls // CurrentOsStackTraceExceptTop(1), Foo() will be included in the // trace but Bar() and CurrentOsStackTraceExceptTop() won't. - std::string CurrentOsStackTraceExceptTop(int skip_count) GTEST_NO_INLINE_; + std::string CurrentOsStackTraceExceptTop(int skip_count) + GTEST_NO_INLINE_ GTEST_NO_TAIL_CALL_; // Finds and returns a TestSuite with the given name. If one doesn't // exist, creates one and returns it. @@ -744,9 +737,7 @@ class GTEST_API_ UnitTestImpl { } // Clears the results of ad-hoc test assertions. - void ClearAdHocTestResult() { - ad_hoc_test_result_.Clear(); - } + void ClearAdHocTestResult() { ad_hoc_test_result_.Clear(); } // Adds a TestProperty to the current TestResult object when invoked in a // context of a test or a test suite, or to the global property set. If the @@ -754,10 +745,7 @@ class GTEST_API_ UnitTestImpl { // updated. void RecordProperty(const TestProperty& test_property); - enum ReactionToSharding { - HONOR_SHARDING_PROTOCOL, - IGNORE_SHARDING_PROTOCOL - }; + enum ReactionToSharding { HONOR_SHARDING_PROTOCOL, IGNORE_SHARDING_PROTOCOL }; // Matches the full name of each test against the user-specified // filter to decide whether the test should run, then records the @@ -963,7 +951,8 @@ class GTEST_API_ UnitTestImpl { // starts. bool catch_exceptions_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(UnitTestImpl); + UnitTestImpl(const UnitTestImpl&) = delete; + UnitTestImpl& operator=(const UnitTestImpl&) = delete; }; // class UnitTestImpl // Convenience function for accessing the global UnitTest @@ -986,8 +975,9 @@ GTEST_API_ bool IsValidEscape(char ch); GTEST_API_ bool AtomMatchesChar(bool escaped, char pattern, char ch); GTEST_API_ bool ValidateRegex(const char* regex); GTEST_API_ bool MatchRegexAtHead(const char* regex, const char* str); -GTEST_API_ bool MatchRepetitionAndRegexAtHead( - bool escaped, char ch, char repeat, const char* regex, const char* str); +GTEST_API_ bool MatchRepetitionAndRegexAtHead(bool escaped, char ch, + char repeat, const char* regex, + const char* str); GTEST_API_ bool MatchRegexAnywhere(const char* regex, const char* str); #endif // GTEST_USES_SIMPLE_RE @@ -1089,8 +1079,7 @@ class StreamingListener : public EmptyTestEventListener { } ~SocketWriter() override { - if (sockfd_ != -1) - CloseConnection(); + if (sockfd_ != -1) CloseConnection(); } // Sends a string to the socket. @@ -1100,9 +1089,8 @@ class StreamingListener : public EmptyTestEventListener { const auto len = static_cast(message.length()); if (write(sockfd_, message.c_str(), len) != static_cast(len)) { - GTEST_LOG_(WARNING) - << "stream_result_to: failed to stream to " - << host_name_ << ":" << port_num_; + GTEST_LOG_(WARNING) << "stream_result_to: failed to stream to " + << host_name_ << ":" << port_num_; } } @@ -1123,7 +1111,8 @@ class StreamingListener : public EmptyTestEventListener { const std::string host_name_; const std::string port_num_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(SocketWriter); + SocketWriter(const SocketWriter&) = delete; + SocketWriter& operator=(const SocketWriter&) = delete; }; // class SocketWriter // Escapes '=', '&', '%', and '\n' characters in str as "%xx". @@ -1135,7 +1124,9 @@ class StreamingListener : public EmptyTestEventListener { } explicit StreamingListener(AbstractSocketWriter* socket_writer) - : socket_writer_(socket_writer) { Start(); } + : socket_writer_(socket_writer) { + Start(); + } void OnTestProgramStart(const UnitTest& /* unit_test */) override { SendLn("event=TestProgramStart"); @@ -1158,22 +1149,22 @@ class StreamingListener : public EmptyTestEventListener { void OnTestIterationEnd(const UnitTest& unit_test, int /* iteration */) override { - SendLn("event=TestIterationEnd&passed=" + - FormatBool(unit_test.Passed()) + "&elapsed_time=" + - StreamableToString(unit_test.elapsed_time()) + "ms"); + SendLn("event=TestIterationEnd&passed=" + FormatBool(unit_test.Passed()) + + "&elapsed_time=" + StreamableToString(unit_test.elapsed_time()) + + "ms"); } // Note that "event=TestCaseStart" is a wire format and has to remain // "case" for compatibility - void OnTestCaseStart(const TestCase& test_case) override { - SendLn(std::string("event=TestCaseStart&name=") + test_case.name()); + void OnTestSuiteStart(const TestSuite& test_suite) override { + SendLn(std::string("event=TestCaseStart&name=") + test_suite.name()); } // Note that "event=TestCaseEnd" is a wire format and has to remain // "case" for compatibility - void OnTestCaseEnd(const TestCase& test_case) override { - SendLn("event=TestCaseEnd&passed=" + FormatBool(test_case.Passed()) + - "&elapsed_time=" + StreamableToString(test_case.elapsed_time()) + + void OnTestSuiteEnd(const TestSuite& test_suite) override { + SendLn("event=TestCaseEnd&passed=" + FormatBool(test_suite.Passed()) + + "&elapsed_time=" + StreamableToString(test_suite.elapsed_time()) + "ms"); } @@ -1183,8 +1174,7 @@ class StreamingListener : public EmptyTestEventListener { void OnTestEnd(const TestInfo& test_info) override { SendLn("event=TestEnd&passed=" + - FormatBool((test_info.result())->Passed()) + - "&elapsed_time=" + + FormatBool((test_info.result())->Passed()) + "&elapsed_time=" + StreamableToString((test_info.result())->elapsed_time()) + "ms"); } @@ -1208,7 +1198,8 @@ class StreamingListener : public EmptyTestEventListener { const std::unique_ptr socket_writer_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(StreamingListener); + StreamingListener(const StreamingListener&) = delete; + StreamingListener& operator=(const StreamingListener&) = delete; }; // class StreamingListener #endif // GTEST_CAN_STREAM_RESULTS_ diff --git a/third_party/googletest/src/src/gtest-matchers.cc b/third_party/googletest/src/src/gtest-matchers.cc index 65104ebab1..7e3bcc0cff 100644 --- a/third_party/googletest/src/src/gtest-matchers.cc +++ b/third_party/googletest/src/src/gtest-matchers.cc @@ -32,12 +32,13 @@ // This file implements just enough of the matcher interface to allow // EXPECT_DEATH and friends to accept a matcher argument. -#include "gtest/internal/gtest-internal.h" -#include "gtest/internal/gtest-port.h" #include "gtest/gtest-matchers.h" #include +#include "gtest/internal/gtest-internal.h" +#include "gtest/internal/gtest-port.h" + namespace testing { // Constructs a matcher that matches a const std::string& whose value is diff --git a/third_party/googletest/src/src/gtest-port.cc b/third_party/googletest/src/src/gtest-port.cc index 53a4d37f97..d797fe4d58 100644 --- a/third_party/googletest/src/src/gtest-port.cc +++ b/third_party/googletest/src/src/gtest-port.cc @@ -27,61 +27,62 @@ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - #include "gtest/internal/gtest-port.h" #include #include #include #include + #include #include #include #if GTEST_OS_WINDOWS -# include -# include -# include -# include // Used in ThreadLocal. -# ifdef _MSC_VER -# include -# endif // _MSC_VER +#include +#include +#include + +#include // Used in ThreadLocal. +#ifdef _MSC_VER +#include +#endif // _MSC_VER #else -# include +#include #endif // GTEST_OS_WINDOWS #if GTEST_OS_MAC -# include -# include -# include +#include +#include +#include #endif // GTEST_OS_MAC #if GTEST_OS_DRAGONFLY || GTEST_OS_FREEBSD || GTEST_OS_GNU_KFREEBSD || \ GTEST_OS_NETBSD || GTEST_OS_OPENBSD -# include -# if GTEST_OS_DRAGONFLY || GTEST_OS_FREEBSD || GTEST_OS_GNU_KFREEBSD -# include -# endif +#include +#if GTEST_OS_DRAGONFLY || GTEST_OS_FREEBSD || GTEST_OS_GNU_KFREEBSD +#include +#endif #endif #if GTEST_OS_QNX -# include -# include -# include +#include +#include +#include #endif // GTEST_OS_QNX #if GTEST_OS_AIX -# include -# include +#include +#include #endif // GTEST_OS_AIX #if GTEST_OS_FUCHSIA -# include -# include +#include +#include #endif // GTEST_OS_FUCHSIA -#include "gtest/gtest-spi.h" #include "gtest/gtest-message.h" +#include "gtest/gtest-spi.h" #include "gtest/internal/gtest-internal.h" #include "gtest/internal/gtest-string.h" #include "src/gtest-internal-inl.h" @@ -89,16 +90,7 @@ namespace testing { namespace internal { -#if defined(_MSC_VER) || defined(__BORLANDC__) -// MSVC and C++Builder do not provide a definition of STDERR_FILENO. -const int kStdOutFileno = 1; -const int kStdErrFileno = 2; -#else -const int kStdOutFileno = STDOUT_FILENO; -const int kStdErrFileno = STDERR_FILENO; -#endif // _MSC_VER - -#if GTEST_OS_LINUX +#if GTEST_OS_LINUX || GTEST_OS_GNU_HURD namespace { template @@ -131,8 +123,7 @@ size_t GetThreadCount() { if (status == KERN_SUCCESS) { // task_threads allocates resources in thread_list and we need to free them // to avoid leaks. - vm_deallocate(task, - reinterpret_cast(thread_list), + vm_deallocate(task, reinterpret_cast(thread_list), sizeof(thread_t) * thread_count); return static_cast(thread_count); } else { @@ -141,7 +132,7 @@ size_t GetThreadCount() { } #elif GTEST_OS_DRAGONFLY || GTEST_OS_FREEBSD || GTEST_OS_GNU_KFREEBSD || \ - GTEST_OS_NETBSD + GTEST_OS_NETBSD #if GTEST_OS_NETBSD #undef KERN_PROC @@ -184,12 +175,12 @@ size_t GetThreadCount() { // we cannot detect it. size_t GetThreadCount() { int mib[] = { - CTL_KERN, - KERN_PROC, - KERN_PROC_PID | KERN_PROC_SHOW_THREADS, - getpid(), - sizeof(struct kinfo_proc), - 0, + CTL_KERN, + KERN_PROC, + KERN_PROC_PID | KERN_PROC_SHOW_THREADS, + getpid(), + sizeof(struct kinfo_proc), + 0, }; u_int miblen = sizeof(mib) / sizeof(mib[0]); @@ -210,8 +201,7 @@ size_t GetThreadCount() { // exclude empty members size_t nthreads = 0; for (size_t i = 0; i < size / static_cast(mib[4]); i++) { - if (info[i].p_tid != -1) - nthreads++; + if (info[i].p_tid != -1) nthreads++; } return nthreads; } @@ -254,13 +244,9 @@ size_t GetThreadCount() { size_t GetThreadCount() { int dummy_buffer; size_t avail; - zx_status_t status = zx_object_get_info( - zx_process_self(), - ZX_INFO_PROCESS_THREADS, - &dummy_buffer, - 0, - nullptr, - &avail); + zx_status_t status = + zx_object_get_info(zx_process_self(), ZX_INFO_PROCESS_THREADS, + &dummy_buffer, 0, nullptr, &avail); if (status == ZX_OK) { return avail; } else { @@ -280,27 +266,15 @@ size_t GetThreadCount() { #if GTEST_IS_THREADSAFE && GTEST_OS_WINDOWS -void SleepMilliseconds(int n) { - ::Sleep(static_cast(n)); -} +AutoHandle::AutoHandle() : handle_(INVALID_HANDLE_VALUE) {} -AutoHandle::AutoHandle() - : handle_(INVALID_HANDLE_VALUE) {} +AutoHandle::AutoHandle(Handle handle) : handle_(handle) {} -AutoHandle::AutoHandle(Handle handle) - : handle_(handle) {} +AutoHandle::~AutoHandle() { Reset(); } -AutoHandle::~AutoHandle() { - Reset(); -} - -AutoHandle::Handle AutoHandle::Get() const { - return handle_; -} +AutoHandle::Handle AutoHandle::Get() const { return handle_; } -void AutoHandle::Reset() { - Reset(INVALID_HANDLE_VALUE); -} +void AutoHandle::Reset() { Reset(INVALID_HANDLE_VALUE); } void AutoHandle::Reset(HANDLE handle) { // Resetting with the same handle we already own is invalid. @@ -312,7 +286,7 @@ void AutoHandle::Reset(HANDLE handle) { } else { GTEST_CHECK_(!IsCloseable()) << "Resetting a valid handle to itself is likely a programmer error " - "and thus not allowed."; + "and thus not allowed."; } } @@ -322,23 +296,6 @@ bool AutoHandle::IsCloseable() const { return handle_ != nullptr && handle_ != INVALID_HANDLE_VALUE; } -Notification::Notification() - : event_(::CreateEvent(nullptr, // Default security attributes. - TRUE, // Do not reset automatically. - FALSE, // Initially unset. - nullptr)) { // Anonymous event. - GTEST_CHECK_(event_.Get() != nullptr); -} - -void Notification::Notify() { - GTEST_CHECK_(::SetEvent(event_.Get()) != FALSE); -} - -void Notification::WaitForNotification() { - GTEST_CHECK_( - ::WaitForSingleObject(event_.Get(), INFINITE) == WAIT_OBJECT_0); -} - Mutex::Mutex() : owner_thread_id_(0), type_(kDynamic), @@ -391,25 +348,25 @@ namespace { // MemoryIsNotDeallocated memory_is_not_deallocated; // critical_section_ = new CRITICAL_SECTION; // -class MemoryIsNotDeallocated -{ +class MemoryIsNotDeallocated { public: MemoryIsNotDeallocated() : old_crtdbg_flag_(0) { old_crtdbg_flag_ = _CrtSetDbgFlag(_CRTDBG_REPORT_FLAG); // Set heap allocation block type to _IGNORE_BLOCK so that MS debug CRT // doesn't report mem leak if there's no matching deallocation. - _CrtSetDbgFlag(old_crtdbg_flag_ & ~_CRTDBG_ALLOC_MEM_DF); + (void)_CrtSetDbgFlag(old_crtdbg_flag_ & ~_CRTDBG_ALLOC_MEM_DF); } ~MemoryIsNotDeallocated() { // Restore the original _CRTDBG_ALLOC_MEM_DF flag - _CrtSetDbgFlag(old_crtdbg_flag_); + (void)_CrtSetDbgFlag(old_crtdbg_flag_); } private: int old_crtdbg_flag_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(MemoryIsNotDeallocated); + MemoryIsNotDeallocated(const MemoryIsNotDeallocated&) = delete; + MemoryIsNotDeallocated& operator=(const MemoryIsNotDeallocated&) = delete; }; #endif // _MSC_VER @@ -435,15 +392,13 @@ void Mutex::ThreadSafeLazyInit() { ::InitializeCriticalSection(critical_section_); // Updates the critical_section_init_phase_ to 2 to signal // initialization complete. - GTEST_CHECK_(::InterlockedCompareExchange( - &critical_section_init_phase_, 2L, 1L) == - 1L); + GTEST_CHECK_(::InterlockedCompareExchange(&critical_section_init_phase_, + 2L, 1L) == 1L); break; case 1: // Somebody else is already initializing the mutex; spin until they // are done. - while (::InterlockedCompareExchange(&critical_section_init_phase_, - 2L, + while (::InterlockedCompareExchange(&critical_section_init_phase_, 2L, 2L) != 2L) { // Possibly yields the rest of the thread's time slice to other // threads. @@ -488,9 +443,7 @@ class ThreadWithParamSupport : public ThreadWithParamBase { private: struct ThreadMainParam { ThreadMainParam(Runnable* runnable, Notification* thread_can_start) - : runnable_(runnable), - thread_can_start_(thread_can_start) { - } + : runnable_(runnable), thread_can_start_(thread_can_start) {} std::unique_ptr runnable_; // Does not own. Notification* thread_can_start_; @@ -508,20 +461,18 @@ class ThreadWithParamSupport : public ThreadWithParamBase { // Prohibit instantiation. ThreadWithParamSupport(); - GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadWithParamSupport); + ThreadWithParamSupport(const ThreadWithParamSupport&) = delete; + ThreadWithParamSupport& operator=(const ThreadWithParamSupport&) = delete; }; } // namespace -ThreadWithParamBase::ThreadWithParamBase(Runnable *runnable, +ThreadWithParamBase::ThreadWithParamBase(Runnable* runnable, Notification* thread_can_start) - : thread_(ThreadWithParamSupport::CreateThread(runnable, - thread_can_start)) { -} + : thread_( + ThreadWithParamSupport::CreateThread(runnable, thread_can_start)) {} -ThreadWithParamBase::~ThreadWithParamBase() { - Join(); -} +ThreadWithParamBase::~ThreadWithParamBase() { Join(); } void ThreadWithParamBase::Join() { GTEST_CHECK_(::WaitForSingleObject(thread_.Get(), INFINITE) == WAIT_OBJECT_0) @@ -548,8 +499,10 @@ class ThreadLocalRegistryImpl { ThreadIdToThreadLocals::iterator thread_local_pos = thread_to_thread_locals->find(current_thread); if (thread_local_pos == thread_to_thread_locals->end()) { - thread_local_pos = thread_to_thread_locals->insert( - std::make_pair(current_thread, ThreadLocalValues())).first; + thread_local_pos = + thread_to_thread_locals + ->insert(std::make_pair(current_thread, ThreadLocalValues())) + .first; StartWatcherThreadFor(current_thread); } ThreadLocalValues& thread_local_values = thread_local_pos->second; @@ -577,9 +530,8 @@ class ThreadLocalRegistryImpl { ThreadIdToThreadLocals* const thread_to_thread_locals = GetThreadLocalsMapLocked(); for (ThreadIdToThreadLocals::iterator it = - thread_to_thread_locals->begin(); - it != thread_to_thread_locals->end(); - ++it) { + thread_to_thread_locals->begin(); + it != thread_to_thread_locals->end(); ++it) { ThreadLocalValues& thread_local_values = it->second; ThreadLocalValues::iterator value_pos = thread_local_values.find(thread_local_instance); @@ -609,9 +561,8 @@ class ThreadLocalRegistryImpl { if (thread_local_pos != thread_to_thread_locals->end()) { ThreadLocalValues& thread_local_values = thread_local_pos->second; for (ThreadLocalValues::iterator value_pos = - thread_local_values.begin(); - value_pos != thread_local_values.end(); - ++value_pos) { + thread_local_values.begin(); + value_pos != thread_local_values.end(); ++value_pos) { value_holders.push_back(value_pos->second); } thread_to_thread_locals->erase(thread_local_pos); @@ -637,9 +588,8 @@ class ThreadLocalRegistryImpl { static void StartWatcherThreadFor(DWORD thread_id) { // The returned handle will be kept in thread_map and closed by // watcher_thread in WatcherThreadFunc. - HANDLE thread = ::OpenThread(SYNCHRONIZE | THREAD_QUERY_INFORMATION, - FALSE, - thread_id); + HANDLE thread = + ::OpenThread(SYNCHRONIZE | THREAD_QUERY_INFORMATION, FALSE, thread_id); GTEST_CHECK_(thread != nullptr); // We need to pass a valid thread ID pointer into CreateThread for it // to work correctly under Win98. @@ -650,7 +600,8 @@ class ThreadLocalRegistryImpl { &ThreadLocalRegistryImpl::WatcherThreadFunc, reinterpret_cast(new ThreadIdAndHandle(thread_id, thread)), CREATE_SUSPENDED, &watcher_thread_id); - GTEST_CHECK_(watcher_thread != nullptr); + GTEST_CHECK_(watcher_thread != nullptr) + << "CreateThread failed with error " << ::GetLastError() << "."; // Give the watcher thread the same priority as ours to avoid being // blocked by it. ::SetThreadPriority(watcher_thread, @@ -664,8 +615,7 @@ class ThreadLocalRegistryImpl { static DWORD WINAPI WatcherThreadFunc(LPVOID param) { const ThreadIdAndHandle* tah = reinterpret_cast(param); - GTEST_CHECK_( - ::WaitForSingleObject(tah->second, INFINITE) == WAIT_OBJECT_0); + GTEST_CHECK_(::WaitForSingleObject(tah->second, INFINITE) == WAIT_OBJECT_0); OnThreadExit(tah->first); ::CloseHandle(tah->second); delete tah; @@ -689,16 +639,17 @@ class ThreadLocalRegistryImpl { }; Mutex ThreadLocalRegistryImpl::mutex_(Mutex::kStaticMutex); // NOLINT -Mutex ThreadLocalRegistryImpl::thread_map_mutex_(Mutex::kStaticMutex); // NOLINT +Mutex ThreadLocalRegistryImpl::thread_map_mutex_( + Mutex::kStaticMutex); // NOLINT ThreadLocalValueHolderBase* ThreadLocalRegistry::GetValueOnCurrentThread( - const ThreadLocalBase* thread_local_instance) { + const ThreadLocalBase* thread_local_instance) { return ThreadLocalRegistryImpl::GetValueOnCurrentThread( thread_local_instance); } void ThreadLocalRegistry::OnThreadLocalDestroyed( - const ThreadLocalBase* thread_local_instance) { + const ThreadLocalBase* thread_local_instance) { ThreadLocalRegistryImpl::OnThreadLocalDestroyed(thread_local_instance); } @@ -786,7 +737,7 @@ bool IsRepeat(char ch) { return IsInSet(ch, "?*+"); } bool IsAsciiWhiteSpace(char ch) { return IsInSet(ch, " \f\n\r\t\v"); } bool IsAsciiWordChar(char ch) { return ('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z') || - ('0' <= ch && ch <= '9') || ch == '_'; + ('0' <= ch && ch <= '9') || ch == '_'; } // Returns true if and only if "\\c" is a supported escape sequence. @@ -799,17 +750,28 @@ bool IsValidEscape(char c) { bool AtomMatchesChar(bool escaped, char pattern_char, char ch) { if (escaped) { // "\\p" where p is pattern_char. switch (pattern_char) { - case 'd': return IsAsciiDigit(ch); - case 'D': return !IsAsciiDigit(ch); - case 'f': return ch == '\f'; - case 'n': return ch == '\n'; - case 'r': return ch == '\r'; - case 's': return IsAsciiWhiteSpace(ch); - case 'S': return !IsAsciiWhiteSpace(ch); - case 't': return ch == '\t'; - case 'v': return ch == '\v'; - case 'w': return IsAsciiWordChar(ch); - case 'W': return !IsAsciiWordChar(ch); + case 'd': + return IsAsciiDigit(ch); + case 'D': + return !IsAsciiDigit(ch); + case 'f': + return ch == '\f'; + case 'n': + return ch == '\n'; + case 'r': + return ch == '\r'; + case 's': + return IsAsciiWhiteSpace(ch); + case 'S': + return !IsAsciiWhiteSpace(ch); + case 't': + return ch == '\t'; + case 'v': + return ch == '\v'; + case 'w': + return IsAsciiWordChar(ch); + case 'W': + return !IsAsciiWordChar(ch); } return IsAsciiPunct(pattern_char) && pattern_char == ch; } @@ -820,7 +782,8 @@ bool AtomMatchesChar(bool escaped, char pattern_char, char ch) { // Helper function used by ValidateRegex() to format error messages. static std::string FormatRegexSyntaxError(const char* regex, int index) { return (Message() << "Syntax error at index " << index - << " in simple regular expression \"" << regex << "\": ").GetString(); + << " in simple regular expression \"" << regex << "\": ") + .GetString(); } // Generates non-fatal failures and returns false if regex is invalid; @@ -862,12 +825,12 @@ bool ValidateRegex(const char* regex) { << "'$' can only appear at the end."; is_valid = false; } else if (IsInSet(ch, "()[]{}|")) { - ADD_FAILURE() << FormatRegexSyntaxError(regex, i) - << "'" << ch << "' is unsupported."; + ADD_FAILURE() << FormatRegexSyntaxError(regex, i) << "'" << ch + << "' is unsupported."; is_valid = false; } else if (IsRepeat(ch) && !prev_repeatable) { - ADD_FAILURE() << FormatRegexSyntaxError(regex, i) - << "'" << ch << "' can only follow a repeatable token."; + ADD_FAILURE() << FormatRegexSyntaxError(regex, i) << "'" << ch + << "' can only follow a repeatable token."; is_valid = false; } @@ -885,12 +848,10 @@ bool ValidateRegex(const char* regex) { // characters to be indexable by size_t, in which case the test will // probably time out anyway. We are fine with this limitation as // std::string has it too. -bool MatchRepetitionAndRegexAtHead( - bool escaped, char c, char repeat, const char* regex, - const char* str) { +bool MatchRepetitionAndRegexAtHead(bool escaped, char c, char repeat, + const char* regex, const char* str) { const size_t min_count = (repeat == '+') ? 1 : 0; - const size_t max_count = (repeat == '?') ? 1 : - static_cast(-1) - 1; + const size_t max_count = (repeat == '?') ? 1 : static_cast(-1) - 1; // We cannot call numeric_limits::max() as it conflicts with the // max() macro on Windows. @@ -903,8 +864,7 @@ bool MatchRepetitionAndRegexAtHead( // greedy match. return true; } - if (str[i] == '\0' || !AtomMatchesChar(escaped, c, str[i])) - return false; + if (str[i] == '\0' || !AtomMatchesChar(escaped, c, str[i])) return false; } return false; } @@ -918,25 +878,23 @@ bool MatchRegexAtHead(const char* regex, const char* str) { // "$" only matches the end of a string. Note that regex being // valid guarantees that there's nothing after "$" in it. - if (*regex == '$') - return *str == '\0'; + if (*regex == '$') return *str == '\0'; // Is the first thing in regex an escape sequence? const bool escaped = *regex == '\\'; - if (escaped) - ++regex; + if (escaped) ++regex; if (IsRepeat(regex[1])) { // MatchRepetitionAndRegexAtHead() calls MatchRegexAtHead(), so // here's an indirect recursion. It terminates as the regex gets // shorter in each recursion. - return MatchRepetitionAndRegexAtHead( - escaped, regex[0], regex[1], regex + 2, str); + return MatchRepetitionAndRegexAtHead(escaped, regex[0], regex[1], regex + 2, + str); } else { // regex isn't empty, isn't "$", and doesn't start with a // repetition. We match the first atom of regex with the first // character of str and recurse. return (*str != '\0') && AtomMatchesChar(escaped, *regex, *str) && - MatchRegexAtHead(regex + 1, str + 1); + MatchRegexAtHead(regex + 1, str + 1); } } @@ -951,13 +909,11 @@ bool MatchRegexAtHead(const char* regex, const char* str) { bool MatchRegexAnywhere(const char* regex, const char* str) { if (regex == nullptr || str == nullptr) return false; - if (*regex == '^') - return MatchRegexAtHead(regex + 1, str); + if (*regex == '^') return MatchRegexAtHead(regex + 1, str); // A successful match can be anywhere in str. do { - if (MatchRegexAtHead(regex, str)) - return true; + if (MatchRegexAtHead(regex, str)) return true; } while (*str++ != '\0'); return false; } @@ -1038,8 +994,8 @@ GTEST_API_ ::std::string FormatFileLocation(const char* file, int line) { // FormatFileLocation in order to contrast the two functions. // Note that FormatCompilerIndependentFileLocation() does NOT append colon // to the file location it produces, unlike FormatFileLocation(). -GTEST_API_ ::std::string FormatCompilerIndependentFileLocation( - const char* file, int line) { +GTEST_API_ ::std::string FormatCompilerIndependentFileLocation(const char* file, + int line) { const std::string file_name(file == nullptr ? kUnknownFile : file); if (line < 0) @@ -1050,12 +1006,13 @@ GTEST_API_ ::std::string FormatCompilerIndependentFileLocation( GTestLog::GTestLog(GTestLogSeverity severity, const char* file, int line) : severity_(severity) { - const char* const marker = - severity == GTEST_INFO ? "[ INFO ]" : - severity == GTEST_WARNING ? "[WARNING]" : - severity == GTEST_ERROR ? "[ ERROR ]" : "[ FATAL ]"; - GetStream() << ::std::endl << marker << " " - << FormatFileLocation(file, line).c_str() << ": "; + const char* const marker = severity == GTEST_INFO ? "[ INFO ]" + : severity == GTEST_WARNING ? "[WARNING]" + : severity == GTEST_ERROR ? "[ ERROR ]" + : "[ FATAL ]"; + GetStream() << ::std::endl + << marker << " " << FormatFileLocation(file, line).c_str() + << ": "; } // Flushes the buffers and, if severity is GTEST_FATAL, aborts the program. @@ -1078,27 +1035,26 @@ class CapturedStream { public: // The ctor redirects the stream to a temporary file. explicit CapturedStream(int fd) : fd_(fd), uncaptured_fd_(dup(fd)) { -# if GTEST_OS_WINDOWS - char temp_dir_path[MAX_PATH + 1] = { '\0' }; // NOLINT - char temp_file_path[MAX_PATH + 1] = { '\0' }; // NOLINT +#if GTEST_OS_WINDOWS + char temp_dir_path[MAX_PATH + 1] = {'\0'}; // NOLINT + char temp_file_path[MAX_PATH + 1] = {'\0'}; // NOLINT ::GetTempPathA(sizeof(temp_dir_path), temp_dir_path); - const UINT success = ::GetTempFileNameA(temp_dir_path, - "gtest_redir", + const UINT success = ::GetTempFileNameA(temp_dir_path, "gtest_redir", 0, // Generate unique file name. temp_file_path); GTEST_CHECK_(success != 0) << "Unable to create a temporary file in " << temp_dir_path; const int captured_fd = creat(temp_file_path, _S_IREAD | _S_IWRITE); - GTEST_CHECK_(captured_fd != -1) << "Unable to open temporary file " - << temp_file_path; + GTEST_CHECK_(captured_fd != -1) + << "Unable to open temporary file " << temp_file_path; filename_ = temp_file_path; -# else +#else // There's no guarantee that a test has write access to the current // directory, so we create the temporary file in a temporary directory. std::string name_template; -# if GTEST_OS_LINUX_ANDROID +#if GTEST_OS_LINUX_ANDROID // Note: Android applications are expected to call the framework's // Context.getExternalStorageDirectory() method through JNI to get // the location of the world-writable SD Card directory. However, @@ -1111,7 +1067,7 @@ class CapturedStream { // '/sdcard' and other variants cannot be relied on, as they are not // guaranteed to be mounted, or may have a delay in mounting. name_template = "/data/local/tmp/"; -# elif GTEST_OS_IOS +#elif GTEST_OS_IOS char user_temp_dir[PATH_MAX + 1]; // Documented alternative to NSTemporaryDirectory() (for obtaining creating @@ -1132,9 +1088,9 @@ class CapturedStream { name_template = user_temp_dir; if (name_template.back() != GTEST_PATH_SEP_[0]) name_template.push_back(GTEST_PATH_SEP_[0]); -# else +#else name_template = "/tmp/"; -# endif +#endif name_template.append("gtest_captured_stream.XXXXXX"); // mkstemp() modifies the string bytes in place, and does not go beyond the @@ -1150,15 +1106,13 @@ class CapturedStream { << " for test; does the test have access to the /tmp directory?"; } filename_ = std::move(name_template); -# endif // GTEST_OS_WINDOWS +#endif // GTEST_OS_WINDOWS fflush(nullptr); dup2(captured_fd, fd_); close(captured_fd); } - ~CapturedStream() { - remove(filename_.c_str()); - } + ~CapturedStream() { remove(filename_.c_str()); } std::string GetCapturedString() { if (uncaptured_fd_ != -1) { @@ -1185,7 +1139,8 @@ class CapturedStream { // Name of the temporary file holding the stderr output. ::std::string filename_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(CapturedStream); + CapturedStream(const CapturedStream&) = delete; + CapturedStream& operator=(const CapturedStream&) = delete; }; GTEST_DISABLE_MSC_DEPRECATED_POP_() @@ -1213,6 +1168,15 @@ static std::string GetCapturedStream(CapturedStream** captured_stream) { return content; } +#if defined(_MSC_VER) || defined(__BORLANDC__) +// MSVC and C++Builder do not provide a definition of STDERR_FILENO. +const int kStdOutFileno = 1; +const int kStdErrFileno = 2; +#else +const int kStdOutFileno = STDOUT_FILENO; +const int kStdErrFileno = STDERR_FILENO; +#endif // defined(_MSC_VER) || defined(__BORLANDC__) + // Starts capturing stdout. void CaptureStdout() { CaptureStream(kStdOutFileno, "stdout", &g_captured_stdout); @@ -1235,10 +1199,6 @@ std::string GetCapturedStderr() { #endif // GTEST_HAS_STREAM_REDIRECTION - - - - size_t GetFileSize(FILE* file) { fseek(file, 0, SEEK_END); return static_cast(ftell(file)); @@ -1256,7 +1216,8 @@ std::string ReadEntireFile(FILE* file) { // Keeps reading the file until we cannot read further or the // pre-determined file size is reached. do { - bytes_last_read = fread(buffer+bytes_read, 1, file_size-bytes_read, file); + bytes_last_read = + fread(buffer + bytes_read, 1, file_size - bytes_read, file); bytes_read += bytes_last_read; } while (bytes_last_read > 0 && bytes_read < file_size); @@ -1344,7 +1305,7 @@ bool ParseInt32(const Message& src_text, const char* str, int32_t* value) { // LONG_MAX or LONG_MIN when the input overflows.) result != long_value // The parsed value overflows as an int32_t. - ) { + ) { Message msg; msg << "WARNING: " << src_text << " is expected to be a 32-bit integer, but actually" @@ -1388,8 +1349,8 @@ int32_t Int32FromGTestEnv(const char* flag, int32_t default_value) { } int32_t result = default_value; - if (!ParseInt32(Message() << "Environment variable " << env_var, - string_value, &result)) { + if (!ParseInt32(Message() << "Environment variable " << env_var, string_value, + &result)) { printf("The default value %s is used.\n", (Message() << default_value).GetString().c_str()); fflush(stdout); @@ -1408,7 +1369,7 @@ int32_t Int32FromGTestEnv(const char* flag, int32_t default_value) { // not check that the flag is 'output' // In essence this checks an env variable called XML_OUTPUT_FILE // and if it is set we prepend "xml:" to its value, if it not set we return "" -std::string OutputFlagAlsoCheckEnvVar(){ +std::string OutputFlagAlsoCheckEnvVar() { std::string default_value_for_output_flag = ""; const char* xml_output_file_env = posix::GetEnv("XML_OUTPUT_FILE"); if (nullptr != xml_output_file_env) { diff --git a/third_party/googletest/src/src/gtest-printers.cc b/third_party/googletest/src/src/gtest-printers.cc index 1b68fcb500..f3976d230d 100644 --- a/third_party/googletest/src/src/gtest-printers.cc +++ b/third_party/googletest/src/src/gtest-printers.cc @@ -27,7 +27,6 @@ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - // Google Test - The Google C++ Testing and Mocking Framework // // This file implements a universal value printer that can print a @@ -101,7 +100,7 @@ void PrintBytesInObjectToImpl(const unsigned char* obj_bytes, size_t count, PrintByteSegmentInObjectTo(obj_bytes, 0, kChunkSize, os); *os << " ... "; // Rounds up to 2-byte boundary. - const size_t resume_pos = (count - kChunkSize + 1)/2*2; + const size_t resume_pos = (count - kChunkSize + 1) / 2 * 2; PrintByteSegmentInObjectTo(obj_bytes, resume_pos, count - resume_pos, os); } *os << ">"; @@ -136,11 +135,7 @@ void PrintBytesInObjectTo(const unsigned char* obj_bytes, size_t count, // - as is if it's a printable ASCII (e.g. 'a', '2', ' '), // - as a hexadecimal escape sequence (e.g. '\x7F'), or // - as a special escape sequence (e.g. '\r', '\n'). -enum CharFormat { - kAsIs, - kHexEscape, - kSpecialEscape -}; +enum CharFormat { kAsIs, kHexEscape, kSpecialEscape }; // Returns true if c is a printable ASCII character. We test the // value of c directly instead of calling isprint(), which is buggy on @@ -213,35 +208,21 @@ static CharFormat PrintAsStringLiteralTo(char32_t c, ostream* os) { } } -static const char* GetCharWidthPrefix(char) { - return ""; -} +static const char* GetCharWidthPrefix(char) { return ""; } -static const char* GetCharWidthPrefix(signed char) { - return ""; -} +static const char* GetCharWidthPrefix(signed char) { return ""; } -static const char* GetCharWidthPrefix(unsigned char) { - return ""; -} +static const char* GetCharWidthPrefix(unsigned char) { return ""; } #ifdef __cpp_char8_t -static const char* GetCharWidthPrefix(char8_t) { - return "u8"; -} +static const char* GetCharWidthPrefix(char8_t) { return "u8"; } #endif -static const char* GetCharWidthPrefix(char16_t) { - return "u"; -} +static const char* GetCharWidthPrefix(char16_t) { return "u"; } -static const char* GetCharWidthPrefix(char32_t) { - return "U"; -} +static const char* GetCharWidthPrefix(char32_t) { return "U"; } -static const char* GetCharWidthPrefix(wchar_t) { - return "L"; -} +static const char* GetCharWidthPrefix(wchar_t) { return "L"; } // Prints a char c as if it's part of a string literal, escaping it when // necessary; returns how c was formatted. @@ -276,8 +257,7 @@ void PrintCharAndCodeTo(Char c, ostream* os) { // To aid user debugging, we also print c's code in decimal, unless // it's 0 (in which case c was printed as '\\0', making the code // obvious). - if (c == 0) - return; + if (c == 0) return; *os << " (" << static_cast(c); // For more convenience, we print c's code again in hexadecimal, @@ -304,17 +284,60 @@ void PrintTo(char32_t c, ::std::ostream* os) { << static_cast(c); } +// gcc/clang __{u,}int128_t +#if defined(__SIZEOF_INT128__) +void PrintTo(__uint128_t v, ::std::ostream* os) { + if (v == 0) { + *os << "0"; + return; + } + + // Buffer large enough for ceil(log10(2^128))==39 and the null terminator + char buf[40]; + char* p = buf + sizeof(buf); + + // Some configurations have a __uint128_t, but no support for built in + // division. Do manual long division instead. + + uint64_t high = static_cast(v >> 64); + uint64_t low = static_cast(v); + + *--p = 0; + while (high != 0 || low != 0) { + uint64_t high_mod = high % 10; + high = high / 10; + // This is the long division algorithm specialized for a divisor of 10 and + // only two elements. + // Notable values: + // 2^64 / 10 == 1844674407370955161 + // 2^64 % 10 == 6 + const uint64_t carry = 6 * high_mod + low % 10; + low = low / 10 + high_mod * 1844674407370955161 + carry / 10; + + char digit = static_cast(carry % 10); + *--p = '0' + digit; + } + *os << p; +} +void PrintTo(__int128_t v, ::std::ostream* os) { + __uint128_t uv = static_cast<__uint128_t>(v); + if (v < 0) { + *os << "-"; + uv = -uv; + } + PrintTo(uv, os); +} +#endif // __SIZEOF_INT128__ + // Prints the given array of characters to the ostream. CharType must be either // char, char8_t, char16_t, char32_t, or wchar_t. // The array starts at begin, the length is len, it may include '\0' characters // and may not be NUL-terminated. template -GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ -GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ -GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_ -GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ -static CharFormat PrintCharsAsStringTo( - const CharType* begin, size_t len, ostream* os) { +GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ + GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_ + GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ static CharFormat + PrintCharsAsStringTo(const CharType* begin, size_t len, ostream* os) { const char* const quote_prefix = GetCharWidthPrefix(*begin); *os << quote_prefix << "\""; bool is_previous_hex = false; @@ -340,12 +363,11 @@ static CharFormat PrintCharsAsStringTo( // Prints a (const) char/wchar_t array of 'len' elements, starting at address // 'begin'. CharType must be either char or wchar_t. template -GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ -GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ -GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_ -GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ -static void UniversalPrintCharArray( - const CharType* begin, size_t len, ostream* os) { +GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ + GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_ + GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ static void + UniversalPrintCharArray(const CharType* begin, size_t len, + ostream* os) { // The code // const char kFoo[] = "foo"; // generates an array of 4, not 3, elements, with the last one being '\0'. @@ -436,28 +458,28 @@ void PrintTo(const wchar_t* s, ostream* os) { PrintCStringTo(s, os); } namespace { bool ContainsUnprintableControlCodes(const char* str, size_t length) { - const unsigned char *s = reinterpret_cast(str); + const unsigned char* s = reinterpret_cast(str); for (size_t i = 0; i < length; i++) { unsigned char ch = *s++; if (std::iscntrl(ch)) { - switch (ch) { + switch (ch) { case '\t': case '\n': case '\r': break; default: return true; - } } + } } return false; } -bool IsUTF8TrailByte(unsigned char t) { return 0x80 <= t && t<= 0xbf; } +bool IsUTF8TrailByte(unsigned char t) { return 0x80 <= t && t <= 0xbf; } bool IsValidUTF8(const char* str, size_t length) { - const unsigned char *s = reinterpret_cast(str); + const unsigned char* s = reinterpret_cast(str); for (size_t i = 0; i < length;) { unsigned char lead = s[i++]; @@ -470,15 +492,13 @@ bool IsValidUTF8(const char* str, size_t length) { } else if (lead <= 0xdf && (i + 1) <= length && IsUTF8TrailByte(s[i])) { ++i; // 2-byte character } else if (0xe0 <= lead && lead <= 0xef && (i + 2) <= length && - IsUTF8TrailByte(s[i]) && - IsUTF8TrailByte(s[i + 1]) && + IsUTF8TrailByte(s[i]) && IsUTF8TrailByte(s[i + 1]) && // check for non-shortest form and surrogate (lead != 0xe0 || s[i] >= 0xa0) && (lead != 0xed || s[i] < 0xa0)) { i += 2; // 3-byte character } else if (0xf0 <= lead && lead <= 0xf4 && (i + 3) <= length && - IsUTF8TrailByte(s[i]) && - IsUTF8TrailByte(s[i + 1]) && + IsUTF8TrailByte(s[i]) && IsUTF8TrailByte(s[i + 1]) && IsUTF8TrailByte(s[i + 2]) && // check for non-shortest form (lead != 0xf0 || s[i] >= 0x90) && @@ -502,7 +522,7 @@ void ConditionalPrintAsText(const char* str, size_t length, ostream* os) { void PrintStringTo(const ::std::string& s, ostream* os) { if (PrintCharsAsStringTo(s.data(), s.size(), os) == kHexEscape) { - if (GTEST_FLAG(print_utf8)) { + if (GTEST_FLAG_GET(print_utf8)) { ConditionalPrintAsText(s.data(), s.size(), os); } } diff --git a/third_party/googletest/src/src/gtest-test-part.cc b/third_party/googletest/src/src/gtest-test-part.cc index a938683ced..eb7c8d1cf9 100644 --- a/third_party/googletest/src/src/gtest-test-part.cc +++ b/third_party/googletest/src/src/gtest-test-part.cc @@ -51,13 +51,11 @@ std::ostream& operator<<(std::ostream& os, const TestPartResult& result) { return os << internal::FormatFileLocation(result.file_name(), result.line_number()) << " " - << (result.type() == TestPartResult::kSuccess - ? "Success" - : result.type() == TestPartResult::kSkip - ? "Skipped" - : result.type() == TestPartResult::kFatalFailure - ? "Fatal failure" - : "Non-fatal failure") + << (result.type() == TestPartResult::kSuccess ? "Success" + : result.type() == TestPartResult::kSkip ? "Skipped" + : result.type() == TestPartResult::kFatalFailure + ? "Fatal failure" + : "Non-fatal failure") << ":\n" << result.message() << std::endl; } @@ -86,8 +84,8 @@ namespace internal { HasNewFatalFailureHelper::HasNewFatalFailureHelper() : has_new_fatal_failure_(false), - original_reporter_(GetUnitTestImpl()-> - GetTestPartResultReporterForCurrentThread()) { + original_reporter_( + GetUnitTestImpl()->GetTestPartResultReporterForCurrentThread()) { GetUnitTestImpl()->SetTestPartResultReporterForCurrentThread(this); } @@ -98,8 +96,7 @@ HasNewFatalFailureHelper::~HasNewFatalFailureHelper() { void HasNewFatalFailureHelper::ReportTestPartResult( const TestPartResult& result) { - if (result.fatally_failed()) - has_new_fatal_failure_ = true; + if (result.fatally_failed()) has_new_fatal_failure_ = true; original_reporter_->ReportTestPartResult(result); } diff --git a/third_party/googletest/src/src/gtest-typed-test.cc b/third_party/googletest/src/src/gtest-typed-test.cc index c02c3df659..a2828b83c6 100644 --- a/third_party/googletest/src/src/gtest-typed-test.cc +++ b/third_party/googletest/src/src/gtest-typed-test.cc @@ -27,7 +27,6 @@ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - #include "gtest/gtest-typed-test.h" #include "gtest/gtest.h" @@ -38,8 +37,7 @@ namespace internal { // Skips to the first non-space char in str. Returns an empty string if str // contains only whitespace characters. static const char* SkipSpaces(const char* str) { - while (IsSpace(*str)) - str++; + while (IsSpace(*str)) str++; return str; } @@ -85,8 +83,7 @@ const char* TypedTestSuitePState::VerifyRegisteredTestNames( } for (RegisteredTestIter it = registered_tests_.begin(); - it != registered_tests_.end(); - ++it) { + it != registered_tests_.end(); ++it) { if (tests.count(it->first) == 0) { errors << "You forgot to list test " << it->first << ".\n"; } diff --git a/third_party/googletest/src/src/gtest.cc b/third_party/googletest/src/src/gtest.cc index 21c611aff1..6f31dd2260 100644 --- a/third_party/googletest/src/src/gtest.cc +++ b/third_party/googletest/src/src/gtest.cc @@ -31,8 +31,6 @@ // The Google C++ Testing and Mocking Framework (Google Test) #include "gtest/gtest.h" -#include "gtest/internal/custom/gtest.h" -#include "gtest/gtest-spi.h" #include #include @@ -46,79 +44,87 @@ #include // NOLINT #include #include +#include #include +#include #include #include #include #include // NOLINT #include +#include #include +#include "gtest/gtest-assertion-result.h" +#include "gtest/gtest-spi.h" +#include "gtest/internal/custom/gtest.h" + #if GTEST_OS_LINUX -# include // NOLINT -# include // NOLINT -# include // NOLINT +#include // NOLINT +#include // NOLINT +#include // NOLINT // Declares vsnprintf(). This header is not available on Windows. -# include // NOLINT -# include // NOLINT -# include // NOLINT -# include // NOLINT -# include +#include // NOLINT +#include // NOLINT +#include // NOLINT +#include // NOLINT + +#include #elif GTEST_OS_ZOS -# include // NOLINT +#include // NOLINT // On z/OS we additionally need strings.h for strcasecmp. -# include // NOLINT +#include // NOLINT #elif GTEST_OS_WINDOWS_MOBILE // We are on Windows CE. -# include // NOLINT -# undef min +#include // NOLINT +#undef min #elif GTEST_OS_WINDOWS // We are on Windows proper. -# include // NOLINT -# undef min +#include // NOLINT +#undef min #ifdef _MSC_VER -# include // NOLINT +#include // NOLINT #endif -# include // NOLINT -# include // NOLINT -# include // NOLINT -# include // NOLINT +#include // NOLINT +#include // NOLINT +#include // NOLINT +#include // NOLINT -# if GTEST_OS_WINDOWS_MINGW -# include // NOLINT -# endif // GTEST_OS_WINDOWS_MINGW +#if GTEST_OS_WINDOWS_MINGW +#include // NOLINT +#endif // GTEST_OS_WINDOWS_MINGW #else // cpplint thinks that the header is already included, so we want to // silence it. -# include // NOLINT -# include // NOLINT +#include // NOLINT +#include // NOLINT #endif // GTEST_OS_LINUX #if GTEST_HAS_EXCEPTIONS -# include +#include #endif #if GTEST_CAN_STREAM_RESULTS_ -# include // NOLINT -# include // NOLINT -# include // NOLINT -# include // NOLINT +#include // NOLINT +#include // NOLINT +#include // NOLINT +#include // NOLINT #endif #include "src/gtest-internal-inl.h" #if GTEST_OS_WINDOWS -# define vsnprintf _vsnprintf +#define vsnprintf _vsnprintf #endif // GTEST_OS_WINDOWS #if GTEST_OS_MAC @@ -131,7 +137,10 @@ #include "absl/debugging/failure_signal_handler.h" #include "absl/debugging/stacktrace.h" #include "absl/debugging/symbolize.h" +#include "absl/flags/parse.h" +#include "absl/flags/usage.h" #include "absl/strings/str_cat.h" +#include "absl/strings/str_replace.h" #endif // GTEST_HAS_ABSL namespace testing { @@ -177,7 +186,7 @@ const char kStackTraceMarker[] = "\nStack trace:\n"; // is specified on the command line. bool g_help_flag = false; -// Utilty function to Open File for Writing +// Utility function to Open File for Writing static FILE* OpenFileForWriting(const std::string& output_file) { FILE* fileout = nullptr; FilePath output_file_path(output_file); @@ -216,28 +225,33 @@ static bool GetDefaultFailFast() { return false; } +} // namespace testing + GTEST_DEFINE_bool_( - fail_fast, internal::BoolFromGTestEnv("fail_fast", GetDefaultFailFast()), + fail_fast, + testing::internal::BoolFromGTestEnv("fail_fast", + testing::GetDefaultFailFast()), "True if and only if a test failure should stop further test execution."); GTEST_DEFINE_bool_( also_run_disabled_tests, - internal::BoolFromGTestEnv("also_run_disabled_tests", false), + testing::internal::BoolFromGTestEnv("also_run_disabled_tests", false), "Run disabled tests too, in addition to the tests normally being run."); GTEST_DEFINE_bool_( - break_on_failure, internal::BoolFromGTestEnv("break_on_failure", false), + break_on_failure, + testing::internal::BoolFromGTestEnv("break_on_failure", false), "True if and only if a failed assertion should be a debugger " "break-point."); GTEST_DEFINE_bool_(catch_exceptions, - internal::BoolFromGTestEnv("catch_exceptions", true), + testing::internal::BoolFromGTestEnv("catch_exceptions", + true), "True if and only if " GTEST_NAME_ " should catch exceptions and treat them as test failures."); GTEST_DEFINE_string_( - color, - internal::StringFromGTestEnv("color", "auto"), + color, testing::internal::StringFromGTestEnv("color", "auto"), "Whether to use colors in the output. Valid values: yes, no, " "and auto. 'auto' means to use colors if the output is " "being sent to a terminal and the TERM environment variable " @@ -245,7 +259,8 @@ GTEST_DEFINE_string_( GTEST_DEFINE_string_( filter, - internal::StringFromGTestEnv("filter", GetDefaultFilter()), + testing::internal::StringFromGTestEnv("filter", + testing::GetDefaultFilter()), "A colon-separated list of glob (not regex) patterns " "for filtering the tests to run, optionally followed by a " "'-' and a : separated list of negative patterns (tests to " @@ -254,13 +269,14 @@ GTEST_DEFINE_string_( GTEST_DEFINE_bool_( install_failure_signal_handler, - internal::BoolFromGTestEnv("install_failure_signal_handler", false), - "If true and supported on the current platform, " GTEST_NAME_ " should " + testing::internal::BoolFromGTestEnv("install_failure_signal_handler", + false), + "If true and supported on the current platform, " GTEST_NAME_ + " should " "install a signal handler that dumps debugging information when fatal " "signals are raised."); -GTEST_DEFINE_bool_(list_tests, false, - "List all tests without running them."); +GTEST_DEFINE_bool_(list_tests, false, "List all tests without running them."); // The net priority order after flag processing is thus: // --gtest_output command line flag @@ -269,8 +285,8 @@ GTEST_DEFINE_bool_(list_tests, false, // '' GTEST_DEFINE_string_( output, - internal::StringFromGTestEnv("output", - internal::OutputFlagAlsoCheckEnvVar().c_str()), + testing::internal::StringFromGTestEnv( + "output", testing::internal::OutputFlagAlsoCheckEnvVar().c_str()), "A format (defaults to \"xml\" but can be specified to be \"json\"), " "optionally followed by a colon and an output file name or directory. " "A directory is indicated by a trailing pathname separator. " @@ -281,65 +297,79 @@ GTEST_DEFINE_string_( "digits."); GTEST_DEFINE_bool_( - brief, internal::BoolFromGTestEnv("brief", false), + brief, testing::internal::BoolFromGTestEnv("brief", false), "True if only test failures should be displayed in text output."); -GTEST_DEFINE_bool_(print_time, internal::BoolFromGTestEnv("print_time", true), +GTEST_DEFINE_bool_(print_time, + testing::internal::BoolFromGTestEnv("print_time", true), "True if and only if " GTEST_NAME_ " should display elapsed time in text output."); -GTEST_DEFINE_bool_(print_utf8, internal::BoolFromGTestEnv("print_utf8", true), +GTEST_DEFINE_bool_(print_utf8, + testing::internal::BoolFromGTestEnv("print_utf8", true), "True if and only if " GTEST_NAME_ " prints UTF8 characters as text."); GTEST_DEFINE_int32_( - random_seed, - internal::Int32FromGTestEnv("random_seed", 0), + random_seed, testing::internal::Int32FromGTestEnv("random_seed", 0), "Random number seed to use when shuffling test orders. Must be in range " "[1, 99999], or 0 to use a seed based on the current time."); GTEST_DEFINE_int32_( - repeat, - internal::Int32FromGTestEnv("repeat", 1), + repeat, testing::internal::Int32FromGTestEnv("repeat", 1), "How many times to repeat each test. Specify a negative number " "for repeating forever. Useful for shaking out flaky tests."); +GTEST_DEFINE_bool_( + recreate_environments_when_repeating, + testing::internal::BoolFromGTestEnv("recreate_environments_when_repeating", + false), + "Controls whether global test environments are recreated for each repeat " + "of the tests. If set to false the global test environments are only set " + "up once, for the first iteration, and only torn down once, for the last. " + "Useful for shaking out flaky tests with stable, expensive test " + "environments. If --gtest_repeat is set to a negative number, meaning " + "there is no last run, the environments will always be recreated to avoid " + "leaks."); + GTEST_DEFINE_bool_(show_internal_stack_frames, false, "True if and only if " GTEST_NAME_ " should include internal stack frames when " "printing test failure stack traces."); -GTEST_DEFINE_bool_(shuffle, internal::BoolFromGTestEnv("shuffle", false), +GTEST_DEFINE_bool_(shuffle, + testing::internal::BoolFromGTestEnv("shuffle", false), "True if and only if " GTEST_NAME_ " should randomize tests' order on every run."); GTEST_DEFINE_int32_( stack_trace_depth, - internal::Int32FromGTestEnv("stack_trace_depth", kMaxStackTraceDepth), + testing::internal::Int32FromGTestEnv("stack_trace_depth", + testing::kMaxStackTraceDepth), "The maximum number of stack frames to print when an " "assertion fails. The valid range is 0 through 100, inclusive."); GTEST_DEFINE_string_( stream_result_to, - internal::StringFromGTestEnv("stream_result_to", ""), + testing::internal::StringFromGTestEnv("stream_result_to", ""), "This flag specifies the host name and the port number on which to stream " "test results. Example: \"localhost:555\". The flag is effective only on " "Linux."); GTEST_DEFINE_bool_( throw_on_failure, - internal::BoolFromGTestEnv("throw_on_failure", false), + testing::internal::BoolFromGTestEnv("throw_on_failure", false), "When this flag is specified, a failed assertion will throw an exception " "if exceptions are enabled or exit the program with a non-zero code " "otherwise. For use with an external test framework."); #if GTEST_USE_OWN_FLAGFILE_FLAG_ GTEST_DEFINE_string_( - flagfile, - internal::StringFromGTestEnv("flagfile", ""), + flagfile, testing::internal::StringFromGTestEnv("flagfile", ""), "This flag specifies the flagfile to read command-line flags from."); #endif // GTEST_USE_OWN_FLAGFILE_FLAG_ +namespace testing { namespace internal { // Generates a random number from [0, range), using a Linear @@ -348,10 +378,9 @@ namespace internal { uint32_t Random::Generate(uint32_t range) { // These constants are the same as are used in glibc's rand(3). // Use wider types than necessary to prevent unsigned overflow diagnostics. - state_ = static_cast(1103515245ULL*state_ + 12345U) % kMaxRange; + state_ = static_cast(1103515245ULL * state_ + 12345U) % kMaxRange; - GTEST_CHECK_(range > 0) - << "Cannot generate a number in the range [0, 0)."; + GTEST_CHECK_(range > 0) << "Cannot generate a number in the range [0, 0)."; GTEST_CHECK_(range <= kMaxRange) << "Generation of a number in [0, " << range << ") was requested, " << "but this can only generate numbers in [0, " << kMaxRange << ")."; @@ -396,32 +425,26 @@ static bool ShouldRunTestSuite(const TestSuite* test_suite) { } // AssertHelper constructor. -AssertHelper::AssertHelper(TestPartResult::Type type, - const char* file, - int line, - const char* message) - : data_(new AssertHelperData(type, file, line, message)) { -} +AssertHelper::AssertHelper(TestPartResult::Type type, const char* file, + int line, const char* message) + : data_(new AssertHelperData(type, file, line, message)) {} -AssertHelper::~AssertHelper() { - delete data_; -} +AssertHelper::~AssertHelper() { delete data_; } // Message assignment, for assertion streaming support. void AssertHelper::operator=(const Message& message) const { - UnitTest::GetInstance()-> - AddTestPartResult(data_->type, data_->file, data_->line, - AppendUserMessage(data_->message, message), - UnitTest::GetInstance()->impl() - ->CurrentOsStackTraceExceptTop(1) - // Skips the stack frame for this function itself. - ); // NOLINT + UnitTest::GetInstance()->AddTestPartResult( + data_->type, data_->file, data_->line, + AppendUserMessage(data_->message, message), + UnitTest::GetInstance()->impl()->CurrentOsStackTraceExceptTop(1) + // Skips the stack frame for this function itself. + ); // NOLINT } namespace { // When TEST_P is found without a matching INSTANTIATE_TEST_SUITE_P -// to creates test cases for it, a syntetic test case is +// to creates test cases for it, a synthetic test case is // inserted to report ether an error or a log message. // // This configuration bit will likely be removed at some point. @@ -452,7 +475,6 @@ class FailureTest : public Test { const bool as_error_; }; - } // namespace std::set* GetIgnoredParameterizedTestSuites() { @@ -496,7 +518,8 @@ void InsertSyntheticTestCase(const std::string& name, CodeLocation location, "To suppress this error for this test suite, insert the following line " "(in a non-header) in the namespace it is defined in:" "\n\n" - "GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(" + name + ");"; + "GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(" + + name + ");"; std::string full_name = "UninstantiatedParameterizedTestSuite<" + name + ">"; RegisterTest( // @@ -516,19 +539,18 @@ void RegisterTypeParameterizedTestSuite(const char* test_suite_name, } void RegisterTypeParameterizedTestSuiteInstantiation(const char* case_name) { - GetUnitTestImpl() - ->type_parameterized_test_registry() - .RegisterInstantiation(case_name); + GetUnitTestImpl()->type_parameterized_test_registry().RegisterInstantiation( + case_name); } void TypeParameterizedTestSuiteRegistry::RegisterTestSuite( const char* test_suite_name, CodeLocation code_location) { suites_.emplace(std::string(test_suite_name), - TypeParameterizedTestSuiteInfo(code_location)); + TypeParameterizedTestSuiteInfo(code_location)); } void TypeParameterizedTestSuiteRegistry::RegisterInstantiation( - const char* test_suite_name) { + const char* test_suite_name) { auto it = suites_.find(std::string(test_suite_name)); if (it != suites_.end()) { it->second.instantiated = true; @@ -606,7 +628,8 @@ FilePath GetCurrentExecutableName() { // Returns the output format, or "" for normal printed output. std::string UnitTestOptions::GetOutputFormat() { - const char* const gtest_output_flag = GTEST_FLAG(output).c_str(); + std::string s = GTEST_FLAG_GET(output); + const char* const gtest_output_flag = s.c_str(); const char* const colon = strchr(gtest_output_flag, ':'); return (colon == nullptr) ? std::string(gtest_output_flag) @@ -617,19 +640,19 @@ std::string UnitTestOptions::GetOutputFormat() { // Returns the name of the requested output file, or the default if none // was explicitly specified. std::string UnitTestOptions::GetAbsolutePathToOutputFile() { - const char* const gtest_output_flag = GTEST_FLAG(output).c_str(); + std::string s = GTEST_FLAG_GET(output); + const char* const gtest_output_flag = s.c_str(); std::string format = GetOutputFormat(); - if (format.empty()) - format = std::string(kDefaultOutputFormat); + if (format.empty()) format = std::string(kDefaultOutputFormat); const char* const colon = strchr(gtest_output_flag, ':'); if (colon == nullptr) return internal::FilePath::MakeFileName( - internal::FilePath( - UnitTest::GetInstance()->original_working_dir()), - internal::FilePath(kDefaultOutputFile), 0, - format.c_str()).string(); + internal::FilePath( + UnitTest::GetInstance()->original_working_dir()), + internal::FilePath(kDefaultOutputFile), 0, format.c_str()) + .string(); internal::FilePath output_name(colon + 1); if (!output_name.IsAbsolutePath()) @@ -637,8 +660,7 @@ std::string UnitTestOptions::GetAbsolutePathToOutputFile() { internal::FilePath(UnitTest::GetInstance()->original_working_dir()), internal::FilePath(colon + 1)); - if (!output_name.IsDirectory()) - return output_name.string(); + if (!output_name.IsDirectory()) return output_name.string(); internal::FilePath result(internal::FilePath::GenerateUniqueFileName( output_name, internal::GetCurrentExecutableName(), @@ -699,59 +721,119 @@ static bool PatternMatchesString(const std::string& name_str, return true; } -bool UnitTestOptions::MatchesFilter(const std::string& name_str, - const char* filter) { - // The filter is a list of patterns separated by colons (:). - const char* pattern = filter; - while (true) { - // Find the bounds of this pattern. - const char* const next_sep = strchr(pattern, ':'); - const char* const pattern_end = - next_sep != nullptr ? next_sep : pattern + strlen(pattern); - - // Check if this pattern matches name_str. - if (PatternMatchesString(name_str, pattern, pattern_end)) { - return true; - } +namespace { + +bool IsGlobPattern(const std::string& pattern) { + return std::any_of(pattern.begin(), pattern.end(), + [](const char c) { return c == '?' || c == '*'; }); +} + +class UnitTestFilter { + public: + UnitTestFilter() = default; + + // Constructs a filter from a string of patterns separated by `:`. + explicit UnitTestFilter(const std::string& filter) { + // By design "" filter matches "" string. + std::vector all_patterns; + SplitString(filter, ':', &all_patterns); + const auto exact_match_patterns_begin = std::partition( + all_patterns.begin(), all_patterns.end(), &IsGlobPattern); + + glob_patterns_.reserve(static_cast( + std::distance(all_patterns.begin(), exact_match_patterns_begin))); + std::move(all_patterns.begin(), exact_match_patterns_begin, + std::inserter(glob_patterns_, glob_patterns_.begin())); + std::move( + exact_match_patterns_begin, all_patterns.end(), + std::inserter(exact_match_patterns_, exact_match_patterns_.begin())); + } + + // Returns true if and only if name matches at least one of the patterns in + // the filter. + bool MatchesName(const std::string& name) const { + return exact_match_patterns_.count(name) > 0 || + std::any_of(glob_patterns_.begin(), glob_patterns_.end(), + [&name](const std::string& pattern) { + return PatternMatchesString( + name, pattern.c_str(), + pattern.c_str() + pattern.size()); + }); + } + + private: + std::vector glob_patterns_; + std::unordered_set exact_match_patterns_; +}; - // Give up on this pattern. However, if we found a pattern separator (:), - // advance to the next pattern (skipping over the separator) and restart. - if (next_sep == nullptr) { - return false; +class PositiveAndNegativeUnitTestFilter { + public: + // Constructs a positive and a negative filter from a string. The string + // contains a positive filter optionally followed by a '-' character and a + // negative filter. In case only a negative filter is provided the positive + // filter will be assumed "*". + // A filter is a list of patterns separated by ':'. + explicit PositiveAndNegativeUnitTestFilter(const std::string& filter) { + std::vector positive_and_negative_filters; + + // NOTE: `SplitString` always returns a non-empty container. + SplitString(filter, '-', &positive_and_negative_filters); + const auto& positive_filter = positive_and_negative_filters.front(); + + if (positive_and_negative_filters.size() > 1) { + positive_filter_ = UnitTestFilter( + positive_filter.empty() ? kUniversalFilter : positive_filter); + + // TODO(b/214626361): Fail on multiple '-' characters + // For the moment to preserve old behavior we concatenate the rest of the + // string parts with `-` as separator to generate the negative filter. + auto negative_filter_string = positive_and_negative_filters[1]; + for (std::size_t i = 2; i < positive_and_negative_filters.size(); i++) + negative_filter_string = + negative_filter_string + '-' + positive_and_negative_filters[i]; + negative_filter_ = UnitTestFilter(negative_filter_string); + } else { + // In case we don't have a negative filter and positive filter is "" + // we do not use kUniversalFilter by design as opposed to when we have a + // negative filter. + positive_filter_ = UnitTestFilter(positive_filter); } - pattern = next_sep + 1; } - return true; + + // Returns true if and only if test name (this is generated by appending test + // suit name and test name via a '.' character) matches the positive filter + // and does not match the negative filter. + bool MatchesTest(const std::string& test_suite_name, + const std::string& test_name) const { + return MatchesName(test_suite_name + "." + test_name); + } + + // Returns true if and only if name matches the positive filter and does not + // match the negative filter. + bool MatchesName(const std::string& name) const { + return positive_filter_.MatchesName(name) && + !negative_filter_.MatchesName(name); + } + + private: + UnitTestFilter positive_filter_; + UnitTestFilter negative_filter_; +}; +} // namespace + +bool UnitTestOptions::MatchesFilter(const std::string& name_str, + const char* filter) { + return UnitTestFilter(filter).MatchesName(name_str); } // Returns true if and only if the user-specified filter matches the test // suite name and the test name. bool UnitTestOptions::FilterMatchesTest(const std::string& test_suite_name, const std::string& test_name) { - const std::string& full_name = test_suite_name + "." + test_name.c_str(); - // Split --gtest_filter at '-', if there is one, to separate into // positive filter and negative filter portions - const char* const p = GTEST_FLAG(filter).c_str(); - const char* const dash = strchr(p, '-'); - std::string positive; - std::string negative; - if (dash == nullptr) { - positive = GTEST_FLAG(filter).c_str(); // Whole string is a positive filter - negative = ""; - } else { - positive = std::string(p, dash); // Everything up to the dash - negative = std::string(dash + 1); // Everything after the dash - if (positive.empty()) { - // Treat '-test1' as the same as '*-test1' - positive = kUniversalFilter; - } - } - - // A filter is a colon-separated list of patterns. It matches a - // test if any pattern in it matches the test. - return (MatchesFilter(full_name, positive.c_str()) && - !MatchesFilter(full_name, negative.c_str())); + return PositiveAndNegativeUnitTestFilter(GTEST_FLAG_GET(filter)) + .MatchesTest(test_suite_name, test_name); } #if GTEST_HAS_SEH @@ -771,7 +853,7 @@ int UnitTestOptions::GTestShouldProcessSEH(DWORD exception_code) { bool should_handle = true; - if (!GTEST_FLAG(catch_exceptions)) + if (!GTEST_FLAG_GET(catch_exceptions)) should_handle = false; else if (exception_code == EXCEPTION_BREAKPOINT) should_handle = false; @@ -789,8 +871,7 @@ int UnitTestOptions::GTestShouldProcessSEH(DWORD exception_code) { // results. Intercepts only failures from the current thread. ScopedFakeTestPartResultReporter::ScopedFakeTestPartResultReporter( TestPartResultArray* result) - : intercept_mode_(INTERCEPT_ONLY_CURRENT_THREAD), - result_(result) { + : intercept_mode_(INTERCEPT_ONLY_CURRENT_THREAD), result_(result) { Init(); } @@ -799,8 +880,7 @@ ScopedFakeTestPartResultReporter::ScopedFakeTestPartResultReporter( // results. ScopedFakeTestPartResultReporter::ScopedFakeTestPartResultReporter( InterceptMode intercept_mode, TestPartResultArray* result) - : intercept_mode_(intercept_mode), - result_(result) { + : intercept_mode_(intercept_mode), result_(result) { Init(); } @@ -844,9 +924,7 @@ namespace internal { // from user test code. GetTestTypeId() is guaranteed to always // return the same value, as it always calls GetTypeId<>() from the // gtest.cc, which is within the Google Test framework. -TypeId GetTestTypeId() { - return GetTypeId(); -} +TypeId GetTestTypeId() { return GetTypeId(); } // The value of GetTestTypeId() as seen from within the Google Test // library. This is solely for testing GetTestTypeId(). @@ -861,9 +939,9 @@ static AssertionResult HasOneFailure(const char* /* results_expr */, const TestPartResultArray& results, TestPartResult::Type type, const std::string& substr) { - const std::string expected(type == TestPartResult::kFatalFailure ? - "1 fatal failure" : - "1 non-fatal failure"); + const std::string expected(type == TestPartResult::kFatalFailure + ? "1 fatal failure" + : "1 non-fatal failure"); Message msg; if (results.size() != 1) { msg << "Expected: " << expected << "\n" @@ -882,10 +960,10 @@ static AssertionResult HasOneFailure(const char* /* results_expr */, } if (strstr(r.message(), substr.c_str()) == nullptr) { - return AssertionFailure() << "Expected: " << expected << " containing \"" - << substr << "\"\n" - << " Actual:\n" - << r; + return AssertionFailure() + << "Expected: " << expected << " containing \"" << substr << "\"\n" + << " Actual:\n" + << r; } return AssertionSuccess(); @@ -908,7 +986,8 @@ SingleFailureChecker::~SingleFailureChecker() { } DefaultGlobalTestPartResultReporter::DefaultGlobalTestPartResultReporter( - UnitTestImpl* unit_test) : unit_test_(unit_test) {} + UnitTestImpl* unit_test) + : unit_test_(unit_test) {} void DefaultGlobalTestPartResultReporter::ReportTestPartResult( const TestPartResult& result) { @@ -917,7 +996,8 @@ void DefaultGlobalTestPartResultReporter::ReportTestPartResult( } DefaultPerThreadTestPartResultReporter::DefaultPerThreadTestPartResultReporter( - UnitTestImpl* unit_test) : unit_test_(unit_test) {} + UnitTestImpl* unit_test) + : unit_test_(unit_test) {} void DefaultPerThreadTestPartResultReporter::ReportTestPartResult( const TestPartResult& result) { @@ -1024,11 +1104,10 @@ int UnitTestImpl::test_to_run_count() const { // trace but Bar() and CurrentOsStackTraceExceptTop() won't. std::string UnitTestImpl::CurrentOsStackTraceExceptTop(int skip_count) { return os_stack_trace_getter()->CurrentStackTrace( - static_cast(GTEST_FLAG(stack_trace_depth)), - skip_count + 1 + static_cast(GTEST_FLAG_GET(stack_trace_depth)), skip_count + 1 // Skips the user-specified number of frames plus this function // itself. - ); // NOLINT + ); // NOLINT } // A helper class for measuring elapsed times. @@ -1072,8 +1151,7 @@ LPCWSTR String::AnsiToUtf16(const char* ansi) { const int unicode_length = MultiByteToWideChar(CP_ACP, 0, ansi, length, nullptr, 0); WCHAR* unicode = new WCHAR[unicode_length + 1]; - MultiByteToWideChar(CP_ACP, 0, ansi, length, - unicode, unicode_length); + MultiByteToWideChar(CP_ACP, 0, ansi, length, unicode, unicode_length); unicode[unicode_length] = 0; return unicode; } @@ -1082,7 +1160,7 @@ LPCWSTR String::AnsiToUtf16(const char* ansi) { // memory using new. The caller is responsible for deleting the return // value using delete[]. Returns the ANSI string, or NULL if the // input is NULL. -const char* String::Utf16ToAnsi(LPCWSTR utf16_str) { +const char* String::Utf16ToAnsi(LPCWSTR utf16_str) { if (!utf16_str) return nullptr; const int ansi_length = WideCharToMultiByte(CP_ACP, 0, utf16_str, -1, nullptr, 0, nullptr, nullptr); @@ -1101,7 +1179,7 @@ const char* String::Utf16ToAnsi(LPCWSTR utf16_str) { // Unlike strcmp(), this function can handle NULL argument(s). A NULL // C string is considered different to any non-NULL C string, // including the empty string. -bool String::CStringEquals(const char * lhs, const char * rhs) { +bool String::CStringEquals(const char* lhs, const char* rhs) { if (lhs == nullptr) return rhs == nullptr; if (rhs == nullptr) return false; @@ -1115,11 +1193,10 @@ bool String::CStringEquals(const char * lhs, const char * rhs) { // encoding, and streams the result to the given Message object. static void StreamWideCharsToMessage(const wchar_t* wstr, size_t length, Message* msg) { - for (size_t i = 0; i != length; ) { // NOLINT + for (size_t i = 0; i != length;) { // NOLINT if (wstr[i] != L'\0') { *msg << WideStringToUtf8(wstr + i, static_cast(length - i)); - while (i != length && wstr[i] != L'\0') - i++; + while (i != length && wstr[i] != L'\0') i++; } else { *msg << '\0'; i++; @@ -1161,17 +1238,17 @@ Message::Message() : ss_(new ::std::stringstream) { // These two overloads allow streaming a wide C string to a Message // using the UTF-8 encoding. -Message& Message::operator <<(const wchar_t* wide_c_str) { +Message& Message::operator<<(const wchar_t* wide_c_str) { return *this << internal::String::ShowWideCString(wide_c_str); } -Message& Message::operator <<(wchar_t* wide_c_str) { +Message& Message::operator<<(wchar_t* wide_c_str) { return *this << internal::String::ShowWideCString(wide_c_str); } #if GTEST_HAS_STD_WSTRING // Converts the given wide string to a narrow string using the UTF-8 // encoding, and streams the result to this Message object. -Message& Message::operator <<(const ::std::wstring& wstr) { +Message& Message::operator<<(const ::std::wstring& wstr) { internal::StreamWideCharsToMessage(wstr.c_str(), wstr.length(), this); return *this; } @@ -1183,44 +1260,6 @@ std::string Message::GetString() const { return internal::StringStreamToString(ss_.get()); } -// AssertionResult constructors. -// Used in EXPECT_TRUE/FALSE(assertion_result). -AssertionResult::AssertionResult(const AssertionResult& other) - : success_(other.success_), - message_(other.message_.get() != nullptr - ? new ::std::string(*other.message_) - : static_cast< ::std::string*>(nullptr)) {} - -// Swaps two AssertionResults. -void AssertionResult::swap(AssertionResult& other) { - using std::swap; - swap(success_, other.success_); - swap(message_, other.message_); -} - -// Returns the assertion's negation. Used with EXPECT/ASSERT_FALSE. -AssertionResult AssertionResult::operator!() const { - AssertionResult negation(!success_); - if (message_.get() != nullptr) negation << *message_; - return negation; -} - -// Makes a successful assertion result. -AssertionResult AssertionSuccess() { - return AssertionResult(true); -} - -// Makes a failed assertion result. -AssertionResult AssertionFailure() { - return AssertionResult(false); -} - -// Makes a failed assertion result with the given failure message. -// Deprecated; use AssertionFailure() << message. -AssertionResult AssertionFailure(const Message& message) { - return AssertionFailure() << message; -} - namespace internal { namespace edit_distance { @@ -1512,8 +1551,7 @@ std::vector SplitEscapedString(const std::string& str) { AssertionResult EqFailure(const char* lhs_expression, const char* rhs_expression, const std::string& lhs_value, - const std::string& rhs_value, - bool ignoring_case) { + const std::string& rhs_value, bool ignoring_case) { Message msg; msg << "Expected equality of these values:"; msg << "\n " << lhs_expression; @@ -1530,10 +1568,8 @@ AssertionResult EqFailure(const char* lhs_expression, } if (!lhs_value.empty() && !rhs_value.empty()) { - const std::vector lhs_lines = - SplitEscapedString(lhs_value); - const std::vector rhs_lines = - SplitEscapedString(rhs_value); + const std::vector lhs_lines = SplitEscapedString(lhs_value); + const std::vector rhs_lines = SplitEscapedString(rhs_value); if (lhs_lines.size() > 1 || rhs_lines.size() > 1) { msg << "\nWith diff:\n" << edit_distance::CreateUnifiedDiff(lhs_lines, rhs_lines); @@ -1545,27 +1581,21 @@ AssertionResult EqFailure(const char* lhs_expression, // Constructs a failure message for Boolean assertions such as EXPECT_TRUE. std::string GetBoolAssertionFailureMessage( - const AssertionResult& assertion_result, - const char* expression_text, - const char* actual_predicate_value, - const char* expected_predicate_value) { + const AssertionResult& assertion_result, const char* expression_text, + const char* actual_predicate_value, const char* expected_predicate_value) { const char* actual_message = assertion_result.message(); Message msg; msg << "Value of: " << expression_text << "\n Actual: " << actual_predicate_value; - if (actual_message[0] != '\0') - msg << " (" << actual_message << ")"; + if (actual_message[0] != '\0') msg << " (" << actual_message << ")"; msg << "\nExpected: " << expected_predicate_value; return msg.GetString(); } // Helper function for implementing ASSERT_NEAR. -AssertionResult DoubleNearPredFormat(const char* expr1, - const char* expr2, - const char* abs_error_expr, - double val1, - double val2, - double abs_error) { +AssertionResult DoubleNearPredFormat(const char* expr1, const char* expr2, + const char* abs_error_expr, double val1, + double val2, double abs_error) { const double diff = fabs(val1 - val2); if (diff <= abs_error) return AssertionSuccess(); @@ -1595,20 +1625,17 @@ AssertionResult DoubleNearPredFormat(const char* expr1, "EXPECT_EQUAL. Consider using EXPECT_DOUBLE_EQ instead."; } return AssertionFailure() - << "The difference between " << expr1 << " and " << expr2 - << " is " << diff << ", which exceeds " << abs_error_expr << ", where\n" - << expr1 << " evaluates to " << val1 << ",\n" - << expr2 << " evaluates to " << val2 << ", and\n" - << abs_error_expr << " evaluates to " << abs_error << "."; + << "The difference between " << expr1 << " and " << expr2 << " is " + << diff << ", which exceeds " << abs_error_expr << ", where\n" + << expr1 << " evaluates to " << val1 << ",\n" + << expr2 << " evaluates to " << val2 << ", and\n" + << abs_error_expr << " evaluates to " << abs_error << "."; } - // Helper template for implementing FloatLE() and DoubleLE(). template -AssertionResult FloatingPointLE(const char* expr1, - const char* expr2, - RawType val1, - RawType val2) { +AssertionResult FloatingPointLE(const char* expr1, const char* expr2, + RawType val1, RawType val2) { // Returns success if val1 is less than val2, if (val1 < val2) { return AssertionSuccess(); @@ -1633,24 +1660,24 @@ AssertionResult FloatingPointLE(const char* expr1, << val2; return AssertionFailure() - << "Expected: (" << expr1 << ") <= (" << expr2 << ")\n" - << " Actual: " << StringStreamToString(&val1_ss) << " vs " - << StringStreamToString(&val2_ss); + << "Expected: (" << expr1 << ") <= (" << expr2 << ")\n" + << " Actual: " << StringStreamToString(&val1_ss) << " vs " + << StringStreamToString(&val2_ss); } } // namespace internal // Asserts that val1 is less than, or almost equal to, val2. Fails // otherwise. In particular, it fails if either val1 or val2 is NaN. -AssertionResult FloatLE(const char* expr1, const char* expr2, - float val1, float val2) { +AssertionResult FloatLE(const char* expr1, const char* expr2, float val1, + float val2) { return internal::FloatingPointLE(expr1, expr2, val1, val2); } // Asserts that val1 is less than, or almost equal to, val2. Fails // otherwise. In particular, it fails if either val1 or val2 is NaN. -AssertionResult DoubleLE(const char* expr1, const char* expr2, - double val1, double val2) { +AssertionResult DoubleLE(const char* expr1, const char* expr2, double val1, + double val2) { return internal::FloatingPointLE(expr1, expr2, val1, val2); } @@ -1658,62 +1685,51 @@ namespace internal { // The helper function for {ASSERT|EXPECT}_STREQ. AssertionResult CmpHelperSTREQ(const char* lhs_expression, - const char* rhs_expression, - const char* lhs, + const char* rhs_expression, const char* lhs, const char* rhs) { if (String::CStringEquals(lhs, rhs)) { return AssertionSuccess(); } - return EqFailure(lhs_expression, - rhs_expression, - PrintToString(lhs), - PrintToString(rhs), - false); + return EqFailure(lhs_expression, rhs_expression, PrintToString(lhs), + PrintToString(rhs), false); } // The helper function for {ASSERT|EXPECT}_STRCASEEQ. AssertionResult CmpHelperSTRCASEEQ(const char* lhs_expression, - const char* rhs_expression, - const char* lhs, + const char* rhs_expression, const char* lhs, const char* rhs) { if (String::CaseInsensitiveCStringEquals(lhs, rhs)) { return AssertionSuccess(); } - return EqFailure(lhs_expression, - rhs_expression, - PrintToString(lhs), - PrintToString(rhs), - true); + return EqFailure(lhs_expression, rhs_expression, PrintToString(lhs), + PrintToString(rhs), true); } // The helper function for {ASSERT|EXPECT}_STRNE. AssertionResult CmpHelperSTRNE(const char* s1_expression, - const char* s2_expression, - const char* s1, + const char* s2_expression, const char* s1, const char* s2) { if (!String::CStringEquals(s1, s2)) { return AssertionSuccess(); } else { - return AssertionFailure() << "Expected: (" << s1_expression << ") != (" - << s2_expression << "), actual: \"" - << s1 << "\" vs \"" << s2 << "\""; + return AssertionFailure() + << "Expected: (" << s1_expression << ") != (" << s2_expression + << "), actual: \"" << s1 << "\" vs \"" << s2 << "\""; } } // The helper function for {ASSERT|EXPECT}_STRCASENE. AssertionResult CmpHelperSTRCASENE(const char* s1_expression, - const char* s2_expression, - const char* s1, + const char* s2_expression, const char* s1, const char* s2) { if (!String::CaseInsensitiveCStringEquals(s1, s2)) { return AssertionSuccess(); } else { return AssertionFailure() - << "Expected: (" << s1_expression << ") != (" - << s2_expression << ") (ignoring case), actual: \"" - << s1 << "\" vs \"" << s2 << "\""; + << "Expected: (" << s1_expression << ") != (" << s2_expression + << ") (ignoring case), actual: \"" << s1 << "\" vs \"" << s2 << "\""; } } @@ -1741,8 +1757,7 @@ bool IsSubstringPred(const wchar_t* needle, const wchar_t* haystack) { // StringType here can be either ::std::string or ::std::wstring. template -bool IsSubstringPred(const StringType& needle, - const StringType& haystack) { +bool IsSubstringPred(const StringType& needle, const StringType& haystack) { return haystack.find(needle) != StringType::npos; } @@ -1751,21 +1766,22 @@ bool IsSubstringPred(const StringType& needle, // StringType here can be const char*, const wchar_t*, ::std::string, // or ::std::wstring. template -AssertionResult IsSubstringImpl( - bool expected_to_be_substring, - const char* needle_expr, const char* haystack_expr, - const StringType& needle, const StringType& haystack) { +AssertionResult IsSubstringImpl(bool expected_to_be_substring, + const char* needle_expr, + const char* haystack_expr, + const StringType& needle, + const StringType& haystack) { if (IsSubstringPred(needle, haystack) == expected_to_be_substring) return AssertionSuccess(); const bool is_wide_string = sizeof(needle[0]) > 1; const char* const begin_string_quote = is_wide_string ? "L\"" : "\""; return AssertionFailure() - << "Value of: " << needle_expr << "\n" - << " Actual: " << begin_string_quote << needle << "\"\n" - << "Expected: " << (expected_to_be_substring ? "" : "not ") - << "a substring of " << haystack_expr << "\n" - << "Which is: " << begin_string_quote << haystack << "\""; + << "Value of: " << needle_expr << "\n" + << " Actual: " << begin_string_quote << needle << "\"\n" + << "Expected: " << (expected_to_be_substring ? "" : "not ") + << "a substring of " << haystack_expr << "\n" + << "Which is: " << begin_string_quote << haystack << "\""; } } // namespace @@ -1774,52 +1790,52 @@ AssertionResult IsSubstringImpl( // substring of haystack (NULL is considered a substring of itself // only), and return an appropriate error message when they fail. -AssertionResult IsSubstring( - const char* needle_expr, const char* haystack_expr, - const char* needle, const char* haystack) { +AssertionResult IsSubstring(const char* needle_expr, const char* haystack_expr, + const char* needle, const char* haystack) { return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack); } -AssertionResult IsSubstring( - const char* needle_expr, const char* haystack_expr, - const wchar_t* needle, const wchar_t* haystack) { +AssertionResult IsSubstring(const char* needle_expr, const char* haystack_expr, + const wchar_t* needle, const wchar_t* haystack) { return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack); } -AssertionResult IsNotSubstring( - const char* needle_expr, const char* haystack_expr, - const char* needle, const char* haystack) { +AssertionResult IsNotSubstring(const char* needle_expr, + const char* haystack_expr, const char* needle, + const char* haystack) { return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack); } -AssertionResult IsNotSubstring( - const char* needle_expr, const char* haystack_expr, - const wchar_t* needle, const wchar_t* haystack) { +AssertionResult IsNotSubstring(const char* needle_expr, + const char* haystack_expr, const wchar_t* needle, + const wchar_t* haystack) { return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack); } -AssertionResult IsSubstring( - const char* needle_expr, const char* haystack_expr, - const ::std::string& needle, const ::std::string& haystack) { +AssertionResult IsSubstring(const char* needle_expr, const char* haystack_expr, + const ::std::string& needle, + const ::std::string& haystack) { return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack); } -AssertionResult IsNotSubstring( - const char* needle_expr, const char* haystack_expr, - const ::std::string& needle, const ::std::string& haystack) { +AssertionResult IsNotSubstring(const char* needle_expr, + const char* haystack_expr, + const ::std::string& needle, + const ::std::string& haystack) { return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack); } #if GTEST_HAS_STD_WSTRING -AssertionResult IsSubstring( - const char* needle_expr, const char* haystack_expr, - const ::std::wstring& needle, const ::std::wstring& haystack) { +AssertionResult IsSubstring(const char* needle_expr, const char* haystack_expr, + const ::std::wstring& needle, + const ::std::wstring& haystack) { return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack); } -AssertionResult IsNotSubstring( - const char* needle_expr, const char* haystack_expr, - const ::std::wstring& needle, const ::std::wstring& haystack) { +AssertionResult IsNotSubstring(const char* needle_expr, + const char* haystack_expr, + const ::std::wstring& needle, + const ::std::wstring& haystack) { return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack); } #endif // GTEST_HAS_STD_WSTRING @@ -1831,43 +1847,42 @@ namespace internal { namespace { // Helper function for IsHRESULT{SuccessFailure} predicates -AssertionResult HRESULTFailureHelper(const char* expr, - const char* expected, +AssertionResult HRESULTFailureHelper(const char* expr, const char* expected, long hr) { // NOLINT -# if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_TV_TITLE +#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_TV_TITLE // Windows CE doesn't support FormatMessage. const char error_text[] = ""; -# else +#else // Looks up the human-readable system message for the HRESULT code // and since we're not passing any params to FormatMessage, we don't // want inserts expanded. - const DWORD kFlags = FORMAT_MESSAGE_FROM_SYSTEM | - FORMAT_MESSAGE_IGNORE_INSERTS; + const DWORD kFlags = + FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS; const DWORD kBufSize = 4096; // Gets the system's human readable message string for this HRESULT. - char error_text[kBufSize] = { '\0' }; + char error_text[kBufSize] = {'\0'}; DWORD message_length = ::FormatMessageA(kFlags, - 0, // no source, we're asking system + 0, // no source, we're asking system static_cast(hr), // the error - 0, // no line width restrictions + 0, // no line width restrictions error_text, // output buffer kBufSize, // buf size nullptr); // no arguments for inserts // Trims tailing white space (FormatMessage leaves a trailing CR-LF) for (; message_length && IsSpace(error_text[message_length - 1]); - --message_length) { + --message_length) { error_text[message_length - 1] = '\0'; } -# endif // GTEST_OS_WINDOWS_MOBILE +#endif // GTEST_OS_WINDOWS_MOBILE const std::string error_hex("0x" + String::FormatHexInt(hr)); return ::testing::AssertionFailure() - << "Expected: " << expr << " " << expected << ".\n" - << " Actual: " << error_hex << " " << error_text << "\n"; + << "Expected: " << expr << " " << expected << ".\n" + << " Actual: " << error_hex << " " << error_text << "\n"; } } // namespace @@ -1901,16 +1916,18 @@ AssertionResult IsHRESULTFailure(const char* expr, long hr) { // NOLINT // 17 - 21 bits 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx // The maximum code-point a one-byte UTF-8 sequence can represent. -constexpr uint32_t kMaxCodePoint1 = (static_cast(1) << 7) - 1; +constexpr uint32_t kMaxCodePoint1 = (static_cast(1) << 7) - 1; // The maximum code-point a two-byte UTF-8 sequence can represent. constexpr uint32_t kMaxCodePoint2 = (static_cast(1) << (5 + 6)) - 1; // The maximum code-point a three-byte UTF-8 sequence can represent. -constexpr uint32_t kMaxCodePoint3 = (static_cast(1) << (4 + 2*6)) - 1; +constexpr uint32_t kMaxCodePoint3 = + (static_cast(1) << (4 + 2 * 6)) - 1; // The maximum code-point a four-byte UTF-8 sequence can represent. -constexpr uint32_t kMaxCodePoint4 = (static_cast(1) << (3 + 3*6)) - 1; +constexpr uint32_t kMaxCodePoint4 = + (static_cast(1) << (3 + 3 * 6)) - 1; // Chops off the n lowest bits from a bit pattern. Returns the n // lowest bits. As a side effect, the original bit pattern will be @@ -1935,7 +1952,7 @@ std::string CodePointToUtf8(uint32_t code_point) { char str[5]; // Big enough for the largest valid code point. if (code_point <= kMaxCodePoint1) { str[1] = '\0'; - str[0] = static_cast(code_point); // 0xxxxxxx + str[0] = static_cast(code_point); // 0xxxxxxx } else if (code_point <= kMaxCodePoint2) { str[2] = '\0'; str[1] = static_cast(0x80 | ChopLowBits(&code_point, 6)); // 10xxxxxx @@ -1963,8 +1980,8 @@ std::string CodePointToUtf8(uint32_t code_point) { // and thus should be combined into a single Unicode code point // using CreateCodePointFromUtf16SurrogatePair. inline bool IsUtf16SurrogatePair(wchar_t first, wchar_t second) { - return sizeof(wchar_t) == 2 && - (first & 0xFC00) == 0xD800 && (second & 0xFC00) == 0xDC00; + return sizeof(wchar_t) == 2 && (first & 0xFC00) == 0xD800 && + (second & 0xFC00) == 0xDC00; } // Creates a Unicode code point from UTF16 surrogate pair. @@ -1995,8 +2012,7 @@ inline uint32_t CreateCodePointFromUtf16SurrogatePair(wchar_t first, // and contains invalid UTF-16 surrogate pairs, values in those pairs // will be encoded as individual Unicode characters from Basic Normal Plane. std::string WideStringToUtf8(const wchar_t* str, int num_chars) { - if (num_chars == -1) - num_chars = static_cast(wcslen(str)); + if (num_chars == -1) num_chars = static_cast(wcslen(str)); ::std::stringstream stream; for (int i = 0; i < num_chars; ++i) { @@ -2005,8 +2021,8 @@ std::string WideStringToUtf8(const wchar_t* str, int num_chars) { if (str[i] == L'\0') { break; } else if (i + 1 < num_chars && IsUtf16SurrogatePair(str[i], str[i + 1])) { - unicode_code_point = CreateCodePointFromUtf16SurrogatePair(str[i], - str[i + 1]); + unicode_code_point = + CreateCodePointFromUtf16SurrogatePair(str[i], str[i + 1]); i++; } else { unicode_code_point = static_cast(str[i]); @@ -2019,7 +2035,7 @@ std::string WideStringToUtf8(const wchar_t* str, int num_chars) { // Converts a wide C string to an std::string using the UTF-8 encoding. // NULL will be converted to "(null)". -std::string String::ShowWideCString(const wchar_t * wide_c_str) { +std::string String::ShowWideCString(const wchar_t* wide_c_str) { if (wide_c_str == nullptr) return "(null)"; return internal::WideStringToUtf8(wide_c_str, -1); @@ -2031,7 +2047,7 @@ std::string String::ShowWideCString(const wchar_t * wide_c_str) { // Unlike wcscmp(), this function can handle NULL argument(s). A NULL // C string is considered different to any non-NULL C string, // including the empty string. -bool String::WideCStringEquals(const wchar_t * lhs, const wchar_t * rhs) { +bool String::WideCStringEquals(const wchar_t* lhs, const wchar_t* rhs) { if (lhs == nullptr) return rhs == nullptr; if (rhs == nullptr) return false; @@ -2041,33 +2057,27 @@ bool String::WideCStringEquals(const wchar_t * lhs, const wchar_t * rhs) { // Helper function for *_STREQ on wide strings. AssertionResult CmpHelperSTREQ(const char* lhs_expression, - const char* rhs_expression, - const wchar_t* lhs, + const char* rhs_expression, const wchar_t* lhs, const wchar_t* rhs) { if (String::WideCStringEquals(lhs, rhs)) { return AssertionSuccess(); } - return EqFailure(lhs_expression, - rhs_expression, - PrintToString(lhs), - PrintToString(rhs), - false); + return EqFailure(lhs_expression, rhs_expression, PrintToString(lhs), + PrintToString(rhs), false); } // Helper function for *_STRNE on wide strings. AssertionResult CmpHelperSTRNE(const char* s1_expression, - const char* s2_expression, - const wchar_t* s1, + const char* s2_expression, const wchar_t* s1, const wchar_t* s2) { if (!String::WideCStringEquals(s1, s2)) { return AssertionSuccess(); } - return AssertionFailure() << "Expected: (" << s1_expression << ") != (" - << s2_expression << "), actual: " - << PrintToString(s1) - << " vs " << PrintToString(s2); + return AssertionFailure() + << "Expected: (" << s1_expression << ") != (" << s2_expression + << "), actual: " << PrintToString(s1) << " vs " << PrintToString(s2); } // Compares two C strings, ignoring case. Returns true if and only if they have @@ -2076,7 +2086,7 @@ AssertionResult CmpHelperSTRNE(const char* s1_expression, // Unlike strcasecmp(), this function can handle NULL argument(s). A // NULL C string is considered different to any non-NULL C string, // including the empty string. -bool String::CaseInsensitiveCStringEquals(const char * lhs, const char * rhs) { +bool String::CaseInsensitiveCStringEquals(const char* lhs, const char* rhs) { if (lhs == nullptr) return rhs == nullptr; if (rhs == nullptr) return false; return posix::StrCaseCmp(lhs, rhs) == 0; @@ -2118,8 +2128,8 @@ bool String::CaseInsensitiveWideCStringEquals(const wchar_t* lhs, // Returns true if and only if str ends with the given suffix, ignoring case. // Any string is considered to end with an empty suffix. -bool String::EndsWithCaseInsensitive( - const std::string& str, const std::string& suffix) { +bool String::EndsWithCaseInsensitive(const std::string& str, + const std::string& suffix) { const size_t str_len = str.length(); const size_t suffix_len = suffix.length(); return (str_len >= suffix_len) && @@ -2202,15 +2212,13 @@ TestResult::TestResult() : death_test_count_(0), start_timestamp_(0), elapsed_time_(0) {} // D'tor. -TestResult::~TestResult() { -} +TestResult::~TestResult() {} // Returns the i-th test part result among all the results. i can // range from 0 to total_part_count() - 1. If i is not in that range, // aborts the program. const TestPartResult& TestResult::GetTestPartResult(int i) const { - if (i < 0 || i >= total_part_count()) - internal::posix::Abort(); + if (i < 0 || i >= total_part_count()) internal::posix::Abort(); return test_part_results_.at(static_cast(i)); } @@ -2218,15 +2226,12 @@ const TestPartResult& TestResult::GetTestPartResult(int i) const { // test_property_count() - 1. If i is not in that range, aborts the // program. const TestProperty& TestResult::GetTestProperty(int i) const { - if (i < 0 || i >= test_property_count()) - internal::posix::Abort(); + if (i < 0 || i >= test_property_count()) internal::posix::Abort(); return test_properties_.at(static_cast(i)); } // Clears the test part results. -void TestResult::ClearTestPartResults() { - test_part_results_.clear(); -} +void TestResult::ClearTestPartResults() { test_part_results_.clear(); } // Adds a test part result to the list. void TestResult::AddTestPartResult(const TestPartResult& test_part_result) { @@ -2255,15 +2260,8 @@ void TestResult::RecordProperty(const std::string& xml_element, // The list of reserved attributes used in the element of XML // output. static const char* const kReservedTestSuitesAttributes[] = { - "disabled", - "errors", - "failures", - "name", - "random_seed", - "tests", - "time", - "timestamp" -}; + "disabled", "errors", "failures", "name", + "random_seed", "tests", "time", "timestamp"}; // The list of reserved attributes used in the element of XML // output. @@ -2273,8 +2271,8 @@ static const char* const kReservedTestSuiteAttributes[] = { // The list of reserved attributes used in the element of XML output. static const char* const kReservedTestCaseAttributes[] = { - "classname", "name", "status", "time", "type_param", - "value_param", "file", "line"}; + "classname", "name", "status", "time", + "type_param", "value_param", "file", "line"}; // Use a slightly different set for allowed output to ensure existing tests can // still RecordProperty("result") or "RecordProperty(timestamp") @@ -2336,7 +2334,7 @@ static bool ValidateTestPropertyName( const std::string& property_name, const std::vector& reserved_names) { if (std::find(reserved_names.begin(), reserved_names.end(), property_name) != - reserved_names.end()) { + reserved_names.end()) { ADD_FAILURE() << "Reserved key used in RecordProperty(): " << property_name << " (" << FormatWordList(reserved_names) << " are reserved by " << GTEST_NAME_ << ")"; @@ -2374,8 +2372,7 @@ bool TestResult::Skipped() const { // Returns true if and only if the test failed. bool TestResult::Failed() const { for (int i = 0; i < total_part_count(); ++i) { - if (GetTestPartResult(i).failed()) - return true; + if (GetTestPartResult(i).failed()) return true; } return false; } @@ -2416,27 +2413,22 @@ int TestResult::test_property_count() const { // Creates a Test object. // The c'tor saves the states of all flags. -Test::Test() - : gtest_flag_saver_(new GTEST_FLAG_SAVER_) { -} +Test::Test() : gtest_flag_saver_(new GTEST_FLAG_SAVER_) {} // The d'tor restores the states of all flags. The actual work is // done by the d'tor of the gtest_flag_saver_ field, and thus not // visible here. -Test::~Test() { -} +Test::~Test() {} // Sets up the test fixture. // // A sub-class may override this. -void Test::SetUp() { -} +void Test::SetUp() {} // Tears down the test fixture. // // A sub-class may override this. -void Test::TearDown() { -} +void Test::TearDown() {} // Allows user supplied key value pairs to be recorded for later output. void Test::RecordProperty(const std::string& key, const std::string& value) { @@ -2541,8 +2533,8 @@ bool Test::HasSameFixtureClass() { static std::string* FormatSehExceptionMessage(DWORD exception_code, const char* location) { Message message; - message << "SEH exception with code 0x" << std::setbase(16) << - exception_code << std::setbase(10) << " thrown in " << location << "."; + message << "SEH exception with code 0x" << std::setbase(16) << exception_code + << std::setbase(10) << " thrown in " << location << "."; return new std::string(message.GetString()); } @@ -2585,8 +2577,8 @@ GoogleTestFailureException::GoogleTestFailureException( // exceptions in the same function. Therefore, we provide a separate // wrapper function for handling SEH exceptions.) template -Result HandleSehExceptionsInMethodIfSupported( - T* object, Result (T::*method)(), const char* location) { +Result HandleSehExceptionsInMethodIfSupported(T* object, Result (T::*method)(), + const char* location) { #if GTEST_HAS_SEH __try { return (object->*method)(); @@ -2595,8 +2587,8 @@ Result HandleSehExceptionsInMethodIfSupported( // We create the exception message on the heap because VC++ prohibits // creation of objects with destructors on stack in functions using __try // (see error C2712). - std::string* exception_message = FormatSehExceptionMessage( - GetExceptionCode(), location); + std::string* exception_message = + FormatSehExceptionMessage(GetExceptionCode(), location); internal::ReportFailureInUnknownLocation(TestPartResult::kFatalFailure, *exception_message); delete exception_message; @@ -2612,8 +2604,8 @@ Result HandleSehExceptionsInMethodIfSupported( // exceptions, if they are supported; returns the 0-value for type // Result in case of an SEH exception. template -Result HandleExceptionsInMethodIfSupported( - T* object, Result (T::*method)(), const char* location) { +Result HandleExceptionsInMethodIfSupported(T* object, Result (T::*method)(), + const char* location) { // NOTE: The user code can affect the way in which Google Test handles // exceptions by setting GTEST_FLAG(catch_exceptions), but only before // RUN_ALL_TESTS() starts. It is technically possible to check the flag @@ -2623,7 +2615,7 @@ Result HandleExceptionsInMethodIfSupported( // try { // // Perform the test method. // } catch (...) { - // if (GTEST_FLAG(catch_exceptions)) + // if (GTEST_FLAG_GET(catch_exceptions)) // // Report the exception as failure. // else // throw; // Re-throws the original exception. @@ -2679,16 +2671,16 @@ void Test::Run() { // GTEST_SKIP(). if (!HasFatalFailure() && !IsSkipped()) { impl->os_stack_trace_getter()->UponLeavingGTest(); - internal::HandleExceptionsInMethodIfSupported( - this, &Test::TestBody, "the test body"); + internal::HandleExceptionsInMethodIfSupported(this, &Test::TestBody, + "the test body"); } // However, we want to clean up as much as possible. Hence we will // always call TearDown(), even if SetUp() or the test body has // failed. impl->os_stack_trace_getter()->UponLeavingGTest(); - internal::HandleExceptionsInMethodIfSupported( - this, &Test::TearDown, "TearDown()"); + internal::HandleExceptionsInMethodIfSupported(this, &Test::TearDown, + "TearDown()"); } // Returns true if and only if the current test has a fatal failure. @@ -2698,8 +2690,9 @@ bool Test::HasFatalFailure() { // Returns true if and only if the current test has a non-fatal failure. bool Test::HasNonfatalFailure() { - return internal::GetUnitTestImpl()->current_test_result()-> - HasNonfatalFailure(); + return internal::GetUnitTestImpl() + ->current_test_result() + ->HasNonfatalFailure(); } // Returns true if and only if the current test was skipped. @@ -2799,11 +2792,10 @@ class TestNameIs { // Constructor. // // TestNameIs has NO default constructor. - explicit TestNameIs(const char* name) - : name_(name) {} + explicit TestNameIs(const char* name) : name_(name) {} // Returns true if and only if the test name of test_info matches name_. - bool operator()(const TestInfo * test_info) const { + bool operator()(const TestInfo* test_info) const { return test_info && test_info->name() == name_; } @@ -2831,20 +2823,20 @@ void UnitTestImpl::RegisterParameterizedTests() { // Creates the test object, runs it, records its result, and then // deletes it. void TestInfo::Run() { - if (!should_run_) return; + TestEventListener* repeater = UnitTest::GetInstance()->listeners().repeater(); + if (!should_run_) { + if (is_disabled_ && matches_filter_) repeater->OnTestDisabled(*this); + return; + } // Tells UnitTest where to store test result. internal::UnitTestImpl* const impl = internal::GetUnitTestImpl(); impl->set_current_test_info(this); - TestEventListener* repeater = UnitTest::GetInstance()->listeners().repeater(); - // Notifies the unit test event listeners that a test is about to start. repeater->OnTestStart(*this); - result_.set_start_timestamp(internal::GetTimeInMillis()); internal::Timer timer; - impl->os_stack_trace_getter()->UponLeavingGTest(); // Creates the test object. @@ -3009,11 +3001,18 @@ void TestSuite::Run() { internal::HandleExceptionsInMethodIfSupported( this, &TestSuite::RunSetUpTestSuite, "SetUpTestSuite()"); + const bool skip_all = ad_hoc_test_result().Failed(); + start_timestamp_ = internal::GetTimeInMillis(); internal::Timer timer; for (int i = 0; i < total_test_count(); i++) { - GetMutableTestInfo(i)->Run(); - if (GTEST_FLAG(fail_fast) && GetMutableTestInfo(i)->result()->Failed()) { + if (skip_all) { + GetMutableTestInfo(i)->Skip(); + } else { + GetMutableTestInfo(i)->Run(); + } + if (GTEST_FLAG_GET(fail_fast) && + GetMutableTestInfo(i)->result()->Failed()) { for (int j = i + 1; j < total_test_count(); j++) { GetMutableTestInfo(j)->Skip(); } @@ -3089,11 +3088,10 @@ void TestSuite::UnshuffleTests() { // // FormatCountableNoun(1, "formula", "formuli") returns "1 formula". // FormatCountableNoun(5, "book", "books") returns "5 books". -static std::string FormatCountableNoun(int count, - const char * singular_form, - const char * plural_form) { +static std::string FormatCountableNoun(int count, const char* singular_form, + const char* plural_form) { return internal::StreamableToString(count) + " " + - (count == 1 ? singular_form : plural_form); + (count == 1 ? singular_form : plural_form); } // Formats the count of tests. @@ -3110,7 +3108,7 @@ static std::string FormatTestSuiteCount(int test_suite_count) { // representation. Both kNonFatalFailure and kFatalFailure are translated // to "Failure", as the user usually doesn't care about the difference // between the two when viewing the test result. -static const char * TestPartResultTypeToString(TestPartResult::Type type) { +static const char* TestPartResultTypeToString(TestPartResult::Type type) { switch (type) { case TestPartResult::kSkip: return "Skipped\n"; @@ -3137,17 +3135,18 @@ enum class GTestColor { kDefault, kRed, kGreen, kYellow }; // Prints a TestPartResult to an std::string. static std::string PrintTestPartResultToString( const TestPartResult& test_part_result) { - return (Message() - << internal::FormatFileLocation(test_part_result.file_name(), - test_part_result.line_number()) - << " " << TestPartResultTypeToString(test_part_result.type()) - << test_part_result.message()).GetString(); + return (Message() << internal::FormatFileLocation( + test_part_result.file_name(), + test_part_result.line_number()) + << " " + << TestPartResultTypeToString(test_part_result.type()) + << test_part_result.message()) + .GetString(); } // Prints a TestPartResult. static void PrintTestPartResult(const TestPartResult& test_part_result) { - const std::string& result = - PrintTestPartResultToString(test_part_result); + const std::string& result = PrintTestPartResultToString(test_part_result); printf("%s\n", result.c_str()); fflush(stdout); // If the test program runs in Visual Studio or a debugger, the @@ -3164,8 +3163,8 @@ static void PrintTestPartResult(const TestPartResult& test_part_result) { } // class PrettyUnitTestResultPrinter -#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE && \ - !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT && !GTEST_OS_WINDOWS_MINGW +#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_WINDOWS_PHONE && \ + !GTEST_OS_WINDOWS_RT && !GTEST_OS_WINDOWS_MINGW // Returns the character attribute for the given color. static WORD GetColorAttribute(GTestColor color) { @@ -3176,7 +3175,8 @@ static WORD GetColorAttribute(GTestColor color) { return FOREGROUND_GREEN; case GTestColor::kYellow: return FOREGROUND_RED | FOREGROUND_GREEN; - default: return 0; + default: + return 0; } } @@ -3232,7 +3232,8 @@ static const char* GetAnsiColorCode(GTestColor color) { // Returns true if and only if Google Test should use colors in the output. bool ShouldUseColor(bool stdout_is_tty) { - const char* const gtest_color = GTEST_FLAG(color).c_str(); + std::string c = GTEST_FLAG_GET(color); + const char* const gtest_color = c.c_str(); if (String::CaseInsensitiveCStringEquals(gtest_color, "auto")) { #if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MINGW @@ -3259,9 +3260,9 @@ bool ShouldUseColor(bool stdout_is_tty) { } return String::CaseInsensitiveCStringEquals(gtest_color, "yes") || - String::CaseInsensitiveCStringEquals(gtest_color, "true") || - String::CaseInsensitiveCStringEquals(gtest_color, "t") || - String::CStringEquals(gtest_color, "1"); + String::CaseInsensitiveCStringEquals(gtest_color, "true") || + String::CaseInsensitiveCStringEquals(gtest_color, "t") || + String::CStringEquals(gtest_color, "1"); // We take "yes", "true", "t", and "1" as meaning "yes". If the // value is neither one of these nor "auto", we treat it as "no" to // be conservative. @@ -3273,18 +3274,13 @@ bool ShouldUseColor(bool stdout_is_tty) { // that would be colored when printed, as can be done on Linux. GTEST_ATTRIBUTE_PRINTF_(2, 3) -static void ColoredPrintf(GTestColor color, const char *fmt, ...) { +static void ColoredPrintf(GTestColor color, const char* fmt, ...) { va_list args; va_start(args, fmt); -#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_ZOS || GTEST_OS_IOS || \ - GTEST_OS_WINDOWS_PHONE || GTEST_OS_WINDOWS_RT || defined(ESP_PLATFORM) - const bool use_color = AlwaysFalse(); -#else static const bool in_color_mode = ShouldUseColor(posix::IsATTY(posix::FileNo(stdout)) != 0); const bool use_color = in_color_mode && (color != GTestColor::kDefault); -#endif // GTEST_OS_WINDOWS_MOBILE || GTEST_OS_ZOS if (!use_color) { vprintf(fmt, args); @@ -3292,8 +3288,8 @@ static void ColoredPrintf(GTestColor color, const char *fmt, ...) { return; } -#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE && \ - !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT && !GTEST_OS_WINDOWS_MINGW +#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_WINDOWS_PHONE && \ + !GTEST_OS_WINDOWS_RT && !GTEST_OS_WINDOWS_MINGW const HANDLE stdout_handle = GetStdHandle(STD_OUTPUT_HANDLE); // Gets the current text color. @@ -3364,6 +3360,7 @@ class PrettyUnitTestResultPrinter : public TestEventListener { #endif // OnTestCaseStart void OnTestStart(const TestInfo& test_info) override; + void OnTestDisabled(const TestInfo& test_info) override; void OnTestPartResult(const TestPartResult& result) override; void OnTestEnd(const TestInfo& test_info) override; @@ -3384,13 +3381,14 @@ class PrettyUnitTestResultPrinter : public TestEventListener { static void PrintSkippedTests(const UnitTest& unit_test); }; - // Fired before each iteration of tests starts. +// Fired before each iteration of tests starts. void PrettyUnitTestResultPrinter::OnTestIterationStart( const UnitTest& unit_test, int iteration) { - if (GTEST_FLAG(repeat) != 1) + if (GTEST_FLAG_GET(repeat) != 1) printf("\nRepeating all tests (iteration %d) . . .\n\n", iteration + 1); - const char* const filter = GTEST_FLAG(filter).c_str(); + std::string f = GTEST_FLAG_GET(filter); + const char* const filter = f.c_str(); // Prints the filter if it's not *. This reminds the user that some // tests may be skipped. @@ -3406,7 +3404,7 @@ void PrettyUnitTestResultPrinter::OnTestIterationStart( internal::posix::GetEnv(kTestTotalShards)); } - if (GTEST_FLAG(shuffle)) { + if (GTEST_FLAG_GET(shuffle)) { ColoredPrintf(GTestColor::kYellow, "Note: Randomizing tests' orders with a seed of %d .\n", unit_test.random_seed()); @@ -3462,6 +3460,13 @@ void PrettyUnitTestResultPrinter::OnTestStart(const TestInfo& test_info) { fflush(stdout); } +void PrettyUnitTestResultPrinter::OnTestDisabled(const TestInfo& test_info) { + ColoredPrintf(GTestColor::kYellow, "[ DISABLED ] "); + PrintTestName(test_info.test_suite_name(), test_info.name()); + printf("\n"); + fflush(stdout); +} + // Called after an assertion failure. void PrettyUnitTestResultPrinter::OnTestPartResult( const TestPartResult& result) { @@ -3486,12 +3491,12 @@ void PrettyUnitTestResultPrinter::OnTestEnd(const TestInfo& test_info) { ColoredPrintf(GTestColor::kRed, "[ FAILED ] "); } PrintTestName(test_info.test_suite_name(), test_info.name()); - if (test_info.result()->Failed()) - PrintFullTestCommentIfPresent(test_info); + if (test_info.result()->Failed()) PrintFullTestCommentIfPresent(test_info); - if (GTEST_FLAG(print_time)) { - printf(" (%s ms)\n", internal::StreamableToString( - test_info.result()->elapsed_time()).c_str()); + if (GTEST_FLAG_GET(print_time)) { + printf(" (%s ms)\n", + internal::StreamableToString(test_info.result()->elapsed_time()) + .c_str()); } else { printf("\n"); } @@ -3500,7 +3505,7 @@ void PrettyUnitTestResultPrinter::OnTestEnd(const TestInfo& test_info) { #ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ void PrettyUnitTestResultPrinter::OnTestCaseEnd(const TestCase& test_case) { - if (!GTEST_FLAG(print_time)) return; + if (!GTEST_FLAG_GET(print_time)) return; const std::string counts = FormatCountableNoun(test_case.test_to_run_count(), "test", "tests"); @@ -3511,7 +3516,7 @@ void PrettyUnitTestResultPrinter::OnTestCaseEnd(const TestCase& test_case) { } #else void PrettyUnitTestResultPrinter::OnTestSuiteEnd(const TestSuite& test_suite) { - if (!GTEST_FLAG(print_time)) return; + if (!GTEST_FLAG_GET(print_time)) return; const std::string counts = FormatCountableNoun(test_suite.test_to_run_count(), "test", "tests"); @@ -3607,7 +3612,7 @@ void PrettyUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test, printf("%s from %s ran.", FormatTestCount(unit_test.test_to_run_count()).c_str(), FormatTestSuiteCount(unit_test.test_suite_to_run_count()).c_str()); - if (GTEST_FLAG(print_time)) { + if (GTEST_FLAG_GET(print_time)) { printf(" (%s ms total)", internal::StreamableToString(unit_test.elapsed_time()).c_str()); } @@ -3628,7 +3633,7 @@ void PrettyUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test, } int num_disabled = unit_test.reportable_disabled_test_count(); - if (num_disabled && !GTEST_FLAG(also_run_disabled_tests)) { + if (num_disabled && !GTEST_FLAG_GET(also_run_disabled_tests)) { if (unit_test.Passed()) { printf("\n"); // Add a spacer if no FAILURE banner is displayed. } @@ -3664,6 +3669,7 @@ class BriefUnitTestResultPrinter : public TestEventListener { #endif // OnTestCaseStart void OnTestStart(const TestInfo& /*test_info*/) override {} + void OnTestDisabled(const TestInfo& /*test_info*/) override {} void OnTestPartResult(const TestPartResult& result) override; void OnTestEnd(const TestInfo& test_info) override; @@ -3700,7 +3706,7 @@ void BriefUnitTestResultPrinter::OnTestEnd(const TestInfo& test_info) { PrintTestName(test_info.test_suite_name(), test_info.name()); PrintFullTestCommentIfPresent(test_info); - if (GTEST_FLAG(print_time)) { + if (GTEST_FLAG_GET(print_time)) { printf(" (%s ms)\n", internal::StreamableToString(test_info.result()->elapsed_time()) .c_str()); @@ -3717,7 +3723,7 @@ void BriefUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test, printf("%s from %s ran.", FormatTestCount(unit_test.test_to_run_count()).c_str(), FormatTestSuiteCount(unit_test.test_suite_to_run_count()).c_str()); - if (GTEST_FLAG(print_time)) { + if (GTEST_FLAG_GET(print_time)) { printf(" (%s ms total)", internal::StreamableToString(unit_test.elapsed_time()).c_str()); } @@ -3732,7 +3738,7 @@ void BriefUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test, } int num_disabled = unit_test.reportable_disabled_test_count(); - if (num_disabled && !GTEST_FLAG(also_run_disabled_tests)) { + if (num_disabled && !GTEST_FLAG_GET(also_run_disabled_tests)) { if (unit_test.Passed()) { printf("\n"); // Add a spacer if no FAILURE banner is displayed. } @@ -3752,7 +3758,7 @@ class TestEventRepeater : public TestEventListener { public: TestEventRepeater() : forwarding_enabled_(true) {} ~TestEventRepeater() override; - void Append(TestEventListener *listener); + void Append(TestEventListener* listener); TestEventListener* Release(TestEventListener* listener); // Controls whether events will be forwarded to listeners_. Set to false @@ -3770,6 +3776,7 @@ class TestEventRepeater : public TestEventListener { #endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ void OnTestSuiteStart(const TestSuite& parameter) override; void OnTestStart(const TestInfo& test_info) override; + void OnTestDisabled(const TestInfo& test_info) override; void OnTestPartResult(const TestPartResult& result) override; void OnTestEnd(const TestInfo& test_info) override; // Legacy API is deprecated but still available @@ -3789,18 +3796,19 @@ class TestEventRepeater : public TestEventListener { // The list of listeners that receive events. std::vector listeners_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(TestEventRepeater); + TestEventRepeater(const TestEventRepeater&) = delete; + TestEventRepeater& operator=(const TestEventRepeater&) = delete; }; TestEventRepeater::~TestEventRepeater() { ForEach(listeners_, Delete); } -void TestEventRepeater::Append(TestEventListener *listener) { +void TestEventRepeater::Append(TestEventListener* listener) { listeners_.push_back(listener); } -TestEventListener* TestEventRepeater::Release(TestEventListener *listener) { +TestEventListener* TestEventRepeater::Release(TestEventListener* listener) { for (size_t i = 0; i < listeners_.size(); ++i) { if (listeners_[i] == listener) { listeners_.erase(listeners_.begin() + static_cast(i)); @@ -3813,14 +3821,14 @@ TestEventListener* TestEventRepeater::Release(TestEventListener *listener) { // Since most methods are very similar, use macros to reduce boilerplate. // This defines a member that forwards the call to all listeners. -#define GTEST_REPEATER_METHOD_(Name, Type) \ -void TestEventRepeater::Name(const Type& parameter) { \ - if (forwarding_enabled_) { \ - for (size_t i = 0; i < listeners_.size(); i++) { \ - listeners_[i]->Name(parameter); \ - } \ - } \ -} +#define GTEST_REPEATER_METHOD_(Name, Type) \ + void TestEventRepeater::Name(const Type& parameter) { \ + if (forwarding_enabled_) { \ + for (size_t i = 0; i < listeners_.size(); i++) { \ + listeners_[i]->Name(parameter); \ + } \ + } \ + } // This defines a member that forwards the call to all listeners in reverse // order. #define GTEST_REVERSE_REPEATER_METHOD_(Name, Type) \ @@ -3840,6 +3848,7 @@ GTEST_REPEATER_METHOD_(OnTestCaseStart, TestSuite) #endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ GTEST_REPEATER_METHOD_(OnTestSuiteStart, TestSuite) GTEST_REPEATER_METHOD_(OnTestStart, TestInfo) +GTEST_REPEATER_METHOD_(OnTestDisabled, TestInfo) GTEST_REPEATER_METHOD_(OnTestPartResult, TestPartResult) GTEST_REPEATER_METHOD_(OnEnvironmentsTearDownStart, UnitTest) GTEST_REVERSE_REPEATER_METHOD_(OnEnvironmentsSetUpEnd, UnitTest) @@ -3890,12 +3899,13 @@ class XmlUnitTestResultPrinter : public EmptyTestEventListener { private: // Is c a whitespace character that is normalized to a space character // when it appears in an XML attribute value? - static bool IsNormalizableWhitespace(char c) { - return c == 0x9 || c == 0xA || c == 0xD; + static bool IsNormalizableWhitespace(unsigned char c) { + return c == '\t' || c == '\n' || c == '\r'; } // May c appear in a well-formed XML document? - static bool IsValidXmlCharacter(char c) { + // https://www.w3.org/TR/REC-xml/#charsets + static bool IsValidXmlCharacter(unsigned char c) { return IsNormalizableWhitespace(c) || c >= 0x20; } @@ -3965,7 +3975,8 @@ class XmlUnitTestResultPrinter : public EmptyTestEventListener { // The output file. const std::string output_file_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(XmlUnitTestResultPrinter); + XmlUnitTestResultPrinter(const XmlUnitTestResultPrinter&) = delete; + XmlUnitTestResultPrinter& operator=(const XmlUnitTestResultPrinter&) = delete; }; // Creates a new XmlUnitTestResultPrinter. @@ -4005,8 +4016,8 @@ void XmlUnitTestResultPrinter::ListTestsMatchingFilter( // module will consist of ordinary English text. // If this module is ever modified to produce version 1.1 XML output, // most invalid characters can be retained using character references. -std::string XmlUnitTestResultPrinter::EscapeXml( - const std::string& str, bool is_attribute) { +std::string XmlUnitTestResultPrinter::EscapeXml(const std::string& str, + bool is_attribute) { Message m; for (size_t i = 0; i < str.size(); ++i) { @@ -4034,8 +4045,9 @@ std::string XmlUnitTestResultPrinter::EscapeXml( m << '"'; break; default: - if (IsValidXmlCharacter(ch)) { - if (is_attribute && IsNormalizableWhitespace(ch)) + if (IsValidXmlCharacter(static_cast(ch))) { + if (is_attribute && + IsNormalizableWhitespace(static_cast(ch))) m << "&#x" << String::FormatByte(static_cast(ch)) << ";"; else @@ -4056,7 +4068,7 @@ std::string XmlUnitTestResultPrinter::RemoveInvalidXmlCharacters( std::string output; output.reserve(str.size()); for (std::string::const_iterator it = str.begin(); it != str.end(); ++it) - if (IsValidXmlCharacter(*it)) + if (IsValidXmlCharacter(static_cast(*it))) output.push_back(*it); return output; @@ -4064,7 +4076,6 @@ std::string XmlUnitTestResultPrinter::RemoveInvalidXmlCharacters( // The following routines generate an XML representation of a UnitTest // object. -// GOOGLETEST_CM0009 DO NOT DELETE // // This is how Google Test concepts map to the DTD: // @@ -4113,12 +4124,12 @@ std::string FormatEpochTimeInMillisAsIso8601(TimeInMillis ms) { return ""; // YYYY-MM-DDThh:mm:ss.sss return StreamableToString(time_struct.tm_year + 1900) + "-" + - String::FormatIntWidth2(time_struct.tm_mon + 1) + "-" + - String::FormatIntWidth2(time_struct.tm_mday) + "T" + - String::FormatIntWidth2(time_struct.tm_hour) + ":" + - String::FormatIntWidth2(time_struct.tm_min) + ":" + - String::FormatIntWidth2(time_struct.tm_sec) + "." + - String::FormatIntWidthN(static_cast(ms % 1000), 3); + String::FormatIntWidth2(time_struct.tm_mon + 1) + "-" + + String::FormatIntWidth2(time_struct.tm_mday) + "T" + + String::FormatIntWidth2(time_struct.tm_hour) + ":" + + String::FormatIntWidth2(time_struct.tm_min) + ":" + + String::FormatIntWidth2(time_struct.tm_sec) + "." + + String::FormatIntWidthN(static_cast(ms % 1000), 3); } // Streams an XML CDATA section, escaping invalid CDATA sequences as needed. @@ -4129,8 +4140,8 @@ void XmlUnitTestResultPrinter::OutputXmlCDataSection(::std::ostream* stream, for (;;) { const char* const next_segment = strstr(segment, "]]>"); if (next_segment != nullptr) { - stream->write( - segment, static_cast(next_segment - segment)); + stream->write(segment, + static_cast(next_segment - segment)); *stream << "]]>]]>"); } else { @@ -4142,15 +4153,13 @@ void XmlUnitTestResultPrinter::OutputXmlCDataSection(::std::ostream* stream, } void XmlUnitTestResultPrinter::OutputXmlAttribute( - std::ostream* stream, - const std::string& element_name, - const std::string& name, - const std::string& value) { + std::ostream* stream, const std::string& element_name, + const std::string& name, const std::string& value) { const std::vector& allowed_names = GetReservedOutputAttributesForElement(element_name); GTEST_CHECK_(std::find(allowed_names.begin(), allowed_names.end(), name) != - allowed_names.end()) + allowed_names.end()) << "Attribute " << name << " is not allowed for element <" << element_name << ">."; @@ -4216,10 +4225,11 @@ void XmlUnitTestResultPrinter::OutputXmlTestInfo(::std::ostream* stream, OutputXmlAttribute(stream, kTestsuite, "type_param", test_info.type_param()); } - if (GTEST_FLAG(list_tests)) { - OutputXmlAttribute(stream, kTestsuite, "file", test_info.file()); - OutputXmlAttribute(stream, kTestsuite, "line", - StreamableToString(test_info.line())); + + OutputXmlAttribute(stream, kTestsuite, "file", test_info.file()); + OutputXmlAttribute(stream, kTestsuite, "line", + StreamableToString(test_info.line())); + if (GTEST_FLAG_GET(list_tests)) { *stream << " />\n"; return; } @@ -4254,8 +4264,7 @@ void XmlUnitTestResultPrinter::OutputXmlTestResult(::std::ostream* stream, internal::FormatCompilerIndependentFileLocation(part.file_name(), part.line_number()); const std::string summary = location + "\n" + part.summary(); - *stream << " "; const std::string detail = location + "\n" + part.message(); OutputXmlCDataSection(stream, RemoveInvalidXmlCharacters(detail).c_str()); @@ -4295,7 +4304,7 @@ void XmlUnitTestResultPrinter::PrintXmlTestSuite(std::ostream* stream, OutputXmlAttribute(stream, kTestsuite, "name", test_suite.name()); OutputXmlAttribute(stream, kTestsuite, "tests", StreamableToString(test_suite.reportable_test_count())); - if (!GTEST_FLAG(list_tests)) { + if (!GTEST_FLAG_GET(list_tests)) { OutputXmlAttribute(stream, kTestsuite, "failures", StreamableToString(test_suite.failed_test_count())); OutputXmlAttribute( @@ -4343,7 +4352,7 @@ void XmlUnitTestResultPrinter::PrintXmlUnitTest(std::ostream* stream, stream, kTestsuites, "timestamp", FormatEpochTimeInMillisAsIso8601(unit_test.start_timestamp())); - if (GTEST_FLAG(shuffle)) { + if (GTEST_FLAG_GET(shuffle)) { OutputXmlAttribute(stream, kTestsuites, "random_seed", StreamableToString(unit_test.random_seed())); } @@ -4396,7 +4405,7 @@ std::string XmlUnitTestResultPrinter::TestPropertiesAsXmlAttributes( for (int i = 0; i < result.test_property_count(); ++i) { const TestProperty& property = result.GetTestProperty(i); attributes << " " << property.key() << "=" - << "\"" << EscapeXmlAttribute(property.value()) << "\""; + << "\"" << EscapeXmlAttribute(property.value()) << "\""; } return attributes.GetString(); } @@ -4410,15 +4419,15 @@ void XmlUnitTestResultPrinter::OutputXmlTestProperties( return; } - *stream << "<" << kProperties << ">\n"; + *stream << " <" << kProperties << ">\n"; for (int i = 0; i < result.test_property_count(); ++i) { const TestProperty& property = result.GetTestProperty(i); - *stream << "<" << kProperty; + *stream << " <" << kProperty; *stream << " name=\"" << EscapeXmlAttribute(property.key()) << "\""; *stream << " value=\"" << EscapeXmlAttribute(property.value()) << "\""; *stream << "/>\n"; } - *stream << "\n"; + *stream << " \n"; } // End XmlUnitTestResultPrinter @@ -4442,16 +4451,12 @@ class JsonUnitTestResultPrinter : public EmptyTestEventListener { //// streams the attribute as JSON. static void OutputJsonKey(std::ostream* stream, const std::string& element_name, - const std::string& name, - const std::string& value, - const std::string& indent, - bool comma = true); + const std::string& name, const std::string& value, + const std::string& indent, bool comma = true); static void OutputJsonKey(std::ostream* stream, const std::string& element_name, - const std::string& name, - int value, - const std::string& indent, - bool comma = true); + const std::string& name, int value, + const std::string& indent, bool comma = true); // Streams a test suite JSON stanza containing the given test result. // @@ -4484,7 +4489,9 @@ class JsonUnitTestResultPrinter : public EmptyTestEventListener { // The output file. const std::string output_file_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(JsonUnitTestResultPrinter); + JsonUnitTestResultPrinter(const JsonUnitTestResultPrinter&) = delete; + JsonUnitTestResultPrinter& operator=(const JsonUnitTestResultPrinter&) = + delete; }; // Creates a new JsonUnitTestResultPrinter. @@ -4496,7 +4503,7 @@ JsonUnitTestResultPrinter::JsonUnitTestResultPrinter(const char* output_file) } void JsonUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test, - int /*iteration*/) { + int /*iteration*/) { FILE* jsonout = OpenFileForWriting(output_file_); std::stringstream stream; PrintJsonUnitTest(&stream, unit_test); @@ -4562,55 +4569,48 @@ static std::string FormatEpochTimeInMillisAsRFC3339(TimeInMillis ms) { return ""; // YYYY-MM-DDThh:mm:ss return StreamableToString(time_struct.tm_year + 1900) + "-" + - String::FormatIntWidth2(time_struct.tm_mon + 1) + "-" + - String::FormatIntWidth2(time_struct.tm_mday) + "T" + - String::FormatIntWidth2(time_struct.tm_hour) + ":" + - String::FormatIntWidth2(time_struct.tm_min) + ":" + - String::FormatIntWidth2(time_struct.tm_sec) + "Z"; + String::FormatIntWidth2(time_struct.tm_mon + 1) + "-" + + String::FormatIntWidth2(time_struct.tm_mday) + "T" + + String::FormatIntWidth2(time_struct.tm_hour) + ":" + + String::FormatIntWidth2(time_struct.tm_min) + ":" + + String::FormatIntWidth2(time_struct.tm_sec) + "Z"; } static inline std::string Indent(size_t width) { return std::string(width, ' '); } -void JsonUnitTestResultPrinter::OutputJsonKey( - std::ostream* stream, - const std::string& element_name, - const std::string& name, - const std::string& value, - const std::string& indent, - bool comma) { +void JsonUnitTestResultPrinter::OutputJsonKey(std::ostream* stream, + const std::string& element_name, + const std::string& name, + const std::string& value, + const std::string& indent, + bool comma) { const std::vector& allowed_names = GetReservedOutputAttributesForElement(element_name); GTEST_CHECK_(std::find(allowed_names.begin(), allowed_names.end(), name) != - allowed_names.end()) + allowed_names.end()) << "Key \"" << name << "\" is not allowed for value \"" << element_name << "\"."; *stream << indent << "\"" << name << "\": \"" << EscapeJson(value) << "\""; - if (comma) - *stream << ",\n"; + if (comma) *stream << ",\n"; } void JsonUnitTestResultPrinter::OutputJsonKey( - std::ostream* stream, - const std::string& element_name, - const std::string& name, - int value, - const std::string& indent, - bool comma) { + std::ostream* stream, const std::string& element_name, + const std::string& name, int value, const std::string& indent, bool comma) { const std::vector& allowed_names = GetReservedOutputAttributesForElement(element_name); GTEST_CHECK_(std::find(allowed_names.begin(), allowed_names.end(), name) != - allowed_names.end()) + allowed_names.end()) << "Key \"" << name << "\" is not allowed for value \"" << element_name << "\"."; *stream << indent << "\"" << name << "\": " << StreamableToString(value); - if (comma) - *stream << ",\n"; + if (comma) *stream << ",\n"; } // Streams a test suite JSON stanza containing the given test result. @@ -4620,7 +4620,7 @@ void JsonUnitTestResultPrinter::OutputJsonTestSuiteForTestResult( *stream << Indent(4) << "{\n"; OutputJsonKey(stream, "testsuite", "name", "NonTestSuiteFailure", Indent(6)); OutputJsonKey(stream, "testsuite", "tests", 1, Indent(6)); - if (!GTEST_FLAG(list_tests)) { + if (!GTEST_FLAG_GET(list_tests)) { OutputJsonKey(stream, "testsuite", "failures", 1, Indent(6)); OutputJsonKey(stream, "testsuite", "disabled", 0, Indent(6)); OutputJsonKey(stream, "testsuite", "skipped", 0, Indent(6)); @@ -4674,11 +4674,14 @@ void JsonUnitTestResultPrinter::OutputJsonTestInfo(::std::ostream* stream, OutputJsonKey(stream, kTestsuite, "type_param", test_info.type_param(), kIndent); } - if (GTEST_FLAG(list_tests)) { - OutputJsonKey(stream, kTestsuite, "file", test_info.file(), kIndent); - OutputJsonKey(stream, kTestsuite, "line", test_info.line(), kIndent, false); + + OutputJsonKey(stream, kTestsuite, "file", test_info.file(), kIndent); + OutputJsonKey(stream, kTestsuite, "line", test_info.line(), kIndent, false); + if (GTEST_FLAG_GET(list_tests)) { *stream << "\n" << Indent(8) << "}"; return; + } else { + *stream << ",\n"; } OutputJsonKey(stream, kTestsuite, "status", @@ -4710,7 +4713,9 @@ void JsonUnitTestResultPrinter::OutputJsonTestResult(::std::ostream* stream, if (part.failed()) { *stream << ",\n"; if (++failures == 1) { - *stream << kIndent << "\"" << "failures" << "\": [\n"; + *stream << kIndent << "\"" + << "failures" + << "\": [\n"; } const std::string location = internal::FormatCompilerIndependentFileLocation(part.file_name(), @@ -4723,8 +4728,7 @@ void JsonUnitTestResultPrinter::OutputJsonTestResult(::std::ostream* stream, } } - if (failures > 0) - *stream << "\n" << kIndent << "]"; + if (failures > 0) *stream << "\n" << kIndent << "]"; *stream << "\n" << Indent(8) << "}"; } @@ -4738,7 +4742,7 @@ void JsonUnitTestResultPrinter::PrintJsonTestSuite( OutputJsonKey(stream, kTestsuite, "name", test_suite.name(), kIndent); OutputJsonKey(stream, kTestsuite, "tests", test_suite.reportable_test_count(), kIndent); - if (!GTEST_FLAG(list_tests)) { + if (!GTEST_FLAG_GET(list_tests)) { OutputJsonKey(stream, kTestsuite, "failures", test_suite.failed_test_count(), kIndent); OutputJsonKey(stream, kTestsuite, "disabled", @@ -4785,7 +4789,7 @@ void JsonUnitTestResultPrinter::PrintJsonUnitTest(std::ostream* stream, OutputJsonKey(stream, kTestsuites, "disabled", unit_test.reportable_disabled_test_count(), kIndent); OutputJsonKey(stream, kTestsuites, "errors", 0, kIndent); - if (GTEST_FLAG(shuffle)) { + if (GTEST_FLAG_GET(shuffle)) { OutputJsonKey(stream, kTestsuites, "random_seed", unit_test.random_seed(), kIndent); } @@ -4820,7 +4824,9 @@ void JsonUnitTestResultPrinter::PrintJsonUnitTest(std::ostream* stream, OutputJsonTestSuiteForTestResult(stream, unit_test.ad_hoc_test_result()); } - *stream << "\n" << kIndent << "]\n" << "}\n"; + *stream << "\n" + << kIndent << "]\n" + << "}\n"; } void JsonUnitTestResultPrinter::PrintJsonTestList( @@ -4855,7 +4861,8 @@ std::string JsonUnitTestResultPrinter::TestPropertiesAsJson( Message attributes; for (int i = 0; i < result.test_property_count(); ++i) { const TestProperty& property = result.GetTestProperty(i); - attributes << ",\n" << indent << "\"" << property.key() << "\": " + attributes << ",\n" + << indent << "\"" << property.key() << "\": " << "\"" << EscapeJson(property.value()) << "\""; } return attributes.GetString(); @@ -4895,14 +4902,14 @@ void StreamingListener::SocketWriter::MakeConnection() { addrinfo hints; memset(&hints, 0, sizeof(hints)); - hints.ai_family = AF_UNSPEC; // To allow both IPv4 and IPv6 addresses. + hints.ai_family = AF_UNSPEC; // To allow both IPv4 and IPv6 addresses. hints.ai_socktype = SOCK_STREAM; addrinfo* servinfo = nullptr; // Use the getaddrinfo() to get a linked list of IP addresses for // the given host name. - const int error_num = getaddrinfo( - host_name_.c_str(), port_num_.c_str(), &hints, &servinfo); + const int error_num = + getaddrinfo(host_name_.c_str(), port_num_.c_str(), &hints, &servinfo); if (error_num != 0) { GTEST_LOG_(WARNING) << "stream_result_to: getaddrinfo() failed: " << gai_strerror(error_num); @@ -4911,8 +4918,8 @@ void StreamingListener::SocketWriter::MakeConnection() { // Loop through all the results and connect to the first we can. for (addrinfo* cur_addr = servinfo; sockfd_ == -1 && cur_addr != nullptr; cur_addr = cur_addr->ai_next) { - sockfd_ = socket( - cur_addr->ai_family, cur_addr->ai_socktype, cur_addr->ai_protocol); + sockfd_ = socket(cur_addr->ai_family, cur_addr->ai_socktype, + cur_addr->ai_protocol); if (sockfd_ != -1) { // Connect the client socket to the server socket. if (connect(sockfd_, cur_addr->ai_addr, cur_addr->ai_addrlen) == -1) { @@ -4962,7 +4969,7 @@ std::string OsStackTraceGetter::CurrentStackTrace(int max_depth, int skip_count) for (int i = 0; i < raw_stack_size; ++i) { if (raw_stack[i] == caller_frame && - !GTEST_FLAG(show_internal_stack_frames)) { + !GTEST_FLAG_GET(show_internal_stack_frames)) { // Add a marker to the trace and stop adding frames. absl::StrAppend(&result, kElidedFramesMarker, "\n"); break; @@ -4981,7 +4988,7 @@ std::string OsStackTraceGetter::CurrentStackTrace(int max_depth, int skip_count) return result; -#else // !GTEST_HAS_ABSL +#else // !GTEST_HAS_ABSL static_cast(max_depth); static_cast(skip_count); return ""; @@ -5005,14 +5012,14 @@ void OsStackTraceGetter::UponLeavingGTest() GTEST_LOCK_EXCLUDED_(mutex_) { class ScopedPrematureExitFile { public: explicit ScopedPrematureExitFile(const char* premature_exit_filepath) - : premature_exit_filepath_(premature_exit_filepath ? - premature_exit_filepath : "") { + : premature_exit_filepath_( + premature_exit_filepath ? premature_exit_filepath : "") { // If a path to the premature-exit file is specified... if (!premature_exit_filepath_.empty()) { // create the file with a single "0" character in it. I/O // errors are ignored as there's nothing better we can do and we // don't want to fail the test because of this. - FILE* pfile = posix::FOpen(premature_exit_filepath, "w"); + FILE* pfile = posix::FOpen(premature_exit_filepath_.c_str(), "w"); fwrite("0", 1, 1, pfile); fclose(pfile); } @@ -5034,7 +5041,8 @@ class ScopedPrematureExitFile { private: const std::string premature_exit_filepath_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(ScopedPrematureExitFile); + ScopedPrematureExitFile(const ScopedPrematureExitFile&) = delete; + ScopedPrematureExitFile& operator=(const ScopedPrematureExitFile&) = delete; }; } // namespace internal @@ -5208,7 +5216,7 @@ int UnitTest::test_to_run_count() const { return impl()->test_to_run_count(); } // Gets the time of the test program start, in ms from the start of the // UNIX epoch. internal::TimeInMillis UnitTest::start_timestamp() const { - return impl()->start_timestamp(); + return impl()->start_timestamp(); } // Gets the elapsed time, in milliseconds. @@ -5251,9 +5259,7 @@ TestSuite* UnitTest::GetMutableTestSuite(int i) { // Returns the list of event listeners that can be used to track events // inside Google Test. -TestEventListeners& UnitTest::listeners() { - return *impl()->listeners(); -} +TestEventListeners& UnitTest::listeners() { return *impl()->listeners(); } // Registers and returns a global test environment. When a test // program is run, all global test environments will be set-up in the @@ -5278,12 +5284,11 @@ Environment* UnitTest::AddEnvironment(Environment* env) { // assertion macros (e.g. ASSERT_TRUE, EXPECT_EQ, etc) eventually call // this to report their results. The user code should use the // assertion macros instead of calling this directly. -void UnitTest::AddTestPartResult( - TestPartResult::Type result_type, - const char* file_name, - int line_number, - const std::string& message, - const std::string& os_stack_trace) GTEST_LOCK_EXCLUDED_(mutex_) { +void UnitTest::AddTestPartResult(TestPartResult::Type result_type, + const char* file_name, int line_number, + const std::string& message, + const std::string& os_stack_trace) + GTEST_LOCK_EXCLUDED_(mutex_) { Message msg; msg << message; @@ -5293,8 +5298,9 @@ void UnitTest::AddTestPartResult( for (size_t i = impl_->gtest_trace_stack().size(); i > 0; --i) { const internal::TraceInfo& trace = impl_->gtest_trace_stack()[i - 1]; - msg << "\n" << internal::FormatFileLocation(trace.file, trace.line) - << " " << trace.message; + msg << "\n" + << internal::FormatFileLocation(trace.file, trace.line) << " " + << trace.message; } } @@ -5304,8 +5310,8 @@ void UnitTest::AddTestPartResult( const TestPartResult result = TestPartResult( result_type, file_name, line_number, msg.GetString().c_str()); - impl_->GetTestPartResultReporterForCurrentThread()-> - ReportTestPartResult(result); + impl_->GetTestPartResultReporterForCurrentThread()->ReportTestPartResult( + result); if (result_type != TestPartResult::kSuccess && result_type != TestPartResult::kSkip) { @@ -5314,7 +5320,7 @@ void UnitTest::AddTestPartResult( // in the code (perhaps in order to use Google Test assertions // with another testing framework) and specify the former on the // command line for debugging. - if (GTEST_FLAG(break_on_failure)) { + if (GTEST_FLAG_GET(break_on_failure)) { #if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT // Using DebugBreak on Windows allows gtest to still break into a debugger // when a failure happens and both the --gtest_break_on_failure and @@ -5331,7 +5337,7 @@ void UnitTest::AddTestPartResult( // portability: some debuggers don't correctly trap abort(). *static_cast(nullptr) = 1; #endif // GTEST_OS_WINDOWS - } else if (GTEST_FLAG(throw_on_failure)) { + } else if (GTEST_FLAG_GET(throw_on_failure)) { #if GTEST_HAS_EXCEPTIONS throw internal::GoogleTestFailureException(result); #else @@ -5360,7 +5366,7 @@ void UnitTest::RecordProperty(const std::string& key, // from the main thread. int UnitTest::Run() { const bool in_death_test_child_process = - internal::GTEST_FLAG(internal_run_death_test).length() > 0; + GTEST_FLAG_GET(internal_run_death_test).length() > 0; // Google Test implements this protocol for catching that a test // program exits before returning control to Google Test: @@ -5390,7 +5396,7 @@ int UnitTest::Run() { // Captures the value of GTEST_FLAG(catch_exceptions). This value will be // used for the duration of the program. - impl()->set_catch_exceptions(GTEST_FLAG(catch_exceptions)); + impl()->set_catch_exceptions(GTEST_FLAG_GET(catch_exceptions)); #if GTEST_OS_WINDOWS // Either the user wants Google Test to catch exceptions thrown by the @@ -5398,26 +5404,26 @@ int UnitTest::Run() { // process. In either case the user does not want to see pop-up dialogs // about crashes - they are expected. if (impl()->catch_exceptions() || in_death_test_child_process) { -# if !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT +#if !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT // SetErrorMode doesn't exist on CE. SetErrorMode(SEM_FAILCRITICALERRORS | SEM_NOALIGNMENTFAULTEXCEPT | SEM_NOGPFAULTERRORBOX | SEM_NOOPENFILEERRORBOX); -# endif // !GTEST_OS_WINDOWS_MOBILE +#endif // !GTEST_OS_WINDOWS_MOBILE -# if (defined(_MSC_VER) || GTEST_OS_WINDOWS_MINGW) && !GTEST_OS_WINDOWS_MOBILE +#if (defined(_MSC_VER) || GTEST_OS_WINDOWS_MINGW) && !GTEST_OS_WINDOWS_MOBILE // Death test children can be terminated with _abort(). On Windows, // _abort() can show a dialog with a warning message. This forces the // abort message to go to stderr instead. _set_error_mode(_OUT_TO_STDERR); -# endif +#endif -# if defined(_MSC_VER) && !GTEST_OS_WINDOWS_MOBILE +#if defined(_MSC_VER) && !GTEST_OS_WINDOWS_MOBILE // In the debug version, Visual Studio pops up a separate dialog // offering a choice to debug the aborted program. We need to suppress // this dialog or it will pop up for every EXPECT/ASSERT_DEATH statement // executed. Google Test will notify the user of any unexpected // failure via stderr. - if (!GTEST_FLAG(break_on_failure)) + if (!GTEST_FLAG_GET(break_on_failure)) _set_abort_behavior( 0x0, // Clear the following flags: _WRITE_ABORT_MSG | _CALL_REPORTFAULT); // pop-up window, core dump. @@ -5431,14 +5437,15 @@ int UnitTest::Run() { _CRTDBG_MODE_FILE | _CRTDBG_MODE_DEBUG); (void)_CrtSetReportFile(_CRT_ASSERT, _CRTDBG_FILE_STDERR); } -# endif +#endif } #endif // GTEST_OS_WINDOWS return internal::HandleExceptionsInMethodIfSupported( - impl(), - &internal::UnitTestImpl::RunAllTests, - "auxiliary test code (environments or event listeners)") ? 0 : 1; + impl(), &internal::UnitTestImpl::RunAllTests, + "auxiliary test code (environments or event listeners)") + ? 0 + : 1; } // Returns the working directory when the first TEST() or TEST_F() was @@ -5483,14 +5490,10 @@ UnitTest::parameterized_test_registry() GTEST_LOCK_EXCLUDED_(mutex_) { } // Creates an empty UnitTest. -UnitTest::UnitTest() { - impl_ = new internal::UnitTestImpl(this); -} +UnitTest::UnitTest() { impl_ = new internal::UnitTestImpl(this); } // Destructor of UnitTest. -UnitTest::~UnitTest() { - delete impl_; -} +UnitTest::~UnitTest() { delete impl_; } // Pushes a trace defined by SCOPED_TRACE() on to the per-thread // Google Test trace stack. @@ -5501,8 +5504,7 @@ void UnitTest::PushGTestTrace(const internal::TraceInfo& trace) } // Pops a trace from the per-thread Google Test trace stack. -void UnitTest::PopGTestTrace() - GTEST_LOCK_EXCLUDED_(mutex_) { +void UnitTest::PopGTestTrace() GTEST_LOCK_EXCLUDED_(mutex_) { internal::MutexLock lock(&mutex_); impl_->gtest_trace_stack().pop_back(); } @@ -5599,12 +5601,12 @@ void UnitTestImpl::ConfigureXmlOutput() { // Initializes event listeners for streaming test results in string form. // Must not be called before InitGoogleTest. void UnitTestImpl::ConfigureStreamingOutput() { - const std::string& target = GTEST_FLAG(stream_result_to); + const std::string& target = GTEST_FLAG_GET(stream_result_to); if (!target.empty()) { const size_t pos = target.find(':'); if (pos != std::string::npos) { - listeners()->Append(new StreamingListener(target.substr(0, pos), - target.substr(pos+1))); + listeners()->Append( + new StreamingListener(target.substr(0, pos), target.substr(pos + 1))); } else { GTEST_LOG_(WARNING) << "unrecognized streaming target \"" << target << "\" ignored."; @@ -5642,7 +5644,7 @@ void UnitTestImpl::PostFlagParsingInit() { // to shut down the default XML output before invoking RUN_ALL_TESTS. ConfigureXmlOutput(); - if (GTEST_FLAG(brief)) { + if (GTEST_FLAG_GET(brief)) { listeners()->SetDefaultResultPrinter(new BriefUnitTestResultPrinter); } @@ -5652,7 +5654,7 @@ void UnitTestImpl::PostFlagParsingInit() { #endif // GTEST_CAN_STREAM_RESULTS_ #if GTEST_HAS_ABSL - if (GTEST_FLAG(install_failure_signal_handler)) { + if (GTEST_FLAG_GET(install_failure_signal_handler)) { absl::FailureSignalHandlerOptions options; absl::InstallFailureSignalHandler(options); } @@ -5710,9 +5712,9 @@ TestSuite* UnitTestImpl::GetTestSuite( auto* const new_test_suite = new TestSuite(test_suite_name, type_param, set_up_tc, tear_down_tc); + const UnitTestFilter death_test_suite_filter(kDeathTestSuiteFilter); // Is this a death test suite? - if (internal::UnitTestOptions::MatchesFilter(test_suite_name, - kDeathTestSuiteFilter)) { + if (death_test_suite_filter.MatchesName(test_suite_name)) { // Yes. Inserts the test suite after the last death test suite // defined so far. This only works when the test suites haven't // been shuffled. Otherwise we may end up running a death test @@ -5749,8 +5751,7 @@ bool UnitTestImpl::RunAllTests() { const bool gtest_is_initialized_before_run_all_tests = GTestIsInitialized(); // Do not run any test if the --help flag was specified. - if (g_help_flag) - return true; + if (g_help_flag) return true; // Repeats the call to the post-flag parsing initialization in case the // user didn't call InitGoogleTest. @@ -5768,11 +5769,11 @@ bool UnitTestImpl::RunAllTests() { #if GTEST_HAS_DEATH_TEST in_subprocess_for_death_test = (internal_run_death_test_flag_.get() != nullptr); -# if defined(GTEST_EXTRA_DEATH_TEST_CHILD_SETUP_) +#if defined(GTEST_EXTRA_DEATH_TEST_CHILD_SETUP_) if (in_subprocess_for_death_test) { GTEST_EXTRA_DEATH_TEST_CHILD_SETUP_(); } -# endif // defined(GTEST_EXTRA_DEATH_TEST_CHILD_SETUP_) +#endif // defined(GTEST_EXTRA_DEATH_TEST_CHILD_SETUP_) #endif // GTEST_HAS_DEATH_TEST const bool should_shard = ShouldShard(kTestTotalShards, kTestShardIndex, @@ -5780,19 +5781,18 @@ bool UnitTestImpl::RunAllTests() { // Compares the full test names with the filter to decide which // tests to run. - const bool has_tests_to_run = FilterTests(should_shard - ? HONOR_SHARDING_PROTOCOL - : IGNORE_SHARDING_PROTOCOL) > 0; + const bool has_tests_to_run = + FilterTests(should_shard ? HONOR_SHARDING_PROTOCOL + : IGNORE_SHARDING_PROTOCOL) > 0; // Lists the tests and exits if the --gtest_list_tests flag was specified. - if (GTEST_FLAG(list_tests)) { + if (GTEST_FLAG_GET(list_tests)) { // This must be called *after* FilterTests() has been called. ListTestsMatchingFilter(); return true; } - random_seed_ = GTEST_FLAG(shuffle) ? - GetRandomSeedFromFlag(GTEST_FLAG(random_seed)) : 0; + random_seed_ = GetRandomSeedFromFlag(GTEST_FLAG_GET(random_seed)); // True if and only if at least one test has failed. bool failed = false; @@ -5804,9 +5804,21 @@ bool UnitTestImpl::RunAllTests() { // How many times to repeat the tests? We don't want to repeat them // when we are inside the subprocess of a death test. - const int repeat = in_subprocess_for_death_test ? 1 : GTEST_FLAG(repeat); + const int repeat = in_subprocess_for_death_test ? 1 : GTEST_FLAG_GET(repeat); + // Repeats forever if the repeat count is negative. const bool gtest_repeat_forever = repeat < 0; + + // Should test environments be set up and torn down for each repeat, or only + // set up on the first and torn down on the last iteration? If there is no + // "last" iteration because the tests will repeat forever, always recreate the + // environments to avoid leaks in case one of the environments is using + // resources that are external to this process. Without this check there would + // be no way to clean up those external resources automatically. + const bool recreate_environments_when_repeating = + GTEST_FLAG_GET(recreate_environments_when_repeating) || + gtest_repeat_forever; + for (int i = 0; gtest_repeat_forever || i != repeat; i++) { // We want to preserve failures generated by ad-hoc test // assertions executed before RUN_ALL_TESTS(). @@ -5815,7 +5827,7 @@ bool UnitTestImpl::RunAllTests() { Timer timer; // Shuffles test suites and tests if requested. - if (has_tests_to_run && GTEST_FLAG(shuffle)) { + if (has_tests_to_run && GTEST_FLAG_GET(shuffle)) { random()->Reseed(static_cast(random_seed_)); // This should be done before calling OnTestIterationStart(), // such that a test event listener can see the actual test order @@ -5828,10 +5840,13 @@ bool UnitTestImpl::RunAllTests() { // Runs each test suite if there is at least one test to run. if (has_tests_to_run) { - // Sets up all environments beforehand. - repeater->OnEnvironmentsSetUpStart(*parent_); - ForEach(environments_, SetUpEnvironment); - repeater->OnEnvironmentsSetUpEnd(*parent_); + // Sets up all environments beforehand. If test environments aren't + // recreated for each iteration, only do so on the first iteration. + if (i == 0 || recreate_environments_when_repeating) { + repeater->OnEnvironmentsSetUpStart(*parent_); + ForEach(environments_, SetUpEnvironment); + repeater->OnEnvironmentsSetUpEnd(*parent_); + } // Runs the tests only if there was no fatal failure or skip triggered // during global set-up. @@ -5853,7 +5868,7 @@ bool UnitTestImpl::RunAllTests() { for (int test_index = 0; test_index < total_test_suite_count(); test_index++) { GetMutableSuiteCase(test_index)->Run(); - if (GTEST_FLAG(fail_fast) && + if (GTEST_FLAG_GET(fail_fast) && GetMutableSuiteCase(test_index)->Failed()) { for (int j = test_index + 1; j < total_test_suite_count(); j++) { GetMutableSuiteCase(j)->Skip(); @@ -5871,11 +5886,15 @@ bool UnitTestImpl::RunAllTests() { } } - // Tears down all environments in reverse order afterwards. - repeater->OnEnvironmentsTearDownStart(*parent_); - std::for_each(environments_.rbegin(), environments_.rend(), - TearDownEnvironment); - repeater->OnEnvironmentsTearDownEnd(*parent_); + // Tears down all environments in reverse order afterwards. If test + // environments aren't recreated for each iteration, only do so on the + // last iteration. + if (i == repeat - 1 || recreate_environments_when_repeating) { + repeater->OnEnvironmentsTearDownStart(*parent_); + std::for_each(environments_.rbegin(), environments_.rend(), + TearDownEnvironment); + repeater->OnEnvironmentsTearDownEnd(*parent_); + } } elapsed_time_ = timer.Elapsed(); @@ -5896,7 +5915,7 @@ bool UnitTestImpl::RunAllTests() { // (it's always safe to unshuffle the tests). UnshuffleTests(); - if (GTEST_FLAG(shuffle)) { + if (GTEST_FLAG_GET(shuffle)) { // Picks a new random seed for each iteration. random_seed_ = GetNextRandomSeed(random_seed_); } @@ -5947,8 +5966,7 @@ void WriteToShardStatusFileIfNeeded() { // an error and exits. If in_subprocess_for_death_test, sharding is // disabled because it must only be applied to the original test // process. Otherwise, we could filter out death tests we intended to execute. -bool ShouldShard(const char* total_shards_env, - const char* shard_index_env, +bool ShouldShard(const char* total_shards_env, const char* shard_index_env, bool in_subprocess_for_death_test) { if (in_subprocess_for_death_test) { return false; @@ -5960,27 +5978,27 @@ bool ShouldShard(const char* total_shards_env, if (total_shards == -1 && shard_index == -1) { return false; } else if (total_shards == -1 && shard_index != -1) { - const Message msg = Message() - << "Invalid environment variables: you have " - << kTestShardIndex << " = " << shard_index - << ", but have left " << kTestTotalShards << " unset.\n"; + const Message msg = Message() << "Invalid environment variables: you have " + << kTestShardIndex << " = " << shard_index + << ", but have left " << kTestTotalShards + << " unset.\n"; ColoredPrintf(GTestColor::kRed, "%s", msg.GetString().c_str()); fflush(stdout); exit(EXIT_FAILURE); } else if (total_shards != -1 && shard_index == -1) { const Message msg = Message() - << "Invalid environment variables: you have " - << kTestTotalShards << " = " << total_shards - << ", but have left " << kTestShardIndex << " unset.\n"; + << "Invalid environment variables: you have " + << kTestTotalShards << " = " << total_shards + << ", but have left " << kTestShardIndex << " unset.\n"; ColoredPrintf(GTestColor::kRed, "%s", msg.GetString().c_str()); fflush(stdout); exit(EXIT_FAILURE); } else if (shard_index < 0 || shard_index >= total_shards) { - const Message msg = Message() - << "Invalid environment variables: we require 0 <= " - << kTestShardIndex << " < " << kTestTotalShards - << ", but you have " << kTestShardIndex << "=" << shard_index - << ", " << kTestTotalShards << "=" << total_shards << ".\n"; + const Message msg = + Message() << "Invalid environment variables: we require 0 <= " + << kTestShardIndex << " < " << kTestTotalShards + << ", but you have " << kTestShardIndex << "=" << shard_index + << ", " << kTestTotalShards << "=" << total_shards << ".\n"; ColoredPrintf(GTestColor::kRed, "%s", msg.GetString().c_str()); fflush(stdout); exit(EXIT_FAILURE); @@ -6022,11 +6040,16 @@ bool ShouldRunTestOnShard(int total_shards, int shard_index, int test_id) { // https://github.com/google/googletest/blob/master/googletest/docs/advanced.md // . Returns the number of tests that should run. int UnitTestImpl::FilterTests(ReactionToSharding shard_tests) { - const int32_t total_shards = shard_tests == HONOR_SHARDING_PROTOCOL ? - Int32FromEnvOrDie(kTestTotalShards, -1) : -1; - const int32_t shard_index = shard_tests == HONOR_SHARDING_PROTOCOL ? - Int32FromEnvOrDie(kTestShardIndex, -1) : -1; - + const int32_t total_shards = shard_tests == HONOR_SHARDING_PROTOCOL + ? Int32FromEnvOrDie(kTestTotalShards, -1) + : -1; + const int32_t shard_index = shard_tests == HONOR_SHARDING_PROTOCOL + ? Int32FromEnvOrDie(kTestShardIndex, -1) + : -1; + + const PositiveAndNegativeUnitTestFilter gtest_flag_filter( + GTEST_FLAG_GET(filter)); + const UnitTestFilter disable_test_filter(kDisableTestFilter); // num_runnable_tests are the number of tests that will // run across all shards (i.e., match filter and are not disabled). // num_selected_tests are the number of tests to be run on @@ -6042,18 +6065,17 @@ int UnitTestImpl::FilterTests(ReactionToSharding shard_tests) { const std::string test_name(test_info->name()); // A test is disabled if test suite name or test name matches // kDisableTestFilter. - const bool is_disabled = internal::UnitTestOptions::MatchesFilter( - test_suite_name, kDisableTestFilter) || - internal::UnitTestOptions::MatchesFilter( - test_name, kDisableTestFilter); + const bool is_disabled = + disable_test_filter.MatchesName(test_suite_name) || + disable_test_filter.MatchesName(test_name); test_info->is_disabled_ = is_disabled; - const bool matches_filter = internal::UnitTestOptions::FilterMatchesTest( - test_suite_name, test_name); + const bool matches_filter = + gtest_flag_filter.MatchesTest(test_suite_name, test_name); test_info->matches_filter_ = matches_filter; const bool is_runnable = - (GTEST_FLAG(also_run_disabled_tests) || !is_disabled) && + (GTEST_FLAG_GET(also_run_disabled_tests) || !is_disabled) && matches_filter; const bool is_in_another_shard = @@ -6222,8 +6244,8 @@ void UnitTestImpl::UnshuffleTests() { // For example, if Foo() calls Bar(), which in turn calls // GetCurrentOsStackTraceExceptTop(..., 1), Foo() will be included in // the trace but Bar() and GetCurrentOsStackTraceExceptTop() won't. -std::string GetCurrentOsStackTraceExceptTop(UnitTest* /*unit_test*/, - int skip_count) { +GTEST_NO_INLINE_ GTEST_NO_TAIL_CALL_ std::string +GetCurrentOsStackTraceExceptTop(UnitTest* /*unit_test*/, int skip_count) { // We pass skip_count + 1 to skip this wrapper function in addition // to what the user really wants to skip. return GetUnitTestImpl()->CurrentOsStackTraceExceptTop(skip_count + 1); @@ -6233,7 +6255,7 @@ std::string GetCurrentOsStackTraceExceptTop(UnitTest* /*unit_test*/, // suppress unreachable code warnings. namespace { class ClassUniqueToAlwaysTrue {}; -} +} // namespace bool IsTrue(bool condition) { return condition; } @@ -6241,8 +6263,7 @@ bool AlwaysTrue() { #if GTEST_HAS_EXCEPTIONS // This condition is always false so AlwaysTrue() never actually throws, // but it makes the compiler think that it may throw. - if (IsTrue(false)) - throw ClassUniqueToAlwaysTrue(); + if (IsTrue(false)) throw ClassUniqueToAlwaysTrue(); #endif // GTEST_HAS_EXCEPTIONS return true; } @@ -6264,13 +6285,14 @@ bool SkipPrefix(const char* prefix, const char** pstr) { // part can be omitted. // // Returns the value of the flag, or NULL if the parsing failed. -static const char* ParseFlagValue(const char* str, const char* flag, +static const char* ParseFlagValue(const char* str, const char* flag_name, bool def_optional) { // str and flag must not be NULL. - if (str == nullptr || flag == nullptr) return nullptr; + if (str == nullptr || flag_name == nullptr) return nullptr; // The flag must start with "--" followed by GTEST_FLAG_PREFIX_. - const std::string flag_str = std::string("--") + GTEST_FLAG_PREFIX_ + flag; + const std::string flag_str = + std::string("--") + GTEST_FLAG_PREFIX_ + flag_name; const size_t flag_len = flag_str.length(); if (strncmp(str, flag_str.c_str(), flag_len) != 0) return nullptr; @@ -6301,9 +6323,9 @@ static const char* ParseFlagValue(const char* str, const char* flag, // // On success, stores the value of the flag in *value, and returns // true. On failure, returns false without changing *value. -static bool ParseBoolFlag(const char* str, const char* flag, bool* value) { +static bool ParseFlag(const char* str, const char* flag_name, bool* value) { // Gets the value of the flag as a string. - const char* const value_str = ParseFlagValue(str, flag, true); + const char* const value_str = ParseFlagValue(str, flag_name, true); // Aborts if the parsing failed. if (value_str == nullptr) return false; @@ -6317,16 +6339,16 @@ static bool ParseBoolFlag(const char* str, const char* flag, bool* value) { // // On success, stores the value of the flag in *value, and returns // true. On failure, returns false without changing *value. -bool ParseInt32Flag(const char* str, const char* flag, int32_t* value) { +bool ParseFlag(const char* str, const char* flag_name, int32_t* value) { // Gets the value of the flag as a string. - const char* const value_str = ParseFlagValue(str, flag, false); + const char* const value_str = ParseFlagValue(str, flag_name, false); // Aborts if the parsing failed. if (value_str == nullptr) return false; // Sets *value to the value of the flag. - return ParseInt32(Message() << "The value of flag --" << flag, - value_str, value); + return ParseInt32(Message() << "The value of flag --" << flag_name, value_str, + value); } // Parses a string for a string flag, in the form of "--flag=value". @@ -6334,9 +6356,9 @@ bool ParseInt32Flag(const char* str, const char* flag, int32_t* value) { // On success, stores the value of the flag in *value, and returns // true. On failure, returns false without changing *value. template -static bool ParseStringFlag(const char* str, const char* flag, String* value) { +static bool ParseFlag(const char* str, const char* flag_name, String* value) { // Gets the value of the flag as a string. - const char* const value_str = ParseFlagValue(str, flag, false); + const char* const value_str = ParseFlagValue(str, flag_name, false); // Aborts if the parsing failed. if (value_str == nullptr) return false; @@ -6353,8 +6375,7 @@ static bool ParseStringFlag(const char* str, const char* flag, String* value) { // GTEST_INTERNAL_PREFIX_ followed by "internal_" are considered Google Test // internal flags and do not trigger the help message. static bool HasGoogleTestFlagPrefix(const char* str) { - return (SkipPrefix("--", &str) || - SkipPrefix("-", &str) || + return (SkipPrefix("--", &str) || SkipPrefix("-", &str) || SkipPrefix("/", &str)) && !SkipPrefix(GTEST_FLAG_PREFIX_ "internal_", &str) && (SkipPrefix(GTEST_FLAG_PREFIX_, &str) || @@ -6437,6 +6458,10 @@ static const char kColorEncodedHelpMessage[] = "random_seed=@Y[NUMBER]@D\n" " Random number seed to use for shuffling test orders (between 1 and\n" " 99999, or 0 to use a seed based on the current time).\n" + " @G--" GTEST_FLAG_PREFIX_ + "recreate_environments_when_repeating@D\n" + " Sets up and tears down the global test environment on each repeat\n" + " of the test.\n" "\n" "Test Output:\n" " @G--" GTEST_FLAG_PREFIX_ @@ -6454,18 +6479,18 @@ static const char kColorEncodedHelpMessage[] = " Generate a JSON or XML report in the given directory or with the " "given\n" " file name. @YFILE_PATH@D defaults to @Gtest_detail.xml@D.\n" -# if GTEST_CAN_STREAM_RESULTS_ +#if GTEST_CAN_STREAM_RESULTS_ " @G--" GTEST_FLAG_PREFIX_ "stream_result_to=@YHOST@G:@YPORT@D\n" " Stream test results to the given server.\n" -# endif // GTEST_CAN_STREAM_RESULTS_ +#endif // GTEST_CAN_STREAM_RESULTS_ "\n" "Assertion Behavior:\n" -# if GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS +#if GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS " @G--" GTEST_FLAG_PREFIX_ "death_test_style=@Y(@Gfast@Y|@Gthreadsafe@Y)@D\n" " Set the default death test style.\n" -# endif // GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS +#endif // GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS " @G--" GTEST_FLAG_PREFIX_ "break_on_failure@D\n" " Turn assertion failures into debugger break-points.\n" @@ -6497,41 +6522,44 @@ static const char kColorEncodedHelpMessage[] = "@G<" GTEST_DEV_EMAIL_ ">@D.\n"; static bool ParseGoogleTestFlag(const char* const arg) { - return ParseBoolFlag(arg, kAlsoRunDisabledTestsFlag, - >EST_FLAG(also_run_disabled_tests)) || - ParseBoolFlag(arg, kBreakOnFailureFlag, - >EST_FLAG(break_on_failure)) || - ParseBoolFlag(arg, kCatchExceptionsFlag, - >EST_FLAG(catch_exceptions)) || - ParseStringFlag(arg, kColorFlag, >EST_FLAG(color)) || - ParseStringFlag(arg, kDeathTestStyleFlag, - >EST_FLAG(death_test_style)) || - ParseBoolFlag(arg, kDeathTestUseFork, - >EST_FLAG(death_test_use_fork)) || - ParseBoolFlag(arg, kFailFast, >EST_FLAG(fail_fast)) || - ParseStringFlag(arg, kFilterFlag, >EST_FLAG(filter)) || - ParseStringFlag(arg, kInternalRunDeathTestFlag, - >EST_FLAG(internal_run_death_test)) || - ParseBoolFlag(arg, kListTestsFlag, >EST_FLAG(list_tests)) || - ParseStringFlag(arg, kOutputFlag, >EST_FLAG(output)) || - ParseBoolFlag(arg, kBriefFlag, >EST_FLAG(brief)) || - ParseBoolFlag(arg, kPrintTimeFlag, >EST_FLAG(print_time)) || - ParseBoolFlag(arg, kPrintUTF8Flag, >EST_FLAG(print_utf8)) || - ParseInt32Flag(arg, kRandomSeedFlag, >EST_FLAG(random_seed)) || - ParseInt32Flag(arg, kRepeatFlag, >EST_FLAG(repeat)) || - ParseBoolFlag(arg, kShuffleFlag, >EST_FLAG(shuffle)) || - ParseInt32Flag(arg, kStackTraceDepthFlag, - >EST_FLAG(stack_trace_depth)) || - ParseStringFlag(arg, kStreamResultToFlag, - >EST_FLAG(stream_result_to)) || - ParseBoolFlag(arg, kThrowOnFailureFlag, >EST_FLAG(throw_on_failure)); +#define GTEST_INTERNAL_PARSE_FLAG(flag_name) \ + do { \ + auto value = GTEST_FLAG_GET(flag_name); \ + if (ParseFlag(arg, #flag_name, &value)) { \ + GTEST_FLAG_SET(flag_name, value); \ + return true; \ + } \ + } while (false) + + GTEST_INTERNAL_PARSE_FLAG(also_run_disabled_tests); + GTEST_INTERNAL_PARSE_FLAG(break_on_failure); + GTEST_INTERNAL_PARSE_FLAG(catch_exceptions); + GTEST_INTERNAL_PARSE_FLAG(color); + GTEST_INTERNAL_PARSE_FLAG(death_test_style); + GTEST_INTERNAL_PARSE_FLAG(death_test_use_fork); + GTEST_INTERNAL_PARSE_FLAG(fail_fast); + GTEST_INTERNAL_PARSE_FLAG(filter); + GTEST_INTERNAL_PARSE_FLAG(internal_run_death_test); + GTEST_INTERNAL_PARSE_FLAG(list_tests); + GTEST_INTERNAL_PARSE_FLAG(output); + GTEST_INTERNAL_PARSE_FLAG(brief); + GTEST_INTERNAL_PARSE_FLAG(print_time); + GTEST_INTERNAL_PARSE_FLAG(print_utf8); + GTEST_INTERNAL_PARSE_FLAG(random_seed); + GTEST_INTERNAL_PARSE_FLAG(repeat); + GTEST_INTERNAL_PARSE_FLAG(recreate_environments_when_repeating); + GTEST_INTERNAL_PARSE_FLAG(shuffle); + GTEST_INTERNAL_PARSE_FLAG(stack_trace_depth); + GTEST_INTERNAL_PARSE_FLAG(stream_result_to); + GTEST_INTERNAL_PARSE_FLAG(throw_on_failure); + return false; } #if GTEST_USE_OWN_FLAGFILE_FLAG_ static void LoadFlagsFromFile(const std::string& path) { FILE* flagfile = posix::FOpen(path.c_str(), "r"); if (!flagfile) { - GTEST_LOG_(FATAL) << "Unable to open file \"" << GTEST_FLAG(flagfile) + GTEST_LOG_(FATAL) << "Unable to open file \"" << GTEST_FLAG_GET(flagfile) << "\""; } std::string contents(ReadEntireFile(flagfile)); @@ -6539,10 +6567,8 @@ static void LoadFlagsFromFile(const std::string& path) { std::vector lines; SplitString(contents, '\n', &lines); for (size_t i = 0; i < lines.size(); ++i) { - if (lines[i].empty()) - continue; - if (!ParseGoogleTestFlag(lines[i].c_str())) - g_help_flag = true; + if (lines[i].empty()) continue; + if (!ParseGoogleTestFlag(lines[i].c_str())) g_help_flag = true; } } #endif // GTEST_USE_OWN_FLAGFILE_FLAG_ @@ -6552,25 +6578,23 @@ static void LoadFlagsFromFile(const std::string& path) { // instantiated to either char or wchar_t. template void ParseGoogleTestFlagsOnlyImpl(int* argc, CharType** argv) { + std::string flagfile_value; for (int i = 1; i < *argc; i++) { const std::string arg_string = StreamableToString(argv[i]); const char* const arg = arg_string.c_str(); - using internal::ParseBoolFlag; - using internal::ParseInt32Flag; - using internal::ParseStringFlag; + using internal::ParseFlag; bool remove_flag = false; if (ParseGoogleTestFlag(arg)) { remove_flag = true; #if GTEST_USE_OWN_FLAGFILE_FLAG_ - } else if (ParseStringFlag(arg, kFlagfileFlag, >EST_FLAG(flagfile))) { - LoadFlagsFromFile(GTEST_FLAG(flagfile)); + } else if (ParseFlag(arg, "flagfile", &flagfile_value)) { + GTEST_FLAG_SET(flagfile, flagfile_value); + LoadFlagsFromFile(flagfile_value); remove_flag = true; #endif // GTEST_USE_OWN_FLAGFILE_FLAG_ - } else if (arg_string == "--help" || arg_string == "-h" || - arg_string == "-?" || arg_string == "/?" || - HasGoogleTestFlagPrefix(arg)) { + } else if (arg_string == "--help" || HasGoogleTestFlagPrefix(arg)) { // Both help flag and unrecognized Google Test flags (excluding // internal ones) trigger help display. g_help_flag = true; @@ -6605,7 +6629,27 @@ void ParseGoogleTestFlagsOnlyImpl(int* argc, CharType** argv) { // Parses the command line for Google Test flags, without initializing // other parts of Google Test. void ParseGoogleTestFlagsOnly(int* argc, char** argv) { +#if GTEST_HAS_ABSL + if (*argc > 0) { + // absl::ParseCommandLine() requires *argc > 0. + auto positional_args = absl::flags_internal::ParseCommandLineImpl( + *argc, argv, absl::flags_internal::ArgvListAction::kRemoveParsedArgs, + absl::flags_internal::UsageFlagsAction::kHandleUsage, + absl::flags_internal::OnUndefinedFlag::kReportUndefined); + // Any command-line positional arguments not part of any command-line flag + // (or arguments to a flag) are copied back out to argv, with the program + // invocation name at position 0, and argc is resized. This includes + // positional arguments after the flag-terminating delimiter '--'. + // See https://abseil.io/docs/cpp/guides/flags. + std::copy(positional_args.begin(), positional_args.end(), argv); + if (static_cast(positional_args.size()) < *argc) { + argv[positional_args.size()] = nullptr; + *argc = static_cast(positional_args.size()); + } + } +#else ParseGoogleTestFlagsOnlyImpl(argc, argv); +#endif // Fix the value of *_NSGetArgc() on macOS, but if and only if // *_NSGetArgv() == argv @@ -6640,6 +6684,12 @@ void InitGoogleTestImpl(int* argc, CharType** argv) { #if GTEST_HAS_ABSL absl::InitializeSymbolizer(g_argvs[0].c_str()); + + // When using the Abseil Flags library, set the program usage message to the + // help message, but remove the color-encoding from the message first. + absl::SetProgramUsageMessage(absl::StrReplaceAll( + kColorEncodedHelpMessage, + {{"@D", ""}, {"@R", ""}, {"@G", ""}, {"@Y", ""}, {"@@", "@"}})); #endif // GTEST_HAS_ABSL ParseGoogleTestFlagsOnly(argc, argv); @@ -6660,7 +6710,7 @@ void InitGoogleTestImpl(int* argc, CharType** argv) { void InitGoogleTest(int* argc, char** argv) { #if defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_) GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_(argc, argv); -#else // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_) +#else // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_) internal::InitGoogleTestImpl(argc, argv); #endif // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_) } @@ -6670,7 +6720,7 @@ void InitGoogleTest(int* argc, char** argv) { void InitGoogleTest(int* argc, wchar_t** argv) { #if defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_) GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_(argc, argv); -#else // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_) +#else // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_) internal::InitGoogleTestImpl(argc, argv); #endif // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_) } @@ -6686,42 +6736,42 @@ void InitGoogleTest() { #if defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_) GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_(&argc, argv); -#else // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_) +#else // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_) internal::InitGoogleTestImpl(&argc, argv); #endif // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_) } +#if !defined(GTEST_CUSTOM_TEMPDIR_FUNCTION_) +// Return value of first environment variable that is set and contains +// a non-empty string. If there are none, return the "fallback" string. +// Since we like the temporary directory to have a directory separator suffix, +// add it if not provided in the environment variable value. +static std::string GetTempDirFromEnv( + std::initializer_list environment_variables, + const char* fallback, char separator) { + for (const char* variable_name : environment_variables) { + const char* value = internal::posix::GetEnv(variable_name); + if (value != nullptr && value[0] != '\0') { + if (value[strlen(value) - 1] != separator) { + return std::string(value).append(1, separator); + } + return value; + } + } + return fallback; +} +#endif + std::string TempDir() { #if defined(GTEST_CUSTOM_TEMPDIR_FUNCTION_) return GTEST_CUSTOM_TEMPDIR_FUNCTION_(); -#elif GTEST_OS_WINDOWS_MOBILE - return "\\temp\\"; -#elif GTEST_OS_WINDOWS - const char* temp_dir = internal::posix::GetEnv("TEMP"); - if (temp_dir == nullptr || temp_dir[0] == '\0') { - return "\\temp\\"; - } else if (temp_dir[strlen(temp_dir) - 1] == '\\') { - return temp_dir; - } else { - return std::string(temp_dir) + "\\"; - } +#elif GTEST_OS_WINDOWS || GTEST_OS_WINDOWS_MOBILE + return GetTempDirFromEnv({"TEST_TMPDIR", "TEMP"}, "\\temp\\", '\\'); #elif GTEST_OS_LINUX_ANDROID - const char* temp_dir = internal::posix::GetEnv("TEST_TMPDIR"); - if (temp_dir == nullptr || temp_dir[0] == '\0') { - return "/data/local/tmp/"; - } else { - return temp_dir; - } -#elif GTEST_OS_LINUX - const char* temp_dir = internal::posix::GetEnv("TEST_TMPDIR"); - if (temp_dir == nullptr || temp_dir[0] == '\0') { - return "/tmp/"; - } else { - return temp_dir; - } + return GetTempDirFromEnv({"TEST_TMPDIR", "TMPDIR"}, "/data/local/tmp/", '/'); #else - return "/tmp/"; -#endif // GTEST_OS_WINDOWS_MOBILE + return GetTempDirFromEnv({"TEST_TMPDIR", "TMPDIR"}, "/tmp/", '/'); +#endif } // Class ScopedTrace @@ -6738,8 +6788,7 @@ void ScopedTrace::PushTrace(const char* file, int line, std::string message) { } // Pops the info pushed by the c'tor. -ScopedTrace::~ScopedTrace() - GTEST_LOCK_EXCLUDED_(&UnitTest::mutex_) { +ScopedTrace::~ScopedTrace() GTEST_LOCK_EXCLUDED_(&UnitTest::mutex_) { UnitTest::GetInstance()->PopGTestTrace(); } diff --git a/third_party/googletest/src/src/gtest_main.cc b/third_party/googletest/src/src/gtest_main.cc index 46b27c3d7d..44976375c9 100644 --- a/third_party/googletest/src/src/gtest_main.cc +++ b/third_party/googletest/src/src/gtest_main.cc @@ -28,15 +28,14 @@ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include + #include "gtest/gtest.h" #if GTEST_OS_ESP8266 || GTEST_OS_ESP32 #if GTEST_OS_ESP8266 extern "C" { #endif -void setup() { - testing::InitGoogleTest(); -} +void setup() { testing::InitGoogleTest(); } void loop() { RUN_ALL_TESTS(); } From fd615b4348822f4ba77e3d5731018cf229110b93 Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 27 May 2022 21:53:49 -0700 Subject: [PATCH 0503/1000] CHECK_MEM_ERROR: add an assert for a valid jmp target callers of CHECK_MEM_ERROR() expect failures to not return tested with: configure --enable-debug --enable-vp9-postproc --enable-postproc \ --enable-multi-res-encoding --enable-vp9-temporal-denoising \ --enable-error-concealment --enable-internal-stats has unrelated assertion failures currently Change-Id: Ic12073b1ae80a6f434f14d24f652e64d30f63eea --- vp8/decoder/onyxd_int.h | 4 ++++ vp8/encoder/onyx_int.h | 4 ++++ vp9/common/vp9_common.h | 2 ++ 3 files changed, 10 insertions(+) diff --git a/vp8/decoder/onyxd_int.h b/vp8/decoder/onyxd_int.h index cf2c066d9b..a6bedc4faf 100644 --- a/vp8/decoder/onyxd_int.h +++ b/vp8/decoder/onyxd_int.h @@ -11,6 +11,8 @@ #ifndef VPX_VP8_DECODER_ONYXD_INT_H_ #define VPX_VP8_DECODER_ONYXD_INT_H_ +#include + #include "vpx_config.h" #include "vp8/common/onyxd.h" #include "treereader.h" @@ -136,6 +138,7 @@ int vp8_remove_decoder_instances(struct frame_buffers *fb); #if CONFIG_DEBUG #define CHECK_MEM_ERROR(lval, expr) \ do { \ + assert(pbi->common.error.setjmp); \ (lval) = (expr); \ if (!(lval)) \ vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR, \ @@ -145,6 +148,7 @@ int vp8_remove_decoder_instances(struct frame_buffers *fb); #else #define CHECK_MEM_ERROR(lval, expr) \ do { \ + assert(pbi->common.error.setjmp); \ (lval) = (expr); \ if (!(lval)) \ vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR, \ diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h index 726dcc9466..7951f0a77e 100644 --- a/vp8/encoder/onyx_int.h +++ b/vp8/encoder/onyx_int.h @@ -11,7 +11,9 @@ #ifndef VPX_VP8_ENCODER_ONYX_INT_H_ #define VPX_VP8_ENCODER_ONYX_INT_H_ +#include #include + #include "vpx_config.h" #include "vp8/common/onyx.h" #include "treewriter.h" @@ -730,6 +732,7 @@ void vp8_set_speed_features(VP8_COMP *cpi); #if CONFIG_DEBUG #define CHECK_MEM_ERROR(lval, expr) \ do { \ + assert(cpi->common.error.setjmp); \ (lval) = (expr); \ if (!(lval)) \ vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR, \ @@ -739,6 +742,7 @@ void vp8_set_speed_features(VP8_COMP *cpi); #else #define CHECK_MEM_ERROR(lval, expr) \ do { \ + assert(cpi->common.error.setjmp); \ (lval) = (expr); \ if (!(lval)) \ vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR, \ diff --git a/vp9/common/vp9_common.h b/vp9/common/vp9_common.h index 3cec53bfd8..8d2bed38e5 100644 --- a/vp9/common/vp9_common.h +++ b/vp9/common/vp9_common.h @@ -49,6 +49,7 @@ static INLINE int get_unsigned_bits(unsigned int num_values) { #if CONFIG_DEBUG #define CHECK_MEM_ERROR(cm, lval, expr) \ do { \ + assert(&(cm)->error.setjmp); \ (lval) = (expr); \ if (!(lval)) \ vpx_internal_error(&(cm)->error, VPX_CODEC_MEM_ERROR, \ @@ -58,6 +59,7 @@ static INLINE int get_unsigned_bits(unsigned int num_values) { #else #define CHECK_MEM_ERROR(cm, lval, expr) \ do { \ + assert(&(cm)->error.setjmp); \ (lval) = (expr); \ if (!(lval)) \ vpx_internal_error(&(cm)->error, VPX_CODEC_MEM_ERROR, \ From b39722f851f5505b2fa61f47694f307c9a09794b Mon Sep 17 00:00:00 2001 From: Scott LaVarnway Date: Mon, 12 Sep 2022 07:40:39 -0700 Subject: [PATCH 0504/1000] Add vpx_highbd_sad32x{64,32,16}x4d_avx2. ~2.4x faster than the sse2 version. Bug: b/245917257 Change-Id: I6df2bd62b46e5e175c8ad80daa6de3a1c313db0f --- test/sad_test.cc | 9 +++ vpx_dsp/vpx_dsp_rtcd_defs.pl | 6 +- vpx_dsp/x86/highbd_sad4d_avx2.c | 113 ++++++++++++++++++++++++++++++++ 3 files changed, 125 insertions(+), 3 deletions(-) diff --git a/test/sad_test.cc b/test/sad_test.cc index 92c9e6332a..b3ad96ab8c 100644 --- a/test/sad_test.cc +++ b/test/sad_test.cc @@ -1080,12 +1080,21 @@ const SadMxNx4Param x4d_avx2_tests[] = { SadMxNx4Param(64, 64, &vpx_sad64x64x4d_avx2), SadMxNx4Param(32, 32, &vpx_sad32x32x4d_avx2), #if CONFIG_VP9_HIGHBITDEPTH + SadMxNx4Param(32, 64, &vpx_highbd_sad32x64x4d_avx2, 8), + SadMxNx4Param(32, 32, &vpx_highbd_sad32x32x4d_avx2, 8), + SadMxNx4Param(32, 16, &vpx_highbd_sad32x16x4d_avx2, 8), SadMxNx4Param(16, 32, &vpx_highbd_sad16x32x4d_avx2, 8), SadMxNx4Param(16, 16, &vpx_highbd_sad16x16x4d_avx2, 8), SadMxNx4Param(16, 8, &vpx_highbd_sad16x8x4d_avx2, 8), + SadMxNx4Param(32, 64, &vpx_highbd_sad32x64x4d_avx2, 10), + SadMxNx4Param(32, 32, &vpx_highbd_sad32x32x4d_avx2, 10), + SadMxNx4Param(32, 16, &vpx_highbd_sad32x16x4d_avx2, 10), SadMxNx4Param(16, 32, &vpx_highbd_sad16x32x4d_avx2, 10), SadMxNx4Param(16, 16, &vpx_highbd_sad16x16x4d_avx2, 10), SadMxNx4Param(16, 8, &vpx_highbd_sad16x8x4d_avx2, 10), + SadMxNx4Param(32, 64, &vpx_highbd_sad32x64x4d_avx2, 12), + SadMxNx4Param(32, 32, &vpx_highbd_sad32x32x4d_avx2, 12), + SadMxNx4Param(32, 16, &vpx_highbd_sad32x16x4d_avx2, 12), SadMxNx4Param(16, 32, &vpx_highbd_sad16x32x4d_avx2, 12), SadMxNx4Param(16, 16, &vpx_highbd_sad16x16x4d_avx2, 12), SadMxNx4Param(16, 8, &vpx_highbd_sad16x8x4d_avx2, 12), diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index df2c8da74e..f5f5f9dd65 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -1039,13 +1039,13 @@ () specialize qw/vpx_highbd_sad64x32x4d sse2 neon/; add_proto qw/void vpx_highbd_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; - specialize qw/vpx_highbd_sad32x64x4d sse2 neon/; + specialize qw/vpx_highbd_sad32x64x4d sse2 neon avx2/; add_proto qw/void vpx_highbd_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; - specialize qw/vpx_highbd_sad32x32x4d sse2 neon/; + specialize qw/vpx_highbd_sad32x32x4d sse2 neon avx2/; add_proto qw/void vpx_highbd_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; - specialize qw/vpx_highbd_sad32x16x4d sse2 neon/; + specialize qw/vpx_highbd_sad32x16x4d sse2 neon avx2/; add_proto qw/void vpx_highbd_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad16x32x4d sse2 neon avx2/; diff --git a/vpx_dsp/x86/highbd_sad4d_avx2.c b/vpx_dsp/x86/highbd_sad4d_avx2.c index 46c7e4fbc8..3384694f39 100644 --- a/vpx_dsp/x86/highbd_sad4d_avx2.c +++ b/vpx_dsp/x86/highbd_sad4d_avx2.c @@ -21,6 +21,119 @@ static VPX_FORCE_INLINE void calc_final_4(const __m256i *const sums /*[4]*/, _mm_storeu_si128((__m128i *)sad_array, sum); } +static VPX_FORCE_INLINE void highbd_sad32xHx4d(__m256i *sums_16 /*[4]*/, + const uint16_t *src, + int src_stride, + uint16_t *refs[4], + int ref_stride, int height) { + int i; + for (i = 0; i < height; i++) { + __m256i r[8]; + + // load src and all ref[] + const __m256i s = _mm256_load_si256((const __m256i *)src); + const __m256i s2 = _mm256_load_si256((const __m256i *)(src + 16)); + r[0] = _mm256_loadu_si256((const __m256i *)refs[0]); + r[1] = _mm256_loadu_si256((const __m256i *)(refs[0] + 16)); + r[2] = _mm256_loadu_si256((const __m256i *)refs[1]); + r[3] = _mm256_loadu_si256((const __m256i *)(refs[1] + 16)); + r[4] = _mm256_loadu_si256((const __m256i *)refs[2]); + r[5] = _mm256_loadu_si256((const __m256i *)(refs[2] + 16)); + r[6] = _mm256_loadu_si256((const __m256i *)refs[3]); + r[7] = _mm256_loadu_si256((const __m256i *)(refs[3] + 16)); + + // absolute differences between every ref[] to src + r[0] = _mm256_abs_epi16(_mm256_sub_epi16(r[0], s)); + r[1] = _mm256_abs_epi16(_mm256_sub_epi16(r[1], s2)); + r[2] = _mm256_abs_epi16(_mm256_sub_epi16(r[2], s)); + r[3] = _mm256_abs_epi16(_mm256_sub_epi16(r[3], s2)); + r[4] = _mm256_abs_epi16(_mm256_sub_epi16(r[4], s)); + r[5] = _mm256_abs_epi16(_mm256_sub_epi16(r[5], s2)); + r[6] = _mm256_abs_epi16(_mm256_sub_epi16(r[6], s)); + r[7] = _mm256_abs_epi16(_mm256_sub_epi16(r[7], s2)); + + // sum every abs diff + sums_16[0] = _mm256_add_epi16(sums_16[0], _mm256_add_epi16(r[0], r[1])); + sums_16[1] = _mm256_add_epi16(sums_16[1], _mm256_add_epi16(r[2], r[3])); + sums_16[2] = _mm256_add_epi16(sums_16[2], _mm256_add_epi16(r[4], r[5])); + sums_16[3] = _mm256_add_epi16(sums_16[3], _mm256_add_epi16(r[6], r[7])); + + src += src_stride; + refs[0] += ref_stride; + refs[1] += ref_stride; + refs[2] += ref_stride; + refs[3] += ref_stride; + } +} + +#define HIGHBD_SAD32XNX4D(n) \ + void vpx_highbd_sad32x##n##x4d_avx2(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *refs[4]; \ + __m256i sums_16[4]; \ + __m256i sums_32[4]; \ + int i; \ + \ + refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]); \ + refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]); \ + refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]); \ + refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]); \ + sums_32[0] = _mm256_setzero_si256(); \ + sums_32[1] = _mm256_setzero_si256(); \ + sums_32[2] = _mm256_setzero_si256(); \ + sums_32[3] = _mm256_setzero_si256(); \ + \ + for (i = 0; i < (n / 8); ++i) { \ + sums_16[0] = _mm256_setzero_si256(); \ + sums_16[1] = _mm256_setzero_si256(); \ + sums_16[2] = _mm256_setzero_si256(); \ + sums_16[3] = _mm256_setzero_si256(); \ + \ + highbd_sad32xHx4d(sums_16, src, src_stride, refs, ref_stride, 8); \ + \ + /* sums_16 will outrange after 8 rows, so add current sums_16 to \ + * sums_32*/ \ + sums_32[0] = _mm256_add_epi32( \ + sums_32[0], \ + _mm256_add_epi32( \ + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])), \ + _mm256_cvtepu16_epi32( \ + _mm256_extractf128_si256(sums_16[0], 1)))); \ + sums_32[1] = _mm256_add_epi32( \ + sums_32[1], \ + _mm256_add_epi32( \ + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])), \ + _mm256_cvtepu16_epi32( \ + _mm256_extractf128_si256(sums_16[1], 1)))); \ + sums_32[2] = _mm256_add_epi32( \ + sums_32[2], \ + _mm256_add_epi32( \ + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])), \ + _mm256_cvtepu16_epi32( \ + _mm256_extractf128_si256(sums_16[2], 1)))); \ + sums_32[3] = _mm256_add_epi32( \ + sums_32[3], \ + _mm256_add_epi32( \ + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])), \ + _mm256_cvtepu16_epi32( \ + _mm256_extractf128_si256(sums_16[3], 1)))); \ + \ + src += src_stride << 3; \ + } \ + calc_final_4(sums_32, sad_array); \ + } + +// 32x64 +HIGHBD_SAD32XNX4D(64) + +// 32x32 +HIGHBD_SAD32XNX4D(32) + +// 32x16 +HIGHBD_SAD32XNX4D(16) + static VPX_FORCE_INLINE void highbd_sad16xHx4d(__m256i *sums_16 /*[4]*/, const uint16_t *src, int src_stride, From 34284e930a24f48d64220d34bc355e0883a3c569 Mon Sep 17 00:00:00 2001 From: Scott LaVarnway Date: Wed, 14 Sep 2022 03:36:46 -0700 Subject: [PATCH 0505/1000] Add vpx_highbd_sad64x{64,32}x4d_avx2. ~2x faster than the sse2 version. Bug: b/245917257 Change-Id: I4742950ab7b90d7f09e8d4687e1e967138acee39 --- test/sad_test.cc | 6 ++ vpx_dsp/vpx_dsp_rtcd_defs.pl | 4 +- vpx_dsp/x86/highbd_sad4d_avx2.c | 105 ++++++++++++++++++++++++++++++++ 3 files changed, 113 insertions(+), 2 deletions(-) diff --git a/test/sad_test.cc b/test/sad_test.cc index b3ad96ab8c..7e84ea0dbf 100644 --- a/test/sad_test.cc +++ b/test/sad_test.cc @@ -1080,18 +1080,24 @@ const SadMxNx4Param x4d_avx2_tests[] = { SadMxNx4Param(64, 64, &vpx_sad64x64x4d_avx2), SadMxNx4Param(32, 32, &vpx_sad32x32x4d_avx2), #if CONFIG_VP9_HIGHBITDEPTH + SadMxNx4Param(64, 64, &vpx_highbd_sad64x64x4d_avx2, 8), + SadMxNx4Param(64, 32, &vpx_highbd_sad64x32x4d_avx2, 8), SadMxNx4Param(32, 64, &vpx_highbd_sad32x64x4d_avx2, 8), SadMxNx4Param(32, 32, &vpx_highbd_sad32x32x4d_avx2, 8), SadMxNx4Param(32, 16, &vpx_highbd_sad32x16x4d_avx2, 8), SadMxNx4Param(16, 32, &vpx_highbd_sad16x32x4d_avx2, 8), SadMxNx4Param(16, 16, &vpx_highbd_sad16x16x4d_avx2, 8), SadMxNx4Param(16, 8, &vpx_highbd_sad16x8x4d_avx2, 8), + SadMxNx4Param(64, 64, &vpx_highbd_sad64x64x4d_avx2, 10), + SadMxNx4Param(64, 32, &vpx_highbd_sad64x32x4d_avx2, 10), SadMxNx4Param(32, 64, &vpx_highbd_sad32x64x4d_avx2, 10), SadMxNx4Param(32, 32, &vpx_highbd_sad32x32x4d_avx2, 10), SadMxNx4Param(32, 16, &vpx_highbd_sad32x16x4d_avx2, 10), SadMxNx4Param(16, 32, &vpx_highbd_sad16x32x4d_avx2, 10), SadMxNx4Param(16, 16, &vpx_highbd_sad16x16x4d_avx2, 10), SadMxNx4Param(16, 8, &vpx_highbd_sad16x8x4d_avx2, 10), + SadMxNx4Param(64, 64, &vpx_highbd_sad64x64x4d_avx2, 12), + SadMxNx4Param(64, 32, &vpx_highbd_sad64x32x4d_avx2, 12), SadMxNx4Param(32, 64, &vpx_highbd_sad32x64x4d_avx2, 12), SadMxNx4Param(32, 32, &vpx_highbd_sad32x32x4d_avx2, 12), SadMxNx4Param(32, 16, &vpx_highbd_sad32x16x4d_avx2, 12), diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index f5f5f9dd65..527d0e6e74 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -1033,10 +1033,10 @@ () # Multi-block SAD, comparing a reference to N independent blocks # add_proto qw/void vpx_highbd_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; - specialize qw/vpx_highbd_sad64x64x4d sse2 neon/; + specialize qw/vpx_highbd_sad64x64x4d sse2 neon avx2/; add_proto qw/void vpx_highbd_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; - specialize qw/vpx_highbd_sad64x32x4d sse2 neon/; + specialize qw/vpx_highbd_sad64x32x4d sse2 neon avx2/; add_proto qw/void vpx_highbd_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad32x64x4d sse2 neon avx2/; diff --git a/vpx_dsp/x86/highbd_sad4d_avx2.c b/vpx_dsp/x86/highbd_sad4d_avx2.c index 3384694f39..947b5e9772 100644 --- a/vpx_dsp/x86/highbd_sad4d_avx2.c +++ b/vpx_dsp/x86/highbd_sad4d_avx2.c @@ -21,6 +21,111 @@ static VPX_FORCE_INLINE void calc_final_4(const __m256i *const sums /*[4]*/, _mm_storeu_si128((__m128i *)sad_array, sum); } +static VPX_FORCE_INLINE void highbd_sad64xHx4d(__m256i *sums_16 /*[4]*/, + const uint16_t *src, + int src_stride, + uint16_t *refs[4], + int ref_stride, int height) { + int i; + for (i = 0; i < height; ++i) { + // load src and all ref[] + const __m256i s0 = _mm256_load_si256((const __m256i *)src); + const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16)); + const __m256i s2 = _mm256_load_si256((const __m256i *)(src + 32)); + const __m256i s3 = _mm256_load_si256((const __m256i *)(src + 48)); + int x; + + for (x = 0; x < 4; ++x) { + __m256i r[4]; + r[0] = _mm256_loadu_si256((const __m256i *)refs[x]); + r[1] = _mm256_loadu_si256((const __m256i *)(refs[x] + 16)); + r[2] = _mm256_loadu_si256((const __m256i *)(refs[x] + 32)); + r[3] = _mm256_loadu_si256((const __m256i *)(refs[x] + 48)); + + // absolute differences between every ref[] to src + r[0] = _mm256_abs_epi16(_mm256_sub_epi16(r[0], s0)); + r[1] = _mm256_abs_epi16(_mm256_sub_epi16(r[1], s1)); + r[2] = _mm256_abs_epi16(_mm256_sub_epi16(r[2], s2)); + r[3] = _mm256_abs_epi16(_mm256_sub_epi16(r[3], s3)); + + // sum every abs diff + sums_16[x] = _mm256_add_epi16(sums_16[x], _mm256_add_epi16(r[0], r[1])); + sums_16[x] = _mm256_add_epi16(sums_16[x], _mm256_add_epi16(r[2], r[3])); + } + + src += src_stride; + refs[0] += ref_stride; + refs[1] += ref_stride; + refs[2] += ref_stride; + refs[3] += ref_stride; + } +} + +#define HIGHBD_SAD64XNX4D(n) \ + void vpx_highbd_sad64x##n##x4d_avx2(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *refs[4]; \ + __m256i sums_16[4]; \ + __m256i sums_32[4]; \ + int i; \ + \ + refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]); \ + refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]); \ + refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]); \ + refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]); \ + sums_32[0] = _mm256_setzero_si256(); \ + sums_32[1] = _mm256_setzero_si256(); \ + sums_32[2] = _mm256_setzero_si256(); \ + sums_32[3] = _mm256_setzero_si256(); \ + \ + for (i = 0; i < (n / 2); ++i) { \ + sums_16[0] = _mm256_setzero_si256(); \ + sums_16[1] = _mm256_setzero_si256(); \ + sums_16[2] = _mm256_setzero_si256(); \ + sums_16[3] = _mm256_setzero_si256(); \ + \ + highbd_sad64xHx4d(sums_16, src, src_stride, refs, ref_stride, 2); \ + \ + /* sums_16 will outrange after 2 rows, so add current sums_16 to \ + * sums_32*/ \ + sums_32[0] = _mm256_add_epi32( \ + sums_32[0], \ + _mm256_add_epi32( \ + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])), \ + _mm256_cvtepu16_epi32( \ + _mm256_extractf128_si256(sums_16[0], 1)))); \ + sums_32[1] = _mm256_add_epi32( \ + sums_32[1], \ + _mm256_add_epi32( \ + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])), \ + _mm256_cvtepu16_epi32( \ + _mm256_extractf128_si256(sums_16[1], 1)))); \ + sums_32[2] = _mm256_add_epi32( \ + sums_32[2], \ + _mm256_add_epi32( \ + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])), \ + _mm256_cvtepu16_epi32( \ + _mm256_extractf128_si256(sums_16[2], 1)))); \ + sums_32[3] = _mm256_add_epi32( \ + sums_32[3], \ + _mm256_add_epi32( \ + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])), \ + _mm256_cvtepu16_epi32( \ + _mm256_extractf128_si256(sums_16[3], 1)))); \ + \ + src += src_stride << 1; \ + } \ + calc_final_4(sums_32, sad_array); \ + } + +// 64x64 +HIGHBD_SAD64XNX4D(64) + +// 64x32 +HIGHBD_SAD64XNX4D(32) + static VPX_FORCE_INLINE void highbd_sad32xHx4d(__m256i *sums_16 /*[4]*/, const uint16_t *src, int src_stride, From 7ed6b47c607ad73b0a99901b04200b07fa81f24c Mon Sep 17 00:00:00 2001 From: Cheng Chen Date: Wed, 14 Sep 2022 11:40:50 -0700 Subject: [PATCH 0506/1000] L2E: Rework recode decisions for external max frame size and q Allow to handle external q and external max frame size separately. Rely on libvpx's decision to catch overshoot/undershoot and recode frames. Previously, when external max frame size is set, we didn't handle undershoot cases, and now we fall back to libvpx's decision to recode a frame if overshoot/undershoot is seen. Change-Id: Ic3eee042cfe104b528c5f2c6c82b98dd5d8fa8ca --- vp9/encoder/vp9_encoder.c | 49 ++++++++++++++------------------------- 1 file changed, 17 insertions(+), 32 deletions(-) diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 91b64e5d13..ca3439d7c0 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -4367,7 +4367,6 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest int frame_over_shoot_limit; int frame_under_shoot_limit; int q = 0, q_low = 0, q_high = 0; - int last_q_attempt = 0; int enable_acl; #ifdef AGGRESSIVE_VBR int qrange_adj = 1; @@ -4381,8 +4380,18 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest // Maximal frame size allowed by the external rate control. // case: 0, we ignore the max frame size limit, and encode with the qindex // passed in by the external rate control model. - // case: -1, we take VP9's decision for the max frame size. + // If the external qindex is VPX_DEFAULT_Q, libvpx will pick a qindex + // and may recode if undershoot/overshoot is seen. + // If the external qindex is not VPX_DEFAULT_Q, we force no recode. + // case: -1, we take libvpx's decision for the max frame size, as well as + // the recode decision. + // Otherwise: if a specific size is given, libvpx's recode decision + // will respect the given size. int ext_rc_max_frame_size = 0; + // Use VP9's decision of qindex. This flag is in use only in external rate + // control model to help determine whether to recode when + // |ext_rc_max_frame_size| is 0. + int ext_rc_use_default_q = 1; const int orig_rc_max_frame_bandwidth = rc->max_frame_bandwidth; #if CONFIG_RATE_CTRL @@ -4520,8 +4529,9 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest // libvpx's default q. if (encode_frame_decision.q_index != VPX_DEFAULT_Q) { q = encode_frame_decision.q_index; - ext_rc_max_frame_size = encode_frame_decision.max_frame_size; + ext_rc_use_default_q = 0; } + ext_rc_max_frame_size = encode_frame_decision.max_frame_size; } vp9_set_quantizer(cpi, q); @@ -4564,7 +4574,6 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest if (cpi->ext_ratectrl.ready && (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_QP) != 0) { - last_q_attempt = q; // In general, for the external rate control, we take the qindex provided // as input and encode the frame with this qindex faithfully. However, // in some extreme scenarios, the provided qindex leads to a massive @@ -4572,20 +4581,13 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest // to pick a new qindex and recode the frame. We return the new qindex // through the API to the external model. if (ext_rc_max_frame_size == 0) { - break; + if (!ext_rc_use_default_q) break; } else if (ext_rc_max_frame_size == -1) { - if (rc->projected_frame_size < rc->max_frame_bandwidth) { - break; - } + // Do nothing, fall back to libvpx's recode decision. } else { - if (rc->projected_frame_size < ext_rc_max_frame_size) { - break; - } + // Change the max frame size, used in libvpx's recode decision. + rc->max_frame_bandwidth = ext_rc_max_frame_size; } - rc->max_frame_bandwidth = ext_rc_max_frame_size; - // If the current frame size exceeds the ext_rc_max_frame_size, - // we adjust the worst qindex to meet the frame size constraint. - q_high = 255; ext_rc_recode = 1; } #if CONFIG_RATE_CTRL @@ -4788,23 +4790,6 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest rc->projected_frame_size < rc->max_frame_bandwidth) loop = 0; - // Special handling of external max frame size constraint - if (ext_rc_recode) { - // If the largest q is not able to meet the max frame size limit, - // do nothing. - if (rc->projected_frame_size > ext_rc_max_frame_size && - last_q_attempt == 255) { - break; - } - // If VP9's q selection leads to a smaller q, we force it to use - // a larger q to better approximate the external max frame size - // constraint. - if (rc->projected_frame_size > ext_rc_max_frame_size && - q <= last_q_attempt) { - q = VPXMIN(255, last_q_attempt + 1); - } - } - if (loop) { ++loop_count; ++loop_at_this_size; From 3cd417b6d2380b993f33a0d8f342e4d16717d16e Mon Sep 17 00:00:00 2001 From: Johann Date: Sat, 17 Sep 2022 07:54:40 +0900 Subject: [PATCH 0507/1000] fwd_txfm: remove avx2 file from non-hbd Resolves warning on OS X: file: libvpx_g.a(fwd_txfm_avx2.c.o) has no symbols Change-Id: Ie8b290bb3ed329656beb883d552c98353f1ed5e5 --- vpx_dsp/vpx_dsp.mk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index 34e9d736db..f9a5c97dd2 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -226,7 +226,6 @@ DSP_SRCS-$(HAVE_SSE2) += x86/fwd_dct32x32_impl_sse2.h ifeq ($(VPX_ARCH_X86_64),yes) DSP_SRCS-$(HAVE_SSSE3) += x86/fwd_txfm_ssse3_x86_64.asm endif -DSP_SRCS-$(HAVE_AVX2) += x86/fwd_txfm_avx2.c DSP_SRCS-$(HAVE_AVX2) += x86/fwd_dct32x32_impl_avx2.h DSP_SRCS-$(HAVE_NEON) += arm/fdct_neon.c DSP_SRCS-$(HAVE_NEON) += arm/fdct16x16_neon.c @@ -239,6 +238,7 @@ DSP_SRCS-$(HAVE_LSX) += loongarch/fwd_txfm_lsx.h DSP_SRCS-$(HAVE_LSX) += loongarch/fwd_txfm_lsx.c ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes) +DSP_SRCS-$(HAVE_AVX2) += x86/fwd_txfm_avx2.c DSP_SRCS-$(HAVE_MSA) += mips/fwd_dct32x32_msa.c DSP_SRCS-$(HAVE_LSX) += loongarch/fwd_dct32x32_lsx.c endif # !CONFIG_VP9_HIGHBITDEPTH From 884837a5805b77aec2e16d3dee910d59081ba9b0 Mon Sep 17 00:00:00 2001 From: Johann Date: Sun, 18 Sep 2022 10:26:00 +0900 Subject: [PATCH 0508/1000] quantize: test lowbd in highbd builds Change-Id: I7af273e979415a8b8cafb7494728d2736862f4a5 --- test/vp9_quantize_test.cc | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc index 4ecdd91b06..a81775fd94 100644 --- a/test/vp9_quantize_test.cc +++ b/test/vp9_quantize_test.cc @@ -586,12 +586,16 @@ INSTANTIATE_TEST_SUITE_P( make_tuple(&QuantFPWrapper, &QuantFPWrapper, VPX_BITS_12, 32, true), + make_tuple(&vpx_quantize_b_avx2, &vpx_quantize_b_c, VPX_BITS_8, 16, + false), make_tuple(&vpx_highbd_quantize_b_avx2, &vpx_highbd_quantize_b_c, VPX_BITS_8, 16, false), make_tuple(&vpx_highbd_quantize_b_avx2, &vpx_highbd_quantize_b_c, VPX_BITS_10, 16, false), make_tuple(&vpx_highbd_quantize_b_avx2, &vpx_highbd_quantize_b_c, VPX_BITS_12, 16, false), + make_tuple(&vpx_quantize_b_32x32_avx2, &vpx_quantize_b_32x32_c, + VPX_BITS_8, 32, false), make_tuple(&vpx_highbd_quantize_b_32x32_avx2, &vpx_highbd_quantize_b_32x32_c, VPX_BITS_8, 32, false), make_tuple(&vpx_highbd_quantize_b_32x32_avx2, @@ -620,12 +624,16 @@ INSTANTIATE_TEST_SUITE_P( INSTANTIATE_TEST_SUITE_P( NEON, VP9QuantizeTest, ::testing::Values( + make_tuple(&vpx_quantize_b_neon, &vpx_quantize_b_c, VPX_BITS_8, 16, + false), make_tuple(&vpx_highbd_quantize_b_neon, &vpx_highbd_quantize_b_c, VPX_BITS_8, 16, false), make_tuple(&vpx_highbd_quantize_b_neon, &vpx_highbd_quantize_b_c, VPX_BITS_10, 16, false), make_tuple(&vpx_highbd_quantize_b_neon, &vpx_highbd_quantize_b_c, VPX_BITS_12, 16, false), + make_tuple(&vpx_quantize_b_32x32_neon, &vpx_quantize_b_32x32_c, + VPX_BITS_8, 32, false), make_tuple(&vpx_highbd_quantize_b_32x32_neon, &vpx_highbd_quantize_b_32x32_c, VPX_BITS_8, 32, false), make_tuple(&vpx_highbd_quantize_b_32x32_neon, From f6939699b626fe0e8588f1bc639b955d89fb537c Mon Sep 17 00:00:00 2001 From: Scott LaVarnway Date: Wed, 21 Sep 2022 11:37:04 -0700 Subject: [PATCH 0509/1000] post_proc_sse2.c: quiet -Wuninitialized MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In file included from ../libvpx/vpx_dsp/x86/post_proc_sse2.c:12: In function ‘_mm_add_epi16’, inlined from ‘vpx_mbpost_proc_down_sse2’ at ../libvpx/vpx_dsp/x86/post_proc_sse2.c:88:13: /usr/lib/gcc/x86_64-linux-gnu/12/include/emmintrin.h:1060:35: warning: ‘below_context’ may be used uninitialized [-Wmaybe-uninitialized] 1060 | return (__m128i) ((__v8hu)__A + (__v8hu)__B); | ^~~~~~~~~~~ ../libvpx/vpx_dsp/x86/post_proc_sse2.c: In function ‘vpx_mbpost_proc_down_sse2’: ../libvpx/vpx_dsp/x86/post_proc_sse2.c:39:13: note: ‘below_context’ was declared here 39 | __m128i below_context; Change-Id: I2fc592f121c4e85d0aff1640014c3444f5eb09fd --- vpx_dsp/x86/post_proc_sse2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vpx_dsp/x86/post_proc_sse2.c b/vpx_dsp/x86/post_proc_sse2.c index d1029afc4f..119fa7cd1a 100644 --- a/vpx_dsp/x86/post_proc_sse2.c +++ b/vpx_dsp/x86/post_proc_sse2.c @@ -36,7 +36,7 @@ void vpx_mbpost_proc_down_sse2(unsigned char *dst, int pitch, int rows, __m128i s = _mm_loadl_epi64((__m128i *)dst); __m128i sum, sumsq_0, sumsq_1; __m128i tmp_0, tmp_1; - __m128i below_context; + __m128i below_context = _mm_setzero_si128(); s = _mm_unpacklo_epi8(s, zero); From 8b0c92ebdfd0478812b2fbf84c0dfc762a2609bd Mon Sep 17 00:00:00 2001 From: Scott LaVarnway Date: Wed, 21 Sep 2022 12:15:16 -0700 Subject: [PATCH 0510/1000] resize_test.cc: quiet -Wmaybe-uninitialized MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit warning: ‘expected_w’ may be used uninitialized Change-Id: I915efd82d3263250cea90391345f7683c1330fc8 --- test/resize_test.cc | 25 +++---------------------- 1 file changed, 3 insertions(+), 22 deletions(-) diff --git a/test/resize_test.cc b/test/resize_test.cc index 212ff46975..e122a74742 100644 --- a/test/resize_test.cc +++ b/test/resize_test.cc @@ -95,10 +95,11 @@ void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w, unsigned int initial_h, unsigned int *w, unsigned int *h, bool flag_codec, bool smaller_width_larger_size_) { + *w = initial_w; + *h = initial_h; + if (smaller_width_larger_size_) { if (frame < 30) { - *w = initial_w; - *h = initial_h; return; } if (frame < 100) { @@ -109,8 +110,6 @@ void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w, return; } if (frame < 10) { - *w = initial_w; - *h = initial_h; return; } if (frame < 20) { @@ -124,8 +123,6 @@ void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w, return; } if (frame < 40) { - *w = initial_w; - *h = initial_h; return; } if (frame < 50) { @@ -139,8 +136,6 @@ void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w, return; } if (frame < 70) { - *w = initial_w; - *h = initial_h; return; } if (frame < 80) { @@ -159,8 +154,6 @@ void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w, return; } if (frame < 110) { - *w = initial_w; - *h = initial_h; return; } if (frame < 120) { @@ -179,8 +172,6 @@ void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w, return; } if (frame < 150) { - *w = initial_w; - *h = initial_h; return; } if (frame < 160) { @@ -199,8 +190,6 @@ void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w, return; } if (frame < 190) { - *w = initial_w; - *h = initial_h; return; } if (frame < 200) { @@ -219,8 +208,6 @@ void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w, return; } if (frame < 230) { - *w = initial_w; - *h = initial_h; return; } if (frame < 240) { @@ -234,8 +221,6 @@ void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w, return; } if (frame < 260) { - *w = initial_w; - *h = initial_h; return; } // Go down very low. @@ -248,13 +233,9 @@ void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w, // Cases that only works for VP9. // For VP9: Swap width and height of original. if (frame < 320) { - *w = initial_h; - *h = initial_w; return; } } - *w = initial_w; - *h = initial_h; } class ResizingVideoSource : public ::libvpx_test::DummyVideoSource { From c8874f74a7a29729a77ccba5790bf2c71f583f15 Mon Sep 17 00:00:00 2001 From: Johann Date: Sat, 17 Sep 2022 08:47:28 +0900 Subject: [PATCH 0511/1000] quantize: increase iscan by 1 All of the assembly adds 1 to iscan to convert from a 0 based array to the EOB value. Add 1 to all iscan values and remove the extra instructions from the assembly. Change-Id: I219dd7f2bd10533ab24b206289565703176dc5e9 --- vp9/common/vp9_scan.c | 293 +++++++++--------- vp9/encoder/arm/neon/vp9_quantize_neon.c | 4 +- vp9/encoder/x86/vp9_quantize_avx2.c | 14 +- vp9/encoder/x86/vp9_quantize_sse2.c | 6 - vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm | 4 - vpx_dsp/arm/highbd_quantize_neon.c | 18 +- vpx_dsp/arm/quantize_neon.c | 18 +- vpx_dsp/loongarch/quantize_lsx.c | 13 +- vpx_dsp/ppc/quantize_vsx.c | 24 +- vpx_dsp/x86/highbd_quantize_intrin_avx2.c | 5 +- vpx_dsp/x86/highbd_quantize_intrin_sse2.c | 8 +- vpx_dsp/x86/quantize_avx.c | 12 +- vpx_dsp/x86/quantize_avx2.c | 11 +- vpx_dsp/x86/quantize_sse2.c | 5 +- vpx_dsp/x86/quantize_sse2.h | 8 +- vpx_dsp/x86/quantize_ssse3.c | 11 +- 16 files changed, 197 insertions(+), 257 deletions(-) diff --git a/vp9/common/vp9_scan.c b/vp9/common/vp9_scan.c index 0fef263510..8bea61dea6 100644 --- a/vp9/common/vp9_scan.c +++ b/vp9/common/vp9_scan.c @@ -511,180 +511,181 @@ DECLARE_ALIGNED(16, static const int16_t, 959, 990, 991, 1022, 0, 0, }; +// Add 1 to iscan values. This represents the EOB position instead of the index. DECLARE_ALIGNED(16, static const int16_t, vp9_default_iscan_4x4[16]) = { - 0, 2, 5, 8, 1, 3, 9, 12, 4, 7, 11, 14, 6, 10, 13, 15, + 1, 3, 6, 9, 2, 4, 10, 13, 5, 8, 12, 15, 7, 11, 14, 16, }; DECLARE_ALIGNED(16, static const int16_t, vp9_col_iscan_4x4[16]) = { - 0, 3, 7, 11, 1, 5, 9, 12, 2, 6, 10, 14, 4, 8, 13, 15, + 1, 4, 8, 12, 2, 6, 10, 13, 3, 7, 11, 15, 5, 9, 14, 16, }; DECLARE_ALIGNED(16, static const int16_t, vp9_row_iscan_4x4[16]) = { - 0, 1, 3, 5, 2, 4, 6, 9, 7, 8, 11, 13, 10, 12, 14, 15, + 1, 2, 4, 6, 3, 5, 7, 10, 8, 9, 12, 14, 11, 13, 15, 16, }; DECLARE_ALIGNED(16, static const int16_t, vp9_col_iscan_8x8[64]) = { - 0, 3, 8, 15, 22, 32, 40, 47, 1, 5, 11, 18, 26, 34, 44, 51, - 2, 7, 13, 20, 28, 38, 46, 54, 4, 10, 16, 24, 31, 41, 50, 56, - 6, 12, 21, 27, 35, 43, 52, 58, 9, 17, 25, 33, 39, 48, 55, 60, - 14, 23, 30, 37, 45, 53, 59, 62, 19, 29, 36, 42, 49, 57, 61, 63, + 1, 4, 9, 16, 23, 33, 41, 48, 2, 6, 12, 19, 27, 35, 45, 52, + 3, 8, 14, 21, 29, 39, 47, 55, 5, 11, 17, 25, 32, 42, 51, 57, + 7, 13, 22, 28, 36, 44, 53, 59, 10, 18, 26, 34, 40, 49, 56, 61, + 15, 24, 31, 38, 46, 54, 60, 63, 20, 30, 37, 43, 50, 58, 62, 64, }; DECLARE_ALIGNED(16, static const int16_t, vp9_row_iscan_8x8[64]) = { - 0, 1, 2, 5, 8, 12, 19, 24, 3, 4, 7, 10, 15, 20, 30, 39, - 6, 9, 13, 16, 21, 27, 37, 46, 11, 14, 17, 23, 28, 34, 44, 52, - 18, 22, 25, 31, 35, 41, 50, 57, 26, 29, 33, 38, 43, 49, 55, 59, - 32, 36, 42, 47, 51, 54, 60, 61, 40, 45, 48, 53, 56, 58, 62, 63, + 1, 2, 3, 6, 9, 13, 20, 25, 4, 5, 8, 11, 16, 21, 31, 40, + 7, 10, 14, 17, 22, 28, 38, 47, 12, 15, 18, 24, 29, 35, 45, 53, + 19, 23, 26, 32, 36, 42, 51, 58, 27, 30, 34, 39, 44, 50, 56, 60, + 33, 37, 43, 48, 52, 55, 61, 62, 41, 46, 49, 54, 57, 59, 63, 64, }; DECLARE_ALIGNED(16, static const int16_t, vp9_default_iscan_8x8[64]) = { - 0, 2, 5, 9, 14, 22, 31, 37, 1, 4, 8, 13, 19, 26, 38, 44, - 3, 6, 10, 17, 24, 30, 42, 49, 7, 11, 15, 21, 29, 36, 47, 53, - 12, 16, 20, 27, 34, 43, 52, 57, 18, 23, 28, 35, 41, 48, 56, 60, - 25, 32, 39, 45, 50, 55, 59, 62, 33, 40, 46, 51, 54, 58, 61, 63, + 1, 3, 6, 10, 15, 23, 32, 38, 2, 5, 9, 14, 20, 27, 39, 45, + 4, 7, 11, 18, 25, 31, 43, 50, 8, 12, 16, 22, 30, 37, 48, 54, + 13, 17, 21, 28, 35, 44, 53, 58, 19, 24, 29, 36, 42, 49, 57, 61, + 26, 33, 40, 46, 51, 56, 60, 63, 34, 41, 47, 52, 55, 59, 62, 64, }; DECLARE_ALIGNED(16, static const int16_t, vp9_col_iscan_16x16[256]) = { - 0, 4, 11, 20, 31, 43, 59, 75, 85, 109, 130, 150, 165, 181, 195, 198, - 1, 6, 14, 23, 34, 47, 64, 81, 95, 114, 135, 153, 171, 188, 201, 212, - 2, 8, 16, 25, 38, 52, 67, 83, 101, 116, 136, 157, 172, 190, 205, 216, - 3, 10, 18, 29, 41, 55, 71, 89, 103, 119, 141, 159, 176, 194, 208, 218, - 5, 12, 21, 32, 45, 58, 74, 93, 104, 123, 144, 164, 179, 196, 210, 223, - 7, 15, 26, 37, 49, 63, 78, 96, 112, 129, 146, 166, 182, 200, 215, 228, - 9, 19, 28, 39, 54, 69, 86, 102, 117, 132, 151, 170, 187, 206, 220, 230, - 13, 24, 35, 46, 60, 73, 91, 108, 122, 137, 154, 174, 189, 207, 224, 235, - 17, 30, 40, 53, 66, 82, 98, 115, 126, 142, 161, 180, 197, 213, 227, 237, - 22, 36, 48, 62, 76, 92, 105, 120, 133, 147, 167, 186, 203, 219, 232, 240, - 27, 44, 56, 70, 84, 99, 113, 127, 140, 156, 175, 193, 209, 226, 236, 244, - 33, 51, 68, 79, 94, 110, 125, 138, 149, 162, 184, 202, 217, 229, 241, 247, - 42, 61, 77, 90, 106, 121, 134, 148, 160, 173, 191, 211, 225, 238, 245, 251, - 50, 72, 87, 100, 118, 128, 145, 158, 168, 183, 204, 222, 233, 242, 249, 253, - 57, 80, 97, 111, 131, 143, 155, 169, 178, 192, 214, 231, 239, 246, 250, 254, - 65, 88, 107, 124, 139, 152, 163, 177, 185, 199, 221, 234, 243, 248, 252, 255, + 1, 5, 12, 21, 32, 44, 60, 76, 86, 110, 131, 151, 166, 182, 196, 199, + 2, 7, 15, 24, 35, 48, 65, 82, 96, 115, 136, 154, 172, 189, 202, 213, + 3, 9, 17, 26, 39, 53, 68, 84, 102, 117, 137, 158, 173, 191, 206, 217, + 4, 11, 19, 30, 42, 56, 72, 90, 104, 120, 142, 160, 177, 195, 209, 219, + 6, 13, 22, 33, 46, 59, 75, 94, 105, 124, 145, 165, 180, 197, 211, 224, + 8, 16, 27, 38, 50, 64, 79, 97, 113, 130, 147, 167, 183, 201, 216, 229, + 10, 20, 29, 40, 55, 70, 87, 103, 118, 133, 152, 171, 188, 207, 221, 231, + 14, 25, 36, 47, 61, 74, 92, 109, 123, 138, 155, 175, 190, 208, 225, 236, + 18, 31, 41, 54, 67, 83, 99, 116, 127, 143, 162, 181, 198, 214, 228, 238, + 23, 37, 49, 63, 77, 93, 106, 121, 134, 148, 168, 187, 204, 220, 233, 241, + 28, 45, 57, 71, 85, 100, 114, 128, 141, 157, 176, 194, 210, 227, 237, 245, + 34, 52, 69, 80, 95, 111, 126, 139, 150, 163, 185, 203, 218, 230, 242, 248, + 43, 62, 78, 91, 107, 122, 135, 149, 161, 174, 192, 212, 226, 239, 246, 252, + 51, 73, 88, 101, 119, 129, 146, 159, 169, 184, 205, 223, 234, 243, 250, 254, + 58, 81, 98, 112, 132, 144, 156, 170, 179, 193, 215, 232, 240, 247, 251, 255, + 66, 89, 108, 125, 140, 153, 164, 178, 186, 200, 222, 235, 244, 249, 253, 256, }; DECLARE_ALIGNED(16, static const int16_t, vp9_row_iscan_16x16[256]) = { - 0, 1, 2, 4, 6, 9, 12, 17, 22, 29, 36, 43, 54, 64, 76, - 86, 3, 5, 7, 11, 15, 19, 25, 32, 38, 48, 59, 68, 84, 99, - 115, 130, 8, 10, 13, 18, 23, 27, 33, 42, 51, 60, 72, 88, 103, - 119, 142, 167, 14, 16, 20, 26, 31, 37, 44, 53, 61, 73, 85, 100, - 116, 135, 161, 185, 21, 24, 30, 35, 40, 47, 55, 65, 74, 81, 94, - 112, 133, 154, 179, 205, 28, 34, 39, 45, 50, 58, 67, 77, 87, 96, - 106, 121, 146, 169, 196, 212, 41, 46, 49, 56, 63, 70, 79, 90, 98, - 107, 122, 138, 159, 182, 207, 222, 52, 57, 62, 69, 75, 83, 93, 102, - 110, 120, 134, 150, 176, 195, 215, 226, 66, 71, 78, 82, 91, 97, 108, - 113, 127, 136, 148, 168, 188, 202, 221, 232, 80, 89, 92, 101, 105, 114, - 125, 131, 139, 151, 162, 177, 192, 208, 223, 234, 95, 104, 109, 117, 123, - 128, 143, 144, 155, 165, 175, 190, 206, 219, 233, 239, 111, 118, 124, 129, - 140, 147, 157, 164, 170, 181, 191, 203, 224, 230, 240, 243, 126, 132, 137, - 145, 153, 160, 174, 178, 184, 197, 204, 216, 231, 237, 244, 246, 141, 149, - 156, 166, 172, 180, 189, 199, 200, 210, 220, 228, 238, 242, 249, 251, 152, - 163, 171, 183, 186, 193, 201, 211, 214, 218, 227, 236, 245, 247, 252, 253, - 158, 173, 187, 194, 198, 209, 213, 217, 225, 229, 235, 241, 248, 250, 254, - 255, + 1, 2, 3, 5, 7, 10, 13, 18, 23, 30, 37, 44, 55, 65, 77, + 87, 4, 6, 8, 12, 16, 20, 26, 33, 39, 49, 60, 69, 85, 100, + 116, 131, 9, 11, 14, 19, 24, 28, 34, 43, 52, 61, 73, 89, 104, + 120, 143, 168, 15, 17, 21, 27, 32, 38, 45, 54, 62, 74, 86, 101, + 117, 136, 162, 186, 22, 25, 31, 36, 41, 48, 56, 66, 75, 82, 95, + 113, 134, 155, 180, 206, 29, 35, 40, 46, 51, 59, 68, 78, 88, 97, + 107, 122, 147, 170, 197, 213, 42, 47, 50, 57, 64, 71, 80, 91, 99, + 108, 123, 139, 160, 183, 208, 223, 53, 58, 63, 70, 76, 84, 94, 103, + 111, 121, 135, 151, 177, 196, 216, 227, 67, 72, 79, 83, 92, 98, 109, + 114, 128, 137, 149, 169, 189, 203, 222, 233, 81, 90, 93, 102, 106, 115, + 126, 132, 140, 152, 163, 178, 193, 209, 224, 235, 96, 105, 110, 118, 124, + 129, 144, 145, 156, 166, 176, 191, 207, 220, 234, 240, 112, 119, 125, 130, + 141, 148, 158, 165, 171, 182, 192, 204, 225, 231, 241, 244, 127, 133, 138, + 146, 154, 161, 175, 179, 185, 198, 205, 217, 232, 238, 245, 247, 142, 150, + 157, 167, 173, 181, 190, 200, 201, 211, 221, 229, 239, 243, 250, 252, 153, + 164, 172, 184, 187, 194, 202, 212, 215, 219, 228, 237, 246, 248, 253, 254, + 159, 174, 188, 195, 199, 210, 214, 218, 226, 230, 236, 242, 249, 251, 255, + 256, }; DECLARE_ALIGNED(16, static const int16_t, vp9_default_iscan_16x16[256]) = { - 0, 2, 5, 9, 17, 24, 36, 44, 55, 72, 88, 104, 128, 143, 166, - 179, 1, 4, 8, 13, 20, 30, 40, 54, 66, 79, 96, 113, 141, 154, - 178, 196, 3, 7, 11, 18, 25, 33, 46, 57, 71, 86, 101, 119, 148, - 164, 186, 201, 6, 12, 16, 23, 31, 39, 53, 64, 78, 92, 110, 127, - 153, 169, 193, 208, 10, 14, 19, 28, 37, 47, 58, 67, 84, 98, 114, - 133, 161, 176, 198, 214, 15, 21, 26, 34, 43, 52, 65, 77, 91, 106, - 120, 140, 165, 185, 205, 221, 22, 27, 32, 41, 48, 60, 73, 85, 99, - 116, 130, 151, 175, 190, 211, 225, 29, 35, 42, 49, 59, 69, 81, 95, - 108, 125, 139, 155, 182, 197, 217, 229, 38, 45, 51, 61, 68, 80, 93, - 105, 118, 134, 150, 168, 191, 207, 223, 234, 50, 56, 63, 74, 83, 94, - 109, 117, 129, 147, 163, 177, 199, 213, 228, 238, 62, 70, 76, 87, 97, - 107, 122, 131, 145, 159, 172, 188, 210, 222, 235, 242, 75, 82, 90, 102, - 112, 124, 138, 146, 157, 173, 187, 202, 219, 230, 240, 245, 89, 100, 111, - 123, 132, 142, 156, 167, 180, 189, 203, 216, 231, 237, 246, 250, 103, 115, - 126, 136, 149, 162, 171, 183, 194, 204, 215, 224, 236, 241, 248, 252, 121, - 135, 144, 158, 170, 181, 192, 200, 209, 218, 227, 233, 243, 244, 251, 254, - 137, 152, 160, 174, 184, 195, 206, 212, 220, 226, 232, 239, 247, 249, 253, - 255, + 1, 3, 6, 10, 18, 25, 37, 45, 56, 73, 89, 105, 129, 144, 167, + 180, 2, 5, 9, 14, 21, 31, 41, 55, 67, 80, 97, 114, 142, 155, + 179, 197, 4, 8, 12, 19, 26, 34, 47, 58, 72, 87, 102, 120, 149, + 165, 187, 202, 7, 13, 17, 24, 32, 40, 54, 65, 79, 93, 111, 128, + 154, 170, 194, 209, 11, 15, 20, 29, 38, 48, 59, 68, 85, 99, 115, + 134, 162, 177, 199, 215, 16, 22, 27, 35, 44, 53, 66, 78, 92, 107, + 121, 141, 166, 186, 206, 222, 23, 28, 33, 42, 49, 61, 74, 86, 100, + 117, 131, 152, 176, 191, 212, 226, 30, 36, 43, 50, 60, 70, 82, 96, + 109, 126, 140, 156, 183, 198, 218, 230, 39, 46, 52, 62, 69, 81, 94, + 106, 119, 135, 151, 169, 192, 208, 224, 235, 51, 57, 64, 75, 84, 95, + 110, 118, 130, 148, 164, 178, 200, 214, 229, 239, 63, 71, 77, 88, 98, + 108, 123, 132, 146, 160, 173, 189, 211, 223, 236, 243, 76, 83, 91, 103, + 113, 125, 139, 147, 158, 174, 188, 203, 220, 231, 241, 246, 90, 101, 112, + 124, 133, 143, 157, 168, 181, 190, 204, 217, 232, 238, 247, 251, 104, 116, + 127, 137, 150, 163, 172, 184, 195, 205, 216, 225, 237, 242, 249, 253, 122, + 136, 145, 159, 171, 182, 193, 201, 210, 219, 228, 234, 244, 245, 252, 255, + 138, 153, 161, 175, 185, 196, 207, 213, 221, 227, 233, 240, 248, 250, 254, + 256, }; DECLARE_ALIGNED(16, static const int16_t, vp9_default_iscan_32x32[1024]) = { - 0, 2, 5, 10, 17, 25, 38, 47, 62, 83, 101, 121, 145, - 170, 193, 204, 210, 219, 229, 233, 245, 257, 275, 299, 342, 356, - 377, 405, 455, 471, 495, 527, 1, 4, 8, 15, 22, 30, 45, - 58, 74, 92, 112, 133, 158, 184, 203, 215, 222, 228, 234, 237, - 256, 274, 298, 317, 355, 376, 404, 426, 470, 494, 526, 551, 3, - 7, 12, 18, 28, 36, 52, 64, 82, 102, 118, 142, 164, 189, - 208, 217, 224, 231, 235, 238, 273, 297, 316, 329, 375, 403, 425, - 440, 493, 525, 550, 567, 6, 11, 16, 23, 31, 43, 60, 73, - 90, 109, 126, 150, 173, 196, 211, 220, 226, 232, 236, 239, 296, - 315, 328, 335, 402, 424, 439, 447, 524, 549, 566, 575, 9, 14, - 19, 29, 37, 50, 65, 78, 95, 116, 134, 157, 179, 201, 214, - 223, 244, 255, 272, 295, 341, 354, 374, 401, 454, 469, 492, 523, - 582, 596, 617, 645, 13, 20, 26, 35, 44, 54, 72, 85, 105, - 123, 140, 163, 182, 205, 216, 225, 254, 271, 294, 314, 353, 373, - 400, 423, 468, 491, 522, 548, 595, 616, 644, 666, 21, 27, 33, - 42, 53, 63, 80, 94, 113, 132, 151, 172, 190, 209, 218, 227, - 270, 293, 313, 327, 372, 399, 422, 438, 490, 521, 547, 565, 615, - 643, 665, 680, 24, 32, 39, 48, 57, 71, 88, 104, 120, 139, - 159, 178, 197, 212, 221, 230, 292, 312, 326, 334, 398, 421, 437, - 446, 520, 546, 564, 574, 642, 664, 679, 687, 34, 40, 46, 56, - 68, 81, 96, 111, 130, 147, 167, 186, 243, 253, 269, 291, 340, - 352, 371, 397, 453, 467, 489, 519, 581, 594, 614, 641, 693, 705, - 723, 747, 41, 49, 55, 67, 77, 91, 107, 124, 138, 161, 177, - 194, 252, 268, 290, 311, 351, 370, 396, 420, 466, 488, 518, 545, - 593, 613, 640, 663, 704, 722, 746, 765, 51, 59, 66, 76, 89, - 99, 119, 131, 149, 168, 181, 200, 267, 289, 310, 325, 369, 395, - 419, 436, 487, 517, 544, 563, 612, 639, 662, 678, 721, 745, 764, - 777, 61, 69, 75, 87, 100, 114, 129, 144, 162, 180, 191, 207, - 288, 309, 324, 333, 394, 418, 435, 445, 516, 543, 562, 573, 638, - 661, 677, 686, 744, 763, 776, 783, 70, 79, 86, 97, 108, 122, - 137, 155, 242, 251, 266, 287, 339, 350, 368, 393, 452, 465, 486, - 515, 580, 592, 611, 637, 692, 703, 720, 743, 788, 798, 813, 833, - 84, 93, 103, 110, 125, 141, 154, 171, 250, 265, 286, 308, 349, - 367, 392, 417, 464, 485, 514, 542, 591, 610, 636, 660, 702, 719, - 742, 762, 797, 812, 832, 848, 98, 106, 115, 127, 143, 156, 169, - 185, 264, 285, 307, 323, 366, 391, 416, 434, 484, 513, 541, 561, - 609, 635, 659, 676, 718, 741, 761, 775, 811, 831, 847, 858, 117, - 128, 136, 148, 160, 175, 188, 198, 284, 306, 322, 332, 390, 415, - 433, 444, 512, 540, 560, 572, 634, 658, 675, 685, 740, 760, 774, - 782, 830, 846, 857, 863, 135, 146, 152, 165, 241, 249, 263, 283, - 338, 348, 365, 389, 451, 463, 483, 511, 579, 590, 608, 633, 691, - 701, 717, 739, 787, 796, 810, 829, 867, 875, 887, 903, 153, 166, - 174, 183, 248, 262, 282, 305, 347, 364, 388, 414, 462, 482, 510, - 539, 589, 607, 632, 657, 700, 716, 738, 759, 795, 809, 828, 845, - 874, 886, 902, 915, 176, 187, 195, 202, 261, 281, 304, 321, 363, - 387, 413, 432, 481, 509, 538, 559, 606, 631, 656, 674, 715, 737, - 758, 773, 808, 827, 844, 856, 885, 901, 914, 923, 192, 199, 206, - 213, 280, 303, 320, 331, 386, 412, 431, 443, 508, 537, 558, 571, - 630, 655, 673, 684, 736, 757, 772, 781, 826, 843, 855, 862, 900, - 913, 922, 927, 240, 247, 260, 279, 337, 346, 362, 385, 450, 461, - 480, 507, 578, 588, 605, 629, 690, 699, 714, 735, 786, 794, 807, - 825, 866, 873, 884, 899, 930, 936, 945, 957, 246, 259, 278, 302, - 345, 361, 384, 411, 460, 479, 506, 536, 587, 604, 628, 654, 698, - 713, 734, 756, 793, 806, 824, 842, 872, 883, 898, 912, 935, 944, - 956, 966, 258, 277, 301, 319, 360, 383, 410, 430, 478, 505, 535, - 557, 603, 627, 653, 672, 712, 733, 755, 771, 805, 823, 841, 854, - 882, 897, 911, 921, 943, 955, 965, 972, 276, 300, 318, 330, 382, - 409, 429, 442, 504, 534, 556, 570, 626, 652, 671, 683, 732, 754, - 770, 780, 822, 840, 853, 861, 896, 910, 920, 926, 954, 964, 971, - 975, 336, 344, 359, 381, 449, 459, 477, 503, 577, 586, 602, 625, - 689, 697, 711, 731, 785, 792, 804, 821, 865, 871, 881, 895, 929, - 934, 942, 953, 977, 981, 987, 995, 343, 358, 380, 408, 458, 476, - 502, 533, 585, 601, 624, 651, 696, 710, 730, 753, 791, 803, 820, - 839, 870, 880, 894, 909, 933, 941, 952, 963, 980, 986, 994, 1001, - 357, 379, 407, 428, 475, 501, 532, 555, 600, 623, 650, 670, 709, - 729, 752, 769, 802, 819, 838, 852, 879, 893, 908, 919, 940, 951, - 962, 970, 985, 993, 1000, 1005, 378, 406, 427, 441, 500, 531, 554, - 569, 622, 649, 669, 682, 728, 751, 768, 779, 818, 837, 851, 860, - 892, 907, 918, 925, 950, 961, 969, 974, 992, 999, 1004, 1007, 448, - 457, 474, 499, 576, 584, 599, 621, 688, 695, 708, 727, 784, 790, - 801, 817, 864, 869, 878, 891, 928, 932, 939, 949, 976, 979, 984, - 991, 1008, 1010, 1013, 1017, 456, 473, 498, 530, 583, 598, 620, 648, - 694, 707, 726, 750, 789, 800, 816, 836, 868, 877, 890, 906, 931, - 938, 948, 960, 978, 983, 990, 998, 1009, 1012, 1016, 1020, 472, 497, - 529, 553, 597, 619, 647, 668, 706, 725, 749, 767, 799, 815, 835, - 850, 876, 889, 905, 917, 937, 947, 959, 968, 982, 989, 997, 1003, - 1011, 1015, 1019, 1022, 496, 528, 552, 568, 618, 646, 667, 681, 724, - 748, 766, 778, 814, 834, 849, 859, 888, 904, 916, 924, 946, 958, - 967, 973, 988, 996, 1002, 1006, 1014, 1018, 1021, 1023, + 1, 3, 6, 11, 18, 26, 39, 48, 63, 84, 102, 122, 146, + 171, 194, 205, 211, 220, 230, 234, 246, 258, 276, 300, 343, 357, + 378, 406, 456, 472, 496, 528, 2, 5, 9, 16, 23, 31, 46, + 59, 75, 93, 113, 134, 159, 185, 204, 216, 223, 229, 235, 238, + 257, 275, 299, 318, 356, 377, 405, 427, 471, 495, 527, 552, 4, + 8, 13, 19, 29, 37, 53, 65, 83, 103, 119, 143, 165, 190, + 209, 218, 225, 232, 236, 239, 274, 298, 317, 330, 376, 404, 426, + 441, 494, 526, 551, 568, 7, 12, 17, 24, 32, 44, 61, 74, + 91, 110, 127, 151, 174, 197, 212, 221, 227, 233, 237, 240, 297, + 316, 329, 336, 403, 425, 440, 448, 525, 550, 567, 576, 10, 15, + 20, 30, 38, 51, 66, 79, 96, 117, 135, 158, 180, 202, 215, + 224, 245, 256, 273, 296, 342, 355, 375, 402, 455, 470, 493, 524, + 583, 597, 618, 646, 14, 21, 27, 36, 45, 55, 73, 86, 106, + 124, 141, 164, 183, 206, 217, 226, 255, 272, 295, 315, 354, 374, + 401, 424, 469, 492, 523, 549, 596, 617, 645, 667, 22, 28, 34, + 43, 54, 64, 81, 95, 114, 133, 152, 173, 191, 210, 219, 228, + 271, 294, 314, 328, 373, 400, 423, 439, 491, 522, 548, 566, 616, + 644, 666, 681, 25, 33, 40, 49, 58, 72, 89, 105, 121, 140, + 160, 179, 198, 213, 222, 231, 293, 313, 327, 335, 399, 422, 438, + 447, 521, 547, 565, 575, 643, 665, 680, 688, 35, 41, 47, 57, + 69, 82, 97, 112, 131, 148, 168, 187, 244, 254, 270, 292, 341, + 353, 372, 398, 454, 468, 490, 520, 582, 595, 615, 642, 694, 706, + 724, 748, 42, 50, 56, 68, 78, 92, 108, 125, 139, 162, 178, + 195, 253, 269, 291, 312, 352, 371, 397, 421, 467, 489, 519, 546, + 594, 614, 641, 664, 705, 723, 747, 766, 52, 60, 67, 77, 90, + 100, 120, 132, 150, 169, 182, 201, 268, 290, 311, 326, 370, 396, + 420, 437, 488, 518, 545, 564, 613, 640, 663, 679, 722, 746, 765, + 778, 62, 70, 76, 88, 101, 115, 130, 145, 163, 181, 192, 208, + 289, 310, 325, 334, 395, 419, 436, 446, 517, 544, 563, 574, 639, + 662, 678, 687, 745, 764, 777, 784, 71, 80, 87, 98, 109, 123, + 138, 156, 243, 252, 267, 288, 340, 351, 369, 394, 453, 466, 487, + 516, 581, 593, 612, 638, 693, 704, 721, 744, 789, 799, 814, 834, + 85, 94, 104, 111, 126, 142, 155, 172, 251, 266, 287, 309, 350, + 368, 393, 418, 465, 486, 515, 543, 592, 611, 637, 661, 703, 720, + 743, 763, 798, 813, 833, 849, 99, 107, 116, 128, 144, 157, 170, + 186, 265, 286, 308, 324, 367, 392, 417, 435, 485, 514, 542, 562, + 610, 636, 660, 677, 719, 742, 762, 776, 812, 832, 848, 859, 118, + 129, 137, 149, 161, 176, 189, 199, 285, 307, 323, 333, 391, 416, + 434, 445, 513, 541, 561, 573, 635, 659, 676, 686, 741, 761, 775, + 783, 831, 847, 858, 864, 136, 147, 153, 166, 242, 250, 264, 284, + 339, 349, 366, 390, 452, 464, 484, 512, 580, 591, 609, 634, 692, + 702, 718, 740, 788, 797, 811, 830, 868, 876, 888, 904, 154, 167, + 175, 184, 249, 263, 283, 306, 348, 365, 389, 415, 463, 483, 511, + 540, 590, 608, 633, 658, 701, 717, 739, 760, 796, 810, 829, 846, + 875, 887, 903, 916, 177, 188, 196, 203, 262, 282, 305, 322, 364, + 388, 414, 433, 482, 510, 539, 560, 607, 632, 657, 675, 716, 738, + 759, 774, 809, 828, 845, 857, 886, 902, 915, 924, 193, 200, 207, + 214, 281, 304, 321, 332, 387, 413, 432, 444, 509, 538, 559, 572, + 631, 656, 674, 685, 737, 758, 773, 782, 827, 844, 856, 863, 901, + 914, 923, 928, 241, 248, 261, 280, 338, 347, 363, 386, 451, 462, + 481, 508, 579, 589, 606, 630, 691, 700, 715, 736, 787, 795, 808, + 826, 867, 874, 885, 900, 931, 937, 946, 958, 247, 260, 279, 303, + 346, 362, 385, 412, 461, 480, 507, 537, 588, 605, 629, 655, 699, + 714, 735, 757, 794, 807, 825, 843, 873, 884, 899, 913, 936, 945, + 957, 967, 259, 278, 302, 320, 361, 384, 411, 431, 479, 506, 536, + 558, 604, 628, 654, 673, 713, 734, 756, 772, 806, 824, 842, 855, + 883, 898, 912, 922, 944, 956, 966, 973, 277, 301, 319, 331, 383, + 410, 430, 443, 505, 535, 557, 571, 627, 653, 672, 684, 733, 755, + 771, 781, 823, 841, 854, 862, 897, 911, 921, 927, 955, 965, 972, + 976, 337, 345, 360, 382, 450, 460, 478, 504, 578, 587, 603, 626, + 690, 698, 712, 732, 786, 793, 805, 822, 866, 872, 882, 896, 930, + 935, 943, 954, 978, 982, 988, 996, 344, 359, 381, 409, 459, 477, + 503, 534, 586, 602, 625, 652, 697, 711, 731, 754, 792, 804, 821, + 840, 871, 881, 895, 910, 934, 942, 953, 964, 981, 987, 995, 1002, + 358, 380, 408, 429, 476, 502, 533, 556, 601, 624, 651, 671, 710, + 730, 753, 770, 803, 820, 839, 853, 880, 894, 909, 920, 941, 952, + 963, 971, 986, 994, 1001, 1006, 379, 407, 428, 442, 501, 532, 555, + 570, 623, 650, 670, 683, 729, 752, 769, 780, 819, 838, 852, 861, + 893, 908, 919, 926, 951, 962, 970, 975, 993, 1000, 1005, 1008, 449, + 458, 475, 500, 577, 585, 600, 622, 689, 696, 709, 728, 785, 791, + 802, 818, 865, 870, 879, 892, 929, 933, 940, 950, 977, 980, 985, + 992, 1009, 1011, 1014, 1018, 457, 474, 499, 531, 584, 599, 621, 649, + 695, 708, 727, 751, 790, 801, 817, 837, 869, 878, 891, 907, 932, + 939, 949, 961, 979, 984, 991, 999, 1010, 1013, 1017, 1021, 473, 498, + 530, 554, 598, 620, 648, 669, 707, 726, 750, 768, 800, 816, 836, + 851, 877, 890, 906, 918, 938, 948, 960, 969, 983, 990, 998, 1004, + 1012, 1016, 1020, 1023, 497, 529, 553, 569, 619, 647, 668, 682, 725, + 749, 767, 779, 815, 835, 850, 860, 889, 905, 917, 925, 947, 959, + 968, 974, 989, 997, 1003, 1007, 1015, 1019, 1022, 1024, }; const scan_order vp9_default_scan_orders[TX_SIZES] = { diff --git a/vp9/encoder/arm/neon/vp9_quantize_neon.c b/vp9/encoder/arm/neon/vp9_quantize_neon.c index 945fd522e8..b9bd1eba31 100644 --- a/vp9/encoder/arm/neon/vp9_quantize_neon.c +++ b/vp9/encoder/arm/neon/vp9_quantize_neon.c @@ -45,9 +45,7 @@ static VPX_FORCE_INLINE int16x8_t get_max_lane_eob(const int16_t *iscan_ptr, int16x8_t v_eobmax, uint16x8_t v_nz_mask) { const int16x8_t v_iscan = vld1q_s16(&iscan_ptr[0]); - const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, vdupq_n_s16(1)); - const int16x8_t v_nz_iscan = - vbslq_s16(v_nz_mask, vdupq_n_s16(0), v_iscan_plus1); + const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, vdupq_n_s16(0), v_iscan); return vmaxq_s16(v_eobmax, v_nz_iscan); } diff --git a/vp9/encoder/x86/vp9_quantize_avx2.c b/vp9/encoder/x86/vp9_quantize_avx2.c index 15ce71c5c6..da285be8e7 100644 --- a/vp9/encoder/x86/vp9_quantize_avx2.c +++ b/vp9/encoder/x86/vp9_quantize_avx2.c @@ -43,16 +43,13 @@ static VPX_FORCE_INLINE void load_fp_values_avx2( static VPX_FORCE_INLINE __m256i get_max_lane_eob(const int16_t *iscan, __m256i v_eobmax, __m256i v_mask) { - const __m256i v_iscan = _mm256_loadu_si256((const __m256i *)iscan); #if CONFIG_VP9_HIGHBITDEPTH - // typedef int32_t tran_low_t; - const __m256i v_iscan_perm = _mm256_permute4x64_epi64(v_iscan, 0xD8); - const __m256i v_iscan_plus1 = _mm256_sub_epi16(v_iscan_perm, v_mask); + const __m256i v_iscan = _mm256_permute4x64_epi64( + _mm256_loadu_si256((const __m256i *)iscan), 0xD8); #else - // typedef int16_t tran_low_t; - const __m256i v_iscan_plus1 = _mm256_sub_epi16(v_iscan, v_mask); + const __m256i v_iscan = _mm256_loadu_si256((const __m256i *)iscan); #endif - const __m256i v_nz_iscan = _mm256_and_si256(v_iscan_plus1, v_mask); + const __m256i v_nz_iscan = _mm256_and_si256(v_iscan, v_mask); return _mm256_max_epi16(v_eobmax, v_nz_iscan); } @@ -303,8 +300,7 @@ static VPX_FORCE_INLINE __m256i highbd_get_max_lane_eob( _mm256_permute4x64_epi64(packed_nz_mask, 0xD8); const __m256i iscan = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)iscan_ptr)); - const __m256i iscan_plus1 = _mm256_sub_epi16(iscan, packed_nz_mask_perm); - const __m256i nz_iscan = _mm256_and_si256(iscan_plus1, packed_nz_mask_perm); + const __m256i nz_iscan = _mm256_and_si256(iscan, packed_nz_mask_perm); return _mm256_max_epi16(eobmax, nz_iscan); } diff --git a/vp9/encoder/x86/vp9_quantize_sse2.c b/vp9/encoder/x86/vp9_quantize_sse2.c index 4bcadaa6a1..0fd0dccc4f 100644 --- a/vp9/encoder/x86/vp9_quantize_sse2.c +++ b/vp9/encoder/x86/vp9_quantize_sse2.c @@ -99,9 +99,6 @@ void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); iscan0 = _mm_load_si128((const __m128i *)(iscan + n_coeffs)); iscan1 = _mm_load_si128((const __m128i *)(iscan + n_coeffs) + 1); - // Add one to convert from indices to counts - iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); - iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); eob = _mm_and_si128(iscan0, nzero_coeff0); eob1 = _mm_and_si128(iscan1, nzero_coeff1); eob = _mm_max_epi16(eob, eob1); @@ -174,9 +171,6 @@ void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); iscan0 = _mm_load_si128((const __m128i *)(iscan + n_coeffs)); iscan1 = _mm_load_si128((const __m128i *)(iscan + n_coeffs) + 1); - // Add one to convert from indices to counts - iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); - iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); eob0 = _mm_and_si128(iscan0, nzero_coeff0); eob1 = _mm_and_si128(iscan1, nzero_coeff1); eob0 = _mm_max_epi16(eob0, eob1); diff --git a/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm b/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm index 680acfec69..ae43a90f8b 100644 --- a/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm +++ b/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm @@ -91,8 +91,6 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, round, quant, \ pcmpeqw m13, m5 ; m13 = c[i] == 0 mova m6, [ r5q+ncoeffq*2+ 0] ; m6 = scan[i] mova m11, [ r5q+ncoeffq*2+16] ; m11 = scan[i] - psubw m6, m7 ; m6 = scan[i] + 1 - psubw m11, m7 ; m11 = scan[i] + 1 pandn m8, m6 ; m8 = max(eob) pandn m13, m11 ; m13 = max(eob) pmaxsw m8, m13 @@ -141,8 +139,6 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, round, quant, \ pcmpeqw m13, m5 ; m13 = c[i] == 0 mova m6, [ r5q+ncoeffq*2+ 0] ; m6 = scan[i] mova m11, [ r5q+ncoeffq*2+16] ; m11 = scan[i] - psubw m6, m7 ; m6 = scan[i] + 1 - psubw m11, m7 ; m11 = scan[i] + 1 pandn m14, m6 ; m14 = max(eob) pandn m13, m11 ; m13 = max(eob) pmaxsw m8, m14 diff --git a/vpx_dsp/arm/highbd_quantize_neon.c b/vpx_dsp/arm/highbd_quantize_neon.c index 502a9c972d..b9f72a94c5 100644 --- a/vpx_dsp/arm/highbd_quantize_neon.c +++ b/vpx_dsp/arm/highbd_quantize_neon.c @@ -101,7 +101,6 @@ void vpx_highbd_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { - const int16x8_t one = vdupq_n_s16(1); const int16x8_t neg_one = vdupq_n_s16(-1); uint16x8_t eob_max; @@ -119,9 +118,7 @@ void vpx_highbd_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, // Process first 8 values which include a dc component. { - // Add one because the eob does not index from 0. - const uint16x8_t v_iscan = - vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one)); + const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan)); const int16x8_t qcoeff = highbd_quantize_b_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round, @@ -148,9 +145,7 @@ void vpx_highbd_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, dequant = vdupq_lane_s32(vget_low_s32(dequant), 1); do { - // Add one because the eob is not its index. - const uint16x8_t v_iscan = - vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one)); + const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan)); const int16x8_t qcoeff = highbd_quantize_b_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, @@ -234,7 +229,6 @@ void vpx_highbd_quantize_b_32x32_neon( const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { - const int16x8_t one = vdupq_n_s16(1); const int16x8_t neg_one = vdupq_n_s16(-1); uint16x8_t eob_max; int i; @@ -253,9 +247,7 @@ void vpx_highbd_quantize_b_32x32_neon( // Process first 8 values which include a dc component. { - // Add one because the eob does not index from 0. - const uint16x8_t v_iscan = - vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one)); + const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan)); const int16x8_t qcoeff = highbd_quantize_b_32x32_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, @@ -279,9 +271,7 @@ void vpx_highbd_quantize_b_32x32_neon( dequant = vdupq_lane_s32(vget_low_s32(dequant), 1); for (i = 1; i < 32 * 32 / 8; ++i) { - // Add one because the eob is not its index. - const uint16x8_t v_iscan = - vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one)); + const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan)); const int16x8_t qcoeff = highbd_quantize_b_32x32_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, diff --git a/vpx_dsp/arm/quantize_neon.c b/vpx_dsp/arm/quantize_neon.c index dcdf588cbc..9c227d560f 100644 --- a/vpx_dsp/arm/quantize_neon.c +++ b/vpx_dsp/arm/quantize_neon.c @@ -75,7 +75,6 @@ void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { - const int16x8_t one = vdupq_n_s16(1); const int16x8_t neg_one = vdupq_n_s16(-1); uint16x8_t eob_max; @@ -88,9 +87,7 @@ void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, // Process first 8 values which include a dc component. { - // Add one because the eob does not index from 0. - const uint16x8_t v_iscan = - vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one)); + const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan)); const int16x8_t qcoeff = quantize_b_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round, quant, @@ -116,9 +113,7 @@ void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, dequant = vdupq_lane_s16(vget_low_s16(dequant), 1); do { - // Add one because the eob is not its index. - const uint16x8_t v_iscan = - vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one)); + const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan)); const int16x8_t qcoeff = quantize_b_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round, @@ -226,7 +221,6 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { - const int16x8_t one = vdupq_n_s16(1); const int16x8_t neg_one = vdupq_n_s16(-1); uint16x8_t eob_max; int i; @@ -240,9 +234,7 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, // Process first 8 values which include a dc component. { - // Add one because the eob does not index from 0. - const uint16x8_t v_iscan = - vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one)); + const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan)); const int16x8_t qcoeff = quantize_b_32x32_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round, @@ -266,9 +258,7 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, dequant = vdupq_lane_s16(vget_low_s16(dequant), 1); for (i = 1; i < 32 * 32 / 8; ++i) { - // Add one because the eob is not its index. - const uint16x8_t v_iscan = - vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one)); + const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan)); const int16x8_t qcoeff = quantize_b_32x32_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round, diff --git a/vpx_dsp/loongarch/quantize_lsx.c b/vpx_dsp/loongarch/quantize_lsx.c index 2fc33b06b7..77be0bb4fe 100644 --- a/vpx_dsp/loongarch/quantize_lsx.c +++ b/vpx_dsp/loongarch/quantize_lsx.c @@ -59,7 +59,6 @@ static INLINE void calculate_dqcoeff_and_store_32x32(__m128i qcoeff, } static INLINE __m128i scan_for_eob(__m128i coeff0, __m128i coeff1, - __m128i zbin_mask0, __m128i zbin_mask1, const int16_t *scan, int index, __m128i zero) { const __m128i zero_coeff0 = __lsx_vseq_h(coeff0, zero); @@ -68,8 +67,6 @@ static INLINE __m128i scan_for_eob(__m128i coeff0, __m128i coeff1, __m128i scan1 = __lsx_vld(scan + index + 8, 0); __m128i eob0, eob1; - scan0 = __lsx_vsub_h(scan0, zbin_mask0); - scan1 = __lsx_vsub_h(scan1, zbin_mask1); eob0 = __lsx_vandn_v(zero_coeff0, scan0); eob1 = __lsx_vandn_v(zero_coeff1, scan1); return __lsx_vmax_h(eob0, eob1); @@ -138,7 +135,7 @@ void vpx_quantize_b_lsx(const int16_t *coeff_ptr, intptr_t n_coeffs, dequant = __lsx_vilvh_d(dequant, dequant); calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + 8); - eob = scan_for_eob(qcoeff0, qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); + eob = scan_for_eob(qcoeff0, qcoeff1, iscan, 0, zero); // AC only loop. while (index < n_coeffs) { coeff0 = __lsx_vld(coeff_ptr + index, 0); @@ -161,8 +158,7 @@ void vpx_quantize_b_lsx(const int16_t *coeff_ptr, intptr_t n_coeffs, calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr + index); calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + index + 8); - eob0 = scan_for_eob(qcoeff0, qcoeff1, cmp_mask0, cmp_mask1, iscan, index, - zero); + eob0 = scan_for_eob(qcoeff0, qcoeff1, iscan, index, zero); eob = __lsx_vmax_h(eob, eob0); index += 16; @@ -221,7 +217,7 @@ void vpx_quantize_b_32x32_lsx(const int16_t *coeff_ptr, intptr_t n_coeffs, calculate_dqcoeff_and_store_32x32(qcoeff0, dequant, dqcoeff_ptr); dequant = __lsx_vilvh_d(dequant, dequant); calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, dqcoeff_ptr + 8); - eob = scan_for_eob(qcoeff0, qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); + eob = scan_for_eob(qcoeff0, qcoeff1, iscan, 0, zero); // AC only loop. for (index = 16; index < 32 * 32; index += 16) { coeff0 = __lsx_vld(coeff_ptr + index, 0); @@ -243,8 +239,7 @@ void vpx_quantize_b_32x32_lsx(const int16_t *coeff_ptr, intptr_t n_coeffs, calculate_dqcoeff_and_store_32x32(qcoeff0, dequant, dqcoeff_ptr + index); calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, dqcoeff_ptr + 8 + index); - eob0 = scan_for_eob(qcoeff0, qcoeff1, cmp_mask0, cmp_mask1, iscan, index, - zero); + eob0 = scan_for_eob(qcoeff0, qcoeff1, iscan, index, zero); eob = __lsx_vmax_h(eob, eob0); } diff --git a/vpx_dsp/ppc/quantize_vsx.c b/vpx_dsp/ppc/quantize_vsx.c index 7cdcbeb405..ab71f6e235 100644 --- a/vpx_dsp/ppc/quantize_vsx.c +++ b/vpx_dsp/ppc/quantize_vsx.c @@ -78,11 +78,10 @@ static INLINE int16x8_t dequantize_coeff_32(int16x8_t qcoeff, return (int16x8_t)vec_perm(dqcoeffe, dqcoeffo, vec_perm_odd_even_pack); } -static INLINE int16x8_t nonzero_scanindex(int16x8_t qcoeff, bool16x8_t mask, +static INLINE int16x8_t nonzero_scanindex(int16x8_t qcoeff, const int16_t *iscan_ptr, int index) { int16x8_t scan = vec_vsx_ld(index, iscan_ptr); bool16x8_t zero_coeff = vec_cmpeq(qcoeff, vec_zeros_s16); - scan = vec_sub(scan, mask); return vec_andc(scan, zero_coeff); } @@ -139,8 +138,8 @@ void vpx_quantize_b_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, dqcoeff1 = vec_mladd(qcoeff1, dequant, vec_zeros_s16); vec_vsx_st(dqcoeff1, 16, dqcoeff_ptr); - eob = vec_max(nonzero_scanindex(qcoeff0, zero_mask0, iscan_ptr, 0), - nonzero_scanindex(qcoeff1, zero_mask1, iscan_ptr, 16)); + eob = vec_max(nonzero_scanindex(qcoeff0, iscan_ptr, 0), + nonzero_scanindex(qcoeff1, iscan_ptr, 16)); if (n_coeffs > 16) { int index = 16; @@ -177,10 +176,9 @@ void vpx_quantize_b_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, vec_vsx_st(dqcoeff1, off1, dqcoeff_ptr); vec_vsx_st(dqcoeff2, off2, dqcoeff_ptr); - eob = - vec_max(eob, nonzero_scanindex(qcoeff0, zero_mask0, iscan_ptr, off0)); - eob2 = vec_max(nonzero_scanindex(qcoeff1, zero_mask1, iscan_ptr, off1), - nonzero_scanindex(qcoeff2, zero_mask2, iscan_ptr, off2)); + eob = vec_max(eob, nonzero_scanindex(qcoeff0, iscan_ptr, off0)); + eob2 = vec_max(nonzero_scanindex(qcoeff1, iscan_ptr, off1), + nonzero_scanindex(qcoeff2, iscan_ptr, off2)); eob = vec_max(eob, eob2); index += 24; @@ -252,8 +250,8 @@ void vpx_quantize_b_32x32_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, dequant = vec_splat(dequant, 1); // remove DC from dequant vec_vsx_st(dequantize_coeff_32(qcoeff1, dequant), 16, dqcoeff_ptr); - eob = vec_max(nonzero_scanindex(qcoeff0, zero_mask0, iscan_ptr, 0), - nonzero_scanindex(qcoeff1, zero_mask1, iscan_ptr, 16)); + eob = vec_max(nonzero_scanindex(qcoeff0, iscan_ptr, 0), + nonzero_scanindex(qcoeff1, iscan_ptr, 16)); do { int16x8_t coeff2, coeff2_abs, qcoeff2, eob2; @@ -286,9 +284,9 @@ void vpx_quantize_b_32x32_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, vec_vsx_st(dequantize_coeff_32(qcoeff1, dequant), off1, dqcoeff_ptr); vec_vsx_st(dequantize_coeff_32(qcoeff2, dequant), off2, dqcoeff_ptr); - eob = vec_max(eob, nonzero_scanindex(qcoeff0, zero_mask0, iscan_ptr, off0)); - eob2 = vec_max(nonzero_scanindex(qcoeff1, zero_mask1, iscan_ptr, off1), - nonzero_scanindex(qcoeff2, zero_mask2, iscan_ptr, off2)); + eob = vec_max(eob, nonzero_scanindex(qcoeff0, iscan_ptr, off0)); + eob2 = vec_max(nonzero_scanindex(qcoeff1, iscan_ptr, off1), + nonzero_scanindex(qcoeff2, iscan_ptr, off2)); eob = vec_max(eob, eob2); // 24 int16_t is 48 bytes diff --git a/vpx_dsp/x86/highbd_quantize_intrin_avx2.c b/vpx_dsp/x86/highbd_quantize_intrin_avx2.c index cbc715c046..8edddd637f 100644 --- a/vpx_dsp/x86/highbd_quantize_intrin_avx2.c +++ b/vpx_dsp/x86/highbd_quantize_intrin_avx2.c @@ -80,8 +80,7 @@ static VPX_FORCE_INLINE __m256i get_max_lane_eob(const int16_t *iscan_ptr, _mm256_permute4x64_epi64(packed_nz_mask, 0xD8); const __m256i iscan = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)iscan_ptr)); - const __m256i iscan_plus1 = _mm256_sub_epi16(iscan, packed_nz_mask_perm); - const __m256i nz_iscan = _mm256_and_si256(iscan_plus1, packed_nz_mask_perm); + const __m256i nz_iscan = _mm256_and_si256(iscan, packed_nz_mask_perm); return _mm256_max_epi16(eobmax, nz_iscan); } @@ -256,4 +255,4 @@ void vpx_highbd_quantize_b_32x32_avx2( } *eob_ptr = get_max_eob(eob); -} \ No newline at end of file +} diff --git a/vpx_dsp/x86/highbd_quantize_intrin_sse2.c b/vpx_dsp/x86/highbd_quantize_intrin_sse2.c index 1264fbed22..ae1981a834 100644 --- a/vpx_dsp/x86/highbd_quantize_intrin_sse2.c +++ b/vpx_dsp/x86/highbd_quantize_intrin_sse2.c @@ -25,7 +25,7 @@ void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { - int i, j, non_zero_regs = (int)count / 4, eob_i = -1; + int i, j, non_zero_regs = (int)count / 4, eob_i = 0; __m128i zbins[2]; __m128i nzbins[2]; @@ -89,7 +89,7 @@ void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count, } } } - *eob_ptr = eob_i + 1; + *eob_ptr = eob_i; } void vpx_highbd_quantize_b_32x32_sse2( @@ -102,7 +102,7 @@ void vpx_highbd_quantize_b_32x32_sse2( __m128i nzbins[2]; int idx = 0; int idx_arr[1024]; - int i, eob = -1; + int i, eob = 0; const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 1); const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 1); (void)scan; @@ -148,6 +148,6 @@ void vpx_highbd_quantize_b_32x32_sse2( dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob; } - *eob_ptr = eob + 1; + *eob_ptr = eob; } #endif diff --git a/vpx_dsp/x86/quantize_avx.c b/vpx_dsp/x86/quantize_avx.c index 706e4e6413..7d83527216 100644 --- a/vpx_dsp/x86/quantize_avx.c +++ b/vpx_dsp/x86/quantize_avx.c @@ -93,8 +93,7 @@ void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, dequant = _mm_unpackhi_epi64(dequant, dequant); calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + 8); - eob = - scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); + eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero); } // AC only loop. @@ -134,8 +133,7 @@ void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr + index); calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + index + 8); - eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index, - zero); + eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero); eob = _mm_max_epi16(eob, eob0); } @@ -229,8 +227,7 @@ void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, dequant = _mm_unpackhi_epi64(dequant, dequant); calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, zero, dqcoeff_ptr + 8); - eob = - scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); + eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero); } // AC only loop. @@ -272,8 +269,7 @@ void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, zero, dqcoeff_ptr + index + 8); - eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index, - zero); + eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero); eob = _mm_max_epi16(eob, eob0); } diff --git a/vpx_dsp/x86/quantize_avx2.c b/vpx_dsp/x86/quantize_avx2.c index 6fd5174876..28f7c9c7da 100644 --- a/vpx_dsp/x86/quantize_avx2.c +++ b/vpx_dsp/x86/quantize_avx2.c @@ -127,16 +127,13 @@ quantize_b_16(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, static VPX_FORCE_INLINE __m256i get_max_lane_eob(const int16_t *iscan, __m256i v_eobmax, __m256i v_mask) { - const __m256i v_iscan = _mm256_loadu_si256((const __m256i *)iscan); #if CONFIG_VP9_HIGHBITDEPTH - // typedef int32_t tran_low_t; - const __m256i v_iscan_perm = _mm256_permute4x64_epi64(v_iscan, 0xD8); - const __m256i v_iscan_plus1 = _mm256_sub_epi16(v_iscan_perm, v_mask); + const __m256i v_iscan = _mm256_permute4x64_epi64( + _mm256_loadu_si256((const __m256i *)iscan), 0xD8); #else - // typedef int16_t tran_low_t; - const __m256i v_iscan_plus1 = _mm256_sub_epi16(v_iscan, v_mask); + const __m256i v_iscan = _mm256_loadu_si256((const __m256i *)iscan); #endif - const __m256i v_nz_iscan = _mm256_and_si256(v_iscan_plus1, v_mask); + const __m256i v_nz_iscan = _mm256_and_si256(v_iscan, v_mask); return _mm256_max_epi16(v_eobmax, v_nz_iscan); } diff --git a/vpx_dsp/x86/quantize_sse2.c b/vpx_dsp/x86/quantize_sse2.c index 459d95f28b..9533e7916d 100644 --- a/vpx_dsp/x86/quantize_sse2.c +++ b/vpx_dsp/x86/quantize_sse2.c @@ -76,7 +76,7 @@ void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, dequant = _mm_unpackhi_epi64(dequant, dequant); calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + 8); - eob = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); + eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero); // AC only loop. while (index < n_coeffs) { @@ -106,8 +106,7 @@ void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr + index); calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + index + 8); - eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index, - zero); + eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero); eob = _mm_max_epi16(eob, eob0); index += 16; diff --git a/vpx_dsp/x86/quantize_sse2.h b/vpx_dsp/x86/quantize_sse2.h index afe2f924b3..580dd883f0 100644 --- a/vpx_dsp/x86/quantize_sse2.h +++ b/vpx_dsp/x86/quantize_sse2.h @@ -62,11 +62,8 @@ static INLINE void calculate_dqcoeff_and_store(__m128i qcoeff, __m128i dequant, #endif // CONFIG_VP9_HIGHBITDEPTH } -// Scan 16 values for eob reference in scan. Use masks (-1) from comparing to -// zbin to add 1 to the index in 'scan'. +// Scan 16 values for eob reference in scan. static INLINE __m128i scan_for_eob(__m128i *coeff0, __m128i *coeff1, - const __m128i zbin_mask0, - const __m128i zbin_mask1, const int16_t *scan, const int index, const __m128i zero) { const __m128i zero_coeff0 = _mm_cmpeq_epi16(*coeff0, zero); @@ -74,9 +71,6 @@ static INLINE __m128i scan_for_eob(__m128i *coeff0, __m128i *coeff1, __m128i scan0 = _mm_load_si128((const __m128i *)(scan + index)); __m128i scan1 = _mm_load_si128((const __m128i *)(scan + index + 8)); __m128i eob0, eob1; - // Add one to convert from indices to counts - scan0 = _mm_sub_epi16(scan0, zbin_mask0); - scan1 = _mm_sub_epi16(scan1, zbin_mask1); eob0 = _mm_andnot_si128(zero_coeff0, scan0); eob1 = _mm_andnot_si128(zero_coeff1, scan1); return _mm_max_epi16(eob0, eob1); diff --git a/vpx_dsp/x86/quantize_ssse3.c b/vpx_dsp/x86/quantize_ssse3.c index 9d2a88b7bc..476230286d 100644 --- a/vpx_dsp/x86/quantize_ssse3.c +++ b/vpx_dsp/x86/quantize_ssse3.c @@ -70,7 +70,7 @@ void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, dequant = _mm_unpackhi_epi64(dequant, dequant); calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + 8); - eob = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); + eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero); // AC only loop. while (index < n_coeffs) { @@ -98,8 +98,7 @@ void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr + index); calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + index + 8); - eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index, - zero); + eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero); eob = _mm_max_epi16(eob, eob0); index += 16; @@ -202,8 +201,7 @@ void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, dequant = _mm_unpackhi_epi64(dequant, dequant); calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, zero, dqcoeff_ptr + 8); - eob = - scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); + eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero); } // AC only loop. @@ -249,8 +247,7 @@ void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, zero, dqcoeff_ptr + 8 + index); - eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index, - zero); + eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero); eob = _mm_max_epi16(eob, eob0); } From 00608eb1dea3206b2ea60236b5aa990d6d2c317f Mon Sep 17 00:00:00 2001 From: Johann Date: Sat, 24 Sep 2022 10:55:52 +0900 Subject: [PATCH 0512/1000] quantize: add untested function vp9_quantize_fp_sse2 was only tested in non-hbd configuration. Missed when fixing this for vpx_quantize_b_sse2. Change-Id: Ide346e5727d74281c774f605c90d280050e0bf62 --- test/vp9_quantize_test.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc index a81775fd94..7bb0bee512 100644 --- a/test/vp9_quantize_test.cc +++ b/test/vp9_quantize_test.cc @@ -512,6 +512,8 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values( make_tuple(&vpx_quantize_b_sse2, &vpx_quantize_b_c, VPX_BITS_8, 16, false), + make_tuple(&QuantFPWrapper, + &QuantFPWrapper, VPX_BITS_8, 16, true), make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c, VPX_BITS_8, 16, false), make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c, From a1ba7188a8da368a4a3d80a0096e993945a534b1 Mon Sep 17 00:00:00 2001 From: Scott LaVarnway Date: Fri, 23 Sep 2022 09:17:18 -0700 Subject: [PATCH 0513/1000] vp9_rd.c quiet -Wstringop-overflow MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ../libvpx/vp9/encoder/vp9_rd.c:594:20: warning: writing 1 byte into a region of size 0 [-Wstringop-overflow=] 594 | t_above[i] = !!*(const uint32_t *)&above[i]; | ~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ../libvpx/vp9/encoder/vp9_rd.c:572:47: note: at offset [64, 254] into destination object ‘t_above’ of size [0, 16] 572 | ENTROPY_CONTEXT t_above[16], | ~~~~~~~~~~~~~~~~^~~~~~~~~~~ Change-Id: Ie9ef24e685af417cdd35f6aa7284805e422b6ae2 --- vp9/encoder/vp9_rd.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/vp9/encoder/vp9_rd.c b/vp9/encoder/vp9_rd.c index 9fa3ff1865..28f992f4b6 100644 --- a/vp9/encoder/vp9_rd.c +++ b/vp9/encoder/vp9_rd.c @@ -567,6 +567,12 @@ void vp9_model_rd_from_var_lapndz_vec(unsigned int var[MAX_MB_PLANE], } } +// Disable gcc 12.2 false positive warning. +// warning: writing 1 byte into a region of size 0 [-Wstringop-overflow=] +#if defined(__GNUC__) && !defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wstringop-overflow" +#endif void vp9_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size, const struct macroblockd_plane *pd, ENTROPY_CONTEXT t_above[16], @@ -604,6 +610,9 @@ void vp9_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size, break; } } +#if defined(__GNUC__) && !defined(__clang__) +#pragma GCC diagnostic pop +#endif void vp9_mv_pred(VP9_COMP *cpi, MACROBLOCK *x, uint8_t *ref_y_buffer, int ref_y_stride, int ref_frame, BLOCK_SIZE block_size) { From f74ce37a3abcdca68e5bc549fbd1e0a0af3f79c8 Mon Sep 17 00:00:00 2001 From: Johann Date: Sat, 24 Sep 2022 10:53:05 +0900 Subject: [PATCH 0514/1000] quantize: standardize vp9_quantize_fp_sse2 Match style for vpx_quantize_b_sse2 and prepare to rewrite ssse3 version in intrinsics. Need to evaluate the value of threshold breakout before going further. Change-Id: I9cfceb1bb0dc237cd6b73fc8d41d78bba444a15b --- vp9/encoder/x86/vp9_quantize_sse2.c | 205 +++++++++++----------------- vpx_dsp/x86/quantize_sse2.h | 9 ++ 2 files changed, 85 insertions(+), 129 deletions(-) diff --git a/vp9/encoder/x86/vp9_quantize_sse2.c b/vp9/encoder/x86/vp9_quantize_sse2.c index 0fd0dccc4f..da4cd9ee8f 100644 --- a/vp9/encoder/x86/vp9_quantize_sse2.c +++ b/vp9/encoder/x86/vp9_quantize_sse2.c @@ -16,13 +16,14 @@ #include "vpx/vpx_integer.h" #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_dsp/x86/bitdepth_conversion_sse2.h" +#include "vpx_dsp/x86/quantize_sse2.h" void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { - __m128i zero; + const __m128i zero = _mm_setzero_si128(); __m128i thr; int nzflag; __m128i eob; @@ -35,159 +36,105 @@ void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, qcoeff_ptr += n_coeffs; dqcoeff_ptr += n_coeffs; n_coeffs = -n_coeffs; - zero = _mm_setzero_si128(); + + // Setup global values. + load_fp_values(round_ptr, &round, quant_ptr, &quant, dequant_ptr, &dequant); { __m128i coeff0, coeff1; + __m128i coeff0_sign, coeff1_sign; + __m128i qcoeff0, qcoeff1; + // Do DC and first 15 AC. + coeff0 = load_tran_low(coeff_ptr + n_coeffs); + coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8); - // Setup global values - { - round = _mm_load_si128((const __m128i *)round_ptr); - quant = _mm_load_si128((const __m128i *)quant_ptr); - dequant = _mm_load_si128((const __m128i *)dequant_ptr); - } + // Poor man's abs(). + coeff0_sign = _mm_srai_epi16(coeff0, 15); + coeff1_sign = _mm_srai_epi16(coeff1, 15); + qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign); + + qcoeff0 = _mm_adds_epi16(qcoeff0, round); + qcoeff0 = _mm_mulhi_epi16(qcoeff0, quant); + + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + + qcoeff1 = _mm_adds_epi16(qcoeff1, round); + qcoeff1 = _mm_mulhi_epi16(qcoeff1, quant); + + // Reinsert signs. + qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign); + + store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs); + store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8); + + qcoeff0 = _mm_mullo_epi16(qcoeff0, dequant); + dequant = _mm_unpackhi_epi64(dequant, dequant); + qcoeff1 = _mm_mullo_epi16(qcoeff1, dequant); + + store_tran_low(qcoeff0, dqcoeff_ptr + n_coeffs); + store_tran_low(qcoeff1, dqcoeff_ptr + n_coeffs + 8); + + eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan + n_coeffs, 0, zero); + + n_coeffs += 8 * 2; + } + + thr = _mm_srai_epi16(dequant, 1); + + // AC only loop. + while (n_coeffs < 0) { + __m128i coeff0, coeff1; + __m128i coeff0_sign, coeff1_sign; + __m128i qcoeff0, qcoeff1; + + coeff0 = load_tran_low(coeff_ptr + n_coeffs); + coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8); + + // Poor man's abs(). + coeff0_sign = _mm_srai_epi16(coeff0, 15); + coeff1_sign = _mm_srai_epi16(coeff1, 15); + qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign); - { - __m128i coeff0_sign, coeff1_sign; - __m128i qcoeff0, qcoeff1; - __m128i qtmp0, qtmp1; - // Do DC and first 15 AC - coeff0 = load_tran_low(coeff_ptr + n_coeffs); - coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8); - - // Poor man's sign extract - coeff0_sign = _mm_srai_epi16(coeff0, 15); - coeff1_sign = _mm_srai_epi16(coeff1, 15); - qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); - qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); + nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) | + _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr)); + if (nzflag) { qcoeff0 = _mm_adds_epi16(qcoeff0, round); - round = _mm_unpackhi_epi64(round, round); qcoeff1 = _mm_adds_epi16(qcoeff1, round); - qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); - quant = _mm_unpackhi_epi64(quant, quant); - qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); + qcoeff0 = _mm_mulhi_epi16(qcoeff0, quant); + qcoeff1 = _mm_mulhi_epi16(qcoeff1, quant); - // Reinsert signs - qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign); - qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); + // Reinsert signs. + qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign); store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs); store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8); coeff0 = _mm_mullo_epi16(qcoeff0, dequant); - dequant = _mm_unpackhi_epi64(dequant, dequant); coeff1 = _mm_mullo_epi16(qcoeff1, dequant); store_tran_low(coeff0, dqcoeff_ptr + n_coeffs); store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8); - } - - { - // Scan for eob - __m128i zero_coeff0, zero_coeff1; - __m128i nzero_coeff0, nzero_coeff1; - __m128i iscan0, iscan1; - __m128i eob1; - zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); - zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); - nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); - nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); - iscan0 = _mm_load_si128((const __m128i *)(iscan + n_coeffs)); - iscan1 = _mm_load_si128((const __m128i *)(iscan + n_coeffs) + 1); - eob = _mm_and_si128(iscan0, nzero_coeff0); - eob1 = _mm_and_si128(iscan1, nzero_coeff1); - eob = _mm_max_epi16(eob, eob1); - } - n_coeffs += 8 * 2; - } - - thr = _mm_srai_epi16(dequant, 1); + } else { + store_zero_tran_low(qcoeff_ptr + n_coeffs); + store_zero_tran_low(qcoeff_ptr + n_coeffs + 8); - // AC only loop - while (n_coeffs < 0) { - __m128i coeff0, coeff1; - { - __m128i coeff0_sign, coeff1_sign; - __m128i qcoeff0, qcoeff1; - __m128i qtmp0, qtmp1; - - coeff0 = load_tran_low(coeff_ptr + n_coeffs); - coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8); - - // Poor man's sign extract - coeff0_sign = _mm_srai_epi16(coeff0, 15); - coeff1_sign = _mm_srai_epi16(coeff1, 15); - qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); - qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - - nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) | - _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr)); - - if (nzflag) { - qcoeff0 = _mm_adds_epi16(qcoeff0, round); - qcoeff1 = _mm_adds_epi16(qcoeff1, round); - qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); - qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); - - // Reinsert signs - qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign); - qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - - store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs); - store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8); - - coeff0 = _mm_mullo_epi16(qcoeff0, dequant); - coeff1 = _mm_mullo_epi16(qcoeff1, dequant); - - store_tran_low(coeff0, dqcoeff_ptr + n_coeffs); - store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8); - } else { - store_zero_tran_low(qcoeff_ptr + n_coeffs); - store_zero_tran_low(qcoeff_ptr + n_coeffs + 8); - - store_zero_tran_low(dqcoeff_ptr + n_coeffs); - store_zero_tran_low(dqcoeff_ptr + n_coeffs + 8); - } + store_zero_tran_low(dqcoeff_ptr + n_coeffs); + store_zero_tran_low(dqcoeff_ptr + n_coeffs + 8); } if (nzflag) { - // Scan for eob - __m128i zero_coeff0, zero_coeff1; - __m128i nzero_coeff0, nzero_coeff1; - __m128i iscan0, iscan1; - __m128i eob0, eob1; - zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); - zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); - nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); - nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); - iscan0 = _mm_load_si128((const __m128i *)(iscan + n_coeffs)); - iscan1 = _mm_load_si128((const __m128i *)(iscan + n_coeffs) + 1); - eob0 = _mm_and_si128(iscan0, nzero_coeff0); - eob1 = _mm_and_si128(iscan1, nzero_coeff1); - eob0 = _mm_max_epi16(eob0, eob1); + const __m128i eob0 = + scan_for_eob(&coeff0, &coeff1, iscan + n_coeffs, 0, zero); eob = _mm_max_epi16(eob, eob0); } n_coeffs += 8 * 2; } - // Accumulate EOB - { - __m128i eob_shuffled; - eob_shuffled = _mm_shuffle_epi32(eob, 0xe); - eob = _mm_max_epi16(eob, eob_shuffled); - eob_shuffled = _mm_shufflelo_epi16(eob, 0xe); - eob = _mm_max_epi16(eob, eob_shuffled); - eob_shuffled = _mm_shufflelo_epi16(eob, 0x1); - eob = _mm_max_epi16(eob, eob_shuffled); - *eob_ptr = _mm_extract_epi16(eob, 1); - } + *eob_ptr = accumulate_eob(eob); } diff --git a/vpx_dsp/x86/quantize_sse2.h b/vpx_dsp/x86/quantize_sse2.h index 580dd883f0..27bfb4e41b 100644 --- a/vpx_dsp/x86/quantize_sse2.h +++ b/vpx_dsp/x86/quantize_sse2.h @@ -29,6 +29,15 @@ static INLINE void load_b_values(const int16_t *zbin_ptr, __m128i *zbin, *shift = _mm_load_si128((const __m128i *)shift_ptr); } +static INLINE void load_fp_values(const int16_t *round_ptr, __m128i *round, + const int16_t *quant_ptr, __m128i *quant, + const int16_t *dequant_ptr, + __m128i *dequant) { + *round = _mm_load_si128((const __m128i *)round_ptr); + *quant = _mm_load_si128((const __m128i *)quant_ptr); + *dequant = _mm_load_si128((const __m128i *)dequant_ptr); +} + // With ssse3 and later abs() and sign() are preferred. static INLINE __m128i invert_sign_sse2(__m128i a, __m128i sign) { a = _mm_xor_si128(a, sign); From 87c7da21c2910875f2dc478abfbe96a2246fe93b Mon Sep 17 00:00:00 2001 From: Scott LaVarnway Date: Mon, 19 Sep 2022 05:09:23 -0700 Subject: [PATCH 0515/1000] vpx_subpixel_8t_intrin_avx2.c: quiet -Wuninitialized MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit warning: ‘s2[3]’ may be used uninitialized and warning: ‘s1[3]’ may be used uninitialized The warnings exposed unused code. Change-Id: I75cf1f9db75e811cb42e2f143be1ad76f3e4dee9 --- vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c | 32 +++-------------------- 1 file changed, 3 insertions(+), 29 deletions(-) diff --git a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c index db3c39de0f..c7d880860e 100644 --- a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c +++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c @@ -227,6 +227,9 @@ static INLINE void vpx_filter_block1d16_v8_x_avx2( s2[2] = _mm256_unpackhi_epi8(s32b[4], s32b[5]); } + // The output_height is always a multiple of two. + assert(!(output_height & 1)); + for (i = output_height; i > 1; i -= 2) { __m256i srcRegHead2, srcRegHead3; @@ -282,35 +285,6 @@ static INLINE void vpx_filter_block1d16_v8_x_avx2( s2[2] = s2[3]; srcRegHead1 = srcRegHead3; } - - // if the number of strides is odd. - // process only 16 bytes - if (i > 0) { - // load the last 16 bytes - const __m128i srcRegHead2 = - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)); - - // merge the last 2 results together - s1[0] = _mm256_castsi128_si256( - _mm_unpacklo_epi8(_mm256_castsi256_si128(srcRegHead1), srcRegHead2)); - s2[0] = _mm256_castsi128_si256( - _mm_unpackhi_epi8(_mm256_castsi256_si128(srcRegHead1), srcRegHead2)); - - outReg1 = convolve8_8_avx2(s1, f); - outReg2 = convolve8_8_avx2(s2, f); - - // shrink to 8 bit each 16 bits, the low and high 64-bits of each lane - // contain the first and second convolve result respectively - outReg1 = _mm_packus_epi16(outReg1, outReg2); - - // average if necessary - if (avg) { - outReg1 = _mm_avg_epu8(outReg1, _mm_load_si128((__m128i *)output_ptr)); - } - - // save 16 bytes - _mm_store_si128((__m128i *)output_ptr, outReg1); - } } static void vpx_filter_block1d16_v8_avx2(const uint8_t *src_ptr, From eeea3daacbf0c3f8e1bbfd2f9b67e4eda1badafc Mon Sep 17 00:00:00 2001 From: Johann Date: Sat, 1 Oct 2022 11:18:09 +0900 Subject: [PATCH 0516/1000] vp9 quantize: change index In assembly it made sense to iterate using n_coeffs. In intrinsics it's just as fast to use index and easier to read. Change-Id: I403c959709309dad68123d0a3d0efe183874543d --- vp9/encoder/x86/vp9_quantize_sse2.c | 99 ++++++++++++----------------- 1 file changed, 42 insertions(+), 57 deletions(-) diff --git a/vp9/encoder/x86/vp9_quantize_sse2.c b/vp9/encoder/x86/vp9_quantize_sse2.c index da4cd9ee8f..272e5fb079 100644 --- a/vp9/encoder/x86/vp9_quantize_sse2.c +++ b/vp9/encoder/x86/vp9_quantize_sse2.c @@ -26,72 +26,58 @@ void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const __m128i zero = _mm_setzero_si128(); __m128i thr; int nzflag; - __m128i eob; + int index = 16; __m128i round, quant, dequant; + __m128i coeff0, coeff1, coeff0_sign, coeff1_sign; + __m128i qcoeff0, qcoeff1; + __m128i eob; (void)scan; - coeff_ptr += n_coeffs; - iscan += n_coeffs; - qcoeff_ptr += n_coeffs; - dqcoeff_ptr += n_coeffs; - n_coeffs = -n_coeffs; - // Setup global values. load_fp_values(round_ptr, &round, quant_ptr, &quant, dequant_ptr, &dequant); - { - __m128i coeff0, coeff1; - __m128i coeff0_sign, coeff1_sign; - __m128i qcoeff0, qcoeff1; - // Do DC and first 15 AC. - coeff0 = load_tran_low(coeff_ptr + n_coeffs); - coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8); + // Do DC and first 15 AC. + coeff0 = load_tran_low(coeff_ptr); + coeff1 = load_tran_low(coeff_ptr + 8); - // Poor man's abs(). - coeff0_sign = _mm_srai_epi16(coeff0, 15); - coeff1_sign = _mm_srai_epi16(coeff1, 15); - qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign); - qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign); + // Poor man's abs(). + coeff0_sign = _mm_srai_epi16(coeff0, 15); + coeff1_sign = _mm_srai_epi16(coeff1, 15); + qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign); - qcoeff0 = _mm_adds_epi16(qcoeff0, round); - qcoeff0 = _mm_mulhi_epi16(qcoeff0, quant); + qcoeff0 = _mm_adds_epi16(qcoeff0, round); + qcoeff0 = _mm_mulhi_epi16(qcoeff0, quant); - round = _mm_unpackhi_epi64(round, round); - quant = _mm_unpackhi_epi64(quant, quant); + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); - qcoeff1 = _mm_adds_epi16(qcoeff1, round); - qcoeff1 = _mm_mulhi_epi16(qcoeff1, quant); + qcoeff1 = _mm_adds_epi16(qcoeff1, round); + qcoeff1 = _mm_mulhi_epi16(qcoeff1, quant); - // Reinsert signs. - qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign); - qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign); + // Reinsert signs. + qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign); - store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs); - store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8); + store_tran_low(qcoeff0, qcoeff_ptr); + store_tran_low(qcoeff1, qcoeff_ptr + 8); - qcoeff0 = _mm_mullo_epi16(qcoeff0, dequant); - dequant = _mm_unpackhi_epi64(dequant, dequant); - qcoeff1 = _mm_mullo_epi16(qcoeff1, dequant); + qcoeff0 = _mm_mullo_epi16(qcoeff0, dequant); + dequant = _mm_unpackhi_epi64(dequant, dequant); + qcoeff1 = _mm_mullo_epi16(qcoeff1, dequant); - store_tran_low(qcoeff0, dqcoeff_ptr + n_coeffs); - store_tran_low(qcoeff1, dqcoeff_ptr + n_coeffs + 8); + store_tran_low(qcoeff0, dqcoeff_ptr); + store_tran_low(qcoeff1, dqcoeff_ptr + 8); - eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan + n_coeffs, 0, zero); - - n_coeffs += 8 * 2; - } + eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero); thr = _mm_srai_epi16(dequant, 1); // AC only loop. - while (n_coeffs < 0) { - __m128i coeff0, coeff1; - __m128i coeff0_sign, coeff1_sign; - __m128i qcoeff0, qcoeff1; - - coeff0 = load_tran_low(coeff_ptr + n_coeffs); - coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8); + while (index < n_coeffs) { + coeff0 = load_tran_low(coeff_ptr + index); + coeff1 = load_tran_low(coeff_ptr + index + 8); // Poor man's abs(). coeff0_sign = _mm_srai_epi16(coeff0, 15); @@ -112,28 +98,27 @@ void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign); qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign); - store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs); - store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8); + store_tran_low(qcoeff0, qcoeff_ptr + index); + store_tran_low(qcoeff1, qcoeff_ptr + index + 8); coeff0 = _mm_mullo_epi16(qcoeff0, dequant); coeff1 = _mm_mullo_epi16(qcoeff1, dequant); - store_tran_low(coeff0, dqcoeff_ptr + n_coeffs); - store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8); + store_tran_low(coeff0, dqcoeff_ptr + index); + store_tran_low(coeff1, dqcoeff_ptr + index + 8); } else { - store_zero_tran_low(qcoeff_ptr + n_coeffs); - store_zero_tran_low(qcoeff_ptr + n_coeffs + 8); + store_zero_tran_low(qcoeff_ptr + index); + store_zero_tran_low(qcoeff_ptr + index + 8); - store_zero_tran_low(dqcoeff_ptr + n_coeffs); - store_zero_tran_low(dqcoeff_ptr + n_coeffs + 8); + store_zero_tran_low(dqcoeff_ptr + index); + store_zero_tran_low(dqcoeff_ptr + index + 8); } if (nzflag) { - const __m128i eob0 = - scan_for_eob(&coeff0, &coeff1, iscan + n_coeffs, 0, zero); + const __m128i eob0 = scan_for_eob(&coeff0, &coeff1, iscan, index, zero); eob = _mm_max_epi16(eob, eob0); } - n_coeffs += 8 * 2; + index += 16; } *eob_ptr = accumulate_eob(eob); From c03c882785dc96ed91799280e68f8998bec50b90 Mon Sep 17 00:00:00 2001 From: Scott LaVarnway Date: Wed, 5 Oct 2022 07:04:27 -0700 Subject: [PATCH 0517/1000] Add vpx_highbd_sad16x{32,16,8}_avx2. 1.9x to 2.4x faster than the sse2 version. Bug: b/245917257 Change-Id: I686452772f9b72233930de2207af36a0cd72e0bb --- test/sad_test.cc | 11 ++++ vpx_dsp/vpx_dsp.mk | 1 + vpx_dsp/vpx_dsp_rtcd_defs.pl | 6 +- vpx_dsp/x86/highbd_sad_avx2.c | 100 ++++++++++++++++++++++++++++++++++ 4 files changed, 115 insertions(+), 3 deletions(-) create mode 100644 vpx_dsp/x86/highbd_sad_avx2.c diff --git a/test/sad_test.cc b/test/sad_test.cc index 7e84ea0dbf..cd1dd0dd02 100644 --- a/test/sad_test.cc +++ b/test/sad_test.cc @@ -1064,6 +1064,17 @@ const SadMxNParam avx2_tests[] = { SadMxNParam(32, 64, &vpx_sad32x64_avx2), SadMxNParam(32, 32, &vpx_sad32x32_avx2), SadMxNParam(32, 16, &vpx_sad32x16_avx2), +#if CONFIG_VP9_HIGHBITDEPTH + SadMxNParam(16, 32, &vpx_highbd_sad16x32_avx2, 8), + SadMxNParam(16, 16, &vpx_highbd_sad16x16_avx2, 8), + SadMxNParam(16, 8, &vpx_highbd_sad16x8_avx2, 8), + SadMxNParam(16, 32, &vpx_highbd_sad16x32_avx2, 10), + SadMxNParam(16, 16, &vpx_highbd_sad16x16_avx2, 10), + SadMxNParam(16, 8, &vpx_highbd_sad16x8_avx2, 10), + SadMxNParam(16, 32, &vpx_highbd_sad16x32_avx2, 12), + SadMxNParam(16, 16, &vpx_highbd_sad16x16_avx2, 12), + SadMxNParam(16, 8, &vpx_highbd_sad16x8_avx2, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH }; INSTANTIATE_TEST_SUITE_P(AVX2, SADTest, ::testing::ValuesIn(avx2_tests)); diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index f9a5c97dd2..32d21e03f7 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -394,6 +394,7 @@ DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad4d_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad_sse2.asm DSP_SRCS-$(HAVE_NEON) += arm/highbd_sad_neon.c DSP_SRCS-$(HAVE_AVX2) += x86/highbd_sad4d_avx2.c +DSP_SRCS-$(HAVE_AVX2) += x86/highbd_sad_avx2.c endif # CONFIG_VP9_HIGHBITDEPTH endif # CONFIG_ENCODERS diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 527d0e6e74..004afb38f8 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -956,13 +956,13 @@ () specialize qw/vpx_highbd_sad32x16 sse2 neon/; add_proto qw/unsigned int vpx_highbd_sad16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vpx_highbd_sad16x32 sse2 neon/; + specialize qw/vpx_highbd_sad16x32 sse2 neon avx2/; add_proto qw/unsigned int vpx_highbd_sad16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vpx_highbd_sad16x16 sse2 neon/; + specialize qw/vpx_highbd_sad16x16 sse2 neon avx2/; add_proto qw/unsigned int vpx_highbd_sad16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vpx_highbd_sad16x8 sse2 neon/; + specialize qw/vpx_highbd_sad16x8 sse2 neon avx2/; add_proto qw/unsigned int vpx_highbd_sad8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; specialize qw/vpx_highbd_sad8x16 sse2 neon/; diff --git a/vpx_dsp/x86/highbd_sad_avx2.c b/vpx_dsp/x86/highbd_sad_avx2.c new file mode 100644 index 0000000000..36e9fa6c0f --- /dev/null +++ b/vpx_dsp/x86/highbd_sad_avx2.c @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include // AVX2 +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" + +static VPX_FORCE_INLINE unsigned int calc_final(const __m256i sums_32) { + const __m256i t0 = _mm256_add_epi32(sums_32, _mm256_srli_si256(sums_32, 8)); + const __m256i t1 = _mm256_add_epi32(t0, _mm256_srli_si256(t0, 4)); + const __m128i sum = _mm_add_epi32(_mm256_castsi256_si128(t1), + _mm256_extractf128_si256(t1, 1)); + return (unsigned int)_mm_cvtsi128_si32(sum); +} + +static VPX_FORCE_INLINE void highbd_sad16xH(__m256i *sums_16, + const uint16_t *src, int src_stride, + uint16_t *ref, int ref_stride, + int height) { + int i; + for (i = 0; i < height; i += 2) { + // load src and all ref[] + const __m256i s0 = _mm256_load_si256((const __m256i *)src); + const __m256i s1 = _mm256_load_si256((const __m256i *)(src + src_stride)); + const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref); + const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + ref_stride)); + // absolute differences between every ref[] to src + const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(r0, s0)); + const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(r1, s1)); + // sum every abs diff + *sums_16 = _mm256_add_epi16(*sums_16, abs_diff0); + *sums_16 = _mm256_add_epi16(*sums_16, abs_diff1); + + src += src_stride << 1; + ref += ref_stride << 1; + } +} + +unsigned int vpx_highbd_sad16x32_avx2(const uint8_t *src8_ptr, int src_stride, + const uint8_t *ref8_ptr, int ref_stride) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src8_ptr); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8_ptr); + __m256i sums_32 = _mm256_setzero_si256(); + int i; + + for (i = 0; i < 2; ++i) { + __m256i sums_16 = _mm256_setzero_si256(); + + highbd_sad16xH(&sums_16, src, src_stride, ref, ref_stride, 16); + + // sums_16 will outrange after 16 rows, so add current sums_16 to sums_32 + sums_32 = _mm256_add_epi32( + sums_32, + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); + + src += src_stride << 4; + ref += ref_stride << 4; + } + return calc_final(sums_32); +} + +unsigned int vpx_highbd_sad16x16_avx2(const uint8_t *src8_ptr, int src_stride, + const uint8_t *ref8_ptr, int ref_stride) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src8_ptr); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8_ptr); + __m256i sums_16 = _mm256_setzero_si256(); + + highbd_sad16xH(&sums_16, src, src_stride, ref, ref_stride, 16); + + { + const __m256i sums_32 = _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1))); + return calc_final(sums_32); + } +} + +unsigned int vpx_highbd_sad16x8_avx2(const uint8_t *src8_ptr, int src_stride, + const uint8_t *ref8_ptr, int ref_stride) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src8_ptr); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8_ptr); + __m256i sums_16 = _mm256_setzero_si256(); + + highbd_sad16xH(&sums_16, src, src_stride, ref, ref_stride, 8); + + { + const __m256i sums_32 = _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1))); + return calc_final(sums_32); + } +} From 4955b945d851cd86c287401d3bca846dae354d16 Mon Sep 17 00:00:00 2001 From: Scott LaVarnway Date: Wed, 5 Oct 2022 14:03:55 -0700 Subject: [PATCH 0518/1000] Add vpx_highbd_sad32x{64,32,16}_avx2. 2.7x to 3.1x faster than the sse2 version. Bug: b/245917257 Change-Id: Idff3284932f7ee89d036f38893205bf622a159a3 --- test/sad_test.cc | 9 +++++ vpx_dsp/vpx_dsp_rtcd_defs.pl | 6 ++-- vpx_dsp/x86/highbd_sad_avx2.c | 62 ++++++++++++++++++++++++++++++++++- 3 files changed, 73 insertions(+), 4 deletions(-) diff --git a/test/sad_test.cc b/test/sad_test.cc index cd1dd0dd02..4712c51f6d 100644 --- a/test/sad_test.cc +++ b/test/sad_test.cc @@ -1065,6 +1065,15 @@ const SadMxNParam avx2_tests[] = { SadMxNParam(32, 32, &vpx_sad32x32_avx2), SadMxNParam(32, 16, &vpx_sad32x16_avx2), #if CONFIG_VP9_HIGHBITDEPTH + SadMxNParam(32, 64, &vpx_highbd_sad32x64_avx2, 8), + SadMxNParam(32, 32, &vpx_highbd_sad32x32_avx2, 8), + SadMxNParam(32, 16, &vpx_highbd_sad32x16_avx2, 8), + SadMxNParam(32, 64, &vpx_highbd_sad32x64_avx2, 10), + SadMxNParam(32, 32, &vpx_highbd_sad32x32_avx2, 10), + SadMxNParam(32, 16, &vpx_highbd_sad32x16_avx2, 10), + SadMxNParam(32, 64, &vpx_highbd_sad32x64_avx2, 12), + SadMxNParam(32, 32, &vpx_highbd_sad32x32_avx2, 12), + SadMxNParam(32, 16, &vpx_highbd_sad32x16_avx2, 12), SadMxNParam(16, 32, &vpx_highbd_sad16x32_avx2, 8), SadMxNParam(16, 16, &vpx_highbd_sad16x16_avx2, 8), SadMxNParam(16, 8, &vpx_highbd_sad16x8_avx2, 8), diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 004afb38f8..d669b9999c 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -947,13 +947,13 @@ () specialize qw/vpx_highbd_sad64x32 sse2 neon/; add_proto qw/unsigned int vpx_highbd_sad32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vpx_highbd_sad32x64 sse2 neon/; + specialize qw/vpx_highbd_sad32x64 sse2 neon avx2/; add_proto qw/unsigned int vpx_highbd_sad32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vpx_highbd_sad32x32 sse2 neon/; + specialize qw/vpx_highbd_sad32x32 sse2 neon avx2/; add_proto qw/unsigned int vpx_highbd_sad32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vpx_highbd_sad32x16 sse2 neon/; + specialize qw/vpx_highbd_sad32x16 sse2 neon avx2/; add_proto qw/unsigned int vpx_highbd_sad16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; specialize qw/vpx_highbd_sad16x32 sse2 neon avx2/; diff --git a/vpx_dsp/x86/highbd_sad_avx2.c b/vpx_dsp/x86/highbd_sad_avx2.c index 36e9fa6c0f..eb0e3eec5f 100644 --- a/vpx_dsp/x86/highbd_sad_avx2.c +++ b/vpx_dsp/x86/highbd_sad_avx2.c @@ -7,7 +7,7 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#include // AVX2 +#include #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" @@ -19,6 +19,66 @@ static VPX_FORCE_INLINE unsigned int calc_final(const __m256i sums_32) { return (unsigned int)_mm_cvtsi128_si32(sum); } +static VPX_FORCE_INLINE void highbd_sad32xH(__m256i *sums_16, + const uint16_t *src, int src_stride, + uint16_t *ref, int ref_stride, + int height) { + int i; + for (i = 0; i < height; ++i) { + // load src and all ref[] + const __m256i s0 = _mm256_load_si256((const __m256i *)src); + const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16)); + const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref); + const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + 16)); + // absolute differences between every ref[] to src + const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(r0, s0)); + const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(r1, s1)); + // sum every abs diff + *sums_16 = _mm256_add_epi16(*sums_16, abs_diff0); + *sums_16 = _mm256_add_epi16(*sums_16, abs_diff1); + + src += src_stride; + ref += ref_stride; + } +} + +#define HIGHBD_SAD32XN(n) \ + unsigned int vpx_highbd_sad32x##n##_avx2( \ + const uint8_t *src8_ptr, int src_stride, const uint8_t *ref8_ptr, \ + int ref_stride) { \ + const uint16_t *src = CONVERT_TO_SHORTPTR(src8_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8_ptr); \ + __m256i sums_32 = _mm256_setzero_si256(); \ + int i; \ + \ + for (i = 0; i < (n / 8); ++i) { \ + __m256i sums_16 = _mm256_setzero_si256(); \ + \ + highbd_sad32xH(&sums_16, src, src_stride, ref, ref_stride, 8); \ + \ + /* sums_16 will outrange after 8 rows, so add current sums_16 to \ + * sums_32*/ \ + sums_32 = _mm256_add_epi32( \ + sums_32, \ + _mm256_add_epi32( \ + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), \ + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); \ + \ + src += src_stride << 3; \ + ref += ref_stride << 3; \ + } \ + return calc_final(sums_32); \ + } + +// 32x64 +HIGHBD_SAD32XN(64) + +// 32x32 +HIGHBD_SAD32XN(32) + +// 32x16 +HIGHBD_SAD32XN(16) + static VPX_FORCE_INLINE void highbd_sad16xH(__m256i *sums_16, const uint16_t *src, int src_stride, uint16_t *ref, int ref_stride, From 06b09ebd351deb35b5bdcf387904dcbecc3da02f Mon Sep 17 00:00:00 2001 From: Scott LaVarnway Date: Fri, 7 Oct 2022 05:53:50 -0700 Subject: [PATCH 0519/1000] Add vpx_highbd_sad64x{64,32}_avx2. ~2.8x faster than the sse2 version. Bug: b/245917257 Change-Id: Ibc8e5d030ec145c9a9b742fff98fbd9131c9ede4 --- test/sad_test.cc | 20 +++++++---- vpx_dsp/vpx_dsp_rtcd_defs.pl | 4 +-- vpx_dsp/x86/highbd_sad_avx2.c | 65 +++++++++++++++++++++++++++++++++++ 3 files changed, 81 insertions(+), 8 deletions(-) diff --git a/test/sad_test.cc b/test/sad_test.cc index 4712c51f6d..a8f04e6eb8 100644 --- a/test/sad_test.cc +++ b/test/sad_test.cc @@ -1065,21 +1065,29 @@ const SadMxNParam avx2_tests[] = { SadMxNParam(32, 32, &vpx_sad32x32_avx2), SadMxNParam(32, 16, &vpx_sad32x16_avx2), #if CONFIG_VP9_HIGHBITDEPTH + SadMxNParam(64, 64, &vpx_highbd_sad64x64_avx2, 8), + SadMxNParam(64, 32, &vpx_highbd_sad64x32_avx2, 8), SadMxNParam(32, 64, &vpx_highbd_sad32x64_avx2, 8), SadMxNParam(32, 32, &vpx_highbd_sad32x32_avx2, 8), SadMxNParam(32, 16, &vpx_highbd_sad32x16_avx2, 8), - SadMxNParam(32, 64, &vpx_highbd_sad32x64_avx2, 10), - SadMxNParam(32, 32, &vpx_highbd_sad32x32_avx2, 10), - SadMxNParam(32, 16, &vpx_highbd_sad32x16_avx2, 10), - SadMxNParam(32, 64, &vpx_highbd_sad32x64_avx2, 12), - SadMxNParam(32, 32, &vpx_highbd_sad32x32_avx2, 12), - SadMxNParam(32, 16, &vpx_highbd_sad32x16_avx2, 12), SadMxNParam(16, 32, &vpx_highbd_sad16x32_avx2, 8), SadMxNParam(16, 16, &vpx_highbd_sad16x16_avx2, 8), SadMxNParam(16, 8, &vpx_highbd_sad16x8_avx2, 8), + + SadMxNParam(64, 64, &vpx_highbd_sad64x64_avx2, 10), + SadMxNParam(64, 32, &vpx_highbd_sad64x32_avx2, 10), + SadMxNParam(32, 64, &vpx_highbd_sad32x64_avx2, 10), + SadMxNParam(32, 32, &vpx_highbd_sad32x32_avx2, 10), + SadMxNParam(32, 16, &vpx_highbd_sad32x16_avx2, 10), SadMxNParam(16, 32, &vpx_highbd_sad16x32_avx2, 10), SadMxNParam(16, 16, &vpx_highbd_sad16x16_avx2, 10), SadMxNParam(16, 8, &vpx_highbd_sad16x8_avx2, 10), + + SadMxNParam(64, 64, &vpx_highbd_sad64x64_avx2, 12), + SadMxNParam(64, 32, &vpx_highbd_sad64x32_avx2, 12), + SadMxNParam(32, 64, &vpx_highbd_sad32x64_avx2, 12), + SadMxNParam(32, 32, &vpx_highbd_sad32x32_avx2, 12), + SadMxNParam(32, 16, &vpx_highbd_sad32x16_avx2, 12), SadMxNParam(16, 32, &vpx_highbd_sad16x32_avx2, 12), SadMxNParam(16, 16, &vpx_highbd_sad16x16_avx2, 12), SadMxNParam(16, 8, &vpx_highbd_sad16x8_avx2, 12), diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index d669b9999c..34ee98197a 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -941,10 +941,10 @@ () # Single block SAD # add_proto qw/unsigned int vpx_highbd_sad64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vpx_highbd_sad64x64 sse2 neon/; + specialize qw/vpx_highbd_sad64x64 sse2 neon avx2/; add_proto qw/unsigned int vpx_highbd_sad64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vpx_highbd_sad64x32 sse2 neon/; + specialize qw/vpx_highbd_sad64x32 sse2 neon avx2/; add_proto qw/unsigned int vpx_highbd_sad32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; specialize qw/vpx_highbd_sad32x64 sse2 neon avx2/; diff --git a/vpx_dsp/x86/highbd_sad_avx2.c b/vpx_dsp/x86/highbd_sad_avx2.c index eb0e3eec5f..12ef2eb3e9 100644 --- a/vpx_dsp/x86/highbd_sad_avx2.c +++ b/vpx_dsp/x86/highbd_sad_avx2.c @@ -19,6 +19,71 @@ static VPX_FORCE_INLINE unsigned int calc_final(const __m256i sums_32) { return (unsigned int)_mm_cvtsi128_si32(sum); } +static VPX_FORCE_INLINE void highbd_sad64xH(__m256i *sums_16, + const uint16_t *src, int src_stride, + uint16_t *ref, int ref_stride, + int height) { + int i; + for (i = 0; i < height; ++i) { + // load src and all ref[] + const __m256i s0 = _mm256_load_si256((const __m256i *)src); + const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16)); + const __m256i s2 = _mm256_load_si256((const __m256i *)(src + 32)); + const __m256i s3 = _mm256_load_si256((const __m256i *)(src + 48)); + const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref); + const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + 16)); + const __m256i r2 = _mm256_loadu_si256((const __m256i *)(ref + 32)); + const __m256i r3 = _mm256_loadu_si256((const __m256i *)(ref + 48)); + // absolute differences between every ref[] to src + const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(r0, s0)); + const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(r1, s1)); + const __m256i abs_diff2 = _mm256_abs_epi16(_mm256_sub_epi16(r2, s2)); + const __m256i abs_diff3 = _mm256_abs_epi16(_mm256_sub_epi16(r3, s3)); + // sum every abs diff + *sums_16 = + _mm256_add_epi16(*sums_16, _mm256_add_epi16(abs_diff0, abs_diff1)); + *sums_16 = + _mm256_add_epi16(*sums_16, _mm256_add_epi16(abs_diff2, abs_diff3)); + + src += src_stride; + ref += ref_stride; + } +} + +#define HIGHBD_SAD64XN(n) \ + unsigned int vpx_highbd_sad64x##n##_avx2( \ + const uint8_t *src8_ptr, int src_stride, const uint8_t *ref8_ptr, \ + int ref_stride) { \ + const uint16_t *src = CONVERT_TO_SHORTPTR(src8_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8_ptr); \ + __m256i sums_32 = _mm256_setzero_si256(); \ + int i; \ + \ + for (i = 0; i < (n / 2); ++i) { \ + __m256i sums_16 = _mm256_setzero_si256(); \ + \ + highbd_sad64xH(&sums_16, src, src_stride, ref, ref_stride, 2); \ + \ + /* sums_16 will outrange after 2 rows, so add current sums_16 to \ + * sums_32*/ \ + sums_32 = _mm256_add_epi32( \ + sums_32, \ + _mm256_add_epi32( \ + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), \ + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); \ + \ + src += src_stride << 1; \ + ref += ref_stride << 1; \ + } \ + return calc_final(sums_32); \ + } + +// 64x64 +HIGHBD_SAD64XN(64) + +// 64x32 +HIGHBD_SAD64XN(32) + static VPX_FORCE_INLINE void highbd_sad32xH(__m256i *sums_16, const uint16_t *src, int src_stride, uint16_t *ref, int ref_stride, From 2d87b886a3f3d776ae13c3e36101a6e19f00bed6 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Thu, 6 Oct 2022 10:26:05 +0000 Subject: [PATCH 0520/1000] [NEON] highbd partial DCT functions Change-Id: I7dd4e698469562f5b1f948cc36f8403b490dcb6a --- test/dct_partial_test.cc | 10 ++++- vpx_dsp/arm/fdct_partial_neon.c | 65 +++++++++++++++++++++++++++++++++ vpx_dsp/vpx_dsp_rtcd_defs.pl | 4 ++ 3 files changed, 77 insertions(+), 2 deletions(-) diff --git a/test/dct_partial_test.cc b/test/dct_partial_test.cc index 8d0e3a912e..e57fa0f48b 100644 --- a/test/dct_partial_test.cc +++ b/test/dct_partial_test.cc @@ -145,11 +145,17 @@ INSTANTIATE_TEST_SUITE_P( #if CONFIG_VP9_HIGHBITDEPTH INSTANTIATE_TEST_SUITE_P( NEON, PartialFdctTest, - ::testing::Values(make_tuple(&vpx_fdct32x32_1_neon, 32, VPX_BITS_8), - make_tuple(&vpx_fdct16x16_1_neon, 16, VPX_BITS_8), + ::testing::Values(make_tuple(&vpx_highbd_fdct32x32_1_neon, 32, VPX_BITS_12), + make_tuple(&vpx_highbd_fdct32x32_1_neon, 32, VPX_BITS_10), + make_tuple(&vpx_highbd_fdct32x32_1_neon, 32, VPX_BITS_8), + make_tuple(&vpx_highbd_fdct16x16_1_neon, 16, VPX_BITS_12), + make_tuple(&vpx_highbd_fdct16x16_1_neon, 16, VPX_BITS_10), + make_tuple(&vpx_highbd_fdct16x16_1_neon, 16, VPX_BITS_8), make_tuple(&vpx_fdct8x8_1_neon, 8, VPX_BITS_12), make_tuple(&vpx_fdct8x8_1_neon, 8, VPX_BITS_10), make_tuple(&vpx_fdct8x8_1_neon, 8, VPX_BITS_8), + make_tuple(&vpx_fdct4x4_1_neon, 4, VPX_BITS_12), + make_tuple(&vpx_fdct4x4_1_neon, 4, VPX_BITS_10), make_tuple(&vpx_fdct4x4_1_neon, 4, VPX_BITS_8))); #else INSTANTIATE_TEST_SUITE_P( diff --git a/vpx_dsp/arm/fdct_partial_neon.c b/vpx_dsp/arm/fdct_partial_neon.c index 0a1cdca41d..718dba0d91 100644 --- a/vpx_dsp/arm/fdct_partial_neon.c +++ b/vpx_dsp/arm/fdct_partial_neon.c @@ -101,3 +101,68 @@ void vpx_fdct32x32_1_neon(const int16_t *input, tran_low_t *output, output[0] = (tran_low_t)(sum >> 3); output[1] = 0; } + +#if CONFIG_VP9_HIGHBITDEPTH + +void vpx_highbd_fdct16x16_1_neon(const int16_t *input, tran_low_t *output, + int stride) { + int32x4_t partial_sum[4] = { vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0), + vdupq_n_s32(0) }; + int32_t sum; + + int r = 0; + do { + const int16x8_t a = vld1q_s16(input); + const int16x8_t b = vld1q_s16(input + 8); + input += stride; + partial_sum[0] = vaddw_s16(partial_sum[0], vget_low_s16(a)); + partial_sum[1] = vaddw_s16(partial_sum[1], vget_high_s16(a)); + partial_sum[2] = vaddw_s16(partial_sum[2], vget_low_s16(b)); + partial_sum[3] = vaddw_s16(partial_sum[3], vget_high_s16(b)); + r++; + } while (r < 16); + + partial_sum[0] = vaddq_s32(partial_sum[0], partial_sum[1]); + partial_sum[2] = vaddq_s32(partial_sum[2], partial_sum[3]); + partial_sum[0] = vaddq_s32(partial_sum[0], partial_sum[2]); + sum = horizontal_add_int32x4(partial_sum[0]); + + output[0] = (tran_low_t)(sum >> 1); + output[1] = 0; +} + +void vpx_highbd_fdct32x32_1_neon(const int16_t *input, tran_low_t *output, + int stride) { + int32x4_t partial_sum[4] = { vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0), + vdupq_n_s32(0) }; + + int32_t sum; + + int r = 0; + do { + const int16x8_t a0 = vld1q_s16(input); + const int16x8_t a1 = vld1q_s16(input + 8); + const int16x8_t a2 = vld1q_s16(input + 16); + const int16x8_t a3 = vld1q_s16(input + 24); + input += stride; + partial_sum[0] = vaddw_s16(partial_sum[0], vget_low_s16(a0)); + partial_sum[0] = vaddw_s16(partial_sum[0], vget_high_s16(a0)); + partial_sum[1] = vaddw_s16(partial_sum[1], vget_low_s16(a1)); + partial_sum[1] = vaddw_s16(partial_sum[1], vget_high_s16(a1)); + partial_sum[2] = vaddw_s16(partial_sum[2], vget_low_s16(a2)); + partial_sum[2] = vaddw_s16(partial_sum[2], vget_high_s16(a2)); + partial_sum[3] = vaddw_s16(partial_sum[3], vget_low_s16(a3)); + partial_sum[3] = vaddw_s16(partial_sum[3], vget_high_s16(a3)); + r++; + } while (r < 32); + + partial_sum[0] = vaddq_s32(partial_sum[0], partial_sum[1]); + partial_sum[2] = vaddq_s32(partial_sum[2], partial_sum[3]); + partial_sum[0] = vaddq_s32(partial_sum[0], partial_sum[2]); + sum = horizontal_add_int32x4(partial_sum[0]); + + output[0] = (tran_low_t)(sum >> 3); + output[1] = 0; +} + +#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 004afb38f8..5dad78c950 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -527,6 +527,8 @@ () add_proto qw/void vpx_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/vpx_fdct4x4_1 sse2 neon/; + specialize qw/vpx_highbd_fdct4x4_1 neon/; + $vpx_highbd_fdct4x4_1_neon=vpx_fdct4x4_1_neon; add_proto qw/void vpx_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/vpx_fdct8x8 neon sse2/; @@ -563,6 +565,7 @@ () specialize qw/vpx_highbd_fdct16x16 sse2/; add_proto qw/void vpx_highbd_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vpx_highbd_fdct16x16_1 neon/; add_proto qw/void vpx_highbd_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/vpx_highbd_fdct32x32 sse2/; @@ -571,6 +574,7 @@ () specialize qw/vpx_highbd_fdct32x32_rd sse2/; add_proto qw/void vpx_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vpx_highbd_fdct32x32_1 neon/; } else { add_proto qw/void vpx_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/vpx_fdct4x4 neon sse2 msa lsx/; From 6f8537c4c8bc8cd0bdd4ed329959ab8c66fd2eba Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Thu, 6 Oct 2022 10:58:27 +0000 Subject: [PATCH 0521/1000] [NEON] move transpose_8x8 to reuse Change-Id: I3915b6c9971aedaac9c23f21fdb88bc271216208 --- vpx_dsp/arm/fdct16x16_neon.c | 6 +++--- vpx_dsp/arm/fdct16x16_neon.h | 36 ------------------------------------ vpx_dsp/arm/transpose_neon.h | 34 ++++++++++++++++++++++++++++++++++ 3 files changed, 37 insertions(+), 39 deletions(-) diff --git a/vpx_dsp/arm/fdct16x16_neon.c b/vpx_dsp/arm/fdct16x16_neon.c index 67f43246aa..5cccb6a64a 100644 --- a/vpx_dsp/arm/fdct16x16_neon.c +++ b/vpx_dsp/arm/fdct16x16_neon.c @@ -46,8 +46,8 @@ void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) { // Transpose top left and top right quarters into one contiguous location to // process to the top half. - transpose_8x8(&temp0[0], &temp2[0]); - transpose_8x8(&temp1[0], &temp2[8]); + transpose_s16_8x8_new(&temp0[0], &temp2[0]); + transpose_s16_8x8_new(&temp1[0], &temp2[8]); partial_round_shift(temp2); cross_input(temp2, temp3, 1); vpx_fdct16x16_body(temp3, temp2); @@ -61,7 +61,7 @@ void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) { // Transpose bottom left and bottom right quarters into one contiguous // location to process to the bottom half. - transpose_8x8(&temp0[8], &temp1[0]); + transpose_s16_8x8_new(&temp0[8], &temp1[0]); transpose_s16_8x8(&temp1[8], &temp1[9], &temp1[10], &temp1[11], &temp1[12], &temp1[13], &temp1[14], &temp1[15]); partial_round_shift(temp1); diff --git a/vpx_dsp/arm/fdct16x16_neon.h b/vpx_dsp/arm/fdct16x16_neon.h index 0dd21153fc..5ce74cdf41 100644 --- a/vpx_dsp/arm/fdct16x16_neon.h +++ b/vpx_dsp/arm/fdct16x16_neon.h @@ -174,42 +174,6 @@ static INLINE void butterfly_two_coeff(const int16x8_t a, const int16x8_t b, *sub = vcombine_s16(rounded2, rounded3); } -// Transpose 8x8 to a new location. Don't use transpose_neon.h because those -// are all in-place. -static INLINE void transpose_8x8(const int16x8_t *a /*[8]*/, - int16x8_t *b /*[8]*/) { - // Swap 16 bit elements. - const int16x8x2_t c0 = vtrnq_s16(a[0], a[1]); - const int16x8x2_t c1 = vtrnq_s16(a[2], a[3]); - const int16x8x2_t c2 = vtrnq_s16(a[4], a[5]); - const int16x8x2_t c3 = vtrnq_s16(a[6], a[7]); - - // Swap 32 bit elements. - const int32x4x2_t d0 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[0]), - vreinterpretq_s32_s16(c1.val[0])); - const int32x4x2_t d1 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[1]), - vreinterpretq_s32_s16(c1.val[1])); - const int32x4x2_t d2 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[0]), - vreinterpretq_s32_s16(c3.val[0])); - const int32x4x2_t d3 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[1]), - vreinterpretq_s32_s16(c3.val[1])); - - // Swap 64 bit elements - const int16x8x2_t e0 = vpx_vtrnq_s64_to_s16(d0.val[0], d2.val[0]); - const int16x8x2_t e1 = vpx_vtrnq_s64_to_s16(d1.val[0], d3.val[0]); - const int16x8x2_t e2 = vpx_vtrnq_s64_to_s16(d0.val[1], d2.val[1]); - const int16x8x2_t e3 = vpx_vtrnq_s64_to_s16(d1.val[1], d3.val[1]); - - b[0] = e0.val[0]; - b[1] = e1.val[0]; - b[2] = e2.val[0]; - b[3] = e3.val[0]; - b[4] = e0.val[1]; - b[5] = e1.val[1]; - b[6] = e2.val[1]; - b[7] = e3.val[1]; -} - // Main body of fdct16x16. static void vpx_fdct16x16_body(const int16x8_t *in /*[16]*/, int16x8_t *out /*[16]*/) { diff --git a/vpx_dsp/arm/transpose_neon.h b/vpx_dsp/arm/transpose_neon.h index c098ad31b6..bf06d6abe2 100644 --- a/vpx_dsp/arm/transpose_neon.h +++ b/vpx_dsp/arm/transpose_neon.h @@ -568,6 +568,40 @@ static INLINE void transpose_u8_8x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2, *a7 = vreinterpret_u8_u32(vget_high_u32(d1.val[1])); } +// Transpose 8x8 to a new location. +static INLINE void transpose_s16_8x8_new(const int16x8_t *a, int16x8_t *b) { + // Swap 16 bit elements. + const int16x8x2_t c0 = vtrnq_s16(a[0], a[1]); + const int16x8x2_t c1 = vtrnq_s16(a[2], a[3]); + const int16x8x2_t c2 = vtrnq_s16(a[4], a[5]); + const int16x8x2_t c3 = vtrnq_s16(a[6], a[7]); + + // Swap 32 bit elements. + const int32x4x2_t d0 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[0]), + vreinterpretq_s32_s16(c1.val[0])); + const int32x4x2_t d1 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[1]), + vreinterpretq_s32_s16(c1.val[1])); + const int32x4x2_t d2 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[0]), + vreinterpretq_s32_s16(c3.val[0])); + const int32x4x2_t d3 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[1]), + vreinterpretq_s32_s16(c3.val[1])); + + // Swap 64 bit elements + const int16x8x2_t e0 = vpx_vtrnq_s64_to_s16(d0.val[0], d2.val[0]); + const int16x8x2_t e1 = vpx_vtrnq_s64_to_s16(d1.val[0], d3.val[0]); + const int16x8x2_t e2 = vpx_vtrnq_s64_to_s16(d0.val[1], d2.val[1]); + const int16x8x2_t e3 = vpx_vtrnq_s64_to_s16(d1.val[1], d3.val[1]); + + b[0] = e0.val[0]; + b[1] = e1.val[0]; + b[2] = e2.val[0]; + b[3] = e3.val[0]; + b[4] = e0.val[1]; + b[5] = e1.val[1]; + b[6] = e2.val[1]; + b[7] = e3.val[1]; +} + static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2, int16x8_t *a3, int16x8_t *a4, int16x8_t *a5, From af274914f2187de21ea7cf29d67756ade06e9760 Mon Sep 17 00:00:00 2001 From: Scott LaVarnway Date: Mon, 10 Oct 2022 12:20:37 -0700 Subject: [PATCH 0522/1000] SADavgTest: Add speed test. Change-Id: Ie14c0f6d15f410adf749f7ab74cf9f2bf35f3d5f --- test/sad_test.cc | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/test/sad_test.cc b/test/sad_test.cc index 4712c51f6d..b7bf2fc4ca 100644 --- a/test/sad_test.cc +++ b/test/sad_test.cc @@ -317,7 +317,7 @@ class SADTest : public AbstractBench, public SADTestBase { } }; -class SADavgTest : public SADTestBase { +class SADavgTest : public AbstractBench, public SADTestBase { public: SADavgTest() : SADTestBase(GetParam()) {} @@ -338,6 +338,11 @@ class SADavgTest : public SADTestBase { ASSERT_EQ(reference_sad, exp_sad); } + + void Run() { + params_.func(source_data_, source_stride_, reference_data_, + reference_stride_, second_pred_); + } }; TEST_P(SADTest, MaxRef) { @@ -437,6 +442,19 @@ TEST_P(SADavgTest, ShortSrc) { source_stride_ = tmp_stride; } +TEST_P(SADavgTest, DISABLED_Speed) { + const int kCountSpeedTestBlock = 50000000 / (params_.width * params_.height); + FillRandom(source_data_, source_stride_); + FillRandom(reference_data_, reference_stride_); + FillRandom(second_pred_, params_.width); + + RunNTimes(kCountSpeedTestBlock); + + char title[16]; + snprintf(title, sizeof(title), "%dx%d", params_.width, params_.height); + PrintMedian(title); +} + TEST_P(SADx4Test, MaxRef) { FillConstant(source_data_, source_stride_, 0); FillConstant(GetReference(0), reference_stride_, mask_); From f538a022441bdb760c3b8ad835e209a71e31d8b9 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Thu, 6 Oct 2022 13:05:01 +0000 Subject: [PATCH 0523/1000] [NEON] Move helper functions for reuse Move all butterfly functions to fdct_neon.h Slightly optimize load/scale/cross functions in fdct 16x16. These will be reused in highbd variants. Change-Id: I28b6e0cc240304bab6b94d9c3f33cca77b8cb073 --- vpx_dsp/arm/fdct16x16_neon.c | 12 +- vpx_dsp/arm/fdct16x16_neon.h | 135 +++++----- vpx_dsp/arm/fdct32x32_neon.c | 231 +++--------------- vpx_dsp/arm/{fdct_neon.c => fdct4x4_neon.c} | 0 .../arm/{fwd_txfm_neon.c => fdct8x8_neon.c} | 0 vpx_dsp/arm/fdct_neon.h | 130 ++++++++++ vpx_dsp/vpx_dsp.mk | 4 +- 7 files changed, 231 insertions(+), 281 deletions(-) rename vpx_dsp/arm/{fdct_neon.c => fdct4x4_neon.c} (100%) rename vpx_dsp/arm/{fwd_txfm_neon.c => fdct8x8_neon.c} (100%) diff --git a/vpx_dsp/arm/fdct16x16_neon.c b/vpx_dsp/arm/fdct16x16_neon.c index 5cccb6a64a..0b0ce223db 100644 --- a/vpx_dsp/arm/fdct16x16_neon.c +++ b/vpx_dsp/arm/fdct16x16_neon.c @@ -35,13 +35,13 @@ void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) { int16x8_t temp3[16]; // Left half. - load(input, stride, temp0); - cross_input(temp0, temp1, 0); + load_cross(input, stride, temp0); + scale_input(temp0, temp1); vpx_fdct16x16_body(temp1, temp0); // Right half. - load(input + 8, stride, temp1); - cross_input(temp1, temp2, 0); + load_cross(input + 8, stride, temp1); + scale_input(temp1, temp2); vpx_fdct16x16_body(temp2, temp1); // Transpose top left and top right quarters into one contiguous location to @@ -49,7 +49,7 @@ void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) { transpose_s16_8x8_new(&temp0[0], &temp2[0]); transpose_s16_8x8_new(&temp1[0], &temp2[8]); partial_round_shift(temp2); - cross_input(temp2, temp3, 1); + cross_input(temp2, temp3); vpx_fdct16x16_body(temp3, temp2); transpose_s16_8x8(&temp2[0], &temp2[1], &temp2[2], &temp2[3], &temp2[4], &temp2[5], &temp2[6], &temp2[7]); @@ -65,7 +65,7 @@ void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) { transpose_s16_8x8(&temp1[8], &temp1[9], &temp1[10], &temp1[11], &temp1[12], &temp1[13], &temp1[14], &temp1[15]); partial_round_shift(temp1); - cross_input(temp1, temp0, 1); + cross_input(temp1, temp0); vpx_fdct16x16_body(temp0, temp1); transpose_s16_8x8(&temp1[0], &temp1[1], &temp1[2], &temp1[3], &temp1[4], &temp1[5], &temp1[6], &temp1[7]); diff --git a/vpx_dsp/arm/fdct16x16_neon.h b/vpx_dsp/arm/fdct16x16_neon.h index 5ce74cdf41..7fc2c6e7e8 100644 --- a/vpx_dsp/arm/fdct16x16_neon.h +++ b/vpx_dsp/arm/fdct16x16_neon.h @@ -13,6 +13,8 @@ #include +#include "fdct_neon.h" + static INLINE void load(const int16_t *a, int stride, int16x8_t *b /*[16]*/) { b[0] = vld1q_s16(a); a += stride; @@ -72,45 +74,67 @@ static INLINE void store(tran_low_t *a, const int16x8_t *b /*[8]*/) { // To maybe reduce register usage this could be combined with the load() step to // get the first 4 and last 4 values, cross those, then load the middle 8 values // and cross them. +static INLINE void scale_input(const int16x8_t *a /*[16]*/, + int16x8_t *b /*[16]*/) { + b[0] = vshlq_n_s16(a[0], 2); + b[1] = vshlq_n_s16(a[1], 2); + b[2] = vshlq_n_s16(a[2], 2); + b[3] = vshlq_n_s16(a[3], 2); + b[4] = vshlq_n_s16(a[4], 2); + b[5] = vshlq_n_s16(a[5], 2); + b[6] = vshlq_n_s16(a[6], 2); + b[7] = vshlq_n_s16(a[7], 2); + + b[8] = vshlq_n_s16(a[8], 2); + b[9] = vshlq_n_s16(a[9], 2); + b[10] = vshlq_n_s16(a[10], 2); + b[11] = vshlq_n_s16(a[11], 2); + b[12] = vshlq_n_s16(a[12], 2); + b[13] = vshlq_n_s16(a[13], 2); + b[14] = vshlq_n_s16(a[14], 2); + b[15] = vshlq_n_s16(a[15], 2); +} + static INLINE void cross_input(const int16x8_t *a /*[16]*/, - int16x8_t *b /*[16]*/, const int pass) { - if (pass == 0) { - b[0] = vshlq_n_s16(vaddq_s16(a[0], a[15]), 2); - b[1] = vshlq_n_s16(vaddq_s16(a[1], a[14]), 2); - b[2] = vshlq_n_s16(vaddq_s16(a[2], a[13]), 2); - b[3] = vshlq_n_s16(vaddq_s16(a[3], a[12]), 2); - b[4] = vshlq_n_s16(vaddq_s16(a[4], a[11]), 2); - b[5] = vshlq_n_s16(vaddq_s16(a[5], a[10]), 2); - b[6] = vshlq_n_s16(vaddq_s16(a[6], a[9]), 2); - b[7] = vshlq_n_s16(vaddq_s16(a[7], a[8]), 2); + int16x8_t *b /*[16]*/) { + b[0] = vaddq_s16(a[0], a[15]); + b[1] = vaddq_s16(a[1], a[14]); + b[2] = vaddq_s16(a[2], a[13]); + b[3] = vaddq_s16(a[3], a[12]); + b[4] = vaddq_s16(a[4], a[11]); + b[5] = vaddq_s16(a[5], a[10]); + b[6] = vaddq_s16(a[6], a[9]); + b[7] = vaddq_s16(a[7], a[8]); + + b[8] = vsubq_s16(a[7], a[8]); + b[9] = vsubq_s16(a[6], a[9]); + b[10] = vsubq_s16(a[5], a[10]); + b[11] = vsubq_s16(a[4], a[11]); + b[12] = vsubq_s16(a[3], a[12]); + b[13] = vsubq_s16(a[2], a[13]); + b[14] = vsubq_s16(a[1], a[14]); + b[15] = vsubq_s16(a[0], a[15]); +} - b[8] = vshlq_n_s16(vsubq_s16(a[7], a[8]), 2); - b[9] = vshlq_n_s16(vsubq_s16(a[6], a[9]), 2); - b[10] = vshlq_n_s16(vsubq_s16(a[5], a[10]), 2); - b[11] = vshlq_n_s16(vsubq_s16(a[4], a[11]), 2); - b[12] = vshlq_n_s16(vsubq_s16(a[3], a[12]), 2); - b[13] = vshlq_n_s16(vsubq_s16(a[2], a[13]), 2); - b[14] = vshlq_n_s16(vsubq_s16(a[1], a[14]), 2); - b[15] = vshlq_n_s16(vsubq_s16(a[0], a[15]), 2); - } else { - b[0] = vaddq_s16(a[0], a[15]); - b[1] = vaddq_s16(a[1], a[14]); - b[2] = vaddq_s16(a[2], a[13]); - b[3] = vaddq_s16(a[3], a[12]); - b[4] = vaddq_s16(a[4], a[11]); - b[5] = vaddq_s16(a[5], a[10]); - b[6] = vaddq_s16(a[6], a[9]); - b[7] = vaddq_s16(a[7], a[8]); +static INLINE void load_cross(const int16_t *a, int stride, + int16x8_t *b /*[16]*/) { + b[0] = vaddq_s16(vld1q_s16(a + 0 * stride), vld1q_s16(a + 15 * stride)); + b[1] = vaddq_s16(vld1q_s16(a + 1 * stride), vld1q_s16(a + 14 * stride)); + b[2] = vaddq_s16(vld1q_s16(a + 2 * stride), vld1q_s16(a + 13 * stride)); + b[3] = vaddq_s16(vld1q_s16(a + 3 * stride), vld1q_s16(a + 12 * stride)); + b[4] = vaddq_s16(vld1q_s16(a + 4 * stride), vld1q_s16(a + 11 * stride)); + b[5] = vaddq_s16(vld1q_s16(a + 5 * stride), vld1q_s16(a + 10 * stride)); + b[6] = vaddq_s16(vld1q_s16(a + 6 * stride), vld1q_s16(a + 9 * stride)); + b[7] = vaddq_s16(vld1q_s16(a + 7 * stride), vld1q_s16(a + 8 * stride)); - b[8] = vsubq_s16(a[7], a[8]); - b[9] = vsubq_s16(a[6], a[9]); - b[10] = vsubq_s16(a[5], a[10]); - b[11] = vsubq_s16(a[4], a[11]); - b[12] = vsubq_s16(a[3], a[12]); - b[13] = vsubq_s16(a[2], a[13]); - b[14] = vsubq_s16(a[1], a[14]); - b[15] = vsubq_s16(a[0], a[15]); - } + b[8] = vsubq_s16(vld1q_s16(a + 7 * stride), vld1q_s16(a + 8 * stride)); + b[9] = vsubq_s16(vld1q_s16(a + 6 * stride), vld1q_s16(a + 9 * stride)); + b[10] = vsubq_s16(vld1q_s16(a + 5 * stride), vld1q_s16(a + 10 * stride)); + b[11] = vsubq_s16(vld1q_s16(a + 4 * stride), vld1q_s16(a + 11 * stride)); + b[12] = vsubq_s16(vld1q_s16(a + 3 * stride), vld1q_s16(a + 12 * stride)); + b[13] = vsubq_s16(vld1q_s16(a + 2 * stride), vld1q_s16(a + 13 * stride)); + b[14] = vsubq_s16(vld1q_s16(a + 1 * stride), vld1q_s16(a + 14 * stride)); + b[15] = vsubq_s16(vld1q_s16(a + 0 * stride), vld1q_s16(a + 15 * stride)); } // Quarter round at the beginning of the second pass. Can't use vrshr (rounding) @@ -135,45 +159,6 @@ static INLINE void partial_round_shift(int16x8_t *a /*[16]*/) { a[15] = vshrq_n_s16(vaddq_s16(a[15], one), 2); } -// fdct_round_shift((a +/- b) * c) -static INLINE void butterfly_one_coeff(const int16x8_t a, const int16x8_t b, - const tran_high_t c, int16x8_t *add, - int16x8_t *sub) { - const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), c); - const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), c); - const int32x4_t sum0 = vmlal_n_s16(a0, vget_low_s16(b), c); - const int32x4_t sum1 = vmlal_n_s16(a1, vget_high_s16(b), c); - const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), c); - const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), c); - const int16x4_t rounded0 = vqrshrn_n_s32(sum0, 14); - const int16x4_t rounded1 = vqrshrn_n_s32(sum1, 14); - const int16x4_t rounded2 = vqrshrn_n_s32(diff0, 14); - const int16x4_t rounded3 = vqrshrn_n_s32(diff1, 14); - *add = vcombine_s16(rounded0, rounded1); - *sub = vcombine_s16(rounded2, rounded3); -} - -// fdct_round_shift(a * c0 +/- b * c1) -static INLINE void butterfly_two_coeff(const int16x8_t a, const int16x8_t b, - const tran_coef_t c0, - const tran_coef_t c1, int16x8_t *add, - int16x8_t *sub) { - const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), c0); - const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), c0); - const int32x4_t a2 = vmull_n_s16(vget_low_s16(a), c1); - const int32x4_t a3 = vmull_n_s16(vget_high_s16(a), c1); - const int32x4_t sum0 = vmlal_n_s16(a2, vget_low_s16(b), c0); - const int32x4_t sum1 = vmlal_n_s16(a3, vget_high_s16(b), c0); - const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), c1); - const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), c1); - const int16x4_t rounded0 = vqrshrn_n_s32(sum0, 14); - const int16x4_t rounded1 = vqrshrn_n_s32(sum1, 14); - const int16x4_t rounded2 = vqrshrn_n_s32(diff0, 14); - const int16x4_t rounded3 = vqrshrn_n_s32(diff1, 14); - *add = vcombine_s16(rounded0, rounded1); - *sub = vcombine_s16(rounded2, rounded3); -} - // Main body of fdct16x16. static void vpx_fdct16x16_body(const int16x8_t *in /*[16]*/, int16x8_t *out /*[16]*/) { diff --git a/vpx_dsp/arm/fdct32x32_neon.c b/vpx_dsp/arm/fdct32x32_neon.c index de74e6630b..51d81bd085 100644 --- a/vpx_dsp/arm/fdct32x32_neon.c +++ b/vpx_dsp/arm/fdct32x32_neon.c @@ -15,6 +15,7 @@ #include "vpx_dsp/txfm_common.h" #include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/arm/fdct_neon.h" // Most gcc 4.9 distributions outside of Android do not generate correct code // for this function. @@ -194,54 +195,6 @@ static INLINE void store(tran_low_t *a, const int16x8_t *b) { #undef STORE_S16 -// fdct_round_shift((a +/- b) * c) -static INLINE void butterfly_one_coeff(const int16x8_t a, const int16x8_t b, - const tran_high_t constant, - int16x8_t *add, int16x8_t *sub) { - const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), constant); - const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), constant); - const int32x4_t sum0 = vmlal_n_s16(a0, vget_low_s16(b), constant); - const int32x4_t sum1 = vmlal_n_s16(a1, vget_high_s16(b), constant); - const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), constant); - const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), constant); - const int16x4_t rounded0 = vqrshrn_n_s32(sum0, DCT_CONST_BITS); - const int16x4_t rounded1 = vqrshrn_n_s32(sum1, DCT_CONST_BITS); - const int16x4_t rounded2 = vqrshrn_n_s32(diff0, DCT_CONST_BITS); - const int16x4_t rounded3 = vqrshrn_n_s32(diff1, DCT_CONST_BITS); - *add = vcombine_s16(rounded0, rounded1); - *sub = vcombine_s16(rounded2, rounded3); -} - -// fdct_round_shift(a * c0 +/- b * c1) -static INLINE void butterfly_two_coeff(const int16x8_t a, const int16x8_t b, - const tran_coef_t constant0, - const tran_coef_t constant1, - int16x8_t *add, int16x8_t *sub) { - const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), constant0); - const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), constant0); - const int32x4_t a2 = vmull_n_s16(vget_low_s16(a), constant1); - const int32x4_t a3 = vmull_n_s16(vget_high_s16(a), constant1); - const int32x4_t sum0 = vmlal_n_s16(a2, vget_low_s16(b), constant0); - const int32x4_t sum1 = vmlal_n_s16(a3, vget_high_s16(b), constant0); - const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), constant1); - const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), constant1); - const int16x4_t rounded0 = vqrshrn_n_s32(sum0, DCT_CONST_BITS); - const int16x4_t rounded1 = vqrshrn_n_s32(sum1, DCT_CONST_BITS); - const int16x4_t rounded2 = vqrshrn_n_s32(diff0, DCT_CONST_BITS); - const int16x4_t rounded3 = vqrshrn_n_s32(diff1, DCT_CONST_BITS); - *add = vcombine_s16(rounded0, rounded1); - *sub = vcombine_s16(rounded2, rounded3); -} - -// Add 2 if positive, 1 if negative, and shift by 2. -// In practice, subtract the sign bit, then shift with rounding. -static INLINE int16x8_t sub_round_shift(const int16x8_t a) { - const uint16x8_t a_u16 = vreinterpretq_u16_s16(a); - const uint16x8_t a_sign_u16 = vshrq_n_u16(a_u16, 15); - const int16x8_t a_sign_s16 = vreinterpretq_s16_u16(a_sign_u16); - return vrshrq_n_s16(vsubq_s16(a, a_sign_s16), 2); -} - static void dct_body_first_pass(const int16x8_t *in, int16x8_t *out) { int16x8_t a[32]; int16x8_t b[32]; @@ -562,23 +515,6 @@ static void dct_body_first_pass(const int16x8_t *in, int16x8_t *out) { b##_hi[b_index] = vsubq_s32(a##_hi[left_index], a##_hi[right_index]); \ } while (0) -// Like butterfly_one_coeff, but don't narrow results. -static INLINE void butterfly_one_coeff_s16_s32( - const int16x8_t a, const int16x8_t b, const tran_high_t constant, - int32x4_t *add_lo, int32x4_t *add_hi, int32x4_t *sub_lo, - int32x4_t *sub_hi) { - const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), constant); - const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), constant); - const int32x4_t sum0 = vmlal_n_s16(a0, vget_low_s16(b), constant); - const int32x4_t sum1 = vmlal_n_s16(a1, vget_high_s16(b), constant); - const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), constant); - const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), constant); - *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS); - *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS); - *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS); - *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS); -} - #define BUTTERFLY_ONE_S16_S32(a, left_index, right_index, constant, b, \ add_index, sub_index) \ do { \ @@ -587,23 +523,6 @@ static INLINE void butterfly_one_coeff_s16_s32( &b##_lo[sub_index], &b##_hi[sub_index]); \ } while (0) -// Like butterfly_one_coeff, but with s32. -static INLINE void butterfly_one_coeff_s32( - const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo, - const int32x4_t b_hi, const int32_t constant, int32x4_t *add_lo, - int32x4_t *add_hi, int32x4_t *sub_lo, int32x4_t *sub_hi) { - const int32x4_t a_lo_0 = vmulq_n_s32(a_lo, constant); - const int32x4_t a_hi_0 = vmulq_n_s32(a_hi, constant); - const int32x4_t sum0 = vmlaq_n_s32(a_lo_0, b_lo, constant); - const int32x4_t sum1 = vmlaq_n_s32(a_hi_0, b_hi, constant); - const int32x4_t diff0 = vmlsq_n_s32(a_lo_0, b_lo, constant); - const int32x4_t diff1 = vmlsq_n_s32(a_hi_0, b_hi, constant); - *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS); - *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS); - *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS); - *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS); -} - #define BUTTERFLY_ONE_S32(a, left_index, right_index, constant, b, add_index, \ sub_index) \ do { \ @@ -613,26 +532,6 @@ static INLINE void butterfly_one_coeff_s32( &b##_lo[sub_index], &b##_hi[sub_index]); \ } while (0) -// Like butterfly_two_coeff, but with s32. -static INLINE void butterfly_two_coeff_s32( - const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo, - const int32x4_t b_hi, const int32_t constant0, const int32_t constant1, - int32x4_t *add_lo, int32x4_t *add_hi, int32x4_t *sub_lo, - int32x4_t *sub_hi) { - const int32x4_t a0 = vmulq_n_s32(a_lo, constant0); - const int32x4_t a1 = vmulq_n_s32(a_hi, constant0); - const int32x4_t a2 = vmulq_n_s32(a_lo, constant1); - const int32x4_t a3 = vmulq_n_s32(a_hi, constant1); - const int32x4_t sum0 = vmlaq_n_s32(a2, b_lo, constant0); - const int32x4_t sum1 = vmlaq_n_s32(a3, b_hi, constant0); - const int32x4_t diff0 = vmlsq_n_s32(a0, b_lo, constant1); - const int32x4_t diff1 = vmlsq_n_s32(a1, b_hi, constant1); - *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS); - *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS); - *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS); - *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS); -} - #define BUTTERFLY_TWO_S32(a, left_index, right_index, left_constant, \ right_constant, b, add_index, sub_index) \ do { \ @@ -643,24 +542,6 @@ static INLINE void butterfly_two_coeff_s32( &b##_hi[sub_index]); \ } while (0) -// Add 1 if positive, 2 if negative, and shift by 2. -// In practice, add 1, then add the sign bit, then shift without rounding. -static INLINE int16x8_t add_round_shift_s32(const int32x4_t a_lo, - const int32x4_t a_hi) { - const int32x4_t one = vdupq_n_s32(1); - const uint32x4_t a_lo_u32 = vreinterpretq_u32_s32(a_lo); - const uint32x4_t a_lo_sign_u32 = vshrq_n_u32(a_lo_u32, 31); - const int32x4_t a_lo_sign_s32 = vreinterpretq_s32_u32(a_lo_sign_u32); - const int16x4_t b_lo = - vshrn_n_s32(vqaddq_s32(vqaddq_s32(a_lo, a_lo_sign_s32), one), 2); - const uint32x4_t a_hi_u32 = vreinterpretq_u32_s32(a_hi); - const uint32x4_t a_hi_sign_u32 = vshrq_n_u32(a_hi_u32, 31); - const int32x4_t a_hi_sign_s32 = vreinterpretq_s32_u32(a_hi_sign_u32); - const int16x4_t b_hi = - vshrn_n_s32(vqaddq_s32(vqaddq_s32(a_hi, a_hi_sign_s32), one), 2); - return vcombine_s16(b_lo, b_hi); -} - static void dct_body_second_pass(const int16x8_t *in, int16x8_t *out) { int16x8_t a[32]; int16x8_t b[32]; @@ -967,16 +848,6 @@ static void dct_body_second_pass(const int16x8_t *in, int16x8_t *out) { out[3] = add_round_shift_s32(d_lo[3], d_hi[3]); } -// Add 1 if positive, 2 if negative, and shift by 2. -// In practice, add 1, then add the sign bit, then shift without rounding. -static INLINE int16x8_t add_round_shift_s16(const int16x8_t a) { - const int16x8_t one = vdupq_n_s16(1); - const uint16x8_t a_u16 = vreinterpretq_u16_s16(a); - const uint16x8_t a_sign_u16 = vshrq_n_u16(a_u16, 15); - const int16x8_t a_sign_s16 = vreinterpretq_s16_u16(a_sign_u16); - return vshrq_n_s16(vaddq_s16(vaddq_s16(a, a_sign_s16), one), 2); -} - static void dct_body_second_pass_rd(const int16x8_t *in, int16x8_t *out) { int16x8_t a[32]; int16x8_t b[32]; @@ -1279,42 +1150,6 @@ static void dct_body_second_pass_rd(const int16x8_t *in, int16x8_t *out) { #undef BUTTERFLY_ONE_S32 #undef BUTTERFLY_TWO_S32 -// Transpose 8x8 to a new location. Don't use transpose_neon.h because those -// are all in-place. -// TODO(johannkoenig): share with other fdcts. -static INLINE void transpose_8x8(const int16x8_t *a, int16x8_t *b) { - // Swap 16 bit elements. - const int16x8x2_t c0 = vtrnq_s16(a[0], a[1]); - const int16x8x2_t c1 = vtrnq_s16(a[2], a[3]); - const int16x8x2_t c2 = vtrnq_s16(a[4], a[5]); - const int16x8x2_t c3 = vtrnq_s16(a[6], a[7]); - - // Swap 32 bit elements. - const int32x4x2_t d0 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[0]), - vreinterpretq_s32_s16(c1.val[0])); - const int32x4x2_t d1 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[1]), - vreinterpretq_s32_s16(c1.val[1])); - const int32x4x2_t d2 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[0]), - vreinterpretq_s32_s16(c3.val[0])); - const int32x4x2_t d3 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[1]), - vreinterpretq_s32_s16(c3.val[1])); - - // Swap 64 bit elements - const int16x8x2_t e0 = vpx_vtrnq_s64_to_s16(d0.val[0], d2.val[0]); - const int16x8x2_t e1 = vpx_vtrnq_s64_to_s16(d1.val[0], d3.val[0]); - const int16x8x2_t e2 = vpx_vtrnq_s64_to_s16(d0.val[1], d2.val[1]); - const int16x8x2_t e3 = vpx_vtrnq_s64_to_s16(d1.val[1], d3.val[1]); - - b[0] = e0.val[0]; - b[1] = e1.val[0]; - b[2] = e2.val[0]; - b[3] = e3.val[0]; - b[4] = e0.val[1]; - b[5] = e1.val[1]; - b[6] = e2.val[1]; - b[7] = e3.val[1]; -} - void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) { int16x8_t temp0[32]; int16x8_t temp1[32]; @@ -1337,10 +1172,10 @@ void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) { dct_body_first_pass(temp0, temp4); // Generate the top row by munging the first set of 8 from each one together. - transpose_8x8(&temp1[0], &temp0[0]); - transpose_8x8(&temp2[0], &temp0[8]); - transpose_8x8(&temp3[0], &temp0[16]); - transpose_8x8(&temp4[0], &temp0[24]); + transpose_s16_8x8_new(&temp1[0], &temp0[0]); + transpose_s16_8x8_new(&temp2[0], &temp0[8]); + transpose_s16_8x8_new(&temp3[0], &temp0[16]); + transpose_s16_8x8_new(&temp4[0], &temp0[24]); dct_body_second_pass(temp0, temp5); @@ -1355,10 +1190,10 @@ void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) { store(output, temp5); // Second row of 8x32. - transpose_8x8(&temp1[8], &temp0[0]); - transpose_8x8(&temp2[8], &temp0[8]); - transpose_8x8(&temp3[8], &temp0[16]); - transpose_8x8(&temp4[8], &temp0[24]); + transpose_s16_8x8_new(&temp1[8], &temp0[0]); + transpose_s16_8x8_new(&temp2[8], &temp0[8]); + transpose_s16_8x8_new(&temp3[8], &temp0[16]); + transpose_s16_8x8_new(&temp4[8], &temp0[24]); dct_body_second_pass(temp0, temp5); @@ -1373,10 +1208,10 @@ void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) { store(output + 8 * 32, temp5); // Third row of 8x32 - transpose_8x8(&temp1[16], &temp0[0]); - transpose_8x8(&temp2[16], &temp0[8]); - transpose_8x8(&temp3[16], &temp0[16]); - transpose_8x8(&temp4[16], &temp0[24]); + transpose_s16_8x8_new(&temp1[16], &temp0[0]); + transpose_s16_8x8_new(&temp2[16], &temp0[8]); + transpose_s16_8x8_new(&temp3[16], &temp0[16]); + transpose_s16_8x8_new(&temp4[16], &temp0[24]); dct_body_second_pass(temp0, temp5); @@ -1391,10 +1226,10 @@ void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) { store(output + 16 * 32, temp5); // Final row of 8x32. - transpose_8x8(&temp1[24], &temp0[0]); - transpose_8x8(&temp2[24], &temp0[8]); - transpose_8x8(&temp3[24], &temp0[16]); - transpose_8x8(&temp4[24], &temp0[24]); + transpose_s16_8x8_new(&temp1[24], &temp0[0]); + transpose_s16_8x8_new(&temp2[24], &temp0[8]); + transpose_s16_8x8_new(&temp3[24], &temp0[16]); + transpose_s16_8x8_new(&temp4[24], &temp0[24]); dct_body_second_pass(temp0, temp5); @@ -1432,10 +1267,10 @@ void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output, dct_body_first_pass(temp0, temp4); // Generate the top row by munging the first set of 8 from each one together. - transpose_8x8(&temp1[0], &temp0[0]); - transpose_8x8(&temp2[0], &temp0[8]); - transpose_8x8(&temp3[0], &temp0[16]); - transpose_8x8(&temp4[0], &temp0[24]); + transpose_s16_8x8_new(&temp1[0], &temp0[0]); + transpose_s16_8x8_new(&temp2[0], &temp0[8]); + transpose_s16_8x8_new(&temp3[0], &temp0[16]); + transpose_s16_8x8_new(&temp4[0], &temp0[24]); dct_body_second_pass_rd(temp0, temp5); @@ -1450,10 +1285,10 @@ void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output, store(output, temp5); // Second row of 8x32. - transpose_8x8(&temp1[8], &temp0[0]); - transpose_8x8(&temp2[8], &temp0[8]); - transpose_8x8(&temp3[8], &temp0[16]); - transpose_8x8(&temp4[8], &temp0[24]); + transpose_s16_8x8_new(&temp1[8], &temp0[0]); + transpose_s16_8x8_new(&temp2[8], &temp0[8]); + transpose_s16_8x8_new(&temp3[8], &temp0[16]); + transpose_s16_8x8_new(&temp4[8], &temp0[24]); dct_body_second_pass_rd(temp0, temp5); @@ -1468,10 +1303,10 @@ void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output, store(output + 8 * 32, temp5); // Third row of 8x32 - transpose_8x8(&temp1[16], &temp0[0]); - transpose_8x8(&temp2[16], &temp0[8]); - transpose_8x8(&temp3[16], &temp0[16]); - transpose_8x8(&temp4[16], &temp0[24]); + transpose_s16_8x8_new(&temp1[16], &temp0[0]); + transpose_s16_8x8_new(&temp2[16], &temp0[8]); + transpose_s16_8x8_new(&temp3[16], &temp0[16]); + transpose_s16_8x8_new(&temp4[16], &temp0[24]); dct_body_second_pass_rd(temp0, temp5); @@ -1486,10 +1321,10 @@ void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output, store(output + 16 * 32, temp5); // Final row of 8x32. - transpose_8x8(&temp1[24], &temp0[0]); - transpose_8x8(&temp2[24], &temp0[8]); - transpose_8x8(&temp3[24], &temp0[16]); - transpose_8x8(&temp4[24], &temp0[24]); + transpose_s16_8x8_new(&temp1[24], &temp0[0]); + transpose_s16_8x8_new(&temp2[24], &temp0[8]); + transpose_s16_8x8_new(&temp3[24], &temp0[16]); + transpose_s16_8x8_new(&temp4[24], &temp0[24]); dct_body_second_pass_rd(temp0, temp5); diff --git a/vpx_dsp/arm/fdct_neon.c b/vpx_dsp/arm/fdct4x4_neon.c similarity index 100% rename from vpx_dsp/arm/fdct_neon.c rename to vpx_dsp/arm/fdct4x4_neon.c diff --git a/vpx_dsp/arm/fwd_txfm_neon.c b/vpx_dsp/arm/fdct8x8_neon.c similarity index 100% rename from vpx_dsp/arm/fwd_txfm_neon.c rename to vpx_dsp/arm/fdct8x8_neon.c diff --git a/vpx_dsp/arm/fdct_neon.h b/vpx_dsp/arm/fdct_neon.h index 28d7d86bf8..056cae4083 100644 --- a/vpx_dsp/arm/fdct_neon.h +++ b/vpx_dsp/arm/fdct_neon.h @@ -13,6 +13,136 @@ #include +// fdct_round_shift((a +/- b) * c) +static INLINE void butterfly_one_coeff(const int16x8_t a, const int16x8_t b, + const tran_high_t constant, + int16x8_t *add, int16x8_t *sub) { + const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), constant); + const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), constant); + const int32x4_t sum0 = vmlal_n_s16(a0, vget_low_s16(b), constant); + const int32x4_t sum1 = vmlal_n_s16(a1, vget_high_s16(b), constant); + const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), constant); + const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), constant); + const int16x4_t rounded0 = vqrshrn_n_s32(sum0, DCT_CONST_BITS); + const int16x4_t rounded1 = vqrshrn_n_s32(sum1, DCT_CONST_BITS); + const int16x4_t rounded2 = vqrshrn_n_s32(diff0, DCT_CONST_BITS); + const int16x4_t rounded3 = vqrshrn_n_s32(diff1, DCT_CONST_BITS); + *add = vcombine_s16(rounded0, rounded1); + *sub = vcombine_s16(rounded2, rounded3); +} + +// fdct_round_shift(a * c0 +/- b * c1) +static INLINE void butterfly_two_coeff(const int16x8_t a, const int16x8_t b, + const tran_coef_t constant0, + const tran_coef_t constant1, + int16x8_t *add, int16x8_t *sub) { + const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), constant0); + const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), constant0); + const int32x4_t a2 = vmull_n_s16(vget_low_s16(a), constant1); + const int32x4_t a3 = vmull_n_s16(vget_high_s16(a), constant1); + const int32x4_t sum0 = vmlal_n_s16(a2, vget_low_s16(b), constant0); + const int32x4_t sum1 = vmlal_n_s16(a3, vget_high_s16(b), constant0); + const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), constant1); + const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), constant1); + const int16x4_t rounded0 = vqrshrn_n_s32(sum0, DCT_CONST_BITS); + const int16x4_t rounded1 = vqrshrn_n_s32(sum1, DCT_CONST_BITS); + const int16x4_t rounded2 = vqrshrn_n_s32(diff0, DCT_CONST_BITS); + const int16x4_t rounded3 = vqrshrn_n_s32(diff1, DCT_CONST_BITS); + *add = vcombine_s16(rounded0, rounded1); + *sub = vcombine_s16(rounded2, rounded3); +} + +// Add 2 if positive, 1 if negative, and shift by 2. +// In practice, subtract the sign bit, then shift with rounding. +static INLINE int16x8_t sub_round_shift(const int16x8_t a) { + const uint16x8_t a_u16 = vreinterpretq_u16_s16(a); + const uint16x8_t a_sign_u16 = vshrq_n_u16(a_u16, 15); + const int16x8_t a_sign_s16 = vreinterpretq_s16_u16(a_sign_u16); + return vrshrq_n_s16(vsubq_s16(a, a_sign_s16), 2); +} + +// Like butterfly_one_coeff, but don't narrow results. +static INLINE void butterfly_one_coeff_s16_s32( + const int16x8_t a, const int16x8_t b, const tran_high_t constant, + int32x4_t *add_lo, int32x4_t *add_hi, int32x4_t *sub_lo, + int32x4_t *sub_hi) { + const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), constant); + const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), constant); + const int32x4_t sum0 = vmlal_n_s16(a0, vget_low_s16(b), constant); + const int32x4_t sum1 = vmlal_n_s16(a1, vget_high_s16(b), constant); + const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), constant); + const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), constant); + *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS); + *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS); + *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS); + *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS); +} + +// Like butterfly_one_coeff, but with s32. +static INLINE void butterfly_one_coeff_s32( + const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo, + const int32x4_t b_hi, const int32_t constant, int32x4_t *add_lo, + int32x4_t *add_hi, int32x4_t *sub_lo, int32x4_t *sub_hi) { + const int32x4_t a_lo_0 = vmulq_n_s32(a_lo, constant); + const int32x4_t a_hi_0 = vmulq_n_s32(a_hi, constant); + const int32x4_t sum0 = vmlaq_n_s32(a_lo_0, b_lo, constant); + const int32x4_t sum1 = vmlaq_n_s32(a_hi_0, b_hi, constant); + const int32x4_t diff0 = vmlsq_n_s32(a_lo_0, b_lo, constant); + const int32x4_t diff1 = vmlsq_n_s32(a_hi_0, b_hi, constant); + *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS); + *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS); + *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS); + *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS); +} + +// Like butterfly_two_coeff, but with s32. +static INLINE void butterfly_two_coeff_s32( + const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo, + const int32x4_t b_hi, const int32_t constant0, const int32_t constant1, + int32x4_t *add_lo, int32x4_t *add_hi, int32x4_t *sub_lo, + int32x4_t *sub_hi) { + const int32x4_t a0 = vmulq_n_s32(a_lo, constant0); + const int32x4_t a1 = vmulq_n_s32(a_hi, constant0); + const int32x4_t a2 = vmulq_n_s32(a_lo, constant1); + const int32x4_t a3 = vmulq_n_s32(a_hi, constant1); + const int32x4_t sum0 = vmlaq_n_s32(a2, b_lo, constant0); + const int32x4_t sum1 = vmlaq_n_s32(a3, b_hi, constant0); + const int32x4_t diff0 = vmlsq_n_s32(a0, b_lo, constant1); + const int32x4_t diff1 = vmlsq_n_s32(a1, b_hi, constant1); + *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS); + *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS); + *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS); + *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS); +} + +// Add 1 if positive, 2 if negative, and shift by 2. +// In practice, add 1, then add the sign bit, then shift without rounding. +static INLINE int16x8_t add_round_shift_s16(const int16x8_t a) { + const int16x8_t one = vdupq_n_s16(1); + const uint16x8_t a_u16 = vreinterpretq_u16_s16(a); + const uint16x8_t a_sign_u16 = vshrq_n_u16(a_u16, 15); + const int16x8_t a_sign_s16 = vreinterpretq_s16_u16(a_sign_u16); + return vshrq_n_s16(vaddq_s16(vaddq_s16(a, a_sign_s16), one), 2); +} + +// Add 1 if positive, 2 if negative, and shift by 2. +// In practice, add 1, then add the sign bit, then shift without rounding. +static INLINE int16x8_t add_round_shift_s32(const int32x4_t a_lo, + const int32x4_t a_hi) { + const int32x4_t one = vdupq_n_s32(1); + const uint32x4_t a_lo_u32 = vreinterpretq_u32_s32(a_lo); + const uint32x4_t a_lo_sign_u32 = vshrq_n_u32(a_lo_u32, 31); + const int32x4_t a_lo_sign_s32 = vreinterpretq_s32_u32(a_lo_sign_u32); + const int16x4_t b_lo = + vshrn_n_s32(vqaddq_s32(vqaddq_s32(a_lo, a_lo_sign_s32), one), 2); + const uint32x4_t a_hi_u32 = vreinterpretq_u32_s32(a_hi); + const uint32x4_t a_hi_sign_u32 = vshrq_n_u32(a_hi_u32, 31); + const int32x4_t a_hi_sign_s32 = vreinterpretq_s32_u32(a_hi_sign_u32); + const int16x4_t b_hi = + vshrn_n_s32(vqaddq_s32(vqaddq_s32(a_hi, a_hi_sign_s32), one), 2); + return vcombine_s16(b_lo, b_hi); +} + static INLINE void vpx_fdct4x4_pass1_neon(int16x4_t *in) { const int16x8_t input_01 = vcombine_s16(in[0], in[1]); const int16x8_t input_32 = vcombine_s16(in[3], in[2]); diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index 32d21e03f7..1fd9495cf9 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -227,11 +227,11 @@ ifeq ($(VPX_ARCH_X86_64),yes) DSP_SRCS-$(HAVE_SSSE3) += x86/fwd_txfm_ssse3_x86_64.asm endif DSP_SRCS-$(HAVE_AVX2) += x86/fwd_dct32x32_impl_avx2.h -DSP_SRCS-$(HAVE_NEON) += arm/fdct_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/fdct4x4_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/fdct8x8_neon.c DSP_SRCS-$(HAVE_NEON) += arm/fdct16x16_neon.c DSP_SRCS-$(HAVE_NEON) += arm/fdct32x32_neon.c DSP_SRCS-$(HAVE_NEON) += arm/fdct_partial_neon.c -DSP_SRCS-$(HAVE_NEON) += arm/fwd_txfm_neon.c DSP_SRCS-$(HAVE_MSA) += mips/fwd_txfm_msa.h DSP_SRCS-$(HAVE_MSA) += mips/fwd_txfm_msa.c DSP_SRCS-$(HAVE_LSX) += loongarch/fwd_txfm_lsx.h From 85484d5960b2adfde5e639ea7e3f3dcba9698e2e Mon Sep 17 00:00:00 2001 From: Scott LaVarnway Date: Mon, 10 Oct 2022 08:38:44 -0700 Subject: [PATCH 0524/1000] Add vpx_highbd_sad16x{32,16,8}_avg_avx2. 1.6x to 2.1x faster than the sse2 version. Bug: b/245917257 Change-Id: I56c467a850297ae3abcca4b4843302bb8d5d0ac1 --- test/sad_test.cc | 11 ++++ vpx_dsp/vpx_dsp_rtcd_defs.pl | 6 +-- vpx_dsp/x86/highbd_sad_avx2.c | 99 +++++++++++++++++++++++++++++++++++ 3 files changed, 113 insertions(+), 3 deletions(-) diff --git a/test/sad_test.cc b/test/sad_test.cc index a07b7c8a59..47df4661ce 100644 --- a/test/sad_test.cc +++ b/test/sad_test.cc @@ -1119,6 +1119,17 @@ const SadMxNAvgParam avg_avx2_tests[] = { SadMxNAvgParam(32, 64, &vpx_sad32x64_avg_avx2), SadMxNAvgParam(32, 32, &vpx_sad32x32_avg_avx2), SadMxNAvgParam(32, 16, &vpx_sad32x16_avg_avx2), +#if CONFIG_VP9_HIGHBITDEPTH + SadMxNAvgParam(16, 32, &vpx_highbd_sad16x32_avg_avx2, 8), + SadMxNAvgParam(16, 16, &vpx_highbd_sad16x16_avg_avx2, 8), + SadMxNAvgParam(16, 8, &vpx_highbd_sad16x8_avg_avx2, 8), + SadMxNAvgParam(16, 32, &vpx_highbd_sad16x32_avg_avx2, 10), + SadMxNAvgParam(16, 16, &vpx_highbd_sad16x16_avg_avx2, 10), + SadMxNAvgParam(16, 8, &vpx_highbd_sad16x8_avg_avx2, 10), + SadMxNAvgParam(16, 32, &vpx_highbd_sad16x32_avg_avx2, 12), + SadMxNAvgParam(16, 16, &vpx_highbd_sad16x16_avg_avx2, 12), + SadMxNAvgParam(16, 8, &vpx_highbd_sad16x8_avg_avx2, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH }; INSTANTIATE_TEST_SUITE_P(AVX2, SADavgTest, ::testing::ValuesIn(avg_avx2_tests)); diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index cbf0e6ea8d..4d1d05d548 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -1010,13 +1010,13 @@ () specialize qw/vpx_highbd_sad32x16_avg sse2 neon/; add_proto qw/unsigned int vpx_highbd_sad16x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; - specialize qw/vpx_highbd_sad16x32_avg sse2 neon/; + specialize qw/vpx_highbd_sad16x32_avg sse2 neon avx2/; add_proto qw/unsigned int vpx_highbd_sad16x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; - specialize qw/vpx_highbd_sad16x16_avg sse2 neon/; + specialize qw/vpx_highbd_sad16x16_avg sse2 neon avx2/; add_proto qw/unsigned int vpx_highbd_sad16x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; - specialize qw/vpx_highbd_sad16x8_avg sse2 neon/; + specialize qw/vpx_highbd_sad16x8_avg sse2 neon avx2/; add_proto qw/unsigned int vpx_highbd_sad8x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; specialize qw/vpx_highbd_sad8x16_avg sse2 neon/; diff --git a/vpx_dsp/x86/highbd_sad_avx2.c b/vpx_dsp/x86/highbd_sad_avx2.c index 12ef2eb3e9..f1bab35287 100644 --- a/vpx_dsp/x86/highbd_sad_avx2.c +++ b/vpx_dsp/x86/highbd_sad_avx2.c @@ -223,3 +223,102 @@ unsigned int vpx_highbd_sad16x8_avx2(const uint8_t *src8_ptr, int src_stride, return calc_final(sums_32); } } + +// AVG ------------------------------------------------------------------------- + +static VPX_FORCE_INLINE void highbd_sad16xH_avg(__m256i *sums_16, + const uint16_t *src, + int src_stride, uint16_t *ref, + int ref_stride, uint16_t *sec, + int height) { + int i; + for (i = 0; i < height; i += 2) { + // load src and all ref[] + const __m256i s0 = _mm256_load_si256((const __m256i *)src); + const __m256i s1 = _mm256_load_si256((const __m256i *)(src + src_stride)); + const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref); + const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + ref_stride)); + const __m256i x0 = _mm256_loadu_si256((const __m256i *)sec); + const __m256i x1 = _mm256_loadu_si256((const __m256i *)(sec + 16)); + const __m256i avg0 = _mm256_avg_epu16(r0, x0); + const __m256i avg1 = _mm256_avg_epu16(r1, x1); + // absolute differences between every ref[] to src + const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(avg0, s0)); + const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(avg1, s1)); + // sum every abs diff + *sums_16 = _mm256_add_epi16(*sums_16, abs_diff0); + *sums_16 = _mm256_add_epi16(*sums_16, abs_diff1); + + src += src_stride << 1; + ref += ref_stride << 1; + sec += 32; + } +} + +unsigned int vpx_highbd_sad16x32_avg_avx2(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, + const uint8_t *second_pred) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); + uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred); + __m256i sums_32 = _mm256_setzero_si256(); + int i; + + for (i = 0; i < 2; ++i) { + __m256i sums_16 = _mm256_setzero_si256(); + + highbd_sad16xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 16); + + // sums_16 will outrange after 16 rows, so add current sums_16 to sums_32 + sums_32 = _mm256_add_epi32( + sums_32, + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); + + src += src_stride << 4; + ref += ref_stride << 4; + sec += 16 << 4; + } + return calc_final(sums_32); +} + +unsigned int vpx_highbd_sad16x16_avg_avx2(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, + const uint8_t *second_pred) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); + uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred); + __m256i sums_16 = _mm256_setzero_si256(); + + highbd_sad16xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 16); + + { + const __m256i sums_32 = _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1))); + return calc_final(sums_32); + } +} + +unsigned int vpx_highbd_sad16x8_avg_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + const uint8_t *second_pred) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); + uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred); + __m256i sums_16 = _mm256_setzero_si256(); + + highbd_sad16xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 8); + + { + const __m256i sums_32 = _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1))); + return calc_final(sums_32); + } +} From 50d5093a4f1f2f0fafad09e1a106b7c3c1b9d60d Mon Sep 17 00:00:00 2001 From: Scott LaVarnway Date: Wed, 12 Oct 2022 06:05:46 -0700 Subject: [PATCH 0525/1000] Add vpx_highbd_sad32x{64,32,16}_avg_avx2. 2.1x to 2.8x faster than the sse2 version. Bug: b/245917257 Change-Id: I1aaffa4a1debbe5559784e854b8fc6fba07e5000 --- test/sad_test.cc | 9 +++++ vpx_dsp/vpx_dsp_rtcd_defs.pl | 6 ++-- vpx_dsp/x86/highbd_sad_avx2.c | 67 +++++++++++++++++++++++++++++++++++ 3 files changed, 79 insertions(+), 3 deletions(-) diff --git a/test/sad_test.cc b/test/sad_test.cc index 47df4661ce..a3c2952d63 100644 --- a/test/sad_test.cc +++ b/test/sad_test.cc @@ -1120,12 +1120,21 @@ const SadMxNAvgParam avg_avx2_tests[] = { SadMxNAvgParam(32, 32, &vpx_sad32x32_avg_avx2), SadMxNAvgParam(32, 16, &vpx_sad32x16_avg_avx2), #if CONFIG_VP9_HIGHBITDEPTH + SadMxNAvgParam(32, 64, &vpx_highbd_sad32x64_avg_avx2, 8), + SadMxNAvgParam(32, 32, &vpx_highbd_sad32x32_avg_avx2, 8), + SadMxNAvgParam(32, 16, &vpx_highbd_sad32x16_avg_avx2, 8), SadMxNAvgParam(16, 32, &vpx_highbd_sad16x32_avg_avx2, 8), SadMxNAvgParam(16, 16, &vpx_highbd_sad16x16_avg_avx2, 8), SadMxNAvgParam(16, 8, &vpx_highbd_sad16x8_avg_avx2, 8), + SadMxNAvgParam(32, 64, &vpx_highbd_sad32x64_avg_avx2, 10), + SadMxNAvgParam(32, 32, &vpx_highbd_sad32x32_avg_avx2, 10), + SadMxNAvgParam(32, 16, &vpx_highbd_sad32x16_avg_avx2, 10), SadMxNAvgParam(16, 32, &vpx_highbd_sad16x32_avg_avx2, 10), SadMxNAvgParam(16, 16, &vpx_highbd_sad16x16_avg_avx2, 10), SadMxNAvgParam(16, 8, &vpx_highbd_sad16x8_avg_avx2, 10), + SadMxNAvgParam(32, 64, &vpx_highbd_sad32x64_avg_avx2, 12), + SadMxNAvgParam(32, 32, &vpx_highbd_sad32x32_avg_avx2, 12), + SadMxNAvgParam(32, 16, &vpx_highbd_sad32x16_avg_avx2, 12), SadMxNAvgParam(16, 32, &vpx_highbd_sad16x32_avg_avx2, 12), SadMxNAvgParam(16, 16, &vpx_highbd_sad16x16_avg_avx2, 12), SadMxNAvgParam(16, 8, &vpx_highbd_sad16x8_avg_avx2, 12), diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 4d1d05d548..4db6de37b6 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -1001,13 +1001,13 @@ () specialize qw/vpx_highbd_sad64x32_avg sse2 neon/; add_proto qw/unsigned int vpx_highbd_sad32x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; - specialize qw/vpx_highbd_sad32x64_avg sse2 neon/; + specialize qw/vpx_highbd_sad32x64_avg sse2 neon avx2/; add_proto qw/unsigned int vpx_highbd_sad32x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; - specialize qw/vpx_highbd_sad32x32_avg sse2 neon/; + specialize qw/vpx_highbd_sad32x32_avg sse2 neon avx2/; add_proto qw/unsigned int vpx_highbd_sad32x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; - specialize qw/vpx_highbd_sad32x16_avg sse2 neon/; + specialize qw/vpx_highbd_sad32x16_avg sse2 neon avx2/; add_proto qw/unsigned int vpx_highbd_sad16x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; specialize qw/vpx_highbd_sad16x32_avg sse2 neon avx2/; diff --git a/vpx_dsp/x86/highbd_sad_avx2.c b/vpx_dsp/x86/highbd_sad_avx2.c index f1bab35287..24ebe4e94a 100644 --- a/vpx_dsp/x86/highbd_sad_avx2.c +++ b/vpx_dsp/x86/highbd_sad_avx2.c @@ -225,6 +225,73 @@ unsigned int vpx_highbd_sad16x8_avx2(const uint8_t *src8_ptr, int src_stride, } // AVG ------------------------------------------------------------------------- +static VPX_FORCE_INLINE void highbd_sad32xH_avg(__m256i *sums_16, + const uint16_t *src, + int src_stride, uint16_t *ref, + int ref_stride, uint16_t *sec, + int height) { + int i; + for (i = 0; i < height; ++i) { + // load src and all ref[] + const __m256i s0 = _mm256_load_si256((const __m256i *)src); + const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16)); + const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref); + const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + 16)); + const __m256i x0 = _mm256_loadu_si256((const __m256i *)sec); + const __m256i x1 = _mm256_loadu_si256((const __m256i *)(sec + 16)); + const __m256i avg0 = _mm256_avg_epu16(r0, x0); + const __m256i avg1 = _mm256_avg_epu16(r1, x1); + // absolute differences between every ref/pred avg to src + const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(avg0, s0)); + const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(avg1, s1)); + // sum every abs diff + *sums_16 = _mm256_add_epi16(*sums_16, abs_diff0); + *sums_16 = _mm256_add_epi16(*sums_16, abs_diff1); + + src += src_stride; + ref += ref_stride; + sec += 32; + } +} + +#define HIGHBD_SAD32XN_AVG(n) \ + unsigned int vpx_highbd_sad32x##n##_avg_avx2( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred) { \ + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred); \ + __m256i sums_32 = _mm256_setzero_si256(); \ + int i; \ + \ + for (i = 0; i < (n / 8); ++i) { \ + __m256i sums_16 = _mm256_setzero_si256(); \ + \ + highbd_sad32xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 8); \ + \ + /* sums_16 will outrange after 8 rows, so add current sums_16 to \ + * sums_32*/ \ + sums_32 = _mm256_add_epi32( \ + sums_32, \ + _mm256_add_epi32( \ + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), \ + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); \ + \ + src += src_stride << 3; \ + ref += ref_stride << 3; \ + sec += 32 << 3; \ + } \ + return calc_final(sums_32); \ + } + +// 32x64 +HIGHBD_SAD32XN_AVG(64) + +// 32x32 +HIGHBD_SAD32XN_AVG(32) + +// 32x16 +HIGHBD_SAD32XN_AVG(16) static VPX_FORCE_INLINE void highbd_sad16xH_avg(__m256i *sums_16, const uint16_t *src, From 165935a1b6c3dfe2af686545188c3abebc4941d8 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Thu, 6 Oct 2022 14:53:56 +0000 Subject: [PATCH 0526/1000] [NEON] Add highbd FDCT 4x4 function ~80% faster than C version for both best/rt profiles. Change-Id: Ibb3c8e1862131d2a020922420d53c66b31d5c2c3 --- test/dct_test.cc | 21 ++++++++-- vpx_dsp/arm/fdct4x4_neon.c | 38 ++++++++++++++++++ vpx_dsp/arm/fdct_neon.h | 74 ++++++++++++++++++++++++++++++++++++ vpx_dsp/vpx_dsp_rtcd_defs.pl | 2 +- 4 files changed, 131 insertions(+), 4 deletions(-) diff --git a/test/dct_test.cc b/test/dct_test.cc index 2182f87e5e..e34122ac92 100644 --- a/test/dct_test.cc +++ b/test/dct_test.cc @@ -539,6 +539,18 @@ INSTANTIATE_TEST_SUITE_P(AVX2, TransDCT, #endif // HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH #if HAVE_NEON +#if CONFIG_VP9_HIGHBITDEPTH +static const FuncInfo dct_neon_func_info[] = { + { &fdct_wrapper, + &highbd_idct_wrapper, 4, 2 }, + /* { &fdct_wrapper, + &highbd_idct_wrapper, 8, 2 }, + { &fdct_wrapper, + &highbd_idct_wrapper, 16, 2 }, + { &fdct_wrapper, + &highbd_idct_wrapper, 32, 2 },*/ +}; +#else static const FuncInfo dct_neon_func_info[4] = { { &fdct_wrapper, &idct_wrapper, 4, 1 }, @@ -549,12 +561,15 @@ static const FuncInfo dct_neon_func_info[4] = { { &fdct_wrapper, &idct_wrapper, 32, 1 } }; +#endif // CONFIG_VP9_HIGHBITDEPTH INSTANTIATE_TEST_SUITE_P( NEON, TransDCT, - ::testing::Combine(::testing::Range(0, 4), - ::testing::Values(dct_neon_func_info), - ::testing::Values(0), ::testing::Values(VPX_BITS_8))); + ::testing::Combine( + ::testing::Range(0, static_cast(sizeof(dct_neon_func_info) / + sizeof(dct_neon_func_info[0]))), + ::testing::Values(dct_neon_func_info), ::testing::Values(0), + ::testing::Values(VPX_BITS_8, VPX_BITS_10, VPX_BITS_12))); #endif // HAVE_NEON #if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/arm/fdct4x4_neon.c b/vpx_dsp/arm/fdct4x4_neon.c index 2827791f1e..11df7292d4 100644 --- a/vpx_dsp/arm/fdct4x4_neon.c +++ b/vpx_dsp/arm/fdct4x4_neon.c @@ -48,3 +48,41 @@ void vpx_fdct4x4_neon(const int16_t *input, tran_low_t *final_output, store_s16q_to_tran_low(final_output + 1 * 8, out_23); } } + +#if CONFIG_VP9_HIGHBITDEPTH + +void vpx_highbd_fdct4x4_neon(const int16_t *input, tran_low_t *final_output, + int stride) { + int i; + static const int32x4_t const_1000 = { 1, 0, 0, 0 }; + const int32x4_t const_one = vdupq_n_s32(1); + + // input[M * stride] * 16 + int32x4_t in[4]; + in[0] = vshll_n_s16(vld1_s16(input + 0 * stride), 4); + in[1] = vshll_n_s16(vld1_s16(input + 1 * stride), 4); + in[2] = vshll_n_s16(vld1_s16(input + 2 * stride), 4); + in[3] = vshll_n_s16(vld1_s16(input + 3 * stride), 4); + + // If the very first value != 0, then add 1. + if (input[0] != 0) { + in[0] = vaddq_s32(in[0], const_1000); + } + + for (i = 0; i < 2; ++i) { + vpx_highbd_fdct4x4_pass1_neon(in); + } + { + // Not quite a rounding shift. Only add 1 despite shifting by 2. + in[0] = vshrq_n_s32(vaddq_s32(in[0], const_one), 2); + in[1] = vshrq_n_s32(vaddq_s32(in[1], const_one), 2); + in[2] = vshrq_n_s32(vaddq_s32(in[2], const_one), 2); + in[3] = vshrq_n_s32(vaddq_s32(in[3], const_one), 2); + + vst1q_s32(final_output, in[0]); + vst1q_s32(final_output + 4, in[1]); + vst1q_s32(final_output + 8, in[2]); + vst1q_s32(final_output + 12, in[3]); + } +} +#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/arm/fdct_neon.h b/vpx_dsp/arm/fdct_neon.h index 056cae4083..68aeab3aa3 100644 --- a/vpx_dsp/arm/fdct_neon.h +++ b/vpx_dsp/arm/fdct_neon.h @@ -340,4 +340,78 @@ static INLINE void vpx_fdct8x8_pass1_neon(int16x8_t *in) { // 07 17 27 37 47 57 67 77 } } + +#if CONFIG_VP9_HIGHBITDEPTH +static INLINE void highbd_butterfly_one_coeff_s32(const int32x4_t a, + const int32x4_t b, + const tran_high_t c, + int32x4_t *add, + int32x4_t *sub) { + const int32x2_t a_lo = vget_low_s32(a); + const int32x2_t a_hi = vget_high_s32(a); + const int32x2_t b_lo = vget_low_s32(b); + const int32x2_t b_hi = vget_high_s32(b); + + const int64x2_t a64_lo = vmull_n_s32(a_lo, c); + const int64x2_t a64_hi = vmull_n_s32(a_hi, c); + + const int64x2_t sum_lo = vmlal_n_s32(a64_lo, b_lo, c); + const int64x2_t sum_hi = vmlal_n_s32(a64_hi, b_hi, c); + const int64x2_t diff_lo = vmlsl_n_s32(a64_lo, b_lo, c); + const int64x2_t diff_hi = vmlsl_n_s32(a64_hi, b_hi, c); + + *add = vcombine_s32(vrshrn_n_s64(sum_lo, DCT_CONST_BITS), + vrshrn_n_s64(sum_hi, DCT_CONST_BITS)); + *sub = vcombine_s32(vrshrn_n_s64(diff_lo, DCT_CONST_BITS), + vrshrn_n_s64(diff_hi, DCT_CONST_BITS)); +} + +static INLINE void highbd_butterfly_two_coeff_s32( + const int32x4_t a, const int32x4_t b, const tran_coef_t c0, + const tran_coef_t c1, int32x4_t *add, int32x4_t *sub) { + const int32x2_t a_lo = vget_low_s32(a); + const int32x2_t a_hi = vget_high_s32(a); + const int32x2_t b_lo = vget_low_s32(b); + const int32x2_t b_hi = vget_high_s32(b); + + const int64x2_t axc0_64_lo = vmull_n_s32(a_lo, c0); + const int64x2_t axc0_64_hi = vmull_n_s32(a_hi, c0); + const int64x2_t axc1_64_lo = vmull_n_s32(a_lo, c1); + const int64x2_t axc1_64_hi = vmull_n_s32(a_hi, c1); + + const int64x2_t sum_lo = vmlal_n_s32(axc0_64_lo, b_lo, c1); + const int64x2_t sum_hi = vmlal_n_s32(axc0_64_hi, b_hi, c1); + const int64x2_t diff_lo = vmlsl_n_s32(axc1_64_lo, b_lo, c0); + const int64x2_t diff_hi = vmlsl_n_s32(axc1_64_hi, b_hi, c0); + + *add = vcombine_s32(vrshrn_n_s64(sum_lo, DCT_CONST_BITS), + vrshrn_n_s64(sum_hi, DCT_CONST_BITS)); + *sub = vcombine_s32(vrshrn_n_s64(diff_lo, DCT_CONST_BITS), + vrshrn_n_s64(diff_hi, DCT_CONST_BITS)); +} + +static INLINE void vpx_highbd_fdct4x4_pass1_neon(int32x4_t *in) { + int32x4_t out[4]; + // in_0 +/- in_3, in_1 +/- in_2 + const int32x4_t s_0 = vaddq_s32(in[0], in[3]); + const int32x4_t s_1 = vaddq_s32(in[1], in[2]); + const int32x4_t s_2 = vsubq_s32(in[1], in[2]); + const int32x4_t s_3 = vsubq_s32(in[0], in[3]); + + highbd_butterfly_one_coeff_s32(s_0, s_1, cospi_16_64, &out[0], &out[2]); + + // out[1] = s_3 * cospi_8_64 + s_2 * cospi_24_64 + // out[3] = s_3 * cospi_24_64 - s_2 * cospi_8_64 + highbd_butterfly_two_coeff_s32(s_3, s_2, cospi_8_64, cospi_24_64, &out[1], + &out[3]); + + transpose_s32_4x4(&out[0], &out[1], &out[2], &out[3]); + + in[0] = out[0]; + in[1] = out[1]; + in[2] = out[2]; + in[3] = out[3]; +} + +#endif // CONFIG_VP9_HIGHBITDEPTH #endif // VPX_VPX_DSP_ARM_FDCT_NEON_H_ diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index cbf0e6ea8d..c5514b14d3 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -552,7 +552,7 @@ () specialize qw/vpx_fdct32x32_1 sse2 neon/; add_proto qw/void vpx_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vpx_highbd_fdct4x4 sse2/; + specialize qw/vpx_highbd_fdct4x4 sse2 neon/; add_proto qw/void vpx_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/vpx_highbd_fdct8x8 sse2/; From 7142689f00e73d461b8d00347ee84da2ee420994 Mon Sep 17 00:00:00 2001 From: Scott LaVarnway Date: Wed, 12 Oct 2022 10:26:43 -0700 Subject: [PATCH 0527/1000] Add vpx_highbd_sad64x{64,32}_avg_avx2. ~2.8x faster than the sse2 version. Bug: b/245917257 Change-Id: Ib727ba8a8c8fa4df450bafdde30ed99fd283f06d --- test/sad_test.cc | 6 +++ vpx_dsp/vpx_dsp_rtcd_defs.pl | 4 +- vpx_dsp/x86/highbd_sad_avx2.c | 77 +++++++++++++++++++++++++++++++++++ 3 files changed, 85 insertions(+), 2 deletions(-) diff --git a/test/sad_test.cc b/test/sad_test.cc index a3c2952d63..29e3f57f5e 100644 --- a/test/sad_test.cc +++ b/test/sad_test.cc @@ -1120,18 +1120,24 @@ const SadMxNAvgParam avg_avx2_tests[] = { SadMxNAvgParam(32, 32, &vpx_sad32x32_avg_avx2), SadMxNAvgParam(32, 16, &vpx_sad32x16_avg_avx2), #if CONFIG_VP9_HIGHBITDEPTH + SadMxNAvgParam(64, 64, &vpx_highbd_sad64x64_avg_avx2, 8), + SadMxNAvgParam(64, 32, &vpx_highbd_sad64x32_avg_avx2, 8), SadMxNAvgParam(32, 64, &vpx_highbd_sad32x64_avg_avx2, 8), SadMxNAvgParam(32, 32, &vpx_highbd_sad32x32_avg_avx2, 8), SadMxNAvgParam(32, 16, &vpx_highbd_sad32x16_avg_avx2, 8), SadMxNAvgParam(16, 32, &vpx_highbd_sad16x32_avg_avx2, 8), SadMxNAvgParam(16, 16, &vpx_highbd_sad16x16_avg_avx2, 8), SadMxNAvgParam(16, 8, &vpx_highbd_sad16x8_avg_avx2, 8), + SadMxNAvgParam(64, 64, &vpx_highbd_sad64x64_avg_avx2, 10), + SadMxNAvgParam(64, 32, &vpx_highbd_sad64x32_avg_avx2, 10), SadMxNAvgParam(32, 64, &vpx_highbd_sad32x64_avg_avx2, 10), SadMxNAvgParam(32, 32, &vpx_highbd_sad32x32_avg_avx2, 10), SadMxNAvgParam(32, 16, &vpx_highbd_sad32x16_avg_avx2, 10), SadMxNAvgParam(16, 32, &vpx_highbd_sad16x32_avg_avx2, 10), SadMxNAvgParam(16, 16, &vpx_highbd_sad16x16_avg_avx2, 10), SadMxNAvgParam(16, 8, &vpx_highbd_sad16x8_avg_avx2, 10), + SadMxNAvgParam(64, 64, &vpx_highbd_sad64x64_avg_avx2, 12), + SadMxNAvgParam(64, 32, &vpx_highbd_sad64x32_avg_avx2, 12), SadMxNAvgParam(32, 64, &vpx_highbd_sad32x64_avg_avx2, 12), SadMxNAvgParam(32, 32, &vpx_highbd_sad32x32_avg_avx2, 12), SadMxNAvgParam(32, 16, &vpx_highbd_sad32x16_avg_avx2, 12), diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 4db6de37b6..5fe9c12879 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -995,10 +995,10 @@ () add_proto qw/void vpx_highbd_minmax_8x8/, "const uint8_t *s8, int p, const uint8_t *d8, int dp, int *min, int *max"; add_proto qw/unsigned int vpx_highbd_sad64x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; - specialize qw/vpx_highbd_sad64x64_avg sse2 neon/; + specialize qw/vpx_highbd_sad64x64_avg sse2 neon avx2/; add_proto qw/unsigned int vpx_highbd_sad64x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; - specialize qw/vpx_highbd_sad64x32_avg sse2 neon/; + specialize qw/vpx_highbd_sad64x32_avg sse2 neon avx2/; add_proto qw/unsigned int vpx_highbd_sad32x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; specialize qw/vpx_highbd_sad32x64_avg sse2 neon avx2/; diff --git a/vpx_dsp/x86/highbd_sad_avx2.c b/vpx_dsp/x86/highbd_sad_avx2.c index 24ebe4e94a..7533ccfddb 100644 --- a/vpx_dsp/x86/highbd_sad_avx2.c +++ b/vpx_dsp/x86/highbd_sad_avx2.c @@ -225,6 +225,83 @@ unsigned int vpx_highbd_sad16x8_avx2(const uint8_t *src8_ptr, int src_stride, } // AVG ------------------------------------------------------------------------- +static VPX_FORCE_INLINE void highbd_sad64xH_avg(__m256i *sums_16, + const uint16_t *src, + int src_stride, uint16_t *ref, + int ref_stride, uint16_t *sec, + int height) { + int i; + for (i = 0; i < height; ++i) { + // load src and all ref[] + const __m256i s0 = _mm256_load_si256((const __m256i *)src); + const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16)); + const __m256i s2 = _mm256_load_si256((const __m256i *)(src + 32)); + const __m256i s3 = _mm256_load_si256((const __m256i *)(src + 48)); + const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref); + const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + 16)); + const __m256i r2 = _mm256_loadu_si256((const __m256i *)(ref + 32)); + const __m256i r3 = _mm256_loadu_si256((const __m256i *)(ref + 48)); + const __m256i x0 = _mm256_loadu_si256((const __m256i *)sec); + const __m256i x1 = _mm256_loadu_si256((const __m256i *)(sec + 16)); + const __m256i x2 = _mm256_loadu_si256((const __m256i *)(sec + 32)); + const __m256i x3 = _mm256_loadu_si256((const __m256i *)(sec + 48)); + const __m256i avg0 = _mm256_avg_epu16(r0, x0); + const __m256i avg1 = _mm256_avg_epu16(r1, x1); + const __m256i avg2 = _mm256_avg_epu16(r2, x2); + const __m256i avg3 = _mm256_avg_epu16(r3, x3); + // absolute differences between every ref/pred avg to src + const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(avg0, s0)); + const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(avg1, s1)); + const __m256i abs_diff2 = _mm256_abs_epi16(_mm256_sub_epi16(avg2, s2)); + const __m256i abs_diff3 = _mm256_abs_epi16(_mm256_sub_epi16(avg3, s3)); + // sum every abs diff + *sums_16 = + _mm256_add_epi16(*sums_16, _mm256_add_epi16(abs_diff0, abs_diff1)); + *sums_16 = + _mm256_add_epi16(*sums_16, _mm256_add_epi16(abs_diff2, abs_diff3)); + + src += src_stride; + ref += ref_stride; + sec += 64; + } +} + +#define HIGHBD_SAD64XN_AVG(n) \ + unsigned int vpx_highbd_sad64x##n##_avg_avx2( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred) { \ + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred); \ + __m256i sums_32 = _mm256_setzero_si256(); \ + int i; \ + \ + for (i = 0; i < (n / 2); ++i) { \ + __m256i sums_16 = _mm256_setzero_si256(); \ + \ + highbd_sad64xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 2); \ + \ + /* sums_16 will outrange after 2 rows, so add current sums_16 to \ + * sums_32*/ \ + sums_32 = _mm256_add_epi32( \ + sums_32, \ + _mm256_add_epi32( \ + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), \ + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); \ + \ + src += src_stride << 1; \ + ref += ref_stride << 1; \ + sec += 64 << 1; \ + } \ + return calc_final(sums_32); \ + } + +// 64x64 +HIGHBD_SAD64XN_AVG(64) + +// 64x32 +HIGHBD_SAD64XN_AVG(32) + static VPX_FORCE_INLINE void highbd_sad32xH_avg(__m256i *sums_16, const uint16_t *src, int src_stride, uint16_t *ref, From a49f896352671870f38c1374f3d5329e3b60193f Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Thu, 6 Oct 2022 16:00:43 +0000 Subject: [PATCH 0528/1000] [NEON] Add highbd FDCT 8x8 function 50% faster than C version in best/rt profiles Change-Id: I0f9504ed52b5d5f7722407e91108ed4056d66bc2 --- test/dct_test.cc | 12 +-- vpx_dsp/arm/fdct8x8_neon.c | 78 +++++++++++++++++++ vpx_dsp/arm/fdct_neon.h | 144 +++++++++++++++++++++++++++++++++++ vpx_dsp/vpx_dsp_rtcd_defs.pl | 2 +- 4 files changed, 229 insertions(+), 7 deletions(-) diff --git a/test/dct_test.cc b/test/dct_test.cc index e34122ac92..ff97fc7c35 100644 --- a/test/dct_test.cc +++ b/test/dct_test.cc @@ -543,12 +543,12 @@ INSTANTIATE_TEST_SUITE_P(AVX2, TransDCT, static const FuncInfo dct_neon_func_info[] = { { &fdct_wrapper, &highbd_idct_wrapper, 4, 2 }, - /* { &fdct_wrapper, - &highbd_idct_wrapper, 8, 2 }, - { &fdct_wrapper, - &highbd_idct_wrapper, 16, 2 }, - { &fdct_wrapper, - &highbd_idct_wrapper, 32, 2 },*/ + { &fdct_wrapper, + &highbd_idct_wrapper, 8, 2 }, + /* { &fdct_wrapper, + &highbd_idct_wrapper, 16, 2 }, + { &fdct_wrapper, + &highbd_idct_wrapper, 32, 2 },*/ }; #else static const FuncInfo dct_neon_func_info[4] = { diff --git a/vpx_dsp/arm/fdct8x8_neon.c b/vpx_dsp/arm/fdct8x8_neon.c index d9161c6d38..3fb15cc175 100644 --- a/vpx_dsp/arm/fdct8x8_neon.c +++ b/vpx_dsp/arm/fdct8x8_neon.c @@ -66,3 +66,81 @@ void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *final_output, store_s16q_to_tran_low(final_output + 7 * 8, in[7]); } } + +#if CONFIG_VP9_HIGHBITDEPTH + +void vpx_highbd_fdct8x8_neon(const int16_t *input, tran_low_t *final_output, + int stride) { + int i; + + // input[M * stride] * 16 + int32x4_t left[8], right[8]; + int16x8_t in[8]; + in[0] = vld1q_s16(input + 0 * stride); + in[1] = vld1q_s16(input + 1 * stride); + in[2] = vld1q_s16(input + 2 * stride); + in[3] = vld1q_s16(input + 3 * stride); + in[4] = vld1q_s16(input + 4 * stride); + in[5] = vld1q_s16(input + 5 * stride); + in[6] = vld1q_s16(input + 6 * stride); + in[7] = vld1q_s16(input + 7 * stride); + + left[0] = vshll_n_s16(vget_low_s16(in[0]), 2); + left[1] = vshll_n_s16(vget_low_s16(in[1]), 2); + left[2] = vshll_n_s16(vget_low_s16(in[2]), 2); + left[3] = vshll_n_s16(vget_low_s16(in[3]), 2); + left[4] = vshll_n_s16(vget_low_s16(in[4]), 2); + left[5] = vshll_n_s16(vget_low_s16(in[5]), 2); + left[6] = vshll_n_s16(vget_low_s16(in[6]), 2); + left[7] = vshll_n_s16(vget_low_s16(in[7]), 2); + right[0] = vshll_n_s16(vget_high_s16(in[0]), 2); + right[1] = vshll_n_s16(vget_high_s16(in[1]), 2); + right[2] = vshll_n_s16(vget_high_s16(in[2]), 2); + right[3] = vshll_n_s16(vget_high_s16(in[3]), 2); + right[4] = vshll_n_s16(vget_high_s16(in[4]), 2); + right[5] = vshll_n_s16(vget_high_s16(in[5]), 2); + right[6] = vshll_n_s16(vget_high_s16(in[6]), 2); + right[7] = vshll_n_s16(vget_high_s16(in[7]), 2); + + for (i = 0; i < 2; ++i) { + vpx_highbd_fdct8x8_pass1_neon(left, right); + } + { + left[0] = highbd_add_round_shift_s32(left[0]); + left[1] = highbd_add_round_shift_s32(left[1]); + left[2] = highbd_add_round_shift_s32(left[2]); + left[3] = highbd_add_round_shift_s32(left[3]); + left[4] = highbd_add_round_shift_s32(left[4]); + left[5] = highbd_add_round_shift_s32(left[5]); + left[6] = highbd_add_round_shift_s32(left[6]); + left[7] = highbd_add_round_shift_s32(left[7]); + right[0] = highbd_add_round_shift_s32(right[0]); + right[1] = highbd_add_round_shift_s32(right[1]); + right[2] = highbd_add_round_shift_s32(right[2]); + right[3] = highbd_add_round_shift_s32(right[3]); + right[4] = highbd_add_round_shift_s32(right[4]); + right[5] = highbd_add_round_shift_s32(right[5]); + right[6] = highbd_add_round_shift_s32(right[6]); + right[7] = highbd_add_round_shift_s32(right[7]); + + // store results + vst1q_s32(final_output, left[0]); + vst1q_s32(final_output + 4, right[0]); + vst1q_s32(final_output + 8, left[1]); + vst1q_s32(final_output + 12, right[1]); + vst1q_s32(final_output + 16, left[2]); + vst1q_s32(final_output + 20, right[2]); + vst1q_s32(final_output + 24, left[3]); + vst1q_s32(final_output + 28, right[3]); + vst1q_s32(final_output + 32, left[4]); + vst1q_s32(final_output + 36, right[4]); + vst1q_s32(final_output + 40, left[5]); + vst1q_s32(final_output + 44, right[5]); + vst1q_s32(final_output + 48, left[6]); + vst1q_s32(final_output + 52, right[6]); + vst1q_s32(final_output + 56, left[7]); + vst1q_s32(final_output + 60, right[7]); + } +} + +#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/arm/fdct_neon.h b/vpx_dsp/arm/fdct_neon.h index 68aeab3aa3..c100e709d5 100644 --- a/vpx_dsp/arm/fdct_neon.h +++ b/vpx_dsp/arm/fdct_neon.h @@ -342,6 +342,20 @@ static INLINE void vpx_fdct8x8_pass1_neon(int16x8_t *in) { } #if CONFIG_VP9_HIGHBITDEPTH +static INLINE int32x4_t highbd_add_round_shift_s32(int32x4_t x) { + const int32x2_t x_lo = vget_low_s32(x); + const int32x2_t x_hi = vget_high_s32(x); + const int64x2_t x64_lo = vmovl_s32(x_lo); + const int64x2_t x64_hi = vmovl_s32(x_hi); + + const int64x2_t sign_lo = (int64x2_t)vshrq_n_u64((uint64x2_t)x64_lo, 63); + const int64x2_t sign_hi = (int64x2_t)vshrq_n_u64((uint64x2_t)x64_hi, 63); + + const int64x2_t sum_lo = vaddq_s64(x64_lo, sign_lo); + const int64x2_t sum_hi = vaddq_s64(x64_hi, sign_hi); + return vcombine_s32(vshrn_n_s64(sum_lo, 1), vshrn_n_s64(sum_hi, 1)); +} + static INLINE void highbd_butterfly_one_coeff_s32(const int32x4_t a, const int32x4_t b, const tran_high_t c, @@ -413,5 +427,135 @@ static INLINE void vpx_highbd_fdct4x4_pass1_neon(int32x4_t *in) { in[3] = out[3]; } +static INLINE void vpx_highbd_fdct8x8_pass1_notranspose_neon(int32x4_t *left, + int32x4_t *right) { + int32x4_t sl[8], sr[8], xl[4], xr[4], tl[4], tr[4]; + + sl[0] = vaddq_s32(left[0], left[7]); + sl[1] = vaddq_s32(left[1], left[6]); + sl[2] = vaddq_s32(left[2], left[5]); + sl[3] = vaddq_s32(left[3], left[4]); + sl[4] = vsubq_s32(left[3], left[4]); + sl[5] = vsubq_s32(left[2], left[5]); + sl[6] = vsubq_s32(left[1], left[6]); + sl[7] = vsubq_s32(left[0], left[7]); + sr[0] = vaddq_s32(right[0], right[7]); + sr[1] = vaddq_s32(right[1], right[6]); + sr[2] = vaddq_s32(right[2], right[5]); + sr[3] = vaddq_s32(right[3], right[4]); + sr[4] = vsubq_s32(right[3], right[4]); + sr[5] = vsubq_s32(right[2], right[5]); + sr[6] = vsubq_s32(right[1], right[6]); + sr[7] = vsubq_s32(right[0], right[7]); + + // fdct4(step, step); + // x0 = s0 + s3; + xl[0] = vaddq_s32(sl[0], sl[3]); + xr[0] = vaddq_s32(sr[0], sr[3]); + // x1 = s1 + s2; + xl[1] = vaddq_s32(sl[1], sl[2]); + xr[1] = vaddq_s32(sr[1], sr[2]); + // x2 = s1 - s2; + xl[2] = vsubq_s32(sl[1], sl[2]); + xr[2] = vsubq_s32(sr[1], sr[2]); + // x3 = s0 - s3; + xl[3] = vsubq_s32(sl[0], sl[3]); + xr[3] = vsubq_s32(sr[0], sr[3]); + + // fdct4(step, step); + // t0 = (x0 + x1) * cospi_16_64; + // t1 = (x0 - x1) * cospi_16_64; + // out[0] = (tran_low_t)fdct_round_shift(t0); + // out[4] = (tran_low_t)fdct_round_shift(t1); + highbd_butterfly_one_coeff_s32(xl[0], xl[1], cospi_16_64, &left[0], &left[4]); + highbd_butterfly_one_coeff_s32(xr[0], xr[1], cospi_16_64, &right[0], + &right[4]); + // t2 = x2 * cospi_24_64 + x3 * cospi_8_64; + // t3 = -x2 * cospi_8_64 + x3 * cospi_24_64; + // out[2] = (tran_low_t)fdct_round_shift(t2); + // out[6] = (tran_low_t)fdct_round_shift(t3); + highbd_butterfly_two_coeff_s32(xl[3], xl[2], cospi_8_64, cospi_24_64, + &left[2], &left[6]); + highbd_butterfly_two_coeff_s32(xr[3], xr[2], cospi_8_64, cospi_24_64, + &right[2], &right[6]); + + // Stage 2 + // t0 = (s6 - s5) * cospi_16_64; + highbd_butterfly_one_coeff_s32(sl[6], sl[5], cospi_16_64, &tl[1], &tl[0]); + highbd_butterfly_one_coeff_s32(sr[6], sr[5], cospi_16_64, &tr[1], &tr[0]); + + // Stage 3 + xl[0] = vaddq_s32(sl[4], tl[0]); + xr[0] = vaddq_s32(sr[4], tr[0]); + xl[1] = vsubq_s32(sl[4], tl[0]); + xr[1] = vsubq_s32(sr[4], tr[0]); + xl[2] = vsubq_s32(sl[7], tl[1]); + xr[2] = vsubq_s32(sr[7], tr[1]); + xl[3] = vaddq_s32(sl[7], tl[1]); + xr[3] = vaddq_s32(sr[7], tr[1]); + + // Stage 4 + // t0 = x0 * cospi_28_64 + x3 * cospi_4_64; + // out[1] = (tran_low_t)fdct_round_shift(t0); + // t3 = x3 * cospi_28_64 + x0 * -cospi_4_64; + // out[7] = (tran_low_t)fdct_round_shift(t3); + highbd_butterfly_two_coeff_s32(xl[3], xl[0], cospi_4_64, cospi_28_64, + &left[1], &left[7]); + highbd_butterfly_two_coeff_s32(xr[3], xr[0], cospi_4_64, cospi_28_64, + &right[1], &right[7]); + + // t1 = x1 * cospi_12_64 + x2 * cospi_20_64; + // out[5] = (tran_low_t)fdct_round_shift(t1); + // t2 = x2 * cospi_12_64 + x1 * -cospi_20_64; + // out[3] = (tran_low_t)fdct_round_shift(t2); + highbd_butterfly_two_coeff_s32(xl[2], xl[1], cospi_20_64, cospi_12_64, + &left[5], &left[3]); + highbd_butterfly_two_coeff_s32(xr[2], xr[1], cospi_20_64, cospi_12_64, + &right[5], &right[3]); +} + +static INLINE void vpx_highbd_fdct8x8_pass1_neon(int32x4_t *left, + int32x4_t *right) { + int32x4x2_t out[8]; + vpx_highbd_fdct8x8_pass1_notranspose_neon(left, right); + + out[0].val[0] = left[0]; + out[0].val[1] = right[0]; + out[1].val[0] = left[1]; + out[1].val[1] = right[1]; + out[2].val[0] = left[2]; + out[2].val[1] = right[2]; + out[3].val[0] = left[3]; + out[3].val[1] = right[3]; + out[4].val[0] = left[4]; + out[4].val[1] = right[4]; + out[5].val[0] = left[5]; + out[5].val[1] = right[5]; + out[6].val[0] = left[6]; + out[6].val[1] = right[6]; + out[7].val[0] = left[7]; + out[7].val[1] = right[7]; + + transpose_s32_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5], + &out[6], &out[7]); + + left[0] = out[0].val[0]; + right[0] = out[0].val[1]; + left[1] = out[1].val[0]; + right[1] = out[1].val[1]; + left[2] = out[2].val[0]; + right[2] = out[2].val[1]; + left[3] = out[3].val[0]; + right[3] = out[3].val[1]; + left[4] = out[4].val[0]; + right[4] = out[4].val[1]; + left[5] = out[5].val[0]; + right[5] = out[5].val[1]; + left[6] = out[6].val[0]; + right[6] = out[6].val[1]; + left[7] = out[7].val[0]; + right[7] = out[7].val[1]; +} + #endif // CONFIG_VP9_HIGHBITDEPTH #endif // VPX_VPX_DSP_ARM_FDCT_NEON_H_ diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index c5514b14d3..e886c0ae45 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -555,7 +555,7 @@ () specialize qw/vpx_highbd_fdct4x4 sse2 neon/; add_proto qw/void vpx_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vpx_highbd_fdct8x8 sse2/; + specialize qw/vpx_highbd_fdct8x8 sse2 neon/; add_proto qw/void vpx_highbd_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/vpx_highbd_fdct8x8_1 neon/; From 45b280eb0fa80404321100cee2de1bcea413913e Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Fri, 7 Oct 2022 15:13:29 +0000 Subject: [PATCH 0529/1000] [NEON] Add highbd FDCT 16x16 function 90-95% faster than C version in best/rt profiles Change-Id: I41d5e9acdc348b57153637ec736498a25ed84c25 --- test/dct16x16_test.cc | 12 +- test/dct_test.cc | 8 +- vpx_dsp/arm/fdct16x16_neon.c | 53 +++++ vpx_dsp/arm/fdct16x16_neon.h | 395 +++++++++++++++++++++++++++++++++++ vpx_dsp/vpx_dsp_rtcd_defs.pl | 2 +- 5 files changed, 464 insertions(+), 6 deletions(-) diff --git a/test/dct16x16_test.cc b/test/dct16x16_test.cc index 06837d809d..d4ef7ae13d 100644 --- a/test/dct16x16_test.cc +++ b/test/dct16x16_test.cc @@ -789,13 +789,23 @@ INSTANTIATE_TEST_SUITE_P( make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 3, VPX_BITS_8))); #endif // CONFIG_VP9_HIGHBITDEPTH -#if HAVE_NEON && !CONFIG_EMULATE_HARDWARE +#if HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE INSTANTIATE_TEST_SUITE_P( NEON, Trans16x16DCT, ::testing::Values(make_tuple(&vpx_fdct16x16_neon, &vpx_idct16x16_256_add_neon, 0, VPX_BITS_8))); #endif // HAVE_NEON && !CONFIG_EMULATE_HARDWARE +#if HAVE_NEON && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE +INSTANTIATE_TEST_SUITE_P( + NEON, Trans16x16DCT, + ::testing::Values( + make_tuple(&vpx_highbd_fdct16x16_neon, &idct16x16_10, 0, VPX_BITS_10), + make_tuple(&vpx_highbd_fdct16x16_neon, &idct16x16_12, 0, VPX_BITS_12), + make_tuple(&vpx_fdct16x16_neon, &vpx_idct16x16_256_add_c, 0, + VPX_BITS_8))); +#endif // HAVE_NEON && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE + #if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE INSTANTIATE_TEST_SUITE_P( SSE2, Trans16x16DCT, diff --git a/test/dct_test.cc b/test/dct_test.cc index ff97fc7c35..910d288bd5 100644 --- a/test/dct_test.cc +++ b/test/dct_test.cc @@ -545,10 +545,10 @@ static const FuncInfo dct_neon_func_info[] = { &highbd_idct_wrapper, 4, 2 }, { &fdct_wrapper, &highbd_idct_wrapper, 8, 2 }, - /* { &fdct_wrapper, - &highbd_idct_wrapper, 16, 2 }, - { &fdct_wrapper, - &highbd_idct_wrapper, 32, 2 },*/ + { &fdct_wrapper, + &highbd_idct_wrapper, 16, 2 }, + /* { &fdct_wrapper, + &highbd_idct_wrapper, 32, 2 },*/ }; #else static const FuncInfo dct_neon_func_info[4] = { diff --git a/vpx_dsp/arm/fdct16x16_neon.c b/vpx_dsp/arm/fdct16x16_neon.c index 0b0ce223db..d0c07d429a 100644 --- a/vpx_dsp/arm/fdct16x16_neon.c +++ b/vpx_dsp/arm/fdct16x16_neon.c @@ -74,5 +74,58 @@ void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) { store(output, temp1); store(output + 8, temp1 + 8); } + +#if CONFIG_VP9_HIGHBITDEPTH + +void vpx_highbd_fdct16x16_neon(const int16_t *input, tran_low_t *output, + int stride) { + int16x8_t temp0[16]; + int32x4_t left1[16], left2[16], left3[16], left4[16], right1[16], right2[16], + right3[16], right4[16]; + + // Left half. + load_cross(input, stride, temp0); + highbd_scale_input(temp0, left1, right1); + vpx_highbd_fdct16x16_body(left1, right1); + + // right half. + load_cross(input + 8, stride, temp0); + highbd_scale_input(temp0, left2, right2); + vpx_highbd_fdct16x16_body(left2, right2); + + // Transpose top left and top right quarters into one contiguous location to + // process to the top half. + + transpose_s32_8x8_2(left1, right1, left3, right3); + transpose_s32_8x8_2(left2, right2, left3 + 8, right3 + 8); + transpose_s32_8x8_2(left1 + 8, right1 + 8, left4, right4); + transpose_s32_8x8_2(left2 + 8, right2 + 8, left4 + 8, right4 + 8); + + highbd_partial_round_shift(left3, right3); + highbd_cross_input(left3, right3, left1, right1); + vpx_highbd_fdct16x16_body(left1, right1); + + // Transpose bottom left and bottom right quarters into one contiguous + // location to process to the bottom half. + + highbd_partial_round_shift(left4, right4); + highbd_cross_input(left4, right4, left2, right2); + vpx_highbd_fdct16x16_body(left2, right2); + + transpose_s32_8x8_2(left1, right1, left3, right3); + transpose_s32_8x8_2(left2, right2, left3 + 8, right3 + 8); + transpose_s32_8x8_2(left1 + 8, right1 + 8, left4, right4); + transpose_s32_8x8_2(left2 + 8, right2 + 8, left4 + 8, right4 + 8); + store16_s32(output, left3); + output += 4; + store16_s32(output, right3); + output += 4; + + store16_s32(output, left4); + output += 4; + store16_s32(output, right4); +} +#endif // CONFIG_VP9_HIGHBITDEPTH + #endif // !defined(__clang__) && !defined(__ANDROID__) && defined(__GNUC__) && // __GNUC__ == 4 && __GNUC_MINOR__ == 9 && __GNUC_PATCHLEVEL__ < 4 diff --git a/vpx_dsp/arm/fdct16x16_neon.h b/vpx_dsp/arm/fdct16x16_neon.h index 7fc2c6e7e8..d99870903b 100644 --- a/vpx_dsp/arm/fdct16x16_neon.h +++ b/vpx_dsp/arm/fdct16x16_neon.h @@ -273,4 +273,399 @@ static void vpx_fdct16x16_body(const int16x8_t *in /*[16]*/, &out[11]); } +#if CONFIG_VP9_HIGHBITDEPTH + +static INLINE void highbd_scale_input(const int16x8_t *a /*[16]*/, + int32x4_t *left /*[16]*/, + int32x4_t *right /* [16] */) { + left[0] = vshll_n_s16(vget_low_s16(a[0]), 2); + right[0] = vshll_n_s16(vget_high_s16(a[0]), 2); + left[1] = vshll_n_s16(vget_low_s16(a[1]), 2); + right[1] = vshll_n_s16(vget_high_s16(a[1]), 2); + left[2] = vshll_n_s16(vget_low_s16(a[2]), 2); + right[2] = vshll_n_s16(vget_high_s16(a[2]), 2); + left[3] = vshll_n_s16(vget_low_s16(a[3]), 2); + right[3] = vshll_n_s16(vget_high_s16(a[3]), 2); + left[4] = vshll_n_s16(vget_low_s16(a[4]), 2); + right[4] = vshll_n_s16(vget_high_s16(a[4]), 2); + left[5] = vshll_n_s16(vget_low_s16(a[5]), 2); + right[5] = vshll_n_s16(vget_high_s16(a[5]), 2); + left[6] = vshll_n_s16(vget_low_s16(a[6]), 2); + right[6] = vshll_n_s16(vget_high_s16(a[6]), 2); + left[7] = vshll_n_s16(vget_low_s16(a[7]), 2); + right[7] = vshll_n_s16(vget_high_s16(a[7]), 2); + left[8] = vshll_n_s16(vget_low_s16(a[8]), 2); + right[8] = vshll_n_s16(vget_high_s16(a[8]), 2); + left[9] = vshll_n_s16(vget_low_s16(a[9]), 2); + right[9] = vshll_n_s16(vget_high_s16(a[9]), 2); + left[10] = vshll_n_s16(vget_low_s16(a[10]), 2); + right[10] = vshll_n_s16(vget_high_s16(a[10]), 2); + left[11] = vshll_n_s16(vget_low_s16(a[11]), 2); + right[11] = vshll_n_s16(vget_high_s16(a[11]), 2); + left[12] = vshll_n_s16(vget_low_s16(a[12]), 2); + right[12] = vshll_n_s16(vget_high_s16(a[12]), 2); + left[13] = vshll_n_s16(vget_low_s16(a[13]), 2); + right[13] = vshll_n_s16(vget_high_s16(a[13]), 2); + left[14] = vshll_n_s16(vget_low_s16(a[14]), 2); + right[14] = vshll_n_s16(vget_high_s16(a[14]), 2); + left[15] = vshll_n_s16(vget_low_s16(a[15]), 2); + right[15] = vshll_n_s16(vget_high_s16(a[15]), 2); +} + +static INLINE void highbd_cross_input(const int32x4_t *a_left /*[16]*/, + int32x4_t *a_right /*[16]*/, + int32x4_t *b_left /*[16]*/, + int32x4_t *b_right /*[16]*/) { + b_left[0] = vaddq_s32(a_left[0], a_left[15]); + b_left[1] = vaddq_s32(a_left[1], a_left[14]); + b_left[2] = vaddq_s32(a_left[2], a_left[13]); + b_left[3] = vaddq_s32(a_left[3], a_left[12]); + b_left[4] = vaddq_s32(a_left[4], a_left[11]); + b_left[5] = vaddq_s32(a_left[5], a_left[10]); + b_left[6] = vaddq_s32(a_left[6], a_left[9]); + b_left[7] = vaddq_s32(a_left[7], a_left[8]); + + b_right[0] = vaddq_s32(a_right[0], a_right[15]); + b_right[1] = vaddq_s32(a_right[1], a_right[14]); + b_right[2] = vaddq_s32(a_right[2], a_right[13]); + b_right[3] = vaddq_s32(a_right[3], a_right[12]); + b_right[4] = vaddq_s32(a_right[4], a_right[11]); + b_right[5] = vaddq_s32(a_right[5], a_right[10]); + b_right[6] = vaddq_s32(a_right[6], a_right[9]); + b_right[7] = vaddq_s32(a_right[7], a_right[8]); + + b_left[8] = vsubq_s32(a_left[7], a_left[8]); + b_left[9] = vsubq_s32(a_left[6], a_left[9]); + b_left[10] = vsubq_s32(a_left[5], a_left[10]); + b_left[11] = vsubq_s32(a_left[4], a_left[11]); + b_left[12] = vsubq_s32(a_left[3], a_left[12]); + b_left[13] = vsubq_s32(a_left[2], a_left[13]); + b_left[14] = vsubq_s32(a_left[1], a_left[14]); + b_left[15] = vsubq_s32(a_left[0], a_left[15]); + + b_right[8] = vsubq_s32(a_right[7], a_right[8]); + b_right[9] = vsubq_s32(a_right[6], a_right[9]); + b_right[10] = vsubq_s32(a_right[5], a_right[10]); + b_right[11] = vsubq_s32(a_right[4], a_right[11]); + b_right[12] = vsubq_s32(a_right[3], a_right[12]); + b_right[13] = vsubq_s32(a_right[2], a_right[13]); + b_right[14] = vsubq_s32(a_right[1], a_right[14]); + b_right[15] = vsubq_s32(a_right[0], a_right[15]); +} + +static INLINE void highbd_partial_round_shift(int32x4_t *left /*[16]*/, + int32x4_t *right /* [16] */) { + const int32x4_t one = vdupq_n_s32(1); + left[0] = vshrq_n_s32(vaddq_s32(left[0], one), 2); + right[0] = vshrq_n_s32(vaddq_s32(right[0], one), 2); + left[1] = vshrq_n_s32(vaddq_s32(left[1], one), 2); + right[1] = vshrq_n_s32(vaddq_s32(right[1], one), 2); + left[2] = vshrq_n_s32(vaddq_s32(left[2], one), 2); + right[2] = vshrq_n_s32(vaddq_s32(right[2], one), 2); + left[3] = vshrq_n_s32(vaddq_s32(left[3], one), 2); + right[3] = vshrq_n_s32(vaddq_s32(right[3], one), 2); + left[4] = vshrq_n_s32(vaddq_s32(left[4], one), 2); + right[4] = vshrq_n_s32(vaddq_s32(right[4], one), 2); + left[5] = vshrq_n_s32(vaddq_s32(left[5], one), 2); + right[5] = vshrq_n_s32(vaddq_s32(right[5], one), 2); + left[6] = vshrq_n_s32(vaddq_s32(left[6], one), 2); + right[6] = vshrq_n_s32(vaddq_s32(right[6], one), 2); + left[7] = vshrq_n_s32(vaddq_s32(left[7], one), 2); + right[7] = vshrq_n_s32(vaddq_s32(right[7], one), 2); + left[8] = vshrq_n_s32(vaddq_s32(left[8], one), 2); + right[8] = vshrq_n_s32(vaddq_s32(right[8], one), 2); + left[9] = vshrq_n_s32(vaddq_s32(left[9], one), 2); + right[9] = vshrq_n_s32(vaddq_s32(right[9], one), 2); + left[10] = vshrq_n_s32(vaddq_s32(left[10], one), 2); + right[10] = vshrq_n_s32(vaddq_s32(right[10], one), 2); + left[11] = vshrq_n_s32(vaddq_s32(left[11], one), 2); + right[11] = vshrq_n_s32(vaddq_s32(right[11], one), 2); + left[12] = vshrq_n_s32(vaddq_s32(left[12], one), 2); + right[12] = vshrq_n_s32(vaddq_s32(right[12], one), 2); + left[13] = vshrq_n_s32(vaddq_s32(left[13], one), 2); + right[13] = vshrq_n_s32(vaddq_s32(right[13], one), 2); + left[14] = vshrq_n_s32(vaddq_s32(left[14], one), 2); + right[14] = vshrq_n_s32(vaddq_s32(right[14], one), 2); + left[15] = vshrq_n_s32(vaddq_s32(left[15], one), 2); + right[15] = vshrq_n_s32(vaddq_s32(right[15], one), 2); +} + +static INLINE void transpose_s32_8x8_2(int32x4_t *left /*[8]*/, + int32x4_t *right /*[8]*/, + int32x4_t *out_left /*[8]*/, + int32x4_t *out_right /*[8]*/) { + int32x4x2_t out[8]; + + out[0].val[0] = left[0]; + out[0].val[1] = right[0]; + out[1].val[0] = left[1]; + out[1].val[1] = right[1]; + out[2].val[0] = left[2]; + out[2].val[1] = right[2]; + out[3].val[0] = left[3]; + out[3].val[1] = right[3]; + out[4].val[0] = left[4]; + out[4].val[1] = right[4]; + out[5].val[0] = left[5]; + out[5].val[1] = right[5]; + out[6].val[0] = left[6]; + out[6].val[1] = right[6]; + out[7].val[0] = left[7]; + out[7].val[1] = right[7]; + + transpose_s32_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5], + &out[6], &out[7]); + + out_left[0] = out[0].val[0]; + out_left[1] = out[1].val[0]; + out_left[2] = out[2].val[0]; + out_left[3] = out[3].val[0]; + out_left[4] = out[4].val[0]; + out_left[5] = out[5].val[0]; + out_left[6] = out[6].val[0]; + out_left[7] = out[7].val[0]; + out_right[0] = out[0].val[1]; + out_right[1] = out[1].val[1]; + out_right[2] = out[2].val[1]; + out_right[3] = out[3].val[1]; + out_right[4] = out[4].val[1]; + out_right[5] = out[5].val[1]; + out_right[6] = out[6].val[1]; + out_right[7] = out[7].val[1]; +} + +// Store 16 32x4 vectors, assuming stride == 16. +static INLINE void store16_s32(tran_low_t *a, const int32x4_t *b /*[32]*/) { + vst1q_s32(a, b[0]); + a += 16; + vst1q_s32(a, b[1]); + a += 16; + vst1q_s32(a, b[2]); + a += 16; + vst1q_s32(a, b[3]); + a += 16; + vst1q_s32(a, b[4]); + a += 16; + vst1q_s32(a, b[5]); + a += 16; + vst1q_s32(a, b[6]); + a += 16; + vst1q_s32(a, b[7]); + a += 16; + vst1q_s32(a, b[8]); + a += 16; + vst1q_s32(a, b[9]); + a += 16; + vst1q_s32(a, b[10]); + a += 16; + vst1q_s32(a, b[11]); + a += 16; + vst1q_s32(a, b[12]); + a += 16; + vst1q_s32(a, b[13]); + a += 16; + vst1q_s32(a, b[14]); + a += 16; + vst1q_s32(a, b[15]); +} + +// Main body of fdct16x16. +static void vpx_highbd_fdct16x16_body(int32x4_t *left /*[16]*/, + int32x4_t *right /* [16] */) { + int32x4_t sl[8]; + int32x4_t sr[8]; + int32x4_t xl[4]; + int32x4_t xr[4]; + int32x4_t inl[8]; + int32x4_t inr[8]; + int32x4_t stepl[8]; + int32x4_t stepr[8]; + + // stage 1 + // From fwd_txfm.c: Work on the first eight values; fdct8(input, + // even_results);" + sl[0] = vaddq_s32(left[0], left[7]); + sr[0] = vaddq_s32(right[0], right[7]); + sl[1] = vaddq_s32(left[1], left[6]); + sr[1] = vaddq_s32(right[1], right[6]); + sl[2] = vaddq_s32(left[2], left[5]); + sr[2] = vaddq_s32(right[2], right[5]); + sl[3] = vaddq_s32(left[3], left[4]); + sr[3] = vaddq_s32(right[3], right[4]); + sl[4] = vsubq_s32(left[3], left[4]); + sr[4] = vsubq_s32(right[3], right[4]); + sl[5] = vsubq_s32(left[2], left[5]); + sr[5] = vsubq_s32(right[2], right[5]); + sl[6] = vsubq_s32(left[1], left[6]); + sr[6] = vsubq_s32(right[1], right[6]); + sl[7] = vsubq_s32(left[0], left[7]); + sr[7] = vsubq_s32(right[0], right[7]); + + // Copy values 8-15 as we're storing in-place + inl[0] = left[8]; + inr[0] = right[8]; + inl[1] = left[9]; + inr[1] = right[9]; + inl[2] = left[10]; + inr[2] = right[10]; + inl[3] = left[11]; + inr[3] = right[11]; + inl[4] = left[12]; + inr[4] = right[12]; + inl[5] = left[13]; + inr[5] = right[13]; + inl[6] = left[14]; + inr[6] = right[14]; + inl[7] = left[15]; + inr[7] = right[15]; + + // fdct4(step, step); + xl[0] = vaddq_s32(sl[0], sl[3]); + xr[0] = vaddq_s32(sr[0], sr[3]); + xl[1] = vaddq_s32(sl[1], sl[2]); + xr[1] = vaddq_s32(sr[1], sr[2]); + xl[2] = vsubq_s32(sl[1], sl[2]); + xr[2] = vsubq_s32(sr[1], sr[2]); + xl[3] = vsubq_s32(sl[0], sl[3]); + xr[3] = vsubq_s32(sr[0], sr[3]); + + // out[0] = fdct_round_shift((x0 + x1) * cospi_16_64) + // out[8] = fdct_round_shift((x0 - x1) * cospi_16_64) + highbd_butterfly_one_coeff_s32(xl[0], xl[1], cospi_16_64, &left[0], &left[8]); + highbd_butterfly_one_coeff_s32(xr[0], xr[1], cospi_16_64, &right[0], + &right[8]); + // out[4] = fdct_round_shift(x3 * cospi_8_64 + x2 * cospi_24_64); + // out[12] = fdct_round_shift(x3 * cospi_24_64 - x2 * cospi_8_64); + highbd_butterfly_two_coeff_s32(xl[3], xl[2], cospi_8_64, cospi_24_64, + &left[4], &left[12]); + highbd_butterfly_two_coeff_s32(xr[3], xr[2], cospi_8_64, cospi_24_64, + &right[4], &right[12]); + + // Stage 2 + // Re-using source s5/s6 + // s5 = fdct_round_shift((s6 - s5) * cospi_16_64) + // s6 = fdct_round_shift((s6 + s5) * cospi_16_64) + highbd_butterfly_one_coeff_s32(sl[6], sl[5], cospi_16_64, &sl[6], &sl[5]); + highbd_butterfly_one_coeff_s32(sr[6], sr[5], cospi_16_64, &sr[6], &sr[5]); + + // Stage 3 + xl[0] = vaddq_s32(sl[4], sl[5]); + xr[0] = vaddq_s32(sr[4], sr[5]); + xl[1] = vsubq_s32(sl[4], sl[5]); + xr[1] = vsubq_s32(sr[4], sr[5]); + xl[2] = vsubq_s32(sl[7], sl[6]); + xr[2] = vsubq_s32(sr[7], sr[6]); + xl[3] = vaddq_s32(sl[7], sl[6]); + xr[3] = vaddq_s32(sr[7], sr[6]); + + // Stage 4 + // out[2] = fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64) + // out[14] = fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64) + highbd_butterfly_two_coeff_s32(xl[3], xl[0], cospi_4_64, cospi_28_64, + &left[2], &left[14]); + highbd_butterfly_two_coeff_s32(xr[3], xr[0], cospi_4_64, cospi_28_64, + &right[2], &right[14]); + // out[6] = fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64) + // out[10] = fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64) + highbd_butterfly_two_coeff_s32(xl[2], xl[1], cospi_20_64, cospi_12_64, + &left[10], &left[6]); + highbd_butterfly_two_coeff_s32(xr[2], xr[1], cospi_20_64, cospi_12_64, + &right[10], &right[6]); + + // step 2 + // From fwd_txfm.c: Work on the next eight values; step1 -> odd_results" + // That file distinguished between "in_high" and "step1" but the only + // difference is that "in_high" is the first 8 values and "step 1" is the + // second. Here, since they are all in one array, "step1" values are += 8. + + // step2[2] = fdct_round_shift((step1[5] - step1[2]) * cospi_16_64) + // step2[3] = fdct_round_shift((step1[4] - step1[3]) * cospi_16_64) + // step2[4] = fdct_round_shift((step1[4] + step1[3]) * cospi_16_64) + // step2[5] = fdct_round_shift((step1[5] + step1[2]) * cospi_16_64) + highbd_butterfly_one_coeff_s32(inl[5], inl[2], cospi_16_64, &sl[5], &sl[2]); + highbd_butterfly_one_coeff_s32(inr[5], inr[2], cospi_16_64, &sr[5], &sr[2]); + highbd_butterfly_one_coeff_s32(inl[4], inl[3], cospi_16_64, &sl[4], &sl[3]); + highbd_butterfly_one_coeff_s32(inr[4], inr[3], cospi_16_64, &sr[4], &sr[3]); + + // step 3 + sl[0] = vaddq_s32(inl[0], sl[3]); + sr[0] = vaddq_s32(inr[0], sr[3]); + sl[1] = vaddq_s32(inl[1], sl[2]); + sr[1] = vaddq_s32(inr[1], sr[2]); + xl[0] = vsubq_s32(inl[1], sl[2]); + xr[0] = vsubq_s32(inr[1], sr[2]); + xl[1] = vsubq_s32(inl[0], sl[3]); + xr[1] = vsubq_s32(inr[0], sr[3]); + xl[2] = vsubq_s32(inl[7], sl[4]); + xr[2] = vsubq_s32(inr[7], sr[4]); + xl[3] = vsubq_s32(inl[6], sl[5]); + xr[3] = vsubq_s32(inr[6], sr[5]); + sl[6] = vaddq_s32(inl[6], sl[5]); + sr[6] = vaddq_s32(inr[6], sr[5]); + sl[7] = vaddq_s32(inl[7], sl[4]); + sr[7] = vaddq_s32(inr[7], sr[4]); + + // step 4 + // step2[1] = fdct_round_shift(step3[1] *-cospi_8_64 + step3[6] * cospi_24_64) + // step2[6] = fdct_round_shift(step3[1] * cospi_24_64 + step3[6] * cospi_8_64) + highbd_butterfly_two_coeff_s32(sl[6], sl[1], cospi_8_64, cospi_24_64, &sl[6], + &sl[1]); + highbd_butterfly_two_coeff_s32(sr[6], sr[1], cospi_8_64, cospi_24_64, &sr[6], + &sr[1]); + + // step2[2] = fdct_round_shift(step3[2] * cospi_24_64 + step3[5] * cospi_8_64) + // step2[5] = fdct_round_shift(step3[2] * cospi_8_64 - step3[5] * cospi_24_64) + highbd_butterfly_two_coeff_s32(xl[0], xl[3], cospi_24_64, cospi_8_64, &sl[2], + &sl[5]); + highbd_butterfly_two_coeff_s32(xr[0], xr[3], cospi_24_64, cospi_8_64, &sr[2], + &sr[5]); + + // step 5 + stepl[0] = vaddq_s32(sl[0], sl[1]); + stepr[0] = vaddq_s32(sr[0], sr[1]); + stepl[1] = vsubq_s32(sl[0], sl[1]); + stepr[1] = vsubq_s32(sr[0], sr[1]); + stepl[2] = vaddq_s32(xl[1], sl[2]); + stepr[2] = vaddq_s32(xr[1], sr[2]); + stepl[3] = vsubq_s32(xl[1], sl[2]); + stepr[3] = vsubq_s32(xr[1], sr[2]); + stepl[4] = vsubq_s32(xl[2], sl[5]); + stepr[4] = vsubq_s32(xr[2], sr[5]); + stepl[5] = vaddq_s32(xl[2], sl[5]); + stepr[5] = vaddq_s32(xr[2], sr[5]); + stepl[6] = vsubq_s32(sl[7], sl[6]); + stepr[6] = vsubq_s32(sr[7], sr[6]); + stepl[7] = vaddq_s32(sl[7], sl[6]); + stepr[7] = vaddq_s32(sr[7], sr[6]); + + // step 6 + // out[1] = fdct_round_shift(step1[0] * cospi_30_64 + step1[7] * cospi_2_64) + // out[15] = fdct_round_shift(step1[0] * -cospi_2_64 + step1[7] * cospi_30_64) + // out[9] = fdct_round_shift(step1[1] * cospi_14_64 + step1[6] * cospi_18_64) + // out[7] = fdct_round_shift(step1[1] * -cospi_18_64 + step1[6] * cospi_14_64) + // out[5] = fdct_round_shift(step1[2] * cospi_22_64 + step1[5] * cospi_10_64) + // out[11] = fdct_round_shift(step1[2] * -cospi_10_64 + step1[5] * + // cospi_22_64) out[13] = fdct_round_shift(step1[3] * cospi_6_64 + step1[4] * + // cospi_26_64) out[3] = fdct_round_shift(step1[3] * -cospi_26_64 + step1[4] * + // cospi_6_64) + highbd_butterfly_two_coeff_s32(stepl[7], stepl[0], cospi_2_64, cospi_30_64, + &left[1], &left[15]); + highbd_butterfly_two_coeff_s32(stepr[7], stepr[0], cospi_2_64, cospi_30_64, + &right[1], &right[15]); + highbd_butterfly_two_coeff_s32(stepl[6], stepl[1], cospi_18_64, cospi_14_64, + &left[9], &left[7]); + highbd_butterfly_two_coeff_s32(stepr[6], stepr[1], cospi_18_64, cospi_14_64, + &right[9], &right[7]); + highbd_butterfly_two_coeff_s32(stepl[5], stepl[2], cospi_10_64, cospi_22_64, + &left[5], &left[11]); + highbd_butterfly_two_coeff_s32(stepr[5], stepr[2], cospi_10_64, cospi_22_64, + &right[5], &right[11]); + highbd_butterfly_two_coeff_s32(stepl[4], stepl[3], cospi_26_64, cospi_6_64, + &left[13], &left[3]); + highbd_butterfly_two_coeff_s32(stepr[4], stepr[3], cospi_26_64, cospi_6_64, + &right[13], &right[3]); +} + +#endif // CONFIG_VP9_HIGHBITDEPTH + #endif // VPX_VPX_DSP_ARM_FDCT16X16_NEON_H_ diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index ca904aa504..c82be0a6cc 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -562,7 +562,7 @@ () $vpx_highbd_fdct8x8_1_neon=vpx_fdct8x8_1_neon; add_proto qw/void vpx_highbd_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vpx_highbd_fdct16x16 sse2/; + specialize qw/vpx_highbd_fdct16x16 sse2 neon/; add_proto qw/void vpx_highbd_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/vpx_highbd_fdct16x16_1 neon/; From 124e57be95dbb05ee6b5b1554e345b97a4ddf34e Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Thu, 13 Oct 2022 15:19:46 +0000 Subject: [PATCH 0530/1000] [NEON] fix clang compile warnings Change-Id: Ib7ce7a774ec89ba51169ea64d24c878109ef07d1 --- vpx_dsp/arm/fdct_neon.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vpx_dsp/arm/fdct_neon.h b/vpx_dsp/arm/fdct_neon.h index c100e709d5..ce669061d2 100644 --- a/vpx_dsp/arm/fdct_neon.h +++ b/vpx_dsp/arm/fdct_neon.h @@ -358,7 +358,7 @@ static INLINE int32x4_t highbd_add_round_shift_s32(int32x4_t x) { static INLINE void highbd_butterfly_one_coeff_s32(const int32x4_t a, const int32x4_t b, - const tran_high_t c, + const tran_coef_t c, int32x4_t *add, int32x4_t *sub) { const int32x2_t a_lo = vget_low_s32(a); From 4007a057fcb60f57f7b60ecc888c4d4b9043b2db Mon Sep 17 00:00:00 2001 From: Marco Paniconi Date: Wed, 12 Oct 2022 00:10:47 -0700 Subject: [PATCH 0531/1000] Fix to VP8 external RC for dynamic update of layers On change/update of rc_cfg: when number of temporal layers change call vp8_reset_temporal_layer_change(), which in turn will call vp8_init_temporal_layer_context() only for the new layers. Bug:b/249644737 Change-Id: Ib20d746c7eacd10b78806ca6a5362c750d9ca0b3 --- vp8/encoder/onyx_if.c | 6 +++--- vp8/encoder/onyx_int.h | 2 ++ vp8/vp8_ratectrl_rtc.cc | 29 +++++++++++++++++++++++------ 3 files changed, 28 insertions(+), 9 deletions(-) diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index 94fb6e256e..4bbeadef01 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -328,8 +328,8 @@ void vp8_init_temporal_layer_context(VP8_COMP *cpi, VP8_CONFIG *oxcf, // for any "new" layers. For "existing" layers, let them inherit the parameters // from the previous layer state (at the same layer #). In future we may want // to better map the previous layer state(s) to the "new" ones. -static void reset_temporal_layer_change(VP8_COMP *cpi, VP8_CONFIG *oxcf, - const int prev_num_layers) { +void vp8_reset_temporal_layer_change(VP8_COMP *cpi, VP8_CONFIG *oxcf, + const int prev_num_layers) { int i; double prev_layer_framerate = 0; const int curr_num_layers = cpi->oxcf.number_of_layers; @@ -1643,7 +1643,7 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) { cpi->temporal_layer_id = 0; } cpi->temporal_pattern_counter = 0; - reset_temporal_layer_change(cpi, oxcf, prev_number_of_layers); + vp8_reset_temporal_layer_change(cpi, oxcf, prev_number_of_layers); } if (!cpi->initial_width) { diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h index 7951f0a77e..46a17913ad 100644 --- a/vp8/encoder/onyx_int.h +++ b/vp8/encoder/onyx_int.h @@ -713,6 +713,8 @@ void vp8_initialize_enc(void); void vp8_alloc_compressor_data(VP8_COMP *cpi); int vp8_reverse_trans(int x); +void vp8_reset_temporal_layer_change(VP8_COMP *cpi, VP8_CONFIG *oxcf, + const int prev_num_layers); void vp8_init_temporal_layer_context(VP8_COMP *cpi, VP8_CONFIG *oxcf, const int layer, double prev_layer_framerate); diff --git a/vp8/vp8_ratectrl_rtc.cc b/vp8/vp8_ratectrl_rtc.cc index 2f23c5b1d9..46621546dd 100644 --- a/vp8/vp8_ratectrl_rtc.cc +++ b/vp8/vp8_ratectrl_rtc.cc @@ -92,6 +92,7 @@ void VP8RateControlRTC::UpdateRateControl( const VP8RateControlRtcConfig &rc_cfg) { VP8_COMMON *cm = &cpi_->common; VP8_CONFIG *oxcf = &cpi_->oxcf; + const unsigned int prev_number_of_layers = oxcf->number_of_layers; vpx_clear_system_state(); cm->Width = rc_cfg.width; cm->Height = rc_cfg.height; @@ -124,17 +125,33 @@ void VP8RateControlRTC::UpdateRateControl( static_cast(cpi_->output_framerate); } - if (oxcf->number_of_layers > 1) { + if (oxcf->number_of_layers > 1 || prev_number_of_layers > 1) { memcpy(oxcf->target_bitrate, rc_cfg.layer_target_bitrate, sizeof(rc_cfg.layer_target_bitrate)); memcpy(oxcf->rate_decimator, rc_cfg.ts_rate_decimator, sizeof(rc_cfg.ts_rate_decimator)); - oxcf->periodicity = 2; + if (cm->current_video_frame == 0) { + double prev_layer_framerate = 0; + for (unsigned int i = 0; i < oxcf->number_of_layers; ++i) { + vp8_init_temporal_layer_context(cpi_, oxcf, i, prev_layer_framerate); + prev_layer_framerate = cpi_->output_framerate / oxcf->rate_decimator[i]; + } + } else if (oxcf->number_of_layers != prev_number_of_layers) { + // The number of temporal layers has changed, so reset/initialize the + // temporal layer context for the new layer configuration: this means + // calling vp8_reset_temporal_layer_change() below. + + // Start at the base of the pattern cycle, so set the layer id to 0 and + // reset the temporal pattern counter. + // TODO(marpan/jianj): don't think lines 148-151 are needed (user controls + // the layer_id) so remove. + if (cpi_->temporal_layer_id > 0) { + cpi_->temporal_layer_id = 0; + } + cpi_->temporal_pattern_counter = 0; - double prev_layer_framerate = 0; - for (unsigned int i = 0; i < oxcf->number_of_layers; ++i) { - vp8_init_temporal_layer_context(cpi_, oxcf, i, prev_layer_framerate); - prev_layer_framerate = cpi_->output_framerate / oxcf->rate_decimator[i]; + vp8_reset_temporal_layer_change(cpi_, oxcf, + static_cast(prev_number_of_layers)); } } From 828d05d4a497e7f5d635a8e5bf35fca790759b13 Mon Sep 17 00:00:00 2001 From: Johann Date: Sat, 1 Oct 2022 11:47:05 +0900 Subject: [PATCH 0532/1000] vp9 quantize: rewrite ssse3 in intrinsics Change-Id: I3177251a5935453a23a23c39ea5f6fd41254775e --- test/vp9_quantize_test.cc | 11 - vp9/common/vp9_rtcd_defs.pl | 4 +- vp9/encoder/x86/vp9_quantize_ssse3.c | 253 ++++++++++++++++++ vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm | 174 ------------ vp9/vp9cx.mk | 5 +- 5 files changed, 256 insertions(+), 191 deletions(-) create mode 100644 vp9/encoder/x86/vp9_quantize_ssse3.c delete mode 100644 vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc index 7bb0bee512..587cec6923 100644 --- a/test/vp9_quantize_test.cc +++ b/test/vp9_quantize_test.cc @@ -539,7 +539,6 @@ INSTANTIATE_TEST_SUITE_P( #endif // HAVE_SSE2 #if HAVE_SSSE3 -#if VPX_ARCH_X86_64 INSTANTIATE_TEST_SUITE_P( SSSE3, VP9QuantizeTest, ::testing::Values(make_tuple(&vpx_quantize_b_ssse3, &vpx_quantize_b_c, @@ -553,16 +552,6 @@ INSTANTIATE_TEST_SUITE_P( make_tuple(&QuantFPWrapper, &QuantFPWrapper, VPX_BITS_8, 32, true))); -#else -INSTANTIATE_TEST_SUITE_P( - SSSE3, VP9QuantizeTest, - ::testing::Values(make_tuple(&vpx_quantize_b_ssse3, &vpx_quantize_b_c, - VPX_BITS_8, 16, false), - make_tuple(&vpx_quantize_b_32x32_ssse3, - &vpx_quantize_b_32x32_c, VPX_BITS_8, 32, - false))); - -#endif // VPX_ARCH_X86_64 #endif // HAVE_SSSE3 #if HAVE_AVX diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index 4290c2380e..871e4d0a35 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -129,10 +129,10 @@ () add_proto qw/int64_t vp9_block_error_fp/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size"; add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; -specialize qw/vp9_quantize_fp neon sse2 avx2 vsx/, "$ssse3_x86_64"; +specialize qw/vp9_quantize_fp neon sse2 ssse3 avx2 vsx/; add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; -specialize qw/vp9_quantize_fp_32x32 neon avx2 vsx/, "$ssse3_x86_64"; +specialize qw/vp9_quantize_fp_32x32 neon ssse3 avx2 vsx/; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vp9_block_error avx2 sse2/; diff --git a/vp9/encoder/x86/vp9_quantize_ssse3.c b/vp9/encoder/x86/vp9_quantize_ssse3.c new file mode 100644 index 0000000000..d35004e370 --- /dev/null +++ b/vp9/encoder/x86/vp9_quantize_ssse3.c @@ -0,0 +1,253 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vp9_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/x86/bitdepth_conversion_sse2.h" +#include "vpx_dsp/x86/quantize_sse2.h" +#include "vpx_dsp/x86/quantize_ssse3.h" + +void vp9_quantize_fp_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *round_ptr, const int16_t *quant_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + const __m128i zero = _mm_setzero_si128(); + __m128i thr; + int nzflag; + int index = 16; + __m128i round, quant, dequant; + __m128i coeff0, coeff1; + __m128i qcoeff0, qcoeff1; + __m128i eob; + + (void)scan; + + // Setup global values. + load_fp_values(round_ptr, &round, quant_ptr, &quant, dequant_ptr, &dequant); + + // Do DC and first 15 AC. + coeff0 = load_tran_low(coeff_ptr); + coeff1 = load_tran_low(coeff_ptr + 8); + + qcoeff0 = _mm_abs_epi16(coeff0); + qcoeff1 = _mm_abs_epi16(coeff1); + + qcoeff0 = _mm_adds_epi16(qcoeff0, round); + qcoeff0 = _mm_mulhi_epi16(qcoeff0, quant); + + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + + qcoeff1 = _mm_adds_epi16(qcoeff1, round); + qcoeff1 = _mm_mulhi_epi16(qcoeff1, quant); + + // Reinsert signs. + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + store_tran_low(qcoeff0, qcoeff_ptr); + store_tran_low(qcoeff1, qcoeff_ptr + 8); + + qcoeff0 = _mm_mullo_epi16(qcoeff0, dequant); + dequant = _mm_unpackhi_epi64(dequant, dequant); + qcoeff1 = _mm_mullo_epi16(qcoeff1, dequant); + + store_tran_low(qcoeff0, dqcoeff_ptr); + store_tran_low(qcoeff1, dqcoeff_ptr + 8); + + eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero); + + thr = _mm_srai_epi16(dequant, 1); + + // AC only loop. + while (index < n_coeffs) { + coeff0 = load_tran_low(coeff_ptr + index); + coeff1 = load_tran_low(coeff_ptr + index + 8); + + qcoeff0 = _mm_abs_epi16(coeff0); + qcoeff1 = _mm_abs_epi16(coeff1); + + nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) | + _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr)); + + if (nzflag) { + __m128i eob0; + qcoeff0 = _mm_adds_epi16(qcoeff0, round); + qcoeff1 = _mm_adds_epi16(qcoeff1, round); + qcoeff0 = _mm_mulhi_epi16(qcoeff0, quant); + qcoeff1 = _mm_mulhi_epi16(qcoeff1, quant); + + // Reinsert signs. + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + store_tran_low(qcoeff0, qcoeff_ptr + index); + store_tran_low(qcoeff1, qcoeff_ptr + index + 8); + + qcoeff0 = _mm_mullo_epi16(qcoeff0, dequant); + qcoeff1 = _mm_mullo_epi16(qcoeff1, dequant); + + store_tran_low(qcoeff0, dqcoeff_ptr + index); + store_tran_low(qcoeff1, dqcoeff_ptr + index + 8); + + eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero); + eob = _mm_max_epi16(eob, eob0); + } else { + store_zero_tran_low(qcoeff_ptr + index); + store_zero_tran_low(qcoeff_ptr + index + 8); + + store_zero_tran_low(dqcoeff_ptr + index); + store_zero_tran_low(dqcoeff_ptr + index + 8); + } + + index += 16; + } + + *eob_ptr = accumulate_eob(eob); +} + +void vp9_quantize_fp_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *round_ptr, + const int16_t *quant_ptr, + tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + const __m128i zero = _mm_setzero_si128(); + const __m128i one_s16 = _mm_set1_epi16(1); + __m128i thr; + int nzflag; + int index = 16; + __m128i round, quant, dequant; + __m128i coeff0, coeff1; + __m128i qcoeff0, qcoeff1; + __m128i eob; + + (void)scan; + + // Setup global values. + load_fp_values(round_ptr, &round, quant_ptr, &quant, dequant_ptr, &dequant); + // The 32x32 halves round. + round = _mm_add_epi16(round, one_s16); + round = _mm_srli_epi16(round, 1); + + // The 16x16 shifts by 16, the 32x32 shifts by 15. We want to use pmulhw so + // upshift quant to account for this. + quant = _mm_slli_epi16(quant, 1); + + // Do DC and first 15 AC. + coeff0 = load_tran_low(coeff_ptr); + coeff1 = load_tran_low(coeff_ptr + 8); + + qcoeff0 = _mm_abs_epi16(coeff0); + qcoeff1 = _mm_abs_epi16(coeff1); + + qcoeff0 = _mm_adds_epi16(qcoeff0, round); + qcoeff0 = _mm_mulhi_epi16(qcoeff0, quant); + + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + + qcoeff1 = _mm_adds_epi16(qcoeff1, round); + qcoeff1 = _mm_mulhi_epi16(qcoeff1, quant); + + // Reinsert signs. + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + store_tran_low(qcoeff0, qcoeff_ptr); + store_tran_low(qcoeff1, qcoeff_ptr + 8); + + // Get the abs value of qcoeff again so we can use shifts for division. + qcoeff0 = _mm_abs_epi16(qcoeff0); + qcoeff1 = _mm_abs_epi16(qcoeff1); + + qcoeff0 = _mm_mullo_epi16(qcoeff0, dequant); + dequant = _mm_unpackhi_epi64(dequant, dequant); + qcoeff1 = _mm_mullo_epi16(qcoeff1, dequant); + + // Divide by 2. + qcoeff0 = _mm_srli_epi16(qcoeff0, 1); + qcoeff1 = _mm_srli_epi16(qcoeff1, 1); + + // Reinsert signs. + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + store_tran_low(qcoeff0, dqcoeff_ptr); + store_tran_low(qcoeff1, dqcoeff_ptr + 8); + + eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero); + + thr = _mm_srai_epi16(dequant, 2); + + // AC only loop. + while (index < n_coeffs) { + coeff0 = load_tran_low(coeff_ptr + index); + coeff1 = load_tran_low(coeff_ptr + index + 8); + + qcoeff0 = _mm_abs_epi16(coeff0); + qcoeff1 = _mm_abs_epi16(coeff1); + + nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) | + _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr)); + + if (nzflag) { + qcoeff0 = _mm_adds_epi16(qcoeff0, round); + qcoeff1 = _mm_adds_epi16(qcoeff1, round); + qcoeff0 = _mm_mulhi_epi16(qcoeff0, quant); + qcoeff1 = _mm_mulhi_epi16(qcoeff1, quant); + + // Reinsert signs. + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + store_tran_low(qcoeff0, qcoeff_ptr + index); + store_tran_low(qcoeff1, qcoeff_ptr + index + 8); + + // Get the abs value of qcoeff again so we can use shifts for division. + qcoeff0 = _mm_abs_epi16(qcoeff0); + qcoeff1 = _mm_abs_epi16(qcoeff1); + + qcoeff0 = _mm_mullo_epi16(qcoeff0, dequant); + qcoeff1 = _mm_mullo_epi16(qcoeff1, dequant); + + // Divide by 2. + qcoeff0 = _mm_srli_epi16(qcoeff0, 1); + qcoeff1 = _mm_srli_epi16(qcoeff1, 1); + + // Reinsert signs. + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + store_tran_low(qcoeff0, dqcoeff_ptr + index); + store_tran_low(qcoeff1, dqcoeff_ptr + index + 8); + } else { + store_zero_tran_low(qcoeff_ptr + index); + store_zero_tran_low(qcoeff_ptr + index + 8); + + store_zero_tran_low(dqcoeff_ptr + index); + store_zero_tran_low(dqcoeff_ptr + index + 8); + } + + if (nzflag) { + const __m128i eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero); + eob = _mm_max_epi16(eob, eob0); + } + index += 16; + } + + *eob_ptr = accumulate_eob(eob); +} diff --git a/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm b/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm deleted file mode 100644 index ae43a90f8b..0000000000 --- a/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm +++ /dev/null @@ -1,174 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - -%define private_prefix vp9 - -%include "third_party/x86inc/x86inc.asm" -%include "vpx_dsp/x86/bitdepth_conversion_sse2.asm" - -SECTION_RODATA -pw_1: times 8 dw 1 - -SECTION .text - -%macro QUANTIZE_FP 2 -cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, round, quant, \ - qcoeff, dqcoeff, dequant, \ - eob, scan, iscan - - ; actual quantize loop - setup pointers, rounders, etc. - movifnidn coeffq, coeffmp - movifnidn ncoeffq, ncoeffmp - movifnidn roundq, roundmp - movifnidn quantq, quantmp - mova m1, [roundq] ; m1 = round - mova m2, [quantq] ; m2 = quant - mov r2, dequantmp -%ifidn %1, fp_32x32 - pcmpeqw m5, m5 - psrlw m5, 15 - paddw m1, m5 - psrlw m1, 1 ; m1 = (m1 + 1) / 2 -%endif - mova m3, [r2q] ; m3 = dequant - mov r3, qcoeffmp - mov r4, dqcoeffmp - mov r5, iscanmp -%ifidn %1, fp_32x32 - psllw m2, 1 -%endif - pxor m5, m5 ; m5 = dedicated zero - - INCREMENT_ELEMENTS_TRAN_LOW coeffq, ncoeffq - lea r5q, [r5q+ncoeffq*2] - INCREMENT_ELEMENTS_TRAN_LOW r3q, ncoeffq - INCREMENT_ELEMENTS_TRAN_LOW r4q, ncoeffq - neg ncoeffq - - ; get DC and first 15 AC coeffs - LOAD_TRAN_LOW 9, coeffq, ncoeffq ; m9 = c[i] - LOAD_TRAN_LOW 10, coeffq, ncoeffq + 8 ; m10 = c[i] - pabsw m6, m9 ; m6 = abs(m9) - pabsw m11, m10 ; m11 = abs(m10) - pcmpeqw m7, m7 - - paddsw m6, m1 ; m6 += round - punpckhqdq m1, m1 - paddsw m11, m1 ; m11 += round - pmulhw m8, m6, m2 ; m8 = m6*q>>16 - punpckhqdq m2, m2 - pmulhw m13, m11, m2 ; m13 = m11*q>>16 - psignw m8, m9 ; m8 = reinsert sign - psignw m13, m10 ; m13 = reinsert sign - STORE_TRAN_LOW 8, r3q, ncoeffq, 6, 11, 12 - STORE_TRAN_LOW 13, r3q, ncoeffq + 8, 6, 11, 12 -%ifidn %1, fp_32x32 - pabsw m8, m8 - pabsw m13, m13 -%endif - pmullw m8, m3 ; r4[i] = r3[i] * q - punpckhqdq m3, m3 - pmullw m13, m3 ; r4[i] = r3[i] * q -%ifidn %1, fp_32x32 - psrlw m8, 1 - psrlw m13, 1 - psignw m8, m9 - psignw m13, m10 - psrlw m0, m3, 2 -%else - psrlw m0, m3, 1 -%endif - STORE_TRAN_LOW 8, r4q, ncoeffq, 6, 11, 12 - STORE_TRAN_LOW 13, r4q, ncoeffq + 8, 6, 11, 12 - pcmpeqw m8, m5 ; m8 = c[i] == 0 - pcmpeqw m13, m5 ; m13 = c[i] == 0 - mova m6, [ r5q+ncoeffq*2+ 0] ; m6 = scan[i] - mova m11, [ r5q+ncoeffq*2+16] ; m11 = scan[i] - pandn m8, m6 ; m8 = max(eob) - pandn m13, m11 ; m13 = max(eob) - pmaxsw m8, m13 - add ncoeffq, mmsize - jz .accumulate_eob - -.ac_only_loop: - LOAD_TRAN_LOW 9, coeffq, ncoeffq ; m9 = c[i] - LOAD_TRAN_LOW 10, coeffq, ncoeffq + 8 ; m10 = c[i] - pabsw m6, m9 ; m6 = abs(m9) - pabsw m11, m10 ; m11 = abs(m10) - - pcmpgtw m7, m6, m0 - pcmpgtw m12, m11, m0 - pmovmskb r6d, m7 - pmovmskb r2d, m12 - - or r6, r2 - jz .skip_iter - - pcmpeqw m7, m7 - - paddsw m6, m1 ; m6 += round - paddsw m11, m1 ; m11 += round - pmulhw m14, m6, m2 ; m14 = m6*q>>16 - pmulhw m13, m11, m2 ; m13 = m11*q>>16 - psignw m14, m9 ; m14 = reinsert sign - psignw m13, m10 ; m13 = reinsert sign - STORE_TRAN_LOW 14, r3q, ncoeffq, 6, 11, 12 - STORE_TRAN_LOW 13, r3q, ncoeffq + 8, 6, 11, 12 -%ifidn %1, fp_32x32 - pabsw m14, m14 - pabsw m13, m13 -%endif - pmullw m14, m3 ; r4[i] = r3[i] * q - pmullw m13, m3 ; r4[i] = r3[i] * q -%ifidn %1, fp_32x32 - psrlw m14, 1 - psrlw m13, 1 - psignw m14, m9 - psignw m13, m10 -%endif - STORE_TRAN_LOW 14, r4q, ncoeffq, 6, 11, 12 - STORE_TRAN_LOW 13, r4q, ncoeffq + 8, 6, 11, 12 - pcmpeqw m14, m5 ; m14 = c[i] == 0 - pcmpeqw m13, m5 ; m13 = c[i] == 0 - mova m6, [ r5q+ncoeffq*2+ 0] ; m6 = scan[i] - mova m11, [ r5q+ncoeffq*2+16] ; m11 = scan[i] - pandn m14, m6 ; m14 = max(eob) - pandn m13, m11 ; m13 = max(eob) - pmaxsw m8, m14 - pmaxsw m8, m13 - add ncoeffq, mmsize - jl .ac_only_loop - - jmp .accumulate_eob -.skip_iter: - STORE_ZERO_TRAN_LOW 5, r3q, ncoeffq - STORE_ZERO_TRAN_LOW 5, r3q, ncoeffq + 8 - STORE_ZERO_TRAN_LOW 5, r4q, ncoeffq - STORE_ZERO_TRAN_LOW 5, r4q, ncoeffq + 8 - add ncoeffq, mmsize - jl .ac_only_loop - -.accumulate_eob: - ; horizontally accumulate/max eobs and write into [eob] memory pointer - mov r2, eobmp - pshufd m7, m8, 0xe - pmaxsw m8, m7 - pshuflw m7, m8, 0xe - pmaxsw m8, m7 - pshuflw m7, m8, 0x1 - pmaxsw m8, m7 - pextrw r6, m8, 0 - mov [r2], r6w - RET -%endmacro - -INIT_XMM ssse3 -QUANTIZE_FP fp, 7 -QUANTIZE_FP fp_32x32, 7 diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk index c9afd9a347..9072628f23 100644 --- a/vp9/vp9cx.mk +++ b/vp9/vp9cx.mk @@ -111,6 +111,7 @@ VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/temporal_filter_sse4.c VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/temporal_filter_constants.h VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.c +VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_quantize_ssse3.c VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_quantize_avx2.c VP9_CX_SRCS-$(HAVE_AVX) += encoder/x86/vp9_diamond_search_sad_avx.c VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_diamond_search_sad_neon.c @@ -122,10 +123,6 @@ endif VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_error_sse2.asm -ifeq ($(VPX_ARCH_X86_64),yes) -VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_quantize_ssse3_x86_64.asm -endif - VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_intrin_sse2.c VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_frame_scale_ssse3.c VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_dct_neon.c From e8fc52ada46d2375c2dce6ac9e9d0bf27f622866 Mon Sep 17 00:00:00 2001 From: Johann Date: Mon, 17 Oct 2022 16:22:23 +0900 Subject: [PATCH 0533/1000] quantize: consolidate sse2 conditionals Change-Id: I43de579e30f2967b97064063e29676e0af1a498f --- vp9/encoder/x86/vp9_quantize_sse2.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/vp9/encoder/x86/vp9_quantize_sse2.c b/vp9/encoder/x86/vp9_quantize_sse2.c index 272e5fb079..c877234436 100644 --- a/vp9/encoder/x86/vp9_quantize_sse2.c +++ b/vp9/encoder/x86/vp9_quantize_sse2.c @@ -89,6 +89,7 @@ void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr)); if (nzflag) { + __m128i eob0; qcoeff0 = _mm_adds_epi16(qcoeff0, round); qcoeff1 = _mm_adds_epi16(qcoeff1, round); qcoeff0 = _mm_mulhi_epi16(qcoeff0, quant); @@ -101,11 +102,14 @@ void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, store_tran_low(qcoeff0, qcoeff_ptr + index); store_tran_low(qcoeff1, qcoeff_ptr + index + 8); - coeff0 = _mm_mullo_epi16(qcoeff0, dequant); - coeff1 = _mm_mullo_epi16(qcoeff1, dequant); + qcoeff0 = _mm_mullo_epi16(qcoeff0, dequant); + qcoeff1 = _mm_mullo_epi16(qcoeff1, dequant); - store_tran_low(coeff0, dqcoeff_ptr + index); - store_tran_low(coeff1, dqcoeff_ptr + index + 8); + store_tran_low(qcoeff0, dqcoeff_ptr + index); + store_tran_low(qcoeff1, dqcoeff_ptr + index + 8); + + eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero); + eob = _mm_max_epi16(eob, eob0); } else { store_zero_tran_low(qcoeff_ptr + index); store_zero_tran_low(qcoeff_ptr + index + 8); @@ -114,10 +118,6 @@ void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, store_zero_tran_low(dqcoeff_ptr + index + 8); } - if (nzflag) { - const __m128i eob0 = scan_for_eob(&coeff0, &coeff1, iscan, index, zero); - eob = _mm_max_epi16(eob, eob0); - } index += 16; } From 5245f6e9cb7e6bb68ab45fe4d8b00bc9b16857e1 Mon Sep 17 00:00:00 2001 From: Marco Paniconi Date: Mon, 17 Oct 2022 22:36:25 -0700 Subject: [PATCH 0534/1000] Fix to VP8 external RC for buffer levels On a dynamic change of temporal layers: starting/maimum/optimal were being set twice, causing incorrect large values. Bug: b/253927937 Change-Id: I204e885cff92530336a9ed9a4363c486c5bf80ae --- vp8/vp8_ratectrl_rtc.cc | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/vp8/vp8_ratectrl_rtc.cc b/vp8/vp8_ratectrl_rtc.cc index 46621546dd..f3f42529db 100644 --- a/vp8/vp8_ratectrl_rtc.cc +++ b/vp8/vp8_ratectrl_rtc.cc @@ -163,20 +163,24 @@ void VP8RateControlRTC::UpdateRateControl( cm->MBs = cm->mb_rows * cm->mb_cols; cm->mode_info_stride = cm->mb_cols + 1; - oxcf->starting_buffer_level = - rescale((int)oxcf->starting_buffer_level, oxcf->target_bandwidth, 1000); - /* Set or reset optimal and maximum buffer levels. */ - if (oxcf->optimal_buffer_level == 0) { - oxcf->optimal_buffer_level = oxcf->target_bandwidth / 8; - } else { - oxcf->optimal_buffer_level = - rescale((int)oxcf->optimal_buffer_level, oxcf->target_bandwidth, 1000); - } - if (oxcf->maximum_buffer_size == 0) { - oxcf->maximum_buffer_size = oxcf->target_bandwidth / 8; - } else { - oxcf->maximum_buffer_size = - rescale((int)oxcf->maximum_buffer_size, oxcf->target_bandwidth, 1000); + // For temporal layers: starting/maximum/optimal_buffer_level is already set + // via vp8_init_temporal_layer_context() or vp8_reset_temporal_layer_change(). + if (oxcf->number_of_layers <= 1 && prev_number_of_layers <= 1) { + oxcf->starting_buffer_level = + rescale((int)oxcf->starting_buffer_level, oxcf->target_bandwidth, 1000); + /* Set or reset optimal and maximum buffer levels. */ + if (oxcf->optimal_buffer_level == 0) { + oxcf->optimal_buffer_level = oxcf->target_bandwidth / 8; + } else { + oxcf->optimal_buffer_level = rescale((int)oxcf->optimal_buffer_level, + oxcf->target_bandwidth, 1000); + } + if (oxcf->maximum_buffer_size == 0) { + oxcf->maximum_buffer_size = oxcf->target_bandwidth / 8; + } else { + oxcf->maximum_buffer_size = + rescale((int)oxcf->maximum_buffer_size, oxcf->target_bandwidth, 1000); + } } if (cpi_->bits_off_target > oxcf->maximum_buffer_size) { From 228d8a4fed449cbfea678be4f0baf67113b82549 Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 24 Oct 2022 15:24:51 -0700 Subject: [PATCH 0535/1000] highbd_sad_avx2: normalize function param names (src|ref)8_ptr -> (src|ref)_ptr. aligns the names with the rtcd header; clears some clang-tidy warnings Change-Id: Id1aa29da8c0fa5860b46ac902f5b2620c0d3ff54 --- vpx_dsp/x86/highbd_sad_avx2.c | 36 +++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/vpx_dsp/x86/highbd_sad_avx2.c b/vpx_dsp/x86/highbd_sad_avx2.c index 7533ccfddb..231b67f809 100644 --- a/vpx_dsp/x86/highbd_sad_avx2.c +++ b/vpx_dsp/x86/highbd_sad_avx2.c @@ -52,10 +52,10 @@ static VPX_FORCE_INLINE void highbd_sad64xH(__m256i *sums_16, #define HIGHBD_SAD64XN(n) \ unsigned int vpx_highbd_sad64x##n##_avx2( \ - const uint8_t *src8_ptr, int src_stride, const uint8_t *ref8_ptr, \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ int ref_stride) { \ - const uint16_t *src = CONVERT_TO_SHORTPTR(src8_ptr); \ - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8_ptr); \ + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ __m256i sums_32 = _mm256_setzero_si256(); \ int i; \ \ @@ -109,10 +109,10 @@ static VPX_FORCE_INLINE void highbd_sad32xH(__m256i *sums_16, #define HIGHBD_SAD32XN(n) \ unsigned int vpx_highbd_sad32x##n##_avx2( \ - const uint8_t *src8_ptr, int src_stride, const uint8_t *ref8_ptr, \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ int ref_stride) { \ - const uint16_t *src = CONVERT_TO_SHORTPTR(src8_ptr); \ - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8_ptr); \ + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ __m256i sums_32 = _mm256_setzero_si256(); \ int i; \ \ @@ -167,10 +167,10 @@ static VPX_FORCE_INLINE void highbd_sad16xH(__m256i *sums_16, } } -unsigned int vpx_highbd_sad16x32_avx2(const uint8_t *src8_ptr, int src_stride, - const uint8_t *ref8_ptr, int ref_stride) { - const uint16_t *src = CONVERT_TO_SHORTPTR(src8_ptr); - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8_ptr); +unsigned int vpx_highbd_sad16x32_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); __m256i sums_32 = _mm256_setzero_si256(); int i; @@ -192,10 +192,10 @@ unsigned int vpx_highbd_sad16x32_avx2(const uint8_t *src8_ptr, int src_stride, return calc_final(sums_32); } -unsigned int vpx_highbd_sad16x16_avx2(const uint8_t *src8_ptr, int src_stride, - const uint8_t *ref8_ptr, int ref_stride) { - const uint16_t *src = CONVERT_TO_SHORTPTR(src8_ptr); - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8_ptr); +unsigned int vpx_highbd_sad16x16_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); __m256i sums_16 = _mm256_setzero_si256(); highbd_sad16xH(&sums_16, src, src_stride, ref, ref_stride, 16); @@ -208,10 +208,10 @@ unsigned int vpx_highbd_sad16x16_avx2(const uint8_t *src8_ptr, int src_stride, } } -unsigned int vpx_highbd_sad16x8_avx2(const uint8_t *src8_ptr, int src_stride, - const uint8_t *ref8_ptr, int ref_stride) { - const uint16_t *src = CONVERT_TO_SHORTPTR(src8_ptr); - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8_ptr); +unsigned int vpx_highbd_sad16x8_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); __m256i sums_16 = _mm256_setzero_si256(); highbd_sad16xH(&sums_16, src, src_stride, ref, ref_stride, 8); From d667193e6a36629746faa1e29e8fb17573eea893 Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 24 Oct 2022 15:28:47 -0700 Subject: [PATCH 0536/1000] vp9_highbd_quantize_fp*_neon: normalize fn param name count -> n_coeffs. aligns the name with the rtcd header; clears a clang-tidy warning Change-Id: I36545ff479df92b117c95e494f16002e6990f433 --- vp9/encoder/arm/neon/vp9_quantize_neon.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/vp9/encoder/arm/neon/vp9_quantize_neon.c b/vp9/encoder/arm/neon/vp9_quantize_neon.c index b9bd1eba31..c2b55fcbaa 100644 --- a/vp9/encoder/arm/neon/vp9_quantize_neon.c +++ b/vp9/encoder/arm/neon/vp9_quantize_neon.c @@ -257,7 +257,7 @@ highbd_quantize_fp_4(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, return vmovn_u32(vceqq_s32(v_abs_qcoeff, vdupq_n_s32(0))); } -void vp9_highbd_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t count, +void vp9_highbd_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, @@ -294,7 +294,7 @@ void vp9_highbd_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t count, v_eobmax = get_max_lane_eob(iscan, v_eobmax, vcombine_u16(v_mask_lo, v_mask_hi)); - count -= 8; + n_coeffs -= 8; do { coeff_ptr += 8; qcoeff_ptr += 8; @@ -308,8 +308,8 @@ void vp9_highbd_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t count, // Find the max lane eob for 8 coeffs. v_eobmax = get_max_lane_eob(iscan, v_eobmax, vcombine_u16(v_mask_lo, v_mask_hi)); - count -= 8; - } while (count); + n_coeffs -= 8; + } while (n_coeffs); *eob_ptr = get_max_eob(v_eobmax); } @@ -349,7 +349,7 @@ highbd_quantize_fp_32x32_4(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, } void vp9_highbd_quantize_fp_32x32_neon( - const tran_low_t *coeff_ptr, intptr_t count, const int16_t *round_ptr, + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { @@ -385,7 +385,7 @@ void vp9_highbd_quantize_fp_32x32_neon( v_eobmax = get_max_lane_eob(iscan, v_eobmax, vcombine_u16(v_mask_lo, v_mask_hi)); - count -= 8; + n_coeffs -= 8; do { coeff_ptr += 8; qcoeff_ptr += 8; @@ -400,8 +400,8 @@ void vp9_highbd_quantize_fp_32x32_neon( // Find the max lane eob for 8 coeffs. v_eobmax = get_max_lane_eob(iscan, v_eobmax, vcombine_u16(v_mask_lo, v_mask_hi)); - count -= 8; - } while (count); + n_coeffs -= 8; + } while (n_coeffs); *eob_ptr = get_max_eob(v_eobmax); } From ee12bc390dca10e53e7dbb16589fd183fc3d7792 Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 24 Oct 2022 15:37:26 -0700 Subject: [PATCH 0537/1000] SAD*Test: mark virtual Run() as overridden this comes from AbstractBench Change-Id: Ie0b5a26a68bfbffd80f132125d15a1bdfc990c22 --- test/sad_test.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/sad_test.cc b/test/sad_test.cc index 29e3f57f5e..0896c77f12 100644 --- a/test/sad_test.cc +++ b/test/sad_test.cc @@ -311,7 +311,7 @@ class SADTest : public AbstractBench, public SADTestBase { ASSERT_EQ(reference_sad, exp_sad); } - void Run() { + void Run() override { params_.func(source_data_, source_stride_, reference_data_, reference_stride_); } @@ -339,7 +339,7 @@ class SADavgTest : public AbstractBench, public SADTestBase { ASSERT_EQ(reference_sad, exp_sad); } - void Run() { + void Run() override { params_.func(source_data_, source_stride_, reference_data_, reference_stride_, second_pred_); } From 4b659f3c345b81db67bddcfd831e33abc40c021e Mon Sep 17 00:00:00 2001 From: Johann Date: Wed, 26 Oct 2022 17:14:21 +0900 Subject: [PATCH 0538/1000] mailmap: add johann@duck.com Change-Id: I3b48951e69ba1f4a9fafdbb81fac48f79587a342 --- .mailmap | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.mailmap b/.mailmap index 376ca83ae3..3593de4b9c 100644 --- a/.mailmap +++ b/.mailmap @@ -21,8 +21,8 @@ Jacky Chen Jim Bankoski Johann Koenig Johann Koenig -Johann Koenig Johann Koenig +Johann John Koleszar Joshua Litt Marco Paniconi From 9e1bdd12c79008a1fa8f1b9ba63194bdcd020fd7 Mon Sep 17 00:00:00 2001 From: Johann Date: Thu, 27 Oct 2022 08:49:37 +0900 Subject: [PATCH 0539/1000] rtcd: allow disabling neon on armv8 Change-Id: Idef943775456eb95b46be5c92c114c1d215f38d7 --- build/make/rtcd.pl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/build/make/rtcd.pl b/build/make/rtcd.pl index 9c97268426..f4edeaad51 100755 --- a/build/make/rtcd.pl +++ b/build/make/rtcd.pl @@ -488,7 +488,8 @@ () arm; } elsif ($opts{arch} eq 'armv8' || $opts{arch} eq 'arm64' ) { @ALL_ARCHS = filter(qw/neon/); - &require("neon"); + @REQUIRES = filter(qw/neon/); + &require(@REQUIRES); arm; } elsif ($opts{arch} =~ /^ppc/ ) { @ALL_ARCHS = filter(qw/vsx/); From ebf22e2e8db9e863a2cbc24081381e24fe306882 Mon Sep 17 00:00:00 2001 From: Johann Date: Thu, 27 Oct 2022 11:40:19 +0900 Subject: [PATCH 0540/1000] MacOS 13 is darwin22 Bug: webm:1783 Change-Id: I97d94ab8c8aebe13aedb58e280dc37474814ad5d --- build/make/configure.sh | 4 ++-- configure | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/build/make/configure.sh b/build/make/configure.sh index 581042e38e..e9b7fa9c1c 100644 --- a/build/make/configure.sh +++ b/build/make/configure.sh @@ -791,7 +791,7 @@ process_common_toolchain() { tgt_isa=x86_64 tgt_os=`echo $gcctarget | sed 's/.*\(darwin1[0-9]\).*/\1/'` ;; - *darwin2[0-1]*) + *darwin2[0-2]*) tgt_isa=`uname -m` tgt_os=`echo $gcctarget | sed 's/.*\(darwin2[0-9]\).*/\1/'` ;; @@ -940,7 +940,7 @@ process_common_toolchain() { add_cflags "-mmacosx-version-min=10.15" add_ldflags "-mmacosx-version-min=10.15" ;; - *-darwin2[0-1]-*) + *-darwin2[0-2]-*) add_cflags "-arch ${toolchain%%-*}" add_ldflags "-arch ${toolchain%%-*}" ;; diff --git a/configure b/configure index 1b850b5e04..bf92e1ad1f 100755 --- a/configure +++ b/configure @@ -101,6 +101,7 @@ all_platforms="${all_platforms} arm64-android-gcc" all_platforms="${all_platforms} arm64-darwin-gcc" all_platforms="${all_platforms} arm64-darwin20-gcc" all_platforms="${all_platforms} arm64-darwin21-gcc" +all_platforms="${all_platforms} arm64-darwin22-gcc" all_platforms="${all_platforms} arm64-linux-gcc" all_platforms="${all_platforms} arm64-win64-gcc" all_platforms="${all_platforms} arm64-win64-vs15" @@ -157,6 +158,7 @@ all_platforms="${all_platforms} x86_64-darwin18-gcc" all_platforms="${all_platforms} x86_64-darwin19-gcc" all_platforms="${all_platforms} x86_64-darwin20-gcc" all_platforms="${all_platforms} x86_64-darwin21-gcc" +all_platforms="${all_platforms} x86_64-darwin22-gcc" all_platforms="${all_platforms} x86_64-iphonesimulator-gcc" all_platforms="${all_platforms} x86_64-linux-gcc" all_platforms="${all_platforms} x86_64-linux-icc" From 3121783fec60d0ce4551d472d1acbd1f1a8253be Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Wed, 26 Oct 2022 21:37:31 +0000 Subject: [PATCH 0541/1000] [NEON] Optimize and homogenize Butterfly DCT functions Provide a set of commonly used Butterfly DCT functions for use in DCT 4x4, 8x8, 16x16, 32x32 functions. These are provided in various forms, using vqrdmulh_s16/vqrdmulh_s32 for _fast variants, which unfortunately are only usable in pass1 of most DCTs, as they do not provide the necessary precision in pass2. This gave a performance gain ranging from 5% to 15% in 16x16 case. Also, for 32x32, the loads were rearranged, along with the butterfly optimizations, this gave 10% gain in 32x32_rd function. This refactoring was necessary to allow easier porting of highbd 32x32 functions -follows this patchset. Change-Id: I6282e640b95a95938faff76c3b2bace3dc298bc3 --- vp9/encoder/arm/neon/vp9_dct_neon.c | 17 +- vpx_dsp/arm/fdct16x16_neon.c | 18 +- vpx_dsp/arm/fdct16x16_neon.h | 297 +++---- vpx_dsp/arm/fdct32x32_neon.c | 1158 +-------------------------- vpx_dsp/arm/fdct32x32_neon.h | 1105 +++++++++++++++++++++++++ vpx_dsp/arm/fdct4x4_neon.c | 13 +- vpx_dsp/arm/fdct4x4_neon.h | 105 +++ vpx_dsp/arm/fdct8x8_neon.c | 47 +- vpx_dsp/arm/fdct8x8_neon.h | 381 +++++++++ vpx_dsp/arm/fdct_neon.h | 757 +++++++---------- vpx_dsp/arm/transpose_neon.h | 45 ++ 11 files changed, 2112 insertions(+), 1831 deletions(-) create mode 100644 vpx_dsp/arm/fdct32x32_neon.h create mode 100644 vpx_dsp/arm/fdct4x4_neon.h create mode 100644 vpx_dsp/arm/fdct8x8_neon.h diff --git a/vp9/encoder/arm/neon/vp9_dct_neon.c b/vp9/encoder/arm/neon/vp9_dct_neon.c index a07a1608d7..b8286a8dd5 100644 --- a/vp9/encoder/arm/neon/vp9_dct_neon.c +++ b/vp9/encoder/arm/neon/vp9_dct_neon.c @@ -18,6 +18,8 @@ #include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/transpose_neon.h" #include "vpx_dsp/arm/fdct_neon.h" +#include "vpx_dsp/arm/fdct4x4_neon.h" +#include "vpx_dsp/arm/fdct8x8_neon.h" static INLINE void load_buffer_4x4(const int16_t *input, int16x8_t *in, int stride) { @@ -130,12 +132,14 @@ void vp9_fht4x4_neon(const int16_t *input, tran_low_t *output, int stride, case ADST_DCT: load_buffer_4x4(input, in, stride); fadst4x4_neon(in); - vpx_fdct4x4_pass1_neon((int16x4_t *)in); + // pass1 variant is not accurate enough + vpx_fdct4x4_pass2_neon((int16x4_t *)in); write_buffer_4x4(output, in); break; case DCT_ADST: load_buffer_4x4(input, in, stride); - vpx_fdct4x4_pass1_neon((int16x4_t *)in); + // pass1 variant is not accurate enough + vpx_fdct4x4_pass2_neon((int16x4_t *)in); fadst4x4_neon(in); write_buffer_4x4(output, in); break; @@ -488,13 +492,15 @@ void vp9_fht8x8_neon(const int16_t *input, tran_low_t *output, int stride, case ADST_DCT: load_buffer_8x8(input, in, stride); fadst8x8_neon(in); - vpx_fdct8x8_pass1_neon(in); + // pass1 variant is not accurate enough + vpx_fdct8x8_pass2_neon(in); right_shift_8x8(in, 1); write_buffer_8x8(output, in, 8); break; case DCT_ADST: load_buffer_8x8(input, in, stride); - vpx_fdct8x8_pass1_neon(in); + // pass1 variant is not accurate enough + vpx_fdct8x8_pass2_neon(in); fadst8x8_neon(in); right_shift_8x8(in, 1); write_buffer_8x8(output, in, 8); @@ -559,7 +565,8 @@ static void fdct16_8col(int16x8_t *in) { i[6] = vaddq_s16(in[6], in[9]); i[7] = vaddq_s16(in[7], in[8]); - vpx_fdct8x8_pass1_neon(i); + // pass1 variant is not accurate enough + vpx_fdct8x8_pass2_neon(i); transpose_s16_8x8(&i[0], &i[1], &i[2], &i[3], &i[4], &i[5], &i[6], &i[7]); // step 2 diff --git a/vpx_dsp/arm/fdct16x16_neon.c b/vpx_dsp/arm/fdct16x16_neon.c index d0c07d429a..a458ecaa41 100644 --- a/vpx_dsp/arm/fdct16x16_neon.c +++ b/vpx_dsp/arm/fdct16x16_neon.c @@ -37,20 +37,21 @@ void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) { // Left half. load_cross(input, stride, temp0); scale_input(temp0, temp1); - vpx_fdct16x16_body(temp1, temp0); + vpx_fdct8x16_body(temp1, temp0); // Right half. load_cross(input + 8, stride, temp1); scale_input(temp1, temp2); - vpx_fdct16x16_body(temp2, temp1); + vpx_fdct8x16_body(temp2, temp1); // Transpose top left and top right quarters into one contiguous location to // process to the top half. + transpose_s16_8x8_new(&temp0[0], &temp2[0]); transpose_s16_8x8_new(&temp1[0], &temp2[8]); partial_round_shift(temp2); cross_input(temp2, temp3); - vpx_fdct16x16_body(temp3, temp2); + vpx_fdct8x16_body(temp3, temp2); transpose_s16_8x8(&temp2[0], &temp2[1], &temp2[2], &temp2[3], &temp2[4], &temp2[5], &temp2[6], &temp2[7]); transpose_s16_8x8(&temp2[8], &temp2[9], &temp2[10], &temp2[11], &temp2[12], @@ -62,11 +63,12 @@ void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) { // Transpose bottom left and bottom right quarters into one contiguous // location to process to the bottom half. transpose_s16_8x8_new(&temp0[8], &temp1[0]); + transpose_s16_8x8(&temp1[8], &temp1[9], &temp1[10], &temp1[11], &temp1[12], &temp1[13], &temp1[14], &temp1[15]); partial_round_shift(temp1); cross_input(temp1, temp0); - vpx_fdct16x16_body(temp0, temp1); + vpx_fdct8x16_body(temp0, temp1); transpose_s16_8x8(&temp1[0], &temp1[1], &temp1[2], &temp1[3], &temp1[4], &temp1[5], &temp1[6], &temp1[7]); transpose_s16_8x8(&temp1[8], &temp1[9], &temp1[10], &temp1[11], &temp1[12], @@ -86,12 +88,12 @@ void vpx_highbd_fdct16x16_neon(const int16_t *input, tran_low_t *output, // Left half. load_cross(input, stride, temp0); highbd_scale_input(temp0, left1, right1); - vpx_highbd_fdct16x16_body(left1, right1); + vpx_highbd_fdct8x16_body(left1, right1); // right half. load_cross(input + 8, stride, temp0); highbd_scale_input(temp0, left2, right2); - vpx_highbd_fdct16x16_body(left2, right2); + vpx_highbd_fdct8x16_body(left2, right2); // Transpose top left and top right quarters into one contiguous location to // process to the top half. @@ -103,14 +105,14 @@ void vpx_highbd_fdct16x16_neon(const int16_t *input, tran_low_t *output, highbd_partial_round_shift(left3, right3); highbd_cross_input(left3, right3, left1, right1); - vpx_highbd_fdct16x16_body(left1, right1); + vpx_highbd_fdct8x16_body(left1, right1); // Transpose bottom left and bottom right quarters into one contiguous // location to process to the bottom half. highbd_partial_round_shift(left4, right4); highbd_cross_input(left4, right4, left2, right2); - vpx_highbd_fdct16x16_body(left2, right2); + vpx_highbd_fdct8x16_body(left2, right2); transpose_s32_8x8_2(left1, right1, left3, right3); transpose_s32_8x8_2(left2, right2, left3 + 8, right3 + 8); diff --git a/vpx_dsp/arm/fdct16x16_neon.h b/vpx_dsp/arm/fdct16x16_neon.h index d99870903b..43d820b6bd 100644 --- a/vpx_dsp/arm/fdct16x16_neon.h +++ b/vpx_dsp/arm/fdct16x16_neon.h @@ -160,8 +160,8 @@ static INLINE void partial_round_shift(int16x8_t *a /*[16]*/) { } // Main body of fdct16x16. -static void vpx_fdct16x16_body(const int16x8_t *in /*[16]*/, - int16x8_t *out /*[16]*/) { +static void vpx_fdct8x16_body(const int16x8_t *in /*[16]*/, + int16x8_t *out /*[16]*/) { int16x8_t s[8]; int16x8_t x[4]; int16x8_t step[8]; @@ -186,16 +186,17 @@ static void vpx_fdct16x16_body(const int16x8_t *in /*[16]*/, // out[0] = fdct_round_shift((x0 + x1) * cospi_16_64) // out[8] = fdct_round_shift((x0 - x1) * cospi_16_64) - butterfly_one_coeff(x[0], x[1], cospi_16_64, &out[0], &out[8]); - // out[4] = fdct_round_shift(x3 * cospi_8_64 + x2 * cospi_24_64); + butterfly_one_coeff_s16_s32_fast_narrow(x[0], x[1], cospi_16_64, &out[0], + &out[8]); + // out[4] = fdct_round_shift(x3 * cospi_8_64 + x2 * cospi_24_64); // out[12] = fdct_round_shift(x3 * cospi_24_64 - x2 * cospi_8_64); - butterfly_two_coeff(x[3], x[2], cospi_24_64, cospi_8_64, &out[4], &out[12]); + butterfly_two_coeff(x[3], x[2], cospi_8_64, cospi_24_64, &out[4], &out[12]); // Stage 2 // Re-using source s5/s6 // s5 = fdct_round_shift((s6 - s5) * cospi_16_64) // s6 = fdct_round_shift((s6 + s5) * cospi_16_64) - butterfly_one_coeff(s[6], s[5], cospi_16_64, &s[6], &s[5]); + butterfly_one_coeff_s16_fast(s[6], s[5], cospi_16_64, &s[6], &s[5]); // Stage 3 x[0] = vaddq_s16(s[4], s[5]); @@ -204,12 +205,12 @@ static void vpx_fdct16x16_body(const int16x8_t *in /*[16]*/, x[3] = vaddq_s16(s[7], s[6]); // Stage 4 - // out[2] = fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64) - // out[14] = fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64) - butterfly_two_coeff(x[3], x[0], cospi_28_64, cospi_4_64, &out[2], &out[14]); - // out[6] = fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64) - // out[10] = fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64) - butterfly_two_coeff(x[2], x[1], cospi_12_64, cospi_20_64, &out[10], &out[6]); + // out[2] = fdct_round_shift(x3 * cospi_4_64 + x0 * cospi_28_64) + // out[14] = fdct_round_shift(x3 * cospi_28_64 - x0 * cospi_4_64) + butterfly_two_coeff(x[3], x[0], cospi_4_64, cospi_28_64, &out[2], &out[14]); + // out[6] = fdct_round_shift(x2 * cospi_20_64 + x1 * cospi_12_64) + // out[10] = fdct_round_shift(x2 * cospi_12_64 - x1 * cospi_20_64) + butterfly_two_coeff(x[2], x[1], cospi_20_64, cospi_12_64, &out[10], &out[6]); // step 2 // From fwd_txfm.c: Work on the next eight values; step1 -> odd_results" @@ -221,8 +222,8 @@ static void vpx_fdct16x16_body(const int16x8_t *in /*[16]*/, // step2[3] = fdct_round_shift((step1[4] - step1[3]) * cospi_16_64) // step2[4] = fdct_round_shift((step1[4] + step1[3]) * cospi_16_64) // step2[5] = fdct_round_shift((step1[5] + step1[2]) * cospi_16_64) - butterfly_one_coeff(in[13], in[10], cospi_16_64, &s[5], &s[2]); - butterfly_one_coeff(in[12], in[11], cospi_16_64, &s[4], &s[3]); + butterfly_one_coeff_s16_fast(in[13], in[10], cospi_16_64, &s[5], &s[2]); + butterfly_one_coeff_s16_fast(in[12], in[11], cospi_16_64, &s[4], &s[3]); // step 3 s[0] = vaddq_s16(in[8], s[3]); @@ -235,13 +236,15 @@ static void vpx_fdct16x16_body(const int16x8_t *in /*[16]*/, s[7] = vaddq_s16(in[15], s[4]); // step 4 - // step2[1] = fdct_round_shift(step3[1] *-cospi_8_64 + step3[6] * cospi_24_64) - // step2[6] = fdct_round_shift(step3[1] * cospi_24_64 + step3[6] * cospi_8_64) - butterfly_two_coeff(s[6], s[1], cospi_24_64, cospi_8_64, &s[6], &s[1]); + // step2[6] = fdct_round_shift(step3[6] * cospi_8_64 + step3[1] * + // cospi_24_64) step2[1] = fdct_round_shift(step3[6] * cospi_24_64 - step3[1] + // * cospi_8_64) + butterfly_two_coeff(s[6], s[1], cospi_8_64, cospi_24_64, &s[6], &s[1]); // step2[2] = fdct_round_shift(step3[2] * cospi_24_64 + step3[5] * cospi_8_64) - // step2[5] = fdct_round_shift(step3[2] * cospi_8_64 - step3[5] * cospi_24_64) - butterfly_two_coeff(x[0], x[3], cospi_8_64, cospi_24_64, &s[2], &s[5]); + // step2[5] = fdct_round_shift(step3[2] * cospi_8_64 - step3[5] * + // cospi_24_64) + butterfly_two_coeff(x[0], x[3], cospi_24_64, cospi_8_64, &s[2], &s[5]); // step 5 step[0] = vaddq_s16(s[0], s[1]); @@ -254,22 +257,23 @@ static void vpx_fdct16x16_body(const int16x8_t *in /*[16]*/, step[7] = vaddq_s16(s[7], s[6]); // step 6 - // out[1] = fdct_round_shift(step1[0] * cospi_30_64 + step1[7] * cospi_2_64) - // out[9] = fdct_round_shift(step1[1] * cospi_14_64 + step1[6] * cospi_18_64) - // out[5] = fdct_round_shift(step1[2] * cospi_22_64 + step1[5] * cospi_10_64) - // out[13] = fdct_round_shift(step1[3] * cospi_6_64 + step1[4] * cospi_26_64) - // out[3] = fdct_round_shift(step1[3] * -cospi_26_64 + step1[4] * cospi_6_64) - // out[11] = fdct_round_shift(step1[2] * -cospi_10_64 + step1[5] * - // cospi_22_64) - // out[7] = fdct_round_shift(step1[1] * -cospi_18_64 + step1[6] * cospi_14_64) - // out[15] = fdct_round_shift(step1[0] * -cospi_2_64 + step1[7] * cospi_30_64) - butterfly_two_coeff(step[6], step[1], cospi_14_64, cospi_18_64, &out[9], + // out[9] = fdct_round_shift(step1[6] * cospi_18_64 + step1[1] * cospi_14_64) + // out[7] = fdct_round_shift(step1[6] * cospi_14_64 - step1[1] * cospi_18_64) + butterfly_two_coeff(step[6], step[1], cospi_18_64, cospi_14_64, &out[9], &out[7]); - butterfly_two_coeff(step[7], step[0], cospi_30_64, cospi_2_64, &out[1], + // out[1] = fdct_round_shift(step1[7] * cospi_2_64 + step1[0] * cospi_30_64) + // out[15] = fdct_round_shift(step1[7] * cospi_30_64 - step1[0] * cospi_2_64) + butterfly_two_coeff(step[7], step[0], cospi_2_64, cospi_30_64, &out[1], &out[15]); - butterfly_two_coeff(step[4], step[3], cospi_6_64, cospi_26_64, &out[13], + + // out[13] = fdct_round_shift(step1[4] * cospi_26_64 + step1[3] * cospi_6_64) + // out[3] = fdct_round_shift(step1[4] * cospi_6_64 - step1[3] * cospi_26_64) + butterfly_two_coeff(step[4], step[3], cospi_26_64, cospi_6_64, &out[13], &out[3]); - butterfly_two_coeff(step[5], step[2], cospi_22_64, cospi_10_64, &out[5], + + // out[5] = fdct_round_shift(step1[5] * cospi_10_64 + step1[2] * cospi_22_64) + // out[11] = fdct_round_shift(step1[5] * cospi_22_64 - step1[2] * cospi_10_64) + butterfly_two_coeff(step[5], step[2], cospi_10_64, cospi_22_64, &out[5], &out[11]); } @@ -279,36 +283,37 @@ static INLINE void highbd_scale_input(const int16x8_t *a /*[16]*/, int32x4_t *left /*[16]*/, int32x4_t *right /* [16] */) { left[0] = vshll_n_s16(vget_low_s16(a[0]), 2); - right[0] = vshll_n_s16(vget_high_s16(a[0]), 2); left[1] = vshll_n_s16(vget_low_s16(a[1]), 2); - right[1] = vshll_n_s16(vget_high_s16(a[1]), 2); left[2] = vshll_n_s16(vget_low_s16(a[2]), 2); - right[2] = vshll_n_s16(vget_high_s16(a[2]), 2); left[3] = vshll_n_s16(vget_low_s16(a[3]), 2); - right[3] = vshll_n_s16(vget_high_s16(a[3]), 2); left[4] = vshll_n_s16(vget_low_s16(a[4]), 2); - right[4] = vshll_n_s16(vget_high_s16(a[4]), 2); left[5] = vshll_n_s16(vget_low_s16(a[5]), 2); - right[5] = vshll_n_s16(vget_high_s16(a[5]), 2); left[6] = vshll_n_s16(vget_low_s16(a[6]), 2); - right[6] = vshll_n_s16(vget_high_s16(a[6]), 2); left[7] = vshll_n_s16(vget_low_s16(a[7]), 2); - right[7] = vshll_n_s16(vget_high_s16(a[7]), 2); left[8] = vshll_n_s16(vget_low_s16(a[8]), 2); - right[8] = vshll_n_s16(vget_high_s16(a[8]), 2); left[9] = vshll_n_s16(vget_low_s16(a[9]), 2); - right[9] = vshll_n_s16(vget_high_s16(a[9]), 2); left[10] = vshll_n_s16(vget_low_s16(a[10]), 2); - right[10] = vshll_n_s16(vget_high_s16(a[10]), 2); left[11] = vshll_n_s16(vget_low_s16(a[11]), 2); - right[11] = vshll_n_s16(vget_high_s16(a[11]), 2); left[12] = vshll_n_s16(vget_low_s16(a[12]), 2); - right[12] = vshll_n_s16(vget_high_s16(a[12]), 2); left[13] = vshll_n_s16(vget_low_s16(a[13]), 2); - right[13] = vshll_n_s16(vget_high_s16(a[13]), 2); left[14] = vshll_n_s16(vget_low_s16(a[14]), 2); - right[14] = vshll_n_s16(vget_high_s16(a[14]), 2); left[15] = vshll_n_s16(vget_low_s16(a[15]), 2); + + right[0] = vshll_n_s16(vget_high_s16(a[0]), 2); + right[1] = vshll_n_s16(vget_high_s16(a[1]), 2); + right[2] = vshll_n_s16(vget_high_s16(a[2]), 2); + right[3] = vshll_n_s16(vget_high_s16(a[3]), 2); + right[4] = vshll_n_s16(vget_high_s16(a[4]), 2); + right[5] = vshll_n_s16(vget_high_s16(a[5]), 2); + right[6] = vshll_n_s16(vget_high_s16(a[6]), 2); + right[7] = vshll_n_s16(vget_high_s16(a[7]), 2); + right[8] = vshll_n_s16(vget_high_s16(a[8]), 2); + right[9] = vshll_n_s16(vget_high_s16(a[9]), 2); + right[10] = vshll_n_s16(vget_high_s16(a[10]), 2); + right[11] = vshll_n_s16(vget_high_s16(a[11]), 2); + right[12] = vshll_n_s16(vget_high_s16(a[12]), 2); + right[13] = vshll_n_s16(vget_high_s16(a[13]), 2); + right[14] = vshll_n_s16(vget_high_s16(a[14]), 2); right[15] = vshll_n_s16(vget_high_s16(a[15]), 2); } @@ -357,81 +362,38 @@ static INLINE void highbd_partial_round_shift(int32x4_t *left /*[16]*/, int32x4_t *right /* [16] */) { const int32x4_t one = vdupq_n_s32(1); left[0] = vshrq_n_s32(vaddq_s32(left[0], one), 2); - right[0] = vshrq_n_s32(vaddq_s32(right[0], one), 2); left[1] = vshrq_n_s32(vaddq_s32(left[1], one), 2); - right[1] = vshrq_n_s32(vaddq_s32(right[1], one), 2); left[2] = vshrq_n_s32(vaddq_s32(left[2], one), 2); - right[2] = vshrq_n_s32(vaddq_s32(right[2], one), 2); left[3] = vshrq_n_s32(vaddq_s32(left[3], one), 2); - right[3] = vshrq_n_s32(vaddq_s32(right[3], one), 2); left[4] = vshrq_n_s32(vaddq_s32(left[4], one), 2); - right[4] = vshrq_n_s32(vaddq_s32(right[4], one), 2); left[5] = vshrq_n_s32(vaddq_s32(left[5], one), 2); - right[5] = vshrq_n_s32(vaddq_s32(right[5], one), 2); left[6] = vshrq_n_s32(vaddq_s32(left[6], one), 2); - right[6] = vshrq_n_s32(vaddq_s32(right[6], one), 2); left[7] = vshrq_n_s32(vaddq_s32(left[7], one), 2); - right[7] = vshrq_n_s32(vaddq_s32(right[7], one), 2); left[8] = vshrq_n_s32(vaddq_s32(left[8], one), 2); - right[8] = vshrq_n_s32(vaddq_s32(right[8], one), 2); left[9] = vshrq_n_s32(vaddq_s32(left[9], one), 2); - right[9] = vshrq_n_s32(vaddq_s32(right[9], one), 2); left[10] = vshrq_n_s32(vaddq_s32(left[10], one), 2); - right[10] = vshrq_n_s32(vaddq_s32(right[10], one), 2); left[11] = vshrq_n_s32(vaddq_s32(left[11], one), 2); - right[11] = vshrq_n_s32(vaddq_s32(right[11], one), 2); left[12] = vshrq_n_s32(vaddq_s32(left[12], one), 2); - right[12] = vshrq_n_s32(vaddq_s32(right[12], one), 2); left[13] = vshrq_n_s32(vaddq_s32(left[13], one), 2); - right[13] = vshrq_n_s32(vaddq_s32(right[13], one), 2); left[14] = vshrq_n_s32(vaddq_s32(left[14], one), 2); - right[14] = vshrq_n_s32(vaddq_s32(right[14], one), 2); left[15] = vshrq_n_s32(vaddq_s32(left[15], one), 2); - right[15] = vshrq_n_s32(vaddq_s32(right[15], one), 2); -} -static INLINE void transpose_s32_8x8_2(int32x4_t *left /*[8]*/, - int32x4_t *right /*[8]*/, - int32x4_t *out_left /*[8]*/, - int32x4_t *out_right /*[8]*/) { - int32x4x2_t out[8]; - - out[0].val[0] = left[0]; - out[0].val[1] = right[0]; - out[1].val[0] = left[1]; - out[1].val[1] = right[1]; - out[2].val[0] = left[2]; - out[2].val[1] = right[2]; - out[3].val[0] = left[3]; - out[3].val[1] = right[3]; - out[4].val[0] = left[4]; - out[4].val[1] = right[4]; - out[5].val[0] = left[5]; - out[5].val[1] = right[5]; - out[6].val[0] = left[6]; - out[6].val[1] = right[6]; - out[7].val[0] = left[7]; - out[7].val[1] = right[7]; - - transpose_s32_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5], - &out[6], &out[7]); - - out_left[0] = out[0].val[0]; - out_left[1] = out[1].val[0]; - out_left[2] = out[2].val[0]; - out_left[3] = out[3].val[0]; - out_left[4] = out[4].val[0]; - out_left[5] = out[5].val[0]; - out_left[6] = out[6].val[0]; - out_left[7] = out[7].val[0]; - out_right[0] = out[0].val[1]; - out_right[1] = out[1].val[1]; - out_right[2] = out[2].val[1]; - out_right[3] = out[3].val[1]; - out_right[4] = out[4].val[1]; - out_right[5] = out[5].val[1]; - out_right[6] = out[6].val[1]; - out_right[7] = out[7].val[1]; + right[0] = vshrq_n_s32(vaddq_s32(right[0], one), 2); + right[1] = vshrq_n_s32(vaddq_s32(right[1], one), 2); + right[2] = vshrq_n_s32(vaddq_s32(right[2], one), 2); + right[3] = vshrq_n_s32(vaddq_s32(right[3], one), 2); + right[4] = vshrq_n_s32(vaddq_s32(right[4], one), 2); + right[5] = vshrq_n_s32(vaddq_s32(right[5], one), 2); + right[6] = vshrq_n_s32(vaddq_s32(right[6], one), 2); + right[7] = vshrq_n_s32(vaddq_s32(right[7], one), 2); + right[8] = vshrq_n_s32(vaddq_s32(right[8], one), 2); + right[9] = vshrq_n_s32(vaddq_s32(right[9], one), 2); + right[10] = vshrq_n_s32(vaddq_s32(right[10], one), 2); + right[11] = vshrq_n_s32(vaddq_s32(right[11], one), 2); + right[12] = vshrq_n_s32(vaddq_s32(right[12], one), 2); + right[13] = vshrq_n_s32(vaddq_s32(right[13], one), 2); + right[14] = vshrq_n_s32(vaddq_s32(right[14], one), 2); + right[15] = vshrq_n_s32(vaddq_s32(right[15], one), 2); } // Store 16 32x4 vectors, assuming stride == 16. @@ -469,9 +431,9 @@ static INLINE void store16_s32(tran_low_t *a, const int32x4_t *b /*[32]*/) { vst1q_s32(a, b[15]); } -// Main body of fdct16x16. -static void vpx_highbd_fdct16x16_body(int32x4_t *left /*[16]*/, - int32x4_t *right /* [16] */) { +// Main body of fdct8x16 column +static void vpx_highbd_fdct8x16_body(int32x4_t *left /*[16]*/, + int32x4_t *right /* [16] */) { int32x4_t sl[8]; int32x4_t sr[8]; int32x4_t xl[4]; @@ -531,22 +493,21 @@ static void vpx_highbd_fdct16x16_body(int32x4_t *left /*[16]*/, // out[0] = fdct_round_shift((x0 + x1) * cospi_16_64) // out[8] = fdct_round_shift((x0 - x1) * cospi_16_64) - highbd_butterfly_one_coeff_s32(xl[0], xl[1], cospi_16_64, &left[0], &left[8]); - highbd_butterfly_one_coeff_s32(xr[0], xr[1], cospi_16_64, &right[0], - &right[8]); - // out[4] = fdct_round_shift(x3 * cospi_8_64 + x2 * cospi_24_64); + butterfly_one_coeff_s32_fast(xl[0], xr[0], xl[1], xr[1], cospi_16_64, + &left[0], &right[0], &left[8], &right[8]); + + // out[4] = fdct_round_shift(x3 * cospi_8_64 + x2 * cospi_24_64); // out[12] = fdct_round_shift(x3 * cospi_24_64 - x2 * cospi_8_64); - highbd_butterfly_two_coeff_s32(xl[3], xl[2], cospi_8_64, cospi_24_64, - &left[4], &left[12]); - highbd_butterfly_two_coeff_s32(xr[3], xr[2], cospi_8_64, cospi_24_64, - &right[4], &right[12]); + butterfly_two_coeff_s32_s64_narrow(xl[3], xr[3], xl[2], xr[2], cospi_8_64, + cospi_24_64, &left[4], &right[4], + &left[12], &right[12]); // Stage 2 // Re-using source s5/s6 // s5 = fdct_round_shift((s6 - s5) * cospi_16_64) // s6 = fdct_round_shift((s6 + s5) * cospi_16_64) - highbd_butterfly_one_coeff_s32(sl[6], sl[5], cospi_16_64, &sl[6], &sl[5]); - highbd_butterfly_one_coeff_s32(sr[6], sr[5], cospi_16_64, &sr[6], &sr[5]); + butterfly_one_coeff_s32_fast(sl[6], sr[6], sl[5], sr[5], cospi_16_64, &sl[6], + &sr[6], &sl[5], &sr[5]); // Stage 3 xl[0] = vaddq_s32(sl[4], sl[5]); @@ -559,18 +520,16 @@ static void vpx_highbd_fdct16x16_body(int32x4_t *left /*[16]*/, xr[3] = vaddq_s32(sr[7], sr[6]); // Stage 4 - // out[2] = fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64) - // out[14] = fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64) - highbd_butterfly_two_coeff_s32(xl[3], xl[0], cospi_4_64, cospi_28_64, - &left[2], &left[14]); - highbd_butterfly_two_coeff_s32(xr[3], xr[0], cospi_4_64, cospi_28_64, - &right[2], &right[14]); - // out[6] = fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64) - // out[10] = fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64) - highbd_butterfly_two_coeff_s32(xl[2], xl[1], cospi_20_64, cospi_12_64, - &left[10], &left[6]); - highbd_butterfly_two_coeff_s32(xr[2], xr[1], cospi_20_64, cospi_12_64, - &right[10], &right[6]); + // out[2] = fdct_round_shift(x3 * cospi_4_64 + x0 * cospi_28_64) + // out[14] = fdct_round_shift(x3 * cospi_28_64 - x0 * cospi_4_64) + butterfly_two_coeff_s32_s64_narrow(xl[3], xr[3], xl[0], xr[0], cospi_4_64, + cospi_28_64, &left[2], &right[2], + &left[14], &right[14]); + // out[6] = fdct_round_shift(x2 * cospi_20_64 + x1 * cospi_12_64) + // out[10] = fdct_round_shift(x2 * cospi_12_64 - x1 * cospi_20_64) + butterfly_two_coeff_s32_s64_narrow(xl[2], xr[2], xl[1], xr[1], cospi_20_64, + cospi_12_64, &left[10], &right[10], + &left[6], &right[6]); // step 2 // From fwd_txfm.c: Work on the next eight values; step1 -> odd_results" @@ -582,10 +541,10 @@ static void vpx_highbd_fdct16x16_body(int32x4_t *left /*[16]*/, // step2[3] = fdct_round_shift((step1[4] - step1[3]) * cospi_16_64) // step2[4] = fdct_round_shift((step1[4] + step1[3]) * cospi_16_64) // step2[5] = fdct_round_shift((step1[5] + step1[2]) * cospi_16_64) - highbd_butterfly_one_coeff_s32(inl[5], inl[2], cospi_16_64, &sl[5], &sl[2]); - highbd_butterfly_one_coeff_s32(inr[5], inr[2], cospi_16_64, &sr[5], &sr[2]); - highbd_butterfly_one_coeff_s32(inl[4], inl[3], cospi_16_64, &sl[4], &sl[3]); - highbd_butterfly_one_coeff_s32(inr[4], inr[3], cospi_16_64, &sr[4], &sr[3]); + butterfly_one_coeff_s32_fast(inl[5], inr[5], inl[2], inr[2], cospi_16_64, + &sl[5], &sr[5], &sl[2], &sr[2]); + butterfly_one_coeff_s32_fast(inl[4], inr[4], inl[3], inr[3], cospi_16_64, + &sl[4], &sr[4], &sl[3], &sr[3]); // step 3 sl[0] = vaddq_s32(inl[0], sl[3]); @@ -606,19 +565,18 @@ static void vpx_highbd_fdct16x16_body(int32x4_t *left /*[16]*/, sr[7] = vaddq_s32(inr[7], sr[4]); // step 4 - // step2[1] = fdct_round_shift(step3[1] *-cospi_8_64 + step3[6] * cospi_24_64) - // step2[6] = fdct_round_shift(step3[1] * cospi_24_64 + step3[6] * cospi_8_64) - highbd_butterfly_two_coeff_s32(sl[6], sl[1], cospi_8_64, cospi_24_64, &sl[6], - &sl[1]); - highbd_butterfly_two_coeff_s32(sr[6], sr[1], cospi_8_64, cospi_24_64, &sr[6], - &sr[1]); - + // step2[6] = fdct_round_shift(step3[6] * cospi_8_64 + step3[1] * + // cospi_24_64) step2[1] = fdct_round_shift(step3[6] * cospi_24_64 - step3[1] + // * cospi_8_64) + butterfly_two_coeff_s32_s64_narrow(sl[6], sr[6], sl[1], sr[1], cospi_8_64, + cospi_24_64, &sl[6], &sr[6], &sl[1], + &sr[1]); // step2[2] = fdct_round_shift(step3[2] * cospi_24_64 + step3[5] * cospi_8_64) - // step2[5] = fdct_round_shift(step3[2] * cospi_8_64 - step3[5] * cospi_24_64) - highbd_butterfly_two_coeff_s32(xl[0], xl[3], cospi_24_64, cospi_8_64, &sl[2], - &sl[5]); - highbd_butterfly_two_coeff_s32(xr[0], xr[3], cospi_24_64, cospi_8_64, &sr[2], - &sr[5]); + // step2[5] = fdct_round_shift(step3[2] * cospi_8_64 - step3[5] * + // cospi_24_64) + butterfly_two_coeff_s32_s64_narrow(xl[0], xr[0], xl[3], xr[3], cospi_24_64, + cospi_8_64, &sl[2], &sr[2], &sl[5], + &sr[5]); // step 5 stepl[0] = vaddq_s32(sl[0], sl[1]); @@ -639,31 +597,26 @@ static void vpx_highbd_fdct16x16_body(int32x4_t *left /*[16]*/, stepr[7] = vaddq_s32(sr[7], sr[6]); // step 6 - // out[1] = fdct_round_shift(step1[0] * cospi_30_64 + step1[7] * cospi_2_64) - // out[15] = fdct_round_shift(step1[0] * -cospi_2_64 + step1[7] * cospi_30_64) - // out[9] = fdct_round_shift(step1[1] * cospi_14_64 + step1[6] * cospi_18_64) - // out[7] = fdct_round_shift(step1[1] * -cospi_18_64 + step1[6] * cospi_14_64) - // out[5] = fdct_round_shift(step1[2] * cospi_22_64 + step1[5] * cospi_10_64) - // out[11] = fdct_round_shift(step1[2] * -cospi_10_64 + step1[5] * - // cospi_22_64) out[13] = fdct_round_shift(step1[3] * cospi_6_64 + step1[4] * - // cospi_26_64) out[3] = fdct_round_shift(step1[3] * -cospi_26_64 + step1[4] * - // cospi_6_64) - highbd_butterfly_two_coeff_s32(stepl[7], stepl[0], cospi_2_64, cospi_30_64, - &left[1], &left[15]); - highbd_butterfly_two_coeff_s32(stepr[7], stepr[0], cospi_2_64, cospi_30_64, - &right[1], &right[15]); - highbd_butterfly_two_coeff_s32(stepl[6], stepl[1], cospi_18_64, cospi_14_64, - &left[9], &left[7]); - highbd_butterfly_two_coeff_s32(stepr[6], stepr[1], cospi_18_64, cospi_14_64, - &right[9], &right[7]); - highbd_butterfly_two_coeff_s32(stepl[5], stepl[2], cospi_10_64, cospi_22_64, - &left[5], &left[11]); - highbd_butterfly_two_coeff_s32(stepr[5], stepr[2], cospi_10_64, cospi_22_64, - &right[5], &right[11]); - highbd_butterfly_two_coeff_s32(stepl[4], stepl[3], cospi_26_64, cospi_6_64, - &left[13], &left[3]); - highbd_butterfly_two_coeff_s32(stepr[4], stepr[3], cospi_26_64, cospi_6_64, - &right[13], &right[3]); + // out[9] = fdct_round_shift(step1[6] * cospi_18_64 + step1[1] * cospi_14_64) + // out[7] = fdct_round_shift(step1[6] * cospi_14_64 - step1[1] * cospi_18_64) + butterfly_two_coeff_s32_s64_narrow(stepl[6], stepr[6], stepl[1], stepr[1], + cospi_18_64, cospi_14_64, &left[9], + &right[9], &left[7], &right[7]); + // out[1] = fdct_round_shift(step1[7] * cospi_2_64 + step1[0] * cospi_30_64) + // out[15] = fdct_round_shift(step1[7] * cospi_30_64 - step1[0] * cospi_2_64) + butterfly_two_coeff_s32_s64_narrow(stepl[7], stepr[7], stepl[0], stepr[0], + cospi_2_64, cospi_30_64, &left[1], + &right[1], &left[15], &right[15]); + // out[13] = fdct_round_shift(step1[4] * cospi_26_64 + step1[3] * cospi_6_64) + // out[3] = fdct_round_shift(step1[4] * cospi_6_64 - step1[3] * cospi_26_64) + butterfly_two_coeff_s32_s64_narrow(stepl[4], stepr[4], stepl[3], stepr[3], + cospi_26_64, cospi_6_64, &left[13], + &right[13], &left[3], &right[3]); + // out[5] = fdct_round_shift(step1[5] * cospi_10_64 + step1[2] * cospi_22_64) + // out[11] = fdct_round_shift(step1[5] * cospi_22_64 - step1[2] * cospi_10_64) + butterfly_two_coeff_s32_s64_narrow(stepl[5], stepr[5], stepl[2], stepr[2], + cospi_10_64, cospi_22_64, &left[5], + &right[5], &left[11], &right[11]); } #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/arm/fdct32x32_neon.c b/vpx_dsp/arm/fdct32x32_neon.c index 51d81bd085..e2bf167604 100644 --- a/vpx_dsp/arm/fdct32x32_neon.c +++ b/vpx_dsp/arm/fdct32x32_neon.c @@ -16,6 +16,7 @@ #include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/transpose_neon.h" #include "vpx_dsp/arm/fdct_neon.h" +#include "vpx_dsp/arm/fdct32x32_neon.h" // Most gcc 4.9 distributions outside of Android do not generate correct code // for this function. @@ -33,1123 +34,6 @@ void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output, #else -#define LOAD_INCREMENT(src, stride, dest, index) \ - do { \ - dest[index] = vld1q_s16(src); \ - src += stride; \ - } while (0) - -#define ADD_S16(src, index0, index1, dest, index3) \ - do { \ - dest[index3] = vaddq_s16(src[index0], src[index1]); \ - } while (0) - -#define ADD_SHIFT_S16(src, index0, index1) \ - do { \ - src[index1] = vshlq_n_s16(vsubq_s16(src[index0], src[index1]), 2); \ - } while (0) - -// Load, cross, and multiply by 4. Load the first 8 and last 8, then the -// middle -// 16. Doing sets of 16 at a time. Maybe sets of 8 would be better? -static INLINE void load(const int16_t *a, int stride, int16x8_t *b) { - const int16_t *a_end = a + 24 * stride; - int16x8_t c[8]; - - LOAD_INCREMENT(a, stride, b, 0); - LOAD_INCREMENT(a, stride, b, 1); - LOAD_INCREMENT(a, stride, b, 2); - LOAD_INCREMENT(a, stride, b, 3); - LOAD_INCREMENT(a, stride, b, 4); - LOAD_INCREMENT(a, stride, b, 5); - LOAD_INCREMENT(a, stride, b, 6); - LOAD_INCREMENT(a, stride, b, 7); - - LOAD_INCREMENT(a_end, stride, b, 24); - LOAD_INCREMENT(a_end, stride, b, 25); - LOAD_INCREMENT(a_end, stride, b, 26); - LOAD_INCREMENT(a_end, stride, b, 27); - LOAD_INCREMENT(a_end, stride, b, 28); - LOAD_INCREMENT(a_end, stride, b, 29); - LOAD_INCREMENT(a_end, stride, b, 30); - LOAD_INCREMENT(a_end, stride, b, 31); - - ADD_S16(b, 0, 31, c, 0); - ADD_S16(b, 1, 30, c, 1); - ADD_S16(b, 2, 29, c, 2); - ADD_S16(b, 3, 28, c, 3); - ADD_S16(b, 4, 27, c, 4); - ADD_S16(b, 5, 26, c, 5); - ADD_S16(b, 6, 25, c, 6); - ADD_S16(b, 7, 24, c, 7); - - ADD_SHIFT_S16(b, 7, 24); - ADD_SHIFT_S16(b, 6, 25); - ADD_SHIFT_S16(b, 5, 26); - ADD_SHIFT_S16(b, 4, 27); - ADD_SHIFT_S16(b, 3, 28); - ADD_SHIFT_S16(b, 2, 29); - ADD_SHIFT_S16(b, 1, 30); - ADD_SHIFT_S16(b, 0, 31); - - b[0] = vshlq_n_s16(c[0], 2); - b[1] = vshlq_n_s16(c[1], 2); - b[2] = vshlq_n_s16(c[2], 2); - b[3] = vshlq_n_s16(c[3], 2); - b[4] = vshlq_n_s16(c[4], 2); - b[5] = vshlq_n_s16(c[5], 2); - b[6] = vshlq_n_s16(c[6], 2); - b[7] = vshlq_n_s16(c[7], 2); - - LOAD_INCREMENT(a, stride, b, 8); - LOAD_INCREMENT(a, stride, b, 9); - LOAD_INCREMENT(a, stride, b, 10); - LOAD_INCREMENT(a, stride, b, 11); - LOAD_INCREMENT(a, stride, b, 12); - LOAD_INCREMENT(a, stride, b, 13); - LOAD_INCREMENT(a, stride, b, 14); - LOAD_INCREMENT(a, stride, b, 15); - LOAD_INCREMENT(a, stride, b, 16); - LOAD_INCREMENT(a, stride, b, 17); - LOAD_INCREMENT(a, stride, b, 18); - LOAD_INCREMENT(a, stride, b, 19); - LOAD_INCREMENT(a, stride, b, 20); - LOAD_INCREMENT(a, stride, b, 21); - LOAD_INCREMENT(a, stride, b, 22); - LOAD_INCREMENT(a, stride, b, 23); - - ADD_S16(b, 8, 23, c, 0); - ADD_S16(b, 9, 22, c, 1); - ADD_S16(b, 10, 21, c, 2); - ADD_S16(b, 11, 20, c, 3); - ADD_S16(b, 12, 19, c, 4); - ADD_S16(b, 13, 18, c, 5); - ADD_S16(b, 14, 17, c, 6); - ADD_S16(b, 15, 16, c, 7); - - ADD_SHIFT_S16(b, 15, 16); - ADD_SHIFT_S16(b, 14, 17); - ADD_SHIFT_S16(b, 13, 18); - ADD_SHIFT_S16(b, 12, 19); - ADD_SHIFT_S16(b, 11, 20); - ADD_SHIFT_S16(b, 10, 21); - ADD_SHIFT_S16(b, 9, 22); - ADD_SHIFT_S16(b, 8, 23); - - b[8] = vshlq_n_s16(c[0], 2); - b[9] = vshlq_n_s16(c[1], 2); - b[10] = vshlq_n_s16(c[2], 2); - b[11] = vshlq_n_s16(c[3], 2); - b[12] = vshlq_n_s16(c[4], 2); - b[13] = vshlq_n_s16(c[5], 2); - b[14] = vshlq_n_s16(c[6], 2); - b[15] = vshlq_n_s16(c[7], 2); -} - -#undef LOAD_INCREMENT -#undef ADD_S16 -#undef ADD_SHIFT_S16 - -#define STORE_S16(src, index, dest) \ - do { \ - store_s16q_to_tran_low(dest, src[index]); \ - dest += 8; \ - } while (0) - -// Store 32 16x8 values, assuming stride == 32. -// Slight twist: store horizontally in blocks of 8. -static INLINE void store(tran_low_t *a, const int16x8_t *b) { - STORE_S16(b, 0, a); - STORE_S16(b, 8, a); - STORE_S16(b, 16, a); - STORE_S16(b, 24, a); - STORE_S16(b, 1, a); - STORE_S16(b, 9, a); - STORE_S16(b, 17, a); - STORE_S16(b, 25, a); - STORE_S16(b, 2, a); - STORE_S16(b, 10, a); - STORE_S16(b, 18, a); - STORE_S16(b, 26, a); - STORE_S16(b, 3, a); - STORE_S16(b, 11, a); - STORE_S16(b, 19, a); - STORE_S16(b, 27, a); - STORE_S16(b, 4, a); - STORE_S16(b, 12, a); - STORE_S16(b, 20, a); - STORE_S16(b, 28, a); - STORE_S16(b, 5, a); - STORE_S16(b, 13, a); - STORE_S16(b, 21, a); - STORE_S16(b, 29, a); - STORE_S16(b, 6, a); - STORE_S16(b, 14, a); - STORE_S16(b, 22, a); - STORE_S16(b, 30, a); - STORE_S16(b, 7, a); - STORE_S16(b, 15, a); - STORE_S16(b, 23, a); - STORE_S16(b, 31, a); -} - -#undef STORE_S16 - -static void dct_body_first_pass(const int16x8_t *in, int16x8_t *out) { - int16x8_t a[32]; - int16x8_t b[32]; - - // Stage 1: Done as part of the load. - - // Stage 2. - // Mini cross. X the first 16 values and the middle 8 of the second half. - a[0] = vaddq_s16(in[0], in[15]); - a[1] = vaddq_s16(in[1], in[14]); - a[2] = vaddq_s16(in[2], in[13]); - a[3] = vaddq_s16(in[3], in[12]); - a[4] = vaddq_s16(in[4], in[11]); - a[5] = vaddq_s16(in[5], in[10]); - a[6] = vaddq_s16(in[6], in[9]); - a[7] = vaddq_s16(in[7], in[8]); - - a[8] = vsubq_s16(in[7], in[8]); - a[9] = vsubq_s16(in[6], in[9]); - a[10] = vsubq_s16(in[5], in[10]); - a[11] = vsubq_s16(in[4], in[11]); - a[12] = vsubq_s16(in[3], in[12]); - a[13] = vsubq_s16(in[2], in[13]); - a[14] = vsubq_s16(in[1], in[14]); - a[15] = vsubq_s16(in[0], in[15]); - - a[16] = in[16]; - a[17] = in[17]; - a[18] = in[18]; - a[19] = in[19]; - - butterfly_one_coeff(in[27], in[20], cospi_16_64, &a[27], &a[20]); - butterfly_one_coeff(in[26], in[21], cospi_16_64, &a[26], &a[21]); - butterfly_one_coeff(in[25], in[22], cospi_16_64, &a[25], &a[22]); - butterfly_one_coeff(in[24], in[23], cospi_16_64, &a[24], &a[23]); - - a[28] = in[28]; - a[29] = in[29]; - a[30] = in[30]; - a[31] = in[31]; - - // Stage 3. - b[0] = vaddq_s16(a[0], a[7]); - b[1] = vaddq_s16(a[1], a[6]); - b[2] = vaddq_s16(a[2], a[5]); - b[3] = vaddq_s16(a[3], a[4]); - - b[4] = vsubq_s16(a[3], a[4]); - b[5] = vsubq_s16(a[2], a[5]); - b[6] = vsubq_s16(a[1], a[6]); - b[7] = vsubq_s16(a[0], a[7]); - - b[8] = a[8]; - b[9] = a[9]; - - butterfly_one_coeff(a[13], a[10], cospi_16_64, &b[13], &b[10]); - butterfly_one_coeff(a[12], a[11], cospi_16_64, &b[12], &b[11]); - - b[14] = a[14]; - b[15] = a[15]; - - b[16] = vaddq_s16(in[16], a[23]); - b[17] = vaddq_s16(in[17], a[22]); - b[18] = vaddq_s16(in[18], a[21]); - b[19] = vaddq_s16(in[19], a[20]); - - b[20] = vsubq_s16(in[19], a[20]); - b[21] = vsubq_s16(in[18], a[21]); - b[22] = vsubq_s16(in[17], a[22]); - b[23] = vsubq_s16(in[16], a[23]); - - b[24] = vsubq_s16(in[31], a[24]); - b[25] = vsubq_s16(in[30], a[25]); - b[26] = vsubq_s16(in[29], a[26]); - b[27] = vsubq_s16(in[28], a[27]); - - b[28] = vaddq_s16(in[28], a[27]); - b[29] = vaddq_s16(in[29], a[26]); - b[30] = vaddq_s16(in[30], a[25]); - b[31] = vaddq_s16(in[31], a[24]); - - // Stage 4. - a[0] = vaddq_s16(b[0], b[3]); - a[1] = vaddq_s16(b[1], b[2]); - a[2] = vsubq_s16(b[1], b[2]); - a[3] = vsubq_s16(b[0], b[3]); - - a[4] = b[4]; - - butterfly_one_coeff(b[6], b[5], cospi_16_64, &a[6], &a[5]); - - a[7] = b[7]; - - a[8] = vaddq_s16(b[8], b[11]); - a[9] = vaddq_s16(b[9], b[10]); - a[10] = vsubq_s16(b[9], b[10]); - a[11] = vsubq_s16(b[8], b[11]); - a[12] = vsubq_s16(b[15], b[12]); - a[13] = vsubq_s16(b[14], b[13]); - a[14] = vaddq_s16(b[14], b[13]); - a[15] = vaddq_s16(b[15], b[12]); - - a[16] = b[16]; - a[17] = b[17]; - - butterfly_two_coeff(b[29], b[18], cospi_24_64, cospi_8_64, &a[29], &a[18]); - butterfly_two_coeff(b[28], b[19], cospi_24_64, cospi_8_64, &a[28], &a[19]); - butterfly_two_coeff(b[27], b[20], -cospi_8_64, cospi_24_64, &a[27], &a[20]); - butterfly_two_coeff(b[26], b[21], -cospi_8_64, cospi_24_64, &a[26], &a[21]); - - a[22] = b[22]; - a[23] = b[23]; - a[24] = b[24]; - a[25] = b[25]; - - a[30] = b[30]; - a[31] = b[31]; - - // Stage 5. - butterfly_one_coeff(a[0], a[1], cospi_16_64, &b[0], &b[1]); - butterfly_two_coeff(a[3], a[2], cospi_24_64, cospi_8_64, &b[2], &b[3]); - - b[4] = vaddq_s16(a[4], a[5]); - b[5] = vsubq_s16(a[4], a[5]); - b[6] = vsubq_s16(a[7], a[6]); - b[7] = vaddq_s16(a[7], a[6]); - - b[8] = a[8]; - - butterfly_two_coeff(a[14], a[9], cospi_24_64, cospi_8_64, &b[14], &b[9]); - butterfly_two_coeff(a[13], a[10], -cospi_8_64, cospi_24_64, &b[13], &b[10]); - - b[11] = a[11]; - b[12] = a[12]; - - b[15] = a[15]; - - b[16] = vaddq_s16(a[19], a[16]); - b[17] = vaddq_s16(a[18], a[17]); - b[18] = vsubq_s16(a[17], a[18]); - b[19] = vsubq_s16(a[16], a[19]); - b[20] = vsubq_s16(a[23], a[20]); - b[21] = vsubq_s16(a[22], a[21]); - b[22] = vaddq_s16(a[21], a[22]); - b[23] = vaddq_s16(a[20], a[23]); - b[24] = vaddq_s16(a[27], a[24]); - b[25] = vaddq_s16(a[26], a[25]); - b[26] = vsubq_s16(a[25], a[26]); - b[27] = vsubq_s16(a[24], a[27]); - b[28] = vsubq_s16(a[31], a[28]); - b[29] = vsubq_s16(a[30], a[29]); - b[30] = vaddq_s16(a[29], a[30]); - b[31] = vaddq_s16(a[28], a[31]); - - // Stage 6. - a[0] = b[0]; - a[1] = b[1]; - a[2] = b[2]; - a[3] = b[3]; - - butterfly_two_coeff(b[7], b[4], cospi_28_64, cospi_4_64, &a[4], &a[7]); - butterfly_two_coeff(b[6], b[5], cospi_12_64, cospi_20_64, &a[5], &a[6]); - - a[8] = vaddq_s16(b[8], b[9]); - a[9] = vsubq_s16(b[8], b[9]); - a[10] = vsubq_s16(b[11], b[10]); - a[11] = vaddq_s16(b[11], b[10]); - a[12] = vaddq_s16(b[12], b[13]); - a[13] = vsubq_s16(b[12], b[13]); - a[14] = vsubq_s16(b[15], b[14]); - a[15] = vaddq_s16(b[15], b[14]); - - a[16] = b[16]; - a[19] = b[19]; - a[20] = b[20]; - a[23] = b[23]; - a[24] = b[24]; - a[27] = b[27]; - a[28] = b[28]; - a[31] = b[31]; - - butterfly_two_coeff(b[30], b[17], cospi_28_64, cospi_4_64, &a[30], &a[17]); - butterfly_two_coeff(b[29], b[18], -cospi_4_64, cospi_28_64, &a[29], &a[18]); - - butterfly_two_coeff(b[26], b[21], cospi_12_64, cospi_20_64, &a[26], &a[21]); - butterfly_two_coeff(b[25], b[22], -cospi_20_64, cospi_12_64, &a[25], &a[22]); - - // Stage 7. - b[0] = a[0]; - b[1] = a[1]; - b[2] = a[2]; - b[3] = a[3]; - b[4] = a[4]; - b[5] = a[5]; - b[6] = a[6]; - b[7] = a[7]; - - butterfly_two_coeff(a[15], a[8], cospi_30_64, cospi_2_64, &b[8], &b[15]); - butterfly_two_coeff(a[14], a[9], cospi_14_64, cospi_18_64, &b[9], &b[14]); - butterfly_two_coeff(a[13], a[10], cospi_22_64, cospi_10_64, &b[10], &b[13]); - butterfly_two_coeff(a[12], a[11], cospi_6_64, cospi_26_64, &b[11], &b[12]); - - b[16] = vaddq_s16(a[16], a[17]); - b[17] = vsubq_s16(a[16], a[17]); - b[18] = vsubq_s16(a[19], a[18]); - b[19] = vaddq_s16(a[19], a[18]); - b[20] = vaddq_s16(a[20], a[21]); - b[21] = vsubq_s16(a[20], a[21]); - b[22] = vsubq_s16(a[23], a[22]); - b[23] = vaddq_s16(a[23], a[22]); - b[24] = vaddq_s16(a[24], a[25]); - b[25] = vsubq_s16(a[24], a[25]); - b[26] = vsubq_s16(a[27], a[26]); - b[27] = vaddq_s16(a[27], a[26]); - b[28] = vaddq_s16(a[28], a[29]); - b[29] = vsubq_s16(a[28], a[29]); - b[30] = vsubq_s16(a[31], a[30]); - b[31] = vaddq_s16(a[31], a[30]); - - // Final stage. - // Also compute partial rounding shift: - // output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; - out[0] = sub_round_shift(b[0]); - out[16] = sub_round_shift(b[1]); - out[8] = sub_round_shift(b[2]); - out[24] = sub_round_shift(b[3]); - out[4] = sub_round_shift(b[4]); - out[20] = sub_round_shift(b[5]); - out[12] = sub_round_shift(b[6]); - out[28] = sub_round_shift(b[7]); - out[2] = sub_round_shift(b[8]); - out[18] = sub_round_shift(b[9]); - out[10] = sub_round_shift(b[10]); - out[26] = sub_round_shift(b[11]); - out[6] = sub_round_shift(b[12]); - out[22] = sub_round_shift(b[13]); - out[14] = sub_round_shift(b[14]); - out[30] = sub_round_shift(b[15]); - - butterfly_two_coeff(b[31], b[16], cospi_31_64, cospi_1_64, &a[1], &a[31]); - out[1] = sub_round_shift(a[1]); - out[31] = sub_round_shift(a[31]); - - butterfly_two_coeff(b[30], b[17], cospi_15_64, cospi_17_64, &a[17], &a[15]); - out[17] = sub_round_shift(a[17]); - out[15] = sub_round_shift(a[15]); - - butterfly_two_coeff(b[29], b[18], cospi_23_64, cospi_9_64, &a[9], &a[23]); - out[9] = sub_round_shift(a[9]); - out[23] = sub_round_shift(a[23]); - - butterfly_two_coeff(b[28], b[19], cospi_7_64, cospi_25_64, &a[25], &a[7]); - out[25] = sub_round_shift(a[25]); - out[7] = sub_round_shift(a[7]); - - butterfly_two_coeff(b[27], b[20], cospi_27_64, cospi_5_64, &a[5], &a[27]); - out[5] = sub_round_shift(a[5]); - out[27] = sub_round_shift(a[27]); - - butterfly_two_coeff(b[26], b[21], cospi_11_64, cospi_21_64, &a[21], &a[11]); - out[21] = sub_round_shift(a[21]); - out[11] = sub_round_shift(a[11]); - - butterfly_two_coeff(b[25], b[22], cospi_19_64, cospi_13_64, &a[13], &a[19]); - out[13] = sub_round_shift(a[13]); - out[19] = sub_round_shift(a[19]); - - butterfly_two_coeff(b[24], b[23], cospi_3_64, cospi_29_64, &a[29], &a[3]); - out[29] = sub_round_shift(a[29]); - out[3] = sub_round_shift(a[3]); -} - -#define PASS_THROUGH(src, dst, element) \ - do { \ - dst##_lo[element] = src##_lo[element]; \ - dst##_hi[element] = src##_hi[element]; \ - } while (0) - -#define ADD_S16_S32(a, left_index, right_index, b, b_index) \ - do { \ - b##_lo[b_index] = \ - vaddl_s16(vget_low_s16(a[left_index]), vget_low_s16(a[right_index])); \ - b##_hi[b_index] = vaddl_s16(vget_high_s16(a[left_index]), \ - vget_high_s16(a[right_index])); \ - } while (0) - -#define SUB_S16_S32(a, left_index, right_index, b, b_index) \ - do { \ - b##_lo[b_index] = \ - vsubl_s16(vget_low_s16(a[left_index]), vget_low_s16(a[right_index])); \ - b##_hi[b_index] = vsubl_s16(vget_high_s16(a[left_index]), \ - vget_high_s16(a[right_index])); \ - } while (0) - -#define ADDW_S16_S32(a, a_index, b, b_index, c, c_index) \ - do { \ - c##_lo[c_index] = vaddw_s16(a##_lo[a_index], vget_low_s16(b[b_index])); \ - c##_hi[c_index] = vaddw_s16(a##_hi[a_index], vget_high_s16(b[b_index])); \ - } while (0) - -#define SUBW_S16_S32(a, a_index, b, b_index, temp, temp_index, c, c_index) \ - do { \ - temp##_lo[temp_index] = vmovl_s16(vget_low_s16(a[a_index])); \ - temp##_hi[temp_index] = vmovl_s16(vget_high_s16(a[a_index])); \ - c##_lo[c_index] = vsubq_s32(temp##_lo[temp_index], b##_lo[b_index]); \ - c##_hi[c_index] = vsubq_s32(temp##_hi[temp_index], b##_hi[b_index]); \ - } while (0) - -#define ADD_S32(a, left_index, right_index, b, b_index) \ - do { \ - b##_lo[b_index] = vaddq_s32(a##_lo[left_index], a##_lo[right_index]); \ - b##_hi[b_index] = vaddq_s32(a##_hi[left_index], a##_hi[right_index]); \ - } while (0) - -#define SUB_S32(a, left_index, right_index, b, b_index) \ - do { \ - b##_lo[b_index] = vsubq_s32(a##_lo[left_index], a##_lo[right_index]); \ - b##_hi[b_index] = vsubq_s32(a##_hi[left_index], a##_hi[right_index]); \ - } while (0) - -#define BUTTERFLY_ONE_S16_S32(a, left_index, right_index, constant, b, \ - add_index, sub_index) \ - do { \ - butterfly_one_coeff_s16_s32(a[left_index], a[right_index], constant, \ - &b##_lo[add_index], &b##_hi[add_index], \ - &b##_lo[sub_index], &b##_hi[sub_index]); \ - } while (0) - -#define BUTTERFLY_ONE_S32(a, left_index, right_index, constant, b, add_index, \ - sub_index) \ - do { \ - butterfly_one_coeff_s32(a##_lo[left_index], a##_hi[left_index], \ - a##_lo[right_index], a##_hi[right_index], \ - constant, &b##_lo[add_index], &b##_hi[add_index], \ - &b##_lo[sub_index], &b##_hi[sub_index]); \ - } while (0) - -#define BUTTERFLY_TWO_S32(a, left_index, right_index, left_constant, \ - right_constant, b, add_index, sub_index) \ - do { \ - butterfly_two_coeff_s32(a##_lo[left_index], a##_hi[left_index], \ - a##_lo[right_index], a##_hi[right_index], \ - left_constant, right_constant, &b##_lo[add_index], \ - &b##_hi[add_index], &b##_lo[sub_index], \ - &b##_hi[sub_index]); \ - } while (0) - -static void dct_body_second_pass(const int16x8_t *in, int16x8_t *out) { - int16x8_t a[32]; - int16x8_t b[32]; - int32x4_t c_lo[32]; - int32x4_t c_hi[32]; - int32x4_t d_lo[32]; - int32x4_t d_hi[32]; - - // Stage 1. Done as part of the load for the first pass. - a[0] = vaddq_s16(in[0], in[31]); - a[1] = vaddq_s16(in[1], in[30]); - a[2] = vaddq_s16(in[2], in[29]); - a[3] = vaddq_s16(in[3], in[28]); - a[4] = vaddq_s16(in[4], in[27]); - a[5] = vaddq_s16(in[5], in[26]); - a[6] = vaddq_s16(in[6], in[25]); - a[7] = vaddq_s16(in[7], in[24]); - a[8] = vaddq_s16(in[8], in[23]); - a[9] = vaddq_s16(in[9], in[22]); - a[10] = vaddq_s16(in[10], in[21]); - a[11] = vaddq_s16(in[11], in[20]); - a[12] = vaddq_s16(in[12], in[19]); - a[13] = vaddq_s16(in[13], in[18]); - a[14] = vaddq_s16(in[14], in[17]); - a[15] = vaddq_s16(in[15], in[16]); - a[16] = vsubq_s16(in[15], in[16]); - a[17] = vsubq_s16(in[14], in[17]); - a[18] = vsubq_s16(in[13], in[18]); - a[19] = vsubq_s16(in[12], in[19]); - a[20] = vsubq_s16(in[11], in[20]); - a[21] = vsubq_s16(in[10], in[21]); - a[22] = vsubq_s16(in[9], in[22]); - a[23] = vsubq_s16(in[8], in[23]); - a[24] = vsubq_s16(in[7], in[24]); - a[25] = vsubq_s16(in[6], in[25]); - a[26] = vsubq_s16(in[5], in[26]); - a[27] = vsubq_s16(in[4], in[27]); - a[28] = vsubq_s16(in[3], in[28]); - a[29] = vsubq_s16(in[2], in[29]); - a[30] = vsubq_s16(in[1], in[30]); - a[31] = vsubq_s16(in[0], in[31]); - - // Stage 2. - b[0] = vaddq_s16(a[0], a[15]); - b[1] = vaddq_s16(a[1], a[14]); - b[2] = vaddq_s16(a[2], a[13]); - b[3] = vaddq_s16(a[3], a[12]); - b[4] = vaddq_s16(a[4], a[11]); - b[5] = vaddq_s16(a[5], a[10]); - b[6] = vaddq_s16(a[6], a[9]); - b[7] = vaddq_s16(a[7], a[8]); - - b[8] = vsubq_s16(a[7], a[8]); - b[9] = vsubq_s16(a[6], a[9]); - b[10] = vsubq_s16(a[5], a[10]); - b[11] = vsubq_s16(a[4], a[11]); - b[12] = vsubq_s16(a[3], a[12]); - b[13] = vsubq_s16(a[2], a[13]); - b[14] = vsubq_s16(a[1], a[14]); - b[15] = vsubq_s16(a[0], a[15]); - - b[16] = a[16]; - b[17] = a[17]; - b[18] = a[18]; - b[19] = a[19]; - - butterfly_one_coeff(a[27], a[20], cospi_16_64, &b[27], &b[20]); - butterfly_one_coeff(a[26], a[21], cospi_16_64, &b[26], &b[21]); - butterfly_one_coeff(a[25], a[22], cospi_16_64, &b[25], &b[22]); - butterfly_one_coeff(a[24], a[23], cospi_16_64, &b[24], &b[23]); - - b[28] = a[28]; - b[29] = a[29]; - b[30] = a[30]; - b[31] = a[31]; - - // Stage 3. With extreme values for input this calculation rolls over int16_t. - // The sources for b[0] get added multiple times and, through testing, have - // been shown to overflow starting here. - ADD_S16_S32(b, 0, 7, c, 0); - ADD_S16_S32(b, 1, 6, c, 1); - ADD_S16_S32(b, 2, 5, c, 2); - ADD_S16_S32(b, 3, 4, c, 3); - SUB_S16_S32(b, 3, 4, c, 4); - SUB_S16_S32(b, 2, 5, c, 5); - SUB_S16_S32(b, 1, 6, c, 6); - SUB_S16_S32(b, 0, 7, c, 7); - - a[8] = b[8]; - a[9] = b[9]; - - BUTTERFLY_ONE_S16_S32(b, 13, 10, cospi_16_64, c, 13, 10); - BUTTERFLY_ONE_S16_S32(b, 12, 11, cospi_16_64, c, 12, 11); - - a[14] = b[14]; - a[15] = b[15]; - - ADD_S16_S32(b, 16, 23, c, 16); - ADD_S16_S32(b, 17, 22, c, 17); - ADD_S16_S32(b, 18, 21, c, 18); - ADD_S16_S32(b, 19, 20, c, 19); - SUB_S16_S32(b, 19, 20, c, 20); - SUB_S16_S32(b, 18, 21, c, 21); - SUB_S16_S32(b, 17, 22, c, 22); - SUB_S16_S32(b, 16, 23, c, 23); - SUB_S16_S32(b, 31, 24, c, 24); - SUB_S16_S32(b, 30, 25, c, 25); - SUB_S16_S32(b, 29, 26, c, 26); - SUB_S16_S32(b, 28, 27, c, 27); - ADD_S16_S32(b, 28, 27, c, 28); - ADD_S16_S32(b, 29, 26, c, 29); - ADD_S16_S32(b, 30, 25, c, 30); - ADD_S16_S32(b, 31, 24, c, 31); - - // Stage 4. - ADD_S32(c, 0, 3, d, 0); - ADD_S32(c, 1, 2, d, 1); - SUB_S32(c, 1, 2, d, 2); - SUB_S32(c, 0, 3, d, 3); - - PASS_THROUGH(c, d, 4); - - BUTTERFLY_ONE_S32(c, 6, 5, cospi_16_64, d, 6, 5); - - PASS_THROUGH(c, d, 7); - - ADDW_S16_S32(c, 11, a, 8, d, 8); - ADDW_S16_S32(c, 10, a, 9, d, 9); - SUBW_S16_S32(a, 9, c, 10, c, 9, d, 10); - SUBW_S16_S32(a, 8, c, 11, c, 8, d, 11); - SUBW_S16_S32(a, 15, c, 12, c, 15, d, 12); - SUBW_S16_S32(a, 14, c, 13, c, 14, d, 13); - ADDW_S16_S32(c, 13, b, 14, d, 14); - ADDW_S16_S32(c, 12, b, 15, d, 15); - - PASS_THROUGH(c, d, 16); - PASS_THROUGH(c, d, 17); - - BUTTERFLY_TWO_S32(c, 29, 18, cospi_24_64, cospi_8_64, d, 29, 18); - BUTTERFLY_TWO_S32(c, 28, 19, cospi_24_64, cospi_8_64, d, 28, 19); - BUTTERFLY_TWO_S32(c, 27, 20, -cospi_8_64, cospi_24_64, d, 27, 20); - BUTTERFLY_TWO_S32(c, 26, 21, -cospi_8_64, cospi_24_64, d, 26, 21); - - PASS_THROUGH(c, d, 22); - PASS_THROUGH(c, d, 23); - PASS_THROUGH(c, d, 24); - PASS_THROUGH(c, d, 25); - - PASS_THROUGH(c, d, 30); - PASS_THROUGH(c, d, 31); - - // Stage 5. - BUTTERFLY_ONE_S32(d, 0, 1, cospi_16_64, c, 0, 1); - BUTTERFLY_TWO_S32(d, 3, 2, cospi_24_64, cospi_8_64, c, 2, 3); - - ADD_S32(d, 4, 5, c, 4); - SUB_S32(d, 4, 5, c, 5); - SUB_S32(d, 7, 6, c, 6); - ADD_S32(d, 7, 6, c, 7); - - PASS_THROUGH(d, c, 8); - - BUTTERFLY_TWO_S32(d, 14, 9, cospi_24_64, cospi_8_64, c, 14, 9); - BUTTERFLY_TWO_S32(d, 13, 10, -cospi_8_64, cospi_24_64, c, 13, 10); - - PASS_THROUGH(d, c, 11); - PASS_THROUGH(d, c, 12); - PASS_THROUGH(d, c, 15); - - ADD_S32(d, 16, 19, c, 16); - ADD_S32(d, 17, 18, c, 17); - SUB_S32(d, 17, 18, c, 18); - SUB_S32(d, 16, 19, c, 19); - SUB_S32(d, 23, 20, c, 20); - SUB_S32(d, 22, 21, c, 21); - ADD_S32(d, 22, 21, c, 22); - ADD_S32(d, 23, 20, c, 23); - ADD_S32(d, 24, 27, c, 24); - ADD_S32(d, 25, 26, c, 25); - SUB_S32(d, 25, 26, c, 26); - SUB_S32(d, 24, 27, c, 27); - SUB_S32(d, 31, 28, c, 28); - SUB_S32(d, 30, 29, c, 29); - ADD_S32(d, 30, 29, c, 30); - ADD_S32(d, 31, 28, c, 31); - - // Stage 6. - PASS_THROUGH(c, d, 0); - PASS_THROUGH(c, d, 1); - PASS_THROUGH(c, d, 2); - PASS_THROUGH(c, d, 3); - - BUTTERFLY_TWO_S32(c, 7, 4, cospi_28_64, cospi_4_64, d, 4, 7); - BUTTERFLY_TWO_S32(c, 6, 5, cospi_12_64, cospi_20_64, d, 5, 6); - - ADD_S32(c, 8, 9, d, 8); - SUB_S32(c, 8, 9, d, 9); - SUB_S32(c, 11, 10, d, 10); - ADD_S32(c, 11, 10, d, 11); - ADD_S32(c, 12, 13, d, 12); - SUB_S32(c, 12, 13, d, 13); - SUB_S32(c, 15, 14, d, 14); - ADD_S32(c, 15, 14, d, 15); - - PASS_THROUGH(c, d, 16); - PASS_THROUGH(c, d, 19); - PASS_THROUGH(c, d, 20); - PASS_THROUGH(c, d, 23); - PASS_THROUGH(c, d, 24); - PASS_THROUGH(c, d, 27); - PASS_THROUGH(c, d, 28); - PASS_THROUGH(c, d, 31); - - BUTTERFLY_TWO_S32(c, 30, 17, cospi_28_64, cospi_4_64, d, 30, 17); - BUTTERFLY_TWO_S32(c, 29, 18, -cospi_4_64, cospi_28_64, d, 29, 18); - BUTTERFLY_TWO_S32(c, 26, 21, cospi_12_64, cospi_20_64, d, 26, 21); - BUTTERFLY_TWO_S32(c, 25, 22, -cospi_20_64, cospi_12_64, d, 25, 22); - - // Stage 7. - PASS_THROUGH(d, c, 0); - PASS_THROUGH(d, c, 1); - PASS_THROUGH(d, c, 2); - PASS_THROUGH(d, c, 3); - PASS_THROUGH(d, c, 4); - PASS_THROUGH(d, c, 5); - PASS_THROUGH(d, c, 6); - PASS_THROUGH(d, c, 7); - - BUTTERFLY_TWO_S32(d, 15, 8, cospi_30_64, cospi_2_64, c, 8, 15); - BUTTERFLY_TWO_S32(d, 14, 9, cospi_14_64, cospi_18_64, c, 9, 14); - BUTTERFLY_TWO_S32(d, 13, 10, cospi_22_64, cospi_10_64, c, 10, 13); - BUTTERFLY_TWO_S32(d, 12, 11, cospi_6_64, cospi_26_64, c, 11, 12); - - ADD_S32(d, 16, 17, c, 16); - SUB_S32(d, 16, 17, c, 17); - SUB_S32(d, 19, 18, c, 18); - ADD_S32(d, 19, 18, c, 19); - ADD_S32(d, 20, 21, c, 20); - SUB_S32(d, 20, 21, c, 21); - SUB_S32(d, 23, 22, c, 22); - ADD_S32(d, 23, 22, c, 23); - ADD_S32(d, 24, 25, c, 24); - SUB_S32(d, 24, 25, c, 25); - SUB_S32(d, 27, 26, c, 26); - ADD_S32(d, 27, 26, c, 27); - ADD_S32(d, 28, 29, c, 28); - SUB_S32(d, 28, 29, c, 29); - SUB_S32(d, 31, 30, c, 30); - ADD_S32(d, 31, 30, c, 31); - - // Final stage. - // Roll rounding into this function so we can pass back int16x8. - - out[0] = add_round_shift_s32(c_lo[0], c_hi[0]); - out[16] = add_round_shift_s32(c_lo[1], c_hi[1]); - - out[8] = add_round_shift_s32(c_lo[2], c_hi[2]); - out[24] = add_round_shift_s32(c_lo[3], c_hi[3]); - out[4] = add_round_shift_s32(c_lo[4], c_hi[4]); - out[20] = add_round_shift_s32(c_lo[5], c_hi[5]); - out[12] = add_round_shift_s32(c_lo[6], c_hi[6]); - - out[28] = add_round_shift_s32(c_lo[7], c_hi[7]); - out[2] = add_round_shift_s32(c_lo[8], c_hi[8]); - out[18] = add_round_shift_s32(c_lo[9], c_hi[9]); - out[10] = add_round_shift_s32(c_lo[10], c_hi[10]); - - out[26] = add_round_shift_s32(c_lo[11], c_hi[11]); - out[6] = add_round_shift_s32(c_lo[12], c_hi[12]); - out[22] = add_round_shift_s32(c_lo[13], c_hi[13]); - out[14] = add_round_shift_s32(c_lo[14], c_hi[14]); - out[30] = add_round_shift_s32(c_lo[15], c_hi[15]); - - BUTTERFLY_TWO_S32(c, 31, 16, cospi_31_64, cospi_1_64, d, 1, 31); - out[1] = add_round_shift_s32(d_lo[1], d_hi[1]); - out[31] = add_round_shift_s32(d_lo[31], d_hi[31]); - - BUTTERFLY_TWO_S32(c, 30, 17, cospi_15_64, cospi_17_64, d, 17, 15); - out[17] = add_round_shift_s32(d_lo[17], d_hi[17]); - out[15] = add_round_shift_s32(d_lo[15], d_hi[15]); - - BUTTERFLY_TWO_S32(c, 29, 18, cospi_23_64, cospi_9_64, d, 9, 23); - out[9] = add_round_shift_s32(d_lo[9], d_hi[9]); - out[23] = add_round_shift_s32(d_lo[23], d_hi[23]); - - BUTTERFLY_TWO_S32(c, 28, 19, cospi_7_64, cospi_25_64, d, 25, 7); - out[25] = add_round_shift_s32(d_lo[25], d_hi[25]); - out[7] = add_round_shift_s32(d_lo[7], d_hi[7]); - - BUTTERFLY_TWO_S32(c, 27, 20, cospi_27_64, cospi_5_64, d, 5, 27); - out[5] = add_round_shift_s32(d_lo[5], d_hi[5]); - out[27] = add_round_shift_s32(d_lo[27], d_hi[27]); - - BUTTERFLY_TWO_S32(c, 26, 21, cospi_11_64, cospi_21_64, d, 21, 11); - out[21] = add_round_shift_s32(d_lo[21], d_hi[21]); - out[11] = add_round_shift_s32(d_lo[11], d_hi[11]); - - BUTTERFLY_TWO_S32(c, 25, 22, cospi_19_64, cospi_13_64, d, 13, 19); - out[13] = add_round_shift_s32(d_lo[13], d_hi[13]); - out[19] = add_round_shift_s32(d_lo[19], d_hi[19]); - - BUTTERFLY_TWO_S32(c, 24, 23, cospi_3_64, cospi_29_64, d, 29, 3); - out[29] = add_round_shift_s32(d_lo[29], d_hi[29]); - out[3] = add_round_shift_s32(d_lo[3], d_hi[3]); -} - -static void dct_body_second_pass_rd(const int16x8_t *in, int16x8_t *out) { - int16x8_t a[32]; - int16x8_t b[32]; - - // Stage 1. Done as part of the load for the first pass. - a[0] = vaddq_s16(in[0], in[31]); - a[1] = vaddq_s16(in[1], in[30]); - a[2] = vaddq_s16(in[2], in[29]); - a[3] = vaddq_s16(in[3], in[28]); - a[4] = vaddq_s16(in[4], in[27]); - a[5] = vaddq_s16(in[5], in[26]); - a[6] = vaddq_s16(in[6], in[25]); - a[7] = vaddq_s16(in[7], in[24]); - a[8] = vaddq_s16(in[8], in[23]); - a[9] = vaddq_s16(in[9], in[22]); - a[10] = vaddq_s16(in[10], in[21]); - a[11] = vaddq_s16(in[11], in[20]); - a[12] = vaddq_s16(in[12], in[19]); - a[13] = vaddq_s16(in[13], in[18]); - a[14] = vaddq_s16(in[14], in[17]); - a[15] = vaddq_s16(in[15], in[16]); - a[16] = vsubq_s16(in[15], in[16]); - a[17] = vsubq_s16(in[14], in[17]); - a[18] = vsubq_s16(in[13], in[18]); - a[19] = vsubq_s16(in[12], in[19]); - a[20] = vsubq_s16(in[11], in[20]); - a[21] = vsubq_s16(in[10], in[21]); - a[22] = vsubq_s16(in[9], in[22]); - a[23] = vsubq_s16(in[8], in[23]); - a[24] = vsubq_s16(in[7], in[24]); - a[25] = vsubq_s16(in[6], in[25]); - a[26] = vsubq_s16(in[5], in[26]); - a[27] = vsubq_s16(in[4], in[27]); - a[28] = vsubq_s16(in[3], in[28]); - a[29] = vsubq_s16(in[2], in[29]); - a[30] = vsubq_s16(in[1], in[30]); - a[31] = vsubq_s16(in[0], in[31]); - - // Stage 2. - // For the "rd" version, all the values are rounded down after stage 2 to keep - // the values in 16 bits. - b[0] = add_round_shift_s16(vaddq_s16(a[0], a[15])); - b[1] = add_round_shift_s16(vaddq_s16(a[1], a[14])); - b[2] = add_round_shift_s16(vaddq_s16(a[2], a[13])); - b[3] = add_round_shift_s16(vaddq_s16(a[3], a[12])); - b[4] = add_round_shift_s16(vaddq_s16(a[4], a[11])); - b[5] = add_round_shift_s16(vaddq_s16(a[5], a[10])); - b[6] = add_round_shift_s16(vaddq_s16(a[6], a[9])); - b[7] = add_round_shift_s16(vaddq_s16(a[7], a[8])); - - b[8] = add_round_shift_s16(vsubq_s16(a[7], a[8])); - b[9] = add_round_shift_s16(vsubq_s16(a[6], a[9])); - b[10] = add_round_shift_s16(vsubq_s16(a[5], a[10])); - b[11] = add_round_shift_s16(vsubq_s16(a[4], a[11])); - b[12] = add_round_shift_s16(vsubq_s16(a[3], a[12])); - b[13] = add_round_shift_s16(vsubq_s16(a[2], a[13])); - b[14] = add_round_shift_s16(vsubq_s16(a[1], a[14])); - b[15] = add_round_shift_s16(vsubq_s16(a[0], a[15])); - - b[16] = add_round_shift_s16(a[16]); - b[17] = add_round_shift_s16(a[17]); - b[18] = add_round_shift_s16(a[18]); - b[19] = add_round_shift_s16(a[19]); - - butterfly_one_coeff(a[27], a[20], cospi_16_64, &b[27], &b[20]); - butterfly_one_coeff(a[26], a[21], cospi_16_64, &b[26], &b[21]); - butterfly_one_coeff(a[25], a[22], cospi_16_64, &b[25], &b[22]); - butterfly_one_coeff(a[24], a[23], cospi_16_64, &b[24], &b[23]); - b[20] = add_round_shift_s16(b[20]); - b[21] = add_round_shift_s16(b[21]); - b[22] = add_round_shift_s16(b[22]); - b[23] = add_round_shift_s16(b[23]); - b[24] = add_round_shift_s16(b[24]); - b[25] = add_round_shift_s16(b[25]); - b[26] = add_round_shift_s16(b[26]); - b[27] = add_round_shift_s16(b[27]); - - b[28] = add_round_shift_s16(a[28]); - b[29] = add_round_shift_s16(a[29]); - b[30] = add_round_shift_s16(a[30]); - b[31] = add_round_shift_s16(a[31]); - - // Stage 3. - a[0] = vaddq_s16(b[0], b[7]); - a[1] = vaddq_s16(b[1], b[6]); - a[2] = vaddq_s16(b[2], b[5]); - a[3] = vaddq_s16(b[3], b[4]); - - a[4] = vsubq_s16(b[3], b[4]); - a[5] = vsubq_s16(b[2], b[5]); - a[6] = vsubq_s16(b[1], b[6]); - a[7] = vsubq_s16(b[0], b[7]); - - a[8] = b[8]; - a[9] = b[9]; - - butterfly_one_coeff(b[13], b[10], cospi_16_64, &a[13], &a[10]); - butterfly_one_coeff(b[12], b[11], cospi_16_64, &a[12], &a[11]); - - a[14] = b[14]; - a[15] = b[15]; - - a[16] = vaddq_s16(b[16], b[23]); - a[17] = vaddq_s16(b[17], b[22]); - a[18] = vaddq_s16(b[18], b[21]); - a[19] = vaddq_s16(b[19], b[20]); - - a[20] = vsubq_s16(b[19], b[20]); - a[21] = vsubq_s16(b[18], b[21]); - a[22] = vsubq_s16(b[17], b[22]); - a[23] = vsubq_s16(b[16], b[23]); - - a[24] = vsubq_s16(b[31], b[24]); - a[25] = vsubq_s16(b[30], b[25]); - a[26] = vsubq_s16(b[29], b[26]); - a[27] = vsubq_s16(b[28], b[27]); - - a[28] = vaddq_s16(b[28], b[27]); - a[29] = vaddq_s16(b[29], b[26]); - a[30] = vaddq_s16(b[30], b[25]); - a[31] = vaddq_s16(b[31], b[24]); - - // Stage 4. - b[0] = vaddq_s16(a[0], a[3]); - b[1] = vaddq_s16(a[1], a[2]); - b[2] = vsubq_s16(a[1], a[2]); - b[3] = vsubq_s16(a[0], a[3]); - - b[4] = a[4]; - - butterfly_one_coeff(a[6], a[5], cospi_16_64, &b[6], &b[5]); - - b[7] = a[7]; - - b[8] = vaddq_s16(a[8], a[11]); - b[9] = vaddq_s16(a[9], a[10]); - b[10] = vsubq_s16(a[9], a[10]); - b[11] = vsubq_s16(a[8], a[11]); - b[12] = vsubq_s16(a[15], a[12]); - b[13] = vsubq_s16(a[14], a[13]); - b[14] = vaddq_s16(a[14], a[13]); - b[15] = vaddq_s16(a[15], a[12]); - - b[16] = a[16]; - b[17] = a[17]; - - butterfly_two_coeff(a[29], a[18], cospi_24_64, cospi_8_64, &b[29], &b[18]); - butterfly_two_coeff(a[28], a[19], cospi_24_64, cospi_8_64, &b[28], &b[19]); - butterfly_two_coeff(a[27], a[20], -cospi_8_64, cospi_24_64, &b[27], &b[20]); - butterfly_two_coeff(a[26], a[21], -cospi_8_64, cospi_24_64, &b[26], &b[21]); - - b[22] = a[22]; - b[23] = a[23]; - b[24] = a[24]; - b[25] = a[25]; - - b[30] = a[30]; - b[31] = a[31]; - - // Stage 5. - butterfly_one_coeff(b[0], b[1], cospi_16_64, &a[0], &a[1]); - butterfly_two_coeff(b[3], b[2], cospi_24_64, cospi_8_64, &a[2], &a[3]); - - a[4] = vaddq_s16(b[4], b[5]); - a[5] = vsubq_s16(b[4], b[5]); - a[6] = vsubq_s16(b[7], b[6]); - a[7] = vaddq_s16(b[7], b[6]); - - a[8] = b[8]; - - butterfly_two_coeff(b[14], b[9], cospi_24_64, cospi_8_64, &a[14], &a[9]); - butterfly_two_coeff(b[13], b[10], -cospi_8_64, cospi_24_64, &a[13], &a[10]); - - a[11] = b[11]; - a[12] = b[12]; - - a[15] = b[15]; - - a[16] = vaddq_s16(b[19], b[16]); - a[17] = vaddq_s16(b[18], b[17]); - a[18] = vsubq_s16(b[17], b[18]); - a[19] = vsubq_s16(b[16], b[19]); - a[20] = vsubq_s16(b[23], b[20]); - a[21] = vsubq_s16(b[22], b[21]); - a[22] = vaddq_s16(b[21], b[22]); - a[23] = vaddq_s16(b[20], b[23]); - a[24] = vaddq_s16(b[27], b[24]); - a[25] = vaddq_s16(b[26], b[25]); - a[26] = vsubq_s16(b[25], b[26]); - a[27] = vsubq_s16(b[24], b[27]); - a[28] = vsubq_s16(b[31], b[28]); - a[29] = vsubq_s16(b[30], b[29]); - a[30] = vaddq_s16(b[29], b[30]); - a[31] = vaddq_s16(b[28], b[31]); - - // Stage 6. - b[0] = a[0]; - b[1] = a[1]; - b[2] = a[2]; - b[3] = a[3]; - - butterfly_two_coeff(a[7], a[4], cospi_28_64, cospi_4_64, &b[4], &b[7]); - butterfly_two_coeff(a[6], a[5], cospi_12_64, cospi_20_64, &b[5], &b[6]); - - b[8] = vaddq_s16(a[8], a[9]); - b[9] = vsubq_s16(a[8], a[9]); - b[10] = vsubq_s16(a[11], a[10]); - b[11] = vaddq_s16(a[11], a[10]); - b[12] = vaddq_s16(a[12], a[13]); - b[13] = vsubq_s16(a[12], a[13]); - b[14] = vsubq_s16(a[15], a[14]); - b[15] = vaddq_s16(a[15], a[14]); - - b[16] = a[16]; - b[19] = a[19]; - b[20] = a[20]; - b[23] = a[23]; - b[24] = a[24]; - b[27] = a[27]; - b[28] = a[28]; - b[31] = a[31]; - - butterfly_two_coeff(a[30], a[17], cospi_28_64, cospi_4_64, &b[30], &b[17]); - butterfly_two_coeff(a[29], a[18], -cospi_4_64, cospi_28_64, &b[29], &b[18]); - - butterfly_two_coeff(a[26], a[21], cospi_12_64, cospi_20_64, &b[26], &b[21]); - butterfly_two_coeff(a[25], a[22], -cospi_20_64, cospi_12_64, &b[25], &b[22]); - - // Stage 7. - a[0] = b[0]; - a[1] = b[1]; - a[2] = b[2]; - a[3] = b[3]; - a[4] = b[4]; - a[5] = b[5]; - a[6] = b[6]; - a[7] = b[7]; - - butterfly_two_coeff(b[15], b[8], cospi_30_64, cospi_2_64, &a[8], &a[15]); - butterfly_two_coeff(b[14], b[9], cospi_14_64, cospi_18_64, &a[9], &a[14]); - butterfly_two_coeff(b[13], b[10], cospi_22_64, cospi_10_64, &a[10], &a[13]); - butterfly_two_coeff(b[12], b[11], cospi_6_64, cospi_26_64, &a[11], &a[12]); - - a[16] = vaddq_s16(b[16], b[17]); - a[17] = vsubq_s16(b[16], b[17]); - a[18] = vsubq_s16(b[19], b[18]); - a[19] = vaddq_s16(b[19], b[18]); - a[20] = vaddq_s16(b[20], b[21]); - a[21] = vsubq_s16(b[20], b[21]); - a[22] = vsubq_s16(b[23], b[22]); - a[23] = vaddq_s16(b[23], b[22]); - a[24] = vaddq_s16(b[24], b[25]); - a[25] = vsubq_s16(b[24], b[25]); - a[26] = vsubq_s16(b[27], b[26]); - a[27] = vaddq_s16(b[27], b[26]); - a[28] = vaddq_s16(b[28], b[29]); - a[29] = vsubq_s16(b[28], b[29]); - a[30] = vsubq_s16(b[31], b[30]); - a[31] = vaddq_s16(b[31], b[30]); - - // Final stage. - out[0] = a[0]; - out[16] = a[1]; - out[8] = a[2]; - out[24] = a[3]; - out[4] = a[4]; - out[20] = a[5]; - out[12] = a[6]; - out[28] = a[7]; - out[2] = a[8]; - out[18] = a[9]; - out[10] = a[10]; - out[26] = a[11]; - out[6] = a[12]; - out[22] = a[13]; - out[14] = a[14]; - out[30] = a[15]; - - butterfly_two_coeff(a[31], a[16], cospi_31_64, cospi_1_64, &out[1], &out[31]); - butterfly_two_coeff(a[30], a[17], cospi_15_64, cospi_17_64, &out[17], - &out[15]); - butterfly_two_coeff(a[29], a[18], cospi_23_64, cospi_9_64, &out[9], &out[23]); - butterfly_two_coeff(a[28], a[19], cospi_7_64, cospi_25_64, &out[25], &out[7]); - butterfly_two_coeff(a[27], a[20], cospi_27_64, cospi_5_64, &out[5], &out[27]); - butterfly_two_coeff(a[26], a[21], cospi_11_64, cospi_21_64, &out[21], - &out[11]); - butterfly_two_coeff(a[25], a[22], cospi_19_64, cospi_13_64, &out[13], - &out[19]); - butterfly_two_coeff(a[24], a[23], cospi_3_64, cospi_29_64, &out[29], &out[3]); -} - -#undef PASS_THROUGH -#undef ADD_S16_S32 -#undef SUB_S16_S32 -#undef ADDW_S16_S32 -#undef SUBW_S16_S32 -#undef ADD_S32 -#undef SUB_S32 -#undef BUTTERFLY_ONE_S16_S32 -#undef BUTTERFLY_ONE_S32 -#undef BUTTERFLY_TWO_S32 - void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) { int16x8_t temp0[32]; int16x8_t temp1[32]; @@ -1159,17 +43,21 @@ void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) { int16x8_t temp5[32]; // Process in 8x32 columns. - load(input, stride, temp0); - dct_body_first_pass(temp0, temp1); + load_cross(input, stride, temp0); + scale_input(temp0, temp5); + dct_body_first_pass(temp5, temp1); - load(input + 8, stride, temp0); - dct_body_first_pass(temp0, temp2); + load_cross(input + 8, stride, temp0); + scale_input(temp0, temp5); + dct_body_first_pass(temp5, temp2); - load(input + 16, stride, temp0); - dct_body_first_pass(temp0, temp3); + load_cross(input + 16, stride, temp0); + scale_input(temp0, temp5); + dct_body_first_pass(temp5, temp3); - load(input + 24, stride, temp0); - dct_body_first_pass(temp0, temp4); + load_cross(input + 24, stride, temp0); + scale_input(temp0, temp5); + dct_body_first_pass(temp5, temp4); // Generate the top row by munging the first set of 8 from each one together. transpose_s16_8x8_new(&temp1[0], &temp0[0]); @@ -1254,17 +142,21 @@ void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output, int16x8_t temp5[32]; // Process in 8x32 columns. - load(input, stride, temp0); - dct_body_first_pass(temp0, temp1); + load_cross(input, stride, temp0); + scale_input(temp0, temp5); + dct_body_first_pass(temp5, temp1); - load(input + 8, stride, temp0); - dct_body_first_pass(temp0, temp2); + load_cross(input + 8, stride, temp0); + scale_input(temp0, temp5); + dct_body_first_pass(temp5, temp2); - load(input + 16, stride, temp0); - dct_body_first_pass(temp0, temp3); + load_cross(input + 16, stride, temp0); + scale_input(temp0, temp5); + dct_body_first_pass(temp5, temp3); - load(input + 24, stride, temp0); - dct_body_first_pass(temp0, temp4); + load_cross(input + 24, stride, temp0); + scale_input(temp0, temp5); + dct_body_first_pass(temp5, temp4); // Generate the top row by munging the first set of 8 from each one together. transpose_s16_8x8_new(&temp1[0], &temp0[0]); diff --git a/vpx_dsp/arm/fdct32x32_neon.h b/vpx_dsp/arm/fdct32x32_neon.h new file mode 100644 index 0000000000..dd647918b2 --- /dev/null +++ b/vpx_dsp/arm/fdct32x32_neon.h @@ -0,0 +1,1105 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_ARM_FDCT32X32_NEON_H_ +#define VPX_VPX_DSP_ARM_FDCT32X32_NEON_H_ + +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/txfm_common.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/arm/fdct_neon.h" + +// Load & cross the first 8 and last 8, then the middle +static INLINE void load_cross(const int16_t *a, int stride, int16x8_t *b) { + b[0] = vaddq_s16(vld1q_s16(a + 0 * stride), vld1q_s16(a + 31 * stride)); + b[1] = vaddq_s16(vld1q_s16(a + 1 * stride), vld1q_s16(a + 30 * stride)); + b[2] = vaddq_s16(vld1q_s16(a + 2 * stride), vld1q_s16(a + 29 * stride)); + b[3] = vaddq_s16(vld1q_s16(a + 3 * stride), vld1q_s16(a + 28 * stride)); + b[4] = vaddq_s16(vld1q_s16(a + 4 * stride), vld1q_s16(a + 27 * stride)); + b[5] = vaddq_s16(vld1q_s16(a + 5 * stride), vld1q_s16(a + 26 * stride)); + b[6] = vaddq_s16(vld1q_s16(a + 6 * stride), vld1q_s16(a + 25 * stride)); + b[7] = vaddq_s16(vld1q_s16(a + 7 * stride), vld1q_s16(a + 24 * stride)); + + b[24] = vsubq_s16(vld1q_s16(a + 7 * stride), vld1q_s16(a + 24 * stride)); + b[25] = vsubq_s16(vld1q_s16(a + 6 * stride), vld1q_s16(a + 25 * stride)); + b[26] = vsubq_s16(vld1q_s16(a + 5 * stride), vld1q_s16(a + 26 * stride)); + b[27] = vsubq_s16(vld1q_s16(a + 4 * stride), vld1q_s16(a + 27 * stride)); + b[28] = vsubq_s16(vld1q_s16(a + 3 * stride), vld1q_s16(a + 28 * stride)); + b[29] = vsubq_s16(vld1q_s16(a + 2 * stride), vld1q_s16(a + 29 * stride)); + b[30] = vsubq_s16(vld1q_s16(a + 1 * stride), vld1q_s16(a + 30 * stride)); + b[31] = vsubq_s16(vld1q_s16(a + 0 * stride), vld1q_s16(a + 31 * stride)); + + b[8] = vaddq_s16(vld1q_s16(a + 8 * stride), vld1q_s16(a + 23 * stride)); + b[9] = vaddq_s16(vld1q_s16(a + 9 * stride), vld1q_s16(a + 22 * stride)); + b[10] = vaddq_s16(vld1q_s16(a + 10 * stride), vld1q_s16(a + 21 * stride)); + b[11] = vaddq_s16(vld1q_s16(a + 11 * stride), vld1q_s16(a + 20 * stride)); + b[12] = vaddq_s16(vld1q_s16(a + 12 * stride), vld1q_s16(a + 19 * stride)); + b[13] = vaddq_s16(vld1q_s16(a + 13 * stride), vld1q_s16(a + 18 * stride)); + b[14] = vaddq_s16(vld1q_s16(a + 14 * stride), vld1q_s16(a + 17 * stride)); + b[15] = vaddq_s16(vld1q_s16(a + 15 * stride), vld1q_s16(a + 16 * stride)); + + b[16] = vsubq_s16(vld1q_s16(a + 15 * stride), vld1q_s16(a + 16 * stride)); + b[17] = vsubq_s16(vld1q_s16(a + 14 * stride), vld1q_s16(a + 17 * stride)); + b[18] = vsubq_s16(vld1q_s16(a + 13 * stride), vld1q_s16(a + 18 * stride)); + b[19] = vsubq_s16(vld1q_s16(a + 12 * stride), vld1q_s16(a + 19 * stride)); + b[20] = vsubq_s16(vld1q_s16(a + 11 * stride), vld1q_s16(a + 20 * stride)); + b[21] = vsubq_s16(vld1q_s16(a + 10 * stride), vld1q_s16(a + 21 * stride)); + b[22] = vsubq_s16(vld1q_s16(a + 9 * stride), vld1q_s16(a + 22 * stride)); + b[23] = vsubq_s16(vld1q_s16(a + 8 * stride), vld1q_s16(a + 23 * stride)); +} + +#define STORE_S16(src, index, dest) \ + do { \ + store_s16q_to_tran_low(dest, src[index]); \ + dest += 8; \ + } while (0) + +// Store 32 16x8 values, assuming stride == 32. +// Slight twist: store horizontally in blocks of 8. +static INLINE void store(tran_low_t *a, const int16x8_t *b) { + STORE_S16(b, 0, a); + STORE_S16(b, 8, a); + STORE_S16(b, 16, a); + STORE_S16(b, 24, a); + STORE_S16(b, 1, a); + STORE_S16(b, 9, a); + STORE_S16(b, 17, a); + STORE_S16(b, 25, a); + STORE_S16(b, 2, a); + STORE_S16(b, 10, a); + STORE_S16(b, 18, a); + STORE_S16(b, 26, a); + STORE_S16(b, 3, a); + STORE_S16(b, 11, a); + STORE_S16(b, 19, a); + STORE_S16(b, 27, a); + STORE_S16(b, 4, a); + STORE_S16(b, 12, a); + STORE_S16(b, 20, a); + STORE_S16(b, 28, a); + STORE_S16(b, 5, a); + STORE_S16(b, 13, a); + STORE_S16(b, 21, a); + STORE_S16(b, 29, a); + STORE_S16(b, 6, a); + STORE_S16(b, 14, a); + STORE_S16(b, 22, a); + STORE_S16(b, 30, a); + STORE_S16(b, 7, a); + STORE_S16(b, 15, a); + STORE_S16(b, 23, a); + STORE_S16(b, 31, a); +} + +#undef STORE_S16 + +static INLINE void scale_input(const int16x8_t *in /*32*/, + int16x8_t *out /*32*/) { + out[0] = vshlq_n_s16(in[0], 2); + out[1] = vshlq_n_s16(in[1], 2); + out[2] = vshlq_n_s16(in[2], 2); + out[3] = vshlq_n_s16(in[3], 2); + out[4] = vshlq_n_s16(in[4], 2); + out[5] = vshlq_n_s16(in[5], 2); + out[6] = vshlq_n_s16(in[6], 2); + out[7] = vshlq_n_s16(in[7], 2); + + out[8] = vshlq_n_s16(in[8], 2); + out[9] = vshlq_n_s16(in[9], 2); + out[10] = vshlq_n_s16(in[10], 2); + out[11] = vshlq_n_s16(in[11], 2); + out[12] = vshlq_n_s16(in[12], 2); + out[13] = vshlq_n_s16(in[13], 2); + out[14] = vshlq_n_s16(in[14], 2); + out[15] = vshlq_n_s16(in[15], 2); + + out[16] = vshlq_n_s16(in[16], 2); + out[17] = vshlq_n_s16(in[17], 2); + out[18] = vshlq_n_s16(in[18], 2); + out[19] = vshlq_n_s16(in[19], 2); + out[20] = vshlq_n_s16(in[20], 2); + out[21] = vshlq_n_s16(in[21], 2); + out[22] = vshlq_n_s16(in[22], 2); + out[23] = vshlq_n_s16(in[23], 2); + + out[24] = vshlq_n_s16(in[24], 2); + out[25] = vshlq_n_s16(in[25], 2); + out[26] = vshlq_n_s16(in[26], 2); + out[27] = vshlq_n_s16(in[27], 2); + out[28] = vshlq_n_s16(in[28], 2); + out[29] = vshlq_n_s16(in[29], 2); + out[30] = vshlq_n_s16(in[30], 2); + out[31] = vshlq_n_s16(in[31], 2); +} + +static void dct_body_first_pass(const int16x8_t *in, int16x8_t *out) { + int16x8_t a[32]; + int16x8_t b[32]; + + // Stage 1: Done as part of the load. + + // Stage 2. + // Mini cross. X the first 16 values and the middle 8 of the second half. + a[0] = vaddq_s16(in[0], in[15]); + a[1] = vaddq_s16(in[1], in[14]); + a[2] = vaddq_s16(in[2], in[13]); + a[3] = vaddq_s16(in[3], in[12]); + a[4] = vaddq_s16(in[4], in[11]); + a[5] = vaddq_s16(in[5], in[10]); + a[6] = vaddq_s16(in[6], in[9]); + a[7] = vaddq_s16(in[7], in[8]); + + a[8] = vsubq_s16(in[7], in[8]); + a[9] = vsubq_s16(in[6], in[9]); + a[10] = vsubq_s16(in[5], in[10]); + a[11] = vsubq_s16(in[4], in[11]); + a[12] = vsubq_s16(in[3], in[12]); + a[13] = vsubq_s16(in[2], in[13]); + a[14] = vsubq_s16(in[1], in[14]); + a[15] = vsubq_s16(in[0], in[15]); + + a[16] = in[16]; + a[17] = in[17]; + a[18] = in[18]; + a[19] = in[19]; + + butterfly_one_coeff_s16_s32_narrow(in[27], in[20], cospi_16_64, &a[27], + &a[20]); + butterfly_one_coeff_s16_s32_narrow(in[26], in[21], cospi_16_64, &a[26], + &a[21]); + butterfly_one_coeff_s16_s32_narrow(in[25], in[22], cospi_16_64, &a[25], + &a[22]); + butterfly_one_coeff_s16_s32_narrow(in[24], in[23], cospi_16_64, &a[24], + &a[23]); + + a[28] = in[28]; + a[29] = in[29]; + a[30] = in[30]; + a[31] = in[31]; + + // Stage 3. + b[0] = vaddq_s16(a[0], a[7]); + b[1] = vaddq_s16(a[1], a[6]); + b[2] = vaddq_s16(a[2], a[5]); + b[3] = vaddq_s16(a[3], a[4]); + + b[4] = vsubq_s16(a[3], a[4]); + b[5] = vsubq_s16(a[2], a[5]); + b[6] = vsubq_s16(a[1], a[6]); + b[7] = vsubq_s16(a[0], a[7]); + + b[8] = a[8]; + b[9] = a[9]; + + butterfly_one_coeff_s16_s32_narrow(a[13], a[10], cospi_16_64, &b[13], &b[10]); + butterfly_one_coeff_s16_s32_narrow(a[12], a[11], cospi_16_64, &b[12], &b[11]); + + b[14] = a[14]; + b[15] = a[15]; + + b[16] = vaddq_s16(in[16], a[23]); + b[17] = vaddq_s16(in[17], a[22]); + b[18] = vaddq_s16(in[18], a[21]); + b[19] = vaddq_s16(in[19], a[20]); + + b[20] = vsubq_s16(in[19], a[20]); + b[21] = vsubq_s16(in[18], a[21]); + b[22] = vsubq_s16(in[17], a[22]); + b[23] = vsubq_s16(in[16], a[23]); + + b[24] = vsubq_s16(in[31], a[24]); + b[25] = vsubq_s16(in[30], a[25]); + b[26] = vsubq_s16(in[29], a[26]); + b[27] = vsubq_s16(in[28], a[27]); + + b[28] = vaddq_s16(in[28], a[27]); + b[29] = vaddq_s16(in[29], a[26]); + b[30] = vaddq_s16(in[30], a[25]); + b[31] = vaddq_s16(in[31], a[24]); + + // Stage 4. + a[0] = vaddq_s16(b[0], b[3]); + a[1] = vaddq_s16(b[1], b[2]); + a[2] = vsubq_s16(b[1], b[2]); + a[3] = vsubq_s16(b[0], b[3]); + + a[4] = b[4]; + + butterfly_one_coeff_s16_s32_narrow(b[6], b[5], cospi_16_64, &a[6], &a[5]); + + a[7] = b[7]; + + a[8] = vaddq_s16(b[8], b[11]); + a[9] = vaddq_s16(b[9], b[10]); + a[10] = vsubq_s16(b[9], b[10]); + a[11] = vsubq_s16(b[8], b[11]); + a[12] = vsubq_s16(b[15], b[12]); + a[13] = vsubq_s16(b[14], b[13]); + a[14] = vaddq_s16(b[14], b[13]); + a[15] = vaddq_s16(b[15], b[12]); + + a[16] = b[16]; + a[17] = b[17]; + + butterfly_two_coeff(b[29], b[18], cospi_8_64, cospi_24_64, &a[29], &a[18]); + butterfly_two_coeff(b[28], b[19], cospi_8_64, cospi_24_64, &a[28], &a[19]); + butterfly_two_coeff(b[27], b[20], cospi_24_64, -cospi_8_64, &a[27], &a[20]); + butterfly_two_coeff(b[26], b[21], cospi_24_64, -cospi_8_64, &a[26], &a[21]); + + a[22] = b[22]; + a[23] = b[23]; + a[24] = b[24]; + a[25] = b[25]; + + a[30] = b[30]; + a[31] = b[31]; + + // Stage 5. + butterfly_one_coeff_s16_fast(a[0], a[1], cospi_16_64, &b[0], &b[1]); + butterfly_two_coeff(a[3], a[2], cospi_8_64, cospi_24_64, &b[2], &b[3]); + + b[4] = vaddq_s16(a[4], a[5]); + b[5] = vsubq_s16(a[4], a[5]); + b[6] = vsubq_s16(a[7], a[6]); + b[7] = vaddq_s16(a[7], a[6]); + + b[8] = a[8]; + + butterfly_two_coeff(a[14], a[9], cospi_8_64, cospi_24_64, &b[14], &b[9]); + butterfly_two_coeff(a[13], a[10], cospi_24_64, -cospi_8_64, &b[13], &b[10]); + + b[11] = a[11]; + b[12] = a[12]; + + b[15] = a[15]; + + b[16] = vaddq_s16(a[19], a[16]); + b[17] = vaddq_s16(a[18], a[17]); + b[18] = vsubq_s16(a[17], a[18]); + b[19] = vsubq_s16(a[16], a[19]); + b[20] = vsubq_s16(a[23], a[20]); + b[21] = vsubq_s16(a[22], a[21]); + b[22] = vaddq_s16(a[21], a[22]); + b[23] = vaddq_s16(a[20], a[23]); + b[24] = vaddq_s16(a[27], a[24]); + b[25] = vaddq_s16(a[26], a[25]); + b[26] = vsubq_s16(a[25], a[26]); + b[27] = vsubq_s16(a[24], a[27]); + b[28] = vsubq_s16(a[31], a[28]); + b[29] = vsubq_s16(a[30], a[29]); + b[30] = vaddq_s16(a[29], a[30]); + b[31] = vaddq_s16(a[28], a[31]); + + // Stage 6. + a[0] = b[0]; + a[1] = b[1]; + a[2] = b[2]; + a[3] = b[3]; + + butterfly_two_coeff(b[7], b[4], cospi_4_64, cospi_28_64, &a[4], &a[7]); + butterfly_two_coeff(b[6], b[5], cospi_20_64, cospi_12_64, &a[5], &a[6]); + + a[8] = vaddq_s16(b[8], b[9]); + a[9] = vsubq_s16(b[8], b[9]); + a[10] = vsubq_s16(b[11], b[10]); + a[11] = vaddq_s16(b[11], b[10]); + a[12] = vaddq_s16(b[12], b[13]); + a[13] = vsubq_s16(b[12], b[13]); + a[14] = vsubq_s16(b[15], b[14]); + a[15] = vaddq_s16(b[15], b[14]); + + a[16] = b[16]; + a[19] = b[19]; + a[20] = b[20]; + a[23] = b[23]; + a[24] = b[24]; + a[27] = b[27]; + a[28] = b[28]; + a[31] = b[31]; + + butterfly_two_coeff(b[30], b[17], cospi_4_64, cospi_28_64, &a[30], &a[17]); + butterfly_two_coeff(b[29], b[18], cospi_28_64, -cospi_4_64, &a[29], &a[18]); + + butterfly_two_coeff(b[26], b[21], cospi_20_64, cospi_12_64, &a[26], &a[21]); + butterfly_two_coeff(b[25], b[22], cospi_12_64, -cospi_20_64, &a[25], &a[22]); + + // Stage 7. + b[0] = a[0]; + b[1] = a[1]; + b[2] = a[2]; + b[3] = a[3]; + b[4] = a[4]; + b[5] = a[5]; + b[6] = a[6]; + b[7] = a[7]; + + butterfly_two_coeff(a[15], a[8], cospi_2_64, cospi_30_64, &b[8], &b[15]); + butterfly_two_coeff(a[14], a[9], cospi_18_64, cospi_14_64, &b[9], &b[14]); + butterfly_two_coeff(a[13], a[10], cospi_10_64, cospi_22_64, &b[10], &b[13]); + butterfly_two_coeff(a[12], a[11], cospi_26_64, cospi_6_64, &b[11], &b[12]); + + b[16] = vaddq_s16(a[16], a[17]); + b[17] = vsubq_s16(a[16], a[17]); + b[18] = vsubq_s16(a[19], a[18]); + b[19] = vaddq_s16(a[19], a[18]); + b[20] = vaddq_s16(a[20], a[21]); + b[21] = vsubq_s16(a[20], a[21]); + b[22] = vsubq_s16(a[23], a[22]); + b[23] = vaddq_s16(a[23], a[22]); + b[24] = vaddq_s16(a[24], a[25]); + b[25] = vsubq_s16(a[24], a[25]); + b[26] = vsubq_s16(a[27], a[26]); + b[27] = vaddq_s16(a[27], a[26]); + b[28] = vaddq_s16(a[28], a[29]); + b[29] = vsubq_s16(a[28], a[29]); + b[30] = vsubq_s16(a[31], a[30]); + b[31] = vaddq_s16(a[31], a[30]); + + // Final stage. + // Also compute partial rounding shift: + // output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; + out[0] = sub_round_shift_s16(b[0]); + out[16] = sub_round_shift_s16(b[1]); + out[8] = sub_round_shift_s16(b[2]); + out[24] = sub_round_shift_s16(b[3]); + out[4] = sub_round_shift_s16(b[4]); + out[20] = sub_round_shift_s16(b[5]); + out[12] = sub_round_shift_s16(b[6]); + out[28] = sub_round_shift_s16(b[7]); + out[2] = sub_round_shift_s16(b[8]); + out[18] = sub_round_shift_s16(b[9]); + out[10] = sub_round_shift_s16(b[10]); + out[26] = sub_round_shift_s16(b[11]); + out[6] = sub_round_shift_s16(b[12]); + out[22] = sub_round_shift_s16(b[13]); + out[14] = sub_round_shift_s16(b[14]); + out[30] = sub_round_shift_s16(b[15]); + + butterfly_two_coeff(b[31], b[16], cospi_1_64, cospi_31_64, &a[1], &a[31]); + out[1] = sub_round_shift_s16(a[1]); + out[31] = sub_round_shift_s16(a[31]); + + butterfly_two_coeff(b[30], b[17], cospi_17_64, cospi_15_64, &a[17], &a[15]); + out[17] = sub_round_shift_s16(a[17]); + out[15] = sub_round_shift_s16(a[15]); + + butterfly_two_coeff(b[29], b[18], cospi_9_64, cospi_23_64, &a[9], &a[23]); + out[9] = sub_round_shift_s16(a[9]); + out[23] = sub_round_shift_s16(a[23]); + + butterfly_two_coeff(b[28], b[19], cospi_25_64, cospi_7_64, &a[25], &a[7]); + out[25] = sub_round_shift_s16(a[25]); + out[7] = sub_round_shift_s16(a[7]); + + butterfly_two_coeff(b[27], b[20], cospi_5_64, cospi_27_64, &a[5], &a[27]); + out[5] = sub_round_shift_s16(a[5]); + out[27] = sub_round_shift_s16(a[27]); + + butterfly_two_coeff(b[26], b[21], cospi_21_64, cospi_11_64, &a[21], &a[11]); + out[21] = sub_round_shift_s16(a[21]); + out[11] = sub_round_shift_s16(a[11]); + + butterfly_two_coeff(b[25], b[22], cospi_13_64, cospi_19_64, &a[13], &a[19]); + out[13] = sub_round_shift_s16(a[13]); + out[19] = sub_round_shift_s16(a[19]); + + butterfly_two_coeff(b[24], b[23], cospi_29_64, cospi_3_64, &a[29], &a[3]); + out[29] = sub_round_shift_s16(a[29]); + out[3] = sub_round_shift_s16(a[3]); +} + +#define PASS_THROUGH(src, dst, element) \ + do { \ + dst##_lo[element] = src##_lo[element]; \ + dst##_hi[element] = src##_hi[element]; \ + } while (0) + +#define ADD_S16_S32(a, left_index, right_index, b, b_index) \ + do { \ + b##_lo[b_index] = \ + vaddl_s16(vget_low_s16(a[left_index]), vget_low_s16(a[right_index])); \ + b##_hi[b_index] = vaddl_s16(vget_high_s16(a[left_index]), \ + vget_high_s16(a[right_index])); \ + } while (0) + +#define SUB_S16_S32(a, left_index, right_index, b, b_index) \ + do { \ + b##_lo[b_index] = \ + vsubl_s16(vget_low_s16(a[left_index]), vget_low_s16(a[right_index])); \ + b##_hi[b_index] = vsubl_s16(vget_high_s16(a[left_index]), \ + vget_high_s16(a[right_index])); \ + } while (0) + +#define ADDW_S16_S32(a, a_index, b, b_index, c, c_index) \ + do { \ + c##_lo[c_index] = vaddw_s16(a##_lo[a_index], vget_low_s16(b[b_index])); \ + c##_hi[c_index] = vaddw_s16(a##_hi[a_index], vget_high_s16(b[b_index])); \ + } while (0) + +#define SUBW_S16_S32(a, a_index, b, b_index, temp, temp_index, c, c_index) \ + do { \ + temp##_lo[temp_index] = vmovl_s16(vget_low_s16(a[a_index])); \ + temp##_hi[temp_index] = vmovl_s16(vget_high_s16(a[a_index])); \ + c##_lo[c_index] = vsubq_s32(temp##_lo[temp_index], b##_lo[b_index]); \ + c##_hi[c_index] = vsubq_s32(temp##_hi[temp_index], b##_hi[b_index]); \ + } while (0) + +#define ADD_S32(a, left_index, right_index, b, b_index) \ + do { \ + b##_lo[b_index] = vaddq_s32(a##_lo[left_index], a##_lo[right_index]); \ + b##_hi[b_index] = vaddq_s32(a##_hi[left_index], a##_hi[right_index]); \ + } while (0) + +#define SUB_S32(a, left_index, right_index, b, b_index) \ + do { \ + b##_lo[b_index] = vsubq_s32(a##_lo[left_index], a##_lo[right_index]); \ + b##_hi[b_index] = vsubq_s32(a##_hi[left_index], a##_hi[right_index]); \ + } while (0) + +#define BUTTERFLY_ONE_S16_S32(a, left_index, right_index, constant, b, \ + add_index, sub_index) \ + do { \ + butterfly_one_coeff_s16_s32(a[left_index], a[right_index], constant, \ + &b##_lo[add_index], &b##_hi[add_index], \ + &b##_lo[sub_index], &b##_hi[sub_index]); \ + } while (0) + +#define BUTTERFLY_ONE_S32(a, left_index, right_index, constant, b, add_index, \ + sub_index) \ + do { \ + butterfly_one_coeff_s32_fast( \ + a##_lo[left_index], a##_hi[left_index], a##_lo[right_index], \ + a##_hi[right_index], constant, &b##_lo[add_index], &b##_hi[add_index], \ + &b##_lo[sub_index], &b##_hi[sub_index]); \ + } while (0) + +#define BUTTERFLY_TWO_S32(a, left_index, right_index, left_constant, \ + right_constant, b, add_index, sub_index) \ + do { \ + butterfly_two_coeff_s32(a##_lo[left_index], a##_hi[left_index], \ + a##_lo[right_index], a##_hi[right_index], \ + left_constant, right_constant, &b##_lo[add_index], \ + &b##_hi[add_index], &b##_lo[sub_index], \ + &b##_hi[sub_index]); \ + } while (0) + +static void dct_body_second_pass(const int16x8_t *in, int16x8_t *out) { + int16x8_t a[32]; + int16x8_t b[32]; + int32x4_t c_lo[32]; + int32x4_t c_hi[32]; + int32x4_t d_lo[32]; + int32x4_t d_hi[32]; + + // Stage 1. Done as part of the load for the first pass. + a[0] = vaddq_s16(in[0], in[31]); + a[1] = vaddq_s16(in[1], in[30]); + a[2] = vaddq_s16(in[2], in[29]); + a[3] = vaddq_s16(in[3], in[28]); + a[4] = vaddq_s16(in[4], in[27]); + a[5] = vaddq_s16(in[5], in[26]); + a[6] = vaddq_s16(in[6], in[25]); + a[7] = vaddq_s16(in[7], in[24]); + a[8] = vaddq_s16(in[8], in[23]); + a[9] = vaddq_s16(in[9], in[22]); + a[10] = vaddq_s16(in[10], in[21]); + a[11] = vaddq_s16(in[11], in[20]); + a[12] = vaddq_s16(in[12], in[19]); + a[13] = vaddq_s16(in[13], in[18]); + a[14] = vaddq_s16(in[14], in[17]); + a[15] = vaddq_s16(in[15], in[16]); + a[16] = vsubq_s16(in[15], in[16]); + a[17] = vsubq_s16(in[14], in[17]); + a[18] = vsubq_s16(in[13], in[18]); + a[19] = vsubq_s16(in[12], in[19]); + a[20] = vsubq_s16(in[11], in[20]); + a[21] = vsubq_s16(in[10], in[21]); + a[22] = vsubq_s16(in[9], in[22]); + a[23] = vsubq_s16(in[8], in[23]); + a[24] = vsubq_s16(in[7], in[24]); + a[25] = vsubq_s16(in[6], in[25]); + a[26] = vsubq_s16(in[5], in[26]); + a[27] = vsubq_s16(in[4], in[27]); + a[28] = vsubq_s16(in[3], in[28]); + a[29] = vsubq_s16(in[2], in[29]); + a[30] = vsubq_s16(in[1], in[30]); + a[31] = vsubq_s16(in[0], in[31]); + + // Stage 2. + b[0] = vaddq_s16(a[0], a[15]); + b[1] = vaddq_s16(a[1], a[14]); + b[2] = vaddq_s16(a[2], a[13]); + b[3] = vaddq_s16(a[3], a[12]); + b[4] = vaddq_s16(a[4], a[11]); + b[5] = vaddq_s16(a[5], a[10]); + b[6] = vaddq_s16(a[6], a[9]); + b[7] = vaddq_s16(a[7], a[8]); + + b[8] = vsubq_s16(a[7], a[8]); + b[9] = vsubq_s16(a[6], a[9]); + b[10] = vsubq_s16(a[5], a[10]); + b[11] = vsubq_s16(a[4], a[11]); + b[12] = vsubq_s16(a[3], a[12]); + b[13] = vsubq_s16(a[2], a[13]); + b[14] = vsubq_s16(a[1], a[14]); + b[15] = vsubq_s16(a[0], a[15]); + + b[16] = a[16]; + b[17] = a[17]; + b[18] = a[18]; + b[19] = a[19]; + + butterfly_one_coeff_s16_s32_narrow(a[27], a[20], cospi_16_64, &b[27], &b[20]); + butterfly_one_coeff_s16_s32_narrow(a[26], a[21], cospi_16_64, &b[26], &b[21]); + butterfly_one_coeff_s16_s32_narrow(a[25], a[22], cospi_16_64, &b[25], &b[22]); + butterfly_one_coeff_s16_s32_narrow(a[24], a[23], cospi_16_64, &b[24], &b[23]); + + b[28] = a[28]; + b[29] = a[29]; + b[30] = a[30]; + b[31] = a[31]; + + // Stage 3. With extreme values for input this calculation rolls over int16_t. + // The sources for b[0] get added multiple times and, through testing, have + // been shown to overflow starting here. + ADD_S16_S32(b, 0, 7, c, 0); + ADD_S16_S32(b, 1, 6, c, 1); + ADD_S16_S32(b, 2, 5, c, 2); + ADD_S16_S32(b, 3, 4, c, 3); + SUB_S16_S32(b, 3, 4, c, 4); + SUB_S16_S32(b, 2, 5, c, 5); + SUB_S16_S32(b, 1, 6, c, 6); + SUB_S16_S32(b, 0, 7, c, 7); + + a[8] = b[8]; + a[9] = b[9]; + + BUTTERFLY_ONE_S16_S32(b, 13, 10, cospi_16_64, c, 13, 10); + BUTTERFLY_ONE_S16_S32(b, 12, 11, cospi_16_64, c, 12, 11); + + a[14] = b[14]; + a[15] = b[15]; + + ADD_S16_S32(b, 16, 23, c, 16); + ADD_S16_S32(b, 17, 22, c, 17); + ADD_S16_S32(b, 18, 21, c, 18); + ADD_S16_S32(b, 19, 20, c, 19); + SUB_S16_S32(b, 19, 20, c, 20); + SUB_S16_S32(b, 18, 21, c, 21); + SUB_S16_S32(b, 17, 22, c, 22); + SUB_S16_S32(b, 16, 23, c, 23); + SUB_S16_S32(b, 31, 24, c, 24); + SUB_S16_S32(b, 30, 25, c, 25); + SUB_S16_S32(b, 29, 26, c, 26); + SUB_S16_S32(b, 28, 27, c, 27); + ADD_S16_S32(b, 28, 27, c, 28); + ADD_S16_S32(b, 29, 26, c, 29); + ADD_S16_S32(b, 30, 25, c, 30); + ADD_S16_S32(b, 31, 24, c, 31); + + // Stage 4. + ADD_S32(c, 0, 3, d, 0); + ADD_S32(c, 1, 2, d, 1); + SUB_S32(c, 1, 2, d, 2); + SUB_S32(c, 0, 3, d, 3); + + PASS_THROUGH(c, d, 4); + + BUTTERFLY_ONE_S32(c, 6, 5, cospi_16_64, d, 6, 5); + + PASS_THROUGH(c, d, 7); + + ADDW_S16_S32(c, 11, a, 8, d, 8); + ADDW_S16_S32(c, 10, a, 9, d, 9); + SUBW_S16_S32(a, 9, c, 10, c, 9, d, 10); + SUBW_S16_S32(a, 8, c, 11, c, 8, d, 11); + SUBW_S16_S32(a, 15, c, 12, c, 15, d, 12); + SUBW_S16_S32(a, 14, c, 13, c, 14, d, 13); + ADDW_S16_S32(c, 13, b, 14, d, 14); + ADDW_S16_S32(c, 12, b, 15, d, 15); + + PASS_THROUGH(c, d, 16); + PASS_THROUGH(c, d, 17); + + BUTTERFLY_TWO_S32(c, 29, 18, cospi_8_64, cospi_24_64, d, 29, 18); + BUTTERFLY_TWO_S32(c, 28, 19, cospi_8_64, cospi_24_64, d, 28, 19); + BUTTERFLY_TWO_S32(c, 27, 20, cospi_24_64, -cospi_8_64, d, 27, 20); + BUTTERFLY_TWO_S32(c, 26, 21, cospi_24_64, -cospi_8_64, d, 26, 21); + + PASS_THROUGH(c, d, 22); + PASS_THROUGH(c, d, 23); + PASS_THROUGH(c, d, 24); + PASS_THROUGH(c, d, 25); + + PASS_THROUGH(c, d, 30); + PASS_THROUGH(c, d, 31); + + // Stage 5. + BUTTERFLY_ONE_S32(d, 0, 1, cospi_16_64, c, 0, 1); + BUTTERFLY_TWO_S32(d, 3, 2, cospi_8_64, cospi_24_64, c, 2, 3); + + ADD_S32(d, 4, 5, c, 4); + SUB_S32(d, 4, 5, c, 5); + SUB_S32(d, 7, 6, c, 6); + ADD_S32(d, 7, 6, c, 7); + + PASS_THROUGH(d, c, 8); + + BUTTERFLY_TWO_S32(d, 14, 9, cospi_8_64, cospi_24_64, c, 14, 9); + BUTTERFLY_TWO_S32(d, 13, 10, cospi_24_64, -cospi_8_64, c, 13, 10); + + PASS_THROUGH(d, c, 11); + PASS_THROUGH(d, c, 12); + PASS_THROUGH(d, c, 15); + + ADD_S32(d, 16, 19, c, 16); + ADD_S32(d, 17, 18, c, 17); + SUB_S32(d, 17, 18, c, 18); + SUB_S32(d, 16, 19, c, 19); + SUB_S32(d, 23, 20, c, 20); + SUB_S32(d, 22, 21, c, 21); + ADD_S32(d, 22, 21, c, 22); + ADD_S32(d, 23, 20, c, 23); + ADD_S32(d, 24, 27, c, 24); + ADD_S32(d, 25, 26, c, 25); + SUB_S32(d, 25, 26, c, 26); + SUB_S32(d, 24, 27, c, 27); + SUB_S32(d, 31, 28, c, 28); + SUB_S32(d, 30, 29, c, 29); + ADD_S32(d, 30, 29, c, 30); + ADD_S32(d, 31, 28, c, 31); + + // Stage 6. + PASS_THROUGH(c, d, 0); + PASS_THROUGH(c, d, 1); + PASS_THROUGH(c, d, 2); + PASS_THROUGH(c, d, 3); + + BUTTERFLY_TWO_S32(c, 7, 4, cospi_4_64, cospi_28_64, d, 4, 7); + BUTTERFLY_TWO_S32(c, 6, 5, cospi_20_64, cospi_12_64, d, 5, 6); + + ADD_S32(c, 8, 9, d, 8); + SUB_S32(c, 8, 9, d, 9); + SUB_S32(c, 11, 10, d, 10); + ADD_S32(c, 11, 10, d, 11); + ADD_S32(c, 12, 13, d, 12); + SUB_S32(c, 12, 13, d, 13); + SUB_S32(c, 15, 14, d, 14); + ADD_S32(c, 15, 14, d, 15); + + PASS_THROUGH(c, d, 16); + PASS_THROUGH(c, d, 19); + PASS_THROUGH(c, d, 20); + PASS_THROUGH(c, d, 23); + PASS_THROUGH(c, d, 24); + PASS_THROUGH(c, d, 27); + PASS_THROUGH(c, d, 28); + PASS_THROUGH(c, d, 31); + + BUTTERFLY_TWO_S32(c, 30, 17, cospi_4_64, cospi_28_64, d, 30, 17); + BUTTERFLY_TWO_S32(c, 29, 18, cospi_28_64, -cospi_4_64, d, 29, 18); + BUTTERFLY_TWO_S32(c, 26, 21, cospi_20_64, cospi_12_64, d, 26, 21); + BUTTERFLY_TWO_S32(c, 25, 22, cospi_12_64, -cospi_20_64, d, 25, 22); + + // Stage 7. + PASS_THROUGH(d, c, 0); + PASS_THROUGH(d, c, 1); + PASS_THROUGH(d, c, 2); + PASS_THROUGH(d, c, 3); + PASS_THROUGH(d, c, 4); + PASS_THROUGH(d, c, 5); + PASS_THROUGH(d, c, 6); + PASS_THROUGH(d, c, 7); + + BUTTERFLY_TWO_S32(d, 15, 8, cospi_2_64, cospi_30_64, c, 8, 15); + BUTTERFLY_TWO_S32(d, 14, 9, cospi_18_64, cospi_14_64, c, 9, 14); + BUTTERFLY_TWO_S32(d, 13, 10, cospi_10_64, cospi_22_64, c, 10, 13); + BUTTERFLY_TWO_S32(d, 12, 11, cospi_26_64, cospi_6_64, c, 11, 12); + + ADD_S32(d, 16, 17, c, 16); + SUB_S32(d, 16, 17, c, 17); + SUB_S32(d, 19, 18, c, 18); + ADD_S32(d, 19, 18, c, 19); + ADD_S32(d, 20, 21, c, 20); + SUB_S32(d, 20, 21, c, 21); + SUB_S32(d, 23, 22, c, 22); + ADD_S32(d, 23, 22, c, 23); + ADD_S32(d, 24, 25, c, 24); + SUB_S32(d, 24, 25, c, 25); + SUB_S32(d, 27, 26, c, 26); + ADD_S32(d, 27, 26, c, 27); + ADD_S32(d, 28, 29, c, 28); + SUB_S32(d, 28, 29, c, 29); + SUB_S32(d, 31, 30, c, 30); + ADD_S32(d, 31, 30, c, 31); + + // Final stage. + // Roll rounding into this function so we can pass back int16x8. + + out[0] = add_round_shift_s32_narrow(c_lo[0], c_hi[0]); + out[16] = add_round_shift_s32_narrow(c_lo[1], c_hi[1]); + + out[8] = add_round_shift_s32_narrow(c_lo[2], c_hi[2]); + out[24] = add_round_shift_s32_narrow(c_lo[3], c_hi[3]); + out[4] = add_round_shift_s32_narrow(c_lo[4], c_hi[4]); + out[20] = add_round_shift_s32_narrow(c_lo[5], c_hi[5]); + out[12] = add_round_shift_s32_narrow(c_lo[6], c_hi[6]); + + out[28] = add_round_shift_s32_narrow(c_lo[7], c_hi[7]); + out[2] = add_round_shift_s32_narrow(c_lo[8], c_hi[8]); + out[18] = add_round_shift_s32_narrow(c_lo[9], c_hi[9]); + out[10] = add_round_shift_s32_narrow(c_lo[10], c_hi[10]); + + out[26] = add_round_shift_s32_narrow(c_lo[11], c_hi[11]); + out[6] = add_round_shift_s32_narrow(c_lo[12], c_hi[12]); + out[22] = add_round_shift_s32_narrow(c_lo[13], c_hi[13]); + out[14] = add_round_shift_s32_narrow(c_lo[14], c_hi[14]); + out[30] = add_round_shift_s32_narrow(c_lo[15], c_hi[15]); + + BUTTERFLY_TWO_S32(c, 31, 16, cospi_1_64, cospi_31_64, d, 1, 31); + out[1] = add_round_shift_s32_narrow(d_lo[1], d_hi[1]); + out[31] = add_round_shift_s32_narrow(d_lo[31], d_hi[31]); + + BUTTERFLY_TWO_S32(c, 30, 17, cospi_17_64, cospi_15_64, d, 17, 15); + out[17] = add_round_shift_s32_narrow(d_lo[17], d_hi[17]); + out[15] = add_round_shift_s32_narrow(d_lo[15], d_hi[15]); + + BUTTERFLY_TWO_S32(c, 29, 18, cospi_9_64, cospi_23_64, d, 9, 23); + out[9] = add_round_shift_s32_narrow(d_lo[9], d_hi[9]); + out[23] = add_round_shift_s32_narrow(d_lo[23], d_hi[23]); + + BUTTERFLY_TWO_S32(c, 28, 19, cospi_25_64, cospi_7_64, d, 25, 7); + out[25] = add_round_shift_s32_narrow(d_lo[25], d_hi[25]); + out[7] = add_round_shift_s32_narrow(d_lo[7], d_hi[7]); + + BUTTERFLY_TWO_S32(c, 27, 20, cospi_5_64, cospi_27_64, d, 5, 27); + out[5] = add_round_shift_s32_narrow(d_lo[5], d_hi[5]); + out[27] = add_round_shift_s32_narrow(d_lo[27], d_hi[27]); + + BUTTERFLY_TWO_S32(c, 26, 21, cospi_21_64, cospi_11_64, d, 21, 11); + out[21] = add_round_shift_s32_narrow(d_lo[21], d_hi[21]); + out[11] = add_round_shift_s32_narrow(d_lo[11], d_hi[11]); + + BUTTERFLY_TWO_S32(c, 25, 22, cospi_13_64, cospi_19_64, d, 13, 19); + out[13] = add_round_shift_s32_narrow(d_lo[13], d_hi[13]); + out[19] = add_round_shift_s32_narrow(d_lo[19], d_hi[19]); + + BUTTERFLY_TWO_S32(c, 24, 23, cospi_29_64, cospi_3_64, d, 29, 3); + out[29] = add_round_shift_s32_narrow(d_lo[29], d_hi[29]); + out[3] = add_round_shift_s32_narrow(d_lo[3], d_hi[3]); +} + +static void dct_body_second_pass_rd(const int16x8_t *in, int16x8_t *out) { + int16x8_t a[32]; + int16x8_t b[32]; + + // Stage 1. Done as part of the load for the first pass. + a[0] = vaddq_s16(in[0], in[31]); + a[1] = vaddq_s16(in[1], in[30]); + a[2] = vaddq_s16(in[2], in[29]); + a[3] = vaddq_s16(in[3], in[28]); + a[4] = vaddq_s16(in[4], in[27]); + a[5] = vaddq_s16(in[5], in[26]); + a[6] = vaddq_s16(in[6], in[25]); + a[7] = vaddq_s16(in[7], in[24]); + a[8] = vaddq_s16(in[8], in[23]); + a[9] = vaddq_s16(in[9], in[22]); + a[10] = vaddq_s16(in[10], in[21]); + a[11] = vaddq_s16(in[11], in[20]); + a[12] = vaddq_s16(in[12], in[19]); + a[13] = vaddq_s16(in[13], in[18]); + a[14] = vaddq_s16(in[14], in[17]); + a[15] = vaddq_s16(in[15], in[16]); + a[16] = vsubq_s16(in[15], in[16]); + a[17] = vsubq_s16(in[14], in[17]); + a[18] = vsubq_s16(in[13], in[18]); + a[19] = vsubq_s16(in[12], in[19]); + a[20] = vsubq_s16(in[11], in[20]); + a[21] = vsubq_s16(in[10], in[21]); + a[22] = vsubq_s16(in[9], in[22]); + a[23] = vsubq_s16(in[8], in[23]); + a[24] = vsubq_s16(in[7], in[24]); + a[25] = vsubq_s16(in[6], in[25]); + a[26] = vsubq_s16(in[5], in[26]); + a[27] = vsubq_s16(in[4], in[27]); + a[28] = vsubq_s16(in[3], in[28]); + a[29] = vsubq_s16(in[2], in[29]); + a[30] = vsubq_s16(in[1], in[30]); + a[31] = vsubq_s16(in[0], in[31]); + + // Stage 2. + // For the "rd" version, all the values are rounded down after stage 2 to keep + // the values in 16 bits. + b[0] = add_round_shift_s16(vaddq_s16(a[0], a[15])); + b[1] = add_round_shift_s16(vaddq_s16(a[1], a[14])); + b[2] = add_round_shift_s16(vaddq_s16(a[2], a[13])); + b[3] = add_round_shift_s16(vaddq_s16(a[3], a[12])); + b[4] = add_round_shift_s16(vaddq_s16(a[4], a[11])); + b[5] = add_round_shift_s16(vaddq_s16(a[5], a[10])); + b[6] = add_round_shift_s16(vaddq_s16(a[6], a[9])); + b[7] = add_round_shift_s16(vaddq_s16(a[7], a[8])); + + b[8] = add_round_shift_s16(vsubq_s16(a[7], a[8])); + b[9] = add_round_shift_s16(vsubq_s16(a[6], a[9])); + b[10] = add_round_shift_s16(vsubq_s16(a[5], a[10])); + b[11] = add_round_shift_s16(vsubq_s16(a[4], a[11])); + b[12] = add_round_shift_s16(vsubq_s16(a[3], a[12])); + b[13] = add_round_shift_s16(vsubq_s16(a[2], a[13])); + b[14] = add_round_shift_s16(vsubq_s16(a[1], a[14])); + b[15] = add_round_shift_s16(vsubq_s16(a[0], a[15])); + + b[16] = add_round_shift_s16(a[16]); + b[17] = add_round_shift_s16(a[17]); + b[18] = add_round_shift_s16(a[18]); + b[19] = add_round_shift_s16(a[19]); + + butterfly_one_coeff_s16_s32_narrow(a[27], a[20], cospi_16_64, &b[27], &b[20]); + butterfly_one_coeff_s16_s32_narrow(a[26], a[21], cospi_16_64, &b[26], &b[21]); + butterfly_one_coeff_s16_s32_narrow(a[25], a[22], cospi_16_64, &b[25], &b[22]); + butterfly_one_coeff_s16_s32_narrow(a[24], a[23], cospi_16_64, &b[24], &b[23]); + b[20] = add_round_shift_s16(b[20]); + b[21] = add_round_shift_s16(b[21]); + b[22] = add_round_shift_s16(b[22]); + b[23] = add_round_shift_s16(b[23]); + b[24] = add_round_shift_s16(b[24]); + b[25] = add_round_shift_s16(b[25]); + b[26] = add_round_shift_s16(b[26]); + b[27] = add_round_shift_s16(b[27]); + + b[28] = add_round_shift_s16(a[28]); + b[29] = add_round_shift_s16(a[29]); + b[30] = add_round_shift_s16(a[30]); + b[31] = add_round_shift_s16(a[31]); + + // Stage 3. + a[0] = vaddq_s16(b[0], b[7]); + a[1] = vaddq_s16(b[1], b[6]); + a[2] = vaddq_s16(b[2], b[5]); + a[3] = vaddq_s16(b[3], b[4]); + + a[4] = vsubq_s16(b[3], b[4]); + a[5] = vsubq_s16(b[2], b[5]); + a[6] = vsubq_s16(b[1], b[6]); + a[7] = vsubq_s16(b[0], b[7]); + + a[8] = b[8]; + a[9] = b[9]; + + butterfly_one_coeff_s16_s32_narrow(b[13], b[10], cospi_16_64, &a[13], &a[10]); + butterfly_one_coeff_s16_s32_narrow(b[12], b[11], cospi_16_64, &a[12], &a[11]); + + a[14] = b[14]; + a[15] = b[15]; + + a[16] = vaddq_s16(b[16], b[23]); + a[17] = vaddq_s16(b[17], b[22]); + a[18] = vaddq_s16(b[18], b[21]); + a[19] = vaddq_s16(b[19], b[20]); + + a[20] = vsubq_s16(b[19], b[20]); + a[21] = vsubq_s16(b[18], b[21]); + a[22] = vsubq_s16(b[17], b[22]); + a[23] = vsubq_s16(b[16], b[23]); + + a[24] = vsubq_s16(b[31], b[24]); + a[25] = vsubq_s16(b[30], b[25]); + a[26] = vsubq_s16(b[29], b[26]); + a[27] = vsubq_s16(b[28], b[27]); + + a[28] = vaddq_s16(b[28], b[27]); + a[29] = vaddq_s16(b[29], b[26]); + a[30] = vaddq_s16(b[30], b[25]); + a[31] = vaddq_s16(b[31], b[24]); + + // Stage 4. + b[0] = vaddq_s16(a[0], a[3]); + b[1] = vaddq_s16(a[1], a[2]); + b[2] = vsubq_s16(a[1], a[2]); + b[3] = vsubq_s16(a[0], a[3]); + + b[4] = a[4]; + + butterfly_one_coeff_s16_s32_narrow(a[6], a[5], cospi_16_64, &b[6], &b[5]); + + b[7] = a[7]; + + b[8] = vaddq_s16(a[8], a[11]); + b[9] = vaddq_s16(a[9], a[10]); + b[10] = vsubq_s16(a[9], a[10]); + b[11] = vsubq_s16(a[8], a[11]); + b[12] = vsubq_s16(a[15], a[12]); + b[13] = vsubq_s16(a[14], a[13]); + b[14] = vaddq_s16(a[14], a[13]); + b[15] = vaddq_s16(a[15], a[12]); + + b[16] = a[16]; + b[17] = a[17]; + + butterfly_two_coeff(a[29], a[18], cospi_8_64, cospi_24_64, &b[29], &b[18]); + butterfly_two_coeff(a[28], a[19], cospi_8_64, cospi_24_64, &b[28], &b[19]); + butterfly_two_coeff(a[27], a[20], cospi_24_64, -cospi_8_64, &b[27], &b[20]); + butterfly_two_coeff(a[26], a[21], cospi_24_64, -cospi_8_64, &b[26], &b[21]); + + b[22] = a[22]; + b[23] = a[23]; + b[24] = a[24]; + b[25] = a[25]; + + b[30] = a[30]; + b[31] = a[31]; + + // Stage 5. + butterfly_one_coeff_s16_s32_narrow(b[0], b[1], cospi_16_64, &a[0], &a[1]); + butterfly_two_coeff(b[3], b[2], cospi_8_64, cospi_24_64, &a[2], &a[3]); + + a[4] = vaddq_s16(b[4], b[5]); + a[5] = vsubq_s16(b[4], b[5]); + a[6] = vsubq_s16(b[7], b[6]); + a[7] = vaddq_s16(b[7], b[6]); + + a[8] = b[8]; + + butterfly_two_coeff(b[14], b[9], cospi_8_64, cospi_24_64, &a[14], &a[9]); + butterfly_two_coeff(b[13], b[10], cospi_24_64, -cospi_8_64, &a[13], &a[10]); + + a[11] = b[11]; + a[12] = b[12]; + + a[15] = b[15]; + + a[16] = vaddq_s16(b[19], b[16]); + a[17] = vaddq_s16(b[18], b[17]); + a[18] = vsubq_s16(b[17], b[18]); + a[19] = vsubq_s16(b[16], b[19]); + a[20] = vsubq_s16(b[23], b[20]); + a[21] = vsubq_s16(b[22], b[21]); + a[22] = vaddq_s16(b[21], b[22]); + a[23] = vaddq_s16(b[20], b[23]); + a[24] = vaddq_s16(b[27], b[24]); + a[25] = vaddq_s16(b[26], b[25]); + a[26] = vsubq_s16(b[25], b[26]); + a[27] = vsubq_s16(b[24], b[27]); + a[28] = vsubq_s16(b[31], b[28]); + a[29] = vsubq_s16(b[30], b[29]); + a[30] = vaddq_s16(b[29], b[30]); + a[31] = vaddq_s16(b[28], b[31]); + + // Stage 6. + b[0] = a[0]; + b[1] = a[1]; + b[2] = a[2]; + b[3] = a[3]; + + butterfly_two_coeff(a[7], a[4], cospi_4_64, cospi_28_64, &b[4], &b[7]); + butterfly_two_coeff(a[6], a[5], cospi_20_64, cospi_12_64, &b[5], &b[6]); + + b[8] = vaddq_s16(a[8], a[9]); + b[9] = vsubq_s16(a[8], a[9]); + b[10] = vsubq_s16(a[11], a[10]); + b[11] = vaddq_s16(a[11], a[10]); + b[12] = vaddq_s16(a[12], a[13]); + b[13] = vsubq_s16(a[12], a[13]); + b[14] = vsubq_s16(a[15], a[14]); + b[15] = vaddq_s16(a[15], a[14]); + + b[16] = a[16]; + b[19] = a[19]; + b[20] = a[20]; + b[23] = a[23]; + b[24] = a[24]; + b[27] = a[27]; + b[28] = a[28]; + b[31] = a[31]; + + butterfly_two_coeff(a[30], a[17], cospi_4_64, cospi_28_64, &b[30], &b[17]); + butterfly_two_coeff(a[29], a[18], cospi_28_64, -cospi_4_64, &b[29], &b[18]); + + butterfly_two_coeff(a[26], a[21], cospi_20_64, cospi_12_64, &b[26], &b[21]); + butterfly_two_coeff(a[25], a[22], cospi_12_64, -cospi_20_64, &b[25], &b[22]); + + // Stage 7. + a[0] = b[0]; + a[1] = b[1]; + a[2] = b[2]; + a[3] = b[3]; + a[4] = b[4]; + a[5] = b[5]; + a[6] = b[6]; + a[7] = b[7]; + + butterfly_two_coeff(b[15], b[8], cospi_2_64, cospi_30_64, &a[8], &a[15]); + butterfly_two_coeff(b[14], b[9], cospi_18_64, cospi_14_64, &a[9], &a[14]); + butterfly_two_coeff(b[13], b[10], cospi_10_64, cospi_22_64, &a[10], &a[13]); + butterfly_two_coeff(b[12], b[11], cospi_26_64, cospi_6_64, &a[11], &a[12]); + + a[16] = vaddq_s16(b[16], b[17]); + a[17] = vsubq_s16(b[16], b[17]); + a[18] = vsubq_s16(b[19], b[18]); + a[19] = vaddq_s16(b[19], b[18]); + a[20] = vaddq_s16(b[20], b[21]); + a[21] = vsubq_s16(b[20], b[21]); + a[22] = vsubq_s16(b[23], b[22]); + a[23] = vaddq_s16(b[23], b[22]); + a[24] = vaddq_s16(b[24], b[25]); + a[25] = vsubq_s16(b[24], b[25]); + a[26] = vsubq_s16(b[27], b[26]); + a[27] = vaddq_s16(b[27], b[26]); + a[28] = vaddq_s16(b[28], b[29]); + a[29] = vsubq_s16(b[28], b[29]); + a[30] = vsubq_s16(b[31], b[30]); + a[31] = vaddq_s16(b[31], b[30]); + + // Final stage. + out[0] = a[0]; + out[16] = a[1]; + out[8] = a[2]; + out[24] = a[3]; + out[4] = a[4]; + out[20] = a[5]; + out[12] = a[6]; + out[28] = a[7]; + out[2] = a[8]; + out[18] = a[9]; + out[10] = a[10]; + out[26] = a[11]; + out[6] = a[12]; + out[22] = a[13]; + out[14] = a[14]; + out[30] = a[15]; + + butterfly_two_coeff(a[31], a[16], cospi_1_64, cospi_31_64, &out[1], &out[31]); + butterfly_two_coeff(a[30], a[17], cospi_17_64, cospi_15_64, &out[17], + &out[15]); + butterfly_two_coeff(a[29], a[18], cospi_9_64, cospi_23_64, &out[9], &out[23]); + butterfly_two_coeff(a[28], a[19], cospi_25_64, cospi_7_64, &out[25], &out[7]); + butterfly_two_coeff(a[27], a[20], cospi_5_64, cospi_27_64, &out[5], &out[27]); + butterfly_two_coeff(a[26], a[21], cospi_21_64, cospi_11_64, &out[21], + &out[11]); + butterfly_two_coeff(a[25], a[22], cospi_13_64, cospi_19_64, &out[13], + &out[19]); + butterfly_two_coeff(a[24], a[23], cospi_29_64, cospi_3_64, &out[29], &out[3]); +} + +#undef PASS_THROUGH +#undef ADD_S16_S32 +#undef SUB_S16_S32 +#undef ADDW_S16_S32 +#undef SUBW_S16_S32 +#undef ADD_S32 +#undef SUB_S32 +#undef BUTTERFLY_ONE_S16_S32 +#undef BUTTERFLY_ONE_S32 +#undef BUTTERFLY_TWO_S32 + +#endif // VPX_VPX_DSP_ARM_FDCT32X32_NEON_H_ diff --git a/vpx_dsp/arm/fdct4x4_neon.c b/vpx_dsp/arm/fdct4x4_neon.c index 11df7292d4..3b9196fae9 100644 --- a/vpx_dsp/arm/fdct4x4_neon.c +++ b/vpx_dsp/arm/fdct4x4_neon.c @@ -18,10 +18,10 @@ #include "vpx_dsp/arm/fdct_neon.h" #include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/arm/fdct4x4_neon.h" void vpx_fdct4x4_neon(const int16_t *input, tran_low_t *final_output, int stride) { - int i; // input[M * stride] * 16 int16x4_t in[4]; in[0] = vshl_n_s16(vld1_s16(input + 0 * stride), 4); @@ -34,9 +34,8 @@ void vpx_fdct4x4_neon(const int16_t *input, tran_low_t *final_output, const int16x4_t one = vreinterpret_s16_s64(vdup_n_s64(1)); in[0] = vadd_s16(in[0], one); } - for (i = 0; i < 2; ++i) { - vpx_fdct4x4_pass1_neon(in); - } + vpx_fdct4x4_pass1_neon(in); + vpx_fdct4x4_pass2_neon(in); { // Not quite a rounding shift. Only add 1 despite shifting by 2. const int16x8_t one = vdupq_n_s16(1); @@ -53,7 +52,6 @@ void vpx_fdct4x4_neon(const int16_t *input, tran_low_t *final_output, void vpx_highbd_fdct4x4_neon(const int16_t *input, tran_low_t *final_output, int stride) { - int i; static const int32x4_t const_1000 = { 1, 0, 0, 0 }; const int32x4_t const_one = vdupq_n_s32(1); @@ -69,9 +67,8 @@ void vpx_highbd_fdct4x4_neon(const int16_t *input, tran_low_t *final_output, in[0] = vaddq_s32(in[0], const_1000); } - for (i = 0; i < 2; ++i) { - vpx_highbd_fdct4x4_pass1_neon(in); - } + vpx_highbd_fdct4x4_pass1_neon(in); + vpx_highbd_fdct4x4_pass1_neon(in); { // Not quite a rounding shift. Only add 1 despite shifting by 2. in[0] = vshrq_n_s32(vaddq_s32(in[0], const_one), 2); diff --git a/vpx_dsp/arm/fdct4x4_neon.h b/vpx_dsp/arm/fdct4x4_neon.h new file mode 100644 index 0000000000..de3db9774c --- /dev/null +++ b/vpx_dsp/arm/fdct4x4_neon.h @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_ARM_FDCT4X4_NEON_H_ +#define VPX_VPX_DSP_ARM_FDCT4X4_NEON_H_ + +#include + +static INLINE void vpx_fdct4x4_pass1_neon(int16x4_t *in) { + int16x4_t out[4]; + + const int16x8_t input_01 = vcombine_s16(in[0], in[1]); + const int16x8_t input_32 = vcombine_s16(in[3], in[2]); + + // in_0 +/- in_3, in_1 +/- in_2 + const int16x8_t s_01 = vaddq_s16(input_01, input_32); + const int16x8_t s_32 = vsubq_s16(input_01, input_32); + + // step_0 +/- step_1, step_2 +/- step_3 + const int16x4_t s_0 = vget_low_s16(s_01); + const int16x4_t s_1 = vget_high_s16(s_01); + const int16x4_t s_2 = vget_high_s16(s_32); + const int16x4_t s_3 = vget_low_s16(s_32); + + // fdct_round_shift(s_0 +/- s_1) * cospi_16_64 + butterfly_one_coeff_s16_fast_half(s_0, s_1, cospi_16_64, &out[0], &out[2]); + + // s_3 * cospi_8_64 + s_2 * cospi_24_64 + // s_3 * cospi_24_64 - s_2 * cospi_8_64 + butterfly_two_coeff_half(s_3, s_2, cospi_8_64, cospi_24_64, &out[1], &out[3]); + + transpose_s16_4x4d(&out[0], &out[1], &out[2], &out[3]); + + in[0] = out[0]; + in[1] = out[1]; + in[2] = out[2]; + in[3] = out[3]; +} + +static INLINE void vpx_fdct4x4_pass2_neon(int16x4_t *in) { + int16x4_t out[4]; + + const int16x8_t input_01 = vcombine_s16(in[0], in[1]); + const int16x8_t input_32 = vcombine_s16(in[3], in[2]); + + // in_0 +/- in_3, in_1 +/- in_2 + const int16x8_t s_01 = vaddq_s16(input_01, input_32); + const int16x8_t s_32 = vsubq_s16(input_01, input_32); + + // step_0 +/- step_1, step_2 +/- step_3 + const int16x4_t s_0 = vget_low_s16(s_01); + const int16x4_t s_1 = vget_high_s16(s_01); + const int16x4_t s_2 = vget_high_s16(s_32); + const int16x4_t s_3 = vget_low_s16(s_32); + + // fdct_round_shift(s_0 +/- s_1) * cospi_16_64 + butterfly_one_coeff_s16_s32_fast_narrow_half(s_0, s_1, cospi_16_64, &out[0], + &out[2]); + + // s_3 * cospi_8_64 + s_2 * cospi_24_64 + // s_3 * cospi_24_64 - s_2 * cospi_8_64 + butterfly_two_coeff_half(s_3, s_2, cospi_8_64, cospi_24_64, &out[1], &out[3]); + + transpose_s16_4x4d(&out[0], &out[1], &out[2], &out[3]); + + in[0] = out[0]; + in[1] = out[1]; + in[2] = out[2]; + in[3] = out[3]; +} + +#if CONFIG_VP9_HIGHBITDEPTH + +static INLINE void vpx_highbd_fdct4x4_pass1_neon(int32x4_t *in) { + int32x4_t out[4]; + // in_0 +/- in_3, in_1 +/- in_2 + const int32x4_t s_0 = vaddq_s32(in[0], in[3]); + const int32x4_t s_1 = vaddq_s32(in[1], in[2]); + const int32x4_t s_2 = vsubq_s32(in[1], in[2]); + const int32x4_t s_3 = vsubq_s32(in[0], in[3]); + + butterfly_one_coeff_s32_fast_half(s_0, s_1, cospi_16_64, &out[0], &out[2]); + + // out[1] = s_3 * cospi_8_64 + s_2 * cospi_24_64 + // out[3] = s_3 * cospi_24_64 - s_2 * cospi_8_64 + butterfly_two_coeff_s32_s64_narrow_half(s_3, s_2, cospi_8_64, cospi_24_64, + &out[1], &out[3]); + + transpose_s32_4x4(&out[0], &out[1], &out[2], &out[3]); + + in[0] = out[0]; + in[1] = out[1]; + in[2] = out[2]; + in[3] = out[3]; +} + +#endif // CONFIG_VP9_HIGHBITDEPTH +#endif // VPX_VPX_DSP_ARM_FDCT4X4_NEON_H_ diff --git a/vpx_dsp/arm/fdct8x8_neon.c b/vpx_dsp/arm/fdct8x8_neon.c index 3fb15cc175..75ee6f2230 100644 --- a/vpx_dsp/arm/fdct8x8_neon.c +++ b/vpx_dsp/arm/fdct8x8_neon.c @@ -17,10 +17,10 @@ #include "vpx_dsp/arm/idct_neon.h" #include "vpx_dsp/arm/fdct_neon.h" #include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/fdct8x8_neon.h" void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *final_output, int stride) { - int i; // stage 1 int16x8_t in[8]; in[0] = vshlq_n_s16(vld1q_s16(&input[0 * stride]), 2); @@ -31,9 +31,9 @@ void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *final_output, in[5] = vshlq_n_s16(vld1q_s16(&input[5 * stride]), 2); in[6] = vshlq_n_s16(vld1q_s16(&input[6 * stride]), 2); in[7] = vshlq_n_s16(vld1q_s16(&input[7 * stride]), 2); - for (i = 0; i < 2; ++i) { - vpx_fdct8x8_pass1_neon(in); - } // for + + vpx_fdct8x8_pass1_neon(in); + vpx_fdct8x8_pass2_neon(in); { // from vpx_dct_sse2.c // Post-condition (division by two) @@ -71,8 +71,6 @@ void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *final_output, void vpx_highbd_fdct8x8_neon(const int16_t *input, tran_low_t *final_output, int stride) { - int i; - // input[M * stride] * 16 int32x4_t left[8], right[8]; int16x8_t in[8]; @@ -102,26 +100,25 @@ void vpx_highbd_fdct8x8_neon(const int16_t *input, tran_low_t *final_output, right[6] = vshll_n_s16(vget_high_s16(in[6]), 2); right[7] = vshll_n_s16(vget_high_s16(in[7]), 2); - for (i = 0; i < 2; ++i) { - vpx_highbd_fdct8x8_pass1_neon(left, right); - } + vpx_highbd_fdct8x8_pass1_neon(left, right); + vpx_highbd_fdct8x8_pass2_neon(left, right); { - left[0] = highbd_add_round_shift_s32(left[0]); - left[1] = highbd_add_round_shift_s32(left[1]); - left[2] = highbd_add_round_shift_s32(left[2]); - left[3] = highbd_add_round_shift_s32(left[3]); - left[4] = highbd_add_round_shift_s32(left[4]); - left[5] = highbd_add_round_shift_s32(left[5]); - left[6] = highbd_add_round_shift_s32(left[6]); - left[7] = highbd_add_round_shift_s32(left[7]); - right[0] = highbd_add_round_shift_s32(right[0]); - right[1] = highbd_add_round_shift_s32(right[1]); - right[2] = highbd_add_round_shift_s32(right[2]); - right[3] = highbd_add_round_shift_s32(right[3]); - right[4] = highbd_add_round_shift_s32(right[4]); - right[5] = highbd_add_round_shift_s32(right[5]); - right[6] = highbd_add_round_shift_s32(right[6]); - right[7] = highbd_add_round_shift_s32(right[7]); + left[0] = add_round_shift_half_s32(left[0]); + left[1] = add_round_shift_half_s32(left[1]); + left[2] = add_round_shift_half_s32(left[2]); + left[3] = add_round_shift_half_s32(left[3]); + left[4] = add_round_shift_half_s32(left[4]); + left[5] = add_round_shift_half_s32(left[5]); + left[6] = add_round_shift_half_s32(left[6]); + left[7] = add_round_shift_half_s32(left[7]); + right[0] = add_round_shift_half_s32(right[0]); + right[1] = add_round_shift_half_s32(right[1]); + right[2] = add_round_shift_half_s32(right[2]); + right[3] = add_round_shift_half_s32(right[3]); + right[4] = add_round_shift_half_s32(right[4]); + right[5] = add_round_shift_half_s32(right[5]); + right[6] = add_round_shift_half_s32(right[6]); + right[7] = add_round_shift_half_s32(right[7]); // store results vst1q_s32(final_output, left[0]); diff --git a/vpx_dsp/arm/fdct8x8_neon.h b/vpx_dsp/arm/fdct8x8_neon.h new file mode 100644 index 0000000000..d8fa600448 --- /dev/null +++ b/vpx_dsp/arm/fdct8x8_neon.h @@ -0,0 +1,381 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_ARM_FDCT8X8_NEON_H_ +#define VPX_VPX_DSP_ARM_FDCT8X8_NEON_H_ + +#include + +static INLINE void vpx_fdct8x8_pass1_notranspose_neon(int16x8_t *in, + int16x8_t *out) { + int16x8_t s[8], x[4], t[2]; + + s[0] = vaddq_s16(in[0], in[7]); + s[1] = vaddq_s16(in[1], in[6]); + s[2] = vaddq_s16(in[2], in[5]); + s[3] = vaddq_s16(in[3], in[4]); + s[4] = vsubq_s16(in[3], in[4]); + s[5] = vsubq_s16(in[2], in[5]); + s[6] = vsubq_s16(in[1], in[6]); + s[7] = vsubq_s16(in[0], in[7]); + // fdct4(step, step); + x[0] = vaddq_s16(s[0], s[3]); + x[1] = vaddq_s16(s[1], s[2]); + x[2] = vsubq_s16(s[1], s[2]); + x[3] = vsubq_s16(s[0], s[3]); + + // fdct4(step, step); + // out[0] = (tran_low_t)fdct_round_shift((x0 + x1) * cospi_16_64) + // out[4] = (tran_low_t)fdct_round_shift((x0 - x1) * cospi_16_64) + butterfly_one_coeff_s16_fast(x[0], x[1], cospi_16_64, &out[0], &out[4]); + // out[2] = (tran_low_t)fdct_round_shift(x2 * cospi_24_64 + x3 * cospi_8_64) + // out[6] = (tran_low_t)fdct_round_shift(-x2 * cospi_8_64 + x3 * cospi_24_64) + butterfly_two_coeff(x[3], x[2], cospi_8_64, cospi_24_64, &out[2], &out[6]); + + // Stage 2 + // t0 = (s6 - s5) * cospi_16_64; + // t1 = (s6 + s5) * cospi_16_64; + butterfly_one_coeff_s16_fast(s[6], s[5], cospi_16_64, &t[1], &t[0]); + + // Stage 3 + x[0] = vaddq_s16(s[4], t[0]); + x[1] = vsubq_s16(s[4], t[0]); + x[2] = vsubq_s16(s[7], t[1]); + x[3] = vaddq_s16(s[7], t[1]); + + // Stage 4 + // out[1] = (tran_low_t)fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64) + // out[7] = (tran_low_t)fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64) + butterfly_two_coeff(x[3], x[0], cospi_4_64, cospi_28_64, &out[1], &out[7]); + + // out[5] = (tran_low_t)fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64) + // out[3] = (tran_low_t)fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64) + butterfly_two_coeff(x[2], x[1], cospi_20_64, cospi_12_64, &out[5], &out[3]); +} + +static INLINE void vpx_fdct8x8_pass2_notranspose_neon(int16x8_t *in, + int16x8_t *out) { + int16x8_t s[8], x[4], t[2]; + + s[0] = vaddq_s16(in[0], in[7]); + s[1] = vaddq_s16(in[1], in[6]); + s[2] = vaddq_s16(in[2], in[5]); + s[3] = vaddq_s16(in[3], in[4]); + s[4] = vsubq_s16(in[3], in[4]); + s[5] = vsubq_s16(in[2], in[5]); + s[6] = vsubq_s16(in[1], in[6]); + s[7] = vsubq_s16(in[0], in[7]); + // fdct4(step, step); + x[0] = vaddq_s16(s[0], s[3]); + x[1] = vaddq_s16(s[1], s[2]); + x[2] = vsubq_s16(s[1], s[2]); + x[3] = vsubq_s16(s[0], s[3]); + + // fdct4(step, step); + // out[0] = (tran_low_t)fdct_round_shift((x0 + x1) * cospi_16_64) + // out[4] = (tran_low_t)fdct_round_shift((x0 - x1) * cospi_16_64) + butterfly_one_coeff_s16_s32_fast_narrow(x[0], x[1], cospi_16_64, &out[0], + &out[4]); + // out[2] = (tran_low_t)fdct_round_shift(x2 * cospi_24_64 + x3 * cospi_8_64) + // out[6] = (tran_low_t)fdct_round_shift(-x2 * cospi_8_64 + x3 * cospi_24_64) + butterfly_two_coeff(x[3], x[2], cospi_8_64, cospi_24_64, &out[2], &out[6]); + + // Stage 2 + // t0 = (s6 - s5) * cospi_16_64; + // t1 = (s6 + s5) * cospi_16_64; + butterfly_one_coeff_s16_s32_fast_narrow(s[6], s[5], cospi_16_64, &t[1], + &t[0]); + + // Stage 3 + x[0] = vaddq_s16(s[4], t[0]); + x[1] = vsubq_s16(s[4], t[0]); + x[2] = vsubq_s16(s[7], t[1]); + x[3] = vaddq_s16(s[7], t[1]); + + // Stage 4 + // out[1] = (tran_low_t)fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64) + // out[7] = (tran_low_t)fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64) + butterfly_two_coeff(x[3], x[0], cospi_4_64, cospi_28_64, &out[1], &out[7]); + + // out[5] = (tran_low_t)fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64) + // out[3] = (tran_low_t)fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64) + butterfly_two_coeff(x[2], x[1], cospi_20_64, cospi_12_64, &out[5], &out[3]); +} + +static INLINE void vpx_fdct8x8_pass1_neon(int16x8_t *in) { + int16x8_t out[8]; + vpx_fdct8x8_pass1_notranspose_neon(in, out); + // transpose 8x8 + transpose_s16_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5], + &out[6], &out[7]); + in[0] = out[0]; + in[1] = out[1]; + in[2] = out[2]; + in[3] = out[3]; + in[4] = out[4]; + in[5] = out[5]; + in[6] = out[6]; + in[7] = out[7]; +} + +static INLINE void vpx_fdct8x8_pass2_neon(int16x8_t *in) { + int16x8_t out[8]; + vpx_fdct8x8_pass2_notranspose_neon(in, out); + // transpose 8x8 + transpose_s16_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5], + &out[6], &out[7]); + in[0] = out[0]; + in[1] = out[1]; + in[2] = out[2]; + in[3] = out[3]; + in[4] = out[4]; + in[5] = out[5]; + in[6] = out[6]; + in[7] = out[7]; +} + +#if CONFIG_VP9_HIGHBITDEPTH +static INLINE void vpx_highbd_fdct8x8_pass1_notranspose_neon(int32x4_t *left, + int32x4_t *right) { + int32x4_t sl[8], sr[8], xl[4], xr[4], tl[4], tr[4]; + + sl[0] = vaddq_s32(left[0], left[7]); + sl[1] = vaddq_s32(left[1], left[6]); + sl[2] = vaddq_s32(left[2], left[5]); + sl[3] = vaddq_s32(left[3], left[4]); + sl[4] = vsubq_s32(left[3], left[4]); + sl[5] = vsubq_s32(left[2], left[5]); + sl[6] = vsubq_s32(left[1], left[6]); + sl[7] = vsubq_s32(left[0], left[7]); + sr[0] = vaddq_s32(right[0], right[7]); + sr[1] = vaddq_s32(right[1], right[6]); + sr[2] = vaddq_s32(right[2], right[5]); + sr[3] = vaddq_s32(right[3], right[4]); + sr[4] = vsubq_s32(right[3], right[4]); + sr[5] = vsubq_s32(right[2], right[5]); + sr[6] = vsubq_s32(right[1], right[6]); + sr[7] = vsubq_s32(right[0], right[7]); + + // fdct4(step, step); + // x0 = s0 + s3; + xl[0] = vaddq_s32(sl[0], sl[3]); + xr[0] = vaddq_s32(sr[0], sr[3]); + // x1 = s1 + s2; + xl[1] = vaddq_s32(sl[1], sl[2]); + xr[1] = vaddq_s32(sr[1], sr[2]); + // x2 = s1 - s2; + xl[2] = vsubq_s32(sl[1], sl[2]); + xr[2] = vsubq_s32(sr[1], sr[2]); + // x3 = s0 - s3; + xl[3] = vsubq_s32(sl[0], sl[3]); + xr[3] = vsubq_s32(sr[0], sr[3]); + + // fdct4(step, step); + // out[0] = (tran_low_t)fdct_round_shift((x0 + x1) * cospi_16_64) + // out[4] = (tran_low_t)fdct_round_shift((x0 - x1) * cospi_16_64) + butterfly_one_coeff_s32_fast(xl[0], xr[0], xl[1], xr[1], cospi_16_64, + &left[0], &right[0], &left[4], &right[4]); + // out[2] = (tran_low_t)fdct_round_shift(x2 * cospi_24_64 + x3 * cospi_8_64) + // out[6] = (tran_low_t)fdct_round_shift(-x2 * cospi_8_64 + x3 * cospi_24_64) + butterfly_two_coeff_s32(xl[3], xr[3], xl[2], xr[2], cospi_8_64, cospi_24_64, + &left[2], &right[2], &left[6], &right[6]); + + // Stage 2 + // t0 = (s6 - s5) * cospi_16_64; + // t1 = (s6 + s5) * cospi_16_64; + butterfly_one_coeff_s32_fast(sl[6], sr[6], sl[5], sr[5], cospi_16_64, &tl[1], + &tr[1], &tl[0], &tr[0]); + + // Stage 3 + xl[0] = vaddq_s32(sl[4], tl[0]); + xr[0] = vaddq_s32(sr[4], tr[0]); + xl[1] = vsubq_s32(sl[4], tl[0]); + xr[1] = vsubq_s32(sr[4], tr[0]); + xl[2] = vsubq_s32(sl[7], tl[1]); + xr[2] = vsubq_s32(sr[7], tr[1]); + xl[3] = vaddq_s32(sl[7], tl[1]); + xr[3] = vaddq_s32(sr[7], tr[1]); + + // Stage 4 + // out[1] = (tran_low_t)fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64) + // out[7] = (tran_low_t)fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64) + butterfly_two_coeff_s32(xl[3], xr[3], xl[0], xr[0], cospi_4_64, cospi_28_64, + &left[1], &right[1], &left[7], &right[7]); + + // out[5] = (tran_low_t)fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64) + // out[3] = (tran_low_t)fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64) + butterfly_two_coeff_s32(xl[2], xr[2], xl[1], xr[1], cospi_20_64, cospi_12_64, + &left[5], &right[5], &left[3], &right[3]); +} + +static INLINE void vpx_highbd_fdct8x8_pass2_notranspose_neon(int32x4_t *left, + int32x4_t *right) { + int32x4_t sl[8], sr[8], xl[4], xr[4], tl[4], tr[4]; + + sl[0] = vaddq_s32(left[0], left[7]); + sl[1] = vaddq_s32(left[1], left[6]); + sl[2] = vaddq_s32(left[2], left[5]); + sl[3] = vaddq_s32(left[3], left[4]); + sl[4] = vsubq_s32(left[3], left[4]); + sl[5] = vsubq_s32(left[2], left[5]); + sl[6] = vsubq_s32(left[1], left[6]); + sl[7] = vsubq_s32(left[0], left[7]); + sr[0] = vaddq_s32(right[0], right[7]); + sr[1] = vaddq_s32(right[1], right[6]); + sr[2] = vaddq_s32(right[2], right[5]); + sr[3] = vaddq_s32(right[3], right[4]); + sr[4] = vsubq_s32(right[3], right[4]); + sr[5] = vsubq_s32(right[2], right[5]); + sr[6] = vsubq_s32(right[1], right[6]); + sr[7] = vsubq_s32(right[0], right[7]); + + // fdct4(step, step); + // x0 = s0 + s3; + xl[0] = vaddq_s32(sl[0], sl[3]); + xr[0] = vaddq_s32(sr[0], sr[3]); + // x1 = s1 + s2; + xl[1] = vaddq_s32(sl[1], sl[2]); + xr[1] = vaddq_s32(sr[1], sr[2]); + // x2 = s1 - s2; + xl[2] = vsubq_s32(sl[1], sl[2]); + xr[2] = vsubq_s32(sr[1], sr[2]); + // x3 = s0 - s3; + xl[3] = vsubq_s32(sl[0], sl[3]); + xr[3] = vsubq_s32(sr[0], sr[3]); + + // fdct4(step, step); + // out[0] = (tran_low_t)fdct_round_shift((x0 + x1) * cospi_16_64) + // out[4] = (tran_low_t)fdct_round_shift((x0 - x1) * cospi_16_64) + butterfly_one_coeff_s32_fast(xl[0], xr[0], xl[1], xr[1], cospi_16_64, + &left[0], &right[0], &left[4], &right[4]); + // out[2] = (tran_low_t)fdct_round_shift(x2 * cospi_24_64 + x3 * cospi_8_64) + // out[6] = (tran_low_t)fdct_round_shift(-x2 * cospi_8_64 + x3 * cospi_24_64) + butterfly_two_coeff_s32_s64_narrow(xl[3], xr[3], xl[2], xr[2], cospi_8_64, + cospi_24_64, &left[2], &right[2], &left[6], + &right[6]); + + // Stage 2 + // t0 = (s6 - s5) * cospi_16_64; + // t1 = (s6 + s5) * cospi_16_64; + butterfly_one_coeff_s32_fast(sl[6], sr[6], sl[5], sr[5], cospi_16_64, &tl[1], + &tr[1], &tl[0], &tr[0]); + + // Stage 3 + xl[0] = vaddq_s32(sl[4], tl[0]); + xr[0] = vaddq_s32(sr[4], tr[0]); + xl[1] = vsubq_s32(sl[4], tl[0]); + xr[1] = vsubq_s32(sr[4], tr[0]); + xl[2] = vsubq_s32(sl[7], tl[1]); + xr[2] = vsubq_s32(sr[7], tr[1]); + xl[3] = vaddq_s32(sl[7], tl[1]); + xr[3] = vaddq_s32(sr[7], tr[1]); + + // Stage 4 + // out[1] = (tran_low_t)fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64) + // out[7] = (tran_low_t)fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64) + butterfly_two_coeff_s32_s64_narrow(xl[3], xr[3], xl[0], xr[0], cospi_4_64, + cospi_28_64, &left[1], &right[1], &left[7], + &right[7]); + + // out[5] = (tran_low_t)fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64) + // out[3] = (tran_low_t)fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64) + butterfly_two_coeff_s32_s64_narrow(xl[2], xr[2], xl[1], xr[1], cospi_20_64, + cospi_12_64, &left[5], &right[5], &left[3], + &right[3]); +} + +static INLINE void vpx_highbd_fdct8x8_pass1_neon(int32x4_t *left, + int32x4_t *right) { + int32x4x2_t out[8]; + vpx_highbd_fdct8x8_pass1_notranspose_neon(left, right); + + out[0].val[0] = left[0]; + out[0].val[1] = right[0]; + out[1].val[0] = left[1]; + out[1].val[1] = right[1]; + out[2].val[0] = left[2]; + out[2].val[1] = right[2]; + out[3].val[0] = left[3]; + out[3].val[1] = right[3]; + out[4].val[0] = left[4]; + out[4].val[1] = right[4]; + out[5].val[0] = left[5]; + out[5].val[1] = right[5]; + out[6].val[0] = left[6]; + out[6].val[1] = right[6]; + out[7].val[0] = left[7]; + out[7].val[1] = right[7]; + + transpose_s32_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5], + &out[6], &out[7]); + + left[0] = out[0].val[0]; + right[0] = out[0].val[1]; + left[1] = out[1].val[0]; + right[1] = out[1].val[1]; + left[2] = out[2].val[0]; + right[2] = out[2].val[1]; + left[3] = out[3].val[0]; + right[3] = out[3].val[1]; + left[4] = out[4].val[0]; + right[4] = out[4].val[1]; + left[5] = out[5].val[0]; + right[5] = out[5].val[1]; + left[6] = out[6].val[0]; + right[6] = out[6].val[1]; + left[7] = out[7].val[0]; + right[7] = out[7].val[1]; +} + +static INLINE void vpx_highbd_fdct8x8_pass2_neon(int32x4_t *left, + int32x4_t *right) { + int32x4x2_t out[8]; + vpx_highbd_fdct8x8_pass2_notranspose_neon(left, right); + + out[0].val[0] = left[0]; + out[0].val[1] = right[0]; + out[1].val[0] = left[1]; + out[1].val[1] = right[1]; + out[2].val[0] = left[2]; + out[2].val[1] = right[2]; + out[3].val[0] = left[3]; + out[3].val[1] = right[3]; + out[4].val[0] = left[4]; + out[4].val[1] = right[4]; + out[5].val[0] = left[5]; + out[5].val[1] = right[5]; + out[6].val[0] = left[6]; + out[6].val[1] = right[6]; + out[7].val[0] = left[7]; + out[7].val[1] = right[7]; + + transpose_s32_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5], + &out[6], &out[7]); + + left[0] = out[0].val[0]; + right[0] = out[0].val[1]; + left[1] = out[1].val[0]; + right[1] = out[1].val[1]; + left[2] = out[2].val[0]; + right[2] = out[2].val[1]; + left[3] = out[3].val[0]; + right[3] = out[3].val[1]; + left[4] = out[4].val[0]; + right[4] = out[4].val[1]; + left[5] = out[5].val[0]; + right[5] = out[5].val[1]; + left[6] = out[6].val[0]; + right[6] = out[6].val[1]; + left[7] = out[7].val[0]; + right[7] = out[7].val[1]; +} + +#endif // CONFIG_VP9_HIGHBITDEPTH +#endif // VPX_VPX_DSP_ARM_FDCT8X8_NEON_H_ diff --git a/vpx_dsp/arm/fdct_neon.h b/vpx_dsp/arm/fdct_neon.h index ce669061d2..1ea948b3f7 100644 --- a/vpx_dsp/arm/fdct_neon.h +++ b/vpx_dsp/arm/fdct_neon.h @@ -14,56 +14,94 @@ #include // fdct_round_shift((a +/- b) * c) -static INLINE void butterfly_one_coeff(const int16x8_t a, const int16x8_t b, - const tran_high_t constant, - int16x8_t *add, int16x8_t *sub) { - const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), constant); - const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), constant); - const int32x4_t sum0 = vmlal_n_s16(a0, vget_low_s16(b), constant); - const int32x4_t sum1 = vmlal_n_s16(a1, vget_high_s16(b), constant); - const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), constant); - const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), constant); - const int16x4_t rounded0 = vqrshrn_n_s32(sum0, DCT_CONST_BITS); - const int16x4_t rounded1 = vqrshrn_n_s32(sum1, DCT_CONST_BITS); - const int16x4_t rounded2 = vqrshrn_n_s32(diff0, DCT_CONST_BITS); - const int16x4_t rounded3 = vqrshrn_n_s32(diff1, DCT_CONST_BITS); - *add = vcombine_s16(rounded0, rounded1); - *sub = vcombine_s16(rounded2, rounded3); +// Variant that performs fast vqrdmulh_s16 operation on half vector +// can be slightly less accurate, adequate for pass1 +static INLINE void butterfly_one_coeff_s16_fast_half(const int16x4_t a, + const int16x4_t b, + const tran_coef_t constant, + int16x4_t *add, + int16x4_t *sub) { + int16x4_t c = vdup_n_s16(2 * constant); + *add = vqrdmulh_s16(vadd_s16(a, b), c); + *sub = vqrdmulh_s16(vsub_s16(a, b), c); } -// fdct_round_shift(a * c0 +/- b * c1) -static INLINE void butterfly_two_coeff(const int16x8_t a, const int16x8_t b, - const tran_coef_t constant0, - const tran_coef_t constant1, - int16x8_t *add, int16x8_t *sub) { - const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), constant0); - const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), constant0); - const int32x4_t a2 = vmull_n_s16(vget_low_s16(a), constant1); - const int32x4_t a3 = vmull_n_s16(vget_high_s16(a), constant1); - const int32x4_t sum0 = vmlal_n_s16(a2, vget_low_s16(b), constant0); - const int32x4_t sum1 = vmlal_n_s16(a3, vget_high_s16(b), constant0); - const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), constant1); - const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), constant1); - const int16x4_t rounded0 = vqrshrn_n_s32(sum0, DCT_CONST_BITS); - const int16x4_t rounded1 = vqrshrn_n_s32(sum1, DCT_CONST_BITS); - const int16x4_t rounded2 = vqrshrn_n_s32(diff0, DCT_CONST_BITS); - const int16x4_t rounded3 = vqrshrn_n_s32(diff1, DCT_CONST_BITS); - *add = vcombine_s16(rounded0, rounded1); - *sub = vcombine_s16(rounded2, rounded3); +// fdct_round_shift((a +/- b) * c) +// Variant that performs fast vqrdmulh_s16 operation on full vector +// can be slightly less accurate, adequate for pass1 +static INLINE void butterfly_one_coeff_s16_fast(const int16x8_t a, + const int16x8_t b, + const tran_coef_t constant, + int16x8_t *add, + int16x8_t *sub) { + int16x8_t c = vdupq_n_s16(2 * constant); + *add = vqrdmulhq_s16(vaddq_s16(a, b), c); + *sub = vqrdmulhq_s16(vsubq_s16(a, b), c); } -// Add 2 if positive, 1 if negative, and shift by 2. -// In practice, subtract the sign bit, then shift with rounding. -static INLINE int16x8_t sub_round_shift(const int16x8_t a) { - const uint16x8_t a_u16 = vreinterpretq_u16_s16(a); - const uint16x8_t a_sign_u16 = vshrq_n_u16(a_u16, 15); - const int16x8_t a_sign_s16 = vreinterpretq_s16_u16(a_sign_u16); - return vrshrq_n_s16(vsubq_s16(a, a_sign_s16), 2); +// fdct_round_shift((a +/- b) * c) +// Variant that performs fast vqrdmulhq_s32 operation on full vector +// more accurate does 32-bit processing, takes 16-bit input values, +// returns full 32-bit values, high/low +static INLINE void butterfly_one_coeff_s16_s32_fast( + const int16x8_t a, const int16x8_t b, const tran_coef_t constant, + int32x4_t *add_lo, int32x4_t *add_hi, int32x4_t *sub_lo, + int32x4_t *sub_hi) { + int32x4_t c = vdupq_n_s32(constant << 17); + const int16x4_t a_lo = vget_low_s16(a); + const int16x4_t a_hi = vget_high_s16(a); + const int16x4_t b_lo = vget_low_s16(b); + const int16x4_t b_hi = vget_high_s16(b); + *add_lo = vqrdmulhq_s32(vaddl_s16(a_lo, b_lo), c); + *add_hi = vqrdmulhq_s32(vaddl_s16(a_hi, b_hi), c); + *sub_lo = vqrdmulhq_s32(vsubl_s16(a_lo, b_lo), c); + *sub_hi = vqrdmulhq_s32(vsubl_s16(a_hi, b_hi), c); +} + +// fdct_round_shift((a +/- b) * c) +// Variant that performs fast vqrdmulhq_s32 operation on full vector +// more accurate does 32-bit processing, takes 16-bit input values, +// returns full 32-bit values, high/low +static INLINE void butterfly_one_coeff_s16_s32_fast_narrow( + const int16x8_t a, const int16x8_t b, const tran_coef_t constant, + int16x8_t *add, int16x8_t *sub) { + int32x4_t add_lo, add_hi, sub_lo, sub_hi; + butterfly_one_coeff_s16_s32_fast(a, b, constant, &add_lo, &add_hi, &sub_lo, + &sub_hi); + *add = vcombine_s16(vmovn_s32(add_lo), vmovn_s32(add_hi)); + *sub = vcombine_s16(vmovn_s32(sub_lo), vmovn_s32(sub_hi)); } -// Like butterfly_one_coeff, but don't narrow results. +// fdct_round_shift((a +/- b) * c) +// Variant that performs fast vqrdmulhq_s32 operation on full vector +// more accurate does 32-bit processing, takes 16-bit input values, +// returns full 32-bit values, high/low +static INLINE void butterfly_one_coeff_s16_s32_fast_half( + const int16x4_t a, const int16x4_t b, const tran_coef_t constant, + int32x4_t *add, int32x4_t *sub) { + int32x4_t c = vdupq_n_s32(constant << 17); + *add = vqrdmulhq_s32(vaddl_s16(a, b), c); + *sub = vqrdmulhq_s32(vsubl_s16(a, b), c); +} + +// fdct_round_shift((a +/- b) * c) +// Variant that performs fast vqrdmulhq_s32 operation on half vector +// more accurate does 32-bit processing, takes 16-bit input values, +// returns narrowed down 16-bit values +static INLINE void butterfly_one_coeff_s16_s32_fast_narrow_half( + const int16x4_t a, const int16x4_t b, const tran_coef_t constant, + int16x4_t *add, int16x4_t *sub) { + int32x4_t add32, sub32; + butterfly_one_coeff_s16_s32_fast_half(a, b, constant, &add32, &sub32); + *add = vmovn_s32(add32); + *sub = vmovn_s32(sub32); +} + +// fdct_round_shift((a +/- b) * c) +// Original Variant that performs normal implementation on full vector +// fully accurate does 32-bit processing, takes 16-bit values static INLINE void butterfly_one_coeff_s16_s32( - const int16x8_t a, const int16x8_t b, const tran_high_t constant, + const int16x8_t a, const int16x8_t b, const tran_coef_t constant, int32x4_t *add_lo, int32x4_t *add_hi, int32x4_t *sub_lo, int32x4_t *sub_hi) { const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), constant); @@ -78,37 +116,182 @@ static INLINE void butterfly_one_coeff_s16_s32( *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS); } -// Like butterfly_one_coeff, but with s32. -static INLINE void butterfly_one_coeff_s32( +// fdct_round_shift((a +/- b) * c) +// Original Variant that performs normal implementation on full vector +// fully accurate does 32-bit processing, takes 16-bit values +// returns narrowed down 16-bit values +static INLINE void butterfly_one_coeff_s16_s32_narrow( + const int16x8_t a, const int16x8_t b, const tran_coef_t constant, + int16x8_t *add, int16x8_t *sub) { + int32x4_t add32_lo, add32_hi, sub32_lo, sub32_hi; + butterfly_one_coeff_s16_s32(a, b, constant, &add32_lo, &add32_hi, &sub32_lo, + &sub32_hi); + *add = vcombine_s16(vmovn_s32(add32_lo), vmovn_s32(add32_hi)); + *sub = vcombine_s16(vmovn_s32(sub32_lo), vmovn_s32(sub32_hi)); +} + +// fdct_round_shift((a +/- b) * c) +// Variant that performs fast vqrdmulhq_s32 operation on full vector +// more accurate does 32-bit processing, takes and returns 32-bit values, +// high/low +static INLINE void butterfly_one_coeff_s32_fast_half(const int32x4_t a, + const int32x4_t b, + const tran_coef_t constant, + int32x4_t *add, + int32x4_t *sub) { + const int32x4_t c = vdupq_n_s32(constant << 17); + *add = vqrdmulhq_s32(vaddq_s32(a, b), c); + *sub = vqrdmulhq_s32(vsubq_s32(a, b), c); +} + +// fdct_round_shift((a +/- b) * c) +// Variant that performs fast vqrdmulhq_s32 operation on full vector +// more accurate does 32-bit processing, takes and returns 32-bit values, +// high/low +static INLINE void butterfly_one_coeff_s32_fast( const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo, - const int32x4_t b_hi, const int32_t constant, int32x4_t *add_lo, + const int32x4_t b_hi, const tran_coef_t constant, int32x4_t *add_lo, int32x4_t *add_hi, int32x4_t *sub_lo, int32x4_t *sub_hi) { - const int32x4_t a_lo_0 = vmulq_n_s32(a_lo, constant); - const int32x4_t a_hi_0 = vmulq_n_s32(a_hi, constant); - const int32x4_t sum0 = vmlaq_n_s32(a_lo_0, b_lo, constant); - const int32x4_t sum1 = vmlaq_n_s32(a_hi_0, b_hi, constant); - const int32x4_t diff0 = vmlsq_n_s32(a_lo_0, b_lo, constant); - const int32x4_t diff1 = vmlsq_n_s32(a_hi_0, b_hi, constant); - *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS); - *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS); - *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS); - *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS); + const int32x4_t c = vdupq_n_s32(constant << 17); + *add_lo = vqrdmulhq_s32(vaddq_s32(a_lo, b_lo), c); + *add_hi = vqrdmulhq_s32(vaddq_s32(a_hi, b_hi), c); + *sub_lo = vqrdmulhq_s32(vsubq_s32(a_lo, b_lo), c); + *sub_hi = vqrdmulhq_s32(vsubq_s32(a_hi, b_hi), c); +} + +// fdct_round_shift(a * c1 +/- b * c2) +// Variant that performs normal implementation on half vector +// more accurate does 64-bit processing, takes and returns 32-bit values +// returns narrowed results +static INLINE void butterfly_two_coeff_s32_s64_narrow_half( + const int32x4_t a, const int32x4_t b, const tran_coef_t constant1, + const tran_coef_t constant2, int32x4_t *add, int32x4_t *sub) { + const int32x2_t a_lo = vget_low_s32(a); + const int32x2_t a_hi = vget_high_s32(a); + const int32x2_t b_lo = vget_low_s32(b); + const int32x2_t b_hi = vget_high_s32(b); + + const int64x2_t axc0_64_lo = vmull_n_s32(a_lo, constant1); + const int64x2_t axc0_64_hi = vmull_n_s32(a_hi, constant1); + const int64x2_t axc1_64_lo = vmull_n_s32(a_lo, constant2); + const int64x2_t axc1_64_hi = vmull_n_s32(a_hi, constant2); + + const int64x2_t sum_lo = vmlal_n_s32(axc0_64_lo, b_lo, constant2); + const int64x2_t sum_hi = vmlal_n_s32(axc0_64_hi, b_hi, constant2); + const int64x2_t diff_lo = vmlsl_n_s32(axc1_64_lo, b_lo, constant1); + const int64x2_t diff_hi = vmlsl_n_s32(axc1_64_hi, b_hi, constant1); + + *add = vcombine_s32(vrshrn_n_s64(sum_lo, DCT_CONST_BITS), + vrshrn_n_s64(sum_hi, DCT_CONST_BITS)); + *sub = vcombine_s32(vrshrn_n_s64(diff_lo, DCT_CONST_BITS), + vrshrn_n_s64(diff_hi, DCT_CONST_BITS)); } -// Like butterfly_two_coeff, but with s32. +// fdct_round_shift(a * c1 +/- b * c2) +// Variant that performs normal implementation on full vector +// more accurate does 64-bit processing, takes and returns 32-bit values +// returns narrowed results +static INLINE void butterfly_two_coeff_s32_s64_narrow( + const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo, + const int32x4_t b_hi, const tran_coef_t constant1, + const tran_coef_t constant2, int32x4_t *add_lo, int32x4_t *add_hi, + int32x4_t *sub_lo, int32x4_t *sub_hi) { + // ac1/ac2 hold the following values: + // ac1: vget_low_s32(a_lo) * c1, vget_high_s32(a_lo) * c1, + // vget_low_s32(a_hi) * c1, vget_high_s32(a_hi) * c1 + // ac2: vget_low_s32(a_lo) * c2, vget_high_s32(a_lo) * c2, + // vget_low_s32(a_hi) * c2, vget_high_s32(a_hi) * c2 + int64x2_t ac1[4]; + int64x2_t ac2[4]; + int64x2_t sum[4]; + int64x2_t diff[4]; + + ac1[0] = vmull_n_s32(vget_low_s32(a_lo), constant1); + ac1[1] = vmull_n_s32(vget_high_s32(a_lo), constant1); + ac1[2] = vmull_n_s32(vget_low_s32(a_hi), constant1); + ac1[3] = vmull_n_s32(vget_high_s32(a_hi), constant1); + ac2[0] = vmull_n_s32(vget_low_s32(a_lo), constant2); + ac2[1] = vmull_n_s32(vget_high_s32(a_lo), constant2); + ac2[2] = vmull_n_s32(vget_low_s32(a_hi), constant2); + ac2[3] = vmull_n_s32(vget_high_s32(a_hi), constant2); + + sum[0] = vmlal_n_s32(ac1[0], vget_low_s32(b_lo), constant2); + sum[1] = vmlal_n_s32(ac1[1], vget_high_s32(b_lo), constant2); + sum[2] = vmlal_n_s32(ac1[2], vget_low_s32(b_hi), constant2); + sum[3] = vmlal_n_s32(ac1[3], vget_high_s32(b_hi), constant2); + *add_lo = vcombine_s32(vrshrn_n_s64(sum[0], DCT_CONST_BITS), + vrshrn_n_s64(sum[1], DCT_CONST_BITS)); + *add_hi = vcombine_s32(vrshrn_n_s64(sum[2], DCT_CONST_BITS), + vrshrn_n_s64(sum[3], DCT_CONST_BITS)); + + diff[0] = vmlsl_n_s32(ac2[0], vget_low_s32(b_lo), constant1); + diff[1] = vmlsl_n_s32(ac2[1], vget_high_s32(b_lo), constant1); + diff[2] = vmlsl_n_s32(ac2[2], vget_low_s32(b_hi), constant1); + diff[3] = vmlsl_n_s32(ac2[3], vget_high_s32(b_hi), constant1); + *sub_lo = vcombine_s32(vrshrn_n_s64(diff[0], DCT_CONST_BITS), + vrshrn_n_s64(diff[1], DCT_CONST_BITS)); + *sub_hi = vcombine_s32(vrshrn_n_s64(diff[2], DCT_CONST_BITS), + vrshrn_n_s64(diff[3], DCT_CONST_BITS)); +} + +// fdct_round_shift(a * c1 +/- b * c2) +// Variant that performs normal implementation on half vector +// more accurate does 32-bit processing, takes and returns 16-bit values +// returns narrowed results +static INLINE void butterfly_two_coeff_half(const int16x4_t a, + const int16x4_t b, + const tran_coef_t constant1, + const tran_coef_t constant2, + int16x4_t *add, int16x4_t *sub) { + const int32x4_t a1 = vmull_n_s16(a, constant1); + const int32x4_t a2 = vmull_n_s16(a, constant2); + const int32x4_t sum = vmlal_n_s16(a1, b, constant2); + const int32x4_t diff = vmlsl_n_s16(a2, b, constant1); + *add = vqrshrn_n_s32(sum, DCT_CONST_BITS); + *sub = vqrshrn_n_s32(diff, DCT_CONST_BITS); +} + +// fdct_round_shift(a * c1 +/- b * c2) +// Original Variant that performs normal implementation on full vector +// more accurate does 32-bit processing, takes and returns 16-bit values +// returns narrowed results +static INLINE void butterfly_two_coeff(const int16x8_t a, const int16x8_t b, + const tran_coef_t constant1, + const tran_coef_t constant2, + int16x8_t *add, int16x8_t *sub) { + const int32x4_t a1 = vmull_n_s16(vget_low_s16(a), constant1); + const int32x4_t a2 = vmull_n_s16(vget_high_s16(a), constant1); + const int32x4_t a3 = vmull_n_s16(vget_low_s16(a), constant2); + const int32x4_t a4 = vmull_n_s16(vget_high_s16(a), constant2); + const int32x4_t sum0 = vmlal_n_s16(a1, vget_low_s16(b), constant2); + const int32x4_t sum1 = vmlal_n_s16(a2, vget_high_s16(b), constant2); + const int32x4_t diff0 = vmlsl_n_s16(a3, vget_low_s16(b), constant1); + const int32x4_t diff1 = vmlsl_n_s16(a4, vget_high_s16(b), constant1); + const int16x4_t rounded0 = vqrshrn_n_s32(sum0, DCT_CONST_BITS); + const int16x4_t rounded1 = vqrshrn_n_s32(sum1, DCT_CONST_BITS); + const int16x4_t rounded2 = vqrshrn_n_s32(diff0, DCT_CONST_BITS); + const int16x4_t rounded3 = vqrshrn_n_s32(diff1, DCT_CONST_BITS); + *add = vcombine_s16(rounded0, rounded1); + *sub = vcombine_s16(rounded2, rounded3); +} + +// fdct_round_shift(a * c1 +/- b * c2) +// Original Variant that performs normal implementation on full vector +// more accurate does 32-bit processing, takes and returns 32-bit values +// returns narrowed results static INLINE void butterfly_two_coeff_s32( const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo, - const int32x4_t b_hi, const int32_t constant0, const int32_t constant1, - int32x4_t *add_lo, int32x4_t *add_hi, int32x4_t *sub_lo, - int32x4_t *sub_hi) { - const int32x4_t a0 = vmulq_n_s32(a_lo, constant0); - const int32x4_t a1 = vmulq_n_s32(a_hi, constant0); - const int32x4_t a2 = vmulq_n_s32(a_lo, constant1); - const int32x4_t a3 = vmulq_n_s32(a_hi, constant1); - const int32x4_t sum0 = vmlaq_n_s32(a2, b_lo, constant0); - const int32x4_t sum1 = vmlaq_n_s32(a3, b_hi, constant0); - const int32x4_t diff0 = vmlsq_n_s32(a0, b_lo, constant1); - const int32x4_t diff1 = vmlsq_n_s32(a1, b_hi, constant1); + const int32x4_t b_hi, const tran_coef_t constant1, + const tran_coef_t constant2, int32x4_t *add_lo, int32x4_t *add_hi, + int32x4_t *sub_lo, int32x4_t *sub_hi) { + const int32x4_t a1 = vmulq_n_s32(a_lo, constant1); + const int32x4_t a2 = vmulq_n_s32(a_hi, constant1); + const int32x4_t a3 = vmulq_n_s32(a_lo, constant2); + const int32x4_t a4 = vmulq_n_s32(a_hi, constant2); + const int32x4_t sum0 = vmlaq_n_s32(a1, b_lo, constant2); + const int32x4_t sum1 = vmlaq_n_s32(a2, b_hi, constant2); + const int32x4_t diff0 = vmlsq_n_s32(a3, b_lo, constant1); + const int32x4_t diff1 = vmlsq_n_s32(a4, b_hi, constant1); *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS); *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS); *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS); @@ -126,9 +309,10 @@ static INLINE int16x8_t add_round_shift_s16(const int16x8_t a) { } // Add 1 if positive, 2 if negative, and shift by 2. -// In practice, add 1, then add the sign bit, then shift without rounding. -static INLINE int16x8_t add_round_shift_s32(const int32x4_t a_lo, - const int32x4_t a_hi) { +// In practice, add 1, then add the sign bit, then shift and round, +// return narrowed results +static INLINE int16x8_t add_round_shift_s32_narrow(const int32x4_t a_lo, + const int32x4_t a_hi) { const int32x4_t one = vdupq_n_s32(1); const uint32x4_t a_lo_u32 = vreinterpretq_u32_s32(a_lo); const uint32x4_t a_lo_sign_u32 = vshrq_n_u32(a_lo_u32, 31); @@ -143,419 +327,32 @@ static INLINE int16x8_t add_round_shift_s32(const int32x4_t a_lo, return vcombine_s16(b_lo, b_hi); } -static INLINE void vpx_fdct4x4_pass1_neon(int16x4_t *in) { - const int16x8_t input_01 = vcombine_s16(in[0], in[1]); - const int16x8_t input_32 = vcombine_s16(in[3], in[2]); - - // in_0 +/- in_3, in_1 +/- in_2 - const int16x8_t s_01 = vaddq_s16(input_01, input_32); - const int16x8_t s_32 = vsubq_s16(input_01, input_32); - - // step_0 +/- step_1, step_2 +/- step_3 - const int16x4_t s_0 = vget_low_s16(s_01); - const int16x4_t s_1 = vget_high_s16(s_01); - const int16x4_t s_2 = vget_high_s16(s_32); - const int16x4_t s_3 = vget_low_s16(s_32); - - // (s_0 +/- s_1) * cospi_16_64 - // Must expand all elements to s32. See 'needs32' comment in fwd_txfm.c. - const int32x4_t s_0_p_s_1 = vaddl_s16(s_0, s_1); - const int32x4_t s_0_m_s_1 = vsubl_s16(s_0, s_1); - const int32x4_t temp1 = vmulq_n_s32(s_0_p_s_1, cospi_16_64); - const int32x4_t temp2 = vmulq_n_s32(s_0_m_s_1, cospi_16_64); - - // fdct_round_shift - int16x4_t out_0 = vrshrn_n_s32(temp1, DCT_CONST_BITS); - int16x4_t out_2 = vrshrn_n_s32(temp2, DCT_CONST_BITS); - - // s_3 * cospi_8_64 + s_2 * cospi_24_64 - // s_3 * cospi_24_64 - s_2 * cospi_8_64 - const int32x4_t s_3_cospi_8_64 = vmull_n_s16(s_3, cospi_8_64); - const int32x4_t s_3_cospi_24_64 = vmull_n_s16(s_3, cospi_24_64); - - const int32x4_t temp3 = vmlal_n_s16(s_3_cospi_8_64, s_2, cospi_24_64); - const int32x4_t temp4 = vmlsl_n_s16(s_3_cospi_24_64, s_2, cospi_8_64); - - // fdct_round_shift - int16x4_t out_1 = vrshrn_n_s32(temp3, DCT_CONST_BITS); - int16x4_t out_3 = vrshrn_n_s32(temp4, DCT_CONST_BITS); - - transpose_s16_4x4d(&out_0, &out_1, &out_2, &out_3); - - in[0] = out_0; - in[1] = out_1; - in[2] = out_2; - in[3] = out_3; -} - -static INLINE void vpx_fdct8x8_pass1_notranspose_neon(int16x8_t *in, - int16x8_t *out) { - const int16x8_t v_s0 = vaddq_s16(in[0], in[7]); - const int16x8_t v_s1 = vaddq_s16(in[1], in[6]); - const int16x8_t v_s2 = vaddq_s16(in[2], in[5]); - const int16x8_t v_s3 = vaddq_s16(in[3], in[4]); - const int16x8_t v_s4 = vsubq_s16(in[3], in[4]); - const int16x8_t v_s5 = vsubq_s16(in[2], in[5]); - const int16x8_t v_s6 = vsubq_s16(in[1], in[6]); - const int16x8_t v_s7 = vsubq_s16(in[0], in[7]); - // fdct4(step, step); - int16x8_t v_x0 = vaddq_s16(v_s0, v_s3); - int16x8_t v_x1 = vaddq_s16(v_s1, v_s2); - int16x8_t v_x2 = vsubq_s16(v_s1, v_s2); - int16x8_t v_x3 = vsubq_s16(v_s0, v_s3); - // fdct4(step, step); - int32x4_t v_t0_lo = vaddl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1)); - int32x4_t v_t0_hi = vaddl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1)); - int32x4_t v_t1_lo = vsubl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1)); - int32x4_t v_t1_hi = vsubl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1)); - int32x4_t v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), cospi_24_64); - int32x4_t v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), cospi_24_64); - int32x4_t v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), cospi_24_64); - int32x4_t v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), cospi_24_64); - v_t2_lo = vmlal_n_s16(v_t2_lo, vget_low_s16(v_x3), cospi_8_64); - v_t2_hi = vmlal_n_s16(v_t2_hi, vget_high_s16(v_x3), cospi_8_64); - v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x2), cospi_8_64); - v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x2), cospi_8_64); - v_t0_lo = vmulq_n_s32(v_t0_lo, cospi_16_64); - v_t0_hi = vmulq_n_s32(v_t0_hi, cospi_16_64); - v_t1_lo = vmulq_n_s32(v_t1_lo, cospi_16_64); - v_t1_hi = vmulq_n_s32(v_t1_hi, cospi_16_64); - { - const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS); - const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS); - const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS); - const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS); - const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS); - const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS); - const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS); - const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS); - out[0] = vcombine_s16(a, c); // 00 01 02 03 40 41 42 43 - out[2] = vcombine_s16(e, g); // 20 21 22 23 60 61 62 63 - out[4] = vcombine_s16(b, d); // 04 05 06 07 44 45 46 47 - out[6] = vcombine_s16(f, h); // 24 25 26 27 64 65 66 67 - } - // Stage 2 - v_x0 = vsubq_s16(v_s6, v_s5); - v_x1 = vaddq_s16(v_s6, v_s5); - v_t0_lo = vmull_n_s16(vget_low_s16(v_x0), cospi_16_64); - v_t0_hi = vmull_n_s16(vget_high_s16(v_x0), cospi_16_64); - v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), cospi_16_64); - v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), cospi_16_64); - { - const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS); - const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS); - const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS); - const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS); - const int16x8_t ab = vcombine_s16(a, b); - const int16x8_t cd = vcombine_s16(c, d); - // Stage 3 - v_x0 = vaddq_s16(v_s4, ab); - v_x1 = vsubq_s16(v_s4, ab); - v_x2 = vsubq_s16(v_s7, cd); - v_x3 = vaddq_s16(v_s7, cd); - } - // Stage 4 - v_t0_lo = vmull_n_s16(vget_low_s16(v_x3), cospi_4_64); - v_t0_hi = vmull_n_s16(vget_high_s16(v_x3), cospi_4_64); - v_t0_lo = vmlal_n_s16(v_t0_lo, vget_low_s16(v_x0), cospi_28_64); - v_t0_hi = vmlal_n_s16(v_t0_hi, vget_high_s16(v_x0), cospi_28_64); - v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), cospi_12_64); - v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), cospi_12_64); - v_t1_lo = vmlal_n_s16(v_t1_lo, vget_low_s16(v_x2), cospi_20_64); - v_t1_hi = vmlal_n_s16(v_t1_hi, vget_high_s16(v_x2), cospi_20_64); - v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), cospi_12_64); - v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), cospi_12_64); - v_t2_lo = vmlsl_n_s16(v_t2_lo, vget_low_s16(v_x1), cospi_20_64); - v_t2_hi = vmlsl_n_s16(v_t2_hi, vget_high_s16(v_x1), cospi_20_64); - v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), cospi_28_64); - v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), cospi_28_64); - v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x0), cospi_4_64); - v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x0), cospi_4_64); - { - const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS); - const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS); - const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS); - const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS); - const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS); - const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS); - const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS); - const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS); - out[1] = vcombine_s16(a, c); // 10 11 12 13 50 51 52 53 - out[3] = vcombine_s16(e, g); // 30 31 32 33 70 71 72 73 - out[5] = vcombine_s16(b, d); // 14 15 16 17 54 55 56 57 - out[7] = vcombine_s16(f, h); // 34 35 36 37 74 75 76 77 - } -} - -static INLINE void vpx_fdct8x8_pass1_neon(int16x8_t *in) { - int16x8_t out[8]; - vpx_fdct8x8_pass1_notranspose_neon(in, out); - // transpose 8x8 - // Can't use transpose_s16_8x8() because the values are arranged in two 4x8 - // columns. - { - // 00 01 02 03 40 41 42 43 - // 10 11 12 13 50 51 52 53 - // 20 21 22 23 60 61 62 63 - // 30 31 32 33 70 71 72 73 - // 04 05 06 07 44 45 46 47 - // 14 15 16 17 54 55 56 57 - // 24 25 26 27 64 65 66 67 - // 34 35 36 37 74 75 76 77 - const int32x4x2_t r02_s32 = - vtrnq_s32(vreinterpretq_s32_s16(out[0]), vreinterpretq_s32_s16(out[2])); - const int32x4x2_t r13_s32 = - vtrnq_s32(vreinterpretq_s32_s16(out[1]), vreinterpretq_s32_s16(out[3])); - const int32x4x2_t r46_s32 = - vtrnq_s32(vreinterpretq_s32_s16(out[4]), vreinterpretq_s32_s16(out[6])); - const int32x4x2_t r57_s32 = - vtrnq_s32(vreinterpretq_s32_s16(out[5]), vreinterpretq_s32_s16(out[7])); - const int16x8x2_t r01_s16 = - vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[0]), - vreinterpretq_s16_s32(r13_s32.val[0])); - const int16x8x2_t r23_s16 = - vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[1]), - vreinterpretq_s16_s32(r13_s32.val[1])); - const int16x8x2_t r45_s16 = - vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[0]), - vreinterpretq_s16_s32(r57_s32.val[0])); - const int16x8x2_t r67_s16 = - vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[1]), - vreinterpretq_s16_s32(r57_s32.val[1])); - in[0] = r01_s16.val[0]; - in[1] = r01_s16.val[1]; - in[2] = r23_s16.val[0]; - in[3] = r23_s16.val[1]; - in[4] = r45_s16.val[0]; - in[5] = r45_s16.val[1]; - in[6] = r67_s16.val[0]; - in[7] = r67_s16.val[1]; - // 00 10 20 30 40 50 60 70 - // 01 11 21 31 41 51 61 71 - // 02 12 22 32 42 52 62 72 - // 03 13 23 33 43 53 63 73 - // 04 14 24 34 44 54 64 74 - // 05 15 25 35 45 55 65 75 - // 06 16 26 36 46 56 66 76 - // 07 17 27 37 47 57 67 77 - } -} - -#if CONFIG_VP9_HIGHBITDEPTH -static INLINE int32x4_t highbd_add_round_shift_s32(int32x4_t x) { - const int32x2_t x_lo = vget_low_s32(x); - const int32x2_t x_hi = vget_high_s32(x); - const int64x2_t x64_lo = vmovl_s32(x_lo); - const int64x2_t x64_hi = vmovl_s32(x_hi); - - const int64x2_t sign_lo = (int64x2_t)vshrq_n_u64((uint64x2_t)x64_lo, 63); - const int64x2_t sign_hi = (int64x2_t)vshrq_n_u64((uint64x2_t)x64_hi, 63); - - const int64x2_t sum_lo = vaddq_s64(x64_lo, sign_lo); - const int64x2_t sum_hi = vaddq_s64(x64_hi, sign_hi); - return vcombine_s32(vshrn_n_s64(sum_lo, 1), vshrn_n_s64(sum_hi, 1)); -} - -static INLINE void highbd_butterfly_one_coeff_s32(const int32x4_t a, - const int32x4_t b, - const tran_coef_t c, - int32x4_t *add, - int32x4_t *sub) { - const int32x2_t a_lo = vget_low_s32(a); - const int32x2_t a_hi = vget_high_s32(a); - const int32x2_t b_lo = vget_low_s32(b); - const int32x2_t b_hi = vget_high_s32(b); - - const int64x2_t a64_lo = vmull_n_s32(a_lo, c); - const int64x2_t a64_hi = vmull_n_s32(a_hi, c); - - const int64x2_t sum_lo = vmlal_n_s32(a64_lo, b_lo, c); - const int64x2_t sum_hi = vmlal_n_s32(a64_hi, b_hi, c); - const int64x2_t diff_lo = vmlsl_n_s32(a64_lo, b_lo, c); - const int64x2_t diff_hi = vmlsl_n_s32(a64_hi, b_hi, c); - - *add = vcombine_s32(vrshrn_n_s64(sum_lo, DCT_CONST_BITS), - vrshrn_n_s64(sum_hi, DCT_CONST_BITS)); - *sub = vcombine_s32(vrshrn_n_s64(diff_lo, DCT_CONST_BITS), - vrshrn_n_s64(diff_hi, DCT_CONST_BITS)); -} - -static INLINE void highbd_butterfly_two_coeff_s32( - const int32x4_t a, const int32x4_t b, const tran_coef_t c0, - const tran_coef_t c1, int32x4_t *add, int32x4_t *sub) { - const int32x2_t a_lo = vget_low_s32(a); - const int32x2_t a_hi = vget_high_s32(a); - const int32x2_t b_lo = vget_low_s32(b); - const int32x2_t b_hi = vget_high_s32(b); - - const int64x2_t axc0_64_lo = vmull_n_s32(a_lo, c0); - const int64x2_t axc0_64_hi = vmull_n_s32(a_hi, c0); - const int64x2_t axc1_64_lo = vmull_n_s32(a_lo, c1); - const int64x2_t axc1_64_hi = vmull_n_s32(a_hi, c1); - - const int64x2_t sum_lo = vmlal_n_s32(axc0_64_lo, b_lo, c1); - const int64x2_t sum_hi = vmlal_n_s32(axc0_64_hi, b_hi, c1); - const int64x2_t diff_lo = vmlsl_n_s32(axc1_64_lo, b_lo, c0); - const int64x2_t diff_hi = vmlsl_n_s32(axc1_64_hi, b_hi, c0); - - *add = vcombine_s32(vrshrn_n_s64(sum_lo, DCT_CONST_BITS), - vrshrn_n_s64(sum_hi, DCT_CONST_BITS)); - *sub = vcombine_s32(vrshrn_n_s64(diff_lo, DCT_CONST_BITS), - vrshrn_n_s64(diff_hi, DCT_CONST_BITS)); -} - -static INLINE void vpx_highbd_fdct4x4_pass1_neon(int32x4_t *in) { - int32x4_t out[4]; - // in_0 +/- in_3, in_1 +/- in_2 - const int32x4_t s_0 = vaddq_s32(in[0], in[3]); - const int32x4_t s_1 = vaddq_s32(in[1], in[2]); - const int32x4_t s_2 = vsubq_s32(in[1], in[2]); - const int32x4_t s_3 = vsubq_s32(in[0], in[3]); - - highbd_butterfly_one_coeff_s32(s_0, s_1, cospi_16_64, &out[0], &out[2]); - - // out[1] = s_3 * cospi_8_64 + s_2 * cospi_24_64 - // out[3] = s_3 * cospi_24_64 - s_2 * cospi_8_64 - highbd_butterfly_two_coeff_s32(s_3, s_2, cospi_8_64, cospi_24_64, &out[1], - &out[3]); - - transpose_s32_4x4(&out[0], &out[1], &out[2], &out[3]); - - in[0] = out[0]; - in[1] = out[1]; - in[2] = out[2]; - in[3] = out[3]; +// Add 1 if negative, and shift by 1. +// In practice, add the sign bit, then shift and round +static INLINE int32x4_t add_round_shift_half_s32(const int32x4_t a) { + const uint32x4_t a_u32 = vreinterpretq_u32_s32(a); + const uint32x4_t a_sign_u32 = vshrq_n_u32(a_u32, 31); + const int32x4_t a_sign_s32 = vreinterpretq_s32_u32(a_sign_u32); + return vshrq_n_s32(vaddq_s32(a, a_sign_s32), 1); } -static INLINE void vpx_highbd_fdct8x8_pass1_notranspose_neon(int32x4_t *left, - int32x4_t *right) { - int32x4_t sl[8], sr[8], xl[4], xr[4], tl[4], tr[4]; - - sl[0] = vaddq_s32(left[0], left[7]); - sl[1] = vaddq_s32(left[1], left[6]); - sl[2] = vaddq_s32(left[2], left[5]); - sl[3] = vaddq_s32(left[3], left[4]); - sl[4] = vsubq_s32(left[3], left[4]); - sl[5] = vsubq_s32(left[2], left[5]); - sl[6] = vsubq_s32(left[1], left[6]); - sl[7] = vsubq_s32(left[0], left[7]); - sr[0] = vaddq_s32(right[0], right[7]); - sr[1] = vaddq_s32(right[1], right[6]); - sr[2] = vaddq_s32(right[2], right[5]); - sr[3] = vaddq_s32(right[3], right[4]); - sr[4] = vsubq_s32(right[3], right[4]); - sr[5] = vsubq_s32(right[2], right[5]); - sr[6] = vsubq_s32(right[1], right[6]); - sr[7] = vsubq_s32(right[0], right[7]); - - // fdct4(step, step); - // x0 = s0 + s3; - xl[0] = vaddq_s32(sl[0], sl[3]); - xr[0] = vaddq_s32(sr[0], sr[3]); - // x1 = s1 + s2; - xl[1] = vaddq_s32(sl[1], sl[2]); - xr[1] = vaddq_s32(sr[1], sr[2]); - // x2 = s1 - s2; - xl[2] = vsubq_s32(sl[1], sl[2]); - xr[2] = vsubq_s32(sr[1], sr[2]); - // x3 = s0 - s3; - xl[3] = vsubq_s32(sl[0], sl[3]); - xr[3] = vsubq_s32(sr[0], sr[3]); - - // fdct4(step, step); - // t0 = (x0 + x1) * cospi_16_64; - // t1 = (x0 - x1) * cospi_16_64; - // out[0] = (tran_low_t)fdct_round_shift(t0); - // out[4] = (tran_low_t)fdct_round_shift(t1); - highbd_butterfly_one_coeff_s32(xl[0], xl[1], cospi_16_64, &left[0], &left[4]); - highbd_butterfly_one_coeff_s32(xr[0], xr[1], cospi_16_64, &right[0], - &right[4]); - // t2 = x2 * cospi_24_64 + x3 * cospi_8_64; - // t3 = -x2 * cospi_8_64 + x3 * cospi_24_64; - // out[2] = (tran_low_t)fdct_round_shift(t2); - // out[6] = (tran_low_t)fdct_round_shift(t3); - highbd_butterfly_two_coeff_s32(xl[3], xl[2], cospi_8_64, cospi_24_64, - &left[2], &left[6]); - highbd_butterfly_two_coeff_s32(xr[3], xr[2], cospi_8_64, cospi_24_64, - &right[2], &right[6]); - - // Stage 2 - // t0 = (s6 - s5) * cospi_16_64; - highbd_butterfly_one_coeff_s32(sl[6], sl[5], cospi_16_64, &tl[1], &tl[0]); - highbd_butterfly_one_coeff_s32(sr[6], sr[5], cospi_16_64, &tr[1], &tr[0]); - - // Stage 3 - xl[0] = vaddq_s32(sl[4], tl[0]); - xr[0] = vaddq_s32(sr[4], tr[0]); - xl[1] = vsubq_s32(sl[4], tl[0]); - xr[1] = vsubq_s32(sr[4], tr[0]); - xl[2] = vsubq_s32(sl[7], tl[1]); - xr[2] = vsubq_s32(sr[7], tr[1]); - xl[3] = vaddq_s32(sl[7], tl[1]); - xr[3] = vaddq_s32(sr[7], tr[1]); - - // Stage 4 - // t0 = x0 * cospi_28_64 + x3 * cospi_4_64; - // out[1] = (tran_low_t)fdct_round_shift(t0); - // t3 = x3 * cospi_28_64 + x0 * -cospi_4_64; - // out[7] = (tran_low_t)fdct_round_shift(t3); - highbd_butterfly_two_coeff_s32(xl[3], xl[0], cospi_4_64, cospi_28_64, - &left[1], &left[7]); - highbd_butterfly_two_coeff_s32(xr[3], xr[0], cospi_4_64, cospi_28_64, - &right[1], &right[7]); - - // t1 = x1 * cospi_12_64 + x2 * cospi_20_64; - // out[5] = (tran_low_t)fdct_round_shift(t1); - // t2 = x2 * cospi_12_64 + x1 * -cospi_20_64; - // out[3] = (tran_low_t)fdct_round_shift(t2); - highbd_butterfly_two_coeff_s32(xl[2], xl[1], cospi_20_64, cospi_12_64, - &left[5], &left[3]); - highbd_butterfly_two_coeff_s32(xr[2], xr[1], cospi_20_64, cospi_12_64, - &right[5], &right[3]); +// Add 1 if positive, 2 if negative, and shift by 2. +// In practice, add 1, then add the sign bit, then shift without rounding. +static INLINE int32x4_t add_round_shift_s32(const int32x4_t a) { + const int32x4_t one = vdupq_n_s32(1); + const uint32x4_t a_u32 = vreinterpretq_u32_s32(a); + const uint32x4_t a_sign_u32 = vshrq_n_u32(a_u32, 31); + const int32x4_t a_sign_s32 = vreinterpretq_s32_u32(a_sign_u32); + return vshrq_n_s32(vaddq_s32(vaddq_s32(a, a_sign_s32), one), 2); } -static INLINE void vpx_highbd_fdct8x8_pass1_neon(int32x4_t *left, - int32x4_t *right) { - int32x4x2_t out[8]; - vpx_highbd_fdct8x8_pass1_notranspose_neon(left, right); - - out[0].val[0] = left[0]; - out[0].val[1] = right[0]; - out[1].val[0] = left[1]; - out[1].val[1] = right[1]; - out[2].val[0] = left[2]; - out[2].val[1] = right[2]; - out[3].val[0] = left[3]; - out[3].val[1] = right[3]; - out[4].val[0] = left[4]; - out[4].val[1] = right[4]; - out[5].val[0] = left[5]; - out[5].val[1] = right[5]; - out[6].val[0] = left[6]; - out[6].val[1] = right[6]; - out[7].val[0] = left[7]; - out[7].val[1] = right[7]; - - transpose_s32_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5], - &out[6], &out[7]); - - left[0] = out[0].val[0]; - right[0] = out[0].val[1]; - left[1] = out[1].val[0]; - right[1] = out[1].val[1]; - left[2] = out[2].val[0]; - right[2] = out[2].val[1]; - left[3] = out[3].val[0]; - right[3] = out[3].val[1]; - left[4] = out[4].val[0]; - right[4] = out[4].val[1]; - left[5] = out[5].val[0]; - right[5] = out[5].val[1]; - left[6] = out[6].val[0]; - right[6] = out[6].val[1]; - left[7] = out[7].val[0]; - right[7] = out[7].val[1]; +// Add 2 if positive, 1 if negative, and shift by 2. +// In practice, subtract the sign bit, then shift with rounding. +static INLINE int16x8_t sub_round_shift_s16(const int16x8_t a) { + const uint16x8_t a_u16 = vreinterpretq_u16_s16(a); + const uint16x8_t a_sign_u16 = vshrq_n_u16(a_u16, 15); + const int16x8_t a_sign_s16 = vreinterpretq_s16_u16(a_sign_u16); + return vrshrq_n_s16(vsubq_s16(a, a_sign_s16), 2); } -#endif // CONFIG_VP9_HIGHBITDEPTH #endif // VPX_VPX_DSP_ARM_FDCT_NEON_H_ diff --git a/vpx_dsp/arm/transpose_neon.h b/vpx_dsp/arm/transpose_neon.h index bf06d6abe2..41d44f2b1f 100644 --- a/vpx_dsp/arm/transpose_neon.h +++ b/vpx_dsp/arm/transpose_neon.h @@ -821,6 +821,51 @@ static INLINE void transpose_s32_8x8(int32x4x2_t *a0, int32x4x2_t *a1, a7->val[1] = c7.val[1]; } +// Helper transpose function for highbd FDCT variants +static INLINE void transpose_s32_8x8_2(int32x4_t *left /*[8]*/, + int32x4_t *right /*[8]*/, + int32x4_t *out_left /*[8]*/, + int32x4_t *out_right /*[8]*/) { + int32x4x2_t out[8]; + + out[0].val[0] = left[0]; + out[0].val[1] = right[0]; + out[1].val[0] = left[1]; + out[1].val[1] = right[1]; + out[2].val[0] = left[2]; + out[2].val[1] = right[2]; + out[3].val[0] = left[3]; + out[3].val[1] = right[3]; + out[4].val[0] = left[4]; + out[4].val[1] = right[4]; + out[5].val[0] = left[5]; + out[5].val[1] = right[5]; + out[6].val[0] = left[6]; + out[6].val[1] = right[6]; + out[7].val[0] = left[7]; + out[7].val[1] = right[7]; + + transpose_s32_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5], + &out[6], &out[7]); + + out_left[0] = out[0].val[0]; + out_left[1] = out[1].val[0]; + out_left[2] = out[2].val[0]; + out_left[3] = out[3].val[0]; + out_left[4] = out[4].val[0]; + out_left[5] = out[5].val[0]; + out_left[6] = out[6].val[0]; + out_left[7] = out[7].val[0]; + out_right[0] = out[0].val[1]; + out_right[1] = out[1].val[1]; + out_right[2] = out[2].val[1]; + out_right[3] = out[3].val[1]; + out_right[4] = out[4].val[1]; + out_right[5] = out[5].val[1]; + out_right[6] = out[6].val[1]; + out_right[7] = out[7].val[1]; +} + static INLINE void transpose_u8_16x8( const uint8x16_t i0, const uint8x16_t i1, const uint8x16_t i2, const uint8x16_t i3, const uint8x16_t i4, const uint8x16_t i5, From 3f08aa0d0b2828b670073f808ae079acb35902a4 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Wed, 26 Oct 2022 22:09:32 +0000 Subject: [PATCH 0542/1000] [NEON] Optimize highbd 32x32 DCT For --best quality, resulting function vpx_highbd_fdct32x32_rd_neon takes 0.27% of cpu time in profiling, vs 6.27% for the sum of scalar functions: vpx_fdct32, vpx_fdct32.constprop.0, vpx_fdct32x32_rd_c for rd. For --rt quality, the function takes 0.19% vs 4.57% for the scalar version. Overall, this improves encoding time by ~6% compared for highbd for --best and ~9% for --rt. Change-Id: I1ce4bbef6e364bbadc76264056aa3f86b1a8edc5 --- vpx_dsp/arm/fdct32x32_neon.c | 185 ++++ vpx_dsp/arm/fdct32x32_neon.h | 1820 +++++++++++++++++++++++++++++++++- vpx_dsp/arm/fdct_neon.h | 9 + vpx_dsp/vpx_dsp_rtcd_defs.pl | 4 +- 4 files changed, 2013 insertions(+), 5 deletions(-) diff --git a/vpx_dsp/arm/fdct32x32_neon.c b/vpx_dsp/arm/fdct32x32_neon.c index e2bf167604..d6818d2ec6 100644 --- a/vpx_dsp/arm/fdct32x32_neon.c +++ b/vpx_dsp/arm/fdct32x32_neon.c @@ -230,5 +230,190 @@ void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output, &temp5[29], &temp5[30], &temp5[31]); store(output + 24 * 32, temp5); } + +#if CONFIG_VP9_HIGHBITDEPTH + +void vpx_highbd_fdct32x32_neon(const int16_t *input, tran_low_t *output, + int stride) { + int16x8_t temp0[32]; + int32x4_t left1[32], left2[32], left3[32], left4[32], right1[32], right2[32], + right3[32], right4[32]; + int32x4_t left5[32], right5[32], left6[32], right6[32], left7[32], right7[32], + left8[32], right8[32]; + int32x4_t temp1[32], temp2[32]; + + // Process in 8x32 columns. + load_cross(input, stride, temp0); + highbd_scale_input(temp0, left1, right1); + highbd_dct8x32_body_first_pass(left1, right1); + highbd_partial_sub_round_shift(left1, right1); + + load_cross(input + 8, stride, temp0); + highbd_scale_input(temp0, left2, right2); + highbd_dct8x32_body_first_pass(left2, right2); + highbd_partial_sub_round_shift(left2, right2); + + load_cross(input + 16, stride, temp0); + highbd_scale_input(temp0, left3, right3); + highbd_dct8x32_body_first_pass(left3, right3); + highbd_partial_sub_round_shift(left3, right3); + + load_cross(input + 24, stride, temp0); + highbd_scale_input(temp0, left4, right4); + highbd_dct8x32_body_first_pass(left4, right4); + highbd_partial_sub_round_shift(left4, right4); + + // Generate the top row by munging the first set of 8 from each one together. + transpose_s32_8x8_2(left1, right1, temp1, temp2); + transpose_s32_8x8_2(left2, right2, temp1 + 8, temp2 + 8); + transpose_s32_8x8_2(left3, right3, temp1 + 16, temp2 + 16); + transpose_s32_8x8_2(left4, right4, temp1 + 24, temp2 + 24); + + highbd_cross_input(temp1, temp2, left5, right5); + highbd_dct8x32_body_second_pass(left5, right5); + highbd_partial_add_round_shift(left5, right5); + + // Second row of 8x32. + transpose_s32_8x8_2(left1 + 8, right1 + 8, temp1, temp2); + transpose_s32_8x8_2(left2 + 8, right2 + 8, temp1 + 8, temp2 + 8); + transpose_s32_8x8_2(left3 + 8, right3 + 8, temp1 + 16, temp2 + 16); + transpose_s32_8x8_2(left4 + 8, right4 + 8, temp1 + 24, temp2 + 24); + + highbd_cross_input(temp1, temp2, left6, right6); + highbd_dct8x32_body_second_pass(left6, right6); + highbd_partial_add_round_shift(left6, right6); + + // Third row of 8x32 + transpose_s32_8x8_2(left1 + 16, right1 + 16, temp1, temp2); + transpose_s32_8x8_2(left2 + 16, right2 + 16, temp1 + 8, temp2 + 8); + transpose_s32_8x8_2(left3 + 16, right3 + 16, temp1 + 16, temp2 + 16); + transpose_s32_8x8_2(left4 + 16, right4 + 16, temp1 + 24, temp2 + 24); + + highbd_cross_input(temp1, temp2, left7, right7); + highbd_dct8x32_body_second_pass(left7, right7); + highbd_partial_add_round_shift(left7, right7); + + // Final row of 8x32. + transpose_s32_8x8_2(left1 + 24, right1 + 24, temp1, temp2); + transpose_s32_8x8_2(left2 + 24, right2 + 24, temp1 + 8, temp2 + 8); + transpose_s32_8x8_2(left3 + 24, right3 + 24, temp1 + 16, temp2 + 16); + transpose_s32_8x8_2(left4 + 24, right4 + 24, temp1 + 24, temp2 + 24); + + highbd_cross_input(temp1, temp2, left8, right8); + highbd_dct8x32_body_second_pass(left8, right8); + highbd_partial_add_round_shift(left8, right8); + + // Final transpose + transpose_s32_8x8_2(left5, right5, left1, right1); + transpose_s32_8x8_2(left5 + 8, right5 + 8, left2, right2); + transpose_s32_8x8_2(left5 + 16, right5 + 16, left3, right3); + transpose_s32_8x8_2(left5 + 24, right5 + 24, left4, right4); + transpose_s32_8x8_2(left6, right6, left1 + 8, right1 + 8); + transpose_s32_8x8_2(left6 + 8, right6 + 8, left2 + 8, right2 + 8); + transpose_s32_8x8_2(left6 + 16, right6 + 16, left3 + 8, right3 + 8); + transpose_s32_8x8_2(left6 + 24, right6 + 24, left4 + 8, right4 + 8); + transpose_s32_8x8_2(left7, right7, left1 + 16, right1 + 16); + transpose_s32_8x8_2(left7 + 8, right7 + 8, left2 + 16, right2 + 16); + transpose_s32_8x8_2(left7 + 16, right7 + 16, left3 + 16, right3 + 16); + transpose_s32_8x8_2(left7 + 24, right7 + 24, left4 + 16, right4 + 16); + transpose_s32_8x8_2(left8, right8, left1 + 24, right1 + 24); + transpose_s32_8x8_2(left8 + 8, right8 + 8, left2 + 24, right2 + 24); + transpose_s32_8x8_2(left8 + 16, right8 + 16, left3 + 24, right3 + 24); + transpose_s32_8x8_2(left8 + 24, right8 + 24, left4 + 24, right4 + 24); + + store32x32_s32(output, left1, right1, left2, right2, left3, right3, left4, + right4); +} + +void vpx_highbd_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output, + int stride) { + int16x8_t temp0[32]; + int32x4_t left1[32], left2[32], left3[32], left4[32], right1[32], right2[32], + right3[32], right4[32]; + int32x4_t left5[32], right5[32], left6[32], right6[32], left7[32], right7[32], + left8[32], right8[32]; + int32x4_t temp1[32], temp2[32]; + + // Process in 8x32 columns. + load_cross(input, stride, temp0); + highbd_scale_input(temp0, left1, right1); + highbd_dct8x32_body_first_pass(left1, right1); + highbd_partial_sub_round_shift(left1, right1); + + load_cross(input + 8, stride, temp0); + highbd_scale_input(temp0, left2, right2); + highbd_dct8x32_body_first_pass(left2, right2); + highbd_partial_sub_round_shift(left2, right2); + + load_cross(input + 16, stride, temp0); + highbd_scale_input(temp0, left3, right3); + highbd_dct8x32_body_first_pass(left3, right3); + highbd_partial_sub_round_shift(left3, right3); + + load_cross(input + 24, stride, temp0); + highbd_scale_input(temp0, left4, right4); + highbd_dct8x32_body_first_pass(left4, right4); + highbd_partial_sub_round_shift(left4, right4); + + // Generate the top row by munging the first set of 8 from each one together. + transpose_s32_8x8_2(left1, right1, temp1, temp2); + transpose_s32_8x8_2(left2, right2, temp1 + 8, temp2 + 8); + transpose_s32_8x8_2(left3, right3, temp1 + 16, temp2 + 16); + transpose_s32_8x8_2(left4, right4, temp1 + 24, temp2 + 24); + + highbd_cross_input(temp1, temp2, left5, right5); + highbd_dct8x32_body_second_pass_rd(left5, right5); + + // Second row of 8x32. + transpose_s32_8x8_2(left1 + 8, right1 + 8, temp1, temp2); + transpose_s32_8x8_2(left2 + 8, right2 + 8, temp1 + 8, temp2 + 8); + transpose_s32_8x8_2(left3 + 8, right3 + 8, temp1 + 16, temp2 + 16); + transpose_s32_8x8_2(left4 + 8, right4 + 8, temp1 + 24, temp2 + 24); + + highbd_cross_input(temp1, temp2, left6, right6); + highbd_dct8x32_body_second_pass_rd(left6, right6); + + // Third row of 8x32 + transpose_s32_8x8_2(left1 + 16, right1 + 16, temp1, temp2); + transpose_s32_8x8_2(left2 + 16, right2 + 16, temp1 + 8, temp2 + 8); + transpose_s32_8x8_2(left3 + 16, right3 + 16, temp1 + 16, temp2 + 16); + transpose_s32_8x8_2(left4 + 16, right4 + 16, temp1 + 24, temp2 + 24); + + highbd_cross_input(temp1, temp2, left7, right7); + highbd_dct8x32_body_second_pass_rd(left7, right7); + + // Final row of 8x32. + transpose_s32_8x8_2(left1 + 24, right1 + 24, temp1, temp2); + transpose_s32_8x8_2(left2 + 24, right2 + 24, temp1 + 8, temp2 + 8); + transpose_s32_8x8_2(left3 + 24, right3 + 24, temp1 + 16, temp2 + 16); + transpose_s32_8x8_2(left4 + 24, right4 + 24, temp1 + 24, temp2 + 24); + + highbd_cross_input(temp1, temp2, left8, right8); + highbd_dct8x32_body_second_pass_rd(left8, right8); + + // Final transpose + transpose_s32_8x8_2(left5, right5, left1, right1); + transpose_s32_8x8_2(left5 + 8, right5 + 8, left2, right2); + transpose_s32_8x8_2(left5 + 16, right5 + 16, left3, right3); + transpose_s32_8x8_2(left5 + 24, right5 + 24, left4, right4); + transpose_s32_8x8_2(left6, right6, left1 + 8, right1 + 8); + transpose_s32_8x8_2(left6 + 8, right6 + 8, left2 + 8, right2 + 8); + transpose_s32_8x8_2(left6 + 16, right6 + 16, left3 + 8, right3 + 8); + transpose_s32_8x8_2(left6 + 24, right6 + 24, left4 + 8, right4 + 8); + transpose_s32_8x8_2(left7, right7, left1 + 16, right1 + 16); + transpose_s32_8x8_2(left7 + 8, right7 + 8, left2 + 16, right2 + 16); + transpose_s32_8x8_2(left7 + 16, right7 + 16, left3 + 16, right3 + 16); + transpose_s32_8x8_2(left7 + 24, right7 + 24, left4 + 16, right4 + 16); + transpose_s32_8x8_2(left8, right8, left1 + 24, right1 + 24); + transpose_s32_8x8_2(left8 + 8, right8 + 8, left2 + 24, right2 + 24); + transpose_s32_8x8_2(left8 + 16, right8 + 16, left3 + 24, right3 + 24); + transpose_s32_8x8_2(left8 + 24, right8 + 24, left4 + 24, right4 + 24); + + store32x32_s32(output, left1, right1, left2, right2, left3, right3, left4, + right4); +} + +#endif // CONFIG_VP9_HIGHBITDEPTH + #endif // !defined(__clang__) && !defined(__ANDROID__) && defined(__GNUC__) && // __GNUC__ == 4 && __GNUC_MINOR__ <= 9 diff --git a/vpx_dsp/arm/fdct32x32_neon.h b/vpx_dsp/arm/fdct32x32_neon.h index dd647918b2..3b9e64c6df 100644 --- a/vpx_dsp/arm/fdct32x32_neon.h +++ b/vpx_dsp/arm/fdct32x32_neon.h @@ -143,7 +143,7 @@ static INLINE void scale_input(const int16x8_t *in /*32*/, out[31] = vshlq_n_s16(in[31], 2); } -static void dct_body_first_pass(const int16x8_t *in, int16x8_t *out) { +static INLINE void dct_body_first_pass(const int16x8_t *in, int16x8_t *out) { int16x8_t a[32]; int16x8_t b[32]; @@ -494,7 +494,7 @@ static void dct_body_first_pass(const int16x8_t *in, int16x8_t *out) { &b##_hi[sub_index]); \ } while (0) -static void dct_body_second_pass(const int16x8_t *in, int16x8_t *out) { +static INLINE void dct_body_second_pass(const int16x8_t *in, int16x8_t *out) { int16x8_t a[32]; int16x8_t b[32]; int32x4_t c_lo[32]; @@ -800,7 +800,8 @@ static void dct_body_second_pass(const int16x8_t *in, int16x8_t *out) { out[3] = add_round_shift_s32_narrow(d_lo[3], d_hi[3]); } -static void dct_body_second_pass_rd(const int16x8_t *in, int16x8_t *out) { +static INLINE void dct_body_second_pass_rd(const int16x8_t *in, + int16x8_t *out) { int16x8_t a[32]; int16x8_t b[32]; @@ -1102,4 +1103,1817 @@ static void dct_body_second_pass_rd(const int16x8_t *in, int16x8_t *out) { #undef BUTTERFLY_ONE_S32 #undef BUTTERFLY_TWO_S32 +#if CONFIG_VP9_HIGHBITDEPTH + +// Store 32 32x4 vectors, assuming stride == 32. +static INLINE void store32x32_s32( + tran_low_t *a, const int32x4_t *l1 /*[16]*/, const int32x4_t *r1 /*[16]*/, + const int32x4_t *l2 /*[16]*/, const int32x4_t *r2 /*[16]*/, + const int32x4_t *l3 /*[16]*/, const int32x4_t *r3 /*[16]*/, + const int32x4_t *l4 /*[16]*/, const int32x4_t *r4 /*[16]*/) { + int i; + for (i = 0; i < 32; i++) { + vst1q_s32(a, l1[i]); + vst1q_s32(a + 4, r1[i]); + vst1q_s32(a + 8, l2[i]); + vst1q_s32(a + 12, r2[i]); + vst1q_s32(a + 16, l3[i]); + vst1q_s32(a + 20, r3[i]); + vst1q_s32(a + 24, l4[i]); + vst1q_s32(a + 28, r4[i]); + a += 32; + } +} + +static INLINE void highbd_scale_input(const int16x8_t *a /*[32]*/, + int32x4_t *left /*[32]*/, + int32x4_t *right /* [32] */) { + left[0] = vshll_n_s16(vget_low_s16(a[0]), 2); + left[1] = vshll_n_s16(vget_low_s16(a[1]), 2); + left[2] = vshll_n_s16(vget_low_s16(a[2]), 2); + left[3] = vshll_n_s16(vget_low_s16(a[3]), 2); + left[4] = vshll_n_s16(vget_low_s16(a[4]), 2); + left[5] = vshll_n_s16(vget_low_s16(a[5]), 2); + left[6] = vshll_n_s16(vget_low_s16(a[6]), 2); + left[7] = vshll_n_s16(vget_low_s16(a[7]), 2); + left[8] = vshll_n_s16(vget_low_s16(a[8]), 2); + left[9] = vshll_n_s16(vget_low_s16(a[9]), 2); + left[10] = vshll_n_s16(vget_low_s16(a[10]), 2); + left[11] = vshll_n_s16(vget_low_s16(a[11]), 2); + left[12] = vshll_n_s16(vget_low_s16(a[12]), 2); + left[13] = vshll_n_s16(vget_low_s16(a[13]), 2); + left[14] = vshll_n_s16(vget_low_s16(a[14]), 2); + left[15] = vshll_n_s16(vget_low_s16(a[15]), 2); + left[16] = vshll_n_s16(vget_low_s16(a[16]), 2); + left[17] = vshll_n_s16(vget_low_s16(a[17]), 2); + left[18] = vshll_n_s16(vget_low_s16(a[18]), 2); + left[19] = vshll_n_s16(vget_low_s16(a[19]), 2); + left[20] = vshll_n_s16(vget_low_s16(a[20]), 2); + left[21] = vshll_n_s16(vget_low_s16(a[21]), 2); + left[22] = vshll_n_s16(vget_low_s16(a[22]), 2); + left[23] = vshll_n_s16(vget_low_s16(a[23]), 2); + left[24] = vshll_n_s16(vget_low_s16(a[24]), 2); + left[25] = vshll_n_s16(vget_low_s16(a[25]), 2); + left[26] = vshll_n_s16(vget_low_s16(a[26]), 2); + left[27] = vshll_n_s16(vget_low_s16(a[27]), 2); + left[28] = vshll_n_s16(vget_low_s16(a[28]), 2); + left[29] = vshll_n_s16(vget_low_s16(a[29]), 2); + left[30] = vshll_n_s16(vget_low_s16(a[30]), 2); + left[31] = vshll_n_s16(vget_low_s16(a[31]), 2); + + right[0] = vshll_n_s16(vget_high_s16(a[0]), 2); + right[1] = vshll_n_s16(vget_high_s16(a[1]), 2); + right[2] = vshll_n_s16(vget_high_s16(a[2]), 2); + right[3] = vshll_n_s16(vget_high_s16(a[3]), 2); + right[4] = vshll_n_s16(vget_high_s16(a[4]), 2); + right[5] = vshll_n_s16(vget_high_s16(a[5]), 2); + right[6] = vshll_n_s16(vget_high_s16(a[6]), 2); + right[7] = vshll_n_s16(vget_high_s16(a[7]), 2); + right[8] = vshll_n_s16(vget_high_s16(a[8]), 2); + right[9] = vshll_n_s16(vget_high_s16(a[9]), 2); + right[10] = vshll_n_s16(vget_high_s16(a[10]), 2); + right[11] = vshll_n_s16(vget_high_s16(a[11]), 2); + right[12] = vshll_n_s16(vget_high_s16(a[12]), 2); + right[13] = vshll_n_s16(vget_high_s16(a[13]), 2); + right[14] = vshll_n_s16(vget_high_s16(a[14]), 2); + right[15] = vshll_n_s16(vget_high_s16(a[15]), 2); + right[16] = vshll_n_s16(vget_high_s16(a[16]), 2); + right[17] = vshll_n_s16(vget_high_s16(a[17]), 2); + right[18] = vshll_n_s16(vget_high_s16(a[18]), 2); + right[19] = vshll_n_s16(vget_high_s16(a[19]), 2); + right[20] = vshll_n_s16(vget_high_s16(a[20]), 2); + right[21] = vshll_n_s16(vget_high_s16(a[21]), 2); + right[22] = vshll_n_s16(vget_high_s16(a[22]), 2); + right[23] = vshll_n_s16(vget_high_s16(a[23]), 2); + right[24] = vshll_n_s16(vget_high_s16(a[24]), 2); + right[25] = vshll_n_s16(vget_high_s16(a[25]), 2); + right[26] = vshll_n_s16(vget_high_s16(a[26]), 2); + right[27] = vshll_n_s16(vget_high_s16(a[27]), 2); + right[28] = vshll_n_s16(vget_high_s16(a[28]), 2); + right[29] = vshll_n_s16(vget_high_s16(a[29]), 2); + right[30] = vshll_n_s16(vget_high_s16(a[30]), 2); + right[31] = vshll_n_s16(vget_high_s16(a[31]), 2); +} + +static INLINE void highbd_cross_input(const int32x4_t *a_left /*[32]*/, + int32x4_t *a_right /*[32]*/, + int32x4_t *b_left /*[32]*/, + int32x4_t *b_right /*[32]*/) { + // Stage 1. Done as part of the load for the first pass. + b_left[0] = vaddq_s32(a_left[0], a_left[31]); + b_left[1] = vaddq_s32(a_left[1], a_left[30]); + b_left[2] = vaddq_s32(a_left[2], a_left[29]); + b_left[3] = vaddq_s32(a_left[3], a_left[28]); + b_left[4] = vaddq_s32(a_left[4], a_left[27]); + b_left[5] = vaddq_s32(a_left[5], a_left[26]); + b_left[6] = vaddq_s32(a_left[6], a_left[25]); + b_left[7] = vaddq_s32(a_left[7], a_left[24]); + b_left[8] = vaddq_s32(a_left[8], a_left[23]); + b_left[9] = vaddq_s32(a_left[9], a_left[22]); + b_left[10] = vaddq_s32(a_left[10], a_left[21]); + b_left[11] = vaddq_s32(a_left[11], a_left[20]); + b_left[12] = vaddq_s32(a_left[12], a_left[19]); + b_left[13] = vaddq_s32(a_left[13], a_left[18]); + b_left[14] = vaddq_s32(a_left[14], a_left[17]); + b_left[15] = vaddq_s32(a_left[15], a_left[16]); + + b_right[0] = vaddq_s32(a_right[0], a_right[31]); + b_right[1] = vaddq_s32(a_right[1], a_right[30]); + b_right[2] = vaddq_s32(a_right[2], a_right[29]); + b_right[3] = vaddq_s32(a_right[3], a_right[28]); + b_right[4] = vaddq_s32(a_right[4], a_right[27]); + b_right[5] = vaddq_s32(a_right[5], a_right[26]); + b_right[6] = vaddq_s32(a_right[6], a_right[25]); + b_right[7] = vaddq_s32(a_right[7], a_right[24]); + b_right[8] = vaddq_s32(a_right[8], a_right[23]); + b_right[9] = vaddq_s32(a_right[9], a_right[22]); + b_right[10] = vaddq_s32(a_right[10], a_right[21]); + b_right[11] = vaddq_s32(a_right[11], a_right[20]); + b_right[12] = vaddq_s32(a_right[12], a_right[19]); + b_right[13] = vaddq_s32(a_right[13], a_right[18]); + b_right[14] = vaddq_s32(a_right[14], a_right[17]); + b_right[15] = vaddq_s32(a_right[15], a_right[16]); + + b_left[16] = vsubq_s32(a_left[15], a_left[16]); + b_left[17] = vsubq_s32(a_left[14], a_left[17]); + b_left[18] = vsubq_s32(a_left[13], a_left[18]); + b_left[19] = vsubq_s32(a_left[12], a_left[19]); + b_left[20] = vsubq_s32(a_left[11], a_left[20]); + b_left[21] = vsubq_s32(a_left[10], a_left[21]); + b_left[22] = vsubq_s32(a_left[9], a_left[22]); + b_left[23] = vsubq_s32(a_left[8], a_left[23]); + b_left[24] = vsubq_s32(a_left[7], a_left[24]); + b_left[25] = vsubq_s32(a_left[6], a_left[25]); + b_left[26] = vsubq_s32(a_left[5], a_left[26]); + b_left[27] = vsubq_s32(a_left[4], a_left[27]); + b_left[28] = vsubq_s32(a_left[3], a_left[28]); + b_left[29] = vsubq_s32(a_left[2], a_left[29]); + b_left[30] = vsubq_s32(a_left[1], a_left[30]); + b_left[31] = vsubq_s32(a_left[0], a_left[31]); + + b_right[16] = vsubq_s32(a_right[15], a_right[16]); + b_right[17] = vsubq_s32(a_right[14], a_right[17]); + b_right[18] = vsubq_s32(a_right[13], a_right[18]); + b_right[19] = vsubq_s32(a_right[12], a_right[19]); + b_right[20] = vsubq_s32(a_right[11], a_right[20]); + b_right[21] = vsubq_s32(a_right[10], a_right[21]); + b_right[22] = vsubq_s32(a_right[9], a_right[22]); + b_right[23] = vsubq_s32(a_right[8], a_right[23]); + b_right[24] = vsubq_s32(a_right[7], a_right[24]); + b_right[25] = vsubq_s32(a_right[6], a_right[25]); + b_right[26] = vsubq_s32(a_right[5], a_right[26]); + b_right[27] = vsubq_s32(a_right[4], a_right[27]); + b_right[28] = vsubq_s32(a_right[3], a_right[28]); + b_right[29] = vsubq_s32(a_right[2], a_right[29]); + b_right[30] = vsubq_s32(a_right[1], a_right[30]); + b_right[31] = vsubq_s32(a_right[0], a_right[31]); +} + +static INLINE void highbd_partial_add_round_shift(int32x4_t *left /*[32]*/, + int32x4_t *right /* [32] */) { + // Also compute partial rounding shift: + // output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; + + left[0] = add_round_shift_s32(left[0]); + left[1] = add_round_shift_s32(left[1]); + left[2] = add_round_shift_s32(left[2]); + left[3] = add_round_shift_s32(left[3]); + left[4] = add_round_shift_s32(left[4]); + left[5] = add_round_shift_s32(left[5]); + left[6] = add_round_shift_s32(left[6]); + left[7] = add_round_shift_s32(left[7]); + left[8] = add_round_shift_s32(left[8]); + left[9] = add_round_shift_s32(left[9]); + left[10] = add_round_shift_s32(left[10]); + left[11] = add_round_shift_s32(left[11]); + left[12] = add_round_shift_s32(left[12]); + left[13] = add_round_shift_s32(left[13]); + left[14] = add_round_shift_s32(left[14]); + left[15] = add_round_shift_s32(left[15]); + left[16] = add_round_shift_s32(left[16]); + left[17] = add_round_shift_s32(left[17]); + left[18] = add_round_shift_s32(left[18]); + left[19] = add_round_shift_s32(left[19]); + left[20] = add_round_shift_s32(left[20]); + left[21] = add_round_shift_s32(left[21]); + left[22] = add_round_shift_s32(left[22]); + left[23] = add_round_shift_s32(left[23]); + left[24] = add_round_shift_s32(left[24]); + left[25] = add_round_shift_s32(left[25]); + left[26] = add_round_shift_s32(left[26]); + left[27] = add_round_shift_s32(left[27]); + left[28] = add_round_shift_s32(left[28]); + left[29] = add_round_shift_s32(left[29]); + left[30] = add_round_shift_s32(left[30]); + left[31] = add_round_shift_s32(left[31]); + + right[0] = add_round_shift_s32(right[0]); + right[1] = add_round_shift_s32(right[1]); + right[2] = add_round_shift_s32(right[2]); + right[3] = add_round_shift_s32(right[3]); + right[4] = add_round_shift_s32(right[4]); + right[5] = add_round_shift_s32(right[5]); + right[6] = add_round_shift_s32(right[6]); + right[7] = add_round_shift_s32(right[7]); + right[8] = add_round_shift_s32(right[8]); + right[9] = add_round_shift_s32(right[9]); + right[10] = add_round_shift_s32(right[10]); + right[11] = add_round_shift_s32(right[11]); + right[12] = add_round_shift_s32(right[12]); + right[13] = add_round_shift_s32(right[13]); + right[14] = add_round_shift_s32(right[14]); + right[15] = add_round_shift_s32(right[15]); + right[16] = add_round_shift_s32(right[16]); + right[17] = add_round_shift_s32(right[17]); + right[18] = add_round_shift_s32(right[18]); + right[19] = add_round_shift_s32(right[19]); + right[20] = add_round_shift_s32(right[20]); + right[21] = add_round_shift_s32(right[21]); + right[22] = add_round_shift_s32(right[22]); + right[23] = add_round_shift_s32(right[23]); + right[24] = add_round_shift_s32(right[24]); + right[25] = add_round_shift_s32(right[25]); + right[26] = add_round_shift_s32(right[26]); + right[27] = add_round_shift_s32(right[27]); + right[28] = add_round_shift_s32(right[28]); + right[29] = add_round_shift_s32(right[29]); + right[30] = add_round_shift_s32(right[30]); + right[31] = add_round_shift_s32(right[31]); +} + +static INLINE void highbd_partial_sub_round_shift(int32x4_t *left /*[32]*/, + int32x4_t *right /* [32] */) { + // Also compute partial rounding shift: + // output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; + + left[0] = sub_round_shift_s32(left[0]); + left[1] = sub_round_shift_s32(left[1]); + left[2] = sub_round_shift_s32(left[2]); + left[3] = sub_round_shift_s32(left[3]); + left[4] = sub_round_shift_s32(left[4]); + left[5] = sub_round_shift_s32(left[5]); + left[6] = sub_round_shift_s32(left[6]); + left[7] = sub_round_shift_s32(left[7]); + left[8] = sub_round_shift_s32(left[8]); + left[9] = sub_round_shift_s32(left[9]); + left[10] = sub_round_shift_s32(left[10]); + left[11] = sub_round_shift_s32(left[11]); + left[12] = sub_round_shift_s32(left[12]); + left[13] = sub_round_shift_s32(left[13]); + left[14] = sub_round_shift_s32(left[14]); + left[15] = sub_round_shift_s32(left[15]); + left[16] = sub_round_shift_s32(left[16]); + left[17] = sub_round_shift_s32(left[17]); + left[18] = sub_round_shift_s32(left[18]); + left[19] = sub_round_shift_s32(left[19]); + left[20] = sub_round_shift_s32(left[20]); + left[21] = sub_round_shift_s32(left[21]); + left[22] = sub_round_shift_s32(left[22]); + left[23] = sub_round_shift_s32(left[23]); + left[24] = sub_round_shift_s32(left[24]); + left[25] = sub_round_shift_s32(left[25]); + left[26] = sub_round_shift_s32(left[26]); + left[27] = sub_round_shift_s32(left[27]); + left[28] = sub_round_shift_s32(left[28]); + left[29] = sub_round_shift_s32(left[29]); + left[30] = sub_round_shift_s32(left[30]); + left[31] = sub_round_shift_s32(left[31]); + + right[0] = sub_round_shift_s32(right[0]); + right[1] = sub_round_shift_s32(right[1]); + right[2] = sub_round_shift_s32(right[2]); + right[3] = sub_round_shift_s32(right[3]); + right[4] = sub_round_shift_s32(right[4]); + right[5] = sub_round_shift_s32(right[5]); + right[6] = sub_round_shift_s32(right[6]); + right[7] = sub_round_shift_s32(right[7]); + right[8] = sub_round_shift_s32(right[8]); + right[9] = sub_round_shift_s32(right[9]); + right[10] = sub_round_shift_s32(right[10]); + right[11] = sub_round_shift_s32(right[11]); + right[12] = sub_round_shift_s32(right[12]); + right[13] = sub_round_shift_s32(right[13]); + right[14] = sub_round_shift_s32(right[14]); + right[15] = sub_round_shift_s32(right[15]); + right[16] = sub_round_shift_s32(right[16]); + right[17] = sub_round_shift_s32(right[17]); + right[18] = sub_round_shift_s32(right[18]); + right[19] = sub_round_shift_s32(right[19]); + right[20] = sub_round_shift_s32(right[20]); + right[21] = sub_round_shift_s32(right[21]); + right[22] = sub_round_shift_s32(right[22]); + right[23] = sub_round_shift_s32(right[23]); + right[24] = sub_round_shift_s32(right[24]); + right[25] = sub_round_shift_s32(right[25]); + right[26] = sub_round_shift_s32(right[26]); + right[27] = sub_round_shift_s32(right[27]); + right[28] = sub_round_shift_s32(right[28]); + right[29] = sub_round_shift_s32(right[29]); + right[30] = sub_round_shift_s32(right[30]); + right[31] = sub_round_shift_s32(right[31]); +} + +static INLINE void highbd_dct8x32_body_first_pass(int32x4_t *left /*32*/, + int32x4_t *right /*32*/) { + int32x4_t al[32], ar[32]; + int32x4_t bl[32], br[32]; + + // Stage 1: Done as part of the load. + + // Stage 2. + // Mini cross. X the first 16 values and the middle 8 of the second half. + al[0] = vaddq_s32(left[0], left[15]); + ar[0] = vaddq_s32(right[0], right[15]); + al[1] = vaddq_s32(left[1], left[14]); + ar[1] = vaddq_s32(right[1], right[14]); + al[2] = vaddq_s32(left[2], left[13]); + ar[2] = vaddq_s32(right[2], right[13]); + al[3] = vaddq_s32(left[3], left[12]); + ar[3] = vaddq_s32(right[3], right[12]); + al[4] = vaddq_s32(left[4], left[11]); + ar[4] = vaddq_s32(right[4], right[11]); + al[5] = vaddq_s32(left[5], left[10]); + ar[5] = vaddq_s32(right[5], right[10]); + al[6] = vaddq_s32(left[6], left[9]); + ar[6] = vaddq_s32(right[6], right[9]); + al[7] = vaddq_s32(left[7], left[8]); + ar[7] = vaddq_s32(right[7], right[8]); + + al[8] = vsubq_s32(left[7], left[8]); + ar[8] = vsubq_s32(right[7], right[8]); + al[9] = vsubq_s32(left[6], left[9]); + ar[9] = vsubq_s32(right[6], right[9]); + al[10] = vsubq_s32(left[5], left[10]); + ar[10] = vsubq_s32(right[5], right[10]); + al[11] = vsubq_s32(left[4], left[11]); + ar[11] = vsubq_s32(right[4], right[11]); + al[12] = vsubq_s32(left[3], left[12]); + ar[12] = vsubq_s32(right[3], right[12]); + al[13] = vsubq_s32(left[2], left[13]); + ar[13] = vsubq_s32(right[2], right[13]); + al[14] = vsubq_s32(left[1], left[14]); + ar[14] = vsubq_s32(right[1], right[14]); + al[15] = vsubq_s32(left[0], left[15]); + ar[15] = vsubq_s32(right[0], right[15]); + + al[16] = left[16]; + ar[16] = right[16]; + al[17] = left[17]; + ar[17] = right[17]; + al[18] = left[18]; + ar[18] = right[18]; + al[19] = left[19]; + ar[19] = right[19]; + + butterfly_one_coeff_s32_fast(left[27], right[27], left[20], right[20], + cospi_16_64, &al[27], &ar[27], &al[20], &ar[20]); + butterfly_one_coeff_s32_fast(left[26], right[26], left[21], right[21], + cospi_16_64, &al[26], &ar[26], &al[21], &ar[21]); + butterfly_one_coeff_s32_fast(left[25], right[25], left[22], right[22], + cospi_16_64, &al[25], &ar[25], &al[22], &ar[22]); + butterfly_one_coeff_s32_fast(left[24], right[24], left[23], right[23], + cospi_16_64, &al[24], &ar[24], &al[23], &ar[23]); + + al[28] = left[28]; + ar[28] = right[28]; + al[29] = left[29]; + ar[29] = right[29]; + al[30] = left[30]; + ar[30] = right[30]; + al[31] = left[31]; + ar[31] = right[31]; + + // Stage 3. + bl[0] = vaddq_s32(al[0], al[7]); + br[0] = vaddq_s32(ar[0], ar[7]); + bl[1] = vaddq_s32(al[1], al[6]); + br[1] = vaddq_s32(ar[1], ar[6]); + bl[2] = vaddq_s32(al[2], al[5]); + br[2] = vaddq_s32(ar[2], ar[5]); + bl[3] = vaddq_s32(al[3], al[4]); + br[3] = vaddq_s32(ar[3], ar[4]); + + bl[4] = vsubq_s32(al[3], al[4]); + br[4] = vsubq_s32(ar[3], ar[4]); + bl[5] = vsubq_s32(al[2], al[5]); + br[5] = vsubq_s32(ar[2], ar[5]); + bl[6] = vsubq_s32(al[1], al[6]); + br[6] = vsubq_s32(ar[1], ar[6]); + bl[7] = vsubq_s32(al[0], al[7]); + br[7] = vsubq_s32(ar[0], ar[7]); + + bl[8] = al[8]; + br[8] = ar[8]; + bl[9] = al[9]; + br[9] = ar[9]; + + butterfly_one_coeff_s32_fast(al[13], ar[13], al[10], ar[10], cospi_16_64, + &bl[13], &br[13], &bl[10], &br[10]); + butterfly_one_coeff_s32_fast(al[12], ar[12], al[11], ar[11], cospi_16_64, + &bl[12], &br[12], &bl[11], &br[11]); + + bl[14] = al[14]; + br[14] = ar[14]; + bl[15] = al[15]; + br[15] = ar[15]; + + bl[16] = vaddq_s32(left[16], al[23]); + br[16] = vaddq_s32(right[16], ar[23]); + bl[17] = vaddq_s32(left[17], al[22]); + br[17] = vaddq_s32(right[17], ar[22]); + bl[18] = vaddq_s32(left[18], al[21]); + br[18] = vaddq_s32(right[18], ar[21]); + bl[19] = vaddq_s32(left[19], al[20]); + br[19] = vaddq_s32(right[19], ar[20]); + + bl[20] = vsubq_s32(left[19], al[20]); + br[20] = vsubq_s32(right[19], ar[20]); + bl[21] = vsubq_s32(left[18], al[21]); + br[21] = vsubq_s32(right[18], ar[21]); + bl[22] = vsubq_s32(left[17], al[22]); + br[22] = vsubq_s32(right[17], ar[22]); + bl[23] = vsubq_s32(left[16], al[23]); + br[23] = vsubq_s32(right[16], ar[23]); + + bl[24] = vsubq_s32(left[31], al[24]); + br[24] = vsubq_s32(right[31], ar[24]); + bl[25] = vsubq_s32(left[30], al[25]); + br[25] = vsubq_s32(right[30], ar[25]); + bl[26] = vsubq_s32(left[29], al[26]); + br[26] = vsubq_s32(right[29], ar[26]); + bl[27] = vsubq_s32(left[28], al[27]); + br[27] = vsubq_s32(right[28], ar[27]); + + bl[28] = vaddq_s32(left[28], al[27]); + br[28] = vaddq_s32(right[28], ar[27]); + bl[29] = vaddq_s32(left[29], al[26]); + br[29] = vaddq_s32(right[29], ar[26]); + bl[30] = vaddq_s32(left[30], al[25]); + br[30] = vaddq_s32(right[30], ar[25]); + bl[31] = vaddq_s32(left[31], al[24]); + br[31] = vaddq_s32(right[31], ar[24]); + + // Stage 4. + al[0] = vaddq_s32(bl[0], bl[3]); + ar[0] = vaddq_s32(br[0], br[3]); + al[1] = vaddq_s32(bl[1], bl[2]); + ar[1] = vaddq_s32(br[1], br[2]); + al[2] = vsubq_s32(bl[1], bl[2]); + ar[2] = vsubq_s32(br[1], br[2]); + al[3] = vsubq_s32(bl[0], bl[3]); + ar[3] = vsubq_s32(br[0], br[3]); + + al[4] = bl[4]; + ar[4] = br[4]; + + butterfly_one_coeff_s32_fast(bl[6], br[6], bl[5], br[5], cospi_16_64, &al[6], + &ar[6], &al[5], &ar[5]); + + al[7] = bl[7]; + ar[7] = br[7]; + + al[8] = vaddq_s32(bl[8], bl[11]); + ar[8] = vaddq_s32(br[8], br[11]); + al[9] = vaddq_s32(bl[9], bl[10]); + ar[9] = vaddq_s32(br[9], br[10]); + al[10] = vsubq_s32(bl[9], bl[10]); + ar[10] = vsubq_s32(br[9], br[10]); + al[11] = vsubq_s32(bl[8], bl[11]); + ar[11] = vsubq_s32(br[8], br[11]); + al[12] = vsubq_s32(bl[15], bl[12]); + ar[12] = vsubq_s32(br[15], br[12]); + al[13] = vsubq_s32(bl[14], bl[13]); + ar[13] = vsubq_s32(br[14], br[13]); + al[14] = vaddq_s32(bl[14], bl[13]); + ar[14] = vaddq_s32(br[14], br[13]); + al[15] = vaddq_s32(bl[15], bl[12]); + ar[15] = vaddq_s32(br[15], br[12]); + + al[16] = bl[16]; + ar[16] = br[16]; + al[17] = bl[17]; + ar[17] = br[17]; + + butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18], cospi_8_64, + cospi_24_64, &al[29], &ar[29], &al[18], + &ar[18]); + butterfly_two_coeff_s32_s64_narrow(bl[28], br[28], bl[19], br[19], cospi_8_64, + cospi_24_64, &al[28], &ar[28], &al[19], + &ar[19]); + butterfly_two_coeff_s32_s64_narrow(bl[27], br[27], bl[20], br[20], + cospi_24_64, -cospi_8_64, &al[27], &ar[27], + &al[20], &ar[20]); + butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21], + cospi_24_64, -cospi_8_64, &al[26], &ar[26], + &al[21], &ar[21]); + + al[22] = bl[22]; + ar[22] = br[22]; + al[23] = bl[23]; + ar[23] = br[23]; + al[24] = bl[24]; + ar[24] = br[24]; + al[25] = bl[25]; + ar[25] = br[25]; + + al[30] = bl[30]; + ar[30] = br[30]; + al[31] = bl[31]; + ar[31] = br[31]; + + // Stage 5. + butterfly_one_coeff_s32_fast(al[0], ar[0], al[1], ar[1], cospi_16_64, &bl[0], + &br[0], &bl[1], &br[1]); + butterfly_two_coeff_s32_s64_narrow(al[3], ar[3], al[2], ar[2], cospi_8_64, + cospi_24_64, &bl[2], &br[2], &bl[3], + &br[3]); + + bl[4] = vaddq_s32(al[4], al[5]); + br[4] = vaddq_s32(ar[4], ar[5]); + bl[5] = vsubq_s32(al[4], al[5]); + br[5] = vsubq_s32(ar[4], ar[5]); + bl[6] = vsubq_s32(al[7], al[6]); + br[6] = vsubq_s32(ar[7], ar[6]); + bl[7] = vaddq_s32(al[7], al[6]); + br[7] = vaddq_s32(ar[7], ar[6]); + + bl[8] = al[8]; + br[8] = ar[8]; + + butterfly_two_coeff_s32_s64_narrow(al[14], ar[14], al[9], ar[9], cospi_8_64, + cospi_24_64, &bl[14], &br[14], &bl[9], + &br[9]); + butterfly_two_coeff_s32_s64_narrow(al[13], ar[13], al[10], ar[10], + cospi_24_64, -cospi_8_64, &bl[13], &br[13], + &bl[10], &br[10]); + + bl[11] = al[11]; + br[11] = ar[11]; + bl[12] = al[12]; + br[12] = ar[12]; + + bl[15] = al[15]; + br[15] = ar[15]; + + bl[16] = vaddq_s32(al[19], al[16]); + br[16] = vaddq_s32(ar[19], ar[16]); + bl[17] = vaddq_s32(al[18], al[17]); + br[17] = vaddq_s32(ar[18], ar[17]); + bl[18] = vsubq_s32(al[17], al[18]); + br[18] = vsubq_s32(ar[17], ar[18]); + bl[19] = vsubq_s32(al[16], al[19]); + br[19] = vsubq_s32(ar[16], ar[19]); + bl[20] = vsubq_s32(al[23], al[20]); + br[20] = vsubq_s32(ar[23], ar[20]); + bl[21] = vsubq_s32(al[22], al[21]); + br[21] = vsubq_s32(ar[22], ar[21]); + bl[22] = vaddq_s32(al[21], al[22]); + br[22] = vaddq_s32(ar[21], ar[22]); + bl[23] = vaddq_s32(al[20], al[23]); + br[23] = vaddq_s32(ar[20], ar[23]); + bl[24] = vaddq_s32(al[27], al[24]); + br[24] = vaddq_s32(ar[27], ar[24]); + bl[25] = vaddq_s32(al[26], al[25]); + br[25] = vaddq_s32(ar[26], ar[25]); + bl[26] = vsubq_s32(al[25], al[26]); + br[26] = vsubq_s32(ar[25], ar[26]); + bl[27] = vsubq_s32(al[24], al[27]); + br[27] = vsubq_s32(ar[24], ar[27]); + bl[28] = vsubq_s32(al[31], al[28]); + br[28] = vsubq_s32(ar[31], ar[28]); + bl[29] = vsubq_s32(al[30], al[29]); + br[29] = vsubq_s32(ar[30], ar[29]); + bl[30] = vaddq_s32(al[29], al[30]); + br[30] = vaddq_s32(ar[29], ar[30]); + bl[31] = vaddq_s32(al[28], al[31]); + br[31] = vaddq_s32(ar[28], ar[31]); + + // Stage 6. + al[0] = bl[0]; + ar[0] = br[0]; + al[1] = bl[1]; + ar[1] = br[1]; + al[2] = bl[2]; + ar[2] = br[2]; + al[3] = bl[3]; + ar[3] = br[3]; + + butterfly_two_coeff_s32_s64_narrow(bl[7], br[7], bl[4], br[4], cospi_4_64, + cospi_28_64, &al[4], &ar[4], &al[7], + &ar[7]); + butterfly_two_coeff_s32_s64_narrow(bl[6], br[6], bl[5], br[5], cospi_20_64, + cospi_12_64, &al[5], &ar[5], &al[6], + &ar[6]); + + al[8] = vaddq_s32(bl[8], bl[9]); + ar[8] = vaddq_s32(br[8], br[9]); + al[9] = vsubq_s32(bl[8], bl[9]); + ar[9] = vsubq_s32(br[8], br[9]); + al[10] = vsubq_s32(bl[11], bl[10]); + ar[10] = vsubq_s32(br[11], br[10]); + al[11] = vaddq_s32(bl[11], bl[10]); + ar[11] = vaddq_s32(br[11], br[10]); + al[12] = vaddq_s32(bl[12], bl[13]); + ar[12] = vaddq_s32(br[12], br[13]); + al[13] = vsubq_s32(bl[12], bl[13]); + ar[13] = vsubq_s32(br[12], br[13]); + al[14] = vsubq_s32(bl[15], bl[14]); + ar[14] = vsubq_s32(br[15], br[14]); + al[15] = vaddq_s32(bl[15], bl[14]); + ar[15] = vaddq_s32(br[15], br[14]); + + al[16] = bl[16]; + ar[16] = br[16]; + al[19] = bl[19]; + ar[19] = br[19]; + al[20] = bl[20]; + ar[20] = br[20]; + al[23] = bl[23]; + ar[23] = br[23]; + al[24] = bl[24]; + ar[24] = br[24]; + al[27] = bl[27]; + ar[27] = br[27]; + al[28] = bl[28]; + ar[28] = br[28]; + al[31] = bl[31]; + ar[31] = br[31]; + + butterfly_two_coeff_s32_s64_narrow(bl[30], br[30], bl[17], br[17], cospi_4_64, + cospi_28_64, &al[30], &ar[30], &al[17], + &ar[17]); + butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18], + cospi_28_64, -cospi_4_64, &al[29], &ar[29], + &al[18], &ar[18]); + butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21], + cospi_20_64, cospi_12_64, &al[26], &ar[26], + &al[21], &ar[21]); + butterfly_two_coeff_s32_s64_narrow(bl[25], br[25], bl[22], br[22], + cospi_12_64, -cospi_20_64, &al[25], + &ar[25], &al[22], &ar[22]); + + // Stage 7. + bl[0] = al[0]; + br[0] = ar[0]; + bl[1] = al[1]; + br[1] = ar[1]; + bl[2] = al[2]; + br[2] = ar[2]; + bl[3] = al[3]; + br[3] = ar[3]; + bl[4] = al[4]; + br[4] = ar[4]; + bl[5] = al[5]; + br[5] = ar[5]; + bl[6] = al[6]; + br[6] = ar[6]; + bl[7] = al[7]; + br[7] = ar[7]; + + butterfly_two_coeff_s32_s64_narrow(al[15], ar[15], al[8], ar[8], cospi_2_64, + cospi_30_64, &bl[8], &br[8], &bl[15], + &br[15]); + butterfly_two_coeff_s32_s64_narrow(al[14], ar[14], al[9], ar[9], cospi_18_64, + cospi_14_64, &bl[9], &br[9], &bl[14], + &br[14]); + butterfly_two_coeff_s32_s64_narrow(al[13], ar[13], al[10], ar[10], + cospi_10_64, cospi_22_64, &bl[10], &br[10], + &bl[13], &br[13]); + butterfly_two_coeff_s32_s64_narrow(al[12], ar[12], al[11], ar[11], + cospi_26_64, cospi_6_64, &bl[11], &br[11], + &bl[12], &br[12]); + + bl[16] = vaddq_s32(al[16], al[17]); + br[16] = vaddq_s32(ar[16], ar[17]); + bl[17] = vsubq_s32(al[16], al[17]); + br[17] = vsubq_s32(ar[16], ar[17]); + bl[18] = vsubq_s32(al[19], al[18]); + br[18] = vsubq_s32(ar[19], ar[18]); + bl[19] = vaddq_s32(al[19], al[18]); + br[19] = vaddq_s32(ar[19], ar[18]); + bl[20] = vaddq_s32(al[20], al[21]); + br[20] = vaddq_s32(ar[20], ar[21]); + bl[21] = vsubq_s32(al[20], al[21]); + br[21] = vsubq_s32(ar[20], ar[21]); + bl[22] = vsubq_s32(al[23], al[22]); + br[22] = vsubq_s32(ar[23], ar[22]); + bl[23] = vaddq_s32(al[23], al[22]); + br[23] = vaddq_s32(ar[23], ar[22]); + bl[24] = vaddq_s32(al[24], al[25]); + br[24] = vaddq_s32(ar[24], ar[25]); + bl[25] = vsubq_s32(al[24], al[25]); + br[25] = vsubq_s32(ar[24], ar[25]); + bl[26] = vsubq_s32(al[27], al[26]); + br[26] = vsubq_s32(ar[27], ar[26]); + bl[27] = vaddq_s32(al[27], al[26]); + br[27] = vaddq_s32(ar[27], ar[26]); + bl[28] = vaddq_s32(al[28], al[29]); + br[28] = vaddq_s32(ar[28], ar[29]); + bl[29] = vsubq_s32(al[28], al[29]); + br[29] = vsubq_s32(ar[28], ar[29]); + bl[30] = vsubq_s32(al[31], al[30]); + br[30] = vsubq_s32(ar[31], ar[30]); + bl[31] = vaddq_s32(al[31], al[30]); + br[31] = vaddq_s32(ar[31], ar[30]); + + // Final stage. + + left[0] = bl[0]; + right[0] = br[0]; + left[16] = bl[1]; + right[16] = br[1]; + left[8] = bl[2]; + right[8] = br[2]; + left[24] = bl[3]; + right[24] = br[3]; + left[4] = bl[4]; + right[4] = br[4]; + left[20] = bl[5]; + right[20] = br[5]; + left[12] = bl[6]; + right[12] = br[6]; + left[28] = bl[7]; + right[28] = br[7]; + left[2] = bl[8]; + right[2] = br[8]; + left[18] = bl[9]; + right[18] = br[9]; + left[10] = bl[10]; + right[10] = br[10]; + left[26] = bl[11]; + right[26] = br[11]; + left[6] = bl[12]; + right[6] = br[12]; + left[22] = bl[13]; + right[22] = br[13]; + left[14] = bl[14]; + right[14] = br[14]; + left[30] = bl[15]; + right[30] = br[15]; + + butterfly_two_coeff_s32_s64_narrow(bl[31], br[31], bl[16], br[16], cospi_1_64, + cospi_31_64, &al[1], &ar[1], &al[31], + &ar[31]); + left[1] = al[1]; + right[1] = ar[1]; + left[31] = al[31]; + right[31] = ar[31]; + + butterfly_two_coeff_s32_s64_narrow(bl[30], br[30], bl[17], br[17], + cospi_17_64, cospi_15_64, &al[17], &ar[17], + &al[15], &ar[15]); + left[17] = al[17]; + right[17] = ar[17]; + left[15] = al[15]; + right[15] = ar[15]; + + butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18], cospi_9_64, + cospi_23_64, &al[9], &ar[9], &al[23], + &ar[23]); + left[9] = al[9]; + right[9] = ar[9]; + left[23] = al[23]; + right[23] = ar[23]; + + butterfly_two_coeff_s32_s64_narrow(bl[28], br[28], bl[19], br[19], + cospi_25_64, cospi_7_64, &al[25], &ar[25], + &al[7], &ar[7]); + left[25] = al[25]; + right[25] = ar[25]; + left[7] = al[7]; + right[7] = ar[7]; + + butterfly_two_coeff_s32_s64_narrow(bl[27], br[27], bl[20], br[20], cospi_5_64, + cospi_27_64, &al[5], &ar[5], &al[27], + &ar[27]); + left[5] = al[5]; + right[5] = ar[5]; + left[27] = al[27]; + right[27] = ar[27]; + + butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21], + cospi_21_64, cospi_11_64, &al[21], &ar[21], + &al[11], &ar[11]); + left[21] = al[21]; + right[21] = ar[21]; + left[11] = al[11]; + right[11] = ar[11]; + + butterfly_two_coeff_s32_s64_narrow(bl[25], br[25], bl[22], br[22], + cospi_13_64, cospi_19_64, &al[13], &ar[13], + &al[19], &ar[19]); + left[13] = al[13]; + right[13] = ar[13]; + left[19] = al[19]; + right[19] = ar[19]; + + butterfly_two_coeff_s32_s64_narrow(bl[24], br[24], bl[23], br[23], + cospi_29_64, cospi_3_64, &al[29], &ar[29], + &al[3], &ar[3]); + left[29] = al[29]; + right[29] = ar[29]; + left[3] = al[3]; + right[3] = ar[3]; +} + +static INLINE void highbd_dct8x32_body_second_pass(int32x4_t *left /*32*/, + int32x4_t *right /*32*/) { + int32x4_t al[32], ar[32]; + int32x4_t bl[32], br[32]; + + // Stage 1: Done as part of the load. + + // Stage 2. + // Mini cross. X the first 16 values and the middle 8 of the second half. + al[0] = vaddq_s32(left[0], left[15]); + ar[0] = vaddq_s32(right[0], right[15]); + al[1] = vaddq_s32(left[1], left[14]); + ar[1] = vaddq_s32(right[1], right[14]); + al[2] = vaddq_s32(left[2], left[13]); + ar[2] = vaddq_s32(right[2], right[13]); + al[3] = vaddq_s32(left[3], left[12]); + ar[3] = vaddq_s32(right[3], right[12]); + al[4] = vaddq_s32(left[4], left[11]); + ar[4] = vaddq_s32(right[4], right[11]); + al[5] = vaddq_s32(left[5], left[10]); + ar[5] = vaddq_s32(right[5], right[10]); + al[6] = vaddq_s32(left[6], left[9]); + ar[6] = vaddq_s32(right[6], right[9]); + al[7] = vaddq_s32(left[7], left[8]); + ar[7] = vaddq_s32(right[7], right[8]); + + al[8] = vsubq_s32(left[7], left[8]); + ar[8] = vsubq_s32(right[7], right[8]); + al[9] = vsubq_s32(left[6], left[9]); + ar[9] = vsubq_s32(right[6], right[9]); + al[10] = vsubq_s32(left[5], left[10]); + ar[10] = vsubq_s32(right[5], right[10]); + al[11] = vsubq_s32(left[4], left[11]); + ar[11] = vsubq_s32(right[4], right[11]); + al[12] = vsubq_s32(left[3], left[12]); + ar[12] = vsubq_s32(right[3], right[12]); + al[13] = vsubq_s32(left[2], left[13]); + ar[13] = vsubq_s32(right[2], right[13]); + al[14] = vsubq_s32(left[1], left[14]); + ar[14] = vsubq_s32(right[1], right[14]); + al[15] = vsubq_s32(left[0], left[15]); + ar[15] = vsubq_s32(right[0], right[15]); + + al[16] = left[16]; + ar[16] = right[16]; + al[17] = left[17]; + ar[17] = right[17]; + al[18] = left[18]; + ar[18] = right[18]; + al[19] = left[19]; + ar[19] = right[19]; + + butterfly_one_coeff_s32_fast(left[27], right[27], left[20], right[20], + cospi_16_64, &al[27], &ar[27], &al[20], &ar[20]); + butterfly_one_coeff_s32_fast(left[26], right[26], left[21], right[21], + cospi_16_64, &al[26], &ar[26], &al[21], &ar[21]); + butterfly_one_coeff_s32_fast(left[25], right[25], left[22], right[22], + cospi_16_64, &al[25], &ar[25], &al[22], &ar[22]); + butterfly_one_coeff_s32_fast(left[24], right[24], left[23], right[23], + cospi_16_64, &al[24], &ar[24], &al[23], &ar[23]); + + al[28] = left[28]; + ar[28] = right[28]; + al[29] = left[29]; + ar[29] = right[29]; + al[30] = left[30]; + ar[30] = right[30]; + al[31] = left[31]; + ar[31] = right[31]; + + // Stage 3. + bl[0] = vaddq_s32(al[0], al[7]); + br[0] = vaddq_s32(ar[0], ar[7]); + bl[1] = vaddq_s32(al[1], al[6]); + br[1] = vaddq_s32(ar[1], ar[6]); + bl[2] = vaddq_s32(al[2], al[5]); + br[2] = vaddq_s32(ar[2], ar[5]); + bl[3] = vaddq_s32(al[3], al[4]); + br[3] = vaddq_s32(ar[3], ar[4]); + + bl[4] = vsubq_s32(al[3], al[4]); + br[4] = vsubq_s32(ar[3], ar[4]); + bl[5] = vsubq_s32(al[2], al[5]); + br[5] = vsubq_s32(ar[2], ar[5]); + bl[6] = vsubq_s32(al[1], al[6]); + br[6] = vsubq_s32(ar[1], ar[6]); + bl[7] = vsubq_s32(al[0], al[7]); + br[7] = vsubq_s32(ar[0], ar[7]); + + bl[8] = al[8]; + br[8] = ar[8]; + bl[9] = al[9]; + br[9] = ar[9]; + + butterfly_one_coeff_s32_fast(al[13], ar[13], al[10], ar[10], cospi_16_64, + &bl[13], &br[13], &bl[10], &br[10]); + butterfly_one_coeff_s32_fast(al[12], ar[12], al[11], ar[11], cospi_16_64, + &bl[12], &br[12], &bl[11], &br[11]); + + bl[14] = al[14]; + br[14] = ar[14]; + bl[15] = al[15]; + br[15] = ar[15]; + + bl[16] = vaddq_s32(left[16], al[23]); + br[16] = vaddq_s32(right[16], ar[23]); + bl[17] = vaddq_s32(left[17], al[22]); + br[17] = vaddq_s32(right[17], ar[22]); + bl[18] = vaddq_s32(left[18], al[21]); + br[18] = vaddq_s32(right[18], ar[21]); + bl[19] = vaddq_s32(left[19], al[20]); + br[19] = vaddq_s32(right[19], ar[20]); + + bl[20] = vsubq_s32(left[19], al[20]); + br[20] = vsubq_s32(right[19], ar[20]); + bl[21] = vsubq_s32(left[18], al[21]); + br[21] = vsubq_s32(right[18], ar[21]); + bl[22] = vsubq_s32(left[17], al[22]); + br[22] = vsubq_s32(right[17], ar[22]); + bl[23] = vsubq_s32(left[16], al[23]); + br[23] = vsubq_s32(right[16], ar[23]); + + bl[24] = vsubq_s32(left[31], al[24]); + br[24] = vsubq_s32(right[31], ar[24]); + bl[25] = vsubq_s32(left[30], al[25]); + br[25] = vsubq_s32(right[30], ar[25]); + bl[26] = vsubq_s32(left[29], al[26]); + br[26] = vsubq_s32(right[29], ar[26]); + bl[27] = vsubq_s32(left[28], al[27]); + br[27] = vsubq_s32(right[28], ar[27]); + + bl[28] = vaddq_s32(left[28], al[27]); + br[28] = vaddq_s32(right[28], ar[27]); + bl[29] = vaddq_s32(left[29], al[26]); + br[29] = vaddq_s32(right[29], ar[26]); + bl[30] = vaddq_s32(left[30], al[25]); + br[30] = vaddq_s32(right[30], ar[25]); + bl[31] = vaddq_s32(left[31], al[24]); + br[31] = vaddq_s32(right[31], ar[24]); + + // Stage 4. + al[0] = vaddq_s32(bl[0], bl[3]); + ar[0] = vaddq_s32(br[0], br[3]); + al[1] = vaddq_s32(bl[1], bl[2]); + ar[1] = vaddq_s32(br[1], br[2]); + al[2] = vsubq_s32(bl[1], bl[2]); + ar[2] = vsubq_s32(br[1], br[2]); + al[3] = vsubq_s32(bl[0], bl[3]); + ar[3] = vsubq_s32(br[0], br[3]); + + al[4] = bl[4]; + ar[4] = br[4]; + + butterfly_one_coeff_s32_fast(bl[6], br[6], bl[5], br[5], cospi_16_64, &al[6], + &ar[6], &al[5], &ar[5]); + + al[7] = bl[7]; + ar[7] = br[7]; + + al[8] = vaddq_s32(bl[8], bl[11]); + ar[8] = vaddq_s32(br[8], br[11]); + al[9] = vaddq_s32(bl[9], bl[10]); + ar[9] = vaddq_s32(br[9], br[10]); + al[10] = vsubq_s32(bl[9], bl[10]); + ar[10] = vsubq_s32(br[9], br[10]); + al[11] = vsubq_s32(bl[8], bl[11]); + ar[11] = vsubq_s32(br[8], br[11]); + al[12] = vsubq_s32(bl[15], bl[12]); + ar[12] = vsubq_s32(br[15], br[12]); + al[13] = vsubq_s32(bl[14], bl[13]); + ar[13] = vsubq_s32(br[14], br[13]); + al[14] = vaddq_s32(bl[14], bl[13]); + ar[14] = vaddq_s32(br[14], br[13]); + al[15] = vaddq_s32(bl[15], bl[12]); + ar[15] = vaddq_s32(br[15], br[12]); + + al[16] = bl[16]; + ar[16] = br[16]; + al[17] = bl[17]; + ar[17] = br[17]; + + butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18], cospi_8_64, + cospi_24_64, &al[29], &ar[29], &al[18], + &ar[18]); + butterfly_two_coeff_s32_s64_narrow(bl[28], br[28], bl[19], br[19], cospi_8_64, + cospi_24_64, &al[28], &ar[28], &al[19], + &ar[19]); + butterfly_two_coeff_s32_s64_narrow(bl[27], br[27], bl[20], br[20], + cospi_24_64, -cospi_8_64, &al[27], &ar[27], + &al[20], &ar[20]); + butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21], + cospi_24_64, -cospi_8_64, &al[26], &ar[26], + &al[21], &ar[21]); + + al[22] = bl[22]; + ar[22] = br[22]; + al[23] = bl[23]; + ar[23] = br[23]; + al[24] = bl[24]; + ar[24] = br[24]; + al[25] = bl[25]; + ar[25] = br[25]; + + al[30] = bl[30]; + ar[30] = br[30]; + al[31] = bl[31]; + ar[31] = br[31]; + + // Stage 5. + butterfly_one_coeff_s32_fast(al[0], ar[0], al[1], ar[1], cospi_16_64, &bl[0], + &br[0], &bl[1], &br[1]); + butterfly_two_coeff_s32_s64_narrow(al[3], ar[3], al[2], ar[2], cospi_8_64, + cospi_24_64, &bl[2], &br[2], &bl[3], + &br[3]); + + bl[4] = vaddq_s32(al[4], al[5]); + br[4] = vaddq_s32(ar[4], ar[5]); + bl[5] = vsubq_s32(al[4], al[5]); + br[5] = vsubq_s32(ar[4], ar[5]); + bl[6] = vsubq_s32(al[7], al[6]); + br[6] = vsubq_s32(ar[7], ar[6]); + bl[7] = vaddq_s32(al[7], al[6]); + br[7] = vaddq_s32(ar[7], ar[6]); + + bl[8] = al[8]; + br[8] = ar[8]; + + butterfly_two_coeff_s32_s64_narrow(al[14], ar[14], al[9], ar[9], cospi_8_64, + cospi_24_64, &bl[14], &br[14], &bl[9], + &br[9]); + butterfly_two_coeff_s32_s64_narrow(al[13], ar[13], al[10], ar[10], + cospi_24_64, -cospi_8_64, &bl[13], &br[13], + &bl[10], &br[10]); + + bl[11] = al[11]; + br[11] = ar[11]; + bl[12] = al[12]; + br[12] = ar[12]; + + bl[15] = al[15]; + br[15] = ar[15]; + + bl[16] = vaddq_s32(al[19], al[16]); + br[16] = vaddq_s32(ar[19], ar[16]); + bl[17] = vaddq_s32(al[18], al[17]); + br[17] = vaddq_s32(ar[18], ar[17]); + bl[18] = vsubq_s32(al[17], al[18]); + br[18] = vsubq_s32(ar[17], ar[18]); + bl[19] = vsubq_s32(al[16], al[19]); + br[19] = vsubq_s32(ar[16], ar[19]); + bl[20] = vsubq_s32(al[23], al[20]); + br[20] = vsubq_s32(ar[23], ar[20]); + bl[21] = vsubq_s32(al[22], al[21]); + br[21] = vsubq_s32(ar[22], ar[21]); + bl[22] = vaddq_s32(al[21], al[22]); + br[22] = vaddq_s32(ar[21], ar[22]); + bl[23] = vaddq_s32(al[20], al[23]); + br[23] = vaddq_s32(ar[20], ar[23]); + bl[24] = vaddq_s32(al[27], al[24]); + br[24] = vaddq_s32(ar[27], ar[24]); + bl[25] = vaddq_s32(al[26], al[25]); + br[25] = vaddq_s32(ar[26], ar[25]); + bl[26] = vsubq_s32(al[25], al[26]); + br[26] = vsubq_s32(ar[25], ar[26]); + bl[27] = vsubq_s32(al[24], al[27]); + br[27] = vsubq_s32(ar[24], ar[27]); + bl[28] = vsubq_s32(al[31], al[28]); + br[28] = vsubq_s32(ar[31], ar[28]); + bl[29] = vsubq_s32(al[30], al[29]); + br[29] = vsubq_s32(ar[30], ar[29]); + bl[30] = vaddq_s32(al[29], al[30]); + br[30] = vaddq_s32(ar[29], ar[30]); + bl[31] = vaddq_s32(al[28], al[31]); + br[31] = vaddq_s32(ar[28], ar[31]); + + // Stage 6. + al[0] = bl[0]; + ar[0] = br[0]; + al[1] = bl[1]; + ar[1] = br[1]; + al[2] = bl[2]; + ar[2] = br[2]; + al[3] = bl[3]; + ar[3] = br[3]; + + butterfly_two_coeff_s32_s64_narrow(bl[7], br[7], bl[4], br[4], cospi_4_64, + cospi_28_64, &al[4], &ar[4], &al[7], + &ar[7]); + butterfly_two_coeff_s32_s64_narrow(bl[6], br[6], bl[5], br[5], cospi_20_64, + cospi_12_64, &al[5], &ar[5], &al[6], + &ar[6]); + + al[8] = vaddq_s32(bl[8], bl[9]); + ar[8] = vaddq_s32(br[8], br[9]); + al[9] = vsubq_s32(bl[8], bl[9]); + ar[9] = vsubq_s32(br[8], br[9]); + al[10] = vsubq_s32(bl[11], bl[10]); + ar[10] = vsubq_s32(br[11], br[10]); + al[11] = vaddq_s32(bl[11], bl[10]); + ar[11] = vaddq_s32(br[11], br[10]); + al[12] = vaddq_s32(bl[12], bl[13]); + ar[12] = vaddq_s32(br[12], br[13]); + al[13] = vsubq_s32(bl[12], bl[13]); + ar[13] = vsubq_s32(br[12], br[13]); + al[14] = vsubq_s32(bl[15], bl[14]); + ar[14] = vsubq_s32(br[15], br[14]); + al[15] = vaddq_s32(bl[15], bl[14]); + ar[15] = vaddq_s32(br[15], br[14]); + + al[16] = bl[16]; + ar[16] = br[16]; + al[19] = bl[19]; + ar[19] = br[19]; + al[20] = bl[20]; + ar[20] = br[20]; + al[23] = bl[23]; + ar[23] = br[23]; + al[24] = bl[24]; + ar[24] = br[24]; + al[27] = bl[27]; + ar[27] = br[27]; + al[28] = bl[28]; + ar[28] = br[28]; + al[31] = bl[31]; + ar[31] = br[31]; + + butterfly_two_coeff_s32_s64_narrow(bl[30], br[30], bl[17], br[17], cospi_4_64, + cospi_28_64, &al[30], &ar[30], &al[17], + &ar[17]); + butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18], + cospi_28_64, -cospi_4_64, &al[29], &ar[29], + &al[18], &ar[18]); + butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21], + cospi_20_64, cospi_12_64, &al[26], &ar[26], + &al[21], &ar[21]); + butterfly_two_coeff_s32_s64_narrow(bl[25], br[25], bl[22], br[22], + cospi_12_64, -cospi_20_64, &al[25], + &ar[25], &al[22], &ar[22]); + + // Stage 7. + bl[0] = al[0]; + br[0] = ar[0]; + bl[1] = al[1]; + br[1] = ar[1]; + bl[2] = al[2]; + br[2] = ar[2]; + bl[3] = al[3]; + br[3] = ar[3]; + bl[4] = al[4]; + br[4] = ar[4]; + bl[5] = al[5]; + br[5] = ar[5]; + bl[6] = al[6]; + br[6] = ar[6]; + bl[7] = al[7]; + br[7] = ar[7]; + + butterfly_two_coeff_s32_s64_narrow(al[15], ar[15], al[8], ar[8], cospi_2_64, + cospi_30_64, &bl[8], &br[8], &bl[15], + &br[15]); + butterfly_two_coeff_s32_s64_narrow(al[14], ar[14], al[9], ar[9], cospi_18_64, + cospi_14_64, &bl[9], &br[9], &bl[14], + &br[14]); + butterfly_two_coeff_s32_s64_narrow(al[13], ar[13], al[10], ar[10], + cospi_10_64, cospi_22_64, &bl[10], &br[10], + &bl[13], &br[13]); + butterfly_two_coeff_s32_s64_narrow(al[12], ar[12], al[11], ar[11], + cospi_26_64, cospi_6_64, &bl[11], &br[11], + &bl[12], &br[12]); + + bl[16] = vaddq_s32(al[16], al[17]); + br[16] = vaddq_s32(ar[16], ar[17]); + bl[17] = vsubq_s32(al[16], al[17]); + br[17] = vsubq_s32(ar[16], ar[17]); + bl[18] = vsubq_s32(al[19], al[18]); + br[18] = vsubq_s32(ar[19], ar[18]); + bl[19] = vaddq_s32(al[19], al[18]); + br[19] = vaddq_s32(ar[19], ar[18]); + bl[20] = vaddq_s32(al[20], al[21]); + br[20] = vaddq_s32(ar[20], ar[21]); + bl[21] = vsubq_s32(al[20], al[21]); + br[21] = vsubq_s32(ar[20], ar[21]); + bl[22] = vsubq_s32(al[23], al[22]); + br[22] = vsubq_s32(ar[23], ar[22]); + bl[23] = vaddq_s32(al[23], al[22]); + br[23] = vaddq_s32(ar[23], ar[22]); + bl[24] = vaddq_s32(al[24], al[25]); + br[24] = vaddq_s32(ar[24], ar[25]); + bl[25] = vsubq_s32(al[24], al[25]); + br[25] = vsubq_s32(ar[24], ar[25]); + bl[26] = vsubq_s32(al[27], al[26]); + br[26] = vsubq_s32(ar[27], ar[26]); + bl[27] = vaddq_s32(al[27], al[26]); + br[27] = vaddq_s32(ar[27], ar[26]); + bl[28] = vaddq_s32(al[28], al[29]); + br[28] = vaddq_s32(ar[28], ar[29]); + bl[29] = vsubq_s32(al[28], al[29]); + br[29] = vsubq_s32(ar[28], ar[29]); + bl[30] = vsubq_s32(al[31], al[30]); + br[30] = vsubq_s32(ar[31], ar[30]); + bl[31] = vaddq_s32(al[31], al[30]); + br[31] = vaddq_s32(ar[31], ar[30]); + + // Final stage. + + left[0] = bl[0]; + right[0] = br[0]; + left[16] = bl[1]; + right[16] = br[1]; + left[8] = bl[2]; + right[8] = br[2]; + left[24] = bl[3]; + right[24] = br[3]; + left[4] = bl[4]; + right[4] = br[4]; + left[20] = bl[5]; + right[20] = br[5]; + left[12] = bl[6]; + right[12] = br[6]; + left[28] = bl[7]; + right[28] = br[7]; + left[2] = bl[8]; + right[2] = br[8]; + left[18] = bl[9]; + right[18] = br[9]; + left[10] = bl[10]; + right[10] = br[10]; + left[26] = bl[11]; + right[26] = br[11]; + left[6] = bl[12]; + right[6] = br[12]; + left[22] = bl[13]; + right[22] = br[13]; + left[14] = bl[14]; + right[14] = br[14]; + left[30] = bl[15]; + right[30] = br[15]; + + butterfly_two_coeff_s32_s64_narrow(bl[31], br[31], bl[16], br[16], cospi_1_64, + cospi_31_64, &al[1], &ar[1], &al[31], + &ar[31]); + left[1] = al[1]; + right[1] = ar[1]; + left[31] = al[31]; + right[31] = ar[31]; + + butterfly_two_coeff_s32_s64_narrow(bl[30], br[30], bl[17], br[17], + cospi_17_64, cospi_15_64, &al[17], &ar[17], + &al[15], &ar[15]); + left[17] = al[17]; + right[17] = ar[17]; + left[15] = al[15]; + right[15] = ar[15]; + + butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18], cospi_9_64, + cospi_23_64, &al[9], &ar[9], &al[23], + &ar[23]); + left[9] = al[9]; + right[9] = ar[9]; + left[23] = al[23]; + right[23] = ar[23]; + + butterfly_two_coeff_s32_s64_narrow(bl[28], br[28], bl[19], br[19], + cospi_25_64, cospi_7_64, &al[25], &ar[25], + &al[7], &ar[7]); + left[25] = al[25]; + right[25] = ar[25]; + left[7] = al[7]; + right[7] = ar[7]; + + butterfly_two_coeff_s32_s64_narrow(bl[27], br[27], bl[20], br[20], cospi_5_64, + cospi_27_64, &al[5], &ar[5], &al[27], + &ar[27]); + left[5] = al[5]; + right[5] = ar[5]; + left[27] = al[27]; + right[27] = ar[27]; + + butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21], + cospi_21_64, cospi_11_64, &al[21], &ar[21], + &al[11], &ar[11]); + left[21] = al[21]; + right[21] = ar[21]; + left[11] = al[11]; + right[11] = ar[11]; + + butterfly_two_coeff_s32_s64_narrow(bl[25], br[25], bl[22], br[22], + cospi_13_64, cospi_19_64, &al[13], &ar[13], + &al[19], &ar[19]); + left[13] = al[13]; + right[13] = ar[13]; + left[19] = al[19]; + right[19] = ar[19]; + + butterfly_two_coeff_s32_s64_narrow(bl[24], br[24], bl[23], br[23], + cospi_29_64, cospi_3_64, &al[29], &ar[29], + &al[3], &ar[3]); + left[29] = al[29]; + right[29] = ar[29]; + left[3] = al[3]; + right[3] = ar[3]; +} + +static INLINE void highbd_dct8x32_body_second_pass_rd(int32x4_t *left /*32*/, + int32x4_t *right /*32*/) { + int32x4_t al[32], ar[32]; + int32x4_t bl[32], br[32]; + + // Stage 1: Done as part of the load. + + // Stage 2. + // For the "rd" version, all the values are rounded down after stage 2 to keep + // the values in 16 bits. + al[0] = add_round_shift_s32(vaddq_s32(left[0], left[15])); + ar[0] = add_round_shift_s32(vaddq_s32(right[0], right[15])); + al[1] = add_round_shift_s32(vaddq_s32(left[1], left[14])); + ar[1] = add_round_shift_s32(vaddq_s32(right[1], right[14])); + al[2] = add_round_shift_s32(vaddq_s32(left[2], left[13])); + ar[2] = add_round_shift_s32(vaddq_s32(right[2], right[13])); + al[3] = add_round_shift_s32(vaddq_s32(left[3], left[12])); + ar[3] = add_round_shift_s32(vaddq_s32(right[3], right[12])); + al[4] = add_round_shift_s32(vaddq_s32(left[4], left[11])); + ar[4] = add_round_shift_s32(vaddq_s32(right[4], right[11])); + al[5] = add_round_shift_s32(vaddq_s32(left[5], left[10])); + ar[5] = add_round_shift_s32(vaddq_s32(right[5], right[10])); + al[6] = add_round_shift_s32(vaddq_s32(left[6], left[9])); + ar[6] = add_round_shift_s32(vaddq_s32(right[6], right[9])); + al[7] = add_round_shift_s32(vaddq_s32(left[7], left[8])); + ar[7] = add_round_shift_s32(vaddq_s32(right[7], right[8])); + + al[8] = add_round_shift_s32(vsubq_s32(left[7], left[8])); + ar[8] = add_round_shift_s32(vsubq_s32(right[7], right[8])); + al[9] = add_round_shift_s32(vsubq_s32(left[6], left[9])); + ar[9] = add_round_shift_s32(vsubq_s32(right[6], right[9])); + al[10] = add_round_shift_s32(vsubq_s32(left[5], left[10])); + ar[10] = add_round_shift_s32(vsubq_s32(right[5], right[10])); + al[11] = add_round_shift_s32(vsubq_s32(left[4], left[11])); + ar[11] = add_round_shift_s32(vsubq_s32(right[4], right[11])); + al[12] = add_round_shift_s32(vsubq_s32(left[3], left[12])); + ar[12] = add_round_shift_s32(vsubq_s32(right[3], right[12])); + al[13] = add_round_shift_s32(vsubq_s32(left[2], left[13])); + ar[13] = add_round_shift_s32(vsubq_s32(right[2], right[13])); + al[14] = add_round_shift_s32(vsubq_s32(left[1], left[14])); + ar[14] = add_round_shift_s32(vsubq_s32(right[1], right[14])); + al[15] = add_round_shift_s32(vsubq_s32(left[0], left[15])); + ar[15] = add_round_shift_s32(vsubq_s32(right[0], right[15])); + + al[16] = add_round_shift_s32(left[16]); + ar[16] = add_round_shift_s32(right[16]); + al[17] = add_round_shift_s32(left[17]); + ar[17] = add_round_shift_s32(right[17]); + al[18] = add_round_shift_s32(left[18]); + ar[18] = add_round_shift_s32(right[18]); + al[19] = add_round_shift_s32(left[19]); + ar[19] = add_round_shift_s32(right[19]); + + butterfly_one_coeff_s32_fast(left[27], right[27], left[20], right[20], + cospi_16_64, &al[27], &ar[27], &al[20], &ar[20]); + butterfly_one_coeff_s32_fast(left[26], right[26], left[21], right[21], + cospi_16_64, &al[26], &ar[26], &al[21], &ar[21]); + butterfly_one_coeff_s32_fast(left[25], right[25], left[22], right[22], + cospi_16_64, &al[25], &ar[25], &al[22], &ar[22]); + butterfly_one_coeff_s32_fast(left[24], right[24], left[23], right[23], + cospi_16_64, &al[24], &ar[24], &al[23], &ar[23]); + + al[20] = add_round_shift_s32(al[20]); + ar[20] = add_round_shift_s32(ar[20]); + al[21] = add_round_shift_s32(al[21]); + ar[21] = add_round_shift_s32(ar[21]); + al[22] = add_round_shift_s32(al[22]); + ar[22] = add_round_shift_s32(ar[22]); + al[23] = add_round_shift_s32(al[23]); + ar[23] = add_round_shift_s32(ar[23]); + al[24] = add_round_shift_s32(al[24]); + ar[24] = add_round_shift_s32(ar[24]); + al[25] = add_round_shift_s32(al[25]); + ar[25] = add_round_shift_s32(ar[25]); + al[26] = add_round_shift_s32(al[26]); + ar[26] = add_round_shift_s32(ar[26]); + al[27] = add_round_shift_s32(al[27]); + ar[27] = add_round_shift_s32(ar[27]); + + al[28] = add_round_shift_s32(left[28]); + ar[28] = add_round_shift_s32(right[28]); + al[29] = add_round_shift_s32(left[29]); + ar[29] = add_round_shift_s32(right[29]); + al[30] = add_round_shift_s32(left[30]); + ar[30] = add_round_shift_s32(right[30]); + al[31] = add_round_shift_s32(left[31]); + ar[31] = add_round_shift_s32(right[31]); + + // Stage 3. + bl[0] = vaddq_s32(al[0], al[7]); + br[0] = vaddq_s32(ar[0], ar[7]); + bl[1] = vaddq_s32(al[1], al[6]); + br[1] = vaddq_s32(ar[1], ar[6]); + bl[2] = vaddq_s32(al[2], al[5]); + br[2] = vaddq_s32(ar[2], ar[5]); + bl[3] = vaddq_s32(al[3], al[4]); + br[3] = vaddq_s32(ar[3], ar[4]); + + bl[4] = vsubq_s32(al[3], al[4]); + br[4] = vsubq_s32(ar[3], ar[4]); + bl[5] = vsubq_s32(al[2], al[5]); + br[5] = vsubq_s32(ar[2], ar[5]); + bl[6] = vsubq_s32(al[1], al[6]); + br[6] = vsubq_s32(ar[1], ar[6]); + bl[7] = vsubq_s32(al[0], al[7]); + br[7] = vsubq_s32(ar[0], ar[7]); + + bl[8] = al[8]; + br[8] = ar[8]; + bl[9] = al[9]; + br[9] = ar[9]; + + butterfly_one_coeff_s32_fast(al[13], ar[13], al[10], ar[10], cospi_16_64, + &bl[13], &br[13], &bl[10], &br[10]); + butterfly_one_coeff_s32_fast(al[12], ar[12], al[11], ar[11], cospi_16_64, + &bl[12], &br[12], &bl[11], &br[11]); + + bl[14] = al[14]; + br[14] = ar[14]; + bl[15] = al[15]; + br[15] = ar[15]; + + bl[16] = vaddq_s32(al[16], al[23]); + br[16] = vaddq_s32(ar[16], ar[23]); + bl[17] = vaddq_s32(al[17], al[22]); + br[17] = vaddq_s32(ar[17], ar[22]); + bl[18] = vaddq_s32(al[18], al[21]); + br[18] = vaddq_s32(ar[18], ar[21]); + bl[19] = vaddq_s32(al[19], al[20]); + br[19] = vaddq_s32(ar[19], ar[20]); + + bl[20] = vsubq_s32(al[19], al[20]); + br[20] = vsubq_s32(ar[19], ar[20]); + bl[21] = vsubq_s32(al[18], al[21]); + br[21] = vsubq_s32(ar[18], ar[21]); + bl[22] = vsubq_s32(al[17], al[22]); + br[22] = vsubq_s32(ar[17], ar[22]); + bl[23] = vsubq_s32(al[16], al[23]); + br[23] = vsubq_s32(ar[16], ar[23]); + + bl[24] = vsubq_s32(al[31], al[24]); + br[24] = vsubq_s32(ar[31], ar[24]); + bl[25] = vsubq_s32(al[30], al[25]); + br[25] = vsubq_s32(ar[30], ar[25]); + bl[26] = vsubq_s32(al[29], al[26]); + br[26] = vsubq_s32(ar[29], ar[26]); + bl[27] = vsubq_s32(al[28], al[27]); + br[27] = vsubq_s32(ar[28], ar[27]); + + bl[28] = vaddq_s32(al[28], al[27]); + br[28] = vaddq_s32(ar[28], ar[27]); + bl[29] = vaddq_s32(al[29], al[26]); + br[29] = vaddq_s32(ar[29], ar[26]); + bl[30] = vaddq_s32(al[30], al[25]); + br[30] = vaddq_s32(ar[30], ar[25]); + bl[31] = vaddq_s32(al[31], al[24]); + br[31] = vaddq_s32(ar[31], ar[24]); + + // Stage 4. + al[0] = vaddq_s32(bl[0], bl[3]); + ar[0] = vaddq_s32(br[0], br[3]); + al[1] = vaddq_s32(bl[1], bl[2]); + ar[1] = vaddq_s32(br[1], br[2]); + al[2] = vsubq_s32(bl[1], bl[2]); + ar[2] = vsubq_s32(br[1], br[2]); + al[3] = vsubq_s32(bl[0], bl[3]); + ar[3] = vsubq_s32(br[0], br[3]); + + al[4] = bl[4]; + ar[4] = br[4]; + + butterfly_one_coeff_s32_fast(bl[6], br[6], bl[5], br[5], cospi_16_64, &al[6], + &ar[6], &al[5], &ar[5]); + + al[7] = bl[7]; + ar[7] = br[7]; + + al[8] = vaddq_s32(bl[8], bl[11]); + ar[8] = vaddq_s32(br[8], br[11]); + al[9] = vaddq_s32(bl[9], bl[10]); + ar[9] = vaddq_s32(br[9], br[10]); + al[10] = vsubq_s32(bl[9], bl[10]); + ar[10] = vsubq_s32(br[9], br[10]); + al[11] = vsubq_s32(bl[8], bl[11]); + ar[11] = vsubq_s32(br[8], br[11]); + al[12] = vsubq_s32(bl[15], bl[12]); + ar[12] = vsubq_s32(br[15], br[12]); + al[13] = vsubq_s32(bl[14], bl[13]); + ar[13] = vsubq_s32(br[14], br[13]); + al[14] = vaddq_s32(bl[14], bl[13]); + ar[14] = vaddq_s32(br[14], br[13]); + al[15] = vaddq_s32(bl[15], bl[12]); + ar[15] = vaddq_s32(br[15], br[12]); + + al[16] = bl[16]; + ar[16] = br[16]; + al[17] = bl[17]; + ar[17] = br[17]; + + butterfly_two_coeff_s32(bl[29], br[29], bl[18], br[18], cospi_8_64, + cospi_24_64, &al[29], &ar[29], &al[18], &ar[18]); + butterfly_two_coeff_s32(bl[28], br[28], bl[19], br[19], cospi_8_64, + cospi_24_64, &al[28], &ar[28], &al[19], &ar[19]); + butterfly_two_coeff_s32(bl[27], br[27], bl[20], br[20], cospi_24_64, + -cospi_8_64, &al[27], &ar[27], &al[20], &ar[20]); + butterfly_two_coeff_s32(bl[26], br[26], bl[21], br[21], cospi_24_64, + -cospi_8_64, &al[26], &ar[26], &al[21], &ar[21]); + + al[22] = bl[22]; + ar[22] = br[22]; + al[23] = bl[23]; + ar[23] = br[23]; + al[24] = bl[24]; + ar[24] = br[24]; + al[25] = bl[25]; + ar[25] = br[25]; + + al[30] = bl[30]; + ar[30] = br[30]; + al[31] = bl[31]; + ar[31] = br[31]; + + // Stage 5. + butterfly_one_coeff_s32_fast(al[0], ar[0], al[1], ar[1], cospi_16_64, &bl[0], + &br[0], &bl[1], &br[1]); + butterfly_two_coeff_s32(al[3], ar[3], al[2], ar[2], cospi_8_64, cospi_24_64, + &bl[2], &br[2], &bl[3], &br[3]); + + bl[4] = vaddq_s32(al[4], al[5]); + br[4] = vaddq_s32(ar[4], ar[5]); + bl[5] = vsubq_s32(al[4], al[5]); + br[5] = vsubq_s32(ar[4], ar[5]); + bl[6] = vsubq_s32(al[7], al[6]); + br[6] = vsubq_s32(ar[7], ar[6]); + bl[7] = vaddq_s32(al[7], al[6]); + br[7] = vaddq_s32(ar[7], ar[6]); + + bl[8] = al[8]; + br[8] = ar[8]; + + butterfly_two_coeff_s32(al[14], ar[14], al[9], ar[9], cospi_8_64, cospi_24_64, + &bl[14], &br[14], &bl[9], &br[9]); + butterfly_two_coeff_s32(al[13], ar[13], al[10], ar[10], cospi_24_64, + -cospi_8_64, &bl[13], &br[13], &bl[10], &br[10]); + + bl[11] = al[11]; + br[11] = ar[11]; + bl[12] = al[12]; + br[12] = ar[12]; + + bl[15] = al[15]; + br[15] = ar[15]; + + bl[16] = vaddq_s32(al[19], al[16]); + br[16] = vaddq_s32(ar[19], ar[16]); + bl[17] = vaddq_s32(al[18], al[17]); + br[17] = vaddq_s32(ar[18], ar[17]); + bl[18] = vsubq_s32(al[17], al[18]); + br[18] = vsubq_s32(ar[17], ar[18]); + bl[19] = vsubq_s32(al[16], al[19]); + br[19] = vsubq_s32(ar[16], ar[19]); + bl[20] = vsubq_s32(al[23], al[20]); + br[20] = vsubq_s32(ar[23], ar[20]); + bl[21] = vsubq_s32(al[22], al[21]); + br[21] = vsubq_s32(ar[22], ar[21]); + bl[22] = vaddq_s32(al[21], al[22]); + br[22] = vaddq_s32(ar[21], ar[22]); + bl[23] = vaddq_s32(al[20], al[23]); + br[23] = vaddq_s32(ar[20], ar[23]); + bl[24] = vaddq_s32(al[27], al[24]); + br[24] = vaddq_s32(ar[27], ar[24]); + bl[25] = vaddq_s32(al[26], al[25]); + br[25] = vaddq_s32(ar[26], ar[25]); + bl[26] = vsubq_s32(al[25], al[26]); + br[26] = vsubq_s32(ar[25], ar[26]); + bl[27] = vsubq_s32(al[24], al[27]); + br[27] = vsubq_s32(ar[24], ar[27]); + bl[28] = vsubq_s32(al[31], al[28]); + br[28] = vsubq_s32(ar[31], ar[28]); + bl[29] = vsubq_s32(al[30], al[29]); + br[29] = vsubq_s32(ar[30], ar[29]); + bl[30] = vaddq_s32(al[29], al[30]); + br[30] = vaddq_s32(ar[29], ar[30]); + bl[31] = vaddq_s32(al[28], al[31]); + br[31] = vaddq_s32(ar[28], ar[31]); + + // Stage 6. + al[0] = bl[0]; + ar[0] = br[0]; + al[1] = bl[1]; + ar[1] = br[1]; + al[2] = bl[2]; + ar[2] = br[2]; + al[3] = bl[3]; + ar[3] = br[3]; + + butterfly_two_coeff_s32(bl[7], br[7], bl[4], br[4], cospi_4_64, cospi_28_64, + &al[4], &ar[4], &al[7], &ar[7]); + butterfly_two_coeff_s32(bl[6], br[6], bl[5], br[5], cospi_20_64, cospi_12_64, + &al[5], &ar[5], &al[6], &ar[6]); + + al[8] = vaddq_s32(bl[8], bl[9]); + ar[8] = vaddq_s32(br[8], br[9]); + al[9] = vsubq_s32(bl[8], bl[9]); + ar[9] = vsubq_s32(br[8], br[9]); + al[10] = vsubq_s32(bl[11], bl[10]); + ar[10] = vsubq_s32(br[11], br[10]); + al[11] = vaddq_s32(bl[11], bl[10]); + ar[11] = vaddq_s32(br[11], br[10]); + al[12] = vaddq_s32(bl[12], bl[13]); + ar[12] = vaddq_s32(br[12], br[13]); + al[13] = vsubq_s32(bl[12], bl[13]); + ar[13] = vsubq_s32(br[12], br[13]); + al[14] = vsubq_s32(bl[15], bl[14]); + ar[14] = vsubq_s32(br[15], br[14]); + al[15] = vaddq_s32(bl[15], bl[14]); + ar[15] = vaddq_s32(br[15], br[14]); + + al[16] = bl[16]; + ar[16] = br[16]; + al[19] = bl[19]; + ar[19] = br[19]; + al[20] = bl[20]; + ar[20] = br[20]; + al[23] = bl[23]; + ar[23] = br[23]; + al[24] = bl[24]; + ar[24] = br[24]; + al[27] = bl[27]; + ar[27] = br[27]; + al[28] = bl[28]; + ar[28] = br[28]; + al[31] = bl[31]; + ar[31] = br[31]; + + butterfly_two_coeff_s32(bl[30], br[30], bl[17], br[17], cospi_4_64, + cospi_28_64, &al[30], &ar[30], &al[17], &ar[17]); + butterfly_two_coeff_s32(bl[29], br[29], bl[18], br[18], cospi_28_64, + -cospi_4_64, &al[29], &ar[29], &al[18], &ar[18]); + butterfly_two_coeff_s32(bl[26], br[26], bl[21], br[21], cospi_20_64, + cospi_12_64, &al[26], &ar[26], &al[21], &ar[21]); + butterfly_two_coeff_s32(bl[25], br[25], bl[22], br[22], cospi_12_64, + -cospi_20_64, &al[25], &ar[25], &al[22], &ar[22]); + + // Stage 7. + bl[0] = al[0]; + br[0] = ar[0]; + bl[1] = al[1]; + br[1] = ar[1]; + bl[2] = al[2]; + br[2] = ar[2]; + bl[3] = al[3]; + br[3] = ar[3]; + bl[4] = al[4]; + br[4] = ar[4]; + bl[5] = al[5]; + br[5] = ar[5]; + bl[6] = al[6]; + br[6] = ar[6]; + bl[7] = al[7]; + br[7] = ar[7]; + + butterfly_two_coeff_s32(al[15], ar[15], al[8], ar[8], cospi_2_64, cospi_30_64, + &bl[8], &br[8], &bl[15], &br[15]); + butterfly_two_coeff_s32(al[14], ar[14], al[9], ar[9], cospi_18_64, + cospi_14_64, &bl[9], &br[9], &bl[14], &br[14]); + butterfly_two_coeff_s32(al[13], ar[13], al[10], ar[10], cospi_10_64, + cospi_22_64, &bl[10], &br[10], &bl[13], &br[13]); + butterfly_two_coeff_s32(al[12], ar[12], al[11], ar[11], cospi_26_64, + cospi_6_64, &bl[11], &br[11], &bl[12], &br[12]); + + bl[16] = vaddq_s32(al[16], al[17]); + br[16] = vaddq_s32(ar[16], ar[17]); + bl[17] = vsubq_s32(al[16], al[17]); + br[17] = vsubq_s32(ar[16], ar[17]); + bl[18] = vsubq_s32(al[19], al[18]); + br[18] = vsubq_s32(ar[19], ar[18]); + bl[19] = vaddq_s32(al[19], al[18]); + br[19] = vaddq_s32(ar[19], ar[18]); + bl[20] = vaddq_s32(al[20], al[21]); + br[20] = vaddq_s32(ar[20], ar[21]); + bl[21] = vsubq_s32(al[20], al[21]); + br[21] = vsubq_s32(ar[20], ar[21]); + bl[22] = vsubq_s32(al[23], al[22]); + br[22] = vsubq_s32(ar[23], ar[22]); + bl[23] = vaddq_s32(al[23], al[22]); + br[23] = vaddq_s32(ar[23], ar[22]); + bl[24] = vaddq_s32(al[24], al[25]); + br[24] = vaddq_s32(ar[24], ar[25]); + bl[25] = vsubq_s32(al[24], al[25]); + br[25] = vsubq_s32(ar[24], ar[25]); + bl[26] = vsubq_s32(al[27], al[26]); + br[26] = vsubq_s32(ar[27], ar[26]); + bl[27] = vaddq_s32(al[27], al[26]); + br[27] = vaddq_s32(ar[27], ar[26]); + bl[28] = vaddq_s32(al[28], al[29]); + br[28] = vaddq_s32(ar[28], ar[29]); + bl[29] = vsubq_s32(al[28], al[29]); + br[29] = vsubq_s32(ar[28], ar[29]); + bl[30] = vsubq_s32(al[31], al[30]); + br[30] = vsubq_s32(ar[31], ar[30]); + bl[31] = vaddq_s32(al[31], al[30]); + br[31] = vaddq_s32(ar[31], ar[30]); + + // Final stage. + left[0] = bl[0]; + right[0] = br[0]; + left[16] = bl[1]; + right[16] = br[1]; + left[8] = bl[2]; + right[8] = br[2]; + left[24] = bl[3]; + right[24] = br[3]; + left[4] = bl[4]; + right[4] = br[4]; + left[20] = bl[5]; + right[20] = br[5]; + left[12] = bl[6]; + right[12] = br[6]; + left[28] = bl[7]; + right[28] = br[7]; + left[2] = bl[8]; + right[2] = br[8]; + left[18] = bl[9]; + right[18] = br[9]; + left[10] = bl[10]; + right[10] = br[10]; + left[26] = bl[11]; + right[26] = br[11]; + left[6] = bl[12]; + right[6] = br[12]; + left[22] = bl[13]; + right[22] = br[13]; + left[14] = bl[14]; + right[14] = br[14]; + left[30] = bl[15]; + right[30] = br[15]; + + butterfly_two_coeff_s32(bl[31], br[31], bl[16], br[16], cospi_1_64, + cospi_31_64, &al[1], &ar[1], &al[31], &ar[31]); + left[1] = al[1]; + right[1] = ar[1]; + left[31] = al[31]; + right[31] = ar[31]; + + butterfly_two_coeff_s32(bl[30], br[30], bl[17], br[17], cospi_17_64, + cospi_15_64, &al[17], &ar[17], &al[15], &ar[15]); + left[17] = al[17]; + right[17] = ar[17]; + left[15] = al[15]; + right[15] = ar[15]; + + butterfly_two_coeff_s32(bl[29], br[29], bl[18], br[18], cospi_9_64, + cospi_23_64, &al[9], &ar[9], &al[23], &ar[23]); + left[9] = al[9]; + right[9] = ar[9]; + left[23] = al[23]; + right[23] = ar[23]; + + butterfly_two_coeff_s32(bl[28], br[28], bl[19], br[19], cospi_25_64, + cospi_7_64, &al[25], &ar[25], &al[7], &ar[7]); + left[25] = al[25]; + right[25] = ar[25]; + left[7] = al[7]; + right[7] = ar[7]; + + butterfly_two_coeff_s32(bl[27], br[27], bl[20], br[20], cospi_5_64, + cospi_27_64, &al[5], &ar[5], &al[27], &ar[27]); + left[5] = al[5]; + right[5] = ar[5]; + left[27] = al[27]; + right[27] = ar[27]; + + butterfly_two_coeff_s32(bl[26], br[26], bl[21], br[21], cospi_21_64, + cospi_11_64, &al[21], &ar[21], &al[11], &ar[11]); + left[21] = al[21]; + right[21] = ar[21]; + left[11] = al[11]; + right[11] = ar[11]; + + butterfly_two_coeff_s32(bl[25], br[25], bl[22], br[22], cospi_13_64, + cospi_19_64, &al[13], &ar[13], &al[19], &ar[19]); + left[13] = al[13]; + right[13] = ar[13]; + left[19] = al[19]; + right[19] = ar[19]; + + butterfly_two_coeff_s32(bl[24], br[24], bl[23], br[23], cospi_29_64, + cospi_3_64, &al[29], &ar[29], &al[3], &ar[3]); + left[29] = al[29]; + right[29] = ar[29]; + left[3] = al[3]; + right[3] = ar[3]; +} + +#endif // CONFIG_VP9_HIGHBITDEPTH + #endif // VPX_VPX_DSP_ARM_FDCT32X32_NEON_H_ diff --git a/vpx_dsp/arm/fdct_neon.h b/vpx_dsp/arm/fdct_neon.h index 1ea948b3f7..b33da427b4 100644 --- a/vpx_dsp/arm/fdct_neon.h +++ b/vpx_dsp/arm/fdct_neon.h @@ -355,4 +355,13 @@ static INLINE int16x8_t sub_round_shift_s16(const int16x8_t a) { return vrshrq_n_s16(vsubq_s16(a, a_sign_s16), 2); } +// Add 2 if positive, 1 if negative, and shift by 2. +// In practice, subtract the sign bit, then shift with rounding. +static INLINE int32x4_t sub_round_shift_s32(const int32x4_t a) { + const uint32x4_t a_u32 = vreinterpretq_u32_s32(a); + const uint32x4_t a_sign_u32 = vshrq_n_u32(a_u32, 31); + const int32x4_t a_sign_s32 = vreinterpretq_s32_u32(a_sign_u32); + return vrshrq_n_s32(vsubq_s32(a, a_sign_s32), 2); +} + #endif // VPX_VPX_DSP_ARM_FDCT_NEON_H_ diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 68244ea5a1..d55ab67ce1 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -568,10 +568,10 @@ () specialize qw/vpx_highbd_fdct16x16_1 neon/; add_proto qw/void vpx_highbd_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vpx_highbd_fdct32x32 sse2/; + specialize qw/vpx_highbd_fdct32x32 sse2 neon/; add_proto qw/void vpx_highbd_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vpx_highbd_fdct32x32_rd sse2/; + specialize qw/vpx_highbd_fdct32x32_rd sse2 neon/; add_proto qw/void vpx_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/vpx_highbd_fdct32x32_1 neon/; From 5d26626e7aabaa8cd0aeb01618191216d2db90b9 Mon Sep 17 00:00:00 2001 From: Andrew Salkeld Date: Thu, 13 Oct 2022 16:28:41 +0100 Subject: [PATCH 0543/1000] Add Neon implementation of vpx_hadamard_32x32 Add an Arm Neon implementation of vpx_hadamard_32x32 and use it instead of the scalar C implementation. Also add test coverage for the new Neon implementation. Change-Id: Iccc018eec4dbbe629fb0c6f8ad6ea8554e7a0b13 --- test/hadamard_test.cc | 3 ++- vpx_dsp/arm/hadamard_neon.c | 42 ++++++++++++++++++++++++++++++++++++ vpx_dsp/vpx_dsp_rtcd_defs.pl | 4 ++-- 3 files changed, 46 insertions(+), 3 deletions(-) diff --git a/test/hadamard_test.cc b/test/hadamard_test.cc index 10b1e79c10..f904e814ad 100644 --- a/test/hadamard_test.cc +++ b/test/hadamard_test.cc @@ -264,7 +264,8 @@ INSTANTIATE_TEST_SUITE_P( INSTANTIATE_TEST_SUITE_P( NEON, HadamardLowbdTest, ::testing::Values(HadamardFuncWithSize(&vpx_hadamard_8x8_neon, 8), - HadamardFuncWithSize(&vpx_hadamard_16x16_neon, 16))); + HadamardFuncWithSize(&vpx_hadamard_16x16_neon, 16), + HadamardFuncWithSize(&vpx_hadamard_32x32_neon, 32))); #endif // HAVE_NEON // TODO(jingning): Remove highbitdepth flag when the SIMD functions are diff --git a/vpx_dsp/arm/hadamard_neon.c b/vpx_dsp/arm/hadamard_neon.c index 523a63c6f7..f6b6d7e3ce 100644 --- a/vpx_dsp/arm/hadamard_neon.c +++ b/vpx_dsp/arm/hadamard_neon.c @@ -114,3 +114,45 @@ void vpx_hadamard_16x16_neon(const int16_t *src_diff, ptrdiff_t src_stride, coeff += 8; } } + +void vpx_hadamard_32x32_neon(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + int i; + + /* Rearrange 32x32 to 16x64 and remove stride. + * Top left first. */ + vpx_hadamard_16x16_neon(src_diff + 0 + 0 * src_stride, src_stride, coeff + 0); + /* Top right. */ + vpx_hadamard_16x16_neon(src_diff + 16 + 0 * src_stride, src_stride, + coeff + 256); + /* Bottom left. */ + vpx_hadamard_16x16_neon(src_diff + 0 + 16 * src_stride, src_stride, + coeff + 512); + /* Bottom right. */ + vpx_hadamard_16x16_neon(src_diff + 16 + 16 * src_stride, src_stride, + coeff + 768); + + for (i = 0; i < 256; i += 8) { + const int16x8_t a0 = load_tran_low_to_s16q(coeff + 0); + const int16x8_t a1 = load_tran_low_to_s16q(coeff + 256); + const int16x8_t a2 = load_tran_low_to_s16q(coeff + 512); + const int16x8_t a3 = load_tran_low_to_s16q(coeff + 768); + + const int16x8_t b0 = vhaddq_s16(a0, a1); + const int16x8_t b1 = vhsubq_s16(a0, a1); + const int16x8_t b2 = vhaddq_s16(a2, a3); + const int16x8_t b3 = vhsubq_s16(a2, a3); + + const int16x8_t c0 = vhaddq_s16(b0, b2); + const int16x8_t c1 = vhaddq_s16(b1, b3); + const int16x8_t c2 = vhsubq_s16(b0, b2); + const int16x8_t c3 = vhsubq_s16(b1, b3); + + store_s16q_to_tran_low(coeff + 0, c0); + store_s16q_to_tran_low(coeff + 256, c1); + store_s16q_to_tran_low(coeff + 512, c2); + store_s16q_to_tran_low(coeff + 768, c3); + + coeff += 8; + } +} diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index d55ab67ce1..51f5ebedd6 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -799,7 +799,7 @@ () specialize qw/vpx_hadamard_16x16 avx2 sse2 neon vsx lsx/; add_proto qw/void vpx_hadamard_32x32/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff"; - specialize qw/vpx_hadamard_32x32 sse2 avx2/; + specialize qw/vpx_hadamard_32x32 sse2 avx2 neon/; add_proto qw/void vpx_highbd_hadamard_8x8/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff"; specialize qw/vpx_highbd_hadamard_8x8 avx2/; @@ -823,7 +823,7 @@ () specialize qw/vpx_hadamard_16x16 avx2 sse2 neon msa vsx lsx/; add_proto qw/void vpx_hadamard_32x32/, "const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff"; - specialize qw/vpx_hadamard_32x32 sse2 avx2/; + specialize qw/vpx_hadamard_32x32 sse2 avx2 neon/; add_proto qw/int vpx_satd/, "const int16_t *coeff, int length"; specialize qw/vpx_satd avx2 sse2 neon msa/; From 62dee8012ea70a9f0628471609c5768f98a1e726 Mon Sep 17 00:00:00 2001 From: Sam James Date: Sun, 6 Nov 2022 04:11:59 +0000 Subject: [PATCH 0544/1000] build: fix -Wimplicit-int (Clang 16) Clang 16 will make -Wimplicit-int error by default which can, in addition to other things, lead to some configure tests silently failing/returning the wrong result. Fixes this error: ``` +/var/tmp/portage/media-libs/libvpx-1.12.0/temp/vpx-conf-1802-30624.c:1:15: error: type specifier missing, defaults to 'int'; ISO C99 and later do not support implicit int [-Wimplicit-int] ``` For more information, see LWN.net [0] or LLVM's Discourse [1], gentoo-dev@ [2], or the (new) c-std-porting mailing list [3]. [0] https://lwn.net/Articles/913505/ [1] https://discourse.llvm.org/t/configure-script-breakage-with-the-new-werror-implicit-function-declaration/65213 [2] https://archives.gentoo.org/gentoo-dev/message/dd9f2d3082b8b6f8dfbccb0639e6e240 [3] hosted at lists.linux.dev. Bug: https://bugs.gentoo.org/879705 Change-Id: Id73a98944ab3c99a368b9da7a5e902ddff9d937f Signed-off-by: Sam James --- build/make/configure.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build/make/configure.sh b/build/make/configure.sh index e9b7fa9c1c..4bf090f006 100644 --- a/build/make/configure.sh +++ b/build/make/configure.sh @@ -1511,7 +1511,7 @@ EOF # Try to find which inline keywords are supported check_cc < Date: Thu, 10 Nov 2022 18:50:19 -0800 Subject: [PATCH 0545/1000] vp9-rc: Fix key frame setting in external RC Bug: b/257368998 Change-Id: I03e35915ac99b50cb6bdf7bce8b8f9ec5aef75b7 --- vp9/ratectrl_rtc.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vp9/ratectrl_rtc.cc b/vp9/ratectrl_rtc.cc index f4d7f7e9e7..1326456c44 100644 --- a/vp9/ratectrl_rtc.cc +++ b/vp9/ratectrl_rtc.cc @@ -158,6 +158,8 @@ void VP9RateControlRTC::ComputeQP(const VP9FrameParamsQpRTC &frame_params) { } vp9_set_mb_mi(cm, cm->width, cm->height); cm->frame_type = frame_params.frame_type; + // This is needed to ensure key frame does not get unset in rc_get_svc_params. + cpi_->frame_flags = (cm->frame_type == KEY_FRAME) ? FRAMEFLAGS_KEY : 0; cpi_->refresh_golden_frame = (cm->frame_type == KEY_FRAME) ? 1 : 0; cpi_->sf.use_nonrd_pick_mode = 1; if (cpi_->svc.number_spatial_layers == 1 && From f951514a40554e55715d7a31f182581cdd2bf971 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Wed, 9 Nov 2022 09:30:58 +0000 Subject: [PATCH 0546/1000] [NEON] Optimize FHT functions, add highbd FHT 4x4 Refactor & optimize FHT functions further, use new butterfly functions 4x4 5% faster, 8x8 & 16x16 10% faster than previous versions. Highbd 4x4 FHT version 2.27x faster than C version for --rt. Change-Id: I3ebcd26010f6c5c067026aa9353cde46669c5d94 --- test/dct_test.cc | 2 + vp9/common/vp9_rtcd_defs.pl | 1 + vp9/encoder/arm/neon/vp9_dct_neon.c | 1248 +++++++++++---------------- vpx_dsp/arm/fdct_neon.h | 56 ++ 4 files changed, 565 insertions(+), 742 deletions(-) diff --git a/test/dct_test.cc b/test/dct_test.cc index 910d288bd5..0304029bd2 100644 --- a/test/dct_test.cc +++ b/test/dct_test.cc @@ -667,6 +667,8 @@ static const FuncInfo ht_neon_func_info[] = { #if CONFIG_VP9_HIGHBITDEPTH { &vp9_highbd_fht4x4_c, &highbd_iht_wrapper, 4, 2 }, + { &vp9_highbd_fht4x4_neon, &highbd_iht_wrapper, + 4, 2 }, { &vp9_highbd_fht8x8_c, &highbd_iht_wrapper, 8, 2 }, { &vp9_highbd_fht16x16_c, diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index 871e4d0a35..f4bd9772c3 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -203,6 +203,7 @@ () # fdct functions add_proto qw/void vp9_highbd_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; + specialize qw/vp9_highbd_fht4x4 neon/; add_proto qw/void vp9_highbd_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; diff --git a/vp9/encoder/arm/neon/vp9_dct_neon.c b/vp9/encoder/arm/neon/vp9_dct_neon.c index b8286a8dd5..5961be5f31 100644 --- a/vp9/encoder/arm/neon/vp9_dct_neon.c +++ b/vp9/encoder/arm/neon/vp9_dct_neon.c @@ -23,25 +23,25 @@ static INLINE void load_buffer_4x4(const int16_t *input, int16x8_t *in, int stride) { - // { 0, 1, 1, 1, 1, 1, 1, 1 }; - const int16x8_t nonzero_bias_a = vextq_s16(vdupq_n_s16(0), vdupq_n_s16(1), 7); - // { 1, 0, 0, 0, 0, 0, 0, 0 }; - const int16x8_t nonzero_bias_b = vextq_s16(vdupq_n_s16(1), vdupq_n_s16(0), 7); - int16x8_t mask; + // { 0, 1, 1, 1 }; + const int16x4_t nonzero_bias_a = vext_s16(vdup_n_s16(0), vdup_n_s16(1), 3); + // { 1, 0, 0, 0 }; + const int16x4_t nonzero_bias_b = vext_s16(vdup_n_s16(1), vdup_n_s16(0), 3); + int16x4_t mask; int16x4_t input_0 = vshl_n_s16(vld1_s16(input + 0 * stride), 4); int16x4_t input_1 = vshl_n_s16(vld1_s16(input + 1 * stride), 4); int16x4_t input_2 = vshl_n_s16(vld1_s16(input + 2 * stride), 4); int16x4_t input_3 = vshl_n_s16(vld1_s16(input + 3 * stride), 4); - in[0] = vcombine_s16(input_0, input_1); - in[1] = vcombine_s16(input_2, input_3); - // Copy the SSE method, use a mask to avoid an 'if' branch here to increase by // one non-zero first elements - mask = vreinterpretq_s16_u16(vceqq_s16(in[0], nonzero_bias_a)); - in[0] = vaddq_s16(in[0], mask); - in[0] = vaddq_s16(in[0], nonzero_bias_b); + mask = vreinterpret_s16_u16(vceq_s16(input_0, nonzero_bias_a)); + input_0 = vadd_s16(input_0, mask); + input_0 = vadd_s16(input_0, nonzero_bias_b); + + in[0] = vcombine_s16(input_0, input_1); + in[1] = vcombine_s16(input_2, input_3); } static INLINE void write_buffer_4x4(tran_low_t *output, int16x8_t *res) { @@ -55,72 +55,54 @@ static INLINE void write_buffer_4x4(tran_low_t *output, int16x8_t *res) { } static INLINE void fadst4x4_neon(int16x8_t *in) { - int32x4_t u0, u1, u2, u3; - int16x4_t out_0, out_1, out_2, out_3; - const int32x4_t k__DCT_CONST_ROUNDING = vdupq_n_s32(DCT_CONST_ROUNDING); + int32x4_t u[4], t[4]; + int16x4_t s[4], out[4]; - const int16x4_t s0 = vget_low_s16(in[0]); // | x_00 | x_01 | x_02 | x_03 | - const int16x4_t s1 = vget_high_s16(in[0]); // | x_10 | x_11 | x_12 | x_13 | - const int16x4_t s2 = vget_low_s16(in[1]); // | x_20 | x_21 | x_22 | x_23 | - const int16x4_t s3 = vget_high_s16(in[1]); // | x_30 | x_31 | x_32 | x_33 | + s[0] = vget_low_s16(in[0]); // | x_00 | x_01 | x_02 | x_03 | + s[1] = vget_high_s16(in[0]); // | x_10 | x_11 | x_12 | x_13 | + s[2] = vget_low_s16(in[1]); // | x_20 | x_21 | x_22 | x_23 | + s[3] = vget_high_s16(in[1]); // | x_30 | x_31 | x_32 | x_33 | - // s0 * sinpi_1_9, s0 * sinpi_4_9 // Must expand all elements to s32. See 'needs32' comment in fwd_txfm.c. - const int32x4_t s0s1_9 = vmull_n_s16(s0, sinpi_1_9); - const int32x4_t s0s4_9 = vmull_n_s16(s0, sinpi_4_9); - // s1 * sinpi_1_9, s1 * sinpi_2_9 - const int32x4_t s1s1_9 = vmull_n_s16(s1, sinpi_1_9); - const int32x4_t s1s2_9 = vmull_n_s16(s1, sinpi_2_9); - // s2 * sinpi_3_9 - const int32x4_t s2s3_9 = vmull_n_s16(s2, sinpi_3_9); - // s3 * sinpi_2_9, s3 * sinpi_4_9 - const int32x4_t s3s2_9 = vmull_n_s16(s3, sinpi_2_9); - const int32x4_t s3s4_9 = vmull_n_s16(s3, sinpi_4_9); - - // (s0 + s1) * sinpi_3_9 - const int32x4_t s0_p_s1 = vaddl_s16(s0, s1); - const int32x4_t s0_p_s1_m_s3 = vsubw_s16(s0_p_s1, s3); - - // s_0 * sinpi_1_9 + s_1 * sinpi_2_9 - // s_0 * sinpi_4_9 - s_1 * sinpi_1_9 - const int32x4_t s0s1_9_p_s1s2_9 = vaddq_s32(s0s1_9, s1s2_9); - const int32x4_t s0s4_9_m_s1s1_9 = vsubq_s32(s0s4_9, s1s1_9); - /* - * t0 = s0s1_9 + s1s2_9 + s3s4_9 - * t1 = (s0 + s1) * sinpi_3_9 - s3 * sinpi_3_9 - * t2 = s0s4_9 - s1s1_9 + s3s2_9 - * t3 = s2s3_9 - */ - const int32x4_t t0 = vaddq_s32(s0s1_9_p_s1s2_9, s3s4_9); - const int32x4_t t1 = vmulq_n_s32(s0_p_s1_m_s3, sinpi_3_9); - const int32x4_t t2 = vaddq_s32(s0s4_9_m_s1s1_9, s3s2_9); - const int32x4_t t3 = s2s3_9; + // t0 = s0 * sinpi_1_9 + s1 * sinpi_2_9 + s3 * sinpi_4_9 + t[0] = vmull_n_s16(s[0], sinpi_1_9); + t[0] = vmlal_n_s16(t[0], s[1], sinpi_2_9); + t[0] = vmlal_n_s16(t[0], s[3], sinpi_4_9); + + // t1 = (s0 + s1) * sinpi_3_9 - s3 * sinpi_3_9 + t[1] = vmull_n_s16(s[0], sinpi_3_9); + t[1] = vmlal_n_s16(t[1], s[1], sinpi_3_9); + t[1] = vmlsl_n_s16(t[1], s[3], sinpi_3_9); + + // t2 = s0 * sinpi_4_9 - s1* sinpi_1_9 + s3 * sinpi_2_9 + t[2] = vmull_n_s16(s[0], sinpi_4_9); + t[2] = vmlsl_n_s16(t[2], s[1], sinpi_1_9); + t[2] = vmlal_n_s16(t[2], s[3], sinpi_2_9); + + // t3 = s2 * sinpi_3_9 + t[3] = vmull_n_s16(s[2], sinpi_3_9); + /* * u0 = t0 + t3 * u1 = t1 * u2 = t2 - t3 * u3 = t2 - t0 + t3 */ - u0 = vaddq_s32(t0, t3); - u1 = t1; - u2 = vsubq_s32(t2, t3); - u3 = vaddq_s32(vsubq_s32(t2, t0), t3); + u[0] = vaddq_s32(t[0], t[3]); + u[1] = t[1]; + u[2] = vsubq_s32(t[2], t[3]); + u[3] = vaddq_s32(vsubq_s32(t[2], t[0]), t[3]); // fdct_round_shift - u0 = vaddq_s32(u0, k__DCT_CONST_ROUNDING); - u1 = vaddq_s32(u1, k__DCT_CONST_ROUNDING); - u2 = vaddq_s32(u2, k__DCT_CONST_ROUNDING); - u3 = vaddq_s32(u3, k__DCT_CONST_ROUNDING); - - out_0 = vshrn_n_s32(u0, DCT_CONST_BITS); - out_1 = vshrn_n_s32(u1, DCT_CONST_BITS); - out_2 = vshrn_n_s32(u2, DCT_CONST_BITS); - out_3 = vshrn_n_s32(u3, DCT_CONST_BITS); + out[0] = vrshrn_n_s32(u[0], DCT_CONST_BITS); + out[1] = vrshrn_n_s32(u[1], DCT_CONST_BITS); + out[2] = vrshrn_n_s32(u[2], DCT_CONST_BITS); + out[3] = vrshrn_n_s32(u[3], DCT_CONST_BITS); - transpose_s16_4x4d(&out_0, &out_1, &out_2, &out_3); + transpose_s16_4x4d(&out[0], &out[1], &out[2], &out[3]); - in[0] = vcombine_s16(out_0, out_1); - in[1] = vcombine_s16(out_2, out_3); + in[0] = vcombine_s16(out[0], out[1]); + in[1] = vcombine_s16(out[2], out[3]); } void vp9_fht4x4_neon(const int16_t *input, tran_low_t *output, int stride, @@ -239,245 +221,158 @@ static INLINE void write_buffer_8x8(tran_low_t *output, int16x8_t *res, } static INLINE void fadst8x8_neon(int16x8_t *in) { - int16x4_t x0_lo, x0_hi, x1_lo, x1_hi, x2_lo, x2_hi, x3_lo, x3_hi, x4_lo, - x4_hi, x5_lo, x5_hi, x6_lo, x6_hi, x7_lo, x7_hi; - int32x4_t s0_lo, s0_hi, s1_lo, s1_hi, s2_lo, s2_hi, s3_lo, s3_hi, s4_lo, - s4_hi, s5_lo, s5_hi, s6_lo, s6_hi, s7_lo, s7_hi; - int32x4_t t0_lo, t0_hi, t1_lo, t1_hi, t2_lo, t2_hi, t3_lo, t3_hi, t4_lo, - t4_hi, t5_lo, t5_hi, t6_lo, t6_hi, t7_lo, t7_hi; - const int32x4_t k__DCT_CONST_ROUNDING = vdupq_n_s32(DCT_CONST_ROUNDING); - - x0_lo = vget_low_s16(in[7]); - x0_hi = vget_high_s16(in[7]); - x1_lo = vget_low_s16(in[0]); - x1_hi = vget_high_s16(in[0]); - x2_lo = vget_low_s16(in[5]); - x2_hi = vget_high_s16(in[5]); - x3_lo = vget_low_s16(in[2]); - x3_hi = vget_high_s16(in[2]); - x4_lo = vget_low_s16(in[3]); - x4_hi = vget_high_s16(in[3]); - x5_lo = vget_low_s16(in[4]); - x5_hi = vget_high_s16(in[4]); - x6_lo = vget_low_s16(in[1]); - x6_hi = vget_high_s16(in[1]); - x7_lo = vget_low_s16(in[6]); - x7_hi = vget_high_s16(in[6]); + int16x4_t x_lo[8], x_hi[8]; + int32x4_t s_lo[8], s_hi[8]; + int32x4_t t_lo[8], t_hi[8]; + + x_lo[0] = vget_low_s16(in[7]); + x_hi[0] = vget_high_s16(in[7]); + x_lo[1] = vget_low_s16(in[0]); + x_hi[1] = vget_high_s16(in[0]); + x_lo[2] = vget_low_s16(in[5]); + x_hi[2] = vget_high_s16(in[5]); + x_lo[3] = vget_low_s16(in[2]); + x_hi[3] = vget_high_s16(in[2]); + x_lo[4] = vget_low_s16(in[3]); + x_hi[4] = vget_high_s16(in[3]); + x_lo[5] = vget_low_s16(in[4]); + x_hi[5] = vget_high_s16(in[4]); + x_lo[6] = vget_low_s16(in[1]); + x_hi[6] = vget_high_s16(in[1]); + x_lo[7] = vget_low_s16(in[6]); + x_hi[7] = vget_high_s16(in[6]); // stage 1 // s0 = cospi_2_64 * x0 + cospi_30_64 * x1; - s0_lo = vaddq_s32(vmull_n_s16(x0_lo, cospi_2_64), - vmull_n_s16(x1_lo, cospi_30_64)); - s0_hi = vaddq_s32(vmull_n_s16(x0_hi, cospi_2_64), - vmull_n_s16(x1_hi, cospi_30_64)); // s1 = cospi_30_64 * x0 - cospi_2_64 * x1; - s1_lo = vsubq_s32(vmull_n_s16(x0_lo, cospi_30_64), - vmull_n_s16(x1_lo, cospi_2_64)); - s1_hi = vsubq_s32(vmull_n_s16(x0_hi, cospi_30_64), - vmull_n_s16(x1_hi, cospi_2_64)); + butterfly_two_coeff_s16_s32_noround(x_lo[0], x_hi[0], x_lo[1], x_hi[1], + cospi_2_64, cospi_30_64, &s_lo[0], + &s_hi[0], &s_lo[1], &s_hi[1]); + // s2 = cospi_10_64 * x2 + cospi_22_64 * x3; - s2_lo = vaddq_s32(vmull_n_s16(x2_lo, cospi_10_64), - vmull_n_s16(x3_lo, cospi_22_64)); - s2_hi = vaddq_s32(vmull_n_s16(x2_hi, cospi_10_64), - vmull_n_s16(x3_hi, cospi_22_64)); // s3 = cospi_22_64 * x2 - cospi_10_64 * x3; - s3_lo = vsubq_s32(vmull_n_s16(x2_lo, cospi_22_64), - vmull_n_s16(x3_lo, cospi_10_64)); - s3_hi = vsubq_s32(vmull_n_s16(x2_hi, cospi_22_64), - vmull_n_s16(x3_hi, cospi_10_64)); + butterfly_two_coeff_s16_s32_noround(x_lo[2], x_hi[2], x_lo[3], x_hi[3], + cospi_10_64, cospi_22_64, &s_lo[2], + &s_hi[2], &s_lo[3], &s_hi[3]); + // s4 = cospi_18_64 * x4 + cospi_14_64 * x5; - s4_lo = vaddq_s32(vmull_n_s16(x4_lo, cospi_18_64), - vmull_n_s16(x5_lo, cospi_14_64)); - s4_hi = vaddq_s32(vmull_n_s16(x4_hi, cospi_18_64), - vmull_n_s16(x5_hi, cospi_14_64)); // s5 = cospi_14_64 * x4 - cospi_18_64 * x5; - s5_lo = vsubq_s32(vmull_n_s16(x4_lo, cospi_14_64), - vmull_n_s16(x5_lo, cospi_18_64)); - s5_hi = vsubq_s32(vmull_n_s16(x4_hi, cospi_14_64), - vmull_n_s16(x5_hi, cospi_18_64)); + butterfly_two_coeff_s16_s32_noround(x_lo[4], x_hi[4], x_lo[5], x_hi[5], + cospi_18_64, cospi_14_64, &s_lo[4], + &s_hi[4], &s_lo[5], &s_hi[5]); + // s6 = cospi_26_64 * x6 + cospi_6_64 * x7; - s6_lo = vaddq_s32(vmull_n_s16(x6_lo, cospi_26_64), - vmull_n_s16(x7_lo, cospi_6_64)); - s6_hi = vaddq_s32(vmull_n_s16(x6_hi, cospi_26_64), - vmull_n_s16(x7_hi, cospi_6_64)); // s7 = cospi_6_64 * x6 - cospi_26_64 * x7; - s7_lo = vsubq_s32(vmull_n_s16(x6_lo, cospi_6_64), - vmull_n_s16(x7_lo, cospi_26_64)); - s7_hi = vsubq_s32(vmull_n_s16(x6_hi, cospi_6_64), - vmull_n_s16(x7_hi, cospi_26_64)); + butterfly_two_coeff_s16_s32_noround(x_lo[6], x_hi[6], x_lo[7], x_hi[7], + cospi_26_64, cospi_6_64, &s_lo[6], + &s_hi[6], &s_lo[7], &s_hi[7]); // fdct_round_shift - t0_lo = vaddq_s32(s0_lo, s4_lo); - t0_hi = vaddq_s32(s0_hi, s4_hi); - t1_lo = vaddq_s32(s1_lo, s5_lo); - t1_hi = vaddq_s32(s1_hi, s5_hi); - t2_lo = vaddq_s32(s2_lo, s6_lo); - t2_hi = vaddq_s32(s2_hi, s6_hi); - t3_lo = vaddq_s32(s3_lo, s7_lo); - t3_hi = vaddq_s32(s3_hi, s7_hi); - t4_lo = vsubq_s32(s0_lo, s4_lo); - t4_hi = vsubq_s32(s0_hi, s4_hi); - t5_lo = vsubq_s32(s1_lo, s5_lo); - t5_hi = vsubq_s32(s1_hi, s5_hi); - t6_lo = vsubq_s32(s2_lo, s6_lo); - t6_hi = vsubq_s32(s2_hi, s6_hi); - t7_lo = vsubq_s32(s3_lo, s7_lo); - t7_hi = vsubq_s32(s3_hi, s7_hi); - - t0_lo = vaddq_s32(t0_lo, k__DCT_CONST_ROUNDING); - t0_hi = vaddq_s32(t0_hi, k__DCT_CONST_ROUNDING); - t1_lo = vaddq_s32(t1_lo, k__DCT_CONST_ROUNDING); - t1_hi = vaddq_s32(t1_hi, k__DCT_CONST_ROUNDING); - t2_lo = vaddq_s32(t2_lo, k__DCT_CONST_ROUNDING); - t2_hi = vaddq_s32(t2_hi, k__DCT_CONST_ROUNDING); - t3_lo = vaddq_s32(t3_lo, k__DCT_CONST_ROUNDING); - t3_hi = vaddq_s32(t3_hi, k__DCT_CONST_ROUNDING); - t4_lo = vaddq_s32(t4_lo, k__DCT_CONST_ROUNDING); - t4_hi = vaddq_s32(t4_hi, k__DCT_CONST_ROUNDING); - t5_lo = vaddq_s32(t5_lo, k__DCT_CONST_ROUNDING); - t5_hi = vaddq_s32(t5_hi, k__DCT_CONST_ROUNDING); - t6_lo = vaddq_s32(t6_lo, k__DCT_CONST_ROUNDING); - t6_hi = vaddq_s32(t6_hi, k__DCT_CONST_ROUNDING); - t7_lo = vaddq_s32(t7_lo, k__DCT_CONST_ROUNDING); - t7_hi = vaddq_s32(t7_hi, k__DCT_CONST_ROUNDING); - - t0_lo = vshrq_n_s32(t0_lo, DCT_CONST_BITS); - t0_hi = vshrq_n_s32(t0_hi, DCT_CONST_BITS); - t1_lo = vshrq_n_s32(t1_lo, DCT_CONST_BITS); - t1_hi = vshrq_n_s32(t1_hi, DCT_CONST_BITS); - t2_lo = vshrq_n_s32(t2_lo, DCT_CONST_BITS); - t2_hi = vshrq_n_s32(t2_hi, DCT_CONST_BITS); - t3_lo = vshrq_n_s32(t3_lo, DCT_CONST_BITS); - t3_hi = vshrq_n_s32(t3_hi, DCT_CONST_BITS); - t4_lo = vshrq_n_s32(t4_lo, DCT_CONST_BITS); - t4_hi = vshrq_n_s32(t4_hi, DCT_CONST_BITS); - t5_lo = vshrq_n_s32(t5_lo, DCT_CONST_BITS); - t5_hi = vshrq_n_s32(t5_hi, DCT_CONST_BITS); - t6_lo = vshrq_n_s32(t6_lo, DCT_CONST_BITS); - t6_hi = vshrq_n_s32(t6_hi, DCT_CONST_BITS); - t7_lo = vshrq_n_s32(t7_lo, DCT_CONST_BITS); - t7_hi = vshrq_n_s32(t7_hi, DCT_CONST_BITS); + t_lo[0] = vrshrq_n_s32(vaddq_s32(s_lo[0], s_lo[4]), DCT_CONST_BITS); + t_hi[0] = vrshrq_n_s32(vaddq_s32(s_hi[0], s_hi[4]), DCT_CONST_BITS); + t_lo[1] = vrshrq_n_s32(vaddq_s32(s_lo[1], s_lo[5]), DCT_CONST_BITS); + t_hi[1] = vrshrq_n_s32(vaddq_s32(s_hi[1], s_hi[5]), DCT_CONST_BITS); + t_lo[2] = vrshrq_n_s32(vaddq_s32(s_lo[2], s_lo[6]), DCT_CONST_BITS); + t_hi[2] = vrshrq_n_s32(vaddq_s32(s_hi[2], s_hi[6]), DCT_CONST_BITS); + t_lo[3] = vrshrq_n_s32(vaddq_s32(s_lo[3], s_lo[7]), DCT_CONST_BITS); + t_hi[3] = vrshrq_n_s32(vaddq_s32(s_hi[3], s_hi[7]), DCT_CONST_BITS); + t_lo[4] = vrshrq_n_s32(vsubq_s32(s_lo[0], s_lo[4]), DCT_CONST_BITS); + t_hi[4] = vrshrq_n_s32(vsubq_s32(s_hi[0], s_hi[4]), DCT_CONST_BITS); + t_lo[5] = vrshrq_n_s32(vsubq_s32(s_lo[1], s_lo[5]), DCT_CONST_BITS); + t_hi[5] = vrshrq_n_s32(vsubq_s32(s_hi[1], s_hi[5]), DCT_CONST_BITS); + t_lo[6] = vrshrq_n_s32(vsubq_s32(s_lo[2], s_lo[6]), DCT_CONST_BITS); + t_hi[6] = vrshrq_n_s32(vsubq_s32(s_hi[2], s_hi[6]), DCT_CONST_BITS); + t_lo[7] = vrshrq_n_s32(vsubq_s32(s_lo[3], s_lo[7]), DCT_CONST_BITS); + t_hi[7] = vrshrq_n_s32(vsubq_s32(s_hi[3], s_hi[7]), DCT_CONST_BITS); // stage 2 - s0_lo = t0_lo; - s0_hi = t0_hi; - s1_lo = t1_lo; - s1_hi = t1_hi; - s2_lo = t2_lo; - s2_hi = t2_hi; - s3_lo = t3_lo; - s3_hi = t3_hi; - s4_lo = vaddq_s32(vmulq_n_s32(t4_lo, cospi_8_64), - vmulq_n_s32(t5_lo, cospi_24_64)); - s4_hi = vaddq_s32(vmulq_n_s32(t4_hi, cospi_8_64), - vmulq_n_s32(t5_hi, cospi_24_64)); - s5_lo = vsubq_s32(vmulq_n_s32(t4_lo, cospi_24_64), - vmulq_n_s32(t5_lo, cospi_8_64)); - s5_hi = vsubq_s32(vmulq_n_s32(t4_hi, cospi_24_64), - vmulq_n_s32(t5_hi, cospi_8_64)); - s6_lo = vaddq_s32(vmulq_n_s32(t6_lo, -cospi_24_64), - vmulq_n_s32(t7_lo, cospi_8_64)); - s6_hi = vaddq_s32(vmulq_n_s32(t6_hi, -cospi_24_64), - vmulq_n_s32(t7_hi, cospi_8_64)); - s7_lo = vaddq_s32(vmulq_n_s32(t6_lo, cospi_8_64), - vmulq_n_s32(t7_lo, cospi_24_64)); - s7_hi = vaddq_s32(vmulq_n_s32(t6_hi, cospi_8_64), - vmulq_n_s32(t7_hi, cospi_24_64)); + s_lo[0] = t_lo[0]; + s_hi[0] = t_hi[0]; + s_lo[1] = t_lo[1]; + s_hi[1] = t_hi[1]; + s_lo[2] = t_lo[2]; + s_hi[2] = t_hi[2]; + s_lo[3] = t_lo[3]; + s_hi[3] = t_hi[3]; + // s4 = cospi_8_64 * x4 + cospi_24_64 * x5; + // s5 = cospi_24_64 * x4 - cospi_8_64 * x5; + butterfly_two_coeff_s32_noround(t_lo[4], t_hi[4], t_lo[5], t_hi[5], + cospi_8_64, cospi_24_64, &s_lo[4], &s_hi[4], + &s_lo[5], &s_hi[5]); + + // s6 = -cospi_24_64 * x6 + cospi_8_64 * x7; + // s7 = cospi_8_64 * x6 + cospi_24_64 * x7; + butterfly_two_coeff_s32_noround(t_lo[6], t_hi[6], t_lo[7], t_hi[7], + -cospi_24_64, cospi_8_64, &s_lo[6], &s_hi[6], + &s_lo[7], &s_hi[7]); + // fdct_round_shift // s0 + s2 - t0_lo = vaddq_s32(s0_lo, s2_lo); - t0_hi = vaddq_s32(s0_hi, s2_hi); + t_lo[0] = vaddq_s32(s_lo[0], s_lo[2]); + t_hi[0] = vaddq_s32(s_hi[0], s_hi[2]); // s1 + s3 - t1_lo = vaddq_s32(s1_lo, s3_lo); - t1_hi = vaddq_s32(s1_hi, s3_hi); + t_lo[1] = vaddq_s32(s_lo[1], s_lo[3]); + t_hi[1] = vaddq_s32(s_hi[1], s_hi[3]); // s0 - s2 - t2_lo = vsubq_s32(s0_lo, s2_lo); - t2_hi = vsubq_s32(s0_hi, s2_hi); + t_lo[2] = vsubq_s32(s_lo[0], s_lo[2]); + t_hi[2] = vsubq_s32(s_hi[0], s_hi[2]); // s1 - s3 - t3_lo = vsubq_s32(s1_lo, s3_lo); - t3_hi = vsubq_s32(s1_hi, s3_hi); + t_lo[3] = vsubq_s32(s_lo[1], s_lo[3]); + t_hi[3] = vsubq_s32(s_hi[1], s_hi[3]); // s4 + s6 - t4_lo = vaddq_s32(s4_lo, s6_lo); - t4_hi = vaddq_s32(s4_hi, s6_hi); + t_lo[4] = vrshrq_n_s32(vaddq_s32(s_lo[4], s_lo[6]), DCT_CONST_BITS); + t_hi[4] = vrshrq_n_s32(vaddq_s32(s_hi[4], s_hi[6]), DCT_CONST_BITS); // s5 + s7 - t5_lo = vaddq_s32(s5_lo, s7_lo); - t5_hi = vaddq_s32(s5_hi, s7_hi); + t_lo[5] = vrshrq_n_s32(vaddq_s32(s_lo[5], s_lo[7]), DCT_CONST_BITS); + t_hi[5] = vrshrq_n_s32(vaddq_s32(s_hi[5], s_hi[7]), DCT_CONST_BITS); // s4 - s6 - t6_lo = vsubq_s32(s4_lo, s6_lo); - t6_hi = vsubq_s32(s4_hi, s6_hi); + t_lo[6] = vrshrq_n_s32(vsubq_s32(s_lo[4], s_lo[6]), DCT_CONST_BITS); + t_hi[6] = vrshrq_n_s32(vsubq_s32(s_hi[4], s_hi[6]), DCT_CONST_BITS); // s5 - s7 - t7_lo = vsubq_s32(s5_lo, s7_lo); - t7_hi = vsubq_s32(s5_hi, s7_hi); - - // fdct_round_shift - t4_lo = vaddq_s32(t4_lo, k__DCT_CONST_ROUNDING); - t4_hi = vaddq_s32(t4_hi, k__DCT_CONST_ROUNDING); - t5_lo = vaddq_s32(t5_lo, k__DCT_CONST_ROUNDING); - t5_hi = vaddq_s32(t5_hi, k__DCT_CONST_ROUNDING); - t6_lo = vaddq_s32(t6_lo, k__DCT_CONST_ROUNDING); - t6_hi = vaddq_s32(t6_hi, k__DCT_CONST_ROUNDING); - t7_lo = vaddq_s32(t7_lo, k__DCT_CONST_ROUNDING); - t7_hi = vaddq_s32(t7_hi, k__DCT_CONST_ROUNDING); - t4_lo = vshrq_n_s32(t4_lo, DCT_CONST_BITS); - t4_hi = vshrq_n_s32(t4_hi, DCT_CONST_BITS); - t5_lo = vshrq_n_s32(t5_lo, DCT_CONST_BITS); - t5_hi = vshrq_n_s32(t5_hi, DCT_CONST_BITS); - t6_lo = vshrq_n_s32(t6_lo, DCT_CONST_BITS); - t6_hi = vshrq_n_s32(t6_hi, DCT_CONST_BITS); - t7_lo = vshrq_n_s32(t7_lo, DCT_CONST_BITS); - t7_hi = vshrq_n_s32(t7_hi, DCT_CONST_BITS); + t_lo[7] = vrshrq_n_s32(vsubq_s32(s_lo[5], s_lo[7]), DCT_CONST_BITS); + t_hi[7] = vrshrq_n_s32(vsubq_s32(s_hi[5], s_hi[7]), DCT_CONST_BITS); // stage 3 // cospi_16_64 * (x2 + x3) - s2_lo = vmulq_n_s32(vaddq_s32(t2_lo, t3_lo), cospi_16_64); - s2_hi = vmulq_n_s32(vaddq_s32(t2_hi, t3_hi), cospi_16_64); // cospi_16_64 * (x2 - x3) - s3_lo = vmulq_n_s32(vsubq_s32(t2_lo, t3_lo), cospi_16_64); - s3_hi = vmulq_n_s32(vsubq_s32(t2_hi, t3_hi), cospi_16_64); + butterfly_one_coeff_s32_noround(t_lo[2], t_hi[2], t_lo[3], t_hi[3], + cospi_16_64, &s_lo[2], &s_hi[2], &s_lo[3], + &s_hi[3]); + // cospi_16_64 * (x6 + x7) - s6_lo = vmulq_n_s32(vaddq_s32(t6_lo, t7_lo), cospi_16_64); - s6_hi = vmulq_n_s32(vaddq_s32(t6_hi, t7_hi), cospi_16_64); // cospi_16_64 * (x2 - x3) - s7_lo = vmulq_n_s32(vsubq_s32(t6_lo, t7_lo), cospi_16_64); - s7_hi = vmulq_n_s32(vsubq_s32(t6_hi, t7_hi), cospi_16_64); + butterfly_one_coeff_s32_noround(t_lo[6], t_hi[6], t_lo[7], t_hi[7], + cospi_16_64, &s_lo[6], &s_hi[6], &s_lo[7], + &s_hi[7]); // final fdct_round_shift - t2_lo = vaddq_s32(s2_lo, k__DCT_CONST_ROUNDING); - t2_hi = vaddq_s32(s2_hi, k__DCT_CONST_ROUNDING); - t3_lo = vaddq_s32(s3_lo, k__DCT_CONST_ROUNDING); - t3_hi = vaddq_s32(s3_hi, k__DCT_CONST_ROUNDING); - t6_lo = vaddq_s32(s6_lo, k__DCT_CONST_ROUNDING); - t6_hi = vaddq_s32(s6_hi, k__DCT_CONST_ROUNDING); - t7_lo = vaddq_s32(s7_lo, k__DCT_CONST_ROUNDING); - t7_hi = vaddq_s32(s7_hi, k__DCT_CONST_ROUNDING); - - x2_lo = vshrn_n_s32(t2_lo, DCT_CONST_BITS); - x2_hi = vshrn_n_s32(t2_hi, DCT_CONST_BITS); - x3_lo = vshrn_n_s32(t3_lo, DCT_CONST_BITS); - x3_hi = vshrn_n_s32(t3_hi, DCT_CONST_BITS); - x6_lo = vshrn_n_s32(t6_lo, DCT_CONST_BITS); - x6_hi = vshrn_n_s32(t6_hi, DCT_CONST_BITS); - x7_lo = vshrn_n_s32(t7_lo, DCT_CONST_BITS); - x7_hi = vshrn_n_s32(t7_hi, DCT_CONST_BITS); + x_lo[2] = vrshrn_n_s32(s_lo[2], DCT_CONST_BITS); + x_hi[2] = vrshrn_n_s32(s_hi[2], DCT_CONST_BITS); + x_lo[3] = vrshrn_n_s32(s_lo[3], DCT_CONST_BITS); + x_hi[3] = vrshrn_n_s32(s_hi[3], DCT_CONST_BITS); + x_lo[6] = vrshrn_n_s32(s_lo[6], DCT_CONST_BITS); + x_hi[6] = vrshrn_n_s32(s_hi[6], DCT_CONST_BITS); + x_lo[7] = vrshrn_n_s32(s_lo[7], DCT_CONST_BITS); + x_hi[7] = vrshrn_n_s32(s_hi[7], DCT_CONST_BITS); // x0, x1, x4, x5 narrow down to 16-bits directly - x0_lo = vmovn_s32(t0_lo); - x0_hi = vmovn_s32(t0_hi); - x1_lo = vmovn_s32(t1_lo); - x1_hi = vmovn_s32(t1_hi); - x4_lo = vmovn_s32(t4_lo); - x4_hi = vmovn_s32(t4_hi); - x5_lo = vmovn_s32(t5_lo); - x5_hi = vmovn_s32(t5_hi); - - in[0] = vcombine_s16(x0_lo, x0_hi); - in[1] = vnegq_s16(vcombine_s16(x4_lo, x4_hi)); - in[2] = vcombine_s16(x6_lo, x6_hi); - in[3] = vnegq_s16(vcombine_s16(x2_lo, x2_hi)); - in[4] = vcombine_s16(x3_lo, x3_hi); - in[5] = vnegq_s16(vcombine_s16(x7_lo, x7_hi)); - in[6] = vcombine_s16(x5_lo, x5_hi); - in[7] = vnegq_s16(vcombine_s16(x1_lo, x1_hi)); + x_lo[0] = vmovn_s32(t_lo[0]); + x_hi[0] = vmovn_s32(t_hi[0]); + x_lo[1] = vmovn_s32(t_lo[1]); + x_hi[1] = vmovn_s32(t_hi[1]); + x_lo[4] = vmovn_s32(t_lo[4]); + x_hi[4] = vmovn_s32(t_hi[4]); + x_lo[5] = vmovn_s32(t_lo[5]); + x_hi[5] = vmovn_s32(t_hi[5]); + + in[0] = vcombine_s16(x_lo[0], x_hi[0]); + in[1] = vnegq_s16(vcombine_s16(x_lo[4], x_hi[4])); + in[2] = vcombine_s16(x_lo[6], x_hi[6]); + in[3] = vnegq_s16(vcombine_s16(x_lo[2], x_hi[2])); + in[4] = vcombine_s16(x_lo[3], x_hi[3]); + in[5] = vnegq_s16(vcombine_s16(x_lo[7], x_hi[7])); + in[6] = vcombine_s16(x_lo[5], x_hi[5]); + in[7] = vnegq_s16(vcombine_s16(x_lo[1], x_hi[1])); transpose_s16_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6], &in[7]); @@ -553,7 +448,6 @@ static void fdct16_8col(int16x8_t *in) { int16x8_t i[8], s1[8], s2[8], s3[8], t[8]; int16x4_t t_lo[8], t_hi[8]; int32x4_t u_lo[8], u_hi[8]; - const int32x4_t k__DCT_CONST_ROUNDING = vdupq_n_s32(DCT_CONST_ROUNDING); // stage 1 i[0] = vaddq_s16(in[0], in[15]); @@ -602,23 +496,14 @@ static void fdct16_8col(int16x8_t *in) { u_lo[5] = vmull_n_s16(t_lo[5], cospi_16_64); u_hi[5] = vmull_n_s16(t_hi[5], cospi_16_64); - u_lo[2] = vaddq_s32(u_lo[2], k__DCT_CONST_ROUNDING); - u_hi[2] = vaddq_s32(u_hi[2], k__DCT_CONST_ROUNDING); - u_lo[3] = vaddq_s32(u_lo[3], k__DCT_CONST_ROUNDING); - u_hi[3] = vaddq_s32(u_hi[3], k__DCT_CONST_ROUNDING); - u_lo[4] = vaddq_s32(u_lo[4], k__DCT_CONST_ROUNDING); - u_hi[4] = vaddq_s32(u_hi[4], k__DCT_CONST_ROUNDING); - u_lo[5] = vaddq_s32(u_lo[5], k__DCT_CONST_ROUNDING); - u_hi[5] = vaddq_s32(u_hi[5], k__DCT_CONST_ROUNDING); - - t_lo[2] = vshrn_n_s32(u_lo[2], DCT_CONST_BITS); - t_hi[2] = vshrn_n_s32(u_hi[2], DCT_CONST_BITS); - t_lo[3] = vshrn_n_s32(u_lo[3], DCT_CONST_BITS); - t_hi[3] = vshrn_n_s32(u_hi[3], DCT_CONST_BITS); - t_lo[4] = vshrn_n_s32(u_lo[4], DCT_CONST_BITS); - t_hi[4] = vshrn_n_s32(u_hi[4], DCT_CONST_BITS); - t_lo[5] = vshrn_n_s32(u_lo[5], DCT_CONST_BITS); - t_hi[5] = vshrn_n_s32(u_hi[5], DCT_CONST_BITS); + t_lo[2] = vrshrn_n_s32(u_lo[2], DCT_CONST_BITS); + t_hi[2] = vrshrn_n_s32(u_hi[2], DCT_CONST_BITS); + t_lo[3] = vrshrn_n_s32(u_lo[3], DCT_CONST_BITS); + t_hi[3] = vrshrn_n_s32(u_hi[3], DCT_CONST_BITS); + t_lo[4] = vrshrn_n_s32(u_lo[4], DCT_CONST_BITS); + t_hi[4] = vrshrn_n_s32(u_hi[4], DCT_CONST_BITS); + t_lo[5] = vrshrn_n_s32(u_lo[5], DCT_CONST_BITS); + t_hi[5] = vrshrn_n_s32(u_hi[5], DCT_CONST_BITS); s2[2] = vcombine_s16(t_lo[2], t_hi[2]); s2[3] = vcombine_s16(t_lo[3], t_hi[3]); @@ -653,40 +538,26 @@ static void fdct16_8col(int16x8_t *in) { t_lo[7] = vget_low_s16(s3[7]); t_hi[7] = vget_high_s16(s3[7]); - u_lo[1] = vaddq_s32(vmull_n_s16(t_lo[1], -cospi_8_64), - vmull_n_s16(t_lo[6], cospi_24_64)); - u_hi[1] = vaddq_s32(vmull_n_s16(t_hi[1], -cospi_8_64), - vmull_n_s16(t_hi[6], cospi_24_64)); - u_lo[2] = vaddq_s32(vmull_n_s16(t_lo[2], cospi_24_64), - vmull_n_s16(t_lo[5], cospi_8_64)); - u_hi[2] = vaddq_s32(vmull_n_s16(t_hi[2], cospi_24_64), - vmull_n_s16(t_hi[5], cospi_8_64)); - u_lo[5] = vaddq_s32(vmull_n_s16(t_lo[2], cospi_8_64), - vmull_n_s16(t_lo[5], -cospi_24_64)); - u_hi[5] = vaddq_s32(vmull_n_s16(t_hi[2], cospi_8_64), - vmull_n_s16(t_hi[5], -cospi_24_64)); - u_lo[6] = vaddq_s32(vmull_n_s16(t_lo[1], cospi_24_64), - vmull_n_s16(t_lo[6], cospi_8_64)); - u_hi[6] = vaddq_s32(vmull_n_s16(t_hi[1], cospi_24_64), - vmull_n_s16(t_hi[6], cospi_8_64)); - - u_lo[1] = vaddq_s32(u_lo[1], k__DCT_CONST_ROUNDING); - u_hi[1] = vaddq_s32(u_hi[1], k__DCT_CONST_ROUNDING); - u_lo[2] = vaddq_s32(u_lo[2], k__DCT_CONST_ROUNDING); - u_hi[2] = vaddq_s32(u_hi[2], k__DCT_CONST_ROUNDING); - u_lo[5] = vaddq_s32(u_lo[5], k__DCT_CONST_ROUNDING); - u_hi[5] = vaddq_s32(u_hi[5], k__DCT_CONST_ROUNDING); - u_lo[6] = vaddq_s32(u_lo[6], k__DCT_CONST_ROUNDING); - u_hi[6] = vaddq_s32(u_hi[6], k__DCT_CONST_ROUNDING); - - t_lo[1] = vshrn_n_s32(u_lo[1], DCT_CONST_BITS); - t_hi[1] = vshrn_n_s32(u_hi[1], DCT_CONST_BITS); - t_lo[2] = vshrn_n_s32(u_lo[2], DCT_CONST_BITS); - t_hi[2] = vshrn_n_s32(u_hi[2], DCT_CONST_BITS); - t_lo[5] = vshrn_n_s32(u_lo[5], DCT_CONST_BITS); - t_hi[5] = vshrn_n_s32(u_hi[5], DCT_CONST_BITS); - t_lo[6] = vshrn_n_s32(u_lo[6], DCT_CONST_BITS); - t_hi[6] = vshrn_n_s32(u_hi[6], DCT_CONST_BITS); + // u[1] = -cospi_8_64 * t[1] + cospi_24_64 * t[6] + // u[6] = cospi_24_64 * t[1] + cospi_8_64 * t[6] + butterfly_two_coeff_s16_s32_noround(t_lo[1], t_hi[1], t_lo[6], t_hi[6], + -cospi_8_64, cospi_24_64, &u_lo[1], + &u_hi[1], &u_lo[6], &u_hi[6]); + + // u[5] = -cospi_24_64 * t[5] + cospi_8_64 * t[2] + // u[2] = cospi_8_64 * t[5] + cospi_24_64 * t[2] + butterfly_two_coeff_s16_s32_noround(t_lo[5], t_hi[5], t_lo[2], t_hi[2], + -cospi_24_64, cospi_8_64, &u_lo[5], + &u_hi[5], &u_lo[2], &u_hi[2]); + + t_lo[1] = vrshrn_n_s32(u_lo[1], DCT_CONST_BITS); + t_hi[1] = vrshrn_n_s32(u_hi[1], DCT_CONST_BITS); + t_lo[2] = vrshrn_n_s32(u_lo[2], DCT_CONST_BITS); + t_hi[2] = vrshrn_n_s32(u_hi[2], DCT_CONST_BITS); + t_lo[5] = vrshrn_n_s32(u_lo[5], DCT_CONST_BITS); + t_hi[5] = vrshrn_n_s32(u_hi[5], DCT_CONST_BITS); + t_lo[6] = vrshrn_n_s32(u_lo[6], DCT_CONST_BITS); + t_hi[6] = vrshrn_n_s32(u_hi[6], DCT_CONST_BITS); s2[1] = vcombine_s16(t_lo[1], t_hi[1]); s2[2] = vcombine_s16(t_lo[2], t_hi[2]); @@ -721,88 +592,47 @@ static void fdct16_8col(int16x8_t *in) { t_lo[7] = vget_low_s16(s1[7]); t_hi[7] = vget_high_s16(s1[7]); - // step1[0] * cospi_30_64 + step1[7] * cospi_2_64; - u_lo[0] = vaddq_s32(vmull_n_s16(t_lo[0], cospi_30_64), - vmull_n_s16(t_lo[7], cospi_2_64)); - u_hi[0] = vaddq_s32(vmull_n_s16(t_hi[0], cospi_30_64), - vmull_n_s16(t_hi[7], cospi_2_64)); - - // step1[1] * cospi_14_64 + step1[6] * cospi_18_64; - u_lo[1] = vaddq_s32(vmull_n_s16(t_lo[1], cospi_14_64), - vmull_n_s16(t_lo[6], cospi_18_64)); - u_hi[1] = vaddq_s32(vmull_n_s16(t_hi[1], cospi_14_64), - vmull_n_s16(t_hi[6], cospi_18_64)); - - // step1[2] * cospi_22_64 + step1[5] * cospi_10_64; - u_lo[2] = vaddq_s32(vmull_n_s16(t_lo[2], cospi_22_64), - vmull_n_s16(t_lo[5], cospi_10_64)); - u_hi[2] = vaddq_s32(vmull_n_s16(t_hi[2], cospi_22_64), - vmull_n_s16(t_hi[5], cospi_10_64)); - - // step1[3] * cospi_6_64 + step1[4] * cospi_26_64; - u_lo[3] = vaddq_s32(vmull_n_s16(t_lo[3], cospi_6_64), - vmull_n_s16(t_lo[4], cospi_26_64)); - u_hi[3] = vaddq_s32(vmull_n_s16(t_hi[3], cospi_6_64), - vmull_n_s16(t_hi[4], cospi_26_64)); - - // step1[3] * -cospi_26_64 + step1[4] * cospi_6_64; - u_lo[4] = vaddq_s32(vmull_n_s16(t_lo[3], -cospi_26_64), - vmull_n_s16(t_lo[4], cospi_6_64)); - u_hi[4] = vaddq_s32(vmull_n_s16(t_hi[3], -cospi_26_64), - vmull_n_s16(t_hi[4], cospi_6_64)); - - // step1[2] * -cospi_10_64 + step1[5] * cospi_22_64; - u_lo[5] = vaddq_s32(vmull_n_s16(t_lo[2], -cospi_10_64), - vmull_n_s16(t_lo[5], cospi_22_64)); - u_hi[5] = vaddq_s32(vmull_n_s16(t_hi[2], -cospi_10_64), - vmull_n_s16(t_hi[5], cospi_22_64)); - - // step1[1] * -cospi_18_64 + step1[6] * cospi_14_64; - u_lo[6] = vaddq_s32(vmull_n_s16(t_lo[1], -cospi_18_64), - vmull_n_s16(t_lo[6], cospi_14_64)); - u_hi[6] = vaddq_s32(vmull_n_s16(t_hi[1], -cospi_18_64), - vmull_n_s16(t_hi[6], cospi_14_64)); - - // step1[0] * -cospi_2_64 + step1[7] * cospi_30_64; - u_lo[7] = vaddq_s32(vmull_n_s16(t_lo[0], -cospi_2_64), - vmull_n_s16(t_lo[7], cospi_30_64)); - u_hi[7] = vaddq_s32(vmull_n_s16(t_hi[0], -cospi_2_64), - vmull_n_s16(t_hi[7], cospi_30_64)); + // u[0] = step1[7] * cospi_2_64 + step1[0] * cospi_30_64 + // u[7] = step1[7] * cospi_30_64 - step1[0] * cospi_2_64 + butterfly_two_coeff_s16_s32_noround(t_lo[7], t_hi[7], t_lo[0], t_hi[0], + cospi_2_64, cospi_30_64, &u_lo[0], + &u_hi[0], &u_lo[7], &u_hi[7]); + + // u[1] = step1[6] * cospi_18_64 + step1[1] * cospi_14_64 + // u[6] = step1[6] * cospi_14_64 - step1[1] * cospi_18_64 + butterfly_two_coeff_s16_s32_noround(t_lo[6], t_hi[6], t_lo[1], t_hi[1], + cospi_18_64, cospi_14_64, &u_lo[1], + &u_hi[1], &u_lo[6], &u_hi[6]); + + // u[2] = step1[5] * cospi_10_64 + step1[2] * cospi_22_64 + // u[5] = step1[5] * cospi_22_64 - step1[2] * cospi_10_64 + butterfly_two_coeff_s16_s32_noround(t_lo[5], t_hi[5], t_lo[2], t_hi[2], + cospi_10_64, cospi_22_64, &u_lo[2], + &u_hi[2], &u_lo[5], &u_hi[5]); + + // u[3] = step1[4] * cospi_26_64 + step1[3] * cospi_6_64 + // u[4] = step1[4] * cospi_6_64 - step1[3] * cospi_26_64 + butterfly_two_coeff_s16_s32_noround(t_lo[4], t_hi[4], t_lo[3], t_hi[3], + cospi_26_64, cospi_6_64, &u_lo[3], + &u_hi[3], &u_lo[4], &u_hi[4]); // final fdct_round_shift - u_lo[0] = vaddq_s32(u_lo[0], k__DCT_CONST_ROUNDING); - u_hi[0] = vaddq_s32(u_hi[0], k__DCT_CONST_ROUNDING); - u_lo[1] = vaddq_s32(u_lo[1], k__DCT_CONST_ROUNDING); - u_hi[1] = vaddq_s32(u_hi[1], k__DCT_CONST_ROUNDING); - u_lo[2] = vaddq_s32(u_lo[2], k__DCT_CONST_ROUNDING); - u_hi[2] = vaddq_s32(u_hi[2], k__DCT_CONST_ROUNDING); - u_lo[3] = vaddq_s32(u_lo[3], k__DCT_CONST_ROUNDING); - u_hi[3] = vaddq_s32(u_hi[3], k__DCT_CONST_ROUNDING); - u_lo[4] = vaddq_s32(u_lo[4], k__DCT_CONST_ROUNDING); - u_hi[4] = vaddq_s32(u_hi[4], k__DCT_CONST_ROUNDING); - u_lo[5] = vaddq_s32(u_lo[5], k__DCT_CONST_ROUNDING); - u_hi[5] = vaddq_s32(u_hi[5], k__DCT_CONST_ROUNDING); - u_lo[6] = vaddq_s32(u_lo[6], k__DCT_CONST_ROUNDING); - u_hi[6] = vaddq_s32(u_hi[6], k__DCT_CONST_ROUNDING); - u_lo[7] = vaddq_s32(u_lo[7], k__DCT_CONST_ROUNDING); - u_hi[7] = vaddq_s32(u_hi[7], k__DCT_CONST_ROUNDING); - - t_lo[0] = vshrn_n_s32(u_lo[0], DCT_CONST_BITS); - t_hi[0] = vshrn_n_s32(u_hi[0], DCT_CONST_BITS); - t_lo[1] = vshrn_n_s32(u_lo[1], DCT_CONST_BITS); - t_hi[1] = vshrn_n_s32(u_hi[1], DCT_CONST_BITS); - t_lo[2] = vshrn_n_s32(u_lo[2], DCT_CONST_BITS); - t_hi[2] = vshrn_n_s32(u_hi[2], DCT_CONST_BITS); - t_lo[3] = vshrn_n_s32(u_lo[3], DCT_CONST_BITS); - t_hi[3] = vshrn_n_s32(u_hi[3], DCT_CONST_BITS); - t_lo[4] = vshrn_n_s32(u_lo[4], DCT_CONST_BITS); - t_hi[4] = vshrn_n_s32(u_hi[4], DCT_CONST_BITS); - t_lo[5] = vshrn_n_s32(u_lo[5], DCT_CONST_BITS); - t_hi[5] = vshrn_n_s32(u_hi[5], DCT_CONST_BITS); - t_lo[6] = vshrn_n_s32(u_lo[6], DCT_CONST_BITS); - t_hi[6] = vshrn_n_s32(u_hi[6], DCT_CONST_BITS); - t_lo[7] = vshrn_n_s32(u_lo[7], DCT_CONST_BITS); - t_hi[7] = vshrn_n_s32(u_hi[7], DCT_CONST_BITS); + t_lo[0] = vrshrn_n_s32(u_lo[0], DCT_CONST_BITS); + t_hi[0] = vrshrn_n_s32(u_hi[0], DCT_CONST_BITS); + t_lo[1] = vrshrn_n_s32(u_lo[1], DCT_CONST_BITS); + t_hi[1] = vrshrn_n_s32(u_hi[1], DCT_CONST_BITS); + t_lo[2] = vrshrn_n_s32(u_lo[2], DCT_CONST_BITS); + t_hi[2] = vrshrn_n_s32(u_hi[2], DCT_CONST_BITS); + t_lo[3] = vrshrn_n_s32(u_lo[3], DCT_CONST_BITS); + t_hi[3] = vrshrn_n_s32(u_hi[3], DCT_CONST_BITS); + t_lo[4] = vrshrn_n_s32(u_lo[4], DCT_CONST_BITS); + t_hi[4] = vrshrn_n_s32(u_hi[4], DCT_CONST_BITS); + t_lo[5] = vrshrn_n_s32(u_lo[5], DCT_CONST_BITS); + t_hi[5] = vrshrn_n_s32(u_hi[5], DCT_CONST_BITS); + t_lo[6] = vrshrn_n_s32(u_lo[6], DCT_CONST_BITS); + t_hi[6] = vrshrn_n_s32(u_hi[6], DCT_CONST_BITS); + t_lo[7] = vrshrn_n_s32(u_lo[7], DCT_CONST_BITS); + t_hi[7] = vrshrn_n_s32(u_hi[7], DCT_CONST_BITS); in[0] = i[0]; in[2] = i[1]; @@ -827,7 +657,6 @@ static void fadst16_8col(int16x8_t *in) { int16x4_t x_lo[16], x_hi[16]; int32x4_t s_lo[16], s_hi[16]; int32x4_t t_lo[16], t_hi[16]; - const int32x4_t k__DCT_CONST_ROUNDING = vdupq_n_s32(DCT_CONST_ROUNDING); x_lo[0] = vget_low_s16(in[15]); x_hi[0] = vget_high_s16(in[15]); @@ -864,185 +693,79 @@ static void fadst16_8col(int16x8_t *in) { // stage 1 // s0 = cospi_1_64 * x0 + cospi_31_64 * x1; - s_lo[0] = vaddq_s32(vmull_n_s16(x_lo[0], cospi_1_64), - vmull_n_s16(x_lo[1], cospi_31_64)); - s_hi[0] = vaddq_s32(vmull_n_s16(x_hi[0], cospi_1_64), - vmull_n_s16(x_hi[1], cospi_31_64)); // s1 = cospi_31_64 * x0 - cospi_1_64 * x1; - s_lo[1] = vsubq_s32(vmull_n_s16(x_lo[0], cospi_31_64), - vmull_n_s16(x_lo[1], cospi_1_64)); - s_hi[1] = vsubq_s32(vmull_n_s16(x_hi[0], cospi_31_64), - vmull_n_s16(x_hi[1], cospi_1_64)); + butterfly_two_coeff_s16_s32_noround(x_lo[0], x_hi[0], x_lo[1], x_hi[1], + cospi_1_64, cospi_31_64, &s_lo[0], + &s_hi[0], &s_lo[1], &s_hi[1]); // s2 = cospi_5_64 * x2 + cospi_27_64 * x3; - s_lo[2] = vaddq_s32(vmull_n_s16(x_lo[2], cospi_5_64), - vmull_n_s16(x_lo[3], cospi_27_64)); - s_hi[2] = vaddq_s32(vmull_n_s16(x_hi[2], cospi_5_64), - vmull_n_s16(x_hi[3], cospi_27_64)); // s3 = cospi_27_64 * x2 - cospi_5_64 * x3; - s_lo[3] = vsubq_s32(vmull_n_s16(x_lo[2], cospi_27_64), - vmull_n_s16(x_lo[3], cospi_5_64)); - s_hi[3] = vsubq_s32(vmull_n_s16(x_hi[2], cospi_27_64), - vmull_n_s16(x_hi[3], cospi_5_64)); + butterfly_two_coeff_s16_s32_noround(x_lo[2], x_hi[2], x_lo[3], x_hi[3], + cospi_5_64, cospi_27_64, &s_lo[2], + &s_hi[2], &s_lo[3], &s_hi[3]); // s4 = cospi_9_64 * x4 + cospi_23_64 * x5; - s_lo[4] = vaddq_s32(vmull_n_s16(x_lo[4], cospi_9_64), - vmull_n_s16(x_lo[5], cospi_23_64)); - s_hi[4] = vaddq_s32(vmull_n_s16(x_hi[4], cospi_9_64), - vmull_n_s16(x_hi[5], cospi_23_64)); // s5 = cospi_23_64 * x4 - cospi_9_64 * x5; - s_lo[5] = vsubq_s32(vmull_n_s16(x_lo[4], cospi_23_64), - vmull_n_s16(x_lo[5], cospi_9_64)); - s_hi[5] = vsubq_s32(vmull_n_s16(x_hi[4], cospi_23_64), - vmull_n_s16(x_hi[5], cospi_9_64)); + butterfly_two_coeff_s16_s32_noround(x_lo[4], x_hi[4], x_lo[5], x_hi[5], + cospi_9_64, cospi_23_64, &s_lo[4], + &s_hi[4], &s_lo[5], &s_hi[5]); // s6 = cospi_13_64 * x6 + cospi_19_64 * x7; - s_lo[6] = vaddq_s32(vmull_n_s16(x_lo[6], cospi_13_64), - vmull_n_s16(x_lo[7], cospi_19_64)); - s_hi[6] = vaddq_s32(vmull_n_s16(x_hi[6], cospi_13_64), - vmull_n_s16(x_hi[7], cospi_19_64)); // s7 = cospi_19_64 * x6 - cospi_13_64 * x7; - s_lo[7] = vsubq_s32(vmull_n_s16(x_lo[6], cospi_19_64), - vmull_n_s16(x_lo[7], cospi_13_64)); - s_hi[7] = vsubq_s32(vmull_n_s16(x_hi[6], cospi_19_64), - vmull_n_s16(x_hi[7], cospi_13_64)); + butterfly_two_coeff_s16_s32_noround(x_lo[6], x_hi[6], x_lo[7], x_hi[7], + cospi_13_64, cospi_19_64, &s_lo[6], + &s_hi[6], &s_lo[7], &s_hi[7]); // s8 = cospi_17_64 * x8 + cospi_15_64 * x9; - s_lo[8] = vaddq_s32(vmull_n_s16(x_lo[8], cospi_17_64), - vmull_n_s16(x_lo[9], cospi_15_64)); - s_hi[8] = vaddq_s32(vmull_n_s16(x_hi[8], cospi_17_64), - vmull_n_s16(x_hi[9], cospi_15_64)); // s9 = cospi_15_64 * x8 - cospi_17_64 * x9; - s_lo[9] = vsubq_s32(vmull_n_s16(x_lo[8], cospi_15_64), - vmull_n_s16(x_lo[9], cospi_17_64)); - s_hi[9] = vsubq_s32(vmull_n_s16(x_hi[8], cospi_15_64), - vmull_n_s16(x_hi[9], cospi_17_64)); + butterfly_two_coeff_s16_s32_noround(x_lo[8], x_hi[8], x_lo[9], x_hi[9], + cospi_17_64, cospi_15_64, &s_lo[8], + &s_hi[8], &s_lo[9], &s_hi[9]); // s10 = cospi_21_64 * x10 + cospi_11_64 * x11; - s_lo[10] = vaddq_s32(vmull_n_s16(x_lo[10], cospi_21_64), - vmull_n_s16(x_lo[11], cospi_11_64)); - s_hi[10] = vaddq_s32(vmull_n_s16(x_hi[10], cospi_21_64), - vmull_n_s16(x_hi[11], cospi_11_64)); // s11 = cospi_11_64 * x10 - cospi_21_64 * x11; - s_lo[11] = vsubq_s32(vmull_n_s16(x_lo[10], cospi_11_64), - vmull_n_s16(x_lo[11], cospi_21_64)); - s_hi[11] = vsubq_s32(vmull_n_s16(x_hi[10], cospi_11_64), - vmull_n_s16(x_hi[11], cospi_21_64)); + butterfly_two_coeff_s16_s32_noround(x_lo[10], x_hi[10], x_lo[11], x_hi[11], + cospi_21_64, cospi_11_64, &s_lo[10], + &s_hi[10], &s_lo[11], &s_hi[11]); // s12 = cospi_25_64 * x12 + cospi_7_64 * x13; - s_lo[12] = vaddq_s32(vmull_n_s16(x_lo[12], cospi_25_64), - vmull_n_s16(x_lo[13], cospi_7_64)); - s_hi[12] = vaddq_s32(vmull_n_s16(x_hi[12], cospi_25_64), - vmull_n_s16(x_hi[13], cospi_7_64)); // s13 = cospi_7_64 * x12 - cospi_25_64 * x13; - s_lo[13] = vsubq_s32(vmull_n_s16(x_lo[12], cospi_7_64), - vmull_n_s16(x_lo[13], cospi_25_64)); - s_hi[13] = vsubq_s32(vmull_n_s16(x_hi[12], cospi_7_64), - vmull_n_s16(x_hi[13], cospi_25_64)); + butterfly_two_coeff_s16_s32_noround(x_lo[12], x_hi[12], x_lo[13], x_hi[13], + cospi_25_64, cospi_7_64, &s_lo[12], + &s_hi[12], &s_lo[13], &s_hi[13]); // s14 = cospi_29_64 * x14 + cospi_3_64 * x15; - s_lo[14] = vaddq_s32(vmull_n_s16(x_lo[14], cospi_29_64), - vmull_n_s16(x_lo[15], cospi_3_64)); - s_hi[14] = vaddq_s32(vmull_n_s16(x_hi[14], cospi_29_64), - vmull_n_s16(x_hi[15], cospi_3_64)); // s15 = cospi_3_64 * x14 - cospi_29_64 * x15; - s_lo[15] = vsubq_s32(vmull_n_s16(x_lo[14], cospi_3_64), - vmull_n_s16(x_lo[15], cospi_29_64)); - s_hi[15] = vsubq_s32(vmull_n_s16(x_hi[14], cospi_3_64), - vmull_n_s16(x_hi[15], cospi_29_64)); + butterfly_two_coeff_s16_s32_noround(x_lo[14], x_hi[14], x_lo[15], x_hi[15], + cospi_29_64, cospi_3_64, &s_lo[14], + &s_hi[14], &s_lo[15], &s_hi[15]); // fdct_round_shift - t_lo[0] = vaddq_s32(s_lo[0], s_lo[8]); - t_hi[0] = vaddq_s32(s_hi[0], s_hi[8]); - t_lo[1] = vaddq_s32(s_lo[1], s_lo[9]); - t_hi[1] = vaddq_s32(s_hi[1], s_hi[9]); - t_lo[2] = vaddq_s32(s_lo[2], s_lo[10]); - t_hi[2] = vaddq_s32(s_hi[2], s_hi[10]); - t_lo[3] = vaddq_s32(s_lo[3], s_lo[11]); - t_hi[3] = vaddq_s32(s_hi[3], s_hi[11]); - t_lo[4] = vaddq_s32(s_lo[4], s_lo[12]); - t_hi[4] = vaddq_s32(s_hi[4], s_hi[12]); - t_lo[5] = vaddq_s32(s_lo[5], s_lo[13]); - t_hi[5] = vaddq_s32(s_hi[5], s_hi[13]); - t_lo[6] = vaddq_s32(s_lo[6], s_lo[14]); - t_hi[6] = vaddq_s32(s_hi[6], s_hi[14]); - t_lo[7] = vaddq_s32(s_lo[7], s_lo[15]); - t_hi[7] = vaddq_s32(s_hi[7], s_hi[15]); - t_lo[8] = vsubq_s32(s_lo[0], s_lo[8]); - t_hi[8] = vsubq_s32(s_hi[0], s_hi[8]); - t_lo[9] = vsubq_s32(s_lo[1], s_lo[9]); - t_hi[9] = vsubq_s32(s_hi[1], s_hi[9]); - t_lo[10] = vsubq_s32(s_lo[2], s_lo[10]); - t_hi[10] = vsubq_s32(s_hi[2], s_hi[10]); - t_lo[11] = vsubq_s32(s_lo[3], s_lo[11]); - t_hi[11] = vsubq_s32(s_hi[3], s_hi[11]); - t_lo[12] = vsubq_s32(s_lo[4], s_lo[12]); - t_hi[12] = vsubq_s32(s_hi[4], s_hi[12]); - t_lo[13] = vsubq_s32(s_lo[5], s_lo[13]); - t_hi[13] = vsubq_s32(s_hi[5], s_hi[13]); - t_lo[14] = vsubq_s32(s_lo[6], s_lo[14]); - t_hi[14] = vsubq_s32(s_hi[6], s_hi[14]); - t_lo[15] = vsubq_s32(s_lo[7], s_lo[15]); - t_hi[15] = vsubq_s32(s_hi[7], s_hi[15]); - - t_lo[0] = vaddq_s32(t_lo[0], k__DCT_CONST_ROUNDING); - t_hi[0] = vaddq_s32(t_hi[0], k__DCT_CONST_ROUNDING); - t_lo[1] = vaddq_s32(t_lo[1], k__DCT_CONST_ROUNDING); - t_hi[1] = vaddq_s32(t_hi[1], k__DCT_CONST_ROUNDING); - t_lo[2] = vaddq_s32(t_lo[2], k__DCT_CONST_ROUNDING); - t_hi[2] = vaddq_s32(t_hi[2], k__DCT_CONST_ROUNDING); - t_lo[3] = vaddq_s32(t_lo[3], k__DCT_CONST_ROUNDING); - t_hi[3] = vaddq_s32(t_hi[3], k__DCT_CONST_ROUNDING); - t_lo[4] = vaddq_s32(t_lo[4], k__DCT_CONST_ROUNDING); - t_hi[4] = vaddq_s32(t_hi[4], k__DCT_CONST_ROUNDING); - t_lo[5] = vaddq_s32(t_lo[5], k__DCT_CONST_ROUNDING); - t_hi[5] = vaddq_s32(t_hi[5], k__DCT_CONST_ROUNDING); - t_lo[6] = vaddq_s32(t_lo[6], k__DCT_CONST_ROUNDING); - t_hi[6] = vaddq_s32(t_hi[6], k__DCT_CONST_ROUNDING); - t_lo[7] = vaddq_s32(t_lo[7], k__DCT_CONST_ROUNDING); - t_hi[7] = vaddq_s32(t_hi[7], k__DCT_CONST_ROUNDING); - t_lo[8] = vaddq_s32(t_lo[8], k__DCT_CONST_ROUNDING); - t_hi[8] = vaddq_s32(t_hi[8], k__DCT_CONST_ROUNDING); - t_lo[9] = vaddq_s32(t_lo[9], k__DCT_CONST_ROUNDING); - t_hi[9] = vaddq_s32(t_hi[9], k__DCT_CONST_ROUNDING); - t_lo[10] = vaddq_s32(t_lo[10], k__DCT_CONST_ROUNDING); - t_hi[10] = vaddq_s32(t_hi[10], k__DCT_CONST_ROUNDING); - t_lo[11] = vaddq_s32(t_lo[11], k__DCT_CONST_ROUNDING); - t_hi[11] = vaddq_s32(t_hi[11], k__DCT_CONST_ROUNDING); - t_lo[12] = vaddq_s32(t_lo[12], k__DCT_CONST_ROUNDING); - t_hi[12] = vaddq_s32(t_hi[12], k__DCT_CONST_ROUNDING); - t_lo[13] = vaddq_s32(t_lo[13], k__DCT_CONST_ROUNDING); - t_hi[13] = vaddq_s32(t_hi[13], k__DCT_CONST_ROUNDING); - t_lo[14] = vaddq_s32(t_lo[14], k__DCT_CONST_ROUNDING); - t_hi[14] = vaddq_s32(t_hi[14], k__DCT_CONST_ROUNDING); - t_lo[15] = vaddq_s32(t_lo[15], k__DCT_CONST_ROUNDING); - t_hi[15] = vaddq_s32(t_hi[15], k__DCT_CONST_ROUNDING); - - t_lo[0] = vshrq_n_s32(t_lo[0], DCT_CONST_BITS); - t_hi[0] = vshrq_n_s32(t_hi[0], DCT_CONST_BITS); - t_lo[1] = vshrq_n_s32(t_lo[1], DCT_CONST_BITS); - t_hi[1] = vshrq_n_s32(t_hi[1], DCT_CONST_BITS); - t_lo[2] = vshrq_n_s32(t_lo[2], DCT_CONST_BITS); - t_hi[2] = vshrq_n_s32(t_hi[2], DCT_CONST_BITS); - t_lo[3] = vshrq_n_s32(t_lo[3], DCT_CONST_BITS); - t_hi[3] = vshrq_n_s32(t_hi[3], DCT_CONST_BITS); - t_lo[4] = vshrq_n_s32(t_lo[4], DCT_CONST_BITS); - t_hi[4] = vshrq_n_s32(t_hi[4], DCT_CONST_BITS); - t_lo[5] = vshrq_n_s32(t_lo[5], DCT_CONST_BITS); - t_hi[5] = vshrq_n_s32(t_hi[5], DCT_CONST_BITS); - t_lo[6] = vshrq_n_s32(t_lo[6], DCT_CONST_BITS); - t_hi[6] = vshrq_n_s32(t_hi[6], DCT_CONST_BITS); - t_lo[7] = vshrq_n_s32(t_lo[7], DCT_CONST_BITS); - t_hi[7] = vshrq_n_s32(t_hi[7], DCT_CONST_BITS); - t_lo[8] = vshrq_n_s32(t_lo[8], DCT_CONST_BITS); - t_hi[8] = vshrq_n_s32(t_hi[8], DCT_CONST_BITS); - t_lo[9] = vshrq_n_s32(t_lo[9], DCT_CONST_BITS); - t_hi[9] = vshrq_n_s32(t_hi[9], DCT_CONST_BITS); - t_lo[10] = vshrq_n_s32(t_lo[10], DCT_CONST_BITS); - t_hi[10] = vshrq_n_s32(t_hi[10], DCT_CONST_BITS); - t_lo[11] = vshrq_n_s32(t_lo[11], DCT_CONST_BITS); - t_hi[11] = vshrq_n_s32(t_hi[11], DCT_CONST_BITS); - t_lo[12] = vshrq_n_s32(t_lo[12], DCT_CONST_BITS); - t_hi[12] = vshrq_n_s32(t_hi[12], DCT_CONST_BITS); - t_lo[13] = vshrq_n_s32(t_lo[13], DCT_CONST_BITS); - t_hi[13] = vshrq_n_s32(t_hi[13], DCT_CONST_BITS); - t_lo[14] = vshrq_n_s32(t_lo[14], DCT_CONST_BITS); - t_hi[14] = vshrq_n_s32(t_hi[14], DCT_CONST_BITS); - t_lo[15] = vshrq_n_s32(t_lo[15], DCT_CONST_BITS); - t_hi[15] = vshrq_n_s32(t_hi[15], DCT_CONST_BITS); + t_lo[0] = vrshrq_n_s32(vaddq_s32(s_lo[0], s_lo[8]), DCT_CONST_BITS); + t_hi[0] = vrshrq_n_s32(vaddq_s32(s_hi[0], s_hi[8]), DCT_CONST_BITS); + t_lo[1] = vrshrq_n_s32(vaddq_s32(s_lo[1], s_lo[9]), DCT_CONST_BITS); + t_hi[1] = vrshrq_n_s32(vaddq_s32(s_hi[1], s_hi[9]), DCT_CONST_BITS); + t_lo[2] = vrshrq_n_s32(vaddq_s32(s_lo[2], s_lo[10]), DCT_CONST_BITS); + t_hi[2] = vrshrq_n_s32(vaddq_s32(s_hi[2], s_hi[10]), DCT_CONST_BITS); + t_lo[3] = vrshrq_n_s32(vaddq_s32(s_lo[3], s_lo[11]), DCT_CONST_BITS); + t_hi[3] = vrshrq_n_s32(vaddq_s32(s_hi[3], s_hi[11]), DCT_CONST_BITS); + t_lo[4] = vrshrq_n_s32(vaddq_s32(s_lo[4], s_lo[12]), DCT_CONST_BITS); + t_hi[4] = vrshrq_n_s32(vaddq_s32(s_hi[4], s_hi[12]), DCT_CONST_BITS); + t_lo[5] = vrshrq_n_s32(vaddq_s32(s_lo[5], s_lo[13]), DCT_CONST_BITS); + t_hi[5] = vrshrq_n_s32(vaddq_s32(s_hi[5], s_hi[13]), DCT_CONST_BITS); + t_lo[6] = vrshrq_n_s32(vaddq_s32(s_lo[6], s_lo[14]), DCT_CONST_BITS); + t_hi[6] = vrshrq_n_s32(vaddq_s32(s_hi[6], s_hi[14]), DCT_CONST_BITS); + t_lo[7] = vrshrq_n_s32(vaddq_s32(s_lo[7], s_lo[15]), DCT_CONST_BITS); + t_hi[7] = vrshrq_n_s32(vaddq_s32(s_hi[7], s_hi[15]), DCT_CONST_BITS); + t_lo[8] = vrshrq_n_s32(vsubq_s32(s_lo[0], s_lo[8]), DCT_CONST_BITS); + t_hi[8] = vrshrq_n_s32(vsubq_s32(s_hi[0], s_hi[8]), DCT_CONST_BITS); + t_lo[9] = vrshrq_n_s32(vsubq_s32(s_lo[1], s_lo[9]), DCT_CONST_BITS); + t_hi[9] = vrshrq_n_s32(vsubq_s32(s_hi[1], s_hi[9]), DCT_CONST_BITS); + t_lo[10] = vrshrq_n_s32(vsubq_s32(s_lo[2], s_lo[10]), DCT_CONST_BITS); + t_hi[10] = vrshrq_n_s32(vsubq_s32(s_hi[2], s_hi[10]), DCT_CONST_BITS); + t_lo[11] = vrshrq_n_s32(vsubq_s32(s_lo[3], s_lo[11]), DCT_CONST_BITS); + t_hi[11] = vrshrq_n_s32(vsubq_s32(s_hi[3], s_hi[11]), DCT_CONST_BITS); + t_lo[12] = vrshrq_n_s32(vsubq_s32(s_lo[4], s_lo[12]), DCT_CONST_BITS); + t_hi[12] = vrshrq_n_s32(vsubq_s32(s_hi[4], s_hi[12]), DCT_CONST_BITS); + t_lo[13] = vrshrq_n_s32(vsubq_s32(s_lo[5], s_lo[13]), DCT_CONST_BITS); + t_hi[13] = vrshrq_n_s32(vsubq_s32(s_hi[5], s_hi[13]), DCT_CONST_BITS); + t_lo[14] = vrshrq_n_s32(vsubq_s32(s_lo[6], s_lo[14]), DCT_CONST_BITS); + t_hi[14] = vrshrq_n_s32(vsubq_s32(s_hi[6], s_hi[14]), DCT_CONST_BITS); + t_lo[15] = vrshrq_n_s32(vsubq_s32(s_lo[7], s_lo[15]), DCT_CONST_BITS); + t_hi[15] = vrshrq_n_s32(vsubq_s32(s_hi[7], s_hi[15]), DCT_CONST_BITS); // stage 2 s_lo[0] = t_lo[0]; @@ -1062,45 +785,25 @@ static void fadst16_8col(int16x8_t *in) { s_lo[7] = t_lo[7]; s_hi[7] = t_hi[7]; // s8 = x8 * cospi_4_64 + x9 * cospi_28_64; - s_lo[8] = vaddq_s32(vmulq_n_s32(t_lo[8], cospi_4_64), - vmulq_n_s32(t_lo[9], cospi_28_64)); - s_hi[8] = vaddq_s32(vmulq_n_s32(t_hi[8], cospi_4_64), - vmulq_n_s32(t_hi[9], cospi_28_64)); // s9 = x8 * cospi_28_64 - x9 * cospi_4_64; - s_lo[9] = vsubq_s32(vmulq_n_s32(t_lo[8], cospi_28_64), - vmulq_n_s32(t_lo[9], cospi_4_64)); - s_hi[9] = vsubq_s32(vmulq_n_s32(t_hi[8], cospi_28_64), - vmulq_n_s32(t_hi[9], cospi_4_64)); + butterfly_two_coeff_s32_noround(t_lo[8], t_hi[8], t_lo[9], t_hi[9], + cospi_4_64, cospi_28_64, &s_lo[8], &s_hi[8], + &s_lo[9], &s_hi[9]); // s10 = x10 * cospi_20_64 + x11 * cospi_12_64; - s_lo[10] = vaddq_s32(vmulq_n_s32(t_lo[10], cospi_20_64), - vmulq_n_s32(t_lo[11], cospi_12_64)); - s_hi[10] = vaddq_s32(vmulq_n_s32(t_hi[10], cospi_20_64), - vmulq_n_s32(t_hi[11], cospi_12_64)); // s11 = x10 * cospi_12_64 - x11 * cospi_20_64; - s_lo[11] = vsubq_s32(vmulq_n_s32(t_lo[10], cospi_12_64), - vmulq_n_s32(t_lo[11], cospi_20_64)); - s_hi[11] = vsubq_s32(vmulq_n_s32(t_hi[10], cospi_12_64), - vmulq_n_s32(t_hi[11], cospi_20_64)); + butterfly_two_coeff_s32_noround(t_lo[10], t_hi[10], t_lo[11], t_hi[11], + cospi_20_64, cospi_12_64, &s_lo[10], + &s_hi[10], &s_lo[11], &s_hi[11]); // s12 = -x12 * cospi_28_64 + x13 * cospi_4_64; - s_lo[12] = vaddq_s32(vmulq_n_s32(t_lo[12], -cospi_28_64), - vmulq_n_s32(t_lo[13], cospi_4_64)); - s_hi[12] = vaddq_s32(vmulq_n_s32(t_hi[12], -cospi_28_64), - vmulq_n_s32(t_hi[13], cospi_4_64)); // s13 = x12 * cospi_4_64 + x13 * cospi_28_64; - s_lo[13] = vaddq_s32(vmulq_n_s32(t_lo[12], cospi_4_64), - vmulq_n_s32(t_lo[13], cospi_28_64)); - s_hi[13] = vaddq_s32(vmulq_n_s32(t_hi[12], cospi_4_64), - vmulq_n_s32(t_hi[13], cospi_28_64)); + butterfly_two_coeff_s32_noround(t_lo[13], t_hi[13], t_lo[12], t_hi[12], + cospi_28_64, cospi_4_64, &s_lo[13], &s_hi[13], + &s_lo[12], &s_hi[12]); // s14 = -x14 * cospi_12_64 + x15 * cospi_20_64; - s_lo[14] = vaddq_s32(vmulq_n_s32(t_lo[14], -cospi_12_64), - vmulq_n_s32(t_lo[15], cospi_20_64)); - s_hi[14] = vaddq_s32(vmulq_n_s32(t_hi[14], -cospi_12_64), - vmulq_n_s32(t_hi[15], cospi_20_64)); // s15 = x14 * cospi_20_64 + x15 * cospi_12_64; - s_lo[15] = vaddq_s32(vmulq_n_s32(t_lo[14], cospi_20_64), - vmulq_n_s32(t_lo[15], cospi_12_64)); - s_hi[15] = vaddq_s32(vmulq_n_s32(t_hi[14], cospi_20_64), - vmulq_n_s32(t_hi[15], cospi_12_64)); + butterfly_two_coeff_s32_noround(t_lo[15], t_hi[15], t_lo[14], t_hi[14], + cospi_12_64, cospi_20_64, &s_lo[15], + &s_hi[15], &s_lo[14], &s_hi[14]); // s0 + s4 t_lo[0] = vaddq_s32(s_lo[0], s_lo[4]); @@ -1151,38 +854,22 @@ static void fadst16_8col(int16x8_t *in) { t_lo[15] = vsubq_s32(s_lo[11], s_lo[15]); t_hi[15] = vsubq_s32(s_hi[11], s_hi[15]); - t_lo[8] = vaddq_s32(t_lo[8], k__DCT_CONST_ROUNDING); - t_hi[8] = vaddq_s32(t_hi[8], k__DCT_CONST_ROUNDING); - t_lo[9] = vaddq_s32(t_lo[9], k__DCT_CONST_ROUNDING); - t_hi[9] = vaddq_s32(t_hi[9], k__DCT_CONST_ROUNDING); - t_lo[10] = vaddq_s32(t_lo[10], k__DCT_CONST_ROUNDING); - t_hi[10] = vaddq_s32(t_hi[10], k__DCT_CONST_ROUNDING); - t_lo[11] = vaddq_s32(t_lo[11], k__DCT_CONST_ROUNDING); - t_hi[11] = vaddq_s32(t_hi[11], k__DCT_CONST_ROUNDING); - t_lo[12] = vaddq_s32(t_lo[12], k__DCT_CONST_ROUNDING); - t_hi[12] = vaddq_s32(t_hi[12], k__DCT_CONST_ROUNDING); - t_lo[13] = vaddq_s32(t_lo[13], k__DCT_CONST_ROUNDING); - t_hi[13] = vaddq_s32(t_hi[13], k__DCT_CONST_ROUNDING); - t_lo[14] = vaddq_s32(t_lo[14], k__DCT_CONST_ROUNDING); - t_hi[14] = vaddq_s32(t_hi[14], k__DCT_CONST_ROUNDING); - t_lo[15] = vaddq_s32(t_lo[15], k__DCT_CONST_ROUNDING); - t_hi[15] = vaddq_s32(t_hi[15], k__DCT_CONST_ROUNDING); - t_lo[8] = vshrq_n_s32(t_lo[8], DCT_CONST_BITS); - t_hi[8] = vshrq_n_s32(t_hi[8], DCT_CONST_BITS); - t_lo[9] = vshrq_n_s32(t_lo[9], DCT_CONST_BITS); - t_hi[9] = vshrq_n_s32(t_hi[9], DCT_CONST_BITS); - t_lo[10] = vshrq_n_s32(t_lo[10], DCT_CONST_BITS); - t_hi[10] = vshrq_n_s32(t_hi[10], DCT_CONST_BITS); - t_lo[11] = vshrq_n_s32(t_lo[11], DCT_CONST_BITS); - t_hi[11] = vshrq_n_s32(t_hi[11], DCT_CONST_BITS); - t_lo[12] = vshrq_n_s32(t_lo[12], DCT_CONST_BITS); - t_hi[12] = vshrq_n_s32(t_hi[12], DCT_CONST_BITS); - t_lo[13] = vshrq_n_s32(t_lo[13], DCT_CONST_BITS); - t_hi[13] = vshrq_n_s32(t_hi[13], DCT_CONST_BITS); - t_lo[14] = vshrq_n_s32(t_lo[14], DCT_CONST_BITS); - t_hi[14] = vshrq_n_s32(t_hi[14], DCT_CONST_BITS); - t_lo[15] = vshrq_n_s32(t_lo[15], DCT_CONST_BITS); - t_hi[15] = vshrq_n_s32(t_hi[15], DCT_CONST_BITS); + t_lo[8] = vrshrq_n_s32(t_lo[8], DCT_CONST_BITS); + t_hi[8] = vrshrq_n_s32(t_hi[8], DCT_CONST_BITS); + t_lo[9] = vrshrq_n_s32(t_lo[9], DCT_CONST_BITS); + t_hi[9] = vrshrq_n_s32(t_hi[9], DCT_CONST_BITS); + t_lo[10] = vrshrq_n_s32(t_lo[10], DCT_CONST_BITS); + t_hi[10] = vrshrq_n_s32(t_hi[10], DCT_CONST_BITS); + t_lo[11] = vrshrq_n_s32(t_lo[11], DCT_CONST_BITS); + t_hi[11] = vrshrq_n_s32(t_hi[11], DCT_CONST_BITS); + t_lo[12] = vrshrq_n_s32(t_lo[12], DCT_CONST_BITS); + t_hi[12] = vrshrq_n_s32(t_hi[12], DCT_CONST_BITS); + t_lo[13] = vrshrq_n_s32(t_lo[13], DCT_CONST_BITS); + t_hi[13] = vrshrq_n_s32(t_hi[13], DCT_CONST_BITS); + t_lo[14] = vrshrq_n_s32(t_lo[14], DCT_CONST_BITS); + t_hi[14] = vrshrq_n_s32(t_hi[14], DCT_CONST_BITS); + t_lo[15] = vrshrq_n_s32(t_lo[15], DCT_CONST_BITS); + t_hi[15] = vrshrq_n_s32(t_hi[15], DCT_CONST_BITS); // stage 3 s_lo[0] = t_lo[0]; @@ -1194,25 +881,15 @@ static void fadst16_8col(int16x8_t *in) { s_lo[3] = t_lo[3]; s_hi[3] = t_hi[3]; // s4 = x4 * cospi_8_64 + x5 * cospi_24_64; - s_lo[4] = vaddq_s32(vmulq_n_s32(t_lo[4], cospi_8_64), - vmulq_n_s32(t_lo[5], cospi_24_64)); - s_hi[4] = vaddq_s32(vmulq_n_s32(t_hi[4], cospi_8_64), - vmulq_n_s32(t_hi[5], cospi_24_64)); // s5 = x4 * cospi_24_64 - x5 * cospi_8_64; - s_lo[5] = vaddq_s32(vmulq_n_s32(t_lo[4], cospi_24_64), - vmulq_n_s32(t_lo[5], -cospi_8_64)); - s_hi[5] = vaddq_s32(vmulq_n_s32(t_hi[4], cospi_24_64), - vmulq_n_s32(t_hi[5], -cospi_8_64)); + butterfly_two_coeff_s32_noround(t_lo[4], t_hi[4], t_lo[5], t_hi[5], + cospi_8_64, cospi_24_64, &s_lo[4], &s_hi[4], + &s_lo[5], &s_hi[5]); // s6 = -x6 * cospi_24_64 + x7 * cospi_8_64; - s_lo[6] = vaddq_s32(vmulq_n_s32(t_lo[6], -cospi_24_64), - vmulq_n_s32(t_lo[7], cospi_8_64)); - s_hi[6] = vaddq_s32(vmulq_n_s32(t_hi[6], -cospi_24_64), - vmulq_n_s32(t_hi[7], cospi_8_64)); // s7 = x6 * cospi_8_64 + x7 * cospi_24_64; - s_lo[7] = vaddq_s32(vmulq_n_s32(t_lo[6], cospi_8_64), - vmulq_n_s32(t_lo[7], cospi_24_64)); - s_hi[7] = vaddq_s32(vmulq_n_s32(t_hi[6], cospi_8_64), - vmulq_n_s32(t_hi[7], cospi_24_64)); + butterfly_two_coeff_s32_noround(t_lo[7], t_hi[7], t_lo[6], t_hi[6], + cospi_24_64, cospi_8_64, &s_lo[7], &s_hi[7], + &s_lo[6], &s_hi[6]); s_lo[8] = t_lo[8]; s_hi[8] = t_hi[8]; s_lo[9] = t_lo[9]; @@ -1222,25 +899,15 @@ static void fadst16_8col(int16x8_t *in) { s_lo[11] = t_lo[11]; s_hi[11] = t_hi[11]; // s12 = x12 * cospi_8_64 + x13 * cospi_24_64; - s_lo[12] = vaddq_s32(vmulq_n_s32(t_lo[12], cospi_8_64), - vmulq_n_s32(t_lo[13], cospi_24_64)); - s_hi[12] = vaddq_s32(vmulq_n_s32(t_hi[12], cospi_8_64), - vmulq_n_s32(t_hi[13], cospi_24_64)); // s13 = x12 * cospi_24_64 - x13 * cospi_8_64; - s_lo[13] = vaddq_s32(vmulq_n_s32(t_lo[12], cospi_24_64), - vmulq_n_s32(t_lo[13], -cospi_8_64)); - s_hi[13] = vaddq_s32(vmulq_n_s32(t_hi[12], cospi_24_64), - vmulq_n_s32(t_hi[13], -cospi_8_64)); + butterfly_two_coeff_s32_noround(t_lo[12], t_hi[12], t_lo[13], t_hi[13], + cospi_8_64, cospi_24_64, &s_lo[12], &s_hi[12], + &s_lo[13], &s_hi[13]); // s14 = -x14 * cospi_24_64 + x15 * cospi_8_64; - s_lo[14] = vaddq_s32(vmulq_n_s32(t_lo[14], -cospi_24_64), - vmulq_n_s32(t_lo[15], cospi_8_64)); - s_hi[14] = vaddq_s32(vmulq_n_s32(t_hi[14], -cospi_24_64), - vmulq_n_s32(t_hi[15], cospi_8_64)); // s15 = x14 * cospi_8_64 + x15 * cospi_24_64; - s_lo[15] = vaddq_s32(vmulq_n_s32(t_lo[14], cospi_8_64), - vmulq_n_s32(t_lo[15], cospi_24_64)); - s_hi[15] = vaddq_s32(vmulq_n_s32(t_hi[14], cospi_8_64), - vmulq_n_s32(t_hi[15], cospi_24_64)); + butterfly_two_coeff_s32_noround(t_lo[15], t_hi[15], t_lo[14], t_hi[14], + cospi_24_64, cospi_8_64, &s_lo[15], &s_hi[15], + &s_lo[14], &s_hi[14]); // s0 + s4 t_lo[0] = vaddq_s32(s_lo[0], s_lo[2]); @@ -1291,99 +958,62 @@ static void fadst16_8col(int16x8_t *in) { t_lo[15] = vsubq_s32(s_lo[13], s_lo[15]); t_hi[15] = vsubq_s32(s_hi[13], s_hi[15]); - t_lo[4] = vaddq_s32(t_lo[4], k__DCT_CONST_ROUNDING); - t_hi[4] = vaddq_s32(t_hi[4], k__DCT_CONST_ROUNDING); - t_lo[5] = vaddq_s32(t_lo[5], k__DCT_CONST_ROUNDING); - t_hi[5] = vaddq_s32(t_hi[5], k__DCT_CONST_ROUNDING); - t_lo[6] = vaddq_s32(t_lo[6], k__DCT_CONST_ROUNDING); - t_hi[6] = vaddq_s32(t_hi[6], k__DCT_CONST_ROUNDING); - t_lo[7] = vaddq_s32(t_lo[7], k__DCT_CONST_ROUNDING); - t_hi[7] = vaddq_s32(t_hi[7], k__DCT_CONST_ROUNDING); - t_lo[12] = vaddq_s32(t_lo[12], k__DCT_CONST_ROUNDING); - t_hi[12] = vaddq_s32(t_hi[12], k__DCT_CONST_ROUNDING); - t_lo[13] = vaddq_s32(t_lo[13], k__DCT_CONST_ROUNDING); - t_hi[13] = vaddq_s32(t_hi[13], k__DCT_CONST_ROUNDING); - t_lo[14] = vaddq_s32(t_lo[14], k__DCT_CONST_ROUNDING); - t_hi[14] = vaddq_s32(t_hi[14], k__DCT_CONST_ROUNDING); - t_lo[15] = vaddq_s32(t_lo[15], k__DCT_CONST_ROUNDING); - t_hi[15] = vaddq_s32(t_hi[15], k__DCT_CONST_ROUNDING); - t_lo[4] = vshrq_n_s32(t_lo[4], DCT_CONST_BITS); - t_hi[4] = vshrq_n_s32(t_hi[4], DCT_CONST_BITS); - t_lo[5] = vshrq_n_s32(t_lo[5], DCT_CONST_BITS); - t_hi[5] = vshrq_n_s32(t_hi[5], DCT_CONST_BITS); - t_lo[6] = vshrq_n_s32(t_lo[6], DCT_CONST_BITS); - t_hi[6] = vshrq_n_s32(t_hi[6], DCT_CONST_BITS); - t_lo[7] = vshrq_n_s32(t_lo[7], DCT_CONST_BITS); - t_hi[7] = vshrq_n_s32(t_hi[7], DCT_CONST_BITS); - t_lo[12] = vshrq_n_s32(t_lo[12], DCT_CONST_BITS); - t_hi[12] = vshrq_n_s32(t_hi[12], DCT_CONST_BITS); - t_lo[13] = vshrq_n_s32(t_lo[13], DCT_CONST_BITS); - t_hi[13] = vshrq_n_s32(t_hi[13], DCT_CONST_BITS); - t_lo[14] = vshrq_n_s32(t_lo[14], DCT_CONST_BITS); - t_hi[14] = vshrq_n_s32(t_hi[14], DCT_CONST_BITS); - t_lo[15] = vshrq_n_s32(t_lo[15], DCT_CONST_BITS); - t_hi[15] = vshrq_n_s32(t_hi[15], DCT_CONST_BITS); + t_lo[4] = vrshrq_n_s32(t_lo[4], DCT_CONST_BITS); + t_hi[4] = vrshrq_n_s32(t_hi[4], DCT_CONST_BITS); + t_lo[5] = vrshrq_n_s32(t_lo[5], DCT_CONST_BITS); + t_hi[5] = vrshrq_n_s32(t_hi[5], DCT_CONST_BITS); + t_lo[6] = vrshrq_n_s32(t_lo[6], DCT_CONST_BITS); + t_hi[6] = vrshrq_n_s32(t_hi[6], DCT_CONST_BITS); + t_lo[7] = vrshrq_n_s32(t_lo[7], DCT_CONST_BITS); + t_hi[7] = vrshrq_n_s32(t_hi[7], DCT_CONST_BITS); + t_lo[12] = vrshrq_n_s32(t_lo[12], DCT_CONST_BITS); + t_hi[12] = vrshrq_n_s32(t_hi[12], DCT_CONST_BITS); + t_lo[13] = vrshrq_n_s32(t_lo[13], DCT_CONST_BITS); + t_hi[13] = vrshrq_n_s32(t_hi[13], DCT_CONST_BITS); + t_lo[14] = vrshrq_n_s32(t_lo[14], DCT_CONST_BITS); + t_hi[14] = vrshrq_n_s32(t_hi[14], DCT_CONST_BITS); + t_lo[15] = vrshrq_n_s32(t_lo[15], DCT_CONST_BITS); + t_hi[15] = vrshrq_n_s32(t_hi[15], DCT_CONST_BITS); // stage 4 // s2 = (-cospi_16_64) * (x2 + x3); - s_lo[2] = vmulq_n_s32(vaddq_s32(t_lo[2], t_lo[3]), -cospi_16_64); - s_hi[2] = vmulq_n_s32(vaddq_s32(t_hi[2], t_hi[3]), -cospi_16_64); // s3 = cospi_16_64 * (x2 - x3); - s_lo[3] = vmulq_n_s32(vsubq_s32(t_lo[2], t_lo[3]), cospi_16_64); - s_hi[3] = vmulq_n_s32(vsubq_s32(t_hi[2], t_hi[3]), cospi_16_64); + butterfly_one_coeff_s32_noround(t_lo[3], t_hi[3], t_lo[2], t_hi[2], + -cospi_16_64, &s_lo[2], &s_hi[2], &s_lo[3], + &s_hi[3]); // s6 = cospi_16_64 * (x6 + x7); - s_lo[6] = vmulq_n_s32(vaddq_s32(t_lo[6], t_lo[7]), cospi_16_64); - s_hi[6] = vmulq_n_s32(vaddq_s32(t_hi[6], t_hi[7]), cospi_16_64); // s7 = cospi_16_64 * (-x6 + x7); - s_lo[7] = vmulq_n_s32(vsubq_s32(t_lo[7], t_lo[6]), cospi_16_64); - s_hi[7] = vmulq_n_s32(vsubq_s32(t_hi[7], t_hi[6]), cospi_16_64); + butterfly_one_coeff_s32_noround(t_lo[7], t_hi[7], t_lo[6], t_hi[6], + cospi_16_64, &s_lo[6], &s_hi[6], &s_lo[7], + &s_hi[7]); // s10 = cospi_16_64 * (x10 + x11); - s_lo[10] = vmulq_n_s32(vaddq_s32(t_lo[10], t_lo[11]), cospi_16_64); - s_hi[10] = vmulq_n_s32(vaddq_s32(t_hi[10], t_hi[11]), cospi_16_64); // s11 = cospi_16_64 * (-x10 + x11); - s_lo[11] = vmulq_n_s32(vsubq_s32(t_lo[11], t_lo[10]), cospi_16_64); - s_hi[11] = vmulq_n_s32(vsubq_s32(t_hi[11], t_hi[10]), cospi_16_64); + butterfly_one_coeff_s32_noround(t_lo[11], t_hi[11], t_lo[10], t_hi[10], + cospi_16_64, &s_lo[10], &s_hi[10], &s_lo[11], + &s_hi[11]); // s14 = (-cospi_16_64) * (x14 + x15); - s_lo[14] = vmulq_n_s32(vaddq_s32(t_lo[14], t_lo[15]), -cospi_16_64); - s_hi[14] = vmulq_n_s32(vaddq_s32(t_hi[14], t_hi[15]), -cospi_16_64); // s15 = cospi_16_64 * (x14 - x15); - s_lo[15] = vmulq_n_s32(vsubq_s32(t_lo[14], t_lo[15]), cospi_16_64); - s_hi[15] = vmulq_n_s32(vsubq_s32(t_hi[14], t_hi[15]), cospi_16_64); + butterfly_one_coeff_s32_noround(t_lo[15], t_hi[15], t_lo[14], t_hi[14], + -cospi_16_64, &s_lo[14], &s_hi[14], &s_lo[15], + &s_hi[15]); // final fdct_round_shift - t_lo[2] = vaddq_s32(s_lo[2], k__DCT_CONST_ROUNDING); - t_hi[2] = vaddq_s32(s_hi[2], k__DCT_CONST_ROUNDING); - t_lo[3] = vaddq_s32(s_lo[3], k__DCT_CONST_ROUNDING); - t_hi[3] = vaddq_s32(s_hi[3], k__DCT_CONST_ROUNDING); - t_lo[6] = vaddq_s32(s_lo[6], k__DCT_CONST_ROUNDING); - t_hi[6] = vaddq_s32(s_hi[6], k__DCT_CONST_ROUNDING); - t_lo[7] = vaddq_s32(s_lo[7], k__DCT_CONST_ROUNDING); - t_hi[7] = vaddq_s32(s_hi[7], k__DCT_CONST_ROUNDING); - t_lo[10] = vaddq_s32(s_lo[10], k__DCT_CONST_ROUNDING); - t_hi[10] = vaddq_s32(s_hi[10], k__DCT_CONST_ROUNDING); - t_lo[11] = vaddq_s32(s_lo[11], k__DCT_CONST_ROUNDING); - t_hi[11] = vaddq_s32(s_hi[11], k__DCT_CONST_ROUNDING); - t_lo[14] = vaddq_s32(s_lo[14], k__DCT_CONST_ROUNDING); - t_hi[14] = vaddq_s32(s_hi[14], k__DCT_CONST_ROUNDING); - t_lo[15] = vaddq_s32(s_lo[15], k__DCT_CONST_ROUNDING); - t_hi[15] = vaddq_s32(s_hi[15], k__DCT_CONST_ROUNDING); - - x_lo[2] = vshrn_n_s32(t_lo[2], DCT_CONST_BITS); - x_hi[2] = vshrn_n_s32(t_hi[2], DCT_CONST_BITS); - x_lo[3] = vshrn_n_s32(t_lo[3], DCT_CONST_BITS); - x_hi[3] = vshrn_n_s32(t_hi[3], DCT_CONST_BITS); - x_lo[6] = vshrn_n_s32(t_lo[6], DCT_CONST_BITS); - x_hi[6] = vshrn_n_s32(t_hi[6], DCT_CONST_BITS); - x_lo[7] = vshrn_n_s32(t_lo[7], DCT_CONST_BITS); - x_hi[7] = vshrn_n_s32(t_hi[7], DCT_CONST_BITS); - x_lo[10] = vshrn_n_s32(t_lo[10], DCT_CONST_BITS); - x_hi[10] = vshrn_n_s32(t_hi[10], DCT_CONST_BITS); - x_lo[11] = vshrn_n_s32(t_lo[11], DCT_CONST_BITS); - x_hi[11] = vshrn_n_s32(t_hi[11], DCT_CONST_BITS); - x_lo[14] = vshrn_n_s32(t_lo[14], DCT_CONST_BITS); - x_hi[14] = vshrn_n_s32(t_hi[14], DCT_CONST_BITS); - x_lo[15] = vshrn_n_s32(t_lo[15], DCT_CONST_BITS); - x_hi[15] = vshrn_n_s32(t_hi[15], DCT_CONST_BITS); + x_lo[2] = vrshrn_n_s32(s_lo[2], DCT_CONST_BITS); + x_hi[2] = vrshrn_n_s32(s_hi[2], DCT_CONST_BITS); + x_lo[3] = vrshrn_n_s32(s_lo[3], DCT_CONST_BITS); + x_hi[3] = vrshrn_n_s32(s_hi[3], DCT_CONST_BITS); + x_lo[6] = vrshrn_n_s32(s_lo[6], DCT_CONST_BITS); + x_hi[6] = vrshrn_n_s32(s_hi[6], DCT_CONST_BITS); + x_lo[7] = vrshrn_n_s32(s_lo[7], DCT_CONST_BITS); + x_hi[7] = vrshrn_n_s32(s_hi[7], DCT_CONST_BITS); + x_lo[10] = vrshrn_n_s32(s_lo[10], DCT_CONST_BITS); + x_hi[10] = vrshrn_n_s32(s_hi[10], DCT_CONST_BITS); + x_lo[11] = vrshrn_n_s32(s_lo[11], DCT_CONST_BITS); + x_hi[11] = vrshrn_n_s32(s_hi[11], DCT_CONST_BITS); + x_lo[14] = vrshrn_n_s32(s_lo[14], DCT_CONST_BITS); + x_hi[14] = vrshrn_n_s32(s_hi[14], DCT_CONST_BITS); + x_lo[15] = vrshrn_n_s32(s_lo[15], DCT_CONST_BITS); + x_hi[15] = vrshrn_n_s32(s_hi[15], DCT_CONST_BITS); // x0, x1, x4, x5, x8, x9, x12, x13 narrow down to 16-bits directly x_lo[0] = vmovn_s32(t_lo[0]); @@ -1465,3 +1095,137 @@ void vp9_fht16x16_neon(const int16_t *input, tran_low_t *output, int stride, break; } } + +#if CONFIG_VP9_HIGHBITDEPTH + +static INLINE void highbd_load_buffer_4x4(const int16_t *input, + int32x4_t *in /*[4]*/, int stride) { + // { 0, 1, 1, 1 }; + const int32x4_t nonzero_bias_a = vextq_s32(vdupq_n_s32(0), vdupq_n_s32(1), 3); + // { 1, 0, 0, 0 }; + const int32x4_t nonzero_bias_b = vextq_s32(vdupq_n_s32(1), vdupq_n_s32(0), 3); + int32x4_t mask; + + in[0] = vshll_n_s16(vld1_s16(input + 0 * stride), 4); + in[1] = vshll_n_s16(vld1_s16(input + 1 * stride), 4); + in[2] = vshll_n_s16(vld1_s16(input + 2 * stride), 4); + in[3] = vshll_n_s16(vld1_s16(input + 3 * stride), 4); + + // Copy the SSE method, use a mask to avoid an 'if' branch here to increase by + // one non-zero first elements + mask = vreinterpretq_s32_u32(vceqq_s32(in[0], nonzero_bias_a)); + in[0] = vaddq_s32(in[0], mask); + in[0] = vaddq_s32(in[0], nonzero_bias_b); +} + +static INLINE void highbd_write_buffer_4x4(tran_low_t *output, int32x4_t *res) { + const int32x4_t one = vdupq_n_s32(1); + res[0] = vshrq_n_s32(vaddq_s32(res[0], one), 2); + res[1] = vshrq_n_s32(vaddq_s32(res[1], one), 2); + res[2] = vshrq_n_s32(vaddq_s32(res[2], one), 2); + res[3] = vshrq_n_s32(vaddq_s32(res[3], one), 2); + vst1q_s32(output + 0 * 4, res[0]); + vst1q_s32(output + 1 * 4, res[1]); + vst1q_s32(output + 2 * 4, res[2]); + vst1q_s32(output + 3 * 4, res[3]); +} + +static INLINE void highbd_fadst4x4_neon(int32x4_t *in /*[4]*/) { + int32x2_t s_lo[4], s_hi[4]; + int64x2_t u_lo[4], u_hi[4], t_lo[4], t_hi[4]; + + s_lo[0] = vget_low_s32(in[0]); + s_hi[0] = vget_high_s32(in[0]); + s_lo[1] = vget_low_s32(in[1]); + s_hi[1] = vget_high_s32(in[1]); + s_lo[2] = vget_low_s32(in[2]); + s_hi[2] = vget_high_s32(in[2]); + s_lo[3] = vget_low_s32(in[3]); + s_hi[3] = vget_high_s32(in[3]); + + // t0 = s0 * sinpi_1_9 + s1 * sinpi_2_9 + s3 * sinpi_4_9 + t_lo[0] = vmull_n_s32(s_lo[0], sinpi_1_9); + t_lo[0] = vmlal_n_s32(t_lo[0], s_lo[1], sinpi_2_9); + t_lo[0] = vmlal_n_s32(t_lo[0], s_lo[3], sinpi_4_9); + t_hi[0] = vmull_n_s32(s_hi[0], sinpi_1_9); + t_hi[0] = vmlal_n_s32(t_hi[0], s_hi[1], sinpi_2_9); + t_hi[0] = vmlal_n_s32(t_hi[0], s_hi[3], sinpi_4_9); + + // t1 = (s0 + s1) * sinpi_3_9 - s3 * sinpi_3_9 + t_lo[1] = vmull_n_s32(s_lo[0], sinpi_3_9); + t_lo[1] = vmlal_n_s32(t_lo[1], s_lo[1], sinpi_3_9); + t_lo[1] = vmlsl_n_s32(t_lo[1], s_lo[3], sinpi_3_9); + t_hi[1] = vmull_n_s32(s_hi[0], sinpi_3_9); + t_hi[1] = vmlal_n_s32(t_hi[1], s_hi[1], sinpi_3_9); + t_hi[1] = vmlsl_n_s32(t_hi[1], s_hi[3], sinpi_3_9); + + // t2 = s0 * sinpi_4_9 - s1* sinpi_1_9 + s3 * sinpi_2_9 + t_lo[2] = vmull_n_s32(s_lo[0], sinpi_4_9); + t_lo[2] = vmlsl_n_s32(t_lo[2], s_lo[1], sinpi_1_9); + t_lo[2] = vmlal_n_s32(t_lo[2], s_lo[3], sinpi_2_9); + t_hi[2] = vmull_n_s32(s_hi[0], sinpi_4_9); + t_hi[2] = vmlsl_n_s32(t_hi[2], s_hi[1], sinpi_1_9); + t_hi[2] = vmlal_n_s32(t_hi[2], s_hi[3], sinpi_2_9); + + // t3 = s2 * sinpi_3_9 + t_lo[3] = vmull_n_s32(s_lo[2], sinpi_3_9); + t_hi[3] = vmull_n_s32(s_hi[2], sinpi_3_9); + + /* + * u0 = t0 + t3 + * u1 = t1 + * u2 = t2 - t3 + * u3 = t2 - t0 + t3 + */ + u_lo[0] = vaddq_s64(t_lo[0], t_lo[3]); + u_hi[0] = vaddq_s64(t_hi[0], t_hi[3]); + u_lo[1] = t_lo[1]; + u_hi[1] = t_hi[1]; + u_lo[2] = vsubq_s64(t_lo[2], t_lo[3]); + u_hi[2] = vsubq_s64(t_hi[2], t_hi[3]); + u_lo[3] = vaddq_s64(vsubq_s64(t_lo[2], t_lo[0]), t_lo[3]); + u_hi[3] = vaddq_s64(vsubq_s64(t_hi[2], t_hi[0]), t_hi[3]); + + // fdct_round_shift + in[0] = vcombine_s32(vrshrn_n_s64(u_lo[0], DCT_CONST_BITS), + vrshrn_n_s64(u_hi[0], DCT_CONST_BITS)); + in[1] = vcombine_s32(vrshrn_n_s64(u_lo[1], DCT_CONST_BITS), + vrshrn_n_s64(u_hi[1], DCT_CONST_BITS)); + in[2] = vcombine_s32(vrshrn_n_s64(u_lo[2], DCT_CONST_BITS), + vrshrn_n_s64(u_hi[2], DCT_CONST_BITS)); + in[3] = vcombine_s32(vrshrn_n_s64(u_lo[3], DCT_CONST_BITS), + vrshrn_n_s64(u_hi[3], DCT_CONST_BITS)); + + transpose_s32_4x4(&in[0], &in[1], &in[2], &in[3]); +} + +void vp9_highbd_fht4x4_neon(const int16_t *input, tran_low_t *output, + int stride, int tx_type) { + int32x4_t in[4]; + // int i; + + switch (tx_type) { + case DCT_DCT: vpx_highbd_fdct4x4_neon(input, output, stride); break; + case ADST_DCT: + highbd_load_buffer_4x4(input, in, stride); + highbd_fadst4x4_neon(in); + vpx_highbd_fdct4x4_pass1_neon(in); + highbd_write_buffer_4x4(output, in); + break; + case DCT_ADST: + highbd_load_buffer_4x4(input, in, stride); + vpx_highbd_fdct4x4_pass1_neon(in); + highbd_fadst4x4_neon(in); + highbd_write_buffer_4x4(output, in); + break; + default: + assert(tx_type == ADST_ADST); + highbd_load_buffer_4x4(input, in, stride); + highbd_fadst4x4_neon(in); + highbd_fadst4x4_neon(in); + highbd_write_buffer_4x4(output, in); + break; + } +} + +#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/arm/fdct_neon.h b/vpx_dsp/arm/fdct_neon.h index b33da427b4..193594e3dc 100644 --- a/vpx_dsp/arm/fdct_neon.h +++ b/vpx_dsp/arm/fdct_neon.h @@ -130,6 +130,24 @@ static INLINE void butterfly_one_coeff_s16_s32_narrow( *sub = vcombine_s16(vmovn_s32(sub32_lo), vmovn_s32(sub32_hi)); } +// fdct_round_shift((a +/- b) * c) +// Variant that performs fast vqrdmulhq_s32 operation on full vector +// more accurate does 32-bit processing, takes and returns 32-bit values, +// high/low +static INLINE void butterfly_one_coeff_s32_noround( + const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo, + const int32x4_t b_hi, const tran_coef_t constant, int32x4_t *add_lo, + int32x4_t *add_hi, int32x4_t *sub_lo, int32x4_t *sub_hi) { + const int32x4_t a1 = vmulq_n_s32(a_lo, constant); + const int32x4_t a2 = vmulq_n_s32(a_hi, constant); + const int32x4_t a3 = vmulq_n_s32(a_lo, constant); + const int32x4_t a4 = vmulq_n_s32(a_hi, constant); + *add_lo = vmlaq_n_s32(a1, b_lo, constant); + *add_hi = vmlaq_n_s32(a2, b_hi, constant); + *sub_lo = vmlsq_n_s32(a3, b_lo, constant); + *sub_hi = vmlsq_n_s32(a4, b_hi, constant); +} + // fdct_round_shift((a +/- b) * c) // Variant that performs fast vqrdmulhq_s32 operation on full vector // more accurate does 32-bit processing, takes and returns 32-bit values, @@ -234,6 +252,44 @@ static INLINE void butterfly_two_coeff_s32_s64_narrow( vrshrn_n_s64(diff[3], DCT_CONST_BITS)); } +// fdct_round_shift(a * c1 +/- b * c2) +// Original Variant that performs normal implementation on full vector +// more accurate does 32-bit processing, takes and returns 32-bit values +// returns narrowed results +static INLINE void butterfly_two_coeff_s16_s32_noround( + const int16x4_t a_lo, const int16x4_t a_hi, const int16x4_t b_lo, + const int16x4_t b_hi, const tran_coef_t constant1, + const tran_coef_t constant2, int32x4_t *add_lo, int32x4_t *add_hi, + int32x4_t *sub_lo, int32x4_t *sub_hi) { + const int32x4_t a1 = vmull_n_s16(a_lo, constant1); + const int32x4_t a2 = vmull_n_s16(a_hi, constant1); + const int32x4_t a3 = vmull_n_s16(a_lo, constant2); + const int32x4_t a4 = vmull_n_s16(a_hi, constant2); + *add_lo = vmlal_n_s16(a1, b_lo, constant2); + *add_hi = vmlal_n_s16(a2, b_hi, constant2); + *sub_lo = vmlsl_n_s16(a3, b_lo, constant1); + *sub_hi = vmlsl_n_s16(a4, b_hi, constant1); +} + +// fdct_round_shift(a * c1 +/- b * c2) +// Original Variant that performs normal implementation on full vector +// more accurate does 32-bit processing, takes and returns 32-bit values +// returns narrowed results +static INLINE void butterfly_two_coeff_s32_noround( + const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo, + const int32x4_t b_hi, const tran_coef_t constant1, + const tran_coef_t constant2, int32x4_t *add_lo, int32x4_t *add_hi, + int32x4_t *sub_lo, int32x4_t *sub_hi) { + const int32x4_t a1 = vmulq_n_s32(a_lo, constant1); + const int32x4_t a2 = vmulq_n_s32(a_hi, constant1); + const int32x4_t a3 = vmulq_n_s32(a_lo, constant2); + const int32x4_t a4 = vmulq_n_s32(a_hi, constant2); + *add_lo = vmlaq_n_s32(a1, b_lo, constant2); + *add_hi = vmlaq_n_s32(a2, b_hi, constant2); + *sub_lo = vmlsq_n_s32(a3, b_lo, constant1); + *sub_hi = vmlsq_n_s32(a4, b_hi, constant1); +} + // fdct_round_shift(a * c1 +/- b * c2) // Variant that performs normal implementation on half vector // more accurate does 32-bit processing, takes and returns 16-bit values From aeb6ae7393f09e66478f5b800ea989ae95a85e98 Mon Sep 17 00:00:00 2001 From: Johann Date: Mon, 14 Nov 2022 17:59:45 +0900 Subject: [PATCH 0547/1000] quantize: remove vp9_regular_quantize_b_4x4 This was just a helper function which called vpx_quantize_b or vpx_highbd_quantize_b. It also checked for skip_block, which was necessary when webm:1439 was filed but does not appear to be necessary now. Removes a quantize variant and makes subsequent cleanups easier. Change-Id: Ibe545eccd19370f07ff26c8e151f290c642efd2a --- vp9/encoder/vp9_quantize.c | 28 ------------------------- vp9/encoder/vp9_quantize.h | 3 --- vp9/encoder/vp9_rdopt.c | 43 +++++++++++++++++++++++++++++++------- 3 files changed, 35 insertions(+), 39 deletions(-) diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c index 9058997b0f..dcc44449fd 100644 --- a/vp9/encoder/vp9_quantize.c +++ b/vp9/encoder/vp9_quantize.c @@ -149,34 +149,6 @@ void vp9_highbd_quantize_fp_32x32_c( } #endif -void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block, - const int16_t *scan, const int16_t *iscan) { - MACROBLOCKD *const xd = &x->e_mbd; - struct macroblock_plane *p = &x->plane[plane]; - struct macroblockd_plane *pd = &xd->plane[plane]; - tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block), - *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); - const int n_coeffs = 4 * 4; - - if (x->skip_block) { - memset(qcoeff, 0, n_coeffs * sizeof(*qcoeff)); - memset(dqcoeff, 0, n_coeffs * sizeof(*dqcoeff)); - return; - } - -#if CONFIG_VP9_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - vpx_highbd_quantize_b(BLOCK_OFFSET(p->coeff, block), n_coeffs, p->zbin, - p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, - pd->dequant, &p->eobs[block], scan, iscan); - return; - } -#endif - vpx_quantize_b(BLOCK_OFFSET(p->coeff, block), n_coeffs, p->zbin, p->round, - p->quant, p->quant_shift, qcoeff, dqcoeff, pd->dequant, - &p->eobs[block], scan, iscan); -} - static void invert_quant(int16_t *quant, int16_t *shift, int d) { unsigned t; int l, m; diff --git a/vp9/encoder/vp9_quantize.h b/vp9/encoder/vp9_quantize.h index 2e6d7da2b6..f626f06566 100644 --- a/vp9/encoder/vp9_quantize.h +++ b/vp9/encoder/vp9_quantize.h @@ -37,9 +37,6 @@ typedef struct { DECLARE_ALIGNED(16, int16_t, uv_round[QINDEX_RANGE][8]); } QUANTS; -void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block, - const int16_t *scan, const int16_t *iscan); - struct VP9_COMP; struct VP9Common; diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index bfde5ab1a5..a464ce38f1 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -1108,6 +1108,8 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row, xd->mi[0]->tx_size = TX_4X4; + assert(!x->skip_block); + #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { for (mode = DC_PRED; mode <= TM_PRED; ++mode) { @@ -1135,7 +1137,10 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row, uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst); int16_t *const src_diff = vp9_raster_block_offset_int16(BLOCK_8X8, block, p->src_diff); - tran_low_t *const coeff = BLOCK_OFFSET(x->plane[0].coeff, block); + tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block); + tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block); + tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); + uint16_t *const eob = &p->eobs[block]; xd->mi[0]->bmi[block].as_mode = mode; vp9_predict_intra_block(xd, 1, TX_4X4, mode, x->skip_encode ? src : dst, @@ -1148,7 +1153,9 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row, const int coeff_ctx = combine_entropy_contexts(tempa[idx], templ[idy]); vp9_highbd_fwht4x4(src_diff, coeff, 8); - vp9_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan); + vpx_highbd_quantize_b(coeff, 4 * 4, p->zbin, p->round, p->quant, + p->quant_shift, qcoeff, dqcoeff, pd->dequant, + eob, so->scan, so->iscan); ratey += cost_coeffs(x, 0, block, TX_4X4, coeff_ctx, so->scan, so->neighbors, cpi->sf.use_fast_coef_costing); tempa[idx] = templ[idy] = (x->plane[0].eobs[block] > 0 ? 1 : 0); @@ -1166,7 +1173,9 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row, vpx_highbd_fdct4x4(src_diff, coeff, 8); else vp9_highbd_fht4x4(src_diff, coeff, 8, tx_type); - vp9_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan); + vpx_highbd_quantize_b(coeff, 4 * 4, p->zbin, p->round, p->quant, + p->quant_shift, qcoeff, dqcoeff, pd->dequant, + eob, so->scan, so->iscan); ratey += cost_coeffs(x, 0, block, TX_4X4, coeff_ctx, so->scan, so->neighbors, cpi->sf.use_fast_coef_costing); distortion += vp9_highbd_block_error_dispatch( @@ -1236,7 +1245,10 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row, uint8_t *const dst = &dst_init[idx * 4 + idy * 4 * dst_stride]; int16_t *const src_diff = vp9_raster_block_offset_int16(BLOCK_8X8, block, p->src_diff); - tran_low_t *const coeff = BLOCK_OFFSET(x->plane[0].coeff, block); + tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block); + tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block); + tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); + uint16_t *const eob = &p->eobs[block]; xd->mi[0]->bmi[block].as_mode = mode; vp9_predict_intra_block(xd, 1, TX_4X4, mode, x->skip_encode ? src : dst, x->skip_encode ? src_stride : dst_stride, dst, @@ -1248,7 +1260,9 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row, const int coeff_ctx = combine_entropy_contexts(tempa[idx], templ[idy]); vp9_fwht4x4(src_diff, coeff, 8); - vp9_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan); + vpx_quantize_b(coeff, 4 * 4, p->zbin, p->round, p->quant, + p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, + so->scan, so->iscan); ratey += cost_coeffs(x, 0, block, TX_4X4, coeff_ctx, so->scan, so->neighbors, cpi->sf.use_fast_coef_costing); tempa[idx] = templ[idy] = (x->plane[0].eobs[block] > 0) ? 1 : 0; @@ -1263,7 +1277,9 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row, const int coeff_ctx = combine_entropy_contexts(tempa[idx], templ[idy]); vp9_fht4x4(src_diff, coeff, 8, tx_type); - vp9_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan); + vpx_quantize_b(coeff, 4 * 4, p->zbin, p->round, p->quant, + p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, + so->scan, so->iscan); ratey += cost_coeffs(x, 0, block, TX_4X4, coeff_ctx, so->scan, so->neighbors, cpi->sf.use_fast_coef_costing); tempa[idx] = templ[idy] = (x->plane[0].eobs[block] > 0) ? 1 : 0; @@ -1640,6 +1656,8 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi, MACROBLOCK *x, const int is_compound = has_second_ref(mi); const InterpKernel *kernel = vp9_filter_kernels[mi->interp_filter]; + assert(!x->skip_block); + for (ref = 0; ref < 1 + is_compound; ++ref) { const int bw = b_width_log2_lookup[BLOCK_8X8]; const int h = 4 * (i >> bw); @@ -1701,18 +1719,27 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi, MACROBLOCK *x, const int bd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd : 8; #endif int64_t ssz, rd, rd1, rd2; - tran_low_t *coeff; + tran_low_t *coeff, *qcoeff, *dqcoeff; + uint16_t *eob; int coeff_ctx; k += (idy * 2 + idx); coeff_ctx = combine_entropy_contexts(ta[k & 1], tl[k >> 1]); coeff = BLOCK_OFFSET(p->coeff, k); + qcoeff = BLOCK_OFFSET(p->qcoeff, k); + dqcoeff = BLOCK_OFFSET(pd->dqcoeff, k); + eob = &p->eobs[k]; + x->fwd_txfm4x4(vp9_raster_block_offset_int16(BLOCK_8X8, k, p->src_diff), coeff, 8); - vp9_regular_quantize_b_4x4(x, 0, k, so->scan, so->iscan); #if CONFIG_VP9_HIGHBITDEPTH + vpx_highbd_quantize_b(coeff, 4 * 4, p->zbin, p->round, p->quant, + p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, + so->scan, so->iscan); thisdistortion += vp9_highbd_block_error_dispatch( coeff, BLOCK_OFFSET(pd->dqcoeff, k), 16, &ssz, bd); #else + vpx_quantize_b(coeff, 4 * 4, p->zbin, p->round, p->quant, p->quant_shift, + qcoeff, dqcoeff, pd->dequant, eob, so->scan, so->iscan); thisdistortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, k), 16, &ssz); #endif // CONFIG_VP9_HIGHBITDEPTH From 76e9bf7a184eb1caf979dd07e1107e3b74ac10b6 Mon Sep 17 00:00:00 2001 From: Marco Paniconi Date: Mon, 14 Nov 2022 22:11:19 -0800 Subject: [PATCH 0548/1000] vp9-svc: Fixes to make SVC work with VBR Prior to this CL SVC with VBR mode was broken. Fixes made here to make VBR rate control work for SVC. Rename is_one_pass_cbr_svc() --> is_one_pass_svc(), as it can be used now for both CBR and VBR. Added rate targetting unittest for (2SL, 3TL). Bug: chromium:1375111 Change-Id: I5a62ffe7fbea29dc5949c88a284768386b1907a9 --- test/svc_datarate_test.cc | 31 ++++++++++++++++++ vp9/encoder/vp9_aq_cyclicrefresh.c | 2 +- vp9/encoder/vp9_encodeframe.c | 6 ++-- vp9/encoder/vp9_encoder.c | 38 +++++++++++------------ vp9/encoder/vp9_encoder.h | 2 +- vp9/encoder/vp9_ratectrl.c | 50 ++++++++++++++++++++++++------ vp9/encoder/vp9_svc_layercontext.c | 6 ++-- vp9/encoder/vp9_svc_layercontext.h | 2 +- vp9/vp9_cx_iface.c | 5 ++- 9 files changed, 101 insertions(+), 41 deletions(-) diff --git a/test/svc_datarate_test.cc b/test/svc_datarate_test.cc index 010c273421..484252ca43 100644 --- a/test/svc_datarate_test.cc +++ b/test/svc_datarate_test.cc @@ -1203,6 +1203,37 @@ TEST_P(DatarateOnePassCbrSvcMultiBR, OnePassCbrSvc2SL3TL) { #endif } +// Check basic rate targeting for 1 pass VBR SVC: 2 spatial layers and +// 3 temporal layers. Run VGA clip with 1 thread. +TEST_P(DatarateOnePassCbrSvcMultiBR, OnePassVbrSvc2SL3TL) { + SetSvcConfig(2, 3); + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 2; + cfg_.rc_max_quantizer = 56; + cfg_.g_threads = 1; + cfg_.rc_dropframe_thresh = 30; + cfg_.kf_max_dist = 9999; + cfg_.rc_end_usage = VPX_VBR; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + top_sl_width_ = 640; + top_sl_height_ = 480; + const int bitrates[3] = { 200, 400, 600 }; + cfg_.rc_target_bitrate = bitrates[GET_PARAM(2)]; + ResetModel(); + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.70, + 1.3); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + // Params: speed setting, layer framedrop control and index for bitrate array. class DatarateOnePassCbrSvcFrameDropMultiBR : public DatarateOnePassCbrSvc, diff --git a/vp9/encoder/vp9_aq_cyclicrefresh.c b/vp9/encoder/vp9_aq_cyclicrefresh.c index 90792aebea..28ab10a13b 100644 --- a/vp9/encoder/vp9_aq_cyclicrefresh.c +++ b/vp9/encoder/vp9_aq_cyclicrefresh.c @@ -558,7 +558,7 @@ void vp9_cyclic_refresh_update_parameters(VP9_COMP *const cpi) { cr->percent_refresh = 10; cr->rate_ratio_qdelta = 1.5; cr->rate_boost_fac = 10; - if (cpi->refresh_golden_frame == 1) { + if (cpi->refresh_golden_frame == 1 && !cpi->use_svc) { cr->percent_refresh = 0; cr->rate_ratio_qdelta = 1.0; } diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index a9f392bf51..a1ee9c6784 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -1299,7 +1299,7 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, // the reference (base layer frame) is key frame (i.e., is_key_frame == 1). int is_key_frame = (frame_is_intra_only(cm) || - (is_one_pass_cbr_svc(cpi) && + (is_one_pass_svc(cpi) && cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame)); // Always use 4x4 partition for key frame. const int use_4x4_partition = frame_is_intra_only(cm); @@ -1406,7 +1406,7 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, assert(yv12 != NULL); - if (!(is_one_pass_cbr_svc(cpi) && cpi->svc.spatial_layer_id) || + if (!(is_one_pass_svc(cpi) && cpi->svc.spatial_layer_id) || cpi->svc.use_gf_temporal_ref_current_layer) { // For now, GOLDEN will not be used for non-zero spatial layers, since // it may not be a temporal reference. @@ -5381,7 +5381,7 @@ static void get_estimated_pred(VP9_COMP *cpi, const TileInfo *const tile, assert(yv12 != NULL); - if (!(is_one_pass_cbr_svc(cpi) && cpi->svc.spatial_layer_id) || + if (!(is_one_pass_svc(cpi) && cpi->svc.spatial_layer_id) || cpi->svc.use_gf_temporal_ref_current_layer) { // For now, GOLDEN will not be used for non-zero spatial layers, since // it may not be a temporal reference. diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index ca3439d7c0..87c5d7b67f 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -1333,7 +1333,7 @@ static void alloc_util_frame_buffers(VP9_COMP *cpi) { // For 1 pass cbr: allocate scaled_frame that may be used as an intermediate // buffer for a 2 stage down-sampling: two stages of 1:2 down-sampling for a // target of 1/4x1/4. number_spatial_layers must be greater than 2. - if (is_one_pass_cbr_svc(cpi) && !cpi->svc.scaled_temp_is_alloc && + if (is_one_pass_svc(cpi) && !cpi->svc.scaled_temp_is_alloc && cpi->svc.number_spatial_layers > 2) { cpi->svc.scaled_temp_is_alloc = 1; if (vpx_realloc_frame_buffer( @@ -1511,7 +1511,7 @@ static void init_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { // Temporal scalability. cpi->svc.number_temporal_layers = oxcf->ts_number_layers; - if ((cpi->svc.number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) || + if ((cpi->svc.number_temporal_layers > 1) || ((cpi->svc.number_temporal_layers > 1 || cpi->svc.number_spatial_layers > 1) && cpi->oxcf.pass != 1)) { @@ -2077,7 +2077,7 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { rc->rc_2_frame = 0; } - if ((cpi->svc.number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) || + if ((cpi->svc.number_temporal_layers > 1) || ((cpi->svc.number_temporal_layers > 1 || cpi->svc.number_spatial_layers > 1) && cpi->oxcf.pass != 1)) { @@ -3263,7 +3263,7 @@ void vp9_update_reference_frames(VP9_COMP *cpi) { vp9_denoiser_update_ref_frame(cpi); #endif - if (is_one_pass_cbr_svc(cpi)) vp9_svc_update_ref_frame(cpi); + if (is_one_pass_svc(cpi)) vp9_svc_update_ref_frame(cpi); } static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) { @@ -3857,11 +3857,11 @@ static int encode_without_recode_loop(VP9_COMP *cpi, size_t *size, int q = 0, bottom_index = 0, top_index = 0; int no_drop_scene_change = 0; const INTERP_FILTER filter_scaler = - (is_one_pass_cbr_svc(cpi)) + (is_one_pass_svc(cpi)) ? svc->downsample_filter_type[svc->spatial_layer_id] : EIGHTTAP; const int phase_scaler = - (is_one_pass_cbr_svc(cpi)) + (is_one_pass_svc(cpi)) ? svc->downsample_filter_phase[svc->spatial_layer_id] : 0; @@ -3882,7 +3882,7 @@ static int encode_without_recode_loop(VP9_COMP *cpi, size_t *size, set_frame_size(cpi); - if (is_one_pass_cbr_svc(cpi) && + if (is_one_pass_svc(cpi) && cpi->un_scaled_source->y_width == cm->width << 2 && cpi->un_scaled_source->y_height == cm->height << 2 && svc->scaled_temp.y_width == cm->width << 1 && @@ -3896,7 +3896,7 @@ static int encode_without_recode_loop(VP9_COMP *cpi, size_t *size, cm, cpi->un_scaled_source, &cpi->scaled_source, &svc->scaled_temp, filter_scaler, phase_scaler, filter_scaler2, phase_scaler2); svc->scaled_one_half = 1; - } else if (is_one_pass_cbr_svc(cpi) && + } else if (is_one_pass_svc(cpi) && cpi->un_scaled_source->y_width == cm->width << 1 && cpi->un_scaled_source->y_height == cm->height << 1 && svc->scaled_one_half) { @@ -3911,7 +3911,7 @@ static int encode_without_recode_loop(VP9_COMP *cpi, size_t *size, } #ifdef OUTPUT_YUV_SVC_SRC // Write out at most 3 spatial layers. - if (is_one_pass_cbr_svc(cpi) && svc->spatial_layer_id < 3) { + if (is_one_pass_svc(cpi) && svc->spatial_layer_id < 3) { vpx_write_yuv_frame(yuv_svc_src[svc->spatial_layer_id], cpi->Source); } #endif @@ -4020,14 +4020,14 @@ static int encode_without_recode_loop(VP9_COMP *cpi, size_t *size, if (vp9_rc_drop_frame(cpi)) return 0; } - // For 1 pass CBR SVC, only ZEROMV is allowed for spatial reference frame + // For 1 pass SVC, only ZEROMV is allowed for spatial reference frame // when svc->force_zero_mode_spatial_ref = 1. Under those conditions we can // avoid this frame-level upsampling (for non intra_only frames). // For SVC single_layer mode, dynamic resize is allowed and we need to // scale references for this case. if (frame_is_intra_only(cm) == 0 && ((svc->single_layer_svc && cpi->oxcf.resize_mode == RESIZE_DYNAMIC) || - !(is_one_pass_cbr_svc(cpi) && svc->force_zero_mode_spatial_ref))) { + !(is_one_pass_svc(cpi) && svc->force_zero_mode_spatial_ref))) { vp9_scale_references(cpi); } @@ -7613,8 +7613,8 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, const int gf_group_index = cpi->twopass.gf_group.index; int i; - if (is_one_pass_cbr_svc(cpi)) { - vp9_one_pass_cbr_svc_start_layer(cpi); + if (is_one_pass_svc(cpi)) { + vp9_one_pass_svc_start_layer(cpi); } vpx_usec_timer_start(&cmptimer); @@ -7634,7 +7634,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, // Normal defaults cm->reset_frame_context = 0; cm->refresh_frame_context = 1; - if (!is_one_pass_cbr_svc(cpi)) { + if (!is_one_pass_svc(cpi)) { cpi->refresh_last_frame = 1; cpi->refresh_golden_frame = 0; cpi->refresh_alt_ref_frame = 0; @@ -7767,7 +7767,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, adjust_frame_rate(cpi, source); } - if (is_one_pass_cbr_svc(cpi)) { + if (is_one_pass_svc(cpi)) { vp9_update_temporal_layer_framerate(cpi); vp9_restore_layer_context(cpi); } @@ -7901,9 +7901,9 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, } // Save layer specific state. - if (is_one_pass_cbr_svc(cpi) || ((cpi->svc.number_temporal_layers > 1 || - cpi->svc.number_spatial_layers > 1) && - oxcf->pass == 2)) { + if (is_one_pass_svc(cpi) || ((cpi->svc.number_temporal_layers > 1 || + cpi->svc.number_spatial_layers > 1) && + oxcf->pass == 2)) { vp9_save_layer_context(cpi); } @@ -8077,7 +8077,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, #endif - if (is_one_pass_cbr_svc(cpi)) { + if (is_one_pass_svc(cpi)) { if (cm->show_frame) { ++cpi->svc.spatial_layer_to_encode; if (cpi->svc.spatial_layer_to_encode >= cpi->svc.number_spatial_layers) diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 1d58945250..3e0b80677e 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -1305,7 +1305,7 @@ YV12_BUFFER_CONFIG *vp9_scale_if_required( void vp9_apply_encoding_flags(VP9_COMP *cpi, vpx_enc_frame_flags_t flags); -static INLINE int is_one_pass_cbr_svc(const struct VP9_COMP *const cpi) { +static INLINE int is_one_pass_svc(const struct VP9_COMP *const cpi) { return (cpi->use_svc && cpi->oxcf.pass == 0); } diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index 1ddf64d41a..d9207f7a2f 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -327,7 +327,7 @@ static void update_buffer_level_postencode(VP9_COMP *cpi, rc->buffer_level = rc->bits_off_target; - if (is_one_pass_cbr_svc(cpi)) { + if (is_one_pass_svc(cpi)) { update_layer_buffer_level_postencode(&cpi->svc, encoded_frame_size); } } @@ -910,7 +910,7 @@ static int calc_active_worst_quality_one_pass_vbr(const VP9_COMP *cpi) { active_worst_quality = curr_frame == 0 ? rc->worst_quality : rc->last_q[KEY_FRAME] << 1; } else { - if (!rc->is_src_frame_alt_ref && + if (!rc->is_src_frame_alt_ref && !cpi->use_svc && (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) { active_worst_quality = curr_frame == 1 @@ -1871,7 +1871,7 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) { } } } else { - if ((cpi->use_svc && oxcf->rc_mode == VPX_CBR) || + if ((cpi->use_svc) || (!rc->is_src_frame_alt_ref && !(cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame))) { rc->last_q[INTER_FRAME] = qindex; @@ -2021,6 +2021,11 @@ int vp9_calc_pframe_target_size_one_pass_vbr(const VP9_COMP *cpi) { (rc->baseline_gf_interval + af_ratio - 1) : ((int64_t)rc->avg_frame_bandwidth * rc->baseline_gf_interval) / (rc->baseline_gf_interval + af_ratio - 1); + // For SVC: refresh flags are used to define the pattern, so we can't + // use that for boosting the target size here. + // TODO(marpan): Consider adding internal boost on TL0 for VBR-SVC. + // For now just use the CBR logic for setting target size. + if (cpi->use_svc) target = vp9_calc_pframe_target_size_one_pass_cbr(cpi); if (target > INT_MAX) target = INT_MAX; return vp9_rc_clamp_pframe_target_size(cpi, (int)target); } @@ -2147,7 +2152,7 @@ int vp9_calc_pframe_target_size_one_pass_cbr(const VP9_COMP *cpi) { } else { target = rc->avg_frame_bandwidth; } - if (is_one_pass_cbr_svc(cpi)) { + if (is_one_pass_svc(cpi)) { // Note that for layers, avg_frame_bandwidth is the cumulative // per-frame-bandwidth. For the target size of this frame, use the // layer average frame size (i.e., non-cumulative per-frame-bw). @@ -2282,7 +2287,7 @@ void vp9_rc_get_svc_params(VP9_COMP *cpi) { (svc->spatial_layer_sync[0] == 1 && svc->spatial_layer_id == 0)) { cm->frame_type = KEY_FRAME; rc->source_alt_ref_active = 0; - if (is_one_pass_cbr_svc(cpi)) { + if (is_one_pass_svc(cpi)) { if (cm->current_video_frame > 0) vp9_svc_reset_temporal_layers(cpi, 1); layer = LAYER_IDS_TO_IDX(svc->spatial_layer_id, svc->temporal_layer_id, svc->number_temporal_layers); @@ -2290,11 +2295,14 @@ void vp9_rc_get_svc_params(VP9_COMP *cpi) { cpi->ref_frame_flags &= (~VP9_LAST_FLAG & ~VP9_GOLD_FLAG & ~VP9_ALT_FLAG); // Assumption here is that LAST_FRAME is being updated for a keyframe. // Thus no change in update flags. - target = vp9_calc_iframe_target_size_one_pass_cbr(cpi); + if (cpi->oxcf.rc_mode == VPX_CBR) + target = vp9_calc_iframe_target_size_one_pass_cbr(cpi); + else + target = vp9_calc_iframe_target_size_one_pass_vbr(cpi); } } else { cm->frame_type = INTER_FRAME; - if (is_one_pass_cbr_svc(cpi)) { + if (is_one_pass_svc(cpi)) { LAYER_CONTEXT *lc = &svc->layer_context[layer]; // Add condition current_video_frame > 0 for the case where first frame // is intra only followed by overlay/copy frame. In this case we don't @@ -2303,7 +2311,23 @@ void vp9_rc_get_svc_params(VP9_COMP *cpi) { (svc->spatial_layer_id == 0 && cm->current_video_frame > 0) ? 0 : svc->layer_context[svc->temporal_layer_id].is_key_frame; - target = vp9_calc_pframe_target_size_one_pass_cbr(cpi); + if (cpi->oxcf.rc_mode == VPX_CBR) { + target = vp9_calc_pframe_target_size_one_pass_cbr(cpi); + } else { + double rate_err = 0.0; + rc->fac_active_worst_inter = 140; + rc->fac_active_worst_gf = 100; + if (rc->rolling_target_bits > 0) { + rate_err = + (double)rc->rolling_actual_bits / (double)rc->rolling_target_bits; + if (rate_err < 1.0) + rc->fac_active_worst_inter = 120; + else if (rate_err > 2.0) + // Increase active_worst faster if rate fluctuation is high. + rc->fac_active_worst_inter = 160; + } + target = vp9_calc_pframe_target_size_one_pass_vbr(cpi); + } } } @@ -2312,7 +2336,10 @@ void vp9_rc_get_svc_params(VP9_COMP *cpi) { svc->layer_context[layer].is_key_frame == 1) { cm->frame_type = KEY_FRAME; cpi->ref_frame_flags &= (~VP9_LAST_FLAG & ~VP9_GOLD_FLAG & ~VP9_ALT_FLAG); - target = vp9_calc_iframe_target_size_one_pass_cbr(cpi); + if (cpi->oxcf.rc_mode == VPX_CBR) + target = vp9_calc_iframe_target_size_one_pass_cbr(cpi); + else + target = vp9_calc_iframe_target_size_one_pass_vbr(cpi); } // Set the buffer idx and refresh flags for key frames in simulcast mode. // Note the buffer slot for long-term reference is set below (line 2255), @@ -2397,7 +2424,10 @@ void vp9_rc_get_svc_params(VP9_COMP *cpi) { } if (svc->set_intra_only_frame) { set_intra_only_frame(cpi); - target = vp9_calc_iframe_target_size_one_pass_cbr(cpi); + if (cpi->oxcf.rc_mode == VPX_CBR) + target = vp9_calc_iframe_target_size_one_pass_cbr(cpi); + else + target = vp9_calc_iframe_target_size_one_pass_vbr(cpi); } // Overlay frame predicts from LAST (intra-only) if (svc->previous_frame_is_intra_only) cpi->ref_frame_flags |= VP9_LAST_FLAG; diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c index a57a70ab16..518c00b34a 100644 --- a/vp9/encoder/vp9_svc_layercontext.c +++ b/vp9/encoder/vp9_svc_layercontext.c @@ -290,7 +290,7 @@ void vp9_update_layer_context_change_config(VP9_COMP *const cpi, } static LAYER_CONTEXT *get_layer_context(VP9_COMP *const cpi) { - if (is_one_pass_cbr_svc(cpi)) + if (is_one_pass_svc(cpi)) return &cpi->svc.layer_context[cpi->svc.spatial_layer_id * cpi->svc.number_temporal_layers + cpi->svc.temporal_layer_id]; @@ -354,7 +354,7 @@ void vp9_restore_layer_context(VP9_COMP *const cpi) { cpi->alt_ref_source = lc->alt_ref_source; // Check if it is one_pass_cbr_svc mode and lc->speed > 0 (real-time mode // does not use speed = 0). - if (is_one_pass_cbr_svc(cpi) && lc->speed > 0) { + if (is_one_pass_svc(cpi) && lc->speed > 0) { cpi->oxcf.speed = lc->speed; } cpi->loopfilter_ctrl = lc->loopfilter_ctrl; @@ -754,7 +754,7 @@ void vp9_copy_flags_ref_update_idx(VP9_COMP *const cpi) { svc->reference_altref[sl] = (uint8_t)(cpi->ref_frame_flags & VP9_ALT_FLAG); } -int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) { +int vp9_one_pass_svc_start_layer(VP9_COMP *const cpi) { int width = 0, height = 0; SVC *const svc = &cpi->svc; LAYER_CONTEXT *lc = NULL; diff --git a/vp9/encoder/vp9_svc_layercontext.h b/vp9/encoder/vp9_svc_layercontext.h index b2d1d1b98f..c7328cf571 100644 --- a/vp9/encoder/vp9_svc_layercontext.h +++ b/vp9/encoder/vp9_svc_layercontext.h @@ -255,7 +255,7 @@ int vp9_denoise_svc_non_key(struct VP9_COMP *const cpi); void vp9_copy_flags_ref_update_idx(struct VP9_COMP *const cpi); -int vp9_one_pass_cbr_svc_start_layer(struct VP9_COMP *const cpi); +int vp9_one_pass_svc_start_layer(struct VP9_COMP *const cpi); void vp9_free_svc_cyclic_refresh(struct VP9_COMP *const cpi); diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index 02bd2e579b..695774e730 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -1527,9 +1527,8 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, cx_data += size; cx_data_sz -= size; - if (is_one_pass_cbr_svc(cpi) && - (cpi->svc.spatial_layer_id == - cpi->svc.number_spatial_layers - 1)) { + if (is_one_pass_svc(cpi) && (cpi->svc.spatial_layer_id == + cpi->svc.number_spatial_layers - 1)) { // Encoded all spatial layers; exit loop. break; } From 3fa698a6e855aad203517b8e71290b837ceda192 Mon Sep 17 00:00:00 2001 From: Hirokazu Honda Date: Thu, 17 Nov 2022 16:05:28 +0900 Subject: [PATCH 0549/1000] vp9/rate_ctrl_rtc: Improve get cyclic refresh data A client of the vp9 rate controller needs to know whether the segmentation is enabled and the size of delta_q. It is also nicer to know the size of map. This CL changes the interface to achieve these. Bug: b:259487065 Test: Build Change-Id: If05854530f97e1430a7b97788910f277ab673a87 --- vp9/ratectrl_rtc.cc | 16 ++++++++++------ vp9/ratectrl_rtc.h | 10 ++++++++-- 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/vp9/ratectrl_rtc.cc b/vp9/ratectrl_rtc.cc index f4d7f7e9e7..a9287b5a3e 100644 --- a/vp9/ratectrl_rtc.cc +++ b/vp9/ratectrl_rtc.cc @@ -205,12 +205,16 @@ int VP9RateControlRTC::GetLoopfilterLevel() const { return lf->filter_level; } -signed char *VP9RateControlRTC::GetCyclicRefreshMap() const { - return cpi_->cyclic_refresh->map; -} - -int *VP9RateControlRTC::GetDeltaQ() const { - return cpi_->cyclic_refresh->qindex_delta; +bool VP9RateControlRTC::GetSegmentationData( + VP9SegmentationData *segmentation_data) const { + if (!cpi_->cyclic_refresh->apply_cyclic_refresh) return false; + + segmentation_data->segmentation_map = cpi_->segmentation_map; + segmentation_data->segmentation_map_size = + cpi_->common.mi_cols * cpi_->common.mi_rows; + segmentation_data->delta_q = cpi_->cyclic_refresh->qindex_delta; + segmentation_data->delta_q_size = 3u; + return true; } void VP9RateControlRTC::PostEncodeUpdate(uint64_t encoded_frame_size) { diff --git a/vp9/ratectrl_rtc.h b/vp9/ratectrl_rtc.h index d2b9417aef..b209e4db66 100644 --- a/vp9/ratectrl_rtc.h +++ b/vp9/ratectrl_rtc.h @@ -58,6 +58,13 @@ struct VP9FrameParamsQpRTC { int temporal_layer_id; }; +struct VP9SegmentationData { + const uint8_t *segmentation_map; + size_t segmentation_map_size; + const int *delta_q; + size_t delta_q_size; +}; + // This interface allows using VP9 real-time rate control without initializing // the encoder. To use this interface, you need to link with libvpxrc.a. // @@ -110,8 +117,7 @@ class VP9RateControlRTC { // GetQP() needs to be called after ComputeQP() to get the latest QP int GetQP() const; int GetLoopfilterLevel() const; - signed char *GetCyclicRefreshMap() const; - int *GetDeltaQ() const; + bool GetSegmentationData(VP9SegmentationData *segmentation_data) const; void ComputeQP(const VP9FrameParamsQpRTC &frame_params); // Feedback to rate control with the size of current encoded frame void PostEncodeUpdate(uint64_t encoded_frame_size); From 2a8a25cf447914515dd7c27030f39b1cc06234f3 Mon Sep 17 00:00:00 2001 From: Marco Paniconi Date: Mon, 5 Dec 2022 11:54:33 -0800 Subject: [PATCH 0550/1000] rc-rtc: Remove frame_flags_ change in svc ratectril rtc test SVC test is only in CBR and the frame_flags are set by the SVC pattern, so we shouldn't undo them for svc mode. Change-Id: I5ffa65dd58a7b47f287d124d9e71ba1dc7c5a549 --- test/vp9_ratectrl_rtc_test.cc | 5 ----- 1 file changed, 5 deletions(-) diff --git a/test/vp9_ratectrl_rtc_test.cc b/test/vp9_ratectrl_rtc_test.cc index 03a58fa926..1f429b193a 100644 --- a/test/vp9_ratectrl_rtc_test.cc +++ b/test/vp9_ratectrl_rtc_test.cc @@ -181,11 +181,6 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest, } frame_params_.frame_type = video->frame() == 0 ? KEY_FRAME : INTER_FRAME; - if (rc_cfg_.rc_mode == VPX_CBR && frame_params_.frame_type == INTER_FRAME) { - // Disable golden frame update. - frame_flags_ |= VP8_EFLAG_NO_UPD_GF; - frame_flags_ |= VP8_EFLAG_NO_UPD_ARF; - } encoder_exit_ = video->frame() == kNumFrames; current_superframe_ = video->frame(); } From cbb780ab0bd68dc60e822c3b2e51f37d2128e9cd Mon Sep 17 00:00:00 2001 From: Marco Paniconi Date: Mon, 5 Dec 2022 14:30:40 -0800 Subject: [PATCH 0551/1000] rc-rtc: Test for periodic key in SVC external RC This test catches the fix merged in here: https://chromium-review.googlesource.com/c/webm/libvpx/+/4022904 Change-Id: Ib68fbcba694b5d465a9faf3ca7d6880bfe8eabb3 --- test/vp9_ratectrl_rtc_test.cc | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/test/vp9_ratectrl_rtc_test.cc b/test/vp9_ratectrl_rtc_test.cc index 1f429b193a..931e68c880 100644 --- a/test/vp9_ratectrl_rtc_test.cc +++ b/test/vp9_ratectrl_rtc_test.cc @@ -179,8 +179,8 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest, encoder->Control(VP9E_SET_SVC, 1); encoder->Control(VP9E_SET_SVC_PARAMETERS, &svc_params_); } - - frame_params_.frame_type = video->frame() == 0 ? KEY_FRAME : INTER_FRAME; + frame_params_.frame_type = + video->frame() % key_interval_ == 0 ? KEY_FRAME : INTER_FRAME; encoder_exit_ = video->frame() == kNumFrames; current_superframe_ = video->frame(); } @@ -214,6 +214,21 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest, void RunSvc() { SetConfigSvc(); + // kNumFrames = 300, so no key frames in this test. + key_interval_ = 10000; + rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_); + SetEncoderSvc(); + + ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv", + 1280, 720, 30, 1, 0, kNumFrames); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + } + + void RunSvcPeriodicKey() { + SetConfigSvc(); + // kNumFrames = 300, so 3 key frames in this test. + key_interval_ = 100; rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_); SetEncoderSvc(); @@ -297,6 +312,9 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest, cfg_.layer_target_bitrate[6] = 450; cfg_.layer_target_bitrate[7] = 630; cfg_.layer_target_bitrate[8] = 900; + + cfg_.kf_min_dist = key_interval_; + cfg_.kf_max_dist = key_interval_; } void SetConfigSvc() { @@ -355,6 +373,7 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest, bool encoder_exit_; int current_superframe_; uint32_t sizes_[8]; + int key_interval_; }; TEST_P(RcInterfaceTest, OneLayer) { RunOneLayer(); } @@ -363,6 +382,8 @@ TEST_P(RcInterfaceTest, OneLayerVBRPeriodicKey) { RunOneLayerVBRPeriodicKey(); } TEST_P(RcInterfaceSvcTest, Svc) { RunSvc(); } +TEST_P(RcInterfaceSvcTest, SvcPeriodicKey) { RunSvcPeriodicKey(); } + VP9_INSTANTIATE_TEST_SUITE(RcInterfaceTest, ::testing::Values(0, 3), ::testing::Values(VPX_CBR, VPX_VBR)); VP9_INSTANTIATE_TEST_SUITE(RcInterfaceSvcTest, ::testing::Values(0, 3)); From 5887bd234e5468be69f8e6e714623a152efeaf93 Mon Sep 17 00:00:00 2001 From: Cheng Chen Date: Fri, 2 Dec 2022 18:04:32 -0800 Subject: [PATCH 0552/1000] L2E: Add a new interface to control rdmult Allow external model to control frame rdmult. A function is called per frame to get the value of rdmult from the external model. The external rdmult will overwrite libvpx's default rdmult unless a reserved value is selected. A unit test is added to test when the default rdmult value is set. Change-Id: I2f17a036c188de66dc00709beef4bf2ed86a919a --- test/vp9_ext_ratectrl_test.cc | 81 ++++++++++++++++++++++++++++++++++ vp9/encoder/vp9_encoder.c | 26 +++++++++++ vp9/encoder/vp9_ext_ratectrl.c | 29 ++++++++++++ vp9/encoder/vp9_ext_ratectrl.h | 7 +++ vp9/encoder/vp9_rd.c | 11 +++++ vpx/vpx_ext_ratectrl.h | 34 +++++++++++++- 6 files changed, 186 insertions(+), 2 deletions(-) diff --git a/test/vp9_ext_ratectrl_test.cc b/test/vp9_ext_ratectrl_test.cc index 16e3248f76..2bfa6281d7 100644 --- a/test/vp9_ext_ratectrl_test.cc +++ b/test/vp9_ext_ratectrl_test.cc @@ -41,6 +41,7 @@ constexpr int kDefaultMaxGfInterval = 16; constexpr int kReadMinGfInterval = 5; constexpr int kReadMaxGfInterval = 13; const char kTestFileName[] = "bus_352x288_420_f20_b8.yuv"; +const double kPsnrThreshold = 30.50; struct ToyRateCtrl { int magic_number; @@ -642,6 +643,19 @@ vpx_rc_status_t rc_update_encodeframe_result_gop_short( return VPX_RC_OK; } +vpx_rc_status_t rc_get_default_frame_rdmult( + vpx_rc_model_t rate_ctrl_model, + const vpx_rc_encodeframe_info_t *encode_frame_info, int *rdmult) { + const ToyRateCtrl *toy_rate_ctrl = + static_cast(rate_ctrl_model); + EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); + EXPECT_LT(encode_frame_info->show_index, kFrameNumGOPShort); + EXPECT_EQ(encode_frame_info->coding_index, toy_rate_ctrl->coding_index); + + *rdmult = VPX_DEFAULT_RDMULT; + return VPX_RC_OK; +} + vpx_rc_status_t rc_delete_model(vpx_rc_model_t rate_ctrl_model) { ToyRateCtrl *toy_rate_ctrl = static_cast(rate_ctrl_model); EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); @@ -880,4 +894,71 @@ TEST_F(ExtRateCtrlTestGOPShortNoARF, EncodeTest) { ASSERT_NO_FATAL_FAILURE(RunLoop(video.get())); } +class ExtRateCtrlTestRdmult : public ::libvpx_test::EncoderTest, + public ::testing::Test { + protected: + ExtRateCtrlTestRdmult() : EncoderTest(&::libvpx_test::kVP9) {} + + ~ExtRateCtrlTestRdmult() override = default; + + void SetUp() override { + InitializeConfig(); + SetMode(::libvpx_test::kTwoPassGood); + } + + void BeginPassHook(unsigned int) override { + psnr_ = 0.0; + nframes_ = 0; + } + + void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) override { + psnr_ += pkt->data.psnr.psnr[0]; + nframes_++; + } + + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { + if (video->frame() == 0) { + vpx_rc_funcs_t rc_funcs; + rc_funcs.rc_type = VPX_RC_GOP_QP_RDMULT; + rc_funcs.create_model = rc_create_model_gop_short; + rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop_short; + rc_funcs.get_encodeframe_decision = rc_get_encodeframe_decision_gop_short; + rc_funcs.get_gop_decision = rc_get_gop_decision_short; + rc_funcs.update_encodeframe_result = + rc_update_encodeframe_result_gop_short; + rc_funcs.get_frame_rdmult = rc_get_default_frame_rdmult; + rc_funcs.delete_model = rc_delete_model; + rc_funcs.priv = reinterpret_cast(PrivMagicNumber); + encoder->Control(VP9E_SET_EXTERNAL_RATE_CONTROL, &rc_funcs); + } + } + + double GetAveragePsnr() const { + if (nframes_) return psnr_ / nframes_; + return 0.0; + } + + private: + double psnr_; + unsigned int nframes_; +}; + +TEST_F(ExtRateCtrlTestRdmult, DefaultRdmult) { + cfg_.rc_target_bitrate = 500; + cfg_.g_lag_in_frames = kMaxLagInFrames - 1; + cfg_.rc_end_usage = VPX_VBR; + init_flags_ = VPX_CODEC_USE_PSNR; + + std::unique_ptr video; + video.reset(new (std::nothrow) libvpx_test::YUVVideoSource( + kTestFileName, VPX_IMG_FMT_I420, 352, 288, 30, 1, 0, kFrameNumGOPShort)); + + ASSERT_NE(video, nullptr); + ASSERT_NO_FATAL_FAILURE(RunLoop(video.get())); + + const double psnr = GetAveragePsnr(); + EXPECT_GT(psnr, kPsnrThreshold); +} + } // namespace diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 87c5d7b67f..5cfd846dd0 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -5515,6 +5515,32 @@ static void encode_frame_to_data_rate( save_encode_params(cpi); } #endif + if (cpi->ext_ratectrl.ready && + (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_RDMULT) != 0) { + vpx_codec_err_t codec_status; + const GF_GROUP *gf_group = &cpi->twopass.gf_group; + FRAME_UPDATE_TYPE update_type = gf_group->update_type[gf_group->index]; + const int ref_frame_flags = get_ref_frame_flags(cpi); + RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES]; + const RefCntBuffer *curr_frame_buf = get_ref_cnt_buffer(cm, cm->new_fb_idx); + // index 0 of a gf group is always KEY/OVERLAY/GOLDEN. + // index 1 refers to the first encoding frame in a gf group. + // Therefore if it is ARF_UPDATE, it means this gf group uses alt ref. + // See function define_gf_group_structure(). + const int use_alt_ref = gf_group->update_type[1] == ARF_UPDATE; + int ext_rdmult = VPX_DEFAULT_RDMULT; + get_ref_frame_bufs(cpi, ref_frame_bufs); + codec_status = vp9_extrc_get_frame_rdmult( + &cpi->ext_ratectrl, curr_frame_buf->frame_index, + cm->current_frame_coding_index, gf_group->index, update_type, + gf_group->gf_group_size, use_alt_ref, ref_frame_bufs, ref_frame_flags, + &ext_rdmult); + if (codec_status != VPX_CODEC_OK) { + vpx_internal_error(&cm->error, codec_status, + "vp9_extrc_get_frame_rdmult() failed"); + } + cpi->ext_ratectrl.ext_rdmult = ext_rdmult; + } if (cpi->sf.recode_loop == DISALLOW_RECODE) { if (!encode_without_recode_loop(cpi, size, dest)) return; diff --git a/vp9/encoder/vp9_ext_ratectrl.c b/vp9/encoder/vp9_ext_ratectrl.c index b4ee574ff1..1d440442b5 100644 --- a/vp9/encoder/vp9_ext_ratectrl.c +++ b/vp9/encoder/vp9_ext_ratectrl.c @@ -230,3 +230,32 @@ vpx_codec_err_t vp9_extrc_get_gop_decision( } return VPX_CODEC_OK; } + +vpx_codec_err_t vp9_extrc_get_frame_rdmult( + EXT_RATECTRL *ext_ratectrl, int show_index, int coding_index, int gop_index, + FRAME_UPDATE_TYPE update_type, int gop_size, int use_alt_ref, + RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES], int ref_frame_flags, + int *rdmult) { + vpx_rc_status_t rc_status; + vpx_rc_encodeframe_info_t encode_frame_info; + if (ext_ratectrl == NULL || !ext_ratectrl->ready || + (ext_ratectrl->funcs.rc_type & VPX_RC_RDMULT) == 0) { + return VPX_CODEC_INVALID_PARAM; + } + encode_frame_info.show_index = show_index; + encode_frame_info.coding_index = coding_index; + encode_frame_info.gop_index = gop_index; + encode_frame_info.frame_type = extrc_get_frame_type(update_type); + encode_frame_info.gop_size = gop_size; + encode_frame_info.use_alt_ref = use_alt_ref; + + vp9_get_ref_frame_info(update_type, ref_frame_flags, ref_frame_bufs, + encode_frame_info.ref_frame_coding_indexes, + encode_frame_info.ref_frame_valid_list); + rc_status = ext_ratectrl->funcs.get_frame_rdmult(ext_ratectrl->model, + &encode_frame_info, rdmult); + if (rc_status == VPX_RC_ERROR) { + return VPX_CODEC_ERROR; + } + return VPX_CODEC_OK; +} diff --git a/vp9/encoder/vp9_ext_ratectrl.h b/vp9/encoder/vp9_ext_ratectrl.h index b8f3d0c834..7c38758833 100644 --- a/vp9/encoder/vp9_ext_ratectrl.h +++ b/vp9/encoder/vp9_ext_ratectrl.h @@ -16,6 +16,7 @@ typedef struct EXT_RATECTRL { int ready; + int ext_rdmult; vpx_rc_model_t model; vpx_rc_funcs_t funcs; vpx_rc_config_t ratectrl_config; @@ -49,4 +50,10 @@ vpx_codec_err_t vp9_extrc_get_gop_decision( EXT_RATECTRL *ext_ratectrl, const vpx_rc_gop_info_t *const gop_info, vpx_rc_gop_decision_t *gop_decision); +vpx_codec_err_t vp9_extrc_get_frame_rdmult( + EXT_RATECTRL *ext_ratectrl, int show_index, int coding_index, int gop_index, + FRAME_UPDATE_TYPE update_type, int gop_size, int use_alt_ref, + RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES], int ref_frame_flags, + int *rdmult); + #endif // VPX_VP9_ENCODER_VP9_EXT_RATECTRL_H_ diff --git a/vp9/encoder/vp9_rd.c b/vp9/encoder/vp9_rd.c index 28f992f4b6..58dd75b441 100644 --- a/vp9/encoder/vp9_rd.c +++ b/vp9/encoder/vp9_rd.c @@ -244,6 +244,12 @@ int vp9_compute_rd_mult_based_on_qindex(const VP9_COMP *cpi, int qindex) { // largest dc_quant is 21387, therefore rdmult should fit in int32_t int rdmult = q * q; + if (cpi->ext_ratectrl.ready && + (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_RDMULT) != 0 && + cpi->ext_ratectrl.ext_rdmult != VPX_DEFAULT_RDMULT) { + return cpi->ext_ratectrl.ext_rdmult; + } + // Make sure this function is floating point safe. vpx_clear_system_state(); @@ -287,6 +293,11 @@ static int modulate_rdmult(const VP9_COMP *cpi, int rdmult) { int vp9_compute_rd_mult(const VP9_COMP *cpi, int qindex) { int rdmult = vp9_compute_rd_mult_based_on_qindex(cpi, qindex); + if (cpi->ext_ratectrl.ready && + (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_RDMULT) != 0 && + cpi->ext_ratectrl.ext_rdmult != VPX_DEFAULT_RDMULT) { + return cpi->ext_ratectrl.ext_rdmult; + } return modulate_rdmult(cpi, rdmult); } diff --git a/vpx/vpx_ext_ratectrl.h b/vpx/vpx_ext_ratectrl.h index 95b883413e..3c5fc8cfc3 100644 --- a/vpx/vpx_ext_ratectrl.h +++ b/vpx/vpx_ext_ratectrl.h @@ -25,20 +25,26 @@ extern "C" { * types, removing or reassigning enums, adding/removing/rearranging * fields to structures. */ -#define VPX_EXT_RATECTRL_ABI_VERSION (5) +#define VPX_EXT_RATECTRL_ABI_VERSION (6) /*!\brief The control type of the inference API. * In VPX_RC_QP mode, the external rate control model determines the * quantization parameter (QP) for each frame. * In VPX_RC_GOP mode, the external rate control model determines the * group of picture (GOP) of the video sequence. + * In VPX_RC_RDMULT mode, the external rate control model determines the + * rate-distortion multiplier (rdmult) for the current frame. * In VPX_RC_GOP_QP mode, the external rate control model determines * both the QP and the GOP. + * In VPX_RC_GOP_QP_RDMULT mode, the external rate control model determines + * the QP, GOP and the rdmult. */ typedef enum vpx_rc_type { VPX_RC_QP = 1 << 0, VPX_RC_GOP = 1 << 1, - VPX_RC_GOP_QP = VPX_RC_QP | VPX_RC_GOP + VPX_RC_RDMULT = 1 << 2, + VPX_RC_GOP_QP = VPX_RC_QP | VPX_RC_GOP, + VPX_RC_GOP_QP_RDMULT = VPX_RC_QP | VPX_RC_GOP | VPX_RC_RDMULT } vpx_rc_type_t; /*!\brief Abstract rate control model handler @@ -55,6 +61,13 @@ typedef void *vpx_rc_model_t; */ #define VPX_DEFAULT_Q -1 +/*!\brief A reserved value for the rdmult. + * If the external rate control model returns this value, + * the encoder will use the default rdmult selected by libvpx's rate control + * system. + */ +#define VPX_DEFAULT_RDMULT -1 + /*!\brief Encode frame decision made by the external rate control model * * The encoder will receive the decision from the external rate control model @@ -432,6 +445,19 @@ typedef vpx_rc_status_t (*vpx_rc_get_gop_decision_cb_fn_t)( vpx_rc_model_t rate_ctrl_model, const vpx_rc_gop_info_t *gop_info, vpx_rc_gop_decision_t *gop_decision); +/*!\brief Get the frame rdmult from the external rate control model. + * + * This callback is invoked by the encoder to get rdmult from + * the external rate control model. + * + * \param[in] rate_ctrl_model rate control model + * \param[in] frame_info information collected from the encoder + * \param[out] rdmult frame rate-distortion multiplier from the model + */ +typedef vpx_rc_status_t (*vpx_rc_get_frame_rdmult_cb_fn_t)( + vpx_rc_model_t rate_ctrl_model, const vpx_rc_encodeframe_info_t *frame_info, + int *rdmult); + /*!\brief Delete the external rate control model callback prototype * * This callback is invoked by the encoder to delete the external rate control @@ -473,6 +499,10 @@ typedef struct vpx_rc_funcs { * Get GOP decisions from the external rate control model. */ vpx_rc_get_gop_decision_cb_fn_t get_gop_decision; + /*! + * Get rdmult for the frame from the external rate control model. + */ + vpx_rc_get_frame_rdmult_cb_fn_t get_frame_rdmult; /*! * Delete the external rate control model. */ From 1450ec46e273b32234b036d7803aaae09423dd08 Mon Sep 17 00:00:00 2001 From: Scott LaVarnway Date: Tue, 6 Dec 2022 14:18:03 -0800 Subject: [PATCH 0553/1000] Add vpx highbd subtract test. Change-Id: I069ae0fe22bfc82ad5083df85a7fdf9058a285eb --- test/vp9_subtract_test.cc | 148 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 148 insertions(+) diff --git a/test/vp9_subtract_test.cc b/test/vp9_subtract_test.cc index f634a032d8..7c69a317cc 100644 --- a/test/vp9_subtract_test.cc +++ b/test/vp9_subtract_test.cc @@ -7,6 +7,7 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ +#include #include "third_party/googletest/src/include/gtest/gtest.h" @@ -17,9 +18,11 @@ #include "test/bench.h" #include "test/clear_system_state.h" #include "test/register_state_check.h" +#include "test/util.h" #include "vp9/common/vp9_blockd.h" #include "vpx_ports/msvc.h" #include "vpx_mem/vpx_mem.h" +#include "vpx_ports/vpx_timer.h" typedef void (*SubtractFunc)(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, @@ -161,4 +164,149 @@ INSTANTIATE_TEST_SUITE_P(LSX, VP9SubtractBlockTest, ::testing::Values(vpx_subtract_block_lsx)); #endif +#if CONFIG_VP9_HIGHBITDEPTH + +typedef void (*HBDSubtractFunc)(int rows, int cols, int16_t *diff_ptr, + ptrdiff_t diff_stride, const uint8_t *src_ptr, + ptrdiff_t src_stride, const uint8_t *pred_ptr, + ptrdiff_t pred_stride, int bd); + +// +using Params = std::tuple; + +class VPXHBDSubtractBlockTest : public ::testing::TestWithParam { + public: + virtual void SetUp() { + block_width_ = 4 * num_4x4_blocks_wide_lookup[GET_PARAM(0)]; + block_height_ = 4 * num_4x4_blocks_high_lookup[GET_PARAM(0)]; + bit_depth_ = static_cast(GET_PARAM(1)); + func_ = GET_PARAM(2); + ref_func_ = GET_PARAM(3); + + rnd_.Reset(ACMRandom::DeterministicSeed()); + + constexpr size_t kMaxWidth = 128; + constexpr size_t kMaxBlockSize = kMaxWidth * kMaxWidth; + src_ = CONVERT_TO_BYTEPTR(reinterpret_cast( + vpx_memalign(16, kMaxBlockSize * sizeof(uint16_t)))); + ASSERT_NE(src_, nullptr); + pred_ = CONVERT_TO_BYTEPTR(reinterpret_cast( + vpx_memalign(16, kMaxBlockSize * sizeof(uint16_t)))); + ASSERT_NE(pred_, nullptr); + diff_ = reinterpret_cast( + vpx_memalign(16, kMaxBlockSize * sizeof(int16_t))); + ASSERT_NE(diff_, nullptr); + } + + virtual void TearDown() { + vpx_free(CONVERT_TO_SHORTPTR(src_)); + vpx_free(CONVERT_TO_SHORTPTR(pred_)); + vpx_free(diff_); + } + + protected: + void CheckResult(); + void RunForSpeed(); + + private: + ACMRandom rnd_; + int block_height_; + int block_width_; + vpx_bit_depth_t bit_depth_; + HBDSubtractFunc func_; + HBDSubtractFunc ref_func_; + uint8_t *src_; + uint8_t *pred_; + int16_t *diff_; +}; + +void VPXHBDSubtractBlockTest::CheckResult() { + constexpr int kTestNum = 100; + constexpr int kMaxWidth = 128; + constexpr int kMaxBlockSize = kMaxWidth * kMaxWidth; + const int mask = (1 << bit_depth_) - 1; + for (int i = 0; i < kTestNum; ++i) { + for (int j = 0; j < kMaxBlockSize; ++j) { + CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask; + CONVERT_TO_SHORTPTR(pred_)[j] = rnd_.Rand16() & mask; + } + + func_(block_height_, block_width_, diff_, block_width_, src_, block_width_, + pred_, block_width_, bit_depth_); + + for (int r = 0; r < block_height_; ++r) { + for (int c = 0; c < block_width_; ++c) { + EXPECT_EQ(diff_[r * block_width_ + c], + (CONVERT_TO_SHORTPTR(src_)[r * block_width_ + c] - + CONVERT_TO_SHORTPTR(pred_)[r * block_width_ + c])) + << "r = " << r << ", c = " << c << ", test: " << i; + } + } + } +} + +TEST_P(VPXHBDSubtractBlockTest, CheckResult) { CheckResult(); } + +void VPXHBDSubtractBlockTest::RunForSpeed() { + constexpr int kTestNum = 200000; + constexpr int kMaxWidth = 128; + constexpr int kMaxBlockSize = kMaxWidth * kMaxWidth; + const int mask = (1 << bit_depth_) - 1; + + if (ref_func_ == func_) GTEST_SKIP(); + + for (int j = 0; j < kMaxBlockSize; ++j) { + CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask; + CONVERT_TO_SHORTPTR(pred_)[j] = rnd_.Rand16() & mask; + } + + vpx_usec_timer ref_timer; + vpx_usec_timer_start(&ref_timer); + for (int i = 0; i < kTestNum; ++i) { + ref_func_(block_height_, block_width_, diff_, block_width_, src_, + block_width_, pred_, block_width_, bit_depth_); + } + vpx_usec_timer_mark(&ref_timer); + const int64_t ref_elapsed_time = vpx_usec_timer_elapsed(&ref_timer); + + for (int j = 0; j < kMaxBlockSize; ++j) { + CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask; + CONVERT_TO_SHORTPTR(pred_)[j] = rnd_.Rand16() & mask; + } + + vpx_usec_timer timer; + vpx_usec_timer_start(&timer); + for (int i = 0; i < kTestNum; ++i) { + func_(block_height_, block_width_, diff_, block_width_, src_, block_width_, + pred_, block_width_, bit_depth_); + } + vpx_usec_timer_mark(&timer); + const int64_t elapsed_time = vpx_usec_timer_elapsed(&timer); + + printf( + "[%dx%d]: " + "ref_time=%6" PRId64 " \t simd_time=%6" PRId64 + " \t " + "gain=%f \n", + block_width_, block_height_, ref_elapsed_time, elapsed_time, + static_cast(ref_elapsed_time) / + static_cast(elapsed_time)); +} + +TEST_P(VPXHBDSubtractBlockTest, DISABLED_Speed) { RunForSpeed(); } + +const BLOCK_SIZE kValidBlockSize[] = { BLOCK_4X4, BLOCK_4X8, BLOCK_8X4, + BLOCK_8X8, BLOCK_8X16, BLOCK_16X8, + BLOCK_16X16, BLOCK_16X32, BLOCK_32X16, + BLOCK_32X32, BLOCK_32X64, BLOCK_64X32, + BLOCK_64X64 }; + +INSTANTIATE_TEST_SUITE_P( + C, VPXHBDSubtractBlockTest, + ::testing::Combine(::testing::ValuesIn(kValidBlockSize), + ::testing::Values(12), + ::testing::Values(&vpx_highbd_subtract_block_c), + ::testing::Values(&vpx_highbd_subtract_block_c))); + +#endif // CONFIG_VP9_HIGHBITDEPTH } // namespace vp9 From a7bb04b43598cb2dcefea2352a1fde4dbd269fe5 Mon Sep 17 00:00:00 2001 From: Scott LaVarnway Date: Tue, 6 Dec 2022 13:13:30 -0800 Subject: [PATCH 0554/1000] [x86]: Add vpx_highbd_subtract_block_avx2(). Up to 4x faster than "sse2 vectorized C". Change-Id: Ie9b3c12a437c5cddf92c4d5349c4f659ca6b82ea --- test/vp9_subtract_test.cc | 9 +++ vpx_dsp/vpx_dsp_rtcd_defs.pl | 2 +- vpx_dsp/x86/subtract_avx2.c | 107 +++++++++++++++++++++++++++++++++++ 3 files changed, 117 insertions(+), 1 deletion(-) diff --git a/test/vp9_subtract_test.cc b/test/vp9_subtract_test.cc index 7c69a317cc..a57082f1eb 100644 --- a/test/vp9_subtract_test.cc +++ b/test/vp9_subtract_test.cc @@ -308,5 +308,14 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(&vpx_highbd_subtract_block_c), ::testing::Values(&vpx_highbd_subtract_block_c))); +#if HAVE_AVX2 +INSTANTIATE_TEST_SUITE_P( + AVX2, VPXHBDSubtractBlockTest, + ::testing::Combine(::testing::ValuesIn(kValidBlockSize), + ::testing::Values(12), + ::testing::Values(&vpx_highbd_subtract_block_avx2), + ::testing::Values(&vpx_highbd_subtract_block_c))); +#endif // HAVE_AVX2 + #endif // CONFIG_VP9_HIGHBITDEPTH } // namespace vp9 diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 51f5ebedd6..b6d656820f 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -939,7 +939,7 @@ () # Block subtraction # add_proto qw/void vpx_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src8_ptr, ptrdiff_t src_stride, const uint8_t *pred8_ptr, ptrdiff_t pred_stride, int bd"; - specialize qw/vpx_highbd_subtract_block neon/; + specialize qw/vpx_highbd_subtract_block neon avx2/; # # Single block SAD diff --git a/vpx_dsp/x86/subtract_avx2.c b/vpx_dsp/x86/subtract_avx2.c index 4d259ef5c5..4849581ed4 100644 --- a/vpx_dsp/x86/subtract_avx2.c +++ b/vpx_dsp/x86/subtract_avx2.c @@ -94,3 +94,110 @@ void vpx_subtract_block_avx2(int rows, int cols, int16_t *diff_ptr, break; } } + +#if CONFIG_VP9_HIGHBITDEPTH +void vpx_highbd_subtract_block_avx2(int rows, int cols, int16_t *diff_ptr, + ptrdiff_t diff_stride, + const uint8_t *src8_ptr, + ptrdiff_t src_stride, + const uint8_t *pred8_ptr, + ptrdiff_t pred_stride, int bd) { + uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8_ptr); + uint16_t *pred_ptr = CONVERT_TO_SHORTPTR(pred8_ptr); + (void)bd; + if (cols == 64) { + int j = rows; + do { + const __m256i s0 = _mm256_lddqu_si256((const __m256i *)src_ptr); + const __m256i s1 = _mm256_lddqu_si256((const __m256i *)(src_ptr + 16)); + const __m256i s2 = _mm256_lddqu_si256((const __m256i *)(src_ptr + 32)); + const __m256i s3 = _mm256_lddqu_si256((const __m256i *)(src_ptr + 48)); + const __m256i p0 = _mm256_lddqu_si256((const __m256i *)pred_ptr); + const __m256i p1 = _mm256_lddqu_si256((const __m256i *)(pred_ptr + 16)); + const __m256i p2 = _mm256_lddqu_si256((const __m256i *)(pred_ptr + 32)); + const __m256i p3 = _mm256_lddqu_si256((const __m256i *)(pred_ptr + 48)); + const __m256i d0 = _mm256_sub_epi16(s0, p0); + const __m256i d1 = _mm256_sub_epi16(s1, p1); + const __m256i d2 = _mm256_sub_epi16(s2, p2); + const __m256i d3 = _mm256_sub_epi16(s3, p3); + _mm256_storeu_si256((__m256i *)diff_ptr, d0); + _mm256_storeu_si256((__m256i *)(diff_ptr + 16), d1); + _mm256_storeu_si256((__m256i *)(diff_ptr + 32), d2); + _mm256_storeu_si256((__m256i *)(diff_ptr + 48), d3); + src_ptr += src_stride; + pred_ptr += pred_stride; + diff_ptr += diff_stride; + } while (--j != 0); + } else if (cols == 32) { + int j = rows; + do { + const __m256i s0 = _mm256_lddqu_si256((const __m256i *)src_ptr); + const __m256i s1 = _mm256_lddqu_si256((const __m256i *)(src_ptr + 16)); + const __m256i p0 = _mm256_lddqu_si256((const __m256i *)pred_ptr); + const __m256i p1 = _mm256_lddqu_si256((const __m256i *)(pred_ptr + 16)); + const __m256i d0 = _mm256_sub_epi16(s0, p0); + const __m256i d1 = _mm256_sub_epi16(s1, p1); + _mm256_storeu_si256((__m256i *)diff_ptr, d0); + _mm256_storeu_si256((__m256i *)(diff_ptr + 16), d1); + src_ptr += src_stride; + pred_ptr += pred_stride; + diff_ptr += diff_stride; + } while (--j != 0); + } else if (cols == 16) { + int j = rows; + do { + const __m256i s0 = _mm256_lddqu_si256((const __m256i *)src_ptr); + const __m256i s1 = + _mm256_lddqu_si256((const __m256i *)(src_ptr + src_stride)); + const __m256i p0 = _mm256_lddqu_si256((const __m256i *)pred_ptr); + const __m256i p1 = + _mm256_lddqu_si256((const __m256i *)(pred_ptr + pred_stride)); + const __m256i d0 = _mm256_sub_epi16(s0, p0); + const __m256i d1 = _mm256_sub_epi16(s1, p1); + _mm256_storeu_si256((__m256i *)diff_ptr, d0); + _mm256_storeu_si256((__m256i *)(diff_ptr + diff_stride), d1); + src_ptr += src_stride << 1; + pred_ptr += pred_stride << 1; + diff_ptr += diff_stride << 1; + j -= 2; + } while (j != 0); + } else if (cols == 8) { + int j = rows; + do { + const __m128i s0 = _mm_lddqu_si128((const __m128i *)src_ptr); + const __m128i s1 = + _mm_lddqu_si128((const __m128i *)(src_ptr + src_stride)); + const __m128i p0 = _mm_lddqu_si128((const __m128i *)pred_ptr); + const __m128i p1 = + _mm_lddqu_si128((const __m128i *)(pred_ptr + pred_stride)); + const __m128i d0 = _mm_sub_epi16(s0, p0); + const __m128i d1 = _mm_sub_epi16(s1, p1); + _mm_storeu_si128((__m128i *)diff_ptr, d0); + _mm_storeu_si128((__m128i *)(diff_ptr + diff_stride), d1); + src_ptr += src_stride << 1; + pred_ptr += pred_stride << 1; + diff_ptr += diff_stride << 1; + j -= 2; + } while (j != 0); + } else { + int j = rows; + assert(cols == 4); + do { + const __m128i s0 = _mm_loadl_epi64((const __m128i *)src_ptr); + const __m128i s1 = + _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride)); + const __m128i p0 = _mm_loadl_epi64((const __m128i *)pred_ptr); + const __m128i p1 = + _mm_loadl_epi64((const __m128i *)(pred_ptr + pred_stride)); + const __m128i d0 = _mm_sub_epi16(s0, p0); + const __m128i d1 = _mm_sub_epi16(s1, p1); + _mm_storel_epi64((__m128i *)diff_ptr, d0); + _mm_storel_epi64((__m128i *)(diff_ptr + diff_stride), d1); + src_ptr += src_stride << 1; + pred_ptr += pred_stride << 1; + diff_ptr += diff_stride << 1; + j -= 2; + } while (j != 0); + } +} +#endif // CONFIG_VP9_HIGHBITDEPTH From 89b8032ff5cbe1d0043587f2df7f5a5e858e6fdb Mon Sep 17 00:00:00 2001 From: Anton Venema Date: Tue, 13 Dec 2022 10:27:37 -0800 Subject: [PATCH 0555/1000] Add additional ARM targets for Visual Studio. configure: Add an armv7-win32-vs16 target configure: Add an armv7-win32-vs17 target configure: Add an arm64-win64-vs16 target configure: Add an arm64-win64-vs17 target Change-Id: I11d6cd6e51f7703939d6fd3fc6a7469591e3b09d --- AUTHORS | 1 + configure | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/AUTHORS b/AUTHORS index fffda63360..536e0e7cf0 100644 --- a/AUTHORS +++ b/AUTHORS @@ -23,6 +23,7 @@ Andrew Lewis Andrew Russell Angie Chen Angie Chiang +Anton Venema Aron Rosenberg Attila Nagy Birk Magnussen diff --git a/configure b/configure index bf92e1ad1f..ae289f77b4 100755 --- a/configure +++ b/configure @@ -105,6 +105,8 @@ all_platforms="${all_platforms} arm64-darwin22-gcc" all_platforms="${all_platforms} arm64-linux-gcc" all_platforms="${all_platforms} arm64-win64-gcc" all_platforms="${all_platforms} arm64-win64-vs15" +all_platforms="${all_platforms} arm64-win64-vs16" +all_platforms="${all_platforms} arm64-win64-vs17" all_platforms="${all_platforms} armv7-android-gcc" #neon Cortex-A8 all_platforms="${all_platforms} armv7-darwin-gcc" #neon Cortex-A8 all_platforms="${all_platforms} armv7-linux-rvct" #neon Cortex-A8 @@ -113,6 +115,8 @@ all_platforms="${all_platforms} armv7-none-rvct" #neon Cortex-A8 all_platforms="${all_platforms} armv7-win32-gcc" all_platforms="${all_platforms} armv7-win32-vs14" all_platforms="${all_platforms} armv7-win32-vs15" +all_platforms="${all_platforms} armv7-win32-vs16" +all_platforms="${all_platforms} armv7-win32-vs17" all_platforms="${all_platforms} armv7s-darwin-gcc" all_platforms="${all_platforms} armv8-linux-gcc" all_platforms="${all_platforms} loongarch32-linux-gcc" From 55d3184503edf36e01bbefd6a09a602b02f60986 Mon Sep 17 00:00:00 2001 From: Marco Paniconi Date: Wed, 7 Dec 2022 00:17:22 -0800 Subject: [PATCH 0556/1000] rc-svc: Add tests for dynamic svc in external RC Test to verify RC for going down and back up in spatial layers. Going back up has an issue so added a TODO. Make the test more flexible to handle dynamic layers. Test for dyanmic change in temporal layers to follow. Change-Id: Ic5542f7b274135277429e116f56ba54e682e96a0 --- test/vp9_ratectrl_rtc_test.cc | 246 +++++++++++++++++++++++++--------- 1 file changed, 186 insertions(+), 60 deletions(-) diff --git a/test/vp9_ratectrl_rtc_test.cc b/test/vp9_ratectrl_rtc_test.cc index 931e68c880..1d1a78f43d 100644 --- a/test/vp9_ratectrl_rtc_test.cc +++ b/test/vp9_ratectrl_rtc_test.cc @@ -26,7 +26,11 @@ namespace { const size_t kNumFrames = 300; -const int kTemporalId[4] = { 0, 2, 1, 2 }; +const int kTemporalId3Layer[4] = { 0, 2, 1, 2 }; +const int kTemporalId2Layer[2] = { 0, 1 }; +const int kTemporalRateAllocation3Layer[3] = { 50, 70, 100 }; +const int kTemporalRateAllocation2Layer[2] = { 60, 100 }; +const int kSpatialLayerBitrate[3] = { 200, 400, 1000 }; class RcInterfaceTest : public ::libvpx_test::EncoderTest, @@ -183,19 +187,69 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest, video->frame() % key_interval_ == 0 ? KEY_FRAME : INTER_FRAME; encoder_exit_ = video->frame() == kNumFrames; current_superframe_ = video->frame(); + if (dynamic_spatial_layers_ == 1) { + if (video->frame() == 100) { + // Go down to 2 spatial layers: set top SL to 0 bitrate. + // Update the encoder config. + cfg_.rc_target_bitrate -= cfg_.layer_target_bitrate[8]; + cfg_.layer_target_bitrate[6] = 0; + cfg_.layer_target_bitrate[7] = 0; + cfg_.layer_target_bitrate[8] = 0; + encoder->Config(&cfg_); + // Update the RC config. + rc_cfg_.target_bandwidth -= rc_cfg_.layer_target_bitrate[8]; + rc_cfg_.layer_target_bitrate[6] = 0; + rc_cfg_.layer_target_bitrate[7] = 0; + rc_cfg_.layer_target_bitrate[8] = 0; + rc_api_->UpdateRateControl(rc_cfg_); + } else if (video->frame() == 200) { + // Go down to 1 spatial layer. + // Update the encoder config. + cfg_.rc_target_bitrate -= cfg_.layer_target_bitrate[5]; + cfg_.layer_target_bitrate[3] = 0; + cfg_.layer_target_bitrate[4] = 0; + cfg_.layer_target_bitrate[5] = 0; + encoder->Config(&cfg_); + // Update the RC config. + rc_cfg_.target_bandwidth -= rc_cfg_.layer_target_bitrate[5]; + rc_cfg_.layer_target_bitrate[3] = 0; + rc_cfg_.layer_target_bitrate[4] = 0; + rc_cfg_.layer_target_bitrate[5] = 0; + rc_api_->UpdateRateControl(rc_cfg_); + } else if (0 && video->frame() == 280) { + // TODO(marpan): Re-enable this going back up when issue is fixed. + // Go back up to 3 spatial layers. + // Update the encoder config: use the original bitrates. + SetEncoderConfigSvc(3, 3); + encoder->Config(&cfg_); + // Update the RC config. + SetRCConfigSvc(3, 3); + rc_api_->UpdateRateControl(rc_cfg_); + } + } } virtual void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) { ::libvpx_test::CxDataIterator iter = encoder->GetCxData(); + for (int sl = 0; sl < rc_cfg_.ss_number_layers; sl++) sizes_[sl] = 0; while (const vpx_codec_cx_pkt_t *pkt = iter.Next()) { ParseSuperframeSizes(static_cast(pkt->data.frame.buf), pkt->data.frame.sz); for (int sl = 0; sl < rc_cfg_.ss_number_layers; sl++) { - frame_params_.spatial_layer_id = sl; - frame_params_.temporal_layer_id = kTemporalId[current_superframe_ % 4]; - rc_api_->ComputeQP(frame_params_); - frame_params_.frame_type = INTER_FRAME; - rc_api_->PostEncodeUpdate(sizes_[sl]); + if (sizes_[sl] > 0) { + frame_params_.spatial_layer_id = sl; + if (rc_cfg_.ts_number_layers == 3) + frame_params_.temporal_layer_id = + kTemporalId3Layer[current_superframe_ % 4]; + else if (rc_cfg_.ts_number_layers == 2) + frame_params_.temporal_layer_id = + kTemporalId2Layer[current_superframe_ % 2]; + else + frame_params_.temporal_layer_id = 0; + rc_api_->ComputeQP(frame_params_); + frame_params_.frame_type = INTER_FRAME; + rc_api_->PostEncodeUpdate(sizes_[sl]); + } } } if (!encoder_exit_) { @@ -213,11 +267,11 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest, const vpx_image_t * /*img2*/) {} void RunSvc() { - SetConfigSvc(); - // kNumFrames = 300, so no key frames in this test. + dynamic_spatial_layers_ = 0; + SetRCConfigSvc(3, 3); key_interval_ = 10000; rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_); - SetEncoderSvc(); + SetEncoderConfigSvc(3, 3); ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv", 1280, 720, 30, 1, 0, kNumFrames); @@ -226,11 +280,24 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest, } void RunSvcPeriodicKey() { - SetConfigSvc(); - // kNumFrames = 300, so 3 key frames in this test. + dynamic_spatial_layers_ = 0; + SetRCConfigSvc(3, 3); key_interval_ = 100; rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_); - SetEncoderSvc(); + SetEncoderConfigSvc(3, 3); + + ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv", + 1280, 720, 30, 1, 0, kNumFrames); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + } + + void RunSvcDynamicSpatial() { + dynamic_spatial_layers_ = 1; + SetRCConfigSvc(3, 3); + key_interval_ = 10000; + rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_); + SetEncoderConfigSvc(3, 3); ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv", 1280, 720, 30, 1, 0, kNumFrames); @@ -266,17 +333,31 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest, return VPX_CODEC_OK; } - void SetEncoderSvc() { - cfg_.ss_number_layers = 3; - cfg_.ts_number_layers = 3; + void SetEncoderConfigSvc(int number_spatial_layers, + int number_temporal_layers) { + cfg_.g_w = 1280; + cfg_.g_h = 720; + cfg_.ss_number_layers = number_spatial_layers; + cfg_.ts_number_layers = number_temporal_layers; cfg_.g_timebase.num = 1; cfg_.g_timebase.den = 30; - svc_params_.scaling_factor_num[0] = 72; - svc_params_.scaling_factor_den[0] = 288; - svc_params_.scaling_factor_num[1] = 144; - svc_params_.scaling_factor_den[1] = 288; - svc_params_.scaling_factor_num[2] = 288; - svc_params_.scaling_factor_den[2] = 288; + if (number_spatial_layers == 3) { + svc_params_.scaling_factor_num[0] = 1; + svc_params_.scaling_factor_den[0] = 4; + svc_params_.scaling_factor_num[1] = 2; + svc_params_.scaling_factor_den[1] = 4; + svc_params_.scaling_factor_num[2] = 4; + svc_params_.scaling_factor_den[2] = 4; + } else if (number_spatial_layers == 2) { + svc_params_.scaling_factor_num[0] = 1; + svc_params_.scaling_factor_den[0] = 2; + svc_params_.scaling_factor_num[1] = 2; + svc_params_.scaling_factor_den[1] = 2; + } else if (number_spatial_layers == 1) { + svc_params_.scaling_factor_num[0] = 1; + svc_params_.scaling_factor_den[0] = 1; + } + for (int i = 0; i < VPX_MAX_LAYERS; ++i) { svc_params_.max_quantizers[i] = 56; svc_params_.min_quantizers[i] = 2; @@ -286,11 +367,20 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest, cfg_.rc_end_usage = VPX_CBR; cfg_.g_lag_in_frames = 0; cfg_.g_error_resilient = 0; - // 3 temporal layers - cfg_.ts_rate_decimator[0] = 4; - cfg_.ts_rate_decimator[1] = 2; - cfg_.ts_rate_decimator[2] = 1; - cfg_.temporal_layering_mode = 3; + + if (number_temporal_layers == 3) { + cfg_.ts_rate_decimator[0] = 4; + cfg_.ts_rate_decimator[1] = 2; + cfg_.ts_rate_decimator[2] = 1; + cfg_.temporal_layering_mode = 3; + } else if (number_temporal_layers == 2) { + cfg_.ts_rate_decimator[0] = 2; + cfg_.ts_rate_decimator[1] = 1; + cfg_.temporal_layering_mode = 2; + } else if (number_temporal_layers == 1) { + cfg_.ts_rate_decimator[0] = 1; + cfg_.temporal_layering_mode = 0; + } cfg_.rc_buf_initial_sz = 500; cfg_.rc_buf_optimal_sz = 600; @@ -299,30 +389,39 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest, cfg_.rc_max_quantizer = 56; cfg_.g_threads = 1; cfg_.kf_max_dist = 9999; - cfg_.rc_target_bitrate = 1600; cfg_.rc_overshoot_pct = 50; cfg_.rc_undershoot_pct = 50; - cfg_.layer_target_bitrate[0] = 100; - cfg_.layer_target_bitrate[1] = 140; - cfg_.layer_target_bitrate[2] = 200; - cfg_.layer_target_bitrate[3] = 250; - cfg_.layer_target_bitrate[4] = 350; - cfg_.layer_target_bitrate[5] = 500; - cfg_.layer_target_bitrate[6] = 450; - cfg_.layer_target_bitrate[7] = 630; - cfg_.layer_target_bitrate[8] = 900; + cfg_.rc_target_bitrate = 0; + for (int sl = 0; sl < number_spatial_layers; sl++) { + int spatial_bitrate = 0; + if (number_spatial_layers <= 3) + spatial_bitrate = kSpatialLayerBitrate[sl]; + for (int tl = 0; tl < number_temporal_layers; tl++) { + int layer = sl * number_temporal_layers + tl; + if (number_temporal_layers == 3) + cfg_.layer_target_bitrate[layer] = + kTemporalRateAllocation3Layer[tl] * spatial_bitrate / 100; + else if (number_temporal_layers == 2) + cfg_.layer_target_bitrate[layer] = + kTemporalRateAllocation2Layer[tl] * spatial_bitrate / 100; + else if (number_temporal_layers == 1) + cfg_.layer_target_bitrate[layer] = spatial_bitrate; + } + cfg_.rc_target_bitrate += spatial_bitrate; + } cfg_.kf_min_dist = key_interval_; cfg_.kf_max_dist = key_interval_; } - void SetConfigSvc() { + void SetRCConfigSvc(int number_spatial_layers, int number_temporal_layers) { rc_cfg_.width = 1280; rc_cfg_.height = 720; + rc_cfg_.ss_number_layers = number_spatial_layers; + rc_cfg_.ts_number_layers = number_temporal_layers; rc_cfg_.max_quantizer = 56; rc_cfg_.min_quantizer = 2; - rc_cfg_.target_bandwidth = 1600; rc_cfg_.buf_initial_sz = 500; rc_cfg_.buf_optimal_sz = 600; rc_cfg_.buf_sz = 1000; @@ -330,31 +429,55 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest, rc_cfg_.overshoot_pct = 50; rc_cfg_.max_intra_bitrate_pct = 900; rc_cfg_.framerate = 30.0; - rc_cfg_.ss_number_layers = 3; - rc_cfg_.ts_number_layers = 3; rc_cfg_.rc_mode = VPX_CBR; rc_cfg_.aq_mode = aq_mode_; - rc_cfg_.scaling_factor_num[0] = 1; - rc_cfg_.scaling_factor_den[0] = 4; - rc_cfg_.scaling_factor_num[1] = 2; - rc_cfg_.scaling_factor_den[1] = 4; - rc_cfg_.scaling_factor_num[2] = 4; - rc_cfg_.scaling_factor_den[2] = 4; - - rc_cfg_.ts_rate_decimator[0] = 4; - rc_cfg_.ts_rate_decimator[1] = 2; - rc_cfg_.ts_rate_decimator[2] = 1; - - rc_cfg_.layer_target_bitrate[0] = 100; - rc_cfg_.layer_target_bitrate[1] = 140; - rc_cfg_.layer_target_bitrate[2] = 200; - rc_cfg_.layer_target_bitrate[3] = 250; - rc_cfg_.layer_target_bitrate[4] = 350; - rc_cfg_.layer_target_bitrate[5] = 500; - rc_cfg_.layer_target_bitrate[6] = 450; - rc_cfg_.layer_target_bitrate[7] = 630; - rc_cfg_.layer_target_bitrate[8] = 900; + if (number_spatial_layers == 3) { + rc_cfg_.scaling_factor_num[0] = 1; + rc_cfg_.scaling_factor_den[0] = 4; + rc_cfg_.scaling_factor_num[1] = 2; + rc_cfg_.scaling_factor_den[1] = 4; + rc_cfg_.scaling_factor_num[2] = 4; + rc_cfg_.scaling_factor_den[2] = 4; + } else if (number_spatial_layers == 2) { + rc_cfg_.scaling_factor_num[0] = 1; + rc_cfg_.scaling_factor_den[0] = 2; + rc_cfg_.scaling_factor_num[1] = 2; + rc_cfg_.scaling_factor_den[1] = 2; + } else if (number_spatial_layers == 1) { + rc_cfg_.scaling_factor_num[0] = 1; + rc_cfg_.scaling_factor_den[0] = 1; + } + + if (number_temporal_layers == 3) { + rc_cfg_.ts_rate_decimator[0] = 4; + rc_cfg_.ts_rate_decimator[1] = 2; + rc_cfg_.ts_rate_decimator[2] = 1; + } else if (number_temporal_layers == 2) { + rc_cfg_.ts_rate_decimator[0] = 2; + rc_cfg_.ts_rate_decimator[1] = 1; + } else if (number_temporal_layers == 1) { + rc_cfg_.ts_rate_decimator[0] = 1; + } + + rc_cfg_.target_bandwidth = 0; + for (int sl = 0; sl < number_spatial_layers; sl++) { + int spatial_bitrate = 0; + if (number_spatial_layers <= 3) + spatial_bitrate = kSpatialLayerBitrate[sl]; + for (int tl = 0; tl < number_temporal_layers; tl++) { + int layer = sl * number_temporal_layers + tl; + if (number_temporal_layers == 3) + rc_cfg_.layer_target_bitrate[layer] = + kTemporalRateAllocation3Layer[tl] * spatial_bitrate / 100; + else if (number_temporal_layers == 2) + rc_cfg_.layer_target_bitrate[layer] = + kTemporalRateAllocation2Layer[tl] * spatial_bitrate / 100; + else if (number_temporal_layers == 1) + rc_cfg_.layer_target_bitrate[layer] = spatial_bitrate; + } + rc_cfg_.target_bandwidth += spatial_bitrate; + } for (int sl = 0; sl < rc_cfg_.ss_number_layers; ++sl) { for (int tl = 0; tl < rc_cfg_.ts_number_layers; ++tl) { @@ -374,6 +497,7 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest, int current_superframe_; uint32_t sizes_[8]; int key_interval_; + int dynamic_spatial_layers_; }; TEST_P(RcInterfaceTest, OneLayer) { RunOneLayer(); } @@ -384,6 +508,8 @@ TEST_P(RcInterfaceSvcTest, Svc) { RunSvc(); } TEST_P(RcInterfaceSvcTest, SvcPeriodicKey) { RunSvcPeriodicKey(); } +TEST_P(RcInterfaceSvcTest, SvcDynamicSpatial) { RunSvcDynamicSpatial(); } + VP9_INSTANTIATE_TEST_SUITE(RcInterfaceTest, ::testing::Values(0, 3), ::testing::Values(VPX_CBR, VPX_VBR)); VP9_INSTANTIATE_TEST_SUITE(RcInterfaceSvcTest, ::testing::Values(0, 3)); From 883863001652627f47dc1ecc6e42294687c8785b Mon Sep 17 00:00:00 2001 From: Scott LaVarnway Date: Fri, 16 Dec 2022 10:21:00 -0800 Subject: [PATCH 0557/1000] Add vpx_highbd_comp_avg_pred_c() test. Change-Id: I6b2c3379c49a62e56e5ac56fd4782a50b3c4e12a --- test/comp_avg_pred_test.cc | 163 ++++++++++++++++++++++++++----------- 1 file changed, 115 insertions(+), 48 deletions(-) diff --git a/test/comp_avg_pred_test.cc b/test/comp_avg_pred_test.cc index 3977a2d0b5..66dc4eb4e1 100644 --- a/test/comp_avg_pred_test.cc +++ b/test/comp_avg_pred_test.cc @@ -22,13 +22,14 @@ namespace { using ::libvpx_test::ACMRandom; using ::libvpx_test::Buffer; -typedef void (*AvgPredFunc)(uint8_t *a, const uint8_t *b, int w, int h, - const uint8_t *c, int c_stride); - -uint8_t avg_with_rounding(uint8_t a, uint8_t b) { return (a + b + 1) >> 1; } +template +Pixel avg_with_rounding(Pixel a, Pixel b) { + return (a + b + 1) >> 1; +} -void reference_pred(const Buffer &pred, const Buffer &ref, - int width, int height, Buffer *avg) { +template +void reference_pred(const Buffer &pred, const Buffer &ref, + int width, int height, Buffer *avg) { ASSERT_NE(avg->TopLeftPixel(), nullptr); ASSERT_NE(pred.TopLeftPixel(), nullptr); ASSERT_NE(ref.TopLeftPixel(), nullptr); @@ -36,12 +37,16 @@ void reference_pred(const Buffer &pred, const Buffer &ref, for (int y = 0; y < height; ++y) { for (int x = 0; x < width; ++x) { avg->TopLeftPixel()[y * avg->stride() + x] = - avg_with_rounding(pred.TopLeftPixel()[y * pred.stride() + x], - ref.TopLeftPixel()[y * ref.stride() + x]); + avg_with_rounding(pred.TopLeftPixel()[y * pred.stride() + x], + ref.TopLeftPixel()[y * ref.stride() + x]); } } } +using AvgPredFunc = void (*)(uint8_t *a, const uint8_t *b, int w, int h, + const uint8_t *c, int c_stride); + +template class AvgPredTest : public ::testing::TestWithParam { public: virtual void SetUp() { @@ -49,15 +54,19 @@ class AvgPredTest : public ::testing::TestWithParam { rnd_.Reset(ACMRandom::DeterministicSeed()); } + void TestSizeCombinations(); + void TestCompareReferenceRandom(); + void TestSpeed(); + protected: AvgPredFunc avg_pred_func_; ACMRandom rnd_; }; -TEST_P(AvgPredTest, SizeCombinations) { +template +void AvgPredTest::TestSizeCombinations() { // This is called as part of the sub pixel variance. As such it must be one of // the variance block sizes. - for (int width_pow = 2; width_pow <= 6; ++width_pow) { for (int height_pow = width_pow - 1; height_pow <= width_pow + 1; ++height_pow) { @@ -70,23 +79,30 @@ TEST_P(AvgPredTest, SizeCombinations) { const int width = 1 << width_pow; const int height = 1 << height_pow; // Only the reference buffer may have a stride not equal to width. - Buffer ref = - Buffer(width, height, ref_padding ? 8 : 0); + Buffer ref = Buffer(width, height, ref_padding ? 8 : 0); ASSERT_TRUE(ref.Init()); - Buffer pred = Buffer(width, height, 0, 16); + Buffer pred = Buffer(width, height, 0, 16); ASSERT_TRUE(pred.Init()); - Buffer avg_ref = Buffer(width, height, 0, 16); + Buffer avg_ref = Buffer(width, height, 0, 16); ASSERT_TRUE(avg_ref.Init()); - Buffer avg_chk = Buffer(width, height, 0, 16); + Buffer avg_chk = Buffer(width, height, 0, 16); ASSERT_TRUE(avg_chk.Init()); + const int bitdepth_mask = (1 << bitdepth) - 1; + for (int h = 0; h < height; ++h) { + for (int w = 0; w < width; ++w) { + ref.TopLeftPixel()[w + h * width] = rnd_.Rand16() & bitdepth_mask; + } + } + for (int h = 0; h < height; ++h) { + for (int w = 0; w < width; ++w) { + pred.TopLeftPixel()[w + h * width] = rnd_.Rand16() & bitdepth_mask; + } + } - ref.Set(&rnd_, &ACMRandom::Rand8); - pred.Set(&rnd_, &ACMRandom::Rand8); - - reference_pred(pred, ref, width, height, &avg_ref); - ASM_REGISTER_STATE_CHECK( - avg_pred_func_(avg_chk.TopLeftPixel(), pred.TopLeftPixel(), width, - height, ref.TopLeftPixel(), ref.stride())); + reference_pred(pred, ref, width, height, &avg_ref); + ASM_REGISTER_STATE_CHECK(avg_pred_func_( + (uint8_t *)avg_chk.TopLeftPixel(), (uint8_t *)pred.TopLeftPixel(), + width, height, (uint8_t *)ref.TopLeftPixel(), ref.stride())); EXPECT_TRUE(avg_chk.CheckValues(avg_ref)); if (HasFailure()) { @@ -99,26 +115,36 @@ TEST_P(AvgPredTest, SizeCombinations) { } } -TEST_P(AvgPredTest, CompareReferenceRandom) { +template +void AvgPredTest::TestCompareReferenceRandom() { const int width = 64; const int height = 32; - Buffer ref = Buffer(width, height, 8); + Buffer ref = Buffer(width, height, 8); ASSERT_TRUE(ref.Init()); - Buffer pred = Buffer(width, height, 0, 16); + Buffer pred = Buffer(width, height, 0, 16); ASSERT_TRUE(pred.Init()); - Buffer avg_ref = Buffer(width, height, 0, 16); + Buffer avg_ref = Buffer(width, height, 0, 16); ASSERT_TRUE(avg_ref.Init()); - Buffer avg_chk = Buffer(width, height, 0, 16); + Buffer avg_chk = Buffer(width, height, 0, 16); ASSERT_TRUE(avg_chk.Init()); for (int i = 0; i < 500; ++i) { - ref.Set(&rnd_, &ACMRandom::Rand8); - pred.Set(&rnd_, &ACMRandom::Rand8); + const int bitdepth_mask = (1 << bitdepth) - 1; + for (int h = 0; h < height; ++h) { + for (int w = 0; w < width; ++w) { + ref.TopLeftPixel()[w + h * width] = rnd_.Rand16() & bitdepth_mask; + } + } + for (int h = 0; h < height; ++h) { + for (int w = 0; w < width; ++w) { + pred.TopLeftPixel()[w + h * width] = rnd_.Rand16() & bitdepth_mask; + } + } - reference_pred(pred, ref, width, height, &avg_ref); - ASM_REGISTER_STATE_CHECK(avg_pred_func_(avg_chk.TopLeftPixel(), - pred.TopLeftPixel(), width, height, - ref.TopLeftPixel(), ref.stride())); + reference_pred(pred, ref, width, height, &avg_ref); + ASM_REGISTER_STATE_CHECK(avg_pred_func_( + (uint8_t *)avg_chk.TopLeftPixel(), (uint8_t *)pred.TopLeftPixel(), + width, height, (uint8_t *)ref.TopLeftPixel(), ref.stride())); EXPECT_TRUE(avg_chk.CheckValues(avg_ref)); if (HasFailure()) { printf("Width: %d Height: %d\n", width, height); @@ -128,7 +154,8 @@ TEST_P(AvgPredTest, CompareReferenceRandom) { } } -TEST_P(AvgPredTest, DISABLED_Speed) { +template +void AvgPredTest::TestSpeed() { for (int width_pow = 2; width_pow <= 6; ++width_pow) { for (int height_pow = width_pow - 1; height_pow <= width_pow + 1; ++height_pow) { @@ -138,22 +165,30 @@ TEST_P(AvgPredTest, DISABLED_Speed) { for (int ref_padding = 0; ref_padding < 2; ref_padding++) { const int width = 1 << width_pow; const int height = 1 << height_pow; - Buffer ref = - Buffer(width, height, ref_padding ? 8 : 0); + Buffer ref = Buffer(width, height, ref_padding ? 8 : 0); ASSERT_TRUE(ref.Init()); - Buffer pred = Buffer(width, height, 0, 16); + Buffer pred = Buffer(width, height, 0, 16); ASSERT_TRUE(pred.Init()); - Buffer avg = Buffer(width, height, 0, 16); + Buffer avg = Buffer(width, height, 0, 16); ASSERT_TRUE(avg.Init()); - - ref.Set(&rnd_, &ACMRandom::Rand8); - pred.Set(&rnd_, &ACMRandom::Rand8); + const int bitdepth_mask = (1 << bitdepth) - 1; + for (int h = 0; h < height; ++h) { + for (int w = 0; w < width; ++w) { + ref.TopLeftPixel()[w + h * width] = rnd_.Rand16() & bitdepth_mask; + } + } + for (int h = 0; h < height; ++h) { + for (int w = 0; w < width; ++w) { + pred.TopLeftPixel()[w + h * width] = rnd_.Rand16() & bitdepth_mask; + } + } vpx_usec_timer timer; vpx_usec_timer_start(&timer); for (int i = 0; i < 10000000 / (width * height); ++i) { - avg_pred_func_(avg.TopLeftPixel(), pred.TopLeftPixel(), width, height, - ref.TopLeftPixel(), ref.stride()); + avg_pred_func_((uint8_t *)avg.TopLeftPixel(), + (uint8_t *)pred.TopLeftPixel(), width, height, + (uint8_t *)ref.TopLeftPixel(), ref.stride()); } vpx_usec_timer_mark(&timer); @@ -166,26 +201,58 @@ TEST_P(AvgPredTest, DISABLED_Speed) { } } -INSTANTIATE_TEST_SUITE_P(C, AvgPredTest, +using AvgPredTestLBD = AvgPredTest<8, uint8_t>; + +TEST_P(AvgPredTestLBD, SizeCombinations) { TestSizeCombinations(); } + +TEST_P(AvgPredTestLBD, CompareReferenceRandom) { TestCompareReferenceRandom(); } + +TEST_P(AvgPredTestLBD, DISABLED_Speed) { TestSpeed(); } + +INSTANTIATE_TEST_SUITE_P(C, AvgPredTestLBD, ::testing::Values(&vpx_comp_avg_pred_c)); #if HAVE_SSE2 -INSTANTIATE_TEST_SUITE_P(SSE2, AvgPredTest, +INSTANTIATE_TEST_SUITE_P(SSE2, AvgPredTestLBD, ::testing::Values(&vpx_comp_avg_pred_sse2)); #endif // HAVE_SSE2 #if HAVE_NEON -INSTANTIATE_TEST_SUITE_P(NEON, AvgPredTest, +INSTANTIATE_TEST_SUITE_P(NEON, AvgPredTestLBD, ::testing::Values(&vpx_comp_avg_pred_neon)); #endif // HAVE_NEON #if HAVE_VSX -INSTANTIATE_TEST_SUITE_P(VSX, AvgPredTest, +INSTANTIATE_TEST_SUITE_P(VSX, AvgPredTestLBD, ::testing::Values(&vpx_comp_avg_pred_vsx)); #endif // HAVE_VSX #if HAVE_LSX -INSTANTIATE_TEST_SUITE_P(LSX, AvgPredTest, +INSTANTIATE_TEST_SUITE_P(LSX, AvgPredTestLBD, ::testing::Values(&vpx_comp_avg_pred_lsx)); #endif // HAVE_LSX + +#if CONFIG_VP9_HIGHBITDEPTH +using HighbdAvgPredFunc = void (*)(uint16_t *a, const uint16_t *b, int w, int h, + const uint16_t *c, int c_stride); + +template +void highbd_wrapper(uint8_t *a, const uint8_t *b, int w, int h, + const uint8_t *c, int c_stride) { + fn((uint16_t *)a, (const uint16_t *)b, w, h, (const uint16_t *)c, c_stride); +} + +using AvgPredTestHBD = AvgPredTest<12, uint16_t>; + +TEST_P(AvgPredTestHBD, SizeCombinations) { TestSizeCombinations(); } + +TEST_P(AvgPredTestHBD, CompareReferenceRandom) { TestCompareReferenceRandom(); } + +TEST_P(AvgPredTestHBD, DISABLED_Speed) { TestSpeed(); } + +INSTANTIATE_TEST_SUITE_P( + C, AvgPredTestHBD, + ::testing::Values(&highbd_wrapper)); + +#endif // CONFIG_VP9_HIGHBITDEPTH } // namespace From e022d5b71ffca486b5bc174702a9fe0e35038c75 Mon Sep 17 00:00:00 2001 From: Scott LaVarnway Date: Tue, 20 Dec 2022 15:43:44 -0800 Subject: [PATCH 0558/1000] [x86]: Add vpx_highbd_comp_avg_pred_sse2(). C vs SSE2 4x4: 3.38x 8x8: 3.45x 16x16: 2.06x 32x32: 2.19x 64x64: 1.39x Change-Id: I46638fe187b49a78fee554114fac51c485d74474 --- test/comp_avg_pred_test.cc | 8 ++++- vpx_dsp/vpx_dsp_rtcd_defs.pl | 2 +- vpx_dsp/x86/highbd_variance_sse2.c | 47 ++++++++++++++++++++++++++++++ 3 files changed, 55 insertions(+), 2 deletions(-) diff --git a/test/comp_avg_pred_test.cc b/test/comp_avg_pred_test.cc index 66dc4eb4e1..70aeab8d7e 100644 --- a/test/comp_avg_pred_test.cc +++ b/test/comp_avg_pred_test.cc @@ -185,7 +185,7 @@ void AvgPredTest::TestSpeed() { vpx_usec_timer timer; vpx_usec_timer_start(&timer); - for (int i = 0; i < 10000000 / (width * height); ++i) { + for (int i = 0; i < 100000000 / (width * height); ++i) { avg_pred_func_((uint8_t *)avg.TopLeftPixel(), (uint8_t *)pred.TopLeftPixel(), width, height, (uint8_t *)ref.TopLeftPixel(), ref.stride()); @@ -254,5 +254,11 @@ INSTANTIATE_TEST_SUITE_P( C, AvgPredTestHBD, ::testing::Values(&highbd_wrapper)); +#if HAVE_SSE2 +INSTANTIATE_TEST_SUITE_P( + SSE2, AvgPredTestHBD, + ::testing::Values(&highbd_wrapper)); +#endif // HAVE_SSE2 + #endif // CONFIG_VP9_HIGHBITDEPTH } // namespace diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index b6d656820f..8725821b67 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -1400,7 +1400,7 @@ () specialize qw/vpx_highbd_12_mse8x8 sse2 neon/; add_proto qw/void vpx_highbd_comp_avg_pred/, "uint16_t *comp_pred, const uint16_t *pred, int width, int height, const uint16_t *ref, int ref_stride"; - specialize qw/vpx_highbd_comp_avg_pred neon/; + specialize qw/vpx_highbd_comp_avg_pred neon sse2/; # # Subpixel Variance diff --git a/vpx_dsp/x86/highbd_variance_sse2.c b/vpx_dsp/x86/highbd_variance_sse2.c index 7c8d79b09e..381e0ad193 100644 --- a/vpx_dsp/x86/highbd_variance_sse2.c +++ b/vpx_dsp/x86/highbd_variance_sse2.c @@ -7,6 +7,7 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ +#include // SSE2 #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" @@ -559,3 +560,49 @@ FNS(sse2) #undef FNS #undef FN + +void vpx_highbd_comp_avg_pred_sse2(uint16_t *comp_pred, const uint16_t *pred, + int width, int height, const uint16_t *ref, + int ref_stride) { + int i, j; + if (width > 8) { + for (i = 0; i < height; ++i) { + for (j = 0; j < width; j += 16) { + const __m128i p0 = _mm_loadu_si128((const __m128i *)&pred[j]); + const __m128i p1 = _mm_loadu_si128((const __m128i *)&pred[j + 8]); + const __m128i r0 = _mm_loadu_si128((const __m128i *)&ref[j]); + const __m128i r1 = _mm_loadu_si128((const __m128i *)&ref[j + 8]); + _mm_storeu_si128((__m128i *)&comp_pred[j], _mm_avg_epu16(p0, r0)); + _mm_storeu_si128((__m128i *)&comp_pred[j + 8], _mm_avg_epu16(p1, r1)); + } + comp_pred += width; + pred += width; + ref += ref_stride; + } + } else if (width == 8) { + for (i = 0; i < height; i += 2) { + const __m128i p0 = _mm_loadu_si128((const __m128i *)&pred[0]); + const __m128i p1 = _mm_loadu_si128((const __m128i *)&pred[8]); + const __m128i r0 = _mm_loadu_si128((const __m128i *)&ref[0]); + const __m128i r1 = _mm_loadu_si128((const __m128i *)&ref[ref_stride]); + _mm_storeu_si128((__m128i *)&comp_pred[0], _mm_avg_epu16(p0, r0)); + _mm_storeu_si128((__m128i *)&comp_pred[8], _mm_avg_epu16(p1, r1)); + comp_pred += 8 << 1; + pred += 8 << 1; + ref += ref_stride << 1; + } + } else { + assert(width == 4); + for (i = 0; i < height; i += 2) { + const __m128i p0 = _mm_loadl_epi64((const __m128i *)&pred[0]); + const __m128i p1 = _mm_loadl_epi64((const __m128i *)&pred[4]); + const __m128i r0 = _mm_loadl_epi64((const __m128i *)&ref[0]); + const __m128i r1 = _mm_loadl_epi64((const __m128i *)&ref[ref_stride]); + _mm_storel_epi64((__m128i *)&comp_pred[0], _mm_avg_epu16(p0, r0)); + _mm_storel_epi64((__m128i *)&comp_pred[4], _mm_avg_epu16(p1, r1)); + comp_pred += 4 << 1; + pred += 4 << 1; + ref += ref_stride << 1; + } + } +} From 11151943b1877824da6086ea0c89b5617caecb67 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Wed, 21 Dec 2022 11:13:40 -0500 Subject: [PATCH 0559/1000] Remove references to deprecated NumPy type aliases This change replaces references to a number of deprecated NumPy type aliases (np.bool, np.int, np.float, np.complex, np.object, np.str) with their recommended replacement (bool, int, float, complex, object, str). NumPy 1.24 drops the deprecated aliases so we must remove uses before updating NumPy. Change-Id: I9f5dfcbb11fe6534fce358054f210c7653f278c3 --- tools/3D-Reconstruction/MotionEST/Exhaust.py | 2 +- tools/3D-Reconstruction/MotionEST/GroundTruth.py | 4 ++-- tools/3D-Reconstruction/MotionEST/MotionEST.py | 4 ++-- tools/3D-Reconstruction/MotionEST/Util.py | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tools/3D-Reconstruction/MotionEST/Exhaust.py b/tools/3D-Reconstruction/MotionEST/Exhaust.py index 2d6a4d8114..d763de8562 100644 --- a/tools/3D-Reconstruction/MotionEST/Exhaust.py +++ b/tools/3D-Reconstruction/MotionEST/Exhaust.py @@ -83,7 +83,7 @@ def __init__(self, cur_f, ref_f, blk_size, wnd_size, beta, metric=MSE): self.beta = beta self.metric = metric super(ExhaustNeighbor, self).__init__(cur_f, ref_f, blk_size) - self.assign = np.zeros((self.num_row, self.num_col), dtype=np.bool) + self.assign = np.zeros((self.num_row, self.num_col), dtype=bool) """ estimate neighbor loss: diff --git a/tools/3D-Reconstruction/MotionEST/GroundTruth.py b/tools/3D-Reconstruction/MotionEST/GroundTruth.py index 12bc53ff73..37305898a7 100644 --- a/tools/3D-Reconstruction/MotionEST/GroundTruth.py +++ b/tools/3D-Reconstruction/MotionEST/GroundTruth.py @@ -29,7 +29,7 @@ class GroundTruth(MotionEST): def __init__(self, cur_f, ref_f, blk_sz, gt_path, mf=None, mask=None): self.name = 'ground truth' super(GroundTruth, self).__init__(cur_f, ref_f, blk_sz) - self.mask = np.zeros((self.num_row, self.num_col), dtype=np.bool) + self.mask = np.zeros((self.num_row, self.num_col), dtype=bool) if gt_path: with open(gt_path) as gt_file: lines = gt_file.readlines() @@ -42,7 +42,7 @@ def __init__(self, cur_f, ref_f, blk_sz, gt_path, mf=None, mask=None): self.mask[i, -j - 1] = True continue #the order of original file is flipped on the x axis - self.mf[i, -j - 1] = np.array([float(y), -float(x)], dtype=np.int) + self.mf[i, -j - 1] = np.array([float(y), -float(x)], dtype=int) else: self.mf = mf self.mask = mask diff --git a/tools/3D-Reconstruction/MotionEST/MotionEST.py b/tools/3D-Reconstruction/MotionEST/MotionEST.py index 0959530fa0..fc393818d9 100644 --- a/tools/3D-Reconstruction/MotionEST/MotionEST.py +++ b/tools/3D-Reconstruction/MotionEST/MotionEST.py @@ -28,8 +28,8 @@ def __init__(self, cur_f, ref_f, blk_sz): self.ref_f = ref_f self.blk_sz = blk_sz #convert RGB to YUV - self.cur_yuv = np.array(self.cur_f.convert('YCbCr'), dtype=np.int) - self.ref_yuv = np.array(self.ref_f.convert('YCbCr'), dtype=np.int) + self.cur_yuv = np.array(self.cur_f.convert('YCbCr'), dtype=int) + self.ref_yuv = np.array(self.ref_f.convert('YCbCr'), dtype=int) #frame size self.width = self.cur_f.size[0] self.height = self.cur_f.size[1] diff --git a/tools/3D-Reconstruction/MotionEST/Util.py b/tools/3D-Reconstruction/MotionEST/Util.py index 551881cfd7..c2416163be 100644 --- a/tools/3D-Reconstruction/MotionEST/Util.py +++ b/tools/3D-Reconstruction/MotionEST/Util.py @@ -18,7 +18,7 @@ def MSE(blk1, blk2): return np.mean( LA.norm( - np.array(blk1, dtype=np.int) - np.array(blk2, dtype=np.int), axis=2)) + np.array(blk1, dtype=int) - np.array(blk2, dtype=int), axis=2)) def drawMF(img, blk_sz, mf): From ab1192c2907185d59f0044230602ea6025a42844 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Thu, 5 Jan 2023 12:20:03 +0000 Subject: [PATCH 0560/1000] Use lane-referencing intrinsics in Neon convolution kernels The Neon convolution helper functions take a pointer to a filter and load the 8 values into a single Neon register. For some reason, filter values 3 and 4 are then duplicated into their own separate registers. This patch modifies these helper functions so that they access filter values 3 and 4 via the lane-referencing versions of the various Neon multiply instructions. This reduces register pressure and tidies up the source code quite a bit. Change-Id: Ia4aeee8b46fe218658fb8577dc07ff04a9324b3e --- vpx_dsp/arm/vpx_convolve8_neon.c | 166 +++++++----------------- vpx_dsp/arm/vpx_convolve8_neon.h | 20 +-- vpx_dsp/arm/vpx_scaled_convolve8_neon.c | 9 +- 3 files changed, 59 insertions(+), 136 deletions(-) diff --git a/vpx_dsp/arm/vpx_convolve8_neon.c b/vpx_dsp/arm/vpx_convolve8_neon.c index ca5222fa07..28018398a5 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon.c +++ b/vpx_dsp/arm/vpx_convolve8_neon.c @@ -807,16 +807,13 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, if (h == 4) { uint8x8_t d01, d23; - int16x4_t filter3, filter4, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, - d1, d2, d3; + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; int16x8_t tt0, tt1, tt2, tt3; __builtin_prefetch(src + 0 * src_stride); __builtin_prefetch(src + 1 * src_stride); __builtin_prefetch(src + 2 * src_stride); __builtin_prefetch(src + 3 * src_stride); - filter3 = vdup_lane_s16(vget_low_s16(filters), 3); - filter4 = vdup_lane_s16(vget_high_s16(filters), 0); load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); transpose_u8_8x4(&t0, &t1, &t2, &t3); tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); @@ -848,14 +845,10 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, s9 = vget_low_s16(tt2); s10 = vget_low_s16(tt3); - d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, - filter4); - d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, - filter4); - d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3, - filter4); - d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3, - filter4); + d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters); + d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters); + d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters); + d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters); d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7); d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7); @@ -882,8 +875,6 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, w -= 4; } while (w != 0); } else { - const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3); - const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0); int width; const uint8_t *s; uint8x8_t t4, t5, t6, t7; @@ -926,14 +917,10 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, __builtin_prefetch(src + 5 * src_stride); __builtin_prefetch(src + 6 * src_stride); __builtin_prefetch(src + 7 * src_stride); - t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, - filter4); - t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, - filter4); - t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3, - filter4); - t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3, - filter4); + t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); + t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); + t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters); + t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); transpose_u8_8x4(&t0, &t1, &t2, &t3); vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0), 0); @@ -1001,22 +988,14 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, s13 = vreinterpretq_s16_u16(vmovl_u8(t6)); s14 = vreinterpretq_s16_u16(vmovl_u8(t7)); - t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, - filter4); - t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, - filter4); - t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3, - filter4); - t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3, - filter4); - t4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters, filter3, - filter4); - t5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters, filter3, - filter4); - t6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters, filter3, - filter4); - t7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters, - filter3, filter4); + t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); + t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); + t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters); + t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); + t4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters); + t5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters); + t6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters); + t7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters); transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); store_u8_8x8(d, dst_stride, t0, t1, t2, t3, t4, t5, t6, t7); @@ -1060,8 +1039,7 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, if (h == 4) { uint8x8_t d01, d23; - int16x4_t filter3, filter4, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, - d1, d2, d3; + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; int16x8_t tt0, tt1, tt2, tt3; uint32x4_t d0123 = vdupq_n_u32(0); @@ -1069,8 +1047,6 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, __builtin_prefetch(src + 1 * src_stride); __builtin_prefetch(src + 2 * src_stride); __builtin_prefetch(src + 3 * src_stride); - filter3 = vdup_lane_s16(vget_low_s16(filters), 3); - filter4 = vdup_lane_s16(vget_high_s16(filters), 0); load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); transpose_u8_8x4(&t0, &t1, &t2, &t3); tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); @@ -1102,14 +1078,10 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, s9 = vget_low_s16(tt2); s10 = vget_low_s16(tt3); - d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, - filter4); - d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, - filter4); - d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3, - filter4); - d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3, - filter4); + d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters); + d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters); + d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters); + d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters); d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7); d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7); @@ -1139,8 +1111,6 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, w -= 4; } while (w != 0); } else { - const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3); - const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0); int width; const uint8_t *s; uint8x8_t t4, t5, t6, t7; @@ -1185,14 +1155,10 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, __builtin_prefetch(src + 5 * src_stride); __builtin_prefetch(src + 6 * src_stride); __builtin_prefetch(src + 7 * src_stride); - t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, - filter4); - t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, - filter4); - t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3, - filter4); - t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3, - filter4); + t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); + t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); + t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters); + t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); transpose_u8_8x4(&t0, &t1, &t2, &t3); @@ -1275,22 +1241,14 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, s13 = vreinterpretq_s16_u16(vmovl_u8(t6)); s14 = vreinterpretq_s16_u16(vmovl_u8(t7)); - t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, - filter4); - t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, - filter4); - t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3, - filter4); - t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3, - filter4); - t4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters, filter3, - filter4); - t5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters, filter3, - filter4); - t6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters, filter3, - filter4); - t7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters, - filter3, filter4); + t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); + t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); + t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters); + t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); + t4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters); + t5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters); + t6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters); + t7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters); transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); @@ -1348,8 +1306,6 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, src -= 3 * src_stride; if (w == 4) { - const int16x4_t filter3 = vdup_lane_s16(vget_low_s16(filters), 3); - const int16x4_t filter4 = vdup_lane_s16(vget_high_s16(filters), 0); uint8x8_t d01, d23; int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; @@ -1386,14 +1342,10 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, __builtin_prefetch(src + 1 * src_stride); __builtin_prefetch(src + 2 * src_stride); __builtin_prefetch(src + 3 * src_stride); - d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, - filter4); - d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, - filter4); - d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3, - filter4); - d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3, - filter4); + d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters); + d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters); + d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters); + d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters); d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7); d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7); @@ -1416,8 +1368,6 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, h -= 4; } while (h != 0); } else { - const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3); - const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0); int height; const uint8_t *s; uint8_t *d; @@ -1468,14 +1418,10 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, __builtin_prefetch(s + 1 * src_stride); __builtin_prefetch(s + 2 * src_stride); __builtin_prefetch(s + 3 * src_stride); - t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, - filter4); - t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, - filter4); - t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3, - filter4); - t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3, - filter4); + t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); + t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); + t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters); + t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); vst1_u8(d, t0); d += dst_stride; @@ -1520,8 +1466,6 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, src -= 3 * src_stride; if (w == 4) { - const int16x4_t filter3 = vdup_lane_s16(vget_low_s16(filters), 3); - const int16x4_t filter4 = vdup_lane_s16(vget_high_s16(filters), 0); uint8x8_t d01, d23; int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; uint32x4_t d0123 = vdupq_n_u32(0); @@ -1559,14 +1503,10 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, __builtin_prefetch(src + 1 * src_stride); __builtin_prefetch(src + 2 * src_stride); __builtin_prefetch(src + 3 * src_stride); - d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, - filter4); - d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, - filter4); - d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3, - filter4); - d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3, - filter4); + d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters); + d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters); + d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters); + d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters); d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7); d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7); @@ -1597,8 +1537,6 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, h -= 4; } while (h != 0); } else { - const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3); - const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0); int height; const uint8_t *s; uint8_t *d; @@ -1650,14 +1588,10 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, __builtin_prefetch(s + 1 * src_stride); __builtin_prefetch(s + 2 * src_stride); __builtin_prefetch(s + 3 * src_stride); - t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, - filter4); - t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, - filter4); - t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3, - filter4); - t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3, - filter4); + t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); + t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); + t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters); + t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); d01 = vcombine_u8(t0, t1); d23 = vcombine_u8(t2, t3); diff --git a/vpx_dsp/arm/vpx_convolve8_neon.h b/vpx_dsp/arm/vpx_convolve8_neon.h index b112cb249a..b8dfce71ea 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon.h +++ b/vpx_dsp/arm/vpx_convolve8_neon.h @@ -176,9 +176,7 @@ static INLINE int16x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, const int16x4_t s6, const int16x4_t s7, - const int16x8_t filters, - const int16x4_t filter3, - const int16x4_t filter4) { + const int16x8_t filters) { const int16x4_t filters_lo = vget_low_s16(filters); const int16x4_t filters_hi = vget_high_s16(filters); int16x4_t sum; @@ -189,8 +187,8 @@ static INLINE int16x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1, sum = vmla_lane_s16(sum, s5, filters_hi, 1); sum = vmla_lane_s16(sum, s6, filters_hi, 2); sum = vmla_lane_s16(sum, s7, filters_hi, 3); - sum = vqadd_s16(sum, vmul_s16(s3, filter3)); - sum = vqadd_s16(sum, vmul_s16(s4, filter4)); + sum = vqadd_s16(sum, vmul_lane_s16(s3, filters_lo, 3)); + sum = vqadd_s16(sum, vmul_lane_s16(s4, filters_hi, 0)); return sum; } @@ -198,9 +196,7 @@ static INLINE uint8x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, const int16x8_t s6, const int16x8_t s7, - const int16x8_t filters, - const int16x8_t filter3, - const int16x8_t filter4) { + const int16x8_t filters) { const int16x4_t filters_lo = vget_low_s16(filters); const int16x4_t filters_hi = vget_high_s16(filters); int16x8_t sum; @@ -211,15 +207,13 @@ static INLINE uint8x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1, sum = vmlaq_lane_s16(sum, s5, filters_hi, 1); sum = vmlaq_lane_s16(sum, s6, filters_hi, 2); sum = vmlaq_lane_s16(sum, s7, filters_hi, 3); - sum = vqaddq_s16(sum, vmulq_s16(s3, filter3)); - sum = vqaddq_s16(sum, vmulq_s16(s4, filter4)); + sum = vqaddq_s16(sum, vmulq_lane_s16(s3, filters_lo, 3)); + sum = vqaddq_s16(sum, vmulq_lane_s16(s4, filters_hi, 0)); return vqrshrun_n_s16(sum, 7); } static INLINE uint8x8_t scale_filter_8(const uint8x8_t *const s, const int16x8_t filters) { - const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3); - const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0); int16x8_t ss[8]; ss[0] = vreinterpretq_s16_u16(vmovl_u8(s[0])); @@ -232,7 +226,7 @@ static INLINE uint8x8_t scale_filter_8(const uint8x8_t *const s, ss[7] = vreinterpretq_s16_u16(vmovl_u8(s[7])); return convolve8_8(ss[0], ss[1], ss[2], ss[3], ss[4], ss[5], ss[6], ss[7], - filters, filter3, filter4); + filters); } #endif // VPX_VPX_DSP_ARM_VPX_CONVOLVE8_NEON_H_ diff --git a/vpx_dsp/arm/vpx_scaled_convolve8_neon.c b/vpx_dsp/arm/vpx_scaled_convolve8_neon.c index 8edf8a66e6..8491ca7ac5 100644 --- a/vpx_dsp/arm/vpx_scaled_convolve8_neon.c +++ b/vpx_dsp/arm/vpx_scaled_convolve8_neon.c @@ -38,8 +38,6 @@ static INLINE void scaledconvolve_horiz_w4( const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; if (x_q4 & SUBPEL_MASK) { const int16x8_t filters = vld1q_s16(x_filters[x_q4 & SUBPEL_MASK]); - const int16x4_t filter3 = vdup_lane_s16(vget_low_s16(filters), 3); - const int16x4_t filter4 = vdup_lane_s16(vget_high_s16(filters), 0); uint8x8_t s[8], d; int16x8_t ss[4]; int16x4_t t[8], tt; @@ -61,7 +59,7 @@ static INLINE void scaledconvolve_horiz_w4( t[7] = vget_high_s16(ss[3]); tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7], - filters, filter3, filter4); + filters); d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7); vst1_lane_u32((uint32_t *)&temp[4 * z], vreinterpret_u32_u8(d), 0); } else { @@ -167,8 +165,6 @@ static INLINE void scaledconvolve_vert_w4( if (y_q4 & SUBPEL_MASK) { const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]); - const int16x4_t filter3 = vdup_lane_s16(vget_low_s16(filters), 3); - const int16x4_t filter4 = vdup_lane_s16(vget_high_s16(filters), 0); uint8x8_t s[8], d; int16x4_t t[8], tt; @@ -183,8 +179,7 @@ static INLINE void scaledconvolve_vert_w4( t[6] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[6]))); t[7] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[7]))); - tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7], filters, - filter3, filter4); + tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7], filters); d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7); vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d), 0); } else { From 708c4aa8540ec81aa5f0d93edc2e1e4d6d4581ac Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Thu, 5 Jan 2023 15:04:53 +0000 Subject: [PATCH 0561/1000] Use Neon load/store helper functions consistently Define all Neon load/store helper functions in mem_neon.h and use them consistently in Neon convolution functions. Change-Id: I57905bc0a3574c77999cf4f4a73442c3420fa2be --- vp9/encoder/arm/neon/vp9_frame_scale_neon.c | 1 + vpx_dsp/arm/mem_neon.h | 157 +++++++++++++++++ vpx_dsp/arm/vpx_convolve8_neon.c | 177 +++++--------------- vpx_dsp/arm/vpx_convolve8_neon.h | 56 ------- vpx_dsp/arm/vpx_scaled_convolve8_neon.c | 1 + 5 files changed, 198 insertions(+), 194 deletions(-) diff --git a/vp9/encoder/arm/neon/vp9_frame_scale_neon.c b/vp9/encoder/arm/neon/vp9_frame_scale_neon.c index 69b8cfffd7..bc8dd4a341 100644 --- a/vp9/encoder/arm/neon/vp9_frame_scale_neon.c +++ b/vp9/encoder/arm/neon/vp9_frame_scale_neon.c @@ -14,6 +14,7 @@ #include "./vpx_dsp_rtcd.h" #include "./vpx_scale_rtcd.h" #include "vp9/common/vp9_blockd.h" +#include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/transpose_neon.h" #include "vpx_dsp/arm/vpx_convolve8_neon.h" #include "vpx_dsp/vpx_filter.h" diff --git a/vpx_dsp/arm/mem_neon.h b/vpx_dsp/arm/mem_neon.h index 84aae161b3..19cfc7c7f2 100644 --- a/vpx_dsp/arm/mem_neon.h +++ b/vpx_dsp/arm/mem_neon.h @@ -201,4 +201,161 @@ static INLINE void store_u8(uint8_t *buf, ptrdiff_t stride, const uint8x8_t a) { buf += stride; vst1_lane_u32((uint32_t *)buf, a_u32, 1); } + +static INLINE void load_u8_8x4(const uint8_t *s, const ptrdiff_t p, + uint8x8_t *const s0, uint8x8_t *const s1, + uint8x8_t *const s2, uint8x8_t *const s3) { + *s0 = vld1_u8(s); + s += p; + *s1 = vld1_u8(s); + s += p; + *s2 = vld1_u8(s); + s += p; + *s3 = vld1_u8(s); +} + +static INLINE void store_u8_8x4(uint8_t *s, const ptrdiff_t p, + const uint8x8_t s0, const uint8x8_t s1, + const uint8x8_t s2, const uint8x8_t s3) { + vst1_u8(s, s0); + s += p; + vst1_u8(s, s1); + s += p; + vst1_u8(s, s2); + s += p; + vst1_u8(s, s3); +} + +static INLINE void load_u8_16x4(const uint8_t *s, const ptrdiff_t p, + uint8x16_t *const s0, uint8x16_t *const s1, + uint8x16_t *const s2, uint8x16_t *const s3) { + *s0 = vld1q_u8(s); + s += p; + *s1 = vld1q_u8(s); + s += p; + *s2 = vld1q_u8(s); + s += p; + *s3 = vld1q_u8(s); +} + +static INLINE void store_u8_16x4(uint8_t *s, const ptrdiff_t p, + const uint8x16_t s0, const uint8x16_t s1, + const uint8x16_t s2, const uint8x16_t s3) { + vst1q_u8(s, s0); + s += p; + vst1q_u8(s, s1); + s += p; + vst1q_u8(s, s2); + s += p; + vst1q_u8(s, s3); +} + +static INLINE void load_u8_8x7(const uint8_t *s, const ptrdiff_t p, + uint8x8_t *const s0, uint8x8_t *const s1, + uint8x8_t *const s2, uint8x8_t *const s3, + uint8x8_t *const s4, uint8x8_t *const s5, + uint8x8_t *const s6) { + *s0 = vld1_u8(s); + s += p; + *s1 = vld1_u8(s); + s += p; + *s2 = vld1_u8(s); + s += p; + *s3 = vld1_u8(s); + s += p; + *s4 = vld1_u8(s); + s += p; + *s5 = vld1_u8(s); + s += p; + *s6 = vld1_u8(s); +} + +static INLINE void load_u8_8x8(const uint8_t *s, const ptrdiff_t p, + uint8x8_t *const s0, uint8x8_t *const s1, + uint8x8_t *const s2, uint8x8_t *const s3, + uint8x8_t *const s4, uint8x8_t *const s5, + uint8x8_t *const s6, uint8x8_t *const s7) { + *s0 = vld1_u8(s); + s += p; + *s1 = vld1_u8(s); + s += p; + *s2 = vld1_u8(s); + s += p; + *s3 = vld1_u8(s); + s += p; + *s4 = vld1_u8(s); + s += p; + *s5 = vld1_u8(s); + s += p; + *s6 = vld1_u8(s); + s += p; + *s7 = vld1_u8(s); +} + +static INLINE void store_u8_8x8(uint8_t *s, const ptrdiff_t p, + const uint8x8_t s0, const uint8x8_t s1, + const uint8x8_t s2, const uint8x8_t s3, + const uint8x8_t s4, const uint8x8_t s5, + const uint8x8_t s6, const uint8x8_t s7) { + vst1_u8(s, s0); + s += p; + vst1_u8(s, s1); + s += p; + vst1_u8(s, s2); + s += p; + vst1_u8(s, s3); + s += p; + vst1_u8(s, s4); + s += p; + vst1_u8(s, s5); + s += p; + vst1_u8(s, s6); + s += p; + vst1_u8(s, s7); +} + +static INLINE void load_u8_16x8(const uint8_t *s, const ptrdiff_t p, + uint8x16_t *const s0, uint8x16_t *const s1, + uint8x16_t *const s2, uint8x16_t *const s3, + uint8x16_t *const s4, uint8x16_t *const s5, + uint8x16_t *const s6, uint8x16_t *const s7) { + *s0 = vld1q_u8(s); + s += p; + *s1 = vld1q_u8(s); + s += p; + *s2 = vld1q_u8(s); + s += p; + *s3 = vld1q_u8(s); + s += p; + *s4 = vld1q_u8(s); + s += p; + *s5 = vld1q_u8(s); + s += p; + *s6 = vld1q_u8(s); + s += p; + *s7 = vld1q_u8(s); +} + +static INLINE void store_u8_16x8(uint8_t *s, const ptrdiff_t p, + const uint8x16_t s0, const uint8x16_t s1, + const uint8x16_t s2, const uint8x16_t s3, + const uint8x16_t s4, const uint8x16_t s5, + const uint8x16_t s6, const uint8x16_t s7) { + vst1q_u8(s, s0); + s += p; + vst1q_u8(s, s1); + s += p; + vst1q_u8(s, s2); + s += p; + vst1q_u8(s, s3); + s += p; + vst1q_u8(s, s4); + s += p; + vst1q_u8(s, s5); + s += p; + vst1q_u8(s, s6); + s += p; + vst1q_u8(s, s7); +} + #endif // VPX_VPX_DSP_ARM_MEM_NEON_H_ diff --git a/vpx_dsp/arm/vpx_convolve8_neon.c b/vpx_dsp/arm/vpx_convolve8_neon.c index 28018398a5..dba436b1a0 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon.c +++ b/vpx_dsp/arm/vpx_convolve8_neon.c @@ -124,33 +124,22 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, int16x8_t t01, t23; uint8x8_t d01, d23; - s0 = vld1q_u8(src); - src += src_stride; - s1 = vld1q_u8(src); - src += src_stride; - s2 = vld1q_u8(src); - src += src_stride; - s3 = vld1q_u8(src); - src += src_stride; + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); t0 = convolve8_4_dot(s0, filters, correction, range_limit, permute_tbl); t1 = convolve8_4_dot(s1, filters, correction, range_limit, permute_tbl); t2 = convolve8_4_dot(s2, filters, correction, range_limit, permute_tbl); t3 = convolve8_4_dot(s3, filters, correction, range_limit, permute_tbl); - t01 = vcombine_s16(vqmovn_s32(t0), vqmovn_s32(t1)); t23 = vcombine_s16(vqmovn_s32(t2), vqmovn_s32(t3)); d01 = vqrshrun_n_s16(t01, 7); d23 = vqrshrun_n_s16(t23, 7); - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 1); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 0); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 1); - dst += dst_stride; + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + src += 4 * src_stride; + dst += 4 * dst_stride; h -= 4; } while (h > 0); } else { @@ -165,20 +154,14 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, s = src; d = dst; do { - s0 = vld1q_u8(s + 0 * src_stride); - s1 = vld1q_u8(s + 1 * src_stride); - s2 = vld1q_u8(s + 2 * src_stride); - s3 = vld1q_u8(s + 3 * src_stride); + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); d0 = convolve8_8_dot(s0, filters, correction, range_limit, permute_tbl); d1 = convolve8_8_dot(s1, filters, correction, range_limit, permute_tbl); d2 = convolve8_8_dot(s2, filters, correction, range_limit, permute_tbl); d3 = convolve8_8_dot(s3, filters, correction, range_limit, permute_tbl); - vst1_u8(d + 0 * dst_stride, d0); - vst1_u8(d + 1 * dst_stride, d1); - vst1_u8(d + 2 * dst_stride, d2); - vst1_u8(d + 3 * dst_stride, d3); + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); s += 8; d += 8; @@ -221,20 +204,12 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, dd01 = vdup_n_u8(0); dd23 = vdup_n_u8(0); - s0 = vld1q_u8(src); - src += src_stride; - s1 = vld1q_u8(src); - src += src_stride; - s2 = vld1q_u8(src); - src += src_stride; - s3 = vld1q_u8(src); - src += src_stride; + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); t0 = convolve8_4_dot(s0, filters, correction, range_limit, permute_tbl); t1 = convolve8_4_dot(s1, filters, correction, range_limit, permute_tbl); t2 = convolve8_4_dot(s2, filters, correction, range_limit, permute_tbl); t3 = convolve8_4_dot(s3, filters, correction, range_limit, permute_tbl); - t01 = vcombine_s16(vqmovn_s32(t0), vqmovn_s32(t1)); t23 = vcombine_s16(vqmovn_s32(t2), vqmovn_s32(t3)); d01 = vqrshrun_n_s16(t01, 7); @@ -242,17 +217,15 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, dd01 = load_u8(dst + 0 * dst_stride, dst_stride); dd23 = load_u8(dst + 2 * dst_stride, dst_stride); + d01 = vrhadd_u8(d01, dd01); d23 = vrhadd_u8(d23, dd23); - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 1); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 0); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 1); - dst += dst_stride; + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + src += 4 * src_stride; + dst += 4 * dst_stride; h -= 4; } while (h > 0); } else { @@ -267,29 +240,21 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, s = src; d = dst; do { - s0 = vld1q_u8(s + 0 * src_stride); - s1 = vld1q_u8(s + 1 * src_stride); - s2 = vld1q_u8(s + 2 * src_stride); - s3 = vld1q_u8(s + 3 * src_stride); + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); d0 = convolve8_8_dot(s0, filters, correction, range_limit, permute_tbl); d1 = convolve8_8_dot(s1, filters, correction, range_limit, permute_tbl); d2 = convolve8_8_dot(s2, filters, correction, range_limit, permute_tbl); d3 = convolve8_8_dot(s3, filters, correction, range_limit, permute_tbl); - dd0 = vld1_u8(d + 0 * dst_stride); - dd1 = vld1_u8(d + 1 * dst_stride); - dd2 = vld1_u8(d + 2 * dst_stride); - dd3 = vld1_u8(d + 3 * dst_stride); + load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); + d0 = vrhadd_u8(d0, dd0); d1 = vrhadd_u8(d1, dd1); d2 = vrhadd_u8(d2, dd2); d3 = vrhadd_u8(d3, dd3); - vst1_u8(d + 0 * dst_stride, d0); - vst1_u8(d + 1 * dst_stride, d1); - vst1_u8(d + 2 * dst_stride, d2); - vst1_u8(d + 3 * dst_stride, d3); + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); s += 8; d += 8; @@ -332,14 +297,8 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, int32x4_t d0, d1, d2, d3; uint8x8_t d01, d23; - load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); - src += 4 * src_stride; - t4 = vld1_u8(src); - src += src_stride; - t5 = vld1_u8(src); - src += src_stride; - t6 = vld1_u8(src); - src += src_stride; + load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); + src += 7 * src_stride; /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); @@ -387,18 +346,11 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, d1 = convolve8_4_dot_partial(s1234, s5678, correction, filters); d2 = convolve8_4_dot_partial(s2345, s6789, correction, filters); d3 = convolve8_4_dot_partial(s3456, s78910, correction, filters); - d01 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d0), vqmovn_s32(d1)), 7); d23 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d2), vqmovn_s32(d3)), 7); - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 1); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 0); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 1); - dst += dst_stride; + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); /* Prepare block for next iteration - re-using as much as possible. */ /* Shuffle everything up four rows. */ @@ -408,6 +360,7 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, s3456 = s78910; src += 4 * src_stride; + dst += 4 * dst_stride; h -= 4; } while (h > 0); } else { @@ -425,14 +378,8 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, s = src; d = dst; - load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); - s += 4 * src_stride; - t4 = vld1_u8(s); - s += src_stride; - t5 = vld1_u8(s); - s += src_stride; - t6 = vld1_u8(s); - s += src_stride; + load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); + s += 7 * src_stride; /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); @@ -498,10 +445,8 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, correction, filters); d3 = convolve8_8_dot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi, correction, filters); - vst1_u8(d + 0 * dst_stride, d0); - vst1_u8(d + 1 * dst_stride, d1); - vst1_u8(d + 2 * dst_stride, d2); - vst1_u8(d + 3 * dst_stride, d3); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); /* Prepare block for next iteration - re-using as much as possible. */ /* Shuffle everything up four rows. */ @@ -555,14 +500,8 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, int32x4_t d0, d1, d2, d3; uint8x8_t d01, d23, dd01, dd23; - load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); - src += 4 * src_stride; - t4 = vld1_u8(src); - src += src_stride; - t5 = vld1_u8(src); - src += src_stride; - t6 = vld1_u8(src); - src += src_stride; + load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); + src += 7 * src_stride; /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); @@ -610,23 +549,17 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, d1 = convolve8_4_dot_partial(s1234, s5678, correction, filters); d2 = convolve8_4_dot_partial(s2345, s6789, correction, filters); d3 = convolve8_4_dot_partial(s3456, s78910, correction, filters); - d01 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d0), vqmovn_s32(d1)), 7); d23 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d2), vqmovn_s32(d3)), 7); dd01 = load_u8(dst + 0 * dst_stride, dst_stride); dd23 = load_u8(dst + 2 * dst_stride, dst_stride); + d01 = vrhadd_u8(d01, dd01); d23 = vrhadd_u8(d23, dd23); - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 1); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 0); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 1); - dst += dst_stride; + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); /* Prepare block for next iteration - re-using as much as possible. */ /* Shuffle everything up four rows. */ @@ -636,6 +569,7 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, s3456 = s78910; src += 4 * src_stride; + dst += 4 * dst_stride; h -= 4; } while (h > 0); } else { @@ -653,14 +587,8 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, s = src; d = dst; - load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); - s += 4 * src_stride; - t4 = vld1_u8(s); - s += src_stride; - t5 = vld1_u8(s); - s += src_stride; - t6 = vld1_u8(s); - s += src_stride; + load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); + s += 7 * src_stride; /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); @@ -727,19 +655,14 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, d3 = convolve8_8_dot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi, correction, filters); - dd0 = vld1_u8(d + 0 * dst_stride); - dd1 = vld1_u8(d + 1 * dst_stride); - dd2 = vld1_u8(d + 2 * dst_stride); - dd3 = vld1_u8(d + 3 * dst_stride); + load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); + d0 = vrhadd_u8(d0, dd0); d1 = vrhadd_u8(d1, dd1); d2 = vrhadd_u8(d2, dd2); d3 = vrhadd_u8(d3, dd3); - vst1_u8(d + 0 * dst_stride, d0); - vst1_u8(d + 1 * dst_stride, d1); - vst1_u8(d + 2 * dst_stride, d2); - vst1_u8(d + 3 * dst_stride, d3); + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); /* Prepare block for next iteration - re-using as much as possible. */ /* Shuffle everything up four rows. */ @@ -765,28 +688,6 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, #else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)) -static INLINE void store_u8_8x8(uint8_t *s, const ptrdiff_t p, - const uint8x8_t s0, const uint8x8_t s1, - const uint8x8_t s2, const uint8x8_t s3, - const uint8x8_t s4, const uint8x8_t s5, - const uint8x8_t s6, const uint8x8_t s7) { - vst1_u8(s, s0); - s += p; - vst1_u8(s, s1); - s += p; - vst1_u8(s, s2); - s += p; - vst1_u8(s, s3); - s += p; - vst1_u8(s, s4); - s += p; - vst1_u8(s, s5); - s += p; - vst1_u8(s, s6); - s += p; - vst1_u8(s, s7); -} - void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, diff --git a/vpx_dsp/arm/vpx_convolve8_neon.h b/vpx_dsp/arm/vpx_convolve8_neon.h index b8dfce71ea..26a5fa688a 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon.h +++ b/vpx_dsp/arm/vpx_convolve8_neon.h @@ -16,62 +16,6 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" -static INLINE void load_u8_8x4(const uint8_t *s, const ptrdiff_t p, - uint8x8_t *const s0, uint8x8_t *const s1, - uint8x8_t *const s2, uint8x8_t *const s3) { - *s0 = vld1_u8(s); - s += p; - *s1 = vld1_u8(s); - s += p; - *s2 = vld1_u8(s); - s += p; - *s3 = vld1_u8(s); -} - -static INLINE void load_u8_8x8(const uint8_t *s, const ptrdiff_t p, - uint8x8_t *const s0, uint8x8_t *const s1, - uint8x8_t *const s2, uint8x8_t *const s3, - uint8x8_t *const s4, uint8x8_t *const s5, - uint8x8_t *const s6, uint8x8_t *const s7) { - *s0 = vld1_u8(s); - s += p; - *s1 = vld1_u8(s); - s += p; - *s2 = vld1_u8(s); - s += p; - *s3 = vld1_u8(s); - s += p; - *s4 = vld1_u8(s); - s += p; - *s5 = vld1_u8(s); - s += p; - *s6 = vld1_u8(s); - s += p; - *s7 = vld1_u8(s); -} - -static INLINE void load_u8_16x8(const uint8_t *s, const ptrdiff_t p, - uint8x16_t *const s0, uint8x16_t *const s1, - uint8x16_t *const s2, uint8x16_t *const s3, - uint8x16_t *const s4, uint8x16_t *const s5, - uint8x16_t *const s6, uint8x16_t *const s7) { - *s0 = vld1q_u8(s); - s += p; - *s1 = vld1q_u8(s); - s += p; - *s2 = vld1q_u8(s); - s += p; - *s3 = vld1q_u8(s); - s += p; - *s4 = vld1q_u8(s); - s += p; - *s5 = vld1q_u8(s); - s += p; - *s6 = vld1q_u8(s); - s += p; - *s7 = vld1q_u8(s); -} - #if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) static INLINE int32x4_t convolve8_4_dot_partial(const int8x16_t samples_lo, diff --git a/vpx_dsp/arm/vpx_scaled_convolve8_neon.c b/vpx_dsp/arm/vpx_scaled_convolve8_neon.c index 8491ca7ac5..b8e3c5e540 100644 --- a/vpx_dsp/arm/vpx_scaled_convolve8_neon.c +++ b/vpx_dsp/arm/vpx_scaled_convolve8_neon.c @@ -15,6 +15,7 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/transpose_neon.h" #include "vpx_dsp/arm/vpx_convolve8_neon.h" #include "vpx_ports/mem.h" From e067469e77bec79f7f52d074c440f01bfa4c14af Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 10 Jan 2023 13:49:15 -0800 Subject: [PATCH 0562/1000] build: replace egrep with grep -E avoids a warning on some platforms: egrep: warning: egrep is obsolescent; using grep -E Bug: webm:1786 Change-Id: Ia434297731303aacb0b02cf3dcbfd8e03936485d Fixed: webm:1786 --- build/make/gen_asm_deps.sh | 2 +- libs.mk | 4 ++-- test/tools_common.sh | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/build/make/gen_asm_deps.sh b/build/make/gen_asm_deps.sh index 6a7bff9ebc..3bd4d125f1 100755 --- a/build/make/gen_asm_deps.sh +++ b/build/make/gen_asm_deps.sh @@ -42,7 +42,7 @@ done [ -n "$srcfile" ] || show_help sfx=${sfx:-asm} -includes=$(LC_ALL=C egrep -i "include +\"?[a-z0-9_/]+\.${sfx}" $srcfile | +includes=$(LC_ALL=C grep -E -i "include +\"?[a-z0-9_/]+\.${sfx}" $srcfile | perl -p -e "s;.*?([a-z0-9_/]+.${sfx}).*;\1;") #" restore editor state for inc in ${includes}; do diff --git a/libs.mk b/libs.mk index f65e99242c..fb6fbbeb20 100644 --- a/libs.mk +++ b/libs.mk @@ -446,13 +446,13 @@ ifeq ($(VPX_ARCH_X86)$(VPX_ARCH_X86_64),yes) # YASM $(BUILD_PFX)vpx_config.asm: $(BUILD_PFX)vpx_config.h @echo " [CREATE] $@" - @LC_ALL=C egrep "#define [A-Z0-9_]+ [01]" $< \ + @LC_ALL=C grep -E "#define [A-Z0-9_]+ [01]" $< \ | awk '{print $$2 " equ " $$3}' > $@ else ADS2GAS=$(if $(filter yes,$(CONFIG_GCC)),| $(ASM_CONVERSION)) $(BUILD_PFX)vpx_config.asm: $(BUILD_PFX)vpx_config.h @echo " [CREATE] $@" - @LC_ALL=C egrep "#define [A-Z0-9_]+ [01]" $< \ + @LC_ALL=C grep -E "#define [A-Z0-9_]+ [01]" $< \ | awk '{print $$2 " EQU " $$3}' $(ADS2GAS) > $@ @echo " END" $(ADS2GAS) >> $@ CLEAN-OBJS += $(BUILD_PFX)vpx_config.asm diff --git a/test/tools_common.sh b/test/tools_common.sh index 844a12534d..0e4a0a5c0e 100755 --- a/test/tools_common.sh +++ b/test/tools_common.sh @@ -133,7 +133,7 @@ vpx_config_option_enabled() { vpx_config_option="${1}" vpx_config_file="${LIBVPX_CONFIG_PATH}/vpx_config.h" config_line=$(grep "${vpx_config_option}" "${vpx_config_file}") - if echo "${config_line}" | egrep -q '1$'; then + if echo "${config_line}" | grep -E -q '1$'; then echo yes fi } @@ -222,7 +222,7 @@ filter_strings() { if [ -n "${filter}" ]; then for s in ${strings}; do - if echo "${s}" | egrep -q ${exclude} "${filter}" > /dev/null 2>&1; then + if echo "${s}" | grep -E -q ${exclude} "${filter}" > /dev/null 2>&1; then filtered_strings="${filtered_strings} ${s}" fi done From f952068691bcc397a17721d004ac84e63e46bb3c Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Wed, 18 May 2022 14:14:56 +0100 Subject: [PATCH 0563/1000] Implement horizontal convolutions using Neon USDOT instruction Add additional AArch64 paths for vpx_convolve8_horiz_neon and vpx_convolve8_avg_horiz_neon that use the Armv8.6-A USDOT (mixed-sign dot-product) instruction. The USDOT instruction takes an 8-bit unsigned operand vector and a signed 8-bit operand vector to produce a signed 32-bit result. This is helpful because convolution filters often have both positive and negative values, while the 8-bit pixel channel data being filtered is all unsigned. As a result, the USDOT convolution paths added here do not have to do the "transform the pixel channel data to [-128, 128) and correct for it later" dance that we have to do with the SDOT paths. The USDOT instruction is optional from Armv8.2 to Armv8.5 but mandatory from Armv8.6 onwards. The availability of the USDOT instruction is indicated by the feature macro __ARM_FEATURE_MATMUL_INT8. The SDOT paths are retained for use on target CPUs that do not implement the USDOT instructions. Change-Id: If19f5872c3453458a8cfb7c7d2be82a2c0eab46a --- vpx_dsp/arm/vpx_convolve8_neon.c | 271 ++++++++++++++++++++++++++----- vpx_dsp/arm/vpx_convolve8_neon.h | 91 ++++++++--- 2 files changed, 299 insertions(+), 63 deletions(-) diff --git a/vpx_dsp/arm/vpx_convolve8_neon.c b/vpx_dsp/arm/vpx_convolve8_neon.c index dba436b1a0..81ceb518dd 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon.c +++ b/vpx_dsp/arm/vpx_convolve8_neon.c @@ -31,7 +31,9 @@ // instructions. This optimization is much faster in speed unit test, but slowed // down the whole decoder by 5%. -#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) +#if defined(__aarch64__) && \ + (defined(__ARM_FEATURE_DOTPROD) || defined(__ARM_FEATURE_MATMUL_INT8)) + DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, @@ -96,6 +98,175 @@ static INLINE void transpose_concat_8x4(int8x8_t *a0, int8x8_t *a1, *b1 = vqtbl2q_s8(samples, permute_tbl.val[1]); } +#if defined(__ARM_FEATURE_MATMUL_INT8) + +void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4])); + uint8x16_t s0, s1, s2, s3; + + assert(!((intptr_t)dst & 3)); + assert(!(dst_stride & 3)); + assert(x_step_q4 == 16); + + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + + src -= 3; + + if (w == 4) { + const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + do { + int32x4_t t0, t1, t2, t3; + int16x8_t t01, t23; + uint8x8_t d01, d23; + + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); + + t0 = convolve8_4_usdot(s0, filters, permute_tbl); + t1 = convolve8_4_usdot(s1, filters, permute_tbl); + t2 = convolve8_4_usdot(s2, filters, permute_tbl); + t3 = convolve8_4_usdot(s3, filters, permute_tbl); + t01 = vcombine_s16(vqmovn_s32(t0), vqmovn_s32(t1)); + t23 = vcombine_s16(vqmovn_s32(t2), vqmovn_s32(t3)); + d01 = vqrshrun_n_s16(t01, 7); + d23 = vqrshrun_n_s16(t23, 7); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 0); + } else { + const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + const uint8_t *s; + uint8_t *d; + int width; + uint8x8_t d0, d1, d2, d3; + + do { + width = w; + s = src; + d = dst; + do { + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + d0 = convolve8_8_usdot(s0, filters, permute_tbl); + d1 = convolve8_8_usdot(s1, filters, permute_tbl); + d2 = convolve8_8_usdot(s2, filters, permute_tbl); + d3 = convolve8_8_usdot(s3, filters, permute_tbl); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width > 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 0); + } +} + +void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4])); + uint8x16_t s0, s1, s2, s3; + + assert(!((intptr_t)dst & 3)); + assert(!(dst_stride & 3)); + assert(x_step_q4 == 16); + + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + + src -= 3; + + if (w == 4) { + const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + do { + int32x4_t t0, t1, t2, t3; + int16x8_t t01, t23; + uint8x8_t d01, d23, dd01, dd23; + dd01 = vdup_n_u8(0); + dd23 = vdup_n_u8(0); + + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); + + t0 = convolve8_4_usdot(s0, filters, permute_tbl); + t1 = convolve8_4_usdot(s1, filters, permute_tbl); + t2 = convolve8_4_usdot(s2, filters, permute_tbl); + t3 = convolve8_4_usdot(s3, filters, permute_tbl); + t01 = vcombine_s16(vqmovn_s32(t0), vqmovn_s32(t1)); + t23 = vcombine_s16(vqmovn_s32(t2), vqmovn_s32(t3)); + d01 = vqrshrun_n_s16(t01, 7); + d23 = vqrshrun_n_s16(t23, 7); + + dd01 = load_u8(dst + 0 * dst_stride, dst_stride); + dd23 = load_u8(dst + 2 * dst_stride, dst_stride); + + d01 = vrhadd_u8(d01, dd01); + d23 = vrhadd_u8(d23, dd23); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 0); + } else { + const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + const uint8_t *s; + uint8_t *d; + int width; + uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3; + + do { + width = w; + s = src; + d = dst; + do { + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + d0 = convolve8_8_usdot(s0, filters, permute_tbl); + d1 = convolve8_8_usdot(s1, filters, permute_tbl); + d2 = convolve8_8_usdot(s2, filters, permute_tbl); + d3 = convolve8_8_usdot(s3, filters, permute_tbl); + + load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); + + d0 = vrhadd_u8(d0, dd0); + d1 = vrhadd_u8(d1, dd1); + d2 = vrhadd_u8(d2, dd2); + d3 = vrhadd_u8(d3, dd3); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width > 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 0); + } +} + +#else // !defined(__ARM_FEATURE_MATMUL_INT8) + void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, @@ -126,10 +297,10 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); - t0 = convolve8_4_dot(s0, filters, correction, range_limit, permute_tbl); - t1 = convolve8_4_dot(s1, filters, correction, range_limit, permute_tbl); - t2 = convolve8_4_dot(s2, filters, correction, range_limit, permute_tbl); - t3 = convolve8_4_dot(s3, filters, correction, range_limit, permute_tbl); + t0 = convolve8_4_sdot(s0, filters, correction, range_limit, permute_tbl); + t1 = convolve8_4_sdot(s1, filters, correction, range_limit, permute_tbl); + t2 = convolve8_4_sdot(s2, filters, correction, range_limit, permute_tbl); + t3 = convolve8_4_sdot(s3, filters, correction, range_limit, permute_tbl); t01 = vcombine_s16(vqmovn_s32(t0), vqmovn_s32(t1)); t23 = vcombine_s16(vqmovn_s32(t2), vqmovn_s32(t3)); d01 = vqrshrun_n_s16(t01, 7); @@ -156,10 +327,14 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, do { load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); - d0 = convolve8_8_dot(s0, filters, correction, range_limit, permute_tbl); - d1 = convolve8_8_dot(s1, filters, correction, range_limit, permute_tbl); - d2 = convolve8_8_dot(s2, filters, correction, range_limit, permute_tbl); - d3 = convolve8_8_dot(s3, filters, correction, range_limit, permute_tbl); + d0 = + convolve8_8_sdot(s0, filters, correction, range_limit, permute_tbl); + d1 = + convolve8_8_sdot(s1, filters, correction, range_limit, permute_tbl); + d2 = + convolve8_8_sdot(s2, filters, correction, range_limit, permute_tbl); + d3 = + convolve8_8_sdot(s3, filters, correction, range_limit, permute_tbl); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); @@ -206,10 +381,10 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); - t0 = convolve8_4_dot(s0, filters, correction, range_limit, permute_tbl); - t1 = convolve8_4_dot(s1, filters, correction, range_limit, permute_tbl); - t2 = convolve8_4_dot(s2, filters, correction, range_limit, permute_tbl); - t3 = convolve8_4_dot(s3, filters, correction, range_limit, permute_tbl); + t0 = convolve8_4_sdot(s0, filters, correction, range_limit, permute_tbl); + t1 = convolve8_4_sdot(s1, filters, correction, range_limit, permute_tbl); + t2 = convolve8_4_sdot(s2, filters, correction, range_limit, permute_tbl); + t3 = convolve8_4_sdot(s3, filters, correction, range_limit, permute_tbl); t01 = vcombine_s16(vqmovn_s32(t0), vqmovn_s32(t1)); t23 = vcombine_s16(vqmovn_s32(t2), vqmovn_s32(t3)); d01 = vqrshrun_n_s16(t01, 7); @@ -242,10 +417,14 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, do { load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); - d0 = convolve8_8_dot(s0, filters, correction, range_limit, permute_tbl); - d1 = convolve8_8_dot(s1, filters, correction, range_limit, permute_tbl); - d2 = convolve8_8_dot(s2, filters, correction, range_limit, permute_tbl); - d3 = convolve8_8_dot(s3, filters, correction, range_limit, permute_tbl); + d0 = + convolve8_8_sdot(s0, filters, correction, range_limit, permute_tbl); + d1 = + convolve8_8_sdot(s1, filters, correction, range_limit, permute_tbl); + d2 = + convolve8_8_sdot(s2, filters, correction, range_limit, permute_tbl); + d3 = + convolve8_8_sdot(s3, filters, correction, range_limit, permute_tbl); load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); @@ -267,6 +446,8 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, } } +#endif // defined(__ARM_FEATURE_MATMUL_INT8) + void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, @@ -342,10 +523,10 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); - d0 = convolve8_4_dot_partial(s0123, s4567, correction, filters); - d1 = convolve8_4_dot_partial(s1234, s5678, correction, filters); - d2 = convolve8_4_dot_partial(s2345, s6789, correction, filters); - d3 = convolve8_4_dot_partial(s3456, s78910, correction, filters); + d0 = convolve8_4_sdot_partial(s0123, s4567, correction, filters); + d1 = convolve8_4_sdot_partial(s1234, s5678, correction, filters); + d2 = convolve8_4_sdot_partial(s2345, s6789, correction, filters); + d3 = convolve8_4_sdot_partial(s3456, s78910, correction, filters); d01 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d0), vqmovn_s32(d1)), 7); d23 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d2), vqmovn_s32(d3)), 7); @@ -437,14 +618,14 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); - d0 = convolve8_8_dot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi, - correction, filters); - d1 = convolve8_8_dot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi, - correction, filters); - d2 = convolve8_8_dot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi, - correction, filters); - d3 = convolve8_8_dot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi, - correction, filters); + d0 = convolve8_8_sdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi, + correction, filters); + d1 = convolve8_8_sdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi, + correction, filters); + d2 = convolve8_8_sdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi, + correction, filters); + d3 = convolve8_8_sdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi, + correction, filters); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); @@ -545,10 +726,10 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); - d0 = convolve8_4_dot_partial(s0123, s4567, correction, filters); - d1 = convolve8_4_dot_partial(s1234, s5678, correction, filters); - d2 = convolve8_4_dot_partial(s2345, s6789, correction, filters); - d3 = convolve8_4_dot_partial(s3456, s78910, correction, filters); + d0 = convolve8_4_sdot_partial(s0123, s4567, correction, filters); + d1 = convolve8_4_sdot_partial(s1234, s5678, correction, filters); + d2 = convolve8_4_sdot_partial(s2345, s6789, correction, filters); + d3 = convolve8_4_sdot_partial(s3456, s78910, correction, filters); d01 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d0), vqmovn_s32(d1)), 7); d23 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d2), vqmovn_s32(d3)), 7); @@ -646,14 +827,14 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); - d0 = convolve8_8_dot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi, - correction, filters); - d1 = convolve8_8_dot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi, - correction, filters); - d2 = convolve8_8_dot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi, - correction, filters); - d3 = convolve8_8_dot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi, - correction, filters); + d0 = convolve8_8_sdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi, + correction, filters); + d1 = convolve8_8_sdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi, + correction, filters); + d2 = convolve8_8_sdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi, + correction, filters); + d3 = convolve8_8_sdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi, + correction, filters); load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); @@ -686,7 +867,9 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, } } -#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)) +#else // !(defined(__aarch64__) && + // (defined(__ARM_FEATURE_DOTPROD) || + // defined(__ARM_FEATURE_MATMUL_INT8))) void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, @@ -1528,4 +1711,6 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, } } -#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) +#endif // #if defined(__aarch64__) && + // (defined(__ARM_FEATURE_DOTPROD) || + // defined(__ARM_FEATURE_MATMUL_INT8)) diff --git a/vpx_dsp/arm/vpx_convolve8_neon.h b/vpx_dsp/arm/vpx_convolve8_neon.h index 26a5fa688a..a62e4f461c 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon.h +++ b/vpx_dsp/arm/vpx_convolve8_neon.h @@ -18,10 +18,10 @@ #if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) -static INLINE int32x4_t convolve8_4_dot_partial(const int8x16_t samples_lo, - const int8x16_t samples_hi, - const int32x4_t correction, - const int8x8_t filters) { +static INLINE int32x4_t convolve8_4_sdot_partial(const int8x16_t samples_lo, + const int8x16_t samples_hi, + const int32x4_t correction, + const int8x8_t filters) { /* Sample range-clamping and permutation are performed by the caller. */ int32x4_t sum; @@ -33,11 +33,11 @@ static INLINE int32x4_t convolve8_4_dot_partial(const int8x16_t samples_lo, return sum; } -static INLINE int32x4_t convolve8_4_dot(uint8x16_t samples, - const int8x8_t filters, - const int32x4_t correction, - const uint8x16_t range_limit, - const uint8x16x2_t permute_tbl) { +static INLINE int32x4_t convolve8_4_sdot(uint8x16_t samples, + const int8x8_t filters, + const int32x4_t correction, + const uint8x16_t range_limit, + const uint8x16x2_t permute_tbl) { int8x16_t clamped_samples, permuted_samples[2]; int32x4_t sum; @@ -58,12 +58,12 @@ static INLINE int32x4_t convolve8_4_dot(uint8x16_t samples, return sum; } -static INLINE uint8x8_t convolve8_8_dot_partial(const int8x16_t samples0_lo, - const int8x16_t samples0_hi, - const int8x16_t samples1_lo, - const int8x16_t samples1_hi, - const int32x4_t correction, - const int8x8_t filters) { +static INLINE uint8x8_t convolve8_8_sdot_partial(const int8x16_t samples0_lo, + const int8x16_t samples0_hi, + const int8x16_t samples1_lo, + const int8x16_t samples1_hi, + const int32x4_t correction, + const int8x8_t filters) { /* Sample range-clamping and permutation are performed by the caller. */ int32x4_t sum0, sum1; int16x8_t sum; @@ -81,11 +81,11 @@ static INLINE uint8x8_t convolve8_8_dot_partial(const int8x16_t samples0_lo, return vqrshrun_n_s16(sum, 7); } -static INLINE uint8x8_t convolve8_8_dot(uint8x16_t samples, - const int8x8_t filters, - const int32x4_t correction, - const uint8x16_t range_limit, - const uint8x16x3_t permute_tbl) { +static INLINE uint8x8_t convolve8_8_sdot(uint8x16_t samples, + const int8x8_t filters, + const int32x4_t correction, + const uint8x16_t range_limit, + const uint8x16x3_t permute_tbl) { int8x16_t clamped_samples, permuted_samples[3]; int32x4_t sum0, sum1; int16x8_t sum; @@ -116,6 +116,57 @@ static INLINE uint8x8_t convolve8_8_dot(uint8x16_t samples, #endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) +#if defined(__aarch64__) && defined(__ARM_FEATURE_MATMUL_INT8) + +static INLINE int32x4_t convolve8_4_usdot(uint8x16_t samples, + const int8x8_t filters, + const uint8x16x2_t permute_tbl) { + uint8x16_t permuted_samples[2]; + int32x4_t sum; + + /* Permute samples ready for dot product. */ + /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */ + permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]); + /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */ + permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]); + + /* Accumulate dot product into 'correction' to account for range clamp. */ + sum = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0); + sum = vusdotq_lane_s32(sum, permuted_samples[1], filters, 1); + + /* Narrowing and packing is performed by the caller. */ + return sum; +} + +static INLINE uint8x8_t convolve8_8_usdot(uint8x16_t samples, + const int8x8_t filters, + const uint8x16x3_t permute_tbl) { + uint8x16_t permuted_samples[3]; + int32x4_t sum0, sum1; + int16x8_t sum; + + /* Permute samples ready for dot product. */ + /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */ + permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]); + /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */ + permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]); + /* { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */ + permuted_samples[2] = vqtbl1q_u8(samples, permute_tbl.val[2]); + + /* First 4 output values. */ + sum0 = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0); + sum0 = vusdotq_lane_s32(sum0, permuted_samples[1], filters, 1); + /* Second 4 output values. */ + sum1 = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[1], filters, 0); + sum1 = vusdotq_lane_s32(sum1, permuted_samples[2], filters, 1); + + /* Narrow and re-pack. */ + sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1)); + return vqrshrun_n_s16(sum, 7); +} + +#endif // defined(__aarch64__) && defined(__ARM_FEATURE_MATMUL_INT8) + static INLINE int16x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, From 5645938c36b9cd1fa4f7c97da0e8c0ef0330d45d Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Wed, 18 May 2022 16:58:50 +0100 Subject: [PATCH 0564/1000] Implement vertical convolutions using Neon USDOT instruction Add additional AArch64 paths for vpx_convolve8_vert_neon and vpx_convolve8_avg_vert_neon that use the Armv8.6-A USDOT (mixed-sign dot-product) instruction. The USDOT instruction takes an 8-bit unsigned operand vector and a signed 8-bit operand vector to produce a signed 32-bit result. This is helpful because convolution filters often have both positive and negative values, while the 8-bit pixel channel data being filtered is all unsigned. As a result, the USDOT convolution paths added here do not have to do the "transform the pixel channel data to [-128, 128) and correct for it later" dance that we have to do with the SDOT paths. The USDOT instruction is optional from Armv8.2 to Armv8.5 but mandatory from Armv8.6 onwards. The availability of the USDOT instruction is indicated by the feature macro __ARM_FEATURE_MATMUL_INT8. The SDOT paths are retained for use on target CPUs that do not implement the USDOT instructions. Change-Id: Ifbf467681dd53bb1d26e22359885e6edde3c5c72 --- vpx_dsp/arm/vpx_convolve8_neon.c | 548 ++++++++++++++++++++++++++----- vpx_dsp/arm/vpx_convolve8_neon.h | 34 ++ 2 files changed, 505 insertions(+), 77 deletions(-) diff --git a/vpx_dsp/arm/vpx_convolve8_neon.c b/vpx_dsp/arm/vpx_convolve8_neon.c index 81ceb518dd..b4cdd58c70 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon.c +++ b/vpx_dsp/arm/vpx_convolve8_neon.c @@ -54,50 +54,6 @@ DECLARE_ALIGNED(16, static const uint8_t, dot_prod_merge_block_tbl[48]) = { 3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30 }; -static INLINE void transpose_concat_4x4(int8x8_t *a0, int8x8_t *a1, - int8x8_t *a2, int8x8_t *a3, - int8x16_t *b, - const uint8x16_t permute_tbl) { - /* Transpose 8-bit elements and concatenate result rows as follows: - * a0: 00, 01, 02, 03, XX, XX, XX, XX - * a1: 10, 11, 12, 13, XX, XX, XX, XX - * a2: 20, 21, 22, 23, XX, XX, XX, XX - * a3: 30, 31, 32, 33, XX, XX, XX, XX - * - * b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 - * - * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it - * as an argument is preferable to loading it directly from memory as this - * inline helper is called many times from the same parent function. - */ - - int8x16x2_t samples = { { vcombine_s8(*a0, *a1), vcombine_s8(*a2, *a3) } }; - *b = vqtbl2q_s8(samples, permute_tbl); -} - -static INLINE void transpose_concat_8x4(int8x8_t *a0, int8x8_t *a1, - int8x8_t *a2, int8x8_t *a3, - int8x16_t *b0, int8x16_t *b1, - const uint8x16x2_t permute_tbl) { - /* Transpose 8-bit elements and concatenate result rows as follows: - * a0: 00, 01, 02, 03, 04, 05, 06, 07 - * a1: 10, 11, 12, 13, 14, 15, 16, 17 - * a2: 20, 21, 22, 23, 24, 25, 26, 27 - * a3: 30, 31, 32, 33, 34, 35, 36, 37 - * - * b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 - * b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37 - * - * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it - * as an argument is preferable to loading it directly from memory as this - * inline helper is called many times from the same parent function. - */ - - int8x16x2_t samples = { { vcombine_s8(*a0, *a1), vcombine_s8(*a2, *a3) } }; - *b0 = vqtbl2q_s8(samples, permute_tbl.val[0]); - *b1 = vqtbl2q_s8(samples, permute_tbl.val[1]); -} - #if defined(__ARM_FEATURE_MATMUL_INT8) void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, @@ -265,6 +221,401 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, } } +static INLINE void transpose_concat_4x4(uint8x8_t a0, uint8x8_t a1, + uint8x8_t a2, uint8x8_t a3, + uint8x16_t *b, + const uint8x16_t permute_tbl) { + /* Transpose 8-bit elements and concatenate result rows as follows: + * a0: 00, 01, 02, 03, XX, XX, XX, XX + * a1: 10, 11, 12, 13, XX, XX, XX, XX + * a2: 20, 21, 22, 23, XX, XX, XX, XX + * a3: 30, 31, 32, 33, XX, XX, XX, XX + * + * b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 + * + * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it + * as an argument is preferable to loading it directly from memory as this + * inline helper is called many times from the same parent function. + */ + + uint8x16x2_t samples = { { vcombine_u8(a0, a1), vcombine_u8(a2, a3) } }; + *b = vqtbl2q_u8(samples, permute_tbl); +} + +static INLINE void transpose_concat_8x4(uint8x8_t a0, uint8x8_t a1, + uint8x8_t a2, uint8x8_t a3, + uint8x16_t *b0, uint8x16_t *b1, + const uint8x16x2_t permute_tbl) { + /* Transpose 8-bit elements and concatenate result rows as follows: + * a0: 00, 01, 02, 03, 04, 05, 06, 07 + * a1: 10, 11, 12, 13, 14, 15, 16, 17 + * a2: 20, 21, 22, 23, 24, 25, 26, 27 + * a3: 30, 31, 32, 33, 34, 35, 36, 37 + * + * b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 + * b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37 + * + * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it + * as an argument is preferable to loading it directly from memory as this + * inline helper is called many times from the same parent function. + */ + + uint8x16x2_t samples = { { vcombine_u8(a0, a1), vcombine_u8(a2, a3) } }; + *b0 = vqtbl2q_u8(samples, permute_tbl.val[0]); + *b1 = vqtbl2q_u8(samples, permute_tbl.val[1]); +} + +void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4])); + const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); + uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + uint8x16x2_t samples_LUT; + + assert(!((intptr_t)dst & 3)); + assert(!(dst_stride & 3)); + assert(y_step_q4 == 16); + + (void)x0_q4; + (void)x_step_q4; + (void)y_step_q4; + + src -= 3 * src_stride; + + if (w == 4) { + const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); + uint8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910; + int32x4_t d0, d1, d2, d3; + uint8x8_t d01, d23; + + load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + src += 7 * src_stride; + + s7 = vdup_n_u8(0); + s8 = vdup_n_u8(0); + s9 = vdup_n_u8(0); + + /* This operation combines a conventional transpose and the sample permute + * (see horizontal case) required before computing the dot product. + */ + transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl); + transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl); + transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl); + transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl); + transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl); + transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl); + transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl); + + do { + load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10); + + transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl); + + /* Merge new data into block from previous iteration. */ + samples_LUT.val[0] = s3456; + samples_LUT.val[1] = s78910; + s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); + s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); + s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + + d0 = convolve8_4_usdot_partial(s0123, s4567, filters); + d1 = convolve8_4_usdot_partial(s1234, s5678, filters); + d2 = convolve8_4_usdot_partial(s2345, s6789, filters); + d3 = convolve8_4_usdot_partial(s3456, s78910, filters); + d01 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d0), vqmovn_s32(d1)), 7); + d23 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d2), vqmovn_s32(d3)), 7); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + /* Prepare block for next iteration - re-using as much as possible. */ + /* Shuffle everything up four rows. */ + s0123 = s4567; + s1234 = s5678; + s2345 = s6789; + s3456 = s78910; + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 0); + } else { + const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl); + uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, + s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, + s6789_hi, s78910_lo, s78910_hi; + uint8x8_t d0, d1, d2, d3; + const uint8_t *s; + uint8_t *d; + int height; + + do { + height = h; + s = src; + d = dst; + + load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; + + s7 = vdup_n_u8(0); + s8 = vdup_n_u8(0); + s9 = vdup_n_u8(0); + + /* This operation combines a conventional transpose and the sample permute + * (see horizontal case) required before computing the dot product. + */ + transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi, + tran_concat_tbl); + transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi, + tran_concat_tbl); + transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi, + tran_concat_tbl); + transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi, + tran_concat_tbl); + transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi, + tran_concat_tbl); + transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi, + tran_concat_tbl); + transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi, + tran_concat_tbl); + + do { + load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10); + + transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi, + tran_concat_tbl); + + /* Merge new data into block from previous iteration. */ + samples_LUT.val[0] = s3456_lo; + samples_LUT.val[1] = s78910_lo; + s4567_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); + s5678_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); + s6789_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + + samples_LUT.val[0] = s3456_hi; + samples_LUT.val[1] = s78910_hi; + s4567_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); + s5678_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); + s6789_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + + d0 = convolve8_8_usdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi, + filters); + d1 = convolve8_8_usdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi, + filters); + d2 = convolve8_8_usdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi, + filters); + d3 = convolve8_8_usdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi, + filters); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + /* Prepare block for next iteration - re-using as much as possible. */ + /* Shuffle everything up four rows. */ + s0123_lo = s4567_lo; + s0123_hi = s4567_hi; + s1234_lo = s5678_lo; + s1234_hi = s5678_hi; + s2345_lo = s6789_lo; + s2345_hi = s6789_hi; + s3456_lo = s78910_lo; + s3456_hi = s78910_hi; + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height > 0); + src += 8; + dst += 8; + w -= 8; + } while (w > 0); + } +} + +void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4])); + const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); + uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + uint8x16x2_t samples_LUT; + + assert(!((intptr_t)dst & 3)); + assert(!(dst_stride & 3)); + assert(y_step_q4 == 16); + + (void)x0_q4; + (void)x_step_q4; + (void)y_step_q4; + + src -= 3 * src_stride; + + if (w == 4) { + const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); + uint8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910; + int32x4_t d0, d1, d2, d3; + uint8x8_t d01, d23, dd01, dd23; + + load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + src += 7 * src_stride; + + s7 = vdup_n_u8(0); + s8 = vdup_n_u8(0); + s9 = vdup_n_u8(0); + + /* This operation combines a conventional transpose and the sample permute + * (see horizontal case) required before computing the dot product. + */ + transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl); + transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl); + transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl); + transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl); + transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl); + transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl); + transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl); + + do { + load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10); + + transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl); + + /* Merge new data into block from previous iteration. */ + samples_LUT.val[0] = s3456; + samples_LUT.val[1] = s78910; + s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); + s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); + s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + + d0 = convolve8_4_usdot_partial(s0123, s4567, filters); + d1 = convolve8_4_usdot_partial(s1234, s5678, filters); + d2 = convolve8_4_usdot_partial(s2345, s6789, filters); + d3 = convolve8_4_usdot_partial(s3456, s78910, filters); + d01 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d0), vqmovn_s32(d1)), 7); + d23 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d2), vqmovn_s32(d3)), 7); + + dd01 = load_u8(dst + 0 * dst_stride, dst_stride); + dd23 = load_u8(dst + 2 * dst_stride, dst_stride); + + d01 = vrhadd_u8(d01, dd01); + d23 = vrhadd_u8(d23, dd23); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + /* Prepare block for next iteration - re-using as much as possible. */ + /* Shuffle everything up four rows. */ + s0123 = s4567; + s1234 = s5678; + s2345 = s6789; + s3456 = s78910; + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 0); + } else { + const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl); + uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, + s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, + s6789_hi, s78910_lo, s78910_hi; + uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3; + const uint8_t *s; + uint8_t *d; + int height; + + do { + height = h; + s = src; + d = dst; + + load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; + + s7 = vdup_n_u8(0); + s8 = vdup_n_u8(0); + s9 = vdup_n_u8(0); + + /* This operation combines a conventional transpose and the sample permute + * (see horizontal case) required before computing the dot product. + */ + transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi, + tran_concat_tbl); + transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi, + tran_concat_tbl); + transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi, + tran_concat_tbl); + transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi, + tran_concat_tbl); + transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi, + tran_concat_tbl); + transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi, + tran_concat_tbl); + transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi, + tran_concat_tbl); + + do { + load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10); + + transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi, + tran_concat_tbl); + + /* Merge new data into block from previous iteration. */ + samples_LUT.val[0] = s3456_lo; + samples_LUT.val[1] = s78910_lo; + s4567_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); + s5678_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); + s6789_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + + samples_LUT.val[0] = s3456_hi; + samples_LUT.val[1] = s78910_hi; + s4567_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); + s5678_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); + s6789_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + + d0 = convolve8_8_usdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi, + filters); + d1 = convolve8_8_usdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi, + filters); + d2 = convolve8_8_usdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi, + filters); + d3 = convolve8_8_usdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi, + filters); + + load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); + + d0 = vrhadd_u8(d0, dd0); + d1 = vrhadd_u8(d1, dd1); + d2 = vrhadd_u8(d2, dd2); + d3 = vrhadd_u8(d3, dd3); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + /* Prepare block for next iteration - re-using as much as possible. */ + /* Shuffle everything up four rows. */ + s0123_lo = s4567_lo; + s0123_hi = s4567_hi; + s1234_lo = s5678_lo; + s1234_hi = s5678_hi; + s2345_lo = s6789_lo; + s2345_hi = s6789_hi; + s3456_lo = s78910_lo; + s3456_hi = s78910_hi; + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height > 0); + src += 8; + dst += 8; + w -= 8; + } while (w > 0); + } +} + #else // !defined(__ARM_FEATURE_MATMUL_INT8) void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, @@ -446,7 +797,48 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, } } -#endif // defined(__ARM_FEATURE_MATMUL_INT8) +static INLINE void transpose_concat_4x4(int8x8_t a0, int8x8_t a1, int8x8_t a2, + int8x8_t a3, int8x16_t *b, + const uint8x16_t permute_tbl) { + /* Transpose 8-bit elements and concatenate result rows as follows: + * a0: 00, 01, 02, 03, XX, XX, XX, XX + * a1: 10, 11, 12, 13, XX, XX, XX, XX + * a2: 20, 21, 22, 23, XX, XX, XX, XX + * a3: 30, 31, 32, 33, XX, XX, XX, XX + * + * b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 + * + * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it + * as an argument is preferable to loading it directly from memory as this + * inline helper is called many times from the same parent function. + */ + + int8x16x2_t samples = { { vcombine_s8(a0, a1), vcombine_s8(a2, a3) } }; + *b = vqtbl2q_s8(samples, permute_tbl); +} + +static INLINE void transpose_concat_8x4(int8x8_t a0, int8x8_t a1, int8x8_t a2, + int8x8_t a3, int8x16_t *b0, + int8x16_t *b1, + const uint8x16x2_t permute_tbl) { + /* Transpose 8-bit elements and concatenate result rows as follows: + * a0: 00, 01, 02, 03, 04, 05, 06, 07 + * a1: 10, 11, 12, 13, 14, 15, 16, 17 + * a2: 20, 21, 22, 23, 24, 25, 26, 27 + * a3: 30, 31, 32, 33, 34, 35, 36, 37 + * + * b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 + * b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37 + * + * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it + * as an argument is preferable to loading it directly from memory as this + * inline helper is called many times from the same parent function. + */ + + int8x16x2_t samples = { { vcombine_s8(a0, a1), vcombine_s8(a2, a3) } }; + *b0 = vqtbl2q_s8(samples, permute_tbl.val[0]); + *b1 = vqtbl2q_s8(samples, permute_tbl.val[1]); +} void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, @@ -496,13 +888,13 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, /* This operation combines a conventional transpose and the sample permute * (see horizontal case) required before computing the dot product. */ - transpose_concat_4x4(&s0, &s1, &s2, &s3, &s0123, tran_concat_tbl); - transpose_concat_4x4(&s1, &s2, &s3, &s4, &s1234, tran_concat_tbl); - transpose_concat_4x4(&s2, &s3, &s4, &s5, &s2345, tran_concat_tbl); - transpose_concat_4x4(&s3, &s4, &s5, &s6, &s3456, tran_concat_tbl); - transpose_concat_4x4(&s4, &s5, &s6, &s7, &s4567, tran_concat_tbl); - transpose_concat_4x4(&s5, &s6, &s7, &s8, &s5678, tran_concat_tbl); - transpose_concat_4x4(&s6, &s7, &s8, &s9, &s6789, tran_concat_tbl); + transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl); + transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl); + transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl); + transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl); + transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl); + transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl); + transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl); do { uint8x8_t t7, t8, t9, t10; @@ -514,7 +906,7 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); - transpose_concat_4x4(&s7, &s8, &s9, &s10, &s78910, tran_concat_tbl); + transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl); /* Merge new data into block from previous iteration. */ samples_LUT.val[0] = s3456; @@ -577,19 +969,19 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, /* This operation combines a conventional transpose and the sample permute * (see horizontal case) required before computing the dot product. */ - transpose_concat_8x4(&s0, &s1, &s2, &s3, &s0123_lo, &s0123_hi, + transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi, tran_concat_tbl); - transpose_concat_8x4(&s1, &s2, &s3, &s4, &s1234_lo, &s1234_hi, + transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi, tran_concat_tbl); - transpose_concat_8x4(&s2, &s3, &s4, &s5, &s2345_lo, &s2345_hi, + transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi, tran_concat_tbl); - transpose_concat_8x4(&s3, &s4, &s5, &s6, &s3456_lo, &s3456_hi, + transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi, tran_concat_tbl); - transpose_concat_8x4(&s4, &s5, &s6, &s7, &s4567_lo, &s4567_hi, + transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi, tran_concat_tbl); - transpose_concat_8x4(&s5, &s6, &s7, &s8, &s5678_lo, &s5678_hi, + transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi, tran_concat_tbl); - transpose_concat_8x4(&s6, &s7, &s8, &s9, &s6789_lo, &s6789_hi, + transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi, tran_concat_tbl); do { @@ -602,7 +994,7 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); - transpose_concat_8x4(&s7, &s8, &s9, &s10, &s78910_lo, &s78910_hi, + transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi, tran_concat_tbl); /* Merge new data into block from previous iteration. */ @@ -699,13 +1091,13 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, /* This operation combines a conventional transpose and the sample permute * (see horizontal case) required before computing the dot product. */ - transpose_concat_4x4(&s0, &s1, &s2, &s3, &s0123, tran_concat_tbl); - transpose_concat_4x4(&s1, &s2, &s3, &s4, &s1234, tran_concat_tbl); - transpose_concat_4x4(&s2, &s3, &s4, &s5, &s2345, tran_concat_tbl); - transpose_concat_4x4(&s3, &s4, &s5, &s6, &s3456, tran_concat_tbl); - transpose_concat_4x4(&s4, &s5, &s6, &s7, &s4567, tran_concat_tbl); - transpose_concat_4x4(&s5, &s6, &s7, &s8, &s5678, tran_concat_tbl); - transpose_concat_4x4(&s6, &s7, &s8, &s9, &s6789, tran_concat_tbl); + transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl); + transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl); + transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl); + transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl); + transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl); + transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl); + transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl); do { uint8x8_t t7, t8, t9, t10; @@ -717,7 +1109,7 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); - transpose_concat_4x4(&s7, &s8, &s9, &s10, &s78910, tran_concat_tbl); + transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl); /* Merge new data into block from previous iteration. */ samples_LUT.val[0] = s3456; @@ -786,19 +1178,19 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, /* This operation combines a conventional transpose and the sample permute * (see horizontal case) required before computing the dot product. */ - transpose_concat_8x4(&s0, &s1, &s2, &s3, &s0123_lo, &s0123_hi, + transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi, tran_concat_tbl); - transpose_concat_8x4(&s1, &s2, &s3, &s4, &s1234_lo, &s1234_hi, + transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi, tran_concat_tbl); - transpose_concat_8x4(&s2, &s3, &s4, &s5, &s2345_lo, &s2345_hi, + transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi, tran_concat_tbl); - transpose_concat_8x4(&s3, &s4, &s5, &s6, &s3456_lo, &s3456_hi, + transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi, tran_concat_tbl); - transpose_concat_8x4(&s4, &s5, &s6, &s7, &s4567_lo, &s4567_hi, + transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi, tran_concat_tbl); - transpose_concat_8x4(&s5, &s6, &s7, &s8, &s5678_lo, &s5678_hi, + transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi, tran_concat_tbl); - transpose_concat_8x4(&s6, &s7, &s8, &s9, &s6789_lo, &s6789_hi, + transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi, tran_concat_tbl); do { @@ -811,7 +1203,7 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); - transpose_concat_8x4(&s7, &s8, &s9, &s10, &s78910_lo, &s78910_hi, + transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi, tran_concat_tbl); /* Merge new data into block from previous iteration. */ @@ -867,6 +1259,8 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, } } +#endif // defined(__ARM_FEATURE_MATMUL_INT8) + #else // !(defined(__aarch64__) && // (defined(__ARM_FEATURE_DOTPROD) || // defined(__ARM_FEATURE_MATMUL_INT8))) diff --git a/vpx_dsp/arm/vpx_convolve8_neon.h b/vpx_dsp/arm/vpx_convolve8_neon.h index a62e4f461c..ed7f180538 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon.h +++ b/vpx_dsp/arm/vpx_convolve8_neon.h @@ -118,6 +118,19 @@ static INLINE uint8x8_t convolve8_8_sdot(uint8x16_t samples, #if defined(__aarch64__) && defined(__ARM_FEATURE_MATMUL_INT8) +static INLINE int32x4_t convolve8_4_usdot_partial(const uint8x16_t samples_lo, + const uint8x16_t samples_hi, + const int8x8_t filters) { + /* Sample permutation is performed by the caller. */ + int32x4_t sum; + + sum = vusdotq_lane_s32(vdupq_n_s32(0), samples_lo, filters, 0); + sum = vusdotq_lane_s32(sum, samples_hi, filters, 1); + + /* Narrowing and packing is performed by the caller. */ + return sum; +} + static INLINE int32x4_t convolve8_4_usdot(uint8x16_t samples, const int8x8_t filters, const uint8x16x2_t permute_tbl) { @@ -138,6 +151,27 @@ static INLINE int32x4_t convolve8_4_usdot(uint8x16_t samples, return sum; } +static INLINE uint8x8_t convolve8_8_usdot_partial(const uint8x16_t samples0_lo, + const uint8x16_t samples0_hi, + const uint8x16_t samples1_lo, + const uint8x16_t samples1_hi, + const int8x8_t filters) { + /* Sample permutation is performed by the caller. */ + int32x4_t sum0, sum1; + int16x8_t sum; + + /* First 4 output values. */ + sum0 = vusdotq_lane_s32(vdupq_n_s32(0), samples0_lo, filters, 0); + sum0 = vusdotq_lane_s32(sum0, samples0_hi, filters, 1); + /* Second 4 output values. */ + sum1 = vusdotq_lane_s32(vdupq_n_s32(0), samples1_lo, filters, 0); + sum1 = vusdotq_lane_s32(sum1, samples1_hi, filters, 1); + + /* Narrow and re-pack. */ + sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1)); + return vqrshrun_n_s16(sum, 7); +} + static INLINE uint8x8_t convolve8_8_usdot(uint8x16_t samples, const int8x8_t filters, const uint8x16x3_t permute_tbl) { From 32878bb1f3db472e642eb0c98d62b37f57b12f68 Mon Sep 17 00:00:00 2001 From: Scott LaVarnway Date: Thu, 12 Jan 2023 11:03:28 -0800 Subject: [PATCH 0565/1000] variance_test.cc: Enable VpxHBDMseTest for C and SSE2. Change-Id: I66c0db6c605876d6757684fd715614881ca261e7 --- test/variance_test.cc | 72 +++++++++++++++++++++++-------------------- 1 file changed, 39 insertions(+), 33 deletions(-) diff --git a/test/variance_test.cc b/test/variance_test.cc index 8aed5d2ed9..ac6d226a5f 100644 --- a/test/variance_test.cc +++ b/test/variance_test.cc @@ -499,14 +499,21 @@ template void MainTestClass::RefTestMse() { for (int i = 0; i < 10; ++i) { for (int j = 0; j < block_size(); ++j) { - src_[j] = rnd_.Rand8(); - ref_[j] = rnd_.Rand8(); + if (!use_high_bit_depth()) { + src_[j] = rnd_.Rand8(); + ref_[j] = rnd_.Rand8(); +#if CONFIG_VP9_HIGHBITDEPTH + } else { + CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask(); + CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask(); +#endif // CONFIG_VP9_HIGHBITDEPTH + } } unsigned int sse1, sse2; const int stride = width(); ASM_REGISTER_STATE_CHECK(params_.func(src_, stride, ref_, stride, &sse1)); variance_ref(src_, ref_, params_.log2width, params_.log2height, stride, - stride, &sse2, false, VPX_BITS_8); + stride, &sse2, use_high_bit_depth(), params_.bit_depth); EXPECT_EQ(sse1, sse2); } } @@ -530,8 +537,15 @@ void MainTestClass::RefTestSse() { template void MainTestClass::MaxTestMse() { - memset(src_, 255, block_size()); - memset(ref_, 0, block_size()); + if (!use_high_bit_depth()) { + memset(src_, 255, block_size()); + memset(ref_, 0, block_size()); +#if CONFIG_VP9_HIGHBITDEPTH + } else { + vpx_memset16(CONVERT_TO_SHORTPTR(src_), 255 << byte_shift(), block_size()); + vpx_memset16(CONVERT_TO_SHORTPTR(ref_), 0, block_size()); +#endif // CONFIG_VP9_HIGHBITDEPTH + } unsigned int sse; ASM_REGISTER_STATE_CHECK(params_.func(src_, width(), ref_, width(), &sse)); const unsigned int expected = block_size() * 255 * 255; @@ -854,25 +868,24 @@ TEST_P(VpxHBDSubpelVarianceTest, Ref) { RefTest(); } TEST_P(VpxHBDSubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); } TEST_P(VpxHBDSubpelAvgVarianceTest, Ref) { RefTest(); } -/* TODO(debargha): This test does not support the highbd version typedef MainTestClass VpxHBDMseTest; TEST_P(VpxHBDMseTest, RefMse) { RefTestMse(); } TEST_P(VpxHBDMseTest, MaxMse) { MaxTestMse(); } INSTANTIATE_TEST_SUITE_P( C, VpxHBDMseTest, - ::testing::Values(MseParams(4, 4, &vpx_highbd_12_mse16x16_c), - MseParams(4, 4, &vpx_highbd_12_mse16x8_c), - MseParams(4, 4, &vpx_highbd_12_mse8x16_c), - MseParams(4, 4, &vpx_highbd_12_mse8x8_c), - MseParams(4, 4, &vpx_highbd_10_mse16x16_c), - MseParams(4, 4, &vpx_highbd_10_mse16x8_c), - MseParams(4, 4, &vpx_highbd_10_mse8x16_c), - MseParams(4, 4, &vpx_highbd_10_mse8x8_c), - MseParams(4, 4, &vpx_highbd_8_mse16x16_c), - MseParams(4, 4, &vpx_highbd_8_mse16x8_c), - MseParams(4, 4, &vpx_highbd_8_mse8x16_c), - MseParams(4, 4, &vpx_highbd_8_mse8x8_c))); -*/ + ::testing::Values(MseParams(4, 4, &vpx_highbd_12_mse16x16_c, VPX_BITS_12), + MseParams(4, 3, &vpx_highbd_12_mse16x8_c, VPX_BITS_12), + MseParams(3, 4, &vpx_highbd_12_mse8x16_c, VPX_BITS_12), + MseParams(3, 3, &vpx_highbd_12_mse8x8_c, VPX_BITS_12), + MseParams(4, 4, &vpx_highbd_10_mse16x16_c, VPX_BITS_10), + MseParams(4, 3, &vpx_highbd_10_mse16x8_c, VPX_BITS_10), + MseParams(3, 4, &vpx_highbd_10_mse8x16_c, VPX_BITS_10), + MseParams(3, 3, &vpx_highbd_10_mse8x8_c, VPX_BITS_10), + MseParams(4, 4, &vpx_highbd_8_mse16x16_c, VPX_BITS_8), + MseParams(4, 3, &vpx_highbd_8_mse16x8_c, VPX_BITS_8), + MseParams(3, 4, &vpx_highbd_8_mse8x16_c, VPX_BITS_8), + MseParams(3, 3, &vpx_highbd_8_mse8x8_c, VPX_BITS_8))); + GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(VpxHBDMseTest); INSTANTIATE_TEST_SUITE_P( @@ -1138,22 +1151,15 @@ INSTANTIATE_TEST_SUITE_P( SubpelAvgVarianceParams(2, 2, &vpx_sub_pixel_avg_variance4x4_sse2, 0))); #if CONFIG_VP9_HIGHBITDEPTH -/* TODO(debargha): This test does not support the highbd version INSTANTIATE_TEST_SUITE_P( SSE2, VpxHBDMseTest, - ::testing::Values(MseParams(4, 4, &vpx_highbd_12_mse16x16_sse2), - MseParams(4, 3, &vpx_highbd_12_mse16x8_sse2), - MseParams(3, 4, &vpx_highbd_12_mse8x16_sse2), - MseParams(3, 3, &vpx_highbd_12_mse8x8_sse2), - MseParams(4, 4, &vpx_highbd_10_mse16x16_sse2), - MseParams(4, 3, &vpx_highbd_10_mse16x8_sse2), - MseParams(3, 4, &vpx_highbd_10_mse8x16_sse2), - MseParams(3, 3, &vpx_highbd_10_mse8x8_sse2), - MseParams(4, 4, &vpx_highbd_8_mse16x16_sse2), - MseParams(4, 3, &vpx_highbd_8_mse16x8_sse2), - MseParams(3, 4, &vpx_highbd_8_mse8x16_sse2), - MseParams(3, 3, &vpx_highbd_8_mse8x8_sse2))); -*/ + ::testing::Values( + MseParams(4, 4, &vpx_highbd_12_mse16x16_sse2, VPX_BITS_12), + MseParams(3, 3, &vpx_highbd_12_mse8x8_sse2, VPX_BITS_12), + MseParams(4, 4, &vpx_highbd_10_mse16x16_sse2, VPX_BITS_10), + MseParams(3, 3, &vpx_highbd_10_mse8x8_sse2, VPX_BITS_10), + MseParams(4, 4, &vpx_highbd_8_mse16x16_sse2, VPX_BITS_8), + MseParams(3, 3, &vpx_highbd_8_mse8x8_sse2, VPX_BITS_8))); INSTANTIATE_TEST_SUITE_P( SSE2, VpxHBDVarianceTest, From 59d4a686166e1017654fe47178371d7101528baa Mon Sep 17 00:00:00 2001 From: Scott LaVarnway Date: Fri, 13 Jan 2023 07:30:07 -0800 Subject: [PATCH 0566/1000] variance_test.cc: Enable HBDMse speed test. Change-Id: If0226307a6efd704f8a35cb986f570304d698b95 --- test/variance_test.cc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/test/variance_test.cc b/test/variance_test.cc index ac6d226a5f..a6c8ef0480 100644 --- a/test/variance_test.cc +++ b/test/variance_test.cc @@ -488,8 +488,8 @@ void MainTestClass::SpeedTest() { } vpx_usec_timer_mark(&timer); const int elapsed_time = static_cast(vpx_usec_timer_elapsed(&timer)); - printf("Variance %dx%d time: %5d ms\n", width(), height(), - elapsed_time / 1000); + printf("Variance %dx%d %dbpp time: %5d ms\n", width(), height(), + params_.bit_depth, elapsed_time / 1000); } //////////////////////////////////////////////////////////////////////////////// @@ -871,6 +871,7 @@ TEST_P(VpxHBDSubpelAvgVarianceTest, Ref) { RefTest(); } typedef MainTestClass VpxHBDMseTest; TEST_P(VpxHBDMseTest, RefMse) { RefTestMse(); } TEST_P(VpxHBDMseTest, MaxMse) { MaxTestMse(); } +TEST_P(VpxHBDMseTest, DISABLED_Speed) { SpeedTest(); } INSTANTIATE_TEST_SUITE_P( C, VpxHBDMseTest, ::testing::Values(MseParams(4, 4, &vpx_highbd_12_mse16x16_c, VPX_BITS_12), From 71d01660cc40306c2c7c80c8ed510e520a0c4b93 Mon Sep 17 00:00:00 2001 From: Marco Paniconi Date: Fri, 13 Jan 2023 19:46:10 -0800 Subject: [PATCH 0567/1000] Fix to segfault for external resize test in vp9 Failure occurs for 1 pass non-realtime mode at speed 0. Due to speed feautre rd_ml_partition.var_pruning, which doesn't check for scaled reference in simple_motion_search(). Bug: webm:1768 Change-Id: Iddcb56033bac042faebb5196eed788317590b23f --- test/resize_test.cc | 5 +---- vp9/encoder/vp9_encodeframe.c | 8 +++++++- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/test/resize_test.cc b/test/resize_test.cc index e122a74742..715bb9d70f 100644 --- a/test/resize_test.cc +++ b/test/resize_test.cc @@ -777,10 +777,7 @@ TEST_P(ResizeCspTest, TestResizeCspWorks) { } VP8_INSTANTIATE_TEST_SUITE(ResizeTest, ONE_PASS_TEST_MODES); -// TODO(https://crbug.com/webm/1768): VP9 should use ONE_PASS_TEST_MODES for -// the ResizeTest instantiation after segfault is fixed. -VP9_INSTANTIATE_TEST_SUITE(ResizeTest, - ::testing::Values(::libvpx_test::kRealTime)); +VP9_INSTANTIATE_TEST_SUITE(ResizeTest, ONE_PASS_TEST_MODES); VP9_INSTANTIATE_TEST_SUITE(ResizeInternalTest, ::testing::Values(::libvpx_test::kOnePassBest)); VP9_INSTANTIATE_TEST_SUITE(ResizeRealtimeTest, diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index a1ee9c6784..1483ac069d 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -3413,7 +3413,8 @@ static void simple_motion_search(const VP9_COMP *const cpi, MACROBLOCK *const x, const VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; MODE_INFO *const mi = xd->mi[0]; - const YV12_BUFFER_CONFIG *const yv12 = get_ref_frame_buffer(cpi, ref); + YV12_BUFFER_CONFIG *yv12; + YV12_BUFFER_CONFIG *scaled_ref_frame = vp9_get_scaled_ref_frame(cpi, ref); const int step_param = 1; const MvLimits tmp_mv_limits = x->mv_limits; const SEARCH_METHODS search_method = NSTEP; @@ -3422,6 +3423,11 @@ static void simple_motion_search(const VP9_COMP *const cpi, MACROBLOCK *const x, MV best_mv = { 0, 0 }; int cost_list[5]; + if (scaled_ref_frame) + yv12 = scaled_ref_frame; + else + yv12 = get_ref_frame_buffer(cpi, ref); + assert(yv12 != NULL); if (!yv12) return; vp9_setup_pre_planes(xd, 0, yv12, mi_row, mi_col, From 0ce866562fb9c70d5825a6279f3aa3a10f7a9289 Mon Sep 17 00:00:00 2001 From: Salome Thirot Date: Mon, 16 Jan 2023 16:44:04 +0000 Subject: [PATCH 0568/1000] Refactor Neon implementation of variance functions Refactor and optimize the Neon implementation of variance functions - effectively backporting these libaom changes[1,2]. After this change, the only differences between the code in libvpx and libaom are due to libvpx being compiled with ISO C90, which forbids mixing declarations and code [-Wdeclaration-after-statement]. [1] https://aomedia-review.googlesource.com/c/aom/+/162241 [2] https://aomedia-review.googlesource.com/c/aom/+/162262 Change-Id: Ia4e8fff4d53297511d1a1e43bca8053bf811e551 --- vpx_dsp/arm/variance_neon.c | 538 ++++++++++++++++++------------------ 1 file changed, 275 insertions(+), 263 deletions(-) diff --git a/vpx_dsp/arm/variance_neon.c b/vpx_dsp/arm/variance_neon.c index f9969ed5a4..3ccc4e807b 100644 --- a/vpx_dsp/arm/variance_neon.c +++ b/vpx_dsp/arm/variance_neon.c @@ -22,236 +22,310 @@ #if defined(__ARM_FEATURE_DOTPROD) // Process a block of width 4 four rows at a time. -static void variance_neon_w4x4(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, int h, - uint32_t *sse, int *sum) { - int i; - uint32x4_t sum_a = vdupq_n_u32(0); - uint32x4_t sum_b = vdupq_n_u32(0); +static INLINE void variance_4xh_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h, uint32_t *sse, int *sum) { + uint32x4_t src_sum = vdupq_n_u32(0); + uint32x4_t ref_sum = vdupq_n_u32(0); uint32x4_t sse_u32 = vdupq_n_u32(0); - for (i = 0; i < h; i += 4) { - const uint8x16_t a = load_unaligned_u8q(src_ptr, src_stride); - const uint8x16_t b = load_unaligned_u8q(ref_ptr, ref_stride); + int i = h; + do { + const uint8x16_t s = load_unaligned_u8q(src_ptr, src_stride); + const uint8x16_t r = load_unaligned_u8q(ref_ptr, ref_stride); - const uint8x16_t abs_diff = vabdq_u8(a, b); + const uint8x16_t abs_diff = vabdq_u8(s, r); sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff); - sum_a = vdotq_u32(sum_a, a, vdupq_n_u8(1)); - sum_b = vdotq_u32(sum_b, b, vdupq_n_u8(1)); + src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1)); + ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1)); src_ptr += 4 * src_stride; ref_ptr += 4 * ref_stride; - } + i -= 4; + } while (i != 0); - *sum = horizontal_add_int32x4(vreinterpretq_s32_u32(vsubq_u32(sum_a, sum_b))); + *sum = horizontal_add_int32x4( + vreinterpretq_s32_u32(vsubq_u32(src_sum, ref_sum))); *sse = horizontal_add_uint32x4(sse_u32); } -// Process a block of any size where the width is divisible by 16. -static void variance_neon_w16(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, int w, - int h, uint32_t *sse, int *sum) { - int i, j; - uint32x4_t sum_a = vdupq_n_u32(0); - uint32x4_t sum_b = vdupq_n_u32(0); +// Process a block of width 8 two rows at a time. +static INLINE void variance_8xh_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h, uint32_t *sse, int *sum) { + uint32x4_t src_sum = vdupq_n_u32(0); + uint32x4_t ref_sum = vdupq_n_u32(0); uint32x4_t sse_u32 = vdupq_n_u32(0); - for (i = 0; i < h; ++i) { - for (j = 0; j < w; j += 16) { - const uint8x16_t a = vld1q_u8(src_ptr + j); - const uint8x16_t b = vld1q_u8(ref_ptr + j); + int i = h; + do { + const uint8x16_t s = + vcombine_u8(vld1_u8(src_ptr), vld1_u8(src_ptr + src_stride)); + const uint8x16_t r = + vcombine_u8(vld1_u8(ref_ptr), vld1_u8(ref_ptr + ref_stride)); - const uint8x16_t abs_diff = vabdq_u8(a, b); - sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff); + const uint8x16_t abs_diff = vabdq_u8(s, r); + sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff); + + src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1)); + ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1)); + + src_ptr += 2 * src_stride; + ref_ptr += 2 * ref_stride; + i -= 2; + } while (i != 0); + + *sum = horizontal_add_int32x4( + vreinterpretq_s32_u32(vsubq_u32(src_sum, ref_sum))); + *sse = horizontal_add_uint32x4(sse_u32); +} + +// Process a block of width 16 one row at a time. +static INLINE void variance_16xh_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h, uint32_t *sse, int *sum) { + uint32x4_t src_sum = vdupq_n_u32(0); + uint32x4_t ref_sum = vdupq_n_u32(0); + uint32x4_t sse_u32 = vdupq_n_u32(0); + + int i = h; + do { + const uint8x16_t s = vld1q_u8(src_ptr); + const uint8x16_t r = vld1q_u8(ref_ptr); + + const uint8x16_t abs_diff = vabdq_u8(s, r); + sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff); + + src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1)); + ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1)); - sum_a = vdotq_u32(sum_a, a, vdupq_n_u8(1)); - sum_b = vdotq_u32(sum_b, b, vdupq_n_u8(1)); - } src_ptr += src_stride; ref_ptr += ref_stride; - } + } while (--i != 0); - *sum = horizontal_add_int32x4(vreinterpretq_s32_u32(vsubq_u32(sum_a, sum_b))); + *sum = horizontal_add_int32x4( + vreinterpretq_s32_u32(vsubq_u32(src_sum, ref_sum))); *sse = horizontal_add_uint32x4(sse_u32); } -// Process a block of width 8 two rows at a time. -static void variance_neon_w8x2(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, int h, - uint32_t *sse, int *sum) { - int i = 0; - uint32x2_t sum_a = vdup_n_u32(0); - uint32x2_t sum_b = vdup_n_u32(0); - uint32x2_t sse_lo_u32 = vdup_n_u32(0); - uint32x2_t sse_hi_u32 = vdup_n_u32(0); +// Process a block of any size where the width is divisible by 16. +static INLINE void variance_large_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int w, int h, uint32_t *sse, int *sum) { + uint32x4_t src_sum = vdupq_n_u32(0); + uint32x4_t ref_sum = vdupq_n_u32(0); + uint32x4_t sse_u32 = vdupq_n_u32(0); + int i = h; do { - const uint8x8_t a_0 = vld1_u8(src_ptr); - const uint8x8_t a_1 = vld1_u8(src_ptr + src_stride); - const uint8x8_t b_0 = vld1_u8(ref_ptr); - const uint8x8_t b_1 = vld1_u8(ref_ptr + ref_stride); - - const uint8x8_t abs_diff_0 = vabd_u8(a_0, b_0); - const uint8x8_t abs_diff_1 = vabd_u8(a_1, b_1); - sse_lo_u32 = vdot_u32(sse_lo_u32, abs_diff_0, abs_diff_0); - sse_hi_u32 = vdot_u32(sse_hi_u32, abs_diff_1, abs_diff_1); - - sum_a = vdot_u32(sum_a, a_0, vdup_n_u8(1)); - sum_b = vdot_u32(sum_b, b_0, vdup_n_u8(1)); - sum_a = vdot_u32(sum_a, a_1, vdup_n_u8(1)); - sum_b = vdot_u32(sum_b, b_1, vdup_n_u8(1)); - - src_ptr += src_stride + src_stride; - ref_ptr += ref_stride + ref_stride; - i += 2; - } while (i < h); + int j = 0; + do { + const uint8x16_t s = vld1q_u8(src_ptr + j); + const uint8x16_t r = vld1q_u8(ref_ptr + j); + + const uint8x16_t abs_diff = vabdq_u8(s, r); + sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff); + + src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1)); + ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1)); + + j += 16; + } while (j < w); - *sum = horizontal_add_int32x2(vreinterpret_s32_u32(vsub_u32(sum_a, sum_b))); - *sse = horizontal_add_uint32x2(vadd_u32(sse_lo_u32, sse_hi_u32)); + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--i != 0); + + *sum = horizontal_add_int32x4( + vreinterpretq_s32_u32(vsubq_u32(src_sum, ref_sum))); + *sse = horizontal_add_uint32x4(sse_u32); } -#else // !defined(__ARM_FEATURE_DOTPROD) +static INLINE void variance_32xh_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, int h, + uint32_t *sse, int *sum) { + variance_large_neon(src, src_stride, ref, ref_stride, 32, h, sse, sum); +} -// The variance helper functions use int16_t for sum. 8 values are accumulated -// and then added (at which point they expand up to int32_t). To avoid overflow, -// there can be no more than 32767 / 255 ~= 128 values accumulated in each -// column. For a 32x32 buffer, this results in 32 / 8 = 4 values per row * 32 -// rows = 128. Asserts have been added to each function to warn against reaching -// this limit. +static INLINE void variance_64xh_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, int h, + uint32_t *sse, int *sum) { + variance_large_neon(src, src_stride, ref, ref_stride, 64, h, sse, sum); +} -// Process a block of width 4 four rows at a time. -static void variance_neon_w4x4(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, int h, - uint32_t *sse, int *sum) { - int i; +#else // !defined(__ARM_FEATURE_DOTPROD) + +// Process a block of width 4 two rows at a time. +static INLINE void variance_4xh_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h, uint32_t *sse, int *sum) { int16x8_t sum_s16 = vdupq_n_s16(0); - int32x4_t sse_lo_s32 = vdupq_n_s32(0); - int32x4_t sse_hi_s32 = vdupq_n_s32(0); + int32x4_t sse_s32 = vdupq_n_s32(0); + int i = h; - // Since width is only 4, sum_s16 only loads a half row per loop. + // Number of rows we can process before 'sum_s16' overflows: + // 32767 / 255 ~= 128, but we use an 8-wide accumulator; so 256 4-wide rows. assert(h <= 256); - for (i = 0; i < h; i += 4) { - const uint8x16_t a_u8 = load_unaligned_u8q(src_ptr, src_stride); - const uint8x16_t b_u8 = load_unaligned_u8q(ref_ptr, ref_stride); - const uint16x8_t diff_lo_u16 = - vsubl_u8(vget_low_u8(a_u8), vget_low_u8(b_u8)); - const uint16x8_t diff_hi_u16 = - vsubl_u8(vget_high_u8(a_u8), vget_high_u8(b_u8)); - - const int16x8_t diff_lo_s16 = vreinterpretq_s16_u16(diff_lo_u16); - const int16x8_t diff_hi_s16 = vreinterpretq_s16_u16(diff_hi_u16); - - sum_s16 = vaddq_s16(sum_s16, diff_lo_s16); - sum_s16 = vaddq_s16(sum_s16, diff_hi_s16); + do { + const uint8x8_t s = load_unaligned_u8(src_ptr, src_stride); + const uint8x8_t r = load_unaligned_u8(ref_ptr, ref_stride); + const int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(s, r)); - sse_lo_s32 = vmlal_s16(sse_lo_s32, vget_low_s16(diff_lo_s16), - vget_low_s16(diff_lo_s16)); - sse_lo_s32 = vmlal_s16(sse_lo_s32, vget_high_s16(diff_lo_s16), - vget_high_s16(diff_lo_s16)); + sum_s16 = vaddq_s16(sum_s16, diff); - sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_low_s16(diff_hi_s16), - vget_low_s16(diff_hi_s16)); - sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_high_s16(diff_hi_s16), - vget_high_s16(diff_hi_s16)); + sse_s32 = vmlal_s16(sse_s32, vget_low_s16(diff), vget_low_s16(diff)); + sse_s32 = vmlal_s16(sse_s32, vget_high_s16(diff), vget_high_s16(diff)); - src_ptr += 4 * src_stride; - ref_ptr += 4 * ref_stride; - } + src_ptr += 2 * src_stride; + ref_ptr += 2 * ref_stride; + i -= 2; + } while (i != 0); *sum = horizontal_add_int16x8(sum_s16); - *sse = horizontal_add_uint32x4( - vreinterpretq_u32_s32(vaddq_s32(sse_lo_s32, sse_hi_s32))); + *sse = (uint32_t)horizontal_add_int32x4(sse_s32); } -// Process a block of any size where the width is divisible by 16. -static void variance_neon_w16(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, int w, - int h, uint32_t *sse, int *sum) { - int i, j; +// Process a block of width 8 one row at a time. +static INLINE void variance_8xh_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h, uint32_t *sse, int *sum) { int16x8_t sum_s16 = vdupq_n_s16(0); - int32x4_t sse_lo_s32 = vdupq_n_s32(0); - int32x4_t sse_hi_s32 = vdupq_n_s32(0); - - // The loop loads 16 values at a time but doubles them up when accumulating - // into sum_s16. - assert(w / 8 * h <= 128); - - for (i = 0; i < h; ++i) { - for (j = 0; j < w; j += 16) { - const uint8x16_t a_u8 = vld1q_u8(src_ptr + j); - const uint8x16_t b_u8 = vld1q_u8(ref_ptr + j); - - const uint16x8_t diff_lo_u16 = - vsubl_u8(vget_low_u8(a_u8), vget_low_u8(b_u8)); - const uint16x8_t diff_hi_u16 = - vsubl_u8(vget_high_u8(a_u8), vget_high_u8(b_u8)); - - const int16x8_t diff_lo_s16 = vreinterpretq_s16_u16(diff_lo_u16); - const int16x8_t diff_hi_s16 = vreinterpretq_s16_u16(diff_hi_u16); - - sum_s16 = vaddq_s16(sum_s16, diff_lo_s16); - sum_s16 = vaddq_s16(sum_s16, diff_hi_s16); - - sse_lo_s32 = vmlal_s16(sse_lo_s32, vget_low_s16(diff_lo_s16), - vget_low_s16(diff_lo_s16)); - sse_lo_s32 = vmlal_s16(sse_lo_s32, vget_high_s16(diff_lo_s16), - vget_high_s16(diff_lo_s16)); - - sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_low_s16(diff_hi_s16), - vget_low_s16(diff_hi_s16)); - sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_high_s16(diff_hi_s16), - vget_high_s16(diff_hi_s16)); - } + int32x4_t sse_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) }; + int i = h; + + // Number of rows we can process before 'sum_s16' overflows: + // 32767 / 255 ~= 128 + assert(h <= 128); + + do { + const uint8x8_t s = vld1_u8(src_ptr); + const uint8x8_t r = vld1_u8(ref_ptr); + const int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(s, r)); + + sum_s16 = vaddq_s16(sum_s16, diff); + + sse_s32[0] = vmlal_s16(sse_s32[0], vget_low_s16(diff), vget_low_s16(diff)); + sse_s32[1] = + vmlal_s16(sse_s32[1], vget_high_s16(diff), vget_high_s16(diff)); + src_ptr += src_stride; ref_ptr += ref_stride; - } + } while (--i != 0); *sum = horizontal_add_int16x8(sum_s16); - *sse = horizontal_add_uint32x4( - vreinterpretq_u32_s32(vaddq_s32(sse_lo_s32, sse_hi_s32))); + *sse = (uint32_t)horizontal_add_int32x4(vaddq_s32(sse_s32[0], sse_s32[1])); } -// Process a block of width 8 two rows at a time. -static void variance_neon_w8x2(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, int h, - uint32_t *sse, int *sum) { - int i = 0; - int16x8_t sum_s16 = vdupq_n_s16(0); - int32x4_t sse_lo_s32 = vdupq_n_s32(0); - int32x4_t sse_hi_s32 = vdupq_n_s32(0); +// Process a block of width 16 one row at a time. +static INLINE void variance_16xh_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h, uint32_t *sse, int *sum) { + int16x8_t sum_s16[2] = { vdupq_n_s16(0), vdupq_n_s16(0) }; + int32x4_t sse_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) }; + int i = h; - // Each column has it's own accumulator entry in sum_s16. + // Number of rows we can process before 'sum_s16' accumulators overflow: + // 32767 / 255 ~= 128, so 128 16-wide rows. assert(h <= 128); do { - const uint8x8_t a_0_u8 = vld1_u8(src_ptr); - const uint8x8_t a_1_u8 = vld1_u8(src_ptr + src_stride); - const uint8x8_t b_0_u8 = vld1_u8(ref_ptr); - const uint8x8_t b_1_u8 = vld1_u8(ref_ptr + ref_stride); - const uint16x8_t diff_0_u16 = vsubl_u8(a_0_u8, b_0_u8); - const uint16x8_t diff_1_u16 = vsubl_u8(a_1_u8, b_1_u8); - const int16x8_t diff_0_s16 = vreinterpretq_s16_u16(diff_0_u16); - const int16x8_t diff_1_s16 = vreinterpretq_s16_u16(diff_1_u16); - sum_s16 = vaddq_s16(sum_s16, diff_0_s16); - sum_s16 = vaddq_s16(sum_s16, diff_1_s16); - sse_lo_s32 = vmlal_s16(sse_lo_s32, vget_low_s16(diff_0_s16), - vget_low_s16(diff_0_s16)); - sse_lo_s32 = vmlal_s16(sse_lo_s32, vget_low_s16(diff_1_s16), - vget_low_s16(diff_1_s16)); - sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_high_s16(diff_0_s16), - vget_high_s16(diff_0_s16)); - sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_high_s16(diff_1_s16), - vget_high_s16(diff_1_s16)); - src_ptr += src_stride + src_stride; - ref_ptr += ref_stride + ref_stride; - i += 2; + const uint8x16_t s = vld1q_u8(src_ptr); + const uint8x16_t r = vld1q_u8(ref_ptr); + + const int16x8_t diff_l = + vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(s), vget_low_u8(r))); + const int16x8_t diff_h = + vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(s), vget_high_u8(r))); + + sum_s16[0] = vaddq_s16(sum_s16[0], diff_l); + sum_s16[1] = vaddq_s16(sum_s16[1], diff_h); + + sse_s32[0] = + vmlal_s16(sse_s32[0], vget_low_s16(diff_l), vget_low_s16(diff_l)); + sse_s32[1] = + vmlal_s16(sse_s32[1], vget_high_s16(diff_l), vget_high_s16(diff_l)); + sse_s32[0] = + vmlal_s16(sse_s32[0], vget_low_s16(diff_h), vget_low_s16(diff_h)); + sse_s32[1] = + vmlal_s16(sse_s32[1], vget_high_s16(diff_h), vget_high_s16(diff_h)); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--i != 0); + + *sum = horizontal_add_int16x8(vaddq_s16(sum_s16[0], sum_s16[1])); + *sse = (uint32_t)horizontal_add_int32x4(vaddq_s32(sse_s32[0], sse_s32[1])); +} + +// Process a block of any size where the width is divisible by 16. +static INLINE void variance_large_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int w, int h, int h_limit, + unsigned int *sse, int *sum) { + int32x4_t sum_s32 = vdupq_n_s32(0); + int32x4_t sse_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) }; + + // 'h_limit' is the number of 'w'-width rows we can process before our 16-bit + // accumulator overflows. After hitting this limit we accumulate into 32-bit + // elements. + int h_tmp = h > h_limit ? h_limit : h; + + int i = 0; + do { + int16x8_t sum_s16[2] = { vdupq_n_s16(0), vdupq_n_s16(0) }; + do { + int j = 0; + do { + const uint8x16_t s = vld1q_u8(src_ptr + j); + const uint8x16_t r = vld1q_u8(ref_ptr + j); + + const int16x8_t diff_l = + vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(s), vget_low_u8(r))); + const int16x8_t diff_h = + vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(s), vget_high_u8(r))); + + sum_s16[0] = vaddq_s16(sum_s16[0], diff_l); + sum_s16[1] = vaddq_s16(sum_s16[1], diff_h); + + sse_s32[0] = + vmlal_s16(sse_s32[0], vget_low_s16(diff_l), vget_low_s16(diff_l)); + sse_s32[1] = + vmlal_s16(sse_s32[1], vget_high_s16(diff_l), vget_high_s16(diff_l)); + sse_s32[0] = + vmlal_s16(sse_s32[0], vget_low_s16(diff_h), vget_low_s16(diff_h)); + sse_s32[1] = + vmlal_s16(sse_s32[1], vget_high_s16(diff_h), vget_high_s16(diff_h)); + + j += 16; + } while (j < w); + + src_ptr += src_stride; + ref_ptr += ref_stride; + i++; + } while (i < h_tmp); + + sum_s32 = vpadalq_s16(sum_s32, sum_s16[0]); + sum_s32 = vpadalq_s16(sum_s32, sum_s16[1]); + + h_tmp += h_limit; } while (i < h); - *sum = horizontal_add_int16x8(sum_s16); - *sse = horizontal_add_uint32x4( - vreinterpretq_u32_s32(vaddq_s32(sse_lo_s32, sse_hi_s32))); + *sum = horizontal_add_int32x4(sum_s32); + *sse = (uint32_t)horizontal_add_int32x4(vaddq_s32(sse_s32[0], sse_s32[1])); +} + +static INLINE void variance_32xh_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, int h, + uint32_t *sse, int *sum) { + variance_large_neon(src, src_stride, ref, ref_stride, 32, h, 64, sse, sum); +} + +static INLINE void variance_64xh_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, int h, + uint32_t *sse, int *sum) { + variance_large_neon(src, src_stride, ref, ref_stride, 64, h, 32, sse, sum); } #endif // defined(__ARM_FEATURE_DOTPROD) @@ -259,103 +333,41 @@ static void variance_neon_w8x2(const uint8_t *src_ptr, int src_stride, void vpx_get8x8var_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum) { - variance_neon_w8x2(src_ptr, src_stride, ref_ptr, ref_stride, 8, sse, sum); + variance_8xh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 8, sse, sum); } void vpx_get16x16var_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum) { - variance_neon_w16(src_ptr, src_stride, ref_ptr, ref_stride, 16, 16, sse, sum); + variance_16xh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 16, sse, sum); } -#define VARIANCENXM(n, m, shift) \ - unsigned int vpx_variance##n##x##m##_neon( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride, unsigned int *sse) { \ - int sum; \ - if (n == 4) \ - variance_neon_w4x4(src_ptr, src_stride, ref_ptr, ref_stride, m, sse, \ - &sum); \ - else if (n == 8) \ - variance_neon_w8x2(src_ptr, src_stride, ref_ptr, ref_stride, m, sse, \ - &sum); \ - else \ - variance_neon_w16(src_ptr, src_stride, ref_ptr, ref_stride, n, m, sse, \ - &sum); \ - if (n * m < 16 * 16) \ - return *sse - ((sum * sum) >> shift); \ - else \ - return *sse - (uint32_t)(((int64_t)sum * sum) >> shift); \ +#define VARIANCE_WXH_NEON(w, h, shift) \ + unsigned int vpx_variance##w##x##h##_neon( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + unsigned int *sse) { \ + int sum; \ + variance_##w##xh_neon(src, src_stride, ref, ref_stride, h, sse, &sum); \ + return *sse - (uint32_t)(((int64_t)sum * sum) >> shift); \ } -VARIANCENXM(4, 4, 4) -VARIANCENXM(4, 8, 5) -VARIANCENXM(8, 4, 5) -VARIANCENXM(8, 8, 6) -VARIANCENXM(8, 16, 7) -VARIANCENXM(16, 8, 7) -VARIANCENXM(16, 16, 8) -VARIANCENXM(16, 32, 9) -VARIANCENXM(32, 16, 9) -VARIANCENXM(32, 32, 10) - -unsigned int vpx_variance32x64_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - unsigned int *sse) { - int sum1, sum2; - uint32_t sse1, sse2; - variance_neon_w16(src_ptr, src_stride, ref_ptr, ref_stride, 32, 32, &sse1, - &sum1); - variance_neon_w16(src_ptr + (32 * src_stride), src_stride, - ref_ptr + (32 * ref_stride), ref_stride, 32, 32, &sse2, - &sum2); - *sse = sse1 + sse2; - sum1 += sum2; - return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 11); -} +VARIANCE_WXH_NEON(4, 4, 4) +VARIANCE_WXH_NEON(4, 8, 5) -unsigned int vpx_variance64x32_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - unsigned int *sse) { - int sum1, sum2; - uint32_t sse1, sse2; - variance_neon_w16(src_ptr, src_stride, ref_ptr, ref_stride, 64, 16, &sse1, - &sum1); - variance_neon_w16(src_ptr + (16 * src_stride), src_stride, - ref_ptr + (16 * ref_stride), ref_stride, 64, 16, &sse2, - &sum2); - *sse = sse1 + sse2; - sum1 += sum2; - return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 11); -} +VARIANCE_WXH_NEON(8, 4, 5) +VARIANCE_WXH_NEON(8, 8, 6) +VARIANCE_WXH_NEON(8, 16, 7) -unsigned int vpx_variance64x64_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - unsigned int *sse) { - int sum1, sum2; - uint32_t sse1, sse2; - - variance_neon_w16(src_ptr, src_stride, ref_ptr, ref_stride, 64, 16, &sse1, - &sum1); - variance_neon_w16(src_ptr + (16 * src_stride), src_stride, - ref_ptr + (16 * ref_stride), ref_stride, 64, 16, &sse2, - &sum2); - sse1 += sse2; - sum1 += sum2; - - variance_neon_w16(src_ptr + (16 * 2 * src_stride), src_stride, - ref_ptr + (16 * 2 * ref_stride), ref_stride, 64, 16, &sse2, - &sum2); - sse1 += sse2; - sum1 += sum2; - - variance_neon_w16(src_ptr + (16 * 3 * src_stride), src_stride, - ref_ptr + (16 * 3 * ref_stride), ref_stride, 64, 16, &sse2, - &sum2); - *sse = sse1 + sse2; - sum1 += sum2; - return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 12); -} +VARIANCE_WXH_NEON(16, 8, 7) +VARIANCE_WXH_NEON(16, 16, 8) +VARIANCE_WXH_NEON(16, 32, 9) + +VARIANCE_WXH_NEON(32, 16, 9) +VARIANCE_WXH_NEON(32, 32, 10) +VARIANCE_WXH_NEON(32, 64, 11) + +VARIANCE_WXH_NEON(64, 32, 11) +VARIANCE_WXH_NEON(64, 64, 12) #if defined(__ARM_FEATURE_DOTPROD) From 5e861795339a7c196a381c3aaf5209b5d4c8c468 Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 18 Jan 2023 19:19:01 -0800 Subject: [PATCH 0569/1000] */Android.mk: add a check for NDK_ROOT This simplifies integration with the Android platform and avoids the files from being used when a non-NDK build is performed. In that case Android.bp is preferred. Change-Id: I803912146dac788b7f0af27199c7613cabbc9fa0 --- build/make/Android.mk | 3 +++ test/android/Android.mk | 4 ++++ 2 files changed, 7 insertions(+) diff --git a/build/make/Android.mk b/build/make/Android.mk index b8032e67aa..ba24f541b1 100644 --- a/build/make/Android.mk +++ b/build/make/Android.mk @@ -8,6 +8,8 @@ ## be found in the AUTHORS file in the root of the source tree. ## +# Ignore this file during non-NDK builds. +ifdef NDK_ROOT # # This file is to be used for compiling libvpx for Android using the NDK. # In an Android project place a libvpx checkout in the jni directory. @@ -212,3 +214,4 @@ endif ifeq ($(CONFIG_RUNTIME_CPU_DETECT),yes) $(call import-module,android/cpufeatures) endif +endif # NDK_ROOT diff --git a/test/android/Android.mk b/test/android/Android.mk index 87155fcb58..9a7533ebba 100644 --- a/test/android/Android.mk +++ b/test/android/Android.mk @@ -10,6 +10,9 @@ # The test app itself runs on the command line through adb shell # The paths are really messed up as the libvpx make file # expects to be made from a parent directory. + +# Ignore this file during non-NDK builds. +ifdef NDK_ROOT CUR_WD := $(call my-dir) BINDINGS_DIR := $(CUR_WD)/../../.. LOCAL_PATH := $(CUR_WD)/../../.. @@ -61,3 +64,4 @@ LOCAL_SRC_FILES := $(addprefix ./test/, $(FILTERED_SRC)) # some test files depend on *_rtcd.h, ensure they're generated first. $(eval $(call rtcd_dep_template)) include $(BUILD_EXECUTABLE) +endif # NDK_ROOT From ae4240edc7879b30eddaa9ba38525fcd2a1514a0 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Thu, 12 Jan 2023 15:58:00 -0500 Subject: [PATCH 0570/1000] Add codec control to set per frame QP Use case is for 1 pass encoding. Forces max_quantizer = min_quantizer and aq-mode = 0. Applicalble to spatial layers, where user may set the QP per spatial layer. Change-Id: Idfcb7daefde94c475ed1bc0eb8af47c9f309110b --- test/vp9_datarate_test.cc | 95 +++++++++++++++++++++++++++++++++++++++ vp9/encoder/vp9_encoder.c | 4 ++ vp9/encoder/vp9_encoder.h | 2 + vp9/vp9_cx_iface.c | 20 +++++++++ vpx/vp8cx.h | 12 +++++ 5 files changed, 133 insertions(+) diff --git a/test/vp9_datarate_test.cc b/test/vp9_datarate_test.cc index 286fa335a1..eccb001071 100644 --- a/test/vp9_datarate_test.cc +++ b/test/vp9_datarate_test.cc @@ -9,6 +9,7 @@ */ #include "./vpx_config.h" #include "third_party/googletest/src/include/gtest/gtest.h" +#include "test/acm_random.h" #include "test/codec_factory.h" #include "test/encode_test_driver.h" #include "test/i420_video_source.h" @@ -809,6 +810,93 @@ TEST_P(DatarateTestVP9PostEncodeDrop, PostEncodeDropScreenContent) { << " The datarate for the file is greater than target by too much!"; } +using libvpx_test::ACMRandom; + +class DatarateTestVP9FrameQp + : public DatarateTestVP9, + public ::testing::TestWithParam { + public: + DatarateTestVP9FrameQp() : DatarateTestVP9(GetParam()), frame_(0) {} + virtual ~DatarateTestVP9FrameQp() {} + + protected: + virtual void SetUp() { + InitializeConfig(); + SetMode(::libvpx_test::kRealTime); + ResetModel(); + } + + virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) { + set_cpu_used_ = 7; + DatarateTestVP9::PreEncodeFrameHook(video, encoder); + ACMRandom rnd; + frame_qp_ = static_cast(rnd.RandRange(64)); + encoder->Control(VP9E_SET_QUANTIZER_ONE_PASS, frame_qp_); + frame_++; + } + + virtual void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) { + int qp = 0; + if (frame_ >= total_frame_) return; + encoder->Control(VP8E_GET_LAST_QUANTIZER_64, &qp); + ASSERT_EQ(frame_qp_, qp); + } + + protected: + int total_frame_; + + private: + int frame_qp_; + int frame_; +}; + +TEST_P(DatarateTestVP9FrameQp, VP9SetFrameQp) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_dropframe_thresh = 0; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + + total_frame_ = 400; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, total_frame_); + ResetModel(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +} + +TEST_P(DatarateTestVP9FrameQp, VP9SetFrameQp3TemporalLayers) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_dropframe_thresh = 0; + cfg_.rc_max_quantizer = 63; + cfg_.rc_min_quantizer = 0; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + + // 3 Temporal layers, no spatial layers: Framerate decimation (4, 2, 1). + cfg_.ss_number_layers = 1; + cfg_.ts_number_layers = 3; + cfg_.ts_rate_decimator[0] = 4; + cfg_.ts_rate_decimator[1] = 2; + cfg_.ts_rate_decimator[2] = 1; + + cfg_.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_BYPASS; + cfg_.rc_target_bitrate = 200; + total_frame_ = 400; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, total_frame_); + ResetModel(); + cfg_.layer_target_bitrate[0] = 40 * cfg_.rc_target_bitrate / 100; + cfg_.layer_target_bitrate[1] = 60 * cfg_.rc_target_bitrate / 100; + cfg_.layer_target_bitrate[2] = cfg_.rc_target_bitrate; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +} + #if CONFIG_VP9_TEMPORAL_DENOISING // Params: speed setting. class DatarateTestVP9RealTimeDenoiser : public DatarateTestVP9RealTime { @@ -943,6 +1031,13 @@ VP9_INSTANTIATE_TEST_SUITE(DatarateTestVP9LargeVBR, ::testing::Range(5, 9), VP9_INSTANTIATE_TEST_SUITE(DatarateTestVP9RealTime, ::testing::Range(5, 10)); +#if CONFIG_VP9 +INSTANTIATE_TEST_SUITE_P( + VP9, DatarateTestVP9FrameQp, + ::testing::Values( + static_cast(&libvpx_test::kVP9))); +#endif + VP9_INSTANTIATE_TEST_SUITE(DatarateTestVP9RealTimeDeltaQUV, ::testing::Range(5, 10), ::testing::Values(-5, -10, -15)); diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 5cfd846dd0..1c5c9fc923 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -1527,6 +1527,7 @@ static void init_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { init_buffer_indices(cpi); vp9_noise_estimate_init(&cpi->noise_estimate, cm->width, cm->height); + cpi->fixed_qp_onepass = 0; } void vp9_check_reset_rc_flag(VP9_COMP *cpi) { @@ -7933,6 +7934,9 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, vp9_save_layer_context(cpi); } + if (cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1) + cpi->fixed_qp_onepass = 0; + vpx_usec_timer_mark(&cmptimer); cpi->time_compress_data += vpx_usec_timer_elapsed(&cmptimer); diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 3e0b80677e..cca8b53f8e 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -971,6 +971,8 @@ typedef struct VP9_COMP { RATE_QSTEP_MODEL rq_model[ENCODE_FRAME_TYPES]; #endif EXT_RATECTRL ext_ratectrl; + + int fixed_qp_onepass; } VP9_COMP; #if CONFIG_RATE_CTRL diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index 695774e730..dee175dc09 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -1014,6 +1014,7 @@ static vpx_codec_err_t ctrl_set_aq_mode(vpx_codec_alg_priv_t *ctx, va_list args) { struct vp9_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.aq_mode = CAST(VP9E_SET_AQ_MODE, args); + if (ctx->cpi->fixed_qp_onepass) extra_cfg.aq_mode = 0; return update_extra_cfg(ctx, &extra_cfg); } @@ -1951,6 +1952,24 @@ static vpx_codec_err_t ctrl_set_external_rate_control(vpx_codec_alg_priv_t *ctx, return VPX_CODEC_OK; } +static vpx_codec_err_t ctrl_set_quantizer_one_pass(vpx_codec_alg_priv_t *ctx, + va_list args) { + VP9_COMP *const cpi = ctx->cpi; + const int qp = va_arg(args, int); + vpx_codec_enc_cfg_t *cfg = &ctx->cfg; + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + vpx_codec_err_t res; + + if (qp < 0 || qp > 63) return VPX_CODEC_INVALID_PARAM; + + cfg->rc_min_quantizer = cfg->rc_max_quantizer = qp; + extra_cfg.aq_mode = 0; + cpi->fixed_qp_onepass = 1; + + res = update_extra_cfg(ctx, &extra_cfg); + return res; +} + static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = { { VP8_COPY_REFERENCE, ctrl_copy_reference }, @@ -2005,6 +2024,7 @@ static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = { { VP9E_SET_DISABLE_LOOPFILTER, ctrl_set_disable_loopfilter }, { VP9E_SET_RTC_EXTERNAL_RATECTRL, ctrl_set_rtc_external_ratectrl }, { VP9E_SET_EXTERNAL_RATE_CONTROL, ctrl_set_external_rate_control }, + { VP9E_SET_QUANTIZER_ONE_PASS, ctrl_set_quantizer_one_pass }, // Getters { VP8E_GET_LAST_QUANTIZER, ctrl_get_quantizer }, diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h index a61238cb10..e0b679fbb7 100644 --- a/vpx/vp8cx.h +++ b/vpx/vp8cx.h @@ -757,6 +757,16 @@ enum vp8e_enc_control_id { * Supported in codecs: VP8 */ VP8E_SET_RTC_EXTERNAL_RATECTRL, + + /*!\brief Codec control to set quantizer for the next frame. + * + * This will turn off cyclic refresh. Only applicable to 1-pass without + * spatial layers. + * + * Supported in codecs: VP9 + * + */ + VP9E_SET_QUANTIZER_ONE_PASS, }; /*!\brief vpx 1-D scaling mode @@ -1085,6 +1095,8 @@ VPX_CTRL_USE_TYPE(VP9E_GET_LAST_QUANTIZER_SVC_LAYERS, int *) #define VPX_CTRL_VP9E_GET_LAST_QUANTIZER_SVC_LAYERS VPX_CTRL_USE_TYPE(VP8E_SET_RTC_EXTERNAL_RATECTRL, int) #define VPX_CTRL_VP8E_SET_RTC_EXTERNAL_RATECTRL +VPX_CTRL_USE_TYPE(VP9E_SET_QUANTIZER_ONE_PASS, int) +#define VPX_CTRL_VP9E_SET_QUANTIZER_ONE_PASS /*!\endcond */ /*! @} - end defgroup vp8_encoder */ From fcfb471ce2a413e760bdff805c5ae66778cb4169 Mon Sep 17 00:00:00 2001 From: Salome Thirot Date: Thu, 19 Jan 2023 18:02:52 +0000 Subject: [PATCH 0571/1000] Refactor Neon subpel variance functions Refactor the Neon implementation of the sub-pixel variance bilinear filter helper functions - effectively backporting this libaom patch[1]. [1] https://aomedia-review.googlesource.com/c/aom/+/162462 Change-Id: I3dee32e8125250bbeffeb63d1fef5da559bacbf1 --- vpx_dsp/arm/subpel_variance_neon.c | 271 ++++++++++++++--------------- 1 file changed, 135 insertions(+), 136 deletions(-) diff --git a/vpx_dsp/arm/subpel_variance_neon.c b/vpx_dsp/arm/subpel_variance_neon.c index a3befdc348..3fb0acd544 100644 --- a/vpx_dsp/arm/subpel_variance_neon.c +++ b/vpx_dsp/arm/subpel_variance_neon.c @@ -17,156 +17,155 @@ #include "vpx_dsp/variance.h" #include "vpx_dsp/arm/mem_neon.h" -static const uint8_t bilinear_filters[8][2] = { - { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 }, - { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 }, -}; - // Process a block exactly 4 wide and a multiple of 2 high. -static void var_filter_block2d_bil_w4(const uint8_t *src_ptr, - uint8_t *output_ptr, - unsigned int src_pixels_per_line, - int pixel_step, - unsigned int output_height, - const uint8_t *filter) { - const uint8x8_t f0 = vdup_n_u8(filter[0]); - const uint8x8_t f1 = vdup_n_u8(filter[1]); - unsigned int i; - for (i = 0; i < output_height; i += 2) { - const uint8x8_t src_0 = load_unaligned_u8(src_ptr, src_pixels_per_line); - const uint8x8_t src_1 = - load_unaligned_u8(src_ptr + pixel_step, src_pixels_per_line); - const uint16x8_t a = vmull_u8(src_0, f0); - const uint16x8_t b = vmlal_u8(a, src_1, f1); - const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS); - vst1_u8(output_ptr, out); - src_ptr += 2 * src_pixels_per_line; - output_ptr += 8; - } +static void var_filter_block2d_bil_w4(const uint8_t *src_ptr, uint8_t *dst_ptr, + int src_stride, int pixel_step, + int dst_height, int filter_offset) { + const uint8x8_t f0 = vdup_n_u8(8 - filter_offset); + const uint8x8_t f1 = vdup_n_u8(filter_offset); + + int i = dst_height; + do { + uint8x8_t s0 = load_unaligned_u8(src_ptr, src_stride); + uint8x8_t s1 = load_unaligned_u8(src_ptr + pixel_step, src_stride); + uint16x8_t blend = vmlal_u8(vmull_u8(s0, f0), s1, f1); + uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3); + vst1_u8(dst_ptr, blend_u8); + + src_ptr += 2 * src_stride; + dst_ptr += 2 * 4; + i -= 2; + } while (i != 0); } // Process a block exactly 8 wide and any height. -static void var_filter_block2d_bil_w8(const uint8_t *src_ptr, - uint8_t *output_ptr, - unsigned int src_pixels_per_line, - int pixel_step, - unsigned int output_height, - const uint8_t *filter) { - const uint8x8_t f0 = vdup_n_u8(filter[0]); - const uint8x8_t f1 = vdup_n_u8(filter[1]); - unsigned int i; - for (i = 0; i < output_height; ++i) { - const uint8x8_t src_0 = vld1_u8(&src_ptr[0]); - const uint8x8_t src_1 = vld1_u8(&src_ptr[pixel_step]); - const uint16x8_t a = vmull_u8(src_0, f0); - const uint16x8_t b = vmlal_u8(a, src_1, f1); - const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS); - vst1_u8(output_ptr, out); - src_ptr += src_pixels_per_line; - output_ptr += 8; - } +static void var_filter_block2d_bil_w8(const uint8_t *src_ptr, uint8_t *dst_ptr, + int src_stride, int pixel_step, + int dst_height, int filter_offset) { + const uint8x8_t f0 = vdup_n_u8(8 - filter_offset); + const uint8x8_t f1 = vdup_n_u8(filter_offset); + + int i = dst_height; + do { + uint8x8_t s0 = vld1_u8(src_ptr); + uint8x8_t s1 = vld1_u8(src_ptr + pixel_step); + uint16x8_t blend = vmlal_u8(vmull_u8(s0, f0), s1, f1); + uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3); + vst1_u8(dst_ptr, blend_u8); + + src_ptr += src_stride; + dst_ptr += 8; + } while (--i != 0); } // Process a block which is a mutiple of 16 wide and any height. -static void var_filter_block2d_bil_w16(const uint8_t *src_ptr, - uint8_t *output_ptr, - unsigned int src_pixels_per_line, - int pixel_step, - unsigned int output_height, - unsigned int output_width, - const uint8_t *filter) { - const uint8x8_t f0 = vdup_n_u8(filter[0]); - const uint8x8_t f1 = vdup_n_u8(filter[1]); - unsigned int i, j; - for (i = 0; i < output_height; ++i) { - for (j = 0; j < output_width; j += 16) { - const uint8x16_t src_0 = vld1q_u8(&src_ptr[j]); - const uint8x16_t src_1 = vld1q_u8(&src_ptr[j + pixel_step]); - const uint16x8_t a = vmull_u8(vget_low_u8(src_0), f0); - const uint16x8_t b = vmlal_u8(a, vget_low_u8(src_1), f1); - const uint8x8_t out_lo = vrshrn_n_u16(b, FILTER_BITS); - const uint16x8_t c = vmull_u8(vget_high_u8(src_0), f0); - const uint16x8_t d = vmlal_u8(c, vget_high_u8(src_1), f1); - const uint8x8_t out_hi = vrshrn_n_u16(d, FILTER_BITS); - vst1q_u8(output_ptr + j, vcombine_u8(out_lo, out_hi)); - } - src_ptr += src_pixels_per_line; - output_ptr += output_width; - } +static void var_filter_block2d_bil_large(const uint8_t *src_ptr, + uint8_t *dst_ptr, int src_stride, + int pixel_step, int dst_width, + int dst_height, int filter_offset) { + const uint8x8_t f0 = vdup_n_u8(8 - filter_offset); + const uint8x8_t f1 = vdup_n_u8(filter_offset); + + int i = dst_height; + do { + int j = 0; + do { + uint8x16_t s0 = vld1q_u8(src_ptr + j); + uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step); + uint16x8_t blend_l = + vmlal_u8(vmull_u8(vget_low_u8(s0), f0), vget_low_u8(s1), f1); + uint16x8_t blend_h = + vmlal_u8(vmull_u8(vget_high_u8(s0), f0), vget_high_u8(s1), f1); + uint8x8_t out_lo = vrshrn_n_u16(blend_l, 3); + uint8x8_t out_hi = vrshrn_n_u16(blend_h, 3); + vst1q_u8(dst_ptr + j, vcombine_u8(out_lo, out_hi)); + + j += 16; + } while (j < dst_width); + + src_ptr += src_stride; + dst_ptr += dst_width; + } while (--i != 0); } -// 4xM filter writes an extra row to fdata because it processes two rows at a -// time. -#define SUB_PIXEL_VARIANCENXM(n, m) \ - uint32_t vpx_sub_pixel_variance##n##x##m##_neon( \ - const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ - const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \ - uint8_t temp0[n * (m + (n == 4 ? 2 : 1))]; \ - uint8_t temp1[n * m]; \ - \ - if (n == 4) { \ - var_filter_block2d_bil_w4(src_ptr, temp0, src_stride, 1, (m + 2), \ - bilinear_filters[x_offset]); \ - var_filter_block2d_bil_w4(temp0, temp1, n, n, m, \ - bilinear_filters[y_offset]); \ - } else if (n == 8) { \ - var_filter_block2d_bil_w8(src_ptr, temp0, src_stride, 1, (m + 1), \ - bilinear_filters[x_offset]); \ - var_filter_block2d_bil_w8(temp0, temp1, n, n, m, \ - bilinear_filters[y_offset]); \ - } else { \ - var_filter_block2d_bil_w16(src_ptr, temp0, src_stride, 1, (m + 1), n, \ - bilinear_filters[x_offset]); \ - var_filter_block2d_bil_w16(temp0, temp1, n, n, m, n, \ - bilinear_filters[y_offset]); \ - } \ - return vpx_variance##n##x##m(temp1, n, ref_ptr, ref_stride, sse); \ +static void var_filter_block2d_bil_w16(const uint8_t *src_ptr, uint8_t *dst_ptr, + int src_stride, int pixel_step, + int dst_height, int filter_offset) { + var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 16, + dst_height, filter_offset); +} +static void var_filter_block2d_bil_w32(const uint8_t *src_ptr, uint8_t *dst_ptr, + int src_stride, int pixel_step, + int dst_height, int filter_offset) { + var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 32, + dst_height, filter_offset); +} +static void var_filter_block2d_bil_w64(const uint8_t *src_ptr, uint8_t *dst_ptr, + int src_stride, int pixel_step, + int dst_height, int filter_offset) { + var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 64, + dst_height, filter_offset); +} + +#define SUBPEL_VARIANCE_WXH_NEON(w, h, padding) \ + unsigned int vpx_sub_pixel_variance##w##x##h##_neon( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, uint32_t *sse) { \ + uint8_t tmp0[w * (h + padding)]; \ + uint8_t tmp1[w * h]; \ + var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \ + xoffset); \ + var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ + return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ } -SUB_PIXEL_VARIANCENXM(4, 4) -SUB_PIXEL_VARIANCENXM(4, 8) -SUB_PIXEL_VARIANCENXM(8, 4) -SUB_PIXEL_VARIANCENXM(8, 8) -SUB_PIXEL_VARIANCENXM(8, 16) -SUB_PIXEL_VARIANCENXM(16, 8) -SUB_PIXEL_VARIANCENXM(16, 16) -SUB_PIXEL_VARIANCENXM(16, 32) -SUB_PIXEL_VARIANCENXM(32, 16) -SUB_PIXEL_VARIANCENXM(32, 32) -SUB_PIXEL_VARIANCENXM(32, 64) -SUB_PIXEL_VARIANCENXM(64, 32) -SUB_PIXEL_VARIANCENXM(64, 64) +// 4x blocks are processed two rows at a time, so require an extra row of +// padding. +SUBPEL_VARIANCE_WXH_NEON(4, 4, 2) +SUBPEL_VARIANCE_WXH_NEON(4, 8, 2) + +SUBPEL_VARIANCE_WXH_NEON(8, 4, 1) +SUBPEL_VARIANCE_WXH_NEON(8, 8, 1) +SUBPEL_VARIANCE_WXH_NEON(8, 16, 1) + +SUBPEL_VARIANCE_WXH_NEON(16, 8, 1) +SUBPEL_VARIANCE_WXH_NEON(16, 16, 1) +SUBPEL_VARIANCE_WXH_NEON(16, 32, 1) + +SUBPEL_VARIANCE_WXH_NEON(32, 16, 1) +SUBPEL_VARIANCE_WXH_NEON(32, 32, 1) +SUBPEL_VARIANCE_WXH_NEON(32, 64, 1) + +SUBPEL_VARIANCE_WXH_NEON(64, 32, 1) +SUBPEL_VARIANCE_WXH_NEON(64, 64, 1) // 4xM filter writes an extra row to fdata because it processes two rows at a // time. -#define SUB_PIXEL_AVG_VARIANCENXM(n, m) \ - uint32_t vpx_sub_pixel_avg_variance##n##x##m##_neon( \ - const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ - const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \ - const uint8_t *second_pred) { \ - uint8_t temp0[n * (m + (n == 4 ? 2 : 1))]; \ - uint8_t temp1[n * m]; \ - \ - if (n == 4) { \ - var_filter_block2d_bil_w4(src_ptr, temp0, src_stride, 1, (m + 2), \ - bilinear_filters[x_offset]); \ - var_filter_block2d_bil_w4(temp0, temp1, n, n, m, \ - bilinear_filters[y_offset]); \ - } else if (n == 8) { \ - var_filter_block2d_bil_w8(src_ptr, temp0, src_stride, 1, (m + 1), \ - bilinear_filters[x_offset]); \ - var_filter_block2d_bil_w8(temp0, temp1, n, n, m, \ - bilinear_filters[y_offset]); \ - } else { \ - var_filter_block2d_bil_w16(src_ptr, temp0, src_stride, 1, (m + 1), n, \ - bilinear_filters[x_offset]); \ - var_filter_block2d_bil_w16(temp0, temp1, n, n, m, n, \ - bilinear_filters[y_offset]); \ - } \ - \ - vpx_comp_avg_pred(temp0, second_pred, n, m, temp1, n); \ - \ - return vpx_variance##n##x##m(temp0, n, ref_ptr, ref_stride, sse); \ +#define SUB_PIXEL_AVG_VARIANCENXM(n, m) \ + uint32_t vpx_sub_pixel_avg_variance##n##x##m##_neon( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint8_t temp0[n * (m + (n == 4 ? 2 : 1))]; \ + uint8_t temp1[n * m]; \ + \ + if (n == 4) { \ + var_filter_block2d_bil_w4(src_ptr, temp0, src_stride, 1, (m + 2), \ + x_offset); \ + var_filter_block2d_bil_w4(temp0, temp1, n, n, m, y_offset); \ + } else if (n == 8) { \ + var_filter_block2d_bil_w8(src_ptr, temp0, src_stride, 1, (m + 1), \ + x_offset); \ + var_filter_block2d_bil_w8(temp0, temp1, n, n, m, y_offset); \ + } else { \ + var_filter_block2d_bil_large(src_ptr, temp0, src_stride, 1, n, (m + 1), \ + x_offset); \ + var_filter_block2d_bil_large(temp0, temp1, n, n, n, m, y_offset); \ + } \ + \ + vpx_comp_avg_pred(temp0, second_pred, n, m, temp1, n); \ + \ + return vpx_variance##n##x##m(temp0, n, ref_ptr, ref_stride, sse); \ } SUB_PIXEL_AVG_VARIANCENXM(4, 4) From ae5b60cb4730639fc7742df577600ce71ddb5936 Mon Sep 17 00:00:00 2001 From: Salome Thirot Date: Fri, 20 Jan 2023 10:35:34 +0000 Subject: [PATCH 0572/1000] Specialize Neon subpel variance by filter value for large blocks The optimal implementation of the bilinear interpolation depends on the filter values being used. For both horizontal and vertical interpolation this can simplify to just taking the source values, or averaging the source and reference values - which can be computed more easily than a bilinear interpolation with arbitrary filter values. This patch introduces tests to find the most optimal bilinear interpolation implementation based on the filter values being used. This new specialization is only used for larger block sizes (>= 16x16) as we need to be doing enough work to make the cost of finding the optimal implementation worth it. This is a backport of this libaom change[1]. After this change, the only differences between the code in libvpx and libaom are due to libvpx being compiled with ISO C90, which forbids mixing declarations and code [-Wdeclaration-after-statement]. [1] https://aomedia-review.googlesource.com/c/aom/+/162463 Change-Id: Ia818e148f6fd126656e8411d59c184b55dd43094 --- vpx_dsp/arm/subpel_variance_neon.c | 95 +++++++++++++++++++++++++++--- 1 file changed, 87 insertions(+), 8 deletions(-) diff --git a/vpx_dsp/arm/subpel_variance_neon.c b/vpx_dsp/arm/subpel_variance_neon.c index 3fb0acd544..60650b703a 100644 --- a/vpx_dsp/arm/subpel_variance_neon.c +++ b/vpx_dsp/arm/subpel_variance_neon.c @@ -107,6 +107,30 @@ static void var_filter_block2d_bil_w64(const uint8_t *src_ptr, uint8_t *dst_ptr, dst_height, filter_offset); } +static void var_filter_block2d_avg(const uint8_t *src_ptr, uint8_t *dst_ptr, + int src_stride, int pixel_step, + int dst_width, int dst_height) { + int i = dst_height; + + // We only specialize on the filter values for large block sizes (>= 16x16.) + assert(dst_width >= 16 && dst_width % 16 == 0); + + do { + int j = 0; + do { + uint8x16_t s0 = vld1q_u8(src_ptr + j); + uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step); + uint8x16_t avg = vrhaddq_u8(s0, s1); + vst1q_u8(dst_ptr + j, avg); + + j += 16; + } while (j < dst_width); + + src_ptr += src_stride; + dst_ptr += dst_width; + } while (--i != 0); +} + #define SUBPEL_VARIANCE_WXH_NEON(w, h, padding) \ unsigned int vpx_sub_pixel_variance##w##x##h##_neon( \ const uint8_t *src, int src_stride, int xoffset, int yoffset, \ @@ -119,6 +143,61 @@ static void var_filter_block2d_bil_w64(const uint8_t *src_ptr, uint8_t *dst_ptr, return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ } +#define SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(w, h, padding) \ + unsigned int vpx_sub_pixel_variance##w##x##h##_neon( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, unsigned int *sse) { \ + if (xoffset == 0) { \ + if (yoffset == 0) { \ + return vpx_variance##w##x##h##_neon(src, src_stride, ref, ref_stride, \ + sse); \ + } else if (yoffset == 4) { \ + uint8_t tmp[w * h]; \ + var_filter_block2d_avg(src, tmp, src_stride, src_stride, w, h); \ + return vpx_variance##w##x##h##_neon(tmp, w, ref, ref_stride, sse); \ + } else { \ + uint8_t tmp[w * h]; \ + var_filter_block2d_bil_w##w(src, tmp, src_stride, src_stride, h, \ + yoffset); \ + return vpx_variance##w##x##h##_neon(tmp, w, ref, ref_stride, sse); \ + } \ + } else if (xoffset == 4) { \ + uint8_t tmp0[w * (h + padding)]; \ + if (yoffset == 0) { \ + var_filter_block2d_avg(src, tmp0, src_stride, 1, w, h); \ + return vpx_variance##w##x##h##_neon(tmp0, w, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + uint8_t tmp1[w * (h + padding)]; \ + var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding)); \ + var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \ + return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \ + } else { \ + uint8_t tmp1[w * (h + padding)]; \ + var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding)); \ + var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ + return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \ + } \ + } else { \ + uint8_t tmp0[w * (h + padding)]; \ + if (yoffset == 0) { \ + var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, h, xoffset); \ + return vpx_variance##w##x##h##_neon(tmp0, w, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + uint8_t tmp1[w * h]; \ + var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \ + xoffset); \ + var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \ + return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \ + } else { \ + uint8_t tmp1[w * h]; \ + var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \ + xoffset); \ + var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ + return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \ + } \ + } \ + } + // 4x blocks are processed two rows at a time, so require an extra row of // padding. SUBPEL_VARIANCE_WXH_NEON(4, 4, 2) @@ -128,16 +207,16 @@ SUBPEL_VARIANCE_WXH_NEON(8, 4, 1) SUBPEL_VARIANCE_WXH_NEON(8, 8, 1) SUBPEL_VARIANCE_WXH_NEON(8, 16, 1) -SUBPEL_VARIANCE_WXH_NEON(16, 8, 1) -SUBPEL_VARIANCE_WXH_NEON(16, 16, 1) -SUBPEL_VARIANCE_WXH_NEON(16, 32, 1) +SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(16, 8, 1) +SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(16, 16, 1) +SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(16, 32, 1) -SUBPEL_VARIANCE_WXH_NEON(32, 16, 1) -SUBPEL_VARIANCE_WXH_NEON(32, 32, 1) -SUBPEL_VARIANCE_WXH_NEON(32, 64, 1) +SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 16, 1) +SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 32, 1) +SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 64, 1) -SUBPEL_VARIANCE_WXH_NEON(64, 32, 1) -SUBPEL_VARIANCE_WXH_NEON(64, 64, 1) +SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(64, 32, 1) +SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(64, 64, 1) // 4xM filter writes an extra row to fdata because it processes two rows at a // time. From b7f6c641397eb1ddac6fcaf34ec6db8fa0cbd7e7 Mon Sep 17 00:00:00 2001 From: Salome Thirot Date: Fri, 20 Jan 2023 11:21:02 +0000 Subject: [PATCH 0573/1000] Refactor Neon averaging subpel variance functions Merge the computation of vpx_comp_avg_pred into the second pass of the bilinear filter - avoiding the overhead of loading and storing the entire block again. This is a backport of this libaom change[1]. [1] https://aomedia-review.googlesource.com/c/aom/+/166961 Change-Id: I9327ff7382a46d50c42a5213a11379b957146372 --- vpx_dsp/arm/subpel_variance_neon.c | 188 +++++++++++++++++++++++------ 1 file changed, 148 insertions(+), 40 deletions(-) diff --git a/vpx_dsp/arm/subpel_variance_neon.c b/vpx_dsp/arm/subpel_variance_neon.c index 60650b703a..237f7fad25 100644 --- a/vpx_dsp/arm/subpel_variance_neon.c +++ b/vpx_dsp/arm/subpel_variance_neon.c @@ -218,45 +218,153 @@ SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 64, 1) SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(64, 32, 1) SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(64, 64, 1) -// 4xM filter writes an extra row to fdata because it processes two rows at a -// time. -#define SUB_PIXEL_AVG_VARIANCENXM(n, m) \ - uint32_t vpx_sub_pixel_avg_variance##n##x##m##_neon( \ - const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ - const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \ - const uint8_t *second_pred) { \ - uint8_t temp0[n * (m + (n == 4 ? 2 : 1))]; \ - uint8_t temp1[n * m]; \ - \ - if (n == 4) { \ - var_filter_block2d_bil_w4(src_ptr, temp0, src_stride, 1, (m + 2), \ - x_offset); \ - var_filter_block2d_bil_w4(temp0, temp1, n, n, m, y_offset); \ - } else if (n == 8) { \ - var_filter_block2d_bil_w8(src_ptr, temp0, src_stride, 1, (m + 1), \ - x_offset); \ - var_filter_block2d_bil_w8(temp0, temp1, n, n, m, y_offset); \ - } else { \ - var_filter_block2d_bil_large(src_ptr, temp0, src_stride, 1, n, (m + 1), \ - x_offset); \ - var_filter_block2d_bil_large(temp0, temp1, n, n, n, m, y_offset); \ - } \ - \ - vpx_comp_avg_pred(temp0, second_pred, n, m, temp1, n); \ - \ - return vpx_variance##n##x##m(temp0, n, ref_ptr, ref_stride, sse); \ +// Combine bilinear filter with vpx_comp_avg_pred for blocks having width 4. +static void avg_pred_var_filter_block2d_bil_w4(const uint8_t *src_ptr, + uint8_t *dst_ptr, int src_stride, + int pixel_step, int dst_height, + int filter_offset, + const uint8_t *second_pred) { + const uint8x8_t f0 = vdup_n_u8(8 - filter_offset); + const uint8x8_t f1 = vdup_n_u8(filter_offset); + + int i = dst_height; + do { + uint8x8_t s0 = load_unaligned_u8(src_ptr, src_stride); + uint8x8_t s1 = load_unaligned_u8(src_ptr + pixel_step, src_stride); + uint16x8_t blend = vmlal_u8(vmull_u8(s0, f0), s1, f1); + uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3); + + uint8x8_t p = vld1_u8(second_pred); + uint8x8_t avg = vrhadd_u8(blend_u8, p); + + vst1_u8(dst_ptr, avg); + + src_ptr += 2 * src_stride; + dst_ptr += 2 * 4; + second_pred += 2 * 4; + i -= 2; + } while (i != 0); +} + +// Combine bilinear filter with vpx_comp_avg_pred for blocks having width 8. +static void avg_pred_var_filter_block2d_bil_w8(const uint8_t *src_ptr, + uint8_t *dst_ptr, int src_stride, + int pixel_step, int dst_height, + int filter_offset, + const uint8_t *second_pred) { + const uint8x8_t f0 = vdup_n_u8(8 - filter_offset); + const uint8x8_t f1 = vdup_n_u8(filter_offset); + + int i = dst_height; + do { + uint8x8_t s0 = vld1_u8(src_ptr); + uint8x8_t s1 = vld1_u8(src_ptr + pixel_step); + uint16x8_t blend = vmlal_u8(vmull_u8(s0, f0), s1, f1); + uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3); + + uint8x8_t p = vld1_u8(second_pred); + uint8x8_t avg = vrhadd_u8(blend_u8, p); + + vst1_u8(dst_ptr, avg); + + src_ptr += src_stride; + dst_ptr += 8; + second_pred += 8; + } while (--i > 0); +} + +// Combine bilinear filter with vpx_comp_avg_pred for large blocks. +static void avg_pred_var_filter_block2d_bil_large( + const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step, + int dst_width, int dst_height, int filter_offset, + const uint8_t *second_pred) { + const uint8x8_t f0 = vdup_n_u8(8 - filter_offset); + const uint8x8_t f1 = vdup_n_u8(filter_offset); + + int i = dst_height; + do { + int j = 0; + do { + uint8x16_t s0 = vld1q_u8(src_ptr + j); + uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step); + uint16x8_t blend_l = + vmlal_u8(vmull_u8(vget_low_u8(s0), f0), vget_low_u8(s1), f1); + uint16x8_t blend_h = + vmlal_u8(vmull_u8(vget_high_u8(s0), f0), vget_high_u8(s1), f1); + uint8x16_t blend_u8 = + vcombine_u8(vrshrn_n_u16(blend_l, 3), vrshrn_n_u16(blend_h, 3)); + + uint8x16_t p = vld1q_u8(second_pred); + uint8x16_t avg = vrhaddq_u8(blend_u8, p); + + vst1q_u8(dst_ptr + j, avg); + + j += 16; + second_pred += 16; + } while (j < dst_width); + + src_ptr += src_stride; + dst_ptr += dst_width; + } while (--i != 0); +} + +// Combine bilinear filter with vpx_comp_avg_pred for blocks having width 16. +static void avg_pred_var_filter_block2d_bil_w16( + const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step, + int dst_height, int filter_offset, const uint8_t *second_pred) { + avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, + pixel_step, 16, dst_height, + filter_offset, second_pred); +} + +// Combine bilinear filter with vpx_comp_avg_pred for blocks having width 32. +static void avg_pred_var_filter_block2d_bil_w32( + const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step, + int dst_height, int filter_offset, const uint8_t *second_pred) { + avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, + pixel_step, 32, dst_height, + filter_offset, second_pred); +} + +// Combine bilinear filter with vpx_comp_avg_pred for blocks having width 64. +static void avg_pred_var_filter_block2d_bil_w64( + const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step, + int dst_height, int filter_offset, const uint8_t *second_pred) { + avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, + pixel_step, 64, dst_height, + filter_offset, second_pred); +} + +#define SUBPEL_AVG_VARIANCE_WXH_NEON(w, h, padding) \ + unsigned int vpx_sub_pixel_avg_variance##w##x##h##_neon( \ + const uint8_t *src, int source_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint8_t tmp0[w * (h + padding)]; \ + uint8_t tmp1[w * h]; \ + var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, (h + padding), \ + xoffset); \ + avg_pred_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset, \ + second_pred); \ + return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ } -SUB_PIXEL_AVG_VARIANCENXM(4, 4) -SUB_PIXEL_AVG_VARIANCENXM(4, 8) -SUB_PIXEL_AVG_VARIANCENXM(8, 4) -SUB_PIXEL_AVG_VARIANCENXM(8, 8) -SUB_PIXEL_AVG_VARIANCENXM(8, 16) -SUB_PIXEL_AVG_VARIANCENXM(16, 8) -SUB_PIXEL_AVG_VARIANCENXM(16, 16) -SUB_PIXEL_AVG_VARIANCENXM(16, 32) -SUB_PIXEL_AVG_VARIANCENXM(32, 16) -SUB_PIXEL_AVG_VARIANCENXM(32, 32) -SUB_PIXEL_AVG_VARIANCENXM(32, 64) -SUB_PIXEL_AVG_VARIANCENXM(64, 32) -SUB_PIXEL_AVG_VARIANCENXM(64, 64) +// 4x blocks are processed two rows at a time, so require an extra row of +// padding. +SUBPEL_AVG_VARIANCE_WXH_NEON(4, 4, 2) +SUBPEL_AVG_VARIANCE_WXH_NEON(4, 8, 2) + +SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 1) +SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 1) +SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 1) + +SUBPEL_AVG_VARIANCE_WXH_NEON(16, 8, 1) +SUBPEL_AVG_VARIANCE_WXH_NEON(16, 16, 1) +SUBPEL_AVG_VARIANCE_WXH_NEON(16, 32, 1) + +SUBPEL_AVG_VARIANCE_WXH_NEON(32, 16, 1) +SUBPEL_AVG_VARIANCE_WXH_NEON(32, 32, 1) +SUBPEL_AVG_VARIANCE_WXH_NEON(32, 64, 1) + +SUBPEL_AVG_VARIANCE_WXH_NEON(64, 32, 1) +SUBPEL_AVG_VARIANCE_WXH_NEON(64, 64, 1) From 67abc6738942fff8299919e736138679d4a08016 Mon Sep 17 00:00:00 2001 From: Salome Thirot Date: Fri, 20 Jan 2023 11:42:06 +0000 Subject: [PATCH 0574/1000] Specialize Neon averaging subpel variance by filter value Use the same specialization for averaging subpel variance functions as used for the non-averaging variants. The rationale for the specialization is as follows: The optimal implementation of the bilinear interpolation depends on the filter values being used. For both horizontal and vertical interpolation this can simplify to just taking the source values, or averaging the source and reference values - which can be computed more easily than a bilinear interpolation with arbitrary filter values. This patch introduces tests to find the most optimal bilinear interpolation implementation based on the filter values being used. This new specialization is only used for larger block sizes This is a backport of this libaom change[1]. After this change, the only differences between the code in libvpx and libaom are due to libvpx being compiled with ISO C90, which forbids mixing declarations and code [-Wdeclaration-after-statement]. [1] https://aomedia-review.googlesource.com/c/aom/+/166962 Change-Id: I7860c852db94a7c9c3d72ae4411316685f3800a4 --- vpx_dsp/arm/subpel_variance_neon.c | 136 +++++++++++++++++++++++++++-- 1 file changed, 128 insertions(+), 8 deletions(-) diff --git a/vpx_dsp/arm/subpel_variance_neon.c b/vpx_dsp/arm/subpel_variance_neon.c index 237f7fad25..9328c3ed89 100644 --- a/vpx_dsp/arm/subpel_variance_neon.c +++ b/vpx_dsp/arm/subpel_variance_neon.c @@ -335,6 +335,66 @@ static void avg_pred_var_filter_block2d_bil_w64( filter_offset, second_pred); } +// Combine averaging subpel filter with vpx_comp_avg_pred. +static void avg_pred_var_filter_block2d_avg(const uint8_t *src_ptr, + uint8_t *dst_ptr, int src_stride, + int pixel_step, int dst_width, + int dst_height, + const uint8_t *second_pred) { + int i = dst_height; + + // We only specialize on the filter values for large block sizes (>= 16x16.) + assert(dst_width >= 16 && dst_width % 16 == 0); + + do { + int j = 0; + do { + uint8x16_t s0 = vld1q_u8(src_ptr + j); + uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step); + uint8x16_t avg = vrhaddq_u8(s0, s1); + + uint8x16_t p = vld1q_u8(second_pred); + avg = vrhaddq_u8(avg, p); + + vst1q_u8(dst_ptr + j, avg); + + j += 16; + second_pred += 16; + } while (j < dst_width); + + src_ptr += src_stride; + dst_ptr += dst_width; + } while (--i != 0); +} + +// Implementation of vpx_comp_avg_pred for blocks having width >= 16. +static void avg_pred(const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, + int dst_width, int dst_height, + const uint8_t *second_pred) { + int i = dst_height; + + // We only specialize on the filter values for large block sizes (>= 16x16.) + assert(dst_width >= 16 && dst_width % 16 == 0); + + do { + int j = 0; + do { + uint8x16_t s = vld1q_u8(src_ptr + j); + uint8x16_t p = vld1q_u8(second_pred); + + uint8x16_t avg = vrhaddq_u8(s, p); + + vst1q_u8(dst_ptr + j, avg); + + j += 16; + second_pred += 16; + } while (j < dst_width); + + src_ptr += src_stride; + dst_ptr += dst_width; + } while (--i != 0); +} + #define SUBPEL_AVG_VARIANCE_WXH_NEON(w, h, padding) \ unsigned int vpx_sub_pixel_avg_variance##w##x##h##_neon( \ const uint8_t *src, int source_stride, int xoffset, int yoffset, \ @@ -349,6 +409,66 @@ static void avg_pred_var_filter_block2d_bil_w64( return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ } +#define SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(w, h, padding) \ + unsigned int vpx_sub_pixel_avg_variance##w##x##h##_neon( \ + const uint8_t *src, int source_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, unsigned int *sse, \ + const uint8_t *second_pred) { \ + if (xoffset == 0) { \ + uint8_t tmp[w * h]; \ + if (yoffset == 0) { \ + avg_pred(src, tmp, source_stride, w, h, second_pred); \ + return vpx_variance##w##x##h##_neon(tmp, w, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + avg_pred_var_filter_block2d_avg(src, tmp, source_stride, \ + source_stride, w, h, second_pred); \ + return vpx_variance##w##x##h##_neon(tmp, w, ref, ref_stride, sse); \ + } else { \ + avg_pred_var_filter_block2d_bil_w##w( \ + src, tmp, source_stride, source_stride, h, yoffset, second_pred); \ + return vpx_variance##w##x##h##_neon(tmp, w, ref, ref_stride, sse); \ + } \ + } else if (xoffset == 4) { \ + uint8_t tmp0[w * (h + padding)]; \ + if (yoffset == 0) { \ + avg_pred_var_filter_block2d_avg(src, tmp0, source_stride, 1, w, h, \ + second_pred); \ + return vpx_variance##w##x##h##_neon(tmp0, w, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + uint8_t tmp1[w * (h + padding)]; \ + var_filter_block2d_avg(src, tmp0, source_stride, 1, w, (h + padding)); \ + avg_pred_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h, second_pred); \ + return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \ + } else { \ + uint8_t tmp1[w * (h + padding)]; \ + var_filter_block2d_avg(src, tmp0, source_stride, 1, w, (h + padding)); \ + avg_pred_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset, \ + second_pred); \ + return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \ + } \ + } else { \ + uint8_t tmp0[w * (h + padding)]; \ + if (yoffset == 0) { \ + avg_pred_var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, h, \ + xoffset, second_pred); \ + return vpx_variance##w##x##h##_neon(tmp0, w, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + uint8_t tmp1[w * h]; \ + var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, \ + (h + padding), xoffset); \ + avg_pred_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h, second_pred); \ + return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \ + } else { \ + uint8_t tmp1[w * h]; \ + var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, \ + (h + padding), xoffset); \ + avg_pred_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset, \ + second_pred); \ + return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \ + } \ + } \ + } + // 4x blocks are processed two rows at a time, so require an extra row of // padding. SUBPEL_AVG_VARIANCE_WXH_NEON(4, 4, 2) @@ -358,13 +478,13 @@ SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 1) SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 1) SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 1) -SUBPEL_AVG_VARIANCE_WXH_NEON(16, 8, 1) -SUBPEL_AVG_VARIANCE_WXH_NEON(16, 16, 1) -SUBPEL_AVG_VARIANCE_WXH_NEON(16, 32, 1) +SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 8, 1) +SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 16, 1) +SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 32, 1) -SUBPEL_AVG_VARIANCE_WXH_NEON(32, 16, 1) -SUBPEL_AVG_VARIANCE_WXH_NEON(32, 32, 1) -SUBPEL_AVG_VARIANCE_WXH_NEON(32, 64, 1) +SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 16, 1) +SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 32, 1) +SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 64, 1) -SUBPEL_AVG_VARIANCE_WXH_NEON(64, 32, 1) -SUBPEL_AVG_VARIANCE_WXH_NEON(64, 64, 1) +SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 32, 1) +SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 64, 1) From 72cfcdd95ab0d17c4b8b13d9da00d1458105bf80 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Tue, 24 Jan 2023 14:08:17 -0500 Subject: [PATCH 0575/1000] Skip calculating internal stats when frame dropped Bug: webm:1771 Change-Id: I30cd5b7ec0945b521a1cc03999d39ec6a25f1696 --- vp9/encoder/vp9_encoder.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 1c5c9fc923..b66fdc0bca 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -7945,7 +7945,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, #if CONFIG_INTERNAL_STATS - if (oxcf->pass != 1) { + if (oxcf->pass != 1 && !cpi->last_frame_dropped) { double samples = 0.0; cpi->bytes += (int)(*size); From 3384b83da0856df86bab0811e7b5a3495925ac70 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Tue, 24 Jan 2023 20:48:06 +0000 Subject: [PATCH 0576/1000] [NEON] Add Highbd FHT 8x8/16x16 functions In total this gives about 9% extra performance for both rt/best profiles. Furthermore, add transpose_s32 16x16 function Change-Id: Ib6f368bbb9af7f03c9ce0deba1664cef77632fe2 --- test/dct_test.cc | 4 + vp9/common/vp9_rtcd_defs.pl | 2 + vp9/encoder/arm/neon/vp9_dct_neon.c | 942 ++++++++++++++++++++++++++++ vpx_dsp/arm/fdct16x16_neon.c | 306 +++++++++ vpx_dsp/arm/fdct16x16_neon.h | 306 --------- vpx_dsp/arm/fdct8x8_neon.h | 78 +-- vpx_dsp/arm/fdct_neon.h | 119 ++++ vpx_dsp/arm/transpose_neon.h | 62 ++ 8 files changed, 1437 insertions(+), 382 deletions(-) diff --git a/test/dct_test.cc b/test/dct_test.cc index 0304029bd2..9a150a24f1 100644 --- a/test/dct_test.cc +++ b/test/dct_test.cc @@ -671,8 +671,12 @@ static const FuncInfo ht_neon_func_info[] = { 4, 2 }, { &vp9_highbd_fht8x8_c, &highbd_iht_wrapper, 8, 2 }, + { &vp9_highbd_fht8x8_neon, &highbd_iht_wrapper, + 8, 2 }, { &vp9_highbd_fht16x16_c, &highbd_iht_wrapper, 16, 2 }, + { &vp9_highbd_fht16x16_neon, + &highbd_iht_wrapper, 16, 2 }, #endif { &vp9_fht4x4_c, &iht_wrapper, 4, 1 }, { &vp9_fht4x4_neon, &iht_wrapper, 4, 1 }, diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index f4bd9772c3..20a482c85f 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -206,8 +206,10 @@ () specialize qw/vp9_highbd_fht4x4 neon/; add_proto qw/void vp9_highbd_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; + specialize qw/vp9_highbd_fht8x8 neon/; add_proto qw/void vp9_highbd_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; + specialize qw/vp9_highbd_fht16x16 neon/; add_proto qw/void vp9_highbd_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride"; diff --git a/vp9/encoder/arm/neon/vp9_dct_neon.c b/vp9/encoder/arm/neon/vp9_dct_neon.c index 5961be5f31..997b5477e1 100644 --- a/vp9/encoder/arm/neon/vp9_dct_neon.c +++ b/vp9/encoder/arm/neon/vp9_dct_neon.c @@ -20,6 +20,7 @@ #include "vpx_dsp/arm/fdct_neon.h" #include "vpx_dsp/arm/fdct4x4_neon.h" #include "vpx_dsp/arm/fdct8x8_neon.h" +#include "vpx_dsp/arm/fdct16x16_neon.h" static INLINE void load_buffer_4x4(const int16_t *input, int16x8_t *in, int stride) { @@ -1228,4 +1229,945 @@ void vp9_highbd_fht4x4_neon(const int16_t *input, tran_low_t *output, } } +static INLINE void highbd_load_buffer_8x8(const int16_t *input, + int32x4_t *lo /*[8]*/, + int32x4_t *hi /*[8]*/, int stride) { + int16x8_t in[8]; + in[0] = vld1q_s16(input + 0 * stride); + in[1] = vld1q_s16(input + 1 * stride); + in[2] = vld1q_s16(input + 2 * stride); + in[3] = vld1q_s16(input + 3 * stride); + in[4] = vld1q_s16(input + 4 * stride); + in[5] = vld1q_s16(input + 5 * stride); + in[6] = vld1q_s16(input + 6 * stride); + in[7] = vld1q_s16(input + 7 * stride); + lo[0] = vshll_n_s16(vget_low_s16(in[0]), 2); + hi[0] = vshll_n_s16(vget_high_s16(in[0]), 2); + lo[1] = vshll_n_s16(vget_low_s16(in[1]), 2); + hi[1] = vshll_n_s16(vget_high_s16(in[1]), 2); + lo[2] = vshll_n_s16(vget_low_s16(in[2]), 2); + hi[2] = vshll_n_s16(vget_high_s16(in[2]), 2); + lo[3] = vshll_n_s16(vget_low_s16(in[3]), 2); + hi[3] = vshll_n_s16(vget_high_s16(in[3]), 2); + lo[4] = vshll_n_s16(vget_low_s16(in[4]), 2); + hi[4] = vshll_n_s16(vget_high_s16(in[4]), 2); + lo[5] = vshll_n_s16(vget_low_s16(in[5]), 2); + hi[5] = vshll_n_s16(vget_high_s16(in[5]), 2); + lo[6] = vshll_n_s16(vget_low_s16(in[6]), 2); + hi[6] = vshll_n_s16(vget_high_s16(in[6]), 2); + lo[7] = vshll_n_s16(vget_low_s16(in[7]), 2); + hi[7] = vshll_n_s16(vget_high_s16(in[7]), 2); +} + +/* right shift and rounding + * first get the sign bit (bit 15). + * If bit == 1, it's the simple case of shifting right by one bit. + * If bit == 2, it essentially computes the expression: + * + * out[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2; + * + * for each row. + */ +static INLINE void highbd_right_shift_8x8(int32x4_t *lo, int32x4_t *hi, + const int bit) { + int32x4_t sign_lo[8], sign_hi[8]; + sign_lo[0] = vshrq_n_s32(lo[0], 31); + sign_hi[0] = vshrq_n_s32(hi[0], 31); + sign_lo[1] = vshrq_n_s32(lo[1], 31); + sign_hi[1] = vshrq_n_s32(hi[1], 31); + sign_lo[2] = vshrq_n_s32(lo[2], 31); + sign_hi[2] = vshrq_n_s32(hi[2], 31); + sign_lo[3] = vshrq_n_s32(lo[3], 31); + sign_hi[3] = vshrq_n_s32(hi[3], 31); + sign_lo[4] = vshrq_n_s32(lo[4], 31); + sign_hi[4] = vshrq_n_s32(hi[4], 31); + sign_lo[5] = vshrq_n_s32(lo[5], 31); + sign_hi[5] = vshrq_n_s32(hi[5], 31); + sign_lo[6] = vshrq_n_s32(lo[6], 31); + sign_hi[6] = vshrq_n_s32(hi[6], 31); + sign_lo[7] = vshrq_n_s32(lo[7], 31); + sign_hi[7] = vshrq_n_s32(hi[7], 31); + + if (bit == 2) { + const int32x4_t const_rounding = vdupq_n_s32(1); + lo[0] = vaddq_s32(lo[0], const_rounding); + hi[0] = vaddq_s32(hi[0], const_rounding); + lo[1] = vaddq_s32(lo[1], const_rounding); + hi[1] = vaddq_s32(hi[1], const_rounding); + lo[2] = vaddq_s32(lo[2], const_rounding); + hi[2] = vaddq_s32(hi[2], const_rounding); + lo[3] = vaddq_s32(lo[3], const_rounding); + hi[3] = vaddq_s32(hi[3], const_rounding); + lo[4] = vaddq_s32(lo[4], const_rounding); + hi[4] = vaddq_s32(hi[4], const_rounding); + lo[5] = vaddq_s32(lo[5], const_rounding); + hi[5] = vaddq_s32(hi[5], const_rounding); + lo[6] = vaddq_s32(lo[6], const_rounding); + hi[6] = vaddq_s32(hi[6], const_rounding); + lo[7] = vaddq_s32(lo[7], const_rounding); + hi[7] = vaddq_s32(hi[7], const_rounding); + } + + lo[0] = vsubq_s32(lo[0], sign_lo[0]); + hi[0] = vsubq_s32(hi[0], sign_hi[0]); + lo[1] = vsubq_s32(lo[1], sign_lo[1]); + hi[1] = vsubq_s32(hi[1], sign_hi[1]); + lo[2] = vsubq_s32(lo[2], sign_lo[2]); + hi[2] = vsubq_s32(hi[2], sign_hi[2]); + lo[3] = vsubq_s32(lo[3], sign_lo[3]); + hi[3] = vsubq_s32(hi[3], sign_hi[3]); + lo[4] = vsubq_s32(lo[4], sign_lo[4]); + hi[4] = vsubq_s32(hi[4], sign_hi[4]); + lo[5] = vsubq_s32(lo[5], sign_lo[5]); + hi[5] = vsubq_s32(hi[5], sign_hi[5]); + lo[6] = vsubq_s32(lo[6], sign_lo[6]); + hi[6] = vsubq_s32(hi[6], sign_hi[6]); + lo[7] = vsubq_s32(lo[7], sign_lo[7]); + hi[7] = vsubq_s32(hi[7], sign_hi[7]); + + if (bit == 1) { + lo[0] = vshrq_n_s32(lo[0], 1); + hi[0] = vshrq_n_s32(hi[0], 1); + lo[1] = vshrq_n_s32(lo[1], 1); + hi[1] = vshrq_n_s32(hi[1], 1); + lo[2] = vshrq_n_s32(lo[2], 1); + hi[2] = vshrq_n_s32(hi[2], 1); + lo[3] = vshrq_n_s32(lo[3], 1); + hi[3] = vshrq_n_s32(hi[3], 1); + lo[4] = vshrq_n_s32(lo[4], 1); + hi[4] = vshrq_n_s32(hi[4], 1); + lo[5] = vshrq_n_s32(lo[5], 1); + hi[5] = vshrq_n_s32(hi[5], 1); + lo[6] = vshrq_n_s32(lo[6], 1); + hi[6] = vshrq_n_s32(hi[6], 1); + lo[7] = vshrq_n_s32(lo[7], 1); + hi[7] = vshrq_n_s32(hi[7], 1); + } else { + lo[0] = vshrq_n_s32(lo[0], 2); + hi[0] = vshrq_n_s32(hi[0], 2); + lo[1] = vshrq_n_s32(lo[1], 2); + hi[1] = vshrq_n_s32(hi[1], 2); + lo[2] = vshrq_n_s32(lo[2], 2); + hi[2] = vshrq_n_s32(hi[2], 2); + lo[3] = vshrq_n_s32(lo[3], 2); + hi[3] = vshrq_n_s32(hi[3], 2); + lo[4] = vshrq_n_s32(lo[4], 2); + hi[4] = vshrq_n_s32(hi[4], 2); + lo[5] = vshrq_n_s32(lo[5], 2); + hi[5] = vshrq_n_s32(hi[5], 2); + lo[6] = vshrq_n_s32(lo[6], 2); + hi[6] = vshrq_n_s32(hi[6], 2); + lo[7] = vshrq_n_s32(lo[7], 2); + hi[7] = vshrq_n_s32(hi[7], 2); + } +} + +static INLINE void highbd_write_buffer_8x8(tran_low_t *output, int32x4_t *lo, + int32x4_t *hi, int stride) { + vst1q_s32(output + 0 * stride, lo[0]); + vst1q_s32(output + 0 * stride + 4, hi[0]); + vst1q_s32(output + 1 * stride, lo[1]); + vst1q_s32(output + 1 * stride + 4, hi[1]); + vst1q_s32(output + 2 * stride, lo[2]); + vst1q_s32(output + 2 * stride + 4, hi[2]); + vst1q_s32(output + 3 * stride, lo[3]); + vst1q_s32(output + 3 * stride + 4, hi[3]); + vst1q_s32(output + 4 * stride, lo[4]); + vst1q_s32(output + 4 * stride + 4, hi[4]); + vst1q_s32(output + 5 * stride, lo[5]); + vst1q_s32(output + 5 * stride + 4, hi[5]); + vst1q_s32(output + 6 * stride, lo[6]); + vst1q_s32(output + 6 * stride + 4, hi[6]); + vst1q_s32(output + 7 * stride, lo[7]); + vst1q_s32(output + 7 * stride + 4, hi[7]); +} + +static INLINE void highbd_fadst8x8_neon(int32x4_t *lo /*[8]*/, + int32x4_t *hi /*[8]*/) { + int32x4_t s_lo[8], s_hi[8]; + int32x4_t t_lo[8], t_hi[8]; + int32x4_t x_lo[8], x_hi[8]; + int64x2_t s64_lo[16], s64_hi[16]; + + x_lo[0] = lo[7]; + x_hi[0] = hi[7]; + x_lo[1] = lo[0]; + x_hi[1] = hi[0]; + x_lo[2] = lo[5]; + x_hi[2] = hi[5]; + x_lo[3] = lo[2]; + x_hi[3] = hi[2]; + x_lo[4] = lo[3]; + x_hi[4] = hi[3]; + x_lo[5] = lo[4]; + x_hi[5] = hi[4]; + x_lo[6] = lo[1]; + x_hi[6] = hi[1]; + x_lo[7] = lo[6]; + x_hi[7] = hi[6]; + + // stage 1 + // s0 = cospi_2_64 * x0 + cospi_30_64 * x1; + // s1 = cospi_30_64 * x0 - cospi_2_64 * x1; + butterfly_two_coeff_s32_s64_noround( + x_lo[0], x_hi[0], x_lo[1], x_hi[1], cospi_2_64, cospi_30_64, + &s64_lo[2 * 0], &s64_hi[2 * 0], &s64_lo[2 * 1], &s64_hi[2 * 1]); + // s2 = cospi_10_64 * x2 + cospi_22_64 * x3; + // s3 = cospi_22_64 * x2 - cospi_10_64 * x3; + butterfly_two_coeff_s32_s64_noround( + x_lo[2], x_hi[2], x_lo[3], x_hi[3], cospi_10_64, cospi_22_64, + &s64_lo[2 * 2], &s64_hi[2 * 2], &s64_lo[2 * 3], &s64_hi[2 * 3]); + + // s4 = cospi_18_64 * x4 + cospi_14_64 * x5; + // s5 = cospi_14_64 * x4 - cospi_18_64 * x5; + butterfly_two_coeff_s32_s64_noround( + x_lo[4], x_hi[4], x_lo[5], x_hi[5], cospi_18_64, cospi_14_64, + &s64_lo[2 * 4], &s64_hi[2 * 4], &s64_lo[2 * 5], &s64_hi[2 * 5]); + + // s6 = cospi_26_64 * x6 + cospi_6_64 * x7; + // s7 = cospi_6_64 * x6 - cospi_26_64 * x7; + butterfly_two_coeff_s32_s64_noround( + x_lo[6], x_hi[6], x_lo[7], x_hi[7], cospi_26_64, cospi_6_64, + &s64_lo[2 * 6], &s64_hi[2 * 6], &s64_lo[2 * 7], &s64_hi[2 * 7]); + + // fdct_round_shift, indices are doubled + t_lo[0] = add_s64_round_narrow(&s64_lo[2 * 0], &s64_lo[2 * 4]); + t_hi[0] = add_s64_round_narrow(&s64_hi[2 * 0], &s64_hi[2 * 4]); + t_lo[1] = add_s64_round_narrow(&s64_lo[2 * 1], &s64_lo[2 * 5]); + t_hi[1] = add_s64_round_narrow(&s64_hi[2 * 1], &s64_hi[2 * 5]); + t_lo[2] = add_s64_round_narrow(&s64_lo[2 * 2], &s64_lo[2 * 6]); + t_hi[2] = add_s64_round_narrow(&s64_hi[2 * 2], &s64_hi[2 * 6]); + t_lo[3] = add_s64_round_narrow(&s64_lo[2 * 3], &s64_lo[2 * 7]); + t_hi[3] = add_s64_round_narrow(&s64_hi[2 * 3], &s64_hi[2 * 7]); + t_lo[4] = sub_s64_round_narrow(&s64_lo[2 * 0], &s64_lo[2 * 4]); + t_hi[4] = sub_s64_round_narrow(&s64_hi[2 * 0], &s64_hi[2 * 4]); + t_lo[5] = sub_s64_round_narrow(&s64_lo[2 * 1], &s64_lo[2 * 5]); + t_hi[5] = sub_s64_round_narrow(&s64_hi[2 * 1], &s64_hi[2 * 5]); + t_lo[6] = sub_s64_round_narrow(&s64_lo[2 * 2], &s64_lo[2 * 6]); + t_hi[6] = sub_s64_round_narrow(&s64_hi[2 * 2], &s64_hi[2 * 6]); + t_lo[7] = sub_s64_round_narrow(&s64_lo[2 * 3], &s64_lo[2 * 7]); + t_hi[7] = sub_s64_round_narrow(&s64_hi[2 * 3], &s64_hi[2 * 7]); + + // stage 2 + s_lo[0] = t_lo[0]; + s_hi[0] = t_hi[0]; + s_lo[1] = t_lo[1]; + s_hi[1] = t_hi[1]; + s_lo[2] = t_lo[2]; + s_hi[2] = t_hi[2]; + s_lo[3] = t_lo[3]; + s_hi[3] = t_hi[3]; + // s4 = cospi_8_64 * x4 + cospi_24_64 * x5; + // s5 = cospi_24_64 * x4 - cospi_8_64 * x5; + butterfly_two_coeff_s32_s64_noround( + t_lo[4], t_hi[4], t_lo[5], t_hi[5], cospi_8_64, cospi_24_64, + &s64_lo[2 * 4], &s64_hi[2 * 4], &s64_lo[2 * 5], &s64_hi[2 * 5]); + + // s6 = -cospi_24_64 * x6 + cospi_8_64 * x7; + // s7 = cospi_8_64 * x6 + cospi_24_64 * x7; + butterfly_two_coeff_s32_s64_noround( + t_lo[6], t_hi[6], t_lo[7], t_hi[7], -cospi_24_64, cospi_8_64, + &s64_lo[2 * 6], &s64_hi[2 * 6], &s64_lo[2 * 7], &s64_hi[2 * 7]); + + // fdct_round_shift + // s0 + s2 + t_lo[0] = add_s32_s64_narrow(s_lo[0], s_lo[2]); + t_hi[0] = add_s32_s64_narrow(s_hi[0], s_hi[2]); + // s0 - s2 + t_lo[2] = sub_s32_s64_narrow(s_lo[0], s_lo[2]); + t_hi[2] = sub_s32_s64_narrow(s_hi[0], s_hi[2]); + + // s1 + s3 + t_lo[1] = add_s32_s64_narrow(s_lo[1], s_lo[3]); + t_hi[1] = add_s32_s64_narrow(s_hi[1], s_hi[3]); + // s1 - s3 + t_lo[3] = sub_s32_s64_narrow(s_lo[1], s_lo[3]); + t_hi[3] = sub_s32_s64_narrow(s_hi[1], s_hi[3]); + + // s4 + s6 + t_lo[4] = add_s64_round_narrow(&s64_lo[2 * 4], &s64_lo[2 * 6]); + t_hi[4] = add_s64_round_narrow(&s64_hi[2 * 4], &s64_hi[2 * 6]); + // s4 - s6 + t_lo[6] = sub_s64_round_narrow(&s64_lo[2 * 4], &s64_lo[2 * 6]); + t_hi[6] = sub_s64_round_narrow(&s64_hi[2 * 4], &s64_hi[2 * 6]); + + // s5 + s7 + t_lo[5] = add_s64_round_narrow(&s64_lo[2 * 5], &s64_lo[2 * 7]); + t_hi[5] = add_s64_round_narrow(&s64_hi[2 * 5], &s64_hi[2 * 7]); + // s5 - s7 + t_lo[7] = sub_s64_round_narrow(&s64_lo[2 * 5], &s64_lo[2 * 7]); + t_hi[7] = sub_s64_round_narrow(&s64_hi[2 * 5], &s64_hi[2 * 7]); + + // stage 3 + // s2 = cospi_16_64 * (x2 + x3) + // s3 = cospi_16_64 * (x2 - x3) + butterfly_one_coeff_s32_fast(t_lo[2], t_hi[2], t_lo[3], t_hi[3], cospi_16_64, + &s_lo[2], &s_hi[2], &s_lo[3], &s_hi[3]); + + // s6 = cospi_16_64 * (x6 + x7) + // s7 = cospi_16_64 * (x6 - x7) + butterfly_one_coeff_s32_fast(t_lo[6], t_hi[6], t_lo[7], t_hi[7], cospi_16_64, + &s_lo[6], &s_hi[6], &s_lo[7], &s_hi[7]); + + // x0, x2, x4, x6 pass through + lo[0] = t_lo[0]; + hi[0] = t_hi[0]; + lo[2] = s_lo[6]; + hi[2] = s_hi[6]; + lo[4] = s_lo[3]; + hi[4] = s_hi[3]; + lo[6] = t_lo[5]; + hi[6] = t_hi[5]; + + lo[1] = vnegq_s32(t_lo[4]); + hi[1] = vnegq_s32(t_hi[4]); + lo[3] = vnegq_s32(s_lo[2]); + hi[3] = vnegq_s32(s_hi[2]); + lo[5] = vnegq_s32(s_lo[7]); + hi[5] = vnegq_s32(s_hi[7]); + lo[7] = vnegq_s32(t_lo[1]); + hi[7] = vnegq_s32(t_hi[1]); + + transpose_s32_8x8_2(lo, hi, lo, hi); +} + +void vp9_highbd_fht8x8_neon(const int16_t *input, tran_low_t *output, + int stride, int tx_type) { + int32x4_t lo[8], hi[8]; + + switch (tx_type) { + case DCT_DCT: vpx_highbd_fdct8x8_neon(input, output, stride); break; + case ADST_DCT: + highbd_load_buffer_8x8(input, lo, hi, stride); + highbd_fadst8x8_neon(lo, hi); + // pass1 variant is not precise enough + vpx_highbd_fdct8x8_pass2_neon(lo, hi); + highbd_right_shift_8x8(lo, hi, 1); + highbd_write_buffer_8x8(output, lo, hi, 8); + break; + case DCT_ADST: + highbd_load_buffer_8x8(input, lo, hi, stride); + // pass1 variant is not precise enough + vpx_highbd_fdct8x8_pass2_neon(lo, hi); + highbd_fadst8x8_neon(lo, hi); + highbd_right_shift_8x8(lo, hi, 1); + highbd_write_buffer_8x8(output, lo, hi, 8); + break; + default: + assert(tx_type == ADST_ADST); + highbd_load_buffer_8x8(input, lo, hi, stride); + highbd_fadst8x8_neon(lo, hi); + highbd_fadst8x8_neon(lo, hi); + highbd_right_shift_8x8(lo, hi, 1); + highbd_write_buffer_8x8(output, lo, hi, 8); + break; + } +} + +static INLINE void highbd_load_buffer_16x16( + const int16_t *input, int32x4_t *left1 /*[16]*/, int32x4_t *right1 /*[16]*/, + int32x4_t *left2 /*[16]*/, int32x4_t *right2 /*[16]*/, int stride) { + // load first 8 columns + highbd_load_buffer_8x8(input, left1, right1, stride); + highbd_load_buffer_8x8(input + 8 * stride, left1 + 8, right1 + 8, stride); + + input += 8; + // load second 8 columns + highbd_load_buffer_8x8(input, left2, right2, stride); + highbd_load_buffer_8x8(input + 8 * stride, left2 + 8, right2 + 8, stride); +} + +static INLINE void highbd_write_buffer_16x16( + tran_low_t *output, int32x4_t *left1 /*[16]*/, int32x4_t *right1 /*[16]*/, + int32x4_t *left2 /*[16]*/, int32x4_t *right2 /*[16]*/, int stride) { + // write first 8 columns + highbd_write_buffer_8x8(output, left1, right1, stride); + highbd_write_buffer_8x8(output + 8 * stride, left1 + 8, right1 + 8, stride); + + // write second 8 columns + output += 8; + highbd_write_buffer_8x8(output, left2, right2, stride); + highbd_write_buffer_8x8(output + 8 * stride, left2 + 8, right2 + 8, stride); +} + +static INLINE void highbd_right_shift_16x16(int32x4_t *left1 /*[16]*/, + int32x4_t *right1 /*[16]*/, + int32x4_t *left2 /*[16]*/, + int32x4_t *right2 /*[16]*/, + const int bit) { + // perform rounding operations + highbd_right_shift_8x8(left1, right1, bit); + highbd_right_shift_8x8(left1 + 8, right1 + 8, bit); + highbd_right_shift_8x8(left2, right2, bit); + highbd_right_shift_8x8(left2 + 8, right2 + 8, bit); +} + +static void highbd_fdct16_8col(int32x4_t *left, int32x4_t *right) { + // perform 16x16 1-D DCT for 8 columns + int32x4_t s1_lo[8], s1_hi[8], s2_lo[8], s2_hi[8], s3_lo[8], s3_hi[8]; + int32x4_t left8[8], right8[8]; + + // stage 1 + left8[0] = vaddq_s32(left[0], left[15]); + right8[0] = vaddq_s32(right[0], right[15]); + left8[1] = vaddq_s32(left[1], left[14]); + right8[1] = vaddq_s32(right[1], right[14]); + left8[2] = vaddq_s32(left[2], left[13]); + right8[2] = vaddq_s32(right[2], right[13]); + left8[3] = vaddq_s32(left[3], left[12]); + right8[3] = vaddq_s32(right[3], right[12]); + left8[4] = vaddq_s32(left[4], left[11]); + right8[4] = vaddq_s32(right[4], right[11]); + left8[5] = vaddq_s32(left[5], left[10]); + right8[5] = vaddq_s32(right[5], right[10]); + left8[6] = vaddq_s32(left[6], left[9]); + right8[6] = vaddq_s32(right[6], right[9]); + left8[7] = vaddq_s32(left[7], left[8]); + right8[7] = vaddq_s32(right[7], right[8]); + + // step 1 + s1_lo[0] = vsubq_s32(left[7], left[8]); + s1_hi[0] = vsubq_s32(right[7], right[8]); + s1_lo[1] = vsubq_s32(left[6], left[9]); + s1_hi[1] = vsubq_s32(right[6], right[9]); + s1_lo[2] = vsubq_s32(left[5], left[10]); + s1_hi[2] = vsubq_s32(right[5], right[10]); + s1_lo[3] = vsubq_s32(left[4], left[11]); + s1_hi[3] = vsubq_s32(right[4], right[11]); + s1_lo[4] = vsubq_s32(left[3], left[12]); + s1_hi[4] = vsubq_s32(right[3], right[12]); + s1_lo[5] = vsubq_s32(left[2], left[13]); + s1_hi[5] = vsubq_s32(right[2], right[13]); + s1_lo[6] = vsubq_s32(left[1], left[14]); + s1_hi[6] = vsubq_s32(right[1], right[14]); + s1_lo[7] = vsubq_s32(left[0], left[15]); + s1_hi[7] = vsubq_s32(right[0], right[15]); + + // pass1 variant is not accurate enough + vpx_highbd_fdct8x8_pass2_notranspose_neon(left8, right8); + + // step 2 + // step2[2] = (step1[5] - step1[2]) * cospi_16_64; + // step2[5] = (step1[5] + step1[2]) * cospi_16_64; + butterfly_one_coeff_s32_s64_narrow(s1_lo[5], s1_hi[5], s1_lo[2], s1_hi[2], + cospi_16_64, &s2_lo[5], &s2_hi[5], + &s2_lo[2], &s2_hi[2]); + // step2[3] = (step1[4] - step1[3]) * cospi_16_64; + // step2[4] = (step1[4] + step1[3]) * cospi_16_64; + butterfly_one_coeff_s32_s64_narrow(s1_lo[4], s1_hi[4], s1_lo[3], s1_hi[3], + cospi_16_64, &s2_lo[4], &s2_hi[4], + &s2_lo[3], &s2_hi[3]); + + // step 3 + s3_lo[0] = vaddq_s32(s1_lo[0], s2_lo[3]); + s3_hi[0] = vaddq_s32(s1_hi[0], s2_hi[3]); + s3_lo[1] = vaddq_s32(s1_lo[1], s2_lo[2]); + s3_hi[1] = vaddq_s32(s1_hi[1], s2_hi[2]); + s3_lo[2] = vsubq_s32(s1_lo[1], s2_lo[2]); + s3_hi[2] = vsubq_s32(s1_hi[1], s2_hi[2]); + s3_lo[3] = vsubq_s32(s1_lo[0], s2_lo[3]); + s3_hi[3] = vsubq_s32(s1_hi[0], s2_hi[3]); + s3_lo[4] = vsubq_s32(s1_lo[7], s2_lo[4]); + s3_hi[4] = vsubq_s32(s1_hi[7], s2_hi[4]); + s3_lo[5] = vsubq_s32(s1_lo[6], s2_lo[5]); + s3_hi[5] = vsubq_s32(s1_hi[6], s2_hi[5]); + s3_lo[6] = vaddq_s32(s1_lo[6], s2_lo[5]); + s3_hi[6] = vaddq_s32(s1_hi[6], s2_hi[5]); + s3_lo[7] = vaddq_s32(s1_lo[7], s2_lo[4]); + s3_hi[7] = vaddq_s32(s1_hi[7], s2_hi[4]); + + // step 4 + // s2[1] = cospi_24_64 * s3[6] - cospi_8_64 * s3[1] + // s2[6] = cospi_8_64 * s3[6] + cospi_24_64 * s3[1] + butterfly_two_coeff_s32_s64_narrow(s3_lo[6], s3_hi[6], s3_lo[1], s3_hi[1], + cospi_8_64, cospi_24_64, &s2_lo[6], + &s2_hi[6], &s2_lo[1], &s2_hi[1]); + + // s2[5] = cospi_8_64 * s3[2] - cospi_24_64 * s3[5] + // s2[2] = cospi_24_64 * s3[2] + cospi_8_64 * s3[5] + butterfly_two_coeff_s32_s64_narrow(s3_lo[2], s3_hi[2], s3_lo[5], s3_hi[5], + cospi_24_64, cospi_8_64, &s2_lo[2], + &s2_hi[2], &s2_lo[5], &s2_hi[5]); + + // step 5 + s1_lo[0] = vaddq_s32(s3_lo[0], s2_lo[1]); + s1_hi[0] = vaddq_s32(s3_hi[0], s2_hi[1]); + s1_lo[1] = vsubq_s32(s3_lo[0], s2_lo[1]); + s1_hi[1] = vsubq_s32(s3_hi[0], s2_hi[1]); + s1_lo[2] = vaddq_s32(s3_lo[3], s2_lo[2]); + s1_hi[2] = vaddq_s32(s3_hi[3], s2_hi[2]); + s1_lo[3] = vsubq_s32(s3_lo[3], s2_lo[2]); + s1_hi[3] = vsubq_s32(s3_hi[3], s2_hi[2]); + s1_lo[4] = vsubq_s32(s3_lo[4], s2_lo[5]); + s1_hi[4] = vsubq_s32(s3_hi[4], s2_hi[5]); + s1_lo[5] = vaddq_s32(s3_lo[4], s2_lo[5]); + s1_hi[5] = vaddq_s32(s3_hi[4], s2_hi[5]); + s1_lo[6] = vsubq_s32(s3_lo[7], s2_lo[6]); + s1_hi[6] = vsubq_s32(s3_hi[7], s2_hi[6]); + s1_lo[7] = vaddq_s32(s3_lo[7], s2_lo[6]); + s1_hi[7] = vaddq_s32(s3_hi[7], s2_hi[6]); + + // step 6 + // out[1] = step1[7] * cospi_2_64 + step1[0] * cospi_30_64 + // out[15] = step1[7] * cospi_30_64 - step1[0] * cospi_2_64 + butterfly_two_coeff_s32_s64_narrow(s1_lo[7], s1_hi[7], s1_lo[0], s1_hi[0], + cospi_2_64, cospi_30_64, &left[1], + &right[1], &left[15], &right[15]); + + // out[9] = step1[6] * cospi_18_64 + step1[1] * cospi_14_64 + // out[7] = step1[6] * cospi_14_64 - step1[1] * cospi_18_64 + butterfly_two_coeff_s32_s64_narrow(s1_lo[6], s1_hi[6], s1_lo[1], s1_hi[1], + cospi_18_64, cospi_14_64, &left[9], + &right[9], &left[7], &right[7]); + + // out[5] = step1[5] * cospi_10_64 + step1[2] * cospi_22_64 + // out[11] = step1[5] * cospi_22_64 - step1[2] * cospi_10_64 + butterfly_two_coeff_s32_s64_narrow(s1_lo[5], s1_hi[5], s1_lo[2], s1_hi[2], + cospi_10_64, cospi_22_64, &left[5], + &right[5], &left[11], &right[11]); + + // out[13] = step1[4] * cospi_26_64 + step1[3] * cospi_6_64 + // out[3] = step1[4] * cospi_6_64 - step1[3] * cospi_26_64 + butterfly_two_coeff_s32_s64_narrow(s1_lo[4], s1_hi[4], s1_lo[3], s1_hi[3], + cospi_26_64, cospi_6_64, &left[13], + &right[13], &left[3], &right[3]); + + left[0] = left8[0]; + right[0] = right8[0]; + left[2] = left8[1]; + right[2] = right8[1]; + left[4] = left8[2]; + right[4] = right8[2]; + left[6] = left8[3]; + right[6] = right8[3]; + left[8] = left8[4]; + right[8] = right8[4]; + left[10] = left8[5]; + right[10] = right8[5]; + left[12] = left8[6]; + right[12] = right8[6]; + left[14] = left8[7]; + right[14] = right8[7]; +} + +static void highbd_fadst16_8col(int32x4_t *left, int32x4_t *right) { + // perform 16x16 1-D ADST for 8 columns + int32x4_t x_lo[16], x_hi[16]; + int32x4_t s_lo[16], s_hi[16]; + int32x4_t t_lo[16], t_hi[16]; + int64x2_t s64_lo[32], s64_hi[32]; + + x_lo[0] = left[15]; + x_hi[0] = right[15]; + x_lo[1] = left[0]; + x_hi[1] = right[0]; + x_lo[2] = left[13]; + x_hi[2] = right[13]; + x_lo[3] = left[2]; + x_hi[3] = right[2]; + x_lo[4] = left[11]; + x_hi[4] = right[11]; + x_lo[5] = left[4]; + x_hi[5] = right[4]; + x_lo[6] = left[9]; + x_hi[6] = right[9]; + x_lo[7] = left[6]; + x_hi[7] = right[6]; + x_lo[8] = left[7]; + x_hi[8] = right[7]; + x_lo[9] = left[8]; + x_hi[9] = right[8]; + x_lo[10] = left[5]; + x_hi[10] = right[5]; + x_lo[11] = left[10]; + x_hi[11] = right[10]; + x_lo[12] = left[3]; + x_hi[12] = right[3]; + x_lo[13] = left[12]; + x_hi[13] = right[12]; + x_lo[14] = left[1]; + x_hi[14] = right[1]; + x_lo[15] = left[14]; + x_hi[15] = right[14]; + + // stage 1, indices are doubled + // s0 = cospi_1_64 * x0 + cospi_31_64 * x1; + // s1 = cospi_31_64 * x0 - cospi_1_64 * x1; + butterfly_two_coeff_s32_s64_noround( + x_lo[0], x_hi[0], x_lo[1], x_hi[1], cospi_1_64, cospi_31_64, + &s64_lo[2 * 0], &s64_hi[2 * 0], &s64_lo[2 * 1], &s64_hi[2 * 1]); + // s2 = cospi_5_64 * x2 + cospi_27_64 * x3; + // s3 = cospi_27_64 * x2 - cospi_5_64 * x3; + butterfly_two_coeff_s32_s64_noround( + x_lo[2], x_hi[2], x_lo[3], x_hi[3], cospi_5_64, cospi_27_64, + &s64_lo[2 * 2], &s64_hi[2 * 2], &s64_lo[2 * 3], &s64_hi[2 * 3]); + // s4 = cospi_9_64 * x4 + cospi_23_64 * x5; + // s5 = cospi_23_64 * x4 - cospi_9_64 * x5; + butterfly_two_coeff_s32_s64_noround( + x_lo[4], x_hi[4], x_lo[5], x_hi[5], cospi_9_64, cospi_23_64, + &s64_lo[2 * 4], &s64_hi[2 * 4], &s64_lo[2 * 5], &s64_hi[2 * 5]); + // s6 = cospi_13_64 * x6 + cospi_19_64 * x7; + // s7 = cospi_19_64 * x6 - cospi_13_64 * x7; + butterfly_two_coeff_s32_s64_noround( + x_lo[6], x_hi[6], x_lo[7], x_hi[7], cospi_13_64, cospi_19_64, + &s64_lo[2 * 6], &s64_hi[2 * 6], &s64_lo[2 * 7], &s64_hi[2 * 7]); + // s8 = cospi_17_64 * x8 + cospi_15_64 * x9; + // s9 = cospi_15_64 * x8 - cospi_17_64 * x9; + butterfly_two_coeff_s32_s64_noround( + x_lo[8], x_hi[8], x_lo[9], x_hi[9], cospi_17_64, cospi_15_64, + &s64_lo[2 * 8], &s64_hi[2 * 8], &s64_lo[2 * 9], &s64_hi[2 * 9]); + // s10 = cospi_21_64 * x10 + cospi_11_64 * x11; + // s11 = cospi_11_64 * x10 - cospi_21_64 * x11; + butterfly_two_coeff_s32_s64_noround( + x_lo[10], x_hi[10], x_lo[11], x_hi[11], cospi_21_64, cospi_11_64, + &s64_lo[2 * 10], &s64_hi[2 * 10], &s64_lo[2 * 11], &s64_hi[2 * 11]); + // s12 = cospi_25_64 * x12 + cospi_7_64 * x13; + // s13 = cospi_7_64 * x12 - cospi_25_64 * x13; + butterfly_two_coeff_s32_s64_noround( + x_lo[12], x_hi[12], x_lo[13], x_hi[13], cospi_25_64, cospi_7_64, + &s64_lo[2 * 12], &s64_hi[2 * 12], &s64_lo[2 * 13], &s64_hi[2 * 13]); + // s14 = cospi_29_64 * x14 + cospi_3_64 * x15; + // s15 = cospi_3_64 * x14 - cospi_29_64 * x15; + butterfly_two_coeff_s32_s64_noround( + x_lo[14], x_hi[14], x_lo[15], x_hi[15], cospi_29_64, cospi_3_64, + &s64_lo[2 * 14], &s64_hi[2 * 14], &s64_lo[2 * 15], &s64_hi[2 * 15]); + + // fdct_round_shift, indices are doubled + t_lo[0] = add_s64_round_narrow(&s64_lo[2 * 0], &s64_lo[2 * 8]); + t_hi[0] = add_s64_round_narrow(&s64_hi[2 * 0], &s64_hi[2 * 8]); + t_lo[1] = add_s64_round_narrow(&s64_lo[2 * 1], &s64_lo[2 * 9]); + t_hi[1] = add_s64_round_narrow(&s64_hi[2 * 1], &s64_hi[2 * 9]); + t_lo[2] = add_s64_round_narrow(&s64_lo[2 * 2], &s64_lo[2 * 10]); + t_hi[2] = add_s64_round_narrow(&s64_hi[2 * 2], &s64_hi[2 * 10]); + t_lo[3] = add_s64_round_narrow(&s64_lo[2 * 3], &s64_lo[2 * 11]); + t_hi[3] = add_s64_round_narrow(&s64_hi[2 * 3], &s64_hi[2 * 11]); + t_lo[4] = add_s64_round_narrow(&s64_lo[2 * 4], &s64_lo[2 * 12]); + t_hi[4] = add_s64_round_narrow(&s64_hi[2 * 4], &s64_hi[2 * 12]); + t_lo[5] = add_s64_round_narrow(&s64_lo[2 * 5], &s64_lo[2 * 13]); + t_hi[5] = add_s64_round_narrow(&s64_hi[2 * 5], &s64_hi[2 * 13]); + t_lo[6] = add_s64_round_narrow(&s64_lo[2 * 6], &s64_lo[2 * 14]); + t_hi[6] = add_s64_round_narrow(&s64_hi[2 * 6], &s64_hi[2 * 14]); + t_lo[7] = add_s64_round_narrow(&s64_lo[2 * 7], &s64_lo[2 * 15]); + t_hi[7] = add_s64_round_narrow(&s64_hi[2 * 7], &s64_hi[2 * 15]); + t_lo[8] = sub_s64_round_narrow(&s64_lo[2 * 0], &s64_lo[2 * 8]); + t_hi[8] = sub_s64_round_narrow(&s64_hi[2 * 0], &s64_hi[2 * 8]); + t_lo[9] = sub_s64_round_narrow(&s64_lo[2 * 1], &s64_lo[2 * 9]); + t_hi[9] = sub_s64_round_narrow(&s64_hi[2 * 1], &s64_hi[2 * 9]); + t_lo[10] = sub_s64_round_narrow(&s64_lo[2 * 2], &s64_lo[2 * 10]); + t_hi[10] = sub_s64_round_narrow(&s64_hi[2 * 2], &s64_hi[2 * 10]); + t_lo[11] = sub_s64_round_narrow(&s64_lo[2 * 3], &s64_lo[2 * 11]); + t_hi[11] = sub_s64_round_narrow(&s64_hi[2 * 3], &s64_hi[2 * 11]); + t_lo[12] = sub_s64_round_narrow(&s64_lo[2 * 4], &s64_lo[2 * 12]); + t_hi[12] = sub_s64_round_narrow(&s64_hi[2 * 4], &s64_hi[2 * 12]); + t_lo[13] = sub_s64_round_narrow(&s64_lo[2 * 5], &s64_lo[2 * 13]); + t_hi[13] = sub_s64_round_narrow(&s64_hi[2 * 5], &s64_hi[2 * 13]); + t_lo[14] = sub_s64_round_narrow(&s64_lo[2 * 6], &s64_lo[2 * 14]); + t_hi[14] = sub_s64_round_narrow(&s64_hi[2 * 6], &s64_hi[2 * 14]); + t_lo[15] = sub_s64_round_narrow(&s64_lo[2 * 7], &s64_lo[2 * 15]); + t_hi[15] = sub_s64_round_narrow(&s64_hi[2 * 7], &s64_hi[2 * 15]); + + // stage 2 + s_lo[0] = t_lo[0]; + s_hi[0] = t_hi[0]; + s_lo[1] = t_lo[1]; + s_hi[1] = t_hi[1]; + s_lo[2] = t_lo[2]; + s_hi[2] = t_hi[2]; + s_lo[3] = t_lo[3]; + s_hi[3] = t_hi[3]; + s_lo[4] = t_lo[4]; + s_hi[4] = t_hi[4]; + s_lo[5] = t_lo[5]; + s_hi[5] = t_hi[5]; + s_lo[6] = t_lo[6]; + s_hi[6] = t_hi[6]; + s_lo[7] = t_lo[7]; + s_hi[7] = t_hi[7]; + // s8 = x8 * cospi_4_64 + x9 * cospi_28_64; + // s9 = x8 * cospi_28_64 - x9 * cospi_4_64; + butterfly_two_coeff_s32_s64_noround( + t_lo[8], t_hi[8], t_lo[9], t_hi[9], cospi_4_64, cospi_28_64, + &s64_lo[2 * 8], &s64_hi[2 * 8], &s64_lo[2 * 9], &s64_hi[2 * 9]); + // s10 = x10 * cospi_20_64 + x11 * cospi_12_64; + // s11 = x10 * cospi_12_64 - x11 * cospi_20_64; + butterfly_two_coeff_s32_s64_noround( + t_lo[10], t_hi[10], t_lo[11], t_hi[11], cospi_20_64, cospi_12_64, + &s64_lo[2 * 10], &s64_hi[2 * 10], &s64_lo[2 * 11], &s64_hi[2 * 11]); + // s12 = -x12 * cospi_28_64 + x13 * cospi_4_64; + // s13 = x12 * cospi_4_64 + x13 * cospi_28_64; + butterfly_two_coeff_s32_s64_noround( + t_lo[13], t_hi[13], t_lo[12], t_hi[12], cospi_28_64, cospi_4_64, + &s64_lo[2 * 13], &s64_hi[2 * 13], &s64_lo[2 * 12], &s64_hi[2 * 12]); + // s14 = -x14 * cospi_12_64 + x15 * cospi_20_64; + // s15 = x14 * cospi_20_64 + x15 * cospi_12_64; + butterfly_two_coeff_s32_s64_noround( + t_lo[15], t_hi[15], t_lo[14], t_hi[14], cospi_12_64, cospi_20_64, + &s64_lo[2 * 15], &s64_hi[2 * 15], &s64_lo[2 * 14], &s64_hi[2 * 14]); + + // s0 + s4 + t_lo[0] = add_s32_s64_narrow(s_lo[0], s_lo[4]); + t_hi[0] = add_s32_s64_narrow(s_hi[0], s_hi[4]); + // s1 + s5 + t_lo[1] = add_s32_s64_narrow(s_lo[1], s_lo[5]); + t_hi[1] = add_s32_s64_narrow(s_hi[1], s_hi[5]); + // s2 + s6 + t_lo[2] = add_s32_s64_narrow(s_lo[2], s_lo[6]); + t_hi[2] = add_s32_s64_narrow(s_hi[2], s_hi[6]); + // s3 + s7 + t_lo[3] = add_s32_s64_narrow(s_lo[3], s_lo[7]); + t_hi[3] = add_s32_s64_narrow(s_hi[3], s_hi[7]); + + // s0 - s4 + t_lo[4] = sub_s32_s64_narrow(s_lo[0], s_lo[4]); + t_hi[4] = sub_s32_s64_narrow(s_hi[0], s_hi[4]); + // s1 - s5 + t_lo[5] = sub_s32_s64_narrow(s_lo[1], s_lo[5]); + t_hi[5] = sub_s32_s64_narrow(s_hi[1], s_hi[5]); + // s2 - s6 + t_lo[6] = sub_s32_s64_narrow(s_lo[2], s_lo[6]); + t_hi[6] = sub_s32_s64_narrow(s_hi[2], s_hi[6]); + // s3 - s7 + t_lo[7] = sub_s32_s64_narrow(s_lo[3], s_lo[7]); + t_hi[7] = sub_s32_s64_narrow(s_hi[3], s_hi[7]); + + // fdct_round_shift() + // s8 + s12 + t_lo[8] = add_s64_round_narrow(&s64_lo[2 * 8], &s64_lo[2 * 12]); + t_hi[8] = add_s64_round_narrow(&s64_hi[2 * 8], &s64_hi[2 * 12]); + // s9 + s13 + t_lo[9] = add_s64_round_narrow(&s64_lo[2 * 9], &s64_lo[2 * 13]); + t_hi[9] = add_s64_round_narrow(&s64_hi[2 * 9], &s64_hi[2 * 13]); + // s10 + s14 + t_lo[10] = add_s64_round_narrow(&s64_lo[2 * 10], &s64_lo[2 * 14]); + t_hi[10] = add_s64_round_narrow(&s64_hi[2 * 10], &s64_hi[2 * 14]); + // s11 + s15 + t_lo[11] = add_s64_round_narrow(&s64_lo[2 * 11], &s64_lo[2 * 15]); + t_hi[11] = add_s64_round_narrow(&s64_hi[2 * 11], &s64_hi[2 * 15]); + + // s8 - s12 + t_lo[12] = sub_s64_round_narrow(&s64_lo[2 * 8], &s64_lo[2 * 12]); + t_hi[12] = sub_s64_round_narrow(&s64_hi[2 * 8], &s64_hi[2 * 12]); + // s9 - s13 + t_lo[13] = sub_s64_round_narrow(&s64_lo[2 * 9], &s64_lo[2 * 13]); + t_hi[13] = sub_s64_round_narrow(&s64_hi[2 * 9], &s64_hi[2 * 13]); + // s10 - s14 + t_lo[14] = sub_s64_round_narrow(&s64_lo[2 * 10], &s64_lo[2 * 14]); + t_hi[14] = sub_s64_round_narrow(&s64_hi[2 * 10], &s64_hi[2 * 14]); + // s11 - s15 + t_lo[15] = sub_s64_round_narrow(&s64_lo[2 * 11], &s64_lo[2 * 15]); + t_hi[15] = sub_s64_round_narrow(&s64_hi[2 * 11], &s64_hi[2 * 15]); + + // stage 3 + s_lo[0] = t_lo[0]; + s_hi[0] = t_hi[0]; + s_lo[1] = t_lo[1]; + s_hi[1] = t_hi[1]; + s_lo[2] = t_lo[2]; + s_hi[2] = t_hi[2]; + s_lo[3] = t_lo[3]; + s_hi[3] = t_hi[3]; + // s4 = x4 * cospi_8_64 + x5 * cospi_24_64; + // s5 = x4 * cospi_24_64 - x5 * cospi_8_64; + butterfly_two_coeff_s32_s64_noround( + t_lo[4], t_hi[4], t_lo[5], t_hi[5], cospi_8_64, cospi_24_64, + &s64_lo[2 * 4], &s64_hi[2 * 4], &s64_lo[2 * 5], &s64_hi[2 * 5]); + // s6 = -x6 * cospi_24_64 + x7 * cospi_8_64; + // s7 = x6 * cospi_8_64 + x7 * cospi_24_64; + butterfly_two_coeff_s32_s64_noround( + t_lo[7], t_hi[7], t_lo[6], t_hi[6], cospi_24_64, cospi_8_64, + &s64_lo[2 * 7], &s64_hi[2 * 7], &s64_lo[2 * 6], &s64_hi[2 * 6]); + s_lo[8] = t_lo[8]; + s_hi[8] = t_hi[8]; + s_lo[9] = t_lo[9]; + s_hi[9] = t_hi[9]; + s_lo[10] = t_lo[10]; + s_hi[10] = t_hi[10]; + s_lo[11] = t_lo[11]; + s_hi[11] = t_hi[11]; + // s12 = x12 * cospi_8_64 + x13 * cospi_24_64; + // s13 = x12 * cospi_24_64 - x13 * cospi_8_64; + butterfly_two_coeff_s32_s64_noround( + t_lo[12], t_hi[12], t_lo[13], t_hi[13], cospi_8_64, cospi_24_64, + &s64_lo[2 * 12], &s64_hi[2 * 12], &s64_lo[2 * 13], &s64_hi[2 * 13]); + // s14 = -x14 * cospi_24_64 + x15 * cospi_8_64; + // s15 = x14 * cospi_8_64 + x15 * cospi_24_64; + butterfly_two_coeff_s32_s64_noround( + t_lo[15], t_hi[15], t_lo[14], t_hi[14], cospi_24_64, cospi_8_64, + &s64_lo[2 * 15], &s64_hi[2 * 15], &s64_lo[2 * 14], &s64_hi[2 * 14]); + + // s0 + s2 + t_lo[0] = add_s32_s64_narrow(s_lo[0], s_lo[2]); + t_hi[0] = add_s32_s64_narrow(s_hi[0], s_hi[2]); + // s1 + s3 + t_lo[1] = add_s32_s64_narrow(s_lo[1], s_lo[3]); + t_hi[1] = add_s32_s64_narrow(s_hi[1], s_hi[3]); + // s0 - s2 + t_lo[2] = sub_s32_s64_narrow(s_lo[0], s_lo[2]); + t_hi[2] = sub_s32_s64_narrow(s_hi[0], s_hi[2]); + // s1 - s3 + t_lo[3] = sub_s32_s64_narrow(s_lo[1], s_lo[3]); + t_hi[3] = sub_s32_s64_narrow(s_hi[1], s_hi[3]); + // fdct_round_shift() + // s4 + s6 + t_lo[4] = add_s64_round_narrow(&s64_lo[2 * 4], &s64_lo[2 * 6]); + t_hi[4] = add_s64_round_narrow(&s64_hi[2 * 4], &s64_hi[2 * 6]); + // s5 + s7 + t_lo[5] = add_s64_round_narrow(&s64_lo[2 * 5], &s64_lo[2 * 7]); + t_hi[5] = add_s64_round_narrow(&s64_hi[2 * 5], &s64_hi[2 * 7]); + // s4 - s6 + t_lo[6] = sub_s64_round_narrow(&s64_lo[2 * 4], &s64_lo[2 * 6]); + t_hi[6] = sub_s64_round_narrow(&s64_hi[2 * 4], &s64_hi[2 * 6]); + // s5 - s7 + t_lo[7] = sub_s64_round_narrow(&s64_lo[2 * 5], &s64_lo[2 * 7]); + t_hi[7] = sub_s64_round_narrow(&s64_hi[2 * 5], &s64_hi[2 * 7]); + // s8 + s10 + t_lo[8] = add_s32_s64_narrow(s_lo[8], s_lo[10]); + t_hi[8] = add_s32_s64_narrow(s_hi[8], s_hi[10]); + // s9 + s11 + t_lo[9] = add_s32_s64_narrow(s_lo[9], s_lo[11]); + t_hi[9] = add_s32_s64_narrow(s_hi[9], s_hi[11]); + // s8 - s10 + t_lo[10] = sub_s32_s64_narrow(s_lo[8], s_lo[10]); + t_hi[10] = sub_s32_s64_narrow(s_hi[8], s_hi[10]); + // s9 - s11 + t_lo[11] = sub_s32_s64_narrow(s_lo[9], s_lo[11]); + t_hi[11] = sub_s32_s64_narrow(s_hi[9], s_hi[11]); + // fdct_round_shift() + // s12 + s14 + t_lo[12] = add_s64_round_narrow(&s64_lo[2 * 12], &s64_lo[2 * 14]); + t_hi[12] = add_s64_round_narrow(&s64_hi[2 * 12], &s64_hi[2 * 14]); + // s13 + s15 + t_lo[13] = add_s64_round_narrow(&s64_lo[2 * 13], &s64_lo[2 * 15]); + t_hi[13] = add_s64_round_narrow(&s64_hi[2 * 13], &s64_hi[2 * 15]); + // s12 - s14 + t_lo[14] = sub_s64_round_narrow(&s64_lo[2 * 12], &s64_lo[2 * 14]); + t_hi[14] = sub_s64_round_narrow(&s64_hi[2 * 12], &s64_hi[2 * 14]); + // s13 - s15 + t_lo[15] = sub_s64_round_narrow(&s64_lo[2 * 13], &s64_lo[2 * 15]); + t_hi[15] = sub_s64_round_narrow(&s64_hi[2 * 13], &s64_hi[2 * 15]); + + // stage 4, with fdct_round_shift + // s2 = (-cospi_16_64) * (x2 + x3); + // s3 = cospi_16_64 * (x2 - x3); + butterfly_one_coeff_s32_s64_narrow(t_lo[3], t_hi[3], t_lo[2], t_hi[2], + -cospi_16_64, &x_lo[2], &x_hi[2], &x_lo[3], + &x_hi[3]); + // s6 = cospi_16_64 * (x6 + x7); + // s7 = cospi_16_64 * (-x6 + x7); + butterfly_one_coeff_s32_s64_narrow(t_lo[7], t_hi[7], t_lo[6], t_hi[6], + cospi_16_64, &x_lo[6], &x_hi[6], &x_lo[7], + &x_hi[7]); + // s10 = cospi_16_64 * (x10 + x11); + // s11 = cospi_16_64 * (-x10 + x11); + butterfly_one_coeff_s32_s64_narrow(t_lo[11], t_hi[11], t_lo[10], t_hi[10], + cospi_16_64, &x_lo[10], &x_hi[10], + &x_lo[11], &x_hi[11]); + // s14 = (-cospi_16_64) * (x14 + x15); + // s15 = cospi_16_64 * (x14 - x15); + butterfly_one_coeff_s32_s64_narrow(t_lo[15], t_hi[15], t_lo[14], t_hi[14], + -cospi_16_64, &x_lo[14], &x_hi[14], + &x_lo[15], &x_hi[15]); + + // Just copy x0, x1, x4, x5, x8, x9, x12, x13 + x_lo[0] = t_lo[0]; + x_hi[0] = t_hi[0]; + x_lo[1] = t_lo[1]; + x_hi[1] = t_hi[1]; + x_lo[4] = t_lo[4]; + x_hi[4] = t_hi[4]; + x_lo[5] = t_lo[5]; + x_hi[5] = t_hi[5]; + x_lo[8] = t_lo[8]; + x_hi[8] = t_hi[8]; + x_lo[9] = t_lo[9]; + x_hi[9] = t_hi[9]; + x_lo[12] = t_lo[12]; + x_hi[12] = t_hi[12]; + x_lo[13] = t_lo[13]; + x_hi[13] = t_hi[13]; + + left[0] = x_lo[0]; + right[0] = x_hi[0]; + left[1] = vnegq_s32(x_lo[8]); + right[1] = vnegq_s32(x_hi[8]); + left[2] = x_lo[12]; + right[2] = x_hi[12]; + left[3] = vnegq_s32(x_lo[4]); + right[3] = vnegq_s32(x_hi[4]); + left[4] = x_lo[6]; + right[4] = x_hi[6]; + left[5] = x_lo[14]; + right[5] = x_hi[14]; + left[6] = x_lo[10]; + right[6] = x_hi[10]; + left[7] = x_lo[2]; + right[7] = x_hi[2]; + left[8] = x_lo[3]; + right[8] = x_hi[3]; + left[9] = x_lo[11]; + right[9] = x_hi[11]; + left[10] = x_lo[15]; + right[10] = x_hi[15]; + left[11] = x_lo[7]; + right[11] = x_hi[7]; + left[12] = x_lo[5]; + right[12] = x_hi[5]; + left[13] = vnegq_s32(x_lo[13]); + right[13] = vnegq_s32(x_hi[13]); + left[14] = x_lo[9]; + right[14] = x_hi[9]; + left[15] = vnegq_s32(x_lo[1]); + right[15] = vnegq_s32(x_hi[1]); +} + +static void highbd_fdct16x16_neon(int32x4_t *left1, int32x4_t *right1, + int32x4_t *left2, int32x4_t *right2) { + // Left half. + highbd_fdct16_8col(left1, right1); + // Right half. + highbd_fdct16_8col(left2, right2); + transpose_s32_16x16(left1, right1, left2, right2); +} + +static void highbd_fadst16x16_neon(int32x4_t *left1, int32x4_t *right1, + int32x4_t *left2, int32x4_t *right2) { + // Left half. + highbd_fadst16_8col(left1, right1); + // Right half. + highbd_fadst16_8col(left2, right2); + transpose_s32_16x16(left1, right1, left2, right2); +} + +void vp9_highbd_fht16x16_neon(const int16_t *input, tran_low_t *output, + int stride, int tx_type) { + int32x4_t left1[16], right1[16], left2[16], right2[16]; + + switch (tx_type) { + case DCT_DCT: vpx_highbd_fdct16x16_neon(input, output, stride); break; + case ADST_DCT: + highbd_load_buffer_16x16(input, left1, right1, left2, right2, stride); + highbd_fadst16x16_neon(left1, right1, left2, right2); + highbd_write_buffer_16x16(output, left1, right1, left2, right2, 16); + highbd_right_shift_16x16(left1, right1, left2, right2, 2); + highbd_fdct16x16_neon(left1, right1, left2, right2); + highbd_write_buffer_16x16(output, left1, right1, left2, right2, 16); + break; + case DCT_ADST: + highbd_load_buffer_16x16(input, left1, right1, left2, right2, stride); + highbd_fdct16x16_neon(left1, right1, left2, right2); + highbd_right_shift_16x16(left1, right1, left2, right2, 2); + highbd_fadst16x16_neon(left1, right1, left2, right2); + highbd_write_buffer_16x16(output, left1, right1, left2, right2, 16); + break; + default: + assert(tx_type == ADST_ADST); + highbd_load_buffer_16x16(input, left1, right1, left2, right2, stride); + highbd_fadst16x16_neon(left1, right1, left2, right2); + highbd_right_shift_16x16(left1, right1, left2, right2, 2); + highbd_fadst16x16_neon(left1, right1, left2, right2); + highbd_write_buffer_16x16(output, left1, right1, left2, right2, 16); + break; + } +} + #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/arm/fdct16x16_neon.c b/vpx_dsp/arm/fdct16x16_neon.c index a458ecaa41..8a8aaa1ed4 100644 --- a/vpx_dsp/arm/fdct16x16_neon.c +++ b/vpx_dsp/arm/fdct16x16_neon.c @@ -28,6 +28,124 @@ void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) { #else +// Main body of fdct16x16. +static void vpx_fdct8x16_body(const int16x8_t *in /*[16]*/, + int16x8_t *out /*[16]*/) { + int16x8_t s[8]; + int16x8_t x[4]; + int16x8_t step[8]; + + // stage 1 + // From fwd_txfm.c: Work on the first eight values; fdct8(input, + // even_results);" + s[0] = vaddq_s16(in[0], in[7]); + s[1] = vaddq_s16(in[1], in[6]); + s[2] = vaddq_s16(in[2], in[5]); + s[3] = vaddq_s16(in[3], in[4]); + s[4] = vsubq_s16(in[3], in[4]); + s[5] = vsubq_s16(in[2], in[5]); + s[6] = vsubq_s16(in[1], in[6]); + s[7] = vsubq_s16(in[0], in[7]); + + // fdct4(step, step); + x[0] = vaddq_s16(s[0], s[3]); + x[1] = vaddq_s16(s[1], s[2]); + x[2] = vsubq_s16(s[1], s[2]); + x[3] = vsubq_s16(s[0], s[3]); + + // out[0] = fdct_round_shift((x0 + x1) * cospi_16_64) + // out[8] = fdct_round_shift((x0 - x1) * cospi_16_64) + butterfly_one_coeff_s16_s32_fast_narrow(x[0], x[1], cospi_16_64, &out[0], + &out[8]); + // out[4] = fdct_round_shift(x3 * cospi_8_64 + x2 * cospi_24_64); + // out[12] = fdct_round_shift(x3 * cospi_24_64 - x2 * cospi_8_64); + butterfly_two_coeff(x[3], x[2], cospi_8_64, cospi_24_64, &out[4], &out[12]); + + // Stage 2 + // Re-using source s5/s6 + // s5 = fdct_round_shift((s6 - s5) * cospi_16_64) + // s6 = fdct_round_shift((s6 + s5) * cospi_16_64) + butterfly_one_coeff_s16_fast(s[6], s[5], cospi_16_64, &s[6], &s[5]); + + // Stage 3 + x[0] = vaddq_s16(s[4], s[5]); + x[1] = vsubq_s16(s[4], s[5]); + x[2] = vsubq_s16(s[7], s[6]); + x[3] = vaddq_s16(s[7], s[6]); + + // Stage 4 + // out[2] = fdct_round_shift(x3 * cospi_4_64 + x0 * cospi_28_64) + // out[14] = fdct_round_shift(x3 * cospi_28_64 - x0 * cospi_4_64) + butterfly_two_coeff(x[3], x[0], cospi_4_64, cospi_28_64, &out[2], &out[14]); + // out[6] = fdct_round_shift(x2 * cospi_20_64 + x1 * cospi_12_64) + // out[10] = fdct_round_shift(x2 * cospi_12_64 - x1 * cospi_20_64) + butterfly_two_coeff(x[2], x[1], cospi_20_64, cospi_12_64, &out[10], &out[6]); + + // step 2 + // From fwd_txfm.c: Work on the next eight values; step1 -> odd_results" + // That file distinguished between "in_high" and "step1" but the only + // difference is that "in_high" is the first 8 values and "step 1" is the + // second. Here, since they are all in one array, "step1" values are += 8. + + // step2[2] = fdct_round_shift((step1[5] - step1[2]) * cospi_16_64) + // step2[3] = fdct_round_shift((step1[4] - step1[3]) * cospi_16_64) + // step2[4] = fdct_round_shift((step1[4] + step1[3]) * cospi_16_64) + // step2[5] = fdct_round_shift((step1[5] + step1[2]) * cospi_16_64) + butterfly_one_coeff_s16_fast(in[13], in[10], cospi_16_64, &s[5], &s[2]); + butterfly_one_coeff_s16_fast(in[12], in[11], cospi_16_64, &s[4], &s[3]); + + // step 3 + s[0] = vaddq_s16(in[8], s[3]); + s[1] = vaddq_s16(in[9], s[2]); + x[0] = vsubq_s16(in[9], s[2]); + x[1] = vsubq_s16(in[8], s[3]); + x[2] = vsubq_s16(in[15], s[4]); + x[3] = vsubq_s16(in[14], s[5]); + s[6] = vaddq_s16(in[14], s[5]); + s[7] = vaddq_s16(in[15], s[4]); + + // step 4 + // step2[6] = fdct_round_shift(step3[6] * cospi_8_64 + step3[1] * + // cospi_24_64) step2[1] = fdct_round_shift(step3[6] * cospi_24_64 - step3[1] + // * cospi_8_64) + butterfly_two_coeff(s[6], s[1], cospi_8_64, cospi_24_64, &s[6], &s[1]); + + // step2[2] = fdct_round_shift(step3[2] * cospi_24_64 + step3[5] * cospi_8_64) + // step2[5] = fdct_round_shift(step3[2] * cospi_8_64 - step3[5] * + // cospi_24_64) + butterfly_two_coeff(x[0], x[3], cospi_24_64, cospi_8_64, &s[2], &s[5]); + + // step 5 + step[0] = vaddq_s16(s[0], s[1]); + step[1] = vsubq_s16(s[0], s[1]); + step[2] = vaddq_s16(x[1], s[2]); + step[3] = vsubq_s16(x[1], s[2]); + step[4] = vsubq_s16(x[2], s[5]); + step[5] = vaddq_s16(x[2], s[5]); + step[6] = vsubq_s16(s[7], s[6]); + step[7] = vaddq_s16(s[7], s[6]); + + // step 6 + // out[9] = fdct_round_shift(step1[6] * cospi_18_64 + step1[1] * cospi_14_64) + // out[7] = fdct_round_shift(step1[6] * cospi_14_64 - step1[1] * cospi_18_64) + butterfly_two_coeff(step[6], step[1], cospi_18_64, cospi_14_64, &out[9], + &out[7]); + // out[1] = fdct_round_shift(step1[7] * cospi_2_64 + step1[0] * cospi_30_64) + // out[15] = fdct_round_shift(step1[7] * cospi_30_64 - step1[0] * cospi_2_64) + butterfly_two_coeff(step[7], step[0], cospi_2_64, cospi_30_64, &out[1], + &out[15]); + + // out[13] = fdct_round_shift(step1[4] * cospi_26_64 + step1[3] * cospi_6_64) + // out[3] = fdct_round_shift(step1[4] * cospi_6_64 - step1[3] * cospi_26_64) + butterfly_two_coeff(step[4], step[3], cospi_26_64, cospi_6_64, &out[13], + &out[3]); + + // out[5] = fdct_round_shift(step1[5] * cospi_10_64 + step1[2] * cospi_22_64) + // out[11] = fdct_round_shift(step1[5] * cospi_22_64 - step1[2] * cospi_10_64) + butterfly_two_coeff(step[5], step[2], cospi_10_64, cospi_22_64, &out[5], + &out[11]); +} + void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) { int16x8_t temp0[16]; int16x8_t temp1[16]; @@ -79,6 +197,194 @@ void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) { #if CONFIG_VP9_HIGHBITDEPTH +// Main body of fdct8x16 column +static void vpx_highbd_fdct8x16_body(int32x4_t *left /*[16]*/, + int32x4_t *right /* [16] */) { + int32x4_t sl[8]; + int32x4_t sr[8]; + int32x4_t xl[4]; + int32x4_t xr[4]; + int32x4_t inl[8]; + int32x4_t inr[8]; + int32x4_t stepl[8]; + int32x4_t stepr[8]; + + // stage 1 + // From fwd_txfm.c: Work on the first eight values; fdct8(input, + // even_results);" + sl[0] = vaddq_s32(left[0], left[7]); + sr[0] = vaddq_s32(right[0], right[7]); + sl[1] = vaddq_s32(left[1], left[6]); + sr[1] = vaddq_s32(right[1], right[6]); + sl[2] = vaddq_s32(left[2], left[5]); + sr[2] = vaddq_s32(right[2], right[5]); + sl[3] = vaddq_s32(left[3], left[4]); + sr[3] = vaddq_s32(right[3], right[4]); + sl[4] = vsubq_s32(left[3], left[4]); + sr[4] = vsubq_s32(right[3], right[4]); + sl[5] = vsubq_s32(left[2], left[5]); + sr[5] = vsubq_s32(right[2], right[5]); + sl[6] = vsubq_s32(left[1], left[6]); + sr[6] = vsubq_s32(right[1], right[6]); + sl[7] = vsubq_s32(left[0], left[7]); + sr[7] = vsubq_s32(right[0], right[7]); + + // Copy values 8-15 as we're storing in-place + inl[0] = left[8]; + inr[0] = right[8]; + inl[1] = left[9]; + inr[1] = right[9]; + inl[2] = left[10]; + inr[2] = right[10]; + inl[3] = left[11]; + inr[3] = right[11]; + inl[4] = left[12]; + inr[4] = right[12]; + inl[5] = left[13]; + inr[5] = right[13]; + inl[6] = left[14]; + inr[6] = right[14]; + inl[7] = left[15]; + inr[7] = right[15]; + + // fdct4(step, step); + xl[0] = vaddq_s32(sl[0], sl[3]); + xr[0] = vaddq_s32(sr[0], sr[3]); + xl[1] = vaddq_s32(sl[1], sl[2]); + xr[1] = vaddq_s32(sr[1], sr[2]); + xl[2] = vsubq_s32(sl[1], sl[2]); + xr[2] = vsubq_s32(sr[1], sr[2]); + xl[3] = vsubq_s32(sl[0], sl[3]); + xr[3] = vsubq_s32(sr[0], sr[3]); + + // out[0] = fdct_round_shift((x0 + x1) * cospi_16_64) + // out[8] = fdct_round_shift((x0 - x1) * cospi_16_64) + butterfly_one_coeff_s32_fast(xl[0], xr[0], xl[1], xr[1], cospi_16_64, + &left[0], &right[0], &left[8], &right[8]); + + // out[4] = fdct_round_shift(x3 * cospi_8_64 + x2 * cospi_24_64); + // out[12] = fdct_round_shift(x3 * cospi_24_64 - x2 * cospi_8_64); + butterfly_two_coeff_s32_s64_narrow(xl[3], xr[3], xl[2], xr[2], cospi_8_64, + cospi_24_64, &left[4], &right[4], + &left[12], &right[12]); + + // Stage 2 + // Re-using source s5/s6 + // s5 = fdct_round_shift((s6 - s5) * cospi_16_64) + // s6 = fdct_round_shift((s6 + s5) * cospi_16_64) + butterfly_one_coeff_s32_fast(sl[6], sr[6], sl[5], sr[5], cospi_16_64, &sl[6], + &sr[6], &sl[5], &sr[5]); + + // Stage 3 + xl[0] = vaddq_s32(sl[4], sl[5]); + xr[0] = vaddq_s32(sr[4], sr[5]); + xl[1] = vsubq_s32(sl[4], sl[5]); + xr[1] = vsubq_s32(sr[4], sr[5]); + xl[2] = vsubq_s32(sl[7], sl[6]); + xr[2] = vsubq_s32(sr[7], sr[6]); + xl[3] = vaddq_s32(sl[7], sl[6]); + xr[3] = vaddq_s32(sr[7], sr[6]); + + // Stage 4 + // out[2] = fdct_round_shift(x3 * cospi_4_64 + x0 * cospi_28_64) + // out[14] = fdct_round_shift(x3 * cospi_28_64 - x0 * cospi_4_64) + butterfly_two_coeff_s32_s64_narrow(xl[3], xr[3], xl[0], xr[0], cospi_4_64, + cospi_28_64, &left[2], &right[2], + &left[14], &right[14]); + // out[6] = fdct_round_shift(x2 * cospi_20_64 + x1 * cospi_12_64) + // out[10] = fdct_round_shift(x2 * cospi_12_64 - x1 * cospi_20_64) + butterfly_two_coeff_s32_s64_narrow(xl[2], xr[2], xl[1], xr[1], cospi_20_64, + cospi_12_64, &left[10], &right[10], + &left[6], &right[6]); + + // step 2 + // From fwd_txfm.c: Work on the next eight values; step1 -> odd_results" + // That file distinguished between "in_high" and "step1" but the only + // difference is that "in_high" is the first 8 values and "step 1" is the + // second. Here, since they are all in one array, "step1" values are += 8. + + // step2[2] = fdct_round_shift((step1[5] - step1[2]) * cospi_16_64) + // step2[3] = fdct_round_shift((step1[4] - step1[3]) * cospi_16_64) + // step2[4] = fdct_round_shift((step1[4] + step1[3]) * cospi_16_64) + // step2[5] = fdct_round_shift((step1[5] + step1[2]) * cospi_16_64) + butterfly_one_coeff_s32_fast(inl[5], inr[5], inl[2], inr[2], cospi_16_64, + &sl[5], &sr[5], &sl[2], &sr[2]); + butterfly_one_coeff_s32_fast(inl[4], inr[4], inl[3], inr[3], cospi_16_64, + &sl[4], &sr[4], &sl[3], &sr[3]); + + // step 3 + sl[0] = vaddq_s32(inl[0], sl[3]); + sr[0] = vaddq_s32(inr[0], sr[3]); + sl[1] = vaddq_s32(inl[1], sl[2]); + sr[1] = vaddq_s32(inr[1], sr[2]); + xl[0] = vsubq_s32(inl[1], sl[2]); + xr[0] = vsubq_s32(inr[1], sr[2]); + xl[1] = vsubq_s32(inl[0], sl[3]); + xr[1] = vsubq_s32(inr[0], sr[3]); + xl[2] = vsubq_s32(inl[7], sl[4]); + xr[2] = vsubq_s32(inr[7], sr[4]); + xl[3] = vsubq_s32(inl[6], sl[5]); + xr[3] = vsubq_s32(inr[6], sr[5]); + sl[6] = vaddq_s32(inl[6], sl[5]); + sr[6] = vaddq_s32(inr[6], sr[5]); + sl[7] = vaddq_s32(inl[7], sl[4]); + sr[7] = vaddq_s32(inr[7], sr[4]); + + // step 4 + // step2[6] = fdct_round_shift(step3[6] * cospi_8_64 + step3[1] * + // cospi_24_64) step2[1] = fdct_round_shift(step3[6] * cospi_24_64 - step3[1] + // * cospi_8_64) + butterfly_two_coeff_s32_s64_narrow(sl[6], sr[6], sl[1], sr[1], cospi_8_64, + cospi_24_64, &sl[6], &sr[6], &sl[1], + &sr[1]); + // step2[2] = fdct_round_shift(step3[2] * cospi_24_64 + step3[5] * cospi_8_64) + // step2[5] = fdct_round_shift(step3[2] * cospi_8_64 - step3[5] * + // cospi_24_64) + butterfly_two_coeff_s32_s64_narrow(xl[0], xr[0], xl[3], xr[3], cospi_24_64, + cospi_8_64, &sl[2], &sr[2], &sl[5], + &sr[5]); + + // step 5 + stepl[0] = vaddq_s32(sl[0], sl[1]); + stepr[0] = vaddq_s32(sr[0], sr[1]); + stepl[1] = vsubq_s32(sl[0], sl[1]); + stepr[1] = vsubq_s32(sr[0], sr[1]); + stepl[2] = vaddq_s32(xl[1], sl[2]); + stepr[2] = vaddq_s32(xr[1], sr[2]); + stepl[3] = vsubq_s32(xl[1], sl[2]); + stepr[3] = vsubq_s32(xr[1], sr[2]); + stepl[4] = vsubq_s32(xl[2], sl[5]); + stepr[4] = vsubq_s32(xr[2], sr[5]); + stepl[5] = vaddq_s32(xl[2], sl[5]); + stepr[5] = vaddq_s32(xr[2], sr[5]); + stepl[6] = vsubq_s32(sl[7], sl[6]); + stepr[6] = vsubq_s32(sr[7], sr[6]); + stepl[7] = vaddq_s32(sl[7], sl[6]); + stepr[7] = vaddq_s32(sr[7], sr[6]); + + // step 6 + // out[9] = fdct_round_shift(step1[6] * cospi_18_64 + step1[1] * cospi_14_64) + // out[7] = fdct_round_shift(step1[6] * cospi_14_64 - step1[1] * cospi_18_64) + butterfly_two_coeff_s32_s64_narrow(stepl[6], stepr[6], stepl[1], stepr[1], + cospi_18_64, cospi_14_64, &left[9], + &right[9], &left[7], &right[7]); + // out[1] = fdct_round_shift(step1[7] * cospi_2_64 + step1[0] * cospi_30_64) + // out[15] = fdct_round_shift(step1[7] * cospi_30_64 - step1[0] * cospi_2_64) + butterfly_two_coeff_s32_s64_narrow(stepl[7], stepr[7], stepl[0], stepr[0], + cospi_2_64, cospi_30_64, &left[1], + &right[1], &left[15], &right[15]); + // out[13] = fdct_round_shift(step1[4] * cospi_26_64 + step1[3] * cospi_6_64) + // out[3] = fdct_round_shift(step1[4] * cospi_6_64 - step1[3] * cospi_26_64) + butterfly_two_coeff_s32_s64_narrow(stepl[4], stepr[4], stepl[3], stepr[3], + cospi_26_64, cospi_6_64, &left[13], + &right[13], &left[3], &right[3]); + // out[5] = fdct_round_shift(step1[5] * cospi_10_64 + step1[2] * cospi_22_64) + // out[11] = fdct_round_shift(step1[5] * cospi_22_64 - step1[2] * cospi_10_64) + butterfly_two_coeff_s32_s64_narrow(stepl[5], stepr[5], stepl[2], stepr[2], + cospi_10_64, cospi_22_64, &left[5], + &right[5], &left[11], &right[11]); +} + void vpx_highbd_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) { int16x8_t temp0[16]; diff --git a/vpx_dsp/arm/fdct16x16_neon.h b/vpx_dsp/arm/fdct16x16_neon.h index 43d820b6bd..cd58675ca4 100644 --- a/vpx_dsp/arm/fdct16x16_neon.h +++ b/vpx_dsp/arm/fdct16x16_neon.h @@ -159,124 +159,6 @@ static INLINE void partial_round_shift(int16x8_t *a /*[16]*/) { a[15] = vshrq_n_s16(vaddq_s16(a[15], one), 2); } -// Main body of fdct16x16. -static void vpx_fdct8x16_body(const int16x8_t *in /*[16]*/, - int16x8_t *out /*[16]*/) { - int16x8_t s[8]; - int16x8_t x[4]; - int16x8_t step[8]; - - // stage 1 - // From fwd_txfm.c: Work on the first eight values; fdct8(input, - // even_results);" - s[0] = vaddq_s16(in[0], in[7]); - s[1] = vaddq_s16(in[1], in[6]); - s[2] = vaddq_s16(in[2], in[5]); - s[3] = vaddq_s16(in[3], in[4]); - s[4] = vsubq_s16(in[3], in[4]); - s[5] = vsubq_s16(in[2], in[5]); - s[6] = vsubq_s16(in[1], in[6]); - s[7] = vsubq_s16(in[0], in[7]); - - // fdct4(step, step); - x[0] = vaddq_s16(s[0], s[3]); - x[1] = vaddq_s16(s[1], s[2]); - x[2] = vsubq_s16(s[1], s[2]); - x[3] = vsubq_s16(s[0], s[3]); - - // out[0] = fdct_round_shift((x0 + x1) * cospi_16_64) - // out[8] = fdct_round_shift((x0 - x1) * cospi_16_64) - butterfly_one_coeff_s16_s32_fast_narrow(x[0], x[1], cospi_16_64, &out[0], - &out[8]); - // out[4] = fdct_round_shift(x3 * cospi_8_64 + x2 * cospi_24_64); - // out[12] = fdct_round_shift(x3 * cospi_24_64 - x2 * cospi_8_64); - butterfly_two_coeff(x[3], x[2], cospi_8_64, cospi_24_64, &out[4], &out[12]); - - // Stage 2 - // Re-using source s5/s6 - // s5 = fdct_round_shift((s6 - s5) * cospi_16_64) - // s6 = fdct_round_shift((s6 + s5) * cospi_16_64) - butterfly_one_coeff_s16_fast(s[6], s[5], cospi_16_64, &s[6], &s[5]); - - // Stage 3 - x[0] = vaddq_s16(s[4], s[5]); - x[1] = vsubq_s16(s[4], s[5]); - x[2] = vsubq_s16(s[7], s[6]); - x[3] = vaddq_s16(s[7], s[6]); - - // Stage 4 - // out[2] = fdct_round_shift(x3 * cospi_4_64 + x0 * cospi_28_64) - // out[14] = fdct_round_shift(x3 * cospi_28_64 - x0 * cospi_4_64) - butterfly_two_coeff(x[3], x[0], cospi_4_64, cospi_28_64, &out[2], &out[14]); - // out[6] = fdct_round_shift(x2 * cospi_20_64 + x1 * cospi_12_64) - // out[10] = fdct_round_shift(x2 * cospi_12_64 - x1 * cospi_20_64) - butterfly_two_coeff(x[2], x[1], cospi_20_64, cospi_12_64, &out[10], &out[6]); - - // step 2 - // From fwd_txfm.c: Work on the next eight values; step1 -> odd_results" - // That file distinguished between "in_high" and "step1" but the only - // difference is that "in_high" is the first 8 values and "step 1" is the - // second. Here, since they are all in one array, "step1" values are += 8. - - // step2[2] = fdct_round_shift((step1[5] - step1[2]) * cospi_16_64) - // step2[3] = fdct_round_shift((step1[4] - step1[3]) * cospi_16_64) - // step2[4] = fdct_round_shift((step1[4] + step1[3]) * cospi_16_64) - // step2[5] = fdct_round_shift((step1[5] + step1[2]) * cospi_16_64) - butterfly_one_coeff_s16_fast(in[13], in[10], cospi_16_64, &s[5], &s[2]); - butterfly_one_coeff_s16_fast(in[12], in[11], cospi_16_64, &s[4], &s[3]); - - // step 3 - s[0] = vaddq_s16(in[8], s[3]); - s[1] = vaddq_s16(in[9], s[2]); - x[0] = vsubq_s16(in[9], s[2]); - x[1] = vsubq_s16(in[8], s[3]); - x[2] = vsubq_s16(in[15], s[4]); - x[3] = vsubq_s16(in[14], s[5]); - s[6] = vaddq_s16(in[14], s[5]); - s[7] = vaddq_s16(in[15], s[4]); - - // step 4 - // step2[6] = fdct_round_shift(step3[6] * cospi_8_64 + step3[1] * - // cospi_24_64) step2[1] = fdct_round_shift(step3[6] * cospi_24_64 - step3[1] - // * cospi_8_64) - butterfly_two_coeff(s[6], s[1], cospi_8_64, cospi_24_64, &s[6], &s[1]); - - // step2[2] = fdct_round_shift(step3[2] * cospi_24_64 + step3[5] * cospi_8_64) - // step2[5] = fdct_round_shift(step3[2] * cospi_8_64 - step3[5] * - // cospi_24_64) - butterfly_two_coeff(x[0], x[3], cospi_24_64, cospi_8_64, &s[2], &s[5]); - - // step 5 - step[0] = vaddq_s16(s[0], s[1]); - step[1] = vsubq_s16(s[0], s[1]); - step[2] = vaddq_s16(x[1], s[2]); - step[3] = vsubq_s16(x[1], s[2]); - step[4] = vsubq_s16(x[2], s[5]); - step[5] = vaddq_s16(x[2], s[5]); - step[6] = vsubq_s16(s[7], s[6]); - step[7] = vaddq_s16(s[7], s[6]); - - // step 6 - // out[9] = fdct_round_shift(step1[6] * cospi_18_64 + step1[1] * cospi_14_64) - // out[7] = fdct_round_shift(step1[6] * cospi_14_64 - step1[1] * cospi_18_64) - butterfly_two_coeff(step[6], step[1], cospi_18_64, cospi_14_64, &out[9], - &out[7]); - // out[1] = fdct_round_shift(step1[7] * cospi_2_64 + step1[0] * cospi_30_64) - // out[15] = fdct_round_shift(step1[7] * cospi_30_64 - step1[0] * cospi_2_64) - butterfly_two_coeff(step[7], step[0], cospi_2_64, cospi_30_64, &out[1], - &out[15]); - - // out[13] = fdct_round_shift(step1[4] * cospi_26_64 + step1[3] * cospi_6_64) - // out[3] = fdct_round_shift(step1[4] * cospi_6_64 - step1[3] * cospi_26_64) - butterfly_two_coeff(step[4], step[3], cospi_26_64, cospi_6_64, &out[13], - &out[3]); - - // out[5] = fdct_round_shift(step1[5] * cospi_10_64 + step1[2] * cospi_22_64) - // out[11] = fdct_round_shift(step1[5] * cospi_22_64 - step1[2] * cospi_10_64) - butterfly_two_coeff(step[5], step[2], cospi_10_64, cospi_22_64, &out[5], - &out[11]); -} - #if CONFIG_VP9_HIGHBITDEPTH static INLINE void highbd_scale_input(const int16x8_t *a /*[16]*/, @@ -431,194 +313,6 @@ static INLINE void store16_s32(tran_low_t *a, const int32x4_t *b /*[32]*/) { vst1q_s32(a, b[15]); } -// Main body of fdct8x16 column -static void vpx_highbd_fdct8x16_body(int32x4_t *left /*[16]*/, - int32x4_t *right /* [16] */) { - int32x4_t sl[8]; - int32x4_t sr[8]; - int32x4_t xl[4]; - int32x4_t xr[4]; - int32x4_t inl[8]; - int32x4_t inr[8]; - int32x4_t stepl[8]; - int32x4_t stepr[8]; - - // stage 1 - // From fwd_txfm.c: Work on the first eight values; fdct8(input, - // even_results);" - sl[0] = vaddq_s32(left[0], left[7]); - sr[0] = vaddq_s32(right[0], right[7]); - sl[1] = vaddq_s32(left[1], left[6]); - sr[1] = vaddq_s32(right[1], right[6]); - sl[2] = vaddq_s32(left[2], left[5]); - sr[2] = vaddq_s32(right[2], right[5]); - sl[3] = vaddq_s32(left[3], left[4]); - sr[3] = vaddq_s32(right[3], right[4]); - sl[4] = vsubq_s32(left[3], left[4]); - sr[4] = vsubq_s32(right[3], right[4]); - sl[5] = vsubq_s32(left[2], left[5]); - sr[5] = vsubq_s32(right[2], right[5]); - sl[6] = vsubq_s32(left[1], left[6]); - sr[6] = vsubq_s32(right[1], right[6]); - sl[7] = vsubq_s32(left[0], left[7]); - sr[7] = vsubq_s32(right[0], right[7]); - - // Copy values 8-15 as we're storing in-place - inl[0] = left[8]; - inr[0] = right[8]; - inl[1] = left[9]; - inr[1] = right[9]; - inl[2] = left[10]; - inr[2] = right[10]; - inl[3] = left[11]; - inr[3] = right[11]; - inl[4] = left[12]; - inr[4] = right[12]; - inl[5] = left[13]; - inr[5] = right[13]; - inl[6] = left[14]; - inr[6] = right[14]; - inl[7] = left[15]; - inr[7] = right[15]; - - // fdct4(step, step); - xl[0] = vaddq_s32(sl[0], sl[3]); - xr[0] = vaddq_s32(sr[0], sr[3]); - xl[1] = vaddq_s32(sl[1], sl[2]); - xr[1] = vaddq_s32(sr[1], sr[2]); - xl[2] = vsubq_s32(sl[1], sl[2]); - xr[2] = vsubq_s32(sr[1], sr[2]); - xl[3] = vsubq_s32(sl[0], sl[3]); - xr[3] = vsubq_s32(sr[0], sr[3]); - - // out[0] = fdct_round_shift((x0 + x1) * cospi_16_64) - // out[8] = fdct_round_shift((x0 - x1) * cospi_16_64) - butterfly_one_coeff_s32_fast(xl[0], xr[0], xl[1], xr[1], cospi_16_64, - &left[0], &right[0], &left[8], &right[8]); - - // out[4] = fdct_round_shift(x3 * cospi_8_64 + x2 * cospi_24_64); - // out[12] = fdct_round_shift(x3 * cospi_24_64 - x2 * cospi_8_64); - butterfly_two_coeff_s32_s64_narrow(xl[3], xr[3], xl[2], xr[2], cospi_8_64, - cospi_24_64, &left[4], &right[4], - &left[12], &right[12]); - - // Stage 2 - // Re-using source s5/s6 - // s5 = fdct_round_shift((s6 - s5) * cospi_16_64) - // s6 = fdct_round_shift((s6 + s5) * cospi_16_64) - butterfly_one_coeff_s32_fast(sl[6], sr[6], sl[5], sr[5], cospi_16_64, &sl[6], - &sr[6], &sl[5], &sr[5]); - - // Stage 3 - xl[0] = vaddq_s32(sl[4], sl[5]); - xr[0] = vaddq_s32(sr[4], sr[5]); - xl[1] = vsubq_s32(sl[4], sl[5]); - xr[1] = vsubq_s32(sr[4], sr[5]); - xl[2] = vsubq_s32(sl[7], sl[6]); - xr[2] = vsubq_s32(sr[7], sr[6]); - xl[3] = vaddq_s32(sl[7], sl[6]); - xr[3] = vaddq_s32(sr[7], sr[6]); - - // Stage 4 - // out[2] = fdct_round_shift(x3 * cospi_4_64 + x0 * cospi_28_64) - // out[14] = fdct_round_shift(x3 * cospi_28_64 - x0 * cospi_4_64) - butterfly_two_coeff_s32_s64_narrow(xl[3], xr[3], xl[0], xr[0], cospi_4_64, - cospi_28_64, &left[2], &right[2], - &left[14], &right[14]); - // out[6] = fdct_round_shift(x2 * cospi_20_64 + x1 * cospi_12_64) - // out[10] = fdct_round_shift(x2 * cospi_12_64 - x1 * cospi_20_64) - butterfly_two_coeff_s32_s64_narrow(xl[2], xr[2], xl[1], xr[1], cospi_20_64, - cospi_12_64, &left[10], &right[10], - &left[6], &right[6]); - - // step 2 - // From fwd_txfm.c: Work on the next eight values; step1 -> odd_results" - // That file distinguished between "in_high" and "step1" but the only - // difference is that "in_high" is the first 8 values and "step 1" is the - // second. Here, since they are all in one array, "step1" values are += 8. - - // step2[2] = fdct_round_shift((step1[5] - step1[2]) * cospi_16_64) - // step2[3] = fdct_round_shift((step1[4] - step1[3]) * cospi_16_64) - // step2[4] = fdct_round_shift((step1[4] + step1[3]) * cospi_16_64) - // step2[5] = fdct_round_shift((step1[5] + step1[2]) * cospi_16_64) - butterfly_one_coeff_s32_fast(inl[5], inr[5], inl[2], inr[2], cospi_16_64, - &sl[5], &sr[5], &sl[2], &sr[2]); - butterfly_one_coeff_s32_fast(inl[4], inr[4], inl[3], inr[3], cospi_16_64, - &sl[4], &sr[4], &sl[3], &sr[3]); - - // step 3 - sl[0] = vaddq_s32(inl[0], sl[3]); - sr[0] = vaddq_s32(inr[0], sr[3]); - sl[1] = vaddq_s32(inl[1], sl[2]); - sr[1] = vaddq_s32(inr[1], sr[2]); - xl[0] = vsubq_s32(inl[1], sl[2]); - xr[0] = vsubq_s32(inr[1], sr[2]); - xl[1] = vsubq_s32(inl[0], sl[3]); - xr[1] = vsubq_s32(inr[0], sr[3]); - xl[2] = vsubq_s32(inl[7], sl[4]); - xr[2] = vsubq_s32(inr[7], sr[4]); - xl[3] = vsubq_s32(inl[6], sl[5]); - xr[3] = vsubq_s32(inr[6], sr[5]); - sl[6] = vaddq_s32(inl[6], sl[5]); - sr[6] = vaddq_s32(inr[6], sr[5]); - sl[7] = vaddq_s32(inl[7], sl[4]); - sr[7] = vaddq_s32(inr[7], sr[4]); - - // step 4 - // step2[6] = fdct_round_shift(step3[6] * cospi_8_64 + step3[1] * - // cospi_24_64) step2[1] = fdct_round_shift(step3[6] * cospi_24_64 - step3[1] - // * cospi_8_64) - butterfly_two_coeff_s32_s64_narrow(sl[6], sr[6], sl[1], sr[1], cospi_8_64, - cospi_24_64, &sl[6], &sr[6], &sl[1], - &sr[1]); - // step2[2] = fdct_round_shift(step3[2] * cospi_24_64 + step3[5] * cospi_8_64) - // step2[5] = fdct_round_shift(step3[2] * cospi_8_64 - step3[5] * - // cospi_24_64) - butterfly_two_coeff_s32_s64_narrow(xl[0], xr[0], xl[3], xr[3], cospi_24_64, - cospi_8_64, &sl[2], &sr[2], &sl[5], - &sr[5]); - - // step 5 - stepl[0] = vaddq_s32(sl[0], sl[1]); - stepr[0] = vaddq_s32(sr[0], sr[1]); - stepl[1] = vsubq_s32(sl[0], sl[1]); - stepr[1] = vsubq_s32(sr[0], sr[1]); - stepl[2] = vaddq_s32(xl[1], sl[2]); - stepr[2] = vaddq_s32(xr[1], sr[2]); - stepl[3] = vsubq_s32(xl[1], sl[2]); - stepr[3] = vsubq_s32(xr[1], sr[2]); - stepl[4] = vsubq_s32(xl[2], sl[5]); - stepr[4] = vsubq_s32(xr[2], sr[5]); - stepl[5] = vaddq_s32(xl[2], sl[5]); - stepr[5] = vaddq_s32(xr[2], sr[5]); - stepl[6] = vsubq_s32(sl[7], sl[6]); - stepr[6] = vsubq_s32(sr[7], sr[6]); - stepl[7] = vaddq_s32(sl[7], sl[6]); - stepr[7] = vaddq_s32(sr[7], sr[6]); - - // step 6 - // out[9] = fdct_round_shift(step1[6] * cospi_18_64 + step1[1] * cospi_14_64) - // out[7] = fdct_round_shift(step1[6] * cospi_14_64 - step1[1] * cospi_18_64) - butterfly_two_coeff_s32_s64_narrow(stepl[6], stepr[6], stepl[1], stepr[1], - cospi_18_64, cospi_14_64, &left[9], - &right[9], &left[7], &right[7]); - // out[1] = fdct_round_shift(step1[7] * cospi_2_64 + step1[0] * cospi_30_64) - // out[15] = fdct_round_shift(step1[7] * cospi_30_64 - step1[0] * cospi_2_64) - butterfly_two_coeff_s32_s64_narrow(stepl[7], stepr[7], stepl[0], stepr[0], - cospi_2_64, cospi_30_64, &left[1], - &right[1], &left[15], &right[15]); - // out[13] = fdct_round_shift(step1[4] * cospi_26_64 + step1[3] * cospi_6_64) - // out[3] = fdct_round_shift(step1[4] * cospi_6_64 - step1[3] * cospi_26_64) - butterfly_two_coeff_s32_s64_narrow(stepl[4], stepr[4], stepl[3], stepr[3], - cospi_26_64, cospi_6_64, &left[13], - &right[13], &left[3], &right[3]); - // out[5] = fdct_round_shift(step1[5] * cospi_10_64 + step1[2] * cospi_22_64) - // out[11] = fdct_round_shift(step1[5] * cospi_22_64 - step1[2] * cospi_10_64) - butterfly_two_coeff_s32_s64_narrow(stepl[5], stepr[5], stepl[2], stepr[2], - cospi_10_64, cospi_22_64, &left[5], - &right[5], &left[11], &right[11]); -} - #endif // CONFIG_VP9_HIGHBITDEPTH #endif // VPX_VPX_DSP_ARM_FDCT16X16_NEON_H_ diff --git a/vpx_dsp/arm/fdct8x8_neon.h b/vpx_dsp/arm/fdct8x8_neon.h index d8fa600448..cc65157430 100644 --- a/vpx_dsp/arm/fdct8x8_neon.h +++ b/vpx_dsp/arm/fdct8x8_neon.h @@ -293,88 +293,14 @@ static INLINE void vpx_highbd_fdct8x8_pass2_notranspose_neon(int32x4_t *left, static INLINE void vpx_highbd_fdct8x8_pass1_neon(int32x4_t *left, int32x4_t *right) { - int32x4x2_t out[8]; vpx_highbd_fdct8x8_pass1_notranspose_neon(left, right); - - out[0].val[0] = left[0]; - out[0].val[1] = right[0]; - out[1].val[0] = left[1]; - out[1].val[1] = right[1]; - out[2].val[0] = left[2]; - out[2].val[1] = right[2]; - out[3].val[0] = left[3]; - out[3].val[1] = right[3]; - out[4].val[0] = left[4]; - out[4].val[1] = right[4]; - out[5].val[0] = left[5]; - out[5].val[1] = right[5]; - out[6].val[0] = left[6]; - out[6].val[1] = right[6]; - out[7].val[0] = left[7]; - out[7].val[1] = right[7]; - - transpose_s32_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5], - &out[6], &out[7]); - - left[0] = out[0].val[0]; - right[0] = out[0].val[1]; - left[1] = out[1].val[0]; - right[1] = out[1].val[1]; - left[2] = out[2].val[0]; - right[2] = out[2].val[1]; - left[3] = out[3].val[0]; - right[3] = out[3].val[1]; - left[4] = out[4].val[0]; - right[4] = out[4].val[1]; - left[5] = out[5].val[0]; - right[5] = out[5].val[1]; - left[6] = out[6].val[0]; - right[6] = out[6].val[1]; - left[7] = out[7].val[0]; - right[7] = out[7].val[1]; + transpose_s32_8x8_2(left, right, left, right); } static INLINE void vpx_highbd_fdct8x8_pass2_neon(int32x4_t *left, int32x4_t *right) { - int32x4x2_t out[8]; vpx_highbd_fdct8x8_pass2_notranspose_neon(left, right); - - out[0].val[0] = left[0]; - out[0].val[1] = right[0]; - out[1].val[0] = left[1]; - out[1].val[1] = right[1]; - out[2].val[0] = left[2]; - out[2].val[1] = right[2]; - out[3].val[0] = left[3]; - out[3].val[1] = right[3]; - out[4].val[0] = left[4]; - out[4].val[1] = right[4]; - out[5].val[0] = left[5]; - out[5].val[1] = right[5]; - out[6].val[0] = left[6]; - out[6].val[1] = right[6]; - out[7].val[0] = left[7]; - out[7].val[1] = right[7]; - - transpose_s32_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5], - &out[6], &out[7]); - - left[0] = out[0].val[0]; - right[0] = out[0].val[1]; - left[1] = out[1].val[0]; - right[1] = out[1].val[1]; - left[2] = out[2].val[0]; - right[2] = out[2].val[1]; - left[3] = out[3].val[0]; - right[3] = out[3].val[1]; - left[4] = out[4].val[0]; - right[4] = out[4].val[1]; - left[5] = out[5].val[0]; - right[5] = out[5].val[1]; - left[6] = out[6].val[0]; - right[6] = out[6].val[1]; - left[7] = out[7].val[0]; - right[7] = out[7].val[1]; + transpose_s32_8x8_2(left, right, left, right); } #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/arm/fdct_neon.h b/vpx_dsp/arm/fdct_neon.h index 193594e3dc..16f5c5fc0e 100644 --- a/vpx_dsp/arm/fdct_neon.h +++ b/vpx_dsp/arm/fdct_neon.h @@ -177,6 +177,45 @@ static INLINE void butterfly_one_coeff_s32_fast( *sub_hi = vqrdmulhq_s32(vsubq_s32(a_hi, b_hi), c); } +// fdct_round_shift((a +/- b) * c) +// Variant that performs normal implementation on full vector +// more accurate does 64-bit processing, takes and returns 32-bit values +// returns narrowed results +static INLINE void butterfly_one_coeff_s32_s64_narrow( + const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo, + const int32x4_t b_hi, const tran_coef_t constant, int32x4_t *add_lo, + int32x4_t *add_hi, int32x4_t *sub_lo, int32x4_t *sub_hi) { + // ac holds the following values: + // ac: vget_low_s32(a_lo) * c, vget_high_s32(a_lo) * c, + // vget_low_s32(a_hi) * c, vget_high_s32(a_hi) * c + int64x2_t ac[4]; + int64x2_t sum[4]; + int64x2_t diff[4]; + + ac[0] = vmull_n_s32(vget_low_s32(a_lo), constant); + ac[1] = vmull_n_s32(vget_high_s32(a_lo), constant); + ac[2] = vmull_n_s32(vget_low_s32(a_hi), constant); + ac[3] = vmull_n_s32(vget_high_s32(a_hi), constant); + + sum[0] = vmlal_n_s32(ac[0], vget_low_s32(b_lo), constant); + sum[1] = vmlal_n_s32(ac[1], vget_high_s32(b_lo), constant); + sum[2] = vmlal_n_s32(ac[2], vget_low_s32(b_hi), constant); + sum[3] = vmlal_n_s32(ac[3], vget_high_s32(b_hi), constant); + *add_lo = vcombine_s32(vrshrn_n_s64(sum[0], DCT_CONST_BITS), + vrshrn_n_s64(sum[1], DCT_CONST_BITS)); + *add_hi = vcombine_s32(vrshrn_n_s64(sum[2], DCT_CONST_BITS), + vrshrn_n_s64(sum[3], DCT_CONST_BITS)); + + diff[0] = vmlsl_n_s32(ac[0], vget_low_s32(b_lo), constant); + diff[1] = vmlsl_n_s32(ac[1], vget_high_s32(b_lo), constant); + diff[2] = vmlsl_n_s32(ac[2], vget_low_s32(b_hi), constant); + diff[3] = vmlsl_n_s32(ac[3], vget_high_s32(b_hi), constant); + *sub_lo = vcombine_s32(vrshrn_n_s64(diff[0], DCT_CONST_BITS), + vrshrn_n_s64(diff[1], DCT_CONST_BITS)); + *sub_hi = vcombine_s32(vrshrn_n_s64(diff[2], DCT_CONST_BITS), + vrshrn_n_s64(diff[3], DCT_CONST_BITS)); +} + // fdct_round_shift(a * c1 +/- b * c2) // Variant that performs normal implementation on half vector // more accurate does 64-bit processing, takes and returns 32-bit values @@ -205,6 +244,44 @@ static INLINE void butterfly_two_coeff_s32_s64_narrow_half( vrshrn_n_s64(diff_hi, DCT_CONST_BITS)); } +// fdct_round_shift(a * c1 +/- b * c2) +// Variant that performs normal implementation on full vector +// more accurate does 64-bit processing, takes and returns 64-bit values +// returns results without rounding +static INLINE void butterfly_two_coeff_s32_s64_noround( + const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo, + const int32x4_t b_hi, const tran_coef_t constant1, + const tran_coef_t constant2, int64x2_t *add_lo /*[2]*/, + int64x2_t *add_hi /*[2]*/, int64x2_t *sub_lo /*[2]*/, + int64x2_t *sub_hi /*[2]*/) { + // ac1/ac2 hold the following values: + // ac1: vget_low_s32(a_lo) * c1, vget_high_s32(a_lo) * c1, + // vget_low_s32(a_hi) * c1, vget_high_s32(a_hi) * c1 + // ac2: vget_low_s32(a_lo) * c2, vget_high_s32(a_lo) * c2, + // vget_low_s32(a_hi) * c2, vget_high_s32(a_hi) * c2 + int64x2_t ac1[4]; + int64x2_t ac2[4]; + + ac1[0] = vmull_n_s32(vget_low_s32(a_lo), constant1); + ac1[1] = vmull_n_s32(vget_high_s32(a_lo), constant1); + ac1[2] = vmull_n_s32(vget_low_s32(a_hi), constant1); + ac1[3] = vmull_n_s32(vget_high_s32(a_hi), constant1); + ac2[0] = vmull_n_s32(vget_low_s32(a_lo), constant2); + ac2[1] = vmull_n_s32(vget_high_s32(a_lo), constant2); + ac2[2] = vmull_n_s32(vget_low_s32(a_hi), constant2); + ac2[3] = vmull_n_s32(vget_high_s32(a_hi), constant2); + + add_lo[0] = vmlal_n_s32(ac1[0], vget_low_s32(b_lo), constant2); + add_lo[1] = vmlal_n_s32(ac1[1], vget_high_s32(b_lo), constant2); + add_hi[0] = vmlal_n_s32(ac1[2], vget_low_s32(b_hi), constant2); + add_hi[1] = vmlal_n_s32(ac1[3], vget_high_s32(b_hi), constant2); + + sub_lo[0] = vmlsl_n_s32(ac2[0], vget_low_s32(b_lo), constant1); + sub_lo[1] = vmlsl_n_s32(ac2[1], vget_high_s32(b_lo), constant1); + sub_hi[0] = vmlsl_n_s32(ac2[2], vget_low_s32(b_hi), constant1); + sub_hi[1] = vmlsl_n_s32(ac2[3], vget_high_s32(b_hi), constant1); +} + // fdct_round_shift(a * c1 +/- b * c2) // Variant that performs normal implementation on full vector // more accurate does 64-bit processing, takes and returns 32-bit values @@ -420,4 +497,46 @@ static INLINE int32x4_t sub_round_shift_s32(const int32x4_t a) { return vrshrq_n_s32(vsubq_s32(a, a_sign_s32), 2); } +static INLINE int32x4_t add_s64_round_narrow(const int64x2_t *a /*[2]*/, + const int64x2_t *b /*[2]*/) { + int64x2_t result[2]; + result[0] = vaddq_s64(a[0], b[0]); + result[1] = vaddq_s64(a[1], b[1]); + return vcombine_s32(vrshrn_n_s64(result[0], DCT_CONST_BITS), + vrshrn_n_s64(result[1], DCT_CONST_BITS)); +} + +static INLINE int32x4_t sub_s64_round_narrow(const int64x2_t *a /*[2]*/, + const int64x2_t *b /*[2]*/) { + int64x2_t result[2]; + result[0] = vsubq_s64(a[0], b[0]); + result[1] = vsubq_s64(a[1], b[1]); + return vcombine_s32(vrshrn_n_s64(result[0], DCT_CONST_BITS), + vrshrn_n_s64(result[1], DCT_CONST_BITS)); +} + +static INLINE int32x4_t add_s32_s64_narrow(const int32x4_t a, + const int32x4_t b) { + int64x2_t a64[2], b64[2], result[2]; + a64[0] = vmovl_s32(vget_low_s32(a)); + a64[1] = vmovl_s32(vget_high_s32(a)); + b64[0] = vmovl_s32(vget_low_s32(b)); + b64[1] = vmovl_s32(vget_high_s32(b)); + result[0] = vaddq_s64(a64[0], b64[0]); + result[1] = vaddq_s64(a64[1], b64[1]); + return vcombine_s32(vmovn_s64(result[0]), vmovn_s64(result[1])); +} + +static INLINE int32x4_t sub_s32_s64_narrow(const int32x4_t a, + const int32x4_t b) { + int64x2_t a64[2], b64[2], result[2]; + a64[0] = vmovl_s32(vget_low_s32(a)); + a64[1] = vmovl_s32(vget_high_s32(a)); + b64[0] = vmovl_s32(vget_low_s32(b)); + b64[1] = vmovl_s32(vget_high_s32(b)); + result[0] = vsubq_s64(a64[0], b64[0]); + result[1] = vsubq_s64(a64[1], b64[1]); + return vcombine_s32(vmovn_s64(result[0]), vmovn_s64(result[1])); +} + #endif // VPX_VPX_DSP_ARM_FDCT_NEON_H_ diff --git a/vpx_dsp/arm/transpose_neon.h b/vpx_dsp/arm/transpose_neon.h index 41d44f2b1f..6c0bd08f77 100644 --- a/vpx_dsp/arm/transpose_neon.h +++ b/vpx_dsp/arm/transpose_neon.h @@ -866,6 +866,68 @@ static INLINE void transpose_s32_8x8_2(int32x4_t *left /*[8]*/, out_right[7] = out[7].val[1]; } +static INLINE void transpose_s32_16x16(int32x4_t *left1, int32x4_t *right1, + int32x4_t *left2, int32x4_t *right2) { + int32x4_t tl[16], tr[16]; + + // transpose the 4 8x8 quadrants separately but first swap quadrants 2 and 3. + tl[0] = left1[8]; + tl[1] = left1[9]; + tl[2] = left1[10]; + tl[3] = left1[11]; + tl[4] = left1[12]; + tl[5] = left1[13]; + tl[6] = left1[14]; + tl[7] = left1[15]; + tr[0] = right1[8]; + tr[1] = right1[9]; + tr[2] = right1[10]; + tr[3] = right1[11]; + tr[4] = right1[12]; + tr[5] = right1[13]; + tr[6] = right1[14]; + tr[7] = right1[15]; + + left1[8] = left2[0]; + left1[9] = left2[1]; + left1[10] = left2[2]; + left1[11] = left2[3]; + left1[12] = left2[4]; + left1[13] = left2[5]; + left1[14] = left2[6]; + left1[15] = left2[7]; + right1[8] = right2[0]; + right1[9] = right2[1]; + right1[10] = right2[2]; + right1[11] = right2[3]; + right1[12] = right2[4]; + right1[13] = right2[5]; + right1[14] = right2[6]; + right1[15] = right2[7]; + + left2[0] = tl[0]; + left2[1] = tl[1]; + left2[2] = tl[2]; + left2[3] = tl[3]; + left2[4] = tl[4]; + left2[5] = tl[5]; + left2[6] = tl[6]; + left2[7] = tl[7]; + right2[0] = tr[0]; + right2[1] = tr[1]; + right2[2] = tr[2]; + right2[3] = tr[3]; + right2[4] = tr[4]; + right2[5] = tr[5]; + right2[6] = tr[6]; + right2[7] = tr[7]; + + transpose_s32_8x8_2(left1, right1, left1, right1); + transpose_s32_8x8_2(left2, right2, left2, right2); + transpose_s32_8x8_2(left1 + 8, right1 + 8, left1 + 8, right1 + 8); + transpose_s32_8x8_2(left2 + 8, right2 + 8, left2 + 8, right2 + 8); +} + static INLINE void transpose_u8_16x8( const uint8x16_t i0, const uint8x16_t i1, const uint8x16_t i2, const uint8x16_t i3, const uint8x16_t i4, const uint8x16_t i5, From 7fed9187c4fb0c97dd2df9b165f640baad001fdc Mon Sep 17 00:00:00 2001 From: Salome Thirot Date: Tue, 24 Jan 2023 14:27:14 +0000 Subject: [PATCH 0577/1000] Refactor Neon implementation of SAD functions Refactor and optimize the Neon implementation of SAD functions - effectively backporting these libaom changes[1,2,3]. [1] https://aomedia-review.googlesource.com/c/aom/+/161921 [2] https://aomedia-review.googlesource.com/c/aom/+/161923 [3] https://aomedia-review.googlesource.com/c/aom/+/166963 Change-Id: I2d72fd0f27d61a3e31a78acd33172e2afb044cb8 --- vpx_dsp/arm/sad_neon.c | 1011 ++++++++++++++++++---------------------- 1 file changed, 462 insertions(+), 549 deletions(-) diff --git a/vpx_dsp/arm/sad_neon.c b/vpx_dsp/arm/sad_neon.c index ad575d4aae..7336edb694 100644 --- a/vpx_dsp/arm/sad_neon.c +++ b/vpx_dsp/arm/sad_neon.c @@ -17,635 +17,548 @@ #include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/sum_neon.h" -uint32_t vpx_sad4x4_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride) { - const uint8x16_t src_u8 = load_unaligned_u8q(src_ptr, src_stride); - const uint8x16_t ref_u8 = load_unaligned_u8q(ref_ptr, ref_stride); #if defined(__ARM_FEATURE_DOTPROD) - const uint8x16_t sad_u8 = vabdq_u8(src_u8, ref_u8); - const uint32x4_t dp = vdotq_u32(vdupq_n_u32(0), sad_u8, vdupq_n_u8(1)); - return horizontal_add_uint32x4(dp); -#else - uint16x8_t abs = vabdl_u8(vget_low_u8(src_u8), vget_low_u8(ref_u8)); - abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(ref_u8)); - return horizontal_add_uint16x8(abs); -#endif -} -uint32_t vpx_sad4x4_avg_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const uint8_t *second_pred) { - const uint8x16_t src_u8 = load_unaligned_u8q(src_ptr, src_stride); - const uint8x16_t ref_u8 = load_unaligned_u8q(ref_ptr, ref_stride); - const uint8x16_t second_pred_u8 = vld1q_u8(second_pred); - const uint8x16_t avg = vrhaddq_u8(ref_u8, second_pred_u8); -#if defined(__ARM_FEATURE_DOTPROD) - const uint8x16_t sad_u8 = vabdq_u8(src_u8, avg); - const uint32x4_t prod = vdotq_u32(vdupq_n_u32(0), sad_u8, vdupq_n_u8(1)); - return horizontal_add_uint32x4(prod); -#else - uint16x8_t abs = vabdl_u8(vget_low_u8(src_u8), vget_low_u8(avg)); - abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(avg)); - return horizontal_add_uint16x8(abs); -#endif -} +static INLINE unsigned int sadwxh_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int w, int h) { + // Only two accumulators are required for optimal instruction throughput of + // the ABD, UDOT sequence on CPUs with either 2 or 4 Neon pipes. + uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; -uint32_t vpx_sad4x8_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride) { -#if defined(__ARM_FEATURE_DOTPROD) - uint32x4_t prod = vdupq_n_u32(0); - const uint8x16_t ones = vdupq_n_u8(1); - const uint8x16_t src1_u8 = load_unaligned_u8q(src_ptr, src_stride); - const uint8x16_t ref1_u8 = load_unaligned_u8q(ref_ptr, ref_stride); - const uint8x16_t src2_u8 = - load_unaligned_u8q(src_ptr + 4 * src_stride, src_stride); - const uint8x16_t ref2_u8 = - load_unaligned_u8q(ref_ptr + 4 * ref_stride, ref_stride); - const uint8x16_t sad1_u8 = vabdq_u8(src1_u8, ref1_u8); - const uint8x16_t sad2_u8 = vabdq_u8(src2_u8, ref2_u8); - prod = vdotq_u32(prod, sad1_u8, ones); - prod = vdotq_u32(prod, sad2_u8, ones); - return horizontal_add_uint32x4(prod); -#else - int i; - uint16x8_t abs = vdupq_n_u16(0); - for (i = 0; i < 8; i += 4) { - const uint8x16_t src_u8 = load_unaligned_u8q(src_ptr, src_stride); - const uint8x16_t ref_u8 = load_unaligned_u8q(ref_ptr, ref_stride); - src_ptr += 4 * src_stride; - ref_ptr += 4 * ref_stride; - abs = vabal_u8(abs, vget_low_u8(src_u8), vget_low_u8(ref_u8)); - abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(ref_u8)); - } + int i = h; + do { + int j = 0; + do { + uint8x16_t s0, s1, r0, r1, diff0, diff1; - return horizontal_add_uint16x8(abs); -#endif + s0 = vld1q_u8(src_ptr + j); + r0 = vld1q_u8(ref_ptr + j); + diff0 = vabdq_u8(s0, r0); + sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1)); + + s1 = vld1q_u8(src_ptr + j + 16); + r1 = vld1q_u8(ref_ptr + j + 16); + diff1 = vabdq_u8(s1, r1); + sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1)); + + j += 32; + } while (j < w); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--i != 0); + + return horizontal_add_uint32x4(vaddq_u32(sum[0], sum[1])); } -uint32_t vpx_sad4x8_avg_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const uint8_t *second_pred) { -#if defined(__ARM_FEATURE_DOTPROD) - uint32x4_t prod = vdupq_n_u32(0); - const uint8x16_t ones = vdupq_n_u8(1); - const uint8x16_t src1_u8 = load_unaligned_u8q(src_ptr, src_stride); - const uint8x16_t ref1_u8 = load_unaligned_u8q(ref_ptr, ref_stride); - const uint8x16_t src2_u8 = - load_unaligned_u8q(src_ptr + 4 * src_stride, src_stride); - const uint8x16_t ref2_u8 = - load_unaligned_u8q(ref_ptr + 4 * ref_stride, ref_stride); - const uint8x16_t second_pred1_u8 = vld1q_u8(second_pred); - const uint8x16_t second_pred2_u8 = vld1q_u8(second_pred + 16); - const uint8x16_t avg1 = vrhaddq_u8(ref1_u8, second_pred1_u8); - const uint8x16_t avg2 = vrhaddq_u8(ref2_u8, second_pred2_u8); - const uint8x16_t sad1_u8 = vabdq_u8(src1_u8, avg1); - const uint8x16_t sad2_u8 = vabdq_u8(src2_u8, avg2); - prod = vdotq_u32(prod, sad1_u8, ones); - prod = vdotq_u32(prod, sad2_u8, ones); - return horizontal_add_uint32x4(prod); -#else - int i; - uint16x8_t abs = vdupq_n_u16(0); - for (i = 0; i < 8; i += 4) { - const uint8x16_t src_u8 = load_unaligned_u8q(src_ptr, src_stride); - const uint8x16_t ref_u8 = load_unaligned_u8q(ref_ptr, ref_stride); - const uint8x16_t second_pred_u8 = vld1q_u8(second_pred); - const uint8x16_t avg = vrhaddq_u8(ref_u8, second_pred_u8); - src_ptr += 4 * src_stride; - ref_ptr += 4 * ref_stride; - second_pred += 16; - abs = vabal_u8(abs, vget_low_u8(src_u8), vget_low_u8(avg)); - abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(avg)); - } +static INLINE unsigned int sad64xh_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h) { + return sadwxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 64, h); +} - return horizontal_add_uint16x8(abs); -#endif +static INLINE unsigned int sad32xh_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h) { + return sadwxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 32, h); } -#if defined(__ARM_FEATURE_DOTPROD) -static INLINE uint32x2_t sad8x(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const int height) { - int i; - uint32x2_t prod = vdup_n_u32(0); - const uint8x8_t ones = vdup_n_u8(1); - for (i = 0; i < height; ++i) { - const uint8x8_t a_u8 = vld1_u8(src_ptr); - const uint8x8_t b_u8 = vld1_u8(ref_ptr); - const uint8x8_t sad_u8 = vabd_u8(a_u8, b_u8); +static INLINE unsigned int sad16xh_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h) { + uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = h / 2; + do { + uint8x16_t s0, s1, r0, r1, diff0, diff1; + + s0 = vld1q_u8(src_ptr); + r0 = vld1q_u8(ref_ptr); + diff0 = vabdq_u8(s0, r0); + sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1)); + src_ptr += src_stride; ref_ptr += ref_stride; - prod = vdot_u32(prod, sad_u8, ones); - } - return prod; -} -static INLINE uint32x2_t sad8x_avg(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const uint8_t *second_pred, - const int height) { - int i; - uint32x2_t prod = vdup_n_u32(0); - const uint8x8_t ones = vdup_n_u8(1); - for (i = 0; i < height; ++i) { - const uint8x8_t a_u8 = vld1_u8(src_ptr); - const uint8x8_t b_u8 = vld1_u8(ref_ptr); - const uint8x8_t c_u8 = vld1_u8(second_pred); - const uint8x8_t avg = vrhadd_u8(b_u8, c_u8); - const uint8x8_t sad_u8 = vabd_u8(a_u8, avg); + s1 = vld1q_u8(src_ptr); + r1 = vld1q_u8(ref_ptr); + diff1 = vabdq_u8(s1, r1); + sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1)); + src_ptr += src_stride; ref_ptr += ref_stride; - second_pred += 8; - prod = vdot_u32(prod, sad_u8, ones); - } - return prod; -} + } while (--i != 0); -#define SAD8XN(n) \ - uint32_t vpx_sad8x##n##_neon(const uint8_t *src_ptr, int src_stride, \ - const uint8_t *ref_ptr, int ref_stride) { \ - const uint32x2_t prod = \ - sad8x(src_ptr, src_stride, ref_ptr, ref_stride, n); \ - return horizontal_add_uint32x2(prod); \ - } \ - \ - uint32_t vpx_sad8x##n##_avg_neon(const uint8_t *src_ptr, int src_stride, \ - const uint8_t *ref_ptr, int ref_stride, \ - const uint8_t *second_pred) { \ - const uint32x2_t prod = \ - sad8x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \ - return horizontal_add_uint32x2(prod); \ - } + return horizontal_add_uint32x4(vaddq_u32(sum[0], sum[1])); +} #else // !defined(__ARM_FEATURE_DOTPROD) -static INLINE uint16x8_t sad8x(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const int height) { - int i; - uint16x8_t abs = vdupq_n_u16(0); - - for (i = 0; i < height; ++i) { - const uint8x8_t a_u8 = vld1_u8(src_ptr); - const uint8x8_t b_u8 = vld1_u8(ref_ptr); + +static INLINE unsigned int sad64xh_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h) { + uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + uint32x4_t sum_u32; + + int i = h; + do { + uint8x16_t s0, s1, s2, s3, r0, r1, r2, r3; + uint8x16_t diff0, diff1, diff2, diff3; + + s0 = vld1q_u8(src_ptr); + r0 = vld1q_u8(ref_ptr); + diff0 = vabdq_u8(s0, r0); + sum[0] = vpadalq_u8(sum[0], diff0); + + s1 = vld1q_u8(src_ptr + 16); + r1 = vld1q_u8(ref_ptr + 16); + diff1 = vabdq_u8(s1, r1); + sum[1] = vpadalq_u8(sum[1], diff1); + + s2 = vld1q_u8(src_ptr + 32); + r2 = vld1q_u8(ref_ptr + 32); + diff2 = vabdq_u8(s2, r2); + sum[2] = vpadalq_u8(sum[2], diff2); + + s3 = vld1q_u8(src_ptr + 48); + r3 = vld1q_u8(ref_ptr + 48); + diff3 = vabdq_u8(s3, r3); + sum[3] = vpadalq_u8(sum[3], diff3); + src_ptr += src_stride; ref_ptr += ref_stride; - abs = vabal_u8(abs, a_u8, b_u8); - } - return abs; + } while (--i != 0); + + sum_u32 = vpaddlq_u16(sum[0]); + sum_u32 = vpadalq_u16(sum_u32, sum[1]); + sum_u32 = vpadalq_u16(sum_u32, sum[2]); + sum_u32 = vpadalq_u16(sum_u32, sum[3]); + + return horizontal_add_uint32x4(sum_u32); } -static INLINE uint16x8_t sad8x_avg(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const uint8_t *second_pred, - const int height) { - int i; - uint16x8_t abs = vdupq_n_u16(0); - - for (i = 0; i < height; ++i) { - const uint8x8_t a_u8 = vld1_u8(src_ptr); - const uint8x8_t b_u8 = vld1_u8(ref_ptr); - const uint8x8_t c_u8 = vld1_u8(second_pred); - const uint8x8_t avg = vrhadd_u8(b_u8, c_u8); +static INLINE unsigned int sad32xh_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h) { + uint32x4_t sum = vdupq_n_u32(0); + + int i = h; + do { + uint8x16_t s0 = vld1q_u8(src_ptr); + uint8x16_t r0 = vld1q_u8(ref_ptr); + uint8x16_t diff0 = vabdq_u8(s0, r0); + uint16x8_t sum0 = vpaddlq_u8(diff0); + + uint8x16_t s1 = vld1q_u8(src_ptr + 16); + uint8x16_t r1 = vld1q_u8(ref_ptr + 16); + uint8x16_t diff1 = vabdq_u8(s1, r1); + uint16x8_t sum1 = vpaddlq_u8(diff1); + + sum = vpadalq_u16(sum, sum0); + sum = vpadalq_u16(sum, sum1); + src_ptr += src_stride; ref_ptr += ref_stride; - second_pred += 8; - abs = vabal_u8(abs, a_u8, avg); - } - return abs; + } while (--i != 0); + + return horizontal_add_uint32x4(sum); } -#define SAD8XN(n) \ - uint32_t vpx_sad8x##n##_neon(const uint8_t *src_ptr, int src_stride, \ - const uint8_t *ref_ptr, int ref_stride) { \ - const uint16x8_t abs = sad8x(src_ptr, src_stride, ref_ptr, ref_stride, n); \ - return horizontal_add_uint16x8(abs); \ - } \ - \ - uint32_t vpx_sad8x##n##_avg_neon(const uint8_t *src_ptr, int src_stride, \ - const uint8_t *ref_ptr, int ref_stride, \ - const uint8_t *second_pred) { \ - const uint16x8_t abs = \ - sad8x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \ - return horizontal_add_uint16x8(abs); \ - } -#endif // defined(__ARM_FEATURE_DOTPROD) +static INLINE unsigned int sad16xh_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h) { + uint16x8_t sum = vdupq_n_u16(0); -SAD8XN(4) -SAD8XN(8) -SAD8XN(16) + int i = h; + do { + uint8x16_t s = vld1q_u8(src_ptr); + uint8x16_t r = vld1q_u8(ref_ptr); + + uint8x16_t diff = vabdq_u8(s, r); + sum = vpadalq_u8(sum, diff); -#if defined(__ARM_FEATURE_DOTPROD) -static INLINE uint32x4_t sad16x(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const int height) { - int i; - uint32x4_t prod = vdupq_n_u32(0); - const uint8x16_t ones = vdupq_n_u8(1); - for (i = 0; i < height; ++i) { - const uint8x16_t src_u8 = vld1q_u8(src_ptr); - const uint8x16_t ref_u8 = vld1q_u8(ref_ptr); - const uint8x16_t sad_u8 = vabdq_u8(src_u8, ref_u8); src_ptr += src_stride; ref_ptr += ref_stride; - prod = vdotq_u32(prod, sad_u8, ones); - } - return prod; + } while (--i != 0); + + return horizontal_add_uint16x8(sum); } -static INLINE uint32x4_t sad16x_avg(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const uint8_t *second_pred, - const int height) { - int i; - uint32x4_t prod = vdupq_n_u32(0); - const uint8x16_t ones = vdupq_n_u8(1); - for (i = 0; i < height; ++i) { - const uint8x16_t a_u8 = vld1q_u8(src_ptr); - const uint8x16_t b_u8 = vld1q_u8(ref_ptr); - const uint8x16_t c_u8 = vld1q_u8(second_pred); - const uint8x16_t avg = vrhaddq_u8(b_u8, c_u8); - const uint8x16_t sad_u8 = vabdq_u8(a_u8, avg); +#endif // defined(__ARM_FEATURE_DOTPROD) + +static INLINE unsigned int sad8xh_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h) { + uint16x8_t sum = vdupq_n_u16(0); + + int i = h; + do { + uint8x8_t s = vld1_u8(src_ptr); + uint8x8_t r = vld1_u8(ref_ptr); + + sum = vabal_u8(sum, s, r); + src_ptr += src_stride; ref_ptr += ref_stride; - second_pred += 16; - prod = vdotq_u32(prod, sad_u8, ones); - } - return prod; + } while (--i != 0); + + return horizontal_add_uint16x8(sum); } -#define SAD16XN(n) \ - uint32_t vpx_sad16x##n##_neon(const uint8_t *src_ptr, int src_stride, \ - const uint8_t *ref_ptr, int ref_stride) { \ - const uint32x4_t prod = \ - sad16x(src_ptr, src_stride, ref_ptr, ref_stride, n); \ - return horizontal_add_uint32x4(prod); \ - } \ - \ - uint32_t vpx_sad16x##n##_avg_neon(const uint8_t *src_ptr, int src_stride, \ - const uint8_t *ref_ptr, int ref_stride, \ - const uint8_t *second_pred) { \ - const uint32x4_t prod = \ - sad16x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \ - return horizontal_add_uint32x4(prod); \ - } -#else // !defined(__ARM_FEATURE_DOTPROD) -static INLINE uint16x8_t sad16x(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const int height) { - int i; - uint16x8_t abs = vdupq_n_u16(0); - - for (i = 0; i < height; ++i) { - const uint8x16_t a_u8 = vld1q_u8(src_ptr); - const uint8x16_t b_u8 = vld1q_u8(ref_ptr); +static INLINE unsigned int sad4xh_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h) { + uint16x8_t sum = vdupq_n_u16(0); + + int i = h / 2; + do { + uint32x2_t s, r; + uint32_t s0, s1, r0, r1; + + memcpy(&s0, src_ptr, 4); + memcpy(&r0, ref_ptr, 4); + s = vdup_n_u32(s0); + r = vdup_n_u32(r0); src_ptr += src_stride; ref_ptr += ref_stride; - abs = vabal_u8(abs, vget_low_u8(a_u8), vget_low_u8(b_u8)); - abs = vabal_u8(abs, vget_high_u8(a_u8), vget_high_u8(b_u8)); - } - return abs; -} -static INLINE uint16x8_t sad16x_avg(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const uint8_t *second_pred, - const int height) { - int i; - uint16x8_t abs = vdupq_n_u16(0); - - for (i = 0; i < height; ++i) { - const uint8x16_t a_u8 = vld1q_u8(src_ptr); - const uint8x16_t b_u8 = vld1q_u8(ref_ptr); - const uint8x16_t c_u8 = vld1q_u8(second_pred); - const uint8x16_t avg = vrhaddq_u8(b_u8, c_u8); + memcpy(&s1, src_ptr, 4); + memcpy(&r1, ref_ptr, 4); + s = vset_lane_u32(s1, s, 1); + r = vset_lane_u32(r1, r, 1); src_ptr += src_stride; ref_ptr += ref_stride; - second_pred += 16; - abs = vabal_u8(abs, vget_low_u8(a_u8), vget_low_u8(avg)); - abs = vabal_u8(abs, vget_high_u8(a_u8), vget_high_u8(avg)); - } - return abs; + + sum = vabal_u8(sum, vreinterpret_u8_u32(s), vreinterpret_u8_u32(r)); + } while (--i != 0); + + return horizontal_add_uint16x8(sum); } -#define SAD16XN(n) \ - uint32_t vpx_sad16x##n##_neon(const uint8_t *src_ptr, int src_stride, \ - const uint8_t *ref_ptr, int ref_stride) { \ - const uint16x8_t abs = \ - sad16x(src_ptr, src_stride, ref_ptr, ref_stride, n); \ - return horizontal_add_uint16x8(abs); \ - } \ - \ - uint32_t vpx_sad16x##n##_avg_neon(const uint8_t *src_ptr, int src_stride, \ - const uint8_t *ref_ptr, int ref_stride, \ - const uint8_t *second_pred) { \ - const uint16x8_t abs = \ - sad16x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \ - return horizontal_add_uint16x8(abs); \ +#define SAD_WXH_NEON(w, h) \ + unsigned int vpx_sad##w##x##h##_neon(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride) { \ + return sad##w##xh_neon(src, src_stride, ref, ref_stride, (h)); \ } -#endif // defined(__ARM_FEATURE_DOTPROD) -SAD16XN(8) -SAD16XN(16) -SAD16XN(32) +SAD_WXH_NEON(4, 4) +SAD_WXH_NEON(4, 8) + +SAD_WXH_NEON(8, 4) +SAD_WXH_NEON(8, 8) +SAD_WXH_NEON(8, 16) + +SAD_WXH_NEON(16, 8) +SAD_WXH_NEON(16, 16) +SAD_WXH_NEON(16, 32) + +SAD_WXH_NEON(32, 16) +SAD_WXH_NEON(32, 32) +SAD_WXH_NEON(32, 64) + +SAD_WXH_NEON(64, 32) +SAD_WXH_NEON(64, 64) #if defined(__ARM_FEATURE_DOTPROD) -static INLINE uint32x4_t sad32x(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const int height) { - int i; - uint32x4_t prod = vdupq_n_u32(0); - const uint8x16_t ones = vdupq_n_u8(1); - for (i = 0; i < height; ++i) { - const uint8x16_t a_lo = vld1q_u8(src_ptr); - const uint8x16_t a_hi = vld1q_u8(src_ptr + 16); - const uint8x16_t b_lo = vld1q_u8(ref_ptr); - const uint8x16_t b_hi = vld1q_u8(ref_ptr + 16); - const uint8x16_t sad_lo_u8 = vabdq_u8(a_lo, b_lo); - const uint8x16_t sad_hi_u8 = vabdq_u8(a_hi, b_hi); + +static INLINE unsigned int sadwxh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int w, int h, + const uint8_t *second_pred) { + // Only two accumulators are required for optimal instruction throughput of + // the ABD, UDOT sequence on CPUs with either 2 or 4 Neon pipes. + uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = h; + do { + int j = 0; + do { + uint8x16_t s0, s1, r0, r1, p0, p1, avg0, avg1, diff0, diff1; + + s0 = vld1q_u8(src_ptr + j); + r0 = vld1q_u8(ref_ptr + j); + p0 = vld1q_u8(second_pred); + avg0 = vrhaddq_u8(r0, p0); + diff0 = vabdq_u8(s0, avg0); + sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1)); + + s1 = vld1q_u8(src_ptr + j + 16); + r1 = vld1q_u8(ref_ptr + j + 16); + p1 = vld1q_u8(second_pred + 16); + avg1 = vrhaddq_u8(r1, p1); + diff1 = vabdq_u8(s1, avg1); + sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1)); + + j += 32; + second_pred += 32; + } while (j < w); + src_ptr += src_stride; ref_ptr += ref_stride; - prod = vdotq_u32(prod, sad_lo_u8, ones); - prod = vdotq_u32(prod, sad_hi_u8, ones); - } - return prod; + } while (--i != 0); + + return horizontal_add_uint32x4(vaddq_u32(sum[0], sum[1])); +} + +static INLINE unsigned int sad64xh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + const uint8_t *second_pred) { + return sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 64, h, + second_pred); } -static INLINE uint32x4_t sad32x_avg(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const uint8_t *second_pred, - const int height) { - int i; - uint32x4_t prod = vdupq_n_u32(0); - const uint8x16_t ones = vdupq_n_u8(1); - for (i = 0; i < height; ++i) { - const uint8x16_t a_lo = vld1q_u8(src_ptr); - const uint8x16_t a_hi = vld1q_u8(src_ptr + 16); - const uint8x16_t b_lo = vld1q_u8(ref_ptr); - const uint8x16_t b_hi = vld1q_u8(ref_ptr + 16); - const uint8x16_t c_lo = vld1q_u8(second_pred); - const uint8x16_t c_hi = vld1q_u8(second_pred + 16); - const uint8x16_t avg_lo = vrhaddq_u8(b_lo, c_lo); - const uint8x16_t avg_hi = vrhaddq_u8(b_hi, c_hi); - const uint8x16_t sad_lo_u8 = vabdq_u8(a_lo, avg_lo); - const uint8x16_t sad_hi_u8 = vabdq_u8(a_hi, avg_hi); +static INLINE unsigned int sad32xh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + const uint8_t *second_pred) { + return sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 32, h, + second_pred); +} + +static INLINE unsigned int sad16xh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + const uint8_t *second_pred) { + uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = h / 2; + do { + uint8x16_t s0, s1, r0, r1, p0, p1, avg0, avg1, diff0, diff1; + + s0 = vld1q_u8(src_ptr); + r0 = vld1q_u8(ref_ptr); + p0 = vld1q_u8(second_pred); + avg0 = vrhaddq_u8(r0, p0); + diff0 = vabdq_u8(s0, avg0); + sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1)); + src_ptr += src_stride; ref_ptr += ref_stride; - second_pred += 32; - prod = vdotq_u32(prod, sad_lo_u8, ones); - prod = vdotq_u32(prod, sad_hi_u8, ones); - } - return prod; + second_pred += 16; + + s1 = vld1q_u8(src_ptr); + r1 = vld1q_u8(ref_ptr); + p1 = vld1q_u8(second_pred); + avg1 = vrhaddq_u8(r1, p1); + diff1 = vabdq_u8(s1, avg1); + sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1)); + + src_ptr += src_stride; + ref_ptr += ref_stride; + second_pred += 16; + } while (--i != 0); + + return horizontal_add_uint32x4(vaddq_u32(sum[0], sum[1])); } -#define SAD32XN(n) \ - uint32_t vpx_sad32x##n##_neon(const uint8_t *src_ptr, int src_stride, \ - const uint8_t *ref_ptr, int ref_stride) { \ - const uint32x4_t prod = \ - sad32x(src_ptr, src_stride, ref_ptr, ref_stride, n); \ - return horizontal_add_uint32x4(prod); \ - } \ - \ - uint32_t vpx_sad32x##n##_avg_neon(const uint8_t *src_ptr, int src_stride, \ - const uint8_t *ref_ptr, int ref_stride, \ - const uint8_t *second_pred) { \ - const uint32x4_t prod = \ - sad32x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \ - return horizontal_add_uint32x4(prod); \ - } +#else // !defined(__ARM_FEATURE_DOTPROD) + +static INLINE unsigned int sad64xh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + const uint8_t *second_pred) { + uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + uint32x4_t sum_u32; + + int i = h; + do { + uint8x16_t s0, s1, s2, s3, r0, r1, r2, r3, p0, p1, p2, p3; + uint8x16_t avg0, avg1, avg2, avg3, diff0, diff1, diff2, diff3; + + s0 = vld1q_u8(src_ptr); + r0 = vld1q_u8(ref_ptr); + p0 = vld1q_u8(second_pred); + avg0 = vrhaddq_u8(r0, p0); + diff0 = vabdq_u8(s0, avg0); + sum[0] = vpadalq_u8(sum[0], diff0); + + s1 = vld1q_u8(src_ptr + 16); + r1 = vld1q_u8(ref_ptr + 16); + p1 = vld1q_u8(second_pred + 16); + avg1 = vrhaddq_u8(r1, p1); + diff1 = vabdq_u8(s1, avg1); + sum[1] = vpadalq_u8(sum[1], diff1); + + s2 = vld1q_u8(src_ptr + 32); + r2 = vld1q_u8(ref_ptr + 32); + p2 = vld1q_u8(second_pred + 32); + avg2 = vrhaddq_u8(r2, p2); + diff2 = vabdq_u8(s2, avg2); + sum[2] = vpadalq_u8(sum[2], diff2); + + s3 = vld1q_u8(src_ptr + 48); + r3 = vld1q_u8(ref_ptr + 48); + p3 = vld1q_u8(second_pred + 48); + avg3 = vrhaddq_u8(r3, p3); + diff3 = vabdq_u8(s3, avg3); + sum[3] = vpadalq_u8(sum[3], diff3); -#else // defined(__ARM_FEATURE_DOTPROD) -static INLINE uint16x8_t sad32x(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const int height) { - int i; - uint16x8_t abs = vdupq_n_u16(0); - - for (i = 0; i < height; ++i) { - const uint8x16_t a_lo = vld1q_u8(src_ptr); - const uint8x16_t a_hi = vld1q_u8(src_ptr + 16); - const uint8x16_t b_lo = vld1q_u8(ref_ptr); - const uint8x16_t b_hi = vld1q_u8(ref_ptr + 16); src_ptr += src_stride; ref_ptr += ref_stride; - abs = vabal_u8(abs, vget_low_u8(a_lo), vget_low_u8(b_lo)); - abs = vabal_u8(abs, vget_high_u8(a_lo), vget_high_u8(b_lo)); - abs = vabal_u8(abs, vget_low_u8(a_hi), vget_low_u8(b_hi)); - abs = vabal_u8(abs, vget_high_u8(a_hi), vget_high_u8(b_hi)); - } - return abs; + second_pred += 64; + } while (--i != 0); + + sum_u32 = vpaddlq_u16(sum[0]); + sum_u32 = vpadalq_u16(sum_u32, sum[1]); + sum_u32 = vpadalq_u16(sum_u32, sum[2]); + sum_u32 = vpadalq_u16(sum_u32, sum[3]); + + return horizontal_add_uint32x4(sum_u32); } -static INLINE uint16x8_t sad32x_avg(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const uint8_t *second_pred, - const int height) { - int i; - uint16x8_t abs = vdupq_n_u16(0); - - for (i = 0; i < height; ++i) { - const uint8x16_t a_lo = vld1q_u8(src_ptr); - const uint8x16_t a_hi = vld1q_u8(src_ptr + 16); - const uint8x16_t b_lo = vld1q_u8(ref_ptr); - const uint8x16_t b_hi = vld1q_u8(ref_ptr + 16); - const uint8x16_t c_lo = vld1q_u8(second_pred); - const uint8x16_t c_hi = vld1q_u8(second_pred + 16); - const uint8x16_t avg_lo = vrhaddq_u8(b_lo, c_lo); - const uint8x16_t avg_hi = vrhaddq_u8(b_hi, c_hi); +static INLINE unsigned int sad32xh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + const uint8_t *second_pred) { + uint32x4_t sum = vdupq_n_u32(0); + + int i = h; + do { + uint8x16_t s0 = vld1q_u8(src_ptr); + uint8x16_t r0 = vld1q_u8(ref_ptr); + uint8x16_t p0 = vld1q_u8(second_pred); + uint8x16_t avg0 = vrhaddq_u8(r0, p0); + uint8x16_t diff0 = vabdq_u8(s0, avg0); + uint16x8_t sum0 = vpaddlq_u8(diff0); + + uint8x16_t s1 = vld1q_u8(src_ptr + 16); + uint8x16_t r1 = vld1q_u8(ref_ptr + 16); + uint8x16_t p1 = vld1q_u8(second_pred + 16); + uint8x16_t avg1 = vrhaddq_u8(r1, p1); + uint8x16_t diff1 = vabdq_u8(s1, avg1); + uint16x8_t sum1 = vpaddlq_u8(diff1); + + sum = vpadalq_u16(sum, sum0); + sum = vpadalq_u16(sum, sum1); + src_ptr += src_stride; ref_ptr += ref_stride; second_pred += 32; - abs = vabal_u8(abs, vget_low_u8(a_lo), vget_low_u8(avg_lo)); - abs = vabal_u8(abs, vget_high_u8(a_lo), vget_high_u8(avg_lo)); - abs = vabal_u8(abs, vget_low_u8(a_hi), vget_low_u8(avg_hi)); - abs = vabal_u8(abs, vget_high_u8(a_hi), vget_high_u8(avg_hi)); - } - return abs; + } while (--i != 0); + + return horizontal_add_uint32x4(sum); } -#define SAD32XN(n) \ - uint32_t vpx_sad32x##n##_neon(const uint8_t *src_ptr, int src_stride, \ - const uint8_t *ref_ptr, int ref_stride) { \ - const uint16x8_t abs = \ - sad32x(src_ptr, src_stride, ref_ptr, ref_stride, n); \ - return horizontal_add_uint16x8(abs); \ - } \ - \ - uint32_t vpx_sad32x##n##_avg_neon(const uint8_t *src_ptr, int src_stride, \ - const uint8_t *ref_ptr, int ref_stride, \ - const uint8_t *second_pred) { \ - const uint16x8_t abs = \ - sad32x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \ - return horizontal_add_uint16x8(abs); \ - } -#endif // defined(__ARM_FEATURE_DOTPROD) +static INLINE unsigned int sad16xh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + const uint8_t *second_pred) { + uint16x8_t sum = vdupq_n_u16(0); -SAD32XN(16) -SAD32XN(32) -SAD32XN(64) + int i = h; + do { + uint8x16_t s = vld1q_u8(src_ptr); + uint8x16_t r = vld1q_u8(ref_ptr); + uint8x16_t p = vld1q_u8(second_pred); + + uint8x16_t avg = vrhaddq_u8(r, p); + uint8x16_t diff = vabdq_u8(s, avg); + sum = vpadalq_u8(sum, diff); -#if defined(__ARM_FEATURE_DOTPROD) -static INLINE uint32x4_t sad64x(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const int height) { - int i; - uint32x4_t prod = vdupq_n_u32(0); - const uint8x16_t ones = vdupq_n_u8(1); - for (i = 0; i < height; ++i) { - const uint8x16_t a_0 = vld1q_u8(src_ptr); - const uint8x16_t a_1 = vld1q_u8(src_ptr + 16); - const uint8x16_t a_2 = vld1q_u8(src_ptr + 32); - const uint8x16_t a_3 = vld1q_u8(src_ptr + 48); - const uint8x16_t b_0 = vld1q_u8(ref_ptr); - const uint8x16_t b_1 = vld1q_u8(ref_ptr + 16); - const uint8x16_t b_2 = vld1q_u8(ref_ptr + 32); - const uint8x16_t b_3 = vld1q_u8(ref_ptr + 48); - const uint8x16_t sad_0_u8 = vabdq_u8(a_0, b_0); - const uint8x16_t sad_1_u8 = vabdq_u8(a_1, b_1); - const uint8x16_t sad_2_u8 = vabdq_u8(a_2, b_2); - const uint8x16_t sad_3_u8 = vabdq_u8(a_3, b_3); src_ptr += src_stride; ref_ptr += ref_stride; - prod = vdotq_u32(prod, sad_0_u8, ones); - prod = vdotq_u32(prod, sad_1_u8, ones); - prod = vdotq_u32(prod, sad_2_u8, ones); - prod = vdotq_u32(prod, sad_3_u8, ones); - } - return prod; + second_pred += 16; + } while (--i != 0); + + return horizontal_add_uint16x8(sum); } -static INLINE uint32x4_t sad64x_avg(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const uint8_t *second_pred, - const int height) { - int i; - uint32x4_t prod = vdupq_n_u32(0); - const uint8x16_t ones = vdupq_n_u8(1); - for (i = 0; i < height; ++i) { - const uint8x16_t a_0 = vld1q_u8(src_ptr); - const uint8x16_t a_1 = vld1q_u8(src_ptr + 16); - const uint8x16_t a_2 = vld1q_u8(src_ptr + 32); - const uint8x16_t a_3 = vld1q_u8(src_ptr + 48); - const uint8x16_t b_0 = vld1q_u8(ref_ptr); - const uint8x16_t b_1 = vld1q_u8(ref_ptr + 16); - const uint8x16_t b_2 = vld1q_u8(ref_ptr + 32); - const uint8x16_t b_3 = vld1q_u8(ref_ptr + 48); - const uint8x16_t c_0 = vld1q_u8(second_pred); - const uint8x16_t c_1 = vld1q_u8(second_pred + 16); - const uint8x16_t c_2 = vld1q_u8(second_pred + 32); - const uint8x16_t c_3 = vld1q_u8(second_pred + 48); - const uint8x16_t avg_0 = vrhaddq_u8(b_0, c_0); - const uint8x16_t avg_1 = vrhaddq_u8(b_1, c_1); - const uint8x16_t avg_2 = vrhaddq_u8(b_2, c_2); - const uint8x16_t avg_3 = vrhaddq_u8(b_3, c_3); - const uint8x16_t sad_0_u8 = vabdq_u8(a_0, avg_0); - const uint8x16_t sad_1_u8 = vabdq_u8(a_1, avg_1); - const uint8x16_t sad_2_u8 = vabdq_u8(a_2, avg_2); - const uint8x16_t sad_3_u8 = vabdq_u8(a_3, avg_3); +#endif // defined(__ARM_FEATURE_DOTPROD) + +static INLINE unsigned int sad8xh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + const uint8_t *second_pred) { + uint16x8_t sum = vdupq_n_u16(0); + + int i = h; + do { + uint8x8_t s = vld1_u8(src_ptr); + uint8x8_t r = vld1_u8(ref_ptr); + uint8x8_t p = vld1_u8(second_pred); + + uint8x8_t avg = vrhadd_u8(r, p); + sum = vabal_u8(sum, s, avg); + src_ptr += src_stride; ref_ptr += ref_stride; - second_pred += 64; - prod = vdotq_u32(prod, sad_0_u8, ones); - prod = vdotq_u32(prod, sad_1_u8, ones); - prod = vdotq_u32(prod, sad_2_u8, ones); - prod = vdotq_u32(prod, sad_3_u8, ones); - } - return prod; + second_pred += 8; + } while (--i != 0); + + return horizontal_add_uint16x8(sum); } -#else // !defined(__ARM_FEATURE_DOTPROD) -static INLINE uint32x4_t sad64x(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const int height) { - int i; - uint16x8_t abs_0 = vdupq_n_u16(0); - uint16x8_t abs_1 = vdupq_n_u16(0); - - for (i = 0; i < height; ++i) { - const uint8x16_t a_0 = vld1q_u8(src_ptr); - const uint8x16_t a_1 = vld1q_u8(src_ptr + 16); - const uint8x16_t a_2 = vld1q_u8(src_ptr + 32); - const uint8x16_t a_3 = vld1q_u8(src_ptr + 48); - const uint8x16_t b_0 = vld1q_u8(ref_ptr); - const uint8x16_t b_1 = vld1q_u8(ref_ptr + 16); - const uint8x16_t b_2 = vld1q_u8(ref_ptr + 32); - const uint8x16_t b_3 = vld1q_u8(ref_ptr + 48); + +static INLINE unsigned int sad4xh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + const uint8_t *second_pred) { + uint16x8_t sum = vdupq_n_u16(0); + + int i = h / 2; + do { + uint32x2_t s, r; + uint32_t s0, s1, r0, r1; + uint8x8_t p, avg; + + memcpy(&s0, src_ptr, 4); + memcpy(&r0, ref_ptr, 4); + s = vdup_n_u32(s0); + r = vdup_n_u32(r0); src_ptr += src_stride; ref_ptr += ref_stride; - abs_0 = vabal_u8(abs_0, vget_low_u8(a_0), vget_low_u8(b_0)); - abs_0 = vabal_u8(abs_0, vget_high_u8(a_0), vget_high_u8(b_0)); - abs_0 = vabal_u8(abs_0, vget_low_u8(a_1), vget_low_u8(b_1)); - abs_0 = vabal_u8(abs_0, vget_high_u8(a_1), vget_high_u8(b_1)); - abs_1 = vabal_u8(abs_1, vget_low_u8(a_2), vget_low_u8(b_2)); - abs_1 = vabal_u8(abs_1, vget_high_u8(a_2), vget_high_u8(b_2)); - abs_1 = vabal_u8(abs_1, vget_low_u8(a_3), vget_low_u8(b_3)); - abs_1 = vabal_u8(abs_1, vget_high_u8(a_3), vget_high_u8(b_3)); - } - - { - const uint32x4_t sum = vpaddlq_u16(abs_0); - return vpadalq_u16(sum, abs_1); - } -} -static INLINE uint32x4_t sad64x_avg(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const uint8_t *second_pred, - const int height) { - int i; - uint16x8_t abs_0 = vdupq_n_u16(0); - uint16x8_t abs_1 = vdupq_n_u16(0); - - for (i = 0; i < height; ++i) { - const uint8x16_t a_0 = vld1q_u8(src_ptr); - const uint8x16_t a_1 = vld1q_u8(src_ptr + 16); - const uint8x16_t a_2 = vld1q_u8(src_ptr + 32); - const uint8x16_t a_3 = vld1q_u8(src_ptr + 48); - const uint8x16_t b_0 = vld1q_u8(ref_ptr); - const uint8x16_t b_1 = vld1q_u8(ref_ptr + 16); - const uint8x16_t b_2 = vld1q_u8(ref_ptr + 32); - const uint8x16_t b_3 = vld1q_u8(ref_ptr + 48); - const uint8x16_t c_0 = vld1q_u8(second_pred); - const uint8x16_t c_1 = vld1q_u8(second_pred + 16); - const uint8x16_t c_2 = vld1q_u8(second_pred + 32); - const uint8x16_t c_3 = vld1q_u8(second_pred + 48); - const uint8x16_t avg_0 = vrhaddq_u8(b_0, c_0); - const uint8x16_t avg_1 = vrhaddq_u8(b_1, c_1); - const uint8x16_t avg_2 = vrhaddq_u8(b_2, c_2); - const uint8x16_t avg_3 = vrhaddq_u8(b_3, c_3); + memcpy(&s1, src_ptr, 4); + memcpy(&r1, ref_ptr, 4); + s = vset_lane_u32(s1, s, 1); + r = vset_lane_u32(r1, r, 1); src_ptr += src_stride; ref_ptr += ref_stride; - second_pred += 64; - abs_0 = vabal_u8(abs_0, vget_low_u8(a_0), vget_low_u8(avg_0)); - abs_0 = vabal_u8(abs_0, vget_high_u8(a_0), vget_high_u8(avg_0)); - abs_0 = vabal_u8(abs_0, vget_low_u8(a_1), vget_low_u8(avg_1)); - abs_0 = vabal_u8(abs_0, vget_high_u8(a_1), vget_high_u8(avg_1)); - abs_1 = vabal_u8(abs_1, vget_low_u8(a_2), vget_low_u8(avg_2)); - abs_1 = vabal_u8(abs_1, vget_high_u8(a_2), vget_high_u8(avg_2)); - abs_1 = vabal_u8(abs_1, vget_low_u8(a_3), vget_low_u8(avg_3)); - abs_1 = vabal_u8(abs_1, vget_high_u8(a_3), vget_high_u8(avg_3)); - } - { - const uint32x4_t sum = vpaddlq_u16(abs_0); - return vpadalq_u16(sum, abs_1); - } + p = vld1_u8(second_pred); + avg = vrhadd_u8(vreinterpret_u8_u32(r), p); + + sum = vabal_u8(sum, vreinterpret_u8_u32(s), avg); + second_pred += 8; + } while (--i != 0); + + return horizontal_add_uint16x8(sum); } -#endif // defined(__ARM_FEATURE_DOTPROD) -#define SAD64XN(n) \ - uint32_t vpx_sad64x##n##_neon(const uint8_t *src_ptr, int src_stride, \ - const uint8_t *ref_ptr, int ref_stride) { \ - const uint32x4_t abs = \ - sad64x(src_ptr, src_stride, ref_ptr, ref_stride, n); \ - return horizontal_add_uint32x4(abs); \ - } \ - \ - uint32_t vpx_sad64x##n##_avg_neon(const uint8_t *src_ptr, int src_stride, \ - const uint8_t *ref_ptr, int ref_stride, \ - const uint8_t *second_pred) { \ - const uint32x4_t abs = \ - sad64x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \ - return horizontal_add_uint32x4(abs); \ +#define SAD_WXH_AVG_NEON(w, h) \ + uint32_t vpx_sad##w##x##h##_avg_neon(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + return sad##w##xh_avg_neon(src, src_stride, ref, ref_stride, (h), \ + second_pred); \ } -SAD64XN(32) -SAD64XN(64) +SAD_WXH_AVG_NEON(4, 4) +SAD_WXH_AVG_NEON(4, 8) + +SAD_WXH_AVG_NEON(8, 4) +SAD_WXH_AVG_NEON(8, 8) +SAD_WXH_AVG_NEON(8, 16) + +SAD_WXH_AVG_NEON(16, 8) +SAD_WXH_AVG_NEON(16, 16) +SAD_WXH_AVG_NEON(16, 32) + +SAD_WXH_AVG_NEON(32, 16) +SAD_WXH_AVG_NEON(32, 32) +SAD_WXH_AVG_NEON(32, 64) + +SAD_WXH_AVG_NEON(64, 32) +SAD_WXH_AVG_NEON(64, 64) From db69ce6aea278bee88668fd9cc2af2e544516fdb Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Wed, 25 Jan 2023 19:25:12 -0500 Subject: [PATCH 0578/1000] Fix per frame qp for temporal layers Also add tests with fixed temporal layering mode. Change-Id: If516fe94e3fb7f5a745821d1788bfe6cf90edaac --- test/vp9_datarate_test.cc | 66 +++++++++++++++++++++++++----- vp9/encoder/vp9_svc_layercontext.c | 4 ++ 2 files changed, 59 insertions(+), 11 deletions(-) diff --git a/test/vp9_datarate_test.cc b/test/vp9_datarate_test.cc index eccb001071..7e91807492 100644 --- a/test/vp9_datarate_test.cc +++ b/test/vp9_datarate_test.cc @@ -148,14 +148,16 @@ class DatarateTestVP9 : public ::libvpx_test::EncoderTest { if (video->frame() == 0) { encoder->Control(VP9E_SET_SVC, 1); } - vpx_svc_layer_id_t layer_id; - layer_id.spatial_layer_id = 0; - frame_flags_ = GetFrameFlags(video->frame(), cfg_.ts_number_layers); - layer_id.temporal_layer_id = - SetLayerId(video->frame(), cfg_.ts_number_layers); - layer_id.temporal_layer_id_per_spatial[0] = - SetLayerId(video->frame(), cfg_.ts_number_layers); - encoder->Control(VP9E_SET_SVC_LAYER_ID, &layer_id); + if (cfg_.temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS) { + vpx_svc_layer_id_t layer_id; + frame_flags_ = GetFrameFlags(video->frame(), cfg_.ts_number_layers); + layer_id.spatial_layer_id = 0; + layer_id.temporal_layer_id = + SetLayerId(video->frame(), cfg_.ts_number_layers); + layer_id.temporal_layer_id_per_spatial[0] = + SetLayerId(video->frame(), cfg_.ts_number_layers); + encoder->Control(VP9E_SET_SVC_LAYER_ID, &layer_id); + } } const vpx_rational_t tb = video->timebase(); timebase_ = static_cast(tb.num) / tb.den; @@ -830,25 +832,37 @@ class DatarateTestVP9FrameQp ::libvpx_test::Encoder *encoder) { set_cpu_used_ = 7; DatarateTestVP9::PreEncodeFrameHook(video, encoder); - ACMRandom rnd; - frame_qp_ = static_cast(rnd.RandRange(64)); + frame_qp_ = static_cast(rnd_.RandRange(64)); encoder->Control(VP9E_SET_QUANTIZER_ONE_PASS, frame_qp_); frame_++; } virtual void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) { int qp = 0; + vpx_svc_layer_id_t layer_id; if (frame_ >= total_frame_) return; encoder->Control(VP8E_GET_LAST_QUANTIZER_64, &qp); ASSERT_EQ(frame_qp_, qp); + encoder->Control(VP9E_GET_SVC_LAYER_ID, &layer_id); + temporal_layer_id_ = layer_id.temporal_layer_id; + } + + virtual void MismatchHook(const vpx_image_t * /*img1*/, + const vpx_image_t * /*img2*/) { + if (frame_ >= total_frame_) return; + ASSERT_TRUE(cfg_.temporal_layering_mode == + VP9E_TEMPORAL_LAYERING_MODE_0212 && + temporal_layer_id_ == 2); } protected: int total_frame_; private: + ACMRandom rnd_; int frame_qp_; int frame_; + int temporal_layer_id_; }; TEST_P(DatarateTestVP9FrameQp, VP9SetFrameQp) { @@ -868,7 +882,7 @@ TEST_P(DatarateTestVP9FrameQp, VP9SetFrameQp) { ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); } -TEST_P(DatarateTestVP9FrameQp, VP9SetFrameQp3TemporalLayers) { +TEST_P(DatarateTestVP9FrameQp, VP9SetFrameQp3TemporalLayersBypass) { cfg_.rc_buf_initial_sz = 500; cfg_.rc_buf_optimal_sz = 500; cfg_.rc_buf_sz = 1000; @@ -897,6 +911,36 @@ TEST_P(DatarateTestVP9FrameQp, VP9SetFrameQp3TemporalLayers) { ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); } +TEST_P(DatarateTestVP9FrameQp, VP9SetFrameQp3TemporalLayersFixedMode) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_dropframe_thresh = 0; + cfg_.rc_max_quantizer = 63; + cfg_.rc_min_quantizer = 0; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + + // 3 Temporal layers, no spatial layers: Framerate decimation (4, 2, 1). + cfg_.ss_number_layers = 1; + cfg_.ts_number_layers = 3; + cfg_.ts_rate_decimator[0] = 4; + cfg_.ts_rate_decimator[1] = 2; + cfg_.ts_rate_decimator[2] = 1; + + cfg_.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_0212; + cfg_.rc_target_bitrate = 200; + cfg_.g_error_resilient = 1; + total_frame_ = 400; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, total_frame_); + ResetModel(); + cfg_.layer_target_bitrate[0] = 40 * cfg_.rc_target_bitrate / 100; + cfg_.layer_target_bitrate[1] = 60 * cfg_.rc_target_bitrate / 100; + cfg_.layer_target_bitrate[2] = cfg_.rc_target_bitrate; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +} + #if CONFIG_VP9_TEMPORAL_DENOISING // Params: speed setting. class DatarateTestVP9RealTimeDenoiser : public DatarateTestVP9RealTime { diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c index 518c00b34a..7e9435fb5f 100644 --- a/vp9/encoder/vp9_svc_layercontext.c +++ b/vp9/encoder/vp9_svc_layercontext.c @@ -894,6 +894,10 @@ int vp9_one_pass_svc_start_layer(VP9_COMP *const cpi) { RATE_CONTROL *const lrc = &lc->rc; lrc->worst_quality = vp9_quantizer_to_qindex(lc->max_q); lrc->best_quality = vp9_quantizer_to_qindex(lc->min_q); + if (cpi->fixed_qp_onepass) { + lrc->worst_quality = cpi->rc.worst_quality; + lrc->best_quality = cpi->rc.best_quality; + } } if (cpi->oxcf.resize_mode == RESIZE_DYNAMIC && svc->single_layer_svc == 1 && From 5e92d6d103e923a28cd56ee2c7efd6b48a0611b4 Mon Sep 17 00:00:00 2001 From: Gerda Zsejke More Date: Thu, 26 Jan 2023 16:12:55 +0100 Subject: [PATCH 0579/1000] Refactor 8x8 16-bit Neon transpose functions Refactor the Neon implementation of transpose_s16_8x8(q) and transpose_u16_8x8 so that the final step compiles to 8 ZIP1/ZIP2 instructions as opposed to 8 EXT, MOV pairs. This change removes 8 instructions per call to transpose_s16_8x8(q), transpose_u16_8x8 where the result stays in registers for further processing - rather than being stored to memory - like in vpx_hadamard_8x8_neon, for example. This is a backport of this libaom patch[1]. [1] https://aomedia-review.googlesource.com/c/aom/+/169426 Change-Id: Icef3e51d40efeca7008e1c4fc701bf39bd319c88 --- vpx_dsp/arm/fdct16x16_neon.c | 6 +- vpx_dsp/arm/fdct32x32_neon.c | 64 ++++++++++---------- vpx_dsp/arm/transpose_neon.h | 114 +++++++++++++++++++++++++---------- 3 files changed, 118 insertions(+), 66 deletions(-) diff --git a/vpx_dsp/arm/fdct16x16_neon.c b/vpx_dsp/arm/fdct16x16_neon.c index a458ecaa41..0628acb750 100644 --- a/vpx_dsp/arm/fdct16x16_neon.c +++ b/vpx_dsp/arm/fdct16x16_neon.c @@ -47,8 +47,8 @@ void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) { // Transpose top left and top right quarters into one contiguous location to // process to the top half. - transpose_s16_8x8_new(&temp0[0], &temp2[0]); - transpose_s16_8x8_new(&temp1[0], &temp2[8]); + transpose_s16_8x8q(&temp0[0], &temp2[0]); + transpose_s16_8x8q(&temp1[0], &temp2[8]); partial_round_shift(temp2); cross_input(temp2, temp3); vpx_fdct8x16_body(temp3, temp2); @@ -62,7 +62,7 @@ void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) { // Transpose bottom left and bottom right quarters into one contiguous // location to process to the bottom half. - transpose_s16_8x8_new(&temp0[8], &temp1[0]); + transpose_s16_8x8q(&temp0[8], &temp1[0]); transpose_s16_8x8(&temp1[8], &temp1[9], &temp1[10], &temp1[11], &temp1[12], &temp1[13], &temp1[14], &temp1[15]); diff --git a/vpx_dsp/arm/fdct32x32_neon.c b/vpx_dsp/arm/fdct32x32_neon.c index d6818d2ec6..a91730ce8b 100644 --- a/vpx_dsp/arm/fdct32x32_neon.c +++ b/vpx_dsp/arm/fdct32x32_neon.c @@ -60,10 +60,10 @@ void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) { dct_body_first_pass(temp5, temp4); // Generate the top row by munging the first set of 8 from each one together. - transpose_s16_8x8_new(&temp1[0], &temp0[0]); - transpose_s16_8x8_new(&temp2[0], &temp0[8]); - transpose_s16_8x8_new(&temp3[0], &temp0[16]); - transpose_s16_8x8_new(&temp4[0], &temp0[24]); + transpose_s16_8x8q(&temp1[0], &temp0[0]); + transpose_s16_8x8q(&temp2[0], &temp0[8]); + transpose_s16_8x8q(&temp3[0], &temp0[16]); + transpose_s16_8x8q(&temp4[0], &temp0[24]); dct_body_second_pass(temp0, temp5); @@ -78,10 +78,10 @@ void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) { store(output, temp5); // Second row of 8x32. - transpose_s16_8x8_new(&temp1[8], &temp0[0]); - transpose_s16_8x8_new(&temp2[8], &temp0[8]); - transpose_s16_8x8_new(&temp3[8], &temp0[16]); - transpose_s16_8x8_new(&temp4[8], &temp0[24]); + transpose_s16_8x8q(&temp1[8], &temp0[0]); + transpose_s16_8x8q(&temp2[8], &temp0[8]); + transpose_s16_8x8q(&temp3[8], &temp0[16]); + transpose_s16_8x8q(&temp4[8], &temp0[24]); dct_body_second_pass(temp0, temp5); @@ -96,10 +96,10 @@ void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) { store(output + 8 * 32, temp5); // Third row of 8x32 - transpose_s16_8x8_new(&temp1[16], &temp0[0]); - transpose_s16_8x8_new(&temp2[16], &temp0[8]); - transpose_s16_8x8_new(&temp3[16], &temp0[16]); - transpose_s16_8x8_new(&temp4[16], &temp0[24]); + transpose_s16_8x8q(&temp1[16], &temp0[0]); + transpose_s16_8x8q(&temp2[16], &temp0[8]); + transpose_s16_8x8q(&temp3[16], &temp0[16]); + transpose_s16_8x8q(&temp4[16], &temp0[24]); dct_body_second_pass(temp0, temp5); @@ -114,10 +114,10 @@ void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) { store(output + 16 * 32, temp5); // Final row of 8x32. - transpose_s16_8x8_new(&temp1[24], &temp0[0]); - transpose_s16_8x8_new(&temp2[24], &temp0[8]); - transpose_s16_8x8_new(&temp3[24], &temp0[16]); - transpose_s16_8x8_new(&temp4[24], &temp0[24]); + transpose_s16_8x8q(&temp1[24], &temp0[0]); + transpose_s16_8x8q(&temp2[24], &temp0[8]); + transpose_s16_8x8q(&temp3[24], &temp0[16]); + transpose_s16_8x8q(&temp4[24], &temp0[24]); dct_body_second_pass(temp0, temp5); @@ -159,10 +159,10 @@ void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output, dct_body_first_pass(temp5, temp4); // Generate the top row by munging the first set of 8 from each one together. - transpose_s16_8x8_new(&temp1[0], &temp0[0]); - transpose_s16_8x8_new(&temp2[0], &temp0[8]); - transpose_s16_8x8_new(&temp3[0], &temp0[16]); - transpose_s16_8x8_new(&temp4[0], &temp0[24]); + transpose_s16_8x8q(&temp1[0], &temp0[0]); + transpose_s16_8x8q(&temp2[0], &temp0[8]); + transpose_s16_8x8q(&temp3[0], &temp0[16]); + transpose_s16_8x8q(&temp4[0], &temp0[24]); dct_body_second_pass_rd(temp0, temp5); @@ -177,10 +177,10 @@ void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output, store(output, temp5); // Second row of 8x32. - transpose_s16_8x8_new(&temp1[8], &temp0[0]); - transpose_s16_8x8_new(&temp2[8], &temp0[8]); - transpose_s16_8x8_new(&temp3[8], &temp0[16]); - transpose_s16_8x8_new(&temp4[8], &temp0[24]); + transpose_s16_8x8q(&temp1[8], &temp0[0]); + transpose_s16_8x8q(&temp2[8], &temp0[8]); + transpose_s16_8x8q(&temp3[8], &temp0[16]); + transpose_s16_8x8q(&temp4[8], &temp0[24]); dct_body_second_pass_rd(temp0, temp5); @@ -195,10 +195,10 @@ void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output, store(output + 8 * 32, temp5); // Third row of 8x32 - transpose_s16_8x8_new(&temp1[16], &temp0[0]); - transpose_s16_8x8_new(&temp2[16], &temp0[8]); - transpose_s16_8x8_new(&temp3[16], &temp0[16]); - transpose_s16_8x8_new(&temp4[16], &temp0[24]); + transpose_s16_8x8q(&temp1[16], &temp0[0]); + transpose_s16_8x8q(&temp2[16], &temp0[8]); + transpose_s16_8x8q(&temp3[16], &temp0[16]); + transpose_s16_8x8q(&temp4[16], &temp0[24]); dct_body_second_pass_rd(temp0, temp5); @@ -213,10 +213,10 @@ void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output, store(output + 16 * 32, temp5); // Final row of 8x32. - transpose_s16_8x8_new(&temp1[24], &temp0[0]); - transpose_s16_8x8_new(&temp2[24], &temp0[8]); - transpose_s16_8x8_new(&temp3[24], &temp0[16]); - transpose_s16_8x8_new(&temp4[24], &temp0[24]); + transpose_s16_8x8q(&temp1[24], &temp0[0]); + transpose_s16_8x8q(&temp2[24], &temp0[8]); + transpose_s16_8x8q(&temp3[24], &temp0[16]); + transpose_s16_8x8q(&temp4[24], &temp0[24]); dct_body_second_pass_rd(temp0, temp5); diff --git a/vpx_dsp/arm/transpose_neon.h b/vpx_dsp/arm/transpose_neon.h index 41d44f2b1f..9d13132502 100644 --- a/vpx_dsp/arm/transpose_neon.h +++ b/vpx_dsp/arm/transpose_neon.h @@ -23,10 +23,17 @@ // b0.val[1]: 04 05 06 07 20 21 22 23 static INLINE int16x8x2_t vpx_vtrnq_s64_to_s16(int32x4_t a0, int32x4_t a1) { int16x8x2_t b0; +#if defined(__aarch64__) + b0.val[0] = vreinterpretq_s16_s64( + vtrn1q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1))); + b0.val[1] = vreinterpretq_s16_s64( + vtrn2q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1))); +#else b0.val[0] = vcombine_s16(vreinterpret_s16_s32(vget_low_s32(a0)), vreinterpret_s16_s32(vget_low_s32(a1))); b0.val[1] = vcombine_s16(vreinterpret_s16_s32(vget_high_s32(a0)), vreinterpret_s16_s32(vget_high_s32(a1))); +#endif return b0; } @@ -57,10 +64,17 @@ static INLINE uint8x16x2_t vpx_vtrnq_u64_to_u8(uint32x4_t a0, uint32x4_t a1) { static INLINE uint16x8x2_t vpx_vtrnq_u64_to_u16(uint32x4_t a0, uint32x4_t a1) { uint16x8x2_t b0; +#if defined(__aarch64__) + b0.val[0] = vreinterpretq_u16_u64( + vtrn1q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1))); + b0.val[1] = vreinterpretq_u16_u64( + vtrn2q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1))); +#else b0.val[0] = vcombine_u16(vreinterpret_u16_u32(vget_low_u32(a0)), vreinterpret_u16_u32(vget_low_u32(a1))); b0.val[1] = vcombine_u16(vreinterpret_u16_u32(vget_high_u32(a0)), vreinterpret_u16_u32(vget_high_u32(a1))); +#endif return b0; } @@ -569,37 +583,73 @@ static INLINE void transpose_u8_8x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2, } // Transpose 8x8 to a new location. -static INLINE void transpose_s16_8x8_new(const int16x8_t *a, int16x8_t *b) { - // Swap 16 bit elements. - const int16x8x2_t c0 = vtrnq_s16(a[0], a[1]); - const int16x8x2_t c1 = vtrnq_s16(a[2], a[3]); - const int16x8x2_t c2 = vtrnq_s16(a[4], a[5]); - const int16x8x2_t c3 = vtrnq_s16(a[6], a[7]); - - // Swap 32 bit elements. - const int32x4x2_t d0 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[0]), - vreinterpretq_s32_s16(c1.val[0])); - const int32x4x2_t d1 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[1]), - vreinterpretq_s32_s16(c1.val[1])); - const int32x4x2_t d2 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[0]), - vreinterpretq_s32_s16(c3.val[0])); - const int32x4x2_t d3 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[1]), - vreinterpretq_s32_s16(c3.val[1])); - - // Swap 64 bit elements - const int16x8x2_t e0 = vpx_vtrnq_s64_to_s16(d0.val[0], d2.val[0]); - const int16x8x2_t e1 = vpx_vtrnq_s64_to_s16(d1.val[0], d3.val[0]); - const int16x8x2_t e2 = vpx_vtrnq_s64_to_s16(d0.val[1], d2.val[1]); - const int16x8x2_t e3 = vpx_vtrnq_s64_to_s16(d1.val[1], d3.val[1]); - - b[0] = e0.val[0]; - b[1] = e1.val[0]; - b[2] = e2.val[0]; - b[3] = e3.val[0]; - b[4] = e0.val[1]; - b[5] = e1.val[1]; - b[6] = e2.val[1]; - b[7] = e3.val[1]; +static INLINE void transpose_s16_8x8q(int16x8_t *a, int16x8_t *out) { + // Swap 16 bit elements. Goes from: + // a0: 00 01 02 03 04 05 06 07 + // a1: 10 11 12 13 14 15 16 17 + // a2: 20 21 22 23 24 25 26 27 + // a3: 30 31 32 33 34 35 36 37 + // a4: 40 41 42 43 44 45 46 47 + // a5: 50 51 52 53 54 55 56 57 + // a6: 60 61 62 63 64 65 66 67 + // a7: 70 71 72 73 74 75 76 77 + // to: + // b0.val[0]: 00 10 02 12 04 14 06 16 + // b0.val[1]: 01 11 03 13 05 15 07 17 + // b1.val[0]: 20 30 22 32 24 34 26 36 + // b1.val[1]: 21 31 23 33 25 35 27 37 + // b2.val[0]: 40 50 42 52 44 54 46 56 + // b2.val[1]: 41 51 43 53 45 55 47 57 + // b3.val[0]: 60 70 62 72 64 74 66 76 + // b3.val[1]: 61 71 63 73 65 75 67 77 + + const int16x8x2_t b0 = vtrnq_s16(a[0], a[1]); + const int16x8x2_t b1 = vtrnq_s16(a[2], a[3]); + const int16x8x2_t b2 = vtrnq_s16(a[4], a[5]); + const int16x8x2_t b3 = vtrnq_s16(a[6], a[7]); + + // Swap 32 bit elements resulting in: + // c0.val[0]: 00 10 20 30 04 14 24 34 + // c0.val[1]: 02 12 22 32 06 16 26 36 + // c1.val[0]: 01 11 21 31 05 15 25 35 + // c1.val[1]: 03 13 23 33 07 17 27 37 + // c2.val[0]: 40 50 60 70 44 54 64 74 + // c2.val[1]: 42 52 62 72 46 56 66 76 + // c3.val[0]: 41 51 61 71 45 55 65 75 + // c3.val[1]: 43 53 63 73 47 57 67 77 + + const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]), + vreinterpretq_s32_s16(b1.val[0])); + const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]), + vreinterpretq_s32_s16(b1.val[1])); + const int32x4x2_t c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]), + vreinterpretq_s32_s16(b3.val[0])); + const int32x4x2_t c3 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[1]), + vreinterpretq_s32_s16(b3.val[1])); + + // Swap 64 bit elements resulting in: + // d0.val[0]: 00 10 20 30 40 50 60 70 + // d0.val[1]: 04 14 24 34 44 54 64 74 + // d1.val[0]: 01 11 21 31 41 51 61 71 + // d1.val[1]: 05 15 25 35 45 55 65 75 + // d2.val[0]: 02 12 22 32 42 52 62 72 + // d2.val[1]: 06 16 26 36 46 56 66 76 + // d3.val[0]: 03 13 23 33 43 53 63 73 + // d3.val[1]: 07 17 27 37 47 57 67 77 + + const int16x8x2_t d0 = vpx_vtrnq_s64_to_s16(c0.val[0], c2.val[0]); + const int16x8x2_t d1 = vpx_vtrnq_s64_to_s16(c1.val[0], c3.val[0]); + const int16x8x2_t d2 = vpx_vtrnq_s64_to_s16(c0.val[1], c2.val[1]); + const int16x8x2_t d3 = vpx_vtrnq_s64_to_s16(c1.val[1], c3.val[1]); + + out[0] = d0.val[0]; + out[1] = d1.val[0]; + out[2] = d2.val[0]; + out[3] = d3.val[0]; + out[4] = d0.val[1]; + out[5] = d1.val[1]; + out[6] = d2.val[1]; + out[7] = d3.val[1]; } static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1, @@ -658,6 +708,7 @@ static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1, // d2.val[1]: 06 16 26 36 46 56 66 76 // d3.val[0]: 03 13 23 33 43 53 63 73 // d3.val[1]: 07 17 27 37 47 57 67 77 + const int16x8x2_t d0 = vpx_vtrnq_s64_to_s16(c0.val[0], c2.val[0]); const int16x8x2_t d1 = vpx_vtrnq_s64_to_s16(c1.val[0], c3.val[0]); const int16x8x2_t d2 = vpx_vtrnq_s64_to_s16(c0.val[1], c2.val[1]); @@ -729,6 +780,7 @@ static INLINE void transpose_u16_8x8(uint16x8_t *a0, uint16x8_t *a1, // d2.val[1]: 06 16 26 36 46 56 66 76 // d3.val[0]: 03 13 23 33 43 53 63 73 // d3.val[1]: 07 17 27 37 47 57 67 77 + const uint16x8x2_t d0 = vpx_vtrnq_u64_to_u16(c0.val[0], c2.val[0]); const uint16x8x2_t d1 = vpx_vtrnq_u64_to_u16(c1.val[0], c3.val[0]); const uint16x8x2_t d2 = vpx_vtrnq_u64_to_u16(c0.val[1], c2.val[1]); From 5dd3d70a4f7256cfb48aa925e44c42d80abe93b1 Mon Sep 17 00:00:00 2001 From: Yunqing Wang Date: Thu, 26 Jan 2023 17:20:54 -0800 Subject: [PATCH 0580/1000] Add encoder component timing information Change-Id: Iaa5b73a9593ecfd74b6426ed47d2b529ec7ae2b5 --- configure | 1 + vp9/encoder/vp9_encodeframe.c | 37 +++++++++++++ vp9/encoder/vp9_encoder.c | 90 +++++++++++++++++++++++++++++- vp9/encoder/vp9_encoder.h | 101 ++++++++++++++++++++++++++++++++++ vp9/encoder/vp9_rdopt.c | 37 ++++++++++++- 5 files changed, 264 insertions(+), 2 deletions(-) diff --git a/configure b/configure index ae289f77b4..18f0ea798b 100755 --- a/configure +++ b/configure @@ -293,6 +293,7 @@ EXPERIMENT_LIST=" emulate_hardware non_greedy_mv rate_ctrl + collect_component_timing " CONFIG_LIST=" dependency_tracking diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index 1483ac069d..a22c00bd8f 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -1980,6 +1980,9 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, TileDataEnc *tile_data, int64_t best_rd = INT64_MAX; vpx_clear_system_state(); +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, rd_pick_sb_modes_time); +#endif // Use the lower precision, but faster, 32x32 fdct for mode selection. x->use_lp32x32fdct = 1; @@ -2047,15 +2050,27 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, TileDataEnc *tile_data, vp9_rd_pick_intra_mode_sb(cpi, x, rd_cost, bsize, ctx, best_rd); } else { if (bsize >= BLOCK_8X8) { +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, vp9_rd_pick_inter_mode_sb_time); +#endif if (segfeature_active(&cm->seg, mi->segment_id, SEG_LVL_SKIP)) vp9_rd_pick_inter_mode_sb_seg_skip(cpi, tile_data, x, rd_cost, bsize, ctx, best_rd); else vp9_rd_pick_inter_mode_sb(cpi, tile_data, x, mi_row, mi_col, rd_cost, bsize, ctx, best_rd); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, vp9_rd_pick_inter_mode_sb_time); +#endif } else { +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, vp9_rd_pick_inter_mode_sub8x8_time); +#endif vp9_rd_pick_inter_mode_sub8x8(cpi, tile_data, x, mi_row, mi_col, rd_cost, bsize, ctx, best_rd); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, vp9_rd_pick_inter_mode_sub8x8_time); +#endif } } @@ -2078,6 +2093,9 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, TileDataEnc *tile_data, ctx->rate = rd_cost->rate; ctx->dist = rd_cost->dist; +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, rd_pick_sb_modes_time); +#endif } #endif // !CONFIG_REALTIME_ONLY @@ -4411,8 +4429,14 @@ static int rd_pick_partition(VP9_COMP *cpi, ThreadData *td, if (should_encode_sb && pc_tree->index != 3) { int output_enabled = (bsize == BLOCK_64X64); +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, encode_sb_time); +#endif encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, output_enabled, bsize, pc_tree); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, encode_sb_time); +#endif #if CONFIG_RATE_CTRL if (oxcf->use_simple_encode_api) { // Store partition, motion vector of the superblock. @@ -4539,8 +4563,15 @@ static void encode_rd_sb_row(VP9_COMP *cpi, ThreadData *td, &x->min_partition_size, &x->max_partition_size); } td->pc_root->none.rdcost = 0; + +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, rd_pick_partition_time); +#endif rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, BLOCK_64X64, &dummy_rdc, dummy_rdc, td->pc_root); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, rd_pick_partition_time); +#endif } (*(cpi->row_mt_sync_write_ptr))(&tile_data->row_mt_sync, sb_row, sb_col_in_tile, num_sb_cols); @@ -6283,7 +6314,13 @@ void vp9_encode_frame(VP9_COMP *cpi) { if (cm->interp_filter == SWITCHABLE) cm->interp_filter = get_interp_filter(filter_thrs, is_alt_ref); +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, encode_frame_internal_time); +#endif encode_frame_internal(cpi); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, encode_frame_internal_time); +#endif for (i = 0; i < REFERENCE_MODES; ++i) mode_thrs[i] = (mode_thrs[i] + rdc->comp_pred_diff[i] / cm->MBs) / 2; diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index b66fdc0bca..b9fc148d7b 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -4415,6 +4415,9 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest (cpi->twopass.gf_group.index == 1) : 0; +#if CONFIG_COLLECT_COMPONENT_TIMING + printf("\n Encoding a frame: \n"); +#endif do { vpx_clear_system_state(); @@ -4802,6 +4805,9 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest if (cpi->sf.recode_loop >= ALLOW_RECODE_KFARFGF) if (loop) restore_coding_context(cpi); +#if CONFIG_COLLECT_COMPONENT_TIMING + if (loop) printf("\n Recoding:"); +#endif } while (loop); rc->max_frame_bandwidth = orig_rc_max_frame_bandwidth; @@ -5549,8 +5555,14 @@ static void encode_frame_to_data_rate( #if !CONFIG_REALTIME_ONLY #if CONFIG_RATE_CTRL encode_with_recode_loop(cpi, size, dest, &encode_frame_result->rq_history); -#else // CONFIG_RATE_CTRL +#else // CONFIG_RATE_CTRL +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, encode_with_recode_loop_time); +#endif encode_with_recode_loop(cpi, size, dest); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, encode_with_recode_loop_time); +#endif #endif // CONFIG_RATE_CTRL #endif // !CONFIG_REALTIME_ONLY } @@ -5609,13 +5621,25 @@ static void encode_frame_to_data_rate( cm->frame_to_show->render_width = cm->render_width; cm->frame_to_show->render_height = cm->render_height; +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, loopfilter_frame_time); +#endif // Pick the loop filter level for the frame. loopfilter_frame(cpi, cm); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, loopfilter_frame_time); +#endif if (cpi->rc.use_post_encode_drop) save_coding_context(cpi); +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, vp9_pack_bitstream_time); +#endif // build the bitstream vp9_pack_bitstream(cpi, dest, size); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, vp9_pack_bitstream_time); +#endif if (cpi->ext_ratectrl.ready) { const RefCntBuffer *coded_frame_buf = @@ -7640,6 +7664,10 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, const int gf_group_index = cpi->twopass.gf_group.index; int i; +#if CONFIG_COLLECT_COMPONENT_TIMING + if (oxcf->pass == 2) start_timing(cpi, vp9_get_compressed_data_time); +#endif + if (is_one_pass_svc(cpi)) { vp9_one_pass_svc_start_layer(cpi); } @@ -7704,9 +7732,15 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, int not_last_frame = (cpi->lookahead->sz - arf_src_index > 1); not_last_frame |= ALT_REF_AQ_APPLY_TO_LAST_FRAME; +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, vp9_temporal_filter_time); +#endif // Produce the filtered ARF frame. vp9_temporal_filter(cpi, arf_src_index); vpx_extend_frame_borders(&cpi->alt_ref_buffer); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, vp9_temporal_filter_time); +#endif // for small bitrates segmentation overhead usually // eats all bitrate gain from enabling delta quantizers @@ -7820,7 +7854,13 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, #if !CONFIG_REALTIME_ONLY if ((oxcf->pass == 2) && !cpi->use_svc) { +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, vp9_rc_get_second_pass_params_time); +#endif vp9_rc_get_second_pass_params(cpi); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, vp9_rc_get_second_pass_params_time); +#endif } else if (oxcf->pass == 1) { set_frame_size(cpi); } @@ -7860,6 +7900,9 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, } #endif // CONFIG_NON_GREEDY_MV +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, setup_tpl_stats_time); +#endif if (gf_group_index == 1 && cpi->twopass.gf_group.update_type[gf_group_index] == ARF_UPDATE && cpi->sf.enable_tpl_model) { @@ -7867,6 +7910,9 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, vp9_estimate_qp_gop(cpi); setup_tpl_stats(cpi); } +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, setup_tpl_stats_time); +#endif #if CONFIG_BITSTREAM_DEBUG assert(cpi->oxcf.max_threads == 0 && @@ -7903,8 +7949,15 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, cpi->td.mb.inv_txfm_add = lossless ? vp9_iwht4x4_add : vp9_idct4x4_add; vp9_first_pass(cpi, source); } else if (oxcf->pass == 2 && !cpi->use_svc) { +#if CONFIG_COLLECT_COMPONENT_TIMING + // Accumulate 2nd pass time in 2-pass case. + start_timing(cpi, Pass2Encode_time); +#endif Pass2Encode(cpi, size, dest, frame_flags, encode_frame_result); vp9_twopass_postencode_update(cpi); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, Pass2Encode_time); +#endif } else if (cpi->use_svc) { SvcEncode(cpi, size, dest, frame_flags); } else { @@ -8107,6 +8160,41 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, #endif +#if CONFIG_COLLECT_COMPONENT_TIMING + if (oxcf->pass == 2) end_timing(cpi, vp9_get_compressed_data_time); + + // Print out timing information. + // Note: Use "cpi->frame_component_time[0] > 100 us" to avoid showing of + // show_existing_frame and lag-in-frames. + // if (cpi->frame_component_time[0] > 100) + if (oxcf->pass == 2) { + uint64_t frame_total = 0, total = 0; + int i; + + fprintf(stderr, + "\n Frame number: %d, Frame type: %s, Show Frame: %d, Q: %d\n", + cm->current_video_frame, get_frame_type_enum(cm->frame_type), + cm->show_frame, cm->base_qindex); + for (i = 0; i < kTimingComponents; i++) { + cpi->component_time[i] += cpi->frame_component_time[i]; + // Use vp9_get_compressed_data_time (i = 0) as the total time. + if (i == 0) { + frame_total = cpi->frame_component_time[0]; + total = cpi->component_time[0]; + } + fprintf(stderr, + " %50s: %15" PRId64 " us [%6.2f%%] (total: %15" PRId64 + " us [%6.2f%%])\n", + get_component_name(i), cpi->frame_component_time[i], + (float)((float)cpi->frame_component_time[i] * 100.0 / + (float)frame_total), + cpi->component_time[i], + (float)((float)cpi->component_time[i] * 100.0 / (float)total)); + cpi->frame_component_time[i] = 0; + } + } +#endif + if (is_one_pass_svc(cpi)) { if (cm->show_frame) { ++cpi->svc.spatial_layer_to_encode; diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index cca8b53f8e..33a2844d3e 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -659,6 +659,72 @@ static INLINE int get_num_unit_4x4(int size) { return (size + 3) >> 2; } static INLINE int get_num_unit_16x16(int size) { return (size + 15) >> 4; } #endif // CONFIG_RATE_CTRL +#if CONFIG_COLLECT_COMPONENT_TIMING +#include "vpx_ports/vpx_timer.h" +// Adjust the following to add new components. +typedef enum { + vp9_get_compressed_data_time, + vp9_temporal_filter_time, + vp9_rc_get_second_pass_params_time, + setup_tpl_stats_time, + Pass2Encode_time, + + encode_with_recode_loop_time, + loopfilter_frame_time, + vp9_pack_bitstream_time, + + encode_frame_internal_time, + rd_pick_partition_time, + rd_pick_sb_modes_time, + encode_sb_time, + + vp9_rd_pick_inter_mode_sb_time, + vp9_rd_pick_inter_mode_sub8x8_time, + + intra_mode_search_time, + handle_inter_mode_time, + single_motion_search_time, + joint_motion_search_time, + interp_filter_time, + + kTimingComponents, +} TIMING_COMPONENT; + +static INLINE char const *get_component_name(int index) { + switch (index) { + case vp9_get_compressed_data_time: return "vp9_get_compressed_data_time"; + case vp9_temporal_filter_time: return "vp9_temporal_filter_time"; + case vp9_rc_get_second_pass_params_time: + return "vp9_rc_get_second_pass_params_time"; + case setup_tpl_stats_time: return "setup_tpl_stats_time"; + case Pass2Encode_time: return "Pass2Encode_time"; + + case encode_with_recode_loop_time: return "encode_with_recode_loop_time"; + case loopfilter_frame_time: return "loopfilter_frame_time"; + case vp9_pack_bitstream_time: return "vp9_pack_bitstream_time"; + + case encode_frame_internal_time: return "encode_frame_internal_time"; + case rd_pick_partition_time: return "rd_pick_partition_time"; + case rd_pick_sb_modes_time: return "rd_pick_sb_modes_time"; + case encode_sb_time: return "encode_sb_time"; + + case vp9_rd_pick_inter_mode_sb_time: + return "vp9_rd_pick_inter_mode_sb_time"; + case vp9_rd_pick_inter_mode_sub8x8_time: + return "vp9_rd_pick_inter_mode_sub8x8_time"; + + case intra_mode_search_time: return "intra_mode_search_time"; + case handle_inter_mode_time: return "handle_inter_mode_time"; + case single_motion_search_time: return "single_motion_search_time"; + case joint_motion_search_time: return "joint_motion_search_time"; + case interp_filter_time: return "interp_filter_time"; + + default: assert(0); + } + return "error"; +} +#endif + typedef struct VP9_COMP { FRAME_INFO frame_info; QUANTS quants; @@ -973,6 +1039,22 @@ typedef struct VP9_COMP { EXT_RATECTRL ext_ratectrl; int fixed_qp_onepass; + +#if CONFIG_COLLECT_COMPONENT_TIMING + /*! + * component_time[] are initialized to zero while encoder starts. + */ + uint64_t component_time[kTimingComponents]; + /*! + * Stores timing for individual components between calls of start_timing() + * and end_timing(). + */ + struct vpx_usec_timer component_timer[kTimingComponents]; + /*! + * frame_component_time[] are initialized to zero at beginning of each frame. + */ + uint64_t frame_component_time[kTimingComponents]; +#endif } VP9_COMP; #if CONFIG_RATE_CTRL @@ -1392,6 +1474,25 @@ int vp9_get_psnr(const VP9_COMP *cpi, PSNR_STATS *psnr); #define LAYER_IDS_TO_IDX(sl, tl, num_tl) ((sl) * (num_tl) + (tl)) +#if CONFIG_COLLECT_COMPONENT_TIMING +static INLINE void start_timing(VP9_COMP *cpi, int component) { + vpx_usec_timer_start(&cpi->component_timer[component]); +} +static INLINE void end_timing(VP9_COMP *cpi, int component) { + vpx_usec_timer_mark(&cpi->component_timer[component]); + cpi->frame_component_time[component] += + vpx_usec_timer_elapsed(&cpi->component_timer[component]); +} +static INLINE char const *get_frame_type_enum(int type) { + switch (type) { + case 0: return "KEY_FRAME"; + case 1: return "INTER_FRAME"; + default: assert(0); + } + return "error"; +} +#endif + #ifdef __cplusplus } // extern "C" #endif diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index a464ce38f1..d9b031cdc8 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -2832,8 +2832,14 @@ static int64_t handle_inter_mode( frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int; if (cpi->sf.comp_inter_joint_search_thresh <= bsize) { +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, joint_motion_search_time); +#endif joint_motion_search(cpi, x, bsize, frame_mv, mi_row, mi_col, single_newmv, &rate_mv); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, joint_motion_search_time); +#endif } else { rate_mv = vp9_mv_bit_cost(&frame_mv[refs[0]].as_mv, &x->mbmi_ext->ref_mvs[refs[0]][0].as_mv, @@ -2845,7 +2851,13 @@ static int64_t handle_inter_mode( *rate2 += rate_mv; } else { int_mv tmp_mv; +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, single_motion_search_time); +#endif single_motion_search(cpi, x, bsize, mi_row, mi_col, &tmp_mv, &rate_mv); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, single_motion_search_time); +#endif if (tmp_mv.as_int == INVALID_MV) return INT64_MAX; frame_mv[refs[0]].as_int = xd->mi[0]->bmi[0].as_mv[0].as_int = @@ -2908,6 +2920,9 @@ static int64_t handle_inter_mode( intpel_mv = !mv_has_subpel(&mi->mv[0].as_mv); if (is_comp_pred) intpel_mv &= !mv_has_subpel(&mi->mv[1].as_mv); +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, interp_filter_time); +#endif // Search for best switchable filter by checking the variance of // pred error irrespective of whether the filter will be used for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i) filter_cache[i] = INT64_MAX; @@ -3005,6 +3020,9 @@ static int64_t handle_inter_mode( restore_dst_buf(xd, orig_dst, orig_dst_stride); } } +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, interp_filter_time); +#endif // Set the appropriate filter mi->interp_filter = cm->interp_filter != SWITCHABLE ? cm->interp_filter : best_filter; @@ -3707,19 +3725,30 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, if (ref_frame == INTRA_FRAME) { TX_SIZE uv_tx; struct macroblockd_plane *const pd = &xd->plane[1]; +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, intra_mode_search_time); +#endif memset(x->skip_txfm, 0, sizeof(x->skip_txfm)); super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable, NULL, bsize, best_rd, recon); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, intra_mode_search_time); +#endif if (rate_y == INT_MAX) continue; uv_tx = uv_txsize_lookup[bsize][mi->tx_size][pd->subsampling_x] [pd->subsampling_y]; +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, intra_mode_search_time); +#endif if (rate_uv_intra[uv_tx] == INT_MAX) { choose_intra_uv_mode(cpi, x, ctx, bsize, uv_tx, &rate_uv_intra[uv_tx], &rate_uv_tokenonly[uv_tx], &dist_uv[uv_tx], &skip_uv[uv_tx], &mode_uv[uv_tx]); } - +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, intra_mode_search_time); +#endif rate_uv = rate_uv_tokenonly[uv_tx]; distortion_uv = dist_uv[uv_tx]; skippable = skippable && skip_uv[uv_tx]; @@ -3730,11 +3759,17 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, rate2 += intra_cost_penalty; distortion2 = distortion_y + distortion_uv; } else { +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, handle_inter_mode_time); +#endif this_rd = handle_inter_mode( cpi, x, bsize, &rate2, &distortion2, &skippable, &rate_y, &rate_uv, recon, &disable_skip, frame_mv, mi_row, mi_col, single_newmv, single_inter_filter, single_skippable, &total_sse, best_rd, &mask_filter, filter_cache); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, handle_inter_mode_time); +#endif if (this_rd == INT64_MAX) continue; compmode_cost = vp9_cost_bit(comp_mode_p, comp_pred); From 8047e6f2b3b1c325fae106951c3aee747fde7884 Mon Sep 17 00:00:00 2001 From: Salome Thirot Date: Fri, 27 Jan 2023 16:16:16 +0000 Subject: [PATCH 0581/1000] Refactor Neon implementation of SAD4D functions Refactor and optimize the Neon implementation of SAD4D functions - effectively backporting these libaom changes[1,2]. [1] https://aomedia-review.googlesource.com/c/aom/+/162181 [2] https://aomedia-review.googlesource.com/c/aom/+/162183 Change-Id: Icb04bd841d86f2d0e2596aa7ba86b74f8d2d360b --- vpx_dsp/arm/sad4d_neon.c | 881 +++++++++++++-------------------------- vpx_dsp/arm/sum_neon.h | 17 + 2 files changed, 309 insertions(+), 589 deletions(-) diff --git a/vpx_dsp/arm/sad4d_neon.c b/vpx_dsp/arm/sad4d_neon.c index 5fc621aee1..5064770ee6 100644 --- a/vpx_dsp/arm/sad4d_neon.c +++ b/vpx_dsp/arm/sad4d_neon.c @@ -17,633 +17,336 @@ #include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/sum_neon.h" -static INLINE uint8x8_t load_unaligned_2_buffers(const void *const buf0, - const void *const buf1) { - uint32_t a; - uint32x2_t aa; - memcpy(&a, buf0, 4); - aa = vdup_n_u32(a); - memcpy(&a, buf1, 4); - aa = vset_lane_u32(a, aa, 1); - return vreinterpret_u8_u32(aa); -} - -static INLINE void sad4x_4d(const uint8_t *const src_ptr, const int src_stride, - const uint8_t *const ref_array[4], - const int ref_stride, const int height, - uint32_t sad_array[4]) { - int i; - uint16x8_t abs[2] = { vdupq_n_u16(0), vdupq_n_u16(0) }; -#if !defined(__aarch64__) - uint16x4_t a[2]; -#endif - uint32x4_t r; - - assert(!((intptr_t)src_ptr % sizeof(uint32_t))); - assert(!(src_stride % sizeof(uint32_t))); - - for (i = 0; i < height; ++i) { - const uint8x8_t s = vreinterpret_u8_u32( - vld1_dup_u32((const uint32_t *)(src_ptr + i * src_stride))); - const uint8x8_t ref01 = load_unaligned_2_buffers( - ref_array[0] + i * ref_stride, ref_array[1] + i * ref_stride); - const uint8x8_t ref23 = load_unaligned_2_buffers( - ref_array[2] + i * ref_stride, ref_array[3] + i * ref_stride); - abs[0] = vabal_u8(abs[0], s, ref01); - abs[1] = vabal_u8(abs[1], s, ref23); - } - -#if defined(__aarch64__) - abs[0] = vpaddq_u16(abs[0], abs[1]); - r = vpaddlq_u16(abs[0]); -#else - a[0] = vpadd_u16(vget_low_u16(abs[0]), vget_high_u16(abs[0])); - a[1] = vpadd_u16(vget_low_u16(abs[1]), vget_high_u16(abs[1])); - r = vpaddlq_u16(vcombine_u16(a[0], a[1])); -#endif - vst1q_u32(sad_array, r); -} - -void vpx_sad4x4x4d_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { - sad4x_4d(src_ptr, src_stride, ref_array, ref_stride, 4, sad_array); -} - -void vpx_sad4x8x4d_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { - sad4x_4d(src_ptr, src_stride, ref_array, ref_stride, 8, sad_array); -} - -//////////////////////////////////////////////////////////////////////////////// - -// Can handle 512 pixels' sad sum (such as 16x32 or 32x16) -static INLINE void sad_512_pel_final_neon(const uint16x8_t sum[4], - uint32_t sad_array[4]) { -#if defined(__aarch64__) - const uint16x8_t a0 = vpaddq_u16(sum[0], sum[1]); - const uint16x8_t a1 = vpaddq_u16(sum[2], sum[3]); - const uint16x8_t b0 = vpaddq_u16(a0, a1); - const uint32x4_t r = vpaddlq_u16(b0); -#else - const uint16x4_t a0 = vadd_u16(vget_low_u16(sum[0]), vget_high_u16(sum[0])); - const uint16x4_t a1 = vadd_u16(vget_low_u16(sum[1]), vget_high_u16(sum[1])); - const uint16x4_t a2 = vadd_u16(vget_low_u16(sum[2]), vget_high_u16(sum[2])); - const uint16x4_t a3 = vadd_u16(vget_low_u16(sum[3]), vget_high_u16(sum[3])); - const uint16x4_t b0 = vpadd_u16(a0, a1); - const uint16x4_t b1 = vpadd_u16(a2, a3); - const uint32x4_t r = vpaddlq_u16(vcombine_u16(b0, b1)); -#endif - vst1q_u32(sad_array, r); -} +#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) -#if defined(__arm__) || !defined(__ARM_FEATURE_DOTPROD) - -// Can handle 1024 pixels' sad sum (such as 32x32) -static INLINE void sad_1024_pel_final_neon(const uint16x8_t sum[4], - uint32_t sad_array[4]) { -#if defined(__aarch64__) - const uint16x8_t a0 = vpaddq_u16(sum[0], sum[1]); - const uint16x8_t a1 = vpaddq_u16(sum[2], sum[3]); - const uint32x4_t b0 = vpaddlq_u16(a0); - const uint32x4_t b1 = vpaddlq_u16(a1); - const uint32x4_t r = vpaddq_u32(b0, b1); - vst1q_u32(sad_array, r); -#else - const uint16x4_t a0 = vpadd_u16(vget_low_u16(sum[0]), vget_high_u16(sum[0])); - const uint16x4_t a1 = vpadd_u16(vget_low_u16(sum[1]), vget_high_u16(sum[1])); - const uint16x4_t a2 = vpadd_u16(vget_low_u16(sum[2]), vget_high_u16(sum[2])); - const uint16x4_t a3 = vpadd_u16(vget_low_u16(sum[3]), vget_high_u16(sum[3])); - const uint32x4_t b0 = vpaddlq_u16(vcombine_u16(a0, a1)); - const uint32x4_t b1 = vpaddlq_u16(vcombine_u16(a2, a3)); - const uint32x2_t c0 = vpadd_u32(vget_low_u32(b0), vget_high_u32(b0)); - const uint32x2_t c1 = vpadd_u32(vget_low_u32(b1), vget_high_u32(b1)); - vst1q_u32(sad_array, vcombine_u32(c0, c1)); -#endif +static INLINE void sad16_neon(uint8x16_t src, uint8x16_t ref, + uint32x4_t *const sad_sum) { + uint8x16_t abs_diff = vabdq_u8(src, ref); + *sad_sum = vdotq_u32(*sad_sum, abs_diff, vdupq_n_u8(1)); } -// Can handle 2048 pixels' sad sum (such as 32x64 or 64x32) -static INLINE void sad_2048_pel_final_neon(const uint16x8_t sum[4], - uint32_t sad_array[4]) { -#if defined(__aarch64__) - const uint32x4_t a0 = vpaddlq_u16(sum[0]); - const uint32x4_t a1 = vpaddlq_u16(sum[1]); - const uint32x4_t a2 = vpaddlq_u16(sum[2]); - const uint32x4_t a3 = vpaddlq_u16(sum[3]); - const uint32x4_t b0 = vpaddq_u32(a0, a1); - const uint32x4_t b1 = vpaddq_u32(a2, a3); - const uint32x4_t r = vpaddq_u32(b0, b1); - vst1q_u32(sad_array, r); -#else - const uint32x4_t a0 = vpaddlq_u16(sum[0]); - const uint32x4_t a1 = vpaddlq_u16(sum[1]); - const uint32x4_t a2 = vpaddlq_u16(sum[2]); - const uint32x4_t a3 = vpaddlq_u16(sum[3]); - const uint32x2_t b0 = vadd_u32(vget_low_u32(a0), vget_high_u32(a0)); - const uint32x2_t b1 = vadd_u32(vget_low_u32(a1), vget_high_u32(a1)); - const uint32x2_t b2 = vadd_u32(vget_low_u32(a2), vget_high_u32(a2)); - const uint32x2_t b3 = vadd_u32(vget_low_u32(a3), vget_high_u32(a3)); - const uint32x2_t c0 = vpadd_u32(b0, b1); - const uint32x2_t c1 = vpadd_u32(b2, b3); - vst1q_u32(sad_array, vcombine_u32(c0, c1)); -#endif +static INLINE void sad64xhx4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t res[4], int h) { + uint32x4_t res0, res1; + uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + + int i = 0; + do { + uint8x16_t s0, s1, s2, s3; + + s0 = vld1q_u8(src + i * src_stride); + sad16_neon(s0, vld1q_u8(ref[0] + i * ref_stride), &sum_lo[0]); + sad16_neon(s0, vld1q_u8(ref[1] + i * ref_stride), &sum_lo[1]); + sad16_neon(s0, vld1q_u8(ref[2] + i * ref_stride), &sum_lo[2]); + sad16_neon(s0, vld1q_u8(ref[3] + i * ref_stride), &sum_lo[3]); + + s1 = vld1q_u8(src + i * src_stride + 16); + sad16_neon(s1, vld1q_u8(ref[0] + i * ref_stride + 16), &sum_hi[0]); + sad16_neon(s1, vld1q_u8(ref[1] + i * ref_stride + 16), &sum_hi[1]); + sad16_neon(s1, vld1q_u8(ref[2] + i * ref_stride + 16), &sum_hi[2]); + sad16_neon(s1, vld1q_u8(ref[3] + i * ref_stride + 16), &sum_hi[3]); + + s2 = vld1q_u8(src + i * src_stride + 32); + sad16_neon(s2, vld1q_u8(ref[0] + i * ref_stride + 32), &sum_lo[0]); + sad16_neon(s2, vld1q_u8(ref[1] + i * ref_stride + 32), &sum_lo[1]); + sad16_neon(s2, vld1q_u8(ref[2] + i * ref_stride + 32), &sum_lo[2]); + sad16_neon(s2, vld1q_u8(ref[3] + i * ref_stride + 32), &sum_lo[3]); + + s3 = vld1q_u8(src + i * src_stride + 48); + sad16_neon(s3, vld1q_u8(ref[0] + i * ref_stride + 48), &sum_hi[0]); + sad16_neon(s3, vld1q_u8(ref[1] + i * ref_stride + 48), &sum_hi[1]); + sad16_neon(s3, vld1q_u8(ref[2] + i * ref_stride + 48), &sum_hi[2]); + sad16_neon(s3, vld1q_u8(ref[3] + i * ref_stride + 48), &sum_hi[3]); + + i++; + } while (i < h); + + res0 = vpaddq_u32(vaddq_u32(sum_lo[0], sum_hi[0]), + vaddq_u32(sum_lo[1], sum_hi[1])); + res1 = vpaddq_u32(vaddq_u32(sum_lo[2], sum_hi[2]), + vaddq_u32(sum_lo[3], sum_hi[3])); + vst1q_u32(res, vpaddq_u32(res0, res1)); } -// Can handle 4096 pixels' sad sum (such as 64x64) -static INLINE void sad_4096_pel_final_neon(const uint16x8_t sum[8], - uint32_t sad_array[4]) { -#if defined(__aarch64__) - const uint32x4_t a0 = vpaddlq_u16(sum[0]); - const uint32x4_t a1 = vpaddlq_u16(sum[1]); - const uint32x4_t a2 = vpaddlq_u16(sum[2]); - const uint32x4_t a3 = vpaddlq_u16(sum[3]); - const uint32x4_t a4 = vpaddlq_u16(sum[4]); - const uint32x4_t a5 = vpaddlq_u16(sum[5]); - const uint32x4_t a6 = vpaddlq_u16(sum[6]); - const uint32x4_t a7 = vpaddlq_u16(sum[7]); - const uint32x4_t b0 = vaddq_u32(a0, a1); - const uint32x4_t b1 = vaddq_u32(a2, a3); - const uint32x4_t b2 = vaddq_u32(a4, a5); - const uint32x4_t b3 = vaddq_u32(a6, a7); - const uint32x4_t c0 = vpaddq_u32(b0, b1); - const uint32x4_t c1 = vpaddq_u32(b2, b3); - const uint32x4_t r = vpaddq_u32(c0, c1); - vst1q_u32(sad_array, r); -#else - const uint32x4_t a0 = vpaddlq_u16(sum[0]); - const uint32x4_t a1 = vpaddlq_u16(sum[1]); - const uint32x4_t a2 = vpaddlq_u16(sum[2]); - const uint32x4_t a3 = vpaddlq_u16(sum[3]); - const uint32x4_t a4 = vpaddlq_u16(sum[4]); - const uint32x4_t a5 = vpaddlq_u16(sum[5]); - const uint32x4_t a6 = vpaddlq_u16(sum[6]); - const uint32x4_t a7 = vpaddlq_u16(sum[7]); - const uint32x4_t b0 = vaddq_u32(a0, a1); - const uint32x4_t b1 = vaddq_u32(a2, a3); - const uint32x4_t b2 = vaddq_u32(a4, a5); - const uint32x4_t b3 = vaddq_u32(a6, a7); - const uint32x2_t c0 = vadd_u32(vget_low_u32(b0), vget_high_u32(b0)); - const uint32x2_t c1 = vadd_u32(vget_low_u32(b1), vget_high_u32(b1)); - const uint32x2_t c2 = vadd_u32(vget_low_u32(b2), vget_high_u32(b2)); - const uint32x2_t c3 = vadd_u32(vget_low_u32(b3), vget_high_u32(b3)); - const uint32x2_t d0 = vpadd_u32(c0, c1); - const uint32x2_t d1 = vpadd_u32(c2, c3); - vst1q_u32(sad_array, vcombine_u32(d0, d1)); -#endif +static INLINE void sad32xhx4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t res[4], int h) { + uint32x4_t res0, res1; + uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + + int i = 0; + do { + uint8x16_t s0, s1; + + s0 = vld1q_u8(src + i * src_stride); + sad16_neon(s0, vld1q_u8(ref[0] + i * ref_stride), &sum_lo[0]); + sad16_neon(s0, vld1q_u8(ref[1] + i * ref_stride), &sum_lo[1]); + sad16_neon(s0, vld1q_u8(ref[2] + i * ref_stride), &sum_lo[2]); + sad16_neon(s0, vld1q_u8(ref[3] + i * ref_stride), &sum_lo[3]); + + s1 = vld1q_u8(src + i * src_stride + 16); + sad16_neon(s1, vld1q_u8(ref[0] + i * ref_stride + 16), &sum_hi[0]); + sad16_neon(s1, vld1q_u8(ref[1] + i * ref_stride + 16), &sum_hi[1]); + sad16_neon(s1, vld1q_u8(ref[2] + i * ref_stride + 16), &sum_hi[2]); + sad16_neon(s1, vld1q_u8(ref[3] + i * ref_stride + 16), &sum_hi[3]); + + i++; + } while (i < h); + + res0 = vpaddq_u32(vaddq_u32(sum_lo[0], sum_hi[0]), + vaddq_u32(sum_lo[1], sum_hi[1])); + res1 = vpaddq_u32(vaddq_u32(sum_lo[2], sum_hi[2]), + vaddq_u32(sum_lo[3], sum_hi[3])); + vst1q_u32(res, vpaddq_u32(res0, res1)); } -#endif - -static INLINE void sad8x_4d(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4], const int height) { - int i, j; - const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], - ref_array[3] }; - uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), - vdupq_n_u16(0) }; - - for (i = 0; i < height; ++i) { - const uint8x8_t s = vld1_u8(src_ptr); - src_ptr += src_stride; - for (j = 0; j < 4; ++j) { - const uint8x8_t b_u8 = vld1_u8(ref_loop[j]); - ref_loop[j] += ref_stride; - sum[j] = vabal_u8(sum[j], s, b_u8); - } - } - - sad_512_pel_final_neon(sum, sad_array); -} +static INLINE void sad16xhx4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t res[4], int h) { + uint32x4_t res0, res1; + uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; -void vpx_sad8x4x4d_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { - sad8x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 4); -} + int i = 0; + do { + const uint8x16_t s = vld1q_u8(src + i * src_stride); + sad16_neon(s, vld1q_u8(ref[0] + i * ref_stride), &sum[0]); + sad16_neon(s, vld1q_u8(ref[1] + i * ref_stride), &sum[1]); + sad16_neon(s, vld1q_u8(ref[2] + i * ref_stride), &sum[2]); + sad16_neon(s, vld1q_u8(ref[3] + i * ref_stride), &sum[3]); -void vpx_sad8x8x4d_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { - sad8x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 8); -} + i++; + } while (i < h); -void vpx_sad8x16x4d_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { - sad8x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 16); + res0 = vpaddq_u32(sum[0], sum[1]); + res1 = vpaddq_u32(sum[2], sum[3]); + vst1q_u32(res, vpaddq_u32(res0, res1)); } -//////////////////////////////////////////////////////////////////////////////// - -#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) +#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)) -static INLINE void sad16_neon(const uint8_t *ref_ptr, const uint8x16_t src_ptr, - uint32x4_t *const sum) { - const uint8x16_t r = vld1q_u8(ref_ptr); - const uint8x16_t diff = vabdq_u8(src_ptr, r); - *sum = vdotq_u32(*sum, diff, vdupq_n_u8(1)); +static INLINE void sad16_neon(uint8x16_t src, uint8x16_t ref, + uint16x8_t *const sad_sum) { + uint8x16_t abs_diff = vabdq_u8(src, ref); + *sad_sum = vpadalq_u8(*sad_sum, abs_diff); } -static INLINE void sad16x_4d(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4], const int height) { - int i; - uint32x4_t r0, r1; - const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], - ref_array[3] }; - uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), - vdupq_n_u32(0) }; - - for (i = 0; i < height; ++i) { - const uint8x16_t s = vld1q_u8(src_ptr + i * src_stride); - sad16_neon(ref_loop[0] + i * ref_stride, s, &sum[0]); - sad16_neon(ref_loop[1] + i * ref_stride, s, &sum[1]); - sad16_neon(ref_loop[2] + i * ref_stride, s, &sum[2]); - sad16_neon(ref_loop[3] + i * ref_stride, s, &sum[3]); - } - - r0 = vpaddq_u32(sum[0], sum[1]); - r1 = vpaddq_u32(sum[2], sum[3]); - vst1q_u32(sad_array, vpaddq_u32(r0, r1)); +static INLINE void sad64xhx4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t res[4], int h) { + int h_tmp = h > 64 ? 64 : h; + int i = 0; + vst1q_u32(res, vdupq_n_u32(0)); + + do { + uint16x8_t sum_lo[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + uint16x8_t sum_hi[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + + do { + uint8x16_t s0, s1, s2, s3; + + s0 = vld1q_u8(src + i * src_stride); + sad16_neon(s0, vld1q_u8(ref[0] + i * ref_stride), &sum_lo[0]); + sad16_neon(s0, vld1q_u8(ref[1] + i * ref_stride), &sum_lo[1]); + sad16_neon(s0, vld1q_u8(ref[2] + i * ref_stride), &sum_lo[2]); + sad16_neon(s0, vld1q_u8(ref[3] + i * ref_stride), &sum_lo[3]); + + s1 = vld1q_u8(src + i * src_stride + 16); + sad16_neon(s1, vld1q_u8(ref[0] + i * ref_stride + 16), &sum_hi[0]); + sad16_neon(s1, vld1q_u8(ref[1] + i * ref_stride + 16), &sum_hi[1]); + sad16_neon(s1, vld1q_u8(ref[2] + i * ref_stride + 16), &sum_hi[2]); + sad16_neon(s1, vld1q_u8(ref[3] + i * ref_stride + 16), &sum_hi[3]); + + s2 = vld1q_u8(src + i * src_stride + 32); + sad16_neon(s2, vld1q_u8(ref[0] + i * ref_stride + 32), &sum_lo[0]); + sad16_neon(s2, vld1q_u8(ref[1] + i * ref_stride + 32), &sum_lo[1]); + sad16_neon(s2, vld1q_u8(ref[2] + i * ref_stride + 32), &sum_lo[2]); + sad16_neon(s2, vld1q_u8(ref[3] + i * ref_stride + 32), &sum_lo[3]); + + s3 = vld1q_u8(src + i * src_stride + 48); + sad16_neon(s3, vld1q_u8(ref[0] + i * ref_stride + 48), &sum_hi[0]); + sad16_neon(s3, vld1q_u8(ref[1] + i * ref_stride + 48), &sum_hi[1]); + sad16_neon(s3, vld1q_u8(ref[2] + i * ref_stride + 48), &sum_hi[2]); + sad16_neon(s3, vld1q_u8(ref[3] + i * ref_stride + 48), &sum_hi[3]); + + i++; + } while (i < h_tmp); + + res[0] += horizontal_long_add_uint16x8(sum_lo[0], sum_hi[0]); + res[1] += horizontal_long_add_uint16x8(sum_lo[1], sum_hi[1]); + res[2] += horizontal_long_add_uint16x8(sum_lo[2], sum_hi[2]); + res[3] += horizontal_long_add_uint16x8(sum_lo[3], sum_hi[3]); + + h_tmp += 64; + } while (i < h); } -#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)) - -static INLINE void sad16_neon(const uint8_t *ref_ptr, const uint8x16_t src_ptr, - uint16x8_t *const sum) { - const uint8x16_t r = vld1q_u8(ref_ptr); - *sum = vabal_u8(*sum, vget_low_u8(src_ptr), vget_low_u8(r)); - *sum = vabal_u8(*sum, vget_high_u8(src_ptr), vget_high_u8(r)); +static INLINE void sad32xhx4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t res[4], int h) { + uint16x8_t sum_lo[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + uint16x8_t sum_hi[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + + int i = 0; + do { + uint8x16_t s0, s1; + + s0 = vld1q_u8(src + i * src_stride); + sad16_neon(s0, vld1q_u8(ref[0] + i * ref_stride), &sum_lo[0]); + sad16_neon(s0, vld1q_u8(ref[1] + i * ref_stride), &sum_lo[1]); + sad16_neon(s0, vld1q_u8(ref[2] + i * ref_stride), &sum_lo[2]); + sad16_neon(s0, vld1q_u8(ref[3] + i * ref_stride), &sum_lo[3]); + + s1 = vld1q_u8(src + i * src_stride + 16); + sad16_neon(s1, vld1q_u8(ref[0] + i * ref_stride + 16), &sum_hi[0]); + sad16_neon(s1, vld1q_u8(ref[1] + i * ref_stride + 16), &sum_hi[1]); + sad16_neon(s1, vld1q_u8(ref[2] + i * ref_stride + 16), &sum_hi[2]); + sad16_neon(s1, vld1q_u8(ref[3] + i * ref_stride + 16), &sum_hi[3]); + + i++; + } while (i < h); + + res[0] = horizontal_long_add_uint16x8(sum_lo[0], sum_hi[0]); + res[1] = horizontal_long_add_uint16x8(sum_lo[1], sum_hi[1]); + res[2] = horizontal_long_add_uint16x8(sum_lo[2], sum_hi[2]); + res[3] = horizontal_long_add_uint16x8(sum_lo[3], sum_hi[3]); } -static INLINE void sad16x_4d(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4], const int height) { - int i; - const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], - ref_array[3] }; +static INLINE void sad16xhx4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t res[4], int h) { uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) }; - for (i = 0; i < height; ++i) { - const uint8x16_t s = vld1q_u8(src_ptr); - src_ptr += src_stride; - /* Manual unrolling here stops the compiler from getting confused. */ - sad16_neon(ref_loop[0], s, &sum[0]); - ref_loop[0] += ref_stride; - sad16_neon(ref_loop[1], s, &sum[1]); - ref_loop[1] += ref_stride; - sad16_neon(ref_loop[2], s, &sum[2]); - ref_loop[2] += ref_stride; - sad16_neon(ref_loop[3], s, &sum[3]); - ref_loop[3] += ref_stride; - } - - sad_512_pel_final_neon(sum, sad_array); + int i = 0; + do { + const uint8x16_t s = vld1q_u8(src + i * src_stride); + sad16_neon(s, vld1q_u8(ref[0] + i * ref_stride), &sum[0]); + sad16_neon(s, vld1q_u8(ref[1] + i * ref_stride), &sum[1]); + sad16_neon(s, vld1q_u8(ref[2] + i * ref_stride), &sum[2]); + sad16_neon(s, vld1q_u8(ref[3] + i * ref_stride), &sum[3]); + + i++; + } while (i < h); + + res[0] = horizontal_add_uint16x8(sum[0]); + res[1] = horizontal_add_uint16x8(sum[1]); + res[2] = horizontal_add_uint16x8(sum[2]); + res[3] = horizontal_add_uint16x8(sum[3]); } #endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) -void vpx_sad16x8x4d_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { - sad16x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 8); -} - -void vpx_sad16x16x4d_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { - sad16x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 16); -} - -void vpx_sad16x32x4d_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { - sad16x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 32); -} - -//////////////////////////////////////////////////////////////////////////////// - -#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) - -static INLINE void sad32x_4d(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4], const int height) { - int i; - uint32x4_t r0, r1; - const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], - ref_array[3] }; - - uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), - vdupq_n_u32(0) }; - - for (i = 0; i < height; ++i) { - uint8x16_t s; - - s = vld1q_u8(src_ptr + 0 * 16); - sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]); - sad16_neon(ref_loop[1] + 0 * 16, s, &sum[1]); - sad16_neon(ref_loop[2] + 0 * 16, s, &sum[2]); - sad16_neon(ref_loop[3] + 0 * 16, s, &sum[3]); - - s = vld1q_u8(src_ptr + 1 * 16); - sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]); - sad16_neon(ref_loop[1] + 1 * 16, s, &sum[1]); - sad16_neon(ref_loop[2] + 1 * 16, s, &sum[2]); - sad16_neon(ref_loop[3] + 1 * 16, s, &sum[3]); - - src_ptr += src_stride; - ref_loop[0] += ref_stride; - ref_loop[1] += ref_stride; - ref_loop[2] += ref_stride; - ref_loop[3] += ref_stride; - } - - r0 = vpaddq_u32(sum[0], sum[1]); - r1 = vpaddq_u32(sum[2], sum[3]); - vst1q_u32(sad_array, vpaddq_u32(r0, r1)); -} - -void vpx_sad32x16x4d_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { - sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 16); -} - -void vpx_sad32x32x4d_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { - sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 32); -} - -void vpx_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { - sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 64); -} - -#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)) - -static INLINE void sad32x_4d(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - const int height, uint16x8_t *const sum) { - int i; - const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], - ref_array[3] }; - - sum[0] = sum[1] = sum[2] = sum[3] = vdupq_n_u16(0); - - for (i = 0; i < height; ++i) { - uint8x16_t s; - - s = vld1q_u8(src_ptr + 0 * 16); - sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]); - sad16_neon(ref_loop[1] + 0 * 16, s, &sum[1]); - sad16_neon(ref_loop[2] + 0 * 16, s, &sum[2]); - sad16_neon(ref_loop[3] + 0 * 16, s, &sum[3]); - - s = vld1q_u8(src_ptr + 1 * 16); - sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]); - sad16_neon(ref_loop[1] + 1 * 16, s, &sum[1]); - sad16_neon(ref_loop[2] + 1 * 16, s, &sum[2]); - sad16_neon(ref_loop[3] + 1 * 16, s, &sum[3]); - - src_ptr += src_stride; - ref_loop[0] += ref_stride; - ref_loop[1] += ref_stride; - ref_loop[2] += ref_stride; - ref_loop[3] += ref_stride; - } +static INLINE void sad8_neon(uint8x8_t src, uint8x8_t ref, + uint16x8_t *const sad_sum) { + uint8x8_t abs_diff = vabd_u8(src, ref); + *sad_sum = vaddw_u8(*sad_sum, abs_diff); } -void vpx_sad32x16x4d_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { - uint16x8_t sum[4]; - sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, 16, sum); - sad_512_pel_final_neon(sum, sad_array); -} - -void vpx_sad32x32x4d_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { - uint16x8_t sum[4]; - sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, 32, sum); - sad_1024_pel_final_neon(sum, sad_array); -} +static INLINE void sad8xhx4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t res[4], int h) { + uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; -void vpx_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { - uint16x8_t sum[4]; - sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, 64, sum); - sad_2048_pel_final_neon(sum, sad_array); + int i = 0; + do { + const uint8x8_t s = vld1_u8(src + i * src_stride); + sad8_neon(s, vld1_u8(ref[0] + i * ref_stride), &sum[0]); + sad8_neon(s, vld1_u8(ref[1] + i * ref_stride), &sum[1]); + sad8_neon(s, vld1_u8(ref[2] + i * ref_stride), &sum[2]); + sad8_neon(s, vld1_u8(ref[3] + i * ref_stride), &sum[3]); + + i++; + } while (i < h); + + res[0] = horizontal_add_uint16x8(sum[0]); + res[1] = horizontal_add_uint16x8(sum[1]); + res[2] = horizontal_add_uint16x8(sum[2]); + res[3] = horizontal_add_uint16x8(sum[3]); } -#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) - -//////////////////////////////////////////////////////////////////////////////// - -#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) - -void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { - int i; - uint32x4_t r0, r1; - const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], - ref_array[3] }; - uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), - vdupq_n_u32(0) }; - - for (i = 0; i < 32; ++i) { - uint8x16_t s; - - s = vld1q_u8(src_ptr + 0 * 16); - sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]); - sad16_neon(ref_loop[1] + 0 * 16, s, &sum[1]); - sad16_neon(ref_loop[2] + 0 * 16, s, &sum[2]); - sad16_neon(ref_loop[3] + 0 * 16, s, &sum[3]); - - s = vld1q_u8(src_ptr + 1 * 16); - sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]); - sad16_neon(ref_loop[1] + 1 * 16, s, &sum[1]); - sad16_neon(ref_loop[2] + 1 * 16, s, &sum[2]); - sad16_neon(ref_loop[3] + 1 * 16, s, &sum[3]); - - s = vld1q_u8(src_ptr + 2 * 16); - sad16_neon(ref_loop[0] + 2 * 16, s, &sum[0]); - sad16_neon(ref_loop[1] + 2 * 16, s, &sum[1]); - sad16_neon(ref_loop[2] + 2 * 16, s, &sum[2]); - sad16_neon(ref_loop[3] + 2 * 16, s, &sum[3]); - - s = vld1q_u8(src_ptr + 3 * 16); - sad16_neon(ref_loop[0] + 3 * 16, s, &sum[0]); - sad16_neon(ref_loop[1] + 3 * 16, s, &sum[1]); - sad16_neon(ref_loop[2] + 3 * 16, s, &sum[2]); - sad16_neon(ref_loop[3] + 3 * 16, s, &sum[3]); - - src_ptr += src_stride; - ref_loop[0] += ref_stride; - ref_loop[1] += ref_stride; - ref_loop[2] += ref_stride; - ref_loop[3] += ref_stride; - } +static INLINE void sad4xhx4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t res[4], int h) { + uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; - r0 = vpaddq_u32(sum[0], sum[1]); - r1 = vpaddq_u32(sum[2], sum[3]); - vst1q_u32(sad_array, vpaddq_u32(r0, r1)); + int i = 0; + do { + uint32x2_t s, r0, r1, r2, r3; + uint32_t s_lo, s_hi, r0_lo, r0_hi, r1_lo, r1_hi, r2_lo, r2_hi, r3_lo, r3_hi; + + memcpy(&s_lo, src + i * src_stride, 4); + memcpy(&r0_lo, ref[0] + i * ref_stride, 4); + memcpy(&r1_lo, ref[1] + i * ref_stride, 4); + memcpy(&r2_lo, ref[2] + i * ref_stride, 4); + memcpy(&r3_lo, ref[3] + i * ref_stride, 4); + s = vdup_n_u32(s_lo); + r0 = vdup_n_u32(r0_lo); + r1 = vdup_n_u32(r1_lo); + r2 = vdup_n_u32(r2_lo); + r3 = vdup_n_u32(r3_lo); + + memcpy(&s_hi, src + (i + 1) * src_stride, 4); + memcpy(&r0_hi, ref[0] + (i + 1) * ref_stride, 4); + memcpy(&r1_hi, ref[1] + (i + 1) * ref_stride, 4); + memcpy(&r2_hi, ref[2] + (i + 1) * ref_stride, 4); + memcpy(&r3_hi, ref[3] + (i + 1) * ref_stride, 4); + s = vset_lane_u32(s_hi, s, 1); + r0 = vset_lane_u32(r0_hi, r0, 1); + r1 = vset_lane_u32(r1_hi, r1, 1); + r2 = vset_lane_u32(r2_hi, r2, 1); + r3 = vset_lane_u32(r3_hi, r3, 1); + + sad8_neon(vreinterpret_u8_u32(s), vreinterpret_u8_u32(r0), &sum[0]); + sad8_neon(vreinterpret_u8_u32(s), vreinterpret_u8_u32(r1), &sum[1]); + sad8_neon(vreinterpret_u8_u32(s), vreinterpret_u8_u32(r2), &sum[2]); + sad8_neon(vreinterpret_u8_u32(s), vreinterpret_u8_u32(r3), &sum[3]); + + i += 2; + } while (i < h); + + res[0] = horizontal_add_uint16x8(sum[0]); + res[1] = horizontal_add_uint16x8(sum[1]); + res[2] = horizontal_add_uint16x8(sum[2]); + res[3] = horizontal_add_uint16x8(sum[3]); } -void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { - int i; - uint32x4_t r0, r1, r2, r3; - const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], - ref_array[3] }; - uint32x4_t sum[8] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), - vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), - vdupq_n_u32(0), vdupq_n_u32(0) }; - - for (i = 0; i < 64; ++i) { - uint8x16_t s; - - s = vld1q_u8(src_ptr + 0 * 16); - sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]); - sad16_neon(ref_loop[1] + 0 * 16, s, &sum[2]); - sad16_neon(ref_loop[2] + 0 * 16, s, &sum[4]); - sad16_neon(ref_loop[3] + 0 * 16, s, &sum[6]); - - s = vld1q_u8(src_ptr + 1 * 16); - sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]); - sad16_neon(ref_loop[1] + 1 * 16, s, &sum[2]); - sad16_neon(ref_loop[2] + 1 * 16, s, &sum[4]); - sad16_neon(ref_loop[3] + 1 * 16, s, &sum[6]); - - s = vld1q_u8(src_ptr + 2 * 16); - sad16_neon(ref_loop[0] + 2 * 16, s, &sum[1]); - sad16_neon(ref_loop[1] + 2 * 16, s, &sum[3]); - sad16_neon(ref_loop[2] + 2 * 16, s, &sum[5]); - sad16_neon(ref_loop[3] + 2 * 16, s, &sum[7]); - - s = vld1q_u8(src_ptr + 3 * 16); - sad16_neon(ref_loop[0] + 3 * 16, s, &sum[1]); - sad16_neon(ref_loop[1] + 3 * 16, s, &sum[3]); - sad16_neon(ref_loop[2] + 3 * 16, s, &sum[5]); - sad16_neon(ref_loop[3] + 3 * 16, s, &sum[7]); - - src_ptr += src_stride; - ref_loop[0] += ref_stride; - ref_loop[1] += ref_stride; - ref_loop[2] += ref_stride; - ref_loop[3] += ref_stride; +#define SAD_WXH_4D_NEON(w, h) \ + void vpx_sad##w##x##h##x4d_neon(const uint8_t *src, int src_stride, \ + const uint8_t *const ref[4], int ref_stride, \ + uint32_t res[4]) { \ + sad##w##xhx4d_neon(src, src_stride, ref, ref_stride, res, (h)); \ } - r0 = vpaddq_u32(sum[0], sum[1]); - r1 = vpaddq_u32(sum[2], sum[3]); - r2 = vpaddq_u32(sum[4], sum[5]); - r3 = vpaddq_u32(sum[6], sum[7]); - r0 = vpaddq_u32(r0, r1); - r1 = vpaddq_u32(r2, r3); - vst1q_u32(sad_array, vpaddq_u32(r0, r1)); -} +SAD_WXH_4D_NEON(4, 4) +SAD_WXH_4D_NEON(4, 8) -#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)) +SAD_WXH_4D_NEON(8, 4) +SAD_WXH_4D_NEON(8, 8) +SAD_WXH_4D_NEON(8, 16) -void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { - int i; - const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], - ref_array[3] }; - uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), - vdupq_n_u16(0) }; +SAD_WXH_4D_NEON(16, 8) +SAD_WXH_4D_NEON(16, 16) +SAD_WXH_4D_NEON(16, 32) - for (i = 0; i < 32; ++i) { - uint8x16_t s; - - s = vld1q_u8(src_ptr + 0 * 16); - sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]); - sad16_neon(ref_loop[1] + 0 * 16, s, &sum[1]); - sad16_neon(ref_loop[2] + 0 * 16, s, &sum[2]); - sad16_neon(ref_loop[3] + 0 * 16, s, &sum[3]); - - s = vld1q_u8(src_ptr + 1 * 16); - sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]); - sad16_neon(ref_loop[1] + 1 * 16, s, &sum[1]); - sad16_neon(ref_loop[2] + 1 * 16, s, &sum[2]); - sad16_neon(ref_loop[3] + 1 * 16, s, &sum[3]); - - s = vld1q_u8(src_ptr + 2 * 16); - sad16_neon(ref_loop[0] + 2 * 16, s, &sum[0]); - sad16_neon(ref_loop[1] + 2 * 16, s, &sum[1]); - sad16_neon(ref_loop[2] + 2 * 16, s, &sum[2]); - sad16_neon(ref_loop[3] + 2 * 16, s, &sum[3]); - - s = vld1q_u8(src_ptr + 3 * 16); - sad16_neon(ref_loop[0] + 3 * 16, s, &sum[0]); - sad16_neon(ref_loop[1] + 3 * 16, s, &sum[1]); - sad16_neon(ref_loop[2] + 3 * 16, s, &sum[2]); - sad16_neon(ref_loop[3] + 3 * 16, s, &sum[3]); - - src_ptr += src_stride; - ref_loop[0] += ref_stride; - ref_loop[1] += ref_stride; - ref_loop[2] += ref_stride; - ref_loop[3] += ref_stride; - } +SAD_WXH_4D_NEON(32, 16) +SAD_WXH_4D_NEON(32, 32) +SAD_WXH_4D_NEON(32, 64) - sad_2048_pel_final_neon(sum, sad_array); -} +SAD_WXH_4D_NEON(64, 32) +SAD_WXH_4D_NEON(64, 64) -void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { - int i; - const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], - ref_array[3] }; - uint16x8_t sum[8] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), - vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), - vdupq_n_u16(0), vdupq_n_u16(0) }; - - for (i = 0; i < 64; ++i) { - uint8x16_t s; - - s = vld1q_u8(src_ptr + 0 * 16); - sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]); - sad16_neon(ref_loop[1] + 0 * 16, s, &sum[2]); - sad16_neon(ref_loop[2] + 0 * 16, s, &sum[4]); - sad16_neon(ref_loop[3] + 0 * 16, s, &sum[6]); - - s = vld1q_u8(src_ptr + 1 * 16); - sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]); - sad16_neon(ref_loop[1] + 1 * 16, s, &sum[2]); - sad16_neon(ref_loop[2] + 1 * 16, s, &sum[4]); - sad16_neon(ref_loop[3] + 1 * 16, s, &sum[6]); - - s = vld1q_u8(src_ptr + 2 * 16); - sad16_neon(ref_loop[0] + 2 * 16, s, &sum[1]); - sad16_neon(ref_loop[1] + 2 * 16, s, &sum[3]); - sad16_neon(ref_loop[2] + 2 * 16, s, &sum[5]); - sad16_neon(ref_loop[3] + 2 * 16, s, &sum[7]); - - s = vld1q_u8(src_ptr + 3 * 16); - sad16_neon(ref_loop[0] + 3 * 16, s, &sum[1]); - sad16_neon(ref_loop[1] + 3 * 16, s, &sum[3]); - sad16_neon(ref_loop[2] + 3 * 16, s, &sum[5]); - sad16_neon(ref_loop[3] + 3 * 16, s, &sum[7]); - - src_ptr += src_stride; - ref_loop[0] += ref_stride; - ref_loop[1] += ref_stride; - ref_loop[2] += ref_stride; - ref_loop[3] += ref_stride; - } - - sad_4096_pel_final_neon(sum, sad_array); -} - -#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) +#undef SAD_WXH_4D_NEON diff --git a/vpx_dsp/arm/sum_neon.h b/vpx_dsp/arm/sum_neon.h index 9a7c424e8e..5f20f9d99a 100644 --- a/vpx_dsp/arm/sum_neon.h +++ b/vpx_dsp/arm/sum_neon.h @@ -40,6 +40,23 @@ static INLINE uint32_t horizontal_add_uint16x8(const uint16x8_t a) { #endif } +static INLINE uint32_t horizontal_long_add_uint16x8(const uint16x8_t vec_lo, + const uint16x8_t vec_hi) { +#if defined(__aarch64__) + return vaddlvq_u16(vec_lo) + vaddlvq_u16(vec_hi); +#else + const uint32x4_t vec_l_lo = + vaddl_u16(vget_low_u16(vec_lo), vget_high_u16(vec_lo)); + const uint32x4_t vec_l_hi = + vaddl_u16(vget_low_u16(vec_hi), vget_high_u16(vec_hi)); + const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi); + const uint64x2_t b = vpaddlq_u32(a); + const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), + vreinterpret_u32_u64(vget_high_u64(b))); + return vget_lane_u32(c, 0); +#endif +} + static INLINE int32_t horizontal_add_int32x2(const int32x2_t a) { #if defined(__aarch64__) return vaddv_s32(a); From a94cdd57ffd95ee7beb48d2794dae538f25da46c Mon Sep 17 00:00:00 2001 From: Cheng Chen Date: Mon, 30 Jan 2023 11:51:58 -0800 Subject: [PATCH 0582/1000] Fix unsigned integer overflow in sse computation Basically port the fix from libaom: https://aomedia-review.googlesource.com/c/aom/+/169361 Change-Id: Id06a5db91372037832399200ded75d514e096726 --- vpx_dsp/psnr.c | 67 ++++++++++++++++++-------------------------------- 1 file changed, 24 insertions(+), 43 deletions(-) diff --git a/vpx_dsp/psnr.c b/vpx_dsp/psnr.c index 48bac04508..f0d4e927ae 100644 --- a/vpx_dsp/psnr.c +++ b/vpx_dsp/psnr.c @@ -26,57 +26,44 @@ double vpx_sse_to_psnr(double samples, double peak, double sse) { /* TODO(yaowu): The block_variance calls the unoptimized versions of variance() * and highbd_8_variance(). It should not. */ -static void encoder_variance(const uint8_t *a, int a_stride, const uint8_t *b, - int b_stride, int w, int h, unsigned int *sse, - int *sum) { +static int64_t encoder_sse(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int w, int h) { int i, j; - - *sum = 0; - *sse = 0; + int64_t sse = 0; for (i = 0; i < h; i++) { for (j = 0; j < w; j++) { const int diff = a[j] - b[j]; - *sum += diff; - *sse += diff * diff; + sse += diff * diff; } a += a_stride; b += b_stride; } + + return sse; } #if CONFIG_VP9_HIGHBITDEPTH -static void encoder_highbd_variance64(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, int w, - int h, uint64_t *sse, int64_t *sum) { +static int64_t encoder_highbd_8_sse(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, int w, + int h) { int i, j; + int64_t sse = 0; uint16_t *a = CONVERT_TO_SHORTPTR(a8); uint16_t *b = CONVERT_TO_SHORTPTR(b8); - *sum = 0; - *sse = 0; for (i = 0; i < h; i++) { for (j = 0; j < w; j++) { const int diff = a[j] - b[j]; - *sum += diff; - *sse += diff * diff; + sse += diff * diff; } a += a_stride; b += b_stride; } -} -static void encoder_highbd_8_variance(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, int w, - int h, unsigned int *sse, int *sum) { - uint64_t sse_long = 0; - int64_t sum_long = 0; - encoder_highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, - &sum_long); - *sse = (unsigned int)sse_long; - *sum = (int)sum_long; + return sse; } #endif // CONFIG_VP9_HIGHBITDEPTH @@ -85,26 +72,23 @@ static int64_t get_sse(const uint8_t *a, int a_stride, const uint8_t *b, const int dw = width % 16; const int dh = height % 16; int64_t total_sse = 0; - unsigned int sse = 0; - int sum = 0; int x, y; if (dw > 0) { - encoder_variance(&a[width - dw], a_stride, &b[width - dw], b_stride, dw, - height, &sse, &sum); - total_sse += sse; + total_sse += encoder_sse(&a[width - dw], a_stride, &b[width - dw], b_stride, + dw, height); } if (dh > 0) { - encoder_variance(&a[(height - dh) * a_stride], a_stride, - &b[(height - dh) * b_stride], b_stride, width - dw, dh, - &sse, &sum); - total_sse += sse; + total_sse += + encoder_sse(&a[(height - dh) * a_stride], a_stride, + &b[(height - dh) * b_stride], b_stride, width - dw, dh); } for (y = 0; y < height / 16; ++y) { const uint8_t *pa = a; const uint8_t *pb = b; + unsigned int sse; for (x = 0; x < width / 16; ++x) { vpx_mse16x16(pa, a_stride, pb, b_stride, &sse); total_sse += sse; @@ -146,22 +130,19 @@ static int64_t highbd_get_sse(const uint8_t *a, int a_stride, const uint8_t *b, int x, y; const int dw = width % 16; const int dh = height % 16; - unsigned int sse = 0; - int sum = 0; if (dw > 0) { - encoder_highbd_8_variance(&a[width - dw], a_stride, &b[width - dw], - b_stride, dw, height, &sse, &sum); - total_sse += sse; + total_sse += encoder_highbd_8_sse(&a[width - dw], a_stride, &b[width - dw], + b_stride, dw, height); } if (dh > 0) { - encoder_highbd_8_variance(&a[(height - dh) * a_stride], a_stride, - &b[(height - dh) * b_stride], b_stride, - width - dw, dh, &sse, &sum); - total_sse += sse; + total_sse += encoder_highbd_8_sse(&a[(height - dh) * a_stride], a_stride, + &b[(height - dh) * b_stride], b_stride, + width - dw, dh); } for (y = 0; y < height / 16; ++y) { const uint8_t *pa = a; const uint8_t *pb = b; + unsigned int sse; for (x = 0; x < width / 16; ++x) { vpx_highbd_8_mse16x16(pa, a_stride, pb, b_stride, &sse); total_sse += sse; From 472c839c9f6e88e976faebdc05848e64e7f3945d Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Tue, 31 Jan 2023 13:32:33 +0000 Subject: [PATCH 0583/1000] Use load_unaligned mem_neon.h helpers in SAD and SAD4D Use the load_unaligned helper functions in mem_neon.h to load strided sequences of 4 bytes where alignment is not guaranteed in the Neon SAD and SAD4D paths. Change-Id: I941d226ef94fd7a633b09fc92165a00ba68a1501 --- vpx_dsp/arm/sad4d_neon.c | 39 +++++++++----------------------- vpx_dsp/arm/sad_neon.c | 48 ++++++++++------------------------------ 2 files changed, 22 insertions(+), 65 deletions(-) diff --git a/vpx_dsp/arm/sad4d_neon.c b/vpx_dsp/arm/sad4d_neon.c index 5064770ee6..85f6c1e5b1 100644 --- a/vpx_dsp/arm/sad4d_neon.c +++ b/vpx_dsp/arm/sad4d_neon.c @@ -285,35 +285,16 @@ static INLINE void sad4xhx4d_neon(const uint8_t *src, int src_stride, int i = 0; do { - uint32x2_t s, r0, r1, r2, r3; - uint32_t s_lo, s_hi, r0_lo, r0_hi, r1_lo, r1_hi, r2_lo, r2_hi, r3_lo, r3_hi; - - memcpy(&s_lo, src + i * src_stride, 4); - memcpy(&r0_lo, ref[0] + i * ref_stride, 4); - memcpy(&r1_lo, ref[1] + i * ref_stride, 4); - memcpy(&r2_lo, ref[2] + i * ref_stride, 4); - memcpy(&r3_lo, ref[3] + i * ref_stride, 4); - s = vdup_n_u32(s_lo); - r0 = vdup_n_u32(r0_lo); - r1 = vdup_n_u32(r1_lo); - r2 = vdup_n_u32(r2_lo); - r3 = vdup_n_u32(r3_lo); - - memcpy(&s_hi, src + (i + 1) * src_stride, 4); - memcpy(&r0_hi, ref[0] + (i + 1) * ref_stride, 4); - memcpy(&r1_hi, ref[1] + (i + 1) * ref_stride, 4); - memcpy(&r2_hi, ref[2] + (i + 1) * ref_stride, 4); - memcpy(&r3_hi, ref[3] + (i + 1) * ref_stride, 4); - s = vset_lane_u32(s_hi, s, 1); - r0 = vset_lane_u32(r0_hi, r0, 1); - r1 = vset_lane_u32(r1_hi, r1, 1); - r2 = vset_lane_u32(r2_hi, r2, 1); - r3 = vset_lane_u32(r3_hi, r3, 1); - - sad8_neon(vreinterpret_u8_u32(s), vreinterpret_u8_u32(r0), &sum[0]); - sad8_neon(vreinterpret_u8_u32(s), vreinterpret_u8_u32(r1), &sum[1]); - sad8_neon(vreinterpret_u8_u32(s), vreinterpret_u8_u32(r2), &sum[2]); - sad8_neon(vreinterpret_u8_u32(s), vreinterpret_u8_u32(r3), &sum[3]); + uint8x8_t s = load_unaligned_u8(src + i * src_stride, src_stride); + uint8x8_t r0 = load_unaligned_u8(ref[0] + i * ref_stride, ref_stride); + uint8x8_t r1 = load_unaligned_u8(ref[1] + i * ref_stride, ref_stride); + uint8x8_t r2 = load_unaligned_u8(ref[2] + i * ref_stride, ref_stride); + uint8x8_t r3 = load_unaligned_u8(ref[3] + i * ref_stride, ref_stride); + + sad8_neon(s, r0, &sum[0]); + sad8_neon(s, r1, &sum[1]); + sad8_neon(s, r2, &sum[2]); + sad8_neon(s, r3, &sum[3]); i += 2; } while (i < h); diff --git a/vpx_dsp/arm/sad_neon.c b/vpx_dsp/arm/sad_neon.c index 7336edb694..9382b80626 100644 --- a/vpx_dsp/arm/sad_neon.c +++ b/vpx_dsp/arm/sad_neon.c @@ -214,24 +214,13 @@ static INLINE unsigned int sad4xh_neon(const uint8_t *src_ptr, int src_stride, int i = h / 2; do { - uint32x2_t s, r; - uint32_t s0, s1, r0, r1; + uint8x8_t s = load_unaligned_u8(src_ptr, src_stride); + uint8x8_t r = load_unaligned_u8(ref_ptr, ref_stride); - memcpy(&s0, src_ptr, 4); - memcpy(&r0, ref_ptr, 4); - s = vdup_n_u32(s0); - r = vdup_n_u32(r0); - src_ptr += src_stride; - ref_ptr += ref_stride; - - memcpy(&s1, src_ptr, 4); - memcpy(&r1, ref_ptr, 4); - s = vset_lane_u32(s1, s, 1); - r = vset_lane_u32(r1, r, 1); - src_ptr += src_stride; - ref_ptr += ref_stride; + sum = vabal_u8(sum, s, r); - sum = vabal_u8(sum, vreinterpret_u8_u32(s), vreinterpret_u8_u32(r)); + src_ptr += 2 * src_stride; + ref_ptr += 2 * ref_stride; } while (--i != 0); return horizontal_add_uint16x8(sum); @@ -509,28 +498,15 @@ static INLINE unsigned int sad4xh_avg_neon(const uint8_t *src_ptr, int i = h / 2; do { - uint32x2_t s, r; - uint32_t s0, s1, r0, r1; - uint8x8_t p, avg; - - memcpy(&s0, src_ptr, 4); - memcpy(&r0, ref_ptr, 4); - s = vdup_n_u32(s0); - r = vdup_n_u32(r0); - src_ptr += src_stride; - ref_ptr += ref_stride; - - memcpy(&s1, src_ptr, 4); - memcpy(&r1, ref_ptr, 4); - s = vset_lane_u32(s1, s, 1); - r = vset_lane_u32(r1, r, 1); - src_ptr += src_stride; - ref_ptr += ref_stride; + uint8x8_t s = load_unaligned_u8(src_ptr, src_stride); + uint8x8_t r = load_unaligned_u8(ref_ptr, ref_stride); + uint8x8_t p = vld1_u8(second_pred); - p = vld1_u8(second_pred); - avg = vrhadd_u8(vreinterpret_u8_u32(r), p); + uint8x8_t avg = vrhadd_u8(r, p); + sum = vabal_u8(sum, s, avg); - sum = vabal_u8(sum, vreinterpret_u8_u32(s), avg); + src_ptr += 2 * src_stride; + ref_ptr += 2 * ref_stride; second_pred += 8; } while (--i != 0); From 3f109f786a91a40bd6feb87ce133f351afeff63b Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Tue, 31 Jan 2023 12:16:38 -0500 Subject: [PATCH 0584/1000] Update CHANGELOG Bug: webm:1780 Change-Id: I3ab4729bff1d27ef7127ef26e780a469e9278c21 --- CHANGELOG | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/CHANGELOG b/CHANGELOG index 4f5dcbd44e..3fb2d19bbe 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,3 +1,39 @@ +2023-01-31 v1.13.0 "Ugly Duckling" + This release includes more Neon and AVX2 optimizations, adds a new codec + control to set per frame QP, upgrades GoogleTest to v1.12.1, and includes + numerous bug fixes. + + - Upgrading: + This release is ABI incompatible with the previous release. + + New codec control VP9E_SET_QUANTIZER_ONE_PASS to set per frame QP. + + GoogleTest is upgraded to v1.12.1. + + .clang-format is upgraded to clang-format-11. + + VPX_EXT_RATECTRL_ABI_VERSION was bumped due to incompatible changes to the + feature of using external rate control models for vp9. + + - Enhancement: + Numerous improvements on Neon optimizations. + Numerous improvements on AVX2 optimizations. + Additional ARM targets added for Visual Studio. + + - Bug fixes: + Fix to calculating internal stats when frame dropped. + Fix to segfault for external resize test in vp9. + Fix to build system with replacing egrep with grep -E. + Fix to a few bugs with external RTC rate control library. + Fix to make SVC work with VBR. + Fix to key frame setting in VP9 external RC. + Fix to -Wimplicit-int (Clang 16). + Fix to VP8 external RC for buffer levels. + Fix to VP8 external RC for dynamic update of layers. + Fix to VP9 auto level. + Fix to off-by-one error of max w/h in validate_config. + Fix to make SVC work for Profile 1. + 2022-06-17 v1.12.0 "Torrent Duck" This release adds optimizations for Loongarch, adds support for vp8 in the real-time rate control library, upgrades GoogleTest to v1.11.0, updates From aa5b62236a4d40679372c28ce37b95f965260aec Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Wed, 25 Jan 2023 19:25:12 -0500 Subject: [PATCH 0585/1000] Fix per frame qp for temporal layers Also add tests with fixed temporal layering mode. Change-Id: If516fe94e3fb7f5a745821d1788bfe6cf90edaac (cherry picked from commit db69ce6aea278bee88668fd9cc2af2e544516fdb) --- test/vp9_datarate_test.cc | 66 +++++++++++++++++++++++++----- vp9/encoder/vp9_svc_layercontext.c | 4 ++ 2 files changed, 59 insertions(+), 11 deletions(-) diff --git a/test/vp9_datarate_test.cc b/test/vp9_datarate_test.cc index eccb001071..7e91807492 100644 --- a/test/vp9_datarate_test.cc +++ b/test/vp9_datarate_test.cc @@ -148,14 +148,16 @@ class DatarateTestVP9 : public ::libvpx_test::EncoderTest { if (video->frame() == 0) { encoder->Control(VP9E_SET_SVC, 1); } - vpx_svc_layer_id_t layer_id; - layer_id.spatial_layer_id = 0; - frame_flags_ = GetFrameFlags(video->frame(), cfg_.ts_number_layers); - layer_id.temporal_layer_id = - SetLayerId(video->frame(), cfg_.ts_number_layers); - layer_id.temporal_layer_id_per_spatial[0] = - SetLayerId(video->frame(), cfg_.ts_number_layers); - encoder->Control(VP9E_SET_SVC_LAYER_ID, &layer_id); + if (cfg_.temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS) { + vpx_svc_layer_id_t layer_id; + frame_flags_ = GetFrameFlags(video->frame(), cfg_.ts_number_layers); + layer_id.spatial_layer_id = 0; + layer_id.temporal_layer_id = + SetLayerId(video->frame(), cfg_.ts_number_layers); + layer_id.temporal_layer_id_per_spatial[0] = + SetLayerId(video->frame(), cfg_.ts_number_layers); + encoder->Control(VP9E_SET_SVC_LAYER_ID, &layer_id); + } } const vpx_rational_t tb = video->timebase(); timebase_ = static_cast(tb.num) / tb.den; @@ -830,25 +832,37 @@ class DatarateTestVP9FrameQp ::libvpx_test::Encoder *encoder) { set_cpu_used_ = 7; DatarateTestVP9::PreEncodeFrameHook(video, encoder); - ACMRandom rnd; - frame_qp_ = static_cast(rnd.RandRange(64)); + frame_qp_ = static_cast(rnd_.RandRange(64)); encoder->Control(VP9E_SET_QUANTIZER_ONE_PASS, frame_qp_); frame_++; } virtual void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) { int qp = 0; + vpx_svc_layer_id_t layer_id; if (frame_ >= total_frame_) return; encoder->Control(VP8E_GET_LAST_QUANTIZER_64, &qp); ASSERT_EQ(frame_qp_, qp); + encoder->Control(VP9E_GET_SVC_LAYER_ID, &layer_id); + temporal_layer_id_ = layer_id.temporal_layer_id; + } + + virtual void MismatchHook(const vpx_image_t * /*img1*/, + const vpx_image_t * /*img2*/) { + if (frame_ >= total_frame_) return; + ASSERT_TRUE(cfg_.temporal_layering_mode == + VP9E_TEMPORAL_LAYERING_MODE_0212 && + temporal_layer_id_ == 2); } protected: int total_frame_; private: + ACMRandom rnd_; int frame_qp_; int frame_; + int temporal_layer_id_; }; TEST_P(DatarateTestVP9FrameQp, VP9SetFrameQp) { @@ -868,7 +882,7 @@ TEST_P(DatarateTestVP9FrameQp, VP9SetFrameQp) { ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); } -TEST_P(DatarateTestVP9FrameQp, VP9SetFrameQp3TemporalLayers) { +TEST_P(DatarateTestVP9FrameQp, VP9SetFrameQp3TemporalLayersBypass) { cfg_.rc_buf_initial_sz = 500; cfg_.rc_buf_optimal_sz = 500; cfg_.rc_buf_sz = 1000; @@ -897,6 +911,36 @@ TEST_P(DatarateTestVP9FrameQp, VP9SetFrameQp3TemporalLayers) { ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); } +TEST_P(DatarateTestVP9FrameQp, VP9SetFrameQp3TemporalLayersFixedMode) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_dropframe_thresh = 0; + cfg_.rc_max_quantizer = 63; + cfg_.rc_min_quantizer = 0; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + + // 3 Temporal layers, no spatial layers: Framerate decimation (4, 2, 1). + cfg_.ss_number_layers = 1; + cfg_.ts_number_layers = 3; + cfg_.ts_rate_decimator[0] = 4; + cfg_.ts_rate_decimator[1] = 2; + cfg_.ts_rate_decimator[2] = 1; + + cfg_.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_0212; + cfg_.rc_target_bitrate = 200; + cfg_.g_error_resilient = 1; + total_frame_ = 400; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, total_frame_); + ResetModel(); + cfg_.layer_target_bitrate[0] = 40 * cfg_.rc_target_bitrate / 100; + cfg_.layer_target_bitrate[1] = 60 * cfg_.rc_target_bitrate / 100; + cfg_.layer_target_bitrate[2] = cfg_.rc_target_bitrate; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +} + #if CONFIG_VP9_TEMPORAL_DENOISING // Params: speed setting. class DatarateTestVP9RealTimeDenoiser : public DatarateTestVP9RealTime { diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c index 518c00b34a..7e9435fb5f 100644 --- a/vp9/encoder/vp9_svc_layercontext.c +++ b/vp9/encoder/vp9_svc_layercontext.c @@ -894,6 +894,10 @@ int vp9_one_pass_svc_start_layer(VP9_COMP *const cpi) { RATE_CONTROL *const lrc = &lc->rc; lrc->worst_quality = vp9_quantizer_to_qindex(lc->max_q); lrc->best_quality = vp9_quantizer_to_qindex(lc->min_q); + if (cpi->fixed_qp_onepass) { + lrc->worst_quality = cpi->rc.worst_quality; + lrc->best_quality = cpi->rc.best_quality; + } } if (cpi->oxcf.resize_mode == RESIZE_DYNAMIC && svc->single_layer_svc == 1 && From b5a2b3a92948a52f03719557a97d2c5e3617b0bb Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Wed, 1 Feb 2023 11:38:42 -0500 Subject: [PATCH 0586/1000] Update AUTHORS .mailmap and version Bug: webm:1780 Change-Id: I75a24bdd076dc1746b23bababfaafccbce3b4214 --- .mailmap | 1 + AUTHORS | 3 +++ libs.mk | 4 ++-- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/.mailmap b/.mailmap index 3593de4b9c..bb0ddd95b2 100644 --- a/.mailmap +++ b/.mailmap @@ -25,6 +25,7 @@ Johann Koenig Johann John Koleszar Joshua Litt +Konstantinos Margaritis Marco Paniconi Marco Paniconi Martin Storsjö diff --git a/AUTHORS b/AUTHORS index 536e0e7cf0..2db4a113e4 100644 --- a/AUTHORS +++ b/AUTHORS @@ -21,6 +21,7 @@ Andoni Morales Alastruey Andres Mejia Andrew Lewis Andrew Russell +Andrew Salkeld Angie Chen Angie Chiang Anton Venema @@ -175,7 +176,9 @@ Rob Bradford Ronald S. Bultje Rui Ueyama Sai Deng +Salome Thirot Sami Pietilä +Sam James Sarah Parker Sasi Inguva Scott Graham diff --git a/libs.mk b/libs.mk index fb6fbbeb20..1f7f03aa38 100644 --- a/libs.mk +++ b/libs.mk @@ -312,8 +312,8 @@ $(BUILD_PFX)libvpx_g.a: $(LIBVPX_OBJS) # To determine SO_VERSION_{MAJOR,MINOR,PATCH}, calculate c,a,r with current # SO_VERSION_* then follow the rules in the link to detemine the new version # (c1, a1, r1) and set MAJOR to [c1-a1], MINOR to a1 and PATCH to r1 -SO_VERSION_MAJOR := 7 -SO_VERSION_MINOR := 1 +SO_VERSION_MAJOR := 8 +SO_VERSION_MINOR := 0 SO_VERSION_PATCH := 0 ifeq ($(filter darwin%,$(TGT_OS)),$(TGT_OS)) LIBVPX_SO := libvpx.$(SO_VERSION_MAJOR).dylib From 858a8c611f4c965078485860a6820e2135e6611b Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 1 Feb 2023 13:27:06 -0800 Subject: [PATCH 0587/1000] vp9_diamond_search_sad_neon: use DECLARE_ALIGNED rather than the gcc specific __attribute__((aligned())); fixes build targeting ARM64 windows. Bug: webm:1788 Change-Id: I2210fc215f44d90c1ce9dee9b54888eb1b78c99e --- vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c b/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c index 33753f77b0..997775a668 100644 --- a/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c +++ b/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c @@ -220,7 +220,7 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x, // Look up the component cost of the residual motion vector { uint32_t cost[4]; - int16_t __attribute__((aligned(16))) rowcol[8]; + DECLARE_ALIGNED(16, int16_t, rowcol[8]); vst1q_s16(rowcol, v_diff_mv_w); // Note: This is a use case for gather instruction From d6382e4469e8864477139636207d0c056066e526 Mon Sep 17 00:00:00 2001 From: Yunqing Wang Date: Thu, 2 Feb 2023 16:30:09 -0800 Subject: [PATCH 0588/1000] Fix uninitialized mesh feature for BEST mode At BEST encoding mode, the mesh search range wasn't initialized for non FC_GRAPHICS_ANIMATION content type, which actually/mistakenly used speed 0's setting. Fixed it by adding the initialization. There were 2 ways to fix this. Patchset 1 set to use speed 0's setting for non FC_GRAPHICS_ANIMATION type. This didn't change BEST mode's encoding results much, and only a couple of clips' results were changed. Borg result for BEST mode: avg_psnr: ovr_psnr: ssim: encoding_spdup: lowres2: -0.004 -0.003 -0.000 0.030 midres2: -0.006 -0.009 -0.012 0.033 hdres2: 0.002 0.002 0.004 0.015 Patchset 2 set to use BEST's setting for non FC_GRAPHICS_ANIMATION type. However, the majority of test clips' BDrate got changed up to ~0.5% (gain or loss), and overall it didn't give better performance than patchset 1. So, we chose to use patchset 1. Change-Id: Ibbf578dad04420e6ba22cb9a3ddec137a7e4deef --- vp9/encoder/vp9_speed_features.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c index 0431d8a452..41a742c5a1 100644 --- a/vp9/encoder/vp9_speed_features.c +++ b/vp9/encoder/vp9_speed_features.c @@ -16,8 +16,11 @@ #include "vpx_dsp/vpx_dsp_common.h" // Mesh search patters for various speed settings -static MESH_PATTERN best_quality_mesh_pattern[MAX_MESH_STEP] = { - { 64, 4 }, { 28, 2 }, { 15, 1 }, { 7, 1 } +// Define 2 mesh density levels for FC_GRAPHICS_ANIMATION content type and non +// FC_GRAPHICS_ANIMATION content type. +static MESH_PATTERN best_quality_mesh_pattern[2][MAX_MESH_STEP] = { + { { 64, 4 }, { 28, 2 }, { 15, 1 }, { 7, 1 } }, + { { 64, 8 }, { 28, 4 }, { 15, 1 }, { 7, 1 } }, }; #if !CONFIG_REALTIME_ONLY @@ -991,10 +994,14 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi, int speed) { sf->exhaustive_searches_thresh = (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) ? (1 << 20) : INT_MAX; - if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) { + { + const int mesh_density_level = + (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) ? 0 : 1; for (i = 0; i < MAX_MESH_STEP; ++i) { - sf->mesh_patterns[i].range = best_quality_mesh_pattern[i].range; - sf->mesh_patterns[i].interval = best_quality_mesh_pattern[i].interval; + sf->mesh_patterns[i].range = + best_quality_mesh_pattern[mesh_density_level][i].range; + sf->mesh_patterns[i].interval = + best_quality_mesh_pattern[mesh_density_level][i].interval; } } From 18a3421b7df75468b2f3d6f69c8a1ae906dde3ae Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Fri, 3 Feb 2023 14:07:09 -0800 Subject: [PATCH 0589/1000] Set _img->bit_depth in y4m_input_fetch_frame() This is a port of https://aomedia-review.googlesource.com/c/aom/+/169961. Change-Id: I2aa0d12cafde0c73448bf8c57eab0cd92e846468 --- y4minput.c | 1 + 1 file changed, 1 insertion(+) diff --git a/y4minput.c b/y4minput.c index 745e2f1cd6..210ce52fce 100644 --- a/y4minput.c +++ b/y4minput.c @@ -1148,6 +1148,7 @@ int y4m_input_fetch_frame(y4m_input *_y4m, FILE *_fin, vpx_image_t *_img) { _img->fmt = _y4m->vpx_fmt; _img->w = _img->d_w = _y4m->pic_w; _img->h = _img->d_h = _y4m->pic_h; + _img->bit_depth = _y4m->bit_depth; _img->x_chroma_shift = _y4m->dst_c_dec_h >> 1; _img->y_chroma_shift = _y4m->dst_c_dec_v >> 1; _img->bps = _y4m->bps; From e3028ddbb408381601ab8d2c67be37124a9726e5 Mon Sep 17 00:00:00 2001 From: Salome Thirot Date: Wed, 1 Feb 2023 16:37:24 +0000 Subject: [PATCH 0590/1000] Optimize Neon implementation of high bitdepth SAD functions Optimizations take a similar form to those implemented for standard bitdepth SAD: - Use ABD, UADALP instead of ABAL, ABAL2 (double the throughput on modern out-of-order Arm-designed cores.) - Use more accumulator registers to make better use of Neon pipeline resources on Arm CPUs that have four Neon pipes. Change-Id: I9e626d7fa0e271908dc43448405a7985b80e6230 --- vpx_dsp/arm/highbd_sad_neon.c | 209 ++++++++++++++++++++++++---------- 1 file changed, 149 insertions(+), 60 deletions(-) diff --git a/vpx_dsp/arm/highbd_sad_neon.c b/vpx_dsp/arm/highbd_sad_neon.c index ecb52ce5a5..8415481f00 100644 --- a/vpx_dsp/arm/highbd_sad_neon.c +++ b/vpx_dsp/arm/highbd_sad_neon.c @@ -17,53 +17,169 @@ #include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/sum_neon.h" -static VPX_FORCE_INLINE uint32_t highbd_sad4_neon(const uint8_t *src_ptr, - int src_stride, - const uint8_t *ref_ptr, - int ref_stride, int width, - int height) { - int i, j; - uint32x4_t sum_abs_diff = vdupq_n_u32(0); +static INLINE uint32_t highbd_sad4xh_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h) { const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr); - for (i = 0; i < height; i++) { - for (j = 0; j < width; j += 4) { - const uint16x4_t src_u16 = vld1_u16(src16_ptr + j); - const uint16x4_t ref_u16 = vld1_u16(ref16_ptr + j); - sum_abs_diff = vabal_u16(sum_abs_diff, src_u16, ref_u16); - } + uint32x4_t sum = vdupq_n_u32(0); + + int i = h; + do { + uint16x4_t s = vld1_u16(src16_ptr); + uint16x4_t r = vld1_u16(ref16_ptr); + sum = vabal_u16(sum, s, r); + src16_ptr += src_stride; ref16_ptr += ref_stride; - } + } while (--i != 0); - return horizontal_add_uint32x4(sum_abs_diff); + return horizontal_add_uint32x4(sum); } -static VPX_FORCE_INLINE uint32_t highbd_sad8_neon(const uint8_t *src_ptr, - int src_stride, - const uint8_t *ref_ptr, - int ref_stride, int width, - int height) { - int i, j; - uint32x4_t sum_abs_diff = vdupq_n_u32(0); +static INLINE uint32_t highbd_sad8xh_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h) { const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr); - for (i = 0; i < height; i++) { - for (j = 0; j < width; j += 8) { - const uint16x8_t src_u16 = vld1q_u16(src16_ptr + j); - const uint16x8_t ref_u16 = vld1q_u16(ref16_ptr + j); - sum_abs_diff = - vabal_u16(sum_abs_diff, vget_low_u16(src_u16), vget_low_u16(ref_u16)); - sum_abs_diff = vabal_u16(sum_abs_diff, vget_high_u16(src_u16), - vget_high_u16(ref_u16)); - } + uint32x4_t sum = vdupq_n_u32(0); + + int i = h; + do { + uint16x8_t s = vld1q_u16(src16_ptr); + uint16x8_t r = vld1q_u16(ref16_ptr); + uint16x8_t diff = vabdq_u16(s, r); + sum = vpadalq_u16(sum, diff); + src16_ptr += src_stride; ref16_ptr += ref_stride; - } + } while (--i != 0); - return horizontal_add_uint32x4(sum_abs_diff); + return horizontal_add_uint32x4(sum); +} + +static INLINE uint32_t highbd_sad16xh_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h) { + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr); + uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = h; + do { + uint16x8_t s0, s1, r0, r1; + uint16x8_t diff0, diff1; + + s0 = vld1q_u16(src16_ptr); + r0 = vld1q_u16(ref16_ptr); + diff0 = vabdq_u16(s0, r0); + sum[0] = vpadalq_u16(sum[0], diff0); + + s1 = vld1q_u16(src16_ptr + 8); + r1 = vld1q_u16(ref16_ptr + 8); + diff1 = vabdq_u16(s1, r1); + sum[1] = vpadalq_u16(sum[1], diff1); + + src16_ptr += src_stride; + ref16_ptr += ref_stride; + } while (--i != 0); + + sum[0] = vaddq_u32(sum[0], sum[1]); + return horizontal_add_uint32x4(sum[0]); } +static INLINE uint32_t highbd_sadwxh_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int w, int h) { + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr); + uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + + int i = h; + do { + int j = 0; + do { + uint16x8_t s0, s1, s2, s3, r0, r1, r2, r3; + uint16x8_t diff0, diff1, diff2, diff3; + + s0 = vld1q_u16(src16_ptr + j); + r0 = vld1q_u16(ref16_ptr + j); + diff0 = vabdq_u16(s0, r0); + sum[0] = vpadalq_u16(sum[0], diff0); + + s1 = vld1q_u16(src16_ptr + j + 8); + r1 = vld1q_u16(ref16_ptr + j + 8); + diff1 = vabdq_u16(s1, r1); + sum[1] = vpadalq_u16(sum[1], diff1); + + s2 = vld1q_u16(src16_ptr + j + 16); + r2 = vld1q_u16(ref16_ptr + j + 16); + diff2 = vabdq_u16(s2, r2); + sum[2] = vpadalq_u16(sum[2], diff2); + + s3 = vld1q_u16(src16_ptr + j + 24); + r3 = vld1q_u16(ref16_ptr + j + 24); + diff3 = vabdq_u16(s3, r3); + sum[3] = vpadalq_u16(sum[3], diff3); + + j += 32; + } while (j < w); + + src16_ptr += src_stride; + ref16_ptr += ref_stride; + } while (--i != 0); + + sum[0] = vaddq_u32(sum[0], sum[1]); + sum[2] = vaddq_u32(sum[2], sum[3]); + sum[0] = vaddq_u32(sum[0], sum[2]); + + return horizontal_add_uint32x4(sum[0]); +} + +static INLINE unsigned int highbd_sad64xh_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h) { + return highbd_sadwxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 64, h); +} + +static INLINE unsigned int highbd_sad32xh_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h) { + return highbd_sadwxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 32, h); +} + +#define HBD_SAD_WXH_NEON(w, h) \ + unsigned int vpx_highbd_sad##w##x##h##_neon( \ + const uint8_t *src, int src_stride, const uint8_t *ref, \ + int ref_stride) { \ + return highbd_sad##w##xh_neon(src, src_stride, ref, ref_stride, (h)); \ + } + +HBD_SAD_WXH_NEON(4, 4) +HBD_SAD_WXH_NEON(4, 8) + +HBD_SAD_WXH_NEON(8, 4) +HBD_SAD_WXH_NEON(8, 8) +HBD_SAD_WXH_NEON(8, 16) + +HBD_SAD_WXH_NEON(16, 8) +HBD_SAD_WXH_NEON(16, 16) +HBD_SAD_WXH_NEON(16, 32) + +HBD_SAD_WXH_NEON(32, 16) +HBD_SAD_WXH_NEON(32, 32) +HBD_SAD_WXH_NEON(32, 64) + +HBD_SAD_WXH_NEON(64, 32) +HBD_SAD_WXH_NEON(64, 64) + static VPX_FORCE_INLINE uint32_t highbd_sad4_avg_neon( const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, int width, int height) { @@ -115,20 +231,6 @@ static VPX_FORCE_INLINE uint32_t highbd_sad8_avg_neon( return horizontal_add_uint32x4(sum_abs_diff); } -#define highbd_sad4MxN(m, n) \ - unsigned int vpx_highbd_sad##m##x##n##_neon( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride) { \ - return highbd_sad4_neon(src_ptr, src_stride, ref_ptr, ref_stride, m, n); \ - } - -#define highbd_sadMxN(m, n) \ - unsigned int vpx_highbd_sad##m##x##n##_neon( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride) { \ - return highbd_sad8_neon(src_ptr, src_stride, ref_ptr, ref_stride, m, n); \ - } - #define highbd_sad4MxN_avg(m, n) \ unsigned int vpx_highbd_sad##m##x##n##_avg_neon( \ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ @@ -159,67 +261,54 @@ static VPX_FORCE_INLINE uint32_t highbd_sad8_avg_neon( /* clang-format off */ // 4x4 -highbd_sad4MxN(4, 4) highbd_sad4MxN_avg(4, 4) highbd_sadMxNx4D(4, 4) // 4x8 -highbd_sad4MxN(4, 8) highbd_sad4MxN_avg(4, 8) highbd_sadMxNx4D(4, 8) // 8x4 -highbd_sadMxN(8, 4) highbd_sadMxN_avg(8, 4) highbd_sadMxNx4D(8, 4) // 8x8 -highbd_sadMxN(8, 8) highbd_sadMxN_avg(8, 8) highbd_sadMxNx4D(8, 8) // 8x16 -highbd_sadMxN(8, 16) highbd_sadMxN_avg(8, 16) highbd_sadMxNx4D(8, 16) // 16x8 -highbd_sadMxN(16, 8) highbd_sadMxN_avg(16, 8) highbd_sadMxNx4D(16, 8) // 16x16 -highbd_sadMxN(16, 16) highbd_sadMxN_avg(16, 16) highbd_sadMxNx4D(16, 16) // 16x32 -highbd_sadMxN(16, 32) highbd_sadMxN_avg(16, 32) highbd_sadMxNx4D(16, 32) // 32x16 -highbd_sadMxN(32, 16) highbd_sadMxN_avg(32, 16) highbd_sadMxNx4D(32, 16) // 32x32 -highbd_sadMxN(32, 32) highbd_sadMxN_avg(32, 32) highbd_sadMxNx4D(32, 32) // 32x64 -highbd_sadMxN(32, 64) highbd_sadMxN_avg(32, 64) highbd_sadMxNx4D(32, 64) // 64x32 -highbd_sadMxN(64, 32) highbd_sadMxN_avg(64, 32) highbd_sadMxNx4D(64, 32) // 64x64 -highbd_sadMxN(64, 64) highbd_sadMxN_avg(64, 64) highbd_sadMxNx4D(64, 64) /* clang-format on */ From 9a5cbfbc087210eabfac5b0c2d72d12852ac56ae Mon Sep 17 00:00:00 2001 From: Salome Thirot Date: Fri, 3 Feb 2023 11:00:19 +0000 Subject: [PATCH 0591/1000] Optimize Neon implementation of high bitdepth avg SAD functions Optimizations take a similar form to those implemented for standard bitdepth averaging SAD: - Use ABD, UADALP instead of ABAL, ABAL2 (double the throughput on modern out-of-order Arm-designed cores.) - Use more accumulator registers to make better use of Neon pipeline resources on Arm CPUs that have four Neon pipes. Change-Id: I75c5f09948f6bf17200f82e00e7a827a80451108 --- vpx_dsp/arm/highbd_sad_neon.c | 244 +++++++++++++++++++++++++--------- 1 file changed, 181 insertions(+), 63 deletions(-) diff --git a/vpx_dsp/arm/highbd_sad_neon.c b/vpx_dsp/arm/highbd_sad_neon.c index 8415481f00..c76eb12b94 100644 --- a/vpx_dsp/arm/highbd_sad_neon.c +++ b/vpx_dsp/arm/highbd_sad_neon.c @@ -180,73 +180,204 @@ HBD_SAD_WXH_NEON(32, 64) HBD_SAD_WXH_NEON(64, 32) HBD_SAD_WXH_NEON(64, 64) -static VPX_FORCE_INLINE uint32_t highbd_sad4_avg_neon( - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, - int ref_stride, const uint8_t *second_pred, int width, int height) { - int i, j; - uint32x4_t sum_abs_diff = vdupq_n_u32(0); +static INLINE uint32_t highbd_sad4xh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + const uint8_t *second_pred) { const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr); - const uint16_t *pred_ptr = CONVERT_TO_SHORTPTR(second_pred); - for (i = 0; i < height; i++) { - for (j = 0; j < width; j += 4) { - const uint16x4_t a_u16 = vld1_u16(src16_ptr + j); - const uint16x4_t b_u16 = vld1_u16(ref16_ptr + j); - const uint16x4_t c_u16 = vld1_u16(pred_ptr + j); - const uint16x4_t avg = vrhadd_u16(b_u16, c_u16); - sum_abs_diff = vabal_u16(sum_abs_diff, a_u16, avg); - } + const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred); + uint32x4_t sum = vdupq_n_u32(0); + + int i = h; + do { + uint16x4_t s = vld1_u16(src16_ptr); + uint16x4_t r = vld1_u16(ref16_ptr); + uint16x4_t p = vld1_u16(pred16_ptr); + + uint16x4_t avg = vrhadd_u16(r, p); + sum = vabal_u16(sum, s, avg); + src16_ptr += src_stride; ref16_ptr += ref_stride; - pred_ptr += width; - } + pred16_ptr += 4; + } while (--i != 0); - return horizontal_add_uint32x4(sum_abs_diff); + return horizontal_add_uint32x4(sum); } -static VPX_FORCE_INLINE uint32_t highbd_sad8_avg_neon( - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, - int ref_stride, const uint8_t *second_pred, int width, int height) { - int i, j; - uint32x4_t sum_abs_diff = vdupq_n_u32(0); +static INLINE uint32_t highbd_sad8xh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + const uint8_t *second_pred) { const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr); - const uint16_t *pred_ptr = CONVERT_TO_SHORTPTR(second_pred); - for (i = 0; i < height; i++) { - for (j = 0; j < width; j += 8) { - const uint16x8_t a_u16 = vld1q_u16(src16_ptr + j); - const uint16x8_t b_u16 = vld1q_u16(ref16_ptr + j); - const uint16x8_t c_u16 = vld1q_u16(pred_ptr + j); - const uint16x8_t avg = vrhaddq_u16(b_u16, c_u16); - sum_abs_diff = - vabal_u16(sum_abs_diff, vget_low_u16(a_u16), vget_low_u16(avg)); - sum_abs_diff = - vabal_u16(sum_abs_diff, vget_high_u16(a_u16), vget_high_u16(avg)); - } + const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred); + uint32x4_t sum = vdupq_n_u32(0); + + int i = h; + do { + uint16x8_t s = vld1q_u16(src16_ptr); + uint16x8_t r = vld1q_u16(ref16_ptr); + uint16x8_t p = vld1q_u16(pred16_ptr); + + uint16x8_t avg = vrhaddq_u16(r, p); + uint16x8_t diff = vabdq_u16(s, avg); + sum = vpadalq_u16(sum, diff); + src16_ptr += src_stride; ref16_ptr += ref_stride; - pred_ptr += width; - } + pred16_ptr += 8; + } while (--i != 0); - return horizontal_add_uint32x4(sum_abs_diff); + return horizontal_add_uint32x4(sum); } -#define highbd_sad4MxN_avg(m, n) \ - unsigned int vpx_highbd_sad##m##x##n##_avg_neon( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride, const uint8_t *second_pred) { \ - return highbd_sad4_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, \ - second_pred, m, n); \ - } +static INLINE uint32_t highbd_sad16xh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + const uint8_t *second_pred) { + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr); + const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred); + uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = h; + do { + uint16x8_t s0, s1, r0, r1, p0, p1; + uint16x8_t avg0, avg1, diff0, diff1; + + s0 = vld1q_u16(src16_ptr); + r0 = vld1q_u16(ref16_ptr); + p0 = vld1q_u16(pred16_ptr); + avg0 = vrhaddq_u16(r0, p0); + diff0 = vabdq_u16(s0, avg0); + sum[0] = vpadalq_u16(sum[0], diff0); + + s1 = vld1q_u16(src16_ptr + 8); + r1 = vld1q_u16(ref16_ptr + 8); + p1 = vld1q_u16(pred16_ptr + 8); + avg1 = vrhaddq_u16(r1, p1); + diff1 = vabdq_u16(s1, avg1); + sum[1] = vpadalq_u16(sum[1], diff1); + + src16_ptr += src_stride; + ref16_ptr += ref_stride; + pred16_ptr += 16; + } while (--i != 0); + + sum[0] = vaddq_u32(sum[0], sum[1]); + return horizontal_add_uint32x4(sum[0]); +} -#define highbd_sadMxN_avg(m, n) \ - unsigned int vpx_highbd_sad##m##x##n##_avg_neon( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride, const uint8_t *second_pred) { \ - return highbd_sad8_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, \ - second_pred, m, n); \ +static INLINE uint32_t highbd_sadwxh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int w, int h, + const uint8_t *second_pred) { + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr); + const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred); + uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + + int i = h; + do { + int j = 0; + do { + uint16x8_t s0, s1, s2, s3, r0, r1, r2, r3, p0, p1, p2, p3; + uint16x8_t avg0, avg1, avg2, avg3, diff0, diff1, diff2, diff3; + + s0 = vld1q_u16(src16_ptr + j); + r0 = vld1q_u16(ref16_ptr + j); + p0 = vld1q_u16(pred16_ptr + j); + avg0 = vrhaddq_u16(r0, p0); + diff0 = vabdq_u16(s0, avg0); + sum[0] = vpadalq_u16(sum[0], diff0); + + s1 = vld1q_u16(src16_ptr + j + 8); + r1 = vld1q_u16(ref16_ptr + j + 8); + p1 = vld1q_u16(pred16_ptr + j + 8); + avg1 = vrhaddq_u16(r1, p1); + diff1 = vabdq_u16(s1, avg1); + sum[1] = vpadalq_u16(sum[1], diff1); + + s2 = vld1q_u16(src16_ptr + j + 16); + r2 = vld1q_u16(ref16_ptr + j + 16); + p2 = vld1q_u16(pred16_ptr + j + 16); + avg2 = vrhaddq_u16(r2, p2); + diff2 = vabdq_u16(s2, avg2); + sum[2] = vpadalq_u16(sum[2], diff2); + + s3 = vld1q_u16(src16_ptr + j + 24); + r3 = vld1q_u16(ref16_ptr + j + 24); + p3 = vld1q_u16(pred16_ptr + j + 24); + avg3 = vrhaddq_u16(r3, p3); + diff3 = vabdq_u16(s3, avg3); + sum[3] = vpadalq_u16(sum[3], diff3); + + j += 32; + } while (j < w); + + src16_ptr += src_stride; + ref16_ptr += ref_stride; + pred16_ptr += w; + } while (--i != 0); + + sum[0] = vaddq_u32(sum[0], sum[1]); + sum[2] = vaddq_u32(sum[2], sum[3]); + sum[0] = vaddq_u32(sum[0], sum[2]); + + return horizontal_add_uint32x4(sum[0]); +} + +static INLINE unsigned int highbd_sad64xh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + const uint8_t *second_pred) { + return highbd_sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 64, h, + second_pred); +} + +static INLINE unsigned int highbd_sad32xh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + const uint8_t *second_pred) { + return highbd_sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 32, h, + second_pred); +} + +#define HBD_SAD_WXH_AVG_NEON(w, h) \ + uint32_t vpx_highbd_sad##w##x##h##_avg_neon( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + return highbd_sad##w##xh_avg_neon(src, src_stride, ref, ref_stride, (h), \ + second_pred); \ } +HBD_SAD_WXH_AVG_NEON(4, 4) +HBD_SAD_WXH_AVG_NEON(4, 8) + +HBD_SAD_WXH_AVG_NEON(8, 4) +HBD_SAD_WXH_AVG_NEON(8, 8) +HBD_SAD_WXH_AVG_NEON(8, 16) + +HBD_SAD_WXH_AVG_NEON(16, 8) +HBD_SAD_WXH_AVG_NEON(16, 16) +HBD_SAD_WXH_AVG_NEON(16, 32) + +HBD_SAD_WXH_AVG_NEON(32, 16) +HBD_SAD_WXH_AVG_NEON(32, 32) +HBD_SAD_WXH_AVG_NEON(32, 64) + +HBD_SAD_WXH_AVG_NEON(64, 32) +HBD_SAD_WXH_AVG_NEON(64, 64) + #define highbd_sadMxNx4D(m, n) \ void vpx_highbd_sad##m##x##n##x4d_neon( \ const uint8_t *src_ptr, int src_stride, \ @@ -261,54 +392,41 @@ static VPX_FORCE_INLINE uint32_t highbd_sad8_avg_neon( /* clang-format off */ // 4x4 -highbd_sad4MxN_avg(4, 4) highbd_sadMxNx4D(4, 4) // 4x8 -highbd_sad4MxN_avg(4, 8) highbd_sadMxNx4D(4, 8) // 8x4 -highbd_sadMxN_avg(8, 4) highbd_sadMxNx4D(8, 4) // 8x8 -highbd_sadMxN_avg(8, 8) highbd_sadMxNx4D(8, 8) // 8x16 -highbd_sadMxN_avg(8, 16) highbd_sadMxNx4D(8, 16) // 16x8 -highbd_sadMxN_avg(16, 8) highbd_sadMxNx4D(16, 8) // 16x16 -highbd_sadMxN_avg(16, 16) highbd_sadMxNx4D(16, 16) // 16x32 -highbd_sadMxN_avg(16, 32) highbd_sadMxNx4D(16, 32) // 32x16 -highbd_sadMxN_avg(32, 16) highbd_sadMxNx4D(32, 16) // 32x32 -highbd_sadMxN_avg(32, 32) highbd_sadMxNx4D(32, 32) // 32x64 -highbd_sadMxN_avg(32, 64) highbd_sadMxNx4D(32, 64) // 64x32 -highbd_sadMxN_avg(64, 32) highbd_sadMxNx4D(64, 32) // 64x64 -highbd_sadMxN_avg(64, 64) highbd_sadMxNx4D(64, 64) /* clang-format on */ From 5eea5c76669fefb07e5dcc4677942c28a5de4257 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Mon, 6 Feb 2023 13:29:58 -0500 Subject: [PATCH 0592/1000] Remove duplicated VPX_SCALING declaration Use VPX_SCALING_MODE instead Change-Id: Iab9d29f20838703e00bd9f7641035d8ebd69af53 --- vp8/common/onyx.h | 19 ++++++------------- vp8/encoder/firstpass.c | 4 ++-- vp8/encoder/onyx_if.c | 20 +++++++++++--------- vp8/vp8_cx_iface.c | 4 ++-- vp9/encoder/vp9_encoder.c | 16 ++++++++-------- vp9/encoder/vp9_encoder.h | 11 ++--------- vp9/vp9_cx_iface.c | 5 ++--- 7 files changed, 33 insertions(+), 46 deletions(-) diff --git a/vp8/common/onyx.h b/vp8/common/onyx.h index 05c72df3fa..8c35e433e7 100644 --- a/vp8/common/onyx.h +++ b/vp8/common/onyx.h @@ -26,13 +26,6 @@ struct VP8_COMP; /* Create/destroy static data structures. */ -typedef enum { - NORMAL = 0, - FOURFIVE = 1, - THREEFIVE = 2, - ONETWO = 3 -} VPX_SCALING; - typedef enum { USAGE_LOCAL_FILE_PLAYBACK = 0x0, USAGE_STREAM_FROM_SERVER = 0x1, @@ -58,19 +51,19 @@ typedef enum { #include static INLINE void Scale2Ratio(int mode, int *hr, int *hs) { switch (mode) { - case NORMAL: + case VP8E_NORMAL: *hr = 1; *hs = 1; break; - case FOURFIVE: + case VP8E_FOURFIVE: *hr = 4; *hs = 5; break; - case THREEFIVE: + case VP8E_THREEFIVE: *hr = 3; *hs = 5; break; - case ONETWO: + case VP8E_ONETWO: *hr = 1; *hs = 2; break; @@ -273,8 +266,8 @@ int vp8_set_roimap(struct VP8_COMP *cpi, unsigned char *map, unsigned int rows, unsigned int threshold[4]); int vp8_set_active_map(struct VP8_COMP *cpi, unsigned char *map, unsigned int rows, unsigned int cols); -int vp8_set_internal_size(struct VP8_COMP *cpi, VPX_SCALING horiz_mode, - VPX_SCALING vert_mode); +int vp8_set_internal_size(struct VP8_COMP *cpi, VPX_SCALING_MODE horiz_mode, + VPX_SCALING_MODE vert_mode); int vp8_get_quantizer(struct VP8_COMP *cpi); #ifdef __cplusplus diff --git a/vp8/encoder/firstpass.c b/vp8/encoder/firstpass.c index 65d2681c91..4149fb4bf8 100644 --- a/vp8/encoder/firstpass.c +++ b/vp8/encoder/firstpass.c @@ -2990,8 +2990,8 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { } /* Set back to unscaled by defaults */ - cpi->common.horiz_scale = NORMAL; - cpi->common.vert_scale = NORMAL; + cpi->common.horiz_scale = VP8E_NORMAL; + cpi->common.vert_scale = VP8E_NORMAL; /* Calculate Average bits per frame. */ av_bits_per_frame = cpi->oxcf.target_bandwidth / diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index 4bbeadef01..bcf5227029 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -1667,7 +1667,7 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) { cm->sharpness_level = cpi->oxcf.Sharpness; - if (cm->horiz_scale != NORMAL || cm->vert_scale != NORMAL) { + if (cm->horiz_scale != VP8E_NORMAL || cm->vert_scale != VP8E_NORMAL) { int hr, hs, vr, vs; Scale2Ratio(cm->horiz_scale, &hr, &hs); @@ -2504,15 +2504,17 @@ static int resize_key_frame(VP8_COMP *cpi) { if (cpi->buffer_level < (cpi->oxcf.resample_down_water_mark * cpi->oxcf.optimal_buffer_level / 100)) { cm->horiz_scale = - (cm->horiz_scale < ONETWO) ? cm->horiz_scale + 1 : ONETWO; - cm->vert_scale = (cm->vert_scale < ONETWO) ? cm->vert_scale + 1 : ONETWO; + (cm->horiz_scale < VP8E_ONETWO) ? cm->horiz_scale + 1 : VP8E_ONETWO; + cm->vert_scale = + (cm->vert_scale < VP8E_ONETWO) ? cm->vert_scale + 1 : VP8E_ONETWO; } /* Should we now start scaling back up */ else if (cpi->buffer_level > (cpi->oxcf.resample_up_water_mark * cpi->oxcf.optimal_buffer_level / 100)) { cm->horiz_scale = - (cm->horiz_scale > NORMAL) ? cm->horiz_scale - 1 : NORMAL; - cm->vert_scale = (cm->vert_scale > NORMAL) ? cm->vert_scale - 1 : NORMAL; + (cm->horiz_scale > VP8E_NORMAL) ? cm->horiz_scale - 1 : VP8E_NORMAL; + cm->vert_scale = + (cm->vert_scale > VP8E_NORMAL) ? cm->vert_scale - 1 : VP8E_NORMAL; } /* Get the new height and width */ @@ -5380,15 +5382,15 @@ int vp8_set_active_map(VP8_COMP *cpi, unsigned char *map, unsigned int rows, } } -int vp8_set_internal_size(VP8_COMP *cpi, VPX_SCALING horiz_mode, - VPX_SCALING vert_mode) { - if (horiz_mode <= ONETWO) { +int vp8_set_internal_size(VP8_COMP *cpi, VPX_SCALING_MODE horiz_mode, + VPX_SCALING_MODE vert_mode) { + if (horiz_mode <= VP8E_ONETWO) { cpi->common.horiz_scale = horiz_mode; } else { return -1; } - if (vert_mode <= ONETWO) { + if (vert_mode <= VP8E_ONETWO) { cpi->common.vert_scale = vert_mode; } else { return -1; diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c index 340f3e6638..a9d1f8005d 100644 --- a/vp8/vp8_cx_iface.c +++ b/vp8/vp8_cx_iface.c @@ -1224,8 +1224,8 @@ static vpx_codec_err_t vp8e_set_scalemode(vpx_codec_alg_priv_t *ctx, if (data) { int res; vpx_scaling_mode_t scalemode = *(vpx_scaling_mode_t *)data; - res = vp8_set_internal_size(ctx->cpi, (VPX_SCALING)scalemode.h_scaling_mode, - (VPX_SCALING)scalemode.v_scaling_mode); + res = vp8_set_internal_size(ctx->cpi, scalemode.h_scaling_mode, + scalemode.v_scaling_mode); if (!res) { /*force next frame a key frame to effect scaling mode */ diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index b9fc148d7b..c14b56e24b 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -502,22 +502,22 @@ static const char *level_fail_messages[TARGET_LEVEL_FAIL_IDS] = { "Too many reference buffers are used." }; -static INLINE void Scale2Ratio(VPX_SCALING mode, int *hr, int *hs) { +static INLINE void Scale2Ratio(VPX_SCALING_MODE mode, int *hr, int *hs) { switch (mode) { - case NORMAL: + case VP8E_NORMAL: *hr = 1; *hs = 1; break; - case FOURFIVE: + case VP8E_FOURFIVE: *hr = 4; *hs = 5; break; - case THREEFIVE: + case VP8E_THREEFIVE: *hr = 3; *hs = 5; break; default: - assert(mode == ONETWO); + assert(mode == VP8E_ONETWO); *hr = 1; *hs = 2; break; @@ -8237,12 +8237,12 @@ int vp9_get_preview_raw_frame(VP9_COMP *cpi, YV12_BUFFER_CONFIG *dest, } } -int vp9_set_internal_size(VP9_COMP *cpi, VPX_SCALING horiz_mode, - VPX_SCALING vert_mode) { +int vp9_set_internal_size(VP9_COMP *cpi, VPX_SCALING_MODE horiz_mode, + VPX_SCALING_MODE vert_mode) { VP9_COMMON *cm = &cpi->common; int hr = 0, hs = 0, vr = 0, vs = 0; - if (horiz_mode > ONETWO || vert_mode > ONETWO) return -1; + if (horiz_mode > VP8E_ONETWO || vert_mode > VP8E_ONETWO) return -1; Scale2Ratio(horiz_mode, &hr, &hs); Scale2Ratio(vert_mode, &vr, &vs); diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 33a2844d3e..bdca8e58bf 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -90,13 +90,6 @@ typedef enum { ENCODE_BREAKOUT_LIMITED = 2 } ENCODE_BREAKOUT_TYPE; -typedef enum { - NORMAL = 0, - FOURFIVE = 1, - THREEFIVE = 2, - ONETWO = 3 -} VPX_SCALING; - typedef enum { // Good Quality Fast Encoding. The encoder balances quality with the amount of // time it takes to encode the output. Speed setting controls how fast. @@ -1236,8 +1229,8 @@ int vp9_set_active_map(VP9_COMP *cpi, unsigned char *new_map_16x16, int rows, int vp9_get_active_map(VP9_COMP *cpi, unsigned char *new_map_16x16, int rows, int cols); -int vp9_set_internal_size(VP9_COMP *cpi, VPX_SCALING horiz_mode, - VPX_SCALING vert_mode); +int vp9_set_internal_size(VP9_COMP *cpi, VPX_SCALING_MODE horiz_mode, + VPX_SCALING_MODE vert_mode); int vp9_set_size_literal(VP9_COMP *cpi, unsigned int width, unsigned int height); diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index dee175dc09..4c7eaed725 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -1675,9 +1675,8 @@ static vpx_codec_err_t ctrl_set_scale_mode(vpx_codec_alg_priv_t *ctx, vpx_scaling_mode_t *const mode = va_arg(args, vpx_scaling_mode_t *); if (mode) { - const int res = - vp9_set_internal_size(ctx->cpi, (VPX_SCALING)mode->h_scaling_mode, - (VPX_SCALING)mode->v_scaling_mode); + const int res = vp9_set_internal_size(ctx->cpi, mode->h_scaling_mode, + mode->v_scaling_mode); return (res == 0) ? VPX_CODEC_OK : VPX_CODEC_INVALID_PARAM; } return VPX_CODEC_INVALID_PARAM; From 6b8e9e1f3eb5fe8420d49d0b4df146fb1e91e1cf Mon Sep 17 00:00:00 2001 From: Salome Thirot Date: Thu, 2 Feb 2023 16:06:38 +0000 Subject: [PATCH 0593/1000] Optimize Neon implementation of high bitdepth SAD4D functions Optimizations take a similar form to those implemented for Armv8.0 standard bitdepth SAD4D: - Use ABD, UADALP instead of ABAL, ABAL2 (double the throughput on modern out-of-order Arm-designed cores.) - Use more accumulator registers to make better use of Neon pipeline resources on Arm CPUs that have four Neon pipes. - Compute the four SAD sums in parallel so that we only load the source block once - instead of four times. Change-Id: Ica45c44fd167e5fcc83871d8c138fc72ed3a9723 --- vpx_dsp/arm/highbd_sad4d_neon.c | 248 ++++++++++++++++++++++++++++++++ vpx_dsp/arm/highbd_sad_neon.c | 53 ------- vpx_dsp/vpx_dsp.mk | 1 + 3 files changed, 249 insertions(+), 53 deletions(-) create mode 100644 vpx_dsp/arm/highbd_sad4d_neon.c diff --git a/vpx_dsp/arm/highbd_sad4d_neon.c b/vpx_dsp/arm/highbd_sad4d_neon.c new file mode 100644 index 0000000000..d5e9e8ad22 --- /dev/null +++ b/vpx_dsp/arm/highbd_sad4d_neon.c @@ -0,0 +1,248 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" + +#include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" + +static INLINE uint32x4_t horizontal_add_4d_u32(uint32x4_t sum[4]) { +#if defined(__aarch64__) + uint32x4_t res01 = vpaddq_u32(sum[0], sum[1]); + uint32x4_t res23 = vpaddq_u32(sum[2], sum[3]); + return vpaddq_u32(res01, res23); +#else + uint32x4_t res = vdupq_n_u32(0); + res = vsetq_lane_u32(horizontal_add_uint32x4(sum[0]), res, 0); + res = vsetq_lane_u32(horizontal_add_uint32x4(sum[1]), res, 1); + res = vsetq_lane_u32(horizontal_add_uint32x4(sum[2]), res, 2); + res = vsetq_lane_u32(horizontal_add_uint32x4(sum[3]), res, 3); + return res; +#endif +} + +static INLINE void highbd_sad4xhx4d_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_ptr[4], + int ref_stride, uint32_t res[4], + int h) { + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]); + const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]); + const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]); + const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]); + + uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + + int i = 0; + do { + uint16x4_t s = vld1_u16(src16_ptr + i * src_stride); + uint16x4_t r0 = vld1_u16(ref16_ptr0 + i * ref_stride); + uint16x4_t r1 = vld1_u16(ref16_ptr1 + i * ref_stride); + uint16x4_t r2 = vld1_u16(ref16_ptr2 + i * ref_stride); + uint16x4_t r3 = vld1_u16(ref16_ptr3 + i * ref_stride); + + sum[0] = vabal_u16(sum[0], s, r0); + sum[1] = vabal_u16(sum[1], s, r1); + sum[2] = vabal_u16(sum[2], s, r2); + sum[3] = vabal_u16(sum[3], s, r3); + + } while (++i < h); + + vst1q_u32(res, horizontal_add_4d_u32(sum)); +} + +static INLINE void sad8_neon(uint16x8_t src, uint16x8_t ref, + uint32x4_t *const sad_sum) { + uint16x8_t abs_diff = vabdq_u16(src, ref); + *sad_sum = vpadalq_u16(*sad_sum, abs_diff); +} + +static INLINE void highbd_sad8xhx4d_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_ptr[4], + int ref_stride, uint32_t res[4], + int h) { + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]); + const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]); + const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]); + const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]); + + uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + + int i = 0; + do { + uint16x8_t s = vld1q_u16(src16_ptr + i * src_stride); + + sad8_neon(s, vld1q_u16(ref16_ptr0 + i * ref_stride), &sum[0]); + sad8_neon(s, vld1q_u16(ref16_ptr1 + i * ref_stride), &sum[1]); + sad8_neon(s, vld1q_u16(ref16_ptr2 + i * ref_stride), &sum[2]); + sad8_neon(s, vld1q_u16(ref16_ptr3 + i * ref_stride), &sum[3]); + + } while (++i < h); + + vst1q_u32(res, horizontal_add_4d_u32(sum)); +} + +static INLINE void highbd_sad16xhx4d_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *const ref_ptr[4], + int ref_stride, uint32_t res[4], + int h) { + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]); + const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]); + const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]); + const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]); + + uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + uint32x4_t sum[4]; + + int i = 0; + do { + uint16x8_t s0, s1; + + s0 = vld1q_u16(src16_ptr + i * src_stride); + sad8_neon(s0, vld1q_u16(ref16_ptr0 + i * ref_stride), &sum_lo[0]); + sad8_neon(s0, vld1q_u16(ref16_ptr1 + i * ref_stride), &sum_lo[1]); + sad8_neon(s0, vld1q_u16(ref16_ptr2 + i * ref_stride), &sum_lo[2]); + sad8_neon(s0, vld1q_u16(ref16_ptr3 + i * ref_stride), &sum_lo[3]); + + s1 = vld1q_u16(src16_ptr + i * src_stride + 8); + sad8_neon(s1, vld1q_u16(ref16_ptr0 + i * ref_stride + 8), &sum_hi[0]); + sad8_neon(s1, vld1q_u16(ref16_ptr1 + i * ref_stride + 8), &sum_hi[1]); + sad8_neon(s1, vld1q_u16(ref16_ptr2 + i * ref_stride + 8), &sum_hi[2]); + sad8_neon(s1, vld1q_u16(ref16_ptr3 + i * ref_stride + 8), &sum_hi[3]); + + } while (++i < h); + + sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]); + sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]); + sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]); + sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]); + + vst1q_u32(res, horizontal_add_4d_u32(sum)); +} + +static INLINE void highbd_sadwxhx4d_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_ptr[4], + int ref_stride, uint32_t res[4], int w, + int h) { + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]); + const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]); + const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]); + const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]); + + uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + uint32x4_t sum[4]; + + int i = 0; + do { + int j = 0; + do { + uint16x8_t s0, s1, s2, s3; + + s0 = vld1q_u16(src16_ptr + i * src_stride + j); + sad8_neon(s0, vld1q_u16(ref16_ptr0 + i * ref_stride + j), &sum_lo[0]); + sad8_neon(s0, vld1q_u16(ref16_ptr1 + i * ref_stride + j), &sum_lo[1]); + sad8_neon(s0, vld1q_u16(ref16_ptr2 + i * ref_stride + j), &sum_lo[2]); + sad8_neon(s0, vld1q_u16(ref16_ptr3 + i * ref_stride + j), &sum_lo[3]); + + s1 = vld1q_u16(src16_ptr + i * src_stride + j + 8); + sad8_neon(s1, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 8), &sum_hi[0]); + sad8_neon(s1, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 8), &sum_hi[1]); + sad8_neon(s1, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 8), &sum_hi[2]); + sad8_neon(s1, vld1q_u16(ref16_ptr3 + i * ref_stride + j + 8), &sum_hi[3]); + + s2 = vld1q_u16(src16_ptr + i * src_stride + j + 16); + sad8_neon(s2, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 16), + &sum_lo[0]); + sad8_neon(s2, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 16), + &sum_lo[1]); + sad8_neon(s2, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 16), + &sum_lo[2]); + sad8_neon(s2, vld1q_u16(ref16_ptr3 + i * ref_stride + j + 16), + &sum_lo[3]); + + s3 = vld1q_u16(src16_ptr + i * src_stride + j + 24); + sad8_neon(s3, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 24), + &sum_hi[0]); + sad8_neon(s3, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 24), + &sum_hi[1]); + sad8_neon(s3, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 24), + &sum_hi[2]); + sad8_neon(s3, vld1q_u16(ref16_ptr3 + i * ref_stride + j + 24), + &sum_hi[3]); + + j += 32; + } while (j < w); + + } while (++i < h); + + sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]); + sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]); + sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]); + sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]); + + vst1q_u32(res, horizontal_add_4d_u32(sum)); +} + +static INLINE void highbd_sad64xhx4d_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *const ref_ptr[4], + int ref_stride, uint32_t res[4], + int h) { + highbd_sadwxhx4d_neon(src_ptr, src_stride, ref_ptr, ref_stride, res, 64, h); +} + +static INLINE void highbd_sad32xhx4d_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *const ref_ptr[4], + int ref_stride, uint32_t res[4], + int h) { + highbd_sadwxhx4d_neon(src_ptr, src_stride, ref_ptr, ref_stride, res, 32, h); +} + +#define HBD_SAD_WXH_4D_NEON(w, h) \ + void vpx_highbd_sad##w##x##h##x4d_neon(const uint8_t *src, int src_stride, \ + const uint8_t *const ref[4], \ + int ref_stride, uint32_t res[4]) { \ + highbd_sad##w##xhx4d_neon(src, src_stride, ref, ref_stride, res, (h)); \ + } + +HBD_SAD_WXH_4D_NEON(4, 4) +HBD_SAD_WXH_4D_NEON(4, 8) + +HBD_SAD_WXH_4D_NEON(8, 4) +HBD_SAD_WXH_4D_NEON(8, 8) +HBD_SAD_WXH_4D_NEON(8, 16) + +HBD_SAD_WXH_4D_NEON(16, 8) +HBD_SAD_WXH_4D_NEON(16, 16) +HBD_SAD_WXH_4D_NEON(16, 32) + +HBD_SAD_WXH_4D_NEON(32, 16) +HBD_SAD_WXH_4D_NEON(32, 32) +HBD_SAD_WXH_4D_NEON(32, 64) + +HBD_SAD_WXH_4D_NEON(64, 32) +HBD_SAD_WXH_4D_NEON(64, 64) diff --git a/vpx_dsp/arm/highbd_sad_neon.c b/vpx_dsp/arm/highbd_sad_neon.c index c76eb12b94..90971f6009 100644 --- a/vpx_dsp/arm/highbd_sad_neon.c +++ b/vpx_dsp/arm/highbd_sad_neon.c @@ -377,56 +377,3 @@ HBD_SAD_WXH_AVG_NEON(32, 64) HBD_SAD_WXH_AVG_NEON(64, 32) HBD_SAD_WXH_AVG_NEON(64, 64) - -#define highbd_sadMxNx4D(m, n) \ - void vpx_highbd_sad##m##x##n##x4d_neon( \ - const uint8_t *src_ptr, int src_stride, \ - const uint8_t *const ref_array[4], int ref_stride, \ - uint32_t sad_array[4]) { \ - int i; \ - for (i = 0; i < 4; ++i) { \ - sad_array[i] = vpx_highbd_sad##m##x##n##_neon(src_ptr, src_stride, \ - ref_array[i], ref_stride); \ - } \ - } - -/* clang-format off */ -// 4x4 -highbd_sadMxNx4D(4, 4) - -// 4x8 -highbd_sadMxNx4D(4, 8) - -// 8x4 -highbd_sadMxNx4D(8, 4) - -// 8x8 -highbd_sadMxNx4D(8, 8) - -// 8x16 -highbd_sadMxNx4D(8, 16) - -// 16x8 -highbd_sadMxNx4D(16, 8) - -// 16x16 -highbd_sadMxNx4D(16, 16) - -// 16x32 -highbd_sadMxNx4D(16, 32) - -// 32x16 -highbd_sadMxNx4D(32, 16) - -// 32x32 -highbd_sadMxNx4D(32, 32) - -// 32x64 -highbd_sadMxNx4D(32, 64) - -// 64x32 -highbd_sadMxNx4D(64, 32) - -// 64x64 -highbd_sadMxNx4D(64, 64) - /* clang-format on */ diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index 1fd9495cf9..3b04e97651 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -392,6 +392,7 @@ DSP_SRCS-$(HAVE_LSX) += loongarch/subtract_lsx.c ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad4d_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad_sse2.asm +DSP_SRCS-$(HAVE_NEON) += arm/highbd_sad4d_neon.c DSP_SRCS-$(HAVE_NEON) += arm/highbd_sad_neon.c DSP_SRCS-$(HAVE_AVX2) += x86/highbd_sad4d_avx2.c DSP_SRCS-$(HAVE_AVX2) += x86/highbd_sad_avx2.c From ec8e2fe1cfda967b78c7068404455348357c7e7d Mon Sep 17 00:00:00 2001 From: Yunqing Wang Date: Mon, 6 Feb 2023 14:48:34 -0800 Subject: [PATCH 0594/1000] Move TPL to a new file This is a refactoring CL. Change-Id: Ic8c1575601d27f14ecd1b1bf0a038e447eaae458 --- vp9/encoder/vp9_encoder.c | 1429 +---------------------------------- vp9/encoder/vp9_encoder.h | 13 + vp9/encoder/vp9_tpl_model.c | 1400 ++++++++++++++++++++++++++++++++++ vp9/encoder/vp9_tpl_model.h | 44 ++ vp9/vp9cx.mk | 2 + 5 files changed, 1466 insertions(+), 1422 deletions(-) create mode 100644 vp9/encoder/vp9_tpl_model.c create mode 100644 vp9/encoder/vp9_tpl_model.h diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index b9fc148d7b..f8d1d64672 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -34,16 +34,12 @@ #include "vp9/common/vp9_alloccommon.h" #include "vp9/common/vp9_filter.h" #include "vp9/common/vp9_idct.h" -#if CONFIG_NON_GREEDY_MV -#include "vp9/common/vp9_mvref_common.h" -#endif #if CONFIG_VP9_POSTPROC #include "vp9/common/vp9_postproc.h" #endif #include "vp9/common/vp9_reconinter.h" #include "vp9/common/vp9_reconintra.h" #include "vp9/common/vp9_tile_common.h" -#include "vp9/common/vp9_scan.h" #if !CONFIG_REALTIME_ONLY #include "vp9/encoder/vp9_alt_ref_aq.h" @@ -81,6 +77,7 @@ #include "vp9/encoder/vp9_speed_features.h" #include "vp9/encoder/vp9_svc_layercontext.h" #include "vp9/encoder/vp9_temporal_filter.h" +#include "vp9/encoder/vp9_tpl_model.h" #include "vp9/vp9_cx_iface.h" #define AM_SEGMENT_ID_INACTIVE 7 @@ -126,13 +123,6 @@ static int is_spatial_denoise_enabled(VP9_COMP *cpi) { } #endif -#if CONFIG_VP9_HIGHBITDEPTH -void highbd_wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff, - TX_SIZE tx_size); -#endif -void wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff, - TX_SIZE tx_size); - #if !CONFIG_REALTIME_ONLY // compute adaptive threshold for skip recoding static int compute_context_model_thresh(const VP9_COMP *const cpi) { @@ -2109,11 +2099,6 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { vp9_set_row_mt(cpi); } -#ifndef M_LOG2_E -#define M_LOG2_E 0.693147180559945309417 -#endif -#define log2f(x) (log(x) / (float)M_LOG2_E) - /*********************************************************************** * Read before modifying 'cal_nmvjointsadcost' or 'cal_nmvsadcosts' * *********************************************************************** @@ -2666,8 +2651,6 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf, snprintf((H) + strlen(H), sizeof(H) - strlen(H), (T), (V)) #endif // CONFIG_INTERNAL_STATS -static void free_tpl_buffer(VP9_COMP *cpi); - void vp9_remove_compressor(VP9_COMP *cpi) { VP9_COMMON *cm; unsigned int i; @@ -2781,7 +2764,7 @@ void vp9_remove_compressor(VP9_COMP *cpi) { vpx_free(cpi->kmeans_data_arr); } - free_tpl_buffer(cpi); + vp9_free_tpl_buffer(cpi); vp9_loop_filter_dealloc(&cpi->lf_row_sync); vp9_bitstream_encode_tiles_buffer_dealloc(cpi); @@ -3329,19 +3312,6 @@ static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) { vpx_extend_frame_inner_borders(cm->frame_to_show); } -static INLINE void alloc_frame_mvs(VP9_COMMON *const cm, int buffer_idx) { - RefCntBuffer *const new_fb_ptr = &cm->buffer_pool->frame_bufs[buffer_idx]; - if (new_fb_ptr->mvs == NULL || new_fb_ptr->mi_rows < cm->mi_rows || - new_fb_ptr->mi_cols < cm->mi_cols) { - vpx_free(new_fb_ptr->mvs); - CHECK_MEM_ERROR(cm, new_fb_ptr->mvs, - (MV_REF *)vpx_calloc(cm->mi_rows * cm->mi_cols, - sizeof(*new_fb_ptr->mvs))); - new_fb_ptr->mi_rows = cm->mi_rows; - new_fb_ptr->mi_cols = cm->mi_cols; - } -} - void vp9_scale_references(VP9_COMP *cpi) { VP9_COMMON *cm = &cpi->common; MV_REFERENCE_FRAME ref_frame; @@ -5302,16 +5272,16 @@ static void set_mb_wiener_variance(VP9_COMP *cpi) { vpx_highbd_subtract_block(block_size, block_size, src_diff, block_size, mb_buffer, buf_stride, zero_pred, block_size, xd->bd); - highbd_wht_fwd_txfm(src_diff, block_size, coeff, tx_size); + vp9_highbd_wht_fwd_txfm(src_diff, block_size, coeff, tx_size); } else { vpx_subtract_block(block_size, block_size, src_diff, block_size, mb_buffer, buf_stride, zero_pred, block_size); - wht_fwd_txfm(src_diff, block_size, coeff, tx_size); + vp9_wht_fwd_txfm(src_diff, block_size, coeff, tx_size); } #else vpx_subtract_block(block_size, block_size, src_diff, block_size, mb_buffer, buf_stride, zero_pred, block_size); - wht_fwd_txfm(src_diff, block_size, coeff, tx_size); + vp9_wht_fwd_txfm(src_diff, block_size, coeff, tx_size); #endif // CONFIG_VP9_HIGHBITDEPTH coeff[0] = 0; @@ -6229,1391 +6199,6 @@ static void update_level_info(VP9_COMP *cpi, size_t *size, int arf_src_index) { } } -typedef struct GF_PICTURE { - YV12_BUFFER_CONFIG *frame; - int ref_frame[3]; - FRAME_UPDATE_TYPE update_type; -} GF_PICTURE; - -static void init_gop_frames(VP9_COMP *cpi, GF_PICTURE *gf_picture, - const GF_GROUP *gf_group, int *tpl_group_frames) { - VP9_COMMON *cm = &cpi->common; - int frame_idx = 0; - int i; - int gld_index = -1; - int alt_index = -1; - int lst_index = -1; - int arf_index_stack[MAX_ARF_LAYERS]; - int arf_stack_size = 0; - int extend_frame_count = 0; - int pframe_qindex = cpi->tpl_stats[2].base_qindex; - int frame_gop_offset = 0; - - RefCntBuffer *frame_bufs = cm->buffer_pool->frame_bufs; - int8_t recon_frame_index[REFS_PER_FRAME + MAX_ARF_LAYERS]; - - memset(recon_frame_index, -1, sizeof(recon_frame_index)); - stack_init(arf_index_stack, MAX_ARF_LAYERS); - - // TODO(jingning): To be used later for gf frame type parsing. - (void)gf_group; - - for (i = 0; i < FRAME_BUFFERS; ++i) { - if (frame_bufs[i].ref_count == 0) { - alloc_frame_mvs(cm, i); - if (vpx_realloc_frame_buffer(&frame_bufs[i].buf, cm->width, cm->height, - cm->subsampling_x, cm->subsampling_y, -#if CONFIG_VP9_HIGHBITDEPTH - cm->use_highbitdepth, -#endif - VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment, - NULL, NULL, NULL)) - vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, - "Failed to allocate frame buffer"); - - recon_frame_index[frame_idx] = i; - ++frame_idx; - - if (frame_idx >= REFS_PER_FRAME + cpi->oxcf.enable_auto_arf) break; - } - } - - for (i = 0; i < REFS_PER_FRAME + 1; ++i) { - assert(recon_frame_index[i] >= 0); - cpi->tpl_recon_frames[i] = &frame_bufs[recon_frame_index[i]].buf; - } - - *tpl_group_frames = 0; - - // Initialize Golden reference frame. - gf_picture[0].frame = get_ref_frame_buffer(cpi, GOLDEN_FRAME); - for (i = 0; i < 3; ++i) gf_picture[0].ref_frame[i] = -1; - gf_picture[0].update_type = gf_group->update_type[0]; - gld_index = 0; - ++*tpl_group_frames; - - // Initialize base layer ARF frame - gf_picture[1].frame = cpi->Source; - gf_picture[1].ref_frame[0] = gld_index; - gf_picture[1].ref_frame[1] = lst_index; - gf_picture[1].ref_frame[2] = alt_index; - gf_picture[1].update_type = gf_group->update_type[1]; - alt_index = 1; - ++*tpl_group_frames; - - // Initialize P frames - for (frame_idx = 2; frame_idx < MAX_ARF_GOP_SIZE; ++frame_idx) { - struct lookahead_entry *buf; - frame_gop_offset = gf_group->frame_gop_index[frame_idx]; - buf = vp9_lookahead_peek(cpi->lookahead, frame_gop_offset - 1); - - if (buf == NULL) break; - - gf_picture[frame_idx].frame = &buf->img; - gf_picture[frame_idx].ref_frame[0] = gld_index; - gf_picture[frame_idx].ref_frame[1] = lst_index; - gf_picture[frame_idx].ref_frame[2] = alt_index; - gf_picture[frame_idx].update_type = gf_group->update_type[frame_idx]; - - switch (gf_group->update_type[frame_idx]) { - case ARF_UPDATE: - stack_push(arf_index_stack, alt_index, arf_stack_size); - ++arf_stack_size; - alt_index = frame_idx; - break; - case LF_UPDATE: lst_index = frame_idx; break; - case OVERLAY_UPDATE: - gld_index = frame_idx; - alt_index = stack_pop(arf_index_stack, arf_stack_size); - --arf_stack_size; - break; - case USE_BUF_FRAME: - lst_index = alt_index; - alt_index = stack_pop(arf_index_stack, arf_stack_size); - --arf_stack_size; - break; - default: break; - } - - ++*tpl_group_frames; - - // The length of group of pictures is baseline_gf_interval, plus the - // beginning golden frame from last GOP, plus the last overlay frame in - // the same GOP. - if (frame_idx == gf_group->gf_group_size) break; - } - - alt_index = -1; - ++frame_idx; - ++frame_gop_offset; - - // Extend two frames outside the current gf group. - for (; frame_idx < MAX_LAG_BUFFERS && extend_frame_count < 2; ++frame_idx) { - struct lookahead_entry *buf = - vp9_lookahead_peek(cpi->lookahead, frame_gop_offset - 1); - - if (buf == NULL) break; - - cpi->tpl_stats[frame_idx].base_qindex = pframe_qindex; - - gf_picture[frame_idx].frame = &buf->img; - gf_picture[frame_idx].ref_frame[0] = gld_index; - gf_picture[frame_idx].ref_frame[1] = lst_index; - gf_picture[frame_idx].ref_frame[2] = alt_index; - gf_picture[frame_idx].update_type = LF_UPDATE; - lst_index = frame_idx; - ++*tpl_group_frames; - ++extend_frame_count; - ++frame_gop_offset; - } -} - -static void init_tpl_stats(VP9_COMP *cpi) { - int frame_idx; - for (frame_idx = 0; frame_idx < MAX_ARF_GOP_SIZE; ++frame_idx) { - TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx]; - memset(tpl_frame->tpl_stats_ptr, 0, - tpl_frame->height * tpl_frame->width * - sizeof(*tpl_frame->tpl_stats_ptr)); - tpl_frame->is_valid = 0; - } -} - -#if CONFIG_NON_GREEDY_MV -static uint32_t full_pixel_motion_search(VP9_COMP *cpi, ThreadData *td, - MotionField *motion_field, - int frame_idx, uint8_t *cur_frame_buf, - uint8_t *ref_frame_buf, int stride, - BLOCK_SIZE bsize, int mi_row, - int mi_col, MV *mv) { - MACROBLOCK *const x = &td->mb; - MACROBLOCKD *const xd = &x->e_mbd; - MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv; - int step_param; - uint32_t bestsme = UINT_MAX; - const MvLimits tmp_mv_limits = x->mv_limits; - // lambda is used to adjust the importance of motion vector consistency. - // TODO(angiebird): Figure out lambda's proper value. - const int lambda = cpi->tpl_stats[frame_idx].lambda; - int_mv nb_full_mvs[NB_MVS_NUM]; - int nb_full_mv_num; - - MV best_ref_mv1 = { 0, 0 }; - MV best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */ - - best_ref_mv1_full.col = best_ref_mv1.col >> 3; - best_ref_mv1_full.row = best_ref_mv1.row >> 3; - - // Setup frame pointers - x->plane[0].src.buf = cur_frame_buf; - x->plane[0].src.stride = stride; - xd->plane[0].pre[0].buf = ref_frame_buf; - xd->plane[0].pre[0].stride = stride; - - step_param = mv_sf->reduce_first_step_size; - step_param = VPXMIN(step_param, MAX_MVSEARCH_STEPS - 2); - - vp9_set_mv_search_range(&x->mv_limits, &best_ref_mv1); - - nb_full_mv_num = - vp9_prepare_nb_full_mvs(motion_field, mi_row, mi_col, nb_full_mvs); - vp9_full_pixel_diamond_new(cpi, x, bsize, &best_ref_mv1_full, step_param, - lambda, 1, nb_full_mvs, nb_full_mv_num, mv); - - /* restore UMV window */ - x->mv_limits = tmp_mv_limits; - - return bestsme; -} - -static uint32_t sub_pixel_motion_search(VP9_COMP *cpi, ThreadData *td, - uint8_t *cur_frame_buf, - uint8_t *ref_frame_buf, int stride, - BLOCK_SIZE bsize, MV *mv) { - MACROBLOCK *const x = &td->mb; - MACROBLOCKD *const xd = &x->e_mbd; - MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv; - uint32_t bestsme = UINT_MAX; - uint32_t distortion; - uint32_t sse; - int cost_list[5]; - - MV best_ref_mv1 = { 0, 0 }; - - // Setup frame pointers - x->plane[0].src.buf = cur_frame_buf; - x->plane[0].src.stride = stride; - xd->plane[0].pre[0].buf = ref_frame_buf; - xd->plane[0].pre[0].stride = stride; - - // TODO(yunqing): may use higher tap interp filter than 2 taps. - // Ignore mv costing by sending NULL pointer instead of cost array - bestsme = cpi->find_fractional_mv_step( - x, mv, &best_ref_mv1, cpi->common.allow_high_precision_mv, x->errorperbit, - &cpi->fn_ptr[bsize], 0, mv_sf->subpel_search_level, - cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL, 0, 0, - USE_2_TAPS); - - return bestsme; -} - -#else // CONFIG_NON_GREEDY_MV -static uint32_t motion_compensated_prediction(VP9_COMP *cpi, ThreadData *td, - uint8_t *cur_frame_buf, - uint8_t *ref_frame_buf, - int stride, BLOCK_SIZE bsize, - MV *mv) { - MACROBLOCK *const x = &td->mb; - MACROBLOCKD *const xd = &x->e_mbd; - MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv; - const SEARCH_METHODS search_method = NSTEP; - int step_param; - int sadpb = x->sadperbit16; - uint32_t bestsme = UINT_MAX; - uint32_t distortion; - uint32_t sse; - int cost_list[5]; - const MvLimits tmp_mv_limits = x->mv_limits; - - MV best_ref_mv1 = { 0, 0 }; - MV best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */ - - best_ref_mv1_full.col = best_ref_mv1.col >> 3; - best_ref_mv1_full.row = best_ref_mv1.row >> 3; - - // Setup frame pointers - x->plane[0].src.buf = cur_frame_buf; - x->plane[0].src.stride = stride; - xd->plane[0].pre[0].buf = ref_frame_buf; - xd->plane[0].pre[0].stride = stride; - - step_param = mv_sf->reduce_first_step_size; - step_param = VPXMIN(step_param, MAX_MVSEARCH_STEPS - 2); - - vp9_set_mv_search_range(&x->mv_limits, &best_ref_mv1); - - vp9_full_pixel_search(cpi, x, bsize, &best_ref_mv1_full, step_param, - search_method, sadpb, cond_cost_list(cpi, cost_list), - &best_ref_mv1, mv, 0, 0); - - /* restore UMV window */ - x->mv_limits = tmp_mv_limits; - - // TODO(yunqing): may use higher tap interp filter than 2 taps. - // Ignore mv costing by sending NULL pointer instead of cost array - bestsme = cpi->find_fractional_mv_step( - x, mv, &best_ref_mv1, cpi->common.allow_high_precision_mv, x->errorperbit, - &cpi->fn_ptr[bsize], 0, mv_sf->subpel_search_level, - cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL, 0, 0, - USE_2_TAPS); - - return bestsme; -} -#endif - -static int get_overlap_area(int grid_pos_row, int grid_pos_col, int ref_pos_row, - int ref_pos_col, int block, BLOCK_SIZE bsize) { - int width = 0, height = 0; - int bw = 4 << b_width_log2_lookup[bsize]; - int bh = 4 << b_height_log2_lookup[bsize]; - - switch (block) { - case 0: - width = grid_pos_col + bw - ref_pos_col; - height = grid_pos_row + bh - ref_pos_row; - break; - case 1: - width = ref_pos_col + bw - grid_pos_col; - height = grid_pos_row + bh - ref_pos_row; - break; - case 2: - width = grid_pos_col + bw - ref_pos_col; - height = ref_pos_row + bh - grid_pos_row; - break; - case 3: - width = ref_pos_col + bw - grid_pos_col; - height = ref_pos_row + bh - grid_pos_row; - break; - default: assert(0); - } - - return width * height; -} - -static int round_floor(int ref_pos, int bsize_pix) { - int round; - if (ref_pos < 0) - round = -(1 + (-ref_pos - 1) / bsize_pix); - else - round = ref_pos / bsize_pix; - - return round; -} - -static void tpl_model_store(TplDepStats *tpl_stats, int mi_row, int mi_col, - BLOCK_SIZE bsize, int stride) { - const int mi_height = num_8x8_blocks_high_lookup[bsize]; - const int mi_width = num_8x8_blocks_wide_lookup[bsize]; - const TplDepStats *src_stats = &tpl_stats[mi_row * stride + mi_col]; - int idx, idy; - - for (idy = 0; idy < mi_height; ++idy) { - for (idx = 0; idx < mi_width; ++idx) { - TplDepStats *tpl_ptr = &tpl_stats[(mi_row + idy) * stride + mi_col + idx]; - const int64_t mc_flow = tpl_ptr->mc_flow; - const int64_t mc_ref_cost = tpl_ptr->mc_ref_cost; - *tpl_ptr = *src_stats; - tpl_ptr->mc_flow = mc_flow; - tpl_ptr->mc_ref_cost = mc_ref_cost; - tpl_ptr->mc_dep_cost = tpl_ptr->intra_cost + tpl_ptr->mc_flow; - } - } -} - -static void tpl_model_update_b(TplDepFrame *tpl_frame, TplDepStats *tpl_stats, - int mi_row, int mi_col, const BLOCK_SIZE bsize) { - TplDepFrame *ref_tpl_frame = &tpl_frame[tpl_stats->ref_frame_index]; - TplDepStats *ref_stats = ref_tpl_frame->tpl_stats_ptr; - MV mv = tpl_stats->mv.as_mv; - int mv_row = mv.row >> 3; - int mv_col = mv.col >> 3; - - int ref_pos_row = mi_row * MI_SIZE + mv_row; - int ref_pos_col = mi_col * MI_SIZE + mv_col; - - const int bw = 4 << b_width_log2_lookup[bsize]; - const int bh = 4 << b_height_log2_lookup[bsize]; - const int mi_height = num_8x8_blocks_high_lookup[bsize]; - const int mi_width = num_8x8_blocks_wide_lookup[bsize]; - const int pix_num = bw * bh; - - // top-left on grid block location in pixel - int grid_pos_row_base = round_floor(ref_pos_row, bh) * bh; - int grid_pos_col_base = round_floor(ref_pos_col, bw) * bw; - int block; - - for (block = 0; block < 4; ++block) { - int grid_pos_row = grid_pos_row_base + bh * (block >> 1); - int grid_pos_col = grid_pos_col_base + bw * (block & 0x01); - - if (grid_pos_row >= 0 && grid_pos_row < ref_tpl_frame->mi_rows * MI_SIZE && - grid_pos_col >= 0 && grid_pos_col < ref_tpl_frame->mi_cols * MI_SIZE) { - int overlap_area = get_overlap_area( - grid_pos_row, grid_pos_col, ref_pos_row, ref_pos_col, block, bsize); - int ref_mi_row = round_floor(grid_pos_row, bh) * mi_height; - int ref_mi_col = round_floor(grid_pos_col, bw) * mi_width; - - int64_t mc_flow = tpl_stats->mc_dep_cost - - (tpl_stats->mc_dep_cost * tpl_stats->inter_cost) / - tpl_stats->intra_cost; - - int idx, idy; - - for (idy = 0; idy < mi_height; ++idy) { - for (idx = 0; idx < mi_width; ++idx) { - TplDepStats *des_stats = - &ref_stats[(ref_mi_row + idy) * ref_tpl_frame->stride + - (ref_mi_col + idx)]; - - des_stats->mc_flow += (mc_flow * overlap_area) / pix_num; - des_stats->mc_ref_cost += - ((tpl_stats->intra_cost - tpl_stats->inter_cost) * overlap_area) / - pix_num; - assert(overlap_area >= 0); - } - } - } - } -} - -static void tpl_model_update(TplDepFrame *tpl_frame, TplDepStats *tpl_stats, - int mi_row, int mi_col, const BLOCK_SIZE bsize) { - int idx, idy; - const int mi_height = num_8x8_blocks_high_lookup[bsize]; - const int mi_width = num_8x8_blocks_wide_lookup[bsize]; - - for (idy = 0; idy < mi_height; ++idy) { - for (idx = 0; idx < mi_width; ++idx) { - TplDepStats *tpl_ptr = - &tpl_stats[(mi_row + idy) * tpl_frame->stride + (mi_col + idx)]; - tpl_model_update_b(tpl_frame, tpl_ptr, mi_row + idy, mi_col + idx, - BLOCK_8X8); - } - } -} - -static void get_quantize_error(MACROBLOCK *x, int plane, tran_low_t *coeff, - tran_low_t *qcoeff, tran_low_t *dqcoeff, - TX_SIZE tx_size, int64_t *recon_error, - int64_t *sse) { - MACROBLOCKD *const xd = &x->e_mbd; - const struct macroblock_plane *const p = &x->plane[plane]; - const struct macroblockd_plane *const pd = &xd->plane[plane]; - const scan_order *const scan_order = &vp9_default_scan_orders[tx_size]; - uint16_t eob; - int pix_num = 1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]]; - const int shift = tx_size == TX_32X32 ? 0 : 2; - - // skip block condition should be handled before this is called. - assert(!x->skip_block); - -#if CONFIG_VP9_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - vp9_highbd_quantize_fp_32x32(coeff, pix_num, p->round_fp, p->quant_fp, - qcoeff, dqcoeff, pd->dequant, &eob, - scan_order->scan, scan_order->iscan); - } else { - vp9_quantize_fp_32x32(coeff, pix_num, p->round_fp, p->quant_fp, qcoeff, - dqcoeff, pd->dequant, &eob, scan_order->scan, - scan_order->iscan); - } -#else - vp9_quantize_fp_32x32(coeff, pix_num, p->round_fp, p->quant_fp, qcoeff, - dqcoeff, pd->dequant, &eob, scan_order->scan, - scan_order->iscan); -#endif // CONFIG_VP9_HIGHBITDEPTH - - *recon_error = vp9_block_error(coeff, dqcoeff, pix_num, sse) >> shift; - *recon_error = VPXMAX(*recon_error, 1); - - *sse = (*sse) >> shift; - *sse = VPXMAX(*sse, 1); -} - -#if CONFIG_VP9_HIGHBITDEPTH -void highbd_wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff, - TX_SIZE tx_size) { - // TODO(sdeng): Implement SIMD based high bit-depth Hadamard transforms. - switch (tx_size) { - case TX_8X8: vpx_highbd_hadamard_8x8(src_diff, bw, coeff); break; - case TX_16X16: vpx_highbd_hadamard_16x16(src_diff, bw, coeff); break; - case TX_32X32: vpx_highbd_hadamard_32x32(src_diff, bw, coeff); break; - default: assert(0); - } -} -#endif // CONFIG_VP9_HIGHBITDEPTH - -void wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff, - TX_SIZE tx_size) { - switch (tx_size) { - case TX_8X8: vpx_hadamard_8x8(src_diff, bw, coeff); break; - case TX_16X16: vpx_hadamard_16x16(src_diff, bw, coeff); break; - case TX_32X32: vpx_hadamard_32x32(src_diff, bw, coeff); break; - default: assert(0); - } -} - -static void set_mv_limits(const VP9_COMMON *cm, MACROBLOCK *x, int mi_row, - int mi_col) { - x->mv_limits.row_min = -((mi_row * MI_SIZE) + (17 - 2 * VP9_INTERP_EXTEND)); - x->mv_limits.row_max = - (cm->mi_rows - 1 - mi_row) * MI_SIZE + (17 - 2 * VP9_INTERP_EXTEND); - x->mv_limits.col_min = -((mi_col * MI_SIZE) + (17 - 2 * VP9_INTERP_EXTEND)); - x->mv_limits.col_max = - ((cm->mi_cols - 1 - mi_col) * MI_SIZE) + (17 - 2 * VP9_INTERP_EXTEND); -} - -static void mode_estimation(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, - struct scale_factors *sf, GF_PICTURE *gf_picture, - int frame_idx, TplDepFrame *tpl_frame, - int16_t *src_diff, tran_low_t *coeff, - tran_low_t *qcoeff, tran_low_t *dqcoeff, int mi_row, - int mi_col, BLOCK_SIZE bsize, TX_SIZE tx_size, - YV12_BUFFER_CONFIG *ref_frame[], uint8_t *predictor, - int64_t *recon_error, int64_t *sse) { - VP9_COMMON *cm = &cpi->common; - ThreadData *td = &cpi->td; - - const int bw = 4 << b_width_log2_lookup[bsize]; - const int bh = 4 << b_height_log2_lookup[bsize]; - const int pix_num = bw * bh; - int best_rf_idx = -1; - int_mv best_mv; - int64_t best_inter_cost = INT64_MAX; - int64_t inter_cost; - int rf_idx; - const InterpKernel *const kernel = vp9_filter_kernels[EIGHTTAP]; - - int64_t best_intra_cost = INT64_MAX; - int64_t intra_cost; - PREDICTION_MODE mode; - int mb_y_offset = mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE; - MODE_INFO mi_above, mi_left; - const int mi_height = num_8x8_blocks_high_lookup[bsize]; - const int mi_width = num_8x8_blocks_wide_lookup[bsize]; - TplDepStats *tpl_stats = - &tpl_frame->tpl_stats_ptr[mi_row * tpl_frame->stride + mi_col]; - - xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8); - xd->mb_to_bottom_edge = ((cm->mi_rows - 1 - mi_row) * MI_SIZE) * 8; - xd->mb_to_left_edge = -((mi_col * MI_SIZE) * 8); - xd->mb_to_right_edge = ((cm->mi_cols - 1 - mi_col) * MI_SIZE) * 8; - xd->above_mi = (mi_row > 0) ? &mi_above : NULL; - xd->left_mi = (mi_col > 0) ? &mi_left : NULL; - - // Intra prediction search - for (mode = DC_PRED; mode <= TM_PRED; ++mode) { - uint8_t *src, *dst; - int src_stride, dst_stride; - - src = xd->cur_buf->y_buffer + mb_y_offset; - src_stride = xd->cur_buf->y_stride; - - dst = &predictor[0]; - dst_stride = bw; - - xd->mi[0]->sb_type = bsize; - xd->mi[0]->ref_frame[0] = INTRA_FRAME; - - vp9_predict_intra_block(xd, b_width_log2_lookup[bsize], tx_size, mode, src, - src_stride, dst, dst_stride, 0, 0, 0); - -#if CONFIG_VP9_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - vpx_highbd_subtract_block(bh, bw, src_diff, bw, src, src_stride, dst, - dst_stride, xd->bd); - highbd_wht_fwd_txfm(src_diff, bw, coeff, tx_size); - intra_cost = vpx_highbd_satd(coeff, pix_num); - } else { - vpx_subtract_block(bh, bw, src_diff, bw, src, src_stride, dst, - dst_stride); - wht_fwd_txfm(src_diff, bw, coeff, tx_size); - intra_cost = vpx_satd(coeff, pix_num); - } -#else - vpx_subtract_block(bh, bw, src_diff, bw, src, src_stride, dst, dst_stride); - wht_fwd_txfm(src_diff, bw, coeff, tx_size); - intra_cost = vpx_satd(coeff, pix_num); -#endif // CONFIG_VP9_HIGHBITDEPTH - - if (intra_cost < best_intra_cost) best_intra_cost = intra_cost; - } - - // Motion compensated prediction - best_mv.as_int = 0; - - set_mv_limits(cm, x, mi_row, mi_col); - - for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) { - int_mv mv; -#if CONFIG_NON_GREEDY_MV - MotionField *motion_field; -#endif - if (ref_frame[rf_idx] == NULL) continue; - -#if CONFIG_NON_GREEDY_MV - (void)td; - motion_field = vp9_motion_field_info_get_motion_field( - &cpi->motion_field_info, frame_idx, rf_idx, bsize); - mv = vp9_motion_field_mi_get_mv(motion_field, mi_row, mi_col); -#else - motion_compensated_prediction(cpi, td, xd->cur_buf->y_buffer + mb_y_offset, - ref_frame[rf_idx]->y_buffer + mb_y_offset, - xd->cur_buf->y_stride, bsize, &mv.as_mv); -#endif - -#if CONFIG_VP9_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - vp9_highbd_build_inter_predictor( - CONVERT_TO_SHORTPTR(ref_frame[rf_idx]->y_buffer + mb_y_offset), - ref_frame[rf_idx]->y_stride, CONVERT_TO_SHORTPTR(&predictor[0]), bw, - &mv.as_mv, sf, bw, bh, 0, kernel, MV_PRECISION_Q3, mi_col * MI_SIZE, - mi_row * MI_SIZE, xd->bd); - vpx_highbd_subtract_block( - bh, bw, src_diff, bw, xd->cur_buf->y_buffer + mb_y_offset, - xd->cur_buf->y_stride, &predictor[0], bw, xd->bd); - highbd_wht_fwd_txfm(src_diff, bw, coeff, tx_size); - inter_cost = vpx_highbd_satd(coeff, pix_num); - } else { - vp9_build_inter_predictor( - ref_frame[rf_idx]->y_buffer + mb_y_offset, - ref_frame[rf_idx]->y_stride, &predictor[0], bw, &mv.as_mv, sf, bw, bh, - 0, kernel, MV_PRECISION_Q3, mi_col * MI_SIZE, mi_row * MI_SIZE); - vpx_subtract_block(bh, bw, src_diff, bw, - xd->cur_buf->y_buffer + mb_y_offset, - xd->cur_buf->y_stride, &predictor[0], bw); - wht_fwd_txfm(src_diff, bw, coeff, tx_size); - inter_cost = vpx_satd(coeff, pix_num); - } -#else - vp9_build_inter_predictor(ref_frame[rf_idx]->y_buffer + mb_y_offset, - ref_frame[rf_idx]->y_stride, &predictor[0], bw, - &mv.as_mv, sf, bw, bh, 0, kernel, MV_PRECISION_Q3, - mi_col * MI_SIZE, mi_row * MI_SIZE); - vpx_subtract_block(bh, bw, src_diff, bw, - xd->cur_buf->y_buffer + mb_y_offset, - xd->cur_buf->y_stride, &predictor[0], bw); - wht_fwd_txfm(src_diff, bw, coeff, tx_size); - inter_cost = vpx_satd(coeff, pix_num); -#endif - - if (inter_cost < best_inter_cost) { - best_rf_idx = rf_idx; - best_inter_cost = inter_cost; - best_mv.as_int = mv.as_int; - get_quantize_error(x, 0, coeff, qcoeff, dqcoeff, tx_size, recon_error, - sse); - } - } - best_intra_cost = VPXMAX(best_intra_cost, 1); - best_inter_cost = VPXMIN(best_intra_cost, best_inter_cost); - tpl_stats->inter_cost = VPXMAX( - 1, (best_inter_cost << TPL_DEP_COST_SCALE_LOG2) / (mi_height * mi_width)); - tpl_stats->intra_cost = VPXMAX( - 1, (best_intra_cost << TPL_DEP_COST_SCALE_LOG2) / (mi_height * mi_width)); - tpl_stats->ref_frame_index = gf_picture[frame_idx].ref_frame[best_rf_idx]; - tpl_stats->mv.as_int = best_mv.as_int; -} - -#if CONFIG_NON_GREEDY_MV -static int get_block_src_pred_buf(MACROBLOCKD *xd, GF_PICTURE *gf_picture, - int frame_idx, int rf_idx, int mi_row, - int mi_col, struct buf_2d *src, - struct buf_2d *pre) { - const int mb_y_offset = - mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE; - YV12_BUFFER_CONFIG *ref_frame = NULL; - int ref_frame_idx = gf_picture[frame_idx].ref_frame[rf_idx]; - if (ref_frame_idx != -1) { - ref_frame = gf_picture[ref_frame_idx].frame; - src->buf = xd->cur_buf->y_buffer + mb_y_offset; - src->stride = xd->cur_buf->y_stride; - pre->buf = ref_frame->y_buffer + mb_y_offset; - pre->stride = ref_frame->y_stride; - assert(src->stride == pre->stride); - return 1; - } else { - printf("invalid ref_frame_idx"); - assert(ref_frame_idx != -1); - return 0; - } -} - -#define kMvPreCheckLines 5 -#define kMvPreCheckSize 15 - -#define MV_REF_POS_NUM 3 -POSITION mv_ref_pos[MV_REF_POS_NUM] = { - { -1, 0 }, - { 0, -1 }, - { -1, -1 }, -}; - -static int_mv *get_select_mv(VP9_COMP *cpi, TplDepFrame *tpl_frame, int mi_row, - int mi_col) { - return &cpi->select_mv_arr[mi_row * tpl_frame->stride + mi_col]; -} - -static int_mv find_ref_mv(int mv_mode, VP9_COMP *cpi, TplDepFrame *tpl_frame, - BLOCK_SIZE bsize, int mi_row, int mi_col) { - int i; - const int mi_height = num_8x8_blocks_high_lookup[bsize]; - const int mi_width = num_8x8_blocks_wide_lookup[bsize]; - int_mv nearest_mv, near_mv, invalid_mv; - nearest_mv.as_int = INVALID_MV; - near_mv.as_int = INVALID_MV; - invalid_mv.as_int = INVALID_MV; - for (i = 0; i < MV_REF_POS_NUM; ++i) { - int nb_row = mi_row + mv_ref_pos[i].row * mi_height; - int nb_col = mi_col + mv_ref_pos[i].col * mi_width; - assert(mv_ref_pos[i].row <= 0); - assert(mv_ref_pos[i].col <= 0); - if (nb_row >= 0 && nb_col >= 0) { - if (nearest_mv.as_int == INVALID_MV) { - nearest_mv = *get_select_mv(cpi, tpl_frame, nb_row, nb_col); - } else { - int_mv mv = *get_select_mv(cpi, tpl_frame, nb_row, nb_col); - if (mv.as_int == nearest_mv.as_int) { - continue; - } else { - near_mv = mv; - break; - } - } - } - } - if (nearest_mv.as_int == INVALID_MV) { - nearest_mv.as_mv.row = 0; - nearest_mv.as_mv.col = 0; - } - if (near_mv.as_int == INVALID_MV) { - near_mv.as_mv.row = 0; - near_mv.as_mv.col = 0; - } - if (mv_mode == NEAREST_MV_MODE) { - return nearest_mv; - } - if (mv_mode == NEAR_MV_MODE) { - return near_mv; - } - assert(0); - return invalid_mv; -} - -static int_mv get_mv_from_mv_mode(int mv_mode, VP9_COMP *cpi, - MotionField *motion_field, - TplDepFrame *tpl_frame, BLOCK_SIZE bsize, - int mi_row, int mi_col) { - int_mv mv; - switch (mv_mode) { - case ZERO_MV_MODE: - mv.as_mv.row = 0; - mv.as_mv.col = 0; - break; - case NEW_MV_MODE: - mv = vp9_motion_field_mi_get_mv(motion_field, mi_row, mi_col); - break; - case NEAREST_MV_MODE: - mv = find_ref_mv(mv_mode, cpi, tpl_frame, bsize, mi_row, mi_col); - break; - case NEAR_MV_MODE: - mv = find_ref_mv(mv_mode, cpi, tpl_frame, bsize, mi_row, mi_col); - break; - default: - mv.as_int = INVALID_MV; - assert(0); - break; - } - return mv; -} - -static double get_mv_dist(int mv_mode, VP9_COMP *cpi, MACROBLOCKD *xd, - GF_PICTURE *gf_picture, MotionField *motion_field, - int frame_idx, TplDepFrame *tpl_frame, int rf_idx, - BLOCK_SIZE bsize, int mi_row, int mi_col, - int_mv *mv) { - uint32_t sse; - struct buf_2d src; - struct buf_2d pre; - MV full_mv; - *mv = get_mv_from_mv_mode(mv_mode, cpi, motion_field, tpl_frame, bsize, - mi_row, mi_col); - full_mv = get_full_mv(&mv->as_mv); - if (get_block_src_pred_buf(xd, gf_picture, frame_idx, rf_idx, mi_row, mi_col, - &src, &pre)) { - // TODO(angiebird): Consider subpixel when computing the sse. - cpi->fn_ptr[bsize].vf(src.buf, src.stride, get_buf_from_mv(&pre, &full_mv), - pre.stride, &sse); - return (double)(sse << VP9_DIST_SCALE_LOG2); - } else { - assert(0); - return 0; - } -} - -static int get_mv_mode_cost(int mv_mode) { - // TODO(angiebird): The probabilities are roughly inferred from - // default_inter_mode_probs. Check if there is a better way to set the - // probabilities. - const int zero_mv_prob = 16; - const int new_mv_prob = 24 * 1; - const int ref_mv_prob = 256 - zero_mv_prob - new_mv_prob; - assert(zero_mv_prob + new_mv_prob + ref_mv_prob == 256); - switch (mv_mode) { - case ZERO_MV_MODE: return vp9_prob_cost[zero_mv_prob]; break; - case NEW_MV_MODE: return vp9_prob_cost[new_mv_prob]; break; - case NEAREST_MV_MODE: return vp9_prob_cost[ref_mv_prob]; break; - case NEAR_MV_MODE: return vp9_prob_cost[ref_mv_prob]; break; - default: assert(0); return -1; - } -} - -static INLINE double get_mv_diff_cost(MV *new_mv, MV *ref_mv) { - double mv_diff_cost = log2(1 + abs(new_mv->row - ref_mv->row)) + - log2(1 + abs(new_mv->col - ref_mv->col)); - mv_diff_cost *= (1 << VP9_PROB_COST_SHIFT); - return mv_diff_cost; -} -static double get_mv_cost(int mv_mode, VP9_COMP *cpi, MotionField *motion_field, - TplDepFrame *tpl_frame, BLOCK_SIZE bsize, int mi_row, - int mi_col) { - double mv_cost = get_mv_mode_cost(mv_mode); - if (mv_mode == NEW_MV_MODE) { - MV new_mv = get_mv_from_mv_mode(mv_mode, cpi, motion_field, tpl_frame, - bsize, mi_row, mi_col) - .as_mv; - MV nearest_mv = get_mv_from_mv_mode(NEAREST_MV_MODE, cpi, motion_field, - tpl_frame, bsize, mi_row, mi_col) - .as_mv; - MV near_mv = get_mv_from_mv_mode(NEAR_MV_MODE, cpi, motion_field, tpl_frame, - bsize, mi_row, mi_col) - .as_mv; - double nearest_cost = get_mv_diff_cost(&new_mv, &nearest_mv); - double near_cost = get_mv_diff_cost(&new_mv, &near_mv); - mv_cost += nearest_cost < near_cost ? nearest_cost : near_cost; - } - return mv_cost; -} - -static double eval_mv_mode(int mv_mode, VP9_COMP *cpi, MACROBLOCK *x, - GF_PICTURE *gf_picture, MotionField *motion_field, - int frame_idx, TplDepFrame *tpl_frame, int rf_idx, - BLOCK_SIZE bsize, int mi_row, int mi_col, - int_mv *mv) { - MACROBLOCKD *xd = &x->e_mbd; - double mv_dist = - get_mv_dist(mv_mode, cpi, xd, gf_picture, motion_field, frame_idx, - tpl_frame, rf_idx, bsize, mi_row, mi_col, mv); - double mv_cost = - get_mv_cost(mv_mode, cpi, motion_field, tpl_frame, bsize, mi_row, mi_col); - double mult = 180; - - return mv_cost + mult * log2f(1 + mv_dist); -} - -static int find_best_ref_mv_mode(VP9_COMP *cpi, MACROBLOCK *x, - GF_PICTURE *gf_picture, - MotionField *motion_field, int frame_idx, - TplDepFrame *tpl_frame, int rf_idx, - BLOCK_SIZE bsize, int mi_row, int mi_col, - double *rd, int_mv *mv) { - int best_mv_mode = ZERO_MV_MODE; - int update = 0; - int mv_mode; - *rd = 0; - for (mv_mode = 0; mv_mode < MAX_MV_MODE; ++mv_mode) { - double this_rd; - int_mv this_mv; - if (mv_mode == NEW_MV_MODE) { - continue; - } - this_rd = eval_mv_mode(mv_mode, cpi, x, gf_picture, motion_field, frame_idx, - tpl_frame, rf_idx, bsize, mi_row, mi_col, &this_mv); - if (update == 0) { - *rd = this_rd; - *mv = this_mv; - best_mv_mode = mv_mode; - update = 1; - } else { - if (this_rd < *rd) { - *rd = this_rd; - *mv = this_mv; - best_mv_mode = mv_mode; - } - } - } - return best_mv_mode; -} - -static void predict_mv_mode(VP9_COMP *cpi, MACROBLOCK *x, - GF_PICTURE *gf_picture, MotionField *motion_field, - int frame_idx, TplDepFrame *tpl_frame, int rf_idx, - BLOCK_SIZE bsize, int mi_row, int mi_col) { - const int mi_height = num_8x8_blocks_high_lookup[bsize]; - const int mi_width = num_8x8_blocks_wide_lookup[bsize]; - int tmp_mv_mode_arr[kMvPreCheckSize]; - int *mv_mode_arr = tpl_frame->mv_mode_arr[rf_idx]; - double *rd_diff_arr = tpl_frame->rd_diff_arr[rf_idx]; - int_mv *select_mv_arr = cpi->select_mv_arr; - int_mv tmp_select_mv_arr[kMvPreCheckSize]; - int stride = tpl_frame->stride; - double new_mv_rd = 0; - double no_new_mv_rd = 0; - double this_new_mv_rd = 0; - double this_no_new_mv_rd = 0; - int idx; - int tmp_idx; - assert(kMvPreCheckSize == (kMvPreCheckLines * (kMvPreCheckLines + 1)) >> 1); - - // no new mv - // diagonal scan order - tmp_idx = 0; - for (idx = 0; idx < kMvPreCheckLines; ++idx) { - int r; - for (r = 0; r <= idx; ++r) { - int c = idx - r; - int nb_row = mi_row + r * mi_height; - int nb_col = mi_col + c * mi_width; - if (nb_row < tpl_frame->mi_rows && nb_col < tpl_frame->mi_cols) { - double this_rd; - int_mv *mv = &select_mv_arr[nb_row * stride + nb_col]; - mv_mode_arr[nb_row * stride + nb_col] = find_best_ref_mv_mode( - cpi, x, gf_picture, motion_field, frame_idx, tpl_frame, rf_idx, - bsize, nb_row, nb_col, &this_rd, mv); - if (r == 0 && c == 0) { - this_no_new_mv_rd = this_rd; - } - no_new_mv_rd += this_rd; - tmp_mv_mode_arr[tmp_idx] = mv_mode_arr[nb_row * stride + nb_col]; - tmp_select_mv_arr[tmp_idx] = select_mv_arr[nb_row * stride + nb_col]; - ++tmp_idx; - } - } - } - - // new mv - mv_mode_arr[mi_row * stride + mi_col] = NEW_MV_MODE; - this_new_mv_rd = eval_mv_mode( - NEW_MV_MODE, cpi, x, gf_picture, motion_field, frame_idx, tpl_frame, - rf_idx, bsize, mi_row, mi_col, &select_mv_arr[mi_row * stride + mi_col]); - new_mv_rd = this_new_mv_rd; - // We start from idx = 1 because idx = 0 is evaluated as NEW_MV_MODE - // beforehand. - for (idx = 1; idx < kMvPreCheckLines; ++idx) { - int r; - for (r = 0; r <= idx; ++r) { - int c = idx - r; - int nb_row = mi_row + r * mi_height; - int nb_col = mi_col + c * mi_width; - if (nb_row < tpl_frame->mi_rows && nb_col < tpl_frame->mi_cols) { - double this_rd; - int_mv *mv = &select_mv_arr[nb_row * stride + nb_col]; - mv_mode_arr[nb_row * stride + nb_col] = find_best_ref_mv_mode( - cpi, x, gf_picture, motion_field, frame_idx, tpl_frame, rf_idx, - bsize, nb_row, nb_col, &this_rd, mv); - new_mv_rd += this_rd; - } - } - } - - // update best_mv_mode - tmp_idx = 0; - if (no_new_mv_rd < new_mv_rd) { - for (idx = 0; idx < kMvPreCheckLines; ++idx) { - int r; - for (r = 0; r <= idx; ++r) { - int c = idx - r; - int nb_row = mi_row + r * mi_height; - int nb_col = mi_col + c * mi_width; - if (nb_row < tpl_frame->mi_rows && nb_col < tpl_frame->mi_cols) { - mv_mode_arr[nb_row * stride + nb_col] = tmp_mv_mode_arr[tmp_idx]; - select_mv_arr[nb_row * stride + nb_col] = tmp_select_mv_arr[tmp_idx]; - ++tmp_idx; - } - } - } - rd_diff_arr[mi_row * stride + mi_col] = 0; - } else { - rd_diff_arr[mi_row * stride + mi_col] = - (no_new_mv_rd - this_no_new_mv_rd) - (new_mv_rd - this_new_mv_rd); - } -} - -static void predict_mv_mode_arr(VP9_COMP *cpi, MACROBLOCK *x, - GF_PICTURE *gf_picture, - MotionField *motion_field, int frame_idx, - TplDepFrame *tpl_frame, int rf_idx, - BLOCK_SIZE bsize) { - const int mi_height = num_8x8_blocks_high_lookup[bsize]; - const int mi_width = num_8x8_blocks_wide_lookup[bsize]; - const int unit_rows = tpl_frame->mi_rows / mi_height; - const int unit_cols = tpl_frame->mi_cols / mi_width; - const int max_diagonal_lines = unit_rows + unit_cols - 1; - int idx; - for (idx = 0; idx < max_diagonal_lines; ++idx) { - int r; - for (r = VPXMAX(idx - unit_cols + 1, 0); r <= VPXMIN(idx, unit_rows - 1); - ++r) { - int c = idx - r; - int mi_row = r * mi_height; - int mi_col = c * mi_width; - assert(c >= 0 && c < unit_cols); - assert(mi_row >= 0 && mi_row < tpl_frame->mi_rows); - assert(mi_col >= 0 && mi_col < tpl_frame->mi_cols); - predict_mv_mode(cpi, x, gf_picture, motion_field, frame_idx, tpl_frame, - rf_idx, bsize, mi_row, mi_col); - } - } -} - -static void do_motion_search(VP9_COMP *cpi, ThreadData *td, - MotionField *motion_field, int frame_idx, - YV12_BUFFER_CONFIG *ref_frame, BLOCK_SIZE bsize, - int mi_row, int mi_col) { - VP9_COMMON *cm = &cpi->common; - MACROBLOCK *x = &td->mb; - MACROBLOCKD *xd = &x->e_mbd; - const int mb_y_offset = - mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE; - assert(ref_frame != NULL); - set_mv_limits(cm, x, mi_row, mi_col); - { - int_mv mv = vp9_motion_field_mi_get_mv(motion_field, mi_row, mi_col); - uint8_t *cur_frame_buf = xd->cur_buf->y_buffer + mb_y_offset; - uint8_t *ref_frame_buf = ref_frame->y_buffer + mb_y_offset; - const int stride = xd->cur_buf->y_stride; - full_pixel_motion_search(cpi, td, motion_field, frame_idx, cur_frame_buf, - ref_frame_buf, stride, bsize, mi_row, mi_col, - &mv.as_mv); - sub_pixel_motion_search(cpi, td, cur_frame_buf, ref_frame_buf, stride, - bsize, &mv.as_mv); - vp9_motion_field_mi_set_mv(motion_field, mi_row, mi_col, mv); - } -} - -static void build_motion_field( - VP9_COMP *cpi, int frame_idx, - YV12_BUFFER_CONFIG *ref_frame[MAX_INTER_REF_FRAMES], BLOCK_SIZE bsize) { - VP9_COMMON *cm = &cpi->common; - ThreadData *td = &cpi->td; - TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx]; - const int mi_height = num_8x8_blocks_high_lookup[bsize]; - const int mi_width = num_8x8_blocks_wide_lookup[bsize]; - const int pw = num_4x4_blocks_wide_lookup[bsize] << 2; - const int ph = num_4x4_blocks_high_lookup[bsize] << 2; - int mi_row, mi_col; - int rf_idx; - - tpl_frame->lambda = (pw * ph) >> 2; - assert(pw * ph == tpl_frame->lambda << 2); - - for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) { - MotionField *motion_field = vp9_motion_field_info_get_motion_field( - &cpi->motion_field_info, frame_idx, rf_idx, bsize); - if (ref_frame[rf_idx] == NULL) { - continue; - } - vp9_motion_field_reset_mvs(motion_field); - for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) { - for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) { - do_motion_search(cpi, td, motion_field, frame_idx, ref_frame[rf_idx], - bsize, mi_row, mi_col); - } - } - } -} -#endif // CONFIG_NON_GREEDY_MV - -static void mc_flow_dispenser(VP9_COMP *cpi, GF_PICTURE *gf_picture, - int frame_idx, BLOCK_SIZE bsize) { - TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx]; - YV12_BUFFER_CONFIG *this_frame = gf_picture[frame_idx].frame; - YV12_BUFFER_CONFIG *ref_frame[MAX_INTER_REF_FRAMES] = { NULL, NULL, NULL }; - - VP9_COMMON *cm = &cpi->common; - struct scale_factors sf; - int rdmult, idx; - ThreadData *td = &cpi->td; - MACROBLOCK *x = &td->mb; - MACROBLOCKD *xd = &x->e_mbd; - int mi_row, mi_col; - -#if CONFIG_VP9_HIGHBITDEPTH - DECLARE_ALIGNED(16, uint16_t, predictor16[32 * 32 * 3]); - DECLARE_ALIGNED(16, uint8_t, predictor8[32 * 32 * 3]); - uint8_t *predictor; -#else - DECLARE_ALIGNED(16, uint8_t, predictor[32 * 32 * 3]); -#endif - DECLARE_ALIGNED(16, int16_t, src_diff[32 * 32]); - DECLARE_ALIGNED(16, tran_low_t, coeff[32 * 32]); - DECLARE_ALIGNED(16, tran_low_t, qcoeff[32 * 32]); - DECLARE_ALIGNED(16, tran_low_t, dqcoeff[32 * 32]); - - const TX_SIZE tx_size = max_txsize_lookup[bsize]; - const int mi_height = num_8x8_blocks_high_lookup[bsize]; - const int mi_width = num_8x8_blocks_wide_lookup[bsize]; - int64_t recon_error, sse; -#if CONFIG_NON_GREEDY_MV - int square_block_idx; - int rf_idx; -#endif - - // Setup scaling factor -#if CONFIG_VP9_HIGHBITDEPTH - vp9_setup_scale_factors_for_frame( - &sf, this_frame->y_crop_width, this_frame->y_crop_height, - this_frame->y_crop_width, this_frame->y_crop_height, - cpi->common.use_highbitdepth); - - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) - predictor = CONVERT_TO_BYTEPTR(predictor16); - else - predictor = predictor8; -#else - vp9_setup_scale_factors_for_frame( - &sf, this_frame->y_crop_width, this_frame->y_crop_height, - this_frame->y_crop_width, this_frame->y_crop_height); -#endif // CONFIG_VP9_HIGHBITDEPTH - - // Prepare reference frame pointers. If any reference frame slot is - // unavailable, the pointer will be set to Null. - for (idx = 0; idx < MAX_INTER_REF_FRAMES; ++idx) { - int rf_idx = gf_picture[frame_idx].ref_frame[idx]; - if (rf_idx != -1) ref_frame[idx] = gf_picture[rf_idx].frame; - } - - xd->mi = cm->mi_grid_visible; - xd->mi[0] = cm->mi; - xd->cur_buf = this_frame; - - // Get rd multiplier set up. - rdmult = vp9_compute_rd_mult_based_on_qindex(cpi, tpl_frame->base_qindex); - set_error_per_bit(&cpi->td.mb, rdmult); - vp9_initialize_me_consts(cpi, &cpi->td.mb, tpl_frame->base_qindex); - - tpl_frame->is_valid = 1; - - cm->base_qindex = tpl_frame->base_qindex; - vp9_frame_init_quantizer(cpi); - -#if CONFIG_NON_GREEDY_MV - for (square_block_idx = 0; square_block_idx < SQUARE_BLOCK_SIZES; - ++square_block_idx) { - BLOCK_SIZE square_bsize = square_block_idx_to_bsize(square_block_idx); - build_motion_field(cpi, frame_idx, ref_frame, square_bsize); - } - for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) { - int ref_frame_idx = gf_picture[frame_idx].ref_frame[rf_idx]; - if (ref_frame_idx != -1) { - MotionField *motion_field = vp9_motion_field_info_get_motion_field( - &cpi->motion_field_info, frame_idx, rf_idx, bsize); - predict_mv_mode_arr(cpi, x, gf_picture, motion_field, frame_idx, - tpl_frame, rf_idx, bsize); - } - } -#endif - - for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) { - for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) { - mode_estimation(cpi, x, xd, &sf, gf_picture, frame_idx, tpl_frame, - src_diff, coeff, qcoeff, dqcoeff, mi_row, mi_col, bsize, - tx_size, ref_frame, predictor, &recon_error, &sse); - // Motion flow dependency dispenser. - tpl_model_store(tpl_frame->tpl_stats_ptr, mi_row, mi_col, bsize, - tpl_frame->stride); - - tpl_model_update(cpi->tpl_stats, tpl_frame->tpl_stats_ptr, mi_row, mi_col, - bsize); - } - } -} - -#if CONFIG_NON_GREEDY_MV -#define DUMP_TPL_STATS 0 -#if DUMP_TPL_STATS -static void dump_buf(uint8_t *buf, int stride, int row, int col, int h, int w) { - int i, j; - printf("%d %d\n", h, w); - for (i = 0; i < h; ++i) { - for (j = 0; j < w; ++j) { - printf("%d ", buf[(row + i) * stride + col + j]); - } - } - printf("\n"); -} - -static void dump_frame_buf(const YV12_BUFFER_CONFIG *frame_buf) { - dump_buf(frame_buf->y_buffer, frame_buf->y_stride, 0, 0, frame_buf->y_height, - frame_buf->y_width); - dump_buf(frame_buf->u_buffer, frame_buf->uv_stride, 0, 0, - frame_buf->uv_height, frame_buf->uv_width); - dump_buf(frame_buf->v_buffer, frame_buf->uv_stride, 0, 0, - frame_buf->uv_height, frame_buf->uv_width); -} - -static void dump_tpl_stats(const VP9_COMP *cpi, int tpl_group_frames, - const GF_GROUP *gf_group, - const GF_PICTURE *gf_picture, BLOCK_SIZE bsize) { - int frame_idx; - const VP9_COMMON *cm = &cpi->common; - int rf_idx; - for (frame_idx = 1; frame_idx < tpl_group_frames; ++frame_idx) { - for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) { - const TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx]; - int mi_row, mi_col; - int ref_frame_idx; - const int mi_height = num_8x8_blocks_high_lookup[bsize]; - const int mi_width = num_8x8_blocks_wide_lookup[bsize]; - ref_frame_idx = gf_picture[frame_idx].ref_frame[rf_idx]; - if (ref_frame_idx != -1) { - YV12_BUFFER_CONFIG *ref_frame_buf = gf_picture[ref_frame_idx].frame; - const int gf_frame_offset = gf_group->frame_gop_index[frame_idx]; - const int ref_gf_frame_offset = - gf_group->frame_gop_index[ref_frame_idx]; - printf("=\n"); - printf( - "frame_idx %d mi_rows %d mi_cols %d bsize %d ref_frame_idx %d " - "rf_idx %d gf_frame_offset %d ref_gf_frame_offset %d\n", - frame_idx, cm->mi_rows, cm->mi_cols, mi_width * MI_SIZE, - ref_frame_idx, rf_idx, gf_frame_offset, ref_gf_frame_offset); - for (mi_row = 0; mi_row < cm->mi_rows; ++mi_row) { - for (mi_col = 0; mi_col < cm->mi_cols; ++mi_col) { - if ((mi_row % mi_height) == 0 && (mi_col % mi_width) == 0) { - int_mv mv = vp9_motion_field_info_get_mv(&cpi->motion_field_info, - frame_idx, rf_idx, bsize, - mi_row, mi_col); - printf("%d %d %d %d\n", mi_row, mi_col, mv.as_mv.row, - mv.as_mv.col); - } - } - } - for (mi_row = 0; mi_row < cm->mi_rows; ++mi_row) { - for (mi_col = 0; mi_col < cm->mi_cols; ++mi_col) { - if ((mi_row % mi_height) == 0 && (mi_col % mi_width) == 0) { - const TplDepStats *tpl_ptr = - &tpl_frame - ->tpl_stats_ptr[mi_row * tpl_frame->stride + mi_col]; - printf("%f ", tpl_ptr->feature_score); - } - } - } - printf("\n"); - - for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) { - for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) { - const int mv_mode = - tpl_frame - ->mv_mode_arr[rf_idx][mi_row * tpl_frame->stride + mi_col]; - printf("%d ", mv_mode); - } - } - printf("\n"); - - dump_frame_buf(gf_picture[frame_idx].frame); - dump_frame_buf(ref_frame_buf); - } - } - } -} -#endif // DUMP_TPL_STATS -#endif // CONFIG_NON_GREEDY_MV - -static void init_tpl_buffer(VP9_COMP *cpi) { - VP9_COMMON *cm = &cpi->common; - int frame; - - const int mi_cols = mi_cols_aligned_to_sb(cm->mi_cols); - const int mi_rows = mi_cols_aligned_to_sb(cm->mi_rows); -#if CONFIG_NON_GREEDY_MV - int rf_idx; - - vpx_free(cpi->select_mv_arr); - CHECK_MEM_ERROR( - cm, cpi->select_mv_arr, - vpx_calloc(mi_rows * mi_cols * 4, sizeof(*cpi->select_mv_arr))); -#endif - - // TODO(jingning): Reduce the actual memory use for tpl model build up. - for (frame = 0; frame < MAX_ARF_GOP_SIZE; ++frame) { - if (cpi->tpl_stats[frame].width >= mi_cols && - cpi->tpl_stats[frame].height >= mi_rows && - cpi->tpl_stats[frame].tpl_stats_ptr) - continue; - -#if CONFIG_NON_GREEDY_MV - for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) { - vpx_free(cpi->tpl_stats[frame].mv_mode_arr[rf_idx]); - CHECK_MEM_ERROR( - cm, cpi->tpl_stats[frame].mv_mode_arr[rf_idx], - vpx_calloc(mi_rows * mi_cols * 4, - sizeof(*cpi->tpl_stats[frame].mv_mode_arr[rf_idx]))); - vpx_free(cpi->tpl_stats[frame].rd_diff_arr[rf_idx]); - CHECK_MEM_ERROR( - cm, cpi->tpl_stats[frame].rd_diff_arr[rf_idx], - vpx_calloc(mi_rows * mi_cols * 4, - sizeof(*cpi->tpl_stats[frame].rd_diff_arr[rf_idx]))); - } -#endif - vpx_free(cpi->tpl_stats[frame].tpl_stats_ptr); - CHECK_MEM_ERROR(cm, cpi->tpl_stats[frame].tpl_stats_ptr, - vpx_calloc(mi_rows * mi_cols, - sizeof(*cpi->tpl_stats[frame].tpl_stats_ptr))); - cpi->tpl_stats[frame].is_valid = 0; - cpi->tpl_stats[frame].width = mi_cols; - cpi->tpl_stats[frame].height = mi_rows; - cpi->tpl_stats[frame].stride = mi_cols; - cpi->tpl_stats[frame].mi_rows = cm->mi_rows; - cpi->tpl_stats[frame].mi_cols = cm->mi_cols; - } - - for (frame = 0; frame < REF_FRAMES; ++frame) { - cpi->enc_frame_buf[frame].mem_valid = 0; - cpi->enc_frame_buf[frame].released = 1; - } -} - -static void free_tpl_buffer(VP9_COMP *cpi) { - int frame; -#if CONFIG_NON_GREEDY_MV - vp9_free_motion_field_info(&cpi->motion_field_info); - vpx_free(cpi->select_mv_arr); -#endif - for (frame = 0; frame < MAX_ARF_GOP_SIZE; ++frame) { -#if CONFIG_NON_GREEDY_MV - int rf_idx; - for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) { - vpx_free(cpi->tpl_stats[frame].mv_mode_arr[rf_idx]); - vpx_free(cpi->tpl_stats[frame].rd_diff_arr[rf_idx]); - } -#endif - vpx_free(cpi->tpl_stats[frame].tpl_stats_ptr); - cpi->tpl_stats[frame].is_valid = 0; - } -} - -#if CONFIG_RATE_CTRL -static void accumulate_frame_tpl_stats(VP9_COMP *cpi) { - VP9_COMMON *const cm = &cpi->common; - const GF_GROUP *gf_group = &cpi->twopass.gf_group; - int show_frame_count = 0; - int frame_idx; - // Accumulate tpl stats for each frame in the current group of picture. - for (frame_idx = 1; frame_idx < gf_group->gf_group_size; ++frame_idx) { - TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx]; - TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr; - const int tpl_stride = tpl_frame->stride; - int64_t intra_cost_base = 0; - int64_t inter_cost_base = 0; - int64_t mc_dep_cost_base = 0; - int64_t mc_ref_cost_base = 0; - int64_t mc_flow_base = 0; - int row, col; - - if (!tpl_frame->is_valid) continue; - - for (row = 0; row < cm->mi_rows && tpl_frame->is_valid; ++row) { - for (col = 0; col < cm->mi_cols; ++col) { - TplDepStats *this_stats = &tpl_stats[row * tpl_stride + col]; - intra_cost_base += this_stats->intra_cost; - inter_cost_base += this_stats->inter_cost; - mc_dep_cost_base += this_stats->mc_dep_cost; - mc_ref_cost_base += this_stats->mc_ref_cost; - mc_flow_base += this_stats->mc_flow; - } - } - - cpi->tpl_stats_info[show_frame_count].intra_cost = intra_cost_base; - cpi->tpl_stats_info[show_frame_count].inter_cost = inter_cost_base; - cpi->tpl_stats_info[show_frame_count].mc_dep_cost = mc_dep_cost_base; - cpi->tpl_stats_info[show_frame_count].mc_ref_cost = mc_ref_cost_base; - cpi->tpl_stats_info[show_frame_count].mc_flow = mc_flow_base; - - ++show_frame_count; - } -} -#endif // CONFIG_RATE_CTRL - -static void setup_tpl_stats(VP9_COMP *cpi) { - GF_PICTURE gf_picture[MAX_ARF_GOP_SIZE]; - const GF_GROUP *gf_group = &cpi->twopass.gf_group; - int tpl_group_frames = 0; - int frame_idx; - cpi->tpl_bsize = BLOCK_32X32; - - init_gop_frames(cpi, gf_picture, gf_group, &tpl_group_frames); - - init_tpl_stats(cpi); - - // Backward propagation from tpl_group_frames to 1. - for (frame_idx = tpl_group_frames - 1; frame_idx > 0; --frame_idx) { - if (gf_picture[frame_idx].update_type == USE_BUF_FRAME) continue; - mc_flow_dispenser(cpi, gf_picture, frame_idx, cpi->tpl_bsize); - } -#if CONFIG_NON_GREEDY_MV - cpi->tpl_ready = 1; -#if DUMP_TPL_STATS - dump_tpl_stats(cpi, tpl_group_frames, gf_group, gf_picture, cpi->tpl_bsize); -#endif // DUMP_TPL_STATS -#endif // CONFIG_NON_GREEDY_MV - -#if CONFIG_RATE_CTRL - if (cpi->oxcf.use_simple_encode_api) { - accumulate_frame_tpl_stats(cpi); - } -#endif // CONFIG_RATE_CTRL -} - void vp9_get_ref_frame_info(FRAME_UPDATE_TYPE update_type, int ref_frame_flags, RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES], int *ref_frame_coding_indexes, @@ -7906,9 +6491,9 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, if (gf_group_index == 1 && cpi->twopass.gf_group.update_type[gf_group_index] == ARF_UPDATE && cpi->sf.enable_tpl_model) { - init_tpl_buffer(cpi); + vp9_init_tpl_buffer(cpi); vp9_estimate_qp_gop(cpi); - setup_tpl_stats(cpi); + vp9_setup_tpl_stats(cpi); } #if CONFIG_COLLECT_COMPONENT_TIMING end_timing(cpi, setup_tpl_stats_time); diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 33a2844d3e..02c7400fbc 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -1474,6 +1474,19 @@ int vp9_get_psnr(const VP9_COMP *cpi, PSNR_STATS *psnr); #define LAYER_IDS_TO_IDX(sl, tl, num_tl) ((sl) * (num_tl) + (tl)) +static INLINE void alloc_frame_mvs(VP9_COMMON *const cm, int buffer_idx) { + RefCntBuffer *const new_fb_ptr = &cm->buffer_pool->frame_bufs[buffer_idx]; + if (new_fb_ptr->mvs == NULL || new_fb_ptr->mi_rows < cm->mi_rows || + new_fb_ptr->mi_cols < cm->mi_cols) { + vpx_free(new_fb_ptr->mvs); + CHECK_MEM_ERROR(cm, new_fb_ptr->mvs, + (MV_REF *)vpx_calloc(cm->mi_rows * cm->mi_cols, + sizeof(*new_fb_ptr->mvs))); + new_fb_ptr->mi_rows = cm->mi_rows; + new_fb_ptr->mi_cols = cm->mi_cols; + } +} + #if CONFIG_COLLECT_COMPONENT_TIMING static INLINE void start_timing(VP9_COMP *cpi, int component) { vpx_usec_timer_start(&cpi->component_timer[component]); diff --git a/vp9/encoder/vp9_tpl_model.c b/vp9/encoder/vp9_tpl_model.c new file mode 100644 index 0000000000..b0c735167e --- /dev/null +++ b/vp9/encoder/vp9_tpl_model.c @@ -0,0 +1,1400 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_dsp_rtcd.h" +#if CONFIG_NON_GREEDY_MV +#include "vp9/common/vp9_mvref_common.h" +#endif +#include "vp9/common/vp9_reconinter.h" +#include "vp9/common/vp9_reconintra.h" +#include "vp9/common/vp9_scan.h" +#include "vp9/encoder/vp9_encoder.h" +#include "vp9/encoder/vp9_tpl_model.h" + +static void init_gop_frames(VP9_COMP *cpi, GF_PICTURE *gf_picture, + const GF_GROUP *gf_group, int *tpl_group_frames) { + VP9_COMMON *cm = &cpi->common; + int frame_idx = 0; + int i; + int gld_index = -1; + int alt_index = -1; + int lst_index = -1; + int arf_index_stack[MAX_ARF_LAYERS]; + int arf_stack_size = 0; + int extend_frame_count = 0; + int pframe_qindex = cpi->tpl_stats[2].base_qindex; + int frame_gop_offset = 0; + + RefCntBuffer *frame_bufs = cm->buffer_pool->frame_bufs; + int8_t recon_frame_index[REFS_PER_FRAME + MAX_ARF_LAYERS]; + + memset(recon_frame_index, -1, sizeof(recon_frame_index)); + stack_init(arf_index_stack, MAX_ARF_LAYERS); + + // TODO(jingning): To be used later for gf frame type parsing. + (void)gf_group; + + for (i = 0; i < FRAME_BUFFERS; ++i) { + if (frame_bufs[i].ref_count == 0) { + alloc_frame_mvs(cm, i); + if (vpx_realloc_frame_buffer(&frame_bufs[i].buf, cm->width, cm->height, + cm->subsampling_x, cm->subsampling_y, +#if CONFIG_VP9_HIGHBITDEPTH + cm->use_highbitdepth, +#endif + VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment, + NULL, NULL, NULL)) + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to allocate frame buffer"); + + recon_frame_index[frame_idx] = i; + ++frame_idx; + + if (frame_idx >= REFS_PER_FRAME + cpi->oxcf.enable_auto_arf) break; + } + } + + for (i = 0; i < REFS_PER_FRAME + 1; ++i) { + assert(recon_frame_index[i] >= 0); + cpi->tpl_recon_frames[i] = &frame_bufs[recon_frame_index[i]].buf; + } + + *tpl_group_frames = 0; + + // Initialize Golden reference frame. + gf_picture[0].frame = get_ref_frame_buffer(cpi, GOLDEN_FRAME); + for (i = 0; i < 3; ++i) gf_picture[0].ref_frame[i] = -1; + gf_picture[0].update_type = gf_group->update_type[0]; + gld_index = 0; + ++*tpl_group_frames; + + // Initialize base layer ARF frame + gf_picture[1].frame = cpi->Source; + gf_picture[1].ref_frame[0] = gld_index; + gf_picture[1].ref_frame[1] = lst_index; + gf_picture[1].ref_frame[2] = alt_index; + gf_picture[1].update_type = gf_group->update_type[1]; + alt_index = 1; + ++*tpl_group_frames; + + // Initialize P frames + for (frame_idx = 2; frame_idx < MAX_ARF_GOP_SIZE; ++frame_idx) { + struct lookahead_entry *buf; + frame_gop_offset = gf_group->frame_gop_index[frame_idx]; + buf = vp9_lookahead_peek(cpi->lookahead, frame_gop_offset - 1); + + if (buf == NULL) break; + + gf_picture[frame_idx].frame = &buf->img; + gf_picture[frame_idx].ref_frame[0] = gld_index; + gf_picture[frame_idx].ref_frame[1] = lst_index; + gf_picture[frame_idx].ref_frame[2] = alt_index; + gf_picture[frame_idx].update_type = gf_group->update_type[frame_idx]; + + switch (gf_group->update_type[frame_idx]) { + case ARF_UPDATE: + stack_push(arf_index_stack, alt_index, arf_stack_size); + ++arf_stack_size; + alt_index = frame_idx; + break; + case LF_UPDATE: lst_index = frame_idx; break; + case OVERLAY_UPDATE: + gld_index = frame_idx; + alt_index = stack_pop(arf_index_stack, arf_stack_size); + --arf_stack_size; + break; + case USE_BUF_FRAME: + lst_index = alt_index; + alt_index = stack_pop(arf_index_stack, arf_stack_size); + --arf_stack_size; + break; + default: break; + } + + ++*tpl_group_frames; + + // The length of group of pictures is baseline_gf_interval, plus the + // beginning golden frame from last GOP, plus the last overlay frame in + // the same GOP. + if (frame_idx == gf_group->gf_group_size) break; + } + + alt_index = -1; + ++frame_idx; + ++frame_gop_offset; + + // Extend two frames outside the current gf group. + for (; frame_idx < MAX_LAG_BUFFERS && extend_frame_count < 2; ++frame_idx) { + struct lookahead_entry *buf = + vp9_lookahead_peek(cpi->lookahead, frame_gop_offset - 1); + + if (buf == NULL) break; + + cpi->tpl_stats[frame_idx].base_qindex = pframe_qindex; + + gf_picture[frame_idx].frame = &buf->img; + gf_picture[frame_idx].ref_frame[0] = gld_index; + gf_picture[frame_idx].ref_frame[1] = lst_index; + gf_picture[frame_idx].ref_frame[2] = alt_index; + gf_picture[frame_idx].update_type = LF_UPDATE; + lst_index = frame_idx; + ++*tpl_group_frames; + ++extend_frame_count; + ++frame_gop_offset; + } +} + +static void init_tpl_stats(VP9_COMP *cpi) { + int frame_idx; + for (frame_idx = 0; frame_idx < MAX_ARF_GOP_SIZE; ++frame_idx) { + TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx]; + memset(tpl_frame->tpl_stats_ptr, 0, + tpl_frame->height * tpl_frame->width * + sizeof(*tpl_frame->tpl_stats_ptr)); + tpl_frame->is_valid = 0; + } +} + +#if CONFIG_NON_GREEDY_MV +static uint32_t full_pixel_motion_search(VP9_COMP *cpi, ThreadData *td, + MotionField *motion_field, + int frame_idx, uint8_t *cur_frame_buf, + uint8_t *ref_frame_buf, int stride, + BLOCK_SIZE bsize, int mi_row, + int mi_col, MV *mv) { + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv; + int step_param; + uint32_t bestsme = UINT_MAX; + const MvLimits tmp_mv_limits = x->mv_limits; + // lambda is used to adjust the importance of motion vector consistency. + // TODO(angiebird): Figure out lambda's proper value. + const int lambda = cpi->tpl_stats[frame_idx].lambda; + int_mv nb_full_mvs[NB_MVS_NUM]; + int nb_full_mv_num; + + MV best_ref_mv1 = { 0, 0 }; + MV best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */ + + best_ref_mv1_full.col = best_ref_mv1.col >> 3; + best_ref_mv1_full.row = best_ref_mv1.row >> 3; + + // Setup frame pointers + x->plane[0].src.buf = cur_frame_buf; + x->plane[0].src.stride = stride; + xd->plane[0].pre[0].buf = ref_frame_buf; + xd->plane[0].pre[0].stride = stride; + + step_param = mv_sf->reduce_first_step_size; + step_param = VPXMIN(step_param, MAX_MVSEARCH_STEPS - 2); + + vp9_set_mv_search_range(&x->mv_limits, &best_ref_mv1); + + nb_full_mv_num = + vp9_prepare_nb_full_mvs(motion_field, mi_row, mi_col, nb_full_mvs); + vp9_full_pixel_diamond_new(cpi, x, bsize, &best_ref_mv1_full, step_param, + lambda, 1, nb_full_mvs, nb_full_mv_num, mv); + + /* restore UMV window */ + x->mv_limits = tmp_mv_limits; + + return bestsme; +} + +static uint32_t sub_pixel_motion_search(VP9_COMP *cpi, ThreadData *td, + uint8_t *cur_frame_buf, + uint8_t *ref_frame_buf, int stride, + BLOCK_SIZE bsize, MV *mv) { + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv; + uint32_t bestsme = UINT_MAX; + uint32_t distortion; + uint32_t sse; + int cost_list[5]; + + MV best_ref_mv1 = { 0, 0 }; + + // Setup frame pointers + x->plane[0].src.buf = cur_frame_buf; + x->plane[0].src.stride = stride; + xd->plane[0].pre[0].buf = ref_frame_buf; + xd->plane[0].pre[0].stride = stride; + + // TODO(yunqing): may use higher tap interp filter than 2 taps. + // Ignore mv costing by sending NULL pointer instead of cost array + bestsme = cpi->find_fractional_mv_step( + x, mv, &best_ref_mv1, cpi->common.allow_high_precision_mv, x->errorperbit, + &cpi->fn_ptr[bsize], 0, mv_sf->subpel_search_level, + cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL, 0, 0, + USE_2_TAPS); + + return bestsme; +} + +#else // CONFIG_NON_GREEDY_MV +static uint32_t motion_compensated_prediction(VP9_COMP *cpi, ThreadData *td, + uint8_t *cur_frame_buf, + uint8_t *ref_frame_buf, + int stride, BLOCK_SIZE bsize, + MV *mv) { + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv; + const SEARCH_METHODS search_method = NSTEP; + int step_param; + int sadpb = x->sadperbit16; + uint32_t bestsme = UINT_MAX; + uint32_t distortion; + uint32_t sse; + int cost_list[5]; + const MvLimits tmp_mv_limits = x->mv_limits; + + MV best_ref_mv1 = { 0, 0 }; + MV best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */ + + best_ref_mv1_full.col = best_ref_mv1.col >> 3; + best_ref_mv1_full.row = best_ref_mv1.row >> 3; + + // Setup frame pointers + x->plane[0].src.buf = cur_frame_buf; + x->plane[0].src.stride = stride; + xd->plane[0].pre[0].buf = ref_frame_buf; + xd->plane[0].pre[0].stride = stride; + + step_param = mv_sf->reduce_first_step_size; + step_param = VPXMIN(step_param, MAX_MVSEARCH_STEPS - 2); + + vp9_set_mv_search_range(&x->mv_limits, &best_ref_mv1); + + vp9_full_pixel_search(cpi, x, bsize, &best_ref_mv1_full, step_param, + search_method, sadpb, cond_cost_list(cpi, cost_list), + &best_ref_mv1, mv, 0, 0); + + /* restore UMV window */ + x->mv_limits = tmp_mv_limits; + + // TODO(yunqing): may use higher tap interp filter than 2 taps. + // Ignore mv costing by sending NULL pointer instead of cost array + bestsme = cpi->find_fractional_mv_step( + x, mv, &best_ref_mv1, cpi->common.allow_high_precision_mv, x->errorperbit, + &cpi->fn_ptr[bsize], 0, mv_sf->subpel_search_level, + cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL, 0, 0, + USE_2_TAPS); + + return bestsme; +} +#endif + +static int get_overlap_area(int grid_pos_row, int grid_pos_col, int ref_pos_row, + int ref_pos_col, int block, BLOCK_SIZE bsize) { + int width = 0, height = 0; + int bw = 4 << b_width_log2_lookup[bsize]; + int bh = 4 << b_height_log2_lookup[bsize]; + + switch (block) { + case 0: + width = grid_pos_col + bw - ref_pos_col; + height = grid_pos_row + bh - ref_pos_row; + break; + case 1: + width = ref_pos_col + bw - grid_pos_col; + height = grid_pos_row + bh - ref_pos_row; + break; + case 2: + width = grid_pos_col + bw - ref_pos_col; + height = ref_pos_row + bh - grid_pos_row; + break; + case 3: + width = ref_pos_col + bw - grid_pos_col; + height = ref_pos_row + bh - grid_pos_row; + break; + default: assert(0); + } + + return width * height; +} + +static int round_floor(int ref_pos, int bsize_pix) { + int round; + if (ref_pos < 0) + round = -(1 + (-ref_pos - 1) / bsize_pix); + else + round = ref_pos / bsize_pix; + + return round; +} + +static void tpl_model_store(TplDepStats *tpl_stats, int mi_row, int mi_col, + BLOCK_SIZE bsize, int stride) { + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + const TplDepStats *src_stats = &tpl_stats[mi_row * stride + mi_col]; + int idx, idy; + + for (idy = 0; idy < mi_height; ++idy) { + for (idx = 0; idx < mi_width; ++idx) { + TplDepStats *tpl_ptr = &tpl_stats[(mi_row + idy) * stride + mi_col + idx]; + const int64_t mc_flow = tpl_ptr->mc_flow; + const int64_t mc_ref_cost = tpl_ptr->mc_ref_cost; + *tpl_ptr = *src_stats; + tpl_ptr->mc_flow = mc_flow; + tpl_ptr->mc_ref_cost = mc_ref_cost; + tpl_ptr->mc_dep_cost = tpl_ptr->intra_cost + tpl_ptr->mc_flow; + } + } +} + +static void tpl_model_update_b(TplDepFrame *tpl_frame, TplDepStats *tpl_stats, + int mi_row, int mi_col, const BLOCK_SIZE bsize) { + TplDepFrame *ref_tpl_frame = &tpl_frame[tpl_stats->ref_frame_index]; + TplDepStats *ref_stats = ref_tpl_frame->tpl_stats_ptr; + MV mv = tpl_stats->mv.as_mv; + int mv_row = mv.row >> 3; + int mv_col = mv.col >> 3; + + int ref_pos_row = mi_row * MI_SIZE + mv_row; + int ref_pos_col = mi_col * MI_SIZE + mv_col; + + const int bw = 4 << b_width_log2_lookup[bsize]; + const int bh = 4 << b_height_log2_lookup[bsize]; + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + const int pix_num = bw * bh; + + // top-left on grid block location in pixel + int grid_pos_row_base = round_floor(ref_pos_row, bh) * bh; + int grid_pos_col_base = round_floor(ref_pos_col, bw) * bw; + int block; + + for (block = 0; block < 4; ++block) { + int grid_pos_row = grid_pos_row_base + bh * (block >> 1); + int grid_pos_col = grid_pos_col_base + bw * (block & 0x01); + + if (grid_pos_row >= 0 && grid_pos_row < ref_tpl_frame->mi_rows * MI_SIZE && + grid_pos_col >= 0 && grid_pos_col < ref_tpl_frame->mi_cols * MI_SIZE) { + int overlap_area = get_overlap_area( + grid_pos_row, grid_pos_col, ref_pos_row, ref_pos_col, block, bsize); + int ref_mi_row = round_floor(grid_pos_row, bh) * mi_height; + int ref_mi_col = round_floor(grid_pos_col, bw) * mi_width; + + int64_t mc_flow = tpl_stats->mc_dep_cost - + (tpl_stats->mc_dep_cost * tpl_stats->inter_cost) / + tpl_stats->intra_cost; + + int idx, idy; + + for (idy = 0; idy < mi_height; ++idy) { + for (idx = 0; idx < mi_width; ++idx) { + TplDepStats *des_stats = + &ref_stats[(ref_mi_row + idy) * ref_tpl_frame->stride + + (ref_mi_col + idx)]; + + des_stats->mc_flow += (mc_flow * overlap_area) / pix_num; + des_stats->mc_ref_cost += + ((tpl_stats->intra_cost - tpl_stats->inter_cost) * overlap_area) / + pix_num; + assert(overlap_area >= 0); + } + } + } + } +} + +static void tpl_model_update(TplDepFrame *tpl_frame, TplDepStats *tpl_stats, + int mi_row, int mi_col, const BLOCK_SIZE bsize) { + int idx, idy; + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + + for (idy = 0; idy < mi_height; ++idy) { + for (idx = 0; idx < mi_width; ++idx) { + TplDepStats *tpl_ptr = + &tpl_stats[(mi_row + idy) * tpl_frame->stride + (mi_col + idx)]; + tpl_model_update_b(tpl_frame, tpl_ptr, mi_row + idy, mi_col + idx, + BLOCK_8X8); + } + } +} + +static void get_quantize_error(MACROBLOCK *x, int plane, tran_low_t *coeff, + tran_low_t *qcoeff, tran_low_t *dqcoeff, + TX_SIZE tx_size, int64_t *recon_error, + int64_t *sse) { + MACROBLOCKD *const xd = &x->e_mbd; + const struct macroblock_plane *const p = &x->plane[plane]; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const scan_order *const scan_order = &vp9_default_scan_orders[tx_size]; + uint16_t eob; + int pix_num = 1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]]; + const int shift = tx_size == TX_32X32 ? 0 : 2; + + // skip block condition should be handled before this is called. + assert(!x->skip_block); + +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + vp9_highbd_quantize_fp_32x32(coeff, pix_num, p->round_fp, p->quant_fp, + qcoeff, dqcoeff, pd->dequant, &eob, + scan_order->scan, scan_order->iscan); + } else { + vp9_quantize_fp_32x32(coeff, pix_num, p->round_fp, p->quant_fp, qcoeff, + dqcoeff, pd->dequant, &eob, scan_order->scan, + scan_order->iscan); + } +#else + vp9_quantize_fp_32x32(coeff, pix_num, p->round_fp, p->quant_fp, qcoeff, + dqcoeff, pd->dequant, &eob, scan_order->scan, + scan_order->iscan); +#endif // CONFIG_VP9_HIGHBITDEPTH + + *recon_error = vp9_block_error(coeff, dqcoeff, pix_num, sse) >> shift; + *recon_error = VPXMAX(*recon_error, 1); + + *sse = (*sse) >> shift; + *sse = VPXMAX(*sse, 1); +} + +#if CONFIG_VP9_HIGHBITDEPTH +void vp9_highbd_wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff, + TX_SIZE tx_size) { + // TODO(sdeng): Implement SIMD based high bit-depth Hadamard transforms. + switch (tx_size) { + case TX_8X8: vpx_highbd_hadamard_8x8(src_diff, bw, coeff); break; + case TX_16X16: vpx_highbd_hadamard_16x16(src_diff, bw, coeff); break; + case TX_32X32: vpx_highbd_hadamard_32x32(src_diff, bw, coeff); break; + default: assert(0); + } +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +void vp9_wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff, + TX_SIZE tx_size) { + switch (tx_size) { + case TX_8X8: vpx_hadamard_8x8(src_diff, bw, coeff); break; + case TX_16X16: vpx_hadamard_16x16(src_diff, bw, coeff); break; + case TX_32X32: vpx_hadamard_32x32(src_diff, bw, coeff); break; + default: assert(0); + } +} + +static void set_mv_limits(const VP9_COMMON *cm, MACROBLOCK *x, int mi_row, + int mi_col) { + x->mv_limits.row_min = -((mi_row * MI_SIZE) + (17 - 2 * VP9_INTERP_EXTEND)); + x->mv_limits.row_max = + (cm->mi_rows - 1 - mi_row) * MI_SIZE + (17 - 2 * VP9_INTERP_EXTEND); + x->mv_limits.col_min = -((mi_col * MI_SIZE) + (17 - 2 * VP9_INTERP_EXTEND)); + x->mv_limits.col_max = + ((cm->mi_cols - 1 - mi_col) * MI_SIZE) + (17 - 2 * VP9_INTERP_EXTEND); +} + +static void mode_estimation(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, + struct scale_factors *sf, GF_PICTURE *gf_picture, + int frame_idx, TplDepFrame *tpl_frame, + int16_t *src_diff, tran_low_t *coeff, + tran_low_t *qcoeff, tran_low_t *dqcoeff, int mi_row, + int mi_col, BLOCK_SIZE bsize, TX_SIZE tx_size, + YV12_BUFFER_CONFIG *ref_frame[], uint8_t *predictor, + int64_t *recon_error, int64_t *sse) { + VP9_COMMON *cm = &cpi->common; + ThreadData *td = &cpi->td; + + const int bw = 4 << b_width_log2_lookup[bsize]; + const int bh = 4 << b_height_log2_lookup[bsize]; + const int pix_num = bw * bh; + int best_rf_idx = -1; + int_mv best_mv; + int64_t best_inter_cost = INT64_MAX; + int64_t inter_cost; + int rf_idx; + const InterpKernel *const kernel = vp9_filter_kernels[EIGHTTAP]; + + int64_t best_intra_cost = INT64_MAX; + int64_t intra_cost; + PREDICTION_MODE mode; + int mb_y_offset = mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE; + MODE_INFO mi_above, mi_left; + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + TplDepStats *tpl_stats = + &tpl_frame->tpl_stats_ptr[mi_row * tpl_frame->stride + mi_col]; + + xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8); + xd->mb_to_bottom_edge = ((cm->mi_rows - 1 - mi_row) * MI_SIZE) * 8; + xd->mb_to_left_edge = -((mi_col * MI_SIZE) * 8); + xd->mb_to_right_edge = ((cm->mi_cols - 1 - mi_col) * MI_SIZE) * 8; + xd->above_mi = (mi_row > 0) ? &mi_above : NULL; + xd->left_mi = (mi_col > 0) ? &mi_left : NULL; + + // Intra prediction search + for (mode = DC_PRED; mode <= TM_PRED; ++mode) { + uint8_t *src, *dst; + int src_stride, dst_stride; + + src = xd->cur_buf->y_buffer + mb_y_offset; + src_stride = xd->cur_buf->y_stride; + + dst = &predictor[0]; + dst_stride = bw; + + xd->mi[0]->sb_type = bsize; + xd->mi[0]->ref_frame[0] = INTRA_FRAME; + + vp9_predict_intra_block(xd, b_width_log2_lookup[bsize], tx_size, mode, src, + src_stride, dst, dst_stride, 0, 0, 0); + +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + vpx_highbd_subtract_block(bh, bw, src_diff, bw, src, src_stride, dst, + dst_stride, xd->bd); + vp9_highbd_wht_fwd_txfm(src_diff, bw, coeff, tx_size); + intra_cost = vpx_highbd_satd(coeff, pix_num); + } else { + vpx_subtract_block(bh, bw, src_diff, bw, src, src_stride, dst, + dst_stride); + vp9_wht_fwd_txfm(src_diff, bw, coeff, tx_size); + intra_cost = vpx_satd(coeff, pix_num); + } +#else + vpx_subtract_block(bh, bw, src_diff, bw, src, src_stride, dst, dst_stride); + vp9_wht_fwd_txfm(src_diff, bw, coeff, tx_size); + intra_cost = vpx_satd(coeff, pix_num); +#endif // CONFIG_VP9_HIGHBITDEPTH + + if (intra_cost < best_intra_cost) best_intra_cost = intra_cost; + } + + // Motion compensated prediction + best_mv.as_int = 0; + + set_mv_limits(cm, x, mi_row, mi_col); + + for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) { + int_mv mv; +#if CONFIG_NON_GREEDY_MV + MotionField *motion_field; +#endif + if (ref_frame[rf_idx] == NULL) continue; + +#if CONFIG_NON_GREEDY_MV + (void)td; + motion_field = vp9_motion_field_info_get_motion_field( + &cpi->motion_field_info, frame_idx, rf_idx, bsize); + mv = vp9_motion_field_mi_get_mv(motion_field, mi_row, mi_col); +#else + motion_compensated_prediction(cpi, td, xd->cur_buf->y_buffer + mb_y_offset, + ref_frame[rf_idx]->y_buffer + mb_y_offset, + xd->cur_buf->y_stride, bsize, &mv.as_mv); +#endif + +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + vp9_highbd_build_inter_predictor( + CONVERT_TO_SHORTPTR(ref_frame[rf_idx]->y_buffer + mb_y_offset), + ref_frame[rf_idx]->y_stride, CONVERT_TO_SHORTPTR(&predictor[0]), bw, + &mv.as_mv, sf, bw, bh, 0, kernel, MV_PRECISION_Q3, mi_col * MI_SIZE, + mi_row * MI_SIZE, xd->bd); + vpx_highbd_subtract_block( + bh, bw, src_diff, bw, xd->cur_buf->y_buffer + mb_y_offset, + xd->cur_buf->y_stride, &predictor[0], bw, xd->bd); + vp9_highbd_wht_fwd_txfm(src_diff, bw, coeff, tx_size); + inter_cost = vpx_highbd_satd(coeff, pix_num); + } else { + vp9_build_inter_predictor( + ref_frame[rf_idx]->y_buffer + mb_y_offset, + ref_frame[rf_idx]->y_stride, &predictor[0], bw, &mv.as_mv, sf, bw, bh, + 0, kernel, MV_PRECISION_Q3, mi_col * MI_SIZE, mi_row * MI_SIZE); + vpx_subtract_block(bh, bw, src_diff, bw, + xd->cur_buf->y_buffer + mb_y_offset, + xd->cur_buf->y_stride, &predictor[0], bw); + vp9_wht_fwd_txfm(src_diff, bw, coeff, tx_size); + inter_cost = vpx_satd(coeff, pix_num); + } +#else + vp9_build_inter_predictor(ref_frame[rf_idx]->y_buffer + mb_y_offset, + ref_frame[rf_idx]->y_stride, &predictor[0], bw, + &mv.as_mv, sf, bw, bh, 0, kernel, MV_PRECISION_Q3, + mi_col * MI_SIZE, mi_row * MI_SIZE); + vpx_subtract_block(bh, bw, src_diff, bw, + xd->cur_buf->y_buffer + mb_y_offset, + xd->cur_buf->y_stride, &predictor[0], bw); + vp9_wht_fwd_txfm(src_diff, bw, coeff, tx_size); + inter_cost = vpx_satd(coeff, pix_num); +#endif + + if (inter_cost < best_inter_cost) { + best_rf_idx = rf_idx; + best_inter_cost = inter_cost; + best_mv.as_int = mv.as_int; + get_quantize_error(x, 0, coeff, qcoeff, dqcoeff, tx_size, recon_error, + sse); + } + } + best_intra_cost = VPXMAX(best_intra_cost, 1); + best_inter_cost = VPXMIN(best_intra_cost, best_inter_cost); + tpl_stats->inter_cost = VPXMAX( + 1, (best_inter_cost << TPL_DEP_COST_SCALE_LOG2) / (mi_height * mi_width)); + tpl_stats->intra_cost = VPXMAX( + 1, (best_intra_cost << TPL_DEP_COST_SCALE_LOG2) / (mi_height * mi_width)); + tpl_stats->ref_frame_index = gf_picture[frame_idx].ref_frame[best_rf_idx]; + tpl_stats->mv.as_int = best_mv.as_int; +} + +#if CONFIG_NON_GREEDY_MV +static int get_block_src_pred_buf(MACROBLOCKD *xd, GF_PICTURE *gf_picture, + int frame_idx, int rf_idx, int mi_row, + int mi_col, struct buf_2d *src, + struct buf_2d *pre) { + const int mb_y_offset = + mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE; + YV12_BUFFER_CONFIG *ref_frame = NULL; + int ref_frame_idx = gf_picture[frame_idx].ref_frame[rf_idx]; + if (ref_frame_idx != -1) { + ref_frame = gf_picture[ref_frame_idx].frame; + src->buf = xd->cur_buf->y_buffer + mb_y_offset; + src->stride = xd->cur_buf->y_stride; + pre->buf = ref_frame->y_buffer + mb_y_offset; + pre->stride = ref_frame->y_stride; + assert(src->stride == pre->stride); + return 1; + } else { + printf("invalid ref_frame_idx"); + assert(ref_frame_idx != -1); + return 0; + } +} + +#define kMvPreCheckLines 5 +#define kMvPreCheckSize 15 + +#define MV_REF_POS_NUM 3 +POSITION mv_ref_pos[MV_REF_POS_NUM] = { + { -1, 0 }, + { 0, -1 }, + { -1, -1 }, +}; + +static int_mv *get_select_mv(VP9_COMP *cpi, TplDepFrame *tpl_frame, int mi_row, + int mi_col) { + return &cpi->select_mv_arr[mi_row * tpl_frame->stride + mi_col]; +} + +static int_mv find_ref_mv(int mv_mode, VP9_COMP *cpi, TplDepFrame *tpl_frame, + BLOCK_SIZE bsize, int mi_row, int mi_col) { + int i; + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + int_mv nearest_mv, near_mv, invalid_mv; + nearest_mv.as_int = INVALID_MV; + near_mv.as_int = INVALID_MV; + invalid_mv.as_int = INVALID_MV; + for (i = 0; i < MV_REF_POS_NUM; ++i) { + int nb_row = mi_row + mv_ref_pos[i].row * mi_height; + int nb_col = mi_col + mv_ref_pos[i].col * mi_width; + assert(mv_ref_pos[i].row <= 0); + assert(mv_ref_pos[i].col <= 0); + if (nb_row >= 0 && nb_col >= 0) { + if (nearest_mv.as_int == INVALID_MV) { + nearest_mv = *get_select_mv(cpi, tpl_frame, nb_row, nb_col); + } else { + int_mv mv = *get_select_mv(cpi, tpl_frame, nb_row, nb_col); + if (mv.as_int == nearest_mv.as_int) { + continue; + } else { + near_mv = mv; + break; + } + } + } + } + if (nearest_mv.as_int == INVALID_MV) { + nearest_mv.as_mv.row = 0; + nearest_mv.as_mv.col = 0; + } + if (near_mv.as_int == INVALID_MV) { + near_mv.as_mv.row = 0; + near_mv.as_mv.col = 0; + } + if (mv_mode == NEAREST_MV_MODE) { + return nearest_mv; + } + if (mv_mode == NEAR_MV_MODE) { + return near_mv; + } + assert(0); + return invalid_mv; +} + +static int_mv get_mv_from_mv_mode(int mv_mode, VP9_COMP *cpi, + MotionField *motion_field, + TplDepFrame *tpl_frame, BLOCK_SIZE bsize, + int mi_row, int mi_col) { + int_mv mv; + switch (mv_mode) { + case ZERO_MV_MODE: + mv.as_mv.row = 0; + mv.as_mv.col = 0; + break; + case NEW_MV_MODE: + mv = vp9_motion_field_mi_get_mv(motion_field, mi_row, mi_col); + break; + case NEAREST_MV_MODE: + mv = find_ref_mv(mv_mode, cpi, tpl_frame, bsize, mi_row, mi_col); + break; + case NEAR_MV_MODE: + mv = find_ref_mv(mv_mode, cpi, tpl_frame, bsize, mi_row, mi_col); + break; + default: + mv.as_int = INVALID_MV; + assert(0); + break; + } + return mv; +} + +static double get_mv_dist(int mv_mode, VP9_COMP *cpi, MACROBLOCKD *xd, + GF_PICTURE *gf_picture, MotionField *motion_field, + int frame_idx, TplDepFrame *tpl_frame, int rf_idx, + BLOCK_SIZE bsize, int mi_row, int mi_col, + int_mv *mv) { + uint32_t sse; + struct buf_2d src; + struct buf_2d pre; + MV full_mv; + *mv = get_mv_from_mv_mode(mv_mode, cpi, motion_field, tpl_frame, bsize, + mi_row, mi_col); + full_mv = get_full_mv(&mv->as_mv); + if (get_block_src_pred_buf(xd, gf_picture, frame_idx, rf_idx, mi_row, mi_col, + &src, &pre)) { + // TODO(angiebird): Consider subpixel when computing the sse. + cpi->fn_ptr[bsize].vf(src.buf, src.stride, get_buf_from_mv(&pre, &full_mv), + pre.stride, &sse); + return (double)(sse << VP9_DIST_SCALE_LOG2); + } else { + assert(0); + return 0; + } +} + +static int get_mv_mode_cost(int mv_mode) { + // TODO(angiebird): The probabilities are roughly inferred from + // default_inter_mode_probs. Check if there is a better way to set the + // probabilities. + const int zero_mv_prob = 16; + const int new_mv_prob = 24 * 1; + const int ref_mv_prob = 256 - zero_mv_prob - new_mv_prob; + assert(zero_mv_prob + new_mv_prob + ref_mv_prob == 256); + switch (mv_mode) { + case ZERO_MV_MODE: return vp9_prob_cost[zero_mv_prob]; break; + case NEW_MV_MODE: return vp9_prob_cost[new_mv_prob]; break; + case NEAREST_MV_MODE: return vp9_prob_cost[ref_mv_prob]; break; + case NEAR_MV_MODE: return vp9_prob_cost[ref_mv_prob]; break; + default: assert(0); return -1; + } +} + +static INLINE double get_mv_diff_cost(MV *new_mv, MV *ref_mv) { + double mv_diff_cost = log2(1 + abs(new_mv->row - ref_mv->row)) + + log2(1 + abs(new_mv->col - ref_mv->col)); + mv_diff_cost *= (1 << VP9_PROB_COST_SHIFT); + return mv_diff_cost; +} +static double get_mv_cost(int mv_mode, VP9_COMP *cpi, MotionField *motion_field, + TplDepFrame *tpl_frame, BLOCK_SIZE bsize, int mi_row, + int mi_col) { + double mv_cost = get_mv_mode_cost(mv_mode); + if (mv_mode == NEW_MV_MODE) { + MV new_mv = get_mv_from_mv_mode(mv_mode, cpi, motion_field, tpl_frame, + bsize, mi_row, mi_col) + .as_mv; + MV nearest_mv = get_mv_from_mv_mode(NEAREST_MV_MODE, cpi, motion_field, + tpl_frame, bsize, mi_row, mi_col) + .as_mv; + MV near_mv = get_mv_from_mv_mode(NEAR_MV_MODE, cpi, motion_field, tpl_frame, + bsize, mi_row, mi_col) + .as_mv; + double nearest_cost = get_mv_diff_cost(&new_mv, &nearest_mv); + double near_cost = get_mv_diff_cost(&new_mv, &near_mv); + mv_cost += nearest_cost < near_cost ? nearest_cost : near_cost; + } + return mv_cost; +} + +static double eval_mv_mode(int mv_mode, VP9_COMP *cpi, MACROBLOCK *x, + GF_PICTURE *gf_picture, MotionField *motion_field, + int frame_idx, TplDepFrame *tpl_frame, int rf_idx, + BLOCK_SIZE bsize, int mi_row, int mi_col, + int_mv *mv) { + MACROBLOCKD *xd = &x->e_mbd; + double mv_dist = + get_mv_dist(mv_mode, cpi, xd, gf_picture, motion_field, frame_idx, + tpl_frame, rf_idx, bsize, mi_row, mi_col, mv); + double mv_cost = + get_mv_cost(mv_mode, cpi, motion_field, tpl_frame, bsize, mi_row, mi_col); + double mult = 180; + + return mv_cost + mult * log2f(1 + mv_dist); +} + +static int find_best_ref_mv_mode(VP9_COMP *cpi, MACROBLOCK *x, + GF_PICTURE *gf_picture, + MotionField *motion_field, int frame_idx, + TplDepFrame *tpl_frame, int rf_idx, + BLOCK_SIZE bsize, int mi_row, int mi_col, + double *rd, int_mv *mv) { + int best_mv_mode = ZERO_MV_MODE; + int update = 0; + int mv_mode; + *rd = 0; + for (mv_mode = 0; mv_mode < MAX_MV_MODE; ++mv_mode) { + double this_rd; + int_mv this_mv; + if (mv_mode == NEW_MV_MODE) { + continue; + } + this_rd = eval_mv_mode(mv_mode, cpi, x, gf_picture, motion_field, frame_idx, + tpl_frame, rf_idx, bsize, mi_row, mi_col, &this_mv); + if (update == 0) { + *rd = this_rd; + *mv = this_mv; + best_mv_mode = mv_mode; + update = 1; + } else { + if (this_rd < *rd) { + *rd = this_rd; + *mv = this_mv; + best_mv_mode = mv_mode; + } + } + } + return best_mv_mode; +} + +static void predict_mv_mode(VP9_COMP *cpi, MACROBLOCK *x, + GF_PICTURE *gf_picture, MotionField *motion_field, + int frame_idx, TplDepFrame *tpl_frame, int rf_idx, + BLOCK_SIZE bsize, int mi_row, int mi_col) { + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + int tmp_mv_mode_arr[kMvPreCheckSize]; + int *mv_mode_arr = tpl_frame->mv_mode_arr[rf_idx]; + double *rd_diff_arr = tpl_frame->rd_diff_arr[rf_idx]; + int_mv *select_mv_arr = cpi->select_mv_arr; + int_mv tmp_select_mv_arr[kMvPreCheckSize]; + int stride = tpl_frame->stride; + double new_mv_rd = 0; + double no_new_mv_rd = 0; + double this_new_mv_rd = 0; + double this_no_new_mv_rd = 0; + int idx; + int tmp_idx; + assert(kMvPreCheckSize == (kMvPreCheckLines * (kMvPreCheckLines + 1)) >> 1); + + // no new mv + // diagonal scan order + tmp_idx = 0; + for (idx = 0; idx < kMvPreCheckLines; ++idx) { + int r; + for (r = 0; r <= idx; ++r) { + int c = idx - r; + int nb_row = mi_row + r * mi_height; + int nb_col = mi_col + c * mi_width; + if (nb_row < tpl_frame->mi_rows && nb_col < tpl_frame->mi_cols) { + double this_rd; + int_mv *mv = &select_mv_arr[nb_row * stride + nb_col]; + mv_mode_arr[nb_row * stride + nb_col] = find_best_ref_mv_mode( + cpi, x, gf_picture, motion_field, frame_idx, tpl_frame, rf_idx, + bsize, nb_row, nb_col, &this_rd, mv); + if (r == 0 && c == 0) { + this_no_new_mv_rd = this_rd; + } + no_new_mv_rd += this_rd; + tmp_mv_mode_arr[tmp_idx] = mv_mode_arr[nb_row * stride + nb_col]; + tmp_select_mv_arr[tmp_idx] = select_mv_arr[nb_row * stride + nb_col]; + ++tmp_idx; + } + } + } + + // new mv + mv_mode_arr[mi_row * stride + mi_col] = NEW_MV_MODE; + this_new_mv_rd = eval_mv_mode( + NEW_MV_MODE, cpi, x, gf_picture, motion_field, frame_idx, tpl_frame, + rf_idx, bsize, mi_row, mi_col, &select_mv_arr[mi_row * stride + mi_col]); + new_mv_rd = this_new_mv_rd; + // We start from idx = 1 because idx = 0 is evaluated as NEW_MV_MODE + // beforehand. + for (idx = 1; idx < kMvPreCheckLines; ++idx) { + int r; + for (r = 0; r <= idx; ++r) { + int c = idx - r; + int nb_row = mi_row + r * mi_height; + int nb_col = mi_col + c * mi_width; + if (nb_row < tpl_frame->mi_rows && nb_col < tpl_frame->mi_cols) { + double this_rd; + int_mv *mv = &select_mv_arr[nb_row * stride + nb_col]; + mv_mode_arr[nb_row * stride + nb_col] = find_best_ref_mv_mode( + cpi, x, gf_picture, motion_field, frame_idx, tpl_frame, rf_idx, + bsize, nb_row, nb_col, &this_rd, mv); + new_mv_rd += this_rd; + } + } + } + + // update best_mv_mode + tmp_idx = 0; + if (no_new_mv_rd < new_mv_rd) { + for (idx = 0; idx < kMvPreCheckLines; ++idx) { + int r; + for (r = 0; r <= idx; ++r) { + int c = idx - r; + int nb_row = mi_row + r * mi_height; + int nb_col = mi_col + c * mi_width; + if (nb_row < tpl_frame->mi_rows && nb_col < tpl_frame->mi_cols) { + mv_mode_arr[nb_row * stride + nb_col] = tmp_mv_mode_arr[tmp_idx]; + select_mv_arr[nb_row * stride + nb_col] = tmp_select_mv_arr[tmp_idx]; + ++tmp_idx; + } + } + } + rd_diff_arr[mi_row * stride + mi_col] = 0; + } else { + rd_diff_arr[mi_row * stride + mi_col] = + (no_new_mv_rd - this_no_new_mv_rd) - (new_mv_rd - this_new_mv_rd); + } +} + +static void predict_mv_mode_arr(VP9_COMP *cpi, MACROBLOCK *x, + GF_PICTURE *gf_picture, + MotionField *motion_field, int frame_idx, + TplDepFrame *tpl_frame, int rf_idx, + BLOCK_SIZE bsize) { + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + const int unit_rows = tpl_frame->mi_rows / mi_height; + const int unit_cols = tpl_frame->mi_cols / mi_width; + const int max_diagonal_lines = unit_rows + unit_cols - 1; + int idx; + for (idx = 0; idx < max_diagonal_lines; ++idx) { + int r; + for (r = VPXMAX(idx - unit_cols + 1, 0); r <= VPXMIN(idx, unit_rows - 1); + ++r) { + int c = idx - r; + int mi_row = r * mi_height; + int mi_col = c * mi_width; + assert(c >= 0 && c < unit_cols); + assert(mi_row >= 0 && mi_row < tpl_frame->mi_rows); + assert(mi_col >= 0 && mi_col < tpl_frame->mi_cols); + predict_mv_mode(cpi, x, gf_picture, motion_field, frame_idx, tpl_frame, + rf_idx, bsize, mi_row, mi_col); + } + } +} + +static void do_motion_search(VP9_COMP *cpi, ThreadData *td, + MotionField *motion_field, int frame_idx, + YV12_BUFFER_CONFIG *ref_frame, BLOCK_SIZE bsize, + int mi_row, int mi_col) { + VP9_COMMON *cm = &cpi->common; + MACROBLOCK *x = &td->mb; + MACROBLOCKD *xd = &x->e_mbd; + const int mb_y_offset = + mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE; + assert(ref_frame != NULL); + set_mv_limits(cm, x, mi_row, mi_col); + { + int_mv mv = vp9_motion_field_mi_get_mv(motion_field, mi_row, mi_col); + uint8_t *cur_frame_buf = xd->cur_buf->y_buffer + mb_y_offset; + uint8_t *ref_frame_buf = ref_frame->y_buffer + mb_y_offset; + const int stride = xd->cur_buf->y_stride; + full_pixel_motion_search(cpi, td, motion_field, frame_idx, cur_frame_buf, + ref_frame_buf, stride, bsize, mi_row, mi_col, + &mv.as_mv); + sub_pixel_motion_search(cpi, td, cur_frame_buf, ref_frame_buf, stride, + bsize, &mv.as_mv); + vp9_motion_field_mi_set_mv(motion_field, mi_row, mi_col, mv); + } +} + +static void build_motion_field( + VP9_COMP *cpi, int frame_idx, + YV12_BUFFER_CONFIG *ref_frame[MAX_INTER_REF_FRAMES], BLOCK_SIZE bsize) { + VP9_COMMON *cm = &cpi->common; + ThreadData *td = &cpi->td; + TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx]; + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + const int pw = num_4x4_blocks_wide_lookup[bsize] << 2; + const int ph = num_4x4_blocks_high_lookup[bsize] << 2; + int mi_row, mi_col; + int rf_idx; + + tpl_frame->lambda = (pw * ph) >> 2; + assert(pw * ph == tpl_frame->lambda << 2); + + for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) { + MotionField *motion_field = vp9_motion_field_info_get_motion_field( + &cpi->motion_field_info, frame_idx, rf_idx, bsize); + if (ref_frame[rf_idx] == NULL) { + continue; + } + vp9_motion_field_reset_mvs(motion_field); + for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) { + for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) { + do_motion_search(cpi, td, motion_field, frame_idx, ref_frame[rf_idx], + bsize, mi_row, mi_col); + } + } + } +} +#endif // CONFIG_NON_GREEDY_MV + +static void mc_flow_dispenser(VP9_COMP *cpi, GF_PICTURE *gf_picture, + int frame_idx, BLOCK_SIZE bsize) { + TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx]; + YV12_BUFFER_CONFIG *this_frame = gf_picture[frame_idx].frame; + YV12_BUFFER_CONFIG *ref_frame[MAX_INTER_REF_FRAMES] = { NULL, NULL, NULL }; + + VP9_COMMON *cm = &cpi->common; + struct scale_factors sf; + int rdmult, idx; + ThreadData *td = &cpi->td; + MACROBLOCK *x = &td->mb; + MACROBLOCKD *xd = &x->e_mbd; + int mi_row, mi_col; + +#if CONFIG_VP9_HIGHBITDEPTH + DECLARE_ALIGNED(16, uint16_t, predictor16[32 * 32 * 3]); + DECLARE_ALIGNED(16, uint8_t, predictor8[32 * 32 * 3]); + uint8_t *predictor; +#else + DECLARE_ALIGNED(16, uint8_t, predictor[32 * 32 * 3]); +#endif + DECLARE_ALIGNED(16, int16_t, src_diff[32 * 32]); + DECLARE_ALIGNED(16, tran_low_t, coeff[32 * 32]); + DECLARE_ALIGNED(16, tran_low_t, qcoeff[32 * 32]); + DECLARE_ALIGNED(16, tran_low_t, dqcoeff[32 * 32]); + + const TX_SIZE tx_size = max_txsize_lookup[bsize]; + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + int64_t recon_error, sse; +#if CONFIG_NON_GREEDY_MV + int square_block_idx; + int rf_idx; +#endif + + // Setup scaling factor +#if CONFIG_VP9_HIGHBITDEPTH + vp9_setup_scale_factors_for_frame( + &sf, this_frame->y_crop_width, this_frame->y_crop_height, + this_frame->y_crop_width, this_frame->y_crop_height, + cpi->common.use_highbitdepth); + + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) + predictor = CONVERT_TO_BYTEPTR(predictor16); + else + predictor = predictor8; +#else + vp9_setup_scale_factors_for_frame( + &sf, this_frame->y_crop_width, this_frame->y_crop_height, + this_frame->y_crop_width, this_frame->y_crop_height); +#endif // CONFIG_VP9_HIGHBITDEPTH + + // Prepare reference frame pointers. If any reference frame slot is + // unavailable, the pointer will be set to Null. + for (idx = 0; idx < MAX_INTER_REF_FRAMES; ++idx) { + int rf_idx = gf_picture[frame_idx].ref_frame[idx]; + if (rf_idx != -1) ref_frame[idx] = gf_picture[rf_idx].frame; + } + + xd->mi = cm->mi_grid_visible; + xd->mi[0] = cm->mi; + xd->cur_buf = this_frame; + + // Get rd multiplier set up. + rdmult = vp9_compute_rd_mult_based_on_qindex(cpi, tpl_frame->base_qindex); + set_error_per_bit(&cpi->td.mb, rdmult); + vp9_initialize_me_consts(cpi, &cpi->td.mb, tpl_frame->base_qindex); + + tpl_frame->is_valid = 1; + + cm->base_qindex = tpl_frame->base_qindex; + vp9_frame_init_quantizer(cpi); + +#if CONFIG_NON_GREEDY_MV + for (square_block_idx = 0; square_block_idx < SQUARE_BLOCK_SIZES; + ++square_block_idx) { + BLOCK_SIZE square_bsize = square_block_idx_to_bsize(square_block_idx); + build_motion_field(cpi, frame_idx, ref_frame, square_bsize); + } + for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) { + int ref_frame_idx = gf_picture[frame_idx].ref_frame[rf_idx]; + if (ref_frame_idx != -1) { + MotionField *motion_field = vp9_motion_field_info_get_motion_field( + &cpi->motion_field_info, frame_idx, rf_idx, bsize); + predict_mv_mode_arr(cpi, x, gf_picture, motion_field, frame_idx, + tpl_frame, rf_idx, bsize); + } + } +#endif + + for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) { + for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) { + mode_estimation(cpi, x, xd, &sf, gf_picture, frame_idx, tpl_frame, + src_diff, coeff, qcoeff, dqcoeff, mi_row, mi_col, bsize, + tx_size, ref_frame, predictor, &recon_error, &sse); + // Motion flow dependency dispenser. + tpl_model_store(tpl_frame->tpl_stats_ptr, mi_row, mi_col, bsize, + tpl_frame->stride); + + tpl_model_update(cpi->tpl_stats, tpl_frame->tpl_stats_ptr, mi_row, mi_col, + bsize); + } + } +} + +#if CONFIG_NON_GREEDY_MV +#define DUMP_TPL_STATS 0 +#if DUMP_TPL_STATS +static void dump_buf(uint8_t *buf, int stride, int row, int col, int h, int w) { + int i, j; + printf("%d %d\n", h, w); + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + printf("%d ", buf[(row + i) * stride + col + j]); + } + } + printf("\n"); +} + +static void dump_frame_buf(const YV12_BUFFER_CONFIG *frame_buf) { + dump_buf(frame_buf->y_buffer, frame_buf->y_stride, 0, 0, frame_buf->y_height, + frame_buf->y_width); + dump_buf(frame_buf->u_buffer, frame_buf->uv_stride, 0, 0, + frame_buf->uv_height, frame_buf->uv_width); + dump_buf(frame_buf->v_buffer, frame_buf->uv_stride, 0, 0, + frame_buf->uv_height, frame_buf->uv_width); +} + +static void dump_tpl_stats(const VP9_COMP *cpi, int tpl_group_frames, + const GF_GROUP *gf_group, + const GF_PICTURE *gf_picture, BLOCK_SIZE bsize) { + int frame_idx; + const VP9_COMMON *cm = &cpi->common; + int rf_idx; + for (frame_idx = 1; frame_idx < tpl_group_frames; ++frame_idx) { + for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) { + const TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx]; + int mi_row, mi_col; + int ref_frame_idx; + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + ref_frame_idx = gf_picture[frame_idx].ref_frame[rf_idx]; + if (ref_frame_idx != -1) { + YV12_BUFFER_CONFIG *ref_frame_buf = gf_picture[ref_frame_idx].frame; + const int gf_frame_offset = gf_group->frame_gop_index[frame_idx]; + const int ref_gf_frame_offset = + gf_group->frame_gop_index[ref_frame_idx]; + printf("=\n"); + printf( + "frame_idx %d mi_rows %d mi_cols %d bsize %d ref_frame_idx %d " + "rf_idx %d gf_frame_offset %d ref_gf_frame_offset %d\n", + frame_idx, cm->mi_rows, cm->mi_cols, mi_width * MI_SIZE, + ref_frame_idx, rf_idx, gf_frame_offset, ref_gf_frame_offset); + for (mi_row = 0; mi_row < cm->mi_rows; ++mi_row) { + for (mi_col = 0; mi_col < cm->mi_cols; ++mi_col) { + if ((mi_row % mi_height) == 0 && (mi_col % mi_width) == 0) { + int_mv mv = vp9_motion_field_info_get_mv(&cpi->motion_field_info, + frame_idx, rf_idx, bsize, + mi_row, mi_col); + printf("%d %d %d %d\n", mi_row, mi_col, mv.as_mv.row, + mv.as_mv.col); + } + } + } + for (mi_row = 0; mi_row < cm->mi_rows; ++mi_row) { + for (mi_col = 0; mi_col < cm->mi_cols; ++mi_col) { + if ((mi_row % mi_height) == 0 && (mi_col % mi_width) == 0) { + const TplDepStats *tpl_ptr = + &tpl_frame + ->tpl_stats_ptr[mi_row * tpl_frame->stride + mi_col]; + printf("%f ", tpl_ptr->feature_score); + } + } + } + printf("\n"); + + for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) { + for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) { + const int mv_mode = + tpl_frame + ->mv_mode_arr[rf_idx][mi_row * tpl_frame->stride + mi_col]; + printf("%d ", mv_mode); + } + } + printf("\n"); + + dump_frame_buf(gf_picture[frame_idx].frame); + dump_frame_buf(ref_frame_buf); + } + } + } +} +#endif // DUMP_TPL_STATS +#endif // CONFIG_NON_GREEDY_MV + +void vp9_init_tpl_buffer(VP9_COMP *cpi) { + VP9_COMMON *cm = &cpi->common; + int frame; + + const int mi_cols = mi_cols_aligned_to_sb(cm->mi_cols); + const int mi_rows = mi_cols_aligned_to_sb(cm->mi_rows); +#if CONFIG_NON_GREEDY_MV + int rf_idx; + + vpx_free(cpi->select_mv_arr); + CHECK_MEM_ERROR( + cm, cpi->select_mv_arr, + vpx_calloc(mi_rows * mi_cols * 4, sizeof(*cpi->select_mv_arr))); +#endif + + // TODO(jingning): Reduce the actual memory use for tpl model build up. + for (frame = 0; frame < MAX_ARF_GOP_SIZE; ++frame) { + if (cpi->tpl_stats[frame].width >= mi_cols && + cpi->tpl_stats[frame].height >= mi_rows && + cpi->tpl_stats[frame].tpl_stats_ptr) + continue; + +#if CONFIG_NON_GREEDY_MV + for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) { + vpx_free(cpi->tpl_stats[frame].mv_mode_arr[rf_idx]); + CHECK_MEM_ERROR( + cm, cpi->tpl_stats[frame].mv_mode_arr[rf_idx], + vpx_calloc(mi_rows * mi_cols * 4, + sizeof(*cpi->tpl_stats[frame].mv_mode_arr[rf_idx]))); + vpx_free(cpi->tpl_stats[frame].rd_diff_arr[rf_idx]); + CHECK_MEM_ERROR( + cm, cpi->tpl_stats[frame].rd_diff_arr[rf_idx], + vpx_calloc(mi_rows * mi_cols * 4, + sizeof(*cpi->tpl_stats[frame].rd_diff_arr[rf_idx]))); + } +#endif + vpx_free(cpi->tpl_stats[frame].tpl_stats_ptr); + CHECK_MEM_ERROR(cm, cpi->tpl_stats[frame].tpl_stats_ptr, + vpx_calloc(mi_rows * mi_cols, + sizeof(*cpi->tpl_stats[frame].tpl_stats_ptr))); + cpi->tpl_stats[frame].is_valid = 0; + cpi->tpl_stats[frame].width = mi_cols; + cpi->tpl_stats[frame].height = mi_rows; + cpi->tpl_stats[frame].stride = mi_cols; + cpi->tpl_stats[frame].mi_rows = cm->mi_rows; + cpi->tpl_stats[frame].mi_cols = cm->mi_cols; + } + + for (frame = 0; frame < REF_FRAMES; ++frame) { + cpi->enc_frame_buf[frame].mem_valid = 0; + cpi->enc_frame_buf[frame].released = 1; + } +} + +void vp9_free_tpl_buffer(VP9_COMP *cpi) { + int frame; +#if CONFIG_NON_GREEDY_MV + vp9_free_motion_field_info(&cpi->motion_field_info); + vpx_free(cpi->select_mv_arr); +#endif + for (frame = 0; frame < MAX_ARF_GOP_SIZE; ++frame) { +#if CONFIG_NON_GREEDY_MV + int rf_idx; + for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) { + vpx_free(cpi->tpl_stats[frame].mv_mode_arr[rf_idx]); + vpx_free(cpi->tpl_stats[frame].rd_diff_arr[rf_idx]); + } +#endif + vpx_free(cpi->tpl_stats[frame].tpl_stats_ptr); + cpi->tpl_stats[frame].is_valid = 0; + } +} + +#if CONFIG_RATE_CTRL +static void accumulate_frame_tpl_stats(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + const GF_GROUP *gf_group = &cpi->twopass.gf_group; + int show_frame_count = 0; + int frame_idx; + // Accumulate tpl stats for each frame in the current group of picture. + for (frame_idx = 1; frame_idx < gf_group->gf_group_size; ++frame_idx) { + TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx]; + TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr; + const int tpl_stride = tpl_frame->stride; + int64_t intra_cost_base = 0; + int64_t inter_cost_base = 0; + int64_t mc_dep_cost_base = 0; + int64_t mc_ref_cost_base = 0; + int64_t mc_flow_base = 0; + int row, col; + + if (!tpl_frame->is_valid) continue; + + for (row = 0; row < cm->mi_rows && tpl_frame->is_valid; ++row) { + for (col = 0; col < cm->mi_cols; ++col) { + TplDepStats *this_stats = &tpl_stats[row * tpl_stride + col]; + intra_cost_base += this_stats->intra_cost; + inter_cost_base += this_stats->inter_cost; + mc_dep_cost_base += this_stats->mc_dep_cost; + mc_ref_cost_base += this_stats->mc_ref_cost; + mc_flow_base += this_stats->mc_flow; + } + } + + cpi->tpl_stats_info[show_frame_count].intra_cost = intra_cost_base; + cpi->tpl_stats_info[show_frame_count].inter_cost = inter_cost_base; + cpi->tpl_stats_info[show_frame_count].mc_dep_cost = mc_dep_cost_base; + cpi->tpl_stats_info[show_frame_count].mc_ref_cost = mc_ref_cost_base; + cpi->tpl_stats_info[show_frame_count].mc_flow = mc_flow_base; + + ++show_frame_count; + } +} +#endif // CONFIG_RATE_CTRL + +void vp9_setup_tpl_stats(VP9_COMP *cpi) { + GF_PICTURE gf_picture[MAX_ARF_GOP_SIZE]; + const GF_GROUP *gf_group = &cpi->twopass.gf_group; + int tpl_group_frames = 0; + int frame_idx; + cpi->tpl_bsize = BLOCK_32X32; + + init_gop_frames(cpi, gf_picture, gf_group, &tpl_group_frames); + + init_tpl_stats(cpi); + + // Backward propagation from tpl_group_frames to 1. + for (frame_idx = tpl_group_frames - 1; frame_idx > 0; --frame_idx) { + if (gf_picture[frame_idx].update_type == USE_BUF_FRAME) continue; + mc_flow_dispenser(cpi, gf_picture, frame_idx, cpi->tpl_bsize); + } +#if CONFIG_NON_GREEDY_MV + cpi->tpl_ready = 1; +#if DUMP_TPL_STATS + dump_tpl_stats(cpi, tpl_group_frames, gf_group, gf_picture, cpi->tpl_bsize); +#endif // DUMP_TPL_STATS +#endif // CONFIG_NON_GREEDY_MV + +#if CONFIG_RATE_CTRL + if (cpi->oxcf.use_simple_encode_api) { + accumulate_frame_tpl_stats(cpi); + } +#endif // CONFIG_RATE_CTRL +} diff --git a/vp9/encoder/vp9_tpl_model.h b/vp9/encoder/vp9_tpl_model.h new file mode 100644 index 0000000000..86a7734f82 --- /dev/null +++ b/vp9/encoder/vp9_tpl_model.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_ENCODER_VP9_TPL_MODEL_H_ +#define VPX_VP9_ENCODER_VP9_TPL_MODEL_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef M_LOG2_E +#define M_LOG2_E 0.693147180559945309417 +#endif +#define log2f(x) (log(x) / (float)M_LOG2_E) + +typedef struct GF_PICTURE { + YV12_BUFFER_CONFIG *frame; + int ref_frame[3]; + FRAME_UPDATE_TYPE update_type; +} GF_PICTURE; + +void vp9_init_tpl_buffer(VP9_COMP *cpi); +void vp9_setup_tpl_stats(VP9_COMP *cpi); +void vp9_free_tpl_buffer(VP9_COMP *cpi); + +void vp9_wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff, + TX_SIZE tx_size); +#if CONFIG_VP9_HIGHBITDEPTH +void vp9_highbd_wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff, + TX_SIZE tx_size); +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP9_ENCODER_VP9_TPL_MODEL_H_ diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk index 9072628f23..ae8fb85d87 100644 --- a/vp9/vp9cx.mk +++ b/vp9/vp9cx.mk @@ -104,6 +104,8 @@ VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/vp9_postproc.c endif VP9_CX_SRCS-yes += encoder/vp9_temporal_filter.c VP9_CX_SRCS-yes += encoder/vp9_temporal_filter.h +VP9_CX_SRCS-yes += encoder/vp9_tpl_model.c +VP9_CX_SRCS-yes += encoder/vp9_tpl_model.h VP9_CX_SRCS-yes += encoder/vp9_mbgraph.c VP9_CX_SRCS-yes += encoder/vp9_mbgraph.h From 25a6b2b181db9896070b2fc1239dc28ba6f3115d Mon Sep 17 00:00:00 2001 From: Salome Thirot Date: Tue, 7 Feb 2023 11:28:15 +0000 Subject: [PATCH 0595/1000] Use 4D reduction Neon helper for standard bitdepth SAD4D Move the 4D reduction helper function to sum_neon.h and use this for both standard and high bitdepth SAD4D paths. This also removes the AArch64 requirement for using the UDOT Neon SAD4D paths. Change-Id: I207f76b3d42aa541809b0672c3b3d86e54d133ff --- vpx_dsp/arm/highbd_sad4d_neon.c | 23 ++++---------------- vpx_dsp/arm/sad4d_neon.c | 37 ++++++++++++++++----------------- vpx_dsp/arm/sum_neon.h | 16 ++++++++++++++ 3 files changed, 38 insertions(+), 38 deletions(-) diff --git a/vpx_dsp/arm/highbd_sad4d_neon.c b/vpx_dsp/arm/highbd_sad4d_neon.c index d5e9e8ad22..f731d38cc1 100644 --- a/vpx_dsp/arm/highbd_sad4d_neon.c +++ b/vpx_dsp/arm/highbd_sad4d_neon.c @@ -17,21 +17,6 @@ #include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/sum_neon.h" -static INLINE uint32x4_t horizontal_add_4d_u32(uint32x4_t sum[4]) { -#if defined(__aarch64__) - uint32x4_t res01 = vpaddq_u32(sum[0], sum[1]); - uint32x4_t res23 = vpaddq_u32(sum[2], sum[3]); - return vpaddq_u32(res01, res23); -#else - uint32x4_t res = vdupq_n_u32(0); - res = vsetq_lane_u32(horizontal_add_uint32x4(sum[0]), res, 0); - res = vsetq_lane_u32(horizontal_add_uint32x4(sum[1]), res, 1); - res = vsetq_lane_u32(horizontal_add_uint32x4(sum[2]), res, 2); - res = vsetq_lane_u32(horizontal_add_uint32x4(sum[3]), res, 3); - return res; -#endif -} - static INLINE void highbd_sad4xhx4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_ptr[4], int ref_stride, uint32_t res[4], @@ -60,7 +45,7 @@ static INLINE void highbd_sad4xhx4d_neon(const uint8_t *src_ptr, int src_stride, } while (++i < h); - vst1q_u32(res, horizontal_add_4d_u32(sum)); + vst1q_u32(res, horizontal_add_4d_uint32x4(sum)); } static INLINE void sad8_neon(uint16x8_t src, uint16x8_t ref, @@ -93,7 +78,7 @@ static INLINE void highbd_sad8xhx4d_neon(const uint8_t *src_ptr, int src_stride, } while (++i < h); - vst1q_u32(res, horizontal_add_4d_u32(sum)); + vst1q_u32(res, horizontal_add_4d_uint32x4(sum)); } static INLINE void highbd_sad16xhx4d_neon(const uint8_t *src_ptr, @@ -136,7 +121,7 @@ static INLINE void highbd_sad16xhx4d_neon(const uint8_t *src_ptr, sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]); sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]); - vst1q_u32(res, horizontal_add_4d_u32(sum)); + vst1q_u32(res, horizontal_add_4d_uint32x4(sum)); } static INLINE void highbd_sadwxhx4d_neon(const uint8_t *src_ptr, int src_stride, @@ -203,7 +188,7 @@ static INLINE void highbd_sadwxhx4d_neon(const uint8_t *src_ptr, int src_stride, sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]); sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]); - vst1q_u32(res, horizontal_add_4d_u32(sum)); + vst1q_u32(res, horizontal_add_4d_uint32x4(sum)); } static INLINE void highbd_sad64xhx4d_neon(const uint8_t *src_ptr, diff --git a/vpx_dsp/arm/sad4d_neon.c b/vpx_dsp/arm/sad4d_neon.c index 85f6c1e5b1..9509573939 100644 --- a/vpx_dsp/arm/sad4d_neon.c +++ b/vpx_dsp/arm/sad4d_neon.c @@ -17,7 +17,7 @@ #include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/sum_neon.h" -#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) +#if defined(__ARM_FEATURE_DOTPROD) static INLINE void sad16_neon(uint8x16_t src, uint8x16_t ref, uint32x4_t *const sad_sum) { @@ -28,11 +28,11 @@ static INLINE void sad16_neon(uint8x16_t src, uint8x16_t ref, static INLINE void sad64xhx4d_neon(const uint8_t *src, int src_stride, const uint8_t *const ref[4], int ref_stride, uint32_t res[4], int h) { - uint32x4_t res0, res1; uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) }; uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) }; + uint32x4_t sum[4]; int i = 0; do { @@ -65,21 +65,22 @@ static INLINE void sad64xhx4d_neon(const uint8_t *src, int src_stride, i++; } while (i < h); - res0 = vpaddq_u32(vaddq_u32(sum_lo[0], sum_hi[0]), - vaddq_u32(sum_lo[1], sum_hi[1])); - res1 = vpaddq_u32(vaddq_u32(sum_lo[2], sum_hi[2]), - vaddq_u32(sum_lo[3], sum_hi[3])); - vst1q_u32(res, vpaddq_u32(res0, res1)); + sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]); + sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]); + sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]); + sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]); + + vst1q_u32(res, horizontal_add_4d_uint32x4(sum)); } static INLINE void sad32xhx4d_neon(const uint8_t *src, int src_stride, const uint8_t *const ref[4], int ref_stride, uint32_t res[4], int h) { - uint32x4_t res0, res1; uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) }; uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) }; + uint32x4_t sum[4]; int i = 0; do { @@ -100,17 +101,17 @@ static INLINE void sad32xhx4d_neon(const uint8_t *src, int src_stride, i++; } while (i < h); - res0 = vpaddq_u32(vaddq_u32(sum_lo[0], sum_hi[0]), - vaddq_u32(sum_lo[1], sum_hi[1])); - res1 = vpaddq_u32(vaddq_u32(sum_lo[2], sum_hi[2]), - vaddq_u32(sum_lo[3], sum_hi[3])); - vst1q_u32(res, vpaddq_u32(res0, res1)); + sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]); + sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]); + sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]); + sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]); + + vst1q_u32(res, horizontal_add_4d_uint32x4(sum)); } static INLINE void sad16xhx4d_neon(const uint8_t *src, int src_stride, const uint8_t *const ref[4], int ref_stride, uint32_t res[4], int h) { - uint32x4_t res0, res1; uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) }; @@ -125,12 +126,10 @@ static INLINE void sad16xhx4d_neon(const uint8_t *src, int src_stride, i++; } while (i < h); - res0 = vpaddq_u32(sum[0], sum[1]); - res1 = vpaddq_u32(sum[2], sum[3]); - vst1q_u32(res, vpaddq_u32(res0, res1)); + vst1q_u32(res, horizontal_add_4d_uint32x4(sum)); } -#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)) +#else // !defined(__ARM_FEATURE_DOTPROD)) static INLINE void sad16_neon(uint8x16_t src, uint8x16_t ref, uint16x8_t *const sad_sum) { @@ -246,7 +245,7 @@ static INLINE void sad16xhx4d_neon(const uint8_t *src, int src_stride, res[3] = horizontal_add_uint16x8(sum[3]); } -#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) +#endif // defined(__ARM_FEATURE_DOTPROD) static INLINE void sad8_neon(uint8x8_t src, uint8x8_t ref, uint16x8_t *const sad_sum) { diff --git a/vpx_dsp/arm/sum_neon.h b/vpx_dsp/arm/sum_neon.h index 5f20f9d99a..21560837ae 100644 --- a/vpx_dsp/arm/sum_neon.h +++ b/vpx_dsp/arm/sum_neon.h @@ -94,4 +94,20 @@ static INLINE uint32_t horizontal_add_uint32x4(const uint32x4_t a) { return vget_lane_u32(c, 0); #endif } + +static INLINE uint32x4_t horizontal_add_4d_uint32x4(const uint32x4_t sum[4]) { +#if defined(__aarch64__) + uint32x4_t res01 = vpaddq_u32(sum[0], sum[1]); + uint32x4_t res23 = vpaddq_u32(sum[2], sum[3]); + return vpaddq_u32(res01, res23); +#else + uint32x4_t res = vdupq_n_u32(0); + res = vsetq_lane_u32(horizontal_add_uint32x4(sum[0]), res, 0); + res = vsetq_lane_u32(horizontal_add_uint32x4(sum[1]), res, 1); + res = vsetq_lane_u32(horizontal_add_uint32x4(sum[2]), res, 2); + res = vsetq_lane_u32(horizontal_add_uint32x4(sum[3]), res, 3); + return res; +#endif +} + #endif // VPX_VPX_DSP_ARM_SUM_NEON_H_ From 03ddac40dfc9533a3038219b0bc819a5e6375227 Mon Sep 17 00:00:00 2001 From: chiyotsai Date: Tue, 7 Feb 2023 11:11:35 -0800 Subject: [PATCH 0596/1000] Enable some speed features on speed 0 Performance: | SPD_SET | TESTSET | AVG_PSNR | OVR_PSNR | SSIM | ENC_T | |---------|---------|----------|----------|---------|-------| | 0 | hdres2 | +0.069% | +0.067% | +0.100% | -8.6% | | 0 | midres2 | +0.116% | +0.103% | +0.062% | -9.6% | | 0 | lowres2 | +0.276% | +0.283% | +0.214% |-11.9% | STATS_CHANGED Change-Id: I8b26c0be2312fcd0f8c9e889367682e80ea8de4b --- test/vp9_ext_ratectrl_test.cc | 2 +- vp9/encoder/vp9_speed_features.c | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/test/vp9_ext_ratectrl_test.cc b/test/vp9_ext_ratectrl_test.cc index 2bfa6281d7..739c0b7f8e 100644 --- a/test/vp9_ext_ratectrl_test.cc +++ b/test/vp9_ext_ratectrl_test.cc @@ -41,7 +41,7 @@ constexpr int kDefaultMaxGfInterval = 16; constexpr int kReadMinGfInterval = 5; constexpr int kReadMaxGfInterval = 13; const char kTestFileName[] = "bus_352x288_420_f20_b8.yuv"; -const double kPsnrThreshold = 30.50; +const double kPsnrThreshold = 30.4; struct ToyRateCtrl { int magic_number; diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c index 0431d8a452..6ce90c0887 100644 --- a/vp9/encoder/vp9_speed_features.c +++ b/vp9/encoder/vp9_speed_features.c @@ -209,15 +209,18 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, const int boosted = frame_is_boosted(cpi); int i; - sf->tx_size_search_breakout = 1; + sf->adaptive_pred_interp_filter = 1; sf->adaptive_rd_thresh = 1; sf->adaptive_rd_thresh_row_mt = 0; sf->allow_skip_recode = 1; sf->less_rectangular_check = 1; - sf->use_square_partition_only = !boosted; + sf->mv.auto_mv_step_size = 1; sf->prune_ref_frame_for_rect_partitions = 1; - sf->rd_ml_partition.var_pruning = 1; + sf->temporal_filter_search_method = NSTEP; + sf->tx_size_search_breakout = 1; + sf->use_square_partition_only = !boosted; + sf->rd_ml_partition.var_pruning = 1; sf->rd_ml_partition.prune_rect_thresh[0] = -1; sf->rd_ml_partition.prune_rect_thresh[1] = 350; sf->rd_ml_partition.prune_rect_thresh[2] = 325; @@ -238,7 +241,6 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, } if (speed >= 1) { - sf->temporal_filter_search_method = NSTEP; sf->rd_ml_partition.var_pruning = !boosted; sf->rd_ml_partition.prune_rect_thresh[1] = 225; sf->rd_ml_partition.prune_rect_thresh[2] = 225; @@ -263,11 +265,9 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, sf->less_rectangular_check = 1; sf->use_rd_breakout = 1; sf->adaptive_motion_search = 1; - sf->mv.auto_mv_step_size = 1; sf->adaptive_rd_thresh = 2; sf->mv.subpel_search_level = 1; if (cpi->oxcf.content != VP9E_CONTENT_FILM) sf->mode_skip_start = 10; - sf->adaptive_pred_interp_filter = 1; sf->allow_acl = 0; sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V; From d6eb9696aa72473c1a11d34d928d35a3acc0c9a9 Mon Sep 17 00:00:00 2001 From: Cheng Chen Date: Mon, 30 Jan 2023 11:51:58 -0800 Subject: [PATCH 0597/1000] Fix unsigned integer overflow in sse computation Basically port the fix from libaom: https://aomedia-review.googlesource.com/c/aom/+/169361 Change-Id: Id06a5db91372037832399200ded75d514e096726 (cherry picked from commit a94cdd57ffd95ee7beb48d2794dae538f25da46c) --- vpx_dsp/psnr.c | 67 ++++++++++++++++++-------------------------------- 1 file changed, 24 insertions(+), 43 deletions(-) diff --git a/vpx_dsp/psnr.c b/vpx_dsp/psnr.c index 48bac04508..f0d4e927ae 100644 --- a/vpx_dsp/psnr.c +++ b/vpx_dsp/psnr.c @@ -26,57 +26,44 @@ double vpx_sse_to_psnr(double samples, double peak, double sse) { /* TODO(yaowu): The block_variance calls the unoptimized versions of variance() * and highbd_8_variance(). It should not. */ -static void encoder_variance(const uint8_t *a, int a_stride, const uint8_t *b, - int b_stride, int w, int h, unsigned int *sse, - int *sum) { +static int64_t encoder_sse(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int w, int h) { int i, j; - - *sum = 0; - *sse = 0; + int64_t sse = 0; for (i = 0; i < h; i++) { for (j = 0; j < w; j++) { const int diff = a[j] - b[j]; - *sum += diff; - *sse += diff * diff; + sse += diff * diff; } a += a_stride; b += b_stride; } + + return sse; } #if CONFIG_VP9_HIGHBITDEPTH -static void encoder_highbd_variance64(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, int w, - int h, uint64_t *sse, int64_t *sum) { +static int64_t encoder_highbd_8_sse(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, int w, + int h) { int i, j; + int64_t sse = 0; uint16_t *a = CONVERT_TO_SHORTPTR(a8); uint16_t *b = CONVERT_TO_SHORTPTR(b8); - *sum = 0; - *sse = 0; for (i = 0; i < h; i++) { for (j = 0; j < w; j++) { const int diff = a[j] - b[j]; - *sum += diff; - *sse += diff * diff; + sse += diff * diff; } a += a_stride; b += b_stride; } -} -static void encoder_highbd_8_variance(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, int w, - int h, unsigned int *sse, int *sum) { - uint64_t sse_long = 0; - int64_t sum_long = 0; - encoder_highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, - &sum_long); - *sse = (unsigned int)sse_long; - *sum = (int)sum_long; + return sse; } #endif // CONFIG_VP9_HIGHBITDEPTH @@ -85,26 +72,23 @@ static int64_t get_sse(const uint8_t *a, int a_stride, const uint8_t *b, const int dw = width % 16; const int dh = height % 16; int64_t total_sse = 0; - unsigned int sse = 0; - int sum = 0; int x, y; if (dw > 0) { - encoder_variance(&a[width - dw], a_stride, &b[width - dw], b_stride, dw, - height, &sse, &sum); - total_sse += sse; + total_sse += encoder_sse(&a[width - dw], a_stride, &b[width - dw], b_stride, + dw, height); } if (dh > 0) { - encoder_variance(&a[(height - dh) * a_stride], a_stride, - &b[(height - dh) * b_stride], b_stride, width - dw, dh, - &sse, &sum); - total_sse += sse; + total_sse += + encoder_sse(&a[(height - dh) * a_stride], a_stride, + &b[(height - dh) * b_stride], b_stride, width - dw, dh); } for (y = 0; y < height / 16; ++y) { const uint8_t *pa = a; const uint8_t *pb = b; + unsigned int sse; for (x = 0; x < width / 16; ++x) { vpx_mse16x16(pa, a_stride, pb, b_stride, &sse); total_sse += sse; @@ -146,22 +130,19 @@ static int64_t highbd_get_sse(const uint8_t *a, int a_stride, const uint8_t *b, int x, y; const int dw = width % 16; const int dh = height % 16; - unsigned int sse = 0; - int sum = 0; if (dw > 0) { - encoder_highbd_8_variance(&a[width - dw], a_stride, &b[width - dw], - b_stride, dw, height, &sse, &sum); - total_sse += sse; + total_sse += encoder_highbd_8_sse(&a[width - dw], a_stride, &b[width - dw], + b_stride, dw, height); } if (dh > 0) { - encoder_highbd_8_variance(&a[(height - dh) * a_stride], a_stride, - &b[(height - dh) * b_stride], b_stride, - width - dw, dh, &sse, &sum); - total_sse += sse; + total_sse += encoder_highbd_8_sse(&a[(height - dh) * a_stride], a_stride, + &b[(height - dh) * b_stride], b_stride, + width - dw, dh); } for (y = 0; y < height / 16; ++y) { const uint8_t *pa = a; const uint8_t *pb = b; + unsigned int sse; for (x = 0; x < width / 16; ++x) { vpx_highbd_8_mse16x16(pa, a_stride, pb, b_stride, &sse); total_sse += sse; From bb065c6c6dbdfc24678e926501e9db13afb2ec12 Mon Sep 17 00:00:00 2001 From: Salome Thirot Date: Wed, 8 Feb 2023 17:05:25 +0000 Subject: [PATCH 0598/1000] Add missing high bitdepth Neon subpel variance tests Add missing 4x4 and 4x8 tests for both high bitdepth sub-pixel variance and high bitdepth averaging sub-pixel variance. Change-Id: I042752c5b7ccc14f58075694d0bb1d36f144ad06 --- test/variance_test.cc | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/test/variance_test.cc b/test/variance_test.cc index a6c8ef0480..33f09209f4 100644 --- a/test/variance_test.cc +++ b/test/variance_test.cc @@ -1572,6 +1572,10 @@ INSTANTIATE_TEST_SUITE_P( 12), SubpelVarianceParams(3, 2, &vpx_highbd_12_sub_pixel_variance8x4_neon, 12), + SubpelVarianceParams(2, 3, &vpx_highbd_12_sub_pixel_variance4x8_neon, + 12), + SubpelVarianceParams(2, 2, &vpx_highbd_12_sub_pixel_variance4x4_neon, + 12), SubpelVarianceParams(6, 6, &vpx_highbd_10_sub_pixel_variance64x64_neon, 10), SubpelVarianceParams(6, 5, &vpx_highbd_10_sub_pixel_variance64x32_neon, @@ -1594,6 +1598,10 @@ INSTANTIATE_TEST_SUITE_P( 10), SubpelVarianceParams(3, 2, &vpx_highbd_10_sub_pixel_variance8x4_neon, 10), + SubpelVarianceParams(2, 3, &vpx_highbd_10_sub_pixel_variance4x8_neon, + 10), + SubpelVarianceParams(2, 2, &vpx_highbd_10_sub_pixel_variance4x4_neon, + 10), SubpelVarianceParams(6, 6, &vpx_highbd_8_sub_pixel_variance64x64_neon, 8), SubpelVarianceParams(6, 5, &vpx_highbd_8_sub_pixel_variance64x32_neon, @@ -1613,7 +1621,9 @@ INSTANTIATE_TEST_SUITE_P( SubpelVarianceParams(3, 4, &vpx_highbd_8_sub_pixel_variance8x16_neon, 8), SubpelVarianceParams(3, 3, &vpx_highbd_8_sub_pixel_variance8x8_neon, 8), - SubpelVarianceParams(3, 2, &vpx_highbd_8_sub_pixel_variance8x4_neon, + SubpelVarianceParams(3, 2, &vpx_highbd_8_sub_pixel_variance8x4_neon, 8), + SubpelVarianceParams(2, 3, &vpx_highbd_8_sub_pixel_variance4x8_neon, 8), + SubpelVarianceParams(2, 2, &vpx_highbd_8_sub_pixel_variance4x4_neon, 8))); INSTANTIATE_TEST_SUITE_P( @@ -1652,6 +1662,12 @@ INSTANTIATE_TEST_SUITE_P( SubpelAvgVarianceParams(3, 2, &vpx_highbd_12_sub_pixel_avg_variance8x4_neon, 12), + SubpelAvgVarianceParams(2, 3, + &vpx_highbd_12_sub_pixel_avg_variance4x8_neon, + 12), + SubpelAvgVarianceParams(2, 2, + &vpx_highbd_12_sub_pixel_avg_variance4x4_neon, + 12), SubpelAvgVarianceParams(6, 6, &vpx_highbd_10_sub_pixel_avg_variance64x64_neon, 10), @@ -1685,6 +1701,12 @@ INSTANTIATE_TEST_SUITE_P( SubpelAvgVarianceParams(3, 2, &vpx_highbd_10_sub_pixel_avg_variance8x4_neon, 10), + SubpelAvgVarianceParams(2, 3, + &vpx_highbd_10_sub_pixel_avg_variance4x8_neon, + 10), + SubpelAvgVarianceParams(2, 2, + &vpx_highbd_10_sub_pixel_avg_variance4x4_neon, + 10), SubpelAvgVarianceParams(6, 6, &vpx_highbd_8_sub_pixel_avg_variance64x64_neon, 8), @@ -1717,6 +1739,12 @@ INSTANTIATE_TEST_SUITE_P( 8), SubpelAvgVarianceParams(3, 2, &vpx_highbd_8_sub_pixel_avg_variance8x4_neon, + 8), + SubpelAvgVarianceParams(2, 3, + &vpx_highbd_8_sub_pixel_avg_variance4x8_neon, + 8), + SubpelAvgVarianceParams(2, 2, + &vpx_highbd_8_sub_pixel_avg_variance4x4_neon, 8))); #endif // CONFIG_VP9_HIGHBITDEPTH From b6951d2b0f44c6e52e981929a2632022b101dc1c Mon Sep 17 00:00:00 2001 From: chiyotsai Date: Wed, 8 Feb 2023 14:01:19 -0800 Subject: [PATCH 0599/1000] Copy BLOCK_8X8's mi to PICK_MODE_CONTEXT::mic STATS_CHANGED BUG=webm:1789 Change-Id: I74efe28bdf90a179c59fe3d1f5a15d497f57080d --- vp9/encoder/vp9_rdopt.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index d9b031cdc8..498bc0fbd5 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -4005,13 +4005,9 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, } if (best_mode_index < 0 || best_rd >= best_rd_so_far) { -// If adaptive interp filter is enabled, then the current leaf node of 8x8 -// data is needed for sub8x8. Hence preserve the context. -#if CONFIG_CONSISTENT_RECODE + // If adaptive interp filter is enabled, then the current leaf node of 8x8 + // data is needed for sub8x8. Hence preserve the context. if (bsize == BLOCK_8X8) ctx->mic = *xd->mi[0]; -#else - if (cpi->row_mt && bsize == BLOCK_8X8) ctx->mic = *xd->mi[0]; -#endif rd_cost->rate = INT_MAX; rd_cost->rdcost = INT64_MAX; return; From 459cfc8bae26afde6a16421b6f0e5ff5269ebb80 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Thu, 9 Feb 2023 11:57:10 +0000 Subject: [PATCH 0600/1000] Optimize Neon high bitdepth convolve copy Use standard loads and stores instead of the significantly slower interleaving/de-interleaving variants. Also move all loads in loop bodies above all stores as a mitigation against the compiler thinking that the src and dst pointers alias (since we can't use restrict in C89.) Change-Id: Idd59dca51387f553f8db27144a2b8f2377c937d3 --- vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c | 106 +++++++++++--------- 1 file changed, 59 insertions(+), 47 deletions(-) diff --git a/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c b/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c index 9d2752e097..7751082083 100644 --- a/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c +++ b/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c @@ -26,76 +26,88 @@ void vpx_highbd_convolve_copy_neon(const uint16_t *src, ptrdiff_t src_stride, (void)bd; if (w < 8) { // copy4 + uint16x4_t s0, s1; do { - vst1_u16(dst, vld1_u16(src)); + s0 = vld1_u16(src); src += src_stride; - dst += dst_stride; - vst1_u16(dst, vld1_u16(src)); + s1 = vld1_u16(src); src += src_stride; + + vst1_u16(dst, s0); + dst += dst_stride; + vst1_u16(dst, s1); dst += dst_stride; h -= 2; - } while (h > 0); + } while (h != 0); } else if (w == 8) { // copy8 + uint16x8_t s0, s1; do { - vst1q_u16(dst, vld1q_u16(src)); + s0 = vld1q_u16(src); src += src_stride; - dst += dst_stride; - vst1q_u16(dst, vld1q_u16(src)); + s1 = vld1q_u16(src); src += src_stride; + + vst1q_u16(dst, s0); + dst += dst_stride; + vst1q_u16(dst, s1); dst += dst_stride; h -= 2; - } while (h > 0); + } while (h != 0); } else if (w < 32) { // copy16 + uint16x8_t s0, s1, s2, s3; do { - vst2q_u16(dst, vld2q_u16(src)); - src += src_stride; - dst += dst_stride; - vst2q_u16(dst, vld2q_u16(src)); + s0 = vld1q_u16(src); + s1 = vld1q_u16(src + 8); src += src_stride; - dst += dst_stride; - vst2q_u16(dst, vld2q_u16(src)); + s2 = vld1q_u16(src); + s3 = vld1q_u16(src + 8); src += src_stride; + + vst1q_u16(dst, s0); + vst1q_u16(dst + 8, s1); dst += dst_stride; - vst2q_u16(dst, vld2q_u16(src)); - src += src_stride; + vst1q_u16(dst, s2); + vst1q_u16(dst + 8, s3); dst += dst_stride; - h -= 4; - } while (h > 0); + h -= 2; + } while (h != 0); } else if (w == 32) { // copy32 + uint16x8_t s0, s1, s2, s3; do { - vst4q_u16(dst, vld4q_u16(src)); - src += src_stride; - dst += dst_stride; - vst4q_u16(dst, vld4q_u16(src)); - src += src_stride; - dst += dst_stride; - vst4q_u16(dst, vld4q_u16(src)); - src += src_stride; - dst += dst_stride; - vst4q_u16(dst, vld4q_u16(src)); + s0 = vld1q_u16(src); + s1 = vld1q_u16(src + 8); + s2 = vld1q_u16(src + 16); + s3 = vld1q_u16(src + 24); src += src_stride; + + vst1q_u16(dst, s0); + vst1q_u16(dst + 8, s1); + vst1q_u16(dst + 16, s2); + vst1q_u16(dst + 24, s3); dst += dst_stride; - h -= 4; - } while (h > 0); + } while (--h != 0); } else { // copy64 + uint16x8_t s0, s1, s2, s3, s4, s5, s6, s7; do { - vst4q_u16(dst, vld4q_u16(src)); - vst4q_u16(dst + 32, vld4q_u16(src + 32)); - src += src_stride; - dst += dst_stride; - vst4q_u16(dst, vld4q_u16(src)); - vst4q_u16(dst + 32, vld4q_u16(src + 32)); + s0 = vld1q_u16(src); + s1 = vld1q_u16(src + 8); + s2 = vld1q_u16(src + 16); + s3 = vld1q_u16(src + 24); + s4 = vld1q_u16(src + 32); + s5 = vld1q_u16(src + 40); + s6 = vld1q_u16(src + 48); + s7 = vld1q_u16(src + 56); src += src_stride; - dst += dst_stride; - vst4q_u16(dst, vld4q_u16(src)); - vst4q_u16(dst + 32, vld4q_u16(src + 32)); - src += src_stride; - dst += dst_stride; - vst4q_u16(dst, vld4q_u16(src)); - vst4q_u16(dst + 32, vld4q_u16(src + 32)); - src += src_stride; - dst += dst_stride; - h -= 4; - } while (h > 0); + + vst1q_u16(dst, s0); + vst1q_u16(dst + 8, s1); + vst1q_u16(dst + 16, s2); + vst1q_u16(dst + 24, s3); + vst1q_u16(dst + 32, s4); + vst1q_u16(dst + 40, s5); + vst1q_u16(dst + 48, s6); + vst1q_u16(dst + 56, s7); + dst += dst_stride; + } while (--h != 0); } } From 5edaa583e14473464b86c6a5ab29fb793bd805e7 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Tue, 7 Feb 2023 17:22:12 -0500 Subject: [PATCH 0601/1000] Remove onyx_int.h from vp8 rc header Also move the FRAME_TYPE declaration to common.h Bug: webm:1766 Change-Id: Ic3016bd16548a5d2e0ae828a7fd7ad8adda8b8f6 --- test/vp8_ratectrl_rtc_test.cc | 7 ++++--- test/vp9_ratectrl_rtc_test.cc | 15 +++++++++------ vp8/vp8_ratectrl_rtc.cc | 11 ++++++++++- vp8/vp8_ratectrl_rtc.h | 20 ++++++++------------ vp9/ratectrl_rtc.cc | 25 ++++++++++++++++++++++++- vp9/ratectrl_rtc.h | 31 +++++-------------------------- vpx/internal/vpx_ratectrl_rtc.h | 3 +++ 7 files changed, 63 insertions(+), 49 deletions(-) diff --git a/test/vp8_ratectrl_rtc_test.cc b/test/vp8_ratectrl_rtc_test.cc index 7410f3c01d..56c26a99f4 100644 --- a/test/vp8_ratectrl_rtc_test.cc +++ b/test/vp8_ratectrl_rtc_test.cc @@ -127,14 +127,15 @@ class Vp8RcInterfaceTest encoder->Control(VP8E_SET_CPUUSED, -6); encoder->Control(VP8E_SET_RTC_EXTERNAL_RATECTRL, 1); encoder->Control(VP8E_SET_MAX_INTRA_BITRATE_PCT, 1000); - } else if (frame_params_.frame_type == INTER_FRAME) { + } else if (frame_params_.frame_type == libvpx::RcFrameType::kInterFrame) { // Disable golden frame update. frame_flags_ |= VP8_EFLAG_NO_UPD_GF; frame_flags_ |= VP8_EFLAG_NO_UPD_ARF; } } - frame_params_.frame_type = - video->frame() % key_interval_ == 0 ? KEY_FRAME : INTER_FRAME; + frame_params_.frame_type = video->frame() % key_interval_ == 0 + ? libvpx::RcFrameType::kKeyFrame + : libvpx::RcFrameType::kInterFrame; encoder_exit_ = video->frame() == test_video_.frames; } diff --git a/test/vp9_ratectrl_rtc_test.cc b/test/vp9_ratectrl_rtc_test.cc index 1d1a78f43d..cce73fcce2 100644 --- a/test/vp9_ratectrl_rtc_test.cc +++ b/test/vp9_ratectrl_rtc_test.cc @@ -57,9 +57,11 @@ class RcInterfaceTest encoder->Control(VP8E_SET_MAX_INTRA_BITRATE_PCT, 1000); encoder->Control(VP9E_SET_RTC_EXTERNAL_RATECTRL, 1); } - frame_params_.frame_type = - video->frame() % key_interval_ == 0 ? KEY_FRAME : INTER_FRAME; - if (rc_cfg_.rc_mode == VPX_CBR && frame_params_.frame_type == INTER_FRAME) { + frame_params_.frame_type = video->frame() % key_interval_ == 0 + ? libvpx::RcFrameType::kKeyFrame + : libvpx::RcFrameType::kInterFrame; + if (rc_cfg_.rc_mode == VPX_CBR && + frame_params_.frame_type == libvpx::RcFrameType::kInterFrame) { // Disable golden frame update. frame_flags_ |= VP8_EFLAG_NO_UPD_GF; frame_flags_ |= VP8_EFLAG_NO_UPD_ARF; @@ -183,8 +185,9 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest, encoder->Control(VP9E_SET_SVC, 1); encoder->Control(VP9E_SET_SVC_PARAMETERS, &svc_params_); } - frame_params_.frame_type = - video->frame() % key_interval_ == 0 ? KEY_FRAME : INTER_FRAME; + frame_params_.frame_type = video->frame() % key_interval_ == 0 + ? libvpx::RcFrameType::kKeyFrame + : libvpx::RcFrameType::kInterFrame; encoder_exit_ = video->frame() == kNumFrames; current_superframe_ = video->frame(); if (dynamic_spatial_layers_ == 1) { @@ -247,7 +250,7 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest, else frame_params_.temporal_layer_id = 0; rc_api_->ComputeQP(frame_params_); - frame_params_.frame_type = INTER_FRAME; + frame_params_.frame_type = libvpx::RcFrameType::kInterFrame; rc_api_->PostEncodeUpdate(sizes_[sl]); } } diff --git a/vp8/vp8_ratectrl_rtc.cc b/vp8/vp8_ratectrl_rtc.cc index f3f42529db..c36cfea485 100644 --- a/vp8/vp8_ratectrl_rtc.cc +++ b/vp8/vp8_ratectrl_rtc.cc @@ -10,7 +10,9 @@ #include #include +#include "vp8/common/common.h" #include "vp8/vp8_ratectrl_rtc.h" +#include "vp8/encoder/onyx_int.h" #include "vp8/encoder/ratectrl.h" #include "vpx_ports/system_state.h" @@ -65,6 +67,13 @@ std::unique_ptr VP8RateControlRTC::Create( return rc_api; } +VP8RateControlRTC::~VP8RateControlRTC() { + if (cpi_) { + vpx_free(cpi_->gf_active_flags); + vpx_free(cpi_); + } +} + void VP8RateControlRTC::InitRateControl(const VP8RateControlRtcConfig &rc_cfg) { VP8_COMMON *cm = &cpi_->common; VP8_CONFIG *oxcf = &cpi_->oxcf; @@ -203,7 +212,7 @@ void VP8RateControlRTC::ComputeQP(const VP8FrameParamsQpRTC &frame_params) { vp8_restore_layer_context(cpi_, layer); vp8_new_framerate(cpi_, cpi_->layer_context[layer].framerate); } - cm->frame_type = frame_params.frame_type; + cm->frame_type = static_cast(frame_params.frame_type); cm->refresh_golden_frame = (cm->frame_type == KEY_FRAME) ? 1 : 0; cm->refresh_alt_ref_frame = (cm->frame_type == KEY_FRAME) ? 1 : 0; if (cm->frame_type == KEY_FRAME && cpi_->common.current_video_frame > 0) { diff --git a/vp8/vp8_ratectrl_rtc.h b/vp8/vp8_ratectrl_rtc.h index def7dd8f9e..0e81592eca 100644 --- a/vp8/vp8_ratectrl_rtc.h +++ b/vp8/vp8_ratectrl_rtc.h @@ -12,23 +12,24 @@ #define VPX_VP8_RATECTRL_RTC_H_ #include +#include #include -#include "vp8/encoder/onyx_int.h" -#include "vp8/common/common.h" #include "vpx/internal/vpx_ratectrl_rtc.h" +struct VP8_COMP; + namespace libvpx { struct VP8RateControlRtcConfig : public VpxRateControlRtcConfig { public: VP8RateControlRtcConfig() { - vp8_zero(layer_target_bitrate); - vp8_zero(ts_rate_decimator); + memset(&layer_target_bitrate, 0, sizeof(layer_target_bitrate)); + memset(&ts_rate_decimator, 0, sizeof(ts_rate_decimator)); } }; struct VP8FrameParamsQpRTC { - FRAME_TYPE frame_type; + RcFrameType frame_type; int temporal_layer_id; }; @@ -36,12 +37,7 @@ class VP8RateControlRTC { public: static std::unique_ptr Create( const VP8RateControlRtcConfig &cfg); - ~VP8RateControlRTC() { - if (cpi_) { - vpx_free(cpi_->gf_active_flags); - vpx_free(cpi_); - } - } + ~VP8RateControlRTC(); void UpdateRateControl(const VP8RateControlRtcConfig &rc_cfg); // GetQP() needs to be called after ComputeQP() to get the latest QP @@ -54,7 +50,7 @@ class VP8RateControlRTC { private: VP8RateControlRTC() {} void InitRateControl(const VP8RateControlRtcConfig &cfg); - VP8_COMP *cpi_; + struct VP8_COMP *cpi_; int q_; }; diff --git a/vp9/ratectrl_rtc.cc b/vp9/ratectrl_rtc.cc index 02e50a857c..944c526ac1 100644 --- a/vp9/ratectrl_rtc.cc +++ b/vp9/ratectrl_rtc.cc @@ -48,6 +48,29 @@ std::unique_ptr VP9RateControlRTC::Create( return rc_api; } +VP9RateControlRTC::~VP9RateControlRTC() { + if (cpi_) { + if (cpi_->svc.number_spatial_layers > 1 || + cpi_->svc.number_temporal_layers > 1) { + for (int sl = 0; sl < cpi_->svc.number_spatial_layers; sl++) { + for (int tl = 0; tl < cpi_->svc.number_temporal_layers; tl++) { + int layer = LAYER_IDS_TO_IDX(sl, tl, cpi_->oxcf.ts_number_layers); + LAYER_CONTEXT *const lc = &cpi_->svc.layer_context[layer]; + vpx_free(lc->map); + vpx_free(lc->last_coded_q_map); + vpx_free(lc->consec_zero_mv); + } + } + } + if (cpi_->oxcf.aq_mode == CYCLIC_REFRESH_AQ) { + vpx_free(cpi_->segmentation_map); + cpi_->segmentation_map = NULL; + vp9_cyclic_refresh_free(cpi_->cyclic_refresh); + } + vpx_free(cpi_); + } +} + void VP9RateControlRTC::InitRateControl(const VP9RateControlRtcConfig &rc_cfg) { VP9_COMMON *cm = &cpi_->common; VP9EncoderConfig *oxcf = &cpi_->oxcf; @@ -157,7 +180,7 @@ void VP9RateControlRTC::ComputeQP(const VP9FrameParamsQpRTC &frame_params) { cm->height = height; } vp9_set_mb_mi(cm, cm->width, cm->height); - cm->frame_type = frame_params.frame_type; + cm->frame_type = static_cast(frame_params.frame_type); // This is needed to ensure key frame does not get unset in rc_get_svc_params. cpi_->frame_flags = (cm->frame_type == KEY_FRAME) ? FRAMEFLAGS_KEY : 0; cpi_->refresh_golden_frame = (cm->frame_type == KEY_FRAME) ? 1 : 0; diff --git a/vp9/ratectrl_rtc.h b/vp9/ratectrl_rtc.h index b209e4db66..162a04883e 100644 --- a/vp9/ratectrl_rtc.h +++ b/vp9/ratectrl_rtc.h @@ -19,14 +19,14 @@ #include "vp9/common/vp9_onyxc_int.h" #include "vp9/vp9_iface_common.h" #include "vp9/encoder/vp9_aq_cyclicrefresh.h" -#include "vp9/encoder/vp9_encoder.h" #include "vp9/encoder/vp9_firstpass.h" #include "vp9/vp9_cx_iface.h" #include "vpx/internal/vpx_ratectrl_rtc.h" #include "vpx_mem/vpx_mem.h" -namespace libvpx { +struct VP9_COMP; +namespace libvpx { struct VP9RateControlRtcConfig : public VpxRateControlRtcConfig { public: VP9RateControlRtcConfig() { @@ -53,7 +53,7 @@ struct VP9RateControlRtcConfig : public VpxRateControlRtcConfig { }; struct VP9FrameParamsQpRTC { - FRAME_TYPE frame_type; + RcFrameType frame_type; int spatial_layer_id; int temporal_layer_id; }; @@ -90,28 +90,7 @@ class VP9RateControlRTC { public: static std::unique_ptr Create( const VP9RateControlRtcConfig &cfg); - ~VP9RateControlRTC() { - if (cpi_) { - if (cpi_->svc.number_spatial_layers > 1 || - cpi_->svc.number_temporal_layers > 1) { - for (int sl = 0; sl < cpi_->svc.number_spatial_layers; sl++) { - for (int tl = 0; tl < cpi_->svc.number_temporal_layers; tl++) { - int layer = LAYER_IDS_TO_IDX(sl, tl, cpi_->oxcf.ts_number_layers); - LAYER_CONTEXT *const lc = &cpi_->svc.layer_context[layer]; - vpx_free(lc->map); - vpx_free(lc->last_coded_q_map); - vpx_free(lc->consec_zero_mv); - } - } - } - if (cpi_->oxcf.aq_mode == CYCLIC_REFRESH_AQ) { - vpx_free(cpi_->segmentation_map); - cpi_->segmentation_map = NULL; - vp9_cyclic_refresh_free(cpi_->cyclic_refresh); - } - vpx_free(cpi_); - } - } + ~VP9RateControlRTC(); void UpdateRateControl(const VP9RateControlRtcConfig &rc_cfg); // GetQP() needs to be called after ComputeQP() to get the latest QP @@ -125,7 +104,7 @@ class VP9RateControlRTC { private: VP9RateControlRTC() {} void InitRateControl(const VP9RateControlRtcConfig &cfg); - VP9_COMP *cpi_; + struct VP9_COMP *cpi_; }; } // namespace libvpx diff --git a/vpx/internal/vpx_ratectrl_rtc.h b/vpx/internal/vpx_ratectrl_rtc.h index 65398c654d..33c57e219b 100644 --- a/vpx/internal/vpx_ratectrl_rtc.h +++ b/vpx/internal/vpx_ratectrl_rtc.h @@ -14,6 +14,9 @@ #include "vpx/vpx_encoder.h" namespace libvpx { + +enum class RcFrameType { kKeyFrame = 0, kInterFrame = 1 }; + struct VpxRateControlRtcConfig { public: VpxRateControlRtcConfig() { From 086f0e653893bf1fa15f5d78592ac96372c9ccd4 Mon Sep 17 00:00:00 2001 From: chiyotsai Date: Wed, 8 Feb 2023 13:54:46 -0800 Subject: [PATCH 0602/1000] Remove CONFIG_CONSISTENT_RECODE flag Currently, libvpx does not properly clear and re-initialize the memories when it re-encodes a frame. As a result, out-of-date values are used in the encoding process, and re-encoding a frame with the same parameter will give different outputs. This commit enables the code under CONFIG_CONSISTENT_RECODE to correct this behavior. This change has minor effect on the coding performance, but it ensures valid values are used in the encoding process. Furthermore, the flag is removed as it is now always turned on. Performance: | SPD_SET | TESTSET | AVG_PSNR | OVR_PSNR | SSIM | ENC_T | |---------|---------|----------|----------|---------|-------| | 0 | hdres2 | -0.012% | -0.021% | -0.030% | +0.1% | | 0 | lowres2 | +0.029% | +0.019% | +0.047% | +0.1% | | 0 | midres2 | -0.004% | +0.009% | +0.026% | +0.1% | |---------|---------|----------|----------|---------|-------| | 1 | hdres2 | +0.032% | +0.032% | -0.000% | -0.0% | | 1 | lowres2 | -0.005% | -0.011% | -0.014% | +0.0% | | 1 | midres2 | +0.004% | +0.020% | +0.027% | +0.2% | |---------|---------|----------|----------|---------|-------| | 2 | hdres2 | +0.048% | +0.056% | +0.057% | +0.1% | | 2 | lowres2 | +0.007% | +0.002% | -0.016% | -0.0% | | 2 | midres2 | -0.015% | -0.008% | -0.002% | +0.1% | |---------|---------|----------|----------|---------|-------| | 3 | hdres2 | +0.010% | +0.014% | +0.004% | -0.0% | | 3 | lowres2 | +0.000% | -0.021% | -0.001% | +0.0% | | 3 | midres2 | +0.007% | -0.038% | +0.012% | -0.2% | |---------|---------|----------|----------|---------|-------| | 4 | hdres2 | +0.107% | +0.136% | +0.124% | -0.0% | | 4 | lowres2 | -0.012% | -0.024% | -0.020% | -0.0% | | 4 | midres2 | +0.055% | -0.004% | +0.048% | -0.1% | |---------|---------|----------|----------|---------|-------| | 5 | hdres2 | +0.026% | +0.027% | +0.020% | -0.0% | | 5 | lowres2 | +0.009% | -0.008% | +0.028% | +0.1% | | 5 | midres2 | -0.025% | +0.021% | -0.020% | -0.1% | STATS_CHANGED Change-Id: I3967aee8c8e4d0608a492e07f99ab8de9744ba57 --- configure | 2 -- vp9/encoder/vp9_encodeframe.c | 39 ++++++----------------------------- vp9/encoder/vp9_encoder.c | 30 ++++++--------------------- vp9/encoder/vp9_encoder.h | 2 -- vp9/encoder/vp9_rd.h | 2 -- 5 files changed, 12 insertions(+), 63 deletions(-) diff --git a/configure b/configure index 18f0ea798b..890ad3968a 100755 --- a/configure +++ b/configure @@ -343,7 +343,6 @@ CONFIG_LIST=" multi_res_encoding temporal_denoising vp9_temporal_denoising - consistent_recode coefficient_range_checking vp9_highbitdepth better_hw_compatibility @@ -407,7 +406,6 @@ CMDLINE_SELECT=" multi_res_encoding temporal_denoising vp9_temporal_denoising - consistent_recode coefficient_range_checking better_hw_compatibility vp9_highbitdepth diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index a22c00bd8f..a522097e61 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -5841,14 +5841,7 @@ void vp9_init_tile_data(VP9_COMP *cpi) { for (i = 0; i < BLOCK_SIZES; ++i) { for (j = 0; j < MAX_MODES; ++j) { tile_data->thresh_freq_fact[i][j] = RD_THRESH_INIT_FACT; -#if CONFIG_RATE_CTRL - if (cpi->oxcf.use_simple_encode_api) { - tile_data->thresh_freq_fact_prev[i][j] = RD_THRESH_INIT_FACT; - } -#endif // CONFIG_RATE_CTRL -#if CONFIG_CONSISTENT_RECODE tile_data->thresh_freq_fact_prev[i][j] = RD_THRESH_INIT_FACT; -#endif // CONFIG_CONSISTENT_RECODE tile_data->mode_map[i][j] = j; } } @@ -6068,9 +6061,7 @@ static void encode_frame_internal(VP9_COMP *cpi) { x->fwd_txfm4x4 = xd->lossless ? vp9_fwht4x4 : vpx_fdct4x4; #endif // CONFIG_VP9_HIGHBITDEPTH x->inv_txfm_add = xd->lossless ? vp9_iwht4x4_add : vp9_idct4x4_add; -#if CONFIG_CONSISTENT_RECODE x->optimize = sf->optimize_coefficients == 1 && cpi->oxcf.pass != 1; -#endif if (xd->lossless) x->optimize = 0; x->sharpness = cpi->oxcf.sharpness; x->adjust_rdmult_by_segment = (cpi->oxcf.aq_mode == VARIANCE_AQ); @@ -6215,13 +6206,11 @@ static int compute_frame_aq_offset(struct VP9_COMP *cpi) { return sum_delta / (cm->mi_rows * cm->mi_cols); } -#if CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL static void restore_encode_params(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; - const int tile_cols = 1 << cm->log2_tile_cols; - const int tile_rows = 1 << cm->log2_tile_rows; - int tile_col, tile_row; + int tile_idx; int i, j; + TileDataEnc *tile_data; RD_OPT *rd_opt = &cpi->rd; for (i = 0; i < MAX_REF_FRAMES; i++) { for (j = 0; j < REFERENCE_MODES; j++) @@ -6232,35 +6221,19 @@ static void restore_encode_params(VP9_COMP *cpi) { rd_opt->filter_threshes[i][j] = rd_opt->filter_threshes_prev[i][j]; } - if (cpi->tile_data != NULL) { - for (tile_row = 0; tile_row < tile_rows; ++tile_row) - for (tile_col = 0; tile_col < tile_cols; ++tile_col) { - TileDataEnc *tile_data = - &cpi->tile_data[tile_row * tile_cols + tile_col]; - for (i = 0; i < BLOCK_SIZES; ++i) { - for (j = 0; j < MAX_MODES; ++j) { - tile_data->thresh_freq_fact[i][j] = - tile_data->thresh_freq_fact_prev[i][j]; - } - } - } + for (tile_idx = 0; tile_idx < cpi->allocated_tiles; tile_idx++) { + assert(cpi->tile_data); + tile_data = &cpi->tile_data[tile_idx]; + vp9_copy(tile_data->thresh_freq_fact, tile_data->thresh_freq_fact_prev); } cm->interp_filter = cpi->sf.default_interp_filter; } -#endif // CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL void vp9_encode_frame(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; -#if CONFIG_RATE_CTRL - if (cpi->oxcf.use_simple_encode_api) { - restore_encode_params(cpi); - } -#endif // CONFIG_RATE_CTRL -#if CONFIG_CONSISTENT_RECODE restore_encode_params(cpi); -#endif #if CONFIG_MISMATCH_DEBUG mismatch_reset_frame(MAX_MB_PLANE); diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 5b895c2814..22fbb899fd 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -3774,13 +3774,10 @@ static void set_frame_size(VP9_COMP *cpi) { set_ref_ptrs(cm, xd, LAST_FRAME, LAST_FRAME); } -#if CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL static void save_encode_params(VP9_COMP *cpi) { - VP9_COMMON *const cm = &cpi->common; - const int tile_cols = 1 << cm->log2_tile_cols; - const int tile_rows = 1 << cm->log2_tile_rows; - int tile_col, tile_row; + int tile_idx; int i, j; + TileDataEnc *tile_data; RD_OPT *rd_opt = &cpi->rd; for (i = 0; i < MAX_REF_FRAMES; i++) { for (j = 0; j < REFERENCE_MODES; j++) @@ -3791,21 +3788,12 @@ static void save_encode_params(VP9_COMP *cpi) { rd_opt->filter_threshes_prev[i][j] = rd_opt->filter_threshes[i][j]; } - if (cpi->tile_data != NULL) { - for (tile_row = 0; tile_row < tile_rows; ++tile_row) - for (tile_col = 0; tile_col < tile_cols; ++tile_col) { - TileDataEnc *tile_data = - &cpi->tile_data[tile_row * tile_cols + tile_col]; - for (i = 0; i < BLOCK_SIZES; ++i) { - for (j = 0; j < MAX_MODES; ++j) { - tile_data->thresh_freq_fact_prev[i][j] = - tile_data->thresh_freq_fact[i][j]; - } - } - } + for (tile_idx = 0; tile_idx < cpi->allocated_tiles; tile_idx++) { + assert(cpi->tile_data); + tile_data = &cpi->tile_data[tile_idx]; + vp9_copy(tile_data->thresh_freq_fact_prev, tile_data->thresh_freq_fact); } } -#endif // CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL static INLINE void set_raw_source_frame(VP9_COMP *cpi) { #ifdef ENABLE_KF_DENOISE @@ -5484,14 +5472,8 @@ static void encode_frame_to_data_rate( memset(cpi->mode_chosen_counts, 0, MAX_MODES * sizeof(*cpi->mode_chosen_counts)); #endif -#if CONFIG_CONSISTENT_RECODE // Backup to ensure consistency between recodes save_encode_params(cpi); -#elif CONFIG_RATE_CTRL - if (cpi->oxcf.use_simple_encode_api) { - save_encode_params(cpi); - } -#endif if (cpi->ext_ratectrl.ready && (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_RDMULT) != 0) { vpx_codec_err_t codec_status; diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index e17845d065..79c0b36a17 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -329,9 +329,7 @@ typedef struct TplDepFrame { typedef struct TileDataEnc { TileInfo tile_info; int thresh_freq_fact[BLOCK_SIZES][MAX_MODES]; -#if CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL int thresh_freq_fact_prev[BLOCK_SIZES][MAX_MODES]; -#endif // CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL int8_t mode_map[BLOCK_SIZES][MAX_MODES]; FIRSTPASS_DATA fp_data; VP9RowMTSync row_mt_sync; diff --git a/vp9/encoder/vp9_rd.h b/vp9/encoder/vp9_rd.h index d2bc5e60ed..efd854edf4 100644 --- a/vp9/encoder/vp9_rd.h +++ b/vp9/encoder/vp9_rd.h @@ -121,11 +121,9 @@ typedef struct RD_OPT { int64_t prediction_type_threshes[MAX_REF_FRAMES][REFERENCE_MODES]; int64_t filter_threshes[MAX_REF_FRAMES][SWITCHABLE_FILTER_CONTEXTS]; -#if CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL int64_t prediction_type_threshes_prev[MAX_REF_FRAMES][REFERENCE_MODES]; int64_t filter_threshes_prev[MAX_REF_FRAMES][SWITCHABLE_FILTER_CONTEXTS]; -#endif // CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL int RDMULT; int RDDIV; double r0; From 184a886917529e8a9d23ab564b05b0cc13e29f2b Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 10 Feb 2023 19:04:41 -0800 Subject: [PATCH 0603/1000] README: update release version to 1.13.0 this was missed in the v1.13.0 tag Bug: webm:1780 Change-Id: I3044534123bf67861174970e6241f6586055358e --- README | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README b/README index 477a145ba3..e360df05f6 100644 --- a/README +++ b/README @@ -1,4 +1,4 @@ -v1.12.0 Torrent Duck +v1.13.0 Ugly Duckling Welcome to the WebM VP8/VP9 Codec SDK! From b5e1945af0e3f33ab4ab5fc5175da5505d6a67cd Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 10 Feb 2023 19:04:41 -0800 Subject: [PATCH 0604/1000] README: update release version to 1.13.0 this was missed in the v1.13.0 tag Bug: webm:1780 Change-Id: I3044534123bf67861174970e6241f6586055358e (cherry picked from commit 184a886917529e8a9d23ab564b05b0cc13e29f2b) --- README | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README b/README index 477a145ba3..e360df05f6 100644 --- a/README +++ b/README @@ -1,4 +1,4 @@ -v1.12.0 Torrent Duck +v1.13.0 Ugly Duckling Welcome to the WebM VP8/VP9 Codec SDK! From 42cb3dbf94706dd8477f9313ceba1fc3a9a14e92 Mon Sep 17 00:00:00 2001 From: Salome Thirot Date: Tue, 7 Feb 2023 14:08:33 +0000 Subject: [PATCH 0605/1000] Optimize Neon high bitdepth subpel variance functions Use the same general code style as in the standard bitdepth Neon implementation. Additionally, do not unnecessarily widen to 32-bit data types when doing bilinear filtering - allowing us to process twice as many elements per instruction. Change-Id: I1e178991d2aa71f5f77a376e145d19257481e90f --- vpx_dsp/arm/highbd_subpel_variance_neon.c | 307 ++++++++++++++++++++++ vpx_dsp/arm/highbd_variance_neon.c | 288 +------------------- vpx_dsp/arm/mem_neon.h | 14 + vpx_dsp/vpx_dsp.mk | 1 + 4 files changed, 336 insertions(+), 274 deletions(-) create mode 100644 vpx_dsp/arm/highbd_subpel_variance_neon.c diff --git a/vpx_dsp/arm/highbd_subpel_variance_neon.c b/vpx_dsp/arm/highbd_subpel_variance_neon.c new file mode 100644 index 0000000000..81943ee4a6 --- /dev/null +++ b/vpx_dsp/arm/highbd_subpel_variance_neon.c @@ -0,0 +1,307 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_dsp_rtcd.h" +#include "./vpx_config.h" + +#include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" + +// The bilinear filters look like this: +// +// {{ 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 }, +// { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 }} +// +// We can factor out the highest common multiple, such that the sum of both +// weights will be 8 instead of 128. The benefits of this are two-fold: +// +// 1) We can infer the filter values from the filter_offset parameter in the +// bilinear filter functions below - we don't have to actually load the values +// from memory: +// f0 = 8 - filter_offset +// f1 = filter_offset +// +// 2) Scaling the pixel values by 8, instead of 128 enables us to operate on +// 16-bit data types at all times, rather than widening out to 32-bit and +// requiring double the number of data processing instructions. (12-bit * 8 = +// 15-bit.) + +// Process a block exactly 4 wide and a multiple of 2 high. +static void highbd_var_filter_block2d_bil_w4(const uint16_t *src_ptr, + uint16_t *dst_ptr, int src_stride, + int pixel_step, int dst_height, + int filter_offset) { + const uint16x8_t f0 = vdupq_n_u16(8 - filter_offset); + const uint16x8_t f1 = vdupq_n_u16(filter_offset); + + int i = dst_height; + do { + uint16x8_t s0 = load_unaligned_u16q(src_ptr, src_stride); + uint16x8_t s1 = load_unaligned_u16q(src_ptr + pixel_step, src_stride); + + uint16x8_t blend = vmulq_u16(s0, f0); + blend = vmlaq_u16(blend, s1, f1); + blend = vrshrq_n_u16(blend, 3); + + vst1q_u16(dst_ptr, blend); + + src_ptr += 2 * src_stride; + dst_ptr += 8; + i -= 2; + } while (i != 0); +} + +// Process a block which is a multiple of 8 and any height. +static void highbd_var_filter_block2d_bil_large(const uint16_t *src_ptr, + uint16_t *dst_ptr, + int src_stride, int pixel_step, + int dst_width, int dst_height, + int filter_offset) { + const uint16x8_t f0 = vdupq_n_u16(8 - filter_offset); + const uint16x8_t f1 = vdupq_n_u16(filter_offset); + + int i = dst_height; + do { + int j = 0; + do { + uint16x8_t s0 = vld1q_u16(src_ptr + j); + uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step); + + uint16x8_t blend = vmulq_u16(s0, f0); + blend = vmlaq_u16(blend, s1, f1); + blend = vrshrq_n_u16(blend, 3); + + vst1q_u16(dst_ptr + j, blend); + + j += 8; + } while (j < dst_width); + + src_ptr += src_stride; + dst_ptr += dst_width; + } while (--i != 0); +} + +static void highbd_var_filter_block2d_bil_w8(const uint16_t *src_ptr, + uint16_t *dst_ptr, int src_stride, + int pixel_step, int dst_height, + int filter_offset) { + highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, + 8, dst_height, filter_offset); +} +static void highbd_var_filter_block2d_bil_w16(const uint16_t *src_ptr, + uint16_t *dst_ptr, int src_stride, + int pixel_step, int dst_height, + int filter_offset) { + highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, + 16, dst_height, filter_offset); +} +static void highbd_var_filter_block2d_bil_w32(const uint16_t *src_ptr, + uint16_t *dst_ptr, int src_stride, + int pixel_step, int dst_height, + int filter_offset) { + highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, + 32, dst_height, filter_offset); +} +static void highbd_var_filter_block2d_bil_w64(const uint16_t *src_ptr, + uint16_t *dst_ptr, int src_stride, + int pixel_step, int dst_height, + int filter_offset) { + highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, + 64, dst_height, filter_offset); +} + +#define HBD_SUBPEL_VARIANCE_WXH_NEON(w, h, padding) \ + unsigned int vpx_highbd_8_sub_pixel_variance##w##x##h##_neon( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, uint32_t *sse) { \ + uint16_t tmp0[w * (h + padding)]; \ + uint16_t tmp1[w * h]; \ + uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \ + \ + highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \ + (h + padding), xoffset); \ + highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ + \ + return vpx_highbd_8_variance##w##x##h(CONVERT_TO_BYTEPTR(tmp1), w, ref, \ + ref_stride, sse); \ + } \ + \ + unsigned int vpx_highbd_10_sub_pixel_variance##w##x##h##_neon( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, uint32_t *sse) { \ + uint16_t tmp0[w * (h + padding)]; \ + uint16_t tmp1[w * h]; \ + uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \ + \ + highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \ + (h + padding), xoffset); \ + highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ + \ + return vpx_highbd_10_variance##w##x##h(CONVERT_TO_BYTEPTR(tmp1), w, ref, \ + ref_stride, sse); \ + } \ + unsigned int vpx_highbd_12_sub_pixel_variance##w##x##h##_neon( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, uint32_t *sse) { \ + uint16_t tmp0[w * (h + padding)]; \ + uint16_t tmp1[w * h]; \ + uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \ + \ + highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \ + (h + padding), xoffset); \ + highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ + \ + return vpx_highbd_12_variance##w##x##h(CONVERT_TO_BYTEPTR(tmp1), w, ref, \ + ref_stride, sse); \ + } + +// 4x blocks are processed two rows at a time, so require an extra row of +// padding. +HBD_SUBPEL_VARIANCE_WXH_NEON(4, 4, 2) +HBD_SUBPEL_VARIANCE_WXH_NEON(4, 8, 2) + +HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 1) +HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 1) +HBD_SUBPEL_VARIANCE_WXH_NEON(8, 16, 1) + +HBD_SUBPEL_VARIANCE_WXH_NEON(16, 8, 1) +HBD_SUBPEL_VARIANCE_WXH_NEON(16, 16, 1) +HBD_SUBPEL_VARIANCE_WXH_NEON(16, 32, 1) + +HBD_SUBPEL_VARIANCE_WXH_NEON(32, 16, 1) +HBD_SUBPEL_VARIANCE_WXH_NEON(32, 32, 1) +HBD_SUBPEL_VARIANCE_WXH_NEON(32, 64, 1) + +HBD_SUBPEL_VARIANCE_WXH_NEON(64, 32, 1) +HBD_SUBPEL_VARIANCE_WXH_NEON(64, 64, 1) + +void vpx_highbd_comp_avg_pred_neon(uint16_t *comp_pred, const uint16_t *pred, + int width, int height, const uint16_t *ref, + int ref_stride) { + int i, j; + uint32x4_t one_u32 = vdupq_n_u32(1); + if (width >= 8) { + for (i = 0; i < height; ++i) { + for (j = 0; j < width; j += 8) { + const uint16x8_t pred_u16 = vld1q_u16(&pred[j]); + const uint16x8_t ref_u16 = vld1q_u16(&ref[j]); + const uint32x4_t sum1_u32 = + vaddl_u16(vget_low_u16(pred_u16), vget_low_u16(ref_u16)); + const uint32x4_t sum2_u32 = + vaddl_u16(vget_high_u16(pred_u16), vget_high_u16(ref_u16)); + const uint16x4_t sum1_u16 = + vshrn_n_u32(vaddq_u32(sum1_u32, one_u32), 1); + const uint16x4_t sum2_u16 = + vshrn_n_u32(vaddq_u32(sum2_u32, one_u32), 1); + const uint16x8_t vcomp_pred = vcombine_u16(sum1_u16, sum2_u16); + vst1q_u16(&comp_pred[j], vcomp_pred); + } + comp_pred += width; + pred += width; + ref += ref_stride; + } + } else { + assert(width >= 4); + for (i = 0; i < height; ++i) { + for (j = 0; j < width; j += 4) { + const uint16x4_t pred_u16 = vld1_u16(&pred[j]); + const uint16x4_t ref_u16 = vld1_u16(&ref[j]); + const uint32x4_t sum_u32 = vaddl_u16(pred_u16, ref_u16); + const uint16x4_t vcomp_pred = + vshrn_n_u32(vaddq_u32(sum_u32, one_u32), 1); + vst1_u16(&comp_pred[j], vcomp_pred); + } + comp_pred += width; + pred += width; + ref += ref_stride; + } + } +} + +#define SUBPEL_AVG_VARIANCE_WXH_NEON(w, h, padding) \ + uint32_t vpx_highbd_8_sub_pixel_avg_variance##w##x##h##_neon( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t tmp0[w * (h + padding)]; \ + uint16_t tmp1[w * h]; \ + uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \ + \ + highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \ + (h + padding), xoffset); \ + highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ + \ + vpx_highbd_comp_avg_pred_neon(tmp0, CONVERT_TO_SHORTPTR(second_pred), w, \ + h, tmp1, w); \ + \ + return vpx_highbd_8_variance##w##x##h##_neon(CONVERT_TO_BYTEPTR(tmp0), w, \ + ref, ref_stride, sse); \ + } \ + \ + uint32_t vpx_highbd_10_sub_pixel_avg_variance##w##x##h##_neon( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t tmp0[w * (h + padding)]; \ + uint16_t tmp1[w * h]; \ + uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \ + \ + highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \ + (h + padding), xoffset); \ + highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ + \ + vpx_highbd_comp_avg_pred_neon(tmp0, CONVERT_TO_SHORTPTR(second_pred), w, \ + h, tmp1, w); \ + \ + return vpx_highbd_10_variance##w##x##h##_neon(CONVERT_TO_BYTEPTR(tmp0), w, \ + ref, ref_stride, sse); \ + } \ + \ + uint32_t vpx_highbd_12_sub_pixel_avg_variance##w##x##h##_neon( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t tmp0[w * (h + padding)]; \ + uint16_t tmp1[w * h]; \ + uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \ + \ + highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \ + (h + padding), xoffset); \ + highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ + \ + vpx_highbd_comp_avg_pred_neon(tmp0, CONVERT_TO_SHORTPTR(second_pred), w, \ + h, tmp1, w); \ + \ + return vpx_highbd_12_variance##w##x##h##_neon(CONVERT_TO_BYTEPTR(tmp0), w, \ + ref, ref_stride, sse); \ + } + +// 4x blocks are processed two rows at a time, so require an extra row of +// padding. +SUBPEL_AVG_VARIANCE_WXH_NEON(4, 4, 2) +SUBPEL_AVG_VARIANCE_WXH_NEON(4, 8, 2) + +SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 1) +SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 1) +SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 1) + +SUBPEL_AVG_VARIANCE_WXH_NEON(16, 8, 1) +SUBPEL_AVG_VARIANCE_WXH_NEON(16, 16, 1) +SUBPEL_AVG_VARIANCE_WXH_NEON(16, 32, 1) + +SUBPEL_AVG_VARIANCE_WXH_NEON(32, 16, 1) +SUBPEL_AVG_VARIANCE_WXH_NEON(32, 32, 1) +SUBPEL_AVG_VARIANCE_WXH_NEON(32, 64, 1) + +SUBPEL_AVG_VARIANCE_WXH_NEON(64, 32, 1) +SUBPEL_AVG_VARIANCE_WXH_NEON(64, 64, 1) diff --git a/vpx_dsp/arm/highbd_variance_neon.c b/vpx_dsp/arm/highbd_variance_neon.c index 96a35af01c..985cc35682 100644 --- a/vpx_dsp/arm/highbd_variance_neon.c +++ b/vpx_dsp/arm/highbd_variance_neon.c @@ -18,11 +18,6 @@ #include "vpx_dsp/arm/sum_neon.h" #include "vpx_ports/mem.h" -static const uint8_t bilinear_filters[8][2] = { - { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 }, - { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 }, -}; - static INLINE void highbd_variance16(const uint16_t *src_ptr, int src_stride, const uint16_t *ref_ptr, int ref_stride, int w, int h, uint64_t *sse, @@ -136,7 +131,7 @@ static INLINE void highbd_12_variance(const uint8_t *src8_ptr, int src_stride, *sum = (int)ROUND_POWER_OF_TWO(sum_long, 4); } -#define HIGHBD_VAR(W, H) \ +#define HBD_VARIANCE_WXH_NEON(W, H) \ uint32_t vpx_highbd_8_variance##W##x##H##_neon( \ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ int ref_stride, uint32_t *sse) { \ @@ -218,274 +213,19 @@ static INLINE void highbd_12_variance(const uint8_t *src8_ptr, int src_stride, return *sse; \ } -static INLINE void highbd_var_filter_block2d_bil_first_pass( - const uint8_t *src_ptr8, uint16_t *output_ptr, - unsigned int src_pixels_per_line, int pixel_step, - unsigned int output_height, unsigned int output_width, - const uint8_t *filter) { - uint32_t i, j; - uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src_ptr8); - - uint32x4_t round_u32 = vshlq_n_u32(vdupq_n_u32(1), FILTER_BITS - 1); - uint16x4_t filter1_u16 = vdup_n_u16(filter[0]); - uint16x4_t filter2_u16 = vdup_n_u16(filter[1]); - - if (output_width >= 8) { - for (i = 0; i < output_height; ++i) { - for (j = 0; j < output_width; j += 8) { - const uint16x8_t src1_u16 = vld1q_u16(&src_ptr[j]); - const uint16x8_t src2_u16 = vld1q_u16(&src_ptr[j + pixel_step]); - uint32x4_t sum1_u32 = vmull_u16(filter1_u16, vget_low_u16(src1_u16)); - uint32x4_t sum2_u32 = vmull_u16(filter1_u16, vget_high_u16(src1_u16)); - uint16x4_t out1_u16; - uint16x4_t out2_u16; - sum1_u32 = vmlal_u16(sum1_u32, filter2_u16, vget_low_u16(src2_u16)); - sum2_u32 = vmlal_u16(sum2_u32, filter2_u16, vget_high_u16(src2_u16)); - out1_u16 = vshrn_n_u32(vaddq_u32(sum1_u32, round_u32), FILTER_BITS); - out2_u16 = vshrn_n_u32(vaddq_u32(sum2_u32, round_u32), FILTER_BITS); - vst1q_u16(&output_ptr[j], vcombine_u16(out1_u16, out2_u16)); - } - // Next row... - src_ptr += src_pixels_per_line; - output_ptr += output_width; - } - } else { - assert(output_width >= 4); - for (i = 0; i < output_height; ++i) { - for (j = 0; j < output_width; j += 4) { - const uint16x4_t src1_u16 = vld1_u16(&src_ptr[j]); - const uint16x4_t src2_u16 = vld1_u16(&src_ptr[j + pixel_step]); - uint32x4_t sum_u32 = vmull_u16(filter1_u16, src1_u16); - uint16x4_t out_u16; - sum_u32 = vmlal_u16(sum_u32, filter2_u16, src2_u16); - out_u16 = vshrn_n_u32(vaddq_u32(sum_u32, round_u32), FILTER_BITS); - vst1_u16(&output_ptr[j], out_u16); - } - // Next row... - src_ptr += src_pixels_per_line; - output_ptr += output_width; - } - } -} - -static INLINE void highbd_var_filter_block2d_bil_second_pass( - const uint16_t *src_ptr, uint16_t *output_ptr, - unsigned int src_pixels_per_line, unsigned int pixel_step, - unsigned int output_height, unsigned int output_width, - const uint8_t *filter) { - uint32_t i, j; - - uint32x4_t round_u32 = vshlq_n_u32(vdupq_n_u32(1), FILTER_BITS - 1); - uint16x4_t filter1_u16 = vdup_n_u16(filter[0]); - uint16x4_t filter2_u16 = vdup_n_u16(filter[1]); - - if (output_width >= 8) { - for (i = 0; i < output_height; ++i) { - for (j = 0; j < output_width; j += 8) { - const uint16x8_t src1_u16 = vld1q_u16(&src_ptr[j]); - const uint16x8_t src2_u16 = vld1q_u16(&src_ptr[j + pixel_step]); - uint32x4_t sum1_u32 = vmull_u16(filter1_u16, vget_low_u16(src1_u16)); - uint32x4_t sum2_u32 = vmull_u16(filter1_u16, vget_high_u16(src1_u16)); - uint16x4_t out1_u16; - uint16x4_t out2_u16; - sum1_u32 = vmlal_u16(sum1_u32, filter2_u16, vget_low_u16(src2_u16)); - sum2_u32 = vmlal_u16(sum2_u32, filter2_u16, vget_high_u16(src2_u16)); - out1_u16 = vshrn_n_u32(vaddq_u32(sum1_u32, round_u32), FILTER_BITS); - out2_u16 = vshrn_n_u32(vaddq_u32(sum2_u32, round_u32), FILTER_BITS); - vst1q_u16(&output_ptr[j], vcombine_u16(out1_u16, out2_u16)); - } - // Next row... - src_ptr += src_pixels_per_line; - output_ptr += output_width; - } - } else { - assert(output_width >= 4); - for (i = 0; i < output_height; ++i) { - for (j = 0; j < output_width; j += 4) { - const uint16x4_t src1_u16 = vld1_u16(&src_ptr[j]); - const uint16x4_t src2_u16 = vld1_u16(&src_ptr[j + pixel_step]); - uint32x4_t sum_u32 = vmull_u16(filter1_u16, src1_u16); - uint16x4_t out_u16; - sum_u32 = vmlal_u16(sum_u32, filter2_u16, src2_u16); - out_u16 = vshrn_n_u32(vaddq_u32(sum_u32, round_u32), FILTER_BITS); - vst1_u16(&output_ptr[j], out_u16); - } - // Next row... - src_ptr += src_pixels_per_line; - output_ptr += output_width; - } - } -} - -#define HIGHBD_SUBPIX_VAR(W, H) \ - uint32_t vpx_highbd_8_sub_pixel_variance##W##x##H##_neon( \ - const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ - const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - \ - highbd_var_filter_block2d_bil_first_pass( \ - src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \ - highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters[y_offset]); \ - \ - return vpx_highbd_8_variance##W##x##H##_neon(CONVERT_TO_BYTEPTR(temp2), W, \ - ref_ptr, ref_stride, sse); \ - } \ - \ - uint32_t vpx_highbd_10_sub_pixel_variance##W##x##H##_neon( \ - const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ - const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - \ - highbd_var_filter_block2d_bil_first_pass( \ - src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \ - highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters[y_offset]); \ - \ - return vpx_highbd_10_variance##W##x##H##_neon( \ - CONVERT_TO_BYTEPTR(temp2), W, ref_ptr, ref_stride, sse); \ - } \ - \ - uint32_t vpx_highbd_12_sub_pixel_variance##W##x##H##_neon( \ - const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ - const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - \ - highbd_var_filter_block2d_bil_first_pass( \ - src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \ - highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters[y_offset]); \ - \ - return vpx_highbd_12_variance##W##x##H##_neon( \ - CONVERT_TO_BYTEPTR(temp2), W, ref_ptr, ref_stride, sse); \ - } - -#define HIGHBD_SUBPIX_AVG_VAR(W, H) \ - uint32_t vpx_highbd_8_sub_pixel_avg_variance##W##x##H##_neon( \ - const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ - const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \ - const uint8_t *second_pred) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ - \ - highbd_var_filter_block2d_bil_first_pass( \ - src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \ - highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters[y_offset]); \ - \ - vpx_highbd_comp_avg_pred_neon(temp3, CONVERT_TO_SHORTPTR(second_pred), W, \ - H, temp2, W); \ - \ - return vpx_highbd_8_variance##W##x##H##_neon(CONVERT_TO_BYTEPTR(temp3), W, \ - ref_ptr, ref_stride, sse); \ - } \ - \ - uint32_t vpx_highbd_10_sub_pixel_avg_variance##W##x##H##_neon( \ - const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ - const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \ - const uint8_t *second_pred) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ - \ - highbd_var_filter_block2d_bil_first_pass( \ - src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \ - highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters[y_offset]); \ - \ - vpx_highbd_comp_avg_pred_neon(temp3, CONVERT_TO_SHORTPTR(second_pred), W, \ - H, temp2, W); \ - \ - return vpx_highbd_10_variance##W##x##H##_neon( \ - CONVERT_TO_BYTEPTR(temp3), W, ref_ptr, ref_stride, sse); \ - } \ - \ - uint32_t vpx_highbd_12_sub_pixel_avg_variance##W##x##H##_neon( \ - const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ - const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \ - const uint8_t *second_pred) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ - \ - highbd_var_filter_block2d_bil_first_pass( \ - src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \ - highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters[y_offset]); \ - \ - vpx_highbd_comp_avg_pred_neon(temp3, CONVERT_TO_SHORTPTR(second_pred), W, \ - H, temp2, W); \ - \ - return vpx_highbd_12_variance##W##x##H##_neon( \ - CONVERT_TO_BYTEPTR(temp3), W, ref_ptr, ref_stride, sse); \ - } - -void vpx_highbd_comp_avg_pred_neon(uint16_t *comp_pred, const uint16_t *pred, - int width, int height, const uint16_t *ref, - int ref_stride) { - int i, j; - uint32x4_t one_u32 = vdupq_n_u32(1); - if (width >= 8) { - for (i = 0; i < height; ++i) { - for (j = 0; j < width; j += 8) { - const uint16x8_t pred_u16 = vld1q_u16(&pred[j]); - const uint16x8_t ref_u16 = vld1q_u16(&ref[j]); - const uint32x4_t sum1_u32 = - vaddl_u16(vget_low_u16(pred_u16), vget_low_u16(ref_u16)); - const uint32x4_t sum2_u32 = - vaddl_u16(vget_high_u16(pred_u16), vget_high_u16(ref_u16)); - const uint16x4_t sum1_u16 = - vshrn_n_u32(vaddq_u32(sum1_u32, one_u32), 1); - const uint16x4_t sum2_u16 = - vshrn_n_u32(vaddq_u32(sum2_u32, one_u32), 1); - const uint16x8_t vcomp_pred = vcombine_u16(sum1_u16, sum2_u16); - vst1q_u16(&comp_pred[j], vcomp_pred); - } - comp_pred += width; - pred += width; - ref += ref_stride; - } - } else { - assert(width >= 4); - for (i = 0; i < height; ++i) { - for (j = 0; j < width; j += 4) { - const uint16x4_t pred_u16 = vld1_u16(&pred[j]); - const uint16x4_t ref_u16 = vld1_u16(&ref[j]); - const uint32x4_t sum_u32 = vaddl_u16(pred_u16, ref_u16); - const uint16x4_t vcomp_pred = - vshrn_n_u32(vaddq_u32(sum_u32, one_u32), 1); - vst1_u16(&comp_pred[j], vcomp_pred); - } - comp_pred += width; - pred += width; - ref += ref_stride; - } - } -} - -/* All three forms of the variance are available in the same sizes. */ -#define HIGHBD_VARIANCES(W, H) \ - HIGHBD_VAR(W, H) \ - HIGHBD_SUBPIX_VAR(W, H) \ - HIGHBD_SUBPIX_AVG_VAR(W, H) - -HIGHBD_VARIANCES(64, 64) -HIGHBD_VARIANCES(64, 32) -HIGHBD_VARIANCES(32, 64) -HIGHBD_VARIANCES(32, 32) -HIGHBD_VARIANCES(32, 16) -HIGHBD_VARIANCES(16, 32) -HIGHBD_VARIANCES(16, 16) -HIGHBD_VARIANCES(16, 8) -HIGHBD_VARIANCES(8, 16) -HIGHBD_VARIANCES(8, 8) -HIGHBD_VARIANCES(8, 4) -HIGHBD_VARIANCES(4, 8) -HIGHBD_VARIANCES(4, 4) +HBD_VARIANCE_WXH_NEON(64, 64) +HBD_VARIANCE_WXH_NEON(64, 32) +HBD_VARIANCE_WXH_NEON(32, 64) +HBD_VARIANCE_WXH_NEON(32, 32) +HBD_VARIANCE_WXH_NEON(32, 16) +HBD_VARIANCE_WXH_NEON(16, 32) +HBD_VARIANCE_WXH_NEON(16, 16) +HBD_VARIANCE_WXH_NEON(16, 8) +HBD_VARIANCE_WXH_NEON(8, 16) +HBD_VARIANCE_WXH_NEON(8, 8) +HBD_VARIANCE_WXH_NEON(8, 4) +HBD_VARIANCE_WXH_NEON(4, 8) +HBD_VARIANCE_WXH_NEON(4, 4) HIGHBD_GET_VAR(8) HIGHBD_GET_VAR(16) diff --git a/vpx_dsp/arm/mem_neon.h b/vpx_dsp/arm/mem_neon.h index 19cfc7c7f2..866be7439e 100644 --- a/vpx_dsp/arm/mem_neon.h +++ b/vpx_dsp/arm/mem_neon.h @@ -126,6 +126,20 @@ static INLINE uint8x8_t load_unaligned_u8(const uint8_t *buf, return vreinterpret_u8_u32(a_u32); } +// Load 2 sets of 8 bytes when alignment is not guaranteed. +static INLINE uint16x8_t load_unaligned_u16q(const uint16_t *buf, + ptrdiff_t stride) { + uint64_t a; + uint64x2_t a_u64; + if (stride == 4) return vld1q_u16(buf); + memcpy(&a, buf, 8); + buf += stride; + a_u64 = vdupq_n_u64(a); + memcpy(&a, buf, 8); + a_u64 = vsetq_lane_u64(a, a_u64, 1); + return vreinterpretq_u16_u64(a_u64); +} + // Store 2 sets of 4 bytes when alignment is not guaranteed. static INLINE void store_unaligned_u8(uint8_t *buf, ptrdiff_t stride, const uint8x8_t a) { diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index 3b04e97651..f10b7cc208 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -434,6 +434,7 @@ DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_sse2.c DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_impl_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/highbd_subpel_variance_impl_sse2.asm DSP_SRCS-$(HAVE_NEON) += arm/highbd_variance_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/highbd_subpel_variance_neon.c endif # CONFIG_VP9_HIGHBITDEPTH endif # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC From 7343d56c1bf2f32b3fe2127cfcec1006f2fd95c6 Mon Sep 17 00:00:00 2001 From: Salome Thirot Date: Wed, 8 Feb 2023 16:50:59 +0000 Subject: [PATCH 0606/1000] Refactor Neon high bitdepth avg subpel variance functions Use the same general code style as in the standard bitdepth Neon implementation - merging the computation of vpx_highbd_comp_avg_pred with the second pass of the bilinear filter to avoid storing and loading the block again. Also move vpx_highbd_comp_avg_pred_neon to its own file (like the standard bitdepth implementation) since we're no longer using it for averaging sub-pixel variance. Change-Id: I2f5916d5b397db44b3247b478ef57046797dae6c --- vpx_dsp/arm/highbd_avg_pred_neon.c | 57 ++++++++ vpx_dsp/arm/highbd_subpel_variance_neon.c | 150 ++++++++++++++-------- vpx_dsp/vpx_dsp.mk | 1 + 3 files changed, 152 insertions(+), 56 deletions(-) create mode 100644 vpx_dsp/arm/highbd_avg_pred_neon.c diff --git a/vpx_dsp/arm/highbd_avg_pred_neon.c b/vpx_dsp/arm/highbd_avg_pred_neon.c new file mode 100644 index 0000000000..04dbaad7a3 --- /dev/null +++ b/vpx_dsp/arm/highbd_avg_pred_neon.c @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_dsp_rtcd.h" +#include "./vpx_config.h" + +void vpx_highbd_comp_avg_pred_neon(uint16_t *comp_pred, const uint16_t *pred, + int width, int height, const uint16_t *ref, + int ref_stride) { + int i, j; + uint32x4_t one_u32 = vdupq_n_u32(1); + if (width >= 8) { + for (i = 0; i < height; ++i) { + for (j = 0; j < width; j += 8) { + const uint16x8_t pred_u16 = vld1q_u16(&pred[j]); + const uint16x8_t ref_u16 = vld1q_u16(&ref[j]); + const uint32x4_t sum1_u32 = + vaddl_u16(vget_low_u16(pred_u16), vget_low_u16(ref_u16)); + const uint32x4_t sum2_u32 = + vaddl_u16(vget_high_u16(pred_u16), vget_high_u16(ref_u16)); + const uint16x4_t sum1_u16 = + vshrn_n_u32(vaddq_u32(sum1_u32, one_u32), 1); + const uint16x4_t sum2_u16 = + vshrn_n_u32(vaddq_u32(sum2_u32, one_u32), 1); + const uint16x8_t vcomp_pred = vcombine_u16(sum1_u16, sum2_u16); + vst1q_u16(&comp_pred[j], vcomp_pred); + } + comp_pred += width; + pred += width; + ref += ref_stride; + } + } else { + assert(width >= 4); + for (i = 0; i < height; ++i) { + for (j = 0; j < width; j += 4) { + const uint16x4_t pred_u16 = vld1_u16(&pred[j]); + const uint16x4_t ref_u16 = vld1_u16(&ref[j]); + const uint32x4_t sum_u32 = vaddl_u16(pred_u16, ref_u16); + const uint16x4_t vcomp_pred = + vshrn_n_u32(vaddq_u32(sum_u32, one_u32), 1); + vst1_u16(&comp_pred[j], vcomp_pred); + } + comp_pred += width; + pred += width; + ref += ref_stride; + } + } +} diff --git a/vpx_dsp/arm/highbd_subpel_variance_neon.c b/vpx_dsp/arm/highbd_subpel_variance_neon.c index 81943ee4a6..07e8ea5107 100644 --- a/vpx_dsp/arm/highbd_subpel_variance_neon.c +++ b/vpx_dsp/arm/highbd_subpel_variance_neon.c @@ -185,47 +185,91 @@ HBD_SUBPEL_VARIANCE_WXH_NEON(32, 64, 1) HBD_SUBPEL_VARIANCE_WXH_NEON(64, 32, 1) HBD_SUBPEL_VARIANCE_WXH_NEON(64, 64, 1) -void vpx_highbd_comp_avg_pred_neon(uint16_t *comp_pred, const uint16_t *pred, - int width, int height, const uint16_t *ref, - int ref_stride) { - int i, j; - uint32x4_t one_u32 = vdupq_n_u32(1); - if (width >= 8) { - for (i = 0; i < height; ++i) { - for (j = 0; j < width; j += 8) { - const uint16x8_t pred_u16 = vld1q_u16(&pred[j]); - const uint16x8_t ref_u16 = vld1q_u16(&ref[j]); - const uint32x4_t sum1_u32 = - vaddl_u16(vget_low_u16(pred_u16), vget_low_u16(ref_u16)); - const uint32x4_t sum2_u32 = - vaddl_u16(vget_high_u16(pred_u16), vget_high_u16(ref_u16)); - const uint16x4_t sum1_u16 = - vshrn_n_u32(vaddq_u32(sum1_u32, one_u32), 1); - const uint16x4_t sum2_u16 = - vshrn_n_u32(vaddq_u32(sum2_u32, one_u32), 1); - const uint16x8_t vcomp_pred = vcombine_u16(sum1_u16, sum2_u16); - vst1q_u16(&comp_pred[j], vcomp_pred); - } - comp_pred += width; - pred += width; - ref += ref_stride; - } - } else { - assert(width >= 4); - for (i = 0; i < height; ++i) { - for (j = 0; j < width; j += 4) { - const uint16x4_t pred_u16 = vld1_u16(&pred[j]); - const uint16x4_t ref_u16 = vld1_u16(&ref[j]); - const uint32x4_t sum_u32 = vaddl_u16(pred_u16, ref_u16); - const uint16x4_t vcomp_pred = - vshrn_n_u32(vaddq_u32(sum_u32, one_u32), 1); - vst1_u16(&comp_pred[j], vcomp_pred); - } - comp_pred += width; - pred += width; - ref += ref_stride; - } - } +// Combine bilinear filter with vpx_highbd_comp_avg_pred for blocks having +// width 4. +static void highbd_avg_pred_var_filter_block2d_bil_w4( + const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step, + int dst_height, int filter_offset, const uint16_t *second_pred) { + const uint16x8_t f0 = vdupq_n_u16(8 - filter_offset); + const uint16x8_t f1 = vdupq_n_u16(filter_offset); + + int i = dst_height; + do { + uint16x8_t s0 = load_unaligned_u16q(src_ptr, src_stride); + uint16x8_t s1 = load_unaligned_u16q(src_ptr + pixel_step, src_stride); + uint16x8_t p = vld1q_u16(second_pred); + + uint16x8_t blend = vmulq_u16(s0, f0); + blend = vmlaq_u16(blend, s1, f1); + blend = vrshrq_n_u16(blend, 3); + + vst1q_u16(dst_ptr, vrhaddq_u16(blend, p)); + + src_ptr += 2 * src_stride; + dst_ptr += 2 * 4; + second_pred += 2 * 4; + i -= 2; + } while (i != 0); +} + +// Combine bilinear filter with vpx_highbd_comp_avg_pred for large blocks. +static void highbd_avg_pred_var_filter_block2d_bil_large( + const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step, + int dst_width, int dst_height, int filter_offset, + const uint16_t *second_pred) { + const uint16x8_t f0 = vdupq_n_u16(8 - filter_offset); + const uint16x8_t f1 = vdupq_n_u16(filter_offset); + + int i = dst_height; + do { + int j = 0; + do { + uint16x8_t s0 = vld1q_u16(src_ptr + j); + uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step); + uint16x8_t p = vld1q_u16(second_pred); + + uint16x8_t blend = vmulq_u16(s0, f0); + blend = vmlaq_u16(blend, s1, f1); + blend = vrshrq_n_u16(blend, 3); + + vst1q_u16(dst_ptr + j, vrhaddq_u16(blend, p)); + + j += 8; + second_pred += 8; + } while (j < dst_width); + + src_ptr += src_stride; + dst_ptr += dst_width; + } while (--i != 0); +} + +static void highbd_avg_pred_var_filter_block2d_bil_w8( + const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step, + int dst_height, int filter_offset, const uint16_t *second_pred) { + highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, + pixel_step, 8, dst_height, + filter_offset, second_pred); +} +static void highbd_avg_pred_var_filter_block2d_bil_w16( + const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step, + int dst_height, int filter_offset, const uint16_t *second_pred) { + highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, + pixel_step, 16, dst_height, + filter_offset, second_pred); +} +static void highbd_avg_pred_var_filter_block2d_bil_w32( + const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step, + int dst_height, int filter_offset, const uint16_t *second_pred) { + highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, + pixel_step, 32, dst_height, + filter_offset, second_pred); +} +static void highbd_avg_pred_var_filter_block2d_bil_w64( + const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step, + int dst_height, int filter_offset, const uint16_t *second_pred) { + highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, + pixel_step, 64, dst_height, + filter_offset, second_pred); } #define SUBPEL_AVG_VARIANCE_WXH_NEON(w, h, padding) \ @@ -239,12 +283,10 @@ void vpx_highbd_comp_avg_pred_neon(uint16_t *comp_pred, const uint16_t *pred, \ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \ (h + padding), xoffset); \ - highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ + highbd_avg_pred_var_filter_block2d_bil_w##w( \ + tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \ \ - vpx_highbd_comp_avg_pred_neon(tmp0, CONVERT_TO_SHORTPTR(second_pred), w, \ - h, tmp1, w); \ - \ - return vpx_highbd_8_variance##w##x##h##_neon(CONVERT_TO_BYTEPTR(tmp0), w, \ + return vpx_highbd_8_variance##w##x##h##_neon(CONVERT_TO_BYTEPTR(tmp1), w, \ ref, ref_stride, sse); \ } \ \ @@ -258,12 +300,10 @@ void vpx_highbd_comp_avg_pred_neon(uint16_t *comp_pred, const uint16_t *pred, \ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \ (h + padding), xoffset); \ - highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ - \ - vpx_highbd_comp_avg_pred_neon(tmp0, CONVERT_TO_SHORTPTR(second_pred), w, \ - h, tmp1, w); \ + highbd_avg_pred_var_filter_block2d_bil_w##w( \ + tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \ \ - return vpx_highbd_10_variance##w##x##h##_neon(CONVERT_TO_BYTEPTR(tmp0), w, \ + return vpx_highbd_10_variance##w##x##h##_neon(CONVERT_TO_BYTEPTR(tmp1), w, \ ref, ref_stride, sse); \ } \ \ @@ -277,12 +317,10 @@ void vpx_highbd_comp_avg_pred_neon(uint16_t *comp_pred, const uint16_t *pred, \ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \ (h + padding), xoffset); \ - highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ - \ - vpx_highbd_comp_avg_pred_neon(tmp0, CONVERT_TO_SHORTPTR(second_pred), w, \ - h, tmp1, w); \ + highbd_avg_pred_var_filter_block2d_bil_w##w( \ + tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \ \ - return vpx_highbd_12_variance##w##x##h##_neon(CONVERT_TO_BYTEPTR(tmp0), w, \ + return vpx_highbd_12_variance##w##x##h##_neon(CONVERT_TO_BYTEPTR(tmp1), w, \ ref, ref_stride, sse); \ } diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index f10b7cc208..5535f82c07 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -433,6 +433,7 @@ ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_sse2.c DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_impl_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/highbd_subpel_variance_impl_sse2.asm +DSP_SRCS-$(HAVE_NEON) += arm/highbd_avg_pred_neon.c DSP_SRCS-$(HAVE_NEON) += arm/highbd_variance_neon.c DSP_SRCS-$(HAVE_NEON) += arm/highbd_subpel_variance_neon.c endif # CONFIG_VP9_HIGHBITDEPTH From c113d6b027b0299abe31f05240e2895112517c91 Mon Sep 17 00:00:00 2001 From: Salome Thirot Date: Thu, 9 Feb 2023 14:16:30 +0000 Subject: [PATCH 0607/1000] Specialize Neon high bitdepth subpel variance by filter value Use the same specialization as for standard bitdepth. The rationale for the specialization is as follows: The optimal implementation of the bilinear interpolation depends on the filter values being used. For both horizontal and vertical interpolation this can simplify to just taking the source values, or averaging the source and reference values - which can be computed more easily than a bilinear interpolation with arbitrary filter values. This patch introduces tests to find the most optimal bilinear interpolation implementation based on the filter values being used. This new specialization is only used for larger block sizes. Change-Id: I73182c979255f0332a274f2e5907df7f38c9eeb3 --- vpx_dsp/arm/highbd_subpel_variance_neon.c | 217 ++++++++++++++++------ 1 file changed, 161 insertions(+), 56 deletions(-) diff --git a/vpx_dsp/arm/highbd_subpel_variance_neon.c b/vpx_dsp/arm/highbd_subpel_variance_neon.c index 07e8ea5107..0682eb3a29 100644 --- a/vpx_dsp/arm/highbd_subpel_variance_neon.c +++ b/vpx_dsp/arm/highbd_subpel_variance_neon.c @@ -120,70 +120,175 @@ static void highbd_var_filter_block2d_bil_w64(const uint16_t *src_ptr, 64, dst_height, filter_offset); } -#define HBD_SUBPEL_VARIANCE_WXH_NEON(w, h, padding) \ - unsigned int vpx_highbd_8_sub_pixel_variance##w##x##h##_neon( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *ref, int ref_stride, uint32_t *sse) { \ - uint16_t tmp0[w * (h + padding)]; \ - uint16_t tmp1[w * h]; \ - uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \ - \ - highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \ - (h + padding), xoffset); \ - highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ - \ - return vpx_highbd_8_variance##w##x##h(CONVERT_TO_BYTEPTR(tmp1), w, ref, \ - ref_stride, sse); \ - } \ - \ - unsigned int vpx_highbd_10_sub_pixel_variance##w##x##h##_neon( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *ref, int ref_stride, uint32_t *sse) { \ - uint16_t tmp0[w * (h + padding)]; \ - uint16_t tmp1[w * h]; \ - uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \ - \ - highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \ - (h + padding), xoffset); \ - highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ - \ - return vpx_highbd_10_variance##w##x##h(CONVERT_TO_BYTEPTR(tmp1), w, ref, \ - ref_stride, sse); \ - } \ - unsigned int vpx_highbd_12_sub_pixel_variance##w##x##h##_neon( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *ref, int ref_stride, uint32_t *sse) { \ - uint16_t tmp0[w * (h + padding)]; \ - uint16_t tmp1[w * h]; \ - uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \ - \ - highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \ - (h + padding), xoffset); \ - highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ - \ - return vpx_highbd_12_variance##w##x##h(CONVERT_TO_BYTEPTR(tmp1), w, ref, \ - ref_stride, sse); \ +static void highbd_var_filter_block2d_avg(const uint16_t *src_ptr, + uint16_t *dst_ptr, int src_stride, + int pixel_step, int dst_width, + int dst_height) { + int i = dst_height; + + // We only specialize on the filter values for large block sizes (>= 16x16.) + assert(dst_width >= 16 && dst_width % 16 == 0); + + do { + int j = 0; + do { + uint16x8_t s0 = vld1q_u16(src_ptr + j); + uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step); + uint16x8_t avg = vrhaddq_u16(s0, s1); + vst1q_u16(dst_ptr + j, avg); + + j += 8; + } while (j < dst_width); + + src_ptr += src_stride; + dst_ptr += dst_width; + } while (--i != 0); +} + +#define HBD_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h, padding) \ + unsigned int vpx_highbd_##bitdepth##_sub_pixel_variance##w##x##h##_neon( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, uint32_t *sse) { \ + uint16_t tmp0[w * (h + padding)]; \ + uint16_t tmp1[w * h]; \ + uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \ + \ + highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \ + (h + padding), xoffset); \ + highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ + \ + return vpx_highbd_##bitdepth##_variance##w##x##h(CONVERT_TO_BYTEPTR(tmp1), \ + w, ref, ref_stride, sse); \ + } + +#define HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h, padding) \ + unsigned int vpx_highbd_##bitdepth##_sub_pixel_variance##w##x##h##_neon( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, unsigned int *sse) { \ + uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \ + \ + if (xoffset == 0) { \ + if (yoffset == 0) { \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(src_ptr), src_stride, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + uint16_t tmp[w * h]; \ + highbd_var_filter_block2d_avg(src_ptr, tmp, src_stride, src_stride, w, \ + h); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \ + } else { \ + uint16_t tmp[w * h]; \ + highbd_var_filter_block2d_bil_w##w(src_ptr, tmp, src_stride, \ + src_stride, h, yoffset); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \ + } \ + } else if (xoffset == 4) { \ + uint16_t tmp0[w * (h + padding)]; \ + if (yoffset == 0) { \ + highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, h); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + uint16_t tmp1[w * (h + padding)]; \ + highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, \ + (h + padding)); \ + highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ + } else { \ + uint16_t tmp1[w * (h + padding)]; \ + highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, \ + (h + padding)); \ + highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ + } \ + } else { \ + uint16_t tmp0[w * (h + padding)]; \ + if (yoffset == 0) { \ + highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, h, \ + xoffset); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + uint16_t tmp1[w * h]; \ + highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \ + (h + padding), xoffset); \ + highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ + } else { \ + uint16_t tmp1[w * h]; \ + highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \ + (h + padding), xoffset); \ + highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ + } \ + } \ } // 4x blocks are processed two rows at a time, so require an extra row of // padding. -HBD_SUBPEL_VARIANCE_WXH_NEON(4, 4, 2) -HBD_SUBPEL_VARIANCE_WXH_NEON(4, 8, 2) -HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 1) -HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 1) -HBD_SUBPEL_VARIANCE_WXH_NEON(8, 16, 1) +// 8-bit +HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 4, 2) +HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 8, 2) + +HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 4, 1) +HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 8, 1) +HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 16, 1) + +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 8, 1) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 16, 1) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 32, 1) + +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 16, 1) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 32, 1) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 64, 1) + +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 32, 1) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 64, 1) + +// 10-bit +HBD_SUBPEL_VARIANCE_WXH_NEON(10, 4, 4, 2) +HBD_SUBPEL_VARIANCE_WXH_NEON(10, 4, 8, 2) + +HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 4, 1) +HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 8, 1) +HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 16, 1) + +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 8, 1) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 16, 1) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 32, 1) + +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 16, 1) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 32, 1) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 64, 1) + +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 32, 1) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 64, 1) + +// 12-bit +HBD_SUBPEL_VARIANCE_WXH_NEON(12, 4, 4, 2) +HBD_SUBPEL_VARIANCE_WXH_NEON(12, 4, 8, 2) + +HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 4, 1) +HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 8, 1) +HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 16, 1) -HBD_SUBPEL_VARIANCE_WXH_NEON(16, 8, 1) -HBD_SUBPEL_VARIANCE_WXH_NEON(16, 16, 1) -HBD_SUBPEL_VARIANCE_WXH_NEON(16, 32, 1) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 8, 1) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 16, 1) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 32, 1) -HBD_SUBPEL_VARIANCE_WXH_NEON(32, 16, 1) -HBD_SUBPEL_VARIANCE_WXH_NEON(32, 32, 1) -HBD_SUBPEL_VARIANCE_WXH_NEON(32, 64, 1) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 16, 1) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 32, 1) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 64, 1) -HBD_SUBPEL_VARIANCE_WXH_NEON(64, 32, 1) -HBD_SUBPEL_VARIANCE_WXH_NEON(64, 64, 1) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 32, 1) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 64, 1) // Combine bilinear filter with vpx_highbd_comp_avg_pred for blocks having // width 4. From e03217c9d543d2f1053b45e11f6c323794c80e9a Mon Sep 17 00:00:00 2001 From: Salome Thirot Date: Thu, 9 Feb 2023 16:45:01 +0000 Subject: [PATCH 0608/1000] Specialize Neon high bitdepth avg subpel variance by filter value Use the same specialization as for standard bitdepth. The rationale for the specialization is as follows: The optimal implementation of the bilinear interpolation depends on the filter values being used. For both horizontal and vertical interpolation this can simplify to just taking the source values, or averaging the source and reference values - which can be computed more easily than a bilinear interpolation with arbitrary filter values. This patch introduces tests to find the most optimal bilinear interpolation implementation based on the filter values being used. This new specialization is only used for larger block sizes. Change-Id: Id5a2b2d9fac6f878795a6ed9de2bc27d9e62d661 --- vpx_dsp/arm/highbd_subpel_variance_neon.c | 264 +++++++++++++++++----- 1 file changed, 204 insertions(+), 60 deletions(-) diff --git a/vpx_dsp/arm/highbd_subpel_variance_neon.c b/vpx_dsp/arm/highbd_subpel_variance_neon.c index 0682eb3a29..aa64697458 100644 --- a/vpx_dsp/arm/highbd_subpel_variance_neon.c +++ b/vpx_dsp/arm/highbd_subpel_variance_neon.c @@ -377,74 +377,218 @@ static void highbd_avg_pred_var_filter_block2d_bil_w64( filter_offset, second_pred); } -#define SUBPEL_AVG_VARIANCE_WXH_NEON(w, h, padding) \ - uint32_t vpx_highbd_8_sub_pixel_avg_variance##w##x##h##_neon( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *ref, int ref_stride, uint32_t *sse, \ - const uint8_t *second_pred) { \ - uint16_t tmp0[w * (h + padding)]; \ - uint16_t tmp1[w * h]; \ - uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \ - \ - highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \ - (h + padding), xoffset); \ - highbd_avg_pred_var_filter_block2d_bil_w##w( \ - tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \ - \ - return vpx_highbd_8_variance##w##x##h##_neon(CONVERT_TO_BYTEPTR(tmp1), w, \ - ref, ref_stride, sse); \ - } \ - \ - uint32_t vpx_highbd_10_sub_pixel_avg_variance##w##x##h##_neon( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *ref, int ref_stride, uint32_t *sse, \ - const uint8_t *second_pred) { \ - uint16_t tmp0[w * (h + padding)]; \ - uint16_t tmp1[w * h]; \ - uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \ - \ - highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \ - (h + padding), xoffset); \ - highbd_avg_pred_var_filter_block2d_bil_w##w( \ - tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \ - \ - return vpx_highbd_10_variance##w##x##h##_neon(CONVERT_TO_BYTEPTR(tmp1), w, \ - ref, ref_stride, sse); \ - } \ - \ - uint32_t vpx_highbd_12_sub_pixel_avg_variance##w##x##h##_neon( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *ref, int ref_stride, uint32_t *sse, \ +// Combine averaging subpel filter with vpx_highbd_comp_avg_pred. +static void highbd_avg_pred_var_filter_block2d_avg( + const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step, + int dst_width, int dst_height, const uint16_t *second_pred) { + int i = dst_height; + + // We only specialize on the filter values for large block sizes (>= 16x16.) + assert(dst_width >= 16 && dst_width % 16 == 0); + + do { + int j = 0; + do { + uint16x8_t s0 = vld1q_u16(src_ptr + j); + uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step); + uint16x8_t avg = vrhaddq_u16(s0, s1); + + uint16x8_t p = vld1q_u16(second_pred); + avg = vrhaddq_u16(avg, p); + + vst1q_u16(dst_ptr + j, avg); + + j += 8; + second_pred += 8; + } while (j < dst_width); + + src_ptr += src_stride; + dst_ptr += dst_width; + } while (--i != 0); +} + +// Implementation of vpx_highbd_comp_avg_pred for blocks having width >= 16. +static void highbd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr, + int src_stride, int dst_width, int dst_height, + const uint16_t *second_pred) { + int i = dst_height; + + // We only specialize on the filter values for large block sizes (>= 16x16.) + assert(dst_width >= 16 && dst_width % 16 == 0); + + do { + int j = 0; + do { + uint16x8_t s = vld1q_u16(src_ptr + j); + uint16x8_t p = vld1q_u16(second_pred); + + uint16x8_t avg = vrhaddq_u16(s, p); + + vst1q_u16(dst_ptr + j, avg); + + j += 8; + second_pred += 8; + } while (j < dst_width); + + src_ptr += src_stride; + dst_ptr += dst_width; + } while (--i != 0); +} + +#define HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h, padding) \ + uint32_t vpx_highbd_##bitdepth##_sub_pixel_avg_variance##w##x##h##_neon( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t tmp0[w * (h + padding)]; \ + uint16_t tmp1[w * h]; \ + uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \ + \ + highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \ + (h + padding), xoffset); \ + highbd_avg_pred_var_filter_block2d_bil_w##w( \ + tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \ + \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ + } + +#define HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h, padding) \ + unsigned int vpx_highbd_##bitdepth##_sub_pixel_avg_variance##w##x##h##_neon( \ + const uint8_t *src, int source_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, unsigned int *sse, \ const uint8_t *second_pred) { \ - uint16_t tmp0[w * (h + padding)]; \ - uint16_t tmp1[w * h]; \ uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \ \ - highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \ - (h + padding), xoffset); \ - highbd_avg_pred_var_filter_block2d_bil_w##w( \ - tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \ - \ - return vpx_highbd_12_variance##w##x##h##_neon(CONVERT_TO_BYTEPTR(tmp1), w, \ - ref, ref_stride, sse); \ + if (xoffset == 0) { \ + uint16_t tmp[w * h]; \ + if (yoffset == 0) { \ + highbd_avg_pred(src_ptr, tmp, source_stride, w, h, \ + CONVERT_TO_SHORTPTR(second_pred)); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + highbd_avg_pred_var_filter_block2d_avg( \ + src_ptr, tmp, source_stride, source_stride, w, h, \ + CONVERT_TO_SHORTPTR(second_pred)); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \ + } else { \ + highbd_avg_pred_var_filter_block2d_bil_w##w( \ + src_ptr, tmp, source_stride, source_stride, h, yoffset, \ + CONVERT_TO_SHORTPTR(second_pred)); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \ + } \ + } else if (xoffset == 4) { \ + uint16_t tmp0[w * (h + padding)]; \ + if (yoffset == 0) { \ + highbd_avg_pred_var_filter_block2d_avg( \ + src_ptr, tmp0, source_stride, 1, w, h, \ + CONVERT_TO_SHORTPTR(second_pred)); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + uint16_t tmp1[w * (h + padding)]; \ + highbd_var_filter_block2d_avg(src_ptr, tmp0, source_stride, 1, w, \ + (h + padding)); \ + highbd_avg_pred_var_filter_block2d_avg( \ + tmp0, tmp1, w, w, w, h, CONVERT_TO_SHORTPTR(second_pred)); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ + } else { \ + uint16_t tmp1[w * (h + padding)]; \ + highbd_var_filter_block2d_avg(src_ptr, tmp0, source_stride, 1, w, \ + (h + padding)); \ + highbd_avg_pred_var_filter_block2d_bil_w##w( \ + tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ + } \ + } else { \ + uint16_t tmp0[w * (h + padding)]; \ + if (yoffset == 0) { \ + highbd_avg_pred_var_filter_block2d_bil_w##w( \ + src_ptr, tmp0, source_stride, 1, h, xoffset, \ + CONVERT_TO_SHORTPTR(second_pred)); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + uint16_t tmp1[w * h]; \ + highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, source_stride, 1, \ + (h + padding), xoffset); \ + highbd_avg_pred_var_filter_block2d_avg( \ + tmp0, tmp1, w, w, w, h, CONVERT_TO_SHORTPTR(second_pred)); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ + } else { \ + uint16_t tmp1[w * h]; \ + highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, source_stride, 1, \ + (h + padding), xoffset); \ + highbd_avg_pred_var_filter_block2d_bil_w##w( \ + tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ + } \ + } \ } // 4x blocks are processed two rows at a time, so require an extra row of // padding. -SUBPEL_AVG_VARIANCE_WXH_NEON(4, 4, 2) -SUBPEL_AVG_VARIANCE_WXH_NEON(4, 8, 2) -SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 1) -SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 1) -SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 1) +// 8-bit +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 4, 2) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 8, 2) + +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 4, 1) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 8, 1) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 16, 1) + +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 8, 1) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 16, 1) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 32, 1) + +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 16, 1) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 32, 1) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 64, 1) + +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 32, 1) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 64, 1) + +// 10-bit +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 4, 2) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 8, 2) + +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 4, 1) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 8, 1) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 16, 1) + +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 8, 1) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 16, 1) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 32, 1) + +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 16, 1) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 32, 1) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 64, 1) + +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 32, 1) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 64, 1) + +// 12-bit +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 4, 2) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 8, 2) + +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 4, 1) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 8, 1) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 16, 1) -SUBPEL_AVG_VARIANCE_WXH_NEON(16, 8, 1) -SUBPEL_AVG_VARIANCE_WXH_NEON(16, 16, 1) -SUBPEL_AVG_VARIANCE_WXH_NEON(16, 32, 1) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 8, 1) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 16, 1) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 32, 1) -SUBPEL_AVG_VARIANCE_WXH_NEON(32, 16, 1) -SUBPEL_AVG_VARIANCE_WXH_NEON(32, 32, 1) -SUBPEL_AVG_VARIANCE_WXH_NEON(32, 64, 1) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 16, 1) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 32, 1) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 64, 1) -SUBPEL_AVG_VARIANCE_WXH_NEON(64, 32, 1) -SUBPEL_AVG_VARIANCE_WXH_NEON(64, 64, 1) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 32, 1) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 64, 1) From b17993ca673f284c99b789b207c80df50f8c4429 Mon Sep 17 00:00:00 2001 From: Salome Thirot Date: Fri, 10 Feb 2023 10:29:24 +0000 Subject: [PATCH 0609/1000] Add Neon AvgPredTestHBD test suite Add test suite for vpx_highbd_comp_avg_pred_neon. Change-Id: I5c31e0e990661ee3b8030bb517829c088fceae4d --- test/comp_avg_pred_test.cc | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/test/comp_avg_pred_test.cc b/test/comp_avg_pred_test.cc index 70aeab8d7e..f747c3524e 100644 --- a/test/comp_avg_pred_test.cc +++ b/test/comp_avg_pred_test.cc @@ -260,5 +260,11 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(&highbd_wrapper)); #endif // HAVE_SSE2 +#if HAVE_NEON +INSTANTIATE_TEST_SUITE_P( + NEON, AvgPredTestHBD, + ::testing::Values(&highbd_wrapper)); +#endif // HAVE_NEON + #endif // CONFIG_VP9_HIGHBITDEPTH } // namespace From ed68c267cfafbd5acc4fe9a9d61b27a9e70170de Mon Sep 17 00:00:00 2001 From: Salome Thirot Date: Fri, 10 Feb 2023 10:50:47 +0000 Subject: [PATCH 0610/1000] Optimize vpx_highbd_comp_avg_pred_neon Optimize the implementation of vpx_highbd_comp_avg_pred_neon by making use of the URHADD instruction to compute the average. Change-Id: Id74a6d9c33e89bc548c3c7ecace59af69051b4a7 --- vpx_dsp/arm/highbd_avg_pred_neon.c | 67 +++++++++++++++++------------- 1 file changed, 37 insertions(+), 30 deletions(-) diff --git a/vpx_dsp/arm/highbd_avg_pred_neon.c b/vpx_dsp/arm/highbd_avg_pred_neon.c index 04dbaad7a3..3063acbb3e 100644 --- a/vpx_dsp/arm/highbd_avg_pred_neon.c +++ b/vpx_dsp/arm/highbd_avg_pred_neon.c @@ -9,6 +9,7 @@ */ #include +#include #include "./vpx_dsp_rtcd.h" #include "./vpx_config.h" @@ -16,42 +17,48 @@ void vpx_highbd_comp_avg_pred_neon(uint16_t *comp_pred, const uint16_t *pred, int width, int height, const uint16_t *ref, int ref_stride) { - int i, j; - uint32x4_t one_u32 = vdupq_n_u32(1); - if (width >= 8) { - for (i = 0; i < height; ++i) { - for (j = 0; j < width; j += 8) { - const uint16x8_t pred_u16 = vld1q_u16(&pred[j]); - const uint16x8_t ref_u16 = vld1q_u16(&ref[j]); - const uint32x4_t sum1_u32 = - vaddl_u16(vget_low_u16(pred_u16), vget_low_u16(ref_u16)); - const uint32x4_t sum2_u32 = - vaddl_u16(vget_high_u16(pred_u16), vget_high_u16(ref_u16)); - const uint16x4_t sum1_u16 = - vshrn_n_u32(vaddq_u32(sum1_u32, one_u32), 1); - const uint16x4_t sum2_u16 = - vshrn_n_u32(vaddq_u32(sum2_u32, one_u32), 1); - const uint16x8_t vcomp_pred = vcombine_u16(sum1_u16, sum2_u16); - vst1q_u16(&comp_pred[j], vcomp_pred); - } + int i = height; + if (width > 8) { + do { + int j = 0; + do { + const uint16x8_t p = vld1q_u16(pred + j); + const uint16x8_t r = vld1q_u16(ref + j); + + uint16x8_t avg = vrhaddq_u16(p, r); + vst1q_u16(comp_pred + j, avg); + + j += 8; + } while (j < width); + + comp_pred += width; + pred += width; + ref += ref_stride; + } while (--i != 0); + } else if (width == 8) { + do { + const uint16x8_t p = vld1q_u16(pred); + const uint16x8_t r = vld1q_u16(ref); + + uint16x8_t avg = vrhaddq_u16(p, r); + vst1q_u16(comp_pred, avg); + comp_pred += width; pred += width; ref += ref_stride; - } + } while (--i != 0); } else { - assert(width >= 4); - for (i = 0; i < height; ++i) { - for (j = 0; j < width; j += 4) { - const uint16x4_t pred_u16 = vld1_u16(&pred[j]); - const uint16x4_t ref_u16 = vld1_u16(&ref[j]); - const uint32x4_t sum_u32 = vaddl_u16(pred_u16, ref_u16); - const uint16x4_t vcomp_pred = - vshrn_n_u32(vaddq_u32(sum_u32, one_u32), 1); - vst1_u16(&comp_pred[j], vcomp_pred); - } + assert(width == 4); + do { + const uint16x4_t p = vld1_u16(pred); + const uint16x4_t r = vld1_u16(ref); + + uint16x4_t avg = vrhadd_u16(p, r); + vst1_u16(comp_pred, avg); + comp_pred += width; pred += width; ref += ref_stride; - } + } while (--i != 0); } } From 660031ccf380e6a37beecc67e78154b7b6ea78d8 Mon Sep 17 00:00:00 2001 From: chiyotsai Date: Mon, 13 Feb 2023 17:57:26 -0800 Subject: [PATCH 0611/1000] Enable some more speed features on speed 0 to 2 Performance: | SPD_SET | TESTSET | AVG_PSNR | OVR_PSNR | SSIM | ENC_T | |---------|---------|----------|----------|---------|-------| | 0 | hdres2 | +0.034% | +0.030% | +0.033% | -3.7% | | 0 | lowres2 | +0.012% | +0.017% | +0.044% | -2.1% | | 0 | midres2 | +0.030% | +0.035% | +0.060% | -1.9% | |---------|---------|----------|----------|---------|-------| | 1 | hdres2 | +0.027% | +0.036% | +0.030% | -2.7% | | 1 | lowres2 | -0.006% | -0.002% | +0.006% | -1.0% | | 1 | midres2 | -0.006% | -0.012% | -0.010% | -1.0% | |---------|---------|----------|----------|---------|-------| | 2 | hdres2 | -0.006% | -0.001% | -0.020% | -2.4% | | 2 | lowres2 | -0.010% | -0.015% | -0.001% | -0.9% | | 2 | midres2 | +0.006% | -0.005% | +0.009% | -1.0% | STATS_CHANGED Change-Id: I1431ac07215bb844739a410697387b9aead82792 --- vp9/encoder/vp9_speed_features.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c index 19e37537b2..58e9e739a2 100644 --- a/vp9/encoder/vp9_speed_features.c +++ b/vp9/encoder/vp9_speed_features.c @@ -84,6 +84,9 @@ static void set_good_speed_feature_framesize_dependent(VP9_COMP *cpi, } else { sf->use_square_only_thresh_high = BLOCK_32X32; } + if (is_720p_or_larger) { + sf->alt_ref_search_fp = 1; + } if (!is_1080p_or_larger) { sf->rd_ml_partition.search_breakout = 1; @@ -212,6 +215,7 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, const int boosted = frame_is_boosted(cpi); int i; + sf->adaptive_interp_filter_search = 1; sf->adaptive_pred_interp_filter = 1; sf->adaptive_rd_thresh = 1; sf->adaptive_rd_thresh_row_mt = 0; @@ -223,6 +227,9 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, sf->tx_size_search_breakout = 1; sf->use_square_partition_only = !boosted; + // Reference masking is not supported in dynamic scaling mode. + sf->reference_masking = oxcf->resize_mode != RESIZE_DYNAMIC; + sf->rd_ml_partition.var_pruning = 1; sf->rd_ml_partition.prune_rect_thresh[0] = -1; sf->rd_ml_partition.prune_rect_thresh[1] = 350; @@ -299,9 +306,6 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, sf->tx_size_search_method = frame_is_boosted(cpi) ? USE_FULL_RD : USE_LARGESTALL; - // Reference masking is not supported in dynamic scaling mode. - sf->reference_masking = oxcf->resize_mode != RESIZE_DYNAMIC ? 1 : 0; - sf->mode_search_skip_flags = (cm->frame_type == KEY_FRAME) ? 0 @@ -347,7 +351,6 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, sf->mode_skip_start = 6; sf->intra_y_mode_mask[TX_32X32] = INTRA_DC; sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC; - sf->adaptive_interp_filter_search = 1; if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) { for (i = 0; i < MAX_MESH_STEP; ++i) { From b737865480d2f1355a972f2f9b3b3a0f34a9ef83 Mon Sep 17 00:00:00 2001 From: chiyotsai Date: Tue, 14 Feb 2023 14:29:29 -0800 Subject: [PATCH 0612/1000] Relax frame recode tolerance on speed 0 to 1 above 480p Performance: | SPD_SET | TESTSET | AVG_PSNR | OVR_PSNR | SSIM | ENC_T | |---------|---------|----------|----------|---------|-------| | 0 | hdres2 | -0.028% | +0.030% | -0.408% | -2.0% | | 0 | lowres2 | +0.000% | +0.000% | +0.000% | +0.0% | | 0 | midres2 | -0.138% | +0.042% | -0.427% | -2.5% | |---------|---------|----------|----------|---------|-------| | 1 | hdres2 | -0.032% | +0.018% | -0.342% | -1.1% | | 1 | lowres2 | +0.000% | +0.000% | +0.000% | +0.0% | | 1 | midres2 | +0.050% | +0.060% | -0.257% | -1.6% | Rate Error: | | | AVG_RC_ERROR | MAX_RC_ERROR | | | |---------------------|---------------------| | SPD_SET | TESTSET | BASE | TEST | BASE | TEST | |---------|---------|----------|----------|----------|----------| | 0 | hdres2 | 33.044% | 33.065% | 149.903% | 149.903% | | 0 | midres2 | 59.632% | 59.566% | 79.091% | 79.249% | |---------|---------|----------|----------|----------|----------| | 1 | hdres2 | 33.050% | 33.057% | 151.278% | 151.278% | | 1 | midres2 | 59.640% | 59.614% | 78.707% | 78.842% | STATS_CHANGED Change-Id: I5d09601fede3912d5173717ce9dd070df3a97ec8 --- vp9/encoder/vp9_speed_features.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c index 58e9e739a2..72ac0cebb8 100644 --- a/vp9/encoder/vp9_speed_features.c +++ b/vp9/encoder/vp9_speed_features.c @@ -81,6 +81,7 @@ static void set_good_speed_feature_framesize_dependent(VP9_COMP *cpi, // Currently, the machine-learning based partition search early termination // is only used while VPXMIN(cm->width, cm->height) >= 480 and speed = 0. sf->rd_ml_partition.search_early_termination = 1; + sf->recode_tolerance_high = 45; } else { sf->use_square_only_thresh_high = BLOCK_32X32; } @@ -314,7 +315,6 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, sf->disable_filter_search_var_thresh = 100; sf->comp_inter_joint_search_thresh = BLOCK_SIZES; sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX; - sf->recode_tolerance_low = 15; sf->recode_tolerance_high = 45; sf->enhanced_full_pixel_motion_search = 0; sf->prune_ref_frame_for_rect_partitions = 0; From be2fd0c7406bcf7297220d3fa43d4baa2f619c7b Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Thu, 16 Feb 2023 17:48:49 -0500 Subject: [PATCH 0613/1000] vp9 rc: Verify QP for all spatial layers Change-Id: Ic669c96d25d7c039d370e9acd00dc45e09054552 --- test/vp9_ratectrl_rtc_test.cc | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/test/vp9_ratectrl_rtc_test.cc b/test/vp9_ratectrl_rtc_test.cc index cce73fcce2..ad421d3acd 100644 --- a/test/vp9_ratectrl_rtc_test.cc +++ b/test/vp9_ratectrl_rtc_test.cc @@ -235,6 +235,7 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest, virtual void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) { ::libvpx_test::CxDataIterator iter = encoder->GetCxData(); for (int sl = 0; sl < rc_cfg_.ss_number_layers; sl++) sizes_[sl] = 0; + std::vector rc_qp; while (const vpx_codec_cx_pkt_t *pkt = iter.Next()) { ParseSuperframeSizes(static_cast(pkt->data.frame.buf), pkt->data.frame.sz); @@ -252,14 +253,17 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest, rc_api_->ComputeQP(frame_params_); frame_params_.frame_type = libvpx::RcFrameType::kInterFrame; rc_api_->PostEncodeUpdate(sizes_[sl]); + rc_qp.push_back(rc_api_->GetQP()); } } } if (!encoder_exit_) { - int loopfilter_level, qp; + int loopfilter_level; + std::vector encoder_qp(VPX_SS_MAX_LAYERS, 0); encoder->Control(VP9E_GET_LOOPFILTER_LEVEL, &loopfilter_level); - encoder->Control(VP8E_GET_LAST_QUANTIZER, &qp); - ASSERT_EQ(rc_api_->GetQP(), qp); + encoder->Control(VP9E_GET_LAST_QUANTIZER_SVC_LAYERS, encoder_qp.data()); + encoder_qp.resize(rc_qp.size()); + ASSERT_EQ(rc_qp, encoder_qp); ASSERT_EQ(rc_api_->GetLoopfilterLevel(), loopfilter_level); } } From 0f888815c546f7b3bda0795d0a585203bde406bc Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Fri, 11 Nov 2022 14:21:27 -0500 Subject: [PATCH 0614/1000] vp9 rc: Make it work for SVC parallel encoding Added unit test. Keep track of spatial layer id and frame type in case where spatial layers are encoded parallel by the hardware encoder. ComputeQP() / PostEncodeUpdate() doesn't need to be called sequentially when there is no inter layer prediction. Bug: b/257368998 Change-Id: I50beaefcfc205d3f9a9d3dbe11fead5bfdc71489 --- test/vp9_ratectrl_rtc_test.cc | 96 ++++++++++++++++++++++-------- vp9/encoder/vp9_svc_layercontext.c | 5 ++ vp9/encoder/vp9_svc_layercontext.h | 3 + vp9/ratectrl_rtc.cc | 29 ++++++++- vp9/ratectrl_rtc.h | 3 +- 5 files changed, 110 insertions(+), 26 deletions(-) diff --git a/test/vp9_ratectrl_rtc_test.cc b/test/vp9_ratectrl_rtc_test.cc index ad421d3acd..578ad26fcf 100644 --- a/test/vp9_ratectrl_rtc_test.cc +++ b/test/vp9_ratectrl_rtc_test.cc @@ -82,7 +82,7 @@ class RcInterfaceTest } virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { - rc_api_->PostEncodeUpdate(pkt->data.frame.sz); + rc_api_->PostEncodeUpdate(pkt->data.frame.sz, frame_params_); } void RunOneLayer() { @@ -162,10 +162,14 @@ class RcInterfaceTest bool encoder_exit_; }; -class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest, - public ::libvpx_test::CodecTestWithParam { +class RcInterfaceSvcTest + : public ::libvpx_test::EncoderTest, + public ::libvpx_test::CodecTestWith2Params { public: - RcInterfaceSvcTest() : EncoderTest(GET_PARAM(0)), aq_mode_(GET_PARAM(1)) {} + RcInterfaceSvcTest() + : EncoderTest(GET_PARAM(0)), aq_mode_(GET_PARAM(1)), key_interval_(3000), + dynamic_spatial_layers_(0), inter_layer_pred_off_(GET_PARAM(2)), + parallel_spatial_layers_(false) {} virtual ~RcInterfaceSvcTest() {} protected: @@ -184,6 +188,10 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest, encoder->Control(VP9E_SET_RTC_EXTERNAL_RATECTRL, 1); encoder->Control(VP9E_SET_SVC, 1); encoder->Control(VP9E_SET_SVC_PARAMETERS, &svc_params_); + if (inter_layer_pred_off_) { + encoder->Control(VP9E_SET_SVC_INTER_LAYER_PRED, + INTER_LAYER_PRED_OFF_NONKEY); + } } frame_params_.frame_type = video->frame() % key_interval_ == 0 ? libvpx::RcFrameType::kKeyFrame @@ -232,6 +240,22 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest, } } + virtual void SetFrameParamsSvc(int sl) { + frame_params_.spatial_layer_id = sl; + if (rc_cfg_.ts_number_layers == 3) + frame_params_.temporal_layer_id = + kTemporalId3Layer[current_superframe_ % 4]; + else if (rc_cfg_.ts_number_layers == 2) + frame_params_.temporal_layer_id = + kTemporalId2Layer[current_superframe_ % 2]; + else + frame_params_.temporal_layer_id = 0; + frame_params_.frame_type = + current_superframe_ % key_interval_ == 0 && sl == 0 + ? libvpx::RcFrameType::kKeyFrame + : libvpx::RcFrameType::kInterFrame; + } + virtual void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) { ::libvpx_test::CxDataIterator iter = encoder->GetCxData(); for (int sl = 0; sl < rc_cfg_.ss_number_layers; sl++) sizes_[sl] = 0; @@ -239,21 +263,28 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest, while (const vpx_codec_cx_pkt_t *pkt = iter.Next()) { ParseSuperframeSizes(static_cast(pkt->data.frame.buf), pkt->data.frame.sz); - for (int sl = 0; sl < rc_cfg_.ss_number_layers; sl++) { - if (sizes_[sl] > 0) { - frame_params_.spatial_layer_id = sl; - if (rc_cfg_.ts_number_layers == 3) - frame_params_.temporal_layer_id = - kTemporalId3Layer[current_superframe_ % 4]; - else if (rc_cfg_.ts_number_layers == 2) - frame_params_.temporal_layer_id = - kTemporalId2Layer[current_superframe_ % 2]; - else - frame_params_.temporal_layer_id = 0; - rc_api_->ComputeQP(frame_params_); - frame_params_.frame_type = libvpx::RcFrameType::kInterFrame; - rc_api_->PostEncodeUpdate(sizes_[sl]); - rc_qp.push_back(rc_api_->GetQP()); + if (!parallel_spatial_layers_ || current_superframe_ == 0) { + for (int sl = 0; sl < rc_cfg_.ss_number_layers; sl++) { + if (sizes_[sl] > 0) { + SetFrameParamsSvc(sl); + rc_api_->ComputeQP(frame_params_); + rc_api_->PostEncodeUpdate(sizes_[sl], frame_params_); + rc_qp.push_back(rc_api_->GetQP()); + } + } + } else { + for (int sl = 0; sl < rc_cfg_.ss_number_layers; sl++) { + if (sizes_[sl] > 0) { + SetFrameParamsSvc(sl); + rc_api_->ComputeQP(frame_params_); + } + } + for (int sl = 0; sl < rc_cfg_.ss_number_layers; sl++) { + if (sizes_[sl] > 0) { + SetFrameParamsSvc(sl); + rc_api_->PostEncodeUpdate(sizes_[sl], frame_params_); + rc_qp.push_back(rc_api_->GetQP()); + } } } } @@ -274,9 +305,7 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest, const vpx_image_t * /*img2*/) {} void RunSvc() { - dynamic_spatial_layers_ = 0; SetRCConfigSvc(3, 3); - key_interval_ = 10000; rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_); SetEncoderConfigSvc(3, 3); @@ -287,7 +316,6 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest, } void RunSvcPeriodicKey() { - dynamic_spatial_layers_ = 0; SetRCConfigSvc(3, 3); key_interval_ = 100; rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_); @@ -302,7 +330,19 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest, void RunSvcDynamicSpatial() { dynamic_spatial_layers_ = 1; SetRCConfigSvc(3, 3); - key_interval_ = 10000; + rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_); + SetEncoderConfigSvc(3, 3); + + ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv", + 1280, 720, 30, 1, 0, kNumFrames); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + } + + void RunSvcParallelSpatialLayers() { + if (!inter_layer_pred_off_) return; + parallel_spatial_layers_ = true; + SetRCConfigSvc(3, 3); rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_); SetEncoderConfigSvc(3, 3); @@ -505,6 +545,9 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest, uint32_t sizes_[8]; int key_interval_; int dynamic_spatial_layers_; + bool inter_layer_pred_off_; + // ComputeQP() and PostEncodeUpdate() don't need to be sequential for KSVC. + bool parallel_spatial_layers_; }; TEST_P(RcInterfaceTest, OneLayer) { RunOneLayer(); } @@ -513,11 +556,16 @@ TEST_P(RcInterfaceTest, OneLayerVBRPeriodicKey) { RunOneLayerVBRPeriodicKey(); } TEST_P(RcInterfaceSvcTest, Svc) { RunSvc(); } +TEST_P(RcInterfaceSvcTest, SvcParallelSpatialLayers) { + RunSvcParallelSpatialLayers(); +} + TEST_P(RcInterfaceSvcTest, SvcPeriodicKey) { RunSvcPeriodicKey(); } TEST_P(RcInterfaceSvcTest, SvcDynamicSpatial) { RunSvcDynamicSpatial(); } VP9_INSTANTIATE_TEST_SUITE(RcInterfaceTest, ::testing::Values(0, 3), ::testing::Values(VPX_CBR, VPX_VBR)); -VP9_INSTANTIATE_TEST_SUITE(RcInterfaceSvcTest, ::testing::Values(0, 3)); +VP9_INSTANTIATE_TEST_SUITE(RcInterfaceSvcTest, ::testing::Values(0, 3), + ::testing::Values(true, false)); } // namespace diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c index 7e9435fb5f..c60445cba5 100644 --- a/vp9/encoder/vp9_svc_layercontext.c +++ b/vp9/encoder/vp9_svc_layercontext.c @@ -389,6 +389,8 @@ void vp9_save_layer_context(VP9_COMP *const cpi) { lc->twopass = cpi->twopass; lc->target_bandwidth = (int)oxcf->target_bandwidth; lc->alt_ref_source = cpi->alt_ref_source; + lc->frame_qp = cpi->common.base_qindex; + lc->MBs = cpi->common.MBs; // For spatial-svc, allow cyclic-refresh to be applied on the spatial layers, // for the base temporal layer. @@ -408,6 +410,9 @@ void vp9_save_layer_context(VP9_COMP *const cpi) { lc->actual_num_seg1_blocks = cr->actual_num_seg1_blocks; lc->actual_num_seg2_blocks = cr->actual_num_seg2_blocks; lc->counter_encode_maxq_scene_change = cr->counter_encode_maxq_scene_change; + lc->qindex_delta[0] = cr->qindex_delta[0]; + lc->qindex_delta[1] = cr->qindex_delta[1]; + lc->qindex_delta[2] = cr->qindex_delta[2]; } } diff --git a/vp9/encoder/vp9_svc_layercontext.h b/vp9/encoder/vp9_svc_layercontext.h index c7328cf571..90dec5e20a 100644 --- a/vp9/encoder/vp9_svc_layercontext.h +++ b/vp9/encoder/vp9_svc_layercontext.h @@ -70,8 +70,11 @@ typedef struct { int actual_num_seg1_blocks; int actual_num_seg2_blocks; int counter_encode_maxq_scene_change; + int qindex_delta[3]; uint8_t speed; int loopfilter_ctrl; + int frame_qp; + int MBs; } LAYER_CONTEXT; typedef struct SVC { diff --git a/vp9/ratectrl_rtc.cc b/vp9/ratectrl_rtc.cc index 944c526ac1..8592173fb6 100644 --- a/vp9/ratectrl_rtc.cc +++ b/vp9/ratectrl_rtc.cc @@ -220,6 +220,9 @@ void VP9RateControlRTC::ComputeQP(const VP9FrameParamsQpRTC &frame_params) { vp9_rc_pick_q_and_bounds(cpi_, &bottom_index, &top_index); if (cpi_->oxcf.aq_mode == CYCLIC_REFRESH_AQ) vp9_cyclic_refresh_setup(cpi_); + if (cpi_->svc.number_spatial_layers > 1 || + cpi_->svc.number_temporal_layers > 1) + vp9_save_layer_context(cpi_); } int VP9RateControlRTC::GetQP() const { return cpi_->common.base_qindex; } @@ -242,7 +245,31 @@ bool VP9RateControlRTC::GetSegmentationData( return true; } -void VP9RateControlRTC::PostEncodeUpdate(uint64_t encoded_frame_size) { +void VP9RateControlRTC::PostEncodeUpdate( + uint64_t encoded_frame_size, const VP9FrameParamsQpRTC &frame_params) { + cpi_->common.frame_type = static_cast(frame_params.frame_type); + cpi_->svc.spatial_layer_id = frame_params.spatial_layer_id; + cpi_->svc.temporal_layer_id = frame_params.temporal_layer_id; + if (cpi_->svc.number_spatial_layers > 1 || + cpi_->svc.number_temporal_layers > 1) { + vp9_restore_layer_context(cpi_); + const int layer = LAYER_IDS_TO_IDX(cpi_->svc.spatial_layer_id, + cpi_->svc.temporal_layer_id, + cpi_->svc.number_temporal_layers); + LAYER_CONTEXT *lc = &cpi_->svc.layer_context[layer]; + cpi_->common.base_qindex = lc->frame_qp; + cpi_->common.MBs = lc->MBs; + // For spatial-svc, allow cyclic-refresh to be applied on the spatial + // layers, for the base temporal layer. + if (cpi_->oxcf.aq_mode == CYCLIC_REFRESH_AQ && + cpi_->svc.number_spatial_layers > 1 && + cpi_->svc.temporal_layer_id == 0) { + CYCLIC_REFRESH *const cr = cpi_->cyclic_refresh; + cr->qindex_delta[0] = lc->qindex_delta[0]; + cr->qindex_delta[1] = lc->qindex_delta[1]; + cr->qindex_delta[2] = lc->qindex_delta[2]; + } + } vp9_rc_postencode_update(cpi_, encoded_frame_size); if (cpi_->svc.number_spatial_layers > 1 || cpi_->svc.number_temporal_layers > 1) diff --git a/vp9/ratectrl_rtc.h b/vp9/ratectrl_rtc.h index 162a04883e..3131c22231 100644 --- a/vp9/ratectrl_rtc.h +++ b/vp9/ratectrl_rtc.h @@ -99,7 +99,8 @@ class VP9RateControlRTC { bool GetSegmentationData(VP9SegmentationData *segmentation_data) const; void ComputeQP(const VP9FrameParamsQpRTC &frame_params); // Feedback to rate control with the size of current encoded frame - void PostEncodeUpdate(uint64_t encoded_frame_size); + void PostEncodeUpdate(uint64_t encoded_frame_size, + const VP9FrameParamsQpRTC &frame_params); private: VP9RateControlRTC() {} From c4ee2b2f033d377427017b2b8244e5f29fe80961 Mon Sep 17 00:00:00 2001 From: Deepa K G Date: Thu, 16 Feb 2023 21:47:24 +0530 Subject: [PATCH 0615/1000] =?UTF-8?q?Skip=20redundant=20iterations=20in=20?= =?UTF-8?q?joint=20motion=20search=C2=A0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In joint_motion_search, there are four iterations. Even iterations search in the first reference frame and odd iterations search in the second. The last two iterations use the search result of the first two iterations as the start point. If the search result does not change,last two iterations are not necessary and can be skipped. Instruction Count cpu-used Reduction(%) 0 1.411 Change-Id: Ie583c9f75dd0a22bbdfb432ccdd62eea6ec4fce8 --- vp9/encoder/vp9_rdopt.c | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index 498bc0fbd5..201bf416db 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -1862,6 +1862,19 @@ static int check_best_zero_mv(const VP9_COMP *cpi, return 1; } +static INLINE int skip_iters(const int_mv iter_mvs[][2], int ite, int id) { + if (ite >= 2 && iter_mvs[ite - 2][!id].as_int == iter_mvs[ite][!id].as_int) { + int_mv cur_fullpel_mv, prev_fullpel_mv; + cur_fullpel_mv.as_mv.row = iter_mvs[ite][id].as_mv.row >> 3; + cur_fullpel_mv.as_mv.col = iter_mvs[ite][id].as_mv.col >> 3; + prev_fullpel_mv.as_mv.row = iter_mvs[ite - 2][id].as_mv.row >> 3; + prev_fullpel_mv.as_mv.col = iter_mvs[ite - 2][id].as_mv.col >> 3; + if (cur_fullpel_mv.as_int == prev_fullpel_mv.as_int) return 1; + } + return 0; +} + +#define NUM_ITERS 4 static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int_mv *frame_mv, int mi_row, int mi_col, int_mv single_newmv[MAX_REF_FRAMES], @@ -1874,6 +1887,7 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, const int refs[2] = { mi->ref_frame[0], mi->ref_frame[1] < 0 ? 0 : mi->ref_frame[1] }; int_mv ref_mv[2]; + int_mv iter_mvs[NUM_ITERS][2]; int ite, ref; const InterpKernel *kernel = vp9_filter_kernels[mi->interp_filter]; struct scale_factors sf; @@ -1909,6 +1923,7 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, } frame_mv[refs[ref]].as_int = single_newmv[refs[ref]].as_int; + iter_mvs[0][ref].as_int = single_newmv[refs[ref]].as_int; } // Since we have scaled the reference frames to match the size of the current @@ -1923,7 +1938,7 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, // Allow joint search multiple times iteratively for each reference frame // and break out of the search loop if it couldn't find a better mv. - for (ite = 0; ite < 4; ite++) { + for (ite = 0; ite < NUM_ITERS; ite++) { struct buf_2d ref_yv12[2]; uint32_t bestsme = UINT_MAX; int sadpb = x->sadperbit16; @@ -1935,6 +1950,11 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, // odd iterations search in the second. The predictor // found for the 'other' reference frame is factored in. + // Skip further iterations of search if in the previous iteration, the + // motion vector of the searched ref frame is unchanged, and the other ref + // frame's full-pixel mv is unchanged. + if (skip_iters(iter_mvs, ite, id)) break; + // Initialized here because of compiler problem in Visual Studio. ref_yv12[0] = xd->plane[0].pre[0]; ref_yv12[1] = xd->plane[0].pre[1]; @@ -2000,6 +2020,10 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, } else { break; } + if (ite < NUM_ITERS - 1) { + iter_mvs[ite + 1][0].as_int = frame_mv[refs[0]].as_int; + iter_mvs[ite + 1][1].as_int = frame_mv[refs[1]].as_int; + } } *rate_mv = 0; From 6ed9639e43983a95272d3a2787aa9aa0d23ffcbb Mon Sep 17 00:00:00 2001 From: Salome Thirot Date: Mon, 13 Feb 2023 16:11:31 +0000 Subject: [PATCH 0616/1000] Optimize Neon implementation of high bitpdeth variance functions Specialize implementation of high bitdepth variance functions such that we only widen data processing element types when absolutely necessary. Change-Id: If4cc3fea7b5ab0821e3129ebd79ff63706a512bf --- vpx_dsp/arm/highbd_variance_neon.c | 548 +++++++++++++++++++---------- vpx_dsp/arm/sum_neon.h | 17 + 2 files changed, 373 insertions(+), 192 deletions(-) diff --git a/vpx_dsp/arm/highbd_variance_neon.c b/vpx_dsp/arm/highbd_variance_neon.c index 985cc35682..89bd5c579d 100644 --- a/vpx_dsp/arm/highbd_variance_neon.c +++ b/vpx_dsp/arm/highbd_variance_neon.c @@ -18,214 +18,378 @@ #include "vpx_dsp/arm/sum_neon.h" #include "vpx_ports/mem.h" -static INLINE void highbd_variance16(const uint16_t *src_ptr, int src_stride, - const uint16_t *ref_ptr, int ref_stride, - int w, int h, uint64_t *sse, - int64_t *sum) { - int i, j; - - if (w >= 8) { - int32x4_t sum_s32 = vdupq_n_s32(0); - uint32x4_t sse_u32 = vdupq_n_u32(0); - for (i = 0; i < h; ++i) { - for (j = 0; j < w; j += 8) { - const int16x8_t src_s16 = vreinterpretq_s16_u16(vld1q_u16(&src_ptr[j])); - const int16x8_t ref_s16 = vreinterpretq_s16_u16(vld1q_u16(&ref_ptr[j])); - const int32x4_t diff1_s32 = - vsubl_s16(vget_low_s16(src_s16), vget_low_s16(ref_s16)); - const int32x4_t diff2_s32 = - vsubl_s16(vget_high_s16(src_s16), vget_high_s16(ref_s16)); - const uint32x4_t diff1_u32 = vreinterpretq_u32_s32(diff1_s32); - const uint32x4_t diff2_u32 = vreinterpretq_u32_s32(diff2_s32); - sum_s32 = vaddq_s32(sum_s32, diff1_s32); - sum_s32 = vaddq_s32(sum_s32, diff2_s32); - sse_u32 = vmlaq_u32(sse_u32, diff1_u32, diff1_u32); - sse_u32 = vmlaq_u32(sse_u32, diff2_u32, diff2_u32); - } - src_ptr += src_stride; - ref_ptr += ref_stride; - } - *sum = horizontal_add_int32x4(sum_s32); - *sse = horizontal_add_uint32x4(sse_u32); - } else { - int32x4_t sum_s32 = vdupq_n_s32(0); - uint32x4_t sse_u32 = vdupq_n_u32(0); - assert(w >= 4); - for (i = 0; i < h; ++i) { - for (j = 0; j < w; j += 4) { - const int16x4_t src_s16 = vreinterpret_s16_u16(vld1_u16(&src_ptr[j])); - const int16x4_t ref_s16 = vreinterpret_s16_u16(vld1_u16(&ref_ptr[j])); - const int32x4_t diff_s32 = vsubl_s16(src_s16, ref_s16); - const uint32x4_t diff_u32 = vreinterpretq_u32_s32(diff_s32); - sum_s32 = vaddq_s32(sum_s32, diff_s32); - sse_u32 = vmlaq_u32(sse_u32, diff_u32, diff_u32); - } - src_ptr += src_stride; - ref_ptr += ref_stride; - } - *sum = horizontal_add_int32x4(sum_s32); - *sse = horizontal_add_uint32x4(sse_u32); - } +// Process a block of width 4 two rows at a time. +static INLINE void highbd_variance_4xh_neon(const uint16_t *src_ptr, + int src_stride, + const uint16_t *ref_ptr, + int ref_stride, int h, + uint64_t *sse, int64_t *sum) { + int16x8_t sum_s16 = vdupq_n_s16(0); + int32x4_t sse_s32 = vdupq_n_s32(0); + + int i = h; + do { + const uint16x8_t s = load_unaligned_u16q(src_ptr, src_stride); + const uint16x8_t r = load_unaligned_u16q(ref_ptr, ref_stride); + + int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(s, r)); + sum_s16 = vaddq_s16(sum_s16, diff); + + sse_s32 = vmlal_s16(sse_s32, vget_low_s16(diff), vget_low_s16(diff)); + sse_s32 = vmlal_s16(sse_s32, vget_high_s16(diff), vget_high_s16(diff)); + + src_ptr += 2 * src_stride; + ref_ptr += 2 * ref_stride; + i -= 2; + } while (i != 0); + + *sum = horizontal_add_int16x8(sum_s16); + *sse = horizontal_add_int32x4(sse_s32); } -static INLINE void highbd_variance64(const uint8_t *src8_ptr, int src_stride, - const uint8_t *ref8_ptr, int ref_stride, - int w, int h, uint64_t *sse, - int64_t *sum) { - uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8_ptr); - uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref8_ptr); - - if (w < 32 && h < 32) { - highbd_variance16(src_ptr, src_stride, ref_ptr, ref_stride, w, h, sse, sum); - } else { - uint64_t sse_long = 0; - int64_t sum_long = 0; - int k, l; - for (k = 0; k + 16 <= h; k += 16) { - for (l = 0; l + 16 <= w; l += 16) { - uint64_t sse_tmp = 0; - int64_t sum_tmp = 0; - highbd_variance16(src_ptr + l, src_stride, ref_ptr + l, ref_stride, 16, - 16, &sse_tmp, &sum_tmp); - sum_long += sum_tmp; - sse_long += sse_tmp; - } - src_ptr += 16 * src_stride; - ref_ptr += 16 * ref_stride; - } - *sum = sum_long; - *sse = sse_long; - } +// For 8-bit and 10-bit data, since we're using two int32x4 accumulators, all +// block sizes can be processed in 32-bit elements (1023*1023*64*16 = 1071645696 +// for a 64x64 block). +static INLINE void highbd_variance_large_neon(const uint16_t *src_ptr, + int src_stride, + const uint16_t *ref_ptr, + int ref_stride, int w, int h, + uint64_t *sse, int64_t *sum) { + int32x4_t sum_s32 = vdupq_n_s32(0); + int32x4_t sse_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) }; + + int i = h; + do { + int j = 0; + do { + const uint16x8_t s = vld1q_u16(src_ptr + j); + const uint16x8_t r = vld1q_u16(ref_ptr + j); + + const int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(s, r)); + sum_s32 = vpadalq_s16(sum_s32, diff); + + sse_s32[0] = + vmlal_s16(sse_s32[0], vget_low_s16(diff), vget_low_s16(diff)); + sse_s32[1] = + vmlal_s16(sse_s32[1], vget_high_s16(diff), vget_high_s16(diff)); + + j += 8; + } while (j < w); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--i != 0); + + *sum = horizontal_add_int32x4(sum_s32); + *sse = horizontal_long_add_uint32x4(vaddq_u32( + vreinterpretq_u32_s32(sse_s32[0]), vreinterpretq_u32_s32(sse_s32[1]))); +} + +static INLINE void highbd_variance_8xh_neon(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int h, uint64_t *sse, + int64_t *sum) { + highbd_variance_large_neon(src, src_stride, ref, ref_stride, 8, h, sse, sum); +} + +static INLINE void highbd_variance_16xh_neon(const uint16_t *src, + int src_stride, + const uint16_t *ref, + int ref_stride, int h, + uint64_t *sse, int64_t *sum) { + highbd_variance_large_neon(src, src_stride, ref, ref_stride, 16, h, sse, sum); } -static INLINE void highbd_8_variance(const uint8_t *src8_ptr, int src_stride, - const uint8_t *ref8_ptr, int ref_stride, - int w, int h, uint32_t *sse, int *sum) { - uint64_t sse_long = 0; - int64_t sum_long = 0; - highbd_variance64(src8_ptr, src_stride, ref8_ptr, ref_stride, w, h, &sse_long, - &sum_long); - *sse = (uint32_t)sse_long; - *sum = (int)sum_long; +static INLINE void highbd_variance_32xh_neon(const uint16_t *src, + int src_stride, + const uint16_t *ref, + int ref_stride, int h, + uint64_t *sse, int64_t *sum) { + highbd_variance_large_neon(src, src_stride, ref, ref_stride, 32, h, sse, sum); +} + +static INLINE void highbd_variance_64xh_neon(const uint16_t *src, + int src_stride, + const uint16_t *ref, + int ref_stride, int h, + uint64_t *sse, int64_t *sum) { + highbd_variance_large_neon(src, src_stride, ref, ref_stride, 64, h, sse, sum); +} + +// For 12-bit data, we can only accumulate up to 128 elements in the sum of +// squares (4095*4095*128 = 2146435200), and because we're using two int32x4 +// accumulators, we can only process up to 32 32-element rows (32*32/8 = 128) +// or 16 64-element rows before we have to accumulate into 64-bit elements. +// Therefore blocks of size 32x64, 64x32 and 64x64 are processed in a different +// helper function. + +// Process a block of any size where the width is divisible by 8, with +// accumulation into 64-bit elements. +static INLINE void highbd_variance_xlarge_neon( + const uint16_t *src_ptr, int src_stride, const uint16_t *ref_ptr, + int ref_stride, int w, int h, int h_limit, uint64_t *sse, int64_t *sum) { + int32x4_t sum_s32 = vdupq_n_s32(0); + int64x2_t sse_s64 = vdupq_n_s64(0); + + // 'h_limit' is the number of 'w'-width rows we can process before our 32-bit + // accumulator overflows. After hitting this limit we accumulate into 64-bit + // elements. + int h_tmp = h > h_limit ? h_limit : h; + + int i = 0; + do { + int32x4_t sse_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) }; + do { + int j = 0; + do { + const uint16x8_t s0 = vld1q_u16(src_ptr + j); + const uint16x8_t r0 = vld1q_u16(ref_ptr + j); + + const int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(s0, r0)); + sum_s32 = vpadalq_s16(sum_s32, diff); + + sse_s32[0] = + vmlal_s16(sse_s32[0], vget_low_s16(diff), vget_low_s16(diff)); + sse_s32[1] = + vmlal_s16(sse_s32[1], vget_high_s16(diff), vget_high_s16(diff)); + + j += 8; + } while (j < w); + + src_ptr += src_stride; + ref_ptr += ref_stride; + i++; + } while (i < h_tmp); + + sse_s64 = vpadalq_s32(sse_s64, sse_s32[0]); + sse_s64 = vpadalq_s32(sse_s64, sse_s32[1]); + h_tmp += h_limit; + } while (i < h); + + *sum = horizontal_add_int32x4(sum_s32); + *sse = horizontal_add_int64x2(sse_s64); } -static INLINE void highbd_10_variance(const uint8_t *src8_ptr, int src_stride, - const uint8_t *ref8_ptr, int ref_stride, - int w, int h, uint32_t *sse, int *sum) { - uint64_t sse_long = 0; - int64_t sum_long = 0; - highbd_variance64(src8_ptr, src_stride, ref8_ptr, ref_stride, w, h, &sse_long, - &sum_long); - *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4); - *sum = (int)ROUND_POWER_OF_TWO(sum_long, 2); +static INLINE void highbd_variance_32xh_xlarge_neon( + const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride, + int h, uint64_t *sse, int64_t *sum) { + highbd_variance_xlarge_neon(src, src_stride, ref, ref_stride, 32, h, 32, sse, + sum); } -static INLINE void highbd_12_variance(const uint8_t *src8_ptr, int src_stride, - const uint8_t *ref8_ptr, int ref_stride, - int w, int h, uint32_t *sse, int *sum) { - uint64_t sse_long = 0; - int64_t sum_long = 0; - highbd_variance64(src8_ptr, src_stride, ref8_ptr, ref_stride, w, h, &sse_long, - &sum_long); - *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8); - *sum = (int)ROUND_POWER_OF_TWO(sum_long, 4); +static INLINE void highbd_variance_64xh_xlarge_neon( + const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride, + int h, uint64_t *sse, int64_t *sum) { + highbd_variance_xlarge_neon(src, src_stride, ref, ref_stride, 64, h, 16, sse, + sum); } -#define HBD_VARIANCE_WXH_NEON(W, H) \ - uint32_t vpx_highbd_8_variance##W##x##H##_neon( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride, uint32_t *sse) { \ - int sum; \ - highbd_8_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \ - &sum); \ - return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \ - } \ - \ - uint32_t vpx_highbd_10_variance##W##x##H##_neon( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride, uint32_t *sse) { \ - int sum; \ - int64_t var; \ - highbd_10_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \ - &sum); \ - var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ - return (var >= 0) ? (uint32_t)var : 0; \ - } \ - \ - uint32_t vpx_highbd_12_variance##W##x##H##_neon( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride, uint32_t *sse) { \ - int sum; \ - int64_t var; \ - highbd_12_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \ - &sum); \ - var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ - return (var >= 0) ? (uint32_t)var : 0; \ +#define HBD_VARIANCE_WXH_8_NEON(w, h) \ + uint32_t vpx_highbd_8_variance##w##x##h##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + uint64_t sse_long = 0; \ + int64_t sum_long = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_variance_##w##xh_neon(src, src_stride, ref, ref_stride, h, \ + &sse_long, &sum_long); \ + *sse = (uint32_t)sse_long; \ + sum = (int)sum_long; \ + return *sse - (uint32_t)(((int64_t)sum * sum) / (w * h)); \ + } + +#define HBD_VARIANCE_WXH_10_NEON(w, h) \ + uint32_t vpx_highbd_10_variance##w##x##h##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + int64_t var; \ + uint64_t sse_long = 0; \ + int64_t sum_long = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_variance_##w##xh_neon(src, src_stride, ref, ref_stride, h, \ + &sse_long, &sum_long); \ + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4); \ + sum = (int)ROUND_POWER_OF_TWO(sum_long, 2); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (w * h)); \ + return (var >= 0) ? (uint32_t)var : 0; \ } -#define HIGHBD_GET_VAR(S) \ - void vpx_highbd_8_get##S##x##S##var_neon( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride, uint32_t *sse, int *sum) { \ - highbd_8_variance(src_ptr, src_stride, ref_ptr, ref_stride, S, S, sse, \ - sum); \ - } \ - \ - void vpx_highbd_10_get##S##x##S##var_neon( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride, uint32_t *sse, int *sum) { \ - highbd_10_variance(src_ptr, src_stride, ref_ptr, ref_stride, S, S, sse, \ - sum); \ - } \ - \ - void vpx_highbd_12_get##S##x##S##var_neon( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride, uint32_t *sse, int *sum) { \ - highbd_12_variance(src_ptr, src_stride, ref_ptr, ref_stride, S, S, sse, \ - sum); \ +#define HBD_VARIANCE_WXH_12_NEON(w, h) \ + uint32_t vpx_highbd_12_variance##w##x##h##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + int64_t var; \ + uint64_t sse_long = 0; \ + int64_t sum_long = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_variance_##w##xh_neon(src, src_stride, ref, ref_stride, h, \ + &sse_long, &sum_long); \ + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8); \ + sum = (int)ROUND_POWER_OF_TWO(sum_long, 4); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (w * h)); \ + return (var >= 0) ? (uint32_t)var : 0; \ } -#define HIGHBD_MSE(W, H) \ - uint32_t vpx_highbd_8_mse##W##x##H##_neon( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride, uint32_t *sse) { \ - int sum; \ - highbd_8_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \ - &sum); \ - return *sse; \ - } \ - \ - uint32_t vpx_highbd_10_mse##W##x##H##_neon( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride, uint32_t *sse) { \ - int sum; \ - highbd_10_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \ - &sum); \ - return *sse; \ - } \ - \ - uint32_t vpx_highbd_12_mse##W##x##H##_neon( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride, uint32_t *sse) { \ - int sum; \ - highbd_12_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \ - &sum); \ - return *sse; \ +#define HBD_VARIANCE_WXH_12_XLARGE_NEON(w, h) \ + uint32_t vpx_highbd_12_variance##w##x##h##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + int64_t var; \ + uint64_t sse_long = 0; \ + int64_t sum_long = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_variance_##w##xh_xlarge_neon(src, src_stride, ref, ref_stride, h, \ + &sse_long, &sum_long); \ + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8); \ + sum = (int)ROUND_POWER_OF_TWO(sum_long, 4); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (w * h)); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } + +// 8-bit +HBD_VARIANCE_WXH_8_NEON(4, 4) +HBD_VARIANCE_WXH_8_NEON(4, 8) + +HBD_VARIANCE_WXH_8_NEON(8, 4) +HBD_VARIANCE_WXH_8_NEON(8, 8) +HBD_VARIANCE_WXH_8_NEON(8, 16) + +HBD_VARIANCE_WXH_8_NEON(16, 8) +HBD_VARIANCE_WXH_8_NEON(16, 16) +HBD_VARIANCE_WXH_8_NEON(16, 32) + +HBD_VARIANCE_WXH_8_NEON(32, 16) +HBD_VARIANCE_WXH_8_NEON(32, 32) +HBD_VARIANCE_WXH_8_NEON(32, 64) + +HBD_VARIANCE_WXH_8_NEON(64, 32) +HBD_VARIANCE_WXH_8_NEON(64, 64) + +// 10-bit +HBD_VARIANCE_WXH_10_NEON(4, 4) +HBD_VARIANCE_WXH_10_NEON(4, 8) + +HBD_VARIANCE_WXH_10_NEON(8, 4) +HBD_VARIANCE_WXH_10_NEON(8, 8) +HBD_VARIANCE_WXH_10_NEON(8, 16) + +HBD_VARIANCE_WXH_10_NEON(16, 8) +HBD_VARIANCE_WXH_10_NEON(16, 16) +HBD_VARIANCE_WXH_10_NEON(16, 32) + +HBD_VARIANCE_WXH_10_NEON(32, 16) +HBD_VARIANCE_WXH_10_NEON(32, 32) +HBD_VARIANCE_WXH_10_NEON(32, 64) + +HBD_VARIANCE_WXH_10_NEON(64, 32) +HBD_VARIANCE_WXH_10_NEON(64, 64) + +// 12-bit +HBD_VARIANCE_WXH_12_NEON(4, 4) +HBD_VARIANCE_WXH_12_NEON(4, 8) + +HBD_VARIANCE_WXH_12_NEON(8, 4) +HBD_VARIANCE_WXH_12_NEON(8, 8) +HBD_VARIANCE_WXH_12_NEON(8, 16) + +HBD_VARIANCE_WXH_12_NEON(16, 8) +HBD_VARIANCE_WXH_12_NEON(16, 16) +HBD_VARIANCE_WXH_12_NEON(16, 32) + +HBD_VARIANCE_WXH_12_NEON(32, 16) +HBD_VARIANCE_WXH_12_NEON(32, 32) +HBD_VARIANCE_WXH_12_XLARGE_NEON(32, 64) + +HBD_VARIANCE_WXH_12_XLARGE_NEON(64, 32) +HBD_VARIANCE_WXH_12_XLARGE_NEON(64, 64) + +#define HIGHBD_GET_VAR(S) \ + void vpx_highbd_8_get##S##x##S##var_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse, int *sum) { \ + uint64_t sse_long = 0; \ + int64_t sum_long = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_variance_##S##xh_neon(src, src_stride, ref, ref_stride, S, \ + &sse_long, &sum_long); \ + *sse = (uint32_t)sse_long; \ + *sum = (int)sum_long; \ + } \ + \ + void vpx_highbd_10_get##S##x##S##var_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse, int *sum) { \ + uint64_t sse_long = 0; \ + int64_t sum_long = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_variance_##S##xh_neon(src, src_stride, ref, ref_stride, S, \ + &sse_long, &sum_long); \ + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4); \ + *sum = (int)ROUND_POWER_OF_TWO(sum_long, 2); \ + } \ + \ + void vpx_highbd_12_get##S##x##S##var_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse, int *sum) { \ + uint64_t sse_long = 0; \ + int64_t sum_long = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_variance_##S##xh_neon(src, src_stride, ref, ref_stride, S, \ + &sse_long, &sum_long); \ + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8); \ + *sum = (int)ROUND_POWER_OF_TWO(sum_long, 4); \ } -HBD_VARIANCE_WXH_NEON(64, 64) -HBD_VARIANCE_WXH_NEON(64, 32) -HBD_VARIANCE_WXH_NEON(32, 64) -HBD_VARIANCE_WXH_NEON(32, 32) -HBD_VARIANCE_WXH_NEON(32, 16) -HBD_VARIANCE_WXH_NEON(16, 32) -HBD_VARIANCE_WXH_NEON(16, 16) -HBD_VARIANCE_WXH_NEON(16, 8) -HBD_VARIANCE_WXH_NEON(8, 16) -HBD_VARIANCE_WXH_NEON(8, 8) -HBD_VARIANCE_WXH_NEON(8, 4) -HBD_VARIANCE_WXH_NEON(4, 8) -HBD_VARIANCE_WXH_NEON(4, 4) +#define HIGHBD_MSE(w, h) \ + uint32_t vpx_highbd_8_mse##w##x##h##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + uint64_t sse_long = 0; \ + int64_t sum_long = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_variance_##w##xh_neon(src, src_stride, ref, ref_stride, h, \ + &sse_long, &sum_long); \ + *sse = (uint32_t)sse_long; \ + return *sse; \ + } \ + \ + uint32_t vpx_highbd_10_mse##w##x##h##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + uint64_t sse_long = 0; \ + int64_t sum_long = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_variance_##w##xh_neon(src, src_stride, ref, ref_stride, h, \ + &sse_long, &sum_long); \ + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4); \ + return *sse; \ + } \ + \ + uint32_t vpx_highbd_12_mse##w##x##h##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + uint64_t sse_long = 0; \ + int64_t sum_long = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_variance_##w##xh_neon(src, src_stride, ref, ref_stride, h, \ + &sse_long, &sum_long); \ + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8); \ + return *sse; \ + } HIGHBD_GET_VAR(8) HIGHBD_GET_VAR(16) diff --git a/vpx_dsp/arm/sum_neon.h b/vpx_dsp/arm/sum_neon.h index 21560837ae..47748a8061 100644 --- a/vpx_dsp/arm/sum_neon.h +++ b/vpx_dsp/arm/sum_neon.h @@ -110,4 +110,21 @@ static INLINE uint32x4_t horizontal_add_4d_uint32x4(const uint32x4_t sum[4]) { #endif } +static INLINE uint64_t horizontal_long_add_uint32x4(const uint32x4_t a) { +#if defined(__aarch64__) + return vaddlvq_u32(a); +#else + const uint64x2_t b = vpaddlq_u32(a); + return vgetq_lane_u64(b, 0) + vgetq_lane_u64(b, 1); +#endif +} + +static INLINE uint64_t horizontal_add_int64x2(const int64x2_t a) { +#if defined(__aarch64__) + return vaddvq_s64(a); +#else + return vgetq_lane_s64(a, 0) + vgetq_lane_s64(a, 1); +#endif +} + #endif // VPX_VPX_DSP_ARM_SUM_NEON_H_ From 46add73f7e60799fab383c5dcbad0b953eee0c7a Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 22 Feb 2023 11:34:30 -0800 Subject: [PATCH 0617/1000] vp9_block.h: rename diff struct to Diff This matches the style guide and fixes some -Wshadow warnings related to variables with the same name. Something similar was done in libaom in: 863b04994b Fix warnings reported by -Wshadow: Part2: av1 directory Bug: webm:1793 Change-Id: I4df1bbc8d079a3174d75f0d35d54c200ffdbb677 --- vp9/encoder/vp9_block.h | 2 +- vp9/encoder/vp9_encodeframe.c | 14 +++++++------- vp9/encoder/vp9_encoder.c | 3 ++- vp9/encoder/vp9_encoder.h | 2 +- 4 files changed, 11 insertions(+), 10 deletions(-) diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h index 20294b4b94..1786952911 100644 --- a/vp9/encoder/vp9_block.h +++ b/vp9/encoder/vp9_block.h @@ -24,7 +24,7 @@ typedef struct { unsigned int sse; int sum; unsigned int var; -} diff; +} Diff; struct macroblock_plane { DECLARE_ALIGNED(16, int16_t, src_diff[64 * 64]); diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index a522097e61..5b811016de 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -2432,16 +2432,16 @@ static void set_source_var_based_partition(VP9_COMP *cpi, (row8x8_remaining >= MI_BLOCK_SIZE)) { int i, j; int index; - diff d32[4]; + Diff d32[4]; const int offset = (mi_row >> 1) * cm->mb_cols + (mi_col >> 1); int is_larger_better = 0; int use32x32 = 0; unsigned int thr = cpi->source_var_thresh; - memset(d32, 0, 4 * sizeof(diff)); + memset(d32, 0, sizeof(d32)); for (i = 0; i < 4; i++) { - diff *d16[4]; + Diff *d16[4]; for (j = 0; j < 4; j++) { int b_mi_row = coord_lookup[i * 4 + j].row; @@ -5681,12 +5681,12 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td, } // end RTC play code -static INLINE uint32_t variance(const diff *const d) { +static INLINE uint32_t variance(const Diff *const d) { return d->sse - (uint32_t)(((int64_t)d->sum * d->sum) >> 8); } #if CONFIG_VP9_HIGHBITDEPTH -static INLINE uint32_t variance_highbd(diff *const d) { +static INLINE uint32_t variance_highbd(Diff *const d) { const int64_t var = (int64_t)d->sse - (((int64_t)d->sum * d->sum) >> 8); return (var >= 0) ? (uint32_t)var : 0; } @@ -5706,7 +5706,7 @@ static int set_var_thresh_from_histogram(VP9_COMP *cpi) { ? (cm->MBs * VAR_HIST_LARGE_CUT_OFF / 100) : (cm->MBs * VAR_HIST_SMALL_CUT_OFF / 100); DECLARE_ALIGNED(16, int, hist[VAR_HIST_BINS]); - diff *var16 = cpi->source_diff_var; + Diff *var16 = cpi->source_diff_var; int sum = 0; int i, j; @@ -5790,7 +5790,7 @@ static void source_var_based_partition_search_method(VP9_COMP *cpi) { if (cpi->source_diff_var) vpx_free(cpi->source_diff_var); CHECK_MEM_ERROR(cm, cpi->source_diff_var, - vpx_calloc(cm->MBs, sizeof(diff))); + vpx_calloc(cm->MBs, sizeof(cpi->source_diff_var))); } if (!cpi->frames_till_next_var_check) diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 22fbb899fd..4cec02eb93 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -2546,7 +2546,8 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf, for (i = 0; i < MAX_ARF_GOP_SIZE; ++i) cpi->tpl_stats[i].tpl_stats_ptr = NULL; // Allocate memory to store variances for a frame. - CHECK_MEM_ERROR(cm, cpi->source_diff_var, vpx_calloc(cm->MBs, sizeof(diff))); + CHECK_MEM_ERROR(cm, cpi->source_diff_var, + vpx_calloc(cm->MBs, sizeof(cpi->source_diff_var))); cpi->source_var_thresh = 0; cpi->frames_till_next_var_check = 0; #define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF) \ diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 79c0b36a17..77de5c8754 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -920,7 +920,7 @@ typedef struct VP9_COMP { SVC svc; // Store frame variance info in SOURCE_VAR_BASED_PARTITION search type. - diff *source_diff_var; + Diff *source_diff_var; // The threshold used in SOURCE_VAR_BASED_PARTITION search type. unsigned int source_var_thresh; int frames_till_next_var_check; From 3712a5869caa05f2ff8e9087cf7583f349d4e23e Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 22 Feb 2023 12:54:21 -0800 Subject: [PATCH 0618/1000] vpx_subpixel_8t_intrin_avx2: clear -Wshadow warnings no changes to assembly Bug: webm:1793 Change-Id: I6a82290cafee7f4a7909d497ccfdefd5a78fb8ed --- vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c | 31 ++++++++++------------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c index c7d880860e..841db7cd71 100644 --- a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c +++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c @@ -534,9 +534,6 @@ static void vpx_filter_block1d8_h4_avx2(const uint8_t *src_ptr, const ptrdiff_t unrolled_dst_stride = dst_stride << 1; int h; - __m256i src_reg, src_reg_shift_0, src_reg_shift_2; - __m256i dst_reg; - __m256i tmp_0, tmp_1; __m256i idx_shift_0 = _mm256_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8); @@ -557,9 +554,11 @@ static void vpx_filter_block1d8_h4_avx2(const uint8_t *src_ptr, for (h = height; h >= 2; h -= 2) { // Load the source - src_reg = mm256_loadu2_si128(src_ptr, src_ptr + src_stride); - src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0); - src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2); + const __m256i src_reg = mm256_loadu2_si128(src_ptr, src_ptr + src_stride); + __m256i dst_reg; + __m256i tmp_0, tmp_1; + const __m256i src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0); + const __m256i src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2); // Get the output tmp_0 = _mm256_maddubs_epi16(src_reg_shift_0, kernel_reg_23); @@ -580,9 +579,9 @@ static void vpx_filter_block1d8_h4_avx2(const uint8_t *src_ptr, // Repeat for the last row if needed if (h > 0) { - __m128i src_reg = _mm_loadu_si128((const __m128i *)src_ptr); + const __m128i src_reg = _mm_loadu_si128((const __m128i *)src_ptr); __m128i dst_reg; - const __m128i reg_32 = _mm_set1_epi16(32); // Used for rounding + const __m128i reg_32_128 = _mm_set1_epi16(32); // Used for rounding __m128i tmp_0, tmp_1; __m128i src_reg_shift_0 = @@ -596,7 +595,7 @@ static void vpx_filter_block1d8_h4_avx2(const uint8_t *src_ptr, _mm256_castsi256_si128(kernel_reg_45)); dst_reg = _mm_adds_epi16(tmp_0, tmp_1); - dst_reg = mm_round_epi16_sse2(&dst_reg, ®_32, 6); + dst_reg = mm_round_epi16_sse2(&dst_reg, ®_32_128, 6); dst_reg = _mm_packus_epi16(dst_reg, _mm_setzero_si128()); @@ -715,8 +714,6 @@ static void vpx_filter_block1d4_h4_avx2(const uint8_t *src_ptr, const ptrdiff_t unrolled_src_stride = src_stride << 1; const ptrdiff_t unrolled_dst_stride = dst_stride << 1; - __m256i src_reg, src_reg_shuf; - __m256i dst; __m256i shuf_idx = _mm256_setr_epi8(0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6); @@ -733,12 +730,12 @@ static void vpx_filter_block1d4_h4_avx2(const uint8_t *src_ptr, for (h = height; h > 1; h -= 2) { // Load the source - src_reg = mm256_loadu2_epi64((const __m128i *)src_ptr, - (const __m128i *)(src_ptr + src_stride)); - src_reg_shuf = _mm256_shuffle_epi8(src_reg, shuf_idx); + const __m256i src_reg = mm256_loadu2_epi64( + (const __m128i *)src_ptr, (const __m128i *)(src_ptr + src_stride)); + const __m256i src_reg_shuf = _mm256_shuffle_epi8(src_reg, shuf_idx); // Get the result - dst = _mm256_maddubs_epi16(src_reg_shuf, kernel_reg); + __m256i dst = _mm256_maddubs_epi16(src_reg_shuf, kernel_reg); dst = _mm256_hadds_epi16(dst, _mm256_setzero_si256()); // Round result @@ -757,7 +754,7 @@ static void vpx_filter_block1d4_h4_avx2(const uint8_t *src_ptr, if (h > 0) { // Load the source - const __m128i reg_32 = _mm_set1_epi16(32); // Used for rounding + const __m128i reg_32_128 = _mm_set1_epi16(32); // Used for rounding __m128i src_reg = _mm_loadl_epi64((const __m128i *)src_ptr); __m128i src_reg_shuf = _mm_shuffle_epi8(src_reg, _mm256_castsi256_si128(shuf_idx)); @@ -768,7 +765,7 @@ static void vpx_filter_block1d4_h4_avx2(const uint8_t *src_ptr, dst = _mm_hadds_epi16(dst, _mm_setzero_si128()); // Round result - dst = mm_round_epi16_sse2(&dst, ®_32, 6); + dst = mm_round_epi16_sse2(&dst, ®_32_128, 6); // Pack to 8-bits dst = _mm_packus_epi16(dst, _mm_setzero_si128()); From 4ba3be9324497d33d51d3b8a8ea900505e6f1450 Mon Sep 17 00:00:00 2001 From: chiyotsai Date: Wed, 22 Feb 2023 12:44:47 -0800 Subject: [PATCH 0619/1000] Disable some intra modes for TX_32X32 Performance: | SPD_SET | TESTSET | AVG_PSNR | OVR_PSNR | SSIM | ENC_T | |---------|---------|----------|----------|---------|-------| | 0 | hdres2 | +0.036% | +0.032% | +0.014% | -3.9% | | 0 | lowres2 | -0.002% | -0.011% | +0.020% | -3.6% | | 0 | midres2 | +0.045% | +0.025% | -0.007% | -4.0% | STATS_CHANGED Change-Id: I75a927333d26f2a37f0dda57a641b455b845f5b9 --- vp9/encoder/vp9_speed_features.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c index 72ac0cebb8..ce83a97626 100644 --- a/vp9/encoder/vp9_speed_features.c +++ b/vp9/encoder/vp9_speed_features.c @@ -228,6 +228,8 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, sf->tx_size_search_breakout = 1; sf->use_square_partition_only = !boosted; + sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V; + // Reference masking is not supported in dynamic scaling mode. sf->reference_masking = oxcf->resize_mode != RESIZE_DYNAMIC; @@ -281,7 +283,6 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, if (cpi->oxcf.content != VP9E_CONTENT_FILM) sf->mode_skip_start = 10; sf->allow_acl = 0; - sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V; sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V; if (cpi->oxcf.content != VP9E_CONTENT_FILM) { sf->intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V; From f569a4d68c703c0ded5ec71ef20e12aeeb58de1f Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 22 Feb 2023 13:21:27 -0800 Subject: [PATCH 0620/1000] vp9_adapt_mode_probs: clear -Wshadow warning Bug: webm:1793 Change-Id: Ie4ea8f0a3295e6f58dc6f7d5c61d46700c539d40 --- vp9/common/vp9_entropymode.c | 1 - 1 file changed, 1 deletion(-) diff --git a/vp9/common/vp9_entropymode.c b/vp9/common/vp9_entropymode.c index bda824de3c..9289fc9e1f 100644 --- a/vp9/common/vp9_entropymode.c +++ b/vp9/common/vp9_entropymode.c @@ -381,7 +381,6 @@ void vp9_adapt_mode_probs(VP9_COMMON *cm) { } if (cm->tx_mode == TX_MODE_SELECT) { - int j; unsigned int branch_ct_8x8p[TX_SIZES - 3][2]; unsigned int branch_ct_16x16p[TX_SIZES - 2][2]; unsigned int branch_ct_32x32p[TX_SIZES - 1][2]; From 76389886ee6bf90e8fe464547eb26bb2311ae698 Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 22 Feb 2023 13:25:29 -0800 Subject: [PATCH 0621/1000] vp9_loop_filter_alloc: clear -Wshadow warnings Bug: webm:1793 Change-Id: Ia64d175aa69dc2ecde2babf64bde04f02b32795b --- vp9/common/vp9_thread_common.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/vp9/common/vp9_thread_common.c b/vp9/common/vp9_thread_common.c index b3d50162b2..ad4478179e 100644 --- a/vp9/common/vp9_thread_common.c +++ b/vp9/common/vp9_thread_common.c @@ -306,7 +306,6 @@ void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows, CHECK_MEM_ERROR(cm, lf_sync->recon_done_mutex, vpx_malloc(sizeof(*lf_sync->recon_done_mutex) * rows)); if (lf_sync->recon_done_mutex) { - int i; for (i = 0; i < rows; ++i) { pthread_mutex_init(&lf_sync->recon_done_mutex[i], NULL); } @@ -315,7 +314,6 @@ void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows, CHECK_MEM_ERROR(cm, lf_sync->recon_done_cond, vpx_malloc(sizeof(*lf_sync->recon_done_cond) * rows)); if (lf_sync->recon_done_cond) { - int i; for (i = 0; i < rows; ++i) { pthread_cond_init(&lf_sync->recon_done_cond[i], NULL); } From aab93ee6b62fd3ab489784062ea6f825f2b871da Mon Sep 17 00:00:00 2001 From: Salome Thirot Date: Tue, 21 Feb 2023 17:40:20 +0000 Subject: [PATCH 0622/1000] Add Neon implementation of high bitdepth 8x8 hadamard transform Add Neon implementation of vpx_highbd_hadamard_8x8 as well as the corresponding tests. Change-Id: I3ef1ff199d76b6b010591ef15a81b0f36c9ded03 --- test/hadamard_test.cc | 6 ++ vpx_dsp/arm/highbd_hadamard_neon.c | 137 +++++++++++++++++++++++++++++ vpx_dsp/arm/mem_neon.h | 6 ++ vpx_dsp/vpx_dsp.mk | 3 + vpx_dsp/vpx_dsp_rtcd_defs.pl | 2 +- 5 files changed, 153 insertions(+), 1 deletion(-) create mode 100644 vpx_dsp/arm/highbd_hadamard_neon.c diff --git a/test/hadamard_test.cc b/test/hadamard_test.cc index f904e814ad..2062cbe340 100644 --- a/test/hadamard_test.cc +++ b/test/hadamard_test.cc @@ -324,5 +324,11 @@ INSTANTIATE_TEST_SUITE_P( 32))); #endif // HAVE_AVX2 +#if HAVE_NEON +INSTANTIATE_TEST_SUITE_P( + NEON, HadamardHighbdTest, + ::testing::Values(HadamardFuncWithSize(&vpx_highbd_hadamard_8x8_neon, 8))); +#endif + #endif // CONFIG_VP9_HIGHBITDEPTH } // namespace diff --git a/vpx_dsp/arm/highbd_hadamard_neon.c b/vpx_dsp/arm/highbd_hadamard_neon.c new file mode 100644 index 0000000000..615de4b0ce --- /dev/null +++ b/vpx_dsp/arm/highbd_hadamard_neon.c @@ -0,0 +1,137 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_dsp_rtcd.h" +#include "./vpx_config.h" + +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" + +static INLINE void hadamard_highbd_col8_first_pass(int16x8_t *a0, int16x8_t *a1, + int16x8_t *a2, int16x8_t *a3, + int16x8_t *a4, int16x8_t *a5, + int16x8_t *a6, + int16x8_t *a7) { + int16x8_t b0 = vaddq_s16(*a0, *a1); + int16x8_t b1 = vsubq_s16(*a0, *a1); + int16x8_t b2 = vaddq_s16(*a2, *a3); + int16x8_t b3 = vsubq_s16(*a2, *a3); + int16x8_t b4 = vaddq_s16(*a4, *a5); + int16x8_t b5 = vsubq_s16(*a4, *a5); + int16x8_t b6 = vaddq_s16(*a6, *a7); + int16x8_t b7 = vsubq_s16(*a6, *a7); + + int16x8_t c0 = vaddq_s16(b0, b2); + int16x8_t c2 = vsubq_s16(b0, b2); + int16x8_t c1 = vaddq_s16(b1, b3); + int16x8_t c3 = vsubq_s16(b1, b3); + int16x8_t c4 = vaddq_s16(b4, b6); + int16x8_t c6 = vsubq_s16(b4, b6); + int16x8_t c5 = vaddq_s16(b5, b7); + int16x8_t c7 = vsubq_s16(b5, b7); + + *a0 = vaddq_s16(c0, c4); + *a2 = vsubq_s16(c0, c4); + *a7 = vaddq_s16(c1, c5); + *a6 = vsubq_s16(c1, c5); + *a3 = vaddq_s16(c2, c6); + *a1 = vsubq_s16(c2, c6); + *a4 = vaddq_s16(c3, c7); + *a5 = vsubq_s16(c3, c7); +} + +static INLINE void hadamard_highbd_col4_second_pass(int16x4_t a0, int16x4_t a1, + int16x4_t a2, int16x4_t a3, + int16x4_t a4, int16x4_t a5, + int16x4_t a6, int16x4_t a7, + tran_low_t *coeff) { + int32x4_t b0 = vaddl_s16(a0, a1); + int32x4_t b1 = vsubl_s16(a0, a1); + int32x4_t b2 = vaddl_s16(a2, a3); + int32x4_t b3 = vsubl_s16(a2, a3); + int32x4_t b4 = vaddl_s16(a4, a5); + int32x4_t b5 = vsubl_s16(a4, a5); + int32x4_t b6 = vaddl_s16(a6, a7); + int32x4_t b7 = vsubl_s16(a6, a7); + + int32x4_t c0 = vaddq_s32(b0, b2); + int32x4_t c2 = vsubq_s32(b0, b2); + int32x4_t c1 = vaddq_s32(b1, b3); + int32x4_t c3 = vsubq_s32(b1, b3); + int32x4_t c4 = vaddq_s32(b4, b6); + int32x4_t c6 = vsubq_s32(b4, b6); + int32x4_t c5 = vaddq_s32(b5, b7); + int32x4_t c7 = vsubq_s32(b5, b7); + + int32x4_t d0 = vaddq_s32(c0, c4); + int32x4_t d2 = vsubq_s32(c0, c4); + int32x4_t d7 = vaddq_s32(c1, c5); + int32x4_t d6 = vsubq_s32(c1, c5); + int32x4_t d3 = vaddq_s32(c2, c6); + int32x4_t d1 = vsubq_s32(c2, c6); + int32x4_t d4 = vaddq_s32(c3, c7); + int32x4_t d5 = vsubq_s32(c3, c7); + + store_s32q_to_tran_low(coeff + 0, d0); + store_s32q_to_tran_low(coeff + 4, d1); + store_s32q_to_tran_low(coeff + 8, d2); + store_s32q_to_tran_low(coeff + 12, d3); + store_s32q_to_tran_low(coeff + 16, d4); + store_s32q_to_tran_low(coeff + 20, d5); + store_s32q_to_tran_low(coeff + 24, d6); + store_s32q_to_tran_low(coeff + 28, d7); +} + +void vpx_highbd_hadamard_8x8_neon(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + int16x4_t b0, b1, b2, b3, b4, b5, b6, b7; + + int16x8_t s0 = vld1q_s16(src_diff + 0 * src_stride); + int16x8_t s1 = vld1q_s16(src_diff + 1 * src_stride); + int16x8_t s2 = vld1q_s16(src_diff + 2 * src_stride); + int16x8_t s3 = vld1q_s16(src_diff + 3 * src_stride); + int16x8_t s4 = vld1q_s16(src_diff + 4 * src_stride); + int16x8_t s5 = vld1q_s16(src_diff + 5 * src_stride); + int16x8_t s6 = vld1q_s16(src_diff + 6 * src_stride); + int16x8_t s7 = vld1q_s16(src_diff + 7 * src_stride); + + // For the first pass we can stay in 16-bit elements (4095*8 = 32760). + hadamard_highbd_col8_first_pass(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + + transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + + // For the second pass we need to widen to 32-bit elements, so we're + // processing 4 columns at a time. + // Skip the second transpose because it is not required. + + b0 = vget_low_s16(s0); + b1 = vget_low_s16(s1); + b2 = vget_low_s16(s2); + b3 = vget_low_s16(s3); + b4 = vget_low_s16(s4); + b5 = vget_low_s16(s5); + b6 = vget_low_s16(s6); + b7 = vget_low_s16(s7); + + hadamard_highbd_col4_second_pass(b0, b1, b2, b3, b4, b5, b6, b7, coeff); + + b0 = vget_high_s16(s0); + b1 = vget_high_s16(s1); + b2 = vget_high_s16(s2); + b3 = vget_high_s16(s3); + b4 = vget_high_s16(s4); + b5 = vget_high_s16(s5); + b6 = vget_high_s16(s6); + b7 = vget_high_s16(s7); + + hadamard_highbd_col4_second_pass(b0, b1, b2, b3, b4, b5, b6, b7, coeff + 32); +} diff --git a/vpx_dsp/arm/mem_neon.h b/vpx_dsp/arm/mem_neon.h index 866be7439e..b7a363891e 100644 --- a/vpx_dsp/arm/mem_neon.h +++ b/vpx_dsp/arm/mem_neon.h @@ -102,6 +102,12 @@ static INLINE void store_s16q_to_tran_low(tran_low_t *buf, const int16x8_t a) { #endif } +#if CONFIG_VP9_HIGHBITDEPTH +static INLINE void store_s32q_to_tran_low(tran_low_t *buf, const int32x4_t a) { + vst1q_s32(buf, a); +} +#endif + // Propagate type information to the compiler. Without this the compiler may // assume the required alignment of uint32_t (4 bytes) and add alignment hints // to the memory access. diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index 5535f82c07..ab8e5bd817 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -342,6 +342,9 @@ DSP_SRCS-$(HAVE_SSE2) += x86/avg_intrin_sse2.c DSP_SRCS-$(HAVE_AVX2) += x86/avg_intrin_avx2.c DSP_SRCS-$(HAVE_NEON) += arm/avg_neon.c DSP_SRCS-$(HAVE_NEON) += arm/hadamard_neon.c +ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) +DSP_SRCS-$(HAVE_NEON) += arm/highbd_hadamard_neon.c +endif DSP_SRCS-$(HAVE_MSA) += mips/avg_msa.c DSP_SRCS-$(HAVE_LSX) += loongarch/avg_lsx.c ifeq ($(VPX_ARCH_X86_64),yes) diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 8725821b67..dc3cdc4145 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -802,7 +802,7 @@ () specialize qw/vpx_hadamard_32x32 sse2 avx2 neon/; add_proto qw/void vpx_highbd_hadamard_8x8/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff"; - specialize qw/vpx_highbd_hadamard_8x8 avx2/; + specialize qw/vpx_highbd_hadamard_8x8 avx2 neon/; add_proto qw/void vpx_highbd_hadamard_16x16/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff"; specialize qw/vpx_highbd_hadamard_16x16 avx2/; From 221d76ab9ca3a8e139a92386c814f71fd172d197 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Thu, 23 Feb 2023 14:28:30 -0500 Subject: [PATCH 0623/1000] vp9 rc test: change param type to bool Change-Id: Ib45522e32d9137678da9062830044e9dd87537e5 --- test/vp9_ratectrl_rtc_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/vp9_ratectrl_rtc_test.cc b/test/vp9_ratectrl_rtc_test.cc index 578ad26fcf..c6ab5b034f 100644 --- a/test/vp9_ratectrl_rtc_test.cc +++ b/test/vp9_ratectrl_rtc_test.cc @@ -164,7 +164,7 @@ class RcInterfaceTest class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest, - public ::libvpx_test::CodecTestWith2Params { + public ::libvpx_test::CodecTestWith2Params { public: RcInterfaceSvcTest() : EncoderTest(GET_PARAM(0)), aq_mode_(GET_PARAM(1)), key_interval_(3000), From 6ec45f933c6c4de3fcd9344852bde25d30613321 Mon Sep 17 00:00:00 2001 From: Salome Thirot Date: Wed, 22 Feb 2023 17:27:56 +0000 Subject: [PATCH 0624/1000] Add Neon implementation of high bitdepth 16x16 hadamard transform Add Neon implementation of vpx_highbd_hadamard_16x16 as well as the corresponding tests. Change-Id: If3299fe556351dfe3db994ac171d83a95ea1504b --- test/hadamard_test.cc | 4 ++- vpx_dsp/arm/highbd_hadamard_neon.c | 39 ++++++++++++++++++++++++++++++ vpx_dsp/arm/mem_neon.h | 4 +++ vpx_dsp/vpx_dsp_rtcd_defs.pl | 2 +- 4 files changed, 47 insertions(+), 2 deletions(-) diff --git a/test/hadamard_test.cc b/test/hadamard_test.cc index 2062cbe340..2482e87cb6 100644 --- a/test/hadamard_test.cc +++ b/test/hadamard_test.cc @@ -327,7 +327,9 @@ INSTANTIATE_TEST_SUITE_P( #if HAVE_NEON INSTANTIATE_TEST_SUITE_P( NEON, HadamardHighbdTest, - ::testing::Values(HadamardFuncWithSize(&vpx_highbd_hadamard_8x8_neon, 8))); + ::testing::Values(HadamardFuncWithSize(&vpx_highbd_hadamard_8x8_neon, 8), + HadamardFuncWithSize(&vpx_highbd_hadamard_16x16_neon, + 16))); #endif #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/arm/highbd_hadamard_neon.c b/vpx_dsp/arm/highbd_hadamard_neon.c index 615de4b0ce..013f7148f4 100644 --- a/vpx_dsp/arm/highbd_hadamard_neon.c +++ b/vpx_dsp/arm/highbd_hadamard_neon.c @@ -135,3 +135,42 @@ void vpx_highbd_hadamard_8x8_neon(const int16_t *src_diff, ptrdiff_t src_stride, hadamard_highbd_col4_second_pass(b0, b1, b2, b3, b4, b5, b6, b7, coeff + 32); } + +void vpx_highbd_hadamard_16x16_neon(const int16_t *src_diff, + ptrdiff_t src_stride, tran_low_t *coeff) { + int i = 0; + + // Rearrange 16x16 to 8x32 and remove stride. + // Top left first. + vpx_highbd_hadamard_8x8_neon(src_diff, src_stride, coeff); + // Top right. + vpx_highbd_hadamard_8x8_neon(src_diff + 8, src_stride, coeff + 64); + // Bottom left. + vpx_highbd_hadamard_8x8_neon(src_diff + 8 * src_stride, src_stride, + coeff + 128); + // Bottom right. + vpx_highbd_hadamard_8x8_neon(src_diff + 8 * src_stride + 8, src_stride, + coeff + 192); + + do { + int32x4_t a0 = load_tran_low_to_s32q(coeff + 4 * i); + int32x4_t a1 = load_tran_low_to_s32q(coeff + 4 * i + 64); + int32x4_t a2 = load_tran_low_to_s32q(coeff + 4 * i + 128); + int32x4_t a3 = load_tran_low_to_s32q(coeff + 4 * i + 192); + + int32x4_t b0 = vhaddq_s32(a0, a1); + int32x4_t b1 = vhsubq_s32(a0, a1); + int32x4_t b2 = vhaddq_s32(a2, a3); + int32x4_t b3 = vhsubq_s32(a2, a3); + + int32x4_t c0 = vaddq_s32(b0, b2); + int32x4_t c1 = vaddq_s32(b1, b3); + int32x4_t c2 = vsubq_s32(b0, b2); + int32x4_t c3 = vsubq_s32(b1, b3); + + store_s32q_to_tran_low(coeff + 4 * i, c0); + store_s32q_to_tran_low(coeff + 4 * i + 64, c1); + store_s32q_to_tran_low(coeff + 4 * i + 128, c2); + store_s32q_to_tran_low(coeff + 4 * i + 192, c3); + } while (++i < 16); +} diff --git a/vpx_dsp/arm/mem_neon.h b/vpx_dsp/arm/mem_neon.h index b7a363891e..2122956dc6 100644 --- a/vpx_dsp/arm/mem_neon.h +++ b/vpx_dsp/arm/mem_neon.h @@ -106,6 +106,10 @@ static INLINE void store_s16q_to_tran_low(tran_low_t *buf, const int16x8_t a) { static INLINE void store_s32q_to_tran_low(tran_low_t *buf, const int32x4_t a) { vst1q_s32(buf, a); } + +static INLINE int32x4_t load_tran_low_to_s32q(const tran_low_t *buf) { + return vld1q_s32(buf); +} #endif // Propagate type information to the compiler. Without this the compiler may diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index dc3cdc4145..276d55baff 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -805,7 +805,7 @@ () specialize qw/vpx_highbd_hadamard_8x8 avx2 neon/; add_proto qw/void vpx_highbd_hadamard_16x16/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff"; - specialize qw/vpx_highbd_hadamard_16x16 avx2/; + specialize qw/vpx_highbd_hadamard_16x16 avx2 neon/; add_proto qw/void vpx_highbd_hadamard_32x32/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff"; specialize qw/vpx_highbd_hadamard_32x32 avx2/; From 111068923b4ca778a680330d00161d7ee93f61e1 Mon Sep 17 00:00:00 2001 From: Salome Thirot Date: Thu, 23 Feb 2023 12:05:30 +0000 Subject: [PATCH 0625/1000] Add Neon implementation of high bitdepth 32x32 hadamard transform Add Neon implementation of vpx_highbd_hadamard_32x32 as well as the corresponding tests. Change-Id: I65d8603896649de1996b353aa79eee54824b4708 --- test/hadamard_test.cc | 5 ++-- vpx_dsp/arm/highbd_hadamard_neon.c | 39 ++++++++++++++++++++++++++++++ vpx_dsp/vpx_dsp_rtcd_defs.pl | 2 +- 3 files changed, 43 insertions(+), 3 deletions(-) diff --git a/test/hadamard_test.cc b/test/hadamard_test.cc index 2482e87cb6..9f6c99f3c4 100644 --- a/test/hadamard_test.cc +++ b/test/hadamard_test.cc @@ -328,8 +328,9 @@ INSTANTIATE_TEST_SUITE_P( INSTANTIATE_TEST_SUITE_P( NEON, HadamardHighbdTest, ::testing::Values(HadamardFuncWithSize(&vpx_highbd_hadamard_8x8_neon, 8), - HadamardFuncWithSize(&vpx_highbd_hadamard_16x16_neon, - 16))); + HadamardFuncWithSize(&vpx_highbd_hadamard_16x16_neon, 16), + HadamardFuncWithSize(&vpx_highbd_hadamard_32x32_neon, + 32))); #endif #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/arm/highbd_hadamard_neon.c b/vpx_dsp/arm/highbd_hadamard_neon.c index 013f7148f4..499eb65462 100644 --- a/vpx_dsp/arm/highbd_hadamard_neon.c +++ b/vpx_dsp/arm/highbd_hadamard_neon.c @@ -174,3 +174,42 @@ void vpx_highbd_hadamard_16x16_neon(const int16_t *src_diff, store_s32q_to_tran_low(coeff + 4 * i + 192, c3); } while (++i < 16); } + +void vpx_highbd_hadamard_32x32_neon(const int16_t *src_diff, + ptrdiff_t src_stride, tran_low_t *coeff) { + int i = 0; + + // Rearrange 32x32 to 16x64 and remove stride. + // Top left first. + vpx_highbd_hadamard_16x16_neon(src_diff, src_stride, coeff); + // Top right. + vpx_highbd_hadamard_16x16_neon(src_diff + 16, src_stride, coeff + 256); + // Bottom left. + vpx_highbd_hadamard_16x16_neon(src_diff + 16 * src_stride, src_stride, + coeff + 512); + // Bottom right. + vpx_highbd_hadamard_16x16_neon(src_diff + 16 * src_stride + 16, src_stride, + coeff + 768); + + do { + int32x4_t a0 = load_tran_low_to_s32q(coeff + 4 * i); + int32x4_t a1 = load_tran_low_to_s32q(coeff + 4 * i + 256); + int32x4_t a2 = load_tran_low_to_s32q(coeff + 4 * i + 512); + int32x4_t a3 = load_tran_low_to_s32q(coeff + 4 * i + 768); + + int32x4_t b0 = vhaddq_s32(a0, a1); + int32x4_t b1 = vhsubq_s32(a0, a1); + int32x4_t b2 = vhaddq_s32(a2, a3); + int32x4_t b3 = vhsubq_s32(a2, a3); + + int32x4_t c0 = vhaddq_s32(b0, b2); + int32x4_t c1 = vhaddq_s32(b1, b3); + int32x4_t c2 = vhsubq_s32(b0, b2); + int32x4_t c3 = vhsubq_s32(b1, b3); + + store_s32q_to_tran_low(coeff + 4 * i, c0); + store_s32q_to_tran_low(coeff + 4 * i + 256, c1); + store_s32q_to_tran_low(coeff + 4 * i + 512, c2); + store_s32q_to_tran_low(coeff + 4 * i + 768, c3); + } while (++i < 64); +} diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 276d55baff..eef72249e0 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -808,7 +808,7 @@ () specialize qw/vpx_highbd_hadamard_16x16 avx2 neon/; add_proto qw/void vpx_highbd_hadamard_32x32/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff"; - specialize qw/vpx_highbd_hadamard_32x32 avx2/; + specialize qw/vpx_highbd_hadamard_32x32 avx2 neon/; add_proto qw/int vpx_satd/, "const tran_low_t *coeff, int length"; specialize qw/vpx_satd avx2 sse2 neon/; From 5b2d3d5e4242f63e0f3cb673dac245b739c4423d Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 24 Feb 2023 19:25:39 -0800 Subject: [PATCH 0626/1000] tools_common,VpxInterface: fix interface fn ptr proto Use (void) to indicate an empty parameter list and match the declaration of vpx_codec_vp[89]_[cd]x. This fixes a cfi sanitizer error. Change-Id: I190f432eea4d1765afffd84c7458ec44d863f90c --- tools_common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools_common.h b/tools_common.h index b9cfb9cc85..3a266416e3 100644 --- a/tools_common.h +++ b/tools_common.h @@ -147,7 +147,7 @@ int read_yuv_frame(struct VpxInputContext *input_ctx, vpx_image_t *yuv_frame); typedef struct VpxInterface { const char *const name; const uint32_t fourcc; - vpx_codec_iface_t *(*const codec_interface)(); + vpx_codec_iface_t *(*const codec_interface)(void); } VpxInterface; int get_vpx_encoder_count(void); From b25cca8c2edba5fbc18448007da2624a25113f4d Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Sat, 25 Feb 2023 00:43:46 +0000 Subject: [PATCH 0627/1000] Optimize transpose_neon.h helper functions 1) Use vtrn[12]q_[su]64 in vpx_vtrnq_[su]64* helpers on AArch64 targets. This produces half as many TRN1/2 instructions compared to the number of MOVs that result from vcombine. 2) Use vpx_vtrnq_[su]64* helpers wherever applicable. 3) Refactor transpose_4x8_s16 to operate on 128-bit vectors. Change-Id: I9a8b1c1fe2a98a429e0c5f39def5eb2f65759127 --- vpx_dsp/arm/transpose_neon.h | 108 +++++++++++++++++------------------ 1 file changed, 53 insertions(+), 55 deletions(-) diff --git a/vpx_dsp/arm/transpose_neon.h b/vpx_dsp/arm/transpose_neon.h index 48292c6936..518278f303 100644 --- a/vpx_dsp/arm/transpose_neon.h +++ b/vpx_dsp/arm/transpose_neon.h @@ -39,26 +39,45 @@ static INLINE int16x8x2_t vpx_vtrnq_s64_to_s16(int32x4_t a0, int32x4_t a1) { static INLINE int32x4x2_t vpx_vtrnq_s64_to_s32(int32x4_t a0, int32x4_t a1) { int32x4x2_t b0; +#if defined(__aarch64__) + b0.val[0] = vreinterpretq_s32_s64( + vtrn1q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1))); + b0.val[1] = vreinterpretq_s32_s64( + vtrn2q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1))); +#else b0.val[0] = vcombine_s32(vget_low_s32(a0), vget_low_s32(a1)); b0.val[1] = vcombine_s32(vget_high_s32(a0), vget_high_s32(a1)); +#endif return b0; } static INLINE int64x2x2_t vpx_vtrnq_s64(int32x4_t a0, int32x4_t a1) { int64x2x2_t b0; +#if defined(__aarch64__) + b0.val[0] = vtrn1q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1)); + b0.val[1] = vtrn2q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1)); +#else b0.val[0] = vcombine_s64(vreinterpret_s64_s32(vget_low_s32(a0)), vreinterpret_s64_s32(vget_low_s32(a1))); b0.val[1] = vcombine_s64(vreinterpret_s64_s32(vget_high_s32(a0)), vreinterpret_s64_s32(vget_high_s32(a1))); +#endif return b0; } static INLINE uint8x16x2_t vpx_vtrnq_u64_to_u8(uint32x4_t a0, uint32x4_t a1) { uint8x16x2_t b0; +#if defined(__aarch64__) + b0.val[0] = vreinterpretq_u8_u64( + vtrn1q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1))); + b0.val[1] = vreinterpretq_u8_u64( + vtrn2q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1))); +#else b0.val[0] = vcombine_u8(vreinterpret_u8_u32(vget_low_u32(a0)), vreinterpret_u8_u32(vget_low_u32(a1))); b0.val[1] = vcombine_u8(vreinterpret_u8_u32(vget_high_u32(a0)), vreinterpret_u8_u32(vget_high_u32(a1))); +#endif return b0; } @@ -155,17 +174,13 @@ static INLINE void transpose_s16_4x4q(int16x8_t *a0, int16x8_t *a1) { // c0: 00 01 20 21 02 03 22 23 // c1: 10 11 30 31 12 13 32 33 - const int32x4_t c0 = - vcombine_s32(vget_low_s32(b0.val[0]), vget_low_s32(b0.val[1])); - const int32x4_t c1 = - vcombine_s32(vget_high_s32(b0.val[0]), vget_high_s32(b0.val[1])); + const int16x8x2_t c0 = vpx_vtrnq_s64_to_s16(b0.val[0], b0.val[1]); // Swap 16 bit elements resulting in: // d0.val[0]: 00 10 20 30 02 12 22 32 // d0.val[1]: 01 11 21 31 03 13 23 33 - const int16x8x2_t d0 = - vtrnq_s16(vreinterpretq_s16_s32(c0), vreinterpretq_s16_s32(c1)); + const int16x8x2_t d0 = vtrnq_s16(c0.val[0], c0.val[1]); *a0 = d0.val[0]; *a1 = d0.val[1]; @@ -186,17 +201,13 @@ static INLINE void transpose_u16_4x4q(uint16x8_t *a0, uint16x8_t *a1) { // c0: 00 01 20 21 02 03 22 23 // c1: 10 11 30 31 12 13 32 33 - const uint32x4_t c0 = - vcombine_u32(vget_low_u32(b0.val[0]), vget_low_u32(b0.val[1])); - const uint32x4_t c1 = - vcombine_u32(vget_high_u32(b0.val[0]), vget_high_u32(b0.val[1])); + const uint16x8x2_t c0 = vpx_vtrnq_u64_to_u16(b0.val[0], b0.val[1]); // Swap 16 bit elements resulting in: // d0.val[0]: 00 10 20 30 02 12 22 32 // d0.val[1]: 01 11 21 31 03 13 23 33 - const uint16x8x2_t d0 = - vtrnq_u16(vreinterpretq_u16_u32(c0), vreinterpretq_u16_u32(c1)); + const uint16x8x2_t d0 = vtrnq_u16(c0.val[0], c0.val[1]); *a0 = d0.val[0]; *a1 = d0.val[1]; @@ -295,7 +306,7 @@ static INLINE void transpose_s16_4x8(const int16x4_t a0, const int16x4_t a1, const int16x4_t a6, const int16x4_t a7, int16x8_t *const o0, int16x8_t *const o1, int16x8_t *const o2, int16x8_t *const o3) { - // Swap 16 bit elements. Goes from: + // Combine rows. Goes from: // a0: 00 01 02 03 // a1: 10 11 12 13 // a2: 20 21 22 23 @@ -305,53 +316,40 @@ static INLINE void transpose_s16_4x8(const int16x4_t a0, const int16x4_t a1, // a6: 60 61 62 63 // a7: 70 71 72 73 // to: - // b0.val[0]: 00 10 02 12 - // b0.val[1]: 01 11 03 13 - // b1.val[0]: 20 30 22 32 - // b1.val[1]: 21 31 23 33 - // b2.val[0]: 40 50 42 52 - // b2.val[1]: 41 51 43 53 - // b3.val[0]: 60 70 62 72 - // b3.val[1]: 61 71 63 73 + // b0: 00 01 02 03 40 41 42 43 + // b1: 10 11 12 13 50 51 52 53 + // b2: 20 21 22 23 60 61 62 63 + // b3: 30 31 32 33 70 71 72 73 + + const int16x8_t b0 = vcombine_s16(a0, a4); + const int16x8_t b1 = vcombine_s16(a1, a5); + const int16x8_t b2 = vcombine_s16(a2, a6); + const int16x8_t b3 = vcombine_s16(a3, a7); - const int16x4x2_t b0 = vtrn_s16(a0, a1); - const int16x4x2_t b1 = vtrn_s16(a2, a3); - const int16x4x2_t b2 = vtrn_s16(a4, a5); - const int16x4x2_t b3 = vtrn_s16(a6, a7); + // Swap 16 bit elements resulting in: + // c0.val[0]: 00 10 02 12 40 50 42 52 + // c0.val[1]: 01 11 03 13 41 51 43 53 + // c1.val[0]: 20 30 22 32 60 70 62 72 + // c1.val[1]: 21 31 23 33 61 71 63 73 + + const int16x8x2_t c0 = vtrnq_s16(b0, b1); + const int16x8x2_t c1 = vtrnq_s16(b2, b3); // Swap 32 bit elements resulting in: - // c0.val[0]: 00 10 20 30 - // c0.val[1]: 02 12 22 32 - // c1.val[0]: 01 11 21 31 - // c1.val[1]: 03 13 23 33 - // c2.val[0]: 40 50 60 70 - // c2.val[1]: 42 52 62 72 - // c3.val[0]: 41 51 61 71 - // c3.val[1]: 43 53 63 73 + // d0.val[0]: 00 10 20 30 40 50 60 70 + // d0.val[1]: 02 12 22 32 42 52 62 72 + // d1.val[0]: 01 11 21 31 41 51 61 71 + // d1.val[1]: 03 13 23 33 43 53 63 73 - const int32x2x2_t c0 = vtrn_s32(vreinterpret_s32_s16(b0.val[0]), - vreinterpret_s32_s16(b1.val[0])); - const int32x2x2_t c1 = vtrn_s32(vreinterpret_s32_s16(b0.val[1]), - vreinterpret_s32_s16(b1.val[1])); - const int32x2x2_t c2 = vtrn_s32(vreinterpret_s32_s16(b2.val[0]), - vreinterpret_s32_s16(b3.val[0])); - const int32x2x2_t c3 = vtrn_s32(vreinterpret_s32_s16(b2.val[1]), - vreinterpret_s32_s16(b3.val[1])); + const int32x4x2_t d0 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[0]), + vreinterpretq_s32_s16(c1.val[0])); + const int32x4x2_t d1 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[1]), + vreinterpretq_s32_s16(c1.val[1])); - // Swap 64 bit elements resulting in: - // o0: 00 10 20 30 40 50 60 70 - // o1: 01 11 21 31 41 51 61 71 - // o2: 02 12 22 32 42 52 62 72 - // o3: 03 13 23 33 43 53 63 73 - - *o0 = vcombine_s16(vreinterpret_s16_s32(c0.val[0]), - vreinterpret_s16_s32(c2.val[0])); - *o1 = vcombine_s16(vreinterpret_s16_s32(c1.val[0]), - vreinterpret_s16_s32(c3.val[0])); - *o2 = vcombine_s16(vreinterpret_s16_s32(c0.val[1]), - vreinterpret_s16_s32(c2.val[1])); - *o3 = vcombine_s16(vreinterpret_s16_s32(c1.val[1]), - vreinterpret_s16_s32(c3.val[1])); + *o0 = vreinterpretq_s16_s32(d0.val[0]); + *o1 = vreinterpretq_s16_s32(d1.val[0]); + *o2 = vreinterpretq_s16_s32(d0.val[1]); + *o3 = vreinterpretq_s16_s32(d1.val[1]); } static INLINE void transpose_s32_4x8(int32x4_t *const a0, int32x4_t *const a1, From ccc101e6bb63c2af340b993c57fad0f3810aee27 Mon Sep 17 00:00:00 2001 From: Salome Thirot Date: Fri, 24 Feb 2023 18:05:43 +0000 Subject: [PATCH 0628/1000] Add Neon implementations of standard bitdepth MSE functions Currently only vpx_mse16x16 has a Neon implementation. This patch adds optimized Armv8.0 and Armv8.4 dot-product paths for all block sizes: 8x8, 8x16, 16x8 and 16x16. Add the corresponding tests as well. Change-Id: Ib0357fdcdeb05860385fec89633386e34395e260 --- test/variance_test.cc | 7 +- vpx_dsp/arm/variance_neon.c | 182 +++++++++++++++++++++++------------ vpx_dsp/vpx_dsp_rtcd_defs.pl | 6 +- 3 files changed, 127 insertions(+), 68 deletions(-) diff --git a/test/variance_test.cc b/test/variance_test.cc index 33f09209f4..a68cfad516 100644 --- a/test/variance_test.cc +++ b/test/variance_test.cc @@ -773,6 +773,7 @@ TEST_P(VpxSseTest, RefSse) { RefTestSse(); } TEST_P(VpxSseTest, MaxSse) { MaxTestSse(); } TEST_P(VpxMseTest, RefMse) { RefTestMse(); } TEST_P(VpxMseTest, MaxMse) { MaxTestMse(); } +TEST_P(VpxMseTest, DISABLED_Speed) { SpeedTest(); } TEST_P(VpxVarianceTest, Zero) { ZeroTest(); } TEST_P(VpxVarianceTest, Ref) { RefTest(); } TEST_P(VpxVarianceTest, RefStride) { RefStrideTest(); } @@ -1450,8 +1451,10 @@ INSTANTIATE_TEST_SUITE_P(NEON, VpxSseTest, &vpx_get4x4sse_cs_neon))); INSTANTIATE_TEST_SUITE_P(NEON, VpxMseTest, - ::testing::Values(MseParams(4, 4, - &vpx_mse16x16_neon))); + ::testing::Values(MseParams(4, 4, &vpx_mse16x16_neon), + MseParams(4, 3, &vpx_mse16x8_neon), + MseParams(3, 4, &vpx_mse8x16_neon), + MseParams(3, 3, &vpx_mse8x8_neon))); INSTANTIATE_TEST_SUITE_P( NEON, VpxVarianceTest, diff --git a/vpx_dsp/arm/variance_neon.c b/vpx_dsp/arm/variance_neon.c index 3ccc4e807b..feff980c93 100644 --- a/vpx_dsp/arm/variance_neon.c +++ b/vpx_dsp/arm/variance_neon.c @@ -371,32 +371,66 @@ VARIANCE_WXH_NEON(64, 64, 12) #if defined(__ARM_FEATURE_DOTPROD) -unsigned int vpx_mse16x16_neon(const unsigned char *src_ptr, int src_stride, - const unsigned char *ref_ptr, int ref_stride, - unsigned int *sse) { - int i; - uint8x16_t a[2], b[2], abs_diff[2]; - uint32x4_t sse_vec[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; - - for (i = 0; i < 8; i++) { - a[0] = vld1q_u8(src_ptr); +static INLINE unsigned int vpx_mse8xh_neon(const unsigned char *src_ptr, + int src_stride, + const unsigned char *ref_ptr, + int ref_stride, int h, + unsigned int *sse) { + uint32x2_t sse_u32[2] = { vdup_n_u32(0), vdup_n_u32(0) }; + + int i = h / 2; + do { + uint8x8_t s0, s1, r0, r1, diff0, diff1; + + s0 = vld1_u8(src_ptr); src_ptr += src_stride; - a[1] = vld1q_u8(src_ptr); + s1 = vld1_u8(src_ptr); src_ptr += src_stride; - b[0] = vld1q_u8(ref_ptr); + r0 = vld1_u8(ref_ptr); ref_ptr += ref_stride; - b[1] = vld1q_u8(ref_ptr); + r1 = vld1_u8(ref_ptr); ref_ptr += ref_stride; - abs_diff[0] = vabdq_u8(a[0], b[0]); - abs_diff[1] = vabdq_u8(a[1], b[1]); + diff0 = vabd_u8(s0, r0); + diff1 = vabd_u8(s1, r1); - sse_vec[0] = vdotq_u32(sse_vec[0], abs_diff[0], abs_diff[0]); - sse_vec[1] = vdotq_u32(sse_vec[1], abs_diff[1], abs_diff[1]); - } + sse_u32[0] = vdot_u32(sse_u32[0], diff0, diff0); + sse_u32[1] = vdot_u32(sse_u32[1], diff1, diff1); + } while (--i != 0); - *sse = horizontal_add_uint32x4(vaddq_u32(sse_vec[0], sse_vec[1])); - return horizontal_add_uint32x4(vaddq_u32(sse_vec[0], sse_vec[1])); + *sse = horizontal_add_uint32x2(vadd_u32(sse_u32[0], sse_u32[1])); + return *sse; +} + +static INLINE unsigned int vpx_mse16xh_neon(const unsigned char *src_ptr, + int src_stride, + const unsigned char *ref_ptr, + int ref_stride, int h, + unsigned int *sse) { + uint32x4_t sse_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = h / 2; + do { + uint8x16_t s0, s1, r0, r1, diff0, diff1; + + s0 = vld1q_u8(src_ptr); + src_ptr += src_stride; + s1 = vld1q_u8(src_ptr); + src_ptr += src_stride; + r0 = vld1q_u8(ref_ptr); + ref_ptr += ref_stride; + r1 = vld1q_u8(ref_ptr); + ref_ptr += ref_stride; + + diff0 = vabdq_u8(s0, r0); + diff1 = vabdq_u8(s1, r1); + + sse_u32[0] = vdotq_u32(sse_u32[0], diff0, diff0); + sse_u32[1] = vdotq_u32(sse_u32[1], diff1, diff1); + } while (--i != 0); + + *sse = horizontal_add_uint32x4(vaddq_u32(sse_u32[0], sse_u32[1])); + return *sse; } unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int src_stride, @@ -435,58 +469,67 @@ unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int src_stride, #else // !defined(__ARM_FEATURE_DOTPROD) -unsigned int vpx_mse16x16_neon(const unsigned char *src_ptr, int src_stride, - const unsigned char *ref_ptr, int ref_stride, - unsigned int *sse) { - int i; - uint8x16_t a[2], b[2]; - int16x4_t diff_lo[4], diff_hi[4]; - uint16x8_t diff[4]; - int32x4_t sse_vec[4] = { vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0), - vdupq_n_s32(0) }; +static INLINE unsigned int vpx_mse8xh_neon(const unsigned char *src_ptr, + int src_stride, + const unsigned char *ref_ptr, + int ref_stride, int h, + unsigned int *sse) { + uint32x4_t sse_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; - for (i = 0; i < 8; i++) { - a[0] = vld1q_u8(src_ptr); + int i = h / 2; + do { + uint8x8_t s0, s1, r0, r1, diff0, diff1; + uint16x8_t sse0, sse1; + + s0 = vld1_u8(src_ptr); src_ptr += src_stride; - a[1] = vld1q_u8(src_ptr); + s1 = vld1_u8(src_ptr); src_ptr += src_stride; - b[0] = vld1q_u8(ref_ptr); + r0 = vld1_u8(ref_ptr); ref_ptr += ref_stride; - b[1] = vld1q_u8(ref_ptr); + r1 = vld1_u8(ref_ptr); ref_ptr += ref_stride; - diff[0] = vsubl_u8(vget_low_u8(a[0]), vget_low_u8(b[0])); - diff[1] = vsubl_u8(vget_high_u8(a[0]), vget_high_u8(b[0])); - diff[2] = vsubl_u8(vget_low_u8(a[1]), vget_low_u8(b[1])); - diff[3] = vsubl_u8(vget_high_u8(a[1]), vget_high_u8(b[1])); - - diff_lo[0] = vreinterpret_s16_u16(vget_low_u16(diff[0])); - diff_lo[1] = vreinterpret_s16_u16(vget_low_u16(diff[1])); - sse_vec[0] = vmlal_s16(sse_vec[0], diff_lo[0], diff_lo[0]); - sse_vec[1] = vmlal_s16(sse_vec[1], diff_lo[1], diff_lo[1]); - - diff_lo[2] = vreinterpret_s16_u16(vget_low_u16(diff[2])); - diff_lo[3] = vreinterpret_s16_u16(vget_low_u16(diff[3])); - sse_vec[2] = vmlal_s16(sse_vec[2], diff_lo[2], diff_lo[2]); - sse_vec[3] = vmlal_s16(sse_vec[3], diff_lo[3], diff_lo[3]); - - diff_hi[0] = vreinterpret_s16_u16(vget_high_u16(diff[0])); - diff_hi[1] = vreinterpret_s16_u16(vget_high_u16(diff[1])); - sse_vec[0] = vmlal_s16(sse_vec[0], diff_hi[0], diff_hi[0]); - sse_vec[1] = vmlal_s16(sse_vec[1], diff_hi[1], diff_hi[1]); - - diff_hi[2] = vreinterpret_s16_u16(vget_high_u16(diff[2])); - diff_hi[3] = vreinterpret_s16_u16(vget_high_u16(diff[3])); - sse_vec[2] = vmlal_s16(sse_vec[2], diff_hi[2], diff_hi[2]); - sse_vec[3] = vmlal_s16(sse_vec[3], diff_hi[3], diff_hi[3]); - } + diff0 = vabd_u8(s0, r0); + diff1 = vabd_u8(s1, r1); + + sse0 = vmull_u8(diff0, diff0); + sse_u32[0] = vpadalq_u16(sse_u32[0], sse0); + sse1 = vmull_u8(diff1, diff1); + sse_u32[1] = vpadalq_u16(sse_u32[1], sse1); + } while (--i != 0); + + *sse = horizontal_add_uint32x4(vaddq_u32(sse_u32[0], sse_u32[1])); + return *sse; +} + +static INLINE unsigned int vpx_mse16xh_neon(const unsigned char *src_ptr, + int src_stride, + const unsigned char *ref_ptr, + int ref_stride, int h, + unsigned int *sse) { + uint32x4_t sse_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = h; + do { + uint8x16_t s, r, diff; + uint16x8_t sse0, sse1; - sse_vec[0] = vaddq_s32(sse_vec[0], sse_vec[1]); - sse_vec[2] = vaddq_s32(sse_vec[2], sse_vec[3]); - sse_vec[0] = vaddq_s32(sse_vec[0], sse_vec[2]); + s = vld1q_u8(src_ptr); + src_ptr += src_stride; + r = vld1q_u8(ref_ptr); + ref_ptr += ref_stride; - *sse = horizontal_add_uint32x4(vreinterpretq_u32_s32(sse_vec[0])); - return horizontal_add_uint32x4(vreinterpretq_u32_s32(sse_vec[0])); + diff = vabdq_u8(s, r); + + sse0 = vmull_u8(vget_low_u8(diff), vget_low_u8(diff)); + sse_u32[0] = vpadalq_u16(sse_u32[0], sse0); + sse1 = vmull_u8(vget_high_u8(diff), vget_high_u8(diff)); + sse_u32[1] = vpadalq_u16(sse_u32[1], sse1); + } while (--i != 0); + + *sse = horizontal_add_uint32x4(vaddq_u32(sse_u32[0], sse_u32[1])); + return *sse; } unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int src_stride, @@ -531,3 +574,16 @@ unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int src_stride, } #endif // defined(__ARM_FEATURE_DOTPROD) + +#define VPX_MSE_WXH_NEON(w, h) \ + unsigned int vpx_mse##w##x##h##_neon( \ + const unsigned char *src_ptr, int src_stride, \ + const unsigned char *ref_ptr, int ref_stride, unsigned int *sse) { \ + return vpx_mse##w##xh_neon(src_ptr, src_stride, ref_ptr, ref_stride, h, \ + sse); \ + } + +VPX_MSE_WXH_NEON(8, 8) +VPX_MSE_WXH_NEON(8, 16) +VPX_MSE_WXH_NEON(16, 8) +VPX_MSE_WXH_NEON(16, 16) diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index eef72249e0..0ad3cbe6b2 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -1141,13 +1141,13 @@ () specialize qw/vpx_mse16x16 sse2 avx2 neon msa mmi vsx lsx/; add_proto qw/unsigned int vpx_mse16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_mse16x8 sse2 avx2 msa mmi vsx/; + specialize qw/vpx_mse16x8 sse2 avx2 neon msa mmi vsx/; add_proto qw/unsigned int vpx_mse8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_mse8x16 sse2 msa mmi vsx/; + specialize qw/vpx_mse8x16 sse2 neon msa mmi vsx/; add_proto qw/unsigned int vpx_mse8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_mse8x8 sse2 msa mmi vsx/; + specialize qw/vpx_mse8x8 sse2 neon msa mmi vsx/; add_proto qw/unsigned int vpx_get_mb_ss/, "const int16_t *"; specialize qw/vpx_get_mb_ss sse2 msa vsx/; From 112945ac7b5784c05912d1955afd2c245ce5c51d Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 27 Feb 2023 13:48:47 -0800 Subject: [PATCH 0629/1000] tools_common,VpxInterface: remove unneeded const Change-Id: Ic309aab2ff1750bdbcc36e8aafe05d52930ba694 --- tools_common.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools_common.h b/tools_common.h index 3a266416e3..9850907c15 100644 --- a/tools_common.h +++ b/tools_common.h @@ -145,9 +145,9 @@ VPX_NO_RETURN void usage_exit(void); int read_yuv_frame(struct VpxInputContext *input_ctx, vpx_image_t *yuv_frame); typedef struct VpxInterface { - const char *const name; - const uint32_t fourcc; - vpx_codec_iface_t *(*const codec_interface)(void); + const char *name; + uint32_t fourcc; + vpx_codec_iface_t *(*codec_interface)(void); } VpxInterface; int get_vpx_encoder_count(void); From 848f6e733789c627b6606baf1c85e32be997e36f Mon Sep 17 00:00:00 2001 From: Johann Date: Sat, 5 Nov 2022 09:53:07 +0900 Subject: [PATCH 0630/1000] quantize: simplify 32x32_b args Now that all the implementations of the 32x32 quantize are in intrinsics we can reference struct members directly. Saves pushing them to the stack. n_coeffs is not used at all for this function. Change-Id: I2104fea3fa20c455087e21b347d6abd7ea1f3e1e --- test/vp9_quantize_test.cc | 285 +++++++++++++++++++++-------------- vp9/encoder/vp9_block.h | 1 + vp9/encoder/vp9_encodemb.c | 6 +- vpx_dsp/arm/quantize_neon.c | 17 +-- vpx_dsp/quantize.c | 17 ++- vpx_dsp/vpx_dsp_rtcd_defs.pl | 5 +- vpx_dsp/x86/quantize_avx.c | 30 +--- vpx_dsp/x86/quantize_avx2.c | 15 +- vpx_dsp/x86/quantize_sse2.h | 28 ++++ vpx_dsp/x86/quantize_ssse3.c | 35 +---- 10 files changed, 238 insertions(+), 201 deletions(-) diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc index 587cec6923..ecb6116f0c 100644 --- a/test/vp9_quantize_test.cc +++ b/test/vp9_quantize_test.cc @@ -26,6 +26,7 @@ #include "test/util.h" #include "vp9/common/vp9_entropy.h" #include "vp9/common/vp9_scan.h" +#include "vp9/encoder/vp9_block.h" #include "vpx/vpx_codec.h" #include "vpx/vpx_integer.h" #include "vpx_ports/msvc.h" @@ -38,8 +39,7 @@ namespace { const int number_of_iterations = 100; typedef void (*QuantizeFunc)(const tran_low_t *coeff, intptr_t count, - const int16_t *zbin, const int16_t *round, - const int16_t *quant, const int16_t *quant_shift, + const macroblock_plane *const mb_plane, tran_low_t *qcoeff, tran_low_t *dqcoeff, const int16_t *dequant, uint16_t *eob, const int16_t *scan, const int16_t *iscan); @@ -47,6 +47,41 @@ typedef std::tuple QuantizeParam; +// Wrapper which takes a macroblock_plane. +typedef void (*QuantizeBaseFunc)(const tran_low_t *coeff, intptr_t count, + const int16_t *zbin, const int16_t *round, + const int16_t *quant, + const int16_t *quant_shift, tran_low_t *qcoeff, + tran_low_t *dqcoeff, const int16_t *dequant, + uint16_t *eob, const int16_t *scan, + const int16_t *iscan); + +template +void QuantWrapper(const tran_low_t *coeff, intptr_t count, + const macroblock_plane *const mb_plane, tran_low_t *qcoeff, + tran_low_t *dqcoeff, const int16_t *dequant, uint16_t *eob, + const int16_t *scan, const int16_t *iscan) { + fn(coeff, count, mb_plane->zbin, mb_plane->round, mb_plane->quant, + mb_plane->quant_shift, qcoeff, dqcoeff, dequant, eob, scan, iscan); +} + +// Wrapper for 32x32 version which does not use count +typedef void (*Quantize32x32Func)(const tran_low_t *coeff, + const macroblock_plane *const mb_plane, + tran_low_t *qcoeff, tran_low_t *dqcoeff, + const int16_t *dequant, uint16_t *eob, + const int16_t *scan, const int16_t *iscan); + +template +void Quant32x32Wrapper(const tran_low_t *coeff, intptr_t count, + const macroblock_plane *const mb_plane, + tran_low_t *qcoeff, tran_low_t *dqcoeff, + const int16_t *dequant, uint16_t *eob, + const int16_t *scan, const int16_t *iscan) { + (void)count; + fn(coeff, mb_plane, qcoeff, dqcoeff, dequant, eob, scan, iscan); +} + // Wrapper for FP version which does not use zbin or quant_shift. typedef void (*QuantizeFPFunc)(const tran_low_t *coeff, intptr_t count, const int16_t *round, const int16_t *quant, @@ -56,15 +91,11 @@ typedef void (*QuantizeFPFunc)(const tran_low_t *coeff, intptr_t count, template void QuantFPWrapper(const tran_low_t *coeff, intptr_t count, - const int16_t *zbin, const int16_t *round, - const int16_t *quant, const int16_t *quant_shift, - tran_low_t *qcoeff, tran_low_t *dqcoeff, - const int16_t *dequant, uint16_t *eob, const int16_t *scan, - const int16_t *iscan) { - (void)zbin; - (void)quant_shift; - - fn(coeff, count, round, quant, qcoeff, dqcoeff, dequant, eob, scan, iscan); + const macroblock_plane *const mb_plane, tran_low_t *qcoeff, + tran_low_t *dqcoeff, const int16_t *dequant, uint16_t *eob, + const int16_t *scan, const int16_t *iscan) { + fn(coeff, count, mb_plane->round_fp, mb_plane->quant_fp, qcoeff, dqcoeff, + dequant, eob, scan, iscan); } void GenerateHelperArrays(ACMRandom *rnd, int16_t *zbin, int16_t *round, @@ -119,17 +150,16 @@ class VP9QuantizeBase : public AbstractBench { #else max_value_ = (1 << bit_depth_) - 1; #endif - zbin_ptr_ = + zbin_ptr_ = mb_plane_.zbin = reinterpret_cast(vpx_memalign(16, 8 * sizeof(*zbin_ptr_))); - round_fp_ptr_ = reinterpret_cast( - vpx_memalign(16, 8 * sizeof(*round_fp_ptr_))); - quant_fp_ptr_ = reinterpret_cast( + round_fp_ptr_ = mb_plane_.round_fp; + quant_fp_ptr_ = mb_plane_.quant_fp = reinterpret_cast( vpx_memalign(16, 8 * sizeof(*quant_fp_ptr_))); - round_ptr_ = + round_ptr_ = mb_plane_.round = reinterpret_cast(vpx_memalign(16, 8 * sizeof(*round_ptr_))); - quant_ptr_ = + quant_ptr_ = mb_plane_.quant = reinterpret_cast(vpx_memalign(16, 8 * sizeof(*quant_ptr_))); - quant_shift_ptr_ = reinterpret_cast( + quant_shift_ptr_ = mb_plane_.quant_shift = reinterpret_cast( vpx_memalign(16, 8 * sizeof(*quant_shift_ptr_))); dequant_ptr_ = reinterpret_cast( vpx_memalign(16, 8 * sizeof(*dequant_ptr_))); @@ -140,7 +170,6 @@ class VP9QuantizeBase : public AbstractBench { ~VP9QuantizeBase() { vpx_free(zbin_ptr_); - vpx_free(round_fp_ptr_); vpx_free(quant_fp_ptr_); vpx_free(round_ptr_); vpx_free(quant_ptr_); @@ -157,6 +186,7 @@ class VP9QuantizeBase : public AbstractBench { } protected: + macroblock_plane mb_plane_; int16_t *zbin_ptr_; int16_t *round_fp_ptr_; int16_t *quant_fp_ptr_; @@ -193,10 +223,9 @@ class VP9QuantizeTest : public VP9QuantizeBase, }; void VP9QuantizeTest::Run() { - quantize_op_(coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_, - quant_shift_ptr_, qcoeff_.TopLeftPixel(), - dqcoeff_.TopLeftPixel(), dequant_ptr_, &eob_, scan_->scan, - scan_->iscan); + quantize_op_(coeff_.TopLeftPixel(), count_, &mb_plane_, + qcoeff_.TopLeftPixel(), dqcoeff_.TopLeftPixel(), dequant_ptr_, + &eob_, scan_->scan, scan_->iscan); } void VP9QuantizeTest::Speed(bool is_median) { @@ -266,8 +295,8 @@ void VP9QuantizeTest::Speed(bool is_median) { vpx_usec_timer_start(&timer); for (int n = 0; n < kNumTests; ++n) { - ref_quantize_op_(coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, - q_ptr_, quant_shift_ptr_, ref_qcoeff.TopLeftPixel(), + ref_quantize_op_(coeff_.TopLeftPixel(), count_, &mb_plane_, + ref_qcoeff.TopLeftPixel(), ref_dqcoeff.TopLeftPixel(), dequant_ptr_, &ref_eob, scan_->scan, scan_->iscan); } @@ -275,10 +304,9 @@ void VP9QuantizeTest::Speed(bool is_median) { vpx_usec_timer_start(&simd_timer); for (int n = 0; n < kNumTests; ++n) { - quantize_op_(coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_, - quant_shift_ptr_, qcoeff_.TopLeftPixel(), - dqcoeff_.TopLeftPixel(), dequant_ptr_, &eob_, - scan_->scan, scan_->iscan); + quantize_op_(coeff_.TopLeftPixel(), count_, &mb_plane_, + qcoeff_.TopLeftPixel(), dqcoeff_.TopLeftPixel(), + dequant_ptr_, &eob_, scan_->scan, scan_->iscan); } vpx_usec_timer_mark(&simd_timer); @@ -417,15 +445,14 @@ TEST_P(VP9QuantizeTest, OperationCheck) { GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_, quant_shift_ptr_, dequant_ptr_, round_fp_ptr_, quant_fp_ptr_); - ref_quantize_op_(coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_, - quant_shift_ptr_, ref_qcoeff.TopLeftPixel(), - ref_dqcoeff.TopLeftPixel(), dequant_ptr_, &ref_eob, - scan_->scan, scan_->iscan); + ref_quantize_op_(coeff_.TopLeftPixel(), count_, &mb_plane_, + ref_qcoeff.TopLeftPixel(), ref_dqcoeff.TopLeftPixel(), + dequant_ptr_, &ref_eob, scan_->scan, scan_->iscan); - ASM_REGISTER_STATE_CHECK(quantize_op_( - coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_, - quant_shift_ptr_, qcoeff_.TopLeftPixel(), dqcoeff_.TopLeftPixel(), - dequant_ptr_, &eob_, scan_->scan, scan_->iscan)); + ASM_REGISTER_STATE_CHECK(quantize_op_(coeff_.TopLeftPixel(), count_, + &mb_plane_, qcoeff_.TopLeftPixel(), + dqcoeff_.TopLeftPixel(), dequant_ptr_, + &eob_, scan_->scan, scan_->iscan)); EXPECT_TRUE(qcoeff_.CheckValues(ref_qcoeff)); EXPECT_TRUE(dqcoeff_.CheckValues(ref_dqcoeff)); @@ -475,15 +502,14 @@ TEST_P(VP9QuantizeTest, EOBCheck) { GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_, quant_shift_ptr_, dequant_ptr_, round_fp_ptr_, quant_fp_ptr_); - ref_quantize_op_(coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_, - quant_shift_ptr_, ref_qcoeff.TopLeftPixel(), - ref_dqcoeff.TopLeftPixel(), dequant_ptr_, &ref_eob, - scan_->scan, scan_->iscan); + ref_quantize_op_(coeff_.TopLeftPixel(), count_, &mb_plane_, + ref_qcoeff.TopLeftPixel(), ref_dqcoeff.TopLeftPixel(), + dequant_ptr_, &ref_eob, scan_->scan, scan_->iscan); - ASM_REGISTER_STATE_CHECK(quantize_op_( - coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_, - quant_shift_ptr_, qcoeff_.TopLeftPixel(), dqcoeff_.TopLeftPixel(), - dequant_ptr_, &eob_, scan_->scan, scan_->iscan)); + ASM_REGISTER_STATE_CHECK(quantize_op_(coeff_.TopLeftPixel(), count_, + &mb_plane_, qcoeff_.TopLeftPixel(), + dqcoeff_.TopLeftPixel(), dequant_ptr_, + &eob_, scan_->scan, scan_->iscan)); EXPECT_TRUE(qcoeff_.CheckValues(ref_qcoeff)); EXPECT_TRUE(dqcoeff_.CheckValues(ref_dqcoeff)); @@ -510,28 +536,35 @@ using std::make_tuple; INSTANTIATE_TEST_SUITE_P( SSE2, VP9QuantizeTest, ::testing::Values( - make_tuple(&vpx_quantize_b_sse2, &vpx_quantize_b_c, VPX_BITS_8, 16, - false), + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_8, 16, false), make_tuple(&QuantFPWrapper, &QuantFPWrapper, VPX_BITS_8, 16, true), - make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c, - VPX_BITS_8, 16, false), - make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c, - VPX_BITS_10, 16, false), - make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c, - VPX_BITS_12, 16, false), - make_tuple(&vpx_highbd_quantize_b_32x32_sse2, - &vpx_highbd_quantize_b_32x32_c, VPX_BITS_8, 32, false), - make_tuple(&vpx_highbd_quantize_b_32x32_sse2, - &vpx_highbd_quantize_b_32x32_c, VPX_BITS_10, 32, false), - make_tuple(&vpx_highbd_quantize_b_32x32_sse2, - &vpx_highbd_quantize_b_32x32_c, VPX_BITS_12, 32, false))); + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_8, 16, + false), + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_10, 16, + false), + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_12, 16, + false), + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_8, 32, + false), + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_10, + 32, false), + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_12, + 32, false))); #else INSTANTIATE_TEST_SUITE_P( SSE2, VP9QuantizeTest, - ::testing::Values(make_tuple(&vpx_quantize_b_sse2, &vpx_quantize_b_c, - VPX_BITS_8, 16, false), + ::testing::Values(make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_8, + 16, false), make_tuple(&QuantFPWrapper, &QuantFPWrapper, VPX_BITS_8, 16, true))); @@ -541,11 +574,12 @@ INSTANTIATE_TEST_SUITE_P( #if HAVE_SSSE3 INSTANTIATE_TEST_SUITE_P( SSSE3, VP9QuantizeTest, - ::testing::Values(make_tuple(&vpx_quantize_b_ssse3, &vpx_quantize_b_c, - VPX_BITS_8, 16, false), - make_tuple(&vpx_quantize_b_32x32_ssse3, - &vpx_quantize_b_32x32_c, VPX_BITS_8, 32, - false), + ::testing::Values(make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_8, + 16, false), + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, + VPX_BITS_8, 32, false), make_tuple(&QuantFPWrapper, &QuantFPWrapper, VPX_BITS_8, 16, true), @@ -555,13 +589,14 @@ INSTANTIATE_TEST_SUITE_P( #endif // HAVE_SSSE3 #if HAVE_AVX -INSTANTIATE_TEST_SUITE_P(AVX, VP9QuantizeTest, - ::testing::Values(make_tuple(&vpx_quantize_b_avx, - &vpx_quantize_b_c, - VPX_BITS_8, 16, false), - make_tuple(&vpx_quantize_b_32x32_avx, - &vpx_quantize_b_32x32_c, - VPX_BITS_8, 32, false))); +INSTANTIATE_TEST_SUITE_P( + AVX, VP9QuantizeTest, + ::testing::Values(make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_8, + 16, false), + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, + VPX_BITS_8, 32, false))); #endif // HAVE_AVX #if VPX_ARCH_X86_64 && HAVE_AVX2 @@ -577,22 +612,29 @@ INSTANTIATE_TEST_SUITE_P( make_tuple(&QuantFPWrapper, &QuantFPWrapper, VPX_BITS_12, 32, true), - make_tuple(&vpx_quantize_b_avx2, &vpx_quantize_b_c, VPX_BITS_8, 16, + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_8, 16, false), + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_8, 16, false), - make_tuple(&vpx_highbd_quantize_b_avx2, &vpx_highbd_quantize_b_c, - VPX_BITS_8, 16, false), - make_tuple(&vpx_highbd_quantize_b_avx2, &vpx_highbd_quantize_b_c, - VPX_BITS_10, 16, false), - make_tuple(&vpx_highbd_quantize_b_avx2, &vpx_highbd_quantize_b_c, - VPX_BITS_12, 16, false), - make_tuple(&vpx_quantize_b_32x32_avx2, &vpx_quantize_b_32x32_c, - VPX_BITS_8, 32, false), - make_tuple(&vpx_highbd_quantize_b_32x32_avx2, - &vpx_highbd_quantize_b_32x32_c, VPX_BITS_8, 32, false), - make_tuple(&vpx_highbd_quantize_b_32x32_avx2, - &vpx_highbd_quantize_b_32x32_c, VPX_BITS_10, 32, false), - make_tuple(&vpx_highbd_quantize_b_32x32_avx2, - &vpx_highbd_quantize_b_32x32_c, VPX_BITS_12, 32, false))); + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_10, 16, + false), + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_12, 16, + false), + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, VPX_BITS_8, 32, + false), + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_8, 32, + false), + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_10, + 32, false), + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_12, + 32, false))); #else INSTANTIATE_TEST_SUITE_P( AVX2, VP9QuantizeTest, @@ -602,11 +644,12 @@ INSTANTIATE_TEST_SUITE_P( make_tuple(&QuantFPWrapper, &QuantFPWrapper, VPX_BITS_8, 32, true), - make_tuple(&vpx_quantize_b_avx2, &vpx_quantize_b_c, - VPX_BITS_8, 16, false), - make_tuple(&vpx_quantize_b_32x32_avx2, - &vpx_quantize_b_32x32_c, VPX_BITS_8, 32, - false))); + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_8, + 16, false), + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, + VPX_BITS_8, 32, false))); #endif // CONFIG_VP9_HIGHBITDEPTH #endif // HAVE_AVX2 @@ -615,22 +658,29 @@ INSTANTIATE_TEST_SUITE_P( INSTANTIATE_TEST_SUITE_P( NEON, VP9QuantizeTest, ::testing::Values( - make_tuple(&vpx_quantize_b_neon, &vpx_quantize_b_c, VPX_BITS_8, 16, + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_8, 16, false), + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_8, 16, + false), + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_10, 16, + false), + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_12, 16, false), - make_tuple(&vpx_highbd_quantize_b_neon, &vpx_highbd_quantize_b_c, - VPX_BITS_8, 16, false), - make_tuple(&vpx_highbd_quantize_b_neon, &vpx_highbd_quantize_b_c, - VPX_BITS_10, 16, false), - make_tuple(&vpx_highbd_quantize_b_neon, &vpx_highbd_quantize_b_c, - VPX_BITS_12, 16, false), - make_tuple(&vpx_quantize_b_32x32_neon, &vpx_quantize_b_32x32_c, - VPX_BITS_8, 32, false), - make_tuple(&vpx_highbd_quantize_b_32x32_neon, - &vpx_highbd_quantize_b_32x32_c, VPX_BITS_8, 32, false), - make_tuple(&vpx_highbd_quantize_b_32x32_neon, - &vpx_highbd_quantize_b_32x32_c, VPX_BITS_10, 32, false), - make_tuple(&vpx_highbd_quantize_b_32x32_neon, - &vpx_highbd_quantize_b_32x32_c, VPX_BITS_12, 32, false), + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, VPX_BITS_8, 32, + false), + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_8, 32, + false), + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_10, + 32, false), + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_12, + 32, false), make_tuple(&QuantFPWrapper, &QuantFPWrapper, VPX_BITS_8, 16, true), make_tuple(&QuantFPWrapper, @@ -639,11 +689,12 @@ INSTANTIATE_TEST_SUITE_P( #else INSTANTIATE_TEST_SUITE_P( NEON, VP9QuantizeTest, - ::testing::Values(make_tuple(&vpx_quantize_b_neon, &vpx_quantize_b_c, - VPX_BITS_8, 16, false), - make_tuple(&vpx_quantize_b_32x32_neon, - &vpx_quantize_b_32x32_c, VPX_BITS_8, 32, - false), + ::testing::Values(make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_8, + 16, false), + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, + VPX_BITS_8, 32, false), make_tuple(&QuantFPWrapper, &QuantFPWrapper, VPX_BITS_8, 16, true), @@ -683,9 +734,11 @@ INSTANTIATE_TEST_SUITE_P(LSX, VP9QuantizeTest, INSTANTIATE_TEST_SUITE_P( DISABLED_C, VP9QuantizeTest, ::testing::Values( - make_tuple(&vpx_quantize_b_c, &vpx_quantize_b_c, VPX_BITS_8, 16, false), - make_tuple(&vpx_quantize_b_32x32_c, &vpx_quantize_b_32x32_c, VPX_BITS_8, - 32, false), + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_8, 16, false), + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, VPX_BITS_8, 32, + false), make_tuple(&QuantFPWrapper, &QuantFPWrapper, VPX_BITS_8, 16, true), make_tuple(&QuantFPWrapper, diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h index 1786952911..fc27a0fbda 100644 --- a/vp9/encoder/vp9_block.h +++ b/vp9/encoder/vp9_block.h @@ -13,6 +13,7 @@ #include "vpx_util/vpx_thread.h" +#include "vp9/common/vp9_blockd.h" #include "vp9/common/vp9_entropymv.h" #include "vp9/common/vp9_entropy.h" diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c index fa222f9dcf..4910dc20f5 100644 --- a/vp9/encoder/vp9_encodemb.c +++ b/vp9/encoder/vp9_encodemb.c @@ -542,8 +542,7 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col, switch (tx_size) { case TX_32X32: fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); - vpx_quantize_b_32x32(coeff, 1024, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, + vpx_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); break; case TX_16X16: @@ -948,8 +947,7 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, vpx_subtract_block(32, 32, src_diff, diff_stride, src, src_stride, dst, dst_stride); fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); - vpx_quantize_b_32x32(coeff, 1024, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, + vpx_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); } if (args->enable_coeff_opt && !x->skip_recode) { diff --git a/vpx_dsp/arm/quantize_neon.c b/vpx_dsp/arm/quantize_neon.c index 9c227d560f..e81738a7bb 100644 --- a/vpx_dsp/arm/quantize_neon.c +++ b/vpx_dsp/arm/quantize_neon.c @@ -14,6 +14,7 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/arm/mem_neon.h" +#include "vp9/encoder/vp9_block.h" static INLINE void calculate_dqcoeff_and_store(const int16x8_t qcoeff, const int16x8_t dequant, @@ -213,11 +214,8 @@ quantize_b_32x32_neon(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, // Main difference is that zbin values are halved before comparison and dqcoeff // values are divided by 2. zbin is rounded but dqcoeff is not. -void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, - const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, +void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { @@ -226,10 +224,10 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int i; // Only the first element of each vector is DC. - int16x8_t zbin = vrshrq_n_s16(vld1q_s16(zbin_ptr), 1); - int16x8_t round = vrshrq_n_s16(vld1q_s16(round_ptr), 1); - int16x8_t quant = vld1q_s16(quant_ptr); - int16x8_t quant_shift = vld1q_s16(quant_shift_ptr); + int16x8_t zbin = vrshrq_n_s16(vld1q_s16(mb_plane->zbin), 1); + int16x8_t round = vrshrq_n_s16(vld1q_s16(mb_plane->round), 1); + int16x8_t quant = vld1q_s16(mb_plane->quant); + int16x8_t quant_shift = vld1q_s16(mb_plane->quant_shift); int16x8_t dequant = vld1q_s16(dequant_ptr); // Process first 8 values which include a dc component. @@ -289,6 +287,5 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, #endif // __aarch64__ // Need these here, else the compiler complains about mixing declarations and // code in C90 - (void)n_coeffs; (void)scan; } diff --git a/vpx_dsp/quantize.c b/vpx_dsp/quantize.c index 5d6ba64a8a..212db45c88 100644 --- a/vpx_dsp/quantize.c +++ b/vpx_dsp/quantize.c @@ -14,6 +14,7 @@ #include "vpx_dsp/quantize.h" #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_mem/vpx_mem.h" +#include "vp9/encoder/vp9_block.h" void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, const int16_t *round_ptr, const int16_t quant, @@ -208,19 +209,21 @@ void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, } #endif -void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, +void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { - const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], 1), - ROUND_POWER_OF_TWO(zbin_ptr[1], 1) }; + const int n_coeffs = 32 * 32; + const int zbins[2] = { ROUND_POWER_OF_TWO(mb_plane->zbin[0], 1), + ROUND_POWER_OF_TWO(mb_plane->zbin[1], 1) }; const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; + const int16_t *round_ptr = mb_plane->round; + const int16_t *quant_ptr = mb_plane->quant; + const int16_t *quant_shift_ptr = mb_plane->quant_shift; int idx = 0; - int idx_arr[1024]; + int idx_arr[32 * 32 /* n_coeffs */]; int i, eob = -1; (void)iscan; diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index eef72249e0..639c18bc98 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -17,6 +17,9 @@ () #include "vpx/vpx_integer.h" #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_dsp/vpx_filter.h" +#if CONFIG_VP9_ENCODER + struct macroblock_plane; +#endif EOF } @@ -717,7 +720,7 @@ () add_proto qw/void vpx_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vpx_quantize_b neon sse2 ssse3 avx avx2 vsx lsx/; - add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, const struct macroblock_plane * const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vpx_quantize_b_32x32 neon ssse3 avx avx2 vsx lsx/; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { diff --git a/vpx_dsp/x86/quantize_avx.c b/vpx_dsp/x86/quantize_avx.c index 7d83527216..d52f6c6644 100644 --- a/vpx_dsp/x86/quantize_avx.c +++ b/vpx_dsp/x86/quantize_avx.c @@ -140,15 +140,12 @@ void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, *eob_ptr = accumulate_eob(eob); } -void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, +void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { const __m128i zero = _mm_setzero_si128(); - const __m128i one = _mm_set1_epi16(1); const __m256i big_zero = _mm256_setzero_si256(); int index; @@ -160,26 +157,9 @@ void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, __m128i eob = zero, eob0; (void)scan; - (void)n_coeffs; - - // Setup global values. - // The 32x32 halves zbin and round. - zbin = _mm_load_si128((const __m128i *)zbin_ptr); - // Shift with rounding. - zbin = _mm_add_epi16(zbin, one); - zbin = _mm_srli_epi16(zbin, 1); - // x86 has no "greater *or equal*" comparison. Subtract 1 from zbin so - // it is a strict "greater" comparison. - zbin = _mm_sub_epi16(zbin, one); - - round = _mm_load_si128((const __m128i *)round_ptr); - round = _mm_add_epi16(round, one); - round = _mm_srli_epi16(round, 1); - - quant = _mm_load_si128((const __m128i *)quant_ptr); - dequant = _mm_load_si128((const __m128i *)dequant_ptr); - shift = _mm_load_si128((const __m128i *)quant_shift_ptr); - shift = _mm_slli_epi16(shift, 1); + + load_b_values32x32(mb_plane, &zbin, &round, &quant, dequant_ptr, &dequant, + &shift); // Do DC and first 15 AC. coeff0 = load_tran_low(coeff_ptr); diff --git a/vpx_dsp/x86/quantize_avx2.c b/vpx_dsp/x86/quantize_avx2.c index 28f7c9c7da..a8412c5b8e 100644 --- a/vpx_dsp/x86/quantize_avx2.c +++ b/vpx_dsp/x86/quantize_avx2.c @@ -13,6 +13,7 @@ #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" +#include "vp9/encoder/vp9_block.h" static VPX_FORCE_INLINE void load_b_values_avx2( const int16_t *zbin_ptr, __m256i *zbin, const int16_t *round_ptr, @@ -250,23 +251,19 @@ static VPX_FORCE_INLINE __m256i quantize_b_32x32_16( } } -void vpx_quantize_b_32x32_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, - const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, +void vpx_quantize_b_32x32_avx2(const tran_low_t *coeff_ptr, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { __m256i v_zbin, v_round, v_quant, v_dequant, v_quant_shift; __m256i v_eobmax = _mm256_setzero_si256(); intptr_t count; - (void)n_coeffs; (void)scan; - load_b_values_avx2(zbin_ptr, &v_zbin, round_ptr, &v_round, quant_ptr, - &v_quant, dequant_ptr, &v_dequant, quant_shift_ptr, - &v_quant_shift, 1); + load_b_values_avx2(mb_plane->zbin, &v_zbin, mb_plane->round, &v_round, + mb_plane->quant, &v_quant, dequant_ptr, &v_dequant, + mb_plane->quant_shift, &v_quant_shift, 1); // Do DC and first 15 AC. v_eobmax = quantize_b_32x32_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, iscan, diff --git a/vpx_dsp/x86/quantize_sse2.h b/vpx_dsp/x86/quantize_sse2.h index 27bfb4e41b..fe42fee018 100644 --- a/vpx_dsp/x86/quantize_sse2.h +++ b/vpx_dsp/x86/quantize_sse2.h @@ -15,6 +15,7 @@ #include "./vpx_config.h" #include "vpx/vpx_integer.h" +#include "vp9/encoder/vp9_block.h" static INLINE void load_b_values(const int16_t *zbin_ptr, __m128i *zbin, const int16_t *round_ptr, __m128i *round, @@ -29,6 +30,33 @@ static INLINE void load_b_values(const int16_t *zbin_ptr, __m128i *zbin, *shift = _mm_load_si128((const __m128i *)shift_ptr); } +static INLINE void load_b_values32x32( + const struct macroblock_plane *const mb_plane, __m128i *zbin, + __m128i *round, __m128i *quant, const int16_t *dequant_ptr, + __m128i *dequant, __m128i *shift) { + const __m128i one = _mm_set1_epi16(1); + // The 32x32 halves zbin and round. + *zbin = _mm_load_si128((const __m128i *)mb_plane->zbin); + // Shift with rounding. + *zbin = _mm_add_epi16(*zbin, one); + *zbin = _mm_srli_epi16(*zbin, 1); + // x86 has no "greater *or equal*" comparison. Subtract 1 from zbin so + // it is a strict "greater" comparison. + *zbin = _mm_sub_epi16(*zbin, one); + + *round = _mm_load_si128((const __m128i *)mb_plane->round); + *round = _mm_add_epi16(*round, one); + *round = _mm_srli_epi16(*round, 1); + + *quant = _mm_load_si128((const __m128i *)mb_plane->quant); + *dequant = _mm_load_si128((const __m128i *)dequant_ptr); + *shift = _mm_load_si128((const __m128i *)mb_plane->quant_shift); + // I suspect this is not technically OK because quant_shift can be up + // to 1 << 16 and shifting up again will outrange that, but the test is not + // comprehensive enough to catch that and "it's been that way forever" + *shift = _mm_slli_epi16(*shift, 1); +} + static INLINE void load_fp_values(const int16_t *round_ptr, __m128i *round, const int16_t *quant_ptr, __m128i *quant, const int16_t *dequant_ptr, diff --git a/vpx_dsp/x86/quantize_ssse3.c b/vpx_dsp/x86/quantize_ssse3.c index 476230286d..6fe54d7d98 100644 --- a/vpx_dsp/x86/quantize_ssse3.c +++ b/vpx_dsp/x86/quantize_ssse3.c @@ -16,6 +16,7 @@ #include "vpx_dsp/x86/bitdepth_conversion_sse2.h" #include "vpx_dsp/x86/quantize_sse2.h" #include "vpx_dsp/x86/quantize_ssse3.h" +#include "vp9/encoder/vp9_block.h" void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, @@ -107,16 +108,12 @@ void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, *eob_ptr = accumulate_eob(eob); } -void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, - const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, +void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { const __m128i zero = _mm_setzero_si128(); - const __m128i one = _mm_set1_epi16(1); int index; __m128i zbin, round, quant, dequant, shift; @@ -127,29 +124,9 @@ void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, __m128i eob = zero, eob0; (void)scan; - (void)n_coeffs; - - // Setup global values. - // The 32x32 halves zbin and round. - zbin = _mm_load_si128((const __m128i *)zbin_ptr); - // Shift with rounding. - zbin = _mm_add_epi16(zbin, one); - zbin = _mm_srli_epi16(zbin, 1); - // x86 has no "greater *or equal*" comparison. Subtract 1 from zbin so - // it is a strict "greater" comparison. - zbin = _mm_sub_epi16(zbin, one); - - round = _mm_load_si128((const __m128i *)round_ptr); - round = _mm_add_epi16(round, one); - round = _mm_srli_epi16(round, 1); - - quant = _mm_load_si128((const __m128i *)quant_ptr); - dequant = _mm_load_si128((const __m128i *)dequant_ptr); - shift = _mm_load_si128((const __m128i *)quant_shift_ptr); - // I suspect this is not technically OK because quant_shift can be up - // to 1 << 16 and shifting up again will outrange that, but the test is not - // comprehensive enough to catch that and "it's been that way forever" - shift = _mm_slli_epi16(shift, 1); + + load_b_values32x32(mb_plane, &zbin, &round, &quant, dequant_ptr, &dequant, + &shift); // Do DC and first 15 AC. coeff0 = load_tran_low(coeff_ptr); From a7ab16aed1d75869c5fd096374a91698b419c1a7 Mon Sep 17 00:00:00 2001 From: George Steed Date: Fri, 3 Feb 2023 17:12:46 +0000 Subject: [PATCH 0631/1000] Implement d63_predictor using Neon Add Neon implementations of the d63 predictor for 4x4, 8x8, 16x16 and 32x32 block sizes. Also update tests to add new corresponding cases. Speedups over the C code (higher is better): Microarch. | Compiler | Block | Speedup Neoverse N1 | LLVM 15 | 4x4 | 2.10 Neoverse N1 | LLVM 15 | 8x8 | 4.45 Neoverse N1 | LLVM 15 | 16x16 | 4.74 Neoverse N1 | LLVM 15 | 32x32 | 2.27 Neoverse N1 | GCC 12 | 4x4 | 2.46 Neoverse N1 | GCC 12 | 8x8 | 10.37 Neoverse N1 | GCC 12 | 16x16 | 11.46 Neoverse N1 | GCC 12 | 32x32 | 6.57 Neoverse V1 | LLVM 15 | 4x4 | 2.24 Neoverse V1 | LLVM 15 | 8x8 | 3.53 Neoverse V1 | LLVM 15 | 16x16 | 4.44 Neoverse V1 | LLVM 15 | 32x32 | 2.17 Neoverse V1 | GCC 12 | 4x4 | 2.25 Neoverse V1 | GCC 12 | 8x8 | 7.67 Neoverse V1 | GCC 12 | 16x16 | 8.97 Neoverse V1 | GCC 12 | 32x32 | 4.77 Change-Id: Ib4a1a2cb5a5c4495ae329529f8847664cbd0dfe0 --- test/test_intra_pred_speed.cc | 12 +-- test/vp9_intrapred_test.cc | 8 ++ vpx_dsp/arm/intrapred_neon.c | 162 ++++++++++++++++++++++++++++++++++ vpx_dsp/arm/mem_neon.h | 15 ++++ vpx_dsp/vpx_dsp_rtcd_defs.pl | 8 +- 5 files changed, 195 insertions(+), 10 deletions(-) diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc index 28b3484a03..df01ccac22 100644 --- a/test/test_intra_pred_speed.cc +++ b/test/test_intra_pred_speed.cc @@ -269,28 +269,28 @@ INTRA_PRED_TEST(NEON, TestIntraPred4, vpx_dc_predictor_4x4_neon, vpx_dc_left_predictor_4x4_neon, vpx_dc_top_predictor_4x4_neon, vpx_dc_128_predictor_4x4_neon, vpx_v_predictor_4x4_neon, vpx_h_predictor_4x4_neon, vpx_d45_predictor_4x4_neon, - vpx_d135_predictor_4x4_neon, nullptr, nullptr, nullptr, nullptr, - vpx_tm_predictor_4x4_neon) + vpx_d135_predictor_4x4_neon, nullptr, nullptr, nullptr, + vpx_d63_predictor_4x4_neon, vpx_tm_predictor_4x4_neon) INTRA_PRED_TEST(NEON, TestIntraPred8, vpx_dc_predictor_8x8_neon, vpx_dc_left_predictor_8x8_neon, vpx_dc_top_predictor_8x8_neon, vpx_dc_128_predictor_8x8_neon, vpx_v_predictor_8x8_neon, vpx_h_predictor_8x8_neon, vpx_d45_predictor_8x8_neon, - vpx_d135_predictor_8x8_neon, nullptr, nullptr, nullptr, nullptr, - vpx_tm_predictor_8x8_neon) + vpx_d135_predictor_8x8_neon, nullptr, nullptr, nullptr, + vpx_d63_predictor_8x8_neon, vpx_tm_predictor_8x8_neon) INTRA_PRED_TEST(NEON, TestIntraPred16, vpx_dc_predictor_16x16_neon, vpx_dc_left_predictor_16x16_neon, vpx_dc_top_predictor_16x16_neon, vpx_dc_128_predictor_16x16_neon, vpx_v_predictor_16x16_neon, vpx_h_predictor_16x16_neon, vpx_d45_predictor_16x16_neon, vpx_d135_predictor_16x16_neon, nullptr, nullptr, nullptr, - nullptr, vpx_tm_predictor_16x16_neon) + vpx_d63_predictor_16x16_neon, vpx_tm_predictor_16x16_neon) INTRA_PRED_TEST(NEON, TestIntraPred32, vpx_dc_predictor_32x32_neon, vpx_dc_left_predictor_32x32_neon, vpx_dc_top_predictor_32x32_neon, vpx_dc_128_predictor_32x32_neon, vpx_v_predictor_32x32_neon, vpx_h_predictor_32x32_neon, vpx_d45_predictor_32x32_neon, vpx_d135_predictor_32x32_neon, nullptr, nullptr, nullptr, - nullptr, vpx_tm_predictor_32x32_neon) + vpx_d63_predictor_32x32_neon, vpx_tm_predictor_32x32_neon) #endif // HAVE_NEON #if HAVE_MSA diff --git a/test/vp9_intrapred_test.cc b/test/vp9_intrapred_test.cc index ccace719ea..12a227b12c 100644 --- a/test/vp9_intrapred_test.cc +++ b/test/vp9_intrapred_test.cc @@ -243,6 +243,14 @@ INSTANTIATE_TEST_SUITE_P( &vpx_d45_predictor_16x16_c, 16, 8), IntraPredParam(&vpx_d45_predictor_32x32_neon, &vpx_d45_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_d63_predictor_4x4_neon, &vpx_d63_predictor_4x4_c, 4, + 8), + IntraPredParam(&vpx_d63_predictor_8x8_neon, &vpx_d63_predictor_8x8_c, 8, + 8), + IntraPredParam(&vpx_d63_predictor_16x16_neon, + &vpx_d63_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_d63_predictor_32x32_neon, + &vpx_d63_predictor_32x32_c, 32, 8), IntraPredParam(&vpx_d135_predictor_4x4_neon, &vpx_d135_predictor_4x4_c, 4, 8), IntraPredParam(&vpx_d135_predictor_8x8_neon, &vpx_d135_predictor_8x8_c, diff --git a/vpx_dsp/arm/intrapred_neon.c b/vpx_dsp/arm/intrapred_neon.c index 38e275834b..02a05aae53 100644 --- a/vpx_dsp/arm/intrapred_neon.c +++ b/vpx_dsp/arm/intrapred_neon.c @@ -12,6 +12,7 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" +#include "mem_neon.h" #include "vpx/vpx_integer.h" //------------------------------------------------------------------------------ @@ -383,6 +384,167 @@ void vpx_d45_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, // ----------------------------------------------------------------------------- +void vpx_d63_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + uint8x8_t a0, a1, a2, a3, d0, d1, d2, d3; + (void)left; + + a0 = load_unaligned_u8_4x1(above + 0); + a1 = load_unaligned_u8_4x1(above + 1); + a2 = load_unaligned_u8_4x1(above + 2); + a3 = load_unaligned_u8_4x1(above + 3); + + d0 = vrhadd_u8(a0, a1); + d1 = vrhadd_u8(vhadd_u8(a0, a2), a1); + d2 = vrhadd_u8(a1, a2); + d3 = vrhadd_u8(vhadd_u8(a1, a3), a2); + + store_u8_4x1(dst + 0 * stride, d0); + store_u8_4x1(dst + 1 * stride, d1); + store_u8_4x1(dst + 2 * stride, d2); + store_u8_4x1(dst + 3 * stride, d3); +} + +void vpx_d63_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + uint8x8_t a0, a1, a2, a7, d0, d1; + (void)left; + + a0 = vld1_u8(above + 0); + a1 = vld1_u8(above + 1); + a2 = vld1_u8(above + 2); + a7 = vld1_dup_u8(above + 7); + + d0 = vrhadd_u8(a0, a1); + d1 = vrhadd_u8(vhadd_u8(a0, a2), a1); + + vst1_u8(dst + 0 * stride, d0); + vst1_u8(dst + 1 * stride, d1); + vst1_u8(dst + 2 * stride, vext_u8(d0, a7, 1)); + vst1_u8(dst + 3 * stride, vext_u8(d1, a7, 1)); + vst1_u8(dst + 4 * stride, vext_u8(d0, a7, 2)); + vst1_u8(dst + 5 * stride, vext_u8(d1, a7, 2)); + vst1_u8(dst + 6 * stride, vext_u8(d0, a7, 3)); + vst1_u8(dst + 7 * stride, vext_u8(d1, a7, 3)); +} + +void vpx_d63_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + uint8x16_t a0, a1, a2, a15, d0, d1; + (void)left; + + a0 = vld1q_u8(above + 0); + a1 = vld1q_u8(above + 1); + a2 = vld1q_u8(above + 2); + a15 = vld1q_dup_u8(above + 15); + + d0 = vrhaddq_u8(a0, a1); + d1 = vrhaddq_u8(vhaddq_u8(a0, a2), a1); + + vst1q_u8(dst + 0 * stride, d0); + vst1q_u8(dst + 1 * stride, d1); + vst1q_u8(dst + 2 * stride, vextq_u8(d0, a15, 1)); + vst1q_u8(dst + 3 * stride, vextq_u8(d1, a15, 1)); + vst1q_u8(dst + 4 * stride, vextq_u8(d0, a15, 2)); + vst1q_u8(dst + 5 * stride, vextq_u8(d1, a15, 2)); + vst1q_u8(dst + 6 * stride, vextq_u8(d0, a15, 3)); + vst1q_u8(dst + 7 * stride, vextq_u8(d1, a15, 3)); + vst1q_u8(dst + 8 * stride, vextq_u8(d0, a15, 4)); + vst1q_u8(dst + 9 * stride, vextq_u8(d1, a15, 4)); + vst1q_u8(dst + 10 * stride, vextq_u8(d0, a15, 5)); + vst1q_u8(dst + 11 * stride, vextq_u8(d1, a15, 5)); + vst1q_u8(dst + 12 * stride, vextq_u8(d0, a15, 6)); + vst1q_u8(dst + 13 * stride, vextq_u8(d1, a15, 6)); + vst1q_u8(dst + 14 * stride, vextq_u8(d0, a15, 7)); + vst1q_u8(dst + 15 * stride, vextq_u8(d1, a15, 7)); +} + +void vpx_d63_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + uint8x16_t a0, a1, a2, a16, a17, a18, a31, d0_lo, d0_hi, d1_lo, d1_hi; + (void)left; + + a0 = vld1q_u8(above + 0); + a1 = vld1q_u8(above + 1); + a2 = vld1q_u8(above + 2); + a16 = vld1q_u8(above + 16); + a17 = vld1q_u8(above + 17); + a18 = vld1q_u8(above + 18); + a31 = vld1q_dup_u8(above + 31); + + d0_lo = vrhaddq_u8(a0, a1); + d0_hi = vrhaddq_u8(a16, a17); + d1_lo = vrhaddq_u8(vhaddq_u8(a0, a2), a1); + d1_hi = vrhaddq_u8(vhaddq_u8(a16, a18), a17); + + vst1q_u8(dst + 0 * stride + 0, d0_lo); + vst1q_u8(dst + 0 * stride + 16, d0_hi); + vst1q_u8(dst + 1 * stride + 0, d1_lo); + vst1q_u8(dst + 1 * stride + 16, d1_hi); + vst1q_u8(dst + 2 * stride + 0, vextq_u8(d0_lo, d0_hi, 1)); + vst1q_u8(dst + 2 * stride + 16, vextq_u8(d0_hi, a31, 1)); + vst1q_u8(dst + 3 * stride + 0, vextq_u8(d1_lo, d1_hi, 1)); + vst1q_u8(dst + 3 * stride + 16, vextq_u8(d1_hi, a31, 1)); + vst1q_u8(dst + 4 * stride + 0, vextq_u8(d0_lo, d0_hi, 2)); + vst1q_u8(dst + 4 * stride + 16, vextq_u8(d0_hi, a31, 2)); + vst1q_u8(dst + 5 * stride + 0, vextq_u8(d1_lo, d1_hi, 2)); + vst1q_u8(dst + 5 * stride + 16, vextq_u8(d1_hi, a31, 2)); + vst1q_u8(dst + 6 * stride + 0, vextq_u8(d0_lo, d0_hi, 3)); + vst1q_u8(dst + 6 * stride + 16, vextq_u8(d0_hi, a31, 3)); + vst1q_u8(dst + 7 * stride + 0, vextq_u8(d1_lo, d1_hi, 3)); + vst1q_u8(dst + 7 * stride + 16, vextq_u8(d1_hi, a31, 3)); + vst1q_u8(dst + 8 * stride + 0, vextq_u8(d0_lo, d0_hi, 4)); + vst1q_u8(dst + 8 * stride + 16, vextq_u8(d0_hi, a31, 4)); + vst1q_u8(dst + 9 * stride + 0, vextq_u8(d1_lo, d1_hi, 4)); + vst1q_u8(dst + 9 * stride + 16, vextq_u8(d1_hi, a31, 4)); + vst1q_u8(dst + 10 * stride + 0, vextq_u8(d0_lo, d0_hi, 5)); + vst1q_u8(dst + 10 * stride + 16, vextq_u8(d0_hi, a31, 5)); + vst1q_u8(dst + 11 * stride + 0, vextq_u8(d1_lo, d1_hi, 5)); + vst1q_u8(dst + 11 * stride + 16, vextq_u8(d1_hi, a31, 5)); + vst1q_u8(dst + 12 * stride + 0, vextq_u8(d0_lo, d0_hi, 6)); + vst1q_u8(dst + 12 * stride + 16, vextq_u8(d0_hi, a31, 6)); + vst1q_u8(dst + 13 * stride + 0, vextq_u8(d1_lo, d1_hi, 6)); + vst1q_u8(dst + 13 * stride + 16, vextq_u8(d1_hi, a31, 6)); + vst1q_u8(dst + 14 * stride + 0, vextq_u8(d0_lo, d0_hi, 7)); + vst1q_u8(dst + 14 * stride + 16, vextq_u8(d0_hi, a31, 7)); + vst1q_u8(dst + 15 * stride + 0, vextq_u8(d1_lo, d1_hi, 7)); + vst1q_u8(dst + 15 * stride + 16, vextq_u8(d1_hi, a31, 7)); + vst1q_u8(dst + 16 * stride + 0, vextq_u8(d0_lo, d0_hi, 8)); + vst1q_u8(dst + 16 * stride + 16, vextq_u8(d0_hi, a31, 8)); + vst1q_u8(dst + 17 * stride + 0, vextq_u8(d1_lo, d1_hi, 8)); + vst1q_u8(dst + 17 * stride + 16, vextq_u8(d1_hi, a31, 8)); + vst1q_u8(dst + 18 * stride + 0, vextq_u8(d0_lo, d0_hi, 9)); + vst1q_u8(dst + 18 * stride + 16, vextq_u8(d0_hi, a31, 9)); + vst1q_u8(dst + 19 * stride + 0, vextq_u8(d1_lo, d1_hi, 9)); + vst1q_u8(dst + 19 * stride + 16, vextq_u8(d1_hi, a31, 9)); + vst1q_u8(dst + 20 * stride + 0, vextq_u8(d0_lo, d0_hi, 10)); + vst1q_u8(dst + 20 * stride + 16, vextq_u8(d0_hi, a31, 10)); + vst1q_u8(dst + 21 * stride + 0, vextq_u8(d1_lo, d1_hi, 10)); + vst1q_u8(dst + 21 * stride + 16, vextq_u8(d1_hi, a31, 10)); + vst1q_u8(dst + 22 * stride + 0, vextq_u8(d0_lo, d0_hi, 11)); + vst1q_u8(dst + 22 * stride + 16, vextq_u8(d0_hi, a31, 11)); + vst1q_u8(dst + 23 * stride + 0, vextq_u8(d1_lo, d1_hi, 11)); + vst1q_u8(dst + 23 * stride + 16, vextq_u8(d1_hi, a31, 11)); + vst1q_u8(dst + 24 * stride + 0, vextq_u8(d0_lo, d0_hi, 12)); + vst1q_u8(dst + 24 * stride + 16, vextq_u8(d0_hi, a31, 12)); + vst1q_u8(dst + 25 * stride + 0, vextq_u8(d1_lo, d1_hi, 12)); + vst1q_u8(dst + 25 * stride + 16, vextq_u8(d1_hi, a31, 12)); + vst1q_u8(dst + 26 * stride + 0, vextq_u8(d0_lo, d0_hi, 13)); + vst1q_u8(dst + 26 * stride + 16, vextq_u8(d0_hi, a31, 13)); + vst1q_u8(dst + 27 * stride + 0, vextq_u8(d1_lo, d1_hi, 13)); + vst1q_u8(dst + 27 * stride + 16, vextq_u8(d1_hi, a31, 13)); + vst1q_u8(dst + 28 * stride + 0, vextq_u8(d0_lo, d0_hi, 14)); + vst1q_u8(dst + 28 * stride + 16, vextq_u8(d0_hi, a31, 14)); + vst1q_u8(dst + 29 * stride + 0, vextq_u8(d1_lo, d1_hi, 14)); + vst1q_u8(dst + 29 * stride + 16, vextq_u8(d1_hi, a31, 14)); + vst1q_u8(dst + 30 * stride + 0, vextq_u8(d0_lo, d0_hi, 15)); + vst1q_u8(dst + 30 * stride + 16, vextq_u8(d0_hi, a31, 15)); + vst1q_u8(dst + 31 * stride + 0, vextq_u8(d1_lo, d1_hi, 15)); + vst1q_u8(dst + 31 * stride + 16, vextq_u8(d1_hi, a31, 15)); +} + +// ----------------------------------------------------------------------------- + void vpx_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const uint8x8_t XA0123 = vld1_u8(above - 1); diff --git a/vpx_dsp/arm/mem_neon.h b/vpx_dsp/arm/mem_neon.h index 866be7439e..d1ea361896 100644 --- a/vpx_dsp/arm/mem_neon.h +++ b/vpx_dsp/arm/mem_neon.h @@ -112,6 +112,21 @@ static INLINE void uint32_to_mem(uint8_t *buf, uint32_t a) { memcpy(buf, &a, 4); } +// Load 4 contiguous bytes when alignment is not guaranteed. +static INLINE uint8x8_t load_unaligned_u8_4x1(const uint8_t *buf) { + uint32_t a; + uint32x2_t a_u32; + memcpy(&a, buf, 4); + a_u32 = vdup_n_u32(0); + a_u32 = vset_lane_u32(a, a_u32, 0); + return vreinterpret_u8_u32(a_u32); +} + +// Store 4 contiguous bytes from the low half of an 8x8 vector. +static INLINE void store_u8_4x1(uint8_t *buf, uint8x8_t a) { + vst1_lane_u32((uint32_t *)buf, vreinterpret_u32_u8(a), 0); +} + // Load 2 sets of 4 bytes when alignment is not guaranteed. static INLINE uint8x8_t load_unaligned_u8(const uint8_t *buf, ptrdiff_t stride) { diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 8725821b67..9ee9fc1c4f 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -46,7 +46,7 @@ () add_proto qw/void vpx_d45e_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; add_proto qw/void vpx_d63_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_d63_predictor_4x4 ssse3/; +specialize qw/vpx_d63_predictor_4x4 neon ssse3/; add_proto qw/void vpx_d63e_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; @@ -94,7 +94,7 @@ () add_proto qw/void vpx_d63_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; # TODO(crbug.com/webm/1522): Re-enable vsx implementation. -specialize qw/vpx_d63_predictor_8x8 ssse3/; +specialize qw/vpx_d63_predictor_8x8 neon ssse3/; add_proto qw/void vpx_h_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; # TODO(crbug.com/webm/1522): Re-enable vsx implementation. @@ -135,7 +135,7 @@ () specialize qw/vpx_d45_predictor_16x16 neon ssse3 vsx/; add_proto qw/void vpx_d63_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_d63_predictor_16x16 ssse3 vsx/; +specialize qw/vpx_d63_predictor_16x16 neon ssse3 vsx/; add_proto qw/void vpx_h_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_h_predictor_16x16 neon dspr2 msa sse2 vsx/; @@ -173,7 +173,7 @@ () specialize qw/vpx_d45_predictor_32x32 neon ssse3 vsx/; add_proto qw/void vpx_d63_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_d63_predictor_32x32 ssse3 vsx/; +specialize qw/vpx_d63_predictor_32x32 neon ssse3 vsx/; add_proto qw/void vpx_h_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_h_predictor_32x32 neon msa sse2 vsx/; From 360e9069b6cc1dd3a004728b876fb923413f4b11 Mon Sep 17 00:00:00 2001 From: George Steed Date: Tue, 7 Feb 2023 12:16:00 +0000 Subject: [PATCH 0632/1000] Implement d117_predictor using Neon Add Neon implementations of the d117 predictor for 4x4, 8x8, 16x16 and 32x32 block sizes. Also update tests to add new corresponding cases. An explanation of the general implementation strategy is given in the 8x8 implementation body. Speedups over the C code (higher is better): Microarch. | Compiler | Block | Speedup Neoverse N1 | LLVM 15 | 4x4 | 1.73 Neoverse N1 | LLVM 15 | 8x8 | 5.24 Neoverse N1 | LLVM 15 | 16x16 | 9.77 Neoverse N1 | LLVM 15 | 32x32 | 14.13 Neoverse N1 | GCC 12 | 4x4 | 2.04 Neoverse N1 | GCC 12 | 8x8 | 4.70 Neoverse N1 | GCC 12 | 16x16 | 8.64 Neoverse N1 | GCC 12 | 32x32 | 4.57 Neoverse V1 | LLVM 15 | 4x4 | 1.75 Neoverse V1 | LLVM 15 | 8x8 | 6.79 Neoverse V1 | LLVM 15 | 16x16 | 9.16 Neoverse V1 | LLVM 15 | 32x32 | 14.47 Neoverse V1 | GCC 12 | 4x4 | 1.75 Neoverse V1 | GCC 12 | 8x8 | 6.00 Neoverse V1 | GCC 12 | 16x16 | 7.63 Neoverse V1 | GCC 12 | 32x32 | 4.32 Change-Id: I7228327b5be27ee7a68deecafa05be0bd2a40ff4 --- test/test_intra_pred_speed.cc | 20 +-- test/vp9_intrapred_test.cc | 8 ++ vpx_dsp/arm/intrapred_neon.c | 232 ++++++++++++++++++++++++++++++++++ vpx_dsp/vpx_dsp_rtcd_defs.pl | 4 + 4 files changed, 256 insertions(+), 8 deletions(-) diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc index df01ccac22..5861a17770 100644 --- a/test/test_intra_pred_speed.cc +++ b/test/test_intra_pred_speed.cc @@ -269,28 +269,32 @@ INTRA_PRED_TEST(NEON, TestIntraPred4, vpx_dc_predictor_4x4_neon, vpx_dc_left_predictor_4x4_neon, vpx_dc_top_predictor_4x4_neon, vpx_dc_128_predictor_4x4_neon, vpx_v_predictor_4x4_neon, vpx_h_predictor_4x4_neon, vpx_d45_predictor_4x4_neon, - vpx_d135_predictor_4x4_neon, nullptr, nullptr, nullptr, - vpx_d63_predictor_4x4_neon, vpx_tm_predictor_4x4_neon) + vpx_d135_predictor_4x4_neon, vpx_d117_predictor_4x4_neon, + nullptr, nullptr, vpx_d63_predictor_4x4_neon, + vpx_tm_predictor_4x4_neon) INTRA_PRED_TEST(NEON, TestIntraPred8, vpx_dc_predictor_8x8_neon, vpx_dc_left_predictor_8x8_neon, vpx_dc_top_predictor_8x8_neon, vpx_dc_128_predictor_8x8_neon, vpx_v_predictor_8x8_neon, vpx_h_predictor_8x8_neon, vpx_d45_predictor_8x8_neon, - vpx_d135_predictor_8x8_neon, nullptr, nullptr, nullptr, - vpx_d63_predictor_8x8_neon, vpx_tm_predictor_8x8_neon) + vpx_d135_predictor_8x8_neon, vpx_d117_predictor_8x8_neon, + nullptr, nullptr, vpx_d63_predictor_8x8_neon, + vpx_tm_predictor_8x8_neon) INTRA_PRED_TEST(NEON, TestIntraPred16, vpx_dc_predictor_16x16_neon, vpx_dc_left_predictor_16x16_neon, vpx_dc_top_predictor_16x16_neon, vpx_dc_128_predictor_16x16_neon, vpx_v_predictor_16x16_neon, vpx_h_predictor_16x16_neon, vpx_d45_predictor_16x16_neon, - vpx_d135_predictor_16x16_neon, nullptr, nullptr, nullptr, - vpx_d63_predictor_16x16_neon, vpx_tm_predictor_16x16_neon) + vpx_d135_predictor_16x16_neon, vpx_d117_predictor_16x16_neon, + nullptr, nullptr, vpx_d63_predictor_16x16_neon, + vpx_tm_predictor_16x16_neon) INTRA_PRED_TEST(NEON, TestIntraPred32, vpx_dc_predictor_32x32_neon, vpx_dc_left_predictor_32x32_neon, vpx_dc_top_predictor_32x32_neon, vpx_dc_128_predictor_32x32_neon, vpx_v_predictor_32x32_neon, vpx_h_predictor_32x32_neon, vpx_d45_predictor_32x32_neon, - vpx_d135_predictor_32x32_neon, nullptr, nullptr, nullptr, - vpx_d63_predictor_32x32_neon, vpx_tm_predictor_32x32_neon) + vpx_d135_predictor_32x32_neon, vpx_d117_predictor_32x32_neon, + nullptr, nullptr, vpx_d63_predictor_32x32_neon, + vpx_tm_predictor_32x32_neon) #endif // HAVE_NEON #if HAVE_MSA diff --git a/test/vp9_intrapred_test.cc b/test/vp9_intrapred_test.cc index 12a227b12c..d04be429d1 100644 --- a/test/vp9_intrapred_test.cc +++ b/test/vp9_intrapred_test.cc @@ -251,6 +251,14 @@ INSTANTIATE_TEST_SUITE_P( &vpx_d63_predictor_16x16_c, 16, 8), IntraPredParam(&vpx_d63_predictor_32x32_neon, &vpx_d63_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_d117_predictor_4x4_neon, &vpx_d117_predictor_4x4_c, + 4, 8), + IntraPredParam(&vpx_d117_predictor_8x8_neon, &vpx_d117_predictor_8x8_c, + 8, 8), + IntraPredParam(&vpx_d117_predictor_16x16_neon, + &vpx_d117_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_d117_predictor_32x32_neon, + &vpx_d117_predictor_32x32_c, 32, 8), IntraPredParam(&vpx_d135_predictor_4x4_neon, &vpx_d135_predictor_4x4_c, 4, 8), IntraPredParam(&vpx_d135_predictor_8x8_neon, &vpx_d135_predictor_8x8_c, diff --git a/vpx_dsp/arm/intrapred_neon.c b/vpx_dsp/arm/intrapred_neon.c index 02a05aae53..4760a295b9 100644 --- a/vpx_dsp/arm/intrapred_neon.c +++ b/vpx_dsp/arm/intrapred_neon.c @@ -545,6 +545,238 @@ void vpx_d63_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, // ----------------------------------------------------------------------------- +void vpx_d117_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + // See vpx_d117_predictor_8x8_neon for more details on the implementation. + uint8x8_t az, a0, l0az, d0, d1, d2, d3, col0, col1; + + az = load_unaligned_u8_4x1(above - 1); + a0 = load_unaligned_u8_4x1(above + 0); + // [ left[0], above[-1], above[0], above[1], x, x, x, x ] + l0az = vext_u8(vld1_dup_u8(left), az, 7); + + col0 = vdup_n_u8((above[-1] + 2 * left[0] + left[1] + 2) >> 2); + col1 = vdup_n_u8((left[0] + 2 * left[1] + left[2] + 2) >> 2); + + d0 = vrhadd_u8(az, a0); + d1 = vrhadd_u8(vhadd_u8(l0az, a0), az); + d2 = vext_u8(col0, d0, 7); + d3 = vext_u8(col1, d1, 7); + + store_u8_4x1(dst + 0 * stride, d0); + store_u8_4x1(dst + 1 * stride, d1); + store_u8_4x1(dst + 2 * stride, d2); + store_u8_4x1(dst + 3 * stride, d3); +} + +void vpx_d117_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + uint8x8_t az, a0, l0az, d0, d1, l0, l1, azl0, col0, col0_even, col0_odd; + + az = vld1_u8(above - 1); + a0 = vld1_u8(above + 0); + // [ left[0], above[-1], ... , above[5] ] + l0az = vext_u8(vld1_dup_u8(left), az, 7); + + l0 = vld1_u8(left + 0); + l1 = vld1_u8(left + 1); + // [ above[-1], left[0], ... , left[6] ] + azl0 = vext_u8(vld1_dup_u8(above - 1), l0, 7); + + // d0[0] = AVG2(above[-1], above[0]) + // d0[1] = AVG2(above[0], above[1]) + // ... + // d0[7] = AVG2(above[6], above[7]) + d0 = vrhadd_u8(az, a0); + + // d1[0] = AVG3(left[0], above[-1], above[0]) + // d1[1] = AVG3(above[-1], above[0], above[1]) + // ... + // d1[7] = AVG3(above[5], above[6], above[7]) + d1 = vrhadd_u8(vhadd_u8(l0az, a0), az); + + // The ext instruction shifts elements in from the end of the vector rather + // than the start, so reverse the vector to put the elements to be shifted in + // at the end: + // col0[7] = AVG3(above[-1], left[0], left[1]) + // col0[6] = AVG3(left[0], left[1], left[2]) + // ... + // col0[0] = AVG3(left[6], left[7], left[8]) + col0 = vrev64_u8(vrhadd_u8(vhadd_u8(azl0, l1), l0)); + + // We don't care about the first parameter to this uzp since we only ever use + // the high three elements, we just use col0 again since it is already + // available: + // col0_even = [ x, x, x, x, x, col0[3], col0[5], col0[7] ] + // col0_odd = [ x, x, x, x, x, col0[2], col0[4], col0[6] ] + col0_even = vuzp_u8(col0, col0).val[1]; + col0_odd = vuzp_u8(col0, col0).val[0]; + + // Incrementally shift more elements from col0 into d0/1: + // stride=0 [ d0[0], d0[1], d0[2], d0[3], d0[4], d0[5], d0[6], d0[7] ] + // stride=1 [ d1[0], d1[1], d1[2], d1[3], d1[4], d1[5], d1[6], d1[7] ] + // stride=2 [ col0[7], d0[0], d0[1], d0[2], d0[3], d0[4], d0[5], d0[6] ] + // stride=3 [ col0[6], d1[0], d1[1], d1[2], d1[3], d1[4], d1[5], d1[6] ] + // stride=4 [ col0[5], col0[7], d0[0], d0[1], d0[2], d0[3], d0[4], d0[5] ] + // stride=5 [ col0[4], col0[6], d1[0], d1[1], d1[2], d1[3], d1[4], d1[5] ] + // stride=6 [ col0[3], col0[5], col0[7], d0[0], d0[1], d0[2], d0[3], d0[4] ] + // stride=7 [ col0[2], col0[4], col0[6], d1[0], d1[1], d1[2], d1[3], d1[4] ] + vst1_u8(dst + 0 * stride, d0); + vst1_u8(dst + 1 * stride, d1); + vst1_u8(dst + 2 * stride, vext_u8(col0_even, d0, 7)); + vst1_u8(dst + 3 * stride, vext_u8(col0_odd, d1, 7)); + vst1_u8(dst + 4 * stride, vext_u8(col0_even, d0, 6)); + vst1_u8(dst + 5 * stride, vext_u8(col0_odd, d1, 6)); + vst1_u8(dst + 6 * stride, vext_u8(col0_even, d0, 5)); + vst1_u8(dst + 7 * stride, vext_u8(col0_odd, d1, 5)); +} + +void vpx_d117_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + // See vpx_d117_predictor_8x8_neon for more details on the implementation. + uint8x16_t az, a0, l0az, d0, d1, l0, l1, azl0, col0, col0_even, col0_odd; + + az = vld1q_u8(above - 1); + a0 = vld1q_u8(above + 0); + // [ left[0], above[-1], ... , above[13] ] + l0az = vextq_u8(vld1q_dup_u8(left), az, 15); + + l0 = vld1q_u8(left + 0); + l1 = vld1q_u8(left + 1); + // [ above[-1], left[0], ... , left[14] ] + azl0 = vextq_u8(vld1q_dup_u8(above - 1), l0, 15); + + d0 = vrhaddq_u8(az, a0); + d1 = vrhaddq_u8(vhaddq_u8(l0az, a0), az); + + col0 = vrhaddq_u8(vhaddq_u8(azl0, l1), l0); + col0 = vrev64q_u8(vextq_u8(col0, col0, 8)); + + col0_even = vuzpq_u8(col0, col0).val[1]; + col0_odd = vuzpq_u8(col0, col0).val[0]; + + vst1q_u8(dst + 0 * stride, d0); + vst1q_u8(dst + 1 * stride, d1); + vst1q_u8(dst + 2 * stride, vextq_u8(col0_even, d0, 15)); + vst1q_u8(dst + 3 * stride, vextq_u8(col0_odd, d1, 15)); + vst1q_u8(dst + 4 * stride, vextq_u8(col0_even, d0, 14)); + vst1q_u8(dst + 5 * stride, vextq_u8(col0_odd, d1, 14)); + vst1q_u8(dst + 6 * stride, vextq_u8(col0_even, d0, 13)); + vst1q_u8(dst + 7 * stride, vextq_u8(col0_odd, d1, 13)); + vst1q_u8(dst + 8 * stride, vextq_u8(col0_even, d0, 12)); + vst1q_u8(dst + 9 * stride, vextq_u8(col0_odd, d1, 12)); + vst1q_u8(dst + 10 * stride, vextq_u8(col0_even, d0, 11)); + vst1q_u8(dst + 11 * stride, vextq_u8(col0_odd, d1, 11)); + vst1q_u8(dst + 12 * stride, vextq_u8(col0_even, d0, 10)); + vst1q_u8(dst + 13 * stride, vextq_u8(col0_odd, d1, 10)); + vst1q_u8(dst + 14 * stride, vextq_u8(col0_even, d0, 9)); + vst1q_u8(dst + 15 * stride, vextq_u8(col0_odd, d1, 9)); +} + +void vpx_d117_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + // See vpx_d117_predictor_8x8_neon for more details on the implementation. + uint8x16_t az, a0, a14, a15, a16, l0az, d0_lo, d0_hi, d1_lo, d1_hi, l0, l1, + l15, l16, l17, azl0, col0_lo, col0_hi, col0_even, col0_odd; + + az = vld1q_u8(above - 1); + a0 = vld1q_u8(above + 0); + a14 = vld1q_u8(above + 14); + a15 = vld1q_u8(above + 15); + a16 = vld1q_u8(above + 16); + // [ left[0], above[-1], ... , above[13] ] + l0az = vextq_u8(vld1q_dup_u8(left), az, 15); + + l0 = vld1q_u8(left + 0); + l1 = vld1q_u8(left + 1); + l15 = vld1q_u8(left + 15); + l16 = vld1q_u8(left + 16); + l17 = vld1q_u8(left + 17); + // [ above[-1], left[0], ... , left[14] ] + azl0 = vextq_u8(vld1q_dup_u8(above - 1), l0, 15); + + d0_lo = vrhaddq_u8(az, a0); + d0_hi = vrhaddq_u8(a15, a16); + d1_lo = vrhaddq_u8(vhaddq_u8(l0az, a0), az); + d1_hi = vrhaddq_u8(vhaddq_u8(a14, a16), a15); + + col0_lo = vrhaddq_u8(vhaddq_u8(azl0, l1), l0); + col0_hi = vrhaddq_u8(vhaddq_u8(l15, l17), l16); + + col0_lo = vrev64q_u8(vextq_u8(col0_lo, col0_lo, 8)); + col0_hi = vrev64q_u8(vextq_u8(col0_hi, col0_hi, 8)); + + col0_even = vuzpq_u8(col0_hi, col0_lo).val[1]; + col0_odd = vuzpq_u8(col0_hi, col0_lo).val[0]; + + vst1q_u8(dst + 0 * stride + 0, d0_lo); + vst1q_u8(dst + 0 * stride + 16, d0_hi); + vst1q_u8(dst + 1 * stride + 0, d1_lo); + vst1q_u8(dst + 1 * stride + 16, d1_hi); + vst1q_u8(dst + 2 * stride + 0, vextq_u8(col0_even, d0_lo, 15)); + vst1q_u8(dst + 2 * stride + 16, vextq_u8(d0_lo, d0_hi, 15)); + vst1q_u8(dst + 3 * stride + 0, vextq_u8(col0_odd, d1_lo, 15)); + vst1q_u8(dst + 3 * stride + 16, vextq_u8(d1_lo, d1_hi, 15)); + vst1q_u8(dst + 4 * stride + 0, vextq_u8(col0_even, d0_lo, 14)); + vst1q_u8(dst + 4 * stride + 16, vextq_u8(d0_lo, d0_hi, 14)); + vst1q_u8(dst + 5 * stride + 0, vextq_u8(col0_odd, d1_lo, 14)); + vst1q_u8(dst + 5 * stride + 16, vextq_u8(d1_lo, d1_hi, 14)); + vst1q_u8(dst + 6 * stride + 0, vextq_u8(col0_even, d0_lo, 13)); + vst1q_u8(dst + 6 * stride + 16, vextq_u8(d0_lo, d0_hi, 13)); + vst1q_u8(dst + 7 * stride + 0, vextq_u8(col0_odd, d1_lo, 13)); + vst1q_u8(dst + 7 * stride + 16, vextq_u8(d1_lo, d1_hi, 13)); + vst1q_u8(dst + 8 * stride + 0, vextq_u8(col0_even, d0_lo, 12)); + vst1q_u8(dst + 8 * stride + 16, vextq_u8(d0_lo, d0_hi, 12)); + vst1q_u8(dst + 9 * stride + 0, vextq_u8(col0_odd, d1_lo, 12)); + vst1q_u8(dst + 9 * stride + 16, vextq_u8(d1_lo, d1_hi, 12)); + vst1q_u8(dst + 10 * stride + 0, vextq_u8(col0_even, d0_lo, 11)); + vst1q_u8(dst + 10 * stride + 16, vextq_u8(d0_lo, d0_hi, 11)); + vst1q_u8(dst + 11 * stride + 0, vextq_u8(col0_odd, d1_lo, 11)); + vst1q_u8(dst + 11 * stride + 16, vextq_u8(d1_lo, d1_hi, 11)); + vst1q_u8(dst + 12 * stride + 0, vextq_u8(col0_even, d0_lo, 10)); + vst1q_u8(dst + 12 * stride + 16, vextq_u8(d0_lo, d0_hi, 10)); + vst1q_u8(dst + 13 * stride + 0, vextq_u8(col0_odd, d1_lo, 10)); + vst1q_u8(dst + 13 * stride + 16, vextq_u8(d1_lo, d1_hi, 10)); + vst1q_u8(dst + 14 * stride + 0, vextq_u8(col0_even, d0_lo, 9)); + vst1q_u8(dst + 14 * stride + 16, vextq_u8(d0_lo, d0_hi, 9)); + vst1q_u8(dst + 15 * stride + 0, vextq_u8(col0_odd, d1_lo, 9)); + vst1q_u8(dst + 15 * stride + 16, vextq_u8(d1_lo, d1_hi, 9)); + vst1q_u8(dst + 16 * stride + 0, vextq_u8(col0_even, d0_lo, 8)); + vst1q_u8(dst + 16 * stride + 16, vextq_u8(d0_lo, d0_hi, 8)); + vst1q_u8(dst + 17 * stride + 0, vextq_u8(col0_odd, d1_lo, 8)); + vst1q_u8(dst + 17 * stride + 16, vextq_u8(d1_lo, d1_hi, 8)); + vst1q_u8(dst + 18 * stride + 0, vextq_u8(col0_even, d0_lo, 7)); + vst1q_u8(dst + 18 * stride + 16, vextq_u8(d0_lo, d0_hi, 7)); + vst1q_u8(dst + 19 * stride + 0, vextq_u8(col0_odd, d1_lo, 7)); + vst1q_u8(dst + 19 * stride + 16, vextq_u8(d1_lo, d1_hi, 7)); + vst1q_u8(dst + 20 * stride + 0, vextq_u8(col0_even, d0_lo, 6)); + vst1q_u8(dst + 20 * stride + 16, vextq_u8(d0_lo, d0_hi, 6)); + vst1q_u8(dst + 21 * stride + 0, vextq_u8(col0_odd, d1_lo, 6)); + vst1q_u8(dst + 21 * stride + 16, vextq_u8(d1_lo, d1_hi, 6)); + vst1q_u8(dst + 22 * stride + 0, vextq_u8(col0_even, d0_lo, 5)); + vst1q_u8(dst + 22 * stride + 16, vextq_u8(d0_lo, d0_hi, 5)); + vst1q_u8(dst + 23 * stride + 0, vextq_u8(col0_odd, d1_lo, 5)); + vst1q_u8(dst + 23 * stride + 16, vextq_u8(d1_lo, d1_hi, 5)); + vst1q_u8(dst + 24 * stride + 0, vextq_u8(col0_even, d0_lo, 4)); + vst1q_u8(dst + 24 * stride + 16, vextq_u8(d0_lo, d0_hi, 4)); + vst1q_u8(dst + 25 * stride + 0, vextq_u8(col0_odd, d1_lo, 4)); + vst1q_u8(dst + 25 * stride + 16, vextq_u8(d1_lo, d1_hi, 4)); + vst1q_u8(dst + 26 * stride + 0, vextq_u8(col0_even, d0_lo, 3)); + vst1q_u8(dst + 26 * stride + 16, vextq_u8(d0_lo, d0_hi, 3)); + vst1q_u8(dst + 27 * stride + 0, vextq_u8(col0_odd, d1_lo, 3)); + vst1q_u8(dst + 27 * stride + 16, vextq_u8(d1_lo, d1_hi, 3)); + vst1q_u8(dst + 28 * stride + 0, vextq_u8(col0_even, d0_lo, 2)); + vst1q_u8(dst + 28 * stride + 16, vextq_u8(d0_lo, d0_hi, 2)); + vst1q_u8(dst + 29 * stride + 0, vextq_u8(col0_odd, d1_lo, 2)); + vst1q_u8(dst + 29 * stride + 16, vextq_u8(d1_lo, d1_hi, 2)); + vst1q_u8(dst + 30 * stride + 0, vextq_u8(col0_even, d0_lo, 1)); + vst1q_u8(dst + 30 * stride + 16, vextq_u8(d0_lo, d0_hi, 1)); + vst1q_u8(dst + 31 * stride + 0, vextq_u8(col0_odd, d1_lo, 1)); + vst1q_u8(dst + 31 * stride + 16, vextq_u8(d1_lo, d1_hi, 1)); +} + +// ----------------------------------------------------------------------------- + void vpx_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const uint8x8_t XA0123 = vld1_u8(above - 1); diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 9ee9fc1c4f..980380325a 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -57,6 +57,7 @@ () add_proto qw/void vpx_he_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; add_proto qw/void vpx_d117_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vpx_d117_predictor_4x4 neon/; add_proto qw/void vpx_d135_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d135_predictor_4x4 neon/; @@ -101,6 +102,7 @@ () specialize qw/vpx_h_predictor_8x8 neon dspr2 msa sse2/; add_proto qw/void vpx_d117_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vpx_d117_predictor_8x8 neon/; add_proto qw/void vpx_d135_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d135_predictor_8x8 neon/; @@ -141,6 +143,7 @@ () specialize qw/vpx_h_predictor_16x16 neon dspr2 msa sse2 vsx/; add_proto qw/void vpx_d117_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vpx_d117_predictor_16x16 neon/; add_proto qw/void vpx_d135_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d135_predictor_16x16 neon/; @@ -179,6 +182,7 @@ () specialize qw/vpx_h_predictor_32x32 neon msa sse2 vsx/; add_proto qw/void vpx_d117_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vpx_d117_predictor_32x32 neon/; add_proto qw/void vpx_d135_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d135_predictor_32x32 neon/; From 7cdf139e3d6237386e0f93bdb0bdc1b459c663bf Mon Sep 17 00:00:00 2001 From: George Steed Date: Mon, 20 Feb 2023 11:41:40 +0000 Subject: [PATCH 0633/1000] Implement highbd_d63_predictor using Neon Add Neon implementations of the highbd d63 predictor for 4x4, 8x8, 16x16 and 32x32 block sizes. Also update tests to add new corresponding cases. Speedups over the C code (higher is better): Microarch. | Compiler | Block | Speedup Neoverse N1 | LLVM 15 | 4x4 | 2.43 Neoverse N1 | LLVM 15 | 8x8 | 4.03 Neoverse N1 | LLVM 15 | 16x16 | 3.07 Neoverse N1 | LLVM 15 | 32x32 | 4.11 Neoverse N1 | GCC 12 | 4x4 | 2.92 Neoverse N1 | GCC 12 | 8x8 | 7.20 Neoverse N1 | GCC 12 | 16x16 | 4.43 Neoverse N1 | GCC 12 | 32x32 | 3.18 Neoverse V1 | LLVM 15 | 4x4 | 1.99 Neoverse V1 | LLVM 15 | 8x8 | 3.66 Neoverse V1 | LLVM 15 | 16x16 | 3.60 Neoverse V1 | LLVM 15 | 32x32 | 3.29 Neoverse V1 | GCC 12 | 4x4 | 2.39 Neoverse V1 | GCC 12 | 8x8 | 4.76 Neoverse V1 | GCC 12 | 16x16 | 3.29 Neoverse V1 | GCC 12 | 32x32 | 2.43 Change-Id: Ic59df16ceeb468003754b4374be2f4d9af6589e4 --- test/test_intra_pred_speed.cc | 44 ++--- test/vp9_intrapred_test.cc | 24 +++ vpx_dsp/arm/highbd_intrapred_neon.c | 278 ++++++++++++++++++++++++++++ vpx_dsp/vpx_dsp_rtcd_defs.pl | 8 +- 4 files changed, 326 insertions(+), 28 deletions(-) diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc index 5861a17770..19dabf88a7 100644 --- a/test/test_intra_pred_speed.cc +++ b/test/test_intra_pred_speed.cc @@ -565,35 +565,31 @@ HIGHBD_INTRA_PRED_TEST( vpx_highbd_dc_left_predictor_4x4_neon, vpx_highbd_dc_top_predictor_4x4_neon, vpx_highbd_dc_128_predictor_4x4_neon, vpx_highbd_v_predictor_4x4_neon, vpx_highbd_h_predictor_4x4_neon, vpx_highbd_d45_predictor_4x4_neon, - vpx_highbd_d135_predictor_4x4_neon, nullptr, nullptr, nullptr, nullptr, - vpx_highbd_tm_predictor_4x4_neon) + vpx_highbd_d135_predictor_4x4_neon, nullptr, nullptr, nullptr, + vpx_highbd_d63_predictor_4x4_neon, vpx_highbd_tm_predictor_4x4_neon) HIGHBD_INTRA_PRED_TEST( NEON, TestHighbdIntraPred8, vpx_highbd_dc_predictor_8x8_neon, vpx_highbd_dc_left_predictor_8x8_neon, vpx_highbd_dc_top_predictor_8x8_neon, vpx_highbd_dc_128_predictor_8x8_neon, vpx_highbd_v_predictor_8x8_neon, vpx_highbd_h_predictor_8x8_neon, vpx_highbd_d45_predictor_8x8_neon, - vpx_highbd_d135_predictor_8x8_neon, nullptr, nullptr, nullptr, nullptr, - vpx_highbd_tm_predictor_8x8_neon) -HIGHBD_INTRA_PRED_TEST(NEON, TestHighbdIntraPred16, - vpx_highbd_dc_predictor_16x16_neon, - vpx_highbd_dc_left_predictor_16x16_neon, - vpx_highbd_dc_top_predictor_16x16_neon, - vpx_highbd_dc_128_predictor_16x16_neon, - vpx_highbd_v_predictor_16x16_neon, - vpx_highbd_h_predictor_16x16_neon, - vpx_highbd_d45_predictor_16x16_neon, - vpx_highbd_d135_predictor_16x16_neon, nullptr, nullptr, - nullptr, nullptr, vpx_highbd_tm_predictor_16x16_neon) -HIGHBD_INTRA_PRED_TEST(NEON, TestHighbdIntraPred32, - vpx_highbd_dc_predictor_32x32_neon, - vpx_highbd_dc_left_predictor_32x32_neon, - vpx_highbd_dc_top_predictor_32x32_neon, - vpx_highbd_dc_128_predictor_32x32_neon, - vpx_highbd_v_predictor_32x32_neon, - vpx_highbd_h_predictor_32x32_neon, - vpx_highbd_d45_predictor_32x32_neon, - vpx_highbd_d135_predictor_32x32_neon, nullptr, nullptr, - nullptr, nullptr, vpx_highbd_tm_predictor_32x32_neon) + vpx_highbd_d135_predictor_8x8_neon, nullptr, nullptr, nullptr, + vpx_highbd_d63_predictor_8x8_neon, vpx_highbd_tm_predictor_8x8_neon) +HIGHBD_INTRA_PRED_TEST( + NEON, TestHighbdIntraPred16, vpx_highbd_dc_predictor_16x16_neon, + vpx_highbd_dc_left_predictor_16x16_neon, + vpx_highbd_dc_top_predictor_16x16_neon, + vpx_highbd_dc_128_predictor_16x16_neon, vpx_highbd_v_predictor_16x16_neon, + vpx_highbd_h_predictor_16x16_neon, vpx_highbd_d45_predictor_16x16_neon, + vpx_highbd_d135_predictor_16x16_neon, nullptr, nullptr, nullptr, + vpx_highbd_d63_predictor_16x16_neon, vpx_highbd_tm_predictor_16x16_neon) +HIGHBD_INTRA_PRED_TEST( + NEON, TestHighbdIntraPred32, vpx_highbd_dc_predictor_32x32_neon, + vpx_highbd_dc_left_predictor_32x32_neon, + vpx_highbd_dc_top_predictor_32x32_neon, + vpx_highbd_dc_128_predictor_32x32_neon, vpx_highbd_v_predictor_32x32_neon, + vpx_highbd_h_predictor_32x32_neon, vpx_highbd_d45_predictor_32x32_neon, + vpx_highbd_d135_predictor_32x32_neon, nullptr, nullptr, nullptr, + vpx_highbd_d63_predictor_32x32_neon, vpx_highbd_tm_predictor_32x32_neon) #endif // HAVE_NEON #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/test/vp9_intrapred_test.cc b/test/vp9_intrapred_test.cc index d04be429d1..139358c307 100644 --- a/test/vp9_intrapred_test.cc +++ b/test/vp9_intrapred_test.cc @@ -848,6 +848,14 @@ INSTANTIATE_TEST_SUITE_P( &vpx_highbd_d45_predictor_16x16_c, 16, 8), HighbdIntraPredParam(&vpx_highbd_d45_predictor_32x32_neon, &vpx_highbd_d45_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_4x4_neon, + &vpx_highbd_d63_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_8x8_neon, + &vpx_highbd_d63_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_16x16_neon, + &vpx_highbd_d63_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_32x32_neon, + &vpx_highbd_d63_predictor_32x32_c, 32, 8), HighbdIntraPredParam(&vpx_highbd_d135_predictor_4x4_neon, &vpx_highbd_d135_predictor_4x4_c, 4, 8), HighbdIntraPredParam(&vpx_highbd_d135_predictor_8x8_neon, @@ -924,6 +932,14 @@ INSTANTIATE_TEST_SUITE_P( &vpx_highbd_d45_predictor_16x16_c, 16, 10), HighbdIntraPredParam(&vpx_highbd_d45_predictor_32x32_neon, &vpx_highbd_d45_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_4x4_neon, + &vpx_highbd_d63_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_8x8_neon, + &vpx_highbd_d63_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_16x16_neon, + &vpx_highbd_d63_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_32x32_neon, + &vpx_highbd_d63_predictor_32x32_c, 32, 10), HighbdIntraPredParam(&vpx_highbd_d135_predictor_4x4_neon, &vpx_highbd_d135_predictor_4x4_c, 4, 10), HighbdIntraPredParam(&vpx_highbd_d135_predictor_8x8_neon, @@ -1000,6 +1016,14 @@ INSTANTIATE_TEST_SUITE_P( &vpx_highbd_d45_predictor_16x16_c, 16, 12), HighbdIntraPredParam(&vpx_highbd_d45_predictor_32x32_neon, &vpx_highbd_d45_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_4x4_neon, + &vpx_highbd_d63_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_8x8_neon, + &vpx_highbd_d63_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_16x16_neon, + &vpx_highbd_d63_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_32x32_neon, + &vpx_highbd_d63_predictor_32x32_c, 32, 12), HighbdIntraPredParam(&vpx_highbd_d135_predictor_4x4_neon, &vpx_highbd_d135_predictor_4x4_c, 4, 12), HighbdIntraPredParam(&vpx_highbd_d135_predictor_8x8_neon, diff --git a/vpx_dsp/arm/highbd_intrapred_neon.c b/vpx_dsp/arm/highbd_intrapred_neon.c index 6f7e5da762..18dca81100 100644 --- a/vpx_dsp/arm/highbd_intrapred_neon.c +++ b/vpx_dsp/arm/highbd_intrapred_neon.c @@ -453,6 +453,284 @@ void vpx_highbd_d45_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, // ----------------------------------------------------------------------------- +void vpx_highbd_d63_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + uint16x4_t a0, a1, a2, a3, d0, d1; + (void)left; + (void)bd; + + a0 = vld1_u16(above + 0); + a1 = vld1_u16(above + 1); + a2 = vld1_u16(above + 2); + a3 = vld1_dup_u16(above + 3); + + d0 = vrhadd_u16(a0, a1); + d1 = vrhadd_u16(vhadd_u16(a0, a2), a1); + + vst1_u16(dst + 0 * stride, d0); + vst1_u16(dst + 1 * stride, d1); + vst1_u16(dst + 2 * stride, vext_u16(d0, a3, 1)); + vst1_u16(dst + 3 * stride, vext_u16(d1, a3, 1)); +} + +void vpx_highbd_d63_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + uint16x8_t a0, a1, a2, a7, d0, d1; + (void)left; + (void)bd; + + a0 = vld1q_u16(above + 0); + a1 = vld1q_u16(above + 1); + a2 = vld1q_u16(above + 2); + a7 = vld1q_dup_u16(above + 7); + + d0 = vrhaddq_u16(a0, a1); + d1 = vrhaddq_u16(vhaddq_u16(a0, a2), a1); + + vst1q_u16(dst + 0 * stride, d0); + vst1q_u16(dst + 1 * stride, d1); + vst1q_u16(dst + 2 * stride, vextq_u16(d0, a7, 1)); + vst1q_u16(dst + 3 * stride, vextq_u16(d1, a7, 1)); + vst1q_u16(dst + 4 * stride, vextq_u16(d0, a7, 2)); + vst1q_u16(dst + 5 * stride, vextq_u16(d1, a7, 2)); + vst1q_u16(dst + 6 * stride, vextq_u16(d0, a7, 3)); + vst1q_u16(dst + 7 * stride, vextq_u16(d1, a7, 3)); +} + +void vpx_highbd_d63_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + uint16x8_t a0, a1, a2, a8, a9, a10, a15, d0_lo, d0_hi, d1_lo, d1_hi; + (void)left; + (void)bd; + + a0 = vld1q_u16(above + 0); + a1 = vld1q_u16(above + 1); + a2 = vld1q_u16(above + 2); + a8 = vld1q_u16(above + 8); + a9 = vld1q_u16(above + 9); + a10 = vld1q_u16(above + 10); + a15 = vld1q_dup_u16(above + 15); + + d0_lo = vrhaddq_u16(a0, a1); + d0_hi = vrhaddq_u16(a8, a9); + d1_lo = vrhaddq_u16(vhaddq_u16(a0, a2), a1); + d1_hi = vrhaddq_u16(vhaddq_u16(a8, a10), a9); + + vst1q_u16(dst + 0 * stride + 0, d0_lo); + vst1q_u16(dst + 0 * stride + 8, d0_hi); + vst1q_u16(dst + 1 * stride + 0, d1_lo); + vst1q_u16(dst + 1 * stride + 8, d1_hi); + vst1q_u16(dst + 2 * stride + 0, vextq_u16(d0_lo, d0_hi, 1)); + vst1q_u16(dst + 2 * stride + 8, vextq_u16(d0_hi, a15, 1)); + vst1q_u16(dst + 3 * stride + 0, vextq_u16(d1_lo, d1_hi, 1)); + vst1q_u16(dst + 3 * stride + 8, vextq_u16(d1_hi, a15, 1)); + vst1q_u16(dst + 4 * stride + 0, vextq_u16(d0_lo, d0_hi, 2)); + vst1q_u16(dst + 4 * stride + 8, vextq_u16(d0_hi, a15, 2)); + vst1q_u16(dst + 5 * stride + 0, vextq_u16(d1_lo, d1_hi, 2)); + vst1q_u16(dst + 5 * stride + 8, vextq_u16(d1_hi, a15, 2)); + vst1q_u16(dst + 6 * stride + 0, vextq_u16(d0_lo, d0_hi, 3)); + vst1q_u16(dst + 6 * stride + 8, vextq_u16(d0_hi, a15, 3)); + vst1q_u16(dst + 7 * stride + 0, vextq_u16(d1_lo, d1_hi, 3)); + vst1q_u16(dst + 7 * stride + 8, vextq_u16(d1_hi, a15, 3)); + vst1q_u16(dst + 8 * stride + 0, vextq_u16(d0_lo, d0_hi, 4)); + vst1q_u16(dst + 8 * stride + 8, vextq_u16(d0_hi, a15, 4)); + vst1q_u16(dst + 9 * stride + 0, vextq_u16(d1_lo, d1_hi, 4)); + vst1q_u16(dst + 9 * stride + 8, vextq_u16(d1_hi, a15, 4)); + vst1q_u16(dst + 10 * stride + 0, vextq_u16(d0_lo, d0_hi, 5)); + vst1q_u16(dst + 10 * stride + 8, vextq_u16(d0_hi, a15, 5)); + vst1q_u16(dst + 11 * stride + 0, vextq_u16(d1_lo, d1_hi, 5)); + vst1q_u16(dst + 11 * stride + 8, vextq_u16(d1_hi, a15, 5)); + vst1q_u16(dst + 12 * stride + 0, vextq_u16(d0_lo, d0_hi, 6)); + vst1q_u16(dst + 12 * stride + 8, vextq_u16(d0_hi, a15, 6)); + vst1q_u16(dst + 13 * stride + 0, vextq_u16(d1_lo, d1_hi, 6)); + vst1q_u16(dst + 13 * stride + 8, vextq_u16(d1_hi, a15, 6)); + vst1q_u16(dst + 14 * stride + 0, vextq_u16(d0_lo, d0_hi, 7)); + vst1q_u16(dst + 14 * stride + 8, vextq_u16(d0_hi, a15, 7)); + vst1q_u16(dst + 15 * stride + 0, vextq_u16(d1_lo, d1_hi, 7)); + vst1q_u16(dst + 15 * stride + 8, vextq_u16(d1_hi, a15, 7)); +} + +void vpx_highbd_d63_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + uint16x8_t a0, a1, a2, a8, a9, a10, a16, a17, a18, a24, a25, a26, a31, d0[4], + d1[4]; + (void)left; + (void)bd; + + a0 = vld1q_u16(above + 0); + a1 = vld1q_u16(above + 1); + a2 = vld1q_u16(above + 2); + a8 = vld1q_u16(above + 8); + a9 = vld1q_u16(above + 9); + a10 = vld1q_u16(above + 10); + a16 = vld1q_u16(above + 16); + a17 = vld1q_u16(above + 17); + a18 = vld1q_u16(above + 18); + a24 = vld1q_u16(above + 24); + a25 = vld1q_u16(above + 25); + a26 = vld1q_u16(above + 26); + a31 = vld1q_dup_u16(above + 31); + + d0[0] = vrhaddq_u16(a0, a1); + d0[1] = vrhaddq_u16(a8, a9); + d0[2] = vrhaddq_u16(a16, a17); + d0[3] = vrhaddq_u16(a24, a25); + d1[0] = vrhaddq_u16(vhaddq_u16(a0, a2), a1); + d1[1] = vrhaddq_u16(vhaddq_u16(a8, a10), a9); + d1[2] = vrhaddq_u16(vhaddq_u16(a16, a18), a17); + d1[3] = vrhaddq_u16(vhaddq_u16(a24, a26), a25); + + vst1q_u16(dst + 0 * stride + 0, d0[0]); + vst1q_u16(dst + 0 * stride + 8, d0[1]); + vst1q_u16(dst + 0 * stride + 16, d0[2]); + vst1q_u16(dst + 0 * stride + 24, d0[3]); + vst1q_u16(dst + 1 * stride + 0, d1[0]); + vst1q_u16(dst + 1 * stride + 8, d1[1]); + vst1q_u16(dst + 1 * stride + 16, d1[2]); + vst1q_u16(dst + 1 * stride + 24, d1[3]); + + vst1q_u16(dst + 2 * stride + 0, vextq_u16(d0[0], d0[1], 1)); + vst1q_u16(dst + 2 * stride + 8, vextq_u16(d0[1], d0[2], 1)); + vst1q_u16(dst + 2 * stride + 16, vextq_u16(d0[2], d0[3], 1)); + vst1q_u16(dst + 2 * stride + 24, vextq_u16(d0[3], a31, 1)); + vst1q_u16(dst + 3 * stride + 0, vextq_u16(d1[0], d1[1], 1)); + vst1q_u16(dst + 3 * stride + 8, vextq_u16(d1[1], d1[2], 1)); + vst1q_u16(dst + 3 * stride + 16, vextq_u16(d1[2], d1[3], 1)); + vst1q_u16(dst + 3 * stride + 24, vextq_u16(d1[3], a31, 1)); + + vst1q_u16(dst + 4 * stride + 0, vextq_u16(d0[0], d0[1], 2)); + vst1q_u16(dst + 4 * stride + 8, vextq_u16(d0[1], d0[2], 2)); + vst1q_u16(dst + 4 * stride + 16, vextq_u16(d0[2], d0[3], 2)); + vst1q_u16(dst + 4 * stride + 24, vextq_u16(d0[3], a31, 2)); + vst1q_u16(dst + 5 * stride + 0, vextq_u16(d1[0], d1[1], 2)); + vst1q_u16(dst + 5 * stride + 8, vextq_u16(d1[1], d1[2], 2)); + vst1q_u16(dst + 5 * stride + 16, vextq_u16(d1[2], d1[3], 2)); + vst1q_u16(dst + 5 * stride + 24, vextq_u16(d1[3], a31, 2)); + + vst1q_u16(dst + 6 * stride + 0, vextq_u16(d0[0], d0[1], 3)); + vst1q_u16(dst + 6 * stride + 8, vextq_u16(d0[1], d0[2], 3)); + vst1q_u16(dst + 6 * stride + 16, vextq_u16(d0[2], d0[3], 3)); + vst1q_u16(dst + 6 * stride + 24, vextq_u16(d0[3], a31, 3)); + vst1q_u16(dst + 7 * stride + 0, vextq_u16(d1[0], d1[1], 3)); + vst1q_u16(dst + 7 * stride + 8, vextq_u16(d1[1], d1[2], 3)); + vst1q_u16(dst + 7 * stride + 16, vextq_u16(d1[2], d1[3], 3)); + vst1q_u16(dst + 7 * stride + 24, vextq_u16(d1[3], a31, 3)); + + vst1q_u16(dst + 8 * stride + 0, vextq_u16(d0[0], d0[1], 4)); + vst1q_u16(dst + 8 * stride + 8, vextq_u16(d0[1], d0[2], 4)); + vst1q_u16(dst + 8 * stride + 16, vextq_u16(d0[2], d0[3], 4)); + vst1q_u16(dst + 8 * stride + 24, vextq_u16(d0[3], a31, 4)); + vst1q_u16(dst + 9 * stride + 0, vextq_u16(d1[0], d1[1], 4)); + vst1q_u16(dst + 9 * stride + 8, vextq_u16(d1[1], d1[2], 4)); + vst1q_u16(dst + 9 * stride + 16, vextq_u16(d1[2], d1[3], 4)); + vst1q_u16(dst + 9 * stride + 24, vextq_u16(d1[3], a31, 4)); + + vst1q_u16(dst + 10 * stride + 0, vextq_u16(d0[0], d0[1], 5)); + vst1q_u16(dst + 10 * stride + 8, vextq_u16(d0[1], d0[2], 5)); + vst1q_u16(dst + 10 * stride + 16, vextq_u16(d0[2], d0[3], 5)); + vst1q_u16(dst + 10 * stride + 24, vextq_u16(d0[3], a31, 5)); + vst1q_u16(dst + 11 * stride + 0, vextq_u16(d1[0], d1[1], 5)); + vst1q_u16(dst + 11 * stride + 8, vextq_u16(d1[1], d1[2], 5)); + vst1q_u16(dst + 11 * stride + 16, vextq_u16(d1[2], d1[3], 5)); + vst1q_u16(dst + 11 * stride + 24, vextq_u16(d1[3], a31, 5)); + + vst1q_u16(dst + 12 * stride + 0, vextq_u16(d0[0], d0[1], 6)); + vst1q_u16(dst + 12 * stride + 8, vextq_u16(d0[1], d0[2], 6)); + vst1q_u16(dst + 12 * stride + 16, vextq_u16(d0[2], d0[3], 6)); + vst1q_u16(dst + 12 * stride + 24, vextq_u16(d0[3], a31, 6)); + vst1q_u16(dst + 13 * stride + 0, vextq_u16(d1[0], d1[1], 6)); + vst1q_u16(dst + 13 * stride + 8, vextq_u16(d1[1], d1[2], 6)); + vst1q_u16(dst + 13 * stride + 16, vextq_u16(d1[2], d1[3], 6)); + vst1q_u16(dst + 13 * stride + 24, vextq_u16(d1[3], a31, 6)); + + vst1q_u16(dst + 14 * stride + 0, vextq_u16(d0[0], d0[1], 7)); + vst1q_u16(dst + 14 * stride + 8, vextq_u16(d0[1], d0[2], 7)); + vst1q_u16(dst + 14 * stride + 16, vextq_u16(d0[2], d0[3], 7)); + vst1q_u16(dst + 14 * stride + 24, vextq_u16(d0[3], a31, 7)); + vst1q_u16(dst + 15 * stride + 0, vextq_u16(d1[0], d1[1], 7)); + vst1q_u16(dst + 15 * stride + 8, vextq_u16(d1[1], d1[2], 7)); + vst1q_u16(dst + 15 * stride + 16, vextq_u16(d1[2], d1[3], 7)); + vst1q_u16(dst + 15 * stride + 24, vextq_u16(d1[3], a31, 7)); + + vst1q_u16(dst + 16 * stride + 0, d0[1]); + vst1q_u16(dst + 16 * stride + 8, d0[2]); + vst1q_u16(dst + 16 * stride + 16, d0[3]); + vst1q_u16(dst + 16 * stride + 24, a31); + vst1q_u16(dst + 17 * stride + 0, d1[1]); + vst1q_u16(dst + 17 * stride + 8, d1[2]); + vst1q_u16(dst + 17 * stride + 16, d1[3]); + vst1q_u16(dst + 17 * stride + 24, a31); + + vst1q_u16(dst + 18 * stride + 0, vextq_u16(d0[1], d0[2], 1)); + vst1q_u16(dst + 18 * stride + 8, vextq_u16(d0[2], d0[3], 1)); + vst1q_u16(dst + 18 * stride + 16, vextq_u16(d0[3], a31, 1)); + vst1q_u16(dst + 18 * stride + 24, a31); + vst1q_u16(dst + 19 * stride + 0, vextq_u16(d1[1], d1[2], 1)); + vst1q_u16(dst + 19 * stride + 8, vextq_u16(d1[2], d1[3], 1)); + vst1q_u16(dst + 19 * stride + 16, vextq_u16(d1[3], a31, 1)); + vst1q_u16(dst + 19 * stride + 24, a31); + + vst1q_u16(dst + 20 * stride + 0, vextq_u16(d0[1], d0[2], 2)); + vst1q_u16(dst + 20 * stride + 8, vextq_u16(d0[2], d0[3], 2)); + vst1q_u16(dst + 20 * stride + 16, vextq_u16(d0[3], a31, 2)); + vst1q_u16(dst + 20 * stride + 24, a31); + vst1q_u16(dst + 21 * stride + 0, vextq_u16(d1[1], d1[2], 2)); + vst1q_u16(dst + 21 * stride + 8, vextq_u16(d1[2], d1[3], 2)); + vst1q_u16(dst + 21 * stride + 16, vextq_u16(d1[3], a31, 2)); + vst1q_u16(dst + 21 * stride + 24, a31); + + vst1q_u16(dst + 22 * stride + 0, vextq_u16(d0[1], d0[2], 3)); + vst1q_u16(dst + 22 * stride + 8, vextq_u16(d0[2], d0[3], 3)); + vst1q_u16(dst + 22 * stride + 16, vextq_u16(d0[3], a31, 3)); + vst1q_u16(dst + 22 * stride + 24, a31); + vst1q_u16(dst + 23 * stride + 0, vextq_u16(d1[1], d1[2], 3)); + vst1q_u16(dst + 23 * stride + 8, vextq_u16(d1[2], d1[3], 3)); + vst1q_u16(dst + 23 * stride + 16, vextq_u16(d1[3], a31, 3)); + vst1q_u16(dst + 23 * stride + 24, a31); + + vst1q_u16(dst + 24 * stride + 0, vextq_u16(d0[1], d0[2], 4)); + vst1q_u16(dst + 24 * stride + 8, vextq_u16(d0[2], d0[3], 4)); + vst1q_u16(dst + 24 * stride + 16, vextq_u16(d0[3], a31, 4)); + vst1q_u16(dst + 24 * stride + 24, a31); + vst1q_u16(dst + 25 * stride + 0, vextq_u16(d1[1], d1[2], 4)); + vst1q_u16(dst + 25 * stride + 8, vextq_u16(d1[2], d1[3], 4)); + vst1q_u16(dst + 25 * stride + 16, vextq_u16(d1[3], a31, 4)); + vst1q_u16(dst + 25 * stride + 24, a31); + + vst1q_u16(dst + 26 * stride + 0, vextq_u16(d0[1], d0[2], 5)); + vst1q_u16(dst + 26 * stride + 8, vextq_u16(d0[2], d0[3], 5)); + vst1q_u16(dst + 26 * stride + 16, vextq_u16(d0[3], a31, 5)); + vst1q_u16(dst + 26 * stride + 24, a31); + vst1q_u16(dst + 27 * stride + 0, vextq_u16(d1[1], d1[2], 5)); + vst1q_u16(dst + 27 * stride + 8, vextq_u16(d1[2], d1[3], 5)); + vst1q_u16(dst + 27 * stride + 16, vextq_u16(d1[3], a31, 5)); + vst1q_u16(dst + 27 * stride + 24, a31); + + vst1q_u16(dst + 28 * stride + 0, vextq_u16(d0[1], d0[2], 6)); + vst1q_u16(dst + 28 * stride + 8, vextq_u16(d0[2], d0[3], 6)); + vst1q_u16(dst + 28 * stride + 16, vextq_u16(d0[3], a31, 6)); + vst1q_u16(dst + 28 * stride + 24, a31); + vst1q_u16(dst + 29 * stride + 0, vextq_u16(d1[1], d1[2], 6)); + vst1q_u16(dst + 29 * stride + 8, vextq_u16(d1[2], d1[3], 6)); + vst1q_u16(dst + 29 * stride + 16, vextq_u16(d1[3], a31, 6)); + vst1q_u16(dst + 29 * stride + 24, a31); + + vst1q_u16(dst + 30 * stride + 0, vextq_u16(d0[1], d0[2], 7)); + vst1q_u16(dst + 30 * stride + 8, vextq_u16(d0[2], d0[3], 7)); + vst1q_u16(dst + 30 * stride + 16, vextq_u16(d0[3], a31, 7)); + vst1q_u16(dst + 30 * stride + 24, a31); + vst1q_u16(dst + 31 * stride + 0, vextq_u16(d1[1], d1[2], 7)); + vst1q_u16(dst + 31 * stride + 8, vextq_u16(d1[2], d1[3], 7)); + vst1q_u16(dst + 31 * stride + 16, vextq_u16(d1[3], a31, 7)); + vst1q_u16(dst + 31 * stride + 24, a31); +} + +// ----------------------------------------------------------------------------- + void vpx_highbd_d135_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 980380325a..71c3a84638 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -217,7 +217,7 @@ () specialize qw/vpx_highbd_d45_predictor_4x4 neon ssse3/; add_proto qw/void vpx_highbd_d63_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_d63_predictor_4x4 sse2/; + specialize qw/vpx_highbd_d63_predictor_4x4 neon sse2/; add_proto qw/void vpx_highbd_h_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_h_predictor_4x4 neon sse2/; @@ -256,7 +256,7 @@ () specialize qw/vpx_highbd_d45_predictor_8x8 neon ssse3/; add_proto qw/void vpx_highbd_d63_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_d63_predictor_8x8 ssse3/; + specialize qw/vpx_highbd_d63_predictor_8x8 neon ssse3/; add_proto qw/void vpx_highbd_h_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_h_predictor_8x8 neon sse2/; @@ -295,7 +295,7 @@ () specialize qw/vpx_highbd_d45_predictor_16x16 neon ssse3/; add_proto qw/void vpx_highbd_d63_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_d63_predictor_16x16 ssse3/; + specialize qw/vpx_highbd_d63_predictor_16x16 neon ssse3/; add_proto qw/void vpx_highbd_h_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_h_predictor_16x16 neon sse2/; @@ -334,7 +334,7 @@ () specialize qw/vpx_highbd_d45_predictor_32x32 neon ssse3/; add_proto qw/void vpx_highbd_d63_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_d63_predictor_32x32 ssse3/; + specialize qw/vpx_highbd_d63_predictor_32x32 neon ssse3/; add_proto qw/void vpx_highbd_h_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_h_predictor_32x32 neon sse2/; From 74e4587c89d38ce6922171e9f65b8cff5f2ffcc8 Mon Sep 17 00:00:00 2001 From: George Steed Date: Tue, 21 Feb 2023 11:17:10 +0000 Subject: [PATCH 0634/1000] Implement highbd_d117_predictor using Neon Add Neon implementations of the highbd d117 predictor for 4x4, 8x8, 16x16 and 32x32 block sizes. Also update tests to add new corresponding cases. An explanation of the general implementation strategy is given in the 8x8 implementation body, and is mostly identical to the non-highbd version. Speedups over the C code (higher is better): Microarch. | Compiler | Block | Speedup Neoverse N1 | LLVM 15 | 4x4 | 1.99 Neoverse N1 | LLVM 15 | 8x8 | 4.37 Neoverse N1 | LLVM 15 | 16x16 | 6.81 Neoverse N1 | LLVM 15 | 32x32 | 6.49 Neoverse N1 | GCC 12 | 4x4 | 2.49 Neoverse N1 | GCC 12 | 8x8 | 4.10 Neoverse N1 | GCC 12 | 16x16 | 5.58 Neoverse N1 | GCC 12 | 32x32 | 2.16 Neoverse V1 | LLVM 15 | 4x4 | 1.99 Neoverse V1 | LLVM 15 | 8x8 | 5.03 Neoverse V1 | LLVM 15 | 16x16 | 6.61 Neoverse V1 | LLVM 15 | 32x32 | 6.01 Neoverse V1 | GCC 12 | 4x4 | 2.09 Neoverse V1 | GCC 12 | 8x8 | 4.52 Neoverse V1 | GCC 12 | 16x16 | 4.23 Neoverse V1 | GCC 12 | 32x32 | 2.70 Change-Id: I892fbd2c17ac527ddc22b91acca907ffc84c5cd2 --- test/test_intra_pred_speed.cc | 20 +- test/vp9_intrapred_test.cc | 24 ++ vpx_dsp/arm/highbd_intrapred_neon.c | 382 ++++++++++++++++++++++++++++ vpx_dsp/vpx_dsp_rtcd_defs.pl | 8 +- 4 files changed, 422 insertions(+), 12 deletions(-) diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc index 19dabf88a7..e721a459ad 100644 --- a/test/test_intra_pred_speed.cc +++ b/test/test_intra_pred_speed.cc @@ -565,31 +565,35 @@ HIGHBD_INTRA_PRED_TEST( vpx_highbd_dc_left_predictor_4x4_neon, vpx_highbd_dc_top_predictor_4x4_neon, vpx_highbd_dc_128_predictor_4x4_neon, vpx_highbd_v_predictor_4x4_neon, vpx_highbd_h_predictor_4x4_neon, vpx_highbd_d45_predictor_4x4_neon, - vpx_highbd_d135_predictor_4x4_neon, nullptr, nullptr, nullptr, - vpx_highbd_d63_predictor_4x4_neon, vpx_highbd_tm_predictor_4x4_neon) + vpx_highbd_d135_predictor_4x4_neon, vpx_highbd_d117_predictor_4x4_neon, + nullptr, nullptr, vpx_highbd_d63_predictor_4x4_neon, + vpx_highbd_tm_predictor_4x4_neon) HIGHBD_INTRA_PRED_TEST( NEON, TestHighbdIntraPred8, vpx_highbd_dc_predictor_8x8_neon, vpx_highbd_dc_left_predictor_8x8_neon, vpx_highbd_dc_top_predictor_8x8_neon, vpx_highbd_dc_128_predictor_8x8_neon, vpx_highbd_v_predictor_8x8_neon, vpx_highbd_h_predictor_8x8_neon, vpx_highbd_d45_predictor_8x8_neon, - vpx_highbd_d135_predictor_8x8_neon, nullptr, nullptr, nullptr, - vpx_highbd_d63_predictor_8x8_neon, vpx_highbd_tm_predictor_8x8_neon) + vpx_highbd_d135_predictor_8x8_neon, vpx_highbd_d117_predictor_8x8_neon, + nullptr, nullptr, vpx_highbd_d63_predictor_8x8_neon, + vpx_highbd_tm_predictor_8x8_neon) HIGHBD_INTRA_PRED_TEST( NEON, TestHighbdIntraPred16, vpx_highbd_dc_predictor_16x16_neon, vpx_highbd_dc_left_predictor_16x16_neon, vpx_highbd_dc_top_predictor_16x16_neon, vpx_highbd_dc_128_predictor_16x16_neon, vpx_highbd_v_predictor_16x16_neon, vpx_highbd_h_predictor_16x16_neon, vpx_highbd_d45_predictor_16x16_neon, - vpx_highbd_d135_predictor_16x16_neon, nullptr, nullptr, nullptr, - vpx_highbd_d63_predictor_16x16_neon, vpx_highbd_tm_predictor_16x16_neon) + vpx_highbd_d135_predictor_16x16_neon, vpx_highbd_d117_predictor_16x16_neon, + nullptr, nullptr, vpx_highbd_d63_predictor_16x16_neon, + vpx_highbd_tm_predictor_16x16_neon) HIGHBD_INTRA_PRED_TEST( NEON, TestHighbdIntraPred32, vpx_highbd_dc_predictor_32x32_neon, vpx_highbd_dc_left_predictor_32x32_neon, vpx_highbd_dc_top_predictor_32x32_neon, vpx_highbd_dc_128_predictor_32x32_neon, vpx_highbd_v_predictor_32x32_neon, vpx_highbd_h_predictor_32x32_neon, vpx_highbd_d45_predictor_32x32_neon, - vpx_highbd_d135_predictor_32x32_neon, nullptr, nullptr, nullptr, - vpx_highbd_d63_predictor_32x32_neon, vpx_highbd_tm_predictor_32x32_neon) + vpx_highbd_d135_predictor_32x32_neon, vpx_highbd_d117_predictor_32x32_neon, + nullptr, nullptr, vpx_highbd_d63_predictor_32x32_neon, + vpx_highbd_tm_predictor_32x32_neon) #endif // HAVE_NEON #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/test/vp9_intrapred_test.cc b/test/vp9_intrapred_test.cc index 139358c307..c4e0e78ac5 100644 --- a/test/vp9_intrapred_test.cc +++ b/test/vp9_intrapred_test.cc @@ -856,6 +856,14 @@ INSTANTIATE_TEST_SUITE_P( &vpx_highbd_d63_predictor_16x16_c, 16, 8), HighbdIntraPredParam(&vpx_highbd_d63_predictor_32x32_neon, &vpx_highbd_d63_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_4x4_neon, + &vpx_highbd_d117_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_8x8_neon, + &vpx_highbd_d117_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_16x16_neon, + &vpx_highbd_d117_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_32x32_neon, + &vpx_highbd_d117_predictor_32x32_c, 32, 8), HighbdIntraPredParam(&vpx_highbd_d135_predictor_4x4_neon, &vpx_highbd_d135_predictor_4x4_c, 4, 8), HighbdIntraPredParam(&vpx_highbd_d135_predictor_8x8_neon, @@ -940,6 +948,14 @@ INSTANTIATE_TEST_SUITE_P( &vpx_highbd_d63_predictor_16x16_c, 16, 10), HighbdIntraPredParam(&vpx_highbd_d63_predictor_32x32_neon, &vpx_highbd_d63_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_4x4_neon, + &vpx_highbd_d117_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_8x8_neon, + &vpx_highbd_d117_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_16x16_neon, + &vpx_highbd_d117_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_32x32_neon, + &vpx_highbd_d117_predictor_32x32_c, 32, 10), HighbdIntraPredParam(&vpx_highbd_d135_predictor_4x4_neon, &vpx_highbd_d135_predictor_4x4_c, 4, 10), HighbdIntraPredParam(&vpx_highbd_d135_predictor_8x8_neon, @@ -1024,6 +1040,14 @@ INSTANTIATE_TEST_SUITE_P( &vpx_highbd_d63_predictor_16x16_c, 16, 12), HighbdIntraPredParam(&vpx_highbd_d63_predictor_32x32_neon, &vpx_highbd_d63_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_4x4_neon, + &vpx_highbd_d117_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_8x8_neon, + &vpx_highbd_d117_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_16x16_neon, + &vpx_highbd_d117_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_32x32_neon, + &vpx_highbd_d117_predictor_32x32_c, 32, 10), HighbdIntraPredParam(&vpx_highbd_d135_predictor_4x4_neon, &vpx_highbd_d135_predictor_4x4_c, 4, 12), HighbdIntraPredParam(&vpx_highbd_d135_predictor_8x8_neon, diff --git a/vpx_dsp/arm/highbd_intrapred_neon.c b/vpx_dsp/arm/highbd_intrapred_neon.c index 18dca81100..424bf5f4bc 100644 --- a/vpx_dsp/arm/highbd_intrapred_neon.c +++ b/vpx_dsp/arm/highbd_intrapred_neon.c @@ -731,6 +731,388 @@ void vpx_highbd_d63_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, // ----------------------------------------------------------------------------- +void vpx_highbd_d117_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + uint16x4_t az, a0, l0az, l0, l1, azl0, col0, col0_even, col0_odd, d0, d1; + (void)bd; + + az = vld1_u16(above - 1); + a0 = vld1_u16(above + 0); + // [ left[0], above[-1], above[0], above[1] ] + l0az = vext_u16(vld1_dup_u16(left), az, 3); + + l0 = vld1_u16(left + 0); + l1 = vld1_u16(left + 1); + // [ above[-1], left[0], left[1], left[2] ] + azl0 = vext_u16(vld1_dup_u16(above - 1), l0, 3); + + d0 = vrhadd_u16(az, a0); + d1 = vrhadd_u16(vhadd_u16(l0az, a0), az); + + col0 = vrhadd_u16(vhadd_u16(azl0, l1), l0); + col0_even = vdup_lane_u16(col0, 0); + col0_odd = vdup_lane_u16(col0, 1); + + vst1_u16(dst + 0 * stride, d0); + vst1_u16(dst + 1 * stride, d1); + vst1_u16(dst + 2 * stride, vext_u16(col0_even, d0, 3)); + vst1_u16(dst + 3 * stride, vext_u16(col0_odd, d1, 3)); +} + +void vpx_highbd_d117_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + uint16x8_t az, a0, l0az, l0, l1, azl0, col0, col0_even, col0_odd, d0, d1; + (void)bd; + + az = vld1q_u16(above - 1); + a0 = vld1q_u16(above + 0); + // [ left[0], above[-1], ..., left[5] ] + l0az = vextq_u16(vld1q_dup_u16(left), az, 7); + + l0 = vld1q_u16(left + 0); + l1 = vld1q_u16(left + 1); + // [ above[-1], left[0], ..., left[6] ] + azl0 = vextq_u16(vld1q_dup_u16(above - 1), l0, 7); + + // d0[0] = AVG2(above[-1], above[0]) + // ... + // d0[7] = AVG2(above[6], above[7]) + d0 = vrhaddq_u16(az, a0); + + // d1[0] = AVG3(left[0], above[-1], above[0]) + // d1[1] = AVG3(above[-1], above[0], above[1]) + // ... + // d1[7] = AVG3(above[5], above[6], above[7]) + d1 = vrhaddq_u16(vhaddq_u16(l0az, a0), az); + + // The ext instruction shifts elements in from the end of the vector rather + // than the start, so reverse the vector to put the elements to be shifted in + // at the end: + // col0[7] = AVG3(above[-1], left[0], left[1]) + // col0[6] = AVG3(left[0], left[1], left[2]) + // ... + // col0[0] = AVG3(left[6], left[7], left[8]) + col0 = vrhaddq_u16(vhaddq_u16(azl0, l1), l0); + col0 = vrev64q_u16(vextq_u16(col0, col0, 4)); + + // We don't care about the first parameter to this uzp since we only ever use + // the high three elements, we just use col0 again since it is already + // available: + // col0_even = [ x, x, x, x, x, col0[3], col0[5], col0[7] ] + // col0_odd = [ x, x, x, x, x, col0[2], col0[4], col0[6] ] + col0_even = vuzpq_u16(col0, col0).val[1]; + col0_odd = vuzpq_u16(col0, col0).val[0]; + + // Incrementally shift more elements from col0 into d0/1: + // stride=0 [ d0[0], d0[1], d0[2], d0[3], d0[4], d0[5], d0[6], d0[7] ] + // stride=1 [ d1[0], d1[1], d1[2], d1[3], d1[4], d1[5], d1[6], d1[7] ] + // stride=2 [ col0[7], d0[0], d0[1], d0[2], d0[3], d0[4], d0[5], d0[6] ] + // stride=3 [ col0[6], d1[0], d1[1], d1[2], d1[3], d1[4], d1[5], d1[6] ] + // stride=4 [ col0[5], col0[7], d0[0], d0[1], d0[2], d0[3], d0[4], d0[5] ] + // stride=5 [ col0[4], col0[6], d1[0], d1[1], d1[2], d1[3], d1[4], d1[5] ] + // stride=6 [ col0[3], col0[5], col0[7], d0[0], d0[1], d0[2], d0[3], d0[4] ] + // stride=7 [ col0[2], col0[4], col0[6], d1[0], d1[1], d1[2], d1[3], d1[4] ] + vst1q_u16(dst + 0 * stride, d0); + vst1q_u16(dst + 1 * stride, d1); + vst1q_u16(dst + 2 * stride, vextq_u16(col0_even, d0, 7)); + vst1q_u16(dst + 3 * stride, vextq_u16(col0_odd, d1, 7)); + vst1q_u16(dst + 4 * stride, vextq_u16(col0_even, d0, 6)); + vst1q_u16(dst + 5 * stride, vextq_u16(col0_odd, d1, 6)); + vst1q_u16(dst + 6 * stride, vextq_u16(col0_even, d0, 5)); + vst1q_u16(dst + 7 * stride, vextq_u16(col0_odd, d1, 5)); +} + +void vpx_highbd_d117_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + uint16x8_t az, a0, a6, a7, a8, l0az, l0, l1, l7, l8, l9, azl0, col0_lo, + col0_hi, col0_even, col0_odd, d0_lo, d0_hi, d1_lo, d1_hi; + (void)bd; + + az = vld1q_u16(above - 1); + a0 = vld1q_u16(above + 0); + a6 = vld1q_u16(above + 6); + a7 = vld1q_u16(above + 7); + a8 = vld1q_u16(above + 8); + // [ left[0], above[-1], ..., left[5] ] + l0az = vextq_u16(vld1q_dup_u16(left), az, 7); + + l0 = vld1q_u16(left + 0); + l1 = vld1q_u16(left + 1); + l7 = vld1q_u16(left + 7); + l8 = vld1q_u16(left + 8); + l9 = vld1q_u16(left + 9); + // [ above[-1], left[0], ..., left[6] ] + azl0 = vextq_u16(vld1q_dup_u16(above - 1), l0, 7); + + d0_lo = vrhaddq_u16(az, a0); + d0_hi = vrhaddq_u16(a7, a8); + d1_lo = vrhaddq_u16(vhaddq_u16(l0az, a0), az); + d1_hi = vrhaddq_u16(vhaddq_u16(a6, a8), a7); + + col0_lo = vrhaddq_u16(vhaddq_u16(azl0, l1), l0); + col0_hi = vrhaddq_u16(vhaddq_u16(l7, l9), l8); + + // Reverse within each vector, then swap the array indices in the uzp to + // complete the reversal across all 16 elements. + col0_lo = vrev64q_u16(vextq_u16(col0_lo, col0_lo, 4)); + col0_hi = vrev64q_u16(vextq_u16(col0_hi, col0_hi, 4)); + col0_even = vuzpq_u16(col0_hi, col0_lo).val[1]; + col0_odd = vuzpq_u16(col0_hi, col0_lo).val[0]; + + vst1q_u16(dst + 0 * stride + 0, d0_lo); + vst1q_u16(dst + 0 * stride + 8, d0_hi); + vst1q_u16(dst + 1 * stride + 0, d1_lo); + vst1q_u16(dst + 1 * stride + 8, d1_hi); + + vst1q_u16(dst + 2 * stride + 0, vextq_u16(col0_even, d0_lo, 7)); + vst1q_u16(dst + 2 * stride + 8, vextq_u16(d0_lo, d0_hi, 7)); + vst1q_u16(dst + 3 * stride + 0, vextq_u16(col0_odd, d1_lo, 7)); + vst1q_u16(dst + 3 * stride + 8, vextq_u16(d1_lo, d1_hi, 7)); + + vst1q_u16(dst + 4 * stride + 0, vextq_u16(col0_even, d0_lo, 6)); + vst1q_u16(dst + 4 * stride + 8, vextq_u16(d0_lo, d0_hi, 6)); + vst1q_u16(dst + 5 * stride + 0, vextq_u16(col0_odd, d1_lo, 6)); + vst1q_u16(dst + 5 * stride + 8, vextq_u16(d1_lo, d1_hi, 6)); + + vst1q_u16(dst + 6 * stride + 0, vextq_u16(col0_even, d0_lo, 5)); + vst1q_u16(dst + 6 * stride + 8, vextq_u16(d0_lo, d0_hi, 5)); + vst1q_u16(dst + 7 * stride + 0, vextq_u16(col0_odd, d1_lo, 5)); + vst1q_u16(dst + 7 * stride + 8, vextq_u16(d1_lo, d1_hi, 5)); + + vst1q_u16(dst + 8 * stride + 0, vextq_u16(col0_even, d0_lo, 4)); + vst1q_u16(dst + 8 * stride + 8, vextq_u16(d0_lo, d0_hi, 4)); + vst1q_u16(dst + 9 * stride + 0, vextq_u16(col0_odd, d1_lo, 4)); + vst1q_u16(dst + 9 * stride + 8, vextq_u16(d1_lo, d1_hi, 4)); + + vst1q_u16(dst + 10 * stride + 0, vextq_u16(col0_even, d0_lo, 3)); + vst1q_u16(dst + 10 * stride + 8, vextq_u16(d0_lo, d0_hi, 3)); + vst1q_u16(dst + 11 * stride + 0, vextq_u16(col0_odd, d1_lo, 3)); + vst1q_u16(dst + 11 * stride + 8, vextq_u16(d1_lo, d1_hi, 3)); + + vst1q_u16(dst + 12 * stride + 0, vextq_u16(col0_even, d0_lo, 2)); + vst1q_u16(dst + 12 * stride + 8, vextq_u16(d0_lo, d0_hi, 2)); + vst1q_u16(dst + 13 * stride + 0, vextq_u16(col0_odd, d1_lo, 2)); + vst1q_u16(dst + 13 * stride + 8, vextq_u16(d1_lo, d1_hi, 2)); + + vst1q_u16(dst + 14 * stride + 0, vextq_u16(col0_even, d0_lo, 1)); + vst1q_u16(dst + 14 * stride + 8, vextq_u16(d0_lo, d0_hi, 1)); + vst1q_u16(dst + 15 * stride + 0, vextq_u16(col0_odd, d1_lo, 1)); + vst1q_u16(dst + 15 * stride + 8, vextq_u16(d1_lo, d1_hi, 1)); +} + +void vpx_highbd_d117_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + uint16x8_t az, a0, a6, a7, a8, a14, a15, a16, a22, a23, a24, l0az, l0, l1, l7, + l8, l9, l15, l16, l17, l23, l24, l25, azl0, d0[4], d1[4], col0[4], + col0_even[2], col0_odd[2]; + (void)bd; + + az = vld1q_u16(above - 1); + a0 = vld1q_u16(above + 0); + a6 = vld1q_u16(above + 6); + a7 = vld1q_u16(above + 7); + a8 = vld1q_u16(above + 8); + a14 = vld1q_u16(above + 14); + a15 = vld1q_u16(above + 15); + a16 = vld1q_u16(above + 16); + a22 = vld1q_u16(above + 22); + a23 = vld1q_u16(above + 23); + a24 = vld1q_u16(above + 24); + // [ left[0], above[-1], ..., left[5] ] + l0az = vextq_u16(vld1q_dup_u16(left), az, 7); + + l0 = vld1q_u16(left + 0); + l1 = vld1q_u16(left + 1); + l7 = vld1q_u16(left + 7); + l8 = vld1q_u16(left + 8); + l9 = vld1q_u16(left + 9); + l15 = vld1q_u16(left + 15); + l16 = vld1q_u16(left + 16); + l17 = vld1q_u16(left + 17); + l23 = vld1q_u16(left + 23); + l24 = vld1q_u16(left + 24); + l25 = vld1q_u16(left + 25); + // [ above[-1], left[0], ..., left[6] ] + azl0 = vextq_u16(vld1q_dup_u16(above - 1), l0, 7); + + d0[0] = vrhaddq_u16(az, a0); + d0[1] = vrhaddq_u16(a7, a8); + d0[2] = vrhaddq_u16(a15, a16); + d0[3] = vrhaddq_u16(a23, a24); + d1[0] = vrhaddq_u16(vhaddq_u16(l0az, a0), az); + d1[1] = vrhaddq_u16(vhaddq_u16(a6, a8), a7); + d1[2] = vrhaddq_u16(vhaddq_u16(a14, a16), a15); + d1[3] = vrhaddq_u16(vhaddq_u16(a22, a24), a23); + + col0[0] = vrhaddq_u16(vhaddq_u16(azl0, l1), l0); + col0[1] = vrhaddq_u16(vhaddq_u16(l7, l9), l8); + col0[2] = vrhaddq_u16(vhaddq_u16(l15, l17), l16); + col0[3] = vrhaddq_u16(vhaddq_u16(l23, l25), l24); + + // Reverse within each vector, then swap the array indices in both the uzp + // and the col0_{even,odd} assignment to complete the reversal across all + // 32-elements. + col0[0] = vrev64q_u16(vextq_u16(col0[0], col0[0], 4)); + col0[1] = vrev64q_u16(vextq_u16(col0[1], col0[1], 4)); + col0[2] = vrev64q_u16(vextq_u16(col0[2], col0[2], 4)); + col0[3] = vrev64q_u16(vextq_u16(col0[3], col0[3], 4)); + + col0_even[1] = vuzpq_u16(col0[1], col0[0]).val[1]; + col0_even[0] = vuzpq_u16(col0[3], col0[2]).val[1]; + col0_odd[1] = vuzpq_u16(col0[1], col0[0]).val[0]; + col0_odd[0] = vuzpq_u16(col0[3], col0[2]).val[0]; + + vst1q_u16(dst + 0 * stride + 0, d0[0]); + vst1q_u16(dst + 0 * stride + 8, d0[1]); + vst1q_u16(dst + 0 * stride + 16, d0[2]); + vst1q_u16(dst + 0 * stride + 24, d0[3]); + vst1q_u16(dst + 1 * stride + 0, d1[0]); + vst1q_u16(dst + 1 * stride + 8, d1[1]); + vst1q_u16(dst + 1 * stride + 16, d1[2]); + vst1q_u16(dst + 1 * stride + 24, d1[3]); + + vst1q_u16(dst + 2 * stride + 0, vextq_u16(col0_even[1], d0[0], 7)); + vst1q_u16(dst + 2 * stride + 8, vextq_u16(d0[0], d0[1], 7)); + vst1q_u16(dst + 2 * stride + 16, vextq_u16(d0[1], d0[2], 7)); + vst1q_u16(dst + 2 * stride + 24, vextq_u16(d0[2], d0[3], 7)); + vst1q_u16(dst + 3 * stride + 0, vextq_u16(col0_odd[1], d1[0], 7)); + vst1q_u16(dst + 3 * stride + 8, vextq_u16(d1[0], d1[1], 7)); + vst1q_u16(dst + 3 * stride + 16, vextq_u16(d1[1], d1[2], 7)); + vst1q_u16(dst + 3 * stride + 24, vextq_u16(d1[2], d1[3], 7)); + + vst1q_u16(dst + 4 * stride + 0, vextq_u16(col0_even[1], d0[0], 6)); + vst1q_u16(dst + 4 * stride + 8, vextq_u16(d0[0], d0[1], 6)); + vst1q_u16(dst + 4 * stride + 16, vextq_u16(d0[1], d0[2], 6)); + vst1q_u16(dst + 4 * stride + 24, vextq_u16(d0[2], d0[3], 6)); + vst1q_u16(dst + 5 * stride + 0, vextq_u16(col0_odd[1], d1[0], 6)); + vst1q_u16(dst + 5 * stride + 8, vextq_u16(d1[0], d1[1], 6)); + vst1q_u16(dst + 5 * stride + 16, vextq_u16(d1[1], d1[2], 6)); + vst1q_u16(dst + 5 * stride + 24, vextq_u16(d1[2], d1[3], 6)); + + vst1q_u16(dst + 6 * stride + 0, vextq_u16(col0_even[1], d0[0], 5)); + vst1q_u16(dst + 6 * stride + 8, vextq_u16(d0[0], d0[1], 5)); + vst1q_u16(dst + 6 * stride + 16, vextq_u16(d0[1], d0[2], 5)); + vst1q_u16(dst + 6 * stride + 24, vextq_u16(d0[2], d0[3], 5)); + vst1q_u16(dst + 7 * stride + 0, vextq_u16(col0_odd[1], d1[0], 5)); + vst1q_u16(dst + 7 * stride + 8, vextq_u16(d1[0], d1[1], 5)); + vst1q_u16(dst + 7 * stride + 16, vextq_u16(d1[1], d1[2], 5)); + vst1q_u16(dst + 7 * stride + 24, vextq_u16(d1[2], d1[3], 5)); + + vst1q_u16(dst + 8 * stride + 0, vextq_u16(col0_even[1], d0[0], 4)); + vst1q_u16(dst + 8 * stride + 8, vextq_u16(d0[0], d0[1], 4)); + vst1q_u16(dst + 8 * stride + 16, vextq_u16(d0[1], d0[2], 4)); + vst1q_u16(dst + 8 * stride + 24, vextq_u16(d0[2], d0[3], 4)); + vst1q_u16(dst + 9 * stride + 0, vextq_u16(col0_odd[1], d1[0], 4)); + vst1q_u16(dst + 9 * stride + 8, vextq_u16(d1[0], d1[1], 4)); + vst1q_u16(dst + 9 * stride + 16, vextq_u16(d1[1], d1[2], 4)); + vst1q_u16(dst + 9 * stride + 24, vextq_u16(d1[2], d1[3], 4)); + + vst1q_u16(dst + 10 * stride + 0, vextq_u16(col0_even[1], d0[0], 3)); + vst1q_u16(dst + 10 * stride + 8, vextq_u16(d0[0], d0[1], 3)); + vst1q_u16(dst + 10 * stride + 16, vextq_u16(d0[1], d0[2], 3)); + vst1q_u16(dst + 10 * stride + 24, vextq_u16(d0[2], d0[3], 3)); + vst1q_u16(dst + 11 * stride + 0, vextq_u16(col0_odd[1], d1[0], 3)); + vst1q_u16(dst + 11 * stride + 8, vextq_u16(d1[0], d1[1], 3)); + vst1q_u16(dst + 11 * stride + 16, vextq_u16(d1[1], d1[2], 3)); + vst1q_u16(dst + 11 * stride + 24, vextq_u16(d1[2], d1[3], 3)); + + vst1q_u16(dst + 12 * stride + 0, vextq_u16(col0_even[1], d0[0], 2)); + vst1q_u16(dst + 12 * stride + 8, vextq_u16(d0[0], d0[1], 2)); + vst1q_u16(dst + 12 * stride + 16, vextq_u16(d0[1], d0[2], 2)); + vst1q_u16(dst + 12 * stride + 24, vextq_u16(d0[2], d0[3], 2)); + vst1q_u16(dst + 13 * stride + 0, vextq_u16(col0_odd[1], d1[0], 2)); + vst1q_u16(dst + 13 * stride + 8, vextq_u16(d1[0], d1[1], 2)); + vst1q_u16(dst + 13 * stride + 16, vextq_u16(d1[1], d1[2], 2)); + vst1q_u16(dst + 13 * stride + 24, vextq_u16(d1[2], d1[3], 2)); + + vst1q_u16(dst + 14 * stride + 0, vextq_u16(col0_even[1], d0[0], 1)); + vst1q_u16(dst + 14 * stride + 8, vextq_u16(d0[0], d0[1], 1)); + vst1q_u16(dst + 14 * stride + 16, vextq_u16(d0[1], d0[2], 1)); + vst1q_u16(dst + 14 * stride + 24, vextq_u16(d0[2], d0[3], 1)); + vst1q_u16(dst + 15 * stride + 0, vextq_u16(col0_odd[1], d1[0], 1)); + vst1q_u16(dst + 15 * stride + 8, vextq_u16(d1[0], d1[1], 1)); + vst1q_u16(dst + 15 * stride + 16, vextq_u16(d1[1], d1[2], 1)); + vst1q_u16(dst + 15 * stride + 24, vextq_u16(d1[2], d1[3], 1)); + + vst1q_u16(dst + 16 * stride + 0, col0_even[1]); + vst1q_u16(dst + 16 * stride + 8, d0[0]); + vst1q_u16(dst + 16 * stride + 16, d0[1]); + vst1q_u16(dst + 16 * stride + 24, d0[2]); + vst1q_u16(dst + 17 * stride + 0, col0_odd[1]); + vst1q_u16(dst + 17 * stride + 8, d1[0]); + vst1q_u16(dst + 17 * stride + 16, d1[1]); + vst1q_u16(dst + 17 * stride + 24, d1[2]); + + vst1q_u16(dst + 18 * stride + 0, vextq_u16(col0_even[0], col0_even[1], 7)); + vst1q_u16(dst + 18 * stride + 8, vextq_u16(col0_even[1], d0[0], 7)); + vst1q_u16(dst + 18 * stride + 16, vextq_u16(d0[0], d0[1], 7)); + vst1q_u16(dst + 18 * stride + 24, vextq_u16(d0[1], d0[2], 7)); + vst1q_u16(dst + 19 * stride + 0, vextq_u16(col0_odd[0], col0_odd[1], 7)); + vst1q_u16(dst + 19 * stride + 8, vextq_u16(col0_odd[1], d1[0], 7)); + vst1q_u16(dst + 19 * stride + 16, vextq_u16(d1[0], d1[1], 7)); + vst1q_u16(dst + 19 * stride + 24, vextq_u16(d1[1], d1[2], 7)); + + vst1q_u16(dst + 20 * stride + 0, vextq_u16(col0_even[0], col0_even[1], 6)); + vst1q_u16(dst + 20 * stride + 8, vextq_u16(col0_even[1], d0[0], 6)); + vst1q_u16(dst + 20 * stride + 16, vextq_u16(d0[0], d0[1], 6)); + vst1q_u16(dst + 20 * stride + 24, vextq_u16(d0[1], d0[2], 6)); + vst1q_u16(dst + 21 * stride + 0, vextq_u16(col0_odd[0], col0_odd[1], 6)); + vst1q_u16(dst + 21 * stride + 8, vextq_u16(col0_odd[1], d1[0], 6)); + vst1q_u16(dst + 21 * stride + 16, vextq_u16(d1[0], d1[1], 6)); + vst1q_u16(dst + 21 * stride + 24, vextq_u16(d1[1], d1[2], 6)); + + vst1q_u16(dst + 22 * stride + 0, vextq_u16(col0_even[0], col0_even[1], 5)); + vst1q_u16(dst + 22 * stride + 8, vextq_u16(col0_even[1], d0[0], 5)); + vst1q_u16(dst + 22 * stride + 16, vextq_u16(d0[0], d0[1], 5)); + vst1q_u16(dst + 22 * stride + 24, vextq_u16(d0[1], d0[2], 5)); + vst1q_u16(dst + 23 * stride + 0, vextq_u16(col0_odd[0], col0_odd[1], 5)); + vst1q_u16(dst + 23 * stride + 8, vextq_u16(col0_odd[1], d1[0], 5)); + vst1q_u16(dst + 23 * stride + 16, vextq_u16(d1[0], d1[1], 5)); + vst1q_u16(dst + 23 * stride + 24, vextq_u16(d1[1], d1[2], 5)); + + vst1q_u16(dst + 24 * stride + 0, vextq_u16(col0_even[0], col0_even[1], 4)); + vst1q_u16(dst + 24 * stride + 8, vextq_u16(col0_even[1], d0[0], 4)); + vst1q_u16(dst + 24 * stride + 16, vextq_u16(d0[0], d0[1], 4)); + vst1q_u16(dst + 24 * stride + 24, vextq_u16(d0[1], d0[2], 4)); + vst1q_u16(dst + 25 * stride + 0, vextq_u16(col0_odd[0], col0_odd[1], 4)); + vst1q_u16(dst + 25 * stride + 8, vextq_u16(col0_odd[1], d1[0], 4)); + vst1q_u16(dst + 25 * stride + 16, vextq_u16(d1[0], d1[1], 4)); + vst1q_u16(dst + 25 * stride + 24, vextq_u16(d1[1], d1[2], 4)); + + vst1q_u16(dst + 26 * stride + 0, vextq_u16(col0_even[0], col0_even[1], 3)); + vst1q_u16(dst + 26 * stride + 8, vextq_u16(col0_even[1], d0[0], 3)); + vst1q_u16(dst + 26 * stride + 16, vextq_u16(d0[0], d0[1], 3)); + vst1q_u16(dst + 26 * stride + 24, vextq_u16(d0[1], d0[2], 3)); + vst1q_u16(dst + 27 * stride + 0, vextq_u16(col0_odd[0], col0_odd[1], 3)); + vst1q_u16(dst + 27 * stride + 8, vextq_u16(col0_odd[1], d1[0], 3)); + vst1q_u16(dst + 27 * stride + 16, vextq_u16(d1[0], d1[1], 3)); + vst1q_u16(dst + 27 * stride + 24, vextq_u16(d1[1], d1[2], 3)); + + vst1q_u16(dst + 28 * stride + 0, vextq_u16(col0_even[0], col0_even[1], 2)); + vst1q_u16(dst + 28 * stride + 8, vextq_u16(col0_even[1], d0[0], 2)); + vst1q_u16(dst + 28 * stride + 16, vextq_u16(d0[0], d0[1], 2)); + vst1q_u16(dst + 28 * stride + 24, vextq_u16(d0[1], d0[2], 2)); + vst1q_u16(dst + 29 * stride + 0, vextq_u16(col0_odd[0], col0_odd[1], 2)); + vst1q_u16(dst + 29 * stride + 8, vextq_u16(col0_odd[1], d1[0], 2)); + vst1q_u16(dst + 29 * stride + 16, vextq_u16(d1[0], d1[1], 2)); + vst1q_u16(dst + 29 * stride + 24, vextq_u16(d1[1], d1[2], 2)); + + vst1q_u16(dst + 30 * stride + 0, vextq_u16(col0_even[0], col0_even[1], 1)); + vst1q_u16(dst + 30 * stride + 8, vextq_u16(col0_even[1], d0[0], 1)); + vst1q_u16(dst + 30 * stride + 16, vextq_u16(d0[0], d0[1], 1)); + vst1q_u16(dst + 30 * stride + 24, vextq_u16(d0[1], d0[2], 1)); + vst1q_u16(dst + 31 * stride + 0, vextq_u16(col0_odd[0], col0_odd[1], 1)); + vst1q_u16(dst + 31 * stride + 8, vextq_u16(col0_odd[1], d1[0], 1)); + vst1q_u16(dst + 31 * stride + 16, vextq_u16(d1[0], d1[1], 1)); + vst1q_u16(dst + 31 * stride + 24, vextq_u16(d1[1], d1[2], 1)); +} + +// ----------------------------------------------------------------------------- + void vpx_highbd_d135_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 71c3a84638..26b723f055 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -223,7 +223,7 @@ () specialize qw/vpx_highbd_h_predictor_4x4 neon sse2/; add_proto qw/void vpx_highbd_d117_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_d117_predictor_4x4 sse2/; + specialize qw/vpx_highbd_d117_predictor_4x4 neon sse2/; add_proto qw/void vpx_highbd_d135_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_d135_predictor_4x4 neon sse2/; @@ -262,7 +262,7 @@ () specialize qw/vpx_highbd_h_predictor_8x8 neon sse2/; add_proto qw/void vpx_highbd_d117_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_d117_predictor_8x8 ssse3/; + specialize qw/vpx_highbd_d117_predictor_8x8 neon ssse3/; add_proto qw/void vpx_highbd_d135_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_d135_predictor_8x8 neon ssse3/; @@ -301,7 +301,7 @@ () specialize qw/vpx_highbd_h_predictor_16x16 neon sse2/; add_proto qw/void vpx_highbd_d117_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_d117_predictor_16x16 ssse3/; + specialize qw/vpx_highbd_d117_predictor_16x16 neon ssse3/; add_proto qw/void vpx_highbd_d135_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_d135_predictor_16x16 neon ssse3/; @@ -340,7 +340,7 @@ () specialize qw/vpx_highbd_h_predictor_32x32 neon sse2/; add_proto qw/void vpx_highbd_d117_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_d117_predictor_32x32 ssse3/; + specialize qw/vpx_highbd_d117_predictor_32x32 neon ssse3/; add_proto qw/void vpx_highbd_d135_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_d135_predictor_32x32 neon ssse3/; From 573f5e662b544dbc553d73fa2b61055c30dfe8cc Mon Sep 17 00:00:00 2001 From: Johann Date: Sat, 12 Nov 2022 08:23:17 +0900 Subject: [PATCH 0635/1000] quantize: simplifly highbd 32x32_b args Change-Id: I431a41279c4c4193bc70cfe819da6ea7e1d2fba1 --- test/vp9_quantize_test.cc | 54 +++++++++++------------ vp9/encoder/vp9_encodemb.c | 10 ++--- vpx_dsp/arm/highbd_quantize_neon.c | 21 +++++---- vpx_dsp/quantize.c | 16 ++++--- vpx_dsp/vpx_dsp_rtcd_defs.pl | 2 +- vpx_dsp/x86/highbd_quantize_intrin_avx2.c | 13 +++--- vpx_dsp/x86/highbd_quantize_intrin_sse2.c | 21 ++++----- 7 files changed, 69 insertions(+), 68 deletions(-) diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc index ecb6116f0c..e533b2509c 100644 --- a/test/vp9_quantize_test.cc +++ b/test/vp9_quantize_test.cc @@ -549,15 +549,15 @@ INSTANTIATE_TEST_SUITE_P( make_tuple(&QuantWrapper, &QuantWrapper, VPX_BITS_12, 16, false), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_8, 32, - false), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_10, - 32, false), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_12, - 32, false))); + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, + VPX_BITS_8, 32, false), + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, + VPX_BITS_10, 32, false), + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, + VPX_BITS_12, 32, false))); #else INSTANTIATE_TEST_SUITE_P( @@ -626,15 +626,15 @@ INSTANTIATE_TEST_SUITE_P( make_tuple(&Quant32x32Wrapper, &Quant32x32Wrapper, VPX_BITS_8, 32, false), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_8, 32, - false), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_10, - 32, false), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_12, - 32, false))); + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, + VPX_BITS_8, 32, false), + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, + VPX_BITS_10, 32, false), + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, + VPX_BITS_12, 32, false))); #else INSTANTIATE_TEST_SUITE_P( AVX2, VP9QuantizeTest, @@ -672,15 +672,15 @@ INSTANTIATE_TEST_SUITE_P( make_tuple(&Quant32x32Wrapper, &Quant32x32Wrapper, VPX_BITS_8, 32, false), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_8, 32, - false), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_10, - 32, false), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_12, - 32, false), + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, + VPX_BITS_8, 32, false), + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, + VPX_BITS_10, 32, false), + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, + VPX_BITS_12, 32, false), make_tuple(&QuantFPWrapper, &QuantFPWrapper, VPX_BITS_8, 16, true), make_tuple(&QuantFPWrapper, diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c index 4910dc20f5..6a5f628808 100644 --- a/vp9/encoder/vp9_encodemb.c +++ b/vp9/encoder/vp9_encodemb.c @@ -511,9 +511,8 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col, switch (tx_size) { case TX_32X32: highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); - vpx_highbd_quantize_b_32x32( - coeff, 1024, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, - dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); + vpx_highbd_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order->scan, scan_order->iscan); break; case TX_16X16: vpx_highbd_fdct16x16(src_diff, coeff, diff_stride); @@ -856,9 +855,8 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, vpx_highbd_subtract_block(32, 32, src_diff, diff_stride, src, src_stride, dst, dst_stride, xd->bd); highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); - vpx_highbd_quantize_b_32x32( - coeff, 1024, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, - dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); + vpx_highbd_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, + eob, scan_order->scan, scan_order->iscan); } if (args->enable_coeff_opt && !x->skip_recode) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; diff --git a/vpx_dsp/arm/highbd_quantize_neon.c b/vpx_dsp/arm/highbd_quantize_neon.c index b9f72a94c5..3b1fec3321 100644 --- a/vpx_dsp/arm/highbd_quantize_neon.c +++ b/vpx_dsp/arm/highbd_quantize_neon.c @@ -13,6 +13,7 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/arm/mem_neon.h" +#include "vp9/encoder/vp9_block.h" static VPX_FORCE_INLINE void highbd_calculate_dqcoeff_and_store( const int32x4_t dqcoeff_0, const int32x4_t dqcoeff_1, @@ -224,11 +225,9 @@ static VPX_FORCE_INLINE int16x8_t highbd_quantize_b_32x32_neon( } void vpx_highbd_quantize_b_32x32_neon( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { const int16x8_t neg_one = vdupq_n_s16(-1); uint16x8_t eob_max; int i; @@ -237,12 +236,13 @@ void vpx_highbd_quantize_b_32x32_neon( // High half has identical elements, but we can reconstruct it from the low // half by duplicating the 2nd element. So we only need to pass a 4x32-bit // vector - int32x4_t zbin = vrshrq_n_s32(vmovl_s16(vld1_s16(zbin_ptr)), 1); - int32x4_t round = vrshrq_n_s32(vmovl_s16(vld1_s16(round_ptr)), 1); + int32x4_t zbin = vrshrq_n_s32(vmovl_s16(vld1_s16(mb_plane->zbin)), 1); + int32x4_t round = vrshrq_n_s32(vmovl_s16(vld1_s16(mb_plane->round)), 1); // Extend the quant, quant_shift vectors to ones of 32-bit elements // scale to high-half, so we can use vqdmulhq_s32 - int32x4_t quant = vshlq_n_s32(vmovl_s16(vld1_s16(quant_ptr)), 15); - int32x4_t quant_shift = vshlq_n_s32(vmovl_s16(vld1_s16(quant_shift_ptr)), 16); + int32x4_t quant = vshlq_n_s32(vmovl_s16(vld1_s16(mb_plane->quant)), 15); + int32x4_t quant_shift = + vshlq_n_s32(vmovl_s16(vld1_s16(mb_plane->quant_shift)), 16); int32x4_t dequant = vmovl_s16(vld1_s16(dequant_ptr)); // Process first 8 values which include a dc component. @@ -300,8 +300,7 @@ void vpx_highbd_quantize_b_32x32_neon( vst1_lane_u16(eob_ptr, eob_max_2, 0); } #endif // __aarch64__ - // Need these here, else the compiler complains about mixing declarations and + // Need this here, else the compiler complains about mixing declarations and // code in C90 - (void)n_coeffs; (void)scan; } diff --git a/vpx_dsp/quantize.c b/vpx_dsp/quantize.c index 212db45c88..c4642812ad 100644 --- a/vpx_dsp/quantize.c +++ b/vpx_dsp/quantize.c @@ -272,14 +272,16 @@ void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, #if CONFIG_VP9_HIGHBITDEPTH void vpx_highbd_quantize_b_32x32_c( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { - const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], 1), - ROUND_POWER_OF_TWO(zbin_ptr[1], 1) }; + const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { + const intptr_t n_coeffs = 32 * 32; + const int zbins[2] = { ROUND_POWER_OF_TWO(mb_plane->zbin[0], 1), + ROUND_POWER_OF_TWO(mb_plane->zbin[1], 1) }; const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; + const int16_t *round_ptr = mb_plane->round; + const int16_t *quant_ptr = mb_plane->quant; + const int16_t *quant_shift_ptr = mb_plane->quant_shift; int idx = 0; int idx_arr[1024]; diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 6ac8c982ad..a326e2b34c 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -731,7 +731,7 @@ () add_proto qw/void vpx_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vpx_highbd_quantize_b neon sse2 avx2/; - add_proto qw/void vpx_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + add_proto qw/void vpx_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, const struct macroblock_plane * const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vpx_highbd_quantize_b_32x32 neon sse2 avx2/; } # CONFIG_VP9_HIGHBITDEPTH } # CONFIG_VP9_ENCODER diff --git a/vpx_dsp/x86/highbd_quantize_intrin_avx2.c b/vpx_dsp/x86/highbd_quantize_intrin_avx2.c index 8edddd637f..6041d7289a 100644 --- a/vpx_dsp/x86/highbd_quantize_intrin_avx2.c +++ b/vpx_dsp/x86/highbd_quantize_intrin_avx2.c @@ -11,6 +11,7 @@ #include #include "./vpx_dsp_rtcd.h" +#include "vp9/encoder/vp9_block.h" static VPX_FORCE_INLINE void init_one_qp(const __m128i *p, __m256i *qp) { const __m128i sign = _mm_srai_epi16(*p, 15); @@ -222,17 +223,17 @@ static VPX_FORCE_INLINE void quantize_b_32x32( } void vpx_highbd_quantize_b_32x32_avx2( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { const unsigned int step = 8; + intptr_t n_coeffs = 32 * 32; __m256i eob = _mm256_setzero_si256(); __m256i qp[5]; (void)scan; - init_qp(zbin_ptr, round_ptr, quant_ptr, dequant_ptr, quant_shift_ptr, qp, 1); + init_qp(mb_plane->zbin, mb_plane->round, mb_plane->quant, dequant_ptr, + mb_plane->quant_shift, qp, 1); quantize_b_32x32(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob); diff --git a/vpx_dsp/x86/highbd_quantize_intrin_sse2.c b/vpx_dsp/x86/highbd_quantize_intrin_sse2.c index ae1981a834..6a8f42b8a4 100644 --- a/vpx_dsp/x86/highbd_quantize_intrin_sse2.c +++ b/vpx_dsp/x86/highbd_quantize_intrin_sse2.c @@ -15,6 +15,7 @@ #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_mem/vpx_mem.h" #include "vpx_ports/mem.h" +#include "vp9/encoder/vp9_block.h" #if CONFIG_VP9_HIGHBITDEPTH void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count, @@ -93,18 +94,17 @@ void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count, } void vpx_highbd_quantize_b_32x32_sse2( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { __m128i zbins[2]; __m128i nzbins[2]; int idx = 0; int idx_arr[1024]; int i, eob = 0; - const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 1); - const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 1); + const intptr_t n_coeffs = 32 * 32; + const int zbin0_tmp = ROUND_POWER_OF_TWO(mb_plane->zbin[0], 1); + const int zbin1_tmp = ROUND_POWER_OF_TWO(mb_plane->zbin[1], 1); (void)scan; zbins[0] = _mm_set_epi32(zbin1_tmp, zbin1_tmp, zbin1_tmp, zbin0_tmp); @@ -140,10 +140,11 @@ void vpx_highbd_quantize_b_32x32_sse2( const int coeff = coeff_ptr[rc]; const int coeff_sign = (coeff >> 31); const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - const int64_t tmp1 = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); - const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; + const int64_t tmp1 = + abs_coeff + ROUND_POWER_OF_TWO(mb_plane->round[rc != 0], 1); + const int64_t tmp2 = ((tmp1 * mb_plane->quant[rc != 0]) >> 16) + tmp1; const uint32_t abs_qcoeff = - (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15); + (uint32_t)((tmp2 * mb_plane->quant_shift[rc != 0]) >> 15); qcoeff_ptr[rc] = (int)(abs_qcoeff ^ (uint32_t)coeff_sign) - coeff_sign; dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob; From 14fc40040ff30486c45111056db44ee18590a24a Mon Sep 17 00:00:00 2001 From: Johann Date: Mon, 14 Nov 2022 16:47:33 +0900 Subject: [PATCH 0636/1000] quantize: use scan_order instead of passing scan/iscan further reduces the arguments for the 32x32. This will be applied to the base version as well. Change-Id: I25a162b5248b14af53d9e20c6a7fa2a77028a6d1 --- test/vp9_quantize_test.cc | 41 +++++++++++------------ vp9/common/vp9_scan.h | 2 +- vp9/encoder/vp9_encodemb.c | 8 ++--- vpx_dsp/arm/highbd_quantize_neon.c | 7 ++-- vpx_dsp/arm/quantize_neon.c | 7 ++-- vpx_dsp/quantize.c | 9 ++--- vpx_dsp/vpx_dsp_rtcd_defs.pl | 5 +-- vpx_dsp/x86/highbd_quantize_intrin_avx2.c | 5 +-- vpx_dsp/x86/highbd_quantize_intrin_sse2.c | 5 +-- vpx_dsp/x86/quantize_avx.c | 7 ++-- vpx_dsp/x86/quantize_avx2.c | 5 +-- vpx_dsp/x86/quantize_ssse3.c | 6 ++-- 12 files changed, 55 insertions(+), 52 deletions(-) diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc index e533b2509c..630a0053ab 100644 --- a/test/vp9_quantize_test.cc +++ b/test/vp9_quantize_test.cc @@ -42,7 +42,7 @@ typedef void (*QuantizeFunc)(const tran_low_t *coeff, intptr_t count, const macroblock_plane *const mb_plane, tran_low_t *qcoeff, tran_low_t *dqcoeff, const int16_t *dequant, uint16_t *eob, - const int16_t *scan, const int16_t *iscan); + const struct scan_order *const scan_order); typedef std::tuple QuantizeParam; @@ -60,9 +60,10 @@ template void QuantWrapper(const tran_low_t *coeff, intptr_t count, const macroblock_plane *const mb_plane, tran_low_t *qcoeff, tran_low_t *dqcoeff, const int16_t *dequant, uint16_t *eob, - const int16_t *scan, const int16_t *iscan) { + const struct scan_order *const scan_order) { fn(coeff, count, mb_plane->zbin, mb_plane->round, mb_plane->quant, - mb_plane->quant_shift, qcoeff, dqcoeff, dequant, eob, scan, iscan); + mb_plane->quant_shift, qcoeff, dqcoeff, dequant, eob, scan_order->scan, + scan_order->iscan); } // Wrapper for 32x32 version which does not use count @@ -70,16 +71,16 @@ typedef void (*Quantize32x32Func)(const tran_low_t *coeff, const macroblock_plane *const mb_plane, tran_low_t *qcoeff, tran_low_t *dqcoeff, const int16_t *dequant, uint16_t *eob, - const int16_t *scan, const int16_t *iscan); + const struct scan_order *const scan_order); template void Quant32x32Wrapper(const tran_low_t *coeff, intptr_t count, const macroblock_plane *const mb_plane, tran_low_t *qcoeff, tran_low_t *dqcoeff, const int16_t *dequant, uint16_t *eob, - const int16_t *scan, const int16_t *iscan) { + const struct scan_order *const scan_order) { (void)count; - fn(coeff, mb_plane, qcoeff, dqcoeff, dequant, eob, scan, iscan); + fn(coeff, mb_plane, qcoeff, dqcoeff, dequant, eob, scan_order); } // Wrapper for FP version which does not use zbin or quant_shift. @@ -93,9 +94,9 @@ template void QuantFPWrapper(const tran_low_t *coeff, intptr_t count, const macroblock_plane *const mb_plane, tran_low_t *qcoeff, tran_low_t *dqcoeff, const int16_t *dequant, uint16_t *eob, - const int16_t *scan, const int16_t *iscan) { + const struct scan_order *const scan_order) { fn(coeff, count, mb_plane->round_fp, mb_plane->quant_fp, qcoeff, dqcoeff, - dequant, eob, scan, iscan); + dequant, eob, scan_order->scan, scan_order->iscan); } void GenerateHelperArrays(ACMRandom *rnd, int16_t *zbin, int16_t *round, @@ -225,7 +226,7 @@ class VP9QuantizeTest : public VP9QuantizeBase, void VP9QuantizeTest::Run() { quantize_op_(coeff_.TopLeftPixel(), count_, &mb_plane_, qcoeff_.TopLeftPixel(), dqcoeff_.TopLeftPixel(), dequant_ptr_, - &eob_, scan_->scan, scan_->iscan); + &eob_, scan_); } void VP9QuantizeTest::Speed(bool is_median) { @@ -298,7 +299,7 @@ void VP9QuantizeTest::Speed(bool is_median) { ref_quantize_op_(coeff_.TopLeftPixel(), count_, &mb_plane_, ref_qcoeff.TopLeftPixel(), ref_dqcoeff.TopLeftPixel(), dequant_ptr_, &ref_eob, - scan_->scan, scan_->iscan); + scan_); } vpx_usec_timer_mark(&timer); @@ -306,7 +307,7 @@ void VP9QuantizeTest::Speed(bool is_median) { for (int n = 0; n < kNumTests; ++n) { quantize_op_(coeff_.TopLeftPixel(), count_, &mb_plane_, qcoeff_.TopLeftPixel(), dqcoeff_.TopLeftPixel(), - dequant_ptr_, &eob_, scan_->scan, scan_->iscan); + dequant_ptr_, &eob_, scan_); } vpx_usec_timer_mark(&simd_timer); @@ -447,12 +448,11 @@ TEST_P(VP9QuantizeTest, OperationCheck) { quant_fp_ptr_); ref_quantize_op_(coeff_.TopLeftPixel(), count_, &mb_plane_, ref_qcoeff.TopLeftPixel(), ref_dqcoeff.TopLeftPixel(), - dequant_ptr_, &ref_eob, scan_->scan, scan_->iscan); + dequant_ptr_, &ref_eob, scan_); - ASM_REGISTER_STATE_CHECK(quantize_op_(coeff_.TopLeftPixel(), count_, - &mb_plane_, qcoeff_.TopLeftPixel(), - dqcoeff_.TopLeftPixel(), dequant_ptr_, - &eob_, scan_->scan, scan_->iscan)); + ASM_REGISTER_STATE_CHECK(quantize_op_( + coeff_.TopLeftPixel(), count_, &mb_plane_, qcoeff_.TopLeftPixel(), + dqcoeff_.TopLeftPixel(), dequant_ptr_, &eob_, scan_)); EXPECT_TRUE(qcoeff_.CheckValues(ref_qcoeff)); EXPECT_TRUE(dqcoeff_.CheckValues(ref_dqcoeff)); @@ -504,12 +504,11 @@ TEST_P(VP9QuantizeTest, EOBCheck) { quant_fp_ptr_); ref_quantize_op_(coeff_.TopLeftPixel(), count_, &mb_plane_, ref_qcoeff.TopLeftPixel(), ref_dqcoeff.TopLeftPixel(), - dequant_ptr_, &ref_eob, scan_->scan, scan_->iscan); + dequant_ptr_, &ref_eob, scan_); - ASM_REGISTER_STATE_CHECK(quantize_op_(coeff_.TopLeftPixel(), count_, - &mb_plane_, qcoeff_.TopLeftPixel(), - dqcoeff_.TopLeftPixel(), dequant_ptr_, - &eob_, scan_->scan, scan_->iscan)); + ASM_REGISTER_STATE_CHECK(quantize_op_( + coeff_.TopLeftPixel(), count_, &mb_plane_, qcoeff_.TopLeftPixel(), + dqcoeff_.TopLeftPixel(), dequant_ptr_, &eob_, scan_)); EXPECT_TRUE(qcoeff_.CheckValues(ref_qcoeff)); EXPECT_TRUE(dqcoeff_.CheckValues(ref_dqcoeff)); diff --git a/vp9/common/vp9_scan.h b/vp9/common/vp9_scan.h index 72a9a5ec47..efa0e23365 100644 --- a/vp9/common/vp9_scan.h +++ b/vp9/common/vp9_scan.h @@ -23,7 +23,7 @@ extern "C" { #define MAX_NEIGHBORS 2 -typedef struct { +typedef struct scan_order { const int16_t *scan; const int16_t *iscan; const int16_t *neighbors; diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c index 6a5f628808..515c7a9031 100644 --- a/vp9/encoder/vp9_encodemb.c +++ b/vp9/encoder/vp9_encodemb.c @@ -512,7 +512,7 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col, case TX_32X32: highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); vpx_highbd_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, eob, - scan_order->scan, scan_order->iscan); + scan_order); break; case TX_16X16: vpx_highbd_fdct16x16(src_diff, coeff, diff_stride); @@ -542,7 +542,7 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col, case TX_32X32: fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); vpx_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, eob, - scan_order->scan, scan_order->iscan); + scan_order); break; case TX_16X16: vpx_fdct16x16(src_diff, coeff, diff_stride); @@ -856,7 +856,7 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, src_stride, dst, dst_stride, xd->bd); highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); vpx_highbd_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, - eob, scan_order->scan, scan_order->iscan); + eob, scan_order); } if (args->enable_coeff_opt && !x->skip_recode) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; @@ -946,7 +946,7 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, dst_stride); fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); vpx_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, eob, - scan_order->scan, scan_order->iscan); + scan_order); } if (args->enable_coeff_opt && !x->skip_recode) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; diff --git a/vpx_dsp/arm/highbd_quantize_neon.c b/vpx_dsp/arm/highbd_quantize_neon.c index 3b1fec3321..5a40f1284e 100644 --- a/vpx_dsp/arm/highbd_quantize_neon.c +++ b/vpx_dsp/arm/highbd_quantize_neon.c @@ -13,6 +13,7 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/arm/mem_neon.h" +#include "vp9/common/vp9_scan.h" #include "vp9/encoder/vp9_block.h" static VPX_FORCE_INLINE void highbd_calculate_dqcoeff_and_store( @@ -227,10 +228,11 @@ static VPX_FORCE_INLINE int16x8_t highbd_quantize_b_32x32_neon( void vpx_highbd_quantize_b_32x32_neon( const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { + uint16_t *eob_ptr, const struct scan_order *const scan_order) { const int16x8_t neg_one = vdupq_n_s16(-1); uint16x8_t eob_max; int i; + const int16_t *iscan = scan_order->iscan; // Only the first element of each vector is DC. // High half has identical elements, but we can reconstruct it from the low @@ -300,7 +302,4 @@ void vpx_highbd_quantize_b_32x32_neon( vst1_lane_u16(eob_ptr, eob_max_2, 0); } #endif // __aarch64__ - // Need this here, else the compiler complains about mixing declarations and - // code in C90 - (void)scan; } diff --git a/vpx_dsp/arm/quantize_neon.c b/vpx_dsp/arm/quantize_neon.c index e81738a7bb..84b6d8c79f 100644 --- a/vpx_dsp/arm/quantize_neon.c +++ b/vpx_dsp/arm/quantize_neon.c @@ -14,6 +14,7 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/arm/mem_neon.h" +#include "vp9/common/vp9_scan.h" #include "vp9/encoder/vp9_block.h" static INLINE void calculate_dqcoeff_and_store(const int16x8_t qcoeff, @@ -218,10 +219,11 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct scan_order *const scan_order) { const int16x8_t neg_one = vdupq_n_s16(-1); uint16x8_t eob_max; int i; + const int16_t *iscan = scan_order->iscan; // Only the first element of each vector is DC. int16x8_t zbin = vrshrq_n_s16(vld1q_s16(mb_plane->zbin), 1); @@ -285,7 +287,4 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, vst1_lane_u16(eob_ptr, eob_max_2, 0); } #endif // __aarch64__ - // Need these here, else the compiler complains about mixing declarations and - // code in C90 - (void)scan; } diff --git a/vpx_dsp/quantize.c b/vpx_dsp/quantize.c index c4642812ad..f51bf253e7 100644 --- a/vpx_dsp/quantize.c +++ b/vpx_dsp/quantize.c @@ -14,6 +14,7 @@ #include "vpx_dsp/quantize.h" #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_mem/vpx_mem.h" +#include "vp9/common/vp9_scan.h" #include "vp9/encoder/vp9_block.h" void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, @@ -213,7 +214,7 @@ void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct scan_order *const scan_order) { const int n_coeffs = 32 * 32; const int zbins[2] = { ROUND_POWER_OF_TWO(mb_plane->zbin[0], 1), ROUND_POWER_OF_TWO(mb_plane->zbin[1], 1) }; @@ -221,11 +222,11 @@ void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, const int16_t *round_ptr = mb_plane->round; const int16_t *quant_ptr = mb_plane->quant; const int16_t *quant_shift_ptr = mb_plane->quant_shift; + const int16_t *scan = scan_order->scan; int idx = 0; int idx_arr[32 * 32 /* n_coeffs */]; int i, eob = -1; - (void)iscan; memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); @@ -274,7 +275,7 @@ void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, void vpx_highbd_quantize_b_32x32_c( const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { + uint16_t *eob_ptr, const struct scan_order *const scan_order) { const intptr_t n_coeffs = 32 * 32; const int zbins[2] = { ROUND_POWER_OF_TWO(mb_plane->zbin[0], 1), ROUND_POWER_OF_TWO(mb_plane->zbin[1], 1) }; @@ -282,11 +283,11 @@ void vpx_highbd_quantize_b_32x32_c( const int16_t *round_ptr = mb_plane->round; const int16_t *quant_ptr = mb_plane->quant; const int16_t *quant_shift_ptr = mb_plane->quant_shift; + const int16_t *scan = scan_order->scan; int idx = 0; int idx_arr[1024]; int i, eob = -1; - (void)iscan; memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index a326e2b34c..b6f5d4a099 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -19,6 +19,7 @@ () #include "vpx_dsp/vpx_filter.h" #if CONFIG_VP9_ENCODER struct macroblock_plane; + struct scan_order; #endif EOF @@ -724,14 +725,14 @@ () add_proto qw/void vpx_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vpx_quantize_b neon sse2 ssse3 avx avx2 vsx lsx/; - add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, const struct macroblock_plane * const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, const struct macroblock_plane * const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct scan_order *const scan_order"; specialize qw/vpx_quantize_b_32x32 neon ssse3 avx avx2 vsx lsx/; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vpx_highbd_quantize_b neon sse2 avx2/; - add_proto qw/void vpx_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, const struct macroblock_plane * const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + add_proto qw/void vpx_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, const struct macroblock_plane * const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct scan_order *const scan_order"; specialize qw/vpx_highbd_quantize_b_32x32 neon sse2 avx2/; } # CONFIG_VP9_HIGHBITDEPTH } # CONFIG_VP9_ENCODER diff --git a/vpx_dsp/x86/highbd_quantize_intrin_avx2.c b/vpx_dsp/x86/highbd_quantize_intrin_avx2.c index 6041d7289a..bfd7b2e23e 100644 --- a/vpx_dsp/x86/highbd_quantize_intrin_avx2.c +++ b/vpx_dsp/x86/highbd_quantize_intrin_avx2.c @@ -11,6 +11,7 @@ #include #include "./vpx_dsp_rtcd.h" +#include "vp9/common/vp9_scan.h" #include "vp9/encoder/vp9_block.h" static VPX_FORCE_INLINE void init_one_qp(const __m128i *p, __m256i *qp) { @@ -225,12 +226,12 @@ static VPX_FORCE_INLINE void quantize_b_32x32( void vpx_highbd_quantize_b_32x32_avx2( const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { + uint16_t *eob_ptr, const struct scan_order *const scan_order) { const unsigned int step = 8; intptr_t n_coeffs = 32 * 32; + const int16_t *iscan = scan_order->iscan; __m256i eob = _mm256_setzero_si256(); __m256i qp[5]; - (void)scan; init_qp(mb_plane->zbin, mb_plane->round, mb_plane->quant, dequant_ptr, mb_plane->quant_shift, qp, 1); diff --git a/vpx_dsp/x86/highbd_quantize_intrin_sse2.c b/vpx_dsp/x86/highbd_quantize_intrin_sse2.c index 6a8f42b8a4..58d5a3a5ff 100644 --- a/vpx_dsp/x86/highbd_quantize_intrin_sse2.c +++ b/vpx_dsp/x86/highbd_quantize_intrin_sse2.c @@ -15,6 +15,7 @@ #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_mem/vpx_mem.h" #include "vpx_ports/mem.h" +#include "vp9/common/vp9_scan.h" #include "vp9/encoder/vp9_block.h" #if CONFIG_VP9_HIGHBITDEPTH @@ -96,16 +97,16 @@ void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count, void vpx_highbd_quantize_b_32x32_sse2( const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { + uint16_t *eob_ptr, const struct scan_order *const scan_order) { __m128i zbins[2]; __m128i nzbins[2]; int idx = 0; int idx_arr[1024]; int i, eob = 0; const intptr_t n_coeffs = 32 * 32; + const int16_t *iscan = scan_order->iscan; const int zbin0_tmp = ROUND_POWER_OF_TWO(mb_plane->zbin[0], 1); const int zbin1_tmp = ROUND_POWER_OF_TWO(mb_plane->zbin[1], 1); - (void)scan; zbins[0] = _mm_set_epi32(zbin1_tmp, zbin1_tmp, zbin1_tmp, zbin0_tmp); zbins[1] = _mm_set1_epi32(zbin1_tmp); diff --git a/vpx_dsp/x86/quantize_avx.c b/vpx_dsp/x86/quantize_avx.c index d52f6c6644..d05a937be1 100644 --- a/vpx_dsp/x86/quantize_avx.c +++ b/vpx_dsp/x86/quantize_avx.c @@ -19,6 +19,8 @@ #include "vpx_dsp/x86/bitdepth_conversion_sse2.h" #include "vpx_dsp/x86/quantize_sse2.h" #include "vpx_dsp/x86/quantize_ssse3.h" +#include "vp9/common/vp9_scan.h" +#include "vp9/encoder/vp9_block.h" void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, @@ -144,10 +146,11 @@ void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct scan_order *const scan_order) { const __m128i zero = _mm_setzero_si128(); const __m256i big_zero = _mm256_setzero_si256(); int index; + const int16_t *iscan = scan_order->iscan; __m128i zbin, round, quant, dequant, shift; __m128i coeff0, coeff1; @@ -156,8 +159,6 @@ void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, __m128i all_zero; __m128i eob = zero, eob0; - (void)scan; - load_b_values32x32(mb_plane, &zbin, &round, &quant, dequant_ptr, &dequant, &shift); diff --git a/vpx_dsp/x86/quantize_avx2.c b/vpx_dsp/x86/quantize_avx2.c index a8412c5b8e..1c82542ae6 100644 --- a/vpx_dsp/x86/quantize_avx2.c +++ b/vpx_dsp/x86/quantize_avx2.c @@ -13,6 +13,7 @@ #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" +#include "vp9/common/vp9_scan.h" #include "vp9/encoder/vp9_block.h" static VPX_FORCE_INLINE void load_b_values_avx2( @@ -255,11 +256,11 @@ void vpx_quantize_b_32x32_avx2(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct scan_order *const scan_order) { __m256i v_zbin, v_round, v_quant, v_dequant, v_quant_shift; __m256i v_eobmax = _mm256_setzero_si256(); intptr_t count; - (void)scan; + const int16_t *iscan = scan_order->iscan; load_b_values_avx2(mb_plane->zbin, &v_zbin, mb_plane->round, &v_round, mb_plane->quant, &v_quant, dequant_ptr, &v_dequant, diff --git a/vpx_dsp/x86/quantize_ssse3.c b/vpx_dsp/x86/quantize_ssse3.c index 6fe54d7d98..6401b2865d 100644 --- a/vpx_dsp/x86/quantize_ssse3.c +++ b/vpx_dsp/x86/quantize_ssse3.c @@ -16,6 +16,7 @@ #include "vpx_dsp/x86/bitdepth_conversion_sse2.h" #include "vpx_dsp/x86/quantize_sse2.h" #include "vpx_dsp/x86/quantize_ssse3.h" +#include "vp9/common/vp9_scan.h" #include "vp9/encoder/vp9_block.h" void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, @@ -112,9 +113,10 @@ void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct scan_order *const scan_order) { const __m128i zero = _mm_setzero_si128(); int index; + const int16_t *iscan = scan_order->iscan; __m128i zbin, round, quant, dequant, shift; __m128i coeff0, coeff1; @@ -123,8 +125,6 @@ void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, __m128i all_zero; __m128i eob = zero, eob0; - (void)scan; - load_b_values32x32(mb_plane, &zbin, &round, &quant, dequant_ptr, &dequant, &shift); From 096cd0ba8ab2126682a9f6f01d6c8c0084d2f8ab Mon Sep 17 00:00:00 2001 From: Salome Thirot Date: Mon, 27 Feb 2023 17:58:18 +0000 Subject: [PATCH 0637/1000] Optimize Neon implementation of high bitdepth MSE functions Currently MSE functions just call the variance helpers but don't actually use the computed sum. This patch adds dedicated helpers to perform the computation of sse. Add the corresponding tests as well. Change-Id: I96a8590e3410e84d77f7187344688e02efe03902 --- test/variance_test.cc | 16 +++ vpx_dsp/arm/highbd_variance_neon.c | 197 ++++++++++++++++++++++------- 2 files changed, 169 insertions(+), 44 deletions(-) diff --git a/test/variance_test.cc b/test/variance_test.cc index a68cfad516..1359bc4baf 100644 --- a/test/variance_test.cc +++ b/test/variance_test.cc @@ -1507,6 +1507,22 @@ INSTANTIATE_TEST_SUITE_P( SubpelAvgVarianceParams(2, 2, &vpx_sub_pixel_avg_variance4x4_neon, 0))); #if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_SUITE_P( + NEON, VpxHBDMseTest, + ::testing::Values( + MseParams(4, 4, &vpx_highbd_12_mse16x16_neon, VPX_BITS_12), + MseParams(4, 3, &vpx_highbd_12_mse16x8_neon, VPX_BITS_12), + MseParams(3, 4, &vpx_highbd_12_mse8x16_neon, VPX_BITS_12), + MseParams(3, 3, &vpx_highbd_12_mse8x8_neon, VPX_BITS_12), + MseParams(4, 4, &vpx_highbd_10_mse16x16_neon, VPX_BITS_10), + MseParams(4, 3, &vpx_highbd_10_mse16x8_neon, VPX_BITS_10), + MseParams(3, 4, &vpx_highbd_10_mse8x16_neon, VPX_BITS_10), + MseParams(3, 3, &vpx_highbd_10_mse8x8_neon, VPX_BITS_10), + MseParams(4, 4, &vpx_highbd_8_mse16x16_neon, VPX_BITS_8), + MseParams(4, 3, &vpx_highbd_8_mse16x8_neon, VPX_BITS_8), + MseParams(3, 4, &vpx_highbd_8_mse8x16_neon, VPX_BITS_8), + MseParams(3, 3, &vpx_highbd_8_mse8x8_neon, VPX_BITS_8))); + INSTANTIATE_TEST_SUITE_P( NEON, VpxHBDVarianceTest, ::testing::Values( diff --git a/vpx_dsp/arm/highbd_variance_neon.c b/vpx_dsp/arm/highbd_variance_neon.c index 89bd5c579d..d0b366c95b 100644 --- a/vpx_dsp/arm/highbd_variance_neon.c +++ b/vpx_dsp/arm/highbd_variance_neon.c @@ -351,50 +351,159 @@ HBD_VARIANCE_WXH_12_XLARGE_NEON(64, 64) *sum = (int)ROUND_POWER_OF_TWO(sum_long, 4); \ } -#define HIGHBD_MSE(w, h) \ - uint32_t vpx_highbd_8_mse##w##x##h##_neon( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride, uint32_t *sse) { \ - uint64_t sse_long = 0; \ - int64_t sum_long = 0; \ - uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ - uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ - highbd_variance_##w##xh_neon(src, src_stride, ref, ref_stride, h, \ - &sse_long, &sum_long); \ - *sse = (uint32_t)sse_long; \ - return *sse; \ - } \ - \ - uint32_t vpx_highbd_10_mse##w##x##h##_neon( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride, uint32_t *sse) { \ - uint64_t sse_long = 0; \ - int64_t sum_long = 0; \ - uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ - uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ - highbd_variance_##w##xh_neon(src, src_stride, ref, ref_stride, h, \ - &sse_long, &sum_long); \ - *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4); \ - return *sse; \ - } \ - \ - uint32_t vpx_highbd_12_mse##w##x##h##_neon( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride, uint32_t *sse) { \ - uint64_t sse_long = 0; \ - int64_t sum_long = 0; \ - uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ - uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ - highbd_variance_##w##xh_neon(src, src_stride, ref, ref_stride, h, \ - &sse_long, &sum_long); \ - *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8); \ - return *sse; \ - } - HIGHBD_GET_VAR(8) HIGHBD_GET_VAR(16) -HIGHBD_MSE(16, 16) -HIGHBD_MSE(16, 8) -HIGHBD_MSE(8, 16) -HIGHBD_MSE(8, 8) +static INLINE uint32_t highbd_mse_wxh_neon(const uint16_t *src_ptr, + int src_stride, + const uint16_t *ref_ptr, + int ref_stride, int w, int h, + unsigned int *sse) { + uint32x4_t sse_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = h; + do { + int j = 0; + do { + uint16x8_t s = vld1q_u16(src_ptr + j); + uint16x8_t r = vld1q_u16(ref_ptr + j); + + uint16x8_t diff = vabdq_u16(s, r); + + sse_u32[0] = + vmlal_u16(sse_u32[0], vget_low_u16(diff), vget_low_u16(diff)); + sse_u32[1] = + vmlal_u16(sse_u32[1], vget_high_u16(diff), vget_high_u16(diff)); + + j += 8; + } while (j < w); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--i != 0); + + *sse = horizontal_add_uint32x4(vaddq_u32(sse_u32[0], sse_u32[1])); + return *sse; +} + +#if defined(__ARM_FEATURE_DOTPROD) + +static INLINE uint32_t highbd_mse8_8xh_neon(const uint16_t *src_ptr, + int src_stride, + const uint16_t *ref_ptr, + int ref_stride, int h, + unsigned int *sse) { + uint32x4_t sse_u32 = vdupq_n_u32(0); + + int i = h / 2; + do { + uint16x8_t s0, s1, r0, r1; + uint8x16_t s, r, diff; + + s0 = vld1q_u16(src_ptr); + src_ptr += src_stride; + s1 = vld1q_u16(src_ptr); + src_ptr += src_stride; + r0 = vld1q_u16(ref_ptr); + ref_ptr += ref_stride; + r1 = vld1q_u16(ref_ptr); + ref_ptr += ref_stride; + + s = vcombine_u8(vmovn_u16(s0), vmovn_u16(s1)); + r = vcombine_u8(vmovn_u16(r0), vmovn_u16(r1)); + + diff = vabdq_u8(s, r); + sse_u32 = vdotq_u32(sse_u32, diff, diff); + } while (--i != 0); + + *sse = horizontal_add_uint32x4(sse_u32); + return *sse; +} + +static INLINE uint32_t highbd_mse8_16xh_neon(const uint16_t *src_ptr, + int src_stride, + const uint16_t *ref_ptr, + int ref_stride, int h, + unsigned int *sse) { + uint32x4_t sse_u32 = vdupq_n_u32(0); + + int i = h; + do { + uint16x8_t s0, s1, r0, r1; + uint8x16_t s, r, diff; + + s0 = vld1q_u16(src_ptr); + s1 = vld1q_u16(src_ptr + 8); + r0 = vld1q_u16(ref_ptr); + r1 = vld1q_u16(ref_ptr + 8); + + s = vcombine_u8(vmovn_u16(s0), vmovn_u16(s1)); + r = vcombine_u8(vmovn_u16(r0), vmovn_u16(r1)); + + diff = vabdq_u8(s, r); + sse_u32 = vdotq_u32(sse_u32, diff, diff); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--i != 0); + + *sse = horizontal_add_uint32x4(sse_u32); + return *sse; +} + +#else // !defined(__ARM_FEATURE_DOTPROD) + +static INLINE uint32_t highbd_mse8_8xh_neon(const uint16_t *src_ptr, + int src_stride, + const uint16_t *ref_ptr, + int ref_stride, int h, + unsigned int *sse) { + return highbd_mse_wxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 8, h, + sse); +} + +static INLINE uint32_t highbd_mse8_16xh_neon(const uint16_t *src_ptr, + int src_stride, + const uint16_t *ref_ptr, + int ref_stride, int h, + unsigned int *sse) { + return highbd_mse_wxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 16, h, + sse); +} + +#endif // defined(__ARM_FEATURE_DOTPROD) + +#define HIGHBD_MSE_WXH_NEON(w, h) \ + uint32_t vpx_highbd_8_mse##w##x##h##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_mse8_##w##xh_neon(src, src_stride, ref, ref_stride, h, sse); \ + return *sse; \ + } \ + \ + uint32_t vpx_highbd_10_mse##w##x##h##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_mse_wxh_neon(src, src_stride, ref, ref_stride, w, h, sse); \ + *sse = ROUND_POWER_OF_TWO(*sse, 4); \ + return *sse; \ + } \ + \ + uint32_t vpx_highbd_12_mse##w##x##h##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_mse_wxh_neon(src, src_stride, ref, ref_stride, w, h, sse); \ + *sse = ROUND_POWER_OF_TWO(*sse, 8); \ + return *sse; \ + } + +HIGHBD_MSE_WXH_NEON(16, 16) +HIGHBD_MSE_WXH_NEON(16, 8) +HIGHBD_MSE_WXH_NEON(8, 16) +HIGHBD_MSE_WXH_NEON(8, 8) From 7478b7e4e481562a4a13f233acb66a60462e1934 Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 1 Mar 2023 12:14:51 -0800 Subject: [PATCH 0638/1000] Revert "Implement highbd_d63_predictor using Neon" This reverts commit 7cdf139e3d6237386e0f93bdb0bdc1b459c663bf. This causes failures in the VP9/ExternalFrameBufferMD5Test and VP9/TestVectorTest.MD5Match tests in both armv7 and aarch64 builds. Change-Id: I7ac4ba0ddc70e7e7860df9f962e6658defe1cdd5 --- test/test_intra_pred_speed.cc | 12 +- test/vp9_intrapred_test.cc | 24 --- vpx_dsp/arm/highbd_intrapred_neon.c | 278 ---------------------------- vpx_dsp/vpx_dsp_rtcd_defs.pl | 8 +- 4 files changed, 8 insertions(+), 314 deletions(-) diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc index e721a459ad..24af471eaa 100644 --- a/test/test_intra_pred_speed.cc +++ b/test/test_intra_pred_speed.cc @@ -566,16 +566,14 @@ HIGHBD_INTRA_PRED_TEST( vpx_highbd_dc_128_predictor_4x4_neon, vpx_highbd_v_predictor_4x4_neon, vpx_highbd_h_predictor_4x4_neon, vpx_highbd_d45_predictor_4x4_neon, vpx_highbd_d135_predictor_4x4_neon, vpx_highbd_d117_predictor_4x4_neon, - nullptr, nullptr, vpx_highbd_d63_predictor_4x4_neon, - vpx_highbd_tm_predictor_4x4_neon) + nullptr, nullptr, nullptr, vpx_highbd_tm_predictor_4x4_neon) HIGHBD_INTRA_PRED_TEST( NEON, TestHighbdIntraPred8, vpx_highbd_dc_predictor_8x8_neon, vpx_highbd_dc_left_predictor_8x8_neon, vpx_highbd_dc_top_predictor_8x8_neon, vpx_highbd_dc_128_predictor_8x8_neon, vpx_highbd_v_predictor_8x8_neon, vpx_highbd_h_predictor_8x8_neon, vpx_highbd_d45_predictor_8x8_neon, vpx_highbd_d135_predictor_8x8_neon, vpx_highbd_d117_predictor_8x8_neon, - nullptr, nullptr, vpx_highbd_d63_predictor_8x8_neon, - vpx_highbd_tm_predictor_8x8_neon) + nullptr, nullptr, nullptr, vpx_highbd_tm_predictor_8x8_neon) HIGHBD_INTRA_PRED_TEST( NEON, TestHighbdIntraPred16, vpx_highbd_dc_predictor_16x16_neon, vpx_highbd_dc_left_predictor_16x16_neon, @@ -583,8 +581,7 @@ HIGHBD_INTRA_PRED_TEST( vpx_highbd_dc_128_predictor_16x16_neon, vpx_highbd_v_predictor_16x16_neon, vpx_highbd_h_predictor_16x16_neon, vpx_highbd_d45_predictor_16x16_neon, vpx_highbd_d135_predictor_16x16_neon, vpx_highbd_d117_predictor_16x16_neon, - nullptr, nullptr, vpx_highbd_d63_predictor_16x16_neon, - vpx_highbd_tm_predictor_16x16_neon) + nullptr, nullptr, nullptr, vpx_highbd_tm_predictor_16x16_neon) HIGHBD_INTRA_PRED_TEST( NEON, TestHighbdIntraPred32, vpx_highbd_dc_predictor_32x32_neon, vpx_highbd_dc_left_predictor_32x32_neon, @@ -592,8 +589,7 @@ HIGHBD_INTRA_PRED_TEST( vpx_highbd_dc_128_predictor_32x32_neon, vpx_highbd_v_predictor_32x32_neon, vpx_highbd_h_predictor_32x32_neon, vpx_highbd_d45_predictor_32x32_neon, vpx_highbd_d135_predictor_32x32_neon, vpx_highbd_d117_predictor_32x32_neon, - nullptr, nullptr, vpx_highbd_d63_predictor_32x32_neon, - vpx_highbd_tm_predictor_32x32_neon) + nullptr, nullptr, nullptr, vpx_highbd_tm_predictor_32x32_neon) #endif // HAVE_NEON #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/test/vp9_intrapred_test.cc b/test/vp9_intrapred_test.cc index c4e0e78ac5..83e371df6e 100644 --- a/test/vp9_intrapred_test.cc +++ b/test/vp9_intrapred_test.cc @@ -848,14 +848,6 @@ INSTANTIATE_TEST_SUITE_P( &vpx_highbd_d45_predictor_16x16_c, 16, 8), HighbdIntraPredParam(&vpx_highbd_d45_predictor_32x32_neon, &vpx_highbd_d45_predictor_32x32_c, 32, 8), - HighbdIntraPredParam(&vpx_highbd_d63_predictor_4x4_neon, - &vpx_highbd_d63_predictor_4x4_c, 4, 8), - HighbdIntraPredParam(&vpx_highbd_d63_predictor_8x8_neon, - &vpx_highbd_d63_predictor_8x8_c, 8, 8), - HighbdIntraPredParam(&vpx_highbd_d63_predictor_16x16_neon, - &vpx_highbd_d63_predictor_16x16_c, 16, 8), - HighbdIntraPredParam(&vpx_highbd_d63_predictor_32x32_neon, - &vpx_highbd_d63_predictor_32x32_c, 32, 8), HighbdIntraPredParam(&vpx_highbd_d117_predictor_4x4_neon, &vpx_highbd_d117_predictor_4x4_c, 4, 8), HighbdIntraPredParam(&vpx_highbd_d117_predictor_8x8_neon, @@ -940,14 +932,6 @@ INSTANTIATE_TEST_SUITE_P( &vpx_highbd_d45_predictor_16x16_c, 16, 10), HighbdIntraPredParam(&vpx_highbd_d45_predictor_32x32_neon, &vpx_highbd_d45_predictor_32x32_c, 32, 10), - HighbdIntraPredParam(&vpx_highbd_d63_predictor_4x4_neon, - &vpx_highbd_d63_predictor_4x4_c, 4, 10), - HighbdIntraPredParam(&vpx_highbd_d63_predictor_8x8_neon, - &vpx_highbd_d63_predictor_8x8_c, 8, 10), - HighbdIntraPredParam(&vpx_highbd_d63_predictor_16x16_neon, - &vpx_highbd_d63_predictor_16x16_c, 16, 10), - HighbdIntraPredParam(&vpx_highbd_d63_predictor_32x32_neon, - &vpx_highbd_d63_predictor_32x32_c, 32, 10), HighbdIntraPredParam(&vpx_highbd_d117_predictor_4x4_neon, &vpx_highbd_d117_predictor_4x4_c, 4, 10), HighbdIntraPredParam(&vpx_highbd_d117_predictor_8x8_neon, @@ -1032,14 +1016,6 @@ INSTANTIATE_TEST_SUITE_P( &vpx_highbd_d45_predictor_16x16_c, 16, 12), HighbdIntraPredParam(&vpx_highbd_d45_predictor_32x32_neon, &vpx_highbd_d45_predictor_32x32_c, 32, 12), - HighbdIntraPredParam(&vpx_highbd_d63_predictor_4x4_neon, - &vpx_highbd_d63_predictor_4x4_c, 4, 12), - HighbdIntraPredParam(&vpx_highbd_d63_predictor_8x8_neon, - &vpx_highbd_d63_predictor_8x8_c, 8, 12), - HighbdIntraPredParam(&vpx_highbd_d63_predictor_16x16_neon, - &vpx_highbd_d63_predictor_16x16_c, 16, 12), - HighbdIntraPredParam(&vpx_highbd_d63_predictor_32x32_neon, - &vpx_highbd_d63_predictor_32x32_c, 32, 12), HighbdIntraPredParam(&vpx_highbd_d117_predictor_4x4_neon, &vpx_highbd_d117_predictor_4x4_c, 4, 10), HighbdIntraPredParam(&vpx_highbd_d117_predictor_8x8_neon, diff --git a/vpx_dsp/arm/highbd_intrapred_neon.c b/vpx_dsp/arm/highbd_intrapred_neon.c index 424bf5f4bc..d1e335c263 100644 --- a/vpx_dsp/arm/highbd_intrapred_neon.c +++ b/vpx_dsp/arm/highbd_intrapred_neon.c @@ -453,284 +453,6 @@ void vpx_highbd_d45_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, // ----------------------------------------------------------------------------- -void vpx_highbd_d63_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - uint16x4_t a0, a1, a2, a3, d0, d1; - (void)left; - (void)bd; - - a0 = vld1_u16(above + 0); - a1 = vld1_u16(above + 1); - a2 = vld1_u16(above + 2); - a3 = vld1_dup_u16(above + 3); - - d0 = vrhadd_u16(a0, a1); - d1 = vrhadd_u16(vhadd_u16(a0, a2), a1); - - vst1_u16(dst + 0 * stride, d0); - vst1_u16(dst + 1 * stride, d1); - vst1_u16(dst + 2 * stride, vext_u16(d0, a3, 1)); - vst1_u16(dst + 3 * stride, vext_u16(d1, a3, 1)); -} - -void vpx_highbd_d63_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - uint16x8_t a0, a1, a2, a7, d0, d1; - (void)left; - (void)bd; - - a0 = vld1q_u16(above + 0); - a1 = vld1q_u16(above + 1); - a2 = vld1q_u16(above + 2); - a7 = vld1q_dup_u16(above + 7); - - d0 = vrhaddq_u16(a0, a1); - d1 = vrhaddq_u16(vhaddq_u16(a0, a2), a1); - - vst1q_u16(dst + 0 * stride, d0); - vst1q_u16(dst + 1 * stride, d1); - vst1q_u16(dst + 2 * stride, vextq_u16(d0, a7, 1)); - vst1q_u16(dst + 3 * stride, vextq_u16(d1, a7, 1)); - vst1q_u16(dst + 4 * stride, vextq_u16(d0, a7, 2)); - vst1q_u16(dst + 5 * stride, vextq_u16(d1, a7, 2)); - vst1q_u16(dst + 6 * stride, vextq_u16(d0, a7, 3)); - vst1q_u16(dst + 7 * stride, vextq_u16(d1, a7, 3)); -} - -void vpx_highbd_d63_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - uint16x8_t a0, a1, a2, a8, a9, a10, a15, d0_lo, d0_hi, d1_lo, d1_hi; - (void)left; - (void)bd; - - a0 = vld1q_u16(above + 0); - a1 = vld1q_u16(above + 1); - a2 = vld1q_u16(above + 2); - a8 = vld1q_u16(above + 8); - a9 = vld1q_u16(above + 9); - a10 = vld1q_u16(above + 10); - a15 = vld1q_dup_u16(above + 15); - - d0_lo = vrhaddq_u16(a0, a1); - d0_hi = vrhaddq_u16(a8, a9); - d1_lo = vrhaddq_u16(vhaddq_u16(a0, a2), a1); - d1_hi = vrhaddq_u16(vhaddq_u16(a8, a10), a9); - - vst1q_u16(dst + 0 * stride + 0, d0_lo); - vst1q_u16(dst + 0 * stride + 8, d0_hi); - vst1q_u16(dst + 1 * stride + 0, d1_lo); - vst1q_u16(dst + 1 * stride + 8, d1_hi); - vst1q_u16(dst + 2 * stride + 0, vextq_u16(d0_lo, d0_hi, 1)); - vst1q_u16(dst + 2 * stride + 8, vextq_u16(d0_hi, a15, 1)); - vst1q_u16(dst + 3 * stride + 0, vextq_u16(d1_lo, d1_hi, 1)); - vst1q_u16(dst + 3 * stride + 8, vextq_u16(d1_hi, a15, 1)); - vst1q_u16(dst + 4 * stride + 0, vextq_u16(d0_lo, d0_hi, 2)); - vst1q_u16(dst + 4 * stride + 8, vextq_u16(d0_hi, a15, 2)); - vst1q_u16(dst + 5 * stride + 0, vextq_u16(d1_lo, d1_hi, 2)); - vst1q_u16(dst + 5 * stride + 8, vextq_u16(d1_hi, a15, 2)); - vst1q_u16(dst + 6 * stride + 0, vextq_u16(d0_lo, d0_hi, 3)); - vst1q_u16(dst + 6 * stride + 8, vextq_u16(d0_hi, a15, 3)); - vst1q_u16(dst + 7 * stride + 0, vextq_u16(d1_lo, d1_hi, 3)); - vst1q_u16(dst + 7 * stride + 8, vextq_u16(d1_hi, a15, 3)); - vst1q_u16(dst + 8 * stride + 0, vextq_u16(d0_lo, d0_hi, 4)); - vst1q_u16(dst + 8 * stride + 8, vextq_u16(d0_hi, a15, 4)); - vst1q_u16(dst + 9 * stride + 0, vextq_u16(d1_lo, d1_hi, 4)); - vst1q_u16(dst + 9 * stride + 8, vextq_u16(d1_hi, a15, 4)); - vst1q_u16(dst + 10 * stride + 0, vextq_u16(d0_lo, d0_hi, 5)); - vst1q_u16(dst + 10 * stride + 8, vextq_u16(d0_hi, a15, 5)); - vst1q_u16(dst + 11 * stride + 0, vextq_u16(d1_lo, d1_hi, 5)); - vst1q_u16(dst + 11 * stride + 8, vextq_u16(d1_hi, a15, 5)); - vst1q_u16(dst + 12 * stride + 0, vextq_u16(d0_lo, d0_hi, 6)); - vst1q_u16(dst + 12 * stride + 8, vextq_u16(d0_hi, a15, 6)); - vst1q_u16(dst + 13 * stride + 0, vextq_u16(d1_lo, d1_hi, 6)); - vst1q_u16(dst + 13 * stride + 8, vextq_u16(d1_hi, a15, 6)); - vst1q_u16(dst + 14 * stride + 0, vextq_u16(d0_lo, d0_hi, 7)); - vst1q_u16(dst + 14 * stride + 8, vextq_u16(d0_hi, a15, 7)); - vst1q_u16(dst + 15 * stride + 0, vextq_u16(d1_lo, d1_hi, 7)); - vst1q_u16(dst + 15 * stride + 8, vextq_u16(d1_hi, a15, 7)); -} - -void vpx_highbd_d63_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - uint16x8_t a0, a1, a2, a8, a9, a10, a16, a17, a18, a24, a25, a26, a31, d0[4], - d1[4]; - (void)left; - (void)bd; - - a0 = vld1q_u16(above + 0); - a1 = vld1q_u16(above + 1); - a2 = vld1q_u16(above + 2); - a8 = vld1q_u16(above + 8); - a9 = vld1q_u16(above + 9); - a10 = vld1q_u16(above + 10); - a16 = vld1q_u16(above + 16); - a17 = vld1q_u16(above + 17); - a18 = vld1q_u16(above + 18); - a24 = vld1q_u16(above + 24); - a25 = vld1q_u16(above + 25); - a26 = vld1q_u16(above + 26); - a31 = vld1q_dup_u16(above + 31); - - d0[0] = vrhaddq_u16(a0, a1); - d0[1] = vrhaddq_u16(a8, a9); - d0[2] = vrhaddq_u16(a16, a17); - d0[3] = vrhaddq_u16(a24, a25); - d1[0] = vrhaddq_u16(vhaddq_u16(a0, a2), a1); - d1[1] = vrhaddq_u16(vhaddq_u16(a8, a10), a9); - d1[2] = vrhaddq_u16(vhaddq_u16(a16, a18), a17); - d1[3] = vrhaddq_u16(vhaddq_u16(a24, a26), a25); - - vst1q_u16(dst + 0 * stride + 0, d0[0]); - vst1q_u16(dst + 0 * stride + 8, d0[1]); - vst1q_u16(dst + 0 * stride + 16, d0[2]); - vst1q_u16(dst + 0 * stride + 24, d0[3]); - vst1q_u16(dst + 1 * stride + 0, d1[0]); - vst1q_u16(dst + 1 * stride + 8, d1[1]); - vst1q_u16(dst + 1 * stride + 16, d1[2]); - vst1q_u16(dst + 1 * stride + 24, d1[3]); - - vst1q_u16(dst + 2 * stride + 0, vextq_u16(d0[0], d0[1], 1)); - vst1q_u16(dst + 2 * stride + 8, vextq_u16(d0[1], d0[2], 1)); - vst1q_u16(dst + 2 * stride + 16, vextq_u16(d0[2], d0[3], 1)); - vst1q_u16(dst + 2 * stride + 24, vextq_u16(d0[3], a31, 1)); - vst1q_u16(dst + 3 * stride + 0, vextq_u16(d1[0], d1[1], 1)); - vst1q_u16(dst + 3 * stride + 8, vextq_u16(d1[1], d1[2], 1)); - vst1q_u16(dst + 3 * stride + 16, vextq_u16(d1[2], d1[3], 1)); - vst1q_u16(dst + 3 * stride + 24, vextq_u16(d1[3], a31, 1)); - - vst1q_u16(dst + 4 * stride + 0, vextq_u16(d0[0], d0[1], 2)); - vst1q_u16(dst + 4 * stride + 8, vextq_u16(d0[1], d0[2], 2)); - vst1q_u16(dst + 4 * stride + 16, vextq_u16(d0[2], d0[3], 2)); - vst1q_u16(dst + 4 * stride + 24, vextq_u16(d0[3], a31, 2)); - vst1q_u16(dst + 5 * stride + 0, vextq_u16(d1[0], d1[1], 2)); - vst1q_u16(dst + 5 * stride + 8, vextq_u16(d1[1], d1[2], 2)); - vst1q_u16(dst + 5 * stride + 16, vextq_u16(d1[2], d1[3], 2)); - vst1q_u16(dst + 5 * stride + 24, vextq_u16(d1[3], a31, 2)); - - vst1q_u16(dst + 6 * stride + 0, vextq_u16(d0[0], d0[1], 3)); - vst1q_u16(dst + 6 * stride + 8, vextq_u16(d0[1], d0[2], 3)); - vst1q_u16(dst + 6 * stride + 16, vextq_u16(d0[2], d0[3], 3)); - vst1q_u16(dst + 6 * stride + 24, vextq_u16(d0[3], a31, 3)); - vst1q_u16(dst + 7 * stride + 0, vextq_u16(d1[0], d1[1], 3)); - vst1q_u16(dst + 7 * stride + 8, vextq_u16(d1[1], d1[2], 3)); - vst1q_u16(dst + 7 * stride + 16, vextq_u16(d1[2], d1[3], 3)); - vst1q_u16(dst + 7 * stride + 24, vextq_u16(d1[3], a31, 3)); - - vst1q_u16(dst + 8 * stride + 0, vextq_u16(d0[0], d0[1], 4)); - vst1q_u16(dst + 8 * stride + 8, vextq_u16(d0[1], d0[2], 4)); - vst1q_u16(dst + 8 * stride + 16, vextq_u16(d0[2], d0[3], 4)); - vst1q_u16(dst + 8 * stride + 24, vextq_u16(d0[3], a31, 4)); - vst1q_u16(dst + 9 * stride + 0, vextq_u16(d1[0], d1[1], 4)); - vst1q_u16(dst + 9 * stride + 8, vextq_u16(d1[1], d1[2], 4)); - vst1q_u16(dst + 9 * stride + 16, vextq_u16(d1[2], d1[3], 4)); - vst1q_u16(dst + 9 * stride + 24, vextq_u16(d1[3], a31, 4)); - - vst1q_u16(dst + 10 * stride + 0, vextq_u16(d0[0], d0[1], 5)); - vst1q_u16(dst + 10 * stride + 8, vextq_u16(d0[1], d0[2], 5)); - vst1q_u16(dst + 10 * stride + 16, vextq_u16(d0[2], d0[3], 5)); - vst1q_u16(dst + 10 * stride + 24, vextq_u16(d0[3], a31, 5)); - vst1q_u16(dst + 11 * stride + 0, vextq_u16(d1[0], d1[1], 5)); - vst1q_u16(dst + 11 * stride + 8, vextq_u16(d1[1], d1[2], 5)); - vst1q_u16(dst + 11 * stride + 16, vextq_u16(d1[2], d1[3], 5)); - vst1q_u16(dst + 11 * stride + 24, vextq_u16(d1[3], a31, 5)); - - vst1q_u16(dst + 12 * stride + 0, vextq_u16(d0[0], d0[1], 6)); - vst1q_u16(dst + 12 * stride + 8, vextq_u16(d0[1], d0[2], 6)); - vst1q_u16(dst + 12 * stride + 16, vextq_u16(d0[2], d0[3], 6)); - vst1q_u16(dst + 12 * stride + 24, vextq_u16(d0[3], a31, 6)); - vst1q_u16(dst + 13 * stride + 0, vextq_u16(d1[0], d1[1], 6)); - vst1q_u16(dst + 13 * stride + 8, vextq_u16(d1[1], d1[2], 6)); - vst1q_u16(dst + 13 * stride + 16, vextq_u16(d1[2], d1[3], 6)); - vst1q_u16(dst + 13 * stride + 24, vextq_u16(d1[3], a31, 6)); - - vst1q_u16(dst + 14 * stride + 0, vextq_u16(d0[0], d0[1], 7)); - vst1q_u16(dst + 14 * stride + 8, vextq_u16(d0[1], d0[2], 7)); - vst1q_u16(dst + 14 * stride + 16, vextq_u16(d0[2], d0[3], 7)); - vst1q_u16(dst + 14 * stride + 24, vextq_u16(d0[3], a31, 7)); - vst1q_u16(dst + 15 * stride + 0, vextq_u16(d1[0], d1[1], 7)); - vst1q_u16(dst + 15 * stride + 8, vextq_u16(d1[1], d1[2], 7)); - vst1q_u16(dst + 15 * stride + 16, vextq_u16(d1[2], d1[3], 7)); - vst1q_u16(dst + 15 * stride + 24, vextq_u16(d1[3], a31, 7)); - - vst1q_u16(dst + 16 * stride + 0, d0[1]); - vst1q_u16(dst + 16 * stride + 8, d0[2]); - vst1q_u16(dst + 16 * stride + 16, d0[3]); - vst1q_u16(dst + 16 * stride + 24, a31); - vst1q_u16(dst + 17 * stride + 0, d1[1]); - vst1q_u16(dst + 17 * stride + 8, d1[2]); - vst1q_u16(dst + 17 * stride + 16, d1[3]); - vst1q_u16(dst + 17 * stride + 24, a31); - - vst1q_u16(dst + 18 * stride + 0, vextq_u16(d0[1], d0[2], 1)); - vst1q_u16(dst + 18 * stride + 8, vextq_u16(d0[2], d0[3], 1)); - vst1q_u16(dst + 18 * stride + 16, vextq_u16(d0[3], a31, 1)); - vst1q_u16(dst + 18 * stride + 24, a31); - vst1q_u16(dst + 19 * stride + 0, vextq_u16(d1[1], d1[2], 1)); - vst1q_u16(dst + 19 * stride + 8, vextq_u16(d1[2], d1[3], 1)); - vst1q_u16(dst + 19 * stride + 16, vextq_u16(d1[3], a31, 1)); - vst1q_u16(dst + 19 * stride + 24, a31); - - vst1q_u16(dst + 20 * stride + 0, vextq_u16(d0[1], d0[2], 2)); - vst1q_u16(dst + 20 * stride + 8, vextq_u16(d0[2], d0[3], 2)); - vst1q_u16(dst + 20 * stride + 16, vextq_u16(d0[3], a31, 2)); - vst1q_u16(dst + 20 * stride + 24, a31); - vst1q_u16(dst + 21 * stride + 0, vextq_u16(d1[1], d1[2], 2)); - vst1q_u16(dst + 21 * stride + 8, vextq_u16(d1[2], d1[3], 2)); - vst1q_u16(dst + 21 * stride + 16, vextq_u16(d1[3], a31, 2)); - vst1q_u16(dst + 21 * stride + 24, a31); - - vst1q_u16(dst + 22 * stride + 0, vextq_u16(d0[1], d0[2], 3)); - vst1q_u16(dst + 22 * stride + 8, vextq_u16(d0[2], d0[3], 3)); - vst1q_u16(dst + 22 * stride + 16, vextq_u16(d0[3], a31, 3)); - vst1q_u16(dst + 22 * stride + 24, a31); - vst1q_u16(dst + 23 * stride + 0, vextq_u16(d1[1], d1[2], 3)); - vst1q_u16(dst + 23 * stride + 8, vextq_u16(d1[2], d1[3], 3)); - vst1q_u16(dst + 23 * stride + 16, vextq_u16(d1[3], a31, 3)); - vst1q_u16(dst + 23 * stride + 24, a31); - - vst1q_u16(dst + 24 * stride + 0, vextq_u16(d0[1], d0[2], 4)); - vst1q_u16(dst + 24 * stride + 8, vextq_u16(d0[2], d0[3], 4)); - vst1q_u16(dst + 24 * stride + 16, vextq_u16(d0[3], a31, 4)); - vst1q_u16(dst + 24 * stride + 24, a31); - vst1q_u16(dst + 25 * stride + 0, vextq_u16(d1[1], d1[2], 4)); - vst1q_u16(dst + 25 * stride + 8, vextq_u16(d1[2], d1[3], 4)); - vst1q_u16(dst + 25 * stride + 16, vextq_u16(d1[3], a31, 4)); - vst1q_u16(dst + 25 * stride + 24, a31); - - vst1q_u16(dst + 26 * stride + 0, vextq_u16(d0[1], d0[2], 5)); - vst1q_u16(dst + 26 * stride + 8, vextq_u16(d0[2], d0[3], 5)); - vst1q_u16(dst + 26 * stride + 16, vextq_u16(d0[3], a31, 5)); - vst1q_u16(dst + 26 * stride + 24, a31); - vst1q_u16(dst + 27 * stride + 0, vextq_u16(d1[1], d1[2], 5)); - vst1q_u16(dst + 27 * stride + 8, vextq_u16(d1[2], d1[3], 5)); - vst1q_u16(dst + 27 * stride + 16, vextq_u16(d1[3], a31, 5)); - vst1q_u16(dst + 27 * stride + 24, a31); - - vst1q_u16(dst + 28 * stride + 0, vextq_u16(d0[1], d0[2], 6)); - vst1q_u16(dst + 28 * stride + 8, vextq_u16(d0[2], d0[3], 6)); - vst1q_u16(dst + 28 * stride + 16, vextq_u16(d0[3], a31, 6)); - vst1q_u16(dst + 28 * stride + 24, a31); - vst1q_u16(dst + 29 * stride + 0, vextq_u16(d1[1], d1[2], 6)); - vst1q_u16(dst + 29 * stride + 8, vextq_u16(d1[2], d1[3], 6)); - vst1q_u16(dst + 29 * stride + 16, vextq_u16(d1[3], a31, 6)); - vst1q_u16(dst + 29 * stride + 24, a31); - - vst1q_u16(dst + 30 * stride + 0, vextq_u16(d0[1], d0[2], 7)); - vst1q_u16(dst + 30 * stride + 8, vextq_u16(d0[2], d0[3], 7)); - vst1q_u16(dst + 30 * stride + 16, vextq_u16(d0[3], a31, 7)); - vst1q_u16(dst + 30 * stride + 24, a31); - vst1q_u16(dst + 31 * stride + 0, vextq_u16(d1[1], d1[2], 7)); - vst1q_u16(dst + 31 * stride + 8, vextq_u16(d1[2], d1[3], 7)); - vst1q_u16(dst + 31 * stride + 16, vextq_u16(d1[3], a31, 7)); - vst1q_u16(dst + 31 * stride + 24, a31); -} - -// ----------------------------------------------------------------------------- - void vpx_highbd_d117_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index b6f5d4a099..072b10d3d1 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -221,7 +221,7 @@ () specialize qw/vpx_highbd_d45_predictor_4x4 neon ssse3/; add_proto qw/void vpx_highbd_d63_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_d63_predictor_4x4 neon sse2/; + specialize qw/vpx_highbd_d63_predictor_4x4 sse2/; add_proto qw/void vpx_highbd_h_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_h_predictor_4x4 neon sse2/; @@ -260,7 +260,7 @@ () specialize qw/vpx_highbd_d45_predictor_8x8 neon ssse3/; add_proto qw/void vpx_highbd_d63_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_d63_predictor_8x8 neon ssse3/; + specialize qw/vpx_highbd_d63_predictor_8x8 ssse3/; add_proto qw/void vpx_highbd_h_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_h_predictor_8x8 neon sse2/; @@ -299,7 +299,7 @@ () specialize qw/vpx_highbd_d45_predictor_16x16 neon ssse3/; add_proto qw/void vpx_highbd_d63_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_d63_predictor_16x16 neon ssse3/; + specialize qw/vpx_highbd_d63_predictor_16x16 ssse3/; add_proto qw/void vpx_highbd_h_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_h_predictor_16x16 neon sse2/; @@ -338,7 +338,7 @@ () specialize qw/vpx_highbd_d45_predictor_32x32 neon ssse3/; add_proto qw/void vpx_highbd_d63_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_d63_predictor_32x32 neon ssse3/; + specialize qw/vpx_highbd_d63_predictor_32x32 ssse3/; add_proto qw/void vpx_highbd_h_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_h_predictor_32x32 neon sse2/; From d98a7b8bd937f4b846beb3df76271a5f91d86d5f Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 1 Mar 2023 15:52:20 -0800 Subject: [PATCH 0639/1000] Revert "quantize: use scan_order instead of passing scan/iscan" This reverts commit 14fc40040ff30486c45111056db44ee18590a24a. This has alignment issues, causing crashes in the tests: SSSE3/VP9QuantizeTest.EOBCheck/* Change-Id: I934f9a4c3ce3db33058a65180fa645c8649c3670 --- test/vp9_quantize_test.cc | 41 ++++++++++++----------- vp9/common/vp9_scan.h | 2 +- vp9/encoder/vp9_encodemb.c | 8 ++--- vpx_dsp/arm/highbd_quantize_neon.c | 7 ++-- vpx_dsp/arm/quantize_neon.c | 7 ++-- vpx_dsp/quantize.c | 9 +++-- vpx_dsp/vpx_dsp_rtcd_defs.pl | 5 ++- vpx_dsp/x86/highbd_quantize_intrin_avx2.c | 5 ++- vpx_dsp/x86/highbd_quantize_intrin_sse2.c | 5 ++- vpx_dsp/x86/quantize_avx.c | 7 ++-- vpx_dsp/x86/quantize_avx2.c | 5 ++- vpx_dsp/x86/quantize_ssse3.c | 6 ++-- 12 files changed, 52 insertions(+), 55 deletions(-) diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc index 630a0053ab..e533b2509c 100644 --- a/test/vp9_quantize_test.cc +++ b/test/vp9_quantize_test.cc @@ -42,7 +42,7 @@ typedef void (*QuantizeFunc)(const tran_low_t *coeff, intptr_t count, const macroblock_plane *const mb_plane, tran_low_t *qcoeff, tran_low_t *dqcoeff, const int16_t *dequant, uint16_t *eob, - const struct scan_order *const scan_order); + const int16_t *scan, const int16_t *iscan); typedef std::tuple QuantizeParam; @@ -60,10 +60,9 @@ template void QuantWrapper(const tran_low_t *coeff, intptr_t count, const macroblock_plane *const mb_plane, tran_low_t *qcoeff, tran_low_t *dqcoeff, const int16_t *dequant, uint16_t *eob, - const struct scan_order *const scan_order) { + const int16_t *scan, const int16_t *iscan) { fn(coeff, count, mb_plane->zbin, mb_plane->round, mb_plane->quant, - mb_plane->quant_shift, qcoeff, dqcoeff, dequant, eob, scan_order->scan, - scan_order->iscan); + mb_plane->quant_shift, qcoeff, dqcoeff, dequant, eob, scan, iscan); } // Wrapper for 32x32 version which does not use count @@ -71,16 +70,16 @@ typedef void (*Quantize32x32Func)(const tran_low_t *coeff, const macroblock_plane *const mb_plane, tran_low_t *qcoeff, tran_low_t *dqcoeff, const int16_t *dequant, uint16_t *eob, - const struct scan_order *const scan_order); + const int16_t *scan, const int16_t *iscan); template void Quant32x32Wrapper(const tran_low_t *coeff, intptr_t count, const macroblock_plane *const mb_plane, tran_low_t *qcoeff, tran_low_t *dqcoeff, const int16_t *dequant, uint16_t *eob, - const struct scan_order *const scan_order) { + const int16_t *scan, const int16_t *iscan) { (void)count; - fn(coeff, mb_plane, qcoeff, dqcoeff, dequant, eob, scan_order); + fn(coeff, mb_plane, qcoeff, dqcoeff, dequant, eob, scan, iscan); } // Wrapper for FP version which does not use zbin or quant_shift. @@ -94,9 +93,9 @@ template void QuantFPWrapper(const tran_low_t *coeff, intptr_t count, const macroblock_plane *const mb_plane, tran_low_t *qcoeff, tran_low_t *dqcoeff, const int16_t *dequant, uint16_t *eob, - const struct scan_order *const scan_order) { + const int16_t *scan, const int16_t *iscan) { fn(coeff, count, mb_plane->round_fp, mb_plane->quant_fp, qcoeff, dqcoeff, - dequant, eob, scan_order->scan, scan_order->iscan); + dequant, eob, scan, iscan); } void GenerateHelperArrays(ACMRandom *rnd, int16_t *zbin, int16_t *round, @@ -226,7 +225,7 @@ class VP9QuantizeTest : public VP9QuantizeBase, void VP9QuantizeTest::Run() { quantize_op_(coeff_.TopLeftPixel(), count_, &mb_plane_, qcoeff_.TopLeftPixel(), dqcoeff_.TopLeftPixel(), dequant_ptr_, - &eob_, scan_); + &eob_, scan_->scan, scan_->iscan); } void VP9QuantizeTest::Speed(bool is_median) { @@ -299,7 +298,7 @@ void VP9QuantizeTest::Speed(bool is_median) { ref_quantize_op_(coeff_.TopLeftPixel(), count_, &mb_plane_, ref_qcoeff.TopLeftPixel(), ref_dqcoeff.TopLeftPixel(), dequant_ptr_, &ref_eob, - scan_); + scan_->scan, scan_->iscan); } vpx_usec_timer_mark(&timer); @@ -307,7 +306,7 @@ void VP9QuantizeTest::Speed(bool is_median) { for (int n = 0; n < kNumTests; ++n) { quantize_op_(coeff_.TopLeftPixel(), count_, &mb_plane_, qcoeff_.TopLeftPixel(), dqcoeff_.TopLeftPixel(), - dequant_ptr_, &eob_, scan_); + dequant_ptr_, &eob_, scan_->scan, scan_->iscan); } vpx_usec_timer_mark(&simd_timer); @@ -448,11 +447,12 @@ TEST_P(VP9QuantizeTest, OperationCheck) { quant_fp_ptr_); ref_quantize_op_(coeff_.TopLeftPixel(), count_, &mb_plane_, ref_qcoeff.TopLeftPixel(), ref_dqcoeff.TopLeftPixel(), - dequant_ptr_, &ref_eob, scan_); + dequant_ptr_, &ref_eob, scan_->scan, scan_->iscan); - ASM_REGISTER_STATE_CHECK(quantize_op_( - coeff_.TopLeftPixel(), count_, &mb_plane_, qcoeff_.TopLeftPixel(), - dqcoeff_.TopLeftPixel(), dequant_ptr_, &eob_, scan_)); + ASM_REGISTER_STATE_CHECK(quantize_op_(coeff_.TopLeftPixel(), count_, + &mb_plane_, qcoeff_.TopLeftPixel(), + dqcoeff_.TopLeftPixel(), dequant_ptr_, + &eob_, scan_->scan, scan_->iscan)); EXPECT_TRUE(qcoeff_.CheckValues(ref_qcoeff)); EXPECT_TRUE(dqcoeff_.CheckValues(ref_dqcoeff)); @@ -504,11 +504,12 @@ TEST_P(VP9QuantizeTest, EOBCheck) { quant_fp_ptr_); ref_quantize_op_(coeff_.TopLeftPixel(), count_, &mb_plane_, ref_qcoeff.TopLeftPixel(), ref_dqcoeff.TopLeftPixel(), - dequant_ptr_, &ref_eob, scan_); + dequant_ptr_, &ref_eob, scan_->scan, scan_->iscan); - ASM_REGISTER_STATE_CHECK(quantize_op_( - coeff_.TopLeftPixel(), count_, &mb_plane_, qcoeff_.TopLeftPixel(), - dqcoeff_.TopLeftPixel(), dequant_ptr_, &eob_, scan_)); + ASM_REGISTER_STATE_CHECK(quantize_op_(coeff_.TopLeftPixel(), count_, + &mb_plane_, qcoeff_.TopLeftPixel(), + dqcoeff_.TopLeftPixel(), dequant_ptr_, + &eob_, scan_->scan, scan_->iscan)); EXPECT_TRUE(qcoeff_.CheckValues(ref_qcoeff)); EXPECT_TRUE(dqcoeff_.CheckValues(ref_dqcoeff)); diff --git a/vp9/common/vp9_scan.h b/vp9/common/vp9_scan.h index efa0e23365..72a9a5ec47 100644 --- a/vp9/common/vp9_scan.h +++ b/vp9/common/vp9_scan.h @@ -23,7 +23,7 @@ extern "C" { #define MAX_NEIGHBORS 2 -typedef struct scan_order { +typedef struct { const int16_t *scan; const int16_t *iscan; const int16_t *neighbors; diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c index 515c7a9031..6a5f628808 100644 --- a/vp9/encoder/vp9_encodemb.c +++ b/vp9/encoder/vp9_encodemb.c @@ -512,7 +512,7 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col, case TX_32X32: highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); vpx_highbd_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, eob, - scan_order); + scan_order->scan, scan_order->iscan); break; case TX_16X16: vpx_highbd_fdct16x16(src_diff, coeff, diff_stride); @@ -542,7 +542,7 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col, case TX_32X32: fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); vpx_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, eob, - scan_order); + scan_order->scan, scan_order->iscan); break; case TX_16X16: vpx_fdct16x16(src_diff, coeff, diff_stride); @@ -856,7 +856,7 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, src_stride, dst, dst_stride, xd->bd); highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); vpx_highbd_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, - eob, scan_order); + eob, scan_order->scan, scan_order->iscan); } if (args->enable_coeff_opt && !x->skip_recode) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; @@ -946,7 +946,7 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, dst_stride); fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); vpx_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, eob, - scan_order); + scan_order->scan, scan_order->iscan); } if (args->enable_coeff_opt && !x->skip_recode) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; diff --git a/vpx_dsp/arm/highbd_quantize_neon.c b/vpx_dsp/arm/highbd_quantize_neon.c index 5a40f1284e..3b1fec3321 100644 --- a/vpx_dsp/arm/highbd_quantize_neon.c +++ b/vpx_dsp/arm/highbd_quantize_neon.c @@ -13,7 +13,6 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/arm/mem_neon.h" -#include "vp9/common/vp9_scan.h" #include "vp9/encoder/vp9_block.h" static VPX_FORCE_INLINE void highbd_calculate_dqcoeff_and_store( @@ -228,11 +227,10 @@ static VPX_FORCE_INLINE int16x8_t highbd_quantize_b_32x32_neon( void vpx_highbd_quantize_b_32x32_neon( const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const struct scan_order *const scan_order) { + uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { const int16x8_t neg_one = vdupq_n_s16(-1); uint16x8_t eob_max; int i; - const int16_t *iscan = scan_order->iscan; // Only the first element of each vector is DC. // High half has identical elements, but we can reconstruct it from the low @@ -302,4 +300,7 @@ void vpx_highbd_quantize_b_32x32_neon( vst1_lane_u16(eob_ptr, eob_max_2, 0); } #endif // __aarch64__ + // Need this here, else the compiler complains about mixing declarations and + // code in C90 + (void)scan; } diff --git a/vpx_dsp/arm/quantize_neon.c b/vpx_dsp/arm/quantize_neon.c index 84b6d8c79f..e81738a7bb 100644 --- a/vpx_dsp/arm/quantize_neon.c +++ b/vpx_dsp/arm/quantize_neon.c @@ -14,7 +14,6 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/arm/mem_neon.h" -#include "vp9/common/vp9_scan.h" #include "vp9/encoder/vp9_block.h" static INLINE void calculate_dqcoeff_and_store(const int16x8_t qcoeff, @@ -219,11 +218,10 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const struct scan_order *const scan_order) { + const int16_t *scan, const int16_t *iscan) { const int16x8_t neg_one = vdupq_n_s16(-1); uint16x8_t eob_max; int i; - const int16_t *iscan = scan_order->iscan; // Only the first element of each vector is DC. int16x8_t zbin = vrshrq_n_s16(vld1q_s16(mb_plane->zbin), 1); @@ -287,4 +285,7 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, vst1_lane_u16(eob_ptr, eob_max_2, 0); } #endif // __aarch64__ + // Need these here, else the compiler complains about mixing declarations and + // code in C90 + (void)scan; } diff --git a/vpx_dsp/quantize.c b/vpx_dsp/quantize.c index f51bf253e7..c4642812ad 100644 --- a/vpx_dsp/quantize.c +++ b/vpx_dsp/quantize.c @@ -14,7 +14,6 @@ #include "vpx_dsp/quantize.h" #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_mem/vpx_mem.h" -#include "vp9/common/vp9_scan.h" #include "vp9/encoder/vp9_block.h" void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, @@ -214,7 +213,7 @@ void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const struct scan_order *const scan_order) { + const int16_t *scan, const int16_t *iscan) { const int n_coeffs = 32 * 32; const int zbins[2] = { ROUND_POWER_OF_TWO(mb_plane->zbin[0], 1), ROUND_POWER_OF_TWO(mb_plane->zbin[1], 1) }; @@ -222,11 +221,11 @@ void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, const int16_t *round_ptr = mb_plane->round; const int16_t *quant_ptr = mb_plane->quant; const int16_t *quant_shift_ptr = mb_plane->quant_shift; - const int16_t *scan = scan_order->scan; int idx = 0; int idx_arr[32 * 32 /* n_coeffs */]; int i, eob = -1; + (void)iscan; memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); @@ -275,7 +274,7 @@ void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, void vpx_highbd_quantize_b_32x32_c( const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const struct scan_order *const scan_order) { + uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { const intptr_t n_coeffs = 32 * 32; const int zbins[2] = { ROUND_POWER_OF_TWO(mb_plane->zbin[0], 1), ROUND_POWER_OF_TWO(mb_plane->zbin[1], 1) }; @@ -283,11 +282,11 @@ void vpx_highbd_quantize_b_32x32_c( const int16_t *round_ptr = mb_plane->round; const int16_t *quant_ptr = mb_plane->quant; const int16_t *quant_shift_ptr = mb_plane->quant_shift; - const int16_t *scan = scan_order->scan; int idx = 0; int idx_arr[1024]; int i, eob = -1; + (void)iscan; memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 072b10d3d1..c899c467bb 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -19,7 +19,6 @@ () #include "vpx_dsp/vpx_filter.h" #if CONFIG_VP9_ENCODER struct macroblock_plane; - struct scan_order; #endif EOF @@ -725,14 +724,14 @@ () add_proto qw/void vpx_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vpx_quantize_b neon sse2 ssse3 avx avx2 vsx lsx/; - add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, const struct macroblock_plane * const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct scan_order *const scan_order"; + add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, const struct macroblock_plane * const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vpx_quantize_b_32x32 neon ssse3 avx avx2 vsx lsx/; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vpx_highbd_quantize_b neon sse2 avx2/; - add_proto qw/void vpx_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, const struct macroblock_plane * const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct scan_order *const scan_order"; + add_proto qw/void vpx_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, const struct macroblock_plane * const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vpx_highbd_quantize_b_32x32 neon sse2 avx2/; } # CONFIG_VP9_HIGHBITDEPTH } # CONFIG_VP9_ENCODER diff --git a/vpx_dsp/x86/highbd_quantize_intrin_avx2.c b/vpx_dsp/x86/highbd_quantize_intrin_avx2.c index bfd7b2e23e..6041d7289a 100644 --- a/vpx_dsp/x86/highbd_quantize_intrin_avx2.c +++ b/vpx_dsp/x86/highbd_quantize_intrin_avx2.c @@ -11,7 +11,6 @@ #include #include "./vpx_dsp_rtcd.h" -#include "vp9/common/vp9_scan.h" #include "vp9/encoder/vp9_block.h" static VPX_FORCE_INLINE void init_one_qp(const __m128i *p, __m256i *qp) { @@ -226,12 +225,12 @@ static VPX_FORCE_INLINE void quantize_b_32x32( void vpx_highbd_quantize_b_32x32_avx2( const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const struct scan_order *const scan_order) { + uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { const unsigned int step = 8; intptr_t n_coeffs = 32 * 32; - const int16_t *iscan = scan_order->iscan; __m256i eob = _mm256_setzero_si256(); __m256i qp[5]; + (void)scan; init_qp(mb_plane->zbin, mb_plane->round, mb_plane->quant, dequant_ptr, mb_plane->quant_shift, qp, 1); diff --git a/vpx_dsp/x86/highbd_quantize_intrin_sse2.c b/vpx_dsp/x86/highbd_quantize_intrin_sse2.c index 58d5a3a5ff..6a8f42b8a4 100644 --- a/vpx_dsp/x86/highbd_quantize_intrin_sse2.c +++ b/vpx_dsp/x86/highbd_quantize_intrin_sse2.c @@ -15,7 +15,6 @@ #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_mem/vpx_mem.h" #include "vpx_ports/mem.h" -#include "vp9/common/vp9_scan.h" #include "vp9/encoder/vp9_block.h" #if CONFIG_VP9_HIGHBITDEPTH @@ -97,16 +96,16 @@ void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count, void vpx_highbd_quantize_b_32x32_sse2( const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const struct scan_order *const scan_order) { + uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { __m128i zbins[2]; __m128i nzbins[2]; int idx = 0; int idx_arr[1024]; int i, eob = 0; const intptr_t n_coeffs = 32 * 32; - const int16_t *iscan = scan_order->iscan; const int zbin0_tmp = ROUND_POWER_OF_TWO(mb_plane->zbin[0], 1); const int zbin1_tmp = ROUND_POWER_OF_TWO(mb_plane->zbin[1], 1); + (void)scan; zbins[0] = _mm_set_epi32(zbin1_tmp, zbin1_tmp, zbin1_tmp, zbin0_tmp); zbins[1] = _mm_set1_epi32(zbin1_tmp); diff --git a/vpx_dsp/x86/quantize_avx.c b/vpx_dsp/x86/quantize_avx.c index d05a937be1..d52f6c6644 100644 --- a/vpx_dsp/x86/quantize_avx.c +++ b/vpx_dsp/x86/quantize_avx.c @@ -19,8 +19,6 @@ #include "vpx_dsp/x86/bitdepth_conversion_sse2.h" #include "vpx_dsp/x86/quantize_sse2.h" #include "vpx_dsp/x86/quantize_ssse3.h" -#include "vp9/common/vp9_scan.h" -#include "vp9/encoder/vp9_block.h" void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, @@ -146,11 +144,10 @@ void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const struct scan_order *const scan_order) { + const int16_t *scan, const int16_t *iscan) { const __m128i zero = _mm_setzero_si128(); const __m256i big_zero = _mm256_setzero_si256(); int index; - const int16_t *iscan = scan_order->iscan; __m128i zbin, round, quant, dequant, shift; __m128i coeff0, coeff1; @@ -159,6 +156,8 @@ void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, __m128i all_zero; __m128i eob = zero, eob0; + (void)scan; + load_b_values32x32(mb_plane, &zbin, &round, &quant, dequant_ptr, &dequant, &shift); diff --git a/vpx_dsp/x86/quantize_avx2.c b/vpx_dsp/x86/quantize_avx2.c index 1c82542ae6..a8412c5b8e 100644 --- a/vpx_dsp/x86/quantize_avx2.c +++ b/vpx_dsp/x86/quantize_avx2.c @@ -13,7 +13,6 @@ #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" -#include "vp9/common/vp9_scan.h" #include "vp9/encoder/vp9_block.h" static VPX_FORCE_INLINE void load_b_values_avx2( @@ -256,11 +255,11 @@ void vpx_quantize_b_32x32_avx2(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const struct scan_order *const scan_order) { + const int16_t *scan, const int16_t *iscan) { __m256i v_zbin, v_round, v_quant, v_dequant, v_quant_shift; __m256i v_eobmax = _mm256_setzero_si256(); intptr_t count; - const int16_t *iscan = scan_order->iscan; + (void)scan; load_b_values_avx2(mb_plane->zbin, &v_zbin, mb_plane->round, &v_round, mb_plane->quant, &v_quant, dequant_ptr, &v_dequant, diff --git a/vpx_dsp/x86/quantize_ssse3.c b/vpx_dsp/x86/quantize_ssse3.c index 6401b2865d..6fe54d7d98 100644 --- a/vpx_dsp/x86/quantize_ssse3.c +++ b/vpx_dsp/x86/quantize_ssse3.c @@ -16,7 +16,6 @@ #include "vpx_dsp/x86/bitdepth_conversion_sse2.h" #include "vpx_dsp/x86/quantize_sse2.h" #include "vpx_dsp/x86/quantize_ssse3.h" -#include "vp9/common/vp9_scan.h" #include "vp9/encoder/vp9_block.h" void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, @@ -113,10 +112,9 @@ void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const struct scan_order *const scan_order) { + const int16_t *scan, const int16_t *iscan) { const __m128i zero = _mm_setzero_si128(); int index; - const int16_t *iscan = scan_order->iscan; __m128i zbin, round, quant, dequant, shift; __m128i coeff0, coeff1; @@ -125,6 +123,8 @@ void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, __m128i all_zero; __m128i eob = zero, eob0; + (void)scan; + load_b_values32x32(mb_plane, &zbin, &round, &quant, dequant_ptr, &dequant, &shift); From e4b423e1400436f07b9e3945718a8696e5eca9ec Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 1 Mar 2023 15:53:14 -0800 Subject: [PATCH 0640/1000] Revert "quantize: simplifly highbd 32x32_b args" This reverts commit 573f5e662b544dbc553d73fa2b61055c30dfe8cc. This has alignment issues, causing crashes in the tests: SSSE3/VP9QuantizeTest.EOBCheck/* Change-Id: Ibf05e6b116c46f6e2c11187b3e3578bbd2d2c227 --- test/vp9_quantize_test.cc | 54 +++++++++++------------ vp9/encoder/vp9_encodemb.c | 10 +++-- vpx_dsp/arm/highbd_quantize_neon.c | 21 ++++----- vpx_dsp/quantize.c | 16 +++---- vpx_dsp/vpx_dsp_rtcd_defs.pl | 2 +- vpx_dsp/x86/highbd_quantize_intrin_avx2.c | 13 +++--- vpx_dsp/x86/highbd_quantize_intrin_sse2.c | 21 +++++---- 7 files changed, 68 insertions(+), 69 deletions(-) diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc index e533b2509c..ecb6116f0c 100644 --- a/test/vp9_quantize_test.cc +++ b/test/vp9_quantize_test.cc @@ -549,15 +549,15 @@ INSTANTIATE_TEST_SUITE_P( make_tuple(&QuantWrapper, &QuantWrapper, VPX_BITS_12, 16, false), - make_tuple(&Quant32x32Wrapper, - &Quant32x32Wrapper, - VPX_BITS_8, 32, false), - make_tuple(&Quant32x32Wrapper, - &Quant32x32Wrapper, - VPX_BITS_10, 32, false), - make_tuple(&Quant32x32Wrapper, - &Quant32x32Wrapper, - VPX_BITS_12, 32, false))); + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_8, 32, + false), + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_10, + 32, false), + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_12, + 32, false))); #else INSTANTIATE_TEST_SUITE_P( @@ -626,15 +626,15 @@ INSTANTIATE_TEST_SUITE_P( make_tuple(&Quant32x32Wrapper, &Quant32x32Wrapper, VPX_BITS_8, 32, false), - make_tuple(&Quant32x32Wrapper, - &Quant32x32Wrapper, - VPX_BITS_8, 32, false), - make_tuple(&Quant32x32Wrapper, - &Quant32x32Wrapper, - VPX_BITS_10, 32, false), - make_tuple(&Quant32x32Wrapper, - &Quant32x32Wrapper, - VPX_BITS_12, 32, false))); + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_8, 32, + false), + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_10, + 32, false), + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_12, + 32, false))); #else INSTANTIATE_TEST_SUITE_P( AVX2, VP9QuantizeTest, @@ -672,15 +672,15 @@ INSTANTIATE_TEST_SUITE_P( make_tuple(&Quant32x32Wrapper, &Quant32x32Wrapper, VPX_BITS_8, 32, false), - make_tuple(&Quant32x32Wrapper, - &Quant32x32Wrapper, - VPX_BITS_8, 32, false), - make_tuple(&Quant32x32Wrapper, - &Quant32x32Wrapper, - VPX_BITS_10, 32, false), - make_tuple(&Quant32x32Wrapper, - &Quant32x32Wrapper, - VPX_BITS_12, 32, false), + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_8, 32, + false), + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_10, + 32, false), + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_12, + 32, false), make_tuple(&QuantFPWrapper, &QuantFPWrapper, VPX_BITS_8, 16, true), make_tuple(&QuantFPWrapper, diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c index 6a5f628808..4910dc20f5 100644 --- a/vp9/encoder/vp9_encodemb.c +++ b/vp9/encoder/vp9_encodemb.c @@ -511,8 +511,9 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col, switch (tx_size) { case TX_32X32: highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); - vpx_highbd_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, eob, - scan_order->scan, scan_order->iscan); + vpx_highbd_quantize_b_32x32( + coeff, 1024, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, + dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); break; case TX_16X16: vpx_highbd_fdct16x16(src_diff, coeff, diff_stride); @@ -855,8 +856,9 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, vpx_highbd_subtract_block(32, 32, src_diff, diff_stride, src, src_stride, dst, dst_stride, xd->bd); highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); - vpx_highbd_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, - eob, scan_order->scan, scan_order->iscan); + vpx_highbd_quantize_b_32x32( + coeff, 1024, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, + dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); } if (args->enable_coeff_opt && !x->skip_recode) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; diff --git a/vpx_dsp/arm/highbd_quantize_neon.c b/vpx_dsp/arm/highbd_quantize_neon.c index 3b1fec3321..b9f72a94c5 100644 --- a/vpx_dsp/arm/highbd_quantize_neon.c +++ b/vpx_dsp/arm/highbd_quantize_neon.c @@ -13,7 +13,6 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/arm/mem_neon.h" -#include "vp9/encoder/vp9_block.h" static VPX_FORCE_INLINE void highbd_calculate_dqcoeff_and_store( const int32x4_t dqcoeff_0, const int32x4_t dqcoeff_1, @@ -225,9 +224,11 @@ static VPX_FORCE_INLINE int16x8_t highbd_quantize_b_32x32_neon( } void vpx_highbd_quantize_b_32x32_neon( - const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { const int16x8_t neg_one = vdupq_n_s16(-1); uint16x8_t eob_max; int i; @@ -236,13 +237,12 @@ void vpx_highbd_quantize_b_32x32_neon( // High half has identical elements, but we can reconstruct it from the low // half by duplicating the 2nd element. So we only need to pass a 4x32-bit // vector - int32x4_t zbin = vrshrq_n_s32(vmovl_s16(vld1_s16(mb_plane->zbin)), 1); - int32x4_t round = vrshrq_n_s32(vmovl_s16(vld1_s16(mb_plane->round)), 1); + int32x4_t zbin = vrshrq_n_s32(vmovl_s16(vld1_s16(zbin_ptr)), 1); + int32x4_t round = vrshrq_n_s32(vmovl_s16(vld1_s16(round_ptr)), 1); // Extend the quant, quant_shift vectors to ones of 32-bit elements // scale to high-half, so we can use vqdmulhq_s32 - int32x4_t quant = vshlq_n_s32(vmovl_s16(vld1_s16(mb_plane->quant)), 15); - int32x4_t quant_shift = - vshlq_n_s32(vmovl_s16(vld1_s16(mb_plane->quant_shift)), 16); + int32x4_t quant = vshlq_n_s32(vmovl_s16(vld1_s16(quant_ptr)), 15); + int32x4_t quant_shift = vshlq_n_s32(vmovl_s16(vld1_s16(quant_shift_ptr)), 16); int32x4_t dequant = vmovl_s16(vld1_s16(dequant_ptr)); // Process first 8 values which include a dc component. @@ -300,7 +300,8 @@ void vpx_highbd_quantize_b_32x32_neon( vst1_lane_u16(eob_ptr, eob_max_2, 0); } #endif // __aarch64__ - // Need this here, else the compiler complains about mixing declarations and + // Need these here, else the compiler complains about mixing declarations and // code in C90 + (void)n_coeffs; (void)scan; } diff --git a/vpx_dsp/quantize.c b/vpx_dsp/quantize.c index c4642812ad..212db45c88 100644 --- a/vpx_dsp/quantize.c +++ b/vpx_dsp/quantize.c @@ -272,16 +272,14 @@ void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, #if CONFIG_VP9_HIGHBITDEPTH void vpx_highbd_quantize_b_32x32_c( - const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { - const intptr_t n_coeffs = 32 * 32; - const int zbins[2] = { ROUND_POWER_OF_TWO(mb_plane->zbin[0], 1), - ROUND_POWER_OF_TWO(mb_plane->zbin[1], 1) }; + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], 1), + ROUND_POWER_OF_TWO(zbin_ptr[1], 1) }; const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; - const int16_t *round_ptr = mb_plane->round; - const int16_t *quant_ptr = mb_plane->quant; - const int16_t *quant_shift_ptr = mb_plane->quant_shift; int idx = 0; int idx_arr[1024]; diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index c899c467bb..2752eea5d9 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -731,7 +731,7 @@ () add_proto qw/void vpx_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vpx_highbd_quantize_b neon sse2 avx2/; - add_proto qw/void vpx_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, const struct macroblock_plane * const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + add_proto qw/void vpx_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vpx_highbd_quantize_b_32x32 neon sse2 avx2/; } # CONFIG_VP9_HIGHBITDEPTH } # CONFIG_VP9_ENCODER diff --git a/vpx_dsp/x86/highbd_quantize_intrin_avx2.c b/vpx_dsp/x86/highbd_quantize_intrin_avx2.c index 6041d7289a..8edddd637f 100644 --- a/vpx_dsp/x86/highbd_quantize_intrin_avx2.c +++ b/vpx_dsp/x86/highbd_quantize_intrin_avx2.c @@ -11,7 +11,6 @@ #include #include "./vpx_dsp_rtcd.h" -#include "vp9/encoder/vp9_block.h" static VPX_FORCE_INLINE void init_one_qp(const __m128i *p, __m256i *qp) { const __m128i sign = _mm_srai_epi16(*p, 15); @@ -223,17 +222,17 @@ static VPX_FORCE_INLINE void quantize_b_32x32( } void vpx_highbd_quantize_b_32x32_avx2( - const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { const unsigned int step = 8; - intptr_t n_coeffs = 32 * 32; __m256i eob = _mm256_setzero_si256(); __m256i qp[5]; (void)scan; - init_qp(mb_plane->zbin, mb_plane->round, mb_plane->quant, dequant_ptr, - mb_plane->quant_shift, qp, 1); + init_qp(zbin_ptr, round_ptr, quant_ptr, dequant_ptr, quant_shift_ptr, qp, 1); quantize_b_32x32(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob); diff --git a/vpx_dsp/x86/highbd_quantize_intrin_sse2.c b/vpx_dsp/x86/highbd_quantize_intrin_sse2.c index 6a8f42b8a4..ae1981a834 100644 --- a/vpx_dsp/x86/highbd_quantize_intrin_sse2.c +++ b/vpx_dsp/x86/highbd_quantize_intrin_sse2.c @@ -15,7 +15,6 @@ #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_mem/vpx_mem.h" #include "vpx_ports/mem.h" -#include "vp9/encoder/vp9_block.h" #if CONFIG_VP9_HIGHBITDEPTH void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count, @@ -94,17 +93,18 @@ void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count, } void vpx_highbd_quantize_b_32x32_sse2( - const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { __m128i zbins[2]; __m128i nzbins[2]; int idx = 0; int idx_arr[1024]; int i, eob = 0; - const intptr_t n_coeffs = 32 * 32; - const int zbin0_tmp = ROUND_POWER_OF_TWO(mb_plane->zbin[0], 1); - const int zbin1_tmp = ROUND_POWER_OF_TWO(mb_plane->zbin[1], 1); + const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 1); + const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 1); (void)scan; zbins[0] = _mm_set_epi32(zbin1_tmp, zbin1_tmp, zbin1_tmp, zbin0_tmp); @@ -140,11 +140,10 @@ void vpx_highbd_quantize_b_32x32_sse2( const int coeff = coeff_ptr[rc]; const int coeff_sign = (coeff >> 31); const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - const int64_t tmp1 = - abs_coeff + ROUND_POWER_OF_TWO(mb_plane->round[rc != 0], 1); - const int64_t tmp2 = ((tmp1 * mb_plane->quant[rc != 0]) >> 16) + tmp1; + const int64_t tmp1 = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); + const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; const uint32_t abs_qcoeff = - (uint32_t)((tmp2 * mb_plane->quant_shift[rc != 0]) >> 15); + (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15); qcoeff_ptr[rc] = (int)(abs_qcoeff ^ (uint32_t)coeff_sign) - coeff_sign; dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob; From 508bfc1ff4c2d56353dcb845b59158256c434200 Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 1 Mar 2023 15:53:18 -0800 Subject: [PATCH 0641/1000] Revert "quantize: simplify 32x32_b args" This reverts commit 848f6e733789c627b6606baf1c85e32be997e36f. This has alignment issues, causing crashes in the tests: SSSE3/VP9QuantizeTest.EOBCheck/* Change-Id: Ic12014ab0a78ed3cde02d642509061552cdc8fc9 --- test/vp9_quantize_test.cc | 285 ++++++++++++++--------------------- vp9/encoder/vp9_block.h | 1 - vp9/encoder/vp9_encodemb.c | 6 +- vpx_dsp/arm/quantize_neon.c | 17 ++- vpx_dsp/quantize.c | 17 +-- vpx_dsp/vpx_dsp_rtcd_defs.pl | 5 +- vpx_dsp/x86/quantize_avx.c | 30 +++- vpx_dsp/x86/quantize_avx2.c | 15 +- vpx_dsp/x86/quantize_sse2.h | 28 ---- vpx_dsp/x86/quantize_ssse3.c | 35 ++++- 10 files changed, 201 insertions(+), 238 deletions(-) diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc index ecb6116f0c..587cec6923 100644 --- a/test/vp9_quantize_test.cc +++ b/test/vp9_quantize_test.cc @@ -26,7 +26,6 @@ #include "test/util.h" #include "vp9/common/vp9_entropy.h" #include "vp9/common/vp9_scan.h" -#include "vp9/encoder/vp9_block.h" #include "vpx/vpx_codec.h" #include "vpx/vpx_integer.h" #include "vpx_ports/msvc.h" @@ -39,7 +38,8 @@ namespace { const int number_of_iterations = 100; typedef void (*QuantizeFunc)(const tran_low_t *coeff, intptr_t count, - const macroblock_plane *const mb_plane, + const int16_t *zbin, const int16_t *round, + const int16_t *quant, const int16_t *quant_shift, tran_low_t *qcoeff, tran_low_t *dqcoeff, const int16_t *dequant, uint16_t *eob, const int16_t *scan, const int16_t *iscan); @@ -47,41 +47,6 @@ typedef std::tuple QuantizeParam; -// Wrapper which takes a macroblock_plane. -typedef void (*QuantizeBaseFunc)(const tran_low_t *coeff, intptr_t count, - const int16_t *zbin, const int16_t *round, - const int16_t *quant, - const int16_t *quant_shift, tran_low_t *qcoeff, - tran_low_t *dqcoeff, const int16_t *dequant, - uint16_t *eob, const int16_t *scan, - const int16_t *iscan); - -template -void QuantWrapper(const tran_low_t *coeff, intptr_t count, - const macroblock_plane *const mb_plane, tran_low_t *qcoeff, - tran_low_t *dqcoeff, const int16_t *dequant, uint16_t *eob, - const int16_t *scan, const int16_t *iscan) { - fn(coeff, count, mb_plane->zbin, mb_plane->round, mb_plane->quant, - mb_plane->quant_shift, qcoeff, dqcoeff, dequant, eob, scan, iscan); -} - -// Wrapper for 32x32 version which does not use count -typedef void (*Quantize32x32Func)(const tran_low_t *coeff, - const macroblock_plane *const mb_plane, - tran_low_t *qcoeff, tran_low_t *dqcoeff, - const int16_t *dequant, uint16_t *eob, - const int16_t *scan, const int16_t *iscan); - -template -void Quant32x32Wrapper(const tran_low_t *coeff, intptr_t count, - const macroblock_plane *const mb_plane, - tran_low_t *qcoeff, tran_low_t *dqcoeff, - const int16_t *dequant, uint16_t *eob, - const int16_t *scan, const int16_t *iscan) { - (void)count; - fn(coeff, mb_plane, qcoeff, dqcoeff, dequant, eob, scan, iscan); -} - // Wrapper for FP version which does not use zbin or quant_shift. typedef void (*QuantizeFPFunc)(const tran_low_t *coeff, intptr_t count, const int16_t *round, const int16_t *quant, @@ -91,11 +56,15 @@ typedef void (*QuantizeFPFunc)(const tran_low_t *coeff, intptr_t count, template void QuantFPWrapper(const tran_low_t *coeff, intptr_t count, - const macroblock_plane *const mb_plane, tran_low_t *qcoeff, - tran_low_t *dqcoeff, const int16_t *dequant, uint16_t *eob, - const int16_t *scan, const int16_t *iscan) { - fn(coeff, count, mb_plane->round_fp, mb_plane->quant_fp, qcoeff, dqcoeff, - dequant, eob, scan, iscan); + const int16_t *zbin, const int16_t *round, + const int16_t *quant, const int16_t *quant_shift, + tran_low_t *qcoeff, tran_low_t *dqcoeff, + const int16_t *dequant, uint16_t *eob, const int16_t *scan, + const int16_t *iscan) { + (void)zbin; + (void)quant_shift; + + fn(coeff, count, round, quant, qcoeff, dqcoeff, dequant, eob, scan, iscan); } void GenerateHelperArrays(ACMRandom *rnd, int16_t *zbin, int16_t *round, @@ -150,16 +119,17 @@ class VP9QuantizeBase : public AbstractBench { #else max_value_ = (1 << bit_depth_) - 1; #endif - zbin_ptr_ = mb_plane_.zbin = + zbin_ptr_ = reinterpret_cast(vpx_memalign(16, 8 * sizeof(*zbin_ptr_))); - round_fp_ptr_ = mb_plane_.round_fp; - quant_fp_ptr_ = mb_plane_.quant_fp = reinterpret_cast( + round_fp_ptr_ = reinterpret_cast( + vpx_memalign(16, 8 * sizeof(*round_fp_ptr_))); + quant_fp_ptr_ = reinterpret_cast( vpx_memalign(16, 8 * sizeof(*quant_fp_ptr_))); - round_ptr_ = mb_plane_.round = + round_ptr_ = reinterpret_cast(vpx_memalign(16, 8 * sizeof(*round_ptr_))); - quant_ptr_ = mb_plane_.quant = + quant_ptr_ = reinterpret_cast(vpx_memalign(16, 8 * sizeof(*quant_ptr_))); - quant_shift_ptr_ = mb_plane_.quant_shift = reinterpret_cast( + quant_shift_ptr_ = reinterpret_cast( vpx_memalign(16, 8 * sizeof(*quant_shift_ptr_))); dequant_ptr_ = reinterpret_cast( vpx_memalign(16, 8 * sizeof(*dequant_ptr_))); @@ -170,6 +140,7 @@ class VP9QuantizeBase : public AbstractBench { ~VP9QuantizeBase() { vpx_free(zbin_ptr_); + vpx_free(round_fp_ptr_); vpx_free(quant_fp_ptr_); vpx_free(round_ptr_); vpx_free(quant_ptr_); @@ -186,7 +157,6 @@ class VP9QuantizeBase : public AbstractBench { } protected: - macroblock_plane mb_plane_; int16_t *zbin_ptr_; int16_t *round_fp_ptr_; int16_t *quant_fp_ptr_; @@ -223,9 +193,10 @@ class VP9QuantizeTest : public VP9QuantizeBase, }; void VP9QuantizeTest::Run() { - quantize_op_(coeff_.TopLeftPixel(), count_, &mb_plane_, - qcoeff_.TopLeftPixel(), dqcoeff_.TopLeftPixel(), dequant_ptr_, - &eob_, scan_->scan, scan_->iscan); + quantize_op_(coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_, + quant_shift_ptr_, qcoeff_.TopLeftPixel(), + dqcoeff_.TopLeftPixel(), dequant_ptr_, &eob_, scan_->scan, + scan_->iscan); } void VP9QuantizeTest::Speed(bool is_median) { @@ -295,8 +266,8 @@ void VP9QuantizeTest::Speed(bool is_median) { vpx_usec_timer_start(&timer); for (int n = 0; n < kNumTests; ++n) { - ref_quantize_op_(coeff_.TopLeftPixel(), count_, &mb_plane_, - ref_qcoeff.TopLeftPixel(), + ref_quantize_op_(coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, + q_ptr_, quant_shift_ptr_, ref_qcoeff.TopLeftPixel(), ref_dqcoeff.TopLeftPixel(), dequant_ptr_, &ref_eob, scan_->scan, scan_->iscan); } @@ -304,9 +275,10 @@ void VP9QuantizeTest::Speed(bool is_median) { vpx_usec_timer_start(&simd_timer); for (int n = 0; n < kNumTests; ++n) { - quantize_op_(coeff_.TopLeftPixel(), count_, &mb_plane_, - qcoeff_.TopLeftPixel(), dqcoeff_.TopLeftPixel(), - dequant_ptr_, &eob_, scan_->scan, scan_->iscan); + quantize_op_(coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_, + quant_shift_ptr_, qcoeff_.TopLeftPixel(), + dqcoeff_.TopLeftPixel(), dequant_ptr_, &eob_, + scan_->scan, scan_->iscan); } vpx_usec_timer_mark(&simd_timer); @@ -445,14 +417,15 @@ TEST_P(VP9QuantizeTest, OperationCheck) { GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_, quant_shift_ptr_, dequant_ptr_, round_fp_ptr_, quant_fp_ptr_); - ref_quantize_op_(coeff_.TopLeftPixel(), count_, &mb_plane_, - ref_qcoeff.TopLeftPixel(), ref_dqcoeff.TopLeftPixel(), - dequant_ptr_, &ref_eob, scan_->scan, scan_->iscan); + ref_quantize_op_(coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_, + quant_shift_ptr_, ref_qcoeff.TopLeftPixel(), + ref_dqcoeff.TopLeftPixel(), dequant_ptr_, &ref_eob, + scan_->scan, scan_->iscan); - ASM_REGISTER_STATE_CHECK(quantize_op_(coeff_.TopLeftPixel(), count_, - &mb_plane_, qcoeff_.TopLeftPixel(), - dqcoeff_.TopLeftPixel(), dequant_ptr_, - &eob_, scan_->scan, scan_->iscan)); + ASM_REGISTER_STATE_CHECK(quantize_op_( + coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_, + quant_shift_ptr_, qcoeff_.TopLeftPixel(), dqcoeff_.TopLeftPixel(), + dequant_ptr_, &eob_, scan_->scan, scan_->iscan)); EXPECT_TRUE(qcoeff_.CheckValues(ref_qcoeff)); EXPECT_TRUE(dqcoeff_.CheckValues(ref_dqcoeff)); @@ -502,14 +475,15 @@ TEST_P(VP9QuantizeTest, EOBCheck) { GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_, quant_shift_ptr_, dequant_ptr_, round_fp_ptr_, quant_fp_ptr_); - ref_quantize_op_(coeff_.TopLeftPixel(), count_, &mb_plane_, - ref_qcoeff.TopLeftPixel(), ref_dqcoeff.TopLeftPixel(), - dequant_ptr_, &ref_eob, scan_->scan, scan_->iscan); + ref_quantize_op_(coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_, + quant_shift_ptr_, ref_qcoeff.TopLeftPixel(), + ref_dqcoeff.TopLeftPixel(), dequant_ptr_, &ref_eob, + scan_->scan, scan_->iscan); - ASM_REGISTER_STATE_CHECK(quantize_op_(coeff_.TopLeftPixel(), count_, - &mb_plane_, qcoeff_.TopLeftPixel(), - dqcoeff_.TopLeftPixel(), dequant_ptr_, - &eob_, scan_->scan, scan_->iscan)); + ASM_REGISTER_STATE_CHECK(quantize_op_( + coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_, + quant_shift_ptr_, qcoeff_.TopLeftPixel(), dqcoeff_.TopLeftPixel(), + dequant_ptr_, &eob_, scan_->scan, scan_->iscan)); EXPECT_TRUE(qcoeff_.CheckValues(ref_qcoeff)); EXPECT_TRUE(dqcoeff_.CheckValues(ref_dqcoeff)); @@ -536,35 +510,28 @@ using std::make_tuple; INSTANTIATE_TEST_SUITE_P( SSE2, VP9QuantizeTest, ::testing::Values( - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_8, 16, false), + make_tuple(&vpx_quantize_b_sse2, &vpx_quantize_b_c, VPX_BITS_8, 16, + false), make_tuple(&QuantFPWrapper, &QuantFPWrapper, VPX_BITS_8, 16, true), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_8, 16, - false), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_10, 16, - false), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_12, 16, - false), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_8, 32, - false), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_10, - 32, false), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_12, - 32, false))); + make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c, + VPX_BITS_8, 16, false), + make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c, + VPX_BITS_10, 16, false), + make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c, + VPX_BITS_12, 16, false), + make_tuple(&vpx_highbd_quantize_b_32x32_sse2, + &vpx_highbd_quantize_b_32x32_c, VPX_BITS_8, 32, false), + make_tuple(&vpx_highbd_quantize_b_32x32_sse2, + &vpx_highbd_quantize_b_32x32_c, VPX_BITS_10, 32, false), + make_tuple(&vpx_highbd_quantize_b_32x32_sse2, + &vpx_highbd_quantize_b_32x32_c, VPX_BITS_12, 32, false))); #else INSTANTIATE_TEST_SUITE_P( SSE2, VP9QuantizeTest, - ::testing::Values(make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_8, - 16, false), + ::testing::Values(make_tuple(&vpx_quantize_b_sse2, &vpx_quantize_b_c, + VPX_BITS_8, 16, false), make_tuple(&QuantFPWrapper, &QuantFPWrapper, VPX_BITS_8, 16, true))); @@ -574,12 +541,11 @@ INSTANTIATE_TEST_SUITE_P( #if HAVE_SSSE3 INSTANTIATE_TEST_SUITE_P( SSSE3, VP9QuantizeTest, - ::testing::Values(make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_8, - 16, false), - make_tuple(&Quant32x32Wrapper, - &Quant32x32Wrapper, - VPX_BITS_8, 32, false), + ::testing::Values(make_tuple(&vpx_quantize_b_ssse3, &vpx_quantize_b_c, + VPX_BITS_8, 16, false), + make_tuple(&vpx_quantize_b_32x32_ssse3, + &vpx_quantize_b_32x32_c, VPX_BITS_8, 32, + false), make_tuple(&QuantFPWrapper, &QuantFPWrapper, VPX_BITS_8, 16, true), @@ -589,14 +555,13 @@ INSTANTIATE_TEST_SUITE_P( #endif // HAVE_SSSE3 #if HAVE_AVX -INSTANTIATE_TEST_SUITE_P( - AVX, VP9QuantizeTest, - ::testing::Values(make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_8, - 16, false), - make_tuple(&Quant32x32Wrapper, - &Quant32x32Wrapper, - VPX_BITS_8, 32, false))); +INSTANTIATE_TEST_SUITE_P(AVX, VP9QuantizeTest, + ::testing::Values(make_tuple(&vpx_quantize_b_avx, + &vpx_quantize_b_c, + VPX_BITS_8, 16, false), + make_tuple(&vpx_quantize_b_32x32_avx, + &vpx_quantize_b_32x32_c, + VPX_BITS_8, 32, false))); #endif // HAVE_AVX #if VPX_ARCH_X86_64 && HAVE_AVX2 @@ -612,29 +577,22 @@ INSTANTIATE_TEST_SUITE_P( make_tuple(&QuantFPWrapper, &QuantFPWrapper, VPX_BITS_12, 32, true), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_8, 16, false), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_8, 16, + make_tuple(&vpx_quantize_b_avx2, &vpx_quantize_b_c, VPX_BITS_8, 16, false), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_10, 16, - false), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_12, 16, - false), - make_tuple(&Quant32x32Wrapper, - &Quant32x32Wrapper, VPX_BITS_8, 32, - false), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_8, 32, - false), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_10, - 32, false), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_12, - 32, false))); + make_tuple(&vpx_highbd_quantize_b_avx2, &vpx_highbd_quantize_b_c, + VPX_BITS_8, 16, false), + make_tuple(&vpx_highbd_quantize_b_avx2, &vpx_highbd_quantize_b_c, + VPX_BITS_10, 16, false), + make_tuple(&vpx_highbd_quantize_b_avx2, &vpx_highbd_quantize_b_c, + VPX_BITS_12, 16, false), + make_tuple(&vpx_quantize_b_32x32_avx2, &vpx_quantize_b_32x32_c, + VPX_BITS_8, 32, false), + make_tuple(&vpx_highbd_quantize_b_32x32_avx2, + &vpx_highbd_quantize_b_32x32_c, VPX_BITS_8, 32, false), + make_tuple(&vpx_highbd_quantize_b_32x32_avx2, + &vpx_highbd_quantize_b_32x32_c, VPX_BITS_10, 32, false), + make_tuple(&vpx_highbd_quantize_b_32x32_avx2, + &vpx_highbd_quantize_b_32x32_c, VPX_BITS_12, 32, false))); #else INSTANTIATE_TEST_SUITE_P( AVX2, VP9QuantizeTest, @@ -644,12 +602,11 @@ INSTANTIATE_TEST_SUITE_P( make_tuple(&QuantFPWrapper, &QuantFPWrapper, VPX_BITS_8, 32, true), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_8, - 16, false), - make_tuple(&Quant32x32Wrapper, - &Quant32x32Wrapper, - VPX_BITS_8, 32, false))); + make_tuple(&vpx_quantize_b_avx2, &vpx_quantize_b_c, + VPX_BITS_8, 16, false), + make_tuple(&vpx_quantize_b_32x32_avx2, + &vpx_quantize_b_32x32_c, VPX_BITS_8, 32, + false))); #endif // CONFIG_VP9_HIGHBITDEPTH #endif // HAVE_AVX2 @@ -658,29 +615,22 @@ INSTANTIATE_TEST_SUITE_P( INSTANTIATE_TEST_SUITE_P( NEON, VP9QuantizeTest, ::testing::Values( - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_8, 16, false), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_8, 16, - false), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_10, 16, - false), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_12, 16, + make_tuple(&vpx_quantize_b_neon, &vpx_quantize_b_c, VPX_BITS_8, 16, false), - make_tuple(&Quant32x32Wrapper, - &Quant32x32Wrapper, VPX_BITS_8, 32, - false), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_8, 32, - false), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_10, - 32, false), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_12, - 32, false), + make_tuple(&vpx_highbd_quantize_b_neon, &vpx_highbd_quantize_b_c, + VPX_BITS_8, 16, false), + make_tuple(&vpx_highbd_quantize_b_neon, &vpx_highbd_quantize_b_c, + VPX_BITS_10, 16, false), + make_tuple(&vpx_highbd_quantize_b_neon, &vpx_highbd_quantize_b_c, + VPX_BITS_12, 16, false), + make_tuple(&vpx_quantize_b_32x32_neon, &vpx_quantize_b_32x32_c, + VPX_BITS_8, 32, false), + make_tuple(&vpx_highbd_quantize_b_32x32_neon, + &vpx_highbd_quantize_b_32x32_c, VPX_BITS_8, 32, false), + make_tuple(&vpx_highbd_quantize_b_32x32_neon, + &vpx_highbd_quantize_b_32x32_c, VPX_BITS_10, 32, false), + make_tuple(&vpx_highbd_quantize_b_32x32_neon, + &vpx_highbd_quantize_b_32x32_c, VPX_BITS_12, 32, false), make_tuple(&QuantFPWrapper, &QuantFPWrapper, VPX_BITS_8, 16, true), make_tuple(&QuantFPWrapper, @@ -689,12 +639,11 @@ INSTANTIATE_TEST_SUITE_P( #else INSTANTIATE_TEST_SUITE_P( NEON, VP9QuantizeTest, - ::testing::Values(make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_8, - 16, false), - make_tuple(&Quant32x32Wrapper, - &Quant32x32Wrapper, - VPX_BITS_8, 32, false), + ::testing::Values(make_tuple(&vpx_quantize_b_neon, &vpx_quantize_b_c, + VPX_BITS_8, 16, false), + make_tuple(&vpx_quantize_b_32x32_neon, + &vpx_quantize_b_32x32_c, VPX_BITS_8, 32, + false), make_tuple(&QuantFPWrapper, &QuantFPWrapper, VPX_BITS_8, 16, true), @@ -734,11 +683,9 @@ INSTANTIATE_TEST_SUITE_P(LSX, VP9QuantizeTest, INSTANTIATE_TEST_SUITE_P( DISABLED_C, VP9QuantizeTest, ::testing::Values( - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_8, 16, false), - make_tuple(&Quant32x32Wrapper, - &Quant32x32Wrapper, VPX_BITS_8, 32, - false), + make_tuple(&vpx_quantize_b_c, &vpx_quantize_b_c, VPX_BITS_8, 16, false), + make_tuple(&vpx_quantize_b_32x32_c, &vpx_quantize_b_32x32_c, VPX_BITS_8, + 32, false), make_tuple(&QuantFPWrapper, &QuantFPWrapper, VPX_BITS_8, 16, true), make_tuple(&QuantFPWrapper, diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h index fc27a0fbda..1786952911 100644 --- a/vp9/encoder/vp9_block.h +++ b/vp9/encoder/vp9_block.h @@ -13,7 +13,6 @@ #include "vpx_util/vpx_thread.h" -#include "vp9/common/vp9_blockd.h" #include "vp9/common/vp9_entropymv.h" #include "vp9/common/vp9_entropy.h" diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c index 4910dc20f5..fa222f9dcf 100644 --- a/vp9/encoder/vp9_encodemb.c +++ b/vp9/encoder/vp9_encodemb.c @@ -542,7 +542,8 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col, switch (tx_size) { case TX_32X32: fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); - vpx_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, eob, + vpx_quantize_b_32x32(coeff, 1024, p->zbin, p->round, p->quant, + p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); break; case TX_16X16: @@ -947,7 +948,8 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, vpx_subtract_block(32, 32, src_diff, diff_stride, src, src_stride, dst, dst_stride); fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); - vpx_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, eob, + vpx_quantize_b_32x32(coeff, 1024, p->zbin, p->round, p->quant, + p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); } if (args->enable_coeff_opt && !x->skip_recode) { diff --git a/vpx_dsp/arm/quantize_neon.c b/vpx_dsp/arm/quantize_neon.c index e81738a7bb..9c227d560f 100644 --- a/vpx_dsp/arm/quantize_neon.c +++ b/vpx_dsp/arm/quantize_neon.c @@ -14,7 +14,6 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/arm/mem_neon.h" -#include "vp9/encoder/vp9_block.h" static INLINE void calculate_dqcoeff_and_store(const int16x8_t qcoeff, const int16x8_t dequant, @@ -214,8 +213,11 @@ quantize_b_32x32_neon(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, // Main difference is that zbin values are halved before comparison and dqcoeff // values are divided by 2. zbin is rounded but dqcoeff is not. -void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, - const struct macroblock_plane *const mb_plane, +void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, + const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { @@ -224,10 +226,10 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, int i; // Only the first element of each vector is DC. - int16x8_t zbin = vrshrq_n_s16(vld1q_s16(mb_plane->zbin), 1); - int16x8_t round = vrshrq_n_s16(vld1q_s16(mb_plane->round), 1); - int16x8_t quant = vld1q_s16(mb_plane->quant); - int16x8_t quant_shift = vld1q_s16(mb_plane->quant_shift); + int16x8_t zbin = vrshrq_n_s16(vld1q_s16(zbin_ptr), 1); + int16x8_t round = vrshrq_n_s16(vld1q_s16(round_ptr), 1); + int16x8_t quant = vld1q_s16(quant_ptr); + int16x8_t quant_shift = vld1q_s16(quant_shift_ptr); int16x8_t dequant = vld1q_s16(dequant_ptr); // Process first 8 values which include a dc component. @@ -287,5 +289,6 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, #endif // __aarch64__ // Need these here, else the compiler complains about mixing declarations and // code in C90 + (void)n_coeffs; (void)scan; } diff --git a/vpx_dsp/quantize.c b/vpx_dsp/quantize.c index 212db45c88..5d6ba64a8a 100644 --- a/vpx_dsp/quantize.c +++ b/vpx_dsp/quantize.c @@ -14,7 +14,6 @@ #include "vpx_dsp/quantize.h" #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_mem/vpx_mem.h" -#include "vp9/encoder/vp9_block.h" void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, const int16_t *round_ptr, const int16_t quant, @@ -209,21 +208,19 @@ void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, } #endif -void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, - const struct macroblock_plane *const mb_plane, +void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { - const int n_coeffs = 32 * 32; - const int zbins[2] = { ROUND_POWER_OF_TWO(mb_plane->zbin[0], 1), - ROUND_POWER_OF_TWO(mb_plane->zbin[1], 1) }; + const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], 1), + ROUND_POWER_OF_TWO(zbin_ptr[1], 1) }; const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; - const int16_t *round_ptr = mb_plane->round; - const int16_t *quant_ptr = mb_plane->quant; - const int16_t *quant_shift_ptr = mb_plane->quant_shift; int idx = 0; - int idx_arr[32 * 32 /* n_coeffs */]; + int idx_arr[1024]; int i, eob = -1; (void)iscan; diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 2752eea5d9..3bcfa7a541 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -17,9 +17,6 @@ () #include "vpx/vpx_integer.h" #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_dsp/vpx_filter.h" -#if CONFIG_VP9_ENCODER - struct macroblock_plane; -#endif EOF } @@ -724,7 +721,7 @@ () add_proto qw/void vpx_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vpx_quantize_b neon sse2 ssse3 avx avx2 vsx lsx/; - add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, const struct macroblock_plane * const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vpx_quantize_b_32x32 neon ssse3 avx avx2 vsx lsx/; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { diff --git a/vpx_dsp/x86/quantize_avx.c b/vpx_dsp/x86/quantize_avx.c index d52f6c6644..7d83527216 100644 --- a/vpx_dsp/x86/quantize_avx.c +++ b/vpx_dsp/x86/quantize_avx.c @@ -140,12 +140,15 @@ void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, *eob_ptr = accumulate_eob(eob); } -void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, - const struct macroblock_plane *const mb_plane, +void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi16(1); const __m256i big_zero = _mm256_setzero_si256(); int index; @@ -157,9 +160,26 @@ void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, __m128i eob = zero, eob0; (void)scan; - - load_b_values32x32(mb_plane, &zbin, &round, &quant, dequant_ptr, &dequant, - &shift); + (void)n_coeffs; + + // Setup global values. + // The 32x32 halves zbin and round. + zbin = _mm_load_si128((const __m128i *)zbin_ptr); + // Shift with rounding. + zbin = _mm_add_epi16(zbin, one); + zbin = _mm_srli_epi16(zbin, 1); + // x86 has no "greater *or equal*" comparison. Subtract 1 from zbin so + // it is a strict "greater" comparison. + zbin = _mm_sub_epi16(zbin, one); + + round = _mm_load_si128((const __m128i *)round_ptr); + round = _mm_add_epi16(round, one); + round = _mm_srli_epi16(round, 1); + + quant = _mm_load_si128((const __m128i *)quant_ptr); + dequant = _mm_load_si128((const __m128i *)dequant_ptr); + shift = _mm_load_si128((const __m128i *)quant_shift_ptr); + shift = _mm_slli_epi16(shift, 1); // Do DC and first 15 AC. coeff0 = load_tran_low(coeff_ptr); diff --git a/vpx_dsp/x86/quantize_avx2.c b/vpx_dsp/x86/quantize_avx2.c index a8412c5b8e..28f7c9c7da 100644 --- a/vpx_dsp/x86/quantize_avx2.c +++ b/vpx_dsp/x86/quantize_avx2.c @@ -13,7 +13,6 @@ #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" -#include "vp9/encoder/vp9_block.h" static VPX_FORCE_INLINE void load_b_values_avx2( const int16_t *zbin_ptr, __m256i *zbin, const int16_t *round_ptr, @@ -251,19 +250,23 @@ static VPX_FORCE_INLINE __m256i quantize_b_32x32_16( } } -void vpx_quantize_b_32x32_avx2(const tran_low_t *coeff_ptr, - const struct macroblock_plane *const mb_plane, +void vpx_quantize_b_32x32_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, + const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { __m256i v_zbin, v_round, v_quant, v_dequant, v_quant_shift; __m256i v_eobmax = _mm256_setzero_si256(); intptr_t count; + (void)n_coeffs; (void)scan; - load_b_values_avx2(mb_plane->zbin, &v_zbin, mb_plane->round, &v_round, - mb_plane->quant, &v_quant, dequant_ptr, &v_dequant, - mb_plane->quant_shift, &v_quant_shift, 1); + load_b_values_avx2(zbin_ptr, &v_zbin, round_ptr, &v_round, quant_ptr, + &v_quant, dequant_ptr, &v_dequant, quant_shift_ptr, + &v_quant_shift, 1); // Do DC and first 15 AC. v_eobmax = quantize_b_32x32_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, iscan, diff --git a/vpx_dsp/x86/quantize_sse2.h b/vpx_dsp/x86/quantize_sse2.h index fe42fee018..27bfb4e41b 100644 --- a/vpx_dsp/x86/quantize_sse2.h +++ b/vpx_dsp/x86/quantize_sse2.h @@ -15,7 +15,6 @@ #include "./vpx_config.h" #include "vpx/vpx_integer.h" -#include "vp9/encoder/vp9_block.h" static INLINE void load_b_values(const int16_t *zbin_ptr, __m128i *zbin, const int16_t *round_ptr, __m128i *round, @@ -30,33 +29,6 @@ static INLINE void load_b_values(const int16_t *zbin_ptr, __m128i *zbin, *shift = _mm_load_si128((const __m128i *)shift_ptr); } -static INLINE void load_b_values32x32( - const struct macroblock_plane *const mb_plane, __m128i *zbin, - __m128i *round, __m128i *quant, const int16_t *dequant_ptr, - __m128i *dequant, __m128i *shift) { - const __m128i one = _mm_set1_epi16(1); - // The 32x32 halves zbin and round. - *zbin = _mm_load_si128((const __m128i *)mb_plane->zbin); - // Shift with rounding. - *zbin = _mm_add_epi16(*zbin, one); - *zbin = _mm_srli_epi16(*zbin, 1); - // x86 has no "greater *or equal*" comparison. Subtract 1 from zbin so - // it is a strict "greater" comparison. - *zbin = _mm_sub_epi16(*zbin, one); - - *round = _mm_load_si128((const __m128i *)mb_plane->round); - *round = _mm_add_epi16(*round, one); - *round = _mm_srli_epi16(*round, 1); - - *quant = _mm_load_si128((const __m128i *)mb_plane->quant); - *dequant = _mm_load_si128((const __m128i *)dequant_ptr); - *shift = _mm_load_si128((const __m128i *)mb_plane->quant_shift); - // I suspect this is not technically OK because quant_shift can be up - // to 1 << 16 and shifting up again will outrange that, but the test is not - // comprehensive enough to catch that and "it's been that way forever" - *shift = _mm_slli_epi16(*shift, 1); -} - static INLINE void load_fp_values(const int16_t *round_ptr, __m128i *round, const int16_t *quant_ptr, __m128i *quant, const int16_t *dequant_ptr, diff --git a/vpx_dsp/x86/quantize_ssse3.c b/vpx_dsp/x86/quantize_ssse3.c index 6fe54d7d98..476230286d 100644 --- a/vpx_dsp/x86/quantize_ssse3.c +++ b/vpx_dsp/x86/quantize_ssse3.c @@ -16,7 +16,6 @@ #include "vpx_dsp/x86/bitdepth_conversion_sse2.h" #include "vpx_dsp/x86/quantize_sse2.h" #include "vpx_dsp/x86/quantize_ssse3.h" -#include "vp9/encoder/vp9_block.h" void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, @@ -108,12 +107,16 @@ void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, *eob_ptr = accumulate_eob(eob); } -void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, - const struct macroblock_plane *const mb_plane, +void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, + const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi16(1); int index; __m128i zbin, round, quant, dequant, shift; @@ -124,9 +127,29 @@ void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, __m128i eob = zero, eob0; (void)scan; - - load_b_values32x32(mb_plane, &zbin, &round, &quant, dequant_ptr, &dequant, - &shift); + (void)n_coeffs; + + // Setup global values. + // The 32x32 halves zbin and round. + zbin = _mm_load_si128((const __m128i *)zbin_ptr); + // Shift with rounding. + zbin = _mm_add_epi16(zbin, one); + zbin = _mm_srli_epi16(zbin, 1); + // x86 has no "greater *or equal*" comparison. Subtract 1 from zbin so + // it is a strict "greater" comparison. + zbin = _mm_sub_epi16(zbin, one); + + round = _mm_load_si128((const __m128i *)round_ptr); + round = _mm_add_epi16(round, one); + round = _mm_srli_epi16(round, 1); + + quant = _mm_load_si128((const __m128i *)quant_ptr); + dequant = _mm_load_si128((const __m128i *)dequant_ptr); + shift = _mm_load_si128((const __m128i *)quant_shift_ptr); + // I suspect this is not technically OK because quant_shift can be up + // to 1 << 16 and shifting up again will outrange that, but the test is not + // comprehensive enough to catch that and "it's been that way forever" + shift = _mm_slli_epi16(shift, 1); // Do DC and first 15 AC. coeff0 = load_tran_low(coeff_ptr); From 817248e1be1548af10f3d4f0922e01e372d10cea Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Wed, 1 Mar 2023 23:54:51 +0000 Subject: [PATCH 0642/1000] [SSE4_1] Fix overflow in highbd temporal_filter While porting this function to NEON, using SSE4_1 implementation as base I noticed that both were producing files with different checksums to the C reference implementation. After investigating further I found that this saturating pack was the culprit. Doing the multiplication on the 32-bit values, leads to producing the correct results with the C implementation. Change-Id: I40c2a36551b2db363a58ea9aa19ef327f2676de3 --- vp9/encoder/x86/highbd_temporal_filter_sse4.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vp9/encoder/x86/highbd_temporal_filter_sse4.c b/vp9/encoder/x86/highbd_temporal_filter_sse4.c index a7f5117cff..bcbf6d77e6 100644 --- a/vp9/encoder/x86/highbd_temporal_filter_sse4.c +++ b/vp9/encoder/x86/highbd_temporal_filter_sse4.c @@ -141,11 +141,12 @@ static INLINE void highbd_accumulate_and_store_8(const __m128i sum_first_u32, count_u16 = _mm_adds_epu16(count_u16, sum_u16); _mm_storeu_si128((__m128i *)count, count_u16); - pred_u16 = _mm_mullo_epi16(sum_u16, pred_u16); - pred_0_u32 = _mm_cvtepu16_epi32(pred_u16); pred_1_u32 = _mm_unpackhi_epi16(pred_u16, zero); + pred_0_u32 = _mm_mullo_epi32(sum_first_u32, pred_0_u32); + pred_1_u32 = _mm_mullo_epi32(sum_second_u32, pred_1_u32); + accum_0_u32 = _mm_loadu_si128((const __m128i *)accumulator); accum_1_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 4)); From ca0c51f05f6238a68fbc33efedef4e6ec7f0b56d Mon Sep 17 00:00:00 2001 From: Johann Date: Fri, 3 Mar 2023 12:46:01 +0900 Subject: [PATCH 0643/1000] Revert "Allow macroblock_plane to have its own rounding buffer" This reverts commit 5359ae810cdbb974060297ecf935183baf7b009b. Reason for revert: Blocks quantize cleanups Original change's description: > Allow macroblock_plane to have its own rounding buffer > > Add 8 bytes buffer to macroblock_plane to support rounding factor. > > Change-Id: I3751689e4449c0caea28d3acf6cd17d7f39508ed Change-Id: Ia2424d2114207370f0b45350313a5ff8521d25a8 --- vp9/encoder/vp9_block.h | 2 +- vp9/encoder/vp9_quantize.c | 6 ++---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h index 1786952911..3e2c9a3c35 100644 --- a/vp9/encoder/vp9_block.h +++ b/vp9/encoder/vp9_block.h @@ -34,7 +34,7 @@ struct macroblock_plane { struct buf_2d src; // Quantizer setings - DECLARE_ALIGNED(16, int16_t, round_fp[8]); + int16_t *round_fp; int16_t *quant_fp; int16_t *quant; int16_t *quant_shift; diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c index dcc44449fd..115c66723d 100644 --- a/vp9/encoder/vp9_quantize.c +++ b/vp9/encoder/vp9_quantize.c @@ -249,8 +249,7 @@ void vp9_init_plane_quantizers(VP9_COMP *cpi, MACROBLOCK *x) { // Y x->plane[0].quant = quants->y_quant[qindex]; x->plane[0].quant_fp = quants->y_quant_fp[qindex]; - memcpy(x->plane[0].round_fp, quants->y_round_fp[qindex], - 8 * sizeof(*(x->plane[0].round_fp))); + x->plane[0].round_fp = quants->y_round_fp[qindex]; x->plane[0].quant_shift = quants->y_quant_shift[qindex]; x->plane[0].zbin = quants->y_zbin[qindex]; x->plane[0].round = quants->y_round[qindex]; @@ -262,8 +261,7 @@ void vp9_init_plane_quantizers(VP9_COMP *cpi, MACROBLOCK *x) { for (i = 1; i < 3; i++) { x->plane[i].quant = quants->uv_quant[qindex]; x->plane[i].quant_fp = quants->uv_quant_fp[qindex]; - memcpy(x->plane[i].round_fp, quants->uv_round_fp[qindex], - 8 * sizeof(*(x->plane[i].round_fp))); + x->plane[i].round_fp = quants->uv_round_fp[qindex]; x->plane[i].quant_shift = quants->uv_quant_shift[qindex]; x->plane[i].zbin = quants->uv_zbin[qindex]; x->plane[i].round = quants->uv_round[qindex]; From 394de691a0ef570fc49943f565ad53ee0d22a7f3 Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 3 Mar 2023 12:34:36 -0800 Subject: [PATCH 0644/1000] Revert "Implement d117_predictor using Neon" This reverts commit 360e9069b6cc1dd3a004728b876fb923413f4b11. This causes ASan errors: [ RUN ] VP9/TestVectorTest.MD5Match/1 ================================================================= ==837858==ERROR: AddressSanitizer: stack-buffer-overflow on address 0xffff82ecad40 at pc 0x000000c494d4 bp 0xffffe1695800 sp 0xffffe16957f8 READ of size 16 at 0xffff82ecad40 thread T0 #0 0xc494d0 in vpx_d117_predictor_32x32_neon (test_libvpx+0xc494d0) #1 0x1040b34 in vp9_predict_intra_block (test_libvpx+0x1040b34) #2 0xf8feec in decode_block (test_libvpx+0xf8feec) #3 0xf8f588 in decode_partition (test_libvpx+0xf8f588) #4 0xf7be5c in vp9_decode_frame (test_libvpx+0xf7be5c) ... Address 0xffff82ecad40 is located in stack of thread T0 at offset 64 in frame #0 0x103fd3c in vp9_predict_intra_block (test_libvpx+0x103fd3c) This frame has 2 object(s): [32, 64) 'left_col.i' <== Memory access at offset 64 overflows this variable [96, 176) 'above_data.i' Change-Id: I058213364617dfe1036126c33a3307f8288d9ae0 --- test/test_intra_pred_speed.cc | 20 ++- test/vp9_intrapred_test.cc | 8 -- vpx_dsp/arm/intrapred_neon.c | 232 ---------------------------------- vpx_dsp/vpx_dsp_rtcd_defs.pl | 4 - 4 files changed, 8 insertions(+), 256 deletions(-) diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc index 24af471eaa..30817553ff 100644 --- a/test/test_intra_pred_speed.cc +++ b/test/test_intra_pred_speed.cc @@ -269,32 +269,28 @@ INTRA_PRED_TEST(NEON, TestIntraPred4, vpx_dc_predictor_4x4_neon, vpx_dc_left_predictor_4x4_neon, vpx_dc_top_predictor_4x4_neon, vpx_dc_128_predictor_4x4_neon, vpx_v_predictor_4x4_neon, vpx_h_predictor_4x4_neon, vpx_d45_predictor_4x4_neon, - vpx_d135_predictor_4x4_neon, vpx_d117_predictor_4x4_neon, - nullptr, nullptr, vpx_d63_predictor_4x4_neon, - vpx_tm_predictor_4x4_neon) + vpx_d135_predictor_4x4_neon, nullptr, nullptr, nullptr, + vpx_d63_predictor_4x4_neon, vpx_tm_predictor_4x4_neon) INTRA_PRED_TEST(NEON, TestIntraPred8, vpx_dc_predictor_8x8_neon, vpx_dc_left_predictor_8x8_neon, vpx_dc_top_predictor_8x8_neon, vpx_dc_128_predictor_8x8_neon, vpx_v_predictor_8x8_neon, vpx_h_predictor_8x8_neon, vpx_d45_predictor_8x8_neon, - vpx_d135_predictor_8x8_neon, vpx_d117_predictor_8x8_neon, - nullptr, nullptr, vpx_d63_predictor_8x8_neon, - vpx_tm_predictor_8x8_neon) + vpx_d135_predictor_8x8_neon, nullptr, nullptr, nullptr, + vpx_d63_predictor_8x8_neon, vpx_tm_predictor_8x8_neon) INTRA_PRED_TEST(NEON, TestIntraPred16, vpx_dc_predictor_16x16_neon, vpx_dc_left_predictor_16x16_neon, vpx_dc_top_predictor_16x16_neon, vpx_dc_128_predictor_16x16_neon, vpx_v_predictor_16x16_neon, vpx_h_predictor_16x16_neon, vpx_d45_predictor_16x16_neon, - vpx_d135_predictor_16x16_neon, vpx_d117_predictor_16x16_neon, - nullptr, nullptr, vpx_d63_predictor_16x16_neon, - vpx_tm_predictor_16x16_neon) + vpx_d135_predictor_16x16_neon, nullptr, nullptr, nullptr, + vpx_d63_predictor_16x16_neon, vpx_tm_predictor_16x16_neon) INTRA_PRED_TEST(NEON, TestIntraPred32, vpx_dc_predictor_32x32_neon, vpx_dc_left_predictor_32x32_neon, vpx_dc_top_predictor_32x32_neon, vpx_dc_128_predictor_32x32_neon, vpx_v_predictor_32x32_neon, vpx_h_predictor_32x32_neon, vpx_d45_predictor_32x32_neon, - vpx_d135_predictor_32x32_neon, vpx_d117_predictor_32x32_neon, - nullptr, nullptr, vpx_d63_predictor_32x32_neon, - vpx_tm_predictor_32x32_neon) + vpx_d135_predictor_32x32_neon, nullptr, nullptr, nullptr, + vpx_d63_predictor_32x32_neon, vpx_tm_predictor_32x32_neon) #endif // HAVE_NEON #if HAVE_MSA diff --git a/test/vp9_intrapred_test.cc b/test/vp9_intrapred_test.cc index 83e371df6e..7f8e1c5b51 100644 --- a/test/vp9_intrapred_test.cc +++ b/test/vp9_intrapred_test.cc @@ -251,14 +251,6 @@ INSTANTIATE_TEST_SUITE_P( &vpx_d63_predictor_16x16_c, 16, 8), IntraPredParam(&vpx_d63_predictor_32x32_neon, &vpx_d63_predictor_32x32_c, 32, 8), - IntraPredParam(&vpx_d117_predictor_4x4_neon, &vpx_d117_predictor_4x4_c, - 4, 8), - IntraPredParam(&vpx_d117_predictor_8x8_neon, &vpx_d117_predictor_8x8_c, - 8, 8), - IntraPredParam(&vpx_d117_predictor_16x16_neon, - &vpx_d117_predictor_16x16_c, 16, 8), - IntraPredParam(&vpx_d117_predictor_32x32_neon, - &vpx_d117_predictor_32x32_c, 32, 8), IntraPredParam(&vpx_d135_predictor_4x4_neon, &vpx_d135_predictor_4x4_c, 4, 8), IntraPredParam(&vpx_d135_predictor_8x8_neon, &vpx_d135_predictor_8x8_c, diff --git a/vpx_dsp/arm/intrapred_neon.c b/vpx_dsp/arm/intrapred_neon.c index 4760a295b9..02a05aae53 100644 --- a/vpx_dsp/arm/intrapred_neon.c +++ b/vpx_dsp/arm/intrapred_neon.c @@ -545,238 +545,6 @@ void vpx_d63_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, // ----------------------------------------------------------------------------- -void vpx_d117_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - // See vpx_d117_predictor_8x8_neon for more details on the implementation. - uint8x8_t az, a0, l0az, d0, d1, d2, d3, col0, col1; - - az = load_unaligned_u8_4x1(above - 1); - a0 = load_unaligned_u8_4x1(above + 0); - // [ left[0], above[-1], above[0], above[1], x, x, x, x ] - l0az = vext_u8(vld1_dup_u8(left), az, 7); - - col0 = vdup_n_u8((above[-1] + 2 * left[0] + left[1] + 2) >> 2); - col1 = vdup_n_u8((left[0] + 2 * left[1] + left[2] + 2) >> 2); - - d0 = vrhadd_u8(az, a0); - d1 = vrhadd_u8(vhadd_u8(l0az, a0), az); - d2 = vext_u8(col0, d0, 7); - d3 = vext_u8(col1, d1, 7); - - store_u8_4x1(dst + 0 * stride, d0); - store_u8_4x1(dst + 1 * stride, d1); - store_u8_4x1(dst + 2 * stride, d2); - store_u8_4x1(dst + 3 * stride, d3); -} - -void vpx_d117_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - uint8x8_t az, a0, l0az, d0, d1, l0, l1, azl0, col0, col0_even, col0_odd; - - az = vld1_u8(above - 1); - a0 = vld1_u8(above + 0); - // [ left[0], above[-1], ... , above[5] ] - l0az = vext_u8(vld1_dup_u8(left), az, 7); - - l0 = vld1_u8(left + 0); - l1 = vld1_u8(left + 1); - // [ above[-1], left[0], ... , left[6] ] - azl0 = vext_u8(vld1_dup_u8(above - 1), l0, 7); - - // d0[0] = AVG2(above[-1], above[0]) - // d0[1] = AVG2(above[0], above[1]) - // ... - // d0[7] = AVG2(above[6], above[7]) - d0 = vrhadd_u8(az, a0); - - // d1[0] = AVG3(left[0], above[-1], above[0]) - // d1[1] = AVG3(above[-1], above[0], above[1]) - // ... - // d1[7] = AVG3(above[5], above[6], above[7]) - d1 = vrhadd_u8(vhadd_u8(l0az, a0), az); - - // The ext instruction shifts elements in from the end of the vector rather - // than the start, so reverse the vector to put the elements to be shifted in - // at the end: - // col0[7] = AVG3(above[-1], left[0], left[1]) - // col0[6] = AVG3(left[0], left[1], left[2]) - // ... - // col0[0] = AVG3(left[6], left[7], left[8]) - col0 = vrev64_u8(vrhadd_u8(vhadd_u8(azl0, l1), l0)); - - // We don't care about the first parameter to this uzp since we only ever use - // the high three elements, we just use col0 again since it is already - // available: - // col0_even = [ x, x, x, x, x, col0[3], col0[5], col0[7] ] - // col0_odd = [ x, x, x, x, x, col0[2], col0[4], col0[6] ] - col0_even = vuzp_u8(col0, col0).val[1]; - col0_odd = vuzp_u8(col0, col0).val[0]; - - // Incrementally shift more elements from col0 into d0/1: - // stride=0 [ d0[0], d0[1], d0[2], d0[3], d0[4], d0[5], d0[6], d0[7] ] - // stride=1 [ d1[0], d1[1], d1[2], d1[3], d1[4], d1[5], d1[6], d1[7] ] - // stride=2 [ col0[7], d0[0], d0[1], d0[2], d0[3], d0[4], d0[5], d0[6] ] - // stride=3 [ col0[6], d1[0], d1[1], d1[2], d1[3], d1[4], d1[5], d1[6] ] - // stride=4 [ col0[5], col0[7], d0[0], d0[1], d0[2], d0[3], d0[4], d0[5] ] - // stride=5 [ col0[4], col0[6], d1[0], d1[1], d1[2], d1[3], d1[4], d1[5] ] - // stride=6 [ col0[3], col0[5], col0[7], d0[0], d0[1], d0[2], d0[3], d0[4] ] - // stride=7 [ col0[2], col0[4], col0[6], d1[0], d1[1], d1[2], d1[3], d1[4] ] - vst1_u8(dst + 0 * stride, d0); - vst1_u8(dst + 1 * stride, d1); - vst1_u8(dst + 2 * stride, vext_u8(col0_even, d0, 7)); - vst1_u8(dst + 3 * stride, vext_u8(col0_odd, d1, 7)); - vst1_u8(dst + 4 * stride, vext_u8(col0_even, d0, 6)); - vst1_u8(dst + 5 * stride, vext_u8(col0_odd, d1, 6)); - vst1_u8(dst + 6 * stride, vext_u8(col0_even, d0, 5)); - vst1_u8(dst + 7 * stride, vext_u8(col0_odd, d1, 5)); -} - -void vpx_d117_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - // See vpx_d117_predictor_8x8_neon for more details on the implementation. - uint8x16_t az, a0, l0az, d0, d1, l0, l1, azl0, col0, col0_even, col0_odd; - - az = vld1q_u8(above - 1); - a0 = vld1q_u8(above + 0); - // [ left[0], above[-1], ... , above[13] ] - l0az = vextq_u8(vld1q_dup_u8(left), az, 15); - - l0 = vld1q_u8(left + 0); - l1 = vld1q_u8(left + 1); - // [ above[-1], left[0], ... , left[14] ] - azl0 = vextq_u8(vld1q_dup_u8(above - 1), l0, 15); - - d0 = vrhaddq_u8(az, a0); - d1 = vrhaddq_u8(vhaddq_u8(l0az, a0), az); - - col0 = vrhaddq_u8(vhaddq_u8(azl0, l1), l0); - col0 = vrev64q_u8(vextq_u8(col0, col0, 8)); - - col0_even = vuzpq_u8(col0, col0).val[1]; - col0_odd = vuzpq_u8(col0, col0).val[0]; - - vst1q_u8(dst + 0 * stride, d0); - vst1q_u8(dst + 1 * stride, d1); - vst1q_u8(dst + 2 * stride, vextq_u8(col0_even, d0, 15)); - vst1q_u8(dst + 3 * stride, vextq_u8(col0_odd, d1, 15)); - vst1q_u8(dst + 4 * stride, vextq_u8(col0_even, d0, 14)); - vst1q_u8(dst + 5 * stride, vextq_u8(col0_odd, d1, 14)); - vst1q_u8(dst + 6 * stride, vextq_u8(col0_even, d0, 13)); - vst1q_u8(dst + 7 * stride, vextq_u8(col0_odd, d1, 13)); - vst1q_u8(dst + 8 * stride, vextq_u8(col0_even, d0, 12)); - vst1q_u8(dst + 9 * stride, vextq_u8(col0_odd, d1, 12)); - vst1q_u8(dst + 10 * stride, vextq_u8(col0_even, d0, 11)); - vst1q_u8(dst + 11 * stride, vextq_u8(col0_odd, d1, 11)); - vst1q_u8(dst + 12 * stride, vextq_u8(col0_even, d0, 10)); - vst1q_u8(dst + 13 * stride, vextq_u8(col0_odd, d1, 10)); - vst1q_u8(dst + 14 * stride, vextq_u8(col0_even, d0, 9)); - vst1q_u8(dst + 15 * stride, vextq_u8(col0_odd, d1, 9)); -} - -void vpx_d117_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - // See vpx_d117_predictor_8x8_neon for more details on the implementation. - uint8x16_t az, a0, a14, a15, a16, l0az, d0_lo, d0_hi, d1_lo, d1_hi, l0, l1, - l15, l16, l17, azl0, col0_lo, col0_hi, col0_even, col0_odd; - - az = vld1q_u8(above - 1); - a0 = vld1q_u8(above + 0); - a14 = vld1q_u8(above + 14); - a15 = vld1q_u8(above + 15); - a16 = vld1q_u8(above + 16); - // [ left[0], above[-1], ... , above[13] ] - l0az = vextq_u8(vld1q_dup_u8(left), az, 15); - - l0 = vld1q_u8(left + 0); - l1 = vld1q_u8(left + 1); - l15 = vld1q_u8(left + 15); - l16 = vld1q_u8(left + 16); - l17 = vld1q_u8(left + 17); - // [ above[-1], left[0], ... , left[14] ] - azl0 = vextq_u8(vld1q_dup_u8(above - 1), l0, 15); - - d0_lo = vrhaddq_u8(az, a0); - d0_hi = vrhaddq_u8(a15, a16); - d1_lo = vrhaddq_u8(vhaddq_u8(l0az, a0), az); - d1_hi = vrhaddq_u8(vhaddq_u8(a14, a16), a15); - - col0_lo = vrhaddq_u8(vhaddq_u8(azl0, l1), l0); - col0_hi = vrhaddq_u8(vhaddq_u8(l15, l17), l16); - - col0_lo = vrev64q_u8(vextq_u8(col0_lo, col0_lo, 8)); - col0_hi = vrev64q_u8(vextq_u8(col0_hi, col0_hi, 8)); - - col0_even = vuzpq_u8(col0_hi, col0_lo).val[1]; - col0_odd = vuzpq_u8(col0_hi, col0_lo).val[0]; - - vst1q_u8(dst + 0 * stride + 0, d0_lo); - vst1q_u8(dst + 0 * stride + 16, d0_hi); - vst1q_u8(dst + 1 * stride + 0, d1_lo); - vst1q_u8(dst + 1 * stride + 16, d1_hi); - vst1q_u8(dst + 2 * stride + 0, vextq_u8(col0_even, d0_lo, 15)); - vst1q_u8(dst + 2 * stride + 16, vextq_u8(d0_lo, d0_hi, 15)); - vst1q_u8(dst + 3 * stride + 0, vextq_u8(col0_odd, d1_lo, 15)); - vst1q_u8(dst + 3 * stride + 16, vextq_u8(d1_lo, d1_hi, 15)); - vst1q_u8(dst + 4 * stride + 0, vextq_u8(col0_even, d0_lo, 14)); - vst1q_u8(dst + 4 * stride + 16, vextq_u8(d0_lo, d0_hi, 14)); - vst1q_u8(dst + 5 * stride + 0, vextq_u8(col0_odd, d1_lo, 14)); - vst1q_u8(dst + 5 * stride + 16, vextq_u8(d1_lo, d1_hi, 14)); - vst1q_u8(dst + 6 * stride + 0, vextq_u8(col0_even, d0_lo, 13)); - vst1q_u8(dst + 6 * stride + 16, vextq_u8(d0_lo, d0_hi, 13)); - vst1q_u8(dst + 7 * stride + 0, vextq_u8(col0_odd, d1_lo, 13)); - vst1q_u8(dst + 7 * stride + 16, vextq_u8(d1_lo, d1_hi, 13)); - vst1q_u8(dst + 8 * stride + 0, vextq_u8(col0_even, d0_lo, 12)); - vst1q_u8(dst + 8 * stride + 16, vextq_u8(d0_lo, d0_hi, 12)); - vst1q_u8(dst + 9 * stride + 0, vextq_u8(col0_odd, d1_lo, 12)); - vst1q_u8(dst + 9 * stride + 16, vextq_u8(d1_lo, d1_hi, 12)); - vst1q_u8(dst + 10 * stride + 0, vextq_u8(col0_even, d0_lo, 11)); - vst1q_u8(dst + 10 * stride + 16, vextq_u8(d0_lo, d0_hi, 11)); - vst1q_u8(dst + 11 * stride + 0, vextq_u8(col0_odd, d1_lo, 11)); - vst1q_u8(dst + 11 * stride + 16, vextq_u8(d1_lo, d1_hi, 11)); - vst1q_u8(dst + 12 * stride + 0, vextq_u8(col0_even, d0_lo, 10)); - vst1q_u8(dst + 12 * stride + 16, vextq_u8(d0_lo, d0_hi, 10)); - vst1q_u8(dst + 13 * stride + 0, vextq_u8(col0_odd, d1_lo, 10)); - vst1q_u8(dst + 13 * stride + 16, vextq_u8(d1_lo, d1_hi, 10)); - vst1q_u8(dst + 14 * stride + 0, vextq_u8(col0_even, d0_lo, 9)); - vst1q_u8(dst + 14 * stride + 16, vextq_u8(d0_lo, d0_hi, 9)); - vst1q_u8(dst + 15 * stride + 0, vextq_u8(col0_odd, d1_lo, 9)); - vst1q_u8(dst + 15 * stride + 16, vextq_u8(d1_lo, d1_hi, 9)); - vst1q_u8(dst + 16 * stride + 0, vextq_u8(col0_even, d0_lo, 8)); - vst1q_u8(dst + 16 * stride + 16, vextq_u8(d0_lo, d0_hi, 8)); - vst1q_u8(dst + 17 * stride + 0, vextq_u8(col0_odd, d1_lo, 8)); - vst1q_u8(dst + 17 * stride + 16, vextq_u8(d1_lo, d1_hi, 8)); - vst1q_u8(dst + 18 * stride + 0, vextq_u8(col0_even, d0_lo, 7)); - vst1q_u8(dst + 18 * stride + 16, vextq_u8(d0_lo, d0_hi, 7)); - vst1q_u8(dst + 19 * stride + 0, vextq_u8(col0_odd, d1_lo, 7)); - vst1q_u8(dst + 19 * stride + 16, vextq_u8(d1_lo, d1_hi, 7)); - vst1q_u8(dst + 20 * stride + 0, vextq_u8(col0_even, d0_lo, 6)); - vst1q_u8(dst + 20 * stride + 16, vextq_u8(d0_lo, d0_hi, 6)); - vst1q_u8(dst + 21 * stride + 0, vextq_u8(col0_odd, d1_lo, 6)); - vst1q_u8(dst + 21 * stride + 16, vextq_u8(d1_lo, d1_hi, 6)); - vst1q_u8(dst + 22 * stride + 0, vextq_u8(col0_even, d0_lo, 5)); - vst1q_u8(dst + 22 * stride + 16, vextq_u8(d0_lo, d0_hi, 5)); - vst1q_u8(dst + 23 * stride + 0, vextq_u8(col0_odd, d1_lo, 5)); - vst1q_u8(dst + 23 * stride + 16, vextq_u8(d1_lo, d1_hi, 5)); - vst1q_u8(dst + 24 * stride + 0, vextq_u8(col0_even, d0_lo, 4)); - vst1q_u8(dst + 24 * stride + 16, vextq_u8(d0_lo, d0_hi, 4)); - vst1q_u8(dst + 25 * stride + 0, vextq_u8(col0_odd, d1_lo, 4)); - vst1q_u8(dst + 25 * stride + 16, vextq_u8(d1_lo, d1_hi, 4)); - vst1q_u8(dst + 26 * stride + 0, vextq_u8(col0_even, d0_lo, 3)); - vst1q_u8(dst + 26 * stride + 16, vextq_u8(d0_lo, d0_hi, 3)); - vst1q_u8(dst + 27 * stride + 0, vextq_u8(col0_odd, d1_lo, 3)); - vst1q_u8(dst + 27 * stride + 16, vextq_u8(d1_lo, d1_hi, 3)); - vst1q_u8(dst + 28 * stride + 0, vextq_u8(col0_even, d0_lo, 2)); - vst1q_u8(dst + 28 * stride + 16, vextq_u8(d0_lo, d0_hi, 2)); - vst1q_u8(dst + 29 * stride + 0, vextq_u8(col0_odd, d1_lo, 2)); - vst1q_u8(dst + 29 * stride + 16, vextq_u8(d1_lo, d1_hi, 2)); - vst1q_u8(dst + 30 * stride + 0, vextq_u8(col0_even, d0_lo, 1)); - vst1q_u8(dst + 30 * stride + 16, vextq_u8(d0_lo, d0_hi, 1)); - vst1q_u8(dst + 31 * stride + 0, vextq_u8(col0_odd, d1_lo, 1)); - vst1q_u8(dst + 31 * stride + 16, vextq_u8(d1_lo, d1_hi, 1)); -} - -// ----------------------------------------------------------------------------- - void vpx_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const uint8x8_t XA0123 = vld1_u8(above - 1); diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 3bcfa7a541..c50ab93c5a 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -57,7 +57,6 @@ () add_proto qw/void vpx_he_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; add_proto qw/void vpx_d117_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_d117_predictor_4x4 neon/; add_proto qw/void vpx_d135_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d135_predictor_4x4 neon/; @@ -102,7 +101,6 @@ () specialize qw/vpx_h_predictor_8x8 neon dspr2 msa sse2/; add_proto qw/void vpx_d117_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_d117_predictor_8x8 neon/; add_proto qw/void vpx_d135_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d135_predictor_8x8 neon/; @@ -143,7 +141,6 @@ () specialize qw/vpx_h_predictor_16x16 neon dspr2 msa sse2 vsx/; add_proto qw/void vpx_d117_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_d117_predictor_16x16 neon/; add_proto qw/void vpx_d135_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d135_predictor_16x16 neon/; @@ -182,7 +179,6 @@ () specialize qw/vpx_h_predictor_32x32 neon msa sse2 vsx/; add_proto qw/void vpx_d117_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_d117_predictor_32x32 neon/; add_proto qw/void vpx_d135_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d135_predictor_32x32 neon/; From f5dfa780ce087af40b39a05b45c4798ad70b48c8 Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 3 Mar 2023 20:56:29 +0000 Subject: [PATCH 0645/1000] disable vpx_get4x4sse_cs_neon This function causes a heap overflow in the tests: [ RUN ] NEON/VpxSseTest.RefSse/0 ================================================================= ==876922==ERROR: AddressSanitizer: heap-buffer-overflow on address 0xffff8949d903 at pc 0x000000dd95d4 bp 0xfffffdd7f260 sp 0xfffffdd7f258 READ of size 8 at 0xffff8949d903 thread T0 #0 0xdd95d0 in vpx_get4x4sse_cs_neon vpx_dsp/arm/variance_neon.c:556:10 #1 0x9d4894 in (anonymous namespace)::MainTestClass::RefTestSse() test/variance_test.cc:531:5 #2 0x9d4894 in (anonymous namespace)::VpxSseTest_RefSse_Test::TestBody() test/variance_test.cc:772:30 ... 0xffff8949d903 is located 3 bytes to the right of 16-byte region [0xffff8949d8f0,0xffff8949d900) allocated by thread T0 here: #0 0x5fd050 in operator new[](unsigned long) (test_libvpx+0x5fd050) #1 0x9d3e04 in (anonymous namespace)::MainTestClass::SetUp() test/variance_test.cc:299:12 Bug: webm:1794 Change-Id: I4bc681eb9a436743ef8bfe2a2abae59ce754309c --- test/variance_test.cc | 3 +++ vpx_dsp/arm/variance_neon.c | 6 ++++++ vpx_dsp/vpx_dsp_rtcd_defs.pl | 4 +++- 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/test/variance_test.cc b/test/variance_test.cc index 1359bc4baf..237d595bb7 100644 --- a/test/variance_test.cc +++ b/test/variance_test.cc @@ -1446,9 +1446,12 @@ INSTANTIATE_TEST_SUITE_P( #endif // HAVE_AVX2 #if HAVE_NEON +// TODO(https://crbug.com/webm/1794): enable this after heap overflow is fixed. +#if 0 INSTANTIATE_TEST_SUITE_P(NEON, VpxSseTest, ::testing::Values(SseParams(2, 2, &vpx_get4x4sse_cs_neon))); +#endif INSTANTIATE_TEST_SUITE_P(NEON, VpxMseTest, ::testing::Values(MseParams(4, 4, &vpx_mse16x16_neon), diff --git a/vpx_dsp/arm/variance_neon.c b/vpx_dsp/arm/variance_neon.c index feff980c93..76c2a15863 100644 --- a/vpx_dsp/arm/variance_neon.c +++ b/vpx_dsp/arm/variance_neon.c @@ -433,6 +433,8 @@ static INLINE unsigned int vpx_mse16xh_neon(const unsigned char *src_ptr, return *sse; } +// TODO(https://crbug.com/webm/1794): enable this after heap overflow is fixed. +#if 0 unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride) { @@ -466,6 +468,7 @@ unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int src_stride, return vget_lane_u32(sse, 0); } +#endif // 0 #else // !defined(__ARM_FEATURE_DOTPROD) @@ -532,6 +535,8 @@ static INLINE unsigned int vpx_mse16xh_neon(const unsigned char *src_ptr, return *sse; } +// TODO(https://crbug.com/webm/1794): enable this after heap overflow is fixed. +#if 0 unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride) { @@ -572,6 +577,7 @@ unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int src_stride, return horizontal_add_uint32x4(vreinterpretq_u32_s32(sse)); } +#endif // 0 #endif // defined(__ARM_FEATURE_DOTPROD) diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index c50ab93c5a..2301fbe328 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -1152,8 +1152,10 @@ () add_proto qw/unsigned int vpx_get_mb_ss/, "const int16_t *"; specialize qw/vpx_get_mb_ss sse2 msa vsx/; + # TODO(https://crbug.com/webm/1794): enable neon after heap overflow is + # fixed. add_proto qw/unsigned int vpx_get4x4sse_cs/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride"; - specialize qw/vpx_get4x4sse_cs neon msa vsx/; + specialize qw/vpx_get4x4sse_cs msa vsx/; add_proto qw/void vpx_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride"; specialize qw/vpx_comp_avg_pred neon sse2 vsx lsx/; From 5fae248f2a8af49bc82590ec1d397c8535859b0e Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 3 Mar 2023 15:33:16 -0800 Subject: [PATCH 0646/1000] disable vp8_sixtap_predict16x16_neon This causes various buffer overflows in the tests: [ RUN ] NEON/SixtapPredictTest.TestWithPresetData/0 ================================================================= ==22346==ERROR: AddressSanitizer: global-buffer-overflow on address 0x0000012b4a5b at pc 0x000000df0f60 bp 0xffffcf6e64b0 sp 0xffffcf6e64a8 READ of size 8 at 0x0000012b4a5b thread T0 #0 0xdf0f5c in vp8_sixtap_predict16x16_neon vp8/common/arm/neon/sixtappredict_neon.c:1507:13 #1 0x8819e4 in (anonymous namespace)::SixtapPredictTest_TestWithPresetData_Test::TestBody() test/predict_test.cc:293:3 ... 0x0000012b4a5b is located 2 bytes to the right of global variable 'kTestData' defined in '../test/predict_test.cc:237:24' (0x12b48a0) of size 441 [ RUN ] NEON/SixtapPredictTest.TestWithRandomData/0 ================================================================= ==22338==ERROR: AddressSanitizer: heap-buffer-overflow on address 0xffff8b5321fb at pc 0x000000df0f60 bp 0xfffff7e0cf30 sp 0xfffff7e0cf28 READ of size 8 at 0xffff8b5321fb thread T0 #0 0xdf0f5c in vp8_sixtap_predict16x16_neon vp8/common/arm/neon/sixtappredict_neon.c:1507:13 #1 0x87d4c0 in (anonymous namespace)::PredictTestBase::TestWithRandomData(void (*)(unsigned char*, int, int, int, unsigned char*, int)) test/predict_test.cc:170:9 ... 0xffff8b5321fb is located 2 bytes to the right of 441-byte region [0xffff8b532040,0xffff8b5321f9) allocated by thread T0 here: #0 0x5fd4f0 in operator new[](unsigned long) (test_libvpx+0x5fd4f0) #1 0x87c2e0 in (anonymous namespace)::PredictTestBase::SetUp() test/predict_test.cc:47:12 #2 0x87d074 in non-virtual thunk to (anonymous namespace)::PredictTestBase::SetUp() test/predict_test.cc ... Bug: webm:1795 Change-Id: I32213a381eef91547d00f88acf90f1cf2ec2ea75 --- test/predict_test.cc | 4 +++- vp8/common/arm/neon/sixtappredict_neon.c | 4 ++++ vp8/common/rtcd_defs.pl | 4 +++- 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/test/predict_test.cc b/test/predict_test.cc index 7472970576..e49d98272e 100644 --- a/test/predict_test.cc +++ b/test/predict_test.cc @@ -307,7 +307,9 @@ INSTANTIATE_TEST_SUITE_P( #if HAVE_NEON INSTANTIATE_TEST_SUITE_P( NEON, SixtapPredictTest, - ::testing::Values(make_tuple(16, 16, &vp8_sixtap_predict16x16_neon), + ::testing::Values(/*TODO(https://crbug.com/webm/1795): enable this after + buffer overflows are fixed. + make_tuple(16, 16, &vp8_sixtap_predict16x16_neon),*/ make_tuple(8, 8, &vp8_sixtap_predict8x8_neon), make_tuple(8, 4, &vp8_sixtap_predict8x4_neon), make_tuple(4, 4, &vp8_sixtap_predict4x4_neon))); diff --git a/vp8/common/arm/neon/sixtappredict_neon.c b/vp8/common/arm/neon/sixtappredict_neon.c index 48e86d3278..4960d16516 100644 --- a/vp8/common/arm/neon/sixtappredict_neon.c +++ b/vp8/common/arm/neon/sixtappredict_neon.c @@ -1253,6 +1253,9 @@ void vp8_sixtap_predict8x8_neon(unsigned char *src_ptr, int src_pixels_per_line, return; } +// TODO(https://crbug.com/webm/1795): enable this after buffer overflows are +// fixed. +#if 0 void vp8_sixtap_predict16x16_neon(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, @@ -1728,3 +1731,4 @@ void vp8_sixtap_predict16x16_neon(unsigned char *src_ptr, } return; } +#endif // 0 diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl index 739a612847..05e67ce11b 100644 --- a/vp8/common/rtcd_defs.pl +++ b/vp8/common/rtcd_defs.pl @@ -146,7 +146,9 @@ () # Subpixel # add_proto qw/void vp8_sixtap_predict16x16/, "unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch"; -specialize qw/vp8_sixtap_predict16x16 sse2 ssse3 neon dspr2 msa mmi lsx/; +# TODO(https://crbug.com/webm/1795): enable neon after buffer overflows are +# fixed. +specialize qw/vp8_sixtap_predict16x16 sse2 ssse3 dspr2 msa mmi lsx/; add_proto qw/void vp8_sixtap_predict8x8/, "unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch"; specialize qw/vp8_sixtap_predict8x8 sse2 ssse3 neon dspr2 msa mmi lsx/; From 0384a2aab79a8b004257ba32d7824e8defb12c55 Mon Sep 17 00:00:00 2001 From: Johann Date: Sat, 5 Nov 2022 09:53:07 +0900 Subject: [PATCH 0647/1000] reland: quantize: simplify 32x32_b args Allocate mb_plane_ on the heap to ensure src is aligned. Now that all the implementations of the 32x32 quantize are in intrinsics we can reference struct members directly. Saves pushing them to the stack. n_coeffs is not used at all for this function. Change-Id: Ib551f7f583977602504d962b72063bc6eda9dda9 --- test/vp9_quantize_test.cc | 287 +++++++++++++++++++++-------------- vp9/encoder/vp9_block.h | 1 + vp9/encoder/vp9_encodemb.c | 6 +- vpx_dsp/arm/quantize_neon.c | 17 +-- vpx_dsp/quantize.c | 17 ++- vpx_dsp/vpx_dsp_rtcd_defs.pl | 5 +- vpx_dsp/x86/quantize_avx.c | 30 +--- vpx_dsp/x86/quantize_avx2.c | 15 +- vpx_dsp/x86/quantize_sse2.h | 28 ++++ vpx_dsp/x86/quantize_ssse3.c | 35 +---- 10 files changed, 243 insertions(+), 198 deletions(-) diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc index 587cec6923..6a8f1dafb1 100644 --- a/test/vp9_quantize_test.cc +++ b/test/vp9_quantize_test.cc @@ -26,6 +26,7 @@ #include "test/util.h" #include "vp9/common/vp9_entropy.h" #include "vp9/common/vp9_scan.h" +#include "vp9/encoder/vp9_block.h" #include "vpx/vpx_codec.h" #include "vpx/vpx_integer.h" #include "vpx_ports/msvc.h" @@ -38,8 +39,7 @@ namespace { const int number_of_iterations = 100; typedef void (*QuantizeFunc)(const tran_low_t *coeff, intptr_t count, - const int16_t *zbin, const int16_t *round, - const int16_t *quant, const int16_t *quant_shift, + const macroblock_plane *const mb_plane, tran_low_t *qcoeff, tran_low_t *dqcoeff, const int16_t *dequant, uint16_t *eob, const int16_t *scan, const int16_t *iscan); @@ -47,6 +47,41 @@ typedef std::tuple QuantizeParam; +// Wrapper which takes a macroblock_plane. +typedef void (*QuantizeBaseFunc)(const tran_low_t *coeff, intptr_t count, + const int16_t *zbin, const int16_t *round, + const int16_t *quant, + const int16_t *quant_shift, tran_low_t *qcoeff, + tran_low_t *dqcoeff, const int16_t *dequant, + uint16_t *eob, const int16_t *scan, + const int16_t *iscan); + +template +void QuantWrapper(const tran_low_t *coeff, intptr_t count, + const macroblock_plane *const mb_plane, tran_low_t *qcoeff, + tran_low_t *dqcoeff, const int16_t *dequant, uint16_t *eob, + const int16_t *scan, const int16_t *iscan) { + fn(coeff, count, mb_plane->zbin, mb_plane->round, mb_plane->quant, + mb_plane->quant_shift, qcoeff, dqcoeff, dequant, eob, scan, iscan); +} + +// Wrapper for 32x32 version which does not use count +typedef void (*Quantize32x32Func)(const tran_low_t *coeff, + const macroblock_plane *const mb_plane, + tran_low_t *qcoeff, tran_low_t *dqcoeff, + const int16_t *dequant, uint16_t *eob, + const int16_t *scan, const int16_t *iscan); + +template +void Quant32x32Wrapper(const tran_low_t *coeff, intptr_t count, + const macroblock_plane *const mb_plane, + tran_low_t *qcoeff, tran_low_t *dqcoeff, + const int16_t *dequant, uint16_t *eob, + const int16_t *scan, const int16_t *iscan) { + (void)count; + fn(coeff, mb_plane, qcoeff, dqcoeff, dequant, eob, scan, iscan); +} + // Wrapper for FP version which does not use zbin or quant_shift. typedef void (*QuantizeFPFunc)(const tran_low_t *coeff, intptr_t count, const int16_t *round, const int16_t *quant, @@ -56,15 +91,11 @@ typedef void (*QuantizeFPFunc)(const tran_low_t *coeff, intptr_t count, template void QuantFPWrapper(const tran_low_t *coeff, intptr_t count, - const int16_t *zbin, const int16_t *round, - const int16_t *quant, const int16_t *quant_shift, - tran_low_t *qcoeff, tran_low_t *dqcoeff, - const int16_t *dequant, uint16_t *eob, const int16_t *scan, - const int16_t *iscan) { - (void)zbin; - (void)quant_shift; - - fn(coeff, count, round, quant, qcoeff, dqcoeff, dequant, eob, scan, iscan); + const macroblock_plane *const mb_plane, tran_low_t *qcoeff, + tran_low_t *dqcoeff, const int16_t *dequant, uint16_t *eob, + const int16_t *scan, const int16_t *iscan) { + fn(coeff, count, mb_plane->round_fp, mb_plane->quant_fp, qcoeff, dqcoeff, + dequant, eob, scan, iscan); } void GenerateHelperArrays(ACMRandom *rnd, int16_t *zbin, int16_t *round, @@ -119,17 +150,21 @@ class VP9QuantizeBase : public AbstractBench { #else max_value_ = (1 << bit_depth_) - 1; #endif - zbin_ptr_ = + + mb_plane_ = reinterpret_cast( + vpx_memalign(16, sizeof(macroblock_plane))); + + zbin_ptr_ = mb_plane_->zbin = reinterpret_cast(vpx_memalign(16, 8 * sizeof(*zbin_ptr_))); - round_fp_ptr_ = reinterpret_cast( + round_fp_ptr_ = mb_plane_->round_fp = reinterpret_cast( vpx_memalign(16, 8 * sizeof(*round_fp_ptr_))); - quant_fp_ptr_ = reinterpret_cast( + quant_fp_ptr_ = mb_plane_->quant_fp = reinterpret_cast( vpx_memalign(16, 8 * sizeof(*quant_fp_ptr_))); - round_ptr_ = + round_ptr_ = mb_plane_->round = reinterpret_cast(vpx_memalign(16, 8 * sizeof(*round_ptr_))); - quant_ptr_ = + quant_ptr_ = mb_plane_->quant = reinterpret_cast(vpx_memalign(16, 8 * sizeof(*quant_ptr_))); - quant_shift_ptr_ = reinterpret_cast( + quant_shift_ptr_ = mb_plane_->quant_shift = reinterpret_cast( vpx_memalign(16, 8 * sizeof(*quant_shift_ptr_))); dequant_ptr_ = reinterpret_cast( vpx_memalign(16, 8 * sizeof(*dequant_ptr_))); @@ -139,6 +174,7 @@ class VP9QuantizeBase : public AbstractBench { } ~VP9QuantizeBase() { + vpx_free(mb_plane_); vpx_free(zbin_ptr_); vpx_free(round_fp_ptr_); vpx_free(quant_fp_ptr_); @@ -146,6 +182,7 @@ class VP9QuantizeBase : public AbstractBench { vpx_free(quant_ptr_); vpx_free(quant_shift_ptr_); vpx_free(dequant_ptr_); + mb_plane_ = nullptr; zbin_ptr_ = nullptr; round_fp_ptr_ = nullptr; quant_fp_ptr_ = nullptr; @@ -157,9 +194,10 @@ class VP9QuantizeBase : public AbstractBench { } protected: + macroblock_plane *mb_plane_; int16_t *zbin_ptr_; - int16_t *round_fp_ptr_; int16_t *quant_fp_ptr_; + int16_t *round_fp_ptr_; int16_t *round_ptr_; int16_t *quant_ptr_; int16_t *quant_shift_ptr_; @@ -193,8 +231,7 @@ class VP9QuantizeTest : public VP9QuantizeBase, }; void VP9QuantizeTest::Run() { - quantize_op_(coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_, - quant_shift_ptr_, qcoeff_.TopLeftPixel(), + quantize_op_(coeff_.TopLeftPixel(), count_, mb_plane_, qcoeff_.TopLeftPixel(), dqcoeff_.TopLeftPixel(), dequant_ptr_, &eob_, scan_->scan, scan_->iscan); } @@ -266,8 +303,8 @@ void VP9QuantizeTest::Speed(bool is_median) { vpx_usec_timer_start(&timer); for (int n = 0; n < kNumTests; ++n) { - ref_quantize_op_(coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, - q_ptr_, quant_shift_ptr_, ref_qcoeff.TopLeftPixel(), + ref_quantize_op_(coeff_.TopLeftPixel(), count_, mb_plane_, + ref_qcoeff.TopLeftPixel(), ref_dqcoeff.TopLeftPixel(), dequant_ptr_, &ref_eob, scan_->scan, scan_->iscan); } @@ -275,10 +312,9 @@ void VP9QuantizeTest::Speed(bool is_median) { vpx_usec_timer_start(&simd_timer); for (int n = 0; n < kNumTests; ++n) { - quantize_op_(coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_, - quant_shift_ptr_, qcoeff_.TopLeftPixel(), - dqcoeff_.TopLeftPixel(), dequant_ptr_, &eob_, - scan_->scan, scan_->iscan); + quantize_op_(coeff_.TopLeftPixel(), count_, mb_plane_, + qcoeff_.TopLeftPixel(), dqcoeff_.TopLeftPixel(), + dequant_ptr_, &eob_, scan_->scan, scan_->iscan); } vpx_usec_timer_mark(&simd_timer); @@ -417,15 +453,14 @@ TEST_P(VP9QuantizeTest, OperationCheck) { GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_, quant_shift_ptr_, dequant_ptr_, round_fp_ptr_, quant_fp_ptr_); - ref_quantize_op_(coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_, - quant_shift_ptr_, ref_qcoeff.TopLeftPixel(), - ref_dqcoeff.TopLeftPixel(), dequant_ptr_, &ref_eob, - scan_->scan, scan_->iscan); + ref_quantize_op_(coeff_.TopLeftPixel(), count_, mb_plane_, + ref_qcoeff.TopLeftPixel(), ref_dqcoeff.TopLeftPixel(), + dequant_ptr_, &ref_eob, scan_->scan, scan_->iscan); - ASM_REGISTER_STATE_CHECK(quantize_op_( - coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_, - quant_shift_ptr_, qcoeff_.TopLeftPixel(), dqcoeff_.TopLeftPixel(), - dequant_ptr_, &eob_, scan_->scan, scan_->iscan)); + ASM_REGISTER_STATE_CHECK(quantize_op_(coeff_.TopLeftPixel(), count_, + mb_plane_, qcoeff_.TopLeftPixel(), + dqcoeff_.TopLeftPixel(), dequant_ptr_, + &eob_, scan_->scan, scan_->iscan)); EXPECT_TRUE(qcoeff_.CheckValues(ref_qcoeff)); EXPECT_TRUE(dqcoeff_.CheckValues(ref_dqcoeff)); @@ -475,15 +510,14 @@ TEST_P(VP9QuantizeTest, EOBCheck) { GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_, quant_shift_ptr_, dequant_ptr_, round_fp_ptr_, quant_fp_ptr_); - ref_quantize_op_(coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_, - quant_shift_ptr_, ref_qcoeff.TopLeftPixel(), - ref_dqcoeff.TopLeftPixel(), dequant_ptr_, &ref_eob, - scan_->scan, scan_->iscan); + ref_quantize_op_(coeff_.TopLeftPixel(), count_, mb_plane_, + ref_qcoeff.TopLeftPixel(), ref_dqcoeff.TopLeftPixel(), + dequant_ptr_, &ref_eob, scan_->scan, scan_->iscan); - ASM_REGISTER_STATE_CHECK(quantize_op_( - coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_, - quant_shift_ptr_, qcoeff_.TopLeftPixel(), dqcoeff_.TopLeftPixel(), - dequant_ptr_, &eob_, scan_->scan, scan_->iscan)); + ASM_REGISTER_STATE_CHECK(quantize_op_(coeff_.TopLeftPixel(), count_, + mb_plane_, qcoeff_.TopLeftPixel(), + dqcoeff_.TopLeftPixel(), dequant_ptr_, + &eob_, scan_->scan, scan_->iscan)); EXPECT_TRUE(qcoeff_.CheckValues(ref_qcoeff)); EXPECT_TRUE(dqcoeff_.CheckValues(ref_dqcoeff)); @@ -510,28 +544,35 @@ using std::make_tuple; INSTANTIATE_TEST_SUITE_P( SSE2, VP9QuantizeTest, ::testing::Values( - make_tuple(&vpx_quantize_b_sse2, &vpx_quantize_b_c, VPX_BITS_8, 16, - false), + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_8, 16, false), make_tuple(&QuantFPWrapper, &QuantFPWrapper, VPX_BITS_8, 16, true), - make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c, - VPX_BITS_8, 16, false), - make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c, - VPX_BITS_10, 16, false), - make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c, - VPX_BITS_12, 16, false), - make_tuple(&vpx_highbd_quantize_b_32x32_sse2, - &vpx_highbd_quantize_b_32x32_c, VPX_BITS_8, 32, false), - make_tuple(&vpx_highbd_quantize_b_32x32_sse2, - &vpx_highbd_quantize_b_32x32_c, VPX_BITS_10, 32, false), - make_tuple(&vpx_highbd_quantize_b_32x32_sse2, - &vpx_highbd_quantize_b_32x32_c, VPX_BITS_12, 32, false))); + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_8, 16, + false), + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_10, 16, + false), + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_12, 16, + false), + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_8, 32, + false), + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_10, + 32, false), + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_12, + 32, false))); #else INSTANTIATE_TEST_SUITE_P( SSE2, VP9QuantizeTest, - ::testing::Values(make_tuple(&vpx_quantize_b_sse2, &vpx_quantize_b_c, - VPX_BITS_8, 16, false), + ::testing::Values(make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_8, + 16, false), make_tuple(&QuantFPWrapper, &QuantFPWrapper, VPX_BITS_8, 16, true))); @@ -541,11 +582,12 @@ INSTANTIATE_TEST_SUITE_P( #if HAVE_SSSE3 INSTANTIATE_TEST_SUITE_P( SSSE3, VP9QuantizeTest, - ::testing::Values(make_tuple(&vpx_quantize_b_ssse3, &vpx_quantize_b_c, - VPX_BITS_8, 16, false), - make_tuple(&vpx_quantize_b_32x32_ssse3, - &vpx_quantize_b_32x32_c, VPX_BITS_8, 32, - false), + ::testing::Values(make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_8, + 16, false), + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, + VPX_BITS_8, 32, false), make_tuple(&QuantFPWrapper, &QuantFPWrapper, VPX_BITS_8, 16, true), @@ -555,13 +597,14 @@ INSTANTIATE_TEST_SUITE_P( #endif // HAVE_SSSE3 #if HAVE_AVX -INSTANTIATE_TEST_SUITE_P(AVX, VP9QuantizeTest, - ::testing::Values(make_tuple(&vpx_quantize_b_avx, - &vpx_quantize_b_c, - VPX_BITS_8, 16, false), - make_tuple(&vpx_quantize_b_32x32_avx, - &vpx_quantize_b_32x32_c, - VPX_BITS_8, 32, false))); +INSTANTIATE_TEST_SUITE_P( + AVX, VP9QuantizeTest, + ::testing::Values(make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_8, + 16, false), + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, + VPX_BITS_8, 32, false))); #endif // HAVE_AVX #if VPX_ARCH_X86_64 && HAVE_AVX2 @@ -577,22 +620,29 @@ INSTANTIATE_TEST_SUITE_P( make_tuple(&QuantFPWrapper, &QuantFPWrapper, VPX_BITS_12, 32, true), - make_tuple(&vpx_quantize_b_avx2, &vpx_quantize_b_c, VPX_BITS_8, 16, + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_8, 16, false), + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_8, 16, + false), + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_10, 16, + false), + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_12, 16, + false), + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, VPX_BITS_8, 32, false), - make_tuple(&vpx_highbd_quantize_b_avx2, &vpx_highbd_quantize_b_c, - VPX_BITS_8, 16, false), - make_tuple(&vpx_highbd_quantize_b_avx2, &vpx_highbd_quantize_b_c, - VPX_BITS_10, 16, false), - make_tuple(&vpx_highbd_quantize_b_avx2, &vpx_highbd_quantize_b_c, - VPX_BITS_12, 16, false), - make_tuple(&vpx_quantize_b_32x32_avx2, &vpx_quantize_b_32x32_c, - VPX_BITS_8, 32, false), - make_tuple(&vpx_highbd_quantize_b_32x32_avx2, - &vpx_highbd_quantize_b_32x32_c, VPX_BITS_8, 32, false), - make_tuple(&vpx_highbd_quantize_b_32x32_avx2, - &vpx_highbd_quantize_b_32x32_c, VPX_BITS_10, 32, false), - make_tuple(&vpx_highbd_quantize_b_32x32_avx2, - &vpx_highbd_quantize_b_32x32_c, VPX_BITS_12, 32, false))); + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_8, 32, + false), + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_10, + 32, false), + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_12, + 32, false))); #else INSTANTIATE_TEST_SUITE_P( AVX2, VP9QuantizeTest, @@ -602,11 +652,12 @@ INSTANTIATE_TEST_SUITE_P( make_tuple(&QuantFPWrapper, &QuantFPWrapper, VPX_BITS_8, 32, true), - make_tuple(&vpx_quantize_b_avx2, &vpx_quantize_b_c, - VPX_BITS_8, 16, false), - make_tuple(&vpx_quantize_b_32x32_avx2, - &vpx_quantize_b_32x32_c, VPX_BITS_8, 32, - false))); + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_8, + 16, false), + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, + VPX_BITS_8, 32, false))); #endif // CONFIG_VP9_HIGHBITDEPTH #endif // HAVE_AVX2 @@ -615,22 +666,29 @@ INSTANTIATE_TEST_SUITE_P( INSTANTIATE_TEST_SUITE_P( NEON, VP9QuantizeTest, ::testing::Values( - make_tuple(&vpx_quantize_b_neon, &vpx_quantize_b_c, VPX_BITS_8, 16, + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_8, 16, false), + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_8, 16, + false), + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_10, 16, false), - make_tuple(&vpx_highbd_quantize_b_neon, &vpx_highbd_quantize_b_c, - VPX_BITS_8, 16, false), - make_tuple(&vpx_highbd_quantize_b_neon, &vpx_highbd_quantize_b_c, - VPX_BITS_10, 16, false), - make_tuple(&vpx_highbd_quantize_b_neon, &vpx_highbd_quantize_b_c, - VPX_BITS_12, 16, false), - make_tuple(&vpx_quantize_b_32x32_neon, &vpx_quantize_b_32x32_c, - VPX_BITS_8, 32, false), - make_tuple(&vpx_highbd_quantize_b_32x32_neon, - &vpx_highbd_quantize_b_32x32_c, VPX_BITS_8, 32, false), - make_tuple(&vpx_highbd_quantize_b_32x32_neon, - &vpx_highbd_quantize_b_32x32_c, VPX_BITS_10, 32, false), - make_tuple(&vpx_highbd_quantize_b_32x32_neon, - &vpx_highbd_quantize_b_32x32_c, VPX_BITS_12, 32, false), + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_12, 16, + false), + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, VPX_BITS_8, 32, + false), + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_8, 32, + false), + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_10, + 32, false), + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_12, + 32, false), make_tuple(&QuantFPWrapper, &QuantFPWrapper, VPX_BITS_8, 16, true), make_tuple(&QuantFPWrapper, @@ -639,11 +697,12 @@ INSTANTIATE_TEST_SUITE_P( #else INSTANTIATE_TEST_SUITE_P( NEON, VP9QuantizeTest, - ::testing::Values(make_tuple(&vpx_quantize_b_neon, &vpx_quantize_b_c, - VPX_BITS_8, 16, false), - make_tuple(&vpx_quantize_b_32x32_neon, - &vpx_quantize_b_32x32_c, VPX_BITS_8, 32, - false), + ::testing::Values(make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_8, + 16, false), + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, + VPX_BITS_8, 32, false), make_tuple(&QuantFPWrapper, &QuantFPWrapper, VPX_BITS_8, 16, true), @@ -683,9 +742,11 @@ INSTANTIATE_TEST_SUITE_P(LSX, VP9QuantizeTest, INSTANTIATE_TEST_SUITE_P( DISABLED_C, VP9QuantizeTest, ::testing::Values( - make_tuple(&vpx_quantize_b_c, &vpx_quantize_b_c, VPX_BITS_8, 16, false), - make_tuple(&vpx_quantize_b_32x32_c, &vpx_quantize_b_32x32_c, VPX_BITS_8, - 32, false), + make_tuple(&QuantWrapper, + &QuantWrapper, VPX_BITS_8, 16, false), + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, VPX_BITS_8, 32, + false), make_tuple(&QuantFPWrapper, &QuantFPWrapper, VPX_BITS_8, 16, true), make_tuple(&QuantFPWrapper, diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h index 3e2c9a3c35..da01c346d9 100644 --- a/vp9/encoder/vp9_block.h +++ b/vp9/encoder/vp9_block.h @@ -13,6 +13,7 @@ #include "vpx_util/vpx_thread.h" +#include "vp9/common/vp9_blockd.h" #include "vp9/common/vp9_entropymv.h" #include "vp9/common/vp9_entropy.h" diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c index fa222f9dcf..4910dc20f5 100644 --- a/vp9/encoder/vp9_encodemb.c +++ b/vp9/encoder/vp9_encodemb.c @@ -542,8 +542,7 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col, switch (tx_size) { case TX_32X32: fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); - vpx_quantize_b_32x32(coeff, 1024, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, + vpx_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); break; case TX_16X16: @@ -948,8 +947,7 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, vpx_subtract_block(32, 32, src_diff, diff_stride, src, src_stride, dst, dst_stride); fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); - vpx_quantize_b_32x32(coeff, 1024, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, + vpx_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); } if (args->enable_coeff_opt && !x->skip_recode) { diff --git a/vpx_dsp/arm/quantize_neon.c b/vpx_dsp/arm/quantize_neon.c index 9c227d560f..e81738a7bb 100644 --- a/vpx_dsp/arm/quantize_neon.c +++ b/vpx_dsp/arm/quantize_neon.c @@ -14,6 +14,7 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/arm/mem_neon.h" +#include "vp9/encoder/vp9_block.h" static INLINE void calculate_dqcoeff_and_store(const int16x8_t qcoeff, const int16x8_t dequant, @@ -213,11 +214,8 @@ quantize_b_32x32_neon(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, // Main difference is that zbin values are halved before comparison and dqcoeff // values are divided by 2. zbin is rounded but dqcoeff is not. -void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, - const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, +void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { @@ -226,10 +224,10 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int i; // Only the first element of each vector is DC. - int16x8_t zbin = vrshrq_n_s16(vld1q_s16(zbin_ptr), 1); - int16x8_t round = vrshrq_n_s16(vld1q_s16(round_ptr), 1); - int16x8_t quant = vld1q_s16(quant_ptr); - int16x8_t quant_shift = vld1q_s16(quant_shift_ptr); + int16x8_t zbin = vrshrq_n_s16(vld1q_s16(mb_plane->zbin), 1); + int16x8_t round = vrshrq_n_s16(vld1q_s16(mb_plane->round), 1); + int16x8_t quant = vld1q_s16(mb_plane->quant); + int16x8_t quant_shift = vld1q_s16(mb_plane->quant_shift); int16x8_t dequant = vld1q_s16(dequant_ptr); // Process first 8 values which include a dc component. @@ -289,6 +287,5 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, #endif // __aarch64__ // Need these here, else the compiler complains about mixing declarations and // code in C90 - (void)n_coeffs; (void)scan; } diff --git a/vpx_dsp/quantize.c b/vpx_dsp/quantize.c index 5d6ba64a8a..212db45c88 100644 --- a/vpx_dsp/quantize.c +++ b/vpx_dsp/quantize.c @@ -14,6 +14,7 @@ #include "vpx_dsp/quantize.h" #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_mem/vpx_mem.h" +#include "vp9/encoder/vp9_block.h" void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, const int16_t *round_ptr, const int16_t quant, @@ -208,19 +209,21 @@ void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, } #endif -void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, +void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { - const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], 1), - ROUND_POWER_OF_TWO(zbin_ptr[1], 1) }; + const int n_coeffs = 32 * 32; + const int zbins[2] = { ROUND_POWER_OF_TWO(mb_plane->zbin[0], 1), + ROUND_POWER_OF_TWO(mb_plane->zbin[1], 1) }; const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; + const int16_t *round_ptr = mb_plane->round; + const int16_t *quant_ptr = mb_plane->quant; + const int16_t *quant_shift_ptr = mb_plane->quant_shift; int idx = 0; - int idx_arr[1024]; + int idx_arr[32 * 32 /* n_coeffs */]; int i, eob = -1; (void)iscan; diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 2301fbe328..deaf4afe8c 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -17,6 +17,9 @@ () #include "vpx/vpx_integer.h" #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_dsp/vpx_filter.h" +#if CONFIG_VP9_ENCODER + struct macroblock_plane; +#endif EOF } @@ -717,7 +720,7 @@ () add_proto qw/void vpx_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vpx_quantize_b neon sse2 ssse3 avx avx2 vsx lsx/; - add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, const struct macroblock_plane * const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vpx_quantize_b_32x32 neon ssse3 avx avx2 vsx lsx/; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { diff --git a/vpx_dsp/x86/quantize_avx.c b/vpx_dsp/x86/quantize_avx.c index 7d83527216..d52f6c6644 100644 --- a/vpx_dsp/x86/quantize_avx.c +++ b/vpx_dsp/x86/quantize_avx.c @@ -140,15 +140,12 @@ void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, *eob_ptr = accumulate_eob(eob); } -void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, +void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { const __m128i zero = _mm_setzero_si128(); - const __m128i one = _mm_set1_epi16(1); const __m256i big_zero = _mm256_setzero_si256(); int index; @@ -160,26 +157,9 @@ void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, __m128i eob = zero, eob0; (void)scan; - (void)n_coeffs; - - // Setup global values. - // The 32x32 halves zbin and round. - zbin = _mm_load_si128((const __m128i *)zbin_ptr); - // Shift with rounding. - zbin = _mm_add_epi16(zbin, one); - zbin = _mm_srli_epi16(zbin, 1); - // x86 has no "greater *or equal*" comparison. Subtract 1 from zbin so - // it is a strict "greater" comparison. - zbin = _mm_sub_epi16(zbin, one); - - round = _mm_load_si128((const __m128i *)round_ptr); - round = _mm_add_epi16(round, one); - round = _mm_srli_epi16(round, 1); - - quant = _mm_load_si128((const __m128i *)quant_ptr); - dequant = _mm_load_si128((const __m128i *)dequant_ptr); - shift = _mm_load_si128((const __m128i *)quant_shift_ptr); - shift = _mm_slli_epi16(shift, 1); + + load_b_values32x32(mb_plane, &zbin, &round, &quant, dequant_ptr, &dequant, + &shift); // Do DC and first 15 AC. coeff0 = load_tran_low(coeff_ptr); diff --git a/vpx_dsp/x86/quantize_avx2.c b/vpx_dsp/x86/quantize_avx2.c index 28f7c9c7da..a8412c5b8e 100644 --- a/vpx_dsp/x86/quantize_avx2.c +++ b/vpx_dsp/x86/quantize_avx2.c @@ -13,6 +13,7 @@ #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" +#include "vp9/encoder/vp9_block.h" static VPX_FORCE_INLINE void load_b_values_avx2( const int16_t *zbin_ptr, __m256i *zbin, const int16_t *round_ptr, @@ -250,23 +251,19 @@ static VPX_FORCE_INLINE __m256i quantize_b_32x32_16( } } -void vpx_quantize_b_32x32_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, - const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, +void vpx_quantize_b_32x32_avx2(const tran_low_t *coeff_ptr, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { __m256i v_zbin, v_round, v_quant, v_dequant, v_quant_shift; __m256i v_eobmax = _mm256_setzero_si256(); intptr_t count; - (void)n_coeffs; (void)scan; - load_b_values_avx2(zbin_ptr, &v_zbin, round_ptr, &v_round, quant_ptr, - &v_quant, dequant_ptr, &v_dequant, quant_shift_ptr, - &v_quant_shift, 1); + load_b_values_avx2(mb_plane->zbin, &v_zbin, mb_plane->round, &v_round, + mb_plane->quant, &v_quant, dequant_ptr, &v_dequant, + mb_plane->quant_shift, &v_quant_shift, 1); // Do DC and first 15 AC. v_eobmax = quantize_b_32x32_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, iscan, diff --git a/vpx_dsp/x86/quantize_sse2.h b/vpx_dsp/x86/quantize_sse2.h index 27bfb4e41b..fe42fee018 100644 --- a/vpx_dsp/x86/quantize_sse2.h +++ b/vpx_dsp/x86/quantize_sse2.h @@ -15,6 +15,7 @@ #include "./vpx_config.h" #include "vpx/vpx_integer.h" +#include "vp9/encoder/vp9_block.h" static INLINE void load_b_values(const int16_t *zbin_ptr, __m128i *zbin, const int16_t *round_ptr, __m128i *round, @@ -29,6 +30,33 @@ static INLINE void load_b_values(const int16_t *zbin_ptr, __m128i *zbin, *shift = _mm_load_si128((const __m128i *)shift_ptr); } +static INLINE void load_b_values32x32( + const struct macroblock_plane *const mb_plane, __m128i *zbin, + __m128i *round, __m128i *quant, const int16_t *dequant_ptr, + __m128i *dequant, __m128i *shift) { + const __m128i one = _mm_set1_epi16(1); + // The 32x32 halves zbin and round. + *zbin = _mm_load_si128((const __m128i *)mb_plane->zbin); + // Shift with rounding. + *zbin = _mm_add_epi16(*zbin, one); + *zbin = _mm_srli_epi16(*zbin, 1); + // x86 has no "greater *or equal*" comparison. Subtract 1 from zbin so + // it is a strict "greater" comparison. + *zbin = _mm_sub_epi16(*zbin, one); + + *round = _mm_load_si128((const __m128i *)mb_plane->round); + *round = _mm_add_epi16(*round, one); + *round = _mm_srli_epi16(*round, 1); + + *quant = _mm_load_si128((const __m128i *)mb_plane->quant); + *dequant = _mm_load_si128((const __m128i *)dequant_ptr); + *shift = _mm_load_si128((const __m128i *)mb_plane->quant_shift); + // I suspect this is not technically OK because quant_shift can be up + // to 1 << 16 and shifting up again will outrange that, but the test is not + // comprehensive enough to catch that and "it's been that way forever" + *shift = _mm_slli_epi16(*shift, 1); +} + static INLINE void load_fp_values(const int16_t *round_ptr, __m128i *round, const int16_t *quant_ptr, __m128i *quant, const int16_t *dequant_ptr, diff --git a/vpx_dsp/x86/quantize_ssse3.c b/vpx_dsp/x86/quantize_ssse3.c index 476230286d..6fe54d7d98 100644 --- a/vpx_dsp/x86/quantize_ssse3.c +++ b/vpx_dsp/x86/quantize_ssse3.c @@ -16,6 +16,7 @@ #include "vpx_dsp/x86/bitdepth_conversion_sse2.h" #include "vpx_dsp/x86/quantize_sse2.h" #include "vpx_dsp/x86/quantize_ssse3.h" +#include "vp9/encoder/vp9_block.h" void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, @@ -107,16 +108,12 @@ void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, *eob_ptr = accumulate_eob(eob); } -void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, - const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, +void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { const __m128i zero = _mm_setzero_si128(); - const __m128i one = _mm_set1_epi16(1); int index; __m128i zbin, round, quant, dequant, shift; @@ -127,29 +124,9 @@ void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, __m128i eob = zero, eob0; (void)scan; - (void)n_coeffs; - - // Setup global values. - // The 32x32 halves zbin and round. - zbin = _mm_load_si128((const __m128i *)zbin_ptr); - // Shift with rounding. - zbin = _mm_add_epi16(zbin, one); - zbin = _mm_srli_epi16(zbin, 1); - // x86 has no "greater *or equal*" comparison. Subtract 1 from zbin so - // it is a strict "greater" comparison. - zbin = _mm_sub_epi16(zbin, one); - - round = _mm_load_si128((const __m128i *)round_ptr); - round = _mm_add_epi16(round, one); - round = _mm_srli_epi16(round, 1); - - quant = _mm_load_si128((const __m128i *)quant_ptr); - dequant = _mm_load_si128((const __m128i *)dequant_ptr); - shift = _mm_load_si128((const __m128i *)quant_shift_ptr); - // I suspect this is not technically OK because quant_shift can be up - // to 1 << 16 and shifting up again will outrange that, but the test is not - // comprehensive enough to catch that and "it's been that way forever" - shift = _mm_slli_epi16(shift, 1); + + load_b_values32x32(mb_plane, &zbin, &round, &quant, dequant_ptr, &dequant, + &shift); // Do DC and first 15 AC. coeff0 = load_tran_low(coeff_ptr); From 62827575462ecdb7790b60f5da302b6395cef798 Mon Sep 17 00:00:00 2001 From: George Steed Date: Wed, 1 Mar 2023 22:44:38 +0000 Subject: [PATCH 0648/1000] Implement highbd_d63_predictor using Neon Add Neon implementations of the highbd d63 predictor for 4x4, 8x8, 16x16 and 32x32 block sizes. Also update tests to add new corresponding cases. This re-lands commit 7cdf139e3d6237386e0f93bdb0bdc1b459c663bf, previously reverted in 7478b7e4e481562a4a13f233acb66a60462e1934. Compared to the previous implementation attempt we now correctly match the behaviour of the C code when handling the final element loaded from the 'above' input array. In particular: - The C code for a 4x4 block performs a full average of the last element rather than duplicating the final element from the input 'above' array. - The C code for other block sizes performs a full average for the stride=0 and stride=1, and otherwise shifts in duplicates of the final element from the input 'above' array. Notably this shifting for later strides _replaces_ the final element which we previously performed an average on (see {d0,d1}_ext in the code). It is worth noting that this difference is not caught by the existing VP9HighbdIntraPredTest test cases since the test vector initialisation contains this loop: for (int x = block_size; x < 2 * block_size; x++) { above_row_[x] = above_row_[block_size - 1]; } Since AVG2(a, a) and AVG3(a, a, a) are simply 'a', such differences in behaviour for the final element are not observed. Tested on AArch64 with: - ./test_libvpx --gtest_filter="*VP9HighbdIntraPredTest*" - ./test_libvpx --gtest_filter="*VP9/TestVectorTest.MD5Match*" - ./test_libvpx --gtest_filter="*VP9/ExternalFrameBufferMD5Test*" Speedups over the C code (higher is better): Microarch. | Compiler | Block | Speedup Neoverse N1 | LLVM 15 | 4x4 | 2.43 Neoverse N1 | LLVM 15 | 8x8 | 3.92 Neoverse N1 | LLVM 15 | 16x16 | 3.19 Neoverse N1 | LLVM 15 | 32x32 | 4.13 Neoverse N1 | GCC 12 | 4x4 | 2.92 Neoverse N1 | GCC 12 | 8x8 | 6.51 Neoverse N1 | GCC 12 | 16x16 | 4.55 Neoverse N1 | GCC 12 | 32x32 | 3.18 Neoverse V1 | LLVM 15 | 4x4 | 1.99 Neoverse V1 | LLVM 15 | 8x8 | 3.65 Neoverse V1 | LLVM 15 | 16x16 | 3.72 Neoverse V1 | LLVM 15 | 32x32 | 3.26 Neoverse V1 | GCC 12 | 4x4 | 2.39 Neoverse V1 | GCC 12 | 8x8 | 4.76 Neoverse V1 | GCC 12 | 16x16 | 3.24 Neoverse V1 | GCC 12 | 32x32 | 2.44 Change-Id: Iefaa774d6a20388b523eaa7f5df6bc5f5cf249e4 --- test/test_intra_pred_speed.cc | 12 +- test/vp9_intrapred_test.cc | 24 ++ vpx_dsp/arm/highbd_intrapred_neon.c | 326 ++++++++++++++++++++++++++++ vpx_dsp/vpx_dsp_rtcd_defs.pl | 8 +- 4 files changed, 362 insertions(+), 8 deletions(-) diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc index 24af471eaa..e721a459ad 100644 --- a/test/test_intra_pred_speed.cc +++ b/test/test_intra_pred_speed.cc @@ -566,14 +566,16 @@ HIGHBD_INTRA_PRED_TEST( vpx_highbd_dc_128_predictor_4x4_neon, vpx_highbd_v_predictor_4x4_neon, vpx_highbd_h_predictor_4x4_neon, vpx_highbd_d45_predictor_4x4_neon, vpx_highbd_d135_predictor_4x4_neon, vpx_highbd_d117_predictor_4x4_neon, - nullptr, nullptr, nullptr, vpx_highbd_tm_predictor_4x4_neon) + nullptr, nullptr, vpx_highbd_d63_predictor_4x4_neon, + vpx_highbd_tm_predictor_4x4_neon) HIGHBD_INTRA_PRED_TEST( NEON, TestHighbdIntraPred8, vpx_highbd_dc_predictor_8x8_neon, vpx_highbd_dc_left_predictor_8x8_neon, vpx_highbd_dc_top_predictor_8x8_neon, vpx_highbd_dc_128_predictor_8x8_neon, vpx_highbd_v_predictor_8x8_neon, vpx_highbd_h_predictor_8x8_neon, vpx_highbd_d45_predictor_8x8_neon, vpx_highbd_d135_predictor_8x8_neon, vpx_highbd_d117_predictor_8x8_neon, - nullptr, nullptr, nullptr, vpx_highbd_tm_predictor_8x8_neon) + nullptr, nullptr, vpx_highbd_d63_predictor_8x8_neon, + vpx_highbd_tm_predictor_8x8_neon) HIGHBD_INTRA_PRED_TEST( NEON, TestHighbdIntraPred16, vpx_highbd_dc_predictor_16x16_neon, vpx_highbd_dc_left_predictor_16x16_neon, @@ -581,7 +583,8 @@ HIGHBD_INTRA_PRED_TEST( vpx_highbd_dc_128_predictor_16x16_neon, vpx_highbd_v_predictor_16x16_neon, vpx_highbd_h_predictor_16x16_neon, vpx_highbd_d45_predictor_16x16_neon, vpx_highbd_d135_predictor_16x16_neon, vpx_highbd_d117_predictor_16x16_neon, - nullptr, nullptr, nullptr, vpx_highbd_tm_predictor_16x16_neon) + nullptr, nullptr, vpx_highbd_d63_predictor_16x16_neon, + vpx_highbd_tm_predictor_16x16_neon) HIGHBD_INTRA_PRED_TEST( NEON, TestHighbdIntraPred32, vpx_highbd_dc_predictor_32x32_neon, vpx_highbd_dc_left_predictor_32x32_neon, @@ -589,7 +592,8 @@ HIGHBD_INTRA_PRED_TEST( vpx_highbd_dc_128_predictor_32x32_neon, vpx_highbd_v_predictor_32x32_neon, vpx_highbd_h_predictor_32x32_neon, vpx_highbd_d45_predictor_32x32_neon, vpx_highbd_d135_predictor_32x32_neon, vpx_highbd_d117_predictor_32x32_neon, - nullptr, nullptr, nullptr, vpx_highbd_tm_predictor_32x32_neon) + nullptr, nullptr, vpx_highbd_d63_predictor_32x32_neon, + vpx_highbd_tm_predictor_32x32_neon) #endif // HAVE_NEON #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/test/vp9_intrapred_test.cc b/test/vp9_intrapred_test.cc index 83e371df6e..c4e0e78ac5 100644 --- a/test/vp9_intrapred_test.cc +++ b/test/vp9_intrapred_test.cc @@ -848,6 +848,14 @@ INSTANTIATE_TEST_SUITE_P( &vpx_highbd_d45_predictor_16x16_c, 16, 8), HighbdIntraPredParam(&vpx_highbd_d45_predictor_32x32_neon, &vpx_highbd_d45_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_4x4_neon, + &vpx_highbd_d63_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_8x8_neon, + &vpx_highbd_d63_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_16x16_neon, + &vpx_highbd_d63_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_32x32_neon, + &vpx_highbd_d63_predictor_32x32_c, 32, 8), HighbdIntraPredParam(&vpx_highbd_d117_predictor_4x4_neon, &vpx_highbd_d117_predictor_4x4_c, 4, 8), HighbdIntraPredParam(&vpx_highbd_d117_predictor_8x8_neon, @@ -932,6 +940,14 @@ INSTANTIATE_TEST_SUITE_P( &vpx_highbd_d45_predictor_16x16_c, 16, 10), HighbdIntraPredParam(&vpx_highbd_d45_predictor_32x32_neon, &vpx_highbd_d45_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_4x4_neon, + &vpx_highbd_d63_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_8x8_neon, + &vpx_highbd_d63_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_16x16_neon, + &vpx_highbd_d63_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_32x32_neon, + &vpx_highbd_d63_predictor_32x32_c, 32, 10), HighbdIntraPredParam(&vpx_highbd_d117_predictor_4x4_neon, &vpx_highbd_d117_predictor_4x4_c, 4, 10), HighbdIntraPredParam(&vpx_highbd_d117_predictor_8x8_neon, @@ -1016,6 +1032,14 @@ INSTANTIATE_TEST_SUITE_P( &vpx_highbd_d45_predictor_16x16_c, 16, 12), HighbdIntraPredParam(&vpx_highbd_d45_predictor_32x32_neon, &vpx_highbd_d45_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_4x4_neon, + &vpx_highbd_d63_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_8x8_neon, + &vpx_highbd_d63_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_16x16_neon, + &vpx_highbd_d63_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_32x32_neon, + &vpx_highbd_d63_predictor_32x32_c, 32, 12), HighbdIntraPredParam(&vpx_highbd_d117_predictor_4x4_neon, &vpx_highbd_d117_predictor_4x4_c, 4, 10), HighbdIntraPredParam(&vpx_highbd_d117_predictor_8x8_neon, diff --git a/vpx_dsp/arm/highbd_intrapred_neon.c b/vpx_dsp/arm/highbd_intrapred_neon.c index dc1b27dc10..6b6ad95c12 100644 --- a/vpx_dsp/arm/highbd_intrapred_neon.c +++ b/vpx_dsp/arm/highbd_intrapred_neon.c @@ -453,6 +453,332 @@ void vpx_highbd_d45_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, // ----------------------------------------------------------------------------- +void vpx_highbd_d63_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + uint16x4_t a0, a1, a2, a3, d0, d1, d2, d3; + (void)left; + (void)bd; + + a0 = vld1_u16(above + 0); + a1 = vld1_u16(above + 1); + a2 = vld1_u16(above + 2); + a3 = vld1_u16(above + 3); + + d0 = vrhadd_u16(a0, a1); + d1 = vrhadd_u16(vhadd_u16(a0, a2), a1); + d2 = vrhadd_u16(a1, a2); + d3 = vrhadd_u16(vhadd_u16(a1, a3), a2); + + // Note that here we are performing a full avg calculation for the final + // elements rather than storing a duplicate of above[3], which differs + // (correctly) from the general scheme employed by the bs={8,16,32} + // implementations in order to match the original C implementation. + vst1_u16(dst + 0 * stride, d0); + vst1_u16(dst + 1 * stride, d1); + vst1_u16(dst + 2 * stride, d2); + vst1_u16(dst + 3 * stride, d3); +} + +void vpx_highbd_d63_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + uint16x8_t a0, a1, a2, a7, d0, d1, d0_ext, d1_ext; + (void)left; + (void)bd; + + a0 = vld1q_u16(above + 0); + a1 = vld1q_u16(above + 1); + a2 = vld1q_u16(above + 2); + a7 = vld1q_dup_u16(above + 7); + + d0 = vrhaddq_u16(a0, a1); + d1 = vrhaddq_u16(vhaddq_u16(a0, a2), a1); + + // We want to store: + // stride=0 [ d0[0], d0[1], d0[2], d0[3], d0[4], d0[5], d0[6], d0[7] ] + // stride=1 [ d1[0], d1[1], d1[2], d1[3], d1[4], d1[5], d1[6], d1[7] ] + // stride=2 [ d0[1], d0[2], d0[3], d0[4], d0[5], d0[6], a[7], a[7] ] + // stride=3 [ d1[1], d1[2], d1[3], d1[4], d1[5], d1[6], a[7], a[7] ] + // stride=4 [ d0[2], d0[3], d0[4], d0[5], d0[6], a[7], a[7], a[7] ] + // stride=5 [ d1[2], d1[3], d1[4], d1[5], d1[6], a[7], a[7], a[7] ] + // stride=6 [ d0[3], d0[4], d0[5], d0[6], a[7], a[7], a[7], a[7] ] + // stride=7 [ d1[3], d1[4], d1[5], d1[6], a[7], a[7], a[7], a[7] ] + // Note in particular that d0[7] and d1[7] are only ever referenced in the + // stride=0 and stride=1 cases respectively, and in later strides are + // replaced by a copy of above[7]. These are equivalent if for i>7, + // above[i]==above[7], however that is not always the case. + + // Strip out d0[7] and d1[7] so that we can replace it with an additional + // copy of above[7], the first vector here doesn't matter so just reuse + // d0/d1. + d0_ext = vextq_u16(d0, d0, 7); + d1_ext = vextq_u16(d1, d1, 7); + + // Shuffle in duplicates of above[7] and store. + vst1q_u16(dst + 0 * stride, d0); + vst1q_u16(dst + 1 * stride, d1); + vst1q_u16(dst + 2 * stride, vextq_u16(d0_ext, a7, 2)); + vst1q_u16(dst + 3 * stride, vextq_u16(d1_ext, a7, 2)); + vst1q_u16(dst + 4 * stride, vextq_u16(d0_ext, a7, 3)); + vst1q_u16(dst + 5 * stride, vextq_u16(d1_ext, a7, 3)); + vst1q_u16(dst + 6 * stride, vextq_u16(d0_ext, a7, 4)); + vst1q_u16(dst + 7 * stride, vextq_u16(d1_ext, a7, 4)); +} + +void vpx_highbd_d63_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + // See vpx_highbd_d63_predictor_8x8_neon for details on the implementation. + uint16x8_t a0, a1, a2, a8, a9, a10, a15, d0[2], d1[2], d0_ext, d1_ext; + (void)left; + (void)bd; + + a0 = vld1q_u16(above + 0); + a1 = vld1q_u16(above + 1); + a2 = vld1q_u16(above + 2); + a8 = vld1q_u16(above + 8); + a9 = vld1q_u16(above + 9); + a10 = vld1q_u16(above + 10); + a15 = vld1q_dup_u16(above + 15); + + d0[0] = vrhaddq_u16(a0, a1); + d0[1] = vrhaddq_u16(a8, a9); + d1[0] = vrhaddq_u16(vhaddq_u16(a0, a2), a1); + d1[1] = vrhaddq_u16(vhaddq_u16(a8, a10), a9); + + // Strip out the final element of d0/d1 so that we can replace it with an + // additional copy of above[7], the first vector here doesn't matter so just + // reuse the same vector. + d0_ext = vextq_u16(d0[1], d0[1], 7); + d1_ext = vextq_u16(d1[1], d1[1], 7); + + // Shuffle in duplicates of above[7] and store. Note that cases involving + // {d0,d1}_ext require an extra shift to undo the shifting out of the final + // element from above. + vst1q_u16(dst + 0 * stride + 0, d0[0]); + vst1q_u16(dst + 0 * stride + 8, d0[1]); + vst1q_u16(dst + 1 * stride + 0, d1[0]); + vst1q_u16(dst + 1 * stride + 8, d1[1]); + vst1q_u16(dst + 2 * stride + 0, vextq_u16(d0[0], d0[1], 1)); + vst1q_u16(dst + 2 * stride + 8, vextq_u16(d0_ext, a15, 2)); + vst1q_u16(dst + 3 * stride + 0, vextq_u16(d1[0], d1[1], 1)); + vst1q_u16(dst + 3 * stride + 8, vextq_u16(d1_ext, a15, 2)); + vst1q_u16(dst + 4 * stride + 0, vextq_u16(d0[0], d0[1], 2)); + vst1q_u16(dst + 4 * stride + 8, vextq_u16(d0_ext, a15, 3)); + vst1q_u16(dst + 5 * stride + 0, vextq_u16(d1[0], d1[1], 2)); + vst1q_u16(dst + 5 * stride + 8, vextq_u16(d1_ext, a15, 3)); + vst1q_u16(dst + 6 * stride + 0, vextq_u16(d0[0], d0[1], 3)); + vst1q_u16(dst + 6 * stride + 8, vextq_u16(d0_ext, a15, 4)); + vst1q_u16(dst + 7 * stride + 0, vextq_u16(d1[0], d1[1], 3)); + vst1q_u16(dst + 7 * stride + 8, vextq_u16(d1_ext, a15, 4)); + vst1q_u16(dst + 8 * stride + 0, vextq_u16(d0[0], d0[1], 4)); + vst1q_u16(dst + 8 * stride + 8, vextq_u16(d0_ext, a15, 5)); + vst1q_u16(dst + 9 * stride + 0, vextq_u16(d1[0], d1[1], 4)); + vst1q_u16(dst + 9 * stride + 8, vextq_u16(d1_ext, a15, 5)); + vst1q_u16(dst + 10 * stride + 0, vextq_u16(d0[0], d0[1], 5)); + vst1q_u16(dst + 10 * stride + 8, vextq_u16(d0_ext, a15, 6)); + vst1q_u16(dst + 11 * stride + 0, vextq_u16(d1[0], d1[1], 5)); + vst1q_u16(dst + 11 * stride + 8, vextq_u16(d1_ext, a15, 6)); + vst1q_u16(dst + 12 * stride + 0, vextq_u16(d0[0], d0[1], 6)); + vst1q_u16(dst + 12 * stride + 8, vextq_u16(d0_ext, a15, 7)); + vst1q_u16(dst + 13 * stride + 0, vextq_u16(d1[0], d1[1], 6)); + vst1q_u16(dst + 13 * stride + 8, vextq_u16(d1_ext, a15, 7)); + vst1q_u16(dst + 14 * stride + 0, vextq_u16(d0[0], d0[1], 7)); + vst1q_u16(dst + 14 * stride + 8, a15); + vst1q_u16(dst + 15 * stride + 0, vextq_u16(d1[0], d1[1], 7)); + vst1q_u16(dst + 15 * stride + 8, a15); +} + +void vpx_highbd_d63_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + // See vpx_highbd_d63_predictor_8x8_neon for details on the implementation. + uint16x8_t a0, a1, a2, a8, a9, a10, a16, a17, a18, a24, a25, a26, a31, d0[4], + d1[4], d0_ext, d1_ext; + (void)left; + (void)bd; + + a0 = vld1q_u16(above + 0); + a1 = vld1q_u16(above + 1); + a2 = vld1q_u16(above + 2); + a8 = vld1q_u16(above + 8); + a9 = vld1q_u16(above + 9); + a10 = vld1q_u16(above + 10); + a16 = vld1q_u16(above + 16); + a17 = vld1q_u16(above + 17); + a18 = vld1q_u16(above + 18); + a24 = vld1q_u16(above + 24); + a25 = vld1q_u16(above + 25); + a26 = vld1q_u16(above + 26); + a31 = vld1q_dup_u16(above + 31); + + d0[0] = vrhaddq_u16(a0, a1); + d0[1] = vrhaddq_u16(a8, a9); + d0[2] = vrhaddq_u16(a16, a17); + d0[3] = vrhaddq_u16(a24, a25); + d1[0] = vrhaddq_u16(vhaddq_u16(a0, a2), a1); + d1[1] = vrhaddq_u16(vhaddq_u16(a8, a10), a9); + d1[2] = vrhaddq_u16(vhaddq_u16(a16, a18), a17); + d1[3] = vrhaddq_u16(vhaddq_u16(a24, a26), a25); + + // Strip out the final element of d0/d1 so that we can replace it with an + // additional copy of above[7], the first vector here doesn't matter so just + // reuse the same vector. + d0_ext = vextq_u16(d0[3], d0[3], 7); + d1_ext = vextq_u16(d1[3], d1[3], 7); + + // Shuffle in duplicates of above[7] and store. Note that cases involving + // {d0,d1}_ext require an extra shift to undo the shifting out of the final + // element from above. + + vst1q_u16(dst + 0 * stride + 0, d0[0]); + vst1q_u16(dst + 0 * stride + 8, d0[1]); + vst1q_u16(dst + 0 * stride + 16, d0[2]); + vst1q_u16(dst + 0 * stride + 24, d0[3]); + vst1q_u16(dst + 1 * stride + 0, d1[0]); + vst1q_u16(dst + 1 * stride + 8, d1[1]); + vst1q_u16(dst + 1 * stride + 16, d1[2]); + vst1q_u16(dst + 1 * stride + 24, d1[3]); + + vst1q_u16(dst + 2 * stride + 0, vextq_u16(d0[0], d0[1], 1)); + vst1q_u16(dst + 2 * stride + 8, vextq_u16(d0[1], d0[2], 1)); + vst1q_u16(dst + 2 * stride + 16, vextq_u16(d0[2], d0[3], 1)); + vst1q_u16(dst + 2 * stride + 24, vextq_u16(d0_ext, a31, 2)); + vst1q_u16(dst + 3 * stride + 0, vextq_u16(d1[0], d1[1], 1)); + vst1q_u16(dst + 3 * stride + 8, vextq_u16(d1[1], d1[2], 1)); + vst1q_u16(dst + 3 * stride + 16, vextq_u16(d1[2], d1[3], 1)); + vst1q_u16(dst + 3 * stride + 24, vextq_u16(d1_ext, a31, 2)); + + vst1q_u16(dst + 4 * stride + 0, vextq_u16(d0[0], d0[1], 2)); + vst1q_u16(dst + 4 * stride + 8, vextq_u16(d0[1], d0[2], 2)); + vst1q_u16(dst + 4 * stride + 16, vextq_u16(d0[2], d0[3], 2)); + vst1q_u16(dst + 4 * stride + 24, vextq_u16(d0_ext, a31, 3)); + vst1q_u16(dst + 5 * stride + 0, vextq_u16(d1[0], d1[1], 2)); + vst1q_u16(dst + 5 * stride + 8, vextq_u16(d1[1], d1[2], 2)); + vst1q_u16(dst + 5 * stride + 16, vextq_u16(d1[2], d1[3], 2)); + vst1q_u16(dst + 5 * stride + 24, vextq_u16(d1_ext, a31, 3)); + + vst1q_u16(dst + 6 * stride + 0, vextq_u16(d0[0], d0[1], 3)); + vst1q_u16(dst + 6 * stride + 8, vextq_u16(d0[1], d0[2], 3)); + vst1q_u16(dst + 6 * stride + 16, vextq_u16(d0[2], d0[3], 3)); + vst1q_u16(dst + 6 * stride + 24, vextq_u16(d0_ext, a31, 4)); + vst1q_u16(dst + 7 * stride + 0, vextq_u16(d1[0], d1[1], 3)); + vst1q_u16(dst + 7 * stride + 8, vextq_u16(d1[1], d1[2], 3)); + vst1q_u16(dst + 7 * stride + 16, vextq_u16(d1[2], d1[3], 3)); + vst1q_u16(dst + 7 * stride + 24, vextq_u16(d1_ext, a31, 4)); + + vst1q_u16(dst + 8 * stride + 0, vextq_u16(d0[0], d0[1], 4)); + vst1q_u16(dst + 8 * stride + 8, vextq_u16(d0[1], d0[2], 4)); + vst1q_u16(dst + 8 * stride + 16, vextq_u16(d0[2], d0[3], 4)); + vst1q_u16(dst + 8 * stride + 24, vextq_u16(d0_ext, a31, 5)); + vst1q_u16(dst + 9 * stride + 0, vextq_u16(d1[0], d1[1], 4)); + vst1q_u16(dst + 9 * stride + 8, vextq_u16(d1[1], d1[2], 4)); + vst1q_u16(dst + 9 * stride + 16, vextq_u16(d1[2], d1[3], 4)); + vst1q_u16(dst + 9 * stride + 24, vextq_u16(d1_ext, a31, 5)); + + vst1q_u16(dst + 10 * stride + 0, vextq_u16(d0[0], d0[1], 5)); + vst1q_u16(dst + 10 * stride + 8, vextq_u16(d0[1], d0[2], 5)); + vst1q_u16(dst + 10 * stride + 16, vextq_u16(d0[2], d0[3], 5)); + vst1q_u16(dst + 10 * stride + 24, vextq_u16(d0_ext, a31, 6)); + vst1q_u16(dst + 11 * stride + 0, vextq_u16(d1[0], d1[1], 5)); + vst1q_u16(dst + 11 * stride + 8, vextq_u16(d1[1], d1[2], 5)); + vst1q_u16(dst + 11 * stride + 16, vextq_u16(d1[2], d1[3], 5)); + vst1q_u16(dst + 11 * stride + 24, vextq_u16(d1_ext, a31, 6)); + + vst1q_u16(dst + 12 * stride + 0, vextq_u16(d0[0], d0[1], 6)); + vst1q_u16(dst + 12 * stride + 8, vextq_u16(d0[1], d0[2], 6)); + vst1q_u16(dst + 12 * stride + 16, vextq_u16(d0[2], d0[3], 6)); + vst1q_u16(dst + 12 * stride + 24, vextq_u16(d0_ext, a31, 7)); + vst1q_u16(dst + 13 * stride + 0, vextq_u16(d1[0], d1[1], 6)); + vst1q_u16(dst + 13 * stride + 8, vextq_u16(d1[1], d1[2], 6)); + vst1q_u16(dst + 13 * stride + 16, vextq_u16(d1[2], d1[3], 6)); + vst1q_u16(dst + 13 * stride + 24, vextq_u16(d1_ext, a31, 7)); + + vst1q_u16(dst + 14 * stride + 0, vextq_u16(d0[0], d0[1], 7)); + vst1q_u16(dst + 14 * stride + 8, vextq_u16(d0[1], d0[2], 7)); + vst1q_u16(dst + 14 * stride + 16, vextq_u16(d0[2], d0[3], 7)); + vst1q_u16(dst + 14 * stride + 24, a31); + vst1q_u16(dst + 15 * stride + 0, vextq_u16(d1[0], d1[1], 7)); + vst1q_u16(dst + 15 * stride + 8, vextq_u16(d1[1], d1[2], 7)); + vst1q_u16(dst + 15 * stride + 16, vextq_u16(d1[2], d1[3], 7)); + vst1q_u16(dst + 15 * stride + 24, a31); + + vst1q_u16(dst + 16 * stride + 0, d0[1]); + vst1q_u16(dst + 16 * stride + 8, d0[2]); + vst1q_u16(dst + 16 * stride + 16, vextq_u16(d0_ext, a31, 1)); + vst1q_u16(dst + 16 * stride + 24, a31); + vst1q_u16(dst + 17 * stride + 0, d1[1]); + vst1q_u16(dst + 17 * stride + 8, d1[2]); + vst1q_u16(dst + 17 * stride + 16, vextq_u16(d1_ext, a31, 1)); + vst1q_u16(dst + 17 * stride + 24, a31); + + vst1q_u16(dst + 18 * stride + 0, vextq_u16(d0[1], d0[2], 1)); + vst1q_u16(dst + 18 * stride + 8, vextq_u16(d0[2], d0[3], 1)); + vst1q_u16(dst + 18 * stride + 16, vextq_u16(d0_ext, a31, 2)); + vst1q_u16(dst + 18 * stride + 24, a31); + vst1q_u16(dst + 19 * stride + 0, vextq_u16(d1[1], d1[2], 1)); + vst1q_u16(dst + 19 * stride + 8, vextq_u16(d1[2], d1[3], 1)); + vst1q_u16(dst + 19 * stride + 16, vextq_u16(d1_ext, a31, 2)); + vst1q_u16(dst + 19 * stride + 24, a31); + + vst1q_u16(dst + 20 * stride + 0, vextq_u16(d0[1], d0[2], 2)); + vst1q_u16(dst + 20 * stride + 8, vextq_u16(d0[2], d0[3], 2)); + vst1q_u16(dst + 20 * stride + 16, vextq_u16(d0_ext, a31, 3)); + vst1q_u16(dst + 20 * stride + 24, a31); + vst1q_u16(dst + 21 * stride + 0, vextq_u16(d1[1], d1[2], 2)); + vst1q_u16(dst + 21 * stride + 8, vextq_u16(d1[2], d1[3], 2)); + vst1q_u16(dst + 21 * stride + 16, vextq_u16(d1_ext, a31, 3)); + vst1q_u16(dst + 21 * stride + 24, a31); + + vst1q_u16(dst + 22 * stride + 0, vextq_u16(d0[1], d0[2], 3)); + vst1q_u16(dst + 22 * stride + 8, vextq_u16(d0[2], d0[3], 3)); + vst1q_u16(dst + 22 * stride + 16, vextq_u16(d0_ext, a31, 4)); + vst1q_u16(dst + 22 * stride + 24, a31); + vst1q_u16(dst + 23 * stride + 0, vextq_u16(d1[1], d1[2], 3)); + vst1q_u16(dst + 23 * stride + 8, vextq_u16(d1[2], d1[3], 3)); + vst1q_u16(dst + 23 * stride + 16, vextq_u16(d1_ext, a31, 4)); + vst1q_u16(dst + 23 * stride + 24, a31); + + vst1q_u16(dst + 24 * stride + 0, vextq_u16(d0[1], d0[2], 4)); + vst1q_u16(dst + 24 * stride + 8, vextq_u16(d0[2], d0[3], 4)); + vst1q_u16(dst + 24 * stride + 16, vextq_u16(d0_ext, a31, 5)); + vst1q_u16(dst + 24 * stride + 24, a31); + vst1q_u16(dst + 25 * stride + 0, vextq_u16(d1[1], d1[2], 4)); + vst1q_u16(dst + 25 * stride + 8, vextq_u16(d1[2], d1[3], 4)); + vst1q_u16(dst + 25 * stride + 16, vextq_u16(d1_ext, a31, 5)); + vst1q_u16(dst + 25 * stride + 24, a31); + + vst1q_u16(dst + 26 * stride + 0, vextq_u16(d0[1], d0[2], 5)); + vst1q_u16(dst + 26 * stride + 8, vextq_u16(d0[2], d0[3], 5)); + vst1q_u16(dst + 26 * stride + 16, vextq_u16(d0_ext, a31, 6)); + vst1q_u16(dst + 26 * stride + 24, a31); + vst1q_u16(dst + 27 * stride + 0, vextq_u16(d1[1], d1[2], 5)); + vst1q_u16(dst + 27 * stride + 8, vextq_u16(d1[2], d1[3], 5)); + vst1q_u16(dst + 27 * stride + 16, vextq_u16(d1_ext, a31, 6)); + vst1q_u16(dst + 27 * stride + 24, a31); + + vst1q_u16(dst + 28 * stride + 0, vextq_u16(d0[1], d0[2], 6)); + vst1q_u16(dst + 28 * stride + 8, vextq_u16(d0[2], d0[3], 6)); + vst1q_u16(dst + 28 * stride + 16, vextq_u16(d0_ext, a31, 7)); + vst1q_u16(dst + 28 * stride + 24, a31); + vst1q_u16(dst + 29 * stride + 0, vextq_u16(d1[1], d1[2], 6)); + vst1q_u16(dst + 29 * stride + 8, vextq_u16(d1[2], d1[3], 6)); + vst1q_u16(dst + 29 * stride + 16, vextq_u16(d1_ext, a31, 7)); + vst1q_u16(dst + 29 * stride + 24, a31); + + vst1q_u16(dst + 30 * stride + 0, vextq_u16(d0[1], d0[2], 7)); + vst1q_u16(dst + 30 * stride + 8, vextq_u16(d0[2], d0[3], 7)); + vst1q_u16(dst + 30 * stride + 16, a31); + vst1q_u16(dst + 30 * stride + 24, a31); + vst1q_u16(dst + 31 * stride + 0, vextq_u16(d1[1], d1[2], 7)); + vst1q_u16(dst + 31 * stride + 8, vextq_u16(d1[2], d1[3], 7)); + vst1q_u16(dst + 31 * stride + 16, a31); + vst1q_u16(dst + 31 * stride + 24, a31); +} + +// ----------------------------------------------------------------------------- + void vpx_highbd_d117_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 652c553f97..48552a6f8d 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -217,7 +217,7 @@ () specialize qw/vpx_highbd_d45_predictor_4x4 neon ssse3/; add_proto qw/void vpx_highbd_d63_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_d63_predictor_4x4 sse2/; + specialize qw/vpx_highbd_d63_predictor_4x4 neon sse2/; add_proto qw/void vpx_highbd_h_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_h_predictor_4x4 neon sse2/; @@ -256,7 +256,7 @@ () specialize qw/vpx_highbd_d45_predictor_8x8 neon ssse3/; add_proto qw/void vpx_highbd_d63_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_d63_predictor_8x8 ssse3/; + specialize qw/vpx_highbd_d63_predictor_8x8 neon ssse3/; add_proto qw/void vpx_highbd_h_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_h_predictor_8x8 neon sse2/; @@ -295,7 +295,7 @@ () specialize qw/vpx_highbd_d45_predictor_16x16 neon ssse3/; add_proto qw/void vpx_highbd_d63_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_d63_predictor_16x16 ssse3/; + specialize qw/vpx_highbd_d63_predictor_16x16 neon ssse3/; add_proto qw/void vpx_highbd_h_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_h_predictor_16x16 neon sse2/; @@ -334,7 +334,7 @@ () specialize qw/vpx_highbd_d45_predictor_32x32 neon ssse3/; add_proto qw/void vpx_highbd_d63_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_d63_predictor_32x32 ssse3/; + specialize qw/vpx_highbd_d63_predictor_32x32 neon ssse3/; add_proto qw/void vpx_highbd_h_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_h_predictor_32x32 neon sse2/; From 8b0a60f91c2002d2ff319f755622b60dd70e213e Mon Sep 17 00:00:00 2001 From: George Steed Date: Thu, 9 Feb 2023 16:12:59 +0000 Subject: [PATCH 0649/1000] Implement d153_predictor using Neon Add Neon implementations of the d153 predictor for 4x4, 8x8, 16x16 and 32x32 block sizes. Also update tests to add new corresponding cases. Speedups over the C code (higher is better): Microarch. | Compiler | Block | Speedup Neoverse N1 | LLVM 15 | 4x4 | 1.59 Neoverse N1 | LLVM 15 | 8x8 | 4.46 Neoverse N1 | LLVM 15 | 16x16 | 8.77 Neoverse N1 | LLVM 15 | 32x32 | 15.21 Neoverse N1 | GCC 12 | 4x4 | 1.90 Neoverse N1 | GCC 12 | 8x8 | 4.70 Neoverse N1 | GCC 12 | 16x16 | 9.55 Neoverse N1 | GCC 12 | 32x32 | 5.95 Neoverse V1 | LLVM 15 | 4x4 | 2.89 Neoverse V1 | LLVM 15 | 8x8 | 6.94 Neoverse V1 | LLVM 15 | 16x16 | 10.20 Neoverse V1 | LLVM 15 | 32x32 | 15.63 Neoverse V1 | GCC 12 | 4x4 | 4.45 Neoverse V1 | GCC 12 | 8x8 | 7.71 Neoverse V1 | GCC 12 | 16x16 | 9.08 Neoverse V1 | GCC 12 | 32x32 | 7.93 Change-Id: I910692b14917cde8a8952fab5b9c78bed7f7c6ad --- test/test_intra_pred_speed.cc | 16 +-- test/vp9_intrapred_test.cc | 8 ++ vpx_dsp/arm/intrapred_neon.c | 254 ++++++++++++++++++++++++++++++++++ vpx_dsp/vpx_dsp_rtcd_defs.pl | 8 +- 4 files changed, 274 insertions(+), 12 deletions(-) diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc index e721a459ad..871f778116 100644 --- a/test/test_intra_pred_speed.cc +++ b/test/test_intra_pred_speed.cc @@ -270,31 +270,31 @@ INTRA_PRED_TEST(NEON, TestIntraPred4, vpx_dc_predictor_4x4_neon, vpx_dc_128_predictor_4x4_neon, vpx_v_predictor_4x4_neon, vpx_h_predictor_4x4_neon, vpx_d45_predictor_4x4_neon, vpx_d135_predictor_4x4_neon, vpx_d117_predictor_4x4_neon, - nullptr, nullptr, vpx_d63_predictor_4x4_neon, - vpx_tm_predictor_4x4_neon) + vpx_d153_predictor_4x4_neon, nullptr, + vpx_d63_predictor_4x4_neon, vpx_tm_predictor_4x4_neon) INTRA_PRED_TEST(NEON, TestIntraPred8, vpx_dc_predictor_8x8_neon, vpx_dc_left_predictor_8x8_neon, vpx_dc_top_predictor_8x8_neon, vpx_dc_128_predictor_8x8_neon, vpx_v_predictor_8x8_neon, vpx_h_predictor_8x8_neon, vpx_d45_predictor_8x8_neon, vpx_d135_predictor_8x8_neon, vpx_d117_predictor_8x8_neon, - nullptr, nullptr, vpx_d63_predictor_8x8_neon, - vpx_tm_predictor_8x8_neon) + vpx_d153_predictor_8x8_neon, nullptr, + vpx_d63_predictor_8x8_neon, vpx_tm_predictor_8x8_neon) INTRA_PRED_TEST(NEON, TestIntraPred16, vpx_dc_predictor_16x16_neon, vpx_dc_left_predictor_16x16_neon, vpx_dc_top_predictor_16x16_neon, vpx_dc_128_predictor_16x16_neon, vpx_v_predictor_16x16_neon, vpx_h_predictor_16x16_neon, vpx_d45_predictor_16x16_neon, vpx_d135_predictor_16x16_neon, vpx_d117_predictor_16x16_neon, - nullptr, nullptr, vpx_d63_predictor_16x16_neon, - vpx_tm_predictor_16x16_neon) + vpx_d153_predictor_16x16_neon, nullptr, + vpx_d63_predictor_16x16_neon, vpx_tm_predictor_16x16_neon) INTRA_PRED_TEST(NEON, TestIntraPred32, vpx_dc_predictor_32x32_neon, vpx_dc_left_predictor_32x32_neon, vpx_dc_top_predictor_32x32_neon, vpx_dc_128_predictor_32x32_neon, vpx_v_predictor_32x32_neon, vpx_h_predictor_32x32_neon, vpx_d45_predictor_32x32_neon, vpx_d135_predictor_32x32_neon, vpx_d117_predictor_32x32_neon, - nullptr, nullptr, vpx_d63_predictor_32x32_neon, - vpx_tm_predictor_32x32_neon) + vpx_d153_predictor_32x32_neon, nullptr, + vpx_d63_predictor_32x32_neon, vpx_tm_predictor_32x32_neon) #endif // HAVE_NEON #if HAVE_MSA diff --git a/test/vp9_intrapred_test.cc b/test/vp9_intrapred_test.cc index c4e0e78ac5..a2ea1334d8 100644 --- a/test/vp9_intrapred_test.cc +++ b/test/vp9_intrapred_test.cc @@ -267,6 +267,14 @@ INSTANTIATE_TEST_SUITE_P( &vpx_d135_predictor_16x16_c, 16, 8), IntraPredParam(&vpx_d135_predictor_32x32_neon, &vpx_d135_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_d153_predictor_4x4_neon, &vpx_d153_predictor_4x4_c, + 4, 8), + IntraPredParam(&vpx_d153_predictor_8x8_neon, &vpx_d153_predictor_8x8_c, + 8, 8), + IntraPredParam(&vpx_d153_predictor_16x16_neon, + &vpx_d153_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_d153_predictor_32x32_neon, + &vpx_d153_predictor_32x32_c, 32, 8), IntraPredParam(&vpx_dc_128_predictor_4x4_neon, &vpx_dc_128_predictor_4x4_c, 4, 8), IntraPredParam(&vpx_dc_128_predictor_8x8_neon, diff --git a/vpx_dsp/arm/intrapred_neon.c b/vpx_dsp/arm/intrapred_neon.c index 629c7170c6..1ff4bf2955 100644 --- a/vpx_dsp/arm/intrapred_neon.c +++ b/vpx_dsp/arm/intrapred_neon.c @@ -1081,6 +1081,260 @@ void vpx_d135_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, d135_store_32x2(&dst, stride, row_0, row_1, row_2); } +// ----------------------------------------------------------------------------- + +void vpx_d153_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + // See vpx_d153_predictor_8x8_neon for more details on the implementation. + uint8x8_t az, a0, l0az, l0, l1, azl0, d0, d1, d2, d02; + + az = load_unaligned_u8_4x1(above - 1); + a0 = load_unaligned_u8_4x1(above + 0); + // [ left[0], above[-1], above[0], above[1], x, x, x, x ] + l0az = vext_u8(vld1_dup_u8(left), az, 7); + + l0 = load_unaligned_u8_4x1(left + 0); + l1 = load_unaligned_u8_4x1(left + 1); + // [ above[-1], left[0], left[1], left[2], x, x, x, x ] + azl0 = vext_u8(vld1_dup_u8(above - 1), l0, 7); + + d0 = vrhadd_u8(azl0, l0); + d1 = vrhadd_u8(vhadd_u8(l0az, a0), az); + d2 = vrhadd_u8(vhadd_u8(azl0, l1), l0); + + d02 = vrev64_u8(vzip_u8(d0, d2).val[0]); + + store_u8_4x1(dst + 0 * stride, vext_u8(d02, d1, 7)); + store_u8_4x1(dst + 1 * stride, vext_u8(d02, d1, 5)); + store_u8_4x1(dst + 2 * stride, vext_u8(d02, d1, 3)); + store_u8_4x1(dst + 3 * stride, vext_u8(d02, d1, 1)); +} + +void vpx_d153_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + uint8x8_t az, a0, l0az, l0, l1, azl0, d0, d1, d2, d02_lo, d02_hi; + + az = vld1_u8(above - 1); + a0 = vld1_u8(above + 0); + // [ left[0], above[-1], ... , above[5] ] + l0az = vext_u8(vld1_dup_u8(left), az, 7); + + l0 = vld1_u8(left); + // The last lane here is unused, reading left[8] could cause a buffer + // over-read, so just fill with a duplicate of left[0] to avoid needing to + // materialize a zero: + // [ left[1], ... , left[7], x ] + l1 = vext_u8(l0, l0, 1); + // [ above[-1], left[0], ... , left[6] ] + azl0 = vext_u8(vld1_dup_u8(above - 1), l0, 7); + + // d0[0] = AVG2(above[-1], left[0]) + // d0[1] = AVG2(left[0], left[1]) + // ... + // d0[7] = AVG2(left[6], left[7]) + d0 = vrhadd_u8(azl0, l0); + + // d1[0] = AVG3(left[0], above[-1], above[0]) + // d1[1] = AVG3(above[-1], above[0], above[1]) + // ... + // d1[7] = AVG3(above[5], above[6], above[7]) + d1 = vrhadd_u8(vhadd_u8(l0az, a0), az); + + // d2[0] = AVG3(above[-1], left[0], left[1]) + // d2[1] = AVG3(left[0], left[1], left[2]) + // ... + // d2[6] = AVG3(left[5], left[6], left[7]) + // d2[7] = x (don't care) + d2 = vrhadd_u8(vhadd_u8(azl0, l1), l0); + + // The ext instruction shifts elements in from the end of the vector rather + // than the start, so reverse the vectors to put the elements to be shifted + // in at the end. The lowest lane of d02_lo is unused. + d02_lo = vzip_u8(vrev64_u8(d2), vrev64_u8(d0)).val[0]; + d02_hi = vzip_u8(vrev64_u8(d2), vrev64_u8(d0)).val[1]; + + // Incrementally shift more elements from d0/d2 reversed into d1: + // stride=0 [ d0[0], d1[0], d1[1], d1[2], d1[3], d1[4], d1[5], d1[6] ] + // stride=1 [ d0[1], d2[0], d0[0], d1[0], d1[1], d1[2], d1[3], d1[4] ] + // stride=2 [ d0[2], d2[1], d0[1], d2[0], d0[0], d1[0], d1[1], d1[2] ] + // stride=3 [ d0[3], d2[2], d0[2], d2[1], d0[1], d2[0], d0[0], d1[0] ] + // stride=4 [ d0[4], d2[3], d0[3], d2[2], d0[2], d2[1], d0[1], d2[0] ] + // stride=5 [ d0[5], d2[4], d0[4], d2[3], d0[3], d2[2], d0[2], d2[1] ] + // stride=6 [ d0[6], d2[5], d0[5], d2[4], d0[4], d2[3], d0[3], d2[2] ] + // stride=7 [ d0[7], d2[6], d0[6], d2[5], d0[5], d2[4], d0[4], d2[3] ] + vst1_u8(dst + 0 * stride, vext_u8(d02_hi, d1, 7)); + vst1_u8(dst + 1 * stride, vext_u8(d02_hi, d1, 5)); + vst1_u8(dst + 2 * stride, vext_u8(d02_hi, d1, 3)); + vst1_u8(dst + 3 * stride, vext_u8(d02_hi, d1, 1)); + vst1_u8(dst + 4 * stride, vext_u8(d02_lo, d02_hi, 7)); + vst1_u8(dst + 5 * stride, vext_u8(d02_lo, d02_hi, 5)); + vst1_u8(dst + 6 * stride, vext_u8(d02_lo, d02_hi, 3)); + vst1_u8(dst + 7 * stride, vext_u8(d02_lo, d02_hi, 1)); +} + +void vpx_d153_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + // See vpx_d153_predictor_8x8_neon for more details on the implementation. + uint8x16_t az, a0, l0az, l0, l1, azl0, d0, d1, d2, d02_lo, d02_hi; + + az = vld1q_u8(above - 1); + a0 = vld1q_u8(above + 0); + // [ left[0], above[-1], ... , above[13] ] + l0az = vextq_u8(vld1q_dup_u8(left), az, 15); + + l0 = vld1q_u8(left + 0); + // The last lane here is unused, reading left[16] could cause a buffer + // over-read, so just fill with a duplicate of left[0] to avoid needing to + // materialize a zero: + // [ left[1], ... , left[15], x ] + l1 = vextq_u8(l0, l0, 1); + // [ above[-1], left[0], ... , left[14] ] + azl0 = vextq_u8(vld1q_dup_u8(above - 1), l0, 15); + + d0 = vrhaddq_u8(azl0, l0); + d1 = vrhaddq_u8(vhaddq_u8(l0az, a0), az); + d2 = vrhaddq_u8(vhaddq_u8(azl0, l1), l0); + + d0 = vrev64q_u8(vextq_u8(d0, d0, 8)); + d2 = vrev64q_u8(vextq_u8(d2, d2, 8)); + + // The lowest lane of d02_lo is unused. + d02_lo = vzipq_u8(d2, d0).val[0]; + d02_hi = vzipq_u8(d2, d0).val[1]; + + vst1q_u8(dst + 0 * stride, vextq_u8(d02_hi, d1, 15)); + vst1q_u8(dst + 1 * stride, vextq_u8(d02_hi, d1, 13)); + vst1q_u8(dst + 2 * stride, vextq_u8(d02_hi, d1, 11)); + vst1q_u8(dst + 3 * stride, vextq_u8(d02_hi, d1, 9)); + vst1q_u8(dst + 4 * stride, vextq_u8(d02_hi, d1, 7)); + vst1q_u8(dst + 5 * stride, vextq_u8(d02_hi, d1, 5)); + vst1q_u8(dst + 6 * stride, vextq_u8(d02_hi, d1, 3)); + vst1q_u8(dst + 7 * stride, vextq_u8(d02_hi, d1, 1)); + vst1q_u8(dst + 8 * stride, vextq_u8(d02_lo, d02_hi, 15)); + vst1q_u8(dst + 9 * stride, vextq_u8(d02_lo, d02_hi, 13)); + vst1q_u8(dst + 10 * stride, vextq_u8(d02_lo, d02_hi, 11)); + vst1q_u8(dst + 11 * stride, vextq_u8(d02_lo, d02_hi, 9)); + vst1q_u8(dst + 12 * stride, vextq_u8(d02_lo, d02_hi, 7)); + vst1q_u8(dst + 13 * stride, vextq_u8(d02_lo, d02_hi, 5)); + vst1q_u8(dst + 14 * stride, vextq_u8(d02_lo, d02_hi, 3)); + vst1q_u8(dst + 15 * stride, vextq_u8(d02_lo, d02_hi, 1)); +} + +void vpx_d153_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + // See vpx_d153_predictor_8x8_neon for more details on the implementation. + uint8x16_t az, a0, a14, a15, a16, l0az, l0, l1, l15, l16, l17, azl0, d0_lo, + d0_hi, d1_lo, d1_hi, d2_lo, d2_hi; + uint8x16x2_t d02_hi, d02_lo; + + az = vld1q_u8(above - 1); + a0 = vld1q_u8(above + 0); + a14 = vld1q_u8(above + 14); + a15 = vld1q_u8(above + 15); + a16 = vld1q_u8(above + 16); + // [ left[0], above[-1], ... , above[13] ] + l0az = vextq_u8(vld1q_dup_u8(left), az, 15); + + l0 = vld1q_u8(left); + l1 = vld1q_u8(left + 1); + l15 = vld1q_u8(left + 15); + l16 = vld1q_u8(left + 16); + // The last lane here is unused, reading left[32] would cause a buffer + // over-read (observed as an address-sanitizer failure), so just fill with a + // duplicate of left[16] to avoid needing to materialize a zero: + // [ left[17], ... , left[31], x ] + l17 = vextq_u8(l16, l16, 1); + // [ above[-1], left[0], ... , left[14] ] + azl0 = vextq_u8(vld1q_dup_u8(above - 1), l0, 15); + + d0_lo = vrhaddq_u8(azl0, l0); + d0_hi = vrhaddq_u8(l15, l16); + + d1_lo = vrhaddq_u8(vhaddq_u8(l0az, a0), az); + d1_hi = vrhaddq_u8(vhaddq_u8(a14, a16), a15); + + // The highest lane of d2_hi is unused. + d2_lo = vrhaddq_u8(vhaddq_u8(azl0, l1), l0); + d2_hi = vrhaddq_u8(vhaddq_u8(l15, l17), l16); + + d0_lo = vrev64q_u8(vextq_u8(d0_lo, d0_lo, 8)); + d0_hi = vrev64q_u8(vextq_u8(d0_hi, d0_hi, 8)); + + d2_lo = vrev64q_u8(vextq_u8(d2_lo, d2_lo, 8)); + d2_hi = vrev64q_u8(vextq_u8(d2_hi, d2_hi, 8)); + + // d02_hi.val[0][0] is unused here. + d02_hi = vzipq_u8(d2_hi, d0_hi); + d02_lo = vzipq_u8(d2_lo, d0_lo); + + vst1q_u8(dst + 0 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 15)); + vst1q_u8(dst + 0 * stride + 16, vextq_u8(d1_lo, d1_hi, 15)); + vst1q_u8(dst + 1 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 13)); + vst1q_u8(dst + 1 * stride + 16, vextq_u8(d1_lo, d1_hi, 13)); + vst1q_u8(dst + 2 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 11)); + vst1q_u8(dst + 2 * stride + 16, vextq_u8(d1_lo, d1_hi, 11)); + vst1q_u8(dst + 3 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 9)); + vst1q_u8(dst + 3 * stride + 16, vextq_u8(d1_lo, d1_hi, 9)); + vst1q_u8(dst + 4 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 7)); + vst1q_u8(dst + 4 * stride + 16, vextq_u8(d1_lo, d1_hi, 7)); + vst1q_u8(dst + 5 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 5)); + vst1q_u8(dst + 5 * stride + 16, vextq_u8(d1_lo, d1_hi, 5)); + vst1q_u8(dst + 6 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 3)); + vst1q_u8(dst + 6 * stride + 16, vextq_u8(d1_lo, d1_hi, 3)); + vst1q_u8(dst + 7 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 1)); + vst1q_u8(dst + 7 * stride + 16, vextq_u8(d1_lo, d1_hi, 1)); + vst1q_u8(dst + 8 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 15)); + vst1q_u8(dst + 8 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 15)); + vst1q_u8(dst + 9 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 13)); + vst1q_u8(dst + 9 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 13)); + vst1q_u8(dst + 10 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 11)); + vst1q_u8(dst + 10 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 11)); + vst1q_u8(dst + 11 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 9)); + vst1q_u8(dst + 11 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 9)); + vst1q_u8(dst + 12 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 7)); + vst1q_u8(dst + 12 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 7)); + vst1q_u8(dst + 13 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 5)); + vst1q_u8(dst + 13 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 5)); + vst1q_u8(dst + 14 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 3)); + vst1q_u8(dst + 14 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 3)); + vst1q_u8(dst + 15 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 1)); + vst1q_u8(dst + 15 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 1)); + vst1q_u8(dst + 16 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 15)); + vst1q_u8(dst + 16 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 15)); + vst1q_u8(dst + 17 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 13)); + vst1q_u8(dst + 17 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 13)); + vst1q_u8(dst + 18 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 11)); + vst1q_u8(dst + 18 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 11)); + vst1q_u8(dst + 19 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 9)); + vst1q_u8(dst + 19 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 9)); + vst1q_u8(dst + 20 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 7)); + vst1q_u8(dst + 20 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 7)); + vst1q_u8(dst + 21 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 5)); + vst1q_u8(dst + 21 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 5)); + vst1q_u8(dst + 22 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 3)); + vst1q_u8(dst + 22 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 3)); + vst1q_u8(dst + 23 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 1)); + vst1q_u8(dst + 23 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 1)); + vst1q_u8(dst + 24 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 15)); + vst1q_u8(dst + 24 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 15)); + vst1q_u8(dst + 25 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 13)); + vst1q_u8(dst + 25 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 13)); + vst1q_u8(dst + 26 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 11)); + vst1q_u8(dst + 26 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 11)); + vst1q_u8(dst + 27 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 9)); + vst1q_u8(dst + 27 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 9)); + vst1q_u8(dst + 28 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 7)); + vst1q_u8(dst + 28 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 7)); + vst1q_u8(dst + 29 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 5)); + vst1q_u8(dst + 29 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 5)); + vst1q_u8(dst + 30 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 3)); + vst1q_u8(dst + 30 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 3)); + vst1q_u8(dst + 31 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 1)); + vst1q_u8(dst + 31 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 1)); +} + +// ----------------------------------------------------------------------------- + #if !HAVE_NEON_ASM void vpx_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 48552a6f8d..1423de2689 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -63,7 +63,7 @@ () specialize qw/vpx_d135_predictor_4x4 neon/; add_proto qw/void vpx_d153_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_d153_predictor_4x4 ssse3/; +specialize qw/vpx_d153_predictor_4x4 neon ssse3/; add_proto qw/void vpx_v_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_v_predictor_4x4 neon msa sse2/; @@ -108,7 +108,7 @@ () specialize qw/vpx_d135_predictor_8x8 neon/; add_proto qw/void vpx_d153_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_d153_predictor_8x8 ssse3/; +specialize qw/vpx_d153_predictor_8x8 neon ssse3/; add_proto qw/void vpx_v_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_v_predictor_8x8 neon msa sse2/; @@ -149,7 +149,7 @@ () specialize qw/vpx_d135_predictor_16x16 neon/; add_proto qw/void vpx_d153_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_d153_predictor_16x16 ssse3/; +specialize qw/vpx_d153_predictor_16x16 neon ssse3/; add_proto qw/void vpx_v_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_v_predictor_16x16 neon msa sse2 vsx/; @@ -188,7 +188,7 @@ () specialize qw/vpx_d135_predictor_32x32 neon/; add_proto qw/void vpx_d153_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_d153_predictor_32x32 ssse3/; +specialize qw/vpx_d153_predictor_32x32 neon ssse3/; add_proto qw/void vpx_v_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_v_predictor_32x32 neon msa sse2 vsx/; From 7e88600bf90787765c3e98d5f0bd7cf72b74d6ba Mon Sep 17 00:00:00 2001 From: George Steed Date: Mon, 6 Mar 2023 09:27:41 +0000 Subject: [PATCH 0650/1000] Implement d117_predictor using Neon Add Neon implementations of the d117 predictor for 4x4, 8x8, 16x16 and 32x32 block sizes. Also update tests to add new corresponding cases. This re-lands commit 360e9069b6cc1dd3a004728b876fb923413f4b11, previously reverted in commit 394de691a0ef570fc49943f565ad53ee0d22a7f3. The implementation is mostly identical to the original but with an adjustment to how data is loaded from the `left` array. In particular the left array cannot be guaranteed to be larger than the block size, so the read of e.g. `left[32]` in the `bs=32` case is not valid. This turns out to be not a problem since the last lane loaded in this case is unused. I have added comments in the code to explain why this is the case. Since we cannot load the last element directly, we instead construct it from the previous aligned read. This seems to have an inconsistent affect on performance, improving by up to 10% in some cases and regressing by up to 10% on others. Either way it is still significantly faster than the original C code. Speedups over the C code (higher is better): Microarch. | Compiler | Block | Speedup Neoverse N1 | LLVM 15 | 4x4 | 1.88 Neoverse N1 | LLVM 15 | 8x8 | 5.19 Neoverse N1 | LLVM 15 | 16x16 | 9.63 Neoverse N1 | LLVM 15 | 32x32 | 13.85 Neoverse N1 | GCC 12 | 4x4 | 2.04 Neoverse N1 | GCC 12 | 8x8 | 4.62 Neoverse N1 | GCC 12 | 16x16 | 9.79 Neoverse N1 | GCC 12 | 32x32 | 4.69 Neoverse V1 | LLVM 15 | 4x4 | 1.75 Neoverse V1 | LLVM 15 | 8x8 | 6.71 Neoverse V1 | LLVM 15 | 16x16 | 9.62 Neoverse V1 | LLVM 15 | 32x32 | 13.81 Neoverse V1 | GCC 12 | 4x4 | 1.75 Neoverse V1 | GCC 12 | 8x8 | 6.01 Neoverse V1 | GCC 12 | 16x16 | 6.91 Neoverse V1 | GCC 12 | 32x32 | 4.39 Change-Id: Ia0977ff0b0eba2c41c7884b64e7c22ff9bc9549d --- test/test_intra_pred_speed.cc | 20 +-- test/vp9_intrapred_test.cc | 8 ++ vpx_dsp/arm/intrapred_neon.c | 252 ++++++++++++++++++++++++++++++++++ vpx_dsp/vpx_dsp_rtcd_defs.pl | 4 + 4 files changed, 276 insertions(+), 8 deletions(-) diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc index 30817553ff..24af471eaa 100644 --- a/test/test_intra_pred_speed.cc +++ b/test/test_intra_pred_speed.cc @@ -269,28 +269,32 @@ INTRA_PRED_TEST(NEON, TestIntraPred4, vpx_dc_predictor_4x4_neon, vpx_dc_left_predictor_4x4_neon, vpx_dc_top_predictor_4x4_neon, vpx_dc_128_predictor_4x4_neon, vpx_v_predictor_4x4_neon, vpx_h_predictor_4x4_neon, vpx_d45_predictor_4x4_neon, - vpx_d135_predictor_4x4_neon, nullptr, nullptr, nullptr, - vpx_d63_predictor_4x4_neon, vpx_tm_predictor_4x4_neon) + vpx_d135_predictor_4x4_neon, vpx_d117_predictor_4x4_neon, + nullptr, nullptr, vpx_d63_predictor_4x4_neon, + vpx_tm_predictor_4x4_neon) INTRA_PRED_TEST(NEON, TestIntraPred8, vpx_dc_predictor_8x8_neon, vpx_dc_left_predictor_8x8_neon, vpx_dc_top_predictor_8x8_neon, vpx_dc_128_predictor_8x8_neon, vpx_v_predictor_8x8_neon, vpx_h_predictor_8x8_neon, vpx_d45_predictor_8x8_neon, - vpx_d135_predictor_8x8_neon, nullptr, nullptr, nullptr, - vpx_d63_predictor_8x8_neon, vpx_tm_predictor_8x8_neon) + vpx_d135_predictor_8x8_neon, vpx_d117_predictor_8x8_neon, + nullptr, nullptr, vpx_d63_predictor_8x8_neon, + vpx_tm_predictor_8x8_neon) INTRA_PRED_TEST(NEON, TestIntraPred16, vpx_dc_predictor_16x16_neon, vpx_dc_left_predictor_16x16_neon, vpx_dc_top_predictor_16x16_neon, vpx_dc_128_predictor_16x16_neon, vpx_v_predictor_16x16_neon, vpx_h_predictor_16x16_neon, vpx_d45_predictor_16x16_neon, - vpx_d135_predictor_16x16_neon, nullptr, nullptr, nullptr, - vpx_d63_predictor_16x16_neon, vpx_tm_predictor_16x16_neon) + vpx_d135_predictor_16x16_neon, vpx_d117_predictor_16x16_neon, + nullptr, nullptr, vpx_d63_predictor_16x16_neon, + vpx_tm_predictor_16x16_neon) INTRA_PRED_TEST(NEON, TestIntraPred32, vpx_dc_predictor_32x32_neon, vpx_dc_left_predictor_32x32_neon, vpx_dc_top_predictor_32x32_neon, vpx_dc_128_predictor_32x32_neon, vpx_v_predictor_32x32_neon, vpx_h_predictor_32x32_neon, vpx_d45_predictor_32x32_neon, - vpx_d135_predictor_32x32_neon, nullptr, nullptr, nullptr, - vpx_d63_predictor_32x32_neon, vpx_tm_predictor_32x32_neon) + vpx_d135_predictor_32x32_neon, vpx_d117_predictor_32x32_neon, + nullptr, nullptr, vpx_d63_predictor_32x32_neon, + vpx_tm_predictor_32x32_neon) #endif // HAVE_NEON #if HAVE_MSA diff --git a/test/vp9_intrapred_test.cc b/test/vp9_intrapred_test.cc index 7f8e1c5b51..83e371df6e 100644 --- a/test/vp9_intrapred_test.cc +++ b/test/vp9_intrapred_test.cc @@ -251,6 +251,14 @@ INSTANTIATE_TEST_SUITE_P( &vpx_d63_predictor_16x16_c, 16, 8), IntraPredParam(&vpx_d63_predictor_32x32_neon, &vpx_d63_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_d117_predictor_4x4_neon, &vpx_d117_predictor_4x4_c, + 4, 8), + IntraPredParam(&vpx_d117_predictor_8x8_neon, &vpx_d117_predictor_8x8_c, + 8, 8), + IntraPredParam(&vpx_d117_predictor_16x16_neon, + &vpx_d117_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_d117_predictor_32x32_neon, + &vpx_d117_predictor_32x32_c, 32, 8), IntraPredParam(&vpx_d135_predictor_4x4_neon, &vpx_d135_predictor_4x4_c, 4, 8), IntraPredParam(&vpx_d135_predictor_8x8_neon, &vpx_d135_predictor_8x8_c, diff --git a/vpx_dsp/arm/intrapred_neon.c b/vpx_dsp/arm/intrapred_neon.c index 02a05aae53..629c7170c6 100644 --- a/vpx_dsp/arm/intrapred_neon.c +++ b/vpx_dsp/arm/intrapred_neon.c @@ -545,6 +545,258 @@ void vpx_d63_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, // ----------------------------------------------------------------------------- +void vpx_d117_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + // See vpx_d117_predictor_8x8_neon for more details on the implementation. + uint8x8_t az, a0, l0az, d0, d1, d2, d3, col0, col1; + + az = load_unaligned_u8_4x1(above - 1); + a0 = load_unaligned_u8_4x1(above + 0); + // [ left[0], above[-1], above[0], above[1], x, x, x, x ] + l0az = vext_u8(vld1_dup_u8(left), az, 7); + + col0 = vdup_n_u8((above[-1] + 2 * left[0] + left[1] + 2) >> 2); + col1 = vdup_n_u8((left[0] + 2 * left[1] + left[2] + 2) >> 2); + + d0 = vrhadd_u8(az, a0); + d1 = vrhadd_u8(vhadd_u8(l0az, a0), az); + d2 = vext_u8(col0, d0, 7); + d3 = vext_u8(col1, d1, 7); + + store_u8_4x1(dst + 0 * stride, d0); + store_u8_4x1(dst + 1 * stride, d1); + store_u8_4x1(dst + 2 * stride, d2); + store_u8_4x1(dst + 3 * stride, d3); +} + +void vpx_d117_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + uint8x8_t az, a0, l0az, d0, d1, l0, l1, azl0, col0, col0_even, col0_odd; + + az = vld1_u8(above - 1); + a0 = vld1_u8(above + 0); + // [ left[0], above[-1], ... , above[5] ] + l0az = vext_u8(vld1_dup_u8(left), az, 7); + + l0 = vld1_u8(left + 0); + // The last lane here is unused, reading left[8] could cause a buffer + // over-read, so just fill with a duplicate of left[0] to avoid needing to + // materialize a zero: + // [ left[1], ... , left[7], x ] + l1 = vext_u8(l0, l0, 1); + // [ above[-1], left[0], ... , left[6] ] + azl0 = vext_u8(vld1_dup_u8(above - 1), l0, 7); + + // d0[0] = AVG2(above[-1], above[0]) + // d0[1] = AVG2(above[0], above[1]) + // ... + // d0[7] = AVG2(above[6], above[7]) + d0 = vrhadd_u8(az, a0); + + // d1[0] = AVG3(left[0], above[-1], above[0]) + // d1[1] = AVG3(above[-1], above[0], above[1]) + // ... + // d1[7] = AVG3(above[5], above[6], above[7]) + d1 = vrhadd_u8(vhadd_u8(l0az, a0), az); + + // The ext instruction shifts elements in from the end of the vector rather + // than the start, so reverse the vector to put the elements to be shifted in + // at the end. The lowest two lanes here are unused: + // col0[7] = AVG3(above[-1], left[0], left[1]) + // col0[6] = AVG3(left[0], left[1], left[2]) + // ... + // col0[2] = AVG3(left[4], left[5], left[6]) + // col0[1] = x (don't care) + // col0[0] = x (don't care) + col0 = vrev64_u8(vrhadd_u8(vhadd_u8(azl0, l1), l0)); + + // We don't care about the first parameter to this uzp since we only ever use + // the high three elements, we just use col0 again since it is already + // available: + // col0_even = [ x, x, x, x, x, col0[3], col0[5], col0[7] ] + // col0_odd = [ x, x, x, x, x, col0[2], col0[4], col0[6] ] + col0_even = vuzp_u8(col0, col0).val[1]; + col0_odd = vuzp_u8(col0, col0).val[0]; + + // Incrementally shift more elements from col0 into d0/1: + // stride=0 [ d0[0], d0[1], d0[2], d0[3], d0[4], d0[5], d0[6], d0[7] ] + // stride=1 [ d1[0], d1[1], d1[2], d1[3], d1[4], d1[5], d1[6], d1[7] ] + // stride=2 [ col0[7], d0[0], d0[1], d0[2], d0[3], d0[4], d0[5], d0[6] ] + // stride=3 [ col0[6], d1[0], d1[1], d1[2], d1[3], d1[4], d1[5], d1[6] ] + // stride=4 [ col0[5], col0[7], d0[0], d0[1], d0[2], d0[3], d0[4], d0[5] ] + // stride=5 [ col0[4], col0[6], d1[0], d1[1], d1[2], d1[3], d1[4], d1[5] ] + // stride=6 [ col0[3], col0[5], col0[7], d0[0], d0[1], d0[2], d0[3], d0[4] ] + // stride=7 [ col0[2], col0[4], col0[6], d1[0], d1[1], d1[2], d1[3], d1[4] ] + vst1_u8(dst + 0 * stride, d0); + vst1_u8(dst + 1 * stride, d1); + vst1_u8(dst + 2 * stride, vext_u8(col0_even, d0, 7)); + vst1_u8(dst + 3 * stride, vext_u8(col0_odd, d1, 7)); + vst1_u8(dst + 4 * stride, vext_u8(col0_even, d0, 6)); + vst1_u8(dst + 5 * stride, vext_u8(col0_odd, d1, 6)); + vst1_u8(dst + 6 * stride, vext_u8(col0_even, d0, 5)); + vst1_u8(dst + 7 * stride, vext_u8(col0_odd, d1, 5)); +} + +void vpx_d117_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + // See vpx_d117_predictor_8x8_neon for more details on the implementation. + uint8x16_t az, a0, l0az, d0, d1, l0, l1, azl0, col0, col0_even, col0_odd; + + az = vld1q_u8(above - 1); + a0 = vld1q_u8(above + 0); + // [ left[0], above[-1], ... , above[13] ] + l0az = vextq_u8(vld1q_dup_u8(left), az, 15); + + l0 = vld1q_u8(left + 0); + // The last lane here is unused, reading left[16] could cause a buffer + // over-read, so just fill with a duplicate of left[0] to avoid needing to + // materialize a zero: + // [ left[1], ... , left[15], x ] + l1 = vextq_u8(l0, l0, 1); + // [ above[-1], left[0], ... , left[14] ] + azl0 = vextq_u8(vld1q_dup_u8(above - 1), l0, 15); + + d0 = vrhaddq_u8(az, a0); + d1 = vrhaddq_u8(vhaddq_u8(l0az, a0), az); + + col0 = vrhaddq_u8(vhaddq_u8(azl0, l1), l0); + col0 = vrev64q_u8(vextq_u8(col0, col0, 8)); + + // The low nine lanes here are unused so the first input to the uzp is + // unused, so just use a duplicate of col0 since we have it already. This + // also means that the lowest lane of col0 here is unused. + col0_even = vuzpq_u8(col0, col0).val[1]; + col0_odd = vuzpq_u8(col0, col0).val[0]; + + vst1q_u8(dst + 0 * stride, d0); + vst1q_u8(dst + 1 * stride, d1); + vst1q_u8(dst + 2 * stride, vextq_u8(col0_even, d0, 15)); + vst1q_u8(dst + 3 * stride, vextq_u8(col0_odd, d1, 15)); + vst1q_u8(dst + 4 * stride, vextq_u8(col0_even, d0, 14)); + vst1q_u8(dst + 5 * stride, vextq_u8(col0_odd, d1, 14)); + vst1q_u8(dst + 6 * stride, vextq_u8(col0_even, d0, 13)); + vst1q_u8(dst + 7 * stride, vextq_u8(col0_odd, d1, 13)); + vst1q_u8(dst + 8 * stride, vextq_u8(col0_even, d0, 12)); + vst1q_u8(dst + 9 * stride, vextq_u8(col0_odd, d1, 12)); + vst1q_u8(dst + 10 * stride, vextq_u8(col0_even, d0, 11)); + vst1q_u8(dst + 11 * stride, vextq_u8(col0_odd, d1, 11)); + vst1q_u8(dst + 12 * stride, vextq_u8(col0_even, d0, 10)); + vst1q_u8(dst + 13 * stride, vextq_u8(col0_odd, d1, 10)); + vst1q_u8(dst + 14 * stride, vextq_u8(col0_even, d0, 9)); + vst1q_u8(dst + 15 * stride, vextq_u8(col0_odd, d1, 9)); +} + +void vpx_d117_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + // See vpx_d117_predictor_8x8_neon for more details on the implementation. + uint8x16_t az, a0, a14, a15, a16, l0az, d0_lo, d0_hi, d1_lo, d1_hi, l0, l1, + l15, l16, l17, azl0, col0_lo, col0_hi, col0_even, col0_odd; + + az = vld1q_u8(above - 1); + a0 = vld1q_u8(above + 0); + a14 = vld1q_u8(above + 14); + a15 = vld1q_u8(above + 15); + a16 = vld1q_u8(above + 16); + // [ left[0], above[-1], ... , above[13] ] + l0az = vextq_u8(vld1q_dup_u8(left), az, 15); + + l0 = vld1q_u8(left + 0); + l1 = vld1q_u8(left + 1); + l15 = vld1q_u8(left + 15); + l16 = vld1q_u8(left + 16); + // The last lane here is unused, reading left[32] would cause a buffer + // over-read (observed as an address-sanitizer failure), so just fill with a + // duplicate of left[16] to avoid needing to materialize a zero: + // [ left[17], ... , left[31], x ] + l17 = vextq_u8(l16, l16, 1); + // [ above[-1], left[0], ... , left[14] ] + azl0 = vextq_u8(vld1q_dup_u8(above - 1), l0, 15); + + d0_lo = vrhaddq_u8(az, a0); + d0_hi = vrhaddq_u8(a15, a16); + d1_lo = vrhaddq_u8(vhaddq_u8(l0az, a0), az); + d1_hi = vrhaddq_u8(vhaddq_u8(a14, a16), a15); + + // The last lane of col0_hi is unused here. + col0_lo = vrhaddq_u8(vhaddq_u8(azl0, l1), l0); + col0_hi = vrhaddq_u8(vhaddq_u8(l15, l17), l16); + + col0_lo = vrev64q_u8(vextq_u8(col0_lo, col0_lo, 8)); + col0_hi = vrev64q_u8(vextq_u8(col0_hi, col0_hi, 8)); + + // The first lane of these are unused since they are only ever called as + // ext(col0, _, i) where i >= 1. + col0_even = vuzpq_u8(col0_hi, col0_lo).val[1]; + col0_odd = vuzpq_u8(col0_hi, col0_lo).val[0]; + + vst1q_u8(dst + 0 * stride + 0, d0_lo); + vst1q_u8(dst + 0 * stride + 16, d0_hi); + vst1q_u8(dst + 1 * stride + 0, d1_lo); + vst1q_u8(dst + 1 * stride + 16, d1_hi); + vst1q_u8(dst + 2 * stride + 0, vextq_u8(col0_even, d0_lo, 15)); + vst1q_u8(dst + 2 * stride + 16, vextq_u8(d0_lo, d0_hi, 15)); + vst1q_u8(dst + 3 * stride + 0, vextq_u8(col0_odd, d1_lo, 15)); + vst1q_u8(dst + 3 * stride + 16, vextq_u8(d1_lo, d1_hi, 15)); + vst1q_u8(dst + 4 * stride + 0, vextq_u8(col0_even, d0_lo, 14)); + vst1q_u8(dst + 4 * stride + 16, vextq_u8(d0_lo, d0_hi, 14)); + vst1q_u8(dst + 5 * stride + 0, vextq_u8(col0_odd, d1_lo, 14)); + vst1q_u8(dst + 5 * stride + 16, vextq_u8(d1_lo, d1_hi, 14)); + vst1q_u8(dst + 6 * stride + 0, vextq_u8(col0_even, d0_lo, 13)); + vst1q_u8(dst + 6 * stride + 16, vextq_u8(d0_lo, d0_hi, 13)); + vst1q_u8(dst + 7 * stride + 0, vextq_u8(col0_odd, d1_lo, 13)); + vst1q_u8(dst + 7 * stride + 16, vextq_u8(d1_lo, d1_hi, 13)); + vst1q_u8(dst + 8 * stride + 0, vextq_u8(col0_even, d0_lo, 12)); + vst1q_u8(dst + 8 * stride + 16, vextq_u8(d0_lo, d0_hi, 12)); + vst1q_u8(dst + 9 * stride + 0, vextq_u8(col0_odd, d1_lo, 12)); + vst1q_u8(dst + 9 * stride + 16, vextq_u8(d1_lo, d1_hi, 12)); + vst1q_u8(dst + 10 * stride + 0, vextq_u8(col0_even, d0_lo, 11)); + vst1q_u8(dst + 10 * stride + 16, vextq_u8(d0_lo, d0_hi, 11)); + vst1q_u8(dst + 11 * stride + 0, vextq_u8(col0_odd, d1_lo, 11)); + vst1q_u8(dst + 11 * stride + 16, vextq_u8(d1_lo, d1_hi, 11)); + vst1q_u8(dst + 12 * stride + 0, vextq_u8(col0_even, d0_lo, 10)); + vst1q_u8(dst + 12 * stride + 16, vextq_u8(d0_lo, d0_hi, 10)); + vst1q_u8(dst + 13 * stride + 0, vextq_u8(col0_odd, d1_lo, 10)); + vst1q_u8(dst + 13 * stride + 16, vextq_u8(d1_lo, d1_hi, 10)); + vst1q_u8(dst + 14 * stride + 0, vextq_u8(col0_even, d0_lo, 9)); + vst1q_u8(dst + 14 * stride + 16, vextq_u8(d0_lo, d0_hi, 9)); + vst1q_u8(dst + 15 * stride + 0, vextq_u8(col0_odd, d1_lo, 9)); + vst1q_u8(dst + 15 * stride + 16, vextq_u8(d1_lo, d1_hi, 9)); + vst1q_u8(dst + 16 * stride + 0, vextq_u8(col0_even, d0_lo, 8)); + vst1q_u8(dst + 16 * stride + 16, vextq_u8(d0_lo, d0_hi, 8)); + vst1q_u8(dst + 17 * stride + 0, vextq_u8(col0_odd, d1_lo, 8)); + vst1q_u8(dst + 17 * stride + 16, vextq_u8(d1_lo, d1_hi, 8)); + vst1q_u8(dst + 18 * stride + 0, vextq_u8(col0_even, d0_lo, 7)); + vst1q_u8(dst + 18 * stride + 16, vextq_u8(d0_lo, d0_hi, 7)); + vst1q_u8(dst + 19 * stride + 0, vextq_u8(col0_odd, d1_lo, 7)); + vst1q_u8(dst + 19 * stride + 16, vextq_u8(d1_lo, d1_hi, 7)); + vst1q_u8(dst + 20 * stride + 0, vextq_u8(col0_even, d0_lo, 6)); + vst1q_u8(dst + 20 * stride + 16, vextq_u8(d0_lo, d0_hi, 6)); + vst1q_u8(dst + 21 * stride + 0, vextq_u8(col0_odd, d1_lo, 6)); + vst1q_u8(dst + 21 * stride + 16, vextq_u8(d1_lo, d1_hi, 6)); + vst1q_u8(dst + 22 * stride + 0, vextq_u8(col0_even, d0_lo, 5)); + vst1q_u8(dst + 22 * stride + 16, vextq_u8(d0_lo, d0_hi, 5)); + vst1q_u8(dst + 23 * stride + 0, vextq_u8(col0_odd, d1_lo, 5)); + vst1q_u8(dst + 23 * stride + 16, vextq_u8(d1_lo, d1_hi, 5)); + vst1q_u8(dst + 24 * stride + 0, vextq_u8(col0_even, d0_lo, 4)); + vst1q_u8(dst + 24 * stride + 16, vextq_u8(d0_lo, d0_hi, 4)); + vst1q_u8(dst + 25 * stride + 0, vextq_u8(col0_odd, d1_lo, 4)); + vst1q_u8(dst + 25 * stride + 16, vextq_u8(d1_lo, d1_hi, 4)); + vst1q_u8(dst + 26 * stride + 0, vextq_u8(col0_even, d0_lo, 3)); + vst1q_u8(dst + 26 * stride + 16, vextq_u8(d0_lo, d0_hi, 3)); + vst1q_u8(dst + 27 * stride + 0, vextq_u8(col0_odd, d1_lo, 3)); + vst1q_u8(dst + 27 * stride + 16, vextq_u8(d1_lo, d1_hi, 3)); + vst1q_u8(dst + 28 * stride + 0, vextq_u8(col0_even, d0_lo, 2)); + vst1q_u8(dst + 28 * stride + 16, vextq_u8(d0_lo, d0_hi, 2)); + vst1q_u8(dst + 29 * stride + 0, vextq_u8(col0_odd, d1_lo, 2)); + vst1q_u8(dst + 29 * stride + 16, vextq_u8(d1_lo, d1_hi, 2)); + vst1q_u8(dst + 30 * stride + 0, vextq_u8(col0_even, d0_lo, 1)); + vst1q_u8(dst + 30 * stride + 16, vextq_u8(d0_lo, d0_hi, 1)); + vst1q_u8(dst + 31 * stride + 0, vextq_u8(col0_odd, d1_lo, 1)); + vst1q_u8(dst + 31 * stride + 16, vextq_u8(d1_lo, d1_hi, 1)); +} + +// ----------------------------------------------------------------------------- + void vpx_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const uint8x8_t XA0123 = vld1_u8(above - 1); diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 2301fbe328..652c553f97 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -57,6 +57,7 @@ () add_proto qw/void vpx_he_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; add_proto qw/void vpx_d117_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vpx_d117_predictor_4x4 neon/; add_proto qw/void vpx_d135_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d135_predictor_4x4 neon/; @@ -101,6 +102,7 @@ () specialize qw/vpx_h_predictor_8x8 neon dspr2 msa sse2/; add_proto qw/void vpx_d117_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vpx_d117_predictor_8x8 neon/; add_proto qw/void vpx_d135_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d135_predictor_8x8 neon/; @@ -141,6 +143,7 @@ () specialize qw/vpx_h_predictor_16x16 neon dspr2 msa sse2 vsx/; add_proto qw/void vpx_d117_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vpx_d117_predictor_16x16 neon/; add_proto qw/void vpx_d135_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d135_predictor_16x16 neon/; @@ -179,6 +182,7 @@ () specialize qw/vpx_h_predictor_32x32 neon msa sse2 vsx/; add_proto qw/void vpx_d117_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vpx_d117_predictor_32x32 neon/; add_proto qw/void vpx_d135_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d135_predictor_32x32 neon/; From 872476c66b31a56c5e54262a5e38481c8430f71c Mon Sep 17 00:00:00 2001 From: George Steed Date: Tue, 14 Feb 2023 14:56:25 +0000 Subject: [PATCH 0651/1000] Implement d207_predictor using Neon Add Neon implementations of the d207 predictor for 4x4, 8x8, 16x16 and 32x32 block sizes. Also update tests to add new corresponding cases. Speedups over the C code (higher is better): Microarch. | Compiler | Block | Speedup Neoverse N1 | LLVM 15 | 4x4 | 1.72 Neoverse N1 | LLVM 15 | 8x8 | 5.68 Neoverse N1 | LLVM 15 | 16x16 | 12.30 Neoverse N1 | LLVM 15 | 32x32 | 16.70 Neoverse N1 | GCC 12 | 4x4 | 1.71 Neoverse N1 | GCC 12 | 8x8 | 6.01 Neoverse N1 | GCC 12 | 16x16 | 12.40 Neoverse N1 | GCC 12 | 32x32 | 6.71 Neoverse V1 | LLVM 15 | 4x4 | 1.99 Neoverse V1 | LLVM 15 | 8x8 | 8.28 Neoverse V1 | LLVM 15 | 16x16 | 14.36 Neoverse V1 | LLVM 15 | 32x32 | 17.55 Neoverse V1 | GCC 12 | 4x4 | 1.99 Neoverse V1 | GCC 12 | 8x8 | 8.43 Neoverse V1 | GCC 12 | 16x16 | 14.41 Neoverse V1 | GCC 12 | 32x32 | 7.82 Change-Id: I250ab56edab3390b0bac9dc96995a4bf9a4da641 --- test/test_intra_pred_speed.cc | 8 +- test/vp9_intrapred_test.cc | 8 ++ vpx_dsp/arm/intrapred_neon.c | 194 ++++++++++++++++++++++++++++++++++ vpx_dsp/arm/mem_neon.h | 13 +++ vpx_dsp/vpx_dsp_rtcd_defs.pl | 8 +- 5 files changed, 223 insertions(+), 8 deletions(-) diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc index 871f778116..5792161b34 100644 --- a/test/test_intra_pred_speed.cc +++ b/test/test_intra_pred_speed.cc @@ -270,14 +270,14 @@ INTRA_PRED_TEST(NEON, TestIntraPred4, vpx_dc_predictor_4x4_neon, vpx_dc_128_predictor_4x4_neon, vpx_v_predictor_4x4_neon, vpx_h_predictor_4x4_neon, vpx_d45_predictor_4x4_neon, vpx_d135_predictor_4x4_neon, vpx_d117_predictor_4x4_neon, - vpx_d153_predictor_4x4_neon, nullptr, + vpx_d153_predictor_4x4_neon, vpx_d207_predictor_4x4_neon, vpx_d63_predictor_4x4_neon, vpx_tm_predictor_4x4_neon) INTRA_PRED_TEST(NEON, TestIntraPred8, vpx_dc_predictor_8x8_neon, vpx_dc_left_predictor_8x8_neon, vpx_dc_top_predictor_8x8_neon, vpx_dc_128_predictor_8x8_neon, vpx_v_predictor_8x8_neon, vpx_h_predictor_8x8_neon, vpx_d45_predictor_8x8_neon, vpx_d135_predictor_8x8_neon, vpx_d117_predictor_8x8_neon, - vpx_d153_predictor_8x8_neon, nullptr, + vpx_d153_predictor_8x8_neon, vpx_d207_predictor_8x8_neon, vpx_d63_predictor_8x8_neon, vpx_tm_predictor_8x8_neon) INTRA_PRED_TEST(NEON, TestIntraPred16, vpx_dc_predictor_16x16_neon, vpx_dc_left_predictor_16x16_neon, @@ -285,7 +285,7 @@ INTRA_PRED_TEST(NEON, TestIntraPred16, vpx_dc_predictor_16x16_neon, vpx_dc_128_predictor_16x16_neon, vpx_v_predictor_16x16_neon, vpx_h_predictor_16x16_neon, vpx_d45_predictor_16x16_neon, vpx_d135_predictor_16x16_neon, vpx_d117_predictor_16x16_neon, - vpx_d153_predictor_16x16_neon, nullptr, + vpx_d153_predictor_16x16_neon, vpx_d207_predictor_16x16_neon, vpx_d63_predictor_16x16_neon, vpx_tm_predictor_16x16_neon) INTRA_PRED_TEST(NEON, TestIntraPred32, vpx_dc_predictor_32x32_neon, vpx_dc_left_predictor_32x32_neon, @@ -293,7 +293,7 @@ INTRA_PRED_TEST(NEON, TestIntraPred32, vpx_dc_predictor_32x32_neon, vpx_dc_128_predictor_32x32_neon, vpx_v_predictor_32x32_neon, vpx_h_predictor_32x32_neon, vpx_d45_predictor_32x32_neon, vpx_d135_predictor_32x32_neon, vpx_d117_predictor_32x32_neon, - vpx_d153_predictor_32x32_neon, nullptr, + vpx_d153_predictor_32x32_neon, vpx_d207_predictor_32x32_neon, vpx_d63_predictor_32x32_neon, vpx_tm_predictor_32x32_neon) #endif // HAVE_NEON diff --git a/test/vp9_intrapred_test.cc b/test/vp9_intrapred_test.cc index a2ea1334d8..8696c0a787 100644 --- a/test/vp9_intrapred_test.cc +++ b/test/vp9_intrapred_test.cc @@ -275,6 +275,14 @@ INSTANTIATE_TEST_SUITE_P( &vpx_d153_predictor_16x16_c, 16, 8), IntraPredParam(&vpx_d153_predictor_32x32_neon, &vpx_d153_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_d207_predictor_4x4_neon, &vpx_d207_predictor_4x4_c, + 4, 8), + IntraPredParam(&vpx_d207_predictor_8x8_neon, &vpx_d207_predictor_8x8_c, + 8, 8), + IntraPredParam(&vpx_d207_predictor_16x16_neon, + &vpx_d207_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_d207_predictor_32x32_neon, + &vpx_d207_predictor_32x32_c, 32, 8), IntraPredParam(&vpx_dc_128_predictor_4x4_neon, &vpx_dc_128_predictor_4x4_c, 4, 8), IntraPredParam(&vpx_dc_128_predictor_8x8_neon, diff --git a/vpx_dsp/arm/intrapred_neon.c b/vpx_dsp/arm/intrapred_neon.c index 1ff4bf2955..892310f151 100644 --- a/vpx_dsp/arm/intrapred_neon.c +++ b/vpx_dsp/arm/intrapred_neon.c @@ -1335,6 +1335,200 @@ void vpx_d153_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, // ----------------------------------------------------------------------------- +void vpx_d207_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + uint8x8_t l0, l3, l1, l2, c0, c1, c01, d0, d1; + (void)above; + + // We need the low half lanes here for the c0/c1 arithmetic but the high half + // lanes for the ext: + // [ left[0], left[1], left[2], left[3], left[0], left[1], left[2], left[3] ] + l0 = load_replicate_u8_4x1(left + 0); + l3 = vld1_dup_u8(left + 3); + + // [ left[1], left[2], left[3], left[3], x, x, x, x ] + l1 = vext_u8(l0, l3, 5); + // [ left[2], left[3], left[3], left[3], x, x, x, x ] + l2 = vext_u8(l0, l3, 6); + + c0 = vrhadd_u8(l0, l1); + c1 = vrhadd_u8(vhadd_u8(l0, l2), l1); + + // [ c0[0], c1[0], c0[1], c1[1], c0[2], c1[2], c0[3], c1[3] ] + c01 = vzip_u8(c0, c1).val[0]; + + d0 = c01; + d1 = vext_u8(c01, l3, 2); + + // Store the high half of the vector for stride={2,3} to avoid needing + // additional ext instructions: + // stride=0 [ c0[0], c1[0], c0[1], c1[1] ] + // stride=1 [ c0[1], c1[1], c0[2], c1[2] ] + // stride=2 [ c0[2], c1[2], c0[3], c1[3] ] + // stride=3 [ c0[3], c1[3], left[3], left[3] ] + store_u8_4x1(dst + 0 * stride, d0); + store_u8_4x1(dst + 1 * stride, d1); + store_u8_4x1_high(dst + 2 * stride, d0); + store_u8_4x1_high(dst + 3 * stride, d1); +} + +void vpx_d207_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + uint8x8_t l7, l0, l1, l2, c0, c1, c01_lo, c01_hi; + (void)above; + + l0 = vld1_u8(left + 0); + l7 = vld1_dup_u8(left + 7); + + // [ left[1], left[2], left[3], left[4], left[5], left[6], left[7], left[7] ] + l1 = vext_u8(l0, l7, 1); + // [ left[2], left[3], left[4], left[5], left[6], left[7], left[7], left[7] ] + l2 = vext_u8(l0, l7, 2); + + c0 = vrhadd_u8(l0, l1); + c1 = vrhadd_u8(vhadd_u8(l0, l2), l1); + + c01_lo = vzip_u8(c0, c1).val[0]; + c01_hi = vzip_u8(c0, c1).val[1]; + + vst1_u8(dst + 0 * stride, c01_lo); + vst1_u8(dst + 1 * stride, vext_u8(c01_lo, c01_hi, 2)); + vst1_u8(dst + 2 * stride, vext_u8(c01_lo, c01_hi, 4)); + vst1_u8(dst + 3 * stride, vext_u8(c01_lo, c01_hi, 6)); + vst1_u8(dst + 4 * stride, c01_hi); + vst1_u8(dst + 5 * stride, vext_u8(c01_hi, l7, 2)); + vst1_u8(dst + 6 * stride, vext_u8(c01_hi, l7, 4)); + vst1_u8(dst + 7 * stride, vext_u8(c01_hi, l7, 6)); +} + +void vpx_d207_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + uint8x16_t l15, l0, l1, l2, c0, c1, c01_lo, c01_hi; + (void)above; + + l0 = vld1q_u8(left + 0); + l15 = vld1q_dup_u8(left + 15); + + l1 = vextq_u8(l0, l15, 1); + l2 = vextq_u8(l0, l15, 2); + + c0 = vrhaddq_u8(l0, l1); + c1 = vrhaddq_u8(vhaddq_u8(l0, l2), l1); + + c01_lo = vzipq_u8(c0, c1).val[0]; + c01_hi = vzipq_u8(c0, c1).val[1]; + + vst1q_u8(dst + 0 * stride, c01_lo); + vst1q_u8(dst + 1 * stride, vextq_u8(c01_lo, c01_hi, 2)); + vst1q_u8(dst + 2 * stride, vextq_u8(c01_lo, c01_hi, 4)); + vst1q_u8(dst + 3 * stride, vextq_u8(c01_lo, c01_hi, 6)); + vst1q_u8(dst + 4 * stride, vextq_u8(c01_lo, c01_hi, 8)); + vst1q_u8(dst + 5 * stride, vextq_u8(c01_lo, c01_hi, 10)); + vst1q_u8(dst + 6 * stride, vextq_u8(c01_lo, c01_hi, 12)); + vst1q_u8(dst + 7 * stride, vextq_u8(c01_lo, c01_hi, 14)); + vst1q_u8(dst + 8 * stride, c01_hi); + vst1q_u8(dst + 9 * stride, vextq_u8(c01_hi, l15, 2)); + vst1q_u8(dst + 10 * stride, vextq_u8(c01_hi, l15, 4)); + vst1q_u8(dst + 11 * stride, vextq_u8(c01_hi, l15, 6)); + vst1q_u8(dst + 12 * stride, vextq_u8(c01_hi, l15, 8)); + vst1q_u8(dst + 13 * stride, vextq_u8(c01_hi, l15, 10)); + vst1q_u8(dst + 14 * stride, vextq_u8(c01_hi, l15, 12)); + vst1q_u8(dst + 15 * stride, vextq_u8(c01_hi, l15, 14)); +} + +void vpx_d207_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + uint8x16_t l0_lo, l0_hi, l1_lo, l1_hi, l2_lo, l2_hi, l31, c0_lo, c0_hi, c1_lo, + c1_hi, c01[4]; + (void)above; + + l0_lo = vld1q_u8(left + 0); + l0_hi = vld1q_u8(left + 16); + l31 = vld1q_dup_u8(left + 31); + + l1_lo = vextq_u8(l0_lo, l0_hi, 1); + l1_hi = vextq_u8(l0_hi, l31, 1); + l2_lo = vextq_u8(l0_lo, l0_hi, 2); + l2_hi = vextq_u8(l0_hi, l31, 2); + + c0_lo = vrhaddq_u8(l0_lo, l1_lo); + c0_hi = vrhaddq_u8(l0_hi, l1_hi); + c1_lo = vrhaddq_u8(vhaddq_u8(l0_lo, l2_lo), l1_lo); + c1_hi = vrhaddq_u8(vhaddq_u8(l0_hi, l2_hi), l1_hi); + + c01[0] = vzipq_u8(c0_lo, c1_lo).val[0]; + c01[1] = vzipq_u8(c0_lo, c1_lo).val[1]; + c01[2] = vzipq_u8(c0_hi, c1_hi).val[0]; + c01[3] = vzipq_u8(c0_hi, c1_hi).val[1]; + + vst1q_u8(dst + 0 * stride + 0, c01[0]); + vst1q_u8(dst + 0 * stride + 16, c01[1]); + vst1q_u8(dst + 1 * stride + 0, vextq_u8(c01[0], c01[1], 2)); + vst1q_u8(dst + 1 * stride + 16, vextq_u8(c01[1], c01[2], 2)); + vst1q_u8(dst + 2 * stride + 0, vextq_u8(c01[0], c01[1], 4)); + vst1q_u8(dst + 2 * stride + 16, vextq_u8(c01[1], c01[2], 4)); + vst1q_u8(dst + 3 * stride + 0, vextq_u8(c01[0], c01[1], 6)); + vst1q_u8(dst + 3 * stride + 16, vextq_u8(c01[1], c01[2], 6)); + vst1q_u8(dst + 4 * stride + 0, vextq_u8(c01[0], c01[1], 8)); + vst1q_u8(dst + 4 * stride + 16, vextq_u8(c01[1], c01[2], 8)); + vst1q_u8(dst + 5 * stride + 0, vextq_u8(c01[0], c01[1], 10)); + vst1q_u8(dst + 5 * stride + 16, vextq_u8(c01[1], c01[2], 10)); + vst1q_u8(dst + 6 * stride + 0, vextq_u8(c01[0], c01[1], 12)); + vst1q_u8(dst + 6 * stride + 16, vextq_u8(c01[1], c01[2], 12)); + vst1q_u8(dst + 7 * stride + 0, vextq_u8(c01[0], c01[1], 14)); + vst1q_u8(dst + 7 * stride + 16, vextq_u8(c01[1], c01[2], 14)); + vst1q_u8(dst + 8 * stride + 0, c01[1]); + vst1q_u8(dst + 8 * stride + 16, c01[2]); + vst1q_u8(dst + 9 * stride + 0, vextq_u8(c01[1], c01[2], 2)); + vst1q_u8(dst + 9 * stride + 16, vextq_u8(c01[2], c01[3], 2)); + vst1q_u8(dst + 10 * stride + 0, vextq_u8(c01[1], c01[2], 4)); + vst1q_u8(dst + 10 * stride + 16, vextq_u8(c01[2], c01[3], 4)); + vst1q_u8(dst + 11 * stride + 0, vextq_u8(c01[1], c01[2], 6)); + vst1q_u8(dst + 11 * stride + 16, vextq_u8(c01[2], c01[3], 6)); + vst1q_u8(dst + 12 * stride + 0, vextq_u8(c01[1], c01[2], 8)); + vst1q_u8(dst + 12 * stride + 16, vextq_u8(c01[2], c01[3], 8)); + vst1q_u8(dst + 13 * stride + 0, vextq_u8(c01[1], c01[2], 10)); + vst1q_u8(dst + 13 * stride + 16, vextq_u8(c01[2], c01[3], 10)); + vst1q_u8(dst + 14 * stride + 0, vextq_u8(c01[1], c01[2], 12)); + vst1q_u8(dst + 14 * stride + 16, vextq_u8(c01[2], c01[3], 12)); + vst1q_u8(dst + 15 * stride + 0, vextq_u8(c01[1], c01[2], 14)); + vst1q_u8(dst + 15 * stride + 16, vextq_u8(c01[2], c01[3], 14)); + vst1q_u8(dst + 16 * stride + 0, c01[2]); + vst1q_u8(dst + 16 * stride + 16, c01[3]); + vst1q_u8(dst + 17 * stride + 0, vextq_u8(c01[2], c01[3], 2)); + vst1q_u8(dst + 17 * stride + 16, vextq_u8(c01[3], l31, 2)); + vst1q_u8(dst + 18 * stride + 0, vextq_u8(c01[2], c01[3], 4)); + vst1q_u8(dst + 18 * stride + 16, vextq_u8(c01[3], l31, 4)); + vst1q_u8(dst + 19 * stride + 0, vextq_u8(c01[2], c01[3], 6)); + vst1q_u8(dst + 19 * stride + 16, vextq_u8(c01[3], l31, 6)); + vst1q_u8(dst + 20 * stride + 0, vextq_u8(c01[2], c01[3], 8)); + vst1q_u8(dst + 20 * stride + 16, vextq_u8(c01[3], l31, 8)); + vst1q_u8(dst + 21 * stride + 0, vextq_u8(c01[2], c01[3], 10)); + vst1q_u8(dst + 21 * stride + 16, vextq_u8(c01[3], l31, 10)); + vst1q_u8(dst + 22 * stride + 0, vextq_u8(c01[2], c01[3], 12)); + vst1q_u8(dst + 22 * stride + 16, vextq_u8(c01[3], l31, 12)); + vst1q_u8(dst + 23 * stride + 0, vextq_u8(c01[2], c01[3], 14)); + vst1q_u8(dst + 23 * stride + 16, vextq_u8(c01[3], l31, 14)); + vst1q_u8(dst + 24 * stride + 0, c01[3]); + vst1q_u8(dst + 24 * stride + 16, l31); + vst1q_u8(dst + 25 * stride + 0, vextq_u8(c01[3], l31, 2)); + vst1q_u8(dst + 25 * stride + 16, l31); + vst1q_u8(dst + 26 * stride + 0, vextq_u8(c01[3], l31, 4)); + vst1q_u8(dst + 26 * stride + 16, l31); + vst1q_u8(dst + 27 * stride + 0, vextq_u8(c01[3], l31, 6)); + vst1q_u8(dst + 27 * stride + 16, l31); + vst1q_u8(dst + 28 * stride + 0, vextq_u8(c01[3], l31, 8)); + vst1q_u8(dst + 28 * stride + 16, l31); + vst1q_u8(dst + 29 * stride + 0, vextq_u8(c01[3], l31, 10)); + vst1q_u8(dst + 29 * stride + 16, l31); + vst1q_u8(dst + 30 * stride + 0, vextq_u8(c01[3], l31, 12)); + vst1q_u8(dst + 30 * stride + 16, l31); + vst1q_u8(dst + 31 * stride + 0, vextq_u8(c01[3], l31, 14)); + vst1q_u8(dst + 31 * stride + 16, l31); +} + +// ----------------------------------------------------------------------------- + #if !HAVE_NEON_ASM void vpx_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, diff --git a/vpx_dsp/arm/mem_neon.h b/vpx_dsp/arm/mem_neon.h index fb8ab17780..400846b707 100644 --- a/vpx_dsp/arm/mem_neon.h +++ b/vpx_dsp/arm/mem_neon.h @@ -132,11 +132,24 @@ static INLINE uint8x8_t load_unaligned_u8_4x1(const uint8_t *buf) { return vreinterpret_u8_u32(a_u32); } +// Load 4 contiguous bytes and replicate across a vector when alignment is not +// guaranteed. +static INLINE uint8x8_t load_replicate_u8_4x1(const uint8_t *buf) { + uint32_t a; + memcpy(&a, buf, 4); + return vreinterpret_u8_u32(vdup_n_u32(a)); +} + // Store 4 contiguous bytes from the low half of an 8x8 vector. static INLINE void store_u8_4x1(uint8_t *buf, uint8x8_t a) { vst1_lane_u32((uint32_t *)buf, vreinterpret_u32_u8(a), 0); } +// Store 4 contiguous bytes from the high half of an 8x8 vector. +static INLINE void store_u8_4x1_high(uint8_t *buf, uint8x8_t a) { + vst1_lane_u32((uint32_t *)buf, vreinterpret_u32_u8(a), 1); +} + // Load 2 sets of 4 bytes when alignment is not guaranteed. static INLINE uint8x8_t load_unaligned_u8(const uint8_t *buf, ptrdiff_t stride) { diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 1423de2689..9c15912ca2 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -38,7 +38,7 @@ () # add_proto qw/void vpx_d207_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_d207_predictor_4x4 sse2/; +specialize qw/vpx_d207_predictor_4x4 neon sse2/; add_proto qw/void vpx_d45_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d45_predictor_4x4 neon sse2/; @@ -87,7 +87,7 @@ () specialize qw/vpx_dc_128_predictor_4x4 msa neon sse2/; add_proto qw/void vpx_d207_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_d207_predictor_8x8 ssse3/; +specialize qw/vpx_d207_predictor_8x8 neon ssse3/; add_proto qw/void vpx_d45_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; # TODO(crbug.com/webm/1522): Re-enable vsx implementation. @@ -131,7 +131,7 @@ () specialize qw/vpx_dc_128_predictor_8x8 neon msa sse2/; add_proto qw/void vpx_d207_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_d207_predictor_16x16 ssse3/; +specialize qw/vpx_d207_predictor_16x16 neon ssse3/; add_proto qw/void vpx_d45_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d45_predictor_16x16 neon ssse3 vsx/; @@ -170,7 +170,7 @@ () specialize qw/vpx_dc_128_predictor_16x16 neon msa sse2 vsx/; add_proto qw/void vpx_d207_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_d207_predictor_32x32 ssse3/; +specialize qw/vpx_d207_predictor_32x32 neon ssse3/; add_proto qw/void vpx_d45_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d45_predictor_32x32 neon ssse3 vsx/; From 33f3ae34144ea42bbf97d812ef23dccfc4bb8662 Mon Sep 17 00:00:00 2001 From: George Steed Date: Mon, 6 Mar 2023 13:24:47 +0000 Subject: [PATCH 0652/1000] Fix potential buffer over-read in highbd d117 predictor Neon The load of `left[bs]` in the standard bitdepth d117 Neon implementation triggered an address-sanitizer failure. The highbd equivalent does not appear to trigger any asan failures when running the VP9/ExternalFrameBufferMD5Test or VP9/TestVectorTest.MD5Match tests, but for consistency with the standard bitdepth implementation we adjust it to avoid the over-read. Performance is roughly identical, with a 0.8% performance improvement on average over the previous optimised code. Change-Id: I05dc4d43f244f4915c0ccc52cc0af999bbacb018 --- vpx_dsp/arm/highbd_intrapred_neon.c | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/vpx_dsp/arm/highbd_intrapred_neon.c b/vpx_dsp/arm/highbd_intrapred_neon.c index d1e335c263..dc1b27dc10 100644 --- a/vpx_dsp/arm/highbd_intrapred_neon.c +++ b/vpx_dsp/arm/highbd_intrapred_neon.c @@ -465,7 +465,11 @@ void vpx_highbd_d117_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, l0az = vext_u16(vld1_dup_u16(left), az, 3); l0 = vld1_u16(left + 0); - l1 = vld1_u16(left + 1); + // The last lane here is unused, reading left[4] could cause a buffer + // over-read, so just fill with a duplicate of left[0] to avoid needing to + // materialize a zero: + // [ left[1], left[2], left[3], x ] + l1 = vext_u16(l0, l0, 1); // [ above[-1], left[0], left[1], left[2] ] azl0 = vext_u16(vld1_dup_u16(above - 1), l0, 3); @@ -494,7 +498,11 @@ void vpx_highbd_d117_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, l0az = vextq_u16(vld1q_dup_u16(left), az, 7); l0 = vld1q_u16(left + 0); - l1 = vld1q_u16(left + 1); + // The last lane here is unused, reading left[8] could cause a buffer + // over-read, so just fill with a duplicate of left[0] to avoid needing to + // materialize a zero: + // [ left[1], ... , left[7], x ] + l1 = vextq_u16(l0, l0, 1); // [ above[-1], left[0], ..., left[6] ] azl0 = vextq_u16(vld1q_dup_u16(above - 1), l0, 7); @@ -565,7 +573,11 @@ void vpx_highbd_d117_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, l1 = vld1q_u16(left + 1); l7 = vld1q_u16(left + 7); l8 = vld1q_u16(left + 8); - l9 = vld1q_u16(left + 9); + // The last lane here is unused, reading left[16] could cause a buffer + // over-read, so just fill with a duplicate of left[8] to avoid needing to + // materialize a zero: + // [ left[9], ... , left[15], x ] + l9 = vextq_u16(l8, l8, 1); // [ above[-1], left[0], ..., left[6] ] azl0 = vextq_u16(vld1q_dup_u16(above - 1), l0, 7); @@ -658,6 +670,11 @@ void vpx_highbd_d117_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, l23 = vld1q_u16(left + 23); l24 = vld1q_u16(left + 24); l25 = vld1q_u16(left + 25); + // The last lane here is unused, reading left[32] could cause a buffer + // over-read, so just fill with a duplicate of left[24] to avoid needing to + // materialize a zero: + // [ left[25], ... , left[31], x ] + l25 = vextq_u16(l24, l24, 1); // [ above[-1], left[0], ..., left[6] ] azl0 = vextq_u16(vld1q_dup_u16(above - 1), l0, 7); From cf85ae9a49e5c0dfa71322275e99ac7accf0acaf Mon Sep 17 00:00:00 2001 From: George Steed Date: Wed, 22 Feb 2023 15:33:37 +0000 Subject: [PATCH 0653/1000] Implement highbd_d153_predictor using Neon Add Neon implementations of the highbd d153 predictor for 4x4, 8x8, 16x16 and 32x32 block sizes. Also update tests to add new corresponding cases. Speedups over the C code (higher is better): Microarch. | Compiler | Block | Speedup Neoverse N1 | LLVM 15 | 4x4 | 1.71 Neoverse N1 | LLVM 15 | 8x8 | 4.05 Neoverse N1 | LLVM 15 | 16x16 | 7.04 Neoverse N1 | LLVM 15 | 32x32 | 7.71 Neoverse N1 | GCC 12 | 4x4 | 1.84 Neoverse N1 | GCC 12 | 8x8 | 4.19 Neoverse N1 | GCC 12 | 16x16 | 6.07 Neoverse N1 | GCC 12 | 32x32 | 3.14 Neoverse V1 | LLVM 15 | 4x4 | 3.19 Neoverse V1 | LLVM 15 | 8x8 | 5.51 Neoverse V1 | LLVM 15 | 16x16 | 7.73 Neoverse V1 | LLVM 15 | 32x32 | 7.72 Neoverse V1 | GCC 12 | 4x4 | 3.97 Neoverse V1 | GCC 12 | 8x8 | 5.52 Neoverse V1 | GCC 12 | 16x16 | 6.31 Neoverse V1 | GCC 12 | 32x32 | 5.36 Change-Id: I2bce6f1921d76d1c10d163e0cd4f395b40799184 --- test/test_intra_pred_speed.cc | 16 +- test/vp9_intrapred_test.cc | 24 ++ vpx_dsp/arm/highbd_intrapred_neon.c | 400 ++++++++++++++++++++++++++++ vpx_dsp/vpx_dsp_rtcd_defs.pl | 8 +- 4 files changed, 436 insertions(+), 12 deletions(-) diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc index 5792161b34..e334027ddc 100644 --- a/test/test_intra_pred_speed.cc +++ b/test/test_intra_pred_speed.cc @@ -566,16 +566,16 @@ HIGHBD_INTRA_PRED_TEST( vpx_highbd_dc_128_predictor_4x4_neon, vpx_highbd_v_predictor_4x4_neon, vpx_highbd_h_predictor_4x4_neon, vpx_highbd_d45_predictor_4x4_neon, vpx_highbd_d135_predictor_4x4_neon, vpx_highbd_d117_predictor_4x4_neon, - nullptr, nullptr, vpx_highbd_d63_predictor_4x4_neon, - vpx_highbd_tm_predictor_4x4_neon) + vpx_highbd_d153_predictor_4x4_neon, nullptr, + vpx_highbd_d63_predictor_4x4_neon, vpx_highbd_tm_predictor_4x4_neon) HIGHBD_INTRA_PRED_TEST( NEON, TestHighbdIntraPred8, vpx_highbd_dc_predictor_8x8_neon, vpx_highbd_dc_left_predictor_8x8_neon, vpx_highbd_dc_top_predictor_8x8_neon, vpx_highbd_dc_128_predictor_8x8_neon, vpx_highbd_v_predictor_8x8_neon, vpx_highbd_h_predictor_8x8_neon, vpx_highbd_d45_predictor_8x8_neon, vpx_highbd_d135_predictor_8x8_neon, vpx_highbd_d117_predictor_8x8_neon, - nullptr, nullptr, vpx_highbd_d63_predictor_8x8_neon, - vpx_highbd_tm_predictor_8x8_neon) + vpx_highbd_d153_predictor_8x8_neon, nullptr, + vpx_highbd_d63_predictor_8x8_neon, vpx_highbd_tm_predictor_8x8_neon) HIGHBD_INTRA_PRED_TEST( NEON, TestHighbdIntraPred16, vpx_highbd_dc_predictor_16x16_neon, vpx_highbd_dc_left_predictor_16x16_neon, @@ -583,8 +583,8 @@ HIGHBD_INTRA_PRED_TEST( vpx_highbd_dc_128_predictor_16x16_neon, vpx_highbd_v_predictor_16x16_neon, vpx_highbd_h_predictor_16x16_neon, vpx_highbd_d45_predictor_16x16_neon, vpx_highbd_d135_predictor_16x16_neon, vpx_highbd_d117_predictor_16x16_neon, - nullptr, nullptr, vpx_highbd_d63_predictor_16x16_neon, - vpx_highbd_tm_predictor_16x16_neon) + vpx_highbd_d153_predictor_16x16_neon, nullptr, + vpx_highbd_d63_predictor_16x16_neon, vpx_highbd_tm_predictor_16x16_neon) HIGHBD_INTRA_PRED_TEST( NEON, TestHighbdIntraPred32, vpx_highbd_dc_predictor_32x32_neon, vpx_highbd_dc_left_predictor_32x32_neon, @@ -592,8 +592,8 @@ HIGHBD_INTRA_PRED_TEST( vpx_highbd_dc_128_predictor_32x32_neon, vpx_highbd_v_predictor_32x32_neon, vpx_highbd_h_predictor_32x32_neon, vpx_highbd_d45_predictor_32x32_neon, vpx_highbd_d135_predictor_32x32_neon, vpx_highbd_d117_predictor_32x32_neon, - nullptr, nullptr, vpx_highbd_d63_predictor_32x32_neon, - vpx_highbd_tm_predictor_32x32_neon) + vpx_highbd_d153_predictor_32x32_neon, nullptr, + vpx_highbd_d63_predictor_32x32_neon, vpx_highbd_tm_predictor_32x32_neon) #endif // HAVE_NEON #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/test/vp9_intrapred_test.cc b/test/vp9_intrapred_test.cc index 8696c0a787..d8ccd2db69 100644 --- a/test/vp9_intrapred_test.cc +++ b/test/vp9_intrapred_test.cc @@ -888,6 +888,14 @@ INSTANTIATE_TEST_SUITE_P( &vpx_highbd_d135_predictor_16x16_c, 16, 8), HighbdIntraPredParam(&vpx_highbd_d135_predictor_32x32_neon, &vpx_highbd_d135_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_4x4_neon, + &vpx_highbd_d153_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_8x8_neon, + &vpx_highbd_d153_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_16x16_neon, + &vpx_highbd_d153_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_32x32_neon, + &vpx_highbd_d153_predictor_32x32_c, 32, 8), HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_4x4_neon, &vpx_highbd_dc_128_predictor_4x4_c, 4, 8), HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_8x8_neon, @@ -980,6 +988,14 @@ INSTANTIATE_TEST_SUITE_P( &vpx_highbd_d135_predictor_16x16_c, 16, 10), HighbdIntraPredParam(&vpx_highbd_d135_predictor_32x32_neon, &vpx_highbd_d135_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_4x4_neon, + &vpx_highbd_d153_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_8x8_neon, + &vpx_highbd_d153_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_16x16_neon, + &vpx_highbd_d153_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_32x32_neon, + &vpx_highbd_d153_predictor_32x32_c, 32, 10), HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_4x4_neon, &vpx_highbd_dc_128_predictor_4x4_c, 4, 10), HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_8x8_neon, @@ -1072,6 +1088,14 @@ INSTANTIATE_TEST_SUITE_P( &vpx_highbd_d135_predictor_16x16_c, 16, 12), HighbdIntraPredParam(&vpx_highbd_d135_predictor_32x32_neon, &vpx_highbd_d135_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_4x4_neon, + &vpx_highbd_d153_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_8x8_neon, + &vpx_highbd_d153_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_16x16_neon, + &vpx_highbd_d153_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_32x32_neon, + &vpx_highbd_d153_predictor_32x32_c, 32, 12), HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_4x4_neon, &vpx_highbd_dc_128_predictor_4x4_c, 4, 12), HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_8x8_neon, diff --git a/vpx_dsp/arm/highbd_intrapred_neon.c b/vpx_dsp/arm/highbd_intrapred_neon.c index 6b6ad95c12..4faecb575c 100644 --- a/vpx_dsp/arm/highbd_intrapred_neon.c +++ b/vpx_dsp/arm/highbd_intrapred_neon.c @@ -1178,6 +1178,406 @@ void vpx_highbd_d117_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, // ----------------------------------------------------------------------------- +void vpx_highbd_d153_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + // See vpx_highbd_d153_predictor_8x8_neon for details on the implementation. + uint16x4_t az, a0, l0az, l0, l1, azl0, d0, d1, d2, d20_lo, d20_hi; + (void)bd; + + az = vld1_u16(above - 1); + a0 = vld1_u16(above + 0); + // [ left[0], above[-1], above[0], above[1] ] + l0az = vext_u16(vld1_dup_u16(left), az, 3); + + l0 = vld1_u16(left); + // The last lane here is unused, reading left[4] could cause a buffer + // over-read, so just fill with a duplicate of left[0] to avoid needing to + // materialize a zero: + // [ left[1], left[2], left[3], x ] + l1 = vext_u16(l0, l0, 1); + // [ above[-1], left[0], left[1], left[2] ] + azl0 = vext_u16(vld1_dup_u16(above - 1), l0, 3); + + d0 = vrhadd_u16(azl0, l0); + d1 = vrhadd_u16(vhadd_u16(l0az, a0), az); + d2 = vrhadd_u16(vhadd_u16(azl0, l1), l0); + + d20_lo = vzip_u16(vrev64_u16(d2), vrev64_u16(d0)).val[0]; + d20_hi = vzip_u16(vrev64_u16(d2), vrev64_u16(d0)).val[1]; + + // Incrementally shift more elements from d0/d2 reversed into d1: + // stride=0 [ d0[0], d1[0], d1[1], d1[2] ] + // stride=1 [ d0[1], d2[0], d0[0], d1[0] ] + // stride=2 [ d0[2], d2[1], d0[1], d2[0] ] + // stride=3 [ d0[3], d2[2], d0[2], d2[1] ] + vst1_u16(dst + 0 * stride, vext_u16(d20_hi, d1, 3)); + vst1_u16(dst + 1 * stride, vext_u16(d20_hi, d1, 1)); + vst1_u16(dst + 2 * stride, vext_u16(d20_lo, d20_hi, 3)); + vst1_u16(dst + 3 * stride, vext_u16(d20_lo, d20_hi, 1)); +} + +void vpx_highbd_d153_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + uint16x8_t az, a0, l0az, l0, l1, azl0, d0, d1, d2, d0_rev, d2_rev, d20_lo, + d20_hi; + (void)bd; + + az = vld1q_u16(above - 1); + a0 = vld1q_u16(above + 0); + // [ left[0], above[-1], ... , above[5] ] + l0az = vextq_u16(vld1q_dup_u16(left), az, 7); + + l0 = vld1q_u16(left); + // The last lane here is unused, reading left[8] could cause a buffer + // over-read, so just fill with a duplicate of left[0] to avoid needing to + // materialize a zero: + // [ left[1], ... , left[7], x ] + l1 = vextq_u16(l0, l0, 1); + // [ above[-1], left[0], ... , left[6] ] + azl0 = vextq_u16(vld1q_dup_u16(above - 1), l0, 7); + + // d0[0] = AVG2(above[-1], left[0]) + // d0[1] = AVG2(left[0], left[1]) + // ... + // d0[7] = AVG2(left[6], left[7]) + d0 = vrhaddq_u16(azl0, l0); + + // d1[0] = AVG3(left[0], above[-1], above[0]) + // d1[1] = AVG3(above[-1], above[0], above[1]) + // ... + // d1[7] = AVG3(above[5], above[6], above[7]) + d1 = vrhaddq_u16(vhaddq_u16(l0az, a0), az); + + // d2[0] = AVG3(above[-1], left[0], left[1]) + // d2[1] = AVG3(left[0], left[1], left[2]) + // ... + // d2[7] = AVG3(left[6], left[7], left[8]) + d2 = vrhaddq_u16(vhaddq_u16(azl0, l1), l0); + + // The ext instruction shifts elements in from the end of the vector rather + // than the start, so reverse the vectors to put the elements to be shifted + // in at the end: + d0_rev = vrev64q_u16(vextq_u16(d0, d0, 4)); + d2_rev = vrev64q_u16(vextq_u16(d2, d2, 4)); + + d20_lo = vzipq_u16(d2_rev, d0_rev).val[0]; + d20_hi = vzipq_u16(d2_rev, d0_rev).val[1]; + + // Incrementally shift more elements from d0/d2 reversed into d1: + // stride=0 [ d0[0], d1[0], d1[1], d1[2], d1[3], d1[4], d1[5], d1[6] ] + // stride=1 [ d0[1], d2[0], d0[0], d1[0], d1[1], d1[2], d1[3], d1[4] ] + // stride=2 [ d0[2], d2[1], d0[1], d2[0], d0[0], d1[0], d1[1], d1[2] ] + // stride=3 [ d0[3], d2[2], d0[2], d2[1], d0[1], d2[0], d0[0], d1[0] ] + // stride=4 [ d0[4], d2[3], d0[3], d2[2], d0[2], d2[1], d0[1], d2[0] ] + // stride=5 [ d0[5], d2[4], d0[4], d2[3], d0[3], d2[2], d0[2], d2[1] ] + // stride=6 [ d0[6], d2[5], d0[5], d2[4], d0[4], d2[3], d0[3], d2[2] ] + // stride=7 [ d0[7], d2[6], d0[6], d2[5], d0[5], d2[4], d0[4], d2[3] ] + vst1q_u16(dst + 0 * stride, vextq_u16(d20_hi, d1, 7)); + vst1q_u16(dst + 1 * stride, vextq_u16(d20_hi, d1, 5)); + vst1q_u16(dst + 2 * stride, vextq_u16(d20_hi, d1, 3)); + vst1q_u16(dst + 3 * stride, vextq_u16(d20_hi, d1, 1)); + vst1q_u16(dst + 4 * stride, vextq_u16(d20_lo, d20_hi, 7)); + vst1q_u16(dst + 5 * stride, vextq_u16(d20_lo, d20_hi, 5)); + vst1q_u16(dst + 6 * stride, vextq_u16(d20_lo, d20_hi, 3)); + vst1q_u16(dst + 7 * stride, vextq_u16(d20_lo, d20_hi, 1)); +} + +void vpx_highbd_d153_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + // See vpx_highbd_d153_predictor_8x8_neon for details on the implementation. + uint16x8_t az, a0, a6, a7, a8, l0az, l0, l1, l7, l8, l9, azl0, d0[2], d1[2], + d2[2], d20[4]; + (void)bd; + + az = vld1q_u16(above - 1); + a0 = vld1q_u16(above + 0); + a6 = vld1q_u16(above + 6); + a7 = vld1q_u16(above + 7); + a8 = vld1q_u16(above + 8); + // [ left[0], above[-1], ... , above[13] ] + l0az = vextq_u16(vld1q_dup_u16(left), az, 7); + + l0 = vld1q_u16(left + 0); + l1 = vld1q_u16(left + 1); + l7 = vld1q_u16(left + 7); + l8 = vld1q_u16(left + 8); + // The last lane here is unused, reading left[16] could cause a buffer + // over-read, so just fill with a duplicate of left[8] to avoid needing to + // materialize a zero: + // [ left[9], ... , left[15], x ] + l9 = vextq_u16(l8, l8, 1); + // [ above[-1], left[0], ... , left[14] ] + azl0 = vextq_u16(vld1q_dup_u16(above - 1), l0, 7); + + d0[0] = vrhaddq_u16(azl0, l0); + d0[1] = vrhaddq_u16(l7, l8); + d1[0] = vrhaddq_u16(vhaddq_u16(l0az, a0), az); + d1[1] = vrhaddq_u16(vhaddq_u16(a6, a8), a7); + d2[0] = vrhaddq_u16(vhaddq_u16(azl0, l1), l0); + d2[1] = vrhaddq_u16(vhaddq_u16(l7, l9), l8); + + d0[0] = vrev64q_u16(vextq_u16(d0[0], d0[0], 4)); + d0[1] = vrev64q_u16(vextq_u16(d0[1], d0[1], 4)); + d2[0] = vrev64q_u16(vextq_u16(d2[0], d2[0], 4)); + d2[1] = vrev64q_u16(vextq_u16(d2[1], d2[1], 4)); + + d20[0] = vzipq_u16(d2[1], d0[1]).val[0]; + d20[1] = vzipq_u16(d2[1], d0[1]).val[1]; + d20[2] = vzipq_u16(d2[0], d0[0]).val[0]; + d20[3] = vzipq_u16(d2[0], d0[0]).val[1]; + + vst1q_u16(dst + 0 * stride + 0, vextq_u16(d20[3], d1[0], 7)); + vst1q_u16(dst + 0 * stride + 8, vextq_u16(d1[0], d1[1], 7)); + vst1q_u16(dst + 1 * stride + 0, vextq_u16(d20[3], d1[0], 5)); + vst1q_u16(dst + 1 * stride + 8, vextq_u16(d1[0], d1[1], 5)); + vst1q_u16(dst + 2 * stride + 0, vextq_u16(d20[3], d1[0], 3)); + vst1q_u16(dst + 2 * stride + 8, vextq_u16(d1[0], d1[1], 3)); + vst1q_u16(dst + 3 * stride + 0, vextq_u16(d20[3], d1[0], 1)); + vst1q_u16(dst + 3 * stride + 8, vextq_u16(d1[0], d1[1], 1)); + + vst1q_u16(dst + 4 * stride + 0, vextq_u16(d20[2], d20[3], 7)); + vst1q_u16(dst + 4 * stride + 8, vextq_u16(d20[3], d1[0], 7)); + vst1q_u16(dst + 5 * stride + 0, vextq_u16(d20[2], d20[3], 5)); + vst1q_u16(dst + 5 * stride + 8, vextq_u16(d20[3], d1[0], 5)); + vst1q_u16(dst + 6 * stride + 0, vextq_u16(d20[2], d20[3], 3)); + vst1q_u16(dst + 6 * stride + 8, vextq_u16(d20[3], d1[0], 3)); + vst1q_u16(dst + 7 * stride + 0, vextq_u16(d20[2], d20[3], 1)); + vst1q_u16(dst + 7 * stride + 8, vextq_u16(d20[3], d1[0], 1)); + + vst1q_u16(dst + 8 * stride + 0, vextq_u16(d20[1], d20[2], 7)); + vst1q_u16(dst + 8 * stride + 8, vextq_u16(d20[2], d20[3], 7)); + vst1q_u16(dst + 9 * stride + 0, vextq_u16(d20[1], d20[2], 5)); + vst1q_u16(dst + 9 * stride + 8, vextq_u16(d20[2], d20[3], 5)); + vst1q_u16(dst + 10 * stride + 0, vextq_u16(d20[1], d20[2], 3)); + vst1q_u16(dst + 10 * stride + 8, vextq_u16(d20[2], d20[3], 3)); + vst1q_u16(dst + 11 * stride + 0, vextq_u16(d20[1], d20[2], 1)); + vst1q_u16(dst + 11 * stride + 8, vextq_u16(d20[2], d20[3], 1)); + + vst1q_u16(dst + 12 * stride + 0, vextq_u16(d20[0], d20[1], 7)); + vst1q_u16(dst + 12 * stride + 8, vextq_u16(d20[1], d20[2], 7)); + vst1q_u16(dst + 13 * stride + 0, vextq_u16(d20[0], d20[1], 5)); + vst1q_u16(dst + 13 * stride + 8, vextq_u16(d20[1], d20[2], 5)); + vst1q_u16(dst + 14 * stride + 0, vextq_u16(d20[0], d20[1], 3)); + vst1q_u16(dst + 14 * stride + 8, vextq_u16(d20[1], d20[2], 3)); + vst1q_u16(dst + 15 * stride + 0, vextq_u16(d20[0], d20[1], 1)); + vst1q_u16(dst + 15 * stride + 8, vextq_u16(d20[1], d20[2], 1)); +} + +void vpx_highbd_d153_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + // See vpx_highbd_d153_predictor_8x8_neon for details on the implementation. + uint16x8_t az, a0, a6, a7, a8, a14, a15, a16, a22, a23, a24, l0az, l0, l1, l7, + l8, l9, l15, l16, l17, l23, l24, l25, azl0, d0[4], d1[4], d2[4], d20[8]; + (void)bd; + + az = vld1q_u16(above - 1); + a0 = vld1q_u16(above + 0); + a6 = vld1q_u16(above + 6); + a7 = vld1q_u16(above + 7); + a8 = vld1q_u16(above + 8); + a14 = vld1q_u16(above + 14); + a15 = vld1q_u16(above + 15); + a16 = vld1q_u16(above + 16); + a22 = vld1q_u16(above + 22); + a23 = vld1q_u16(above + 23); + a24 = vld1q_u16(above + 24); + // [ left[0], above[-1], ... , above[13] ] + l0az = vextq_u16(vld1q_dup_u16(left), az, 7); + + l0 = vld1q_u16(left + 0); + l1 = vld1q_u16(left + 1); + l7 = vld1q_u16(left + 7); + l8 = vld1q_u16(left + 8); + l9 = vld1q_u16(left + 9); + l15 = vld1q_u16(left + 15); + l16 = vld1q_u16(left + 16); + l17 = vld1q_u16(left + 17); + l23 = vld1q_u16(left + 23); + l24 = vld1q_u16(left + 24); + // The last lane here is unused, reading left[32] could cause a buffer + // over-read, so just fill with a duplicate of left[24] to avoid needing to + // materialize a zero: + // [ left[25], ... , left[31], x ] + l25 = vextq_u16(l24, l24, 1); + // [ above[-1], left[0], ... , left[14] ] + azl0 = vextq_u16(vld1q_dup_u16(above - 1), l0, 7); + + d0[0] = vrhaddq_u16(azl0, l0); + d0[1] = vrhaddq_u16(l7, l8); + d0[2] = vrhaddq_u16(l15, l16); + d0[3] = vrhaddq_u16(l23, l24); + + d1[0] = vrhaddq_u16(vhaddq_u16(l0az, a0), az); + d1[1] = vrhaddq_u16(vhaddq_u16(a6, a8), a7); + d1[2] = vrhaddq_u16(vhaddq_u16(a14, a16), a15); + d1[3] = vrhaddq_u16(vhaddq_u16(a22, a24), a23); + + d2[0] = vrhaddq_u16(vhaddq_u16(azl0, l1), l0); + d2[1] = vrhaddq_u16(vhaddq_u16(l7, l9), l8); + d2[2] = vrhaddq_u16(vhaddq_u16(l15, l17), l16); + d2[3] = vrhaddq_u16(vhaddq_u16(l23, l25), l24); + + d0[0] = vrev64q_u16(vextq_u16(d0[0], d0[0], 4)); + d0[1] = vrev64q_u16(vextq_u16(d0[1], d0[1], 4)); + d0[2] = vrev64q_u16(vextq_u16(d0[2], d0[2], 4)); + d0[3] = vrev64q_u16(vextq_u16(d0[3], d0[3], 4)); + d2[0] = vrev64q_u16(vextq_u16(d2[0], d2[0], 4)); + d2[1] = vrev64q_u16(vextq_u16(d2[1], d2[1], 4)); + d2[2] = vrev64q_u16(vextq_u16(d2[2], d2[2], 4)); + d2[3] = vrev64q_u16(vextq_u16(d2[3], d2[3], 4)); + + d20[0] = vzipq_u16(d2[3], d0[3]).val[0]; + d20[1] = vzipq_u16(d2[3], d0[3]).val[1]; + d20[2] = vzipq_u16(d2[2], d0[2]).val[0]; + d20[3] = vzipq_u16(d2[2], d0[2]).val[1]; + d20[4] = vzipq_u16(d2[1], d0[1]).val[0]; + d20[5] = vzipq_u16(d2[1], d0[1]).val[1]; + d20[6] = vzipq_u16(d2[0], d0[0]).val[0]; + d20[7] = vzipq_u16(d2[0], d0[0]).val[1]; + + vst1q_u16(dst + 0 * stride + 0, vextq_u16(d20[7], d1[0], 7)); + vst1q_u16(dst + 0 * stride + 8, vextq_u16(d1[0], d1[1], 7)); + vst1q_u16(dst + 0 * stride + 16, vextq_u16(d1[1], d1[2], 7)); + vst1q_u16(dst + 0 * stride + 24, vextq_u16(d1[2], d1[3], 7)); + vst1q_u16(dst + 1 * stride + 0, vextq_u16(d20[7], d1[0], 5)); + vst1q_u16(dst + 1 * stride + 8, vextq_u16(d1[0], d1[1], 5)); + vst1q_u16(dst + 1 * stride + 16, vextq_u16(d1[1], d1[2], 5)); + vst1q_u16(dst + 1 * stride + 24, vextq_u16(d1[2], d1[3], 5)); + vst1q_u16(dst + 2 * stride + 0, vextq_u16(d20[7], d1[0], 3)); + vst1q_u16(dst + 2 * stride + 8, vextq_u16(d1[0], d1[1], 3)); + vst1q_u16(dst + 2 * stride + 16, vextq_u16(d1[1], d1[2], 3)); + vst1q_u16(dst + 2 * stride + 24, vextq_u16(d1[2], d1[3], 3)); + vst1q_u16(dst + 3 * stride + 0, vextq_u16(d20[7], d1[0], 1)); + vst1q_u16(dst + 3 * stride + 8, vextq_u16(d1[0], d1[1], 1)); + vst1q_u16(dst + 3 * stride + 16, vextq_u16(d1[1], d1[2], 1)); + vst1q_u16(dst + 3 * stride + 24, vextq_u16(d1[2], d1[3], 1)); + + vst1q_u16(dst + 4 * stride + 0, vextq_u16(d20[6], d20[7], 7)); + vst1q_u16(dst + 4 * stride + 8, vextq_u16(d20[7], d1[0], 7)); + vst1q_u16(dst + 4 * stride + 16, vextq_u16(d1[0], d1[1], 7)); + vst1q_u16(dst + 4 * stride + 24, vextq_u16(d1[1], d1[2], 7)); + vst1q_u16(dst + 5 * stride + 0, vextq_u16(d20[6], d20[7], 5)); + vst1q_u16(dst + 5 * stride + 8, vextq_u16(d20[7], d1[0], 5)); + vst1q_u16(dst + 5 * stride + 16, vextq_u16(d1[0], d1[1], 5)); + vst1q_u16(dst + 5 * stride + 24, vextq_u16(d1[1], d1[2], 5)); + vst1q_u16(dst + 6 * stride + 0, vextq_u16(d20[6], d20[7], 3)); + vst1q_u16(dst + 6 * stride + 8, vextq_u16(d20[7], d1[0], 3)); + vst1q_u16(dst + 6 * stride + 16, vextq_u16(d1[0], d1[1], 3)); + vst1q_u16(dst + 6 * stride + 24, vextq_u16(d1[1], d1[2], 3)); + vst1q_u16(dst + 7 * stride + 0, vextq_u16(d20[6], d20[7], 1)); + vst1q_u16(dst + 7 * stride + 8, vextq_u16(d20[7], d1[0], 1)); + vst1q_u16(dst + 7 * stride + 16, vextq_u16(d1[0], d1[1], 1)); + vst1q_u16(dst + 7 * stride + 24, vextq_u16(d1[1], d1[2], 1)); + + vst1q_u16(dst + 8 * stride + 0, vextq_u16(d20[5], d20[6], 7)); + vst1q_u16(dst + 8 * stride + 8, vextq_u16(d20[6], d20[7], 7)); + vst1q_u16(dst + 8 * stride + 16, vextq_u16(d20[7], d1[0], 7)); + vst1q_u16(dst + 8 * stride + 24, vextq_u16(d1[0], d1[1], 7)); + vst1q_u16(dst + 9 * stride + 0, vextq_u16(d20[5], d20[6], 5)); + vst1q_u16(dst + 9 * stride + 8, vextq_u16(d20[6], d20[7], 5)); + vst1q_u16(dst + 9 * stride + 16, vextq_u16(d20[7], d1[0], 5)); + vst1q_u16(dst + 9 * stride + 24, vextq_u16(d1[0], d1[1], 5)); + vst1q_u16(dst + 10 * stride + 0, vextq_u16(d20[5], d20[6], 3)); + vst1q_u16(dst + 10 * stride + 8, vextq_u16(d20[6], d20[7], 3)); + vst1q_u16(dst + 10 * stride + 16, vextq_u16(d20[7], d1[0], 3)); + vst1q_u16(dst + 10 * stride + 24, vextq_u16(d1[0], d1[1], 3)); + vst1q_u16(dst + 11 * stride + 0, vextq_u16(d20[5], d20[6], 1)); + vst1q_u16(dst + 11 * stride + 8, vextq_u16(d20[6], d20[7], 1)); + vst1q_u16(dst + 11 * stride + 16, vextq_u16(d20[7], d1[0], 1)); + vst1q_u16(dst + 11 * stride + 24, vextq_u16(d1[0], d1[1], 1)); + + vst1q_u16(dst + 12 * stride + 0, vextq_u16(d20[4], d20[5], 7)); + vst1q_u16(dst + 12 * stride + 8, vextq_u16(d20[5], d20[6], 7)); + vst1q_u16(dst + 12 * stride + 16, vextq_u16(d20[6], d20[7], 7)); + vst1q_u16(dst + 12 * stride + 24, vextq_u16(d20[7], d1[0], 7)); + vst1q_u16(dst + 13 * stride + 0, vextq_u16(d20[4], d20[5], 5)); + vst1q_u16(dst + 13 * stride + 8, vextq_u16(d20[5], d20[6], 5)); + vst1q_u16(dst + 13 * stride + 16, vextq_u16(d20[6], d20[7], 5)); + vst1q_u16(dst + 13 * stride + 24, vextq_u16(d20[7], d1[0], 5)); + vst1q_u16(dst + 14 * stride + 0, vextq_u16(d20[4], d20[5], 3)); + vst1q_u16(dst + 14 * stride + 8, vextq_u16(d20[5], d20[6], 3)); + vst1q_u16(dst + 14 * stride + 16, vextq_u16(d20[6], d20[7], 3)); + vst1q_u16(dst + 14 * stride + 24, vextq_u16(d20[7], d1[0], 3)); + vst1q_u16(dst + 15 * stride + 0, vextq_u16(d20[4], d20[5], 1)); + vst1q_u16(dst + 15 * stride + 8, vextq_u16(d20[5], d20[6], 1)); + vst1q_u16(dst + 15 * stride + 16, vextq_u16(d20[6], d20[7], 1)); + vst1q_u16(dst + 15 * stride + 24, vextq_u16(d20[7], d1[0], 1)); + + vst1q_u16(dst + 16 * stride + 0, vextq_u16(d20[3], d20[4], 7)); + vst1q_u16(dst + 16 * stride + 8, vextq_u16(d20[4], d20[5], 7)); + vst1q_u16(dst + 16 * stride + 16, vextq_u16(d20[5], d20[6], 7)); + vst1q_u16(dst + 16 * stride + 24, vextq_u16(d20[6], d20[7], 7)); + vst1q_u16(dst + 17 * stride + 0, vextq_u16(d20[3], d20[4], 5)); + vst1q_u16(dst + 17 * stride + 8, vextq_u16(d20[4], d20[5], 5)); + vst1q_u16(dst + 17 * stride + 16, vextq_u16(d20[5], d20[6], 5)); + vst1q_u16(dst + 17 * stride + 24, vextq_u16(d20[6], d20[7], 5)); + vst1q_u16(dst + 18 * stride + 0, vextq_u16(d20[3], d20[4], 3)); + vst1q_u16(dst + 18 * stride + 8, vextq_u16(d20[4], d20[5], 3)); + vst1q_u16(dst + 18 * stride + 16, vextq_u16(d20[5], d20[6], 3)); + vst1q_u16(dst + 18 * stride + 24, vextq_u16(d20[6], d20[7], 3)); + vst1q_u16(dst + 19 * stride + 0, vextq_u16(d20[3], d20[4], 1)); + vst1q_u16(dst + 19 * stride + 8, vextq_u16(d20[4], d20[5], 1)); + vst1q_u16(dst + 19 * stride + 16, vextq_u16(d20[5], d20[6], 1)); + vst1q_u16(dst + 19 * stride + 24, vextq_u16(d20[6], d20[7], 1)); + + vst1q_u16(dst + 20 * stride + 0, vextq_u16(d20[2], d20[3], 7)); + vst1q_u16(dst + 20 * stride + 8, vextq_u16(d20[3], d20[4], 7)); + vst1q_u16(dst + 20 * stride + 16, vextq_u16(d20[4], d20[5], 7)); + vst1q_u16(dst + 20 * stride + 24, vextq_u16(d20[5], d20[6], 7)); + vst1q_u16(dst + 21 * stride + 0, vextq_u16(d20[2], d20[3], 5)); + vst1q_u16(dst + 21 * stride + 8, vextq_u16(d20[3], d20[4], 5)); + vst1q_u16(dst + 21 * stride + 16, vextq_u16(d20[4], d20[5], 5)); + vst1q_u16(dst + 21 * stride + 24, vextq_u16(d20[5], d20[6], 5)); + vst1q_u16(dst + 22 * stride + 0, vextq_u16(d20[2], d20[3], 3)); + vst1q_u16(dst + 22 * stride + 8, vextq_u16(d20[3], d20[4], 3)); + vst1q_u16(dst + 22 * stride + 16, vextq_u16(d20[4], d20[5], 3)); + vst1q_u16(dst + 22 * stride + 24, vextq_u16(d20[5], d20[6], 3)); + vst1q_u16(dst + 23 * stride + 0, vextq_u16(d20[2], d20[3], 1)); + vst1q_u16(dst + 23 * stride + 8, vextq_u16(d20[3], d20[4], 1)); + vst1q_u16(dst + 23 * stride + 16, vextq_u16(d20[4], d20[5], 1)); + vst1q_u16(dst + 23 * stride + 24, vextq_u16(d20[5], d20[6], 1)); + + vst1q_u16(dst + 24 * stride + 0, vextq_u16(d20[1], d20[2], 7)); + vst1q_u16(dst + 24 * stride + 8, vextq_u16(d20[2], d20[3], 7)); + vst1q_u16(dst + 24 * stride + 16, vextq_u16(d20[3], d20[4], 7)); + vst1q_u16(dst + 24 * stride + 24, vextq_u16(d20[4], d20[5], 7)); + vst1q_u16(dst + 25 * stride + 0, vextq_u16(d20[1], d20[2], 5)); + vst1q_u16(dst + 25 * stride + 8, vextq_u16(d20[2], d20[3], 5)); + vst1q_u16(dst + 25 * stride + 16, vextq_u16(d20[3], d20[4], 5)); + vst1q_u16(dst + 25 * stride + 24, vextq_u16(d20[4], d20[5], 5)); + vst1q_u16(dst + 26 * stride + 0, vextq_u16(d20[1], d20[2], 3)); + vst1q_u16(dst + 26 * stride + 8, vextq_u16(d20[2], d20[3], 3)); + vst1q_u16(dst + 26 * stride + 16, vextq_u16(d20[3], d20[4], 3)); + vst1q_u16(dst + 26 * stride + 24, vextq_u16(d20[4], d20[5], 3)); + vst1q_u16(dst + 27 * stride + 0, vextq_u16(d20[1], d20[2], 1)); + vst1q_u16(dst + 27 * stride + 8, vextq_u16(d20[2], d20[3], 1)); + vst1q_u16(dst + 27 * stride + 16, vextq_u16(d20[3], d20[4], 1)); + vst1q_u16(dst + 27 * stride + 24, vextq_u16(d20[4], d20[5], 1)); + + vst1q_u16(dst + 28 * stride + 0, vextq_u16(d20[0], d20[1], 7)); + vst1q_u16(dst + 28 * stride + 8, vextq_u16(d20[1], d20[2], 7)); + vst1q_u16(dst + 28 * stride + 16, vextq_u16(d20[2], d20[3], 7)); + vst1q_u16(dst + 28 * stride + 24, vextq_u16(d20[3], d20[4], 7)); + vst1q_u16(dst + 29 * stride + 0, vextq_u16(d20[0], d20[1], 5)); + vst1q_u16(dst + 29 * stride + 8, vextq_u16(d20[1], d20[2], 5)); + vst1q_u16(dst + 29 * stride + 16, vextq_u16(d20[2], d20[3], 5)); + vst1q_u16(dst + 29 * stride + 24, vextq_u16(d20[3], d20[4], 5)); + vst1q_u16(dst + 30 * stride + 0, vextq_u16(d20[0], d20[1], 3)); + vst1q_u16(dst + 30 * stride + 8, vextq_u16(d20[1], d20[2], 3)); + vst1q_u16(dst + 30 * stride + 16, vextq_u16(d20[2], d20[3], 3)); + vst1q_u16(dst + 30 * stride + 24, vextq_u16(d20[3], d20[4], 3)); + vst1q_u16(dst + 31 * stride + 0, vextq_u16(d20[0], d20[1], 1)); + vst1q_u16(dst + 31 * stride + 8, vextq_u16(d20[1], d20[2], 1)); + vst1q_u16(dst + 31 * stride + 16, vextq_u16(d20[2], d20[3], 1)); + vst1q_u16(dst + 31 * stride + 24, vextq_u16(d20[3], d20[4], 1)); +} + +// ----------------------------------------------------------------------------- + void vpx_highbd_d135_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 9c15912ca2..b3f655c2bd 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -229,7 +229,7 @@ () specialize qw/vpx_highbd_d135_predictor_4x4 neon sse2/; add_proto qw/void vpx_highbd_d153_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_d153_predictor_4x4 sse2/; + specialize qw/vpx_highbd_d153_predictor_4x4 neon sse2/; add_proto qw/void vpx_highbd_v_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_v_predictor_4x4 neon sse2/; @@ -268,7 +268,7 @@ () specialize qw/vpx_highbd_d135_predictor_8x8 neon ssse3/; add_proto qw/void vpx_highbd_d153_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_d153_predictor_8x8 ssse3/; + specialize qw/vpx_highbd_d153_predictor_8x8 neon ssse3/; add_proto qw/void vpx_highbd_v_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_v_predictor_8x8 neon sse2/; @@ -307,7 +307,7 @@ () specialize qw/vpx_highbd_d135_predictor_16x16 neon ssse3/; add_proto qw/void vpx_highbd_d153_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_d153_predictor_16x16 ssse3/; + specialize qw/vpx_highbd_d153_predictor_16x16 neon ssse3/; add_proto qw/void vpx_highbd_v_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_v_predictor_16x16 neon sse2/; @@ -346,7 +346,7 @@ () specialize qw/vpx_highbd_d135_predictor_32x32 neon ssse3/; add_proto qw/void vpx_highbd_d153_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_d153_predictor_32x32 ssse3/; + specialize qw/vpx_highbd_d153_predictor_32x32 neon ssse3/; add_proto qw/void vpx_highbd_v_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_v_predictor_32x32 neon sse2/; From 9e35c35945ff95d325d1617c8ebcbe80245ae651 Mon Sep 17 00:00:00 2001 From: George Steed Date: Thu, 23 Feb 2023 16:25:38 +0000 Subject: [PATCH 0654/1000] Implement highbd_d207_predictor using Neon Add Neon implementations of the highbd d207 predictor for 4x4, 8x8, 16x16 and 32x32 block sizes. Also update tests to add new corresponding cases. Speedups over the C code (higher is better): Microarch. | Compiler | Block | Speedup Neoverse N1 | LLVM 15 | 4x4 | 1.61 Neoverse N1 | LLVM 15 | 8x8 | 5.30 Neoverse N1 | LLVM 15 | 16x16 | 8.93 Neoverse N1 | LLVM 15 | 32x32 | 8.35 Neoverse N1 | GCC 12 | 4x4 | 2.16 Neoverse N1 | GCC 12 | 8x8 | 5.75 Neoverse N1 | GCC 12 | 16x16 | 7.28 Neoverse N1 | GCC 12 | 32x32 | 3.31 Neoverse V1 | LLVM 15 | 4x4 | 1.71 Neoverse V1 | LLVM 15 | 8x8 | 7.46 Neoverse V1 | LLVM 15 | 16x16 | 10.09 Neoverse V1 | LLVM 15 | 32x32 | 8.10 Neoverse V1 | GCC 12 | 4x4 | 1.99 Neoverse V1 | GCC 12 | 8x8 | 7.81 Neoverse V1 | GCC 12 | 16x16 | 8.34 Neoverse V1 | GCC 12 | 32x32 | 5.74 Change-Id: Ic021e82eed0c7bc8263eb68606411354eb5e4870 --- test/test_intra_pred_speed.cc | 8 +- test/vp9_intrapred_test.cc | 24 +++ vpx_dsp/arm/highbd_intrapred_neon.c | 305 ++++++++++++++++++++++++++++ vpx_dsp/vpx_dsp_rtcd_defs.pl | 8 +- 4 files changed, 337 insertions(+), 8 deletions(-) diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc index e334027ddc..15303816b9 100644 --- a/test/test_intra_pred_speed.cc +++ b/test/test_intra_pred_speed.cc @@ -566,7 +566,7 @@ HIGHBD_INTRA_PRED_TEST( vpx_highbd_dc_128_predictor_4x4_neon, vpx_highbd_v_predictor_4x4_neon, vpx_highbd_h_predictor_4x4_neon, vpx_highbd_d45_predictor_4x4_neon, vpx_highbd_d135_predictor_4x4_neon, vpx_highbd_d117_predictor_4x4_neon, - vpx_highbd_d153_predictor_4x4_neon, nullptr, + vpx_highbd_d153_predictor_4x4_neon, vpx_highbd_d207_predictor_4x4_neon, vpx_highbd_d63_predictor_4x4_neon, vpx_highbd_tm_predictor_4x4_neon) HIGHBD_INTRA_PRED_TEST( NEON, TestHighbdIntraPred8, vpx_highbd_dc_predictor_8x8_neon, @@ -574,7 +574,7 @@ HIGHBD_INTRA_PRED_TEST( vpx_highbd_dc_128_predictor_8x8_neon, vpx_highbd_v_predictor_8x8_neon, vpx_highbd_h_predictor_8x8_neon, vpx_highbd_d45_predictor_8x8_neon, vpx_highbd_d135_predictor_8x8_neon, vpx_highbd_d117_predictor_8x8_neon, - vpx_highbd_d153_predictor_8x8_neon, nullptr, + vpx_highbd_d153_predictor_8x8_neon, vpx_highbd_d207_predictor_8x8_neon, vpx_highbd_d63_predictor_8x8_neon, vpx_highbd_tm_predictor_8x8_neon) HIGHBD_INTRA_PRED_TEST( NEON, TestHighbdIntraPred16, vpx_highbd_dc_predictor_16x16_neon, @@ -583,7 +583,7 @@ HIGHBD_INTRA_PRED_TEST( vpx_highbd_dc_128_predictor_16x16_neon, vpx_highbd_v_predictor_16x16_neon, vpx_highbd_h_predictor_16x16_neon, vpx_highbd_d45_predictor_16x16_neon, vpx_highbd_d135_predictor_16x16_neon, vpx_highbd_d117_predictor_16x16_neon, - vpx_highbd_d153_predictor_16x16_neon, nullptr, + vpx_highbd_d153_predictor_16x16_neon, vpx_highbd_d207_predictor_16x16_neon, vpx_highbd_d63_predictor_16x16_neon, vpx_highbd_tm_predictor_16x16_neon) HIGHBD_INTRA_PRED_TEST( NEON, TestHighbdIntraPred32, vpx_highbd_dc_predictor_32x32_neon, @@ -592,7 +592,7 @@ HIGHBD_INTRA_PRED_TEST( vpx_highbd_dc_128_predictor_32x32_neon, vpx_highbd_v_predictor_32x32_neon, vpx_highbd_h_predictor_32x32_neon, vpx_highbd_d45_predictor_32x32_neon, vpx_highbd_d135_predictor_32x32_neon, vpx_highbd_d117_predictor_32x32_neon, - vpx_highbd_d153_predictor_32x32_neon, nullptr, + vpx_highbd_d153_predictor_32x32_neon, vpx_highbd_d207_predictor_32x32_neon, vpx_highbd_d63_predictor_32x32_neon, vpx_highbd_tm_predictor_32x32_neon) #endif // HAVE_NEON diff --git a/test/vp9_intrapred_test.cc b/test/vp9_intrapred_test.cc index d8ccd2db69..cec9031618 100644 --- a/test/vp9_intrapred_test.cc +++ b/test/vp9_intrapred_test.cc @@ -896,6 +896,14 @@ INSTANTIATE_TEST_SUITE_P( &vpx_highbd_d153_predictor_16x16_c, 16, 8), HighbdIntraPredParam(&vpx_highbd_d153_predictor_32x32_neon, &vpx_highbd_d153_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_4x4_neon, + &vpx_highbd_d207_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_8x8_neon, + &vpx_highbd_d207_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_16x16_neon, + &vpx_highbd_d207_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_32x32_neon, + &vpx_highbd_d207_predictor_32x32_c, 32, 8), HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_4x4_neon, &vpx_highbd_dc_128_predictor_4x4_c, 4, 8), HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_8x8_neon, @@ -996,6 +1004,14 @@ INSTANTIATE_TEST_SUITE_P( &vpx_highbd_d153_predictor_16x16_c, 16, 10), HighbdIntraPredParam(&vpx_highbd_d153_predictor_32x32_neon, &vpx_highbd_d153_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_4x4_neon, + &vpx_highbd_d207_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_8x8_neon, + &vpx_highbd_d207_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_16x16_neon, + &vpx_highbd_d207_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_32x32_neon, + &vpx_highbd_d207_predictor_32x32_c, 32, 10), HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_4x4_neon, &vpx_highbd_dc_128_predictor_4x4_c, 4, 10), HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_8x8_neon, @@ -1096,6 +1112,14 @@ INSTANTIATE_TEST_SUITE_P( &vpx_highbd_d153_predictor_16x16_c, 16, 12), HighbdIntraPredParam(&vpx_highbd_d153_predictor_32x32_neon, &vpx_highbd_d153_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_4x4_neon, + &vpx_highbd_d207_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_8x8_neon, + &vpx_highbd_d207_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_16x16_neon, + &vpx_highbd_d207_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_32x32_neon, + &vpx_highbd_d207_predictor_32x32_c, 32, 12), HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_4x4_neon, &vpx_highbd_dc_128_predictor_4x4_c, 4, 12), HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_8x8_neon, diff --git a/vpx_dsp/arm/highbd_intrapred_neon.c b/vpx_dsp/arm/highbd_intrapred_neon.c index 4faecb575c..503900915d 100644 --- a/vpx_dsp/arm/highbd_intrapred_neon.c +++ b/vpx_dsp/arm/highbd_intrapred_neon.c @@ -1821,6 +1821,311 @@ void vpx_highbd_d135_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, //------------------------------------------------------------------------------ +void vpx_highbd_d207_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + uint16x4_t l0, l1, l2, l3, c0, c1, c01_lo, c01_hi; + (void)above; + (void)bd; + + l0 = vld1_u16(left + 0); + l3 = vld1_dup_u16(left + 3); + + // [ left[1], left[2], left[3], left[3] ] + l1 = vext_u16(l0, l3, 1); + // [ left[2], left[3], left[3], left[3] ] + l2 = vext_u16(l0, l3, 2); + + c0 = vrhadd_u16(l0, l1); + c1 = vrhadd_u16(vhadd_u16(l0, l2), l1); + + c01_lo = vzip_u16(c0, c1).val[0]; + c01_hi = vzip_u16(c0, c1).val[1]; + + // stride=0 [ c0[0], c1[0], c0[1], c1[1] ] + // stride=1 [ c0[1], c1[1], c0[2], c1[2] ] + // stride=2 [ c0[2], c1[2], c0[3], c1[3] ] + // stride=3 [ c0[3], c1[3], left[3], left[3] ] + vst1_u16(dst + 0 * stride, c01_lo); + vst1_u16(dst + 1 * stride, vext_u16(c01_lo, c01_hi, 2)); + vst1_u16(dst + 2 * stride, c01_hi); + vst1_u16(dst + 3 * stride, vext_u16(c01_hi, l3, 2)); +} + +void vpx_highbd_d207_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + uint16x8_t l0, l1, l2, l7, c0, c1, c01_lo, c01_hi; + (void)above; + (void)bd; + + l0 = vld1q_u16(left + 0); + l7 = vld1q_dup_u16(left + 7); + + // [ left[1], left[2], left[3], left[4], left[5], left[6], left[7], left[7] ] + l1 = vextq_u16(l0, l7, 1); + // [ left[2], left[3], left[4], left[5], left[6], left[7], left[7], left[7] ] + l2 = vextq_u16(l0, l7, 2); + + c0 = vrhaddq_u16(l0, l1); + c1 = vrhaddq_u16(vhaddq_u16(l0, l2), l1); + + c01_lo = vzipq_u16(c0, c1).val[0]; + c01_hi = vzipq_u16(c0, c1).val[1]; + + vst1q_u16(dst + 0 * stride, c01_lo); + vst1q_u16(dst + 1 * stride, vextq_u16(c01_lo, c01_hi, 2)); + vst1q_u16(dst + 2 * stride, vextq_u16(c01_lo, c01_hi, 4)); + vst1q_u16(dst + 3 * stride, vextq_u16(c01_lo, c01_hi, 6)); + vst1q_u16(dst + 4 * stride, c01_hi); + vst1q_u16(dst + 5 * stride, vextq_u16(c01_hi, l7, 2)); + vst1q_u16(dst + 6 * stride, vextq_u16(c01_hi, l7, 4)); + vst1q_u16(dst + 7 * stride, vextq_u16(c01_hi, l7, 6)); +} + +void vpx_highbd_d207_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + uint16x8_t l0, l1, l2, l8, l9, l10, l15, c0[2], c1[2], c01[4]; + (void)above; + (void)bd; + + l0 = vld1q_u16(left + 0); + l1 = vld1q_u16(left + 1); + l2 = vld1q_u16(left + 2); + l8 = vld1q_u16(left + 8); + l15 = vld1q_dup_u16(left + 15); + + l9 = vextq_u16(l8, l15, 1); + l10 = vextq_u16(l8, l15, 2); + + c0[0] = vrhaddq_u16(l0, l1); + c0[1] = vrhaddq_u16(l8, l9); + c1[0] = vrhaddq_u16(vhaddq_u16(l0, l2), l1); + c1[1] = vrhaddq_u16(vhaddq_u16(l8, l10), l9); + + c01[0] = vzipq_u16(c0[0], c1[0]).val[0]; + c01[1] = vzipq_u16(c0[0], c1[0]).val[1]; + c01[2] = vzipq_u16(c0[1], c1[1]).val[0]; + c01[3] = vzipq_u16(c0[1], c1[1]).val[1]; + + vst1q_u16(dst + 0 * stride + 0, c01[0]); + vst1q_u16(dst + 0 * stride + 8, c01[1]); + vst1q_u16(dst + 1 * stride + 0, vextq_u16(c01[0], c01[1], 2)); + vst1q_u16(dst + 1 * stride + 8, vextq_u16(c01[1], c01[2], 2)); + vst1q_u16(dst + 2 * stride + 0, vextq_u16(c01[0], c01[1], 4)); + vst1q_u16(dst + 2 * stride + 8, vextq_u16(c01[1], c01[2], 4)); + vst1q_u16(dst + 3 * stride + 0, vextq_u16(c01[0], c01[1], 6)); + vst1q_u16(dst + 3 * stride + 8, vextq_u16(c01[1], c01[2], 6)); + + vst1q_u16(dst + 4 * stride + 0, c01[1]); + vst1q_u16(dst + 4 * stride + 8, c01[2]); + vst1q_u16(dst + 5 * stride + 0, vextq_u16(c01[1], c01[2], 2)); + vst1q_u16(dst + 5 * stride + 8, vextq_u16(c01[2], c01[3], 2)); + vst1q_u16(dst + 6 * stride + 0, vextq_u16(c01[1], c01[2], 4)); + vst1q_u16(dst + 6 * stride + 8, vextq_u16(c01[2], c01[3], 4)); + vst1q_u16(dst + 7 * stride + 0, vextq_u16(c01[1], c01[2], 6)); + vst1q_u16(dst + 7 * stride + 8, vextq_u16(c01[2], c01[3], 6)); + + vst1q_u16(dst + 8 * stride + 0, c01[2]); + vst1q_u16(dst + 8 * stride + 8, c01[3]); + vst1q_u16(dst + 9 * stride + 0, vextq_u16(c01[2], c01[3], 2)); + vst1q_u16(dst + 9 * stride + 8, vextq_u16(c01[3], l15, 2)); + vst1q_u16(dst + 10 * stride + 0, vextq_u16(c01[2], c01[3], 4)); + vst1q_u16(dst + 10 * stride + 8, vextq_u16(c01[3], l15, 4)); + vst1q_u16(dst + 11 * stride + 0, vextq_u16(c01[2], c01[3], 6)); + vst1q_u16(dst + 11 * stride + 8, vextq_u16(c01[3], l15, 6)); + + vst1q_u16(dst + 12 * stride + 0, c01[3]); + vst1q_u16(dst + 12 * stride + 8, l15); + vst1q_u16(dst + 13 * stride + 0, vextq_u16(c01[3], l15, 2)); + vst1q_u16(dst + 13 * stride + 8, l15); + vst1q_u16(dst + 14 * stride + 0, vextq_u16(c01[3], l15, 4)); + vst1q_u16(dst + 14 * stride + 8, l15); + vst1q_u16(dst + 15 * stride + 0, vextq_u16(c01[3], l15, 6)); + vst1q_u16(dst + 15 * stride + 8, l15); +} + +void vpx_highbd_d207_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + uint16x8_t l0, l1, l2, l8, l9, l10, l16, l17, l18, l24, l25, l26, l31, c0[4], + c1[4], c01[8]; + (void)above; + (void)bd; + + l0 = vld1q_u16(left + 0); + l1 = vld1q_u16(left + 1); + l2 = vld1q_u16(left + 2); + l8 = vld1q_u16(left + 8); + l9 = vld1q_u16(left + 9); + l10 = vld1q_u16(left + 10); + l16 = vld1q_u16(left + 16); + l17 = vld1q_u16(left + 17); + l18 = vld1q_u16(left + 18); + l24 = vld1q_u16(left + 24); + l31 = vld1q_dup_u16(left + 31); + + l25 = vextq_u16(l24, l31, 1); + l26 = vextq_u16(l24, l31, 2); + + c0[0] = vrhaddq_u16(l0, l1); + c0[1] = vrhaddq_u16(l8, l9); + c0[2] = vrhaddq_u16(l16, l17); + c0[3] = vrhaddq_u16(l24, l25); + c1[0] = vrhaddq_u16(vhaddq_u16(l0, l2), l1); + c1[1] = vrhaddq_u16(vhaddq_u16(l8, l10), l9); + c1[2] = vrhaddq_u16(vhaddq_u16(l16, l18), l17); + c1[3] = vrhaddq_u16(vhaddq_u16(l24, l26), l25); + + c01[0] = vzipq_u16(c0[0], c1[0]).val[0]; + c01[1] = vzipq_u16(c0[0], c1[0]).val[1]; + c01[2] = vzipq_u16(c0[1], c1[1]).val[0]; + c01[3] = vzipq_u16(c0[1], c1[1]).val[1]; + c01[4] = vzipq_u16(c0[2], c1[2]).val[0]; + c01[5] = vzipq_u16(c0[2], c1[2]).val[1]; + c01[6] = vzipq_u16(c0[3], c1[3]).val[0]; + c01[7] = vzipq_u16(c0[3], c1[3]).val[1]; + + vst1q_u16(dst + 0 * stride + 0, c01[0]); + vst1q_u16(dst + 0 * stride + 8, c01[1]); + vst1q_u16(dst + 0 * stride + 16, c01[2]); + vst1q_u16(dst + 0 * stride + 24, c01[3]); + vst1q_u16(dst + 1 * stride + 0, vextq_u16(c01[0], c01[1], 2)); + vst1q_u16(dst + 1 * stride + 8, vextq_u16(c01[1], c01[2], 2)); + vst1q_u16(dst + 1 * stride + 16, vextq_u16(c01[2], c01[3], 2)); + vst1q_u16(dst + 1 * stride + 24, vextq_u16(c01[3], c01[4], 2)); + vst1q_u16(dst + 2 * stride + 0, vextq_u16(c01[0], c01[1], 4)); + vst1q_u16(dst + 2 * stride + 8, vextq_u16(c01[1], c01[2], 4)); + vst1q_u16(dst + 2 * stride + 16, vextq_u16(c01[2], c01[3], 4)); + vst1q_u16(dst + 2 * stride + 24, vextq_u16(c01[3], c01[4], 4)); + vst1q_u16(dst + 3 * stride + 0, vextq_u16(c01[0], c01[1], 6)); + vst1q_u16(dst + 3 * stride + 8, vextq_u16(c01[1], c01[2], 6)); + vst1q_u16(dst + 3 * stride + 16, vextq_u16(c01[2], c01[3], 6)); + vst1q_u16(dst + 3 * stride + 24, vextq_u16(c01[3], c01[4], 6)); + + vst1q_u16(dst + 4 * stride + 0, c01[1]); + vst1q_u16(dst + 4 * stride + 8, c01[2]); + vst1q_u16(dst + 4 * stride + 16, c01[3]); + vst1q_u16(dst + 4 * stride + 24, c01[4]); + vst1q_u16(dst + 5 * stride + 0, vextq_u16(c01[1], c01[2], 2)); + vst1q_u16(dst + 5 * stride + 8, vextq_u16(c01[2], c01[3], 2)); + vst1q_u16(dst + 5 * stride + 16, vextq_u16(c01[3], c01[4], 2)); + vst1q_u16(dst + 5 * stride + 24, vextq_u16(c01[4], c01[5], 2)); + vst1q_u16(dst + 6 * stride + 0, vextq_u16(c01[1], c01[2], 4)); + vst1q_u16(dst + 6 * stride + 8, vextq_u16(c01[2], c01[3], 4)); + vst1q_u16(dst + 6 * stride + 16, vextq_u16(c01[3], c01[4], 4)); + vst1q_u16(dst + 6 * stride + 24, vextq_u16(c01[4], c01[5], 4)); + vst1q_u16(dst + 7 * stride + 0, vextq_u16(c01[1], c01[2], 6)); + vst1q_u16(dst + 7 * stride + 8, vextq_u16(c01[2], c01[3], 6)); + vst1q_u16(dst + 7 * stride + 16, vextq_u16(c01[3], c01[4], 6)); + vst1q_u16(dst + 7 * stride + 24, vextq_u16(c01[4], c01[5], 6)); + + vst1q_u16(dst + 8 * stride + 0, c01[2]); + vst1q_u16(dst + 8 * stride + 8, c01[3]); + vst1q_u16(dst + 8 * stride + 16, c01[4]); + vst1q_u16(dst + 8 * stride + 24, c01[5]); + vst1q_u16(dst + 9 * stride + 0, vextq_u16(c01[2], c01[3], 2)); + vst1q_u16(dst + 9 * stride + 8, vextq_u16(c01[3], c01[4], 2)); + vst1q_u16(dst + 9 * stride + 16, vextq_u16(c01[4], c01[5], 2)); + vst1q_u16(dst + 9 * stride + 24, vextq_u16(c01[5], c01[6], 2)); + vst1q_u16(dst + 10 * stride + 0, vextq_u16(c01[2], c01[3], 4)); + vst1q_u16(dst + 10 * stride + 8, vextq_u16(c01[3], c01[4], 4)); + vst1q_u16(dst + 10 * stride + 16, vextq_u16(c01[4], c01[5], 4)); + vst1q_u16(dst + 10 * stride + 24, vextq_u16(c01[5], c01[6], 4)); + vst1q_u16(dst + 11 * stride + 0, vextq_u16(c01[2], c01[3], 6)); + vst1q_u16(dst + 11 * stride + 8, vextq_u16(c01[3], c01[4], 6)); + vst1q_u16(dst + 11 * stride + 16, vextq_u16(c01[4], c01[5], 6)); + vst1q_u16(dst + 11 * stride + 24, vextq_u16(c01[5], c01[6], 6)); + + vst1q_u16(dst + 12 * stride + 0, c01[3]); + vst1q_u16(dst + 12 * stride + 8, c01[4]); + vst1q_u16(dst + 12 * stride + 16, c01[5]); + vst1q_u16(dst + 12 * stride + 24, c01[6]); + vst1q_u16(dst + 13 * stride + 0, vextq_u16(c01[3], c01[4], 2)); + vst1q_u16(dst + 13 * stride + 8, vextq_u16(c01[4], c01[5], 2)); + vst1q_u16(dst + 13 * stride + 16, vextq_u16(c01[5], c01[6], 2)); + vst1q_u16(dst + 13 * stride + 24, vextq_u16(c01[6], c01[7], 2)); + vst1q_u16(dst + 14 * stride + 0, vextq_u16(c01[3], c01[4], 4)); + vst1q_u16(dst + 14 * stride + 8, vextq_u16(c01[4], c01[5], 4)); + vst1q_u16(dst + 14 * stride + 16, vextq_u16(c01[5], c01[6], 4)); + vst1q_u16(dst + 14 * stride + 24, vextq_u16(c01[6], c01[7], 4)); + vst1q_u16(dst + 15 * stride + 0, vextq_u16(c01[3], c01[4], 6)); + vst1q_u16(dst + 15 * stride + 8, vextq_u16(c01[4], c01[5], 6)); + vst1q_u16(dst + 15 * stride + 16, vextq_u16(c01[5], c01[6], 6)); + vst1q_u16(dst + 15 * stride + 24, vextq_u16(c01[6], c01[7], 6)); + + vst1q_u16(dst + 16 * stride + 0, c01[4]); + vst1q_u16(dst + 16 * stride + 8, c01[5]); + vst1q_u16(dst + 16 * stride + 16, c01[6]); + vst1q_u16(dst + 16 * stride + 24, c01[7]); + vst1q_u16(dst + 17 * stride + 0, vextq_u16(c01[4], c01[5], 2)); + vst1q_u16(dst + 17 * stride + 8, vextq_u16(c01[5], c01[6], 2)); + vst1q_u16(dst + 17 * stride + 16, vextq_u16(c01[6], c01[7], 2)); + vst1q_u16(dst + 17 * stride + 24, vextq_u16(c01[7], l31, 2)); + vst1q_u16(dst + 18 * stride + 0, vextq_u16(c01[4], c01[5], 4)); + vst1q_u16(dst + 18 * stride + 8, vextq_u16(c01[5], c01[6], 4)); + vst1q_u16(dst + 18 * stride + 16, vextq_u16(c01[6], c01[7], 4)); + vst1q_u16(dst + 18 * stride + 24, vextq_u16(c01[7], l31, 4)); + vst1q_u16(dst + 19 * stride + 0, vextq_u16(c01[4], c01[5], 6)); + vst1q_u16(dst + 19 * stride + 8, vextq_u16(c01[5], c01[6], 6)); + vst1q_u16(dst + 19 * stride + 16, vextq_u16(c01[6], c01[7], 6)); + vst1q_u16(dst + 19 * stride + 24, vextq_u16(c01[7], l31, 6)); + + vst1q_u16(dst + 20 * stride + 0, c01[5]); + vst1q_u16(dst + 20 * stride + 8, c01[6]); + vst1q_u16(dst + 20 * stride + 16, c01[7]); + vst1q_u16(dst + 20 * stride + 24, l31); + vst1q_u16(dst + 21 * stride + 0, vextq_u16(c01[5], c01[6], 2)); + vst1q_u16(dst + 21 * stride + 8, vextq_u16(c01[6], c01[7], 2)); + vst1q_u16(dst + 21 * stride + 16, vextq_u16(c01[7], l31, 2)); + vst1q_u16(dst + 21 * stride + 24, vextq_u16(l31, l31, 2)); + vst1q_u16(dst + 22 * stride + 0, vextq_u16(c01[5], c01[6], 4)); + vst1q_u16(dst + 22 * stride + 8, vextq_u16(c01[6], c01[7], 4)); + vst1q_u16(dst + 22 * stride + 16, vextq_u16(c01[7], l31, 4)); + vst1q_u16(dst + 22 * stride + 24, vextq_u16(l31, l31, 4)); + vst1q_u16(dst + 23 * stride + 0, vextq_u16(c01[5], c01[6], 6)); + vst1q_u16(dst + 23 * stride + 8, vextq_u16(c01[6], c01[7], 6)); + vst1q_u16(dst + 23 * stride + 16, vextq_u16(c01[7], l31, 6)); + vst1q_u16(dst + 23 * stride + 24, vextq_u16(l31, l31, 6)); + + vst1q_u16(dst + 24 * stride + 0, c01[6]); + vst1q_u16(dst + 24 * stride + 8, c01[7]); + vst1q_u16(dst + 24 * stride + 16, l31); + vst1q_u16(dst + 24 * stride + 24, l31); + vst1q_u16(dst + 25 * stride + 0, vextq_u16(c01[6], c01[7], 2)); + vst1q_u16(dst + 25 * stride + 8, vextq_u16(c01[7], l31, 2)); + vst1q_u16(dst + 25 * stride + 16, vextq_u16(l31, l31, 2)); + vst1q_u16(dst + 25 * stride + 24, vextq_u16(l31, l31, 2)); + vst1q_u16(dst + 26 * stride + 0, vextq_u16(c01[6], c01[7], 4)); + vst1q_u16(dst + 26 * stride + 8, vextq_u16(c01[7], l31, 4)); + vst1q_u16(dst + 26 * stride + 16, vextq_u16(l31, l31, 4)); + vst1q_u16(dst + 26 * stride + 24, vextq_u16(l31, l31, 4)); + vst1q_u16(dst + 27 * stride + 0, vextq_u16(c01[6], c01[7], 6)); + vst1q_u16(dst + 27 * stride + 8, vextq_u16(c01[7], l31, 6)); + vst1q_u16(dst + 27 * stride + 16, vextq_u16(l31, l31, 6)); + vst1q_u16(dst + 27 * stride + 24, vextq_u16(l31, l31, 6)); + + vst1q_u16(dst + 28 * stride + 0, c01[7]); + vst1q_u16(dst + 28 * stride + 8, l31); + vst1q_u16(dst + 28 * stride + 16, l31); + vst1q_u16(dst + 28 * stride + 24, l31); + vst1q_u16(dst + 29 * stride + 0, vextq_u16(c01[7], l31, 2)); + vst1q_u16(dst + 29 * stride + 8, vextq_u16(l31, l31, 2)); + vst1q_u16(dst + 29 * stride + 16, vextq_u16(l31, l31, 2)); + vst1q_u16(dst + 29 * stride + 24, vextq_u16(l31, l31, 2)); + vst1q_u16(dst + 30 * stride + 0, vextq_u16(c01[7], l31, 4)); + vst1q_u16(dst + 30 * stride + 8, vextq_u16(l31, l31, 4)); + vst1q_u16(dst + 30 * stride + 16, vextq_u16(l31, l31, 4)); + vst1q_u16(dst + 30 * stride + 24, vextq_u16(l31, l31, 4)); + vst1q_u16(dst + 31 * stride + 0, vextq_u16(c01[7], l31, 6)); + vst1q_u16(dst + 31 * stride + 8, vextq_u16(l31, l31, 6)); + vst1q_u16(dst + 31 * stride + 16, vextq_u16(l31, l31, 6)); + vst1q_u16(dst + 31 * stride + 24, vextq_u16(l31, l31, 6)); +} + +//------------------------------------------------------------------------------ + void vpx_highbd_v_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index b3f655c2bd..80dc6d95c9 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -211,7 +211,7 @@ () # High bitdepth functions if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_highbd_d207_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_d207_predictor_4x4 sse2/; + specialize qw/vpx_highbd_d207_predictor_4x4 neon sse2/; add_proto qw/void vpx_highbd_d45_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_d45_predictor_4x4 neon ssse3/; @@ -250,7 +250,7 @@ () specialize qw/vpx_highbd_dc_128_predictor_4x4 neon sse2/; add_proto qw/void vpx_highbd_d207_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_d207_predictor_8x8 ssse3/; + specialize qw/vpx_highbd_d207_predictor_8x8 neon ssse3/; add_proto qw/void vpx_highbd_d45_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_d45_predictor_8x8 neon ssse3/; @@ -289,7 +289,7 @@ () specialize qw/vpx_highbd_dc_128_predictor_8x8 neon sse2/; add_proto qw/void vpx_highbd_d207_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_d207_predictor_16x16 ssse3/; + specialize qw/vpx_highbd_d207_predictor_16x16 neon ssse3/; add_proto qw/void vpx_highbd_d45_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_d45_predictor_16x16 neon ssse3/; @@ -328,7 +328,7 @@ () specialize qw/vpx_highbd_dc_128_predictor_16x16 neon sse2/; add_proto qw/void vpx_highbd_d207_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_d207_predictor_32x32 ssse3/; + specialize qw/vpx_highbd_d207_predictor_32x32 neon ssse3/; add_proto qw/void vpx_highbd_d45_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_d45_predictor_32x32 neon ssse3/; From 6b783c6975a5fc2ee21579cc3c48e59184bf3295 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Mon, 6 Mar 2023 17:52:13 +0000 Subject: [PATCH 0655/1000] Optimize vpx_sum_squares_2d_i16_neon Add an additional 32-bit vector accumulator to allow parallel processing on CPUs that have more than one Neon multiply-accumulate pipeline. Also use sum_neon.h horizontal-add helpers for reduction. Change-Id: Ibcb48a738f5dee1430c3ebcd305b5ea8ea344c40 --- vpx_dsp/arm/sum_neon.h | 8 +++ vpx_dsp/arm/sum_squares_neon.c | 117 +++++++++++++++++++-------------- 2 files changed, 74 insertions(+), 51 deletions(-) diff --git a/vpx_dsp/arm/sum_neon.h b/vpx_dsp/arm/sum_neon.h index 47748a8061..6f513ca7a8 100644 --- a/vpx_dsp/arm/sum_neon.h +++ b/vpx_dsp/arm/sum_neon.h @@ -127,4 +127,12 @@ static INLINE uint64_t horizontal_add_int64x2(const int64x2_t a) { #endif } +static INLINE uint64_t horizontal_add_uint64x2(const uint64x2_t a) { +#if defined(__aarch64__) + return vaddvq_u64(a); +#else + return vgetq_lane_u64(a, 0) + vgetq_lane_u64(a, 1); +#endif +} + #endif // VPX_VPX_DSP_ARM_SUM_NEON_H_ diff --git a/vpx_dsp/arm/sum_squares_neon.c b/vpx_dsp/arm/sum_squares_neon.c index cfefad9938..074afe3258 100644 --- a/vpx_dsp/arm/sum_squares_neon.c +++ b/vpx_dsp/arm/sum_squares_neon.c @@ -9,77 +9,92 @@ */ #include - #include + #include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/sum_neon.h" uint64_t vpx_sum_squares_2d_i16_neon(const int16_t *src, int stride, int size) { - uint64x1_t s2; - if (size == 4) { int16x4_t s[4]; - int32x4_t s0; - uint32x2_t s1; + int32x4_t sum_s32; s[0] = vld1_s16(src + 0 * stride); s[1] = vld1_s16(src + 1 * stride); s[2] = vld1_s16(src + 2 * stride); s[3] = vld1_s16(src + 3 * stride); - s0 = vmull_s16(s[0], s[0]); - s0 = vmlal_s16(s0, s[1], s[1]); - s0 = vmlal_s16(s0, s[2], s[2]); - s0 = vmlal_s16(s0, s[3], s[3]); - s1 = vpadd_u32(vget_low_u32(vreinterpretq_u32_s32(s0)), - vget_high_u32(vreinterpretq_u32_s32(s0))); - s2 = vpaddl_u32(s1); + + sum_s32 = vmull_s16(s[0], s[0]); + sum_s32 = vmlal_s16(sum_s32, s[1], s[1]); + sum_s32 = vmlal_s16(sum_s32, s[2], s[2]); + sum_s32 = vmlal_s16(sum_s32, s[3], s[3]); + + return horizontal_long_add_uint32x4(vreinterpretq_u32_s32(sum_s32)); } else { - int r = size; - uint64x2_t s1 = vdupq_n_u64(0); + uint64x2_t sum_u64 = vdupq_n_u64(0); + int rows = size; do { - int c = size; - int32x4_t s0 = vdupq_n_s32(0); - const int16_t *src_t = src; + const int16_t *src_ptr = src; + int32x4_t sum_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) }; + int cols = size; do { int16x8_t s[8]; - s[0] = vld1q_s16(src_t + 0 * stride); - s[1] = vld1q_s16(src_t + 1 * stride); - s[2] = vld1q_s16(src_t + 2 * stride); - s[3] = vld1q_s16(src_t + 3 * stride); - s[4] = vld1q_s16(src_t + 4 * stride); - s[5] = vld1q_s16(src_t + 5 * stride); - s[6] = vld1q_s16(src_t + 6 * stride); - s[7] = vld1q_s16(src_t + 7 * stride); - s0 = vmlal_s16(s0, vget_low_s16(s[0]), vget_low_s16(s[0])); - s0 = vmlal_s16(s0, vget_low_s16(s[1]), vget_low_s16(s[1])); - s0 = vmlal_s16(s0, vget_low_s16(s[2]), vget_low_s16(s[2])); - s0 = vmlal_s16(s0, vget_low_s16(s[3]), vget_low_s16(s[3])); - s0 = vmlal_s16(s0, vget_low_s16(s[4]), vget_low_s16(s[4])); - s0 = vmlal_s16(s0, vget_low_s16(s[5]), vget_low_s16(s[5])); - s0 = vmlal_s16(s0, vget_low_s16(s[6]), vget_low_s16(s[6])); - s0 = vmlal_s16(s0, vget_low_s16(s[7]), vget_low_s16(s[7])); - s0 = vmlal_s16(s0, vget_high_s16(s[0]), vget_high_s16(s[0])); - s0 = vmlal_s16(s0, vget_high_s16(s[1]), vget_high_s16(s[1])); - s0 = vmlal_s16(s0, vget_high_s16(s[2]), vget_high_s16(s[2])); - s0 = vmlal_s16(s0, vget_high_s16(s[3]), vget_high_s16(s[3])); - s0 = vmlal_s16(s0, vget_high_s16(s[4]), vget_high_s16(s[4])); - s0 = vmlal_s16(s0, vget_high_s16(s[5]), vget_high_s16(s[5])); - s0 = vmlal_s16(s0, vget_high_s16(s[6]), vget_high_s16(s[6])); - s0 = vmlal_s16(s0, vget_high_s16(s[7]), vget_high_s16(s[7])); - src_t += 8; - c -= 8; - } while (c); + s[0] = vld1q_s16(src_ptr + 0 * stride); + s[1] = vld1q_s16(src_ptr + 1 * stride); + s[2] = vld1q_s16(src_ptr + 2 * stride); + s[3] = vld1q_s16(src_ptr + 3 * stride); + s[4] = vld1q_s16(src_ptr + 4 * stride); + s[5] = vld1q_s16(src_ptr + 5 * stride); + s[6] = vld1q_s16(src_ptr + 6 * stride); + s[7] = vld1q_s16(src_ptr + 7 * stride); - s1 = vaddw_u32(s1, vget_low_u32(vreinterpretq_u32_s32(s0))); - s1 = vaddw_u32(s1, vget_high_u32(vreinterpretq_u32_s32(s0))); + sum_s32[0] = + vmlal_s16(sum_s32[0], vget_low_s16(s[0]), vget_low_s16(s[0])); + sum_s32[0] = + vmlal_s16(sum_s32[0], vget_low_s16(s[1]), vget_low_s16(s[1])); + sum_s32[0] = + vmlal_s16(sum_s32[0], vget_low_s16(s[2]), vget_low_s16(s[2])); + sum_s32[0] = + vmlal_s16(sum_s32[0], vget_low_s16(s[3]), vget_low_s16(s[3])); + sum_s32[0] = + vmlal_s16(sum_s32[0], vget_low_s16(s[4]), vget_low_s16(s[4])); + sum_s32[0] = + vmlal_s16(sum_s32[0], vget_low_s16(s[5]), vget_low_s16(s[5])); + sum_s32[0] = + vmlal_s16(sum_s32[0], vget_low_s16(s[6]), vget_low_s16(s[6])); + sum_s32[0] = + vmlal_s16(sum_s32[0], vget_low_s16(s[7]), vget_low_s16(s[7])); + + sum_s32[1] = + vmlal_s16(sum_s32[1], vget_high_s16(s[0]), vget_high_s16(s[0])); + sum_s32[1] = + vmlal_s16(sum_s32[1], vget_high_s16(s[1]), vget_high_s16(s[1])); + sum_s32[1] = + vmlal_s16(sum_s32[1], vget_high_s16(s[2]), vget_high_s16(s[2])); + sum_s32[1] = + vmlal_s16(sum_s32[1], vget_high_s16(s[3]), vget_high_s16(s[3])); + sum_s32[1] = + vmlal_s16(sum_s32[1], vget_high_s16(s[4]), vget_high_s16(s[4])); + sum_s32[1] = + vmlal_s16(sum_s32[1], vget_high_s16(s[5]), vget_high_s16(s[5])); + sum_s32[1] = + vmlal_s16(sum_s32[1], vget_high_s16(s[6]), vget_high_s16(s[6])); + sum_s32[1] = + vmlal_s16(sum_s32[1], vget_high_s16(s[7]), vget_high_s16(s[7])); + + src_ptr += 8; + cols -= 8; + } while (cols); + + sum_u64 = vpadalq_u32(sum_u64, vreinterpretq_u32_s32(sum_s32[0])); + sum_u64 = vpadalq_u32(sum_u64, vreinterpretq_u32_s32(sum_s32[1])); src += 8 * stride; - r -= 8; - } while (r); + rows -= 8; + } while (rows); - s2 = vadd_u64(vget_low_u64(s1), vget_high_u64(s1)); + return horizontal_add_uint64x2(sum_u64); } - - return vget_lane_u64(s2, 0); } From d94e16404a08f3a67aa570d6b8c107ae47e158b5 Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 6 Mar 2023 13:56:17 -0800 Subject: [PATCH 0656/1000] vpx_convolve_copy_neon: fix unaligned loads w/w==4 Fixes a -fsanitize=undefined warning: vpx_dsp/arm/vpx_convolve_copy_neon.c:29:26: runtime error: load of misaligned address 0xffffa8242bea for type 'const uint32_t' (aka 'const unsigned int'), which requires 4 byte alignment 0xffffa8242bea: note: pointer points here 88 81 7d 7d 7d 7d 7d 81 81 7d 81 80 87 97 a8 ab a0 91 ... ^ #0 0xb0447c in vpx_convolve_copy_neon vpx_dsp/arm/vpx_convolve_copy_neon.c:29:26 #1 0x12285c8 in inter_predictor vp9/common/vp9_reconinter.h:29:3 #2 0x1228430 in dec_build_inter_predictors vp9/decoder/vp9_decodeframe.c ... Change-Id: Iaec4ac2a400b6e6db72d12e5a7acb316262b12a7 --- vpx_dsp/arm/vpx_convolve_copy_neon.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vpx_dsp/arm/vpx_convolve_copy_neon.c b/vpx_dsp/arm/vpx_convolve_copy_neon.c index 361ec8a806..bea7c98437 100644 --- a/vpx_dsp/arm/vpx_convolve_copy_neon.c +++ b/vpx_dsp/arm/vpx_convolve_copy_neon.c @@ -9,6 +9,7 @@ */ #include +#include #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" @@ -26,10 +27,10 @@ void vpx_convolve_copy_neon(const uint8_t *src, ptrdiff_t src_stride, if (w < 8) { // copy4 do { - *(uint32_t *)dst = *(const uint32_t *)src; + memcpy(dst, src, 4); src += src_stride; dst += dst_stride; - *(uint32_t *)dst = *(const uint32_t *)src; + memcpy(dst, src, 4); src += src_stride; dst += dst_stride; h -= 2; From 5a2bb12c52171ee8ef86f9a1129ac413204ea3cf Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Fri, 3 Mar 2023 23:42:50 +0000 Subject: [PATCH 0657/1000] Fix heap buffer overrun in vpx_get4x4sse_cs_neon Use a mem_neon.h helper to do strided 4-byte loads instead of Neon 8-byte loads - where the last 4 bytes are out of bounds. Re-enable the Neon code path and the tests. Bug: webm:1794 Change-Id: I69ccff730f4a5cbf585dd6a9aa0f3eb13e150074 --- test/variance_test.cc | 3 -- vpx_dsp/arm/variance_neon.c | 99 ++++++++++-------------------------- vpx_dsp/vpx_dsp_rtcd_defs.pl | 4 +- 3 files changed, 29 insertions(+), 77 deletions(-) diff --git a/test/variance_test.cc b/test/variance_test.cc index 237d595bb7..1359bc4baf 100644 --- a/test/variance_test.cc +++ b/test/variance_test.cc @@ -1446,12 +1446,9 @@ INSTANTIATE_TEST_SUITE_P( #endif // HAVE_AVX2 #if HAVE_NEON -// TODO(https://crbug.com/webm/1794): enable this after heap overflow is fixed. -#if 0 INSTANTIATE_TEST_SUITE_P(NEON, VpxSseTest, ::testing::Values(SseParams(2, 2, &vpx_get4x4sse_cs_neon))); -#endif INSTANTIATE_TEST_SUITE_P(NEON, VpxMseTest, ::testing::Values(MseParams(4, 4, &vpx_mse16x16_neon), diff --git a/vpx_dsp/arm/variance_neon.c b/vpx_dsp/arm/variance_neon.c index 76c2a15863..69ff1cf153 100644 --- a/vpx_dsp/arm/variance_neon.c +++ b/vpx_dsp/arm/variance_neon.c @@ -433,42 +433,18 @@ static INLINE unsigned int vpx_mse16xh_neon(const unsigned char *src_ptr, return *sse; } -// TODO(https://crbug.com/webm/1794): enable this after heap overflow is fixed. -#if 0 unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride) { - uint8x8_t a[4], b[4], abs_diff[4]; - uint32x2_t sse = vdup_n_u32(0); - - a[0] = vld1_u8(src_ptr); - src_ptr += src_stride; - b[0] = vld1_u8(ref_ptr); - ref_ptr += ref_stride; - a[1] = vld1_u8(src_ptr); - src_ptr += src_stride; - b[1] = vld1_u8(ref_ptr); - ref_ptr += ref_stride; - a[2] = vld1_u8(src_ptr); - src_ptr += src_stride; - b[2] = vld1_u8(ref_ptr); - ref_ptr += ref_stride; - a[3] = vld1_u8(src_ptr); - b[3] = vld1_u8(ref_ptr); - - abs_diff[0] = vabd_u8(a[0], b[0]); - abs_diff[1] = vabd_u8(a[1], b[1]); - abs_diff[2] = vabd_u8(a[2], b[2]); - abs_diff[3] = vabd_u8(a[3], b[3]); - - sse = vdot_u32(sse, abs_diff[0], abs_diff[0]); - sse = vdot_u32(sse, abs_diff[1], abs_diff[1]); - sse = vdot_u32(sse, abs_diff[2], abs_diff[2]); - sse = vdot_u32(sse, abs_diff[3], abs_diff[3]); - - return vget_lane_u32(sse, 0); + uint8x16_t s = load_unaligned_u8q(src_ptr, src_stride); + uint8x16_t r = load_unaligned_u8q(ref_ptr, ref_stride); + + uint8x16_t abs_diff = vabdq_u8(s, r); + + uint32x4_t sse = vdotq_u32(vdupq_n_u32(0), abs_diff, abs_diff); + + return horizontal_add_uint32x4(sse); } -#endif // 0 #else // !defined(__ARM_FEATURE_DOTPROD) @@ -535,49 +511,30 @@ static INLINE unsigned int vpx_mse16xh_neon(const unsigned char *src_ptr, return *sse; } -// TODO(https://crbug.com/webm/1794): enable this after heap overflow is fixed. -#if 0 unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride) { - uint8x8_t a[4], b[4]; - int16x4_t diff_lo[4]; - uint16x8_t diff[4]; - int32x4_t sse; - - a[0] = vld1_u8(src_ptr); - src_ptr += src_stride; - b[0] = vld1_u8(ref_ptr); - ref_ptr += ref_stride; - a[1] = vld1_u8(src_ptr); - src_ptr += src_stride; - b[1] = vld1_u8(ref_ptr); - ref_ptr += ref_stride; - a[2] = vld1_u8(src_ptr); - src_ptr += src_stride; - b[2] = vld1_u8(ref_ptr); - ref_ptr += ref_stride; - a[3] = vld1_u8(src_ptr); - b[3] = vld1_u8(ref_ptr); - - diff[0] = vsubl_u8(a[0], b[0]); - diff[1] = vsubl_u8(a[1], b[1]); - diff[2] = vsubl_u8(a[2], b[2]); - diff[3] = vsubl_u8(a[3], b[3]); - - diff_lo[0] = vget_low_s16(vreinterpretq_s16_u16(diff[0])); - diff_lo[1] = vget_low_s16(vreinterpretq_s16_u16(diff[1])); - diff_lo[2] = vget_low_s16(vreinterpretq_s16_u16(diff[2])); - diff_lo[3] = vget_low_s16(vreinterpretq_s16_u16(diff[3])); - - sse = vmull_s16(diff_lo[0], diff_lo[0]); - sse = vmlal_s16(sse, diff_lo[1], diff_lo[1]); - sse = vmlal_s16(sse, diff_lo[2], diff_lo[2]); - sse = vmlal_s16(sse, diff_lo[3], diff_lo[3]); - - return horizontal_add_uint32x4(vreinterpretq_u32_s32(sse)); + uint8x8_t s[2], r[2]; + uint16x8_t abs_diff[2]; + uint32x4_t sse; + + s[0] = load_u8(src_ptr, src_stride); + r[0] = load_u8(ref_ptr, ref_stride); + src_ptr += 2 * src_stride; + ref_ptr += 2 * ref_stride; + s[1] = load_u8(src_ptr, src_stride); + r[1] = load_u8(ref_ptr, ref_stride); + + abs_diff[0] = vabdl_u8(s[0], r[0]); + abs_diff[1] = vabdl_u8(s[1], r[1]); + + sse = vmull_u16(vget_low_u16(abs_diff[0]), vget_low_u16(abs_diff[0])); + sse = vmlal_u16(sse, vget_high_u16(abs_diff[0]), vget_high_u16(abs_diff[0])); + sse = vmlal_u16(sse, vget_low_u16(abs_diff[1]), vget_low_u16(abs_diff[1])); + sse = vmlal_u16(sse, vget_high_u16(abs_diff[1]), vget_high_u16(abs_diff[1])); + + return horizontal_add_uint32x4(sse); } -#endif // 0 #endif // defined(__ARM_FEATURE_DOTPROD) diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 2301fbe328..c50ab93c5a 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -1152,10 +1152,8 @@ () add_proto qw/unsigned int vpx_get_mb_ss/, "const int16_t *"; specialize qw/vpx_get_mb_ss sse2 msa vsx/; - # TODO(https://crbug.com/webm/1794): enable neon after heap overflow is - # fixed. add_proto qw/unsigned int vpx_get4x4sse_cs/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride"; - specialize qw/vpx_get4x4sse_cs msa vsx/; + specialize qw/vpx_get4x4sse_cs neon msa vsx/; add_proto qw/void vpx_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride"; specialize qw/vpx_comp_avg_pred neon sse2 vsx lsx/; From f2210fd29047de236bd8a3f349db10836ef283ef Mon Sep 17 00:00:00 2001 From: Neeraj Gadgil Date: Wed, 1 Mar 2023 15:35:57 +0530 Subject: [PATCH 0658/1000] Early terminate interp filt search based on best RD cost The CL prunes interpolation filter search based on rdcost of individual planes. Instruction Count BD-Rate Loss(%) cpu Resolution Reduction(%) avg.psnr ovr.psnr ssim 0 LOWRES2 1.613 0.0143 0.0208 0.0146 0 MIDRES2 1.637 0.0214 -0.0316 0.0036 0 HDRES2 1.369 0.0171 0.0178 0.1222 0 Average 1.539 0.0176 0.0023 0.0468 STATS_CHANGED Change-Id: I4be30bd1c7bbbc93c6bbc840565893a97d2598a4 --- vp9/encoder/vp9_rd.c | 40 -------------- vp9/encoder/vp9_rd.h | 5 -- vp9/encoder/vp9_rdopt.c | 91 +++++++++++++++++--------------- vp9/encoder/vp9_speed_features.c | 2 + vp9/encoder/vp9_speed_features.h | 4 ++ 5 files changed, 54 insertions(+), 88 deletions(-) diff --git a/vp9/encoder/vp9_rd.c b/vp9/encoder/vp9_rd.c index 58dd75b441..95c95971c5 100644 --- a/vp9/encoder/vp9_rd.c +++ b/vp9/encoder/vp9_rd.c @@ -513,22 +513,6 @@ static void model_rd_norm(int xsq_q10, int *r_q10, int *d_q10) { *d_q10 = (dist_tab_q10[xq] * b_q10 + dist_tab_q10[xq + 1] * a_q10) >> 10; } -static void model_rd_norm_vec(int xsq_q10[MAX_MB_PLANE], - int r_q10[MAX_MB_PLANE], - int d_q10[MAX_MB_PLANE]) { - int i; - const int one_q10 = 1 << 10; - for (i = 0; i < MAX_MB_PLANE; ++i) { - const int tmp = (xsq_q10[i] >> 2) + 8; - const int k = get_msb(tmp) - 3; - const int xq = (k << 3) + ((tmp >> k) & 0x7); - const int a_q10 = ((xsq_q10[i] - xsq_iq_q10[xq]) << 10) >> (2 + k); - const int b_q10 = one_q10 - a_q10; - r_q10[i] = (rate_tab_q10[xq] * b_q10 + rate_tab_q10[xq + 1] * a_q10) >> 10; - d_q10[i] = (dist_tab_q10[xq] * b_q10 + dist_tab_q10[xq + 1] * a_q10) >> 10; - } -} - static const uint32_t MAX_XSQ_Q10 = 245727; void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n_log2, @@ -554,30 +538,6 @@ void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n_log2, } } -// Implements a fixed length vector form of vp9_model_rd_from_var_lapndz where -// vectors are of length MAX_MB_PLANE and all elements of var are non-zero. -void vp9_model_rd_from_var_lapndz_vec(unsigned int var[MAX_MB_PLANE], - unsigned int n_log2[MAX_MB_PLANE], - unsigned int qstep[MAX_MB_PLANE], - int64_t *rate_sum, int64_t *dist_sum) { - int i; - int xsq_q10[MAX_MB_PLANE], d_q10[MAX_MB_PLANE], r_q10[MAX_MB_PLANE]; - for (i = 0; i < MAX_MB_PLANE; ++i) { - const uint64_t xsq_q10_64 = - (((uint64_t)qstep[i] * qstep[i] << (n_log2[i] + 10)) + (var[i] >> 1)) / - var[i]; - xsq_q10[i] = (int)VPXMIN(xsq_q10_64, MAX_XSQ_Q10); - } - model_rd_norm_vec(xsq_q10, r_q10, d_q10); - for (i = 0; i < MAX_MB_PLANE; ++i) { - int rate = - ROUND_POWER_OF_TWO(r_q10[i] << n_log2[i], 10 - VP9_PROB_COST_SHIFT); - int64_t dist = (var[i] * (int64_t)d_q10[i] + 512) >> 10; - *rate_sum += rate; - *dist_sum += dist; - } -} - // Disable gcc 12.2 false positive warning. // warning: writing 1 byte into a region of size 0 [-Wstringop-overflow=] #if defined(__GNUC__) && !defined(__clang__) diff --git a/vp9/encoder/vp9_rd.h b/vp9/encoder/vp9_rd.h index efd854edf4..6c61ae514a 100644 --- a/vp9/encoder/vp9_rd.h +++ b/vp9/encoder/vp9_rd.h @@ -164,11 +164,6 @@ void vp9_initialize_me_consts(struct VP9_COMP *cpi, MACROBLOCK *x, int qindex); void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n_log2, unsigned int qstep, int *rate, int64_t *dist); -void vp9_model_rd_from_var_lapndz_vec(unsigned int var[MAX_MB_PLANE], - unsigned int n_log2[MAX_MB_PLANE], - unsigned int qstep[MAX_MB_PLANE], - int64_t *rate_sum, int64_t *dist_sum); - int vp9_get_switchable_rate(const struct VP9_COMP *cpi, const MACROBLOCKD *const xd); diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index 201bf416db..309341682b 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -160,10 +160,12 @@ static void swap_block_ptr(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, int m, int n, } #if !CONFIG_REALTIME_ONLY -static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x, - MACROBLOCKD *xd, int *out_rate_sum, - int64_t *out_dist_sum, int *skip_txfm_sb, - int64_t *skip_sse_sb) { +static int model_rd_for_sb_earlyterm(VP9_COMP *cpi, int mi_row, int mi_col, + BLOCK_SIZE bsize, MACROBLOCK *x, + MACROBLOCKD *xd, int *out_rate_sum, + int64_t *out_dist_sum, int *skip_txfm_sb, + int64_t *skip_sse_sb, int do_earlyterm, + int64_t best_rd) { // Note our transform coeffs are 8 times an orthogonal transform. // Hence quantizer step is also 8 times. To get effective quantizer // we need to divide by 8 before sending to modeling function. @@ -176,19 +178,15 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x, int64_t total_sse = 0; int skip_flag = 1; const int shift = 6; - int64_t dist; const int dequant_shift = #if CONFIG_VP9_HIGHBITDEPTH (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 : #endif // CONFIG_VP9_HIGHBITDEPTH 3; - unsigned int qstep_vec[MAX_MB_PLANE]; - unsigned int nlog2_vec[MAX_MB_PLANE]; - unsigned int sum_sse_vec[MAX_MB_PLANE]; - int any_zero_sum_sse = 0; x->pred_sse[ref] = 0; + // Build prediction signal, compute stats and RD cost on per-plane basis for (i = 0; i < MAX_MB_PLANE; ++i) { struct macroblock_plane *const p = &x->plane[i]; struct macroblockd_plane *const pd = &xd->plane[i]; @@ -207,7 +205,14 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x, int idx, idy; int lw = b_width_log2_lookup[unit_size] + 2; int lh = b_height_log2_lookup[unit_size] + 2; + unsigned int qstep; + unsigned int nlog2; + int64_t dist = 0; + // Build inter predictor + vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, i); + + // Compute useful stats for (idy = 0; idy < bh; ++idy) { for (idx = 0; idx < bw; ++idx) { uint8_t *src = p->src.buf + (idy * p->src.stride << lh) + (idx << lw); @@ -243,46 +248,36 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x, } total_sse += sum_sse; - sum_sse_vec[i] = sum_sse; - any_zero_sum_sse = any_zero_sum_sse || (sum_sse == 0); - qstep_vec[i] = pd->dequant[1] >> dequant_shift; - nlog2_vec[i] = num_pels_log2_lookup[bs]; - } + qstep = pd->dequant[1] >> dequant_shift; + nlog2 = num_pels_log2_lookup[bs]; - // Fast approximate the modelling function. - if (cpi->sf.simple_model_rd_from_var) { - for (i = 0; i < MAX_MB_PLANE; ++i) { + // Fast approximate the modelling function. + if (cpi->sf.simple_model_rd_from_var) { int64_t rate; - const int64_t square_error = sum_sse_vec[i]; - int quantizer = qstep_vec[i]; - - if (quantizer < 120) - rate = (square_error * (280 - quantizer)) >> (16 - VP9_PROB_COST_SHIFT); + if (qstep < 120) + rate = ((int64_t)sum_sse * (280 - qstep)) >> (16 - VP9_PROB_COST_SHIFT); else rate = 0; - dist = (square_error * quantizer) >> 8; + dist = ((int64_t)sum_sse * qstep) >> 8; rate_sum += rate; - dist_sum += dist; - } - } else { - if (any_zero_sum_sse) { - for (i = 0; i < MAX_MB_PLANE; ++i) { - int rate; - vp9_model_rd_from_var_lapndz(sum_sse_vec[i], nlog2_vec[i], qstep_vec[i], - &rate, &dist); - rate_sum += rate; - dist_sum += dist; - } } else { - vp9_model_rd_from_var_lapndz_vec(sum_sse_vec, nlog2_vec, qstep_vec, - &rate_sum, &dist_sum); + int rate; + vp9_model_rd_from_var_lapndz(sum_sse, nlog2, qstep, &rate, &dist); + rate_sum += rate; + } + dist_sum += dist; + if (do_earlyterm) { + if (RDCOST(x->rdmult, x->rddiv, rate_sum, + dist_sum << VP9_DIST_SCALE_LOG2) >= best_rd) + return 1; } } - *skip_txfm_sb = skip_flag; *skip_sse_sb = total_sse << VP9_DIST_SCALE_LOG2; *out_rate_sum = (int)rate_sum; *out_dist_sum = dist_sum << VP9_DIST_SCALE_LOG2; + + return 0; } #endif // !CONFIG_REALTIME_ONLY @@ -2964,6 +2959,9 @@ static int64_t handle_inter_mode( int64_t rs_rd; int tmp_skip_sb = 0; int64_t tmp_skip_sse = INT64_MAX; + const int enable_earlyterm = + cpi->sf.early_term_interp_search_plane_rd && cm->interp_filter != i; + int64_t filt_best_rd; mi->interp_filter = i; rs = vp9_get_switchable_rate(cpi, xd); @@ -2997,9 +2995,16 @@ static int64_t handle_inter_mode( xd->plane[j].dst.stride = 64; } } - vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize); - model_rd_for_sb(cpi, bsize, x, xd, &rate_sum, &dist_sum, &tmp_skip_sb, - &tmp_skip_sse); + // Compute RD cost with early termination option + filt_best_rd = + cm->interp_filter == SWITCHABLE ? (best_rd - rs_rd) : best_rd; + if (model_rd_for_sb_earlyterm(cpi, mi_row, mi_col, bsize, x, xd, + &rate_sum, &dist_sum, &tmp_skip_sb, + &tmp_skip_sse, enable_earlyterm, + filt_best_rd)) { + filter_cache[i] = INT64_MAX; + continue; + } rd = RDCOST(x->rdmult, x->rddiv, rate_sum, dist_sum); filter_cache[i] = rd; @@ -3067,9 +3072,9 @@ static int64_t handle_inter_mode( // Handles the special case when a filter that is not in the // switchable list (ex. bilinear) is indicated at the frame level, or // skip condition holds. - vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize); - model_rd_for_sb(cpi, bsize, x, xd, &tmp_rate, &tmp_dist, &skip_txfm_sb, - &skip_sse_sb); + model_rd_for_sb_earlyterm(cpi, mi_row, mi_col, bsize, x, xd, &tmp_rate, + &tmp_dist, &skip_txfm_sb, &skip_sse_sb, + 0 /*do_earlyterm*/, INT64_MAX); rd = RDCOST(x->rdmult, x->rddiv, rs + tmp_rate, tmp_dist); memcpy(skip_txfm, x->skip_txfm, sizeof(skip_txfm)); memcpy(bsse, x->bsse, sizeof(bsse)); diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c index ce83a97626..f19385b6a8 100644 --- a/vp9/encoder/vp9_speed_features.c +++ b/vp9/encoder/vp9_speed_features.c @@ -227,6 +227,7 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, sf->temporal_filter_search_method = NSTEP; sf->tx_size_search_breakout = 1; sf->use_square_partition_only = !boosted; + sf->early_term_interp_search_plane_rd = 1; sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V; @@ -919,6 +920,7 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi, int speed) { sf->adaptive_pred_interp_filter = 0; sf->adaptive_mode_search = 0; sf->cb_pred_filter_search = 0; + sf->early_term_interp_search_plane_rd = 0; sf->cb_partition_search = 0; sf->motion_field_mode_search = 0; sf->alt_ref_search_fp = 0; diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h index c2ae970b77..bd8e658cfd 100644 --- a/vp9/encoder/vp9_speed_features.h +++ b/vp9/encoder/vp9_speed_features.h @@ -402,6 +402,10 @@ typedef struct SPEED_FEATURES { // Chessboard pattern prediction filter type search int cb_pred_filter_search; + // This variable enables an early termination of interpolation filter eval + // based on the current rd cost after processing each plane + int early_term_interp_search_plane_rd; + int cb_partition_search; int motion_field_mode_search; From b9933679bf0088527f2f762bad50a4bb7820bdf1 Mon Sep 17 00:00:00 2001 From: Neeraj Gadgil Date: Wed, 1 Mar 2023 20:41:37 +0530 Subject: [PATCH 0659/1000] Use cb pattern for interp eval when filter is not switchable This CL uses a checkerboard pattern for interp filter eval when the filter is not switchable. Instruction Count BD-Rate Loss(%) cpu Resolution Reduction(%) avg.psnr ovr.psnr ssim 0 LOWRES2 0.725 0.0017 -0.0000 0.0192 0 MIDRES2 0.968 0.0004 0.0504 0.0810 0 HDRES2 1.135 0.0089 0.0130 0.0113 0 Average 0.943 0.0037 0.0211 0.0372 STATS_CHANGED Change-Id: Ia713e5170101302f264ffaa2350bc0ab15c27090 --- vp9/encoder/vp9_rdopt.c | 20 ++++++++++++-------- vp9/encoder/vp9_speed_features.c | 13 +++++++------ vp9/encoder/vp9_speed_features.h | 6 +++++- 3 files changed, 24 insertions(+), 15 deletions(-) diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index 309341682b..f87ab3e0bc 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -2801,13 +2801,12 @@ static int64_t handle_inter_mode( uint8_t skip_txfm[MAX_MB_PLANE << 2] = { 0 }; int64_t bsse[MAX_MB_PLANE << 2] = { 0 }; - int bsl = mi_width_log2_lookup[bsize]; - int pred_filter_search = - cpi->sf.cb_pred_filter_search - ? (((mi_row + mi_col) >> bsl) + - get_chessboard_index(cm->current_video_frame)) & - 0x1 - : 0; + const int bsl = mi_width_log2_lookup[bsize]; + const int blk_parity = (((mi_row + mi_col) >> bsl) + + get_chessboard_index(cm->current_video_frame)) & + 0x1; + const int pred_filter_search = + (cpi->sf.cb_pred_filter_search >= 2) && blk_parity; int skip_txfm_sb = 0; int64_t skip_sse_sb = INT64_MAX; @@ -2947,9 +2946,14 @@ static int64_t handle_inter_mode( for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i) filter_cache[i] = INT64_MAX; if (cm->interp_filter != BILINEAR) { + // Use cb pattern for filter eval when filter is not switchable + const int enable_interp_search = + (cpi->sf.cb_pred_filter_search && cm->interp_filter != SWITCHABLE) + ? blk_parity + : 1; if (x->source_variance < cpi->sf.disable_filter_search_var_thresh) { best_filter = EIGHTTAP; - } else if (best_filter == SWITCHABLE) { + } else if (best_filter == SWITCHABLE && enable_interp_search) { int newbest; int tmp_rate_sum = 0; int64_t tmp_dist_sum = 0; diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c index f19385b6a8..f47e3d71c9 100644 --- a/vp9/encoder/vp9_speed_features.c +++ b/vp9/encoder/vp9_speed_features.c @@ -159,7 +159,7 @@ static void set_good_speed_feature_framesize_dependent(VP9_COMP *cpi, sf->intra_y_mode_mask[TX_32X32] = INTRA_DC; sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC; sf->alt_ref_search_fp = 1; - sf->cb_pred_filter_search = 1; + sf->cb_pred_filter_search = 2; sf->adaptive_interp_filter_search = 1; sf->disable_split_mask = DISABLE_ALL_SPLIT; } @@ -228,6 +228,7 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, sf->tx_size_search_breakout = 1; sf->use_square_partition_only = !boosted; sf->early_term_interp_search_plane_rd = 1; + sf->cb_pred_filter_search = 1; sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V; @@ -346,7 +347,7 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, sf->adaptive_pred_interp_filter = 0; sf->adaptive_mode_search = 1; sf->cb_partition_search = !boosted; - sf->cb_pred_filter_search = 1; + sf->cb_pred_filter_search = 2; sf->alt_ref_search_fp = 1; sf->recode_loop = ALLOW_RECODE_KFMAXBW; sf->adaptive_rd_thresh = 3; @@ -639,7 +640,7 @@ static void set_rt_speed_feature_framesize_independent( sf->use_altref_onepass = 1; sf->use_compound_nonrd_pickmode = 1; } - if (cm->width * cm->height > 1280 * 720) sf->cb_pred_filter_search = 1; + if (cm->width * cm->height > 1280 * 720) sf->cb_pred_filter_search = 2; if (!cpi->external_resize) sf->use_source_sad = 1; } @@ -729,7 +730,7 @@ static void set_rt_speed_feature_framesize_independent( if (cpi->use_svc && svc->use_gf_temporal_ref_current_layer && svc->temporal_layer_id > 0) cpi->ref_frame_flags &= (~VP9_GOLD_FLAG); - if (cm->width * cm->height > 640 * 480) sf->cb_pred_filter_search = 1; + if (cm->width * cm->height > 640 * 480) sf->cb_pred_filter_search = 2; } if (speed >= 8) { @@ -773,7 +774,7 @@ static void set_rt_speed_feature_framesize_independent( } sf->limit_newmv_early_exit = 0; sf->use_simple_block_yrd = 1; - if (cm->width * cm->height > 352 * 288) sf->cb_pred_filter_search = 1; + if (cm->width * cm->height > 352 * 288) sf->cb_pred_filter_search = 2; } if (speed >= 9) { @@ -783,7 +784,7 @@ static void set_rt_speed_feature_framesize_independent( for (i = 0; i < BLOCK_SIZES; ++i) sf->intra_y_mode_bsize_mask[i] = INTRA_DC; } - sf->cb_pred_filter_search = 1; + sf->cb_pred_filter_search = 2; sf->mv.enable_adaptive_subpel_force_stop = 1; sf->mv.adapt_subpel_force_stop.mv_thresh = 1; sf->mv.adapt_subpel_force_stop.force_stop_below = QUARTER_PEL; diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h index bd8e658cfd..e30a26084a 100644 --- a/vp9/encoder/vp9_speed_features.h +++ b/vp9/encoder/vp9_speed_features.h @@ -399,7 +399,11 @@ typedef struct SPEED_FEATURES { // Adaptive prediction mode search int adaptive_mode_search; - // Chessboard pattern prediction filter type search + // Chessboard pattern prediction for interp filter. Aggressiveness increases + // with levels. + // 0: disable + // 1: cb pattern in eval when filter is not switchable + // 2: cb pattern prediction for filter search int cb_pred_filter_search; // This variable enables an early termination of interpolation filter eval From 5ae84ea5ae548314cfef982c95a4c9dbdfa79f6c Mon Sep 17 00:00:00 2001 From: Salome Thirot Date: Wed, 1 Mar 2023 10:06:01 +0000 Subject: [PATCH 0660/1000] Optimize vp9_block_error_fp_neon Currently vp9_block_error_fp_neon is only used when CONFIG_VP9_HIGHBITDEPTH is set to false. This patch optimizes the implementation and uses tran_low_t instead of int16_t so that the function can also be used in builds where vp9_highbitdepth is enabled. Change-Id: Ibab7ec5f74b7652fa2ae5edf328f9ec587088fd3 --- test/avg_test.cc | 4 -- vp9/common/vp9_rtcd_defs.pl | 5 +-- vp9/encoder/arm/neon/vp9_error_neon.c | 54 +++++++++++++++++---------- vp9/vp9cx.mk | 2 - vpx_dsp/arm/sum_neon.h | 8 ++++ 5 files changed, 43 insertions(+), 30 deletions(-) diff --git a/test/avg_test.cc b/test/avg_test.cc index 196522ce58..bcf8d0d993 100644 --- a/test/avg_test.cc +++ b/test/avg_test.cc @@ -694,16 +694,12 @@ INSTANTIATE_TEST_SUITE_P(NEON, SatdLowbdTest, make_tuple(256, &vpx_satd_neon), make_tuple(1024, &vpx_satd_neon))); -// TODO(jianj): Remove the highbitdepth flag once the SIMD functions are -// in place. -#if !CONFIG_VP9_HIGHBITDEPTH INSTANTIATE_TEST_SUITE_P( NEON, BlockErrorTestFP, ::testing::Values(make_tuple(16, &vp9_block_error_fp_neon), make_tuple(64, &vp9_block_error_fp_neon), make_tuple(256, &vp9_block_error_fp_neon), make_tuple(1024, &vp9_block_error_fp_neon))); -#endif // !CONFIG_VP9_HIGHBITDEPTH #endif // HAVE_NEON #if HAVE_MSA diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index 20a482c85f..c939411a3c 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -127,6 +127,7 @@ () add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz"; add_proto qw/int64_t vp9_block_error_fp/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size"; +specialize qw/vp9_block_error_fp neon avx2 sse2/; add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vp9_quantize_fp neon sse2 ssse3 avx2 vsx/; @@ -137,14 +138,10 @@ () if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vp9_block_error avx2 sse2/; - specialize qw/vp9_block_error_fp avx2 sse2/; - add_proto qw/int64_t vp9_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd"; specialize qw/vp9_highbd_block_error sse2/; } else { specialize qw/vp9_block_error avx2 msa sse2/; - - specialize qw/vp9_block_error_fp neon avx2 sse2/; } # fdct functions diff --git a/vp9/encoder/arm/neon/vp9_error_neon.c b/vp9/encoder/arm/neon/vp9_error_neon.c index 1c7503139e..eb1e2e03d0 100644 --- a/vp9/encoder/arm/neon/vp9_error_neon.c +++ b/vp9/encoder/arm/neon/vp9_error_neon.c @@ -12,30 +12,44 @@ #include #include "./vp9_rtcd.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" -int64_t vp9_block_error_fp_neon(const int16_t *coeff, const int16_t *dqcoeff, - int block_size) { - int64x2_t error = vdupq_n_s64(0); +int64_t vp9_block_error_fp_neon(const tran_low_t *coeff, + const tran_low_t *dqcoeff, int block_size) { + uint64x2_t err_u64[2] = { vdupq_n_u64(0), vdupq_n_u64(0) }; - assert(block_size >= 8); - assert((block_size % 8) == 0); + assert(block_size >= 16); + assert((block_size % 16) == 0); do { - const int16x8_t c = vld1q_s16(coeff); - const int16x8_t d = vld1q_s16(dqcoeff); - const int16x8_t diff = vsubq_s16(c, d); - const int16x4_t diff_lo = vget_low_s16(diff); - const int16x4_t diff_hi = vget_high_s16(diff); - // diff is 15-bits, the squares 30, so we can store 2 in 31-bits before - // accumulating them in 64-bits. - const int32x4_t err0 = vmull_s16(diff_lo, diff_lo); - const int32x4_t err1 = vmlal_s16(err0, diff_hi, diff_hi); - const int64x2_t err2 = vaddl_s32(vget_low_s32(err1), vget_high_s32(err1)); - error = vaddq_s64(error, err2); - coeff += 8; - dqcoeff += 8; - block_size -= 8; + uint32x4_t err0, err1; + + const int16x8_t c0 = load_tran_low_to_s16q(coeff); + const int16x8_t c1 = load_tran_low_to_s16q(coeff + 8); + const int16x8_t d0 = load_tran_low_to_s16q(dqcoeff); + const int16x8_t d1 = load_tran_low_to_s16q(dqcoeff + 8); + + const uint16x8_t diff0 = vreinterpretq_u16_s16(vabdq_s16(c0, d0)); + const uint16x8_t diff1 = vreinterpretq_u16_s16(vabdq_s16(c1, d1)); + + // diff is 15-bits, the squares 30, so in theory we can store 4 in 32-bits + // before accumulating them in 64-bits. However splitting into 2 mull, mlal + // pairs is beneficial since it allows us to use both Neon + // multiply-accumulate pipes - on CPUs that have them - rather than having + // a single chain of 4 instructions executing serially. + err0 = vmull_u16(vget_low_u16(diff0), vget_low_u16(diff0)); + err0 = vmlal_u16(err0, vget_high_u16(diff0), vget_high_u16(diff0)); + err_u64[0] = vpadalq_u32(err_u64[0], err0); + + err1 = vmull_u16(vget_low_u16(diff1), vget_low_u16(diff1)); + err1 = vmlal_u16(err1, vget_high_u16(diff1), vget_high_u16(diff1)); + err_u64[1] = vpadalq_u32(err_u64[1], err1); + + coeff += 16; + dqcoeff += 16; + block_size -= 16; } while (block_size != 0); - return vgetq_lane_s64(error, 0) + vgetq_lane_s64(error, 1); + return horizontal_add_uint64x2(vaddq_u64(err_u64[0], err_u64[1])); } diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk index ae8fb85d87..cccaea712e 100644 --- a/vp9/vp9cx.mk +++ b/vp9/vp9cx.mk @@ -136,9 +136,7 @@ endif VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_error_avx2.c -ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes) VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_error_neon.c -endif VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_frame_scale_neon.c VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_quantize_neon.c diff --git a/vpx_dsp/arm/sum_neon.h b/vpx_dsp/arm/sum_neon.h index 47748a8061..6f513ca7a8 100644 --- a/vpx_dsp/arm/sum_neon.h +++ b/vpx_dsp/arm/sum_neon.h @@ -127,4 +127,12 @@ static INLINE uint64_t horizontal_add_int64x2(const int64x2_t a) { #endif } +static INLINE uint64_t horizontal_add_uint64x2(const uint64x2_t a) { +#if defined(__aarch64__) + return vaddvq_u64(a); +#else + return vgetq_lane_u64(a, 0) + vgetq_lane_u64(a, 1); +#endif +} + #endif // VPX_VPX_DSP_ARM_SUM_NEON_H_ From 57c6ea97522146e9471a3537304ce8a0a7a22ea0 Mon Sep 17 00:00:00 2001 From: Salome Thirot Date: Fri, 3 Mar 2023 10:53:27 +0000 Subject: [PATCH 0661/1000] Fix return type of horizontal_add_int64x2 helper horizontal_add_int64x2 was incorrectly returning a uint64_t instead of an int64_t. This patch fixes that. Change-Id: Ic6016cf87aebfc6a14f540b784d6648757e12b49 --- vpx_dsp/arm/highbd_variance_neon.c | 2 +- vpx_dsp/arm/sum_neon.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vpx_dsp/arm/highbd_variance_neon.c b/vpx_dsp/arm/highbd_variance_neon.c index d0b366c95b..75fde676a0 100644 --- a/vpx_dsp/arm/highbd_variance_neon.c +++ b/vpx_dsp/arm/highbd_variance_neon.c @@ -167,7 +167,7 @@ static INLINE void highbd_variance_xlarge_neon( } while (i < h); *sum = horizontal_add_int32x4(sum_s32); - *sse = horizontal_add_int64x2(sse_s64); + *sse = (uint64_t)horizontal_add_int64x2(sse_s64); } static INLINE void highbd_variance_32xh_xlarge_neon( diff --git a/vpx_dsp/arm/sum_neon.h b/vpx_dsp/arm/sum_neon.h index 6f513ca7a8..8291f07296 100644 --- a/vpx_dsp/arm/sum_neon.h +++ b/vpx_dsp/arm/sum_neon.h @@ -119,7 +119,7 @@ static INLINE uint64_t horizontal_long_add_uint32x4(const uint32x4_t a) { #endif } -static INLINE uint64_t horizontal_add_int64x2(const int64x2_t a) { +static INLINE int64_t horizontal_add_int64x2(const int64x2_t a) { #if defined(__aarch64__) return vaddvq_s64(a); #else From eec48083936b52bc0ec9adfc452d29b177366d75 Mon Sep 17 00:00:00 2001 From: Salome Thirot Date: Mon, 6 Mar 2023 11:37:26 +0000 Subject: [PATCH 0662/1000] Add Neon implementation of vp9_block_error_c Add Neon implementation of vp9_block_error_c as well as the corresponding tests. Change-Id: I79247b5ae24f51b7b55fc5e517d5e403dc86367a --- test/test.mk | 2 +- test/vp9_block_error_test.cc | 8 +++++ vp9/common/vp9_rtcd_defs.pl | 4 +-- vp9/encoder/arm/neon/vp9_error_neon.c | 47 +++++++++++++++++++++++++++ 4 files changed, 58 insertions(+), 3 deletions(-) diff --git a/test/test.mk b/test/test.mk index f60d8f823f..3c225bc750 100644 --- a/test/test.mk +++ b/test/test.mk @@ -179,7 +179,7 @@ ifneq ($(CONFIG_REALTIME_ONLY),yes) LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += yuv_temporal_filter_test.cc endif LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += variance_test.cc -ifneq (, $(filter yes, $(HAVE_SSE2) $(HAVE_AVX2))) +ifneq (, $(filter yes, $(HAVE_SSE2) $(HAVE_AVX2) $(HAVE_NEON))) LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_block_error_test.cc endif LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_quantize_test.cc diff --git a/test/vp9_block_error_test.cc b/test/vp9_block_error_test.cc index b93b014e65..bde84cd619 100644 --- a/test/vp9_block_error_test.cc +++ b/test/vp9_block_error_test.cc @@ -197,4 +197,12 @@ INSTANTIATE_TEST_SUITE_P( &BlockError8BitWrapper, VPX_BITS_8))); #endif // HAVE_AVX2 + +#if HAVE_NEON +INSTANTIATE_TEST_SUITE_P( + NEON, BlockErrorTest, + ::testing::Values(make_tuple(&BlockError8BitWrapper, + &BlockError8BitWrapper, + VPX_BITS_8))); +#endif } // namespace diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index c939411a3c..2f9870dd48 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -136,12 +136,12 @@ () specialize qw/vp9_quantize_fp_32x32 neon ssse3 avx2 vsx/; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { - specialize qw/vp9_block_error avx2 sse2/; + specialize qw/vp9_block_error neon avx2 sse2/; add_proto qw/int64_t vp9_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd"; specialize qw/vp9_highbd_block_error sse2/; } else { - specialize qw/vp9_block_error avx2 msa sse2/; + specialize qw/vp9_block_error neon avx2 msa sse2/; } # fdct functions diff --git a/vp9/encoder/arm/neon/vp9_error_neon.c b/vp9/encoder/arm/neon/vp9_error_neon.c index eb1e2e03d0..0cf0bf250e 100644 --- a/vp9/encoder/arm/neon/vp9_error_neon.c +++ b/vp9/encoder/arm/neon/vp9_error_neon.c @@ -15,6 +15,53 @@ #include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/sum_neon.h" +int64_t vp9_block_error_neon(const tran_low_t *coeff, const tran_low_t *dqcoeff, + intptr_t block_size, int64_t *ssz) { + uint64x2_t err_u64 = vdupq_n_u64(0); + int64x2_t ssz_s64 = vdupq_n_s64(0); + + assert(block_size >= 16); + assert((block_size % 16) == 0); + + do { + uint32x4_t err; + int32x4_t ssz0, ssz1; + + const int16x8_t c0 = load_tran_low_to_s16q(coeff); + const int16x8_t c1 = load_tran_low_to_s16q(coeff + 8); + const int16x8_t d0 = load_tran_low_to_s16q(dqcoeff); + const int16x8_t d1 = load_tran_low_to_s16q(dqcoeff + 8); + + const uint16x8_t diff0 = vreinterpretq_u16_s16(vabdq_s16(c0, d0)); + const uint16x8_t diff1 = vreinterpretq_u16_s16(vabdq_s16(c1, d1)); + + // diff is 15-bits, the squares 30, so we can store 4 in 32-bits before + // accumulating them in 64-bits. + err = vmull_u16(vget_low_u16(diff0), vget_low_u16(diff0)); + err = vmlal_u16(err, vget_high_u16(diff0), vget_high_u16(diff0)); + err = vmlal_u16(err, vget_low_u16(diff1), vget_low_u16(diff1)); + err = vmlal_u16(err, vget_high_u16(diff1), vget_high_u16(diff1)); + err_u64 = vpadalq_u32(err_u64, err); + + // We can't do the same here as we're operating on signed integers, so we + // can store 2 15-bit diff before accumulating into 64-bits. + ssz0 = vmull_s16(vget_low_s16(c0), vget_low_s16(c0)); + ssz0 = vmlal_s16(ssz0, vget_high_s16(c0), vget_high_s16(c0)); + ssz_s64 = vpadalq_s32(ssz_s64, ssz0); + + ssz1 = vmull_s16(vget_low_s16(c1), vget_low_s16(c1)); + ssz1 = vmlal_s16(ssz1, vget_high_s16(c1), vget_high_s16(c1)); + ssz_s64 = vpadalq_s32(ssz_s64, ssz1); + + coeff += 16; + dqcoeff += 16; + block_size -= 16; + } while (block_size != 0); + + *ssz = horizontal_add_int64x2(ssz_s64); + return (int64_t)horizontal_add_uint64x2(err_u64); +} + int64_t vp9_block_error_fp_neon(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size) { uint64x2_t err_u64[2] = { vdupq_n_u64(0), vdupq_n_u64(0) }; From b7fabadc5d230dd78922b048a39a15c038429782 Mon Sep 17 00:00:00 2001 From: Anupam Pandey Date: Thu, 2 Mar 2023 10:58:27 +0530 Subject: [PATCH 0663/1000] Add AVX2 for vpx_filter_block1d8_h8() function Introduced AVX2 intrinsic to compute convolve horizontal for w = 8 case. This is a bit-exact change. Instruction Count cpu Resolution Reduction(%) 0 LOWRES2 1.509 0 MIDRES2 1.165 0 HDRES2 0.898 0 Average 1.191 Change-Id: I699c94aa3d7ea74c58f901df906eed0b81b4ee79 --- vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c | 81 ++++++++++++++++++++++- 1 file changed, 79 insertions(+), 2 deletions(-) diff --git a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c index 841db7cd71..26e82f9b73 100644 --- a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c +++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c @@ -15,6 +15,7 @@ #include "vpx_dsp/x86/convolve.h" #include "vpx_dsp/x86/convolve_avx2.h" #include "vpx_dsp/x86/convolve_sse2.h" +#include "vpx_dsp/x86/convolve_ssse3.h" #include "vpx_ports/mem.h" // filters for 16_h8 @@ -38,6 +39,31 @@ DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = { 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 }; +#define CALC_CONVOLVE8_HORZ_ROW \ + srcReg = xx_loadu2_mi128(src_ptr - 3 + src_pitch, src_ptr - 3); \ + s1[0] = _mm256_shuffle_epi8(srcReg, filt[0]); \ + s1[1] = _mm256_shuffle_epi8(srcReg, filt[1]); \ + s1[2] = _mm256_shuffle_epi8(srcReg, filt[2]); \ + s1[3] = _mm256_shuffle_epi8(srcReg, filt[3]); \ + s1[0] = convolve8_16_avx2(s1, f1); \ + s1[0] = _mm256_packus_epi16(s1[0], s1[0]); \ + src_ptr += src_stride; \ + _mm_storel_epi64((__m128i *)&output_ptr[0], _mm256_castsi256_si128(s1[0])); \ + output_ptr += output_pitch; \ + _mm_storel_epi64((__m128i *)&output_ptr[0], \ + _mm256_extractf128_si256(s1[0], 1)); \ + output_ptr += output_pitch; + +// 0 0 0 0 hi3 hi2 hi1 hi0 | 0 0 0 0 lo3 lo2 lo1 lo0 +static INLINE __m256i xx_loadu2_mi128(const void *hi, const void *lo) { + // 0 0 0 0 0 0 0 0 | 0 0 0 0 lo3 lo2 lo1 lo0 + __m256i a = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(lo))); + + // 0 0 0 0 hi3 hi2 hi1 hi0 | 0 0 0 0 lo3 lo2 lo1 lo0 + a = _mm256_inserti128_si256(a, _mm_loadu_si128((const __m128i *)(hi)), 1); + return a; +} + static INLINE void vpx_filter_block1d16_h8_x_avx2( const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter, @@ -177,6 +203,59 @@ static void vpx_filter_block1d16_h8_avg_avx2( output_height, filter, 1); } +static void vpx_filter_block1d8_h8_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, + ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { + __m256i filt[4], f1[4], s1[4], srcReg; + __m128i f[4], s[4]; + int y = output_height; + + // Multiply the size of the source stride by two + const ptrdiff_t src_stride = src_pitch << 1; + + shuffle_filter_avx2(filter, f1); + filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2); + filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2); + filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2); + filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2); + + // Process next 4 rows + while (y > 3) { + CALC_CONVOLVE8_HORZ_ROW + CALC_CONVOLVE8_HORZ_ROW + y -= 4; + } + + // If remaining, then process 2 rows at a time + while (y > 1) { + CALC_CONVOLVE8_HORZ_ROW + y -= 2; + } + + // For the remaining height. + if (y > 0) { + const __m128i srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); + + f[0] = _mm256_castsi256_si128(f1[0]); + f[1] = _mm256_castsi256_si128(f1[1]); + f[2] = _mm256_castsi256_si128(f1[2]); + f[3] = _mm256_castsi256_si128(f1[3]); + + // filter the source buffer + s[0] = _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[0])); + s[1] = _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[1])); + s[2] = _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[2])); + s[3] = _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[3])); + s[0] = convolve8_8_ssse3(s, f); + + // Saturate 16bit value to 8bit. + s[0] = _mm_packus_epi16(s[0], s[0]); + + // Save only 8 bytes + _mm_storel_epi64((__m128i *)&output_ptr[0], s[0]); + } +} + static INLINE void vpx_filter_block1d16_v8_x_avx2( const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter, @@ -870,14 +949,12 @@ filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3; filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3; filter8_1dfunction vpx_filter_block1d4_h8_intrin_ssse3; #define vpx_filter_block1d8_v8_avx2 vpx_filter_block1d8_v8_intrin_ssse3 -#define vpx_filter_block1d8_h8_avx2 vpx_filter_block1d8_h8_intrin_ssse3 #define vpx_filter_block1d4_h8_avx2 vpx_filter_block1d4_h8_intrin_ssse3 #else // VPX_ARCH_X86 filter8_1dfunction vpx_filter_block1d8_v8_ssse3; filter8_1dfunction vpx_filter_block1d8_h8_ssse3; filter8_1dfunction vpx_filter_block1d4_h8_ssse3; #define vpx_filter_block1d8_v8_avx2 vpx_filter_block1d8_v8_ssse3 -#define vpx_filter_block1d8_h8_avx2 vpx_filter_block1d8_h8_ssse3 #define vpx_filter_block1d4_h8_avx2 vpx_filter_block1d4_h8_ssse3 #endif // VPX_ARCH_X86_64 filter8_1dfunction vpx_filter_block1d8_v8_avg_ssse3; From e33d4c276d71d982987d564cd9b962227d1631b6 Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 7 Mar 2023 22:09:37 -0800 Subject: [PATCH 0664/1000] disable vpx_highbd_*_sub_pixel_variance4x{4,8}_neon vpx_highbd_8_sub_pixel_variance4x4_neon vpx_highbd_8_sub_pixel_variance4x8_neon vpx_highbd_10_sub_pixel_variance4x4_neon vpx_highbd_10_sub_pixel_variance4x8_neon vpx_highbd_12_sub_pixel_variance4x4_neon vpx_highbd_12_sub_pixel_variance4x8_neon all cause heap overflows of the form: [ RUN ] NEON/VpxHBDSubpelVarianceTest.Ref/24 ================================================================= ==450528==ERROR: AddressSanitizer: heap-buffer-overflow on address 0xffff8311a571 at pc 0x0000010ca52c bp 0xffffc63e96b0 sp 0xffffc63e96a8 READ of size 8 at 0xffff8311a571 thread T0 #0 0x10ca528 in load_unaligned_u16q vpx_dsp/arm/mem_neon.h:176:3 #1 0x10ca528 in highbd_var_filter_block2d_bil_w4 vpx_dsp/arm/highbd_subpel_variance_neon.c:49:21 #2 0x10ca528 in vpx_highbd_10_sub_pixel_variance4x8_neon vpx_dsp/arm/highbd_subpel_variance_neon.c:257:1 ... 0xffff8311a571 is located 0 bytes to the right of 113-byte region [0xffff8311a500,0xffff8311a571) allocated by thread T0 here: #0 0x5f18b0 in malloc (test_libvpx+0x5f18b0) #1 0xce4f90 in vpx_memalign vpx_mem/vpx_mem.c:62:10 #2 0xce4f90 in vpx_malloc vpx_mem/vpx_mem.c:70:40 #3 0xa4ad44 in (anonymous namespace)::SubpelVarianceTest::SetUp() test/variance_test.cc:586:14 Bug: webm:1796 Change-Id: I39f7f936bae2bcbbe1f803fb10375ec02d1c1277 --- test/variance_test.cc | 14 +++++++++---- vpx_dsp/arm/highbd_subpel_variance_neon.c | 15 ++++++++------ vpx_dsp/vpx_dsp_rtcd_defs.pl | 24 +++++++++++++++++------ 3 files changed, 37 insertions(+), 16 deletions(-) diff --git a/test/variance_test.cc b/test/variance_test.cc index 1359bc4baf..144c2d2901 100644 --- a/test/variance_test.cc +++ b/test/variance_test.cc @@ -1591,10 +1591,12 @@ INSTANTIATE_TEST_SUITE_P( 12), SubpelVarianceParams(3, 2, &vpx_highbd_12_sub_pixel_variance8x4_neon, 12), + /*TODO(https://crbug.com/webm/1796): enable after heap overflow is + fixed. SubpelVarianceParams(2, 3, &vpx_highbd_12_sub_pixel_variance4x8_neon, 12), SubpelVarianceParams(2, 2, &vpx_highbd_12_sub_pixel_variance4x4_neon, - 12), + 12),*/ SubpelVarianceParams(6, 6, &vpx_highbd_10_sub_pixel_variance64x64_neon, 10), SubpelVarianceParams(6, 5, &vpx_highbd_10_sub_pixel_variance64x32_neon, @@ -1617,10 +1619,12 @@ INSTANTIATE_TEST_SUITE_P( 10), SubpelVarianceParams(3, 2, &vpx_highbd_10_sub_pixel_variance8x4_neon, 10), + /*TODO(https://crbug.com/webm/1796): enable after heap overflow is + fixed. SubpelVarianceParams(2, 3, &vpx_highbd_10_sub_pixel_variance4x8_neon, 10), SubpelVarianceParams(2, 2, &vpx_highbd_10_sub_pixel_variance4x4_neon, - 10), + 10),*/ SubpelVarianceParams(6, 6, &vpx_highbd_8_sub_pixel_variance64x64_neon, 8), SubpelVarianceParams(6, 5, &vpx_highbd_8_sub_pixel_variance64x32_neon, @@ -1640,10 +1644,12 @@ INSTANTIATE_TEST_SUITE_P( SubpelVarianceParams(3, 4, &vpx_highbd_8_sub_pixel_variance8x16_neon, 8), SubpelVarianceParams(3, 3, &vpx_highbd_8_sub_pixel_variance8x8_neon, 8), - SubpelVarianceParams(3, 2, &vpx_highbd_8_sub_pixel_variance8x4_neon, 8), + SubpelVarianceParams(3, 2, &vpx_highbd_8_sub_pixel_variance8x4_neon, 8) + /*TODO(https://crbug.com/webm/1796): enable after heap overflow is + fixed. SubpelVarianceParams(2, 3, &vpx_highbd_8_sub_pixel_variance4x8_neon, 8), SubpelVarianceParams(2, 2, &vpx_highbd_8_sub_pixel_variance4x4_neon, - 8))); + 8)*/)); INSTANTIATE_TEST_SUITE_P( NEON, VpxHBDSubpelAvgVarianceTest, diff --git a/vpx_dsp/arm/highbd_subpel_variance_neon.c b/vpx_dsp/arm/highbd_subpel_variance_neon.c index aa64697458..c081d520ab 100644 --- a/vpx_dsp/arm/highbd_subpel_variance_neon.c +++ b/vpx_dsp/arm/highbd_subpel_variance_neon.c @@ -234,8 +234,9 @@ static void highbd_var_filter_block2d_avg(const uint16_t *src_ptr, // padding. // 8-bit -HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 4, 2) -HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 8, 2) +// TODO(https://crbug.com/webm/1796): enable after heap overflow is fixed. +// HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 4, 2) +// HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 8, 2) HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 4, 1) HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 8, 1) @@ -253,8 +254,9 @@ HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 32, 1) HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 64, 1) // 10-bit -HBD_SUBPEL_VARIANCE_WXH_NEON(10, 4, 4, 2) -HBD_SUBPEL_VARIANCE_WXH_NEON(10, 4, 8, 2) +// TODO(https://crbug.com/webm/1796): enable after heap overflow is fixed. +// HBD_SUBPEL_VARIANCE_WXH_NEON(10, 4, 4, 2) +// HBD_SUBPEL_VARIANCE_WXH_NEON(10, 4, 8, 2) HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 4, 1) HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 8, 1) @@ -272,8 +274,9 @@ HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 32, 1) HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 64, 1) // 12-bit -HBD_SUBPEL_VARIANCE_WXH_NEON(12, 4, 4, 2) -HBD_SUBPEL_VARIANCE_WXH_NEON(12, 4, 8, 2) +// TODO(https://crbug.com/webm/1796): enable after heap overflow is fixed. +// HBD_SUBPEL_VARIANCE_WXH_NEON(12, 4, 4, 2) +// HBD_SUBPEL_VARIANCE_WXH_NEON(12, 4, 8, 2) HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 4, 1) HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 8, 1) diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index ad8ff6e184..ad46499748 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -1443,9 +1443,13 @@ () specialize qw/vpx_highbd_12_sub_pixel_variance8x4 sse2 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_12_sub_pixel_variance4x8 neon/; + # TODO(https://crbug.com/webm/1796): enable neon after heap overflow is + # fixed. + # specialize qw/vpx_highbd_12_sub_pixel_variance4x8 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_12_sub_pixel_variance4x4 neon/; + # TODO(https://crbug.com/webm/1796): enable neon after heap overflow is + # fixed. + # specialize qw/vpx_highbd_12_sub_pixel_variance4x4 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_10_sub_pixel_variance64x64 sse2 neon/; @@ -1481,9 +1485,13 @@ () specialize qw/vpx_highbd_10_sub_pixel_variance8x4 sse2 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_10_sub_pixel_variance4x8 neon/; + # TODO(https://crbug.com/webm/1796): enable neon after heap overflow is + # fixed. + # specialize qw/vpx_highbd_10_sub_pixel_variance4x8 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_10_sub_pixel_variance4x4 neon/; + # TODO(https://crbug.com/webm/1796): enable neon after heap overflow is + # fixed. + # specialize qw/vpx_highbd_10_sub_pixel_variance4x4 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_8_sub_pixel_variance64x64 sse2 neon/; @@ -1519,9 +1527,13 @@ () specialize qw/vpx_highbd_8_sub_pixel_variance8x4 sse2 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_8_sub_pixel_variance4x8 neon/; + # TODO(https://crbug.com/webm/1796): enable neon after heap overflow is + # fixed. + # specialize qw/vpx_highbd_8_sub_pixel_variance4x8 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_8_sub_pixel_variance4x4 neon/; + # TODO(https://crbug.com/webm/1796): enable neon after heap overflow is + # fixed. + # specialize qw/vpx_highbd_8_sub_pixel_variance4x4 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_12_sub_pixel_avg_variance64x64 sse2 neon/; From a47967700d5d6fe4172c7fa48ad95b3e4ba39982 Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 8 Mar 2023 13:17:17 -0800 Subject: [PATCH 0665/1000] disable vpx_highbd_*_sub_pixel_avg_variance4x{4,8}_neon vpx_highbd_8_sub_pixel_avg_variance4x4_neon vpx_highbd_8_sub_pixel_avg_variance4x8_neon vpx_highbd_10_sub_pixel_avg_variance4x4_neon vpx_highbd_10_sub_pixel_avg_variance4x8_neon vpx_highbd_12_sub_pixel_avg_variance4x4_neon vpx_highbd_12_sub_pixel_avg_variance4x8_neon all cause heap overflows of the form: i[ RUN ] NEON/VpxHBDSubpelAvgVarianceTest.Ref/33 ================================================================= ==535205==ERROR: AddressSanitizer: heap-buffer-overflow on address 0xffff95bb0b89 at pc 0x00000116dabc bp 0xffffd09f6430 sp 0xffffd09f6428 READ of size 8 at 0xffff95bb0b89 thread T0 #0 0x116dab8 in load_unaligned_u16q vpx_dsp/arm/mem_neon.h:176:3 #1 0x116dab8 in highbd_var_filter_block2d_bil_w4 vpx_dsp/arm/highbd_subpel_variance_neon.c:49:21 #2 0x116dab8 in vpx_highbd_8_sub_pixel_avg_variance4x4_neon vpx_dsp/arm/highbd_subpel_variance_neon.c:543:1 ... 0xffff95bb0b89 is located 0 bytes to the right of 73-byte region [0xffff95bb0b40,0xffff95bb0b89) allocated by thread T0 here: #0 0x5f18b0 in malloc (test_libvpx+0x5f18b0) #1 0xce4a40 in vpx_memalign vpx_mem/vpx_mem.c:62:10 #2 0xce4a40 in vpx_malloc vpx_mem/vpx_mem.c:70:40 #3 0xa52238 in (anonymous namespace)::SubpelVarianceTest::SetUp() test/variance_test.cc:586:14 ... This is the same issue as: e33d4c276 disable vpx_highbd_*_sub_pixel_variance4x{4,8}_neon They have highbd_var_filter_block2d_bil_w4 in common. Bug: webm:1796 Change-Id: I3ed70d0ba22e127720542612ea9f6665948eedfc --- test/variance_test.cc | 14 +++++++++---- vpx_dsp/arm/highbd_subpel_variance_neon.c | 21 ++++++++++++++------ vpx_dsp/vpx_dsp_rtcd_defs.pl | 24 +++++++++++++++++------ 3 files changed, 43 insertions(+), 16 deletions(-) diff --git a/test/variance_test.cc b/test/variance_test.cc index 144c2d2901..8af26969c6 100644 --- a/test/variance_test.cc +++ b/test/variance_test.cc @@ -1687,12 +1687,14 @@ INSTANTIATE_TEST_SUITE_P( SubpelAvgVarianceParams(3, 2, &vpx_highbd_12_sub_pixel_avg_variance8x4_neon, 12), + /*TODO(https://crbug.com/webm/1796): enable after heap overflow is + fixed. SubpelAvgVarianceParams(2, 3, &vpx_highbd_12_sub_pixel_avg_variance4x8_neon, 12), SubpelAvgVarianceParams(2, 2, &vpx_highbd_12_sub_pixel_avg_variance4x4_neon, - 12), + 12),*/ SubpelAvgVarianceParams(6, 6, &vpx_highbd_10_sub_pixel_avg_variance64x64_neon, 10), @@ -1726,12 +1728,14 @@ INSTANTIATE_TEST_SUITE_P( SubpelAvgVarianceParams(3, 2, &vpx_highbd_10_sub_pixel_avg_variance8x4_neon, 10), + /*TODO(https://crbug.com/webm/1796): enable after heap overflow is + fixed. SubpelAvgVarianceParams(2, 3, &vpx_highbd_10_sub_pixel_avg_variance4x8_neon, 10), SubpelAvgVarianceParams(2, 2, &vpx_highbd_10_sub_pixel_avg_variance4x4_neon, - 10), + 10),*/ SubpelAvgVarianceParams(6, 6, &vpx_highbd_8_sub_pixel_avg_variance64x64_neon, 8), @@ -1764,13 +1768,15 @@ INSTANTIATE_TEST_SUITE_P( 8), SubpelAvgVarianceParams(3, 2, &vpx_highbd_8_sub_pixel_avg_variance8x4_neon, - 8), + 8) + /*TODO(https://crbug.com/webm/1796): enable after heap overflow is + fixed. SubpelAvgVarianceParams(2, 3, &vpx_highbd_8_sub_pixel_avg_variance4x8_neon, 8), SubpelAvgVarianceParams(2, 2, &vpx_highbd_8_sub_pixel_avg_variance4x4_neon, - 8))); + 8)*/)); #endif // CONFIG_VP9_HIGHBITDEPTH #endif // HAVE_NEON diff --git a/vpx_dsp/arm/highbd_subpel_variance_neon.c b/vpx_dsp/arm/highbd_subpel_variance_neon.c index c081d520ab..b2fe9921c9 100644 --- a/vpx_dsp/arm/highbd_subpel_variance_neon.c +++ b/vpx_dsp/arm/highbd_subpel_variance_neon.c @@ -37,6 +37,8 @@ // 15-bit.) // Process a block exactly 4 wide and a multiple of 2 high. +// TODO(https://crbug.com/webm/1796): enable after heap overflow is fixed. +#if 0 static void highbd_var_filter_block2d_bil_w4(const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step, int dst_height, @@ -60,6 +62,7 @@ static void highbd_var_filter_block2d_bil_w4(const uint16_t *src_ptr, i -= 2; } while (i != 0); } +#endif // 0 // Process a block which is a multiple of 8 and any height. static void highbd_var_filter_block2d_bil_large(const uint16_t *src_ptr, @@ -295,6 +298,8 @@ HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 64, 1) // Combine bilinear filter with vpx_highbd_comp_avg_pred for blocks having // width 4. +// TODO(https://crbug.com/webm/1796): enable after heap overflow is fixed. +#if 0 static void highbd_avg_pred_var_filter_block2d_bil_w4( const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step, int dst_height, int filter_offset, const uint16_t *second_pred) { @@ -319,6 +324,7 @@ static void highbd_avg_pred_var_filter_block2d_bil_w4( i -= 2; } while (i != 0); } +#endif // 0 // Combine bilinear filter with vpx_highbd_comp_avg_pred for large blocks. static void highbd_avg_pred_var_filter_block2d_bil_large( @@ -540,8 +546,9 @@ static void highbd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr, // padding. // 8-bit -HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 4, 2) -HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 8, 2) +// TODO(https://crbug.com/webm/1796): enable after heap overflow is fixed. +// HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 4, 2) +// HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 8, 2) HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 4, 1) HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 8, 1) @@ -559,8 +566,9 @@ HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 32, 1) HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 64, 1) // 10-bit -HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 4, 2) -HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 8, 2) +// TODO(https://crbug.com/webm/1796): enable after heap overflow is fixed. +// HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 4, 2) +// HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 8, 2) HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 4, 1) HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 8, 1) @@ -578,8 +586,9 @@ HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 32, 1) HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 64, 1) // 12-bit -HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 4, 2) -HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 8, 2) +// TODO(https://crbug.com/webm/1796): enable after heap overflow is fixed. +// HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 4, 2) +// HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 8, 2) HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 4, 1) HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 8, 1) diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index ad46499748..62f4789c2c 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -1569,9 +1569,13 @@ () specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x4 sse2 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_12_sub_pixel_avg_variance4x8 neon/; + # TODO(https://crbug.com/webm/1796): enable neon after heap overflow is + # fixed. + # specialize qw/vpx_highbd_12_sub_pixel_avg_variance4x8 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_12_sub_pixel_avg_variance4x4 neon/; + # TODO(https://crbug.com/webm/1796): enable neon after heap overflow is + # fixed. + # specialize qw/vpx_highbd_12_sub_pixel_avg_variance4x4 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_10_sub_pixel_avg_variance64x64 sse2 neon/; @@ -1607,9 +1611,13 @@ () specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x4 sse2 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_10_sub_pixel_avg_variance4x8 neon/; + # TODO(https://crbug.com/webm/1796): enable neon after heap overflow is + # fixed. + # specialize qw/vpx_highbd_10_sub_pixel_avg_variance4x8 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_10_sub_pixel_avg_variance4x4 neon/; + # TODO(https://crbug.com/webm/1796): enable neon after heap overflow is + # fixed. + # specialize qw/vpx_highbd_10_sub_pixel_avg_variance4x4 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_8_sub_pixel_avg_variance64x64 sse2 neon/; @@ -1645,9 +1653,13 @@ () specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x4 sse2 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_8_sub_pixel_avg_variance4x8 neon/; + # TODO(https://crbug.com/webm/1796): enable neon after heap overflow is + # fixed. + # specialize qw/vpx_highbd_8_sub_pixel_avg_variance4x8 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_8_sub_pixel_avg_variance4x4 neon/; + # TODO(https://crbug.com/webm/1796): enable neon after heap overflow is + # fixed. + # specialize qw/vpx_highbd_8_sub_pixel_avg_variance4x4 neon/; } # CONFIG_VP9_HIGHBITDEPTH From eab52a4f3c5bece97b8a2656553903aacd8f7ab4 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Wed, 8 Mar 2023 16:34:20 +0000 Subject: [PATCH 0666/1000] Fix buffer overrun in highbd Neon subpel variance filters The high bitdepth Neon code applying the first pass of the bilinear filter for subpixel variance on blocks of width 4 processed two rows at a time. This resulted in a source buffer overread, attempting to produce two rows of padding for the second (vertical) pass of the bilinear filter. This patch modifies highbd_var_filter_block2d_bil_w4 and highbd_avg_pred_var_filter_block2d_bil_w4 such that they only process a single row per iteration, and only require a single row of padding for the second pass. This prevents the buffer overread. Since all block sizes are now processed one row at a time, there is no need for a "padding" macro parameter - the value is always 1, with no special case for 4xh blocks. As well as re-enabling the Neon paths and their associated tests, we remove the now-redundant 'padding' macro parameter. Bug: webm:1796 Change-Id: Icd6076b38eb4476139795bb1734ca800c9edf079 --- test/variance_test.cc | 28 +- vpx_dsp/arm/highbd_subpel_variance_neon.c | 302 ++++++++++------------ vpx_dsp/arm/mem_neon.h | 9 + vpx_dsp/vpx_dsp_rtcd_defs.pl | 48 +--- 4 files changed, 170 insertions(+), 217 deletions(-) diff --git a/test/variance_test.cc b/test/variance_test.cc index 8af26969c6..1359bc4baf 100644 --- a/test/variance_test.cc +++ b/test/variance_test.cc @@ -1591,12 +1591,10 @@ INSTANTIATE_TEST_SUITE_P( 12), SubpelVarianceParams(3, 2, &vpx_highbd_12_sub_pixel_variance8x4_neon, 12), - /*TODO(https://crbug.com/webm/1796): enable after heap overflow is - fixed. SubpelVarianceParams(2, 3, &vpx_highbd_12_sub_pixel_variance4x8_neon, 12), SubpelVarianceParams(2, 2, &vpx_highbd_12_sub_pixel_variance4x4_neon, - 12),*/ + 12), SubpelVarianceParams(6, 6, &vpx_highbd_10_sub_pixel_variance64x64_neon, 10), SubpelVarianceParams(6, 5, &vpx_highbd_10_sub_pixel_variance64x32_neon, @@ -1619,12 +1617,10 @@ INSTANTIATE_TEST_SUITE_P( 10), SubpelVarianceParams(3, 2, &vpx_highbd_10_sub_pixel_variance8x4_neon, 10), - /*TODO(https://crbug.com/webm/1796): enable after heap overflow is - fixed. SubpelVarianceParams(2, 3, &vpx_highbd_10_sub_pixel_variance4x8_neon, 10), SubpelVarianceParams(2, 2, &vpx_highbd_10_sub_pixel_variance4x4_neon, - 10),*/ + 10), SubpelVarianceParams(6, 6, &vpx_highbd_8_sub_pixel_variance64x64_neon, 8), SubpelVarianceParams(6, 5, &vpx_highbd_8_sub_pixel_variance64x32_neon, @@ -1644,12 +1640,10 @@ INSTANTIATE_TEST_SUITE_P( SubpelVarianceParams(3, 4, &vpx_highbd_8_sub_pixel_variance8x16_neon, 8), SubpelVarianceParams(3, 3, &vpx_highbd_8_sub_pixel_variance8x8_neon, 8), - SubpelVarianceParams(3, 2, &vpx_highbd_8_sub_pixel_variance8x4_neon, 8) - /*TODO(https://crbug.com/webm/1796): enable after heap overflow is - fixed. + SubpelVarianceParams(3, 2, &vpx_highbd_8_sub_pixel_variance8x4_neon, 8), SubpelVarianceParams(2, 3, &vpx_highbd_8_sub_pixel_variance4x8_neon, 8), SubpelVarianceParams(2, 2, &vpx_highbd_8_sub_pixel_variance4x4_neon, - 8)*/)); + 8))); INSTANTIATE_TEST_SUITE_P( NEON, VpxHBDSubpelAvgVarianceTest, @@ -1687,14 +1681,12 @@ INSTANTIATE_TEST_SUITE_P( SubpelAvgVarianceParams(3, 2, &vpx_highbd_12_sub_pixel_avg_variance8x4_neon, 12), - /*TODO(https://crbug.com/webm/1796): enable after heap overflow is - fixed. SubpelAvgVarianceParams(2, 3, &vpx_highbd_12_sub_pixel_avg_variance4x8_neon, 12), SubpelAvgVarianceParams(2, 2, &vpx_highbd_12_sub_pixel_avg_variance4x4_neon, - 12),*/ + 12), SubpelAvgVarianceParams(6, 6, &vpx_highbd_10_sub_pixel_avg_variance64x64_neon, 10), @@ -1728,14 +1720,12 @@ INSTANTIATE_TEST_SUITE_P( SubpelAvgVarianceParams(3, 2, &vpx_highbd_10_sub_pixel_avg_variance8x4_neon, 10), - /*TODO(https://crbug.com/webm/1796): enable after heap overflow is - fixed. SubpelAvgVarianceParams(2, 3, &vpx_highbd_10_sub_pixel_avg_variance4x8_neon, 10), SubpelAvgVarianceParams(2, 2, &vpx_highbd_10_sub_pixel_avg_variance4x4_neon, - 10),*/ + 10), SubpelAvgVarianceParams(6, 6, &vpx_highbd_8_sub_pixel_avg_variance64x64_neon, 8), @@ -1768,15 +1758,13 @@ INSTANTIATE_TEST_SUITE_P( 8), SubpelAvgVarianceParams(3, 2, &vpx_highbd_8_sub_pixel_avg_variance8x4_neon, - 8) - /*TODO(https://crbug.com/webm/1796): enable after heap overflow is - fixed. + 8), SubpelAvgVarianceParams(2, 3, &vpx_highbd_8_sub_pixel_avg_variance4x8_neon, 8), SubpelAvgVarianceParams(2, 2, &vpx_highbd_8_sub_pixel_avg_variance4x4_neon, - 8)*/)); + 8))); #endif // CONFIG_VP9_HIGHBITDEPTH #endif // HAVE_NEON diff --git a/vpx_dsp/arm/highbd_subpel_variance_neon.c b/vpx_dsp/arm/highbd_subpel_variance_neon.c index b2fe9921c9..683df5797a 100644 --- a/vpx_dsp/arm/highbd_subpel_variance_neon.c +++ b/vpx_dsp/arm/highbd_subpel_variance_neon.c @@ -36,33 +36,29 @@ // requiring double the number of data processing instructions. (12-bit * 8 = // 15-bit.) -// Process a block exactly 4 wide and a multiple of 2 high. -// TODO(https://crbug.com/webm/1796): enable after heap overflow is fixed. -#if 0 +// Process a block exactly 4 wide and any height. static void highbd_var_filter_block2d_bil_w4(const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step, int dst_height, int filter_offset) { - const uint16x8_t f0 = vdupq_n_u16(8 - filter_offset); - const uint16x8_t f1 = vdupq_n_u16(filter_offset); + const uint16x4_t f0 = vdup_n_u16(8 - filter_offset); + const uint16x4_t f1 = vdup_n_u16(filter_offset); int i = dst_height; do { - uint16x8_t s0 = load_unaligned_u16q(src_ptr, src_stride); - uint16x8_t s1 = load_unaligned_u16q(src_ptr + pixel_step, src_stride); + uint16x4_t s0 = load_unaligned_u16(src_ptr); + uint16x4_t s1 = load_unaligned_u16(src_ptr + pixel_step); - uint16x8_t blend = vmulq_u16(s0, f0); - blend = vmlaq_u16(blend, s1, f1); - blend = vrshrq_n_u16(blend, 3); + uint16x4_t blend = vmul_u16(s0, f0); + blend = vmla_u16(blend, s1, f1); + blend = vrshr_n_u16(blend, 3); - vst1q_u16(dst_ptr, blend); + vst1_u16(dst_ptr, blend); - src_ptr += 2 * src_stride; - dst_ptr += 8; - i -= 2; - } while (i != 0); + src_ptr += src_stride; + dst_ptr += 4; + } while (--i != 0); } -#endif // 0 // Process a block which is a multiple of 8 and any height. static void highbd_var_filter_block2d_bil_large(const uint16_t *src_ptr, @@ -148,23 +144,23 @@ static void highbd_var_filter_block2d_avg(const uint16_t *src_ptr, } while (--i != 0); } -#define HBD_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h, padding) \ +#define HBD_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h) \ unsigned int vpx_highbd_##bitdepth##_sub_pixel_variance##w##x##h##_neon( \ const uint8_t *src, int src_stride, int xoffset, int yoffset, \ const uint8_t *ref, int ref_stride, uint32_t *sse) { \ - uint16_t tmp0[w * (h + padding)]; \ + uint16_t tmp0[w * (h + 1)]; \ uint16_t tmp1[w * h]; \ uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \ \ - highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \ - (h + padding), xoffset); \ + highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, (h + 1), \ + xoffset); \ highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ \ return vpx_highbd_##bitdepth##_variance##w##x##h(CONVERT_TO_BYTEPTR(tmp1), \ w, ref, ref_stride, sse); \ } -#define HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h, padding) \ +#define HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h) \ unsigned int vpx_highbd_##bitdepth##_sub_pixel_variance##w##x##h##_neon( \ const uint8_t *src, int src_stride, int xoffset, int yoffset, \ const uint8_t *ref, int ref_stride, unsigned int *sse) { \ @@ -188,28 +184,28 @@ static void highbd_var_filter_block2d_avg(const uint16_t *src_ptr, CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \ } \ } else if (xoffset == 4) { \ - uint16_t tmp0[w * (h + padding)]; \ + uint16_t tmp0[w * (h + 1)]; \ if (yoffset == 0) { \ highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, h); \ return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \ } else if (yoffset == 4) { \ - uint16_t tmp1[w * (h + padding)]; \ + uint16_t tmp1[w * (h + 1)]; \ highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, \ - (h + padding)); \ + (h + 1)); \ highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \ return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ } else { \ - uint16_t tmp1[w * (h + padding)]; \ + uint16_t tmp1[w * (h + 1)]; \ highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, \ - (h + padding)); \ + (h + 1)); \ highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ } \ } else { \ - uint16_t tmp0[w * (h + padding)]; \ + uint16_t tmp0[w * (h + 1)]; \ if (yoffset == 0) { \ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, h, \ xoffset); \ @@ -218,14 +214,14 @@ static void highbd_var_filter_block2d_avg(const uint16_t *src_ptr, } else if (yoffset == 4) { \ uint16_t tmp1[w * h]; \ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \ - (h + padding), xoffset); \ + (h + 1), xoffset); \ highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \ return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ } else { \ uint16_t tmp1[w * h]; \ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \ - (h + padding), xoffset); \ + (h + 1), xoffset); \ highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ @@ -233,98 +229,88 @@ static void highbd_var_filter_block2d_avg(const uint16_t *src_ptr, } \ } -// 4x blocks are processed two rows at a time, so require an extra row of -// padding. - // 8-bit -// TODO(https://crbug.com/webm/1796): enable after heap overflow is fixed. -// HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 4, 2) -// HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 8, 2) +HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 4) +HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 8) -HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 4, 1) -HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 8, 1) -HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 16, 1) +HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 4) +HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 8) +HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 16) -HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 8, 1) -HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 16, 1) -HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 32, 1) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 8) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 16) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 32) -HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 16, 1) -HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 32, 1) -HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 64, 1) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 16) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 32) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 64) -HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 32, 1) -HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 64, 1) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 32) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 64) // 10-bit -// TODO(https://crbug.com/webm/1796): enable after heap overflow is fixed. -// HBD_SUBPEL_VARIANCE_WXH_NEON(10, 4, 4, 2) -// HBD_SUBPEL_VARIANCE_WXH_NEON(10, 4, 8, 2) +HBD_SUBPEL_VARIANCE_WXH_NEON(10, 4, 4) +HBD_SUBPEL_VARIANCE_WXH_NEON(10, 4, 8) -HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 4, 1) -HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 8, 1) -HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 16, 1) +HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 4) +HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 8) +HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 16) -HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 8, 1) -HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 16, 1) -HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 32, 1) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 8) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 16) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 32) -HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 16, 1) -HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 32, 1) -HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 64, 1) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 16) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 32) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 64) -HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 32, 1) -HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 64, 1) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 32) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 64) // 12-bit -// TODO(https://crbug.com/webm/1796): enable after heap overflow is fixed. -// HBD_SUBPEL_VARIANCE_WXH_NEON(12, 4, 4, 2) -// HBD_SUBPEL_VARIANCE_WXH_NEON(12, 4, 8, 2) +HBD_SUBPEL_VARIANCE_WXH_NEON(12, 4, 4) +HBD_SUBPEL_VARIANCE_WXH_NEON(12, 4, 8) -HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 4, 1) -HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 8, 1) -HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 16, 1) +HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 4) +HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 8) +HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 16) -HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 8, 1) -HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 16, 1) -HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 32, 1) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 8) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 16) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 32) -HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 16, 1) -HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 32, 1) -HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 64, 1) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 16) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 32) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 64) -HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 32, 1) -HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 64, 1) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 32) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 64) // Combine bilinear filter with vpx_highbd_comp_avg_pred for blocks having // width 4. -// TODO(https://crbug.com/webm/1796): enable after heap overflow is fixed. -#if 0 static void highbd_avg_pred_var_filter_block2d_bil_w4( const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step, int dst_height, int filter_offset, const uint16_t *second_pred) { - const uint16x8_t f0 = vdupq_n_u16(8 - filter_offset); - const uint16x8_t f1 = vdupq_n_u16(filter_offset); + const uint16x4_t f0 = vdup_n_u16(8 - filter_offset); + const uint16x4_t f1 = vdup_n_u16(filter_offset); int i = dst_height; do { - uint16x8_t s0 = load_unaligned_u16q(src_ptr, src_stride); - uint16x8_t s1 = load_unaligned_u16q(src_ptr + pixel_step, src_stride); - uint16x8_t p = vld1q_u16(second_pred); + uint16x4_t s0 = load_unaligned_u16(src_ptr); + uint16x4_t s1 = load_unaligned_u16(src_ptr + pixel_step); + uint16x4_t p = vld1_u16(second_pred); - uint16x8_t blend = vmulq_u16(s0, f0); - blend = vmlaq_u16(blend, s1, f1); - blend = vrshrq_n_u16(blend, 3); + uint16x4_t blend = vmul_u16(s0, f0); + blend = vmla_u16(blend, s1, f1); + blend = vrshr_n_u16(blend, 3); - vst1q_u16(dst_ptr, vrhaddq_u16(blend, p)); + vst1_u16(dst_ptr, vrhadd_u16(blend, p)); - src_ptr += 2 * src_stride; - dst_ptr += 2 * 4; - second_pred += 2 * 4; - i -= 2; - } while (i != 0); + src_ptr += src_stride; + dst_ptr += 4; + second_pred += 4; + } while (--i != 0); } -#endif // 0 // Combine bilinear filter with vpx_highbd_comp_avg_pred for large blocks. static void highbd_avg_pred_var_filter_block2d_bil_large( @@ -444,25 +430,25 @@ static void highbd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr, } while (--i != 0); } -#define HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h, padding) \ - uint32_t vpx_highbd_##bitdepth##_sub_pixel_avg_variance##w##x##h##_neon( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *ref, int ref_stride, uint32_t *sse, \ - const uint8_t *second_pred) { \ - uint16_t tmp0[w * (h + padding)]; \ - uint16_t tmp1[w * h]; \ - uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \ - \ - highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \ - (h + padding), xoffset); \ - highbd_avg_pred_var_filter_block2d_bil_w##w( \ - tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \ - \ - return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ - CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ +#define HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h) \ + uint32_t vpx_highbd_##bitdepth##_sub_pixel_avg_variance##w##x##h##_neon( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t tmp0[w * (h + 1)]; \ + uint16_t tmp1[w * h]; \ + uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \ + \ + highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, (h + 1), \ + xoffset); \ + highbd_avg_pred_var_filter_block2d_bil_w##w( \ + tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \ + \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ } -#define HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h, padding) \ +#define HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h) \ unsigned int vpx_highbd_##bitdepth##_sub_pixel_avg_variance##w##x##h##_neon( \ const uint8_t *src, int source_stride, int xoffset, int yoffset, \ const uint8_t *ref, int ref_stride, unsigned int *sse, \ @@ -490,7 +476,7 @@ static void highbd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr, CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \ } \ } else if (xoffset == 4) { \ - uint16_t tmp0[w * (h + padding)]; \ + uint16_t tmp0[w * (h + 1)]; \ if (yoffset == 0) { \ highbd_avg_pred_var_filter_block2d_avg( \ src_ptr, tmp0, source_stride, 1, w, h, \ @@ -498,24 +484,24 @@ static void highbd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr, return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \ } else if (yoffset == 4) { \ - uint16_t tmp1[w * (h + padding)]; \ + uint16_t tmp1[w * (h + 1)]; \ highbd_var_filter_block2d_avg(src_ptr, tmp0, source_stride, 1, w, \ - (h + padding)); \ + (h + 1)); \ highbd_avg_pred_var_filter_block2d_avg( \ tmp0, tmp1, w, w, w, h, CONVERT_TO_SHORTPTR(second_pred)); \ return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ } else { \ - uint16_t tmp1[w * (h + padding)]; \ + uint16_t tmp1[w * (h + 1)]; \ highbd_var_filter_block2d_avg(src_ptr, tmp0, source_stride, 1, w, \ - (h + padding)); \ + (h + 1)); \ highbd_avg_pred_var_filter_block2d_bil_w##w( \ tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \ return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ } \ } else { \ - uint16_t tmp0[w * (h + padding)]; \ + uint16_t tmp0[w * (h + 1)]; \ if (yoffset == 0) { \ highbd_avg_pred_var_filter_block2d_bil_w##w( \ src_ptr, tmp0, source_stride, 1, h, xoffset, \ @@ -525,7 +511,7 @@ static void highbd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr, } else if (yoffset == 4) { \ uint16_t tmp1[w * h]; \ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, source_stride, 1, \ - (h + padding), xoffset); \ + (h + 1), xoffset); \ highbd_avg_pred_var_filter_block2d_avg( \ tmp0, tmp1, w, w, w, h, CONVERT_TO_SHORTPTR(second_pred)); \ return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ @@ -533,7 +519,7 @@ static void highbd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr, } else { \ uint16_t tmp1[w * h]; \ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, source_stride, 1, \ - (h + padding), xoffset); \ + (h + 1), xoffset); \ highbd_avg_pred_var_filter_block2d_bil_w##w( \ tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \ return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ @@ -542,65 +528,59 @@ static void highbd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr, } \ } -// 4x blocks are processed two rows at a time, so require an extra row of -// padding. - // 8-bit -// TODO(https://crbug.com/webm/1796): enable after heap overflow is fixed. -// HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 4, 2) -// HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 8, 2) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 4) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 8) -HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 4, 1) -HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 8, 1) -HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 16, 1) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 4) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 8) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 16) -HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 8, 1) -HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 16, 1) -HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 32, 1) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 8) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 16) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 32) -HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 16, 1) -HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 32, 1) -HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 64, 1) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 16) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 32) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 64) -HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 32, 1) -HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 64, 1) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 32) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 64) // 10-bit -// TODO(https://crbug.com/webm/1796): enable after heap overflow is fixed. -// HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 4, 2) -// HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 8, 2) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 4) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 8) -HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 4, 1) -HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 8, 1) -HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 16, 1) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 4) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 8) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 16) -HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 8, 1) -HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 16, 1) -HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 32, 1) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 8) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 16) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 32) -HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 16, 1) -HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 32, 1) -HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 64, 1) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 16) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 32) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 64) -HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 32, 1) -HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 64, 1) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 32) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 64) // 12-bit -// TODO(https://crbug.com/webm/1796): enable after heap overflow is fixed. -// HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 4, 2) -// HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 8, 2) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 4) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 8) -HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 4, 1) -HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 8, 1) -HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 16, 1) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 4) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 8) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 16) -HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 8, 1) -HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 16, 1) -HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 32, 1) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 8) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 16) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 32) -HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 16, 1) -HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 32, 1) -HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 64, 1) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 16) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 32) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 64) -HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 32, 1) -HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 64, 1) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 32) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 64) diff --git a/vpx_dsp/arm/mem_neon.h b/vpx_dsp/arm/mem_neon.h index 400846b707..fa14f80b23 100644 --- a/vpx_dsp/arm/mem_neon.h +++ b/vpx_dsp/arm/mem_neon.h @@ -164,6 +164,15 @@ static INLINE uint8x8_t load_unaligned_u8(const uint8_t *buf, return vreinterpret_u8_u32(a_u32); } +// Load 8 bytes when alignment is not guaranteed. +static INLINE uint16x4_t load_unaligned_u16(const uint16_t *buf) { + uint64_t a; + uint64x1_t a_u64 = vdup_n_u64(0); + memcpy(&a, buf, 8); + a_u64 = vset_lane_u64(a, a_u64, 0); + return vreinterpret_u16_u64(a_u64); +} + // Load 2 sets of 8 bytes when alignment is not guaranteed. static INLINE uint16x8_t load_unaligned_u16q(const uint16_t *buf, ptrdiff_t stride) { diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 62f4789c2c..ad8ff6e184 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -1443,13 +1443,9 @@ () specialize qw/vpx_highbd_12_sub_pixel_variance8x4 sse2 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - # TODO(https://crbug.com/webm/1796): enable neon after heap overflow is - # fixed. - # specialize qw/vpx_highbd_12_sub_pixel_variance4x8 neon/; + specialize qw/vpx_highbd_12_sub_pixel_variance4x8 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - # TODO(https://crbug.com/webm/1796): enable neon after heap overflow is - # fixed. - # specialize qw/vpx_highbd_12_sub_pixel_variance4x4 neon/; + specialize qw/vpx_highbd_12_sub_pixel_variance4x4 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_10_sub_pixel_variance64x64 sse2 neon/; @@ -1485,13 +1481,9 @@ () specialize qw/vpx_highbd_10_sub_pixel_variance8x4 sse2 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - # TODO(https://crbug.com/webm/1796): enable neon after heap overflow is - # fixed. - # specialize qw/vpx_highbd_10_sub_pixel_variance4x8 neon/; + specialize qw/vpx_highbd_10_sub_pixel_variance4x8 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - # TODO(https://crbug.com/webm/1796): enable neon after heap overflow is - # fixed. - # specialize qw/vpx_highbd_10_sub_pixel_variance4x4 neon/; + specialize qw/vpx_highbd_10_sub_pixel_variance4x4 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_8_sub_pixel_variance64x64 sse2 neon/; @@ -1527,13 +1519,9 @@ () specialize qw/vpx_highbd_8_sub_pixel_variance8x4 sse2 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - # TODO(https://crbug.com/webm/1796): enable neon after heap overflow is - # fixed. - # specialize qw/vpx_highbd_8_sub_pixel_variance4x8 neon/; + specialize qw/vpx_highbd_8_sub_pixel_variance4x8 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - # TODO(https://crbug.com/webm/1796): enable neon after heap overflow is - # fixed. - # specialize qw/vpx_highbd_8_sub_pixel_variance4x4 neon/; + specialize qw/vpx_highbd_8_sub_pixel_variance4x4 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_12_sub_pixel_avg_variance64x64 sse2 neon/; @@ -1569,13 +1557,9 @@ () specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x4 sse2 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - # TODO(https://crbug.com/webm/1796): enable neon after heap overflow is - # fixed. - # specialize qw/vpx_highbd_12_sub_pixel_avg_variance4x8 neon/; + specialize qw/vpx_highbd_12_sub_pixel_avg_variance4x8 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - # TODO(https://crbug.com/webm/1796): enable neon after heap overflow is - # fixed. - # specialize qw/vpx_highbd_12_sub_pixel_avg_variance4x4 neon/; + specialize qw/vpx_highbd_12_sub_pixel_avg_variance4x4 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_10_sub_pixel_avg_variance64x64 sse2 neon/; @@ -1611,13 +1595,9 @@ () specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x4 sse2 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - # TODO(https://crbug.com/webm/1796): enable neon after heap overflow is - # fixed. - # specialize qw/vpx_highbd_10_sub_pixel_avg_variance4x8 neon/; + specialize qw/vpx_highbd_10_sub_pixel_avg_variance4x8 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - # TODO(https://crbug.com/webm/1796): enable neon after heap overflow is - # fixed. - # specialize qw/vpx_highbd_10_sub_pixel_avg_variance4x4 neon/; + specialize qw/vpx_highbd_10_sub_pixel_avg_variance4x4 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_8_sub_pixel_avg_variance64x64 sse2 neon/; @@ -1653,13 +1633,9 @@ () specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x4 sse2 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - # TODO(https://crbug.com/webm/1796): enable neon after heap overflow is - # fixed. - # specialize qw/vpx_highbd_8_sub_pixel_avg_variance4x8 neon/; + specialize qw/vpx_highbd_8_sub_pixel_avg_variance4x8 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - # TODO(https://crbug.com/webm/1796): enable neon after heap overflow is - # fixed. - # specialize qw/vpx_highbd_8_sub_pixel_avg_variance4x4 neon/; + specialize qw/vpx_highbd_8_sub_pixel_avg_variance4x4 neon/; } # CONFIG_VP9_HIGHBITDEPTH From 4959770032bb5646d9106620906822d1260496cb Mon Sep 17 00:00:00 2001 From: Neeraj Gadgil Date: Thu, 9 Mar 2023 14:51:44 +0530 Subject: [PATCH 0667/1000] Rename function 'model_rd_for_sb_earlyterm' Function renamed as 'build_inter_pred_model_rd_earlyterm' and added a comment to explain its behavior. Change-Id: I804e6273558ba36241232f62cf18ea754b85e369 --- vp9/encoder/vp9_rdopt.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index f87ab3e0bc..bcadd5777e 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -160,12 +160,13 @@ static void swap_block_ptr(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, int m, int n, } #if !CONFIG_REALTIME_ONLY -static int model_rd_for_sb_earlyterm(VP9_COMP *cpi, int mi_row, int mi_col, - BLOCK_SIZE bsize, MACROBLOCK *x, - MACROBLOCKD *xd, int *out_rate_sum, - int64_t *out_dist_sum, int *skip_txfm_sb, - int64_t *skip_sse_sb, int do_earlyterm, - int64_t best_rd) { +// Planewise build inter prediction and compute rdcost with early termination +// option +static int build_inter_pred_model_rd_earlyterm( + VP9_COMP *cpi, int mi_row, int mi_col, BLOCK_SIZE bsize, MACROBLOCK *x, + MACROBLOCKD *xd, int *out_rate_sum, int64_t *out_dist_sum, + int *skip_txfm_sb, int64_t *skip_sse_sb, int do_earlyterm, + int64_t best_rd) { // Note our transform coeffs are 8 times an orthogonal transform. // Hence quantizer step is also 8 times. To get effective quantizer // we need to divide by 8 before sending to modeling function. @@ -2999,13 +3000,13 @@ static int64_t handle_inter_mode( xd->plane[j].dst.stride = 64; } } - // Compute RD cost with early termination option + filt_best_rd = cm->interp_filter == SWITCHABLE ? (best_rd - rs_rd) : best_rd; - if (model_rd_for_sb_earlyterm(cpi, mi_row, mi_col, bsize, x, xd, - &rate_sum, &dist_sum, &tmp_skip_sb, - &tmp_skip_sse, enable_earlyterm, - filt_best_rd)) { + if (build_inter_pred_model_rd_earlyterm( + cpi, mi_row, mi_col, bsize, x, xd, &rate_sum, &dist_sum, + &tmp_skip_sb, &tmp_skip_sse, enable_earlyterm, + filt_best_rd)) { filter_cache[i] = INT64_MAX; continue; } @@ -3076,9 +3077,9 @@ static int64_t handle_inter_mode( // Handles the special case when a filter that is not in the // switchable list (ex. bilinear) is indicated at the frame level, or // skip condition holds. - model_rd_for_sb_earlyterm(cpi, mi_row, mi_col, bsize, x, xd, &tmp_rate, - &tmp_dist, &skip_txfm_sb, &skip_sse_sb, - 0 /*do_earlyterm*/, INT64_MAX); + build_inter_pred_model_rd_earlyterm( + cpi, mi_row, mi_col, bsize, x, xd, &tmp_rate, &tmp_dist, &skip_txfm_sb, + &skip_sse_sb, 0 /*do_earlyterm*/, INT64_MAX); rd = RDCOST(x->rdmult, x->rddiv, rs + tmp_rate, tmp_dist); memcpy(skip_txfm, x->skip_txfm, sizeof(skip_txfm)); memcpy(bsse, x->bsse, sizeof(bsse)); From 775d594e462252c0e8f8113955122e9c34eeab44 Mon Sep 17 00:00:00 2001 From: Anupam Pandey Date: Mon, 6 Mar 2023 10:38:20 +0530 Subject: [PATCH 0668/1000] Add AVX2 for vpx_filter_block1d8_v8() function Introduced AVX2 intrinsic to compute convolve vertical for w = 8 case. This is a bit-exact change. Instruction Count cpu Resolution Reduction(%) 0 LOWRES2 1.347 0 MIDRES2 1.046 0 HDRES2 0.805 0 Average 1.066 Change-Id: Idf77fff054beaf2c985b9bf2335591bda47e811f --- vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c | 96 ++++++++++++++++++++++- 1 file changed, 94 insertions(+), 2 deletions(-) diff --git a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c index 26e82f9b73..141614e7ad 100644 --- a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c +++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c @@ -942,19 +942,111 @@ static void vpx_filter_block1d4_v4_avx2(const uint8_t *src_ptr, } } +static void vpx_filter_block1d8_v8_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, + ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { + __m256i f[4], ss[4]; + __m256i r[8]; + __m128i s[9]; + + unsigned int y = output_height; + // Multiply the size of the source stride by two + const ptrdiff_t src_stride = src_pitch << 1; + + // The output_height is always a multiple of two. + assert(!(output_height & 1)); + + shuffle_filter_avx2(filter, f); + s[0] = _mm_loadl_epi64((const __m128i *)(src_ptr + 0 * src_pitch)); + s[1] = _mm_loadl_epi64((const __m128i *)(src_ptr + 1 * src_pitch)); + s[2] = _mm_loadl_epi64((const __m128i *)(src_ptr + 2 * src_pitch)); + s[3] = _mm_loadl_epi64((const __m128i *)(src_ptr + 3 * src_pitch)); + s[4] = _mm_loadl_epi64((const __m128i *)(src_ptr + 4 * src_pitch)); + s[5] = _mm_loadl_epi64((const __m128i *)(src_ptr + 5 * src_pitch)); + s[6] = _mm_loadl_epi64((const __m128i *)(src_ptr + 6 * src_pitch)); + + // merge the result together + // r[0]: 0 0 0 0 0 0 0 0 r17 r16 r15 r14 r13 r12 r11 r10 | 0 0 0 0 0 0 0 0 + // r07 r06 r05 r04 r03 r02 r01 r00 + r[0] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[0]), s[1], 1); + + // r[1]: 0 0 0 0 0 0 0 0 r27 r26 r25 r24 r23 r22 r21 r20 | 0 0 0 0 0 0 0 0 + // r17 r16 r15 r14 r13 r12 r11 r10 + r[1] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[1]), s[2], 1); + + // r[2]: 0 0 0 0 0 0 0 0 r37 r36 r35 r34 r33 r32 r31 r30 | 0 0 0 0 0 0 0 0 + // r27 r26 r25 r24 r23 r22 r21 r20 + r[2] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[2]), s[3], 1); + + // r[3]: 0 0 0 0 0 0 0 0 r47 r46 r45 r44 r43 r42 r41 r40 | 0 0 0 0 0 0 0 0 + // r37 r36 r35 r34 r33 r32 r31 r30 + r[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[3]), s[4], 1); + + // r[4]: 0 0 0 0 0 0 0 0 r57 r56 r55 r54 r53 r52 r51 r50 | 0 0 0 0 0 0 0 0 + // r47 r46 r45 r44 r43 r42 r41 r40 + r[4] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[4]), s[5], 1); + + // r[5]: 0 0 0 0 0 0 0 0 r67 r66 r65 r64 r63 r62 r61 r60 | 0 0 0 0 0 0 0 0 + // r57 r56 r55 r54 r53 r52 r51 r50 + r[5] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[5]), s[6], 1); + + // Merge together + // ss[0]: |r27 r17|.......|r21 r11|r20 r10 || r17 r07|.....|r12 r02|r11 + // r01|r10 r00| + ss[0] = _mm256_unpacklo_epi8(r[0], r[1]); + + // ss[0]: |r47 r37|.......|r41 r31|r40 r30 || r37 r27|.....|r32 r22|r31 + // r21|r30 r20| + ss[1] = _mm256_unpacklo_epi8(r[2], r[3]); + + // ss[2]: |r67 r57|.......|r61 r51|r60 r50 || r57 r47|.....|r52 r42|r51 + // r41|r50 r40| + ss[2] = _mm256_unpacklo_epi8(r[4], r[5]); + + // Process 2 rows at a time + do { + s[7] = _mm_loadl_epi64((const __m128i *)(src_ptr + 7 * src_pitch)); + s[8] = _mm_loadl_epi64((const __m128i *)(src_ptr + 8 * src_pitch)); + + // r[6]: 0 0 0 0 0 0 0 0 r77 r76 r75 r74 r73 r72 r71 r70 | 0 0 0 0 0 0 0 + // 0 r67 r66 r65 r64 r63 r62 r61 r60 + r[6] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[6]), s[7], 1); + // r[7]: 0 0 0 0 0 0 0 0 r87 r86 r85 r84 r83 r82 r81 r80 | 0 0 0 0 0 0 0 + // 0 r77 r76 r75 r74 r73 r72 r71 r70 + r[7] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[7]), s[8], 1); + + // ss[3] : | r87 r77 | .......| r81 r71 | r80 r70 || r77 r67 | .....| r72 + // r62 | r71 r61|r70 r60| + ss[3] = _mm256_unpacklo_epi8(r[6], r[7]); + ss[0] = convolve8_16_avx2(ss, f); + ss[0] = _mm256_packus_epi16(ss[0], ss[0]); + src_ptr += src_stride; + + /* shift down two rows */ + s[6] = s[8]; + _mm_storel_epi64((__m128i *)&output_ptr[0], _mm256_castsi256_si128(ss[0])); + output_ptr += out_pitch; + _mm_storel_epi64((__m128i *)&output_ptr[0], + _mm256_extractf128_si256(ss[0], 1)); + output_ptr += out_pitch; + ss[0] = ss[1]; + ss[1] = ss[2]; + ss[2] = ss[3]; + y -= 2; + } while (y > 1); +} + #if HAVE_AVX2 && HAVE_SSSE3 filter8_1dfunction vpx_filter_block1d4_v8_ssse3; #if VPX_ARCH_X86_64 filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3; filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3; filter8_1dfunction vpx_filter_block1d4_h8_intrin_ssse3; -#define vpx_filter_block1d8_v8_avx2 vpx_filter_block1d8_v8_intrin_ssse3 #define vpx_filter_block1d4_h8_avx2 vpx_filter_block1d4_h8_intrin_ssse3 #else // VPX_ARCH_X86 filter8_1dfunction vpx_filter_block1d8_v8_ssse3; filter8_1dfunction vpx_filter_block1d8_h8_ssse3; filter8_1dfunction vpx_filter_block1d4_h8_ssse3; -#define vpx_filter_block1d8_v8_avx2 vpx_filter_block1d8_v8_ssse3 #define vpx_filter_block1d4_h8_avx2 vpx_filter_block1d4_h8_ssse3 #endif // VPX_ARCH_X86_64 filter8_1dfunction vpx_filter_block1d8_v8_avg_ssse3; From 29beea82437213bd705de4b01103ac0d88bcab72 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Wed, 1 Mar 2023 23:37:32 +0000 Subject: [PATCH 0669/1000] [NEON] Add temporal filter functions, 8-bit and highbd Both are around 3x faster than original C version. 8-bit gives a small 0.5% speed increase, whereas highbd gives ~2.5%. Change-Id: I71d75ddd2757b19aa201e879fd9fa8f3a25431ad --- test/yuv_temporal_filter_test.cc | 18 + vp9/common/vp9_rtcd_defs.pl | 4 +- .../neon/vp9_highbd_temporal_filter_neon.c | 872 ++++++++++++++++++ .../arm/neon/vp9_temporal_filter_neon.c | 849 +++++++++++++++++ ...ants.h => vp9_temporal_filter_constants.h} | 8 +- vp9/encoder/x86/highbd_temporal_filter_sse4.c | 2 +- vp9/encoder/x86/temporal_filter_sse4.c | 2 +- vp9/vp9cx.mk | 9 +- 8 files changed, 1754 insertions(+), 10 deletions(-) create mode 100644 vp9/encoder/arm/neon/vp9_highbd_temporal_filter_neon.c create mode 100644 vp9/encoder/arm/neon/vp9_temporal_filter_neon.c rename vp9/encoder/{x86/temporal_filter_constants.h => vp9_temporal_filter_constants.h} (98%) diff --git a/test/yuv_temporal_filter_test.cc b/test/yuv_temporal_filter_test.cc index 2bdcf4d86f..91b4e804b3 100644 --- a/test/yuv_temporal_filter_test.cc +++ b/test/yuv_temporal_filter_test.cc @@ -694,6 +694,18 @@ INSTANTIATE_TEST_SUITE_P( TemporalFilterWithBd(&wrap_vp9_highbd_apply_temporal_filter_sse4_1_12, 12))); #endif // HAVE_SSE4_1 +#if HAVE_NEON +WRAP_HIGHBD_FUNC(vp9_highbd_apply_temporal_filter_neon, 10) +WRAP_HIGHBD_FUNC(vp9_highbd_apply_temporal_filter_neon, 12) + +INSTANTIATE_TEST_SUITE_P( + NEON, YUVTemporalFilterTest, + ::testing::Values( + TemporalFilterWithBd(&wrap_vp9_highbd_apply_temporal_filter_neon_10, + 10), + TemporalFilterWithBd(&wrap_vp9_highbd_apply_temporal_filter_neon_12, + 12))); +#endif // HAVE_NEON #else INSTANTIATE_TEST_SUITE_P( C, YUVTemporalFilterTest, @@ -704,5 +716,11 @@ INSTANTIATE_TEST_SUITE_P(SSE4_1, YUVTemporalFilterTest, ::testing::Values(TemporalFilterWithBd( &vp9_apply_temporal_filter_sse4_1, 8))); #endif // HAVE_SSE4_1 +#if HAVE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, YUVTemporalFilterTest, + ::testing::Values(TemporalFilterWithBd( + &vp9_apply_temporal_filter_neon, 8))); +#endif // HAVE_NEON #endif // CONFIG_VP9_HIGHBITDEPTH + } // namespace diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index 2f9870dd48..d16b947110 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -179,11 +179,11 @@ () # if (vpx_config("CONFIG_REALTIME_ONLY") ne "yes") { add_proto qw/void vp9_apply_temporal_filter/, "const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count"; -specialize qw/vp9_apply_temporal_filter sse4_1/; +specialize qw/vp9_apply_temporal_filter sse4_1 neon/; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vp9_highbd_apply_temporal_filter/, "const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre, int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src, int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accum, uint16_t *y_count, uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count"; - specialize qw/vp9_highbd_apply_temporal_filter sse4_1/; + specialize qw/vp9_highbd_apply_temporal_filter sse4_1 neon/; } } diff --git a/vp9/encoder/arm/neon/vp9_highbd_temporal_filter_neon.c b/vp9/encoder/arm/neon/vp9_highbd_temporal_filter_neon.c new file mode 100644 index 0000000000..c3aef3c865 --- /dev/null +++ b/vp9/encoder/arm/neon/vp9_highbd_temporal_filter_neon.c @@ -0,0 +1,872 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vp9_rtcd.h" +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vp9/encoder/vp9_encoder.h" +#include "vp9/encoder/vp9_temporal_filter.h" +#include "vp9/encoder/vp9_temporal_filter_constants.h" + +// Compute (a-b)**2 for 8 pixels with size 16-bit +static INLINE void highbd_store_dist_8(const uint16_t *a, const uint16_t *b, + uint32_t *dst) { + const uint16x8_t a_reg = vld1q_u16(a); + const uint16x8_t b_reg = vld1q_u16(b); + + uint16x8_t dist = vabdq_u16(a_reg, b_reg); + uint32x4_t dist_first = vmull_u16(vget_low_u16(dist), vget_low_u16(dist)); + uint32x4_t dist_second = vmull_u16(vget_high_u16(dist), vget_high_u16(dist)); + + vst1q_u32(dst, dist_first); + vst1q_u32(dst + 4, dist_second); +} + +// Sum up three neighboring distortions for the pixels +static INLINE void highbd_get_sum_4(const uint32_t *dist, uint32x4_t *sum) { + uint32x4_t dist_reg, dist_left, dist_right; + + dist_reg = vld1q_u32(dist); + dist_left = vld1q_u32(dist - 1); + dist_right = vld1q_u32(dist + 1); + + *sum = vaddq_u32(dist_reg, dist_left); + *sum = vaddq_u32(*sum, dist_right); +} + +static INLINE void highbd_get_sum_8(const uint32_t *dist, uint32x4_t *sum_first, + uint32x4_t *sum_second) { + highbd_get_sum_4(dist, sum_first); + highbd_get_sum_4(dist + 4, sum_second); +} + +// Average the value based on the number of values summed (9 for pixels away +// from the border, 4 for pixels in corners, and 6 for other edge values, plus +// however many values from y/uv plane are). +// +// Add in the rounding factor and shift, clamp to 16, invert and shift. Multiply +// by weight. +static INLINE void highbd_average_4(uint32x4_t *output, const uint32x4_t sum, + const uint32x4_t *mul_constants, + const int strength, const int rounding, + const int weight) { + const int64x2_t strength_s64 = vdupq_n_s64(-strength - 32); + const uint64x2_t rounding_u64 = vdupq_n_u64((uint64_t)rounding << 32); + const uint32x4_t weight_u32 = vdupq_n_u32(weight); + const uint32x4_t sixteen = vdupq_n_u32(16); + uint32x4_t sum2; + + // modifier * 3 / index; + uint64x2_t sum_lo = + vmlal_u32(rounding_u64, vget_low_u32(sum), vget_low_u32(*mul_constants)); + uint64x2_t sum_hi = vmlal_u32(rounding_u64, vget_high_u32(sum), + vget_high_u32(*mul_constants)); + + // we cannot use vshrn_n_u64 as strength is not known at compile time. + sum_lo = vshlq_u64(sum_lo, strength_s64); + sum_hi = vshlq_u64(sum_hi, strength_s64); + + sum2 = vcombine_u32(vmovn_u64(sum_lo), vmovn_u64(sum_hi)); + + // Multiply with the weight + sum2 = vminq_u32(sum2, sixteen); + sum2 = vsubq_u32(sixteen, sum2); + *output = vmulq_u32(sum2, weight_u32); +} + +static INLINE void highbd_average_8(uint32x4_t *output_0, uint32x4_t *output_1, + const uint32x4_t sum_0_u32, + const uint32x4_t sum_1_u32, + const uint32x4_t *mul_constants_0, + const uint32x4_t *mul_constants_1, + const int strength, const int rounding, + const int weight) { + highbd_average_4(output_0, sum_0_u32, mul_constants_0, strength, rounding, + weight); + highbd_average_4(output_1, sum_1_u32, mul_constants_1, strength, rounding, + weight); +} + +// Add 'sum_u32' to 'count'. Multiply by 'pred' and add to 'accumulator.' +static INLINE void highbd_accumulate_and_store_8( + const uint32x4_t sum_first_u32, const uint32x4_t sum_second_u32, + const uint16_t *pred, uint16_t *count, uint32_t *accumulator) { + const uint16x8_t sum_u16 = + vcombine_u16(vqmovn_u32(sum_first_u32), vqmovn_u32(sum_second_u32)); + uint16x8_t pred_u16 = vld1q_u16(pred); + uint16x8_t count_u16 = vld1q_u16(count); + uint32x4_t pred_0_u32, pred_1_u32; + uint32x4_t accum_0_u32, accum_1_u32; + + count_u16 = vqaddq_u16(count_u16, sum_u16); + vst1q_u16(count, count_u16); + + accum_0_u32 = vld1q_u32(accumulator); + accum_1_u32 = vld1q_u32(accumulator + 4); + + pred_0_u32 = vmovl_u16(vget_low_u16(pred_u16)); + pred_1_u32 = vmovl_u16(vget_high_u16(pred_u16)); + + // Don't use sum_u16 as that produces different results to the C version + accum_0_u32 = vmlaq_u32(accum_0_u32, sum_first_u32, pred_0_u32); + accum_1_u32 = vmlaq_u32(accum_1_u32, sum_second_u32, pred_1_u32); + + vst1q_u32(accumulator, accum_0_u32); + vst1q_u32(accumulator + 4, accum_1_u32); +} + +static INLINE void highbd_read_dist_4(const uint32_t *dist, + uint32x4_t *dist_reg) { + *dist_reg = vld1q_u32(dist); +} + +static INLINE void highbd_read_dist_8(const uint32_t *dist, + uint32x4_t *reg_first, + uint32x4_t *reg_second) { + highbd_read_dist_4(dist, reg_first); + highbd_read_dist_4(dist + 4, reg_second); +} + +static INLINE void highbd_read_chroma_dist_row_8( + int ss_x, const uint32_t *u_dist, const uint32_t *v_dist, + uint32x4_t *u_first, uint32x4_t *u_second, uint32x4_t *v_first, + uint32x4_t *v_second) { + if (!ss_x) { + // If there is no chroma subsampling in the horizontal direction, then we + // need to load 8 entries from chroma. + highbd_read_dist_8(u_dist, u_first, u_second); + highbd_read_dist_8(v_dist, v_first, v_second); + } else { // ss_x == 1 + // Otherwise, we only need to load 8 entries + uint32x4_t u_reg, v_reg; + uint32x4x2_t pair; + + highbd_read_dist_4(u_dist, &u_reg); + + pair = vzipq_u32(u_reg, u_reg); + *u_first = pair.val[0]; + *u_second = pair.val[1]; + + highbd_read_dist_4(v_dist, &v_reg); + + pair = vzipq_u32(v_reg, v_reg); + *v_first = pair.val[0]; + *v_second = pair.val[1]; + } +} + +static void highbd_apply_temporal_filter_luma_8( + const uint16_t *y_pre, int y_pre_stride, unsigned int block_width, + unsigned int block_height, int ss_x, int ss_y, int strength, + int use_whole_blk, uint32_t *y_accum, uint16_t *y_count, + const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist, + const uint32_t *const *neighbors_first, + const uint32_t *const *neighbors_second, int top_weight, + int bottom_weight) { + const int rounding = (1 << strength) >> 1; + int weight = top_weight; + + uint32x4_t mul_first, mul_second; + + uint32x4_t sum_row_1_first, sum_row_1_second; + uint32x4_t sum_row_2_first, sum_row_2_second; + uint32x4_t sum_row_3_first, sum_row_3_second; + + uint32x4_t u_first, u_second; + uint32x4_t v_first, v_second; + + uint32x4_t sum_row_first; + uint32x4_t sum_row_second; + + // Loop variables + unsigned int h; + + assert(strength >= 4 && strength <= 14 && + "invalid adjusted temporal filter strength"); + assert(block_width == 8); + + (void)block_width; + + // First row + mul_first = vld1q_u32(neighbors_first[0]); + mul_second = vld1q_u32(neighbors_second[0]); + + // Add luma values + highbd_get_sum_8(y_dist, &sum_row_2_first, &sum_row_2_second); + highbd_get_sum_8(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second); + + // We don't need to saturate here because the maximum value is UINT12_MAX ** 2 + // * 9 ~= 2**24 * 9 < 2 ** 28 < INT32_MAX + sum_row_first = vaddq_u32(sum_row_2_first, sum_row_3_first); + sum_row_second = vaddq_u32(sum_row_2_second, sum_row_3_second); + + // Add chroma values + highbd_read_chroma_dist_row_8(ss_x, u_dist, v_dist, &u_first, &u_second, + &v_first, &v_second); + + // Max value here is 2 ** 24 * (9 + 2), so no saturation is needed + sum_row_first = vaddq_u32(sum_row_first, u_first); + sum_row_second = vaddq_u32(sum_row_second, u_second); + + sum_row_first = vaddq_u32(sum_row_first, v_first); + sum_row_second = vaddq_u32(sum_row_second, v_second); + + // Get modifier and store result + highbd_average_8(&sum_row_first, &sum_row_second, sum_row_first, + sum_row_second, &mul_first, &mul_second, strength, rounding, + weight); + + highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count, + y_accum); + + y_pre += y_pre_stride; + y_count += y_pre_stride; + y_accum += y_pre_stride; + y_dist += DIST_STRIDE; + + u_dist += DIST_STRIDE; + v_dist += DIST_STRIDE; + + // Then all the rows except the last one + mul_first = vld1q_u32(neighbors_first[1]); + mul_second = vld1q_u32(neighbors_second[1]); + + for (h = 1; h < block_height - 1; ++h) { + // Move the weight to bottom half + if (!use_whole_blk && h == block_height / 2) { + weight = bottom_weight; + } + // Shift the rows up + sum_row_1_first = sum_row_2_first; + sum_row_1_second = sum_row_2_second; + sum_row_2_first = sum_row_3_first; + sum_row_2_second = sum_row_3_second; + + // Add luma values to the modifier + sum_row_first = vaddq_u32(sum_row_1_first, sum_row_2_first); + sum_row_second = vaddq_u32(sum_row_1_second, sum_row_2_second); + + highbd_get_sum_8(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second); + + sum_row_first = vaddq_u32(sum_row_first, sum_row_3_first); + sum_row_second = vaddq_u32(sum_row_second, sum_row_3_second); + + // Add chroma values to the modifier + if (ss_y == 0 || h % 2 == 0) { + // Only calculate the new chroma distortion if we are at a pixel that + // corresponds to a new chroma row + highbd_read_chroma_dist_row_8(ss_x, u_dist, v_dist, &u_first, &u_second, + &v_first, &v_second); + + u_dist += DIST_STRIDE; + v_dist += DIST_STRIDE; + } + + sum_row_first = vaddq_u32(sum_row_first, u_first); + sum_row_second = vaddq_u32(sum_row_second, u_second); + sum_row_first = vaddq_u32(sum_row_first, v_first); + sum_row_second = vaddq_u32(sum_row_second, v_second); + + // Get modifier and store result + highbd_average_8(&sum_row_first, &sum_row_second, sum_row_first, + sum_row_second, &mul_first, &mul_second, strength, + rounding, weight); + highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count, + y_accum); + + y_pre += y_pre_stride; + y_count += y_pre_stride; + y_accum += y_pre_stride; + y_dist += DIST_STRIDE; + } + + // The last row + mul_first = vld1q_u32(neighbors_first[0]); + mul_second = vld1q_u32(neighbors_second[0]); + + // Shift the rows up + sum_row_1_first = sum_row_2_first; + sum_row_1_second = sum_row_2_second; + sum_row_2_first = sum_row_3_first; + sum_row_2_second = sum_row_3_second; + + // Add luma values to the modifier + sum_row_first = vaddq_u32(sum_row_1_first, sum_row_2_first); + sum_row_second = vaddq_u32(sum_row_1_second, sum_row_2_second); + + // Add chroma values to the modifier + if (ss_y == 0) { + // Only calculate the new chroma distortion if we are at a pixel that + // corresponds to a new chroma row + highbd_read_chroma_dist_row_8(ss_x, u_dist, v_dist, &u_first, &u_second, + &v_first, &v_second); + } + + sum_row_first = vaddq_u32(sum_row_first, u_first); + sum_row_second = vaddq_u32(sum_row_second, u_second); + sum_row_first = vaddq_u32(sum_row_first, v_first); + sum_row_second = vaddq_u32(sum_row_second, v_second); + + // Get modifier and store result + highbd_average_8(&sum_row_first, &sum_row_second, sum_row_first, + sum_row_second, &mul_first, &mul_second, strength, rounding, + weight); + highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count, + y_accum); +} + +// Perform temporal filter for the luma component. +static void highbd_apply_temporal_filter_luma( + const uint16_t *y_pre, int y_pre_stride, unsigned int block_width, + unsigned int block_height, int ss_x, int ss_y, int strength, + const int *blk_fw, int use_whole_blk, uint32_t *y_accum, uint16_t *y_count, + const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist) { + unsigned int blk_col = 0, uv_blk_col = 0; + const unsigned int blk_col_step = 8, uv_blk_col_step = 8 >> ss_x; + const unsigned int mid_width = block_width >> 1, + last_width = block_width - blk_col_step; + int top_weight = blk_fw[0], + bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2]; + const uint32_t *const *neighbors_first; + const uint32_t *const *neighbors_second; + + // Left + neighbors_first = HIGHBD_LUMA_LEFT_COLUMN_NEIGHBORS; + neighbors_second = HIGHBD_LUMA_MIDDLE_COLUMN_NEIGHBORS; + highbd_apply_temporal_filter_luma_8( + y_pre + blk_col, y_pre_stride, blk_col_step, block_height, ss_x, ss_y, + strength, use_whole_blk, y_accum + blk_col, y_count + blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_first, neighbors_second, top_weight, bottom_weight); + + blk_col += blk_col_step; + uv_blk_col += uv_blk_col_step; + + // Middle First + neighbors_first = HIGHBD_LUMA_MIDDLE_COLUMN_NEIGHBORS; + for (; blk_col < mid_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + highbd_apply_temporal_filter_luma_8( + y_pre + blk_col, y_pre_stride, blk_col_step, block_height, ss_x, ss_y, + strength, use_whole_blk, y_accum + blk_col, y_count + blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_first, neighbors_second, top_weight, bottom_weight); + } + + if (!use_whole_blk) { + top_weight = blk_fw[1]; + bottom_weight = blk_fw[3]; + } + + // Middle Second + for (; blk_col < last_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + highbd_apply_temporal_filter_luma_8( + y_pre + blk_col, y_pre_stride, blk_col_step, block_height, ss_x, ss_y, + strength, use_whole_blk, y_accum + blk_col, y_count + blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_first, neighbors_second, top_weight, bottom_weight); + } + + // Right + neighbors_second = HIGHBD_LUMA_RIGHT_COLUMN_NEIGHBORS; + highbd_apply_temporal_filter_luma_8( + y_pre + blk_col, y_pre_stride, blk_col_step, block_height, ss_x, ss_y, + strength, use_whole_blk, y_accum + blk_col, y_count + blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_first, neighbors_second, top_weight, bottom_weight); +} + +// Add a row of luma distortion that corresponds to 8 chroma mods. If we are +// subsampling in x direction, then we have 16 lumas, else we have 8. +static INLINE void highbd_add_luma_dist_to_8_chroma_mod( + const uint32_t *y_dist, int ss_x, int ss_y, uint32x4_t *u_mod_fst, + uint32x4_t *u_mod_snd, uint32x4_t *v_mod_fst, uint32x4_t *v_mod_snd) { + uint32x4_t y_reg_fst, y_reg_snd; + if (!ss_x) { + highbd_read_dist_8(y_dist, &y_reg_fst, &y_reg_snd); + if (ss_y == 1) { + uint32x4_t y_tmp_fst, y_tmp_snd; + highbd_read_dist_8(y_dist + DIST_STRIDE, &y_tmp_fst, &y_tmp_snd); + y_reg_fst = vaddq_u32(y_reg_fst, y_tmp_fst); + y_reg_snd = vaddq_u32(y_reg_snd, y_tmp_snd); + } + } else { + // Temporary + uint32x4_t y_fst, y_snd; + uint64x2_t y_fst64, y_snd64; + + // First 8 + highbd_read_dist_8(y_dist, &y_fst, &y_snd); + if (ss_y == 1) { + uint32x4_t y_tmp_fst, y_tmp_snd; + highbd_read_dist_8(y_dist + DIST_STRIDE, &y_tmp_fst, &y_tmp_snd); + + y_fst = vaddq_u32(y_fst, y_tmp_fst); + y_snd = vaddq_u32(y_snd, y_tmp_snd); + } + + y_fst64 = vpaddlq_u32(y_fst); + y_snd64 = vpaddlq_u32(y_snd); + y_reg_fst = vcombine_u32(vqmovn_u64(y_fst64), vqmovn_u64(y_snd64)); + + // Second 8 + highbd_read_dist_8(y_dist + 8, &y_fst, &y_snd); + if (ss_y == 1) { + uint32x4_t y_tmp_fst, y_tmp_snd; + highbd_read_dist_8(y_dist + 8 + DIST_STRIDE, &y_tmp_fst, &y_tmp_snd); + + y_fst = vaddq_u32(y_fst, y_tmp_fst); + y_snd = vaddq_u32(y_snd, y_tmp_snd); + } + + y_fst64 = vpaddlq_u32(y_fst); + y_snd64 = vpaddlq_u32(y_snd); + y_reg_snd = vcombine_u32(vqmovn_u64(y_fst64), vqmovn_u64(y_snd64)); + } + + *u_mod_fst = vaddq_u32(*u_mod_fst, y_reg_fst); + *u_mod_snd = vaddq_u32(*u_mod_snd, y_reg_snd); + *v_mod_fst = vaddq_u32(*v_mod_fst, y_reg_fst); + *v_mod_snd = vaddq_u32(*v_mod_snd, y_reg_snd); +} + +// Apply temporal filter to the chroma components. This performs temporal +// filtering on a chroma block of 8 X uv_height. If blk_fw is not NULL, use +// blk_fw as an array of size 4 for the weights for each of the 4 subblocks, +// else use top_weight for top half, and bottom weight for bottom half. +static void highbd_apply_temporal_filter_chroma_8( + const uint16_t *u_pre, const uint16_t *v_pre, int uv_pre_stride, + unsigned int uv_block_width, unsigned int uv_block_height, int ss_x, + int ss_y, int strength, uint32_t *u_accum, uint16_t *u_count, + uint32_t *v_accum, uint16_t *v_count, const uint32_t *y_dist, + const uint32_t *u_dist, const uint32_t *v_dist, + const uint32_t *const *neighbors_fst, const uint32_t *const *neighbors_snd, + int top_weight, int bottom_weight, const int *blk_fw) { + const int rounding = (1 << strength) >> 1; + int weight = top_weight; + + uint32x4_t mul_fst, mul_snd; + + uint32x4_t u_sum_row_1_fst, u_sum_row_2_fst, u_sum_row_3_fst; + uint32x4_t v_sum_row_1_fst, v_sum_row_2_fst, v_sum_row_3_fst; + uint32x4_t u_sum_row_1_snd, u_sum_row_2_snd, u_sum_row_3_snd; + uint32x4_t v_sum_row_1_snd, v_sum_row_2_snd, v_sum_row_3_snd; + + uint32x4_t u_sum_row_fst, v_sum_row_fst; + uint32x4_t u_sum_row_snd, v_sum_row_snd; + + // Loop variable + unsigned int h; + + (void)uv_block_width; + + // First row + mul_fst = vld1q_u32(neighbors_fst[0]); + mul_snd = vld1q_u32(neighbors_snd[0]); + + // Add chroma values + highbd_get_sum_8(u_dist, &u_sum_row_2_fst, &u_sum_row_2_snd); + highbd_get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3_fst, &u_sum_row_3_snd); + + u_sum_row_fst = vaddq_u32(u_sum_row_2_fst, u_sum_row_3_fst); + u_sum_row_snd = vaddq_u32(u_sum_row_2_snd, u_sum_row_3_snd); + + highbd_get_sum_8(v_dist, &v_sum_row_2_fst, &v_sum_row_2_snd); + highbd_get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3_fst, &v_sum_row_3_snd); + + v_sum_row_fst = vaddq_u32(v_sum_row_2_fst, v_sum_row_3_fst); + v_sum_row_snd = vaddq_u32(v_sum_row_2_snd, v_sum_row_3_snd); + + // Add luma values + highbd_add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row_fst, + &u_sum_row_snd, &v_sum_row_fst, + &v_sum_row_snd); + + // Get modifier and store result + if (blk_fw) { + highbd_average_4(&u_sum_row_fst, u_sum_row_fst, &mul_fst, strength, + rounding, blk_fw[0]); + highbd_average_4(&u_sum_row_snd, u_sum_row_snd, &mul_snd, strength, + rounding, blk_fw[1]); + + highbd_average_4(&v_sum_row_fst, v_sum_row_fst, &mul_fst, strength, + rounding, blk_fw[0]); + highbd_average_4(&v_sum_row_snd, v_sum_row_snd, &mul_snd, strength, + rounding, blk_fw[1]); + + } else { + highbd_average_8(&u_sum_row_fst, &u_sum_row_snd, u_sum_row_fst, + u_sum_row_snd, &mul_fst, &mul_snd, strength, rounding, + weight); + highbd_average_8(&v_sum_row_fst, &v_sum_row_snd, v_sum_row_fst, + v_sum_row_snd, &mul_fst, &mul_snd, strength, rounding, + weight); + } + highbd_accumulate_and_store_8(u_sum_row_fst, u_sum_row_snd, u_pre, u_count, + u_accum); + highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count, + v_accum); + + u_pre += uv_pre_stride; + u_dist += DIST_STRIDE; + v_pre += uv_pre_stride; + v_dist += DIST_STRIDE; + u_count += uv_pre_stride; + u_accum += uv_pre_stride; + v_count += uv_pre_stride; + v_accum += uv_pre_stride; + + y_dist += DIST_STRIDE * (1 + ss_y); + + // Then all the rows except the last one + mul_fst = vld1q_u32(neighbors_fst[1]); + mul_snd = vld1q_u32(neighbors_snd[1]); + + for (h = 1; h < uv_block_height - 1; ++h) { + // Move the weight pointer to the bottom half of the blocks + if (h == uv_block_height / 2) { + if (blk_fw) { + blk_fw += 2; + } else { + weight = bottom_weight; + } + } + + // Shift the rows up + u_sum_row_1_fst = u_sum_row_2_fst; + u_sum_row_2_fst = u_sum_row_3_fst; + u_sum_row_1_snd = u_sum_row_2_snd; + u_sum_row_2_snd = u_sum_row_3_snd; + + v_sum_row_1_fst = v_sum_row_2_fst; + v_sum_row_2_fst = v_sum_row_3_fst; + v_sum_row_1_snd = v_sum_row_2_snd; + v_sum_row_2_snd = v_sum_row_3_snd; + + // Add chroma values + u_sum_row_fst = vaddq_u32(u_sum_row_1_fst, u_sum_row_2_fst); + u_sum_row_snd = vaddq_u32(u_sum_row_1_snd, u_sum_row_2_snd); + highbd_get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3_fst, &u_sum_row_3_snd); + u_sum_row_fst = vaddq_u32(u_sum_row_fst, u_sum_row_3_fst); + u_sum_row_snd = vaddq_u32(u_sum_row_snd, u_sum_row_3_snd); + + v_sum_row_fst = vaddq_u32(v_sum_row_1_fst, v_sum_row_2_fst); + v_sum_row_snd = vaddq_u32(v_sum_row_1_snd, v_sum_row_2_snd); + highbd_get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3_fst, &v_sum_row_3_snd); + v_sum_row_fst = vaddq_u32(v_sum_row_fst, v_sum_row_3_fst); + v_sum_row_snd = vaddq_u32(v_sum_row_snd, v_sum_row_3_snd); + + // Add luma values + highbd_add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row_fst, + &u_sum_row_snd, &v_sum_row_fst, + &v_sum_row_snd); + + // Get modifier and store result + if (blk_fw) { + highbd_average_4(&u_sum_row_fst, u_sum_row_fst, &mul_fst, strength, + rounding, blk_fw[0]); + highbd_average_4(&u_sum_row_snd, u_sum_row_snd, &mul_snd, strength, + rounding, blk_fw[1]); + + highbd_average_4(&v_sum_row_fst, v_sum_row_fst, &mul_fst, strength, + rounding, blk_fw[0]); + highbd_average_4(&v_sum_row_snd, v_sum_row_snd, &mul_snd, strength, + rounding, blk_fw[1]); + + } else { + highbd_average_8(&u_sum_row_fst, &u_sum_row_snd, u_sum_row_fst, + u_sum_row_snd, &mul_fst, &mul_snd, strength, rounding, + weight); + highbd_average_8(&v_sum_row_fst, &v_sum_row_snd, v_sum_row_fst, + v_sum_row_snd, &mul_fst, &mul_snd, strength, rounding, + weight); + } + + highbd_accumulate_and_store_8(u_sum_row_fst, u_sum_row_snd, u_pre, u_count, + u_accum); + highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count, + v_accum); + + u_pre += uv_pre_stride; + u_dist += DIST_STRIDE; + v_pre += uv_pre_stride; + v_dist += DIST_STRIDE; + u_count += uv_pre_stride; + u_accum += uv_pre_stride; + v_count += uv_pre_stride; + v_accum += uv_pre_stride; + + y_dist += DIST_STRIDE * (1 + ss_y); + } + + // The last row + mul_fst = vld1q_u32(neighbors_fst[0]); + mul_snd = vld1q_u32(neighbors_snd[0]); + + // Shift the rows up + u_sum_row_1_fst = u_sum_row_2_fst; + u_sum_row_2_fst = u_sum_row_3_fst; + u_sum_row_1_snd = u_sum_row_2_snd; + u_sum_row_2_snd = u_sum_row_3_snd; + + v_sum_row_1_fst = v_sum_row_2_fst; + v_sum_row_2_fst = v_sum_row_3_fst; + v_sum_row_1_snd = v_sum_row_2_snd; + v_sum_row_2_snd = v_sum_row_3_snd; + + // Add chroma values + u_sum_row_fst = vaddq_u32(u_sum_row_1_fst, u_sum_row_2_fst); + v_sum_row_fst = vaddq_u32(v_sum_row_1_fst, v_sum_row_2_fst); + u_sum_row_snd = vaddq_u32(u_sum_row_1_snd, u_sum_row_2_snd); + v_sum_row_snd = vaddq_u32(v_sum_row_1_snd, v_sum_row_2_snd); + + // Add luma values + highbd_add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row_fst, + &u_sum_row_snd, &v_sum_row_fst, + &v_sum_row_snd); + + // Get modifier and store result + if (blk_fw) { + highbd_average_4(&u_sum_row_fst, u_sum_row_fst, &mul_fst, strength, + rounding, blk_fw[0]); + highbd_average_4(&u_sum_row_snd, u_sum_row_snd, &mul_snd, strength, + rounding, blk_fw[1]); + + highbd_average_4(&v_sum_row_fst, v_sum_row_fst, &mul_fst, strength, + rounding, blk_fw[0]); + highbd_average_4(&v_sum_row_snd, v_sum_row_snd, &mul_snd, strength, + rounding, blk_fw[1]); + + } else { + highbd_average_8(&u_sum_row_fst, &u_sum_row_snd, u_sum_row_fst, + u_sum_row_snd, &mul_fst, &mul_snd, strength, rounding, + weight); + highbd_average_8(&v_sum_row_fst, &v_sum_row_snd, v_sum_row_fst, + v_sum_row_snd, &mul_fst, &mul_snd, strength, rounding, + weight); + } + + highbd_accumulate_and_store_8(u_sum_row_fst, u_sum_row_snd, u_pre, u_count, + u_accum); + highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count, + v_accum); +} + +// Perform temporal filter for the chroma components. +static void highbd_apply_temporal_filter_chroma( + const uint16_t *u_pre, const uint16_t *v_pre, int uv_pre_stride, + unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, + int strength, const int *blk_fw, int use_whole_blk, uint32_t *u_accum, + uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count, + const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist) { + const unsigned int uv_width = block_width >> ss_x, + uv_height = block_height >> ss_y; + + unsigned int blk_col = 0, uv_blk_col = 0; + const unsigned int uv_blk_col_step = 8, blk_col_step = 8 << ss_x; + const unsigned int uv_mid_width = uv_width >> 1, + uv_last_width = uv_width - uv_blk_col_step; + int top_weight = blk_fw[0], + bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2]; + const uint32_t *const *neighbors_fst; + const uint32_t *const *neighbors_snd; + + if (uv_width == 8) { + // Special Case: We are subsampling in x direction on a 16x16 block. Since + // we are operating on a row of 8 chroma pixels, we can't use the usual + // left-middle-right pattern. + assert(ss_x); + + if (ss_y) { + neighbors_fst = HIGHBD_CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS; + neighbors_snd = HIGHBD_CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS; + } else { + neighbors_fst = HIGHBD_CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS; + neighbors_snd = HIGHBD_CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS; + } + + if (use_whole_blk) { + highbd_apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_fst, neighbors_snd, top_weight, bottom_weight, NULL); + } else { + highbd_apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_fst, neighbors_snd, 0, 0, blk_fw); + } + + return; + } + + // Left + if (ss_x && ss_y) { + neighbors_fst = HIGHBD_CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS; + neighbors_snd = HIGHBD_CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS; + } else if (ss_x || ss_y) { + neighbors_fst = HIGHBD_CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS; + neighbors_snd = HIGHBD_CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS; + } else { + neighbors_fst = HIGHBD_CHROMA_NO_SS_LEFT_COLUMN_NEIGHBORS; + neighbors_snd = HIGHBD_CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS; + } + + highbd_apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_fst, + neighbors_snd, top_weight, bottom_weight, NULL); + + blk_col += blk_col_step; + uv_blk_col += uv_blk_col_step; + + // Middle First + if (ss_x && ss_y) { + neighbors_fst = HIGHBD_CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS; + } else if (ss_x || ss_y) { + neighbors_fst = HIGHBD_CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS; + } else { + neighbors_fst = HIGHBD_CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS; + } + + for (; uv_blk_col < uv_mid_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + highbd_apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_fst, neighbors_snd, top_weight, bottom_weight, NULL); + } + + if (!use_whole_blk) { + top_weight = blk_fw[1]; + bottom_weight = blk_fw[3]; + } + + // Middle Second + for (; uv_blk_col < uv_last_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + highbd_apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_fst, neighbors_snd, top_weight, bottom_weight, NULL); + } + + // Right + if (ss_x && ss_y) { + neighbors_snd = HIGHBD_CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS; + } else if (ss_x || ss_y) { + neighbors_snd = HIGHBD_CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS; + } else { + neighbors_snd = HIGHBD_CHROMA_NO_SS_RIGHT_COLUMN_NEIGHBORS; + } + + highbd_apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_fst, + neighbors_snd, top_weight, bottom_weight, NULL); +} + +void vp9_highbd_apply_temporal_filter_neon( + const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre, + int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src, + int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre, + int uv_pre_stride, unsigned int block_width, unsigned int block_height, + int ss_x, int ss_y, int strength, const int *const blk_fw, + int use_whole_blk, uint32_t *y_accum, uint16_t *y_count, uint32_t *u_accum, + uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count) { + const unsigned int chroma_height = block_height >> ss_y, + chroma_width = block_width >> ss_x; + + DECLARE_ALIGNED(16, uint32_t, y_dist[BH * DIST_STRIDE]) = { 0 }; + DECLARE_ALIGNED(16, uint32_t, u_dist[BH * DIST_STRIDE]) = { 0 }; + DECLARE_ALIGNED(16, uint32_t, v_dist[BH * DIST_STRIDE]) = { 0 }; + + uint32_t *y_dist_ptr = y_dist + 1, *u_dist_ptr = u_dist + 1, + *v_dist_ptr = v_dist + 1; + const uint16_t *y_src_ptr = y_src, *u_src_ptr = u_src, *v_src_ptr = v_src; + const uint16_t *y_pre_ptr = y_pre, *u_pre_ptr = u_pre, *v_pre_ptr = v_pre; + + // Loop variables + unsigned int row, blk_col; + + assert(block_width <= BW && "block width too large"); + assert(block_height <= BH && "block height too large"); + assert(block_width % 16 == 0 && "block width must be multiple of 16"); + assert(block_height % 2 == 0 && "block height must be even"); + assert((ss_x == 0 || ss_x == 1) && (ss_y == 0 || ss_y == 1) && + "invalid chroma subsampling"); + assert(strength >= 4 && strength <= 14 && + "invalid adjusted temporal filter strength"); + assert(blk_fw[0] >= 0 && "filter weight must be positive"); + assert( + (use_whole_blk || (blk_fw[1] >= 0 && blk_fw[2] >= 0 && blk_fw[3] >= 0)) && + "subblock filter weight must be positive"); + assert(blk_fw[0] <= 2 && "subblock filter weight must be less than 2"); + assert( + (use_whole_blk || (blk_fw[1] <= 2 && blk_fw[2] <= 2 && blk_fw[3] <= 2)) && + "subblock filter weight must be less than 2"); + + // Precompute the difference squared + for (row = 0; row < block_height; row++) { + for (blk_col = 0; blk_col < block_width; blk_col += 8) { + highbd_store_dist_8(y_src_ptr + blk_col, y_pre_ptr + blk_col, + y_dist_ptr + blk_col); + } + y_src_ptr += y_src_stride; + y_pre_ptr += y_pre_stride; + y_dist_ptr += DIST_STRIDE; + } + + for (row = 0; row < chroma_height; row++) { + for (blk_col = 0; blk_col < chroma_width; blk_col += 8) { + highbd_store_dist_8(u_src_ptr + blk_col, u_pre_ptr + blk_col, + u_dist_ptr + blk_col); + highbd_store_dist_8(v_src_ptr + blk_col, v_pre_ptr + blk_col, + v_dist_ptr + blk_col); + } + + u_src_ptr += uv_src_stride; + u_pre_ptr += uv_pre_stride; + u_dist_ptr += DIST_STRIDE; + v_src_ptr += uv_src_stride; + v_pre_ptr += uv_pre_stride; + v_dist_ptr += DIST_STRIDE; + } + + y_dist_ptr = y_dist + 1; + u_dist_ptr = u_dist + 1; + v_dist_ptr = v_dist + 1; + + highbd_apply_temporal_filter_luma(y_pre, y_pre_stride, block_width, + block_height, ss_x, ss_y, strength, blk_fw, + use_whole_blk, y_accum, y_count, y_dist_ptr, + u_dist_ptr, v_dist_ptr); + + highbd_apply_temporal_filter_chroma( + u_pre, v_pre, uv_pre_stride, block_width, block_height, ss_x, ss_y, + strength, blk_fw, use_whole_blk, u_accum, u_count, v_accum, v_count, + y_dist_ptr, u_dist_ptr, v_dist_ptr); +} diff --git a/vp9/encoder/arm/neon/vp9_temporal_filter_neon.c b/vp9/encoder/arm/neon/vp9_temporal_filter_neon.c new file mode 100644 index 0000000000..a651a15d90 --- /dev/null +++ b/vp9/encoder/arm/neon/vp9_temporal_filter_neon.c @@ -0,0 +1,849 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vp9_rtcd.h" +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vp9/encoder/vp9_encoder.h" +#include "vp9/encoder/vp9_temporal_filter.h" +#include "vp9/encoder/vp9_temporal_filter_constants.h" + +// Read in 8 pixels from a and b as 8-bit unsigned integers, compute the +// difference squared, and store as unsigned 16-bit integer to dst. +static INLINE void store_dist_8(const uint8_t *a, const uint8_t *b, + uint16_t *dst) { + const uint8x8_t a_reg = vld1_u8(a); + const uint8x8_t b_reg = vld1_u8(b); + + uint16x8_t dist_first = vabdl_u8(a_reg, b_reg); + dist_first = vmulq_u16(dist_first, dist_first); + + vst1q_u16(dst, dist_first); +} + +static INLINE void store_dist_16(const uint8_t *a, const uint8_t *b, + uint16_t *dst) { + const uint8x16_t a_reg = vld1q_u8(a); + const uint8x16_t b_reg = vld1q_u8(b); + + uint16x8_t dist_first = vabdl_u8(vget_low_u8(a_reg), vget_low_u8(b_reg)); + uint16x8_t dist_second = vabdl_u8(vget_high_u8(a_reg), vget_high_u8(b_reg)); + dist_first = vmulq_u16(dist_first, dist_first); + dist_second = vmulq_u16(dist_second, dist_second); + + vst1q_u16(dst, dist_first); + vst1q_u16(dst + 8, dist_second); +} + +static INLINE void read_dist_8(const uint16_t *dist, uint16x8_t *dist_reg) { + *dist_reg = vld1q_u16(dist); +} + +static INLINE void read_dist_16(const uint16_t *dist, uint16x8_t *reg_first, + uint16x8_t *reg_second) { + read_dist_8(dist, reg_first); + read_dist_8(dist + 8, reg_second); +} + +// Average the value based on the number of values summed (9 for pixels away +// from the border, 4 for pixels in corners, and 6 for other edge values). +// +// Add in the rounding factor and shift, clamp to 16, invert and shift. Multiply +// by weight. +static INLINE uint16x8_t average_8(uint16x8_t sum, + const uint16x8_t *mul_constants, + const int strength, const int rounding, + const uint16x8_t *weight) { + const uint32x4_t rounding_u32 = vdupq_n_u32(rounding << 16); + const uint16x8_t weight_u16 = *weight; + const uint16x8_t sixteen = vdupq_n_u16(16); + const int32x4_t strength_u32 = vdupq_n_s32(-strength - 16); + + // modifier * 3 / index; + uint32x4_t sum_hi = + vmull_u16(vget_low_u16(sum), vget_low_u16(*mul_constants)); + uint32x4_t sum_lo = + vmull_u16(vget_high_u16(sum), vget_high_u16(*mul_constants)); + + sum_lo = vqaddq_u32(sum_lo, rounding_u32); + sum_hi = vqaddq_u32(sum_hi, rounding_u32); + + // we cannot use vshrn_n_u32 as strength is not known at compile time. + sum_lo = vshlq_u32(sum_lo, strength_u32); + sum_hi = vshlq_u32(sum_hi, strength_u32); + + sum = vcombine_u16(vmovn_u32(sum_hi), vmovn_u32(sum_lo)); + + // The maximum input to this comparison is UINT16_MAX * NEIGHBOR_CONSTANT_4 + // >> 16 (also NEIGHBOR_CONSTANT_4 -1) which is 49151 / 0xbfff / -16385 + // So this needs to use the epu16 version which did not come until SSE4. + sum = vminq_u16(sum, sixteen); + sum = vsubq_u16(sixteen, sum); + return vmulq_u16(sum, weight_u16); +} + +// Add 'sum_u16' to 'count'. Multiply by 'pred' and add to 'accumulator.' +static void accumulate_and_store_8(const uint16x8_t sum_u16, + const uint8_t *pred, uint16_t *count, + uint32_t *accumulator) { + uint16x8_t pred_u16 = vmovl_u8(vld1_u8(pred)); + uint16x8_t count_u16 = vld1q_u16(count); + uint32x4_t accum_0_u32, accum_1_u32; + + count_u16 = vqaddq_u16(count_u16, sum_u16); + vst1q_u16(count, count_u16); + + accum_0_u32 = vld1q_u32(accumulator); + accum_1_u32 = vld1q_u32(accumulator + 4); + + accum_0_u32 = + vmlal_u16(accum_0_u32, vget_low_u16(sum_u16), vget_low_u16(pred_u16)); + accum_1_u32 = + vmlal_u16(accum_1_u32, vget_high_u16(sum_u16), vget_high_u16(pred_u16)); + + vst1q_u32(accumulator, accum_0_u32); + vst1q_u32(accumulator + 4, accum_1_u32); +} + +static INLINE void accumulate_and_store_16(const uint16x8_t sum_0_u16, + const uint16x8_t sum_1_u16, + const uint8_t *pred, uint16_t *count, + uint32_t *accumulator) { + uint8x16_t pred_u8 = vld1q_u8(pred); + uint16x8_t pred_0_u16 = vmovl_u8(vget_low_u8(pred_u8)); + uint16x8_t pred_1_u16 = vmovl_u8(vget_high_u8(pred_u8)); + uint16x8_t count_0_u16 = vld1q_u16(count); + uint16x8_t count_1_u16 = vld1q_u16(count + 8); + uint32x4_t accum_0_u32, accum_1_u32, accum_2_u32, accum_3_u32; + + count_0_u16 = vqaddq_u16(count_0_u16, sum_0_u16); + vst1q_u16(count, count_0_u16); + count_1_u16 = vqaddq_u16(count_1_u16, sum_1_u16); + vst1q_u16(count + 8, count_1_u16); + + accum_0_u32 = vld1q_u32(accumulator); + accum_1_u32 = vld1q_u32(accumulator + 4); + accum_2_u32 = vld1q_u32(accumulator + 8); + accum_3_u32 = vld1q_u32(accumulator + 12); + + accum_0_u32 = + vmlal_u16(accum_0_u32, vget_low_u16(sum_0_u16), vget_low_u16(pred_0_u16)); + accum_1_u32 = vmlal_u16(accum_1_u32, vget_high_u16(sum_0_u16), + vget_high_u16(pred_0_u16)); + accum_2_u32 = + vmlal_u16(accum_2_u32, vget_low_u16(sum_1_u16), vget_low_u16(pred_1_u16)); + accum_3_u32 = vmlal_u16(accum_3_u32, vget_high_u16(sum_1_u16), + vget_high_u16(pred_1_u16)); + + vst1q_u32(accumulator, accum_0_u32); + vst1q_u32(accumulator + 4, accum_1_u32); + vst1q_u32(accumulator + 8, accum_2_u32); + vst1q_u32(accumulator + 12, accum_3_u32); +} + +// Read in 8 pixels from y_dist. For each index i, compute y_dist[i-1] + +// y_dist[i] + y_dist[i+1] and store in sum as 16-bit unsigned int. +static INLINE void get_sum_8(const uint16_t *y_dist, uint16x8_t *sum) { + uint16x8_t dist_reg, dist_left, dist_right; + + dist_reg = vld1q_u16(y_dist); + dist_left = vld1q_u16(y_dist - 1); + dist_right = vld1q_u16(y_dist + 1); + + *sum = vqaddq_u16(dist_reg, dist_left); + *sum = vqaddq_u16(*sum, dist_right); +} + +// Read in 16 pixels from y_dist. For each index i, compute y_dist[i-1] + +// y_dist[i] + y_dist[i+1]. Store the result for first 8 pixels in sum_first and +// the rest in sum_second. +static INLINE void get_sum_16(const uint16_t *y_dist, uint16x8_t *sum_first, + uint16x8_t *sum_second) { + get_sum_8(y_dist, sum_first); + get_sum_8(y_dist + 8, sum_second); +} + +// Read in a row of chroma values corresponds to a row of 16 luma values. +static INLINE void read_chroma_dist_row_16(int ss_x, const uint16_t *u_dist, + const uint16_t *v_dist, + uint16x8_t *u_first, + uint16x8_t *u_second, + uint16x8_t *v_first, + uint16x8_t *v_second) { + if (!ss_x) { + // If there is no chroma subsampling in the horizontal direction, then we + // need to load 16 entries from chroma. + read_dist_16(u_dist, u_first, u_second); + read_dist_16(v_dist, v_first, v_second); + } else { // ss_x == 1 + // Otherwise, we only need to load 8 entries + uint16x8_t u_reg, v_reg; + uint16x8x2_t pair; + + read_dist_8(u_dist, &u_reg); + + pair = vzipq_u16(u_reg, u_reg); + *u_first = pair.val[0]; + *u_second = pair.val[1]; + + read_dist_8(v_dist, &v_reg); + + pair = vzipq_u16(v_reg, v_reg); + *v_first = pair.val[0]; + *v_second = pair.val[1]; + } +} + +// Add a row of luma distortion to 8 corresponding chroma mods. +static INLINE void add_luma_dist_to_8_chroma_mod(const uint16_t *y_dist, + int ss_x, int ss_y, + uint16x8_t *u_mod, + uint16x8_t *v_mod) { + uint16x8_t y_reg; + if (!ss_x) { + read_dist_8(y_dist, &y_reg); + if (ss_y == 1) { + uint16x8_t y_tmp; + read_dist_8(y_dist + DIST_STRIDE, &y_tmp); + + y_reg = vqaddq_u16(y_reg, y_tmp); + } + } else { + uint16x8_t y_first, y_second; + uint32x4_t y_first32, y_second32; + + read_dist_16(y_dist, &y_first, &y_second); + if (ss_y == 1) { + uint16x8_t y_tmp_0, y_tmp_1; + read_dist_16(y_dist + DIST_STRIDE, &y_tmp_0, &y_tmp_1); + + y_first = vqaddq_u16(y_first, y_tmp_0); + y_second = vqaddq_u16(y_second, y_tmp_1); + } + + y_first32 = vpaddlq_u16(y_first); + y_second32 = vpaddlq_u16(y_second); + + y_reg = vcombine_u16(vqmovn_u32(y_first32), vqmovn_u32(y_second32)); + } + + *u_mod = vqaddq_u16(*u_mod, y_reg); + *v_mod = vqaddq_u16(*v_mod, y_reg); +} + +// Apply temporal filter to the luma components. This performs temporal +// filtering on a luma block of 16 X block_height. Use blk_fw as an array of +// size 4 for the weights for each of the 4 subblocks if blk_fw is not NULL, +// else use top_weight for top half, and bottom weight for bottom half. +static void apply_temporal_filter_luma_16( + const uint8_t *y_pre, int y_pre_stride, unsigned int block_width, + unsigned int block_height, int ss_x, int ss_y, int strength, + int use_whole_blk, uint32_t *y_accum, uint16_t *y_count, + const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist, + const int16_t *const *neighbors_first, + const int16_t *const *neighbors_second, int top_weight, int bottom_weight, + const int *blk_fw) { + const int rounding = (1 << strength) >> 1; + uint16x8_t weight_first, weight_second; + + uint16x8_t mul_first, mul_second; + + uint16x8_t sum_row_1_first, sum_row_1_second; + uint16x8_t sum_row_2_first, sum_row_2_second; + uint16x8_t sum_row_3_first, sum_row_3_second; + + uint16x8_t u_first, u_second; + uint16x8_t v_first, v_second; + + uint16x8_t sum_row_first; + uint16x8_t sum_row_second; + + // Loop variables + unsigned int h; + + assert(strength >= 0); + assert(strength <= 6); + + assert(block_width == 16); + (void)block_width; + + // Initialize the weights + if (blk_fw) { + weight_first = vdupq_n_u16(blk_fw[0]); + weight_second = vdupq_n_u16(blk_fw[1]); + } else { + weight_first = vdupq_n_u16(top_weight); + weight_second = weight_first; + } + + // First row + mul_first = vld1q_u16((const uint16_t *)neighbors_first[0]); + mul_second = vld1q_u16((const uint16_t *)neighbors_second[0]); + + // Add luma values + get_sum_16(y_dist, &sum_row_2_first, &sum_row_2_second); + get_sum_16(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second); + + sum_row_first = vqaddq_u16(sum_row_2_first, sum_row_3_first); + sum_row_second = vqaddq_u16(sum_row_2_second, sum_row_3_second); + + // Add chroma values + read_chroma_dist_row_16(ss_x, u_dist, v_dist, &u_first, &u_second, &v_first, + &v_second); + + sum_row_first = vqaddq_u16(sum_row_first, u_first); + sum_row_second = vqaddq_u16(sum_row_second, u_second); + + sum_row_first = vqaddq_u16(sum_row_first, v_first); + sum_row_second = vqaddq_u16(sum_row_second, v_second); + + // Get modifier and store result + sum_row_first = + average_8(sum_row_first, &mul_first, strength, rounding, &weight_first); + + sum_row_second = average_8(sum_row_second, &mul_second, strength, rounding, + &weight_second); + + accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count, + y_accum); + + y_pre += y_pre_stride; + y_count += y_pre_stride; + y_accum += y_pre_stride; + y_dist += DIST_STRIDE; + + u_dist += DIST_STRIDE; + v_dist += DIST_STRIDE; + + // Then all the rows except the last one + mul_first = vld1q_u16((const uint16_t *)neighbors_first[1]); + mul_second = vld1q_u16((const uint16_t *)neighbors_second[1]); + + for (h = 1; h < block_height - 1; ++h) { + // Move the weight to bottom half + if (!use_whole_blk && h == block_height / 2) { + if (blk_fw) { + weight_first = vdupq_n_u16(blk_fw[2]); + weight_second = vdupq_n_u16(blk_fw[3]); + } else { + weight_first = vdupq_n_u16(bottom_weight); + weight_second = weight_first; + } + } + // Shift the rows up + sum_row_1_first = sum_row_2_first; + sum_row_1_second = sum_row_2_second; + sum_row_2_first = sum_row_3_first; + sum_row_2_second = sum_row_3_second; + + // Add luma values to the modifier + sum_row_first = vqaddq_u16(sum_row_1_first, sum_row_2_first); + sum_row_second = vqaddq_u16(sum_row_1_second, sum_row_2_second); + + get_sum_16(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second); + + sum_row_first = vqaddq_u16(sum_row_first, sum_row_3_first); + sum_row_second = vqaddq_u16(sum_row_second, sum_row_3_second); + + // Add chroma values to the modifier + if (ss_y == 0 || h % 2 == 0) { + // Only calculate the new chroma distortion if we are at a pixel that + // corresponds to a new chroma row + read_chroma_dist_row_16(ss_x, u_dist, v_dist, &u_first, &u_second, + &v_first, &v_second); + u_dist += DIST_STRIDE; + v_dist += DIST_STRIDE; + } + + sum_row_first = vqaddq_u16(sum_row_first, u_first); + sum_row_second = vqaddq_u16(sum_row_second, u_second); + sum_row_first = vqaddq_u16(sum_row_first, v_first); + sum_row_second = vqaddq_u16(sum_row_second, v_second); + + // Get modifier and store result + sum_row_first = + average_8(sum_row_first, &mul_first, strength, rounding, &weight_first); + sum_row_second = average_8(sum_row_second, &mul_second, strength, rounding, + &weight_second); + accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count, + y_accum); + y_pre += y_pre_stride; + y_count += y_pre_stride; + y_accum += y_pre_stride; + y_dist += DIST_STRIDE; + } + + // The last row + mul_first = vld1q_u16((const uint16_t *)neighbors_first[0]); + mul_second = vld1q_u16((const uint16_t *)neighbors_second[0]); + + // Shift the rows up + sum_row_1_first = sum_row_2_first; + sum_row_1_second = sum_row_2_second; + sum_row_2_first = sum_row_3_first; + sum_row_2_second = sum_row_3_second; + + // Add luma values to the modifier + sum_row_first = vqaddq_u16(sum_row_1_first, sum_row_2_first); + sum_row_second = vqaddq_u16(sum_row_1_second, sum_row_2_second); + + // Add chroma values to the modifier + if (ss_y == 0) { + // Only calculate the new chroma distortion if we are at a pixel that + // corresponds to a new chroma row + read_chroma_dist_row_16(ss_x, u_dist, v_dist, &u_first, &u_second, &v_first, + &v_second); + } + + sum_row_first = vqaddq_u16(sum_row_first, u_first); + sum_row_second = vqaddq_u16(sum_row_second, u_second); + sum_row_first = vqaddq_u16(sum_row_first, v_first); + sum_row_second = vqaddq_u16(sum_row_second, v_second); + + // Get modifier and store result + sum_row_first = + average_8(sum_row_first, &mul_first, strength, rounding, &weight_first); + sum_row_second = average_8(sum_row_second, &mul_second, strength, rounding, + &weight_second); + accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count, + y_accum); +} + +// Perform temporal filter for the luma component. +static void apply_temporal_filter_luma( + const uint8_t *y_pre, int y_pre_stride, unsigned int block_width, + unsigned int block_height, int ss_x, int ss_y, int strength, + const int *blk_fw, int use_whole_blk, uint32_t *y_accum, uint16_t *y_count, + const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist) { + unsigned int blk_col = 0, uv_blk_col = 0; + const unsigned int blk_col_step = 16, uv_blk_col_step = 16 >> ss_x; + const unsigned int mid_width = block_width >> 1, + last_width = block_width - blk_col_step; + int top_weight = blk_fw[0], + bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2]; + const int16_t *const *neighbors_first; + const int16_t *const *neighbors_second; + + if (block_width == 16) { + // Special Case: The block width is 16 and we are operating on a row of 16 + // chroma pixels. In this case, we can't use the usual left-middle-right + // pattern. We also don't support splitting now. + neighbors_first = LUMA_LEFT_COLUMN_NEIGHBORS; + neighbors_second = LUMA_RIGHT_COLUMN_NEIGHBORS; + if (use_whole_blk) { + apply_temporal_filter_luma_16( + y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength, + use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first, + neighbors_second, top_weight, bottom_weight, NULL); + } else { + apply_temporal_filter_luma_16( + y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength, + use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first, + neighbors_second, 0, 0, blk_fw); + } + + return; + } + + // Left + neighbors_first = LUMA_LEFT_COLUMN_NEIGHBORS; + neighbors_second = LUMA_MIDDLE_COLUMN_NEIGHBORS; + apply_temporal_filter_luma_16( + y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength, + use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first, + neighbors_second, top_weight, bottom_weight, NULL); + + blk_col += blk_col_step; + uv_blk_col += uv_blk_col_step; + + // Middle First + neighbors_first = LUMA_MIDDLE_COLUMN_NEIGHBORS; + for (; blk_col < mid_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + apply_temporal_filter_luma_16( + y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength, + use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first, + neighbors_second, top_weight, bottom_weight, NULL); + } + + if (!use_whole_blk) { + top_weight = blk_fw[1]; + bottom_weight = blk_fw[3]; + } + + // Middle Second + for (; blk_col < last_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + apply_temporal_filter_luma_16( + y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength, + use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first, + neighbors_second, top_weight, bottom_weight, NULL); + } + + // Right + neighbors_second = LUMA_RIGHT_COLUMN_NEIGHBORS; + apply_temporal_filter_luma_16( + y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength, + use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first, + neighbors_second, top_weight, bottom_weight, NULL); +} + +// Apply temporal filter to the chroma components. This performs temporal +// filtering on a chroma block of 8 X uv_height. If blk_fw is not NULL, use +// blk_fw as an array of size 4 for the weights for each of the 4 subblocks, +// else use top_weight for top half, and bottom weight for bottom half. +static void apply_temporal_filter_chroma_8( + const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, + unsigned int uv_block_height, int ss_x, int ss_y, int strength, + uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count, + const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist, + const int16_t *const *neighbors, int top_weight, int bottom_weight, + const int *blk_fw) { + const int rounding = (1 << strength) >> 1; + + uint16x8_t weight; + + uint16x8_t mul; + + uint16x8_t u_sum_row_1, u_sum_row_2, u_sum_row_3; + uint16x8_t v_sum_row_1, v_sum_row_2, v_sum_row_3; + + uint16x8_t u_sum_row, v_sum_row; + + // Loop variable + unsigned int h; + + // Initialize weight + if (blk_fw) { + weight = vcombine_u16(vdup_n_u16(blk_fw[0]), vdup_n_u16(blk_fw[1])); + } else { + weight = vdupq_n_u16(top_weight); + } + + // First row + mul = vld1q_u16((const uint16_t *)neighbors[0]); + + // Add chroma values + get_sum_8(u_dist, &u_sum_row_2); + get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3); + + u_sum_row = vqaddq_u16(u_sum_row_2, u_sum_row_3); + + get_sum_8(v_dist, &v_sum_row_2); + get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3); + + v_sum_row = vqaddq_u16(v_sum_row_2, v_sum_row_3); + + // Add luma values + add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row, &v_sum_row); + + // Get modifier and store result + u_sum_row = average_8(u_sum_row, &mul, strength, rounding, &weight); + v_sum_row = average_8(v_sum_row, &mul, strength, rounding, &weight); + + accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum); + accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum); + + u_pre += uv_pre_stride; + u_dist += DIST_STRIDE; + v_pre += uv_pre_stride; + v_dist += DIST_STRIDE; + u_count += uv_pre_stride; + u_accum += uv_pre_stride; + v_count += uv_pre_stride; + v_accum += uv_pre_stride; + + y_dist += DIST_STRIDE * (1 + ss_y); + + // Then all the rows except the last one + mul = vld1q_u16((const uint16_t *)neighbors[1]); + + for (h = 1; h < uv_block_height - 1; ++h) { + // Move the weight pointer to the bottom half of the blocks + if (h == uv_block_height / 2) { + if (blk_fw) { + weight = vcombine_u16(vdup_n_u16(blk_fw[2]), vdup_n_u16(blk_fw[3])); + } else { + weight = vdupq_n_u16(bottom_weight); + } + } + + // Shift the rows up + u_sum_row_1 = u_sum_row_2; + u_sum_row_2 = u_sum_row_3; + + v_sum_row_1 = v_sum_row_2; + v_sum_row_2 = v_sum_row_3; + + // Add chroma values + u_sum_row = vqaddq_u16(u_sum_row_1, u_sum_row_2); + get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3); + u_sum_row = vqaddq_u16(u_sum_row, u_sum_row_3); + + v_sum_row = vqaddq_u16(v_sum_row_1, v_sum_row_2); + get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3); + v_sum_row = vqaddq_u16(v_sum_row, v_sum_row_3); + + // Add luma values + add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row, &v_sum_row); + + // Get modifier and store result + u_sum_row = average_8(u_sum_row, &mul, strength, rounding, &weight); + v_sum_row = average_8(v_sum_row, &mul, strength, rounding, &weight); + + accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum); + accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum); + + u_pre += uv_pre_stride; + u_dist += DIST_STRIDE; + v_pre += uv_pre_stride; + v_dist += DIST_STRIDE; + u_count += uv_pre_stride; + u_accum += uv_pre_stride; + v_count += uv_pre_stride; + v_accum += uv_pre_stride; + + y_dist += DIST_STRIDE * (1 + ss_y); + } + + // The last row + mul = vld1q_u16((const uint16_t *)neighbors[0]); + + // Shift the rows up + u_sum_row_1 = u_sum_row_2; + u_sum_row_2 = u_sum_row_3; + + v_sum_row_1 = v_sum_row_2; + v_sum_row_2 = v_sum_row_3; + + // Add chroma values + u_sum_row = vqaddq_u16(u_sum_row_1, u_sum_row_2); + v_sum_row = vqaddq_u16(v_sum_row_1, v_sum_row_2); + + // Add luma values + add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row, &v_sum_row); + + // Get modifier and store result + u_sum_row = average_8(u_sum_row, &mul, strength, rounding, &weight); + v_sum_row = average_8(v_sum_row, &mul, strength, rounding, &weight); + + accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum); + accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum); +} + +// Perform temporal filter for the chroma components. +static void apply_temporal_filter_chroma( + const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, + unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, + int strength, const int *blk_fw, int use_whole_blk, uint32_t *u_accum, + uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count, + const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist) { + const unsigned int uv_width = block_width >> ss_x, + uv_height = block_height >> ss_y; + + unsigned int blk_col = 0, uv_blk_col = 0; + const unsigned int uv_blk_col_step = 8, blk_col_step = 8 << ss_x; + const unsigned int uv_mid_width = uv_width >> 1, + uv_last_width = uv_width - uv_blk_col_step; + int top_weight = blk_fw[0], + bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2]; + const int16_t *const *neighbors; + + if (uv_width == 8) { + // Special Case: We are subsampling in x direction on a 16x16 block. Since + // we are operating on a row of 8 chroma pixels, we can't use the usual + // left-middle-right pattern. + assert(ss_x); + + if (ss_y) { + neighbors = CHROMA_DOUBLE_SS_SINGLE_COLUMN_NEIGHBORS; + } else { + neighbors = CHROMA_SINGLE_SS_SINGLE_COLUMN_NEIGHBORS; + } + + if (use_whole_blk) { + apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, + ss_x, ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col, + v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight, + bottom_weight, NULL); + } else { + apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, + ss_x, ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col, + v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, 0, 0, blk_fw); + } + + return; + } + + // Left + if (ss_x && ss_y) { + neighbors = CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS; + } else if (ss_x || ss_y) { + neighbors = CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS; + } else { + neighbors = CHROMA_NO_SS_LEFT_COLUMN_NEIGHBORS; + } + + apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, ss_x, + ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col, + v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight, + bottom_weight, NULL); + + blk_col += blk_col_step; + uv_blk_col += uv_blk_col_step; + + // Middle First + if (ss_x && ss_y) { + neighbors = CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS; + } else if (ss_x || ss_y) { + neighbors = CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS; + } else { + neighbors = CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS; + } + + for (; uv_blk_col < uv_mid_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, ss_x, + ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col, + v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight, + bottom_weight, NULL); + } + + if (!use_whole_blk) { + top_weight = blk_fw[1]; + bottom_weight = blk_fw[3]; + } + + // Middle Second + for (; uv_blk_col < uv_last_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, ss_x, + ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col, + v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight, + bottom_weight, NULL); + } + + // Right + if (ss_x && ss_y) { + neighbors = CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS; + } else if (ss_x || ss_y) { + neighbors = CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS; + } else { + neighbors = CHROMA_NO_SS_RIGHT_COLUMN_NEIGHBORS; + } + + apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, ss_x, + ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col, + v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight, + bottom_weight, NULL); +} + +void vp9_apply_temporal_filter_neon( + const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, + int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, + int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, + int uv_pre_stride, unsigned int block_width, unsigned int block_height, + int ss_x, int ss_y, int strength, const int *const blk_fw, + int use_whole_blk, uint32_t *y_accum, uint16_t *y_count, uint32_t *u_accum, + uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count) { + const unsigned int chroma_height = block_height >> ss_y, + chroma_width = block_width >> ss_x; + + DECLARE_ALIGNED(16, uint16_t, y_dist[BH * DIST_STRIDE]) = { 0 }; + DECLARE_ALIGNED(16, uint16_t, u_dist[BH * DIST_STRIDE]) = { 0 }; + DECLARE_ALIGNED(16, uint16_t, v_dist[BH * DIST_STRIDE]) = { 0 }; + const int *blk_fw_ptr = blk_fw; + + uint16_t *y_dist_ptr = y_dist + 1, *u_dist_ptr = u_dist + 1, + *v_dist_ptr = v_dist + 1; + const uint8_t *y_src_ptr = y_src, *u_src_ptr = u_src, *v_src_ptr = v_src; + const uint8_t *y_pre_ptr = y_pre, *u_pre_ptr = u_pre, *v_pre_ptr = v_pre; + + // Loop variables + unsigned int row, blk_col; + + assert(block_width <= BW && "block width too large"); + assert(block_height <= BH && "block height too large"); + assert(block_width % 16 == 0 && "block width must be multiple of 16"); + assert(block_height % 2 == 0 && "block height must be even"); + assert((ss_x == 0 || ss_x == 1) && (ss_y == 0 || ss_y == 1) && + "invalid chroma subsampling"); + assert(strength >= 0 && strength <= 6 && "invalid temporal filter strength"); + assert(blk_fw[0] >= 0 && "filter weight must be positive"); + assert( + (use_whole_blk || (blk_fw[1] >= 0 && blk_fw[2] >= 0 && blk_fw[3] >= 0)) && + "subblock filter weight must be positive"); + assert(blk_fw[0] <= 2 && "subblock filter weight must be less than 2"); + assert( + (use_whole_blk || (blk_fw[1] <= 2 && blk_fw[2] <= 2 && blk_fw[3] <= 2)) && + "subblock filter weight must be less than 2"); + + // Precompute the difference squared + for (row = 0; row < block_height; row++) { + for (blk_col = 0; blk_col < block_width; blk_col += 16) { + store_dist_16(y_src_ptr + blk_col, y_pre_ptr + blk_col, + y_dist_ptr + blk_col); + } + y_src_ptr += y_src_stride; + y_pre_ptr += y_pre_stride; + y_dist_ptr += DIST_STRIDE; + } + + for (row = 0; row < chroma_height; row++) { + for (blk_col = 0; blk_col < chroma_width; blk_col += 8) { + store_dist_8(u_src_ptr + blk_col, u_pre_ptr + blk_col, + u_dist_ptr + blk_col); + store_dist_8(v_src_ptr + blk_col, v_pre_ptr + blk_col, + v_dist_ptr + blk_col); + } + + u_src_ptr += uv_src_stride; + u_pre_ptr += uv_pre_stride; + u_dist_ptr += DIST_STRIDE; + v_src_ptr += uv_src_stride; + v_pre_ptr += uv_pre_stride; + v_dist_ptr += DIST_STRIDE; + } + + y_dist_ptr = y_dist + 1; + u_dist_ptr = u_dist + 1; + v_dist_ptr = v_dist + 1; + + apply_temporal_filter_luma(y_pre, y_pre_stride, block_width, block_height, + ss_x, ss_y, strength, blk_fw_ptr, use_whole_blk, + y_accum, y_count, y_dist_ptr, u_dist_ptr, + v_dist_ptr); + + apply_temporal_filter_chroma(u_pre, v_pre, uv_pre_stride, block_width, + block_height, ss_x, ss_y, strength, blk_fw_ptr, + use_whole_blk, u_accum, u_count, v_accum, + v_count, y_dist_ptr, u_dist_ptr, v_dist_ptr); +} diff --git a/vp9/encoder/x86/temporal_filter_constants.h b/vp9/encoder/vp9_temporal_filter_constants.h similarity index 98% rename from vp9/encoder/x86/temporal_filter_constants.h rename to vp9/encoder/vp9_temporal_filter_constants.h index 7dcedda192..8776dfc068 100644 --- a/vp9/encoder/x86/temporal_filter_constants.h +++ b/vp9/encoder/vp9_temporal_filter_constants.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 The WebM project authors. All Rights Reserved. + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_VP9_ENCODER_X86_TEMPORAL_FILTER_CONSTANTS_H_ -#define VPX_VP9_ENCODER_X86_TEMPORAL_FILTER_CONSTANTS_H_ +#ifndef VPX_VP9_ENCODER_TEMPORAL_FILTER_CONSTANTS_H_ +#define VPX_VP9_ENCODER_TEMPORAL_FILTER_CONSTANTS_H_ #include "./vpx_config.h" // Division using multiplication and shifting. The C implementation does: @@ -407,4 +407,4 @@ static const uint32_t #define DIST_STRIDE ((BW) + 2) -#endif // VPX_VP9_ENCODER_X86_TEMPORAL_FILTER_CONSTANTS_H_ +#endif // VPX_VP9_ENCODER_TEMPORAL_FILTER_CONSTANTS_H_ diff --git a/vp9/encoder/x86/highbd_temporal_filter_sse4.c b/vp9/encoder/x86/highbd_temporal_filter_sse4.c index bcbf6d77e6..97f182c660 100644 --- a/vp9/encoder/x86/highbd_temporal_filter_sse4.c +++ b/vp9/encoder/x86/highbd_temporal_filter_sse4.c @@ -16,7 +16,7 @@ #include "vpx/vpx_integer.h" #include "vp9/encoder/vp9_encoder.h" #include "vp9/encoder/vp9_temporal_filter.h" -#include "vp9/encoder/x86/temporal_filter_constants.h" +#include "vp9/encoder/vp9_temporal_filter_constants.h" // Compute (a-b)**2 for 8 pixels with size 16-bit static INLINE void highbd_store_dist_8(const uint16_t *a, const uint16_t *b, diff --git a/vp9/encoder/x86/temporal_filter_sse4.c b/vp9/encoder/x86/temporal_filter_sse4.c index 87e68fb438..7571bfccac 100644 --- a/vp9/encoder/x86/temporal_filter_sse4.c +++ b/vp9/encoder/x86/temporal_filter_sse4.c @@ -16,7 +16,7 @@ #include "vpx/vpx_integer.h" #include "vp9/encoder/vp9_encoder.h" #include "vp9/encoder/vp9_temporal_filter.h" -#include "vp9/encoder/x86/temporal_filter_constants.h" +#include "vp9/encoder/vp9_temporal_filter_constants.h" // Read in 8 pixels from a and b as 8-bit unsigned integers, compute the // difference squared, and store as unsigned 16-bit integer to dst. diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk index cccaea712e..882c12d2e6 100644 --- a/vp9/vp9cx.mk +++ b/vp9/vp9cx.mk @@ -110,7 +110,9 @@ VP9_CX_SRCS-yes += encoder/vp9_mbgraph.c VP9_CX_SRCS-yes += encoder/vp9_mbgraph.h VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/temporal_filter_sse4.c -VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/temporal_filter_constants.h +VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/vp9_temporal_filter_constants.h +VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_temporal_filter_neon.c +VP9_CX_SRCS-$(HAVE_NEON) += encoder/vp9_temporal_filter_constants.h VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.c VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_quantize_ssse3.c @@ -120,6 +122,7 @@ VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_diamond_search_sad_neon.c ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_block_error_intrin_sse2.c VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/highbd_temporal_filter_sse4.c +VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_highbd_temporal_filter_neon.c endif VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.asm @@ -156,8 +159,10 @@ VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_firstpass.c VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_mbgraph.c VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_temporal_filter.c VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/x86/temporal_filter_sse4.c -VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/x86/temporal_filter_constants.h +VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_temporal_filter_constants.h VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/x86/highbd_temporal_filter_sse4.c +VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/arm/neon/vp9_temporal_filter_neon.c +VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/arm/neon/vp9_highbd_temporal_filter_neon.c VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_alt_ref_aq.h VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_alt_ref_aq.c VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_aq_variance.c From e553e3acff6d2e894ce0400f15247aa9cca58719 Mon Sep 17 00:00:00 2001 From: Salome Thirot Date: Tue, 7 Mar 2023 15:13:17 +0000 Subject: [PATCH 0670/1000] Add Neon implementation of vp9_highbd_block_error_c Add Neon implementation of vp9_highbd_block_error_c as well as the corresponding tests. Change-Id: Ibe0eb077f959ced0dcd7d0d8d9d529d3b5bc1874 --- test/vp9_block_error_test.cc | 22 ++++++--- vp9/common/vp9_rtcd_defs.pl | 2 +- vp9/encoder/arm/neon/vp9_highbd_error_neon.c | 49 ++++++++++++++++++++ vp9/vp9cx.mk | 3 ++ 4 files changed, 69 insertions(+), 7 deletions(-) create mode 100644 vp9/encoder/arm/neon/vp9_highbd_error_neon.c diff --git a/test/vp9_block_error_test.cc b/test/vp9_block_error_test.cc index bde84cd619..9e9595ebe9 100644 --- a/test/vp9_block_error_test.cc +++ b/test/vp9_block_error_test.cc @@ -199,10 +199,20 @@ INSTANTIATE_TEST_SUITE_P( #endif // HAVE_AVX2 #if HAVE_NEON -INSTANTIATE_TEST_SUITE_P( - NEON, BlockErrorTest, - ::testing::Values(make_tuple(&BlockError8BitWrapper, - &BlockError8BitWrapper, - VPX_BITS_8))); -#endif +const BlockErrorParam neon_block_error_tests[] = { +#if CONFIG_VP9_HIGHBITDEPTH + make_tuple(&vp9_highbd_block_error_neon, &vp9_highbd_block_error_c, + VPX_BITS_10), + make_tuple(&vp9_highbd_block_error_neon, &vp9_highbd_block_error_c, + VPX_BITS_12), + make_tuple(&vp9_highbd_block_error_neon, &vp9_highbd_block_error_c, + VPX_BITS_8), +#endif // CONFIG_VP9_HIGHBITDEPTH + make_tuple(&BlockError8BitWrapper, + &BlockError8BitWrapper, VPX_BITS_8) +}; + +INSTANTIATE_TEST_SUITE_P(NEON, BlockErrorTest, + ::testing::ValuesIn(neon_block_error_tests)); +#endif // HAVE_NEON } // namespace diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index 2f9870dd48..556b361418 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -139,7 +139,7 @@ () specialize qw/vp9_block_error neon avx2 sse2/; add_proto qw/int64_t vp9_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd"; - specialize qw/vp9_highbd_block_error sse2/; + specialize qw/vp9_highbd_block_error neon sse2/; } else { specialize qw/vp9_block_error neon avx2 msa sse2/; } diff --git a/vp9/encoder/arm/neon/vp9_highbd_error_neon.c b/vp9/encoder/arm/neon/vp9_highbd_error_neon.c new file mode 100644 index 0000000000..d9b183472d --- /dev/null +++ b/vp9/encoder/arm/neon/vp9_highbd_error_neon.c @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vp9_rtcd.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" + +int64_t vp9_highbd_block_error_neon(const tran_low_t *coeff, + const tran_low_t *dqcoeff, + intptr_t block_size, int64_t *ssz, int bd) { + uint64x2_t err_u64 = vdupq_n_u64(0); + int64x2_t ssz_s64 = vdupq_n_s64(0); + + const int shift = 2 * (bd - 8); + const int rounding = shift > 0 ? 1 << (shift - 1) : 0; + + assert(block_size >= 16); + assert((block_size % 16) == 0); + + do { + const int32x4_t c = load_tran_low_to_s32q(coeff); + const int32x4_t d = load_tran_low_to_s32q(dqcoeff); + + const uint32x4_t diff = vreinterpretq_u32_s32(vabdq_s32(c, d)); + + err_u64 = vmlal_u32(err_u64, vget_low_u32(diff), vget_low_u32(diff)); + err_u64 = vmlal_u32(err_u64, vget_high_u32(diff), vget_high_u32(diff)); + + ssz_s64 = vmlal_s32(ssz_s64, vget_low_s32(c), vget_low_s32(c)); + ssz_s64 = vmlal_s32(ssz_s64, vget_high_s32(c), vget_high_s32(c)); + + coeff += 4; + dqcoeff += 4; + block_size -= 4; + } while (block_size != 0); + + *ssz = (horizontal_add_int64x2(ssz_s64) + rounding) >> shift; + return ((int64_t)horizontal_add_uint64x2(err_u64) + rounding) >> shift; +} diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk index cccaea712e..f21036357e 100644 --- a/vp9/vp9cx.mk +++ b/vp9/vp9cx.mk @@ -139,6 +139,9 @@ VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_error_avx2.c VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_error_neon.c VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_frame_scale_neon.c VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_quantize_neon.c +ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) +VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_highbd_error_neon.c +endif VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_error_msa.c From f7dbd848e43c0161b8680f799d908d7adb4dd188 Mon Sep 17 00:00:00 2001 From: Salome Thirot Date: Tue, 7 Mar 2023 17:04:31 +0000 Subject: [PATCH 0671/1000] Optimize vpx_satd_neon Optimize Neon implementation of vpx_satd by using ABD and UADALP instead of ABAL and ABAL2, splitting the accumulator and using a dedicated helper function to perform the final reduction. Change-Id: Idcfa49e001b68b1dcd87c13fd9acc317a208cd2a --- vpx_dsp/arm/avg_neon.c | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/vpx_dsp/arm/avg_neon.c b/vpx_dsp/arm/avg_neon.c index 8e57bdaa50..56d97e22ad 100644 --- a/vpx_dsp/arm/avg_neon.c +++ b/vpx_dsp/arm/avg_neon.c @@ -46,29 +46,25 @@ uint32_t vpx_avg_8x8_neon(const uint8_t *a, int a_stride) { // coeff: 16 bits, dynamic range [-32640, 32640]. // length: value range {16, 64, 256, 1024}. +// satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024] int vpx_satd_neon(const tran_low_t *coeff, int length) { - const int16x4_t zero = vdup_n_s16(0); - int32x4_t accum = vdupq_n_s32(0); + int32x4_t sum_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) }; do { - const int16x8_t src0 = load_tran_low_to_s16q(coeff); - const int16x8_t src8 = load_tran_low_to_s16q(coeff + 8); - accum = vabal_s16(accum, vget_low_s16(src0), zero); - accum = vabal_s16(accum, vget_high_s16(src0), zero); - accum = vabal_s16(accum, vget_low_s16(src8), zero); - accum = vabal_s16(accum, vget_high_s16(src8), zero); + int16x8_t abs0, abs1; + const int16x8_t s0 = load_tran_low_to_s16q(coeff); + const int16x8_t s1 = load_tran_low_to_s16q(coeff + 8); + + abs0 = vabsq_s16(s0); + sum_s32[0] = vpadalq_s16(sum_s32[0], abs0); + abs1 = vabsq_s16(s1); + sum_s32[1] = vpadalq_s16(sum_s32[1], abs1); + length -= 16; coeff += 16; } while (length != 0); - { - // satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024] - const int64x2_t s0 = vpaddlq_s32(accum); // cascading summation of 'accum'. - const int32x2_t s1 = vadd_s32(vreinterpret_s32_s64(vget_low_s64(s0)), - vreinterpret_s32_s64(vget_high_s64(s0))); - const int satd = vget_lane_s32(s1, 0); - return satd; - } + return horizontal_add_int32x4(vaddq_s32(sum_s32[0], sum_s32[1])); } void vpx_int_pro_row_neon(int16_t hbuf[16], uint8_t const *ref, From be84aa14dc3d7b1eae3bab9bf060eabadd84196d Mon Sep 17 00:00:00 2001 From: Salome Thirot Date: Wed, 8 Mar 2023 12:01:04 +0000 Subject: [PATCH 0672/1000] Add Neon implementation of vpx_highbd_satd_c Add Neon implementation of vpx_highbd_satd_c as well as the corresponding tests. Change-Id: I3d50e6abdf168fb13743e7d8da9364f072308b7f --- test/avg_test.cc | 9 ++++++++ vpx_dsp/arm/highbd_avg_neon.c | 40 +++++++++++++++++++++++++++++++++++ vpx_dsp/vpx_dsp.mk | 1 + vpx_dsp/vpx_dsp_rtcd_defs.pl | 2 +- 4 files changed, 51 insertions(+), 1 deletion(-) create mode 100644 vpx_dsp/arm/highbd_avg_neon.c diff --git a/test/avg_test.cc b/test/avg_test.cc index bcf8d0d993..dd84403324 100644 --- a/test/avg_test.cc +++ b/test/avg_test.cc @@ -694,6 +694,15 @@ INSTANTIATE_TEST_SUITE_P(NEON, SatdLowbdTest, make_tuple(256, &vpx_satd_neon), make_tuple(1024, &vpx_satd_neon))); +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_SUITE_P( + NEON, SatdHighbdTest, + ::testing::Values(make_tuple(16, &vpx_highbd_satd_neon), + make_tuple(64, &vpx_highbd_satd_neon), + make_tuple(256, &vpx_highbd_satd_neon), + make_tuple(1024, &vpx_highbd_satd_neon))); +#endif // CONFIG_VP9_HIGHBITDEPTH + INSTANTIATE_TEST_SUITE_P( NEON, BlockErrorTestFP, ::testing::Values(make_tuple(16, &vp9_block_error_fp_neon), diff --git a/vpx_dsp/arm/highbd_avg_neon.c b/vpx_dsp/arm/highbd_avg_neon.c new file mode 100644 index 0000000000..3ba58b8005 --- /dev/null +++ b/vpx_dsp/arm/highbd_avg_neon.c @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_dsp_rtcd.h" +#include "./vpx_config.h" + +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" + +// coeff: 32 bits, dynamic range [-2147483648, 2147483647]. +// length: value range {16, 64, 256, 1024}. +// satd: 42 bits, dynamic range [-2147483648 * 1024, 2147483647 * 1024] +int vpx_highbd_satd_neon(const tran_low_t *coeff, int length) { + int64x2_t sum_s64[2] = { vdupq_n_s64(0), vdupq_n_s64(0) }; + + do { + int32x4_t abs0, abs1; + const int32x4_t s0 = load_tran_low_to_s32q(coeff); + const int32x4_t s1 = load_tran_low_to_s32q(coeff + 4); + + abs0 = vabsq_s32(s0); + sum_s64[0] = vpadalq_s32(sum_s64[0], abs0); + abs1 = vabsq_s32(s1); + sum_s64[1] = vpadalq_s32(sum_s64[1], abs1); + + length -= 8; + coeff += 8; + } while (length != 0); + + return (int)horizontal_add_int64x2(vaddq_s64(sum_s64[0], sum_s64[1])); +} diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index ab8e5bd817..207cda6310 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -344,6 +344,7 @@ DSP_SRCS-$(HAVE_NEON) += arm/avg_neon.c DSP_SRCS-$(HAVE_NEON) += arm/hadamard_neon.c ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_NEON) += arm/highbd_hadamard_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/highbd_avg_neon.c endif DSP_SRCS-$(HAVE_MSA) += mips/avg_msa.c DSP_SRCS-$(HAVE_LSX) += loongarch/avg_lsx.c diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 3baf16cc8b..2a01ec1b54 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -821,7 +821,7 @@ () specialize qw/vpx_satd avx2 sse2 neon/; add_proto qw/int vpx_highbd_satd/, "const tran_low_t *coeff, int length"; - specialize qw/vpx_highbd_satd avx2/; + specialize qw/vpx_highbd_satd avx2 neon/; } else { add_proto qw/void vpx_hadamard_8x8/, "const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff"; specialize qw/vpx_hadamard_8x8 sse2 neon msa vsx lsx/, "$ssse3_x86_64"; From 362c69cfe565e68f240eb37ae05695c50b435656 Mon Sep 17 00:00:00 2001 From: Salome Thirot Date: Wed, 8 Mar 2023 14:08:23 +0000 Subject: [PATCH 0673/1000] Optimize vpx_minmax_8x8_neon for aarch64 Optimize vpx_minmax_8x8_neon on AArch64 targets by using the UMAXV and UMINV instructions - computing the maximum and minimum elements in a Neon vector. Change-Id: I54c3a3a087d266f6774e6113e5947253df288a64 --- vpx_dsp/arm/avg_neon.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/vpx_dsp/arm/avg_neon.c b/vpx_dsp/arm/avg_neon.c index 56d97e22ad..d48115dd01 100644 --- a/vpx_dsp/arm/avg_neon.c +++ b/vpx_dsp/arm/avg_neon.c @@ -210,11 +210,16 @@ void vpx_minmax_8x8_neon(const uint8_t *a, int a_stride, const uint8_t *b, const uint8x16_t ab07_max = vmaxq_u8(ab0123_max, ab4567_max); const uint8x16_t ab07_min = vminq_u8(ab0123_min, ab4567_min); - // Split to D and start doing pairwise. +#if defined(__aarch64__) + *min = *max = 0; // Clear high bits + *((uint8_t *)max) = vmaxvq_u8(ab07_max); + *((uint8_t *)min) = vminvq_u8(ab07_min); +#else + // Split into 64-bit vectors and execute pairwise min/max. uint8x8_t ab_max = vmax_u8(vget_high_u8(ab07_max), vget_low_u8(ab07_max)); uint8x8_t ab_min = vmin_u8(vget_high_u8(ab07_min), vget_low_u8(ab07_min)); - // Enough runs of vpmax/min propogate the max/min values to every position. + // Enough runs of vpmax/min propagate the max/min values to every position. ab_max = vpmax_u8(ab_max, ab_max); ab_min = vpmin_u8(ab_min, ab_min); @@ -228,4 +233,5 @@ void vpx_minmax_8x8_neon(const uint8_t *a, int a_stride, const uint8_t *b, // Store directly to avoid costly neon->gpr transfer. vst1_lane_u8((uint8_t *)max, ab_max, 0); vst1_lane_u8((uint8_t *)min, ab_min, 0); +#endif } From 5c2cd048a05d8d06777cb9af5c0f4a261456023b Mon Sep 17 00:00:00 2001 From: Anupam Pandey Date: Tue, 14 Mar 2023 16:50:31 +0530 Subject: [PATCH 0674/1000] Add AVX2 for convolve horizontal filter for block width 4 Introduced AVX2 intrinsic to compute convolve horizontal for w = 4 case. This is a bit-exact change. Instruction Count cpu Resolution Reduction(%) 0 LOWRES2 0.763 0 MIDRES2 0.466 0 HDRES2 0.317 0 Average 0.516 Change-Id: I124f3f8e994c24461812f4963b113819466db44f --- vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c | 152 +++++++++++++++++++++- 1 file changed, 149 insertions(+), 3 deletions(-) diff --git a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c index 141614e7ad..37ef59f36c 100644 --- a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c +++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c @@ -39,6 +39,12 @@ DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = { 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 }; +DECLARE_ALIGNED(32, static const uint8_t, filt_d4_global_avx2[64]) = { + 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 0, 1, 2, 3, 1, 2, + 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, + 7, 8, 9, 10, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, +}; + #define CALC_CONVOLVE8_HORZ_ROW \ srcReg = xx_loadu2_mi128(src_ptr - 3 + src_pitch, src_ptr - 3); \ s1[0] = _mm256_shuffle_epi8(srcReg, filt[0]); \ @@ -1036,18 +1042,158 @@ static void vpx_filter_block1d8_v8_avx2( } while (y > 1); } +static void vpx_filter_block1d4_h8_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, + ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m256i addFilterReg64_256bit; + unsigned int y = output_height; + + assert(output_height > 1); + + addFilterReg64_256bit = _mm256_set1_epi16(32); + + // f7 f6 f5 f4 f3 f2 f1 f0 (16 bit) + filtersReg = _mm_loadu_si128((const __m128i *)filter); + + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. + // f7 f6 f5 f4 f3 f2 f1 f0 || f7 f6 f5 f4 f3 f2 f1 f0 (8 bit each) + filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + + { + ptrdiff_t src_stride; + __m256i filt1Reg, filt2Reg, firstFilters, secondFilters; + // have the same data in both lanes of a 256 bit register + // f7 f6 f5 f4 f3 f2 f1 f0 f7 f6 f5 f4 f3 f2 f1 f0 | f7 f6 f5 f4 f3 f2 f1 f0 + // f7 f6 f5 f4 f3 f2 f1 f0 (8bit each) + const __m256i filtersReg32 = _mm256_broadcastsi128_si256(filtersReg); + + // duplicate only the first 32 bits + // f3 f2 f1 f0|f3 f2 f1 f0|f3 f2 f1 f0|f3 f2 f1 f0 | f3 f2 f1 f0|f3 f2 f1 + // f0|f3 f2 f1 f0|f3 f2 f1 f0 + firstFilters = _mm256_shuffle_epi32(filtersReg32, 0); + // duplicate only the second 32 bits + // f7 f6 f5 f4|f7 f6 f5 f4|f7 f6 f5 f4|f7 f6 f5 f4 | f7 f6 f5 f4|f7 f6 f5 + // f4|f7 f6 f5 f4|f7 f6 f5 f4 + secondFilters = _mm256_shuffle_epi32(filtersReg32, 0x55); + + // s6 s5 s4 s3 s5 s4 s3 s2 s4 s3 s2 s1 s3 s2 s1 s0 | s6 s5 s4 s3 s5 s4 s3 + // s2 s4 s3 s2 s1 s3 s2 s1 s0 + filt1Reg = _mm256_load_si256((__m256i const *)filt_d4_global_avx2); + + // s10 s9 s8 s7 s9 s8 s7 s6 s8 s7 s6 s5 s7 s6 s5 s4 | s10 s9 s8 s7 s9 s8 s7 + // s6 s8 s7 s6 s5 s7 s6 s5 s4 + filt2Reg = _mm256_load_si256((__m256i const *)(filt_d4_global_avx2 + 32)); + + // multiple the size of the source and destination stride by two + src_stride = src_pitch << 1; + + do { + __m256i srcRegFilt32b1_1, srcRegFilt32b2, srcReg32b1; + // load the 2 strides of source + // r115 r114 ...... r15 r14 r13 r12 r11 r10 | r015 r014 r013 ...... r07 + // r06 r05 r04 r03 r02 r01 r00 + srcReg32b1 = xx_loadu2_mi128(src_ptr - 3 + src_pitch, src_ptr - 3); + + // filter the source buffer + // r16 r15 r14 r13 r15 r14 r13 r12 r14 r13 r12 r11 r13 r12 r11 r10 | r06 + // r05 r04 r03 r05 r04 r03 r02 r04 r03 r02 r01 r03 r02 r01 r00 + srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg); + + // multiply 4 adjacent elements with the filter and add the result + // .....|f3*r14+f2*r13|f1*r13+f0*r12|f3*r13+f2*r12|f1*r11+f0*r10||......... + // |f1*r03+f0*r02|f3*r04+f2*r03|f1*r02+f0*r01|f3*r03+f2*r02|f1*r01+f0*r00 + srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters); + + // filter the source buffer + // r110 r19 r18 r17|r19 r18 r17 r16|r18 r17 r16 r15|r17 r16 r15 r14||r010 + // r09 r08 r07|r09 r08 r07 r06|r08 r07 r06 r05|r07 r06 r05 r04 + srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg); + + // multiply 4 adjacent elements with the filter and add the result + // r010 r09 r08 r07|r9 r08 r07 r06|r08 r07 r06 r05|r07 r06 r05 r04||r010 + // r09 r08 r07|r9 r08 r07 r06|r08 r07 r06 r05|r07 r06 r05 r04 + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, secondFilters); + + srcRegFilt32b1_1 = + _mm256_add_epi16(srcRegFilt32b1_1, addFilterReg64_256bit); + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2); + + srcRegFilt32b1_1 = + _mm256_hadds_epi16(srcRegFilt32b1_1, _mm256_setzero_si256()); + + // 0 0 0 0 R13 R12 R11 R10 || 0 0 0 0 R03 R02 R01 R00 (16bit) + srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 7); + + // 8zeros 0 0 0 0 R13 R12 R11 R10 || 8zeros 0 0 0 0 R03 R02 R01 R00 (8bit) + srcRegFilt32b1_1 = + _mm256_packus_epi16(srcRegFilt32b1_1, _mm256_setzero_si256()); + + src_ptr += src_stride; + // save first row 4 values + *((int *)&output_ptr[0]) = + _mm_cvtsi128_si32(_mm256_castsi256_si128(srcRegFilt32b1_1)); + output_ptr += output_pitch; + + // save second row 4 values + *((int *)&output_ptr[0]) = + _mm_cvtsi128_si32(_mm256_extractf128_si256(srcRegFilt32b1_1, 1)); + output_ptr += output_pitch; + + y = y - 2; + } while (y > 1); + + // For remaining height + if (y > 0) { + __m128i srcReg1, srcRegFilt1_1, addFilterReg64; + __m128i srcRegFilt2; + + addFilterReg64 = _mm_set1_epi32((int)0x0400040u); + + srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); + + // filter the source buffer + srcRegFilt1_1 = + _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg)); + + // multiply 4 adjacent elements with the filter and add the result + srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, + _mm256_castsi256_si128(firstFilters)); + + // filter the source buffer + srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg)); + + // multiply 4 adjacent elements with the filter and add the result + srcRegFilt2 = + _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(secondFilters)); + + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); + srcRegFilt1_1 = _mm_hadds_epi16(srcRegFilt1_1, _mm_setzero_si128()); + // shift by 6 bit each 16 bit + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, addFilterReg64); + srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve result + srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128()); + + // save 4 bytes + *((int *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt1_1); + } + } +} + #if HAVE_AVX2 && HAVE_SSSE3 filter8_1dfunction vpx_filter_block1d4_v8_ssse3; #if VPX_ARCH_X86_64 filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3; filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3; filter8_1dfunction vpx_filter_block1d4_h8_intrin_ssse3; -#define vpx_filter_block1d4_h8_avx2 vpx_filter_block1d4_h8_intrin_ssse3 -#else // VPX_ARCH_X86 +#else // VPX_ARCH_X86 filter8_1dfunction vpx_filter_block1d8_v8_ssse3; filter8_1dfunction vpx_filter_block1d8_h8_ssse3; filter8_1dfunction vpx_filter_block1d4_h8_ssse3; -#define vpx_filter_block1d4_h8_avx2 vpx_filter_block1d4_h8_ssse3 #endif // VPX_ARCH_X86_64 filter8_1dfunction vpx_filter_block1d8_v8_avg_ssse3; filter8_1dfunction vpx_filter_block1d8_h8_avg_ssse3; From d67a0021e798bdda8917a787832caf7038be56d0 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Thu, 16 Mar 2023 13:37:56 -0700 Subject: [PATCH 0675/1000] Update the sample code for VP9RateControlRTC Update the sample code to the current VP9RateControlRTC interface. Change-Id: I30b0712c897f93fd62ebce51ce39afce3cac1fd7 --- vp9/ratectrl_rtc.h | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/vp9/ratectrl_rtc.h b/vp9/ratectrl_rtc.h index 3131c22231..a846f0742d 100644 --- a/vp9/ratectrl_rtc.h +++ b/vp9/ratectrl_rtc.h @@ -69,22 +69,21 @@ struct VP9SegmentationData { // the encoder. To use this interface, you need to link with libvpxrc.a. // // #include "vp9/ratectrl_rtc.h" -// VP9RateControlRTC rc_api; // VP9RateControlRtcConfig cfg; // VP9FrameParamsQpRTC frame_params; // // YourFunctionToInitializeConfig(cfg); -// rc_api.InitRateControl(cfg); +// std::unique_ptr rc_api = VP9RateControlRTC::Create(cfg); // // start encoding // while (frame_to_encode) { // if (config_changed) -// rc_api.UpdateRateControl(cfg); +// rc_api->UpdateRateControl(cfg); // YourFunctionToFillFrameParams(frame_params); -// rc_api.ComputeQP(frame_params); -// YourFunctionToUseQP(rc_api.GetQP()); -// YourFunctionToUseLoopfilter(rc_api.GetLoopfilterLevel()); +// rc_api->ComputeQP(frame_params); +// YourFunctionToUseQP(rc_api->GetQP()); +// YourFunctionToUseLoopfilter(rc_api->GetLoopfilterLevel()); // // After encoding -// rc_api.PostEncode(encoded_frame_size); +// rc_api->PostEncode(encoded_frame_size, frame_params); // } class VP9RateControlRTC { public: From d6b6f85063772f25624c6429cfa414b8d48b9c0e Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Thu, 16 Mar 2023 15:21:49 -0700 Subject: [PATCH 0676/1000] Remove repeated field from VP9RateControlRtcConfig Remove the `ts_number_layers` field from VP9RateControlRtcConfig because the base class VpxRateControlRtcConfig already has that field. Note: In commit 65a1751e5b98bf7f1d21bcbfdef352af34fb205d, `ts_number_layers` was moved to the newly created base class VpxRateControlRtcConfig but was inadvertently left in VP9RateControlRtcConfig: https://chromium-review.googlesource.com/c/webm/libvpx/+/3140048, Change-Id: I98d48e152683ec2e5e62efffb56b7f010c5d0695 --- vp9/ratectrl_rtc.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/vp9/ratectrl_rtc.h b/vp9/ratectrl_rtc.h index a846f0742d..a82c776f94 100644 --- a/vp9/ratectrl_rtc.h +++ b/vp9/ratectrl_rtc.h @@ -44,8 +44,6 @@ struct VP9RateControlRtcConfig : public VpxRateControlRtcConfig { // Number of spatial layers int ss_number_layers; - // Number of temporal layers - int ts_number_layers; int max_quantizers[VPX_MAX_LAYERS]; int min_quantizers[VPX_MAX_LAYERS]; int scaling_factor_num[VPX_SS_MAX_LAYERS]; From d92681b06ff7b0a51c51cbc012781851afa0701e Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Thu, 16 Mar 2023 18:36:13 -0700 Subject: [PATCH 0677/1000] Set oxcf->ts_rate_decimator[tl] only once The code that sets oxcf->ts_rate_decimator[tl] does not need to be inside a loop that iterates over sl. Move the code out of the sl loop so that oxcf->ts_rate_decimator[tl] is set only once. Change-Id: I22f6c117d200ec38a757b749a8700660d15436c1 --- vp9/ratectrl_rtc.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vp9/ratectrl_rtc.cc b/vp9/ratectrl_rtc.cc index 8592173fb6..cc12ea336a 100644 --- a/vp9/ratectrl_rtc.cc +++ b/vp9/ratectrl_rtc.cc @@ -136,6 +136,9 @@ void VP9RateControlRTC::UpdateRateControl( cpi_->svc.number_spatial_layers = rc_cfg.ss_number_layers; cpi_->svc.number_temporal_layers = rc_cfg.ts_number_layers; vp9_set_mb_mi(cm, cm->width, cm->height); + for (int tl = 0; tl < cpi_->svc.number_temporal_layers; ++tl) { + oxcf->ts_rate_decimator[tl] = rc_cfg.ts_rate_decimator[tl]; + } for (int sl = 0; sl < cpi_->svc.number_spatial_layers; ++sl) { for (int tl = 0; tl < cpi_->svc.number_temporal_layers; ++tl) { const int layer = @@ -149,7 +152,6 @@ void VP9RateControlRTC::UpdateRateControl( lrc->best_quality = vp9_quantizer_to_qindex(rc_cfg.min_quantizers[layer]); lc->scaling_factor_num = rc_cfg.scaling_factor_num[sl]; lc->scaling_factor_den = rc_cfg.scaling_factor_den[sl]; - oxcf->ts_rate_decimator[tl] = rc_cfg.ts_rate_decimator[tl]; } } vp9_set_rc_buffer_sizes(cpi_); From 430c6c1553df0abfe2dadc480b21dd691a98140f Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Thu, 16 Mar 2023 13:30:01 -0700 Subject: [PATCH 0678/1000] Change UpdateRateControl() to return bool Change the VP9RateControlRtcConfig constructor to initialize ss_number_layers (to 1). Change UpdateRateControl() to return bool so that it can report failure (due to invalid configuration). Also change InitRateControl() to return bool to propagate the return value of UpdateRateControl(). Note: This is a port of the libaom CL https://aomedia-review.googlesource.com/c/aom/+/172042. Change-Id: I90b60353b5f15692dba5d89e7b1a9c81bb2fdd89 --- test/vp8_ratectrl_rtc_test.cc | 8 ++++---- test/vp9_ratectrl_rtc_test.cc | 6 +++--- vp8/vp8_ratectrl_rtc.cc | 15 +++++++++++---- vp8/vp8_ratectrl_rtc.h | 4 ++-- vp9/ratectrl_rtc.cc | 30 ++++++++++++++++++------------ vp9/ratectrl_rtc.h | 5 +++-- 6 files changed, 41 insertions(+), 27 deletions(-) diff --git a/test/vp8_ratectrl_rtc_test.cc b/test/vp8_ratectrl_rtc_test.cc index 56c26a99f4..b76bcae11d 100644 --- a/test/vp8_ratectrl_rtc_test.cc +++ b/test/vp8_ratectrl_rtc_test.cc @@ -160,7 +160,7 @@ class Vp8RcInterfaceTest if (test_video_.width == 640 && target_bitrate_ == 1000) return; SetConfig(); rc_api_ = libvpx::VP8RateControlRTC::Create(rc_cfg_); - rc_api_->UpdateRateControl(rc_cfg_); + ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_)); ::libvpx_test::I420VideoSource video(test_video_.name, test_video_.width, test_video_.height, 30, 1, 0, @@ -177,7 +177,7 @@ class Vp8RcInterfaceTest key_interval_ = 100; SetConfig(); rc_api_ = libvpx::VP8RateControlRTC::Create(rc_cfg_); - rc_api_->UpdateRateControl(rc_cfg_); + ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_)); ::libvpx_test::I420VideoSource video(test_video_.name, test_video_.width, test_video_.height, 30, 1, 0, @@ -193,7 +193,7 @@ class Vp8RcInterfaceTest if (test_video_.width == 640 && target_bitrate_ == 1000) return; SetConfigTemporalLayers(2); rc_api_ = libvpx::VP8RateControlRTC::Create(rc_cfg_); - rc_api_->UpdateRateControl(rc_cfg_); + ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_)); ::libvpx_test::I420VideoSource video(test_video_.name, test_video_.width, test_video_.height, 30, 1, 0, @@ -209,7 +209,7 @@ class Vp8RcInterfaceTest if (test_video_.width == 640 && target_bitrate_ == 1000) return; SetConfigTemporalLayers(3); rc_api_ = libvpx::VP8RateControlRTC::Create(rc_cfg_); - rc_api_->UpdateRateControl(rc_cfg_); + ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_)); ::libvpx_test::I420VideoSource video(test_video_.name, test_video_.width, test_video_.height, 30, 1, 0, diff --git a/test/vp9_ratectrl_rtc_test.cc b/test/vp9_ratectrl_rtc_test.cc index c6ab5b034f..5abda1290a 100644 --- a/test/vp9_ratectrl_rtc_test.cc +++ b/test/vp9_ratectrl_rtc_test.cc @@ -212,7 +212,7 @@ class RcInterfaceSvcTest rc_cfg_.layer_target_bitrate[6] = 0; rc_cfg_.layer_target_bitrate[7] = 0; rc_cfg_.layer_target_bitrate[8] = 0; - rc_api_->UpdateRateControl(rc_cfg_); + ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_)); } else if (video->frame() == 200) { // Go down to 1 spatial layer. // Update the encoder config. @@ -226,7 +226,7 @@ class RcInterfaceSvcTest rc_cfg_.layer_target_bitrate[3] = 0; rc_cfg_.layer_target_bitrate[4] = 0; rc_cfg_.layer_target_bitrate[5] = 0; - rc_api_->UpdateRateControl(rc_cfg_); + ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_)); } else if (0 && video->frame() == 280) { // TODO(marpan): Re-enable this going back up when issue is fixed. // Go back up to 3 spatial layers. @@ -235,7 +235,7 @@ class RcInterfaceSvcTest encoder->Config(&cfg_); // Update the RC config. SetRCConfigSvc(3, 3); - rc_api_->UpdateRateControl(rc_cfg_); + ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_)); } } } diff --git a/vp8/vp8_ratectrl_rtc.cc b/vp8/vp8_ratectrl_rtc.cc index c36cfea485..65c58536aa 100644 --- a/vp8/vp8_ratectrl_rtc.cc +++ b/vp8/vp8_ratectrl_rtc.cc @@ -62,7 +62,7 @@ std::unique_ptr VP8RateControlRTC::Create( if (!rc_api->cpi_) return nullptr; vp8_zero(*rc_api->cpi_); - rc_api->InitRateControl(cfg); + if (!rc_api->InitRateControl(cfg)) return nullptr; return rc_api; } @@ -74,7 +74,7 @@ VP8RateControlRTC::~VP8RateControlRTC() { } } -void VP8RateControlRTC::InitRateControl(const VP8RateControlRtcConfig &rc_cfg) { +bool VP8RateControlRTC::InitRateControl(const VP8RateControlRtcConfig &rc_cfg) { VP8_COMMON *cm = &cpi_->common; VP8_CONFIG *oxcf = &cpi_->oxcf; oxcf->end_usage = USAGE_STREAM_FROM_SERVER; @@ -92,13 +92,19 @@ void VP8RateControlRTC::InitRateControl(const VP8RateControlRtcConfig &rc_cfg) { cpi_->kf_bitrate_adjustment = 0; cpi_->gf_overspend_bits = 0; cpi_->non_gf_bitrate_adjustment = 0; - UpdateRateControl(rc_cfg); + if (!UpdateRateControl(rc_cfg)) return false; cpi_->buffer_level = oxcf->starting_buffer_level; cpi_->bits_off_target = oxcf->starting_buffer_level; + return true; } -void VP8RateControlRTC::UpdateRateControl( +bool VP8RateControlRTC::UpdateRateControl( const VP8RateControlRtcConfig &rc_cfg) { + if (rc_cfg.ts_number_layers < 1 || + rc_cfg.ts_number_layers > VPX_TS_MAX_LAYERS) { + return false; + } + VP8_COMMON *cm = &cpi_->common; VP8_CONFIG *oxcf = &cpi_->oxcf; const unsigned int prev_number_of_layers = oxcf->number_of_layers; @@ -199,6 +205,7 @@ void VP8RateControlRTC::UpdateRateControl( vp8_new_framerate(cpi_, cpi_->framerate); vpx_clear_system_state(); + return true; } void VP8RateControlRTC::ComputeQP(const VP8FrameParamsQpRTC &frame_params) { diff --git a/vp8/vp8_ratectrl_rtc.h b/vp8/vp8_ratectrl_rtc.h index 0e81592eca..a8a886c56e 100644 --- a/vp8/vp8_ratectrl_rtc.h +++ b/vp8/vp8_ratectrl_rtc.h @@ -39,7 +39,7 @@ class VP8RateControlRTC { const VP8RateControlRtcConfig &cfg); ~VP8RateControlRTC(); - void UpdateRateControl(const VP8RateControlRtcConfig &rc_cfg); + bool UpdateRateControl(const VP8RateControlRtcConfig &rc_cfg); // GetQP() needs to be called after ComputeQP() to get the latest QP int GetQP() const; // int GetLoopfilterLevel() const; @@ -49,7 +49,7 @@ class VP8RateControlRTC { private: VP8RateControlRTC() {} - void InitRateControl(const VP8RateControlRtcConfig &cfg); + bool InitRateControl(const VP8RateControlRtcConfig &cfg); struct VP8_COMP *cpi_; int q_; }; diff --git a/vp9/ratectrl_rtc.cc b/vp9/ratectrl_rtc.cc index cc12ea336a..29033d4ba5 100644 --- a/vp9/ratectrl_rtc.cc +++ b/vp9/ratectrl_rtc.cc @@ -25,22 +25,16 @@ std::unique_ptr VP9RateControlRTC::Create( VP9RateControlRTC()); if (!rc_api) return nullptr; rc_api->cpi_ = static_cast(vpx_memalign(32, sizeof(*cpi_))); - if (!rc_api->cpi_) { - rc_api.reset(); - return nullptr; - } + if (!rc_api->cpi_) return nullptr; vp9_zero(*rc_api->cpi_); - rc_api->InitRateControl(cfg); + if (!rc_api->InitRateControl(cfg)) return nullptr; if (cfg.aq_mode) { VP9_COMP *const cpi = rc_api->cpi_; cpi->segmentation_map = static_cast( vpx_calloc(cpi->common.mi_rows * cpi->common.mi_cols, sizeof(*cpi->segmentation_map))); - if (!cpi->segmentation_map) { - rc_api.reset(); - return nullptr; - } + if (!cpi->segmentation_map) return nullptr; cpi->cyclic_refresh = vp9_cyclic_refresh_alloc(cpi->common.mi_rows, cpi->common.mi_cols); cpi->cyclic_refresh->content_mode = 0; @@ -71,7 +65,7 @@ VP9RateControlRTC::~VP9RateControlRTC() { } } -void VP9RateControlRTC::InitRateControl(const VP9RateControlRtcConfig &rc_cfg) { +bool VP9RateControlRTC::InitRateControl(const VP9RateControlRtcConfig &rc_cfg) { VP9_COMMON *cm = &cpi_->common; VP9EncoderConfig *oxcf = &cpi_->oxcf; RATE_CONTROL *const rc = &cpi_->rc; @@ -88,7 +82,7 @@ void VP9RateControlRTC::InitRateControl(const VP9RateControlRtcConfig &rc_cfg) { cm->current_video_frame = 0; rc->kf_boost = DEFAULT_KF_BOOST; - UpdateRateControl(rc_cfg); + if (!UpdateRateControl(rc_cfg)) return false; vp9_set_mb_mi(cm, cm->width, cm->height); cpi_->use_svc = (cpi_->svc.number_spatial_layers > 1 || @@ -102,10 +96,21 @@ void VP9RateControlRTC::InitRateControl(const VP9RateControlRtcConfig &rc_cfg) { vp9_rc_init(oxcf, 0, rc); rc->constrain_gf_key_freq_onepass_vbr = 0; cpi_->sf.use_nonrd_pick_mode = 1; + return true; } -void VP9RateControlRTC::UpdateRateControl( +bool VP9RateControlRTC::UpdateRateControl( const VP9RateControlRtcConfig &rc_cfg) { + // Since VPX_MAX_LAYERS (12) is less than the product of VPX_SS_MAX_LAYERS (5) + // and VPX_TS_MAX_LAYERS (5), check all three. + if (rc_cfg.ss_number_layers < 1 || + rc_cfg.ss_number_layers > VPX_SS_MAX_LAYERS || + rc_cfg.ts_number_layers < 1 || + rc_cfg.ts_number_layers > VPX_TS_MAX_LAYERS || + rc_cfg.ss_number_layers * rc_cfg.ts_number_layers > VPX_MAX_LAYERS) { + return false; + } + VP9_COMMON *cm = &cpi_->common; VP9EncoderConfig *oxcf = &cpi_->oxcf; RATE_CONTROL *const rc = &cpi_->rc; @@ -163,6 +168,7 @@ void VP9RateControlRTC::UpdateRateControl( (int)cpi_->oxcf.target_bandwidth); } vp9_check_reset_rc_flag(cpi_); + return true; } void VP9RateControlRTC::ComputeQP(const VP9FrameParamsQpRTC &frame_params) { diff --git a/vp9/ratectrl_rtc.h b/vp9/ratectrl_rtc.h index a82c776f94..7f3c900459 100644 --- a/vp9/ratectrl_rtc.h +++ b/vp9/ratectrl_rtc.h @@ -30,6 +30,7 @@ namespace libvpx { struct VP9RateControlRtcConfig : public VpxRateControlRtcConfig { public: VP9RateControlRtcConfig() { + ss_number_layers = 1; vp9_zero(max_quantizers); vp9_zero(min_quantizers); vp9_zero(scaling_factor_den); @@ -89,7 +90,7 @@ class VP9RateControlRTC { const VP9RateControlRtcConfig &cfg); ~VP9RateControlRTC(); - void UpdateRateControl(const VP9RateControlRtcConfig &rc_cfg); + bool UpdateRateControl(const VP9RateControlRtcConfig &rc_cfg); // GetQP() needs to be called after ComputeQP() to get the latest QP int GetQP() const; int GetLoopfilterLevel() const; @@ -101,7 +102,7 @@ class VP9RateControlRTC { private: VP9RateControlRTC() {} - void InitRateControl(const VP9RateControlRtcConfig &cfg); + bool InitRateControl(const VP9RateControlRtcConfig &cfg); struct VP9_COMP *cpi_; }; From 02fd7d6aeb1ea6d8eeef17315a6a7c4ffa6d7352 Mon Sep 17 00:00:00 2001 From: Johann Date: Sat, 12 Nov 2022 08:23:17 +0900 Subject: [PATCH 0679/1000] Reland "quantize: simplifly highbd 32x32_b args" This is a reland of commit 573f5e662b544dbc553d73fa2b61055c30dfe8cc Alignment issue with tests fixed in crrev.com/c/webm/libvpx/+/4305500 Original change's description: > quantize: simplify highbd 32x32_b args > > Change-Id: I431a41279c4c4193bc70cfe819da6ea7e1d2fba1 Change-Id: Ic868b6f987c99d88672858fedd092fa49c125e19 --- test/vp9_quantize_test.cc | 54 +++++++++++------------ vp9/encoder/vp9_encodemb.c | 10 ++--- vpx_dsp/arm/highbd_quantize_neon.c | 21 +++++---- vpx_dsp/quantize.c | 16 ++++--- vpx_dsp/vpx_dsp_rtcd_defs.pl | 2 +- vpx_dsp/x86/highbd_quantize_intrin_avx2.c | 13 +++--- vpx_dsp/x86/highbd_quantize_intrin_sse2.c | 21 ++++----- 7 files changed, 69 insertions(+), 68 deletions(-) diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc index 6a8f1dafb1..bff2fa59a4 100644 --- a/test/vp9_quantize_test.cc +++ b/test/vp9_quantize_test.cc @@ -557,15 +557,15 @@ INSTANTIATE_TEST_SUITE_P( make_tuple(&QuantWrapper, &QuantWrapper, VPX_BITS_12, 16, false), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_8, 32, - false), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_10, - 32, false), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_12, - 32, false))); + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, + VPX_BITS_8, 32, false), + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, + VPX_BITS_10, 32, false), + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, + VPX_BITS_12, 32, false))); #else INSTANTIATE_TEST_SUITE_P( @@ -634,15 +634,15 @@ INSTANTIATE_TEST_SUITE_P( make_tuple(&Quant32x32Wrapper, &Quant32x32Wrapper, VPX_BITS_8, 32, false), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_8, 32, - false), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_10, - 32, false), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_12, - 32, false))); + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, + VPX_BITS_8, 32, false), + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, + VPX_BITS_10, 32, false), + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, + VPX_BITS_12, 32, false))); #else INSTANTIATE_TEST_SUITE_P( AVX2, VP9QuantizeTest, @@ -680,15 +680,15 @@ INSTANTIATE_TEST_SUITE_P( make_tuple(&Quant32x32Wrapper, &Quant32x32Wrapper, VPX_BITS_8, 32, false), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_8, 32, - false), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_10, - 32, false), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_12, - 32, false), + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, + VPX_BITS_8, 32, false), + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, + VPX_BITS_10, 32, false), + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, + VPX_BITS_12, 32, false), make_tuple(&QuantFPWrapper, &QuantFPWrapper, VPX_BITS_8, 16, true), make_tuple(&QuantFPWrapper, diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c index 4910dc20f5..6a5f628808 100644 --- a/vp9/encoder/vp9_encodemb.c +++ b/vp9/encoder/vp9_encodemb.c @@ -511,9 +511,8 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col, switch (tx_size) { case TX_32X32: highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); - vpx_highbd_quantize_b_32x32( - coeff, 1024, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, - dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); + vpx_highbd_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order->scan, scan_order->iscan); break; case TX_16X16: vpx_highbd_fdct16x16(src_diff, coeff, diff_stride); @@ -856,9 +855,8 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, vpx_highbd_subtract_block(32, 32, src_diff, diff_stride, src, src_stride, dst, dst_stride, xd->bd); highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); - vpx_highbd_quantize_b_32x32( - coeff, 1024, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, - dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); + vpx_highbd_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, + eob, scan_order->scan, scan_order->iscan); } if (args->enable_coeff_opt && !x->skip_recode) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; diff --git a/vpx_dsp/arm/highbd_quantize_neon.c b/vpx_dsp/arm/highbd_quantize_neon.c index b9f72a94c5..3b1fec3321 100644 --- a/vpx_dsp/arm/highbd_quantize_neon.c +++ b/vpx_dsp/arm/highbd_quantize_neon.c @@ -13,6 +13,7 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/arm/mem_neon.h" +#include "vp9/encoder/vp9_block.h" static VPX_FORCE_INLINE void highbd_calculate_dqcoeff_and_store( const int32x4_t dqcoeff_0, const int32x4_t dqcoeff_1, @@ -224,11 +225,9 @@ static VPX_FORCE_INLINE int16x8_t highbd_quantize_b_32x32_neon( } void vpx_highbd_quantize_b_32x32_neon( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { const int16x8_t neg_one = vdupq_n_s16(-1); uint16x8_t eob_max; int i; @@ -237,12 +236,13 @@ void vpx_highbd_quantize_b_32x32_neon( // High half has identical elements, but we can reconstruct it from the low // half by duplicating the 2nd element. So we only need to pass a 4x32-bit // vector - int32x4_t zbin = vrshrq_n_s32(vmovl_s16(vld1_s16(zbin_ptr)), 1); - int32x4_t round = vrshrq_n_s32(vmovl_s16(vld1_s16(round_ptr)), 1); + int32x4_t zbin = vrshrq_n_s32(vmovl_s16(vld1_s16(mb_plane->zbin)), 1); + int32x4_t round = vrshrq_n_s32(vmovl_s16(vld1_s16(mb_plane->round)), 1); // Extend the quant, quant_shift vectors to ones of 32-bit elements // scale to high-half, so we can use vqdmulhq_s32 - int32x4_t quant = vshlq_n_s32(vmovl_s16(vld1_s16(quant_ptr)), 15); - int32x4_t quant_shift = vshlq_n_s32(vmovl_s16(vld1_s16(quant_shift_ptr)), 16); + int32x4_t quant = vshlq_n_s32(vmovl_s16(vld1_s16(mb_plane->quant)), 15); + int32x4_t quant_shift = + vshlq_n_s32(vmovl_s16(vld1_s16(mb_plane->quant_shift)), 16); int32x4_t dequant = vmovl_s16(vld1_s16(dequant_ptr)); // Process first 8 values which include a dc component. @@ -300,8 +300,7 @@ void vpx_highbd_quantize_b_32x32_neon( vst1_lane_u16(eob_ptr, eob_max_2, 0); } #endif // __aarch64__ - // Need these here, else the compiler complains about mixing declarations and + // Need this here, else the compiler complains about mixing declarations and // code in C90 - (void)n_coeffs; (void)scan; } diff --git a/vpx_dsp/quantize.c b/vpx_dsp/quantize.c index 212db45c88..c4642812ad 100644 --- a/vpx_dsp/quantize.c +++ b/vpx_dsp/quantize.c @@ -272,14 +272,16 @@ void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, #if CONFIG_VP9_HIGHBITDEPTH void vpx_highbd_quantize_b_32x32_c( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { - const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], 1), - ROUND_POWER_OF_TWO(zbin_ptr[1], 1) }; + const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { + const intptr_t n_coeffs = 32 * 32; + const int zbins[2] = { ROUND_POWER_OF_TWO(mb_plane->zbin[0], 1), + ROUND_POWER_OF_TWO(mb_plane->zbin[1], 1) }; const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; + const int16_t *round_ptr = mb_plane->round; + const int16_t *quant_ptr = mb_plane->quant; + const int16_t *quant_shift_ptr = mb_plane->quant_shift; int idx = 0; int idx_arr[1024]; diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 2a01ec1b54..ab86b9cc7c 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -731,7 +731,7 @@ () add_proto qw/void vpx_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vpx_highbd_quantize_b neon sse2 avx2/; - add_proto qw/void vpx_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + add_proto qw/void vpx_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, const struct macroblock_plane * const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vpx_highbd_quantize_b_32x32 neon sse2 avx2/; } # CONFIG_VP9_HIGHBITDEPTH } # CONFIG_VP9_ENCODER diff --git a/vpx_dsp/x86/highbd_quantize_intrin_avx2.c b/vpx_dsp/x86/highbd_quantize_intrin_avx2.c index 8edddd637f..6041d7289a 100644 --- a/vpx_dsp/x86/highbd_quantize_intrin_avx2.c +++ b/vpx_dsp/x86/highbd_quantize_intrin_avx2.c @@ -11,6 +11,7 @@ #include #include "./vpx_dsp_rtcd.h" +#include "vp9/encoder/vp9_block.h" static VPX_FORCE_INLINE void init_one_qp(const __m128i *p, __m256i *qp) { const __m128i sign = _mm_srai_epi16(*p, 15); @@ -222,17 +223,17 @@ static VPX_FORCE_INLINE void quantize_b_32x32( } void vpx_highbd_quantize_b_32x32_avx2( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { const unsigned int step = 8; + intptr_t n_coeffs = 32 * 32; __m256i eob = _mm256_setzero_si256(); __m256i qp[5]; (void)scan; - init_qp(zbin_ptr, round_ptr, quant_ptr, dequant_ptr, quant_shift_ptr, qp, 1); + init_qp(mb_plane->zbin, mb_plane->round, mb_plane->quant, dequant_ptr, + mb_plane->quant_shift, qp, 1); quantize_b_32x32(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob); diff --git a/vpx_dsp/x86/highbd_quantize_intrin_sse2.c b/vpx_dsp/x86/highbd_quantize_intrin_sse2.c index ae1981a834..6a8f42b8a4 100644 --- a/vpx_dsp/x86/highbd_quantize_intrin_sse2.c +++ b/vpx_dsp/x86/highbd_quantize_intrin_sse2.c @@ -15,6 +15,7 @@ #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_mem/vpx_mem.h" #include "vpx_ports/mem.h" +#include "vp9/encoder/vp9_block.h" #if CONFIG_VP9_HIGHBITDEPTH void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count, @@ -93,18 +94,17 @@ void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count, } void vpx_highbd_quantize_b_32x32_sse2( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { __m128i zbins[2]; __m128i nzbins[2]; int idx = 0; int idx_arr[1024]; int i, eob = 0; - const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 1); - const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 1); + const intptr_t n_coeffs = 32 * 32; + const int zbin0_tmp = ROUND_POWER_OF_TWO(mb_plane->zbin[0], 1); + const int zbin1_tmp = ROUND_POWER_OF_TWO(mb_plane->zbin[1], 1); (void)scan; zbins[0] = _mm_set_epi32(zbin1_tmp, zbin1_tmp, zbin1_tmp, zbin0_tmp); @@ -140,10 +140,11 @@ void vpx_highbd_quantize_b_32x32_sse2( const int coeff = coeff_ptr[rc]; const int coeff_sign = (coeff >> 31); const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - const int64_t tmp1 = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); - const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; + const int64_t tmp1 = + abs_coeff + ROUND_POWER_OF_TWO(mb_plane->round[rc != 0], 1); + const int64_t tmp2 = ((tmp1 * mb_plane->quant[rc != 0]) >> 16) + tmp1; const uint32_t abs_qcoeff = - (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15); + (uint32_t)((tmp2 * mb_plane->quant_shift[rc != 0]) >> 15); qcoeff_ptr[rc] = (int)(abs_qcoeff ^ (uint32_t)coeff_sign) - coeff_sign; dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob; From c6da2329b90fc39ea48b99aff0b0468c1fdffa6c Mon Sep 17 00:00:00 2001 From: Salome Thirot Date: Thu, 9 Mar 2023 21:04:07 +0000 Subject: [PATCH 0680/1000] Add tests for vpx_highbd_minmax_8x8_c Write tests for vpx_highbd_minmax_8x8_c, and fix initial value of min in vpx_highbd_minmax_8x8_c. Change-Id: I1f127df945bbb8c7d373c5430ff5f94f28575968 --- test/minmax_test.cc | 109 ++++++++++++++++++++++++++++++++++++++++++++ vpx_dsp/avg.c | 2 +- 2 files changed, 110 insertions(+), 1 deletion(-) diff --git a/test/minmax_test.cc b/test/minmax_test.cc index 12327bc188..663b359c5d 100644 --- a/test/minmax_test.cc +++ b/test/minmax_test.cc @@ -15,6 +15,7 @@ #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" +#include "vpx_mem/vpx_mem.h" #include "test/acm_random.h" #include "test/register_state_check.h" @@ -115,7 +116,115 @@ TEST_P(MinMaxTest, CompareReferenceAndVaryStride) { } } +#if CONFIG_VP9_HIGHBITDEPTH + +using HBDMinMaxTest = MinMaxTest; + +void highbd_reference_minmax(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int *min_ret, int *max_ret) { + int min = 65535; + int max = 0; + const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a); + const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b); + for (int i = 0; i < 8; i++) { + for (int j = 0; j < 8; j++) { + const int diff = abs(a_ptr[i * a_stride + j] - b_ptr[i * b_stride + j]); + if (min > diff) min = diff; + if (max < diff) max = diff; + } + } + + *min_ret = min; + *max_ret = max; +} + +TEST_P(HBDMinMaxTest, MinValue) { + uint8_t *a = CONVERT_TO_BYTEPTR( + reinterpret_cast(vpx_malloc(64 * sizeof(uint16_t)))); + uint8_t *b = CONVERT_TO_BYTEPTR( + reinterpret_cast(vpx_malloc(64 * sizeof(uint16_t)))); + for (int i = 0; i < 64; i++) { + vpx_memset16(CONVERT_TO_SHORTPTR(a), 0, 64); + vpx_memset16(CONVERT_TO_SHORTPTR(b), 65535, 64); + CONVERT_TO_SHORTPTR(b)[i] = i; // Set a minimum difference of i. + + int min, max; + ASM_REGISTER_STATE_CHECK(mm_func_(a, 8, b, 8, &min, &max)); + EXPECT_EQ(65535, max); + EXPECT_EQ(i, min); + } + vpx_free(CONVERT_TO_SHORTPTR(a)); + vpx_free(CONVERT_TO_SHORTPTR(b)); +} + +TEST_P(HBDMinMaxTest, MaxValue) { + uint8_t *a = CONVERT_TO_BYTEPTR( + reinterpret_cast(vpx_malloc(64 * sizeof(uint16_t)))); + uint8_t *b = CONVERT_TO_BYTEPTR( + reinterpret_cast(vpx_malloc(64 * sizeof(uint16_t)))); + for (int i = 0; i < 64; i++) { + vpx_memset16(CONVERT_TO_SHORTPTR(a), 0, 64); + vpx_memset16(CONVERT_TO_SHORTPTR(b), 0, 64); + CONVERT_TO_SHORTPTR(b)[i] = i; // Set a minimum difference of i. + + int min, max; + ASM_REGISTER_STATE_CHECK(mm_func_(a, 8, b, 8, &min, &max)); + EXPECT_EQ(i, max); + EXPECT_EQ(0, min); + } + vpx_free(CONVERT_TO_SHORTPTR(a)); + vpx_free(CONVERT_TO_SHORTPTR(b)); +} + +TEST_P(HBDMinMaxTest, CompareReference) { + uint8_t *a = CONVERT_TO_BYTEPTR( + reinterpret_cast(vpx_malloc(64 * sizeof(uint16_t)))); + uint8_t *b = CONVERT_TO_BYTEPTR( + reinterpret_cast(vpx_malloc(64 * sizeof(uint16_t)))); + for (int j = 0; j < 64; j++) { + CONVERT_TO_SHORTPTR(a)[j] = rnd_.Rand16(); + CONVERT_TO_SHORTPTR(b)[j] = rnd_.Rand16(); + } + + int min_ref, max_ref, min, max; + highbd_reference_minmax(a, 8, b, 8, &min_ref, &max_ref); + ASM_REGISTER_STATE_CHECK(mm_func_(a, 8, b, 8, &min, &max)); + vpx_free(CONVERT_TO_SHORTPTR(a)); + vpx_free(CONVERT_TO_SHORTPTR(b)); + EXPECT_EQ(max_ref, max); + EXPECT_EQ(min_ref, min); +} + +TEST_P(HBDMinMaxTest, CompareReferenceAndVaryStride) { + uint8_t *a = CONVERT_TO_BYTEPTR( + reinterpret_cast(vpx_malloc((8 * 64) * sizeof(uint16_t)))); + uint8_t *b = CONVERT_TO_BYTEPTR( + reinterpret_cast(vpx_malloc((8 * 64) * sizeof(uint16_t)))); + for (int i = 0; i < 8 * 64; i++) { + CONVERT_TO_SHORTPTR(a)[i] = rnd_.Rand16(); + CONVERT_TO_SHORTPTR(b)[i] = rnd_.Rand16(); + } + for (int a_stride = 8; a_stride <= 64; a_stride += 8) { + for (int b_stride = 8; b_stride <= 64; b_stride += 8) { + int min_ref, max_ref, min, max; + highbd_reference_minmax(a, a_stride, b, b_stride, &min_ref, &max_ref); + ASM_REGISTER_STATE_CHECK(mm_func_(a, a_stride, b, b_stride, &min, &max)); + EXPECT_EQ(max_ref, max) + << "when a_stride = " << a_stride << " and b_stride = " << b_stride; + EXPECT_EQ(min_ref, min) + << "when a_stride = " << a_stride << " and b_stride = " << b_stride; + } + } + vpx_free(CONVERT_TO_SHORTPTR(a)); + vpx_free(CONVERT_TO_SHORTPTR(b)); +} +#endif + INSTANTIATE_TEST_SUITE_P(C, MinMaxTest, ::testing::Values(&vpx_minmax_8x8_c)); +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_SUITE_P(C, HBDMinMaxTest, + ::testing::Values(&vpx_highbd_minmax_8x8_c)); +#endif #if HAVE_SSE2 INSTANTIATE_TEST_SUITE_P(SSE2, MinMaxTest, diff --git a/vpx_dsp/avg.c b/vpx_dsp/avg.c index 9540154074..391e9eb144 100644 --- a/vpx_dsp/avg.c +++ b/vpx_dsp/avg.c @@ -428,7 +428,7 @@ void vpx_highbd_minmax_8x8_c(const uint8_t *s8, int p, const uint8_t *d8, int i, j; const uint16_t *s = CONVERT_TO_SHORTPTR(s8); const uint16_t *d = CONVERT_TO_SHORTPTR(d8); - *min = 255; + *min = 65535; *max = 0; for (i = 0; i < 8; ++i, s += p, d += dp) { for (j = 0; j < 8; ++j) { From fff4e76b55900767083434248c67c3e041bab97b Mon Sep 17 00:00:00 2001 From: Salome Thirot Date: Thu, 9 Mar 2023 13:58:16 +0000 Subject: [PATCH 0681/1000] Add Neon implementation of vpx_highbd_minmax_8x8_c Add Neon implementation of vpx_highbd_minmax_8x8_c as well as the corresponding tests. Change-Id: I5d9444a239fb1baa53634c1bdb5292b44067d90c --- test/minmax_test.cc | 4 ++ vpx_dsp/arm/highbd_avg_neon.c | 77 +++++++++++++++++++++++++++++++++++ vpx_dsp/vpx_dsp_rtcd_defs.pl | 1 + 3 files changed, 82 insertions(+) diff --git a/test/minmax_test.cc b/test/minmax_test.cc index 663b359c5d..e710af6991 100644 --- a/test/minmax_test.cc +++ b/test/minmax_test.cc @@ -234,6 +234,10 @@ INSTANTIATE_TEST_SUITE_P(SSE2, MinMaxTest, #if HAVE_NEON INSTANTIATE_TEST_SUITE_P(NEON, MinMaxTest, ::testing::Values(&vpx_minmax_8x8_neon)); +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_SUITE_P(NEON, HBDMinMaxTest, + ::testing::Values(&vpx_highbd_minmax_8x8_neon)); +#endif #endif #if HAVE_MSA diff --git a/vpx_dsp/arm/highbd_avg_neon.c b/vpx_dsp/arm/highbd_avg_neon.c index 3ba58b8005..b84a7875d4 100644 --- a/vpx_dsp/arm/highbd_avg_neon.c +++ b/vpx_dsp/arm/highbd_avg_neon.c @@ -38,3 +38,80 @@ int vpx_highbd_satd_neon(const tran_low_t *coeff, int length) { return (int)horizontal_add_int64x2(vaddq_s64(sum_s64[0], sum_s64[1])); } + +void vpx_highbd_minmax_8x8_neon(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, int *min, + int *max) { + const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a); + const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b); + + const uint16x8_t a0 = vld1q_u16(a_ptr + 0 * a_stride); + const uint16x8_t a1 = vld1q_u16(a_ptr + 1 * a_stride); + const uint16x8_t a2 = vld1q_u16(a_ptr + 2 * a_stride); + const uint16x8_t a3 = vld1q_u16(a_ptr + 3 * a_stride); + const uint16x8_t a4 = vld1q_u16(a_ptr + 4 * a_stride); + const uint16x8_t a5 = vld1q_u16(a_ptr + 5 * a_stride); + const uint16x8_t a6 = vld1q_u16(a_ptr + 6 * a_stride); + const uint16x8_t a7 = vld1q_u16(a_ptr + 7 * a_stride); + + const uint16x8_t b0 = vld1q_u16(b_ptr + 0 * b_stride); + const uint16x8_t b1 = vld1q_u16(b_ptr + 1 * b_stride); + const uint16x8_t b2 = vld1q_u16(b_ptr + 2 * b_stride); + const uint16x8_t b3 = vld1q_u16(b_ptr + 3 * b_stride); + const uint16x8_t b4 = vld1q_u16(b_ptr + 4 * b_stride); + const uint16x8_t b5 = vld1q_u16(b_ptr + 5 * b_stride); + const uint16x8_t b6 = vld1q_u16(b_ptr + 6 * b_stride); + const uint16x8_t b7 = vld1q_u16(b_ptr + 7 * b_stride); + + const uint16x8_t abs_diff0 = vabdq_u16(a0, b0); + const uint16x8_t abs_diff1 = vabdq_u16(a1, b1); + const uint16x8_t abs_diff2 = vabdq_u16(a2, b2); + const uint16x8_t abs_diff3 = vabdq_u16(a3, b3); + const uint16x8_t abs_diff4 = vabdq_u16(a4, b4); + const uint16x8_t abs_diff5 = vabdq_u16(a5, b5); + const uint16x8_t abs_diff6 = vabdq_u16(a6, b6); + const uint16x8_t abs_diff7 = vabdq_u16(a7, b7); + + const uint16x8_t max01 = vmaxq_u16(abs_diff0, abs_diff1); + const uint16x8_t max23 = vmaxq_u16(abs_diff2, abs_diff3); + const uint16x8_t max45 = vmaxq_u16(abs_diff4, abs_diff5); + const uint16x8_t max67 = vmaxq_u16(abs_diff6, abs_diff7); + + const uint16x8_t max0123 = vmaxq_u16(max01, max23); + const uint16x8_t max4567 = vmaxq_u16(max45, max67); + const uint16x8_t max07 = vmaxq_u16(max0123, max4567); + + const uint16x8_t min01 = vminq_u16(abs_diff0, abs_diff1); + const uint16x8_t min23 = vminq_u16(abs_diff2, abs_diff3); + const uint16x8_t min45 = vminq_u16(abs_diff4, abs_diff5); + const uint16x8_t min67 = vminq_u16(abs_diff6, abs_diff7); + + const uint16x8_t min0123 = vminq_u16(min01, min23); + const uint16x8_t min4567 = vminq_u16(min45, min67); + const uint16x8_t min07 = vminq_u16(min0123, min4567); + +#if defined(__aarch64__) + *min = *max = 0; // Clear high bits + *((uint16_t *)max) = vmaxvq_u16(max07); + *((uint16_t *)min) = vminvq_u16(min07); +#else + // Split into 64-bit vectors and execute pairwise min/max. + uint16x4_t ab_max = vmax_u16(vget_high_u16(max07), vget_low_u16(max07)); + uint16x4_t ab_min = vmin_u16(vget_high_u16(min07), vget_low_u16(min07)); + + // Enough runs of vpmax/min propagate the max/min values to every position. + ab_max = vpmax_u16(ab_max, ab_max); + ab_min = vpmin_u16(ab_min, ab_min); + + ab_max = vpmax_u16(ab_max, ab_max); + ab_min = vpmin_u16(ab_min, ab_min); + + ab_max = vpmax_u16(ab_max, ab_max); + ab_min = vpmin_u16(ab_min, ab_min); + + *min = *max = 0; // Clear high bits + // Store directly to avoid costly neon->gpr transfer. + vst1_lane_u16((uint16_t *)max, ab_max, 0); + vst1_lane_u16((uint16_t *)min, ab_min, 0); +#endif +} diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 2a01ec1b54..2780333e88 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -1000,6 +1000,7 @@ () specialize qw/vpx_highbd_avg_4x4 sse2/; add_proto qw/void vpx_highbd_minmax_8x8/, "const uint8_t *s8, int p, const uint8_t *d8, int dp, int *min, int *max"; + specialize qw/vpx_highbd_minmax_8x8 neon/; add_proto qw/unsigned int vpx_highbd_sad64x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; specialize qw/vpx_highbd_sad64x64_avg sse2 neon avx2/; From f23f27bb807794fe5773d0343e6281527ad5f640 Mon Sep 17 00:00:00 2001 From: Johann Date: Mon, 14 Nov 2022 16:47:33 +0900 Subject: [PATCH 0682/1000] Reland "quantize: use scan_order instead of passing scan/iscan" This is a reland of commit 14fc40040ff30486c45111056db44ee18590a24a Parent change fixed in crrev.com/c/webm/libvpx/+/4305500 Original change's description: > quantize: use scan_order instead of passing scan/iscan > > further reduces the arguments for the 32x32. This will be applied to the base > version as well. > > Change-Id: I25a162b5248b14af53d9e20c6a7fa2a77028a6d1 Change-Id: I2a7654558eaddd68bd09336bf317b297f18559d2 --- test/vp9_quantize_test.cc | 42 +++++++++++------------ vp9/common/vp9_scan.h | 2 +- vp9/encoder/vp9_encodemb.c | 8 ++--- vpx_dsp/arm/highbd_quantize_neon.c | 7 ++-- vpx_dsp/arm/quantize_neon.c | 7 ++-- vpx_dsp/quantize.c | 9 ++--- vpx_dsp/vpx_dsp_rtcd_defs.pl | 5 +-- vpx_dsp/x86/highbd_quantize_intrin_avx2.c | 5 +-- vpx_dsp/x86/highbd_quantize_intrin_sse2.c | 5 +-- vpx_dsp/x86/quantize_avx.c | 7 ++-- vpx_dsp/x86/quantize_avx2.c | 5 +-- vpx_dsp/x86/quantize_ssse3.c | 6 ++-- 12 files changed, 55 insertions(+), 53 deletions(-) diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc index bff2fa59a4..e9b17d5eb8 100644 --- a/test/vp9_quantize_test.cc +++ b/test/vp9_quantize_test.cc @@ -42,7 +42,7 @@ typedef void (*QuantizeFunc)(const tran_low_t *coeff, intptr_t count, const macroblock_plane *const mb_plane, tran_low_t *qcoeff, tran_low_t *dqcoeff, const int16_t *dequant, uint16_t *eob, - const int16_t *scan, const int16_t *iscan); + const struct scan_order *const scan_order); typedef std::tuple QuantizeParam; @@ -60,9 +60,10 @@ template void QuantWrapper(const tran_low_t *coeff, intptr_t count, const macroblock_plane *const mb_plane, tran_low_t *qcoeff, tran_low_t *dqcoeff, const int16_t *dequant, uint16_t *eob, - const int16_t *scan, const int16_t *iscan) { + const struct scan_order *const scan_order) { fn(coeff, count, mb_plane->zbin, mb_plane->round, mb_plane->quant, - mb_plane->quant_shift, qcoeff, dqcoeff, dequant, eob, scan, iscan); + mb_plane->quant_shift, qcoeff, dqcoeff, dequant, eob, scan_order->scan, + scan_order->iscan); } // Wrapper for 32x32 version which does not use count @@ -70,16 +71,16 @@ typedef void (*Quantize32x32Func)(const tran_low_t *coeff, const macroblock_plane *const mb_plane, tran_low_t *qcoeff, tran_low_t *dqcoeff, const int16_t *dequant, uint16_t *eob, - const int16_t *scan, const int16_t *iscan); + const struct scan_order *const scan_order); template void Quant32x32Wrapper(const tran_low_t *coeff, intptr_t count, const macroblock_plane *const mb_plane, tran_low_t *qcoeff, tran_low_t *dqcoeff, const int16_t *dequant, uint16_t *eob, - const int16_t *scan, const int16_t *iscan) { + const struct scan_order *const scan_order) { (void)count; - fn(coeff, mb_plane, qcoeff, dqcoeff, dequant, eob, scan, iscan); + fn(coeff, mb_plane, qcoeff, dqcoeff, dequant, eob, scan_order); } // Wrapper for FP version which does not use zbin or quant_shift. @@ -93,9 +94,9 @@ template void QuantFPWrapper(const tran_low_t *coeff, intptr_t count, const macroblock_plane *const mb_plane, tran_low_t *qcoeff, tran_low_t *dqcoeff, const int16_t *dequant, uint16_t *eob, - const int16_t *scan, const int16_t *iscan) { + const struct scan_order *const scan_order) { fn(coeff, count, mb_plane->round_fp, mb_plane->quant_fp, qcoeff, dqcoeff, - dequant, eob, scan, iscan); + dequant, eob, scan_order->scan, scan_order->iscan); } void GenerateHelperArrays(ACMRandom *rnd, int16_t *zbin, int16_t *round, @@ -232,8 +233,7 @@ class VP9QuantizeTest : public VP9QuantizeBase, void VP9QuantizeTest::Run() { quantize_op_(coeff_.TopLeftPixel(), count_, mb_plane_, qcoeff_.TopLeftPixel(), - dqcoeff_.TopLeftPixel(), dequant_ptr_, &eob_, scan_->scan, - scan_->iscan); + dqcoeff_.TopLeftPixel(), dequant_ptr_, &eob_, scan_); } void VP9QuantizeTest::Speed(bool is_median) { @@ -306,7 +306,7 @@ void VP9QuantizeTest::Speed(bool is_median) { ref_quantize_op_(coeff_.TopLeftPixel(), count_, mb_plane_, ref_qcoeff.TopLeftPixel(), ref_dqcoeff.TopLeftPixel(), dequant_ptr_, &ref_eob, - scan_->scan, scan_->iscan); + scan_); } vpx_usec_timer_mark(&timer); @@ -314,7 +314,7 @@ void VP9QuantizeTest::Speed(bool is_median) { for (int n = 0; n < kNumTests; ++n) { quantize_op_(coeff_.TopLeftPixel(), count_, mb_plane_, qcoeff_.TopLeftPixel(), dqcoeff_.TopLeftPixel(), - dequant_ptr_, &eob_, scan_->scan, scan_->iscan); + dequant_ptr_, &eob_, scan_); } vpx_usec_timer_mark(&simd_timer); @@ -455,12 +455,11 @@ TEST_P(VP9QuantizeTest, OperationCheck) { quant_fp_ptr_); ref_quantize_op_(coeff_.TopLeftPixel(), count_, mb_plane_, ref_qcoeff.TopLeftPixel(), ref_dqcoeff.TopLeftPixel(), - dequant_ptr_, &ref_eob, scan_->scan, scan_->iscan); + dequant_ptr_, &ref_eob, scan_); - ASM_REGISTER_STATE_CHECK(quantize_op_(coeff_.TopLeftPixel(), count_, - mb_plane_, qcoeff_.TopLeftPixel(), - dqcoeff_.TopLeftPixel(), dequant_ptr_, - &eob_, scan_->scan, scan_->iscan)); + ASM_REGISTER_STATE_CHECK(quantize_op_( + coeff_.TopLeftPixel(), count_, mb_plane_, qcoeff_.TopLeftPixel(), + dqcoeff_.TopLeftPixel(), dequant_ptr_, &eob_, scan_)); EXPECT_TRUE(qcoeff_.CheckValues(ref_qcoeff)); EXPECT_TRUE(dqcoeff_.CheckValues(ref_dqcoeff)); @@ -512,12 +511,11 @@ TEST_P(VP9QuantizeTest, EOBCheck) { quant_fp_ptr_); ref_quantize_op_(coeff_.TopLeftPixel(), count_, mb_plane_, ref_qcoeff.TopLeftPixel(), ref_dqcoeff.TopLeftPixel(), - dequant_ptr_, &ref_eob, scan_->scan, scan_->iscan); + dequant_ptr_, &ref_eob, scan_); - ASM_REGISTER_STATE_CHECK(quantize_op_(coeff_.TopLeftPixel(), count_, - mb_plane_, qcoeff_.TopLeftPixel(), - dqcoeff_.TopLeftPixel(), dequant_ptr_, - &eob_, scan_->scan, scan_->iscan)); + ASM_REGISTER_STATE_CHECK(quantize_op_( + coeff_.TopLeftPixel(), count_, mb_plane_, qcoeff_.TopLeftPixel(), + dqcoeff_.TopLeftPixel(), dequant_ptr_, &eob_, scan_)); EXPECT_TRUE(qcoeff_.CheckValues(ref_qcoeff)); EXPECT_TRUE(dqcoeff_.CheckValues(ref_dqcoeff)); diff --git a/vp9/common/vp9_scan.h b/vp9/common/vp9_scan.h index 72a9a5ec47..efa0e23365 100644 --- a/vp9/common/vp9_scan.h +++ b/vp9/common/vp9_scan.h @@ -23,7 +23,7 @@ extern "C" { #define MAX_NEIGHBORS 2 -typedef struct { +typedef struct scan_order { const int16_t *scan; const int16_t *iscan; const int16_t *neighbors; diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c index 6a5f628808..515c7a9031 100644 --- a/vp9/encoder/vp9_encodemb.c +++ b/vp9/encoder/vp9_encodemb.c @@ -512,7 +512,7 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col, case TX_32X32: highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); vpx_highbd_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, eob, - scan_order->scan, scan_order->iscan); + scan_order); break; case TX_16X16: vpx_highbd_fdct16x16(src_diff, coeff, diff_stride); @@ -542,7 +542,7 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col, case TX_32X32: fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); vpx_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, eob, - scan_order->scan, scan_order->iscan); + scan_order); break; case TX_16X16: vpx_fdct16x16(src_diff, coeff, diff_stride); @@ -856,7 +856,7 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, src_stride, dst, dst_stride, xd->bd); highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); vpx_highbd_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, - eob, scan_order->scan, scan_order->iscan); + eob, scan_order); } if (args->enable_coeff_opt && !x->skip_recode) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; @@ -946,7 +946,7 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, dst_stride); fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); vpx_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, eob, - scan_order->scan, scan_order->iscan); + scan_order); } if (args->enable_coeff_opt && !x->skip_recode) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; diff --git a/vpx_dsp/arm/highbd_quantize_neon.c b/vpx_dsp/arm/highbd_quantize_neon.c index 3b1fec3321..5a40f1284e 100644 --- a/vpx_dsp/arm/highbd_quantize_neon.c +++ b/vpx_dsp/arm/highbd_quantize_neon.c @@ -13,6 +13,7 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/arm/mem_neon.h" +#include "vp9/common/vp9_scan.h" #include "vp9/encoder/vp9_block.h" static VPX_FORCE_INLINE void highbd_calculate_dqcoeff_and_store( @@ -227,10 +228,11 @@ static VPX_FORCE_INLINE int16x8_t highbd_quantize_b_32x32_neon( void vpx_highbd_quantize_b_32x32_neon( const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { + uint16_t *eob_ptr, const struct scan_order *const scan_order) { const int16x8_t neg_one = vdupq_n_s16(-1); uint16x8_t eob_max; int i; + const int16_t *iscan = scan_order->iscan; // Only the first element of each vector is DC. // High half has identical elements, but we can reconstruct it from the low @@ -300,7 +302,4 @@ void vpx_highbd_quantize_b_32x32_neon( vst1_lane_u16(eob_ptr, eob_max_2, 0); } #endif // __aarch64__ - // Need this here, else the compiler complains about mixing declarations and - // code in C90 - (void)scan; } diff --git a/vpx_dsp/arm/quantize_neon.c b/vpx_dsp/arm/quantize_neon.c index e81738a7bb..84b6d8c79f 100644 --- a/vpx_dsp/arm/quantize_neon.c +++ b/vpx_dsp/arm/quantize_neon.c @@ -14,6 +14,7 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/arm/mem_neon.h" +#include "vp9/common/vp9_scan.h" #include "vp9/encoder/vp9_block.h" static INLINE void calculate_dqcoeff_and_store(const int16x8_t qcoeff, @@ -218,10 +219,11 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct scan_order *const scan_order) { const int16x8_t neg_one = vdupq_n_s16(-1); uint16x8_t eob_max; int i; + const int16_t *iscan = scan_order->iscan; // Only the first element of each vector is DC. int16x8_t zbin = vrshrq_n_s16(vld1q_s16(mb_plane->zbin), 1); @@ -285,7 +287,4 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, vst1_lane_u16(eob_ptr, eob_max_2, 0); } #endif // __aarch64__ - // Need these here, else the compiler complains about mixing declarations and - // code in C90 - (void)scan; } diff --git a/vpx_dsp/quantize.c b/vpx_dsp/quantize.c index c4642812ad..f51bf253e7 100644 --- a/vpx_dsp/quantize.c +++ b/vpx_dsp/quantize.c @@ -14,6 +14,7 @@ #include "vpx_dsp/quantize.h" #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_mem/vpx_mem.h" +#include "vp9/common/vp9_scan.h" #include "vp9/encoder/vp9_block.h" void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, @@ -213,7 +214,7 @@ void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct scan_order *const scan_order) { const int n_coeffs = 32 * 32; const int zbins[2] = { ROUND_POWER_OF_TWO(mb_plane->zbin[0], 1), ROUND_POWER_OF_TWO(mb_plane->zbin[1], 1) }; @@ -221,11 +222,11 @@ void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, const int16_t *round_ptr = mb_plane->round; const int16_t *quant_ptr = mb_plane->quant; const int16_t *quant_shift_ptr = mb_plane->quant_shift; + const int16_t *scan = scan_order->scan; int idx = 0; int idx_arr[32 * 32 /* n_coeffs */]; int i, eob = -1; - (void)iscan; memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); @@ -274,7 +275,7 @@ void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, void vpx_highbd_quantize_b_32x32_c( const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { + uint16_t *eob_ptr, const struct scan_order *const scan_order) { const intptr_t n_coeffs = 32 * 32; const int zbins[2] = { ROUND_POWER_OF_TWO(mb_plane->zbin[0], 1), ROUND_POWER_OF_TWO(mb_plane->zbin[1], 1) }; @@ -282,11 +283,11 @@ void vpx_highbd_quantize_b_32x32_c( const int16_t *round_ptr = mb_plane->round; const int16_t *quant_ptr = mb_plane->quant; const int16_t *quant_shift_ptr = mb_plane->quant_shift; + const int16_t *scan = scan_order->scan; int idx = 0; int idx_arr[1024]; int i, eob = -1; - (void)iscan; memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 9853ba54a0..7cd3a0be89 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -19,6 +19,7 @@ () #include "vpx_dsp/vpx_filter.h" #if CONFIG_VP9_ENCODER struct macroblock_plane; + struct scan_order; #endif EOF @@ -724,14 +725,14 @@ () add_proto qw/void vpx_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vpx_quantize_b neon sse2 ssse3 avx avx2 vsx lsx/; - add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, const struct macroblock_plane * const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, const struct macroblock_plane * const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct scan_order *const scan_order"; specialize qw/vpx_quantize_b_32x32 neon ssse3 avx avx2 vsx lsx/; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vpx_highbd_quantize_b neon sse2 avx2/; - add_proto qw/void vpx_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, const struct macroblock_plane * const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + add_proto qw/void vpx_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, const struct macroblock_plane * const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct scan_order *const scan_order"; specialize qw/vpx_highbd_quantize_b_32x32 neon sse2 avx2/; } # CONFIG_VP9_HIGHBITDEPTH } # CONFIG_VP9_ENCODER diff --git a/vpx_dsp/x86/highbd_quantize_intrin_avx2.c b/vpx_dsp/x86/highbd_quantize_intrin_avx2.c index 6041d7289a..bfd7b2e23e 100644 --- a/vpx_dsp/x86/highbd_quantize_intrin_avx2.c +++ b/vpx_dsp/x86/highbd_quantize_intrin_avx2.c @@ -11,6 +11,7 @@ #include #include "./vpx_dsp_rtcd.h" +#include "vp9/common/vp9_scan.h" #include "vp9/encoder/vp9_block.h" static VPX_FORCE_INLINE void init_one_qp(const __m128i *p, __m256i *qp) { @@ -225,12 +226,12 @@ static VPX_FORCE_INLINE void quantize_b_32x32( void vpx_highbd_quantize_b_32x32_avx2( const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { + uint16_t *eob_ptr, const struct scan_order *const scan_order) { const unsigned int step = 8; intptr_t n_coeffs = 32 * 32; + const int16_t *iscan = scan_order->iscan; __m256i eob = _mm256_setzero_si256(); __m256i qp[5]; - (void)scan; init_qp(mb_plane->zbin, mb_plane->round, mb_plane->quant, dequant_ptr, mb_plane->quant_shift, qp, 1); diff --git a/vpx_dsp/x86/highbd_quantize_intrin_sse2.c b/vpx_dsp/x86/highbd_quantize_intrin_sse2.c index 6a8f42b8a4..58d5a3a5ff 100644 --- a/vpx_dsp/x86/highbd_quantize_intrin_sse2.c +++ b/vpx_dsp/x86/highbd_quantize_intrin_sse2.c @@ -15,6 +15,7 @@ #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_mem/vpx_mem.h" #include "vpx_ports/mem.h" +#include "vp9/common/vp9_scan.h" #include "vp9/encoder/vp9_block.h" #if CONFIG_VP9_HIGHBITDEPTH @@ -96,16 +97,16 @@ void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count, void vpx_highbd_quantize_b_32x32_sse2( const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { + uint16_t *eob_ptr, const struct scan_order *const scan_order) { __m128i zbins[2]; __m128i nzbins[2]; int idx = 0; int idx_arr[1024]; int i, eob = 0; const intptr_t n_coeffs = 32 * 32; + const int16_t *iscan = scan_order->iscan; const int zbin0_tmp = ROUND_POWER_OF_TWO(mb_plane->zbin[0], 1); const int zbin1_tmp = ROUND_POWER_OF_TWO(mb_plane->zbin[1], 1); - (void)scan; zbins[0] = _mm_set_epi32(zbin1_tmp, zbin1_tmp, zbin1_tmp, zbin0_tmp); zbins[1] = _mm_set1_epi32(zbin1_tmp); diff --git a/vpx_dsp/x86/quantize_avx.c b/vpx_dsp/x86/quantize_avx.c index d52f6c6644..d05a937be1 100644 --- a/vpx_dsp/x86/quantize_avx.c +++ b/vpx_dsp/x86/quantize_avx.c @@ -19,6 +19,8 @@ #include "vpx_dsp/x86/bitdepth_conversion_sse2.h" #include "vpx_dsp/x86/quantize_sse2.h" #include "vpx_dsp/x86/quantize_ssse3.h" +#include "vp9/common/vp9_scan.h" +#include "vp9/encoder/vp9_block.h" void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, @@ -144,10 +146,11 @@ void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct scan_order *const scan_order) { const __m128i zero = _mm_setzero_si128(); const __m256i big_zero = _mm256_setzero_si256(); int index; + const int16_t *iscan = scan_order->iscan; __m128i zbin, round, quant, dequant, shift; __m128i coeff0, coeff1; @@ -156,8 +159,6 @@ void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, __m128i all_zero; __m128i eob = zero, eob0; - (void)scan; - load_b_values32x32(mb_plane, &zbin, &round, &quant, dequant_ptr, &dequant, &shift); diff --git a/vpx_dsp/x86/quantize_avx2.c b/vpx_dsp/x86/quantize_avx2.c index a8412c5b8e..1c82542ae6 100644 --- a/vpx_dsp/x86/quantize_avx2.c +++ b/vpx_dsp/x86/quantize_avx2.c @@ -13,6 +13,7 @@ #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" +#include "vp9/common/vp9_scan.h" #include "vp9/encoder/vp9_block.h" static VPX_FORCE_INLINE void load_b_values_avx2( @@ -255,11 +256,11 @@ void vpx_quantize_b_32x32_avx2(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct scan_order *const scan_order) { __m256i v_zbin, v_round, v_quant, v_dequant, v_quant_shift; __m256i v_eobmax = _mm256_setzero_si256(); intptr_t count; - (void)scan; + const int16_t *iscan = scan_order->iscan; load_b_values_avx2(mb_plane->zbin, &v_zbin, mb_plane->round, &v_round, mb_plane->quant, &v_quant, dequant_ptr, &v_dequant, diff --git a/vpx_dsp/x86/quantize_ssse3.c b/vpx_dsp/x86/quantize_ssse3.c index 6fe54d7d98..6401b2865d 100644 --- a/vpx_dsp/x86/quantize_ssse3.c +++ b/vpx_dsp/x86/quantize_ssse3.c @@ -16,6 +16,7 @@ #include "vpx_dsp/x86/bitdepth_conversion_sse2.h" #include "vpx_dsp/x86/quantize_sse2.h" #include "vpx_dsp/x86/quantize_ssse3.h" +#include "vp9/common/vp9_scan.h" #include "vp9/encoder/vp9_block.h" void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, @@ -112,9 +113,10 @@ void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct scan_order *const scan_order) { const __m128i zero = _mm_setzero_si128(); int index; + const int16_t *iscan = scan_order->iscan; __m128i zbin, round, quant, dequant, shift; __m128i coeff0, coeff1; @@ -123,8 +125,6 @@ void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, __m128i all_zero; __m128i eob = zero, eob0; - (void)scan; - load_b_values32x32(mb_plane, &zbin, &round, &quant, dequant_ptr, &dequant, &shift); From cb5b047ad89d8183ca512a4b57eee816b18f76bf Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 22 Feb 2023 13:22:08 -0800 Subject: [PATCH 0683/1000] vp9_setup_mask: clear -Wshadow warnings Bug: webm:1793 Change-Id: If678fc195ef87cc634d31fb7b24e0c844a5cb7b0 --- vp9/common/vp9_loopfilter.c | 38 ++++++++++++++++++------------------- 1 file changed, 18 insertions(+), 20 deletions(-) diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c index 765cb11726..1a9d45ae77 100644 --- a/vp9/common/vp9_loopfilter.c +++ b/vp9/common/vp9_loopfilter.c @@ -932,32 +932,32 @@ void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col, break; default: for (idx_32 = 0; idx_32 < 4; mip += offset_32[idx_32], ++idx_32) { - const int shift_y = shift_32_y[idx_32]; - const int shift_uv = shift_32_uv[idx_32]; + const int shift_y_32 = shift_32_y[idx_32]; + const int shift_uv_32 = shift_32_uv[idx_32]; const int mi_32_col_offset = ((idx_32 & 1) << 2); const int mi_32_row_offset = ((idx_32 >> 1) << 2); if (mi_32_col_offset >= max_cols || mi_32_row_offset >= max_rows) continue; switch (mip[0]->sb_type) { case BLOCK_32X32: - build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm); + build_masks(lfi_n, mip[0], shift_y_32, shift_uv_32, lfm); break; case BLOCK_32X16: - build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm); + build_masks(lfi_n, mip[0], shift_y_32, shift_uv_32, lfm); if (mi_32_row_offset + 2 >= max_rows) continue; mip2 = mip + mode_info_stride * 2; - build_masks(lfi_n, mip2[0], shift_y + 16, shift_uv + 4, lfm); + build_masks(lfi_n, mip2[0], shift_y_32 + 16, shift_uv_32 + 4, lfm); break; case BLOCK_16X32: - build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm); + build_masks(lfi_n, mip[0], shift_y_32, shift_uv_32, lfm); if (mi_32_col_offset + 2 >= max_cols) continue; mip2 = mip + 2; - build_masks(lfi_n, mip2[0], shift_y + 2, shift_uv + 1, lfm); + build_masks(lfi_n, mip2[0], shift_y_32 + 2, shift_uv_32 + 1, lfm); break; default: for (idx_16 = 0; idx_16 < 4; mip += offset_16[idx_16], ++idx_16) { - const int shift_y = shift_32_y[idx_32] + shift_16_y[idx_16]; - const int shift_uv = shift_32_uv[idx_32] + shift_16_uv[idx_16]; + const int shift_y_16 = shift_y_32 + shift_16_y[idx_16]; + const int shift_uv_16 = shift_uv_32 + shift_16_uv[idx_16]; const int mi_16_col_offset = mi_32_col_offset + ((idx_16 & 1) << 1); const int mi_16_row_offset = @@ -968,28 +968,26 @@ void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col, switch (mip[0]->sb_type) { case BLOCK_16X16: - build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm); + build_masks(lfi_n, mip[0], shift_y_16, shift_uv_16, lfm); break; case BLOCK_16X8: - build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm); + build_masks(lfi_n, mip[0], shift_y_16, shift_uv_16, lfm); if (mi_16_row_offset + 1 >= max_rows) continue; mip2 = mip + mode_info_stride; - build_y_mask(lfi_n, mip2[0], shift_y + 8, lfm); + build_y_mask(lfi_n, mip2[0], shift_y_16 + 8, lfm); break; case BLOCK_8X16: - build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm); + build_masks(lfi_n, mip[0], shift_y_16, shift_uv_16, lfm); if (mi_16_col_offset + 1 >= max_cols) continue; mip2 = mip + 1; - build_y_mask(lfi_n, mip2[0], shift_y + 1, lfm); + build_y_mask(lfi_n, mip2[0], shift_y_16 + 1, lfm); break; default: { - const int shift_y = - shift_32_y[idx_32] + shift_16_y[idx_16] + shift_8_y[0]; - build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm); + const int shift_y_8_0 = shift_y_16 + shift_8_y[0]; + build_masks(lfi_n, mip[0], shift_y_8_0, shift_uv_16, lfm); mip += offset[0]; for (idx_8 = 1; idx_8 < 4; mip += offset[idx_8], ++idx_8) { - const int shift_y = shift_32_y[idx_32] + - shift_16_y[idx_16] + shift_8_y[idx_8]; + const int shift_y_8 = shift_y_16 + shift_8_y[idx_8]; const int mi_8_col_offset = mi_16_col_offset + ((idx_8 & 1)); const int mi_8_row_offset = @@ -998,7 +996,7 @@ void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col, if (mi_8_col_offset >= max_cols || mi_8_row_offset >= max_rows) continue; - build_y_mask(lfi_n, mip[0], shift_y, lfm); + build_y_mask(lfi_n, mip[0], shift_y_8, lfm); } break; } From 492f4c5538fd8c9af8e8e661bab29d985ebf2ae0 Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 22 Feb 2023 13:29:20 -0800 Subject: [PATCH 0684/1000] vp9_bitstream.c: clear -Wshadow warnings Bug: webm:1793 Change-Id: I8abac3c901ad24b642b39ea6e6081d8ba626853d --- vp9/encoder/vp9_bitstream.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c index a84c8b524f..17c123af6f 100644 --- a/vp9/encoder/vp9_bitstream.c +++ b/vp9/encoder/vp9_bitstream.c @@ -169,8 +169,8 @@ static void pack_mb_tokens(vpx_writer *w, TOKENEXTRA **tp, vpx_write_bit(w, p->extra & 1); } else { // t >= TWO_TOKEN && t < EOB_TOKEN const struct vp9_token *const a = &vp9_coef_encodings[t]; - const int v = a->value; - const int n = a->len; + int v = a->value; + int n = a->len; const int e = p->extra; vpx_write(w, 1, context_tree[2]); vp9_write_tree(w, vp9_coef_con_tree, @@ -179,8 +179,8 @@ static void pack_mb_tokens(vpx_writer *w, TOKENEXTRA **tp, if (t >= CATEGORY1_TOKEN) { const vp9_extra_bit *const b = &extra_bits[t]; const unsigned char *pb = b->prob; - int v = e >> 1; - int n = b->len; // number of bits in v, assumed nonzero + v = e >> 1; + n = b->len; // number of bits in v, assumed nonzero do { const int bb = (v >> --n) & 1; vpx_write(w, bb, *pb++); @@ -599,7 +599,6 @@ static void update_coef_probs_common(vpx_writer *const bc, VP9_COMP *cpi, for (t = 0; t < entropy_nodes_update; ++t) { vpx_prob newp = new_coef_probs[i][j][k][l][t]; vpx_prob *oldp = old_coef_probs[i][j][k][l] + t; - const vpx_prob upd = DIFF_UPDATE_PROB; int64_t s; int u = 0; if (t == PIVOT_NODE) From 405ae856664c1f8390a43d58dbbd96d1b572f095 Mon Sep 17 00:00:00 2001 From: Deepa K G Date: Thu, 2 Mar 2023 13:39:55 +0530 Subject: [PATCH 0685/1000] Refactor logic of skipping trellis coeff opt The code to enable trellis coefficient optimization is refactored using the sf 'trellis_opt_tx_rd'. This change facilitates adaptive skipping of trellis optimization based on block properties. Change-Id: Ia1ff7cbbe5acf86414410f62655d46c099387847 --- vp9/encoder/vp9_block.h | 2 +- vp9/encoder/vp9_encodeframe.c | 14 ++--- vp9/encoder/vp9_encodemb.c | 105 +++++++++++++++++++------------ vp9/encoder/vp9_encodemb.h | 5 +- vp9/encoder/vp9_encoder.h | 19 ++++++ vp9/encoder/vp9_rdopt.c | 37 +++++++---- vp9/encoder/vp9_speed_features.c | 15 +++-- vp9/encoder/vp9_speed_features.h | 19 +++++- 8 files changed, 147 insertions(+), 69 deletions(-) diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h index 3e2c9a3c35..116fc2447c 100644 --- a/vp9/encoder/vp9_block.h +++ b/vp9/encoder/vp9_block.h @@ -78,7 +78,7 @@ struct macroblock { int skip_recode; int skip_optimize; int q_index; - int block_qcoeff_opt; + double log_block_src_var; int block_tx_domain; // The equivalent error at the current rdmult of one whole bit (not one diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index 5b811016de..1d593cfc01 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -2021,20 +2021,20 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, TileDataEnc *tile_data, // Save rdmult before it might be changed, so it can be restored later. orig_rdmult = x->rdmult; - if ((cpi->sf.tx_domain_thresh > 0.0) || (cpi->sf.quant_opt_thresh > 0.0)) { + if ((cpi->sf.tx_domain_thresh > 0.0) || + (cpi->sf.trellis_opt_tx_rd.thresh > 0.0)) { double logvar = vp9_log_block_var(cpi, x, bsize); - // Check block complexity as part of descision on using pixel or transform + // Check block complexity as part of decision on using pixel or transform // domain distortion in rd tests. x->block_tx_domain = cpi->sf.allow_txfm_domain_distortion && (logvar >= cpi->sf.tx_domain_thresh); - // Check block complexity as part of descision on using quantized - // coefficient optimisation inside the rd loop. - x->block_qcoeff_opt = - cpi->sf.allow_quant_coeff_opt && (logvar <= cpi->sf.quant_opt_thresh); + // Store block complexity to decide on using quantized coefficient + // optimization inside the rd loop. + x->log_block_src_var = logvar; } else { x->block_tx_domain = cpi->sf.allow_txfm_domain_distortion; - x->block_qcoeff_opt = cpi->sf.allow_quant_coeff_opt; + x->log_block_src_var = 0.0; } set_segment_index(cpi, x, mi_row, mi_col, bsize, 0); diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c index fa222f9dcf..7c61419f8b 100644 --- a/vp9/encoder/vp9_encodemb.c +++ b/vp9/encoder/vp9_encodemb.c @@ -26,6 +26,7 @@ #include "vp9/common/vp9_scan.h" #include "vp9/encoder/vp9_encodemb.h" +#include "vp9/encoder/vp9_encoder.h" #include "vp9/encoder/vp9_rd.h" #include "vp9/encoder/vp9_tokenize.h" @@ -759,10 +760,19 @@ void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, int mi_col, MODE_INFO *mi = xd->mi[0]; int plane; #if CONFIG_MISMATCH_DEBUG - struct encode_b_args arg = { x, 1, NULL, NULL, + struct encode_b_args arg = { x, + 1, // enable_trellis_opt + 0.0, // trellis_opt_thresh + NULL, // above entropy context + NULL, // left entropy context &mi->skip, mi_row, mi_col, output_enabled }; #else - struct encode_b_args arg = { x, 1, NULL, NULL, &mi->skip }; + struct encode_b_args arg = { x, + 1, // enable_trellis_opt + 0.0, // trellis_opt_thresh + NULL, // above entropy context + NULL, // left entropy context + &mi->skip }; (void)mi_row; (void)mi_col; (void)output_enabled; @@ -780,9 +790,9 @@ void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, int mi_col, const TX_SIZE tx_size = plane ? get_uv_tx_size(mi, pd) : mi->tx_size; vp9_get_entropy_contexts(bsize, tx_size, pd, ctx.ta[plane], ctx.tl[plane]); - arg.enable_coeff_opt = 1; + arg.enable_trellis_opt = 1; } else { - arg.enable_coeff_opt = 0; + arg.enable_trellis_opt = 0; } arg.ta = ctx.ta[plane]; arg.tl = ctx.tl[plane]; @@ -814,17 +824,13 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, uint16_t *eob = &p->eobs[block]; const int src_stride = p->src.stride; const int dst_stride = pd->dst.stride; + int enable_trellis_opt = !x->skip_recode; ENTROPY_CONTEXT *a = NULL; ENTROPY_CONTEXT *l = NULL; int entropy_ctx = 0; dst = &pd->dst.buf[4 * (row * dst_stride + col)]; src = &p->src.buf[4 * (row * src_stride + col)]; src_diff = &p->src_diff[4 * (row * diff_stride + col)]; - if (args->enable_coeff_opt) { - a = &args->ta[col]; - l = &args->tl[row]; - entropy_ctx = combine_entropy_contexts(*a, *l); - } if (tx_size == TX_4X4) { tx_type = get_tx_type_4x4(get_plane_type(plane), xd, block); @@ -848,20 +854,42 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, // skip block condition should be handled before this is called. assert(!x->skip_block); + if (!x->skip_recode) { + const int tx_size_in_pixels = (1 << tx_size) << 2; +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + vpx_highbd_subtract_block(tx_size_in_pixels, tx_size_in_pixels, src_diff, + diff_stride, src, src_stride, dst, dst_stride, + xd->bd); + } else { + vpx_subtract_block(tx_size_in_pixels, tx_size_in_pixels, src_diff, + diff_stride, src, src_stride, dst, dst_stride); + } +#else + vpx_subtract_block(tx_size_in_pixels, tx_size_in_pixels, src_diff, + diff_stride, src, src_stride, dst, dst_stride); +#endif + enable_trellis_opt = do_trellis_opt(args); + } + + if (enable_trellis_opt) { + a = &args->ta[col]; + l = &args->tl[row]; + entropy_ctx = combine_entropy_contexts(*a, *l); + } + #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst); switch (tx_size) { case TX_32X32: if (!x->skip_recode) { - vpx_highbd_subtract_block(32, 32, src_diff, diff_stride, src, - src_stride, dst, dst_stride, xd->bd); highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); vpx_highbd_quantize_b_32x32( coeff, 1024, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); } - if (args->enable_coeff_opt && !x->skip_recode) { + if (enable_trellis_opt) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; } if (!x->skip_encode && *eob) { @@ -870,8 +898,6 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, break; case TX_16X16: if (!x->skip_recode) { - vpx_highbd_subtract_block(16, 16, src_diff, diff_stride, src, - src_stride, dst, dst_stride, xd->bd); if (tx_type == DCT_DCT) vpx_highbd_fdct16x16(src_diff, coeff, diff_stride); else @@ -880,7 +906,7 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); } - if (args->enable_coeff_opt && !x->skip_recode) { + if (enable_trellis_opt) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; } if (!x->skip_encode && *eob) { @@ -890,8 +916,6 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, break; case TX_8X8: if (!x->skip_recode) { - vpx_highbd_subtract_block(8, 8, src_diff, diff_stride, src, - src_stride, dst, dst_stride, xd->bd); if (tx_type == DCT_DCT) vpx_highbd_fdct8x8(src_diff, coeff, diff_stride); else @@ -900,7 +924,7 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); } - if (args->enable_coeff_opt && !x->skip_recode) { + if (enable_trellis_opt) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; } if (!x->skip_encode && *eob) { @@ -911,8 +935,6 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, default: assert(tx_size == TX_4X4); if (!x->skip_recode) { - vpx_highbd_subtract_block(4, 4, src_diff, diff_stride, src, - src_stride, dst, dst_stride, xd->bd); if (tx_type != DCT_DCT) vp9_highbd_fht4x4(src_diff, coeff, diff_stride, tx_type); else @@ -921,7 +943,7 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); } - if (args->enable_coeff_opt && !x->skip_recode) { + if (enable_trellis_opt) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; } if (!x->skip_encode && *eob) { @@ -945,14 +967,12 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, switch (tx_size) { case TX_32X32: if (!x->skip_recode) { - vpx_subtract_block(32, 32, src_diff, diff_stride, src, src_stride, dst, - dst_stride); fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); vpx_quantize_b_32x32(coeff, 1024, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); } - if (args->enable_coeff_opt && !x->skip_recode) { + if (enable_trellis_opt) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; } if (!x->skip_encode && *eob) @@ -960,14 +980,12 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, break; case TX_16X16: if (!x->skip_recode) { - vpx_subtract_block(16, 16, src_diff, diff_stride, src, src_stride, dst, - dst_stride); vp9_fht16x16(src_diff, coeff, diff_stride, tx_type); vpx_quantize_b(coeff, 256, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); } - if (args->enable_coeff_opt && !x->skip_recode) { + if (enable_trellis_opt) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; } if (!x->skip_encode && *eob) @@ -975,14 +993,12 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, break; case TX_8X8: if (!x->skip_recode) { - vpx_subtract_block(8, 8, src_diff, diff_stride, src, src_stride, dst, - dst_stride); vp9_fht8x8(src_diff, coeff, diff_stride, tx_type); vpx_quantize_b(coeff, 64, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); } - if (args->enable_coeff_opt && !x->skip_recode) { + if (enable_trellis_opt) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; } if (!x->skip_encode && *eob) @@ -991,8 +1007,6 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, default: assert(tx_size == TX_4X4); if (!x->skip_recode) { - vpx_subtract_block(4, 4, src_diff, diff_stride, src, src_stride, dst, - dst_stride); if (tx_type != DCT_DCT) vp9_fht4x4(src_diff, coeff, diff_stride, tx_type); else @@ -1001,7 +1015,7 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); } - if (args->enable_coeff_opt && !x->skip_recode) { + if (enable_trellis_opt) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; } if (!x->skip_encode && *eob) { @@ -1019,28 +1033,39 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, } void vp9_encode_intra_block_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane, - int enable_optimize_b) { + int enable_trellis_opt) { const MACROBLOCKD *const xd = &x->e_mbd; struct optimize_ctx ctx; #if CONFIG_MISMATCH_DEBUG // TODO(angiebird): make mismatch_debug support intra mode struct encode_b_args arg = { - x, enable_optimize_b, ctx.ta[plane], ctx.tl[plane], &xd->mi[0]->skip, 0, 0, - 0 + x, + enable_trellis_opt, + 0.0, // trellis_opt_thresh + ctx.ta[plane], + ctx.tl[plane], + &xd->mi[0]->skip, + 0, // mi_row + 0, // mi_col + 0 // output_enabled }; #else - struct encode_b_args arg = { x, enable_optimize_b, ctx.ta[plane], - ctx.tl[plane], &xd->mi[0]->skip }; + struct encode_b_args arg = { x, + enable_trellis_opt, + 0.0, // trellis_opt_thresh + ctx.ta[plane], + ctx.tl[plane], + &xd->mi[0]->skip }; #endif - if (enable_optimize_b && x->optimize && + if (enable_trellis_opt && x->optimize && (!x->skip_recode || !x->skip_optimize)) { const struct macroblockd_plane *const pd = &xd->plane[plane]; const TX_SIZE tx_size = plane ? get_uv_tx_size(xd->mi[0], pd) : xd->mi[0]->tx_size; vp9_get_entropy_contexts(bsize, tx_size, pd, ctx.ta[plane], ctx.tl[plane]); } else { - arg.enable_coeff_opt = 0; + arg.enable_trellis_opt = 0; } vp9_foreach_transformed_block_in_plane(xd, bsize, plane, diff --git a/vp9/encoder/vp9_encodemb.h b/vp9/encoder/vp9_encodemb.h index 1975ee73ac..4091b02149 100644 --- a/vp9/encoder/vp9_encodemb.h +++ b/vp9/encoder/vp9_encodemb.h @@ -20,7 +20,8 @@ extern "C" { struct encode_b_args { MACROBLOCK *x; - int enable_coeff_opt; + int enable_trellis_opt; + double trellis_opt_thresh; ENTROPY_CONTEXT *ta; ENTROPY_CONTEXT *tl; int8_t *skip; @@ -48,7 +49,7 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg); void vp9_encode_intra_block_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane, - int enable_optimize_b); + int enable_trellis_opt); #ifdef __cplusplus } // extern "C" diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 77de5c8754..0e95037dc2 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -1478,6 +1478,25 @@ static INLINE void alloc_frame_mvs(VP9_COMMON *const cm, int buffer_idx) { } } +// Check if trellis coefficient optimization of the transform block is enabled. +static INLINE int do_trellis_opt(void *arg) { + const struct encode_b_args *const args = (struct encode_b_args *)arg; + const MACROBLOCK *const x = args->x; + const int enable_trellis_opt = args->enable_trellis_opt; + const double trellis_opt_thresh = args->trellis_opt_thresh; + + switch (enable_trellis_opt) { + case DISABLE_TRELLIS_OPT: return 0; + case ENABLE_TRELLIS_OPT: return 1; + case ENABLE_TRELLIS_OPT_TX_RD_SRC_VAR: { + return (trellis_opt_thresh > 0.0) + ? (x->log_block_src_var <= trellis_opt_thresh) + : 1; + } + default: assert(0 && "Invalid trellis optimization method."); return 1; + } +} + #if CONFIG_COLLECT_COMPONENT_TIMING static INLINE void start_timing(VP9_COMP *cpi, int component) { vpx_usec_timer_start(&cpi->component_timer[component]); diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index f87ab3e0bc..3a68952916 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -562,7 +562,7 @@ static unsigned pixel_sse(const VP9_COMP *const cpi, const MACROBLOCKD *xd, return sse; } -// Compute the squares sum squares on all visible 4x4s in the transform block. +// Compute the sum of squares on all visible 4x4s in the transform block. static int64_t sum_squares_visible(const MACROBLOCKD *xd, const struct macroblockd_plane *const pd, const int16_t *diff, const int diff_stride, @@ -749,20 +749,34 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col, const struct macroblockd_plane *const pd = &xd->plane[plane]; const int dst_stride = pd->dst.stride; const uint8_t *dst = &pd->dst.buf[4 * (blk_row * dst_stride + blk_col)]; + const int enable_trellis_opt = args->cpi->sf.trellis_opt_tx_rd.method; + const double trellis_opt_thresh = args->cpi->sf.trellis_opt_tx_rd.thresh; +#if CONFIG_MISMATCH_DEBUG + struct encode_b_args encode_b_arg = { + x, + enable_trellis_opt, + trellis_opt_thresh, + args->t_above, + args->t_left, + &mi->skip, + 0, // mi_row + 0, // mi_col + 0 // output_enabled + }; +#else + struct encode_b_args encode_b_arg = { x, + enable_trellis_opt, + trellis_opt_thresh, + args->t_above, + args->t_left, + &mi->skip }; +#endif if (args->exit_early) return; if (!is_inter_block(mi)) { -#if CONFIG_MISMATCH_DEBUG - struct encode_b_args intra_arg = { - x, x->block_qcoeff_opt, args->t_above, args->t_left, &mi->skip, 0, 0, 0 - }; -#else - struct encode_b_args intra_arg = { x, x->block_qcoeff_opt, args->t_above, - args->t_left, &mi->skip }; -#endif vp9_encode_block_intra(plane, block, blk_row, blk_col, plane_bsize, tx_size, - &intra_arg); + &encode_b_arg); if (recon) { uint8_t *rec_ptr = &recon->buf[4 * (blk_row * recon->stride + blk_col)]; copy_block_visible(xd, pd, dst, dst_stride, rec_ptr, recon->stride, @@ -803,9 +817,10 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col, if (skip_txfm_flag == SKIP_TXFM_NONE || (recon && skip_txfm_flag == SKIP_TXFM_AC_ONLY)) { + const int enable_trellis_opt = do_trellis_opt(&encode_b_arg); // full forward transform and quantization vp9_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, tx_size); - if (x->block_qcoeff_opt) + if (enable_trellis_opt) vp9_optimize_b(x, plane, block, tx_size, coeff_ctx); dist_block(args->cpi, x, plane, plane_bsize, block, blk_row, blk_col, tx_size, &dist, &sse, recon); diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c index f47e3d71c9..d07bb34ae1 100644 --- a/vp9/encoder/vp9_speed_features.c +++ b/vp9/encoder/vp9_speed_features.c @@ -275,8 +275,10 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, sf->allow_txfm_domain_distortion = 1; sf->tx_domain_thresh = tx_dom_thresholds[(speed < 6) ? speed : 5]; - sf->allow_quant_coeff_opt = sf->optimize_coefficients; - sf->quant_opt_thresh = qopt_thresholds[(speed < 6) ? speed : 5]; + sf->trellis_opt_tx_rd.method = sf->optimize_coefficients + ? ENABLE_TRELLIS_OPT_TX_RD_SRC_VAR + : DISABLE_TRELLIS_OPT; + sf->trellis_opt_tx_rd.thresh = qopt_thresholds[(speed < 6) ? speed : 5]; sf->less_rectangular_check = 1; sf->use_rd_breakout = 1; sf->adaptive_motion_search = 1; @@ -470,8 +472,8 @@ static void set_rt_speed_feature_framesize_independent( if (speed >= 1) { sf->allow_txfm_domain_distortion = 1; sf->tx_domain_thresh = 0.0; - sf->allow_quant_coeff_opt = 0; - sf->quant_opt_thresh = 0.0; + sf->trellis_opt_tx_rd.method = DISABLE_TRELLIS_OPT; + sf->trellis_opt_tx_rd.thresh = 0.0; sf->use_square_partition_only = !frame_is_intra_only(cm); sf->less_rectangular_check = 1; sf->tx_size_search_method = @@ -946,8 +948,9 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi, int speed) { sf->adaptive_interp_filter_search = 0; sf->allow_txfm_domain_distortion = 0; sf->tx_domain_thresh = 99.0; - sf->allow_quant_coeff_opt = sf->optimize_coefficients; - sf->quant_opt_thresh = 99.0; + sf->trellis_opt_tx_rd.method = + sf->optimize_coefficients ? ENABLE_TRELLIS_OPT : DISABLE_TRELLIS_OPT; + sf->trellis_opt_tx_rd.thresh = 99.0; sf->allow_acl = 1; sf->enable_tpl_model = oxcf->enable_tpl_model; sf->prune_ref_frame_for_rect_partitions = 0; diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h index e30a26084a..fceeb94023 100644 --- a/vp9/encoder/vp9_speed_features.h +++ b/vp9/encoder/vp9_speed_features.h @@ -246,6 +246,21 @@ typedef enum { USE_8_TAPS_SHARP, } SUBPEL_SEARCH_TYPE; +typedef enum { + // Disable trellis coefficient optimization + DISABLE_TRELLIS_OPT, + // Enable trellis coefficient optimization + ENABLE_TRELLIS_OPT, + // Enable trellis coefficient optimization based on source variance of the + // prediction block during transform RD + ENABLE_TRELLIS_OPT_TX_RD_SRC_VAR, +} ENABLE_TRELLIS_OPT_METHOD; + +typedef struct TRELLIS_OPT_CONTROL { + ENABLE_TRELLIS_OPT_METHOD method; + double thresh; +} TRELLIS_OPT_CONTROL; + typedef struct SPEED_FEATURES { MV_SPEED_FEATURES mv; @@ -292,8 +307,8 @@ typedef struct SPEED_FEATURES { int coeff_prob_appx_step; // Enable uniform quantizer followed by trellis coefficient optimization - int allow_quant_coeff_opt; - double quant_opt_thresh; + // during transform RD + TRELLIS_OPT_CONTROL trellis_opt_tx_rd; // Enable asymptotic closed-loop encoding decision for key frame and // alternate reference frames. From 55e102dc54844b2f749395dbcd53d1e01b0e5030 Mon Sep 17 00:00:00 2001 From: Deepa K G Date: Thu, 2 Mar 2023 13:39:55 +0530 Subject: [PATCH 0686/1000] Skip trellis coeff opt based on tx block properties The trellis coefficient optimization is skipped for blocks with larger residual mse. Instruction Count BD-Rate Loss(%) cpu Resolution Reduction(%) avg.psnr ovr.psnr ssim 0 LOWRES2 9.467 0.0921 0.1057 0.0362 0 MIDRES2 4.328 -0.0155 0.0694 0.0178 0 HDRES2 1.858 0.0231 0.0214 -0.0034 0 Average 5.218 0.0332 0.0655 0.0169 STATS_CHANGED Change-Id: I321a9b1a34ebb59b7b6a065b5b2d717c8767a4a5 --- vp9/encoder/vp9_encodemb.c | 15 ++++- vp9/encoder/vp9_encodemb.h | 2 + vp9/encoder/vp9_encoder.h | 93 ++++++++++++++++++++++++++++-- vp9/encoder/vp9_rdopt.c | 97 +++++++++++--------------------- vp9/encoder/vp9_speed_features.c | 4 ++ vp9/encoder/vp9_speed_features.h | 3 + 6 files changed, 141 insertions(+), 73 deletions(-) diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c index 7c61419f8b..c079aa0547 100644 --- a/vp9/encoder/vp9_encodemb.c +++ b/vp9/encoder/vp9_encodemb.c @@ -763,6 +763,8 @@ void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, int mi_col, struct encode_b_args arg = { x, 1, // enable_trellis_opt 0.0, // trellis_opt_thresh + NULL, // &sse_calc_done + NULL, // &sse NULL, // above entropy context NULL, // left entropy context &mi->skip, mi_row, mi_col, output_enabled }; @@ -770,6 +772,8 @@ void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, int mi_col, struct encode_b_args arg = { x, 1, // enable_trellis_opt 0.0, // trellis_opt_thresh + NULL, // &sse_calc_done + NULL, // &sse NULL, // above entropy context NULL, // left entropy context &mi->skip }; @@ -869,7 +873,8 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, vpx_subtract_block(tx_size_in_pixels, tx_size_in_pixels, src_diff, diff_stride, src, src_stride, dst, dst_stride); #endif - enable_trellis_opt = do_trellis_opt(args); + enable_trellis_opt = do_trellis_opt(pd, src_diff, diff_stride, row, col, + plane_bsize, tx_size, args); } if (enable_trellis_opt) { @@ -1041,7 +1046,9 @@ void vp9_encode_intra_block_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane, struct encode_b_args arg = { x, enable_trellis_opt, - 0.0, // trellis_opt_thresh + 0.0, // trellis_opt_thresh + NULL, // &sse_calc_done + NULL, // &sse ctx.ta[plane], ctx.tl[plane], &xd->mi[0]->skip, @@ -1052,7 +1059,9 @@ void vp9_encode_intra_block_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane, #else struct encode_b_args arg = { x, enable_trellis_opt, - 0.0, // trellis_opt_thresh + 0.0, // trellis_opt_thresh + NULL, // &sse_calc_done + NULL, // &sse ctx.ta[plane], ctx.tl[plane], &xd->mi[0]->skip }; diff --git a/vp9/encoder/vp9_encodemb.h b/vp9/encoder/vp9_encodemb.h index 4091b02149..1391446bed 100644 --- a/vp9/encoder/vp9_encodemb.h +++ b/vp9/encoder/vp9_encodemb.h @@ -22,6 +22,8 @@ struct encode_b_args { MACROBLOCK *x; int enable_trellis_opt; double trellis_opt_thresh; + int *sse_calc_done; + int64_t *sse; ENTROPY_CONTEXT *ta; ENTROPY_CONTEXT *tl; int8_t *skip; diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 0e95037dc2..442ef1899c 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -14,6 +14,7 @@ #include #include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" #include "vpx/internal/vpx_codec_internal.h" #include "vpx/vpx_ext_ratectrl.h" #include "vpx/vp8cx.h" @@ -1478,21 +1479,101 @@ static INLINE void alloc_frame_mvs(VP9_COMMON *const cm, int buffer_idx) { } } +static INLINE int num_4x4_to_edge(int plane_4x4_dim, int mb_to_edge_dim, + int subsampling_dim, int blk_dim) { + return plane_4x4_dim + (mb_to_edge_dim >> (5 + subsampling_dim)) - blk_dim; +} + +// Compute the sum of squares on all visible 4x4s in the transform block. +static int64_t sum_squares_visible(const MACROBLOCKD *xd, + const struct macroblockd_plane *const pd, + const int16_t *diff, const int diff_stride, + int blk_row, int blk_col, + const BLOCK_SIZE plane_bsize, + const BLOCK_SIZE tx_bsize, + int *visible_width, int *visible_height) { + int64_t sse; + const int plane_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize]; + const int plane_4x4_h = num_4x4_blocks_high_lookup[plane_bsize]; + const int tx_4x4_w = num_4x4_blocks_wide_lookup[tx_bsize]; + const int tx_4x4_h = num_4x4_blocks_high_lookup[tx_bsize]; + const int b4x4s_to_right_edge = num_4x4_to_edge( + plane_4x4_w, xd->mb_to_right_edge, pd->subsampling_x, blk_col); + const int b4x4s_to_bottom_edge = num_4x4_to_edge( + plane_4x4_h, xd->mb_to_bottom_edge, pd->subsampling_y, blk_row); + if (tx_bsize == BLOCK_4X4 || + (b4x4s_to_right_edge >= tx_4x4_w && b4x4s_to_bottom_edge >= tx_4x4_h)) { + assert(tx_4x4_w == tx_4x4_h); + sse = (int64_t)vpx_sum_squares_2d_i16(diff, diff_stride, tx_4x4_w << 2); + *visible_width = tx_4x4_w << 2; + *visible_height = tx_4x4_h << 2; + } else { + int r, c; + const int max_r = VPXMIN(b4x4s_to_bottom_edge, tx_4x4_h); + const int max_c = VPXMIN(b4x4s_to_right_edge, tx_4x4_w); + sse = 0; + // if we are in the unrestricted motion border. + for (r = 0; r < max_r; ++r) { + // Skip visiting the sub blocks that are wholly within the UMV. + for (c = 0; c < max_c; ++c) { + sse += (int64_t)vpx_sum_squares_2d_i16( + diff + r * diff_stride * 4 + c * 4, diff_stride, 4); + } + } + *visible_width = max_c << 2; + *visible_height = max_r << 2; + } + return sse; +} + // Check if trellis coefficient optimization of the transform block is enabled. -static INLINE int do_trellis_opt(void *arg) { +static INLINE int do_trellis_opt(const struct macroblockd_plane *pd, + const int16_t *src_diff, int diff_stride, + int blk_row, int blk_col, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, + void *arg) { const struct encode_b_args *const args = (struct encode_b_args *)arg; const MACROBLOCK *const x = args->x; - const int enable_trellis_opt = args->enable_trellis_opt; - const double trellis_opt_thresh = args->trellis_opt_thresh; - switch (enable_trellis_opt) { + switch (args->enable_trellis_opt) { case DISABLE_TRELLIS_OPT: return 0; case ENABLE_TRELLIS_OPT: return 1; case ENABLE_TRELLIS_OPT_TX_RD_SRC_VAR: { - return (trellis_opt_thresh > 0.0) - ? (x->log_block_src_var <= trellis_opt_thresh) + vpx_clear_system_state(); + + return (args->trellis_opt_thresh > 0.0) + ? (x->log_block_src_var <= args->trellis_opt_thresh) : 1; } + case ENABLE_TRELLIS_OPT_TX_RD_RESIDUAL_MSE: { + const MACROBLOCKD *const xd = &x->e_mbd; + const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size]; +#if CONFIG_VP9_HIGHBITDEPTH + const int dequant_shift = + (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 : 3; +#else + const int dequant_shift = 3; +#endif // CONFIG_VP9_HIGHBITDEPTH + const int qstep = pd->dequant[1] >> dequant_shift; + int *sse_calc_done = args->sse_calc_done; + int64_t *sse = args->sse; + int visible_width = 0, visible_height = 0; + + // TODO: Enable the sf for high bit-depth case + if ((xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) || !sse || + !sse_calc_done) + return 1; + + *sse = sum_squares_visible(xd, pd, src_diff, diff_stride, blk_row, + blk_col, plane_bsize, tx_bsize, &visible_width, + &visible_height); + *sse_calc_done = 1; + + vpx_clear_system_state(); + + return (*(sse) <= (int64_t)visible_width * visible_height * qstep * + qstep * args->trellis_opt_thresh); + } default: assert(0 && "Invalid trellis optimization method."); return 1; } } diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index 3a68952916..88e7b538dc 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -457,11 +457,6 @@ static int cost_coeffs(MACROBLOCK *x, int plane, int block, TX_SIZE tx_size, return cost; } -static INLINE int num_4x4_to_edge(int plane_4x4_dim, int mb_to_edge_dim, - int subsampling_dim, int blk_dim) { - return plane_4x4_dim + (mb_to_edge_dim >> (5 + subsampling_dim)) - blk_dim; -} - // Copy all visible 4x4s in the transform block. static void copy_block_visible(const MACROBLOCKD *xd, const struct macroblockd_plane *const pd, @@ -562,47 +557,11 @@ static unsigned pixel_sse(const VP9_COMP *const cpi, const MACROBLOCKD *xd, return sse; } -// Compute the sum of squares on all visible 4x4s in the transform block. -static int64_t sum_squares_visible(const MACROBLOCKD *xd, - const struct macroblockd_plane *const pd, - const int16_t *diff, const int diff_stride, - int blk_row, int blk_col, - const BLOCK_SIZE plane_bsize, - const BLOCK_SIZE tx_bsize) { - int64_t sse; - const int plane_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize]; - const int plane_4x4_h = num_4x4_blocks_high_lookup[plane_bsize]; - const int tx_4x4_w = num_4x4_blocks_wide_lookup[tx_bsize]; - const int tx_4x4_h = num_4x4_blocks_high_lookup[tx_bsize]; - int b4x4s_to_right_edge = num_4x4_to_edge(plane_4x4_w, xd->mb_to_right_edge, - pd->subsampling_x, blk_col); - int b4x4s_to_bottom_edge = num_4x4_to_edge(plane_4x4_h, xd->mb_to_bottom_edge, - pd->subsampling_y, blk_row); - if (tx_bsize == BLOCK_4X4 || - (b4x4s_to_right_edge >= tx_4x4_w && b4x4s_to_bottom_edge >= tx_4x4_h)) { - assert(tx_4x4_w == tx_4x4_h); - sse = (int64_t)vpx_sum_squares_2d_i16(diff, diff_stride, tx_4x4_w << 2); - } else { - int r, c; - int max_r = VPXMIN(b4x4s_to_bottom_edge, tx_4x4_h); - int max_c = VPXMIN(b4x4s_to_right_edge, tx_4x4_w); - sse = 0; - // if we are in the unrestricted motion border. - for (r = 0; r < max_r; ++r) { - // Skip visiting the sub blocks that are wholly within the UMV. - for (c = 0; c < max_c; ++c) { - sse += (int64_t)vpx_sum_squares_2d_i16( - diff + r * diff_stride * 4 + c * 4, diff_stride, 4); - } - } - } - return sse; -} - static void dist_block(const VP9_COMP *cpi, MACROBLOCK *x, int plane, BLOCK_SIZE plane_bsize, int block, int blk_row, int blk_col, TX_SIZE tx_size, int64_t *out_dist, - int64_t *out_sse, struct buf_2d *out_recon) { + int64_t *out_sse, struct buf_2d *out_recon, + int sse_calc_done) { MACROBLOCKD *const xd = &x->e_mbd; const struct macroblock_plane *const p = &x->plane[plane]; const struct macroblockd_plane *const pd = &xd->plane[plane]; @@ -652,8 +611,12 @@ static void dist_block(const VP9_COMP *cpi, MACROBLOCK *x, int plane, const tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); unsigned int tmp; - tmp = pixel_sse(cpi, xd, pd, src, src_stride, dst, dst_stride, blk_row, - blk_col, plane_bsize, tx_bsize); + if (sse_calc_done) { + tmp = (unsigned int)(*out_sse); + } else { + tmp = pixel_sse(cpi, xd, pd, src, src_stride, dst, dst_stride, blk_row, + blk_col, plane_bsize, tx_bsize); + } *out_sse = (int64_t)tmp * 16; if (out_recon) { const int out_recon_idx = 4 * (blk_row * out_recon->stride + blk_col); @@ -751,25 +714,20 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col, const uint8_t *dst = &pd->dst.buf[4 * (blk_row * dst_stride + blk_col)]; const int enable_trellis_opt = args->cpi->sf.trellis_opt_tx_rd.method; const double trellis_opt_thresh = args->cpi->sf.trellis_opt_tx_rd.thresh; + int sse_calc_done = 0; #if CONFIG_MISMATCH_DEBUG struct encode_b_args encode_b_arg = { - x, - enable_trellis_opt, - trellis_opt_thresh, - args->t_above, - args->t_left, - &mi->skip, + x, enable_trellis_opt, trellis_opt_thresh, &sse_calc_done, + &sse, args->t_above, args->t_left, &mi->skip, 0, // mi_row 0, // mi_col 0 // output_enabled }; #else - struct encode_b_args encode_b_arg = { x, - enable_trellis_opt, - trellis_opt_thresh, - args->t_above, - args->t_left, - &mi->skip }; + struct encode_b_args encode_b_arg = { + x, enable_trellis_opt, trellis_opt_thresh, &sse_calc_done, + &sse, args->t_above, args->t_left, &mi->skip + }; #endif if (args->exit_early) return; @@ -784,16 +742,21 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col, } if (x->block_tx_domain) { dist_block(args->cpi, x, plane, plane_bsize, block, blk_row, blk_col, - tx_size, &dist, &sse, /*recon =*/0); + tx_size, &dist, &sse, /*recon =*/0, sse_calc_done); } else { const struct macroblock_plane *const p = &x->plane[plane]; const int src_stride = p->src.stride; - const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize]; const uint8_t *src = &p->src.buf[4 * (blk_row * src_stride + blk_col)]; - const int16_t *diff = &p->src_diff[4 * (blk_row * diff_stride + blk_col)]; unsigned int tmp; - sse = sum_squares_visible(xd, pd, diff, diff_stride, blk_row, blk_col, - plane_bsize, tx_bsize); + if (!sse_calc_done) { + const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize]; + const int16_t *diff = + &p->src_diff[4 * (blk_row * diff_stride + blk_col)]; + int visible_width, visible_height; + sse = sum_squares_visible(xd, pd, diff, diff_stride, blk_row, blk_col, + plane_bsize, tx_bsize, &visible_width, + &visible_height); + } #if CONFIG_VP9_HIGHBITDEPTH if ((xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) && (xd->bd > 8)) sse = ROUND64_POWER_OF_TWO(sse, (xd->bd - 8) * 2); @@ -817,13 +780,19 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col, if (skip_txfm_flag == SKIP_TXFM_NONE || (recon && skip_txfm_flag == SKIP_TXFM_AC_ONLY)) { - const int enable_trellis_opt = do_trellis_opt(&encode_b_arg); + const struct macroblock_plane *const p = &x->plane[plane]; + const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize]; + const int16_t *const diff = + &p->src_diff[4 * (blk_row * diff_stride + blk_col)]; + const int enable_trellis_opt = + do_trellis_opt(pd, diff, diff_stride, blk_row, blk_col, plane_bsize, + tx_size, &encode_b_arg); // full forward transform and quantization vp9_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, tx_size); if (enable_trellis_opt) vp9_optimize_b(x, plane, block, tx_size, coeff_ctx); dist_block(args->cpi, x, plane, plane_bsize, block, blk_row, blk_col, - tx_size, &dist, &sse, recon); + tx_size, &dist, &sse, recon, sse_calc_done); } else if (skip_txfm_flag == SKIP_TXFM_AC_ONLY) { // compute DC coefficient tran_low_t *const coeff = BLOCK_OFFSET(x->plane[plane].coeff, block); diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c index d07bb34ae1..3e121b799f 100644 --- a/vp9/encoder/vp9_speed_features.c +++ b/vp9/encoder/vp9_speed_features.c @@ -229,6 +229,10 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, sf->use_square_partition_only = !boosted; sf->early_term_interp_search_plane_rd = 1; sf->cb_pred_filter_search = 1; + sf->trellis_opt_tx_rd.method = sf->optimize_coefficients + ? ENABLE_TRELLIS_OPT_TX_RD_RESIDUAL_MSE + : DISABLE_TRELLIS_OPT; + sf->trellis_opt_tx_rd.thresh = boosted ? 4.0 : 3.0; sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V; diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h index fceeb94023..d32bf09e4e 100644 --- a/vp9/encoder/vp9_speed_features.h +++ b/vp9/encoder/vp9_speed_features.h @@ -254,6 +254,9 @@ typedef enum { // Enable trellis coefficient optimization based on source variance of the // prediction block during transform RD ENABLE_TRELLIS_OPT_TX_RD_SRC_VAR, + // Enable trellis coefficient optimization based on residual mse of the + // transform block during transform RD + ENABLE_TRELLIS_OPT_TX_RD_RESIDUAL_MSE, } ENABLE_TRELLIS_OPT_METHOD; typedef struct TRELLIS_OPT_CONTROL { From 9c15fb62b3dfe1c698dc28f9efedb022b0ef8eb8 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Fri, 17 Mar 2023 14:34:42 -0400 Subject: [PATCH 0687/1000] Add codec control to get tpl stats Add command line flag to vpxenc to export tpl stats Bug: b/273736974 Change-Id: I6980096531b0c12fbf7a307fdef4c562d0c29e32 --- vp9/vp9_cx_iface.c | 23 +++++++++++++++++++++++ vpx/vp8cx.h | 9 +++++++++ vpxenc.c | 20 ++++++++++++++++++-- vpxenc.h | 1 + 4 files changed, 51 insertions(+), 2 deletions(-) diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index 4c7eaed725..ec2105b24b 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -1788,6 +1788,28 @@ static vpx_codec_err_t ctrl_get_svc_ref_frame_config(vpx_codec_alg_priv_t *ctx, return VPX_CODEC_OK; } +static vpx_codec_err_t ctrl_get_tpl_stats(vpx_codec_alg_priv_t *ctx, + va_list args) { + VP9_COMP *const cpi = ctx->cpi; + VP9_COMMON *const cm = &cpi->common; + TplDepFrame **data = va_arg(args, TplDepFrame **); + int i; + *data = vpx_calloc(MAX_ARF_GOP_SIZE, sizeof(TplDepFrame)); + for (i = 0; i < MAX_ARF_GOP_SIZE; i++) { + const int mi_cols = mi_cols_aligned_to_sb(cm->mi_cols); + const int mi_rows = mi_cols_aligned_to_sb(cm->mi_rows); + const int copy_size = mi_cols * mi_rows * sizeof(*(*data)[i].tpl_stats_ptr); + (*data)[i] = cpi->tpl_stats[i]; + (*data)[i].tpl_stats_ptr = NULL; + (*data)[i].tpl_stats_ptr = + vpx_calloc(mi_rows * mi_cols, sizeof(*(*data)[i].tpl_stats_ptr)); + memcpy((*data)[i].tpl_stats_ptr, cpi->tpl_stats[i].tpl_stats_ptr, + copy_size); + } + + return VPX_CODEC_OK; +} + static vpx_codec_err_t ctrl_set_svc_ref_frame_config(vpx_codec_alg_priv_t *ctx, va_list args) { VP9_COMP *const cpi = ctx->cpi; @@ -2035,6 +2057,7 @@ static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = { { VP9E_GET_ACTIVEMAP, ctrl_get_active_map }, { VP9E_GET_LEVEL, ctrl_get_level }, { VP9E_GET_SVC_REF_FRAME_CONFIG, ctrl_get_svc_ref_frame_config }, + { VP9E_GET_TPL_STATS, ctrl_get_tpl_stats }, { -1, NULL }, }; diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h index e0b679fbb7..01c0558673 100644 --- a/vpx/vp8cx.h +++ b/vpx/vp8cx.h @@ -767,6 +767,13 @@ enum vp8e_enc_control_id { * */ VP9E_SET_QUANTIZER_ONE_PASS, + + /*!\brief Codec control to get TPL stats for the current frame. + * + * Supported in codecs: VP9 + * + */ + VP9E_GET_TPL_STATS, }; /*!\brief vpx 1-D scaling mode @@ -1097,6 +1104,8 @@ VPX_CTRL_USE_TYPE(VP8E_SET_RTC_EXTERNAL_RATECTRL, int) #define VPX_CTRL_VP8E_SET_RTC_EXTERNAL_RATECTRL VPX_CTRL_USE_TYPE(VP9E_SET_QUANTIZER_ONE_PASS, int) #define VPX_CTRL_VP9E_SET_QUANTIZER_ONE_PASS +VPX_CTRL_USE_TYPE(VP9E_GET_TPL_STATS, void *) +#define VPX_CTRL_VP9E_GET_TPL_STATS /*!\endcond */ /*! @} - end defgroup vp8_encoder */ diff --git a/vpxenc.c b/vpxenc.c index 61672acadd..9d57708f37 100644 --- a/vpxenc.c +++ b/vpxenc.c @@ -39,6 +39,10 @@ #include "vpx/vp8dx.h" #endif +#if CONFIG_VP9_ENCODER +#include "vp9/encoder/vp9_encoder.h" +#endif + #include "vpx/vpx_integer.h" #include "vpx_ports/mem_ops.h" #include "vpx_ports/vpx_timer.h" @@ -161,6 +165,8 @@ static const arg_def_t disable_warnings = static const arg_def_t disable_warning_prompt = ARG_DEF("y", "disable-warning-prompt", 0, "Display warnings, but do not prompt user to continue."); +static const arg_def_t export_tpl_stats = + ARG_DEF(NULL, "export-tpl-stats", 0, "Export TPL stats of vp9 encoder"); #if CONFIG_VP9_HIGHBITDEPTH static const arg_def_t test16bitinternalarg = ARG_DEF( @@ -191,6 +197,7 @@ static const arg_def_t *main_args[] = { &help, &disable_warnings, &disable_warning_prompt, &recontest, + &export_tpl_stats, NULL }; static const arg_def_t usage = @@ -531,9 +538,7 @@ static const arg_def_t disable_loopfilter = "1: Loopfilter off for non reference frames\n" " " "2: Loopfilter off for all frames"); -#endif -#if CONFIG_VP9_ENCODER static const arg_def_t *vp9_args[] = { &cpu_used_vp9, &auto_altref_vp9, &sharpness, @@ -804,6 +809,8 @@ static void parse_global_config(struct VpxEncoderConfig *global, char **argv) { global->disable_warnings = 1; else if (arg_match(&arg, &disable_warning_prompt, argi)) global->disable_warning_prompt = 1; + else if (arg_match(&arg, &export_tpl_stats, argi)) + global->export_tpl_stats = 1; else argj++; } @@ -1982,6 +1989,15 @@ int main(int argc, const char **argv_) { if (got_data && global.test_decode != TEST_DECODE_OFF) FOREACH_STREAM(test_decode(stream, global.test_decode, global.codec)); + +#if CONFIG_VP9_ENCODER + if (got_data && global.export_tpl_stats) { + TplDepFrame *tpl_stats = NULL; + FOREACH_STREAM(vpx_codec_control(&stream->encoder, VP9E_GET_TPL_STATS, + &tpl_stats)); + vpx_free(tpl_stats); + } +#endif } fflush(stdout); diff --git a/vpxenc.h b/vpxenc.h index be54840f7d..f065f086db 100644 --- a/vpxenc.h +++ b/vpxenc.h @@ -56,6 +56,7 @@ struct VpxEncoderConfig { int disable_warnings; int disable_warning_prompt; int experimental_bitstream; + int export_tpl_stats; }; #ifdef __cplusplus From e4f0df53ece296c4cb7c7d7911025e020bc6e882 Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 20 Mar 2023 16:43:47 -0700 Subject: [PATCH 0688/1000] vp8_sixtap_predict16x16_neon: fix overread Shift the final read from the source by 3 to avoid breaking the assumption that the 6-tap filter needs only 5 pixels outside of the macroblock; this matches the sse2 and ssse3 implementations. It's possible this restriction could be removed if the source buffers are assumed to be padded. Bug: webm:1795 Change-Id: I4c791e3a214898a503c78f4cedca154c75cdbaef Fixed: webm:1795 --- test/predict_test.cc | 4 +--- vp8/common/arm/neon/sixtappredict_neon.c | 8 +++----- vp8/common/rtcd_defs.pl | 4 +--- 3 files changed, 5 insertions(+), 11 deletions(-) diff --git a/test/predict_test.cc b/test/predict_test.cc index e49d98272e..7472970576 100644 --- a/test/predict_test.cc +++ b/test/predict_test.cc @@ -307,9 +307,7 @@ INSTANTIATE_TEST_SUITE_P( #if HAVE_NEON INSTANTIATE_TEST_SUITE_P( NEON, SixtapPredictTest, - ::testing::Values(/*TODO(https://crbug.com/webm/1795): enable this after - buffer overflows are fixed. - make_tuple(16, 16, &vp8_sixtap_predict16x16_neon),*/ + ::testing::Values(make_tuple(16, 16, &vp8_sixtap_predict16x16_neon), make_tuple(8, 8, &vp8_sixtap_predict8x8_neon), make_tuple(8, 4, &vp8_sixtap_predict8x4_neon), make_tuple(4, 4, &vp8_sixtap_predict4x4_neon))); diff --git a/vp8/common/arm/neon/sixtappredict_neon.c b/vp8/common/arm/neon/sixtappredict_neon.c index 4960d16516..b15cfb4112 100644 --- a/vp8/common/arm/neon/sixtappredict_neon.c +++ b/vp8/common/arm/neon/sixtappredict_neon.c @@ -1253,9 +1253,6 @@ void vp8_sixtap_predict8x8_neon(unsigned char *src_ptr, int src_pixels_per_line, return; } -// TODO(https://crbug.com/webm/1795): enable this after buffer overflows are -// fixed. -#if 0 void vp8_sixtap_predict16x16_neon(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, @@ -1507,7 +1504,9 @@ void vp8_sixtap_predict16x16_neon(unsigned char *src_ptr, src += src_pixels_per_line; d12u8 = vld1_u8(src); d13u8 = vld1_u8(src + 8); - d14u8 = vld1_u8(src + 16); + // Only 5 pixels are needed, avoid a potential out of bounds read. + d14u8 = vld1_u8(src + 13); + d14u8 = vext_u8(d14u8, d14u8, 3); src += src_pixels_per_line; __builtin_prefetch(src); @@ -1731,4 +1730,3 @@ void vp8_sixtap_predict16x16_neon(unsigned char *src_ptr, } return; } -#endif // 0 diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl index 05e67ce11b..739a612847 100644 --- a/vp8/common/rtcd_defs.pl +++ b/vp8/common/rtcd_defs.pl @@ -146,9 +146,7 @@ () # Subpixel # add_proto qw/void vp8_sixtap_predict16x16/, "unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch"; -# TODO(https://crbug.com/webm/1795): enable neon after buffer overflows are -# fixed. -specialize qw/vp8_sixtap_predict16x16 sse2 ssse3 dspr2 msa mmi lsx/; +specialize qw/vp8_sixtap_predict16x16 sse2 ssse3 neon dspr2 msa mmi lsx/; add_proto qw/void vp8_sixtap_predict8x8/, "unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch"; specialize qw/vp8_sixtap_predict8x8 sse2 ssse3 neon dspr2 msa mmi lsx/; From faa9142f5d5af0a6c4b998929aac928a38515ae3 Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 20 Mar 2023 16:56:58 -0700 Subject: [PATCH 0689/1000] sixtappredict_neon.c,cosmetics: fix a typo Change-Id: If3e4cf372fc6ed076f0d42c435a72262494aab68 --- vp8/common/arm/neon/sixtappredict_neon.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vp8/common/arm/neon/sixtappredict_neon.c b/vp8/common/arm/neon/sixtappredict_neon.c index b15cfb4112..a7cf43b5f9 100644 --- a/vp8/common/arm/neon/sixtappredict_neon.c +++ b/vp8/common/arm/neon/sixtappredict_neon.c @@ -16,7 +16,7 @@ #include "vpx_ports/mem.h" static const int8_t vp8_sub_pel_filters[8][8] = { - { 0, 0, 128, 0, 0, 0, 0, 0 }, /* note that 1/8 pel positionyys are */ + { 0, 0, 128, 0, 0, 0, 0, 0 }, /* note that 1/8 pel positions are */ { 0, -6, 123, 12, -1, 0, 0, 0 }, /* just as per alpha -0.5 bicubic */ { 2, -11, 108, 36, -8, 1, 0, 0 }, /* New 1/4 pel 6 tap filter */ { 0, -9, 93, 50, -6, 0, 0, 0 }, From 44250287fb3d0c51118478a447fa032cab6e3700 Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 20 Mar 2023 16:58:28 -0700 Subject: [PATCH 0690/1000] sixtappredict_neon.c: remove redundant returns Change-Id: I650b305c2599fc32353daba030e6241d330796a7 --- vp8/common/arm/neon/sixtappredict_neon.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/vp8/common/arm/neon/sixtappredict_neon.c b/vp8/common/arm/neon/sixtappredict_neon.c index a7cf43b5f9..ee3c281f0f 100644 --- a/vp8/common/arm/neon/sixtappredict_neon.c +++ b/vp8/common/arm/neon/sixtappredict_neon.c @@ -781,7 +781,6 @@ void vp8_sixtap_predict8x4_neon(unsigned char *src_ptr, int src_pixels_per_line, vst1_u8(dst_ptr, d8u8); dst_ptr += dst_pitch; vst1_u8(dst_ptr, d9u8); - return; } void vp8_sixtap_predict8x8_neon(unsigned char *src_ptr, int src_pixels_per_line, @@ -1250,7 +1249,6 @@ void vp8_sixtap_predict8x8_neon(unsigned char *src_ptr, int src_pixels_per_line, vst1_u8(dst_ptr, d9u8); dst_ptr += dst_pitch; } - return; } void vp8_sixtap_predict16x16_neon(unsigned char *src_ptr, @@ -1728,5 +1726,4 @@ void vp8_sixtap_predict16x16_neon(unsigned char *src_ptr, dst += dst_pitch; } } - return; } From 1c37aefcbd0aebc1f2043b3e9d2c9fd61e6275ef Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 20 Mar 2023 17:09:42 -0700 Subject: [PATCH 0691/1000] svc_encodeframe.c: fix -Wstringop-truncation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit use sizeof(buf) - 1 with strncpy. fixes: examples/svc_encodeframe.c:282:3: warning: ‘strncpy’ specified bound 1024 equals destination size [-Wstringop-truncation] 282 | strncpy(si->options, options, sizeof(si->options)); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Change-Id: I46980872f9865ae1dc2b56330c3a65d8bc6cf1f7 --- examples/svc_encodeframe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/svc_encodeframe.c b/examples/svc_encodeframe.c index 003096e701..c2b3ec9798 100644 --- a/examples/svc_encodeframe.c +++ b/examples/svc_encodeframe.c @@ -279,7 +279,7 @@ vpx_codec_err_t vpx_svc_set_options(SvcContext *svc_ctx, const char *options) { if (svc_ctx == NULL || options == NULL || si == NULL) { return VPX_CODEC_INVALID_PARAM; } - strncpy(si->options, options, sizeof(si->options)); + strncpy(si->options, options, sizeof(si->options) - 1); si->options[sizeof(si->options) - 1] = '\0'; return VPX_CODEC_OK; } From 3b6909977c99580b565b4150823b3f9c17c39bc0 Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 20 Mar 2023 17:28:11 -0700 Subject: [PATCH 0692/1000] test.mk: use CONFIG_VP(8|9)_ENCODER for vp8/vp9-only tests fixes some uninstantiated test failures when configured with --disable-vp8 or --disable-vp9 Change-Id: If9a6705bd070edee02306e89da103ed474688ec8 --- test/test.mk | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/test.mk b/test/test.mk index 3c225bc750..bbcdd0c6e4 100644 --- a/test/test.mk +++ b/test/test.mk @@ -22,10 +22,6 @@ LIBVPX_TEST_SRCS-yes += ../md5_utils.h ../md5_utils.c LIBVPX_TEST_SRCS-$(CONFIG_DECODERS) += ivf_video_source.h LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += ../y4minput.h ../y4minput.c LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += altref_test.cc -LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += aq_segment_test.cc -LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += alt_ref_aq_segment_test.cc -LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += vp8_datarate_test.cc -LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += vp9_datarate_test.cc LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += encode_api_test.cc LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += error_resilience_test.cc LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += i420_video_source.h @@ -37,6 +33,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += yuv_video_source.h LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += config_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += cq_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += keyframe_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += vp8_datarate_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += byte_alignment_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += decode_svc_test.cc @@ -44,6 +41,8 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += external_frame_buffer_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += user_priv_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += active_map_refresh_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += active_map_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += alt_ref_aq_segment_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += aq_segment_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += borders_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += cpu_speed_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += frame_size_tests.cc @@ -58,6 +57,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += svc_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += svc_test.h LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += svc_end_to_end_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += timestamp_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_datarate_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_ext_ratectrl_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += ../vp9/simple_encode.h From 5c7867beacb35f8f937ad03f8ca5e2f1ae9c7a6a Mon Sep 17 00:00:00 2001 From: Salome Thirot Date: Fri, 10 Mar 2023 16:30:36 +0000 Subject: [PATCH 0693/1000] Add Neon implementations of vpx_highbd_avg_x_c Add Neon implementation of vpx_highbd_avg_4x4_c and vpx_highbd_avg_8x8_c as well as the corresponding tests. Change-Id: Ib1b06af5206774347690c9c56e194b76aa409c91 --- test/avg_test.cc | 7 +++++++ vpx_dsp/arm/highbd_avg_neon.c | 24 ++++++++++++++++++++++++ vpx_dsp/arm/mem_neon.h | 21 +++++++++++++++++++++ vpx_dsp/vpx_dsp_rtcd_defs.pl | 4 ++-- 4 files changed, 54 insertions(+), 2 deletions(-) diff --git a/test/avg_test.cc b/test/avg_test.cc index dd84403324..a0428304a2 100644 --- a/test/avg_test.cc +++ b/test/avg_test.cc @@ -582,6 +582,13 @@ INSTANTIATE_TEST_SUITE_P( make_tuple(16, 16, 1, 4, &vpx_highbd_avg_4x4_sse2))); #endif // HAVE_SSE2 +#if HAVE_NEON +INSTANTIATE_TEST_SUITE_P( + NEON, AverageTestHBD, + ::testing::Values(make_tuple(16, 16, 1, 8, &vpx_highbd_avg_8x8_neon), + make_tuple(16, 16, 1, 4, &vpx_highbd_avg_4x4_neon))); +#endif // HAVE_NEON + INSTANTIATE_TEST_SUITE_P(C, SatdHighbdTest, ::testing::Values(make_tuple(16, &vpx_satd_c), make_tuple(64, &vpx_satd_c), diff --git a/vpx_dsp/arm/highbd_avg_neon.c b/vpx_dsp/arm/highbd_avg_neon.c index b84a7875d4..fc10197d71 100644 --- a/vpx_dsp/arm/highbd_avg_neon.c +++ b/vpx_dsp/arm/highbd_avg_neon.c @@ -16,6 +16,30 @@ #include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/sum_neon.h" +uint32_t vpx_highbd_avg_4x4_neon(const uint8_t *a, int a_stride) { + const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a); + const uint16x8_t a0 = load_unaligned_u16q(a_ptr + 0 * a_stride, a_stride); + const uint16x8_t a1 = load_unaligned_u16q(a_ptr + 2 * a_stride, a_stride); + return (horizontal_add_uint16x8(vaddq_u16(a0, a1)) + (1 << 3)) >> 4; +} + +uint32_t vpx_highbd_avg_8x8_neon(const uint8_t *a, int a_stride) { + const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a); + uint16x8_t sum, a0, a1, a2, a3, a4, a5, a6, a7; + + load_u16_8x8(a_ptr, a_stride, &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7); + + sum = vaddq_u16(a0, a1); + sum = vaddq_u16(sum, a2); + sum = vaddq_u16(sum, a3); + sum = vaddq_u16(sum, a4); + sum = vaddq_u16(sum, a5); + sum = vaddq_u16(sum, a6); + sum = vaddq_u16(sum, a7); + + return (horizontal_add_uint16x8(sum) + (1 << 5)) >> 6; +} + // coeff: 32 bits, dynamic range [-2147483648, 2147483647]. // length: value range {16, 64, 256, 1024}. // satd: 42 bits, dynamic range [-2147483648 * 1024, 2147483647 * 1024] diff --git a/vpx_dsp/arm/mem_neon.h b/vpx_dsp/arm/mem_neon.h index fa14f80b23..1a20da70ef 100644 --- a/vpx_dsp/arm/mem_neon.h +++ b/vpx_dsp/arm/mem_neon.h @@ -419,4 +419,25 @@ static INLINE void store_u8_16x8(uint8_t *s, const ptrdiff_t p, vst1q_u8(s, s7); } +static INLINE void load_u16_8x8(const uint16_t *s, const ptrdiff_t p, + uint16x8_t *s0, uint16x8_t *s1, uint16x8_t *s2, + uint16x8_t *s3, uint16x8_t *s4, uint16x8_t *s5, + uint16x8_t *s6, uint16x8_t *s7) { + *s0 = vld1q_u16(s); + s += p; + *s1 = vld1q_u16(s); + s += p; + *s2 = vld1q_u16(s); + s += p; + *s3 = vld1q_u16(s); + s += p; + *s4 = vld1q_u16(s); + s += p; + *s5 = vld1q_u16(s); + s += p; + *s6 = vld1q_u16(s); + s += p; + *s7 = vld1q_u16(s); +} + #endif // VPX_VPX_DSP_ARM_MEM_NEON_H_ diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 7cd3a0be89..6637186f81 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -995,10 +995,10 @@ () # Avg # add_proto qw/unsigned int vpx_highbd_avg_8x8/, "const uint8_t *s8, int p"; - specialize qw/vpx_highbd_avg_8x8 sse2/; + specialize qw/vpx_highbd_avg_8x8 sse2 neon/; add_proto qw/unsigned int vpx_highbd_avg_4x4/, "const uint8_t *s8, int p"; - specialize qw/vpx_highbd_avg_4x4 sse2/; + specialize qw/vpx_highbd_avg_4x4 sse2 neon/; add_proto qw/void vpx_highbd_minmax_8x8/, "const uint8_t *s8, int p, const uint8_t *d8, int dp, int *min, int *max"; specialize qw/vpx_highbd_minmax_8x8 neon/; From 78bb8e1c0a9b386b983c4e7cadf2ffb7b3b52bd5 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Wed, 22 Mar 2023 20:18:39 +0000 Subject: [PATCH 0694/1000] Revert "Add codec control to get tpl stats" This reverts commit 9c15fb62b3dfe1c698dc28f9efedb022b0ef8eb8. Reason for revert: vpxenc should only use public interface Original change's description: > Add codec control to get tpl stats > > Add command line flag to vpxenc to export tpl stats > > Bug: b/273736974 > Change-Id: I6980096531b0c12fbf7a307fdef4c562d0c29e32 Bug: b/273736974 Change-Id: Ifa8951bb34e5936bbfc33086b22e9fc36d379bc9 --- vp9/vp9_cx_iface.c | 23 ----------------------- vpx/vp8cx.h | 9 --------- vpxenc.c | 20 ++------------------ vpxenc.h | 1 - 4 files changed, 2 insertions(+), 51 deletions(-) diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index ec2105b24b..4c7eaed725 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -1788,28 +1788,6 @@ static vpx_codec_err_t ctrl_get_svc_ref_frame_config(vpx_codec_alg_priv_t *ctx, return VPX_CODEC_OK; } -static vpx_codec_err_t ctrl_get_tpl_stats(vpx_codec_alg_priv_t *ctx, - va_list args) { - VP9_COMP *const cpi = ctx->cpi; - VP9_COMMON *const cm = &cpi->common; - TplDepFrame **data = va_arg(args, TplDepFrame **); - int i; - *data = vpx_calloc(MAX_ARF_GOP_SIZE, sizeof(TplDepFrame)); - for (i = 0; i < MAX_ARF_GOP_SIZE; i++) { - const int mi_cols = mi_cols_aligned_to_sb(cm->mi_cols); - const int mi_rows = mi_cols_aligned_to_sb(cm->mi_rows); - const int copy_size = mi_cols * mi_rows * sizeof(*(*data)[i].tpl_stats_ptr); - (*data)[i] = cpi->tpl_stats[i]; - (*data)[i].tpl_stats_ptr = NULL; - (*data)[i].tpl_stats_ptr = - vpx_calloc(mi_rows * mi_cols, sizeof(*(*data)[i].tpl_stats_ptr)); - memcpy((*data)[i].tpl_stats_ptr, cpi->tpl_stats[i].tpl_stats_ptr, - copy_size); - } - - return VPX_CODEC_OK; -} - static vpx_codec_err_t ctrl_set_svc_ref_frame_config(vpx_codec_alg_priv_t *ctx, va_list args) { VP9_COMP *const cpi = ctx->cpi; @@ -2057,7 +2035,6 @@ static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = { { VP9E_GET_ACTIVEMAP, ctrl_get_active_map }, { VP9E_GET_LEVEL, ctrl_get_level }, { VP9E_GET_SVC_REF_FRAME_CONFIG, ctrl_get_svc_ref_frame_config }, - { VP9E_GET_TPL_STATS, ctrl_get_tpl_stats }, { -1, NULL }, }; diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h index 01c0558673..e0b679fbb7 100644 --- a/vpx/vp8cx.h +++ b/vpx/vp8cx.h @@ -767,13 +767,6 @@ enum vp8e_enc_control_id { * */ VP9E_SET_QUANTIZER_ONE_PASS, - - /*!\brief Codec control to get TPL stats for the current frame. - * - * Supported in codecs: VP9 - * - */ - VP9E_GET_TPL_STATS, }; /*!\brief vpx 1-D scaling mode @@ -1104,8 +1097,6 @@ VPX_CTRL_USE_TYPE(VP8E_SET_RTC_EXTERNAL_RATECTRL, int) #define VPX_CTRL_VP8E_SET_RTC_EXTERNAL_RATECTRL VPX_CTRL_USE_TYPE(VP9E_SET_QUANTIZER_ONE_PASS, int) #define VPX_CTRL_VP9E_SET_QUANTIZER_ONE_PASS -VPX_CTRL_USE_TYPE(VP9E_GET_TPL_STATS, void *) -#define VPX_CTRL_VP9E_GET_TPL_STATS /*!\endcond */ /*! @} - end defgroup vp8_encoder */ diff --git a/vpxenc.c b/vpxenc.c index 9d57708f37..61672acadd 100644 --- a/vpxenc.c +++ b/vpxenc.c @@ -39,10 +39,6 @@ #include "vpx/vp8dx.h" #endif -#if CONFIG_VP9_ENCODER -#include "vp9/encoder/vp9_encoder.h" -#endif - #include "vpx/vpx_integer.h" #include "vpx_ports/mem_ops.h" #include "vpx_ports/vpx_timer.h" @@ -165,8 +161,6 @@ static const arg_def_t disable_warnings = static const arg_def_t disable_warning_prompt = ARG_DEF("y", "disable-warning-prompt", 0, "Display warnings, but do not prompt user to continue."); -static const arg_def_t export_tpl_stats = - ARG_DEF(NULL, "export-tpl-stats", 0, "Export TPL stats of vp9 encoder"); #if CONFIG_VP9_HIGHBITDEPTH static const arg_def_t test16bitinternalarg = ARG_DEF( @@ -197,7 +191,6 @@ static const arg_def_t *main_args[] = { &help, &disable_warnings, &disable_warning_prompt, &recontest, - &export_tpl_stats, NULL }; static const arg_def_t usage = @@ -538,7 +531,9 @@ static const arg_def_t disable_loopfilter = "1: Loopfilter off for non reference frames\n" " " "2: Loopfilter off for all frames"); +#endif +#if CONFIG_VP9_ENCODER static const arg_def_t *vp9_args[] = { &cpu_used_vp9, &auto_altref_vp9, &sharpness, @@ -809,8 +804,6 @@ static void parse_global_config(struct VpxEncoderConfig *global, char **argv) { global->disable_warnings = 1; else if (arg_match(&arg, &disable_warning_prompt, argi)) global->disable_warning_prompt = 1; - else if (arg_match(&arg, &export_tpl_stats, argi)) - global->export_tpl_stats = 1; else argj++; } @@ -1989,15 +1982,6 @@ int main(int argc, const char **argv_) { if (got_data && global.test_decode != TEST_DECODE_OFF) FOREACH_STREAM(test_decode(stream, global.test_decode, global.codec)); - -#if CONFIG_VP9_ENCODER - if (got_data && global.export_tpl_stats) { - TplDepFrame *tpl_stats = NULL; - FOREACH_STREAM(vpx_codec_control(&stream->encoder, VP9E_GET_TPL_STATS, - &tpl_stats)); - vpx_free(tpl_stats); - } -#endif } fflush(stdout); diff --git a/vpxenc.h b/vpxenc.h index f065f086db..be54840f7d 100644 --- a/vpxenc.h +++ b/vpxenc.h @@ -56,7 +56,6 @@ struct VpxEncoderConfig { int disable_warnings; int disable_warning_prompt; int experimental_bitstream; - int export_tpl_stats; }; #ifdef __cplusplus From 5817bce969f2845493f22f85ca5f19c70adc1c2f Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Mon, 20 Mar 2023 16:05:11 -0700 Subject: [PATCH 0695/1000] Fix comment typos (likely copy-and-paste errors) Fix comment typos for vpx_codec_destroy() and vpx_codec_enc_init_ver(). Based on the change made in libaom: https://aomedia.googlesource.com/aom/+/365a968684 365a968684 Fix comment typos (likely copy-and-paste errors) Change-Id: I39edae835ed0752b569e8e7328d0709c59724ac2 --- vpx/vpx_codec.h | 8 +++++--- vpx/vpx_decoder.h | 2 +- vpx/vpx_encoder.h | 2 +- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/vpx/vpx_codec.h b/vpx/vpx_codec.h index b0a931e019..11bf8aaa22 100644 --- a/vpx/vpx_codec.h +++ b/vpx/vpx_codec.h @@ -345,9 +345,11 @@ const char *vpx_codec_error_detail(vpx_codec_ctx_t *ctx); * \param[in] ctx Pointer to this instance's context * * \retval #VPX_CODEC_OK - * The codec algorithm initialized. - * \retval #VPX_CODEC_MEM_ERROR - * Memory allocation failed. + * The codec instance has been destroyed. + * \retval #VPX_CODEC_INVALID_PARAM + * ctx is a null pointer. + * \retval #VPX_CODEC_ERROR + * Codec context not initialized. */ vpx_codec_err_t vpx_codec_destroy(vpx_codec_ctx_t *ctx); diff --git a/vpx/vpx_decoder.h b/vpx/vpx_decoder.h index 39e5f585f6..99dd8cf694 100644 --- a/vpx/vpx_decoder.h +++ b/vpx/vpx_decoder.h @@ -127,7 +127,7 @@ typedef struct vpx_codec_dec_cfg { * \param[in] ver ABI version number. Must be set to * VPX_DECODER_ABI_VERSION * \retval #VPX_CODEC_OK - * The decoder algorithm initialized. + * The decoder algorithm has been initialized. * \retval #VPX_CODEC_MEM_ERROR * Memory allocation failed. */ diff --git a/vpx/vpx_encoder.h b/vpx/vpx_encoder.h index efaf5ef366..a0d2c87558 100644 --- a/vpx/vpx_encoder.h +++ b/vpx/vpx_encoder.h @@ -906,7 +906,7 @@ vpx_codec_err_t vpx_codec_enc_init_ver(vpx_codec_ctx_t *ctx, * \param[in] ver ABI version number. Must be set to * VPX_ENCODER_ABI_VERSION * \retval #VPX_CODEC_OK - * The decoder algorithm initialized. + * The encoder algorithm has been initialized. * \retval #VPX_CODEC_MEM_ERROR * Memory allocation failed. */ From cda56fa0199a77929f72edd78ad8d4e0ca4968e7 Mon Sep 17 00:00:00 2001 From: James Zern Date: Thu, 23 Mar 2023 19:02:12 -0700 Subject: [PATCH 0696/1000] update libwebm to libwebm-1.0.0.29-9-g1930e3c changelog: https://chromium.googlesource.com/webm/libwebm/+log/ee0bab576..1930e3ca2 Bug: webm:1792 Change-Id: I5c5c30c767d357528f102ff38957655e2ec0c645 --- third_party/libwebm/AUTHORS.TXT | 1 + third_party/libwebm/Android.mk | 3 +++ third_party/libwebm/README.libvpx | 5 +++-- third_party/libwebm/mkvmuxer/mkvmuxer.cc | 4 ++-- third_party/libwebm/mkvmuxer/mkvmuxer.h | 8 ++++---- third_party/libwebm/mkvmuxer/mkvmuxerutil.cc | 2 +- third_party/libwebm/mkvparser/mkvparser.cc | 6 +++--- 7 files changed, 17 insertions(+), 12 deletions(-) diff --git a/third_party/libwebm/AUTHORS.TXT b/third_party/libwebm/AUTHORS.TXT index 9686ac13eb..59b648ca68 100644 --- a/third_party/libwebm/AUTHORS.TXT +++ b/third_party/libwebm/AUTHORS.TXT @@ -2,3 +2,4 @@ # Name or Organization Google Inc. +Elijah Cirioli diff --git a/third_party/libwebm/Android.mk b/third_party/libwebm/Android.mk index 23f935f2db..b02795ccae 100644 --- a/third_party/libwebm/Android.mk +++ b/third_party/libwebm/Android.mk @@ -1,3 +1,5 @@ +# Ignore this file during non-NDK builds. +ifdef NDK_ROOT LOCAL_PATH:= $(call my-dir) include $(CLEAR_VARS) @@ -18,3 +20,4 @@ LOCAL_LICENSE_KINDS := SPDX-license-identifier-BSD LOCAL_LICENSE_CONDITIONS := notice LOCAL_NOTICE_FILE := $(LOCAL_PATH)/LICENSE.TXT $(LOCAL_PATH)/PATENTS.TXT include $(BUILD_STATIC_LIBRARY) +endif # NDK_ROOT diff --git a/third_party/libwebm/README.libvpx b/third_party/libwebm/README.libvpx index 325604cc66..a79b982ef4 100644 --- a/third_party/libwebm/README.libvpx +++ b/third_party/libwebm/README.libvpx @@ -1,7 +1,7 @@ URL: https://chromium.googlesource.com/webm/libwebm -Version: ee0bab576c338c9807249b99588e352b7268cb62 +Version: 1930e3ca23b007f3ff11d98a570077be6201957e License: BSD -License File: LICENSE.txt +License File: LICENSE.TXT Description: libwebm is used to handle WebM container I/O. @@ -18,3 +18,4 @@ Only keep: - mkvmuxer/ - mkvparser/ - PATENTS.TXT +- use -std=gnu++11 in Android.mk (https://crbug.com/webm/1708) diff --git a/third_party/libwebm/mkvmuxer/mkvmuxer.cc b/third_party/libwebm/mkvmuxer/mkvmuxer.cc index ae36531439..faaf0165f4 100644 --- a/third_party/libwebm/mkvmuxer/mkvmuxer.cc +++ b/third_party/libwebm/mkvmuxer/mkvmuxer.cc @@ -607,10 +607,10 @@ bool ContentEncoding::Write(IMkvWriter* writer) const { return true; } -uint64_t ContentEncoding::EncodingSize(uint64_t compresion_size, +uint64_t ContentEncoding::EncodingSize(uint64_t compression_size, uint64_t encryption_size) const { // TODO(fgalligan): Add support for compression settings. - if (compresion_size != 0) + if (compression_size != 0) return 0; uint64_t encoding_size = 0; diff --git a/third_party/libwebm/mkvmuxer/mkvmuxer.h b/third_party/libwebm/mkvmuxer/mkvmuxer.h index f2db377145..8602d82325 100644 --- a/third_party/libwebm/mkvmuxer/mkvmuxer.h +++ b/third_party/libwebm/mkvmuxer/mkvmuxer.h @@ -330,7 +330,7 @@ class ContentEncoding { private: // Returns the size in bytes for the encoding elements. - uint64_t EncodingSize(uint64_t compresion_size, + uint64_t EncodingSize(uint64_t compression_size, uint64_t encryption_size) const; // Returns the size in bytes for the encryption elements. @@ -1425,7 +1425,7 @@ class SeekHead { bool Write(IMkvWriter* writer); // We are going to put a cap on the number of Seek Entries. - const static int32_t kSeekEntryCount = 5; + constexpr static int32_t kSeekEntryCount = 5; private: // Returns the maximum size in bytes of one seek entry. @@ -1505,8 +1505,8 @@ class Segment { kBeforeClusters = 0x1 // Position Cues before Clusters }; - static const uint32_t kDefaultDocTypeVersion = 4; - static const uint64_t kDefaultMaxClusterDuration = 30000000000ULL; + static constexpr uint32_t kDefaultDocTypeVersion = 4; + static constexpr uint64_t kDefaultMaxClusterDuration = 30000000000ULL; Segment(); ~Segment(); diff --git a/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc b/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc index bd2f769138..300b155797 100644 --- a/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc +++ b/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc @@ -607,7 +607,7 @@ uint64 WriteVoidElement(IMkvWriter* writer, uint64 size) { void GetVersion(int32* major, int32* minor, int32* build, int32* revision) { *major = 0; *minor = 3; - *build = 0; + *build = 1; *revision = 0; } diff --git a/third_party/libwebm/mkvparser/mkvparser.cc b/third_party/libwebm/mkvparser/mkvparser.cc index de8884b381..868afcb3ed 100644 --- a/third_party/libwebm/mkvparser/mkvparser.cc +++ b/third_party/libwebm/mkvparser/mkvparser.cc @@ -55,7 +55,7 @@ Type* SafeArrayAlloc(unsigned long long num_elements, void GetVersion(int& major, int& minor, int& build, int& revision) { major = 1; minor = 1; - build = 0; + build = 1; revision = 0; } @@ -298,7 +298,7 @@ long UnserializeInt(IMkvReader* pReader, long long pos, long long size, if (status < 0) return status; - unsigned long long result = first_byte; + unsigned long long result = static_cast(first_byte); ++pos; for (long i = 1; i < size; ++i) { @@ -2432,7 +2432,7 @@ bool CuePoint::TrackPosition::Parse(IMkvReader* pReader, long long start_, pos += size; // consume payload } - if ((m_pos < 0) || (m_track <= 0)) { + if ((m_pos < 0) || (m_track <= 0) || (m_block < 0) || (m_block > LONG_MAX)) { return false; } From 1701d55e33fe227cee4442d898c3ef6ae6a8206a Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 22 Feb 2023 13:53:49 -0800 Subject: [PATCH 0697/1000] vp9_encodeframe.c: clear -Wshadow warnings Bug: webm:1793 Change-Id: I77c7abae7bbb1e1f4972cd31e3a67d62477b896e --- vp9/encoder/vp9_encodeframe.c | 45 ++++++++++++++++------------------- 1 file changed, 21 insertions(+), 24 deletions(-) diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index 1d593cfc01..26e419e3d5 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -349,17 +349,17 @@ typedef struct { int32_t sum_error; int log2_count; int variance; -} var; +} Var; typedef struct { - var none; - var horz[2]; - var vert[2]; + Var none; + Var horz[2]; + Var vert[2]; } partition_variance; typedef struct { partition_variance part_variances; - var split[4]; + Var split[4]; } v4x4; typedef struct { @@ -384,7 +384,7 @@ typedef struct { typedef struct { partition_variance *part_variances; - var *split[4]; + Var *split[4]; } variance_node; typedef enum { @@ -436,13 +436,13 @@ static void tree_to_node(void *data, BLOCK_SIZE bsize, variance_node *node) { } // Set variance values given sum square error, sum error, count. -static void fill_variance(uint32_t s2, int32_t s, int c, var *v) { +static void fill_variance(uint32_t s2, int32_t s, int c, Var *v) { v->sum_square_error = s2; v->sum_error = s; v->log2_count = c; } -static void get_variance(var *v) { +static void get_variance(Var *v) { v->variance = (int)(256 * (v->sum_square_error - (uint32_t)(((int64_t)v->sum_error * v->sum_error) >> @@ -450,7 +450,7 @@ static void get_variance(var *v) { v->log2_count); } -static void sum_2_variances(const var *a, const var *b, var *r) { +static void sum_2_variances(const Var *a, const Var *b, Var *r) { assert(a->log2_count == b->log2_count); fill_variance(a->sum_square_error + b->sum_square_error, a->sum_error + b->sum_error, a->log2_count + 1, r); @@ -1863,8 +1863,8 @@ static void update_state(VP9_COMP *cpi, ThreadData *td, PICK_MODE_CONTEXT *ctx, vp9_update_mv_count(td); if (cm->interp_filter == SWITCHABLE) { - const int ctx = get_pred_context_switchable_interp(xd); - ++td->counts->switchable_interp[ctx][xdmi->interp_filter]; + const int ctx_interp = get_pred_context_switchable_interp(xd); + ++td->counts->switchable_interp[ctx_interp][xdmi->interp_filter]; } } @@ -2748,10 +2748,10 @@ static void rd_use_partition(VP9_COMP *cpi, ThreadData *td, if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 && mi_row + (mi_step >> 1) < cm->mi_rows) { RD_COST tmp_rdc; - PICK_MODE_CONTEXT *ctx = &pc_tree->horizontal[0]; + PICK_MODE_CONTEXT *hctx = &pc_tree->horizontal[0]; vp9_rd_cost_init(&tmp_rdc); - update_state(cpi, td, ctx, mi_row, mi_col, subsize, 0); - encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, ctx); + update_state(cpi, td, hctx, mi_row, mi_col, subsize, 0); + encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, hctx); pc_tree->horizontal[1].skip_ref_frame_mask = 0; rd_pick_sb_modes(cpi, tile_data, x, mi_row + (mi_step >> 1), mi_col, &tmp_rdc, subsize, &pc_tree->horizontal[1], INT_MAX, @@ -2772,10 +2772,10 @@ static void rd_use_partition(VP9_COMP *cpi, ThreadData *td, if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 && mi_col + (mi_step >> 1) < cm->mi_cols) { RD_COST tmp_rdc; - PICK_MODE_CONTEXT *ctx = &pc_tree->vertical[0]; + PICK_MODE_CONTEXT *vctx = &pc_tree->vertical[0]; vp9_rd_cost_init(&tmp_rdc); - update_state(cpi, td, ctx, mi_row, mi_col, subsize, 0); - encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, ctx); + update_state(cpi, td, vctx, mi_row, mi_col, subsize, 0); + encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, vctx); pc_tree->vertical[bsize > BLOCK_8X8].skip_ref_frame_mask = 0; rd_pick_sb_modes( cpi, tile_data, x, mi_row, mi_col + (mi_step >> 1), &tmp_rdc, @@ -2847,8 +2847,6 @@ static void rd_use_partition(VP9_COMP *cpi, ThreadData *td, int x_idx = (i & 1) * (mi_step >> 1); int y_idx = (i >> 1) * (mi_step >> 1); RD_COST tmp_rdc; - ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE]; - PARTITION_CONTEXT sl[8], sa[8]; if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols)) continue; @@ -3479,8 +3477,8 @@ static void ml_predict_var_rd_paritioning(const VP9_COMP *const cpi, int mi_col, int *none, int *split) { const VP9_COMMON *const cm = &cpi->common; const NN_CONFIG *nn_config = NULL; + const MACROBLOCKD *const xd = &x->e_mbd; #if CONFIG_VP9_HIGHBITDEPTH - MACROBLOCKD *xd = &x->e_mbd; DECLARE_ALIGNED(16, uint8_t, pred_buffer[64 * 64 * 2]); uint8_t *const pred_buf = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? (CONVERT_TO_BYTEPTR(pred_buffer)) @@ -3563,7 +3561,6 @@ static void ml_predict_var_rd_paritioning(const VP9_COMP *const cpi, const unsigned int var = cpi->fn_ptr[bsize].vf(src, src_stride, pred, pred_stride, &sse); const float factor = (var == 0) ? 1.0f : (1.0f / (float)var); - const MACROBLOCKD *const xd = &x->e_mbd; const int has_above = !!xd->above_mi; const int has_left = !!xd->left_mi; const BLOCK_SIZE above_bsize = has_above ? xd->above_mi->sb_type : bsize; @@ -4348,9 +4345,9 @@ static int rd_pick_partition(VP9_COMP *cpi, ThreadData *td, if (sum_rdc.rdcost < best_rdc.rdcost && mi_row + mi_step < cm->mi_rows && bsize > BLOCK_8X8) { - PICK_MODE_CONTEXT *ctx = &pc_tree->horizontal[0]; - update_state(cpi, td, ctx, mi_row, mi_col, subsize, 0); - encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, ctx); + PICK_MODE_CONTEXT *hctx = &pc_tree->horizontal[0]; + update_state(cpi, td, hctx, mi_row, mi_col, subsize, 0); + encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, hctx); if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 && partition_none_allowed) pc_tree->horizontal[1].pred_interp_filter = pred_interp_filter; From bad39ce7a3b766ea44fcd1637610986a4a672999 Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 22 Feb 2023 15:16:43 -0800 Subject: [PATCH 0698/1000] vp9_scan.h: rename scan_order struct to ScanOrder This matches the style guide and fixes some -Wshadow warnings related to variables with the same name. Something similar was done in libaom in: 03f6fdcfca Fix warnings reported by -Wshadow: Part1b: scan_order struct and variable Bug: webm:1793 Change-Id: Ide5127886b7fd7778e6d8a983bfba6edda21ff28 --- test/vp9_quantize_test.cc | 12 ++++++------ vp9/common/vp9_scan.c | 4 ++-- vp9/common/vp9_scan.h | 12 ++++++------ vp9/decoder/vp9_decodeframe.c | 16 ++++++++-------- vp9/decoder/vp9_detokenize.c | 5 ++--- vp9/decoder/vp9_detokenize.h | 5 ++--- vp9/encoder/vp9_encodemb.c | 8 ++++---- vp9/encoder/vp9_pickmode.c | 2 +- vp9/encoder/vp9_rdopt.c | 12 ++++++------ vp9/encoder/vp9_tokenize.c | 2 +- vp9/encoder/vp9_tpl_model.c | 2 +- vpx_dsp/arm/highbd_quantize_neon.c | 2 +- vpx_dsp/arm/quantize_neon.c | 2 +- vpx_dsp/quantize.c | 4 ++-- vpx_dsp/vpx_dsp_rtcd_defs.pl | 6 +++--- vpx_dsp/x86/highbd_quantize_intrin_avx2.c | 2 +- vpx_dsp/x86/highbd_quantize_intrin_sse2.c | 2 +- vpx_dsp/x86/quantize_avx.c | 2 +- vpx_dsp/x86/quantize_avx2.c | 2 +- vpx_dsp/x86/quantize_ssse3.c | 2 +- 20 files changed, 51 insertions(+), 53 deletions(-) diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc index e9b17d5eb8..84a5a58e4e 100644 --- a/test/vp9_quantize_test.cc +++ b/test/vp9_quantize_test.cc @@ -42,7 +42,7 @@ typedef void (*QuantizeFunc)(const tran_low_t *coeff, intptr_t count, const macroblock_plane *const mb_plane, tran_low_t *qcoeff, tran_low_t *dqcoeff, const int16_t *dequant, uint16_t *eob, - const struct scan_order *const scan_order); + const struct ScanOrder *const scan_order); typedef std::tuple QuantizeParam; @@ -60,7 +60,7 @@ template void QuantWrapper(const tran_low_t *coeff, intptr_t count, const macroblock_plane *const mb_plane, tran_low_t *qcoeff, tran_low_t *dqcoeff, const int16_t *dequant, uint16_t *eob, - const struct scan_order *const scan_order) { + const struct ScanOrder *const scan_order) { fn(coeff, count, mb_plane->zbin, mb_plane->round, mb_plane->quant, mb_plane->quant_shift, qcoeff, dqcoeff, dequant, eob, scan_order->scan, scan_order->iscan); @@ -71,14 +71,14 @@ typedef void (*Quantize32x32Func)(const tran_low_t *coeff, const macroblock_plane *const mb_plane, tran_low_t *qcoeff, tran_low_t *dqcoeff, const int16_t *dequant, uint16_t *eob, - const struct scan_order *const scan_order); + const struct ScanOrder *const scan_order); template void Quant32x32Wrapper(const tran_low_t *coeff, intptr_t count, const macroblock_plane *const mb_plane, tran_low_t *qcoeff, tran_low_t *dqcoeff, const int16_t *dequant, uint16_t *eob, - const struct scan_order *const scan_order) { + const struct ScanOrder *const scan_order) { (void)count; fn(coeff, mb_plane, qcoeff, dqcoeff, dequant, eob, scan_order); } @@ -94,7 +94,7 @@ template void QuantFPWrapper(const tran_low_t *coeff, intptr_t count, const macroblock_plane *const mb_plane, tran_low_t *qcoeff, tran_low_t *dqcoeff, const int16_t *dequant, uint16_t *eob, - const struct scan_order *const scan_order) { + const struct ScanOrder *const scan_order) { fn(coeff, count, mb_plane->round_fp, mb_plane->quant_fp, qcoeff, dqcoeff, dequant, eob, scan_order->scan, scan_order->iscan); } @@ -213,7 +213,7 @@ class VP9QuantizeBase : public AbstractBench { int16_t *r_ptr_; int16_t *q_ptr_; int count_; - const scan_order *scan_; + const ScanOrder *scan_; uint16_t eob_; }; diff --git a/vp9/common/vp9_scan.c b/vp9/common/vp9_scan.c index 8bea61dea6..adacb7ef96 100644 --- a/vp9/common/vp9_scan.c +++ b/vp9/common/vp9_scan.c @@ -688,14 +688,14 @@ DECLARE_ALIGNED(16, static const int16_t, vp9_default_iscan_32x32[1024]) = { 968, 974, 989, 997, 1003, 1007, 1015, 1019, 1022, 1024, }; -const scan_order vp9_default_scan_orders[TX_SIZES] = { +const ScanOrder vp9_default_scan_orders[TX_SIZES] = { { default_scan_4x4, vp9_default_iscan_4x4, default_scan_4x4_neighbors }, { default_scan_8x8, vp9_default_iscan_8x8, default_scan_8x8_neighbors }, { default_scan_16x16, vp9_default_iscan_16x16, default_scan_16x16_neighbors }, { default_scan_32x32, vp9_default_iscan_32x32, default_scan_32x32_neighbors }, }; -const scan_order vp9_scan_orders[TX_SIZES][TX_TYPES] = { +const ScanOrder vp9_scan_orders[TX_SIZES][TX_TYPES] = { { // TX_4X4 { default_scan_4x4, vp9_default_iscan_4x4, default_scan_4x4_neighbors }, { row_scan_4x4, vp9_row_iscan_4x4, row_scan_4x4_neighbors }, diff --git a/vp9/common/vp9_scan.h b/vp9/common/vp9_scan.h index efa0e23365..3d1dcc66da 100644 --- a/vp9/common/vp9_scan.h +++ b/vp9/common/vp9_scan.h @@ -23,14 +23,14 @@ extern "C" { #define MAX_NEIGHBORS 2 -typedef struct scan_order { +typedef struct ScanOrder { const int16_t *scan; const int16_t *iscan; const int16_t *neighbors; -} scan_order; +} ScanOrder; -extern const scan_order vp9_default_scan_orders[TX_SIZES]; -extern const scan_order vp9_scan_orders[TX_SIZES][TX_TYPES]; +extern const ScanOrder vp9_default_scan_orders[TX_SIZES]; +extern const ScanOrder vp9_scan_orders[TX_SIZES][TX_TYPES]; static INLINE int get_coef_context(const int16_t *neighbors, const uint8_t *token_cache, int c) { @@ -39,8 +39,8 @@ static INLINE int get_coef_context(const int16_t *neighbors, 1; } -static INLINE const scan_order *get_scan(const MACROBLOCKD *xd, TX_SIZE tx_size, - PLANE_TYPE type, int block_idx) { +static INLINE const ScanOrder *get_scan(const MACROBLOCKD *xd, TX_SIZE tx_size, + PLANE_TYPE type, int block_idx) { const MODE_INFO *const mi = xd->mi[0]; if (is_inter_block(mi) || type != PLANE_TYPE_Y || xd->lossless) { diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c index 2a27e6fdb3..6eae41fcfb 100644 --- a/vp9/decoder/vp9_decodeframe.c +++ b/vp9/decoder/vp9_decodeframe.c @@ -323,9 +323,9 @@ static void predict_and_reconstruct_intra_block(TileWorkerData *twd, if (!mi->skip) { const TX_TYPE tx_type = (plane || xd->lossless) ? DCT_DCT : intra_mode_to_tx_type_lookup[mode]; - const scan_order *sc = (plane || xd->lossless) - ? &vp9_default_scan_orders[tx_size] - : &vp9_scan_orders[tx_size][tx_type]; + const ScanOrder *sc = (plane || xd->lossless) + ? &vp9_default_scan_orders[tx_size] + : &vp9_scan_orders[tx_size][tx_type]; const int eob = vp9_decode_block_tokens(twd, plane, sc, col, row, tx_size, mi->segment_id); if (eob > 0) { @@ -348,9 +348,9 @@ static void parse_intra_block_row_mt(TileWorkerData *twd, MODE_INFO *const mi, struct macroblockd_plane *const pd = &xd->plane[plane]; const TX_TYPE tx_type = (plane || xd->lossless) ? DCT_DCT : intra_mode_to_tx_type_lookup[mode]; - const scan_order *sc = (plane || xd->lossless) - ? &vp9_default_scan_orders[tx_size] - : &vp9_scan_orders[tx_size][tx_type]; + const ScanOrder *sc = (plane || xd->lossless) + ? &vp9_default_scan_orders[tx_size] + : &vp9_scan_orders[tx_size][tx_type]; *pd->eob = vp9_decode_block_tokens(twd, plane, sc, col, row, tx_size, mi->segment_id); /* Keep the alignment to 16 */ @@ -393,7 +393,7 @@ static int reconstruct_inter_block(TileWorkerData *twd, MODE_INFO *const mi, int mi_row, int mi_col) { MACROBLOCKD *const xd = &twd->xd; struct macroblockd_plane *const pd = &xd->plane[plane]; - const scan_order *sc = &vp9_default_scan_orders[tx_size]; + const ScanOrder *sc = &vp9_default_scan_orders[tx_size]; const int eob = vp9_decode_block_tokens(twd, plane, sc, col, row, tx_size, mi->segment_id); uint8_t *dst = &pd->dst.buf[4 * row * pd->dst.stride + 4 * col]; @@ -423,7 +423,7 @@ static int parse_inter_block_row_mt(TileWorkerData *twd, MODE_INFO *const mi, TX_SIZE tx_size) { MACROBLOCKD *const xd = &twd->xd; struct macroblockd_plane *const pd = &xd->plane[plane]; - const scan_order *sc = &vp9_default_scan_orders[tx_size]; + const ScanOrder *sc = &vp9_default_scan_orders[tx_size]; const int eob = vp9_decode_block_tokens(twd, plane, sc, col, row, tx_size, mi->segment_id); diff --git a/vp9/decoder/vp9_detokenize.c b/vp9/decoder/vp9_detokenize.c index 3ed1bd6ffa..d957dc34e3 100644 --- a/vp9/decoder/vp9_detokenize.c +++ b/vp9/decoder/vp9_detokenize.c @@ -272,9 +272,8 @@ static void get_ctx_shift(MACROBLOCKD *xd, int *ctx_shift_a, int *ctx_shift_l, } } -int vp9_decode_block_tokens(TileWorkerData *twd, int plane, - const scan_order *sc, int x, int y, TX_SIZE tx_size, - int seg_id) { +int vp9_decode_block_tokens(TileWorkerData *twd, int plane, const ScanOrder *sc, + int x, int y, TX_SIZE tx_size, int seg_id) { vpx_reader *r = &twd->bit_reader; MACROBLOCKD *xd = &twd->xd; struct macroblockd_plane *const pd = &xd->plane[plane]; diff --git a/vp9/decoder/vp9_detokenize.h b/vp9/decoder/vp9_detokenize.h index a32052ffff..a8e47021b8 100644 --- a/vp9/decoder/vp9_detokenize.h +++ b/vp9/decoder/vp9_detokenize.h @@ -19,9 +19,8 @@ extern "C" { #endif -int vp9_decode_block_tokens(TileWorkerData *twd, int plane, - const scan_order *sc, int x, int y, TX_SIZE tx_size, - int seg_id); +int vp9_decode_block_tokens(TileWorkerData *twd, int plane, const ScanOrder *sc, + int x, int y, TX_SIZE tx_size, int seg_id); #ifdef __cplusplus } // extern "C" diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c index a81a753ae7..946a1c3ee8 100644 --- a/vp9/encoder/vp9_encodemb.c +++ b/vp9/encoder/vp9_encodemb.c @@ -79,7 +79,7 @@ int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size, const int shift = (tx_size == TX_32X32); const int16_t *const dequant_ptr = pd->dequant; const uint8_t *const band_translate = get_band_translate(tx_size); - const scan_order *const so = get_scan(xd, tx_size, plane_type, block); + const ScanOrder *const so = get_scan(xd, tx_size, plane_type, block); const int16_t *const scan = so->scan; const int16_t *const nb = so->neighbors; const MODE_INFO *mbmi = xd->mi[0]; @@ -351,7 +351,7 @@ void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block, int row, int col, MACROBLOCKD *const xd = &x->e_mbd; const struct macroblock_plane *const p = &x->plane[plane]; const struct macroblockd_plane *const pd = &xd->plane[plane]; - const scan_order *const scan_order = &vp9_default_scan_orders[tx_size]; + const ScanOrder *const scan_order = &vp9_default_scan_orders[tx_size]; tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block); tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block); tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); @@ -496,7 +496,7 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col, MACROBLOCKD *const xd = &x->e_mbd; const struct macroblock_plane *const p = &x->plane[plane]; const struct macroblockd_plane *const pd = &xd->plane[plane]; - const scan_order *const scan_order = &vp9_default_scan_orders[tx_size]; + const ScanOrder *const scan_order = &vp9_default_scan_orders[tx_size]; tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block); tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block); tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); @@ -816,7 +816,7 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, tran_low_t *coeff = BLOCK_OFFSET(p->coeff, block); tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block); tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); - const scan_order *scan_order; + const ScanOrder *scan_order; TX_TYPE tx_type = DCT_DCT; PREDICTION_MODE mode; const int bwl = b_width_log2_lookup[plane_bsize]; diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c index 579b466ca9..c19d57d15d 100644 --- a/vp9/encoder/vp9_pickmode.c +++ b/vp9/encoder/vp9_pickmode.c @@ -768,7 +768,7 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *this_rdc, for (r = 0; r < max_blocks_high; r += block_step) { for (c = 0; c < num_4x4_w; c += block_step) { if (c < max_blocks_wide) { - const scan_order *const scan_order = &vp9_default_scan_orders[tx_size]; + const ScanOrder *const scan_order = &vp9_default_scan_orders[tx_size]; tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block); tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block); tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index 7557b536ce..76d545cd96 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -77,7 +77,7 @@ struct rdcost_block_args { int64_t best_rd; int exit_early; int use_fast_coef_costing; - const scan_order *so; + const ScanOrder *so; uint8_t skippable; struct buf_2d *this_recon; }; @@ -1129,7 +1129,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row, vpx_highbd_subtract_block(4, 4, src_diff, 8, src, src_stride, dst, dst_stride, xd->bd); if (xd->lossless) { - const scan_order *so = &vp9_default_scan_orders[TX_4X4]; + const ScanOrder *so = &vp9_default_scan_orders[TX_4X4]; const int coeff_ctx = combine_entropy_contexts(tempa[idx], templ[idy]); vp9_highbd_fwht4x4(src_diff, coeff, 8); @@ -1146,7 +1146,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row, } else { int64_t unused; const TX_TYPE tx_type = get_tx_type_4x4(PLANE_TYPE_Y, xd, block); - const scan_order *so = &vp9_scan_orders[TX_4X4][tx_type]; + const ScanOrder *so = &vp9_scan_orders[TX_4X4][tx_type]; const int coeff_ctx = combine_entropy_contexts(tempa[idx], templ[idy]); if (tx_type == DCT_DCT) @@ -1236,7 +1236,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row, vpx_subtract_block(4, 4, src_diff, 8, src, src_stride, dst, dst_stride); if (xd->lossless) { - const scan_order *so = &vp9_default_scan_orders[TX_4X4]; + const ScanOrder *so = &vp9_default_scan_orders[TX_4X4]; const int coeff_ctx = combine_entropy_contexts(tempa[idx], templ[idy]); vp9_fwht4x4(src_diff, coeff, 8); @@ -1253,7 +1253,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row, } else { int64_t unused; const TX_TYPE tx_type = get_tx_type_4x4(PLANE_TYPE_Y, xd, block); - const scan_order *so = &vp9_scan_orders[TX_4X4][tx_type]; + const ScanOrder *so = &vp9_scan_orders[TX_4X4][tx_type]; const int coeff_ctx = combine_entropy_contexts(tempa[idx], templ[idy]); vp9_fht4x4(src_diff, coeff, 8, tx_type); @@ -1632,7 +1632,7 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi, MACROBLOCK *x, &pd->dst.buf[vp9_raster_block_offset(BLOCK_8X8, i, pd->dst.stride)]; int64_t thisdistortion = 0, thissse = 0; int thisrate = 0, ref; - const scan_order *so = &vp9_default_scan_orders[TX_4X4]; + const ScanOrder *so = &vp9_default_scan_orders[TX_4X4]; const int is_compound = has_second_ref(mi); const InterpKernel *kernel = vp9_filter_kernels[mi->interp_filter]; diff --git a/vp9/encoder/vp9_tokenize.c b/vp9/encoder/vp9_tokenize.c index 814d769be3..6c6c04493f 100644 --- a/vp9/encoder/vp9_tokenize.c +++ b/vp9/encoder/vp9_tokenize.c @@ -364,7 +364,7 @@ static void tokenize_b(int plane, int block, int row, int col, const PLANE_TYPE type = get_plane_type(plane); const tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block); const int16_t *scan, *nb; - const scan_order *so; + const ScanOrder *so; const int ref = is_inter_block(mi); unsigned int(*const counts)[COEFF_CONTEXTS][ENTROPY_TOKENS] = td->rd_counts.coef_counts[tx_size][type][ref]; diff --git a/vp9/encoder/vp9_tpl_model.c b/vp9/encoder/vp9_tpl_model.c index b0c735167e..53ef356981 100644 --- a/vp9/encoder/vp9_tpl_model.c +++ b/vp9/encoder/vp9_tpl_model.c @@ -434,7 +434,7 @@ static void get_quantize_error(MACROBLOCK *x, int plane, tran_low_t *coeff, MACROBLOCKD *const xd = &x->e_mbd; const struct macroblock_plane *const p = &x->plane[plane]; const struct macroblockd_plane *const pd = &xd->plane[plane]; - const scan_order *const scan_order = &vp9_default_scan_orders[tx_size]; + const ScanOrder *const scan_order = &vp9_default_scan_orders[tx_size]; uint16_t eob; int pix_num = 1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]]; const int shift = tx_size == TX_32X32 ? 0 : 2; diff --git a/vpx_dsp/arm/highbd_quantize_neon.c b/vpx_dsp/arm/highbd_quantize_neon.c index 5a40f1284e..526447acf5 100644 --- a/vpx_dsp/arm/highbd_quantize_neon.c +++ b/vpx_dsp/arm/highbd_quantize_neon.c @@ -228,7 +228,7 @@ static VPX_FORCE_INLINE int16x8_t highbd_quantize_b_32x32_neon( void vpx_highbd_quantize_b_32x32_neon( const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const struct scan_order *const scan_order) { + uint16_t *eob_ptr, const struct ScanOrder *const scan_order) { const int16x8_t neg_one = vdupq_n_s16(-1); uint16x8_t eob_max; int i; diff --git a/vpx_dsp/arm/quantize_neon.c b/vpx_dsp/arm/quantize_neon.c index 84b6d8c79f..cc8f623744 100644 --- a/vpx_dsp/arm/quantize_neon.c +++ b/vpx_dsp/arm/quantize_neon.c @@ -219,7 +219,7 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const struct scan_order *const scan_order) { + const struct ScanOrder *const scan_order) { const int16x8_t neg_one = vdupq_n_s16(-1); uint16x8_t eob_max; int i; diff --git a/vpx_dsp/quantize.c b/vpx_dsp/quantize.c index f51bf253e7..d44ced20dc 100644 --- a/vpx_dsp/quantize.c +++ b/vpx_dsp/quantize.c @@ -214,7 +214,7 @@ void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const struct scan_order *const scan_order) { + const struct ScanOrder *const scan_order) { const int n_coeffs = 32 * 32; const int zbins[2] = { ROUND_POWER_OF_TWO(mb_plane->zbin[0], 1), ROUND_POWER_OF_TWO(mb_plane->zbin[1], 1) }; @@ -275,7 +275,7 @@ void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, void vpx_highbd_quantize_b_32x32_c( const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const struct scan_order *const scan_order) { + uint16_t *eob_ptr, const struct ScanOrder *const scan_order) { const intptr_t n_coeffs = 32 * 32; const int zbins[2] = { ROUND_POWER_OF_TWO(mb_plane->zbin[0], 1), ROUND_POWER_OF_TWO(mb_plane->zbin[1], 1) }; diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 6637186f81..49bc9a6309 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -19,7 +19,7 @@ () #include "vpx_dsp/vpx_filter.h" #if CONFIG_VP9_ENCODER struct macroblock_plane; - struct scan_order; + struct ScanOrder; #endif EOF @@ -725,14 +725,14 @@ () add_proto qw/void vpx_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vpx_quantize_b neon sse2 ssse3 avx avx2 vsx lsx/; - add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, const struct macroblock_plane * const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct scan_order *const scan_order"; + add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, const struct macroblock_plane * const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order"; specialize qw/vpx_quantize_b_32x32 neon ssse3 avx avx2 vsx lsx/; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vpx_highbd_quantize_b neon sse2 avx2/; - add_proto qw/void vpx_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, const struct macroblock_plane * const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct scan_order *const scan_order"; + add_proto qw/void vpx_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, const struct macroblock_plane * const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order"; specialize qw/vpx_highbd_quantize_b_32x32 neon sse2 avx2/; } # CONFIG_VP9_HIGHBITDEPTH } # CONFIG_VP9_ENCODER diff --git a/vpx_dsp/x86/highbd_quantize_intrin_avx2.c b/vpx_dsp/x86/highbd_quantize_intrin_avx2.c index bfd7b2e23e..fbebd7db1c 100644 --- a/vpx_dsp/x86/highbd_quantize_intrin_avx2.c +++ b/vpx_dsp/x86/highbd_quantize_intrin_avx2.c @@ -226,7 +226,7 @@ static VPX_FORCE_INLINE void quantize_b_32x32( void vpx_highbd_quantize_b_32x32_avx2( const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const struct scan_order *const scan_order) { + uint16_t *eob_ptr, const struct ScanOrder *const scan_order) { const unsigned int step = 8; intptr_t n_coeffs = 32 * 32; const int16_t *iscan = scan_order->iscan; diff --git a/vpx_dsp/x86/highbd_quantize_intrin_sse2.c b/vpx_dsp/x86/highbd_quantize_intrin_sse2.c index 58d5a3a5ff..a5d874f3bc 100644 --- a/vpx_dsp/x86/highbd_quantize_intrin_sse2.c +++ b/vpx_dsp/x86/highbd_quantize_intrin_sse2.c @@ -97,7 +97,7 @@ void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count, void vpx_highbd_quantize_b_32x32_sse2( const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const struct scan_order *const scan_order) { + uint16_t *eob_ptr, const struct ScanOrder *const scan_order) { __m128i zbins[2]; __m128i nzbins[2]; int idx = 0; diff --git a/vpx_dsp/x86/quantize_avx.c b/vpx_dsp/x86/quantize_avx.c index d05a937be1..d289bf6ebf 100644 --- a/vpx_dsp/x86/quantize_avx.c +++ b/vpx_dsp/x86/quantize_avx.c @@ -146,7 +146,7 @@ void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const struct scan_order *const scan_order) { + const struct ScanOrder *const scan_order) { const __m128i zero = _mm_setzero_si128(); const __m256i big_zero = _mm256_setzero_si256(); int index; diff --git a/vpx_dsp/x86/quantize_avx2.c b/vpx_dsp/x86/quantize_avx2.c index 1c82542ae6..5421dcf0ba 100644 --- a/vpx_dsp/x86/quantize_avx2.c +++ b/vpx_dsp/x86/quantize_avx2.c @@ -256,7 +256,7 @@ void vpx_quantize_b_32x32_avx2(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const struct scan_order *const scan_order) { + const struct ScanOrder *const scan_order) { __m256i v_zbin, v_round, v_quant, v_dequant, v_quant_shift; __m256i v_eobmax = _mm256_setzero_si256(); intptr_t count; diff --git a/vpx_dsp/x86/quantize_ssse3.c b/vpx_dsp/x86/quantize_ssse3.c index 6401b2865d..556f4ca617 100644 --- a/vpx_dsp/x86/quantize_ssse3.c +++ b/vpx_dsp/x86/quantize_ssse3.c @@ -113,7 +113,7 @@ void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const struct scan_order *const scan_order) { + const struct ScanOrder *const scan_order) { const __m128i zero = _mm_setzero_si128(); int index; const int16_t *iscan = scan_order->iscan; From 601904d1f7f97b22efa955cb804ed1f5ba871eed Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 22 Feb 2023 15:36:03 -0800 Subject: [PATCH 0699/1000] vp9_rc_get_second_pass_params: clear -Wshadow warning Bug: webm:1793 Change-Id: I0d64c9234b4bdcfb49a06566dc41df26f5862c1f --- vp9/encoder/vp9_firstpass.c | 1 - 1 file changed, 1 deletion(-) diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index e9250e25c0..08b68c93ee 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -3495,7 +3495,6 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { const int show_idx = cm->current_video_frame; if (cpi->common.current_frame_coding_index == 0) { - VP9_COMMON *cm = &cpi->common; const vpx_codec_err_t codec_status = vp9_extrc_send_firstpass_stats( &cpi->ext_ratectrl, &cpi->twopass.first_pass_info); if (codec_status != VPX_CODEC_OK) { From 89765feb99dc63826be72eb600b011fc9f75d61e Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 22 Feb 2023 15:47:10 -0800 Subject: [PATCH 0700/1000] vp9_mcomp.c: clear -Wshadow warnings Bug: webm:1793 Change-Id: I6d7d96ffb3e388eac94d1d41563f7079a8297c85 --- vp9/encoder/vp9_mcomp.c | 55 +++++++++++++++++++++-------------------- 1 file changed, 28 insertions(+), 27 deletions(-) diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c index 1f08aa5de7..207eb43949 100644 --- a/vp9/encoder/vp9_mcomp.c +++ b/vp9/encoder/vp9_mcomp.c @@ -163,8 +163,8 @@ static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) { do { \ if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \ int64_t tmpmse; \ - const MV mv = { r, c }; \ - const MV ref_mv = { rr, rc }; \ + const MV cb_mv = { r, c }; \ + const MV cb_ref_mv = { rr, rc }; \ if (second_pred == NULL) { \ thismse = vfp->svf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, \ src_stride, &sse); \ @@ -173,7 +173,8 @@ static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) { src_stride, &sse, second_pred); \ } \ tmpmse = thismse; \ - tmpmse += mv_err_cost(&mv, &ref_mv, mvjcost, mvcost, error_per_bit); \ + tmpmse += \ + mv_err_cost(&cb_mv, &cb_ref_mv, mvjcost, mvcost, error_per_bit); \ if (tmpmse >= INT_MAX) { \ v = INT_MAX; \ } else if ((v = (uint32_t)tmpmse) < besterr) { \ @@ -192,15 +193,16 @@ static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) { #define CHECK_BETTER(v, r, c) \ do { \ if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \ - const MV mv = { r, c }; \ - const MV ref_mv = { rr, rc }; \ + const MV cb_mv = { r, c }; \ + const MV cb_ref_mv = { rr, rc }; \ if (second_pred == NULL) \ thismse = vfp->svf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, \ src_stride, &sse); \ else \ thismse = vfp->svaf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, \ src_stride, &sse, second_pred); \ - if ((v = mv_err_cost(&mv, &ref_mv, mvjcost, mvcost, error_per_bit) + \ + if ((v = mv_err_cost(&cb_mv, &cb_ref_mv, mvjcost, mvcost, \ + error_per_bit) + \ thismse) < besterr) { \ besterr = v; \ br = r; \ @@ -686,13 +688,14 @@ static int accurate_sub_pel_search( do { \ if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \ int64_t tmpmse; \ - const MV mv = { r, c }; \ - const MV ref_mv = { rr, rc }; \ - thismse = accurate_sub_pel_search(xd, &mv, x->me_sf, kernel, vfp, z, \ + const MV cb_mv = { r, c }; \ + const MV cb_ref_mv = { rr, rc }; \ + thismse = accurate_sub_pel_search(xd, &cb_mv, x->me_sf, kernel, vfp, z, \ src_stride, y, y_stride, second_pred, \ w, h, &sse); \ tmpmse = thismse; \ - tmpmse += mv_err_cost(&mv, &ref_mv, mvjcost, mvcost, error_per_bit); \ + tmpmse += \ + mv_err_cost(&cb_mv, &cb_ref_mv, mvjcost, mvcost, error_per_bit); \ if (tmpmse >= INT_MAX) { \ v = INT_MAX; \ } else if ((v = (uint32_t)tmpmse) < besterr) { \ @@ -711,12 +714,13 @@ static int accurate_sub_pel_search( #define CHECK_BETTER1(v, r, c) \ do { \ if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \ - const MV mv = { r, c }; \ - const MV ref_mv = { rr, rc }; \ - thismse = accurate_sub_pel_search(xd, &mv, x->me_sf, kernel, vfp, z, \ + const MV cb_mv = { r, c }; \ + const MV cb_ref_mv = { rr, rc }; \ + thismse = accurate_sub_pel_search(xd, &cb_mv, x->me_sf, kernel, vfp, z, \ src_stride, y, y_stride, second_pred, \ w, h, &sse); \ - if ((v = mv_err_cost(&mv, &ref_mv, mvjcost, mvcost, error_per_bit) + \ + if ((v = mv_err_cost(&cb_mv, &cb_ref_mv, mvjcost, mvcost, \ + error_per_bit) + \ thismse) < besterr) { \ besterr = v; \ br = r; \ @@ -980,16 +984,14 @@ static INLINE void calc_int_cost_list(const MACROBLOCK *x, const MV *ref_mv, const MV fcenter_mv = { ref_mv->row >> 3, ref_mv->col >> 3 }; int br = best_mv->row; int bc = best_mv->col; - MV this_mv; + const MV mv = { br, bc }; int i; unsigned int sse; - this_mv.row = br; - this_mv.col = bc; cost_list[0] = - fn_ptr->vf(what->buf, what->stride, get_buf_from_mv(in_what, &this_mv), + fn_ptr->vf(what->buf, what->stride, get_buf_from_mv(in_what, &mv), in_what->stride, &sse) + - mvsad_err_cost(x, &this_mv, &fcenter_mv, sadpb); + mvsad_err_cost(x, &mv, &fcenter_mv, sadpb); if (check_bounds(&x->mv_limits, br, bc, 1)) { for (i = 0; i < 4; i++) { const MV this_mv = { br + neighbors[i].row, bc + neighbors[i].col }; @@ -1170,6 +1172,9 @@ static int vp9_pattern_search( } while (s--); } + best_mv->row = br; + best_mv->col = bc; + // Returns the one-away integer pel sad values around the best as follows: // cost_list[0]: cost at the best integer pel // cost_list[1]: cost at delta {0, -1} (left) from the best integer pel @@ -1177,11 +1182,8 @@ static int vp9_pattern_search( // cost_list[3]: cost at delta { 0, 1} (right) from the best integer pel // cost_list[4]: cost at delta {-1, 0} (top) from the best integer pel if (cost_list) { - const MV best_mv = { br, bc }; - calc_int_cost_list(x, &fcenter_mv, sad_per_bit, vfp, &best_mv, cost_list); + calc_int_cost_list(x, &fcenter_mv, sad_per_bit, vfp, best_mv, cost_list); } - best_mv->row = br; - best_mv->col = bc; return bestsad; } @@ -2321,17 +2323,16 @@ unsigned int vp9_int_pro_motion_estimation(const VP9_COMP *cpi, MACROBLOCK *x, // TODO(jingning): Implement integral projection functions for high bit-depth // setting and remove this part of code. if (xd->bd != 8) { - unsigned int this_sad; + const unsigned int sad = cpi->fn_ptr[bsize].sdf( + x->plane[0].src.buf, src_stride, xd->plane[0].pre[0].buf, ref_stride); tmp_mv->row = 0; tmp_mv->col = 0; - this_sad = cpi->fn_ptr[bsize].sdf(x->plane[0].src.buf, src_stride, - xd->plane[0].pre[0].buf, ref_stride); if (scaled_ref_frame) { int i; for (i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[0] = backup_yv12[i]; } - return this_sad; + return sad; } #endif From 66885a69ffdec30f0142dbff20568ebeb249107e Mon Sep 17 00:00:00 2001 From: James Zern Date: Thu, 23 Mar 2023 19:28:48 -0700 Subject: [PATCH 0701/1000] svc_datarate_test: clear -Wshadow warning rename class member from ref_frame_config to the correct style: ref_frame_config_. Bug: webm:1793 Change-Id: Idaf49de6d724014adee75f81efe974b2031241ba --- test/svc_datarate_test.cc | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/test/svc_datarate_test.cc b/test/svc_datarate_test.cc index 484252ca43..d571f50860 100644 --- a/test/svc_datarate_test.cc +++ b/test/svc_datarate_test.cc @@ -256,13 +256,13 @@ class DatarateOnePassCbrSvc : public OnePassCbrSvc { temporal_layer_id_ = layer_id.temporal_layer_id; for (int i = 0; i < number_spatial_layers_; i++) { layer_id.temporal_layer_id_per_spatial[i] = temporal_layer_id_; - ref_frame_config.duration[i] = 1; + ref_frame_config_.duration[i] = 1; } encoder->Control(VP9E_SET_SVC_LAYER_ID, &layer_id); set_frame_flags_bypass_mode(layer_id.temporal_layer_id, - number_spatial_layers_, 0, &ref_frame_config, + number_spatial_layers_, 0, &ref_frame_config_, 1); - encoder->Control(VP9E_SET_SVC_REF_FRAME_CONFIG, &ref_frame_config); + encoder->Control(VP9E_SET_SVC_REF_FRAME_CONFIG, &ref_frame_config_); } if (update_pattern_ && video->frame() >= 100) { @@ -277,13 +277,13 @@ class DatarateOnePassCbrSvc : public OnePassCbrSvc { temporal_layer_id_ = layer_id.temporal_layer_id; for (int i = 0; i < number_spatial_layers_; i++) { layer_id.temporal_layer_id_per_spatial[i] = temporal_layer_id_; - ref_frame_config.duration[i] = 1; + ref_frame_config_.duration[i] = 1; } encoder->Control(VP9E_SET_SVC_LAYER_ID, &layer_id); set_frame_flags_bypass_mode(layer_id.temporal_layer_id, - number_spatial_layers_, 0, &ref_frame_config, + number_spatial_layers_, 0, &ref_frame_config_, 0); - encoder->Control(VP9E_SET_SVC_REF_FRAME_CONFIG, &ref_frame_config); + encoder->Control(VP9E_SET_SVC_REF_FRAME_CONFIG, &ref_frame_config_); } if (change_bitrate_ && video->frame() == 200) { @@ -611,7 +611,7 @@ class DatarateOnePassCbrSvc : public OnePassCbrSvc { bool single_layer_resize_; unsigned int top_sl_width_; unsigned int top_sl_height_; - vpx_svc_ref_frame_config_t ref_frame_config; + vpx_svc_ref_frame_config_t ref_frame_config_; int update_pattern_; bool change_bitrate_; vpx_codec_pts_t last_pts_ref_; From 8f17482e824e021e16c91bcf2b034830639ad3bb Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 7 Mar 2023 15:29:37 -0800 Subject: [PATCH 0702/1000] vp9_rdopt,block_rd_txfm: fix clang-tidy warning argument name 'recon' in comment does not match parameter name 'out_recon'. https://clang.llvm.org/extra/clang-tidy/checks/bugprone/argument-comment.html + normalize similar calls, using /*var=*/NULL to better match the style guidelines https://google.github.io/styleguide/cppguide.html#Function_Argument_Comments Change-Id: I089591317f7138965735f737c1536a8b16fcd4e4 --- vp9/encoder/vp9_rdopt.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index 76d545cd96..1e40e5378b 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -743,7 +743,7 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col, } if (x->block_tx_domain) { dist_block(args->cpi, x, plane, plane_bsize, block, blk_row, blk_col, - tx_size, &dist, &sse, /*recon =*/0, sse_calc_done); + tx_size, &dist, &sse, /*out_recon=*/NULL, sse_calc_done); } else { const struct macroblock_plane *const p = &x->plane[plane]; const int src_stride = p->src.stride; @@ -1396,7 +1396,7 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x, int *rate, mic->mode = mode; super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion, &s, NULL, - bsize, best_rd, /*recon = */ 0); + bsize, best_rd, /*recon=*/NULL); if (this_rate_tokenonly == INT_MAX) continue; @@ -1449,7 +1449,7 @@ static int super_block_uvrd(const VP9_COMP *cpi, MACROBLOCK *x, int *rate, for (plane = 1; plane < MAX_MB_PLANE; ++plane) { txfm_rd_in_plane(cpi, x, &pnrate, &pndist, &pnskip, &pnsse, ref_best_rd, plane, bsize, uv_tx_size, cpi->sf.use_fast_coef_costing, - /*recon = */ 0); + /*recon=*/NULL); if (pnrate == INT_MAX) { is_cost_valid = 0; break; From b4d154c9486afdd010b61ba24c72655aa761a9e7 Mon Sep 17 00:00:00 2001 From: Anupam Pandey Date: Tue, 21 Mar 2023 13:00:25 +0530 Subject: [PATCH 0703/1000] Add AVX2 for convolve vertical filter for block width 4 Introduced AVX2 intrinsic to compute convolve vertical for w = 4 case. This is a bit-exact change. Instruction Count cpu Resolution Reduction(%) 0 LOWRES2 0.364 0 MIDRES2 0.236 0 HDRES2 0.162 0 Average 0.254 Change-Id: I413f58aa6333a6f2421d4c10d49dec01e55b2098 --- vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c | 185 +++++++++++++++++++++- 1 file changed, 183 insertions(+), 2 deletions(-) diff --git a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c index 37ef59f36c..3b5ff04ee9 100644 --- a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c +++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c @@ -1184,8 +1184,190 @@ static void vpx_filter_block1d4_h8_avx2( } } +static void vpx_filter_block1d4_v8_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, + ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { + __m256i f[4], ss[4]; + __m256i r[8]; + __m128i r1[10]; + __m128i s[11]; + + unsigned int y = output_height; + // Multiply the size of the source stride by four + const ptrdiff_t src_stride = src_pitch << 2; + const ptrdiff_t out_stride = out_pitch << 2; + + // The output_height is always a multiple of two. + assert(!(output_height & 0x01)); + + shuffle_filter_avx2(filter, f); + + s[0] = _mm_loadl_epi64((const __m128i *)(src_ptr + 0 * src_pitch)); + s[1] = _mm_loadl_epi64((const __m128i *)(src_ptr + 1 * src_pitch)); + s[2] = _mm_loadl_epi64((const __m128i *)(src_ptr + 2 * src_pitch)); + s[3] = _mm_loadl_epi64((const __m128i *)(src_ptr + 3 * src_pitch)); + s[4] = _mm_loadl_epi64((const __m128i *)(src_ptr + 4 * src_pitch)); + s[5] = _mm_loadl_epi64((const __m128i *)(src_ptr + 5 * src_pitch)); + s[6] = _mm_loadl_epi64((const __m128i *)(src_ptr + 6 * src_pitch)); + + // R1-0 xxxx .. . . x| r13 r12 r11 r10 r03 r02 r01 r00 + r1[0] = _mm_unpacklo_epi32(s[0], s[1]); + + // R2-1 xxxx .. . . x| r23 r22 r21 r20 r13 r12 r11 r10 + r1[1] = _mm_unpacklo_epi32(s[1], s[2]); + + // R3-2 xxxx .. . . x| r33 r32 r31 r30 r23 r22 r21 r20 + r1[2] = _mm_unpacklo_epi32(s[2], s[3]); + + // R4-3 xxxx .. . . x| r43 r42 r41 r40 r33 r32 r31 r30 + r1[3] = _mm_unpacklo_epi32(s[3], s[4]); + + // R5-4 xxxx .. . . x| r53 r52 r51 r50 r43 r42 r41 r40 + r1[4] = _mm_unpacklo_epi32(s[4], s[5]); + + // R6-5 xxxx .. . . x| r63 r62 r61 r60 r53 r52 r51 r50 + r1[5] = _mm_unpacklo_epi32(s[5], s[6]); + + // 00000000 r33 r32 r31 r30|r23 r22 r21 r20||00000000|r13 r12 r11 r10|r03 r02 + // r01 r00 + r[0] = _mm256_inserti128_si256(_mm256_castsi128_si256(r1[0]), r1[2], 1); + + // 00000000 r43 r42 r41 r40|r33 r32 r31 r30||00000000|r23 r22 r21 r20|r13 r12 + // r11 r10 + r[1] = _mm256_inserti128_si256(_mm256_castsi128_si256(r1[1]), r1[3], 1); + + // 00000000 r53 r52 r51 r50|r43 r42 r41 r40||00000000|r33 r32 r31 r30|r23 r22 + // r21 r20 + r[2] = _mm256_inserti128_si256(_mm256_castsi128_si256(r1[2]), r1[4], 1); + + // 00000000 r63 r62 r61 r60|r53 r52 r51 r50||00000000|r43 r42 r41 r40|r33 r32 + // r31 r30 + r[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(r1[3]), r1[5], 1); + + // r43 r33....r40 r30|r33 r23....r30 r20||r23 r13....r20 r10|r13 r03....r10 + // r00| + ss[0] = _mm256_unpacklo_epi8(r[0], r[1]); + + // r63 r53....r60 r50|r53 r43....r50 r40||r43 r33....r40 r30|r33 r23....r30 + // r20| + ss[1] = _mm256_unpacklo_epi8(r[2], r[3]); + + // Process 4 rows at a time + while (y >= 4) { + s[7] = _mm_loadl_epi64((const __m128i *)(src_ptr + 7 * src_pitch)); + s[8] = _mm_loadl_epi64((const __m128i *)(src_ptr + 8 * src_pitch)); + s[9] = _mm_loadl_epi64((const __m128i *)(src_ptr + 9 * src_pitch)); + s[10] = _mm_loadl_epi64((const __m128i *)(src_ptr + 10 * src_pitch)); + + // R7-6 xxxx .. . . x| r73 r72 r71 r70 r63 r62 r61 r60 + r1[6] = _mm_unpacklo_epi32(s[6], s[7]); + + // R8-7 xxxx .. . . x| r83 r82 r81 r80 r73 r72 r71 r70 + r1[7] = _mm_unpacklo_epi32(s[7], s[8]); + + // R9-8 xxxx .. . . x| r93 r92 r91 r90 r83 r82 r81 r80 + r1[8] = _mm_unpacklo_epi32(s[8], s[9]); + + // R10-9 xxxx .. . . x| r10-3 r10-2 r10-1 r10-0 r93 r92 r91 r90 + r1[9] = _mm_unpacklo_epi32(s[9], s[10]); + + // 00000000 r73 r72 r71 r70|r63 r62 r61 r60||00000000|r53 r52 r51 r50|r43 + // r42 r41 r40 + r[4] = _mm256_inserti128_si256(_mm256_castsi128_si256(r1[4]), r1[6], 1); + + // 00000000 r83 r82 r81 r80|r73 r72 r71 r70||00000000|r63 r62 r61 r60|r53 + // r52 r51 r50 + r[5] = _mm256_inserti128_si256(_mm256_castsi128_si256(r1[5]), r1[7], 1); + + // 00000000 r93 r92 r91 r90|r83 r82 r81 r80||00000000|r73 r72 r71 r70|r63 + // r62 r61 r60 + r[6] = _mm256_inserti128_si256(_mm256_castsi128_si256(r1[6]), r1[8], 1); + + // 00000000 r10-3 r10-2 r10-1 r10-0|r93 r92 r91 r90||00000000|r83 r82 r81 + // r80|r73 r72 r71 r70 + r[7] = _mm256_inserti128_si256(_mm256_castsi128_si256(r1[7]), r1[9], 1); + + // r83 r73....r80 r70|r73 r63....r70 r60||r63 r53....r60 r50|r53 r43....r50 + // r40| + ss[2] = _mm256_unpacklo_epi8(r[4], r[5]); + + // r10-3 r10-3....r10-0 r10-0|r93 r83....r90 r80||r83 r73....r80 r70|r73 + // r63....r70 r60| + ss[3] = _mm256_unpacklo_epi8(r[6], r[7]); + + ss[0] = convolve8_16_avx2(ss, f); + + // r3 r2 r3 r2 r1 r0 r1 r0 + ss[0] = _mm256_packus_epi16(ss[0], ss[0]); + src_ptr += src_stride; + + mm256_storeu2_epi32((__m128i *const)output_ptr, + (__m128i *const)(output_ptr + (2 * out_pitch)), ss); + + ss[0] = _mm256_srli_si256(ss[0], 4); + + mm256_storeu2_epi32((__m128i *const)(output_ptr + (1 * out_pitch)), + (__m128i *const)(output_ptr + (3 * out_pitch)), ss); + + output_ptr += out_stride; + + ss[0] = ss[2]; + ss[1] = ss[3]; + + s[6] = s[10]; + + r1[4] = r1[8]; + r1[5] = r1[9]; + + y -= 4; + } + + // Process 2 rows + if (y == 2) { + __m128i ss1[4], f1[4]; + + s[7] = _mm_loadl_epi64((const __m128i *)(src_ptr + 7 * src_pitch)); + s[8] = _mm_loadl_epi64((const __m128i *)(src_ptr + 8 * src_pitch)); + + f1[0] = _mm256_castsi256_si128(f[0]); + f1[1] = _mm256_castsi256_si128(f[1]); + f1[2] = _mm256_castsi256_si128(f[2]); + f1[3] = _mm256_castsi256_si128(f[3]); + + // R7-6 xxxx .. . . x| r73 r72 r71 r70 r63 r62 r61 r60 + r1[6] = _mm_unpacklo_epi32(s[6], s[7]); + + // R8-7 xxxx .. . . x| r83 r82 r81 r80 r73 r72 r71 r70 + r1[7] = _mm_unpacklo_epi32(s[7], s[8]); + + // r23 r13....r20 r10|r13 r03....r10 r00 + ss1[0] = _mm256_castsi256_si128(ss[0]); + + // r43 r33....r40 r30|r33 r23....r30 r20 + ss1[1] = _mm256_castsi256_si128(ss[1]); + + // r63 r53....r60 r50|r53 r43....r50 r40 + ss1[2] = _mm_unpacklo_epi8(r1[4], r1[5]); + + // r83 r73....r80 r70|r73 r63....r70 r60 + ss1[3] = _mm_unpacklo_epi8(r1[6], r1[7]); + + ss1[0] = convolve8_8_ssse3(ss1, f1); + + // r1 r0 r1 r0 + ss1[0] = _mm_packus_epi16(ss1[0], ss1[0]); + + // Save first row 4 values + *((int *)&output_ptr[0]) = _mm_cvtsi128_si32(ss1[0]); + output_ptr += out_pitch; + + ss1[0] = _mm_srli_si128(ss1[0], 4); + // Save second row 4 values + *((int *)&output_ptr[0]) = _mm_cvtsi128_si32(ss1[0]); + } +} + #if HAVE_AVX2 && HAVE_SSSE3 -filter8_1dfunction vpx_filter_block1d4_v8_ssse3; #if VPX_ARCH_X86_64 filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3; filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3; @@ -1209,7 +1391,6 @@ filter8_1dfunction vpx_filter_block1d8_v2_ssse3; filter8_1dfunction vpx_filter_block1d8_h2_ssse3; filter8_1dfunction vpx_filter_block1d4_v2_ssse3; filter8_1dfunction vpx_filter_block1d4_h2_ssse3; -#define vpx_filter_block1d4_v8_avx2 vpx_filter_block1d4_v8_ssse3 #define vpx_filter_block1d16_v2_avx2 vpx_filter_block1d16_v2_ssse3 #define vpx_filter_block1d16_h2_avx2 vpx_filter_block1d16_h2_ssse3 #define vpx_filter_block1d8_v2_avx2 vpx_filter_block1d8_v2_ssse3 From 25825f6a78a267f99c4c6ba7988fc4d79c8cb19d Mon Sep 17 00:00:00 2001 From: George Steed Date: Thu, 9 Mar 2023 23:46:31 +0000 Subject: [PATCH 0704/1000] Allow non-uniform above array in highbd d45 predictor Neon impl The existing implementation doesn't appear to manifest as a failure in any of the predictor or MD5 tests, but it does rely on the predictor tests filling the second `bs` elements of the `above` input array with copies of `above[bs - 1]` in order to match the C implementation. This patch adjusts the Neon implementation to correctly match the C implementation in the case where the elements of the `above` array all differ. Performance of the predictor is mostly unchanged, except for the 16x16 block size where it appears to have gotten marginally faster across most compiler/micro-architecture combinations. Bug: webm:1797 Change-Id: Iac166d6047316c0382e0f2790ce780fc99674b43 --- vpx_dsp/arm/highbd_intrapred_neon.c | 275 +++++++++++++++------------- 1 file changed, 144 insertions(+), 131 deletions(-) diff --git a/vpx_dsp/arm/highbd_intrapred_neon.c b/vpx_dsp/arm/highbd_intrapred_neon.c index 503900915d..05c9c7f196 100644 --- a/vpx_dsp/arm/highbd_intrapred_neon.c +++ b/vpx_dsp/arm/highbd_intrapred_neon.c @@ -289,166 +289,179 @@ void vpx_highbd_dc_128_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, void vpx_highbd_d45_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x8_t ABCDEFGH = vld1q_u16(above); - const uint16x8_t BCDEFGH0 = vld1q_u16(above + 1); - const uint16x8_t CDEFGH00 = vld1q_u16(above + 2); - const uint16x8_t avg1 = vhaddq_u16(ABCDEFGH, CDEFGH00); - const uint16x8_t avg2 = vrhaddq_u16(avg1, BCDEFGH0); - const uint16x4_t avg2_low = vget_low_u16(avg2); - const uint16x4_t avg2_high = vget_high_u16(avg2); - const uint16x4_t r1 = vext_u16(avg2_low, avg2_high, 1); - const uint16x4_t r2 = vext_u16(avg2_low, avg2_high, 2); - const uint16x4_t r3 = vext_u16(avg2_low, avg2_high, 3); + uint16x8_t a0, a1, a2, d0; + uint16_t a7; (void)left; (void)bd; - vst1_u16(dst, avg2_low); - dst += stride; - vst1_u16(dst, r1); - dst += stride; - vst1_u16(dst, r2); - dst += stride; - vst1_u16(dst, r3); - vst1q_lane_u16(dst + 3, ABCDEFGH, 7); -} -static INLINE void d45_store_8(uint16_t **dst, const ptrdiff_t stride, - const uint16x8_t above_right, uint16x8_t *row) { - *row = vextq_u16(*row, above_right, 1); - vst1q_u16(*dst, *row); - *dst += stride; + a0 = vld1q_u16(above); + a7 = above[7]; + + // [ above[1], ..., above[6], x, x ] + a1 = vextq_u16(a0, a0, 1); + // [ above[2], ..., above[7], x, x ] + a2 = vextq_u16(a0, a0, 2); + + // d0[0] = AVG3(above[0], above[1], above[2]); + // ... + // d0[5] = AVG3(above[5], above[6], above[7]); + // d0[6] = x (don't care) + // d0[7] = x (don't care) + d0 = vrhaddq_u16(vhaddq_u16(a0, a2), a1); + + // We want: + // stride=0 [ d0[0], d0[1], d0[2], d0[3] ] + // stride=1 [ d0[1], d0[2], d0[3], d0[4] ] + // stride=2 [ d0[2], d0[3], d0[4], d0[5] ] + // stride=2 [ d0[3], d0[4], d0[5], above[7] ] + vst1_u16(dst + 0 * stride, vget_low_u16(d0)); + vst1_u16(dst + 1 * stride, vget_low_u16(vextq_u16(d0, d0, 1))); + vst1_u16(dst + 2 * stride, vget_low_u16(vextq_u16(d0, d0, 2))); + vst1_u16(dst + 3 * stride, vget_low_u16(vextq_u16(d0, d0, 3))); + + // We stored d0[6] above, so fixup into above[7]. + dst[3 * stride + 3] = a7; } void vpx_highbd_d45_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x8_t A0 = vld1q_u16(above); - const uint16x8_t above_right = vdupq_lane_u16(vget_high_u16(A0), 3); - const uint16x8_t A1 = vld1q_u16(above + 1); - const uint16x8_t A2 = vld1q_u16(above + 2); - const uint16x8_t avg1 = vhaddq_u16(A0, A2); - uint16x8_t row = vrhaddq_u16(avg1, A1); + uint16x8_t ax0, a0, a1, a7, d0; (void)left; (void)bd; - vst1q_u16(dst, row); - dst += stride; - d45_store_8(&dst, stride, above_right, &row); - d45_store_8(&dst, stride, above_right, &row); - d45_store_8(&dst, stride, above_right, &row); - d45_store_8(&dst, stride, above_right, &row); - d45_store_8(&dst, stride, above_right, &row); - d45_store_8(&dst, stride, above_right, &row); - vst1q_u16(dst, above_right); -} - -static INLINE void d45_store_16(uint16_t **dst, const ptrdiff_t stride, - const uint16x8_t above_right, uint16x8_t *row_0, - uint16x8_t *row_1) { - *row_0 = vextq_u16(*row_0, *row_1, 1); - *row_1 = vextq_u16(*row_1, above_right, 1); - vst1q_u16(*dst, *row_0); - *dst += 8; - vst1q_u16(*dst, *row_1); - *dst += stride - 8; + a0 = vld1q_u16(above + 0); + a1 = vld1q_u16(above + 1); + a7 = vld1q_dup_u16(above + 7); + + // We want to calculate the AVG3 result in lanes 1-7 inclusive so we can + // shift in above[7] later, so shift a0 across by one to get the right + // inputs: + // [ x, above[0], ... , above[6] ] + ax0 = vextq_u16(a0, a0, 7); + + // d0[0] = x (don't care) + // d0[1] = AVG3(above[0], above[1], above[2]); + // ... + // d0[7] = AVG3(above[6], above[7], above[8]); + d0 = vrhaddq_u16(vhaddq_u16(ax0, a1), a0); + + // Undo the earlier ext, incrementally shift in duplicates of above[7]. + vst1q_u16(dst + 0 * stride, vextq_u16(d0, a7, 1)); + vst1q_u16(dst + 1 * stride, vextq_u16(d0, a7, 2)); + vst1q_u16(dst + 2 * stride, vextq_u16(d0, a7, 3)); + vst1q_u16(dst + 3 * stride, vextq_u16(d0, a7, 4)); + vst1q_u16(dst + 4 * stride, vextq_u16(d0, a7, 5)); + vst1q_u16(dst + 5 * stride, vextq_u16(d0, a7, 6)); + vst1q_u16(dst + 6 * stride, vextq_u16(d0, a7, 7)); + vst1q_u16(dst + 7 * stride, a7); } void vpx_highbd_d45_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x8_t A0_0 = vld1q_u16(above); - const uint16x8_t A0_1 = vld1q_u16(above + 8); - const uint16x8_t above_right = vdupq_lane_u16(vget_high_u16(A0_1), 3); - const uint16x8_t A1_0 = vld1q_u16(above + 1); - const uint16x8_t A1_1 = vld1q_u16(above + 9); - const uint16x8_t A2_0 = vld1q_u16(above + 2); - const uint16x8_t A2_1 = vld1q_u16(above + 10); - const uint16x8_t avg_0 = vhaddq_u16(A0_0, A2_0); - const uint16x8_t avg_1 = vhaddq_u16(A0_1, A2_1); - uint16x8_t row_0 = vrhaddq_u16(avg_0, A1_0); - uint16x8_t row_1 = vrhaddq_u16(avg_1, A1_1); + uint16x8_t ax0, a0, a1, a7, a8, a9, a15, d0[2]; (void)left; (void)bd; - vst1q_u16(dst, row_0); - vst1q_u16(dst + 8, row_1); - dst += stride; - d45_store_16(&dst, stride, above_right, &row_0, &row_1); - d45_store_16(&dst, stride, above_right, &row_0, &row_1); - d45_store_16(&dst, stride, above_right, &row_0, &row_1); - d45_store_16(&dst, stride, above_right, &row_0, &row_1); - d45_store_16(&dst, stride, above_right, &row_0, &row_1); - d45_store_16(&dst, stride, above_right, &row_0, &row_1); - d45_store_16(&dst, stride, above_right, &row_0, &row_1); - d45_store_16(&dst, stride, above_right, &row_0, &row_1); - d45_store_16(&dst, stride, above_right, &row_0, &row_1); - d45_store_16(&dst, stride, above_right, &row_0, &row_1); - d45_store_16(&dst, stride, above_right, &row_0, &row_1); - d45_store_16(&dst, stride, above_right, &row_0, &row_1); - d45_store_16(&dst, stride, above_right, &row_0, &row_1); - d45_store_16(&dst, stride, above_right, &row_0, &row_1); - vst1q_u16(dst, above_right); - vst1q_u16(dst + 8, above_right); + a0 = vld1q_u16(above + 0); + a1 = vld1q_u16(above + 1); + a7 = vld1q_u16(above + 7); + a8 = vld1q_u16(above + 8); + a9 = vld1q_u16(above + 9); + a15 = vld1q_dup_u16(above + 15); + + // [ x, above[0], ... , above[6] ] + ax0 = vextq_u16(a0, a0, 7); + + // We have one unused lane here to leave room to shift in above[15] in the + // last lane: + // d0[0][1] = x (don't care) + // d0[0][1] = AVG3(above[0], above[1], above[2]); + // ... + // d0[0][7] = AVG3(above[6], above[7], above[8]); + // d0[1][0] = AVG3(above[7], above[8], above[9]); + // ... + // d0[1][7] = AVG3(above[14], above[15], above[16]); + d0[0] = vrhaddq_u16(vhaddq_u16(ax0, a1), a0); + d0[1] = vrhaddq_u16(vhaddq_u16(a7, a9), a8); + + // Incrementally shift in duplicates of above[15]. + vst1q_u16(dst + 0 * stride + 0, vextq_u16(d0[0], d0[1], 1)); + vst1q_u16(dst + 0 * stride + 8, vextq_u16(d0[1], a15, 1)); + vst1q_u16(dst + 1 * stride + 0, vextq_u16(d0[0], d0[1], 2)); + vst1q_u16(dst + 1 * stride + 8, vextq_u16(d0[1], a15, 2)); + vst1q_u16(dst + 2 * stride + 0, vextq_u16(d0[0], d0[1], 3)); + vst1q_u16(dst + 2 * stride + 8, vextq_u16(d0[1], a15, 3)); + vst1q_u16(dst + 3 * stride + 0, vextq_u16(d0[0], d0[1], 4)); + vst1q_u16(dst + 3 * stride + 8, vextq_u16(d0[1], a15, 4)); + vst1q_u16(dst + 4 * stride + 0, vextq_u16(d0[0], d0[1], 5)); + vst1q_u16(dst + 4 * stride + 8, vextq_u16(d0[1], a15, 5)); + vst1q_u16(dst + 5 * stride + 0, vextq_u16(d0[0], d0[1], 6)); + vst1q_u16(dst + 5 * stride + 8, vextq_u16(d0[1], a15, 6)); + vst1q_u16(dst + 6 * stride + 0, vextq_u16(d0[0], d0[1], 7)); + vst1q_u16(dst + 6 * stride + 8, vextq_u16(d0[1], a15, 7)); + vst1q_u16(dst + 7 * stride + 0, d0[1]); + vst1q_u16(dst + 7 * stride + 8, a15); + + vst1q_u16(dst + 8 * stride + 0, vextq_u16(d0[1], a15, 1)); + vst1q_u16(dst + 8 * stride + 8, a15); + vst1q_u16(dst + 9 * stride + 0, vextq_u16(d0[1], a15, 2)); + vst1q_u16(dst + 9 * stride + 8, a15); + vst1q_u16(dst + 10 * stride + 0, vextq_u16(d0[1], a15, 3)); + vst1q_u16(dst + 10 * stride + 8, a15); + vst1q_u16(dst + 11 * stride + 0, vextq_u16(d0[1], a15, 4)); + vst1q_u16(dst + 11 * stride + 8, a15); + vst1q_u16(dst + 12 * stride + 0, vextq_u16(d0[1], a15, 5)); + vst1q_u16(dst + 12 * stride + 8, a15); + vst1q_u16(dst + 13 * stride + 0, vextq_u16(d0[1], a15, 6)); + vst1q_u16(dst + 13 * stride + 8, a15); + vst1q_u16(dst + 14 * stride + 0, vextq_u16(d0[1], a15, 7)); + vst1q_u16(dst + 14 * stride + 8, a15); + vst1q_u16(dst + 15 * stride + 0, a15); + vst1q_u16(dst + 15 * stride + 8, a15); } void vpx_highbd_d45_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x8_t A0_0 = vld1q_u16(above); - const uint16x8_t A0_1 = vld1q_u16(above + 8); - const uint16x8_t A0_2 = vld1q_u16(above + 16); - const uint16x8_t A0_3 = vld1q_u16(above + 24); - const uint16x8_t above_right = vdupq_lane_u16(vget_high_u16(A0_3), 3); - const uint16x8_t A1_0 = vld1q_u16(above + 1); - const uint16x8_t A1_1 = vld1q_u16(above + 9); - const uint16x8_t A1_2 = vld1q_u16(above + 17); - const uint16x8_t A1_3 = vld1q_u16(above + 25); - const uint16x8_t A2_0 = vld1q_u16(above + 2); - const uint16x8_t A2_1 = vld1q_u16(above + 10); - const uint16x8_t A2_2 = vld1q_u16(above + 18); - const uint16x8_t A2_3 = vld1q_u16(above + 26); - const uint16x8_t avg_0 = vhaddq_u16(A0_0, A2_0); - const uint16x8_t avg_1 = vhaddq_u16(A0_1, A2_1); - const uint16x8_t avg_2 = vhaddq_u16(A0_2, A2_2); - const uint16x8_t avg_3 = vhaddq_u16(A0_3, A2_3); - uint16x8_t row_0 = vrhaddq_u16(avg_0, A1_0); - uint16x8_t row_1 = vrhaddq_u16(avg_1, A1_1); - uint16x8_t row_2 = vrhaddq_u16(avg_2, A1_2); - uint16x8_t row_3 = vrhaddq_u16(avg_3, A1_3); + uint16x8_t ax0, a0, a1, a7, a8, a9, a15, a16, a17, a23, a24, a25, a31, d0[4]; int i; (void)left; (void)bd; - vst1q_u16(dst, row_0); - dst += 8; - vst1q_u16(dst, row_1); - dst += 8; - vst1q_u16(dst, row_2); - dst += 8; - vst1q_u16(dst, row_3); - dst += stride - 24; - - for (i = 0; i < 30; ++i) { - row_0 = vextq_u16(row_0, row_1, 1); - row_1 = vextq_u16(row_1, row_2, 1); - row_2 = vextq_u16(row_2, row_3, 1); - row_3 = vextq_u16(row_3, above_right, 1); - vst1q_u16(dst, row_0); - dst += 8; - vst1q_u16(dst, row_1); - dst += 8; - vst1q_u16(dst, row_2); - dst += 8; - vst1q_u16(dst, row_3); - dst += stride - 24; - } + a0 = vld1q_u16(above + 0); + a1 = vld1q_u16(above + 1); + a7 = vld1q_u16(above + 7); + a8 = vld1q_u16(above + 8); + a9 = vld1q_u16(above + 9); + a15 = vld1q_u16(above + 15); + a16 = vld1q_u16(above + 16); + a17 = vld1q_u16(above + 17); + a23 = vld1q_u16(above + 23); + a24 = vld1q_u16(above + 24); + a25 = vld1q_u16(above + 25); + a31 = vld1q_dup_u16(above + 31); - vst1q_u16(dst, above_right); - dst += 8; - vst1q_u16(dst, above_right); - dst += 8; - vst1q_u16(dst, above_right); - dst += 8; - vst1q_u16(dst, above_right); + // [ x, above[0], ... , above[6] ] + ax0 = vextq_u16(a0, a0, 7); + + d0[0] = vrhaddq_u16(vhaddq_u16(ax0, a1), a0); + d0[1] = vrhaddq_u16(vhaddq_u16(a7, a9), a8); + d0[2] = vrhaddq_u16(vhaddq_u16(a15, a17), a16); + d0[3] = vrhaddq_u16(vhaddq_u16(a23, a25), a24); + + for (i = 0; i < 32; ++i) { + d0[0] = vextq_u16(d0[0], d0[1], 1); + d0[1] = vextq_u16(d0[1], d0[2], 1); + d0[2] = vextq_u16(d0[2], d0[3], 1); + d0[3] = vextq_u16(d0[3], a31, 1); + vst1q_u16(dst + 0, d0[0]); + vst1q_u16(dst + 8, d0[1]); + vst1q_u16(dst + 16, d0[2]); + vst1q_u16(dst + 24, d0[3]); + dst += stride; + } } // ----------------------------------------------------------------------------- From 3eb3781589d30874634cab8952dec4ea883eb82a Mon Sep 17 00:00:00 2001 From: George Steed Date: Fri, 17 Mar 2023 17:59:26 +0000 Subject: [PATCH 0705/1000] Allow non-uniform above array in d45 predictor Neon impl The existing implementation doesn't appear to manifest as a failure in any of the predictor or MD5 tests, but it does rely on the predictor tests filling the second `bs` elements of the `above` input array with copies of `above[bs - 1]` in order to match the C implementation. This patch adjusts the Neon implementation to correctly match the C implementation in the case where the elements of the `above` array all differ. Performance of the predictor is mostly unchanged, except for the 32x32 block size where it appears to have gotten about 40% faster when compiled with clang-15. Bug: webm:1797 Change-Id: Iaad58e77c5467307a3c80d6989b7cf2988e09311 --- vpx_dsp/arm/intrapred_neon.c | 269 ++++++++++++++++++++++------------- 1 file changed, 174 insertions(+), 95 deletions(-) diff --git a/vpx_dsp/arm/intrapred_neon.c b/vpx_dsp/arm/intrapred_neon.c index 892310f151..7c225f6b71 100644 --- a/vpx_dsp/arm/intrapred_neon.c +++ b/vpx_dsp/arm/intrapred_neon.c @@ -263,123 +263,202 @@ void vpx_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, void vpx_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint8x8_t ABCDEFGH = vld1_u8(above); - const uint64x1_t A1 = vshr_n_u64(vreinterpret_u64_u8(ABCDEFGH), 8); - const uint64x1_t A2 = vshr_n_u64(vreinterpret_u64_u8(ABCDEFGH), 16); - const uint8x8_t BCDEFGH0 = vreinterpret_u8_u64(A1); - const uint8x8_t CDEFGH00 = vreinterpret_u8_u64(A2); - const uint8x8_t avg1 = vhadd_u8(ABCDEFGH, CDEFGH00); - const uint8x8_t avg2 = vrhadd_u8(avg1, BCDEFGH0); - const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2); - const uint32x2_t r0 = vreinterpret_u32_u8(avg2); - const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8)); - const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16)); - const uint32x2_t r3 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24)); + uint8x8_t a0, a1, a2, d0; + uint8_t a7; (void)left; - vst1_lane_u32((uint32_t *)(dst + 0 * stride), r0, 0); - vst1_lane_u32((uint32_t *)(dst + 1 * stride), r1, 0); - vst1_lane_u32((uint32_t *)(dst + 2 * stride), r2, 0); - vst1_lane_u32((uint32_t *)(dst + 3 * stride), r3, 0); - vst1_lane_u8(dst + 3 * stride + 3, ABCDEFGH, 7); -} -static INLINE void d45_store_8(uint8_t **dst, const ptrdiff_t stride, - const uint8x8_t above_right, uint8x8_t *row) { - *row = vext_u8(*row, above_right, 1); - vst1_u8(*dst, *row); - *dst += stride; + a0 = vld1_u8(above); + a7 = above[7]; + + // [ above[1], ..., above[6], x, x ] + a1 = vext_u8(a0, a0, 1); + // [ above[2], ..., above[7], x, x ] + a2 = vext_u8(a0, a0, 2); + + // d0[0] = AVG3(above[0], above[1], above[2]); + // ... + // d0[5] = AVG3(above[5], above[6], above[7]); + // d0[6] = x (don't care) + // d0[7] = x (don't care) + d0 = vrhadd_u8(vhadd_u8(a0, a2), a1); + + // We want: + // stride=0 [ d0[0], d0[1], d0[2], d0[3] ] + // stride=1 [ d0[1], d0[2], d0[3], d0[4] ] + // stride=2 [ d0[2], d0[3], d0[4], d0[5] ] + // stride=2 [ d0[3], d0[4], d0[5], above[7] ] + store_u8_4x1(dst + 0 * stride, d0); + store_u8_4x1(dst + 1 * stride, vext_u8(d0, d0, 1)); + store_u8_4x1(dst + 2 * stride, vext_u8(d0, d0, 2)); + store_u8_4x1(dst + 3 * stride, vext_u8(d0, d0, 3)); + + // We stored d0[6] above, so fixup into above[7]. + dst[3 * stride + 3] = a7; } void vpx_d45_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint8x8_t A0 = vld1_u8(above); - const uint8x8_t above_right = vdup_lane_u8(A0, 7); - const uint8x8_t A1 = vext_u8(A0, above_right, 1); - const uint8x8_t A2 = vext_u8(A0, above_right, 2); - const uint8x8_t avg1 = vhadd_u8(A0, A2); - uint8x8_t row = vrhadd_u8(avg1, A1); + uint8x8_t ax0, a0, a1, a7, d0; (void)left; - vst1_u8(dst, row); - dst += stride; - d45_store_8(&dst, stride, above_right, &row); - d45_store_8(&dst, stride, above_right, &row); - d45_store_8(&dst, stride, above_right, &row); - d45_store_8(&dst, stride, above_right, &row); - d45_store_8(&dst, stride, above_right, &row); - d45_store_8(&dst, stride, above_right, &row); - vst1_u8(dst, above_right); -} - -static INLINE void d45_store_16(uint8_t **dst, const ptrdiff_t stride, - const uint8x16_t above_right, uint8x16_t *row) { - *row = vextq_u8(*row, above_right, 1); - vst1q_u8(*dst, *row); - *dst += stride; + a0 = vld1_u8(above + 0); + a1 = vld1_u8(above + 1); + a7 = vld1_dup_u8(above + 7); + + // We want to calculate the AVG3 result in lanes 1-7 inclusive so we can + // shift in above[7] later, so shift a0 across by one to get the right + // inputs: + // [ x, above[0], ... , above[6] ] + ax0 = vext_u8(a0, a0, 7); + + // d0[0] = x (don't care) + // d0[1] = AVG3(above[0], above[1], above[2]); + // ... + // d0[7] = AVG3(above[6], above[7], above[8]); + d0 = vrhadd_u8(vhadd_u8(ax0, a1), a0); + + // Undo the earlier ext, incrementally shift in duplicates of above[7]. + vst1_u8(dst + 0 * stride, vext_u8(d0, a7, 1)); + vst1_u8(dst + 1 * stride, vext_u8(d0, a7, 2)); + vst1_u8(dst + 2 * stride, vext_u8(d0, a7, 3)); + vst1_u8(dst + 3 * stride, vext_u8(d0, a7, 4)); + vst1_u8(dst + 4 * stride, vext_u8(d0, a7, 5)); + vst1_u8(dst + 5 * stride, vext_u8(d0, a7, 6)); + vst1_u8(dst + 6 * stride, vext_u8(d0, a7, 7)); + vst1_u8(dst + 7 * stride, a7); } void vpx_d45_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint8x16_t A0 = vld1q_u8(above); - const uint8x16_t above_right = vdupq_lane_u8(vget_high_u8(A0), 7); - const uint8x16_t A1 = vextq_u8(A0, above_right, 1); - const uint8x16_t A2 = vextq_u8(A0, above_right, 2); - const uint8x16_t avg1 = vhaddq_u8(A0, A2); - uint8x16_t row = vrhaddq_u8(avg1, A1); + uint8x16_t ax0, a0, a1, a15, d0; (void)left; - vst1q_u8(dst, row); - dst += stride; - d45_store_16(&dst, stride, above_right, &row); - d45_store_16(&dst, stride, above_right, &row); - d45_store_16(&dst, stride, above_right, &row); - d45_store_16(&dst, stride, above_right, &row); - d45_store_16(&dst, stride, above_right, &row); - d45_store_16(&dst, stride, above_right, &row); - d45_store_16(&dst, stride, above_right, &row); - d45_store_16(&dst, stride, above_right, &row); - d45_store_16(&dst, stride, above_right, &row); - d45_store_16(&dst, stride, above_right, &row); - d45_store_16(&dst, stride, above_right, &row); - d45_store_16(&dst, stride, above_right, &row); - d45_store_16(&dst, stride, above_right, &row); - d45_store_16(&dst, stride, above_right, &row); - vst1q_u8(dst, above_right); + a0 = vld1q_u8(above + 0); + a1 = vld1q_u8(above + 1); + a15 = vld1q_dup_u8(above + 15); + + // We want to calculate the AVG3 result in lanes 1-15 inclusive so we can + // shift in above[15] later, so shift a0 across by one to get the right + // inputs: + // [ x, above[0], ... , above[14] ] + ax0 = vextq_u8(a0, a0, 15); + + // d0[0] = x (don't care) + // d0[1] = AVG3(above[0], above[1], above[2]); + // ... + // d0[15] = AVG3(above[14], above[15], above[16]); + d0 = vrhaddq_u8(vhaddq_u8(ax0, a1), a0); + + // Undo the earlier ext, incrementally shift in duplicates of above[15]. + vst1q_u8(dst + 0 * stride, vextq_u8(d0, a15, 1)); + vst1q_u8(dst + 1 * stride, vextq_u8(d0, a15, 2)); + vst1q_u8(dst + 2 * stride, vextq_u8(d0, a15, 3)); + vst1q_u8(dst + 3 * stride, vextq_u8(d0, a15, 4)); + vst1q_u8(dst + 4 * stride, vextq_u8(d0, a15, 5)); + vst1q_u8(dst + 5 * stride, vextq_u8(d0, a15, 6)); + vst1q_u8(dst + 6 * stride, vextq_u8(d0, a15, 7)); + vst1q_u8(dst + 7 * stride, vextq_u8(d0, a15, 8)); + vst1q_u8(dst + 8 * stride, vextq_u8(d0, a15, 9)); + vst1q_u8(dst + 9 * stride, vextq_u8(d0, a15, 10)); + vst1q_u8(dst + 10 * stride, vextq_u8(d0, a15, 11)); + vst1q_u8(dst + 11 * stride, vextq_u8(d0, a15, 12)); + vst1q_u8(dst + 12 * stride, vextq_u8(d0, a15, 13)); + vst1q_u8(dst + 13 * stride, vextq_u8(d0, a15, 14)); + vst1q_u8(dst + 14 * stride, vextq_u8(d0, a15, 15)); + vst1q_u8(dst + 15 * stride, a15); } void vpx_d45_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint8x16_t A0_0 = vld1q_u8(above); - const uint8x16_t A0_1 = vld1q_u8(above + 16); - const uint8x16_t above_right = vdupq_lane_u8(vget_high_u8(A0_1), 7); - const uint8x16_t A1_0 = vld1q_u8(above + 1); - const uint8x16_t A1_1 = vld1q_u8(above + 17); - const uint8x16_t A2_0 = vld1q_u8(above + 2); - const uint8x16_t A2_1 = vld1q_u8(above + 18); - const uint8x16_t avg_0 = vhaddq_u8(A0_0, A2_0); - const uint8x16_t avg_1 = vhaddq_u8(A0_1, A2_1); - uint8x16_t row_0 = vrhaddq_u8(avg_0, A1_0); - uint8x16_t row_1 = vrhaddq_u8(avg_1, A1_1); - int i; + uint8x16_t ax0, a0, a1, a15, a16, a17, a31, d0[2]; (void)left; - vst1q_u8(dst, row_0); - dst += 16; - vst1q_u8(dst, row_1); - dst += stride - 16; + a0 = vld1q_u8(above + 0); + a1 = vld1q_u8(above + 1); + a15 = vld1q_u8(above + 15); + a16 = vld1q_u8(above + 16); + a17 = vld1q_u8(above + 17); + a31 = vld1q_dup_u8(above + 31); - for (i = 0; i < 30; ++i) { - row_0 = vextq_u8(row_0, row_1, 1); - row_1 = vextq_u8(row_1, above_right, 1); - vst1q_u8(dst, row_0); - dst += 16; - vst1q_u8(dst, row_1); - dst += stride - 16; - } + // We want to calculate the AVG3 result in lanes 1-15 inclusive so we can + // shift in above[15] later, so shift a0 across by one to get the right + // inputs: + // [ x, above[0], ... , above[14] ] + ax0 = vextq_u8(a0, a0, 15); - vst1q_u8(dst, above_right); - dst += 16; - vst1q_u8(dst, row_1); + // d0[0] = x (don't care) + // d0[1] = AVG3(above[0], above[1], above[2]); + // ... + // d0[15] = AVG3(above[14], above[15], above[16]); + d0[0] = vrhaddq_u8(vhaddq_u8(ax0, a1), a0); + d0[1] = vrhaddq_u8(vhaddq_u8(a15, a17), a16); + + // Undo the earlier ext, incrementally shift in duplicates of above[15]. + vst1q_u8(dst + 0 * stride + 0, vextq_u8(d0[0], d0[1], 1)); + vst1q_u8(dst + 0 * stride + 16, vextq_u8(d0[1], a31, 1)); + vst1q_u8(dst + 1 * stride + 0, vextq_u8(d0[0], d0[1], 2)); + vst1q_u8(dst + 1 * stride + 16, vextq_u8(d0[1], a31, 2)); + vst1q_u8(dst + 2 * stride + 0, vextq_u8(d0[0], d0[1], 3)); + vst1q_u8(dst + 2 * stride + 16, vextq_u8(d0[1], a31, 3)); + vst1q_u8(dst + 3 * stride + 0, vextq_u8(d0[0], d0[1], 4)); + vst1q_u8(dst + 3 * stride + 16, vextq_u8(d0[1], a31, 4)); + vst1q_u8(dst + 4 * stride + 0, vextq_u8(d0[0], d0[1], 5)); + vst1q_u8(dst + 4 * stride + 16, vextq_u8(d0[1], a31, 5)); + vst1q_u8(dst + 5 * stride + 0, vextq_u8(d0[0], d0[1], 6)); + vst1q_u8(dst + 5 * stride + 16, vextq_u8(d0[1], a31, 6)); + vst1q_u8(dst + 6 * stride + 0, vextq_u8(d0[0], d0[1], 7)); + vst1q_u8(dst + 6 * stride + 16, vextq_u8(d0[1], a31, 7)); + vst1q_u8(dst + 7 * stride + 0, vextq_u8(d0[0], d0[1], 8)); + vst1q_u8(dst + 7 * stride + 16, vextq_u8(d0[1], a31, 8)); + vst1q_u8(dst + 8 * stride + 0, vextq_u8(d0[0], d0[1], 9)); + vst1q_u8(dst + 8 * stride + 16, vextq_u8(d0[1], a31, 9)); + vst1q_u8(dst + 9 * stride + 0, vextq_u8(d0[0], d0[1], 10)); + vst1q_u8(dst + 9 * stride + 16, vextq_u8(d0[1], a31, 10)); + vst1q_u8(dst + 10 * stride + 0, vextq_u8(d0[0], d0[1], 11)); + vst1q_u8(dst + 10 * stride + 16, vextq_u8(d0[1], a31, 11)); + vst1q_u8(dst + 11 * stride + 0, vextq_u8(d0[0], d0[1], 12)); + vst1q_u8(dst + 11 * stride + 16, vextq_u8(d0[1], a31, 12)); + vst1q_u8(dst + 12 * stride + 0, vextq_u8(d0[0], d0[1], 13)); + vst1q_u8(dst + 12 * stride + 16, vextq_u8(d0[1], a31, 13)); + vst1q_u8(dst + 13 * stride + 0, vextq_u8(d0[0], d0[1], 14)); + vst1q_u8(dst + 13 * stride + 16, vextq_u8(d0[1], a31, 14)); + vst1q_u8(dst + 14 * stride + 0, vextq_u8(d0[0], d0[1], 15)); + vst1q_u8(dst + 14 * stride + 16, vextq_u8(d0[1], a31, 15)); + vst1q_u8(dst + 15 * stride + 0, d0[1]); + vst1q_u8(dst + 15 * stride + 16, a31); + + vst1q_u8(dst + 16 * stride + 0, vextq_u8(d0[1], a31, 1)); + vst1q_u8(dst + 16 * stride + 16, a31); + vst1q_u8(dst + 17 * stride + 0, vextq_u8(d0[1], a31, 2)); + vst1q_u8(dst + 17 * stride + 16, a31); + vst1q_u8(dst + 18 * stride + 0, vextq_u8(d0[1], a31, 3)); + vst1q_u8(dst + 18 * stride + 16, a31); + vst1q_u8(dst + 19 * stride + 0, vextq_u8(d0[1], a31, 4)); + vst1q_u8(dst + 19 * stride + 16, a31); + vst1q_u8(dst + 20 * stride + 0, vextq_u8(d0[1], a31, 5)); + vst1q_u8(dst + 20 * stride + 16, a31); + vst1q_u8(dst + 21 * stride + 0, vextq_u8(d0[1], a31, 6)); + vst1q_u8(dst + 21 * stride + 16, a31); + vst1q_u8(dst + 22 * stride + 0, vextq_u8(d0[1], a31, 7)); + vst1q_u8(dst + 22 * stride + 16, a31); + vst1q_u8(dst + 23 * stride + 0, vextq_u8(d0[1], a31, 8)); + vst1q_u8(dst + 23 * stride + 16, a31); + vst1q_u8(dst + 24 * stride + 0, vextq_u8(d0[1], a31, 9)); + vst1q_u8(dst + 24 * stride + 16, a31); + vst1q_u8(dst + 25 * stride + 0, vextq_u8(d0[1], a31, 10)); + vst1q_u8(dst + 25 * stride + 16, a31); + vst1q_u8(dst + 26 * stride + 0, vextq_u8(d0[1], a31, 11)); + vst1q_u8(dst + 26 * stride + 16, a31); + vst1q_u8(dst + 27 * stride + 0, vextq_u8(d0[1], a31, 12)); + vst1q_u8(dst + 27 * stride + 16, a31); + vst1q_u8(dst + 28 * stride + 0, vextq_u8(d0[1], a31, 13)); + vst1q_u8(dst + 28 * stride + 16, a31); + vst1q_u8(dst + 29 * stride + 0, vextq_u8(d0[1], a31, 14)); + vst1q_u8(dst + 29 * stride + 16, a31); + vst1q_u8(dst + 30 * stride + 0, vextq_u8(d0[1], a31, 15)); + vst1q_u8(dst + 30 * stride + 16, a31); + vst1q_u8(dst + 31 * stride + 0, a31); + vst1q_u8(dst + 31 * stride + 16, a31); } // ----------------------------------------------------------------------------- From 911d6e165eb19e03ec1532fa20098b10ad402e39 Mon Sep 17 00:00:00 2001 From: George Steed Date: Fri, 17 Mar 2023 19:55:17 +0000 Subject: [PATCH 0706/1000] Allow non-uniform above array in d63 predictor Neon impl The existing standard bitdepth implementation doesn't appear to manifest as a failure in any of the predictor or MD5 tests, but it does rely on the predictor tests filling the second `bs` elements of the `above` input array with copies of `above[bs - 1]` in order to match the C implementation. This patch adjusts the Neon implementation to correctly match the C implementation in the case where the elements of the `above` array all differ. The geomean of performance for the predictor is approximately a 2% slowdown compared to the previous vectorized implementation. This is still considerably faster than the unspecialized naive C implementation. Bug: webm:1797 Change-Id: I8fb00a154288d54b24a72a7ff63c816bdcf3aca3 --- vpx_dsp/arm/intrapred_neon.c | 174 +++++++++++++++++++---------------- 1 file changed, 94 insertions(+), 80 deletions(-) diff --git a/vpx_dsp/arm/intrapred_neon.c b/vpx_dsp/arm/intrapred_neon.c index 7c225f6b71..3d117fa938 100644 --- a/vpx_dsp/arm/intrapred_neon.c +++ b/vpx_dsp/arm/intrapred_neon.c @@ -499,12 +499,16 @@ void vpx_d63_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, vst1_u8(dst + 0 * stride, d0); vst1_u8(dst + 1 * stride, d1); - vst1_u8(dst + 2 * stride, vext_u8(d0, a7, 1)); - vst1_u8(dst + 3 * stride, vext_u8(d1, a7, 1)); - vst1_u8(dst + 4 * stride, vext_u8(d0, a7, 2)); - vst1_u8(dst + 5 * stride, vext_u8(d1, a7, 2)); - vst1_u8(dst + 6 * stride, vext_u8(d0, a7, 3)); - vst1_u8(dst + 7 * stride, vext_u8(d1, a7, 3)); + + d0 = vext_u8(d0, d0, 7); + d1 = vext_u8(d1, d1, 7); + + vst1_u8(dst + 2 * stride, vext_u8(d0, a7, 2)); + vst1_u8(dst + 3 * stride, vext_u8(d1, a7, 2)); + vst1_u8(dst + 4 * stride, vext_u8(d0, a7, 3)); + vst1_u8(dst + 5 * stride, vext_u8(d1, a7, 3)); + vst1_u8(dst + 6 * stride, vext_u8(d0, a7, 4)); + vst1_u8(dst + 7 * stride, vext_u8(d1, a7, 4)); } void vpx_d63_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, @@ -522,20 +526,24 @@ void vpx_d63_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, vst1q_u8(dst + 0 * stride, d0); vst1q_u8(dst + 1 * stride, d1); - vst1q_u8(dst + 2 * stride, vextq_u8(d0, a15, 1)); - vst1q_u8(dst + 3 * stride, vextq_u8(d1, a15, 1)); - vst1q_u8(dst + 4 * stride, vextq_u8(d0, a15, 2)); - vst1q_u8(dst + 5 * stride, vextq_u8(d1, a15, 2)); - vst1q_u8(dst + 6 * stride, vextq_u8(d0, a15, 3)); - vst1q_u8(dst + 7 * stride, vextq_u8(d1, a15, 3)); - vst1q_u8(dst + 8 * stride, vextq_u8(d0, a15, 4)); - vst1q_u8(dst + 9 * stride, vextq_u8(d1, a15, 4)); - vst1q_u8(dst + 10 * stride, vextq_u8(d0, a15, 5)); - vst1q_u8(dst + 11 * stride, vextq_u8(d1, a15, 5)); - vst1q_u8(dst + 12 * stride, vextq_u8(d0, a15, 6)); - vst1q_u8(dst + 13 * stride, vextq_u8(d1, a15, 6)); - vst1q_u8(dst + 14 * stride, vextq_u8(d0, a15, 7)); - vst1q_u8(dst + 15 * stride, vextq_u8(d1, a15, 7)); + + d0 = vextq_u8(d0, d0, 15); + d1 = vextq_u8(d1, d1, 15); + + vst1q_u8(dst + 2 * stride, vextq_u8(d0, a15, 2)); + vst1q_u8(dst + 3 * stride, vextq_u8(d1, a15, 2)); + vst1q_u8(dst + 4 * stride, vextq_u8(d0, a15, 3)); + vst1q_u8(dst + 5 * stride, vextq_u8(d1, a15, 3)); + vst1q_u8(dst + 6 * stride, vextq_u8(d0, a15, 4)); + vst1q_u8(dst + 7 * stride, vextq_u8(d1, a15, 4)); + vst1q_u8(dst + 8 * stride, vextq_u8(d0, a15, 5)); + vst1q_u8(dst + 9 * stride, vextq_u8(d1, a15, 5)); + vst1q_u8(dst + 10 * stride, vextq_u8(d0, a15, 6)); + vst1q_u8(dst + 11 * stride, vextq_u8(d1, a15, 6)); + vst1q_u8(dst + 12 * stride, vextq_u8(d0, a15, 7)); + vst1q_u8(dst + 13 * stride, vextq_u8(d1, a15, 7)); + vst1q_u8(dst + 14 * stride, vextq_u8(d0, a15, 8)); + vst1q_u8(dst + 15 * stride, vextq_u8(d1, a15, 8)); } void vpx_d63_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, @@ -560,66 +568,72 @@ void vpx_d63_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, vst1q_u8(dst + 0 * stride + 16, d0_hi); vst1q_u8(dst + 1 * stride + 0, d1_lo); vst1q_u8(dst + 1 * stride + 16, d1_hi); - vst1q_u8(dst + 2 * stride + 0, vextq_u8(d0_lo, d0_hi, 1)); - vst1q_u8(dst + 2 * stride + 16, vextq_u8(d0_hi, a31, 1)); - vst1q_u8(dst + 3 * stride + 0, vextq_u8(d1_lo, d1_hi, 1)); - vst1q_u8(dst + 3 * stride + 16, vextq_u8(d1_hi, a31, 1)); - vst1q_u8(dst + 4 * stride + 0, vextq_u8(d0_lo, d0_hi, 2)); - vst1q_u8(dst + 4 * stride + 16, vextq_u8(d0_hi, a31, 2)); - vst1q_u8(dst + 5 * stride + 0, vextq_u8(d1_lo, d1_hi, 2)); - vst1q_u8(dst + 5 * stride + 16, vextq_u8(d1_hi, a31, 2)); - vst1q_u8(dst + 6 * stride + 0, vextq_u8(d0_lo, d0_hi, 3)); - vst1q_u8(dst + 6 * stride + 16, vextq_u8(d0_hi, a31, 3)); - vst1q_u8(dst + 7 * stride + 0, vextq_u8(d1_lo, d1_hi, 3)); - vst1q_u8(dst + 7 * stride + 16, vextq_u8(d1_hi, a31, 3)); - vst1q_u8(dst + 8 * stride + 0, vextq_u8(d0_lo, d0_hi, 4)); - vst1q_u8(dst + 8 * stride + 16, vextq_u8(d0_hi, a31, 4)); - vst1q_u8(dst + 9 * stride + 0, vextq_u8(d1_lo, d1_hi, 4)); - vst1q_u8(dst + 9 * stride + 16, vextq_u8(d1_hi, a31, 4)); - vst1q_u8(dst + 10 * stride + 0, vextq_u8(d0_lo, d0_hi, 5)); - vst1q_u8(dst + 10 * stride + 16, vextq_u8(d0_hi, a31, 5)); - vst1q_u8(dst + 11 * stride + 0, vextq_u8(d1_lo, d1_hi, 5)); - vst1q_u8(dst + 11 * stride + 16, vextq_u8(d1_hi, a31, 5)); - vst1q_u8(dst + 12 * stride + 0, vextq_u8(d0_lo, d0_hi, 6)); - vst1q_u8(dst + 12 * stride + 16, vextq_u8(d0_hi, a31, 6)); - vst1q_u8(dst + 13 * stride + 0, vextq_u8(d1_lo, d1_hi, 6)); - vst1q_u8(dst + 13 * stride + 16, vextq_u8(d1_hi, a31, 6)); - vst1q_u8(dst + 14 * stride + 0, vextq_u8(d0_lo, d0_hi, 7)); - vst1q_u8(dst + 14 * stride + 16, vextq_u8(d0_hi, a31, 7)); - vst1q_u8(dst + 15 * stride + 0, vextq_u8(d1_lo, d1_hi, 7)); - vst1q_u8(dst + 15 * stride + 16, vextq_u8(d1_hi, a31, 7)); - vst1q_u8(dst + 16 * stride + 0, vextq_u8(d0_lo, d0_hi, 8)); - vst1q_u8(dst + 16 * stride + 16, vextq_u8(d0_hi, a31, 8)); - vst1q_u8(dst + 17 * stride + 0, vextq_u8(d1_lo, d1_hi, 8)); - vst1q_u8(dst + 17 * stride + 16, vextq_u8(d1_hi, a31, 8)); - vst1q_u8(dst + 18 * stride + 0, vextq_u8(d0_lo, d0_hi, 9)); - vst1q_u8(dst + 18 * stride + 16, vextq_u8(d0_hi, a31, 9)); - vst1q_u8(dst + 19 * stride + 0, vextq_u8(d1_lo, d1_hi, 9)); - vst1q_u8(dst + 19 * stride + 16, vextq_u8(d1_hi, a31, 9)); - vst1q_u8(dst + 20 * stride + 0, vextq_u8(d0_lo, d0_hi, 10)); - vst1q_u8(dst + 20 * stride + 16, vextq_u8(d0_hi, a31, 10)); - vst1q_u8(dst + 21 * stride + 0, vextq_u8(d1_lo, d1_hi, 10)); - vst1q_u8(dst + 21 * stride + 16, vextq_u8(d1_hi, a31, 10)); - vst1q_u8(dst + 22 * stride + 0, vextq_u8(d0_lo, d0_hi, 11)); - vst1q_u8(dst + 22 * stride + 16, vextq_u8(d0_hi, a31, 11)); - vst1q_u8(dst + 23 * stride + 0, vextq_u8(d1_lo, d1_hi, 11)); - vst1q_u8(dst + 23 * stride + 16, vextq_u8(d1_hi, a31, 11)); - vst1q_u8(dst + 24 * stride + 0, vextq_u8(d0_lo, d0_hi, 12)); - vst1q_u8(dst + 24 * stride + 16, vextq_u8(d0_hi, a31, 12)); - vst1q_u8(dst + 25 * stride + 0, vextq_u8(d1_lo, d1_hi, 12)); - vst1q_u8(dst + 25 * stride + 16, vextq_u8(d1_hi, a31, 12)); - vst1q_u8(dst + 26 * stride + 0, vextq_u8(d0_lo, d0_hi, 13)); - vst1q_u8(dst + 26 * stride + 16, vextq_u8(d0_hi, a31, 13)); - vst1q_u8(dst + 27 * stride + 0, vextq_u8(d1_lo, d1_hi, 13)); - vst1q_u8(dst + 27 * stride + 16, vextq_u8(d1_hi, a31, 13)); - vst1q_u8(dst + 28 * stride + 0, vextq_u8(d0_lo, d0_hi, 14)); - vst1q_u8(dst + 28 * stride + 16, vextq_u8(d0_hi, a31, 14)); - vst1q_u8(dst + 29 * stride + 0, vextq_u8(d1_lo, d1_hi, 14)); - vst1q_u8(dst + 29 * stride + 16, vextq_u8(d1_hi, a31, 14)); - vst1q_u8(dst + 30 * stride + 0, vextq_u8(d0_lo, d0_hi, 15)); - vst1q_u8(dst + 30 * stride + 16, vextq_u8(d0_hi, a31, 15)); - vst1q_u8(dst + 31 * stride + 0, vextq_u8(d1_lo, d1_hi, 15)); - vst1q_u8(dst + 31 * stride + 16, vextq_u8(d1_hi, a31, 15)); + + d0_hi = vextq_u8(d0_lo, d0_hi, 15); + d0_lo = vextq_u8(d0_lo, d0_lo, 15); + d1_hi = vextq_u8(d1_lo, d1_hi, 15); + d1_lo = vextq_u8(d1_lo, d1_lo, 15); + + vst1q_u8(dst + 2 * stride + 0, vextq_u8(d0_lo, d0_hi, 2)); + vst1q_u8(dst + 2 * stride + 16, vextq_u8(d0_hi, a31, 2)); + vst1q_u8(dst + 3 * stride + 0, vextq_u8(d1_lo, d1_hi, 2)); + vst1q_u8(dst + 3 * stride + 16, vextq_u8(d1_hi, a31, 2)); + vst1q_u8(dst + 4 * stride + 0, vextq_u8(d0_lo, d0_hi, 3)); + vst1q_u8(dst + 4 * stride + 16, vextq_u8(d0_hi, a31, 3)); + vst1q_u8(dst + 5 * stride + 0, vextq_u8(d1_lo, d1_hi, 3)); + vst1q_u8(dst + 5 * stride + 16, vextq_u8(d1_hi, a31, 3)); + vst1q_u8(dst + 6 * stride + 0, vextq_u8(d0_lo, d0_hi, 4)); + vst1q_u8(dst + 6 * stride + 16, vextq_u8(d0_hi, a31, 4)); + vst1q_u8(dst + 7 * stride + 0, vextq_u8(d1_lo, d1_hi, 4)); + vst1q_u8(dst + 7 * stride + 16, vextq_u8(d1_hi, a31, 4)); + vst1q_u8(dst + 8 * stride + 0, vextq_u8(d0_lo, d0_hi, 5)); + vst1q_u8(dst + 8 * stride + 16, vextq_u8(d0_hi, a31, 5)); + vst1q_u8(dst + 9 * stride + 0, vextq_u8(d1_lo, d1_hi, 5)); + vst1q_u8(dst + 9 * stride + 16, vextq_u8(d1_hi, a31, 5)); + vst1q_u8(dst + 10 * stride + 0, vextq_u8(d0_lo, d0_hi, 6)); + vst1q_u8(dst + 10 * stride + 16, vextq_u8(d0_hi, a31, 6)); + vst1q_u8(dst + 11 * stride + 0, vextq_u8(d1_lo, d1_hi, 6)); + vst1q_u8(dst + 11 * stride + 16, vextq_u8(d1_hi, a31, 6)); + vst1q_u8(dst + 12 * stride + 0, vextq_u8(d0_lo, d0_hi, 7)); + vst1q_u8(dst + 12 * stride + 16, vextq_u8(d0_hi, a31, 7)); + vst1q_u8(dst + 13 * stride + 0, vextq_u8(d1_lo, d1_hi, 7)); + vst1q_u8(dst + 13 * stride + 16, vextq_u8(d1_hi, a31, 7)); + vst1q_u8(dst + 14 * stride + 0, vextq_u8(d0_lo, d0_hi, 8)); + vst1q_u8(dst + 14 * stride + 16, vextq_u8(d0_hi, a31, 8)); + vst1q_u8(dst + 15 * stride + 0, vextq_u8(d1_lo, d1_hi, 8)); + vst1q_u8(dst + 15 * stride + 16, vextq_u8(d1_hi, a31, 8)); + vst1q_u8(dst + 16 * stride + 0, vextq_u8(d0_lo, d0_hi, 9)); + vst1q_u8(dst + 16 * stride + 16, vextq_u8(d0_hi, a31, 9)); + vst1q_u8(dst + 17 * stride + 0, vextq_u8(d1_lo, d1_hi, 9)); + vst1q_u8(dst + 17 * stride + 16, vextq_u8(d1_hi, a31, 9)); + vst1q_u8(dst + 18 * stride + 0, vextq_u8(d0_lo, d0_hi, 10)); + vst1q_u8(dst + 18 * stride + 16, vextq_u8(d0_hi, a31, 10)); + vst1q_u8(dst + 19 * stride + 0, vextq_u8(d1_lo, d1_hi, 10)); + vst1q_u8(dst + 19 * stride + 16, vextq_u8(d1_hi, a31, 10)); + vst1q_u8(dst + 20 * stride + 0, vextq_u8(d0_lo, d0_hi, 11)); + vst1q_u8(dst + 20 * stride + 16, vextq_u8(d0_hi, a31, 11)); + vst1q_u8(dst + 21 * stride + 0, vextq_u8(d1_lo, d1_hi, 11)); + vst1q_u8(dst + 21 * stride + 16, vextq_u8(d1_hi, a31, 11)); + vst1q_u8(dst + 22 * stride + 0, vextq_u8(d0_lo, d0_hi, 12)); + vst1q_u8(dst + 22 * stride + 16, vextq_u8(d0_hi, a31, 12)); + vst1q_u8(dst + 23 * stride + 0, vextq_u8(d1_lo, d1_hi, 12)); + vst1q_u8(dst + 23 * stride + 16, vextq_u8(d1_hi, a31, 12)); + vst1q_u8(dst + 24 * stride + 0, vextq_u8(d0_lo, d0_hi, 13)); + vst1q_u8(dst + 24 * stride + 16, vextq_u8(d0_hi, a31, 13)); + vst1q_u8(dst + 25 * stride + 0, vextq_u8(d1_lo, d1_hi, 13)); + vst1q_u8(dst + 25 * stride + 16, vextq_u8(d1_hi, a31, 13)); + vst1q_u8(dst + 26 * stride + 0, vextq_u8(d0_lo, d0_hi, 14)); + vst1q_u8(dst + 26 * stride + 16, vextq_u8(d0_hi, a31, 14)); + vst1q_u8(dst + 27 * stride + 0, vextq_u8(d1_lo, d1_hi, 14)); + vst1q_u8(dst + 27 * stride + 16, vextq_u8(d1_hi, a31, 14)); + vst1q_u8(dst + 28 * stride + 0, vextq_u8(d0_lo, d0_hi, 15)); + vst1q_u8(dst + 28 * stride + 16, vextq_u8(d0_hi, a31, 15)); + vst1q_u8(dst + 29 * stride + 0, vextq_u8(d1_lo, d1_hi, 15)); + vst1q_u8(dst + 29 * stride + 16, vextq_u8(d1_hi, a31, 15)); + vst1q_u8(dst + 30 * stride + 0, d0_hi); + vst1q_u8(dst + 30 * stride + 16, a31); + vst1q_u8(dst + 31 * stride + 0, d1_hi); + vst1q_u8(dst + 31 * stride + 16, a31); } // ----------------------------------------------------------------------------- From 100ca0356ddf67e92da35699d92bc180429d0bc1 Mon Sep 17 00:00:00 2001 From: George Steed Date: Fri, 17 Mar 2023 20:00:24 +0000 Subject: [PATCH 0707/1000] Randomize second half of above_row_ in intrapred tests for Neon The existing tests duplicate `above_row_[block_size - 1]` after the first `block_size` elements, which can lead to tests incorrectly passing due to differing behaviour when calculating the average for the last elements of the output. This change adjusts the above array setup to be fully random instead, allowing us to catch such issues here rather than in other larger tests like the external MD5 tests. It doesn't appear that other architectures are fully clean with this change so restrict it to just Neon for now until they are fixed. Bug: webm:1797 Change-Id: If83ff1adbf1e8d30f2a92474d7186c65840a5d0b --- test/vp9_intrapred_test.cc | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/test/vp9_intrapred_test.cc b/test/vp9_intrapred_test.cc index cec9031618..6de7cf8d0f 100644 --- a/test/vp9_intrapred_test.cc +++ b/test/vp9_intrapred_test.cc @@ -55,6 +55,21 @@ class IntraPredTest : public ::testing::TestWithParam { ref_dst_ = ref_dst; int error_count = 0; for (int i = 0; i < count_test_block; ++i) { + // TODO(webm:1797): Some of the optimised predictor implementations rely + // on the trailing half of the above_row_ being a copy of the final + // element, however relying on this in some cases can cause the MD5 tests + // to fail. We have fixed all of these cases for Neon, so fill the whole + // of above_row_ randomly. +#if HAVE_NEON + // Fill edges with random data, try first with saturated values. + for (int x = -1; x < 2 * block_size; x++) { + if (i == 0) { + above_row_[x] = mask_; + } else { + above_row_[x] = rnd.Rand16() & mask_; + } + } +#else // Fill edges with random data, try first with saturated values. for (int x = -1; x < block_size; x++) { if (i == 0) { @@ -66,6 +81,7 @@ class IntraPredTest : public ::testing::TestWithParam { for (int x = block_size; x < 2 * block_size; x++) { above_row_[x] = above_row_[block_size - 1]; } +#endif for (int y = 0; y < block_size; y++) { if (i == 0) { left_col_[y] = mask_; From 972149cafeb71d6f08df89e91a0130d6a38c4b15 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Tue, 28 Mar 2023 10:09:16 -0400 Subject: [PATCH 0708/1000] svc: Fix a case where target bandwidth is 0 Bug: webrtc:15033 Change-Id: I28636de66842671b03284408186c4c18254109a5 --- vp9/encoder/vp9_svc_layercontext.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c index c60445cba5..ac8b09cf31 100644 --- a/vp9/encoder/vp9_svc_layercontext.c +++ b/vp9/encoder/vp9_svc_layercontext.c @@ -220,7 +220,9 @@ void vp9_update_layer_context_change_config(VP9_COMP *const cpi, RATE_CONTROL *const lrc = &lc->rc; lc->spatial_layer_target_bandwidth = spatial_layer_target; - bitrate_alloc = (float)lc->target_bandwidth / target_bandwidth; + if (target_bandwidth != 0) { + bitrate_alloc = (float)lc->target_bandwidth / target_bandwidth; + } lrc->starting_buffer_level = (int64_t)(rc->starting_buffer_level * bitrate_alloc); lrc->optimal_buffer_level = From 4cf9819282aa123e8b126731ef5629ee5144cd86 Mon Sep 17 00:00:00 2001 From: George Steed Date: Tue, 21 Mar 2023 14:31:50 +0000 Subject: [PATCH 0709/1000] Avoid LD2/ST2 instructions in vpx_dc_predictor_32x32_neon The LD2 and ST2 instructions are useful if we are dealing with interleaved data (e.g. real/imag components of complex numbers), but for simply loading or storing larger quantities of data it is preferable to simply use two of the normal load/store instructions. This patch replaces such occurrences in vpx_dc_predictor_32x32_neon and related functions. With Clang-15 this speeds up this function by 10-30% depending on the micro-architecture being benchmarked on. With GCC-12 this speeds up the function by 40-60% depending on the micro-architecture being benchmarked on. Change-Id: I670dc37908aa238f360104efd74d6c2108ecf945 --- vpx_dsp/arm/intrapred_neon.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/vpx_dsp/arm/intrapred_neon.c b/vpx_dsp/arm/intrapred_neon.c index 892310f151..b7f2a11ca2 100644 --- a/vpx_dsp/arm/intrapred_neon.c +++ b/vpx_dsp/arm/intrapred_neon.c @@ -193,9 +193,10 @@ void vpx_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, // DC 32x32 static INLINE uint16x4_t dc_sum_32(const uint8_t *ref) { - const uint8x16x2_t r = vld2q_u8(ref); - const uint16x8_t p0 = vpaddlq_u8(r.val[0]); - const uint16x8_t p1 = vpaddlq_u8(r.val[1]); + const uint8x16_t r0 = vld1q_u8(ref + 0); + const uint8x16_t r1 = vld1q_u8(ref + 16); + const uint16x8_t p0 = vpaddlq_u8(r0); + const uint16x8_t p1 = vpaddlq_u8(r1); const uint16x8_t p2 = vaddq_u16(p0, p1); uint16x4_t sum = vadd_u16(vget_low_u16(p2), vget_high_u16(p2)); sum = vpadd_u16(sum, sum); @@ -204,23 +205,24 @@ static INLINE uint16x4_t dc_sum_32(const uint8_t *ref) { static INLINE void dc_store_32x32(uint8_t *dst, ptrdiff_t stride, const uint8x8_t dc) { - uint8x16x2_t dc_dup; + uint8x16_t dc_dup = vdupq_lane_u8(dc, 0); int i; - dc_dup.val[0] = dc_dup.val[1] = vdupq_lane_u8(dc, 0); - for (i = 0; i < 32; ++i, dst += stride) { - vst2q_u8(dst, dc_dup); + vst1q_u8(dst + 0, dc_dup); + vst1q_u8(dst + 16, dc_dup); } } void vpx_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint8x16x2_t a = vld2q_u8(above); - const uint8x16x2_t l = vld2q_u8(left); - const uint16x8_t pa0 = vpaddlq_u8(a.val[0]); - const uint16x8_t pl0 = vpaddlq_u8(l.val[0]); - const uint16x8_t pa1 = vpaddlq_u8(a.val[1]); - const uint16x8_t pl1 = vpaddlq_u8(l.val[1]); + const uint8x16_t a0 = vld1q_u8(above + 0); + const uint8x16_t a1 = vld1q_u8(above + 16); + const uint8x16_t l0 = vld1q_u8(left + 0); + const uint8x16_t l1 = vld1q_u8(left + 16); + const uint16x8_t pa0 = vpaddlq_u8(a0); + const uint16x8_t pl0 = vpaddlq_u8(l0); + const uint16x8_t pa1 = vpaddlq_u8(a1); + const uint16x8_t pl1 = vpaddlq_u8(l1); const uint16x8_t pa = vaddq_u16(pa0, pa1); const uint16x8_t pl = vaddq_u16(pl0, pl1); const uint16x8_t pal = vaddq_u16(pa, pl); From 83def747ff316d283c949458a4b890b23e5e0b8b Mon Sep 17 00:00:00 2001 From: George Steed Date: Wed, 22 Mar 2023 08:44:26 +0000 Subject: [PATCH 0710/1000] Avoid interleaving loads/stores in Neon for highbd dc predictor The interleaving load/store instructions (LD2/LD3/LD4 and ST2/ST3/ST4) are useful if we are dealing with interleaved data (e.g. real/imag components of complex numbers), but for simply loading or storing larger quantities of data it is preferable to simply use two or more of the normal load/store instructions. This patch replaces such occurrences in the two larger block sizes: vpx_highbd_dc_predictor_16x16_neon, vpx_highbd_dc_predictor_32x32_neon, and related helper functions. Speedups over the original Neon code (higher is better): Microarch. | Compiler | Block | Speedup Neoverse N1 | LLVM 15 | 16x16 | 1.25 Neoverse N1 | LLVM 15 | 32x32 | 1.13 Neoverse N1 | GCC 12 | 16x16 | 1.56 Neoverse N1 | GCC 12 | 32x32 | 1.52 Neoverse V1 | LLVM 15 | 16x16 | 1.63 Neoverse V1 | LLVM 15 | 32x32 | 1.08 Neoverse V1 | GCC 12 | 16x16 | 1.59 Neoverse V1 | GCC 12 | 32x32 | 1.37 Change-Id: If5ec220aba9dd19785454eabb0f3d6affec0cc8b --- vpx_dsp/arm/highbd_intrapred_neon.c | 61 +++++++++++++++++------------ 1 file changed, 36 insertions(+), 25 deletions(-) diff --git a/vpx_dsp/arm/highbd_intrapred_neon.c b/vpx_dsp/arm/highbd_intrapred_neon.c index 503900915d..b2aea14f7b 100644 --- a/vpx_dsp/arm/highbd_intrapred_neon.c +++ b/vpx_dsp/arm/highbd_intrapred_neon.c @@ -143,8 +143,9 @@ void vpx_highbd_dc_128_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, // DC 16x16 static INLINE uint16x4_t dc_sum_16(const uint16_t *ref) { - const uint16x8x2_t ref_u16 = vld2q_u16(ref); - const uint16x8_t p0 = vaddq_u16(ref_u16.val[0], ref_u16.val[1]); + const uint16x8_t ref_u16_0 = vld1q_u16(ref + 0); + const uint16x8_t ref_u16_1 = vld1q_u16(ref + 8); + const uint16x8_t p0 = vaddq_u16(ref_u16_0, ref_u16_1); uint16x4_t sum = vadd_u16(vget_low_u16(p0), vget_high_u16(p0)); sum = vpadd_u16(sum, sum); return vpadd_u16(sum, sum); @@ -152,21 +153,23 @@ static INLINE uint16x4_t dc_sum_16(const uint16_t *ref) { static INLINE void dc_store_16x16(uint16_t *dst, ptrdiff_t stride, const uint16x4_t dc) { - uint16x8x2_t dc_dup; + uint16x8_t dc_dup = vdupq_lane_u16(dc, 0); int i; - dc_dup.val[0] = dc_dup.val[1] = vdupq_lane_u16(dc, 0); for (i = 0; i < 16; ++i, dst += stride) { - vst2q_u16(dst, dc_dup); + vst1q_u16(dst + 0, dc_dup); + vst1q_u16(dst + 8, dc_dup); } } void vpx_highbd_dc_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x8x2_t a = vld2q_u16(above); - const uint16x8x2_t l = vld2q_u16(left); - const uint16x8_t pa = vaddq_u16(a.val[0], a.val[1]); - const uint16x8_t pl = vaddq_u16(l.val[0], l.val[1]); + const uint16x8_t a0 = vld1q_u16(above + 0); + const uint16x8_t a1 = vld1q_u16(above + 8); + const uint16x8_t l0 = vld1q_u16(left + 0); + const uint16x8_t l1 = vld1q_u16(left + 8); + const uint16x8_t pa = vaddq_u16(a0, a1); + const uint16x8_t pl = vaddq_u16(l0, l1); const uint16x8_t pal0 = vaddq_u16(pa, pl); uint16x4_t pal1 = vadd_u16(vget_low_u16(pal0), vget_high_u16(pal0)); uint32x2_t sum; @@ -211,9 +214,12 @@ void vpx_highbd_dc_128_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, // DC 32x32 static INLINE uint32x2_t dc_sum_32(const uint16_t *ref) { - const uint16x8x4_t r = vld4q_u16(ref); - const uint16x8_t p0 = vaddq_u16(r.val[0], r.val[1]); - const uint16x8_t p1 = vaddq_u16(r.val[2], r.val[3]); + const uint16x8_t r0 = vld1q_u16(ref + 0); + const uint16x8_t r1 = vld1q_u16(ref + 8); + const uint16x8_t r2 = vld1q_u16(ref + 16); + const uint16x8_t r3 = vld1q_u16(ref + 24); + const uint16x8_t p0 = vaddq_u16(r0, r1); + const uint16x8_t p1 = vaddq_u16(r2, r3); const uint16x8_t p2 = vaddq_u16(p0, p1); uint16x4_t sum = vadd_u16(vget_low_u16(p2), vget_high_u16(p2)); sum = vpadd_u16(sum, sum); @@ -222,27 +228,32 @@ static INLINE uint32x2_t dc_sum_32(const uint16_t *ref) { static INLINE void dc_store_32x32(uint16_t *dst, ptrdiff_t stride, const uint16x4_t dc) { - uint16x8x2_t dc_dup; + uint16x8_t dc_dup = vdupq_lane_u16(dc, 0); int i; - dc_dup.val[0] = dc_dup.val[1] = vdupq_lane_u16(dc, 0); - for (i = 0; i < 32; ++i) { - vst2q_u16(dst, dc_dup); - dst += 16; - vst2q_u16(dst, dc_dup); - dst += stride - 16; + vst1q_u16(dst + 0, dc_dup); + vst1q_u16(dst + 8, dc_dup); + vst1q_u16(dst + 16, dc_dup); + vst1q_u16(dst + 24, dc_dup); + dst += stride; } } void vpx_highbd_dc_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x8x4_t a = vld4q_u16(above); - const uint16x8x4_t l = vld4q_u16(left); - const uint16x8_t pa0 = vaddq_u16(a.val[0], a.val[1]); - const uint16x8_t pa1 = vaddq_u16(a.val[2], a.val[3]); - const uint16x8_t pl0 = vaddq_u16(l.val[0], l.val[1]); - const uint16x8_t pl1 = vaddq_u16(l.val[2], l.val[3]); + const uint16x8_t a0 = vld1q_u16(above + 0); + const uint16x8_t a1 = vld1q_u16(above + 8); + const uint16x8_t a2 = vld1q_u16(above + 16); + const uint16x8_t a3 = vld1q_u16(above + 24); + const uint16x8_t l0 = vld1q_u16(left + 0); + const uint16x8_t l1 = vld1q_u16(left + 8); + const uint16x8_t l2 = vld1q_u16(left + 16); + const uint16x8_t l3 = vld1q_u16(left + 24); + const uint16x8_t pa0 = vaddq_u16(a0, a1); + const uint16x8_t pa1 = vaddq_u16(a2, a3); + const uint16x8_t pl0 = vaddq_u16(l0, l1); + const uint16x8_t pl1 = vaddq_u16(l2, l3); const uint16x8_t pa = vaddq_u16(pa0, pa1); const uint16x8_t pl = vaddq_u16(pl0, pl1); const uint16x8_t pal0 = vaddq_u16(pa, pl); From 9824167ad292ee42c9c97f3e6ce1d9ca90bf679f Mon Sep 17 00:00:00 2001 From: George Steed Date: Wed, 22 Mar 2023 11:49:33 +0000 Subject: [PATCH 0711/1000] Avoid LD2/ST2 instructions in highbd v predictors in Neon The interleaving load/store instructions (LD2/LD3/LD4 and ST2/ST3/ST4) are useful if we are dealing with interleaved data (e.g. real/imag components of complex numbers), but for simply loading or storing larger quantities of data it is preferable to simply use the normal load/store instructions. This patch replaces such occurrences in the two larger block sizes: vpx_highbd_v_predictor_16x16_neon and vpx_highbd_v_predictor_32x32_neon. Change-Id: Ie4ffa298a2466ceaf893566fd0aefe3f66f439e4 --- vpx_dsp/arm/highbd_intrapred_neon.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/vpx_dsp/arm/highbd_intrapred_neon.c b/vpx_dsp/arm/highbd_intrapred_neon.c index b2aea14f7b..ec97094be6 100644 --- a/vpx_dsp/arm/highbd_intrapred_neon.c +++ b/vpx_dsp/arm/highbd_intrapred_neon.c @@ -2166,30 +2166,36 @@ void vpx_highbd_v_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, void vpx_highbd_v_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x8x2_t row = vld2q_u16(above); + const uint16x8_t row0 = vld1q_u16(above + 0); + const uint16x8_t row1 = vld1q_u16(above + 8); int i; (void)left; (void)bd; - for (i = 0; i < 16; i++, dst += stride) { - vst2q_u16(dst, row); + for (i = 0; i < 16; i++) { + vst1q_u16(dst + 0, row0); + vst1q_u16(dst + 8, row1); + dst += stride; } } void vpx_highbd_v_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x8x2_t row0 = vld2q_u16(above); - const uint16x8x2_t row1 = vld2q_u16(above + 16); + const uint16x8_t row0 = vld1q_u16(above + 0); + const uint16x8_t row1 = vld1q_u16(above + 8); + const uint16x8_t row2 = vld1q_u16(above + 16); + const uint16x8_t row3 = vld1q_u16(above + 24); int i; (void)left; (void)bd; for (i = 0; i < 32; i++) { - vst2q_u16(dst, row0); - dst += 16; - vst2q_u16(dst, row1); - dst += stride - 16; + vst1q_u16(dst + 0, row0); + vst1q_u16(dst + 8, row1); + vst1q_u16(dst + 16, row2); + vst1q_u16(dst + 24, row3); + dst += stride; } } From cf1efecebf0ed2e01bafea6804c98f80ab7e12e0 Mon Sep 17 00:00:00 2001 From: Salome Thirot Date: Mon, 27 Mar 2023 14:31:40 +0100 Subject: [PATCH 0712/1000] Optimize Neon paths of high bitdepth SAD and SAD4d for 8xh blocks For these block sizes there is no need to widen to 32-bits until the final reduction, so use a single vabaq instead of vabd + vpadalq. Change-Id: I9c19d620f7bb8b3a6b0bedd37789c03bb628b563 --- vpx_dsp/arm/highbd_sad4d_neon.c | 31 ++++++++++++++++++------------- vpx_dsp/arm/highbd_sad_neon.c | 7 +++---- 2 files changed, 21 insertions(+), 17 deletions(-) diff --git a/vpx_dsp/arm/highbd_sad4d_neon.c b/vpx_dsp/arm/highbd_sad4d_neon.c index f731d38cc1..280d2087f7 100644 --- a/vpx_dsp/arm/highbd_sad4d_neon.c +++ b/vpx_dsp/arm/highbd_sad4d_neon.c @@ -48,12 +48,6 @@ static INLINE void highbd_sad4xhx4d_neon(const uint8_t *src_ptr, int src_stride, vst1q_u32(res, horizontal_add_4d_uint32x4(sum)); } -static INLINE void sad8_neon(uint16x8_t src, uint16x8_t ref, - uint32x4_t *const sad_sum) { - uint16x8_t abs_diff = vabdq_u16(src, ref); - *sad_sum = vpadalq_u16(*sad_sum, abs_diff); -} - static INLINE void highbd_sad8xhx4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_ptr[4], int ref_stride, uint32_t res[4], @@ -64,21 +58,32 @@ static INLINE void highbd_sad8xhx4d_neon(const uint8_t *src_ptr, int src_stride, const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]); const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]); - uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), - vdupq_n_u32(0) }; + uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + uint32x4_t sum_u32[4]; int i = 0; do { uint16x8_t s = vld1q_u16(src16_ptr + i * src_stride); - sad8_neon(s, vld1q_u16(ref16_ptr0 + i * ref_stride), &sum[0]); - sad8_neon(s, vld1q_u16(ref16_ptr1 + i * ref_stride), &sum[1]); - sad8_neon(s, vld1q_u16(ref16_ptr2 + i * ref_stride), &sum[2]); - sad8_neon(s, vld1q_u16(ref16_ptr3 + i * ref_stride), &sum[3]); + sum[0] = vabaq_u16(sum[0], s, vld1q_u16(ref16_ptr0 + i * ref_stride)); + sum[1] = vabaq_u16(sum[1], s, vld1q_u16(ref16_ptr1 + i * ref_stride)); + sum[2] = vabaq_u16(sum[2], s, vld1q_u16(ref16_ptr2 + i * ref_stride)); + sum[3] = vabaq_u16(sum[3], s, vld1q_u16(ref16_ptr3 + i * ref_stride)); } while (++i < h); - vst1q_u32(res, horizontal_add_4d_uint32x4(sum)); + sum_u32[0] = vpaddlq_u16(sum[0]); + sum_u32[1] = vpaddlq_u16(sum[1]); + sum_u32[2] = vpaddlq_u16(sum[2]); + sum_u32[3] = vpaddlq_u16(sum[3]); + vst1q_u32(res, horizontal_add_4d_uint32x4(sum_u32)); +} + +static INLINE void sad8_neon(uint16x8_t src, uint16x8_t ref, + uint32x4_t *const sad_sum) { + uint16x8_t abs_diff = vabdq_u16(src, ref); + *sad_sum = vpadalq_u16(*sad_sum, abs_diff); } static INLINE void highbd_sad16xhx4d_neon(const uint8_t *src_ptr, diff --git a/vpx_dsp/arm/highbd_sad_neon.c b/vpx_dsp/arm/highbd_sad_neon.c index 90971f6009..813710040b 100644 --- a/vpx_dsp/arm/highbd_sad_neon.c +++ b/vpx_dsp/arm/highbd_sad_neon.c @@ -44,20 +44,19 @@ static INLINE uint32_t highbd_sad8xh_neon(const uint8_t *src_ptr, int ref_stride, int h) { const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr); - uint32x4_t sum = vdupq_n_u32(0); + uint16x8_t sum = vdupq_n_u16(0); int i = h; do { uint16x8_t s = vld1q_u16(src16_ptr); uint16x8_t r = vld1q_u16(ref16_ptr); - uint16x8_t diff = vabdq_u16(s, r); - sum = vpadalq_u16(sum, diff); + sum = vabaq_u16(sum, s, r); src16_ptr += src_stride; ref16_ptr += ref_stride; } while (--i != 0); - return horizontal_add_uint32x4(sum); + return horizontal_add_uint16x8(sum); } static INLINE uint32_t highbd_sad16xh_neon(const uint8_t *src_ptr, From 0f893ea0b6a8843d825f97b12ecf78443f0e93d2 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Wed, 29 Mar 2023 13:06:19 -0400 Subject: [PATCH 0713/1000] svc: Fix a case where target bandwidth is 0 Bug: webrtc:15033 Change-Id: Iea2997c2ce8982f106a1eed3ec4f7dd1c6e83666 --- vp9/encoder/vp9_svc_layercontext.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c index ac8b09cf31..83b6e5c99d 100644 --- a/vp9/encoder/vp9_svc_layercontext.c +++ b/vp9/encoder/vp9_svc_layercontext.c @@ -254,7 +254,9 @@ void vp9_update_layer_context_change_config(VP9_COMP *const cpi, lc->target_bandwidth = oxcf->layer_target_bitrate[layer]; - bitrate_alloc = (float)lc->target_bandwidth / target_bandwidth; + if (target_bandwidth != 0) { + bitrate_alloc = (float)lc->target_bandwidth / target_bandwidth; + } // Update buffer-related quantities. lrc->starting_buffer_level = (int64_t)(rc->starting_buffer_level * bitrate_alloc); From c1c7dd3138796c512c441fba901d1a48ef7d61d1 Mon Sep 17 00:00:00 2001 From: George Steed Date: Mon, 27 Mar 2023 08:47:58 +0000 Subject: [PATCH 0714/1000] Use sum_neon.h helpers in Neon DC predictors Use sum_neon.h helpers for horizontal reductions in Neon DC predictors, enabling use of dedicated Neon reduction instructions on AArch64. Some of the surrounding code is also optimized to remove redundant broadcast instructions in the dc_store helpers. Performance is largely unchanged on both the standard as well as the high bit-depth predictors. The main improvement appears to be the 16x16 standard-bitdepth dc predictor, which improves by 10-15% when benchmarked on Neoverse N1. Change-Id: Ibfcc6ecf4b1b2f87ce1e1f63c314d0cc35a0c76f --- vpx_dsp/arm/highbd_intrapred_neon.c | 118 ++++++++++------------- vpx_dsp/arm/intrapred_neon.c | 139 +++++++++++----------------- vpx_dsp/arm/sum_neon.h | 43 +++++++++ 3 files changed, 143 insertions(+), 157 deletions(-) diff --git a/vpx_dsp/arm/highbd_intrapred_neon.c b/vpx_dsp/arm/highbd_intrapred_neon.c index b4a69017d2..235cb5b996 100644 --- a/vpx_dsp/arm/highbd_intrapred_neon.c +++ b/vpx_dsp/arm/highbd_intrapred_neon.c @@ -12,23 +12,22 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" +#include "sum_neon.h" #include "vpx/vpx_integer.h" //------------------------------------------------------------------------------ // DC 4x4 -static INLINE uint16x4_t dc_sum_4(const uint16_t *ref) { +static INLINE uint16_t dc_sum_4(const uint16_t *ref) { const uint16x4_t ref_u16 = vld1_u16(ref); - const uint16x4_t p0 = vpadd_u16(ref_u16, ref_u16); - return vpadd_u16(p0, p0); + return horizontal_add_uint16x4(ref_u16); } static INLINE void dc_store_4x4(uint16_t *dst, ptrdiff_t stride, const uint16x4_t dc) { - const uint16x4_t dc_dup = vdup_lane_u16(dc, 0); int i; for (i = 0; i < 4; ++i, dst += stride) { - vst1_u16(dst, dc_dup); + vst1_u16(dst, dc); } } @@ -37,21 +36,17 @@ void vpx_highbd_dc_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *left, int bd) { const uint16x4_t a = vld1_u16(above); const uint16x4_t l = vld1_u16(left); - uint16x4_t sum; - uint16x4_t dc; + const uint16_t sum = horizontal_add_uint16x4(vadd_u16(a, l)); + const uint16x4_t dc = vrshr_n_u16(vdup_n_u16(sum), 3); (void)bd; - sum = vadd_u16(a, l); - sum = vpadd_u16(sum, sum); - sum = vpadd_u16(sum, sum); - dc = vrshr_n_u16(sum, 3); dc_store_4x4(dst, stride, dc); } void vpx_highbd_dc_left_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x4_t sum = dc_sum_4(left); - const uint16x4_t dc = vrshr_n_u16(sum, 2); + const uint16_t sum = dc_sum_4(left); + const uint16x4_t dc = vrshr_n_u16(vdup_n_u16(sum), 2); (void)above; (void)bd; dc_store_4x4(dst, stride, dc); @@ -60,8 +55,8 @@ void vpx_highbd_dc_left_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, void vpx_highbd_dc_top_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x4_t sum = dc_sum_4(above); - const uint16x4_t dc = vrshr_n_u16(sum, 2); + const uint16_t sum = dc_sum_4(above); + const uint16x4_t dc = vrshr_n_u16(vdup_n_u16(sum), 2); (void)left; (void)bd; dc_store_4x4(dst, stride, dc); @@ -79,19 +74,16 @@ void vpx_highbd_dc_128_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, //------------------------------------------------------------------------------ // DC 8x8 -static INLINE uint16x4_t dc_sum_8(const uint16_t *ref) { +static INLINE uint16_t dc_sum_8(const uint16_t *ref) { const uint16x8_t ref_u16 = vld1q_u16(ref); - uint16x4_t sum = vadd_u16(vget_low_u16(ref_u16), vget_high_u16(ref_u16)); - sum = vpadd_u16(sum, sum); - return vpadd_u16(sum, sum); + return horizontal_add_uint16x8(ref_u16); } static INLINE void dc_store_8x8(uint16_t *dst, ptrdiff_t stride, - const uint16x4_t dc) { - const uint16x8_t dc_dup = vdupq_lane_u16(dc, 0); + const uint16x8_t dc) { int i; for (i = 0; i < 8; ++i, dst += stride) { - vst1q_u16(dst, dc_dup); + vst1q_u16(dst, dc); } } @@ -101,20 +93,17 @@ void vpx_highbd_dc_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, const uint16x8_t above_u16 = vld1q_u16(above); const uint16x8_t left_u16 = vld1q_u16(left); const uint16x8_t p0 = vaddq_u16(above_u16, left_u16); - uint16x4_t sum = vadd_u16(vget_low_u16(p0), vget_high_u16(p0)); - uint16x4_t dc; + const uint16_t sum = horizontal_add_uint16x8(p0); + const uint16x8_t dc = vrshrq_n_u16(vdupq_n_u16(sum), 4); (void)bd; - sum = vpadd_u16(sum, sum); - sum = vpadd_u16(sum, sum); - dc = vrshr_n_u16(sum, 4); dc_store_8x8(dst, stride, dc); } void vpx_highbd_dc_left_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x4_t sum = dc_sum_8(left); - const uint16x4_t dc = vrshr_n_u16(sum, 3); + const uint16_t sum = dc_sum_8(left); + const uint16x8_t dc = vrshrq_n_u16(vdupq_n_u16(sum), 3); (void)above; (void)bd; dc_store_8x8(dst, stride, dc); @@ -123,8 +112,8 @@ void vpx_highbd_dc_left_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, void vpx_highbd_dc_top_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x4_t sum = dc_sum_8(above); - const uint16x4_t dc = vrshr_n_u16(sum, 3); + const uint16_t sum = dc_sum_8(above); + const uint16x8_t dc = vrshrq_n_u16(vdupq_n_u16(sum), 3); (void)left; (void)bd; dc_store_8x8(dst, stride, dc); @@ -133,7 +122,7 @@ void vpx_highbd_dc_top_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, void vpx_highbd_dc_128_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x4_t dc = vdup_n_u16(1 << (bd - 1)); + const uint16x8_t dc = vdupq_n_u16(1 << (bd - 1)); (void)above; (void)left; dc_store_8x8(dst, stride, dc); @@ -142,22 +131,19 @@ void vpx_highbd_dc_128_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, //------------------------------------------------------------------------------ // DC 16x16 -static INLINE uint16x4_t dc_sum_16(const uint16_t *ref) { +static INLINE uint16_t dc_sum_16(const uint16_t *ref) { const uint16x8_t ref_u16_0 = vld1q_u16(ref + 0); const uint16x8_t ref_u16_1 = vld1q_u16(ref + 8); const uint16x8_t p0 = vaddq_u16(ref_u16_0, ref_u16_1); - uint16x4_t sum = vadd_u16(vget_low_u16(p0), vget_high_u16(p0)); - sum = vpadd_u16(sum, sum); - return vpadd_u16(sum, sum); + return horizontal_add_uint16x8(p0); } static INLINE void dc_store_16x16(uint16_t *dst, ptrdiff_t stride, - const uint16x4_t dc) { - uint16x8_t dc_dup = vdupq_lane_u16(dc, 0); + const uint16x8_t dc) { int i; for (i = 0; i < 16; ++i, dst += stride) { - vst1q_u16(dst + 0, dc_dup); - vst1q_u16(dst + 8, dc_dup); + vst1q_u16(dst + 0, dc); + vst1q_u16(dst + 8, dc); } } @@ -171,21 +157,17 @@ void vpx_highbd_dc_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, const uint16x8_t pa = vaddq_u16(a0, a1); const uint16x8_t pl = vaddq_u16(l0, l1); const uint16x8_t pal0 = vaddq_u16(pa, pl); - uint16x4_t pal1 = vadd_u16(vget_low_u16(pal0), vget_high_u16(pal0)); - uint32x2_t sum; - uint16x4_t dc; + const uint32_t sum = horizontal_add_uint16x8(pal0); + const uint16x8_t dc = vdupq_lane_u16(vrshrn_n_u32(vdupq_n_u32(sum), 5), 0); (void)bd; - pal1 = vpadd_u16(pal1, pal1); - sum = vpaddl_u16(pal1); - dc = vreinterpret_u16_u32(vrshr_n_u32(sum, 5)); dc_store_16x16(dst, stride, dc); } void vpx_highbd_dc_left_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x4_t sum = dc_sum_16(left); - const uint16x4_t dc = vrshr_n_u16(sum, 4); + const uint16_t sum = dc_sum_16(left); + const uint16x8_t dc = vrshrq_n_u16(vdupq_n_u16(sum), 4); (void)above; (void)bd; dc_store_16x16(dst, stride, dc); @@ -194,8 +176,8 @@ void vpx_highbd_dc_left_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, void vpx_highbd_dc_top_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x4_t sum = dc_sum_16(above); - const uint16x4_t dc = vrshr_n_u16(sum, 4); + const uint16_t sum = dc_sum_16(above); + const uint16x8_t dc = vrshrq_n_u16(vdupq_n_u16(sum), 4); (void)left; (void)bd; dc_store_16x16(dst, stride, dc); @@ -204,7 +186,7 @@ void vpx_highbd_dc_top_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, void vpx_highbd_dc_128_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x4_t dc = vdup_n_u16(1 << (bd - 1)); + const uint16x8_t dc = vdupq_n_u16(1 << (bd - 1)); (void)above; (void)left; dc_store_16x16(dst, stride, dc); @@ -213,7 +195,7 @@ void vpx_highbd_dc_128_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, //------------------------------------------------------------------------------ // DC 32x32 -static INLINE uint32x2_t dc_sum_32(const uint16_t *ref) { +static INLINE uint32_t dc_sum_32(const uint16_t *ref) { const uint16x8_t r0 = vld1q_u16(ref + 0); const uint16x8_t r1 = vld1q_u16(ref + 8); const uint16x8_t r2 = vld1q_u16(ref + 16); @@ -221,20 +203,17 @@ static INLINE uint32x2_t dc_sum_32(const uint16_t *ref) { const uint16x8_t p0 = vaddq_u16(r0, r1); const uint16x8_t p1 = vaddq_u16(r2, r3); const uint16x8_t p2 = vaddq_u16(p0, p1); - uint16x4_t sum = vadd_u16(vget_low_u16(p2), vget_high_u16(p2)); - sum = vpadd_u16(sum, sum); - return vpaddl_u16(sum); + return horizontal_add_uint16x8(p2); } static INLINE void dc_store_32x32(uint16_t *dst, ptrdiff_t stride, - const uint16x4_t dc) { - uint16x8_t dc_dup = vdupq_lane_u16(dc, 0); + const uint16x8_t dc) { int i; for (i = 0; i < 32; ++i) { - vst1q_u16(dst + 0, dc_dup); - vst1q_u16(dst + 8, dc_dup); - vst1q_u16(dst + 16, dc_dup); - vst1q_u16(dst + 24, dc_dup); + vst1q_u16(dst + 0, dc); + vst1q_u16(dst + 8, dc); + vst1q_u16(dst + 16, dc); + vst1q_u16(dst + 24, dc); dst += stride; } } @@ -257,20 +236,17 @@ void vpx_highbd_dc_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, const uint16x8_t pa = vaddq_u16(pa0, pa1); const uint16x8_t pl = vaddq_u16(pl0, pl1); const uint16x8_t pal0 = vaddq_u16(pa, pl); - const uint16x4_t pal1 = vadd_u16(vget_low_u16(pal0), vget_high_u16(pal0)); - uint32x2_t sum = vpaddl_u16(pal1); - uint16x4_t dc; + const uint32_t sum = horizontal_add_uint16x8(pal0); + const uint16x8_t dc = vdupq_lane_u16(vrshrn_n_u32(vdupq_n_u32(sum), 6), 0); (void)bd; - sum = vpadd_u32(sum, sum); - dc = vreinterpret_u16_u32(vrshr_n_u32(sum, 6)); dc_store_32x32(dst, stride, dc); } void vpx_highbd_dc_left_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint32x2_t sum = dc_sum_32(left); - const uint16x4_t dc = vreinterpret_u16_u32(vrshr_n_u32(sum, 5)); + const uint32_t sum = dc_sum_32(left); + const uint16x8_t dc = vdupq_lane_u16(vrshrn_n_u32(vdupq_n_u32(sum), 5), 0); (void)above; (void)bd; dc_store_32x32(dst, stride, dc); @@ -279,8 +255,8 @@ void vpx_highbd_dc_left_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, void vpx_highbd_dc_top_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint32x2_t sum = dc_sum_32(above); - const uint16x4_t dc = vreinterpret_u16_u32(vrshr_n_u32(sum, 5)); + const uint32_t sum = dc_sum_32(above); + const uint16x8_t dc = vdupq_lane_u16(vrshrn_n_u32(vdupq_n_u32(sum), 5), 0); (void)left; (void)bd; dc_store_32x32(dst, stride, dc); @@ -289,7 +265,7 @@ void vpx_highbd_dc_top_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, void vpx_highbd_dc_128_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x4_t dc = vdup_n_u16(1 << (bd - 1)); + const uint16x8_t dc = vdupq_n_u16(1 << (bd - 1)); (void)above; (void)left; dc_store_32x32(dst, stride, dc); diff --git a/vpx_dsp/arm/intrapred_neon.c b/vpx_dsp/arm/intrapred_neon.c index d1f6f6da9f..d9b4db2eab 100644 --- a/vpx_dsp/arm/intrapred_neon.c +++ b/vpx_dsp/arm/intrapred_neon.c @@ -13,51 +13,46 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "mem_neon.h" +#include "sum_neon.h" #include "vpx/vpx_integer.h" //------------------------------------------------------------------------------ // DC 4x4 -static INLINE uint16x4_t dc_sum_4(const uint8_t *ref) { - const uint8x8_t ref_u8 = vld1_u8(ref); - const uint16x4_t p0 = vpaddl_u8(ref_u8); - return vpadd_u16(p0, p0); +static INLINE uint16_t dc_sum_4(const uint8_t *ref) { + return horizontal_add_uint8x4(load_unaligned_u8_4x1(ref)); } static INLINE void dc_store_4x4(uint8_t *dst, ptrdiff_t stride, const uint8x8_t dc) { - const uint8x8_t dc_dup = vdup_lane_u8(dc, 0); int i; for (i = 0; i < 4; ++i, dst += stride) { - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(dc_dup), 0); + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(dc), 0); } } void vpx_dc_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint8x8_t a = vld1_u8(above); - const uint8x8_t l = vld1_u8(left); - const uint16x8_t al = vaddl_u8(a, l); - uint16x4_t sum; - uint8x8_t dc; - sum = vpadd_u16(vget_low_u16(al), vget_low_u16(al)); - sum = vpadd_u16(sum, sum); - dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 3)); + const uint8x8_t a = load_unaligned_u8_4x1(above); + const uint8x8_t l = load_unaligned_u8_4x1(left); + const uint16x4_t al = vget_low_u16(vaddl_u8(a, l)); + const uint16_t sum = horizontal_add_uint16x4(al); + const uint8x8_t dc = vrshrn_n_u16(vdupq_n_u16(sum), 3); dc_store_4x4(dst, stride, dc); } void vpx_dc_left_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint16x4_t sum = dc_sum_4(left); - const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 2)); + const uint16_t sum = dc_sum_4(left); + const uint8x8_t dc = vrshrn_n_u16(vdupq_n_u16(sum), 2); (void)above; dc_store_4x4(dst, stride, dc); } void vpx_dc_top_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint16x4_t sum = dc_sum_4(above); - const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 2)); + const uint16_t sum = dc_sum_4(above); + const uint8x8_t dc = vrshrn_n_u16(vdupq_n_u16(sum), 2); (void)left; dc_store_4x4(dst, stride, dc); } @@ -73,19 +68,15 @@ void vpx_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, //------------------------------------------------------------------------------ // DC 8x8 -static INLINE uint16x4_t dc_sum_8(const uint8_t *ref) { - const uint8x8_t ref_u8 = vld1_u8(ref); - uint16x4_t sum = vpaddl_u8(ref_u8); - sum = vpadd_u16(sum, sum); - return vpadd_u16(sum, sum); +static INLINE uint16_t dc_sum_8(const uint8_t *ref) { + return horizontal_add_uint8x8(vld1_u8(ref)); } static INLINE void dc_store_8x8(uint8_t *dst, ptrdiff_t stride, const uint8x8_t dc) { - const uint8x8_t dc_dup = vdup_lane_u8(dc, 0); int i; for (i = 0; i < 8; ++i, dst += stride) { - vst1_u8(dst, dc_dup); + vst1_u8(dst, dc); } } @@ -93,28 +84,24 @@ void vpx_dc_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const uint8x8_t above_u8 = vld1_u8(above); const uint8x8_t left_u8 = vld1_u8(left); - const uint8x16_t above_and_left = vcombine_u8(above_u8, left_u8); - const uint16x8_t p0 = vpaddlq_u8(above_and_left); - uint16x4_t sum = vadd_u16(vget_low_u16(p0), vget_high_u16(p0)); - uint8x8_t dc; - sum = vpadd_u16(sum, sum); - sum = vpadd_u16(sum, sum); - dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 4)); + const uint16x8_t al = vaddl_u8(above_u8, left_u8); + const uint16_t sum = horizontal_add_uint16x8(al); + const uint8x8_t dc = vrshrn_n_u16(vdupq_n_u16(sum), 4); dc_store_8x8(dst, stride, dc); } void vpx_dc_left_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint16x4_t sum = dc_sum_8(left); - const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 3)); + const uint16_t sum = dc_sum_8(left); + const uint8x8_t dc = vrshrn_n_u16(vdupq_n_u16(sum), 3); (void)above; dc_store_8x8(dst, stride, dc); } void vpx_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint16x4_t sum = dc_sum_8(above); - const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 3)); + const uint16_t sum = dc_sum_8(above); + const uint8x8_t dc = vrshrn_n_u16(vdupq_n_u16(sum), 3); (void)left; dc_store_8x8(dst, stride, dc); } @@ -130,20 +117,15 @@ void vpx_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, //------------------------------------------------------------------------------ // DC 16x16 -static INLINE uint16x4_t dc_sum_16(const uint8_t *ref) { - const uint8x16_t ref_u8 = vld1q_u8(ref); - const uint16x8_t p0 = vpaddlq_u8(ref_u8); - uint16x4_t sum = vadd_u16(vget_low_u16(p0), vget_high_u16(p0)); - sum = vpadd_u16(sum, sum); - return vpadd_u16(sum, sum); +static INLINE uint16_t dc_sum_16(const uint8_t *ref) { + return horizontal_add_uint8x16(vld1q_u8(ref)); } static INLINE void dc_store_16x16(uint8_t *dst, ptrdiff_t stride, - const uint8x8_t dc) { - const uint8x16_t dc_dup = vdupq_lane_u8(dc, 0); + const uint8x16_t dc) { int i; for (i = 0; i < 16; ++i, dst += stride) { - vst1q_u8(dst, dc_dup); + vst1q_u8(dst + 0, dc); } } @@ -151,22 +133,19 @@ void vpx_dc_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const uint8x16_t ref0 = vld1q_u8(above); const uint8x16_t ref1 = vld1q_u8(left); - const uint16x8_t p0 = vpaddlq_u8(ref0); - const uint16x8_t p1 = vpaddlq_u8(ref1); - const uint16x8_t p2 = vaddq_u16(p0, p1); - uint16x4_t sum = vadd_u16(vget_low_u16(p2), vget_high_u16(p2)); - uint8x8_t dc; - sum = vpadd_u16(sum, sum); - sum = vpadd_u16(sum, sum); - dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 5)); + const uint16x8_t a = vpaddlq_u8(ref0); + const uint16x8_t l = vpaddlq_u8(ref1); + const uint16x8_t al = vaddq_u16(a, l); + const uint16_t sum = horizontal_add_uint16x8(al); + const uint8x16_t dc = vdupq_lane_u8(vrshrn_n_u16(vdupq_n_u16(sum), 5), 0); dc_store_16x16(dst, stride, dc); } void vpx_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint16x4_t sum = dc_sum_16(left); - const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 4)); + const uint16_t sum = dc_sum_16(left); + const uint8x16_t dc = vdupq_lane_u8(vrshrn_n_u16(vdupq_n_u16(sum), 4), 0); (void)above; dc_store_16x16(dst, stride, dc); } @@ -174,8 +153,8 @@ void vpx_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, void vpx_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint16x4_t sum = dc_sum_16(above); - const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 4)); + const uint16_t sum = dc_sum_16(above); + const uint8x16_t dc = vdupq_lane_u8(vrshrn_n_u16(vdupq_n_u16(sum), 4), 0); (void)left; dc_store_16x16(dst, stride, dc); } @@ -183,7 +162,7 @@ void vpx_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, void vpx_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint8x8_t dc = vdup_n_u8(0x80); + const uint8x16_t dc = vdupq_n_u8(0x80); (void)above; (void)left; dc_store_16x16(dst, stride, dc); @@ -192,24 +171,19 @@ void vpx_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, //------------------------------------------------------------------------------ // DC 32x32 -static INLINE uint16x4_t dc_sum_32(const uint8_t *ref) { +static INLINE uint16_t dc_sum_32(const uint8_t *ref) { const uint8x16_t r0 = vld1q_u8(ref + 0); const uint8x16_t r1 = vld1q_u8(ref + 16); - const uint16x8_t p0 = vpaddlq_u8(r0); - const uint16x8_t p1 = vpaddlq_u8(r1); - const uint16x8_t p2 = vaddq_u16(p0, p1); - uint16x4_t sum = vadd_u16(vget_low_u16(p2), vget_high_u16(p2)); - sum = vpadd_u16(sum, sum); - return vpadd_u16(sum, sum); + const uint16x8_t r01 = vaddq_u16(vpaddlq_u8(r0), vpaddlq_u8(r1)); + return horizontal_add_uint16x8(r01); } static INLINE void dc_store_32x32(uint8_t *dst, ptrdiff_t stride, - const uint8x8_t dc) { - uint8x16_t dc_dup = vdupq_lane_u8(dc, 0); + const uint8x16_t dc) { int i; for (i = 0; i < 32; ++i, dst += stride) { - vst1q_u8(dst + 0, dc_dup); - vst1q_u8(dst + 16, dc_dup); + vst1q_u8(dst + 0, dc); + vst1q_u8(dst + 16, dc); } } @@ -219,26 +193,19 @@ void vpx_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8x16_t a1 = vld1q_u8(above + 16); const uint8x16_t l0 = vld1q_u8(left + 0); const uint8x16_t l1 = vld1q_u8(left + 16); - const uint16x8_t pa0 = vpaddlq_u8(a0); - const uint16x8_t pl0 = vpaddlq_u8(l0); - const uint16x8_t pa1 = vpaddlq_u8(a1); - const uint16x8_t pl1 = vpaddlq_u8(l1); - const uint16x8_t pa = vaddq_u16(pa0, pa1); - const uint16x8_t pl = vaddq_u16(pl0, pl1); - const uint16x8_t pal = vaddq_u16(pa, pl); - uint16x4_t sum = vadd_u16(vget_low_u16(pal), vget_high_u16(pal)); - uint8x8_t dc; - sum = vpadd_u16(sum, sum); - sum = vpadd_u16(sum, sum); - dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 6)); + const uint16x8_t a01 = vaddq_u16(vpaddlq_u8(a0), vpaddlq_u8(a1)); + const uint16x8_t l01 = vaddq_u16(vpaddlq_u8(l0), vpaddlq_u8(l1)); + const uint16x8_t al = vaddq_u16(a01, l01); + const uint16_t sum = horizontal_add_uint16x8(al); + const uint8x16_t dc = vdupq_lane_u8(vrshrn_n_u16(vdupq_n_u16(sum), 6), 0); dc_store_32x32(dst, stride, dc); } void vpx_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint16x4_t sum = dc_sum_32(left); - const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 5)); + const uint16_t sum = dc_sum_32(left); + const uint8x16_t dc = vdupq_lane_u8(vrshrn_n_u16(vdupq_n_u16(sum), 5), 0); (void)above; dc_store_32x32(dst, stride, dc); } @@ -246,8 +213,8 @@ void vpx_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, void vpx_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint16x4_t sum = dc_sum_32(above); - const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 5)); + const uint16_t sum = dc_sum_32(above); + const uint8x16_t dc = vdupq_lane_u8(vrshrn_n_u16(vdupq_n_u16(sum), 5), 0); (void)left; dc_store_32x32(dst, stride, dc); } @@ -255,7 +222,7 @@ void vpx_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, void vpx_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint8x8_t dc = vdup_n_u8(0x80); + const uint8x16_t dc = vdupq_n_u8(0x80); (void)above; (void)left; dc_store_32x32(dst, stride, dc); diff --git a/vpx_dsp/arm/sum_neon.h b/vpx_dsp/arm/sum_neon.h index 8291f07296..1eb3484767 100644 --- a/vpx_dsp/arm/sum_neon.h +++ b/vpx_dsp/arm/sum_neon.h @@ -16,6 +16,49 @@ #include "./vpx_config.h" #include "vpx/vpx_integer.h" +static INLINE uint16_t horizontal_add_uint8x4(const uint8x8_t a) { +#if defined(__aarch64__) + return vaddlv_u8(a); +#else + const uint16x4_t b = vpaddl_u8(a); + const uint16x4_t c = vpadd_u16(b, b); + return vget_lane_u16(c, 0); +#endif +} + +static INLINE uint16_t horizontal_add_uint8x8(const uint8x8_t a) { +#if defined(__aarch64__) + return vaddlv_u8(a); +#else + const uint16x4_t b = vpaddl_u8(a); + const uint16x4_t c = vpadd_u16(b, b); + const uint16x4_t d = vpadd_u16(c, c); + return vget_lane_u16(d, 0); +#endif +} + +static INLINE uint16_t horizontal_add_uint8x16(const uint8x16_t a) { +#if defined(__aarch64__) + return vaddlvq_u8(a); +#else + const uint16x8_t b = vpaddlq_u8(a); + const uint16x4_t c = vadd_u16(vget_low_u16(b), vget_high_u16(b)); + const uint16x4_t d = vpadd_u16(c, c); + const uint16x4_t e = vpadd_u16(d, d); + return vget_lane_u16(e, 0); +#endif +} + +static INLINE uint16_t horizontal_add_uint16x4(const uint16x4_t a) { +#if defined(__aarch64__) + return vaddv_u16(a); +#else + const uint16x4_t b = vpadd_u16(a, a); + const uint16x4_t c = vpadd_u16(b, b); + return vget_lane_u16(c, 0); +#endif +} + static INLINE int32_t horizontal_add_int16x8(const int16x8_t a) { #if defined(__aarch64__) return vaddlvq_s16(a); From a257b4d6be525c50aea1e9f33f791fd4b627e92b Mon Sep 17 00:00:00 2001 From: George Steed Date: Tue, 28 Mar 2023 14:49:37 +0000 Subject: [PATCH 0715/1000] Avoid vshr and vget_{low,high} in Neon d135 predictor impl The shift instructions have marginally worse performance on some micro-architectures, and the vget_{low,high} instructions are unnecessary. This commit improves performance of the d135 predictors by 1.5% geomean averaged across a range of compilers and micro-architectures. Change-Id: Ied4c3eecc12fc973841696459d868ce403ed4e6c --- vpx_dsp/arm/intrapred_neon.c | 57 +++++++++++------------------------- 1 file changed, 17 insertions(+), 40 deletions(-) diff --git a/vpx_dsp/arm/intrapred_neon.c b/vpx_dsp/arm/intrapred_neon.c index d9b4db2eab..4f909e4935 100644 --- a/vpx_dsp/arm/intrapred_neon.c +++ b/vpx_dsp/arm/intrapred_neon.c @@ -866,22 +866,14 @@ void vpx_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8x8_t L3210 = vrev64_u8(L0123); const uint8x8_t L3210XA012 = vext_u8(L3210, XA0123, 4); const uint8x8_t L210XA0123 = vext_u8(L3210, XA0123, 5); - const uint8x8_t L10XA0123_ = - vreinterpret_u8_u64(vshr_n_u64(vreinterpret_u64_u8(L210XA0123), 8)); + const uint8x8_t L10XA0123_ = vext_u8(L210XA0123, L210XA0123, 1); const uint8x8_t avg1 = vhadd_u8(L10XA0123_, L3210XA012); const uint8x8_t avg2 = vrhadd_u8(avg1, L210XA0123); - const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2); - const uint32x2_t r3 = vreinterpret_u32_u8(avg2); - const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8)); - const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16)); - const uint32x2_t r0 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24)); - vst1_lane_u32((uint32_t *)dst, r0, 0); - dst += stride; - vst1_lane_u32((uint32_t *)dst, r1, 0); - dst += stride; - vst1_lane_u32((uint32_t *)dst, r2, 0); - dst += stride; - vst1_lane_u32((uint32_t *)dst, r3, 0); + + store_u8_4x1(dst + 0 * stride, vext_u8(avg2, avg2, 3)); + store_u8_4x1(dst + 1 * stride, vext_u8(avg2, avg2, 2)); + store_u8_4x1(dst + 2 * stride, vext_u8(avg2, avg2, 1)); + store_u8_4x1(dst + 3 * stride, avg2); } void vpx_d135_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, @@ -898,31 +890,15 @@ void vpx_d135_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8x16_t L543210XA01234567_ = vcombine_u8(L543210XA0, A1234567_); const uint8x16_t avg = vhaddq_u8(L76543210XA0123456, L543210XA01234567_); const uint8x16_t row = vrhaddq_u8(avg, L6543210XA01234567); - const uint8x8_t row_0 = vget_low_u8(row); - const uint8x8_t row_1 = vget_high_u8(row); - const uint8x8_t r0 = vext_u8(row_0, row_1, 7); - const uint8x8_t r1 = vext_u8(row_0, row_1, 6); - const uint8x8_t r2 = vext_u8(row_0, row_1, 5); - const uint8x8_t r3 = vext_u8(row_0, row_1, 4); - const uint8x8_t r4 = vext_u8(row_0, row_1, 3); - const uint8x8_t r5 = vext_u8(row_0, row_1, 2); - const uint8x8_t r6 = vext_u8(row_0, row_1, 1); - - vst1_u8(dst, r0); - dst += stride; - vst1_u8(dst, r1); - dst += stride; - vst1_u8(dst, r2); - dst += stride; - vst1_u8(dst, r3); - dst += stride; - vst1_u8(dst, r4); - dst += stride; - vst1_u8(dst, r5); - dst += stride; - vst1_u8(dst, r6); - dst += stride; - vst1_u8(dst, row_0); + + vst1_u8(dst + 0 * stride, vget_low_u8(vextq_u8(row, row, 7))); + vst1_u8(dst + 1 * stride, vget_low_u8(vextq_u8(row, row, 6))); + vst1_u8(dst + 2 * stride, vget_low_u8(vextq_u8(row, row, 5))); + vst1_u8(dst + 3 * stride, vget_low_u8(vextq_u8(row, row, 4))); + vst1_u8(dst + 4 * stride, vget_low_u8(vextq_u8(row, row, 3))); + vst1_u8(dst + 5 * stride, vget_low_u8(vextq_u8(row, row, 2))); + vst1_u8(dst + 6 * stride, vget_low_u8(vextq_u8(row, row, 1))); + vst1_u8(dst + 7 * stride, vget_low_u8(row)); } static INLINE void d135_store_16x8( @@ -965,6 +941,7 @@ void vpx_d135_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8x16_t avg_1 = vhaddq_u8(XA0123456789abcde, A123456789abcdef_); const uint8x16_t row_0 = vrhaddq_u8(avg_0, Ledcba9876543210X); const uint8x16_t row_1 = vrhaddq_u8(avg_1, A0123456789abcdef); + const uint8x16_t r_0 = vextq_u8(row_0, row_1, 15); const uint8x16_t r_1 = vextq_u8(row_0, row_1, 14); const uint8x16_t r_2 = vextq_u8(row_0, row_1, 13); @@ -972,7 +949,7 @@ void vpx_d135_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8x16_t r_4 = vextq_u8(row_0, row_1, 11); const uint8x16_t r_5 = vextq_u8(row_0, row_1, 10); const uint8x16_t r_6 = vextq_u8(row_0, row_1, 9); - const uint8x16_t r_7 = vcombine_u8(vget_high_u8(row_0), vget_low_u8(row_1)); + const uint8x16_t r_7 = vextq_u8(row_0, row_1, 8); const uint8x16_t r_8 = vextq_u8(row_0, row_1, 7); const uint8x16_t r_9 = vextq_u8(row_0, row_1, 6); const uint8x16_t r_a = vextq_u8(row_0, row_1, 5); From 1025d37b03247c790723dac7f4084e04fd45f2b3 Mon Sep 17 00:00:00 2001 From: Cherma Rajan A Date: Wed, 8 Mar 2023 17:50:06 +0530 Subject: [PATCH 0716/1000] Prune single ref modes based on mv difference and mode rate This patch introduces a speed feature to prune single reference modes - NEARMV and ZEROMV based on motion vector difference and mode rate w.r.t previously evaluated single reference modes corresponding to the same reference frame. Instruction Count BD-Rate Loss(%) cpu Resolution Reduction(%) avg.psnr ovr.psnr ssim 0 LOWRES2 1.686 -0.0039 -0.0105 -0.0098 0 MIDRES2 1.026 -0.0234 0.0029 0.0120 0 HDRES2 0.000 0.0000 0.0000 0.0000 0 Average 0.889 -0.0091 -0.0025 0.0007 STATS_CHANGED Change-Id: I387acd3a73d8256904a7ce684b198d251cf3dd04 --- vp9/encoder/vp9_rdopt.c | 71 +++++++++++++++++++++++++++++--- vp9/encoder/vp9_speed_features.c | 9 ++++ vp9/encoder/vp9_speed_features.h | 4 ++ 3 files changed, 79 insertions(+), 5 deletions(-) diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index 76d545cd96..a6a7befc11 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -1854,6 +1854,52 @@ static INLINE int skip_iters(const int_mv iter_mvs[][2], int ite, int id) { return 0; } +// Compares motion vector and mode rate of current mode and given mode. +static INLINE int compare_mv_mode_rate(MV this_mv, MV mode_mv, + int this_mode_rate, int mode_rate, + int mv_thresh) { + const int mv_diff = + abs(mode_mv.col - this_mv.col) + abs(mode_mv.row - this_mv.row); + if (mv_diff <= mv_thresh && mode_rate < this_mode_rate) return 1; + return 0; +} + +// Skips single reference inter modes NEARMV and ZEROMV based on motion vector +// difference and mode rate. +static INLINE int skip_single_mode_based_on_mode_rate( + int_mv (*mode_mv)[MAX_REF_FRAMES], int *single_mode_rate, int this_mode, + int ref0, int this_mode_rate, int best_mode_index) { + MV this_mv = mode_mv[this_mode][ref0].as_mv; + const int mv_thresh = 3; + + // Pruning is not applicable for NEARESTMV or NEWMV modes. + if (this_mode == NEARESTMV || this_mode == NEWMV) return 0; + // Pruning is not done when reference frame of the mode is same as best + // reference so far. + if (best_mode_index > 0 && + ref0 == vp9_mode_order[best_mode_index].ref_frame[0]) + return 0; + + // Check absolute mv difference and mode rate of current mode w.r.t NEARESTMV + if (compare_mv_mode_rate( + this_mv, mode_mv[NEARESTMV][ref0].as_mv, this_mode_rate, + single_mode_rate[INTER_OFFSET(NEARESTMV)], mv_thresh)) + return 1; + + // Check absolute mv difference and mode rate of current mode w.r.t NEWMV + if (compare_mv_mode_rate(this_mv, mode_mv[NEWMV][ref0].as_mv, this_mode_rate, + single_mode_rate[INTER_OFFSET(NEWMV)], mv_thresh)) + return 1; + + // Pruning w.r.t NEARMV is applicable only for ZEROMV mode + if (this_mode == NEARMV) return 0; + // Check absolute mv difference and mode rate of current mode w.r.t NEARMV + if (compare_mv_mode_rate(this_mv, mode_mv[NEARMV][ref0].as_mv, this_mode_rate, + single_mode_rate[INTER_OFFSET(NEARMV)], mv_thresh)) + return 1; + return 0; +} + #define NUM_ITERS 4 static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int_mv *frame_mv, int mi_row, int mi_col, @@ -2756,8 +2802,9 @@ static int64_t handle_inter_mode( struct buf_2d *recon, int *disable_skip, int_mv (*mode_mv)[MAX_REF_FRAMES], int mi_row, int mi_col, int_mv single_newmv[MAX_REF_FRAMES], INTERP_FILTER (*single_filter)[MAX_REF_FRAMES], - int (*single_skippable)[MAX_REF_FRAMES], int64_t *psse, - const int64_t ref_best_rd, int64_t *mask_filter, int64_t filter_cache[]) { + int (*single_skippable)[MAX_REF_FRAMES], int *single_mode_rate, + int64_t *psse, const int64_t ref_best_rd, int64_t *mask_filter, + int64_t filter_cache[], int best_mode_index) { VP9_COMMON *cm = &cpi->common; MACROBLOCKD *xd = &x->e_mbd; MODE_INFO *mi = xd->mi[0]; @@ -2914,6 +2961,15 @@ static int64_t handle_inter_mode( *rate2 += cost_mv_ref(cpi, this_mode, mbmi_ext->mode_context[refs[0]]); } + if (!is_comp_pred && cpi->sf.prune_single_mode_based_on_mv_diff_mode_rate) { + single_mode_rate[INTER_OFFSET(this_mode)] = *rate2; + // Prune NEARMV and ZEROMV modes based on motion vector difference and mode + // rate. + if (skip_single_mode_based_on_mode_rate(mode_mv, single_mode_rate, + this_mode, refs[0], *rate2, + best_mode_index)) + return INT64_MAX; + } if (RDCOST(x->rdmult, x->rddiv, *rate2, 0) > ref_best_rd && mi->mode != NEARESTMV) return INT64_MAX; @@ -3380,6 +3436,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, int_mv single_newmv[MAX_REF_FRAMES] = { { 0 } }; INTERP_FILTER single_inter_filter[MB_MODE_COUNT][MAX_REF_FRAMES]; int single_skippable[MB_MODE_COUNT][MAX_REF_FRAMES]; + int single_mode_rate[MAX_REF_FRAMES][INTER_MODES]; int64_t best_rd = best_rd_so_far; int64_t best_pred_diff[REFERENCE_MODES]; int64_t best_pred_rd[REFERENCE_MODES]; @@ -3578,6 +3635,10 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, second_ref_frame = vp9_mode_order[mode_index].ref_frame[1]; vp9_zero(x->sum_y_eobs); + comp_pred = second_ref_frame > INTRA_FRAME; + if (!comp_pred && ref_frame != INTRA_FRAME && + sf->prune_single_mode_based_on_mv_diff_mode_rate) + single_mode_rate[ref_frame][INTER_OFFSET(this_mode)] = INT_MAX; if (is_rect_partition) { if (ctx->skip_ref_frame_mask & (1 << ref_frame)) continue; @@ -3663,7 +3724,6 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, if (this_mode == NEARMV || this_mode == ZEROMV) continue; } - comp_pred = second_ref_frame > INTRA_FRAME; if (comp_pred) { if (!cpi->allow_comp_inter_inter) continue; @@ -3783,8 +3843,9 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, this_rd = handle_inter_mode( cpi, x, bsize, &rate2, &distortion2, &skippable, &rate_y, &rate_uv, recon, &disable_skip, frame_mv, mi_row, mi_col, single_newmv, - single_inter_filter, single_skippable, &total_sse, best_rd, - &mask_filter, filter_cache); + single_inter_filter, single_skippable, + &single_mode_rate[ref_frame][0], &total_sse, best_rd, &mask_filter, + filter_cache, best_mode_index); #if CONFIG_COLLECT_COMPONENT_TIMING end_timing(cpi, handle_inter_mode_time); #endif diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c index 3e121b799f..0522d4ec97 100644 --- a/vp9/encoder/vp9_speed_features.c +++ b/vp9/encoder/vp9_speed_features.c @@ -70,6 +70,7 @@ static void set_good_speed_feature_framesize_dependent(VP9_COMP *cpi, const int is_720p_or_larger = min_frame_size >= 720; const int is_1080p_or_larger = min_frame_size >= 1080; const int is_2160p_or_larger = min_frame_size >= 2160; + const int boosted = frame_is_boosted(cpi); // speed 0 features sf->partition_search_breakout_thr.dist = (1 << 20); @@ -102,6 +103,13 @@ static void set_good_speed_feature_framesize_dependent(VP9_COMP *cpi, } } + if (!is_720p_or_larger) { + if (is_480p_or_larger) + sf->prune_single_mode_based_on_mv_diff_mode_rate = boosted ? 0 : 1; + else + sf->prune_single_mode_based_on_mv_diff_mode_rate = 1; + } + if (speed >= 1) { sf->rd_ml_partition.search_early_termination = 0; sf->rd_ml_partition.search_breakout = 1; @@ -926,6 +934,7 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi, int speed) { sf->enhanced_full_pixel_motion_search = 1; sf->adaptive_pred_interp_filter = 0; sf->adaptive_mode_search = 0; + sf->prune_single_mode_based_on_mv_diff_mode_rate = 0; sf->cb_pred_filter_search = 0; sf->early_term_interp_search_plane_rd = 0; sf->cb_partition_search = 0; diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h index d32bf09e4e..e267e55c41 100644 --- a/vp9/encoder/vp9_speed_features.h +++ b/vp9/encoder/vp9_speed_features.h @@ -417,6 +417,10 @@ typedef struct SPEED_FEATURES { // Adaptive prediction mode search int adaptive_mode_search; + // Prune NEAREST and ZEROMV single reference modes based on motion vector + // difference and mode rate + int prune_single_mode_based_on_mv_diff_mode_rate; + // Chessboard pattern prediction for interp filter. Aggressiveness increases // with levels. // 0: disable From e2465dfc2515e0872524b627a647d1613dfeae13 Mon Sep 17 00:00:00 2001 From: Anupam Pandey Date: Tue, 28 Mar 2023 14:48:46 +0530 Subject: [PATCH 0717/1000] Add AVX2 intrinsic for variance function for block width 8 Added AVX2 intrinsic optimization for the following functions 1. vpx_variance8x4 2. vpx_variance8x8 3. vpx_variance8x16 This is a bit-exact change. Instruction Count cpu Resolution Reduction(%) 0 LOWRES2 0.698 0 MIDRES2 0.577 0 HDRES2 0.469 0 Average 0.582 Change-Id: Iae8fdf9344fd012cda4955ed140633141d60ba86 --- test/variance_test.cc | 5 ++- vpx_dsp/vpx_dsp_rtcd_defs.pl | 6 +-- vpx_dsp/x86/variance_avx2.c | 80 ++++++++++++++++++++++++++++++++++++ 3 files changed, 87 insertions(+), 4 deletions(-) diff --git a/test/variance_test.cc b/test/variance_test.cc index 1359bc4baf..df9a1c56f6 100644 --- a/test/variance_test.cc +++ b/test/variance_test.cc @@ -1429,7 +1429,10 @@ INSTANTIATE_TEST_SUITE_P( VarianceParams(5, 4, &vpx_variance32x16_avx2), VarianceParams(4, 5, &vpx_variance16x32_avx2), VarianceParams(4, 4, &vpx_variance16x16_avx2), - VarianceParams(4, 3, &vpx_variance16x8_avx2))); + VarianceParams(4, 3, &vpx_variance16x8_avx2), + VarianceParams(3, 4, &vpx_variance8x16_avx2), + VarianceParams(3, 3, &vpx_variance8x8_avx2), + VarianceParams(3, 2, &vpx_variance8x4_avx2))); INSTANTIATE_TEST_SUITE_P( AVX2, VpxSubpelVarianceTest, diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 49bc9a6309..d63be5fb8f 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -1123,13 +1123,13 @@ () specialize qw/vpx_variance16x8 sse2 avx2 neon msa mmi vsx/; add_proto qw/unsigned int vpx_variance8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance8x16 sse2 neon msa mmi vsx/; + specialize qw/vpx_variance8x16 sse2 avx2 neon msa mmi vsx/; add_proto qw/unsigned int vpx_variance8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance8x8 sse2 neon msa mmi vsx lsx/; + specialize qw/vpx_variance8x8 sse2 avx2 neon msa mmi vsx lsx/; add_proto qw/unsigned int vpx_variance8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance8x4 sse2 neon msa mmi vsx/; + specialize qw/vpx_variance8x4 sse2 avx2 neon msa mmi vsx/; add_proto qw/unsigned int vpx_variance4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_variance4x8 sse2 neon msa mmi vsx/; diff --git a/vpx_dsp/x86/variance_avx2.c b/vpx_dsp/x86/variance_avx2.c index 35925d5908..8305b9f20f 100644 --- a/vpx_dsp/x86/variance_avx2.c +++ b/vpx_dsp/x86/variance_avx2.c @@ -98,6 +98,41 @@ static INLINE __m256i sum_to_32bit_avx2(const __m256i sum) { return _mm256_add_epi32(sum_lo, sum_hi); } +static INLINE void variance8_kernel_avx2( + const uint8_t *const src, const int src_stride, const uint8_t *const ref, + const int ref_stride, __m256i *const sse, __m256i *const sum) { + __m128i src0, src1, ref0, ref1; + __m256i ss, rr, diff; + + // 0 0 0.... 0 s07 s06 s05 s04 s03 s02 s01 s00 + src0 = _mm_loadl_epi64((const __m128i *)(src + 0 * src_stride)); + + // 0 0 0.... 0 s17 s16 s15 s14 s13 s12 s11 s10 + src1 = _mm_loadl_epi64((const __m128i *)(src + 1 * src_stride)); + + // s17 s16...s11 s10 s07 s06...s01 s00 (8bit) + src0 = _mm_unpacklo_epi64(src0, src1); + + // s17 s16...s11 s10 s07 s06...s01 s00 (16 bit) + ss = _mm256_cvtepu8_epi16(src0); + + // 0 0 0.... 0 r07 r06 r05 r04 r03 r02 r01 r00 + ref0 = _mm_loadl_epi64((const __m128i *)(ref + 0 * ref_stride)); + + // 0 0 0.... 0 r17 r16 0 r15 0 r14 0 r13 0 r12 0 r11 0 r10 + ref1 = _mm_loadl_epi64((const __m128i *)(ref + 1 * ref_stride)); + + // r17 r16...r11 r10 r07 r06...r01 r00 (8 bit) + ref0 = _mm_unpacklo_epi64(ref0, ref1); + + // r17 r16...r11 r10 r07 r06...r01 r00 (16 bit) + rr = _mm256_cvtepu8_epi16(ref0); + + diff = _mm256_sub_epi16(ss, rr); + *sse = _mm256_add_epi32(*sse, _mm256_madd_epi16(diff, diff)); + *sum = _mm256_add_epi16(*sum, diff); +} + static INLINE void variance16_kernel_avx2( const uint8_t *const src, const int src_stride, const uint8_t *const ref, const int ref_stride, __m256i *const sse, __m256i *const sum) { @@ -119,6 +154,21 @@ static INLINE void variance32_kernel_avx2(const uint8_t *const src, variance_kernel_avx2(s, r, sse, sum); } +static INLINE void variance8_avx2(const uint8_t *src, const int src_stride, + const uint8_t *ref, const int ref_stride, + const int h, __m256i *const vsse, + __m256i *const vsum) { + int i; + *vsum = _mm256_setzero_si256(); + *vsse = _mm256_setzero_si256(); + + for (i = 0; i < h; i += 2) { + variance8_kernel_avx2(src, src_stride, ref, ref_stride, vsse, vsum); + src += 2 * src_stride; + ref += 2 * ref_stride; + } +} + static INLINE void variance16_avx2(const uint8_t *src, const int src_stride, const uint8_t *ref, const int ref_stride, const int h, __m256i *const vsse, @@ -612,6 +662,36 @@ typedef void (*get_var_avx2)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +unsigned int vpx_variance8x4_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse) { + __m256i vsse, vsum; + int sum; + variance8_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 4, &vsse, &vsum); + variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum); + return *sse - ((sum * sum) >> 5); +} + +unsigned int vpx_variance8x8_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse) { + __m256i vsse, vsum; + int sum; + variance8_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 8, &vsse, &vsum); + variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum); + return *sse - ((sum * sum) >> 6); +} + +unsigned int vpx_variance8x16_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse) { + __m256i vsse, vsum; + int sum; + variance8_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum); + variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum); + return *sse - ((sum * sum) >> 7); +} + unsigned int vpx_variance16x8_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { From a5801b00a8e3e440392c6bbc31754ba3c206ff41 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Tue, 4 Apr 2023 14:52:52 +0100 Subject: [PATCH 0718/1000] Optimize 4D Neon reduction for 4xh and 8xh SAD4D blocks Add a 4D reduction function operating on uint16x8_t vectors and use it to optimize the final reduction in standard bitdepth 4xh and 8xh SAD4D computations. Similar 4D reduction optimizations have already been implemented for all other standard bitdepth block sizes, and all high bitdepth block sizes.[1] [1] https://chromium-review.googlesource.com/c/webm/libvpx/+/4224681 Change-Id: I0aa0b6e0f70449776f316879cafc4b830e86ea51 --- vpx_dsp/arm/sad4d_neon.c | 10 ++-------- vpx_dsp/arm/sum_neon.h | 17 +++++++++++++++++ 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/vpx_dsp/arm/sad4d_neon.c b/vpx_dsp/arm/sad4d_neon.c index 9509573939..ab00e0e3a2 100644 --- a/vpx_dsp/arm/sad4d_neon.c +++ b/vpx_dsp/arm/sad4d_neon.c @@ -270,10 +270,7 @@ static INLINE void sad8xhx4d_neon(const uint8_t *src, int src_stride, i++; } while (i < h); - res[0] = horizontal_add_uint16x8(sum[0]); - res[1] = horizontal_add_uint16x8(sum[1]); - res[2] = horizontal_add_uint16x8(sum[2]); - res[3] = horizontal_add_uint16x8(sum[3]); + vst1q_u32(res, horizontal_add_4d_uint16x8(sum)); } static INLINE void sad4xhx4d_neon(const uint8_t *src, int src_stride, @@ -298,10 +295,7 @@ static INLINE void sad4xhx4d_neon(const uint8_t *src, int src_stride, i += 2; } while (i < h); - res[0] = horizontal_add_uint16x8(sum[0]); - res[1] = horizontal_add_uint16x8(sum[1]); - res[2] = horizontal_add_uint16x8(sum[2]); - res[3] = horizontal_add_uint16x8(sum[3]); + vst1q_u32(res, horizontal_add_4d_uint16x8(sum)); } #define SAD_WXH_4D_NEON(w, h) \ diff --git a/vpx_dsp/arm/sum_neon.h b/vpx_dsp/arm/sum_neon.h index 1eb3484767..6259add4a4 100644 --- a/vpx_dsp/arm/sum_neon.h +++ b/vpx_dsp/arm/sum_neon.h @@ -83,6 +83,23 @@ static INLINE uint32_t horizontal_add_uint16x8(const uint16x8_t a) { #endif } +static INLINE uint32x4_t horizontal_add_4d_uint16x8(const uint16x8_t sum[4]) { +#if defined(__aarch64__) + const uint16x8_t a0 = vpaddq_u16(sum[0], sum[1]); + const uint16x8_t a1 = vpaddq_u16(sum[2], sum[3]); + const uint16x8_t b0 = vpaddq_u16(a0, a1); + return vpaddlq_u16(b0); +#else + const uint16x4_t a0 = vadd_u16(vget_low_u16(sum[0]), vget_high_u16(sum[0])); + const uint16x4_t a1 = vadd_u16(vget_low_u16(sum[1]), vget_high_u16(sum[1])); + const uint16x4_t a2 = vadd_u16(vget_low_u16(sum[2]), vget_high_u16(sum[2])); + const uint16x4_t a3 = vadd_u16(vget_low_u16(sum[3]), vget_high_u16(sum[3])); + const uint16x4_t b0 = vpadd_u16(a0, a1); + const uint16x4_t b1 = vpadd_u16(a2, a3); + return vpaddlq_u16(vcombine_u16(b0, b1)); +#endif +} + static INLINE uint32_t horizontal_long_add_uint16x8(const uint16x8_t vec_lo, const uint16x8_t vec_hi) { #if defined(__aarch64__) From ff8a9658568061e12e556d1f41754b94a8c30498 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Thu, 6 Apr 2023 16:14:51 +0100 Subject: [PATCH 0719/1000] Optimize Armv8.0 Neon SAD4D 16xh, 32xh, and 64xh functions Add a widening 4D reduction function operating on uint16x8_t vectors and use it to optimize the final reduction in Armv8.0 Neon standard bitdepth 16xh, 32xh and 64h SAD4D computations. Also simplify the Armv8.0 Neon version of the sad64xhx4d_neon helper function since VP9 block sizes are not large enough to require widening to 32-bit accumulators before the final reduction. Change-Id: I32b0a283d7688d8cdf21791add9476ed24c66a28 --- vpx_dsp/arm/sad4d_neon.c | 88 ++++++++++++++++------------------------ vpx_dsp/arm/sum_neon.h | 25 ++++++++++++ 2 files changed, 61 insertions(+), 52 deletions(-) diff --git a/vpx_dsp/arm/sad4d_neon.c b/vpx_dsp/arm/sad4d_neon.c index ab00e0e3a2..6ad6c96214 100644 --- a/vpx_dsp/arm/sad4d_neon.c +++ b/vpx_dsp/arm/sad4d_neon.c @@ -140,53 +140,43 @@ static INLINE void sad16_neon(uint8x16_t src, uint8x16_t ref, static INLINE void sad64xhx4d_neon(const uint8_t *src, int src_stride, const uint8_t *const ref[4], int ref_stride, uint32_t res[4], int h) { - int h_tmp = h > 64 ? 64 : h; - int i = 0; - vst1q_u32(res, vdupq_n_u32(0)); + uint16x8_t sum_lo[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + uint16x8_t sum_hi[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + int i = 0; do { - uint16x8_t sum_lo[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), - vdupq_n_u16(0) }; - uint16x8_t sum_hi[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), - vdupq_n_u16(0) }; - - do { - uint8x16_t s0, s1, s2, s3; - - s0 = vld1q_u8(src + i * src_stride); - sad16_neon(s0, vld1q_u8(ref[0] + i * ref_stride), &sum_lo[0]); - sad16_neon(s0, vld1q_u8(ref[1] + i * ref_stride), &sum_lo[1]); - sad16_neon(s0, vld1q_u8(ref[2] + i * ref_stride), &sum_lo[2]); - sad16_neon(s0, vld1q_u8(ref[3] + i * ref_stride), &sum_lo[3]); - - s1 = vld1q_u8(src + i * src_stride + 16); - sad16_neon(s1, vld1q_u8(ref[0] + i * ref_stride + 16), &sum_hi[0]); - sad16_neon(s1, vld1q_u8(ref[1] + i * ref_stride + 16), &sum_hi[1]); - sad16_neon(s1, vld1q_u8(ref[2] + i * ref_stride + 16), &sum_hi[2]); - sad16_neon(s1, vld1q_u8(ref[3] + i * ref_stride + 16), &sum_hi[3]); - - s2 = vld1q_u8(src + i * src_stride + 32); - sad16_neon(s2, vld1q_u8(ref[0] + i * ref_stride + 32), &sum_lo[0]); - sad16_neon(s2, vld1q_u8(ref[1] + i * ref_stride + 32), &sum_lo[1]); - sad16_neon(s2, vld1q_u8(ref[2] + i * ref_stride + 32), &sum_lo[2]); - sad16_neon(s2, vld1q_u8(ref[3] + i * ref_stride + 32), &sum_lo[3]); - - s3 = vld1q_u8(src + i * src_stride + 48); - sad16_neon(s3, vld1q_u8(ref[0] + i * ref_stride + 48), &sum_hi[0]); - sad16_neon(s3, vld1q_u8(ref[1] + i * ref_stride + 48), &sum_hi[1]); - sad16_neon(s3, vld1q_u8(ref[2] + i * ref_stride + 48), &sum_hi[2]); - sad16_neon(s3, vld1q_u8(ref[3] + i * ref_stride + 48), &sum_hi[3]); - - i++; - } while (i < h_tmp); - - res[0] += horizontal_long_add_uint16x8(sum_lo[0], sum_hi[0]); - res[1] += horizontal_long_add_uint16x8(sum_lo[1], sum_hi[1]); - res[2] += horizontal_long_add_uint16x8(sum_lo[2], sum_hi[2]); - res[3] += horizontal_long_add_uint16x8(sum_lo[3], sum_hi[3]); - - h_tmp += 64; + uint8x16_t s0, s1, s2, s3; + + s0 = vld1q_u8(src + i * src_stride); + sad16_neon(s0, vld1q_u8(ref[0] + i * ref_stride), &sum_lo[0]); + sad16_neon(s0, vld1q_u8(ref[1] + i * ref_stride), &sum_lo[1]); + sad16_neon(s0, vld1q_u8(ref[2] + i * ref_stride), &sum_lo[2]); + sad16_neon(s0, vld1q_u8(ref[3] + i * ref_stride), &sum_lo[3]); + + s1 = vld1q_u8(src + i * src_stride + 16); + sad16_neon(s1, vld1q_u8(ref[0] + i * ref_stride + 16), &sum_hi[0]); + sad16_neon(s1, vld1q_u8(ref[1] + i * ref_stride + 16), &sum_hi[1]); + sad16_neon(s1, vld1q_u8(ref[2] + i * ref_stride + 16), &sum_hi[2]); + sad16_neon(s1, vld1q_u8(ref[3] + i * ref_stride + 16), &sum_hi[3]); + + s2 = vld1q_u8(src + i * src_stride + 32); + sad16_neon(s2, vld1q_u8(ref[0] + i * ref_stride + 32), &sum_lo[0]); + sad16_neon(s2, vld1q_u8(ref[1] + i * ref_stride + 32), &sum_lo[1]); + sad16_neon(s2, vld1q_u8(ref[2] + i * ref_stride + 32), &sum_lo[2]); + sad16_neon(s2, vld1q_u8(ref[3] + i * ref_stride + 32), &sum_lo[3]); + + s3 = vld1q_u8(src + i * src_stride + 48); + sad16_neon(s3, vld1q_u8(ref[0] + i * ref_stride + 48), &sum_hi[0]); + sad16_neon(s3, vld1q_u8(ref[1] + i * ref_stride + 48), &sum_hi[1]); + sad16_neon(s3, vld1q_u8(ref[2] + i * ref_stride + 48), &sum_hi[2]); + sad16_neon(s3, vld1q_u8(ref[3] + i * ref_stride + 48), &sum_hi[3]); + + i++; } while (i < h); + + vst1q_u32(res, horizontal_long_add_4d_uint16x8(sum_lo, sum_hi)); } static INLINE void sad32xhx4d_neon(const uint8_t *src, int src_stride, @@ -216,10 +206,7 @@ static INLINE void sad32xhx4d_neon(const uint8_t *src, int src_stride, i++; } while (i < h); - res[0] = horizontal_long_add_uint16x8(sum_lo[0], sum_hi[0]); - res[1] = horizontal_long_add_uint16x8(sum_lo[1], sum_hi[1]); - res[2] = horizontal_long_add_uint16x8(sum_lo[2], sum_hi[2]); - res[3] = horizontal_long_add_uint16x8(sum_lo[3], sum_hi[3]); + vst1q_u32(res, horizontal_long_add_4d_uint16x8(sum_lo, sum_hi)); } static INLINE void sad16xhx4d_neon(const uint8_t *src, int src_stride, @@ -239,10 +226,7 @@ static INLINE void sad16xhx4d_neon(const uint8_t *src, int src_stride, i++; } while (i < h); - res[0] = horizontal_add_uint16x8(sum[0]); - res[1] = horizontal_add_uint16x8(sum[1]); - res[2] = horizontal_add_uint16x8(sum[2]); - res[3] = horizontal_add_uint16x8(sum[3]); + vst1q_u32(res, horizontal_add_4d_uint16x8(sum)); } #endif // defined(__ARM_FEATURE_DOTPROD) diff --git a/vpx_dsp/arm/sum_neon.h b/vpx_dsp/arm/sum_neon.h index 6259add4a4..a0c72f92ce 100644 --- a/vpx_dsp/arm/sum_neon.h +++ b/vpx_dsp/arm/sum_neon.h @@ -117,6 +117,31 @@ static INLINE uint32_t horizontal_long_add_uint16x8(const uint16x8_t vec_lo, #endif } +static INLINE uint32x4_t horizontal_long_add_4d_uint16x8( + const uint16x8_t sum_lo[4], const uint16x8_t sum_hi[4]) { + const uint32x4_t a0 = vpaddlq_u16(sum_lo[0]); + const uint32x4_t a1 = vpaddlq_u16(sum_lo[1]); + const uint32x4_t a2 = vpaddlq_u16(sum_lo[2]); + const uint32x4_t a3 = vpaddlq_u16(sum_lo[3]); + const uint32x4_t b0 = vpadalq_u16(a0, sum_hi[0]); + const uint32x4_t b1 = vpadalq_u16(a1, sum_hi[1]); + const uint32x4_t b2 = vpadalq_u16(a2, sum_hi[2]); + const uint32x4_t b3 = vpadalq_u16(a3, sum_hi[3]); +#if defined(__aarch64__) + const uint32x4_t c0 = vpaddq_u32(b0, b1); + const uint32x4_t c1 = vpaddq_u32(b2, b3); + return vpaddq_u32(c0, c1); +#else + const uint32x2_t c0 = vadd_u32(vget_low_u32(b0), vget_high_u32(b0)); + const uint32x2_t c1 = vadd_u32(vget_low_u32(b1), vget_high_u32(b1)); + const uint32x2_t c2 = vadd_u32(vget_low_u32(b2), vget_high_u32(b2)); + const uint32x2_t c3 = vadd_u32(vget_low_u32(b3), vget_high_u32(b3)); + const uint32x2_t d0 = vpadd_u32(c0, c1); + const uint32x2_t d1 = vpadd_u32(c2, c3); + return vcombine_u32(d0, d1); +#endif +} + static INLINE int32_t horizontal_add_int32x2(const int32x2_t a) { #if defined(__aarch64__) return vaddv_s32(a); From 868674d3300478865c7ed4ca40e6e28e249f1f11 Mon Sep 17 00:00:00 2001 From: James Zern Date: Thu, 6 Apr 2023 12:57:23 -0700 Subject: [PATCH 0720/1000] vpx_subpixel_8t_intrin_avx2: clear -Wshadow warning Bug: webm:1793 Change-Id: Icba4ad242dcd0cad736b9a203829361c5bd1ca3f --- vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c index 3b5ff04ee9..9ff67bd301 100644 --- a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c +++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c @@ -240,7 +240,7 @@ static void vpx_filter_block1d8_h8_avx2( // For the remaining height. if (y > 0) { - const __m128i srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); + const __m128i src_reg_128 = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); f[0] = _mm256_castsi256_si128(f1[0]); f[1] = _mm256_castsi256_si128(f1[1]); @@ -248,10 +248,10 @@ static void vpx_filter_block1d8_h8_avx2( f[3] = _mm256_castsi256_si128(f1[3]); // filter the source buffer - s[0] = _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[0])); - s[1] = _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[1])); - s[2] = _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[2])); - s[3] = _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[3])); + s[0] = _mm_shuffle_epi8(src_reg_128, _mm256_castsi256_si128(filt[0])); + s[1] = _mm_shuffle_epi8(src_reg_128, _mm256_castsi256_si128(filt[1])); + s[2] = _mm_shuffle_epi8(src_reg_128, _mm256_castsi256_si128(filt[2])); + s[3] = _mm_shuffle_epi8(src_reg_128, _mm256_castsi256_si128(filt[3])); s[0] = convolve8_8_ssse3(s, f); // Saturate 16bit value to 8bit. From bebc860915188e2fda3983c7d8504371beb2f518 Mon Sep 17 00:00:00 2001 From: James Zern Date: Thu, 6 Apr 2023 13:00:07 -0700 Subject: [PATCH 0721/1000] vp9_encoder: clear -Wshadow warning Bug: webm:1793 Change-Id: Id390c61f82b9f15063d0310a2c252b02b479d9c5 --- vp9/encoder/vp9_encoder.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 4cec02eb93..72a6189d13 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -2465,11 +2465,11 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf, cpi->svc.number_temporal_layers > 1) { FIRSTPASS_STATS *const stats = oxcf->two_pass_stats_in.buf; FIRSTPASS_STATS *stats_copy[VPX_SS_MAX_LAYERS] = { 0 }; - int i; + int n; - for (i = 0; i < oxcf->ss_number_layers; ++i) { + for (n = 0; n < oxcf->ss_number_layers; ++n) { FIRSTPASS_STATS *const last_packet_for_layer = - &stats[packets - oxcf->ss_number_layers + i]; + &stats[packets - oxcf->ss_number_layers + n]; const int layer_id = (int)last_packet_for_layer->spatial_layer_id; const int packets_in_layer = (int)last_packet_for_layer->count + 1; if (layer_id >= 0 && layer_id < oxcf->ss_number_layers) { @@ -2494,11 +2494,11 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf, } } - for (i = 0; i < packets; ++i) { - const int layer_id = (int)stats[i].spatial_layer_id; + for (n = 0; n < packets; ++n) { + const int layer_id = (int)stats[n].spatial_layer_id; if (layer_id >= 0 && layer_id < oxcf->ss_number_layers && stats_copy[layer_id] != NULL) { - *stats_copy[layer_id] = stats[i]; + *stats_copy[layer_id] = stats[n]; ++stats_copy[layer_id]; } } From 12ab4af3aefb320996ace24083fd26e857f4c533 Mon Sep 17 00:00:00 2001 From: James Zern Date: Thu, 6 Apr 2023 13:00:47 -0700 Subject: [PATCH 0722/1000] vp9_dx_iface: clear -Wshadow warnings Bug: webm:1793 Change-Id: Ice6cd08f145e5813e24345d03e0913e5eda5289f --- vp9/vp9_dx_iface.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c index bdfe217936..20e71cc227 100644 --- a/vp9/vp9_dx_iface.c +++ b/vp9/vp9_dx_iface.c @@ -348,7 +348,7 @@ static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx, // Initialize the decoder on the first frame. if (ctx->pbi == NULL) { - const vpx_codec_err_t res = init_decoder(ctx); + res = init_decoder(ctx); if (res != VPX_CODEC_OK) return res; } @@ -367,7 +367,6 @@ static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx, for (i = 0; i < frame_count; ++i) { const uint8_t *data_start_copy = data_start; const uint32_t frame_size = frame_sizes[i]; - vpx_codec_err_t res; if (data_start < data || frame_size > (uint32_t)(data_end - data_start)) { set_error_detail(ctx, "Invalid frame size in index"); return VPX_CODEC_CORRUPT_FRAME; @@ -382,8 +381,7 @@ static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx, const uint8_t *const data_end = data + data_sz; while (data_start < data_end) { const uint32_t frame_size = (uint32_t)(data_end - data_start); - const vpx_codec_err_t res = - decode_one(ctx, &data_start, frame_size, user_priv, deadline); + res = decode_one(ctx, &data_start, frame_size, user_priv, deadline); if (res != VPX_CODEC_OK) return res; // Account for suboptimal termination by the encoder. From 61709a177aa8ef60dbc52e4409beb0d486095d55 Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 10 Apr 2023 13:29:02 -0700 Subject: [PATCH 0723/1000] vp9_quantize_avx2,highbd_get_max_lane_eob: fix mask Pack nz_mask with zero. After the result is permuted this has the effect of ignoring the upper half of the iscan register which is only loaded with 128-bits. Depending on the optimization level and the load used the upper half of the ymm register may contain undefined values which can produce an incorrect eob. If this is large enough it can cause a crash. Bug: chromium:1431729 Change-Id: I4ebae9fa39f228bdd29dcc19935f3f07759d75f5 --- vp9/encoder/x86/vp9_quantize_avx2.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vp9/encoder/x86/vp9_quantize_avx2.c b/vp9/encoder/x86/vp9_quantize_avx2.c index da285be8e7..e6aa71d58a 100644 --- a/vp9/encoder/x86/vp9_quantize_avx2.c +++ b/vp9/encoder/x86/vp9_quantize_avx2.c @@ -295,7 +295,8 @@ static VPX_FORCE_INLINE void highbd_load_fp_values( static VPX_FORCE_INLINE __m256i highbd_get_max_lane_eob( const int16_t *iscan_ptr, __m256i eobmax, __m256i nz_mask) { - const __m256i packed_nz_mask = _mm256_packs_epi32(nz_mask, nz_mask); + const __m256i packed_nz_mask = + _mm256_packs_epi32(nz_mask, _mm256_setzero_si256()); const __m256i packed_nz_mask_perm = _mm256_permute4x64_epi64(packed_nz_mask, 0xD8); const __m256i iscan = From 987ed6937bf27ec5a4e4cb136aa653104adfb068 Mon Sep 17 00:00:00 2001 From: Deepa K G Date: Mon, 3 Apr 2023 23:21:56 +0530 Subject: [PATCH 0724/1000] Avoid redundant start MV SAD calculation Avoided repeated calculation of start MV SAD during full pixel motion search. Instruction Count cpu Resolution Reduction(%) 0 LOWRES2 0.162 0 MIDRES2 0.246 0 HDRES2 0.325 0 Average 0.245 Change-Id: I2b4786901f254ce32ee8ca8a3d56f1c9f112f1d4 --- vp9/common/vp9_rtcd_defs.pl | 2 +- .../arm/neon/vp9_diamond_search_sad_neon.c | 40 +++------------- vp9/encoder/vp9_encoder.h | 34 +++++++++++++ vp9/encoder/vp9_firstpass.c | 18 +++++-- vp9/encoder/vp9_mcomp.c | 48 +++++++------------ vp9/encoder/vp9_mcomp.h | 6 +-- vp9/encoder/x86/vp9_diamond_search_sad_avx.c | 40 +++------------- 7 files changed, 82 insertions(+), 106 deletions(-) diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index 7f77a36d6e..5e60792556 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -171,7 +171,7 @@ () # # Motion search # -add_proto qw/int vp9_diamond_search_sad/, "const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv"; +add_proto qw/int vp9_diamond_search_sad/, "const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv"; specialize qw/vp9_diamond_search_sad avx neon/; # diff --git a/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c b/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c index 997775a668..15334b413b 100644 --- a/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c +++ b/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c @@ -30,30 +30,6 @@ static INLINE int_mv pack_int_mv(int16_t row, int16_t col) { return result; } -static INLINE MV_JOINT_TYPE get_mv_joint(const int_mv mv) { - // This is simplified from the C implementation to utilise that - // x->nmvjointsadcost[1] == x->nmvjointsadcost[2] and - // x->nmvjointsadcost[1] == x->nmvjointsadcost[3] - return mv.as_int == 0 ? 0 : 1; -} - -static INLINE int mv_cost(const int_mv mv, const int *joint_cost, - int *const comp_cost[2]) { - assert(mv.as_mv.row >= -MV_MAX && mv.as_mv.row < MV_MAX); - assert(mv.as_mv.col >= -MV_MAX && mv.as_mv.col < MV_MAX); - return joint_cost[get_mv_joint(mv)] + comp_cost[0][mv.as_mv.row] + - comp_cost[1][mv.as_mv.col]; -} - -static int mvsad_err_cost(const MACROBLOCK *x, const int_mv mv, const MV *ref, - int sad_per_bit) { - const int_mv diff = - pack_int_mv(mv.as_mv.row - ref->row, mv.as_mv.col - ref->col); - return ROUND_POWER_OF_TWO( - (unsigned)mv_cost(diff, x->nmvjointsadcost, x->nmvsadcost) * sad_per_bit, - VP9_PROB_COST_SHIFT); -} - /***************************************************************************** * This function utilizes 3 properties of the cost function lookup tables, * * constructed in using 'cal_nmvjointsadcost' and 'cal_nmvsadcosts' in * @@ -71,8 +47,9 @@ static int mvsad_err_cost(const MACROBLOCK *x, const int_mv mv, const MV *ref, *****************************************************************************/ int vp9_diamond_search_sad_neon(const MACROBLOCK *x, const search_site_config *cfg, MV *ref_mv, - MV *best_mv, int search_param, int sad_per_bit, - int *num00, const vp9_variance_fn_ptr_t *fn_ptr, + uint32_t start_mv_sad, MV *best_mv, + int search_param, int sad_per_bit, int *num00, + const vp9_variance_fn_ptr_t *fn_ptr, const MV *center_mv) { static const uint32_t data[4] = { 0, 1, 2, 3 }; const uint32x4_t v_idx_d = vld1q_u32((const uint32_t *)data); @@ -101,8 +78,8 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x, pack_int_mv(center_mv->row >> 3, center_mv->col >> 3); const int16x8_t vfcmv = vreinterpretq_s16_s32(vdupq_n_s32(fcenter_mv.as_int)); - const int ref_row = clamp(ref_mv->row, minmv.as_mv.row, maxmv.as_mv.row); - const int ref_col = clamp(ref_mv->col, minmv.as_mv.col, maxmv.as_mv.col); + const int ref_row = ref_mv->row; + const int ref_col = ref_mv->col; int_mv bmv = pack_int_mv(ref_row, ref_col); int_mv new_bmv = bmv; @@ -122,7 +99,8 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x, #else int32x4_t v_ba_d = vdupq_n_s32((intptr_t)best_address); #endif - unsigned int best_sad = INT_MAX; + // Starting position + unsigned int best_sad = start_mv_sad; int i, j, step; // Check the prerequisite cost function properties that are easy to check @@ -131,10 +109,6 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x, assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[2]); assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[3]); - // Check the starting position - best_sad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride); - best_sad += mvsad_err_cost(x, bmv, &fcenter_mv.as_mv, sad_per_bit); - *num00 = 0; for (i = 0, step = 0; step < tot_steps; step++) { diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 442ef1899c..9e5e64629e 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -1479,6 +1479,40 @@ static INLINE void alloc_frame_mvs(VP9_COMMON *const cm, int buffer_idx) { } } +static INLINE int mv_cost(const MV *mv, const int *joint_cost, + int *const comp_cost[2]) { + assert(mv->row >= -MV_MAX && mv->row < MV_MAX); + assert(mv->col >= -MV_MAX && mv->col < MV_MAX); + return joint_cost[vp9_get_mv_joint(mv)] + comp_cost[0][mv->row] + + comp_cost[1][mv->col]; +} + +static INLINE int mvsad_err_cost(const MACROBLOCK *x, const MV *mv, + const MV *ref, int sad_per_bit) { + MV diff; + diff.row = mv->row - ref->row; + diff.col = mv->col - ref->col; + return ROUND_POWER_OF_TWO( + (unsigned)mv_cost(&diff, x->nmvjointsadcost, x->nmvsadcost) * sad_per_bit, + VP9_PROB_COST_SHIFT); +} + +static INLINE uint32_t get_start_mv_sad(const MACROBLOCK *x, const MV *mvp_full, + const MV *ref_mv_full, + vpx_sad_fn_t sad_fn_ptr, int sadpb) { + const int src_buf_stride = x->plane[0].src.stride; + const uint8_t *const src_buf = x->plane[0].src.buf; + const MACROBLOCKD *const xd = &x->e_mbd; + const int pred_buf_stride = xd->plane[0].pre[0].stride; + const uint8_t *const pred_buf = + xd->plane[0].pre[0].buf + mvp_full->row * pred_buf_stride + mvp_full->col; + uint32_t start_mv_sad = + sad_fn_ptr(src_buf, src_buf_stride, pred_buf, pred_buf_stride); + start_mv_sad += mvsad_err_cost(x, mvp_full, ref_mv_full, sadpb); + + return start_mv_sad; +} + static INLINE int num_4x4_to_edge(int plane_4x4_dim, int mb_to_edge_dim, int subsampling_dim, int blk_dim) { return plane_4x4_dim + (mb_to_edge_dim >> (5 + subsampling_dim)) - blk_dim; diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index 08b68c93ee..0efa836aca 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -435,6 +435,8 @@ static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x, const BLOCK_SIZE bsize = xd->mi[0]->sb_type; vp9_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[bsize]; const int new_mv_mode_penalty = NEW_MV_MODE_PENALTY; + MV center_mv_full = ref_mv_full; + unsigned int start_mv_sad; int step_param = 3; int further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param; @@ -455,9 +457,15 @@ static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x, } #endif // CONFIG_VP9_HIGHBITDEPTH + // Calculate SAD of the start mv + clamp_mv(&ref_mv_full, x->mv_limits.col_min, x->mv_limits.col_max, + x->mv_limits.row_min, x->mv_limits.row_max); + start_mv_sad = get_start_mv_sad(x, &ref_mv_full, ¢er_mv_full, + cpi->fn_ptr[bsize].sdf, x->sadperbit16); + // Center the initial step/diamond search on best mv. - tmp_err = cpi->diamond_search_sad(x, &cpi->ss_cfg, &ref_mv_full, &tmp_mv, - step_param, x->sadperbit16, &num00, + tmp_err = cpi->diamond_search_sad(x, &cpi->ss_cfg, &ref_mv_full, start_mv_sad, + &tmp_mv, step_param, x->sadperbit16, &num00, &v_fn_ptr, ref_mv); if (tmp_err < INT_MAX) tmp_err = vp9_get_mvpred_var(x, &tmp_mv, ref_mv, &v_fn_ptr, 1); @@ -478,9 +486,9 @@ static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x, if (num00) { --num00; } else { - tmp_err = cpi->diamond_search_sad(x, &cpi->ss_cfg, &ref_mv_full, &tmp_mv, - step_param + n, x->sadperbit16, &num00, - &v_fn_ptr, ref_mv); + tmp_err = cpi->diamond_search_sad( + x, &cpi->ss_cfg, &ref_mv_full, start_mv_sad, &tmp_mv, step_param + n, + x->sadperbit16, &num00, &v_fn_ptr, ref_mv); if (tmp_err < INT_MAX) tmp_err = vp9_get_mvpred_var(x, &tmp_mv, ref_mv, &v_fn_ptr, 1); if (tmp_err < INT_MAX - new_mv_mode_penalty) diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c index 207eb43949..4ff685b242 100644 --- a/vp9/encoder/vp9_mcomp.c +++ b/vp9/encoder/vp9_mcomp.c @@ -77,14 +77,6 @@ int vp9_init_search_range(int size) { return sr; } -static INLINE int mv_cost(const MV *mv, const int *joint_cost, - int *const comp_cost[2]) { - assert(mv->row >= -MV_MAX && mv->row < MV_MAX); - assert(mv->col >= -MV_MAX && mv->col < MV_MAX); - return joint_cost[vp9_get_mv_joint(mv)] + comp_cost[0][mv->row] + - comp_cost[1][mv->col]; -} - int vp9_mv_bit_cost(const MV *mv, const MV *ref, const int *mvjcost, int *mvcost[2], int weight) { const MV diff = { mv->row - ref->row, mv->col - ref->col }; @@ -103,15 +95,6 @@ static int mv_err_cost(const MV *mv, const MV *ref, const int *mvjcost, } return 0; } - -static int mvsad_err_cost(const MACROBLOCK *x, const MV *mv, const MV *ref, - int sad_per_bit) { - const MV diff = { mv->row - ref->row, mv->col - ref->col }; - return ROUND_POWER_OF_TWO( - (unsigned)mv_cost(&diff, x->nmvjointsadcost, x->nmvsadcost) * sad_per_bit, - VP9_PROB_COST_SHIFT); -} - void vp9_init_dsmotion_compensation(search_site_config *cfg, int stride) { int len; int ss_count = 0; @@ -2070,8 +2053,8 @@ int vp9_prepare_nb_full_mvs(const MotionField *motion_field, int mi_row, #endif // CONFIG_NON_GREEDY_MV int vp9_diamond_search_sad_c(const MACROBLOCK *x, const search_site_config *cfg, - MV *ref_mv, MV *best_mv, int search_param, - int sad_per_bit, int *num00, + MV *ref_mv, uint32_t start_mv_sad, MV *best_mv, + int search_param, int sad_per_bit, int *num00, const vp9_variance_fn_ptr_t *fn_ptr, const MV *center_mv) { int i, j, step; @@ -2083,7 +2066,7 @@ int vp9_diamond_search_sad_c(const MACROBLOCK *x, const search_site_config *cfg, const int in_what_stride = xd->plane[0].pre[0].stride; const uint8_t *best_address; - unsigned int bestsad = INT_MAX; + unsigned int bestsad = start_mv_sad; int best_site = -1; int last_site = -1; @@ -2101,8 +2084,6 @@ int vp9_diamond_search_sad_c(const MACROBLOCK *x, const search_site_config *cfg, const int tot_steps = cfg->total_steps - search_param; const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 }; - clamp_mv(ref_mv, x->mv_limits.col_min, x->mv_limits.col_max, - x->mv_limits.row_min, x->mv_limits.row_max); ref_row = ref_mv->row; ref_col = ref_mv->col; *num00 = 0; @@ -2113,10 +2094,6 @@ int vp9_diamond_search_sad_c(const MACROBLOCK *x, const search_site_config *cfg, in_what = xd->plane[0].pre[0].buf + ref_row * in_what_stride + ref_col; best_address = in_what; - // Check the starting position - bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride) + - mvsad_err_cost(x, best_mv, &fcenter_mv, sad_per_bit); - i = 0; for (step = 0; step < tot_steps; step++) { @@ -2514,8 +2491,17 @@ static int full_pixel_diamond(const VP9_COMP *const cpi, const MV *ref_mv, MV *dst_mv) { MV temp_mv; int thissme, n, num00 = 0; - int bestsme = cpi->diamond_search_sad(x, &cpi->ss_cfg, mvp_full, &temp_mv, - step_param, sadpb, &n, fn_ptr, ref_mv); + int bestsme; + unsigned int start_mv_sad; + const MV ref_mv_full = { ref_mv->row >> 3, ref_mv->col >> 3 }; + clamp_mv(mvp_full, x->mv_limits.col_min, x->mv_limits.col_max, + x->mv_limits.row_min, x->mv_limits.row_max); + start_mv_sad = + get_start_mv_sad(x, mvp_full, &ref_mv_full, fn_ptr->sdf, sadpb); + + bestsme = + cpi->diamond_search_sad(x, &cpi->ss_cfg, mvp_full, start_mv_sad, &temp_mv, + step_param, sadpb, &n, fn_ptr, ref_mv); if (bestsme < INT_MAX) bestsme = vp9_get_mvpred_var(x, &temp_mv, ref_mv, fn_ptr, 1); *dst_mv = temp_mv; @@ -2530,9 +2516,9 @@ static int full_pixel_diamond(const VP9_COMP *const cpi, if (num00) { num00--; } else { - thissme = cpi->diamond_search_sad(x, &cpi->ss_cfg, mvp_full, &temp_mv, - step_param + n, sadpb, &num00, fn_ptr, - ref_mv); + thissme = cpi->diamond_search_sad(x, &cpi->ss_cfg, mvp_full, start_mv_sad, + &temp_mv, step_param + n, sadpb, &num00, + fn_ptr, ref_mv); if (thissme < INT_MAX) thissme = vp9_get_mvpred_var(x, &temp_mv, ref_mv, fn_ptr, 1); diff --git a/vp9/encoder/vp9_mcomp.h b/vp9/encoder/vp9_mcomp.h index bdaf2ce77d..62a7a047d4 100644 --- a/vp9/encoder/vp9_mcomp.h +++ b/vp9/encoder/vp9_mcomp.h @@ -94,9 +94,9 @@ extern fractional_mv_step_fp vp9_return_max_sub_pixel_mv; extern fractional_mv_step_fp vp9_return_min_sub_pixel_mv; typedef int (*vp9_diamond_search_fn_t)( - const MACROBLOCK *x, const search_site_config *cfg, MV *ref_mv, MV *best_mv, - int search_param, int sad_per_bit, int *num00, - const vp9_variance_fn_ptr_t *fn_ptr, const MV *center_mv); + const MACROBLOCK *x, const search_site_config *cfg, MV *ref_mv, + uint32_t start_mv_sad, MV *best_mv, int search_param, int sad_per_bit, + int *num00, const vp9_variance_fn_ptr_t *fn_ptr, const MV *center_mv); int vp9_refining_search_8p_c(const MACROBLOCK *x, MV *ref_mv, int error_per_bit, int search_range, diff --git a/vp9/encoder/x86/vp9_diamond_search_sad_avx.c b/vp9/encoder/x86/vp9_diamond_search_sad_avx.c index 0e04a2f41f..719ab40f90 100644 --- a/vp9/encoder/x86/vp9_diamond_search_sad_avx.c +++ b/vp9/encoder/x86/vp9_diamond_search_sad_avx.c @@ -32,29 +32,6 @@ static INLINE int_mv pack_int_mv(int16_t row, int16_t col) { result.as_mv.col = col; return result; } - -static INLINE MV_JOINT_TYPE get_mv_joint(const int_mv mv) { - // This is simplified from the C implementation to utilise that - // x->nmvjointsadcost[1] == x->nmvjointsadcost[2] and - // x->nmvjointsadcost[1] == x->nmvjointsadcost[3] - return mv.as_int == 0 ? 0 : 1; -} - -static INLINE int mv_cost(const int_mv mv, const int *joint_cost, - int *const comp_cost[2]) { - return joint_cost[get_mv_joint(mv)] + comp_cost[0][mv.as_mv.row] + - comp_cost[1][mv.as_mv.col]; -} - -static int mvsad_err_cost(const MACROBLOCK *x, const int_mv mv, const MV *ref, - int sad_per_bit) { - const int_mv diff = - pack_int_mv(mv.as_mv.row - ref->row, mv.as_mv.col - ref->col); - return ROUND_POWER_OF_TWO( - (unsigned)mv_cost(diff, x->nmvjointsadcost, x->nmvsadcost) * sad_per_bit, - VP9_PROB_COST_SHIFT); -} - /***************************************************************************** * This function utilizes 3 properties of the cost function lookup tables, * * constructed in using 'cal_nmvjointsadcost' and 'cal_nmvsadcosts' in * @@ -72,8 +49,9 @@ static int mvsad_err_cost(const MACROBLOCK *x, const int_mv mv, const MV *ref, *****************************************************************************/ int vp9_diamond_search_sad_avx(const MACROBLOCK *x, const search_site_config *cfg, MV *ref_mv, - MV *best_mv, int search_param, int sad_per_bit, - int *num00, const vp9_variance_fn_ptr_t *fn_ptr, + uint32_t start_mv_sad, MV *best_mv, + int search_param, int sad_per_bit, int *num00, + const vp9_variance_fn_ptr_t *fn_ptr, const MV *center_mv) { const int_mv maxmv = pack_int_mv(x->mv_limits.row_max, x->mv_limits.col_max); const __m128i v_max_mv_w = _mm_set1_epi32((int)maxmv.as_int); @@ -98,8 +76,8 @@ int vp9_diamond_search_sad_avx(const MACROBLOCK *x, pack_int_mv(center_mv->row >> 3, center_mv->col >> 3); const __m128i vfcmv = _mm_set1_epi32((int)fcenter_mv.as_int); - const int ref_row = clamp(ref_mv->row, minmv.as_mv.row, maxmv.as_mv.row); - const int ref_col = clamp(ref_mv->col, minmv.as_mv.col, maxmv.as_mv.col); + const int ref_row = ref_mv->row; + const int ref_col = ref_mv->col; int_mv bmv = pack_int_mv(ref_row, ref_col); int_mv new_bmv = bmv; @@ -119,8 +97,8 @@ int vp9_diamond_search_sad_avx(const MACROBLOCK *x, #else __m128i v_ba_d = _mm_set1_epi32((intptr_t)best_address); #endif - - unsigned int best_sad; + // Starting position + unsigned int best_sad = start_mv_sad; int i, j, step; // Check the prerequisite cost function properties that are easy to check @@ -129,10 +107,6 @@ int vp9_diamond_search_sad_avx(const MACROBLOCK *x, assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[2]); assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[3]); - // Check the starting position - best_sad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride); - best_sad += mvsad_err_cost(x, bmv, &fcenter_mv.as_mv, sad_per_bit); - *num00 = 0; for (i = 0, step = 0; step < tot_steps; step++) { From 35c32b1d223edbf70b61d5fd247d3890acce2025 Mon Sep 17 00:00:00 2001 From: Cherma Rajan A Date: Tue, 11 Apr 2023 14:50:18 +0530 Subject: [PATCH 0725/1000] Add assert to ensure NEARESTMV or NEWMV modes are not skipped Added an assert for prune_single_mode_based_on_mv_diff_mode_rate speed feature. This ensures NEARMV or ZEROMV modes are pruned only when NEARESTMV and NEWMV modes are not early terminated. Change-Id: Id8b03eef6d1ef3f16714a9cbfde0c171c0c6fe0b --- vp9/encoder/vp9_rdopt.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index 05811bd828..9121eeac15 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -2967,8 +2967,14 @@ static int64_t handle_inter_mode( // rate. if (skip_single_mode_based_on_mode_rate(mode_mv, single_mode_rate, this_mode, refs[0], *rate2, - best_mode_index)) + best_mode_index)) { + // Check when the single inter mode is pruned, NEARESTMV or NEWMV modes + // are not early terminated. This ensures all single modes are not getting + // skipped when the speed feature is enabled. + assert(single_mode_rate[INTER_OFFSET(NEARESTMV)] != INT_MAX || + single_mode_rate[INTER_OFFSET(NEWMV)] != INT_MAX); return INT64_MAX; + } } if (RDCOST(x->rdmult, x->rddiv, *rate2, 0) > ref_best_rd && mi->mode != NEARESTMV) From 232f8659aafec1461cac76f76885c8663755957f Mon Sep 17 00:00:00 2001 From: Deepa K G Date: Wed, 5 Apr 2023 16:42:54 +0530 Subject: [PATCH 0726/1000] Downsample SAD computation in motion search Added a speed feature to skip every other row in SAD computation during motion search. Instruction Count BD-Rate Loss(%) cpu Resolution Reduction(%) avg.psnr ovr.psnr ssim 0 LOWRES2 0.958 0.0204 0.0095 0.0275 0 MIDRES2 1.891 -0.0636 0.0032 0.0247 0 HDRES2 2.869 0.0434 0.0345 0.0686 0 Average 1.905 0.0000 0.0157 0.0403 STATS_CHANGED Change-Id: I1a8692757ed0cbcb2259729b3ecfb0436cdf49ce --- test/sad_test.cc | 515 ++++++++++++++ vp9/common/vp9_rtcd_defs.pl | 4 +- .../arm/neon/vp9_diamond_search_sad_neon.c | 6 +- vp9/encoder/vp9_encoder.c | 647 ++++++++++-------- vp9/encoder/vp9_firstpass.c | 7 +- vp9/encoder/vp9_mcomp.c | 97 ++- vp9/encoder/vp9_mcomp.h | 10 +- vp9/encoder/vp9_speed_features.c | 2 + vp9/encoder/vp9_speed_features.h | 4 + vp9/encoder/x86/vp9_diamond_search_sad_avx.c | 6 +- vpx_dsp/sad.c | 30 + vpx_dsp/variance.h | 4 + vpx_dsp/vpx_dsp_rtcd_defs.pl | 147 ++++ vpx_dsp/x86/highbd_sad4d_avx2.c | 313 +++++---- vpx_dsp/x86/highbd_sad4d_sse2.asm | 43 +- vpx_dsp/x86/highbd_sad_avx2.c | 188 +++-- vpx_dsp/x86/highbd_sad_sse2.asm | 59 +- vpx_dsp/x86/sad4d_avx2.c | 66 +- vpx_dsp/x86/sad4d_sse2.asm | 43 +- vpx_dsp/x86/sad_avx2.c | 145 ++-- vpx_dsp/x86/sad_sse2.asm | 70 +- 21 files changed, 1824 insertions(+), 582 deletions(-) diff --git a/test/sad_test.cc b/test/sad_test.cc index 0896c77f12..561da5ddfb 100644 --- a/test/sad_test.cc +++ b/test/sad_test.cc @@ -42,6 +42,10 @@ typedef unsigned int (*SadMxNFunc)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); typedef TestParams SadMxNParam; +typedef unsigned int (*SadSkipMxNFunc)(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride); +typedef TestParams SadSkipMxNParam; + typedef unsigned int (*SadMxNAvgFunc)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); @@ -52,6 +56,11 @@ typedef void (*SadMxNx4Func)(const uint8_t *src_ptr, int src_stride, unsigned int *sad_array); typedef TestParams SadMxNx4Param; +typedef void (*SadSkipMxNx4Func)(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_ptr[], int ref_stride, + unsigned int *sad_array); +typedef TestParams SadSkipMxNx4Param; + typedef void (*SadMxNx8Func)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); @@ -170,6 +179,34 @@ class SADTestBase : public ::testing::TestWithParam { return sad; } + // Sum of Absolute Differences Skip rows. Given two blocks, calculate the + // absolute difference between two pixels in the same relative location every + // other row; accumulate and double the result at the end. + uint32_t ReferenceSADSkip(int ref_offset) const { + uint32_t sad = 0; + const uint8_t *const reference8 = GetReferenceFromOffset(ref_offset); + const uint8_t *const source8 = source_data_; +#if CONFIG_VP9_HIGHBITDEPTH + const uint16_t *const reference16 = + CONVERT_TO_SHORTPTR(GetReferenceFromOffset(ref_offset)); + const uint16_t *const source16 = CONVERT_TO_SHORTPTR(source_data_); +#endif // CONFIG_VP9_HIGHBITDEPTH + for (int h = 0; h < params_.height; h += 2) { + for (int w = 0; w < params_.width; ++w) { + if (!use_high_bit_depth_) { + sad += abs(source8[h * source_stride_ + w] - + reference8[h * reference_stride_ + w]); +#if CONFIG_VP9_HIGHBITDEPTH + } else { + sad += abs(source16[h * source_stride_ + w] - + reference16[h * reference_stride_ + w]); +#endif // CONFIG_VP9_HIGHBITDEPTH + } + } + } + return sad * 2; + } + // Sum of Absolute Differences Average. Given two blocks, and a prediction // calculate the absolute difference between one pixel and average of the // corresponding and predicted pixels; accumulate. @@ -290,6 +327,32 @@ class SADx4Test : public SADTestBase { } }; +class SADSkipx4Test : public SADTestBase { + public: + SADSkipx4Test() : SADTestBase(GetParam()) {} + + protected: + void SADs(unsigned int *results) const { + const uint8_t *references[] = { GetReference(0), GetReference(1), + GetReference(2), GetReference(3) }; + + ASM_REGISTER_STATE_CHECK(params_.func( + source_data_, source_stride_, references, reference_stride_, results)); + } + + void CheckSADs() const { + uint32_t reference_sad; + DECLARE_ALIGNED(kDataAlignment, uint32_t, exp_sad[4]); + + SADs(exp_sad); + for (int block = 0; block < 4; ++block) { + reference_sad = ReferenceSADSkip(GetBlockRefOffset(block)); + + EXPECT_EQ(reference_sad, exp_sad[block]) << "block " << block; + } + } +}; + class SADTest : public AbstractBench, public SADTestBase { public: SADTest() : SADTestBase(GetParam()) {} @@ -317,6 +380,33 @@ class SADTest : public AbstractBench, public SADTestBase { } }; +class SADSkipTest : public AbstractBench, public SADTestBase { + public: + SADSkipTest() : SADTestBase(GetParam()) {} + + protected: + unsigned int SAD(int block_idx) const { + unsigned int ret; + const uint8_t *const reference = GetReference(block_idx); + + ASM_REGISTER_STATE_CHECK(ret = params_.func(source_data_, source_stride_, + reference, reference_stride_)); + return ret; + } + + void CheckSAD() const { + const unsigned int reference_sad = ReferenceSADSkip(GetBlockRefOffset(0)); + const unsigned int exp_sad = SAD(0); + + ASSERT_EQ(reference_sad, exp_sad); + } + + void Run() override { + params_.func(source_data_, source_stride_, reference_data_, + reference_stride_); + } +}; + class SADavgTest : public AbstractBench, public SADTestBase { public: SADavgTest() : SADTestBase(GetParam()) {} @@ -397,6 +487,58 @@ TEST_P(SADTest, DISABLED_Speed) { PrintMedian(title); } +TEST_P(SADSkipTest, MaxRef) { + FillConstant(source_data_, source_stride_, 0); + FillConstant(reference_data_, reference_stride_, mask_); + CheckSAD(); +} + +TEST_P(SADSkipTest, MaxSrc) { + FillConstant(source_data_, source_stride_, mask_); + FillConstant(reference_data_, reference_stride_, 0); + CheckSAD(); +} + +TEST_P(SADSkipTest, ShortRef) { + const int tmp_stride = reference_stride_; + reference_stride_ >>= 1; + FillRandom(source_data_, source_stride_); + FillRandom(reference_data_, reference_stride_); + CheckSAD(); + reference_stride_ = tmp_stride; +} + +TEST_P(SADSkipTest, UnalignedRef) { + // The reference frame, but not the source frame, may be unaligned for + // certain types of searches. + const int tmp_stride = reference_stride_; + reference_stride_ -= 1; + FillRandom(source_data_, source_stride_); + FillRandom(reference_data_, reference_stride_); + CheckSAD(); + reference_stride_ = tmp_stride; +} + +TEST_P(SADSkipTest, ShortSrc) { + const int tmp_stride = source_stride_; + source_stride_ >>= 1; + FillRandom(source_data_, source_stride_); + FillRandom(reference_data_, reference_stride_); + CheckSAD(); + source_stride_ = tmp_stride; +} + +TEST_P(SADSkipTest, DISABLED_Speed) { + const int kCountSpeedTestBlock = 50000000 / (params_.width * params_.height); + FillRandom(source_data_, source_stride_); + + RunNTimes(kCountSpeedTestBlock); + + char title[16]; + snprintf(title, sizeof(title), "%dx%d", params_.width, params_.height); + PrintMedian(title); +} + TEST_P(SADavgTest, MaxRef) { FillConstant(source_data_, source_stride_, 0); FillConstant(reference_data_, reference_stride_, mask_); @@ -554,6 +696,105 @@ TEST_P(SADx4Test, DISABLED_Speed) { reference_stride_ = tmp_stride; } +TEST_P(SADSkipx4Test, MaxRef) { + FillConstant(source_data_, source_stride_, 0); + FillConstant(GetReference(0), reference_stride_, mask_); + FillConstant(GetReference(1), reference_stride_, mask_); + FillConstant(GetReference(2), reference_stride_, mask_); + FillConstant(GetReference(3), reference_stride_, mask_); + CheckSADs(); +} + +TEST_P(SADSkipx4Test, MaxSrc) { + FillConstant(source_data_, source_stride_, mask_); + FillConstant(GetReference(0), reference_stride_, 0); + FillConstant(GetReference(1), reference_stride_, 0); + FillConstant(GetReference(2), reference_stride_, 0); + FillConstant(GetReference(3), reference_stride_, 0); + CheckSADs(); +} + +TEST_P(SADSkipx4Test, ShortRef) { + int tmp_stride = reference_stride_; + reference_stride_ >>= 1; + FillRandom(source_data_, source_stride_); + FillRandom(GetReference(0), reference_stride_); + FillRandom(GetReference(1), reference_stride_); + FillRandom(GetReference(2), reference_stride_); + FillRandom(GetReference(3), reference_stride_); + CheckSADs(); + reference_stride_ = tmp_stride; +} + +TEST_P(SADSkipx4Test, UnalignedRef) { + // The reference frame, but not the source frame, may be unaligned for + // certain types of searches. + int tmp_stride = reference_stride_; + reference_stride_ -= 1; + FillRandom(source_data_, source_stride_); + FillRandom(GetReference(0), reference_stride_); + FillRandom(GetReference(1), reference_stride_); + FillRandom(GetReference(2), reference_stride_); + FillRandom(GetReference(3), reference_stride_); + CheckSADs(); + reference_stride_ = tmp_stride; +} + +TEST_P(SADSkipx4Test, ShortSrc) { + int tmp_stride = source_stride_; + source_stride_ >>= 1; + FillRandom(source_data_, source_stride_); + FillRandom(GetReference(0), reference_stride_); + FillRandom(GetReference(1), reference_stride_); + FillRandom(GetReference(2), reference_stride_); + FillRandom(GetReference(3), reference_stride_); + CheckSADs(); + source_stride_ = tmp_stride; +} + +TEST_P(SADSkipx4Test, SrcAlignedByWidth) { + uint8_t *tmp_source_data = source_data_; + source_data_ += params_.width; + FillRandom(source_data_, source_stride_); + FillRandom(GetReference(0), reference_stride_); + FillRandom(GetReference(1), reference_stride_); + FillRandom(GetReference(2), reference_stride_); + FillRandom(GetReference(3), reference_stride_); + CheckSADs(); + source_data_ = tmp_source_data; +} + +TEST_P(SADSkipx4Test, DISABLED_Speed) { + int tmp_stride = reference_stride_; + reference_stride_ -= 1; + FillRandom(source_data_, source_stride_); + FillRandom(GetReference(0), reference_stride_); + FillRandom(GetReference(1), reference_stride_); + FillRandom(GetReference(2), reference_stride_); + FillRandom(GetReference(3), reference_stride_); + const int kCountSpeedTestBlock = 500000000 / (params_.width * params_.height); + uint32_t reference_sad[4]; + DECLARE_ALIGNED(kDataAlignment, uint32_t, exp_sad[4]); + vpx_usec_timer timer; + for (int block = 0; block < 4; ++block) { + reference_sad[block] = ReferenceSADSkip(GetBlockRefOffset(block)); + } + vpx_usec_timer_start(&timer); + for (int i = 0; i < kCountSpeedTestBlock; ++i) { + SADs(exp_sad); + } + vpx_usec_timer_mark(&timer); + for (int block = 0; block < 4; ++block) { + EXPECT_EQ(reference_sad[block], exp_sad[block]) << "block " << block; + } + const int elapsed_time = + static_cast(vpx_usec_timer_elapsed(&timer) / 1000); + printf("sad%dx%dx4 (%2dbit) time: %5d ms\n", params_.width, params_.height, + bit_depth_, elapsed_time); + + reference_stride_ = tmp_stride; +} + //------------------------------------------------------------------------------ // C functions const SadMxNParam c_tests[] = { @@ -614,6 +855,56 @@ const SadMxNParam c_tests[] = { }; INSTANTIATE_TEST_SUITE_P(C, SADTest, ::testing::ValuesIn(c_tests)); +const SadSkipMxNParam skip_c_tests[] = { + SadSkipMxNParam(64, 64, &vpx_sad_skip_64x64_c), + SadSkipMxNParam(64, 32, &vpx_sad_skip_64x32_c), + SadSkipMxNParam(32, 64, &vpx_sad_skip_32x64_c), + SadSkipMxNParam(32, 32, &vpx_sad_skip_32x32_c), + SadSkipMxNParam(32, 16, &vpx_sad_skip_32x16_c), + SadSkipMxNParam(16, 32, &vpx_sad_skip_16x32_c), + SadSkipMxNParam(16, 16, &vpx_sad_skip_16x16_c), + SadSkipMxNParam(16, 8, &vpx_sad_skip_16x8_c), + SadSkipMxNParam(8, 16, &vpx_sad_skip_8x16_c), + SadSkipMxNParam(8, 8, &vpx_sad_skip_8x8_c), + SadSkipMxNParam(4, 8, &vpx_sad_skip_4x8_c), +#if CONFIG_VP9_HIGHBITDEPTH + SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_c, 8), + SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_c, 8), + SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_c, 8), + SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_c, 8), + SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_c, 8), + SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_c, 8), + SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_c, 8), + SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_c, 8), + SadSkipMxNParam(8, 16, &vpx_highbd_sad_skip_8x16_c, 8), + SadSkipMxNParam(8, 8, &vpx_highbd_sad_skip_8x8_c, 8), + SadSkipMxNParam(4, 8, &vpx_highbd_sad_skip_4x8_c, 8), + SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_c, 10), + SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_c, 10), + SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_c, 10), + SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_c, 10), + SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_c, 10), + SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_c, 10), + SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_c, 10), + SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_c, 10), + SadSkipMxNParam(8, 16, &vpx_highbd_sad_skip_8x16_c, 10), + SadSkipMxNParam(8, 8, &vpx_highbd_sad_skip_8x8_c, 10), + SadSkipMxNParam(4, 8, &vpx_highbd_sad_skip_4x8_c, 10), + SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_c, 12), + SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_c, 12), + SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_c, 12), + SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_c, 12), + SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_c, 12), + SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_c, 12), + SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_c, 12), + SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_c, 12), + SadSkipMxNParam(8, 16, &vpx_highbd_sad_skip_8x16_c, 12), + SadSkipMxNParam(8, 8, &vpx_highbd_sad_skip_8x8_c, 12), + SadSkipMxNParam(4, 8, &vpx_highbd_sad_skip_4x8_c, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH +}; +INSTANTIATE_TEST_SUITE_P(C, SADSkipTest, ::testing::ValuesIn(skip_c_tests)); + const SadMxNAvgParam avg_c_tests[] = { SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_c), SadMxNAvgParam(64, 32, &vpx_sad64x32_avg_c), @@ -730,6 +1021,57 @@ const SadMxNx4Param x4d_c_tests[] = { }; INSTANTIATE_TEST_SUITE_P(C, SADx4Test, ::testing::ValuesIn(x4d_c_tests)); +const SadSkipMxNx4Param skip_x4d_c_tests[] = { + SadSkipMxNx4Param(64, 64, &vpx_sad_skip_64x64x4d_c), + SadSkipMxNx4Param(64, 32, &vpx_sad_skip_64x32x4d_c), + SadSkipMxNx4Param(32, 64, &vpx_sad_skip_32x64x4d_c), + SadSkipMxNx4Param(32, 32, &vpx_sad_skip_32x32x4d_c), + SadSkipMxNx4Param(32, 16, &vpx_sad_skip_32x16x4d_c), + SadSkipMxNx4Param(16, 32, &vpx_sad_skip_16x32x4d_c), + SadSkipMxNx4Param(16, 16, &vpx_sad_skip_16x16x4d_c), + SadSkipMxNx4Param(16, 8, &vpx_sad_skip_16x8x4d_c), + SadSkipMxNx4Param(8, 16, &vpx_sad_skip_8x16x4d_c), + SadSkipMxNx4Param(8, 8, &vpx_sad_skip_8x8x4d_c), + SadSkipMxNx4Param(4, 8, &vpx_sad_skip_4x8x4d_c), +#if CONFIG_VP9_HIGHBITDEPTH + SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_c, 8), + SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_c, 8), + SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_c, 8), + SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_c, 8), + SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_c, 8), + SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_c, 8), + SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_c, 8), + SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_c, 8), + SadSkipMxNx4Param(8, 16, &vpx_highbd_sad_skip_8x16x4d_c, 8), + SadSkipMxNx4Param(8, 8, &vpx_highbd_sad_skip_8x8x4d_c, 8), + SadSkipMxNx4Param(4, 8, &vpx_highbd_sad_skip_4x8x4d_c, 8), + SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_c, 10), + SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_c, 10), + SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_c, 10), + SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_c, 10), + SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_c, 10), + SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_c, 10), + SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_c, 10), + SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_c, 10), + SadSkipMxNx4Param(8, 16, &vpx_highbd_sad_skip_8x16x4d_c, 10), + SadSkipMxNx4Param(8, 8, &vpx_highbd_sad_skip_8x8x4d_c, 10), + SadSkipMxNx4Param(4, 8, &vpx_highbd_sad_skip_4x8x4d_c, 10), + SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_c, 12), + SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_c, 12), + SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_c, 12), + SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_c, 12), + SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_c, 12), + SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_c, 12), + SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_c, 12), + SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_c, 12), + SadSkipMxNx4Param(8, 16, &vpx_highbd_sad_skip_8x16x4d_c, 12), + SadSkipMxNx4Param(8, 8, &vpx_highbd_sad_skip_8x8x4d_c, 12), + SadSkipMxNx4Param(4, 8, &vpx_highbd_sad_skip_4x8x4d_c, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH +}; +INSTANTIATE_TEST_SUITE_P(C, SADSkipx4Test, + ::testing::ValuesIn(skip_x4d_c_tests)); + //------------------------------------------------------------------------------ // ARM functions #if HAVE_NEON @@ -956,6 +1298,54 @@ const SadMxNParam sse2_tests[] = { }; INSTANTIATE_TEST_SUITE_P(SSE2, SADTest, ::testing::ValuesIn(sse2_tests)); +const SadSkipMxNParam skip_sse2_tests[] = { + SadSkipMxNParam(64, 64, &vpx_sad_skip_64x64_sse2), + SadSkipMxNParam(64, 32, &vpx_sad_skip_64x32_sse2), + SadSkipMxNParam(32, 64, &vpx_sad_skip_32x64_sse2), + SadSkipMxNParam(32, 32, &vpx_sad_skip_32x32_sse2), + SadSkipMxNParam(32, 16, &vpx_sad_skip_32x16_sse2), + SadSkipMxNParam(16, 32, &vpx_sad_skip_16x32_sse2), + SadSkipMxNParam(16, 16, &vpx_sad_skip_16x16_sse2), + SadSkipMxNParam(16, 8, &vpx_sad_skip_16x8_sse2), + SadSkipMxNParam(8, 16, &vpx_sad_skip_8x16_sse2), + SadSkipMxNParam(8, 8, &vpx_sad_skip_8x8_sse2), + SadSkipMxNParam(4, 8, &vpx_sad_skip_4x8_sse2), +#if CONFIG_VP9_HIGHBITDEPTH + SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_sse2, 8), + SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_sse2, 8), + SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_sse2, 8), + SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_sse2, 8), + SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_sse2, 8), + SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_sse2, 8), + SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_sse2, 8), + SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_sse2, 8), + SadSkipMxNParam(8, 16, &vpx_highbd_sad_skip_8x16_sse2, 8), + SadSkipMxNParam(8, 8, &vpx_highbd_sad_skip_8x8_sse2, 8), + SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_sse2, 10), + SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_sse2, 10), + SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_sse2, 10), + SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_sse2, 10), + SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_sse2, 10), + SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_sse2, 10), + SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_sse2, 10), + SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_sse2, 10), + SadSkipMxNParam(8, 16, &vpx_highbd_sad_skip_8x16_sse2, 10), + SadSkipMxNParam(8, 8, &vpx_highbd_sad_skip_8x8_sse2, 10), + SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_sse2, 12), + SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_sse2, 12), + SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_sse2, 12), + SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_sse2, 12), + SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_sse2, 12), + SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_sse2, 12), + SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_sse2, 12), + SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_sse2, 12), + SadSkipMxNParam(8, 16, &vpx_highbd_sad_skip_8x16_sse2, 12), + SadSkipMxNParam(8, 8, &vpx_highbd_sad_skip_8x8_sse2, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH +}; +INSTANTIATE_TEST_SUITE_P(SSE2, SADSkipTest, + ::testing::ValuesIn(skip_sse2_tests)); + const SadMxNAvgParam avg_sse2_tests[] = { SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_sse2), SadMxNAvgParam(64, 32, &vpx_sad64x32_avg_sse2), @@ -1065,6 +1455,57 @@ const SadMxNx4Param x4d_sse2_tests[] = { #endif // CONFIG_VP9_HIGHBITDEPTH }; INSTANTIATE_TEST_SUITE_P(SSE2, SADx4Test, ::testing::ValuesIn(x4d_sse2_tests)); + +const SadSkipMxNx4Param skip_x4d_sse2_tests[] = { + SadSkipMxNx4Param(64, 64, &vpx_sad_skip_64x64x4d_sse2), + SadSkipMxNx4Param(64, 32, &vpx_sad_skip_64x32x4d_sse2), + SadSkipMxNx4Param(32, 64, &vpx_sad_skip_32x64x4d_sse2), + SadSkipMxNx4Param(32, 32, &vpx_sad_skip_32x32x4d_sse2), + SadSkipMxNx4Param(32, 16, &vpx_sad_skip_32x16x4d_sse2), + SadSkipMxNx4Param(16, 32, &vpx_sad_skip_16x32x4d_sse2), + SadSkipMxNx4Param(16, 16, &vpx_sad_skip_16x16x4d_sse2), + SadSkipMxNx4Param(16, 8, &vpx_sad_skip_16x8x4d_sse2), + SadSkipMxNx4Param(8, 16, &vpx_sad_skip_8x16x4d_sse2), + SadSkipMxNx4Param(8, 8, &vpx_sad_skip_8x8x4d_sse2), + SadSkipMxNx4Param(4, 8, &vpx_sad_skip_4x8x4d_sse2), +#if CONFIG_VP9_HIGHBITDEPTH + SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_sse2, 8), + SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_sse2, 8), + SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_sse2, 8), + SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_sse2, 8), + SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_sse2, 8), + SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_sse2, 8), + SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_sse2, 8), + SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_sse2, 8), + SadSkipMxNx4Param(8, 16, &vpx_highbd_sad_skip_8x16x4d_sse2, 8), + SadSkipMxNx4Param(8, 8, &vpx_highbd_sad_skip_8x8x4d_sse2, 8), + SadSkipMxNx4Param(4, 8, &vpx_highbd_sad_skip_4x8x4d_sse2, 8), + SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_sse2, 10), + SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_sse2, 10), + SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_sse2, 10), + SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_sse2, 10), + SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_sse2, 10), + SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_sse2, 10), + SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_sse2, 10), + SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_sse2, 10), + SadSkipMxNx4Param(8, 16, &vpx_highbd_sad_skip_8x16x4d_sse2, 10), + SadSkipMxNx4Param(8, 8, &vpx_highbd_sad_skip_8x8x4d_sse2, 10), + SadSkipMxNx4Param(4, 8, &vpx_highbd_sad_skip_4x8x4d_sse2, 10), + SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_sse2, 12), + SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_sse2, 12), + SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_sse2, 12), + SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_sse2, 12), + SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_sse2, 12), + SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_sse2, 12), + SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_sse2, 12), + SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_sse2, 12), + SadSkipMxNx4Param(8, 16, &vpx_highbd_sad_skip_8x16x4d_sse2, 12), + SadSkipMxNx4Param(8, 8, &vpx_highbd_sad_skip_8x8x4d_sse2, 12), + SadSkipMxNx4Param(4, 8, &vpx_highbd_sad_skip_4x8x4d_sse2, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH +}; +INSTANTIATE_TEST_SUITE_P(SSE2, SADSkipx4Test, + ::testing::ValuesIn(skip_x4d_sse2_tests)); #endif // HAVE_SSE2 #if HAVE_SSE3 @@ -1113,6 +1554,44 @@ const SadMxNParam avx2_tests[] = { }; INSTANTIATE_TEST_SUITE_P(AVX2, SADTest, ::testing::ValuesIn(avx2_tests)); +const SadSkipMxNParam skip_avx2_tests[] = { + SadSkipMxNParam(64, 64, &vpx_sad_skip_64x64_avx2), + SadSkipMxNParam(64, 32, &vpx_sad_skip_64x32_avx2), + SadSkipMxNParam(32, 64, &vpx_sad_skip_32x64_avx2), + SadSkipMxNParam(32, 32, &vpx_sad_skip_32x32_avx2), + SadSkipMxNParam(32, 16, &vpx_sad_skip_32x16_avx2), +#if CONFIG_VP9_HIGHBITDEPTH + SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_avx2, 8), + SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_avx2, 8), + SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_avx2, 8), + SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_avx2, 8), + SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_avx2, 8), + SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_avx2, 8), + SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_avx2, 8), + SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_avx2, 8), + + SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_avx2, 10), + SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_avx2, 10), + SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_avx2, 10), + SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_avx2, 10), + SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_avx2, 10), + SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_avx2, 10), + SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_avx2, 10), + SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_avx2, 10), + + SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_avx2, 12), + SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_avx2, 12), + SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_avx2, 12), + SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_avx2, 12), + SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_avx2, 12), + SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_avx2, 12), + SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_avx2, 12), + SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_avx2, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH +}; +INSTANTIATE_TEST_SUITE_P(AVX2, SADSkipTest, + ::testing::ValuesIn(skip_avx2_tests)); + const SadMxNAvgParam avg_avx2_tests[] = { SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_avx2), SadMxNAvgParam(64, 32, &vpx_sad64x32_avg_avx2), @@ -1180,6 +1659,42 @@ const SadMxNx4Param x4d_avx2_tests[] = { }; INSTANTIATE_TEST_SUITE_P(AVX2, SADx4Test, ::testing::ValuesIn(x4d_avx2_tests)); +const SadSkipMxNx4Param skip_x4d_avx2_tests[] = { + SadSkipMxNx4Param(64, 64, &vpx_sad_skip_64x64x4d_avx2), + SadSkipMxNx4Param(64, 32, &vpx_sad_skip_64x32x4d_avx2), + SadSkipMxNx4Param(32, 64, &vpx_sad_skip_32x64x4d_avx2), + SadSkipMxNx4Param(32, 32, &vpx_sad_skip_32x32x4d_avx2), + SadSkipMxNx4Param(32, 16, &vpx_sad_skip_32x16x4d_avx2), +#if CONFIG_VP9_HIGHBITDEPTH + SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_avx2, 8), + SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_avx2, 8), + SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_avx2, 8), + SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_avx2, 8), + SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_avx2, 8), + SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_avx2, 8), + SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_avx2, 8), + SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_avx2, 8), + SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_avx2, 10), + SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_avx2, 10), + SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_avx2, 10), + SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_avx2, 10), + SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_avx2, 10), + SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_avx2, 10), + SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_avx2, 10), + SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_avx2, 10), + SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_avx2, 12), + SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_avx2, 12), + SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_avx2, 12), + SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_avx2, 12), + SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_avx2, 12), + SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_avx2, 12), + SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_avx2, 12), + SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_avx2, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH +}; +INSTANTIATE_TEST_SUITE_P(AVX2, SADSkipx4Test, + ::testing::ValuesIn(skip_x4d_avx2_tests)); + #endif // HAVE_AVX2 #if HAVE_AVX512 diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index 5e60792556..4b94c31f15 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -23,7 +23,7 @@ () /* Encoder forward decls */ struct macroblock; -struct vp9_variance_vtable; +struct vp9_sad_table; struct search_site_config; struct mv; union int_mv; @@ -171,7 +171,7 @@ () # # Motion search # -add_proto qw/int vp9_diamond_search_sad/, "const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv"; +add_proto qw/int vp9_diamond_search_sad/, "const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_sad_table *sad_fn_ptr, const struct mv *center_mv"; specialize qw/vp9_diamond_search_sad avx neon/; # diff --git a/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c b/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c index 15334b413b..255e6fbc4a 100644 --- a/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c +++ b/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c @@ -49,7 +49,7 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x, const search_site_config *cfg, MV *ref_mv, uint32_t start_mv_sad, MV *best_mv, int search_param, int sad_per_bit, int *num00, - const vp9_variance_fn_ptr_t *fn_ptr, + const vp9_sad_fn_ptr_t *sad_fn_ptr, const MV *center_mv) { static const uint32_t data[4] = { 0, 1, 2, 3 }; const uint32x4_t v_idx_d = vld1q_u32((const uint32_t *)data); @@ -188,8 +188,8 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x, #endif } - fn_ptr->sdx4df(what, what_stride, (const uint8_t **)&v_blocka[0], - in_what_stride, (uint32_t *)&v_sad_d); + sad_fn_ptr->sdx4df(what, what_stride, (const uint8_t **)&v_blocka[0], + in_what_stride, (uint32_t *)&v_sad_d); // Look up the component cost of the residual motion vector { diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 4cec02eb93..a0b1f2c7c6 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -1561,13 +1561,15 @@ void vp9_set_rc_buffer_sizes(VP9_COMP *cpi) { } #if CONFIG_VP9_HIGHBITDEPTH -#define HIGHBD_BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF) \ - cpi->fn_ptr[BT].sdf = SDF; \ - cpi->fn_ptr[BT].sdaf = SDAF; \ - cpi->fn_ptr[BT].vf = VF; \ - cpi->fn_ptr[BT].svf = SVF; \ - cpi->fn_ptr[BT].svaf = SVAF; \ - cpi->fn_ptr[BT].sdx4df = SDX4DF; +#define HIGHBD_BFP(BT, SDF, SDSF, SDAF, VF, SVF, SVAF, SDX4DF, SDSX4DF) \ + cpi->fn_ptr[BT].sdf = SDF; \ + cpi->fn_ptr[BT].sdsf = SDSF; \ + cpi->fn_ptr[BT].sdaf = SDAF; \ + cpi->fn_ptr[BT].vf = VF; \ + cpi->fn_ptr[BT].svf = SVF; \ + cpi->fn_ptr[BT].svaf = SVAF; \ + cpi->fn_ptr[BT].sdx4df = SDX4DF; \ + cpi->fn_ptr[BT].sdsx4df = SDSX4DF; #define MAKE_BFP_SAD_WRAPPER(fnname) \ static unsigned int fnname##_bits8(const uint8_t *src_ptr, \ @@ -1627,284 +1629,361 @@ void vp9_set_rc_buffer_sizes(VP9_COMP *cpi) { } MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad32x16) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_32x16) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad32x16_avg) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad32x16x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_32x16x4d) + MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad16x32) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_16x32) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad16x32_avg) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad16x32x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_16x32x4d) + MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad64x32) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_64x32) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad64x32_avg) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad64x32x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_64x32x4d) + MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad32x64) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_32x64) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad32x64_avg) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad32x64x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_32x64x4d) + MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad32x32) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_32x32) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad32x32_avg) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad32x32x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_32x32x4d) + MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad64x64) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_64x64) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad64x64_avg) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad64x64x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_64x64x4d) + MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad16x16) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_16x16) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad16x16_avg) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad16x16x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_16x16x4d) + MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad16x8) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_16x8) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad16x8_avg) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad16x8x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_16x8x4d) + MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad8x16) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_8x16) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad8x16_avg) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad8x16x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_8x16x4d) + MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad8x8) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_8x8) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad8x8_avg) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad8x8x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_8x8x4d) + MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad8x4) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_8x4) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad8x4_avg) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad8x4x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_8x4x4d) + MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad4x8) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_4x8) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad4x8_avg) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad4x8x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_4x8x4d) + MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad4x4) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_4x4) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad4x4_avg) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad4x4x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_4x4x4d) static void highbd_set_var_fns(VP9_COMP *const cpi) { VP9_COMMON *const cm = &cpi->common; if (cm->use_highbitdepth) { switch (cm->bit_depth) { case VPX_BITS_8: - HIGHBD_BFP(BLOCK_32X16, vpx_highbd_sad32x16_bits8, - vpx_highbd_sad32x16_avg_bits8, vpx_highbd_8_variance32x16, - vpx_highbd_8_sub_pixel_variance32x16, - vpx_highbd_8_sub_pixel_avg_variance32x16, - vpx_highbd_sad32x16x4d_bits8) - - HIGHBD_BFP(BLOCK_16X32, vpx_highbd_sad16x32_bits8, - vpx_highbd_sad16x32_avg_bits8, vpx_highbd_8_variance16x32, - vpx_highbd_8_sub_pixel_variance16x32, - vpx_highbd_8_sub_pixel_avg_variance16x32, - vpx_highbd_sad16x32x4d_bits8) - - HIGHBD_BFP(BLOCK_64X32, vpx_highbd_sad64x32_bits8, - vpx_highbd_sad64x32_avg_bits8, vpx_highbd_8_variance64x32, - vpx_highbd_8_sub_pixel_variance64x32, - vpx_highbd_8_sub_pixel_avg_variance64x32, - vpx_highbd_sad64x32x4d_bits8) - - HIGHBD_BFP(BLOCK_32X64, vpx_highbd_sad32x64_bits8, - vpx_highbd_sad32x64_avg_bits8, vpx_highbd_8_variance32x64, - vpx_highbd_8_sub_pixel_variance32x64, - vpx_highbd_8_sub_pixel_avg_variance32x64, - vpx_highbd_sad32x64x4d_bits8) - - HIGHBD_BFP(BLOCK_32X32, vpx_highbd_sad32x32_bits8, - vpx_highbd_sad32x32_avg_bits8, vpx_highbd_8_variance32x32, - vpx_highbd_8_sub_pixel_variance32x32, - vpx_highbd_8_sub_pixel_avg_variance32x32, - vpx_highbd_sad32x32x4d_bits8) - - HIGHBD_BFP(BLOCK_64X64, vpx_highbd_sad64x64_bits8, - vpx_highbd_sad64x64_avg_bits8, vpx_highbd_8_variance64x64, - vpx_highbd_8_sub_pixel_variance64x64, - vpx_highbd_8_sub_pixel_avg_variance64x64, - vpx_highbd_sad64x64x4d_bits8) - - HIGHBD_BFP(BLOCK_16X16, vpx_highbd_sad16x16_bits8, - vpx_highbd_sad16x16_avg_bits8, vpx_highbd_8_variance16x16, - vpx_highbd_8_sub_pixel_variance16x16, - vpx_highbd_8_sub_pixel_avg_variance16x16, - vpx_highbd_sad16x16x4d_bits8) - - HIGHBD_BFP(BLOCK_16X8, vpx_highbd_sad16x8_bits8, - vpx_highbd_sad16x8_avg_bits8, vpx_highbd_8_variance16x8, - vpx_highbd_8_sub_pixel_variance16x8, - vpx_highbd_8_sub_pixel_avg_variance16x8, - vpx_highbd_sad16x8x4d_bits8) - - HIGHBD_BFP(BLOCK_8X16, vpx_highbd_sad8x16_bits8, - vpx_highbd_sad8x16_avg_bits8, vpx_highbd_8_variance8x16, - vpx_highbd_8_sub_pixel_variance8x16, - vpx_highbd_8_sub_pixel_avg_variance8x16, - vpx_highbd_sad8x16x4d_bits8) + HIGHBD_BFP( + BLOCK_32X16, vpx_highbd_sad32x16_bits8, + vpx_highbd_sad_skip_32x16_bits8, vpx_highbd_sad32x16_avg_bits8, + vpx_highbd_8_variance32x16, vpx_highbd_8_sub_pixel_variance32x16, + vpx_highbd_8_sub_pixel_avg_variance32x16, + vpx_highbd_sad32x16x4d_bits8, vpx_highbd_sad_skip_32x16x4d_bits8) + + HIGHBD_BFP( + BLOCK_16X32, vpx_highbd_sad16x32_bits8, + vpx_highbd_sad_skip_16x32_bits8, vpx_highbd_sad16x32_avg_bits8, + vpx_highbd_8_variance16x32, vpx_highbd_8_sub_pixel_variance16x32, + vpx_highbd_8_sub_pixel_avg_variance16x32, + vpx_highbd_sad16x32x4d_bits8, vpx_highbd_sad_skip_16x32x4d_bits8) + + HIGHBD_BFP( + BLOCK_64X32, vpx_highbd_sad64x32_bits8, + vpx_highbd_sad_skip_64x32_bits8, vpx_highbd_sad64x32_avg_bits8, + vpx_highbd_8_variance64x32, vpx_highbd_8_sub_pixel_variance64x32, + vpx_highbd_8_sub_pixel_avg_variance64x32, + vpx_highbd_sad64x32x4d_bits8, vpx_highbd_sad_skip_64x32x4d_bits8) + + HIGHBD_BFP( + BLOCK_32X64, vpx_highbd_sad32x64_bits8, + vpx_highbd_sad_skip_32x64_bits8, vpx_highbd_sad32x64_avg_bits8, + vpx_highbd_8_variance32x64, vpx_highbd_8_sub_pixel_variance32x64, + vpx_highbd_8_sub_pixel_avg_variance32x64, + vpx_highbd_sad32x64x4d_bits8, vpx_highbd_sad_skip_32x64x4d_bits8) + + HIGHBD_BFP( + BLOCK_32X32, vpx_highbd_sad32x32_bits8, + vpx_highbd_sad_skip_32x32_bits8, vpx_highbd_sad32x32_avg_bits8, + vpx_highbd_8_variance32x32, vpx_highbd_8_sub_pixel_variance32x32, + vpx_highbd_8_sub_pixel_avg_variance32x32, + vpx_highbd_sad32x32x4d_bits8, vpx_highbd_sad_skip_32x32x4d_bits8) HIGHBD_BFP( - BLOCK_8X8, vpx_highbd_sad8x8_bits8, vpx_highbd_sad8x8_avg_bits8, - vpx_highbd_8_variance8x8, vpx_highbd_8_sub_pixel_variance8x8, - vpx_highbd_8_sub_pixel_avg_variance8x8, vpx_highbd_sad8x8x4d_bits8) + BLOCK_64X64, vpx_highbd_sad64x64_bits8, + vpx_highbd_sad_skip_64x64_bits8, vpx_highbd_sad64x64_avg_bits8, + vpx_highbd_8_variance64x64, vpx_highbd_8_sub_pixel_variance64x64, + vpx_highbd_8_sub_pixel_avg_variance64x64, + vpx_highbd_sad64x64x4d_bits8, vpx_highbd_sad_skip_64x64x4d_bits8) HIGHBD_BFP( - BLOCK_8X4, vpx_highbd_sad8x4_bits8, vpx_highbd_sad8x4_avg_bits8, - vpx_highbd_8_variance8x4, vpx_highbd_8_sub_pixel_variance8x4, - vpx_highbd_8_sub_pixel_avg_variance8x4, vpx_highbd_sad8x4x4d_bits8) + BLOCK_16X16, vpx_highbd_sad16x16_bits8, + vpx_highbd_sad_skip_16x16_bits8, vpx_highbd_sad16x16_avg_bits8, + vpx_highbd_8_variance16x16, vpx_highbd_8_sub_pixel_variance16x16, + vpx_highbd_8_sub_pixel_avg_variance16x16, + vpx_highbd_sad16x16x4d_bits8, vpx_highbd_sad_skip_16x16x4d_bits8) HIGHBD_BFP( - BLOCK_4X8, vpx_highbd_sad4x8_bits8, vpx_highbd_sad4x8_avg_bits8, - vpx_highbd_8_variance4x8, vpx_highbd_8_sub_pixel_variance4x8, - vpx_highbd_8_sub_pixel_avg_variance4x8, vpx_highbd_sad4x8x4d_bits8) + BLOCK_16X8, vpx_highbd_sad16x8_bits8, + vpx_highbd_sad_skip_16x8_bits8, vpx_highbd_sad16x8_avg_bits8, + vpx_highbd_8_variance16x8, vpx_highbd_8_sub_pixel_variance16x8, + vpx_highbd_8_sub_pixel_avg_variance16x8, + vpx_highbd_sad16x8x4d_bits8, vpx_highbd_sad_skip_16x8x4d_bits8) HIGHBD_BFP( - BLOCK_4X4, vpx_highbd_sad4x4_bits8, vpx_highbd_sad4x4_avg_bits8, - vpx_highbd_8_variance4x4, vpx_highbd_8_sub_pixel_variance4x4, - vpx_highbd_8_sub_pixel_avg_variance4x4, vpx_highbd_sad4x4x4d_bits8) + BLOCK_8X16, vpx_highbd_sad8x16_bits8, + vpx_highbd_sad_skip_8x16_bits8, vpx_highbd_sad8x16_avg_bits8, + vpx_highbd_8_variance8x16, vpx_highbd_8_sub_pixel_variance8x16, + vpx_highbd_8_sub_pixel_avg_variance8x16, + vpx_highbd_sad8x16x4d_bits8, vpx_highbd_sad_skip_8x16x4d_bits8) + + HIGHBD_BFP(BLOCK_8X8, vpx_highbd_sad8x8_bits8, + vpx_highbd_sad_skip_8x8_bits8, vpx_highbd_sad8x8_avg_bits8, + vpx_highbd_8_variance8x8, vpx_highbd_8_sub_pixel_variance8x8, + vpx_highbd_8_sub_pixel_avg_variance8x8, + vpx_highbd_sad8x8x4d_bits8, vpx_highbd_sad_skip_8x8x4d_bits8) + + HIGHBD_BFP(BLOCK_8X4, vpx_highbd_sad8x4_bits8, + vpx_highbd_sad_skip_8x4_bits8, vpx_highbd_sad8x4_avg_bits8, + vpx_highbd_8_variance8x4, vpx_highbd_8_sub_pixel_variance8x4, + vpx_highbd_8_sub_pixel_avg_variance8x4, + vpx_highbd_sad8x4x4d_bits8, vpx_highbd_sad_skip_8x4x4d_bits8) + + HIGHBD_BFP(BLOCK_4X8, vpx_highbd_sad4x8_bits8, + vpx_highbd_sad_skip_4x8_bits8, vpx_highbd_sad4x8_avg_bits8, + vpx_highbd_8_variance4x8, vpx_highbd_8_sub_pixel_variance4x8, + vpx_highbd_8_sub_pixel_avg_variance4x8, + vpx_highbd_sad4x8x4d_bits8, vpx_highbd_sad_skip_4x8x4d_bits8) + + HIGHBD_BFP(BLOCK_4X4, vpx_highbd_sad4x4_bits8, + vpx_highbd_sad_skip_4x4_bits8, vpx_highbd_sad4x4_avg_bits8, + vpx_highbd_8_variance4x4, vpx_highbd_8_sub_pixel_variance4x4, + vpx_highbd_8_sub_pixel_avg_variance4x4, + vpx_highbd_sad4x4x4d_bits8, vpx_highbd_sad_skip_4x4x4d_bits8) break; case VPX_BITS_10: - HIGHBD_BFP(BLOCK_32X16, vpx_highbd_sad32x16_bits10, - vpx_highbd_sad32x16_avg_bits10, vpx_highbd_10_variance32x16, - vpx_highbd_10_sub_pixel_variance32x16, - vpx_highbd_10_sub_pixel_avg_variance32x16, - vpx_highbd_sad32x16x4d_bits10) - - HIGHBD_BFP(BLOCK_16X32, vpx_highbd_sad16x32_bits10, - vpx_highbd_sad16x32_avg_bits10, vpx_highbd_10_variance16x32, - vpx_highbd_10_sub_pixel_variance16x32, - vpx_highbd_10_sub_pixel_avg_variance16x32, - vpx_highbd_sad16x32x4d_bits10) - - HIGHBD_BFP(BLOCK_64X32, vpx_highbd_sad64x32_bits10, - vpx_highbd_sad64x32_avg_bits10, vpx_highbd_10_variance64x32, - vpx_highbd_10_sub_pixel_variance64x32, - vpx_highbd_10_sub_pixel_avg_variance64x32, - vpx_highbd_sad64x32x4d_bits10) - - HIGHBD_BFP(BLOCK_32X64, vpx_highbd_sad32x64_bits10, - vpx_highbd_sad32x64_avg_bits10, vpx_highbd_10_variance32x64, - vpx_highbd_10_sub_pixel_variance32x64, - vpx_highbd_10_sub_pixel_avg_variance32x64, - vpx_highbd_sad32x64x4d_bits10) - - HIGHBD_BFP(BLOCK_32X32, vpx_highbd_sad32x32_bits10, - vpx_highbd_sad32x32_avg_bits10, vpx_highbd_10_variance32x32, - vpx_highbd_10_sub_pixel_variance32x32, - vpx_highbd_10_sub_pixel_avg_variance32x32, - vpx_highbd_sad32x32x4d_bits10) - - HIGHBD_BFP(BLOCK_64X64, vpx_highbd_sad64x64_bits10, - vpx_highbd_sad64x64_avg_bits10, vpx_highbd_10_variance64x64, - vpx_highbd_10_sub_pixel_variance64x64, - vpx_highbd_10_sub_pixel_avg_variance64x64, - vpx_highbd_sad64x64x4d_bits10) - - HIGHBD_BFP(BLOCK_16X16, vpx_highbd_sad16x16_bits10, - vpx_highbd_sad16x16_avg_bits10, vpx_highbd_10_variance16x16, - vpx_highbd_10_sub_pixel_variance16x16, - vpx_highbd_10_sub_pixel_avg_variance16x16, - vpx_highbd_sad16x16x4d_bits10) - - HIGHBD_BFP(BLOCK_16X8, vpx_highbd_sad16x8_bits10, - vpx_highbd_sad16x8_avg_bits10, vpx_highbd_10_variance16x8, - vpx_highbd_10_sub_pixel_variance16x8, - vpx_highbd_10_sub_pixel_avg_variance16x8, - vpx_highbd_sad16x8x4d_bits10) - - HIGHBD_BFP(BLOCK_8X16, vpx_highbd_sad8x16_bits10, - vpx_highbd_sad8x16_avg_bits10, vpx_highbd_10_variance8x16, - vpx_highbd_10_sub_pixel_variance8x16, - vpx_highbd_10_sub_pixel_avg_variance8x16, - vpx_highbd_sad8x16x4d_bits10) - - HIGHBD_BFP(BLOCK_8X8, vpx_highbd_sad8x8_bits10, - vpx_highbd_sad8x8_avg_bits10, vpx_highbd_10_variance8x8, - vpx_highbd_10_sub_pixel_variance8x8, - vpx_highbd_10_sub_pixel_avg_variance8x8, - vpx_highbd_sad8x8x4d_bits10) - - HIGHBD_BFP(BLOCK_8X4, vpx_highbd_sad8x4_bits10, - vpx_highbd_sad8x4_avg_bits10, vpx_highbd_10_variance8x4, - vpx_highbd_10_sub_pixel_variance8x4, - vpx_highbd_10_sub_pixel_avg_variance8x4, - vpx_highbd_sad8x4x4d_bits10) - - HIGHBD_BFP(BLOCK_4X8, vpx_highbd_sad4x8_bits10, - vpx_highbd_sad4x8_avg_bits10, vpx_highbd_10_variance4x8, - vpx_highbd_10_sub_pixel_variance4x8, - vpx_highbd_10_sub_pixel_avg_variance4x8, - vpx_highbd_sad4x8x4d_bits10) - - HIGHBD_BFP(BLOCK_4X4, vpx_highbd_sad4x4_bits10, - vpx_highbd_sad4x4_avg_bits10, vpx_highbd_10_variance4x4, - vpx_highbd_10_sub_pixel_variance4x4, - vpx_highbd_10_sub_pixel_avg_variance4x4, - vpx_highbd_sad4x4x4d_bits10) + HIGHBD_BFP( + BLOCK_32X16, vpx_highbd_sad32x16_bits10, + vpx_highbd_sad_skip_32x16_bits10, vpx_highbd_sad32x16_avg_bits10, + vpx_highbd_10_variance32x16, vpx_highbd_10_sub_pixel_variance32x16, + vpx_highbd_10_sub_pixel_avg_variance32x16, + vpx_highbd_sad32x16x4d_bits10, vpx_highbd_sad_skip_32x16x4d_bits10) + + HIGHBD_BFP( + BLOCK_16X32, vpx_highbd_sad16x32_bits10, + vpx_highbd_sad_skip_16x32_bits10, vpx_highbd_sad16x32_avg_bits10, + vpx_highbd_10_variance16x32, vpx_highbd_10_sub_pixel_variance16x32, + vpx_highbd_10_sub_pixel_avg_variance16x32, + vpx_highbd_sad16x32x4d_bits10, vpx_highbd_sad_skip_16x32x4d_bits10) + + HIGHBD_BFP( + BLOCK_64X32, vpx_highbd_sad64x32_bits10, + vpx_highbd_sad_skip_64x32_bits10, vpx_highbd_sad64x32_avg_bits10, + vpx_highbd_10_variance64x32, vpx_highbd_10_sub_pixel_variance64x32, + vpx_highbd_10_sub_pixel_avg_variance64x32, + vpx_highbd_sad64x32x4d_bits10, vpx_highbd_sad_skip_64x32x4d_bits10) + + HIGHBD_BFP( + BLOCK_32X64, vpx_highbd_sad32x64_bits10, + vpx_highbd_sad_skip_32x64_bits10, vpx_highbd_sad32x64_avg_bits10, + vpx_highbd_10_variance32x64, vpx_highbd_10_sub_pixel_variance32x64, + vpx_highbd_10_sub_pixel_avg_variance32x64, + vpx_highbd_sad32x64x4d_bits10, vpx_highbd_sad_skip_32x64x4d_bits10) + + HIGHBD_BFP( + BLOCK_32X32, vpx_highbd_sad32x32_bits10, + vpx_highbd_sad_skip_32x32_bits10, vpx_highbd_sad32x32_avg_bits10, + vpx_highbd_10_variance32x32, vpx_highbd_10_sub_pixel_variance32x32, + vpx_highbd_10_sub_pixel_avg_variance32x32, + vpx_highbd_sad32x32x4d_bits10, vpx_highbd_sad_skip_32x32x4d_bits10) + + HIGHBD_BFP( + BLOCK_64X64, vpx_highbd_sad64x64_bits10, + vpx_highbd_sad_skip_64x64_bits10, vpx_highbd_sad64x64_avg_bits10, + vpx_highbd_10_variance64x64, vpx_highbd_10_sub_pixel_variance64x64, + vpx_highbd_10_sub_pixel_avg_variance64x64, + vpx_highbd_sad64x64x4d_bits10, vpx_highbd_sad_skip_64x64x4d_bits10) + + HIGHBD_BFP( + BLOCK_16X16, vpx_highbd_sad16x16_bits10, + vpx_highbd_sad_skip_16x16_bits10, vpx_highbd_sad16x16_avg_bits10, + vpx_highbd_10_variance16x16, vpx_highbd_10_sub_pixel_variance16x16, + vpx_highbd_10_sub_pixel_avg_variance16x16, + vpx_highbd_sad16x16x4d_bits10, vpx_highbd_sad_skip_16x16x4d_bits10) + + HIGHBD_BFP( + BLOCK_16X8, vpx_highbd_sad16x8_bits10, + vpx_highbd_sad_skip_16x8_bits10, vpx_highbd_sad16x8_avg_bits10, + vpx_highbd_10_variance16x8, vpx_highbd_10_sub_pixel_variance16x8, + vpx_highbd_10_sub_pixel_avg_variance16x8, + vpx_highbd_sad16x8x4d_bits10, vpx_highbd_sad_skip_16x8x4d_bits10) + + HIGHBD_BFP( + BLOCK_8X16, vpx_highbd_sad8x16_bits10, + vpx_highbd_sad_skip_8x16_bits10, vpx_highbd_sad8x16_avg_bits10, + vpx_highbd_10_variance8x16, vpx_highbd_10_sub_pixel_variance8x16, + vpx_highbd_10_sub_pixel_avg_variance8x16, + vpx_highbd_sad8x16x4d_bits10, vpx_highbd_sad_skip_8x16x4d_bits10) + + HIGHBD_BFP( + BLOCK_8X8, vpx_highbd_sad8x8_bits10, vpx_highbd_sad_skip_8x8_bits10, + vpx_highbd_sad8x8_avg_bits10, vpx_highbd_10_variance8x8, + vpx_highbd_10_sub_pixel_variance8x8, + vpx_highbd_10_sub_pixel_avg_variance8x8, + vpx_highbd_sad8x8x4d_bits10, vpx_highbd_sad_skip_8x8x4d_bits10) + + HIGHBD_BFP( + BLOCK_8X4, vpx_highbd_sad8x4_bits10, vpx_highbd_sad_skip_8x4_bits10, + vpx_highbd_sad8x4_avg_bits10, vpx_highbd_10_variance8x4, + vpx_highbd_10_sub_pixel_variance8x4, + vpx_highbd_10_sub_pixel_avg_variance8x4, + vpx_highbd_sad8x4x4d_bits10, vpx_highbd_sad_skip_8x4x4d_bits10) + + HIGHBD_BFP( + BLOCK_4X8, vpx_highbd_sad4x8_bits10, vpx_highbd_sad_skip_4x8_bits10, + vpx_highbd_sad4x8_avg_bits10, vpx_highbd_10_variance4x8, + vpx_highbd_10_sub_pixel_variance4x8, + vpx_highbd_10_sub_pixel_avg_variance4x8, + vpx_highbd_sad4x8x4d_bits10, vpx_highbd_sad_skip_4x8x4d_bits10) + + HIGHBD_BFP( + BLOCK_4X4, vpx_highbd_sad4x4_bits10, vpx_highbd_sad_skip_4x4_bits10, + vpx_highbd_sad4x4_avg_bits10, vpx_highbd_10_variance4x4, + vpx_highbd_10_sub_pixel_variance4x4, + vpx_highbd_10_sub_pixel_avg_variance4x4, + vpx_highbd_sad4x4x4d_bits10, vpx_highbd_sad_skip_4x4x4d_bits10) break; default: assert(cm->bit_depth == VPX_BITS_12); - HIGHBD_BFP(BLOCK_32X16, vpx_highbd_sad32x16_bits12, - vpx_highbd_sad32x16_avg_bits12, vpx_highbd_12_variance32x16, - vpx_highbd_12_sub_pixel_variance32x16, - vpx_highbd_12_sub_pixel_avg_variance32x16, - vpx_highbd_sad32x16x4d_bits12) - - HIGHBD_BFP(BLOCK_16X32, vpx_highbd_sad16x32_bits12, - vpx_highbd_sad16x32_avg_bits12, vpx_highbd_12_variance16x32, - vpx_highbd_12_sub_pixel_variance16x32, - vpx_highbd_12_sub_pixel_avg_variance16x32, - vpx_highbd_sad16x32x4d_bits12) - - HIGHBD_BFP(BLOCK_64X32, vpx_highbd_sad64x32_bits12, - vpx_highbd_sad64x32_avg_bits12, vpx_highbd_12_variance64x32, - vpx_highbd_12_sub_pixel_variance64x32, - vpx_highbd_12_sub_pixel_avg_variance64x32, - vpx_highbd_sad64x32x4d_bits12) - - HIGHBD_BFP(BLOCK_32X64, vpx_highbd_sad32x64_bits12, - vpx_highbd_sad32x64_avg_bits12, vpx_highbd_12_variance32x64, - vpx_highbd_12_sub_pixel_variance32x64, - vpx_highbd_12_sub_pixel_avg_variance32x64, - vpx_highbd_sad32x64x4d_bits12) - - HIGHBD_BFP(BLOCK_32X32, vpx_highbd_sad32x32_bits12, - vpx_highbd_sad32x32_avg_bits12, vpx_highbd_12_variance32x32, - vpx_highbd_12_sub_pixel_variance32x32, - vpx_highbd_12_sub_pixel_avg_variance32x32, - vpx_highbd_sad32x32x4d_bits12) - - HIGHBD_BFP(BLOCK_64X64, vpx_highbd_sad64x64_bits12, - vpx_highbd_sad64x64_avg_bits12, vpx_highbd_12_variance64x64, - vpx_highbd_12_sub_pixel_variance64x64, - vpx_highbd_12_sub_pixel_avg_variance64x64, - vpx_highbd_sad64x64x4d_bits12) - - HIGHBD_BFP(BLOCK_16X16, vpx_highbd_sad16x16_bits12, - vpx_highbd_sad16x16_avg_bits12, vpx_highbd_12_variance16x16, - vpx_highbd_12_sub_pixel_variance16x16, - vpx_highbd_12_sub_pixel_avg_variance16x16, - vpx_highbd_sad16x16x4d_bits12) - - HIGHBD_BFP(BLOCK_16X8, vpx_highbd_sad16x8_bits12, - vpx_highbd_sad16x8_avg_bits12, vpx_highbd_12_variance16x8, - vpx_highbd_12_sub_pixel_variance16x8, - vpx_highbd_12_sub_pixel_avg_variance16x8, - vpx_highbd_sad16x8x4d_bits12) - - HIGHBD_BFP(BLOCK_8X16, vpx_highbd_sad8x16_bits12, - vpx_highbd_sad8x16_avg_bits12, vpx_highbd_12_variance8x16, - vpx_highbd_12_sub_pixel_variance8x16, - vpx_highbd_12_sub_pixel_avg_variance8x16, - vpx_highbd_sad8x16x4d_bits12) - - HIGHBD_BFP(BLOCK_8X8, vpx_highbd_sad8x8_bits12, - vpx_highbd_sad8x8_avg_bits12, vpx_highbd_12_variance8x8, - vpx_highbd_12_sub_pixel_variance8x8, - vpx_highbd_12_sub_pixel_avg_variance8x8, - vpx_highbd_sad8x8x4d_bits12) - - HIGHBD_BFP(BLOCK_8X4, vpx_highbd_sad8x4_bits12, - vpx_highbd_sad8x4_avg_bits12, vpx_highbd_12_variance8x4, - vpx_highbd_12_sub_pixel_variance8x4, - vpx_highbd_12_sub_pixel_avg_variance8x4, - vpx_highbd_sad8x4x4d_bits12) - - HIGHBD_BFP(BLOCK_4X8, vpx_highbd_sad4x8_bits12, - vpx_highbd_sad4x8_avg_bits12, vpx_highbd_12_variance4x8, - vpx_highbd_12_sub_pixel_variance4x8, - vpx_highbd_12_sub_pixel_avg_variance4x8, - vpx_highbd_sad4x8x4d_bits12) - - HIGHBD_BFP(BLOCK_4X4, vpx_highbd_sad4x4_bits12, - vpx_highbd_sad4x4_avg_bits12, vpx_highbd_12_variance4x4, - vpx_highbd_12_sub_pixel_variance4x4, - vpx_highbd_12_sub_pixel_avg_variance4x4, - vpx_highbd_sad4x4x4d_bits12) + HIGHBD_BFP( + BLOCK_32X16, vpx_highbd_sad32x16_bits12, + vpx_highbd_sad_skip_32x16_bits12, vpx_highbd_sad32x16_avg_bits12, + vpx_highbd_12_variance32x16, vpx_highbd_12_sub_pixel_variance32x16, + vpx_highbd_12_sub_pixel_avg_variance32x16, + vpx_highbd_sad32x16x4d_bits12, vpx_highbd_sad_skip_32x16x4d_bits12) + + HIGHBD_BFP( + BLOCK_16X32, vpx_highbd_sad16x32_bits12, + vpx_highbd_sad_skip_16x32_bits12, vpx_highbd_sad16x32_avg_bits12, + vpx_highbd_12_variance16x32, vpx_highbd_12_sub_pixel_variance16x32, + vpx_highbd_12_sub_pixel_avg_variance16x32, + vpx_highbd_sad16x32x4d_bits12, vpx_highbd_sad_skip_16x32x4d_bits12) + + HIGHBD_BFP( + BLOCK_64X32, vpx_highbd_sad64x32_bits12, + vpx_highbd_sad_skip_64x32_bits12, vpx_highbd_sad64x32_avg_bits12, + vpx_highbd_12_variance64x32, vpx_highbd_12_sub_pixel_variance64x32, + vpx_highbd_12_sub_pixel_avg_variance64x32, + vpx_highbd_sad64x32x4d_bits12, vpx_highbd_sad_skip_64x32x4d_bits12) + + HIGHBD_BFP( + BLOCK_32X64, vpx_highbd_sad32x64_bits12, + vpx_highbd_sad_skip_32x64_bits12, vpx_highbd_sad32x64_avg_bits12, + vpx_highbd_12_variance32x64, vpx_highbd_12_sub_pixel_variance32x64, + vpx_highbd_12_sub_pixel_avg_variance32x64, + vpx_highbd_sad32x64x4d_bits12, vpx_highbd_sad_skip_32x64x4d_bits12) + + HIGHBD_BFP( + BLOCK_32X32, vpx_highbd_sad32x32_bits12, + vpx_highbd_sad_skip_32x32_bits12, vpx_highbd_sad32x32_avg_bits12, + vpx_highbd_12_variance32x32, vpx_highbd_12_sub_pixel_variance32x32, + vpx_highbd_12_sub_pixel_avg_variance32x32, + vpx_highbd_sad32x32x4d_bits12, vpx_highbd_sad_skip_32x32x4d_bits12) + + HIGHBD_BFP( + BLOCK_64X64, vpx_highbd_sad64x64_bits12, + vpx_highbd_sad_skip_64x64_bits12, vpx_highbd_sad64x64_avg_bits12, + vpx_highbd_12_variance64x64, vpx_highbd_12_sub_pixel_variance64x64, + vpx_highbd_12_sub_pixel_avg_variance64x64, + vpx_highbd_sad64x64x4d_bits12, vpx_highbd_sad_skip_64x64x4d_bits12) + + HIGHBD_BFP( + BLOCK_16X16, vpx_highbd_sad16x16_bits12, + vpx_highbd_sad_skip_16x16_bits12, vpx_highbd_sad16x16_avg_bits12, + vpx_highbd_12_variance16x16, vpx_highbd_12_sub_pixel_variance16x16, + vpx_highbd_12_sub_pixel_avg_variance16x16, + vpx_highbd_sad16x16x4d_bits12, vpx_highbd_sad_skip_16x16x4d_bits12) + + HIGHBD_BFP( + BLOCK_16X8, vpx_highbd_sad16x8_bits12, + vpx_highbd_sad_skip_16x8_bits12, vpx_highbd_sad16x8_avg_bits12, + vpx_highbd_12_variance16x8, vpx_highbd_12_sub_pixel_variance16x8, + vpx_highbd_12_sub_pixel_avg_variance16x8, + vpx_highbd_sad16x8x4d_bits12, vpx_highbd_sad_skip_16x8x4d_bits12) + + HIGHBD_BFP( + BLOCK_8X16, vpx_highbd_sad8x16_bits12, + vpx_highbd_sad_skip_8x16_bits12, vpx_highbd_sad8x16_avg_bits12, + vpx_highbd_12_variance8x16, vpx_highbd_12_sub_pixel_variance8x16, + vpx_highbd_12_sub_pixel_avg_variance8x16, + vpx_highbd_sad8x16x4d_bits12, vpx_highbd_sad_skip_8x16x4d_bits12) + + HIGHBD_BFP( + BLOCK_8X8, vpx_highbd_sad8x8_bits12, vpx_highbd_sad_skip_8x8_bits12, + vpx_highbd_sad8x8_avg_bits12, vpx_highbd_12_variance8x8, + vpx_highbd_12_sub_pixel_variance8x8, + vpx_highbd_12_sub_pixel_avg_variance8x8, + vpx_highbd_sad8x8x4d_bits12, vpx_highbd_sad_skip_8x8x4d_bits12) + + HIGHBD_BFP( + BLOCK_8X4, vpx_highbd_sad8x4_bits12, vpx_highbd_sad_skip_8x4_bits12, + vpx_highbd_sad8x4_avg_bits12, vpx_highbd_12_variance8x4, + vpx_highbd_12_sub_pixel_variance8x4, + vpx_highbd_12_sub_pixel_avg_variance8x4, + vpx_highbd_sad8x4x4d_bits12, vpx_highbd_sad_skip_8x4x4d_bits12) + + HIGHBD_BFP( + BLOCK_4X8, vpx_highbd_sad4x8_bits12, vpx_highbd_sad_skip_4x8_bits12, + vpx_highbd_sad4x8_avg_bits12, vpx_highbd_12_variance4x8, + vpx_highbd_12_sub_pixel_variance4x8, + vpx_highbd_12_sub_pixel_avg_variance4x8, + vpx_highbd_sad4x8x4d_bits12, vpx_highbd_sad_skip_4x8x4d_bits12) + + HIGHBD_BFP( + BLOCK_4X4, vpx_highbd_sad4x4_bits12, vpx_highbd_sad_skip_4x4_bits12, + vpx_highbd_sad4x4_avg_bits12, vpx_highbd_12_variance4x4, + vpx_highbd_12_sub_pixel_variance4x4, + vpx_highbd_12_sub_pixel_avg_variance4x4, + vpx_highbd_sad4x4x4d_bits12, vpx_highbd_sad_skip_4x4x4d_bits12) break; } } @@ -2550,61 +2629,67 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf, vpx_calloc(cm->MBs, sizeof(cpi->source_diff_var))); cpi->source_var_thresh = 0; cpi->frames_till_next_var_check = 0; -#define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF) \ - cpi->fn_ptr[BT].sdf = SDF; \ - cpi->fn_ptr[BT].sdaf = SDAF; \ - cpi->fn_ptr[BT].vf = VF; \ - cpi->fn_ptr[BT].svf = SVF; \ - cpi->fn_ptr[BT].svaf = SVAF; \ - cpi->fn_ptr[BT].sdx4df = SDX4DF; - - BFP(BLOCK_32X16, vpx_sad32x16, vpx_sad32x16_avg, vpx_variance32x16, - vpx_sub_pixel_variance32x16, vpx_sub_pixel_avg_variance32x16, - vpx_sad32x16x4d) - - BFP(BLOCK_16X32, vpx_sad16x32, vpx_sad16x32_avg, vpx_variance16x32, - vpx_sub_pixel_variance16x32, vpx_sub_pixel_avg_variance16x32, - vpx_sad16x32x4d) - - BFP(BLOCK_64X32, vpx_sad64x32, vpx_sad64x32_avg, vpx_variance64x32, - vpx_sub_pixel_variance64x32, vpx_sub_pixel_avg_variance64x32, - vpx_sad64x32x4d) - - BFP(BLOCK_32X64, vpx_sad32x64, vpx_sad32x64_avg, vpx_variance32x64, - vpx_sub_pixel_variance32x64, vpx_sub_pixel_avg_variance32x64, - vpx_sad32x64x4d) - - BFP(BLOCK_32X32, vpx_sad32x32, vpx_sad32x32_avg, vpx_variance32x32, - vpx_sub_pixel_variance32x32, vpx_sub_pixel_avg_variance32x32, - vpx_sad32x32x4d) - - BFP(BLOCK_64X64, vpx_sad64x64, vpx_sad64x64_avg, vpx_variance64x64, - vpx_sub_pixel_variance64x64, vpx_sub_pixel_avg_variance64x64, - vpx_sad64x64x4d) - - BFP(BLOCK_16X16, vpx_sad16x16, vpx_sad16x16_avg, vpx_variance16x16, - vpx_sub_pixel_variance16x16, vpx_sub_pixel_avg_variance16x16, - vpx_sad16x16x4d) - - BFP(BLOCK_16X8, vpx_sad16x8, vpx_sad16x8_avg, vpx_variance16x8, - vpx_sub_pixel_variance16x8, vpx_sub_pixel_avg_variance16x8, - vpx_sad16x8x4d) - - BFP(BLOCK_8X16, vpx_sad8x16, vpx_sad8x16_avg, vpx_variance8x16, - vpx_sub_pixel_variance8x16, vpx_sub_pixel_avg_variance8x16, - vpx_sad8x16x4d) - - BFP(BLOCK_8X8, vpx_sad8x8, vpx_sad8x8_avg, vpx_variance8x8, - vpx_sub_pixel_variance8x8, vpx_sub_pixel_avg_variance8x8, vpx_sad8x8x4d) - - BFP(BLOCK_8X4, vpx_sad8x4, vpx_sad8x4_avg, vpx_variance8x4, - vpx_sub_pixel_variance8x4, vpx_sub_pixel_avg_variance8x4, vpx_sad8x4x4d) - - BFP(BLOCK_4X8, vpx_sad4x8, vpx_sad4x8_avg, vpx_variance4x8, - vpx_sub_pixel_variance4x8, vpx_sub_pixel_avg_variance4x8, vpx_sad4x8x4d) - - BFP(BLOCK_4X4, vpx_sad4x4, vpx_sad4x4_avg, vpx_variance4x4, - vpx_sub_pixel_variance4x4, vpx_sub_pixel_avg_variance4x4, vpx_sad4x4x4d) +#define BFP(BT, SDF, SDSF, SDAF, VF, SVF, SVAF, SDX4DF, SDSX4DF) \ + cpi->fn_ptr[BT].sdf = SDF; \ + cpi->fn_ptr[BT].sdsf = SDSF; \ + cpi->fn_ptr[BT].sdaf = SDAF; \ + cpi->fn_ptr[BT].vf = VF; \ + cpi->fn_ptr[BT].svf = SVF; \ + cpi->fn_ptr[BT].svaf = SVAF; \ + cpi->fn_ptr[BT].sdx4df = SDX4DF; \ + cpi->fn_ptr[BT].sdsx4df = SDSX4DF; + + BFP(BLOCK_32X16, vpx_sad32x16, vpx_sad_skip_32x16, vpx_sad32x16_avg, + vpx_variance32x16, vpx_sub_pixel_variance32x16, + vpx_sub_pixel_avg_variance32x16, vpx_sad32x16x4d, vpx_sad_skip_32x16x4d) + + BFP(BLOCK_16X32, vpx_sad16x32, vpx_sad_skip_16x32, vpx_sad16x32_avg, + vpx_variance16x32, vpx_sub_pixel_variance16x32, + vpx_sub_pixel_avg_variance16x32, vpx_sad16x32x4d, vpx_sad_skip_16x32x4d) + + BFP(BLOCK_64X32, vpx_sad64x32, vpx_sad_skip_64x32, vpx_sad64x32_avg, + vpx_variance64x32, vpx_sub_pixel_variance64x32, + vpx_sub_pixel_avg_variance64x32, vpx_sad64x32x4d, vpx_sad_skip_64x32x4d) + + BFP(BLOCK_32X64, vpx_sad32x64, vpx_sad_skip_32x64, vpx_sad32x64_avg, + vpx_variance32x64, vpx_sub_pixel_variance32x64, + vpx_sub_pixel_avg_variance32x64, vpx_sad32x64x4d, vpx_sad_skip_32x64x4d) + + BFP(BLOCK_32X32, vpx_sad32x32, vpx_sad_skip_32x32, vpx_sad32x32_avg, + vpx_variance32x32, vpx_sub_pixel_variance32x32, + vpx_sub_pixel_avg_variance32x32, vpx_sad32x32x4d, vpx_sad_skip_32x32x4d) + + BFP(BLOCK_64X64, vpx_sad64x64, vpx_sad_skip_64x64, vpx_sad64x64_avg, + vpx_variance64x64, vpx_sub_pixel_variance64x64, + vpx_sub_pixel_avg_variance64x64, vpx_sad64x64x4d, vpx_sad_skip_64x64x4d) + + BFP(BLOCK_16X16, vpx_sad16x16, vpx_sad_skip_16x16, vpx_sad16x16_avg, + vpx_variance16x16, vpx_sub_pixel_variance16x16, + vpx_sub_pixel_avg_variance16x16, vpx_sad16x16x4d, vpx_sad_skip_16x16x4d) + + BFP(BLOCK_16X8, vpx_sad16x8, vpx_sad_skip_16x8, vpx_sad16x8_avg, + vpx_variance16x8, vpx_sub_pixel_variance16x8, + vpx_sub_pixel_avg_variance16x8, vpx_sad16x8x4d, vpx_sad_skip_16x8x4d) + + BFP(BLOCK_8X16, vpx_sad8x16, vpx_sad_skip_8x16, vpx_sad8x16_avg, + vpx_variance8x16, vpx_sub_pixel_variance8x16, + vpx_sub_pixel_avg_variance8x16, vpx_sad8x16x4d, vpx_sad_skip_8x16x4d) + + BFP(BLOCK_8X8, vpx_sad8x8, vpx_sad_skip_8x8, vpx_sad8x8_avg, vpx_variance8x8, + vpx_sub_pixel_variance8x8, vpx_sub_pixel_avg_variance8x8, vpx_sad8x8x4d, + vpx_sad_skip_8x8x4d) + + BFP(BLOCK_8X4, vpx_sad8x4, vpx_sad_skip_8x4, vpx_sad8x4_avg, vpx_variance8x4, + vpx_sub_pixel_variance8x4, vpx_sub_pixel_avg_variance8x4, vpx_sad8x4x4d, + vpx_sad_skip_8x4x4d) + + BFP(BLOCK_4X8, vpx_sad4x8, vpx_sad_skip_4x8, vpx_sad4x8_avg, vpx_variance4x8, + vpx_sub_pixel_variance4x8, vpx_sub_pixel_avg_variance4x8, vpx_sad4x8x4d, + vpx_sad_skip_4x8x4d) + + BFP(BLOCK_4X4, vpx_sad4x4, vpx_sad_skip_4x4, vpx_sad4x4_avg, vpx_variance4x4, + vpx_sub_pixel_variance4x4, vpx_sub_pixel_avg_variance4x4, vpx_sad4x4x4d, + vpx_sad_skip_4x4x4d) #if CONFIG_VP9_HIGHBITDEPTH highbd_set_var_fns(cpi); diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index 0efa836aca..71d8775ea5 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -437,6 +437,7 @@ static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x, const int new_mv_mode_penalty = NEW_MV_MODE_PENALTY; MV center_mv_full = ref_mv_full; unsigned int start_mv_sad; + vp9_sad_fn_ptr_t sad_fn_ptr; int step_param = 3; int further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param; @@ -462,11 +463,13 @@ static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x, x->mv_limits.row_min, x->mv_limits.row_max); start_mv_sad = get_start_mv_sad(x, &ref_mv_full, ¢er_mv_full, cpi->fn_ptr[bsize].sdf, x->sadperbit16); + sad_fn_ptr.sdf = cpi->fn_ptr[bsize].sdf; + sad_fn_ptr.sdx4df = cpi->fn_ptr[bsize].sdx4df; // Center the initial step/diamond search on best mv. tmp_err = cpi->diamond_search_sad(x, &cpi->ss_cfg, &ref_mv_full, start_mv_sad, &tmp_mv, step_param, x->sadperbit16, &num00, - &v_fn_ptr, ref_mv); + &sad_fn_ptr, ref_mv); if (tmp_err < INT_MAX) tmp_err = vp9_get_mvpred_var(x, &tmp_mv, ref_mv, &v_fn_ptr, 1); if (tmp_err < INT_MAX - new_mv_mode_penalty) tmp_err += new_mv_mode_penalty; @@ -488,7 +491,7 @@ static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x, } else { tmp_err = cpi->diamond_search_sad( x, &cpi->ss_cfg, &ref_mv_full, start_mv_sad, &tmp_mv, step_param + n, - x->sadperbit16, &num00, &v_fn_ptr, ref_mv); + x->sadperbit16, &num00, &sad_fn_ptr, ref_mv); if (tmp_err < INT_MAX) tmp_err = vp9_get_mvpred_var(x, &tmp_mv, ref_mv, &v_fn_ptr, 1); if (tmp_err < INT_MAX - new_mv_mode_penalty) diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c index 4ff685b242..64e9ef0f91 100644 --- a/vp9/encoder/vp9_mcomp.c +++ b/vp9/encoder/vp9_mcomp.c @@ -2055,7 +2055,7 @@ int vp9_prepare_nb_full_mvs(const MotionField *motion_field, int mi_row, int vp9_diamond_search_sad_c(const MACROBLOCK *x, const search_site_config *cfg, MV *ref_mv, uint32_t start_mv_sad, MV *best_mv, int search_param, int sad_per_bit, int *num00, - const vp9_variance_fn_ptr_t *fn_ptr, + const vp9_sad_fn_ptr_t *sad_fn_ptr, const MV *center_mv) { int i, j, step; @@ -2117,8 +2117,8 @@ int vp9_diamond_search_sad_c(const MACROBLOCK *x, const search_site_config *cfg, for (t = 0; t < 4; t++) block_offset[t] = ss_os[i + t] + best_address; - fn_ptr->sdx4df(what, what_stride, block_offset, in_what_stride, - sad_array); + sad_fn_ptr->sdx4df(what, what_stride, block_offset, in_what_stride, + sad_array); for (t = 0; t < 4; t++, i++) { if (sad_array[t] < bestsad) { @@ -2142,7 +2142,7 @@ int vp9_diamond_search_sad_c(const MACROBLOCK *x, const search_site_config *cfg, if (is_mv_in(&x->mv_limits, &this_mv)) { const uint8_t *const check_here = ss_os[i] + best_address; unsigned int thissad = - fn_ptr->sdf(what, what_stride, check_here, in_what_stride); + sad_fn_ptr->sdf(what, what_stride, check_here, in_what_stride); if (thissad < bestsad) { thissad += mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit); @@ -2484,24 +2484,54 @@ int vp9_full_pixel_diamond_new(const VP9_COMP *cpi, MACROBLOCK *x, point as the best match, we will do a final 1-away diamond refining search */ static int full_pixel_diamond(const VP9_COMP *const cpi, - const MACROBLOCK *const x, MV *mvp_full, - int step_param, int sadpb, int further_steps, - int do_refine, int *cost_list, + const MACROBLOCK *const x, BLOCK_SIZE bsize, + MV *mvp_full, int step_param, int sadpb, + int further_steps, int do_refine, + int use_downsampled_sad, int *cost_list, const vp9_variance_fn_ptr_t *fn_ptr, const MV *ref_mv, MV *dst_mv) { MV temp_mv; int thissme, n, num00 = 0; int bestsme; - unsigned int start_mv_sad; + const int src_buf_stride = x->plane[0].src.stride; + const uint8_t *const src_buf = x->plane[0].src.buf; + const MACROBLOCKD *const xd = &x->e_mbd; + const int pred_buf_stride = xd->plane[0].pre[0].stride; + uint8_t *pred_buf; + vp9_sad_fn_ptr_t sad_fn_ptr; + unsigned int start_mv_sad, start_mv_sad_even_rows, start_mv_sad_odd_rows; const MV ref_mv_full = { ref_mv->row >> 3, ref_mv->col >> 3 }; clamp_mv(mvp_full, x->mv_limits.col_min, x->mv_limits.col_max, x->mv_limits.row_min, x->mv_limits.row_max); - start_mv_sad = - get_start_mv_sad(x, mvp_full, &ref_mv_full, fn_ptr->sdf, sadpb); + + pred_buf = + xd->plane[0].pre[0].buf + mvp_full->row * pred_buf_stride + mvp_full->col; + start_mv_sad_even_rows = + fn_ptr->sdsf(src_buf, src_buf_stride, pred_buf, pred_buf_stride); + start_mv_sad_odd_rows = + fn_ptr->sdsf(src_buf + src_buf_stride, src_buf_stride, + pred_buf + pred_buf_stride, pred_buf_stride); + start_mv_sad = (start_mv_sad_even_rows + start_mv_sad_odd_rows) >> 1; + start_mv_sad += mvsad_err_cost(x, mvp_full, &ref_mv_full, sadpb); + + sad_fn_ptr.sdf = fn_ptr->sdf; + sad_fn_ptr.sdx4df = fn_ptr->sdx4df; + if (use_downsampled_sad && num_4x4_blocks_high_lookup[bsize] >= 2) { + // If the absolute difference between the pred-to-src SAD of even rows and + // the pred-to-src SAD of odd rows is small, skip every other row in sad + // computation. + const int odd_to_even_diff_sad = + abs((int)start_mv_sad_even_rows - (int)start_mv_sad_odd_rows); + const int mult_thresh = 10; + if (odd_to_even_diff_sad * mult_thresh < (int)start_mv_sad_even_rows) { + sad_fn_ptr.sdf = fn_ptr->sdsf; + sad_fn_ptr.sdx4df = fn_ptr->sdsx4df; + } + } bestsme = cpi->diamond_search_sad(x, &cpi->ss_cfg, mvp_full, start_mv_sad, &temp_mv, - step_param, sadpb, &n, fn_ptr, ref_mv); + step_param, sadpb, &n, &sad_fn_ptr, ref_mv); if (bestsme < INT_MAX) bestsme = vp9_get_mvpred_var(x, &temp_mv, ref_mv, fn_ptr, 1); *dst_mv = temp_mv; @@ -2518,7 +2548,7 @@ static int full_pixel_diamond(const VP9_COMP *const cpi, } else { thissme = cpi->diamond_search_sad(x, &cpi->ss_cfg, mvp_full, start_mv_sad, &temp_mv, step_param + n, sadpb, &num00, - fn_ptr, ref_mv); + &sad_fn_ptr, ref_mv); if (thissme < INT_MAX) thissme = vp9_get_mvpred_var(x, &temp_mv, ref_mv, fn_ptr, 1); @@ -2536,8 +2566,8 @@ static int full_pixel_diamond(const VP9_COMP *const cpi, if (do_refine) { const int search_range = 8; MV best_mv = *dst_mv; - thissme = vp9_refining_search_sad(x, &best_mv, sadpb, search_range, fn_ptr, - ref_mv); + thissme = vp9_refining_search_sad(x, &best_mv, sadpb, search_range, + &sad_fn_ptr, ref_mv); if (thissme < INT_MAX) thissme = vp9_get_mvpred_var(x, &best_mv, ref_mv, fn_ptr, 1); if (thissme < bestsme) { @@ -2546,6 +2576,27 @@ static int full_pixel_diamond(const VP9_COMP *const cpi, } } + if (sad_fn_ptr.sdf != fn_ptr->sdf) { + // If we are skipping rows when we perform the motion search, we need to + // check the quality of skipping. If it's bad, then we run search with + // skip row features off. + const uint8_t *best_address = get_buf_from_mv(&xd->plane[0].pre[0], dst_mv); + const int sad = + fn_ptr->sdf(src_buf, src_buf_stride, best_address, pred_buf_stride); + const int skip_sad = + fn_ptr->sdsf(src_buf, src_buf_stride, best_address, pred_buf_stride); + // We will keep the result of skipping rows if it's good enough. + const int kSADThresh = + 1 << (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]); + if (sad > kSADThresh && abs(skip_sad - sad) * 10 >= VPXMAX(sad, 1) * 9) { + // There is a large discrepancy between skipping and not skipping, so we + // need to redo the motion search. + return full_pixel_diamond(cpi, x, bsize, mvp_full, step_param, sadpb, + further_steps, do_refine, 0, cost_list, fn_ptr, + ref_mv, dst_mv); + } + } + // Return cost list. if (cost_list) { calc_int_cost_list(x, ref_mv, sadpb, fn_ptr, dst_mv, cost_list); @@ -2697,7 +2748,7 @@ int64_t vp9_refining_search_sad_new(const MACROBLOCK *x, MV *best_full_mv, int vp9_refining_search_sad(const MACROBLOCK *x, MV *ref_mv, int error_per_bit, int search_range, - const vp9_variance_fn_ptr_t *fn_ptr, + const vp9_sad_fn_ptr_t *sad_fn_ptr, const MV *center_mv) { const MACROBLOCKD *const xd = &x->e_mbd; const MV neighbors[4] = { { -1, 0 }, { 0, -1 }, { 0, 1 }, { 1, 0 } }; @@ -2706,7 +2757,7 @@ int vp9_refining_search_sad(const MACROBLOCK *x, MV *ref_mv, int error_per_bit, const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 }; const uint8_t *best_address = get_buf_from_mv(in_what, ref_mv); unsigned int best_sad = - fn_ptr->sdf(what->buf, what->stride, best_address, in_what->stride) + + sad_fn_ptr->sdf(what->buf, what->stride, best_address, in_what->stride) + mvsad_err_cost(x, ref_mv, &fcenter_mv, error_per_bit); int i, j; @@ -2723,7 +2774,8 @@ int vp9_refining_search_sad(const MACROBLOCK *x, MV *ref_mv, int error_per_bit, best_address - 1, best_address + 1, best_address + in_what->stride }; - fn_ptr->sdx4df(what->buf, what->stride, positions, in_what->stride, sads); + sad_fn_ptr->sdx4df(what->buf, what->stride, positions, in_what->stride, + sads); for (j = 0; j < 4; ++j) { if (sads[j] < best_sad) { @@ -2743,8 +2795,8 @@ int vp9_refining_search_sad(const MACROBLOCK *x, MV *ref_mv, int error_per_bit, if (is_mv_in(&x->mv_limits, &mv)) { unsigned int sad = - fn_ptr->sdf(what->buf, what->stride, - get_buf_from_mv(in_what, &mv), in_what->stride); + sad_fn_ptr->sdf(what->buf, what->stride, + get_buf_from_mv(in_what, &mv), in_what->stride); if (sad < best_sad) { sad += mvsad_err_cost(x, &mv, &fcenter_mv, error_per_bit); if (sad < best_sad) { @@ -2861,9 +2913,10 @@ int vp9_full_pixel_search(const VP9_COMP *const cpi, const MACROBLOCK *const x, break; case NSTEP: case MESH: - var = full_pixel_diamond(cpi, x, mvp_full, step_param, error_per_bit, - MAX_MVSEARCH_STEPS - 1 - step_param, 1, - cost_list, fn_ptr, ref_mv, tmp_mv); + var = full_pixel_diamond( + cpi, x, bsize, mvp_full, step_param, error_per_bit, + MAX_MVSEARCH_STEPS - 1 - step_param, 1, + cpi->sf.mv.use_downsampled_sad, cost_list, fn_ptr, ref_mv, tmp_mv); break; default: assert(0 && "Unknown search method"); } diff --git a/vp9/encoder/vp9_mcomp.h b/vp9/encoder/vp9_mcomp.h index 62a7a047d4..fd6a8b9aca 100644 --- a/vp9/encoder/vp9_mcomp.h +++ b/vp9/encoder/vp9_mcomp.h @@ -41,6 +41,11 @@ typedef struct search_site_config { int total_steps; } search_site_config; +typedef struct vp9_sad_table { + vpx_sad_fn_t sdf; + vpx_sad_multi_d_fn_t sdx4df; +} vp9_sad_fn_ptr_t; + static INLINE const uint8_t *get_buf_from_mv(const struct buf_2d *buf, const MV *mv) { return &buf->buf[mv->row * buf->stride + mv->col]; @@ -63,12 +68,13 @@ int vp9_get_mvpred_av_var(const MACROBLOCK *x, const MV *best_mv, struct VP9_COMP; struct SPEED_FEATURES; +struct vp9_sad_table; int vp9_init_search_range(int size); int vp9_refining_search_sad(const struct macroblock *x, struct mv *ref_mv, int error_per_bit, int search_range, - const struct vp9_variance_vtable *fn_ptr, + const struct vp9_sad_table *sad_fn_ptr, const struct mv *center_mv); // Perform integral projection based motion estimation. @@ -96,7 +102,7 @@ extern fractional_mv_step_fp vp9_return_min_sub_pixel_mv; typedef int (*vp9_diamond_search_fn_t)( const MACROBLOCK *x, const search_site_config *cfg, MV *ref_mv, uint32_t start_mv_sad, MV *best_mv, int search_param, int sad_per_bit, - int *num00, const vp9_variance_fn_ptr_t *fn_ptr, const MV *center_mv); + int *num00, const vp9_sad_fn_ptr_t *sad_fn_ptr, const MV *center_mv); int vp9_refining_search_8p_c(const MACROBLOCK *x, MV *ref_mv, int error_per_bit, int search_range, diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c index 3e121b799f..2aa3140052 100644 --- a/vp9/encoder/vp9_speed_features.c +++ b/vp9/encoder/vp9_speed_features.c @@ -223,6 +223,7 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, sf->allow_skip_recode = 1; sf->less_rectangular_check = 1; sf->mv.auto_mv_step_size = 1; + sf->mv.use_downsampled_sad = 1; sf->prune_ref_frame_for_rect_partitions = 1; sf->temporal_filter_search_method = NSTEP; sf->tx_size_search_breakout = 1; @@ -919,6 +920,7 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi, int speed) { sf->coeff_prob_appx_step = 1; sf->mv.auto_mv_step_size = 0; sf->mv.fullpel_search_step_param = 6; + sf->mv.use_downsampled_sad = 0; sf->comp_inter_joint_search_thresh = BLOCK_4X4; sf->tx_size_search_method = USE_FULL_RD; sf->use_lp32x32fdct = 0; diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h index d32bf09e4e..7b7290d714 100644 --- a/vp9/encoder/vp9_speed_features.h +++ b/vp9/encoder/vp9_speed_features.h @@ -210,6 +210,10 @@ typedef struct MV_SPEED_FEATURES { // This variable sets the step_param used in full pel motion search. int fullpel_search_step_param; + + // Whether to downsample the rows in sad calculation during motion search. + // This is only active when there are at least 8 rows. + int use_downsampled_sad; } MV_SPEED_FEATURES; typedef struct PARTITION_SEARCH_BREAKOUT_THR { diff --git a/vp9/encoder/x86/vp9_diamond_search_sad_avx.c b/vp9/encoder/x86/vp9_diamond_search_sad_avx.c index 719ab40f90..80442e3594 100644 --- a/vp9/encoder/x86/vp9_diamond_search_sad_avx.c +++ b/vp9/encoder/x86/vp9_diamond_search_sad_avx.c @@ -51,7 +51,7 @@ int vp9_diamond_search_sad_avx(const MACROBLOCK *x, const search_site_config *cfg, MV *ref_mv, uint32_t start_mv_sad, MV *best_mv, int search_param, int sad_per_bit, int *num00, - const vp9_variance_fn_ptr_t *fn_ptr, + const vp9_sad_fn_ptr_t *sad_fn_ptr, const MV *center_mv) { const int_mv maxmv = pack_int_mv(x->mv_limits.row_max, x->mv_limits.col_max); const __m128i v_max_mv_w = _mm_set1_epi32((int)maxmv.as_int); @@ -167,8 +167,8 @@ int vp9_diamond_search_sad_avx(const MACROBLOCK *x, #endif } - fn_ptr->sdx4df(what, what_stride, (const uint8_t **)&v_blocka[0], - in_what_stride, (uint32_t *)&v_sad_d); + sad_fn_ptr->sdx4df(what, what_stride, (const uint8_t **)&v_blocka[0], + in_what_stride, (uint32_t *)&v_sad_d); // Look up the component cost of the residual motion vector { diff --git a/vpx_dsp/sad.c b/vpx_dsp/sad.c index b47c43430d..619d7aa956 100644 --- a/vpx_dsp/sad.c +++ b/vpx_dsp/sad.c @@ -43,6 +43,12 @@ static INLINE unsigned int sad(const uint8_t *src_ptr, int src_stride, DECLARE_ALIGNED(16, uint8_t, comp_pred[m * n]); \ vpx_comp_avg_pred_c(comp_pred, second_pred, m, n, ref_ptr, ref_stride); \ return sad(src_ptr, src_stride, comp_pred, m, m, n); \ + } \ + unsigned int vpx_sad_skip_##m##x##n##_c( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride) { \ + return 2 * sad(src_ptr, 2 * src_stride, ref_ptr, 2 * ref_stride, (m), \ + (n / 2)); \ } // Compare |src_ptr| to 4 distinct references in |ref_array[4]| @@ -54,6 +60,15 @@ static INLINE unsigned int sad(const uint8_t *src_ptr, int src_stride, for (i = 0; i < 4; ++i) \ sad_array[i] = \ vpx_sad##m##x##n##_c(src_ptr, src_stride, ref_array[i], ref_stride); \ + } \ + void vpx_sad_skip_##m##x##n##x4d_c(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + int i; \ + for (i = 0; i < 4; ++i) { \ + sad_array[i] = 2 * sad(src_ptr, 2 * src_stride, ref_array[i], \ + 2 * ref_stride, (m), (n / 2)); \ + } \ } /* clang-format off */ @@ -156,6 +171,12 @@ static INLINE unsigned int highbd_sadb(const uint8_t *src8_ptr, int src_stride, vpx_highbd_comp_avg_pred_c(comp_pred, CONVERT_TO_SHORTPTR(second_pred), m, \ n, CONVERT_TO_SHORTPTR(ref_ptr), ref_stride); \ return highbd_sadb(src_ptr, src_stride, comp_pred, m, m, n); \ + } \ + unsigned int vpx_highbd_sad_skip_##m##x##n##_c( \ + const uint8_t *src, int src_stride, const uint8_t *ref, \ + int ref_stride) { \ + return 2 * \ + highbd_sad(src, 2 * src_stride, ref, 2 * ref_stride, (m), (n / 2)); \ } #define highbd_sadMxNx4D(m, n) \ @@ -167,6 +188,15 @@ static INLINE unsigned int highbd_sadb(const uint8_t *src8_ptr, int src_stride, sad_array[i] = vpx_highbd_sad##m##x##n##_c(src_ptr, src_stride, \ ref_array[i], ref_stride); \ } \ + } \ + void vpx_highbd_sad_skip_##m##x##n##x4d_c( \ + const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + int i; \ + for (i = 0; i < 4; ++i) { \ + sad_array[i] = vpx_highbd_sad_skip_##m##x##n##_c( \ + src, src_stride, ref_array[i], ref_stride); \ + } \ } /* clang-format off */ diff --git a/vpx_dsp/variance.h b/vpx_dsp/variance.h index 755cb907d2..ccdb2f90ba 100644 --- a/vpx_dsp/variance.h +++ b/vpx_dsp/variance.h @@ -69,11 +69,15 @@ typedef struct variance_vtable { #if CONFIG_VP9 typedef struct vp9_variance_vtable { vpx_sad_fn_t sdf; + // Same as normal sad, but downsample the rows by a factor of 2. + vpx_sad_fn_t sdsf; vpx_sad_avg_fn_t sdaf; vpx_variance_fn_t vf; vpx_subpixvariance_fn_t svf; vpx_subp_avg_variance_fn_t svaf; vpx_sad_multi_d_fn_t sdx4df; + // Same as sadx4, but downsample the rows by a factor of 2. + vpx_sad_multi_d_fn_t sdsx4df; } vp9_variance_fn_ptr_t; #endif // CONFIG_VP9 diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 49bc9a6309..346097dc79 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -786,6 +786,43 @@ () add_proto qw/unsigned int vpx_sad4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; specialize qw/vpx_sad4x4 neon msa sse2 mmi/; +add_proto qw/unsigned int vpx_sad_skip_64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; +specialize qw/vpx_sad_skip_64x64 avx2 sse2/; + +add_proto qw/unsigned int vpx_sad_skip_64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; +specialize qw/vpx_sad_skip_64x32 avx2 sse2/; + +add_proto qw/unsigned int vpx_sad_skip_32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; +specialize qw/vpx_sad_skip_32x64 avx2 sse2/; + +add_proto qw/unsigned int vpx_sad_skip_32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; +specialize qw/vpx_sad_skip_32x32 avx2 sse2/; + +add_proto qw/unsigned int vpx_sad_skip_32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; +specialize qw/vpx_sad_skip_32x16 avx2 sse2/; + +add_proto qw/unsigned int vpx_sad_skip_16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; +specialize qw/vpx_sad_skip_16x32 sse2/; + +add_proto qw/unsigned int vpx_sad_skip_16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; +specialize qw/vpx_sad_skip_16x16 sse2/; + +add_proto qw/unsigned int vpx_sad_skip_16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; +specialize qw/vpx_sad_skip_16x8 sse2/; + +add_proto qw/unsigned int vpx_sad_skip_8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; +specialize qw/vpx_sad_skip_8x16 sse2/; + +add_proto qw/unsigned int vpx_sad_skip_8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; +specialize qw/vpx_sad_skip_8x8 sse2/; + +add_proto qw/unsigned int vpx_sad_skip_8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + +add_proto qw/unsigned int vpx_sad_skip_4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; +specialize qw/vpx_sad_skip_4x8 sse2/; + +add_proto qw/unsigned int vpx_sad_skip_4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + # # Avg # @@ -928,6 +965,43 @@ () add_proto qw/void vpx_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad4x4x4d neon msa sse2 mmi/; +add_proto qw/void vpx_sad_skip_64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +specialize qw/vpx_sad_skip_64x64x4d avx2 sse2/; + +add_proto qw/void vpx_sad_skip_64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +specialize qw/vpx_sad_skip_64x32x4d avx2 sse2/; + +add_proto qw/void vpx_sad_skip_32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +specialize qw/vpx_sad_skip_32x64x4d avx2 sse2/; + +add_proto qw/void vpx_sad_skip_32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +specialize qw/vpx_sad_skip_32x32x4d avx2 sse2/; + +add_proto qw/void vpx_sad_skip_32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +specialize qw/vpx_sad_skip_32x16x4d avx2 sse2/; + +add_proto qw/void vpx_sad_skip_16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +specialize qw/vpx_sad_skip_16x32x4d sse2/; + +add_proto qw/void vpx_sad_skip_16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +specialize qw/vpx_sad_skip_16x16x4d sse2/; + +add_proto qw/void vpx_sad_skip_16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +specialize qw/vpx_sad_skip_16x8x4d sse2/; + +add_proto qw/void vpx_sad_skip_8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +specialize qw/vpx_sad_skip_8x16x4d sse2/; + +add_proto qw/void vpx_sad_skip_8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +specialize qw/vpx_sad_skip_8x8x4d sse2/; + +add_proto qw/void vpx_sad_skip_8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + +add_proto qw/void vpx_sad_skip_4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +specialize qw/vpx_sad_skip_4x8x4d sse2/; + +add_proto qw/void vpx_sad_skip_4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + add_proto qw/uint64_t vpx_sum_squares_2d_i16/, "const int16_t *src, int stride, int size"; specialize qw/vpx_sum_squares_2d_i16 neon sse2 msa/; @@ -991,6 +1065,42 @@ () add_proto qw/unsigned int vpx_highbd_sad4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; specialize qw/vpx_highbd_sad4x4 neon/; + add_proto qw/unsigned int vpx_highbd_sad_skip_64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + specialize qw/vpx_highbd_sad_skip_64x64 sse2 avx2/; + + add_proto qw/unsigned int vpx_highbd_sad_skip_64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + specialize qw/vpx_highbd_sad_skip_64x32 sse2 avx2/; + + add_proto qw/unsigned int vpx_highbd_sad_skip_32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + specialize qw/vpx_highbd_sad_skip_32x64 sse2 avx2/; + + add_proto qw/unsigned int vpx_highbd_sad_skip_32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + specialize qw/vpx_highbd_sad_skip_32x32 sse2 avx2/; + + add_proto qw/unsigned int vpx_highbd_sad_skip_32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + specialize qw/vpx_highbd_sad_skip_32x16 sse2 avx2/; + + add_proto qw/unsigned int vpx_highbd_sad_skip_16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + specialize qw/vpx_highbd_sad_skip_16x32 sse2 avx2/; + + add_proto qw/unsigned int vpx_highbd_sad_skip_16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + specialize qw/vpx_highbd_sad_skip_16x16 sse2 avx2/; + + add_proto qw/unsigned int vpx_highbd_sad_skip_16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + specialize qw/vpx_highbd_sad_skip_16x8 sse2 avx2/; + + add_proto qw/unsigned int vpx_highbd_sad_skip_8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + specialize qw/vpx_highbd_sad_skip_8x16 sse2/; + + add_proto qw/unsigned int vpx_highbd_sad_skip_8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + specialize qw/vpx_highbd_sad_skip_8x8 sse2/; + + add_proto qw/unsigned int vpx_highbd_sad_skip_8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + + add_proto qw/unsigned int vpx_highbd_sad_skip_4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + + add_proto qw/unsigned int vpx_highbd_sad_skip_4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + # # Avg # @@ -1084,6 +1194,43 @@ () add_proto qw/void vpx_highbd_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad4x4x4d sse2 neon/; + add_proto qw/void vpx_highbd_sad_skip_64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + specialize qw/vpx_highbd_sad_skip_64x64x4d sse2 avx2/; + + add_proto qw/void vpx_highbd_sad_skip_64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + specialize qw/vpx_highbd_sad_skip_64x32x4d sse2 avx2/; + + add_proto qw/void vpx_highbd_sad_skip_32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + specialize qw/vpx_highbd_sad_skip_32x64x4d sse2 avx2/; + + add_proto qw/void vpx_highbd_sad_skip_32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + specialize qw/vpx_highbd_sad_skip_32x32x4d sse2 avx2/; + + add_proto qw/void vpx_highbd_sad_skip_32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + specialize qw/vpx_highbd_sad_skip_32x16x4d sse2 avx2/; + + add_proto qw/void vpx_highbd_sad_skip_16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + specialize qw/vpx_highbd_sad_skip_16x32x4d sse2 avx2/; + + add_proto qw/void vpx_highbd_sad_skip_16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + specialize qw/vpx_highbd_sad_skip_16x16x4d sse2 avx2/; + + add_proto qw/void vpx_highbd_sad_skip_16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + specialize qw/vpx_highbd_sad_skip_16x8x4d sse2 avx2/; + + add_proto qw/void vpx_highbd_sad_skip_8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + specialize qw/vpx_highbd_sad_skip_8x16x4d sse2/; + + add_proto qw/void vpx_highbd_sad_skip_8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + specialize qw/vpx_highbd_sad_skip_8x8x4d sse2/; + + add_proto qw/void vpx_highbd_sad_skip_8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + + add_proto qw/void vpx_highbd_sad_skip_4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + specialize qw/vpx_highbd_sad_skip_4x8x4d sse2/; + + add_proto qw/void vpx_highbd_sad_skip_4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + # # Structured Similarity (SSIM) # diff --git a/vpx_dsp/x86/highbd_sad4d_avx2.c b/vpx_dsp/x86/highbd_sad4d_avx2.c index 947b5e9772..e483fdce73 100644 --- a/vpx_dsp/x86/highbd_sad4d_avx2.c +++ b/vpx_dsp/x86/highbd_sad4d_avx2.c @@ -61,70 +61,79 @@ static VPX_FORCE_INLINE void highbd_sad64xHx4d(__m256i *sums_16 /*[4]*/, } } +static VPX_FORCE_INLINE void highbd_sad64xNx4d_avx2( + const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], + int ref_stride, uint32_t sad_array[4], int n) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); + uint16_t *refs[4]; + __m256i sums_16[4]; + __m256i sums_32[4]; + int i; + + refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]); + refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]); + refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]); + refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]); + sums_32[0] = _mm256_setzero_si256(); + sums_32[1] = _mm256_setzero_si256(); + sums_32[2] = _mm256_setzero_si256(); + sums_32[3] = _mm256_setzero_si256(); + + for (i = 0; i < (n / 2); ++i) { + sums_16[0] = _mm256_setzero_si256(); + sums_16[1] = _mm256_setzero_si256(); + sums_16[2] = _mm256_setzero_si256(); + sums_16[3] = _mm256_setzero_si256(); + + highbd_sad64xHx4d(sums_16, src, src_stride, refs, ref_stride, 2); + + /* sums_16 will outrange after 2 rows, so add current sums_16 to + * sums_32*/ + sums_32[0] = _mm256_add_epi32( + sums_32[0], + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[0], 1)))); + sums_32[1] = _mm256_add_epi32( + sums_32[1], + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[1], 1)))); + sums_32[2] = _mm256_add_epi32( + sums_32[2], + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[2], 1)))); + sums_32[3] = _mm256_add_epi32( + sums_32[3], + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[3], 1)))); + + src += src_stride << 1; + } + calc_final_4(sums_32, sad_array); +} + #define HIGHBD_SAD64XNX4D(n) \ - void vpx_highbd_sad64x##n##x4d_avx2(const uint8_t *src_ptr, int src_stride, \ + void vpx_highbd_sad64x##n##x4d_avx2(const uint8_t *src, int src_stride, \ const uint8_t *const ref_array[4], \ int ref_stride, uint32_t sad_array[4]) { \ - const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ - uint16_t *refs[4]; \ - __m256i sums_16[4]; \ - __m256i sums_32[4]; \ - int i; \ - \ - refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]); \ - refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]); \ - refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]); \ - refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]); \ - sums_32[0] = _mm256_setzero_si256(); \ - sums_32[1] = _mm256_setzero_si256(); \ - sums_32[2] = _mm256_setzero_si256(); \ - sums_32[3] = _mm256_setzero_si256(); \ - \ - for (i = 0; i < (n / 2); ++i) { \ - sums_16[0] = _mm256_setzero_si256(); \ - sums_16[1] = _mm256_setzero_si256(); \ - sums_16[2] = _mm256_setzero_si256(); \ - sums_16[3] = _mm256_setzero_si256(); \ - \ - highbd_sad64xHx4d(sums_16, src, src_stride, refs, ref_stride, 2); \ - \ - /* sums_16 will outrange after 2 rows, so add current sums_16 to \ - * sums_32*/ \ - sums_32[0] = _mm256_add_epi32( \ - sums_32[0], \ - _mm256_add_epi32( \ - _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])), \ - _mm256_cvtepu16_epi32( \ - _mm256_extractf128_si256(sums_16[0], 1)))); \ - sums_32[1] = _mm256_add_epi32( \ - sums_32[1], \ - _mm256_add_epi32( \ - _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])), \ - _mm256_cvtepu16_epi32( \ - _mm256_extractf128_si256(sums_16[1], 1)))); \ - sums_32[2] = _mm256_add_epi32( \ - sums_32[2], \ - _mm256_add_epi32( \ - _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])), \ - _mm256_cvtepu16_epi32( \ - _mm256_extractf128_si256(sums_16[2], 1)))); \ - sums_32[3] = _mm256_add_epi32( \ - sums_32[3], \ - _mm256_add_epi32( \ - _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])), \ - _mm256_cvtepu16_epi32( \ - _mm256_extractf128_si256(sums_16[3], 1)))); \ - \ - src += src_stride << 1; \ - } \ - calc_final_4(sums_32, sad_array); \ + highbd_sad64xNx4d_avx2(src, src_stride, ref_array, ref_stride, sad_array, \ + n); \ } -// 64x64 -HIGHBD_SAD64XNX4D(64) - -// 64x32 -HIGHBD_SAD64XNX4D(32) +#define HIGHBD_SADSKIP64XNx4D(n) \ + void vpx_highbd_sad_skip_64x##n##x4d_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + highbd_sad64xNx4d_avx2(src, 2 * src_stride, ref_array, 2 * ref_stride, \ + sad_array, n / 2); \ + sad_array[0] <<= 1; \ + sad_array[1] <<= 1; \ + sad_array[2] <<= 1; \ + sad_array[3] <<= 1; \ + } static VPX_FORCE_INLINE void highbd_sad32xHx4d(__m256i *sums_16 /*[4]*/, const uint16_t *src, @@ -171,73 +180,79 @@ static VPX_FORCE_INLINE void highbd_sad32xHx4d(__m256i *sums_16 /*[4]*/, } } +static VPX_FORCE_INLINE void highbd_sad32xNx4d_avx2( + const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], + int ref_stride, uint32_t sad_array[4], int n) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); + uint16_t *refs[4]; + __m256i sums_16[4]; + __m256i sums_32[4]; + int i; + + refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]); + refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]); + refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]); + refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]); + sums_32[0] = _mm256_setzero_si256(); + sums_32[1] = _mm256_setzero_si256(); + sums_32[2] = _mm256_setzero_si256(); + sums_32[3] = _mm256_setzero_si256(); + + for (i = 0; i < (n / 8); ++i) { + sums_16[0] = _mm256_setzero_si256(); + sums_16[1] = _mm256_setzero_si256(); + sums_16[2] = _mm256_setzero_si256(); + sums_16[3] = _mm256_setzero_si256(); + + highbd_sad32xHx4d(sums_16, src, src_stride, refs, ref_stride, 8); + + /* sums_16 will outrange after 8 rows, so add current sums_16 to + * sums_32*/ + sums_32[0] = _mm256_add_epi32( + sums_32[0], + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[0], 1)))); + sums_32[1] = _mm256_add_epi32( + sums_32[1], + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[1], 1)))); + sums_32[2] = _mm256_add_epi32( + sums_32[2], + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[2], 1)))); + sums_32[3] = _mm256_add_epi32( + sums_32[3], + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[3], 1)))); + + src += src_stride << 3; + } + calc_final_4(sums_32, sad_array); +} + #define HIGHBD_SAD32XNX4D(n) \ - void vpx_highbd_sad32x##n##x4d_avx2(const uint8_t *src_ptr, int src_stride, \ + void vpx_highbd_sad32x##n##x4d_avx2(const uint8_t *src, int src_stride, \ const uint8_t *const ref_array[4], \ int ref_stride, uint32_t sad_array[4]) { \ - const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ - uint16_t *refs[4]; \ - __m256i sums_16[4]; \ - __m256i sums_32[4]; \ - int i; \ - \ - refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]); \ - refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]); \ - refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]); \ - refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]); \ - sums_32[0] = _mm256_setzero_si256(); \ - sums_32[1] = _mm256_setzero_si256(); \ - sums_32[2] = _mm256_setzero_si256(); \ - sums_32[3] = _mm256_setzero_si256(); \ - \ - for (i = 0; i < (n / 8); ++i) { \ - sums_16[0] = _mm256_setzero_si256(); \ - sums_16[1] = _mm256_setzero_si256(); \ - sums_16[2] = _mm256_setzero_si256(); \ - sums_16[3] = _mm256_setzero_si256(); \ - \ - highbd_sad32xHx4d(sums_16, src, src_stride, refs, ref_stride, 8); \ - \ - /* sums_16 will outrange after 8 rows, so add current sums_16 to \ - * sums_32*/ \ - sums_32[0] = _mm256_add_epi32( \ - sums_32[0], \ - _mm256_add_epi32( \ - _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])), \ - _mm256_cvtepu16_epi32( \ - _mm256_extractf128_si256(sums_16[0], 1)))); \ - sums_32[1] = _mm256_add_epi32( \ - sums_32[1], \ - _mm256_add_epi32( \ - _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])), \ - _mm256_cvtepu16_epi32( \ - _mm256_extractf128_si256(sums_16[1], 1)))); \ - sums_32[2] = _mm256_add_epi32( \ - sums_32[2], \ - _mm256_add_epi32( \ - _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])), \ - _mm256_cvtepu16_epi32( \ - _mm256_extractf128_si256(sums_16[2], 1)))); \ - sums_32[3] = _mm256_add_epi32( \ - sums_32[3], \ - _mm256_add_epi32( \ - _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])), \ - _mm256_cvtepu16_epi32( \ - _mm256_extractf128_si256(sums_16[3], 1)))); \ - \ - src += src_stride << 3; \ - } \ - calc_final_4(sums_32, sad_array); \ + highbd_sad32xNx4d_avx2(src, src_stride, ref_array, ref_stride, sad_array, \ + n); \ } -// 32x64 -HIGHBD_SAD32XNX4D(64) - -// 32x32 -HIGHBD_SAD32XNX4D(32) - -// 32x16 -HIGHBD_SAD32XNX4D(16) +#define HIGHBD_SADSKIP32XNx4D(n) \ + void vpx_highbd_sad_skip_32x##n##x4d_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + highbd_sad32xNx4d_avx2(src, 2 * src_stride, ref_array, 2 * ref_stride, \ + sad_array, n / 2); \ + sad_array[0] <<= 1; \ + sad_array[1] <<= 1; \ + sad_array[2] <<= 1; \ + sad_array[3] <<= 1; \ + } static VPX_FORCE_INLINE void highbd_sad16xHx4d(__m256i *sums_16 /*[4]*/, const uint16_t *src, @@ -275,13 +290,15 @@ static VPX_FORCE_INLINE void highbd_sad16xHx4d(__m256i *sums_16 /*[4]*/, } } -void vpx_highbd_sad16x32x4d_avx2(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], - int ref_stride, uint32_t sad_array[4]) { +static VPX_FORCE_INLINE void highbd_sad16xNx4d_avx2( + const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], + int ref_stride, uint32_t sad_array[4], int n) { const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); uint16_t *refs[4]; __m256i sums_16[4]; __m256i sums_32[4]; + const int height = VPXMIN(16, n); + const int num_iters = n / height; int i; refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]); @@ -293,13 +310,13 @@ void vpx_highbd_sad16x32x4d_avx2(const uint8_t *src_ptr, int src_stride, sums_32[2] = _mm256_setzero_si256(); sums_32[3] = _mm256_setzero_si256(); - for (i = 0; i < 2; ++i) { + for (i = 0; i < num_iters; ++i) { sums_16[0] = _mm256_setzero_si256(); sums_16[1] = _mm256_setzero_si256(); sums_16[2] = _mm256_setzero_si256(); sums_16[3] = _mm256_setzero_si256(); - highbd_sad16xHx4d(sums_16, src, src_stride, refs, ref_stride, 16); + highbd_sad16xHx4d(sums_16, src, src_stride, refs, ref_stride, height); // sums_16 will outrange after 16 rows, so add current sums_16 to sums_32 sums_32[0] = _mm256_add_epi32( @@ -328,6 +345,26 @@ void vpx_highbd_sad16x32x4d_avx2(const uint8_t *src_ptr, int src_stride, calc_final_4(sums_32, sad_array); } +#define HIGHBD_SAD16XNX4D(n) \ + void vpx_highbd_sad16x##n##x4d_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + highbd_sad16xNx4d_avx2(src, src_stride, ref_array, ref_stride, sad_array, \ + n); \ + } + +#define HIGHBD_SADSKIP16XNx4D(n) \ + void vpx_highbd_sad_skip_16x##n##x4d_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + highbd_sad16xNx4d_avx2(src, 2 * src_stride, ref_array, 2 * ref_stride, \ + sad_array, n / 2); \ + sad_array[0] <<= 1; \ + sad_array[1] <<= 1; \ + sad_array[2] <<= 1; \ + sad_array[3] <<= 1; \ + } + void vpx_highbd_sad16x16x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]) { @@ -399,3 +436,27 @@ void vpx_highbd_sad16x8x4d_avx2(const uint8_t *src_ptr, int src_stride, calc_final_4(sums_32, sad_array); } } + +// clang-format off +HIGHBD_SAD64XNX4D(64) +HIGHBD_SADSKIP64XNx4D(64) + +HIGHBD_SAD64XNX4D(32) +HIGHBD_SADSKIP64XNx4D(32) + +HIGHBD_SAD32XNX4D(64) +HIGHBD_SADSKIP32XNx4D(64) + +HIGHBD_SAD32XNX4D(32) +HIGHBD_SADSKIP32XNx4D(32) + +HIGHBD_SAD32XNX4D(16) +HIGHBD_SADSKIP32XNx4D(16) + +HIGHBD_SAD16XNX4D(32) +HIGHBD_SADSKIP16XNx4D(32) + +HIGHBD_SADSKIP16XNx4D(16) + +HIGHBD_SADSKIP16XNx4D(8) + // clang-format on diff --git a/vpx_dsp/x86/highbd_sad4d_sse2.asm b/vpx_dsp/x86/highbd_sad4d_sse2.asm index 6c2a61e019..a07892d811 100644 --- a/vpx_dsp/x86/highbd_sad4d_sse2.asm +++ b/vpx_dsp/x86/highbd_sad4d_sse2.asm @@ -213,7 +213,12 @@ SECTION .text ; uint8_t *ref[4], int ref_stride, ; uint32_t res[4]); ; where NxN = 64x64, 32x32, 16x16, 16x8, 8x16 or 8x8 -%macro HIGH_SADNXN4D 2 +; Macro Arguments: +; 1: Width +; 2: Height +; 3: If 0, then normal sad, if 2, then skip every other row +%macro HIGH_SADNXN4D 2-3 0 +%if %3 == 0 ; normal sad %if UNIX64 cglobal highbd_sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \ res, ref2, ref3, ref4 @@ -221,6 +226,15 @@ cglobal highbd_sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \ cglobal highbd_sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ ref2, ref3, ref4 %endif +%else ; %3 == 2, downsample +%if UNIX64 +cglobal highbd_sad_skip_%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \ + res, ref2, ref3, ref4 +%else +cglobal highbd_sad_skip_%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ + ref2, ref3, ref4 +%endif ; +%endif ; sad/avg/skip ; set m1 push srcq @@ -229,6 +243,10 @@ cglobal highbd_sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ pshufd m1, m1, 0x0 pop srcq +%if %3 == 2 ; skip rows + lea src_strided, [2*src_strided] + lea ref_strided, [2*ref_strided] +%endif ; skip rows movsxdifnidn src_strideq, src_strided movsxdifnidn ref_strideq, ref_strided mov ref2q, [ref1q+gprsize*1] @@ -244,9 +262,15 @@ cglobal highbd_sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ shl ref1q, 1 HIGH_PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1 -%rep (%2-4)/2 +%if %3 == 2 ; Downsampling by two +%define num_rep (%2-8)/4 +%else +%define num_rep (%2-4)/2 +%endif +%rep num_rep HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1 %endrep +%undef rep HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0 ; N.B. HIGH_PROCESS outputs dwords (32 bits) ; so in high bit depth even the smallest width (4) needs 128bits i.e. XMM @@ -265,6 +289,9 @@ cglobal highbd_sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ paddd m4, m0 paddd m6, m1 punpcklqdq m4, m6 +%if %3 == 2 ; skip rows + pslld m4, 1 +%endif movifnidn r4, r4mp movu [r4], m4 RET @@ -285,3 +312,15 @@ HIGH_SADNXN4D 8, 8 HIGH_SADNXN4D 8, 4 HIGH_SADNXN4D 4, 8 HIGH_SADNXN4D 4, 4 + +HIGH_SADNXN4D 64, 64, 2 +HIGH_SADNXN4D 64, 32, 2 +HIGH_SADNXN4D 32, 64, 2 +HIGH_SADNXN4D 32, 32, 2 +HIGH_SADNXN4D 32, 16, 2 +HIGH_SADNXN4D 16, 32, 2 +HIGH_SADNXN4D 16, 16, 2 +HIGH_SADNXN4D 16, 8, 2 +HIGH_SADNXN4D 8, 16, 2 +HIGH_SADNXN4D 8, 8, 2 +HIGH_SADNXN4D 4, 8, 2 diff --git a/vpx_dsp/x86/highbd_sad_avx2.c b/vpx_dsp/x86/highbd_sad_avx2.c index 231b67f809..78f8eb8bfa 100644 --- a/vpx_dsp/x86/highbd_sad_avx2.c +++ b/vpx_dsp/x86/highbd_sad_avx2.c @@ -50,39 +50,49 @@ static VPX_FORCE_INLINE void highbd_sad64xH(__m256i *sums_16, } } -#define HIGHBD_SAD64XN(n) \ - unsigned int vpx_highbd_sad64x##n##_avx2( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride) { \ - const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ - uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ - __m256i sums_32 = _mm256_setzero_si256(); \ - int i; \ - \ - for (i = 0; i < (n / 2); ++i) { \ - __m256i sums_16 = _mm256_setzero_si256(); \ - \ - highbd_sad64xH(&sums_16, src, src_stride, ref, ref_stride, 2); \ - \ - /* sums_16 will outrange after 2 rows, so add current sums_16 to \ - * sums_32*/ \ - sums_32 = _mm256_add_epi32( \ - sums_32, \ - _mm256_add_epi32( \ - _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), \ - _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); \ - \ - src += src_stride << 1; \ - ref += ref_stride << 1; \ - } \ - return calc_final(sums_32); \ +static VPX_FORCE_INLINE unsigned int highbd_sad64xN_avx2(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, + int n) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); + __m256i sums_32 = _mm256_setzero_si256(); + int i; + + for (i = 0; i < (n / 2); ++i) { + __m256i sums_16 = _mm256_setzero_si256(); + + highbd_sad64xH(&sums_16, src, src_stride, ref, ref_stride, 2); + + /* sums_16 will outrange after 2 rows, so add current sums_16 to + * sums_32*/ + sums_32 = _mm256_add_epi32( + sums_32, + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); + + src += src_stride << 1; + ref += ref_stride << 1; } + return calc_final(sums_32); +} -// 64x64 -HIGHBD_SAD64XN(64) +#define HIGHBD_SAD64XN(n) \ + unsigned int vpx_highbd_sad64x##n##_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *ref, \ + int ref_stride) { \ + return highbd_sad64xN_avx2(src, src_stride, ref, ref_stride, n); \ + } -// 64x32 -HIGHBD_SAD64XN(32) +#define HIGHBD_SADSKIP64xN(n) \ + unsigned int vpx_highbd_sad_skip_64x##n##_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *ref, \ + int ref_stride) { \ + return 2 * highbd_sad64xN_avx2(src, 2 * src_stride, ref, 2 * ref_stride, \ + n / 2); \ + } static VPX_FORCE_INLINE void highbd_sad32xH(__m256i *sums_16, const uint16_t *src, int src_stride, @@ -107,42 +117,49 @@ static VPX_FORCE_INLINE void highbd_sad32xH(__m256i *sums_16, } } -#define HIGHBD_SAD32XN(n) \ - unsigned int vpx_highbd_sad32x##n##_avx2( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride) { \ - const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ - uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ - __m256i sums_32 = _mm256_setzero_si256(); \ - int i; \ - \ - for (i = 0; i < (n / 8); ++i) { \ - __m256i sums_16 = _mm256_setzero_si256(); \ - \ - highbd_sad32xH(&sums_16, src, src_stride, ref, ref_stride, 8); \ - \ - /* sums_16 will outrange after 8 rows, so add current sums_16 to \ - * sums_32*/ \ - sums_32 = _mm256_add_epi32( \ - sums_32, \ - _mm256_add_epi32( \ - _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), \ - _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); \ - \ - src += src_stride << 3; \ - ref += ref_stride << 3; \ - } \ - return calc_final(sums_32); \ - } +static VPX_FORCE_INLINE unsigned int highbd_sad32xN_avx2(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, + int n) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); + __m256i sums_32 = _mm256_setzero_si256(); + int i; -// 32x64 -HIGHBD_SAD32XN(64) + for (i = 0; i < (n / 8); ++i) { + __m256i sums_16 = _mm256_setzero_si256(); -// 32x32 -HIGHBD_SAD32XN(32) + highbd_sad32xH(&sums_16, src, src_stride, ref, ref_stride, 8); -// 32x16 -HIGHBD_SAD32XN(16) + /* sums_16 will outrange after 8 rows, so add current sums_16 to + * sums_32*/ + sums_32 = _mm256_add_epi32( + sums_32, + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); + + src += src_stride << 3; + ref += ref_stride << 3; + } + return calc_final(sums_32); +} + +#define HIGHBD_SAD32XN(n) \ + unsigned int vpx_highbd_sad32x##n##_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *ref, \ + int ref_stride) { \ + return highbd_sad32xN_avx2(src, src_stride, ref, ref_stride, n); \ + } + +#define HIGHBD_SADSKIP32xN(n) \ + unsigned int vpx_highbd_sad_skip_32x##n##_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *ref, \ + int ref_stride) { \ + return 2 * highbd_sad32xN_avx2(src, 2 * src_stride, ref, 2 * ref_stride, \ + n / 2); \ + } static VPX_FORCE_INLINE void highbd_sad16xH(__m256i *sums_16, const uint16_t *src, int src_stride, @@ -167,17 +184,22 @@ static VPX_FORCE_INLINE void highbd_sad16xH(__m256i *sums_16, } } -unsigned int vpx_highbd_sad16x32_avx2(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride) { +static VPX_FORCE_INLINE unsigned int highbd_sad16xN_avx2(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, + int n) { const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); __m256i sums_32 = _mm256_setzero_si256(); + const int height = VPXMIN(16, n); + const int num_iters = n / height; int i; - for (i = 0; i < 2; ++i) { + for (i = 0; i < num_iters; ++i) { __m256i sums_16 = _mm256_setzero_si256(); - highbd_sad16xH(&sums_16, src, src_stride, ref, ref_stride, 16); + highbd_sad16xH(&sums_16, src, src_stride, ref, ref_stride, height); // sums_16 will outrange after 16 rows, so add current sums_16 to sums_32 sums_32 = _mm256_add_epi32( @@ -192,6 +214,21 @@ unsigned int vpx_highbd_sad16x32_avx2(const uint8_t *src_ptr, int src_stride, return calc_final(sums_32); } +#define HIGHBD_SAD16XN(n) \ + unsigned int vpx_highbd_sad16x##n##_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *ref, \ + int ref_stride) { \ + return highbd_sad16xN_avx2(src, src_stride, ref, ref_stride, n); \ + } + +#define HIGHBD_SADSKIP16xN(n) \ + unsigned int vpx_highbd_sad_skip_16x##n##_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *ref, \ + int ref_stride) { \ + return 2 * highbd_sad16xN_avx2(src, 2 * src_stride, ref, 2 * ref_stride, \ + n / 2); \ + } + unsigned int vpx_highbd_sad16x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride) { const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); @@ -224,6 +261,23 @@ unsigned int vpx_highbd_sad16x8_avx2(const uint8_t *src_ptr, int src_stride, } } +// clang-format off +HIGHBD_SAD64XN(64) +HIGHBD_SADSKIP64xN(64) +HIGHBD_SAD64XN(32) +HIGHBD_SADSKIP64xN(32) +HIGHBD_SAD32XN(64) +HIGHBD_SADSKIP32xN(64) +HIGHBD_SAD32XN(32) +HIGHBD_SADSKIP32xN(32) +HIGHBD_SAD32XN(16) +HIGHBD_SADSKIP32xN(16) +HIGHBD_SAD16XN(32) +HIGHBD_SADSKIP16xN(32) +HIGHBD_SADSKIP16xN(16) +HIGHBD_SADSKIP16xN(8) +//clang-format on + // AVG ------------------------------------------------------------------------- static VPX_FORCE_INLINE void highbd_sad64xH_avg(__m256i *sums_16, const uint16_t *src, diff --git a/vpx_dsp/x86/highbd_sad_sse2.asm b/vpx_dsp/x86/highbd_sad_sse2.asm index 6a1a6f3d62..62ad2237ff 100644 --- a/vpx_dsp/x86/highbd_sad_sse2.asm +++ b/vpx_dsp/x86/highbd_sad_sse2.asm @@ -12,6 +12,11 @@ SECTION .text +; Macro Arguments +; Arg 1: Width +; Arg 2: Height +; Arg 3: Number of general purpose registers +; Arg 4: Type of function: if 0, normal sad; if 1, avg; if 2, skip rows %macro HIGH_SAD_FN 4 %if %4 == 0 %if %3 == 5 @@ -20,7 +25,7 @@ cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, n_rows cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, \ src_stride3, ref_stride3, n_rows %endif ; %3 == 5/7 -%else ; avg +%elif %4 == 1 ; avg %if %3 == 5 cglobal highbd_sad%1x%2_avg, 5, 1 + %3, 7, src, src_stride, ref, ref_stride, \ second_pred, n_rows @@ -35,7 +40,18 @@ cglobal highbd_sad%1x%2_avg, 5, VPX_ARCH_X86_64 + %3, 7, src, src_stride, \ %define n_rowsd dword r0m %endif ; x86-32/64 %endif ; %3 == 5/7 -%endif ; avg/sad +%else ; %4 == 2, skip rows +%if %3 == 5 +cglobal highbd_sad_skip_%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, n_rows +%else ; %3 == 7 +cglobal highbd_sad_skip_%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, \ + src_stride3, ref_stride3, n_rows +%endif ; %3 == 5/7 +%endif ; sad/avg/skip +%if %4 == 2 ; double the stride if we are skipping rows + lea src_strided, [src_strided*2] + lea ref_strided, [ref_strided*2] +%endif movsxdifnidn src_strideq, src_strided movsxdifnidn ref_strideq, ref_strided %if %3 == 7 @@ -54,7 +70,11 @@ cglobal highbd_sad%1x%2_avg, 5, VPX_ARCH_X86_64 + %3, 7, src, src_stride, \ ; uint8_t *ref, int ref_stride); %macro HIGH_SAD64XN 1-2 0 HIGH_SAD_FN 64, %1, 5, %2 +%if %2 == 2 ; skip rows, so divide number of rows by 2 + mov n_rowsd, %1/2 +%else mov n_rowsd, %1 +%endif pxor m0, m0 pxor m6, m6 @@ -146,6 +166,9 @@ cglobal highbd_sad%1x%2_avg, 5, VPX_ARCH_X86_64 + %3, 7, src, src_stride, \ punpckldq m0, m6 movhlps m1, m0 paddd m0, m1 +%if %2 == 2 ; we skipped rows, so we need to double the sad + pslld m0, 1 +%endif movd eax, m0 RET %endmacro @@ -155,13 +178,19 @@ HIGH_SAD64XN 64 ; highbd_sad64x64_sse2 HIGH_SAD64XN 32 ; highbd_sad64x32_sse2 HIGH_SAD64XN 64, 1 ; highbd_sad64x64_avg_sse2 HIGH_SAD64XN 32, 1 ; highbd_sad64x32_avg_sse2 +HIGH_SAD64XN 64, 2 ; highbd_sad_skip_64x64_sse2 +HIGH_SAD64XN 32, 2 ; highbd_sad_skip_64x32_sse2 ; unsigned int vpx_highbd_sad32x{16,32,64}_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); %macro HIGH_SAD32XN 1-2 0 HIGH_SAD_FN 32, %1, 5, %2 +%if %2 == 2 ; skip rows, so divide number of rows by 2 + mov n_rowsd, %1/2 +%else mov n_rowsd, %1 +%endif pxor m0, m0 pxor m6, m6 @@ -213,6 +242,9 @@ HIGH_SAD64XN 32, 1 ; highbd_sad64x32_avg_sse2 punpckldq m0, m6 movhlps m1, m0 paddd m0, m1 +%if %2 == 2 ; we skipped rows, so we need to double the sad + pslld m0, 1 +%endif movd eax, m0 RET %endmacro @@ -224,12 +256,19 @@ HIGH_SAD32XN 16 ; highbd_sad32x16_sse2 HIGH_SAD32XN 64, 1 ; highbd_sad32x64_avg_sse2 HIGH_SAD32XN 32, 1 ; highbd_sad32x32_avg_sse2 HIGH_SAD32XN 16, 1 ; highbd_sad32x16_avg_sse2 +HIGH_SAD32XN 64, 2 ; highbd_sad_skip_32x64_sse2 +HIGH_SAD32XN 32, 2 ; highbd_sad_skip_32x32_sse2 +HIGH_SAD32XN 16, 2 ; highbd_sad_skip_32x16_sse2 ; unsigned int vpx_highbd_sad16x{8,16,32}_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); %macro HIGH_SAD16XN 1-2 0 HIGH_SAD_FN 16, %1, 5, %2 +%if %2 == 2 ; skip rows, so divide number of rows by 2 + mov n_rowsd, %1/4 +%else mov n_rowsd, %1/2 +%endif pxor m0, m0 pxor m6, m6 @@ -281,6 +320,9 @@ HIGH_SAD32XN 16, 1 ; highbd_sad32x16_avg_sse2 punpckldq m0, m6 movhlps m1, m0 paddd m0, m1 +%if %2 == 2 ; we skipped rows, so we need to double the sad + pslld m0, 1 +%endif movd eax, m0 RET %endmacro @@ -292,13 +334,19 @@ HIGH_SAD16XN 8 ; highbd_sad16x8_sse2 HIGH_SAD16XN 32, 1 ; highbd_sad16x32_avg_sse2 HIGH_SAD16XN 16, 1 ; highbd_sad16x16_avg_sse2 HIGH_SAD16XN 8, 1 ; highbd_sad16x8_avg_sse2 - +HIGH_SAD16XN 32, 2 ; highbd_sad_skip_16x32_sse2 +HIGH_SAD16XN 16, 2 ; highbd_sad_skip_16x16_sse2 +HIGH_SAD16XN 8, 2 ; highbd_sad_skip_16x8_sse2 ; unsigned int vpx_highbd_sad8x{4,8,16}_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); %macro HIGH_SAD8XN 1-2 0 HIGH_SAD_FN 8, %1, 7, %2 +%if %2 == 2 ; skip rows, so divide number of rows by 2 + mov n_rowsd, %1/8 +%else mov n_rowsd, %1/4 +%endif pxor m0, m0 pxor m6, m6 @@ -350,6 +398,9 @@ HIGH_SAD16XN 8, 1 ; highbd_sad16x8_avg_sse2 punpckldq m0, m6 movhlps m1, m0 paddd m0, m1 +%if %2 == 2 ; we skipped rows, so we need to double the sad + pslld m0, 1 +%endif movd eax, m0 RET %endmacro @@ -361,3 +412,5 @@ HIGH_SAD8XN 4 ; highbd_sad8x4_sse2 HIGH_SAD8XN 16, 1 ; highbd_sad8x16_avg_sse2 HIGH_SAD8XN 8, 1 ; highbd_sad8x8_avg_sse2 HIGH_SAD8XN 4, 1 ; highbd_sad8x4_avg_sse2 +HIGH_SAD8XN 16, 2 ; highbd_sad_skip_8x16_sse2 +HIGH_SAD8XN 8, 2 ; highbd_sad_skip_8x8_sse2 diff --git a/vpx_dsp/x86/sad4d_avx2.c b/vpx_dsp/x86/sad4d_avx2.c index 399b67b3fb..c87fd3cd27 100644 --- a/vpx_dsp/x86/sad4d_avx2.c +++ b/vpx_dsp/x86/sad4d_avx2.c @@ -25,9 +25,10 @@ static INLINE void calc_final_4(const __m256i *const sums /*[4]*/, _mm_storeu_si128((__m128i *)sad_array, sum); } -void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { +static INLINE void sad32xhx4d_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], + int ref_stride, int h, + uint32_t sad_array[4]) { int i; const uint8_t *refs[4]; __m256i sums[4]; @@ -41,7 +42,7 @@ void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, sums[2] = _mm256_setzero_si256(); sums[3] = _mm256_setzero_si256(); - for (i = 0; i < 32; i++) { + for (i = 0; i < h; i++) { __m256i r[4]; // load src and all ref[] @@ -73,9 +74,10 @@ void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, calc_final_4(sums, sad_array); } -void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { +static INLINE void sad64xhx4d_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], + int ref_stride, int h, + uint32_t sad_array[4]) { __m256i sums[4]; int i; const uint8_t *refs[4]; @@ -89,7 +91,7 @@ void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, sums[2] = _mm256_setzero_si256(); sums[3] = _mm256_setzero_si256(); - for (i = 0; i < 64; i++) { + for (i = 0; i < h; i++) { __m256i r_lo[4], r_hi[4]; // load 64 bytes from src and all ref[] const __m256i s_lo = _mm256_load_si256((const __m256i *)src_ptr); @@ -132,3 +134,51 @@ void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, calc_final_4(sums, sad_array); } + +#define SAD64_H(h) \ + void vpx_sad64x##h##x4d_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *const ref[4], int ref_stride, \ + uint32_t res[4]) { \ + sad64xhx4d_avx2(src, src_stride, ref, ref_stride, h, res); \ + } + +#define SAD32_H(h) \ + void vpx_sad32x##h##x4d_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *const ref[4], int ref_stride, \ + uint32_t res[4]) { \ + sad32xhx4d_avx2(src, src_stride, ref, ref_stride, h, res); \ + } + +SAD64_H(64) +SAD32_H(32) + +#define SADS64_H(h) \ + void vpx_sad_skip_64x##h##x4d_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *const ref[4], \ + int ref_stride, uint32_t res[4]) { \ + sad64xhx4d_avx2(src, 2 * src_stride, ref, 2 * ref_stride, ((h) >> 1), \ + res); \ + res[0] <<= 1; \ + res[1] <<= 1; \ + res[2] <<= 1; \ + res[3] <<= 1; \ + } + +#define SADS32_H(h) \ + void vpx_sad_skip_32x##h##x4d_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *const ref[4], \ + int ref_stride, uint32_t res[4]) { \ + sad32xhx4d_avx2(src, 2 * src_stride, ref, 2 * ref_stride, ((h) >> 1), \ + res); \ + res[0] <<= 1; \ + res[1] <<= 1; \ + res[2] <<= 1; \ + res[3] <<= 1; \ + } + +SADS64_H(64) +SADS64_H(32) + +SADS32_H(64) +SADS32_H(32) +SADS32_H(16) diff --git a/vpx_dsp/x86/sad4d_sse2.asm b/vpx_dsp/x86/sad4d_sse2.asm index 3f6e55ce9a..ed4ea3ef9b 100644 --- a/vpx_dsp/x86/sad4d_sse2.asm +++ b/vpx_dsp/x86/sad4d_sse2.asm @@ -179,13 +179,27 @@ SECTION .text ; uint8_t *ref[4], int ref_stride, ; uint32_t res[4]); ; where NxN = 64x64, 32x32, 16x16, 16x8, 8x16, 8x8, 8x4, 4x8 and 4x4 -%macro SADNXN4D 2 +%macro SADNXN4D 2-3 0 +%if %3 == 1 ; skip rows +%if UNIX64 +cglobal sad_skip_%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \ + res, ref2, ref3, ref4 +%else +cglobal sad_skip_%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ + ref2, ref3, ref4 +%endif +%else ; normal sad %if UNIX64 cglobal sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \ res, ref2, ref3, ref4 %else cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ ref2, ref3, ref4 +%endif +%endif +%if %3 == 1 + lea src_strided, [2*src_strided] + lea ref_strided, [2*ref_strided] %endif movsxdifnidn src_strideq, src_strided movsxdifnidn ref_strideq, ref_strided @@ -195,9 +209,15 @@ cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ mov ref1q, [ref1q+gprsize*0] PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1 -%rep (%2-4)/2 +%if %3 == 1 ; downsample number of rows by 2 +%define num_rep (%2-8)/4 +%else +%define num_rep (%2-4)/2 +%endif +%rep num_rep PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1 %endrep +%undef num_rep PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0 %if %1 > 4 @@ -211,12 +231,19 @@ cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ punpckhqdq m5, m7 movifnidn r4, r4mp paddd m4, m5 +%if %3 == 1 + pslld m4, 1 +%endif movu [r4], m4 RET %else movifnidn r4, r4mp pshufd m6, m6, 0x08 pshufd m7, m7, 0x08 +%if %3 == 1 + pslld m6, 1 + pslld m7, 1 +%endif movq [r4+0], m6 movq [r4+8], m7 RET @@ -237,3 +264,15 @@ SADNXN4D 8, 8 SADNXN4D 8, 4 SADNXN4D 4, 8 SADNXN4D 4, 4 + +SADNXN4D 64, 64, 1 +SADNXN4D 64, 32, 1 +SADNXN4D 32, 64, 1 +SADNXN4D 32, 32, 1 +SADNXN4D 32, 16, 1 +SADNXN4D 16, 32, 1 +SADNXN4D 16, 16, 1 +SADNXN4D 16, 8, 1 +SADNXN4D 8, 16, 1 +SADNXN4D 8, 8, 1 +SADNXN4D 4, 8, 1 diff --git a/vpx_dsp/x86/sad_avx2.c b/vpx_dsp/x86/sad_avx2.c index 29bedb0e6e..e00494d766 100644 --- a/vpx_dsp/x86/sad_avx2.c +++ b/vpx_dsp/x86/sad_avx2.c @@ -11,73 +11,104 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_ports/mem.h" +static INLINE unsigned int sad64xh_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h) { + int i, res; + __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; + __m256i sum_sad = _mm256_setzero_si256(); + __m256i sum_sad_h; + __m128i sum_sad128; + for (i = 0; i < h; i++) { + ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); + ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32)); + sad1_reg = + _mm256_sad_epu8(ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); + sad2_reg = _mm256_sad_epu8( + ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32))); + sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); + ref_ptr += ref_stride; + src_ptr += src_stride; + } + sum_sad_h = _mm256_srli_si256(sum_sad, 8); + sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); + sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); + sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); + res = _mm_cvtsi128_si32(sum_sad128); + return res; +} + +static INLINE unsigned int sad32xh_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h) { + int i, res; + __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; + __m256i sum_sad = _mm256_setzero_si256(); + __m256i sum_sad_h; + __m128i sum_sad128; + const int ref2_stride = ref_stride << 1; + const int src2_stride = src_stride << 1; + const int max = h >> 1; + for (i = 0; i < max; i++) { + ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); + ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); + sad1_reg = + _mm256_sad_epu8(ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); + sad2_reg = _mm256_sad_epu8( + ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride))); + sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); + ref_ptr += ref2_stride; + src_ptr += src2_stride; + } + sum_sad_h = _mm256_srli_si256(sum_sad, 8); + sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); + sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); + sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); + res = _mm_cvtsi128_si32(sum_sad128); + return res; +} + #define FSAD64_H(h) \ unsigned int vpx_sad64x##h##_avx2(const uint8_t *src_ptr, int src_stride, \ const uint8_t *ref_ptr, int ref_stride) { \ - int i; \ - __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \ - __m256i sum_sad = _mm256_setzero_si256(); \ - __m256i sum_sad_h; \ - __m128i sum_sad128; \ - for (i = 0; i < h; i++) { \ - ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \ - ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32)); \ - sad1_reg = _mm256_sad_epu8( \ - ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); \ - sad2_reg = _mm256_sad_epu8( \ - ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32))); \ - sum_sad = \ - _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \ - ref_ptr += ref_stride; \ - src_ptr += src_stride; \ - } \ - sum_sad_h = _mm256_srli_si256(sum_sad, 8); \ - sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \ - sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \ - sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \ - return (unsigned int)_mm_cvtsi128_si32(sum_sad128); \ + return sad64xh_avx2(src_ptr, src_stride, ref_ptr, ref_stride, h); \ + } + +#define FSADS64_H(h) \ + unsigned int vpx_sad_skip_64x##h##_avx2( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride) { \ + return 2 * sad64xh_avx2(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \ + h / 2); \ } #define FSAD32_H(h) \ unsigned int vpx_sad32x##h##_avx2(const uint8_t *src_ptr, int src_stride, \ const uint8_t *ref_ptr, int ref_stride) { \ - int i, res; \ - __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \ - __m256i sum_sad = _mm256_setzero_si256(); \ - __m256i sum_sad_h; \ - __m128i sum_sad128; \ - int ref2_stride = ref_stride << 1; \ - int src2_stride = src_stride << 1; \ - int max = h >> 1; \ - for (i = 0; i < max; i++) { \ - ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \ - ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); \ - sad1_reg = _mm256_sad_epu8( \ - ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); \ - sad2_reg = _mm256_sad_epu8( \ - ref2_reg, \ - _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride))); \ - sum_sad = \ - _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \ - ref_ptr += ref2_stride; \ - src_ptr += src2_stride; \ - } \ - sum_sad_h = _mm256_srli_si256(sum_sad, 8); \ - sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \ - sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \ - sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \ - res = _mm_cvtsi128_si32(sum_sad128); \ - return res; \ + return sad32xh_avx2(src_ptr, src_stride, ref_ptr, ref_stride, h); \ + } + +#define FSADS32_H(h) \ + unsigned int vpx_sad_skip_32x##h##_avx2( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride) { \ + return 2 * sad32xh_avx2(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \ + h / 2); \ } -#define FSAD64 \ - FSAD64_H(64) \ - FSAD64_H(32) +#define FSAD64 \ + FSAD64_H(64) \ + FSAD64_H(32) \ + FSADS64_H(64) \ + FSADS64_H(32) -#define FSAD32 \ - FSAD32_H(64) \ - FSAD32_H(32) \ - FSAD32_H(16) +#define FSAD32 \ + FSAD32_H(64) \ + FSAD32_H(32) \ + FSAD32_H(16) \ + FSADS32_H(64) \ + FSADS32_H(32) \ + FSADS32_H(16) FSAD64 FSAD32 @@ -86,6 +117,8 @@ FSAD32 #undef FSAD32 #undef FSAD64_H #undef FSAD32_H +#undef FSADS64_H +#undef FSADS32_H #define FSADAVG64_H(h) \ unsigned int vpx_sad64x##h##_avg_avx2( \ diff --git a/vpx_dsp/x86/sad_sse2.asm b/vpx_dsp/x86/sad_sse2.asm index e4e1bc3e98..627e463bf8 100644 --- a/vpx_dsp/x86/sad_sse2.asm +++ b/vpx_dsp/x86/sad_sse2.asm @@ -12,15 +12,29 @@ SECTION .text +; Macro Arguments +; Arg 1: Width +; Arg 2: Height +; Arg 3: Number of general purpose registers +; Arg 4: Type of function: if 0, normal sad; if 1, avg; if 2, skip rows %macro SAD_FN 4 -%if %4 == 0 +%if %4 == 0 ; normal sad %if %3 == 5 cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows %else ; %3 == 7 cglobal sad%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \ src_stride3, ref_stride3, n_rows %endif ; %3 == 5/7 -%else ; avg + +%elif %4 == 2 ; skip +%if %3 == 5 +cglobal sad_skip_%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows +%else ; %3 == 7 +cglobal sad_skip_%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \ + src_stride3, ref_stride3, n_rows +%endif ; %3 == 5/7 + +%else %if %3 == 5 cglobal sad%1x%2_avg, 5, 1 + %3, 5, src, src_stride, ref, ref_stride, \ second_pred, n_rows @@ -35,7 +49,11 @@ cglobal sad%1x%2_avg, 5, VPX_ARCH_X86_64 + %3, 6, src, src_stride, \ %define n_rowsd dword r0m %endif ; x86-32/64 %endif ; %3 == 5/7 -%endif ; avg/sad +%endif ; sad/avg/skip +%if %4 == 2; skip rows so double the stride +lea src_strided, [src_strided*2] +lea ref_strided, [ref_strided*2] +%endif ; %4 skip movsxdifnidn src_strideq, src_strided movsxdifnidn ref_strideq, ref_strided %if %3 == 7 @@ -48,7 +66,11 @@ cglobal sad%1x%2_avg, 5, VPX_ARCH_X86_64 + %3, 6, src, src_stride, \ ; uint8_t *ref, int ref_stride); %macro SAD64XN 1-2 0 SAD_FN 64, %1, 5, %2 +%if %2 == 2 + mov n_rowsd, %1/2 +%else mov n_rowsd, %1 +%endif pxor m0, m0 .loop: movu m1, [refq] @@ -77,6 +99,9 @@ cglobal sad%1x%2_avg, 5, VPX_ARCH_X86_64 + %3, 6, src, src_stride, \ movhlps m1, m0 paddd m0, m1 +%if %2 == 2 ; we skipped rows, so now we need to double the sad + pslld m0, 1 +%endif movd eax, m0 RET %endmacro @@ -86,12 +111,18 @@ SAD64XN 64 ; sad64x64_sse2 SAD64XN 32 ; sad64x32_sse2 SAD64XN 64, 1 ; sad64x64_avg_sse2 SAD64XN 32, 1 ; sad64x32_avg_sse2 +SAD64XN 64, 2 ; sad64x64_skip_sse2 +SAD64XN 32, 2 ; sad64x32_skip_sse2 ; unsigned int vpx_sad32x32_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); %macro SAD32XN 1-2 0 SAD_FN 32, %1, 5, %2 +%if %2 == 2 + mov n_rowsd, %1/4 +%else mov n_rowsd, %1/2 +%endif pxor m0, m0 .loop: movu m1, [refq] @@ -120,6 +151,9 @@ SAD64XN 32, 1 ; sad64x32_avg_sse2 movhlps m1, m0 paddd m0, m1 +%if %2 == 2 ; we skipped rows, so now we need to double the sad + pslld m0, 1 +%endif movd eax, m0 RET %endmacro @@ -131,12 +165,19 @@ SAD32XN 16 ; sad32x16_sse2 SAD32XN 64, 1 ; sad32x64_avg_sse2 SAD32XN 32, 1 ; sad32x32_avg_sse2 SAD32XN 16, 1 ; sad32x16_avg_sse2 +SAD32XN 64, 2 ; sad32x64_skip_sse2 +SAD32XN 32, 2 ; sad32x32_skip_sse2 +SAD32XN 16, 2 ; sad32x16_skip_sse2 ; unsigned int vpx_sad16x{8,16}_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); %macro SAD16XN 1-2 0 SAD_FN 16, %1, 7, %2 +%if %2 == 2 + mov n_rowsd, %1/8 +%else mov n_rowsd, %1/4 +%endif pxor m0, m0 .loop: @@ -166,6 +207,9 @@ SAD32XN 16, 1 ; sad32x16_avg_sse2 movhlps m1, m0 paddd m0, m1 +%if %2 == 2 ; we skipped rows, so now we need to double the sad + pslld m0, 1 +%endif movd eax, m0 RET %endmacro @@ -177,12 +221,19 @@ SAD16XN 8 ; sad16x8_sse2 SAD16XN 32, 1 ; sad16x32_avg_sse2 SAD16XN 16, 1 ; sad16x16_avg_sse2 SAD16XN 8, 1 ; sad16x8_avg_sse2 +SAD16XN 32, 2 ; sad16x32_skip_sse2 +SAD16XN 16, 2 ; sad16x16_skip_sse2 +SAD16XN 8, 2 ; sad16x8_skip_sse2 ; unsigned int vpx_sad8x{8,16}_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); %macro SAD8XN 1-2 0 SAD_FN 8, %1, 7, %2 +%if %2 == 2 + mov n_rowsd, %1/8 +%else mov n_rowsd, %1/4 +%endif pxor m0, m0 .loop: @@ -210,6 +261,9 @@ SAD16XN 8, 1 ; sad16x8_avg_sse2 movhlps m1, m0 paddd m0, m1 +%if %2 == 2 ; we skipped rows, so now we need to double the sad + pslld m0, 1 +%endif movd eax, m0 RET %endmacro @@ -221,12 +275,18 @@ SAD8XN 4 ; sad8x4_sse2 SAD8XN 16, 1 ; sad8x16_avg_sse2 SAD8XN 8, 1 ; sad8x8_avg_sse2 SAD8XN 4, 1 ; sad8x4_avg_sse2 +SAD8XN 16, 2 ; sad8x16_skip_sse2 +SAD8XN 8, 2 ; sad8x8_skip_sse2 ; unsigned int vpx_sad4x{4, 8}_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); %macro SAD4XN 1-2 0 SAD_FN 4, %1, 7, %2 +%if %2 == 2 + mov n_rowsd, %1/8 +%else mov n_rowsd, %1/4 +%endif pxor m0, m0 .loop: @@ -257,6 +317,9 @@ SAD8XN 4, 1 ; sad8x4_avg_sse2 movhlps m1, m0 paddd m0, m1 +%if %2 == 2 ; we skipped rows, so now we need to double the sad + pslld m0, 1 +%endif movd eax, m0 RET %endmacro @@ -266,3 +329,4 @@ SAD4XN 8 ; sad4x8_sse SAD4XN 4 ; sad4x4_sse SAD4XN 8, 1 ; sad4x8_avg_sse SAD4XN 4, 1 ; sad4x4_avg_sse +SAD4XN 8, 2 ; sad4x8_skip_sse From e3c458149cd46eefb601ec684deb0352b52b77a1 Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 11 Apr 2023 19:16:28 -0700 Subject: [PATCH 0727/1000] vp9_mbgraph: clear -Wshadow warnings Bug: webm:1793 Change-Id: Ibffb62775f09922d37f7d0460aa2751e74c36738 --- vp9/encoder/vp9_mbgraph.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vp9/encoder/vp9_mbgraph.c b/vp9/encoder/vp9_mbgraph.c index 7c2790cb98..9487fc5fae 100644 --- a/vp9/encoder/vp9_mbgraph.c +++ b/vp9/encoder/vp9_mbgraph.c @@ -98,8 +98,7 @@ static int do_16x16_motion_search(VP9_COMP *cpi, const MV *ref_mv, // If the current best reference mv is not centered on 0,0 then do a 0,0 // based search as well. if (ref_mv->row != 0 || ref_mv->col != 0) { - unsigned int tmp_err; - MV zero_ref_mv = { 0, 0 }, tmp_mv; + MV zero_ref_mv = { 0, 0 }; tmp_err = do_16x16_motion_iteration(cpi, &zero_ref_mv, &tmp_mv, mb_row, mb_col); From bde26b99611b5534ad4a67990817882e74a723fe Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 11 Apr 2023 19:23:27 -0700 Subject: [PATCH 0728/1000] vp9_ratectrl: clear -Wshadow warnings Bug: webm:1793 Change-Id: I2476a9d8e1d62414fdbe6feee87d5167058f499b --- vp9/encoder/vp9_ratectrl.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index d9207f7a2f..9e152629fb 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -1150,8 +1150,9 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const VP9_COMP *cpi, if (frame_is_intra_only(cm)) { if (oxcf->rc_mode == VPX_Q) { int qindex = cq_level; - double q = vp9_convert_qindex_to_q(qindex, cm->bit_depth); - int delta_qindex = vp9_compute_qdelta(rc, q, q * 0.25, cm->bit_depth); + double qstart = vp9_convert_qindex_to_q(qindex, cm->bit_depth); + int delta_qindex = + vp9_compute_qdelta(rc, qstart, qstart * 0.25, cm->bit_depth); active_best_quality = VPXMAX(qindex + delta_qindex, rc->best_quality); } else if (rc->this_key_frame_forced) { // Handle the special case for key frames forced when we have reached @@ -1206,12 +1207,14 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const VP9_COMP *cpi, } else if (oxcf->rc_mode == VPX_Q) { int qindex = cq_level; - double q = vp9_convert_qindex_to_q(qindex, cm->bit_depth); + double qstart = vp9_convert_qindex_to_q(qindex, cm->bit_depth); int delta_qindex; if (cpi->refresh_alt_ref_frame) - delta_qindex = vp9_compute_qdelta(rc, q, q * 0.40, cm->bit_depth); + delta_qindex = + vp9_compute_qdelta(rc, qstart, qstart * 0.40, cm->bit_depth); else - delta_qindex = vp9_compute_qdelta(rc, q, q * 0.50, cm->bit_depth); + delta_qindex = + vp9_compute_qdelta(rc, qstart, qstart * 0.50, cm->bit_depth); active_best_quality = VPXMAX(qindex + delta_qindex, rc->best_quality); } else { active_best_quality = get_gf_active_quality(cpi, q, cm->bit_depth); @@ -1219,11 +1222,12 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const VP9_COMP *cpi, } else { if (oxcf->rc_mode == VPX_Q) { int qindex = cq_level; - double q = vp9_convert_qindex_to_q(qindex, cm->bit_depth); + double qstart = vp9_convert_qindex_to_q(qindex, cm->bit_depth); double delta_rate[FIXED_GF_INTERVAL] = { 0.50, 1.0, 0.85, 1.0, 0.70, 1.0, 0.85, 1.0 }; int delta_qindex = vp9_compute_qdelta( - rc, q, q * delta_rate[cm->current_video_frame % FIXED_GF_INTERVAL], + rc, qstart, + qstart * delta_rate[cm->current_video_frame % FIXED_GF_INTERVAL], cm->bit_depth); active_best_quality = VPXMAX(qindex + delta_qindex, rc->best_quality); } else { @@ -1859,8 +1863,7 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) { rc->avg_frame_qindex[KEY_FRAME] = ROUND_POWER_OF_TWO(3 * rc->avg_frame_qindex[KEY_FRAME] + qindex, 2); if (cpi->use_svc) { - int i = 0; - SVC *svc = &cpi->svc; + int i; for (i = 0; i < svc->number_temporal_layers; ++i) { const int layer = LAYER_IDS_TO_IDX(svc->spatial_layer_id, i, svc->number_temporal_layers); From f254e6da84d564a5fbd2da7e0e1b31d81ba1dfba Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 11 Apr 2023 19:27:03 -0700 Subject: [PATCH 0729/1000] vp9_speed_features: clear -Wshadow warning Bug: webm:1793 Change-Id: I9f509c4461631e358f80b98afbb745ce88e9d7a2 --- vp9/encoder/vp9_speed_features.c | 1 - 1 file changed, 1 deletion(-) diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c index 0522d4ec97..034673b491 100644 --- a/vp9/encoder/vp9_speed_features.c +++ b/vp9/encoder/vp9_speed_features.c @@ -396,7 +396,6 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, } if (speed >= 5) { - int i; sf->optimize_coefficients = 0; sf->mv.search_method = HEX; sf->disable_filter_search_var_thresh = 500; From aaffc6e306a2b5d6aeb0b673677a3b1bc1a6ff6d Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 12 Apr 2023 13:31:35 -0700 Subject: [PATCH 0730/1000] vp9_pickmode: clear -Wshadow warnings Bug: webm:1793 Change-Id: I26c063818144d11c4c91165c3fcbf6f258453cc7 --- vp9/encoder/vp9_pickmode.c | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c index c19d57d15d..fa88cd79da 100644 --- a/vp9/encoder/vp9_pickmode.c +++ b/vp9/encoder/vp9_pickmode.c @@ -566,23 +566,26 @@ static void model_rd_for_sb_y_large(VP9_COMP *cpi, BLOCK_SIZE bsize, // Transform skipping test in UV planes. for (i = 1; i <= 2; i++) { - struct macroblock_plane *const p = &x->plane[i]; - struct macroblockd_plane *const pd = &xd->plane[i]; - const TX_SIZE uv_tx_size = get_uv_tx_size(xd->mi[0], pd); + struct macroblock_plane *const p_uv = &x->plane[i]; + struct macroblockd_plane *const pd_uv = &xd->plane[i]; + const TX_SIZE uv_tx_size = get_uv_tx_size(xd->mi[0], pd_uv); const BLOCK_SIZE unit_size = txsize_to_bsize[uv_tx_size]; - const BLOCK_SIZE uv_bsize = get_plane_block_size(bsize, pd); + const BLOCK_SIZE uv_bsize = get_plane_block_size(bsize, pd_uv); const int uv_bw = b_width_log2_lookup[uv_bsize]; const int uv_bh = b_height_log2_lookup[uv_bsize]; const int sf = (uv_bw - b_width_log2_lookup[unit_size]) + (uv_bh - b_height_log2_lookup[unit_size]); - const uint32_t uv_dc_thr = pd->dequant[0] * pd->dequant[0] >> (6 - sf); - const uint32_t uv_ac_thr = pd->dequant[1] * pd->dequant[1] >> (6 - sf); + const uint32_t uv_dc_thr = + pd_uv->dequant[0] * pd_uv->dequant[0] >> (6 - sf); + const uint32_t uv_ac_thr = + pd_uv->dequant[1] * pd_uv->dequant[1] >> (6 - sf); int j = i - 1; vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, i); flag_preduv_computed[i - 1] = 1; - var_uv[j] = cpi->fn_ptr[uv_bsize].vf( - p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, &sse_uv[j]); + var_uv[j] = cpi->fn_ptr[uv_bsize].vf(p_uv->src.buf, p_uv->src.stride, + pd_uv->dst.buf, pd_uv->dst.stride, + &sse_uv[j]); if ((var_uv[j] < uv_ac_thr || var_uv[j] == 0) && (sse_uv[j] - var_uv[j] < uv_dc_thr || sse_uv[j] == var_uv[j])) @@ -1933,15 +1936,15 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, if (cpi->use_svc && svc->force_zero_mode_spatial_ref && svc->spatial_layer_id > 0 && !gf_temporal_ref) { if (cpi->ref_frame_flags & VP9_LAST_FLAG) { - struct scale_factors *const sf = &cm->frame_refs[LAST_FRAME - 1].sf; - if (vp9_is_scaled(sf)) { + struct scale_factors *const ref_sf = &cm->frame_refs[LAST_FRAME - 1].sf; + if (vp9_is_scaled(ref_sf)) { svc_force_zero_mode[LAST_FRAME - 1] = 1; inter_layer_ref = LAST_FRAME; } } if (cpi->ref_frame_flags & VP9_GOLD_FLAG) { - struct scale_factors *const sf = &cm->frame_refs[GOLDEN_FRAME - 1].sf; - if (vp9_is_scaled(sf)) { + struct scale_factors *const ref_sf = &cm->frame_refs[GOLDEN_FRAME - 1].sf; + if (vp9_is_scaled(ref_sf)) { svc_force_zero_mode[GOLDEN_FRAME - 1] = 1; inter_layer_ref = GOLDEN_FRAME; } @@ -2772,9 +2775,10 @@ void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, int mi_row, if ((cpi->ref_frame_flags & ref_frame_to_flag(ref_frame)) && (yv12 != NULL)) { int_mv *const candidates = mbmi_ext->ref_mvs[ref_frame]; - const struct scale_factors *const sf = &cm->frame_refs[ref_frame - 1].sf; - vp9_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf, - sf); + const struct scale_factors *const ref_sf = + &cm->frame_refs[ref_frame - 1].sf; + vp9_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, ref_sf, + ref_sf); vp9_find_mv_refs(cm, xd, xd->mi[0], ref_frame, candidates, mi_row, mi_col, mbmi_ext->mode_context); From 2513f6d5f4c9af6e3d715acf83c0e25d1560398e Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 12 Apr 2023 13:41:46 -0700 Subject: [PATCH 0731/1000] vp9_svc_layercontext: clear -Wshadow warnings Bug: webm:1793 Change-Id: I63669de9835713ec70dafa88ca8f2c2459e59698 --- vp9/encoder/vp9_svc_layercontext.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c index 83b6e5c99d..f08d668203 100644 --- a/vp9/encoder/vp9_svc_layercontext.c +++ b/vp9/encoder/vp9_svc_layercontext.c @@ -107,7 +107,6 @@ void vp9_init_layer_context(VP9_COMP *const cpi) { int layer = LAYER_IDS_TO_IDX(sl, tl, oxcf->ts_number_layers); LAYER_CONTEXT *const lc = &svc->layer_context[layer]; RATE_CONTROL *const lrc = &lc->rc; - int i; lc->current_video_frame_in_layer = 0; lc->layer_size = 0; lc->frames_from_key_frame = 0; @@ -799,9 +798,9 @@ int vp9_one_pass_svc_start_layer(VP9_COMP *const cpi) { for (sl = svc->number_spatial_layers - 1; sl >= svc->first_spatial_layer_to_encode; sl--) { int layer = sl * svc->number_temporal_layers + svc->temporal_layer_id; - LAYER_CONTEXT *const lc = &svc->layer_context[layer]; - cpi->rc = lc->rc; - cpi->oxcf.target_bandwidth = lc->target_bandwidth; + LAYER_CONTEXT *const sl_lc = &svc->layer_context[layer]; + cpi->rc = sl_lc->rc; + cpi->oxcf.target_bandwidth = sl_lc->target_bandwidth; if (vp9_test_drop(cpi)) { int sl2; // Set flag to force drop in encoding for this mode. @@ -1050,17 +1049,17 @@ void vp9_svc_check_reset_layer_rc_flag(VP9_COMP *const cpi) { int sl, tl; for (sl = 0; sl < svc->number_spatial_layers; ++sl) { // Check for reset based on avg_frame_bandwidth for spatial layer sl. - int layer = LAYER_IDS_TO_IDX(sl, svc->number_temporal_layers - 1, - svc->number_temporal_layers); - LAYER_CONTEXT *lc = &svc->layer_context[layer]; + const int spatial_layer_idx = LAYER_IDS_TO_IDX( + sl, svc->number_temporal_layers - 1, svc->number_temporal_layers); + LAYER_CONTEXT *lc = &svc->layer_context[spatial_layer_idx]; RATE_CONTROL *lrc = &lc->rc; if (lrc->avg_frame_bandwidth > (3 * lrc->last_avg_frame_bandwidth >> 1) || lrc->avg_frame_bandwidth < (lrc->last_avg_frame_bandwidth >> 1)) { // Reset for all temporal layers with spatial layer sl. for (tl = 0; tl < svc->number_temporal_layers; ++tl) { - int layer = LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers); - LAYER_CONTEXT *lc = &svc->layer_context[layer]; - RATE_CONTROL *lrc = &lc->rc; + int temporal_layer_idx = + LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers); + lrc = &svc->layer_context[temporal_layer_idx].rc; lrc->rc_1_frame = 0; lrc->rc_2_frame = 0; lrc->bits_off_target = lrc->optimal_buffer_level; From 39a6b6c1364cf9c03ce947f9a98b40b63aef28ca Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 12 Apr 2023 13:46:26 -0700 Subject: [PATCH 0732/1000] vp9_temporal_filter: clear -Wshadow warnings Bug: webm:1793 Change-Id: Ia681ce636ae99f95b875ee1b0189bc6fa66a7608 --- vp9/encoder/vp9_temporal_filter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c index 8af30c42aa..986553a4a8 100644 --- a/vp9/encoder/vp9_temporal_filter.c +++ b/vp9/encoder/vp9_temporal_filter.c @@ -450,8 +450,6 @@ void vp9_highbd_apply_temporal_filter_c( // Apply the filter to luma for (row = 0; row < (int)block_height; row++) { for (col = 0; col < (int)block_width; col++) { - const int uv_row = row >> ss_y; - const int uv_col = col >> ss_x; const int filter_weight = get_filter_weight( row, col, block_height, block_width, blk_fw, use_32x32); @@ -476,6 +474,8 @@ void vp9_highbd_apply_temporal_filter_c( // Sum the corresponding uv pixels to the current y modifier // Note we are rounding down instead of rounding to the nearest pixel. + uv_row = row >> ss_y; + uv_col = col >> ss_x; y_mod += u_diff_sse[uv_row * uv_diff_stride + uv_col]; y_mod += v_diff_sse[uv_row * uv_diff_stride + uv_col]; From ff4123215df48bfb6e90eac1691dc70611446dfb Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 12 Apr 2023 13:54:02 -0700 Subject: [PATCH 0733/1000] vp9_frame_scale_ssse3: clear -Wshadow warnings Bug: webm:1793 Change-Id: I85608ac7bb6d3a61649ba342c13c3bf6a39a5dea --- vp9/encoder/x86/vp9_frame_scale_ssse3.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/vp9/encoder/x86/vp9_frame_scale_ssse3.c b/vp9/encoder/x86/vp9_frame_scale_ssse3.c index bf0e8b121f..94506aad0f 100644 --- a/vp9/encoder/x86/vp9_frame_scale_ssse3.c +++ b/vp9/encoder/x86/vp9_frame_scale_ssse3.c @@ -469,18 +469,18 @@ static void scale_plane_4_to_3_general(const uint8_t *src, const int src_stride, // It's used to choose the src offset and filter coefficient offset. const int offset_idx1 = (offset1_q4 >> 4) & 1; const int offset_idx2 = (offset2_q4 >> 4) & 1; - static const shuffle_filter_funcs shuffle_filter_funcs[2] = { + static const shuffle_filter_funcs kShuffleFilterFuncs[2] = { shuffle_filter_ssse3, shuffle_filter_odd_ssse3 }; - static const convolve8_funcs convolve8_funcs[2] = { + static const convolve8_funcs kConvolve8Funcs[2] = { convolve8_8_even_offset_ssse3, convolve8_8_odd_offset_ssse3 }; assert(w && h); shuffle_filter_ssse3(coef[(phase_scaler + 0 * step_q4) & SUBPEL_MASK], f0); - shuffle_filter_funcs[offset_idx1](coef[offset1_q4 & SUBPEL_MASK], f1); - shuffle_filter_funcs[offset_idx2](coef[offset2_q4 & SUBPEL_MASK], f2); + kShuffleFilterFuncs[offset_idx1](coef[offset1_q4 & SUBPEL_MASK], f1); + kShuffleFilterFuncs[offset_idx2](coef[offset2_q4 & SUBPEL_MASK], f2); // Sub 64 to avoid overflow. // Coef 128 would be treated as -128 in PMADDUBSW. Sub 64 here. @@ -522,11 +522,11 @@ static void scale_plane_4_to_3_general(const uint8_t *src, const int src_stride, // 04 14 24 34 44 54 64 74 // 05 15 25 35 45 55 65 75 d[0] = convolve8_8_even_offset_ssse3(&s[0], f0); - d[1] = convolve8_funcs[offset_idx1](&s[offset1_q4 >> 5], f1); - d[2] = convolve8_funcs[offset_idx2](&s[offset2_q4 >> 5], f2); + d[1] = kConvolve8Funcs[offset_idx1](&s[offset1_q4 >> 5], f1); + d[2] = kConvolve8Funcs[offset_idx2](&s[offset2_q4 >> 5], f2); d[3] = convolve8_8_even_offset_ssse3(&s[2], f0); - d[4] = convolve8_funcs[offset_idx1](&s[2 + (offset1_q4 >> 5)], f1); - d[5] = convolve8_funcs[offset_idx2](&s[2 + (offset2_q4 >> 5)], f2); + d[4] = kConvolve8Funcs[offset_idx1](&s[2 + (offset1_q4 >> 5)], f1); + d[5] = kConvolve8Funcs[offset_idx2](&s[2 + (offset2_q4 >> 5)], f2); // 00 10 20 30 40 50 60 70 02 12 22 32 42 52 62 72 // 01 11 21 31 41 51 61 71 03 13 23 33 43 53 63 73 @@ -598,11 +598,11 @@ static void scale_plane_4_to_3_general(const uint8_t *src, const int src_stride, loadu_8bit_16x4(t, stride_hor, &s[4]); d[0] = convolve8_8_even_offset_ssse3(&s[0], f0); - d[1] = convolve8_funcs[offset_idx1](&s[offset1_q4 >> 5], f1); - d[2] = convolve8_funcs[offset_idx2](&s[offset2_q4 >> 5], f2); + d[1] = kConvolve8Funcs[offset_idx1](&s[offset1_q4 >> 5], f1); + d[2] = kConvolve8Funcs[offset_idx2](&s[offset2_q4 >> 5], f2); d[3] = convolve8_8_even_offset_ssse3(&s[2], f0); - d[4] = convolve8_funcs[offset_idx1](&s[2 + (offset1_q4 >> 5)], f1); - d[5] = convolve8_funcs[offset_idx2](&s[2 + (offset2_q4 >> 5)], f2); + d[4] = kConvolve8Funcs[offset_idx1](&s[2 + (offset1_q4 >> 5)], f1); + d[5] = kConvolve8Funcs[offset_idx2](&s[2 + (offset2_q4 >> 5)], f2); // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17 // 20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37 From 698eb779f27d93b0c577009358cc94b1d4770b8e Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 12 Apr 2023 13:56:39 -0700 Subject: [PATCH 0734/1000] convolve_test: clear -Wshadow warning Bug: webm:1793 Change-Id: I22db73cb756c6c680b73684caef1e08bb6e729d8 --- test/convolve_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/convolve_test.cc b/test/convolve_test.cc index d569048691..5a17d80894 100644 --- a/test/convolve_test.cc +++ b/test/convolve_test.cc @@ -244,7 +244,7 @@ void highbd_filter_block2d_8_c(const uint16_t *src_ptr, // Vertical pass (transposed intermediate -> dst). { - uint16_t *src_ptr = intermediate_buffer; + src_ptr = intermediate_buffer; const int dst_next_row_stride = dst_stride - output_width; unsigned int i, j; for (i = 0; i < output_height; ++i) { From 968960c7b346c05c4a41bffc4f48952c4ef127b8 Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 12 Apr 2023 14:00:11 -0700 Subject: [PATCH 0735/1000] dct_test: clear -Wshadow warnings Bug: webm:1793 Change-Id: I571a9d641b2f7f4b9d7c473ca815d4ea10b9f9af --- test/dct_test.cc | 8 -------- 1 file changed, 8 deletions(-) diff --git a/test/dct_test.cc b/test/dct_test.cc index 9a150a24f1..235c407237 100644 --- a/test/dct_test.cc +++ b/test/dct_test.cc @@ -358,14 +358,6 @@ class TransTestBase : public ::testing::TestWithParam { ASSERT_TRUE(in.Init()); Buffer coeff = Buffer(size_, size_, 0, 16); ASSERT_TRUE(coeff.Init()); - Buffer dst = Buffer(size_, size_, 0, 16); - ASSERT_TRUE(dst.Init()); - Buffer src = Buffer(size_, size_, 0); - ASSERT_TRUE(src.Init()); - Buffer dst16 = Buffer(size_, size_, 0, 16); - ASSERT_TRUE(dst16.Init()); - Buffer src16 = Buffer(size_, size_, 0); - ASSERT_TRUE(src16.Init()); for (int i = 0; i < count_test_block; ++i) { InitMem(); From a3eb39ab6f22a09d06591db050bb07ede95fcd88 Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 12 Apr 2023 14:02:15 -0700 Subject: [PATCH 0736/1000] svc_encodeframe: clear -Wshadow warnings Bug: webm:1793 Change-Id: Ib65a2dff124034d8e653572f8ada65984e55ed70 --- examples/svc_encodeframe.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/svc_encodeframe.c b/examples/svc_encodeframe.c index c2b3ec9798..1dd731765c 100644 --- a/examples/svc_encodeframe.c +++ b/examples/svc_encodeframe.c @@ -381,7 +381,7 @@ vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx, vpx_codec_iface_t *iface, vpx_codec_enc_cfg_t *enc_cfg) { vpx_codec_err_t res; - int i, sl, tl; + int sl, tl; SvcInternal_t *const si = get_svc_internal(svc_ctx); if (svc_ctx == NULL || codec_ctx == NULL || iface == NULL || enc_cfg == NULL) { @@ -433,7 +433,7 @@ vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx, } for (tl = 0; tl < svc_ctx->temporal_layers; ++tl) { for (sl = 0; sl < svc_ctx->spatial_layers; ++sl) { - i = sl * svc_ctx->temporal_layers + tl; + const int i = sl * svc_ctx->temporal_layers + tl; si->svc_params.max_quantizers[i] = MAX_QUANTIZER; si->svc_params.min_quantizers[i] = 0; if (enc_cfg->rc_end_usage == VPX_CBR && @@ -503,7 +503,7 @@ vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx, for (tl = 0; tl < svc_ctx->temporal_layers; ++tl) { for (sl = 0; sl < svc_ctx->spatial_layers; ++sl) { - i = sl * svc_ctx->temporal_layers + tl; + const int i = sl * svc_ctx->temporal_layers + tl; if (enc_cfg->rc_end_usage == VPX_CBR && enc_cfg->g_pass == VPX_RC_ONE_PASS) { si->svc_params.max_quantizers[i] = enc_cfg->rc_max_quantizer; From 556e4f6cadef38727ab2e83050915a5eee6584a4 Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 12 Apr 2023 14:52:33 -0700 Subject: [PATCH 0737/1000] vpxdec: clear -Wshadow warnings Bug: webm:1793 Change-Id: I0b7f013682229cde50df7c62db9dab6eab0fd341 --- vpxdec.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/vpxdec.c b/vpxdec.c index 84cef7dfd4..54a41f0799 100644 --- a/vpxdec.c +++ b/vpxdec.c @@ -996,7 +996,7 @@ static int main_loop(int argc, const char **argv_) { if (single_file) { if (use_y4m) { - char buf[Y4M_BUFFER_SIZE] = { 0 }; + char y4m_buf[Y4M_BUFFER_SIZE] = { 0 }; size_t len = 0; if (img->fmt == VPX_IMG_FMT_I440 || img->fmt == VPX_IMG_FMT_I44016) { fprintf(stderr, "Cannot produce y4m output for 440 sampling.\n"); @@ -1005,21 +1005,22 @@ static int main_loop(int argc, const char **argv_) { if (frame_out == 1) { // Y4M file header len = y4m_write_file_header( - buf, sizeof(buf), vpx_input_ctx.width, vpx_input_ctx.height, - &vpx_input_ctx.framerate, img->fmt, img->bit_depth); + y4m_buf, sizeof(y4m_buf), vpx_input_ctx.width, + vpx_input_ctx.height, &vpx_input_ctx.framerate, img->fmt, + img->bit_depth); if (do_md5) { - MD5Update(&md5_ctx, (md5byte *)buf, (unsigned int)len); + MD5Update(&md5_ctx, (md5byte *)y4m_buf, (unsigned int)len); } else { - fputs(buf, outfile); + fputs(y4m_buf, outfile); } } // Y4M frame header - len = y4m_write_frame_header(buf, sizeof(buf)); + len = y4m_write_frame_header(y4m_buf, sizeof(y4m_buf)); if (do_md5) { - MD5Update(&md5_ctx, (md5byte *)buf, (unsigned int)len); + MD5Update(&md5_ctx, (md5byte *)y4m_buf, (unsigned int)len); } else { - fputs(buf, outfile); + fputs(y4m_buf, outfile); } } else { if (frame_out == 1) { From 6c65608253ef6ff7786416d54985d0e27a286798 Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 12 Apr 2023 14:52:44 -0700 Subject: [PATCH 0738/1000] vpxenc: clear -Wshadow warnings Bug: webm:1793 Change-Id: I2a26c9297016d3fa2c32e8974ef3d7dab1e524c4 --- vpxenc.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/vpxenc.c b/vpxenc.c index 61672acadd..38d69a1923 100644 --- a/vpxenc.c +++ b/vpxenc.c @@ -1586,13 +1586,14 @@ static void test_decode(struct stream_state *stream, /* Get the internal reference frame */ if (strcmp(codec->name, "vp8") == 0) { struct vpx_ref_frame ref_enc, ref_dec; - int width, height; + int aligned_width = (stream->config.cfg.g_w + 15) & ~15; + int aligned_height = (stream->config.cfg.g_h + 15) & ~15; - width = (stream->config.cfg.g_w + 15) & ~15; - height = (stream->config.cfg.g_h + 15) & ~15; - vpx_img_alloc(&ref_enc.img, VPX_IMG_FMT_I420, width, height, 1); + vpx_img_alloc(&ref_enc.img, VPX_IMG_FMT_I420, aligned_width, aligned_height, + 1); enc_img = ref_enc.img; - vpx_img_alloc(&ref_dec.img, VPX_IMG_FMT_I420, width, height, 1); + vpx_img_alloc(&ref_dec.img, VPX_IMG_FMT_I420, aligned_width, aligned_height, + 1); dec_img = ref_dec.img; ref_enc.frame_type = VP8_LAST_FRAME; @@ -1969,10 +1970,9 @@ int main(int argc, const char **argv_) { } else { const int64_t input_pos = ftello(input.file); const int64_t input_pos_lagged = input_pos - lagged_count; - const int64_t limit = input.length; rate = cx_time ? input_pos_lagged * (int64_t)1000000 / cx_time : 0; - remaining = limit - input_pos + lagged_count; + remaining = input.length - input_pos + lagged_count; } average_rate = From 536c9867644c2307986345efb10f3b566158ab63 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Thu, 13 Apr 2023 16:52:51 -0400 Subject: [PATCH 0739/1000] Add VP8RateControlRTC::GetLoopfilterLevel New linear model to calculate loopfilter level from frame qp. Linear regression was done on qvga, vga, and hd clips. Bug: b/275304642 Change-Id: I552b312212bb4de21b53b762d139aa9588c64ae2 --- vp8/vp8_ratectrl_rtc.cc | 28 ++++++++++++++++++++++++++++ vp8/vp8_ratectrl_rtc.h | 3 +++ 2 files changed, 31 insertions(+) diff --git a/vp8/vp8_ratectrl_rtc.cc b/vp8/vp8_ratectrl_rtc.cc index 65c58536aa..60bc258a6f 100644 --- a/vp8/vp8_ratectrl_rtc.cc +++ b/vp8/vp8_ratectrl_rtc.cc @@ -294,6 +294,34 @@ void VP8RateControlRTC::ComputeQP(const VP8FrameParamsQpRTC &frame_params) { int VP8RateControlRTC::GetQP() const { return q_; } +int VP8RateControlRTC::GetLoopfilterLevel() const { + VP8_COMMON *cm = &cpi_->common; + const double qp = q_; + + // This model is from linear regression + if (cm->Width * cm->Height <= 320 * 240) { + cm->filter_level = static_cast(0.352685 * qp + 2.957774); + } else if (cm->Width * cm->Height <= 640 * 480) { + cm->filter_level = static_cast(0.485069 * qp - 0.534462); + } else { + cm->filter_level = static_cast(0.314875 * qp + 7.959003); + } + + int min_filter_level = 0; + // This logic is from get_min_filter_level() in picklpf.c + if (q_ > 6 && q_ <= 16) { + min_filter_level = 1; + } else { + min_filter_level = (q_ / 8); + } + + const int max_filter_level = 63; + if (cm->filter_level < min_filter_level) cm->filter_level = min_filter_level; + if (cm->filter_level > max_filter_level) cm->filter_level = max_filter_level; + + return cm->filter_level; +} + void VP8RateControlRTC::PostEncodeUpdate(uint64_t encoded_frame_size) { VP8_COMMON *const cm = &cpi_->common; vpx_clear_system_state(); diff --git a/vp8/vp8_ratectrl_rtc.h b/vp8/vp8_ratectrl_rtc.h index a8a886c56e..496ef9eaad 100644 --- a/vp8/vp8_ratectrl_rtc.h +++ b/vp8/vp8_ratectrl_rtc.h @@ -42,6 +42,9 @@ class VP8RateControlRTC { bool UpdateRateControl(const VP8RateControlRtcConfig &rc_cfg); // GetQP() needs to be called after ComputeQP() to get the latest QP int GetQP() const; + // GetLoopfilterLevel() needs to be called after ComputeQP() since loopfilter + // level is calculated from frame qp. + int GetLoopfilterLevel() const; // int GetLoopfilterLevel() const; void ComputeQP(const VP8FrameParamsQpRTC &frame_params); // Feedback to rate control with the size of current encoded frame From dca0a8b86052f9ed55173a315cef97c584b5e1cb Mon Sep 17 00:00:00 2001 From: "L. E. Segovia" Date: Mon, 10 Apr 2023 19:08:54 -0300 Subject: [PATCH 0740/1000] libs.mk: Fix wrong scope end comments I believe the following comments are wrongly scoped, possibly left over from previous changesets. This made me very confused when reading the test suite Makefile, in order to port it to Meson. Change-Id: Ice3c7ba50c6909a9c7dfd4001afa1e1ddfa4b5ce --- libs.mk | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/libs.mk b/libs.mk index 1f7f03aa38..92cf5509fb 100644 --- a/libs.mk +++ b/libs.mk @@ -631,8 +631,8 @@ test_rc_interface.$(VCPROJ_SFX): $(RC_INTERFACE_TEST_SRCS) vpx.$(VCPROJ_SFX) \ -I. -I"$(SRC_PATH_BARE)/third_party/googletest/src/include" \ -L. -l$(CODEC_LIB) -l$(RC_RTC_LIB) -l$(GTEST_LIB) $^ endif # RC_INTERFACE_TEST -endif # CONFIG_VP9_ENCODER -endif +endif # CONFIG_ENCODERS +endif # CONFIG_MSVS else include $(SRC_PATH_BARE)/third_party/googletest/gtest.mk @@ -699,7 +699,7 @@ $(eval $(call linkerxx_template,$(SIMPLE_ENCODE_TEST_BIN), \ -L. -lsimple_encode -lvpx -lgtest $(extralibs) -lm)) endif # SIMPLE_ENCODE_TEST -endif # CONFIG_UNIT_TESTS +endif # CONFIG_EXTERNAL_BUILD # Install test sources only if codec source is included INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(patsubst $(SRC_PATH_BARE)/%,%,\ @@ -724,7 +724,7 @@ NUM_SHARDS := 10 SHARDS := 0 1 2 3 4 5 6 7 8 9 $(foreach s,$(SHARDS),$(eval $(call test_shard_template,$(s),$(NUM_SHARDS)))) -endif +endif # CONFIG_UNIT_TESTS ## ## documentation directives From bdba4591a7daaeb8068fcb99d86d010fa59d6c94 Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 12 Apr 2023 11:47:49 -0700 Subject: [PATCH 0741/1000] vp9_rdcost: clear -Wshadow warnings Bug: webm:1793 Change-Id: I6d48038d74e510ecb5773dfffbdc4c10b765c2aa --- vp9/encoder/vp9_rdopt.c | 157 ++++++++++++++++++++-------------------- 1 file changed, 77 insertions(+), 80 deletions(-) diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index 9121eeac15..c68cfefdea 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -588,15 +588,15 @@ static void dist_block(const VP9_COMP *cpi, MACROBLOCK *x, int plane, if (x->skip_encode && !is_inter_block(xd->mi[0])) { // TODO(jingning): tune the model to better capture the distortion. - const int64_t p = + const int64_t mean_quant_error = (pd->dequant[1] * pd->dequant[1] * (1 << ss_txfrm_size)) >> #if CONFIG_VP9_HIGHBITDEPTH (shift + 2 + (bd - 8) * 2); #else (shift + 2); #endif // CONFIG_VP9_HIGHBITDEPTH - *out_dist += (p >> 4); - *out_sse += p; + *out_dist += (mean_quant_error >> 4); + *out_sse += mean_quant_error; } } else { const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size]; @@ -785,13 +785,12 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col, const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize]; const int16_t *const diff = &p->src_diff[4 * (blk_row * diff_stride + blk_col)]; - const int enable_trellis_opt = + const int use_trellis_opt = do_trellis_opt(pd, diff, diff_stride, blk_row, blk_col, plane_bsize, tx_size, &encode_b_arg); // full forward transform and quantization vp9_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, tx_size); - if (enable_trellis_opt) - vp9_optimize_b(x, plane, block, tx_size, coeff_ctx); + if (use_trellis_opt) vp9_optimize_b(x, plane, block, tx_size, coeff_ctx); dist_block(args->cpi, x, plane, plane_bsize, block, blk_row, blk_col, tx_size, &dist, &sse, recon, sse_calc_done); } else if (skip_txfm_flag == SKIP_TXFM_AC_ONLY) { @@ -1436,7 +1435,6 @@ static int super_block_uvrd(const VP9_COMP *cpi, MACROBLOCK *x, int *rate, if (ref_best_rd < 0) is_cost_valid = 0; if (is_inter_block(mi) && is_cost_valid) { - int plane; for (plane = 1; plane < MAX_MB_PLANE; ++plane) vp9_subtract_plane(x, bsize, plane); } @@ -2070,7 +2068,7 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, static int64_t rd_pick_best_sub8x8_mode( VP9_COMP *cpi, MACROBLOCK *x, int_mv *best_ref_mv, - int_mv *second_best_ref_mv, int64_t best_rd, int *returntotrate, + int_mv *second_best_ref_mv, int64_t best_rd_so_far, int *returntotrate, int *returnyrate, int64_t *returndistortion, int *skippable, int64_t *psse, int mvthresh, int_mv seg_mvs[4][MAX_REF_FRAMES], BEST_SEG_INFO *bsi_buf, int filter_idx, int mi_row, int mi_col) { @@ -2103,7 +2101,7 @@ static int64_t rd_pick_best_sub8x8_mode( vp9_zero(*bsi); - bsi->segment_rd = best_rd; + bsi->segment_rd = best_rd_so_far; bsi->ref_mv[0] = best_ref_mv; bsi->ref_mv[1] = second_best_ref_mv; bsi->mvp.as_int = best_ref_mv->as_int; @@ -2129,14 +2127,14 @@ static int64_t rd_pick_best_sub8x8_mode( int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES]; PREDICTION_MODE mode_selected = ZEROMV; int64_t best_rd = INT64_MAX; - const int i = idy * 2 + idx; + const int block = idy * 2 + idx; int ref; for (ref = 0; ref < 1 + has_second_rf; ++ref) { const MV_REFERENCE_FRAME frame = mi->ref_frame[ref]; frame_mv[ZEROMV][frame].as_int = 0; vp9_append_sub8x8_mvs_for_idx( - cm, xd, i, ref, mi_row, mi_col, &frame_mv[NEARESTMV][frame], + cm, xd, block, ref, mi_row, mi_col, &frame_mv[NEARESTMV][frame], &frame_mv[NEARMV][frame], mbmi_ext->mode_context); } @@ -2146,7 +2144,7 @@ static int64_t rd_pick_best_sub8x8_mode( struct buf_2d orig_pre[2]; mode_idx = INTER_OFFSET(this_mode); - bsi->rdstat[i][mode_idx].brdcost = INT64_MAX; + bsi->rdstat[block][mode_idx].brdcost = INT64_MAX; if (!(inter_mode_mask & (1 << this_mode))) continue; if (!check_best_zero_mv(cpi, mbmi_ext->mode_context, frame_mv, @@ -2154,14 +2152,14 @@ static int64_t rd_pick_best_sub8x8_mode( continue; memcpy(orig_pre, pd->pre, sizeof(orig_pre)); - memcpy(bsi->rdstat[i][mode_idx].ta, t_above, - sizeof(bsi->rdstat[i][mode_idx].ta)); - memcpy(bsi->rdstat[i][mode_idx].tl, t_left, - sizeof(bsi->rdstat[i][mode_idx].tl)); + memcpy(bsi->rdstat[block][mode_idx].ta, t_above, + sizeof(bsi->rdstat[block][mode_idx].ta)); + memcpy(bsi->rdstat[block][mode_idx].tl, t_left, + sizeof(bsi->rdstat[block][mode_idx].tl)); // motion search for newmv (single predictor case only) if (!has_second_rf && this_mode == NEWMV && - seg_mvs[i][mi->ref_frame[0]].as_int == INVALID_MV) { + seg_mvs[block][mi->ref_frame[0]].as_int == INVALID_MV) { MV *const new_mv = &mode_mv[NEWMV][0].as_mv; int step_param = 0; uint32_t bestsme = UINT_MAX; @@ -2177,12 +2175,13 @@ static int64_t rd_pick_best_sub8x8_mode( if (cpi->oxcf.mode != BEST) { // use previous block's result as next block's MV predictor. - if (i > 0) { - bsi->mvp.as_int = mi->bmi[i - 1].as_mv[0].as_int; - if (i == 2) bsi->mvp.as_int = mi->bmi[i - 2].as_mv[0].as_int; + if (block > 0) { + bsi->mvp.as_int = mi->bmi[block - 1].as_mv[0].as_int; + if (block == 2) + bsi->mvp.as_int = mi->bmi[block - 2].as_mv[0].as_int; } } - if (i == 0) + if (block == 0) max_mv = x->max_mv_context[mi->ref_frame[0]]; else max_mv = @@ -2211,7 +2210,7 @@ static int64_t rd_pick_best_sub8x8_mode( } // adjust src pointer for this block - mi_buf_shift(x, i); + mi_buf_shift(x, block); vp9_set_mv_search_range(&x->mv_limits, &bsi->ref_mv[0]->as_mv); @@ -2234,7 +2233,7 @@ static int64_t rd_pick_best_sub8x8_mode( cpi->sf.use_accurate_subpel_search); // save motion search result for use in compound prediction - seg_mvs[i][mi->ref_frame[0]].as_mv = *new_mv; + seg_mvs[block][mi->ref_frame[0]].as_mv = *new_mv; } x->pred_mv[mi->ref_frame[0]] = *new_mv; @@ -2244,40 +2243,40 @@ static int64_t rd_pick_best_sub8x8_mode( } if (has_second_rf) { - if (seg_mvs[i][mi->ref_frame[1]].as_int == INVALID_MV || - seg_mvs[i][mi->ref_frame[0]].as_int == INVALID_MV) + if (seg_mvs[block][mi->ref_frame[1]].as_int == INVALID_MV || + seg_mvs[block][mi->ref_frame[0]].as_int == INVALID_MV) continue; } if (has_second_rf && this_mode == NEWMV && mi->interp_filter == EIGHTTAP) { // adjust src pointers - mi_buf_shift(x, i); + mi_buf_shift(x, block); if (sf->comp_inter_joint_search_thresh <= bsize) { int rate_mv; joint_motion_search(cpi, x, bsize, frame_mv[this_mode], mi_row, - mi_col, seg_mvs[i], &rate_mv); - seg_mvs[i][mi->ref_frame[0]].as_int = + mi_col, seg_mvs[block], &rate_mv); + seg_mvs[block][mi->ref_frame[0]].as_int = frame_mv[this_mode][mi->ref_frame[0]].as_int; - seg_mvs[i][mi->ref_frame[1]].as_int = + seg_mvs[block][mi->ref_frame[1]].as_int = frame_mv[this_mode][mi->ref_frame[1]].as_int; } // restore src pointers mi_buf_restore(x, orig_src, orig_pre); } - bsi->rdstat[i][mode_idx].brate = set_and_cost_bmi_mvs( - cpi, x, xd, i, this_mode, mode_mv[this_mode], frame_mv, seg_mvs[i], - bsi->ref_mv, x->nmvjointcost, x->mvcost); + bsi->rdstat[block][mode_idx].brate = set_and_cost_bmi_mvs( + cpi, x, xd, block, this_mode, mode_mv[this_mode], frame_mv, + seg_mvs[block], bsi->ref_mv, x->nmvjointcost, x->mvcost); for (ref = 0; ref < 1 + has_second_rf; ++ref) { - bsi->rdstat[i][mode_idx].mvs[ref].as_int = + bsi->rdstat[block][mode_idx].mvs[ref].as_int = mode_mv[this_mode][ref].as_int; if (num_4x4_blocks_wide > 1) - bsi->rdstat[i + 1][mode_idx].mvs[ref].as_int = + bsi->rdstat[block + 1][mode_idx].mvs[ref].as_int = mode_mv[this_mode][ref].as_int; if (num_4x4_blocks_high > 1) - bsi->rdstat[i + 2][mode_idx].mvs[ref].as_int = + bsi->rdstat[block + 2][mode_idx].mvs[ref].as_int = mode_mv[this_mode][ref].as_int; } @@ -2295,7 +2294,7 @@ static int64_t rd_pick_best_sub8x8_mode( for (ref = 0; ref < 1 + has_second_rf; ++ref) { subpelmv |= mv_has_subpel(&mode_mv[this_mode][ref].as_mv); have_ref &= mode_mv[this_mode][ref].as_int == - ref_bsi->rdstat[i][mode_idx].mvs[ref].as_int; + ref_bsi->rdstat[block][mode_idx].mvs[ref].as_int; } if (filter_idx > 1 && !subpelmv && !have_ref) { @@ -2303,53 +2302,55 @@ static int64_t rd_pick_best_sub8x8_mode( have_ref = 1; for (ref = 0; ref < 1 + has_second_rf; ++ref) have_ref &= mode_mv[this_mode][ref].as_int == - ref_bsi->rdstat[i][mode_idx].mvs[ref].as_int; + ref_bsi->rdstat[block][mode_idx].mvs[ref].as_int; } if (!subpelmv && have_ref && - ref_bsi->rdstat[i][mode_idx].brdcost < INT64_MAX) { - memcpy(&bsi->rdstat[i][mode_idx], &ref_bsi->rdstat[i][mode_idx], - sizeof(SEG_RDSTAT)); + ref_bsi->rdstat[block][mode_idx].brdcost < INT64_MAX) { + memcpy(&bsi->rdstat[block][mode_idx], + &ref_bsi->rdstat[block][mode_idx], sizeof(SEG_RDSTAT)); if (num_4x4_blocks_wide > 1) - bsi->rdstat[i + 1][mode_idx].eobs = - ref_bsi->rdstat[i + 1][mode_idx].eobs; + bsi->rdstat[block + 1][mode_idx].eobs = + ref_bsi->rdstat[block + 1][mode_idx].eobs; if (num_4x4_blocks_high > 1) - bsi->rdstat[i + 2][mode_idx].eobs = - ref_bsi->rdstat[i + 2][mode_idx].eobs; + bsi->rdstat[block + 2][mode_idx].eobs = + ref_bsi->rdstat[block + 2][mode_idx].eobs; - if (bsi->rdstat[i][mode_idx].brdcost < best_rd) { + if (bsi->rdstat[block][mode_idx].brdcost < best_rd) { mode_selected = this_mode; - best_rd = bsi->rdstat[i][mode_idx].brdcost; + best_rd = bsi->rdstat[block][mode_idx].brdcost; } continue; } } - bsi->rdstat[i][mode_idx].brdcost = encode_inter_mb_segment( - cpi, x, bsi->segment_rd - this_segment_rd, i, - &bsi->rdstat[i][mode_idx].byrate, &bsi->rdstat[i][mode_idx].bdist, - &bsi->rdstat[i][mode_idx].bsse, bsi->rdstat[i][mode_idx].ta, - bsi->rdstat[i][mode_idx].tl, mi_row, mi_col); - if (bsi->rdstat[i][mode_idx].brdcost < INT64_MAX) { - bsi->rdstat[i][mode_idx].brdcost += - RDCOST(x->rdmult, x->rddiv, bsi->rdstat[i][mode_idx].brate, 0); - bsi->rdstat[i][mode_idx].brate += bsi->rdstat[i][mode_idx].byrate; - bsi->rdstat[i][mode_idx].eobs = p->eobs[i]; + bsi->rdstat[block][mode_idx].brdcost = encode_inter_mb_segment( + cpi, x, bsi->segment_rd - this_segment_rd, block, + &bsi->rdstat[block][mode_idx].byrate, + &bsi->rdstat[block][mode_idx].bdist, + &bsi->rdstat[block][mode_idx].bsse, bsi->rdstat[block][mode_idx].ta, + bsi->rdstat[block][mode_idx].tl, mi_row, mi_col); + if (bsi->rdstat[block][mode_idx].brdcost < INT64_MAX) { + bsi->rdstat[block][mode_idx].brdcost += RDCOST( + x->rdmult, x->rddiv, bsi->rdstat[block][mode_idx].brate, 0); + bsi->rdstat[block][mode_idx].brate += + bsi->rdstat[block][mode_idx].byrate; + bsi->rdstat[block][mode_idx].eobs = p->eobs[block]; if (num_4x4_blocks_wide > 1) - bsi->rdstat[i + 1][mode_idx].eobs = p->eobs[i + 1]; + bsi->rdstat[block + 1][mode_idx].eobs = p->eobs[block + 1]; if (num_4x4_blocks_high > 1) - bsi->rdstat[i + 2][mode_idx].eobs = p->eobs[i + 2]; + bsi->rdstat[block + 2][mode_idx].eobs = p->eobs[block + 2]; } - if (bsi->rdstat[i][mode_idx].brdcost < best_rd) { + if (bsi->rdstat[block][mode_idx].brdcost < best_rd) { mode_selected = this_mode; - best_rd = bsi->rdstat[i][mode_idx].brdcost; + best_rd = bsi->rdstat[block][mode_idx].brdcost; } } /*for each 4x4 mode*/ if (best_rd == INT64_MAX) { int iy, midx; - for (iy = i + 1; iy < 4; ++iy) + for (iy = block + 1; iy < 4; ++iy) for (midx = 0; midx < INTER_MODES; ++midx) bsi->rdstat[iy][midx].brdcost = INT64_MAX; bsi->segment_rd = INT64_MAX; @@ -2357,22 +2358,22 @@ static int64_t rd_pick_best_sub8x8_mode( } mode_idx = INTER_OFFSET(mode_selected); - memcpy(t_above, bsi->rdstat[i][mode_idx].ta, sizeof(t_above)); - memcpy(t_left, bsi->rdstat[i][mode_idx].tl, sizeof(t_left)); + memcpy(t_above, bsi->rdstat[block][mode_idx].ta, sizeof(t_above)); + memcpy(t_left, bsi->rdstat[block][mode_idx].tl, sizeof(t_left)); - set_and_cost_bmi_mvs(cpi, x, xd, i, mode_selected, mode_mv[mode_selected], - frame_mv, seg_mvs[i], bsi->ref_mv, x->nmvjointcost, - x->mvcost); + set_and_cost_bmi_mvs(cpi, x, xd, block, mode_selected, + mode_mv[mode_selected], frame_mv, seg_mvs[block], + bsi->ref_mv, x->nmvjointcost, x->mvcost); - br += bsi->rdstat[i][mode_idx].brate; - bd += bsi->rdstat[i][mode_idx].bdist; - block_sse += bsi->rdstat[i][mode_idx].bsse; - segmentyrate += bsi->rdstat[i][mode_idx].byrate; - this_segment_rd += bsi->rdstat[i][mode_idx].brdcost; + br += bsi->rdstat[block][mode_idx].brate; + bd += bsi->rdstat[block][mode_idx].bdist; + block_sse += bsi->rdstat[block][mode_idx].bsse; + segmentyrate += bsi->rdstat[block][mode_idx].byrate; + this_segment_rd += bsi->rdstat[block][mode_idx].brdcost; if (this_segment_rd > bsi->segment_rd) { int iy, midx; - for (iy = i + 1; iy < 4; ++iy) + for (iy = block + 1; iy < 4; ++iy) for (midx = 0; midx < INTER_MODES; ++midx) bsi->rdstat[iy][midx].brdcost = INT64_MAX; bsi->segment_rd = INT64_MAX; @@ -2390,7 +2391,7 @@ static int64_t rd_pick_best_sub8x8_mode( // update the coding decisions for (k = 0; k < 4; ++k) bsi->modes[k] = mi->bmi[k].as_mode; - if (bsi->segment_rd > best_rd) return INT64_MAX; + if (bsi->segment_rd > best_rd_so_far) return INT64_MAX; /* set it to the best */ for (i = 0; i < 4; i++) { mode_idx = INTER_OFFSET(bsi->modes[i]); @@ -2635,9 +2636,9 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, tmp_mv->as_int = INVALID_MV; if (scaled_ref_frame) { - int i; - for (i = 0; i < MAX_MB_PLANE; ++i) - xd->plane[i].pre[0] = backup_yv12[i]; + int j; + for (j = 0; j < MAX_MB_PLANE; ++j) + xd->plane[j].pre[0] = backup_yv12[j]; } return; } @@ -4352,7 +4353,6 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data, int rate2 = 0, rate_y = 0, rate_uv = 0; int64_t distortion2 = 0, distortion_y = 0, distortion_uv = 0; int skippable = 0; - int i; int this_skip2 = 0; int64_t total_sse = INT_MAX; int early_term = 0; @@ -4513,7 +4513,6 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data, : NULL; if (scaled_ref_frame[ref]) { - int i; // Swap out the reference frame for a version that's been scaled to // match the resolution of the current frame, allowing the existing // motion search code to be used without additional modifications. @@ -4657,7 +4656,6 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data, &uv_sse, BLOCK_8X8, tmp_best_rdu)) { for (ref = 0; ref < 2; ++ref) { if (scaled_ref_frame[ref]) { - int i; for (i = 0; i < MAX_MB_PLANE; ++i) xd->plane[i].pre[ref] = backup_yv12[ref][i]; } @@ -4674,7 +4672,6 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data, for (ref = 0; ref < 2; ++ref) { if (scaled_ref_frame[ref]) { // Restore the prediction frame pointers to their unscaled versions. - int i; for (i = 0; i < MAX_MB_PLANE; ++i) xd->plane[i].pre[ref] = backup_yv12[ref][i]; } From e15c2e34451b4177e4119cce47bf73dac9864de8 Mon Sep 17 00:00:00 2001 From: Anupam Pandey Date: Tue, 4 Apr 2023 17:24:27 +0530 Subject: [PATCH 0742/1000] Add AVX2 intrinsic for vpx_fdct16x16() function Introduced AVX2 intrinsic to compute FDCT for block size 16x16 case. This is a bit-exact change. Please check the module level scaling w.r.t C function (timer based) for existing (SSE2) and new AVX2 intrinsics: Scaling SSE2 AVX2 3.88x 5.95x Change-Id: I02299c3746fcb52d808e2a75d30aa62652c816dc --- test/dct16x16_test.cc | 48 +++++ vpx_dsp/vpx_dsp_rtcd_defs.pl | 2 +- vpx_dsp/x86/fwd_txfm_avx2.c | 373 +++++++++++++++++++++++++++++++++++ 3 files changed, 422 insertions(+), 1 deletion(-) diff --git a/test/dct16x16_test.cc b/test/dct16x16_test.cc index d4ef7ae13d..3c104f3a44 100644 --- a/test/dct16x16_test.cc +++ b/test/dct16x16_test.cc @@ -27,6 +27,7 @@ #include "vpx/vpx_integer.h" #include "vpx_ports/mem.h" #include "vpx_ports/msvc.h" // for round() +#include "vpx_ports/vpx_timer.h" using libvpx_test::ACMRandom; @@ -548,6 +549,44 @@ class Trans16x16TestBase { } } + void RunSpeedTest() { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const int count_test_block = 10000; + int c_sum_time = 0; + int simd_sum_time = 0; + + DECLARE_ALIGNED(32, int16_t, input_block[kNumCoeffs]); + DECLARE_ALIGNED(32, tran_low_t, output_ref_block[kNumCoeffs]); + DECLARE_ALIGNED(32, tran_low_t, output_block[kNumCoeffs]); + + // Initialize a test block with input range [-mask_, mask_]. + for (int j = 0; j < kNumCoeffs; ++j) { + input_block[j] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_); + } + + vpx_usec_timer timer_c; + vpx_usec_timer_start(&timer_c); + for (int i = 0; i < count_test_block; ++i) { + vpx_fdct16x16_c(input_block, output_ref_block, pitch_); + } + vpx_usec_timer_mark(&timer_c); + c_sum_time += static_cast(vpx_usec_timer_elapsed(&timer_c)); + + vpx_usec_timer timer_mod; + vpx_usec_timer_start(&timer_mod); + for (int i = 0; i < count_test_block; ++i) { + RunFwdTxfm(input_block, output_block, pitch_); + } + + vpx_usec_timer_mark(&timer_mod); + simd_sum_time += static_cast(vpx_usec_timer_elapsed(&timer_mod)); + + printf( + "c_time = %d \t simd_time = %d \t Gain = %4.2f \n", c_sum_time, + simd_sum_time, + (static_cast(c_sum_time) / static_cast(simd_sum_time))); + } + void CompareInvReference(IdctFunc ref_txfm, int thresh) { ACMRandom rnd(ACMRandom::DeterministicSeed()); const int count_test_block = 10000; @@ -664,6 +703,8 @@ TEST_P(Trans16x16DCT, QuantCheck) { TEST_P(Trans16x16DCT, InvAccuracyCheck) { RunInvAccuracyCheck(); } +TEST_P(Trans16x16DCT, DISABLED_Speed) { RunSpeedTest(); } + class Trans16x16HT : public Trans16x16TestBase, public ::testing::TestWithParam { public: @@ -823,6 +864,13 @@ INSTANTIATE_TEST_SUITE_P( 3, VPX_BITS_8))); #endif // HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE +#if HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE +INSTANTIATE_TEST_SUITE_P( + AVX2, Trans16x16DCT, + ::testing::Values(make_tuple(&vpx_fdct16x16_avx2, + &vpx_idct16x16_256_add_sse2, 0, VPX_BITS_8))); +#endif // HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE + #if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE INSTANTIATE_TEST_SUITE_P( SSE2, Trans16x16DCT, diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 49bc9a6309..f825e5a399 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -597,7 +597,7 @@ () specialize qw/vpx_fdct8x8_1 sse2 neon msa/; add_proto qw/void vpx_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vpx_fdct16x16 neon sse2 msa lsx/; + specialize qw/vpx_fdct16x16 neon sse2 avx2 msa lsx/; add_proto qw/void vpx_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/vpx_fdct16x16_1 sse2 neon msa/; diff --git a/vpx_dsp/x86/fwd_txfm_avx2.c b/vpx_dsp/x86/fwd_txfm_avx2.c index a2ed420e37..c8f54a49cb 100644 --- a/vpx_dsp/x86/fwd_txfm_avx2.c +++ b/vpx_dsp/x86/fwd_txfm_avx2.c @@ -8,9 +8,382 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include // AVX2 #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/txfm_common.h" +#define ADD256_EPI16 _mm256_add_epi16 +#define SUB256_EPI16 _mm256_sub_epi16 + +static INLINE void load_buffer_16bit_to_16bit_avx2(const int16_t *in, + int stride, __m256i *out, + int out_size, int pass) { + int i; + const __m256i kOne = _mm256_set1_epi16(1); + if (pass == 0) { + for (i = 0; i < out_size; i++) { + out[i] = _mm256_loadu_si256((const __m256i *)(in + i * stride)); + // x = x << 2 + out[i] = _mm256_slli_epi16(out[i], 2); + } + } else { + for (i = 0; i < out_size; i++) { + out[i] = _mm256_loadu_si256((const __m256i *)(in + i * 16)); + // x = (x + 1) >> 2 + out[i] = _mm256_add_epi16(out[i], kOne); + out[i] = _mm256_srai_epi16(out[i], 2); + } + } +} + +static INLINE void transpose2_8x8_avx2(const __m256i *const in, + __m256i *const out) { + int i; + __m256i t[16], u[16]; + // (1st, 2nd) ==> (lo, hi) + // (0, 1) ==> (0, 1) + // (2, 3) ==> (2, 3) + // (4, 5) ==> (4, 5) + // (6, 7) ==> (6, 7) + for (i = 0; i < 4; i++) { + t[2 * i] = _mm256_unpacklo_epi16(in[2 * i], in[2 * i + 1]); + t[2 * i + 1] = _mm256_unpackhi_epi16(in[2 * i], in[2 * i + 1]); + } + + // (1st, 2nd) ==> (lo, hi) + // (0, 2) ==> (0, 2) + // (1, 3) ==> (1, 3) + // (4, 6) ==> (4, 6) + // (5, 7) ==> (5, 7) + for (i = 0; i < 2; i++) { + u[i] = _mm256_unpacklo_epi32(t[i], t[i + 2]); + u[i + 2] = _mm256_unpackhi_epi32(t[i], t[i + 2]); + + u[i + 4] = _mm256_unpacklo_epi32(t[i + 4], t[i + 6]); + u[i + 6] = _mm256_unpackhi_epi32(t[i + 4], t[i + 6]); + } + + // (1st, 2nd) ==> (lo, hi) + // (0, 4) ==> (0, 1) + // (1, 5) ==> (4, 5) + // (2, 6) ==> (2, 3) + // (3, 7) ==> (6, 7) + for (i = 0; i < 2; i++) { + out[2 * i] = _mm256_unpacklo_epi64(u[2 * i], u[2 * i + 4]); + out[2 * i + 1] = _mm256_unpackhi_epi64(u[2 * i], u[2 * i + 4]); + + out[2 * i + 4] = _mm256_unpacklo_epi64(u[2 * i + 1], u[2 * i + 5]); + out[2 * i + 5] = _mm256_unpackhi_epi64(u[2 * i + 1], u[2 * i + 5]); + } +} + +static INLINE void transpose_16bit_16x16_avx2(const __m256i *const in, + __m256i *const out) { + __m256i t[16]; + +#define LOADL(idx) \ + t[idx] = _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx])); \ + t[idx] = _mm256_inserti128_si256( \ + t[idx], _mm_load_si128((__m128i const *)&in[idx + 8]), 1); + +#define LOADR(idx) \ + t[8 + idx] = \ + _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx] + 1)); \ + t[8 + idx] = _mm256_inserti128_si256( \ + t[8 + idx], _mm_load_si128((__m128i const *)&in[idx + 8] + 1), 1); + + // load left 8x16 + LOADL(0) + LOADL(1) + LOADL(2) + LOADL(3) + LOADL(4) + LOADL(5) + LOADL(6) + LOADL(7) + + // load right 8x16 + LOADR(0) + LOADR(1) + LOADR(2) + LOADR(3) + LOADR(4) + LOADR(5) + LOADR(6) + LOADR(7) + + // get the top 16x8 result + transpose2_8x8_avx2(t, out); + // get the bottom 16x8 result + transpose2_8x8_avx2(&t[8], &out[8]); +} + +// Store 8 16-bit values. Sign extend the values. +static INLINE void store_buffer_16bit_to_32bit_w16_avx2(const __m256i *const in, + tran_low_t *out, + const int stride, + const int out_size) { + int i; + for (i = 0; i < out_size; ++i) { + _mm256_storeu_si256((__m256i *)(out), in[i]); + out += stride; + } +} + +#define PAIR256_SET_EPI16(a, b) \ + _mm256_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ + (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ + (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ + (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a)) + +static INLINE __m256i mult256_round_shift(const __m256i *pin0, + const __m256i *pin1, + const __m256i *pmultiplier, + const __m256i *prounding, + const int shift) { + const __m256i u0 = _mm256_madd_epi16(*pin0, *pmultiplier); + const __m256i u1 = _mm256_madd_epi16(*pin1, *pmultiplier); + const __m256i v0 = _mm256_add_epi32(u0, *prounding); + const __m256i v1 = _mm256_add_epi32(u1, *prounding); + const __m256i w0 = _mm256_srai_epi32(v0, shift); + const __m256i w1 = _mm256_srai_epi32(v1, shift); + return _mm256_packs_epi32(w0, w1); +} + +static INLINE void fdct16x16_1D_avx2(__m256i *input, __m256i *output) { + int i; + __m256i step2[4]; + __m256i in[8]; + __m256i step1[8]; + __m256i step3[8]; + + const __m256i k__cospi_p16_p16 = _mm256_set1_epi16(cospi_16_64); + const __m256i k__cospi_p16_m16 = PAIR256_SET_EPI16(cospi_16_64, -cospi_16_64); + const __m256i k__cospi_p24_p08 = PAIR256_SET_EPI16(cospi_24_64, cospi_8_64); + const __m256i k__cospi_p08_m24 = PAIR256_SET_EPI16(cospi_8_64, -cospi_24_64); + const __m256i k__cospi_m08_p24 = PAIR256_SET_EPI16(-cospi_8_64, cospi_24_64); + const __m256i k__cospi_p28_p04 = PAIR256_SET_EPI16(cospi_28_64, cospi_4_64); + const __m256i k__cospi_m04_p28 = PAIR256_SET_EPI16(-cospi_4_64, cospi_28_64); + const __m256i k__cospi_p12_p20 = PAIR256_SET_EPI16(cospi_12_64, cospi_20_64); + const __m256i k__cospi_m20_p12 = PAIR256_SET_EPI16(-cospi_20_64, cospi_12_64); + const __m256i k__cospi_p30_p02 = PAIR256_SET_EPI16(cospi_30_64, cospi_2_64); + const __m256i k__cospi_p14_p18 = PAIR256_SET_EPI16(cospi_14_64, cospi_18_64); + const __m256i k__cospi_m02_p30 = PAIR256_SET_EPI16(-cospi_2_64, cospi_30_64); + const __m256i k__cospi_m18_p14 = PAIR256_SET_EPI16(-cospi_18_64, cospi_14_64); + const __m256i k__cospi_p22_p10 = PAIR256_SET_EPI16(cospi_22_64, cospi_10_64); + const __m256i k__cospi_p06_p26 = PAIR256_SET_EPI16(cospi_6_64, cospi_26_64); + const __m256i k__cospi_m10_p22 = PAIR256_SET_EPI16(-cospi_10_64, cospi_22_64); + const __m256i k__cospi_m26_p06 = PAIR256_SET_EPI16(-cospi_26_64, cospi_6_64); + const __m256i k__DCT_CONST_ROUNDING = _mm256_set1_epi32(DCT_CONST_ROUNDING); + + // Calculate input for the first 8 results. + for (i = 0; i < 8; i++) { + in[i] = ADD256_EPI16(input[i], input[15 - i]); + } + + // Calculate input for the next 8 results. + for (i = 0; i < 8; i++) { + step1[i] = SUB256_EPI16(input[7 - i], input[8 + i]); + } + + // Work on the first eight values; fdct8(input, even_results); + { + // Add/subtract + const __m256i q0 = ADD256_EPI16(in[0], in[7]); + const __m256i q1 = ADD256_EPI16(in[1], in[6]); + const __m256i q2 = ADD256_EPI16(in[2], in[5]); + const __m256i q3 = ADD256_EPI16(in[3], in[4]); + const __m256i q4 = SUB256_EPI16(in[3], in[4]); + const __m256i q5 = SUB256_EPI16(in[2], in[5]); + const __m256i q6 = SUB256_EPI16(in[1], in[6]); + const __m256i q7 = SUB256_EPI16(in[0], in[7]); + + // Work on first four results + { + // Add/subtract + const __m256i r0 = ADD256_EPI16(q0, q3); + const __m256i r1 = ADD256_EPI16(q1, q2); + const __m256i r2 = SUB256_EPI16(q1, q2); + const __m256i r3 = SUB256_EPI16(q0, q3); + + // Interleave to do the multiply by constants which gets us + // into 32 bits. + { + const __m256i t0 = _mm256_unpacklo_epi16(r0, r1); + const __m256i t1 = _mm256_unpackhi_epi16(r0, r1); + const __m256i t2 = _mm256_unpacklo_epi16(r2, r3); + const __m256i t3 = _mm256_unpackhi_epi16(r2, r3); + + output[0] = mult256_round_shift(&t0, &t1, &k__cospi_p16_p16, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[8] = mult256_round_shift(&t0, &t1, &k__cospi_p16_m16, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[4] = mult256_round_shift(&t2, &t3, &k__cospi_p24_p08, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[12] = + mult256_round_shift(&t2, &t3, &k__cospi_m08_p24, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + } + } + + // Work on next four results + { + // Interleave to do the multiply by constants which gets us + // into 32 bits. + const __m256i d0 = _mm256_unpacklo_epi16(q6, q5); + const __m256i d1 = _mm256_unpackhi_epi16(q6, q5); + const __m256i r0 = mult256_round_shift( + &d0, &d1, &k__cospi_p16_m16, &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + const __m256i r1 = mult256_round_shift( + &d0, &d1, &k__cospi_p16_p16, &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + + { + // Add/subtract + const __m256i x0 = ADD256_EPI16(q4, r0); + const __m256i x1 = SUB256_EPI16(q4, r0); + const __m256i x2 = SUB256_EPI16(q7, r1); + const __m256i x3 = ADD256_EPI16(q7, r1); + + // Interleave to do the multiply by constants which gets us + // into 32 bits. + { + const __m256i t0 = _mm256_unpacklo_epi16(x0, x3); + const __m256i t1 = _mm256_unpackhi_epi16(x0, x3); + const __m256i t2 = _mm256_unpacklo_epi16(x1, x2); + const __m256i t3 = _mm256_unpackhi_epi16(x1, x2); + output[2] = + mult256_round_shift(&t0, &t1, &k__cospi_p28_p04, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[14] = + mult256_round_shift(&t0, &t1, &k__cospi_m04_p28, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[10] = + mult256_round_shift(&t2, &t3, &k__cospi_p12_p20, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[6] = + mult256_round_shift(&t2, &t3, &k__cospi_m20_p12, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + } + } + } + } + // Work on the next eight values; step1 -> odd_results + { // step 2 + { + const __m256i t0 = _mm256_unpacklo_epi16(step1[5], step1[2]); + const __m256i t1 = _mm256_unpackhi_epi16(step1[5], step1[2]); + const __m256i t2 = _mm256_unpacklo_epi16(step1[4], step1[3]); + const __m256i t3 = _mm256_unpackhi_epi16(step1[4], step1[3]); + step2[0] = mult256_round_shift(&t0, &t1, &k__cospi_p16_m16, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + step2[1] = mult256_round_shift(&t2, &t3, &k__cospi_p16_m16, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + step2[2] = mult256_round_shift(&t0, &t1, &k__cospi_p16_p16, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + step2[3] = mult256_round_shift(&t2, &t3, &k__cospi_p16_p16, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + } + // step 3 + { + step3[0] = ADD256_EPI16(step1[0], step2[1]); + step3[1] = ADD256_EPI16(step1[1], step2[0]); + step3[2] = SUB256_EPI16(step1[1], step2[0]); + step3[3] = SUB256_EPI16(step1[0], step2[1]); + step3[4] = SUB256_EPI16(step1[7], step2[3]); + step3[5] = SUB256_EPI16(step1[6], step2[2]); + step3[6] = ADD256_EPI16(step1[6], step2[2]); + step3[7] = ADD256_EPI16(step1[7], step2[3]); + } + // step 4 + { + const __m256i t0 = _mm256_unpacklo_epi16(step3[1], step3[6]); + const __m256i t1 = _mm256_unpackhi_epi16(step3[1], step3[6]); + const __m256i t2 = _mm256_unpacklo_epi16(step3[2], step3[5]); + const __m256i t3 = _mm256_unpackhi_epi16(step3[2], step3[5]); + step2[0] = mult256_round_shift(&t0, &t1, &k__cospi_m08_p24, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + step2[1] = mult256_round_shift(&t2, &t3, &k__cospi_p24_p08, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + step2[2] = mult256_round_shift(&t0, &t1, &k__cospi_p24_p08, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + step2[3] = mult256_round_shift(&t2, &t3, &k__cospi_p08_m24, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + } + // step 5 + { + step1[0] = ADD256_EPI16(step3[0], step2[0]); + step1[1] = SUB256_EPI16(step3[0], step2[0]); + step1[2] = ADD256_EPI16(step3[3], step2[1]); + step1[3] = SUB256_EPI16(step3[3], step2[1]); + step1[4] = SUB256_EPI16(step3[4], step2[3]); + step1[5] = ADD256_EPI16(step3[4], step2[3]); + step1[6] = SUB256_EPI16(step3[7], step2[2]); + step1[7] = ADD256_EPI16(step3[7], step2[2]); + } + // step 6 + { + const __m256i t0 = _mm256_unpacklo_epi16(step1[0], step1[7]); + const __m256i t1 = _mm256_unpackhi_epi16(step1[0], step1[7]); + const __m256i t2 = _mm256_unpacklo_epi16(step1[1], step1[6]); + const __m256i t3 = _mm256_unpackhi_epi16(step1[1], step1[6]); + output[1] = mult256_round_shift(&t0, &t1, &k__cospi_p30_p02, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[9] = mult256_round_shift(&t2, &t3, &k__cospi_p14_p18, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[15] = mult256_round_shift(&t0, &t1, &k__cospi_m02_p30, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[7] = mult256_round_shift(&t2, &t3, &k__cospi_m18_p14, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + } + { + const __m256i t0 = _mm256_unpacklo_epi16(step1[2], step1[5]); + const __m256i t1 = _mm256_unpackhi_epi16(step1[2], step1[5]); + const __m256i t2 = _mm256_unpacklo_epi16(step1[3], step1[4]); + const __m256i t3 = _mm256_unpackhi_epi16(step1[3], step1[4]); + output[5] = mult256_round_shift(&t0, &t1, &k__cospi_p22_p10, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[13] = mult256_round_shift(&t2, &t3, &k__cospi_p06_p26, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[11] = mult256_round_shift(&t0, &t1, &k__cospi_m10_p22, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[3] = mult256_round_shift(&t2, &t3, &k__cospi_m26_p06, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + } + } +} + +void vpx_fdct16x16_avx2(const int16_t *input, tran_low_t *output, int stride) { + int pass; + DECLARE_ALIGNED(32, int16_t, intermediate[256]); + int16_t *out0 = intermediate; + tran_low_t *out1 = output; + const int width = 16; + const int height = 16; + __m256i buf0[16], buf1[16]; + + // Two transform and transpose passes + // Process 16 columns (transposed rows in second pass) at a time. + for (pass = 0; pass < 2; ++pass) { + // Load and pre-condition input. + load_buffer_16bit_to_16bit_avx2(input, stride, buf1, height, pass); + + // Calculate dct for 16x16 values + fdct16x16_1D_avx2(buf1, buf0); + + // Transpose the results. + transpose_16bit_16x16_avx2(buf0, buf1); + + if (pass == 0) { + store_buffer_16bit_to_32bit_w16_avx2(buf1, out0, width, height); + } else { + store_buffer_16bit_to_32bit_w16_avx2(buf1, out1, width, height); + } + // Setup in/out for next pass. + input = intermediate; + } +} + #if !CONFIG_VP9_HIGHBITDEPTH #define FDCT32x32_2D_AVX2 vpx_fdct32x32_rd_avx2 #define FDCT32x32_HIGH_PRECISION 0 From 7bdce0887b7e1acd62093cd2315ce0e93e75ba5f Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 17 Apr 2023 21:57:59 -0700 Subject: [PATCH 0743/1000] onyx_if: clear -Wshadow warning with --enable-internal-stats Bug: webm:1793 Change-Id: I9d375e4cb45f78b82afe455f2c7ad2b56e217f7d --- vp8/encoder/onyx_if.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index bcf5227029..44a02b6ddc 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -2109,7 +2109,6 @@ void vp8_remove_compressor(VP8_COMP **comp) { double time_encoded = (cpi->last_end_time_stamp_seen - cpi->first_time_stamp_ever) / 10000000.000; - double dr = (double)cpi->bytes * 8.0 / 1000.0 / time_encoded; if (cpi->b_calculate_psnr) { if (cpi->oxcf.number_of_layers > 1) { @@ -2138,6 +2137,7 @@ void vp8_remove_compressor(VP8_COMP **comp) { total_psnr2, total_ssim); } } else { + double dr = (double)cpi->bytes * 8.0 / 1000.0 / time_encoded; double samples = 3.0 / 2 * cpi->count * cpi->common.Width * cpi->common.Height; double total_psnr = From eef765751a52ad12c6e681db71ad58e9b9c26c2e Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 17 Apr 2023 22:01:10 -0700 Subject: [PATCH 0744/1000] mr_dissim: clear -Wshadow warning Bug: webm:1793 Change-Id: I73ced43aba45215264134f917fd69ab0b1f10d01 --- vp8/encoder/mr_dissim.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vp8/encoder/mr_dissim.c b/vp8/encoder/mr_dissim.c index 011b62a08f..b1bfb4b54a 100644 --- a/vp8/encoder/mr_dissim.c +++ b/vp8/encoder/mr_dissim.c @@ -49,7 +49,6 @@ void vp8_cal_low_res_mb_cols(VP8_COMP *cpi) { void vp8_cal_dissimilarity(VP8_COMP *cpi) { VP8_COMMON *cm = &cpi->common; - int i; /* Note: The first row & first column in mip are outside the frame, which * were initialized to all 0.(ref_frame, mode, mv...) @@ -67,6 +66,7 @@ void vp8_cal_dissimilarity(VP8_COMP *cpi) { store_info->frame_type = cm->frame_type; if (cm->frame_type != KEY_FRAME) { + int i; store_info->is_frame_dropped = 0; for (i = 1; i < MAX_REF_FRAMES; ++i) store_info->low_res_ref_frames[i] = cpi->current_ref_frames[i]; From d725bdd8a1fc34346245b89a27eb0b377fe73119 Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 17 Apr 2023 22:05:46 -0700 Subject: [PATCH 0745/1000] vp9_tpl_model: clear -Wshadow warning with --enable-experimental --enable-non-greedy-mv Bug: webm:1793 Change-Id: I19e38d7196291ae1ffbb5fb3daa70a4fefd54c55 --- vp9/encoder/vp9_tpl_model.c | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/vp9/encoder/vp9_tpl_model.c b/vp9/encoder/vp9_tpl_model.c index 53ef356981..624bb1901f 100644 --- a/vp9/encoder/vp9_tpl_model.c +++ b/vp9/encoder/vp9_tpl_model.c @@ -1089,10 +1089,6 @@ static void mc_flow_dispenser(VP9_COMP *cpi, GF_PICTURE *gf_picture, const int mi_height = num_8x8_blocks_high_lookup[bsize]; const int mi_width = num_8x8_blocks_wide_lookup[bsize]; int64_t recon_error, sse; -#if CONFIG_NON_GREEDY_MV - int square_block_idx; - int rf_idx; -#endif // Setup scaling factor #if CONFIG_VP9_HIGHBITDEPTH @@ -1133,21 +1129,25 @@ static void mc_flow_dispenser(VP9_COMP *cpi, GF_PICTURE *gf_picture, vp9_frame_init_quantizer(cpi); #if CONFIG_NON_GREEDY_MV - for (square_block_idx = 0; square_block_idx < SQUARE_BLOCK_SIZES; - ++square_block_idx) { - BLOCK_SIZE square_bsize = square_block_idx_to_bsize(square_block_idx); - build_motion_field(cpi, frame_idx, ref_frame, square_bsize); - } - for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) { - int ref_frame_idx = gf_picture[frame_idx].ref_frame[rf_idx]; - if (ref_frame_idx != -1) { - MotionField *motion_field = vp9_motion_field_info_get_motion_field( - &cpi->motion_field_info, frame_idx, rf_idx, bsize); - predict_mv_mode_arr(cpi, x, gf_picture, motion_field, frame_idx, - tpl_frame, rf_idx, bsize); + { + int square_block_idx; + int rf_idx; + for (square_block_idx = 0; square_block_idx < SQUARE_BLOCK_SIZES; + ++square_block_idx) { + BLOCK_SIZE square_bsize = square_block_idx_to_bsize(square_block_idx); + build_motion_field(cpi, frame_idx, ref_frame, square_bsize); + } + for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) { + int ref_frame_idx = gf_picture[frame_idx].ref_frame[rf_idx]; + if (ref_frame_idx != -1) { + MotionField *motion_field = vp9_motion_field_info_get_motion_field( + &cpi->motion_field_info, frame_idx, rf_idx, bsize); + predict_mv_mode_arr(cpi, x, gf_picture, motion_field, frame_idx, + tpl_frame, rf_idx, bsize); + } } } -#endif +#endif // CONFIG_NON_GREEDY_MV for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) { for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) { From 7b7f84fe148168532bbf9add7b738d125588c926 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Wed, 12 Apr 2023 14:35:50 +0100 Subject: [PATCH 0746/1000] Add Neon implementation of vpx_sad_skip_x functions Add Neon implementations of standard bitdepth downsampling SAD functions for all block sizes. Also add corresponding unit tests. Change-Id: Ibda734c270278d947673ffcc29ef17a2f4970b01 --- test/sad_test.cc | 18 ++++++++++++++++++ vpx_dsp/arm/sad_neon.c | 30 ++++++++++++++++++++++++++++++ vpx_dsp/vpx_dsp_rtcd_defs.pl | 24 +++++++++++++----------- 3 files changed, 61 insertions(+), 11 deletions(-) diff --git a/test/sad_test.cc b/test/sad_test.cc index 561da5ddfb..e43d9ac41e 100644 --- a/test/sad_test.cc +++ b/test/sad_test.cc @@ -1129,6 +1129,24 @@ const SadMxNParam neon_tests[] = { }; INSTANTIATE_TEST_SUITE_P(NEON, SADTest, ::testing::ValuesIn(neon_tests)); +const SadSkipMxNParam skip_neon_tests[] = { + SadSkipMxNParam(64, 64, &vpx_sad_skip_64x64_neon), + SadSkipMxNParam(64, 32, &vpx_sad_skip_64x32_neon), + SadSkipMxNParam(32, 64, &vpx_sad_skip_32x64_neon), + SadSkipMxNParam(32, 32, &vpx_sad_skip_32x32_neon), + SadSkipMxNParam(32, 16, &vpx_sad_skip_32x16_neon), + SadSkipMxNParam(16, 32, &vpx_sad_skip_16x32_neon), + SadSkipMxNParam(16, 16, &vpx_sad_skip_16x16_neon), + SadSkipMxNParam(16, 8, &vpx_sad_skip_16x8_neon), + SadSkipMxNParam(8, 16, &vpx_sad_skip_8x16_neon), + SadSkipMxNParam(8, 8, &vpx_sad_skip_8x8_neon), + SadSkipMxNParam(8, 4, &vpx_sad_skip_8x4_neon), + SadSkipMxNParam(4, 8, &vpx_sad_skip_4x8_neon), + SadSkipMxNParam(4, 4, &vpx_sad_skip_4x4_neon) +}; +INSTANTIATE_TEST_SUITE_P(NEON, SADSkipTest, + ::testing::ValuesIn(skip_neon_tests)); + const SadMxNAvgParam avg_neon_tests[] = { SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_neon), SadMxNAvgParam(64, 32, &vpx_sad64x32_avg_neon), diff --git a/vpx_dsp/arm/sad_neon.c b/vpx_dsp/arm/sad_neon.c index 9382b80626..566a1f81db 100644 --- a/vpx_dsp/arm/sad_neon.c +++ b/vpx_dsp/arm/sad_neon.c @@ -250,6 +250,36 @@ SAD_WXH_NEON(32, 64) SAD_WXH_NEON(64, 32) SAD_WXH_NEON(64, 64) +#undef SAD_WXH_NEON + +#define SAD_SKIP_WXH_NEON(w, h) \ + unsigned int vpx_sad_skip_##w##x##h##_neon( \ + const uint8_t *src, int src_stride, const uint8_t *ref, \ + int ref_stride) { \ + return 2 * \ + sad##w##xh_neon(src, 2 * src_stride, ref, 2 * ref_stride, (h) / 2); \ + } + +SAD_SKIP_WXH_NEON(4, 4) +SAD_SKIP_WXH_NEON(4, 8) + +SAD_SKIP_WXH_NEON(8, 4) +SAD_SKIP_WXH_NEON(8, 8) +SAD_SKIP_WXH_NEON(8, 16) + +SAD_SKIP_WXH_NEON(16, 8) +SAD_SKIP_WXH_NEON(16, 16) +SAD_SKIP_WXH_NEON(16, 32) + +SAD_SKIP_WXH_NEON(32, 16) +SAD_SKIP_WXH_NEON(32, 32) +SAD_SKIP_WXH_NEON(32, 64) + +SAD_SKIP_WXH_NEON(64, 32) +SAD_SKIP_WXH_NEON(64, 64) + +#undef SAD_SKIP_WXH_NEON + #if defined(__ARM_FEATURE_DOTPROD) static INLINE unsigned int sadwxh_avg_neon(const uint8_t *src_ptr, diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index e3d48f493e..05d031b8cd 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -787,41 +787,43 @@ () specialize qw/vpx_sad4x4 neon msa sse2 mmi/; add_proto qw/unsigned int vpx_sad_skip_64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad_skip_64x64 avx2 sse2/; +specialize qw/vpx_sad_skip_64x64 neon avx2 sse2/; add_proto qw/unsigned int vpx_sad_skip_64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad_skip_64x32 avx2 sse2/; +specialize qw/vpx_sad_skip_64x32 neon avx2 sse2/; add_proto qw/unsigned int vpx_sad_skip_32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad_skip_32x64 avx2 sse2/; +specialize qw/vpx_sad_skip_32x64 neon avx2 sse2/; add_proto qw/unsigned int vpx_sad_skip_32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad_skip_32x32 avx2 sse2/; +specialize qw/vpx_sad_skip_32x32 neon avx2 sse2/; add_proto qw/unsigned int vpx_sad_skip_32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad_skip_32x16 avx2 sse2/; +specialize qw/vpx_sad_skip_32x16 neon avx2 sse2/; add_proto qw/unsigned int vpx_sad_skip_16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad_skip_16x32 sse2/; +specialize qw/vpx_sad_skip_16x32 neon sse2/; add_proto qw/unsigned int vpx_sad_skip_16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad_skip_16x16 sse2/; +specialize qw/vpx_sad_skip_16x16 neon sse2/; add_proto qw/unsigned int vpx_sad_skip_16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad_skip_16x8 sse2/; +specialize qw/vpx_sad_skip_16x8 neon sse2/; add_proto qw/unsigned int vpx_sad_skip_8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad_skip_8x16 sse2/; +specialize qw/vpx_sad_skip_8x16 neon sse2/; add_proto qw/unsigned int vpx_sad_skip_8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad_skip_8x8 sse2/; +specialize qw/vpx_sad_skip_8x8 neon sse2/; add_proto qw/unsigned int vpx_sad_skip_8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; +specialize qw/vpx_sad_skip_8x4 neon/; add_proto qw/unsigned int vpx_sad_skip_4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad_skip_4x8 sse2/; +specialize qw/vpx_sad_skip_4x8 neon sse2/; add_proto qw/unsigned int vpx_sad_skip_4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; +specialize qw/vpx_sad_skip_4x4 neon/; # # Avg From 05b244af52e87ff7dacce78a6db3eab1765e84c8 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Wed, 12 Apr 2023 14:48:21 +0100 Subject: [PATCH 0747/1000] Add Neon implementation of vpx_highbd_sad_skip_x functions Add Neon implementations of high bitdepth downsampling SAD functions for all block sizes. Also add corresponding unit tests. Change-Id: I56ea656e9bb5f8b2aedfdc4637c9ab4e1951b31b --- test/sad_test.cc | 43 ++++++++++++++++++++++++++++++++++- vpx_dsp/arm/highbd_sad_neon.c | 30 ++++++++++++++++++++++++ vpx_dsp/vpx_dsp_rtcd_defs.pl | 23 +++++++++++-------- 3 files changed, 85 insertions(+), 11 deletions(-) diff --git a/test/sad_test.cc b/test/sad_test.cc index e43d9ac41e..eae23cbada 100644 --- a/test/sad_test.cc +++ b/test/sad_test.cc @@ -1142,7 +1142,48 @@ const SadSkipMxNParam skip_neon_tests[] = { SadSkipMxNParam(8, 8, &vpx_sad_skip_8x8_neon), SadSkipMxNParam(8, 4, &vpx_sad_skip_8x4_neon), SadSkipMxNParam(4, 8, &vpx_sad_skip_4x8_neon), - SadSkipMxNParam(4, 4, &vpx_sad_skip_4x4_neon) + SadSkipMxNParam(4, 4, &vpx_sad_skip_4x4_neon), +#if CONFIG_VP9_HIGHBITDEPTH + SadSkipMxNParam(4, 4, &vpx_highbd_sad_skip_4x4_neon, 8), + SadSkipMxNParam(4, 8, &vpx_highbd_sad_skip_4x8_neon, 8), + SadSkipMxNParam(8, 4, &vpx_highbd_sad_skip_8x4_neon, 8), + SadSkipMxNParam(8, 8, &vpx_highbd_sad_skip_8x8_neon, 8), + SadSkipMxNParam(8, 16, &vpx_highbd_sad_skip_8x16_neon, 8), + SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_neon, 8), + SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_neon, 8), + SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_neon, 8), + SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_neon, 8), + SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_neon, 8), + SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_neon, 8), + SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_neon, 8), + SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_neon, 8), + SadSkipMxNParam(4, 4, &vpx_highbd_sad_skip_4x4_neon, 10), + SadSkipMxNParam(4, 8, &vpx_highbd_sad_skip_4x8_neon, 10), + SadSkipMxNParam(8, 4, &vpx_highbd_sad_skip_8x4_neon, 10), + SadSkipMxNParam(8, 8, &vpx_highbd_sad_skip_8x8_neon, 10), + SadSkipMxNParam(8, 16, &vpx_highbd_sad_skip_8x16_neon, 10), + SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_neon, 10), + SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_neon, 10), + SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_neon, 10), + SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_neon, 10), + SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_neon, 10), + SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_neon, 10), + SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_neon, 10), + SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_neon, 10), + SadSkipMxNParam(4, 4, &vpx_highbd_sad_skip_4x4_neon, 12), + SadSkipMxNParam(4, 8, &vpx_highbd_sad_skip_4x8_neon, 12), + SadSkipMxNParam(8, 4, &vpx_highbd_sad_skip_8x4_neon, 12), + SadSkipMxNParam(8, 8, &vpx_highbd_sad_skip_8x8_neon, 12), + SadSkipMxNParam(8, 16, &vpx_highbd_sad_skip_8x16_neon, 12), + SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_neon, 12), + SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_neon, 12), + SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_neon, 12), + SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_neon, 12), + SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_neon, 12), + SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_neon, 12), + SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_neon, 12), + SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_neon, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH }; INSTANTIATE_TEST_SUITE_P(NEON, SADSkipTest, ::testing::ValuesIn(skip_neon_tests)); diff --git a/vpx_dsp/arm/highbd_sad_neon.c b/vpx_dsp/arm/highbd_sad_neon.c index 813710040b..b99bac66cd 100644 --- a/vpx_dsp/arm/highbd_sad_neon.c +++ b/vpx_dsp/arm/highbd_sad_neon.c @@ -179,6 +179,36 @@ HBD_SAD_WXH_NEON(32, 64) HBD_SAD_WXH_NEON(64, 32) HBD_SAD_WXH_NEON(64, 64) +#undef HBD_SAD_WXH_NEON + +#define HBD_SAD_SKIP_WXH_NEON(w, h) \ + unsigned int vpx_highbd_sad_skip_##w##x##h##_neon( \ + const uint8_t *src, int src_stride, const uint8_t *ref, \ + int ref_stride) { \ + return 2 * highbd_sad##w##xh_neon(src, 2 * src_stride, ref, \ + 2 * ref_stride, (h) / 2); \ + } + +HBD_SAD_SKIP_WXH_NEON(4, 4) +HBD_SAD_SKIP_WXH_NEON(4, 8) + +HBD_SAD_SKIP_WXH_NEON(8, 4) +HBD_SAD_SKIP_WXH_NEON(8, 8) +HBD_SAD_SKIP_WXH_NEON(8, 16) + +HBD_SAD_SKIP_WXH_NEON(16, 8) +HBD_SAD_SKIP_WXH_NEON(16, 16) +HBD_SAD_SKIP_WXH_NEON(16, 32) + +HBD_SAD_SKIP_WXH_NEON(32, 16) +HBD_SAD_SKIP_WXH_NEON(32, 32) +HBD_SAD_SKIP_WXH_NEON(32, 64) + +HBD_SAD_SKIP_WXH_NEON(64, 32) +HBD_SAD_SKIP_WXH_NEON(64, 64) + +#undef HBD_SAD_SKIP_WXH_NEON + static INLINE uint32_t highbd_sad4xh_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 05d031b8cd..7bea738952 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -1068,40 +1068,43 @@ () specialize qw/vpx_highbd_sad4x4 neon/; add_proto qw/unsigned int vpx_highbd_sad_skip_64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vpx_highbd_sad_skip_64x64 sse2 avx2/; + specialize qw/vpx_highbd_sad_skip_64x64 neon sse2 avx2/; add_proto qw/unsigned int vpx_highbd_sad_skip_64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vpx_highbd_sad_skip_64x32 sse2 avx2/; + specialize qw/vpx_highbd_sad_skip_64x32 neon sse2 avx2/; add_proto qw/unsigned int vpx_highbd_sad_skip_32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vpx_highbd_sad_skip_32x64 sse2 avx2/; + specialize qw/vpx_highbd_sad_skip_32x64 neon sse2 avx2/; add_proto qw/unsigned int vpx_highbd_sad_skip_32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vpx_highbd_sad_skip_32x32 sse2 avx2/; + specialize qw/vpx_highbd_sad_skip_32x32 neon sse2 avx2/; add_proto qw/unsigned int vpx_highbd_sad_skip_32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vpx_highbd_sad_skip_32x16 sse2 avx2/; + specialize qw/vpx_highbd_sad_skip_32x16 neon sse2 avx2/; add_proto qw/unsigned int vpx_highbd_sad_skip_16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vpx_highbd_sad_skip_16x32 sse2 avx2/; + specialize qw/vpx_highbd_sad_skip_16x32 neon sse2 avx2/; add_proto qw/unsigned int vpx_highbd_sad_skip_16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vpx_highbd_sad_skip_16x16 sse2 avx2/; + specialize qw/vpx_highbd_sad_skip_16x16 neon sse2 avx2/; add_proto qw/unsigned int vpx_highbd_sad_skip_16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vpx_highbd_sad_skip_16x8 sse2 avx2/; + specialize qw/vpx_highbd_sad_skip_16x8 neon sse2 avx2/; add_proto qw/unsigned int vpx_highbd_sad_skip_8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vpx_highbd_sad_skip_8x16 sse2/; + specialize qw/vpx_highbd_sad_skip_8x16 neon sse2/; add_proto qw/unsigned int vpx_highbd_sad_skip_8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vpx_highbd_sad_skip_8x8 sse2/; + specialize qw/vpx_highbd_sad_skip_8x8 neon sse2/; add_proto qw/unsigned int vpx_highbd_sad_skip_8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + specialize qw/vpx_highbd_sad_skip_8x4 neon/; add_proto qw/unsigned int vpx_highbd_sad_skip_4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + specialize qw/vpx_highbd_sad_skip_4x8 neon/; add_proto qw/unsigned int vpx_highbd_sad_skip_4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + specialize qw/vpx_highbd_sad_skip_4x4 neon/; # # Avg From 42c0cbb9cb114af824083f4e6f0e757985b8942f Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Wed, 12 Apr 2023 17:38:24 +0100 Subject: [PATCH 0748/1000] Add Neon implementation of vpx_sad_skip_xx4d functions Add Neon implementations of standard bitdepth downsampling SAD4D functions for all block sizes. Also add corresponding unit tests. Change-Id: Ieb77661ea2bbe357529862a5fb54956e34e8d758 --- test/sad_test.cc | 18 ++++++++++++++++++ vpx_dsp/arm/sad4d_neon.c | 32 ++++++++++++++++++++++++++++++++ vpx_dsp/vpx_dsp_rtcd_defs.pl | 24 +++++++++++++----------- 3 files changed, 63 insertions(+), 11 deletions(-) diff --git a/test/sad_test.cc b/test/sad_test.cc index eae23cbada..32787db79d 100644 --- a/test/sad_test.cc +++ b/test/sad_test.cc @@ -1300,6 +1300,24 @@ const SadMxNx4Param x4d_neon_tests[] = { #endif // CONFIG_VP9_HIGHBITDEPTH }; INSTANTIATE_TEST_SUITE_P(NEON, SADx4Test, ::testing::ValuesIn(x4d_neon_tests)); + +const SadSkipMxNx4Param skip_x4d_neon_tests[] = { + SadSkipMxNx4Param(64, 64, &vpx_sad_skip_64x64x4d_neon), + SadSkipMxNx4Param(64, 32, &vpx_sad_skip_64x32x4d_neon), + SadSkipMxNx4Param(32, 64, &vpx_sad_skip_32x64x4d_neon), + SadSkipMxNx4Param(32, 32, &vpx_sad_skip_32x32x4d_neon), + SadSkipMxNx4Param(32, 16, &vpx_sad_skip_32x16x4d_neon), + SadSkipMxNx4Param(16, 32, &vpx_sad_skip_16x32x4d_neon), + SadSkipMxNx4Param(16, 16, &vpx_sad_skip_16x16x4d_neon), + SadSkipMxNx4Param(16, 8, &vpx_sad_skip_16x8x4d_neon), + SadSkipMxNx4Param(8, 16, &vpx_sad_skip_8x16x4d_neon), + SadSkipMxNx4Param(8, 8, &vpx_sad_skip_8x8x4d_neon), + SadSkipMxNx4Param(8, 4, &vpx_sad_skip_8x4x4d_neon), + SadSkipMxNx4Param(4, 8, &vpx_sad_skip_4x8x4d_neon), + SadSkipMxNx4Param(4, 4, &vpx_sad_skip_4x4x4d_neon), +}; +INSTANTIATE_TEST_SUITE_P(NEON, SADSkipx4Test, + ::testing::ValuesIn(skip_x4d_neon_tests)); #endif // HAVE_NEON //------------------------------------------------------------------------------ diff --git a/vpx_dsp/arm/sad4d_neon.c b/vpx_dsp/arm/sad4d_neon.c index 6ad6c96214..44cd990280 100644 --- a/vpx_dsp/arm/sad4d_neon.c +++ b/vpx_dsp/arm/sad4d_neon.c @@ -308,3 +308,35 @@ SAD_WXH_4D_NEON(64, 32) SAD_WXH_4D_NEON(64, 64) #undef SAD_WXH_4D_NEON + +#define SAD_SKIP_WXH_4D_NEON(w, h) \ + void vpx_sad_skip_##w##x##h##x4d_neon(const uint8_t *src, int src_stride, \ + const uint8_t *const ref[4], \ + int ref_stride, uint32_t res[4]) { \ + sad##w##xhx4d_neon(src, 2 * src_stride, ref, 2 * ref_stride, res, \ + ((h) >> 1)); \ + res[0] <<= 1; \ + res[1] <<= 1; \ + res[2] <<= 1; \ + res[3] <<= 1; \ + } + +SAD_SKIP_WXH_4D_NEON(4, 4) +SAD_SKIP_WXH_4D_NEON(4, 8) + +SAD_SKIP_WXH_4D_NEON(8, 4) +SAD_SKIP_WXH_4D_NEON(8, 8) +SAD_SKIP_WXH_4D_NEON(8, 16) + +SAD_SKIP_WXH_4D_NEON(16, 8) +SAD_SKIP_WXH_4D_NEON(16, 16) +SAD_SKIP_WXH_4D_NEON(16, 32) + +SAD_SKIP_WXH_4D_NEON(32, 16) +SAD_SKIP_WXH_4D_NEON(32, 32) +SAD_SKIP_WXH_4D_NEON(32, 64) + +SAD_SKIP_WXH_4D_NEON(64, 32) +SAD_SKIP_WXH_4D_NEON(64, 64) + +#undef SAD_SKIP_WXH_4D_NEON diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 7bea738952..4c5fab3189 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -968,41 +968,43 @@ () specialize qw/vpx_sad4x4x4d neon msa sse2 mmi/; add_proto qw/void vpx_sad_skip_64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; -specialize qw/vpx_sad_skip_64x64x4d avx2 sse2/; +specialize qw/vpx_sad_skip_64x64x4d neon avx2 sse2/; add_proto qw/void vpx_sad_skip_64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; -specialize qw/vpx_sad_skip_64x32x4d avx2 sse2/; +specialize qw/vpx_sad_skip_64x32x4d neon avx2 sse2/; add_proto qw/void vpx_sad_skip_32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; -specialize qw/vpx_sad_skip_32x64x4d avx2 sse2/; +specialize qw/vpx_sad_skip_32x64x4d neon avx2 sse2/; add_proto qw/void vpx_sad_skip_32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; -specialize qw/vpx_sad_skip_32x32x4d avx2 sse2/; +specialize qw/vpx_sad_skip_32x32x4d neon avx2 sse2/; add_proto qw/void vpx_sad_skip_32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; -specialize qw/vpx_sad_skip_32x16x4d avx2 sse2/; +specialize qw/vpx_sad_skip_32x16x4d neon avx2 sse2/; add_proto qw/void vpx_sad_skip_16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; -specialize qw/vpx_sad_skip_16x32x4d sse2/; +specialize qw/vpx_sad_skip_16x32x4d neon sse2/; add_proto qw/void vpx_sad_skip_16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; -specialize qw/vpx_sad_skip_16x16x4d sse2/; +specialize qw/vpx_sad_skip_16x16x4d neon sse2/; add_proto qw/void vpx_sad_skip_16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; -specialize qw/vpx_sad_skip_16x8x4d sse2/; +specialize qw/vpx_sad_skip_16x8x4d neon sse2/; add_proto qw/void vpx_sad_skip_8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; -specialize qw/vpx_sad_skip_8x16x4d sse2/; +specialize qw/vpx_sad_skip_8x16x4d neon sse2/; add_proto qw/void vpx_sad_skip_8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; -specialize qw/vpx_sad_skip_8x8x4d sse2/; +specialize qw/vpx_sad_skip_8x8x4d neon sse2/; add_proto qw/void vpx_sad_skip_8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +specialize qw/vpx_sad_skip_8x4x4d neon/; add_proto qw/void vpx_sad_skip_4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; -specialize qw/vpx_sad_skip_4x8x4d sse2/; +specialize qw/vpx_sad_skip_4x8x4d neon sse2/; add_proto qw/void vpx_sad_skip_4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +specialize qw/vpx_sad_skip_4x4x4d neon/; add_proto qw/uint64_t vpx_sum_squares_2d_i16/, "const int16_t *src, int stride, int size"; specialize qw/vpx_sum_squares_2d_i16 neon sse2 msa/; From ab830fe6a1272bf84fdbc3337cf161f3dd433ce1 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Wed, 12 Apr 2023 17:50:01 +0100 Subject: [PATCH 0749/1000] Add Neon implementations of vpx_highbd_sad_skip_xx4d Add Neon implementations of high bitdepth downsampling SAD4D functions for all block sizes. Also add corresponding unit tests. Change-Id: Ib0c2f852e269cbd6cbb8f4dfb54349654abb0adb --- test/sad_test.cc | 38 +++++++++++++++++++++++++++++++++ vpx_dsp/arm/highbd_sad4d_neon.c | 34 +++++++++++++++++++++++++++++ vpx_dsp/vpx_dsp_rtcd_defs.pl | 24 +++++++++++---------- 3 files changed, 85 insertions(+), 11 deletions(-) diff --git a/test/sad_test.cc b/test/sad_test.cc index 32787db79d..92b3a14d68 100644 --- a/test/sad_test.cc +++ b/test/sad_test.cc @@ -1315,6 +1315,44 @@ const SadSkipMxNx4Param skip_x4d_neon_tests[] = { SadSkipMxNx4Param(8, 4, &vpx_sad_skip_8x4x4d_neon), SadSkipMxNx4Param(4, 8, &vpx_sad_skip_4x8x4d_neon), SadSkipMxNx4Param(4, 4, &vpx_sad_skip_4x4x4d_neon), +#if CONFIG_VP9_HIGHBITDEPTH + SadSkipMxNx4Param(4, 4, &vpx_highbd_sad_skip_4x4x4d_neon, 8), + SadSkipMxNx4Param(4, 8, &vpx_highbd_sad_skip_4x8x4d_neon, 8), + SadSkipMxNx4Param(8, 4, &vpx_highbd_sad_skip_8x4x4d_neon, 8), + SadSkipMxNx4Param(8, 8, &vpx_highbd_sad_skip_8x8x4d_neon, 8), + SadSkipMxNx4Param(8, 16, &vpx_highbd_sad_skip_8x16x4d_neon, 8), + SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_neon, 8), + SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_neon, 8), + SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_neon, 8), + SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_neon, 8), + SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_neon, 8), + SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_neon, 8), + SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_neon, 8), + SadSkipMxNx4Param(4, 4, &vpx_highbd_sad_skip_4x4x4d_neon, 10), + SadSkipMxNx4Param(4, 8, &vpx_highbd_sad_skip_4x8x4d_neon, 10), + SadSkipMxNx4Param(8, 4, &vpx_highbd_sad_skip_8x4x4d_neon, 10), + SadSkipMxNx4Param(8, 8, &vpx_highbd_sad_skip_8x8x4d_neon, 10), + SadSkipMxNx4Param(8, 16, &vpx_highbd_sad_skip_8x16x4d_neon, 10), + SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_neon, 10), + SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_neon, 10), + SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_neon, 10), + SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_neon, 10), + SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_neon, 10), + SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_neon, 10), + SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_neon, 10), + SadSkipMxNx4Param(4, 4, &vpx_highbd_sad_skip_4x4x4d_neon, 12), + SadSkipMxNx4Param(4, 8, &vpx_highbd_sad_skip_4x8x4d_neon, 12), + SadSkipMxNx4Param(8, 4, &vpx_highbd_sad_skip_8x4x4d_neon, 12), + SadSkipMxNx4Param(8, 8, &vpx_highbd_sad_skip_8x8x4d_neon, 12), + SadSkipMxNx4Param(8, 16, &vpx_highbd_sad_skip_8x16x4d_neon, 12), + SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_neon, 12), + SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_neon, 12), + SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_neon, 12), + SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_neon, 12), + SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_neon, 12), + SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_neon, 12), + SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_neon, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH }; INSTANTIATE_TEST_SUITE_P(NEON, SADSkipx4Test, ::testing::ValuesIn(skip_x4d_neon_tests)); diff --git a/vpx_dsp/arm/highbd_sad4d_neon.c b/vpx_dsp/arm/highbd_sad4d_neon.c index 280d2087f7..62c4685a7a 100644 --- a/vpx_dsp/arm/highbd_sad4d_neon.c +++ b/vpx_dsp/arm/highbd_sad4d_neon.c @@ -236,3 +236,37 @@ HBD_SAD_WXH_4D_NEON(32, 64) HBD_SAD_WXH_4D_NEON(64, 32) HBD_SAD_WXH_4D_NEON(64, 64) + +#undef HBD_SAD_WXH_4D_NEON + +#define HBD_SAD_SKIP_WXH_4D_NEON(w, h) \ + void vpx_highbd_sad_skip_##w##x##h##x4d_neon( \ + const uint8_t *src, int src_stride, const uint8_t *const ref[4], \ + int ref_stride, uint32_t res[4]) { \ + highbd_sad##w##xhx4d_neon(src, 2 * src_stride, ref, 2 * ref_stride, res, \ + ((h) >> 1)); \ + res[0] <<= 1; \ + res[1] <<= 1; \ + res[2] <<= 1; \ + res[3] <<= 1; \ + } + +HBD_SAD_SKIP_WXH_4D_NEON(4, 4) +HBD_SAD_SKIP_WXH_4D_NEON(4, 8) + +HBD_SAD_SKIP_WXH_4D_NEON(8, 4) +HBD_SAD_SKIP_WXH_4D_NEON(8, 8) +HBD_SAD_SKIP_WXH_4D_NEON(8, 16) + +HBD_SAD_SKIP_WXH_4D_NEON(16, 8) +HBD_SAD_SKIP_WXH_4D_NEON(16, 16) +HBD_SAD_SKIP_WXH_4D_NEON(16, 32) + +HBD_SAD_SKIP_WXH_4D_NEON(32, 16) +HBD_SAD_SKIP_WXH_4D_NEON(32, 32) +HBD_SAD_SKIP_WXH_4D_NEON(32, 64) + +HBD_SAD_SKIP_WXH_4D_NEON(64, 32) +HBD_SAD_SKIP_WXH_4D_NEON(64, 64) + +#undef HBD_SAD_SKIP_WXH_4D_NEON diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 4c5fab3189..bde0115298 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -1202,41 +1202,43 @@ () specialize qw/vpx_highbd_sad4x4x4d sse2 neon/; add_proto qw/void vpx_highbd_sad_skip_64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; - specialize qw/vpx_highbd_sad_skip_64x64x4d sse2 avx2/; + specialize qw/vpx_highbd_sad_skip_64x64x4d neon sse2 avx2/; add_proto qw/void vpx_highbd_sad_skip_64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; - specialize qw/vpx_highbd_sad_skip_64x32x4d sse2 avx2/; + specialize qw/vpx_highbd_sad_skip_64x32x4d neon sse2 avx2/; add_proto qw/void vpx_highbd_sad_skip_32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; - specialize qw/vpx_highbd_sad_skip_32x64x4d sse2 avx2/; + specialize qw/vpx_highbd_sad_skip_32x64x4d neon sse2 avx2/; add_proto qw/void vpx_highbd_sad_skip_32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; - specialize qw/vpx_highbd_sad_skip_32x32x4d sse2 avx2/; + specialize qw/vpx_highbd_sad_skip_32x32x4d neon sse2 avx2/; add_proto qw/void vpx_highbd_sad_skip_32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; - specialize qw/vpx_highbd_sad_skip_32x16x4d sse2 avx2/; + specialize qw/vpx_highbd_sad_skip_32x16x4d neon sse2 avx2/; add_proto qw/void vpx_highbd_sad_skip_16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; - specialize qw/vpx_highbd_sad_skip_16x32x4d sse2 avx2/; + specialize qw/vpx_highbd_sad_skip_16x32x4d neon sse2 avx2/; add_proto qw/void vpx_highbd_sad_skip_16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; - specialize qw/vpx_highbd_sad_skip_16x16x4d sse2 avx2/; + specialize qw/vpx_highbd_sad_skip_16x16x4d neon sse2 avx2/; add_proto qw/void vpx_highbd_sad_skip_16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; - specialize qw/vpx_highbd_sad_skip_16x8x4d sse2 avx2/; + specialize qw/vpx_highbd_sad_skip_16x8x4d neon sse2 avx2/; add_proto qw/void vpx_highbd_sad_skip_8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; - specialize qw/vpx_highbd_sad_skip_8x16x4d sse2/; + specialize qw/vpx_highbd_sad_skip_8x16x4d neon sse2/; add_proto qw/void vpx_highbd_sad_skip_8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; - specialize qw/vpx_highbd_sad_skip_8x8x4d sse2/; + specialize qw/vpx_highbd_sad_skip_8x8x4d neon sse2/; add_proto qw/void vpx_highbd_sad_skip_8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + specialize qw/vpx_highbd_sad_skip_8x4x4d neon/; add_proto qw/void vpx_highbd_sad_skip_4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; - specialize qw/vpx_highbd_sad_skip_4x8x4d sse2/; + specialize qw/vpx_highbd_sad_skip_4x8x4d neon sse2/; add_proto qw/void vpx_highbd_sad_skip_4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + specialize qw/vpx_highbd_sad_skip_4x4x4d neon/; # # Structured Similarity (SSIM) From 933cf345dd0c09cb55e862d6413773b18e2f1404 Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 19 Apr 2023 13:51:05 -0700 Subject: [PATCH 0750/1000] onyx_if,encode_frame_to_data_rate: rm unused var quiets -Wunused-but-set-variable with clang-17 Change-Id: Ia819beac84cbd57f4eeca6174c785fd320bc40c6 --- vp8/encoder/onyx_if.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index 44a02b6ddc..a780048073 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -3203,7 +3203,6 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, int frame_under_shoot_limit; int Loop = 0; - int loop_count; VP8_COMMON *cm = &cpi->common; int active_worst_qchanged = 0; @@ -3769,8 +3768,6 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, vp8_save_coding_context(cpi); - loop_count = 0; - scale_and_extend_source(cpi->un_scaled_source, cpi); #if CONFIG_TEMPORAL_DENOISING && CONFIG_POSTPROC @@ -3993,7 +3990,6 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, q_low = cpi->active_best_quality; q_high = cpi->active_worst_quality; - loop_count++; Loop = 1; continue; @@ -4219,7 +4215,6 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, if (Loop == 1) { vp8_restore_coding_context(cpi); - loop_count++; #if CONFIG_INTERNAL_STATS cpi->tot_recode_hits++; #endif From 84b4dfa5ba9c588b13da60ace9c1ba5b85eeaa41 Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 19 Apr 2023 13:53:34 -0700 Subject: [PATCH 0751/1000] vp9_encodeframe: rm unused vars in get_rdmult_delta() and compute_frame_aq_offset(). quiets -Wunused-but-set-variable with clang-17 Change-Id: I726852f3bc42afa80a18475de910040a9436b0bb --- vp9/encoder/vp9_encodeframe.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index 26e419e3d5..3a042399cb 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -3710,7 +3710,6 @@ static int get_rdmult_delta(VP9_COMP *cpi, BLOCK_SIZE bsize, int mi_row, int row, col; int dr = 0; - int count = 0; double r0, rk, beta; TplDepFrame *tpl_frame; @@ -3734,8 +3733,6 @@ static int get_rdmult_delta(VP9_COMP *cpi, BLOCK_SIZE bsize, int mi_row, intra_cost += this_stats->intra_cost; mc_dep_cost += this_stats->mc_dep_cost; - - ++count; } } @@ -6185,7 +6182,6 @@ static int compute_frame_aq_offset(struct VP9_COMP *cpi) { int mi_row, mi_col; int sum_delta = 0; - int map_index = 0; int qdelta_index; int segment_id; @@ -6195,7 +6191,6 @@ static int compute_frame_aq_offset(struct VP9_COMP *cpi) { segment_id = mi_8x8[0]->segment_id; qdelta_index = get_segdata(seg, segment_id, SEG_LVL_ALT_Q); sum_delta += qdelta_index; - map_index++; } mi_8x8_ptr += cm->mi_stride; } From 895317cdf122124a47fc2bfb4478504169c1bc3c Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 19 Apr 2023 13:55:35 -0700 Subject: [PATCH 0752/1000] vp9_ratectrl,vp9_encodedframe_overshoot: rm unused var quiets -Wunused-but-set-variable with clang-17 Change-Id: I5212a20286d0252e45a8e8813d15cb780494b0ad --- vp9/encoder/vp9_ratectrl.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index 9e152629fb..13b43aa63a 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -3272,11 +3272,9 @@ int vp9_encodedframe_overshoot(VP9_COMP *cpi, int frame_size, int *q) { MODE_INFO **mi = cm->mi_grid_visible; int sum_intra_usage = 0; int mi_row, mi_col; - int tot = 0; for (mi_row = 0; mi_row < cm->mi_rows; mi_row++) { for (mi_col = 0; mi_col < cm->mi_cols; mi_col++) { if (mi[0]->ref_frame[0] == INTRA_FRAME) sum_intra_usage++; - tot++; mi++; } mi += 8; From 4366ff722297e7e57c158db9b41d61cf1a056bf6 Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 19 Apr 2023 13:58:39 -0700 Subject: [PATCH 0753/1000] vp9_spatial_svc_encoder: quiet -Wunused-but-set-variable with clang-17. Move frames_received under OUTPUT_FRAME_STATS; it's only used in a printf. Change-Id: Idfdd59ccd04e43df1855203db82bb4c8a1d059fb --- examples/vp9_spatial_svc_encoder.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/examples/vp9_spatial_svc_encoder.c b/examples/vp9_spatial_svc_encoder.c index d287e58319..9d37ed0244 100644 --- a/examples/vp9_spatial_svc_encoder.c +++ b/examples/vp9_spatial_svc_encoder.c @@ -32,6 +32,7 @@ #include "vp9/encoder/vp9_encoder.h" #include "./y4minput.h" +#define OUTPUT_FRAME_STATS 0 #define OUTPUT_RC_STATS 1 #define SIMULCAST_MODE 0 @@ -880,7 +881,9 @@ int main(int argc, const char **argv) { int pts = 0; /* PTS starts at 0 */ int frame_duration = 1; /* 1 timebase tick per frame */ int end_of_stream = 0; +#if OUTPUT_FRAME_STATS int frames_received = 0; +#endif #if OUTPUT_RC_STATS VpxVideoWriter *outfile[VPX_SS_MAX_LAYERS] = { NULL }; struct RateControlStats rc; @@ -1126,14 +1129,14 @@ int main(int argc, const char **argv) { } #endif } - /* +#if OUTPUT_FRAME_STATS printf("SVC frame: %d, kf: %d, size: %d, pts: %d\n", frames_received, !!(cx_pkt->data.frame.flags & VPX_FRAME_IS_KEY), (int)cx_pkt->data.frame.sz, (int)cx_pkt->data.frame.pts); - */ + ++frames_received; +#endif if (enc_cfg.ss_number_layers == 1 && enc_cfg.ts_number_layers == 1) si->bytes_sum[0] += (int)cx_pkt->data.frame.sz; - ++frames_received; #if CONFIG_VP9_DECODER && !SIMULCAST_MODE if (vpx_codec_decode(&decoder, cx_pkt->data.frame.buf, (unsigned int)cx_pkt->data.frame.sz, NULL, 0)) From e8fa7a038b2a536eb49bdaf53a24e54f20044e7b Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 19 Apr 2023 18:58:59 -0700 Subject: [PATCH 0754/1000] libs.mk: quote $(LIBVPX_TEST_DATA_PATH) This allows the testdata target to work environments like cygwin/msys when a windows style path is used. It may also fix using paths with spaces, though that's not generally recommended. Change-Id: Id444c14468b05d589bce49c1f612aa712a3f0c8c --- libs.mk | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/libs.mk b/libs.mk index 92cf5509fb..1411fee9a1 100644 --- a/libs.mk +++ b/libs.mk @@ -545,7 +545,7 @@ testdata: $(LIBVPX_TEST_DATA) echo "Checking test data:";\ for f in $(call enabled,LIBVPX_TEST_DATA); do\ grep $$f $(SRC_PATH_BARE)/test/test-data.sha1 |\ - (cd $(LIBVPX_TEST_DATA_PATH); $${sha1sum} -c);\ + (cd "$(LIBVPX_TEST_DATA_PATH)"; $${sha1sum} -c);\ done; \ else\ echo "Skipping test data integrity check, sha1sum not found.";\ @@ -764,10 +764,10 @@ TEST_BIN_PATH := $(addsuffix /$(TGT_OS:win64=x64)/Release, $(TEST_BIN_PATH)) endif utiltest utiltest-no-data-check: $(qexec)$(SRC_PATH_BARE)/test/vpxdec.sh \ - --test-data-path $(LIBVPX_TEST_DATA_PATH) \ + --test-data-path "$(LIBVPX_TEST_DATA_PATH)" \ --bin-path $(TEST_BIN_PATH) $(qexec)$(SRC_PATH_BARE)/test/vpxenc.sh \ - --test-data-path $(LIBVPX_TEST_DATA_PATH) \ + --test-data-path "$(LIBVPX_TEST_DATA_PATH)" \ --bin-path $(TEST_BIN_PATH) utiltest: testdata else @@ -791,7 +791,7 @@ EXAMPLES_BIN_PATH := $(TGT_OS:win64=x64)/Release endif exampletest exampletest-no-data-check: examples $(qexec)$(SRC_PATH_BARE)/test/examples.sh \ - --test-data-path $(LIBVPX_TEST_DATA_PATH) \ + --test-data-path "$(LIBVPX_TEST_DATA_PATH)" \ --bin-path $(EXAMPLES_BIN_PATH) exampletest: testdata else From f7d5c3eff865299e0915f9683fd28e325dcb75a9 Mon Sep 17 00:00:00 2001 From: James Zern Date: Thu, 20 Apr 2023 12:17:05 -0700 Subject: [PATCH 0755/1000] configure: skip arm64_neon.h workaround w/VS >= 2019 Visual Studio 2019+ include arm64_neon.h from arm_neon.h Bug: b/277255076 Change-Id: I52f42b69a5efe8214a4c541b68e940ad07499584 --- build/make/configure.sh | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/build/make/configure.sh b/build/make/configure.sh index 4bf090f006..32105651ff 100644 --- a/build/make/configure.sh +++ b/build/make/configure.sh @@ -1066,8 +1066,11 @@ EOF enable_feature win_arm64_neon_h_workaround else # If a probe is not possible, assume this is the pure Windows - # SDK and so the workaround is necessary. - enable_feature win_arm64_neon_h_workaround + # SDK and so the workaround is necessary when using Visual + # Studio < 2019. + if [ ${tgt_cc##vs} -lt 16 ]; then + enable_feature win_arm64_neon_h_workaround + fi fi fi fi From f49879a2a3420fd140692f3b23c5c57b6298c954 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Thu, 20 Apr 2023 11:06:32 -0400 Subject: [PATCH 0756/1000] Store tpl stats before propagation Add two new structs TplBlockStats and TplFrameStats to store tpl stats before propagation Change-Id: I903db99326b199ed8f2d8b19ccb973a8c8910501 --- vp9/encoder/vp9_encoder.c | 5 ++++- vp9/encoder/vp9_encoder.h | 18 ++++++++++++++++++ vp9/encoder/vp9_tpl_model.c | 37 +++++++++++++++++++++++++++++++++++++ 3 files changed, 59 insertions(+), 1 deletion(-) diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 354f08eae8..662ec24b83 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -2622,7 +2622,10 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf, #if CONFIG_NON_GREEDY_MV cpi->tpl_ready = 0; #endif // CONFIG_NON_GREEDY_MV - for (i = 0; i < MAX_ARF_GOP_SIZE; ++i) cpi->tpl_stats[i].tpl_stats_ptr = NULL; + for (i = 0; i < MAX_ARF_GOP_SIZE; ++i) { + cpi->tpl_stats[i].tpl_stats_ptr = NULL; + cpi->tpl_frame_stats[i].block_stats_list = NULL; + } // Allocate memory to store variances for a frame. CHECK_MEM_ERROR(cm, cpi->source_diff_var, diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 9e5e64629e..43789864c7 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -324,6 +324,22 @@ typedef struct TplDepFrame { #endif } TplDepFrame; +// Used to store the stats before propagation. +typedef struct TplBlockStats { + int64_t intra_cost; + int64_t inter_cost; + int_mv mv; + int64_t recrf_rate; + int64_t recrf_dist; + int ref_frame_index; +} TplBlockStats; + +typedef struct TplFrameStats { + int frame_width; + int frame_height; + TplBlockStats *block_stats_list; +} TplFrameStats; + #define TPL_DEP_COST_SCALE_LOG2 4 // TODO(jingning) All spatially adaptive variables should go to TileDataEnc. @@ -743,6 +759,8 @@ typedef struct VP9_COMP { BLOCK_SIZE tpl_bsize; TplDepFrame tpl_stats[MAX_ARF_GOP_SIZE]; + // Used to store TPL stats before propagation + TplFrameStats tpl_frame_stats[MAX_ARF_GOP_SIZE]; YV12_BUFFER_CONFIG *tpl_recon_frames[REF_FRAMES]; EncFrameBuf enc_frame_buf[REF_FRAMES]; #if CONFIG_MULTITHREAD diff --git a/vp9/encoder/vp9_tpl_model.c b/vp9/encoder/vp9_tpl_model.c index 53ef356981..c9565e8cab 100644 --- a/vp9/encoder/vp9_tpl_model.c +++ b/vp9/encoder/vp9_tpl_model.c @@ -157,9 +157,13 @@ static void init_tpl_stats(VP9_COMP *cpi) { int frame_idx; for (frame_idx = 0; frame_idx < MAX_ARF_GOP_SIZE; ++frame_idx) { TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx]; + TplFrameStats *tpl_frame_stats = &cpi->tpl_frame_stats[frame_idx]; memset(tpl_frame->tpl_stats_ptr, 0, tpl_frame->height * tpl_frame->width * sizeof(*tpl_frame->tpl_stats_ptr)); + memset(tpl_frame_stats->block_stats_list, 0, + tpl_frame->height * tpl_frame->width * + sizeof(*tpl_frame_stats->block_stats_list)); tpl_frame->is_valid = 0; } } @@ -355,6 +359,27 @@ static void tpl_model_store(TplDepStats *tpl_stats, int mi_row, int mi_col, } } +static void tpl_store_before_propagation(TplBlockStats *tpl_block_stats, + TplDepStats *tpl_stats, int mi_row, + int mi_col, BLOCK_SIZE bsize, + int stride) { + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + const TplDepStats *src_stats = &tpl_stats[mi_row * stride + mi_col]; + int idx, idy; + + for (idy = 0; idy < mi_height; ++idy) { + for (idx = 0; idx < mi_width; ++idx) { + TplBlockStats *tpl_block_stats_ptr = + &tpl_block_stats[(mi_row + idy) * stride + mi_col + idx]; + tpl_block_stats_ptr->inter_cost = src_stats->inter_cost; + tpl_block_stats_ptr->intra_cost = src_stats->intra_cost; + tpl_block_stats_ptr->mv = src_stats->mv; + tpl_block_stats_ptr->ref_frame_index = src_stats->ref_frame_index; + } + } +} + static void tpl_model_update_b(TplDepFrame *tpl_frame, TplDepStats *tpl_stats, int mi_row, int mi_col, const BLOCK_SIZE bsize) { TplDepFrame *ref_tpl_frame = &tpl_frame[tpl_stats->ref_frame_index]; @@ -1062,6 +1087,8 @@ static void build_motion_field( static void mc_flow_dispenser(VP9_COMP *cpi, GF_PICTURE *gf_picture, int frame_idx, BLOCK_SIZE bsize) { TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx]; + TplFrameStats *tpl_frame_stats_before_propagation = + &cpi->tpl_frame_stats[frame_idx]; YV12_BUFFER_CONFIG *this_frame = gf_picture[frame_idx].frame; YV12_BUFFER_CONFIG *ref_frame[MAX_INTER_REF_FRAMES] = { NULL, NULL, NULL }; @@ -1158,6 +1185,10 @@ static void mc_flow_dispenser(VP9_COMP *cpi, GF_PICTURE *gf_picture, tpl_model_store(tpl_frame->tpl_stats_ptr, mi_row, mi_col, bsize, tpl_frame->stride); + tpl_store_before_propagation( + tpl_frame_stats_before_propagation->block_stats_list, + tpl_frame->tpl_stats_ptr, mi_row, mi_col, bsize, tpl_frame->stride); + tpl_model_update(cpi->tpl_stats, tpl_frame->tpl_stats_ptr, mi_row, mi_col, bsize); } @@ -1294,6 +1325,11 @@ void vp9_init_tpl_buffer(VP9_COMP *cpi) { CHECK_MEM_ERROR(cm, cpi->tpl_stats[frame].tpl_stats_ptr, vpx_calloc(mi_rows * mi_cols, sizeof(*cpi->tpl_stats[frame].tpl_stats_ptr))); + vpx_free(cpi->tpl_frame_stats[frame].block_stats_list); + CHECK_MEM_ERROR( + cm, cpi->tpl_frame_stats[frame].block_stats_list, + vpx_calloc(mi_rows * mi_cols, + sizeof(*cpi->tpl_frame_stats[frame].block_stats_list))); cpi->tpl_stats[frame].is_valid = 0; cpi->tpl_stats[frame].width = mi_cols; cpi->tpl_stats[frame].height = mi_rows; @@ -1324,6 +1360,7 @@ void vp9_free_tpl_buffer(VP9_COMP *cpi) { #endif vpx_free(cpi->tpl_stats[frame].tpl_stats_ptr); cpi->tpl_stats[frame].is_valid = 0; + vpx_free(cpi->tpl_frame_stats[frame].block_stats_list); } } From b27cf67c30eee263d1d41a5953e63c976fd82365 Mon Sep 17 00:00:00 2001 From: James Zern Date: Thu, 20 Apr 2023 14:17:49 -0700 Subject: [PATCH 0757/1000] register_state_check: clear -Wshadow warning with --target=x86_64-win64-gcc Bug: webm:1793 Change-Id: I265533af4e8d05adbe1d66a62b6dcb191ca48747 --- test/register_state_check.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/test/register_state_check.h b/test/register_state_check.h index 0b837dd042..ede86ef52f 100644 --- a/test/register_state_check.h +++ b/test/register_state_check.h @@ -184,13 +184,13 @@ class RegisterStateCheckMMX { uint16_t pre_fpu_env_[14]; }; -#define API_REGISTER_STATE_CHECK(statement) \ - do { \ - { \ - libvpx_test::RegisterStateCheckMMX reg_check; \ - ASM_REGISTER_STATE_CHECK(statement); \ - } \ - __asm__ volatile("" ::: "memory"); \ +#define API_REGISTER_STATE_CHECK(statement) \ + do { \ + { \ + libvpx_test::RegisterStateCheckMMX reg_check_mmx; \ + ASM_REGISTER_STATE_CHECK(statement); \ + } \ + __asm__ volatile("" ::: "memory"); \ } while (false) } // namespace libvpx_test From 3c59378e4eac2d241fba8b26e660318b850e5773 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Thu, 20 Apr 2023 15:09:00 -0400 Subject: [PATCH 0758/1000] Calculate recrf_dist and recrf_rate Change-Id: I74e74807436b92d729e2ccaab96149780f1f52d9 --- vp9/encoder/vp9_tpl_model.c | 52 ++++++++++++++++++++++++++++--------- vp9/encoder/vp9_tpl_model.h | 2 ++ 2 files changed, 42 insertions(+), 12 deletions(-) diff --git a/vp9/encoder/vp9_tpl_model.c b/vp9/encoder/vp9_tpl_model.c index 0f9df78462..81c319c9fd 100644 --- a/vp9/encoder/vp9_tpl_model.c +++ b/vp9/encoder/vp9_tpl_model.c @@ -362,7 +362,8 @@ static void tpl_model_store(TplDepStats *tpl_stats, int mi_row, int mi_col, static void tpl_store_before_propagation(TplBlockStats *tpl_block_stats, TplDepStats *tpl_stats, int mi_row, int mi_col, BLOCK_SIZE bsize, - int stride) { + int stride, int64_t recon_error, + int64_t rate_cost) { const int mi_height = num_8x8_blocks_high_lookup[bsize]; const int mi_width = num_8x8_blocks_wide_lookup[bsize]; const TplDepStats *src_stats = &tpl_stats[mi_row * stride + mi_col]; @@ -374,6 +375,8 @@ static void tpl_store_before_propagation(TplBlockStats *tpl_block_stats, &tpl_block_stats[(mi_row + idy) * stride + mi_col + idx]; tpl_block_stats_ptr->inter_cost = src_stats->inter_cost; tpl_block_stats_ptr->intra_cost = src_stats->intra_cost; + tpl_block_stats_ptr->recrf_dist = recon_error << TPL_DEP_COST_SCALE_LOG2; + tpl_block_stats_ptr->recrf_rate = rate_cost; tpl_block_stats_ptr->mv = src_stats->mv; tpl_block_stats_ptr->ref_frame_index = src_stats->ref_frame_index; } @@ -455,12 +458,11 @@ static void tpl_model_update(TplDepFrame *tpl_frame, TplDepStats *tpl_stats, static void get_quantize_error(MACROBLOCK *x, int plane, tran_low_t *coeff, tran_low_t *qcoeff, tran_low_t *dqcoeff, TX_SIZE tx_size, int64_t *recon_error, - int64_t *sse) { + int64_t *sse, uint16_t *eob) { MACROBLOCKD *const xd = &x->e_mbd; const struct macroblock_plane *const p = &x->plane[plane]; const struct macroblockd_plane *const pd = &xd->plane[plane]; const ScanOrder *const scan_order = &vp9_default_scan_orders[tx_size]; - uint16_t eob; int pix_num = 1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]]; const int shift = tx_size == TX_32X32 ? 0 : 2; @@ -470,16 +472,16 @@ static void get_quantize_error(MACROBLOCK *x, int plane, tran_low_t *coeff, #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { vp9_highbd_quantize_fp_32x32(coeff, pix_num, p->round_fp, p->quant_fp, - qcoeff, dqcoeff, pd->dequant, &eob, + qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); } else { vp9_quantize_fp_32x32(coeff, pix_num, p->round_fp, p->quant_fp, qcoeff, - dqcoeff, pd->dequant, &eob, scan_order->scan, + dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); } #else vp9_quantize_fp_32x32(coeff, pix_num, p->round_fp, p->quant_fp, qcoeff, - dqcoeff, pd->dequant, &eob, scan_order->scan, + dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); #endif // CONFIG_VP9_HIGHBITDEPTH @@ -523,6 +525,19 @@ static void set_mv_limits(const VP9_COMMON *cm, MACROBLOCK *x, int mi_row, ((cm->mi_cols - 1 - mi_col) * MI_SIZE) + (17 - 2 * VP9_INTERP_EXTEND); } +static int rate_estimator(const tran_low_t *qcoeff, int eob, TX_SIZE tx_size) { + const ScanOrder *const scan_order = &vp9_scan_orders[tx_size][DCT_DCT]; + int rate_cost = 1; + int idx; + assert((1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]]) >= eob); + for (idx = 0; idx < eob; ++idx) { + unsigned int abs_level = abs(qcoeff[scan_order->scan[idx]]); + rate_cost += get_msb(abs_level + 1) + 1 + (abs_level > 0); + } + + return (rate_cost << VP9_PROB_COST_SHIFT); +} + static void mode_estimation(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, struct scale_factors *sf, GF_PICTURE *gf_picture, int frame_idx, TplDepFrame *tpl_frame, @@ -530,7 +545,8 @@ static void mode_estimation(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, tran_low_t *qcoeff, tran_low_t *dqcoeff, int mi_row, int mi_col, BLOCK_SIZE bsize, TX_SIZE tx_size, YV12_BUFFER_CONFIG *ref_frame[], uint8_t *predictor, - int64_t *recon_error, int64_t *sse) { + int64_t *recon_error, int64_t *rate_cost, + int64_t *sse) { VP9_COMMON *cm = &cpi->common; ThreadData *td = &cpi->td; @@ -553,6 +569,7 @@ static void mode_estimation(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, const int mi_width = num_8x8_blocks_wide_lookup[bsize]; TplDepStats *tpl_stats = &tpl_frame->tpl_stats_ptr[mi_row * tpl_frame->stride + mi_col]; + uint16_t eob = 0; xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8); xd->mb_to_bottom_edge = ((cm->mi_rows - 1 - mi_row) * MI_SIZE) * 8; @@ -606,6 +623,8 @@ static void mode_estimation(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) { int_mv mv; + int64_t this_recon_error = 0; + int64_t this_rate = 0; #if CONFIG_NON_GREEDY_MV MotionField *motion_field; #endif @@ -657,12 +676,17 @@ static void mode_estimation(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, inter_cost = vpx_satd(coeff, pix_num); #endif + get_quantize_error(x, 0, coeff, qcoeff, dqcoeff, tx_size, &this_recon_error, + sse, &eob); + + this_rate = rate_estimator(qcoeff, eob, tx_size); + *rate_cost += this_rate; + *recon_error += this_recon_error; + if (inter_cost < best_inter_cost) { best_rf_idx = rf_idx; best_inter_cost = inter_cost; best_mv.as_int = mv.as_int; - get_quantize_error(x, 0, coeff, qcoeff, dqcoeff, tx_size, recon_error, - sse); } } best_intra_cost = VPXMAX(best_intra_cost, 1); @@ -1115,7 +1139,6 @@ static void mc_flow_dispenser(VP9_COMP *cpi, GF_PICTURE *gf_picture, const TX_SIZE tx_size = max_txsize_lookup[bsize]; const int mi_height = num_8x8_blocks_high_lookup[bsize]; const int mi_width = num_8x8_blocks_wide_lookup[bsize]; - int64_t recon_error, sse; // Setup scaling factor #if CONFIG_VP9_HIGHBITDEPTH @@ -1178,16 +1201,21 @@ static void mc_flow_dispenser(VP9_COMP *cpi, GF_PICTURE *gf_picture, for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) { for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) { + int64_t recon_error = 0; + int64_t rate_cost = 0; + int64_t sse = 0; mode_estimation(cpi, x, xd, &sf, gf_picture, frame_idx, tpl_frame, src_diff, coeff, qcoeff, dqcoeff, mi_row, mi_col, bsize, - tx_size, ref_frame, predictor, &recon_error, &sse); + tx_size, ref_frame, predictor, &recon_error, &rate_cost, + &sse); // Motion flow dependency dispenser. tpl_model_store(tpl_frame->tpl_stats_ptr, mi_row, mi_col, bsize, tpl_frame->stride); tpl_store_before_propagation( tpl_frame_stats_before_propagation->block_stats_list, - tpl_frame->tpl_stats_ptr, mi_row, mi_col, bsize, tpl_frame->stride); + tpl_frame->tpl_stats_ptr, mi_row, mi_col, bsize, tpl_frame->stride, + recon_error, rate_cost); tpl_model_update(cpi->tpl_stats, tpl_frame->tpl_stats_ptr, mi_row, mi_col, bsize); diff --git a/vp9/encoder/vp9_tpl_model.h b/vp9/encoder/vp9_tpl_model.h index 86a7734f82..04beb22610 100644 --- a/vp9/encoder/vp9_tpl_model.h +++ b/vp9/encoder/vp9_tpl_model.h @@ -20,6 +20,8 @@ extern "C" { #endif #define log2f(x) (log(x) / (float)M_LOG2_E) +#define TPL_DEP_COST_SCALE_LOG2 4 + typedef struct GF_PICTURE { YV12_BUFFER_CONFIG *frame; int ref_frame[3]; From a425371ccdfc5a6faf17af216e16c2ad2ccb4d05 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Fri, 21 Apr 2023 18:10:46 +0000 Subject: [PATCH 0759/1000] Revert "Calculate recrf_dist and recrf_rate" This reverts commit 3c59378e4eac2d241fba8b26e660318b850e5773. Reason for revert: recon_error and recon_rate is summed by mistake across reference frames, as pointed out by Angie. It could also cause vp9 behavior changes. Original change's description: > Calculate recrf_dist and recrf_rate > > Change-Id: I74e74807436b92d729e2ccaab96149780f1f52d9 Change-Id: I6106ce77cb0fe8c12b2bcf070d01513ffa8dc613 No-Presubmit: true No-Tree-Checks: true No-Try: true --- vp9/encoder/vp9_tpl_model.c | 52 +++++++++---------------------------- vp9/encoder/vp9_tpl_model.h | 2 -- 2 files changed, 12 insertions(+), 42 deletions(-) diff --git a/vp9/encoder/vp9_tpl_model.c b/vp9/encoder/vp9_tpl_model.c index 81c319c9fd..0f9df78462 100644 --- a/vp9/encoder/vp9_tpl_model.c +++ b/vp9/encoder/vp9_tpl_model.c @@ -362,8 +362,7 @@ static void tpl_model_store(TplDepStats *tpl_stats, int mi_row, int mi_col, static void tpl_store_before_propagation(TplBlockStats *tpl_block_stats, TplDepStats *tpl_stats, int mi_row, int mi_col, BLOCK_SIZE bsize, - int stride, int64_t recon_error, - int64_t rate_cost) { + int stride) { const int mi_height = num_8x8_blocks_high_lookup[bsize]; const int mi_width = num_8x8_blocks_wide_lookup[bsize]; const TplDepStats *src_stats = &tpl_stats[mi_row * stride + mi_col]; @@ -375,8 +374,6 @@ static void tpl_store_before_propagation(TplBlockStats *tpl_block_stats, &tpl_block_stats[(mi_row + idy) * stride + mi_col + idx]; tpl_block_stats_ptr->inter_cost = src_stats->inter_cost; tpl_block_stats_ptr->intra_cost = src_stats->intra_cost; - tpl_block_stats_ptr->recrf_dist = recon_error << TPL_DEP_COST_SCALE_LOG2; - tpl_block_stats_ptr->recrf_rate = rate_cost; tpl_block_stats_ptr->mv = src_stats->mv; tpl_block_stats_ptr->ref_frame_index = src_stats->ref_frame_index; } @@ -458,11 +455,12 @@ static void tpl_model_update(TplDepFrame *tpl_frame, TplDepStats *tpl_stats, static void get_quantize_error(MACROBLOCK *x, int plane, tran_low_t *coeff, tran_low_t *qcoeff, tran_low_t *dqcoeff, TX_SIZE tx_size, int64_t *recon_error, - int64_t *sse, uint16_t *eob) { + int64_t *sse) { MACROBLOCKD *const xd = &x->e_mbd; const struct macroblock_plane *const p = &x->plane[plane]; const struct macroblockd_plane *const pd = &xd->plane[plane]; const ScanOrder *const scan_order = &vp9_default_scan_orders[tx_size]; + uint16_t eob; int pix_num = 1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]]; const int shift = tx_size == TX_32X32 ? 0 : 2; @@ -472,16 +470,16 @@ static void get_quantize_error(MACROBLOCK *x, int plane, tran_low_t *coeff, #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { vp9_highbd_quantize_fp_32x32(coeff, pix_num, p->round_fp, p->quant_fp, - qcoeff, dqcoeff, pd->dequant, eob, + qcoeff, dqcoeff, pd->dequant, &eob, scan_order->scan, scan_order->iscan); } else { vp9_quantize_fp_32x32(coeff, pix_num, p->round_fp, p->quant_fp, qcoeff, - dqcoeff, pd->dequant, eob, scan_order->scan, + dqcoeff, pd->dequant, &eob, scan_order->scan, scan_order->iscan); } #else vp9_quantize_fp_32x32(coeff, pix_num, p->round_fp, p->quant_fp, qcoeff, - dqcoeff, pd->dequant, eob, scan_order->scan, + dqcoeff, pd->dequant, &eob, scan_order->scan, scan_order->iscan); #endif // CONFIG_VP9_HIGHBITDEPTH @@ -525,19 +523,6 @@ static void set_mv_limits(const VP9_COMMON *cm, MACROBLOCK *x, int mi_row, ((cm->mi_cols - 1 - mi_col) * MI_SIZE) + (17 - 2 * VP9_INTERP_EXTEND); } -static int rate_estimator(const tran_low_t *qcoeff, int eob, TX_SIZE tx_size) { - const ScanOrder *const scan_order = &vp9_scan_orders[tx_size][DCT_DCT]; - int rate_cost = 1; - int idx; - assert((1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]]) >= eob); - for (idx = 0; idx < eob; ++idx) { - unsigned int abs_level = abs(qcoeff[scan_order->scan[idx]]); - rate_cost += get_msb(abs_level + 1) + 1 + (abs_level > 0); - } - - return (rate_cost << VP9_PROB_COST_SHIFT); -} - static void mode_estimation(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, struct scale_factors *sf, GF_PICTURE *gf_picture, int frame_idx, TplDepFrame *tpl_frame, @@ -545,8 +530,7 @@ static void mode_estimation(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, tran_low_t *qcoeff, tran_low_t *dqcoeff, int mi_row, int mi_col, BLOCK_SIZE bsize, TX_SIZE tx_size, YV12_BUFFER_CONFIG *ref_frame[], uint8_t *predictor, - int64_t *recon_error, int64_t *rate_cost, - int64_t *sse) { + int64_t *recon_error, int64_t *sse) { VP9_COMMON *cm = &cpi->common; ThreadData *td = &cpi->td; @@ -569,7 +553,6 @@ static void mode_estimation(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, const int mi_width = num_8x8_blocks_wide_lookup[bsize]; TplDepStats *tpl_stats = &tpl_frame->tpl_stats_ptr[mi_row * tpl_frame->stride + mi_col]; - uint16_t eob = 0; xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8); xd->mb_to_bottom_edge = ((cm->mi_rows - 1 - mi_row) * MI_SIZE) * 8; @@ -623,8 +606,6 @@ static void mode_estimation(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) { int_mv mv; - int64_t this_recon_error = 0; - int64_t this_rate = 0; #if CONFIG_NON_GREEDY_MV MotionField *motion_field; #endif @@ -676,17 +657,12 @@ static void mode_estimation(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, inter_cost = vpx_satd(coeff, pix_num); #endif - get_quantize_error(x, 0, coeff, qcoeff, dqcoeff, tx_size, &this_recon_error, - sse, &eob); - - this_rate = rate_estimator(qcoeff, eob, tx_size); - *rate_cost += this_rate; - *recon_error += this_recon_error; - if (inter_cost < best_inter_cost) { best_rf_idx = rf_idx; best_inter_cost = inter_cost; best_mv.as_int = mv.as_int; + get_quantize_error(x, 0, coeff, qcoeff, dqcoeff, tx_size, recon_error, + sse); } } best_intra_cost = VPXMAX(best_intra_cost, 1); @@ -1139,6 +1115,7 @@ static void mc_flow_dispenser(VP9_COMP *cpi, GF_PICTURE *gf_picture, const TX_SIZE tx_size = max_txsize_lookup[bsize]; const int mi_height = num_8x8_blocks_high_lookup[bsize]; const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + int64_t recon_error, sse; // Setup scaling factor #if CONFIG_VP9_HIGHBITDEPTH @@ -1201,21 +1178,16 @@ static void mc_flow_dispenser(VP9_COMP *cpi, GF_PICTURE *gf_picture, for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) { for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) { - int64_t recon_error = 0; - int64_t rate_cost = 0; - int64_t sse = 0; mode_estimation(cpi, x, xd, &sf, gf_picture, frame_idx, tpl_frame, src_diff, coeff, qcoeff, dqcoeff, mi_row, mi_col, bsize, - tx_size, ref_frame, predictor, &recon_error, &rate_cost, - &sse); + tx_size, ref_frame, predictor, &recon_error, &sse); // Motion flow dependency dispenser. tpl_model_store(tpl_frame->tpl_stats_ptr, mi_row, mi_col, bsize, tpl_frame->stride); tpl_store_before_propagation( tpl_frame_stats_before_propagation->block_stats_list, - tpl_frame->tpl_stats_ptr, mi_row, mi_col, bsize, tpl_frame->stride, - recon_error, rate_cost); + tpl_frame->tpl_stats_ptr, mi_row, mi_col, bsize, tpl_frame->stride); tpl_model_update(cpi->tpl_stats, tpl_frame->tpl_stats_ptr, mi_row, mi_col, bsize); diff --git a/vp9/encoder/vp9_tpl_model.h b/vp9/encoder/vp9_tpl_model.h index 04beb22610..86a7734f82 100644 --- a/vp9/encoder/vp9_tpl_model.h +++ b/vp9/encoder/vp9_tpl_model.h @@ -20,8 +20,6 @@ extern "C" { #endif #define log2f(x) (log(x) / (float)M_LOG2_E) -#define TPL_DEP_COST_SCALE_LOG2 4 - typedef struct GF_PICTURE { YV12_BUFFER_CONFIG *frame; int ref_frame[3]; From ec2a75ce9c92798d0238575150e337a6f024fe3e Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 21 Apr 2023 13:03:34 -0700 Subject: [PATCH 0760/1000] vp9_highbd_iht16x16_add_neon: clear -Wshadow warning Bug: webm:1793 Change-Id: I4e79a4d7d41b6abf88e3e60c54ab48a92b0346d2 --- vp9/common/arm/neon/vp9_highbd_iht16x16_add_neon.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vp9/common/arm/neon/vp9_highbd_iht16x16_add_neon.c b/vp9/common/arm/neon/vp9_highbd_iht16x16_add_neon.c index 219ff63cb8..aeb7e49c10 100644 --- a/vp9/common/arm/neon/vp9_highbd_iht16x16_add_neon.c +++ b/vp9/common/arm/neon/vp9_highbd_iht16x16_add_neon.c @@ -64,9 +64,9 @@ highbd_dct_const_round_shift_low_8(const int64x2x2_t *const in) { #define highbd_iadst_half_butterfly(in, c, lane, out) \ do { \ - int64x2x2_t t[2]; \ - vmull_lane_s32_dual(in, c, lane, t); \ - out = highbd_dct_const_round_shift_low_8(t); \ + int64x2x2_t _t[2]; \ + vmull_lane_s32_dual(in, c, lane, _t); \ + out = highbd_dct_const_round_shift_low_8(_t); \ } while (0) #define highbd_iadst_butterfly(in0, in1, c, lane0, lane1, s0, s1) \ From fed3de997ca639db0fd8a2a40b20300b16878292 Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 21 Apr 2023 13:03:58 -0700 Subject: [PATCH 0761/1000] highbd_vpx_convolve8_neon: clear -Wshadow warning Bug: webm:1793 Change-Id: If1a46fe183cd18e05b5538b1eba098e420b745ec --- vpx_dsp/arm/highbd_vpx_convolve8_neon.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vpx_dsp/arm/highbd_vpx_convolve8_neon.c b/vpx_dsp/arm/highbd_vpx_convolve8_neon.c index c46c016312..47684473ca 100644 --- a/vpx_dsp/arm/highbd_vpx_convolve8_neon.c +++ b/vpx_dsp/arm/highbd_vpx_convolve8_neon.c @@ -355,7 +355,6 @@ void vpx_highbd_convolve8_avg_horiz_neon(const uint16_t *src, } else { const int16x8_t filters = vld1q_s16(filter[x0_q4]); const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); - uint16x8_t t0, t1, t2, t3; assert(!((intptr_t)dst & 3)); assert(!(dst_stride & 3)); @@ -365,6 +364,7 @@ void vpx_highbd_convolve8_avg_horiz_neon(const uint16_t *src, if (h == 4) { int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; int32x4_t d0, d1, d2, d3; + uint16x8_t t0, t1, t2, t3; uint16x8_t d01, d23, t01, t23; __builtin_prefetch(src + 0 * src_stride); From 24802201acd7dfa15928bcc47c1e270e7db5afac Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Thu, 20 Apr 2023 15:09:00 -0400 Subject: [PATCH 0762/1000] Reland "Calculate recrf_dist and recrf_rate" This is a reland of commit 3c59378e4eac2d241fba8b26e660318b850e5773 Addressed issues from the previous CL: - Both recon_error and rate_cost are scaled up - recon_error and rate_cost are not accumulated across ref frames, instead they are calculated with the best ref frame picked. - get_quantize_error() is put where it was, so there is no behavior change for vp9. Bug: b/273736974 Original change's description: > Calculate recrf_dist and recrf_rate > > Change-Id: I74e74807436b92d729e2ccaab96149780f1f52d9 Change-Id: I20e1f5543e83b576a074bd4e6b44d99da65f4b56 --- vp9/encoder/vp9_tpl_model.c | 46 ++++++++++++++++++++++++++++--------- vp9/encoder/vp9_tpl_model.h | 2 ++ 2 files changed, 37 insertions(+), 11 deletions(-) diff --git a/vp9/encoder/vp9_tpl_model.c b/vp9/encoder/vp9_tpl_model.c index 0f9df78462..d6ce480c89 100644 --- a/vp9/encoder/vp9_tpl_model.c +++ b/vp9/encoder/vp9_tpl_model.c @@ -362,7 +362,8 @@ static void tpl_model_store(TplDepStats *tpl_stats, int mi_row, int mi_col, static void tpl_store_before_propagation(TplBlockStats *tpl_block_stats, TplDepStats *tpl_stats, int mi_row, int mi_col, BLOCK_SIZE bsize, - int stride) { + int stride, int64_t recon_error, + int64_t rate_cost) { const int mi_height = num_8x8_blocks_high_lookup[bsize]; const int mi_width = num_8x8_blocks_wide_lookup[bsize]; const TplDepStats *src_stats = &tpl_stats[mi_row * stride + mi_col]; @@ -374,6 +375,8 @@ static void tpl_store_before_propagation(TplBlockStats *tpl_block_stats, &tpl_block_stats[(mi_row + idy) * stride + mi_col + idx]; tpl_block_stats_ptr->inter_cost = src_stats->inter_cost; tpl_block_stats_ptr->intra_cost = src_stats->intra_cost; + tpl_block_stats_ptr->recrf_dist = recon_error << TPL_DEP_COST_SCALE_LOG2; + tpl_block_stats_ptr->recrf_rate = rate_cost << TPL_DEP_COST_SCALE_LOG2; tpl_block_stats_ptr->mv = src_stats->mv; tpl_block_stats_ptr->ref_frame_index = src_stats->ref_frame_index; } @@ -455,12 +458,11 @@ static void tpl_model_update(TplDepFrame *tpl_frame, TplDepStats *tpl_stats, static void get_quantize_error(MACROBLOCK *x, int plane, tran_low_t *coeff, tran_low_t *qcoeff, tran_low_t *dqcoeff, TX_SIZE tx_size, int64_t *recon_error, - int64_t *sse) { + int64_t *sse, uint16_t *eob) { MACROBLOCKD *const xd = &x->e_mbd; const struct macroblock_plane *const p = &x->plane[plane]; const struct macroblockd_plane *const pd = &xd->plane[plane]; const ScanOrder *const scan_order = &vp9_default_scan_orders[tx_size]; - uint16_t eob; int pix_num = 1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]]; const int shift = tx_size == TX_32X32 ? 0 : 2; @@ -470,16 +472,16 @@ static void get_quantize_error(MACROBLOCK *x, int plane, tran_low_t *coeff, #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { vp9_highbd_quantize_fp_32x32(coeff, pix_num, p->round_fp, p->quant_fp, - qcoeff, dqcoeff, pd->dequant, &eob, + qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); } else { vp9_quantize_fp_32x32(coeff, pix_num, p->round_fp, p->quant_fp, qcoeff, - dqcoeff, pd->dequant, &eob, scan_order->scan, + dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); } #else vp9_quantize_fp_32x32(coeff, pix_num, p->round_fp, p->quant_fp, qcoeff, - dqcoeff, pd->dequant, &eob, scan_order->scan, + dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); #endif // CONFIG_VP9_HIGHBITDEPTH @@ -523,6 +525,19 @@ static void set_mv_limits(const VP9_COMMON *cm, MACROBLOCK *x, int mi_row, ((cm->mi_cols - 1 - mi_col) * MI_SIZE) + (17 - 2 * VP9_INTERP_EXTEND); } +static int rate_estimator(const tran_low_t *qcoeff, int eob, TX_SIZE tx_size) { + const ScanOrder *const scan_order = &vp9_scan_orders[tx_size][DCT_DCT]; + int rate_cost = 1; + int idx; + assert((1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]]) >= eob); + for (idx = 0; idx < eob; ++idx) { + unsigned int abs_level = abs(qcoeff[scan_order->scan[idx]]); + rate_cost += get_msb(abs_level + 1) + 1 + (abs_level > 0); + } + + return (rate_cost << VP9_PROB_COST_SHIFT); +} + static void mode_estimation(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, struct scale_factors *sf, GF_PICTURE *gf_picture, int frame_idx, TplDepFrame *tpl_frame, @@ -530,7 +545,8 @@ static void mode_estimation(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, tran_low_t *qcoeff, tran_low_t *dqcoeff, int mi_row, int mi_col, BLOCK_SIZE bsize, TX_SIZE tx_size, YV12_BUFFER_CONFIG *ref_frame[], uint8_t *predictor, - int64_t *recon_error, int64_t *sse) { + int64_t *recon_error, int64_t *rate_cost, + int64_t *sse) { VP9_COMMON *cm = &cpi->common; ThreadData *td = &cpi->td; @@ -658,11 +674,15 @@ static void mode_estimation(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, #endif if (inter_cost < best_inter_cost) { + uint16_t eob = 0; best_rf_idx = rf_idx; best_inter_cost = inter_cost; best_mv.as_int = mv.as_int; + // Since best_inter_cost is initialized as INT64_MAX, recon_error and + // rate_cost will be calculated with the best reference frame. get_quantize_error(x, 0, coeff, qcoeff, dqcoeff, tx_size, recon_error, - sse); + sse, &eob); + *rate_cost = rate_estimator(qcoeff, eob, tx_size); } } best_intra_cost = VPXMAX(best_intra_cost, 1); @@ -1115,7 +1135,6 @@ static void mc_flow_dispenser(VP9_COMP *cpi, GF_PICTURE *gf_picture, const TX_SIZE tx_size = max_txsize_lookup[bsize]; const int mi_height = num_8x8_blocks_high_lookup[bsize]; const int mi_width = num_8x8_blocks_wide_lookup[bsize]; - int64_t recon_error, sse; // Setup scaling factor #if CONFIG_VP9_HIGHBITDEPTH @@ -1178,16 +1197,21 @@ static void mc_flow_dispenser(VP9_COMP *cpi, GF_PICTURE *gf_picture, for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) { for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) { + int64_t recon_error = 0; + int64_t rate_cost = 0; + int64_t sse = 0; mode_estimation(cpi, x, xd, &sf, gf_picture, frame_idx, tpl_frame, src_diff, coeff, qcoeff, dqcoeff, mi_row, mi_col, bsize, - tx_size, ref_frame, predictor, &recon_error, &sse); + tx_size, ref_frame, predictor, &recon_error, &rate_cost, + &sse); // Motion flow dependency dispenser. tpl_model_store(tpl_frame->tpl_stats_ptr, mi_row, mi_col, bsize, tpl_frame->stride); tpl_store_before_propagation( tpl_frame_stats_before_propagation->block_stats_list, - tpl_frame->tpl_stats_ptr, mi_row, mi_col, bsize, tpl_frame->stride); + tpl_frame->tpl_stats_ptr, mi_row, mi_col, bsize, tpl_frame->stride, + recon_error, rate_cost); tpl_model_update(cpi->tpl_stats, tpl_frame->tpl_stats_ptr, mi_row, mi_col, bsize); diff --git a/vp9/encoder/vp9_tpl_model.h b/vp9/encoder/vp9_tpl_model.h index 86a7734f82..04beb22610 100644 --- a/vp9/encoder/vp9_tpl_model.h +++ b/vp9/encoder/vp9_tpl_model.h @@ -20,6 +20,8 @@ extern "C" { #endif #define log2f(x) (log(x) / (float)M_LOG2_E) +#define TPL_DEP_COST_SCALE_LOG2 4 + typedef struct GF_PICTURE { YV12_BUFFER_CONFIG *frame; int ref_frame[3]; From e7b58b69fd91a4288453c7c7003e1fc4cc48bb93 Mon Sep 17 00:00:00 2001 From: Neeraj Gadgil Date: Wed, 19 Apr 2023 08:13:26 +0530 Subject: [PATCH 0763/1000] Reduce joint motion search iters based on bsize Joint motion search during compound mode eval is optimized by reducing the number of mv search iterations based on bsize. The sf 'comp_inter_joint_search_thresh' is renamed as 'comp_inter_joint_search_iter_level' and used to add the logic. cpu Testset Instr. Cnt BD Rate loss (%) Red (%) avg. psnr ovr.psnr ssim 0 LOWRES2 5.373 0.0917 0.1088 0.0294 0 MIDRES2 3.395 0.0239 0.0520 0.0783 0 HDRES2 2.291 0.0223 0.0301 0.0053 0 Average 3.686 0.0460 0.0636 0.0377 STATS_CHANGED Change-Id: I7ee8873ebc8af967382324ae8f5c70c26665d5e6 --- vp9/encoder/vp9_rdopt.c | 40 +++++++++++++++++++++++++------- vp9/encoder/vp9_speed_features.c | 7 +++--- vp9/encoder/vp9_speed_features.h | 19 +++++++++++---- 3 files changed, 49 insertions(+), 17 deletions(-) diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index c68cfefdea..f051c62791 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -1898,11 +1898,22 @@ static INLINE int skip_single_mode_based_on_mode_rate( return 0; } -#define NUM_ITERS 4 +#define MAX_JOINT_MV_SEARCH_ITERS 4 +static INLINE int get_joint_search_iters(int sf_level, BLOCK_SIZE bsize) { + int num_iters = MAX_JOINT_MV_SEARCH_ITERS; // sf_level = 0 + if (sf_level >= 2) + num_iters = 0; + else if (sf_level >= 1) + num_iters = bsize < BLOCK_8X8 + ? 0 + : (bsize <= BLOCK_16X16 ? 2 : MAX_JOINT_MV_SEARCH_ITERS); + return num_iters; +} + static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int_mv *frame_mv, int mi_row, int mi_col, int_mv single_newmv[MAX_REF_FRAMES], - int *rate_mv) { + int *rate_mv, int num_iters) { const VP9_COMMON *const cm = &cpi->common; const int pw = 4 * num_4x4_blocks_wide_lookup[bsize]; const int ph = 4 * num_4x4_blocks_high_lookup[bsize]; @@ -1911,7 +1922,7 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, const int refs[2] = { mi->ref_frame[0], mi->ref_frame[1] < 0 ? 0 : mi->ref_frame[1] }; int_mv ref_mv[2]; - int_mv iter_mvs[NUM_ITERS][2]; + int_mv iter_mvs[MAX_JOINT_MV_SEARCH_ITERS][2]; int ite, ref; const InterpKernel *kernel = vp9_filter_kernels[mi->interp_filter]; struct scale_factors sf; @@ -1932,6 +1943,9 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, DECLARE_ALIGNED(16, uint8_t, second_pred[64 * 64]); #endif // CONFIG_VP9_HIGHBITDEPTH + // Check number of iterations do not exceed the max + assert(num_iters <= MAX_JOINT_MV_SEARCH_ITERS); + for (ref = 0; ref < 2; ++ref) { ref_mv[ref] = x->mbmi_ext->ref_mvs[refs[ref]][0]; @@ -1962,7 +1976,7 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, // Allow joint search multiple times iteratively for each reference frame // and break out of the search loop if it couldn't find a better mv. - for (ite = 0; ite < NUM_ITERS; ite++) { + for (ite = 0; ite < num_iters; ite++) { struct buf_2d ref_yv12[2]; uint32_t bestsme = UINT_MAX; int sadpb = x->sadperbit16; @@ -2044,7 +2058,7 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, } else { break; } - if (ite < NUM_ITERS - 1) { + if (ite < num_iters - 1) { iter_mvs[ite + 1][0].as_int = frame_mv[refs[0]].as_int; iter_mvs[ite + 1][1].as_int = frame_mv[refs[1]].as_int; } @@ -2250,12 +2264,16 @@ static int64_t rd_pick_best_sub8x8_mode( if (has_second_rf && this_mode == NEWMV && mi->interp_filter == EIGHTTAP) { + // Decide number of joint motion search iterations + const int num_joint_search_iters = get_joint_search_iters( + cpi->sf.comp_inter_joint_search_iter_level, bsize); // adjust src pointers mi_buf_shift(x, block); - if (sf->comp_inter_joint_search_thresh <= bsize) { + if (num_joint_search_iters) { int rate_mv; joint_motion_search(cpi, x, bsize, frame_mv[this_mode], mi_row, - mi_col, seg_mvs[block], &rate_mv); + mi_col, seg_mvs[block], &rate_mv, + num_joint_search_iters); seg_mvs[block][mi->ref_frame[0]].as_int = frame_mv[this_mode][mi->ref_frame[0]].as_int; seg_mvs[block][mi->ref_frame[1]].as_int = @@ -2878,16 +2896,20 @@ static int64_t handle_inter_mode( if (this_mode == NEWMV) { int rate_mv; if (is_comp_pred) { + // Decide number of joint motion search iterations + const int num_joint_search_iters = get_joint_search_iters( + cpi->sf.comp_inter_joint_search_iter_level, bsize); + // Initialize mv using single prediction mode result. frame_mv[refs[0]].as_int = single_newmv[refs[0]].as_int; frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int; - if (cpi->sf.comp_inter_joint_search_thresh <= bsize) { + if (num_joint_search_iters) { #if CONFIG_COLLECT_COMPONENT_TIMING start_timing(cpi, joint_motion_search_time); #endif joint_motion_search(cpi, x, bsize, frame_mv, mi_row, mi_col, - single_newmv, &rate_mv); + single_newmv, &rate_mv, num_joint_search_iters); #if CONFIG_COLLECT_COMPONENT_TIMING end_timing(cpi, joint_motion_search_time); #endif diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c index 04804da1ca..60720e3ea6 100644 --- a/vp9/encoder/vp9_speed_features.c +++ b/vp9/encoder/vp9_speed_features.c @@ -244,6 +244,7 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, sf->trellis_opt_tx_rd.thresh = boosted ? 4.0 : 3.0; sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V; + sf->comp_inter_joint_search_iter_level = 1; // Reference masking is not supported in dynamic scaling mode. sf->reference_masking = oxcf->resize_mode != RESIZE_DYNAMIC; @@ -331,7 +332,7 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, : FLAG_SKIP_INTRA_DIRMISMATCH | FLAG_SKIP_INTRA_BESTINTER | FLAG_SKIP_COMP_BESTINTRA | FLAG_SKIP_INTRA_LOWVAR; sf->disable_filter_search_var_thresh = 100; - sf->comp_inter_joint_search_thresh = BLOCK_SIZES; + sf->comp_inter_joint_search_iter_level = 2; sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX; sf->recode_tolerance_high = 45; sf->enhanced_full_pixel_motion_search = 0; @@ -530,7 +531,7 @@ static void set_rt_speed_feature_framesize_independent( } sf->disable_filter_search_var_thresh = 50; - sf->comp_inter_joint_search_thresh = BLOCK_SIZES; + sf->comp_inter_joint_search_iter_level = 2; sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX; sf->lf_motion_threshold = LOW_MOTION_THRESHOLD; sf->adjust_partitioning_from_last_frame = 1; @@ -928,7 +929,7 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi, int speed) { sf->mv.auto_mv_step_size = 0; sf->mv.fullpel_search_step_param = 6; sf->mv.use_downsampled_sad = 0; - sf->comp_inter_joint_search_thresh = BLOCK_4X4; + sf->comp_inter_joint_search_iter_level = 0; sf->tx_size_search_method = USE_FULL_RD; sf->use_lp32x32fdct = 0; sf->adaptive_motion_search = 0; diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h index 7cb3f3527d..70c61fe00d 100644 --- a/vp9/encoder/vp9_speed_features.h +++ b/vp9/encoder/vp9_speed_features.h @@ -286,11 +286,20 @@ typedef struct SPEED_FEATURES { // adds overhead. int static_segmentation; - // If 1 we iterate finding a best reference for 2 ref frames together - via - // a log search that iterates 4 times (check around mv for last for best - // error of combined predictor then check around mv for alt). If 0 we - // we just use the best motion vector found for each frame by itself. - BLOCK_SIZE comp_inter_joint_search_thresh; + // The best compound predictor is found using an iterative log search process + // that searches for best ref0 mv using error of combined predictor and then + // searches for best ref1 mv. This sf determines the number of iterations of + // this process based on block size. The sf becomes more aggressive from level + // 0 to 2. The following table indicates the number of iterations w.r.t bsize: + // ----------------------------------------------- + // |sf (level)|bsize < 8X8| [8X8, 16X16] | > 16X16 | + // | 0 | 4 | 4 | 4 | + // | 1 | 0 | 2 | 4 | + // | 2 | 0 | 0 | 0 | + // ----------------------------------------------- + // Here, 0 iterations indicate using the best single motion vector selected + // for each ref frame without any iterative refinement. + int comp_inter_joint_search_iter_level; // This variable is used to cap the maximum number of times we skip testing a // mode to be evaluated. A high value means we will be faster. From dbb1e8c7a6eafcd209a56c84b6f0111e74ec3ae5 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Thu, 27 Apr 2023 15:58:08 -0400 Subject: [PATCH 0764/1000] Clean up a stale TODO in tpl Change-Id: Ieccaff1cc94cbb2c5a294d83f3080f7407267016 --- vp9/encoder/vp9_tpl_model.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/vp9/encoder/vp9_tpl_model.c b/vp9/encoder/vp9_tpl_model.c index d6ce480c89..1dff8d3fd2 100644 --- a/vp9/encoder/vp9_tpl_model.c +++ b/vp9/encoder/vp9_tpl_model.c @@ -40,9 +40,6 @@ static void init_gop_frames(VP9_COMP *cpi, GF_PICTURE *gf_picture, memset(recon_frame_index, -1, sizeof(recon_frame_index)); stack_init(arf_index_stack, MAX_ARF_LAYERS); - // TODO(jingning): To be used later for gf frame type parsing. - (void)gf_group; - for (i = 0; i < FRAME_BUFFERS; ++i) { if (frame_bufs[i].ref_count == 0) { alloc_frame_mvs(cm, i); From 84a180fe858fd6de9c301cd884e2f1ff341781b3 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Thu, 27 Apr 2023 16:15:56 -0400 Subject: [PATCH 0765/1000] Move TplFrameStats to public header Get ready for changes to follow: - Custom reader/writer IO functions - Codec control to get TPL stats from the encoder Move the definition of TplFrameStats to public header so applications can use them directly. Bug: b/273736974 Change-Id: Ieb0db4560ddd966df1bc01f6a7e179cc97f9bac1 --- vp9/encoder/vp9_encoder.h | 16 ---------------- vp9/encoder/vp9_tpl_model.c | 3 ++- vpx/vpx_encoder.h | 19 +++++++++++++++++++ 3 files changed, 21 insertions(+), 17 deletions(-) diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 43789864c7..7c22c807b7 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -324,22 +324,6 @@ typedef struct TplDepFrame { #endif } TplDepFrame; -// Used to store the stats before propagation. -typedef struct TplBlockStats { - int64_t intra_cost; - int64_t inter_cost; - int_mv mv; - int64_t recrf_rate; - int64_t recrf_dist; - int ref_frame_index; -} TplBlockStats; - -typedef struct TplFrameStats { - int frame_width; - int frame_height; - TplBlockStats *block_stats_list; -} TplFrameStats; - #define TPL_DEP_COST_SCALE_LOG2 4 // TODO(jingning) All spatially adaptive variables should go to TileDataEnc. diff --git a/vp9/encoder/vp9_tpl_model.c b/vp9/encoder/vp9_tpl_model.c index 1dff8d3fd2..dbd7482b0d 100644 --- a/vp9/encoder/vp9_tpl_model.c +++ b/vp9/encoder/vp9_tpl_model.c @@ -374,7 +374,8 @@ static void tpl_store_before_propagation(TplBlockStats *tpl_block_stats, tpl_block_stats_ptr->intra_cost = src_stats->intra_cost; tpl_block_stats_ptr->recrf_dist = recon_error << TPL_DEP_COST_SCALE_LOG2; tpl_block_stats_ptr->recrf_rate = rate_cost << TPL_DEP_COST_SCALE_LOG2; - tpl_block_stats_ptr->mv = src_stats->mv; + tpl_block_stats_ptr->mv_r = src_stats->mv.as_mv.row; + tpl_block_stats_ptr->mv_c = src_stats->mv.as_mv.col; tpl_block_stats_ptr->ref_frame_index = src_stats->ref_frame_index; } } diff --git a/vpx/vpx_encoder.h b/vpx/vpx_encoder.h index a0d2c87558..9247231328 100644 --- a/vpx/vpx_encoder.h +++ b/vpx/vpx_encoder.h @@ -252,6 +252,25 @@ enum vpx_kf_mode { VPX_KF_DISABLED = 0 /**< Encoder does not place keyframes. */ }; +/*!\brief Temporal dependency model stats for each block before propagation */ +typedef struct TplBlockStats { + int64_t intra_cost; /**< Intra cost */ + int64_t inter_cost; /**< Inter cost */ + int16_t mv_r; /**< Motion vector row */ + int16_t mv_c; /**< Motion vector col */ + int64_t recrf_rate; /**< Rate from reconstructed ref frame */ + int64_t recrf_dist; /**< Distortion from reconstructed ref frame */ + int ref_frame_index; /**< Ref frame index */ +} TplBlockStats; + +/*!\brief Temporal dependency model stats for each frame before propagation */ +typedef struct TplFrameStats { + int frame_width; /**< Frame width */ + int frame_height; /**< Frame height */ + // Size of the list can be calculated from frame_width and frame_height. + TplBlockStats *block_stats_list; /**< List of tpl stats for each block */ +} TplFrameStats; + /*!\brief Encoded Frame Flags * * This type indicates a bitfield to be passed to vpx_codec_encode(), defining From 33aba6ecc16c71b6f380cfd3d24eab11690deb99 Mon Sep 17 00:00:00 2001 From: James Zern Date: Thu, 20 Apr 2023 18:15:23 -0700 Subject: [PATCH 0766/1000] configure: add aarch64 to ARCH_LIST This will allow identifying Windows Visual Studio targets as aarch64; the Microsoft compiler does not define __aarch64__. An alternative would be to define this in the code, checking for _M_ARM64 or _M_ARM64EC. For now we'll use the existing VPX_ARCH_* system. For compatibility VPX_ARCH_ARM will continue to be defined to 1 in this case. Bug: webm:1788 Bug: b/277255076 Change-Id: I12e25710891e86f0c7339ba96884c18ed90ba16f --- build/make/configure.sh | 4 ++++ configure | 1 + 2 files changed, 5 insertions(+) diff --git a/build/make/configure.sh b/build/make/configure.sh index 32105651ff..ec9af5e63d 100644 --- a/build/make/configure.sh +++ b/build/make/configure.sh @@ -842,6 +842,10 @@ process_common_toolchain() { # Enable the architecture family case ${tgt_isa} in + arm64 | armv8) + enable_feature arm + enable_feature aarch64 + ;; arm*) enable_feature arm ;; diff --git a/configure b/configure index 890ad3968a..20707727ef 100755 --- a/configure +++ b/configure @@ -243,6 +243,7 @@ CODEC_FAMILIES=" ARCH_LIST=" arm + aarch64 mips x86 x86_64 From 57b9afa58f849a8165ce3132c21087ae451d862c Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 2 May 2023 18:37:59 -0700 Subject: [PATCH 0767/1000] s/__aarch64__/VPX_ARCH_AARCH64/ This allows AArch64 to be correctly detected when building with Visual Studio (cl.exe) and fixes a crash in vp9_diamond_search_sad_neon.c. There are still test failures, however. Microsoft's compiler doesn't define __ARM_FEATURE_*. To use those paths we may need to rely on _M_ARM64_EXTENSION. Bug: webm:1788 Bug: b/277255076 Change-Id: I4d26f5f84dbd0cbcd1cdf0d7d932ebcf109febe5 --- vp8/encoder/arm/neon/fastquantizeb_neon.c | 8 ++--- vp9/encoder/arm/neon/vp9_denoiser_neon.c | 2 +- .../arm/neon/vp9_diamond_search_sad_neon.c | 14 ++++---- vp9/encoder/arm/neon/vp9_quantize_neon.c | 6 ++-- vpx_dsp/arm/avg_neon.c | 2 +- vpx_dsp/arm/highbd_avg_neon.c | 2 +- vpx_dsp/arm/highbd_quantize_neon.c | 8 ++--- vpx_dsp/arm/quantize_neon.c | 8 ++--- vpx_dsp/arm/sum_neon.h | 34 +++++++++---------- vpx_dsp/arm/transpose_neon.h | 10 +++--- vpx_dsp/arm/vpx_convolve8_neon.c | 6 ++-- vpx_dsp/arm/vpx_convolve8_neon.h | 8 ++--- 12 files changed, 54 insertions(+), 54 deletions(-) diff --git a/vp8/encoder/arm/neon/fastquantizeb_neon.c b/vp8/encoder/arm/neon/fastquantizeb_neon.c index 6fc60805f6..950c943343 100644 --- a/vp8/encoder/arm/neon/fastquantizeb_neon.c +++ b/vp8/encoder/arm/neon/fastquantizeb_neon.c @@ -28,11 +28,11 @@ void vp8_fast_quantize_b_neon(BLOCK *b, BLOCKD *d) { zig_zag1 = vld1q_u16(inv_zig_zag + 8); int16x8_t x0, x1, sz0, sz1, y0, y1; uint16x8_t eob0, eob1; -#ifndef __aarch64__ +#if !VPX_ARCH_AARCH64 uint16x4_t eob_d16; uint32x2_t eob_d32; uint32x4_t eob_q32; -#endif // __arch64__ +#endif // !VPX_ARCH_AARCH64 /* sign of z: z >> 15 */ sz0 = vshrq_n_s16(z0, 15); @@ -70,7 +70,7 @@ void vp8_fast_quantize_b_neon(BLOCK *b, BLOCKD *d) { /* select the largest value */ eob0 = vmaxq_u16(eob0, eob1); -#ifdef __aarch64__ +#if VPX_ARCH_AARCH64 *d->eob = (int8_t)vmaxvq_u16(eob0); #else eob_d16 = vmax_u16(vget_low_u16(eob0), vget_high_u16(eob0)); @@ -79,7 +79,7 @@ void vp8_fast_quantize_b_neon(BLOCK *b, BLOCKD *d) { eob_d32 = vpmax_u32(eob_d32, eob_d32); vst1_lane_s8((int8_t *)d->eob, vreinterpret_s8_u32(eob_d32), 0); -#endif // __aarch64__ +#endif // VPX_ARCH_AARCH64 /* qcoeff = x */ vst1q_s16(d->qcoeff, x0); diff --git a/vp9/encoder/arm/neon/vp9_denoiser_neon.c b/vp9/encoder/arm/neon/vp9_denoiser_neon.c index 53e8c7e498..d631cd437d 100644 --- a/vp9/encoder/arm/neon/vp9_denoiser_neon.c +++ b/vp9/encoder/arm/neon/vp9_denoiser_neon.c @@ -21,7 +21,7 @@ // Compute the sum of all pixel differences of this MB. static INLINE int horizontal_add_s8x16(const int8x16_t v_sum_diff_total) { -#if defined(__aarch64__) +#if VPX_ARCH_AARCH64 return vaddlvq_s8(v_sum_diff_total); #else const int16x8_t fe_dc_ba_98_76_54_32_10 = vpaddlq_s8(v_sum_diff_total); diff --git a/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c b/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c index 255e6fbc4a..b82b3f9db5 100644 --- a/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c +++ b/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c @@ -94,7 +94,7 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x, // Work out the start point for the search const uint8_t *best_address = in_what; const uint8_t *new_best_address = best_address; -#if defined(__aarch64__) +#if VPX_ARCH_AARCH64 int64x2_t v_ba_q = vdupq_n_s64((intptr_t)best_address); #else int32x4_t v_ba_d = vdupq_n_s32((intptr_t)best_address); @@ -117,7 +117,7 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x, int8x16_t v_inside_d; uint32x4_t v_outside_d; int32x4_t v_cost_d, v_sad_d; -#if defined(__aarch64__) +#if VPX_ARCH_AARCH64 int64x2_t v_blocka[2]; #else int32x4_t v_blocka[1]; @@ -138,7 +138,7 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x, vreinterpretq_s32_s16(v_these_mv_w))); // If none of them are inside, then move on -#if defined(__aarch64__) +#if VPX_ARCH_AARCH64 horiz_max = vmaxvq_u32(vreinterpretq_u32_s8(v_inside_d)); #else horiz_max_0 = vmax_u32(vget_low_u32(vreinterpretq_u32_s8(v_inside_d)), @@ -167,7 +167,7 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x, // Compute the SIMD pointer offsets. { -#if defined(__aarch64__) // sizeof(intptr_t) == 8 +#if VPX_ARCH_AARCH64 // sizeof(intptr_t) == 8 // Load the offsets int64x2_t v_bo10_q = vld1q_s64((const int64_t *)&ss_os[i + 0]); int64x2_t v_bo32_q = vld1q_s64((const int64_t *)&ss_os[i + 2]); @@ -234,7 +234,7 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x, // Find the minimum value and index horizontally in v_sad_d { uint32_t local_best_sad; -#if defined(__aarch64__) +#if VPX_ARCH_AARCH64 local_best_sad = vminvq_u32(vreinterpretq_u32_s32(v_sad_d)); #else uint32x2_t horiz_min_0 = @@ -256,7 +256,7 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x, uint32x4_t v_mask_d = vandq_u32(v_sel_d, v_idx_d); v_mask_d = vbslq_u32(v_sel_d, v_mask_d, vdupq_n_u32(0xffffffff)); -#if defined(__aarch64__) +#if VPX_ARCH_AARCH64 local_best_idx = vminvq_u32(v_mask_d); #else horiz_min_0 = @@ -280,7 +280,7 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x, best_address = new_best_address; v_bmv_w = vreinterpretq_s16_s32(vdupq_n_s32(bmv.as_int)); -#if defined(__aarch64__) +#if VPX_ARCH_AARCH64 v_ba_q = vdupq_n_s64((intptr_t)best_address); #else v_ba_d = vdupq_n_s32((intptr_t)best_address); diff --git a/vp9/encoder/arm/neon/vp9_quantize_neon.c b/vp9/encoder/arm/neon/vp9_quantize_neon.c index c2b55fcbaa..97ab13628e 100644 --- a/vp9/encoder/arm/neon/vp9_quantize_neon.c +++ b/vp9/encoder/arm/neon/vp9_quantize_neon.c @@ -50,7 +50,7 @@ static VPX_FORCE_INLINE int16x8_t get_max_lane_eob(const int16_t *iscan_ptr, } static VPX_FORCE_INLINE uint16_t get_max_eob(int16x8_t v_eobmax) { -#ifdef __aarch64__ +#if VPX_ARCH_AARCH64 return (uint16_t)vmaxvq_s16(v_eobmax); #else const int16x4_t v_eobmax_3210 = @@ -65,7 +65,7 @@ static VPX_FORCE_INLINE uint16_t get_max_eob(int16x8_t v_eobmax) { vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3)); return (uint16_t)vget_lane_s16(v_eobmax_final, 0); -#endif // __aarch64__ +#endif // VPX_ARCH_AARCH64 } static VPX_FORCE_INLINE void load_fp_values(const int16_t *round_ptr, @@ -81,7 +81,7 @@ static VPX_FORCE_INLINE void load_fp_values(const int16_t *round_ptr, static VPX_FORCE_INLINE void update_fp_values(int16x8_t *v_round, int16x8_t *v_quant, int16x8_t *v_dequant) { -#ifdef __aarch64__ +#if VPX_ARCH_AARCH64 *v_round = vdupq_laneq_s16(*v_round, 1); *v_quant = vdupq_laneq_s16(*v_quant, 1); *v_dequant = vdupq_laneq_s16(*v_dequant, 1); diff --git a/vpx_dsp/arm/avg_neon.c b/vpx_dsp/arm/avg_neon.c index d48115dd01..8c61fc26f4 100644 --- a/vpx_dsp/arm/avg_neon.c +++ b/vpx_dsp/arm/avg_neon.c @@ -210,7 +210,7 @@ void vpx_minmax_8x8_neon(const uint8_t *a, int a_stride, const uint8_t *b, const uint8x16_t ab07_max = vmaxq_u8(ab0123_max, ab4567_max); const uint8x16_t ab07_min = vminq_u8(ab0123_min, ab4567_min); -#if defined(__aarch64__) +#if VPX_ARCH_AARCH64 *min = *max = 0; // Clear high bits *((uint8_t *)max) = vmaxvq_u8(ab07_max); *((uint8_t *)min) = vminvq_u8(ab07_min); diff --git a/vpx_dsp/arm/highbd_avg_neon.c b/vpx_dsp/arm/highbd_avg_neon.c index fc10197d71..8939ee131e 100644 --- a/vpx_dsp/arm/highbd_avg_neon.c +++ b/vpx_dsp/arm/highbd_avg_neon.c @@ -114,7 +114,7 @@ void vpx_highbd_minmax_8x8_neon(const uint8_t *a, int a_stride, const uint16x8_t min4567 = vminq_u16(min45, min67); const uint16x8_t min07 = vminq_u16(min0123, min4567); -#if defined(__aarch64__) +#if VPX_ARCH_AARCH64 *min = *max = 0; // Clear high bits *((uint16_t *)max) = vmaxvq_u16(max07); *((uint16_t *)min) = vminvq_u16(min07); diff --git a/vpx_dsp/arm/highbd_quantize_neon.c b/vpx_dsp/arm/highbd_quantize_neon.c index 526447acf5..d2a7add60d 100644 --- a/vpx_dsp/arm/highbd_quantize_neon.c +++ b/vpx_dsp/arm/highbd_quantize_neon.c @@ -166,7 +166,7 @@ void vpx_highbd_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, } while (n_coeffs > 0); } -#ifdef __aarch64__ +#if VPX_ARCH_AARCH64 *eob_ptr = vmaxvq_u16(eob_max); #else { @@ -176,7 +176,7 @@ void vpx_highbd_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1); vst1_lane_u16(eob_ptr, eob_max_2, 0); } -#endif // __aarch64__ +#endif // VPX_ARCH_AARCH64 // Need these here, else the compiler complains about mixing declarations and // code in C90 (void)n_coeffs; @@ -291,7 +291,7 @@ void vpx_highbd_quantize_b_32x32_neon( } } -#ifdef __aarch64__ +#if VPX_ARCH_AARCH64 *eob_ptr = vmaxvq_u16(eob_max); #else { @@ -301,5 +301,5 @@ void vpx_highbd_quantize_b_32x32_neon( const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1); vst1_lane_u16(eob_ptr, eob_max_2, 0); } -#endif // __aarch64__ +#endif // VPX_ARCH_AARCH64 } diff --git a/vpx_dsp/arm/quantize_neon.c b/vpx_dsp/arm/quantize_neon.c index cc8f623744..35c67f6075 100644 --- a/vpx_dsp/arm/quantize_neon.c +++ b/vpx_dsp/arm/quantize_neon.c @@ -134,7 +134,7 @@ void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, } while (n_coeffs > 0); } -#ifdef __aarch64__ +#if VPX_ARCH_AARCH64 *eob_ptr = vmaxvq_u16(eob_max); #else { @@ -144,7 +144,7 @@ void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1); vst1_lane_u16(eob_ptr, eob_max_2, 0); } -#endif // __aarch64__ +#endif // VPX_ARCH_AARCH64 // Need these here, else the compiler complains about mixing declarations and // code in C90 (void)scan; @@ -276,7 +276,7 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, } } -#ifdef __aarch64__ +#if VPX_ARCH_AARCH64 *eob_ptr = vmaxvq_u16(eob_max); #else { @@ -286,5 +286,5 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1); vst1_lane_u16(eob_ptr, eob_max_2, 0); } -#endif // __aarch64__ +#endif // VPX_ARCH_AARCH64 } diff --git a/vpx_dsp/arm/sum_neon.h b/vpx_dsp/arm/sum_neon.h index a0c72f92ce..48a2fc05ca 100644 --- a/vpx_dsp/arm/sum_neon.h +++ b/vpx_dsp/arm/sum_neon.h @@ -17,7 +17,7 @@ #include "vpx/vpx_integer.h" static INLINE uint16_t horizontal_add_uint8x4(const uint8x8_t a) { -#if defined(__aarch64__) +#if VPX_ARCH_AARCH64 return vaddlv_u8(a); #else const uint16x4_t b = vpaddl_u8(a); @@ -27,7 +27,7 @@ static INLINE uint16_t horizontal_add_uint8x4(const uint8x8_t a) { } static INLINE uint16_t horizontal_add_uint8x8(const uint8x8_t a) { -#if defined(__aarch64__) +#if VPX_ARCH_AARCH64 return vaddlv_u8(a); #else const uint16x4_t b = vpaddl_u8(a); @@ -38,7 +38,7 @@ static INLINE uint16_t horizontal_add_uint8x8(const uint8x8_t a) { } static INLINE uint16_t horizontal_add_uint8x16(const uint8x16_t a) { -#if defined(__aarch64__) +#if VPX_ARCH_AARCH64 return vaddlvq_u8(a); #else const uint16x8_t b = vpaddlq_u8(a); @@ -50,7 +50,7 @@ static INLINE uint16_t horizontal_add_uint8x16(const uint8x16_t a) { } static INLINE uint16_t horizontal_add_uint16x4(const uint16x4_t a) { -#if defined(__aarch64__) +#if VPX_ARCH_AARCH64 return vaddv_u16(a); #else const uint16x4_t b = vpadd_u16(a, a); @@ -60,7 +60,7 @@ static INLINE uint16_t horizontal_add_uint16x4(const uint16x4_t a) { } static INLINE int32_t horizontal_add_int16x8(const int16x8_t a) { -#if defined(__aarch64__) +#if VPX_ARCH_AARCH64 return vaddlvq_s16(a); #else const int32x4_t b = vpaddlq_s16(a); @@ -72,7 +72,7 @@ static INLINE int32_t horizontal_add_int16x8(const int16x8_t a) { } static INLINE uint32_t horizontal_add_uint16x8(const uint16x8_t a) { -#if defined(__aarch64__) +#if VPX_ARCH_AARCH64 return vaddlvq_u16(a); #else const uint32x4_t b = vpaddlq_u16(a); @@ -84,7 +84,7 @@ static INLINE uint32_t horizontal_add_uint16x8(const uint16x8_t a) { } static INLINE uint32x4_t horizontal_add_4d_uint16x8(const uint16x8_t sum[4]) { -#if defined(__aarch64__) +#if VPX_ARCH_AARCH64 const uint16x8_t a0 = vpaddq_u16(sum[0], sum[1]); const uint16x8_t a1 = vpaddq_u16(sum[2], sum[3]); const uint16x8_t b0 = vpaddq_u16(a0, a1); @@ -102,7 +102,7 @@ static INLINE uint32x4_t horizontal_add_4d_uint16x8(const uint16x8_t sum[4]) { static INLINE uint32_t horizontal_long_add_uint16x8(const uint16x8_t vec_lo, const uint16x8_t vec_hi) { -#if defined(__aarch64__) +#if VPX_ARCH_AARCH64 return vaddlvq_u16(vec_lo) + vaddlvq_u16(vec_hi); #else const uint32x4_t vec_l_lo = @@ -127,7 +127,7 @@ static INLINE uint32x4_t horizontal_long_add_4d_uint16x8( const uint32x4_t b1 = vpadalq_u16(a1, sum_hi[1]); const uint32x4_t b2 = vpadalq_u16(a2, sum_hi[2]); const uint32x4_t b3 = vpadalq_u16(a3, sum_hi[3]); -#if defined(__aarch64__) +#if VPX_ARCH_AARCH64 const uint32x4_t c0 = vpaddq_u32(b0, b1); const uint32x4_t c1 = vpaddq_u32(b2, b3); return vpaddq_u32(c0, c1); @@ -143,7 +143,7 @@ static INLINE uint32x4_t horizontal_long_add_4d_uint16x8( } static INLINE int32_t horizontal_add_int32x2(const int32x2_t a) { -#if defined(__aarch64__) +#if VPX_ARCH_AARCH64 return vaddv_s32(a); #else return vget_lane_s32(a, 0) + vget_lane_s32(a, 1); @@ -151,7 +151,7 @@ static INLINE int32_t horizontal_add_int32x2(const int32x2_t a) { } static INLINE uint32_t horizontal_add_uint32x2(const uint32x2_t a) { -#if defined(__aarch64__) +#if VPX_ARCH_AARCH64 return vaddv_u32(a); #else return vget_lane_u32(a, 0) + vget_lane_u32(a, 1); @@ -159,7 +159,7 @@ static INLINE uint32_t horizontal_add_uint32x2(const uint32x2_t a) { } static INLINE int32_t horizontal_add_int32x4(const int32x4_t a) { -#if defined(__aarch64__) +#if VPX_ARCH_AARCH64 return vaddvq_s32(a); #else const int64x2_t b = vpaddlq_s32(a); @@ -170,7 +170,7 @@ static INLINE int32_t horizontal_add_int32x4(const int32x4_t a) { } static INLINE uint32_t horizontal_add_uint32x4(const uint32x4_t a) { -#if defined(__aarch64__) +#if VPX_ARCH_AARCH64 return vaddvq_u32(a); #else const uint64x2_t b = vpaddlq_u32(a); @@ -181,7 +181,7 @@ static INLINE uint32_t horizontal_add_uint32x4(const uint32x4_t a) { } static INLINE uint32x4_t horizontal_add_4d_uint32x4(const uint32x4_t sum[4]) { -#if defined(__aarch64__) +#if VPX_ARCH_AARCH64 uint32x4_t res01 = vpaddq_u32(sum[0], sum[1]); uint32x4_t res23 = vpaddq_u32(sum[2], sum[3]); return vpaddq_u32(res01, res23); @@ -196,7 +196,7 @@ static INLINE uint32x4_t horizontal_add_4d_uint32x4(const uint32x4_t sum[4]) { } static INLINE uint64_t horizontal_long_add_uint32x4(const uint32x4_t a) { -#if defined(__aarch64__) +#if VPX_ARCH_AARCH64 return vaddlvq_u32(a); #else const uint64x2_t b = vpaddlq_u32(a); @@ -205,7 +205,7 @@ static INLINE uint64_t horizontal_long_add_uint32x4(const uint32x4_t a) { } static INLINE int64_t horizontal_add_int64x2(const int64x2_t a) { -#if defined(__aarch64__) +#if VPX_ARCH_AARCH64 return vaddvq_s64(a); #else return vgetq_lane_s64(a, 0) + vgetq_lane_s64(a, 1); @@ -213,7 +213,7 @@ static INLINE int64_t horizontal_add_int64x2(const int64x2_t a) { } static INLINE uint64_t horizontal_add_uint64x2(const uint64x2_t a) { -#if defined(__aarch64__) +#if VPX_ARCH_AARCH64 return vaddvq_u64(a); #else return vgetq_lane_u64(a, 0) + vgetq_lane_u64(a, 1); diff --git a/vpx_dsp/arm/transpose_neon.h b/vpx_dsp/arm/transpose_neon.h index 518278f303..74f85a6bb6 100644 --- a/vpx_dsp/arm/transpose_neon.h +++ b/vpx_dsp/arm/transpose_neon.h @@ -23,7 +23,7 @@ // b0.val[1]: 04 05 06 07 20 21 22 23 static INLINE int16x8x2_t vpx_vtrnq_s64_to_s16(int32x4_t a0, int32x4_t a1) { int16x8x2_t b0; -#if defined(__aarch64__) +#if VPX_ARCH_AARCH64 b0.val[0] = vreinterpretq_s16_s64( vtrn1q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1))); b0.val[1] = vreinterpretq_s16_s64( @@ -39,7 +39,7 @@ static INLINE int16x8x2_t vpx_vtrnq_s64_to_s16(int32x4_t a0, int32x4_t a1) { static INLINE int32x4x2_t vpx_vtrnq_s64_to_s32(int32x4_t a0, int32x4_t a1) { int32x4x2_t b0; -#if defined(__aarch64__) +#if VPX_ARCH_AARCH64 b0.val[0] = vreinterpretq_s32_s64( vtrn1q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1))); b0.val[1] = vreinterpretq_s32_s64( @@ -53,7 +53,7 @@ static INLINE int32x4x2_t vpx_vtrnq_s64_to_s32(int32x4_t a0, int32x4_t a1) { static INLINE int64x2x2_t vpx_vtrnq_s64(int32x4_t a0, int32x4_t a1) { int64x2x2_t b0; -#if defined(__aarch64__) +#if VPX_ARCH_AARCH64 b0.val[0] = vtrn1q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1)); b0.val[1] = vtrn2q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1)); #else @@ -67,7 +67,7 @@ static INLINE int64x2x2_t vpx_vtrnq_s64(int32x4_t a0, int32x4_t a1) { static INLINE uint8x16x2_t vpx_vtrnq_u64_to_u8(uint32x4_t a0, uint32x4_t a1) { uint8x16x2_t b0; -#if defined(__aarch64__) +#if VPX_ARCH_AARCH64 b0.val[0] = vreinterpretq_u8_u64( vtrn1q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1))); b0.val[1] = vreinterpretq_u8_u64( @@ -83,7 +83,7 @@ static INLINE uint8x16x2_t vpx_vtrnq_u64_to_u8(uint32x4_t a0, uint32x4_t a1) { static INLINE uint16x8x2_t vpx_vtrnq_u64_to_u16(uint32x4_t a0, uint32x4_t a1) { uint16x8x2_t b0; -#if defined(__aarch64__) +#if VPX_ARCH_AARCH64 b0.val[0] = vreinterpretq_u16_u64( vtrn1q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1))); b0.val[1] = vreinterpretq_u16_u64( diff --git a/vpx_dsp/arm/vpx_convolve8_neon.c b/vpx_dsp/arm/vpx_convolve8_neon.c index b4cdd58c70..b312cc747c 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon.c +++ b/vpx_dsp/arm/vpx_convolve8_neon.c @@ -31,7 +31,7 @@ // instructions. This optimization is much faster in speed unit test, but slowed // down the whole decoder by 5%. -#if defined(__aarch64__) && \ +#if VPX_ARCH_AARCH64 && \ (defined(__ARM_FEATURE_DOTPROD) || defined(__ARM_FEATURE_MATMUL_INT8)) DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = { @@ -1261,7 +1261,7 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, #endif // defined(__ARM_FEATURE_MATMUL_INT8) -#else // !(defined(__aarch64__) && +#else // !(VPX_ARCH_AARCH64 && // (defined(__ARM_FEATURE_DOTPROD) || // defined(__ARM_FEATURE_MATMUL_INT8))) @@ -2105,6 +2105,6 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, } } -#endif // #if defined(__aarch64__) && +#endif // #if VPX_ARCH_AARCH64 && // (defined(__ARM_FEATURE_DOTPROD) || // defined(__ARM_FEATURE_MATMUL_INT8)) diff --git a/vpx_dsp/arm/vpx_convolve8_neon.h b/vpx_dsp/arm/vpx_convolve8_neon.h index ed7f180538..07cf8242d3 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon.h +++ b/vpx_dsp/arm/vpx_convolve8_neon.h @@ -16,7 +16,7 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" -#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) +#if VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD) static INLINE int32x4_t convolve8_4_sdot_partial(const int8x16_t samples_lo, const int8x16_t samples_hi, @@ -114,9 +114,9 @@ static INLINE uint8x8_t convolve8_8_sdot(uint8x16_t samples, return vqrshrun_n_s16(sum, 7); } -#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) +#endif // VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD) -#if defined(__aarch64__) && defined(__ARM_FEATURE_MATMUL_INT8) +#if VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_MATMUL_INT8) static INLINE int32x4_t convolve8_4_usdot_partial(const uint8x16_t samples_lo, const uint8x16_t samples_hi, @@ -199,7 +199,7 @@ static INLINE uint8x8_t convolve8_8_usdot(uint8x16_t samples, return vqrshrun_n_s16(sum, 7); } -#endif // defined(__aarch64__) && defined(__ARM_FEATURE_MATMUL_INT8) +#endif // VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_MATMUL_INT8) static INLINE int16x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, const int16x4_t s3, From a398b60d6c0253a99711b8496d691cdbdf635b2c Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 3 May 2023 10:09:03 -0700 Subject: [PATCH 0768/1000] fdct8x8_test: EXPECT_* -> ASSERT_* This avoids unnecessary logging when a block has multiple errors. Change-Id: If0f3e6f8ff5bd284655f7cabfd23c253c93d44c5 --- test/fdct8x8_test.cc | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/test/fdct8x8_test.cc b/test/fdct8x8_test.cc index 83d1ff1429..fcc84690a0 100644 --- a/test/fdct8x8_test.cc +++ b/test/fdct8x8_test.cc @@ -170,7 +170,7 @@ class FwdTrans8x8TestBase { for (int j = 0; j < 64; ++j) { const int diff = abs(count_sign_block[j][0] - count_sign_block[j][1]); const int max_diff = kSignBiasMaxDiff255; - EXPECT_LT(diff, max_diff << (bit_depth_ - 8)) + ASSERT_LT(diff, max_diff << (bit_depth_ - 8)) << "Error: 8x8 FDCT/FHT has a sign bias > " << 1. * max_diff / count_test_block * 100 << "%" << " for input range [-255, 255] at index " << j @@ -201,7 +201,7 @@ class FwdTrans8x8TestBase { for (int j = 0; j < 64; ++j) { const int diff = abs(count_sign_block[j][0] - count_sign_block[j][1]); const int max_diff = kSignBiasMaxDiff15; - EXPECT_LT(diff, max_diff << (bit_depth_ - 8)) + ASSERT_LT(diff, max_diff << (bit_depth_ - 8)) << "Error: 8x8 FDCT/FHT has a sign bias > " << 1. * max_diff / count_test_block * 100 << "%" << " for input range [-15, 15] at index " << j @@ -275,11 +275,11 @@ class FwdTrans8x8TestBase { } } - EXPECT_GE(1 << 2 * (bit_depth_ - 8), max_error) + ASSERT_GE(1 << 2 * (bit_depth_ - 8), max_error) << "Error: 8x8 FDCT/IDCT or FHT/IHT has an individual" << " roundtrip error > 1"; - EXPECT_GE((count_test_block << 2 * (bit_depth_ - 8)) / 5, total_error) + ASSERT_GE((count_test_block << 2 * (bit_depth_ - 8)) / 5, total_error) << "Error: 8x8 FDCT/IDCT or FHT/IHT has average roundtrip " << "error > 1/5 per block"; } @@ -360,17 +360,17 @@ class FwdTrans8x8TestBase { total_coeff_error += abs(coeff_diff); } - EXPECT_GE(1 << 2 * (bit_depth_ - 8), max_error) + ASSERT_GE(1 << 2 * (bit_depth_ - 8), max_error) << "Error: Extremal 8x8 FDCT/IDCT or FHT/IHT has" - << "an individual roundtrip error > 1"; + << " an individual roundtrip error > 1"; - EXPECT_GE((count_test_block << 2 * (bit_depth_ - 8)) / 5, total_error) + ASSERT_GE((count_test_block << 2 * (bit_depth_ - 8)) / 5, total_error) << "Error: Extremal 8x8 FDCT/IDCT or FHT/IHT has average" << " roundtrip error > 1/5 per block"; - EXPECT_EQ(0, total_coeff_error) + ASSERT_EQ(0, total_coeff_error) << "Error: Extremal 8x8 FDCT/FHT has" - << "overflow issues in the intermediate steps > 1"; + << " overflow issues in the intermediate steps > 1"; } } @@ -426,7 +426,7 @@ class FwdTrans8x8TestBase { const int diff = dst[j] - src[j]; #endif const uint32_t error = diff * diff; - EXPECT_GE(1u << 2 * (bit_depth_ - 8), error) + ASSERT_GE(1u << 2 * (bit_depth_ - 8), error) << "Error: 8x8 IDCT has error " << error << " at index " << j; } } @@ -456,7 +456,7 @@ class FwdTrans8x8TestBase { for (int j = 0; j < kNumCoeffs; ++j) { const int32_t diff = coeff[j] - coeff_r[j]; const uint32_t error = diff * diff; - EXPECT_GE(9u << 2 * (bit_depth_ - 8), error) + ASSERT_GE(9u << 2 * (bit_depth_ - 8), error) << "Error: 8x8 DCT has error " << error << " at index " << j; } } @@ -512,7 +512,7 @@ class FwdTrans8x8TestBase { const int diff = dst[j] - ref[j]; #endif const uint32_t error = diff * diff; - EXPECT_EQ(0u, error) + ASSERT_EQ(0u, error) << "Error: 8x8 IDCT has error " << error << " at index " << j; } } From 3dbadd1b83ce83b804f7dc2034fb51870365e337 Mon Sep 17 00:00:00 2001 From: chiyotsai Date: Wed, 3 May 2023 13:28:41 -0700 Subject: [PATCH 0769/1000] Fix clang warning on const-qualification of parameters Change-Id: I900a0a48dde5fcb262157b191ac536e18269feb3 --- test/vp9_quantize_test.cc | 4 ++-- vpx_dsp/arm/quantize_neon.c | 4 ++-- vpx_dsp/quantize.c | 4 ++-- vpx_dsp/vpx_dsp_rtcd_defs.pl | 4 ++-- vpx_dsp/x86/quantize_avx.c | 4 ++-- vpx_dsp/x86/quantize_avx2.c | 4 ++-- vpx_dsp/x86/quantize_ssse3.c | 4 ++-- 7 files changed, 14 insertions(+), 14 deletions(-) diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc index 84a5a58e4e..5e3a7c2701 100644 --- a/test/vp9_quantize_test.cc +++ b/test/vp9_quantize_test.cc @@ -39,10 +39,10 @@ namespace { const int number_of_iterations = 100; typedef void (*QuantizeFunc)(const tran_low_t *coeff, intptr_t count, - const macroblock_plane *const mb_plane, + const macroblock_plane *mb_plane, tran_low_t *qcoeff, tran_low_t *dqcoeff, const int16_t *dequant, uint16_t *eob, - const struct ScanOrder *const scan_order); + const struct ScanOrder *scan_order); typedef std::tuple QuantizeParam; diff --git a/vpx_dsp/arm/quantize_neon.c b/vpx_dsp/arm/quantize_neon.c index cc8f623744..7232b81703 100644 --- a/vpx_dsp/arm/quantize_neon.c +++ b/vpx_dsp/arm/quantize_neon.c @@ -216,10 +216,10 @@ quantize_b_32x32_neon(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, // Main difference is that zbin values are halved before comparison and dqcoeff // values are divided by 2. zbin is rounded but dqcoeff is not. void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, - const struct macroblock_plane *const mb_plane, + const struct macroblock_plane *mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const struct ScanOrder *const scan_order) { + const struct ScanOrder *scan_order) { const int16x8_t neg_one = vdupq_n_s16(-1); uint16x8_t eob_max; int i; diff --git a/vpx_dsp/quantize.c b/vpx_dsp/quantize.c index d44ced20dc..7dff8c7a87 100644 --- a/vpx_dsp/quantize.c +++ b/vpx_dsp/quantize.c @@ -211,10 +211,10 @@ void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, #endif void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, - const struct macroblock_plane *const mb_plane, + const struct macroblock_plane *mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const struct ScanOrder *const scan_order) { + const struct ScanOrder *scan_order) { const int n_coeffs = 32 * 32; const int zbins[2] = { ROUND_POWER_OF_TWO(mb_plane->zbin[0], 1), ROUND_POWER_OF_TWO(mb_plane->zbin[1], 1) }; diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index bde0115298..494d7ba5e2 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -725,14 +725,14 @@ () add_proto qw/void vpx_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vpx_quantize_b neon sse2 ssse3 avx avx2 vsx lsx/; - add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, const struct macroblock_plane * const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order"; + add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, const struct macroblock_plane * mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *scan_order"; specialize qw/vpx_quantize_b_32x32 neon ssse3 avx avx2 vsx lsx/; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vpx_highbd_quantize_b neon sse2 avx2/; - add_proto qw/void vpx_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, const struct macroblock_plane * const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order"; + add_proto qw/void vpx_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, const struct macroblock_plane * mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *scan_order"; specialize qw/vpx_highbd_quantize_b_32x32 neon sse2 avx2/; } # CONFIG_VP9_HIGHBITDEPTH } # CONFIG_VP9_ENCODER diff --git a/vpx_dsp/x86/quantize_avx.c b/vpx_dsp/x86/quantize_avx.c index d289bf6ebf..6837a5cf28 100644 --- a/vpx_dsp/x86/quantize_avx.c +++ b/vpx_dsp/x86/quantize_avx.c @@ -143,10 +143,10 @@ void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, } void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, - const struct macroblock_plane *const mb_plane, + const struct macroblock_plane *mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const struct ScanOrder *const scan_order) { + const struct ScanOrder *scan_order) { const __m128i zero = _mm_setzero_si128(); const __m256i big_zero = _mm256_setzero_si256(); int index; diff --git a/vpx_dsp/x86/quantize_avx2.c b/vpx_dsp/x86/quantize_avx2.c index 5421dcf0ba..3d97b3fdae 100644 --- a/vpx_dsp/x86/quantize_avx2.c +++ b/vpx_dsp/x86/quantize_avx2.c @@ -253,10 +253,10 @@ static VPX_FORCE_INLINE __m256i quantize_b_32x32_16( } void vpx_quantize_b_32x32_avx2(const tran_low_t *coeff_ptr, - const struct macroblock_plane *const mb_plane, + const struct macroblock_plane *mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const struct ScanOrder *const scan_order) { + const struct ScanOrder *scan_order) { __m256i v_zbin, v_round, v_quant, v_dequant, v_quant_shift; __m256i v_eobmax = _mm256_setzero_si256(); intptr_t count; diff --git a/vpx_dsp/x86/quantize_ssse3.c b/vpx_dsp/x86/quantize_ssse3.c index 556f4ca617..641f23298b 100644 --- a/vpx_dsp/x86/quantize_ssse3.c +++ b/vpx_dsp/x86/quantize_ssse3.c @@ -110,10 +110,10 @@ void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, } void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, - const struct macroblock_plane *const mb_plane, + const struct macroblock_plane *mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const struct ScanOrder *const scan_order) { + const struct ScanOrder *scan_order) { const __m128i zero = _mm_setzero_si128(); int index; const int16_t *iscan = scan_order->iscan; From 8782fd070df84baaa1a7eafc2848de5ddd513078 Mon Sep 17 00:00:00 2001 From: chiyotsai Date: Wed, 3 May 2023 13:34:41 -0700 Subject: [PATCH 0770/1000] Fix mismatched param names in vpx_dsp/arm/highbd_avg_neon.c Change-Id: Ibf00a6e1029284e637b10ef01ac9b31ffadc74ca --- vpx_dsp/arm/highbd_avg_neon.c | 59 +++++++++++++++++------------------ 1 file changed, 29 insertions(+), 30 deletions(-) diff --git a/vpx_dsp/arm/highbd_avg_neon.c b/vpx_dsp/arm/highbd_avg_neon.c index fc10197d71..0abdc037be 100644 --- a/vpx_dsp/arm/highbd_avg_neon.c +++ b/vpx_dsp/arm/highbd_avg_neon.c @@ -16,18 +16,18 @@ #include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/sum_neon.h" -uint32_t vpx_highbd_avg_4x4_neon(const uint8_t *a, int a_stride) { - const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a); - const uint16x8_t a0 = load_unaligned_u16q(a_ptr + 0 * a_stride, a_stride); - const uint16x8_t a1 = load_unaligned_u16q(a_ptr + 2 * a_stride, a_stride); +uint32_t vpx_highbd_avg_4x4_neon(const uint8_t *s8, int p) { + const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(s8); + const uint16x8_t a0 = load_unaligned_u16q(a_ptr + 0 * p, p); + const uint16x8_t a1 = load_unaligned_u16q(a_ptr + 2 * p, p); return (horizontal_add_uint16x8(vaddq_u16(a0, a1)) + (1 << 3)) >> 4; } -uint32_t vpx_highbd_avg_8x8_neon(const uint8_t *a, int a_stride) { - const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a); +uint32_t vpx_highbd_avg_8x8_neon(const uint8_t *s8, int p) { + const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(s8); uint16x8_t sum, a0, a1, a2, a3, a4, a5, a6, a7; - load_u16_8x8(a_ptr, a_stride, &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7); + load_u16_8x8(a_ptr, p, &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7); sum = vaddq_u16(a0, a1); sum = vaddq_u16(sum, a2); @@ -63,29 +63,28 @@ int vpx_highbd_satd_neon(const tran_low_t *coeff, int length) { return (int)horizontal_add_int64x2(vaddq_s64(sum_s64[0], sum_s64[1])); } -void vpx_highbd_minmax_8x8_neon(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, int *min, - int *max) { - const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a); - const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b); - - const uint16x8_t a0 = vld1q_u16(a_ptr + 0 * a_stride); - const uint16x8_t a1 = vld1q_u16(a_ptr + 1 * a_stride); - const uint16x8_t a2 = vld1q_u16(a_ptr + 2 * a_stride); - const uint16x8_t a3 = vld1q_u16(a_ptr + 3 * a_stride); - const uint16x8_t a4 = vld1q_u16(a_ptr + 4 * a_stride); - const uint16x8_t a5 = vld1q_u16(a_ptr + 5 * a_stride); - const uint16x8_t a6 = vld1q_u16(a_ptr + 6 * a_stride); - const uint16x8_t a7 = vld1q_u16(a_ptr + 7 * a_stride); - - const uint16x8_t b0 = vld1q_u16(b_ptr + 0 * b_stride); - const uint16x8_t b1 = vld1q_u16(b_ptr + 1 * b_stride); - const uint16x8_t b2 = vld1q_u16(b_ptr + 2 * b_stride); - const uint16x8_t b3 = vld1q_u16(b_ptr + 3 * b_stride); - const uint16x8_t b4 = vld1q_u16(b_ptr + 4 * b_stride); - const uint16x8_t b5 = vld1q_u16(b_ptr + 5 * b_stride); - const uint16x8_t b6 = vld1q_u16(b_ptr + 6 * b_stride); - const uint16x8_t b7 = vld1q_u16(b_ptr + 7 * b_stride); +void vpx_highbd_minmax_8x8_neon(const uint8_t *s8, int p, const uint8_t *d8, + int dp, int *min, int *max) { + const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(s8); + const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(d8); + + const uint16x8_t a0 = vld1q_u16(a_ptr + 0 * p); + const uint16x8_t a1 = vld1q_u16(a_ptr + 1 * p); + const uint16x8_t a2 = vld1q_u16(a_ptr + 2 * p); + const uint16x8_t a3 = vld1q_u16(a_ptr + 3 * p); + const uint16x8_t a4 = vld1q_u16(a_ptr + 4 * p); + const uint16x8_t a5 = vld1q_u16(a_ptr + 5 * p); + const uint16x8_t a6 = vld1q_u16(a_ptr + 6 * p); + const uint16x8_t a7 = vld1q_u16(a_ptr + 7 * p); + + const uint16x8_t b0 = vld1q_u16(b_ptr + 0 * dp); + const uint16x8_t b1 = vld1q_u16(b_ptr + 1 * dp); + const uint16x8_t b2 = vld1q_u16(b_ptr + 2 * dp); + const uint16x8_t b3 = vld1q_u16(b_ptr + 3 * dp); + const uint16x8_t b4 = vld1q_u16(b_ptr + 4 * dp); + const uint16x8_t b5 = vld1q_u16(b_ptr + 5 * dp); + const uint16x8_t b6 = vld1q_u16(b_ptr + 6 * dp); + const uint16x8_t b7 = vld1q_u16(b_ptr + 7 * dp); const uint16x8_t abs_diff0 = vabdq_u16(a0, b0); const uint16x8_t abs_diff1 = vabdq_u16(a1, b1); From 701392c1b09cd6db8298520caa360f0f7235d85b Mon Sep 17 00:00:00 2001 From: chiyotsai Date: Wed, 3 May 2023 13:38:46 -0700 Subject: [PATCH 0771/1000] Fix mismatched param names in vpx_dsp/arm/sad4d_neon.c Change-Id: If621944684cf9bb9f353db5961ed8b4b4ae38f24 --- vpx_dsp/arm/sad4d_neon.c | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/vpx_dsp/arm/sad4d_neon.c b/vpx_dsp/arm/sad4d_neon.c index 44cd990280..3a548d0f9f 100644 --- a/vpx_dsp/arm/sad4d_neon.c +++ b/vpx_dsp/arm/sad4d_neon.c @@ -282,11 +282,12 @@ static INLINE void sad4xhx4d_neon(const uint8_t *src, int src_stride, vst1q_u32(res, horizontal_add_4d_uint16x8(sum)); } -#define SAD_WXH_4D_NEON(w, h) \ - void vpx_sad##w##x##h##x4d_neon(const uint8_t *src, int src_stride, \ - const uint8_t *const ref[4], int ref_stride, \ - uint32_t res[4]) { \ - sad##w##xhx4d_neon(src, src_stride, ref, ref_stride, res, (h)); \ +#define SAD_WXH_4D_NEON(w, h) \ + void vpx_sad##w##x##h##x4d_neon(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + sad##w##xhx4d_neon(src_ptr, src_stride, ref_array, ref_stride, sad_array, \ + (h)); \ } SAD_WXH_4D_NEON(4, 4) @@ -309,16 +310,17 @@ SAD_WXH_4D_NEON(64, 64) #undef SAD_WXH_4D_NEON -#define SAD_SKIP_WXH_4D_NEON(w, h) \ - void vpx_sad_skip_##w##x##h##x4d_neon(const uint8_t *src, int src_stride, \ - const uint8_t *const ref[4], \ - int ref_stride, uint32_t res[4]) { \ - sad##w##xhx4d_neon(src, 2 * src_stride, ref, 2 * ref_stride, res, \ - ((h) >> 1)); \ - res[0] <<= 1; \ - res[1] <<= 1; \ - res[2] <<= 1; \ - res[3] <<= 1; \ +#define SAD_SKIP_WXH_4D_NEON(w, h) \ + void vpx_sad_skip_##w##x##h##x4d_neon( \ + const uint8_t *src_ptr, int src_stride, \ + const uint8_t *const ref_array[4], int ref_stride, \ + uint32_t sad_array[4]) { \ + sad##w##xhx4d_neon(src_ptr, 2 * src_stride, ref_array, 2 * ref_stride, \ + sad_array, ((h) >> 1)); \ + sad_array[0] <<= 1; \ + sad_array[1] <<= 1; \ + sad_array[2] <<= 1; \ + sad_array[3] <<= 1; \ } SAD_SKIP_WXH_4D_NEON(4, 4) From 174e782fe5c0f4b01c161eeb0500c88d84a26c42 Mon Sep 17 00:00:00 2001 From: chiyotsai Date: Wed, 3 May 2023 14:44:08 -0700 Subject: [PATCH 0772/1000] Fix mismatched param names in vpx_dsp/arm/highbd_sad4d_neon.c Change-Id: Ia4918eb0bac3b28b27e1ef205b9171680b2eb9a4 --- vpx_dsp/arm/highbd_sad4d_neon.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/vpx_dsp/arm/highbd_sad4d_neon.c b/vpx_dsp/arm/highbd_sad4d_neon.c index 62c4685a7a..a6684b0534 100644 --- a/vpx_dsp/arm/highbd_sad4d_neon.c +++ b/vpx_dsp/arm/highbd_sad4d_neon.c @@ -213,10 +213,11 @@ static INLINE void highbd_sad32xhx4d_neon(const uint8_t *src_ptr, } #define HBD_SAD_WXH_4D_NEON(w, h) \ - void vpx_highbd_sad##w##x##h##x4d_neon(const uint8_t *src, int src_stride, \ - const uint8_t *const ref[4], \ - int ref_stride, uint32_t res[4]) { \ - highbd_sad##w##xhx4d_neon(src, src_stride, ref, ref_stride, res, (h)); \ + void vpx_highbd_sad##w##x##h##x4d_neon( \ + const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + highbd_sad##w##xhx4d_neon(src, src_stride, ref_array, ref_stride, \ + sad_array, (h)); \ } HBD_SAD_WXH_4D_NEON(4, 4) @@ -239,16 +240,16 @@ HBD_SAD_WXH_4D_NEON(64, 64) #undef HBD_SAD_WXH_4D_NEON -#define HBD_SAD_SKIP_WXH_4D_NEON(w, h) \ - void vpx_highbd_sad_skip_##w##x##h##x4d_neon( \ - const uint8_t *src, int src_stride, const uint8_t *const ref[4], \ - int ref_stride, uint32_t res[4]) { \ - highbd_sad##w##xhx4d_neon(src, 2 * src_stride, ref, 2 * ref_stride, res, \ - ((h) >> 1)); \ - res[0] <<= 1; \ - res[1] <<= 1; \ - res[2] <<= 1; \ - res[3] <<= 1; \ +#define HBD_SAD_SKIP_WXH_4D_NEON(w, h) \ + void vpx_highbd_sad_skip_##w##x##h##x4d_neon( \ + const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + highbd_sad##w##xhx4d_neon(src, 2 * src_stride, ref_array, 2 * ref_stride, \ + sad_array, ((h) >> 1)); \ + sad_array[0] <<= 1; \ + sad_array[1] <<= 1; \ + sad_array[2] <<= 1; \ + sad_array[3] <<= 1; \ } HBD_SAD_SKIP_WXH_4D_NEON(4, 4) From 2c03388231cf545e1245746a6ee4edfe0322e71e Mon Sep 17 00:00:00 2001 From: chiyotsai Date: Wed, 3 May 2023 14:45:13 -0700 Subject: [PATCH 0773/1000] Fix mismatched param names in vpx_dsp/x86/sad4d_avx2.c Change-Id: I226215a2ff8798b72abe0c2caf3d18875595caa5 --- vpx_dsp/x86/sad4d_avx2.c | 60 ++++++++++++++++++++-------------------- 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/vpx_dsp/x86/sad4d_avx2.c b/vpx_dsp/x86/sad4d_avx2.c index c87fd3cd27..cf7111983b 100644 --- a/vpx_dsp/x86/sad4d_avx2.c +++ b/vpx_dsp/x86/sad4d_avx2.c @@ -135,45 +135,45 @@ static INLINE void sad64xhx4d_avx2(const uint8_t *src_ptr, int src_stride, calc_final_4(sums, sad_array); } -#define SAD64_H(h) \ - void vpx_sad64x##h##x4d_avx2(const uint8_t *src, int src_stride, \ - const uint8_t *const ref[4], int ref_stride, \ - uint32_t res[4]) { \ - sad64xhx4d_avx2(src, src_stride, ref, ref_stride, h, res); \ +#define SAD64_H(h) \ + void vpx_sad64x##h##x4d_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + sad64xhx4d_avx2(src, src_stride, ref_array, ref_stride, h, sad_array); \ } -#define SAD32_H(h) \ - void vpx_sad32x##h##x4d_avx2(const uint8_t *src, int src_stride, \ - const uint8_t *const ref[4], int ref_stride, \ - uint32_t res[4]) { \ - sad32xhx4d_avx2(src, src_stride, ref, ref_stride, h, res); \ +#define SAD32_H(h) \ + void vpx_sad32x##h##x4d_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + sad32xhx4d_avx2(src, src_stride, ref_array, ref_stride, h, sad_array); \ } SAD64_H(64) SAD32_H(32) -#define SADS64_H(h) \ - void vpx_sad_skip_64x##h##x4d_avx2(const uint8_t *src, int src_stride, \ - const uint8_t *const ref[4], \ - int ref_stride, uint32_t res[4]) { \ - sad64xhx4d_avx2(src, 2 * src_stride, ref, 2 * ref_stride, ((h) >> 1), \ - res); \ - res[0] <<= 1; \ - res[1] <<= 1; \ - res[2] <<= 1; \ - res[3] <<= 1; \ +#define SADS64_H(h) \ + void vpx_sad_skip_64x##h##x4d_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + sad64xhx4d_avx2(src, 2 * src_stride, ref_array, 2 * ref_stride, \ + ((h) >> 1), sad_array); \ + sad_array[0] <<= 1; \ + sad_array[1] <<= 1; \ + sad_array[2] <<= 1; \ + sad_array[3] <<= 1; \ } -#define SADS32_H(h) \ - void vpx_sad_skip_32x##h##x4d_avx2(const uint8_t *src, int src_stride, \ - const uint8_t *const ref[4], \ - int ref_stride, uint32_t res[4]) { \ - sad32xhx4d_avx2(src, 2 * src_stride, ref, 2 * ref_stride, ((h) >> 1), \ - res); \ - res[0] <<= 1; \ - res[1] <<= 1; \ - res[2] <<= 1; \ - res[3] <<= 1; \ +#define SADS32_H(h) \ + void vpx_sad_skip_32x##h##x4d_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + sad32xhx4d_avx2(src, 2 * src_stride, ref_array, 2 * ref_stride, \ + ((h) >> 1), sad_array); \ + sad_array[0] <<= 1; \ + sad_array[1] <<= 1; \ + sad_array[2] <<= 1; \ + sad_array[3] <<= 1; \ } SADS64_H(64) From de45e4b612bb576f76d35770afc62ae799e6c0fd Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Fri, 28 Apr 2023 14:32:47 -0400 Subject: [PATCH 0774/1000] Add codec control to export TPL stats new codec control: VP9E_GET_TPL_STATS with unit test Bug: b/273736974 Change-Id: I27343bd3f6dffafc86925234537bcdb557bc4079 --- test/encode_api_test.cc | 80 ++++++++++++++++++++++++++++++++++++- test/encode_test_driver.h | 5 +++ vp9/encoder/vp9_encoder.h | 1 + vp9/encoder/vp9_tpl_model.c | 2 + vp9/vp9_cx_iface.c | 18 +++++++++ vpx/vp8cx.h | 14 +++++++ vpx/vpx_encoder.h | 2 +- 7 files changed, 120 insertions(+), 2 deletions(-) diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc index ecdf928343..eac9626052 100644 --- a/test/encode_api_test.cc +++ b/test/encode_api_test.cc @@ -13,9 +13,12 @@ #include #include "third_party/googletest/src/include/gtest/gtest.h" +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/i420_video_source.h" +#include "test/video_source.h" #include "./vpx_config.h" -#include "test/video_source.h" #include "vpx/vp8cx.h" #include "vpx/vpx_encoder.h" @@ -360,4 +363,79 @@ TEST(EncodeAPI, ConfigChangeThreadCount) { } } +#if CONFIG_VP9_ENCODER +class EncodeApiGetTplStatsTest + : public ::libvpx_test::EncoderTest, + public ::testing::TestWithParam { + public: + EncodeApiGetTplStatsTest() : EncoderTest(GetParam()) {} + ~EncodeApiGetTplStatsTest() override {} + + protected: + void SetUp() override { + InitializeConfig(); + SetMode(::libvpx_test::kTwoPassGood); + } + + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { + if (video->frame() == 0) { + encoder->Control(VP9E_SET_TPL, 1); + } + } + + vpx_codec_err_t AllocateTplList(TplFrameStats **data) { + // Allocate MAX_ARF_GOP_SIZE * sizeof(TplFrameStats) that will be filled + // by VP9E_GET_TPL_STATS + *data = static_cast(calloc(50, sizeof(TplFrameStats))); + if (*data == nullptr) return VPX_CODEC_MEM_ERROR; + return VPX_CODEC_OK; + } + + void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) override { + ::libvpx_test::CxDataIterator iter = encoder->GetCxData(); + while (const vpx_codec_cx_pkt_t *pkt = iter.Next()) { + switch (pkt->kind) { + case VPX_CODEC_CX_FRAME_PKT: { + TplFrameStats *tpl_stats = NULL; + EXPECT_EQ(AllocateTplList(&tpl_stats), VPX_CODEC_OK); + encoder->Control(VP9E_GET_TPL_STATS, tpl_stats); + bool stats_not_all_zero = false; + for (unsigned int i = 0; i < cfg_.g_lag_in_frames; i++) { + if (tpl_stats[i].frame_width != 0) { + ASSERT_EQ(tpl_stats[i].frame_width, width_); + ASSERT_EQ(tpl_stats[i].frame_height, height_); + ASSERT_NE(tpl_stats[i].block_stats_list, nullptr); + stats_not_all_zero = true; + } + } + ASSERT_TRUE(stats_not_all_zero); + // Free the memory right away now as this is only a test. + free(tpl_stats); + break; + } + default: break; + } + } + } + + int width_; + int height_; +}; + +TEST_P(EncodeApiGetTplStatsTest, GetTplStats) { + cfg_.g_lag_in_frames = 25; + width_ = 352; + height_ = 288; + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", width_, + height_, 30, 1, 0, 150); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +} + +INSTANTIATE_TEST_SUITE_P( + VP9, EncodeApiGetTplStatsTest, + ::testing::Values( + static_cast(&libvpx_test::kVP9))); +#endif // CONFIG_VP9_ENCODER + } // namespace diff --git a/test/encode_test_driver.h b/test/encode_test_driver.h index b57df85291..27a78e68d2 100644 --- a/test/encode_test_driver.h +++ b/test/encode_test_driver.h @@ -153,6 +153,11 @@ class Encoder { const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg); ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError(); } + + void Control(int ctrl_id, TplFrameStats *arg) { + const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg); + ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError(); + } #endif // CONFIG_VP9_ENCODER #if CONFIG_VP8_ENCODER || CONFIG_VP9_ENCODER diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 7c22c807b7..742cf0b6dd 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -505,6 +505,7 @@ typedef struct EncFrameBuf { } EncFrameBuf; // Maximum operating frame buffer size needed for a GOP using ARF reference. +// This is used to allocate the memory for TPL stats for a GOP. #define MAX_ARF_GOP_SIZE (2 * MAX_LAG_BUFFERS) #define MAX_KMEANS_GROUPS 8 diff --git a/vp9/encoder/vp9_tpl_model.c b/vp9/encoder/vp9_tpl_model.c index dbd7482b0d..b62c66b6ce 100644 --- a/vp9/encoder/vp9_tpl_model.c +++ b/vp9/encoder/vp9_tpl_model.c @@ -1134,6 +1134,8 @@ static void mc_flow_dispenser(VP9_COMP *cpi, GF_PICTURE *gf_picture, const int mi_height = num_8x8_blocks_high_lookup[bsize]; const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + tpl_frame_stats_before_propagation->frame_width = cm->width; + tpl_frame_stats_before_propagation->frame_height = cm->height; // Setup scaling factor #if CONFIG_VP9_HIGHBITDEPTH vp9_setup_scale_factors_for_frame( diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index 4c7eaed725..8bd880c7b5 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -1788,6 +1788,23 @@ static vpx_codec_err_t ctrl_get_svc_ref_frame_config(vpx_codec_alg_priv_t *ctx, return VPX_CODEC_OK; } +static vpx_codec_err_t ctrl_get_tpl_stats(vpx_codec_alg_priv_t *ctx, + va_list args) { + VP9_COMP *const cpi = ctx->cpi; + TplFrameStats *data = va_arg(args, TplFrameStats *); + int i; + if (data == NULL) { + return VPX_CODEC_INVALID_PARAM; + } + for (i = 0; i < MAX_ARF_GOP_SIZE; i++) { + data[i].frame_width = cpi->tpl_frame_stats[i].frame_width; + data[i].frame_height = cpi->tpl_frame_stats[i].frame_height; + data[i].block_stats_list = cpi->tpl_frame_stats[i].block_stats_list; + } + + return VPX_CODEC_OK; +} + static vpx_codec_err_t ctrl_set_svc_ref_frame_config(vpx_codec_alg_priv_t *ctx, va_list args) { VP9_COMP *const cpi = ctx->cpi; @@ -2035,6 +2052,7 @@ static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = { { VP9E_GET_ACTIVEMAP, ctrl_get_active_map }, { VP9E_GET_LEVEL, ctrl_get_level }, { VP9E_GET_SVC_REF_FRAME_CONFIG, ctrl_get_svc_ref_frame_config }, + { VP9E_GET_TPL_STATS, ctrl_get_tpl_stats }, { -1, NULL }, }; diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h index e0b679fbb7..123a645d91 100644 --- a/vpx/vp8cx.h +++ b/vpx/vp8cx.h @@ -767,6 +767,18 @@ enum vp8e_enc_control_id { * */ VP9E_SET_QUANTIZER_ONE_PASS, + + /*!\brief Codec control to get TPL stats for the current GOP. + * + * Allocation and free of memory of size MAX_ARF_GOP_SIZE (50) * + * sizeof(TplFrameStats) should be done by applications. + * + * VPX_CODEC_INVALID_PARAM will be returned if the pointer passed in is NULL. + * + * Supported in codecs: VP9 + * + */ + VP9E_GET_TPL_STATS, }; /*!\brief vpx 1-D scaling mode @@ -1097,6 +1109,8 @@ VPX_CTRL_USE_TYPE(VP8E_SET_RTC_EXTERNAL_RATECTRL, int) #define VPX_CTRL_VP8E_SET_RTC_EXTERNAL_RATECTRL VPX_CTRL_USE_TYPE(VP9E_SET_QUANTIZER_ONE_PASS, int) #define VPX_CTRL_VP9E_SET_QUANTIZER_ONE_PASS +VPX_CTRL_USE_TYPE(VP9E_GET_TPL_STATS, void *) +#define VPX_CTRL_VP9E_GET_TPL_STATS /*!\endcond */ /*! @} - end defgroup vp8_encoder */ diff --git a/vpx/vpx_encoder.h b/vpx/vpx_encoder.h index 9247231328..1910a19040 100644 --- a/vpx/vpx_encoder.h +++ b/vpx/vpx_encoder.h @@ -58,7 +58,7 @@ extern "C" { * fields to structures */ #define VPX_ENCODER_ABI_VERSION \ - (15 + VPX_CODEC_ABI_VERSION + \ + (16 + VPX_CODEC_ABI_VERSION + \ VPX_EXT_RATECTRL_ABI_VERSION) /**<\hideinitializer*/ /*! \brief Encoder capabilities bitfield From f059f9ee2df114ffa475aaef064f93396ebf39cc Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Thu, 4 May 2023 10:32:03 -0400 Subject: [PATCH 0775/1000] Add Vpx* prefix to Tpl{Block,Frame}Stats This is to avoid symbols redifinition when integrating with other libraries. Bug: b/273736974 Change-Id: I891af78b1907504d5bb9f735164aea18c2aba944 --- test/encode_api_test.cc | 9 +++++---- test/encode_test_driver.h | 2 +- vp9/encoder/vp9_encoder.h | 2 +- vp9/encoder/vp9_tpl_model.c | 8 ++++---- vp9/vp9_cx_iface.c | 2 +- vpx/vp8cx.h | 2 +- vpx/vpx_encoder.h | 10 +++++----- 7 files changed, 18 insertions(+), 17 deletions(-) diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc index eac9626052..0514cd828f 100644 --- a/test/encode_api_test.cc +++ b/test/encode_api_test.cc @@ -384,10 +384,11 @@ class EncodeApiGetTplStatsTest } } - vpx_codec_err_t AllocateTplList(TplFrameStats **data) { - // Allocate MAX_ARF_GOP_SIZE * sizeof(TplFrameStats) that will be filled + vpx_codec_err_t AllocateTplList(VpxTplFrameStats **data) { + // Allocate MAX_ARF_GOP_SIZE * sizeof(VpxTplFrameStats) that will be filled // by VP9E_GET_TPL_STATS - *data = static_cast(calloc(50, sizeof(TplFrameStats))); + *data = + static_cast(calloc(50, sizeof(VpxTplFrameStats))); if (*data == nullptr) return VPX_CODEC_MEM_ERROR; return VPX_CODEC_OK; } @@ -397,7 +398,7 @@ class EncodeApiGetTplStatsTest while (const vpx_codec_cx_pkt_t *pkt = iter.Next()) { switch (pkt->kind) { case VPX_CODEC_CX_FRAME_PKT: { - TplFrameStats *tpl_stats = NULL; + VpxTplFrameStats *tpl_stats = NULL; EXPECT_EQ(AllocateTplList(&tpl_stats), VPX_CODEC_OK); encoder->Control(VP9E_GET_TPL_STATS, tpl_stats); bool stats_not_all_zero = false; diff --git a/test/encode_test_driver.h b/test/encode_test_driver.h index 27a78e68d2..a5cd8306ef 100644 --- a/test/encode_test_driver.h +++ b/test/encode_test_driver.h @@ -154,7 +154,7 @@ class Encoder { ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError(); } - void Control(int ctrl_id, TplFrameStats *arg) { + void Control(int ctrl_id, VpxTplFrameStats *arg) { const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg); ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError(); } diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 742cf0b6dd..cca1617830 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -745,7 +745,7 @@ typedef struct VP9_COMP { BLOCK_SIZE tpl_bsize; TplDepFrame tpl_stats[MAX_ARF_GOP_SIZE]; // Used to store TPL stats before propagation - TplFrameStats tpl_frame_stats[MAX_ARF_GOP_SIZE]; + VpxTplFrameStats tpl_frame_stats[MAX_ARF_GOP_SIZE]; YV12_BUFFER_CONFIG *tpl_recon_frames[REF_FRAMES]; EncFrameBuf enc_frame_buf[REF_FRAMES]; #if CONFIG_MULTITHREAD diff --git a/vp9/encoder/vp9_tpl_model.c b/vp9/encoder/vp9_tpl_model.c index b62c66b6ce..ea5d61e326 100644 --- a/vp9/encoder/vp9_tpl_model.c +++ b/vp9/encoder/vp9_tpl_model.c @@ -154,7 +154,7 @@ static void init_tpl_stats(VP9_COMP *cpi) { int frame_idx; for (frame_idx = 0; frame_idx < MAX_ARF_GOP_SIZE; ++frame_idx) { TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx]; - TplFrameStats *tpl_frame_stats = &cpi->tpl_frame_stats[frame_idx]; + VpxTplFrameStats *tpl_frame_stats = &cpi->tpl_frame_stats[frame_idx]; memset(tpl_frame->tpl_stats_ptr, 0, tpl_frame->height * tpl_frame->width * sizeof(*tpl_frame->tpl_stats_ptr)); @@ -356,7 +356,7 @@ static void tpl_model_store(TplDepStats *tpl_stats, int mi_row, int mi_col, } } -static void tpl_store_before_propagation(TplBlockStats *tpl_block_stats, +static void tpl_store_before_propagation(VpxTplBlockStats *tpl_block_stats, TplDepStats *tpl_stats, int mi_row, int mi_col, BLOCK_SIZE bsize, int stride, int64_t recon_error, @@ -368,7 +368,7 @@ static void tpl_store_before_propagation(TplBlockStats *tpl_block_stats, for (idy = 0; idy < mi_height; ++idy) { for (idx = 0; idx < mi_width; ++idx) { - TplBlockStats *tpl_block_stats_ptr = + VpxTplBlockStats *tpl_block_stats_ptr = &tpl_block_stats[(mi_row + idy) * stride + mi_col + idx]; tpl_block_stats_ptr->inter_cost = src_stats->inter_cost; tpl_block_stats_ptr->intra_cost = src_stats->intra_cost; @@ -1105,7 +1105,7 @@ static void build_motion_field( static void mc_flow_dispenser(VP9_COMP *cpi, GF_PICTURE *gf_picture, int frame_idx, BLOCK_SIZE bsize) { TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx]; - TplFrameStats *tpl_frame_stats_before_propagation = + VpxTplFrameStats *tpl_frame_stats_before_propagation = &cpi->tpl_frame_stats[frame_idx]; YV12_BUFFER_CONFIG *this_frame = gf_picture[frame_idx].frame; YV12_BUFFER_CONFIG *ref_frame[MAX_INTER_REF_FRAMES] = { NULL, NULL, NULL }; diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index 8bd880c7b5..66efba181f 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -1791,7 +1791,7 @@ static vpx_codec_err_t ctrl_get_svc_ref_frame_config(vpx_codec_alg_priv_t *ctx, static vpx_codec_err_t ctrl_get_tpl_stats(vpx_codec_alg_priv_t *ctx, va_list args) { VP9_COMP *const cpi = ctx->cpi; - TplFrameStats *data = va_arg(args, TplFrameStats *); + VpxTplFrameStats *data = va_arg(args, VpxTplFrameStats *); int i; if (data == NULL) { return VPX_CODEC_INVALID_PARAM; diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h index 123a645d91..c4e04084c8 100644 --- a/vpx/vp8cx.h +++ b/vpx/vp8cx.h @@ -771,7 +771,7 @@ enum vp8e_enc_control_id { /*!\brief Codec control to get TPL stats for the current GOP. * * Allocation and free of memory of size MAX_ARF_GOP_SIZE (50) * - * sizeof(TplFrameStats) should be done by applications. + * sizeof(VpxTplFrameStats) should be done by applications. * * VPX_CODEC_INVALID_PARAM will be returned if the pointer passed in is NULL. * diff --git a/vpx/vpx_encoder.h b/vpx/vpx_encoder.h index 1910a19040..66c5a68b86 100644 --- a/vpx/vpx_encoder.h +++ b/vpx/vpx_encoder.h @@ -253,7 +253,7 @@ enum vpx_kf_mode { }; /*!\brief Temporal dependency model stats for each block before propagation */ -typedef struct TplBlockStats { +typedef struct VpxTplBlockStats { int64_t intra_cost; /**< Intra cost */ int64_t inter_cost; /**< Inter cost */ int16_t mv_r; /**< Motion vector row */ @@ -261,15 +261,15 @@ typedef struct TplBlockStats { int64_t recrf_rate; /**< Rate from reconstructed ref frame */ int64_t recrf_dist; /**< Distortion from reconstructed ref frame */ int ref_frame_index; /**< Ref frame index */ -} TplBlockStats; +} VpxTplBlockStats; /*!\brief Temporal dependency model stats for each frame before propagation */ -typedef struct TplFrameStats { +typedef struct VpxTplFrameStats { int frame_width; /**< Frame width */ int frame_height; /**< Frame height */ // Size of the list can be calculated from frame_width and frame_height. - TplBlockStats *block_stats_list; /**< List of tpl stats for each block */ -} TplFrameStats; + VpxTplBlockStats *block_stats_list; /**< List of tpl stats for each block */ +} VpxTplFrameStats; /*!\brief Encoded Frame Flags * From 2e5261647f0440e78081f9bfaaaa5ef9d10140d6 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Thu, 4 May 2023 10:59:46 -0400 Subject: [PATCH 0776/1000] Add num_blocks to VpxTplFrameStats I realized the calculation of the size of the list of VpxTplBlockStats is non-trivial. So it's better to add the field for the size. Bug: b/273736974 Change-Id: Ic1b50597c1f89a8f866b5669ca676407be6dc9d8 --- test/encode_api_test.cc | 1 + vp9/encoder/vp9_tpl_model.c | 1 + vp9/vp9_cx_iface.c | 1 + vpx/vpx_encoder.h | 2 +- 4 files changed, 4 insertions(+), 1 deletion(-) diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc index 0514cd828f..e435ed872f 100644 --- a/test/encode_api_test.cc +++ b/test/encode_api_test.cc @@ -406,6 +406,7 @@ class EncodeApiGetTplStatsTest if (tpl_stats[i].frame_width != 0) { ASSERT_EQ(tpl_stats[i].frame_width, width_); ASSERT_EQ(tpl_stats[i].frame_height, height_); + ASSERT_GT(tpl_stats[i].num_blocks, 0); ASSERT_NE(tpl_stats[i].block_stats_list, nullptr); stats_not_all_zero = true; } diff --git a/vp9/encoder/vp9_tpl_model.c b/vp9/encoder/vp9_tpl_model.c index ea5d61e326..ed771dcb4b 100644 --- a/vp9/encoder/vp9_tpl_model.c +++ b/vp9/encoder/vp9_tpl_model.c @@ -1354,6 +1354,7 @@ void vp9_init_tpl_buffer(VP9_COMP *cpi) { cm, cpi->tpl_frame_stats[frame].block_stats_list, vpx_calloc(mi_rows * mi_cols, sizeof(*cpi->tpl_frame_stats[frame].block_stats_list))); + cpi->tpl_frame_stats[frame].num_blocks = mi_rows * mi_cols; cpi->tpl_stats[frame].is_valid = 0; cpi->tpl_stats[frame].width = mi_cols; cpi->tpl_stats[frame].height = mi_rows; diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index 66efba181f..e264ae9bd9 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -1799,6 +1799,7 @@ static vpx_codec_err_t ctrl_get_tpl_stats(vpx_codec_alg_priv_t *ctx, for (i = 0; i < MAX_ARF_GOP_SIZE; i++) { data[i].frame_width = cpi->tpl_frame_stats[i].frame_width; data[i].frame_height = cpi->tpl_frame_stats[i].frame_height; + data[i].num_blocks = cpi->tpl_frame_stats[i].num_blocks; data[i].block_stats_list = cpi->tpl_frame_stats[i].block_stats_list; } diff --git a/vpx/vpx_encoder.h b/vpx/vpx_encoder.h index 66c5a68b86..a7f1552de0 100644 --- a/vpx/vpx_encoder.h +++ b/vpx/vpx_encoder.h @@ -267,7 +267,7 @@ typedef struct VpxTplBlockStats { typedef struct VpxTplFrameStats { int frame_width; /**< Frame width */ int frame_height; /**< Frame height */ - // Size of the list can be calculated from frame_width and frame_height. + int num_blocks; /**< Number of blocks. Size of block_stats_list */ VpxTplBlockStats *block_stats_list; /**< List of tpl stats for each block */ } VpxTplFrameStats; From 4e23e7abfe02f7030f193327bc9adacfefaef991 Mon Sep 17 00:00:00 2001 From: James Zern Date: Thu, 4 May 2023 17:17:10 -0700 Subject: [PATCH 0777/1000] vpx_subpixel_8t_intrin_avx2,cosmetics: shorten long comment Change-Id: I8badedc2ad07d60896e45de28b707ad9f6c4d499 --- vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c index 9ff67bd301..2498bba173 100644 --- a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c +++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c @@ -1078,7 +1078,7 @@ static void vpx_filter_block1d4_h8_avx2( // f4|f7 f6 f5 f4|f7 f6 f5 f4 secondFilters = _mm256_shuffle_epi32(filtersReg32, 0x55); - // s6 s5 s4 s3 s5 s4 s3 s2 s4 s3 s2 s1 s3 s2 s1 s0 | s6 s5 s4 s3 s5 s4 s3 + // s6 s5 s4 s3 s5 s4 s3 s2 s4 s3 s2 s1 s3 s2 s1 s0 | s6 s5 s4 s3 s5 s4 s3 // s2 s4 s3 s2 s1 s3 s2 s1 s0 filt1Reg = _mm256_load_si256((__m256i const *)filt_d4_global_avx2); @@ -1102,7 +1102,7 @@ static void vpx_filter_block1d4_h8_avx2( srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg); // multiply 4 adjacent elements with the filter and add the result - // .....|f3*r14+f2*r13|f1*r13+f0*r12|f3*r13+f2*r12|f1*r11+f0*r10||......... + // ...|f3*r14+f2*r13|f1*r13+f0*r12|f3*r13+f2*r12|f1*r11+f0*r10||... // |f1*r03+f0*r02|f3*r04+f2*r03|f1*r02+f0*r01|f3*r03+f2*r02|f1*r01+f0*r00 srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters); From 601a98b1542fa1fb439c715a9e2e8559338d33f8 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Thu, 23 Mar 2023 14:50:10 -0700 Subject: [PATCH 0778/1000] Add comments about vpx_codec_enc_init_ver failure Address the questions: 1. If vpx_codec_enc_init_ver() fails, should I still call vpx_codec_destroy() on the encoder context? 2. Is it safe to call vpx_codec_error_detail() when vpx_codec_enc_init_ver() failed? Change-Id: I1b0e090d11dd9f853fe203f4cbb6080c3c7b0506 --- vp9/vp9_cx_iface.c | 2 ++ vpx/src/vpx_encoder.c | 4 ++++ vpx/vpx_encoder.h | 5 ++++- 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index e264ae9bd9..f067efdf79 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -129,6 +129,8 @@ struct vpx_codec_alg_priv { BufferPool *buffer_pool; }; +// Called by encoder_set_config() and encoder_encode() only. Must not be called +// by encoder_init(). static vpx_codec_err_t update_error_state( vpx_codec_alg_priv_t *ctx, const struct vpx_internal_error_info *error) { const vpx_codec_err_t res = error->error_code; diff --git a/vpx/src/vpx_encoder.c b/vpx/src/vpx_encoder.c index 846638fe55..0d6e48015a 100644 --- a/vpx/src/vpx_encoder.c +++ b/vpx/src/vpx_encoder.c @@ -54,6 +54,10 @@ vpx_codec_err_t vpx_codec_enc_init_ver(vpx_codec_ctx_t *ctx, res = ctx->iface->init(ctx, NULL); if (res) { + // IMPORTANT: ctx->priv->err_detail must be null or point to a string + // that remains valid after ctx->priv is destroyed, such as a C string + // literal. This makes it safe to call vpx_codec_error_detail() after + // vpx_codec_enc_init_ver() failed. ctx->err_detail = ctx->priv ? ctx->priv->err_detail : NULL; vpx_codec_destroy(ctx); } diff --git a/vpx/vpx_encoder.h b/vpx/vpx_encoder.h index a7f1552de0..2de8089736 100644 --- a/vpx/vpx_encoder.h +++ b/vpx/vpx_encoder.h @@ -877,7 +877,7 @@ typedef struct vpx_svc_parameters { /*!\brief Initialize an encoder instance * - * Initializes a encoder context using the given interface. Applications + * Initializes an encoder context using the given interface. Applications * should call the vpx_codec_enc_init convenience macro instead of this * function directly, to ensure that the ABI version number parameter * is properly initialized. @@ -886,6 +886,9 @@ typedef struct vpx_svc_parameters { * is not thread safe and should be guarded with a lock if being used * in a multithreaded context. * + * If vpx_codec_enc_init_ver() fails, it is not necessary to call + * vpx_codec_destroy() on the encoder context. + * * \param[in] ctx Pointer to this instance's context. * \param[in] iface Pointer to the algorithm interface to use. * \param[in] cfg Configuration to use, if known. May be NULL. From 8e47341b0ea4bf2a37f968cf260d6dbfd1f0062a Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Tue, 28 Mar 2023 18:07:41 -0700 Subject: [PATCH 0779/1000] Have vpx_codec_error take const vpx_codec_ctx_t * Also have vpx_codec_error_detail take vpx_codec_ctx_t *. Both functions are getter functions that don't modify the codec context. Change-Id: I4689022425efbf7b1da5034255ac052fce5e5b4f --- vpx/src/vpx_codec.c | 6 +++--- vpx/vpx_codec.h | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/vpx/src/vpx_codec.c b/vpx/src/vpx_codec.c index 114b94e194..24528d860a 100644 --- a/vpx/src/vpx_codec.c +++ b/vpx/src/vpx_codec.c @@ -50,12 +50,12 @@ const char *vpx_codec_err_to_string(vpx_codec_err_t err) { return "Unrecognized error code"; } -const char *vpx_codec_error(vpx_codec_ctx_t *ctx) { +const char *vpx_codec_error(const vpx_codec_ctx_t *ctx) { return (ctx) ? vpx_codec_err_to_string(ctx->err) : vpx_codec_err_to_string(VPX_CODEC_INVALID_PARAM); } -const char *vpx_codec_error_detail(vpx_codec_ctx_t *ctx) { +const char *vpx_codec_error_detail(const vpx_codec_ctx_t *ctx) { if (ctx && ctx->err) return ctx->priv ? ctx->priv->err_detail : ctx->err_detail; @@ -82,7 +82,7 @@ vpx_codec_err_t vpx_codec_destroy(vpx_codec_ctx_t *ctx) { } vpx_codec_caps_t vpx_codec_get_caps(vpx_codec_iface_t *iface) { - return (iface) ? iface->caps : 0; + return iface ? iface->caps : 0; } vpx_codec_err_t vpx_codec_control_(vpx_codec_ctx_t *ctx, int ctrl_id, ...) { diff --git a/vpx/vpx_codec.h b/vpx/vpx_codec.h index 11bf8aaa22..ca18d90cb7 100644 --- a/vpx/vpx_codec.h +++ b/vpx/vpx_codec.h @@ -318,7 +318,7 @@ const char *vpx_codec_err_to_string(vpx_codec_err_t err); * \param[in] ctx Pointer to this instance's context. * */ -const char *vpx_codec_error(vpx_codec_ctx_t *ctx); +const char *vpx_codec_error(const vpx_codec_ctx_t *ctx); /*!\brief Retrieve detailed error information for codec context * @@ -330,7 +330,7 @@ const char *vpx_codec_error(vpx_codec_ctx_t *ctx); * \retval NULL * No detailed information is available. */ -const char *vpx_codec_error_detail(vpx_codec_ctx_t *ctx); +const char *vpx_codec_error_detail(const vpx_codec_ctx_t *ctx); /* REQUIRED FUNCTIONS * From 3d6b86e7045481c55b35d0daa4f19202bbe99dc1 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Fri, 24 Mar 2023 11:32:36 -0700 Subject: [PATCH 0780/1000] Overwrite cm->error->detail before freeing Help detect use after free of the return value of vpx_codec_error_detail(). If vpx_codec_error_detail() is called after vpx_codec_encode() fails, the return value may be equal to cm->error->detail, which is freed when vpx_codec_destroy() is called. Document the lifetime of the string returned by vpx_codec_error_detail(). Change-Id: I8089e90a4499b4f3cc5b9cfdbb25d72368faa319 --- vp9/encoder/vp9_encoder.c | 5 +++++ vpx/vpx_codec.h | 4 +++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 662ec24b83..f76eec2b57 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -12,6 +12,7 @@ #include #include #include +#include #include "./vp9_rtcd.h" #include "./vpx_config.h" @@ -2873,6 +2874,10 @@ void vp9_remove_compressor(VP9_COMP *cpi) { vp9_extrc_delete(&cpi->ext_ratectrl); + // Help detect use after free of the error detail string. + memset(cm->error.detail, 'A', sizeof(cm->error.detail) - 1); + cm->error.detail[sizeof(cm->error.detail) - 1] = '\0'; + vp9_remove_common(cm); vp9_free_ref_frame_buffers(cm->buffer_pool); #if CONFIG_VP9_POSTPROC diff --git a/vpx/vpx_codec.h b/vpx/vpx_codec.h index ca18d90cb7..0d61b07381 100644 --- a/vpx/vpx_codec.h +++ b/vpx/vpx_codec.h @@ -323,7 +323,9 @@ const char *vpx_codec_error(const vpx_codec_ctx_t *ctx); /*!\brief Retrieve detailed error information for codec context * * Returns a human readable string providing detailed information about - * the last error. + * the last error. The returned string is only valid until the next + * vpx_codec_* function call (except vpx_codec_error and + * vpx_codec_error_detail) on the codec context. * * \param[in] ctx Pointer to this instance's context. * From 255ee1888589aa15ae909b992fe123c0358b1730 Mon Sep 17 00:00:00 2001 From: Anupam Pandey Date: Tue, 18 Apr 2023 14:46:56 +0530 Subject: [PATCH 0781/1000] Add AVX2 intrinsic for idct16x16 and idct32x32 functions Added AVX2 intrinsic optimization for the following functions 1. vpx_idct16x16_256_add 2. vpx_idct32x32_1024_add 3. vpx_idct32x32_135_add The module level scaling w.r.t C function (timer based) for existing (SSE2) and new AVX2 intrinsics: Scaling Function Name SSE2 AVX2 vpx_idct32x32_1024_add 3.62x 7.49x vpx_idct32x32_135_add 4.85x 9.41x vpx_idct16x16_256_add 4.82x 7.70x This is a bit-exact change. Change-Id: Id9dda933aa1f5093bb6b35ac3b8a41846afca9d2 --- test/dct16x16_test.cc | 98 +++++- test/dct32x32_test.cc | 197 +++++++++++ vp9/common/vp9_idct.c | 2 + vp9/decoder/vp9_decoder.c | 2 +- vp9/decoder/vp9_decoder.h | 2 +- vpx_dsp/vpx_dsp.mk | 1 + vpx_dsp/vpx_dsp_rtcd_defs.pl | 6 +- vpx_dsp/x86/inv_txfm_avx2.c | 626 +++++++++++++++++++++++++++++++++++ 8 files changed, 926 insertions(+), 8 deletions(-) create mode 100644 vpx_dsp/x86/inv_txfm_avx2.c diff --git a/test/dct16x16_test.cc b/test/dct16x16_test.cc index 3c104f3a44..4ad2263cfc 100644 --- a/test/dct16x16_test.cc +++ b/test/dct16x16_test.cc @@ -592,7 +592,7 @@ class Trans16x16TestBase { const int count_test_block = 10000; const int eob = 10; const int16_t *scan = vp9_default_scan_orders[TX_16X16].scan; - DECLARE_ALIGNED(16, tran_low_t, coeff[kNumCoeffs]); + DECLARE_ALIGNED(32, tran_low_t, coeff[kNumCoeffs]); DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]); DECLARE_ALIGNED(16, uint8_t, ref[kNumCoeffs]); #if CONFIG_VP9_HIGHBITDEPTH @@ -643,6 +643,80 @@ class Trans16x16TestBase { } } + void RunInvTrans16x16SpeedTest(IdctFunc ref_txfm, int thresh) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const int count_test_block = 10000; + const int eob = 10; + const int16_t *scan = vp9_default_scan_orders[TX_16X16].scan; + int64_t c_sum_time = 0; + int64_t simd_sum_time = 0; + DECLARE_ALIGNED(32, tran_low_t, coeff[kNumCoeffs]); + DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]); + DECLARE_ALIGNED(16, uint8_t, ref[kNumCoeffs]); +#if CONFIG_VP9_HIGHBITDEPTH + DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]); + DECLARE_ALIGNED(16, uint16_t, ref16[kNumCoeffs]); +#endif // CONFIG_VP9_HIGHBITDEPTH + + for (int j = 0; j < kNumCoeffs; ++j) { + if (j < eob) { + // Random values less than the threshold, either positive or negative + coeff[scan[j]] = rnd(thresh); + } else { + coeff[scan[j]] = 0; + } + if (bit_depth_ == VPX_BITS_8) { + dst[j] = 0; + ref[j] = 0; +#if CONFIG_VP9_HIGHBITDEPTH + } else { + dst16[j] = 0; + ref16[j] = 0; +#endif // CONFIG_VP9_HIGHBITDEPTH + } + } + + if (bit_depth_ == VPX_BITS_8) { + vpx_usec_timer timer_c; + vpx_usec_timer_start(&timer_c); + for (int i = 0; i < count_test_block; ++i) { + ref_txfm(coeff, ref, pitch_); + } + vpx_usec_timer_mark(&timer_c); + c_sum_time += vpx_usec_timer_elapsed(&timer_c); + + vpx_usec_timer timer_mod; + vpx_usec_timer_start(&timer_mod); + for (int i = 0; i < count_test_block; ++i) { + RunInvTxfm(coeff, dst, pitch_); + } + vpx_usec_timer_mark(&timer_mod); + simd_sum_time += vpx_usec_timer_elapsed(&timer_mod); + } else { +#if CONFIG_VP9_HIGHBITDEPTH + vpx_usec_timer timer_c; + vpx_usec_timer_start(&timer_c); + for (int i = 0; i < count_test_block; ++i) { + ref_txfm(coeff, CAST_TO_BYTEPTR(ref16), pitch_); + } + vpx_usec_timer_mark(&timer_c); + c_sum_time += vpx_usec_timer_elapsed(&timer_c); + + vpx_usec_timer timer_mod; + vpx_usec_timer_start(&timer_mod); + for (int i = 0; i < count_test_block; ++i) { + RunInvTxfm(coeff, CAST_TO_BYTEPTR(dst16), pitch_); + } + vpx_usec_timer_mark(&timer_mod); + simd_sum_time += vpx_usec_timer_elapsed(&timer_mod); +#endif // CONFIG_VP9_HIGHBITDEPTH + } + printf( + "c_time = %" PRId64 " \t simd_time = %" PRId64 " \t Gain = %4.2f \n", + c_sum_time, simd_sum_time, + (static_cast(c_sum_time) / static_cast(simd_sum_time))); + } + int pitch_; int tx_type_; vpx_bit_depth_t bit_depth_; @@ -755,7 +829,6 @@ TEST_P(Trans16x16HT, QuantCheck) { RunQuantCheck(429, 729); } -#if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE class InvTrans16x16DCT : public Trans16x16TestBase, public ::testing::TestWithParam { public: @@ -786,7 +859,10 @@ GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(InvTrans16x16DCT); TEST_P(InvTrans16x16DCT, CompareReference) { CompareInvReference(ref_txfm_, thresh_); } -#endif // HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE + +TEST_P(InvTrans16x16DCT, DISABLED_Speed) { + RunInvTrans16x16SpeedTest(ref_txfm_, thresh_); +} using std::make_tuple; @@ -828,6 +904,12 @@ INSTANTIATE_TEST_SUITE_P( make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 1, VPX_BITS_8), make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 2, VPX_BITS_8), make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 3, VPX_BITS_8))); + +INSTANTIATE_TEST_SUITE_P(C, InvTrans16x16DCT, + ::testing::Values(make_tuple(&vpx_idct16x16_256_add_c, + &vpx_idct16x16_256_add_c, + 6225, VPX_BITS_8))); + #endif // CONFIG_VP9_HIGHBITDEPTH #if HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE @@ -862,6 +944,11 @@ INSTANTIATE_TEST_SUITE_P( 2, VPX_BITS_8), make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 3, VPX_BITS_8))); + +INSTANTIATE_TEST_SUITE_P(SSE2, InvTrans16x16DCT, + ::testing::Values(make_tuple( + &vpx_idct16x16_256_add_c, + &vpx_idct16x16_256_add_sse2, 6225, VPX_BITS_8))); #endif // HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE #if HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE @@ -869,6 +956,11 @@ INSTANTIATE_TEST_SUITE_P( AVX2, Trans16x16DCT, ::testing::Values(make_tuple(&vpx_fdct16x16_avx2, &vpx_idct16x16_256_add_sse2, 0, VPX_BITS_8))); + +INSTANTIATE_TEST_SUITE_P(AVX2, InvTrans16x16DCT, + ::testing::Values(make_tuple( + &vpx_idct16x16_256_add_c, + &vpx_idct16x16_256_add_avx2, 6225, VPX_BITS_8))); #endif // HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE #if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE diff --git a/test/dct32x32_test.cc b/test/dct32x32_test.cc index 91bb8e01ea..1167038b5f 100644 --- a/test/dct32x32_test.cc +++ b/test/dct32x32_test.cc @@ -24,10 +24,12 @@ #include "test/register_state_check.h" #include "test/util.h" #include "vp9/common/vp9_entropy.h" +#include "vp9/common/vp9_scan.h" #include "vpx/vpx_codec.h" #include "vpx/vpx_integer.h" #include "vpx_ports/mem.h" #include "vpx_ports/msvc.h" // for round() +#include "vpx_ports/vpx_timer.h" using libvpx_test::ACMRandom; @@ -71,6 +73,9 @@ typedef void (*InvTxfmFunc)(const tran_low_t *in, uint8_t *out, int stride); typedef std::tuple Trans32x32Param; +typedef std::tuple + InvTrans32x32Param; + #if CONFIG_VP9_HIGHBITDEPTH void idct32x32_10(const tran_low_t *in, uint8_t *out, int stride) { vpx_highbd_idct32x32_1024_add_c(in, CAST_TO_SHORTPTR(out), stride, 10); @@ -314,6 +319,174 @@ TEST_P(Trans32x32Test, InverseAccuracy) { } } +class InvTrans32x32Test : public ::testing::TestWithParam { + public: + virtual ~InvTrans32x32Test() {} + virtual void SetUp() { + ref_txfm_ = GET_PARAM(0); + inv_txfm_ = GET_PARAM(1); + version_ = GET_PARAM(2); // 0: high precision forward transform + // 1: low precision version for rd loop + bit_depth_ = GET_PARAM(3); + eob_ = GET_PARAM(4); + thresh_ = GET_PARAM(4); + mask_ = (1 << bit_depth_) - 1; + pitch_ = 32; + } + + virtual void TearDown() { libvpx_test::ClearSystemState(); } + + protected: + void RunRefTxfm(tran_low_t *out, uint8_t *dst, int stride) { + ref_txfm_(out, dst, stride); + } + void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) { + inv_txfm_(out, dst, stride); + } + int version_; + vpx_bit_depth_t bit_depth_; + int mask_; + int eob_; + int thresh_; + + InvTxfmFunc ref_txfm_; + InvTxfmFunc inv_txfm_; + int pitch_; + + void RunInvTrans32x32SpeedTest() { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const int count_test_block = 10000; + int64_t c_sum_time = 0; + int64_t simd_sum_time = 0; + const int16_t *scan = vp9_default_scan_orders[TX_32X32].scan; + DECLARE_ALIGNED(32, tran_low_t, coeff[kNumCoeffs]); + DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]); + DECLARE_ALIGNED(16, uint8_t, ref[kNumCoeffs]); +#if CONFIG_VP9_HIGHBITDEPTH + DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]); + DECLARE_ALIGNED(16, uint16_t, ref16[kNumCoeffs]); +#endif // CONFIG_VP9_HIGHBITDEPTH + + for (int j = 0; j < kNumCoeffs; ++j) { + if (j < eob_) { + // Random values less than the threshold, either positive or negative + coeff[scan[j]] = rnd(thresh_); + } else { + coeff[scan[j]] = 0; + } + if (bit_depth_ == VPX_BITS_8) { + dst[j] = 0; + ref[j] = 0; +#if CONFIG_VP9_HIGHBITDEPTH + } else { + dst16[j] = 0; + ref16[j] = 0; +#endif // CONFIG_VP9_HIGHBITDEPTH + } + } + + if (bit_depth_ == VPX_BITS_8) { + vpx_usec_timer timer_c; + vpx_usec_timer_start(&timer_c); + for (int i = 0; i < count_test_block; ++i) { + RunRefTxfm(coeff, ref, pitch_); + } + vpx_usec_timer_mark(&timer_c); + c_sum_time += vpx_usec_timer_elapsed(&timer_c); + + vpx_usec_timer timer_mod; + vpx_usec_timer_start(&timer_mod); + for (int i = 0; i < count_test_block; ++i) { + RunInvTxfm(coeff, dst, pitch_); + } + vpx_usec_timer_mark(&timer_mod); + simd_sum_time += vpx_usec_timer_elapsed(&timer_mod); + } else { +#if CONFIG_VP9_HIGHBITDEPTH + vpx_usec_timer timer_c; + vpx_usec_timer_start(&timer_c); + for (int i = 0; i < count_test_block; ++i) { + RunRefTxfm(coeff, CAST_TO_BYTEPTR(ref16), pitch_); + } + vpx_usec_timer_mark(&timer_c); + c_sum_time += vpx_usec_timer_elapsed(&timer_c); + + vpx_usec_timer timer_mod; + vpx_usec_timer_start(&timer_mod); + for (int i = 0; i < count_test_block; ++i) { + RunInvTxfm(coeff, CAST_TO_BYTEPTR(dst16), pitch_); + } + vpx_usec_timer_mark(&timer_mod); + simd_sum_time += vpx_usec_timer_elapsed(&timer_mod); +#endif // CONFIG_VP9_HIGHBITDEPTH + } + printf( + "c_time = %" PRId64 " \t simd_time = %" PRId64 " \t Gain = %4.2f \n", + c_sum_time, simd_sum_time, + (static_cast(c_sum_time) / static_cast(simd_sum_time))); + } + + void CompareInvReference32x32() { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const int count_test_block = 10000; + const int eob = 31; + const int16_t *scan = vp9_default_scan_orders[TX_32X32].scan; + DECLARE_ALIGNED(32, tran_low_t, coeff[kNumCoeffs]); + DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]); + DECLARE_ALIGNED(16, uint8_t, ref[kNumCoeffs]); +#if CONFIG_VP9_HIGHBITDEPTH + DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]); + DECLARE_ALIGNED(16, uint16_t, ref16[kNumCoeffs]); +#endif // CONFIG_VP9_HIGHBITDEPTH + + for (int i = 0; i < count_test_block; ++i) { + for (int j = 0; j < kNumCoeffs; ++j) { + if (j < eob) { + coeff[scan[j]] = rnd.Rand8Extremes(); + } else { + coeff[scan[j]] = 0; + } + if (bit_depth_ == VPX_BITS_8) { + dst[j] = 0; + ref[j] = 0; +#if CONFIG_VP9_HIGHBITDEPTH + } else { + dst16[j] = 0; + ref16[j] = 0; +#endif // CONFIG_VP9_HIGHBITDEPTH + } + } + if (bit_depth_ == VPX_BITS_8) { + RunRefTxfm(coeff, ref, pitch_); + RunInvTxfm(coeff, dst, pitch_); + } else { +#if CONFIG_VP9_HIGHBITDEPTH + RunRefTxfm(coeff, CAST_TO_BYTEPTR(ref16), pitch_); + ASM_REGISTER_STATE_CHECK( + RunInvTxfm(coeff, CAST_TO_BYTEPTR(dst16), pitch_)); +#endif // CONFIG_VP9_HIGHBITDEPTH + } + + for (int j = 0; j < kNumCoeffs; ++j) { +#if CONFIG_VP9_HIGHBITDEPTH + const uint32_t diff = + bit_depth_ == VPX_BITS_8 ? dst[j] - ref[j] : dst16[j] - ref16[j]; +#else + const uint32_t diff = dst[j] - ref[j]; +#endif // CONFIG_VP9_HIGHBITDEPTH + const uint32_t error = diff * diff; + EXPECT_EQ(0u, error) << "Error: 32x32 IDCT Comparison has error " + << error << " at index " << j; + } + } + } +}; + +GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(InvTrans32x32Test); + +TEST_P(InvTrans32x32Test, DISABLED_Speed) { RunInvTrans32x32SpeedTest(); } +TEST_P(InvTrans32x32Test, CompareReference) { CompareInvReference32x32(); } + using std::make_tuple; #if CONFIG_VP9_HIGHBITDEPTH @@ -334,6 +507,14 @@ INSTANTIATE_TEST_SUITE_P( VPX_BITS_8), make_tuple(&vpx_fdct32x32_rd_c, &vpx_idct32x32_1024_add_c, 1, VPX_BITS_8))); + +INSTANTIATE_TEST_SUITE_P( + C, InvTrans32x32Test, + ::testing::Values( + (make_tuple(&vpx_idct32x32_1024_add_c, &vpx_idct32x32_1024_add_c, 0, + VPX_BITS_8, 32, 6225)), + make_tuple(&vpx_idct32x32_135_add_c, &vpx_idct32x32_135_add_c, 0, + VPX_BITS_8, 16, 6255))); #endif // CONFIG_VP9_HIGHBITDEPTH #if HAVE_NEON && !CONFIG_EMULATE_HARDWARE @@ -352,6 +533,14 @@ INSTANTIATE_TEST_SUITE_P( &vpx_idct32x32_1024_add_sse2, 0, VPX_BITS_8), make_tuple(&vpx_fdct32x32_rd_sse2, &vpx_idct32x32_1024_add_sse2, 1, VPX_BITS_8))); + +INSTANTIATE_TEST_SUITE_P( + SSE2, InvTrans32x32Test, + ::testing::Values( + (make_tuple(&vpx_idct32x32_1024_add_c, &vpx_idct32x32_1024_add_sse2, 0, + VPX_BITS_8, 32, 6225)), + make_tuple(&vpx_idct32x32_135_add_c, &vpx_idct32x32_135_add_sse2, 0, + VPX_BITS_8, 16, 6225))); #endif // HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE #if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE @@ -377,6 +566,14 @@ INSTANTIATE_TEST_SUITE_P( &vpx_idct32x32_1024_add_sse2, 0, VPX_BITS_8), make_tuple(&vpx_fdct32x32_rd_avx2, &vpx_idct32x32_1024_add_sse2, 1, VPX_BITS_8))); + +INSTANTIATE_TEST_SUITE_P( + AVX2, InvTrans32x32Test, + ::testing::Values( + (make_tuple(&vpx_idct32x32_1024_add_c, &vpx_idct32x32_1024_add_avx2, 0, + VPX_BITS_8, 32, 6225)), + make_tuple(&vpx_idct32x32_135_add_c, &vpx_idct32x32_135_add_avx2, 0, + VPX_BITS_8, 16, 6225))); #endif // HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE #if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE diff --git a/vp9/common/vp9_idct.c b/vp9/common/vp9_idct.c index 69069042cc..71be0f310d 100644 --- a/vp9/common/vp9_idct.c +++ b/vp9/common/vp9_idct.c @@ -150,6 +150,7 @@ void vp9_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride, void vp9_idct16x16_add(const tran_low_t *input, uint8_t *dest, int stride, int eob) { + assert(((intptr_t)input) % 32 == 0); /* The calculation can be simplified if there are not many non-zero dct * coefficients. Use eobs to separate different cases. */ if (eob == 1) /* DC only DCT coefficient. */ @@ -164,6 +165,7 @@ void vp9_idct16x16_add(const tran_low_t *input, uint8_t *dest, int stride, void vp9_idct32x32_add(const tran_low_t *input, uint8_t *dest, int stride, int eob) { + assert(((intptr_t)input) % 32 == 0); if (eob == 1) vpx_idct32x32_1_add(input, dest, stride); else if (eob <= 34) diff --git a/vp9/decoder/vp9_decoder.c b/vp9/decoder/vp9_decoder.c index 7db8ed72d5..92cd91f1e3 100644 --- a/vp9/decoder/vp9_decoder.c +++ b/vp9/decoder/vp9_decoder.c @@ -87,7 +87,7 @@ void vp9_dec_alloc_row_mt_mem(RowMTWorkerData *row_mt_worker_data, row_mt_worker_data->num_sbs = num_sbs; for (plane = 0; plane < 3; ++plane) { CHECK_MEM_ERROR(cm, row_mt_worker_data->dqcoeff[plane], - vpx_memalign(16, dqcoeff_size)); + vpx_memalign(32, dqcoeff_size)); memset(row_mt_worker_data->dqcoeff[plane], 0, dqcoeff_size); CHECK_MEM_ERROR(cm, row_mt_worker_data->eob[plane], vpx_calloc(num_sbs << EOBS_PER_SB_LOG2, diff --git a/vp9/decoder/vp9_decoder.h b/vp9/decoder/vp9_decoder.h index b0ef83c73d..2e198d552e 100644 --- a/vp9/decoder/vp9_decoder.h +++ b/vp9/decoder/vp9_decoder.h @@ -54,7 +54,7 @@ typedef struct TileWorkerData { VP9LfSync *lf_sync; DECLARE_ALIGNED(16, MACROBLOCKD, xd); /* dqcoeff are shared by all the planes. So planes must be decoded serially */ - DECLARE_ALIGNED(16, tran_low_t, dqcoeff[32 * 32]); + DECLARE_ALIGNED(32, tran_low_t, dqcoeff[32 * 32]); DECLARE_ALIGNED(16, uint16_t, extend_and_predict_buf[80 * 2 * 80 * 2]); struct vpx_internal_error_info error_info; } TileWorkerData; diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index 207cda6310..67d3fb0e29 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -252,6 +252,7 @@ DSP_SRCS-yes += inv_txfm.h DSP_SRCS-yes += inv_txfm.c DSP_SRCS-$(HAVE_SSE2) += x86/inv_txfm_sse2.h DSP_SRCS-$(HAVE_SSE2) += x86/inv_txfm_sse2.c +DSP_SRCS-$(HAVE_AVX2) += x86/inv_txfm_avx2.c DSP_SRCS-$(HAVE_SSE2) += x86/inv_wht_sse2.asm DSP_SRCS-$(HAVE_SSSE3) += x86/inv_txfm_ssse3.h DSP_SRCS-$(HAVE_SSSE3) += x86/inv_txfm_ssse3.c diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index bde0115298..a872d17973 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -641,12 +641,12 @@ () specialize qw/vpx_idct8x8_64_add neon sse2 vsx/; specialize qw/vpx_idct8x8_12_add neon sse2 ssse3/; specialize qw/vpx_idct8x8_1_add neon sse2/; - specialize qw/vpx_idct16x16_256_add neon sse2 vsx/; + specialize qw/vpx_idct16x16_256_add neon sse2 avx2 vsx/; specialize qw/vpx_idct16x16_38_add neon sse2/; specialize qw/vpx_idct16x16_10_add neon sse2/; specialize qw/vpx_idct16x16_1_add neon sse2/; - specialize qw/vpx_idct32x32_1024_add neon sse2 vsx/; - specialize qw/vpx_idct32x32_135_add neon sse2 ssse3/; + specialize qw/vpx_idct32x32_1024_add neon sse2 avx2 vsx/; + specialize qw/vpx_idct32x32_135_add neon sse2 ssse3 avx2/; specialize qw/vpx_idct32x32_34_add neon sse2 ssse3/; specialize qw/vpx_idct32x32_1_add neon sse2/; specialize qw/vpx_iwht4x4_16_add sse2 vsx/; diff --git a/vpx_dsp/x86/inv_txfm_avx2.c b/vpx_dsp/x86/inv_txfm_avx2.c new file mode 100644 index 0000000000..752435d240 --- /dev/null +++ b/vpx_dsp/x86/inv_txfm_avx2.c @@ -0,0 +1,626 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include // AVX2 + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/txfm_common.h" + +#define PAIR256_SET_EPI16(a, b) \ + _mm256_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ + (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ + (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ + (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a)) + +static INLINE void idct_load16x16(const tran_low_t *input, __m256i *in, + int stride) { + int i; + // Load 16x16 values + for (i = 0; i < 16; i++) { +#if CONFIG_VP9_HIGHBITDEPTH + const __m128i in0 = _mm_loadu_si128((const __m128i *)(input + i * stride)); + const __m128i in1 = + _mm_loadu_si128((const __m128i *)((input + i * stride) + 4)); + const __m128i in2 = + _mm_loadu_si128((const __m128i *)((input + i * stride) + 8)); + const __m128i in3 = + _mm_loadu_si128((const __m128i *)((input + i * stride) + 12)); + const __m128i ls = _mm_packs_epi32(in0, in1); + const __m128i rs = _mm_packs_epi32(in2, in3); + in[i] = _mm256_inserti128_si256(_mm256_castsi128_si256(ls), rs, 1); +#else + in[i] = _mm256_load_si256((const __m256i *)(input + i * stride)); +#endif + } +} + +static INLINE __m256i dct_round_shift_avx2(__m256i in) { + const __m256i t = _mm256_add_epi32(in, _mm256_set1_epi32(DCT_CONST_ROUNDING)); + return _mm256_srai_epi32(t, DCT_CONST_BITS); +} + +static INLINE __m256i idct_madd_round_shift_avx2(__m256i *in, __m256i *cospi) { + const __m256i t = _mm256_madd_epi16(*in, *cospi); + return dct_round_shift_avx2(t); +} + +// Calculate the dot product between in0/1 and x and wrap to short. +static INLINE __m256i idct_calc_wraplow_avx2(__m256i *in0, __m256i *in1, + __m256i *x) { + const __m256i t0 = idct_madd_round_shift_avx2(in0, x); + const __m256i t1 = idct_madd_round_shift_avx2(in1, x); + return _mm256_packs_epi32(t0, t1); +} + +// Multiply elements by constants and add them together. +static INLINE void butterfly16(__m256i in0, __m256i in1, int c0, int c1, + __m256i *out0, __m256i *out1) { + __m256i cst0 = PAIR256_SET_EPI16(c0, -c1); + __m256i cst1 = PAIR256_SET_EPI16(c1, c0); + __m256i lo = _mm256_unpacklo_epi16(in0, in1); + __m256i hi = _mm256_unpackhi_epi16(in0, in1); + *out0 = idct_calc_wraplow_avx2(&lo, &hi, &cst0); + *out1 = idct_calc_wraplow_avx2(&lo, &hi, &cst1); +} + +static INLINE void idct16_16col(__m256i *in, __m256i *out) { + __m256i step1[16], step2[16]; + + // stage 2 + butterfly16(in[1], in[15], cospi_30_64, cospi_2_64, &step2[8], &step2[15]); + butterfly16(in[9], in[7], cospi_14_64, cospi_18_64, &step2[9], &step2[14]); + butterfly16(in[5], in[11], cospi_22_64, cospi_10_64, &step2[10], &step2[13]); + butterfly16(in[13], in[3], cospi_6_64, cospi_26_64, &step2[11], &step2[12]); + + // stage 3 + butterfly16(in[2], in[14], cospi_28_64, cospi_4_64, &step1[4], &step1[7]); + butterfly16(in[10], in[6], cospi_12_64, cospi_20_64, &step1[5], &step1[6]); + step1[8] = _mm256_add_epi16(step2[8], step2[9]); + step1[9] = _mm256_sub_epi16(step2[8], step2[9]); + step1[10] = _mm256_sub_epi16(step2[11], step2[10]); + step1[11] = _mm256_add_epi16(step2[10], step2[11]); + step1[12] = _mm256_add_epi16(step2[12], step2[13]); + step1[13] = _mm256_sub_epi16(step2[12], step2[13]); + step1[14] = _mm256_sub_epi16(step2[15], step2[14]); + step1[15] = _mm256_add_epi16(step2[14], step2[15]); + + // stage 4 + butterfly16(in[0], in[8], cospi_16_64, cospi_16_64, &step2[1], &step2[0]); + butterfly16(in[4], in[12], cospi_24_64, cospi_8_64, &step2[2], &step2[3]); + butterfly16(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9], + &step2[14]); + butterfly16(step1[10], step1[13], -cospi_8_64, -cospi_24_64, &step2[13], + &step2[10]); + step2[5] = _mm256_sub_epi16(step1[4], step1[5]); + step1[4] = _mm256_add_epi16(step1[4], step1[5]); + step2[6] = _mm256_sub_epi16(step1[7], step1[6]); + step1[7] = _mm256_add_epi16(step1[6], step1[7]); + step2[8] = step1[8]; + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + + // stage 5 + step1[0] = _mm256_add_epi16(step2[0], step2[3]); + step1[1] = _mm256_add_epi16(step2[1], step2[2]); + step1[2] = _mm256_sub_epi16(step2[1], step2[2]); + step1[3] = _mm256_sub_epi16(step2[0], step2[3]); + butterfly16(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], + &step1[6]); + step1[8] = _mm256_add_epi16(step2[8], step2[11]); + step1[9] = _mm256_add_epi16(step2[9], step2[10]); + step1[10] = _mm256_sub_epi16(step2[9], step2[10]); + step1[11] = _mm256_sub_epi16(step2[8], step2[11]); + step1[12] = _mm256_sub_epi16(step2[15], step2[12]); + step1[13] = _mm256_sub_epi16(step2[14], step2[13]); + step1[14] = _mm256_add_epi16(step2[14], step2[13]); + step1[15] = _mm256_add_epi16(step2[15], step2[12]); + + // stage 6 + step2[0] = _mm256_add_epi16(step1[0], step1[7]); + step2[1] = _mm256_add_epi16(step1[1], step1[6]); + step2[2] = _mm256_add_epi16(step1[2], step1[5]); + step2[3] = _mm256_add_epi16(step1[3], step1[4]); + step2[4] = _mm256_sub_epi16(step1[3], step1[4]); + step2[5] = _mm256_sub_epi16(step1[2], step1[5]); + step2[6] = _mm256_sub_epi16(step1[1], step1[6]); + step2[7] = _mm256_sub_epi16(step1[0], step1[7]); + butterfly16(step1[13], step1[10], cospi_16_64, cospi_16_64, &step2[10], + &step2[13]); + butterfly16(step1[12], step1[11], cospi_16_64, cospi_16_64, &step2[11], + &step2[12]); + + // stage 7 + out[0] = _mm256_add_epi16(step2[0], step1[15]); + out[1] = _mm256_add_epi16(step2[1], step1[14]); + out[2] = _mm256_add_epi16(step2[2], step2[13]); + out[3] = _mm256_add_epi16(step2[3], step2[12]); + out[4] = _mm256_add_epi16(step2[4], step2[11]); + out[5] = _mm256_add_epi16(step2[5], step2[10]); + out[6] = _mm256_add_epi16(step2[6], step1[9]); + out[7] = _mm256_add_epi16(step2[7], step1[8]); + out[8] = _mm256_sub_epi16(step2[7], step1[8]); + out[9] = _mm256_sub_epi16(step2[6], step1[9]); + out[10] = _mm256_sub_epi16(step2[5], step2[10]); + out[11] = _mm256_sub_epi16(step2[4], step2[11]); + out[12] = _mm256_sub_epi16(step2[3], step2[12]); + out[13] = _mm256_sub_epi16(step2[2], step2[13]); + out[14] = _mm256_sub_epi16(step2[1], step1[14]); + out[15] = _mm256_sub_epi16(step2[0], step1[15]); +} + +static INLINE void recon_and_store16(uint8_t *dest, __m256i in_x) { + const __m256i zero = _mm256_setzero_si256(); + __m256i d0 = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(dest))); + d0 = _mm256_permute4x64_epi64(d0, 0xd8); + d0 = _mm256_unpacklo_epi8(d0, zero); + d0 = _mm256_add_epi16(in_x, d0); + d0 = _mm256_packus_epi16( + d0, _mm256_castsi128_si256(_mm256_extractf128_si256(d0, 1))); + + _mm_storeu_si128((__m128i *)dest, _mm256_castsi256_si128(d0)); +} + +static INLINE void write_buffer_16x1(uint8_t *dest, __m256i in) { + const __m256i final_rounding = _mm256_set1_epi16(1 << 5); + __m256i out; + out = _mm256_adds_epi16(in, final_rounding); + out = _mm256_srai_epi16(out, 6); + recon_and_store16(dest, out); +} + +static INLINE void store_buffer_16x32(__m256i *in, uint8_t *dst, int stride) { + const __m256i final_rounding = _mm256_set1_epi16(1 << 5); + int j = 0; + while (j < 32) { + in[j] = _mm256_adds_epi16(in[j], final_rounding); + in[j + 1] = _mm256_adds_epi16(in[j + 1], final_rounding); + + in[j] = _mm256_srai_epi16(in[j], 6); + in[j + 1] = _mm256_srai_epi16(in[j + 1], 6); + + recon_and_store16(dst, in[j]); + dst += stride; + recon_and_store16(dst, in[j + 1]); + dst += stride; + j += 2; + } +} + +static INLINE void transpose2_8x8_avx2(__m256i *in, __m256i *out) { + int i; + __m256i t[16], u[16]; + // (1st, 2nd) ==> (lo, hi) + // (0, 1) ==> (0, 1) + // (2, 3) ==> (2, 3) + // (4, 5) ==> (4, 5) + // (6, 7) ==> (6, 7) + for (i = 0; i < 4; i++) { + t[2 * i] = _mm256_unpacklo_epi16(in[2 * i], in[2 * i + 1]); + t[2 * i + 1] = _mm256_unpackhi_epi16(in[2 * i], in[2 * i + 1]); + } + + // (1st, 2nd) ==> (lo, hi) + // (0, 2) ==> (0, 2) + // (1, 3) ==> (1, 3) + // (4, 6) ==> (4, 6) + // (5, 7) ==> (5, 7) + for (i = 0; i < 2; i++) { + u[i] = _mm256_unpacklo_epi32(t[i], t[i + 2]); + u[i + 2] = _mm256_unpackhi_epi32(t[i], t[i + 2]); + + u[i + 4] = _mm256_unpacklo_epi32(t[i + 4], t[i + 6]); + u[i + 6] = _mm256_unpackhi_epi32(t[i + 4], t[i + 6]); + } + + // (1st, 2nd) ==> (lo, hi) + // (0, 4) ==> (0, 1) + // (1, 5) ==> (4, 5) + // (2, 6) ==> (2, 3) + // (3, 7) ==> (6, 7) + for (i = 0; i < 2; i++) { + out[2 * i] = _mm256_unpacklo_epi64(u[2 * i], u[2 * i + 4]); + out[2 * i + 1] = _mm256_unpackhi_epi64(u[2 * i], u[2 * i + 4]); + + out[2 * i + 4] = _mm256_unpacklo_epi64(u[2 * i + 1], u[2 * i + 5]); + out[2 * i + 5] = _mm256_unpackhi_epi64(u[2 * i + 1], u[2 * i + 5]); + } +} + +static INLINE void transpose_16bit_16x16_avx2(__m256i *in, __m256i *out) { + __m256i t[16]; + +#define LOADL(idx) \ + t[idx] = _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx])); \ + t[idx] = _mm256_inserti128_si256( \ + t[idx], _mm_load_si128((__m128i const *)&in[(idx) + 8]), 1); + +#define LOADR(idx) \ + t[8 + (idx)] = \ + _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx] + 1)); \ + t[8 + (idx)] = _mm256_inserti128_si256( \ + t[8 + (idx)], _mm_load_si128((__m128i const *)&in[(idx) + 8] + 1), 1); + + // load left 8x16 + LOADL(0) + LOADL(1) + LOADL(2) + LOADL(3) + LOADL(4) + LOADL(5) + LOADL(6) + LOADL(7) + + // load right 8x16 + LOADR(0) + LOADR(1) + LOADR(2) + LOADR(3) + LOADR(4) + LOADR(5) + LOADR(6) + LOADR(7) + + // get the top 16x8 result + transpose2_8x8_avx2(t, out); + // get the bottom 16x8 result + transpose2_8x8_avx2(&t[8], &out[8]); +} + +void vpx_idct16x16_256_add_avx2(const tran_low_t *input, uint8_t *dest, + int stride) { + int i; + __m256i in[16]; + + // Load 16x16 values + idct_load16x16(input, in, 16); + + transpose_16bit_16x16_avx2(in, in); + idct16_16col(in, in); + + transpose_16bit_16x16_avx2(in, in); + idct16_16col(in, in); + + for (i = 0; i < 16; ++i) { + write_buffer_16x1(dest + i * stride, in[i]); + } +} + +// Only do addition and subtraction butterfly, size = 16, 32 +static INLINE void add_sub_butterfly_avx2(__m256i *in, __m256i *out, int size) { + int i = 0; + const int num = size >> 1; + const int bound = size - 1; + while (i < num) { + out[i] = _mm256_add_epi16(in[i], in[bound - i]); + out[bound - i] = _mm256_sub_epi16(in[i], in[bound - i]); + i++; + } +} + +// For each 16x32 block __m256i in[32], +// Input with index, 0, 4, 8, 12, 16, 20, 24, 28 +// output pixels: 0-7 in __m256i out[32] +static INLINE void idct32_1024_16x32_quarter_1(__m256i *in, __m256i *out) { + __m256i step1[8], step2[8]; + + // stage 3 + butterfly16(in[4], in[28], cospi_28_64, cospi_4_64, &step1[4], &step1[7]); + butterfly16(in[20], in[12], cospi_12_64, cospi_20_64, &step1[5], &step1[6]); + + // stage 4 + butterfly16(in[0], in[16], cospi_16_64, cospi_16_64, &step2[1], &step2[0]); + butterfly16(in[8], in[24], cospi_24_64, cospi_8_64, &step2[2], &step2[3]); + step2[4] = _mm256_add_epi16(step1[4], step1[5]); + step2[5] = _mm256_sub_epi16(step1[4], step1[5]); + step2[6] = _mm256_sub_epi16(step1[7], step1[6]); + step2[7] = _mm256_add_epi16(step1[7], step1[6]); + + // stage 5 + step1[0] = _mm256_add_epi16(step2[0], step2[3]); + step1[1] = _mm256_add_epi16(step2[1], step2[2]); + step1[2] = _mm256_sub_epi16(step2[1], step2[2]); + step1[3] = _mm256_sub_epi16(step2[0], step2[3]); + step1[4] = step2[4]; + butterfly16(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], + &step1[6]); + step1[7] = step2[7]; + + // stage 6 + out[0] = _mm256_add_epi16(step1[0], step1[7]); + out[1] = _mm256_add_epi16(step1[1], step1[6]); + out[2] = _mm256_add_epi16(step1[2], step1[5]); + out[3] = _mm256_add_epi16(step1[3], step1[4]); + out[4] = _mm256_sub_epi16(step1[3], step1[4]); + out[5] = _mm256_sub_epi16(step1[2], step1[5]); + out[6] = _mm256_sub_epi16(step1[1], step1[6]); + out[7] = _mm256_sub_epi16(step1[0], step1[7]); +} + +static INLINE void idct32_16x32_quarter_2_stage_4_to_6(__m256i *step1, + __m256i *out) { + __m256i step2[32]; + + // stage 4 + step2[8] = step1[8]; + step2[15] = step1[15]; + butterfly16(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9], + &step2[14]); + butterfly16(step1[13], step1[10], -cospi_8_64, cospi_24_64, &step2[10], + &step2[13]); + step2[11] = step1[11]; + step2[12] = step1[12]; + + // stage 5 + step1[8] = _mm256_add_epi16(step2[8], step2[11]); + step1[9] = _mm256_add_epi16(step2[9], step2[10]); + step1[10] = _mm256_sub_epi16(step2[9], step2[10]); + step1[11] = _mm256_sub_epi16(step2[8], step2[11]); + step1[12] = _mm256_sub_epi16(step2[15], step2[12]); + step1[13] = _mm256_sub_epi16(step2[14], step2[13]); + step1[14] = _mm256_add_epi16(step2[14], step2[13]); + step1[15] = _mm256_add_epi16(step2[15], step2[12]); + + // stage 6 + out[8] = step1[8]; + out[9] = step1[9]; + butterfly16(step1[13], step1[10], cospi_16_64, cospi_16_64, &out[10], + &out[13]); + butterfly16(step1[12], step1[11], cospi_16_64, cospi_16_64, &out[11], + &out[12]); + out[14] = step1[14]; + out[15] = step1[15]; +} + +// For each 16x32 block __m256i in[32], +// Input with index, 2, 6, 10, 14, 18, 22, 26, 30 +// output pixels: 8-15 in __m256i out[32] +static INLINE void idct32_1024_16x32_quarter_2(__m256i *in, __m256i *out) { + __m256i step1[16], step2[16]; + + // stage 2 + butterfly16(in[2], in[30], cospi_30_64, cospi_2_64, &step2[8], &step2[15]); + butterfly16(in[18], in[14], cospi_14_64, cospi_18_64, &step2[9], &step2[14]); + butterfly16(in[10], in[22], cospi_22_64, cospi_10_64, &step2[10], &step2[13]); + butterfly16(in[26], in[6], cospi_6_64, cospi_26_64, &step2[11], &step2[12]); + + // stage 3 + step1[8] = _mm256_add_epi16(step2[8], step2[9]); + step1[9] = _mm256_sub_epi16(step2[8], step2[9]); + step1[10] = _mm256_sub_epi16(step2[11], step2[10]); + step1[11] = _mm256_add_epi16(step2[11], step2[10]); + step1[12] = _mm256_add_epi16(step2[12], step2[13]); + step1[13] = _mm256_sub_epi16(step2[12], step2[13]); + step1[14] = _mm256_sub_epi16(step2[15], step2[14]); + step1[15] = _mm256_add_epi16(step2[15], step2[14]); + + idct32_16x32_quarter_2_stage_4_to_6(step1, out); +} + +static INLINE void idct32_16x32_quarter_3_4_stage_4_to_7(__m256i *step1, + __m256i *out) { + __m256i step2[32]; + + // stage 4 + step2[16] = _mm256_add_epi16(step1[16], step1[19]); + step2[17] = _mm256_add_epi16(step1[17], step1[18]); + step2[18] = _mm256_sub_epi16(step1[17], step1[18]); + step2[19] = _mm256_sub_epi16(step1[16], step1[19]); + step2[20] = _mm256_sub_epi16(step1[23], step1[20]); + step2[21] = _mm256_sub_epi16(step1[22], step1[21]); + step2[22] = _mm256_add_epi16(step1[22], step1[21]); + step2[23] = _mm256_add_epi16(step1[23], step1[20]); + + step2[24] = _mm256_add_epi16(step1[24], step1[27]); + step2[25] = _mm256_add_epi16(step1[25], step1[26]); + step2[26] = _mm256_sub_epi16(step1[25], step1[26]); + step2[27] = _mm256_sub_epi16(step1[24], step1[27]); + step2[28] = _mm256_sub_epi16(step1[31], step1[28]); + step2[29] = _mm256_sub_epi16(step1[30], step1[29]); + step2[30] = _mm256_add_epi16(step1[29], step1[30]); + step2[31] = _mm256_add_epi16(step1[28], step1[31]); + + // stage 5 + step1[16] = step2[16]; + step1[17] = step2[17]; + butterfly16(step2[29], step2[18], cospi_24_64, cospi_8_64, &step1[18], + &step1[29]); + butterfly16(step2[28], step2[19], cospi_24_64, cospi_8_64, &step1[19], + &step1[28]); + butterfly16(step2[27], step2[20], -cospi_8_64, cospi_24_64, &step1[20], + &step1[27]); + butterfly16(step2[26], step2[21], -cospi_8_64, cospi_24_64, &step1[21], + &step1[26]); + step1[22] = step2[22]; + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[25] = step2[25]; + step1[30] = step2[30]; + step1[31] = step2[31]; + + // stage 6 + out[16] = _mm256_add_epi16(step1[16], step1[23]); + out[17] = _mm256_add_epi16(step1[17], step1[22]); + out[18] = _mm256_add_epi16(step1[18], step1[21]); + out[19] = _mm256_add_epi16(step1[19], step1[20]); + step2[20] = _mm256_sub_epi16(step1[19], step1[20]); + step2[21] = _mm256_sub_epi16(step1[18], step1[21]); + step2[22] = _mm256_sub_epi16(step1[17], step1[22]); + step2[23] = _mm256_sub_epi16(step1[16], step1[23]); + + step2[24] = _mm256_sub_epi16(step1[31], step1[24]); + step2[25] = _mm256_sub_epi16(step1[30], step1[25]); + step2[26] = _mm256_sub_epi16(step1[29], step1[26]); + step2[27] = _mm256_sub_epi16(step1[28], step1[27]); + out[28] = _mm256_add_epi16(step1[27], step1[28]); + out[29] = _mm256_add_epi16(step1[26], step1[29]); + out[30] = _mm256_add_epi16(step1[25], step1[30]); + out[31] = _mm256_add_epi16(step1[24], step1[31]); + + // stage 7 + butterfly16(step2[27], step2[20], cospi_16_64, cospi_16_64, &out[20], + &out[27]); + butterfly16(step2[26], step2[21], cospi_16_64, cospi_16_64, &out[21], + &out[26]); + butterfly16(step2[25], step2[22], cospi_16_64, cospi_16_64, &out[22], + &out[25]); + butterfly16(step2[24], step2[23], cospi_16_64, cospi_16_64, &out[23], + &out[24]); +} + +static INLINE void idct32_1024_16x32_quarter_1_2(__m256i *in, __m256i *out) { + __m256i temp[16]; + + // For each 16x32 block __m256i in[32], + // Input with index, 0, 4, 8, 12, 16, 20, 24, 28 + // output pixels: 0-7 in __m256i out[32] + idct32_1024_16x32_quarter_1(in, temp); + + // Input with index, 2, 6, 10, 14, 18, 22, 26, 30 + // output pixels: 8-15 in __m256i out[32] + idct32_1024_16x32_quarter_2(in, temp); + + // stage 7 + add_sub_butterfly_avx2(temp, out, 16); +} + +// For each 16x32 block __m256i in[32], +// Input with odd index, +// 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 +// output pixels: 16-23, 24-31 in __m256i out[32] +static INLINE void idct32_1024_16x32_quarter_3_4(__m256i *in, __m256i *out) { + __m256i step1[32], step2[32]; + + // stage 1 + butterfly16(in[1], in[31], cospi_31_64, cospi_1_64, &step1[16], &step1[31]); + butterfly16(in[17], in[15], cospi_15_64, cospi_17_64, &step1[17], &step1[30]); + butterfly16(in[9], in[23], cospi_23_64, cospi_9_64, &step1[18], &step1[29]); + butterfly16(in[25], in[7], cospi_7_64, cospi_25_64, &step1[19], &step1[28]); + + butterfly16(in[5], in[27], cospi_27_64, cospi_5_64, &step1[20], &step1[27]); + butterfly16(in[21], in[11], cospi_11_64, cospi_21_64, &step1[21], &step1[26]); + + butterfly16(in[13], in[19], cospi_19_64, cospi_13_64, &step1[22], &step1[25]); + butterfly16(in[29], in[3], cospi_3_64, cospi_29_64, &step1[23], &step1[24]); + + // stage 2 + step2[16] = _mm256_add_epi16(step1[16], step1[17]); + step2[17] = _mm256_sub_epi16(step1[16], step1[17]); + step2[18] = _mm256_sub_epi16(step1[19], step1[18]); + step2[19] = _mm256_add_epi16(step1[19], step1[18]); + step2[20] = _mm256_add_epi16(step1[20], step1[21]); + step2[21] = _mm256_sub_epi16(step1[20], step1[21]); + step2[22] = _mm256_sub_epi16(step1[23], step1[22]); + step2[23] = _mm256_add_epi16(step1[23], step1[22]); + + step2[24] = _mm256_add_epi16(step1[24], step1[25]); + step2[25] = _mm256_sub_epi16(step1[24], step1[25]); + step2[26] = _mm256_sub_epi16(step1[27], step1[26]); + step2[27] = _mm256_add_epi16(step1[27], step1[26]); + step2[28] = _mm256_add_epi16(step1[28], step1[29]); + step2[29] = _mm256_sub_epi16(step1[28], step1[29]); + step2[30] = _mm256_sub_epi16(step1[31], step1[30]); + step2[31] = _mm256_add_epi16(step1[31], step1[30]); + + // stage 3 + step1[16] = step2[16]; + step1[31] = step2[31]; + butterfly16(step2[30], step2[17], cospi_28_64, cospi_4_64, &step1[17], + &step1[30]); + butterfly16(step2[29], step2[18], -cospi_4_64, cospi_28_64, &step1[18], + &step1[29]); + step1[19] = step2[19]; + step1[20] = step2[20]; + butterfly16(step2[26], step2[21], cospi_12_64, cospi_20_64, &step1[21], + &step1[26]); + butterfly16(step2[25], step2[22], -cospi_20_64, cospi_12_64, &step1[22], + &step1[25]); + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[27] = step2[27]; + step1[28] = step2[28]; + + idct32_16x32_quarter_3_4_stage_4_to_7(step1, out); +} + +static INLINE void idct32_1024_16x32(__m256i *in, __m256i *out) { + __m256i temp[32]; + + // For each 16x32 block __m256i in[32], + // Input with index, 0, 4, 8, 12, 16, 20, 24, 28 + // output pixels: 0-7 in __m256i out[32] + // AND + // Input with index, 2, 6, 10, 14, 18, 22, 26, 30 + // output pixels: 8-15 in __m256i out[32] + idct32_1024_16x32_quarter_1_2(in, temp); + + // For each 16x32 block __m256i in[32], + // Input with odd index, + // 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 + // output pixels: 16-23, 24-31 in __m256i out[32] + idct32_1024_16x32_quarter_3_4(in, temp); + + // final stage + add_sub_butterfly_avx2(temp, out, 32); +} + +void vpx_idct32x32_1024_add_avx2(const tran_low_t *input, uint8_t *dest, + int stride) { + __m256i l[32], r[32], out[32], *in; + int i; + + in = l; + + for (i = 0; i < 2; i++) { + idct_load16x16(input, in, 32); + transpose_16bit_16x16_avx2(in, in); + + idct_load16x16(input + 16, in + 16, 32); + transpose_16bit_16x16_avx2(in + 16, in + 16); + idct32_1024_16x32(in, in); + + in = r; + input += 32 << 4; + } + + for (i = 0; i < 32; i += 16) { + transpose_16bit_16x16_avx2(l + i, out); + transpose_16bit_16x16_avx2(r + i, out + 16); + idct32_1024_16x32(out, out); + + store_buffer_16x32(out, dest, stride); + dest += 16; + } +} + +// Case when only upper-left 16x16 has non-zero coeff +void vpx_idct32x32_135_add_avx2(const tran_low_t *input, uint8_t *dest, + int stride) { + __m256i in[32], io[32], out[32]; + int i; + + for (i = 16; i < 32; i++) { + in[i] = _mm256_setzero_si256(); + } + + // rows + idct_load16x16(input, in, 32); + transpose_16bit_16x16_avx2(in, in); + idct32_1024_16x32(in, io); + + // columns + for (i = 0; i < 32; i += 16) { + transpose_16bit_16x16_avx2(io + i, in); + idct32_1024_16x32(in, out); + + store_buffer_16x32(out, dest, stride); + dest += 16; + } +} From c85b7331a5960bbb4074658641482bb40c902354 Mon Sep 17 00:00:00 2001 From: James Zern Date: Thu, 4 May 2023 19:22:17 -0700 Subject: [PATCH 0782/1000] macros_msa.h: clear -Wshadow warnings Bug: webm:1793 Change-Id: Ib2e3bd3c52632cdd4410cb2c54d69750e64e5201 --- vpx_dsp/mips/macros_msa.h | 56 +++++++++++++++++++-------------------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/vpx_dsp/mips/macros_msa.h b/vpx_dsp/mips/macros_msa.h index d54ce53684..53462b59f4 100644 --- a/vpx_dsp/mips/macros_msa.h +++ b/vpx_dsp/mips/macros_msa.h @@ -774,16 +774,16 @@ Details : 4 signed word elements of 'in' vector are added together and the resulting integer sum is returned */ -#define HADD_SW_S32(in) \ - ({ \ - v2i64 res0_m, res1_m; \ - int32_t sum_m; \ - \ - res0_m = __msa_hadd_s_d((v4i32)in, (v4i32)in); \ - res1_m = __msa_splati_d(res0_m, 1); \ - res0_m = res0_m + res1_m; \ - sum_m = __msa_copy_s_w((v4i32)res0_m, 0); \ - sum_m; \ +#define HADD_SW_S32(in) \ + ({ \ + v2i64 hadd_sw_s32_res0_m, hadd_sw_s32_res1_m; \ + int32_t hadd_sw_s32_sum_m; \ + \ + hadd_sw_s32_res0_m = __msa_hadd_s_d((v4i32)in, (v4i32)in); \ + hadd_sw_s32_res1_m = __msa_splati_d(hadd_sw_s32_res0_m, 1); \ + hadd_sw_s32_res0_m = hadd_sw_s32_res0_m + hadd_sw_s32_res1_m; \ + hadd_sw_s32_sum_m = __msa_copy_s_w((v4i32)hadd_sw_s32_res0_m, 0); \ + hadd_sw_s32_sum_m; \ }) /* Description : Horizontal addition of 4 unsigned word elements @@ -793,16 +793,16 @@ Details : 4 unsigned word elements of 'in' vector are added together and the resulting integer sum is returned */ -#define HADD_UW_U32(in) \ - ({ \ - v2u64 res0_m, res1_m; \ - uint32_t sum_m; \ - \ - res0_m = __msa_hadd_u_d((v4u32)in, (v4u32)in); \ - res1_m = (v2u64)__msa_splati_d((v2i64)res0_m, 1); \ - res0_m += res1_m; \ - sum_m = __msa_copy_u_w((v4i32)res0_m, 0); \ - sum_m; \ +#define HADD_UW_U32(in) \ + ({ \ + v2u64 hadd_uw_u32_res0_m, hadd_uw_u32_res1_m; \ + uint32_t hadd_uw_u32_sum_m; \ + \ + hadd_uw_u32_res0_m = __msa_hadd_u_d((v4u32)in, (v4u32)in); \ + hadd_uw_u32_res1_m = (v2u64)__msa_splati_d((v2i64)hadd_uw_u32_res0_m, 1); \ + hadd_uw_u32_res0_m += hadd_uw_u32_res1_m; \ + hadd_uw_u32_sum_m = __msa_copy_u_w((v4i32)hadd_uw_u32_res0_m, 0); \ + hadd_uw_u32_sum_m; \ }) /* Description : Horizontal addition of 8 unsigned halfword elements @@ -812,14 +812,14 @@ Details : 8 unsigned halfword elements of 'in' vector are added together and the resulting integer sum is returned */ -#define HADD_UH_U32(in) \ - ({ \ - v4u32 res_m; \ - uint32_t sum_m; \ - \ - res_m = __msa_hadd_u_w((v8u16)in, (v8u16)in); \ - sum_m = HADD_UW_U32(res_m); \ - sum_m; \ +#define HADD_UH_U32(in) \ + ({ \ + v4u32 hadd_uh_u32_res_m; \ + uint32_t hadd_uh_u32_sum_m; \ + \ + hadd_uh_u32_res_m = __msa_hadd_u_w((v8u16)in, (v8u16)in); \ + hadd_uh_u32_sum_m = HADD_UW_U32(hadd_uh_u32_res_m); \ + hadd_uh_u32_sum_m; \ }) /* Description : Horizontal addition of unsigned byte vector elements From 28c5d70650356bf0c60c80d387608047bfdf6e09 Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 5 May 2023 09:44:23 -0700 Subject: [PATCH 0783/1000] vp9_encoder: clear -Wshadow warning with --enable-experimental --enable-rate-ctrl Bug: webm:1793 Change-Id: I9ca664538bcf0c2aca8aea73283bbb0232eb86e9 --- vp9/encoder/vp9_encoder.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 662ec24b83..5241f5b489 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -4443,10 +4443,13 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest const int orig_rc_max_frame_bandwidth = rc->max_frame_bandwidth; #if CONFIG_RATE_CTRL - const FRAME_UPDATE_TYPE update_type = - cpi->twopass.gf_group.update_type[cpi->twopass.gf_group.index]; - const ENCODE_FRAME_TYPE frame_type = get_encode_frame_type(update_type); - RATE_QSTEP_MODEL *rq_model = &cpi->rq_model[frame_type]; + RATE_QSTEP_MODEL *rq_model; + { + const FRAME_UPDATE_TYPE update_type = + cpi->twopass.gf_group.update_type[cpi->twopass.gf_group.index]; + const ENCODE_FRAME_TYPE frame_type = get_encode_frame_type(update_type); + rq_model = &cpi->rq_model[frame_type]; + } init_rq_history(rq_history); #endif // CONFIG_RATE_CTRL From b030d033b81378a8fe8b5f34eaee375acf323c58 Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 5 May 2023 11:01:50 -0700 Subject: [PATCH 0784/1000] vp9,encoder_set_config: set setjmp flag after setjmp() Change-Id: I6858e574d24aaff64f725404706f58e04e43717d --- vp9/vp9_cx_iface.c | 1 + 1 file changed, 1 insertion(+) diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index e264ae9bd9..298112b084 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -813,6 +813,7 @@ static vpx_codec_err_t encoder_set_config(vpx_codec_alg_priv_t *ctx, assert(codec_err != VPX_CODEC_OK); return codec_err; } + ctx->cpi->common.error.setjmp = 1; ctx->cfg = *cfg; set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg); From eb7014c80c3219c15ee3e2abd3543588f96abd63 Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 5 May 2023 11:03:19 -0700 Subject: [PATCH 0785/1000] vp9_decodeframe,tile_worker_hook: relocate setjmp=1 after the call to setjmp(); this is more correct and consistent with other code. Change-Id: I6d9bb8daad6a959bfe4f25484f9d6664b99da19e --- vp9/decoder/vp9_decodeframe.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c index 6eae41fcfb..73420bacfb 100644 --- a/vp9/decoder/vp9_decodeframe.c +++ b/vp9/decoder/vp9_decodeframe.c @@ -2192,8 +2192,6 @@ static int tile_worker_hook(void *arg1, void *arg2) { volatile int mi_row = 0; volatile int n = tile_data->buf_start; - tile_data->error_info.setjmp = 1; - if (setjmp(tile_data->error_info.jmp)) { tile_data->error_info.setjmp = 0; tile_data->xd.corrupted = 1; @@ -2206,6 +2204,7 @@ static int tile_worker_hook(void *arg1, void *arg2) { } return 0; } + tile_data->error_info.setjmp = 1; tile_data->xd.corrupted = 0; From 851a76ff65905ea8961d8221a358ffb91c1f5f98 Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 5 May 2023 11:04:25 -0700 Subject: [PATCH 0786/1000] vp8_[cd]x_iface: clear setjmp flag on function exit in vp8e_encode, also move setting the setjmp() call closer to setting the flag. Change-Id: Ie165d4100b84776f9c34eddcf64657bd78cce4f5 --- vp8/vp8_cx_iface.c | 16 ++++++++-------- vp8/vp8_dx_iface.c | 2 ++ 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c index a9d1f8005d..0821eef026 100644 --- a/vp8/vp8_cx_iface.c +++ b/vp8/vp8_cx_iface.c @@ -911,12 +911,6 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx, } } - if (setjmp(ctx->cpi->common.error.jmp)) { - ctx->cpi->common.error.setjmp = 0; - vpx_clear_system_state(); - return VPX_CODEC_CORRUPT_FRAME; - } - /* Initialize the encoder instance on the first frame*/ if (!res && ctx->cpi) { unsigned int lib_flags; @@ -927,6 +921,13 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx, unsigned char *cx_data_end; int comp_data_state = 0; + if (setjmp(ctx->cpi->common.error.jmp)) { + ctx->cpi->common.error.setjmp = 0; + vpx_clear_system_state(); + return VPX_CODEC_CORRUPT_FRAME; + } + ctx->cpi->common.error.setjmp = 1; + /* Set up internal flags */ if (ctx->base.init_flags & VPX_CODEC_USE_PSNR) { ((VP8_COMP *)ctx->cpi)->b_calculate_psnr = 1; @@ -962,8 +963,6 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx, cx_data_end = ctx->cx_data + cx_data_sz; lib_flags = 0; - ctx->cpi->common.error.setjmp = 1; - while (cx_data_sz >= ctx->cx_data_sz / 2) { comp_data_state = vp8_get_compressed_data( ctx->cpi, &lib_flags, &size, cx_data, cx_data_end, &dst_time_stamp, @@ -1059,6 +1058,7 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx, } } } + ctx->cpi->common.error.setjmp = 0; } return res; diff --git a/vp8/vp8_dx_iface.c b/vp8/vp8_dx_iface.c index 55a77ba7e5..fdc0b35dd4 100644 --- a/vp8/vp8_dx_iface.c +++ b/vp8/vp8_dx_iface.c @@ -310,6 +310,7 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx, VP8D_COMP *pbi = ctx->yv12_frame_buffers.pbi[0]; VP8_COMMON *const pc = &pbi->common; if (setjmp(pbi->common.error.jmp)) { + pbi->common.error.setjmp = 0; vp8_remove_decoder_instances(fb); vp8_zero(fb->pbi); vpx_clear_system_state(); @@ -494,6 +495,7 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx, /* get ready for the next series of fragments */ ctx->fragments.count = 0; + pbi->common.error.setjmp = 0; } return res; From 497f246d2925a2866644e542df158b4421ebab0d Mon Sep 17 00:00:00 2001 From: James Zern Date: Thu, 4 May 2023 19:23:20 -0700 Subject: [PATCH 0787/1000] sixtap_filter_msa.c: clear -Wshadow warnings Bug: webm:1793 Change-Id: I5f9c09f31b06fecc123c6a9d01f5fbed39142356 --- vp8/common/mips/msa/sixtap_filter_msa.c | 181 ++++++++++++++---------- 1 file changed, 107 insertions(+), 74 deletions(-) diff --git a/vp8/common/mips/msa/sixtap_filter_msa.c b/vp8/common/mips/msa/sixtap_filter_msa.c index b0affcff01..3a1bb7cd57 100644 --- a/vp8/common/mips/msa/sixtap_filter_msa.c +++ b/vp8/common/mips/msa/sixtap_filter_msa.c @@ -35,101 +35,134 @@ static const uint8_t vp8_mc_filt_mask_arr[16 * 3] = { #define HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2, filt_h0, filt_h1, \ filt_h2) \ ({ \ - v16i8 vec0_m, vec1_m, vec2_m; \ - v8i16 hz_out_m; \ + v16i8 _6tap_vec0_m, _6tap_vec1_m, _6tap_vec2_m; \ + v8i16 _6tap_out_m; \ \ VSHF_B3_SB(src0, src1, src0, src1, src0, src1, mask0, mask1, mask2, \ - vec0_m, vec1_m, vec2_m); \ - hz_out_m = \ - DPADD_SH3_SH(vec0_m, vec1_m, vec2_m, filt_h0, filt_h1, filt_h2); \ + _6tap_vec0_m, _6tap_vec1_m, _6tap_vec2_m); \ + _6tap_out_m = DPADD_SH3_SH(_6tap_vec0_m, _6tap_vec1_m, _6tap_vec2_m, \ + filt_h0, filt_h1, filt_h2); \ \ - hz_out_m = __msa_srari_h(hz_out_m, VP8_FILTER_SHIFT); \ - hz_out_m = __msa_sat_s_h(hz_out_m, 7); \ + _6tap_out_m = __msa_srari_h(_6tap_out_m, VP8_FILTER_SHIFT); \ + _6tap_out_m = __msa_sat_s_h(_6tap_out_m, 7); \ \ - hz_out_m; \ + _6tap_out_m; \ }) #define HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ mask2, filt0, filt1, filt2, out0, out1) \ { \ - v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m; \ + v16i8 _6tap_4wid_vec0_m, _6tap_4wid_vec1_m, _6tap_4wid_vec2_m, \ + _6tap_4wid_vec3_m, _6tap_4wid_vec4_m, _6tap_4wid_vec5_m; \ \ - VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m); \ - DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1); \ - VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m); \ - DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1); \ - VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m); \ - DPADD_SB2_SH(vec4_m, vec5_m, filt2, filt2, out0, out1); \ + VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, _6tap_4wid_vec0_m, \ + _6tap_4wid_vec1_m); \ + DOTP_SB2_SH(_6tap_4wid_vec0_m, _6tap_4wid_vec1_m, filt0, filt0, out0, \ + out1); \ + VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, _6tap_4wid_vec2_m, \ + _6tap_4wid_vec3_m); \ + DPADD_SB2_SH(_6tap_4wid_vec2_m, _6tap_4wid_vec3_m, filt1, filt1, out0, \ + out1); \ + VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, _6tap_4wid_vec4_m, \ + _6tap_4wid_vec5_m); \ + DPADD_SB2_SH(_6tap_4wid_vec4_m, _6tap_4wid_vec5_m, filt2, filt2, out0, \ + out1); \ } -#define HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ - mask2, filt0, filt1, filt2, out0, out1, \ - out2, out3) \ - { \ - v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ - \ - VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m); \ - VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m); \ - DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0, \ - out0, out1, out2, out3); \ - VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m); \ - VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m); \ - VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec4_m, vec5_m); \ - VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec6_m, vec7_m); \ - DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1, \ - out0, out1, out2, out3); \ - DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt2, filt2, filt2, filt2, \ - out0, out1, out2, out3); \ +#define HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ + mask2, filt0, filt1, filt2, out0, out1, \ + out2, out3) \ + { \ + v16i8 _6tap_8wid_vec0_m, _6tap_8wid_vec1_m, _6tap_8wid_vec2_m, \ + _6tap_8wid_vec3_m, _6tap_8wid_vec4_m, _6tap_8wid_vec5_m, \ + _6tap_8wid_vec6_m, _6tap_8wid_vec7_m; \ + \ + VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, _6tap_8wid_vec0_m, \ + _6tap_8wid_vec1_m); \ + VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, _6tap_8wid_vec2_m, \ + _6tap_8wid_vec3_m); \ + DOTP_SB4_SH(_6tap_8wid_vec0_m, _6tap_8wid_vec1_m, _6tap_8wid_vec2_m, \ + _6tap_8wid_vec3_m, filt0, filt0, filt0, filt0, out0, out1, \ + out2, out3); \ + VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, _6tap_8wid_vec0_m, \ + _6tap_8wid_vec1_m); \ + VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, _6tap_8wid_vec2_m, \ + _6tap_8wid_vec3_m); \ + VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, _6tap_8wid_vec4_m, \ + _6tap_8wid_vec5_m); \ + VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, _6tap_8wid_vec6_m, \ + _6tap_8wid_vec7_m); \ + DPADD_SB4_SH(_6tap_8wid_vec0_m, _6tap_8wid_vec1_m, _6tap_8wid_vec2_m, \ + _6tap_8wid_vec3_m, filt1, filt1, filt1, filt1, out0, out1, \ + out2, out3); \ + DPADD_SB4_SH(_6tap_8wid_vec4_m, _6tap_8wid_vec5_m, _6tap_8wid_vec6_m, \ + _6tap_8wid_vec7_m, filt2, filt2, filt2, filt2, out0, out1, \ + out2, out3); \ } -#define FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1) \ - ({ \ - v8i16 tmp0; \ - \ - tmp0 = __msa_dotp_s_h((v16i8)vec0, (v16i8)filt0); \ - tmp0 = __msa_dpadd_s_h(tmp0, (v16i8)vec1, (v16i8)filt1); \ - \ - tmp0; \ - }) - -#define HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_h0, filt_h1) \ +#define FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1) \ ({ \ - v16i8 vec0_m, vec1_m; \ - v8i16 hz_out_m; \ - \ - VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0_m, vec1_m); \ - hz_out_m = FILT_4TAP_DPADD_S_H(vec0_m, vec1_m, filt_h0, filt_h1); \ + v8i16 _4tap_dpadd_tmp0; \ \ - hz_out_m = __msa_srari_h(hz_out_m, VP8_FILTER_SHIFT); \ - hz_out_m = __msa_sat_s_h(hz_out_m, 7); \ + _4tap_dpadd_tmp0 = __msa_dotp_s_h((v16i8)vec0, (v16i8)filt0); \ + _4tap_dpadd_tmp0 = \ + __msa_dpadd_s_h(_4tap_dpadd_tmp0, (v16i8)vec1, (v16i8)filt1); \ \ - hz_out_m; \ + _4tap_dpadd_tmp0; \ }) -#define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ - filt0, filt1, out0, out1) \ - { \ - v16i8 vec0_m, vec1_m, vec2_m, vec3_m; \ - \ - VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m); \ - DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1); \ - VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m); \ - DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1); \ +#define HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_h0, filt_h1) \ + ({ \ + v16i8 _4tap_vec0_m, _4tap_vec1_m; \ + v8i16 _4tap_out_m; \ + \ + VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, _4tap_vec0_m, \ + _4tap_vec1_m); \ + _4tap_out_m = \ + FILT_4TAP_DPADD_S_H(_4tap_vec0_m, _4tap_vec1_m, filt_h0, filt_h1); \ + \ + _4tap_out_m = __msa_srari_h(_4tap_out_m, VP8_FILTER_SHIFT); \ + _4tap_out_m = __msa_sat_s_h(_4tap_out_m, 7); \ + \ + _4tap_out_m; \ + }) + +#define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ + filt0, filt1, out0, out1) \ + { \ + v16i8 _4tap_4wid_vec0_m, _4tap_4wid_vec1_m, _4tap_4wid_vec2_m, \ + _4tap_4wid_vec3_m; \ + \ + VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, _4tap_4wid_vec0_m, \ + _4tap_4wid_vec1_m); \ + DOTP_SB2_SH(_4tap_4wid_vec0_m, _4tap_4wid_vec1_m, filt0, filt0, out0, \ + out1); \ + VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, _4tap_4wid_vec2_m, \ + _4tap_4wid_vec3_m); \ + DPADD_SB2_SH(_4tap_4wid_vec2_m, _4tap_4wid_vec3_m, filt1, filt1, out0, \ + out1); \ } -#define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ - filt0, filt1, out0, out1, out2, out3) \ - { \ - v16i8 vec0_m, vec1_m, vec2_m, vec3_m; \ - \ - VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m); \ - VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m); \ - DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0, \ - out0, out1, out2, out3); \ - VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m); \ - VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m); \ - DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1, \ - out0, out1, out2, out3); \ +#define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ + filt0, filt1, out0, out1, out2, out3) \ + { \ + v16i8 _4tap_8wid_vec0_m, _4tap_8wid_vec1_m, _4tap_8wid_vec2_m, \ + _4tap_8wid_vec3_m; \ + \ + VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, _4tap_8wid_vec0_m, \ + _4tap_8wid_vec1_m); \ + VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, _4tap_8wid_vec2_m, \ + _4tap_8wid_vec3_m); \ + DOTP_SB4_SH(_4tap_8wid_vec0_m, _4tap_8wid_vec1_m, _4tap_8wid_vec2_m, \ + _4tap_8wid_vec3_m, filt0, filt0, filt0, filt0, out0, out1, \ + out2, out3); \ + VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, _4tap_8wid_vec0_m, \ + _4tap_8wid_vec1_m); \ + VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, _4tap_8wid_vec2_m, \ + _4tap_8wid_vec3_m); \ + DPADD_SB4_SH(_4tap_8wid_vec0_m, _4tap_8wid_vec1_m, _4tap_8wid_vec2_m, \ + _4tap_8wid_vec3_m, filt1, filt1, filt1, filt1, out0, out1, \ + out2, out3); \ } static void common_hz_6t_4x4_msa(uint8_t *RESTRICT src, int32_t src_stride, From 5636f098b38e0c013f802c1b411e2cf54d32b183 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Fri, 5 May 2023 13:40:50 -0400 Subject: [PATCH 0788/1000] Set setjmp flag in VP9 RTC rate control library Change-Id: Ic5ec8dc7d9637091d4137a47d793cf29e76fdc45 --- vp9/ratectrl_rtc.cc | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/vp9/ratectrl_rtc.cc b/vp9/ratectrl_rtc.cc index 29033d4ba5..d92b095714 100644 --- a/vp9/ratectrl_rtc.cc +++ b/vp9/ratectrl_rtc.cc @@ -140,7 +140,16 @@ bool VP9RateControlRTC::UpdateRateControl( cpi_->framerate = rc_cfg.framerate; cpi_->svc.number_spatial_layers = rc_cfg.ss_number_layers; cpi_->svc.number_temporal_layers = rc_cfg.ts_number_layers; + vp9_set_mb_mi(cm, cm->width, cm->height); + + if (setjmp(cpi_->common.error.jmp)) { + cpi_->common.error.setjmp = 0; + vpx_clear_system_state(); + return false; + } + cpi_->common.error.setjmp = 1; + for (int tl = 0; tl < cpi_->svc.number_temporal_layers; ++tl) { oxcf->ts_rate_decimator[tl] = rc_cfg.ts_rate_decimator[tl]; } @@ -168,6 +177,8 @@ bool VP9RateControlRTC::UpdateRateControl( (int)cpi_->oxcf.target_bandwidth); } vp9_check_reset_rc_flag(cpi_); + + cpi_->common.error.setjmp = 0; return true; } From 3d57fb69afbd9e299dc18620f23ec9deb0adfb54 Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 5 May 2023 18:56:59 -0700 Subject: [PATCH 0789/1000] README: update target list Change-Id: If2d5811a55f6bb60eeba7d28b69c78157a17e87f --- README | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README b/README index e360df05f6..87e46f99d4 100644 --- a/README +++ b/README @@ -64,6 +64,8 @@ COMPILING THE APPLICATIONS/LIBRARIES: arm64-android-gcc arm64-darwin-gcc arm64-darwin20-gcc + arm64-darwin21-gcc + arm64-darwin22-gcc arm64-linux-gcc arm64-win64-gcc arm64-win64-vs15 @@ -77,6 +79,8 @@ COMPILING THE APPLICATIONS/LIBRARIES: armv7-win32-vs15 armv7s-darwin-gcc armv8-linux-gcc + loongarch32-linux-gcc + loongarch64-linux-gcc mips32-linux-gcc mips64-linux-gcc ppc64le-linux-gcc @@ -117,6 +121,8 @@ COMPILING THE APPLICATIONS/LIBRARIES: x86_64-darwin18-gcc x86_64-darwin19-gcc x86_64-darwin20-gcc + x86_64-darwin21-gcc + x86_64-darwin22-gcc x86_64-iphonesimulator-gcc x86_64-linux-gcc x86_64-linux-icc From b14d20b47004e58ee4100fc10ccf0cfa8dfe4fe6 Mon Sep 17 00:00:00 2001 From: James Zern Date: Sat, 6 May 2023 15:48:58 -0700 Subject: [PATCH 0790/1000] examples.mk,vpxdec: rm libwebm muxer dependency vpxdec only requires the parser. Change-Id: I54ead453d4af400ca5c3412a3211d6d0b1383046 --- examples.mk | 2 -- 1 file changed, 2 deletions(-) diff --git a/examples.mk b/examples.mk index 42886f1e15..9f83230eca 100644 --- a/examples.mk +++ b/examples.mk @@ -81,8 +81,6 @@ ifeq ($(CONFIG_LIBYUV),yes) $(BUILD_PFX)third_party/libyuv/%.cc.o: CXXFLAGS += ${LIBYUV_CXXFLAGS} endif ifeq ($(CONFIG_WEBM_IO),yes) - vpxdec.SRCS += $(LIBWEBM_COMMON_SRCS) - vpxdec.SRCS += $(LIBWEBM_MUXER_SRCS) vpxdec.SRCS += $(LIBWEBM_PARSER_SRCS) vpxdec.SRCS += webmdec.cc webmdec.h endif From 75f9551efbef322b85534e91dd0b26c0c3bf18f4 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Mon, 8 May 2023 10:37:54 -0400 Subject: [PATCH 0791/1000] CHECK_MEM_ERROR to return in vp9_set_roi_map Also change the return type of vp9_set_roi_map to vpx_codec_err_t Change-Id: I60d9ff45f2d3dfc44cd6e2aab2cb1ba389ff15f3 --- vp9/encoder/vp9_encoder.c | 18 ++++++++++-------- vp9/encoder/vp9_encoder.h | 7 ++++--- vp9/vp9_cx_iface.c | 10 +++------- 3 files changed, 17 insertions(+), 18 deletions(-) diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index f76eec2b57..b54d0d5f73 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -681,9 +681,10 @@ VP9_LEVEL vp9_get_level(const Vp9LevelSpec *const level_spec) { return (i == VP9_LEVELS) ? LEVEL_UNKNOWN : vp9_level_defs[i].level; } -int vp9_set_roi_map(VP9_COMP *cpi, unsigned char *map, unsigned int rows, - unsigned int cols, int delta_q[8], int delta_lf[8], - int skip[8], int ref_frame[8]) { +vpx_codec_err_t vp9_set_roi_map(VP9_COMP *cpi, unsigned char *map, + unsigned int rows, unsigned int cols, + int delta_q[8], int delta_lf[8], int skip[8], + int ref_frame[8]) { VP9_COMMON *cm = &cpi->common; vpx_roi_map_t *roi = &cpi->roi; const int range = 63; @@ -694,13 +695,13 @@ int vp9_set_roi_map(VP9_COMP *cpi, unsigned char *map, unsigned int rows, // Check number of rows and columns match if (frame_rows != (int)rows || frame_cols != (int)cols) { - return -1; + return VPX_CODEC_INVALID_PARAM; } if (!check_seg_range(delta_q, range) || !check_seg_range(delta_lf, range) || !check_seg_range(ref_frame, ref_frame_range) || !check_seg_range(skip, skip_range)) - return -1; + return VPX_CODEC_INVALID_PARAM; // Also disable segmentation if no deltas are specified. if (!map || @@ -714,14 +715,15 @@ int vp9_set_roi_map(VP9_COMP *cpi, unsigned char *map, unsigned int rows, ref_frame[6] == -1 && ref_frame[7] == -1))) { vp9_disable_segmentation(&cm->seg); cpi->roi.enabled = 0; - return 0; + return VPX_CODEC_OK; } if (roi->roi_map) { vpx_free(roi->roi_map); roi->roi_map = NULL; } - CHECK_MEM_ERROR(cm, roi->roi_map, vpx_malloc(rows * cols)); + roi->roi_map = vpx_malloc(rows * cols); + if (!roi->roi_map) return VPX_CODEC_MEM_ERROR; // Copy to ROI structure in the compressor. memcpy(roi->roi_map, map, rows * cols); @@ -733,7 +735,7 @@ int vp9_set_roi_map(VP9_COMP *cpi, unsigned char *map, unsigned int rows, roi->rows = rows; roi->cols = cols; - return 0; + return VPX_CODEC_OK; } int vp9_set_active_map(VP9_COMP *cpi, unsigned char *new_map_16x16, int rows, diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index cca1617830..8effe8741e 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -1457,9 +1457,10 @@ static INLINE int log_tile_cols_from_picsize_level(uint32_t width, VP9_LEVEL vp9_get_level(const Vp9LevelSpec *const level_spec); -int vp9_set_roi_map(VP9_COMP *cpi, unsigned char *map, unsigned int rows, - unsigned int cols, int delta_q[8], int delta_lf[8], - int skip[8], int ref_frame[8]); +vpx_codec_err_t vp9_set_roi_map(VP9_COMP *cpi, unsigned char *map, + unsigned int rows, unsigned int cols, + int delta_q[8], int delta_lf[8], int skip[8], + int ref_frame[8]); void vp9_new_framerate(VP9_COMP *cpi, double framerate); diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index f067efdf79..7150f74284 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -1633,13 +1633,9 @@ static vpx_codec_err_t ctrl_set_roi_map(vpx_codec_alg_priv_t *ctx, if (data) { vpx_roi_map_t *roi = (vpx_roi_map_t *)data; - - if (!vp9_set_roi_map(ctx->cpi, roi->roi_map, roi->rows, roi->cols, - roi->delta_q, roi->delta_lf, roi->skip, - roi->ref_frame)) { - return VPX_CODEC_OK; - } - return VPX_CODEC_INVALID_PARAM; + return vp9_set_roi_map(ctx->cpi, roi->roi_map, roi->rows, roi->cols, + roi->delta_q, roi->delta_lf, roi->skip, + roi->ref_frame); } return VPX_CODEC_INVALID_PARAM; } From 1710c9282a11aaaccdf42b7507f570f58e39b7fd Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Thu, 4 May 2023 22:03:27 -0400 Subject: [PATCH 0792/1000] Unify implementation of CHECK_MEM_ERROR There were multiple implementations of CHECK_MEM_ERROR across the library that take different arguments and used in different places. This CL will unify them and have only one implementation that takes vpx_internal_error_info. Change-Id: I2c568639473815bc00b1fc2b72be56e5ccba1a35 --- vp8/decoder/onyxd_int.h | 21 ---------- vp8/decoder/threading.c | 26 ++++++------ vp8/encoder/encodeframe.c | 2 +- vp8/encoder/ethreading.c | 10 ++--- vp8/encoder/onyx_if.c | 44 +++++++++++--------- vp8/encoder/onyx_int.h | 20 --------- vp9/common/vp9_common.h | 21 ---------- vp9/common/vp9_thread_common.c | 16 +++---- vp9/decoder/vp9_decodeframe.c | 16 +++---- vp9/decoder/vp9_decoder.c | 19 +++++---- vp9/encoder/vp9_bitstream.c | 4 +- vp9/encoder/vp9_context_tree.c | 15 +++---- vp9/encoder/vp9_denoiser.c | 4 +- vp9/encoder/vp9_encodeframe.c | 6 +-- vp9/encoder/vp9_encoder.c | 67 +++++++++++++++--------------- vp9/encoder/vp9_encoder.h | 10 ++--- vp9/encoder/vp9_ethread.c | 14 +++---- vp9/encoder/vp9_firstpass.c | 2 +- vp9/encoder/vp9_mbgraph.c | 2 +- vp9/encoder/vp9_multi_thread.c | 4 +- vp9/encoder/vp9_speed_features.c | 6 +-- vp9/encoder/vp9_svc_layercontext.c | 6 +-- vp9/encoder/vp9_tpl_model.c | 10 ++--- vpx/internal/vpx_codec_internal.h | 23 ++++++++++ 24 files changed, 171 insertions(+), 197 deletions(-) diff --git a/vp8/decoder/onyxd_int.h b/vp8/decoder/onyxd_int.h index a6bedc4faf..56500a8506 100644 --- a/vp8/decoder/onyxd_int.h +++ b/vp8/decoder/onyxd_int.h @@ -135,27 +135,6 @@ int vp8_decode_frame(VP8D_COMP *pbi); int vp8_create_decoder_instances(struct frame_buffers *fb, VP8D_CONFIG *oxcf); int vp8_remove_decoder_instances(struct frame_buffers *fb); -#if CONFIG_DEBUG -#define CHECK_MEM_ERROR(lval, expr) \ - do { \ - assert(pbi->common.error.setjmp); \ - (lval) = (expr); \ - if (!(lval)) \ - vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR, \ - "Failed to allocate " #lval " at %s:%d", __FILE__, \ - __LINE__); \ - } while (0) -#else -#define CHECK_MEM_ERROR(lval, expr) \ - do { \ - assert(pbi->common.error.setjmp); \ - (lval) = (expr); \ - if (!(lval)) \ - vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR, \ - "Failed to allocate " #lval); \ - } while (0) -#endif - #ifdef __cplusplus } // extern "C" #endif diff --git a/vp8/decoder/threading.c b/vp8/decoder/threading.c index 490f62d1b3..9ea6a4f34a 100644 --- a/vp8/decoder/threading.c +++ b/vp8/decoder/threading.c @@ -30,11 +30,13 @@ #include "error_concealment.h" #endif -#define CALLOC_ARRAY(p, n) CHECK_MEM_ERROR((p), vpx_calloc(sizeof(*(p)), (n))) -#define CALLOC_ARRAY_ALIGNED(p, n, algn) \ - do { \ - CHECK_MEM_ERROR((p), vpx_memalign((algn), sizeof(*(p)) * (n))); \ - memset((p), 0, (n) * sizeof(*(p))); \ +#define CALLOC_ARRAY(p, n) \ + CHECK_MEM_ERROR(&pbi->common.error, (p), vpx_calloc(sizeof(*(p)), (n))) +#define CALLOC_ARRAY_ALIGNED(p, n, algn) \ + do { \ + CHECK_MEM_ERROR(&pbi->common.error, (p), \ + vpx_memalign((algn), sizeof(*(p)) * (n))); \ + memset((p), 0, (n) * sizeof(*(p))); \ } while (0) static void setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd, @@ -754,7 +756,7 @@ void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows) { uv_width = width >> 1; /* Allocate a vpx_atomic_int for each mb row. */ - CHECK_MEM_ERROR(pbi->mt_current_mb_col, + CHECK_MEM_ERROR(&pc->error, pbi->mt_current_mb_col, vpx_malloc(sizeof(*pbi->mt_current_mb_col) * pc->mb_rows)); for (i = 0; i < pc->mb_rows; ++i) vpx_atomic_init(&pbi->mt_current_mb_col[i], 0); @@ -762,7 +764,7 @@ void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows) { /* Allocate memory for above_row buffers. */ CALLOC_ARRAY(pbi->mt_yabove_row, pc->mb_rows); for (i = 0; i < pc->mb_rows; ++i) { - CHECK_MEM_ERROR(pbi->mt_yabove_row[i], + CHECK_MEM_ERROR(&pc->error, pbi->mt_yabove_row[i], vpx_memalign(16, sizeof(unsigned char) * (width + (VP8BORDERINPIXELS << 1)))); vp8_zero_array(pbi->mt_yabove_row[i], width + (VP8BORDERINPIXELS << 1)); @@ -770,7 +772,7 @@ void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows) { CALLOC_ARRAY(pbi->mt_uabove_row, pc->mb_rows); for (i = 0; i < pc->mb_rows; ++i) { - CHECK_MEM_ERROR(pbi->mt_uabove_row[i], + CHECK_MEM_ERROR(&pc->error, pbi->mt_uabove_row[i], vpx_memalign(16, sizeof(unsigned char) * (uv_width + VP8BORDERINPIXELS))); vp8_zero_array(pbi->mt_uabove_row[i], uv_width + VP8BORDERINPIXELS); @@ -778,7 +780,7 @@ void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows) { CALLOC_ARRAY(pbi->mt_vabove_row, pc->mb_rows); for (i = 0; i < pc->mb_rows; ++i) { - CHECK_MEM_ERROR(pbi->mt_vabove_row[i], + CHECK_MEM_ERROR(&pc->error, pbi->mt_vabove_row[i], vpx_memalign(16, sizeof(unsigned char) * (uv_width + VP8BORDERINPIXELS))); vp8_zero_array(pbi->mt_vabove_row[i], uv_width + VP8BORDERINPIXELS); @@ -787,17 +789,17 @@ void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows) { /* Allocate memory for left_col buffers. */ CALLOC_ARRAY(pbi->mt_yleft_col, pc->mb_rows); for (i = 0; i < pc->mb_rows; ++i) - CHECK_MEM_ERROR(pbi->mt_yleft_col[i], + CHECK_MEM_ERROR(&pc->error, pbi->mt_yleft_col[i], vpx_calloc(sizeof(unsigned char) * 16, 1)); CALLOC_ARRAY(pbi->mt_uleft_col, pc->mb_rows); for (i = 0; i < pc->mb_rows; ++i) - CHECK_MEM_ERROR(pbi->mt_uleft_col[i], + CHECK_MEM_ERROR(&pc->error, pbi->mt_uleft_col[i], vpx_calloc(sizeof(unsigned char) * 8, 1)); CALLOC_ARRAY(pbi->mt_vleft_col, pc->mb_rows); for (i = 0; i < pc->mb_rows; ++i) - CHECK_MEM_ERROR(pbi->mt_vleft_col[i], + CHECK_MEM_ERROR(&pc->error, pbi->mt_vleft_col[i], vpx_calloc(sizeof(unsigned char) * 8, 1)); } } diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c index 620107500a..dc29945729 100644 --- a/vp8/encoder/encodeframe.c +++ b/vp8/encoder/encodeframe.c @@ -123,7 +123,7 @@ static void calc_av_activity(VP8_COMP *cpi, int64_t activity_sum) { unsigned int tmp; /* Create a list to sort to */ - CHECK_MEM_ERROR(sortlist, + CHECK_MEM_ERROR(&cpi->common.error, sortlist, vpx_calloc(sizeof(unsigned int), cpi->common.MBs)); /* Copy map to sort list */ diff --git a/vp8/encoder/ethreading.c b/vp8/encoder/ethreading.c index cb35f4f491..2583cb0ac3 100644 --- a/vp8/encoder/ethreading.c +++ b/vp8/encoder/ethreading.c @@ -510,16 +510,16 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) { if (th_count == 0) return 0; - CHECK_MEM_ERROR(cpi->h_encoding_thread, + CHECK_MEM_ERROR(&cpi->common.error, cpi->h_encoding_thread, vpx_malloc(sizeof(pthread_t) * th_count)); - CHECK_MEM_ERROR(cpi->h_event_start_encoding, + CHECK_MEM_ERROR(&cpi->common.error, cpi->h_event_start_encoding, vpx_malloc(sizeof(sem_t) * th_count)); - CHECK_MEM_ERROR(cpi->h_event_end_encoding, + CHECK_MEM_ERROR(&cpi->common.error, cpi->h_event_end_encoding, vpx_malloc(sizeof(sem_t) * th_count)); - CHECK_MEM_ERROR(cpi->mb_row_ei, + CHECK_MEM_ERROR(&cpi->common.error, cpi->mb_row_ei, vpx_memalign(32, sizeof(MB_ROW_COMP) * th_count)); memset(cpi->mb_row_ei, 0, sizeof(MB_ROW_COMP) * th_count); - CHECK_MEM_ERROR(cpi->en_thread_data, + CHECK_MEM_ERROR(&cpi->common.error, cpi->en_thread_data, vpx_malloc(sizeof(ENCODETHREAD_DATA) * th_count)); vpx_atomic_store_release(&cpi->b_multi_threaded, 1); diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index a780048073..8941329419 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -1169,7 +1169,8 @@ void vp8_alloc_compressor_data(VP8_COMP *cpi) { #else unsigned int tokens = cm->mb_rows * cm->mb_cols * 24 * 16; #endif - CHECK_MEM_ERROR(cpi->tok, vpx_calloc(tokens, sizeof(*cpi->tok))); + CHECK_MEM_ERROR(&cpi->common.error, cpi->tok, + vpx_calloc(tokens, sizeof(*cpi->tok))); } /* Data used for real time vc mode to see if gf needs refreshing */ @@ -1178,37 +1179,39 @@ void vp8_alloc_compressor_data(VP8_COMP *cpi) { /* Structures used to monitor GF usage */ vpx_free(cpi->gf_active_flags); CHECK_MEM_ERROR( - cpi->gf_active_flags, + &cpi->common.error, cpi->gf_active_flags, vpx_calloc(sizeof(*cpi->gf_active_flags), cm->mb_rows * cm->mb_cols)); cpi->gf_active_count = cm->mb_rows * cm->mb_cols; vpx_free(cpi->mb_activity_map); CHECK_MEM_ERROR( - cpi->mb_activity_map, + &cpi->common.error, cpi->mb_activity_map, vpx_calloc(sizeof(*cpi->mb_activity_map), cm->mb_rows * cm->mb_cols)); /* allocate memory for storing last frame's MVs for MV prediction. */ vpx_free(cpi->lfmv); - CHECK_MEM_ERROR(cpi->lfmv, vpx_calloc((cm->mb_rows + 2) * (cm->mb_cols + 2), - sizeof(*cpi->lfmv))); + CHECK_MEM_ERROR( + &cpi->common.error, cpi->lfmv, + vpx_calloc((cm->mb_rows + 2) * (cm->mb_cols + 2), sizeof(*cpi->lfmv))); vpx_free(cpi->lf_ref_frame_sign_bias); - CHECK_MEM_ERROR(cpi->lf_ref_frame_sign_bias, + CHECK_MEM_ERROR(&cpi->common.error, cpi->lf_ref_frame_sign_bias, vpx_calloc((cm->mb_rows + 2) * (cm->mb_cols + 2), sizeof(*cpi->lf_ref_frame_sign_bias))); vpx_free(cpi->lf_ref_frame); - CHECK_MEM_ERROR(cpi->lf_ref_frame, + CHECK_MEM_ERROR(&cpi->common.error, cpi->lf_ref_frame, vpx_calloc((cm->mb_rows + 2) * (cm->mb_cols + 2), sizeof(*cpi->lf_ref_frame))); /* Create the encoder segmentation map and set all entries to 0 */ vpx_free(cpi->segmentation_map); CHECK_MEM_ERROR( - cpi->segmentation_map, + &cpi->common.error, cpi->segmentation_map, vpx_calloc(cm->mb_rows * cm->mb_cols, sizeof(*cpi->segmentation_map))); cpi->cyclic_refresh_mode_index = 0; vpx_free(cpi->active_map); - CHECK_MEM_ERROR(cpi->active_map, vpx_calloc(cm->mb_rows * cm->mb_cols, - sizeof(*cpi->active_map))); + CHECK_MEM_ERROR( + &cpi->common.error, cpi->active_map, + vpx_calloc(cm->mb_rows * cm->mb_cols, sizeof(*cpi->active_map))); memset(cpi->active_map, 1, (cm->mb_rows * cm->mb_cols)); #if CONFIG_MULTITHREAD @@ -1226,7 +1229,7 @@ void vp8_alloc_compressor_data(VP8_COMP *cpi) { int i; vpx_free(cpi->mt_current_mb_col); - CHECK_MEM_ERROR(cpi->mt_current_mb_col, + CHECK_MEM_ERROR(&cpi->common.error, cpi->mt_current_mb_col, vpx_malloc(sizeof(*cpi->mt_current_mb_col) * cm->mb_rows)); for (i = 0; i < cm->mb_rows; ++i) vpx_atomic_init(&cpi->mt_current_mb_col[i], 0); @@ -1235,7 +1238,8 @@ void vp8_alloc_compressor_data(VP8_COMP *cpi) { #endif vpx_free(cpi->tplist); - CHECK_MEM_ERROR(cpi->tplist, vpx_malloc(sizeof(TOKENLIST) * cm->mb_rows)); + CHECK_MEM_ERROR(&cpi->common.error, cpi->tplist, + vpx_malloc(sizeof(TOKENLIST) * cm->mb_rows)); #if CONFIG_TEMPORAL_DENOISING if (cpi->oxcf.noise_sensitivity > 0) { @@ -1773,8 +1777,9 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) { cpi->common.error.setjmp = 1; - CHECK_MEM_ERROR(cpi->mb.ss, vpx_calloc(sizeof(search_site), - (MAX_MVSEARCH_STEPS * 8) + 1)); + CHECK_MEM_ERROR( + &cpi->common.error, cpi->mb.ss, + vpx_calloc(sizeof(search_site), (MAX_MVSEARCH_STEPS * 8) + 1)); vp8_create_common(&cpi->common); @@ -1879,18 +1884,19 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) { } if (cpi->cyclic_refresh_mode_enabled) { - CHECK_MEM_ERROR(cpi->cyclic_refresh_map, + CHECK_MEM_ERROR(&cpi->common.error, cpi->cyclic_refresh_map, vpx_calloc((cpi->common.mb_rows * cpi->common.mb_cols), 1)); } else { cpi->cyclic_refresh_map = (signed char *)NULL; } - CHECK_MEM_ERROR(cpi->skin_map, vpx_calloc(cm->mb_rows * cm->mb_cols, - sizeof(cpi->skin_map[0]))); + CHECK_MEM_ERROR( + &cpi->common.error, cpi->skin_map, + vpx_calloc(cm->mb_rows * cm->mb_cols, sizeof(cpi->skin_map[0]))); - CHECK_MEM_ERROR(cpi->consec_zero_last, + CHECK_MEM_ERROR(&cpi->common.error, cpi->consec_zero_last, vpx_calloc(cm->mb_rows * cm->mb_cols, 1)); - CHECK_MEM_ERROR(cpi->consec_zero_last_mvbias, + CHECK_MEM_ERROR(&cpi->common.error, cpi->consec_zero_last_mvbias, vpx_calloc((cpi->common.mb_rows * cpi->common.mb_cols), 1)); /*Initialize the feed-forward activity masking.*/ diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h index 46a17913ad..bde5c2f69b 100644 --- a/vp8/encoder/onyx_int.h +++ b/vp8/encoder/onyx_int.h @@ -731,26 +731,6 @@ void vp8_tokenize_mb(VP8_COMP *, MACROBLOCK *, TOKENEXTRA **); void vp8_set_speed_features(VP8_COMP *cpi); -#if CONFIG_DEBUG -#define CHECK_MEM_ERROR(lval, expr) \ - do { \ - assert(cpi->common.error.setjmp); \ - (lval) = (expr); \ - if (!(lval)) \ - vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR, \ - "Failed to allocate " #lval " at %s:%d", __FILE__, \ - __LINE__); \ - } while (0) -#else -#define CHECK_MEM_ERROR(lval, expr) \ - do { \ - assert(cpi->common.error.setjmp); \ - (lval) = (expr); \ - if (!(lval)) \ - vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR, \ - "Failed to allocate " #lval); \ - } while (0) -#endif #ifdef __cplusplus } // extern "C" #endif diff --git a/vp9/common/vp9_common.h b/vp9/common/vp9_common.h index 8d2bed38e5..d63bad93d1 100644 --- a/vp9/common/vp9_common.h +++ b/vp9/common/vp9_common.h @@ -46,27 +46,6 @@ static INLINE int get_unsigned_bits(unsigned int num_values) { return num_values > 0 ? get_msb(num_values) + 1 : 0; } -#if CONFIG_DEBUG -#define CHECK_MEM_ERROR(cm, lval, expr) \ - do { \ - assert(&(cm)->error.setjmp); \ - (lval) = (expr); \ - if (!(lval)) \ - vpx_internal_error(&(cm)->error, VPX_CODEC_MEM_ERROR, \ - "Failed to allocate " #lval " at %s:%d", __FILE__, \ - __LINE__); \ - } while (0) -#else -#define CHECK_MEM_ERROR(cm, lval, expr) \ - do { \ - assert(&(cm)->error.setjmp); \ - (lval) = (expr); \ - if (!(lval)) \ - vpx_internal_error(&(cm)->error, VPX_CODEC_MEM_ERROR, \ - "Failed to allocate " #lval); \ - } while (0) -#endif - #define VP9_SYNC_CODE_0 0x49 #define VP9_SYNC_CODE_1 0x83 #define VP9_SYNC_CODE_2 0x42 diff --git a/vp9/common/vp9_thread_common.c b/vp9/common/vp9_thread_common.c index ad4478179e..1c6ecc0fe6 100644 --- a/vp9/common/vp9_thread_common.c +++ b/vp9/common/vp9_thread_common.c @@ -283,7 +283,7 @@ void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows, { int i; - CHECK_MEM_ERROR(cm, lf_sync->mutex, + CHECK_MEM_ERROR(&cm->error, lf_sync->mutex, vpx_malloc(sizeof(*lf_sync->mutex) * rows)); if (lf_sync->mutex) { for (i = 0; i < rows; ++i) { @@ -291,7 +291,7 @@ void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows, } } - CHECK_MEM_ERROR(cm, lf_sync->cond, + CHECK_MEM_ERROR(&cm->error, lf_sync->cond, vpx_malloc(sizeof(*lf_sync->cond) * rows)); if (lf_sync->cond) { for (i = 0; i < rows; ++i) { @@ -299,11 +299,11 @@ void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows, } } - CHECK_MEM_ERROR(cm, lf_sync->lf_mutex, + CHECK_MEM_ERROR(&cm->error, lf_sync->lf_mutex, vpx_malloc(sizeof(*lf_sync->lf_mutex))); pthread_mutex_init(lf_sync->lf_mutex, NULL); - CHECK_MEM_ERROR(cm, lf_sync->recon_done_mutex, + CHECK_MEM_ERROR(&cm->error, lf_sync->recon_done_mutex, vpx_malloc(sizeof(*lf_sync->recon_done_mutex) * rows)); if (lf_sync->recon_done_mutex) { for (i = 0; i < rows; ++i) { @@ -311,7 +311,7 @@ void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows, } } - CHECK_MEM_ERROR(cm, lf_sync->recon_done_cond, + CHECK_MEM_ERROR(&cm->error, lf_sync->recon_done_cond, vpx_malloc(sizeof(*lf_sync->recon_done_cond) * rows)); if (lf_sync->recon_done_cond) { for (i = 0; i < rows; ++i) { @@ -321,15 +321,15 @@ void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows, } #endif // CONFIG_MULTITHREAD - CHECK_MEM_ERROR(cm, lf_sync->lfdata, + CHECK_MEM_ERROR(&cm->error, lf_sync->lfdata, vpx_malloc(num_workers * sizeof(*lf_sync->lfdata))); lf_sync->num_workers = num_workers; lf_sync->num_active_workers = lf_sync->num_workers; - CHECK_MEM_ERROR(cm, lf_sync->cur_sb_col, + CHECK_MEM_ERROR(&cm->error, lf_sync->cur_sb_col, vpx_malloc(sizeof(*lf_sync->cur_sb_col) * rows)); - CHECK_MEM_ERROR(cm, lf_sync->num_tiles_done, + CHECK_MEM_ERROR(&cm->error, lf_sync->num_tiles_done, vpx_malloc(sizeof(*lf_sync->num_tiles_done) * mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2)); diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c index 6eae41fcfb..10a7f9b124 100644 --- a/vp9/decoder/vp9_decodeframe.c +++ b/vp9/decoder/vp9_decodeframe.c @@ -1469,7 +1469,7 @@ static void resize_mv_buffer(VP9_COMMON *cm) { vpx_free(cm->cur_frame->mvs); cm->cur_frame->mi_rows = cm->mi_rows; cm->cur_frame->mi_cols = cm->mi_cols; - CHECK_MEM_ERROR(cm, cm->cur_frame->mvs, + CHECK_MEM_ERROR(&cm->error, cm->cur_frame->mvs, (MV_REF *)vpx_calloc(cm->mi_rows * cm->mi_cols, sizeof(*cm->cur_frame->mvs))); } @@ -1776,7 +1776,8 @@ static void vp9_jobq_alloc(VP9Decoder *pbi) { if (jobq_size > row_mt_worker_data->jobq_size) { vpx_free(row_mt_worker_data->jobq_buf); - CHECK_MEM_ERROR(cm, row_mt_worker_data->jobq_buf, vpx_calloc(1, jobq_size)); + CHECK_MEM_ERROR(&cm->error, row_mt_worker_data->jobq_buf, + vpx_calloc(1, jobq_size)); vp9_jobq_init(&row_mt_worker_data->jobq, row_mt_worker_data->jobq_buf, jobq_size); row_mt_worker_data->jobq_size = jobq_size; @@ -1923,7 +1924,7 @@ static int row_decode_worker_hook(void *arg1, void *arg2) { const int is_last_row = sb_rows - 1 == cur_sb_row; int mi_col_start, mi_col_end; if (!tile_data_recon) - CHECK_MEM_ERROR(cm, tile_data_recon, + CHECK_MEM_ERROR(&cm->error, tile_data_recon, vpx_memalign(32, sizeof(TileWorkerData))); tile_data_recon->xd = pbi->mb; @@ -2025,7 +2026,7 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi, const uint8_t *data, if (cm->lf.filter_level && !cm->skip_loop_filter && pbi->lf_worker.data1 == NULL) { - CHECK_MEM_ERROR(cm, pbi->lf_worker.data1, + CHECK_MEM_ERROR(&cm->error, pbi->lf_worker.data1, vpx_memalign(32, sizeof(LFWorkerData))); pbi->lf_worker.hook = vp9_loop_filter_worker; if (pbi->max_threads > 1 && !winterface->reset(&pbi->lf_worker)) { @@ -2285,7 +2286,7 @@ static INLINE void init_mt(VP9Decoder *pbi) { if (pbi->num_tile_workers == 0) { const int num_threads = pbi->max_threads; - CHECK_MEM_ERROR(cm, pbi->tile_workers, + CHECK_MEM_ERROR(&cm->error, pbi->tile_workers, vpx_malloc(num_threads * sizeof(*pbi->tile_workers))); for (n = 0; n < num_threads; ++n) { VPxWorker *const worker = &pbi->tile_workers[n]; @@ -2824,7 +2825,7 @@ static size_t read_uncompressed_header(VP9Decoder *pbi, const int num_jobs = sb_rows << cm->log2_tile_cols; if (pbi->row_mt_worker_data == NULL) { - CHECK_MEM_ERROR(cm, pbi->row_mt_worker_data, + CHECK_MEM_ERROR(&cm->error, pbi->row_mt_worker_data, vpx_calloc(1, sizeof(*pbi->row_mt_worker_data))); #if CONFIG_MULTITHREAD pthread_mutex_init(&pbi->row_mt_worker_data->recon_done_mutex, NULL); @@ -3006,7 +3007,8 @@ void vp9_decode_frame(VP9Decoder *pbi, const uint8_t *data, // platforms without DECLARE_ALIGNED(). assert((sizeof(*pbi->tile_worker_data) % 16) == 0); vpx_free(pbi->tile_worker_data); - CHECK_MEM_ERROR(cm, pbi->tile_worker_data, vpx_memalign(32, twd_size)); + CHECK_MEM_ERROR(&cm->error, pbi->tile_worker_data, + vpx_memalign(32, twd_size)); pbi->total_tiles = tile_rows * tile_cols; } diff --git a/vp9/decoder/vp9_decoder.c b/vp9/decoder/vp9_decoder.c index 92cd91f1e3..5a7e9f9ab3 100644 --- a/vp9/decoder/vp9_decoder.c +++ b/vp9/decoder/vp9_decoder.c @@ -66,7 +66,7 @@ void vp9_dec_alloc_row_mt_mem(RowMTWorkerData *row_mt_worker_data, { int i; CHECK_MEM_ERROR( - cm, row_mt_worker_data->recon_sync_mutex, + &cm->error, row_mt_worker_data->recon_sync_mutex, vpx_malloc(sizeof(*row_mt_worker_data->recon_sync_mutex) * num_jobs)); if (row_mt_worker_data->recon_sync_mutex) { for (i = 0; i < num_jobs; ++i) { @@ -75,7 +75,7 @@ void vp9_dec_alloc_row_mt_mem(RowMTWorkerData *row_mt_worker_data, } CHECK_MEM_ERROR( - cm, row_mt_worker_data->recon_sync_cond, + &cm->error, row_mt_worker_data->recon_sync_cond, vpx_malloc(sizeof(*row_mt_worker_data->recon_sync_cond) * num_jobs)); if (row_mt_worker_data->recon_sync_cond) { for (i = 0; i < num_jobs; ++i) { @@ -86,24 +86,24 @@ void vp9_dec_alloc_row_mt_mem(RowMTWorkerData *row_mt_worker_data, #endif row_mt_worker_data->num_sbs = num_sbs; for (plane = 0; plane < 3; ++plane) { - CHECK_MEM_ERROR(cm, row_mt_worker_data->dqcoeff[plane], + CHECK_MEM_ERROR(&cm->error, row_mt_worker_data->dqcoeff[plane], vpx_memalign(32, dqcoeff_size)); memset(row_mt_worker_data->dqcoeff[plane], 0, dqcoeff_size); - CHECK_MEM_ERROR(cm, row_mt_worker_data->eob[plane], + CHECK_MEM_ERROR(&cm->error, row_mt_worker_data->eob[plane], vpx_calloc(num_sbs << EOBS_PER_SB_LOG2, sizeof(*row_mt_worker_data->eob[plane]))); } - CHECK_MEM_ERROR(cm, row_mt_worker_data->partition, + CHECK_MEM_ERROR(&cm->error, row_mt_worker_data->partition, vpx_calloc(num_sbs * PARTITIONS_PER_SB, sizeof(*row_mt_worker_data->partition))); - CHECK_MEM_ERROR(cm, row_mt_worker_data->recon_map, + CHECK_MEM_ERROR(&cm->error, row_mt_worker_data->recon_map, vpx_calloc(num_sbs, sizeof(*row_mt_worker_data->recon_map))); // allocate memory for thread_data if (row_mt_worker_data->thread_data == NULL) { const size_t thread_size = max_threads * sizeof(*row_mt_worker_data->thread_data); - CHECK_MEM_ERROR(cm, row_mt_worker_data->thread_data, + CHECK_MEM_ERROR(&cm->error, row_mt_worker_data->thread_data, vpx_memalign(32, thread_size)); } } @@ -181,9 +181,10 @@ VP9Decoder *vp9_decoder_create(BufferPool *const pool) { cm->error.setjmp = 1; - CHECK_MEM_ERROR(cm, cm->fc, (FRAME_CONTEXT *)vpx_calloc(1, sizeof(*cm->fc))); + CHECK_MEM_ERROR(&cm->error, cm->fc, + (FRAME_CONTEXT *)vpx_calloc(1, sizeof(*cm->fc))); CHECK_MEM_ERROR( - cm, cm->frame_contexts, + &cm->error, cm->frame_contexts, (FRAME_CONTEXT *)vpx_calloc(FRAME_CONTEXTS, sizeof(*cm->frame_contexts))); pbi->need_resync = 1; diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c index 17c123af6f..ca56d14aa1 100644 --- a/vp9/encoder/vp9_bitstream.c +++ b/vp9/encoder/vp9_bitstream.c @@ -967,13 +967,13 @@ static void encode_tiles_buffer_alloc(VP9_COMP *const cpi) { int i; const size_t worker_data_size = cpi->num_workers * sizeof(*cpi->vp9_bitstream_worker_data); - CHECK_MEM_ERROR(cm, cpi->vp9_bitstream_worker_data, + CHECK_MEM_ERROR(&cm->error, cpi->vp9_bitstream_worker_data, vpx_memalign(16, worker_data_size)); memset(cpi->vp9_bitstream_worker_data, 0, worker_data_size); for (i = 1; i < cpi->num_workers; ++i) { cpi->vp9_bitstream_worker_data[i].dest_size = cpi->oxcf.width * cpi->oxcf.height; - CHECK_MEM_ERROR(cm, cpi->vp9_bitstream_worker_data[i].dest, + CHECK_MEM_ERROR(&cm->error, cpi->vp9_bitstream_worker_data[i].dest, vpx_malloc(cpi->vp9_bitstream_worker_data[i].dest_size)); } } diff --git a/vp9/encoder/vp9_context_tree.c b/vp9/encoder/vp9_context_tree.c index b74b9027ca..42073f756c 100644 --- a/vp9/encoder/vp9_context_tree.c +++ b/vp9/encoder/vp9_context_tree.c @@ -25,16 +25,17 @@ static void alloc_mode_context(VP9_COMMON *cm, int num_4x4_blk, int i, k; ctx->num_4x4_blk = num_blk; - CHECK_MEM_ERROR(cm, ctx->zcoeff_blk, vpx_calloc(num_blk, sizeof(uint8_t))); + CHECK_MEM_ERROR(&cm->error, ctx->zcoeff_blk, + vpx_calloc(num_blk, sizeof(uint8_t))); for (i = 0; i < MAX_MB_PLANE; ++i) { for (k = 0; k < 3; ++k) { - CHECK_MEM_ERROR(cm, ctx->coeff[i][k], + CHECK_MEM_ERROR(&cm->error, ctx->coeff[i][k], vpx_memalign(32, num_pix * sizeof(*ctx->coeff[i][k]))); - CHECK_MEM_ERROR(cm, ctx->qcoeff[i][k], + CHECK_MEM_ERROR(&cm->error, ctx->qcoeff[i][k], vpx_memalign(32, num_pix * sizeof(*ctx->qcoeff[i][k]))); - CHECK_MEM_ERROR(cm, ctx->dqcoeff[i][k], + CHECK_MEM_ERROR(&cm->error, ctx->dqcoeff[i][k], vpx_memalign(32, num_pix * sizeof(*ctx->dqcoeff[i][k]))); - CHECK_MEM_ERROR(cm, ctx->eobs[i][k], + CHECK_MEM_ERROR(&cm->error, ctx->eobs[i][k], vpx_memalign(32, num_blk * sizeof(*ctx->eobs[i][k]))); ctx->coeff_pbuf[i][k] = ctx->coeff[i][k]; ctx->qcoeff_pbuf[i][k] = ctx->qcoeff[i][k]; @@ -100,10 +101,10 @@ void vp9_setup_pc_tree(VP9_COMMON *cm, ThreadData *td) { int nodes; vpx_free(td->leaf_tree); - CHECK_MEM_ERROR(cm, td->leaf_tree, + CHECK_MEM_ERROR(&cm->error, td->leaf_tree, vpx_calloc(leaf_nodes, sizeof(*td->leaf_tree))); vpx_free(td->pc_tree); - CHECK_MEM_ERROR(cm, td->pc_tree, + CHECK_MEM_ERROR(&cm->error, td->pc_tree, vpx_calloc(tree_nodes, sizeof(*td->pc_tree))); this_pc = &td->pc_tree[0]; diff --git a/vp9/encoder/vp9_denoiser.c b/vp9/encoder/vp9_denoiser.c index 77d72396ae..baea8ebb3c 100644 --- a/vp9/encoder/vp9_denoiser.c +++ b/vp9/encoder/vp9_denoiser.c @@ -634,11 +634,11 @@ int vp9_denoiser_alloc(VP9_COMMON *cm, struct SVC *svc, VP9_DENOISER *denoiser, denoiser->num_ref_frames = use_svc ? SVC_REF_FRAMES : NONSVC_REF_FRAMES; init_num_ref_frames = use_svc ? MAX_REF_FRAMES : NONSVC_REF_FRAMES; denoiser->num_layers = num_layers; - CHECK_MEM_ERROR(cm, denoiser->running_avg_y, + CHECK_MEM_ERROR(&cm->error, denoiser->running_avg_y, vpx_calloc(denoiser->num_ref_frames * num_layers, sizeof(denoiser->running_avg_y[0]))); CHECK_MEM_ERROR( - cm, denoiser->mc_running_avg_y, + &cm->error, denoiser->mc_running_avg_y, vpx_calloc(num_layers, sizeof(denoiser->mc_running_avg_y[0]))); for (layer = 0; layer < num_layers; ++layer) { diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index 3a042399cb..a979ae1c93 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -1545,7 +1545,7 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, } if (low_res && threshold_4x4avg < INT64_MAX) - CHECK_MEM_ERROR(cm, vt2, vpx_calloc(16, sizeof(*vt2))); + CHECK_MEM_ERROR(&cm->error, vt2, vpx_calloc(16, sizeof(*vt2))); // Fill in the entire tree of 8x8 (or 4x4 under some conditions) variances // for splits. for (i = 0; i < 4; i++) { @@ -5783,7 +5783,7 @@ static void source_var_based_partition_search_method(VP9_COMP *cpi) { if (cm->last_width != cm->width || cm->last_height != cm->height) { if (cpi->source_diff_var) vpx_free(cpi->source_diff_var); - CHECK_MEM_ERROR(cm, cpi->source_diff_var, + CHECK_MEM_ERROR(&cm->error, cpi->source_diff_var, vpx_calloc(cm->MBs, sizeof(cpi->source_diff_var))); } @@ -5823,7 +5823,7 @@ void vp9_init_tile_data(VP9_COMP *cpi) { if (cpi->tile_data == NULL || cpi->allocated_tiles < tile_cols * tile_rows) { if (cpi->tile_data != NULL) vpx_free(cpi->tile_data); CHECK_MEM_ERROR( - cm, cpi->tile_data, + &cm->error, cpi->tile_data, vpx_malloc(tile_cols * tile_rows * sizeof(*cpi->tile_data))); cpi->allocated_tiles = tile_cols * tile_rows; diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index b54d0d5f73..db21509ce9 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -1376,7 +1376,7 @@ static void alloc_context_buffers_ext(VP9_COMP *cpi) { VP9_COMMON *cm = &cpi->common; int mi_size = cm->mi_cols * cm->mi_rows; - CHECK_MEM_ERROR(cm, cpi->mbmi_ext_base, + CHECK_MEM_ERROR(&cm->error, cpi->mbmi_ext_base, vpx_calloc(mi_size, sizeof(*cpi->mbmi_ext_base))); } @@ -1395,14 +1395,14 @@ static void alloc_compressor_data(VP9_COMP *cpi) { { unsigned int tokens = get_token_alloc(cm->mb_rows, cm->mb_cols); - CHECK_MEM_ERROR(cm, cpi->tile_tok[0][0], + CHECK_MEM_ERROR(&cm->error, cpi->tile_tok[0][0], vpx_calloc(tokens, sizeof(*cpi->tile_tok[0][0]))); } sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2; vpx_free(cpi->tplist[0][0]); CHECK_MEM_ERROR( - cm, cpi->tplist[0][0], + &cm->error, cpi->tplist[0][0], vpx_calloc(sb_rows * 4 * (1 << 6), sizeof(*cpi->tplist[0][0]))); vp9_setup_pc_tree(&cpi->common, &cpi->td); @@ -1998,48 +1998,48 @@ static void realloc_segmentation_maps(VP9_COMP *cpi) { // Create the encoder segmentation map and set all entries to 0 vpx_free(cpi->segmentation_map); - CHECK_MEM_ERROR(cm, cpi->segmentation_map, + CHECK_MEM_ERROR(&cm->error, cpi->segmentation_map, vpx_calloc(cm->mi_rows * cm->mi_cols, 1)); // Create a map used for cyclic background refresh. if (cpi->cyclic_refresh) vp9_cyclic_refresh_free(cpi->cyclic_refresh); - CHECK_MEM_ERROR(cm, cpi->cyclic_refresh, + CHECK_MEM_ERROR(&cm->error, cpi->cyclic_refresh, vp9_cyclic_refresh_alloc(cm->mi_rows, cm->mi_cols)); // Create a map used to mark inactive areas. vpx_free(cpi->active_map.map); - CHECK_MEM_ERROR(cm, cpi->active_map.map, + CHECK_MEM_ERROR(&cm->error, cpi->active_map.map, vpx_calloc(cm->mi_rows * cm->mi_cols, 1)); // And a place holder structure is the coding context // for use if we want to save and restore it vpx_free(cpi->coding_context.last_frame_seg_map_copy); - CHECK_MEM_ERROR(cm, cpi->coding_context.last_frame_seg_map_copy, + CHECK_MEM_ERROR(&cm->error, cpi->coding_context.last_frame_seg_map_copy, vpx_calloc(cm->mi_rows * cm->mi_cols, 1)); } static void alloc_copy_partition_data(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; if (cpi->prev_partition == NULL) { - CHECK_MEM_ERROR(cm, cpi->prev_partition, + CHECK_MEM_ERROR(&cm->error, cpi->prev_partition, (BLOCK_SIZE *)vpx_calloc(cm->mi_stride * cm->mi_rows, sizeof(*cpi->prev_partition))); } if (cpi->prev_segment_id == NULL) { CHECK_MEM_ERROR( - cm, cpi->prev_segment_id, + &cm->error, cpi->prev_segment_id, (int8_t *)vpx_calloc((cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1), sizeof(*cpi->prev_segment_id))); } if (cpi->prev_variance_low == NULL) { - CHECK_MEM_ERROR(cm, cpi->prev_variance_low, + CHECK_MEM_ERROR(&cm->error, cpi->prev_variance_low, (uint8_t *)vpx_calloc( (cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1) * 25, sizeof(*cpi->prev_variance_low))); } if (cpi->copied_frame_cnt == NULL) { CHECK_MEM_ERROR( - cm, cpi->copied_frame_cnt, + &cm->error, cpi->copied_frame_cnt, (uint8_t *)vpx_calloc((cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1), sizeof(*cpi->copied_frame_cnt))); } @@ -2372,9 +2372,10 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf, cm->free_mi = vp9_enc_free_mi; cm->setup_mi = vp9_enc_setup_mi; - CHECK_MEM_ERROR(cm, cm->fc, (FRAME_CONTEXT *)vpx_calloc(1, sizeof(*cm->fc))); + CHECK_MEM_ERROR(&cm->error, cm->fc, + (FRAME_CONTEXT *)vpx_calloc(1, sizeof(*cm->fc))); CHECK_MEM_ERROR( - cm, cm->frame_contexts, + &cm->error, cm->frame_contexts, (FRAME_CONTEXT *)vpx_calloc(FRAME_CONTEXTS, sizeof(*cm->frame_contexts))); cpi->compute_frame_low_motion_onepass = 1; @@ -2401,38 +2402,38 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf, realloc_segmentation_maps(cpi); CHECK_MEM_ERROR( - cm, cpi->skin_map, + &cm->error, cpi->skin_map, vpx_calloc(cm->mi_rows * cm->mi_cols, sizeof(cpi->skin_map[0]))); #if !CONFIG_REALTIME_ONLY - CHECK_MEM_ERROR(cm, cpi->alt_ref_aq, vp9_alt_ref_aq_create()); + CHECK_MEM_ERROR(&cm->error, cpi->alt_ref_aq, vp9_alt_ref_aq_create()); #endif CHECK_MEM_ERROR( - cm, cpi->consec_zero_mv, + &cm->error, cpi->consec_zero_mv, vpx_calloc(cm->mi_rows * cm->mi_cols, sizeof(*cpi->consec_zero_mv))); - CHECK_MEM_ERROR(cm, cpi->nmvcosts[0], + CHECK_MEM_ERROR(&cm->error, cpi->nmvcosts[0], vpx_calloc(MV_VALS, sizeof(*cpi->nmvcosts[0]))); - CHECK_MEM_ERROR(cm, cpi->nmvcosts[1], + CHECK_MEM_ERROR(&cm->error, cpi->nmvcosts[1], vpx_calloc(MV_VALS, sizeof(*cpi->nmvcosts[1]))); - CHECK_MEM_ERROR(cm, cpi->nmvcosts_hp[0], + CHECK_MEM_ERROR(&cm->error, cpi->nmvcosts_hp[0], vpx_calloc(MV_VALS, sizeof(*cpi->nmvcosts_hp[0]))); - CHECK_MEM_ERROR(cm, cpi->nmvcosts_hp[1], + CHECK_MEM_ERROR(&cm->error, cpi->nmvcosts_hp[1], vpx_calloc(MV_VALS, sizeof(*cpi->nmvcosts_hp[1]))); - CHECK_MEM_ERROR(cm, cpi->nmvsadcosts[0], + CHECK_MEM_ERROR(&cm->error, cpi->nmvsadcosts[0], vpx_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts[0]))); - CHECK_MEM_ERROR(cm, cpi->nmvsadcosts[1], + CHECK_MEM_ERROR(&cm->error, cpi->nmvsadcosts[1], vpx_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts[1]))); - CHECK_MEM_ERROR(cm, cpi->nmvsadcosts_hp[0], + CHECK_MEM_ERROR(&cm->error, cpi->nmvsadcosts_hp[0], vpx_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts_hp[0]))); - CHECK_MEM_ERROR(cm, cpi->nmvsadcosts_hp[1], + CHECK_MEM_ERROR(&cm->error, cpi->nmvsadcosts_hp[1], vpx_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts_hp[1]))); for (i = 0; i < (sizeof(cpi->mbgraph_stats) / sizeof(cpi->mbgraph_stats[0])); i++) { CHECK_MEM_ERROR( - cm, cpi->mbgraph_stats[i].mb_stats, + &cm->error, cpi->mbgraph_stats[i].mb_stats, vpx_calloc(cm->MBs * sizeof(*cpi->mbgraph_stats[i].mb_stats), 1)); } @@ -2476,7 +2477,7 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf, } if (cpi->b_calculate_consistency) { - CHECK_MEM_ERROR(cm, cpi->ssim_vars, + CHECK_MEM_ERROR(&cm->error, cpi->ssim_vars, vpx_calloc(cpi->common.mi_rows * cpi->common.mi_cols, sizeof(*cpi->ssim_vars) * 4)); cpi->worst_consistency = 100.0; @@ -2561,7 +2562,7 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf, vpx_free(lc->rc_twopass_stats_in.buf); lc->rc_twopass_stats_in.sz = packets_in_layer * packet_sz; - CHECK_MEM_ERROR(cm, lc->rc_twopass_stats_in.buf, + CHECK_MEM_ERROR(&cm->error, lc->rc_twopass_stats_in.buf, vpx_malloc(lc->rc_twopass_stats_in.sz)); lc->twopass.stats_in_start = lc->rc_twopass_stats_in.buf; lc->twopass.stats_in = lc->twopass.stats_in_start; @@ -2616,7 +2617,7 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf, const int h = num_8x8_blocks_high_lookup[bsize]; const int num_cols = (cm->mi_cols + w - 1) / w; const int num_rows = (cm->mi_rows + h - 1) / h; - CHECK_MEM_ERROR(cm, cpi->mi_ssim_rdmult_scaling_factors, + CHECK_MEM_ERROR(&cm->error, cpi->mi_ssim_rdmult_scaling_factors, vpx_calloc(num_rows * num_cols, sizeof(*cpi->mi_ssim_rdmult_scaling_factors))); } @@ -2631,7 +2632,7 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf, } // Allocate memory to store variances for a frame. - CHECK_MEM_ERROR(cm, cpi->source_diff_var, + CHECK_MEM_ERROR(&cm->error, cpi->source_diff_var, vpx_calloc(cm->MBs, sizeof(cpi->source_diff_var))); cpi->source_var_thresh = 0; cpi->frames_till_next_var_check = 0; @@ -3754,7 +3755,7 @@ static void set_size_dependent_vars(VP9_COMP *cpi, int *q, int *bottom_index, case 6: l = 150; break; } if (!cpi->common.postproc_state.limits) { - CHECK_MEM_ERROR(cm, cpi->common.postproc_state.limits, + CHECK_MEM_ERROR(&cm->error, cpi->common.postproc_state.limits, vpx_calloc(cpi->un_scaled_source->y_width, sizeof(*cpi->common.postproc_state.limits))); } @@ -4098,7 +4099,7 @@ static int encode_without_recode_loop(VP9_COMP *cpi, size_t *size, svc->spatial_layer_id == svc->number_spatial_layers - 2) { if (svc->prev_partition_svc == NULL) { CHECK_MEM_ERROR( - cm, svc->prev_partition_svc, + &cm->error, svc->prev_partition_svc, (BLOCK_SIZE *)vpx_calloc(cm->mi_stride * cm->mi_rows, sizeof(*svc->prev_partition_svc))); } @@ -5297,7 +5298,7 @@ static void init_mb_wiener_var_buffer(VP9_COMP *cpi) { cpi->mb_wiener_variance = NULL; CHECK_MEM_ERROR( - cm, cpi->mb_wiener_variance, + &cm->error, cpi->mb_wiener_variance, vpx_calloc(cm->mb_rows * cm->mb_cols, sizeof(*cpi->mb_wiener_variance))); cpi->mb_wiener_var_rows = cm->mb_rows; cpi->mb_wiener_var_cols = cm->mb_cols; @@ -6544,7 +6545,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, pthread_mutex_init(&cpi->kmeans_mutex, NULL); #endif CHECK_MEM_ERROR( - cm, cpi->kmeans_data_arr, + &cm->error, cpi->kmeans_data_arr, vpx_calloc(mi_rows * mi_cols, sizeof(*cpi->kmeans_data_arr))); cpi->kmeans_data_stride = mi_cols; cpi->kmeans_data_arr_alloc = 1; diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 8effe8741e..230a8315bc 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -1060,7 +1060,7 @@ static INLINE void partition_info_init(struct VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; const int unit_width = get_num_unit_4x4(cpi->frame_info.frame_width); const int unit_height = get_num_unit_4x4(cpi->frame_info.frame_height); - CHECK_MEM_ERROR(cm, cpi->partition_info, + CHECK_MEM_ERROR(&cm->error, cpi->partition_info, (PARTITION_INFO *)vpx_calloc(unit_width * unit_height, sizeof(PARTITION_INFO))); memset(cpi->partition_info, 0, @@ -1088,7 +1088,7 @@ static INLINE void motion_vector_info_init(struct VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; const int unit_width = get_num_unit_4x4(cpi->frame_info.frame_width); const int unit_height = get_num_unit_4x4(cpi->frame_info.frame_height); - CHECK_MEM_ERROR(cm, cpi->motion_vector_info, + CHECK_MEM_ERROR(&cm->error, cpi->motion_vector_info, (MOTION_VECTOR_INFO *)vpx_calloc(unit_width * unit_height, sizeof(MOTION_VECTOR_INFO))); memset(cpi->motion_vector_info, 0, @@ -1107,7 +1107,7 @@ static INLINE void free_motion_vector_info(struct VP9_COMP *cpi) { static INLINE void tpl_stats_info_init(struct VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; CHECK_MEM_ERROR( - cm, cpi->tpl_stats_info, + &cm->error, cpi->tpl_stats_info, (TplDepStats *)vpx_calloc(MAX_LAG_BUFFERS, sizeof(TplDepStats))); memset(cpi->tpl_stats_info, 0, MAX_LAG_BUFFERS * sizeof(TplDepStats)); } @@ -1126,7 +1126,7 @@ static INLINE void fp_motion_vector_info_init(struct VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; const int unit_width = get_num_unit_16x16(cpi->frame_info.frame_width); const int unit_height = get_num_unit_16x16(cpi->frame_info.frame_height); - CHECK_MEM_ERROR(cm, cpi->fp_motion_vector_info, + CHECK_MEM_ERROR(&cm->error, cpi->fp_motion_vector_info, (MOTION_VECTOR_INFO *)vpx_calloc(unit_width * unit_height, sizeof(MOTION_VECTOR_INFO))); } @@ -1475,7 +1475,7 @@ static INLINE void alloc_frame_mvs(VP9_COMMON *const cm, int buffer_idx) { if (new_fb_ptr->mvs == NULL || new_fb_ptr->mi_rows < cm->mi_rows || new_fb_ptr->mi_cols < cm->mi_cols) { vpx_free(new_fb_ptr->mvs); - CHECK_MEM_ERROR(cm, new_fb_ptr->mvs, + CHECK_MEM_ERROR(&cm->error, new_fb_ptr->mvs, (MV_REF *)vpx_calloc(cm->mi_rows * cm->mi_cols, sizeof(*new_fb_ptr->mvs))); new_fb_ptr->mi_rows = cm->mi_rows; diff --git a/vp9/encoder/vp9_ethread.c b/vp9/encoder/vp9_ethread.c index 453fe2e0df..fadd233899 100644 --- a/vp9/encoder/vp9_ethread.c +++ b/vp9/encoder/vp9_ethread.c @@ -94,10 +94,10 @@ static void create_enc_workers(VP9_COMP *cpi, int num_workers) { vp9_bitstream_encode_tiles_buffer_dealloc(cpi); vp9_encode_free_mt_data(cpi); - CHECK_MEM_ERROR(cm, cpi->workers, + CHECK_MEM_ERROR(&cm->error, cpi->workers, vpx_malloc(num_workers * sizeof(*cpi->workers))); - CHECK_MEM_ERROR(cm, cpi->tile_thr_data, + CHECK_MEM_ERROR(&cm->error, cpi->tile_thr_data, vpx_calloc(num_workers, sizeof(*cpi->tile_thr_data))); for (i = 0; i < num_workers; i++) { @@ -111,7 +111,7 @@ static void create_enc_workers(VP9_COMP *cpi, int num_workers) { thread_data->cpi = cpi; // Allocate thread data. - CHECK_MEM_ERROR(cm, thread_data->td, + CHECK_MEM_ERROR(&cm->error, thread_data->td, vpx_memalign(32, sizeof(*thread_data->td))); vp9_zero(*thread_data->td); @@ -121,7 +121,7 @@ static void create_enc_workers(VP9_COMP *cpi, int num_workers) { vp9_setup_pc_tree(cm, thread_data->td); // Allocate frame counters in thread data. - CHECK_MEM_ERROR(cm, thread_data->td->counts, + CHECK_MEM_ERROR(&cm->error, thread_data->td->counts, vpx_calloc(1, sizeof(*thread_data->td->counts))); // Create threads @@ -292,7 +292,7 @@ void vp9_row_mt_sync_mem_alloc(VP9RowMTSync *row_mt_sync, VP9_COMMON *cm, { int i; - CHECK_MEM_ERROR(cm, row_mt_sync->mutex, + CHECK_MEM_ERROR(&cm->error, row_mt_sync->mutex, vpx_malloc(sizeof(*row_mt_sync->mutex) * rows)); if (row_mt_sync->mutex) { for (i = 0; i < rows; ++i) { @@ -300,7 +300,7 @@ void vp9_row_mt_sync_mem_alloc(VP9RowMTSync *row_mt_sync, VP9_COMMON *cm, } } - CHECK_MEM_ERROR(cm, row_mt_sync->cond, + CHECK_MEM_ERROR(&cm->error, row_mt_sync->cond, vpx_malloc(sizeof(*row_mt_sync->cond) * rows)); if (row_mt_sync->cond) { for (i = 0; i < rows; ++i) { @@ -310,7 +310,7 @@ void vp9_row_mt_sync_mem_alloc(VP9RowMTSync *row_mt_sync, VP9_COMMON *cm, } #endif // CONFIG_MULTITHREAD - CHECK_MEM_ERROR(cm, row_mt_sync->cur_col, + CHECK_MEM_ERROR(&cm->error, row_mt_sync->cur_col, vpx_malloc(sizeof(*row_mt_sync->cur_col) * rows)); // Set up nsync. diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index 71d8775ea5..8fdd976816 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -1422,7 +1422,7 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { if (cpi->row_mt_bit_exact && cpi->twopass.fp_mb_float_stats == NULL) CHECK_MEM_ERROR( - cm, cpi->twopass.fp_mb_float_stats, + &cm->error, cpi->twopass.fp_mb_float_stats, vpx_calloc(cm->MBs * sizeof(*cpi->twopass.fp_mb_float_stats), 1)); { diff --git a/vp9/encoder/vp9_mbgraph.c b/vp9/encoder/vp9_mbgraph.c index 9487fc5fae..fafc673aca 100644 --- a/vp9/encoder/vp9_mbgraph.c +++ b/vp9/encoder/vp9_mbgraph.c @@ -288,7 +288,7 @@ static void separate_arf_mbs(VP9_COMP *cpi) { int *arf_not_zz; CHECK_MEM_ERROR( - cm, arf_not_zz, + &cm->error, arf_not_zz, vpx_calloc(cm->mb_rows * cm->mb_cols * sizeof(*arf_not_zz), 1)); // We are not interested in results beyond the alt ref itself. diff --git a/vp9/encoder/vp9_multi_thread.c b/vp9/encoder/vp9_multi_thread.c index 45659f2a9a..0843cd97e4 100644 --- a/vp9/encoder/vp9_multi_thread.c +++ b/vp9/encoder/vp9_multi_thread.c @@ -59,7 +59,7 @@ void vp9_row_mt_alloc_rd_thresh(VP9_COMP *const cpi, int i; CHECK_MEM_ERROR( - cm, this_tile->row_base_thresh_freq_fact, + &cm->error, this_tile->row_base_thresh_freq_fact, (int *)vpx_calloc(sb_rows * BLOCK_SIZES * MAX_MODES, sizeof(*(this_tile->row_base_thresh_freq_fact)))); for (i = 0; i < sb_rows * BLOCK_SIZES * MAX_MODES; i++) @@ -85,7 +85,7 @@ void vp9_row_mt_mem_alloc(VP9_COMP *cpi) { multi_thread_ctxt->allocated_tile_rows = tile_rows; multi_thread_ctxt->allocated_vert_unit_rows = jobs_per_tile_col; - CHECK_MEM_ERROR(cm, multi_thread_ctxt->job_queue, + CHECK_MEM_ERROR(&cm->error, multi_thread_ctxt->job_queue, (JobQueue *)vpx_memalign(32, total_jobs * sizeof(JobQueue))); #if CONFIG_MULTITHREAD diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c index 60720e3ea6..48c21c581e 100644 --- a/vp9/encoder/vp9_speed_features.c +++ b/vp9/encoder/vp9_speed_features.c @@ -676,7 +676,7 @@ static void set_rt_speed_feature_framesize_independent( if (cpi->content_state_sb_fd == NULL && (!cpi->use_svc || svc->spatial_layer_id == svc->number_spatial_layers - 1)) { - CHECK_MEM_ERROR(cm, cpi->content_state_sb_fd, + CHECK_MEM_ERROR(&cm->error, cpi->content_state_sb_fd, (uint8_t *)vpx_calloc( (cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1), sizeof(uint8_t))); @@ -832,13 +832,13 @@ static void set_rt_speed_feature_framesize_independent( } if (cpi->count_arf_frame_usage == NULL) { CHECK_MEM_ERROR( - cm, cpi->count_arf_frame_usage, + &cm->error, cpi->count_arf_frame_usage, (uint8_t *)vpx_calloc((cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1), sizeof(*cpi->count_arf_frame_usage))); } if (cpi->count_lastgolden_frame_usage == NULL) CHECK_MEM_ERROR( - cm, cpi->count_lastgolden_frame_usage, + &cm->error, cpi->count_lastgolden_frame_usage, (uint8_t *)vpx_calloc((cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1), sizeof(*cpi->count_lastgolden_frame_usage))); } diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c index f08d668203..e4721271d9 100644 --- a/vp9/encoder/vp9_svc_layercontext.c +++ b/vp9/encoder/vp9_svc_layercontext.c @@ -163,17 +163,17 @@ void vp9_init_layer_context(VP9_COMP *const cpi) { lc->actual_num_seg1_blocks = 0; lc->actual_num_seg2_blocks = 0; lc->counter_encode_maxq_scene_change = 0; - CHECK_MEM_ERROR(cm, lc->map, + CHECK_MEM_ERROR(&cm->error, lc->map, vpx_malloc(mi_rows * mi_cols * sizeof(*lc->map))); memset(lc->map, 0, mi_rows * mi_cols); last_coded_q_map_size = mi_rows * mi_cols * sizeof(*lc->last_coded_q_map); - CHECK_MEM_ERROR(cm, lc->last_coded_q_map, + CHECK_MEM_ERROR(&cm->error, lc->last_coded_q_map, vpx_malloc(last_coded_q_map_size)); assert(MAXQ <= 255); memset(lc->last_coded_q_map, MAXQ, last_coded_q_map_size); consec_zero_mv_size = mi_rows * mi_cols * sizeof(*lc->consec_zero_mv); - CHECK_MEM_ERROR(cm, lc->consec_zero_mv, + CHECK_MEM_ERROR(&cm->error, lc->consec_zero_mv, vpx_malloc(consec_zero_mv_size)); memset(lc->consec_zero_mv, 0, consec_zero_mv_size); } diff --git a/vp9/encoder/vp9_tpl_model.c b/vp9/encoder/vp9_tpl_model.c index ed771dcb4b..de3783f9a5 100644 --- a/vp9/encoder/vp9_tpl_model.c +++ b/vp9/encoder/vp9_tpl_model.c @@ -1320,7 +1320,7 @@ void vp9_init_tpl_buffer(VP9_COMP *cpi) { vpx_free(cpi->select_mv_arr); CHECK_MEM_ERROR( - cm, cpi->select_mv_arr, + &cm->error, cpi->select_mv_arr, vpx_calloc(mi_rows * mi_cols * 4, sizeof(*cpi->select_mv_arr))); #endif @@ -1335,23 +1335,23 @@ void vp9_init_tpl_buffer(VP9_COMP *cpi) { for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) { vpx_free(cpi->tpl_stats[frame].mv_mode_arr[rf_idx]); CHECK_MEM_ERROR( - cm, cpi->tpl_stats[frame].mv_mode_arr[rf_idx], + &cm->error, cpi->tpl_stats[frame].mv_mode_arr[rf_idx], vpx_calloc(mi_rows * mi_cols * 4, sizeof(*cpi->tpl_stats[frame].mv_mode_arr[rf_idx]))); vpx_free(cpi->tpl_stats[frame].rd_diff_arr[rf_idx]); CHECK_MEM_ERROR( - cm, cpi->tpl_stats[frame].rd_diff_arr[rf_idx], + &cm->error, cpi->tpl_stats[frame].rd_diff_arr[rf_idx], vpx_calloc(mi_rows * mi_cols * 4, sizeof(*cpi->tpl_stats[frame].rd_diff_arr[rf_idx]))); } #endif vpx_free(cpi->tpl_stats[frame].tpl_stats_ptr); - CHECK_MEM_ERROR(cm, cpi->tpl_stats[frame].tpl_stats_ptr, + CHECK_MEM_ERROR(&cm->error, cpi->tpl_stats[frame].tpl_stats_ptr, vpx_calloc(mi_rows * mi_cols, sizeof(*cpi->tpl_stats[frame].tpl_stats_ptr))); vpx_free(cpi->tpl_frame_stats[frame].block_stats_list); CHECK_MEM_ERROR( - cm, cpi->tpl_frame_stats[frame].block_stats_list, + &cm->error, cpi->tpl_frame_stats[frame].block_stats_list, vpx_calloc(mi_rows * mi_cols, sizeof(*cpi->tpl_frame_stats[frame].block_stats_list))); cpi->tpl_frame_stats[frame].num_blocks = mi_rows * mi_cols; diff --git a/vpx/internal/vpx_codec_internal.h b/vpx/internal/vpx_codec_internal.h index 670fe380ed..aae3218738 100644 --- a/vpx/internal/vpx_codec_internal.h +++ b/vpx/internal/vpx_codec_internal.h @@ -48,6 +48,8 @@ #include "../vpx_encoder.h" #include +#include "vpx_config.h" + #ifdef __cplusplus extern "C" { #endif @@ -427,6 +429,27 @@ struct vpx_internal_error_info { jmp_buf jmp; }; +#if CONFIG_DEBUG +#define CHECK_MEM_ERROR(error, lval, expr) \ + do { \ + assert((error)->setjmp); \ + (lval) = (expr); \ + if (!(lval)) \ + vpx_internal_error(error, VPX_CODEC_MEM_ERROR, \ + "Failed to allocate " #lval " at %s:%d", __FILE__, \ + __LINE__); \ + } while (0) +#else +#define CHECK_MEM_ERROR(error, lval, expr) \ + do { \ + assert((error)->setjmp); \ + (lval) = (expr); \ + if (!(lval)) \ + vpx_internal_error(error, VPX_CODEC_MEM_ERROR, \ + "Failed to allocate " #lval); \ + } while (0) +#endif + #define CLANG_ANALYZER_NORETURN #if defined(__has_feature) #if __has_feature(attribute_analyzer_noreturn) From 745c6392f795f43138cfca164b9e54ef895f87b9 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Thu, 4 May 2023 14:28:29 -0400 Subject: [PATCH 0793/1000] Add VpxTplGopStats Contains the size of GOP - also the size of the list of TPL stats for each frame in this GOP. VpxTplGopStats will be the unit for VP9E_GET_TPL_STATS control to return TPL stats from the encoder. Bug: b/273736974 Change-Id: I1682242fc6db4aafcd6314af023aa0d704976585 --- test/encode_api_test.cc | 34 +++++++++++++++------------ test/encode_test_driver.h | 2 +- vp9/encoder/vp9_encoder.c | 1 - vp9/encoder/vp9_encoder.h | 2 +- vp9/encoder/vp9_tpl_model.c | 47 +++++++++++++++++++++++++++---------- vp9/vp9_cx_iface.c | 12 +++++----- vpx/vpx_encoder.h | 6 +++++ 7 files changed, 68 insertions(+), 36 deletions(-) diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc index e435ed872f..2b0aa1fdfe 100644 --- a/test/encode_api_test.cc +++ b/test/encode_api_test.cc @@ -384,12 +384,15 @@ class EncodeApiGetTplStatsTest } } - vpx_codec_err_t AllocateTplList(VpxTplFrameStats **data) { - // Allocate MAX_ARF_GOP_SIZE * sizeof(VpxTplFrameStats) that will be filled - // by VP9E_GET_TPL_STATS - *data = + vpx_codec_err_t AllocateTplList(VpxTplGopStats *data) { + // Allocate MAX_ARF_GOP_SIZE (50) * sizeof(VpxTplFrameStats) that will be + // filled by VP9E_GET_TPL_STATS. + // MAX_ARF_GOP_SIZE is used here because the test doesn't know the size of + // each GOP before getting TPL stats from the encoder. + data->size = 50; + data->frame_stats_list = static_cast(calloc(50, sizeof(VpxTplFrameStats))); - if (*data == nullptr) return VPX_CODEC_MEM_ERROR; + if (data->frame_stats_list == nullptr) return VPX_CODEC_MEM_ERROR; return VPX_CODEC_OK; } @@ -398,22 +401,23 @@ class EncodeApiGetTplStatsTest while (const vpx_codec_cx_pkt_t *pkt = iter.Next()) { switch (pkt->kind) { case VPX_CODEC_CX_FRAME_PKT: { - VpxTplFrameStats *tpl_stats = NULL; + VpxTplGopStats tpl_stats; EXPECT_EQ(AllocateTplList(&tpl_stats), VPX_CODEC_OK); - encoder->Control(VP9E_GET_TPL_STATS, tpl_stats); + encoder->Control(VP9E_GET_TPL_STATS, &tpl_stats); bool stats_not_all_zero = false; - for (unsigned int i = 0; i < cfg_.g_lag_in_frames; i++) { - if (tpl_stats[i].frame_width != 0) { - ASSERT_EQ(tpl_stats[i].frame_width, width_); - ASSERT_EQ(tpl_stats[i].frame_height, height_); - ASSERT_GT(tpl_stats[i].num_blocks, 0); - ASSERT_NE(tpl_stats[i].block_stats_list, nullptr); + for (int i = 0; i < tpl_stats.size; i++) { + VpxTplFrameStats *frame_stats_list = tpl_stats.frame_stats_list; + if (frame_stats_list[i].frame_width != 0) { + ASSERT_EQ(frame_stats_list[i].frame_width, width_); + ASSERT_EQ(frame_stats_list[i].frame_height, height_); + ASSERT_GT(frame_stats_list[i].num_blocks, 0); + ASSERT_NE(frame_stats_list[i].block_stats_list, nullptr); stats_not_all_zero = true; } } ASSERT_TRUE(stats_not_all_zero); // Free the memory right away now as this is only a test. - free(tpl_stats); + free(tpl_stats.frame_stats_list); break; } default: break; @@ -430,7 +434,7 @@ TEST_P(EncodeApiGetTplStatsTest, GetTplStats) { width_ = 352; height_ = 288; ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", width_, - height_, 30, 1, 0, 150); + height_, 30, 1, 0, 50); ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); } diff --git a/test/encode_test_driver.h b/test/encode_test_driver.h index a5cd8306ef..922c49f420 100644 --- a/test/encode_test_driver.h +++ b/test/encode_test_driver.h @@ -154,7 +154,7 @@ class Encoder { ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError(); } - void Control(int ctrl_id, VpxTplFrameStats *arg) { + void Control(int ctrl_id, VpxTplGopStats *arg) { const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg); ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError(); } diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index db21509ce9..5126a971a1 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -2628,7 +2628,6 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf, #endif // CONFIG_NON_GREEDY_MV for (i = 0; i < MAX_ARF_GOP_SIZE; ++i) { cpi->tpl_stats[i].tpl_stats_ptr = NULL; - cpi->tpl_frame_stats[i].block_stats_list = NULL; } // Allocate memory to store variances for a frame. diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 230a8315bc..2528bc2316 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -745,7 +745,7 @@ typedef struct VP9_COMP { BLOCK_SIZE tpl_bsize; TplDepFrame tpl_stats[MAX_ARF_GOP_SIZE]; // Used to store TPL stats before propagation - VpxTplFrameStats tpl_frame_stats[MAX_ARF_GOP_SIZE]; + VpxTplGopStats tpl_gop_stats; YV12_BUFFER_CONFIG *tpl_recon_frames[REF_FRAMES]; EncFrameBuf enc_frame_buf[REF_FRAMES]; #if CONFIG_MULTITHREAD diff --git a/vp9/encoder/vp9_tpl_model.c b/vp9/encoder/vp9_tpl_model.c index de3783f9a5..9f4bafdf83 100644 --- a/vp9/encoder/vp9_tpl_model.c +++ b/vp9/encoder/vp9_tpl_model.c @@ -154,17 +154,43 @@ static void init_tpl_stats(VP9_COMP *cpi) { int frame_idx; for (frame_idx = 0; frame_idx < MAX_ARF_GOP_SIZE; ++frame_idx) { TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx]; - VpxTplFrameStats *tpl_frame_stats = &cpi->tpl_frame_stats[frame_idx]; memset(tpl_frame->tpl_stats_ptr, 0, tpl_frame->height * tpl_frame->width * sizeof(*tpl_frame->tpl_stats_ptr)); - memset(tpl_frame_stats->block_stats_list, 0, - tpl_frame->height * tpl_frame->width * - sizeof(*tpl_frame_stats->block_stats_list)); tpl_frame->is_valid = 0; } } +static void free_tpl_frame_stats_list(VpxTplGopStats *tpl_gop_stats) { + int frame_idx; + for (frame_idx = 0; frame_idx < tpl_gop_stats->size; ++frame_idx) { + vpx_free(tpl_gop_stats->frame_stats_list[frame_idx].block_stats_list); + } + vpx_free(tpl_gop_stats->frame_stats_list); +} + +static void init_tpl_stats_before_propagation( + struct vpx_internal_error_info *error_info, VpxTplGopStats *tpl_gop_stats, + TplDepFrame *tpl_stats, int tpl_gop_frames) { + int frame_idx; + free_tpl_frame_stats_list(tpl_gop_stats); + CHECK_MEM_ERROR( + error_info, tpl_gop_stats->frame_stats_list, + vpx_calloc(tpl_gop_frames, sizeof(*tpl_gop_stats->frame_stats_list))); + tpl_gop_stats->size = tpl_gop_frames; + for (frame_idx = 0; frame_idx < tpl_gop_frames; ++frame_idx) { + const int mi_rows = tpl_stats[frame_idx].height; + const int mi_cols = tpl_stats[frame_idx].width; + CHECK_MEM_ERROR( + error_info, tpl_gop_stats->frame_stats_list[frame_idx].block_stats_list, + vpx_calloc( + mi_rows * mi_cols, + sizeof( + *tpl_gop_stats->frame_stats_list[frame_idx].block_stats_list))); + tpl_gop_stats->frame_stats_list[frame_idx].num_blocks = mi_rows * mi_cols; + } +} + #if CONFIG_NON_GREEDY_MV static uint32_t full_pixel_motion_search(VP9_COMP *cpi, ThreadData *td, MotionField *motion_field, @@ -1106,7 +1132,7 @@ static void mc_flow_dispenser(VP9_COMP *cpi, GF_PICTURE *gf_picture, int frame_idx, BLOCK_SIZE bsize) { TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx]; VpxTplFrameStats *tpl_frame_stats_before_propagation = - &cpi->tpl_frame_stats[frame_idx]; + &cpi->tpl_gop_stats.frame_stats_list[frame_idx]; YV12_BUFFER_CONFIG *this_frame = gf_picture[frame_idx].frame; YV12_BUFFER_CONFIG *ref_frame[MAX_INTER_REF_FRAMES] = { NULL, NULL, NULL }; @@ -1349,12 +1375,6 @@ void vp9_init_tpl_buffer(VP9_COMP *cpi) { CHECK_MEM_ERROR(&cm->error, cpi->tpl_stats[frame].tpl_stats_ptr, vpx_calloc(mi_rows * mi_cols, sizeof(*cpi->tpl_stats[frame].tpl_stats_ptr))); - vpx_free(cpi->tpl_frame_stats[frame].block_stats_list); - CHECK_MEM_ERROR( - &cm->error, cpi->tpl_frame_stats[frame].block_stats_list, - vpx_calloc(mi_rows * mi_cols, - sizeof(*cpi->tpl_frame_stats[frame].block_stats_list))); - cpi->tpl_frame_stats[frame].num_blocks = mi_rows * mi_cols; cpi->tpl_stats[frame].is_valid = 0; cpi->tpl_stats[frame].width = mi_cols; cpi->tpl_stats[frame].height = mi_rows; @@ -1385,8 +1405,8 @@ void vp9_free_tpl_buffer(VP9_COMP *cpi) { #endif vpx_free(cpi->tpl_stats[frame].tpl_stats_ptr); cpi->tpl_stats[frame].is_valid = 0; - vpx_free(cpi->tpl_frame_stats[frame].block_stats_list); } + free_tpl_frame_stats_list(&cpi->tpl_gop_stats); } #if CONFIG_RATE_CTRL @@ -1442,6 +1462,9 @@ void vp9_setup_tpl_stats(VP9_COMP *cpi) { init_tpl_stats(cpi); + init_tpl_stats_before_propagation(&cpi->common.error, &cpi->tpl_gop_stats, + cpi->tpl_stats, tpl_group_frames); + // Backward propagation from tpl_group_frames to 1. for (frame_idx = tpl_group_frames - 1; frame_idx > 0; --frame_idx) { if (gf_picture[frame_idx].update_type == USE_BUF_FRAME) continue; diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index 7150f74284..62128ff28a 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -1789,16 +1789,16 @@ static vpx_codec_err_t ctrl_get_svc_ref_frame_config(vpx_codec_alg_priv_t *ctx, static vpx_codec_err_t ctrl_get_tpl_stats(vpx_codec_alg_priv_t *ctx, va_list args) { VP9_COMP *const cpi = ctx->cpi; - VpxTplFrameStats *data = va_arg(args, VpxTplFrameStats *); + VpxTplGopStats *data = va_arg(args, VpxTplGopStats *); + VpxTplFrameStats *frame_stats_list = cpi->tpl_gop_stats.frame_stats_list; int i; if (data == NULL) { return VPX_CODEC_INVALID_PARAM; } - for (i = 0; i < MAX_ARF_GOP_SIZE; i++) { - data[i].frame_width = cpi->tpl_frame_stats[i].frame_width; - data[i].frame_height = cpi->tpl_frame_stats[i].frame_height; - data[i].num_blocks = cpi->tpl_frame_stats[i].num_blocks; - data[i].block_stats_list = cpi->tpl_frame_stats[i].block_stats_list; + data->size = cpi->tpl_gop_stats.size; + + for (i = 0; i < data->size; i++) { + data->frame_stats_list[i] = frame_stats_list[i]; } return VPX_CODEC_OK; diff --git a/vpx/vpx_encoder.h b/vpx/vpx_encoder.h index 2de8089736..fb95723dd3 100644 --- a/vpx/vpx_encoder.h +++ b/vpx/vpx_encoder.h @@ -271,6 +271,12 @@ typedef struct VpxTplFrameStats { VpxTplBlockStats *block_stats_list; /**< List of tpl stats for each block */ } VpxTplFrameStats; +/*!\brief Temporal dependency model stats for each GOP before propagation */ +typedef struct VpxTplGopStats { + int size; /**< GOP size, also the size of frame_stats_list. */ + VpxTplFrameStats *frame_stats_list; /**< List of tpl stats for each frame */ +} VpxTplGopStats; + /*!\brief Encoded Frame Flags * * This type indicates a bitfield to be passed to vpx_codec_encode(), defining From 3fe13658846564f37399035146132ee2af2b1ba6 Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 5 May 2023 16:56:51 -0700 Subject: [PATCH 0794/1000] configure: add clang-cl vs1[67] arm64 targets x86 and armv7 are skipped for now as the intrinsics will need different flags than cl.exe (/arch:... -> -m...). Bug: webm:1788 Change-Id: I8ca8660a8644cdd84c51cb1f75005e371ba8207d --- README | 4 ++++ build/make/gen_msvs_vcxproj.sh | 38 +++++++++++++++++++++++----------- configure | 2 ++ 3 files changed, 32 insertions(+), 12 deletions(-) diff --git a/README b/README index 87e46f99d4..9fa50038fb 100644 --- a/README +++ b/README @@ -69,6 +69,10 @@ COMPILING THE APPLICATIONS/LIBRARIES: arm64-linux-gcc arm64-win64-gcc arm64-win64-vs15 + arm64-win64-vs16 + arm64-win64-vs16-clangcl + arm64-win64-vs17 + arm64-win64-vs17-clangcl armv7-android-gcc armv7-darwin-gcc armv7-linux-rvct diff --git a/build/make/gen_msvs_vcxproj.sh b/build/make/gen_msvs_vcxproj.sh index 58bb66b9e3..482d88f49b 100755 --- a/build/make/gen_msvs_vcxproj.sh +++ b/build/make/gen_msvs_vcxproj.sh @@ -141,7 +141,17 @@ for opt in "$@"; do case "$opt" in --help|-h) show_help ;; - --target=*) target="${optval}" + --target=*) + target="${optval}" + platform_toolset=$(echo ${target} | awk 'BEGIN{FS="-"}{print $4}') + case "$platform_toolset" in + clangcl) platform_toolset="ClangCl" + ;; + "") + ;; + *) die Unrecognized Visual Studio Platform Toolset in $opt + ;; + esac ;; --out=*) outfile="$optval" ;; @@ -335,17 +345,21 @@ generate_vcxproj() { else tag_content ConfigurationType StaticLibrary fi - if [ "$vs_ver" = "14" ]; then - tag_content PlatformToolset v140 - fi - if [ "$vs_ver" = "15" ]; then - tag_content PlatformToolset v141 - fi - if [ "$vs_ver" = "16" ]; then - tag_content PlatformToolset v142 - fi - if [ "$vs_ver" = "17" ]; then - tag_content PlatformToolset v143 + if [ -n "$platform_toolset" ]; then + tag_content PlatformToolset "$platform_toolset" + else + if [ "$vs_ver" = "14" ]; then + tag_content PlatformToolset v140 + fi + if [ "$vs_ver" = "15" ]; then + tag_content PlatformToolset v141 + fi + if [ "$vs_ver" = "16" ]; then + tag_content PlatformToolset v142 + fi + if [ "$vs_ver" = "17" ]; then + tag_content PlatformToolset v143 + fi fi tag_content CharacterSet Unicode if [ "$config" = "Release" ]; then diff --git a/configure b/configure index 20707727ef..e4e6acd107 100755 --- a/configure +++ b/configure @@ -106,7 +106,9 @@ all_platforms="${all_platforms} arm64-linux-gcc" all_platforms="${all_platforms} arm64-win64-gcc" all_platforms="${all_platforms} arm64-win64-vs15" all_platforms="${all_platforms} arm64-win64-vs16" +all_platforms="${all_platforms} arm64-win64-vs16-clangcl" all_platforms="${all_platforms} arm64-win64-vs17" +all_platforms="${all_platforms} arm64-win64-vs17-clangcl" all_platforms="${all_platforms} armv7-android-gcc" #neon Cortex-A8 all_platforms="${all_platforms} armv7-darwin-gcc" #neon Cortex-A8 all_platforms="${all_platforms} armv7-linux-rvct" #neon Cortex-A8 From 3916e0e1308acd8511269ca4ed6394601f72cc10 Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 5 May 2023 19:00:08 -0700 Subject: [PATCH 0795/1000] gen_msvs_vcxproj: add ARM64EC w/VS >= 2022 rather than define new targets, add a platform to the arm64 list as they share the same configuration. Bug: webm:1788 Change-Id: Iac020280b1103fb12b559f21439aeff26568fba4 --- build/make/gen_msvs_vcxproj.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/build/make/gen_msvs_vcxproj.sh b/build/make/gen_msvs_vcxproj.sh index 482d88f49b..1e1db05bb2 100755 --- a/build/make/gen_msvs_vcxproj.sh +++ b/build/make/gen_msvs_vcxproj.sh @@ -269,6 +269,10 @@ case "$target" in ;; arm64*) platforms[0]="ARM64" + # As of Visual Studio 2022 17.5.5, clang-cl does not support ARM64EC. + if [ "$vs_ver" -ge 17 -a "$platform_toolset" != "ClangCl" ]; then + platforms[1]="ARM64EC" + fi asm_Debug_cmdline="armasm64 -nologo -oldit "%(FullPath)"" asm_Release_cmdline="armasm64 -nologo -oldit "%(FullPath)"" ;; From fbbe1d0115efc41a3c7001cc161aa1ec64a9f711 Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 8 May 2023 11:48:15 -0700 Subject: [PATCH 0796/1000] vp8_macros_msa.h: clear -Wshadow warnings Bug: webm:1793 Change-Id: Ia940b06bd23a915a050432e03bb630567e891d8d --- vp8/common/mips/msa/vp8_macros_msa.h | 258 +++++++++++++-------------- 1 file changed, 129 insertions(+), 129 deletions(-) diff --git a/vp8/common/mips/msa/vp8_macros_msa.h b/vp8/common/mips/msa/vp8_macros_msa.h index 7cb3c98690..cc85b9a1f7 100644 --- a/vp8/common/mips/msa/vp8_macros_msa.h +++ b/vp8/common/mips/msa/vp8_macros_msa.h @@ -40,160 +40,160 @@ #define ST_SW(...) ST_W(v4i32, __VA_ARGS__) #if (__mips_isa_rev >= 6) -#define LW(psrc) \ - ({ \ - const uint8_t *psrc_m = (const uint8_t *)(psrc); \ - uint32_t val_m; \ - \ - asm volatile("lw %[val_m], %[psrc_m] \n\t" \ - \ - : [val_m] "=r"(val_m) \ - : [psrc_m] "m"(*psrc_m)); \ - \ - val_m; \ +#define LW(psrc) \ + ({ \ + const uint8_t *lw_psrc_m = (const uint8_t *)(psrc); \ + uint32_t lw_val_m; \ + \ + asm volatile("lw %[lw_val_m], %[lw_psrc_m] \n\t" \ + \ + : [lw_val_m] "=r"(lw_val_m) \ + : [lw_psrc_m] "m"(*lw_psrc_m)); \ + \ + lw_val_m; \ }) #if (__mips == 64) -#define LD(psrc) \ - ({ \ - const uint8_t *psrc_m = (const uint8_t *)(psrc); \ - uint64_t val_m = 0; \ - \ - asm volatile("ld %[val_m], %[psrc_m] \n\t" \ - \ - : [val_m] "=r"(val_m) \ - : [psrc_m] "m"(*psrc_m)); \ - \ - val_m; \ +#define LD(psrc) \ + ({ \ + const uint8_t *ld_psrc_m = (const uint8_t *)(psrc); \ + uint64_t ld_val_m = 0; \ + \ + asm volatile("ld %[ld_val_m], %[ld_psrc_m] \n\t" \ + \ + : [ld_val_m] "=r"(ld_val_m) \ + : [ld_psrc_m] "m"(*ld_psrc_m)); \ + \ + ld_val_m; \ }) #else // !(__mips == 64) -#define LD(psrc) \ - ({ \ - const uint8_t *psrc_ld = (const uint8_t *)(psrc); \ - uint32_t val0_m, val1_m; \ - uint64_t val_m = 0; \ - \ - val0_m = LW(psrc_ld); \ - val1_m = LW(psrc_ld + 4); \ - \ - val_m = (uint64_t)(val1_m); \ - val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \ - val_m = (uint64_t)(val_m | (uint64_t)val0_m); \ - \ - val_m; \ +#define LD(psrc) \ + ({ \ + const uint8_t *ld_psrc_m = (const uint8_t *)(psrc); \ + uint32_t ld_val0_m, ld_val1_m; \ + uint64_t ld_val_m = 0; \ + \ + ld_val0_m = LW(ld_psrc_m); \ + ld_val1_m = LW(ld_psrc_m + 4); \ + \ + ld_val_m = (uint64_t)(ld_val1_m); \ + ld_val_m = (uint64_t)((ld_val_m << 32) & 0xFFFFFFFF00000000); \ + ld_val_m = (uint64_t)(ld_val_m | (uint64_t)ld_val0_m); \ + \ + ld_val_m; \ }) #endif // (__mips == 64) -#define SH(val, pdst) \ - { \ - uint8_t *pdst_m = (uint8_t *)(pdst); \ - const uint16_t val_m = (val); \ - \ - asm volatile("sh %[val_m], %[pdst_m] \n\t" \ - \ - : [pdst_m] "=m"(*pdst_m) \ - : [val_m] "r"(val_m)); \ +#define SH(val, pdst) \ + { \ + uint8_t *sh_pdst_m = (uint8_t *)(pdst); \ + const uint16_t sh_val_m = (val); \ + \ + asm volatile("sh %[sh_val_m], %[sh_pdst_m] \n\t" \ + \ + : [sh_pdst_m] "=m"(*sh_pdst_m) \ + : [sh_val_m] "r"(sh_val_m)); \ } -#define SW(val, pdst) \ - { \ - uint8_t *pdst_m = (uint8_t *)(pdst); \ - const uint32_t val_m = (val); \ - \ - asm volatile("sw %[val_m], %[pdst_m] \n\t" \ - \ - : [pdst_m] "=m"(*pdst_m) \ - : [val_m] "r"(val_m)); \ +#define SW(val, pdst) \ + { \ + uint8_t *sw_pdst_m = (uint8_t *)(pdst); \ + const uint32_t sw_val_m = (val); \ + \ + asm volatile("sw %[sw_val_m], %[sw_pdst_m] \n\t" \ + \ + : [sw_pdst_m] "=m"(*sw_pdst_m) \ + : [sw_val_m] "r"(sw_val_m)); \ } -#define SD(val, pdst) \ - { \ - uint8_t *pdst_m = (uint8_t *)(pdst); \ - const uint64_t val_m = (val); \ - \ - asm volatile("sd %[val_m], %[pdst_m] \n\t" \ - \ - : [pdst_m] "=m"(*pdst_m) \ - : [val_m] "r"(val_m)); \ +#define SD(val, pdst) \ + { \ + uint8_t *sd_pdst_m = (uint8_t *)(pdst); \ + const uint64_t sd_val_m = (val); \ + \ + asm volatile("sd %[sd_val_m], %[sd_pdst_m] \n\t" \ + \ + : [sd_pdst_m] "=m"(*sd_pdst_m) \ + : [sd_val_m] "r"(sd_val_m)); \ } #else // !(__mips_isa_rev >= 6) -#define LW(psrc) \ - ({ \ - const uint8_t *psrc_m = (const uint8_t *)(psrc); \ - uint32_t val_m; \ - \ - asm volatile( \ - "lwr %[val_m], 0(%[psrc_m]) \n\t" \ - "lwl %[val_m], 3(%[psrc_m]) \n\t" \ - : [val_m] "=&r"(val_m) \ - : [psrc_m] "r"(psrc_m)); \ - \ - val_m; \ +#define LW(psrc) \ + ({ \ + const uint8_t *lw_psrc_m = (const uint8_t *)(psrc); \ + uint32_t lw_val_m; \ + \ + asm volatile( \ + "lwr %[lw_val_m], 0(%[lw_psrc_m]) \n\t" \ + "lwl %[lw_val_m], 3(%[lw_psrc_m]) \n\t" \ + : [lw_val_m] "=&r"(lw_val_m) \ + : [lw_psrc_m] "r"(lw_psrc_m)); \ + \ + lw_val_m; \ }) #if (__mips == 64) -#define LD(psrc) \ - ({ \ - const uint8_t *psrc_m = (const uint8_t *)(psrc); \ - uint64_t val_m = 0; \ - \ - asm volatile( \ - "ldr %[val_m], 0(%[psrc_m]) \n\t" \ - "ldl %[val_m], 7(%[psrc_m]) \n\t" \ - : [val_m] "=&r"(val_m) \ - : [psrc_m] "r"(psrc_m)); \ - \ - val_m; \ +#define LD(psrc) \ + ({ \ + const uint8_t *ld_psrc_m = (const uint8_t *)(psrc); \ + uint64_t ld_val_m = 0; \ + \ + asm volatile( \ + "ldr %[ld_val_m], 0(%[ld_psrc_m]) \n\t" \ + "ldl %[ld_val_m], 7(%[ld_psrc_m]) \n\t" \ + : [ld_val_m] "=&r"(ld_val_m) \ + : [ld_psrc_m] "r"(ld_psrc_m)); \ + \ + ld_val_m; \ }) #else // !(__mips == 64) -#define LD(psrc) \ - ({ \ - const uint8_t *psrc_m1 = (const uint8_t *)(psrc); \ - uint32_t val0_m, val1_m; \ - uint64_t val_m = 0; \ - \ - val0_m = LW(psrc_m1); \ - val1_m = LW(psrc_m1 + 4); \ - \ - val_m = (uint64_t)(val1_m); \ - val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \ - val_m = (uint64_t)(val_m | (uint64_t)val0_m); \ - \ - val_m; \ +#define LD(psrc) \ + ({ \ + const uint8_t *ld_psrc_m1 = (const uint8_t *)(psrc); \ + uint32_t ld_val0_m, ld_val1_m; \ + uint64_t ld_val_m = 0; \ + \ + ld_val0_m = LW(ld_psrc_m1); \ + ld_val1_m = LW(ld_psrc_m1 + 4); \ + \ + ld_val_m = (uint64_t)(ld_val1_m); \ + ld_val_m = (uint64_t)((ld_val_m << 32) & 0xFFFFFFFF00000000); \ + ld_val_m = (uint64_t)(ld_val_m | (uint64_t)ld_val0_m); \ + \ + ld_val_m; \ }) #endif // (__mips == 64) -#define SH(val, pdst) \ - { \ - uint8_t *pdst_m = (uint8_t *)(pdst); \ - const uint16_t val_m = (val); \ - \ - asm volatile("ush %[val_m], %[pdst_m] \n\t" \ - \ - : [pdst_m] "=m"(*pdst_m) \ - : [val_m] "r"(val_m)); \ +#define SH(val, pdst) \ + { \ + uint8_t *sh_pdst_m = (uint8_t *)(pdst); \ + const uint16_t sh_val_m = (val); \ + \ + asm volatile("ush %[sh_val_m], %[sh_pdst_m] \n\t" \ + \ + : [sh_pdst_m] "=m"(*sh_pdst_m) \ + : [sh_val_m] "r"(sh_val_m)); \ } -#define SW(val, pdst) \ - { \ - uint8_t *pdst_m = (uint8_t *)(pdst); \ - const uint32_t val_m = (val); \ - \ - asm volatile("usw %[val_m], %[pdst_m] \n\t" \ - \ - : [pdst_m] "=m"(*pdst_m) \ - : [val_m] "r"(val_m)); \ +#define SW(val, pdst) \ + { \ + uint8_t *sw_pdst_m = (uint8_t *)(pdst); \ + const uint32_t sw_val_m = (val); \ + \ + asm volatile("usw %[sw_val_m], %[sw_pdst_m] \n\t" \ + \ + : [sw_pdst_m] "=m"(*sw_pdst_m) \ + : [sw_val_m] "r"(sw_val_m)); \ } -#define SD(val, pdst) \ - { \ - uint8_t *pdst_m1 = (uint8_t *)(pdst); \ - uint32_t val0_m, val1_m; \ - \ - val0_m = (uint32_t)((val)&0x00000000FFFFFFFF); \ - val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \ - \ - SW(val0_m, pdst_m1); \ - SW(val1_m, pdst_m1 + 4); \ +#define SD(val, pdst) \ + { \ + uint8_t *sd_pdst_m1 = (uint8_t *)(pdst); \ + uint32_t sd_val0_m, sd_val1_m; \ + \ + sd_val0_m = (uint32_t)((val)&0x00000000FFFFFFFF); \ + sd_val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \ + \ + SW(sd_val0_m, sd_pdst_m1); \ + SW(sd_val1_m, sd_pdst_m1 + 4); \ } #endif // (__mips_isa_rev >= 6) From 457b7f59860955415a23c20c535fc13fde51936f Mon Sep 17 00:00:00 2001 From: Anupam Pandey Date: Mon, 8 May 2023 12:10:09 +0530 Subject: [PATCH 0797/1000] Add AVX2 intrinsic for vpx_comp_avg_pred() function The module level scaling w.r.t C function (timer based) for existing (SSE2) and new AVX2 intrinsics: If ref_padding = 0 Block Scaling size SSE2 AVX2 8x4 3.24x 3.24x 8x8 4.22x 4.90x 8x16 5.91x 5.93x 16x8 1.63x 3.52x 16x16 1.53x 4.19x 16x32 1.38x 4.82x 32x16 1.28x 3.08x 32x32 1.45x 3.13x 32x64 1.38x 3.04x 64x32 1.39x 2.12x 64x64 1.46x 2.24x If ref_padding = 8 Block Scaling size SSE2 AVX2 8x4 3.20x 3.21x 8x8 4.61x 4.83x 8x16 5.50x 6.45x 16x8 1.56x 3.35x 16x16 1.53x 4.19x 16x32 1.37x 4.83x 32x16 1.28x 3.07x 32x32 1.46x 3.29x 32x64 1.38x 3.22x 64x32 1.38x 2.14x 64x64 1.38x 2.12x This is a bit-exact change. Change-Id: I72c5d155f64d0c630bc8c3aef21dc8bbd045d9e6 --- test/comp_avg_pred_test.cc | 21 ++++--- vp9/encoder/vp9_mcomp.c | 8 +-- vp9/encoder/vp9_rdopt.c | 4 +- vpx_dsp/sad.c | 2 +- vpx_dsp/variance.c | 2 +- vpx_dsp/vpx_dsp.mk | 1 + vpx_dsp/vpx_dsp_rtcd_defs.pl | 2 +- vpx_dsp/x86/avg_pred_avx2.c | 111 +++++++++++++++++++++++++++++++++++ 8 files changed, 134 insertions(+), 17 deletions(-) create mode 100644 vpx_dsp/x86/avg_pred_avx2.c diff --git a/test/comp_avg_pred_test.cc b/test/comp_avg_pred_test.cc index f747c3524e..d8fabd5bef 100644 --- a/test/comp_avg_pred_test.cc +++ b/test/comp_avg_pred_test.cc @@ -81,11 +81,11 @@ void AvgPredTest::TestSizeCombinations() { // Only the reference buffer may have a stride not equal to width. Buffer ref = Buffer(width, height, ref_padding ? 8 : 0); ASSERT_TRUE(ref.Init()); - Buffer pred = Buffer(width, height, 0, 16); + Buffer pred = Buffer(width, height, 0, 32); ASSERT_TRUE(pred.Init()); - Buffer avg_ref = Buffer(width, height, 0, 16); + Buffer avg_ref = Buffer(width, height, 0, 32); ASSERT_TRUE(avg_ref.Init()); - Buffer avg_chk = Buffer(width, height, 0, 16); + Buffer avg_chk = Buffer(width, height, 0, 32); ASSERT_TRUE(avg_chk.Init()); const int bitdepth_mask = (1 << bitdepth) - 1; for (int h = 0; h < height; ++h) { @@ -121,11 +121,11 @@ void AvgPredTest::TestCompareReferenceRandom() { const int height = 32; Buffer ref = Buffer(width, height, 8); ASSERT_TRUE(ref.Init()); - Buffer pred = Buffer(width, height, 0, 16); + Buffer pred = Buffer(width, height, 0, 32); ASSERT_TRUE(pred.Init()); - Buffer avg_ref = Buffer(width, height, 0, 16); + Buffer avg_ref = Buffer(width, height, 0, 32); ASSERT_TRUE(avg_ref.Init()); - Buffer avg_chk = Buffer(width, height, 0, 16); + Buffer avg_chk = Buffer(width, height, 0, 32); ASSERT_TRUE(avg_chk.Init()); for (int i = 0; i < 500; ++i) { @@ -167,9 +167,9 @@ void AvgPredTest::TestSpeed() { const int height = 1 << height_pow; Buffer ref = Buffer(width, height, ref_padding ? 8 : 0); ASSERT_TRUE(ref.Init()); - Buffer pred = Buffer(width, height, 0, 16); + Buffer pred = Buffer(width, height, 0, 32); ASSERT_TRUE(pred.Init()); - Buffer avg = Buffer(width, height, 0, 16); + Buffer avg = Buffer(width, height, 0, 32); ASSERT_TRUE(avg.Init()); const int bitdepth_mask = (1 << bitdepth) - 1; for (int h = 0; h < height; ++h) { @@ -217,6 +217,11 @@ INSTANTIATE_TEST_SUITE_P(SSE2, AvgPredTestLBD, ::testing::Values(&vpx_comp_avg_pred_sse2)); #endif // HAVE_SSE2 +#if HAVE_AVX2 +INSTANTIATE_TEST_SUITE_P(AVX2, AvgPredTestLBD, + ::testing::Values(&vpx_comp_avg_pred_avx2)); +#endif // HAVE_AVX2 + #if HAVE_NEON INSTANTIATE_TEST_SUITE_P(NEON, AvgPredTestLBD, ::testing::Values(&vpx_comp_avg_pred_neon)); diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c index 64e9ef0f91..0ea0f85e42 100644 --- a/vp9/encoder/vp9_mcomp.c +++ b/vp9/encoder/vp9_mcomp.c @@ -297,7 +297,7 @@ static unsigned int setup_center_error( besterr = vfp->vf(CONVERT_TO_BYTEPTR(comp_pred16), w, src, src_stride, sse1); } else { - DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]); + DECLARE_ALIGNED(32, uint8_t, comp_pred[64 * 64]); vpx_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride); besterr = vfp->vf(comp_pred, w, src, src_stride, sse1); } @@ -312,7 +312,7 @@ static unsigned int setup_center_error( uint32_t besterr; (void)xd; if (second_pred != NULL) { - DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]); + DECLARE_ALIGNED(32, uint8_t, comp_pred[64 * 64]); vpx_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride); besterr = vfp->vf(comp_pred, w, src, src_stride, sse1); } else { @@ -635,7 +635,7 @@ static int accurate_sub_pel_search( vp9_build_inter_predictor(pre_address, y_stride, pred, w, this_mv, sf, w, h, 0, kernel, MV_PRECISION_Q3, 0, 0); if (second_pred != NULL) { - DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]); + DECLARE_ALIGNED(32, uint8_t, comp_pred[64 * 64]); vpx_comp_avg_pred(comp_pred, second_pred, w, h, pred, w); besterr = vfp->vf(comp_pred, w, src_address, src_stride, sse); } else { @@ -654,7 +654,7 @@ static int accurate_sub_pel_search( vp9_build_inter_predictor(pre_address, y_stride, pred, w, this_mv, sf, w, h, 0, kernel, MV_PRECISION_Q3, 0, 0); if (second_pred != NULL) { - DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]); + DECLARE_ALIGNED(32, uint8_t, comp_pred[64 * 64]); vpx_comp_avg_pred(comp_pred, second_pred, w, h, pred, w); besterr = vfp->vf(comp_pred, w, src_address, src_stride, sse); } else { diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index f051c62791..464705a678 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -1937,10 +1937,10 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, // Prediction buffer from second frame. #if CONFIG_VP9_HIGHBITDEPTH - DECLARE_ALIGNED(16, uint16_t, second_pred_alloc_16[64 * 64]); + DECLARE_ALIGNED(32, uint16_t, second_pred_alloc_16[64 * 64]); uint8_t *second_pred; #else - DECLARE_ALIGNED(16, uint8_t, second_pred[64 * 64]); + DECLARE_ALIGNED(32, uint8_t, second_pred[64 * 64]); #endif // CONFIG_VP9_HIGHBITDEPTH // Check number of iterations do not exceed the max diff --git a/vpx_dsp/sad.c b/vpx_dsp/sad.c index 619d7aa956..2a4c81d588 100644 --- a/vpx_dsp/sad.c +++ b/vpx_dsp/sad.c @@ -40,7 +40,7 @@ static INLINE unsigned int sad(const uint8_t *src_ptr, int src_stride, unsigned int vpx_sad##m##x##n##_avg_c( \ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ int ref_stride, const uint8_t *second_pred) { \ - DECLARE_ALIGNED(16, uint8_t, comp_pred[m * n]); \ + DECLARE_ALIGNED(32, uint8_t, comp_pred[m * n]); \ vpx_comp_avg_pred_c(comp_pred, second_pred, m, n, ref_ptr, ref_stride); \ return sad(src_ptr, src_stride, comp_pred, m, m, n); \ } \ diff --git a/vpx_dsp/variance.c b/vpx_dsp/variance.c index ce1e8382b9..a6793efb68 100644 --- a/vpx_dsp/variance.c +++ b/vpx_dsp/variance.c @@ -156,7 +156,7 @@ static void var_filter_block2d_bil_second_pass( const uint8_t *second_pred) { \ uint16_t fdata3[(H + 1) * W]; \ uint8_t temp2[H * W]; \ - DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \ + DECLARE_ALIGNED(32, uint8_t, temp3[H * W]); \ \ var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_stride, 1, H + 1, \ W, bilinear_filters[x_offset]); \ diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index 67d3fb0e29..04969f37e1 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -424,6 +424,7 @@ DSP_SRCS-$(HAVE_LSX) += loongarch/avg_pred_lsx.c DSP_SRCS-$(HAVE_MMI) += mips/variance_mmi.c DSP_SRCS-$(HAVE_SSE2) += x86/avg_pred_sse2.c +DSP_SRCS-$(HAVE_AVX2) += x86/avg_pred_avx2.c DSP_SRCS-$(HAVE_SSE2) += x86/variance_sse2.c # Contains SSE2 and SSSE3 DSP_SRCS-$(HAVE_AVX2) += x86/variance_avx2.c DSP_SRCS-$(HAVE_VSX) += ppc/variance_vsx.c diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index cae4ca8116..f20f4e0454 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -1321,7 +1321,7 @@ () specialize qw/vpx_get4x4sse_cs neon msa vsx/; add_proto qw/void vpx_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride"; - specialize qw/vpx_comp_avg_pred neon sse2 vsx lsx/; + specialize qw/vpx_comp_avg_pred neon sse2 avx2 vsx lsx/; # # Subpixel Variance diff --git a/vpx_dsp/x86/avg_pred_avx2.c b/vpx_dsp/x86/avg_pred_avx2.c new file mode 100644 index 0000000000..f4357998c9 --- /dev/null +++ b/vpx_dsp/x86/avg_pred_avx2.c @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_dsp_rtcd.h" + +void vpx_comp_avg_pred_avx2(uint8_t *comp_pred, const uint8_t *pred, int width, + int height, const uint8_t *ref, int ref_stride) { + int row = 0; + // comp_pred and pred must be 32 byte aligned. + assert(((intptr_t)comp_pred % 32) == 0); + assert(((intptr_t)pred % 32) == 0); + + if (width == 8) { + assert(height % 4 == 0); + do { + const __m256i p = _mm256_load_si256((const __m256i *)pred); + const __m128i r_0 = _mm_loadl_epi64((const __m128i *)ref); + const __m128i r_1 = + _mm_loadl_epi64((const __m128i *)(ref + 2 * ref_stride)); + + const __m128i r1 = _mm_castps_si128(_mm_loadh_pi( + _mm_castsi128_ps(r_0), (const __m64 *)(ref + ref_stride))); + const __m128i r2 = _mm_castps_si128(_mm_loadh_pi( + _mm_castsi128_ps(r_1), (const __m64 *)(ref + 3 * ref_stride))); + + const __m256i ref_0123 = + _mm256_inserti128_si256(_mm256_castsi128_si256(r1), r2, 1); + const __m256i avg = _mm256_avg_epu8(p, ref_0123); + + _mm256_store_si256((__m256i *)comp_pred, avg); + + row += 4; + pred += 32; + comp_pred += 32; + ref += 4 * ref_stride; + } while (row < height); + } else if (width == 16) { + assert(height % 4 == 0); + do { + const __m256i pred_0 = _mm256_load_si256((const __m256i *)pred); + const __m256i pred_1 = _mm256_load_si256((const __m256i *)(pred + 32)); + const __m256i tmp0 = + _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)ref)); + const __m256i ref_0 = _mm256_inserti128_si256( + tmp0, _mm_loadu_si128((const __m128i *)(ref + ref_stride)), 1); + const __m256i tmp1 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(ref + 2 * ref_stride))); + const __m256i ref_1 = _mm256_inserti128_si256( + tmp1, _mm_loadu_si128((const __m128i *)(ref + 3 * ref_stride)), 1); + const __m256i average_0 = _mm256_avg_epu8(pred_0, ref_0); + const __m256i average_1 = _mm256_avg_epu8(pred_1, ref_1); + _mm256_store_si256((__m256i *)comp_pred, average_0); + _mm256_store_si256((__m256i *)(comp_pred + 32), average_1); + + row += 4; + pred += 64; + comp_pred += 64; + ref += 4 * ref_stride; + } while (row < height); + } else if (width == 32) { + assert(height % 2 == 0); + do { + const __m256i pred_0 = _mm256_load_si256((const __m256i *)pred); + const __m256i pred_1 = _mm256_load_si256((const __m256i *)(pred + 32)); + const __m256i ref_0 = _mm256_loadu_si256((const __m256i *)ref); + const __m256i ref_1 = + _mm256_loadu_si256((const __m256i *)(ref + ref_stride)); + const __m256i average_0 = _mm256_avg_epu8(pred_0, ref_0); + const __m256i average_1 = _mm256_avg_epu8(pred_1, ref_1); + _mm256_store_si256((__m256i *)comp_pred, average_0); + _mm256_store_si256((__m256i *)(comp_pred + 32), average_1); + + row += 2; + pred += 64; + comp_pred += 64; + ref += 2 * ref_stride; + } while (row < height); + } else if (width % 64 == 0) { + do { + int x; + for (x = 0; x < width; x += 64) { + const __m256i pred_0 = _mm256_load_si256((const __m256i *)(pred + x)); + const __m256i pred_1 = + _mm256_load_si256((const __m256i *)(pred + x + 32)); + const __m256i ref_0 = _mm256_loadu_si256((const __m256i *)(ref + x)); + const __m256i ref_1 = + _mm256_loadu_si256((const __m256i *)(ref + x + 32)); + const __m256i average_0 = _mm256_avg_epu8(pred_0, ref_0); + const __m256i average_1 = _mm256_avg_epu8(pred_1, ref_1); + _mm256_store_si256((__m256i *)(comp_pred + x), average_0); + _mm256_store_si256((__m256i *)(comp_pred + x + 32), average_1); + } + row++; + pred += width; + comp_pred += width; + ref += ref_stride; + } while (row < height); + } else { + vpx_comp_avg_pred_sse2(comp_pred, pred, width, height, ref, ref_stride); + } +} From 9e0fc37f6f68685066f3e71e1cd0605d6ee2205e Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 17 Apr 2023 13:42:11 -0700 Subject: [PATCH 0798/1000] configure: add -Wshadow libraries under third_party/ are out of scope for this change. Bug: webm:1793 Change-Id: I562065a3c0ea9fdfc9615d1a6b1ae47da79b8ce0 --- build/make/configure.sh | 1 + configure | 6 +++++- examples.mk | 1 + test/test.mk | 1 + 4 files changed, 8 insertions(+), 1 deletion(-) diff --git a/build/make/configure.sh b/build/make/configure.sh index ec9af5e63d..6fd67f1623 100644 --- a/build/make/configure.sh +++ b/build/make/configure.sh @@ -521,6 +521,7 @@ AS_SFX = ${AS_SFX:-.asm} EXE_SFX = ${EXE_SFX} VCPROJ_SFX = ${VCPROJ_SFX} RTCD_OPTIONS = ${RTCD_OPTIONS} +LIBWEBM_CXXFLAGS = ${LIBWEBM_CXXFLAGS} LIBYUV_CXXFLAGS = ${LIBYUV_CXXFLAGS} EOF diff --git a/configure b/configure index e4e6acd107..b73436b47e 100755 --- a/configure +++ b/configure @@ -649,6 +649,7 @@ process_toolchain() { check_add_cflags -Wimplicit-function-declaration check_add_cflags -Wmissing-declarations check_add_cflags -Wmissing-prototypes + check_add_cflags -Wshadow check_add_cflags -Wuninitialized check_add_cflags -Wunreachable-code-loop-increment check_add_cflags -Wunused @@ -679,13 +680,16 @@ process_toolchain() { check_add_cxxflags -Wc++17-extensions check_add_cxxflags -Wc++20-extensions - # disable some warnings specific to libyuv. + # disable some warnings specific to libyuv / libwebm. check_cxxflags -Wno-missing-declarations \ && LIBYUV_CXXFLAGS="${LIBYUV_CXXFLAGS} -Wno-missing-declarations" check_cxxflags -Wno-missing-prototypes \ && LIBYUV_CXXFLAGS="${LIBYUV_CXXFLAGS} -Wno-missing-prototypes" check_cxxflags -Wno-pass-failed \ && LIBYUV_CXXFLAGS="${LIBYUV_CXXFLAGS} -Wno-pass-failed" + check_cxxflags -Wno-shadow \ + && LIBWEBM_CXXFLAGS="${LIBWEBM_CXXFLAGS} -Wno-shadow" \ + && LIBYUV_CXXFLAGS="${LIBYUV_CXXFLAGS} -Wno-shadow" check_cxxflags -Wno-unused-parameter \ && LIBYUV_CXXFLAGS="${LIBYUV_CXXFLAGS} -Wno-unused-parameter" fi diff --git a/examples.mk b/examples.mk index 42886f1e15..9e506dcd47 100644 --- a/examples.mk +++ b/examples.mk @@ -57,6 +57,7 @@ LIBWEBM_PARSER_SRCS = third_party/libwebm/mkvparser/mkvparser.cc \ # Add compile flags and include path for libwebm sources. ifeq ($(CONFIG_WEBM_IO),yes) CXXFLAGS += -D__STDC_CONSTANT_MACROS -D__STDC_LIMIT_MACROS + $(BUILD_PFX)third_party/libwebm/%.cc.o: CXXFLAGS += $(LIBWEBM_CXXFLAGS) INC_PATH-yes += $(SRC_PATH_BARE)/third_party/libwebm endif diff --git a/test/test.mk b/test/test.mk index bbcdd0c6e4..b64e89bb43 100644 --- a/test/test.mk +++ b/test/test.mk @@ -85,6 +85,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_DECODERS) += ../webmdec.cc LIBVPX_TEST_SRCS-$(CONFIG_DECODERS) += ../webmdec.h LIBVPX_TEST_SRCS-$(CONFIG_DECODERS) += webm_video_source.h LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += vp9_skip_loopfilter_test.cc +$(BUILD_PFX)third_party/libwebm/%.cc.o: CXXFLAGS += $(LIBWEBM_CXXFLAGS) endif LIBVPX_TEST_SRCS-$(CONFIG_DECODERS) += decode_api_test.cc From 8ecf58432118b672fe3f4a54725bc63caac262aa Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Mon, 8 May 2023 17:41:26 +0100 Subject: [PATCH 0799/1000] Refactor standard bitdepth Neon convolution functions 1) Use #define constant instead of magic numbers for right shifts. 2) Move saturating narrow into helper functions that return 4-element result vectors. 3) Use mem_neon.h helpers for load/store sequences in Armv8.0 paths. 4) Tidy up: assert conditions and some longer variable names. 5) Prefer != 0 to > 0 where possible for loop termination conditions. Change-Id: Idfcac43ca38faf729dca07b8cc8f7f45ad264d24 --- vpx_dsp/arm/vpx_convolve8_neon.c | 786 +++++++++++++------------------ vpx_dsp/arm/vpx_convolve8_neon.h | 35 +- 2 files changed, 349 insertions(+), 472 deletions(-) diff --git a/vpx_dsp/arm/vpx_convolve8_neon.c b/vpx_dsp/arm/vpx_convolve8_neon.c index b312cc747c..f217a3f35d 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon.c +++ b/vpx_dsp/arm/vpx_convolve8_neon.c @@ -17,6 +17,7 @@ #include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/transpose_neon.h" #include "vpx_dsp/arm/vpx_convolve8_neon.h" +#include "vpx_dsp/vpx_filter.h" #include "vpx_ports/mem.h" // Note: @@ -64,8 +65,8 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4])); uint8x16_t s0, s1, s2, s3; - assert(!((intptr_t)dst & 3)); - assert(!(dst_stride & 3)); + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); assert(x_step_q4 == 16); (void)x_step_q4; @@ -75,22 +76,19 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, src -= 3; if (w == 4) { - const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); do { - int32x4_t t0, t1, t2, t3; - int16x8_t t01, t23; + int16x4_t t0, t1, t2, t3; uint8x8_t d01, d23; load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); - t0 = convolve8_4_usdot(s0, filters, permute_tbl); - t1 = convolve8_4_usdot(s1, filters, permute_tbl); - t2 = convolve8_4_usdot(s2, filters, permute_tbl); - t3 = convolve8_4_usdot(s3, filters, permute_tbl); - t01 = vcombine_s16(vqmovn_s32(t0), vqmovn_s32(t1)); - t23 = vcombine_s16(vqmovn_s32(t2), vqmovn_s32(t3)); - d01 = vqrshrun_n_s16(t01, 7); - d23 = vqrshrun_n_s16(t23, 7); + t0 = convolve8_4_usdot(s0, filters, perm_tbl); + t1 = convolve8_4_usdot(s1, filters, perm_tbl); + t2 = convolve8_4_usdot(s2, filters, perm_tbl); + t3 = convolve8_4_usdot(s3, filters, perm_tbl); + d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS); store_u8(dst + 0 * dst_stride, dst_stride, d01); store_u8(dst + 2 * dst_stride, dst_stride, d23); @@ -100,7 +98,7 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, h -= 4; } while (h > 0); } else { - const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl); const uint8_t *s; uint8_t *d; int width; @@ -113,17 +111,17 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, do { load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); - d0 = convolve8_8_usdot(s0, filters, permute_tbl); - d1 = convolve8_8_usdot(s1, filters, permute_tbl); - d2 = convolve8_8_usdot(s2, filters, permute_tbl); - d3 = convolve8_8_usdot(s3, filters, permute_tbl); + d0 = convolve8_8_usdot(s0, filters, perm_tbl); + d1 = convolve8_8_usdot(s1, filters, perm_tbl); + d2 = convolve8_8_usdot(s2, filters, perm_tbl); + d3 = convolve8_8_usdot(s3, filters, perm_tbl); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); s += 8; d += 8; width -= 8; - } while (width > 0); + } while (width != 0); src += 4 * src_stride; dst += 4 * dst_stride; h -= 4; @@ -139,8 +137,8 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4])); uint8x16_t s0, s1, s2, s3; - assert(!((intptr_t)dst & 3)); - assert(!(dst_stride & 3)); + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); assert(x_step_q4 == 16); (void)x_step_q4; @@ -150,24 +148,19 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, src -= 3; if (w == 4) { - const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); do { - int32x4_t t0, t1, t2, t3; - int16x8_t t01, t23; + int16x4_t t0, t1, t2, t3; uint8x8_t d01, d23, dd01, dd23; - dd01 = vdup_n_u8(0); - dd23 = vdup_n_u8(0); load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); - t0 = convolve8_4_usdot(s0, filters, permute_tbl); - t1 = convolve8_4_usdot(s1, filters, permute_tbl); - t2 = convolve8_4_usdot(s2, filters, permute_tbl); - t3 = convolve8_4_usdot(s3, filters, permute_tbl); - t01 = vcombine_s16(vqmovn_s32(t0), vqmovn_s32(t1)); - t23 = vcombine_s16(vqmovn_s32(t2), vqmovn_s32(t3)); - d01 = vqrshrun_n_s16(t01, 7); - d23 = vqrshrun_n_s16(t23, 7); + t0 = convolve8_4_usdot(s0, filters, perm_tbl); + t1 = convolve8_4_usdot(s1, filters, perm_tbl); + t2 = convolve8_4_usdot(s2, filters, perm_tbl); + t3 = convolve8_4_usdot(s3, filters, perm_tbl); + d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS); dd01 = load_u8(dst + 0 * dst_stride, dst_stride); dd23 = load_u8(dst + 2 * dst_stride, dst_stride); @@ -181,9 +174,9 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, src += 4 * src_stride; dst += 4 * dst_stride; h -= 4; - } while (h > 0); + } while (h != 0); } else { - const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl); const uint8_t *s; uint8_t *d; int width; @@ -196,10 +189,10 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, do { load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); - d0 = convolve8_8_usdot(s0, filters, permute_tbl); - d1 = convolve8_8_usdot(s1, filters, permute_tbl); - d2 = convolve8_8_usdot(s2, filters, permute_tbl); - d3 = convolve8_8_usdot(s3, filters, permute_tbl); + d0 = convolve8_8_usdot(s0, filters, perm_tbl); + d1 = convolve8_8_usdot(s1, filters, perm_tbl); + d2 = convolve8_8_usdot(s2, filters, perm_tbl); + d3 = convolve8_8_usdot(s3, filters, perm_tbl); load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); @@ -213,11 +206,11 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, s += 8; d += 8; width -= 8; - } while (width > 0); + } while (width != 0); src += 4 * src_stride; dst += 4 * dst_stride; h -= 4; - } while (h > 0); + } while (h != 0); } } @@ -275,8 +268,8 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; uint8x16x2_t samples_LUT; - assert(!((intptr_t)dst & 3)); - assert(!(dst_stride & 3)); + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); assert(y_step_q4 == 16); (void)x0_q4; @@ -288,7 +281,7 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, if (w == 4) { const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); uint8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910; - int32x4_t d0, d1, d2, d3; + int16x4_t d0, d1, d2, d3; uint8x8_t d01, d23; load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); @@ -325,8 +318,8 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, d1 = convolve8_4_usdot_partial(s1234, s5678, filters); d2 = convolve8_4_usdot_partial(s2345, s6789, filters); d3 = convolve8_4_usdot_partial(s3456, s78910, filters); - d01 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d0), vqmovn_s32(d1)), 7); - d23 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d2), vqmovn_s32(d3)), 7); + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); store_u8(dst + 0 * dst_stride, dst_stride, d01); store_u8(dst + 2 * dst_stride, dst_stride, d23); @@ -341,7 +334,7 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, src += 4 * src_stride; dst += 4 * dst_stride; h -= 4; - } while (h > 0); + } while (h != 0); } else { const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl); uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, @@ -426,11 +419,11 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, s += 4 * src_stride; d += 4 * dst_stride; height -= 4; - } while (height > 0); + } while (height != 0); src += 8; dst += 8; w -= 8; - } while (w > 0); + } while (w != 0); } } @@ -444,8 +437,8 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; uint8x16x2_t samples_LUT; - assert(!((intptr_t)dst & 3)); - assert(!(dst_stride & 3)); + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); assert(y_step_q4 == 16); (void)x0_q4; @@ -457,7 +450,7 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, if (w == 4) { const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); uint8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910; - int32x4_t d0, d1, d2, d3; + int16x4_t d0, d1, d2, d3; uint8x8_t d01, d23, dd01, dd23; load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); @@ -494,8 +487,8 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, d1 = convolve8_4_usdot_partial(s1234, s5678, filters); d2 = convolve8_4_usdot_partial(s2345, s6789, filters); d3 = convolve8_4_usdot_partial(s3456, s78910, filters); - d01 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d0), vqmovn_s32(d1)), 7); - d23 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d2), vqmovn_s32(d3)), 7); + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); dd01 = load_u8(dst + 0 * dst_stride, dst_stride); dd23 = load_u8(dst + 2 * dst_stride, dst_stride); @@ -516,7 +509,7 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, src += 4 * src_stride; dst += 4 * dst_stride; h -= 4; - } while (h > 0); + } while (h != 0); } else { const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl); uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, @@ -608,11 +601,11 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, s += 4 * src_stride; d += 4 * dst_stride; height -= 4; - } while (height > 0); + } while (height != 0); src += 8; dst += 8; w -= 8; - } while (w > 0); + } while (w != 0); } } @@ -629,8 +622,8 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, const uint8x16_t range_limit = vdupq_n_u8(128); uint8x16_t s0, s1, s2, s3; - assert(!((intptr_t)dst & 3)); - assert(!(dst_stride & 3)); + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); assert(x_step_q4 == 16); (void)x_step_q4; @@ -640,22 +633,19 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, src -= 3; if (w == 4) { - const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); do { - int32x4_t t0, t1, t2, t3; - int16x8_t t01, t23; + int16x4_t t0, t1, t2, t3; uint8x8_t d01, d23; load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); - t0 = convolve8_4_sdot(s0, filters, correction, range_limit, permute_tbl); - t1 = convolve8_4_sdot(s1, filters, correction, range_limit, permute_tbl); - t2 = convolve8_4_sdot(s2, filters, correction, range_limit, permute_tbl); - t3 = convolve8_4_sdot(s3, filters, correction, range_limit, permute_tbl); - t01 = vcombine_s16(vqmovn_s32(t0), vqmovn_s32(t1)); - t23 = vcombine_s16(vqmovn_s32(t2), vqmovn_s32(t3)); - d01 = vqrshrun_n_s16(t01, 7); - d23 = vqrshrun_n_s16(t23, 7); + t0 = convolve8_4_sdot(s0, filters, correction, range_limit, perm_tbl); + t1 = convolve8_4_sdot(s1, filters, correction, range_limit, perm_tbl); + t2 = convolve8_4_sdot(s2, filters, correction, range_limit, perm_tbl); + t3 = convolve8_4_sdot(s3, filters, correction, range_limit, perm_tbl); + d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS); store_u8(dst + 0 * dst_stride, dst_stride, d01); store_u8(dst + 2 * dst_stride, dst_stride, d23); @@ -665,7 +655,7 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, h -= 4; } while (h > 0); } else { - const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl); const uint8_t *s; uint8_t *d; int width; @@ -678,21 +668,17 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, do { load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); - d0 = - convolve8_8_sdot(s0, filters, correction, range_limit, permute_tbl); - d1 = - convolve8_8_sdot(s1, filters, correction, range_limit, permute_tbl); - d2 = - convolve8_8_sdot(s2, filters, correction, range_limit, permute_tbl); - d3 = - convolve8_8_sdot(s3, filters, correction, range_limit, permute_tbl); + d0 = convolve8_8_sdot(s0, filters, correction, range_limit, perm_tbl); + d1 = convolve8_8_sdot(s1, filters, correction, range_limit, perm_tbl); + d2 = convolve8_8_sdot(s2, filters, correction, range_limit, perm_tbl); + d3 = convolve8_8_sdot(s3, filters, correction, range_limit, perm_tbl); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); s += 8; d += 8; width -= 8; - } while (width > 0); + } while (width != 0); src += 4 * src_stride; dst += 4 * dst_stride; h -= 4; @@ -711,8 +697,8 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, const uint8x16_t range_limit = vdupq_n_u8(128); uint8x16_t s0, s1, s2, s3; - assert(!((intptr_t)dst & 3)); - assert(!(dst_stride & 3)); + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); assert(x_step_q4 == 16); (void)x_step_q4; @@ -722,24 +708,19 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, src -= 3; if (w == 4) { - const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); do { - int32x4_t t0, t1, t2, t3; - int16x8_t t01, t23; + int16x4_t t0, t1, t2, t3; uint8x8_t d01, d23, dd01, dd23; - dd01 = vdup_n_u8(0); - dd23 = vdup_n_u8(0); load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); - t0 = convolve8_4_sdot(s0, filters, correction, range_limit, permute_tbl); - t1 = convolve8_4_sdot(s1, filters, correction, range_limit, permute_tbl); - t2 = convolve8_4_sdot(s2, filters, correction, range_limit, permute_tbl); - t3 = convolve8_4_sdot(s3, filters, correction, range_limit, permute_tbl); - t01 = vcombine_s16(vqmovn_s32(t0), vqmovn_s32(t1)); - t23 = vcombine_s16(vqmovn_s32(t2), vqmovn_s32(t3)); - d01 = vqrshrun_n_s16(t01, 7); - d23 = vqrshrun_n_s16(t23, 7); + t0 = convolve8_4_sdot(s0, filters, correction, range_limit, perm_tbl); + t1 = convolve8_4_sdot(s1, filters, correction, range_limit, perm_tbl); + t2 = convolve8_4_sdot(s2, filters, correction, range_limit, perm_tbl); + t3 = convolve8_4_sdot(s3, filters, correction, range_limit, perm_tbl); + d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS); dd01 = load_u8(dst + 0 * dst_stride, dst_stride); dd23 = load_u8(dst + 2 * dst_stride, dst_stride); @@ -753,9 +734,9 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, src += 4 * src_stride; dst += 4 * dst_stride; h -= 4; - } while (h > 0); + } while (h != 0); } else { - const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl); const uint8_t *s; uint8_t *d; int width; @@ -768,14 +749,10 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, do { load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); - d0 = - convolve8_8_sdot(s0, filters, correction, range_limit, permute_tbl); - d1 = - convolve8_8_sdot(s1, filters, correction, range_limit, permute_tbl); - d2 = - convolve8_8_sdot(s2, filters, correction, range_limit, permute_tbl); - d3 = - convolve8_8_sdot(s3, filters, correction, range_limit, permute_tbl); + d0 = convolve8_8_sdot(s0, filters, correction, range_limit, perm_tbl); + d1 = convolve8_8_sdot(s1, filters, correction, range_limit, perm_tbl); + d2 = convolve8_8_sdot(s2, filters, correction, range_limit, perm_tbl); + d3 = convolve8_8_sdot(s3, filters, correction, range_limit, perm_tbl); load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); @@ -789,11 +766,11 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, s += 8; d += 8; width -= 8; - } while (width > 0); + } while (width != 0); src += 4 * src_stride; dst += 4 * dst_stride; h -= 4; - } while (h > 0); + } while (h != 0); } } @@ -854,8 +831,8 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, int8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; int8x16x2_t samples_LUT; - assert(!((intptr_t)dst & 3)); - assert(!(dst_stride & 3)); + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); assert(y_step_q4 == 16); (void)x0_q4; @@ -867,7 +844,7 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, if (w == 4) { const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); int8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910; - int32x4_t d0, d1, d2, d3; + int16x4_t d0, d1, d2, d3; uint8x8_t d01, d23; load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); @@ -919,8 +896,8 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, d1 = convolve8_4_sdot_partial(s1234, s5678, correction, filters); d2 = convolve8_4_sdot_partial(s2345, s6789, correction, filters); d3 = convolve8_4_sdot_partial(s3456, s78910, correction, filters); - d01 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d0), vqmovn_s32(d1)), 7); - d23 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d2), vqmovn_s32(d3)), 7); + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); store_u8(dst + 0 * dst_stride, dst_stride, d01); store_u8(dst + 2 * dst_stride, dst_stride, d23); @@ -935,7 +912,7 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, src += 4 * src_stride; dst += 4 * dst_stride; h -= 4; - } while (h > 0); + } while (h != 0); } else { const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl); int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, @@ -1035,11 +1012,11 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, s += 4 * src_stride; d += 4 * dst_stride; height -= 4; - } while (height > 0); + } while (height != 0); src += 8; dst += 8; w -= 8; - } while (w > 0); + } while (w != 0); } } @@ -1057,8 +1034,8 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, int8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; int8x16x2_t samples_LUT; - assert(!((intptr_t)dst & 3)); - assert(!(dst_stride & 3)); + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); assert(y_step_q4 == 16); (void)x0_q4; @@ -1070,7 +1047,7 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, if (w == 4) { const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); int8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910; - int32x4_t d0, d1, d2, d3; + int16x4_t d0, d1, d2, d3; uint8x8_t d01, d23, dd01, dd23; load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); @@ -1122,8 +1099,8 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, d1 = convolve8_4_sdot_partial(s1234, s5678, correction, filters); d2 = convolve8_4_sdot_partial(s2345, s6789, correction, filters); d3 = convolve8_4_sdot_partial(s3456, s78910, correction, filters); - d01 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d0), vqmovn_s32(d1)), 7); - d23 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d2), vqmovn_s32(d3)), 7); + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); dd01 = load_u8(dst + 0 * dst_stride, dst_stride); dd23 = load_u8(dst + 2 * dst_stride, dst_stride); @@ -1144,7 +1121,7 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, src += 4 * src_stride; dst += 4 * dst_stride; h -= 4; - } while (h > 0); + } while (h != 0); } else { const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl); int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, @@ -1251,11 +1228,11 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, s += 4 * src_stride; d += 4 * dst_stride; height -= 4; - } while (height > 0); + } while (height != 0); src += 8; dst += 8; w -= 8; - } while (w > 0); + } while (w != 0); } } @@ -1273,8 +1250,8 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, const int16x8_t filters = vld1q_s16(filter[x0_q4]); uint8x8_t t0, t1, t2, t3; - assert(!((intptr_t)dst & 3)); - assert(!(dst_stride & 3)); + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); assert(x_step_q4 == 16); (void)x_step_q4; @@ -1286,25 +1263,22 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, if (h == 4) { uint8x8_t d01, d23; int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; - int16x8_t tt0, tt1, tt2, tt3; __builtin_prefetch(src + 0 * src_stride); __builtin_prefetch(src + 1 * src_stride); __builtin_prefetch(src + 2 * src_stride); __builtin_prefetch(src + 3 * src_stride); + load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); transpose_u8_8x4(&t0, &t1, &t2, &t3); - tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); - tt1 = vreinterpretq_s16_u16(vmovl_u8(t1)); - tt2 = vreinterpretq_s16_u16(vmovl_u8(t2)); - tt3 = vreinterpretq_s16_u16(vmovl_u8(t3)); - s0 = vget_low_s16(tt0); - s1 = vget_low_s16(tt1); - s2 = vget_low_s16(tt2); - s3 = vget_low_s16(tt3); - s4 = vget_high_s16(tt0); - s5 = vget_high_s16(tt1); - s6 = vget_high_s16(tt2); + s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); + s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + __builtin_prefetch(dst + 0 * dst_stride); __builtin_prefetch(dst + 1 * dst_stride); __builtin_prefetch(dst + 2 * dst_stride); @@ -1314,32 +1288,22 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, do { load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); transpose_u8_8x4(&t0, &t1, &t2, &t3); - tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); - tt1 = vreinterpretq_s16_u16(vmovl_u8(t1)); - tt2 = vreinterpretq_s16_u16(vmovl_u8(t2)); - tt3 = vreinterpretq_s16_u16(vmovl_u8(t3)); - s7 = vget_low_s16(tt0); - s8 = vget_low_s16(tt1); - s9 = vget_low_s16(tt2); - s10 = vget_low_s16(tt3); + s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters); d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters); d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters); d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters); + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); - d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7); - d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7); transpose_u8_4x4(&d01, &d23); - vst1_lane_u32((uint32_t *)(dst + 0 * dst_stride), - vreinterpret_u32_u8(d01), 0); - vst1_lane_u32((uint32_t *)(dst + 1 * dst_stride), - vreinterpret_u32_u8(d23), 0); - vst1_lane_u32((uint32_t *)(dst + 2 * dst_stride), - vreinterpret_u32_u8(d01), 1); - vst1_lane_u32((uint32_t *)(dst + 3 * dst_stride), - vreinterpret_u32_u8(d23), 1); + store_u8(dst + 0 * dst_stride, 2 * dst_stride, d01); + store_u8(dst + 1 * dst_stride, 2 * dst_stride, d23); s0 = s4; s1 = s5; @@ -1355,7 +1319,7 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, } else { int width; const uint8_t *s; - uint8x8_t t4, t5, t6, t7; + uint8x8_t t4, t5, t6, t7, d04, d15, d26, d37; int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; if (w == 4) { @@ -1395,32 +1359,24 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, __builtin_prefetch(src + 5 * src_stride); __builtin_prefetch(src + 6 * src_stride); __builtin_prefetch(src + 7 * src_stride); - t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); - t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); - t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters); - t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); - - transpose_u8_8x4(&t0, &t1, &t2, &t3); - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0), 0); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t1), 0); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t2), 0); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t3), 0); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0), 1); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t1), 1); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t2), 1); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t3), 1); - dst += dst_stride; + d04 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); + d15 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); + d26 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters); + d37 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); + + transpose_u8_8x4(&d04, &d15, &d26, &d37); + + store_u8(dst + 0 * dst_stride, 4 * dst_stride, d04); + store_u8(dst + 1 * dst_stride, 4 * dst_stride, d15); + store_u8(dst + 2 * dst_stride, 4 * dst_stride, d26); + store_u8(dst + 3 * dst_stride, 4 * dst_stride, d37); + + dst += 8 * dst_stride; h -= 8; } while (h > 0); } else { uint8_t *d; + uint8x8_t d0, d1, d2, d3, d4, d5, d6, d7; int16x8_t s11, s12, s13, s14; do { @@ -1466,17 +1422,18 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, s13 = vreinterpretq_s16_u16(vmovl_u8(t6)); s14 = vreinterpretq_s16_u16(vmovl_u8(t7)); - t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); - t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); - t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters); - t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); - t4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters); - t5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters); - t6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters); - t7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters); + d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); + d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); + d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters); + d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); + d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters); + d5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters); + d6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters); + d7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters); - transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); - store_u8_8x8(d, dst_stride, t0, t1, t2, t3, t4, t5, t6, t7); + transpose_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); + + store_u8_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7); s0 = s8; s1 = s9; @@ -1505,8 +1462,8 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, const int16x8_t filters = vld1q_s16(filter[x0_q4]); uint8x8_t t0, t1, t2, t3; - assert(!((intptr_t)dst & 3)); - assert(!(dst_stride & 3)); + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); assert(x_step_q4 == 16); (void)x_step_q4; @@ -1516,10 +1473,8 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, src -= 3; if (h == 4) { - uint8x8_t d01, d23; + uint8x8_t d01, d23, dd01, dd23; int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; - int16x8_t tt0, tt1, tt2, tt3; - uint32x4_t d0123 = vdupq_n_u32(0); __builtin_prefetch(src + 0 * src_stride); __builtin_prefetch(src + 1 * src_stride); @@ -1527,17 +1482,14 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, __builtin_prefetch(src + 3 * src_stride); load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); transpose_u8_8x4(&t0, &t1, &t2, &t3); - tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); - tt1 = vreinterpretq_s16_u16(vmovl_u8(t1)); - tt2 = vreinterpretq_s16_u16(vmovl_u8(t2)); - tt3 = vreinterpretq_s16_u16(vmovl_u8(t3)); - s0 = vget_low_s16(tt0); - s1 = vget_low_s16(tt1); - s2 = vget_low_s16(tt2); - s3 = vget_low_s16(tt3); - s4 = vget_high_s16(tt0); - s5 = vget_high_s16(tt1); - s6 = vget_high_s16(tt2); + s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); + s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + __builtin_prefetch(dst + 0 * dst_stride); __builtin_prefetch(dst + 1 * dst_stride); __builtin_prefetch(dst + 2 * dst_stride); @@ -1547,35 +1499,28 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, do { load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); transpose_u8_8x4(&t0, &t1, &t2, &t3); - tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); - tt1 = vreinterpretq_s16_u16(vmovl_u8(t1)); - tt2 = vreinterpretq_s16_u16(vmovl_u8(t2)); - tt3 = vreinterpretq_s16_u16(vmovl_u8(t3)); - s7 = vget_low_s16(tt0); - s8 = vget_low_s16(tt1); - s9 = vget_low_s16(tt2); - s10 = vget_low_s16(tt3); + s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters); d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters); d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters); d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters); + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); - d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7); - d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7); transpose_u8_4x4(&d01, &d23); - d0123 = vld1q_lane_u32((uint32_t *)(dst + 0 * dst_stride), d0123, 0); - d0123 = vld1q_lane_u32((uint32_t *)(dst + 1 * dst_stride), d0123, 2); - d0123 = vld1q_lane_u32((uint32_t *)(dst + 2 * dst_stride), d0123, 1); - d0123 = vld1q_lane_u32((uint32_t *)(dst + 3 * dst_stride), d0123, 3); - d0123 = vreinterpretq_u32_u8( - vrhaddq_u8(vreinterpretq_u8_u32(d0123), vcombine_u8(d01, d23))); + dd01 = load_u8(dst + 0 * dst_stride, 2 * dst_stride); + dd23 = load_u8(dst + 1 * dst_stride, 2 * dst_stride); + + d01 = vrhadd_u8(d01, dd01); + d23 = vrhadd_u8(d23, dd23); - vst1q_lane_u32((uint32_t *)(dst + 0 * dst_stride), d0123, 0); - vst1q_lane_u32((uint32_t *)(dst + 1 * dst_stride), d0123, 2); - vst1q_lane_u32((uint32_t *)(dst + 2 * dst_stride), d0123, 1); - vst1q_lane_u32((uint32_t *)(dst + 3 * dst_stride), d0123, 3); + store_u8(dst + 0 * dst_stride, 2 * dst_stride, d01); + store_u8(dst + 1 * dst_stride, 2 * dst_stride, d23); s0 = s4; s1 = s5; @@ -1595,8 +1540,8 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; if (w == 4) { - uint32x4_t d0415 = vdupq_n_u32(0); - uint32x4_t d2637 = vdupq_n_u32(0); + uint8x8_t d04, d15, d26, d37, dd04, dd15, dd26, dd37; + do { load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); @@ -1633,48 +1578,35 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, __builtin_prefetch(src + 5 * src_stride); __builtin_prefetch(src + 6 * src_stride); __builtin_prefetch(src + 7 * src_stride); - t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); - t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); - t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters); - t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); - - transpose_u8_8x4(&t0, &t1, &t2, &t3); - - d0415 = vld1q_lane_u32((uint32_t *)(dst + 0 * dst_stride), d0415, 0); - d0415 = vld1q_lane_u32((uint32_t *)(dst + 1 * dst_stride), d0415, 2); - d2637 = vld1q_lane_u32((uint32_t *)(dst + 2 * dst_stride), d2637, 0); - d2637 = vld1q_lane_u32((uint32_t *)(dst + 3 * dst_stride), d2637, 2); - d0415 = vld1q_lane_u32((uint32_t *)(dst + 4 * dst_stride), d0415, 1); - d0415 = vld1q_lane_u32((uint32_t *)(dst + 5 * dst_stride), d0415, 3); - d2637 = vld1q_lane_u32((uint32_t *)(dst + 6 * dst_stride), d2637, 1); - d2637 = vld1q_lane_u32((uint32_t *)(dst + 7 * dst_stride), d2637, 3); - d0415 = vreinterpretq_u32_u8( - vrhaddq_u8(vreinterpretq_u8_u32(d0415), vcombine_u8(t0, t1))); - d2637 = vreinterpretq_u32_u8( - vrhaddq_u8(vreinterpretq_u8_u32(d2637), vcombine_u8(t2, t3))); - - vst1q_lane_u32((uint32_t *)dst, d0415, 0); - dst += dst_stride; - vst1q_lane_u32((uint32_t *)dst, d0415, 2); - dst += dst_stride; - vst1q_lane_u32((uint32_t *)dst, d2637, 0); - dst += dst_stride; - vst1q_lane_u32((uint32_t *)dst, d2637, 2); - dst += dst_stride; - vst1q_lane_u32((uint32_t *)dst, d0415, 1); - dst += dst_stride; - vst1q_lane_u32((uint32_t *)dst, d0415, 3); - dst += dst_stride; - vst1q_lane_u32((uint32_t *)dst, d2637, 1); - dst += dst_stride; - vst1q_lane_u32((uint32_t *)dst, d2637, 3); - dst += dst_stride; + d04 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); + d15 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); + d26 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters); + d37 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); + + transpose_u8_8x4(&d04, &d15, &d26, &d37); + + dd04 = load_u8(dst + 0 * dst_stride, 4 * dst_stride); + dd15 = load_u8(dst + 1 * dst_stride, 4 * dst_stride); + dd26 = load_u8(dst + 2 * dst_stride, 4 * dst_stride); + dd37 = load_u8(dst + 3 * dst_stride, 4 * dst_stride); + + d04 = vrhadd_u8(d04, dd04); + d15 = vrhadd_u8(d15, dd15); + d26 = vrhadd_u8(d26, dd26); + d37 = vrhadd_u8(d37, dd37); + + store_u8(dst + 0 * dst_stride, 4 * dst_stride, d04); + store_u8(dst + 1 * dst_stride, 4 * dst_stride, d15); + store_u8(dst + 2 * dst_stride, 4 * dst_stride, d26); + store_u8(dst + 3 * dst_stride, 4 * dst_stride, d37); + + dst += 8 * dst_stride; h -= 8; - } while (h > 0); + } while (h != 0); } else { uint8_t *d; + uint8x8_t d0, d1, d2, d3, d4, d5, d6, d7; int16x8_t s11, s12, s13, s14; - uint8x16_t d01, d23, d45, d67; do { __builtin_prefetch(src + 0 * src_stride); @@ -1719,33 +1651,27 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, s13 = vreinterpretq_s16_u16(vmovl_u8(t6)); s14 = vreinterpretq_s16_u16(vmovl_u8(t7)); - t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); - t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); - t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters); - t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); - t4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters); - t5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters); - t6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters); - t7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters); + d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); + d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); + d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters); + d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); + d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters); + d5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters); + d6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters); + d7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters); - transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + transpose_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); - d01 = vcombine_u8(vld1_u8(d + 0 * dst_stride), - vld1_u8(d + 1 * dst_stride)); - d23 = vcombine_u8(vld1_u8(d + 2 * dst_stride), - vld1_u8(d + 3 * dst_stride)); - d45 = vcombine_u8(vld1_u8(d + 4 * dst_stride), - vld1_u8(d + 5 * dst_stride)); - d67 = vcombine_u8(vld1_u8(d + 6 * dst_stride), - vld1_u8(d + 7 * dst_stride)); - d01 = vrhaddq_u8(d01, vcombine_u8(t0, t1)); - d23 = vrhaddq_u8(d23, vcombine_u8(t2, t3)); - d45 = vrhaddq_u8(d45, vcombine_u8(t4, t5)); - d67 = vrhaddq_u8(d67, vcombine_u8(t6, t7)); - - store_u8_8x8(d, dst_stride, vget_low_u8(d01), vget_high_u8(d01), - vget_low_u8(d23), vget_high_u8(d23), vget_low_u8(d45), - vget_high_u8(d45), vget_low_u8(d67), vget_high_u8(d67)); + d0 = vrhadd_u8(d0, vld1_u8(d + 0 * dst_stride)); + d1 = vrhadd_u8(d1, vld1_u8(d + 1 * dst_stride)); + d2 = vrhadd_u8(d2, vld1_u8(d + 2 * dst_stride)); + d3 = vrhadd_u8(d3, vld1_u8(d + 3 * dst_stride)); + d4 = vrhadd_u8(d4, vld1_u8(d + 4 * dst_stride)); + d5 = vrhadd_u8(d5, vld1_u8(d + 5 * dst_stride)); + d6 = vrhadd_u8(d6, vld1_u8(d + 6 * dst_stride)); + d7 = vrhadd_u8(d7, vld1_u8(d + 7 * dst_stride)); + + store_u8_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7); s0 = s8; s1 = s9; @@ -1761,7 +1687,7 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, src += 8 * src_stride; dst += 8 * dst_stride; h -= 8; - } while (h > 0); + } while (h != 0); } } } @@ -1773,8 +1699,8 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, int h) { const int16x8_t filters = vld1q_s16(filter[y0_q4]); - assert(!((intptr_t)dst & 3)); - assert(!(dst_stride & 3)); + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); assert(y_step_q4 == 16); (void)x0_q4; @@ -1784,33 +1710,26 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, src -= 3 * src_stride; if (w == 4) { - uint8x8_t d01, d23; + uint8x8_t t0, t1, t2, t3, t4, t5, t6, d01, d23; int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; - s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; - s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; - s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; - s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; - s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; - s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; - s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; + load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); + s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); + s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1))); + s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2))); + s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3))); + s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4))); + s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5))); + s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6))); + + src += 7 * src_stride; do { - s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; - s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; - s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; - s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; + load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); + s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); + s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1))); + s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2))); + s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3))); __builtin_prefetch(dst + 0 * dst_stride); __builtin_prefetch(dst + 1 * dst_stride); @@ -1820,21 +1739,16 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, __builtin_prefetch(src + 1 * src_stride); __builtin_prefetch(src + 2 * src_stride); __builtin_prefetch(src + 3 * src_stride); + d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters); d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters); d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters); d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters); + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); - d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7); - d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7); - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 1); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 0); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 1); - dst += dst_stride; + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); s0 = s4; s1 = s5; @@ -1843,13 +1757,15 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, s4 = s8; s5 = s9; s6 = s10; + src += 4 * src_stride; + dst += 4 * dst_stride; h -= 4; } while (h != 0); } else { int height; const uint8_t *s; uint8_t *d; - uint8x8_t t0, t1, t2, t3; + uint8x8_t t0, t1, t2, t3, t4, t5, t6, d0, d1, d2, d3; int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; do { @@ -1860,33 +1776,26 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, __builtin_prefetch(src + 4 * src_stride); __builtin_prefetch(src + 5 * src_stride); __builtin_prefetch(src + 6 * src_stride); - s = src; - s0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; - s1 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; - s2 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; - s3 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; - s4 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; - s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; - s6 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; + + load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); + s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + + s = src + 7 * src_stride; d = dst; height = h; do { - s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; - s8 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; - s9 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; - s10 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; + load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); + s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); + s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); + s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); __builtin_prefetch(d + 0 * dst_stride); __builtin_prefetch(d + 1 * dst_stride); @@ -1896,19 +1805,13 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, __builtin_prefetch(s + 1 * src_stride); __builtin_prefetch(s + 2 * src_stride); __builtin_prefetch(s + 3 * src_stride); - t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); - t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); - t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters); - t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); - - vst1_u8(d, t0); - d += dst_stride; - vst1_u8(d, t1); - d += dst_stride; - vst1_u8(d, t2); - d += dst_stride; - vst1_u8(d, t3); - d += dst_stride; + + d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); + d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); + d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters); + d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); s0 = s4; s1 = s5; @@ -1917,6 +1820,8 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, s4 = s8; s5 = s9; s6 = s10; + s += 4 * src_stride; + d += 4 * dst_stride; height -= 4; } while (height != 0); src += 8; @@ -1933,8 +1838,8 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, int h) { const int16x8_t filters = vld1q_s16(filter[y0_q4]); - assert(!((intptr_t)dst & 3)); - assert(!(dst_stride & 3)); + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); assert(y_step_q4 == 16); (void)x0_q4; @@ -1944,34 +1849,26 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, src -= 3 * src_stride; if (w == 4) { - uint8x8_t d01, d23; + uint8x8_t t0, t1, t2, t3, t4, t5, t6, d01, d23, dd01, dd23; int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; - uint32x4_t d0123 = vdupq_n_u32(0); - - s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; - s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; - s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; - s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; - s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; - s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; - s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; + + load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); + s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); + s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1))); + s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2))); + s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3))); + s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4))); + s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5))); + s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6))); + + src += 7 * src_stride; do { - s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; - s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; - s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; - s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; + load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); + s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); + s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1))); + s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2))); + s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3))); __builtin_prefetch(dst + 0 * dst_stride); __builtin_prefetch(dst + 1 * dst_stride); @@ -1981,29 +1878,22 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, __builtin_prefetch(src + 1 * src_stride); __builtin_prefetch(src + 2 * src_stride); __builtin_prefetch(src + 3 * src_stride); + d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters); d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters); d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters); d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters); + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); - d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7); - d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7); - - d0123 = vld1q_lane_u32((uint32_t *)(dst + 0 * dst_stride), d0123, 0); - d0123 = vld1q_lane_u32((uint32_t *)(dst + 1 * dst_stride), d0123, 1); - d0123 = vld1q_lane_u32((uint32_t *)(dst + 2 * dst_stride), d0123, 2); - d0123 = vld1q_lane_u32((uint32_t *)(dst + 3 * dst_stride), d0123, 3); - d0123 = vreinterpretq_u32_u8( - vrhaddq_u8(vreinterpretq_u8_u32(d0123), vcombine_u8(d01, d23))); - - vst1q_lane_u32((uint32_t *)dst, d0123, 0); - dst += dst_stride; - vst1q_lane_u32((uint32_t *)dst, d0123, 1); - dst += dst_stride; - vst1q_lane_u32((uint32_t *)dst, d0123, 2); - dst += dst_stride; - vst1q_lane_u32((uint32_t *)dst, d0123, 3); - dst += dst_stride; + dd01 = load_u8(dst + 0 * dst_stride, dst_stride); + dd23 = load_u8(dst + 2 * dst_stride, dst_stride); + + d01 = vrhadd_u8(d01, dd01); + d23 = vrhadd_u8(d23, dd23); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); s0 = s4; s1 = s5; @@ -2012,14 +1902,15 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, s4 = s8; s5 = s9; s6 = s10; + src += 4 * src_stride; + dst += 4 * dst_stride; h -= 4; } while (h != 0); } else { int height; const uint8_t *s; uint8_t *d; - uint8x8_t t0, t1, t2, t3; - uint8x16_t d01, d23, dd01, dd23; + uint8x8_t t0, t1, t2, t3, t4, t5, t6, d0, d1, d2, d3; int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; do { @@ -2030,33 +1921,26 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, __builtin_prefetch(src + 4 * src_stride); __builtin_prefetch(src + 5 * src_stride); __builtin_prefetch(src + 6 * src_stride); - s = src; - s0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; - s1 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; - s2 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; - s3 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; - s4 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; - s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; - s6 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; + + load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); + s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + + s = src + 7 * src_stride; d = dst; height = h; do { - s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; - s8 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; - s9 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; - s10 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; + load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); + s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); + s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); + s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); __builtin_prefetch(d + 0 * dst_stride); __builtin_prefetch(d + 1 * dst_stride); @@ -2066,28 +1950,18 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, __builtin_prefetch(s + 1 * src_stride); __builtin_prefetch(s + 2 * src_stride); __builtin_prefetch(s + 3 * src_stride); - t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); - t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); - t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters); - t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); - - d01 = vcombine_u8(t0, t1); - d23 = vcombine_u8(t2, t3); - dd01 = vcombine_u8(vld1_u8(d + 0 * dst_stride), - vld1_u8(d + 1 * dst_stride)); - dd23 = vcombine_u8(vld1_u8(d + 2 * dst_stride), - vld1_u8(d + 3 * dst_stride)); - dd01 = vrhaddq_u8(dd01, d01); - dd23 = vrhaddq_u8(dd23, d23); - - vst1_u8(d, vget_low_u8(dd01)); - d += dst_stride; - vst1_u8(d, vget_high_u8(dd01)); - d += dst_stride; - vst1_u8(d, vget_low_u8(dd23)); - d += dst_stride; - vst1_u8(d, vget_high_u8(dd23)); - d += dst_stride; + + d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); + d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); + d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters); + d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); + + d0 = vrhadd_u8(d0, vld1_u8(d + 0 * dst_stride)); + d1 = vrhadd_u8(d1, vld1_u8(d + 1 * dst_stride)); + d2 = vrhadd_u8(d2, vld1_u8(d + 2 * dst_stride)); + d3 = vrhadd_u8(d3, vld1_u8(d + 3 * dst_stride)); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); s0 = s4; s1 = s5; @@ -2097,6 +1971,8 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, s5 = s9; s6 = s10; height -= 4; + s += 4 * src_stride; + d += 4 * dst_stride; } while (height != 0); src += 8; dst += 8; diff --git a/vpx_dsp/arm/vpx_convolve8_neon.h b/vpx_dsp/arm/vpx_convolve8_neon.h index 07cf8242d3..c838d40470 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon.h +++ b/vpx_dsp/arm/vpx_convolve8_neon.h @@ -15,10 +15,11 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/vpx_filter.h" #if VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD) -static INLINE int32x4_t convolve8_4_sdot_partial(const int8x16_t samples_lo, +static INLINE int16x4_t convolve8_4_sdot_partial(const int8x16_t samples_lo, const int8x16_t samples_hi, const int32x4_t correction, const int8x8_t filters) { @@ -29,11 +30,11 @@ static INLINE int32x4_t convolve8_4_sdot_partial(const int8x16_t samples_lo, sum = vdotq_lane_s32(correction, samples_lo, filters, 0); sum = vdotq_lane_s32(sum, samples_hi, filters, 1); - /* Narrowing and packing is performed by the caller. */ - return sum; + /* Further narrowing and packing is performed by the caller. */ + return vqmovn_s32(sum); } -static INLINE int32x4_t convolve8_4_sdot(uint8x16_t samples, +static INLINE int16x4_t convolve8_4_sdot(uint8x16_t samples, const int8x8_t filters, const int32x4_t correction, const uint8x16_t range_limit, @@ -54,8 +55,8 @@ static INLINE int32x4_t convolve8_4_sdot(uint8x16_t samples, sum = vdotq_lane_s32(correction, permuted_samples[0], filters, 0); sum = vdotq_lane_s32(sum, permuted_samples[1], filters, 1); - /* Narrowing and packing is performed by the caller. */ - return sum; + /* Further narrowing and packing is performed by the caller. */ + return vqmovn_s32(sum); } static INLINE uint8x8_t convolve8_8_sdot_partial(const int8x16_t samples0_lo, @@ -78,7 +79,7 @@ static INLINE uint8x8_t convolve8_8_sdot_partial(const int8x16_t samples0_lo, /* Narrow and re-pack. */ sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1)); - return vqrshrun_n_s16(sum, 7); + return vqrshrun_n_s16(sum, FILTER_BITS); } static INLINE uint8x8_t convolve8_8_sdot(uint8x16_t samples, @@ -111,14 +112,14 @@ static INLINE uint8x8_t convolve8_8_sdot(uint8x16_t samples, /* Narrow and re-pack. */ sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1)); - return vqrshrun_n_s16(sum, 7); + return vqrshrun_n_s16(sum, FILTER_BITS); } #endif // VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD) #if VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_MATMUL_INT8) -static INLINE int32x4_t convolve8_4_usdot_partial(const uint8x16_t samples_lo, +static INLINE int16x4_t convolve8_4_usdot_partial(const uint8x16_t samples_lo, const uint8x16_t samples_hi, const int8x8_t filters) { /* Sample permutation is performed by the caller. */ @@ -127,11 +128,11 @@ static INLINE int32x4_t convolve8_4_usdot_partial(const uint8x16_t samples_lo, sum = vusdotq_lane_s32(vdupq_n_s32(0), samples_lo, filters, 0); sum = vusdotq_lane_s32(sum, samples_hi, filters, 1); - /* Narrowing and packing is performed by the caller. */ - return sum; + /* Further narrowing and packing is performed by the caller. */ + return vqmovn_s32(sum); } -static INLINE int32x4_t convolve8_4_usdot(uint8x16_t samples, +static INLINE int16x4_t convolve8_4_usdot(uint8x16_t samples, const int8x8_t filters, const uint8x16x2_t permute_tbl) { uint8x16_t permuted_samples[2]; @@ -147,8 +148,8 @@ static INLINE int32x4_t convolve8_4_usdot(uint8x16_t samples, sum = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0); sum = vusdotq_lane_s32(sum, permuted_samples[1], filters, 1); - /* Narrowing and packing is performed by the caller. */ - return sum; + /* Further narrowing and packing is performed by the caller. */ + return vqmovn_s32(sum); } static INLINE uint8x8_t convolve8_8_usdot_partial(const uint8x16_t samples0_lo, @@ -169,7 +170,7 @@ static INLINE uint8x8_t convolve8_8_usdot_partial(const uint8x16_t samples0_lo, /* Narrow and re-pack. */ sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1)); - return vqrshrun_n_s16(sum, 7); + return vqrshrun_n_s16(sum, FILTER_BITS); } static INLINE uint8x8_t convolve8_8_usdot(uint8x16_t samples, @@ -196,7 +197,7 @@ static INLINE uint8x8_t convolve8_8_usdot(uint8x16_t samples, /* Narrow and re-pack. */ sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1)); - return vqrshrun_n_s16(sum, 7); + return vqrshrun_n_s16(sum, FILTER_BITS); } #endif // VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_MATMUL_INT8) @@ -238,7 +239,7 @@ static INLINE uint8x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1, sum = vmlaq_lane_s16(sum, s7, filters_hi, 3); sum = vqaddq_s16(sum, vmulq_lane_s16(s3, filters_lo, 3)); sum = vqaddq_s16(sum, vmulq_lane_s16(s4, filters_hi, 0)); - return vqrshrun_n_s16(sum, 7); + return vqrshrun_n_s16(sum, FILTER_BITS); } static INLINE uint8x8_t scale_filter_8(const uint8x8_t *const s, From 2a9b810d3df62ff3c527ce3895f6b80d9d6f6296 Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 8 May 2023 16:58:59 -0700 Subject: [PATCH 0800/1000] Don't use -Wl,-z,defs with Clang's sanitizers This avoids link errors related to the sanitizers: https://clang.llvm.org/docs/AddressSanitizer.html#usage "When linking shared libraries, the AddressSanitizer run-time is not linked, so -Wl,-z,defs may cause link errors ..." See also: https://crbug.com/aomedia/3438 Bug: webm:1801 Fixed: webm:1801 Change-Id: Ie212318005a5f7222e5486775175534025306367 --- build/make/Makefile | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/build/make/Makefile b/build/make/Makefile index 5c38c18e57..65ac2290c7 100644 --- a/build/make/Makefile +++ b/build/make/Makefile @@ -304,6 +304,19 @@ $(1): $(qexec)$$(AR) $$(ARFLAGS) $$@ $$^ endef +# Don't use -Wl,-z,defs with Clang's sanitizers. +# +# Clang's AddressSanitizer documentation says "When linking shared libraries, +# the AddressSanitizer run-time is not linked, so -Wl,-z,defs may cause link +# errors (don't use it with AddressSanitizer)." See +# https://clang.llvm.org/docs/AddressSanitizer.html#usage. +NO_UNDEFINED := -Wl,-z,defs +ifeq ($(findstring clang,$(CC)),clang) + ifneq ($(filter -fsanitize=%,$(LDFLAGS)),) + NO_UNDEFINED := + endif +endif + define so_template # Not using a pattern rule here because we don't want to generate empty # archives when they are listed as a dependency in files not responsible @@ -313,7 +326,8 @@ define so_template $(1): $(if $(quiet),@echo " [LD] $$@") $(qexec)$$(LD) -shared $$(LDFLAGS) \ - -Wl,--no-undefined -Wl,-soname,$$(SONAME) \ + $(NO_UNDEFINED) \ + -Wl,-soname,$$(SONAME) \ -Wl,--version-script,$$(EXPORTS_FILE) -o $$@ \ $$(filter %.o,$$^) $$(extralibs) endef From 3e1e38d1176c34f71a87f8402c07cdcc2e20083e Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Thu, 4 May 2023 16:33:38 +0100 Subject: [PATCH 0801/1000] Add 2D-specific Neon horizontal convolution functions 2D 8-tap convolution filtering is performed in two passes - horizontal and vertical. The horizontal pass must produce enough input data for the subsequent vertical pass - 3 rows above and 4 rows below, in addition to the actual block height. At present, all Neon horizontal convolution algorithms process 4 rows at a time, but this means we end up doing at least 1 row too much work in the 2D first pass case where we need h + 7, not h + 8 rows of output. This patch adds additional dot-product (SDOT and USDOT) Neon paths that process h + 7 rows of data exactly, saving the work of the unnecessary extra row. It is impractical to take a similar approach for the Armv8.0 MLA paths since we have to transpose the data block both before and after calling the convolution helper functions. vpx_convolve_neon performance impact: we observe a speedup of ~9% for smaller (and wider) blocks, and a speedup of 0-3% for larger blocks. This is to be expected since the proportion of redundant work decreases as the block height increases. Change-Id: Ie77ad1848707d2d48bb8851345a469aae9d097e1 --- vpx_dsp/arm/mem_neon.h | 20 +++ vpx_dsp/arm/vpx_convolve8_neon.c | 221 ++++++++++++++++++++++++++++++- vpx_dsp/arm/vpx_convolve8_neon.h | 9 ++ vpx_dsp/arm/vpx_convolve_neon.c | 55 ++++++++ 4 files changed, 301 insertions(+), 4 deletions(-) diff --git a/vpx_dsp/arm/mem_neon.h b/vpx_dsp/arm/mem_neon.h index 1a20da70ef..586bfb85af 100644 --- a/vpx_dsp/arm/mem_neon.h +++ b/vpx_dsp/arm/mem_neon.h @@ -263,6 +263,16 @@ static INLINE void store_u8(uint8_t *buf, ptrdiff_t stride, const uint8x8_t a) { vst1_lane_u32((uint32_t *)buf, a_u32, 1); } +static INLINE void store_u8_8x3(uint8_t *s, const ptrdiff_t p, + const uint8x8_t s0, const uint8x8_t s1, + const uint8x8_t s2) { + vst1_u8(s, s0); + s += p; + vst1_u8(s, s1); + s += p; + vst1_u8(s, s2); +} + static INLINE void load_u8_8x4(const uint8_t *s, const ptrdiff_t p, uint8x8_t *const s0, uint8x8_t *const s1, uint8x8_t *const s2, uint8x8_t *const s3) { @@ -287,6 +297,16 @@ static INLINE void store_u8_8x4(uint8_t *s, const ptrdiff_t p, vst1_u8(s, s3); } +static INLINE void load_u8_16x3(const uint8_t *s, const ptrdiff_t p, + uint8x16_t *const s0, uint8x16_t *const s1, + uint8x16_t *const s2) { + *s0 = vld1q_u8(s); + s += p; + *s1 = vld1q_u8(s); + s += p; + *s2 = vld1q_u8(s); +} + static INLINE void load_u8_16x4(const uint8_t *s, const ptrdiff_t p, uint8x16_t *const s0, uint8x16_t *const s1, uint8x16_t *const s2, uint8x16_t *const s3) { diff --git a/vpx_dsp/arm/vpx_convolve8_neon.c b/vpx_dsp/arm/vpx_convolve8_neon.c index f217a3f35d..505d0672f0 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon.c +++ b/vpx_dsp/arm/vpx_convolve8_neon.c @@ -57,6 +57,111 @@ DECLARE_ALIGNED(16, static const uint8_t, dot_prod_merge_block_tbl[48]) = { #if defined(__ARM_FEATURE_MATMUL_INT8) +void vpx_convolve8_2d_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4])); + uint8x16_t s0, s1, s2, s3; + + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + assert(x_step_q4 == 16); + assert(h % 4 == 3); + + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + + src -= 3; + + if (w == 4) { + const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + int16x4_t d0, d1, d2, d3; + uint8x8_t d01, d23; + + do { + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); + + d0 = convolve8_4_usdot(s0, filters, perm_tbl); + d1 = convolve8_4_usdot(s1, filters, perm_tbl); + d2 = convolve8_4_usdot(s2, filters, perm_tbl); + d3 = convolve8_4_usdot(s3, filters, perm_tbl); + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 3); + + /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for + * further details on possible values of block height. */ + load_u8_16x3(src, src_stride, &s0, &s1, &s2); + + d0 = convolve8_4_usdot(s0, filters, perm_tbl); + d1 = convolve8_4_usdot(s1, filters, perm_tbl); + d2 = convolve8_4_usdot(s2, filters, perm_tbl); + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(d2, vdup_n_s16(0)), FILTER_BITS); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8_4x1(dst + 2 * dst_stride, d23); + } else { + const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + const uint8_t *s; + uint8_t *d; + int width; + uint8x8_t d0, d1, d2, d3; + + do { + width = w; + s = src; + d = dst; + do { + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + d0 = convolve8_8_usdot(s0, filters, perm_tbl); + d1 = convolve8_8_usdot(s1, filters, perm_tbl); + d2 = convolve8_8_usdot(s2, filters, perm_tbl); + d3 = convolve8_8_usdot(s3, filters, perm_tbl); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width > 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 3); + + /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for + * further details on possible values of block height. */ + width = w; + s = src; + d = dst; + do { + load_u8_16x3(s, src_stride, &s0, &s1, &s2); + + d0 = convolve8_8_usdot(s0, filters, perm_tbl); + d1 = convolve8_8_usdot(s1, filters, perm_tbl); + d2 = convolve8_8_usdot(s2, filters, perm_tbl); + + store_u8_8x3(d, dst_stride, d0, d1, d2); + + s += 8; + d += 8; + width -= 8; + } while (width > 0); + } +} + void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, @@ -96,7 +201,7 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, src += 4 * src_stride; dst += 4 * dst_stride; h -= 4; - } while (h > 0); + } while (h != 0); } else { const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl); const uint8_t *s; @@ -125,7 +230,7 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, src += 4 * src_stride; dst += 4 * dst_stride; h -= 4; - } while (h > 0); + } while (h != 0); } } @@ -611,6 +716,114 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, #else // !defined(__ARM_FEATURE_MATMUL_INT8) +void vpx_convolve8_2d_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4])); + const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[x0_q4]), 128); + const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp)); + const uint8x16_t range_limit = vdupq_n_u8(128); + uint8x16_t s0, s1, s2, s3; + + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + assert(x_step_q4 == 16); + assert(h % 4 == 3); + + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + + src -= 3; + + if (w == 4) { + const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + int16x4_t d0, d1, d2, d3; + uint8x8_t d01, d23; + + do { + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); + + d0 = convolve8_4_sdot(s0, filters, correction, range_limit, perm_tbl); + d1 = convolve8_4_sdot(s1, filters, correction, range_limit, perm_tbl); + d2 = convolve8_4_sdot(s2, filters, correction, range_limit, perm_tbl); + d3 = convolve8_4_sdot(s3, filters, correction, range_limit, perm_tbl); + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 3); + + /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for + * further details on possible values of block height. */ + load_u8_16x3(src, src_stride, &s0, &s1, &s2); + + d0 = convolve8_4_sdot(s0, filters, correction, range_limit, perm_tbl); + d1 = convolve8_4_sdot(s1, filters, correction, range_limit, perm_tbl); + d2 = convolve8_4_sdot(s2, filters, correction, range_limit, perm_tbl); + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(d2, vdup_n_s16(0)), FILTER_BITS); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8_4x1(dst + 2 * dst_stride, d23); + } else { + const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + const uint8_t *s; + uint8_t *d; + int width; + uint8x8_t d0, d1, d2, d3; + + do { + width = w; + s = src; + d = dst; + do { + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + d0 = convolve8_8_sdot(s0, filters, correction, range_limit, perm_tbl); + d1 = convolve8_8_sdot(s1, filters, correction, range_limit, perm_tbl); + d2 = convolve8_8_sdot(s2, filters, correction, range_limit, perm_tbl); + d3 = convolve8_8_sdot(s3, filters, correction, range_limit, perm_tbl); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 3); + + /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for + * further details on possible values of block height. */ + width = w; + s = src; + d = dst; + do { + load_u8_16x3(s, src_stride, &s0, &s1, &s2); + + d0 = convolve8_8_sdot(s0, filters, correction, range_limit, perm_tbl); + d1 = convolve8_8_sdot(s1, filters, correction, range_limit, perm_tbl); + d2 = convolve8_8_sdot(s2, filters, correction, range_limit, perm_tbl); + + store_u8_8x3(d, dst_stride, d0, d1, d2); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + } +} + void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, @@ -653,7 +866,7 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, src += 4 * src_stride; dst += 4 * dst_stride; h -= 4; - } while (h > 0); + } while (h != 0); } else { const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl); const uint8_t *s; @@ -682,7 +895,7 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, src += 4 * src_stride; dst += 4 * dst_stride; h -= 4; - } while (h > 0); + } while (h != 0); } } diff --git a/vpx_dsp/arm/vpx_convolve8_neon.h b/vpx_dsp/arm/vpx_convolve8_neon.h index c838d40470..2f78583af3 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon.h +++ b/vpx_dsp/arm/vpx_convolve8_neon.h @@ -17,6 +17,15 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/vpx_filter.h" +#if VPX_ARCH_AARCH64 && \ + (defined(__ARM_FEATURE_DOTPROD) || defined(__ARM_FEATURE_MATMUL_INT8)) +void vpx_convolve8_2d_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h); +#endif + #if VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD) static INLINE int16x4_t convolve8_4_sdot_partial(const int8x16_t samples_lo, diff --git a/vpx_dsp/arm/vpx_convolve_neon.c b/vpx_dsp/arm/vpx_convolve_neon.c index 830f3176d7..f7db3e6a9c 100644 --- a/vpx_dsp/arm/vpx_convolve_neon.c +++ b/vpx_dsp/arm/vpx_convolve_neon.c @@ -14,6 +14,57 @@ #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_ports/mem.h" +#if VPX_ARCH_AARCH64 && \ + (defined(__ARM_FEATURE_DOTPROD) || defined(__ARM_FEATURE_MATMUL_INT8)) +#include "vpx_dsp/arm/vpx_convolve8_neon.h" + +void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + /* Given our constraints: w <= 64, h <= 64, taps == 8 we can reduce the + * maximum buffer size to 64 * (64 + 7). */ + uint8_t temp[64 * 71]; + + /* Account for the vertical phase needing 3 lines prior and 4 lines post. */ + const int intermediate_height = h + 7; + + assert(y_step_q4 == 16); + assert(x_step_q4 == 16); + + /* Filter starting 3 lines back. */ + vpx_convolve8_2d_horiz_neon(src - src_stride * 3, src_stride, temp, w, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, + intermediate_height); + + /* Step into the temp buffer 3 lines to get the actual frame data */ + vpx_convolve8_vert_neon(temp + w * 3, w, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); +} + +void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + uint8_t temp[64 * 71]; + const int intermediate_height = h + 7; + + assert(y_step_q4 == 16); + assert(x_step_q4 == 16); + + vpx_convolve8_2d_horiz_neon(src - src_stride * 3, src_stride, temp, w, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, + intermediate_height); + + vpx_convolve8_avg_vert_neon(temp + w * 3, w, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); +} + +#else // !(VPX_ARCH_AARCH64 && + // (defined(__ARM_FEATURE_DOTPROD) || + // defined(__ARM_FEATURE_MATMUL_INT8))) + void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, @@ -63,3 +114,7 @@ void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride, vpx_convolve8_avg_vert_neon(temp + w * 3, w, dst, dst_stride, filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); } + +#endif // #if VPX_ARCH_AARCH64 && + // (defined(__ARM_FEATURE_DOTPROD) || + // defined(__ARM_FEATURE_MATMUL_INT8)) From e6b9a8d667bb43c58437bb1d6204ffc8047252ac Mon Sep 17 00:00:00 2001 From: Anupam Pandey Date: Fri, 12 May 2023 10:56:45 +0530 Subject: [PATCH 0802/1000] Improve convolve AVX2 intrinsic for speed This CL refactors the code related to convolve function. Furthermore, improved the AVX2 intrinsic to compute convolve vertical for w = 4 case, and convolve horiz for w = 16 case. Please note the module level scaling w.r.t C function (timer based) for existing (AVX2) and new AVX2 intrinsics: Block Scaling Size AVX2 AVX2 (existing) (New) 4x4 5.34x 5.91x 4x8 7.10x 7.79x 16x8 23.52x 25.63x 16x16 29.47x 30.22x 16x32 33.42x 33.44x This is a bit exact change. Change-Id: If130183bc12faab9ca2bcec0ceeaa8d0af05e413 --- vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c | 226 +++++++--------------- 1 file changed, 71 insertions(+), 155 deletions(-) diff --git a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c index 2498bba173..526c283823 100644 --- a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c +++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c @@ -46,7 +46,7 @@ DECLARE_ALIGNED(32, static const uint8_t, filt_d4_global_avx2[64]) = { }; #define CALC_CONVOLVE8_HORZ_ROW \ - srcReg = xx_loadu2_mi128(src_ptr - 3 + src_pitch, src_ptr - 3); \ + srcReg = mm256_loadu2_si128(src_ptr - 3, src_ptr - 3 + src_pitch); \ s1[0] = _mm256_shuffle_epi8(srcReg, filt[0]); \ s1[1] = _mm256_shuffle_epi8(srcReg, filt[1]); \ s1[2] = _mm256_shuffle_epi8(srcReg, filt[2]); \ @@ -60,16 +60,6 @@ DECLARE_ALIGNED(32, static const uint8_t, filt_d4_global_avx2[64]) = { _mm256_extractf128_si256(s1[0], 1)); \ output_ptr += output_pitch; -// 0 0 0 0 hi3 hi2 hi1 hi0 | 0 0 0 0 lo3 lo2 lo1 lo0 -static INLINE __m256i xx_loadu2_mi128(const void *hi, const void *lo) { - // 0 0 0 0 0 0 0 0 | 0 0 0 0 lo3 lo2 lo1 lo0 - __m256i a = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(lo))); - - // 0 0 0 0 hi3 hi2 hi1 hi0 | 0 0 0 0 lo3 lo2 lo1 lo0 - a = _mm256_inserti128_si256(a, _mm_loadu_si128((const __m128i *)(hi)), 1); - return a; -} - static INLINE void vpx_filter_block1d16_h8_x_avx2( const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter, @@ -93,12 +83,7 @@ static INLINE void vpx_filter_block1d16_h8_x_avx2( __m256i srcReg; // load the 2 strides of source - srcReg = - _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr - 3))); - srcReg = _mm256_inserti128_si256( - srcReg, - _mm_loadu_si128((const __m128i *)(src_ptr + src_pixels_per_line - 3)), - 1); + srcReg = mm256_loadu2_si128(src_ptr - 3, src_ptr + src_pixels_per_line - 3); // filter the source buffer s[0] = _mm256_shuffle_epi8(srcReg, filt[0]); @@ -109,12 +94,7 @@ static INLINE void vpx_filter_block1d16_h8_x_avx2( // reading 2 strides of the next 16 bytes // (part of it was being read by earlier read) - srcReg = - _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr + 5))); - srcReg = _mm256_inserti128_si256( - srcReg, - _mm_loadu_si128((const __m128i *)(src_ptr + src_pixels_per_line + 5)), - 1); + srcReg = mm256_loadu2_si128(src_ptr + 5, src_ptr + src_pixels_per_line + 5); // filter the source buffer s[0] = _mm256_shuffle_epi8(srcReg, filt[0]); @@ -129,60 +109,37 @@ static INLINE void vpx_filter_block1d16_h8_x_avx2( src_ptr += src_stride; - // average if necessary - outReg1 = _mm256_castsi256_si128(outReg32b1); - outReg2 = _mm256_extractf128_si256(outReg32b1, 1); if (avg) { - outReg1 = _mm_avg_epu8(outReg1, _mm_load_si128((__m128i *)output_ptr)); - outReg2 = _mm_avg_epu8( - outReg2, _mm_load_si128((__m128i *)(output_ptr + output_pitch))); + const __m256i outReg = mm256_loadu2_si128( + (__m128i *)output_ptr, (__m128i *)(output_ptr + output_pitch)); + outReg32b1 = _mm256_avg_epu8(outReg32b1, outReg); } - - // save 16 bytes - _mm_store_si128((__m128i *)output_ptr, outReg1); - - // save the next 16 bits - _mm_store_si128((__m128i *)(output_ptr + output_pitch), outReg2); - + mm256_store2_si128((__m128i *)output_ptr, + (__m128i *)(output_ptr + output_pitch), &outReg32b1); output_ptr += dst_stride; } // if the number of strides is odd. // process only 16 bytes if (i > 0) { - __m128i srcReg; - - // load the first 16 bytes of the last row - srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); + const __m128i srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); + const __m128i srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5)); + const __m256i srcReg = + _mm256_inserti128_si256(_mm256_castsi128_si256(srcReg1), srcReg2, 1); // filter the source buffer - s[0] = _mm256_castsi128_si256( - _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[0]))); - s[1] = _mm256_castsi128_si256( - _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[1]))); - s[2] = _mm256_castsi128_si256( - _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[2]))); - s[3] = _mm256_castsi128_si256( - _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[3]))); - outReg1 = convolve8_8_avx2(s, f); - - // reading the next 16 bytes - // (part of it was being read by earlier read) - srcReg = _mm_loadu_si128((const __m128i *)(src_ptr + 5)); + s[0] = _mm256_shuffle_epi8(srcReg, filt[0]); + s[1] = _mm256_shuffle_epi8(srcReg, filt[1]); + s[2] = _mm256_shuffle_epi8(srcReg, filt[2]); + s[3] = _mm256_shuffle_epi8(srcReg, filt[3]); - // filter the source buffer - s[0] = _mm256_castsi128_si256( - _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[0]))); - s[1] = _mm256_castsi128_si256( - _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[1]))); - s[2] = _mm256_castsi128_si256( - _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[2]))); - s[3] = _mm256_castsi128_si256( - _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[3]))); - outReg2 = convolve8_8_avx2(s, f); + // The low and high 128-bits of each lane contain the first and second + // convolve result respectively + outReg32b1 = convolve8_16_avx2(s, f); + outReg1 = _mm256_castsi256_si128(outReg32b1); + outReg2 = _mm256_extractf128_si256(outReg32b1, 1); - // shrink to 8 bit each 16 bits, the low and high 64-bits of each lane - // contain the first and second convolve result respectively + // shrink to 8 bit each 16 bits outReg1 = _mm_packus_epi16(outReg1, outReg2); // average if necessary @@ -266,7 +223,6 @@ static INLINE void vpx_filter_block1d16_v8_x_avx2( const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter, const int avg) { - __m128i outReg1, outReg2; __m256i srcRegHead1; unsigned int i; ptrdiff_t src_stride, dst_stride; @@ -345,19 +301,14 @@ static INLINE void vpx_filter_block1d16_v8_x_avx2( src_ptr += src_stride; // average if necessary - outReg1 = _mm256_castsi256_si128(s1[0]); - outReg2 = _mm256_extractf128_si256(s1[0], 1); if (avg) { - outReg1 = _mm_avg_epu8(outReg1, _mm_load_si128((__m128i *)output_ptr)); - outReg2 = _mm_avg_epu8( - outReg2, _mm_load_si128((__m128i *)(output_ptr + out_pitch))); + const __m256i outReg = mm256_loadu2_si128( + (__m128i *)output_ptr, (__m128i *)(output_ptr + out_pitch)); + s1[0] = _mm256_avg_epu8(s1[0], outReg); } - // save 16 bytes - _mm_store_si128((__m128i *)output_ptr, outReg1); - - // save the next 16 bits - _mm_store_si128((__m128i *)(output_ptr + out_pitch), outReg2); + mm256_store2_si128((__m128i *)output_ptr, + (__m128i *)(output_ptr + out_pitch), s1); output_ptr += dst_stride; @@ -1094,7 +1045,7 @@ static void vpx_filter_block1d4_h8_avx2( // load the 2 strides of source // r115 r114 ...... r15 r14 r13 r12 r11 r10 | r015 r014 r013 ...... r07 // r06 r05 r04 r03 r02 r01 r00 - srcReg32b1 = xx_loadu2_mi128(src_ptr - 3 + src_pitch, src_ptr - 3); + srcReg32b1 = mm256_loadu2_si128(src_ptr - 3, src_ptr - 3 + src_pitch); // filter the source buffer // r16 r15 r14 r13 r15 r14 r13 r12 r14 r13 r12 r11 r13 r12 r11 r10 | r06 @@ -1188,8 +1139,7 @@ static void vpx_filter_block1d4_v8_avx2( const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { __m256i f[4], ss[4]; - __m256i r[8]; - __m128i r1[10]; + __m256i r[9], rr[2]; __m128i s[11]; unsigned int y = output_height; @@ -1210,48 +1160,35 @@ static void vpx_filter_block1d4_v8_avx2( s[5] = _mm_loadl_epi64((const __m128i *)(src_ptr + 5 * src_pitch)); s[6] = _mm_loadl_epi64((const __m128i *)(src_ptr + 6 * src_pitch)); - // R1-0 xxxx .. . . x| r13 r12 r11 r10 r03 r02 r01 r00 - r1[0] = _mm_unpacklo_epi32(s[0], s[1]); - - // R2-1 xxxx .. . . x| r23 r22 r21 r20 r13 r12 r11 r10 - r1[1] = _mm_unpacklo_epi32(s[1], s[2]); - - // R3-2 xxxx .. . . x| r33 r32 r31 r30 r23 r22 r21 r20 - r1[2] = _mm_unpacklo_epi32(s[2], s[3]); - - // R4-3 xxxx .. . . x| r43 r42 r41 r40 r33 r32 r31 r30 - r1[3] = _mm_unpacklo_epi32(s[3], s[4]); - - // R5-4 xxxx .. . . x| r53 r52 r51 r50 r43 r42 r41 r40 - r1[4] = _mm_unpacklo_epi32(s[4], s[5]); - - // R6-5 xxxx .. . . x| r63 r62 r61 r60 r53 r52 r51 r50 - r1[5] = _mm_unpacklo_epi32(s[5], s[6]); + r[0] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[0]), s[2], 1); + r[1] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[1]), s[3], 1); + r[2] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[2]), s[4], 1); + r[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[3]), s[5], 1); + r[4] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[4]), s[6], 1); - // 00000000 r33 r32 r31 r30|r23 r22 r21 r20||00000000|r13 r12 r11 r10|r03 r02 - // r01 r00 - r[0] = _mm256_inserti128_si256(_mm256_castsi128_si256(r1[0]), r1[2], 1); + // r37.....r24..r33..r31 r30 r23 r22 r21 r20|r17....r14 r07..r05 r04 r13 r12 + // r11 r10 r03 r02 r01 r00 + rr[0] = _mm256_unpacklo_epi32(r[0], r[1]); - // 00000000 r43 r42 r41 r40|r33 r32 r31 r30||00000000|r23 r22 r21 r20|r13 r12 - // r11 r10 - r[1] = _mm256_inserti128_si256(_mm256_castsi128_si256(r1[1]), r1[3], 1); - - // 00000000 r53 r52 r51 r50|r43 r42 r41 r40||00000000|r33 r32 r31 r30|r23 r22 - // r21 r20 - r[2] = _mm256_inserti128_si256(_mm256_castsi128_si256(r1[2]), r1[4], 1); - - // 00000000 r63 r62 r61 r60|r53 r52 r51 r50||00000000|r43 r42 r41 r40|r33 r32 - // r31 r30 - r[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(r1[3]), r1[5], 1); + // r47.....r34..r43..r41 r40 r33 r32 r31 r30|r27....r24 r17..r15 r14 r23 r22 + // r21 r20 r13 r12 r11 r10 + rr[1] = _mm256_unpacklo_epi32(r[1], r[2]); // r43 r33....r40 r30|r33 r23....r30 r20||r23 r13....r20 r10|r13 r03....r10 // r00| - ss[0] = _mm256_unpacklo_epi8(r[0], r[1]); + ss[0] = _mm256_unpacklo_epi8(rr[0], rr[1]); + + // r37.....r24..r33..r31 r30 r23 r22 r21 r20||r17....r14 r07..r05 r04 r13 r12 + // r11 r10 r03 r02 r01 r00 + rr[0] = _mm256_unpacklo_epi32(r[2], r[3]); + + // r47.....r34..r43..r41 r40 r33 r32 r31 r30|r27....r24 r17..r15 r14 r23 r22 + // r21 r20 r13 r12 r11 r10 + rr[1] = _mm256_unpacklo_epi32(r[3], r[4]); // r63 r53....r60 r50|r53 r43....r50 r40||r43 r33....r40 r30|r33 r23....r30 // r20| - ss[1] = _mm256_unpacklo_epi8(r[2], r[3]); - + ss[1] = _mm256_unpacklo_epi8(rr[0], rr[1]); // Process 4 rows at a time while (y >= 4) { s[7] = _mm_loadl_epi64((const __m128i *)(src_ptr + 7 * src_pitch)); @@ -1259,41 +1196,17 @@ static void vpx_filter_block1d4_v8_avx2( s[9] = _mm_loadl_epi64((const __m128i *)(src_ptr + 9 * src_pitch)); s[10] = _mm_loadl_epi64((const __m128i *)(src_ptr + 10 * src_pitch)); - // R7-6 xxxx .. . . x| r73 r72 r71 r70 r63 r62 r61 r60 - r1[6] = _mm_unpacklo_epi32(s[6], s[7]); - - // R8-7 xxxx .. . . x| r83 r82 r81 r80 r73 r72 r71 r70 - r1[7] = _mm_unpacklo_epi32(s[7], s[8]); - - // R9-8 xxxx .. . . x| r93 r92 r91 r90 r83 r82 r81 r80 - r1[8] = _mm_unpacklo_epi32(s[8], s[9]); - - // R10-9 xxxx .. . . x| r10-3 r10-2 r10-1 r10-0 r93 r92 r91 r90 - r1[9] = _mm_unpacklo_epi32(s[9], s[10]); - - // 00000000 r73 r72 r71 r70|r63 r62 r61 r60||00000000|r53 r52 r51 r50|r43 - // r42 r41 r40 - r[4] = _mm256_inserti128_si256(_mm256_castsi128_si256(r1[4]), r1[6], 1); - - // 00000000 r83 r82 r81 r80|r73 r72 r71 r70||00000000|r63 r62 r61 r60|r53 - // r52 r51 r50 - r[5] = _mm256_inserti128_si256(_mm256_castsi128_si256(r1[5]), r1[7], 1); + r[5] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[5]), s[7], 1); + r[6] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[6]), s[8], 1); + rr[0] = _mm256_unpacklo_epi32(r[4], r[5]); + rr[1] = _mm256_unpacklo_epi32(r[5], r[6]); + ss[2] = _mm256_unpacklo_epi8(rr[0], rr[1]); - // 00000000 r93 r92 r91 r90|r83 r82 r81 r80||00000000|r73 r72 r71 r70|r63 - // r62 r61 r60 - r[6] = _mm256_inserti128_si256(_mm256_castsi128_si256(r1[6]), r1[8], 1); - - // 00000000 r10-3 r10-2 r10-1 r10-0|r93 r92 r91 r90||00000000|r83 r82 r81 - // r80|r73 r72 r71 r70 - r[7] = _mm256_inserti128_si256(_mm256_castsi128_si256(r1[7]), r1[9], 1); - - // r83 r73....r80 r70|r73 r63....r70 r60||r63 r53....r60 r50|r53 r43....r50 - // r40| - ss[2] = _mm256_unpacklo_epi8(r[4], r[5]); - - // r10-3 r10-3....r10-0 r10-0|r93 r83....r90 r80||r83 r73....r80 r70|r73 - // r63....r70 r60| - ss[3] = _mm256_unpacklo_epi8(r[6], r[7]); + r[7] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[7]), s[9], 1); + r[8] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[8]), s[10], 1); + rr[0] = _mm256_unpacklo_epi32(r[6], r[7]); + rr[1] = _mm256_unpacklo_epi32(r[7], r[8]); + ss[3] = _mm256_unpacklo_epi8(rr[0], rr[1]); ss[0] = convolve8_16_avx2(ss, f); @@ -1315,17 +1228,17 @@ static void vpx_filter_block1d4_v8_avx2( ss[1] = ss[3]; s[6] = s[10]; + s[5] = s[9]; - r1[4] = r1[8]; - r1[5] = r1[9]; - + r[4] = r[8]; y -= 4; } // Process 2 rows if (y == 2) { - __m128i ss1[4], f1[4]; + __m128i ss1[4], f1[4], r1[4]; + s[4] = _mm_loadl_epi64((const __m128i *)(src_ptr + 4 * src_pitch)); s[7] = _mm_loadl_epi64((const __m128i *)(src_ptr + 7 * src_pitch)); s[8] = _mm_loadl_epi64((const __m128i *)(src_ptr + 8 * src_pitch)); @@ -1334,11 +1247,14 @@ static void vpx_filter_block1d4_v8_avx2( f1[2] = _mm256_castsi256_si128(f[2]); f1[3] = _mm256_castsi256_si128(f[3]); + r1[0] = _mm_unpacklo_epi32(s[4], s[5]); + r1[1] = _mm_unpacklo_epi32(s[5], s[6]); + // R7-6 xxxx .. . . x| r73 r72 r71 r70 r63 r62 r61 r60 - r1[6] = _mm_unpacklo_epi32(s[6], s[7]); + r1[2] = _mm_unpacklo_epi32(s[6], s[7]); // R8-7 xxxx .. . . x| r83 r82 r81 r80 r73 r72 r71 r70 - r1[7] = _mm_unpacklo_epi32(s[7], s[8]); + r1[3] = _mm_unpacklo_epi32(s[7], s[8]); // r23 r13....r20 r10|r13 r03....r10 r00 ss1[0] = _mm256_castsi256_si128(ss[0]); @@ -1347,10 +1263,10 @@ static void vpx_filter_block1d4_v8_avx2( ss1[1] = _mm256_castsi256_si128(ss[1]); // r63 r53....r60 r50|r53 r43....r50 r40 - ss1[2] = _mm_unpacklo_epi8(r1[4], r1[5]); + ss1[2] = _mm_unpacklo_epi8(r1[0], r1[1]); // r83 r73....r80 r70|r73 r63....r70 r60 - ss1[3] = _mm_unpacklo_epi8(r1[6], r1[7]); + ss1[3] = _mm_unpacklo_epi8(r1[2], r1[3]); ss1[0] = convolve8_8_ssse3(ss1, f1); From 7e7a1706e3dadcfbb3d92d93ea735420990584da Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Tue, 16 May 2023 14:57:05 -0400 Subject: [PATCH 0803/1000] Add new vpx_tpl.h API file New file (vpx_tpl.c) in the following CLs will add new APIs dealing with TPL stats from VP9 encoder. Change-Id: I5102ef64214cba1ca6ecea9582a19049666c6ca4 --- libs.mk | 1 + test/encode_api_test.cc | 2 +- test/encode_test_driver.h | 2 +- vp9/encoder/vp9_encoder.h | 1 + vp9/vp9_cx_iface.c | 2 ++ vpx/vpx_codec.mk | 2 ++ vpx/vpx_encoder.h | 32 +++----------------- vpx/vpx_tpl.h | 63 +++++++++++++++++++++++++++++++++++++++ 8 files changed, 75 insertions(+), 30 deletions(-) create mode 100644 vpx/vpx_tpl.h diff --git a/libs.mk b/libs.mk index 1411fee9a1..ea5cc15a17 100644 --- a/libs.mk +++ b/libs.mk @@ -178,6 +178,7 @@ INSTALL-LIBS-yes += include/vpx/vpx_image.h INSTALL-LIBS-yes += include/vpx/vpx_integer.h INSTALL-LIBS-$(CONFIG_DECODERS) += include/vpx/vpx_decoder.h INSTALL-LIBS-$(CONFIG_ENCODERS) += include/vpx/vpx_encoder.h +INSTALL-LIBS-$(CONFIG_VP9_ENCODER) += include/vpx/vpx_tpl.h ifeq ($(CONFIG_EXTERNAL_BUILD),yes) ifeq ($(CONFIG_MSVS),yes) INSTALL-LIBS-yes += $(foreach p,$(VS_PLATFORMS),$(LIBSUBDIR)/$(p)/$(CODEC_LIB).lib) diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc index 2b0aa1fdfe..af98ad5ddd 100644 --- a/test/encode_api_test.cc +++ b/test/encode_api_test.cc @@ -20,7 +20,7 @@ #include "./vpx_config.h" #include "vpx/vp8cx.h" -#include "vpx/vpx_encoder.h" +#include "vpx/vpx_tpl.h" namespace { diff --git a/test/encode_test_driver.h b/test/encode_test_driver.h index 922c49f420..165fcfabf6 100644 --- a/test/encode_test_driver.h +++ b/test/encode_test_driver.h @@ -19,7 +19,7 @@ #if CONFIG_VP8_ENCODER || CONFIG_VP9_ENCODER #include "vpx/vp8cx.h" #endif -#include "vpx/vpx_encoder.h" +#include "vpx/vpx_tpl.h" namespace libvpx_test { diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 2528bc2316..2e0c4db9ed 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -18,6 +18,7 @@ #include "vpx/internal/vpx_codec_internal.h" #include "vpx/vpx_ext_ratectrl.h" #include "vpx/vp8cx.h" +#include "vpx/vpx_tpl.h" #if CONFIG_INTERNAL_STATS #include "vpx_dsp/ssim.h" #endif diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index 8f157274fb..409069b4ed 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -29,6 +29,8 @@ #include "vp9/vp9_cx_iface.h" #include "vp9/vp9_iface_common.h" +#include "vpx/vpx_tpl.h" + typedef struct vp9_extracfg { int cpu_used; // available cpu percentage in 1/16 unsigned int enable_auto_alt_ref; diff --git a/vpx/vpx_codec.mk b/vpx/vpx_codec.mk index de86579d58..4aec88b300 100644 --- a/vpx/vpx_codec.mk +++ b/vpx/vpx_codec.mk @@ -27,6 +27,7 @@ API_DOC_SRCS-yes += vpx_encoder.h API_DOC_SRCS-yes += vpx_ext_ratectrl.h API_DOC_SRCS-yes += vpx_frame_buffer.h API_DOC_SRCS-yes += vpx_image.h +API_DOC_SRCS-yes += vpx_tpl.h API_SRCS-yes += src/vpx_decoder.c API_SRCS-yes += vpx_decoder.h @@ -42,3 +43,4 @@ API_SRCS-yes += vpx_frame_buffer.h API_SRCS-yes += vpx_image.h API_SRCS-yes += vpx_integer.h API_SRCS-yes += vpx_ext_ratectrl.h +API_SRCS-yes += vpx_tpl.h diff --git a/vpx/vpx_encoder.h b/vpx/vpx_encoder.h index fb95723dd3..c45d1a2ba5 100644 --- a/vpx/vpx_encoder.h +++ b/vpx/vpx_encoder.h @@ -31,6 +31,7 @@ extern "C" { #include "./vpx_codec.h" #include "./vpx_ext_ratectrl.h" +#include "./vpx_tpl.h" /*! Temporal Scalability: Maximum length of the sequence defining frame * layer membership @@ -57,9 +58,9 @@ extern "C" { * types, removing or reassigning enums, adding/removing/rearranging * fields to structures */ -#define VPX_ENCODER_ABI_VERSION \ - (16 + VPX_CODEC_ABI_VERSION + \ - VPX_EXT_RATECTRL_ABI_VERSION) /**<\hideinitializer*/ +#define VPX_ENCODER_ABI_VERSION \ + (16 + VPX_CODEC_ABI_VERSION + VPX_EXT_RATECTRL_ABI_VERSION + \ + VPX_TPL_ABI_VERSION) /**<\hideinitializer*/ /*! \brief Encoder capabilities bitfield * @@ -252,31 +253,6 @@ enum vpx_kf_mode { VPX_KF_DISABLED = 0 /**< Encoder does not place keyframes. */ }; -/*!\brief Temporal dependency model stats for each block before propagation */ -typedef struct VpxTplBlockStats { - int64_t intra_cost; /**< Intra cost */ - int64_t inter_cost; /**< Inter cost */ - int16_t mv_r; /**< Motion vector row */ - int16_t mv_c; /**< Motion vector col */ - int64_t recrf_rate; /**< Rate from reconstructed ref frame */ - int64_t recrf_dist; /**< Distortion from reconstructed ref frame */ - int ref_frame_index; /**< Ref frame index */ -} VpxTplBlockStats; - -/*!\brief Temporal dependency model stats for each frame before propagation */ -typedef struct VpxTplFrameStats { - int frame_width; /**< Frame width */ - int frame_height; /**< Frame height */ - int num_blocks; /**< Number of blocks. Size of block_stats_list */ - VpxTplBlockStats *block_stats_list; /**< List of tpl stats for each block */ -} VpxTplFrameStats; - -/*!\brief Temporal dependency model stats for each GOP before propagation */ -typedef struct VpxTplGopStats { - int size; /**< GOP size, also the size of frame_stats_list. */ - VpxTplFrameStats *frame_stats_list; /**< List of tpl stats for each frame */ -} VpxTplGopStats; - /*!\brief Encoded Frame Flags * * This type indicates a bitfield to be passed to vpx_codec_encode(), defining diff --git a/vpx/vpx_tpl.h b/vpx/vpx_tpl.h new file mode 100644 index 0000000000..689fa96920 --- /dev/null +++ b/vpx/vpx_tpl.h @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +/*!\file + * \brief Describes the TPL stats descriptor and associated operations + * + */ +#ifndef VPX_VPX_VPX_TPL_H_ +#define VPX_VPX_VPX_TPL_H_ + +#include "./vpx_integer.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/*!\brief Current ABI version number + * + * \internal + * If this file is altered in any way that changes the ABI, this value + * must be bumped. Examples include, but are not limited to, changing + * types, removing or reassigning enums, adding/removing/rearranging + * fields to structures + */ +#define VPX_TPL_ABI_VERSION (0) /**<\hideinitializer*/ + +/*!\brief Temporal dependency model stats for each block before propagation */ +typedef struct VpxTplBlockStats { + int64_t intra_cost; /**< Intra cost */ + int64_t inter_cost; /**< Inter cost */ + int16_t mv_r; /**< Motion vector row */ + int16_t mv_c; /**< Motion vector col */ + int64_t recrf_rate; /**< Rate from reconstructed ref frame */ + int64_t recrf_dist; /**< Distortion from reconstructed ref frame */ + int ref_frame_index; /**< Ref frame index */ +} VpxTplBlockStats; + +/*!\brief Temporal dependency model stats for each frame before propagation */ +typedef struct VpxTplFrameStats { + int frame_width; /**< Frame width */ + int frame_height; /**< Frame height */ + int num_blocks; /**< Number of blocks. Size of block_stats_list */ + VpxTplBlockStats *block_stats_list; /**< List of tpl stats for each block */ +} VpxTplFrameStats; + +/*!\brief Temporal dependency model stats for each GOP before propagation */ +typedef struct VpxTplGopStats { + int size; /**< GOP size, also the size of frame_stats_list. */ + VpxTplFrameStats *frame_stats_list; /**< List of tpl stats for each frame */ +} VpxTplGopStats; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VPX_VPX_TPL_H_ From d45cc8edda12306c8449242344c63992f63e7a0b Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Thu, 4 May 2023 10:48:25 -0400 Subject: [PATCH 0804/1000] Add IO for TPL stats Overload TempOutFile constructor to allow IO mode. Bug: b/281563704 Change-Id: I1f4f5b29db0e331941b6795e478eeeab51f625ad --- libs.mk | 2 +- test/encode_api_test.cc | 58 +++++++++++++++++++++- test/video_source.h | 11 +++-- vpx/exports_com | 3 ++ vpx/src/vpx_tpl.c | 107 ++++++++++++++++++++++++++++++++++++++++ vpx/vpx_codec.mk | 1 + vpx/vpx_tpl.h | 38 +++++++++++++- 7 files changed, 212 insertions(+), 8 deletions(-) create mode 100644 vpx/src/vpx_tpl.c diff --git a/libs.mk b/libs.mk index ea5cc15a17..f6f6cc94c3 100644 --- a/libs.mk +++ b/libs.mk @@ -178,7 +178,7 @@ INSTALL-LIBS-yes += include/vpx/vpx_image.h INSTALL-LIBS-yes += include/vpx/vpx_integer.h INSTALL-LIBS-$(CONFIG_DECODERS) += include/vpx/vpx_decoder.h INSTALL-LIBS-$(CONFIG_ENCODERS) += include/vpx/vpx_encoder.h -INSTALL-LIBS-$(CONFIG_VP9_ENCODER) += include/vpx/vpx_tpl.h +INSTALL-LIBS-$(CONFIG_ENCODERS) += include/vpx/vpx_tpl.h ifeq ($(CONFIG_EXTERNAL_BUILD),yes) ifeq ($(CONFIG_MSVS),yes) INSTALL-LIBS-yes += $(foreach p,$(VS_PLATFORMS),$(LIBSUBDIR)/$(p)/$(CODEC_LIB).lib) diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc index af98ad5ddd..e8a044ae17 100644 --- a/test/encode_api_test.cc +++ b/test/encode_api_test.cc @@ -11,6 +11,7 @@ #include #include #include +#include #include "third_party/googletest/src/include/gtest/gtest.h" #include "test/codec_factory.h" @@ -368,7 +369,7 @@ class EncodeApiGetTplStatsTest : public ::libvpx_test::EncoderTest, public ::testing::TestWithParam { public: - EncodeApiGetTplStatsTest() : EncoderTest(GetParam()) {} + EncodeApiGetTplStatsTest() : EncoderTest(GetParam()), test_io_(false) {} ~EncodeApiGetTplStatsTest() override {} protected: @@ -396,6 +397,34 @@ class EncodeApiGetTplStatsTest return VPX_CODEC_OK; } + void CompareTplGopStats(const VpxTplGopStats &ref_gop_stats, + const VpxTplGopStats &test_gop_stats) { + ASSERT_EQ(ref_gop_stats.size, test_gop_stats.size); + for (int frame = 0; frame < ref_gop_stats.size; frame++) { + const VpxTplFrameStats &ref_frame_stats = + ref_gop_stats.frame_stats_list[frame]; + const VpxTplFrameStats &test_frame_stats = + test_gop_stats.frame_stats_list[frame]; + ASSERT_EQ(ref_frame_stats.num_blocks, test_frame_stats.num_blocks); + ASSERT_EQ(ref_frame_stats.frame_width, test_frame_stats.frame_width); + ASSERT_EQ(ref_frame_stats.frame_height, test_frame_stats.frame_height); + for (int block = 0; block < ref_frame_stats.num_blocks; block++) { + const VpxTplBlockStats &ref_block_stats = + ref_frame_stats.block_stats_list[block]; + const VpxTplBlockStats &test_block_stats = + test_frame_stats.block_stats_list[block]; + ASSERT_EQ(ref_block_stats.inter_cost, test_block_stats.inter_cost); + ASSERT_EQ(ref_block_stats.intra_cost, test_block_stats.intra_cost); + ASSERT_EQ(ref_block_stats.mv_c, test_block_stats.mv_c); + ASSERT_EQ(ref_block_stats.mv_r, test_block_stats.mv_r); + ASSERT_EQ(ref_block_stats.recrf_dist, test_block_stats.recrf_dist); + ASSERT_EQ(ref_block_stats.recrf_rate, test_block_stats.recrf_rate); + ASSERT_EQ(ref_block_stats.ref_frame_index, + test_block_stats.ref_frame_index); + } + } + } + void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) override { ::libvpx_test::CxDataIterator iter = encoder->GetCxData(); while (const vpx_codec_cx_pkt_t *pkt = iter.Next()) { @@ -416,7 +445,21 @@ class EncodeApiGetTplStatsTest } } ASSERT_TRUE(stats_not_all_zero); - // Free the memory right away now as this is only a test. + if (test_io_ && tpl_stats.size > 0) { + libvpx_test::TempOutFile *temp_out_file = + new (std::nothrow) libvpx_test::TempOutFile("w+"); + ASSERT_NE(temp_out_file, nullptr); + ASSERT_NE(temp_out_file->file(), nullptr); + vpx_write_tpl_gop_stats(temp_out_file->file(), &tpl_stats); + rewind(temp_out_file->file()); + VpxTplGopStats gop_stats_io; + ASSERT_EQ( + vpx_read_tpl_gop_stats(temp_out_file->file(), &gop_stats_io), + VPX_CODEC_OK); + CompareTplGopStats(gop_stats_io, tpl_stats); + vpx_free_tpl_gop_stats(&gop_stats_io); + delete temp_out_file; + } free(tpl_stats.frame_stats_list); break; } @@ -427,6 +470,7 @@ class EncodeApiGetTplStatsTest int width_; int height_; + bool test_io_; }; TEST_P(EncodeApiGetTplStatsTest, GetTplStats) { @@ -438,6 +482,16 @@ TEST_P(EncodeApiGetTplStatsTest, GetTplStats) { ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); } +TEST_P(EncodeApiGetTplStatsTest, GetTplStatsIO) { + cfg_.g_lag_in_frames = 25; + width_ = 352; + height_ = 288; + test_io_ = true; + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", width_, + height_, 30, 1, 0, 50); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +} + INSTANTIATE_TEST_SUITE_P( VP9, EncodeApiGetTplStatsTest, ::testing::Values( diff --git a/test/video_source.h b/test/video_source.h index a10ff6fb09..5ed99d0639 100644 --- a/test/video_source.h +++ b/test/video_source.h @@ -64,7 +64,7 @@ inline FILE *OpenTestDataFile(const std::string &file_name) { return fopen(path_to_source.c_str(), "rb"); } -static FILE *GetTempOutFile(std::string *file_name) { +static FILE *GetTempOutFile(std::string *file_name, const char *io_mode) { file_name->clear(); #if defined(_WIN32) char fname[MAX_PATH]; @@ -73,7 +73,7 @@ static FILE *GetTempOutFile(std::string *file_name) { // Assume for now that the filename generated is unique per process if (GetTempFileNameA(tmppath, "lvx", 0, fname)) { file_name->assign(fname); - return fopen(fname, "wb+"); + return fopen(fname, io_mode); } } return nullptr; @@ -94,13 +94,16 @@ static FILE *GetTempOutFile(std::string *file_name) { const int fd = mkstemp(temp_file_name.get()); if (fd == -1) return nullptr; *file_name = temp_file_name.get(); - return fdopen(fd, "wb+"); + return fdopen(fd, io_mode); #endif } class TempOutFile { public: - TempOutFile() { file_ = GetTempOutFile(&file_name_); } + TempOutFile() { file_ = GetTempOutFile(&file_name_, "wb+"); } + TempOutFile(const char *io_mode) { + file_ = GetTempOutFile(&file_name_, io_mode); + } ~TempOutFile() { CloseFile(); if (!file_name_.empty()) { diff --git a/vpx/exports_com b/vpx/exports_com index 2ab05099f8..f0b46aa175 100644 --- a/vpx/exports_com +++ b/vpx/exports_com @@ -14,3 +14,6 @@ text vpx_img_flip text vpx_img_free text vpx_img_set_rect text vpx_img_wrap +text vpx_free_tpl_gop_stats +text vpx_read_tpl_gop_stats +text vpx_write_tpl_gop_stats diff --git a/vpx/src/vpx_tpl.c b/vpx/src/vpx_tpl.c new file mode 100644 index 0000000000..9cdb4a0a06 --- /dev/null +++ b/vpx/src/vpx_tpl.c @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "vpx/vpx_codec.h" +#include "vpx/vpx_tpl.h" +#include "vpx_mem/vpx_mem.h" + +#define CHECK_FPRINTF_ERROR(expr) \ + do { \ + if (expr < 0) { \ + return VPX_CODEC_ERROR; \ + } \ + } while (0) + +#define CHECK_FSCANF_ERROR(expr, expected_value) \ + do { \ + if (expr != expected_value) { \ + return VPX_CODEC_ERROR; \ + } \ + } while (0) + +vpx_codec_err_t vpx_write_tpl_gop_stats(FILE *tpl_file, + const VpxTplGopStats *tpl_gop_stats) { + int i; + if (tpl_file == NULL || tpl_gop_stats == NULL) return VPX_CODEC_INVALID_PARAM; + CHECK_FPRINTF_ERROR(fprintf(tpl_file, "%d\n", tpl_gop_stats->size)); + + for (i = 0; i < tpl_gop_stats->size; i++) { + VpxTplFrameStats frame_stats = tpl_gop_stats->frame_stats_list[i]; + const int num_blocks = frame_stats.num_blocks; + int block; + CHECK_FPRINTF_ERROR(fprintf(tpl_file, "%d %d %d\n", frame_stats.frame_width, + frame_stats.frame_height, num_blocks)); + for (block = 0; block < num_blocks; block++) { + VpxTplBlockStats block_stats = frame_stats.block_stats_list[block]; + CHECK_FPRINTF_ERROR( + fprintf(tpl_file, + "%" PRId64 " %" PRId64 " %" PRId16 " %" PRId16 " %" PRId64 + " %" PRId64 " %d\n", + block_stats.inter_cost, block_stats.intra_cost, + block_stats.mv_c, block_stats.mv_r, block_stats.recrf_dist, + block_stats.recrf_rate, block_stats.ref_frame_index)); + } + } + + return VPX_CODEC_OK; +} + +vpx_codec_err_t vpx_read_tpl_gop_stats(FILE *tpl_file, + VpxTplGopStats *tpl_gop_stats) { + int i, frame_list_size; + if (tpl_file == NULL || tpl_gop_stats == NULL) return VPX_CODEC_INVALID_PARAM; + CHECK_FSCANF_ERROR(fscanf(tpl_file, "%d\n", &frame_list_size), 1); + tpl_gop_stats->size = frame_list_size; + tpl_gop_stats->frame_stats_list = (VpxTplFrameStats *)vpx_calloc( + frame_list_size, sizeof(tpl_gop_stats->frame_stats_list[0])); + if (tpl_gop_stats->frame_stats_list == NULL) { + return VPX_CODEC_MEM_ERROR; + } + for (i = 0; i < frame_list_size; i++) { + VpxTplFrameStats *frame_stats = &tpl_gop_stats->frame_stats_list[i]; + int num_blocks, width, height, block; + CHECK_FSCANF_ERROR( + fscanf(tpl_file, "%d %d %d\n", &width, &height, &num_blocks), 3); + frame_stats->num_blocks = num_blocks; + frame_stats->frame_width = width; + frame_stats->frame_height = height; + frame_stats->block_stats_list = (VpxTplBlockStats *)vpx_calloc( + num_blocks, sizeof(frame_stats->block_stats_list[0])); + if (frame_stats->block_stats_list == NULL) { + vpx_free_tpl_gop_stats(tpl_gop_stats); + return VPX_CODEC_MEM_ERROR; + } + for (block = 0; block < num_blocks; block++) { + VpxTplBlockStats *block_stats = &frame_stats->block_stats_list[block]; + CHECK_FSCANF_ERROR( + fscanf(tpl_file, + "%" SCNd64 " %" SCNd64 " %" SCNd16 " %" SCNd16 " %" SCNd64 + " %" SCNd64 " %d\n", + &block_stats->inter_cost, &block_stats->intra_cost, + &block_stats->mv_c, &block_stats->mv_r, + &block_stats->recrf_dist, &block_stats->recrf_rate, + &block_stats->ref_frame_index), + 7); + } + } + + return VPX_CODEC_OK; +} + +void vpx_free_tpl_gop_stats(VpxTplGopStats *data) { + int frame; + if (data == NULL) return; + for (frame = 0; frame < data->size; frame++) { + vpx_free(data->frame_stats_list[frame].block_stats_list); + } + vpx_free(data->frame_stats_list); +} diff --git a/vpx/vpx_codec.mk b/vpx/vpx_codec.mk index 4aec88b300..25c815ef51 100644 --- a/vpx/vpx_codec.mk +++ b/vpx/vpx_codec.mk @@ -37,6 +37,7 @@ API_SRCS-yes += internal/vpx_codec_internal.h API_SRCS-yes += internal/vpx_ratectrl_rtc.h API_SRCS-yes += src/vpx_codec.c API_SRCS-yes += src/vpx_image.c +API_SRCS-yes += src/vpx_tpl.c API_SRCS-yes += vpx_codec.h API_SRCS-yes += vpx_codec.mk API_SRCS-yes += vpx_frame_buffer.h diff --git a/vpx/vpx_tpl.h b/vpx/vpx_tpl.h index 689fa96920..50aec49eb6 100644 --- a/vpx/vpx_tpl.h +++ b/vpx/vpx_tpl.h @@ -15,6 +15,8 @@ #ifndef VPX_VPX_VPX_TPL_H_ #define VPX_VPX_VPX_TPL_H_ +#include + #include "./vpx_integer.h" #ifdef __cplusplus @@ -29,7 +31,7 @@ extern "C" { * types, removing or reassigning enums, adding/removing/rearranging * fields to structures */ -#define VPX_TPL_ABI_VERSION (0) /**<\hideinitializer*/ +#define VPX_TPL_ABI_VERSION (1) /**<\hideinitializer*/ /*!\brief Temporal dependency model stats for each block before propagation */ typedef struct VpxTplBlockStats { @@ -56,6 +58,40 @@ typedef struct VpxTplGopStats { VpxTplFrameStats *frame_stats_list; /**< List of tpl stats for each frame */ } VpxTplGopStats; +/*!\brief Write VpxTplGopStats to file + * + * Accepts an opened file handle and writes \p tpl_gop_stats. + * + * \param[in] tpl_file A FILE pointer that's already been opened. + * \param[in] tpl_gop_stats VpxTplGopStats that contains TPL stats for the + * whole GOP. + * + * \return VPX_CODEC_OK if TPL stats are successfully written. + */ +vpx_codec_err_t vpx_write_tpl_gop_stats(FILE *tpl_file, + const VpxTplGopStats *tpl_gop_stats); + +/*!\brief Read VpxTplGopStats from file + * + * Accepts an opened file handle and reads TPL stats and stores them into + * \p tpl_gop_stats. Allocates memory for TPL stats. + * + * \param[in] tpl_file A FILE pointer that's already been opened. + * \param[out] tpl_gop_stats VpxTplGopStats that contains TPL stats for the + * whole GOP. + * + * \return VPX_CODEC_OK if TPL stats are successfully read from file. + */ +vpx_codec_err_t vpx_read_tpl_gop_stats(FILE *tpl_file, + VpxTplGopStats *tpl_gop_stats); + +/*!\brief Free the memory allocated for VpxTplGopStats + * + * \param[in] tpl_gop_stats VpxTplGopStats that contains TPL stats for the + * whole GOP. + */ +void vpx_free_tpl_gop_stats(VpxTplGopStats *tpl_gop_stats); + #ifdef __cplusplus } // extern "C" #endif From 62d09a3e94ef2ce0091b6a8e3a298851657c6891 Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 23 May 2023 15:48:10 -0700 Subject: [PATCH 0805/1000] fdct8x8_test.cc: work around VS2022 Arm64 issue cl.exe targeting AArch64 with optimizations enabled produces invalid code in RunExtremalCheck() and RunInvAccuracyCheck(). See: https://developercommunity.visualstudio.com/t/1770-preview-1:-Misoptimization-for-AR/10369786 Bug: b/277255076 Bug: webm:1788 Change-Id: Id2c60f3948d8f788c78602aea1b5232133415dea --- test/fdct8x8_test.cc | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/test/fdct8x8_test.cc b/test/fdct8x8_test.cc index fcc84690a0..21f8dcffa0 100644 --- a/test/fdct8x8_test.cc +++ b/test/fdct8x8_test.cc @@ -132,6 +132,15 @@ void idct8x8_64_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) { #endif // HAVE_SSE2 #endif // CONFIG_VP9_HIGHBITDEPTH +// Visual Studio 2022 (cl.exe) targeting AArch64 with optimizations enabled +// produces invalid code in RunExtremalCheck() and RunInvAccuracyCheck(). +// See: +// https://developercommunity.visualstudio.com/t/1770-preview-1:-Misoptimization-for-AR/10369786 +// TODO(jzern): check the compiler version after a fix for the issue is +// released. +#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__) +#pragma optimize("", off) +#endif class FwdTrans8x8TestBase { public: virtual ~FwdTrans8x8TestBase() {} @@ -523,6 +532,9 @@ class FwdTrans8x8TestBase { vpx_bit_depth_t bit_depth_; int mask_; }; +#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__) +#pragma optimize("", on) +#endif class FwdTrans8x8DCT : public FwdTrans8x8TestBase, public ::testing::TestWithParam { From 95b56ab7df669ec5dd29c283fa5bf6d38c2df5d1 Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 23 May 2023 15:49:29 -0700 Subject: [PATCH 0806/1000] fdct_partial_neon.c: work around VS2022 Arm64 issue cl.exe targeting AArch64 with optimizations enabled will fail with an internal compiler error. See: https://developercommunity.visualstudio.com/t/Compiler-crash-C1001-when-building-a-for/10346110 Bug: b/277255076 Bug: webm:1788 Change-Id: I55caf34e910dab47a7775f07280677cdfe606f5b --- vpx_dsp/arm/fdct_partial_neon.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/vpx_dsp/arm/fdct_partial_neon.c b/vpx_dsp/arm/fdct_partial_neon.c index 718dba0d91..df0da543ce 100644 --- a/vpx_dsp/arm/fdct_partial_neon.c +++ b/vpx_dsp/arm/fdct_partial_neon.c @@ -37,6 +37,15 @@ void vpx_fdct4x4_1_neon(const int16_t *input, tran_low_t *output, int stride) { output[1] = 0; } +// Visual Studio 2022 (cl.exe) targeting AArch64 with optimizations enabled +// will fail with an internal compiler error. +// See: +// https://developercommunity.visualstudio.com/t/Compiler-crash-C1001-when-building-a-for/10346110 +// TODO(jzern): check the compiler version after a fix for the issue is +// released. +#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__) +#pragma optimize("", off) +#endif void vpx_fdct8x8_1_neon(const int16_t *input, tran_low_t *output, int stride) { int r; int16x8_t sum = vld1q_s16(&input[0]); @@ -49,6 +58,9 @@ void vpx_fdct8x8_1_neon(const int16_t *input, tran_low_t *output, int stride) { output[0] = (tran_low_t)horizontal_add_int16x8(sum); output[1] = 0; } +#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__) +#pragma optimize("", on) +#endif void vpx_fdct16x16_1_neon(const int16_t *input, tran_low_t *output, int stride) { From 25f2e1ef255e89d5e7357aa2427926776327765a Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 23 May 2023 15:50:10 -0700 Subject: [PATCH 0807/1000] vpx_dsp_common.h,clip_pixel: work around VS2022 Arm64 issue cl.exe targeting AArch64 with optimizations enabled produces invalid code for clip_pixel() when the return type is uint8_t. See: https://developercommunity.visualstudio.com/t/Misoptimization-for-ARM64-in-VS-2022-17/10363361 Bug: b/277255076 Bug: webm:1788 Change-Id: Ia3647698effd34f1cf196cd33fa4a8cab9fa53d6 --- vpx_dsp/vpx_dsp_common.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/vpx_dsp/vpx_dsp_common.h b/vpx_dsp/vpx_dsp_common.h index 2de4495465..4b946d7560 100644 --- a/vpx_dsp/vpx_dsp_common.h +++ b/vpx_dsp/vpx_dsp_common.h @@ -45,9 +45,21 @@ typedef int16_t tran_low_t; typedef int16_t tran_coef_t; +// Visual Studio 2022 (cl.exe) targeting AArch64 with optimizations enabled +// produces invalid code for clip_pixel() when the return type is uint8_t. +// See: +// https://developercommunity.visualstudio.com/t/Misoptimization-for-ARM64-in-VS-2022-17/10363361 +// TODO(jzern): check the compiler version after a fix for the issue is +// released. +#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__) +static INLINE int clip_pixel(int val) { + return (val > 255) ? 255 : (val < 0) ? 0 : val; +} +#else static INLINE uint8_t clip_pixel(int val) { return (val > 255) ? 255 : (val < 0) ? 0 : val; } +#endif static INLINE int clamp(int value, int low, int high) { return value < low ? low : (value > high ? high : value); From c738e87f27ef8e12dd28b9052f446a5f69abf3c9 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Tue, 30 May 2023 15:22:04 +0100 Subject: [PATCH 0808/1000] Optimize Neon implementation of vpx_int_pro_col Use widening pairwise addition instructions to halve the number of additions required. Change-Id: I0307a3b65e50d2b1ae582938bc5df9c2b21df734 --- vpx_dsp/arm/avg_neon.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/vpx_dsp/arm/avg_neon.c b/vpx_dsp/arm/avg_neon.c index 8c61fc26f4..2fe65d1129 100644 --- a/vpx_dsp/arm/avg_neon.c +++ b/vpx_dsp/arm/avg_neon.c @@ -121,17 +121,17 @@ void vpx_int_pro_row_neon(int16_t hbuf[16], uint8_t const *ref, } int16_t vpx_int_pro_col_neon(uint8_t const *ref, const int width) { + uint16x8_t sum; int i; - uint16x8_t vec_sum = vdupq_n_u16(0); - for (i = 0; i < width; i += 16) { - const uint8x16_t vec_row = vld1q_u8(ref); - vec_sum = vaddw_u8(vec_sum, vget_low_u8(vec_row)); - vec_sum = vaddw_u8(vec_sum, vget_high_u8(vec_row)); - ref += 16; + assert(width >= 16 && width % 16 == 0); + + sum = vpaddlq_u8(vld1q_u8(ref)); + for (i = 16; i < width; i += 16) { + sum = vpadalq_u8(sum, vld1q_u8(ref + i)); } - return (int16_t)horizontal_add_uint16x8(vec_sum); + return (int16_t)horizontal_add_uint16x8(sum); } // ref, src = [0, 510] - max diff = 16-bits From c36aa2e9c4a610dd7f5467126c894ac4dcbded02 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Tue, 30 May 2023 17:31:18 +0100 Subject: [PATCH 0809/1000] Optimize Neon implementation of vpx_int_pro_row Double the number of accumulator registers to remove the bottleneck. Also peel the first loop iteration. Change-Id: I6a90680369f9c33cdfe14ea547ac1569ec3f50de --- vpx_dsp/arm/avg_neon.c | 89 +++++++++++++++++++++--------------------- 1 file changed, 44 insertions(+), 45 deletions(-) diff --git a/vpx_dsp/arm/avg_neon.c b/vpx_dsp/arm/avg_neon.c index 2fe65d1129..22164242c5 100644 --- a/vpx_dsp/arm/avg_neon.c +++ b/vpx_dsp/arm/avg_neon.c @@ -70,54 +70,53 @@ int vpx_satd_neon(const tran_low_t *coeff, int length) { void vpx_int_pro_row_neon(int16_t hbuf[16], uint8_t const *ref, const int ref_stride, const int height) { int i; - uint16x8_t vec_sum_lo = vdupq_n_u16(0); - uint16x8_t vec_sum_hi = vdupq_n_u16(0); - const int shift_factor = ((height >> 5) + 3) * -1; - const int16x8_t vec_shift = vdupq_n_s16(shift_factor); - - for (i = 0; i < height; i += 8) { - const uint8x16_t vec_row1 = vld1q_u8(ref); - const uint8x16_t vec_row2 = vld1q_u8(ref + ref_stride); - const uint8x16_t vec_row3 = vld1q_u8(ref + ref_stride * 2); - const uint8x16_t vec_row4 = vld1q_u8(ref + ref_stride * 3); - const uint8x16_t vec_row5 = vld1q_u8(ref + ref_stride * 4); - const uint8x16_t vec_row6 = vld1q_u8(ref + ref_stride * 5); - const uint8x16_t vec_row7 = vld1q_u8(ref + ref_stride * 6); - const uint8x16_t vec_row8 = vld1q_u8(ref + ref_stride * 7); - - vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row1)); - vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row1)); - - vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row2)); - vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row2)); - - vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row3)); - vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row3)); - - vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row4)); - vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row4)); - - vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row5)); - vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row5)); - - vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row6)); - vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row6)); - - vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row7)); - vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row7)); - - vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row8)); - vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row8)); - - ref += ref_stride * 8; + uint8x16_t r0, r1, r2, r3; + uint16x8_t sum_lo[2], sum_hi[2]; + uint16x8_t tmp_lo[2], tmp_hi[2]; + int16x8_t avg_lo, avg_hi; + + const int norm_factor = (height >> 5) + 3; + const int16x8_t neg_norm_factor = vdupq_n_s16(-norm_factor); + + assert(height >= 4 && height % 4 == 0); + + r0 = vld1q_u8(ref + 0 * ref_stride); + r1 = vld1q_u8(ref + 1 * ref_stride); + r2 = vld1q_u8(ref + 2 * ref_stride); + r3 = vld1q_u8(ref + 3 * ref_stride); + + sum_lo[0] = vaddl_u8(vget_low_u8(r0), vget_low_u8(r1)); + sum_hi[0] = vaddl_u8(vget_high_u8(r0), vget_high_u8(r1)); + sum_lo[1] = vaddl_u8(vget_low_u8(r2), vget_low_u8(r3)); + sum_hi[1] = vaddl_u8(vget_high_u8(r2), vget_high_u8(r3)); + + for (i = 4; i < height; i += 4) { + r0 = vld1q_u8(ref + 0 * ref_stride); + r1 = vld1q_u8(ref + 1 * ref_stride); + r2 = vld1q_u8(ref + 2 * ref_stride); + r3 = vld1q_u8(ref + 3 * ref_stride); + + tmp_lo[0] = vaddl_u8(vget_low_u8(r0), vget_low_u8(r1)); + tmp_hi[0] = vaddl_u8(vget_high_u8(r0), vget_high_u8(r1)); + tmp_lo[1] = vaddl_u8(vget_low_u8(r2), vget_low_u8(r3)); + tmp_hi[1] = vaddl_u8(vget_high_u8(r2), vget_high_u8(r3)); + + sum_lo[0] = vaddq_u16(sum_lo[0], tmp_lo[0]); + sum_hi[0] = vaddq_u16(sum_hi[0], tmp_hi[0]); + sum_lo[1] = vaddq_u16(sum_lo[1], tmp_lo[1]); + sum_hi[1] = vaddq_u16(sum_hi[1], tmp_hi[1]); + + ref += 4 * ref_stride; } - vec_sum_lo = vshlq_u16(vec_sum_lo, vec_shift); - vec_sum_hi = vshlq_u16(vec_sum_hi, vec_shift); + sum_lo[0] = vaddq_u16(sum_lo[0], sum_lo[1]); + sum_hi[0] = vaddq_u16(sum_hi[0], sum_hi[1]); + + avg_lo = vshlq_s16(vreinterpretq_s16_u16(sum_lo[0]), neg_norm_factor); + avg_hi = vshlq_s16(vreinterpretq_s16_u16(sum_hi[0]), neg_norm_factor); - vst1q_s16(hbuf, vreinterpretq_s16_u16(vec_sum_lo)); - hbuf += 8; - vst1q_s16(hbuf, vreinterpretq_s16_u16(vec_sum_hi)); + vst1q_s16(hbuf, avg_lo); + vst1q_s16(hbuf + 8, avg_hi); } int16_t vpx_int_pro_col_neon(uint8_t const *ref, const int width) { From 1aff4a5655855c20b6766a91b9dacc8d78279c92 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Fri, 26 May 2023 12:02:36 -0400 Subject: [PATCH 0810/1000] Trim tpl stats by 2 extra frames Not applicable to the last GOP. Bug: b/284162396 Change-Id: I55b7e04e9fc4b68a08ce3e00b10743823c828954 --- vp9/encoder/vp9_tpl_model.c | 44 ++++++++++++++++++++++++++++++++++--- 1 file changed, 41 insertions(+), 3 deletions(-) diff --git a/vp9/encoder/vp9_tpl_model.c b/vp9/encoder/vp9_tpl_model.c index 9f4bafdf83..8d203bbf4f 100644 --- a/vp9/encoder/vp9_tpl_model.c +++ b/vp9/encoder/vp9_tpl_model.c @@ -20,8 +20,8 @@ #include "vp9/encoder/vp9_encoder.h" #include "vp9/encoder/vp9_tpl_model.h" -static void init_gop_frames(VP9_COMP *cpi, GF_PICTURE *gf_picture, - const GF_GROUP *gf_group, int *tpl_group_frames) { +static int init_gop_frames(VP9_COMP *cpi, GF_PICTURE *gf_picture, + const GF_GROUP *gf_group, int *tpl_group_frames) { VP9_COMMON *cm = &cpi->common; int frame_idx = 0; int i; @@ -148,6 +148,8 @@ static void init_gop_frames(VP9_COMP *cpi, GF_PICTURE *gf_picture, ++extend_frame_count; ++frame_gop_offset; } + + return extend_frame_count; } static void init_tpl_stats(VP9_COMP *cpi) { @@ -1245,6 +1247,35 @@ static void mc_flow_dispenser(VP9_COMP *cpi, GF_PICTURE *gf_picture, } } +static void trim_tpl_stats(struct vpx_internal_error_info *error_info, + VpxTplGopStats *tpl_gop_stats, int extra_frames) { + int i; + VpxTplFrameStats *new_frame_stats; + const int new_size = tpl_gop_stats->size - extra_frames; + if (tpl_gop_stats->size <= extra_frames) + vpx_internal_error( + error_info, VPX_CODEC_ERROR, + "The number of frames in VpxTplGopStats is fewer than expected."); + CHECK_MEM_ERROR(error_info, new_frame_stats, + vpx_calloc(new_size, sizeof(*new_frame_stats))); + for (i = 0; i < new_size; i++) { + VpxTplFrameStats *frame_stats = &tpl_gop_stats->frame_stats_list[i]; + const int num_blocks = frame_stats->num_blocks; + new_frame_stats[i].num_blocks = frame_stats->num_blocks; + new_frame_stats[i].frame_width = frame_stats->frame_width; + new_frame_stats[i].frame_height = frame_stats->frame_height; + new_frame_stats[i].num_blocks = num_blocks; + CHECK_MEM_ERROR( + error_info, new_frame_stats[i].block_stats_list, + vpx_calloc(num_blocks, sizeof(*new_frame_stats[i].block_stats_list))); + memcpy(new_frame_stats[i].block_stats_list, frame_stats->block_stats_list, + num_blocks * sizeof(*new_frame_stats[i].block_stats_list)); + } + free_tpl_frame_stats_list(tpl_gop_stats); + tpl_gop_stats->size = new_size; + tpl_gop_stats->frame_stats_list = new_frame_stats; +} + #if CONFIG_NON_GREEDY_MV #define DUMP_TPL_STATS 0 #if DUMP_TPL_STATS @@ -1456,9 +1487,11 @@ void vp9_setup_tpl_stats(VP9_COMP *cpi) { const GF_GROUP *gf_group = &cpi->twopass.gf_group; int tpl_group_frames = 0; int frame_idx; + int extended_frame_count; cpi->tpl_bsize = BLOCK_32X32; - init_gop_frames(cpi, gf_picture, gf_group, &tpl_group_frames); + extended_frame_count = + init_gop_frames(cpi, gf_picture, gf_group, &tpl_group_frames); init_tpl_stats(cpi); @@ -1470,6 +1503,11 @@ void vp9_setup_tpl_stats(VP9_COMP *cpi) { if (gf_picture[frame_idx].update_type == USE_BUF_FRAME) continue; mc_flow_dispenser(cpi, gf_picture, frame_idx, cpi->tpl_bsize); } + + // TPL stats has extra frames from next GOP. Trim those extra frames for + // Qmode. + trim_tpl_stats(&cpi->common.error, &cpi->tpl_gop_stats, extended_frame_count); + #if CONFIG_NON_GREEDY_MV cpi->tpl_ready = 1; #if DUMP_TPL_STATS From 7b66c730a2edd3e232dce5e8ef2522ff83928a90 Mon Sep 17 00:00:00 2001 From: Deepa K G Date: Mon, 24 Apr 2023 15:56:18 +0530 Subject: [PATCH 0811/1000] Fix c vs avx mismatch of diamond_search_sad() In the function vp9_diamond_search_sad_avx(), arranged the cost vector in a specific order. This ensures that the motion vector with the least index is selected, when there exists more than one candidate motion vector with the minimum cost, thus resolving the c vs avx mismatch. STATS_CHANGED Change-Id: I4f8864f464f9ea2aae6250db3d8ad91cb08b26e2 --- vp9/encoder/x86/vp9_diamond_search_sad_avx.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vp9/encoder/x86/vp9_diamond_search_sad_avx.c b/vp9/encoder/x86/vp9_diamond_search_sad_avx.c index 80442e3594..c00579edc0 100644 --- a/vp9/encoder/x86/vp9_diamond_search_sad_avx.c +++ b/vp9/encoder/x86/vp9_diamond_search_sad_avx.c @@ -233,8 +233,9 @@ int vp9_diamond_search_sad_avx(const MACROBLOCK *x, if (UNLIKELY(local_best_sad == 0xffff)) { __m128i v_loval_d, v_hival_d, v_loidx_d, v_hiidx_d, v_sel_d; - v_loval_d = v_sad_d; - v_loidx_d = _mm_set_epi32(3, 2, 1, 0); + v_loval_d = _mm_shuffle_epi32(v_sad_d, 0xd8); + v_loidx_d = _mm_set_epi32(3, 1, 2, 0); + v_hival_d = _mm_srli_si128(v_loval_d, 8); v_hiidx_d = _mm_srli_si128(v_loidx_d, 8); From e510716d7e9a0a34592eb8ff1f8a65b951fe2eeb Mon Sep 17 00:00:00 2001 From: Deepa K G Date: Tue, 6 Jun 2023 11:38:09 +0530 Subject: [PATCH 0812/1000] Add comments in vp9_diamond_search_sad_avx() Added comments related to re-arranging the elements of the SAD vector to find the minimum. Change-Id: I58b702d304a6cdd32f04775fba603e39c19a8947 --- vp9/encoder/x86/vp9_diamond_search_sad_avx.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/vp9/encoder/x86/vp9_diamond_search_sad_avx.c b/vp9/encoder/x86/vp9_diamond_search_sad_avx.c index c00579edc0..63c35df09e 100644 --- a/vp9/encoder/x86/vp9_diamond_search_sad_avx.c +++ b/vp9/encoder/x86/vp9_diamond_search_sad_avx.c @@ -233,12 +233,19 @@ int vp9_diamond_search_sad_avx(const MACROBLOCK *x, if (UNLIKELY(local_best_sad == 0xffff)) { __m128i v_loval_d, v_hival_d, v_loidx_d, v_hiidx_d, v_sel_d; + // Re-arrange the values in v_sad_d as follows: + // v_loval_d[0] = v_sad_d[0], v_loval_d[1] = v_sad_d[2] + // v_loval_d[2] = v_sad_d[1], v_loval_d[3] = v_sad_d[3] + // v_loidx_d stores the corresponding indices 0, 2, 1, 3 + // This re-arrangement is required to ensure that when there exists + // more than one minimum, the one with the least index is selected v_loval_d = _mm_shuffle_epi32(v_sad_d, 0xd8); v_loidx_d = _mm_set_epi32(3, 1, 2, 0); v_hival_d = _mm_srli_si128(v_loval_d, 8); v_hiidx_d = _mm_srli_si128(v_loidx_d, 8); + // Compare if v_sad_d[1] < v_sad_d[0], v_sad_d[3] < v_sad_d[2] v_sel_d = _mm_cmplt_epi32(v_hival_d, v_loval_d); v_loval_d = _mm_blendv_epi8(v_loval_d, v_hival_d, v_sel_d); @@ -246,6 +253,7 @@ int vp9_diamond_search_sad_avx(const MACROBLOCK *x, v_hival_d = _mm_srli_si128(v_loval_d, 4); v_hiidx_d = _mm_srli_si128(v_loidx_d, 4); + // min(v_sad_d[2], v_sad_d[3]) < min(v_sad_d[0], v_sad_d[1]) v_sel_d = _mm_cmplt_epi32(v_hival_d, v_loval_d); v_loval_d = _mm_blendv_epi8(v_loval_d, v_hival_d, v_sel_d); From bcd491a6be2b163c7293674dd91d8ca1f4cb56f0 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Wed, 7 Jun 2023 12:21:38 -0400 Subject: [PATCH 0813/1000] Fix a few typos segement -> segment dont -> don't useage -> usage devide -> divide Bug: webm:1803 Change-Id: I0153380b0003825c4b62cf323d4f2bc837c8a264 --- test/set_roi.cc | 2 +- vp8/common/blockd.h | 2 +- vp8/common/vp8_loopfilter.c | 2 +- vp8/decoder/decodeframe.c | 6 +++--- vp8/decoder/threading.c | 2 +- vp8/encoder/bitstream.c | 2 +- vp8/encoder/ethreading.c | 2 +- vp8/encoder/firstpass.c | 6 +++--- vp8/encoder/onyx_if.c | 14 ++++++------ vp8/encoder/ratectrl.c | 38 ++++++++++++++++----------------- vp8/encoder/rdopt.c | 2 +- vp8/encoder/segmentation.c | 4 ++-- vp8/encoder/segmentation.h | 4 ++-- vp8/encoder/vp8_quantize.c | 2 +- vp9/decoder/vp9_decodemv.c | 2 +- vp9/encoder/vp9_aq_complexity.c | 2 +- vp9/encoder/vp9_firstpass.c | 16 +++++++------- vp9/encoder/vp9_ratectrl.c | 4 ++-- vp9/encoder/vp9_rdopt.c | 2 +- 19 files changed, 57 insertions(+), 57 deletions(-) diff --git a/test/set_roi.cc b/test/set_roi.cc index 167cf908fd..693410e391 100644 --- a/test/set_roi.cc +++ b/test/set_roi.cc @@ -40,7 +40,7 @@ TEST(VP8RoiMapTest, ParameterCheck) { // Initialize elements of cpi with valid defaults. VP8_COMP cpi; - cpi.mb.e_mbd.mb_segement_abs_delta = SEGMENT_DELTADATA; + cpi.mb.e_mbd.mb_segment_abs_delta = SEGMENT_DELTADATA; cpi.cyclic_refresh_mode_enabled = 0; cpi.mb.e_mbd.segmentation_enabled = 0; cpi.mb.e_mbd.update_mb_segmentation_map = 0; diff --git a/vp8/common/blockd.h b/vp8/common/blockd.h index 405443449d..8300aad941 100644 --- a/vp8/common/blockd.h +++ b/vp8/common/blockd.h @@ -251,7 +251,7 @@ typedef struct macroblockd { unsigned char update_mb_segmentation_data; /* 0 (do not update) 1 (update) the macroblock segmentation feature data. */ - unsigned char mb_segement_abs_delta; + unsigned char mb_segment_abs_delta; /* Per frame flags that define which MB level features (such as quantizer or * loop filter level) */ diff --git a/vp8/common/vp8_loopfilter.c b/vp8/common/vp8_loopfilter.c index 9c9e5f351b..4576c18537 100644 --- a/vp8/common/vp8_loopfilter.c +++ b/vp8/common/vp8_loopfilter.c @@ -111,7 +111,7 @@ void vp8_loop_filter_frame_init(VP8_COMMON *cm, MACROBLOCKD *mbd, /* Note the baseline filter values for each segment */ if (mbd->segmentation_enabled) { - if (mbd->mb_segement_abs_delta == SEGMENT_ABSDATA) { + if (mbd->mb_segment_abs_delta == SEGMENT_ABSDATA) { lvl_seg = mbd->segment_feature_data[MB_LVL_ALT_LF][seg]; } else { /* Delta Value */ lvl_seg += mbd->segment_feature_data[MB_LVL_ALT_LF][seg]; diff --git a/vp8/decoder/decodeframe.c b/vp8/decoder/decodeframe.c index 1c1566766b..d014cf9667 100644 --- a/vp8/decoder/decodeframe.c +++ b/vp8/decoder/decodeframe.c @@ -63,7 +63,7 @@ void vp8_mb_init_dequantizer(VP8D_COMP *pbi, MACROBLOCKD *xd) { /* Decide whether to use the default or alternate baseline Q value. */ if (xd->segmentation_enabled) { /* Abs Value */ - if (xd->mb_segement_abs_delta == SEGMENT_ABSDATA) { + if (xd->mb_segment_abs_delta == SEGMENT_ABSDATA) { QIndex = xd->segment_feature_data[MB_LVL_ALT_Q][mbmi->segment_id]; /* Delta Value */ @@ -829,7 +829,7 @@ static void init_frame(VP8D_COMP *pbi) { /* reset the segment feature data to 0 with delta coding (Default state). */ memset(xd->segment_feature_data, 0, sizeof(xd->segment_feature_data)); - xd->mb_segement_abs_delta = SEGMENT_DELTADATA; + xd->mb_segment_abs_delta = SEGMENT_DELTADATA; /* reset the mode ref deltasa for loop filter */ memset(xd->ref_lf_deltas, 0, sizeof(xd->ref_lf_deltas)); @@ -995,7 +995,7 @@ int vp8_decode_frame(VP8D_COMP *pbi) { xd->update_mb_segmentation_data = (unsigned char)vp8_read_bit(bc); if (xd->update_mb_segmentation_data) { - xd->mb_segement_abs_delta = (unsigned char)vp8_read_bit(bc); + xd->mb_segment_abs_delta = (unsigned char)vp8_read_bit(bc); memset(xd->segment_feature_data, 0, sizeof(xd->segment_feature_data)); diff --git a/vp8/decoder/threading.c b/vp8/decoder/threading.c index 9ea6a4f34a..6ccb080cf9 100644 --- a/vp8/decoder/threading.c +++ b/vp8/decoder/threading.c @@ -56,7 +56,7 @@ static void setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd, mbd->dst = xd->dst; mbd->segmentation_enabled = xd->segmentation_enabled; - mbd->mb_segement_abs_delta = xd->mb_segement_abs_delta; + mbd->mb_segment_abs_delta = xd->mb_segment_abs_delta; memcpy(mbd->segment_feature_data, xd->segment_feature_data, sizeof(xd->segment_feature_data)); diff --git a/vp8/encoder/bitstream.c b/vp8/encoder/bitstream.c index 190b013afd..03691fc9d1 100644 --- a/vp8/encoder/bitstream.c +++ b/vp8/encoder/bitstream.c @@ -1080,7 +1080,7 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, if (xd->update_mb_segmentation_data) { signed char Data; - vp8_write_bit(bc, xd->mb_segement_abs_delta); + vp8_write_bit(bc, xd->mb_segment_abs_delta); /* For each segmentation feature (Quant and loop filter level) */ for (i = 0; i < MB_LVL_MAX; ++i) { diff --git a/vp8/encoder/ethreading.c b/vp8/encoder/ethreading.c index 2583cb0ac3..b7f1932c58 100644 --- a/vp8/encoder/ethreading.c +++ b/vp8/encoder/ethreading.c @@ -402,7 +402,7 @@ static void setup_mbby_copy(MACROBLOCK *mbdst, MACROBLOCK *mbsrc) { zd->subpixel_predict8x8 = xd->subpixel_predict8x8; zd->subpixel_predict16x16 = xd->subpixel_predict16x16; zd->segmentation_enabled = xd->segmentation_enabled; - zd->mb_segement_abs_delta = xd->mb_segement_abs_delta; + zd->mb_segment_abs_delta = xd->mb_segment_abs_delta; memcpy(zd->segment_feature_data, xd->segment_feature_data, sizeof(xd->segment_feature_data)); diff --git a/vp8/encoder/firstpass.c b/vp8/encoder/firstpass.c index 4149fb4bf8..0abd178a61 100644 --- a/vp8/encoder/firstpass.c +++ b/vp8/encoder/firstpass.c @@ -1788,7 +1788,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { /* Should we use the alternate refernce frame */ if (allow_alt_ref && (i >= MIN_GF_INTERVAL) && - /* dont use ARF very near next kf */ + /* don't use ARF very near next kf */ (i <= (cpi->twopass.frames_to_key - MIN_GF_INTERVAL)) && #if NEW_BOOST ((next_frame.pcnt_inter > 0.75) || (next_frame.pcnt_second_ref > 0.5)) && @@ -2123,7 +2123,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { if (cpi->twopass.gf_group_bits < 0) cpi->twopass.gf_group_bits = 0; /* This condition could fail if there are two kfs very close together - * despite (MIN_GF_INTERVAL) and would cause a devide by 0 in the + * despite (MIN_GF_INTERVAL) and would cause a divide by 0 in the * calculation of cpi->twopass.alt_extra_bits. */ if (cpi->baseline_gf_interval >= 3) { @@ -2393,7 +2393,7 @@ void vp8_second_pass(VP8_COMP *cpi) { } /* The last few frames of a clip almost always have to few or too many - * bits and for the sake of over exact rate control we dont want to make + * bits and for the sake of over exact rate control we don't want to make * radical adjustments to the allowed quantizer range just to use up a * few surplus bits or get beneath the target rate. */ diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index 8941329419..e78743e496 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -488,7 +488,7 @@ static void set_segmentation_map(VP8_COMP *cpi, */ static void set_segment_data(VP8_COMP *cpi, signed char *feature_data, unsigned char abs_delta) { - cpi->mb.e_mbd.mb_segement_abs_delta = abs_delta; + cpi->mb.e_mbd.mb_segment_abs_delta = abs_delta; memcpy(cpi->segment_feature_data, feature_data, sizeof(cpi->segment_feature_data)); } @@ -2751,7 +2751,7 @@ static int decide_key_frame(VP8_COMP *cpi) { } /* in addition if the following are true and this is not a golden frame * then code a key frame Note that on golden frames there often seems - * to be a pop in intra useage anyway hence this restriction is + * to be a pop in intra usage anyway hence this restriction is * designed to prevent spurious key frames. The Intra pop needs to be * investigated. */ @@ -3637,7 +3637,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, Q = cpi->avg_frame_qindex; } - /* For constrained quality dont allow Q less than the cq level */ + /* For constrained quality don't allow Q less than the cq level */ if ((cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) && (Q < cpi->cq_target_quality)) { Q = cpi->cq_target_quality; @@ -3664,7 +3664,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, } else { cpi->active_best_quality = inter_minq[Q]; - /* For the constant/constrained quality mode we dont want + /* For the constant/constrained quality mode we don't want * q to fall below the cq level. */ if ((cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) && @@ -3685,7 +3685,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, * higher quality on the frames to prevent bits just going to waste. */ if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) { - /* Note that the use of >= here elliminates the risk of a devide + /* Note that the use of >= here elliminates the risk of a divide * by 0 error in the else if clause */ if (cpi->buffer_level >= cpi->oxcf.maximum_buffer_size) { @@ -4322,12 +4322,12 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, vp8_cal_dissimilarity(cpi); #endif - /* Update the GF useage maps. + /* Update the GF usage maps. * This is done after completing the compression of a frame when all * modes etc. are finalized but before loop filter */ if (cpi->oxcf.number_of_layers == 1) { - vp8_update_gf_useage_maps(cpi, cm, &cpi->mb); + vp8_update_gf_usage_maps(cpi, cm, &cpi->mb); } if (cm->frame_type == KEY_FRAME) cm->refresh_last_frame = 1; diff --git a/vp8/encoder/ratectrl.c b/vp8/encoder/ratectrl.c index 9cd3963e22..49ab4aa238 100644 --- a/vp8/encoder/ratectrl.c +++ b/vp8/encoder/ratectrl.c @@ -388,7 +388,7 @@ static void calc_gf_params(VP8_COMP *cpi) { (cpi->oxcf.fixed_q < 0) ? cpi->last_q[INTER_FRAME] : cpi->oxcf.fixed_q; int Boost = 0; - int gf_frame_useage = 0; /* Golden frame useage since last GF */ + int gf_frame_usage = 0; /* Golden frame usage since last GF */ int tot_mbs = cpi->recent_ref_frame_usage[INTRA_FRAME] + cpi->recent_ref_frame_usage[LAST_FRAME] + cpi->recent_ref_frame_usage[GOLDEN_FRAME] + @@ -398,12 +398,12 @@ static void calc_gf_params(VP8_COMP *cpi) { (cpi->common.mb_rows * cpi->common.mb_cols); if (tot_mbs) { - gf_frame_useage = (cpi->recent_ref_frame_usage[GOLDEN_FRAME] + - cpi->recent_ref_frame_usage[ALTREF_FRAME]) * - 100 / tot_mbs; + gf_frame_usage = (cpi->recent_ref_frame_usage[GOLDEN_FRAME] + + cpi->recent_ref_frame_usage[ALTREF_FRAME]) * + 100 / tot_mbs; } - if (pct_gf_active > gf_frame_useage) gf_frame_useage = pct_gf_active; + if (pct_gf_active > gf_frame_usage) gf_frame_usage = pct_gf_active; /* Not two pass */ if (cpi->pass != 2) { @@ -467,7 +467,7 @@ static void calc_gf_params(VP8_COMP *cpi) { /* Adjust boost based upon ambient Q */ Boost = GFQ_ADJUSTMENT; - /* Adjust based upon most recently measure intra useage */ + /* Adjust based upon most recently measure intra usage */ Boost = Boost * gf_intra_usage_adjustment[(cpi->this_frame_percent_intra < 15) ? cpi->this_frame_percent_intra @@ -475,7 +475,7 @@ static void calc_gf_params(VP8_COMP *cpi) { 100; /* Adjust gf boost based upon GF usage since last GF */ - Boost = Boost * gf_adjust_table[gf_frame_useage] / 100; + Boost = Boost * gf_adjust_table[gf_frame_usage] / 100; #endif } @@ -516,8 +516,8 @@ static void calc_gf_params(VP8_COMP *cpi) { if (cpi->last_boost >= 1500) cpi->frames_till_gf_update_due++; - if (gf_interval_table[gf_frame_useage] > cpi->frames_till_gf_update_due) { - cpi->frames_till_gf_update_due = gf_interval_table[gf_frame_useage]; + if (gf_interval_table[gf_frame_usage] > cpi->frames_till_gf_update_due) { + cpi->frames_till_gf_update_due = gf_interval_table[gf_frame_usage]; } if (cpi->frames_till_gf_update_due > cpi->max_gf_interval) { @@ -895,7 +895,7 @@ static void calc_pframe_target_size(VP8_COMP *cpi) { int Q = (cpi->oxcf.fixed_q < 0) ? cpi->last_q[INTER_FRAME] : cpi->oxcf.fixed_q; - int gf_frame_useage = 0; /* Golden frame useage since last GF */ + int gf_frame_usage = 0; /* Golden frame usage since last GF */ int tot_mbs = cpi->recent_ref_frame_usage[INTRA_FRAME] + cpi->recent_ref_frame_usage[LAST_FRAME] + cpi->recent_ref_frame_usage[GOLDEN_FRAME] + @@ -905,20 +905,20 @@ static void calc_pframe_target_size(VP8_COMP *cpi) { (cpi->common.mb_rows * cpi->common.mb_cols); if (tot_mbs) { - gf_frame_useage = (cpi->recent_ref_frame_usage[GOLDEN_FRAME] + - cpi->recent_ref_frame_usage[ALTREF_FRAME]) * - 100 / tot_mbs; + gf_frame_usage = (cpi->recent_ref_frame_usage[GOLDEN_FRAME] + + cpi->recent_ref_frame_usage[ALTREF_FRAME]) * + 100 / tot_mbs; } - if (pct_gf_active > gf_frame_useage) gf_frame_useage = pct_gf_active; + if (pct_gf_active > gf_frame_usage) gf_frame_usage = pct_gf_active; /* Is a fixed manual GF frequency being used */ if (cpi->auto_gold) { - /* For one pass throw a GF if recent frame intra useage is - * low or the GF useage is high + /* For one pass throw a GF if recent frame intra usage is + * low or the GF usage is high */ if ((cpi->pass == 0) && - (cpi->this_frame_percent_intra < 15 || gf_frame_useage >= 5)) { + (cpi->this_frame_percent_intra < 15 || gf_frame_usage >= 5)) { cpi->common.refresh_golden_frame = 1; /* Two pass GF descision */ @@ -933,10 +933,10 @@ static void calc_pframe_target_size(VP8_COMP *cpi) { if (0) { FILE *f; - f = fopen("gf_useaget.stt", "a"); + f = fopen("gf_usaget.stt", "a"); fprintf(f, " %8ld %10ld %10ld %10ld %10ld\n", cpi->common.current_video_frame, cpi->gfu_boost, - GFQ_ADJUSTMENT, cpi->gfu_boost, gf_frame_useage); + GFQ_ADJUSTMENT, cpi->gfu_boost, gf_frame_usage); fclose(f); } diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c index bbddacf8f0..7cd42d107c 100644 --- a/vp8/encoder/rdopt.c +++ b/vp8/encoder/rdopt.c @@ -1979,7 +1979,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, rd.distortion2 += distortion; /* If even the 'Y' rd value of split is higher than best so far - * then dont bother looking at UV + * then don't bother looking at UV */ if (tmp_rd < best_mode.yrd) { /* Now work out UV cost and add it in */ diff --git a/vp8/encoder/segmentation.c b/vp8/encoder/segmentation.c index dcb68119e1..2127258111 100644 --- a/vp8/encoder/segmentation.c +++ b/vp8/encoder/segmentation.c @@ -11,7 +11,7 @@ #include "segmentation.h" #include "vpx_mem/vpx_mem.h" -void vp8_update_gf_useage_maps(VP8_COMP *cpi, VP8_COMMON *cm, MACROBLOCK *x) { +void vp8_update_gf_usage_maps(VP8_COMP *cpi, VP8_COMMON *cm, MACROBLOCK *x) { int mb_row, mb_col; MODE_INFO *this_mb_mode_info = cm->mi; @@ -19,7 +19,7 @@ void vp8_update_gf_useage_maps(VP8_COMP *cpi, VP8_COMMON *cm, MACROBLOCK *x) { x->gf_active_ptr = (signed char *)cpi->gf_active_flags; if ((cm->frame_type == KEY_FRAME) || (cm->refresh_golden_frame)) { - /* Reset Gf useage monitors */ + /* Reset Gf usage monitors */ memset(cpi->gf_active_flags, 1, (cm->mb_rows * cm->mb_cols)); cpi->gf_active_count = cm->mb_rows * cm->mb_cols; } else { diff --git a/vp8/encoder/segmentation.h b/vp8/encoder/segmentation.h index 4ddbdbbd26..0fecfc2212 100644 --- a/vp8/encoder/segmentation.h +++ b/vp8/encoder/segmentation.h @@ -19,8 +19,8 @@ extern "C" { #endif -extern void vp8_update_gf_useage_maps(VP8_COMP *cpi, VP8_COMMON *cm, - MACROBLOCK *x); +extern void vp8_update_gf_usage_maps(VP8_COMP *cpi, VP8_COMMON *cm, + MACROBLOCK *x); #ifdef __cplusplus } // extern "C" diff --git a/vp8/encoder/vp8_quantize.c b/vp8/encoder/vp8_quantize.c index 5b89555108..8e5e318241 100644 --- a/vp8/encoder/vp8_quantize.c +++ b/vp8/encoder/vp8_quantize.c @@ -294,7 +294,7 @@ void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x, int ok_to_skip) { /* Select the baseline MB Q index. */ if (xd->segmentation_enabled) { /* Abs Value */ - if (xd->mb_segement_abs_delta == SEGMENT_ABSDATA) { + if (xd->mb_segment_abs_delta == SEGMENT_ABSDATA) { QIndex = xd->segment_feature_data[MB_LVL_ALT_Q] [xd->mode_info_context->mbmi.segment_id]; /* Delta Value */ diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c index db3e746639..22b62e6a2d 100644 --- a/vp9/decoder/vp9_decodemv.c +++ b/vp9/decoder/vp9_decodemv.c @@ -708,7 +708,7 @@ static void read_inter_block_mode_info(VP9Decoder *const pbi, mi->mode = ZEROMV; if (bsize < BLOCK_8X8) { vpx_internal_error(xd->error_info, VPX_CODEC_UNSUP_BITSTREAM, - "Invalid usage of segement feature on small blocks"); + "Invalid usage of segment feature on small blocks"); return; } } else { diff --git a/vp9/encoder/vp9_aq_complexity.c b/vp9/encoder/vp9_aq_complexity.c index bd3812036c..ef3423f8eb 100644 --- a/vp9/encoder/vp9_aq_complexity.c +++ b/vp9/encoder/vp9_aq_complexity.c @@ -87,7 +87,7 @@ void vp9_setup_in_frame_q_adj(VP9_COMP *cpi) { &cpi->rc, cm->frame_type, cm->base_qindex, aq_c_q_adj_factor[aq_strength][segment], cm->bit_depth); - // For AQ complexity mode, we dont allow Q0 in a segment if the base + // For AQ complexity mode, we don't allow Q0 in a segment if the base // Q is not 0. Q0 (lossless) implies 4x4 only and in AQ mode 2 a segment // Q delta is sometimes applied without going back around the rd loop. // This could lead to an illegal combination of partition size and q. diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index 8fdd976816..d97bf2a1c9 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -1700,7 +1700,7 @@ void vp9_init_second_pass(VP9_COMP *cpi) { } // Second scan using clamps based on the previous cycle average. - // This may modify the total and average somewhat but we dont bother with + // This may modify the total and average somewhat but we don't bother with // further itterations. modified_score_total = 0.0; s = twopass->stats_in; @@ -1858,7 +1858,7 @@ static int detect_flash_from_frame_stats(const FIRSTPASS_STATS *frame_stats) { // brief break in prediction (such as a flash) but subsequent frames // are reasonably well predicted by an earlier (pre flash) frame. // The recovery after a flash is indicated by a high pcnt_second_ref - // useage or a second ref coded error notabley lower than the last + // usage or a second ref coded error notabley lower than the last // frame coded error. if (frame_stats == NULL) { return 0; @@ -2169,7 +2169,7 @@ static double calculate_group_score(VP9_COMP *cpi, double av_score, double score_total = 0.0; int i = 0; - // We dont ever want to return a 0 score here. + // We don't ever want to return a 0 score here. if (frame_count == 0) return 1.0; while ((i < frame_count) && (s < twopass->stats_in_end)) { @@ -2597,7 +2597,7 @@ static int get_gop_coding_frame_num( if ( // Don't break out with a very short interval. (gop_coding_frames >= active_gf_interval->min) && - // If possible dont break very close to a kf + // If possible don't break very close to a kf ((rc->frames_to_key - gop_coding_frames) >= rc->min_gf_interval) && (gop_coding_frames & 0x01) && (!flash_detected) && ((mv_ratio_accumulator > mv_ratio_accumulator_thresh) || @@ -3031,7 +3031,7 @@ static int intra_step_transition(const FIRSTPASS_STATS *this_frame, next_frame->intra_error / DOUBLE_DIVIDE_CHECK(next_frame->coded_error); // Return true the intra/inter ratio for the current frame is - // low but better in the next and previous frame and the relative useage of + // low but better in the next and previous frame and the relative usage of // intra in the current frame is markedly higher than the last and next frame. if ((this_ii_ratio < 2.0) && (last_ii_ratio > 2.25) && (next_ii_ratio > 2.25) && (this_pcnt_intra > (3 * last_pcnt_intra)) && @@ -3052,7 +3052,7 @@ static int intra_step_transition(const FIRSTPASS_STATS *this_frame, // Minimum % intra coding observed in first pass (1.0 = 100%) #define MIN_INTRA_LEVEL 0.25 // Threshold for use of the lagging second reference frame. Scene cuts do not -// usually have a high second ref useage. +// usually have a high second ref usage. #define SECOND_REF_USEAGE_THRESH 0.2 // Hard threshold where the first pass chooses intra for almost all blocks. // In such a case even if the frame is not a scene cut coding a key frame @@ -3391,7 +3391,7 @@ static void find_next_key_frame(VP9_COMP *cpi, int kf_show_idx) { twopass->key_frame_section_intra_rating = calculate_section_intra_ratio( start_position, twopass->stats_in_end, rc->frames_to_key); - // Special case for static / slide show content but dont apply + // Special case for static / slide show content but don't apply // if the kf group is very short. if ((zero_motion_accumulator > 0.99) && (rc->frames_to_key > 8)) { rc->kf_boost = (int)(twopass->kf_max_total_boost); @@ -3523,7 +3523,7 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { vp9_init_vizier_params(twopass, screen_area); } - // If this is an arf frame then we dont want to read the stats file or + // If this is an arf frame then we don't want to read the stats file or // advance the input pointer as we already have what we need. if (gf_group->update_type[gf_group->index] == ARF_UPDATE) { int target_rate; diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index 13b43aa63a..4e5fdc6d90 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -1196,7 +1196,7 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const VP9_COMP *cpi, } else { q = rc->avg_frame_qindex[KEY_FRAME]; } - // For constrained quality dont allow Q less than the cq level + // For constrained quality don't allow Q less than the cq level if (oxcf->rc_mode == VPX_CQ) { if (q < cq_level) q = cq_level; @@ -1457,7 +1457,7 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi, int *bottom_index, } else { q = active_worst_quality; } - // For constrained quality dont allow Q less than the cq level + // For constrained quality don't allow Q less than the cq level if (oxcf->rc_mode == VPX_CQ) { if (q < cq_level) q = cq_level; } diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index 464705a678..c0d8b505fe 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -4671,7 +4671,7 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data, if (tmp_best_rdu > 0) { // If even the 'Y' rd value of split is higher than best so far - // then dont bother looking at UV + // then don't bother looking at UV vp9_build_inter_predictors_sbuv(&x->e_mbd, mi_row, mi_col, BLOCK_8X8); memset(x->skip_txfm, SKIP_TXFM_NONE, sizeof(x->skip_txfm)); if (!super_block_uvrd(cpi, x, &rate_uv, &distortion_uv, &uv_skippable, From ad14a32b33ac94e56fdd84ba06c1c9c9c032d004 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Wed, 7 Jun 2023 12:26:55 -0400 Subject: [PATCH 0814/1000] Fix more typos (1/n) Dont -> Don't setings -> settings thresold -> thresh thresold -> threshold becasue -> because itterations -> iterations its a -> it's a an constant -> a constant Bug: webm:1803 Change-Id: I1e019393939ed25c59c898c88d4941ec360b026d --- vp8/encoder/firstpass.c | 12 ++++++------ vp8/encoder/onyx_if.c | 6 +++--- vp8/encoder/onyx_int.h | 2 +- vp8/encoder/pickinter.c | 2 +- vp8/vp8_cx_iface.c | 4 ++-- vp9/common/arm/neon/vp9_highbd_iht16x16_add_neon.c | 2 +- vp9/encoder/vp9_block.h | 2 +- vp9/encoder/vp9_firstpass.c | 4 ++-- vp9/encoder/vp9_ratectrl.c | 4 ++-- vp9/encoder/vp9_speed_features.h | 2 +- vp9/vp9_cx_iface.c | 4 ++-- 11 files changed, 22 insertions(+), 22 deletions(-) diff --git a/vp8/encoder/firstpass.c b/vp8/encoder/firstpass.c index 0abd178a61..f141e7d057 100644 --- a/vp8/encoder/firstpass.c +++ b/vp8/encoder/firstpass.c @@ -412,7 +412,7 @@ static void first_pass_motion_search(VP8_COMP *cpi, MACROBLOCK *x, int_mv ref_mv_full; int tmp_err; - int step_param = 3; /* Dont search over full range for first pass */ + int step_param = 3; /* Don't search over full range for first pass */ int further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param; int n; vp8_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[BLOCK_16X16]; @@ -1717,9 +1717,9 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { /* Break at cpi->max_gf_interval unless almost totally static */ (i >= cpi->max_gf_interval && (decay_accumulator < 0.995)) || ( - /* Dont break out with a very short interval */ + /* Don't break out with a very short interval */ (i > MIN_GF_INTERVAL) && - /* Dont break out very close to a key frame */ + /* Don't break out very close to a key frame */ ((cpi->twopass.frames_to_key - i) >= MIN_GF_INTERVAL) && ((boost_score > 20.0) || (next_frame.pcnt_inter < 0.75)) && (!flash_detected) && @@ -1765,7 +1765,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { if (boost_score > max_boost) boost_score = max_boost; } - /* Dont allow conventional gf too near the next kf */ + /* Don't allow conventional gf too near the next kf */ if ((cpi->twopass.frames_to_key - i) < MIN_GF_INTERVAL) { while (i < cpi->twopass.frames_to_key) { i++; @@ -2082,7 +2082,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { } } - /* Dont allow a negative value for gf_bits */ + /* Don't allow a negative value for gf_bits */ if (gf_bits < 0) gf_bits = 0; /* Add in minimum for a frame */ @@ -3011,7 +3011,7 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { bits_per_frame = (double)(cpi->twopass.kf_group_bits / cpi->twopass.frames_to_key); - /* Dont turn to resampling in easy sections just because they + /* Don't turn to resampling in easy sections just because they * have been assigned a small number of bits */ if (bits_per_frame < av_bits_per_frame) { diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index e78743e496..b189632757 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -4484,7 +4484,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, * size within range) then use the last frame value - 1. The -1 * is designed to stop Q and hence the data rate, from * progressively falling away during difficult sections, but at - * the same time reduce the number of itterations around the + * the same time reduce the number of iterations around the * recode loop. */ if (Q > cpi->ni_av_qi) cpi->ni_av_qi = Q - 1; @@ -4731,7 +4731,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, cpi->mb.e_mbd.update_mb_segmentation_data = 0; cpi->mb.e_mbd.mode_ref_lf_delta_update = 0; - /* Dont increment frame counters if this was an altref buffer update + /* Don't increment frame counters if this was an altref buffer update * not a real frame */ if (cm->show_frame) { @@ -5109,7 +5109,7 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, if (cm->refresh_last_frame) memcpy(&cpi->lfc_n, &cm->fc, sizeof(cm->fc)); - /* if its a dropped frame honor the requests on subsequent frames */ + /* if it's a dropped frame honor the requests on subsequent frames */ if (*size > 0) { cpi->droppable = !frame_is_reference(cpi); diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h index bde5c2f69b..4304f054ca 100644 --- a/vp8/encoder/onyx_int.h +++ b/vp8/encoder/onyx_int.h @@ -360,7 +360,7 @@ typedef struct VP8_COMP { /* GF interval chosen when we coded the last GF */ int current_gf_interval; - /* Total bits overspent becasue of GF boost (cumulative) */ + /* Total bits overspent because of GF boost (cumulative) */ int gf_overspend_bits; /* Used in the few frames following a GF to recover the extra bits diff --git a/vp8/encoder/pickinter.c b/vp8/encoder/pickinter.c index 04f68c3245..1af8a2f9b2 100644 --- a/vp8/encoder/pickinter.c +++ b/vp8/encoder/pickinter.c @@ -1103,7 +1103,7 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, #if CONFIG_TEMPORAL_DENOISING if (cpi->oxcf.noise_sensitivity) { /* Store for later use by denoiser. */ - // Dont' denoise with GOLDEN OR ALTREF is they are old reference + // Don't denoise with GOLDEN OR ALTREF is they are old reference // frames (greater than MAX_GF_ARF_DENOISE_RANGE frames in past). int skip_old_reference = ((this_ref_frame != LAST_FRAME) && (cpi->common.current_video_frame - diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c index 0821eef026..8950de0d8a 100644 --- a/vp8/vp8_cx_iface.c +++ b/vp8/vp8_cx_iface.c @@ -1292,8 +1292,8 @@ static vpx_codec_enc_cfg_map_t vp8e_usage_cfg_map[] = { 0, /* rc_resize_allowed */ 1, /* rc_scaled_width */ 1, /* rc_scaled_height */ - 60, /* rc_resize_down_thresold */ - 30, /* rc_resize_up_thresold */ + 60, /* rc_resize_down_thresh */ + 30, /* rc_resize_up_thresh */ VPX_VBR, /* rc_end_usage */ { NULL, 0 }, /* rc_twopass_stats_in */ diff --git a/vp9/common/arm/neon/vp9_highbd_iht16x16_add_neon.c b/vp9/common/arm/neon/vp9_highbd_iht16x16_add_neon.c index aeb7e49c10..b43d7fa4f9 100644 --- a/vp9/common/arm/neon/vp9_highbd_iht16x16_add_neon.c +++ b/vp9/common/arm/neon/vp9_highbd_iht16x16_add_neon.c @@ -18,7 +18,7 @@ #include "vpx_dsp/arm/transpose_neon.h" #include "vpx_dsp/inv_txfm.h" -// Use macros to make sure argument lane is passed in as an constant integer. +// Use macros to make sure argument lane is passed in as a constant integer. #define vmull_lane_s32_dual(in, c, lane, out) \ do { \ diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h index 4d336f2a42..f4653a82fb 100644 --- a/vp9/encoder/vp9_block.h +++ b/vp9/encoder/vp9_block.h @@ -34,7 +34,7 @@ struct macroblock_plane { uint16_t *eobs; struct buf_2d src; - // Quantizer setings + // Quantizer settings int16_t *round_fp; int16_t *quant_fp; int16_t *quant; diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index d97bf2a1c9..e4c8a0e4a1 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -1448,7 +1448,7 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { first_pass_stat_calc(cpi, &fps, &(first_tile_col->fp_data)); } - // Dont allow a value of 0 for duration. + // Don't allow a value of 0 for duration. // (Section duration is also defaulted to minimum of 1.0). fps.duration = VPXMAX(1.0, (double)(source->ts_end - source->ts_start)); @@ -1701,7 +1701,7 @@ void vp9_init_second_pass(VP9_COMP *cpi) { // Second scan using clamps based on the previous cycle average. // This may modify the total and average somewhat but we don't bother with - // further itterations. + // further iterations. modified_score_total = 0.0; s = twopass->stats_in; while (s < twopass->stats_in_end) { diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index 4e5fdc6d90..c32745b4f8 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -1359,7 +1359,7 @@ static void pick_kf_q_bound_two_pass(const VP9_COMP *cpi, int *bottom_index, active_best_quality /= 4; } - // Dont allow the active min to be lossless (q0) unlesss the max q + // Don't allow the active min to be lossless (q0) unlesss the max q // already indicates lossless. active_best_quality = VPXMIN(active_worst_quality, VPXMAX(1, active_best_quality)); @@ -2693,7 +2693,7 @@ static void vbr_rate_correction(VP9_COMP *cpi, int *this_frame_target) { } // Fast redistribution of bits arising from massive local undershoot. - // Dont do it for kf,arf,gf or overlay frames. + // Don't do it for kf,arf,gf or overlay frames. if (!frame_is_kf_gf_arf(cpi) && !rc->is_src_frame_alt_ref && rc->vbr_bits_off_target_fast) { int one_frame_bits = VPXMAX(rc->avg_frame_bandwidth, *this_frame_target); diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h index 70c61fe00d..941de639ac 100644 --- a/vp9/encoder/vp9_speed_features.h +++ b/vp9/encoder/vp9_speed_features.h @@ -643,7 +643,7 @@ typedef struct SPEED_FEATURES { // Use machine learning based partition search. int nonrd_use_ml_partition; - // Multiplier for base thresold for variance partitioning. + // Multiplier for base threshold for variance partitioning. int variance_part_thresh_mult; // Force subpel motion filter to always use SMOOTH_FILTER. diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index 409069b4ed..5873f30e89 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -2084,8 +2084,8 @@ static vpx_codec_enc_cfg_map_t encoder_usage_cfg_map[] = { 0, // rc_resize_allowed 0, // rc_scaled_width 0, // rc_scaled_height - 60, // rc_resize_down_thresold - 30, // rc_resize_up_thresold + 60, // rc_resize_down_thresh + 30, // rc_resize_up_thresh VPX_VBR, // rc_end_usage { NULL, 0 }, // rc_twopass_stats_in From ffb93451095809b710576194c374c84c1d36d4cd Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Wed, 7 Jun 2023 12:31:38 -0400 Subject: [PATCH 0815/1000] Fix more typos (2/n) kernal -> kernel e.g -> e.g. paritioning -> partitioning partioning -> partitioning coefficents -> coefficients i.e, -> i.e., equivalend -> equivalent recive -> receive resoultions -> resolutions Bug: webm:1803 Change-Id: I1d6176202ee5daee7a64bf59114e8b304aeb4db7 --- vp8/encoder/onyx_if.c | 2 +- vp9/encoder/vp9_block.h | 6 ++-- vp9/encoder/vp9_denoiser.c | 2 +- vp9/encoder/vp9_encodeframe.c | 26 ++++++++--------- vp9/encoder/vp9_encoder.c | 48 ++++++++++++++++---------------- vp9/encoder/vp9_firstpass.c | 28 +++++++++---------- vp9/encoder/vp9_noise_estimate.c | 2 +- vp9/simple_encode.h | 2 +- vpx/vp8cx.h | 4 +-- 9 files changed, 60 insertions(+), 60 deletions(-) diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index b189632757..c65afc643b 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -2251,7 +2251,7 @@ void vp8_remove_compressor(VP8_COMP **comp) { #if 0 { printf("\n_pick_loop_filter_level:%d\n", cpi->time_pick_lpf / 1000); - printf("\n_frames recive_data encod_mb_row compress_frame Total\n"); + printf("\n_frames receive_data encod_mb_row compress_frame Total\n"); printf("%6d %10ld %10ld %10ld %10ld\n", cpi->common.current_video_frame, cpi->time_receive_data / 1000, cpi->time_encode_mb_row / 1000, cpi->time_compress_data / 1000, (cpi->time_receive_data + cpi->time_compress_data) / 1000); } #endif diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h index f4653a82fb..7fa00cd194 100644 --- a/vp9/encoder/vp9_block.h +++ b/vp9/encoder/vp9_block.h @@ -85,10 +85,10 @@ struct macroblock { // The equivalent error at the current rdmult of one whole bit (not one // bitcost unit). int errorperbit; - // The equivalend SAD error of one (whole) bit at the current quantizer + // The equivalent SAD error of one (whole) bit at the current quantizer // for large blocks. int sadperbit16; - // The equivalend SAD error of one (whole) bit at the current quantizer + // The equivalent SAD error of one (whole) bit at the current quantizer // for sub-8x8 blocks. int sadperbit4; int rddiv; @@ -128,7 +128,7 @@ struct macroblock { // from extending outside the UMV borders MvLimits mv_limits; - // Notes transform blocks where no coefficents are coded. + // Notes transform blocks where no coefficients are coded. // Set during mode selection. Read during block encoding. uint8_t zcoeff_blk[TX_SIZES][256]; diff --git a/vp9/encoder/vp9_denoiser.c b/vp9/encoder/vp9_denoiser.c index baea8ebb3c..b40d5c6154 100644 --- a/vp9/encoder/vp9_denoiser.c +++ b/vp9/encoder/vp9_denoiser.c @@ -387,7 +387,7 @@ void vp9_denoiser_denoise(VP9_COMP *cpi, MACROBLOCK *mb, int mi_row, int mi_col, consec_zeromv = VPXMIN(cpi->consec_zero_mv[bl_index], consec_zeromv); // No need to keep checking 8x8 blocks if any of the sub-blocks // has small consec_zeromv (since threshold for no_skin based on - // zero/small motion in skin detection is high, i.e, > 4). + // zero/small motion in skin detection is high, i.e., > 4). if (consec_zeromv < 4) { i = ymis; break; diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index a979ae1c93..2381c73687 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -1706,7 +1706,7 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, const int y16_idx = ((j >> 1) << 1); // For inter frames: if variance4x4downsample[] == 1 for this 16x16 // block, then the variance is based on 4x4 down-sampling, so use vt2 - // in set_vt_partioning(), otherwise use vt. + // in set_vt_partitioning(), otherwise use vt. v16x16 *vtemp = (!is_key_frame && variance4x4downsample[i2 + j] == 1) ? &vt2[i2 + j] : &vt.split[i].split[j]; @@ -3470,11 +3470,11 @@ static void simple_motion_search(const VP9_COMP *const cpi, MACROBLOCK *const x, // Features used: QP; spatial block size contexts; variance of prediction // residue after simple_motion_search. #define FEATURES 12 -static void ml_predict_var_rd_paritioning(const VP9_COMP *const cpi, - MACROBLOCK *const x, - PC_TREE *const pc_tree, - BLOCK_SIZE bsize, int mi_row, - int mi_col, int *none, int *split) { +static void ml_predict_var_rd_partitioning(const VP9_COMP *const cpi, + MACROBLOCK *const x, + PC_TREE *const pc_tree, + BLOCK_SIZE bsize, int mi_row, + int mi_col, int *none, int *split) { const VP9_COMMON *const cm = &cpi->common; const NN_CONFIG *nn_config = NULL; const MACROBLOCKD *const xd = &x->e_mbd; @@ -4092,8 +4092,8 @@ static int rd_pick_partition(VP9_COMP *cpi, ThreadData *td, mi_row + num_8x8_blocks_high_lookup[bsize] <= cm->mi_rows && mi_col + num_8x8_blocks_wide_lookup[bsize] <= cm->mi_cols; if (do_rd_ml_partition_var_pruning) { - ml_predict_var_rd_paritioning(cpi, x, pc_tree, bsize, mi_row, mi_col, - &partition_none_allowed, &do_split); + ml_predict_var_rd_partitioning(cpi, x, pc_tree, bsize, mi_row, mi_col, + &partition_none_allowed, &do_split); } else { vp9_zero(pc_tree->mv); } @@ -4820,9 +4820,9 @@ static void pred_pixel_ready_reset(PC_TREE *pc_tree, BLOCK_SIZE bsize) { #define FEATURES 6 #define LABELS 2 -static int ml_predict_var_paritioning(VP9_COMP *cpi, MACROBLOCK *x, - BLOCK_SIZE bsize, int mi_row, - int mi_col) { +static int ml_predict_var_partitioning(VP9_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, int mi_row, + int mi_col) { VP9_COMMON *const cm = &cpi->common; const NN_CONFIG *nn_config = NULL; @@ -4954,7 +4954,7 @@ static void nonrd_pick_partition(VP9_COMP *cpi, ThreadData *td, if (partition_none_allowed || do_split) do_rect = 0; if (partition_none_allowed && do_split) { const int ml_predicted_partition = - ml_predict_var_paritioning(cpi, x, bsize, mi_row, mi_col); + ml_predict_var_partitioning(cpi, x, bsize, mi_row, mi_col); if (ml_predicted_partition == PARTITION_NONE) do_split = 0; if (ml_predicted_partition == PARTITION_SPLIT) partition_none_allowed = 0; } @@ -5633,7 +5633,7 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td, if ((cpi->oxcf.rc_mode == VPX_VBR && cpi->rc.high_source_sad && cpi->oxcf.speed < 6 && !frame_is_intra_only(cm) && (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame))) { - // Use lower max_partition_size for low resoultions. + // Use lower max_partition_size for low resolutions. if (cm->width <= 352 && cm->height <= 288) x->max_partition_size = BLOCK_32X32; else diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 9d5c0030a2..d03d87a8a1 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -139,7 +139,7 @@ static int compute_context_model_thresh(const VP9_COMP *const cpi) { // frame context probability model is less than a certain threshold. // The first component is the most critical part to guarantee adaptivity. // Other parameters are estimated based on normal setting of hd resolution - // parameters. e.g frame_size = 1920x1080, bitrate = 8000, qindex_factor < 50 + // parameters. e.g. frame_size = 1920x1080, bitrate = 8000, qindex_factor < 50 const int thresh = ((FRAME_SIZE_FACTOR * frame_size - FRAME_RATE_FACTOR * bitrate) * qindex_factor) >> @@ -2836,7 +2836,7 @@ void vp9_remove_compressor(VP9_COMP *cpi) { #if 0 { printf("\n_pick_loop_filter_level:%d\n", cpi->time_pick_lpf / 1000); - printf("\n_frames recive_data encod_mb_row compress_frame Total\n"); + printf("\n_frames receive_data encod_mb_row compress_frame Total\n"); printf("%6d %10ld %10ld %10ld %10ld\n", cpi->common.current_video_frame, cpi->time_receive_data / 1000, cpi->time_encode_sb_row / 1000, cpi->time_compress_data / 1000, @@ -5020,8 +5020,8 @@ static int setup_interp_filter_search_mask(VP9_COMP *cpi) { #ifdef ENABLE_KF_DENOISE // Baseline kernel weights for denoise -static uint8_t dn_kernal_3[9] = { 1, 2, 1, 2, 4, 2, 1, 2, 1 }; -static uint8_t dn_kernal_5[25] = { 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 4, +static uint8_t dn_kernel_3[9] = { 1, 2, 1, 2, 4, 2, 1, 2, 1 }; +static uint8_t dn_kernel_5[25] = { 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 4, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1 }; static INLINE void add_denoise_point(int centre_val, int data_val, int thresh, @@ -5038,17 +5038,17 @@ static void spatial_denoise_point(uint8_t *src_ptr, const int stride, int sum_weight = 0; int sum_val = 0; int thresh = strength; - int kernal_size = 5; + int kernel_size = 5; int half_k_size = 2; int i, j; int max_diff = 0; uint8_t *tmp_ptr; - uint8_t *kernal_ptr; + uint8_t *kernel_ptr; // Find the maximum deviation from the source point in the locale. tmp_ptr = src_ptr - (stride * (half_k_size + 1)) - (half_k_size + 1); - for (i = 0; i < kernal_size + 2; ++i) { - for (j = 0; j < kernal_size + 2; ++j) { + for (i = 0; i < kernel_size + 2; ++i) { + for (j = 0; j < kernel_size + 2; ++j) { max_diff = VPXMAX(max_diff, abs((int)*src_ptr - (int)tmp_ptr[j])); } tmp_ptr += stride; @@ -5056,19 +5056,19 @@ static void spatial_denoise_point(uint8_t *src_ptr, const int stride, // Select the kernel size. if (max_diff > (strength + (strength >> 1))) { - kernal_size = 3; + kernel_size = 3; half_k_size = 1; thresh = thresh >> 1; } - kernal_ptr = (kernal_size == 3) ? dn_kernal_3 : dn_kernal_5; + kernel_ptr = (kernel_size == 3) ? dn_kernel_3 : dn_kernel_5; // Apply the kernel tmp_ptr = src_ptr - (stride * half_k_size) - half_k_size; - for (i = 0; i < kernal_size; ++i) { - for (j = 0; j < kernal_size; ++j) { - add_denoise_point((int)*src_ptr, (int)tmp_ptr[j], thresh, *kernal_ptr, + for (i = 0; i < kernel_size; ++i) { + for (j = 0; j < kernel_size; ++j) { + add_denoise_point((int)*src_ptr, (int)tmp_ptr[j], thresh, *kernel_ptr, &sum_val, &sum_weight); - ++kernal_ptr; + ++kernel_ptr; } tmp_ptr += stride; } @@ -5083,17 +5083,17 @@ static void highbd_spatial_denoise_point(uint16_t *src_ptr, const int stride, int sum_weight = 0; int sum_val = 0; int thresh = strength; - int kernal_size = 5; + int kernel_size = 5; int half_k_size = 2; int i, j; int max_diff = 0; uint16_t *tmp_ptr; - uint8_t *kernal_ptr; + uint8_t *kernel_ptr; // Find the maximum deviation from the source point in the locale. tmp_ptr = src_ptr - (stride * (half_k_size + 1)) - (half_k_size + 1); - for (i = 0; i < kernal_size + 2; ++i) { - for (j = 0; j < kernal_size + 2; ++j) { + for (i = 0; i < kernel_size + 2; ++i) { + for (j = 0; j < kernel_size + 2; ++j) { max_diff = VPXMAX(max_diff, abs((int)src_ptr - (int)tmp_ptr[j])); } tmp_ptr += stride; @@ -5101,19 +5101,19 @@ static void highbd_spatial_denoise_point(uint16_t *src_ptr, const int stride, // Select the kernel size. if (max_diff > (strength + (strength >> 1))) { - kernal_size = 3; + kernel_size = 3; half_k_size = 1; thresh = thresh >> 1; } - kernal_ptr = (kernal_size == 3) ? dn_kernal_3 : dn_kernal_5; + kernel_ptr = (kernel_size == 3) ? dn_kernel_3 : dn_kernel_5; // Apply the kernel tmp_ptr = src_ptr - (stride * half_k_size) - half_k_size; - for (i = 0; i < kernal_size; ++i) { - for (j = 0; j < kernal_size; ++j) { - add_denoise_point((int)*src_ptr, (int)tmp_ptr[j], thresh, *kernal_ptr, + for (i = 0; i < kernel_size; ++i) { + for (j = 0; j < kernel_size; ++j) { + add_denoise_point((int)*src_ptr, (int)tmp_ptr[j], thresh, *kernel_ptr, &sum_val, &sum_weight); - ++kernal_ptr; + ++kernel_ptr; } tmp_ptr += stride; } diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index e4c8a0e4a1..567080a2dd 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -607,10 +607,10 @@ static int get_smooth_intra_threshold(VP9_COMMON *cm) { #define KERNEL_SIZE 3 // Baseline Kernal weights for first pass noise metric -static uint8_t fp_dn_kernal_3[KERNEL_SIZE * KERNEL_SIZE] = { 1, 2, 1, 2, 4, +static uint8_t fp_dn_kernel_3[KERNEL_SIZE * KERNEL_SIZE] = { 1, 2, 1, 2, 4, 2, 1, 2, 1 }; -// Estimate noise at a single point based on the impace of a spatial kernal +// Estimate noise at a single point based on the impace of a spatial kernel // on the point value static int fp_estimate_point_noise(uint8_t *src_ptr, const int stride) { int sum_weight = 0; @@ -620,23 +620,23 @@ static int fp_estimate_point_noise(uint8_t *src_ptr, const int stride) { int diff; int dn_diff; uint8_t *tmp_ptr; - uint8_t *kernal_ptr; + uint8_t *kernel_ptr; uint8_t dn_val; uint8_t centre_val = *src_ptr; - kernal_ptr = fp_dn_kernal_3; + kernel_ptr = fp_dn_kernel_3; - // Apply the kernal + // Apply the kernel tmp_ptr = src_ptr - stride - 1; for (i = 0; i < KERNEL_SIZE; ++i) { for (j = 0; j < KERNEL_SIZE; ++j) { diff = abs((int)centre_val - (int)tmp_ptr[j]); max_diff = VPXMAX(max_diff, diff); if (diff <= FP_DN_THRESH) { - sum_weight += *kernal_ptr; - sum_val += (int)tmp_ptr[j] * (int)*kernal_ptr; + sum_weight += *kernel_ptr; + sum_val += (int)tmp_ptr[j] * (int)*kernel_ptr; } - ++kernal_ptr; + ++kernel_ptr; } tmp_ptr += stride; } @@ -662,13 +662,13 @@ static int fp_highbd_estimate_point_noise(uint8_t *src_ptr, const int stride) { int dn_diff; uint8_t *tmp_ptr; uint16_t *tmp_ptr16; - uint8_t *kernal_ptr; + uint8_t *kernel_ptr; uint16_t dn_val; uint16_t centre_val = *CONVERT_TO_SHORTPTR(src_ptr); - kernal_ptr = fp_dn_kernal_3; + kernel_ptr = fp_dn_kernel_3; - // Apply the kernal + // Apply the kernel tmp_ptr = src_ptr - stride - 1; for (i = 0; i < KERNEL_SIZE; ++i) { tmp_ptr16 = CONVERT_TO_SHORTPTR(tmp_ptr); @@ -676,10 +676,10 @@ static int fp_highbd_estimate_point_noise(uint8_t *src_ptr, const int stride) { diff = abs((int)centre_val - (int)tmp_ptr16[j]); max_diff = VPXMAX(max_diff, diff); if (diff <= FP_DN_THRESH) { - sum_weight += *kernal_ptr; - sum_val += (int)tmp_ptr16[j] * (int)*kernal_ptr; + sum_weight += *kernel_ptr; + sum_val += (int)tmp_ptr16[j] * (int)*kernel_ptr; } - ++kernal_ptr; + ++kernel_ptr; } tmp_ptr += stride; } diff --git a/vp9/encoder/vp9_noise_estimate.c b/vp9/encoder/vp9_noise_estimate.c index 9696529c50..4ee6e51ba8 100644 --- a/vp9/encoder/vp9_noise_estimate.c +++ b/vp9/encoder/vp9_noise_estimate.c @@ -202,7 +202,7 @@ void vp9_update_noise_estimate(VP9_COMP *const cpi) { VPXMIN(cpi->consec_zero_mv[bl_index1], VPXMIN(cpi->consec_zero_mv[bl_index2], cpi->consec_zero_mv[bl_index3]))); - // Only consider blocks that are likely steady background. i.e, have + // Only consider blocks that are likely steady background. i.e., have // been encoded as zero/low motion x (= thresh_consec_zeromv) frames // in a row. consec_zero_mv[] defined for 8x8 blocks, so consider all // 4 sub-blocks for 16x16 block. And exclude this frame if diff --git a/vp9/simple_encode.h b/vp9/simple_encode.h index 7920e95ee9..d610a5e159 100644 --- a/vp9/simple_encode.h +++ b/vp9/simple_encode.h @@ -309,7 +309,7 @@ struct EncodeFrameResult { // The tpl stats stored in the vector is according to the encoding order. // For example, suppose there are N show frames for the current GOP. // Then tpl_stats_info[0] stores the information of the first frame to be - // encoded for this GOP, i.e, the AltRef frame. + // encoded for this GOP, i.e., the AltRef frame. std::vector tpl_stats_info; ImageBuffer coded_frame; diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h index c4e04084c8..3c0278c848 100644 --- a/vpx/vp8cx.h +++ b/vpx/vp8cx.h @@ -302,7 +302,7 @@ enum vp8e_enc_control_id { * the feature is off, i.e., no golden frame boost in CBR mode and * average bitrate target is used. * - * For example, to allow 100% more bits, i.e, 2X, in a golden frame + * For example, to allow 100% more bits, i.e., 2X, in a golden frame * than average frame, set this to 100. * * Supported in codecs: VP9 @@ -598,7 +598,7 @@ enum vp8e_enc_control_id { * the feature is off, i.e., no golden frame boost in CBR mode and * average bitrate target is used. * - * For example, to allow 100% more bits, i.e, 2X, in a golden frame + * For example, to allow 100% more bits, i.e., 2X, in a golden frame * than average frame, set this to 100. * * Supported in codecs: VP8 From 6a8eb04feccc05dba9b42c0f95405055183a798c Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Wed, 7 Jun 2023 12:36:31 -0400 Subject: [PATCH 0816/1000] Fix more typos (3/n) Propogation -> Propagation propogate -> propagate cant -> can't upto -> up to canddiates -> candidates refernce -> reference USEAGE -> USAGE Change-Id: Iadaf2dffd86b54e04411910f667e8c2dfc6c4c77 --- vp8/encoder/denoising.c | 2 +- vp8/encoder/firstpass.c | 2 +- vp8/encoder/rdopt.c | 2 +- vp9/decoder/vp9_decodemv.c | 2 +- vp9/encoder/vp9_encoder.h | 2 +- vp9/encoder/vp9_firstpass.c | 12 ++++++------ vp9/encoder/vp9_mcomp.c | 6 +++--- vp9/encoder/vp9_rdopt.c | 2 +- vp9/encoder/vp9_speed_features.c | 2 +- 9 files changed, 16 insertions(+), 16 deletions(-) diff --git a/vp8/encoder/denoising.c b/vp8/encoder/denoising.c index e54d1e9f4b..a666bca4d2 100644 --- a/vp8/encoder/denoising.c +++ b/vp8/encoder/denoising.c @@ -135,7 +135,7 @@ int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride, // When adopting aggressive denoiser, the adj_val for each pixel // could be at most 8 (this is current max adjustment of the map). // In SSE code, we calculate the sum of adj_val for - // the columns, so the sum could be upto 128(16 rows). However, + // the columns, so the sum could be up to 128(16 rows). However, // the range of the value is -128 ~ 127 in SSE code, that's why // we do this change in C code. // We don't do this for UV denoiser, since there are only 8 rows, diff --git a/vp8/encoder/firstpass.c b/vp8/encoder/firstpass.c index f141e7d057..2b88a88c80 100644 --- a/vp8/encoder/firstpass.c +++ b/vp8/encoder/firstpass.c @@ -1786,7 +1786,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { alt_boost = calc_arf_boost(cpi, 0, (i - 1), (i - 1), &f_boost, &b_boost); #endif - /* Should we use the alternate refernce frame */ + /* Should we use the alternate reference frame */ if (allow_alt_ref && (i >= MIN_GF_INTERVAL) && /* don't use ARF very near next kf */ (i <= (cpi->twopass.frames_to_key - MIN_GF_INTERVAL)) && diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c index 7cd42d107c..5d539ef30c 100644 --- a/vp8/encoder/rdopt.c +++ b/vp8/encoder/rdopt.c @@ -1021,7 +1021,7 @@ static void rd_check_segment(VP8_COMP *cpi, MACROBLOCK *x, BEST_SEG_INFO *bsi, BLOCK *c; BLOCKD *e; - /* Is the best so far sufficiently good that we cant justify + /* Is the best so far sufficiently good that we can't justify * doing a new motion search. */ if (best_label_rd < label_mv_thresh) break; diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c index 22b62e6a2d..7b524fa2a8 100644 --- a/vp9/decoder/vp9_decodemv.c +++ b/vp9/decoder/vp9_decodemv.c @@ -299,7 +299,7 @@ static REFERENCE_MODE read_block_reference_mode(VP9_COMMON *cm, } } -// Read the referncence frame +// Read the reference frame static void read_ref_frames(VP9_COMMON *const cm, MACROBLOCKD *const xd, vpx_reader *r, int segment_id, MV_REFERENCE_FRAME ref_frame[2]) { diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 2e0c4db9ed..f8a27872df 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -846,7 +846,7 @@ typedef struct VP9_COMP { uint8_t *skin_map; - // segment threashold for encode breakout + // segment threshold for encode breakout int segment_encode_breakout[MAX_SEGMENTS]; CYCLIC_REFRESH *cyclic_refresh; diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index 567080a2dd..d726bc15f4 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -606,7 +606,7 @@ static int get_smooth_intra_threshold(VP9_COMMON *cm) { #define FP_MAX_DN_THRESH 24 #define KERNEL_SIZE 3 -// Baseline Kernal weights for first pass noise metric +// Baseline Kernel weights for first pass noise metric static uint8_t fp_dn_kernel_3[KERNEL_SIZE * KERNEL_SIZE] = { 1, 2, 1, 2, 4, 2, 1, 2, 1 }; @@ -1458,7 +1458,7 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { accumulate_stats(&twopass->total_stats, &fps); } - // Copy the previous Last Frame back into gf and and arf buffers if + // Copy the previous Last Frame back into gf and arf buffers if // the prediction is good enough... but also don't allow it to lag too far. if ((twopass->sr_update_lag > 3) || ((cm->current_video_frame > 0) && @@ -1675,7 +1675,7 @@ void vp9_init_second_pass(VP9_COMP *cpi) { // Scan the first pass file and calculate a modified score for each // frame that is used to distribute bits. The modified score is assumed - // to provide a linear basis for bit allocation. I.e a frame A with a score + // to provide a linear basis for bit allocation. I.e., a frame A with a score // that is double that of frame B will be allocated 2x as many bits. { double modified_score_total = 0.0; @@ -3053,7 +3053,7 @@ static int intra_step_transition(const FIRSTPASS_STATS *this_frame, #define MIN_INTRA_LEVEL 0.25 // Threshold for use of the lagging second reference frame. Scene cuts do not // usually have a high second ref usage. -#define SECOND_REF_USEAGE_THRESH 0.2 +#define SECOND_REF_USAGE_THRESH 0.2 // Hard threshold where the first pass chooses intra for almost all blocks. // In such a case even if the frame is not a scene cut coding a key frame // may be a good option. @@ -3083,7 +3083,7 @@ static int test_candidate_kf(const FIRST_PASS_INFO *first_pass_info, detect_flash_from_frame_stats(next_frame); if (!detect_flash_from_frame_stats(this_frame) && !detect_flash_from_frame_stats(next_frame) && - (this_frame->pcnt_second_ref < SECOND_REF_USEAGE_THRESH) && + (this_frame->pcnt_second_ref < SECOND_REF_USAGE_THRESH) && ((this_frame->pcnt_inter < VERY_LOW_INTER_THRESH) || (slide_transition(this_frame, last_frame, next_frame)) || (intra_step_transition(this_frame, last_frame, next_frame)) || @@ -3361,7 +3361,7 @@ static void find_next_key_frame(VP9_COMP *cpi, int kf_show_idx) { // The second (lagging) ref error is not valid immediately after // a key frame because either the lag has not built up (in the case of - // the first key frame or it points to a refernce before the new key + // the first key frame or it points to a reference before the new key // frame. if (i < 2) sr_accumulator = 0.0; frame_boost = diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c index 0ea0f85e42..cbe1c40290 100644 --- a/vp9/encoder/vp9_mcomp.c +++ b/vp9/encoder/vp9_mcomp.c @@ -953,7 +953,7 @@ static INLINE int is_mv_in(const MvLimits *mv_limits, const MV *mv) { } #define MAX_PATTERN_SCALES 11 -#define MAX_PATTERN_CANDIDATES 8 // max number of canddiates per scale +#define MAX_PATTERN_CANDIDATES 8 // max number of candidates per scale #define PATTERN_CANDIDATES_REF 3 // number of refinement candidates // Calculate and return a sad+mvcost list around an integer best pel. @@ -1034,7 +1034,7 @@ static int vp9_pattern_search( in_what->stride) + mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit); - // Search all possible scales upto the search param around the center point + // Search all possible scales up to the search param around the center point // pick the scale of the point that is best as the starting scale of // further steps around it. if (do_init_search) { @@ -1208,7 +1208,7 @@ static int vp9_pattern_search_sad( in_what->stride) + mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit); - // Search all possible scales upto the search param around the center point + // Search all possible scales up to the search param around the center point // pick the scale of the point that is best as the starting scale of // further steps around it. if (do_init_search) { diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index c0d8b505fe..9454802a5e 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -2183,7 +2183,7 @@ static int64_t rd_pick_best_sub8x8_mode( int cost_list[5]; const MvLimits tmp_mv_limits = x->mv_limits; - /* Is the best so far sufficiently good that we cant justify doing + /* Is the best so far sufficiently good that we can't justify doing * and new motion search. */ if (best_rd < label_mv_thresh) break; diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c index 48c21c581e..4a7172118c 100644 --- a/vp9/encoder/vp9_speed_features.c +++ b/vp9/encoder/vp9_speed_features.c @@ -42,7 +42,7 @@ static int frame_is_boosted(const VP9_COMP *cpi) { // Sets a partition size down to which the auto partition code will always // search (can go lower), based on the image dimensions. The logic here // is that the extent to which ringing artefacts are offensive, depends -// partly on the screen area that over which they propogate. Propogation is +// partly on the screen area that over which they propagate. Propagation is // limited by transform block size but the screen area take up by a given block // size will be larger for a small image format stretched to full screen. static BLOCK_SIZE set_partition_min_limit(VP9_COMMON *const cm) { From d42b7fd66162be7a94ded287c09461acd7875c8d Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Wed, 7 Jun 2023 16:35:19 -0400 Subject: [PATCH 0817/1000] Fix more typos (n/n) impace -> impact taget -> target prediciton -> prediction addtion -> addition the the -> the Bug: webm:1803 Change-Id: I759c9d930a037ca69662164fcd6be160ed707d77 --- vp8/encoder/firstpass.c | 4 ++-- vp9/encoder/vp9_firstpass.c | 4 ++-- vp9/encoder/vp9_svc_layercontext.c | 2 +- vp9/vp9_cx_iface.c | 2 +- vpx_dsp/ppc/variance_vsx.c | 2 +- vpx_dsp/variance.c | 2 +- 6 files changed, 8 insertions(+), 8 deletions(-) diff --git a/vp8/encoder/firstpass.c b/vp8/encoder/firstpass.c index 2b88a88c80..ff088aa969 100644 --- a/vp8/encoder/firstpass.c +++ b/vp8/encoder/firstpass.c @@ -1537,7 +1537,7 @@ static int calc_arf_boost(VP8_COMP *cpi, int offset, int f_frames, int b_frames, /* Calculate the baseline boost number for this frame */ r = calc_frame_boost(cpi, &this_frame, this_frame_mv_in_out); - /* We want to discount the the flash frame itself and the recovery + /* We want to discount the flash frame itself and the recovery * frame that follows as both will have poor scores. */ flash_detected = @@ -1581,7 +1581,7 @@ static int calc_arf_boost(VP8_COMP *cpi, int offset, int f_frames, int b_frames, /* Calculate the baseline boost number for this frame */ r = calc_frame_boost(cpi, &this_frame, this_frame_mv_in_out); - /* We want to discount the the flash frame itself and the recovery + /* We want to discount the flash frame itself and the recovery * frame that follows as both will have poor scores. */ flash_detected = diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index d726bc15f4..27cfc805d7 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -610,7 +610,7 @@ static int get_smooth_intra_threshold(VP9_COMMON *cm) { static uint8_t fp_dn_kernel_3[KERNEL_SIZE * KERNEL_SIZE] = { 1, 2, 1, 2, 4, 2, 1, 2, 1 }; -// Estimate noise at a single point based on the impace of a spatial kernel +// Estimate noise at a single point based on the impact of a spatial kernel // on the point value static int fp_estimate_point_noise(uint8_t *src_ptr, const int stride) { int sum_weight = 0; @@ -2038,7 +2038,7 @@ static int compute_arf_boost(const FRAME_INFO *frame_info, this_frame, &this_frame_mv_in_out, &mv_in_out_accumulator, &abs_mv_in_out_accumulator, &mv_ratio_accumulator); - // We want to discount the the flash frame itself and the recovery + // We want to discount the flash frame itself and the recovery // frame that follows as both will have poor scores. flash_detected = detect_flash_from_frame_stats(this_frame) || detect_flash_from_frame_stats(next_frame); diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c index e4721271d9..24fd818133 100644 --- a/vp9/encoder/vp9_svc_layercontext.c +++ b/vp9/encoder/vp9_svc_layercontext.c @@ -1145,7 +1145,7 @@ void vp9_svc_constrain_inter_layer_pred(VP9_COMP *const cpi) { void vp9_svc_assert_constraints_pattern(VP9_COMP *const cpi) { SVC *const svc = &cpi->svc; // For fixed/non-flexible mode, the following constraint are expected, - // when inter-layer prediciton is on (default). + // when inter-layer prediction is on (default). if (svc->temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_BYPASS && svc->disable_inter_layer_pred == INTER_LAYER_PRED_ON && svc->framedrop_mode != LAYER_DROP) { diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index 5873f30e89..cc2ae20d27 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -2118,7 +2118,7 @@ static vpx_codec_enc_cfg_map_t encoder_usage_cfg_map[] = { { 0 }, // ts_rate_decimator 0, // ts_periodicity { 0 }, // ts_layer_id - { 0 }, // layer_taget_bitrate + { 0 }, // layer_target_bitrate 0, // temporal_layering_mode 0, // use_vizier_rc_params { 1, 1 }, // active_wq_factor diff --git a/vpx_dsp/ppc/variance_vsx.c b/vpx_dsp/ppc/variance_vsx.c index be9614a358..6c6bc9a301 100644 --- a/vpx_dsp/ppc/variance_vsx.c +++ b/vpx_dsp/ppc/variance_vsx.c @@ -225,7 +225,7 @@ static INLINE void variance(const uint8_t *src_ptr, int src_stride, } /* Identical to the variance call except it does not calculate the - * sse - sum^2 / w*h and returns sse in addtion to modifying the passed in + * sse - sum^2 / w*h and returns sse in addition to modifying the passed in * variable. */ #define MSE(W, H) \ diff --git a/vpx_dsp/variance.c b/vpx_dsp/variance.c index a6793efb68..1c476542fa 100644 --- a/vpx_dsp/variance.c +++ b/vpx_dsp/variance.c @@ -180,7 +180,7 @@ static void var_filter_block2d_bil_second_pass( } /* Identical to the variance call except it does not calculate the - * sse - sum^2 / w*h and returns sse in addtion to modifying the passed in + * sse - sum^2 / w*h and returns sse in addition to modifying the passed in * variable. */ #define MSE(W, H) \ From 2245df50a6d360d33fccd51479c48f2210ed607a Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Thu, 8 Jun 2023 10:52:45 -0400 Subject: [PATCH 0818/1000] Replace NONE with NO_REF_FRAME NONE is a common name and it has conflicts with symbols defined in Chromium. Bug: b/286163500 Change-Id: I3d935a786f771a4d90b258fabc6fd6c2ecbf1c59 --- vp9/common/vp9_blockd.h | 2 +- vp9/decoder/vp9_decodemv.c | 8 ++--- vp9/encoder/vp9_denoiser.c | 2 +- vp9/encoder/vp9_encodeframe.c | 10 +++--- vp9/encoder/vp9_encoder.c | 5 +-- vp9/encoder/vp9_encoder.h | 4 +-- vp9/encoder/vp9_firstpass.c | 2 +- vp9/encoder/vp9_mbgraph.c | 2 +- vp9/encoder/vp9_pickmode.c | 22 ++++++------ vp9/encoder/vp9_rdopt.c | 68 +++++++++++++++++------------------ vp9/simple_encode.cc | 9 ++--- 11 files changed, 68 insertions(+), 66 deletions(-) diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h index d7de46cf4f..aa13d8a0d5 100644 --- a/vp9/common/vp9_blockd.h +++ b/vp9/common/vp9_blockd.h @@ -54,7 +54,7 @@ typedef struct { // decoder implementation modules critically rely on the defined entry values // specified herein. They should be refactored concurrently. -#define NONE (-1) +#define NO_REF_FRAME (-1) #define INTRA_FRAME 0 #define LAST_FRAME 1 #define GOLDEN_FRAME 2 diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c index 7b524fa2a8..0989cde58d 100644 --- a/vp9/decoder/vp9_decodemv.c +++ b/vp9/decoder/vp9_decodemv.c @@ -204,7 +204,7 @@ static void read_intra_frame_mode_info(VP9_COMMON *const cm, mi->skip = read_skip(cm, xd, mi->segment_id, r); mi->tx_size = read_tx_size(cm, xd, 1, r); mi->ref_frame[0] = INTRA_FRAME; - mi->ref_frame[1] = NONE; + mi->ref_frame[1] = NO_REF_FRAME; switch (bsize) { case BLOCK_4X4: @@ -309,7 +309,7 @@ static void read_ref_frames(VP9_COMMON *const cm, MACROBLOCKD *const xd, if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) { ref_frame[0] = (MV_REFERENCE_FRAME)get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME); - ref_frame[1] = NONE; + ref_frame[1] = NO_REF_FRAME; } else { const REFERENCE_MODE mode = read_block_reference_mode(cm, xd, r); // FIXME(rbultje) I'm pretty sure this breaks segmentation ref frame coding @@ -333,7 +333,7 @@ static void read_ref_frames(VP9_COMMON *const cm, MACROBLOCKD *const xd, ref_frame[0] = LAST_FRAME; } - ref_frame[1] = NONE; + ref_frame[1] = NO_REF_FRAME; } else { assert(0 && "Invalid prediction mode."); } @@ -383,7 +383,7 @@ static void read_intra_block_mode_info(VP9_COMMON *const cm, mi->interp_filter = SWITCHABLE_FILTERS; mi->ref_frame[0] = INTRA_FRAME; - mi->ref_frame[1] = NONE; + mi->ref_frame[1] = NO_REF_FRAME; } static INLINE int is_mv_valid(const MV *mv) { diff --git a/vp9/encoder/vp9_denoiser.c b/vp9/encoder/vp9_denoiser.c index b40d5c6154..e5dffa90a8 100644 --- a/vp9/encoder/vp9_denoiser.c +++ b/vp9/encoder/vp9_denoiser.c @@ -319,7 +319,7 @@ static VP9_DENOISER_DECISION perform_motion_compensation( filter_mbd->plane[2].dst.stride = denoiser->mc_running_avg_y[denoise_layer_idx].uv_stride; - set_ref_ptrs(cm, filter_mbd, saved_frame, NONE); + set_ref_ptrs(cm, filter_mbd, saved_frame, NO_REF_FRAME); vp9_build_inter_predictors_sby(filter_mbd, mi_row, mi_col, bs); // Restore everything to its original state diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index 2381c73687..7280e0da8b 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -1437,7 +1437,7 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, &cm->frame_refs[LAST_FRAME - 1].sf); mi->ref_frame[0] = LAST_FRAME; } - mi->ref_frame[1] = NONE; + mi->ref_frame[1] = NO_REF_FRAME; mi->sb_type = BLOCK_64X64; mi->mv[0].as_int = 0; mi->interp_filter = BILINEAR; @@ -1924,7 +1924,7 @@ static void set_mode_info_seg_skip(MACROBLOCK *x, TX_MODE tx_mode, mi->skip = 1; mi->uv_mode = DC_PRED; mi->ref_frame[0] = LAST_FRAME; - mi->ref_frame[1] = NONE; + mi->ref_frame[1] = NO_REF_FRAME; mi->mv[0].as_int = 0; mi->interp_filter = filter_ref; @@ -3449,7 +3449,7 @@ static void simple_motion_search(const VP9_COMP *const cpi, MACROBLOCK *const x, vp9_setup_pre_planes(xd, 0, yv12, mi_row, mi_col, &cm->frame_refs[ref - 1].sf); mi->ref_frame[0] = ref; - mi->ref_frame[1] = NONE; + mi->ref_frame[1] = NO_REF_FRAME; mi->sb_type = bsize; vp9_set_mv_search_range(&x->mv_limits, &ref_mv); vp9_full_pixel_search(cpi, x, bsize, &ref_mv_full, step_param, search_method, @@ -3789,7 +3789,7 @@ static void assign_motion_vector_info(const int block_width_4x4, const int col_4x4 = col_start_4x4 + j; const int unit_index = row_4x4 * num_unit_cols + col_4x4; if (row_4x4 >= num_unit_rows || col_4x4 >= num_unit_cols) continue; - if (source_ref_frame[1] == NONE) { + if (source_ref_frame[1] == NO_REF_FRAME) { assert(source_mv[1]->row == 0 && source_mv[1]->col == 0); } motion_vector_info[unit_index].ref_frame[0] = source_ref_frame[0]; @@ -5443,7 +5443,7 @@ static void get_estimated_pred(VP9_COMP *cpi, const TileInfo *const tile, &cm->frame_refs[LAST_FRAME - 1].sf); mi->ref_frame[0] = LAST_FRAME; } - mi->ref_frame[1] = NONE; + mi->ref_frame[1] = NO_REF_FRAME; mi->sb_type = BLOCK_64X64; mi->mv[0].as_int = 0; mi->interp_filter = BILINEAR; diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index d03d87a8a1..4d7f475a01 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -2949,7 +2949,7 @@ void vp9_update_reference(VP9_COMP *cpi, int ref_frame_flags) { static YV12_BUFFER_CONFIG *get_vp9_ref_frame_buffer( VP9_COMP *cpi, VP9_REFFRAME ref_frame_flag) { - MV_REFERENCE_FRAME ref_frame = NONE; + MV_REFERENCE_FRAME ref_frame = NO_REF_FRAME; if (ref_frame_flag == VP9_LAST_FLAG) ref_frame = LAST_FRAME; else if (ref_frame_flag == VP9_GOLD_FLAG) @@ -2957,7 +2957,8 @@ static YV12_BUFFER_CONFIG *get_vp9_ref_frame_buffer( else if (ref_frame_flag == VP9_ALT_FLAG) ref_frame = ALTREF_FRAME; - return ref_frame == NONE ? NULL : get_ref_frame_buffer(cpi, ref_frame); + return ref_frame == NO_REF_FRAME ? NULL + : get_ref_frame_buffer(cpi, ref_frame); } int vp9_copy_reference_enc(VP9_COMP *cpi, VP9_REFFRAME ref_frame_flag, diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index f8a27872df..333ff0b36a 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -1076,8 +1076,8 @@ static INLINE void free_partition_info(struct VP9_COMP *cpi) { } static INLINE void reset_mv_info(MOTION_VECTOR_INFO *mv_info) { - mv_info->ref_frame[0] = NONE; - mv_info->ref_frame[1] = NONE; + mv_info->ref_frame[0] = NO_REF_FRAME; + mv_info->ref_frame[1] = NO_REF_FRAME; mv_info->mv[0].as_int = INVALID_MV; mv_info->mv[1].as_int = INVALID_MV; } diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index 27cfc805d7..22669ab847 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -1264,7 +1264,7 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, xd->mi[0]->mv[0].as_mv = mv; xd->mi[0]->tx_size = TX_4X4; xd->mi[0]->ref_frame[0] = LAST_FRAME; - xd->mi[0]->ref_frame[1] = NONE; + xd->mi[0]->ref_frame[1] = NO_REF_FRAME; vp9_build_inter_predictors_sby(xd, mb_row << 1, mb_col << 1, bsize); vp9_encode_sby_pass1(x, bsize); fp_acc_data->sum_mvr += mv.row; diff --git a/vp9/encoder/vp9_mbgraph.c b/vp9/encoder/vp9_mbgraph.c index fafc673aca..8b6521d915 100644 --- a/vp9/encoder/vp9_mbgraph.c +++ b/vp9/encoder/vp9_mbgraph.c @@ -237,7 +237,7 @@ static void update_mbgraph_frame_stats(VP9_COMP *cpi, xd->mi[0] = &mi_local; mi_local.sb_type = BLOCK_16X16; mi_local.ref_frame[0] = LAST_FRAME; - mi_local.ref_frame[1] = NONE; + mi_local.ref_frame[1] = NO_REF_FRAME; for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) { MV gld_left_mv = gld_top_mv; diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c index fa88cd79da..4a92802dcc 100644 --- a/vp9/encoder/vp9_pickmode.c +++ b/vp9/encoder/vp9_pickmode.c @@ -1398,8 +1398,8 @@ static void recheck_zeromv_after_denoising( RD_COST this_rdc; mi->mode = ZEROMV; mi->ref_frame[0] = LAST_FRAME; - mi->ref_frame[1] = NONE; - set_ref_ptrs(cm, xd, mi->ref_frame[0], NONE); + mi->ref_frame[1] = NO_REF_FRAME; + set_ref_ptrs(cm, xd, mi->ref_frame[0], NO_REF_FRAME); mi->mv[0].as_int = 0; mi->interp_filter = EIGHTTAP; if (cpi->sf.default_interp_filter == BILINEAR) mi->interp_filter = BILINEAR; @@ -1417,7 +1417,7 @@ static void recheck_zeromv_after_denoising( this_rdc = *best_rdc; mi->mode = ctx_den->best_mode; mi->ref_frame[0] = ctx_den->best_ref_frame; - set_ref_ptrs(cm, xd, mi->ref_frame[0], NONE); + set_ref_ptrs(cm, xd, mi->ref_frame[0], NO_REF_FRAME); mi->interp_filter = ctx_den->best_pred_filter; if (ctx_den->best_ref_frame == INTRA_FRAME) { mi->mv[0].as_int = INVALID_MV; @@ -1681,7 +1681,7 @@ static INLINE void init_best_pickmode(BEST_PICKMODE *bp) { bp->best_intra_tx_size = TX_SIZES; bp->best_pred_filter = EIGHTTAP; bp->best_mode_skip_txfm = SKIP_TXFM_NONE; - bp->best_second_ref_frame = NONE; + bp->best_second_ref_frame = NO_REF_FRAME; bp->best_pred = NULL; } @@ -1875,8 +1875,8 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, vp9_rd_cost_reset(&best_rdc); vp9_rd_cost_reset(rd_cost); mi->sb_type = bsize; - mi->ref_frame[0] = NONE; - mi->ref_frame[1] = NONE; + mi->ref_frame[0] = NO_REF_FRAME; + mi->ref_frame[1] = NO_REF_FRAME; mi->tx_size = VPXMIN(max_txsize_lookup[bsize], tx_mode_to_biggest_tx_size[cm->tx_mode]); @@ -2054,7 +2054,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, int comp_pred = 0; int force_mv_inter_layer = 0; PREDICTION_MODE this_mode; - second_ref_frame = NONE; + second_ref_frame = NO_REF_FRAME; if (idx < num_inter_modes) { this_mode = ref_mode_set[idx].pred_mode; @@ -2631,7 +2631,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, best_pickmode.best_mode = this_mode; best_pickmode.best_intra_tx_size = mi->tx_size; best_pickmode.best_ref_frame = INTRA_FRAME; - best_pickmode.best_second_ref_frame = NONE; + best_pickmode.best_second_ref_frame = NO_REF_FRAME; mi->uv_mode = this_mode; mi->mv[0].as_int = INVALID_MV; mi->mv[1].as_int = INVALID_MV; @@ -2753,8 +2753,8 @@ void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, int mi_row, MODE_INFO *const mi = xd->mi[0]; MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; const struct segmentation *const seg = &cm->seg; - MV_REFERENCE_FRAME ref_frame, second_ref_frame = NONE; - MV_REFERENCE_FRAME best_ref_frame = NONE; + MV_REFERENCE_FRAME ref_frame, second_ref_frame = NO_REF_FRAME; + MV_REFERENCE_FRAME best_ref_frame = NO_REF_FRAME; unsigned char segment_id = mi->segment_id; struct buf_2d yv12_mb[4][MAX_MB_PLANE]; int64_t best_rd = INT64_MAX; @@ -2793,7 +2793,7 @@ void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, int mi_row, mi->tx_size = TX_4X4; mi->uv_mode = DC_PRED; mi->ref_frame[0] = LAST_FRAME; - mi->ref_frame[1] = NONE; + mi->ref_frame[1] = NO_REF_FRAME; mi->interp_filter = cm->interp_filter == SWITCHABLE ? EIGHTTAP : cm->interp_filter; diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index 9454802a5e..b7fb26de27 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -86,28 +86,28 @@ struct rdcost_block_args { #if !CONFIG_REALTIME_ONLY static const MODE_DEFINITION vp9_mode_order[MAX_MODES] = { - { NEARESTMV, { LAST_FRAME, NONE } }, - { NEARESTMV, { ALTREF_FRAME, NONE } }, - { NEARESTMV, { GOLDEN_FRAME, NONE } }, + { NEARESTMV, { LAST_FRAME, NO_REF_FRAME } }, + { NEARESTMV, { ALTREF_FRAME, NO_REF_FRAME } }, + { NEARESTMV, { GOLDEN_FRAME, NO_REF_FRAME } }, - { DC_PRED, { INTRA_FRAME, NONE } }, + { DC_PRED, { INTRA_FRAME, NO_REF_FRAME } }, - { NEWMV, { LAST_FRAME, NONE } }, - { NEWMV, { ALTREF_FRAME, NONE } }, - { NEWMV, { GOLDEN_FRAME, NONE } }, + { NEWMV, { LAST_FRAME, NO_REF_FRAME } }, + { NEWMV, { ALTREF_FRAME, NO_REF_FRAME } }, + { NEWMV, { GOLDEN_FRAME, NO_REF_FRAME } }, - { NEARMV, { LAST_FRAME, NONE } }, - { NEARMV, { ALTREF_FRAME, NONE } }, - { NEARMV, { GOLDEN_FRAME, NONE } }, + { NEARMV, { LAST_FRAME, NO_REF_FRAME } }, + { NEARMV, { ALTREF_FRAME, NO_REF_FRAME } }, + { NEARMV, { GOLDEN_FRAME, NO_REF_FRAME } }, - { ZEROMV, { LAST_FRAME, NONE } }, - { ZEROMV, { GOLDEN_FRAME, NONE } }, - { ZEROMV, { ALTREF_FRAME, NONE } }, + { ZEROMV, { LAST_FRAME, NO_REF_FRAME } }, + { ZEROMV, { GOLDEN_FRAME, NO_REF_FRAME } }, + { ZEROMV, { ALTREF_FRAME, NO_REF_FRAME } }, { NEARESTMV, { LAST_FRAME, ALTREF_FRAME } }, { NEARESTMV, { GOLDEN_FRAME, ALTREF_FRAME } }, - { TM_PRED, { INTRA_FRAME, NONE } }, + { TM_PRED, { INTRA_FRAME, NO_REF_FRAME } }, { NEARMV, { LAST_FRAME, ALTREF_FRAME } }, { NEWMV, { LAST_FRAME, ALTREF_FRAME } }, @@ -117,20 +117,20 @@ static const MODE_DEFINITION vp9_mode_order[MAX_MODES] = { { ZEROMV, { LAST_FRAME, ALTREF_FRAME } }, { ZEROMV, { GOLDEN_FRAME, ALTREF_FRAME } }, - { H_PRED, { INTRA_FRAME, NONE } }, - { V_PRED, { INTRA_FRAME, NONE } }, - { D135_PRED, { INTRA_FRAME, NONE } }, - { D207_PRED, { INTRA_FRAME, NONE } }, - { D153_PRED, { INTRA_FRAME, NONE } }, - { D63_PRED, { INTRA_FRAME, NONE } }, - { D117_PRED, { INTRA_FRAME, NONE } }, - { D45_PRED, { INTRA_FRAME, NONE } }, + { H_PRED, { INTRA_FRAME, NO_REF_FRAME } }, + { V_PRED, { INTRA_FRAME, NO_REF_FRAME } }, + { D135_PRED, { INTRA_FRAME, NO_REF_FRAME } }, + { D207_PRED, { INTRA_FRAME, NO_REF_FRAME } }, + { D153_PRED, { INTRA_FRAME, NO_REF_FRAME } }, + { D63_PRED, { INTRA_FRAME, NO_REF_FRAME } }, + { D117_PRED, { INTRA_FRAME, NO_REF_FRAME } }, + { D45_PRED, { INTRA_FRAME, NO_REF_FRAME } }, }; static const REF_DEFINITION vp9_ref_order[MAX_REFS] = { - { { LAST_FRAME, NONE } }, { { GOLDEN_FRAME, NONE } }, - { { ALTREF_FRAME, NONE } }, { { LAST_FRAME, ALTREF_FRAME } }, - { { GOLDEN_FRAME, ALTREF_FRAME } }, { { INTRA_FRAME, NONE } }, + { { LAST_FRAME, NO_REF_FRAME } }, { { GOLDEN_FRAME, NO_REF_FRAME } }, + { { ALTREF_FRAME, NO_REF_FRAME } }, { { LAST_FRAME, ALTREF_FRAME } }, + { { GOLDEN_FRAME, ALTREF_FRAME } }, { { INTRA_FRAME, NO_REF_FRAME } }, }; #endif // !CONFIG_REALTIME_ONLY @@ -1811,7 +1811,7 @@ static int check_best_zero_mv(const VP9_COMP *cpi, const MV_REFERENCE_FRAME ref_frames[2]) { if ((this_mode == NEARMV || this_mode == NEARESTMV || this_mode == ZEROMV) && frame_mv[this_mode][ref_frames[0]].as_int == 0 && - (ref_frames[1] == NONE || + (ref_frames[1] == NO_REF_FRAME || frame_mv[this_mode][ref_frames[1]].as_int == 0)) { int rfc = mode_context[ref_frames[0]]; int c1 = cost_mv_ref(cpi, NEARMV, rfc); @@ -1824,7 +1824,7 @@ static int check_best_zero_mv(const VP9_COMP *cpi, if (c2 > c3) return 0; } else { assert(this_mode == ZEROMV); - if (ref_frames[1] == NONE) { + if (ref_frames[1] == NO_REF_FRAME) { if ((c3 >= c2 && frame_mv[NEARESTMV][ref_frames[0]].as_int == 0) || (c3 >= c1 && frame_mv[NEARMV][ref_frames[0]].as_int == 0)) return 0; @@ -3241,7 +3241,7 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *rd_cost, x->skip_encode = 0; ctx->skip = 0; xd->mi[0]->ref_frame[0] = INTRA_FRAME; - xd->mi[0]->ref_frame[1] = NONE; + xd->mi[0]->ref_frame[1] = NO_REF_FRAME; // Initialize interp_filter here so we do not have to check for inter block // modes in get_pred_context_switchable_interp() xd->mi[0]->interp_filter = SWITCHABLE_FILTERS; @@ -3686,7 +3686,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, ref_frame_skip_mask[0] |= GOLDEN_FRAME_MODE_MASK; break; case ALTREF_FRAME: ref_frame_skip_mask[0] |= ALT_REF_MODE_MASK; break; - case NONE: + case NO_REF_FRAME: case MAX_REF_FRAMES: assert(0 && "Invalid Reference frame"); break; } } @@ -3719,7 +3719,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, MODE_INFO *ref_mi; int const_motion = 1; int skip_ref_frame = !cb_partition_search_ctrl; - MV_REFERENCE_FRAME rf = NONE; + MV_REFERENCE_FRAME rf = NO_REF_FRAME; int_mv ref_mv; ref_mv.as_int = INVALID_MV; @@ -3736,7 +3736,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, if ((mi_col - 1) >= tile_info->mi_col_start) { if (ref_mv.as_int == INVALID_MV) ref_mv = xd->mi[-1]->mv[0]; - if (rf == NONE) rf = xd->mi[-1]->ref_frame[0]; + if (rf == NO_REF_FRAME) rf = xd->mi[-1]->ref_frame[0]; for (i = 0; i < mi_height; ++i) { ref_mi = xd->mi[i * xd->mi_stride - 1]; const_motion &= (ref_mv.as_int == ref_mi->mv[0].as_int) && @@ -4230,7 +4230,7 @@ void vp9_rd_pick_inter_mode_sb_seg_skip(VP9_COMP *cpi, TileDataEnc *tile_data, mi->mode = ZEROMV; mi->uv_mode = DC_PRED; mi->ref_frame[0] = LAST_FRAME; - mi->ref_frame[1] = NONE; + mi->ref_frame[1] = NO_REF_FRAME; mi->mv[0].as_int = 0; x->skip = 1; @@ -4412,7 +4412,7 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data, case ALTREF_FRAME: ref_frame_skip_mask[0] |= (1 << GOLDEN_FRAME) | (1 << LAST_FRAME); break; - case NONE: + case NO_REF_FRAME: case MAX_REF_FRAMES: assert(0 && "Invalid Reference frame"); break; } } @@ -4899,7 +4899,7 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data, mi->mv[1].as_int = xd->mi[0]->bmi[3].as_mv[1].as_int; } // If the second reference does not exist, set the corresponding mv to zero. - if (mi->ref_frame[1] == NONE) { + if (mi->ref_frame[1] == NO_REF_FRAME) { mi->mv[1].as_int = 0; for (i = 0; i < 4; ++i) { mi->bmi[i].as_mv[1].as_int = 0; diff --git a/vp9/simple_encode.cc b/vp9/simple_encode.cc index f42912d35b..2e2a3746e7 100644 --- a/vp9/simple_encode.cc +++ b/vp9/simple_encode.cc @@ -183,10 +183,11 @@ static void update_motion_vector_info( const MV_REFERENCE_FRAME *in_ref_frame = input_motion_vector_info[i].ref_frame; output_motion_vector_info[i].mv_count = - (in_ref_frame[0] == INTRA_FRAME) ? 0 - : ((in_ref_frame[1] == NONE) ? 1 : 2); - if (in_ref_frame[0] == NONE) { - fprintf(stderr, "in_ref_frame[0] shouldn't be NONE\n"); + (in_ref_frame[0] == INTRA_FRAME) + ? 0 + : ((in_ref_frame[1] == NO_REF_FRAME) ? 1 : 2); + if (in_ref_frame[0] == NO_REF_FRAME) { + fprintf(stderr, "in_ref_frame[0] shouldn't be NO_REF_FRAME\n"); abort(); } output_motion_vector_info[i].ref_frame[0] = From 8c308aefea7c58a1a979b81f4aa6d68908e379ee Mon Sep 17 00:00:00 2001 From: Anupam Pandey Date: Tue, 6 Jun 2023 12:27:34 +0530 Subject: [PATCH 0819/1000] Fix c vs intrinsic mismatch of vpx_hadamard_32x32() function This CL resolves the mismatch between C and intrinsic implementation of vpx_hadamard_32x32 function. The mismatch was due to integer overflow during the addition operation in the intrinsic functions. Specifically, the addition in the intrinsic function was performed at the 16-bit level, while the calculation of a0 + a1 resulted in a 17-bit value. This code change addresses the problem by performing the addition at the 32-bit level (with sign extension) in both SSE2 and AVX2, and then converting the results back to the 16-bit level after a right shift. STATS_CHANGED Change-Id: I576ca64e3b9ebb31d143fcd2da64322790bc5853 --- test/hadamard_test.cc | 27 ++++++++++++++++++ vpx_dsp/avg.c | 8 +++--- vpx_dsp/x86/avg_intrin_avx2.c | 53 +++++++++++++++++++++++++++++------ vpx_dsp/x86/avg_intrin_sse2.c | 53 +++++++++++++++++++++++++++++------ 4 files changed, 121 insertions(+), 20 deletions(-) diff --git a/test/hadamard_test.cc b/test/hadamard_test.cc index 9f6c99f3c4..0de6622e20 100644 --- a/test/hadamard_test.cc +++ b/test/hadamard_test.cc @@ -170,6 +170,31 @@ class HadamardTestBase : public ::testing::TestWithParam { EXPECT_EQ(0, memcmp(b, b_ref, sizeof(b))); } + void ExtremeValuesTest() { + const int kMaxBlockSize = 32 * 32; + DECLARE_ALIGNED(16, int16_t, input_extreme_block[kMaxBlockSize]); + DECLARE_ALIGNED(16, tran_low_t, b[kMaxBlockSize]); + memset(b, 0, sizeof(b)); + + tran_low_t b_ref[kMaxBlockSize]; + memset(b_ref, 0, sizeof(b_ref)); + + for (int i = 0; i < 2; ++i) { + // Initialize a test block with input range [-mask_, mask_]. + const int sign = (i == 0) ? 1 : -1; + for (int j = 0; j < kMaxBlockSize; ++j) + input_extreme_block[j] = sign * 255; + + ReferenceHadamard(input_extreme_block, bwh_, b_ref, bwh_); + ASM_REGISTER_STATE_CHECK(h_func_(input_extreme_block, bwh_, b)); + + // The order of the output is not important. Sort before checking. + std::sort(b, b + block_size_); + std::sort(b_ref, b_ref + block_size_); + EXPECT_EQ(0, memcmp(b, b_ref, sizeof(b))); + } + } + void VaryStride() { const int kMaxBlockSize = 32 * 32; DECLARE_ALIGNED(16, int16_t, a[kMaxBlockSize * 8]); @@ -225,6 +250,8 @@ class HadamardLowbdTest : public HadamardTestBase { TEST_P(HadamardLowbdTest, CompareReferenceRandom) { CompareReferenceRandom(); } +TEST_P(HadamardLowbdTest, ExtremeValuesTest) { ExtremeValuesTest(); } + TEST_P(HadamardLowbdTest, VaryStride) { VaryStride(); } TEST_P(HadamardLowbdTest, DISABLED_Speed) { diff --git a/vpx_dsp/avg.c b/vpx_dsp/avg.c index 391e9eb144..a8dcab7dae 100644 --- a/vpx_dsp/avg.c +++ b/vpx_dsp/avg.c @@ -295,19 +295,19 @@ void vpx_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride, vpx_hadamard_16x16_c(src_ptr, src_stride, coeff + idx * 256); } - // coeff: 15 bit, dynamic range [-16320, 16320] + // coeff: 16 bit, dynamic range [-32768, 32767] for (idx = 0; idx < 256; ++idx) { tran_low_t a0 = coeff[0]; tran_low_t a1 = coeff[256]; tran_low_t a2 = coeff[512]; tran_low_t a3 = coeff[768]; - tran_low_t b0 = (a0 + a1) >> 2; // (a0 + a1): 16 bit, [-32640, 32640] + tran_low_t b0 = (a0 + a1) >> 2; // (a0 + a1): 17 bit, [-65536, 65535] tran_low_t b1 = (a0 - a1) >> 2; // b0-b3: 15 bit, dynamic range - tran_low_t b2 = (a2 + a3) >> 2; // [-16320, 16320] + tran_low_t b2 = (a2 + a3) >> 2; // [-16384, 16383] tran_low_t b3 = (a2 - a3) >> 2; - coeff[0] = b0 + b2; // 16 bit, [-32640, 32640] + coeff[0] = b0 + b2; // 16 bit, [-32768, 32767] coeff[256] = b1 + b3; coeff[512] = b0 - b2; coeff[768] = b1 - b3; diff --git a/vpx_dsp/x86/avg_intrin_avx2.c b/vpx_dsp/x86/avg_intrin_avx2.c index b2e01319d3..61e4e73c5b 100644 --- a/vpx_dsp/x86/avg_intrin_avx2.c +++ b/vpx_dsp/x86/avg_intrin_avx2.c @@ -218,6 +218,14 @@ void vpx_highbd_hadamard_32x32_avx2(const int16_t *src_diff, } #endif // CONFIG_VP9_HIGHBITDEPTH +static INLINE void sign_extend_16bit_to_32bit_avx2(__m256i in, __m256i zero, + __m256i *out_lo, + __m256i *out_hi) { + const __m256i sign_bits = _mm256_cmpgt_epi16(zero, in); + *out_lo = _mm256_unpacklo_epi16(in, sign_bits); + *out_hi = _mm256_unpackhi_epi16(in, sign_bits); +} + static void hadamard_col8x2_avx2(__m256i *in, int iter) { __m256i a0 = in[0]; __m256i a1 = in[1]; @@ -400,6 +408,12 @@ void vpx_hadamard_32x32_avx2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *t_coeff = coeff; #endif int idx; + __m256i coeff0_lo, coeff1_lo, coeff2_lo, coeff3_lo, b0_lo, b1_lo, b2_lo, + b3_lo; + __m256i coeff0_hi, coeff1_hi, coeff2_hi, coeff3_hi, b0_hi, b1_hi, b2_hi, + b3_hi; + __m256i b0, b1, b2, b3; + const __m256i zero = _mm256_setzero_si256(); for (idx = 0; idx < 4; ++idx) { // src_diff: 9 bit, dynamic range [-255, 255] const int16_t *src_ptr = @@ -414,15 +428,38 @@ void vpx_hadamard_32x32_avx2(const int16_t *src_diff, ptrdiff_t src_stride, const __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 512)); const __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 768)); - __m256i b0 = _mm256_add_epi16(coeff0, coeff1); - __m256i b1 = _mm256_sub_epi16(coeff0, coeff1); - __m256i b2 = _mm256_add_epi16(coeff2, coeff3); - __m256i b3 = _mm256_sub_epi16(coeff2, coeff3); + // Sign extend 16 bit to 32 bit. + sign_extend_16bit_to_32bit_avx2(coeff0, zero, &coeff0_lo, &coeff0_hi); + sign_extend_16bit_to_32bit_avx2(coeff1, zero, &coeff1_lo, &coeff1_hi); + sign_extend_16bit_to_32bit_avx2(coeff2, zero, &coeff2_lo, &coeff2_hi); + sign_extend_16bit_to_32bit_avx2(coeff3, zero, &coeff3_lo, &coeff3_hi); + + b0_lo = _mm256_add_epi32(coeff0_lo, coeff1_lo); + b0_hi = _mm256_add_epi32(coeff0_hi, coeff1_hi); + + b1_lo = _mm256_sub_epi32(coeff0_lo, coeff1_lo); + b1_hi = _mm256_sub_epi32(coeff0_hi, coeff1_hi); + + b2_lo = _mm256_add_epi32(coeff2_lo, coeff3_lo); + b2_hi = _mm256_add_epi32(coeff2_hi, coeff3_hi); + + b3_lo = _mm256_sub_epi32(coeff2_lo, coeff3_lo); + b3_hi = _mm256_sub_epi32(coeff2_hi, coeff3_hi); + + b0_lo = _mm256_srai_epi32(b0_lo, 2); + b1_lo = _mm256_srai_epi32(b1_lo, 2); + b2_lo = _mm256_srai_epi32(b2_lo, 2); + b3_lo = _mm256_srai_epi32(b3_lo, 2); + + b0_hi = _mm256_srai_epi32(b0_hi, 2); + b1_hi = _mm256_srai_epi32(b1_hi, 2); + b2_hi = _mm256_srai_epi32(b2_hi, 2); + b3_hi = _mm256_srai_epi32(b3_hi, 2); - b0 = _mm256_srai_epi16(b0, 2); - b1 = _mm256_srai_epi16(b1, 2); - b2 = _mm256_srai_epi16(b2, 2); - b3 = _mm256_srai_epi16(b3, 2); + b0 = _mm256_packs_epi32(b0_lo, b0_hi); + b1 = _mm256_packs_epi32(b1_lo, b1_hi); + b2 = _mm256_packs_epi32(b2_lo, b2_hi); + b3 = _mm256_packs_epi32(b3_lo, b3_hi); store_tran_low(_mm256_add_epi16(b0, b2), coeff); store_tran_low(_mm256_add_epi16(b1, b3), coeff + 256); diff --git a/vpx_dsp/x86/avg_intrin_sse2.c b/vpx_dsp/x86/avg_intrin_sse2.c index 015c11a1f3..4447dfab7c 100644 --- a/vpx_dsp/x86/avg_intrin_sse2.c +++ b/vpx_dsp/x86/avg_intrin_sse2.c @@ -15,6 +15,14 @@ #include "vpx_dsp/x86/bitdepth_conversion_sse2.h" #include "vpx_ports/mem.h" +static INLINE void sign_extend_16bit_to_32bit_sse2(__m128i in, __m128i zero, + __m128i *out_lo, + __m128i *out_hi) { + const __m128i sign_bits = _mm_cmplt_epi16(in, zero); + *out_lo = _mm_unpacklo_epi16(in, sign_bits); + *out_hi = _mm_unpackhi_epi16(in, sign_bits); +} + void vpx_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max) { __m128i u0, s0, d0, diff, maxabsdiff, minabsdiff, negdiff, absdiff0, absdiff; @@ -400,6 +408,12 @@ void vpx_hadamard_32x32_sse2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *t_coeff = coeff; #endif int idx; + __m128i coeff0_lo, coeff1_lo, coeff2_lo, coeff3_lo, b0_lo, b1_lo, b2_lo, + b3_lo; + __m128i coeff0_hi, coeff1_hi, coeff2_hi, coeff3_hi, b0_hi, b1_hi, b2_hi, + b3_hi; + __m128i b0, b1, b2, b3; + const __m128i zero = _mm_setzero_si128(); for (idx = 0; idx < 4; ++idx) { const int16_t *src_ptr = src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16; @@ -413,15 +427,38 @@ void vpx_hadamard_32x32_sse2(const int16_t *src_diff, ptrdiff_t src_stride, __m128i coeff2 = _mm_load_si128((const __m128i *)(t_coeff + 512)); __m128i coeff3 = _mm_load_si128((const __m128i *)(t_coeff + 768)); - __m128i b0 = _mm_add_epi16(coeff0, coeff1); - __m128i b1 = _mm_sub_epi16(coeff0, coeff1); - __m128i b2 = _mm_add_epi16(coeff2, coeff3); - __m128i b3 = _mm_sub_epi16(coeff2, coeff3); + // Sign extend 16 bit to 32 bit. + sign_extend_16bit_to_32bit_sse2(coeff0, zero, &coeff0_lo, &coeff0_hi); + sign_extend_16bit_to_32bit_sse2(coeff1, zero, &coeff1_lo, &coeff1_hi); + sign_extend_16bit_to_32bit_sse2(coeff2, zero, &coeff2_lo, &coeff2_hi); + sign_extend_16bit_to_32bit_sse2(coeff3, zero, &coeff3_lo, &coeff3_hi); + + b0_lo = _mm_add_epi32(coeff0_lo, coeff1_lo); + b0_hi = _mm_add_epi32(coeff0_hi, coeff1_hi); + + b1_lo = _mm_sub_epi32(coeff0_lo, coeff1_lo); + b1_hi = _mm_sub_epi32(coeff0_hi, coeff1_hi); + + b2_lo = _mm_add_epi32(coeff2_lo, coeff3_lo); + b2_hi = _mm_add_epi32(coeff2_hi, coeff3_hi); + + b3_lo = _mm_sub_epi32(coeff2_lo, coeff3_lo); + b3_hi = _mm_sub_epi32(coeff2_hi, coeff3_hi); + + b0_lo = _mm_srai_epi32(b0_lo, 2); + b1_lo = _mm_srai_epi32(b1_lo, 2); + b2_lo = _mm_srai_epi32(b2_lo, 2); + b3_lo = _mm_srai_epi32(b3_lo, 2); + + b0_hi = _mm_srai_epi32(b0_hi, 2); + b1_hi = _mm_srai_epi32(b1_hi, 2); + b2_hi = _mm_srai_epi32(b2_hi, 2); + b3_hi = _mm_srai_epi32(b3_hi, 2); - b0 = _mm_srai_epi16(b0, 2); - b1 = _mm_srai_epi16(b1, 2); - b2 = _mm_srai_epi16(b2, 2); - b3 = _mm_srai_epi16(b3, 2); + b0 = _mm_packs_epi32(b0_lo, b0_hi); + b1 = _mm_packs_epi32(b1_lo, b1_hi); + b2 = _mm_packs_epi32(b2_lo, b2_hi); + b3 = _mm_packs_epi32(b3_lo, b3_hi); coeff0 = _mm_add_epi16(b0, b2); coeff1 = _mm_add_epi16(b1, b3); From bdb8ccc0af49a87c9f4ee08f1d363ceec347ab6e Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Fri, 9 Jun 2023 15:33:39 -0400 Subject: [PATCH 0820/1000] RTC RC: clean up unnecessary headers Change-Id: I77c407be59f4eb0c70a89a5fffd88c648e634123 --- vp9/ratectrl_rtc.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/vp9/ratectrl_rtc.h b/vp9/ratectrl_rtc.h index 7f3c900459..d3876de875 100644 --- a/vp9/ratectrl_rtc.h +++ b/vp9/ratectrl_rtc.h @@ -14,12 +14,9 @@ #include #include -#include "vp9/common/vp9_entropymode.h" #include "vp9/common/vp9_enums.h" -#include "vp9/common/vp9_onyxc_int.h" #include "vp9/vp9_iface_common.h" #include "vp9/encoder/vp9_aq_cyclicrefresh.h" -#include "vp9/encoder/vp9_firstpass.h" #include "vp9/vp9_cx_iface.h" #include "vpx/internal/vpx_ratectrl_rtc.h" #include "vpx_mem/vpx_mem.h" From 8cee267d3d056ea006e0b5bb380742e3da0a5480 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Wed, 14 Jun 2023 16:20:30 -0400 Subject: [PATCH 0821/1000] Add new_mv_count to firstpass stats Mostly follows the logic of how it's calculated in libaom. Bug: b/287283080 Change-Id: I9ee67d844ef9db7cca63339b5304459eaa28d324 --- test/vp9_ethread_test.cc | 8 ++++---- vp9/encoder/vp9_encodeframe.c | 2 ++ vp9/encoder/vp9_encoder.h | 1 + vp9/encoder/vp9_ethread.c | 1 + vp9/encoder/vp9_firstpass.c | 22 +++++++++++++++++++++- vp9/encoder/vp9_firstpass.h | 2 ++ 6 files changed, 31 insertions(+), 5 deletions(-) diff --git a/test/vp9_ethread_test.cc b/test/vp9_ethread_test.cc index 238366cb60..54fa6c48e2 100644 --- a/test/vp9_ethread_test.cc +++ b/test/vp9_ethread_test.cc @@ -21,12 +21,12 @@ namespace { // FIRSTPASS_STATS struct: // { -// 25 double members; +// 26 double members; // 1 int64_t member; // } // Whenever FIRSTPASS_STATS struct is modified, the following constants need to // be revisited. -const int kDbl = 25; +const int kDbl = 26; const int kInt = 1; const size_t kFirstPassStatsSz = kDbl * sizeof(double) + kInt * sizeof(int64_t); @@ -185,7 +185,7 @@ TEST_P(VPxFirstPassEncoderThreadTest, FirstPassStatsTest) { ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); // Compare to check if using or not using row-mt generates close stats. - ASSERT_NO_FATAL_FAILURE(compare_fp_stats(&firstpass_stats_, 1000.0)); + ASSERT_NO_FATAL_FAILURE(compare_fp_stats(&firstpass_stats_, 400.0)); // Test single thread vs multiple threads row_mt_mode_ = 1; @@ -199,7 +199,7 @@ TEST_P(VPxFirstPassEncoderThreadTest, FirstPassStatsTest) { ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); // Compare to check if single-thread and multi-thread stats are close enough. - ASSERT_NO_FATAL_FAILURE(compare_fp_stats(&firstpass_stats_, 1000.0)); + ASSERT_NO_FATAL_FAILURE(compare_fp_stats(&firstpass_stats_, 400.0)); // Bit exact test in row_mt mode. // When row_mt_mode_=1 and using >1 threads, the encoder generates bit exact diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index 7280e0da8b..0d03d01c80 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -5832,6 +5832,7 @@ void vp9_init_tile_data(VP9_COMP *cpi) { TileDataEnc *tile_data = &cpi->tile_data[tile_row * tile_cols + tile_col]; int i, j; + const MV zero_mv = { 0, 0 }; for (i = 0; i < BLOCK_SIZES; ++i) { for (j = 0; j < MAX_MODES; ++j) { tile_data->thresh_freq_fact[i][j] = RD_THRESH_INIT_FACT; @@ -5839,6 +5840,7 @@ void vp9_init_tile_data(VP9_COMP *cpi) { tile_data->mode_map[i][j] = j; } } + tile_data->firstpass_top_mv = zero_mv; #if CONFIG_MULTITHREAD tile_data->row_base_thresh_freq_fact = NULL; #endif diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 333ff0b36a..7b02fe7f6b 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -338,6 +338,7 @@ typedef struct TileDataEnc { // Used for adaptive_rd_thresh with row multithreading int *row_base_thresh_freq_fact; + MV firstpass_top_mv; } TileDataEnc; typedef struct RowMTInfo { diff --git a/vp9/encoder/vp9_ethread.c b/vp9/encoder/vp9_ethread.c index fadd233899..681996d334 100644 --- a/vp9/encoder/vp9_ethread.c +++ b/vp9/encoder/vp9_ethread.c @@ -265,6 +265,7 @@ static void accumulate_fp_tile_stat(TileDataEnc *tile_data, tile_data->fp_data.intra_count_high += tile_data_t->fp_data.intra_count_high; tile_data->fp_data.intra_skip_count += tile_data_t->fp_data.intra_skip_count; tile_data->fp_data.mvcount += tile_data_t->fp_data.mvcount; + tile_data->fp_data.new_mv_count += tile_data_t->fp_data.new_mv_count; tile_data->fp_data.sum_mvr += tile_data_t->fp_data.sum_mvr; tile_data->fp_data.sum_mvr_abs += tile_data_t->fp_data.sum_mvr_abs; tile_data->fp_data.sum_mvc += tile_data_t->fp_data.sum_mvc; diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index 22669ab847..42e935740a 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -152,6 +152,7 @@ static void zero_stats(FIRSTPASS_STATS *section) { section->pcnt_intra_high = 0.0; section->inactive_zone_rows = 0.0; section->inactive_zone_cols = 0.0; + section->new_mv_count = 0.0; section->MVr = 0.0; section->mvr_abs = 0.0; section->MVc = 0.0; @@ -183,6 +184,7 @@ static void accumulate_stats(FIRSTPASS_STATS *section, section->pcnt_intra_high += frame->pcnt_intra_high; section->inactive_zone_rows += frame->inactive_zone_rows; section->inactive_zone_cols += frame->inactive_zone_cols; + section->new_mv_count += frame->new_mv_count; section->MVr += frame->MVr; section->mvr_abs += frame->mvr_abs; section->MVc += frame->MVc; @@ -212,6 +214,7 @@ static void subtract_stats(FIRSTPASS_STATS *section, section->pcnt_intra_high -= frame->pcnt_intra_high; section->inactive_zone_rows -= frame->inactive_zone_rows; section->inactive_zone_cols -= frame->inactive_zone_cols; + section->new_mv_count -= frame->new_mv_count; section->MVr -= frame->MVr; section->mvr_abs -= frame->mvr_abs; section->MVc -= frame->MVc; @@ -804,6 +807,7 @@ static void first_pass_stat_calc(VP9_COMP *cpi, FIRSTPASS_STATS *fps, fps->inactive_zone_cols = (double)0; if (fp_acc_data->mvcount > 0) { + fps->new_mv_count = (double)(fp_acc_data->new_mv_count) / num_mbs; fps->MVr = (double)(fp_acc_data->sum_mvr) / fp_acc_data->mvcount; fps->mvr_abs = (double)(fp_acc_data->sum_mvr_abs) / fp_acc_data->mvcount; fps->MVc = (double)(fp_acc_data->sum_mvc) / fp_acc_data->mvcount; @@ -820,6 +824,7 @@ static void first_pass_stat_calc(VP9_COMP *cpi, FIRSTPASS_STATS *fps, (double)(fp_acc_data->sum_in_vectors) / (fp_acc_data->mvcount * 2); fps->pcnt_motion = (double)(fp_acc_data->mvcount) / num_mbs; } else { + fps->new_mv_count = 0.0; fps->MVr = 0.0; fps->mvr_abs = 0.0; fps->MVc = 0.0; @@ -845,6 +850,7 @@ static void accumulate_fp_mb_row_stat(TileDataEnc *this_tile, this_tile->fp_data.intra_count_low += fp_acc_data->intra_count_low; this_tile->fp_data.intra_count_high += fp_acc_data->intra_count_high; this_tile->fp_data.intra_skip_count += fp_acc_data->intra_skip_count; + this_tile->fp_data.new_mv_count += fp_acc_data->new_mv_count; this_tile->fp_data.mvcount += fp_acc_data->mvcount; this_tile->fp_data.sum_mvr += fp_acc_data->sum_mvr; this_tile->fp_data.sum_mvr_abs += fp_acc_data->sum_mvr_abs; @@ -915,6 +921,9 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, double mb_neutral_count; int scaled_low_intra_thresh = scale_sse_threshold(cm, LOW_I_THRESH); + MV *first_top_mv = &tile_data->firstpass_top_mv; + MV last_nonzero_mv = { 0, 0 }; + // First pass code requires valid last and new frame buffers. assert(new_yv12 != NULL); assert(frame_is_intra_only(cm) || (lst_yv12 != NULL)); @@ -955,6 +964,10 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, (*(cpi->row_mt_sync_read_ptr))(&tile_data->row_mt_sync, mb_row, c); + if (mb_col == mb_col_start) { + last_nonzero_mv = *first_top_mv; + } + // Adjust to the next column of MBs. x->plane[0].src.buf = cpi->Source->y_buffer + mb_row * 16 * x->plane[0].src.stride + mb_col * 16; @@ -1279,6 +1292,10 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, if (!is_zero_mv(&mv)) { ++(fp_acc_data->mvcount); + if (!is_equal_mv(&mv, &last_nonzero_mv)) { + ++(fp_acc_data->new_mv_count); + } + last_nonzero_mv = mv; // Does the row vector point inwards or outwards? if (mb_row < cm->mb_rows / 2) { @@ -1334,6 +1351,9 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, } fp_acc_data->coded_error += (int64_t)this_error; + if (mb_col == mb_col_start) { + *first_top_mv = last_nonzero_mv; + } recon_yoffset += 16; recon_uvoffset += uv_mb_height; @@ -1356,7 +1376,7 @@ static void first_pass_encode(VP9_COMP *cpi, FIRSTPASS_DATA *fp_acc_data) { MV best_ref_mv; // Tiling is ignored in the first pass. vp9_tile_init(tile, cm, 0, 0); - + tile_data.firstpass_top_mv = zero_mv; #if CONFIG_RATE_CTRL if (cpi->oxcf.use_simple_encode_api) { fp_motion_vector_info_reset(cpi->frame_info.frame_width, diff --git a/vp9/encoder/vp9_firstpass.h b/vp9/encoder/vp9_firstpass.h index cdcf568723..3ba336b34f 100644 --- a/vp9/encoder/vp9_firstpass.h +++ b/vp9/encoder/vp9_firstpass.h @@ -55,6 +55,7 @@ typedef struct { int64_t sum_mvcs; int sum_in_vectors; int intra_smooth_count; + int new_mv_count; } FIRSTPASS_DATA; typedef struct { @@ -83,6 +84,7 @@ typedef struct { double mv_in_out_count; double duration; double count; + double new_mv_count; int64_t spatial_layer_id; } FIRSTPASS_STATS; From af40910197bb4cd24fe2f5870c386843cced70c2 Mon Sep 17 00:00:00 2001 From: Chen Wang Date: Fri, 16 Jun 2023 16:19:02 +0800 Subject: [PATCH 0822/1000] configure.sh: Improve a comment. The corresponding case block is not only for ARM. Original comment text makes reader confused. Test: N/A, just comment text changes. Change-Id: I3154d18d3b3d237c1eecfe07dc7ec237c98194cf Signed-off-by: Chen Wang --- build/make/configure.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build/make/configure.sh b/build/make/configure.sh index 6fd67f1623..7b2da3c1a1 100644 --- a/build/make/configure.sh +++ b/build/make/configure.sh @@ -970,7 +970,7 @@ process_common_toolchain() { ;; esac - # Process ARM architecture variants + # Process architecture variants case ${toolchain} in arm*) # on arm, isa versions are supersets From 80d4172f0705334db3c51113355ea4704d9a4240 Mon Sep 17 00:00:00 2001 From: Anupam Pandey Date: Wed, 14 Jun 2023 10:27:49 +0530 Subject: [PATCH 0823/1000] Remove vp9_diamond_search_sad_avx function This CL removes the avx of vp9_diamond_search_sad function as there is no speed up seen wrt C. Change-Id: Ife6005d8e444ea2c8d07ac0f686c840344b9e0ea --- vp9/common/vp9_rtcd_defs.pl | 2 +- vp9/encoder/vp9_encoder.c | 4 +- vp9/encoder/x86/vp9_diamond_search_sad_avx.c | 300 ------------------- vp9/vp9cx.mk | 1 - 4 files changed, 3 insertions(+), 304 deletions(-) delete mode 100644 vp9/encoder/x86/vp9_diamond_search_sad_avx.c diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index 4b94c31f15..1a4140b38b 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -172,7 +172,7 @@ () # Motion search # add_proto qw/int vp9_diamond_search_sad/, "const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_sad_table *sad_fn_ptr, const struct mv *center_mv"; -specialize qw/vp9_diamond_search_sad avx neon/; +specialize qw/vp9_diamond_search_sad neon/; # # Apply temporal filter diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 4d7f475a01..aaf42a2a3f 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -2187,7 +2187,7 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { * The following 2 functions ('cal_nmvjointsadcost' and * * 'cal_nmvsadcosts') are used to calculate cost lookup tables * * used by 'vp9_diamond_search_sad'. The C implementation of the * - * function is generic, but the AVX intrinsics optimised version * + * function is generic, but the NEON intrinsics optimised version * * relies on the following properties of the computed tables: * * For cal_nmvjointsadcost: * * - mvjointsadcost[1] == mvjointsadcost[2] == mvjointsadcost[3] * @@ -2196,7 +2196,7 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { * (Equal costs for both components) * * - For all i: mvsadcost[0][i] == mvsadcost[0][-i] * * (Cost function is even) * - * If these do not hold, then the AVX optimised version of the * + * If these do not hold, then the NEON optimised version of the * * 'vp9_diamond_search_sad' function cannot be used as it is, in which * * case you can revert to using the C function instead. * ***********************************************************************/ diff --git a/vp9/encoder/x86/vp9_diamond_search_sad_avx.c b/vp9/encoder/x86/vp9_diamond_search_sad_avx.c deleted file mode 100644 index 63c35df09e..0000000000 --- a/vp9/encoder/x86/vp9_diamond_search_sad_avx.c +++ /dev/null @@ -1,300 +0,0 @@ -/* - * Copyright (c) 2015 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#if defined(_MSC_VER) -#include -#endif -#include -#include - -#include "vpx_dsp/vpx_dsp_common.h" -#include "vp9/encoder/vp9_encoder.h" -#include "vpx_ports/mem.h" - -#ifdef __GNUC__ -#define LIKELY(v) __builtin_expect(v, 1) -#define UNLIKELY(v) __builtin_expect(v, 0) -#else -#define LIKELY(v) (v) -#define UNLIKELY(v) (v) -#endif - -static INLINE int_mv pack_int_mv(int16_t row, int16_t col) { - int_mv result; - result.as_mv.row = row; - result.as_mv.col = col; - return result; -} -/***************************************************************************** - * This function utilizes 3 properties of the cost function lookup tables, * - * constructed in using 'cal_nmvjointsadcost' and 'cal_nmvsadcosts' in * - * vp9_encoder.c. * - * For the joint cost: * - * - mvjointsadcost[1] == mvjointsadcost[2] == mvjointsadcost[3] * - * For the component costs: * - * - For all i: mvsadcost[0][i] == mvsadcost[1][i] * - * (Equal costs for both components) * - * - For all i: mvsadcost[0][i] == mvsadcost[0][-i] * - * (Cost function is even) * - * If these do not hold, then this function cannot be used without * - * modification, in which case you can revert to using the C implementation, * - * which does not rely on these properties. * - *****************************************************************************/ -int vp9_diamond_search_sad_avx(const MACROBLOCK *x, - const search_site_config *cfg, MV *ref_mv, - uint32_t start_mv_sad, MV *best_mv, - int search_param, int sad_per_bit, int *num00, - const vp9_sad_fn_ptr_t *sad_fn_ptr, - const MV *center_mv) { - const int_mv maxmv = pack_int_mv(x->mv_limits.row_max, x->mv_limits.col_max); - const __m128i v_max_mv_w = _mm_set1_epi32((int)maxmv.as_int); - const int_mv minmv = pack_int_mv(x->mv_limits.row_min, x->mv_limits.col_min); - const __m128i v_min_mv_w = _mm_set1_epi32((int)minmv.as_int); - - const __m128i v_spb_d = _mm_set1_epi32(sad_per_bit); - - const __m128i v_joint_cost_0_d = _mm_set1_epi32(x->nmvjointsadcost[0]); - const __m128i v_joint_cost_1_d = _mm_set1_epi32(x->nmvjointsadcost[1]); - - // search_param determines the length of the initial step and hence the number - // of iterations. - // 0 = initial step (MAX_FIRST_STEP) pel - // 1 = (MAX_FIRST_STEP/2) pel, - // 2 = (MAX_FIRST_STEP/4) pel... - const MV *ss_mv = &cfg->ss_mv[cfg->searches_per_step * search_param]; - const intptr_t *ss_os = &cfg->ss_os[cfg->searches_per_step * search_param]; - const int tot_steps = cfg->total_steps - search_param; - - const int_mv fcenter_mv = - pack_int_mv(center_mv->row >> 3, center_mv->col >> 3); - const __m128i vfcmv = _mm_set1_epi32((int)fcenter_mv.as_int); - - const int ref_row = ref_mv->row; - const int ref_col = ref_mv->col; - - int_mv bmv = pack_int_mv(ref_row, ref_col); - int_mv new_bmv = bmv; - __m128i v_bmv_w = _mm_set1_epi32((int)bmv.as_int); - - const int what_stride = x->plane[0].src.stride; - const int in_what_stride = x->e_mbd.plane[0].pre[0].stride; - const uint8_t *const what = x->plane[0].src.buf; - const uint8_t *const in_what = - x->e_mbd.plane[0].pre[0].buf + ref_row * in_what_stride + ref_col; - - // Work out the start point for the search - const uint8_t *best_address = in_what; - const uint8_t *new_best_address = best_address; -#if VPX_ARCH_X86_64 - __m128i v_ba_q = _mm_set1_epi64x((intptr_t)best_address); -#else - __m128i v_ba_d = _mm_set1_epi32((intptr_t)best_address); -#endif - // Starting position - unsigned int best_sad = start_mv_sad; - int i, j, step; - - // Check the prerequisite cost function properties that are easy to check - // in an assert. See the function-level documentation for details on all - // prerequisites. - assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[2]); - assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[3]); - - *num00 = 0; - - for (i = 0, step = 0; step < tot_steps; step++) { - for (j = 0; j < cfg->searches_per_step; j += 4, i += 4) { - __m128i v_sad_d, v_cost_d, v_outside_d, v_inside_d, v_diff_mv_w; -#if VPX_ARCH_X86_64 - __m128i v_blocka[2]; -#else - __m128i v_blocka[1]; -#endif - - // Compute the candidate motion vectors - const __m128i v_ss_mv_w = _mm_loadu_si128((const __m128i *)&ss_mv[i]); - const __m128i v_these_mv_w = _mm_add_epi16(v_bmv_w, v_ss_mv_w); - // Clamp them to the search bounds - __m128i v_these_mv_clamp_w = v_these_mv_w; - v_these_mv_clamp_w = _mm_min_epi16(v_these_mv_clamp_w, v_max_mv_w); - v_these_mv_clamp_w = _mm_max_epi16(v_these_mv_clamp_w, v_min_mv_w); - // The ones that did not change are inside the search area - v_inside_d = _mm_cmpeq_epi32(v_these_mv_clamp_w, v_these_mv_w); - - // If none of them are inside, then move on - if (LIKELY(_mm_test_all_zeros(v_inside_d, v_inside_d))) { - continue; - } - - // The inverse mask indicates which of the MVs are outside - v_outside_d = _mm_xor_si128(v_inside_d, _mm_set1_epi8((int8_t)0xff)); - // Shift right to keep the sign bit clear, we will use this later - // to set the cost to the maximum value. - v_outside_d = _mm_srli_epi32(v_outside_d, 1); - - // Compute the difference MV - v_diff_mv_w = _mm_sub_epi16(v_these_mv_clamp_w, vfcmv); - // We utilise the fact that the cost function is even, and use the - // absolute difference. This allows us to use unsigned indexes later - // and reduces cache pressure somewhat as only a half of the table - // is ever referenced. - v_diff_mv_w = _mm_abs_epi16(v_diff_mv_w); - - // Compute the SIMD pointer offsets. - { -#if VPX_ARCH_X86_64 // sizeof(intptr_t) == 8 - // Load the offsets - __m128i v_bo10_q = _mm_loadu_si128((const __m128i *)&ss_os[i + 0]); - __m128i v_bo32_q = _mm_loadu_si128((const __m128i *)&ss_os[i + 2]); - // Set the ones falling outside to zero - v_bo10_q = _mm_and_si128(v_bo10_q, _mm_cvtepi32_epi64(v_inside_d)); - v_bo32_q = - _mm_and_si128(v_bo32_q, _mm_unpackhi_epi32(v_inside_d, v_inside_d)); - // Compute the candidate addresses - v_blocka[0] = _mm_add_epi64(v_ba_q, v_bo10_q); - v_blocka[1] = _mm_add_epi64(v_ba_q, v_bo32_q); -#else // VPX_ARCH_X86 // sizeof(intptr_t) == 4 - __m128i v_bo_d = _mm_loadu_si128((const __m128i *)&ss_os[i]); - v_bo_d = _mm_and_si128(v_bo_d, v_inside_d); - v_blocka[0] = _mm_add_epi32(v_ba_d, v_bo_d); -#endif - } - - sad_fn_ptr->sdx4df(what, what_stride, (const uint8_t **)&v_blocka[0], - in_what_stride, (uint32_t *)&v_sad_d); - - // Look up the component cost of the residual motion vector - { - const int32_t row0 = _mm_extract_epi16(v_diff_mv_w, 0); - const int32_t col0 = _mm_extract_epi16(v_diff_mv_w, 1); - const int32_t row1 = _mm_extract_epi16(v_diff_mv_w, 2); - const int32_t col1 = _mm_extract_epi16(v_diff_mv_w, 3); - const int32_t row2 = _mm_extract_epi16(v_diff_mv_w, 4); - const int32_t col2 = _mm_extract_epi16(v_diff_mv_w, 5); - const int32_t row3 = _mm_extract_epi16(v_diff_mv_w, 6); - const int32_t col3 = _mm_extract_epi16(v_diff_mv_w, 7); - - // Note: This is a use case for vpgather in AVX2 - const uint32_t cost0 = x->nmvsadcost[0][row0] + x->nmvsadcost[0][col0]; - const uint32_t cost1 = x->nmvsadcost[0][row1] + x->nmvsadcost[0][col1]; - const uint32_t cost2 = x->nmvsadcost[0][row2] + x->nmvsadcost[0][col2]; - const uint32_t cost3 = x->nmvsadcost[0][row3] + x->nmvsadcost[0][col3]; - - __m128i v_cost_10_d, v_cost_32_d; - v_cost_10_d = _mm_cvtsi32_si128(cost0); - v_cost_10_d = _mm_insert_epi32(v_cost_10_d, cost1, 1); - v_cost_32_d = _mm_cvtsi32_si128(cost2); - v_cost_32_d = _mm_insert_epi32(v_cost_32_d, cost3, 1); - v_cost_d = _mm_unpacklo_epi64(v_cost_10_d, v_cost_32_d); - } - - // Now add in the joint cost - { - const __m128i v_sel_d = - _mm_cmpeq_epi32(v_diff_mv_w, _mm_setzero_si128()); - const __m128i v_joint_cost_d = - _mm_blendv_epi8(v_joint_cost_1_d, v_joint_cost_0_d, v_sel_d); - v_cost_d = _mm_add_epi32(v_cost_d, v_joint_cost_d); - } - - // Multiply by sad_per_bit - v_cost_d = _mm_mullo_epi32(v_cost_d, v_spb_d); - // ROUND_POWER_OF_TWO(v_cost_d, VP9_PROB_COST_SHIFT) - v_cost_d = _mm_add_epi32(v_cost_d, - _mm_set1_epi32(1 << (VP9_PROB_COST_SHIFT - 1))); - v_cost_d = _mm_srai_epi32(v_cost_d, VP9_PROB_COST_SHIFT); - // Add the cost to the sad - v_sad_d = _mm_add_epi32(v_sad_d, v_cost_d); - - // Make the motion vectors outside the search area have max cost - // by or'ing in the comparison mask, this way the minimum search won't - // pick them. - v_sad_d = _mm_or_si128(v_sad_d, v_outside_d); - - // Find the minimum value and index horizontally in v_sad_d - { - // Try speculatively on 16 bits, so we can use the minpos intrinsic - const __m128i v_sad_w = _mm_packus_epi32(v_sad_d, v_sad_d); - const __m128i v_minp_w = _mm_minpos_epu16(v_sad_w); - - uint32_t local_best_sad = _mm_extract_epi16(v_minp_w, 0); - uint32_t local_best_idx = _mm_extract_epi16(v_minp_w, 1); - - // If the local best value is not saturated, just use it, otherwise - // find the horizontal minimum again the hard way on 32 bits. - // This is executed rarely. - if (UNLIKELY(local_best_sad == 0xffff)) { - __m128i v_loval_d, v_hival_d, v_loidx_d, v_hiidx_d, v_sel_d; - - // Re-arrange the values in v_sad_d as follows: - // v_loval_d[0] = v_sad_d[0], v_loval_d[1] = v_sad_d[2] - // v_loval_d[2] = v_sad_d[1], v_loval_d[3] = v_sad_d[3] - // v_loidx_d stores the corresponding indices 0, 2, 1, 3 - // This re-arrangement is required to ensure that when there exists - // more than one minimum, the one with the least index is selected - v_loval_d = _mm_shuffle_epi32(v_sad_d, 0xd8); - v_loidx_d = _mm_set_epi32(3, 1, 2, 0); - - v_hival_d = _mm_srli_si128(v_loval_d, 8); - v_hiidx_d = _mm_srli_si128(v_loidx_d, 8); - - // Compare if v_sad_d[1] < v_sad_d[0], v_sad_d[3] < v_sad_d[2] - v_sel_d = _mm_cmplt_epi32(v_hival_d, v_loval_d); - - v_loval_d = _mm_blendv_epi8(v_loval_d, v_hival_d, v_sel_d); - v_loidx_d = _mm_blendv_epi8(v_loidx_d, v_hiidx_d, v_sel_d); - v_hival_d = _mm_srli_si128(v_loval_d, 4); - v_hiidx_d = _mm_srli_si128(v_loidx_d, 4); - - // min(v_sad_d[2], v_sad_d[3]) < min(v_sad_d[0], v_sad_d[1]) - v_sel_d = _mm_cmplt_epi32(v_hival_d, v_loval_d); - - v_loval_d = _mm_blendv_epi8(v_loval_d, v_hival_d, v_sel_d); - v_loidx_d = _mm_blendv_epi8(v_loidx_d, v_hiidx_d, v_sel_d); - - local_best_sad = _mm_extract_epi32(v_loval_d, 0); - local_best_idx = _mm_extract_epi32(v_loidx_d, 0); - } - - // Update the global minimum if the local minimum is smaller - if (LIKELY(local_best_sad < best_sad)) { -#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(__clang__) -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#endif - new_bmv = ((const int_mv *)&v_these_mv_w)[local_best_idx]; -#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(__clang__) -#pragma GCC diagnostic pop -#endif - new_best_address = ((const uint8_t **)v_blocka)[local_best_idx]; - - best_sad = local_best_sad; - } - } - } - - bmv = new_bmv; - best_address = new_best_address; - - v_bmv_w = _mm_set1_epi32((int)bmv.as_int); -#if VPX_ARCH_X86_64 - v_ba_q = _mm_set1_epi64x((intptr_t)best_address); -#else - v_ba_d = _mm_set1_epi32((intptr_t)best_address); -#endif - - if (UNLIKELY(best_address == in_what)) { - (*num00)++; - } - } - - *best_mv = bmv.as_mv; - return best_sad; -} diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk index 3e5c0e1a7f..dd9475bcb0 100644 --- a/vp9/vp9cx.mk +++ b/vp9/vp9cx.mk @@ -117,7 +117,6 @@ VP9_CX_SRCS-$(HAVE_NEON) += encoder/vp9_temporal_filter_constants.h VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.c VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_quantize_ssse3.c VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_quantize_avx2.c -VP9_CX_SRCS-$(HAVE_AVX) += encoder/x86/vp9_diamond_search_sad_avx.c VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_diamond_search_sad_neon.c ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_block_error_intrin_sse2.c From 19f3a754d62dcd21e400a3c715f2ed4235d1c4ec Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Mon, 26 Jun 2023 14:57:53 -0700 Subject: [PATCH 0824/1000] Fix a bug in vpx_hadamard_32x32_neon() A right shift by 2 is equivalent to two halving operations if there is no no addition or subtraction between the two halving operations. Note: Since vhaddq_s16() and vhsubq_s16() have 17-bit intermediate precision, the Neon code doesn't need to go to int32_t as was done in https://chromium-review.googlesource.com/c/webm/libvpx/+/4604169. Change-Id: Ibe0691cde0fd3b94ee7c497845ba459d30d503b0 --- vpx_dsp/arm/hadamard_neon.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/vpx_dsp/arm/hadamard_neon.c b/vpx_dsp/arm/hadamard_neon.c index f6b6d7e3ce..f5a044be4d 100644 --- a/vpx_dsp/arm/hadamard_neon.c +++ b/vpx_dsp/arm/hadamard_neon.c @@ -138,15 +138,15 @@ void vpx_hadamard_32x32_neon(const int16_t *src_diff, ptrdiff_t src_stride, const int16x8_t a2 = load_tran_low_to_s16q(coeff + 512); const int16x8_t a3 = load_tran_low_to_s16q(coeff + 768); - const int16x8_t b0 = vhaddq_s16(a0, a1); - const int16x8_t b1 = vhsubq_s16(a0, a1); - const int16x8_t b2 = vhaddq_s16(a2, a3); - const int16x8_t b3 = vhsubq_s16(a2, a3); + const int16x8_t b0 = vshrq_n_s16(vhaddq_s16(a0, a1), 1); + const int16x8_t b1 = vshrq_n_s16(vhsubq_s16(a0, a1), 1); + const int16x8_t b2 = vshrq_n_s16(vhaddq_s16(a2, a3), 1); + const int16x8_t b3 = vshrq_n_s16(vhsubq_s16(a2, a3), 1); - const int16x8_t c0 = vhaddq_s16(b0, b2); - const int16x8_t c1 = vhaddq_s16(b1, b3); - const int16x8_t c2 = vhsubq_s16(b0, b2); - const int16x8_t c3 = vhsubq_s16(b1, b3); + const int16x8_t c0 = vaddq_s16(b0, b2); + const int16x8_t c1 = vaddq_s16(b1, b3); + const int16x8_t c2 = vsubq_s16(b0, b2); + const int16x8_t c3 = vsubq_s16(b1, b3); store_s16q_to_tran_low(coeff + 0, c0); store_s16q_to_tran_low(coeff + 256, c1); From 885ecc7c667eac3521d4558b2be554d96c95da41 Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 23 Jun 2023 19:27:26 -0700 Subject: [PATCH 0825/1000] vp9_dx_iface: fix leaks on init_decoder() failure If any allocations fail in init_decoder() and the application continues to call vpx_codec_decode() some of the allocations would be orphaned or the decoder would be left in a partially initialized state. Found with vpx_dec_fuzzer_vp9 & Nallocfuzz (https://github.com/catenacyber/nallocfuzz). Bug: webm:1807 Change-Id: I44f662526d715ecaeac6180070af40672cd42611 --- vp9/vp9_dx_iface.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c index 20e71cc227..a242c776cd 100644 --- a/vp9/vp9_dx_iface.c +++ b/vp9/vp9_dx_iface.c @@ -256,6 +256,7 @@ static void set_ppflags(const vpx_codec_alg_priv_t *ctx, vp9_ppflags_t *flags) { } while (0) static vpx_codec_err_t init_decoder(vpx_codec_alg_priv_t *ctx) { + vpx_codec_err_t res; ctx->last_show_frame = -1; ctx->need_resync = 1; ctx->flushed = 0; @@ -265,6 +266,8 @@ static vpx_codec_err_t init_decoder(vpx_codec_alg_priv_t *ctx) { ctx->pbi = vp9_decoder_create(ctx->buffer_pool); if (ctx->pbi == NULL) { + vpx_free(ctx->buffer_pool); + ctx->buffer_pool = NULL; set_error_detail(ctx, "Failed to allocate decoder"); return VPX_CODEC_MEM_ERROR; } @@ -282,7 +285,14 @@ static vpx_codec_err_t init_decoder(vpx_codec_alg_priv_t *ctx) { if (!ctx->postproc_cfg_set && (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC)) set_default_ppflags(&ctx->postproc_cfg); - return init_buffer_callbacks(ctx); + res = init_buffer_callbacks(ctx); + if (res != VPX_CODEC_OK) { + vpx_free(ctx->buffer_pool); + ctx->buffer_pool = NULL; + vp9_decoder_remove(ctx->pbi); + ctx->pbi = NULL; + } + return res; } static INLINE void check_resync(vpx_codec_alg_priv_t *const ctx, From 02ab555e992c191e5c509ed87b3cc48ed915b447 Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 26 Jun 2023 19:06:51 -0700 Subject: [PATCH 0826/1000] vp9_alloccommon: clear allocation sizes on free This fixes reallocations (and avoids potential crashes) if any allocations fails and the application continues to call vpx_codec_decode(). Found with vpx_dec_fuzzer_vp9 & Nallocfuzz (https://github.com/catenacyber/nallocfuzz). Bug: webm:1807 Change-Id: If5dc96b73c02efc94ec84c25eb50d10ad6b645a6 --- vp9/common/vp9_alloccommon.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vp9/common/vp9_alloccommon.c b/vp9/common/vp9_alloccommon.c index faad657a08..e53883f621 100644 --- a/vp9/common/vp9_alloccommon.c +++ b/vp9/common/vp9_alloccommon.c @@ -65,6 +65,7 @@ static void free_seg_map(VP9_COMMON *cm) { vpx_free(cm->seg_map_array[i]); cm->seg_map_array[i] = NULL; } + cm->seg_map_alloc_size = 0; cm->current_frame_seg_map = NULL; cm->last_frame_seg_map = NULL; @@ -108,6 +109,7 @@ void vp9_free_context_buffers(VP9_COMMON *cm) { cm->above_context = NULL; vpx_free(cm->above_seg_context); cm->above_seg_context = NULL; + cm->above_context_alloc_cols = 0; vpx_free(cm->lf.lfm); cm->lf.lfm = NULL; } From a31e818ef8ae3b1b791187bca4fbca1a5c191736 Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 26 Jun 2023 19:09:24 -0700 Subject: [PATCH 0827/1000] vp9_decodeframe,init_mt: free tile_workers on alloc failure This avoids a crash if any of the thread allocations fail and the application continues to call vpx_codec_decode(). Previously num_tile_workers would be non-zero, but not equal to num_threads, which would cause a crash during later thread management. Found with vpx_dec_fuzzer_vp9 & Nallocfuzz (https://github.com/catenacyber/nallocfuzz). Bug: webm:1807 Change-Id: Ie3faf7b36764aebedac0924acb6e4cb7545aec7d --- vp9/decoder/vp9_decodeframe.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c index 6ec1d9f668..c5892156f4 100644 --- a/vp9/decoder/vp9_decodeframe.c +++ b/vp9/decoder/vp9_decodeframe.c @@ -2293,6 +2293,11 @@ static INLINE void init_mt(VP9Decoder *pbi) { winterface->init(worker); if (n < num_threads - 1 && !winterface->reset(worker)) { + do { + winterface->end(&pbi->tile_workers[pbi->num_tile_workers - 1]); + } while (--pbi->num_tile_workers != 0); + vpx_free(pbi->tile_workers); + pbi->tile_workers = NULL; vpx_internal_error(&cm->error, VPX_CODEC_ERROR, "Tile decoder thread creation failed"); } From 263ddc9e384fc747714210df1866b1200227dee1 Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 26 Jun 2023 19:18:55 -0700 Subject: [PATCH 0828/1000] vp8_decode: clear stream info on decoder create failure This fixes a crash if the application continues to call vpx_codec_decode(). Found with vpx_dec_fuzzer_vp8 & Nallocfuzz (https://github.com/catenacyber/nallocfuzz). Bug: webm:1807 Change-Id: I9867f5fc3d1163026f521a9609d3cbbc00568d1d --- vp8/vp8_dx_iface.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/vp8/vp8_dx_iface.c b/vp8/vp8_dx_iface.c index fdc0b35dd4..8f73d9f83f 100644 --- a/vp8/vp8_dx_iface.c +++ b/vp8/vp8_dx_iface.c @@ -350,7 +350,14 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx, } res = vp8_create_decoder_instances(&ctx->yv12_frame_buffers, &oxcf); - if (res == VPX_CODEC_OK) ctx->decoder_init = 1; + if (res == VPX_CODEC_OK) { + ctx->decoder_init = 1; + } else { + /* on failure clear the cached resolution to ensure a full + * reallocation is attempted on resync. */ + ctx->si.w = 0; + ctx->si.h = 0; + } } /* Set these even if already initialized. The caller may have changed the From a166c52d3a2e72d0fe4dbc8909523c6ad8fbdfb2 Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 26 Jun 2023 19:22:00 -0700 Subject: [PATCH 0829/1000] vp8_decode: only remove threads on thread create failure This fixes a crash if the application continues to call vpx_codec_decode(). Previously the decoder instance would be freed, causing a crash when attempting to access it with restart_threads=1. Found with vpx_dec_fuzzer_vp8 & Nallocfuzz (https://github.com/catenacyber/nallocfuzz). Bug: webm:1807 Change-Id: Ic084894b776729bb1572f747082cef002f0832a8 --- vp8/vp8_dx_iface.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/vp8/vp8_dx_iface.c b/vp8/vp8_dx_iface.c index 8f73d9f83f..ed53a8625b 100644 --- a/vp8/vp8_dx_iface.c +++ b/vp8/vp8_dx_iface.c @@ -306,13 +306,11 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx, #if CONFIG_MULTITHREAD if (!res && ctx->restart_threads) { - struct frame_buffers *fb = &ctx->yv12_frame_buffers; VP8D_COMP *pbi = ctx->yv12_frame_buffers.pbi[0]; VP8_COMMON *const pc = &pbi->common; if (setjmp(pbi->common.error.jmp)) { pbi->common.error.setjmp = 0; - vp8_remove_decoder_instances(fb); - vp8_zero(fb->pbi); + vp8_decoder_remove_threads(pbi); vpx_clear_system_state(); return VPX_CODEC_ERROR; } From 44a5eaa3ba890849fa7db14b8b8cfb1bac876c80 Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 26 Jun 2023 19:25:56 -0700 Subject: [PATCH 0830/1000] vp8_decode: fix keyframe resync after decode error This fixes a crash if the application continues to call vpx_codec_decode(). Previously a non-keyframe could cause a crash if the decoder failed before fully initializing due to an allocation failure. The stream info and frame resolution would be 0, skipping an allocation. Found with vpx_dec_fuzzer_vp8 & Nallocfuzz (https://github.com/catenacyber/nallocfuzz). Bug: webm:1807 Change-Id: I1c17302f4d3a488ba3b4eefe0bf53853dc558bc1 --- vp8/vp8_dx_iface.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/vp8/vp8_dx_iface.c b/vp8/vp8_dx_iface.c index ed53a8625b..9e622e3b97 100644 --- a/vp8/vp8_dx_iface.c +++ b/vp8/vp8_dx_iface.c @@ -162,7 +162,10 @@ static vpx_codec_err_t vp8_peek_si_internal(const uint8_t *data, si->h = (clear[8] | (clear[9] << 8)) & 0x3fff; /*printf("w=%d, h=%d\n", si->w, si->h);*/ - if (!(si->h && si->w)) res = VPX_CODEC_CORRUPT_FRAME; + if (!(si->h && si->w)) { + si->w = si->h = 0; + res = VPX_CODEC_CORRUPT_FRAME; + } } else { res = VPX_CODEC_UNSUP_BITSTREAM; } @@ -301,6 +304,16 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx, } if (!ctx->decoder_init && !ctx->si.is_kf) res = VPX_CODEC_UNSUP_BITSTREAM; + if (!res && ctx->decoder_init && w == 0 && h == 0 && ctx->si.h == 0 && + ctx->si.w == 0) { + VP8D_COMP *pbi = ctx->yv12_frame_buffers.pbi[0]; + assert(pbi != NULL); + assert(!pbi->common.error.setjmp); + res = VPX_CODEC_CORRUPT_FRAME; + vpx_internal_error(&pbi->common.error, res, + "Keyframe / intra-only frame required to reset decoder" + " state"); + } if ((ctx->si.h != h) || (ctx->si.w != w)) resolution_change = 1; From 3bd65ac7769f68e9319ad1ab5fe7c664121d373b Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Wed, 28 Jun 2023 10:04:21 -0400 Subject: [PATCH 0831/1000] vp9 firstpass stats in a separate header Change-Id: If91c5c74c71affc48eb858beb314a6c194b14023 --- vp9/encoder/vp9_firstpass.h | 31 +----------------- vp9/encoder/vp9_firstpass_stats.h | 54 +++++++++++++++++++++++++++++++ vp9/vp9cx.mk | 1 + 3 files changed, 56 insertions(+), 30 deletions(-) create mode 100644 vp9/encoder/vp9_firstpass_stats.h diff --git a/vp9/encoder/vp9_firstpass.h b/vp9/encoder/vp9_firstpass.h index 3ba336b34f..a19b04db74 100644 --- a/vp9/encoder/vp9_firstpass.h +++ b/vp9/encoder/vp9_firstpass.h @@ -14,6 +14,7 @@ #include #include "vp9/common/vp9_onyxc_int.h" +#include "vp9/encoder/vp9_firstpass_stats.h" #include "vp9/encoder/vp9_lookahead.h" #include "vp9/encoder/vp9_ratectrl.h" @@ -58,36 +59,6 @@ typedef struct { int new_mv_count; } FIRSTPASS_DATA; -typedef struct { - double frame; - double weight; - double intra_error; - double coded_error; - double sr_coded_error; - double frame_noise_energy; - double pcnt_inter; - double pcnt_motion; - double pcnt_second_ref; - double pcnt_neutral; - double pcnt_intra_low; // Coded intra but low variance - double pcnt_intra_high; // Coded intra high variance - double intra_skip_pct; - double intra_smooth_pct; // % of blocks that are smooth - double inactive_zone_rows; // Image mask rows top and bottom. - double inactive_zone_cols; // Image mask columns at left and right edges. - double MVr; - double mvr_abs; - double MVc; - double mvc_abs; - double MVrv; - double MVcv; - double mv_in_out_count; - double duration; - double count; - double new_mv_count; - int64_t spatial_layer_id; -} FIRSTPASS_STATS; - typedef enum { KF_UPDATE = 0, LF_UPDATE = 1, diff --git a/vp9/encoder/vp9_firstpass_stats.h b/vp9/encoder/vp9_firstpass_stats.h new file mode 100644 index 0000000000..01928e7816 --- /dev/null +++ b/vp9/encoder/vp9_firstpass_stats.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_ENCODER_VP9_FIRSTPASS_STATS_H_ +#define VPX_VP9_ENCODER_VP9_FIRSTPASS_STATS_H_ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct { + double frame; + double weight; + double intra_error; + double coded_error; + double sr_coded_error; + double frame_noise_energy; + double pcnt_inter; + double pcnt_motion; + double pcnt_second_ref; + double pcnt_neutral; + double pcnt_intra_low; // Coded intra but low variance + double pcnt_intra_high; // Coded intra high variance + double intra_skip_pct; + double intra_smooth_pct; // % of blocks that are smooth + double inactive_zone_rows; // Image mask rows top and bottom. + double inactive_zone_cols; // Image mask columns at left and right edges. + double MVr; + double mvr_abs; + double MVc; + double mvc_abs; + double MVrv; + double MVcv; + double mv_in_out_count; + double duration; + double count; + double new_mv_count; + int64_t spatial_layer_id; +} FIRSTPASS_STATS; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP9_ENCODER_VP9_FIRSTPASS_STATS_H_ diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk index dd9475bcb0..44790ef6a4 100644 --- a/vp9/vp9cx.mk +++ b/vp9/vp9cx.mk @@ -40,6 +40,7 @@ VP9_CX_SRCS-yes += encoder/vp9_encodemb.h VP9_CX_SRCS-yes += encoder/vp9_encodemv.h VP9_CX_SRCS-yes += encoder/vp9_extend.h VP9_CX_SRCS-yes += encoder/vp9_firstpass.h +VP9_CX_SRCS-yes += encoder/vp9_firstpass_stats.h VP9_CX_SRCS-yes += encoder/vp9_frame_scale.c VP9_CX_SRCS-yes += encoder/vp9_job_queue.h VP9_CX_SRCS-yes += encoder/vp9_lookahead.c From 3ecba398023dea731520cc1489159bfd0ad0a200 Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 2 Jun 2023 18:49:00 -0700 Subject: [PATCH 0832/1000] Fix Clang -Wunreachable-code-aggressive warnings Based on the change in libaom: fe36011455 Fix Clang -Wunreachable-code-aggressive warnings Clang's -Wunreachable-code-aggressive flag enables several warning flags such as -Wunreachable-code-break and -Wunreachable-code-return. Chrome's build system enables -Wunreachable-code-aggressive (in build/config/compiler/BUILD.gn), so it would be good if libvpx could be compiled without -Wunreachable-code-aggressive warnings. This requires the VPX_NO_RETURN macro be defined correctly for all the compilers we support, otherwise some compilers may warn about missing return statements after a die() or fatal() call (which does not return). Change-Id: I0c069133af45a7a61759538b6d74c681ea087dcd --- args.c | 3 --- configure | 2 +- examples/vp9_spatial_svc_encoder.c | 1 - test/partial_idct_test.cc | 2 +- test/vp9_ratectrl_rtc_test.cc | 2 +- tools_common.c | 10 +++++----- vp8/decoder/decodeframe.c | 2 +- vp8/encoder/firstpass.c | 10 +++++----- vp9/encoder/vp9_ext_ratectrl.c | 1 - vp9/encoder/vp9_firstpass.c | 5 +---- vp9/encoder/vp9_mbgraph.c | 25 +++++++++---------------- vp9/simple_encode.cc | 1 - vpxdec.c | 2 +- 13 files changed, 25 insertions(+), 41 deletions(-) diff --git a/args.c b/args.c index 4afb9c021a..0a9631e1f4 100644 --- a/args.c +++ b/args.c @@ -135,7 +135,6 @@ unsigned int arg_parse_uint(const struct arg *arg) { } die("Option %s: Invalid character '%c'\n", arg->name, *endptr); - return 0; } int arg_parse_int(const struct arg *arg) { @@ -152,7 +151,6 @@ int arg_parse_int(const struct arg *arg) { } die("Option %s: Invalid character '%c'\n", arg->name, *endptr); - return 0; } struct vpx_rational { @@ -209,7 +207,6 @@ int arg_parse_enum(const struct arg *arg) { if (!strcmp(arg->val, listptr->name)) return listptr->val; die("Option %s: Invalid value '%s'\n", arg->name, arg->val); - return 0; } int arg_parse_enum_or_int(const struct arg *arg) { diff --git a/configure b/configure index b73436b47e..67bba946f6 100755 --- a/configure +++ b/configure @@ -651,7 +651,7 @@ process_toolchain() { check_add_cflags -Wmissing-prototypes check_add_cflags -Wshadow check_add_cflags -Wuninitialized - check_add_cflags -Wunreachable-code-loop-increment + check_add_cflags -Wunreachable-code-aggressive check_add_cflags -Wunused check_add_cflags -Wextra # check_add_cflags also adds to cxxflags. gtest does not do well with diff --git a/examples/vp9_spatial_svc_encoder.c b/examples/vp9_spatial_svc_encoder.c index 9d37ed0244..998e4fb20d 100644 --- a/examples/vp9_spatial_svc_encoder.c +++ b/examples/vp9_spatial_svc_encoder.c @@ -316,7 +316,6 @@ static void parse_command_line(int argc, const char **argv_, break; default: die("Error: Invalid bit depth selected (%d)\n", enc_cfg->g_bit_depth); - break; } #endif // CONFIG_VP9_HIGHBITDEPTH } else if (arg_match(&arg, &dropframe_thresh_arg, argi)) { diff --git a/test/partial_idct_test.cc b/test/partial_idct_test.cc index 7eb888a586..b7c0c050af 100644 --- a/test/partial_idct_test.cc +++ b/test/partial_idct_test.cc @@ -76,7 +76,7 @@ class PartialIDctTest : public ::testing::TestWithParam { case TX_8X8: size_ = 8; break; case TX_16X16: size_ = 16; break; case TX_32X32: size_ = 32; break; - default: FAIL() << "Wrong Size!"; break; + default: FAIL() << "Wrong Size!"; } // Randomize stride_ to a value less than or equal to 1024 diff --git a/test/vp9_ratectrl_rtc_test.cc b/test/vp9_ratectrl_rtc_test.cc index 5abda1290a..8422df074b 100644 --- a/test/vp9_ratectrl_rtc_test.cc +++ b/test/vp9_ratectrl_rtc_test.cc @@ -227,7 +227,7 @@ class RcInterfaceSvcTest rc_cfg_.layer_target_bitrate[4] = 0; rc_cfg_.layer_target_bitrate[5] = 0; ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_)); - } else if (0 && video->frame() == 280) { + } else if (/*DISABLES CODE*/ (0) && video->frame() == 280) { // TODO(marpan): Re-enable this going back up when issue is fixed. // Go back up to 3 spatial layers. // Update the encoder config: use the original bitrates. diff --git a/tools_common.c b/tools_common.c index cbecfbb419..0de15558dd 100644 --- a/tools_common.c +++ b/tools_common.c @@ -375,7 +375,7 @@ static void highbd_img_upshift(vpx_image_t *dst, vpx_image_t *src, case VPX_IMG_FMT_I42216: case VPX_IMG_FMT_I44416: case VPX_IMG_FMT_I44016: break; - default: fatal("Unsupported image conversion"); break; + default: fatal("Unsupported image conversion"); } for (plane = 0; plane < 3; plane++) { int w = src->d_w; @@ -411,7 +411,7 @@ static void lowbd_img_upshift(vpx_image_t *dst, vpx_image_t *src, case VPX_IMG_FMT_I422: case VPX_IMG_FMT_I444: case VPX_IMG_FMT_I440: break; - default: fatal("Unsupported image conversion"); break; + default: fatal("Unsupported image conversion"); } for (plane = 0; plane < 3; plane++) { int w = src->d_w; @@ -452,7 +452,7 @@ void vpx_img_truncate_16_to_8(vpx_image_t *dst, vpx_image_t *src) { case VPX_IMG_FMT_I422: case VPX_IMG_FMT_I444: case VPX_IMG_FMT_I440: break; - default: fatal("Unsupported image conversion"); break; + default: fatal("Unsupported image conversion"); } for (plane = 0; plane < 3; plane++) { int w = src->d_w; @@ -487,7 +487,7 @@ static void highbd_img_downshift(vpx_image_t *dst, vpx_image_t *src, case VPX_IMG_FMT_I42216: case VPX_IMG_FMT_I44416: case VPX_IMG_FMT_I44016: break; - default: fatal("Unsupported image conversion"); break; + default: fatal("Unsupported image conversion"); } for (plane = 0; plane < 3; plane++) { int w = src->d_w; @@ -521,7 +521,7 @@ static void lowbd_img_downshift(vpx_image_t *dst, vpx_image_t *src, case VPX_IMG_FMT_I422: case VPX_IMG_FMT_I444: case VPX_IMG_FMT_I440: break; - default: fatal("Unsupported image conversion"); break; + default: fatal("Unsupported image conversion"); } for (plane = 0; plane < 3; plane++) { int w = src->d_w; diff --git a/vp8/decoder/decodeframe.c b/vp8/decoder/decodeframe.c index d014cf9667..729cd9980f 100644 --- a/vp8/decoder/decodeframe.c +++ b/vp8/decoder/decodeframe.c @@ -1167,7 +1167,7 @@ int vp8_decode_frame(VP8D_COMP *pbi) { if (pbi->ec_active && xd->corrupted) pc->refresh_last_frame = 1; #endif - if (0) { + if (/*DISABLES CODE*/ (0)) { FILE *z = fopen("decodestats.stt", "a"); fprintf(z, "%6d F:%d,G:%d,A:%d,L:%d,Q:%d\n", pc->current_video_frame, pc->frame_type, pc->refresh_golden_frame, pc->refresh_alt_ref_frame, diff --git a/vp8/encoder/firstpass.c b/vp8/encoder/firstpass.c index ff088aa969..5f372912fe 100644 --- a/vp8/encoder/firstpass.c +++ b/vp8/encoder/firstpass.c @@ -822,7 +822,7 @@ void vp8_first_pass(VP8_COMP *cpi) { } /* use this to see what the first pass reconstruction looks like */ - if (0) { + if (/*DISABLES CODE*/ (0)) { char filename[512]; FILE *recon_file; sprintf(filename, "enc%04d.yuv", (int)cm->current_video_frame); @@ -1038,7 +1038,7 @@ static int estimate_cq(VP8_COMP *cpi, FIRSTPASS_STATS *fpstats, double clip_iifactor; int overhead_bits_per_mb; - if (0) { + if (/*DISABLES CODE*/ (0)) { FILE *f = fopen("epmp.stt", "a"); fprintf(f, "%10.2f\n", err_per_mb); fclose(f); @@ -1230,7 +1230,7 @@ static int estimate_kf_group_q(VP8_COMP *cpi, double section_err, Q++; } - if (0) { + if (/*DISABLES CODE*/ (0)) { FILE *f = fopen("estkf_q.stt", "a"); fprintf(f, "%8d %8d %8d %8.2f %8.3f %8.2f %8.3f %8.3f %8.3f %8d\n", cpi->common.current_video_frame, bits_per_mb_at_this_q, @@ -3047,7 +3047,7 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { (int)((projected_bits_perframe - av_bits_per_frame) * cpi->twopass.frames_to_key)); - if (0) { + if (/*DISABLES CODE*/ (0)) { FILE *f = fopen("Subsamle.stt", "a"); fprintf(f, " %8d %8d %8d %8d %12.0f %8d %8d %8d\n", cpi->common.current_video_frame, kf_q, cpi->common.horiz_scale, @@ -3121,7 +3121,7 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { kf_q = estimate_kf_group_q(cpi, err_per_frame * effective_size_ratio, (int)bits_per_frame, group_iiratio); - if (0) { + if (/*DISABLES CODE*/ (0)) { FILE *f = fopen("Subsamle.stt", "a"); fprintf( f, "******** %8d %8d %8d %12.0f %8d %8d %8d\n", kf_q, diff --git a/vp9/encoder/vp9_ext_ratectrl.c b/vp9/encoder/vp9_ext_ratectrl.c index 1d440442b5..b08fd63c3c 100644 --- a/vp9/encoder/vp9_ext_ratectrl.c +++ b/vp9/encoder/vp9_ext_ratectrl.c @@ -131,7 +131,6 @@ static int extrc_get_frame_type(FRAME_UPDATE_TYPE update_type) { default: fprintf(stderr, "Unsupported update_type %d\n", update_type); abort(); - return 1; } } diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index 42e935740a..de7a3829ab 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -364,7 +364,6 @@ static vpx_variance_fn_t highbd_get_block_variance_fn(BLOCK_SIZE bsize, case BLOCK_8X16: return vpx_highbd_8_mse8x16; default: return vpx_highbd_8_mse16x16; } - break; case 10: switch (bsize) { case BLOCK_8X8: return vpx_highbd_10_mse8x8; @@ -372,7 +371,6 @@ static vpx_variance_fn_t highbd_get_block_variance_fn(BLOCK_SIZE bsize, case BLOCK_8X16: return vpx_highbd_10_mse8x16; default: return vpx_highbd_10_mse16x16; } - break; case 12: switch (bsize) { case BLOCK_8X8: return vpx_highbd_12_mse8x8; @@ -380,7 +378,6 @@ static vpx_variance_fn_t highbd_get_block_variance_fn(BLOCK_SIZE bsize, case BLOCK_8X16: return vpx_highbd_12_mse8x16; default: return vpx_highbd_12_mse16x16; } - break; } } @@ -1508,7 +1505,7 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { } // Use this to see what the first pass reconstruction looks like. - if (0) { + if (/*DISABLES CODE*/ (0)) { char filename[512]; FILE *recon_file; snprintf(filename, sizeof(filename), "enc%04d.yuv", diff --git a/vp9/encoder/vp9_mbgraph.c b/vp9/encoder/vp9_mbgraph.c index 8b6521d915..2f20a8fe6d 100644 --- a/vp9/encoder/vp9_mbgraph.c +++ b/vp9/encoder/vp9_mbgraph.c @@ -333,23 +333,16 @@ static void separate_arf_mbs(VP9_COMP *cpi) { } } - // Only bother with segmentation if over 10% of the MBs in static segment - // if ( ncnt[1] && (ncnt[0] / ncnt[1] < 10) ) - if (1) { - // Note % of blocks that are marked as static - if (cm->MBs) - cpi->static_mb_pct = (ncnt[1] * 100) / (cm->mi_rows * cm->mi_cols); - - // This error case should not be reachable as this function should - // never be called with the common data structure uninitialized. - else - cpi->static_mb_pct = 0; - - vp9_enable_segmentation(&cm->seg); - } else { + // Note % of blocks that are marked as static + if (cm->MBs) + cpi->static_mb_pct = (ncnt[1] * 100) / (cm->mi_rows * cm->mi_cols); + + // This error case should not be reachable as this function should + // never be called with the common data structure uninitialized. + else cpi->static_mb_pct = 0; - vp9_disable_segmentation(&cm->seg); - } + + vp9_enable_segmentation(&cm->seg); // Free localy allocated storage vpx_free(arf_not_zz); diff --git a/vp9/simple_encode.cc b/vp9/simple_encode.cc index 2e2a3746e7..2e6f9a4513 100644 --- a/vp9/simple_encode.cc +++ b/vp9/simple_encode.cc @@ -143,7 +143,6 @@ get_frame_type_from_update_type(FRAME_UPDATE_TYPE update_type) { default: fprintf(stderr, "Unsupported update_type %d\n", update_type); abort(); - return kFrameTypeInter; } } diff --git a/vpxdec.c b/vpxdec.c index 54a41f0799..bfe6c1d6ba 100644 --- a/vpxdec.c +++ b/vpxdec.c @@ -446,7 +446,7 @@ static void generate_filename(const char *pattern, char *out, size_t q_len, case '7': snprintf(q, q_len - 1, "%07d", frame_in); break; case '8': snprintf(q, q_len - 1, "%08d", frame_in); break; case '9': snprintf(q, q_len - 1, "%09d", frame_in); break; - default: die("Unrecognized pattern %%%c\n", p[1]); break; + default: die("Unrecognized pattern %%%c\n", p[1]); } pat_len = strlen(q); From dc26707f80686031905e5f013daf37062e61f6d2 Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 28 Jun 2023 12:26:32 -0700 Subject: [PATCH 0833/1000] delete some dead code follow-up to: 3ecba3980 Fix Clang -Wunreachable-code-aggressive warnings Change-Id: I364312987bc838c69c010cce024bd3d62a918417 --- vp8/decoder/decodeframe.c | 8 ------ vp8/encoder/firstpass.c | 54 ------------------------------------- vp9/encoder/vp9_firstpass.c | 16 ----------- 3 files changed, 78 deletions(-) diff --git a/vp8/decoder/decodeframe.c b/vp8/decoder/decodeframe.c index 729cd9980f..af9a98c1de 100644 --- a/vp8/decoder/decodeframe.c +++ b/vp8/decoder/decodeframe.c @@ -1167,14 +1167,6 @@ int vp8_decode_frame(VP8D_COMP *pbi) { if (pbi->ec_active && xd->corrupted) pc->refresh_last_frame = 1; #endif - if (/*DISABLES CODE*/ (0)) { - FILE *z = fopen("decodestats.stt", "a"); - fprintf(z, "%6d F:%d,G:%d,A:%d,L:%d,Q:%d\n", pc->current_video_frame, - pc->frame_type, pc->refresh_golden_frame, pc->refresh_alt_ref_frame, - pc->refresh_last_frame, pc->base_qindex); - fclose(z); - } - { pbi->independent_partitions = 1; diff --git a/vp8/encoder/firstpass.c b/vp8/encoder/firstpass.c index 5f372912fe..4443f5e7cd 100644 --- a/vp8/encoder/firstpass.c +++ b/vp8/encoder/firstpass.c @@ -821,22 +821,6 @@ void vp8_first_pass(VP8_COMP *cpi) { vp8_yv12_copy_frame(lst_yv12, gld_yv12); } - /* use this to see what the first pass reconstruction looks like */ - if (/*DISABLES CODE*/ (0)) { - char filename[512]; - FILE *recon_file; - sprintf(filename, "enc%04d.yuv", (int)cm->current_video_frame); - - if (cm->current_video_frame == 0) { - recon_file = fopen(filename, "wb"); - } else { - recon_file = fopen(filename, "ab"); - } - - (void)fwrite(lst_yv12->buffer_alloc, lst_yv12->frame_size, 1, recon_file); - fclose(recon_file); - } - cm->current_video_frame++; } extern const int vp8_bits_per_mb[2][QINDEX_RANGE]; @@ -1038,12 +1022,6 @@ static int estimate_cq(VP8_COMP *cpi, FIRSTPASS_STATS *fpstats, double clip_iifactor; int overhead_bits_per_mb; - if (/*DISABLES CODE*/ (0)) { - FILE *f = fopen("epmp.stt", "a"); - fprintf(f, "%10.2f\n", err_per_mb); - fclose(f); - } - target_norm_bits_per_mb = (section_target_bandwitdh < (1 << 20)) ? (512 * section_target_bandwitdh) / num_mbs : 512 * (section_target_bandwitdh / num_mbs); @@ -1230,17 +1208,6 @@ static int estimate_kf_group_q(VP8_COMP *cpi, double section_err, Q++; } - if (/*DISABLES CODE*/ (0)) { - FILE *f = fopen("estkf_q.stt", "a"); - fprintf(f, "%8d %8d %8d %8.2f %8.3f %8.2f %8.3f %8.3f %8.3f %8d\n", - cpi->common.current_video_frame, bits_per_mb_at_this_q, - target_norm_bits_per_mb, err_per_mb, err_correction_factor, - current_spend_ratio, group_iiratio, iiratio_correction_factor, - (double)cpi->buffer_level / (double)cpi->oxcf.optimal_buffer_level, - Q); - fclose(f); - } - return Q; } @@ -3047,16 +3014,6 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { (int)((projected_bits_perframe - av_bits_per_frame) * cpi->twopass.frames_to_key)); - if (/*DISABLES CODE*/ (0)) { - FILE *f = fopen("Subsamle.stt", "a"); - fprintf(f, " %8d %8d %8d %8d %12.0f %8d %8d %8d\n", - cpi->common.current_video_frame, kf_q, cpi->common.horiz_scale, - cpi->common.vert_scale, kf_group_err / cpi->twopass.frames_to_key, - (int)(cpi->twopass.kf_group_bits / cpi->twopass.frames_to_key), - new_height, new_width); - fclose(f); - } - /* The trigger for spatial resampling depends on the various * parameters such as whether we are streaming (CBR) or VBR. */ @@ -3120,17 +3077,6 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { */ kf_q = estimate_kf_group_q(cpi, err_per_frame * effective_size_ratio, (int)bits_per_frame, group_iiratio); - - if (/*DISABLES CODE*/ (0)) { - FILE *f = fopen("Subsamle.stt", "a"); - fprintf( - f, "******** %8d %8d %8d %12.0f %8d %8d %8d\n", kf_q, - cpi->common.horiz_scale, cpi->common.vert_scale, - kf_group_err / cpi->twopass.frames_to_key, - (int)(cpi->twopass.kf_group_bits / cpi->twopass.frames_to_key), - new_height, new_width); - fclose(f); - } } } diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index de7a3829ab..bd203f1e21 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -1504,22 +1504,6 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { cm->ref_frame_map[cpi->lst_fb_idx]); } - // Use this to see what the first pass reconstruction looks like. - if (/*DISABLES CODE*/ (0)) { - char filename[512]; - FILE *recon_file; - snprintf(filename, sizeof(filename), "enc%04d.yuv", - (int)cm->current_video_frame); - - if (cm->current_video_frame == 0) - recon_file = fopen(filename, "wb"); - else - recon_file = fopen(filename, "ab"); - - (void)fwrite(lst_yv12->buffer_alloc, lst_yv12->frame_size, 1, recon_file); - fclose(recon_file); - } - // In the first pass, every frame is considered as a show frame. update_frame_indexes(cm, /*show_frame=*/1); if (cpi->use_svc) vp9_inc_frame_in_layer(cpi); From 3ef9934789c0b1de47c72abd7362072ba89c0d8b Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Wed, 28 Jun 2023 16:09:36 -0700 Subject: [PATCH 0834/1000] Fix a bug in vpx_highbd_hadamard_32x32_neon(). This CL is the highbd version of https://chromium-review.googlesource.com/c/webm/libvpx/+/4646573. The bug is caused by the incorrect assumption that (a / 2) + (b / 2) == (a + b) / 2 and (a / 2) - (b / 2) == (a - b) / 2. Also fix the Rand() inputs to Hadamard functions in unit tests. This CL ports the following libaom CLs to libvpx: https://aomedia-review.googlesource.com/c/aom/+/177101 https://aomedia-review.googlesource.com/c/aom/+/177241 Change-Id: Ic20e7684eab5d6507417fa2b75e572064d37ad2c --- test/acm_random.h | 15 +++++---------- test/hadamard_test.cc | 20 ++++++++++++++++++-- vpx_dsp/arm/highbd_hadamard_neon.c | 16 ++++++++-------- 3 files changed, 31 insertions(+), 20 deletions(-) diff --git a/test/acm_random.h b/test/acm_random.h index c7122b9338..e3520c47de 100644 --- a/test/acm_random.h +++ b/test/acm_random.h @@ -45,16 +45,11 @@ class ACMRandom { return static_cast(random_.Generate(65536)); } - int16_t Rand13Signed() { - // Use 13 bits: values between 4095 and -4096. - const uint32_t value = random_.Generate(8192); - return static_cast(value) - 4096; - } - - int16_t Rand9Signed() { - // Use 9 bits: values between 255 (0x0FF) and -256 (0x100). - const uint32_t value = random_.Generate(512); - return static_cast(value) - 256; + uint16_t Rand12() { + const uint32_t value = + random_.Generate(testing::internal::Random::kMaxRange); + // There's a bit more entropy in the upper bits of this implementation. + return (value >> 19) & 0xfff; } uint8_t Rand8() { diff --git a/test/hadamard_test.cc b/test/hadamard_test.cc index 0de6622e20..9ba898b519 100644 --- a/test/hadamard_test.cc +++ b/test/hadamard_test.cc @@ -137,6 +137,12 @@ class HadamardTestBase : public ::testing::TestWithParam { rnd_.Reset(ACMRandom::DeterministicSeed()); } + // The Rand() function generates values in the range [-((1 << BitDepth) - 1), + // (1 << BitDepth) - 1]. This is because the input to the Hadamard transform + // is the residual pixel, which is defined as 'source pixel - predicted + // pixel'. Source pixel and predicted pixel take values in the range + // [0, (1 << BitDepth) - 1] and thus the residual pixel ranges from + // -((1 << BitDepth) - 1) to ((1 << BitDepth) - 1). virtual int16_t Rand() = 0; void ReferenceHadamard(const int16_t *a, int a_stride, tran_low_t *b, @@ -245,7 +251,12 @@ class HadamardTestBase : public ::testing::TestWithParam { class HadamardLowbdTest : public HadamardTestBase { protected: - virtual int16_t Rand() { return rnd_.Rand9Signed(); } + // Use values between -255 (0xFF01) and 255 (0x00FF) + virtual int16_t Rand() { + int16_t src = rnd_.Rand8(); + int16_t pred = rnd_.Rand8(); + return src - pred; + } }; TEST_P(HadamardLowbdTest, CompareReferenceRandom) { CompareReferenceRandom(); } @@ -323,7 +334,12 @@ INSTANTIATE_TEST_SUITE_P( #if CONFIG_VP9_HIGHBITDEPTH class HadamardHighbdTest : public HadamardTestBase { protected: - virtual int16_t Rand() { return rnd_.Rand13Signed(); } + // Use values between -4095 (0xF001) and 4095 (0x0FFF) + virtual int16_t Rand() { + int16_t src = rnd_.Rand12(); + int16_t pred = rnd_.Rand12(); + return src - pred; + } }; TEST_P(HadamardHighbdTest, CompareReferenceRandom) { CompareReferenceRandom(); } diff --git a/vpx_dsp/arm/highbd_hadamard_neon.c b/vpx_dsp/arm/highbd_hadamard_neon.c index 499eb65462..7be88f6bcb 100644 --- a/vpx_dsp/arm/highbd_hadamard_neon.c +++ b/vpx_dsp/arm/highbd_hadamard_neon.c @@ -197,15 +197,15 @@ void vpx_highbd_hadamard_32x32_neon(const int16_t *src_diff, int32x4_t a2 = load_tran_low_to_s32q(coeff + 4 * i + 512); int32x4_t a3 = load_tran_low_to_s32q(coeff + 4 * i + 768); - int32x4_t b0 = vhaddq_s32(a0, a1); - int32x4_t b1 = vhsubq_s32(a0, a1); - int32x4_t b2 = vhaddq_s32(a2, a3); - int32x4_t b3 = vhsubq_s32(a2, a3); + int32x4_t b0 = vshrq_n_s32(vaddq_s32(a0, a1), 2); + int32x4_t b1 = vshrq_n_s32(vsubq_s32(a0, a1), 2); + int32x4_t b2 = vshrq_n_s32(vaddq_s32(a2, a3), 2); + int32x4_t b3 = vshrq_n_s32(vsubq_s32(a2, a3), 2); - int32x4_t c0 = vhaddq_s32(b0, b2); - int32x4_t c1 = vhaddq_s32(b1, b3); - int32x4_t c2 = vhsubq_s32(b0, b2); - int32x4_t c3 = vhsubq_s32(b1, b3); + int32x4_t c0 = vaddq_s32(b0, b2); + int32x4_t c1 = vaddq_s32(b1, b3); + int32x4_t c2 = vsubq_s32(b0, b2); + int32x4_t c3 = vsubq_s32(b1, b3); store_s32q_to_tran_low(coeff + 4 * i, c0); store_s32q_to_tran_low(coeff + 4 * i + 256, c1); From dcb91aa3ddc21586a9bc9d3ee9c270ad1b1a5fe1 Mon Sep 17 00:00:00 2001 From: James Zern Date: Thu, 29 Jun 2023 09:52:26 -0700 Subject: [PATCH 0835/1000] mfqe_partition: fix -Wunreachable-code vp9/common/vp9_mfqe.c|240 col 16| warning: code will never be executed [-Wunreachable-code] BLOCK_SIZE mfqe_bs, bs_tmp; ^~~~~~~ Change-Id: I566b20d8c294e19bc4b90b57b730f933048e71a5 --- vp9/common/vp9_mfqe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vp9/common/vp9_mfqe.c b/vp9/common/vp9_mfqe.c index e76d771b8d..cf60fa40fd 100644 --- a/vp9/common/vp9_mfqe.c +++ b/vp9/common/vp9_mfqe.c @@ -217,6 +217,7 @@ static void mfqe_partition(VP9_COMMON *cm, MODE_INFO *mi, BLOCK_SIZE bs, const int bsl = b_width_log2_lookup[bs]; PARTITION_TYPE partition = partition_lookup[bsl][cur_bs]; const BLOCK_SIZE subsize = get_subsize(bs, partition); + BLOCK_SIZE mfqe_bs, bs_tmp; if (cur_bs < BLOCK_8X8) { // If there are blocks smaller than 8x8, it must be on the boundary. @@ -236,7 +237,6 @@ static void mfqe_partition(VP9_COMMON *cm, MODE_INFO *mi, BLOCK_SIZE bs, uv_offset = 8; } switch (partition) { - BLOCK_SIZE mfqe_bs, bs_tmp; case PARTITION_HORZ: if (bs == BLOCK_64X64) { mfqe_bs = BLOCK_64X32; From f30532a6d9d7f86b1a3431aa0d4dc658d1e3269a Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 10 Jul 2023 10:06:13 -0700 Subject: [PATCH 0836/1000] vpx_free_tpl_gop_stats: normalize param name this fixes a clang-tidy warning Change-Id: I13f4750c15b7d6a395494c8dbcb896bde125b3c4 --- vpx/src/vpx_tpl.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/vpx/src/vpx_tpl.c b/vpx/src/vpx_tpl.c index 9cdb4a0a06..62c2a9c857 100644 --- a/vpx/src/vpx_tpl.c +++ b/vpx/src/vpx_tpl.c @@ -97,11 +97,11 @@ vpx_codec_err_t vpx_read_tpl_gop_stats(FILE *tpl_file, return VPX_CODEC_OK; } -void vpx_free_tpl_gop_stats(VpxTplGopStats *data) { +void vpx_free_tpl_gop_stats(VpxTplGopStats *tpl_gop_stats) { int frame; - if (data == NULL) return; - for (frame = 0; frame < data->size; frame++) { - vpx_free(data->frame_stats_list[frame].block_stats_list); + if (tpl_gop_stats == NULL) return; + for (frame = 0; frame < tpl_gop_stats->size; frame++) { + vpx_free(tpl_gop_stats->frame_stats_list[frame].block_stats_list); } - vpx_free(data->frame_stats_list); + vpx_free(tpl_gop_stats->frame_stats_list); } From e9b9972ca41115b74d26b8a012606c53c837ee95 Mon Sep 17 00:00:00 2001 From: "L. E. Segovia" Date: Sat, 8 Jul 2023 20:30:49 -0300 Subject: [PATCH 0837/1000] vp8: remove missing prototypes from the rtcd header These were removed in If7a49e920e12f7fca0541190b87e6dae510df05c but the leftovers can cause a build to fail if the code isn't optimized out. I just found this out in the Meson port of libvpx for GStreamer. BUG=webm:1584 Change-Id: I1c953720a2cbec3796200d4ec4020dca0b672bfb --- vp8/common/rtcd_defs.pl | 6 ------ 1 file changed, 6 deletions(-) diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl index 739a612847..12b474d939 100644 --- a/vp8/common/rtcd_defs.pl +++ b/vp8/common/rtcd_defs.pl @@ -127,12 +127,6 @@ () # if (vpx_config("CONFIG_POSTPROC") eq "yes") { - add_proto qw/void vp8_blend_mb_inner/, "unsigned char *y, unsigned char *u, unsigned char *v, int y_1, int u_1, int v_1, int alpha, int stride"; - - add_proto qw/void vp8_blend_mb_outer/, "unsigned char *y, unsigned char *u, unsigned char *v, int y_1, int u_1, int v_1, int alpha, int stride"; - - add_proto qw/void vp8_blend_b/, "unsigned char *y, unsigned char *u, unsigned char *v, int y_1, int u_1, int v_1, int alpha, int stride"; - add_proto qw/void vp8_filter_by_weight16x16/, "unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight"; specialize qw/vp8_filter_by_weight16x16 sse2 msa/; From 1d1ee888d39d288017c360abb523c2686ff56da8 Mon Sep 17 00:00:00 2001 From: James Zern Date: Thu, 13 Jul 2023 09:49:30 -0700 Subject: [PATCH 0838/1000] vp9_rdopt,handle_inter_mode: fix -Wmaybe-uninitialized warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With gcc 13.1.1 In function ‘handle_inter_mode’, inlined from ‘vp9_rd_pick_inter_mode_sb’ at ../vp9/encoder/vp9_rdopt.c:3872:17: ../vp9/encoder/vp9_rdopt.c:3142:8: warning: ‘tmp_rd’ may be used uninitialized [-Wmaybe-uninitialized] 3142 | rd = tmp_rd + RDCOST(x->rdmult, x->rddiv, rs, 0); ../vp9/encoder/vp9_rdopt.c: In function ‘vp9_rd_pick_inter_mode_sb’: ../vp9/encoder/vp9_rdopt.c:2846:15: note: ‘tmp_rd’ was declared here 2846 | int64_t rd, tmp_rd, best_rd = INT64_MAX; Change-Id: I8608957cc8bbeb1ae525f3c3dad6fe9785b2a9b4 --- vp9/encoder/vp9_rdopt.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index b7fb26de27..7b607b643a 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -2841,9 +2841,8 @@ static int64_t handle_inter_mode( #else DECLARE_ALIGNED(16, uint8_t, tmp_buf[MAX_MB_PLANE * 64 * 64]); #endif // CONFIG_VP9_HIGHBITDEPTH - int pred_exists = 0; int intpel_mv; - int64_t rd, tmp_rd, best_rd = INT64_MAX; + int64_t rd, tmp_rd = INT64_MAX, best_rd = INT64_MAX; int best_needs_copy = 0; uint8_t *orig_dst[MAX_MB_PLANE]; int orig_dst_stride[MAX_MB_PLANE]; @@ -3003,7 +3002,6 @@ static int64_t handle_inter_mode( mi->mode != NEARESTMV) return INT64_MAX; - pred_exists = 0; // Are all MVs integer pel for Y and UV intpel_mv = !mv_has_subpel(&mi->mv[0].as_mv); if (is_comp_pred) intpel_mv &= !mv_has_subpel(&mi->mv[1].as_mv); @@ -3111,7 +3109,6 @@ static int64_t handle_inter_mode( if ((cm->interp_filter == SWITCHABLE && newbest) || (cm->interp_filter != SWITCHABLE && cm->interp_filter == mi->interp_filter)) { - pred_exists = 1; tmp_rd = best_rd; skip_txfm_sb = tmp_skip_sb; @@ -3131,7 +3128,7 @@ static int64_t handle_inter_mode( cm->interp_filter != SWITCHABLE ? cm->interp_filter : best_filter; rs = cm->interp_filter == SWITCHABLE ? vp9_get_switchable_rate(cpi, xd) : 0; - if (pred_exists) { + if (tmp_rd != INT64_MAX) { if (best_needs_copy) { // again temporarily set the buffers to local memory to prevent a memcpy for (i = 0; i < MAX_MB_PLANE; i++) { From 37200b6abb9cb4989a3c955a2a8a9b60df7e6245 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Mon, 24 Jul 2023 13:08:05 -0400 Subject: [PATCH 0839/1000] cleanup: _pt -> _ptr in vp9 external RC interface Change-Id: Ic483488f8f6273e8977cfc324466bda41f1e47a7 --- vpx/vpx_ext_ratectrl.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vpx/vpx_ext_ratectrl.h b/vpx/vpx_ext_ratectrl.h index 3c5fc8cfc3..43816459ef 100644 --- a/vpx/vpx_ext_ratectrl.h +++ b/vpx/vpx_ext_ratectrl.h @@ -385,13 +385,13 @@ typedef struct vpx_rc_gop_decision { * This callback is invoked by the encoder to create an external rate control * model. * - * \param[in] priv Callback's private data - * \param[in] ratectrl_config Pointer to vpx_rc_config_t - * \param[out] rate_ctrl_model_pt Pointer to vpx_rc_model_t + * \param[in] priv Callback's private data + * \param[in] ratectrl_config Pointer to vpx_rc_config_t + * \param[out] rate_ctrl_model_ptr Pointer to vpx_rc_model_t */ typedef vpx_rc_status_t (*vpx_rc_create_model_cb_fn_t)( void *priv, const vpx_rc_config_t *ratectrl_config, - vpx_rc_model_t *rate_ctrl_model_pt); + vpx_rc_model_t *rate_ctrl_model_ptr); /*!\brief Send first pass stats to the external rate control model callback * prototype From e1c124f8965f166d3e9ca26c9215ebc3ec3a1d72 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Mon, 24 Jul 2023 18:04:58 -0400 Subject: [PATCH 0840/1000] Add new_mv_count to ext rate control interface Bug: b/290385227 Change-Id: Ia87c4bf1e9315bf1134c998f88e9d5548c497777 --- vp9/encoder/vp9_ext_ratectrl.c | 1 + vpx/vpx_ext_ratectrl.h | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/vp9/encoder/vp9_ext_ratectrl.c b/vp9/encoder/vp9_ext_ratectrl.c index b08fd63c3c..09253403b8 100644 --- a/vp9/encoder/vp9_ext_ratectrl.c +++ b/vp9/encoder/vp9_ext_ratectrl.c @@ -92,6 +92,7 @@ static void gen_rc_firstpass_stats(const FIRSTPASS_STATS *stats, rc_frame_stats->mv_in_out_count = stats->mv_in_out_count; rc_frame_stats->duration = stats->duration; rc_frame_stats->count = stats->count; + rc_frame_stats->new_mv_count = stats->new_mv_count; } vpx_codec_err_t vp9_extrc_send_firstpass_stats( diff --git a/vpx/vpx_ext_ratectrl.h b/vpx/vpx_ext_ratectrl.h index 43816459ef..2c312858b8 100644 --- a/vpx/vpx_ext_ratectrl.h +++ b/vpx/vpx_ext_ratectrl.h @@ -271,6 +271,10 @@ typedef struct vpx_rc_frame_stats { * number of frames whose stats are accumulated. */ double count; + /*! + * Number of new mv in a frame. + */ + double new_mv_count; } vpx_rc_frame_stats_t; /*!\brief Collection of first pass frame stats From 5fb280ebb9b023575829876bede174f2098e6f9e Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 7 Jul 2023 19:14:59 -0700 Subject: [PATCH 0841/1000] test,AbstractBench: fix -Wnon-virtual-dtor In file included from ../test/bench.cc:14: ../test/bench.h:17:7: warning: 'AbstractBench' has virtual functions but non-virtual destructor [-Wnon-virtual-dtor] class AbstractBench { Change-Id: Ibbfb949b63c8dff936c7ed4f2d056dea0343377b --- configure | 1 + test/bench.h | 2 ++ 2 files changed, 3 insertions(+) diff --git a/configure b/configure index 67bba946f6..aef65a8505 100755 --- a/configure +++ b/configure @@ -679,6 +679,7 @@ process_toolchain() { check_add_cxxflags -Wc++14-extensions check_add_cxxflags -Wc++17-extensions check_add_cxxflags -Wc++20-extensions + check_add_cxxflags -Wnon-virtual-dtor # disable some warnings specific to libyuv / libwebm. check_cxxflags -Wno-missing-declarations \ diff --git a/test/bench.h b/test/bench.h index 57ca9118ba..203e4d247e 100644 --- a/test/bench.h +++ b/test/bench.h @@ -16,6 +16,8 @@ class AbstractBench { public: + virtual ~AbstractBench() = default; + void RunNTimes(int n); void PrintMedian(const char *title); From 84e6b7ab02771fbf27a623944b511367bf5aacbd Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 24 Jul 2023 17:23:23 -0700 Subject: [PATCH 0842/1000] test/*.cc: prefer 'override' to 'virtual' created with clang-tidy --fix --checks=-*,modernize-use-override Change-Id: I71e1b4423c143b3e47fe90929ee110b307cdb565 --- test/active_map_refresh_test.cc | 8 ++-- test/active_map_test.cc | 8 ++-- test/add_noise_test.cc | 4 +- test/alt_ref_aq_segment_test.cc | 8 ++-- test/altref_test.cc | 22 ++++----- test/aq_segment_test.cc | 8 ++-- test/avg_test.cc | 20 ++++---- test/blockiness_test.cc | 4 +- test/borders_test.cc | 10 ++-- test/byte_alignment_test.cc | 4 +- test/comp_avg_pred_test.cc | 2 +- test/config_test.cc | 10 ++-- test/consistency_test.cc | 4 +- test/convolve_test.cc | 4 +- test/cpu_speed_test.cc | 12 ++--- test/cq_test.cc | 14 +++--- test/dct16x16_test.cc | 31 +++++++------ test/dct32x32_test.cc | 14 +++--- test/dct_partial_test.cc | 2 +- test/dct_test.cc | 4 +- test/decode_corrupted.cc | 22 ++++----- test/decode_perf_test.cc | 16 +++---- test/decode_svc_test.cc | 11 ++--- test/encode_perf_test.cc | 14 +++--- test/error_resilience_test.cc | 26 +++++------ test/external_frame_buffer_test.cc | 17 ++++--- test/fdct8x8_test.cc | 31 +++++++------ test/frame_size_tests.cc | 16 +++---- test/hadamard_test.cc | 6 +-- test/idct_test.cc | 4 +- test/invalid_file_test.cc | 15 +++--- test/keyframe_test.cc | 10 ++-- test/level_test.cc | 8 ++-- test/lpf_test.cc | 12 ++--- test/minmax_test.cc | 2 +- test/partial_idct_test.cc | 6 +-- test/pp_filter_test.cc | 12 ++--- test/predict_test.cc | 6 +-- test/quantize_test.cc | 4 +- test/resize_test.cc | 58 ++++++++++++------------ test/sad_test.cc | 4 +- test/sum_squares_test.cc | 6 +-- test/superframe_test.cc | 14 +++--- test/svc_datarate_test.cc | 44 +++++++++--------- test/svc_end_to_end_test.cc | 54 +++++++++++----------- test/test_vector_test.cc | 11 ++--- test/tile_independence_test.cc | 10 ++-- test/timestamp_test.cc | 10 ++-- test/variance_test.cc | 10 ++-- test/vp8_datarate_test.cc | 14 +++--- test/vp8_denoiser_sse2_test.cc | 6 +-- test/vp8_fdct4x4_test.cc | 2 +- test/vp8_fragments_test.cc | 4 +- test/vp8_ratectrl_rtc_test.cc | 12 ++--- test/vp9_arf_freq_test.cc | 12 ++--- test/vp9_block_error_test.cc | 6 +-- test/vp9_datarate_test.cc | 40 ++++++++-------- test/vp9_denoiser_test.cc | 6 +-- test/vp9_encoder_parms_get_to_decoder.cc | 14 +++--- test/vp9_end_to_end_test.cc | 32 ++++++------- test/vp9_ethread_test.cc | 36 +++++++-------- test/vp9_intrapred_test.cc | 2 +- test/vp9_lossless_test.cc | 12 ++--- test/vp9_motion_vector_test.cc | 8 ++-- test/vp9_quantize_test.cc | 4 +- test/vp9_ratectrl_rtc_test.cc | 26 +++++------ test/vp9_roi_test.cc | 8 ++-- test/vp9_scale_test.cc | 4 +- test/vp9_subtract_test.cc | 8 ++-- test/vp9_thread_test.cc | 6 +-- test/vpx_scale_test.cc | 8 ++-- test/y4m_test.cc | 6 +-- test/yuv_temporal_filter_test.cc | 2 +- 73 files changed, 464 insertions(+), 466 deletions(-) diff --git a/test/active_map_refresh_test.cc b/test/active_map_refresh_test.cc index 68d8856eaa..8b35ca81ba 100644 --- a/test/active_map_refresh_test.cc +++ b/test/active_map_refresh_test.cc @@ -62,16 +62,16 @@ class ActiveMapRefreshTest public ::libvpx_test::CodecTestWith2Params { protected: ActiveMapRefreshTest() : EncoderTest(GET_PARAM(0)) {} - virtual ~ActiveMapRefreshTest() {} + ~ActiveMapRefreshTest() override {} - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(GET_PARAM(1)); cpu_used_ = GET_PARAM(2); } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { ::libvpx_test::Y4mVideoSource *y4m_video = static_cast(video); if (video->frame() == 0) { diff --git a/test/active_map_test.cc b/test/active_map_test.cc index 543ec0d358..1f661b559c 100644 --- a/test/active_map_test.cc +++ b/test/active_map_test.cc @@ -26,16 +26,16 @@ class ActiveMapTest static const int kHeight = 144; ActiveMapTest() : EncoderTest(GET_PARAM(0)) {} - virtual ~ActiveMapTest() {} + ~ActiveMapTest() override {} - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(GET_PARAM(1)); cpu_used_ = GET_PARAM(2); } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { encoder->Control(VP8E_SET_CPUUSED, cpu_used_); encoder->Control(VP9E_SET_AQ_MODE, GET_PARAM(3)); diff --git a/test/add_noise_test.cc b/test/add_noise_test.cc index 7dc86e3eb6..6e787dd6ba 100644 --- a/test/add_noise_test.cc +++ b/test/add_noise_test.cc @@ -32,8 +32,8 @@ typedef std::tuple AddNoiseTestFPParam; class AddNoiseTest : public ::testing::Test, public ::testing::WithParamInterface { public: - virtual void TearDown() { libvpx_test::ClearSystemState(); } - virtual ~AddNoiseTest() {} + void TearDown() override { libvpx_test::ClearSystemState(); } + ~AddNoiseTest() override {} }; double stddev6(char a, char b, char c, char d, char e, char f) { diff --git a/test/alt_ref_aq_segment_test.cc b/test/alt_ref_aq_segment_test.cc index 00a00e27c5..b64fc3cd0b 100644 --- a/test/alt_ref_aq_segment_test.cc +++ b/test/alt_ref_aq_segment_test.cc @@ -20,9 +20,9 @@ class AltRefAqSegmentTest public ::libvpx_test::CodecTestWith2Params { protected: AltRefAqSegmentTest() : EncoderTest(GET_PARAM(0)) {} - virtual ~AltRefAqSegmentTest() {} + ~AltRefAqSegmentTest() override {} - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(GET_PARAM(1)); set_cpu_used_ = GET_PARAM(2); @@ -30,8 +30,8 @@ class AltRefAqSegmentTest alt_ref_aq_mode_ = 0; } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_); encoder->Control(VP9E_SET_ALT_REF_AQ, alt_ref_aq_mode_); diff --git a/test/altref_test.cc b/test/altref_test.cc index 69bcef774e..69b2b87a2a 100644 --- a/test/altref_test.cc +++ b/test/altref_test.cc @@ -24,24 +24,24 @@ class AltRefTest : public ::libvpx_test::EncoderTest, public ::libvpx_test::CodecTestWithParam { protected: AltRefTest() : EncoderTest(GET_PARAM(0)), altref_count_(0) {} - virtual ~AltRefTest() {} + ~AltRefTest() override {} - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(libvpx_test::kTwoPassGood); } - virtual void BeginPassHook(unsigned int /*pass*/) { altref_count_ = 0; } + void BeginPassHook(unsigned int /*pass*/) override { altref_count_ = 0; } - virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video, - libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(libvpx_test::VideoSource *video, + libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1); encoder->Control(VP8E_SET_CPUUSED, 3); } } - virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { if (pkt->data.frame.flags & VPX_FRAME_IS_INVISIBLE) ++altref_count_; } @@ -75,17 +75,17 @@ class AltRefForcedKeyTestLarge AltRefForcedKeyTestLarge() : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)), cpu_used_(GET_PARAM(2)), forced_kf_frame_num_(1), frame_num_(0) {} - virtual ~AltRefForcedKeyTestLarge() {} + ~AltRefForcedKeyTestLarge() override {} - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(encoding_mode_); cfg_.rc_end_usage = VPX_VBR; cfg_.g_threads = 0; } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { encoder->Control(VP8E_SET_CPUUSED, cpu_used_); encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1); @@ -100,7 +100,7 @@ class AltRefForcedKeyTestLarge (video->frame() == forced_kf_frame_num_) ? VPX_EFLAG_FORCE_KF : 0; } - virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { if (frame_num_ == forced_kf_frame_num_) { ASSERT_TRUE(!!(pkt->data.frame.flags & VPX_FRAME_IS_KEY)) << "Frame #" << frame_num_ << " isn't a keyframe!"; diff --git a/test/aq_segment_test.cc b/test/aq_segment_test.cc index 2cbc991d0c..c98b8de094 100644 --- a/test/aq_segment_test.cc +++ b/test/aq_segment_test.cc @@ -20,17 +20,17 @@ class AqSegmentTest public ::libvpx_test::CodecTestWith2Params { protected: AqSegmentTest() : EncoderTest(GET_PARAM(0)) {} - virtual ~AqSegmentTest() {} + ~AqSegmentTest() override {} - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(GET_PARAM(1)); set_cpu_used_ = GET_PARAM(2); aq_mode_ = 0; } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_); encoder->Control(VP9E_SET_AQ_MODE, aq_mode_); diff --git a/test/avg_test.cc b/test/avg_test.cc index a0428304a2..dbd3309ee4 100644 --- a/test/avg_test.cc +++ b/test/avg_test.cc @@ -38,7 +38,7 @@ class AverageTestBase : public ::testing::Test { : width_(width), height_(height), source_data_(nullptr), source_stride_(0), bit_depth_(8) {} - virtual void TearDown() { + void TearDown() override { vpx_free(source_data_); source_data_ = nullptr; libvpx_test::ClearSystemState(); @@ -49,7 +49,7 @@ class AverageTestBase : public ::testing::Test { static const int kDataAlignment = 16; static const int kDataBlockSize = 64 * 128; - virtual void SetUp() { + void SetUp() override { source_data_ = reinterpret_cast( vpx_memalign(kDataAlignment, kDataBlockSize * sizeof(source_data_[0]))); ASSERT_NE(source_data_, nullptr); @@ -169,7 +169,7 @@ class IntProRowTest : public AverageTestBase, } protected: - virtual void SetUp() { + void SetUp() override { source_data_ = reinterpret_cast( vpx_memalign(kDataAlignment, kDataBlockSize * sizeof(source_data_[0]))); ASSERT_NE(source_data_, nullptr); @@ -180,7 +180,7 @@ class IntProRowTest : public AverageTestBase, vpx_memalign(kDataAlignment, sizeof(*hbuf_c_) * 16)); } - virtual void TearDown() { + void TearDown() override { vpx_free(source_data_); source_data_ = nullptr; vpx_free(hbuf_c_); @@ -238,7 +238,7 @@ typedef std::tuple SatdTestParam; class SatdTest : public ::testing::Test, public ::testing::WithParamInterface { protected: - virtual void SetUp() { + void SetUp() override { satd_size_ = GET_PARAM(0); satd_func_ = GET_PARAM(1); rnd_.Reset(ACMRandom::DeterministicSeed()); @@ -247,7 +247,7 @@ class SatdTest : public ::testing::Test, ASSERT_NE(src_, nullptr); } - virtual void TearDown() { + void TearDown() override { libvpx_test::ClearSystemState(); vpx_free(src_); } @@ -276,7 +276,7 @@ class SatdTest : public ::testing::Test, class SatdLowbdTest : public SatdTest { protected: - virtual void FillRandom() { + void FillRandom() override { for (int i = 0; i < satd_size_; ++i) { const int16_t tmp = rnd_.Rand16Signed(); src_[i] = (tran_low_t)tmp; @@ -292,7 +292,7 @@ class BlockErrorTestFP : public ::testing::Test, public ::testing::WithParamInterface { protected: - virtual void SetUp() { + void SetUp() override { txfm_size_ = GET_PARAM(0); block_error_func_ = GET_PARAM(1); rnd_.Reset(ACMRandom::DeterministicSeed()); @@ -304,7 +304,7 @@ class BlockErrorTestFP ASSERT_NE(dqcoeff_, nullptr); } - virtual void TearDown() { + void TearDown() override { libvpx_test::ClearSystemState(); vpx_free(coeff_); vpx_free(dqcoeff_); @@ -463,7 +463,7 @@ TEST_P(SatdLowbdTest, DISABLED_Speed) { #if CONFIG_VP9_HIGHBITDEPTH class SatdHighbdTest : public SatdTest { protected: - virtual void FillRandom() { + void FillRandom() override { for (int i = 0; i < satd_size_; ++i) { src_[i] = rnd_.Rand20Signed(); } diff --git a/test/blockiness_test.cc b/test/blockiness_test.cc index 11b2a3f61f..5a45bc0b7f 100644 --- a/test/blockiness_test.cc +++ b/test/blockiness_test.cc @@ -49,14 +49,14 @@ class BlockinessTestBase : public ::testing::Test { reference_data_ = nullptr; } - virtual void TearDown() { libvpx_test::ClearSystemState(); } + void TearDown() override { libvpx_test::ClearSystemState(); } protected: // Handle frames up to 640x480 static const int kDataAlignment = 16; static const int kDataBufferSize = 640 * 480; - virtual void SetUp() { + void SetUp() override { source_stride_ = (width_ + 31) & ~31; reference_stride_ = width_ * 2; rnd_.Reset(ACMRandom::DeterministicSeed()); diff --git a/test/borders_test.cc b/test/borders_test.cc index 3c1f69a923..009121bf22 100644 --- a/test/borders_test.cc +++ b/test/borders_test.cc @@ -22,15 +22,15 @@ class BordersTest public ::libvpx_test::CodecTestWithParam { protected: BordersTest() : EncoderTest(GET_PARAM(0)) {} - virtual ~BordersTest() {} + ~BordersTest() override {} - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(GET_PARAM(1)); } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { encoder->Control(VP8E_SET_CPUUSED, 1); encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1); @@ -40,7 +40,7 @@ class BordersTest } } - virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { if (pkt->data.frame.flags & VPX_FRAME_IS_KEY) { } } diff --git a/test/byte_alignment_test.cc b/test/byte_alignment_test.cc index 1e0ffceb8d..ba6fffc524 100644 --- a/test/byte_alignment_test.cc +++ b/test/byte_alignment_test.cc @@ -58,7 +58,7 @@ class ByteAlignmentTest ByteAlignmentTest() : video_(nullptr), decoder_(nullptr), md5_file_(nullptr) {} - virtual void SetUp() { + void SetUp() override { video_ = new libvpx_test::WebMVideoSource(kVP9TestFile); ASSERT_NE(video_, nullptr); video_->Init(); @@ -71,7 +71,7 @@ class ByteAlignmentTest OpenMd5File(kVP9Md5File); } - virtual void TearDown() { + void TearDown() override { if (md5_file_ != nullptr) fclose(md5_file_); delete decoder_; diff --git a/test/comp_avg_pred_test.cc b/test/comp_avg_pred_test.cc index d8fabd5bef..3234cc9a25 100644 --- a/test/comp_avg_pred_test.cc +++ b/test/comp_avg_pred_test.cc @@ -49,7 +49,7 @@ using AvgPredFunc = void (*)(uint8_t *a, const uint8_t *b, int w, int h, template class AvgPredTest : public ::testing::TestWithParam { public: - virtual void SetUp() { + void SetUp() override { avg_pred_func_ = GetParam(); rnd_.Reset(ACMRandom::DeterministicSeed()); } diff --git a/test/config_test.cc b/test/config_test.cc index 8f4c60e113..a476d580a5 100644 --- a/test/config_test.cc +++ b/test/config_test.cc @@ -22,24 +22,24 @@ class ConfigTest ConfigTest() : EncoderTest(GET_PARAM(0)), frame_count_in_(0), frame_count_out_(0), frame_count_max_(0) {} - virtual ~ConfigTest() {} + ~ConfigTest() override {} - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(GET_PARAM(1)); } - virtual void BeginPassHook(unsigned int /*pass*/) { + void BeginPassHook(unsigned int /*pass*/) override { frame_count_in_ = 0; frame_count_out_ = 0; } - virtual void PreEncodeFrameHook(libvpx_test::VideoSource * /*video*/) { + void PreEncodeFrameHook(libvpx_test::VideoSource * /*video*/) override { ++frame_count_in_; abort_ |= (frame_count_in_ >= frame_count_max_); } - virtual void FramePktHook(const vpx_codec_cx_pkt_t * /*pkt*/) { + void FramePktHook(const vpx_codec_cx_pkt_t * /*pkt*/) override { ++frame_count_out_; } diff --git a/test/consistency_test.cc b/test/consistency_test.cc index f0e2cb297e..5e872e70a8 100644 --- a/test/consistency_test.cc +++ b/test/consistency_test.cc @@ -65,14 +65,14 @@ class ConsistencyTestBase : public ::testing::Test { delete[] ssim_array_; } - virtual void TearDown() { libvpx_test::ClearSystemState(); } + void TearDown() override { libvpx_test::ClearSystemState(); } protected: // Handle frames up to 640x480 static const int kDataAlignment = 16; static const int kDataBufferSize = 640 * 480; - virtual void SetUp() { + void SetUp() override { source_stride_ = (width_ + 31) & ~31; reference_stride_ = width_ * 2; rnd_.Reset(ACMRandom::DeterministicSeed()); diff --git a/test/convolve_test.cc b/test/convolve_test.cc index 5a17d80894..4d27c5ffcf 100644 --- a/test/convolve_test.cc +++ b/test/convolve_test.cc @@ -361,7 +361,7 @@ class ConvolveTest : public ::testing::TestWithParam { #endif } - virtual void TearDown() { libvpx_test::ClearSystemState(); } + void TearDown() override { libvpx_test::ClearSystemState(); } static void TearDownTestSuite() { vpx_free(input_ - 1); @@ -403,7 +403,7 @@ class ConvolveTest : public ::testing::TestWithParam { i % kOuterBlockSize >= (BorderLeft() + Width())); } - virtual void SetUp() { + void SetUp() override { UUT_ = GET_PARAM(2); #if CONFIG_VP9_HIGHBITDEPTH if (UUT_->use_highbd_ != 0) { diff --git a/test/cpu_speed_test.cc b/test/cpu_speed_test.cc index a7623f09ac..78999ce658 100644 --- a/test/cpu_speed_test.cc +++ b/test/cpu_speed_test.cc @@ -26,9 +26,9 @@ class CpuSpeedTest : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)), set_cpu_used_(GET_PARAM(2)), min_psnr_(kMaxPSNR), tune_content_(VP9E_CONTENT_DEFAULT) {} - virtual ~CpuSpeedTest() {} + ~CpuSpeedTest() override {} - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(encoding_mode_); if (encoding_mode_ != ::libvpx_test::kRealTime) { @@ -40,10 +40,10 @@ class CpuSpeedTest } } - virtual void BeginPassHook(unsigned int /*pass*/) { min_psnr_ = kMaxPSNR; } + void BeginPassHook(unsigned int /*pass*/) override { min_psnr_ = kMaxPSNR; } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_); encoder->Control(VP9E_SET_TUNE_CONTENT, tune_content_); @@ -56,7 +56,7 @@ class CpuSpeedTest } } - virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) { + void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) override { if (pkt->data.psnr.psnr[0] < min_psnr_) min_psnr_ = pkt->data.psnr.psnr[0]; } diff --git a/test/cq_test.cc b/test/cq_test.cc index 292adb0d04..a9a16aae13 100644 --- a/test/cq_test.cc +++ b/test/cq_test.cc @@ -50,21 +50,21 @@ class CQTest : public ::libvpx_test::EncoderTest, init_flags_ = VPX_CODEC_USE_PSNR; } - virtual ~CQTest() {} + ~CQTest() override {} - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(libvpx_test::kTwoPassGood); } - virtual void BeginPassHook(unsigned int /*pass*/) { + void BeginPassHook(unsigned int /*pass*/) override { file_size_ = 0; psnr_ = 0.0; n_frames_ = 0; } - virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video, - libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(libvpx_test::VideoSource *video, + libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { if (cfg_.rc_end_usage == VPX_CQ) { encoder->Control(VP8E_SET_CQ_LEVEL, cq_level_); @@ -73,12 +73,12 @@ class CQTest : public ::libvpx_test::EncoderTest, } } - virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) { + void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) override { psnr_ += pow(10.0, pkt->data.psnr.psnr[0] / 10.0); n_frames_++; } - virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { file_size_ += pkt->data.frame.sz; } diff --git a/test/dct16x16_test.cc b/test/dct16x16_test.cc index 4ad2263cfc..de98d99731 100644 --- a/test/dct16x16_test.cc +++ b/test/dct16x16_test.cc @@ -728,9 +728,9 @@ class Trans16x16TestBase { class Trans16x16DCT : public Trans16x16TestBase, public ::testing::TestWithParam { public: - virtual ~Trans16x16DCT() {} + ~Trans16x16DCT() override {} - virtual void SetUp() { + void SetUp() override { fwd_txfm_ = GET_PARAM(0); inv_txfm_ = GET_PARAM(1); tx_type_ = GET_PARAM(2); @@ -749,13 +749,13 @@ class Trans16x16DCT : public Trans16x16TestBase, inv_txfm_ref = idct16x16_ref; #endif } - virtual void TearDown() { libvpx_test::ClearSystemState(); } + void TearDown() override { libvpx_test::ClearSystemState(); } protected: - void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) { + void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) override { fwd_txfm_(in, out, stride); } - void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) { + void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) override { inv_txfm_(out, dst, stride); } @@ -782,9 +782,9 @@ TEST_P(Trans16x16DCT, DISABLED_Speed) { RunSpeedTest(); } class Trans16x16HT : public Trans16x16TestBase, public ::testing::TestWithParam { public: - virtual ~Trans16x16HT() {} + ~Trans16x16HT() override {} - virtual void SetUp() { + void SetUp() override { fwd_txfm_ = GET_PARAM(0); inv_txfm_ = GET_PARAM(1); tx_type_ = GET_PARAM(2); @@ -803,13 +803,13 @@ class Trans16x16HT : public Trans16x16TestBase, inv_txfm_ref = iht16x16_ref; #endif } - virtual void TearDown() { libvpx_test::ClearSystemState(); } + void TearDown() override { libvpx_test::ClearSystemState(); } protected: - void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) { + void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) override { fwd_txfm_(in, out, stride, tx_type_); } - void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) { + void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) override { inv_txfm_(out, dst, stride, tx_type_); } @@ -832,9 +832,9 @@ TEST_P(Trans16x16HT, QuantCheck) { class InvTrans16x16DCT : public Trans16x16TestBase, public ::testing::TestWithParam { public: - virtual ~InvTrans16x16DCT() {} + ~InvTrans16x16DCT() override {} - virtual void SetUp() { + void SetUp() override { ref_txfm_ = GET_PARAM(0); inv_txfm_ = GET_PARAM(1); thresh_ = GET_PARAM(2); @@ -842,11 +842,12 @@ class InvTrans16x16DCT : public Trans16x16TestBase, pitch_ = 16; mask_ = (1 << bit_depth_) - 1; } - virtual void TearDown() { libvpx_test::ClearSystemState(); } + void TearDown() override { libvpx_test::ClearSystemState(); } protected: - void RunFwdTxfm(int16_t * /*in*/, tran_low_t * /*out*/, int /*stride*/) {} - void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) { + void RunFwdTxfm(int16_t * /*in*/, tran_low_t * /*out*/, + int /*stride*/) override {} + void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) override { inv_txfm_(out, dst, stride); } diff --git a/test/dct32x32_test.cc b/test/dct32x32_test.cc index 1167038b5f..62547ac537 100644 --- a/test/dct32x32_test.cc +++ b/test/dct32x32_test.cc @@ -89,8 +89,8 @@ void idct32x32_12(const tran_low_t *in, uint8_t *out, int stride) { class Trans32x32Test : public AbstractBench, public ::testing::TestWithParam { public: - virtual ~Trans32x32Test() {} - virtual void SetUp() { + ~Trans32x32Test() override {} + void SetUp() override { fwd_txfm_ = GET_PARAM(0); inv_txfm_ = GET_PARAM(1); version_ = GET_PARAM(2); // 0: high precision forward transform @@ -99,7 +99,7 @@ class Trans32x32Test : public AbstractBench, mask_ = (1 << bit_depth_) - 1; } - virtual void TearDown() { libvpx_test::ClearSystemState(); } + void TearDown() override { libvpx_test::ClearSystemState(); } protected: int version_; @@ -110,7 +110,7 @@ class Trans32x32Test : public AbstractBench, int16_t *bench_in_; tran_low_t *bench_out_; - virtual void Run(); + void Run() override; }; void Trans32x32Test::Run() { fwd_txfm_(bench_in_, bench_out_, 32); } @@ -321,8 +321,8 @@ TEST_P(Trans32x32Test, InverseAccuracy) { class InvTrans32x32Test : public ::testing::TestWithParam { public: - virtual ~InvTrans32x32Test() {} - virtual void SetUp() { + ~InvTrans32x32Test() override {} + void SetUp() override { ref_txfm_ = GET_PARAM(0); inv_txfm_ = GET_PARAM(1); version_ = GET_PARAM(2); // 0: high precision forward transform @@ -334,7 +334,7 @@ class InvTrans32x32Test : public ::testing::TestWithParam { pitch_ = 32; } - virtual void TearDown() { libvpx_test::ClearSystemState(); } + void TearDown() override { libvpx_test::ClearSystemState(); } protected: void RunRefTxfm(tran_low_t *out, uint8_t *dst, int stride) { diff --git a/test/dct_partial_test.cc b/test/dct_partial_test.cc index e57fa0f48b..ec6f543f71 100644 --- a/test/dct_partial_test.cc +++ b/test/dct_partial_test.cc @@ -67,7 +67,7 @@ class PartialFdctTest : public ::testing::TestWithParam { bit_depth_ = GET_PARAM(2); } - virtual void TearDown() { libvpx_test::ClearSystemState(); } + void TearDown() override { libvpx_test::ClearSystemState(); } protected: void RunTest() { diff --git a/test/dct_test.cc b/test/dct_test.cc index 235c407237..c3d3081c42 100644 --- a/test/dct_test.cc +++ b/test/dct_test.cc @@ -134,7 +134,7 @@ void fwht_ref(const Buffer &in, Buffer *out, int size, class TransTestBase : public ::testing::TestWithParam { public: - virtual void SetUp() { + void SetUp() override { rnd_.Reset(ACMRandom::DeterministicSeed()); const int idx = GET_PARAM(0); const FuncInfo *func_info = &(GET_PARAM(1)[idx]); @@ -166,7 +166,7 @@ class TransTestBase : public ::testing::TestWithParam { ASSERT_NE(dst_, nullptr); } - virtual void TearDown() { + void TearDown() override { vpx_free(src_); src_ = nullptr; vpx_free(dst_); diff --git a/test/decode_corrupted.cc b/test/decode_corrupted.cc index 31e1da69cc..a9a2cc6e70 100644 --- a/test/decode_corrupted.cc +++ b/test/decode_corrupted.cc @@ -28,9 +28,9 @@ class DecodeCorruptedFrameTest DecodeCorruptedFrameTest() : EncoderTest(GET_PARAM(0)) {} protected: - virtual ~DecodeCorruptedFrameTest() {} + ~DecodeCorruptedFrameTest() override {} - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(::libvpx_test::kRealTime); cfg_.g_lag_in_frames = 0; @@ -44,16 +44,16 @@ class DecodeCorruptedFrameTest dec_cfg_.threads = 1; } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { if (video->frame() == 0) encoder->Control(VP8E_SET_CPUUSED, 7); } - virtual void MismatchHook(const vpx_image_t * /*img1*/, - const vpx_image_t * /*img2*/) {} + void MismatchHook(const vpx_image_t * /*img1*/, + const vpx_image_t * /*img2*/) override {} - virtual const vpx_codec_cx_pkt_t *MutateEncoderOutputHook( - const vpx_codec_cx_pkt_t *pkt) { + const vpx_codec_cx_pkt_t *MutateEncoderOutputHook( + const vpx_codec_cx_pkt_t *pkt) override { // Don't edit frame packet on key frame. if (pkt->data.frame.flags & VPX_FRAME_IS_KEY) return pkt; if (pkt->kind != VPX_CODEC_CX_FRAME_PKT) return pkt; @@ -66,9 +66,9 @@ class DecodeCorruptedFrameTest return &modified_pkt_; } - virtual bool HandleDecodeResult(const vpx_codec_err_t res_dec, - const libvpx_test::VideoSource & /*video*/, - libvpx_test::Decoder *decoder) { + bool HandleDecodeResult(const vpx_codec_err_t res_dec, + const libvpx_test::VideoSource & /*video*/, + libvpx_test::Decoder *decoder) override { EXPECT_NE(res_dec, VPX_CODEC_MEM_ERROR) << decoder->DecodeError(); return VPX_CODEC_MEM_ERROR != res_dec; } diff --git a/test/decode_perf_test.cc b/test/decode_perf_test.cc index e07a667440..7533778e82 100644 --- a/test/decode_perf_test.cc +++ b/test/decode_perf_test.cc @@ -118,9 +118,9 @@ class VP9NewEncodeDecodePerfTest : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)), speed_(0), outfile_(0), out_frames_(0) {} - virtual ~VP9NewEncodeDecodePerfTest() {} + ~VP9NewEncodeDecodePerfTest() override {} - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(encoding_mode_); @@ -137,8 +137,8 @@ class VP9NewEncodeDecodePerfTest cfg_.rc_end_usage = VPX_VBR; } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { encoder->Control(VP8E_SET_CPUUSED, speed_); encoder->Control(VP9E_SET_FRAME_PARALLEL_DECODING, 1); @@ -146,14 +146,14 @@ class VP9NewEncodeDecodePerfTest } } - virtual void BeginPassHook(unsigned int /*pass*/) { + void BeginPassHook(unsigned int /*pass*/) override { const std::string data_path = getenv("LIBVPX_TEST_DATA_PATH"); const std::string path_to_source = data_path + "/" + kNewEncodeOutputFile; outfile_ = fopen(path_to_source.c_str(), "wb"); ASSERT_NE(outfile_, nullptr); } - virtual void EndPassHook() { + void EndPassHook() override { if (outfile_ != nullptr) { if (!fseek(outfile_, 0, SEEK_SET)) { ivf_write_file_header(outfile_, &cfg_, VP9_FOURCC, out_frames_); @@ -163,7 +163,7 @@ class VP9NewEncodeDecodePerfTest } } - virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { ++out_frames_; // Write initial file header if first frame. @@ -177,7 +177,7 @@ class VP9NewEncodeDecodePerfTest pkt->data.frame.sz); } - virtual bool DoDecode() const { return false; } + bool DoDecode() const override { return false; } void set_speed(unsigned int speed) { speed_ = speed; } diff --git a/test/decode_svc_test.cc b/test/decode_svc_test.cc index ec9935da79..29e9bd06f5 100644 --- a/test/decode_svc_test.cc +++ b/test/decode_svc_test.cc @@ -25,17 +25,16 @@ class DecodeSvcTest : public ::libvpx_test::DecoderTest, public ::libvpx_test::CodecTestWithParam { protected: DecodeSvcTest() : DecoderTest(GET_PARAM(::libvpx_test::kCodecFactoryParam)) {} - virtual ~DecodeSvcTest() {} + ~DecodeSvcTest() override {} - virtual void PreDecodeFrameHook( - const libvpx_test::CompressedVideoSource &video, - libvpx_test::Decoder *decoder) { + void PreDecodeFrameHook(const libvpx_test::CompressedVideoSource &video, + libvpx_test::Decoder *decoder) override { if (video.frame_number() == 0) decoder->Control(VP9_DECODE_SVC_SPATIAL_LAYER, spatial_layer_); } - virtual void DecompressedFrameHook(const vpx_image_t &img, - const unsigned int frame_number) { + void DecompressedFrameHook(const vpx_image_t &img, + const unsigned int frame_number) override { ASSERT_EQ(img.d_w, width_); ASSERT_EQ(img.d_h, height_); total_frames_ = frame_number; diff --git a/test/encode_perf_test.cc b/test/encode_perf_test.cc index 142a55952b..5f9c58dc94 100644 --- a/test/encode_perf_test.cc +++ b/test/encode_perf_test.cc @@ -61,9 +61,9 @@ class VP9EncodePerfTest : EncoderTest(GET_PARAM(0)), min_psnr_(kMaxPsnr), nframes_(0), encoding_mode_(GET_PARAM(1)), speed_(0), threads_(1) {} - virtual ~VP9EncodePerfTest() {} + ~VP9EncodePerfTest() override {} - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(encoding_mode_); @@ -82,8 +82,8 @@ class VP9EncodePerfTest cfg_.g_threads = threads_; } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { const int log2_tile_columns = 3; encoder->Control(VP8E_SET_CPUUSED, speed_); @@ -93,19 +93,19 @@ class VP9EncodePerfTest } } - virtual void BeginPassHook(unsigned int /*pass*/) { + void BeginPassHook(unsigned int /*pass*/) override { min_psnr_ = kMaxPsnr; nframes_ = 0; } - virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) { + void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) override { if (pkt->data.psnr.psnr[0] < min_psnr_) { min_psnr_ = pkt->data.psnr.psnr[0]; } } // for performance reasons don't decode - virtual bool DoDecode() const { return false; } + bool DoDecode() const override { return false; } double min_psnr() const { return min_psnr_; } diff --git a/test/error_resilience_test.cc b/test/error_resilience_test.cc index 45138f14b9..622c3c4461 100644 --- a/test/error_resilience_test.cc +++ b/test/error_resilience_test.cc @@ -30,7 +30,7 @@ class ErrorResilienceTestLarge Reset(); } - virtual ~ErrorResilienceTestLarge() {} + ~ErrorResilienceTestLarge() override {} void Reset() { error_nframes_ = 0; @@ -38,19 +38,19 @@ class ErrorResilienceTestLarge pattern_switch_ = 0; } - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(encoding_mode_); } - virtual void BeginPassHook(unsigned int /*pass*/) { + void BeginPassHook(unsigned int /*pass*/) override { psnr_ = 0.0; nframes_ = 0; mismatch_psnr_ = 0.0; mismatch_nframes_ = 0; } - virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) { + void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) override { psnr_ += pkt->data.psnr.psnr[0]; nframes_++; } @@ -90,7 +90,7 @@ class ErrorResilienceTestLarge return frame_flags; } - virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video) { + void PreEncodeFrameHook(libvpx_test::VideoSource *video) override { frame_flags_ &= ~(VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF); // For temporal layer case. @@ -129,7 +129,7 @@ class ErrorResilienceTestLarge return 0.0; } - virtual bool DoDecode() const { + bool DoDecode() const override { if (error_nframes_ > 0 && (cfg_.g_pass == VPX_RC_LAST_PASS || cfg_.g_pass == VPX_RC_ONE_PASS)) { for (unsigned int i = 0; i < error_nframes_; ++i) { @@ -143,7 +143,7 @@ class ErrorResilienceTestLarge return 1; } - virtual void MismatchHook(const vpx_image_t *img1, const vpx_image_t *img2) { + void MismatchHook(const vpx_image_t *img1, const vpx_image_t *img2) override { double mismatch_psnr = compute_psnr(img1, img2); mismatch_psnr_ += mismatch_psnr; ++mismatch_nframes_; @@ -381,7 +381,7 @@ class ErrorResilienceTestLargeCodecControls Reset(); } - virtual ~ErrorResilienceTestLargeCodecControls() {} + ~ErrorResilienceTestLargeCodecControls() override {} void Reset() { last_pts_ = 0; @@ -393,7 +393,7 @@ class ErrorResilienceTestLargeCodecControls duration_ = 0.0; } - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(encoding_mode_); } @@ -460,8 +460,8 @@ class ErrorResilienceTestLargeCodecControls return layer_id; } - virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video, - libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(libvpx_test::VideoSource *video, + libvpx_test::Encoder *encoder) override { if (cfg_.ts_number_layers > 1) { int layer_id = SetLayerId(video->frame(), cfg_.ts_number_layers); int frame_flags = SetFrameFlags(video->frame(), cfg_.ts_number_layers); @@ -476,7 +476,7 @@ class ErrorResilienceTestLargeCodecControls } } - virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { // Time since last timestamp = duration. vpx_codec_pts_t duration = pkt->data.frame.pts - last_pts_; if (duration > 1) { @@ -496,7 +496,7 @@ class ErrorResilienceTestLargeCodecControls ++tot_frame_number_; } - virtual void EndPassHook() { + void EndPassHook() override { duration_ = (last_pts_ + 1) * timebase_; if (cfg_.ts_number_layers > 1) { for (int layer = 0; layer < static_cast(cfg_.ts_number_layers); diff --git a/test/external_frame_buffer_test.cc b/test/external_frame_buffer_test.cc index 3bd4a1c473..7b9a836fbc 100644 --- a/test/external_frame_buffer_test.cc +++ b/test/external_frame_buffer_test.cc @@ -210,13 +210,12 @@ class ExternalFrameBufferMD5Test : DecoderTest(GET_PARAM(::libvpx_test::kCodecFactoryParam)), md5_file_(nullptr), num_buffers_(0) {} - virtual ~ExternalFrameBufferMD5Test() { + ~ExternalFrameBufferMD5Test() override { if (md5_file_ != nullptr) fclose(md5_file_); } - virtual void PreDecodeFrameHook( - const libvpx_test::CompressedVideoSource &video, - libvpx_test::Decoder *decoder) { + void PreDecodeFrameHook(const libvpx_test::CompressedVideoSource &video, + libvpx_test::Decoder *decoder) override { if (num_buffers_ > 0 && video.frame_number() == 0) { // Have libvpx use frame buffers we create. ASSERT_TRUE(fb_list_.CreateBufferList(num_buffers_)); @@ -232,8 +231,8 @@ class ExternalFrameBufferMD5Test << "Md5 file open failed. Filename: " << md5_file_name_; } - virtual void DecompressedFrameHook(const vpx_image_t &img, - const unsigned int frame_number) { + void DecompressedFrameHook(const vpx_image_t &img, + const unsigned int frame_number) override { ASSERT_NE(md5_file_, nullptr); char expected_md5[33]; char junk[128]; @@ -289,7 +288,7 @@ class ExternalFrameBufferTest : public ::testing::Test { ExternalFrameBufferTest() : video_(nullptr), decoder_(nullptr), num_buffers_(0) {} - virtual void SetUp() { + void SetUp() override { video_ = new libvpx_test::WebMVideoSource(kVP9TestFile); ASSERT_NE(video_, nullptr); video_->Init(); @@ -300,7 +299,7 @@ class ExternalFrameBufferTest : public ::testing::Test { ASSERT_NE(decoder_, nullptr); } - virtual void TearDown() { + void TearDown() override { delete decoder_; decoder_ = nullptr; delete video_; @@ -355,7 +354,7 @@ class ExternalFrameBufferTest : public ::testing::Test { class ExternalFrameBufferNonRefTest : public ExternalFrameBufferTest { protected: - virtual void SetUp() { + void SetUp() override { video_ = new libvpx_test::WebMVideoSource(kVP9NonRefTestFile); ASSERT_NE(video_, nullptr); video_->Init(); diff --git a/test/fdct8x8_test.cc b/test/fdct8x8_test.cc index 21f8dcffa0..ba9db00120 100644 --- a/test/fdct8x8_test.cc +++ b/test/fdct8x8_test.cc @@ -539,9 +539,9 @@ class FwdTrans8x8TestBase { class FwdTrans8x8DCT : public FwdTrans8x8TestBase, public ::testing::TestWithParam { public: - virtual ~FwdTrans8x8DCT() {} + ~FwdTrans8x8DCT() override {} - virtual void SetUp() { + void SetUp() override { fwd_txfm_ = GET_PARAM(0); inv_txfm_ = GET_PARAM(1); tx_type_ = GET_PARAM(2); @@ -551,13 +551,13 @@ class FwdTrans8x8DCT : public FwdTrans8x8TestBase, mask_ = (1 << bit_depth_) - 1; } - virtual void TearDown() { libvpx_test::ClearSystemState(); } + void TearDown() override { libvpx_test::ClearSystemState(); } protected: - void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) { + void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) override { fwd_txfm_(in, out, stride); } - void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) { + void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) override { inv_txfm_(out, dst, stride); } @@ -578,9 +578,9 @@ TEST_P(FwdTrans8x8DCT, InvAccuracyCheck) { RunInvAccuracyCheck(); } class FwdTrans8x8HT : public FwdTrans8x8TestBase, public ::testing::TestWithParam { public: - virtual ~FwdTrans8x8HT() {} + ~FwdTrans8x8HT() override {} - virtual void SetUp() { + void SetUp() override { fwd_txfm_ = GET_PARAM(0); inv_txfm_ = GET_PARAM(1); tx_type_ = GET_PARAM(2); @@ -590,13 +590,13 @@ class FwdTrans8x8HT : public FwdTrans8x8TestBase, mask_ = (1 << bit_depth_) - 1; } - virtual void TearDown() { libvpx_test::ClearSystemState(); } + void TearDown() override { libvpx_test::ClearSystemState(); } protected: - void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) { + void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) override { fwd_txfm_(in, out, stride, tx_type_); } - void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) { + void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) override { inv_txfm_(out, dst, stride, tx_type_); } @@ -614,9 +614,9 @@ TEST_P(FwdTrans8x8HT, ExtremalCheck) { RunExtremalCheck(); } class InvTrans8x8DCT : public FwdTrans8x8TestBase, public ::testing::TestWithParam { public: - virtual ~InvTrans8x8DCT() {} + ~InvTrans8x8DCT() override {} - virtual void SetUp() { + void SetUp() override { ref_txfm_ = GET_PARAM(0); inv_txfm_ = GET_PARAM(1); thresh_ = GET_PARAM(2); @@ -625,13 +625,14 @@ class InvTrans8x8DCT : public FwdTrans8x8TestBase, mask_ = (1 << bit_depth_) - 1; } - virtual void TearDown() { libvpx_test::ClearSystemState(); } + void TearDown() override { libvpx_test::ClearSystemState(); } protected: - void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) { + void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) override { inv_txfm_(out, dst, stride); } - void RunFwdTxfm(int16_t * /*out*/, tran_low_t * /*dst*/, int /*stride*/) {} + void RunFwdTxfm(int16_t * /*out*/, tran_low_t * /*dst*/, + int /*stride*/) override {} IdctFunc ref_txfm_; IdctFunc inv_txfm_; diff --git a/test/frame_size_tests.cc b/test/frame_size_tests.cc index 8a0eb71ba0..266858eebb 100644 --- a/test/frame_size_tests.cc +++ b/test/frame_size_tests.cc @@ -65,7 +65,7 @@ class EncoderWithExpectedError : public ::libvpx_test::Encoder { ASSERT_EQ(expected_err, res) << EncoderError(); } - virtual vpx_codec_iface_t *CodecInterface() const { + vpx_codec_iface_t *CodecInterface() const override { #if CONFIG_VP9_ENCODER return &vpx_codec_vp9_cx_algo; #else @@ -79,22 +79,22 @@ class VP9FrameSizeTestsLarge : public ::libvpx_test::EncoderTest, protected: VP9FrameSizeTestsLarge() : EncoderTest(&::libvpx_test::kVP9), expected_res_(VPX_CODEC_OK) {} - virtual ~VP9FrameSizeTestsLarge() {} + ~VP9FrameSizeTestsLarge() override {} - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(::libvpx_test::kRealTime); } - virtual bool HandleDecodeResult(const vpx_codec_err_t res_dec, - const libvpx_test::VideoSource & /*video*/, - libvpx_test::Decoder *decoder) { + bool HandleDecodeResult(const vpx_codec_err_t res_dec, + const libvpx_test::VideoSource & /*video*/, + libvpx_test::Decoder *decoder) override { EXPECT_EQ(expected_res_, res_dec) << decoder->DecodeError(); return !::testing::Test::HasFailure(); } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { encoder->Control(VP8E_SET_CPUUSED, 7); encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1); diff --git a/test/hadamard_test.cc b/test/hadamard_test.cc index 9ba898b519..b22bae87cc 100644 --- a/test/hadamard_test.cc +++ b/test/hadamard_test.cc @@ -130,7 +130,7 @@ std::ostream &operator<<(std::ostream &os, const HadamardFuncWithSize &hfs) { class HadamardTestBase : public ::testing::TestWithParam { public: - virtual void SetUp() { + void SetUp() override { h_func_ = GetParam().func; bwh_ = GetParam().block_size; block_size_ = bwh_ * bwh_; @@ -252,7 +252,7 @@ class HadamardTestBase : public ::testing::TestWithParam { class HadamardLowbdTest : public HadamardTestBase { protected: // Use values between -255 (0xFF01) and 255 (0x00FF) - virtual int16_t Rand() { + int16_t Rand() override { int16_t src = rnd_.Rand8(); int16_t pred = rnd_.Rand8(); return src - pred; @@ -335,7 +335,7 @@ INSTANTIATE_TEST_SUITE_P( class HadamardHighbdTest : public HadamardTestBase { protected: // Use values between -4095 (0xF001) and 4095 (0x0FFF) - virtual int16_t Rand() { + int16_t Rand() override { int16_t src = rnd_.Rand12(); int16_t pred = rnd_.Rand12(); return src - pred; diff --git a/test/idct_test.cc b/test/idct_test.cc index 1b9532e1c1..279e58e2aa 100644 --- a/test/idct_test.cc +++ b/test/idct_test.cc @@ -27,7 +27,7 @@ using libvpx_test::Buffer; class IDCTTest : public ::testing::TestWithParam { protected: - virtual void SetUp() { + void SetUp() override { UUT = GetParam(); input = new Buffer(4, 4, 0); @@ -41,7 +41,7 @@ class IDCTTest : public ::testing::TestWithParam { ASSERT_TRUE(output->Init()); } - virtual void TearDown() { + void TearDown() override { delete input; delete predict; delete output; diff --git a/test/invalid_file_test.cc b/test/invalid_file_test.cc index 762d585f59..c37dc0d486 100644 --- a/test/invalid_file_test.cc +++ b/test/invalid_file_test.cc @@ -40,7 +40,7 @@ class InvalidFileTest : public ::libvpx_test::DecoderTest, protected: InvalidFileTest() : DecoderTest(GET_PARAM(0)), res_file_(nullptr) {} - virtual ~InvalidFileTest() { + ~InvalidFileTest() override { if (res_file_ != nullptr) fclose(res_file_); } @@ -50,10 +50,9 @@ class InvalidFileTest : public ::libvpx_test::DecoderTest, << "Result file open failed. Filename: " << res_file_name_; } - virtual bool HandleDecodeResult( - const vpx_codec_err_t res_dec, - const libvpx_test::CompressedVideoSource &video, - libvpx_test::Decoder *decoder) { + bool HandleDecodeResult(const vpx_codec_err_t res_dec, + const libvpx_test::CompressedVideoSource &video, + libvpx_test::Decoder *decoder) override { EXPECT_NE(res_file_, nullptr); int expected_res_dec; @@ -172,9 +171,9 @@ VP9_INSTANTIATE_TEST_SUITE(InvalidFileTest, class InvalidFileInvalidPeekTest : public InvalidFileTest { protected: InvalidFileInvalidPeekTest() : InvalidFileTest() {} - virtual void HandlePeekResult(libvpx_test::Decoder *const /*decoder*/, - libvpx_test::CompressedVideoSource * /*video*/, - const vpx_codec_err_t /*res_peek*/) {} + void HandlePeekResult(libvpx_test::Decoder *const /*decoder*/, + libvpx_test::CompressedVideoSource * /*video*/, + const vpx_codec_err_t /*res_peek*/) override {} }; TEST_P(InvalidFileInvalidPeekTest, ReturnCode) { RunTest(); } diff --git a/test/keyframe_test.cc b/test/keyframe_test.cc index a13dec9ce2..d624cb19d6 100644 --- a/test/keyframe_test.cc +++ b/test/keyframe_test.cc @@ -22,9 +22,9 @@ class KeyframeTest public ::libvpx_test::CodecTestWithParam { protected: KeyframeTest() : EncoderTest(GET_PARAM(0)) {} - virtual ~KeyframeTest() {} + ~KeyframeTest() override {} - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(GET_PARAM(1)); kf_count_ = 0; @@ -33,8 +33,8 @@ class KeyframeTest set_cpu_used_ = 0; } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { if (kf_do_force_kf_) { frame_flags_ = (video->frame() % 3) ? 0 : VPX_EFLAG_FORCE_KF; } @@ -43,7 +43,7 @@ class KeyframeTest } } - virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { if (pkt->data.frame.flags & VPX_FRAME_IS_KEY) { kf_pts_list_.push_back(pkt->data.frame.pts); kf_count_++; diff --git a/test/level_test.cc b/test/level_test.cc index 038d75f44f..8e653d93e1 100644 --- a/test/level_test.cc +++ b/test/level_test.cc @@ -22,9 +22,9 @@ class LevelTest : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)), cpu_used_(GET_PARAM(2)), min_gf_internal_(24), target_level_(0), level_(0) {} - virtual ~LevelTest() {} + ~LevelTest() override {} - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(encoding_mode_); if (encoding_mode_ != ::libvpx_test::kRealTime) { @@ -41,8 +41,8 @@ class LevelTest cfg_.rc_min_quantizer = 0; } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { encoder->Control(VP8E_SET_CPUUSED, cpu_used_); encoder->Control(VP9E_SET_TARGET_LEVEL, target_level_); diff --git a/test/lpf_test.cc b/test/lpf_test.cc index 4cc99a6db4..2f04194dce 100644 --- a/test/lpf_test.cc +++ b/test/lpf_test.cc @@ -129,15 +129,15 @@ uint8_t GetHevThresh(ACMRandom *rnd) { class Loop8Test6Param : public ::testing::TestWithParam { public: - virtual ~Loop8Test6Param() {} - virtual void SetUp() { + ~Loop8Test6Param() override {} + void SetUp() override { loopfilter_op_ = GET_PARAM(0); ref_loopfilter_op_ = GET_PARAM(1); bit_depth_ = GET_PARAM(2); mask_ = (1 << bit_depth_) - 1; } - virtual void TearDown() { libvpx_test::ClearSystemState(); } + void TearDown() override { libvpx_test::ClearSystemState(); } protected: int bit_depth_; @@ -151,15 +151,15 @@ GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(Loop8Test6Param); (HAVE_DSPR2 || HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH) class Loop8Test9Param : public ::testing::TestWithParam { public: - virtual ~Loop8Test9Param() {} - virtual void SetUp() { + ~Loop8Test9Param() override {} + void SetUp() override { loopfilter_op_ = GET_PARAM(0); ref_loopfilter_op_ = GET_PARAM(1); bit_depth_ = GET_PARAM(2); mask_ = (1 << bit_depth_) - 1; } - virtual void TearDown() { libvpx_test::ClearSystemState(); } + void TearDown() override { libvpx_test::ClearSystemState(); } protected: int bit_depth_; diff --git a/test/minmax_test.cc b/test/minmax_test.cc index e710af6991..b495709063 100644 --- a/test/minmax_test.cc +++ b/test/minmax_test.cc @@ -29,7 +29,7 @@ typedef void (*MinMaxFunc)(const uint8_t *a, int a_stride, const uint8_t *b, class MinMaxTest : public ::testing::TestWithParam { public: - virtual void SetUp() { + void SetUp() override { mm_func_ = GetParam(); rnd_.Reset(ACMRandom::DeterministicSeed()); } diff --git a/test/partial_idct_test.cc b/test/partial_idct_test.cc index b7c0c050af..6593ac68e9 100644 --- a/test/partial_idct_test.cc +++ b/test/partial_idct_test.cc @@ -59,8 +59,8 @@ const int kCountTestBlock = 1000; class PartialIDctTest : public ::testing::TestWithParam { public: - virtual ~PartialIDctTest() {} - virtual void SetUp() { + ~PartialIDctTest() override {} + void SetUp() override { rnd_.Reset(ACMRandom::DeterministicSeed()); fwd_txfm_ = GET_PARAM(0); full_inv_txfm_ = GET_PARAM(1); @@ -100,7 +100,7 @@ class PartialIDctTest : public ::testing::TestWithParam { vpx_memalign(16, pixel_size_ * output_block_size_)); } - virtual void TearDown() { + void TearDown() override { vpx_free(input_block_); input_block_ = nullptr; vpx_free(output_block_); diff --git a/test/pp_filter_test.cc b/test/pp_filter_test.cc index 27d5ffa907..d2db8a7c7d 100644 --- a/test/pp_filter_test.cc +++ b/test/pp_filter_test.cc @@ -51,10 +51,10 @@ class VpxPostProcDownAndAcrossMbRowTest public: VpxPostProcDownAndAcrossMbRowTest() : mb_post_proc_down_and_across_(GetParam()) {} - virtual void TearDown() { libvpx_test::ClearSystemState(); } + void TearDown() override { libvpx_test::ClearSystemState(); } protected: - virtual void Run(); + void Run() override; const VpxPostProcDownAndAcrossMbRowFunc mb_post_proc_down_and_across_; // Size of the underlying data block that will be filtered. @@ -227,10 +227,10 @@ class VpxMbPostProcAcrossIpTest VpxMbPostProcAcrossIpTest() : rows_(16), cols_(16), mb_post_proc_across_ip_(GetParam()), src_(Buffer(rows_, cols_, 8, 8, 17, 8)) {} - virtual void TearDown() { libvpx_test::ClearSystemState(); } + void TearDown() override { libvpx_test::ClearSystemState(); } protected: - virtual void Run(); + void Run() override; void SetCols(unsigned char *s, int rows, int cols, int src_width) { for (int r = 0; r < rows; r++) { @@ -356,10 +356,10 @@ class VpxMbPostProcDownTest : rows_(16), cols_(16), mb_post_proc_down_(GetParam()), src_c_(Buffer(rows_, cols_, 8, 8, 8, 17)) {} - virtual void TearDown() { libvpx_test::ClearSystemState(); } + void TearDown() override { libvpx_test::ClearSystemState(); } protected: - virtual void Run(); + void Run() override; void SetRows(unsigned char *src_c, int rows, int cols, int src_width) { for (int r = 0; r < rows; r++) { diff --git a/test/predict_test.cc b/test/predict_test.cc index 7472970576..fbf42077b3 100644 --- a/test/predict_test.cc +++ b/test/predict_test.cc @@ -43,7 +43,7 @@ class PredictTestBase : public AbstractBench, : width_(GET_PARAM(0)), height_(GET_PARAM(1)), predict_(GET_PARAM(2)), src_(nullptr), padded_dst_(nullptr), dst_(nullptr), dst_c_(nullptr) {} - virtual void SetUp() { + void SetUp() override { src_ = new uint8_t[kSrcSize]; ASSERT_NE(src_, nullptr); @@ -64,7 +64,7 @@ class PredictTestBase : public AbstractBench, memset(dst_c_, 0, 16 * 16); } - virtual void TearDown() { + void TearDown() override { delete[] src_; src_ = nullptr; vpx_free(padded_dst_); @@ -209,7 +209,7 @@ class PredictTestBase : public AbstractBench, } } - void Run() { + void Run() override { for (int xoffset = 0; xoffset < 8; ++xoffset) { for (int yoffset = 0; yoffset < 8; ++yoffset) { if (xoffset == 0 && yoffset == 0) { diff --git a/test/quantize_test.cc b/test/quantize_test.cc index 57309e8102..ab38f5c1b0 100644 --- a/test/quantize_test.cc +++ b/test/quantize_test.cc @@ -121,13 +121,13 @@ class QuantizeTest : public QuantizeTestBase, public ::testing::TestWithParam, public AbstractBench { protected: - virtual void SetUp() { + void SetUp() override { SetupCompressor(); asm_quant_ = GET_PARAM(0); c_quant_ = GET_PARAM(1); } - virtual void Run() { + void Run() override { asm_quant_(&vp8_comp_->mb.block[0], ¯oblockd_dst_->block[0]); } diff --git a/test/resize_test.cc b/test/resize_test.cc index 715bb9d70f..3eb842f549 100644 --- a/test/resize_test.cc +++ b/test/resize_test.cc @@ -247,10 +247,10 @@ class ResizingVideoSource : public ::libvpx_test::DummyVideoSource { } bool flag_codec_; bool smaller_width_larger_size_; - virtual ~ResizingVideoSource() {} + ~ResizingVideoSource() override {} protected: - virtual void Next() { + void Next() override { ++frame_; unsigned int width = 0; unsigned int height = 0; @@ -267,14 +267,14 @@ class ResizeTest protected: ResizeTest() : EncoderTest(GET_PARAM(0)) {} - virtual ~ResizeTest() {} + ~ResizeTest() override {} - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(GET_PARAM(1)); } - virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { ASSERT_NE(static_cast(pkt->data.frame.width[0]), 0); ASSERT_NE(static_cast(pkt->data.frame.height[0]), 0); encode_frame_width_.push_back(pkt->data.frame.width[0]); @@ -289,8 +289,8 @@ class ResizeTest return encode_frame_height_[idx]; } - virtual void DecompressedFrameHook(const vpx_image_t &img, - vpx_codec_pts_t pts) { + void DecompressedFrameHook(const vpx_image_t &img, + vpx_codec_pts_t pts) override { frame_info_list_.push_back(FrameInfo(pts, img.d_w, img.d_h)); } @@ -336,15 +336,15 @@ class ResizeInternalTest : public ResizeTest { ResizeInternalTest() : ResizeTest(), frame0_psnr_(0.0) {} #endif - virtual ~ResizeInternalTest() {} + ~ResizeInternalTest() override {} - virtual void BeginPassHook(unsigned int /*pass*/) { + void BeginPassHook(unsigned int /*pass*/) override { #if WRITE_COMPRESSED_STREAM outfile_ = fopen("vp90-2-05-resize.ivf", "wb"); #endif } - virtual void EndPassHook() { + void EndPassHook() override { #if WRITE_COMPRESSED_STREAM if (outfile_) { if (!fseek(outfile_, 0, SEEK_SET)) @@ -355,8 +355,8 @@ class ResizeInternalTest : public ResizeTest { #endif } - virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video, - libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(libvpx_test::VideoSource *video, + libvpx_test::Encoder *encoder) override { if (change_config_) { int new_q = 60; if (video->frame() == 0) { @@ -381,7 +381,7 @@ class ResizeInternalTest : public ResizeTest { } } - virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) { + void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) override { if (frame0_psnr_ == 0.) frame0_psnr_ = pkt->data.psnr.psnr[0]; EXPECT_NEAR(pkt->data.psnr.psnr[0], frame0_psnr_, 2.0); } @@ -450,10 +450,10 @@ class ResizeRealtimeTest public ::libvpx_test::CodecTestWith2Params { protected: ResizeRealtimeTest() : EncoderTest(GET_PARAM(0)) {} - virtual ~ResizeRealtimeTest() {} + ~ResizeRealtimeTest() override {} - virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video, - libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(libvpx_test::VideoSource *video, + libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { encoder->Control(VP9E_SET_AQ_MODE, 3); encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_); @@ -466,24 +466,24 @@ class ResizeRealtimeTest } } - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(GET_PARAM(1)); set_cpu_used_ = GET_PARAM(2); } - virtual void DecompressedFrameHook(const vpx_image_t &img, - vpx_codec_pts_t pts) { + void DecompressedFrameHook(const vpx_image_t &img, + vpx_codec_pts_t pts) override { frame_info_list_.push_back(FrameInfo(pts, img.d_w, img.d_h)); } - virtual void MismatchHook(const vpx_image_t *img1, const vpx_image_t *img2) { + void MismatchHook(const vpx_image_t *img1, const vpx_image_t *img2) override { double mismatch_psnr = compute_psnr(img1, img2); mismatch_psnr_ += mismatch_psnr; ++mismatch_nframes_; } - virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { ASSERT_NE(static_cast(pkt->data.frame.width[0]), 0); ASSERT_NE(static_cast(pkt->data.frame.height[0]), 0); encode_frame_width_.push_back(pkt->data.frame.width[0]); @@ -693,15 +693,15 @@ class ResizeCspTest : public ResizeTest { ResizeCspTest() : ResizeTest(), frame0_psnr_(0.0) {} #endif - virtual ~ResizeCspTest() {} + ~ResizeCspTest() override {} - virtual void BeginPassHook(unsigned int /*pass*/) { + void BeginPassHook(unsigned int /*pass*/) override { #if WRITE_COMPRESSED_STREAM outfile_ = fopen("vp91-2-05-cspchape.ivf", "wb"); #endif } - virtual void EndPassHook() { + void EndPassHook() override { #if WRITE_COMPRESSED_STREAM if (outfile_) { if (!fseek(outfile_, 0, SEEK_SET)) @@ -712,8 +712,8 @@ class ResizeCspTest : public ResizeTest { #endif } - virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video, - libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(libvpx_test::VideoSource *video, + libvpx_test::Encoder *encoder) override { if (CspForFrameNumber(video->frame()) != VPX_IMG_FMT_I420 && cfg_.g_profile != 1) { cfg_.g_profile = 1; @@ -726,7 +726,7 @@ class ResizeCspTest : public ResizeTest { } } - virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) { + void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) override { if (frame0_psnr_ == 0.) frame0_psnr_ = pkt->data.psnr.psnr[0]; EXPECT_NEAR(pkt->data.psnr.psnr[0], frame0_psnr_, 2.0); } @@ -758,10 +758,10 @@ class ResizingCspVideoSource : public ::libvpx_test::DummyVideoSource { limit_ = 30; } - virtual ~ResizingCspVideoSource() {} + ~ResizingCspVideoSource() override {} protected: - virtual void Next() { + void Next() override { ++frame_; SetImageFormat(CspForFrameNumber(frame_)); FillFrame(); diff --git a/test/sad_test.cc b/test/sad_test.cc index 92b3a14d68..83c4fe0c36 100644 --- a/test/sad_test.cc +++ b/test/sad_test.cc @@ -73,7 +73,7 @@ class SADTestBase : public ::testing::TestWithParam { public: explicit SADTestBase(const ParamType ¶ms) : params_(params) {} - virtual void SetUp() { + void SetUp() override { source_data8_ = reinterpret_cast( vpx_memalign(kDataAlignment, kDataBlockSize)); reference_data8_ = reinterpret_cast( @@ -108,7 +108,7 @@ class SADTestBase : public ::testing::TestWithParam { rnd_.Reset(ACMRandom::DeterministicSeed()); } - virtual void TearDown() { + void TearDown() override { vpx_free(source_data8_); source_data8_ = nullptr; vpx_free(reference_data8_); diff --git a/test/sum_squares_test.cc b/test/sum_squares_test.cc index df6da84037..1943acf8b3 100644 --- a/test/sum_squares_test.cc +++ b/test/sum_squares_test.cc @@ -33,13 +33,13 @@ typedef std::tuple SumSquaresParam; class SumSquaresTest : public ::testing::TestWithParam { public: - virtual ~SumSquaresTest() {} - virtual void SetUp() { + ~SumSquaresTest() override {} + void SetUp() override { ref_func_ = GET_PARAM(0); tst_func_ = GET_PARAM(1); } - virtual void TearDown() { libvpx_test::ClearSystemState(); } + void TearDown() override { libvpx_test::ClearSystemState(); } protected: SSI16Func ref_func_; diff --git a/test/superframe_test.cc b/test/superframe_test.cc index a5c92e9147..4a522dd496 100644 --- a/test/superframe_test.cc +++ b/test/superframe_test.cc @@ -28,9 +28,9 @@ class SuperframeTest protected: SuperframeTest() : EncoderTest(GET_PARAM(0)), modified_buf_(nullptr), last_sf_pts_(0) {} - virtual ~SuperframeTest() {} + ~SuperframeTest() override {} - virtual void SetUp() { + void SetUp() override { InitializeConfig(); const SuperframeTestParam input = GET_PARAM(1); const libvpx_test::TestMode mode = std::get(input); @@ -39,17 +39,17 @@ class SuperframeTest sf_count_max_ = INT_MAX; } - virtual void TearDown() { delete[] modified_buf_; } + void TearDown() override { delete[] modified_buf_; } - virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video, - libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(libvpx_test::VideoSource *video, + libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1); } } - virtual const vpx_codec_cx_pkt_t *MutateEncoderOutputHook( - const vpx_codec_cx_pkt_t *pkt) { + const vpx_codec_cx_pkt_t *MutateEncoderOutputHook( + const vpx_codec_cx_pkt_t *pkt) override { if (pkt->kind != VPX_CODEC_CX_FRAME_PKT) return pkt; const uint8_t *buffer = reinterpret_cast(pkt->data.frame.buf); diff --git a/test/svc_datarate_test.cc b/test/svc_datarate_test.cc index d571f50860..723538368f 100644 --- a/test/svc_datarate_test.cc +++ b/test/svc_datarate_test.cc @@ -43,7 +43,7 @@ class DatarateOnePassCbrSvc : public OnePassCbrSvc { } protected: - virtual ~DatarateOnePassCbrSvc() {} + ~DatarateOnePassCbrSvc() override {} virtual void ResetModel() { last_pts_ = 0; @@ -86,7 +86,7 @@ class DatarateOnePassCbrSvc : public OnePassCbrSvc { } ksvc_flex_noupd_tlenh_ = false; } - virtual void BeginPassHook(unsigned int /*pass*/) {} + void BeginPassHook(unsigned int /*pass*/) override {} // Example pattern for spatial layers and 2 temporal layers used in the // bypass/flexible mode. The pattern corresponds to the pattern @@ -179,8 +179,8 @@ class DatarateOnePassCbrSvc : public OnePassCbrSvc { } } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { PreEncodeFrameHookSetup(video, encoder); if (video->frame() == 0) { @@ -468,7 +468,7 @@ class DatarateOnePassCbrSvc : public OnePassCbrSvc { return VPX_CODEC_OK; } - virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { uint32_t sizes[8] = { 0 }; uint32_t sizes_parsed[8] = { 0 }; int count = 0; @@ -571,7 +571,7 @@ class DatarateOnePassCbrSvc : public OnePassCbrSvc { } } - virtual void EndPassHook() { + void EndPassHook() override { if (change_bitrate_) last_pts_ = last_pts_ - last_pts_ref_; duration_ = (last_pts_ + 1) * timebase_; for (int sl = 0; sl < number_spatial_layers_; ++sl) { @@ -583,7 +583,7 @@ class DatarateOnePassCbrSvc : public OnePassCbrSvc { } } - virtual void MismatchHook(const vpx_image_t *img1, const vpx_image_t *img2) { + void MismatchHook(const vpx_image_t *img1, const vpx_image_t *img2) override { // TODO(marpan): Look into why an assert is triggered in compute_psnr // for mismatch frames for the special test case: ksvc_flex_noupd_tlenh. // Has to do with dropped frames in bypass/flexible svc mode. @@ -639,7 +639,7 @@ class DatarateOnePassCbrSvc : public OnePassCbrSvc { bool ksvc_flex_noupd_tlenh_; private: - virtual void SetConfig(const int num_temporal_layer) { + void SetConfig(const int num_temporal_layer) override { cfg_.rc_end_usage = VPX_CBR; cfg_.g_lag_in_frames = 0; cfg_.g_error_resilient = 1; @@ -670,10 +670,10 @@ class DatarateOnePassCbrSvcSingleBR DatarateOnePassCbrSvcSingleBR() : DatarateOnePassCbrSvc(GET_PARAM(0)) { memset(&svc_params_, 0, sizeof(svc_params_)); } - virtual ~DatarateOnePassCbrSvcSingleBR() {} + ~DatarateOnePassCbrSvcSingleBR() override {} protected: - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(::libvpx_test::kRealTime); speed_setting_ = GET_PARAM(1); @@ -1160,10 +1160,10 @@ class DatarateOnePassCbrSvcMultiBR DatarateOnePassCbrSvcMultiBR() : DatarateOnePassCbrSvc(GET_PARAM(0)) { memset(&svc_params_, 0, sizeof(svc_params_)); } - virtual ~DatarateOnePassCbrSvcMultiBR() {} + ~DatarateOnePassCbrSvcMultiBR() override {} protected: - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(::libvpx_test::kRealTime); speed_setting_ = GET_PARAM(1); @@ -1243,10 +1243,10 @@ class DatarateOnePassCbrSvcFrameDropMultiBR : DatarateOnePassCbrSvc(GET_PARAM(0)) { memset(&svc_params_, 0, sizeof(svc_params_)); } - virtual ~DatarateOnePassCbrSvcFrameDropMultiBR() {} + ~DatarateOnePassCbrSvcFrameDropMultiBR() override {} protected: - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(::libvpx_test::kRealTime); speed_setting_ = GET_PARAM(1); @@ -1355,10 +1355,10 @@ class DatarateOnePassCbrSvcInterLayerPredSingleBR : DatarateOnePassCbrSvc(GET_PARAM(0)) { memset(&svc_params_, 0, sizeof(svc_params_)); } - virtual ~DatarateOnePassCbrSvcInterLayerPredSingleBR() {} + ~DatarateOnePassCbrSvcInterLayerPredSingleBR() override {} protected: - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(::libvpx_test::kRealTime); speed_setting_ = GET_PARAM(1); @@ -1441,10 +1441,10 @@ class DatarateOnePassCbrSvcDenoiser DatarateOnePassCbrSvcDenoiser() : DatarateOnePassCbrSvc(GET_PARAM(0)) { memset(&svc_params_, 0, sizeof(svc_params_)); } - virtual ~DatarateOnePassCbrSvcDenoiser() {} + ~DatarateOnePassCbrSvcDenoiser() override {} protected: - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(::libvpx_test::kRealTime); speed_setting_ = GET_PARAM(1); @@ -1499,10 +1499,10 @@ class DatarateOnePassCbrSvcSmallKF DatarateOnePassCbrSvcSmallKF() : DatarateOnePassCbrSvc(GET_PARAM(0)) { memset(&svc_params_, 0, sizeof(svc_params_)); } - virtual ~DatarateOnePassCbrSvcSmallKF() {} + ~DatarateOnePassCbrSvcSmallKF() override {} protected: - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(::libvpx_test::kRealTime); speed_setting_ = GET_PARAM(1); @@ -1702,10 +1702,10 @@ class DatarateOnePassCbrSvcPostencodeDrop DatarateOnePassCbrSvcPostencodeDrop() : DatarateOnePassCbrSvc(GET_PARAM(0)) { memset(&svc_params_, 0, sizeof(svc_params_)); } - virtual ~DatarateOnePassCbrSvcPostencodeDrop() {} + ~DatarateOnePassCbrSvcPostencodeDrop() override {} protected: - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(::libvpx_test::kRealTime); speed_setting_ = GET_PARAM(1); diff --git a/test/svc_end_to_end_test.cc b/test/svc_end_to_end_test.cc index 7300ce6679..c9ef35bbe6 100644 --- a/test/svc_end_to_end_test.cc +++ b/test/svc_end_to_end_test.cc @@ -45,19 +45,19 @@ class ScalePartitionOnePassCbrSvc } protected: - virtual ~ScalePartitionOnePassCbrSvc() {} + ~ScalePartitionOnePassCbrSvc() override {} - virtual void SetUp() { + void SetUp() override { InitializeConfig(); speed_setting_ = 7; } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { PreEncodeFrameHookSetup(video, encoder); } - virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { // Keep track of number of non-reference frames, needed for mismatch check. // Non-reference frames are top spatial and temporal layer frames, // for TL > 0. @@ -67,12 +67,12 @@ class ScalePartitionOnePassCbrSvc num_nonref_frames_++; } - virtual void MismatchHook(const vpx_image_t * /*img1*/, - const vpx_image_t * /*img2*/) { + void MismatchHook(const vpx_image_t * /*img1*/, + const vpx_image_t * /*img2*/) override { ++mismatch_nframes_; } - virtual void SetConfig(const int /*num_temporal_layer*/) {} + void SetConfig(const int /*num_temporal_layer*/) override {} unsigned int GetMismatchFrames() const { return mismatch_nframes_; } unsigned int GetNonRefFrames() const { return num_nonref_frames_; } @@ -129,14 +129,14 @@ class SyncFrameOnePassCbrSvc : public OnePassCbrSvc, } protected: - virtual ~SyncFrameOnePassCbrSvc() {} + ~SyncFrameOnePassCbrSvc() override {} - virtual void SetUp() { + void SetUp() override { InitializeConfig(); speed_setting_ = 7; } - virtual bool DoDecode() const { + bool DoDecode() const override { return current_video_frame_ >= frame_to_start_decode_; } @@ -225,8 +225,8 @@ class SyncFrameOnePassCbrSvc : public OnePassCbrSvc, } } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { current_video_frame_ = video->frame(); PreEncodeFrameHookSetup(video, encoder); if (video->frame() == 0) { @@ -265,8 +265,8 @@ class SyncFrameOnePassCbrSvc : public OnePassCbrSvc, } #if CONFIG_VP9_DECODER - virtual void PreDecodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Decoder *decoder) { + void PreDecodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Decoder *decoder) override { if (video->frame() < frame_to_sync_) { if (decode_to_layer_before_sync_ >= 0) decoder->Control(VP9_DECODE_SVC_SPATIAL_LAYER, @@ -284,7 +284,7 @@ class SyncFrameOnePassCbrSvc : public OnePassCbrSvc, } #endif - virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { // Keep track of number of non-reference frames, needed for mismatch check. // Non-reference frames are top spatial and temporal layer frames, // for TL > 0. @@ -307,8 +307,8 @@ class SyncFrameOnePassCbrSvc : public OnePassCbrSvc, } } - virtual void MismatchHook(const vpx_image_t * /*img1*/, - const vpx_image_t * /*img2*/) { + void MismatchHook(const vpx_image_t * /*img1*/, + const vpx_image_t * /*img2*/) override { if (current_video_frame_ >= frame_to_sync_) ++mismatch_nframes_; } @@ -331,7 +331,7 @@ class SyncFrameOnePassCbrSvc : public OnePassCbrSvc, vpx_svc_ref_frame_config_t ref_frame_config_; private: - virtual void SetConfig(const int num_temporal_layer) { + void SetConfig(const int num_temporal_layer) override { cfg_.rc_buf_initial_sz = 500; cfg_.rc_buf_optimal_sz = 500; cfg_.rc_buf_sz = 1000; @@ -657,15 +657,15 @@ class LoopfilterOnePassCbrSvc : public OnePassCbrSvc, } protected: - virtual ~LoopfilterOnePassCbrSvc() {} + ~LoopfilterOnePassCbrSvc() override {} - virtual void SetUp() { + void SetUp() override { InitializeConfig(); speed_setting_ = 7; } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { PreEncodeFrameHookSetup(video, encoder); if (number_temporal_layers_ > 1 || number_spatial_layers_ > 1) { // Consider 3 cases: @@ -694,7 +694,7 @@ class LoopfilterOnePassCbrSvc : public OnePassCbrSvc, } } - virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { // Keep track of number of non-reference frames, needed for mismatch check. // Non-reference frames are top spatial and temporal layer frames, // for TL > 0. @@ -704,12 +704,12 @@ class LoopfilterOnePassCbrSvc : public OnePassCbrSvc, num_nonref_frames_++; } - virtual void MismatchHook(const vpx_image_t * /*img1*/, - const vpx_image_t * /*img2*/) { + void MismatchHook(const vpx_image_t * /*img1*/, + const vpx_image_t * /*img2*/) override { ++mismatch_nframes_; } - virtual void SetConfig(const int /*num_temporal_layer*/) {} + void SetConfig(const int /*num_temporal_layer*/) override {} int GetMismatchFrames() const { return mismatch_nframes_; } int GetNonRefFrames() const { return num_nonref_frames_; } diff --git a/test/test_vector_test.cc b/test/test_vector_test.cc index ca990f4dd4..ee552113ce 100644 --- a/test/test_vector_test.cc +++ b/test/test_vector_test.cc @@ -48,7 +48,7 @@ class TestVectorTest : public ::libvpx_test::DecoderTest, #endif } - virtual ~TestVectorTest() { + ~TestVectorTest() override { if (md5_file_) fclose(md5_file_); } @@ -59,9 +59,8 @@ class TestVectorTest : public ::libvpx_test::DecoderTest, } #if CONFIG_VP9_DECODER - virtual void PreDecodeFrameHook( - const libvpx_test::CompressedVideoSource &video, - libvpx_test::Decoder *decoder) { + void PreDecodeFrameHook(const libvpx_test::CompressedVideoSource &video, + libvpx_test::Decoder *decoder) override { if (video.frame_number() == 0 && mt_mode_ >= 0) { if (mt_mode_ == 1) { decoder->Control(VP9D_SET_LOOP_FILTER_OPT, 1); @@ -77,8 +76,8 @@ class TestVectorTest : public ::libvpx_test::DecoderTest, } #endif - virtual void DecompressedFrameHook(const vpx_image_t &img, - const unsigned int frame_number) { + void DecompressedFrameHook(const vpx_image_t &img, + const unsigned int frame_number) override { ASSERT_NE(md5_file_, nullptr); char expected_md5[33]; char junk[128]; diff --git a/test/tile_independence_test.cc b/test/tile_independence_test.cc index d92c13f88e..dab6e531b7 100644 --- a/test/tile_independence_test.cc +++ b/test/tile_independence_test.cc @@ -36,18 +36,18 @@ class TileIndependenceTest : public ::libvpx_test::EncoderTest, inv_dec_->Control(VP9_INVERT_TILE_DECODE_ORDER, 1); } - virtual ~TileIndependenceTest() { + ~TileIndependenceTest() override { delete fw_dec_; delete inv_dec_; } - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(libvpx_test::kTwoPassGood); } - virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video, - libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(libvpx_test::VideoSource *video, + libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { encoder->Control(VP9E_SET_TILE_COLUMNS, n_tiles_); } @@ -65,7 +65,7 @@ class TileIndependenceTest : public ::libvpx_test::EncoderTest, md5->Add(img); } - virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { UpdateMD5(fw_dec_, pkt, &md5_fw_order_); UpdateMD5(inv_dec_, pkt, &md5_inv_order_); } diff --git a/test/timestamp_test.cc b/test/timestamp_test.cc index 645a9f2ff8..d6f0b3bda2 100644 --- a/test/timestamp_test.cc +++ b/test/timestamp_test.cc @@ -42,16 +42,16 @@ class DummyTimebaseVideoSource : public ::libvpx_test::DummyVideoSource { (static_cast(framerate_numerator_) / framerate_denominator_); } - virtual vpx_codec_pts_t pts() const { + vpx_codec_pts_t pts() const override { return static_cast(frame_ * FrameDuration() + starting_pts_ + 0.5); } - virtual unsigned long duration() const { + unsigned long duration() const override { return static_cast(FrameDuration() + 0.5); } - virtual vpx_rational_t timebase() const { return timebase_; } + vpx_rational_t timebase() const override { return timebase_; } void set_starting_pts(int64_t starting_pts) { starting_pts_ = starting_pts; } @@ -67,9 +67,9 @@ class TimestampTest public ::libvpx_test::CodecTestWithParam { protected: TimestampTest() : EncoderTest(GET_PARAM(0)) {} - virtual ~TimestampTest() {} + ~TimestampTest() override {} - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(GET_PARAM(1)); } diff --git a/test/variance_test.cc b/test/variance_test.cc index df9a1c56f6..6885252b82 100644 --- a/test/variance_test.cc +++ b/test/variance_test.cc @@ -210,7 +210,7 @@ class SumOfSquaresTest : public ::testing::TestWithParam { public: SumOfSquaresTest() : func_(GetParam()) {} - virtual ~SumOfSquaresTest() { libvpx_test::ClearSystemState(); } + ~SumOfSquaresTest() override { libvpx_test::ClearSystemState(); } protected: void ConstTest(); @@ -289,7 +289,7 @@ template class MainTestClass : public ::testing::TestWithParam > { public: - virtual void SetUp() { + void SetUp() override { params_ = this->GetParam(); rnd_.Reset(ACMRandom::DeterministicSeed()); @@ -308,7 +308,7 @@ class MainTestClass #endif } - virtual void TearDown() { + void TearDown() override { #if CONFIG_VP9_HIGHBITDEPTH if (use_high_bit_depth()) { // TODO(skal): remove! @@ -568,7 +568,7 @@ template class SubpelVarianceTest : public ::testing::TestWithParam > { public: - virtual void SetUp() { + void SetUp() override { params_ = this->GetParam(); rnd_.Reset(ACMRandom::DeterministicSeed()); @@ -592,7 +592,7 @@ class SubpelVarianceTest ASSERT_NE(ref_, nullptr); } - virtual void TearDown() { + void TearDown() override { if (!use_high_bit_depth()) { vpx_free(src_); vpx_free(sec_); diff --git a/test/vp8_datarate_test.cc b/test/vp8_datarate_test.cc index 64a861d15e..c91c2a0d26 100644 --- a/test/vp8_datarate_test.cc +++ b/test/vp8_datarate_test.cc @@ -24,10 +24,10 @@ class DatarateTestLarge public: DatarateTestLarge() : EncoderTest(GET_PARAM(0)) {} - virtual ~DatarateTestLarge() {} + ~DatarateTestLarge() override {} protected: - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(GET_PARAM(1)); set_cpu_used_ = GET_PARAM(2); @@ -47,8 +47,8 @@ class DatarateTestLarge use_roi_ = false; } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { encoder->Control(VP8E_SET_NOISE_SENSITIVITY, denoiser_on_); encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_); @@ -74,7 +74,7 @@ class DatarateTestLarge duration_ = 0; } - virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { // Time since last timestamp = duration. vpx_codec_pts_t duration = pkt->data.frame.pts - last_pts_; @@ -121,7 +121,7 @@ class DatarateTestLarge ++frame_number_; } - virtual void EndPassHook() { + void EndPassHook() override { if (bits_total_) { const double file_size_in_kb = bits_total_ / 1000.; // bits per kilobit @@ -301,7 +301,7 @@ TEST_P(DatarateTestLarge, DropFramesMultiThreads) { class DatarateTestRealTime : public DatarateTestLarge { public: - virtual ~DatarateTestRealTime() {} + ~DatarateTestRealTime() override {} }; #if CONFIG_TEMPORAL_DENOISING diff --git a/test/vp8_denoiser_sse2_test.cc b/test/vp8_denoiser_sse2_test.cc index 8cb84ddd8e..a6d414e508 100644 --- a/test/vp8_denoiser_sse2_test.cc +++ b/test/vp8_denoiser_sse2_test.cc @@ -30,11 +30,11 @@ namespace { const int kNumPixels = 16 * 16; class VP8DenoiserTest : public ::testing::TestWithParam { public: - virtual ~VP8DenoiserTest() {} + ~VP8DenoiserTest() override {} - virtual void SetUp() { increase_denoising_ = GetParam(); } + void SetUp() override { increase_denoising_ = GetParam(); } - virtual void TearDown() { libvpx_test::ClearSystemState(); } + void TearDown() override { libvpx_test::ClearSystemState(); } protected: int increase_denoising_; diff --git a/test/vp8_fdct4x4_test.cc b/test/vp8_fdct4x4_test.cc index 1b73a72a01..66d5c151c5 100644 --- a/test/vp8_fdct4x4_test.cc +++ b/test/vp8_fdct4x4_test.cc @@ -74,7 +74,7 @@ using libvpx_test::ACMRandom; class FdctTest : public ::testing::TestWithParam { public: - virtual void SetUp() { + void SetUp() override { fdct_func_ = GetParam(); rnd_.Reset(ACMRandom::DeterministicSeed()); } diff --git a/test/vp8_fragments_test.cc b/test/vp8_fragments_test.cc index 6e5baf229d..bb527966c5 100644 --- a/test/vp8_fragments_test.cc +++ b/test/vp8_fragments_test.cc @@ -17,9 +17,9 @@ class VP8FragmentsTest : public ::libvpx_test::EncoderTest, public ::testing::Test { protected: VP8FragmentsTest() : EncoderTest(&::libvpx_test::kVP8) {} - virtual ~VP8FragmentsTest() {} + ~VP8FragmentsTest() override {} - virtual void SetUp() { + void SetUp() override { const unsigned long init_flags = // NOLINT(runtime/int) VPX_CODEC_USE_OUTPUT_PARTITION; InitializeConfig(); diff --git a/test/vp8_ratectrl_rtc_test.cc b/test/vp8_ratectrl_rtc_test.cc index b76bcae11d..70d73c52d6 100644 --- a/test/vp8_ratectrl_rtc_test.cc +++ b/test/vp8_ratectrl_rtc_test.cc @@ -53,10 +53,10 @@ class Vp8RcInterfaceTest public: Vp8RcInterfaceTest() : EncoderTest(GET_PARAM(0)), key_interval_(3000), encoder_exit_(false) {} - virtual ~Vp8RcInterfaceTest() {} + ~Vp8RcInterfaceTest() override {} protected: - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(::libvpx_test::kRealTime); } @@ -111,8 +111,8 @@ class Vp8RcInterfaceTest return layer_id; } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { if (rc_cfg_.ts_number_layers > 1) { const int layer_id = SetLayerId(video->frame(), cfg_.ts_number_layers); const int frame_flags = @@ -139,7 +139,7 @@ class Vp8RcInterfaceTest encoder_exit_ = video->frame() == test_video_.frames; } - virtual void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) { + void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) override { if (encoder_exit_) { return; } @@ -149,7 +149,7 @@ class Vp8RcInterfaceTest ASSERT_EQ(rc_api_->GetQP(), qp); } - virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { rc_api_->PostEncodeUpdate(pkt->data.frame.sz); } diff --git a/test/vp9_arf_freq_test.cc b/test/vp9_arf_freq_test.cc index c7e6f1af02..7cc9a28396 100644 --- a/test/vp9_arf_freq_test.cc +++ b/test/vp9_arf_freq_test.cc @@ -86,9 +86,9 @@ class ArfFreqTest : EncoderTest(GET_PARAM(0)), test_video_param_(GET_PARAM(1)), test_encode_param_(GET_PARAM(2)), min_arf_requested_(GET_PARAM(3)) {} - virtual ~ArfFreqTest() {} + ~ArfFreqTest() override {} - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(test_encode_param_.mode); if (test_encode_param_.mode != ::libvpx_test::kRealTime) { @@ -104,7 +104,7 @@ class ArfFreqTest dec_cfg_.threads = 4; } - virtual void BeginPassHook(unsigned int) { + void BeginPassHook(unsigned int) override { min_run_ = ARF_NOT_SEEN; run_of_visible_frames_ = 0; } @@ -126,7 +126,7 @@ class ArfFreqTest return frames; } - virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { if (pkt->kind != VPX_CODEC_CX_FRAME_PKT) return; const int frames = GetNumFramesInPkt(pkt); if (frames == 1) { @@ -145,8 +145,8 @@ class ArfFreqTest } } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { encoder->Control(VP9E_SET_FRAME_PARALLEL_DECODING, 1); encoder->Control(VP9E_SET_TILE_COLUMNS, 4); diff --git a/test/vp9_block_error_test.cc b/test/vp9_block_error_test.cc index 9e9595ebe9..4ff1838c34 100644 --- a/test/vp9_block_error_test.cc +++ b/test/vp9_block_error_test.cc @@ -53,14 +53,14 @@ int64_t BlockError8BitWrapper(const tran_low_t *coeff, class BlockErrorTest : public ::testing::TestWithParam { public: - virtual ~BlockErrorTest() {} - virtual void SetUp() { + ~BlockErrorTest() override {} + void SetUp() override { error_block_op_ = GET_PARAM(0); ref_error_block_op_ = GET_PARAM(1); bit_depth_ = GET_PARAM(2); } - virtual void TearDown() { libvpx_test::ClearSystemState(); } + void TearDown() override { libvpx_test::ClearSystemState(); } protected: vpx_bit_depth_t bit_depth_; diff --git a/test/vp9_datarate_test.cc b/test/vp9_datarate_test.cc index 7e91807492..48480d6fa1 100644 --- a/test/vp9_datarate_test.cc +++ b/test/vp9_datarate_test.cc @@ -28,7 +28,7 @@ class DatarateTestVP9 : public ::libvpx_test::EncoderTest { } protected: - virtual ~DatarateTestVP9() {} + ~DatarateTestVP9() override {} virtual void ResetModel() { last_pts_ = 0; @@ -113,8 +113,8 @@ class DatarateTestVP9 : public ::libvpx_test::EncoderTest { return layer_id; } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_); encoder->Control(VP9E_SET_AQ_MODE, aq_mode_); @@ -164,7 +164,7 @@ class DatarateTestVP9 : public ::libvpx_test::EncoderTest { duration_ = 0; } - virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { // Time since last timestamp = duration. vpx_codec_pts_t duration = pkt->data.frame.pts - last_pts_; @@ -202,7 +202,7 @@ class DatarateTestVP9 : public ::libvpx_test::EncoderTest { ++tot_frame_number_; } - virtual void EndPassHook() { + void EndPassHook() override { for (int layer = 0; layer < static_cast(cfg_.ts_number_layers); ++layer) { duration_ = (last_pts_ + 1) * timebase_; @@ -243,7 +243,7 @@ class DatarateTestVP9RealTimeMultiBR DatarateTestVP9RealTimeMultiBR() : DatarateTestVP9(GET_PARAM(0)) {} protected: - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(::libvpx_test::kRealTime); set_cpu_used_ = GET_PARAM(1); @@ -259,7 +259,7 @@ class DatarateTestVP9LargeVBR DatarateTestVP9LargeVBR() : DatarateTestVP9(GET_PARAM(0)) {} protected: - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(::libvpx_test::kRealTime); set_cpu_used_ = GET_PARAM(1); @@ -579,10 +579,10 @@ class DatarateTestVP9RealTime : public DatarateTestVP9, public ::libvpx_test::CodecTestWithParam { public: DatarateTestVP9RealTime() : DatarateTestVP9(GET_PARAM(0)) {} - virtual ~DatarateTestVP9RealTime() {} + ~DatarateTestVP9RealTime() override {} protected: - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(::libvpx_test::kRealTime); set_cpu_used_ = GET_PARAM(1); @@ -731,10 +731,10 @@ class DatarateTestVP9RealTimeDeltaQUV public ::libvpx_test::CodecTestWith2Params { public: DatarateTestVP9RealTimeDeltaQUV() : DatarateTestVP9(GET_PARAM(0)) {} - virtual ~DatarateTestVP9RealTimeDeltaQUV() {} + ~DatarateTestVP9RealTimeDeltaQUV() override {} protected: - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(::libvpx_test::kRealTime); set_cpu_used_ = GET_PARAM(1); @@ -779,7 +779,7 @@ class DatarateTestVP9PostEncodeDrop DatarateTestVP9PostEncodeDrop() : DatarateTestVP9(GET_PARAM(0)) {} protected: - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(::libvpx_test::kRealTime); set_cpu_used_ = GET_PARAM(1); @@ -819,17 +819,17 @@ class DatarateTestVP9FrameQp public ::testing::TestWithParam { public: DatarateTestVP9FrameQp() : DatarateTestVP9(GetParam()), frame_(0) {} - virtual ~DatarateTestVP9FrameQp() {} + ~DatarateTestVP9FrameQp() override {} protected: - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(::libvpx_test::kRealTime); ResetModel(); } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { set_cpu_used_ = 7; DatarateTestVP9::PreEncodeFrameHook(video, encoder); frame_qp_ = static_cast(rnd_.RandRange(64)); @@ -837,7 +837,7 @@ class DatarateTestVP9FrameQp frame_++; } - virtual void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) { + void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) override { int qp = 0; vpx_svc_layer_id_t layer_id; if (frame_ >= total_frame_) return; @@ -847,8 +847,8 @@ class DatarateTestVP9FrameQp temporal_layer_id_ = layer_id.temporal_layer_id; } - virtual void MismatchHook(const vpx_image_t * /*img1*/, - const vpx_image_t * /*img2*/) { + void MismatchHook(const vpx_image_t * /*img1*/, + const vpx_image_t * /*img2*/) override { if (frame_ >= total_frame_) return; ASSERT_TRUE(cfg_.temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_0212 && @@ -945,7 +945,7 @@ TEST_P(DatarateTestVP9FrameQp, VP9SetFrameQp3TemporalLayersFixedMode) { // Params: speed setting. class DatarateTestVP9RealTimeDenoiser : public DatarateTestVP9RealTime { public: - virtual ~DatarateTestVP9RealTimeDenoiser() {} + ~DatarateTestVP9RealTimeDenoiser() override {} }; // Check basic datarate targeting, for a single bitrate, when denoiser is on. diff --git a/test/vp9_denoiser_test.cc b/test/vp9_denoiser_test.cc index d884b7eb92..1f679d5bd8 100644 --- a/test/vp9_denoiser_test.cc +++ b/test/vp9_denoiser_test.cc @@ -42,11 +42,11 @@ class VP9DenoiserTest : public ::testing::Test, public ::testing::WithParamInterface { public: - virtual ~VP9DenoiserTest() {} + ~VP9DenoiserTest() override {} - virtual void SetUp() { bs_ = GET_PARAM(1); } + void SetUp() override { bs_ = GET_PARAM(1); } - virtual void TearDown() { libvpx_test::ClearSystemState(); } + void TearDown() override { libvpx_test::ClearSystemState(); } protected: BLOCK_SIZE bs_; diff --git a/test/vp9_encoder_parms_get_to_decoder.cc b/test/vp9_encoder_parms_get_to_decoder.cc index ce2198c592..1f9929f880 100644 --- a/test/vp9_encoder_parms_get_to_decoder.cc +++ b/test/vp9_encoder_parms_get_to_decoder.cc @@ -62,9 +62,9 @@ class VpxEncoderParmsGetToDecoder VpxEncoderParmsGetToDecoder() : EncoderTest(GET_PARAM(0)), encode_parms(GET_PARAM(1)) {} - virtual ~VpxEncoderParmsGetToDecoder() {} + ~VpxEncoderParmsGetToDecoder() override {} - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(::libvpx_test::kTwoPassGood); cfg_.g_lag_in_frames = 25; @@ -74,8 +74,8 @@ class VpxEncoderParmsGetToDecoder cfg_.rc_target_bitrate = test_video_.bitrate; } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { encoder->Control(VP9E_SET_COLOR_SPACE, encode_parms.cs); encoder->Control(VP9E_SET_COLOR_RANGE, encode_parms.color_range); @@ -95,9 +95,9 @@ class VpxEncoderParmsGetToDecoder } } - virtual bool HandleDecodeResult(const vpx_codec_err_t res_dec, - const libvpx_test::VideoSource & /*video*/, - libvpx_test::Decoder *decoder) { + bool HandleDecodeResult(const vpx_codec_err_t res_dec, + const libvpx_test::VideoSource & /*video*/, + libvpx_test::Decoder *decoder) override { vpx_codec_ctx_t *const vp9_decoder = decoder->GetDecoder(); vpx_codec_alg_priv_t *const priv = reinterpret_cast(vp9_decoder->priv); diff --git a/test/vp9_end_to_end_test.cc b/test/vp9_end_to_end_test.cc index 7a85db26a4..d4c0b0dd11 100644 --- a/test/vp9_end_to_end_test.cc +++ b/test/vp9_end_to_end_test.cc @@ -89,9 +89,9 @@ class EndToEndTestAdaptiveRDThresh : EncoderTest(GET_PARAM(0)), cpu_used_start_(GET_PARAM(1)), cpu_used_end_(GET_PARAM(2)) {} - virtual ~EndToEndTestAdaptiveRDThresh() {} + ~EndToEndTestAdaptiveRDThresh() override {} - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(::libvpx_test::kRealTime); cfg_.g_lag_in_frames = 0; @@ -102,8 +102,8 @@ class EndToEndTestAdaptiveRDThresh dec_cfg_.threads = 4; } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { encoder->Control(VP8E_SET_CPUUSED, cpu_used_start_); encoder->Control(VP9E_SET_ROW_MT, 1); @@ -131,9 +131,9 @@ class EndToEndTestLarge denoiser_on_ = 0; } - virtual ~EndToEndTestLarge() {} + ~EndToEndTestLarge() override {} - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(encoding_mode_); if (encoding_mode_ != ::libvpx_test::kRealTime) { @@ -149,18 +149,18 @@ class EndToEndTestLarge dec_cfg_.threads = 4; } - virtual void BeginPassHook(unsigned int) { + void BeginPassHook(unsigned int) override { psnr_ = 0.0; nframes_ = 0; } - virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) { + void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) override { psnr_ += pkt->data.psnr.psnr[0]; nframes_++; } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { encoder->Control(VP9E_SET_FRAME_PARALLEL_DECODING, 1); encoder->Control(VP9E_SET_TILE_COLUMNS, 4); @@ -207,9 +207,9 @@ class EndToEndTestLoopFilterThreading EndToEndTestLoopFilterThreading() : EncoderTest(GET_PARAM(0)), use_loop_filter_opt_(GET_PARAM(1)) {} - virtual ~EndToEndTestLoopFilterThreading() {} + ~EndToEndTestLoopFilterThreading() override {} - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(::libvpx_test::kRealTime); cfg_.g_threads = 2; @@ -221,16 +221,16 @@ class EndToEndTestLoopFilterThreading dec_cfg_.threads = GET_PARAM(2); } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { encoder->Control(VP8E_SET_CPUUSED, 8); } encoder->Control(VP9E_SET_TILE_COLUMNS, 4 - video->frame() % 5); } - virtual void PreDecodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Decoder *decoder) { + void PreDecodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Decoder *decoder) override { if (video->frame() == 0) { decoder->Control(VP9D_SET_LOOP_FILTER_OPT, use_loop_filter_opt_ ? 1 : 0); } diff --git a/test/vp9_ethread_test.cc b/test/vp9_ethread_test.cc index 54fa6c48e2..27a73b2aa5 100644 --- a/test/vp9_ethread_test.cc +++ b/test/vp9_ethread_test.cc @@ -44,9 +44,9 @@ class VPxFirstPassEncoderThreadTest firstpass_stats_.buf = nullptr; firstpass_stats_.sz = 0; } - virtual ~VPxFirstPassEncoderThreadTest() { free(firstpass_stats_.buf); } + ~VPxFirstPassEncoderThreadTest() override { free(firstpass_stats_.buf); } - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(encoding_mode_); @@ -57,19 +57,19 @@ class VPxFirstPassEncoderThreadTest cfg_.rc_min_quantizer = 0; } - virtual void BeginPassHook(unsigned int /*pass*/) { + void BeginPassHook(unsigned int /*pass*/) override { encoder_initialized_ = false; abort_ = false; } - virtual void EndPassHook() { + void EndPassHook() override { // For first pass stats test, only run first pass encoder. if (first_pass_only_ && cfg_.g_pass == VPX_RC_FIRST_PASS) abort_ |= first_pass_only_; } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource * /*video*/, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource * /*video*/, + ::libvpx_test::Encoder *encoder) override { if (!encoder_initialized_) { // Encode in 2-pass mode. encoder->Control(VP9E_SET_TILE_COLUMNS, tiles_); @@ -87,7 +87,7 @@ class VPxFirstPassEncoderThreadTest } } - virtual void StatsPktHook(const vpx_codec_cx_pkt_t *pkt) { + void StatsPktHook(const vpx_codec_cx_pkt_t *pkt) override { const uint8_t *const pkt_buf = reinterpret_cast(pkt->data.twopass_stats.buf); const size_t pkt_size = pkt->data.twopass_stats.sz; @@ -233,9 +233,9 @@ class VPxEncoderThreadTest psnr_ = 0.0; nframes_ = 0; } - virtual ~VPxEncoderThreadTest() {} + ~VPxEncoderThreadTest() override {} - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(encoding_mode_); @@ -252,14 +252,14 @@ class VPxEncoderThreadTest cfg_.rc_min_quantizer = 0; } - virtual void BeginPassHook(unsigned int /*pass*/) { + void BeginPassHook(unsigned int /*pass*/) override { encoder_initialized_ = false; psnr_ = 0.0; nframes_ = 0; } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource * /*video*/, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource * /*video*/, + ::libvpx_test::Encoder *encoder) override { if (!encoder_initialized_) { // Encode 4 column tiles. encoder->Control(VP9E_SET_TILE_COLUMNS, tiles_); @@ -280,21 +280,21 @@ class VPxEncoderThreadTest } } - virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) { + void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) override { psnr_ += pkt->data.psnr.psnr[0]; nframes_++; } - virtual void DecompressedFrameHook(const vpx_image_t &img, - vpx_codec_pts_t /*pts*/) { + void DecompressedFrameHook(const vpx_image_t &img, + vpx_codec_pts_t /*pts*/) override { ::libvpx_test::MD5 md5_res; md5_res.Add(&img); md5_.push_back(md5_res.Get()); } - virtual bool HandleDecodeResult(const vpx_codec_err_t res, - const libvpx_test::VideoSource & /*video*/, - libvpx_test::Decoder * /*decoder*/) { + bool HandleDecodeResult(const vpx_codec_err_t res, + const libvpx_test::VideoSource & /*video*/, + libvpx_test::Decoder * /*decoder*/) override { if (res != VPX_CODEC_OK) { EXPECT_EQ(VPX_CODEC_OK, res); return false; diff --git a/test/vp9_intrapred_test.cc b/test/vp9_intrapred_test.cc index 6de7cf8d0f..daaf768699 100644 --- a/test/vp9_intrapred_test.cc +++ b/test/vp9_intrapred_test.cc @@ -96,7 +96,7 @@ class IntraPredTest : public ::testing::TestWithParam { } protected: - virtual void SetUp() { + void SetUp() override { params_ = this->GetParam(); stride_ = params_.block_size * 3; mask_ = (1 << params_.bit_depth) - 1; diff --git a/test/vp9_lossless_test.cc b/test/vp9_lossless_test.cc index 931ac30a36..01dae08f54 100644 --- a/test/vp9_lossless_test.cc +++ b/test/vp9_lossless_test.cc @@ -29,15 +29,15 @@ class LosslessTest : EncoderTest(GET_PARAM(0)), psnr_(kMaxPsnr), nframes_(0), encoding_mode_(GET_PARAM(1)) {} - virtual ~LosslessTest() {} + ~LosslessTest() override {} - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(encoding_mode_); } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { // Only call Control if quantizer > 0 to verify that using quantizer // alone will activate lossless @@ -47,12 +47,12 @@ class LosslessTest } } - virtual void BeginPassHook(unsigned int /*pass*/) { + void BeginPassHook(unsigned int /*pass*/) override { psnr_ = kMaxPsnr; nframes_ = 0; } - virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) { + void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) override { if (pkt->data.psnr.psnr[0] < psnr_) psnr_ = pkt->data.psnr.psnr[0]; } diff --git a/test/vp9_motion_vector_test.cc b/test/vp9_motion_vector_test.cc index 6b1082a106..033c5fcd83 100644 --- a/test/vp9_motion_vector_test.cc +++ b/test/vp9_motion_vector_test.cc @@ -42,9 +42,9 @@ class MotionVectorTestLarge : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)), cpu_used_(GET_PARAM(2)), mv_test_mode_(GET_PARAM(3)) {} - virtual ~MotionVectorTestLarge() {} + ~MotionVectorTestLarge() override {} - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(encoding_mode_); if (encoding_mode_ != ::libvpx_test::kRealTime) { @@ -59,8 +59,8 @@ class MotionVectorTestLarge } } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { encoder->Control(VP8E_SET_CPUUSED, cpu_used_); encoder->Control(VP9E_ENABLE_MOTION_VECTOR_UNIT_TEST, mv_test_mode_); diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc index 5e3a7c2701..5ba90a21bc 100644 --- a/test/vp9_quantize_test.cc +++ b/test/vp9_quantize_test.cc @@ -174,7 +174,7 @@ class VP9QuantizeBase : public AbstractBench { q_ptr_ = (is_fp_) ? quant_fp_ptr_ : quant_ptr_; } - ~VP9QuantizeBase() { + ~VP9QuantizeBase() override { vpx_free(mb_plane_); vpx_free(zbin_ptr_); vpx_free(round_fp_ptr_); @@ -225,7 +225,7 @@ class VP9QuantizeTest : public VP9QuantizeBase, quantize_op_(GET_PARAM(0)), ref_quantize_op_(GET_PARAM(1)) {} protected: - virtual void Run(); + void Run() override; void Speed(bool is_median); const QuantizeFunc quantize_op_; const QuantizeFunc ref_quantize_op_; diff --git a/test/vp9_ratectrl_rtc_test.cc b/test/vp9_ratectrl_rtc_test.cc index 8422df074b..0680ac7df3 100644 --- a/test/vp9_ratectrl_rtc_test.cc +++ b/test/vp9_ratectrl_rtc_test.cc @@ -40,16 +40,16 @@ class RcInterfaceTest : EncoderTest(GET_PARAM(0)), aq_mode_(GET_PARAM(1)), key_interval_(3000), encoder_exit_(false) {} - virtual ~RcInterfaceTest() {} + ~RcInterfaceTest() override {} protected: - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(::libvpx_test::kRealTime); } - virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video, - libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(libvpx_test::VideoSource *video, + libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { encoder->Control(VP8E_SET_CPUUSED, 7); encoder->Control(VP9E_SET_AQ_MODE, aq_mode_); @@ -69,7 +69,7 @@ class RcInterfaceTest encoder_exit_ = video->frame() == kNumFrames; } - virtual void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) { + void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) override { if (encoder_exit_) { return; } @@ -81,7 +81,7 @@ class RcInterfaceTest ASSERT_EQ(rc_api_->GetLoopfilterLevel(), loopfilter_level); } - virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { rc_api_->PostEncodeUpdate(pkt->data.frame.sz, frame_params_); } @@ -170,16 +170,16 @@ class RcInterfaceSvcTest : EncoderTest(GET_PARAM(0)), aq_mode_(GET_PARAM(1)), key_interval_(3000), dynamic_spatial_layers_(0), inter_layer_pred_off_(GET_PARAM(2)), parallel_spatial_layers_(false) {} - virtual ~RcInterfaceSvcTest() {} + ~RcInterfaceSvcTest() override {} protected: - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(::libvpx_test::kRealTime); } - virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { encoder->Control(VP8E_SET_CPUUSED, 7); encoder->Control(VP9E_SET_AQ_MODE, aq_mode_); @@ -256,7 +256,7 @@ class RcInterfaceSvcTest : libvpx::RcFrameType::kInterFrame; } - virtual void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) { + void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) override { ::libvpx_test::CxDataIterator iter = encoder->GetCxData(); for (int sl = 0; sl < rc_cfg_.ss_number_layers; sl++) sizes_[sl] = 0; std::vector rc_qp; @@ -301,8 +301,8 @@ class RcInterfaceSvcTest // This method needs to be overridden because non-reference frames are // expected to be mismatched frames as the encoder will avoid loopfilter on // these frames. - virtual void MismatchHook(const vpx_image_t * /*img1*/, - const vpx_image_t * /*img2*/) {} + void MismatchHook(const vpx_image_t * /*img1*/, + const vpx_image_t * /*img2*/) override {} void RunSvc() { SetRCConfigSvc(3, 3); diff --git a/test/vp9_roi_test.cc b/test/vp9_roi_test.cc index e8373c4c0b..a9347fb365 100644 --- a/test/vp9_roi_test.cc +++ b/test/vp9_roi_test.cc @@ -84,9 +84,9 @@ class RoiMaskBackgroundSkip : public ::libvpx_test::EncoderTest, public ::testing::Test { protected: RoiMaskBackgroundSkip() : EncoderTest(&::libvpx_test::kVP9) {} - virtual ~RoiMaskBackgroundSkip() { free(roi_.roi_map); } + ~RoiMaskBackgroundSkip() override { free(roi_.roi_map); } - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(::libvpx_test::kRealTime); SetRoi(); @@ -114,8 +114,8 @@ class RoiMaskBackgroundSkip : public ::libvpx_test::EncoderTest, } } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { encoder->Control(VP8E_SET_CPUUSED, 7); encoder->Control(VP9E_SET_AQ_MODE, 3); diff --git a/test/vp9_scale_test.cc b/test/vp9_scale_test.cc index 2d1203fb89..bd45c557ee 100644 --- a/test/vp9_scale_test.cc +++ b/test/vp9_scale_test.cc @@ -33,10 +33,10 @@ typedef void (*ScaleFrameFunc)(const YV12_BUFFER_CONFIG *src, class ScaleTest : public VpxScaleBase, public ::testing::TestWithParam { public: - virtual ~ScaleTest() {} + ~ScaleTest() override {} protected: - virtual void SetUp() { scale_fn_ = GetParam(); } + void SetUp() override { scale_fn_ = GetParam(); } void ReferenceScaleFrame(INTERP_FILTER filter_type, int phase_scaler) { vp9_scale_and_extend_frame_c(&img_, &ref_img_, filter_type, phase_scaler); diff --git a/test/vp9_subtract_test.cc b/test/vp9_subtract_test.cc index a57082f1eb..78deb51909 100644 --- a/test/vp9_subtract_test.cc +++ b/test/vp9_subtract_test.cc @@ -34,10 +34,10 @@ namespace vp9 { class VP9SubtractBlockTest : public AbstractBench, public ::testing::TestWithParam { public: - virtual void TearDown() { libvpx_test::ClearSystemState(); } + void TearDown() override { libvpx_test::ClearSystemState(); } protected: - virtual void Run() { + void Run() override { GetParam()(block_height_, block_width_, diff_, block_width_, src_, block_width_, pred_, block_width_); } @@ -176,7 +176,7 @@ using Params = std::tuple; class VPXHBDSubtractBlockTest : public ::testing::TestWithParam { public: - virtual void SetUp() { + void SetUp() override { block_width_ = 4 * num_4x4_blocks_wide_lookup[GET_PARAM(0)]; block_height_ = 4 * num_4x4_blocks_high_lookup[GET_PARAM(0)]; bit_depth_ = static_cast(GET_PARAM(1)); @@ -198,7 +198,7 @@ class VPXHBDSubtractBlockTest : public ::testing::TestWithParam { ASSERT_NE(diff_, nullptr); } - virtual void TearDown() { + void TearDown() override { vpx_free(CONVERT_TO_SHORTPTR(src_)); vpx_free(CONVERT_TO_SHORTPTR(pred_)); vpx_free(diff_); diff --git a/test/vp9_thread_test.cc b/test/vp9_thread_test.cc index 1ceef8185c..3409e72dfd 100644 --- a/test/vp9_thread_test.cc +++ b/test/vp9_thread_test.cc @@ -26,10 +26,10 @@ using std::string; class VPxWorkerThreadTest : public ::testing::TestWithParam { protected: - virtual ~VPxWorkerThreadTest() {} - virtual void SetUp() { vpx_get_worker_interface()->init(&worker_); } + ~VPxWorkerThreadTest() override {} + void SetUp() override { vpx_get_worker_interface()->init(&worker_); } - virtual void TearDown() { vpx_get_worker_interface()->end(&worker_); } + void TearDown() override { vpx_get_worker_interface()->end(&worker_); } void Run(VPxWorker *worker) { const bool synchronous = GetParam(); diff --git a/test/vpx_scale_test.cc b/test/vpx_scale_test.cc index 7eea437fc8..3a238b7a8d 100644 --- a/test/vpx_scale_test.cc +++ b/test/vpx_scale_test.cc @@ -38,10 +38,10 @@ class ExtendBorderTest : public VpxScaleBase, public ::testing::TestWithParam { public: - virtual ~ExtendBorderTest() {} + ~ExtendBorderTest() override {} protected: - virtual void SetUp() { extend_fn_ = GetParam(); } + void SetUp() override { extend_fn_ = GetParam(); } void ExtendBorder() { ASM_REGISTER_STATE_CHECK(extend_fn_(&img_)); } @@ -68,10 +68,10 @@ INSTANTIATE_TEST_SUITE_P(C, ExtendBorderTest, class CopyFrameTest : public VpxScaleBase, public ::testing::TestWithParam { public: - virtual ~CopyFrameTest() {} + ~CopyFrameTest() override {} protected: - virtual void SetUp() { copy_frame_fn_ = GetParam(); } + void SetUp() override { copy_frame_fn_ = GetParam(); } void CopyFrame() { ASM_REGISTER_STATE_CHECK(copy_frame_fn_(&img_, &dst_img_)); diff --git a/test/y4m_test.cc b/test/y4m_test.cc index 32f2cd51d3..78a944fd08 100644 --- a/test/y4m_test.cc +++ b/test/y4m_test.cc @@ -78,7 +78,7 @@ class Y4mVideoSourceTest : public ::testing::TestWithParam, protected: Y4mVideoSourceTest() : Y4mVideoSource("", 0, 0) {} - virtual ~Y4mVideoSourceTest() { CloseSource(); } + ~Y4mVideoSourceTest() override { CloseSource(); } virtual void Init(const std::string &file_name, int limit) { file_name_ = file_name; @@ -140,7 +140,7 @@ class Y4mVideoWriteTest : public Y4mVideoSourceTest { protected: Y4mVideoWriteTest() : tmpfile_(nullptr) {} - virtual ~Y4mVideoWriteTest() { + ~Y4mVideoWriteTest() override { delete tmpfile_; input_file_ = nullptr; } @@ -172,7 +172,7 @@ class Y4mVideoWriteTest : public Y4mVideoSourceTest { ReplaceInputFile(tmpfile_->file()); } - virtual void Init(const std::string &file_name, int limit) { + void Init(const std::string &file_name, int limit) override { Y4mVideoSourceTest::Init(file_name, limit); WriteY4mAndReadBack(); } diff --git a/test/yuv_temporal_filter_test.cc b/test/yuv_temporal_filter_test.cc index 91b4e804b3..0677d55688 100644 --- a/test/yuv_temporal_filter_test.cc +++ b/test/yuv_temporal_filter_test.cc @@ -290,7 +290,7 @@ void ApplyReferenceFilter( class YUVTemporalFilterTest : public ::testing::TestWithParam { public: - virtual void SetUp() { + void SetUp() override { filter_func_ = GetParam().temporal_filter; bd_ = GetParam().bd; use_highbd_ = (bd_ != 8); From 1c2297b2bc693a8d3d6a11abaff801c045f11b54 Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 25 Jul 2023 12:04:57 -0700 Subject: [PATCH 0843/1000] test/*.cc: use '= default' created with clang-tidy --fix --checks=-*,modernize-use-equals-default Change-Id: Ie373fb5501491fce53479d20f3a6d908c4b7c535 --- test/active_map_refresh_test.cc | 2 +- test/active_map_test.cc | 2 +- test/add_noise_test.cc | 2 +- test/alt_ref_aq_segment_test.cc | 2 +- test/altref_test.cc | 4 ++-- test/aq_segment_test.cc | 2 +- test/borders_test.cc | 2 +- test/config_test.cc | 2 +- test/cpu_speed_test.cc | 2 +- test/cq_test.cc | 2 +- test/dct16x16_test.cc | 8 ++++---- test/dct32x32_test.cc | 4 ++-- test/decode_corrupted.cc | 2 +- test/decode_perf_test.cc | 2 +- test/decode_svc_test.cc | 2 +- test/encode_api_test.cc | 2 +- test/encode_perf_test.cc | 2 +- test/error_resilience_test.cc | 4 ++-- test/fdct8x8_test.cc | 8 ++++---- test/frame_size_tests.cc | 2 +- test/keyframe_test.cc | 2 +- test/level_test.cc | 2 +- test/lpf_test.cc | 4 ++-- test/partial_idct_test.cc | 2 +- test/realtime_test.cc | 2 +- test/resize_test.cc | 12 ++++++------ test/sum_squares_test.cc | 2 +- test/superframe_test.cc | 2 +- test/svc_datarate_test.cc | 16 ++++++++-------- test/svc_end_to_end_test.cc | 6 +++--- test/timestamp_test.cc | 2 +- test/vp8_datarate_test.cc | 4 ++-- test/vp8_denoiser_sse2_test.cc | 2 +- test/vp8_fragments_test.cc | 2 +- test/vp8_ratectrl_rtc_test.cc | 4 ++-- test/vp9_arf_freq_test.cc | 2 +- test/vp9_block_error_test.cc | 2 +- test/vp9_datarate_test.cc | 10 +++++----- test/vp9_denoiser_test.cc | 2 +- test/vp9_encoder_parms_get_to_decoder.cc | 2 +- test/vp9_end_to_end_test.cc | 6 +++--- test/vp9_ethread_test.cc | 2 +- test/vp9_lossless_test.cc | 2 +- test/vp9_motion_vector_test.cc | 2 +- test/vp9_ratectrl_rtc_test.cc | 4 ++-- test/vp9_scale_test.cc | 2 +- test/vp9_thread_test.cc | 2 +- test/vpx_scale_test.cc | 4 ++-- 48 files changed, 82 insertions(+), 82 deletions(-) diff --git a/test/active_map_refresh_test.cc b/test/active_map_refresh_test.cc index 8b35ca81ba..ad067346a7 100644 --- a/test/active_map_refresh_test.cc +++ b/test/active_map_refresh_test.cc @@ -62,7 +62,7 @@ class ActiveMapRefreshTest public ::libvpx_test::CodecTestWith2Params { protected: ActiveMapRefreshTest() : EncoderTest(GET_PARAM(0)) {} - ~ActiveMapRefreshTest() override {} + ~ActiveMapRefreshTest() override = default; void SetUp() override { InitializeConfig(); diff --git a/test/active_map_test.cc b/test/active_map_test.cc index 1f661b559c..d222c00b74 100644 --- a/test/active_map_test.cc +++ b/test/active_map_test.cc @@ -26,7 +26,7 @@ class ActiveMapTest static const int kHeight = 144; ActiveMapTest() : EncoderTest(GET_PARAM(0)) {} - ~ActiveMapTest() override {} + ~ActiveMapTest() override = default; void SetUp() override { InitializeConfig(); diff --git a/test/add_noise_test.cc b/test/add_noise_test.cc index 6e787dd6ba..4fc4e81e63 100644 --- a/test/add_noise_test.cc +++ b/test/add_noise_test.cc @@ -33,7 +33,7 @@ class AddNoiseTest : public ::testing::Test, public ::testing::WithParamInterface { public: void TearDown() override { libvpx_test::ClearSystemState(); } - ~AddNoiseTest() override {} + ~AddNoiseTest() override = default; }; double stddev6(char a, char b, char c, char d, char e, char f) { diff --git a/test/alt_ref_aq_segment_test.cc b/test/alt_ref_aq_segment_test.cc index b64fc3cd0b..3b1a26ed16 100644 --- a/test/alt_ref_aq_segment_test.cc +++ b/test/alt_ref_aq_segment_test.cc @@ -20,7 +20,7 @@ class AltRefAqSegmentTest public ::libvpx_test::CodecTestWith2Params { protected: AltRefAqSegmentTest() : EncoderTest(GET_PARAM(0)) {} - ~AltRefAqSegmentTest() override {} + ~AltRefAqSegmentTest() override = default; void SetUp() override { InitializeConfig(); diff --git a/test/altref_test.cc b/test/altref_test.cc index 69b2b87a2a..903230fde9 100644 --- a/test/altref_test.cc +++ b/test/altref_test.cc @@ -24,7 +24,7 @@ class AltRefTest : public ::libvpx_test::EncoderTest, public ::libvpx_test::CodecTestWithParam { protected: AltRefTest() : EncoderTest(GET_PARAM(0)), altref_count_(0) {} - ~AltRefTest() override {} + ~AltRefTest() override = default; void SetUp() override { InitializeConfig(); @@ -75,7 +75,7 @@ class AltRefForcedKeyTestLarge AltRefForcedKeyTestLarge() : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)), cpu_used_(GET_PARAM(2)), forced_kf_frame_num_(1), frame_num_(0) {} - ~AltRefForcedKeyTestLarge() override {} + ~AltRefForcedKeyTestLarge() override = default; void SetUp() override { InitializeConfig(); diff --git a/test/aq_segment_test.cc b/test/aq_segment_test.cc index c98b8de094..955e1dafc0 100644 --- a/test/aq_segment_test.cc +++ b/test/aq_segment_test.cc @@ -20,7 +20,7 @@ class AqSegmentTest public ::libvpx_test::CodecTestWith2Params { protected: AqSegmentTest() : EncoderTest(GET_PARAM(0)) {} - ~AqSegmentTest() override {} + ~AqSegmentTest() override = default; void SetUp() override { InitializeConfig(); diff --git a/test/borders_test.cc b/test/borders_test.cc index 009121bf22..2726bd557d 100644 --- a/test/borders_test.cc +++ b/test/borders_test.cc @@ -22,7 +22,7 @@ class BordersTest public ::libvpx_test::CodecTestWithParam { protected: BordersTest() : EncoderTest(GET_PARAM(0)) {} - ~BordersTest() override {} + ~BordersTest() override = default; void SetUp() override { InitializeConfig(); diff --git a/test/config_test.cc b/test/config_test.cc index a476d580a5..729b01151b 100644 --- a/test/config_test.cc +++ b/test/config_test.cc @@ -22,7 +22,7 @@ class ConfigTest ConfigTest() : EncoderTest(GET_PARAM(0)), frame_count_in_(0), frame_count_out_(0), frame_count_max_(0) {} - ~ConfigTest() override {} + ~ConfigTest() override = default; void SetUp() override { InitializeConfig(); diff --git a/test/cpu_speed_test.cc b/test/cpu_speed_test.cc index 78999ce658..22f4552963 100644 --- a/test/cpu_speed_test.cc +++ b/test/cpu_speed_test.cc @@ -26,7 +26,7 @@ class CpuSpeedTest : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)), set_cpu_used_(GET_PARAM(2)), min_psnr_(kMaxPSNR), tune_content_(VP9E_CONTENT_DEFAULT) {} - ~CpuSpeedTest() override {} + ~CpuSpeedTest() override = default; void SetUp() override { InitializeConfig(); diff --git a/test/cq_test.cc b/test/cq_test.cc index a9a16aae13..b74915a336 100644 --- a/test/cq_test.cc +++ b/test/cq_test.cc @@ -50,7 +50,7 @@ class CQTest : public ::libvpx_test::EncoderTest, init_flags_ = VPX_CODEC_USE_PSNR; } - ~CQTest() override {} + ~CQTest() override = default; void SetUp() override { InitializeConfig(); diff --git a/test/dct16x16_test.cc b/test/dct16x16_test.cc index de98d99731..8c4213ee16 100644 --- a/test/dct16x16_test.cc +++ b/test/dct16x16_test.cc @@ -310,7 +310,7 @@ void idct16x16_10_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) { class Trans16x16TestBase { public: - virtual ~Trans16x16TestBase() {} + virtual ~Trans16x16TestBase() = default; protected: virtual void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) = 0; @@ -728,7 +728,7 @@ class Trans16x16TestBase { class Trans16x16DCT : public Trans16x16TestBase, public ::testing::TestWithParam { public: - ~Trans16x16DCT() override {} + ~Trans16x16DCT() override = default; void SetUp() override { fwd_txfm_ = GET_PARAM(0); @@ -782,7 +782,7 @@ TEST_P(Trans16x16DCT, DISABLED_Speed) { RunSpeedTest(); } class Trans16x16HT : public Trans16x16TestBase, public ::testing::TestWithParam { public: - ~Trans16x16HT() override {} + ~Trans16x16HT() override = default; void SetUp() override { fwd_txfm_ = GET_PARAM(0); @@ -832,7 +832,7 @@ TEST_P(Trans16x16HT, QuantCheck) { class InvTrans16x16DCT : public Trans16x16TestBase, public ::testing::TestWithParam { public: - ~InvTrans16x16DCT() override {} + ~InvTrans16x16DCT() override = default; void SetUp() override { ref_txfm_ = GET_PARAM(0); diff --git a/test/dct32x32_test.cc b/test/dct32x32_test.cc index 62547ac537..6233b17a43 100644 --- a/test/dct32x32_test.cc +++ b/test/dct32x32_test.cc @@ -89,7 +89,7 @@ void idct32x32_12(const tran_low_t *in, uint8_t *out, int stride) { class Trans32x32Test : public AbstractBench, public ::testing::TestWithParam { public: - ~Trans32x32Test() override {} + ~Trans32x32Test() override = default; void SetUp() override { fwd_txfm_ = GET_PARAM(0); inv_txfm_ = GET_PARAM(1); @@ -321,7 +321,7 @@ TEST_P(Trans32x32Test, InverseAccuracy) { class InvTrans32x32Test : public ::testing::TestWithParam { public: - ~InvTrans32x32Test() override {} + ~InvTrans32x32Test() override = default; void SetUp() override { ref_txfm_ = GET_PARAM(0); inv_txfm_ = GET_PARAM(1); diff --git a/test/decode_corrupted.cc b/test/decode_corrupted.cc index a9a2cc6e70..58773d7b86 100644 --- a/test/decode_corrupted.cc +++ b/test/decode_corrupted.cc @@ -28,7 +28,7 @@ class DecodeCorruptedFrameTest DecodeCorruptedFrameTest() : EncoderTest(GET_PARAM(0)) {} protected: - ~DecodeCorruptedFrameTest() override {} + ~DecodeCorruptedFrameTest() override = default; void SetUp() override { InitializeConfig(); diff --git a/test/decode_perf_test.cc b/test/decode_perf_test.cc index 7533778e82..ed23cc2e7e 100644 --- a/test/decode_perf_test.cc +++ b/test/decode_perf_test.cc @@ -118,7 +118,7 @@ class VP9NewEncodeDecodePerfTest : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)), speed_(0), outfile_(0), out_frames_(0) {} - ~VP9NewEncodeDecodePerfTest() override {} + ~VP9NewEncodeDecodePerfTest() override = default; void SetUp() override { InitializeConfig(); diff --git a/test/decode_svc_test.cc b/test/decode_svc_test.cc index 29e9bd06f5..7098e7b270 100644 --- a/test/decode_svc_test.cc +++ b/test/decode_svc_test.cc @@ -25,7 +25,7 @@ class DecodeSvcTest : public ::libvpx_test::DecoderTest, public ::libvpx_test::CodecTestWithParam { protected: DecodeSvcTest() : DecoderTest(GET_PARAM(::libvpx_test::kCodecFactoryParam)) {} - ~DecodeSvcTest() override {} + ~DecodeSvcTest() override = default; void PreDecodeFrameHook(const libvpx_test::CompressedVideoSource &video, libvpx_test::Decoder *decoder) override { diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc index e8a044ae17..c8bd7daa4a 100644 --- a/test/encode_api_test.cc +++ b/test/encode_api_test.cc @@ -370,7 +370,7 @@ class EncodeApiGetTplStatsTest public ::testing::TestWithParam { public: EncodeApiGetTplStatsTest() : EncoderTest(GetParam()), test_io_(false) {} - ~EncodeApiGetTplStatsTest() override {} + ~EncodeApiGetTplStatsTest() override = default; protected: void SetUp() override { diff --git a/test/encode_perf_test.cc b/test/encode_perf_test.cc index 5f9c58dc94..171ff8eeca 100644 --- a/test/encode_perf_test.cc +++ b/test/encode_perf_test.cc @@ -61,7 +61,7 @@ class VP9EncodePerfTest : EncoderTest(GET_PARAM(0)), min_psnr_(kMaxPsnr), nframes_(0), encoding_mode_(GET_PARAM(1)), speed_(0), threads_(1) {} - ~VP9EncodePerfTest() override {} + ~VP9EncodePerfTest() override = default; void SetUp() override { InitializeConfig(); diff --git a/test/error_resilience_test.cc b/test/error_resilience_test.cc index 622c3c4461..8db4685257 100644 --- a/test/error_resilience_test.cc +++ b/test/error_resilience_test.cc @@ -30,7 +30,7 @@ class ErrorResilienceTestLarge Reset(); } - ~ErrorResilienceTestLarge() override {} + ~ErrorResilienceTestLarge() override = default; void Reset() { error_nframes_ = 0; @@ -381,7 +381,7 @@ class ErrorResilienceTestLargeCodecControls Reset(); } - ~ErrorResilienceTestLargeCodecControls() override {} + ~ErrorResilienceTestLargeCodecControls() override = default; void Reset() { last_pts_ = 0; diff --git a/test/fdct8x8_test.cc b/test/fdct8x8_test.cc index ba9db00120..3cdf909d46 100644 --- a/test/fdct8x8_test.cc +++ b/test/fdct8x8_test.cc @@ -143,7 +143,7 @@ void idct8x8_64_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) { #endif class FwdTrans8x8TestBase { public: - virtual ~FwdTrans8x8TestBase() {} + virtual ~FwdTrans8x8TestBase() = default; protected: virtual void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) = 0; @@ -539,7 +539,7 @@ class FwdTrans8x8TestBase { class FwdTrans8x8DCT : public FwdTrans8x8TestBase, public ::testing::TestWithParam { public: - ~FwdTrans8x8DCT() override {} + ~FwdTrans8x8DCT() override = default; void SetUp() override { fwd_txfm_ = GET_PARAM(0); @@ -578,7 +578,7 @@ TEST_P(FwdTrans8x8DCT, InvAccuracyCheck) { RunInvAccuracyCheck(); } class FwdTrans8x8HT : public FwdTrans8x8TestBase, public ::testing::TestWithParam { public: - ~FwdTrans8x8HT() override {} + ~FwdTrans8x8HT() override = default; void SetUp() override { fwd_txfm_ = GET_PARAM(0); @@ -614,7 +614,7 @@ TEST_P(FwdTrans8x8HT, ExtremalCheck) { RunExtremalCheck(); } class InvTrans8x8DCT : public FwdTrans8x8TestBase, public ::testing::TestWithParam { public: - ~InvTrans8x8DCT() override {} + ~InvTrans8x8DCT() override = default; void SetUp() override { ref_txfm_ = GET_PARAM(0); diff --git a/test/frame_size_tests.cc b/test/frame_size_tests.cc index 266858eebb..7b6c29a88f 100644 --- a/test/frame_size_tests.cc +++ b/test/frame_size_tests.cc @@ -79,7 +79,7 @@ class VP9FrameSizeTestsLarge : public ::libvpx_test::EncoderTest, protected: VP9FrameSizeTestsLarge() : EncoderTest(&::libvpx_test::kVP9), expected_res_(VPX_CODEC_OK) {} - ~VP9FrameSizeTestsLarge() override {} + ~VP9FrameSizeTestsLarge() override = default; void SetUp() override { InitializeConfig(); diff --git a/test/keyframe_test.cc b/test/keyframe_test.cc index d624cb19d6..dabf88e415 100644 --- a/test/keyframe_test.cc +++ b/test/keyframe_test.cc @@ -22,7 +22,7 @@ class KeyframeTest public ::libvpx_test::CodecTestWithParam { protected: KeyframeTest() : EncoderTest(GET_PARAM(0)) {} - ~KeyframeTest() override {} + ~KeyframeTest() override = default; void SetUp() override { InitializeConfig(); diff --git a/test/level_test.cc b/test/level_test.cc index 8e653d93e1..3f1cf9f1c5 100644 --- a/test/level_test.cc +++ b/test/level_test.cc @@ -22,7 +22,7 @@ class LevelTest : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)), cpu_used_(GET_PARAM(2)), min_gf_internal_(24), target_level_(0), level_(0) {} - ~LevelTest() override {} + ~LevelTest() override = default; void SetUp() override { InitializeConfig(); diff --git a/test/lpf_test.cc b/test/lpf_test.cc index 2f04194dce..ce0ddeae18 100644 --- a/test/lpf_test.cc +++ b/test/lpf_test.cc @@ -129,7 +129,7 @@ uint8_t GetHevThresh(ACMRandom *rnd) { class Loop8Test6Param : public ::testing::TestWithParam { public: - ~Loop8Test6Param() override {} + ~Loop8Test6Param() override = default; void SetUp() override { loopfilter_op_ = GET_PARAM(0); ref_loopfilter_op_ = GET_PARAM(1); @@ -151,7 +151,7 @@ GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(Loop8Test6Param); (HAVE_DSPR2 || HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH) class Loop8Test9Param : public ::testing::TestWithParam { public: - ~Loop8Test9Param() override {} + ~Loop8Test9Param() override = default; void SetUp() override { loopfilter_op_ = GET_PARAM(0); ref_loopfilter_op_ = GET_PARAM(1); diff --git a/test/partial_idct_test.cc b/test/partial_idct_test.cc index 6593ac68e9..01e63eb691 100644 --- a/test/partial_idct_test.cc +++ b/test/partial_idct_test.cc @@ -59,7 +59,7 @@ const int kCountTestBlock = 1000; class PartialIDctTest : public ::testing::TestWithParam { public: - ~PartialIDctTest() override {} + ~PartialIDctTest() override = default; void SetUp() override { rnd_.Reset(ACMRandom::DeterministicSeed()); fwd_txfm_ = GET_PARAM(0); diff --git a/test/realtime_test.cc b/test/realtime_test.cc index c5de2dcb35..88e510fd0d 100644 --- a/test/realtime_test.cc +++ b/test/realtime_test.cc @@ -26,7 +26,7 @@ class RealtimeTest public ::libvpx_test::CodecTestWithParam { protected: RealtimeTest() : EncoderTest(GET_PARAM(0)), frame_packets_(0) {} - ~RealtimeTest() override {} + ~RealtimeTest() override = default; void SetUp() override { InitializeConfig(); diff --git a/test/resize_test.cc b/test/resize_test.cc index 3eb842f549..7d01bbd3d5 100644 --- a/test/resize_test.cc +++ b/test/resize_test.cc @@ -247,7 +247,7 @@ class ResizingVideoSource : public ::libvpx_test::DummyVideoSource { } bool flag_codec_; bool smaller_width_larger_size_; - ~ResizingVideoSource() override {} + ~ResizingVideoSource() override = default; protected: void Next() override { @@ -267,7 +267,7 @@ class ResizeTest protected: ResizeTest() : EncoderTest(GET_PARAM(0)) {} - ~ResizeTest() override {} + ~ResizeTest() override = default; void SetUp() override { InitializeConfig(); @@ -336,7 +336,7 @@ class ResizeInternalTest : public ResizeTest { ResizeInternalTest() : ResizeTest(), frame0_psnr_(0.0) {} #endif - ~ResizeInternalTest() override {} + ~ResizeInternalTest() override = default; void BeginPassHook(unsigned int /*pass*/) override { #if WRITE_COMPRESSED_STREAM @@ -450,7 +450,7 @@ class ResizeRealtimeTest public ::libvpx_test::CodecTestWith2Params { protected: ResizeRealtimeTest() : EncoderTest(GET_PARAM(0)) {} - ~ResizeRealtimeTest() override {} + ~ResizeRealtimeTest() override = default; void PreEncodeFrameHook(libvpx_test::VideoSource *video, libvpx_test::Encoder *encoder) override { @@ -693,7 +693,7 @@ class ResizeCspTest : public ResizeTest { ResizeCspTest() : ResizeTest(), frame0_psnr_(0.0) {} #endif - ~ResizeCspTest() override {} + ~ResizeCspTest() override = default; void BeginPassHook(unsigned int /*pass*/) override { #if WRITE_COMPRESSED_STREAM @@ -758,7 +758,7 @@ class ResizingCspVideoSource : public ::libvpx_test::DummyVideoSource { limit_ = 30; } - ~ResizingCspVideoSource() override {} + ~ResizingCspVideoSource() override = default; protected: void Next() override { diff --git a/test/sum_squares_test.cc b/test/sum_squares_test.cc index 1943acf8b3..5abb464dc0 100644 --- a/test/sum_squares_test.cc +++ b/test/sum_squares_test.cc @@ -33,7 +33,7 @@ typedef std::tuple SumSquaresParam; class SumSquaresTest : public ::testing::TestWithParam { public: - ~SumSquaresTest() override {} + ~SumSquaresTest() override = default; void SetUp() override { ref_func_ = GET_PARAM(0); tst_func_ = GET_PARAM(1); diff --git a/test/superframe_test.cc b/test/superframe_test.cc index 4a522dd496..4c3aa1625a 100644 --- a/test/superframe_test.cc +++ b/test/superframe_test.cc @@ -28,7 +28,7 @@ class SuperframeTest protected: SuperframeTest() : EncoderTest(GET_PARAM(0)), modified_buf_(nullptr), last_sf_pts_(0) {} - ~SuperframeTest() override {} + ~SuperframeTest() override = default; void SetUp() override { InitializeConfig(); diff --git a/test/svc_datarate_test.cc b/test/svc_datarate_test.cc index 723538368f..aff4ace843 100644 --- a/test/svc_datarate_test.cc +++ b/test/svc_datarate_test.cc @@ -43,7 +43,7 @@ class DatarateOnePassCbrSvc : public OnePassCbrSvc { } protected: - ~DatarateOnePassCbrSvc() override {} + ~DatarateOnePassCbrSvc() override = default; virtual void ResetModel() { last_pts_ = 0; @@ -670,7 +670,7 @@ class DatarateOnePassCbrSvcSingleBR DatarateOnePassCbrSvcSingleBR() : DatarateOnePassCbrSvc(GET_PARAM(0)) { memset(&svc_params_, 0, sizeof(svc_params_)); } - ~DatarateOnePassCbrSvcSingleBR() override {} + ~DatarateOnePassCbrSvcSingleBR() override = default; protected: void SetUp() override { @@ -1160,7 +1160,7 @@ class DatarateOnePassCbrSvcMultiBR DatarateOnePassCbrSvcMultiBR() : DatarateOnePassCbrSvc(GET_PARAM(0)) { memset(&svc_params_, 0, sizeof(svc_params_)); } - ~DatarateOnePassCbrSvcMultiBR() override {} + ~DatarateOnePassCbrSvcMultiBR() override = default; protected: void SetUp() override { @@ -1243,7 +1243,7 @@ class DatarateOnePassCbrSvcFrameDropMultiBR : DatarateOnePassCbrSvc(GET_PARAM(0)) { memset(&svc_params_, 0, sizeof(svc_params_)); } - ~DatarateOnePassCbrSvcFrameDropMultiBR() override {} + ~DatarateOnePassCbrSvcFrameDropMultiBR() override = default; protected: void SetUp() override { @@ -1355,7 +1355,7 @@ class DatarateOnePassCbrSvcInterLayerPredSingleBR : DatarateOnePassCbrSvc(GET_PARAM(0)) { memset(&svc_params_, 0, sizeof(svc_params_)); } - ~DatarateOnePassCbrSvcInterLayerPredSingleBR() override {} + ~DatarateOnePassCbrSvcInterLayerPredSingleBR() override = default; protected: void SetUp() override { @@ -1441,7 +1441,7 @@ class DatarateOnePassCbrSvcDenoiser DatarateOnePassCbrSvcDenoiser() : DatarateOnePassCbrSvc(GET_PARAM(0)) { memset(&svc_params_, 0, sizeof(svc_params_)); } - ~DatarateOnePassCbrSvcDenoiser() override {} + ~DatarateOnePassCbrSvcDenoiser() override = default; protected: void SetUp() override { @@ -1499,7 +1499,7 @@ class DatarateOnePassCbrSvcSmallKF DatarateOnePassCbrSvcSmallKF() : DatarateOnePassCbrSvc(GET_PARAM(0)) { memset(&svc_params_, 0, sizeof(svc_params_)); } - ~DatarateOnePassCbrSvcSmallKF() override {} + ~DatarateOnePassCbrSvcSmallKF() override = default; protected: void SetUp() override { @@ -1702,7 +1702,7 @@ class DatarateOnePassCbrSvcPostencodeDrop DatarateOnePassCbrSvcPostencodeDrop() : DatarateOnePassCbrSvc(GET_PARAM(0)) { memset(&svc_params_, 0, sizeof(svc_params_)); } - ~DatarateOnePassCbrSvcPostencodeDrop() override {} + ~DatarateOnePassCbrSvcPostencodeDrop() override = default; protected: void SetUp() override { diff --git a/test/svc_end_to_end_test.cc b/test/svc_end_to_end_test.cc index c9ef35bbe6..b4337ae754 100644 --- a/test/svc_end_to_end_test.cc +++ b/test/svc_end_to_end_test.cc @@ -45,7 +45,7 @@ class ScalePartitionOnePassCbrSvc } protected: - ~ScalePartitionOnePassCbrSvc() override {} + ~ScalePartitionOnePassCbrSvc() override = default; void SetUp() override { InitializeConfig(); @@ -129,7 +129,7 @@ class SyncFrameOnePassCbrSvc : public OnePassCbrSvc, } protected: - ~SyncFrameOnePassCbrSvc() override {} + ~SyncFrameOnePassCbrSvc() override = default; void SetUp() override { InitializeConfig(); @@ -657,7 +657,7 @@ class LoopfilterOnePassCbrSvc : public OnePassCbrSvc, } protected: - ~LoopfilterOnePassCbrSvc() override {} + ~LoopfilterOnePassCbrSvc() override = default; void SetUp() override { InitializeConfig(); diff --git a/test/timestamp_test.cc b/test/timestamp_test.cc index d6f0b3bda2..00abf8f31c 100644 --- a/test/timestamp_test.cc +++ b/test/timestamp_test.cc @@ -67,7 +67,7 @@ class TimestampTest public ::libvpx_test::CodecTestWithParam { protected: TimestampTest() : EncoderTest(GET_PARAM(0)) {} - ~TimestampTest() override {} + ~TimestampTest() override = default; void SetUp() override { InitializeConfig(); diff --git a/test/vp8_datarate_test.cc b/test/vp8_datarate_test.cc index c91c2a0d26..aee27af66e 100644 --- a/test/vp8_datarate_test.cc +++ b/test/vp8_datarate_test.cc @@ -24,7 +24,7 @@ class DatarateTestLarge public: DatarateTestLarge() : EncoderTest(GET_PARAM(0)) {} - ~DatarateTestLarge() override {} + ~DatarateTestLarge() override = default; protected: void SetUp() override { @@ -301,7 +301,7 @@ TEST_P(DatarateTestLarge, DropFramesMultiThreads) { class DatarateTestRealTime : public DatarateTestLarge { public: - ~DatarateTestRealTime() override {} + ~DatarateTestRealTime() override = default; }; #if CONFIG_TEMPORAL_DENOISING diff --git a/test/vp8_denoiser_sse2_test.cc b/test/vp8_denoiser_sse2_test.cc index a6d414e508..7fa867d8bb 100644 --- a/test/vp8_denoiser_sse2_test.cc +++ b/test/vp8_denoiser_sse2_test.cc @@ -30,7 +30,7 @@ namespace { const int kNumPixels = 16 * 16; class VP8DenoiserTest : public ::testing::TestWithParam { public: - ~VP8DenoiserTest() override {} + ~VP8DenoiserTest() override = default; void SetUp() override { increase_denoising_ = GetParam(); } diff --git a/test/vp8_fragments_test.cc b/test/vp8_fragments_test.cc index bb527966c5..01b4c2120e 100644 --- a/test/vp8_fragments_test.cc +++ b/test/vp8_fragments_test.cc @@ -17,7 +17,7 @@ class VP8FragmentsTest : public ::libvpx_test::EncoderTest, public ::testing::Test { protected: VP8FragmentsTest() : EncoderTest(&::libvpx_test::kVP8) {} - ~VP8FragmentsTest() override {} + ~VP8FragmentsTest() override = default; void SetUp() override { const unsigned long init_flags = // NOLINT(runtime/int) diff --git a/test/vp8_ratectrl_rtc_test.cc b/test/vp8_ratectrl_rtc_test.cc index 70d73c52d6..81f06d90ad 100644 --- a/test/vp8_ratectrl_rtc_test.cc +++ b/test/vp8_ratectrl_rtc_test.cc @@ -25,7 +25,7 @@ namespace { struct Vp8RCTestVideo { - Vp8RCTestVideo() {} + Vp8RCTestVideo() = default; Vp8RCTestVideo(const char *name_, int width_, int height_, unsigned int frames_) : name(name_), width(width_), height(height_), frames(frames_) {} @@ -53,7 +53,7 @@ class Vp8RcInterfaceTest public: Vp8RcInterfaceTest() : EncoderTest(GET_PARAM(0)), key_interval_(3000), encoder_exit_(false) {} - ~Vp8RcInterfaceTest() override {} + ~Vp8RcInterfaceTest() override = default; protected: void SetUp() override { diff --git a/test/vp9_arf_freq_test.cc b/test/vp9_arf_freq_test.cc index 7cc9a28396..3882326d2f 100644 --- a/test/vp9_arf_freq_test.cc +++ b/test/vp9_arf_freq_test.cc @@ -86,7 +86,7 @@ class ArfFreqTest : EncoderTest(GET_PARAM(0)), test_video_param_(GET_PARAM(1)), test_encode_param_(GET_PARAM(2)), min_arf_requested_(GET_PARAM(3)) {} - ~ArfFreqTest() override {} + ~ArfFreqTest() override = default; void SetUp() override { InitializeConfig(); diff --git a/test/vp9_block_error_test.cc b/test/vp9_block_error_test.cc index 4ff1838c34..0645341ac1 100644 --- a/test/vp9_block_error_test.cc +++ b/test/vp9_block_error_test.cc @@ -53,7 +53,7 @@ int64_t BlockError8BitWrapper(const tran_low_t *coeff, class BlockErrorTest : public ::testing::TestWithParam { public: - ~BlockErrorTest() override {} + ~BlockErrorTest() override = default; void SetUp() override { error_block_op_ = GET_PARAM(0); ref_error_block_op_ = GET_PARAM(1); diff --git a/test/vp9_datarate_test.cc b/test/vp9_datarate_test.cc index 48480d6fa1..4bc9099206 100644 --- a/test/vp9_datarate_test.cc +++ b/test/vp9_datarate_test.cc @@ -28,7 +28,7 @@ class DatarateTestVP9 : public ::libvpx_test::EncoderTest { } protected: - ~DatarateTestVP9() override {} + ~DatarateTestVP9() override = default; virtual void ResetModel() { last_pts_ = 0; @@ -579,7 +579,7 @@ class DatarateTestVP9RealTime : public DatarateTestVP9, public ::libvpx_test::CodecTestWithParam { public: DatarateTestVP9RealTime() : DatarateTestVP9(GET_PARAM(0)) {} - ~DatarateTestVP9RealTime() override {} + ~DatarateTestVP9RealTime() override = default; protected: void SetUp() override { @@ -731,7 +731,7 @@ class DatarateTestVP9RealTimeDeltaQUV public ::libvpx_test::CodecTestWith2Params { public: DatarateTestVP9RealTimeDeltaQUV() : DatarateTestVP9(GET_PARAM(0)) {} - ~DatarateTestVP9RealTimeDeltaQUV() override {} + ~DatarateTestVP9RealTimeDeltaQUV() override = default; protected: void SetUp() override { @@ -819,7 +819,7 @@ class DatarateTestVP9FrameQp public ::testing::TestWithParam { public: DatarateTestVP9FrameQp() : DatarateTestVP9(GetParam()), frame_(0) {} - ~DatarateTestVP9FrameQp() override {} + ~DatarateTestVP9FrameQp() override = default; protected: void SetUp() override { @@ -945,7 +945,7 @@ TEST_P(DatarateTestVP9FrameQp, VP9SetFrameQp3TemporalLayersFixedMode) { // Params: speed setting. class DatarateTestVP9RealTimeDenoiser : public DatarateTestVP9RealTime { public: - ~DatarateTestVP9RealTimeDenoiser() override {} + ~DatarateTestVP9RealTimeDenoiser() override = default; }; // Check basic datarate targeting, for a single bitrate, when denoiser is on. diff --git a/test/vp9_denoiser_test.cc b/test/vp9_denoiser_test.cc index 1f679d5bd8..831f83305c 100644 --- a/test/vp9_denoiser_test.cc +++ b/test/vp9_denoiser_test.cc @@ -42,7 +42,7 @@ class VP9DenoiserTest : public ::testing::Test, public ::testing::WithParamInterface { public: - ~VP9DenoiserTest() override {} + ~VP9DenoiserTest() override = default; void SetUp() override { bs_ = GET_PARAM(1); } diff --git a/test/vp9_encoder_parms_get_to_decoder.cc b/test/vp9_encoder_parms_get_to_decoder.cc index 1f9929f880..0e182c76db 100644 --- a/test/vp9_encoder_parms_get_to_decoder.cc +++ b/test/vp9_encoder_parms_get_to_decoder.cc @@ -62,7 +62,7 @@ class VpxEncoderParmsGetToDecoder VpxEncoderParmsGetToDecoder() : EncoderTest(GET_PARAM(0)), encode_parms(GET_PARAM(1)) {} - ~VpxEncoderParmsGetToDecoder() override {} + ~VpxEncoderParmsGetToDecoder() override = default; void SetUp() override { InitializeConfig(); diff --git a/test/vp9_end_to_end_test.cc b/test/vp9_end_to_end_test.cc index d4c0b0dd11..79be4ee146 100644 --- a/test/vp9_end_to_end_test.cc +++ b/test/vp9_end_to_end_test.cc @@ -89,7 +89,7 @@ class EndToEndTestAdaptiveRDThresh : EncoderTest(GET_PARAM(0)), cpu_used_start_(GET_PARAM(1)), cpu_used_end_(GET_PARAM(2)) {} - ~EndToEndTestAdaptiveRDThresh() override {} + ~EndToEndTestAdaptiveRDThresh() override = default; void SetUp() override { InitializeConfig(); @@ -131,7 +131,7 @@ class EndToEndTestLarge denoiser_on_ = 0; } - ~EndToEndTestLarge() override {} + ~EndToEndTestLarge() override = default; void SetUp() override { InitializeConfig(); @@ -207,7 +207,7 @@ class EndToEndTestLoopFilterThreading EndToEndTestLoopFilterThreading() : EncoderTest(GET_PARAM(0)), use_loop_filter_opt_(GET_PARAM(1)) {} - ~EndToEndTestLoopFilterThreading() override {} + ~EndToEndTestLoopFilterThreading() override = default; void SetUp() override { InitializeConfig(); diff --git a/test/vp9_ethread_test.cc b/test/vp9_ethread_test.cc index 27a73b2aa5..c8d3cba7fb 100644 --- a/test/vp9_ethread_test.cc +++ b/test/vp9_ethread_test.cc @@ -233,7 +233,7 @@ class VPxEncoderThreadTest psnr_ = 0.0; nframes_ = 0; } - ~VPxEncoderThreadTest() override {} + ~VPxEncoderThreadTest() override = default; void SetUp() override { InitializeConfig(); diff --git a/test/vp9_lossless_test.cc b/test/vp9_lossless_test.cc index 01dae08f54..fe3cd1aba4 100644 --- a/test/vp9_lossless_test.cc +++ b/test/vp9_lossless_test.cc @@ -29,7 +29,7 @@ class LosslessTest : EncoderTest(GET_PARAM(0)), psnr_(kMaxPsnr), nframes_(0), encoding_mode_(GET_PARAM(1)) {} - ~LosslessTest() override {} + ~LosslessTest() override = default; void SetUp() override { InitializeConfig(); diff --git a/test/vp9_motion_vector_test.cc b/test/vp9_motion_vector_test.cc index 033c5fcd83..495ea11fce 100644 --- a/test/vp9_motion_vector_test.cc +++ b/test/vp9_motion_vector_test.cc @@ -42,7 +42,7 @@ class MotionVectorTestLarge : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)), cpu_used_(GET_PARAM(2)), mv_test_mode_(GET_PARAM(3)) {} - ~MotionVectorTestLarge() override {} + ~MotionVectorTestLarge() override = default; void SetUp() override { InitializeConfig(); diff --git a/test/vp9_ratectrl_rtc_test.cc b/test/vp9_ratectrl_rtc_test.cc index 0680ac7df3..313b68eda3 100644 --- a/test/vp9_ratectrl_rtc_test.cc +++ b/test/vp9_ratectrl_rtc_test.cc @@ -40,7 +40,7 @@ class RcInterfaceTest : EncoderTest(GET_PARAM(0)), aq_mode_(GET_PARAM(1)), key_interval_(3000), encoder_exit_(false) {} - ~RcInterfaceTest() override {} + ~RcInterfaceTest() override = default; protected: void SetUp() override { @@ -170,7 +170,7 @@ class RcInterfaceSvcTest : EncoderTest(GET_PARAM(0)), aq_mode_(GET_PARAM(1)), key_interval_(3000), dynamic_spatial_layers_(0), inter_layer_pred_off_(GET_PARAM(2)), parallel_spatial_layers_(false) {} - ~RcInterfaceSvcTest() override {} + ~RcInterfaceSvcTest() override = default; protected: void SetUp() override { diff --git a/test/vp9_scale_test.cc b/test/vp9_scale_test.cc index bd45c557ee..049a10a617 100644 --- a/test/vp9_scale_test.cc +++ b/test/vp9_scale_test.cc @@ -33,7 +33,7 @@ typedef void (*ScaleFrameFunc)(const YV12_BUFFER_CONFIG *src, class ScaleTest : public VpxScaleBase, public ::testing::TestWithParam { public: - ~ScaleTest() override {} + ~ScaleTest() override = default; protected: void SetUp() override { scale_fn_ = GetParam(); } diff --git a/test/vp9_thread_test.cc b/test/vp9_thread_test.cc index 3409e72dfd..c0cea681d7 100644 --- a/test/vp9_thread_test.cc +++ b/test/vp9_thread_test.cc @@ -26,7 +26,7 @@ using std::string; class VPxWorkerThreadTest : public ::testing::TestWithParam { protected: - ~VPxWorkerThreadTest() override {} + ~VPxWorkerThreadTest() override = default; void SetUp() override { vpx_get_worker_interface()->init(&worker_); } void TearDown() override { vpx_get_worker_interface()->end(&worker_); } diff --git a/test/vpx_scale_test.cc b/test/vpx_scale_test.cc index 3a238b7a8d..3897a6088d 100644 --- a/test/vpx_scale_test.cc +++ b/test/vpx_scale_test.cc @@ -38,7 +38,7 @@ class ExtendBorderTest : public VpxScaleBase, public ::testing::TestWithParam { public: - ~ExtendBorderTest() override {} + ~ExtendBorderTest() override = default; protected: void SetUp() override { extend_fn_ = GetParam(); } @@ -68,7 +68,7 @@ INSTANTIATE_TEST_SUITE_P(C, ExtendBorderTest, class CopyFrameTest : public VpxScaleBase, public ::testing::TestWithParam { public: - ~CopyFrameTest() override {} + ~CopyFrameTest() override = default; protected: void SetUp() override { copy_frame_fn_ = GetParam(); } From 5740cb39296ca23e7218e27c2114c972a3f3fa65 Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 25 Jul 2023 12:10:21 -0700 Subject: [PATCH 0844/1000] test/decode_perf_test.cc: use nullptr created with clang-tidy --fix --checks=-*,modernize-use-nullptr Change-Id: Ibf4a80fa00e9b59d471c92788ec4c7c72e4662e5 --- test/decode_perf_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/decode_perf_test.cc b/test/decode_perf_test.cc index ed23cc2e7e..383fd2d896 100644 --- a/test/decode_perf_test.cc +++ b/test/decode_perf_test.cc @@ -116,7 +116,7 @@ class VP9NewEncodeDecodePerfTest protected: VP9NewEncodeDecodePerfTest() : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)), speed_(0), - outfile_(0), out_frames_(0) {} + outfile_(nullptr), out_frames_(0) {} ~VP9NewEncodeDecodePerfTest() override = default; From d62edaf41f8ffb3f1bc3e7f7b449c63258666d9c Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 25 Jul 2023 12:18:03 -0700 Subject: [PATCH 0845/1000] test/**.cc: use bool literals created with clang-tidy --fix --checks=-*,modernize-use-bool-literals Change-Id: Ifaed8ca824676555acaf1053b2a5a52c51a70638 --- test/error_resilience_test.cc | 4 ++-- test/vp9_ratectrl_rtc_test.cc | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/test/error_resilience_test.cc b/test/error_resilience_test.cc index 8db4685257..6b019b2bfb 100644 --- a/test/error_resilience_test.cc +++ b/test/error_resilience_test.cc @@ -136,11 +136,11 @@ class ErrorResilienceTestLarge if (error_frames_[i] == nframes_ - 1) { std::cout << " Skipping decoding frame: " << error_frames_[i] << "\n"; - return 0; + return false; } } } - return 1; + return true; } void MismatchHook(const vpx_image_t *img1, const vpx_image_t *img2) override { diff --git a/test/vp9_ratectrl_rtc_test.cc b/test/vp9_ratectrl_rtc_test.cc index 313b68eda3..b76fd3624c 100644 --- a/test/vp9_ratectrl_rtc_test.cc +++ b/test/vp9_ratectrl_rtc_test.cc @@ -227,7 +227,7 @@ class RcInterfaceSvcTest rc_cfg_.layer_target_bitrate[4] = 0; rc_cfg_.layer_target_bitrate[5] = 0; ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_)); - } else if (/*DISABLES CODE*/ (0) && video->frame() == 280) { + } else if (/*DISABLES CODE*/ (false) && video->frame() == 280) { // TODO(marpan): Re-enable this going back up when issue is fixed. // Go back up to 3 spatial layers. // Update the encoder config: use the original bitrates. From d899b979450c47c9ce4defad5508ed03af85d3cd Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 26 Jul 2023 15:38:36 -0700 Subject: [PATCH 0846/1000] encode_test_driver.h: use bool literal Change-Id: If47be9ca0daa18d92cb849484f9e139e65e3560e --- test/encode_test_driver.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/encode_test_driver.h b/test/encode_test_driver.h index 165fcfabf6..c7974894c7 100644 --- a/test/encode_test_driver.h +++ b/test/encode_test_driver.h @@ -264,7 +264,7 @@ class EncoderTest { const CodecFactory *codec_; // Hook to determine whether to decode frame after encoding - virtual bool DoDecode() const { return 1; } + virtual bool DoDecode() const { return true; } // Hook to handle encode/decode mismatch virtual void MismatchHook(const vpx_image_t *img1, const vpx_image_t *img2); From 626e37e77717673004b983b8d5fe0836636247b0 Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 26 Jul 2023 15:29:40 -0700 Subject: [PATCH 0847/1000] test/*.h: prefer 'override' to 'virtual' created with clang-tidy --fix --checks=-*,modernize-use-override Change-Id: I53412f35590799574edb573ae417a4a004cccd1e --- test/codec_factory.h | 42 +++++++++++++++++++--------------------- test/ivf_video_source.h | 14 +++++++------- test/svc_test.h | 6 +++--- test/video_source.h | 22 ++++++++++----------- test/webm_video_source.h | 14 +++++++------- test/y4m_video_source.h | 18 ++++++++--------- test/yuv_video_source.h | 18 ++++++++--------- 7 files changed, 66 insertions(+), 68 deletions(-) diff --git a/test/codec_factory.h b/test/codec_factory.h index 96092610c6..d00563df1c 100644 --- a/test/codec_factory.h +++ b/test/codec_factory.h @@ -84,7 +84,7 @@ class VP8Decoder : public Decoder { : Decoder(cfg, flag) {} protected: - virtual vpx_codec_iface_t *CodecInterface() const { + vpx_codec_iface_t *CodecInterface() const override { #if CONFIG_VP8_DECODER return &vpx_codec_vp8_dx_algo; #else @@ -100,7 +100,7 @@ class VP8Encoder : public Encoder { : Encoder(cfg, deadline, init_flags, stats) {} protected: - virtual vpx_codec_iface_t *CodecInterface() const { + vpx_codec_iface_t *CodecInterface() const override { #if CONFIG_VP8_ENCODER return &vpx_codec_vp8_cx_algo; #else @@ -113,12 +113,12 @@ class VP8CodecFactory : public CodecFactory { public: VP8CodecFactory() : CodecFactory() {} - virtual Decoder *CreateDecoder(vpx_codec_dec_cfg_t cfg) const { + Decoder *CreateDecoder(vpx_codec_dec_cfg_t cfg) const override { return CreateDecoder(cfg, 0); } - virtual Decoder *CreateDecoder(vpx_codec_dec_cfg_t cfg, - const vpx_codec_flags_t flags) const { + Decoder *CreateDecoder(vpx_codec_dec_cfg_t cfg, + const vpx_codec_flags_t flags) const override { #if CONFIG_VP8_DECODER return new VP8Decoder(cfg, flags); #else @@ -128,10 +128,9 @@ class VP8CodecFactory : public CodecFactory { #endif } - virtual Encoder *CreateEncoder(vpx_codec_enc_cfg_t cfg, - unsigned long deadline, - const unsigned long init_flags, - TwopassStatsStore *stats) const { + Encoder *CreateEncoder(vpx_codec_enc_cfg_t cfg, unsigned long deadline, + const unsigned long init_flags, + TwopassStatsStore *stats) const override { #if CONFIG_VP8_ENCODER return new VP8Encoder(cfg, deadline, init_flags, stats); #else @@ -143,8 +142,8 @@ class VP8CodecFactory : public CodecFactory { #endif } - virtual vpx_codec_err_t DefaultEncoderConfig(vpx_codec_enc_cfg_t *cfg, - int usage) const { + vpx_codec_err_t DefaultEncoderConfig(vpx_codec_enc_cfg_t *cfg, + int usage) const override { #if CONFIG_VP8_ENCODER return vpx_codec_enc_config_default(&vpx_codec_vp8_cx_algo, cfg, usage); #else @@ -180,7 +179,7 @@ class VP9Decoder : public Decoder { : Decoder(cfg, flag) {} protected: - virtual vpx_codec_iface_t *CodecInterface() const { + vpx_codec_iface_t *CodecInterface() const override { #if CONFIG_VP9_DECODER return &vpx_codec_vp9_dx_algo; #else @@ -196,7 +195,7 @@ class VP9Encoder : public Encoder { : Encoder(cfg, deadline, init_flags, stats) {} protected: - virtual vpx_codec_iface_t *CodecInterface() const { + vpx_codec_iface_t *CodecInterface() const override { #if CONFIG_VP9_ENCODER return &vpx_codec_vp9_cx_algo; #else @@ -209,12 +208,12 @@ class VP9CodecFactory : public CodecFactory { public: VP9CodecFactory() : CodecFactory() {} - virtual Decoder *CreateDecoder(vpx_codec_dec_cfg_t cfg) const { + Decoder *CreateDecoder(vpx_codec_dec_cfg_t cfg) const override { return CreateDecoder(cfg, 0); } - virtual Decoder *CreateDecoder(vpx_codec_dec_cfg_t cfg, - const vpx_codec_flags_t flags) const { + Decoder *CreateDecoder(vpx_codec_dec_cfg_t cfg, + const vpx_codec_flags_t flags) const override { #if CONFIG_VP9_DECODER return new VP9Decoder(cfg, flags); #else @@ -224,10 +223,9 @@ class VP9CodecFactory : public CodecFactory { #endif } - virtual Encoder *CreateEncoder(vpx_codec_enc_cfg_t cfg, - unsigned long deadline, - const unsigned long init_flags, - TwopassStatsStore *stats) const { + Encoder *CreateEncoder(vpx_codec_enc_cfg_t cfg, unsigned long deadline, + const unsigned long init_flags, + TwopassStatsStore *stats) const override { #if CONFIG_VP9_ENCODER return new VP9Encoder(cfg, deadline, init_flags, stats); #else @@ -239,8 +237,8 @@ class VP9CodecFactory : public CodecFactory { #endif } - virtual vpx_codec_err_t DefaultEncoderConfig(vpx_codec_enc_cfg_t *cfg, - int usage) const { + vpx_codec_err_t DefaultEncoderConfig(vpx_codec_enc_cfg_t *cfg, + int usage) const override { #if CONFIG_VP9_ENCODER return vpx_codec_enc_config_default(&vpx_codec_vp9_cx_algo, cfg, usage); #else diff --git a/test/ivf_video_source.h b/test/ivf_video_source.h index a8ac4f154c..3ccac62b51 100644 --- a/test/ivf_video_source.h +++ b/test/ivf_video_source.h @@ -33,19 +33,19 @@ class IVFVideoSource : public CompressedVideoSource { compressed_frame_buf_(nullptr), frame_sz_(0), frame_(0), end_of_file_(false) {} - virtual ~IVFVideoSource() { + ~IVFVideoSource() override { delete[] compressed_frame_buf_; if (input_file_) fclose(input_file_); } - virtual void Init() { + void Init() override { // Allocate a buffer for read in the compressed video frame. compressed_frame_buf_ = new uint8_t[libvpx_test::kCodeBufferSize]; ASSERT_NE(compressed_frame_buf_, nullptr) << "Allocate frame buffer failed"; } - virtual void Begin() { + void Begin() override { input_file_ = OpenTestDataFile(file_name_); ASSERT_NE(input_file_, nullptr) << "Input file open failed. Filename: " << file_name_; @@ -62,7 +62,7 @@ class IVFVideoSource : public CompressedVideoSource { FillFrame(); } - virtual void Next() { + void Next() override { ++frame_; FillFrame(); } @@ -86,11 +86,11 @@ class IVFVideoSource : public CompressedVideoSource { } } - virtual const uint8_t *cxdata() const { + const uint8_t *cxdata() const override { return end_of_file_ ? nullptr : compressed_frame_buf_; } - virtual size_t frame_size() const { return frame_sz_; } - virtual unsigned int frame_number() const { return frame_; } + size_t frame_size() const override { return frame_sz_; } + unsigned int frame_number() const override { return frame_; } protected: std::string file_name_; diff --git a/test/svc_test.h b/test/svc_test.h index f1d727fd9d..0026372de5 100644 --- a/test/svc_test.h +++ b/test/svc_test.h @@ -36,7 +36,7 @@ class OnePassCbrSvc : public ::libvpx_test::EncoderTest { } protected: - virtual ~OnePassCbrSvc() {} + ~OnePassCbrSvc() override {} virtual void SetConfig(const int num_temporal_layer) = 0; @@ -46,11 +46,11 @@ class OnePassCbrSvc : public ::libvpx_test::EncoderTest { virtual void PreEncodeFrameHookSetup(::libvpx_test::VideoSource *video, ::libvpx_test::Encoder *encoder); - virtual void PostEncodeFrameHook(::libvpx_test::Encoder *encoder); + void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) override; virtual void AssignLayerBitrates(); - virtual void MismatchHook(const vpx_image_t *, const vpx_image_t *) {} + void MismatchHook(const vpx_image_t *, const vpx_image_t *) override {} vpx_svc_extra_cfg_t svc_params_; int64_t bits_in_buffer_model_[VPX_MAX_LAYERS]; diff --git a/test/video_source.h b/test/video_source.h index 5ed99d0639..2194126f1f 100644 --- a/test/video_source.h +++ b/test/video_source.h @@ -163,35 +163,35 @@ class DummyVideoSource : public VideoSource { ReallocImage(); } - virtual ~DummyVideoSource() { vpx_img_free(img_); } + ~DummyVideoSource() override { vpx_img_free(img_); } - virtual void Begin() { + void Begin() override { frame_ = 0; FillFrame(); } - virtual void Next() { + void Next() override { ++frame_; FillFrame(); } - virtual vpx_image_t *img() const { + vpx_image_t *img() const override { return (frame_ < limit_) ? img_ : nullptr; } // Models a stream where Timebase = 1/FPS, so pts == frame. - virtual vpx_codec_pts_t pts() const { return frame_; } + vpx_codec_pts_t pts() const override { return frame_; } - virtual unsigned long duration() const { return 1; } + unsigned long duration() const override { return 1; } - virtual vpx_rational_t timebase() const { + vpx_rational_t timebase() const override { const vpx_rational_t t = { 1, 30 }; return t; } - virtual unsigned int frame() const { return frame_; } + unsigned int frame() const override { return frame_; } - virtual unsigned int limit() const { return limit_; } + unsigned int limit() const override { return limit_; } void set_limit(unsigned int limit) { limit_ = limit; } @@ -238,7 +238,7 @@ class RandomVideoSource : public DummyVideoSource { protected: // Reset the RNG to get a matching stream for the second pass - virtual void Begin() { + void Begin() override { frame_ = 0; rnd_.Reset(seed_); FillFrame(); @@ -246,7 +246,7 @@ class RandomVideoSource : public DummyVideoSource { // 15 frames of noise, followed by 15 static frames. Reset to 0 rather // than holding previous frames to encourage keyframes to be thrown. - virtual void FillFrame() { + void FillFrame() override { if (img_) { if (frame_ % 30 < 15) { for (size_t i = 0; i < raw_sz_; ++i) img_->img_data[i] = rnd_.Rand8(); diff --git a/test/webm_video_source.h b/test/webm_video_source.h index d245926298..6ab50c849f 100644 --- a/test/webm_video_source.h +++ b/test/webm_video_source.h @@ -29,16 +29,16 @@ class WebMVideoSource : public CompressedVideoSource { webm_ctx_(new WebmInputContext()), buf_(nullptr), buf_sz_(0), frame_(0), end_of_file_(false) {} - virtual ~WebMVideoSource() { + ~WebMVideoSource() override { if (vpx_ctx_->file != nullptr) fclose(vpx_ctx_->file); webm_free(webm_ctx_); delete vpx_ctx_; delete webm_ctx_; } - virtual void Init() {} + void Init() override {} - virtual void Begin() { + void Begin() override { vpx_ctx_->file = OpenTestDataFile(file_name_); ASSERT_NE(vpx_ctx_->file, nullptr) << "Input file open failed. Filename: " << file_name_; @@ -48,7 +48,7 @@ class WebMVideoSource : public CompressedVideoSource { FillFrame(); } - virtual void Next() { + void Next() override { ++frame_; FillFrame(); } @@ -74,11 +74,11 @@ class WebMVideoSource : public CompressedVideoSource { } while (!webm_ctx_->is_key_frame && !end_of_file_); } - virtual const uint8_t *cxdata() const { + const uint8_t *cxdata() const override { return end_of_file_ ? nullptr : buf_; } - virtual size_t frame_size() const { return buf_sz_; } - virtual unsigned int frame_number() const { return frame_; } + size_t frame_size() const override { return buf_sz_; } + unsigned int frame_number() const override { return frame_; } protected: std::string file_name_; diff --git a/test/y4m_video_source.h b/test/y4m_video_source.h index 71fbf31931..e43e37d9e4 100644 --- a/test/y4m_video_source.h +++ b/test/y4m_video_source.h @@ -27,7 +27,7 @@ class Y4mVideoSource : public VideoSource { start_(start), limit_(limit), frame_(0), framerate_numerator_(0), framerate_denominator_(0), y4m_() {} - virtual ~Y4mVideoSource() { + ~Y4mVideoSource() override { vpx_img_free(img_.get()); CloseSource(); } @@ -51,33 +51,33 @@ class Y4mVideoSource : public VideoSource { FillFrame(); } - virtual void Begin() { + void Begin() override { OpenSource(); ReadSourceToStart(); } - virtual void Next() { + void Next() override { ++frame_; FillFrame(); } - virtual vpx_image_t *img() const { + vpx_image_t *img() const override { return (frame_ < limit_) ? img_.get() : nullptr; } // Models a stream where Timebase = 1/FPS, so pts == frame. - virtual vpx_codec_pts_t pts() const { return frame_; } + vpx_codec_pts_t pts() const override { return frame_; } - virtual unsigned long duration() const { return 1; } + unsigned long duration() const override { return 1; } - virtual vpx_rational_t timebase() const { + vpx_rational_t timebase() const override { const vpx_rational_t t = { framerate_denominator_, framerate_numerator_ }; return t; } - virtual unsigned int frame() const { return frame_; } + unsigned int frame() const override { return frame_; } - virtual unsigned int limit() const { return limit_; } + unsigned int limit() const override { return limit_; } virtual void FillFrame() { ASSERT_NE(input_file_, nullptr); diff --git a/test/yuv_video_source.h b/test/yuv_video_source.h index 51948c0efb..bb5eec5bb8 100644 --- a/test/yuv_video_source.h +++ b/test/yuv_video_source.h @@ -35,12 +35,12 @@ class YUVVideoSource : public VideoSource { SetSize(width, height, format); } - virtual ~YUVVideoSource() { + ~YUVVideoSource() override { vpx_img_free(img_); if (input_file_) fclose(input_file_); } - virtual void Begin() { + void Begin() override { if (input_file_) fclose(input_file_); input_file_ = OpenTestDataFile(file_name_); ASSERT_NE(input_file_, nullptr) @@ -53,28 +53,28 @@ class YUVVideoSource : public VideoSource { FillFrame(); } - virtual void Next() { + void Next() override { ++frame_; FillFrame(); } - virtual vpx_image_t *img() const { + vpx_image_t *img() const override { return (frame_ < limit_) ? img_ : nullptr; } // Models a stream where Timebase = 1/FPS, so pts == frame. - virtual vpx_codec_pts_t pts() const { return frame_; } + vpx_codec_pts_t pts() const override { return frame_; } - virtual unsigned long duration() const { return 1; } + unsigned long duration() const override { return 1; } - virtual vpx_rational_t timebase() const { + vpx_rational_t timebase() const override { const vpx_rational_t t = { framerate_denominator_, framerate_numerator_ }; return t; } - virtual unsigned int frame() const { return frame_; } + unsigned int frame() const override { return frame_; } - virtual unsigned int limit() const { return limit_; } + unsigned int limit() const override { return limit_; } virtual void SetSize(unsigned int width, unsigned int height, vpx_img_fmt format) { From 4f19de38265dd04559059a030cc8692c2e19685d Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 26 Jul 2023 19:03:04 -0700 Subject: [PATCH 0848/1000] resize_test: prefer 'override' to 'virtual' Update functions in WRITE_COMPRESSED_STREAM blocks, which are disabled by default. This caused them to be missed in: 84e6b7ab0 test/*.cc: prefer 'override' to 'virtual' Change-Id: I0e462263f19c15eb0a30d0c0f4e145062f789489 --- test/resize_test.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/resize_test.cc b/test/resize_test.cc index 7d01bbd3d5..fd1c2a92de 100644 --- a/test/resize_test.cc +++ b/test/resize_test.cc @@ -387,7 +387,7 @@ class ResizeInternalTest : public ResizeTest { } #if WRITE_COMPRESSED_STREAM - virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { ++out_frames_; // Write initial file header if first frame. @@ -732,7 +732,7 @@ class ResizeCspTest : public ResizeTest { } #if WRITE_COMPRESSED_STREAM - virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { ++out_frames_; // Write initial file header if first frame. From 70fc7563830e9b516db903617797d7537701e8c5 Mon Sep 17 00:00:00 2001 From: Johann Date: Mon, 14 Nov 2022 16:47:33 +0900 Subject: [PATCH 0849/1000] quantize: reduce parameters Pass macroblock_plane and ScanOrder instead of looking up the values beforehand. Avoids pushing arguments to the stack. Change-Id: I22df6f645eb1a1d89ba5a4d9bc58acb77af51aa9 --- test/vp9_quantize_test.cc | 102 ++++++++-------------- vp9/encoder/vp9_encodemb.c | 60 +++++-------- vp9/encoder/vp9_rdopt.c | 28 +++--- vpx_dsp/arm/highbd_quantize_neon.c | 18 ++-- vpx_dsp/arm/quantize_neon.c | 24 +++-- vpx_dsp/quantize.c | 27 +++--- vpx_dsp/vpx_dsp_rtcd_defs.pl | 8 +- vpx_dsp/x86/highbd_quantize_intrin_avx2.c | 32 +++---- vpx_dsp/x86/highbd_quantize_intrin_sse2.c | 16 ++-- vpx_dsp/x86/quantize_avx.c | 18 ++-- vpx_dsp/x86/quantize_avx2.c | 41 +++++---- vpx_dsp/x86/quantize_sse2.c | 17 ++-- vpx_dsp/x86/quantize_sse2.h | 15 ++-- vpx_dsp/x86/quantize_ssse3.c | 14 ++- 14 files changed, 172 insertions(+), 248 deletions(-) diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc index 5ba90a21bc..26ea9af159 100644 --- a/test/vp9_quantize_test.cc +++ b/test/vp9_quantize_test.cc @@ -42,30 +42,11 @@ typedef void (*QuantizeFunc)(const tran_low_t *coeff, intptr_t count, const macroblock_plane *mb_plane, tran_low_t *qcoeff, tran_low_t *dqcoeff, const int16_t *dequant, uint16_t *eob, - const struct ScanOrder *scan_order); + const struct ScanOrder *const scan_order); typedef std::tuple QuantizeParam; -// Wrapper which takes a macroblock_plane. -typedef void (*QuantizeBaseFunc)(const tran_low_t *coeff, intptr_t count, - const int16_t *zbin, const int16_t *round, - const int16_t *quant, - const int16_t *quant_shift, tran_low_t *qcoeff, - tran_low_t *dqcoeff, const int16_t *dequant, - uint16_t *eob, const int16_t *scan, - const int16_t *iscan); - -template -void QuantWrapper(const tran_low_t *coeff, intptr_t count, - const macroblock_plane *const mb_plane, tran_low_t *qcoeff, - tran_low_t *dqcoeff, const int16_t *dequant, uint16_t *eob, - const struct ScanOrder *const scan_order) { - fn(coeff, count, mb_plane->zbin, mb_plane->round, mb_plane->quant, - mb_plane->quant_shift, qcoeff, dqcoeff, dequant, eob, scan_order->scan, - scan_order->iscan); -} - // Wrapper for 32x32 version which does not use count typedef void (*Quantize32x32Func)(const tran_low_t *coeff, const macroblock_plane *const mb_plane, @@ -542,19 +523,16 @@ using std::make_tuple; INSTANTIATE_TEST_SUITE_P( SSE2, VP9QuantizeTest, ::testing::Values( - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_8, 16, false), + make_tuple(vpx_quantize_b_sse2, vpx_quantize_b_c, VPX_BITS_8, 16, + false), make_tuple(&QuantFPWrapper, &QuantFPWrapper, VPX_BITS_8, 16, true), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_8, 16, - false), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_10, 16, - false), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_12, 16, - false), + make_tuple(vpx_highbd_quantize_b_sse2, vpx_highbd_quantize_b_c, + VPX_BITS_8, 16, false), + make_tuple(vpx_highbd_quantize_b_sse2, vpx_highbd_quantize_b_c, + VPX_BITS_10, 16, false), + make_tuple(vpx_highbd_quantize_b_sse2, vpx_highbd_quantize_b_c, + VPX_BITS_12, 16, false), make_tuple(&Quant32x32Wrapper, &Quant32x32Wrapper, VPX_BITS_8, 32, false), @@ -568,9 +546,8 @@ INSTANTIATE_TEST_SUITE_P( #else INSTANTIATE_TEST_SUITE_P( SSE2, VP9QuantizeTest, - ::testing::Values(make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_8, - 16, false), + ::testing::Values(make_tuple(vpx_quantize_b_sse2, vpx_quantize_b_c, + VPX_BITS_8, 16, false), make_tuple(&QuantFPWrapper, &QuantFPWrapper, VPX_BITS_8, 16, true))); @@ -580,9 +557,8 @@ INSTANTIATE_TEST_SUITE_P( #if HAVE_SSSE3 INSTANTIATE_TEST_SUITE_P( SSSE3, VP9QuantizeTest, - ::testing::Values(make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_8, - 16, false), + ::testing::Values(make_tuple(vpx_quantize_b_ssse3, vpx_quantize_b_c, + VPX_BITS_8, 16, false), make_tuple(&Quant32x32Wrapper, &Quant32x32Wrapper, VPX_BITS_8, 32, false), @@ -597,9 +573,8 @@ INSTANTIATE_TEST_SUITE_P( #if HAVE_AVX INSTANTIATE_TEST_SUITE_P( AVX, VP9QuantizeTest, - ::testing::Values(make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_8, - 16, false), + ::testing::Values(make_tuple(vpx_quantize_b_avx, vpx_quantize_b_c, + VPX_BITS_8, 16, false), make_tuple(&Quant32x32Wrapper, &Quant32x32Wrapper, VPX_BITS_8, 32, false))); @@ -618,17 +593,14 @@ INSTANTIATE_TEST_SUITE_P( make_tuple(&QuantFPWrapper, &QuantFPWrapper, VPX_BITS_12, 32, true), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_8, 16, false), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_8, 16, - false), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_10, 16, - false), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_12, 16, + make_tuple(vpx_quantize_b_avx2, vpx_quantize_b_c, VPX_BITS_8, 16, false), + make_tuple(vpx_highbd_quantize_b_avx2, vpx_highbd_quantize_b_c, + VPX_BITS_8, 16, false), + make_tuple(vpx_highbd_quantize_b_avx2, vpx_highbd_quantize_b_c, + VPX_BITS_10, 16, false), + make_tuple(vpx_highbd_quantize_b_avx2, vpx_highbd_quantize_b_c, + VPX_BITS_12, 16, false), make_tuple(&Quant32x32Wrapper, &Quant32x32Wrapper, VPX_BITS_8, 32, false), @@ -650,9 +622,8 @@ INSTANTIATE_TEST_SUITE_P( make_tuple(&QuantFPWrapper, &QuantFPWrapper, VPX_BITS_8, 32, true), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_8, - 16, false), + make_tuple(vpx_quantize_b_avx2, vpx_quantize_b_c, + VPX_BITS_8, 16, false), make_tuple(&Quant32x32Wrapper, &Quant32x32Wrapper, VPX_BITS_8, 32, false))); @@ -664,17 +635,14 @@ INSTANTIATE_TEST_SUITE_P( INSTANTIATE_TEST_SUITE_P( NEON, VP9QuantizeTest, ::testing::Values( - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_8, 16, false), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_8, 16, - false), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_10, 16, - false), - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_12, 16, + make_tuple(&vpx_quantize_b_neon, &vpx_quantize_b_c, VPX_BITS_8, 16, false), + make_tuple(vpx_highbd_quantize_b_neon, vpx_highbd_quantize_b_c, + VPX_BITS_8, 16, false), + make_tuple(vpx_highbd_quantize_b_neon, vpx_highbd_quantize_b_c, + VPX_BITS_10, 16, false), + make_tuple(vpx_highbd_quantize_b_neon, vpx_highbd_quantize_b_c, + VPX_BITS_12, 16, false), make_tuple(&Quant32x32Wrapper, &Quant32x32Wrapper, VPX_BITS_8, 32, false), @@ -695,9 +663,8 @@ INSTANTIATE_TEST_SUITE_P( #else INSTANTIATE_TEST_SUITE_P( NEON, VP9QuantizeTest, - ::testing::Values(make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_8, - 16, false), + ::testing::Values(make_tuple(&vpx_quantize_b_neon, &vpx_quantize_b_c, + VPX_BITS_8, 16, false), make_tuple(&Quant32x32Wrapper, &Quant32x32Wrapper, VPX_BITS_8, 32, false), @@ -740,8 +707,7 @@ INSTANTIATE_TEST_SUITE_P(LSX, VP9QuantizeTest, INSTANTIATE_TEST_SUITE_P( DISABLED_C, VP9QuantizeTest, ::testing::Values( - make_tuple(&QuantWrapper, - &QuantWrapper, VPX_BITS_8, 16, false), + make_tuple(&vpx_quantize_b_c, &vpx_quantize_b_c, VPX_BITS_8, 16, false), make_tuple(&Quant32x32Wrapper, &Quant32x32Wrapper, VPX_BITS_8, 32, false), diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c index 946a1c3ee8..0ddf8d3c9f 100644 --- a/vp9/encoder/vp9_encodemb.c +++ b/vp9/encoder/vp9_encodemb.c @@ -517,22 +517,19 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col, break; case TX_16X16: vpx_highbd_fdct16x16(src_diff, coeff, diff_stride); - vpx_highbd_quantize_b(coeff, 256, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, - scan_order->scan, scan_order->iscan); + vpx_highbd_quantize_b(coeff, 256, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); break; case TX_8X8: vpx_highbd_fdct8x8(src_diff, coeff, diff_stride); - vpx_highbd_quantize_b(coeff, 64, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, - scan_order->scan, scan_order->iscan); + vpx_highbd_quantize_b(coeff, 64, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); break; default: assert(tx_size == TX_4X4); x->fwd_txfm4x4(src_diff, coeff, diff_stride); - vpx_highbd_quantize_b(coeff, 16, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, - scan_order->scan, scan_order->iscan); + vpx_highbd_quantize_b(coeff, 16, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); break; } return; @@ -547,22 +544,19 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col, break; case TX_16X16: vpx_fdct16x16(src_diff, coeff, diff_stride); - vpx_quantize_b(coeff, 256, p->zbin, p->round, p->quant, p->quant_shift, - qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vpx_quantize_b(coeff, 256, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); break; case TX_8X8: vpx_fdct8x8(src_diff, coeff, diff_stride); - vpx_quantize_b(coeff, 64, p->zbin, p->round, p->quant, p->quant_shift, - qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vpx_quantize_b(coeff, 64, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); break; default: assert(tx_size == TX_4X4); x->fwd_txfm4x4(src_diff, coeff, diff_stride); - vpx_quantize_b(coeff, 16, p->zbin, p->round, p->quant, p->quant_shift, - qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vpx_quantize_b(coeff, 16, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); break; } } @@ -904,9 +898,8 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, vpx_highbd_fdct16x16(src_diff, coeff, diff_stride); else vp9_highbd_fht16x16(src_diff, coeff, diff_stride, tx_type); - vpx_highbd_quantize_b(coeff, 256, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, - eob, scan_order->scan, scan_order->iscan); + vpx_highbd_quantize_b(coeff, 256, p, qcoeff, dqcoeff, pd->dequant, + eob, scan_order); } if (enable_trellis_opt) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; @@ -922,9 +915,8 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, vpx_highbd_fdct8x8(src_diff, coeff, diff_stride); else vp9_highbd_fht8x8(src_diff, coeff, diff_stride, tx_type); - vpx_highbd_quantize_b(coeff, 64, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, - eob, scan_order->scan, scan_order->iscan); + vpx_highbd_quantize_b(coeff, 64, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); } if (enable_trellis_opt) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; @@ -941,9 +933,8 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, vp9_highbd_fht4x4(src_diff, coeff, diff_stride, tx_type); else x->fwd_txfm4x4(src_diff, coeff, diff_stride); - vpx_highbd_quantize_b(coeff, 16, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, - eob, scan_order->scan, scan_order->iscan); + vpx_highbd_quantize_b(coeff, 16, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); } if (enable_trellis_opt) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; @@ -982,9 +973,8 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, case TX_16X16: if (!x->skip_recode) { vp9_fht16x16(src_diff, coeff, diff_stride, tx_type); - vpx_quantize_b(coeff, 256, p->zbin, p->round, p->quant, p->quant_shift, - qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vpx_quantize_b(coeff, 256, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); } if (enable_trellis_opt) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; @@ -995,9 +985,8 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, case TX_8X8: if (!x->skip_recode) { vp9_fht8x8(src_diff, coeff, diff_stride, tx_type); - vpx_quantize_b(coeff, 64, p->zbin, p->round, p->quant, p->quant_shift, - qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vpx_quantize_b(coeff, 64, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); } if (enable_trellis_opt) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; @@ -1012,9 +1001,8 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, vp9_fht4x4(src_diff, coeff, diff_stride, tx_type); else x->fwd_txfm4x4(src_diff, coeff, diff_stride); - vpx_quantize_b(coeff, 16, p->zbin, p->round, p->quant, p->quant_shift, - qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vpx_quantize_b(coeff, 16, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); } if (enable_trellis_opt) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index 7b607b643a..fc06967105 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -1132,9 +1132,8 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row, const int coeff_ctx = combine_entropy_contexts(tempa[idx], templ[idy]); vp9_highbd_fwht4x4(src_diff, coeff, 8); - vpx_highbd_quantize_b(coeff, 4 * 4, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, - eob, so->scan, so->iscan); + vpx_highbd_quantize_b(coeff, 4 * 4, p, qcoeff, dqcoeff, pd->dequant, + eob, so); ratey += cost_coeffs(x, 0, block, TX_4X4, coeff_ctx, so->scan, so->neighbors, cpi->sf.use_fast_coef_costing); tempa[idx] = templ[idy] = (x->plane[0].eobs[block] > 0 ? 1 : 0); @@ -1152,9 +1151,8 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row, vpx_highbd_fdct4x4(src_diff, coeff, 8); else vp9_highbd_fht4x4(src_diff, coeff, 8, tx_type); - vpx_highbd_quantize_b(coeff, 4 * 4, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, - eob, so->scan, so->iscan); + vpx_highbd_quantize_b(coeff, 4 * 4, p, qcoeff, dqcoeff, pd->dequant, + eob, so); ratey += cost_coeffs(x, 0, block, TX_4X4, coeff_ctx, so->scan, so->neighbors, cpi->sf.use_fast_coef_costing); distortion += vp9_highbd_block_error_dispatch( @@ -1239,9 +1237,8 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row, const int coeff_ctx = combine_entropy_contexts(tempa[idx], templ[idy]); vp9_fwht4x4(src_diff, coeff, 8); - vpx_quantize_b(coeff, 4 * 4, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, - so->scan, so->iscan); + vpx_quantize_b(coeff, 4 * 4, p, qcoeff, dqcoeff, pd->dequant, eob, + so); ratey += cost_coeffs(x, 0, block, TX_4X4, coeff_ctx, so->scan, so->neighbors, cpi->sf.use_fast_coef_costing); tempa[idx] = templ[idy] = (x->plane[0].eobs[block] > 0) ? 1 : 0; @@ -1256,9 +1253,8 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row, const int coeff_ctx = combine_entropy_contexts(tempa[idx], templ[idy]); vp9_fht4x4(src_diff, coeff, 8, tx_type); - vpx_quantize_b(coeff, 4 * 4, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, - so->scan, so->iscan); + vpx_quantize_b(coeff, 4 * 4, p, qcoeff, dqcoeff, pd->dequant, eob, + so); ratey += cost_coeffs(x, 0, block, TX_4X4, coeff_ctx, so->scan, so->neighbors, cpi->sf.use_fast_coef_costing); tempa[idx] = templ[idy] = (x->plane[0].eobs[block] > 0) ? 1 : 0; @@ -1710,14 +1706,12 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi, MACROBLOCK *x, x->fwd_txfm4x4(vp9_raster_block_offset_int16(BLOCK_8X8, k, p->src_diff), coeff, 8); #if CONFIG_VP9_HIGHBITDEPTH - vpx_highbd_quantize_b(coeff, 4 * 4, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, - so->scan, so->iscan); + vpx_highbd_quantize_b(coeff, 4 * 4, p, qcoeff, dqcoeff, pd->dequant, eob, + so); thisdistortion += vp9_highbd_block_error_dispatch( coeff, BLOCK_OFFSET(pd->dqcoeff, k), 16, &ssz, bd); #else - vpx_quantize_b(coeff, 4 * 4, p->zbin, p->round, p->quant, p->quant_shift, - qcoeff, dqcoeff, pd->dequant, eob, so->scan, so->iscan); + vpx_quantize_b(coeff, 4 * 4, p, qcoeff, dqcoeff, pd->dequant, eob, so); thisdistortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, k), 16, &ssz); #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/arm/highbd_quantize_neon.c b/vpx_dsp/arm/highbd_quantize_neon.c index d2a7add60d..0a0ab5e58a 100644 --- a/vpx_dsp/arm/highbd_quantize_neon.c +++ b/vpx_dsp/arm/highbd_quantize_neon.c @@ -96,26 +96,25 @@ highbd_quantize_b_neon(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, } void vpx_highbd_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, - const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { const int16x8_t neg_one = vdupq_n_s16(-1); uint16x8_t eob_max; + const int16_t *iscan = scan_order->iscan; // Only the first element of each vector is DC. // High half has identical elements, but we can reconstruct it from the low // half by duplicating the 2nd element. So we only need to pass a 4x32-bit // vector - int32x4_t zbin = vmovl_s16(vld1_s16(zbin_ptr)); - int32x4_t round = vmovl_s16(vld1_s16(round_ptr)); + int32x4_t zbin = vmovl_s16(vld1_s16(mb_plane->zbin)); + int32x4_t round = vmovl_s16(vld1_s16(mb_plane->round)); // Extend the quant, quant_shift vectors to ones of 32-bit elements // scale to high-half, so we can use vqdmulhq_s32 - int32x4_t quant = vshlq_n_s32(vmovl_s16(vld1_s16(quant_ptr)), 15); - int32x4_t quant_shift = vshlq_n_s32(vmovl_s16(vld1_s16(quant_shift_ptr)), 15); + int32x4_t quant = vshlq_n_s32(vmovl_s16(vld1_s16(mb_plane->quant)), 15); + int32x4_t quant_shift = + vshlq_n_s32(vmovl_s16(vld1_s16(mb_plane->quant_shift)), 15); int32x4_t dequant = vmovl_s16(vld1_s16(dequant_ptr)); // Process first 8 values which include a dc component. @@ -180,7 +179,6 @@ void vpx_highbd_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, // Need these here, else the compiler complains about mixing declarations and // code in C90 (void)n_coeffs; - (void)scan; } static VPX_FORCE_INLINE int32x4_t extract_sign_bit(int32x4_t a) { diff --git a/vpx_dsp/arm/quantize_neon.c b/vpx_dsp/arm/quantize_neon.c index 5a76065549..e2351fa2cc 100644 --- a/vpx_dsp/arm/quantize_neon.c +++ b/vpx_dsp/arm/quantize_neon.c @@ -71,20 +71,19 @@ quantize_b_neon(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, } void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan, - const int16_t *iscan) { + const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { const int16x8_t neg_one = vdupq_n_s16(-1); uint16x8_t eob_max; + int16_t const *iscan = scan_order->iscan; // Only the first element of each vector is DC. - int16x8_t zbin = vld1q_s16(zbin_ptr); - int16x8_t round = vld1q_s16(round_ptr); - int16x8_t quant = vld1q_s16(quant_ptr); - int16x8_t quant_shift = vld1q_s16(quant_shift_ptr); + int16x8_t zbin = vld1q_s16(mb_plane->zbin); + int16x8_t round = vld1q_s16(mb_plane->round); + int16x8_t quant = vld1q_s16(mb_plane->quant); + int16x8_t quant_shift = vld1q_s16(mb_plane->quant_shift); int16x8_t dequant = vld1q_s16(dequant_ptr); // Process first 8 values which include a dc component. @@ -145,9 +144,6 @@ void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, vst1_lane_u16(eob_ptr, eob_max_2, 0); } #endif // VPX_ARCH_AARCH64 - // Need these here, else the compiler complains about mixing declarations and - // code in C90 - (void)scan; } static INLINE int32x4_t extract_sign_bit(int32x4_t a) { @@ -219,7 +215,7 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, const struct macroblock_plane *mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const struct ScanOrder *scan_order) { + const struct ScanOrder *const scan_order) { const int16x8_t neg_one = vdupq_n_s16(-1); uint16x8_t eob_max; int i; diff --git a/vpx_dsp/quantize.c b/vpx_dsp/quantize.c index 7dff8c7a87..dee12bae76 100644 --- a/vpx_dsp/quantize.c +++ b/vpx_dsp/quantize.c @@ -116,15 +116,17 @@ void vpx_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, #endif void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, const int16_t *round_ptr, - const int16_t *quant_ptr, const int16_t *quant_shift_ptr, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { int i, non_zero_count = (int)n_coeffs, eob = -1; - const int zbins[2] = { zbin_ptr[0], zbin_ptr[1] }; + const int zbins[2] = { mb_plane->zbin[0], mb_plane->zbin[1] }; const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; - (void)iscan; + const int16_t *round_ptr = mb_plane->round; + const int16_t *quant_ptr = mb_plane->quant; + const int16_t *quant_shift_ptr = mb_plane->quant_shift; + const int16_t *scan = scan_order->scan; memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); @@ -164,16 +166,17 @@ void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, #if CONFIG_VP9_HIGHBITDEPTH void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, + const struct macroblock_plane *mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { int i, non_zero_count = (int)n_coeffs, eob = -1; - const int zbins[2] = { zbin_ptr[0], zbin_ptr[1] }; + const int zbins[2] = { mb_plane->zbin[0], mb_plane->zbin[1] }; const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; - (void)iscan; + const int16_t *round_ptr = mb_plane->round; + const int16_t *quant_ptr = mb_plane->quant; + const int16_t *quant_shift_ptr = mb_plane->quant_shift; + const int16_t *scan = scan_order->scan; memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); @@ -214,7 +217,7 @@ void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, const struct macroblock_plane *mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const struct ScanOrder *scan_order) { + const struct ScanOrder *const scan_order) { const int n_coeffs = 32 * 32; const int zbins[2] = { ROUND_POWER_OF_TWO(mb_plane->zbin[0], 1), ROUND_POWER_OF_TWO(mb_plane->zbin[1], 1) }; diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index f20f4e0454..21ea1c8d5b 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -722,17 +722,17 @@ () # Quantization # if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") { - add_proto qw/void vpx_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + add_proto qw/void vpx_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane * const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder * const scan_order"; specialize qw/vpx_quantize_b neon sse2 ssse3 avx avx2 vsx lsx/; - add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, const struct macroblock_plane * mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *scan_order"; + add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, const struct macroblock_plane * const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder * const scan_order"; specialize qw/vpx_quantize_b_32x32 neon ssse3 avx avx2 vsx lsx/; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { - add_proto qw/void vpx_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + add_proto qw/void vpx_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane * const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder * const scan_order"; specialize qw/vpx_highbd_quantize_b neon sse2 avx2/; - add_proto qw/void vpx_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, const struct macroblock_plane * mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *scan_order"; + add_proto qw/void vpx_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, const struct macroblock_plane * const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder * const scan_order"; specialize qw/vpx_highbd_quantize_b_32x32 neon sse2 avx2/; } # CONFIG_VP9_HIGHBITDEPTH } # CONFIG_VP9_ENCODER diff --git a/vpx_dsp/x86/highbd_quantize_intrin_avx2.c b/vpx_dsp/x86/highbd_quantize_intrin_avx2.c index fbebd7db1c..35ca554049 100644 --- a/vpx_dsp/x86/highbd_quantize_intrin_avx2.c +++ b/vpx_dsp/x86/highbd_quantize_intrin_avx2.c @@ -28,17 +28,15 @@ static VPX_FORCE_INLINE void update_qp(__m256i *qp) { } } -static VPX_FORCE_INLINE void init_qp(const int16_t *zbin_ptr, - const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *dequant_ptr, - const int16_t *quant_shift_ptr, - __m256i *qp, int log_scale) { - const __m128i zbin = _mm_loadu_si128((const __m128i *)zbin_ptr); - const __m128i round = _mm_loadu_si128((const __m128i *)round_ptr); - const __m128i quant = _mm_loadu_si128((const __m128i *)quant_ptr); +static VPX_FORCE_INLINE void init_qp( + const struct macroblock_plane *const mb_plane, const int16_t *dequant_ptr, + __m256i *qp, int log_scale) { + const __m128i zbin = _mm_loadu_si128((const __m128i *)mb_plane->zbin); + const __m128i round = _mm_loadu_si128((const __m128i *)mb_plane->round); + const __m128i quant = _mm_loadu_si128((const __m128i *)mb_plane->quant); const __m128i dequant = _mm_loadu_si128((const __m128i *)dequant_ptr); - const __m128i quant_shift = _mm_loadu_si128((const __m128i *)quant_shift_ptr); + const __m128i quant_shift = + _mm_loadu_si128((const __m128i *)mb_plane->quant_shift); init_one_qp(&zbin, &qp[0]); init_one_qp(&round, &qp[1]); init_one_qp(&quant, &qp[2]); @@ -136,19 +134,16 @@ static VPX_FORCE_INLINE void quantize(const __m256i *qp, } void vpx_highbd_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, - const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { const int step = 8; __m256i eob = _mm256_setzero_si256(); __m256i qp[5]; - (void)scan; + const int16_t *iscan = scan_order->iscan; - init_qp(zbin_ptr, round_ptr, quant_ptr, dequant_ptr, quant_shift_ptr, qp, 0); + init_qp(mb_plane, dequant_ptr, qp, 0); quantize(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob); @@ -233,8 +228,7 @@ void vpx_highbd_quantize_b_32x32_avx2( __m256i eob = _mm256_setzero_si256(); __m256i qp[5]; - init_qp(mb_plane->zbin, mb_plane->round, mb_plane->quant, dequant_ptr, - mb_plane->quant_shift, qp, 1); + init_qp(mb_plane, dequant_ptr, qp, 1); quantize_b_32x32(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob); diff --git a/vpx_dsp/x86/highbd_quantize_intrin_sse2.c b/vpx_dsp/x86/highbd_quantize_intrin_sse2.c index a5d874f3bc..adae60756d 100644 --- a/vpx_dsp/x86/highbd_quantize_intrin_sse2.c +++ b/vpx_dsp/x86/highbd_quantize_intrin_sse2.c @@ -18,18 +18,19 @@ #include "vp9/common/vp9_scan.h" #include "vp9/encoder/vp9_block.h" -#if CONFIG_VP9_HIGHBITDEPTH void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count, - const int16_t *zbin_ptr, - const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, + const struct macroblock_plane *mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { int i, j, non_zero_regs = (int)count / 4, eob_i = 0; __m128i zbins[2]; __m128i nzbins[2]; + const int16_t *iscan = scan_order->iscan; + const int16_t *zbin_ptr = mb_plane->zbin; + const int16_t *round_ptr = mb_plane->round; + const int16_t *quant_ptr = mb_plane->quant; + const int16_t *quant_shift_ptr = mb_plane->quant_shift; zbins[0] = _mm_set_epi32((int)zbin_ptr[1], (int)zbin_ptr[1], (int)zbin_ptr[1], (int)zbin_ptr[0]); @@ -40,8 +41,6 @@ void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count, nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]); nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]); - (void)scan; - memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr)); @@ -152,4 +151,3 @@ void vpx_highbd_quantize_b_32x32_sse2( } *eob_ptr = eob; } -#endif diff --git a/vpx_dsp/x86/quantize_avx.c b/vpx_dsp/x86/quantize_avx.c index 6837a5cf28..98bf1686cb 100644 --- a/vpx_dsp/x86/quantize_avx.c +++ b/vpx_dsp/x86/quantize_avx.c @@ -23,15 +23,14 @@ #include "vp9/encoder/vp9_block.h" void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan, - const int16_t *iscan) { + const struct macroblock_plane *mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { const __m128i zero = _mm_setzero_si128(); const __m256i big_zero = _mm256_setzero_si256(); int index; + const int16_t *iscan = scan_order->iscan; __m128i zbin, round, quant, dequant, shift; __m128i coeff0, coeff1; @@ -40,12 +39,9 @@ void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, __m128i all_zero; __m128i eob = zero, eob0; - (void)scan; - *eob_ptr = 0; - load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant, - dequant_ptr, &dequant, quant_shift_ptr, &shift); + load_b_values(mb_plane, &zbin, &round, &quant, dequant_ptr, &dequant, &shift); // Do DC and first 15 AC. coeff0 = load_tran_low(coeff_ptr); @@ -146,7 +142,7 @@ void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, const struct macroblock_plane *mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const struct ScanOrder *scan_order) { + const struct ScanOrder *const scan_order) { const __m128i zero = _mm_setzero_si128(); const __m256i big_zero = _mm256_setzero_si256(); int index; diff --git a/vpx_dsp/x86/quantize_avx2.c b/vpx_dsp/x86/quantize_avx2.c index 3d97b3fdae..189b083f68 100644 --- a/vpx_dsp/x86/quantize_avx2.c +++ b/vpx_dsp/x86/quantize_avx2.c @@ -17,11 +17,11 @@ #include "vp9/encoder/vp9_block.h" static VPX_FORCE_INLINE void load_b_values_avx2( - const int16_t *zbin_ptr, __m256i *zbin, const int16_t *round_ptr, - __m256i *round, const int16_t *quant_ptr, __m256i *quant, - const int16_t *dequant_ptr, __m256i *dequant, const int16_t *shift_ptr, + const struct macroblock_plane *mb_plane, __m256i *zbin, __m256i *round, + __m256i *quant, const int16_t *dequant_ptr, __m256i *dequant, __m256i *shift, int log_scale) { - *zbin = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)zbin_ptr)); + *zbin = + _mm256_castsi128_si256(_mm_load_si128((const __m128i *)mb_plane->zbin)); *zbin = _mm256_permute4x64_epi64(*zbin, 0x54); if (log_scale > 0) { const __m256i rnd = _mm256_set1_epi16((int16_t)(1 << (log_scale - 1))); @@ -32,7 +32,8 @@ static VPX_FORCE_INLINE void load_b_values_avx2( // calculating the zbin mask. (See quantize_b_logscale{0,1,2}_16) *zbin = _mm256_sub_epi16(*zbin, _mm256_set1_epi16(1)); - *round = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)round_ptr)); + *round = + _mm256_castsi128_si256(_mm_load_si128((const __m128i *)mb_plane->round)); *round = _mm256_permute4x64_epi64(*round, 0x54); if (log_scale > 0) { const __m256i rnd = _mm256_set1_epi16((int16_t)(1 << (log_scale - 1))); @@ -40,12 +41,14 @@ static VPX_FORCE_INLINE void load_b_values_avx2( *round = _mm256_srai_epi16(*round, log_scale); } - *quant = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)quant_ptr)); + *quant = + _mm256_castsi128_si256(_mm_load_si128((const __m128i *)mb_plane->quant)); *quant = _mm256_permute4x64_epi64(*quant, 0x54); *dequant = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)dequant_ptr)); *dequant = _mm256_permute4x64_epi64(*dequant, 0x54); - *shift = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)shift_ptr)); + *shift = _mm256_castsi128_si256( + _mm_load_si128((const __m128i *)mb_plane->quant_shift)); *shift = _mm256_permute4x64_epi64(*shift, 0x54); } @@ -153,20 +156,17 @@ static VPX_FORCE_INLINE int16_t accumulate_eob256(__m256i eob256) { } void vpx_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan, - const int16_t *iscan) { + const struct macroblock_plane *mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { __m256i v_zbin, v_round, v_quant, v_dequant, v_quant_shift, v_nz_mask; __m256i v_eobmax = _mm256_setzero_si256(); intptr_t count; - (void)scan; + const int16_t *iscan = scan_order->iscan; - load_b_values_avx2(zbin_ptr, &v_zbin, round_ptr, &v_round, quant_ptr, - &v_quant, dequant_ptr, &v_dequant, quant_shift_ptr, - &v_quant_shift, 0); + load_b_values_avx2(mb_plane, &v_zbin, &v_round, &v_quant, dequant_ptr, + &v_dequant, &v_quant_shift, 0); // Do DC and first 15 AC. v_nz_mask = quantize_b_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, &v_quant, &v_dequant, &v_round, &v_zbin, &v_quant_shift); @@ -256,15 +256,14 @@ void vpx_quantize_b_32x32_avx2(const tran_low_t *coeff_ptr, const struct macroblock_plane *mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const struct ScanOrder *scan_order) { + const struct ScanOrder *const scan_order) { __m256i v_zbin, v_round, v_quant, v_dequant, v_quant_shift; __m256i v_eobmax = _mm256_setzero_si256(); intptr_t count; const int16_t *iscan = scan_order->iscan; - load_b_values_avx2(mb_plane->zbin, &v_zbin, mb_plane->round, &v_round, - mb_plane->quant, &v_quant, dequant_ptr, &v_dequant, - mb_plane->quant_shift, &v_quant_shift, 1); + load_b_values_avx2(mb_plane, &v_zbin, &v_round, &v_quant, dequant_ptr, + &v_dequant, &v_quant_shift, 1); // Do DC and first 15 AC. v_eobmax = quantize_b_32x32_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, iscan, diff --git a/vpx_dsp/x86/quantize_sse2.c b/vpx_dsp/x86/quantize_sse2.c index 9533e7916d..64838eaa7d 100644 --- a/vpx_dsp/x86/quantize_sse2.c +++ b/vpx_dsp/x86/quantize_sse2.c @@ -16,16 +16,16 @@ #include "vpx/vpx_integer.h" #include "vpx_dsp/x86/bitdepth_conversion_sse2.h" #include "vpx_dsp/x86/quantize_sse2.h" +#include "vp9/common/vp9_scan.h" void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan, - const int16_t *iscan) { + const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { const __m128i zero = _mm_setzero_si128(); int index = 16; + const int16_t *iscan = scan_order->iscan; __m128i zbin, round, quant, dequant, shift; __m128i coeff0, coeff1, coeff0_sign, coeff1_sign; @@ -33,11 +33,8 @@ void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, __m128i cmp_mask0, cmp_mask1; __m128i eob, eob0; - (void)scan; - // Setup global values. - load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant, - dequant_ptr, &dequant, quant_shift_ptr, &shift); + load_b_values(mb_plane, &zbin, &round, &quant, dequant_ptr, &dequant, &shift); // Do DC and first 15 AC. coeff0 = load_tran_low(coeff_ptr); diff --git a/vpx_dsp/x86/quantize_sse2.h b/vpx_dsp/x86/quantize_sse2.h index fe42fee018..6de75e0568 100644 --- a/vpx_dsp/x86/quantize_sse2.h +++ b/vpx_dsp/x86/quantize_sse2.h @@ -17,17 +17,16 @@ #include "vpx/vpx_integer.h" #include "vp9/encoder/vp9_block.h" -static INLINE void load_b_values(const int16_t *zbin_ptr, __m128i *zbin, - const int16_t *round_ptr, __m128i *round, - const int16_t *quant_ptr, __m128i *quant, +static INLINE void load_b_values(const struct macroblock_plane *const mb_plane, + __m128i *zbin, __m128i *round, __m128i *quant, const int16_t *dequant_ptr, __m128i *dequant, - const int16_t *shift_ptr, __m128i *shift) { - *zbin = _mm_load_si128((const __m128i *)zbin_ptr); - *round = _mm_load_si128((const __m128i *)round_ptr); - *quant = _mm_load_si128((const __m128i *)quant_ptr); + __m128i *shift) { + *zbin = _mm_load_si128((const __m128i *)mb_plane->zbin); + *round = _mm_load_si128((const __m128i *)mb_plane->round); + *quant = _mm_load_si128((const __m128i *)mb_plane->quant); *zbin = _mm_sub_epi16(*zbin, _mm_set1_epi16(1)); *dequant = _mm_load_si128((const __m128i *)dequant_ptr); - *shift = _mm_load_si128((const __m128i *)shift_ptr); + *shift = _mm_load_si128((const __m128i *)mb_plane->quant_shift); } static INLINE void load_b_values32x32( diff --git a/vpx_dsp/x86/quantize_ssse3.c b/vpx_dsp/x86/quantize_ssse3.c index 641f23298b..7f085566dd 100644 --- a/vpx_dsp/x86/quantize_ssse3.c +++ b/vpx_dsp/x86/quantize_ssse3.c @@ -20,14 +20,13 @@ #include "vp9/encoder/vp9_block.h" void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, + const struct macroblock_plane *mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { const __m128i zero = _mm_setzero_si128(); int index = 16; + const int16_t *iscan = scan_order->iscan; __m128i zbin, round, quant, dequant, shift; __m128i coeff0, coeff1; @@ -35,10 +34,7 @@ void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, __m128i cmp_mask0, cmp_mask1; __m128i eob, eob0; - (void)scan; - - load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant, - dequant_ptr, &dequant, quant_shift_ptr, &shift); + load_b_values(mb_plane, &zbin, &round, &quant, dequant_ptr, &dequant, &shift); // Do DC and first 15 AC. coeff0 = load_tran_low(coeff_ptr); @@ -113,7 +109,7 @@ void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, const struct macroblock_plane *mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const struct ScanOrder *scan_order) { + const struct ScanOrder *const scan_order) { const __m128i zero = _mm_setzero_si128(); int index; const int16_t *iscan = scan_order->iscan; From 7c7ab9165a9b0163a6cab72f538ab227b62a65fd Mon Sep 17 00:00:00 2001 From: Johann Date: Fri, 28 Jul 2023 19:37:48 +0900 Subject: [PATCH 0850/1000] quantize_fp: reduce parameters apply similar steps as to the other quantize functions to switch to macroblock_plane and ScanOrder Change-Id: I486d653326aaf52ffd3beafd2e891ba6a5d57ef3 --- test/vp9_quantize_test.cc | 31 ++++++------ vp9/common/vp9_rtcd_defs.pl | 10 ++-- vp9/encoder/arm/neon/vp9_quantize_neon.c | 60 +++++++++++------------ vp9/encoder/vp9_encodemb.c | 37 ++++++-------- vp9/encoder/vp9_pickmode.c | 15 +++--- vp9/encoder/vp9_quantize.c | 42 +++++++++------- vp9/encoder/vp9_tpl_model.c | 15 +++--- vp9/encoder/x86/vp9_quantize_avx2.c | 62 ++++++++++++------------ vp9/encoder/x86/vp9_quantize_sse2.c | 11 +++-- vp9/encoder/x86/vp9_quantize_ssse3.c | 21 ++++---- vpx_dsp/x86/quantize_sse2.h | 8 +-- 11 files changed, 151 insertions(+), 161 deletions(-) diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc index 26ea9af159..f6984bd6fa 100644 --- a/test/vp9_quantize_test.cc +++ b/test/vp9_quantize_test.cc @@ -66,18 +66,17 @@ void Quant32x32Wrapper(const tran_low_t *coeff, intptr_t count, // Wrapper for FP version which does not use zbin or quant_shift. typedef void (*QuantizeFPFunc)(const tran_low_t *coeff, intptr_t count, - const int16_t *round, const int16_t *quant, + const macroblock_plane *const mb_plane, tran_low_t *qcoeff, tran_low_t *dqcoeff, const int16_t *dequant, uint16_t *eob, - const int16_t *scan, const int16_t *iscan); + const struct ScanOrder *const scan_order); template void QuantFPWrapper(const tran_low_t *coeff, intptr_t count, const macroblock_plane *const mb_plane, tran_low_t *qcoeff, tran_low_t *dqcoeff, const int16_t *dequant, uint16_t *eob, const struct ScanOrder *const scan_order) { - fn(coeff, count, mb_plane->round_fp, mb_plane->quant_fp, qcoeff, dqcoeff, - dequant, eob, scan_order->scan, scan_order->iscan); + fn(coeff, count, mb_plane, qcoeff, dqcoeff, dequant, eob, scan_order); } void GenerateHelperArrays(ACMRandom *rnd, int16_t *zbin, int16_t *round, @@ -315,14 +314,16 @@ void VP9QuantizeTest::Speed(bool is_median) { // determine if further multiplication operations are needed. // Based on vp9_quantize_fp_sse2(). inline void quant_fp_nz(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *round_ptr, const int16_t *quant_ptr, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan, + const struct ScanOrder *const scan_order, int is_32x32) { int i, eob = -1; const int thr = dequant_ptr[1] >> (1 + is_32x32); - (void)iscan; + const int16_t *round_ptr = mb_plane->round_fp; + const int16_t *quant_ptr = mb_plane->quant_fp; + const int16_t *scan = scan_order->scan; // Quantization pass: All coefficients with index >= zero_flag are // skippable. Note: zero_flag can be zero. @@ -389,21 +390,21 @@ inline void quant_fp_nz(const tran_low_t *coeff_ptr, intptr_t n_coeffs, } void quantize_fp_nz_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *round_ptr, const int16_t *quant_ptr, + const struct macroblock_plane *mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { - quant_fp_nz(coeff_ptr, n_coeffs, round_ptr, quant_ptr, qcoeff_ptr, - dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, 0); + const struct ScanOrder *const scan_order) { + quant_fp_nz(coeff_ptr, n_coeffs, mb_plane, qcoeff_ptr, dqcoeff_ptr, + dequant_ptr, eob_ptr, scan_order, 0); } void quantize_fp_32x32_nz_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *round_ptr, const int16_t *quant_ptr, + const struct macroblock_plane *mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { - quant_fp_nz(coeff_ptr, n_coeffs, round_ptr, quant_ptr, qcoeff_ptr, - dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, 1); + const struct ScanOrder *const scan_order) { + quant_fp_nz(coeff_ptr, n_coeffs, mb_plane, qcoeff_ptr, dqcoeff_ptr, + dequant_ptr, eob_ptr, scan_order, 1); } TEST_P(VP9QuantizeTest, OperationCheck) { diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index 1a4140b38b..8e00c4581d 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -23,7 +23,9 @@ () /* Encoder forward decls */ struct macroblock; +struct macroblock_plane; struct vp9_sad_table; +struct ScanOrder; struct search_site_config; struct mv; union int_mv; @@ -129,10 +131,10 @@ () add_proto qw/int64_t vp9_block_error_fp/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size"; specialize qw/vp9_block_error_fp neon avx2 sse2/; -add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; +add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane * const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder * const scan_order"; specialize qw/vp9_quantize_fp neon sse2 ssse3 avx2 vsx/; -add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; +add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane * const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder * const scan_order"; specialize qw/vp9_quantize_fp_32x32 neon ssse3 avx2 vsx/; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { @@ -192,10 +194,10 @@ () # ENCODEMB INVOKE - add_proto qw/void vp9_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + add_proto qw/void vp9_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane * const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder * const scan_order"; specialize qw/vp9_highbd_quantize_fp avx2 neon/; - add_proto qw/void vp9_highbd_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan" ; + add_proto qw/void vp9_highbd_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane * const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder * const scan_order" ; specialize qw/vp9_highbd_quantize_fp_32x32 avx2 neon/; # fdct functions diff --git a/vp9/encoder/arm/neon/vp9_quantize_neon.c b/vp9/encoder/arm/neon/vp9_quantize_neon.c index 97ab13628e..e8cb78dbfc 100644 --- a/vp9/encoder/arm/neon/vp9_quantize_neon.c +++ b/vp9/encoder/arm/neon/vp9_quantize_neon.c @@ -16,6 +16,7 @@ #include "vpx_mem/vpx_mem.h" #include "vp9/common/vp9_quant_common.h" +#include "vp9/common/vp9_scan.h" #include "vp9/common/vp9_seg_common.h" #include "vp9/encoder/vp9_encoder.h" @@ -68,13 +69,11 @@ static VPX_FORCE_INLINE uint16_t get_max_eob(int16x8_t v_eobmax) { #endif // VPX_ARCH_AARCH64 } -static VPX_FORCE_INLINE void load_fp_values(const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *dequant_ptr, - int16x8_t *round, int16x8_t *quant, - int16x8_t *dequant) { - *round = vld1q_s16(round_ptr); - *quant = vld1q_s16(quant_ptr); +static VPX_FORCE_INLINE void load_fp_values( + const struct macroblock_plane *mb_plane, const int16_t *dequant_ptr, + int16x8_t *round, int16x8_t *quant, int16x8_t *dequant) { + *round = vld1q_s16(mb_plane->round_fp); + *quant = vld1q_s16(mb_plane->quant_fp); *dequant = vld1q_s16(dequant_ptr); } @@ -118,19 +117,18 @@ static VPX_FORCE_INLINE void quantize_fp_8( } void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t count, - const int16_t *round_ptr, const int16_t *quant_ptr, + const struct macroblock_plane *mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { // Quantization pass: All coefficients with index >= zero_flag are // skippable. Note: zero_flag can be zero. int i; int16x8_t v_eobmax = vdupq_n_s16(-1); int16x8_t v_round, v_quant, v_dequant; - (void)scan; + const int16_t *iscan = scan_order->iscan; - load_fp_values(round_ptr, quant_ptr, dequant_ptr, &v_round, &v_quant, - &v_dequant); + load_fp_values(mb_plane, dequant_ptr, &v_round, &v_quant, &v_dequant); // process dc and the first seven ac coeffs quantize_fp_8(&v_round, &v_quant, &v_dequant, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &v_eobmax); @@ -187,21 +185,20 @@ static VPX_FORCE_INLINE void quantize_fp_32x32_8( } void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t count, - const int16_t *round_ptr, - const int16_t *quant_ptr, + const struct macroblock_plane *mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { int16x8_t eob_max = vdupq_n_s16(-1); // ROUND_POWER_OF_TWO(round_ptr[], 1) - int16x8_t round = vrshrq_n_s16(vld1q_s16(round_ptr), 1); - int16x8_t quant = vld1q_s16(quant_ptr); + int16x8_t round = vrshrq_n_s16(vld1q_s16(mb_plane->round_fp), 1); + int16x8_t quant = vld1q_s16(mb_plane->quant_fp); int16x8_t dequant = vld1q_s16(dequant_ptr); // dequant >> 2 is used similar to zbin as a threshold. int16x8_t dequant_thresh = vshrq_n_s16(vld1q_s16(dequant_ptr), 2); int i; + const int16_t *iscan = scan_order->iscan; - (void)scan; (void)count; // Process dc and the first seven ac coeffs. @@ -258,23 +255,21 @@ highbd_quantize_fp_4(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, } void vp9_highbd_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *round_ptr, - const int16_t *quant_ptr, + const struct macroblock_plane *mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { const int16x4_t v_zero = vdup_n_s16(0); - const int16x4_t v_quant = vld1_s16(quant_ptr); + const int16x4_t v_quant = vld1_s16(mb_plane->quant_fp); const int16x4_t v_dequant = vld1_s16(dequant_ptr); - const int16x4_t v_round = vld1_s16(round_ptr); + const int16x4_t v_round = vld1_s16(mb_plane->round_fp); int32x4_t v_round_s32 = vaddl_s16(v_round, v_zero); int32x4_t v_quant_s32 = vshlq_n_s32(vaddl_s16(v_quant, v_zero), 15); int32x4_t v_dequant_s32 = vaddl_s16(v_dequant, v_zero); uint16x4_t v_mask_lo, v_mask_hi; int16x8_t v_eobmax = vdupq_n_s16(-1); - - (void)scan; + const int16_t *iscan = scan_order->iscan; // DC and first 3 AC v_mask_lo = highbd_quantize_fp_4(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, @@ -349,22 +344,21 @@ highbd_quantize_fp_32x32_4(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, } void vp9_highbd_quantize_fp_32x32_neon( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, - const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, - const int16_t *iscan) { - const int16x4_t v_quant = vld1_s16(quant_ptr); + const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const struct macroblock_plane *mb_plane, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { + const int16x4_t v_quant = vld1_s16(mb_plane->quant_fp); const int16x4_t v_dequant = vld1_s16(dequant_ptr); const int16x4_t v_zero = vdup_n_s16(0); const int16x4_t v_round = - vqrdmulh_n_s16(vld1_s16(round_ptr), (int16_t)(1 << 14)); + vqrdmulh_n_s16(vld1_s16(mb_plane->round_fp), (int16_t)(1 << 14)); int32x4_t v_round_s32 = vaddl_s16(v_round, v_zero); int32x4_t v_quant_s32 = vshlq_n_s32(vaddl_s16(v_quant, v_zero), 15); int32x4_t v_dequant_s32 = vaddl_s16(v_dequant, v_zero); uint16x4_t v_mask_lo, v_mask_hi; int16x8_t v_eobmax = vdupq_n_s16(-1); - - (void)scan; + const int16_t *iscan = scan_order->iscan; // DC and first 3 AC v_mask_lo = diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c index 0ddf8d3c9f..eded9f5c42 100644 --- a/vp9/encoder/vp9_encodemb.c +++ b/vp9/encoder/vp9_encodemb.c @@ -367,28 +367,24 @@ void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block, int row, int col, switch (tx_size) { case TX_32X32: highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); - vp9_highbd_quantize_fp_32x32(coeff, 1024, p->round_fp, p->quant_fp, - qcoeff, dqcoeff, pd->dequant, eob, - scan_order->scan, scan_order->iscan); + vp9_highbd_quantize_fp_32x32(coeff, 1024, p, qcoeff, dqcoeff, + pd->dequant, eob, scan_order); break; case TX_16X16: vpx_highbd_fdct16x16(src_diff, coeff, diff_stride); - vp9_highbd_quantize_fp(coeff, 256, p->round_fp, p->quant_fp, qcoeff, - dqcoeff, pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vp9_highbd_quantize_fp(coeff, 256, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); break; case TX_8X8: vpx_highbd_fdct8x8(src_diff, coeff, diff_stride); - vp9_highbd_quantize_fp(coeff, 64, p->round_fp, p->quant_fp, qcoeff, - dqcoeff, pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vp9_highbd_quantize_fp(coeff, 64, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); break; default: assert(tx_size == TX_4X4); x->fwd_txfm4x4(src_diff, coeff, diff_stride); - vp9_highbd_quantize_fp(coeff, 16, p->round_fp, p->quant_fp, qcoeff, - dqcoeff, pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vp9_highbd_quantize_fp(coeff, 16, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); break; } return; @@ -398,26 +394,25 @@ void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block, int row, int col, switch (tx_size) { case TX_32X32: fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); - vp9_quantize_fp_32x32(coeff, 1024, p->round_fp, p->quant_fp, qcoeff, - dqcoeff, pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vp9_quantize_fp_32x32(coeff, 1024, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); break; case TX_16X16: vpx_fdct16x16(src_diff, coeff, diff_stride); - vp9_quantize_fp(coeff, 256, p->round_fp, p->quant_fp, qcoeff, dqcoeff, - pd->dequant, eob, scan_order->scan, scan_order->iscan); + vp9_quantize_fp(coeff, 256, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); break; case TX_8X8: vpx_fdct8x8(src_diff, coeff, diff_stride); - vp9_quantize_fp(coeff, 64, p->round_fp, p->quant_fp, qcoeff, dqcoeff, - pd->dequant, eob, scan_order->scan, scan_order->iscan); + vp9_quantize_fp(coeff, 64, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); break; default: assert(tx_size == TX_4X4); x->fwd_txfm4x4(src_diff, coeff, diff_stride); - vp9_quantize_fp(coeff, 16, p->round_fp, p->quant_fp, qcoeff, dqcoeff, - pd->dequant, eob, scan_order->scan, scan_order->iscan); + vp9_quantize_fp(coeff, 16, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); break; } } diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c index 4a92802dcc..6f2524b36e 100644 --- a/vp9/encoder/vp9_pickmode.c +++ b/vp9/encoder/vp9_pickmode.c @@ -786,22 +786,19 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *this_rdc, switch (tx_size) { case TX_16X16: vpx_hadamard_16x16(src_diff, diff_stride, coeff); - vp9_quantize_fp(coeff, 256, p->round_fp, p->quant_fp, qcoeff, - dqcoeff, pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vp9_quantize_fp(coeff, 256, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); break; case TX_8X8: vpx_hadamard_8x8(src_diff, diff_stride, coeff); - vp9_quantize_fp(coeff, 64, p->round_fp, p->quant_fp, qcoeff, - dqcoeff, pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vp9_quantize_fp(coeff, 64, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); break; default: assert(tx_size == TX_4X4); x->fwd_txfm4x4(src_diff, coeff, diff_stride); - vp9_quantize_fp(coeff, 16, p->round_fp, p->quant_fp, qcoeff, - dqcoeff, pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vp9_quantize_fp(coeff, 16, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); break; } *skippable &= (*eob == 0); diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c index 115c66723d..19edf166d3 100644 --- a/vp9/encoder/vp9_quantize.c +++ b/vp9/encoder/vp9_quantize.c @@ -15,6 +15,7 @@ #include "vpx_ports/mem.h" #include "vp9/common/vp9_quant_common.h" +#include "vp9/common/vp9_scan.h" #include "vp9/common/vp9_seg_common.h" #include "vp9/encoder/vp9_encoder.h" @@ -22,12 +23,14 @@ #include "vp9/encoder/vp9_rd.h" void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *round_ptr, const int16_t *quant_ptr, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { int i, eob = -1; - (void)iscan; + const int16_t *round_ptr = mb_plane->round_fp; + const int16_t *quant_ptr = mb_plane->quant_fp; + const int16_t *scan = scan_order->scan; memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); @@ -53,15 +56,15 @@ void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, #if CONFIG_VP9_HIGHBITDEPTH void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *round_ptr, - const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, + const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { int i; int eob = -1; - - (void)iscan; + const int16_t *round_ptr = mb_plane->round_fp; + const int16_t *quant_ptr = mb_plane->quant_fp; + const int16_t *scan = scan_order->scan; memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); @@ -86,12 +89,14 @@ void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, // TODO(jingning) Refactor this file and combine functions with similar // operations. void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *round_ptr, const int16_t *quant_ptr, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { int i, eob = -1; - (void)iscan; + const int16_t *round_ptr = mb_plane->round_fp; + const int16_t *quant_ptr = mb_plane->quant_fp; + const int16_t *scan = scan_order->scan; memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); @@ -118,13 +123,14 @@ void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, #if CONFIG_VP9_HIGHBITDEPTH void vp9_highbd_quantize_fp_32x32_c( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, - const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, - const int16_t *iscan) { + const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { int i, eob = -1; - - (void)iscan; + const int16_t *round_ptr = mb_plane->round_fp; + const int16_t *quant_ptr = mb_plane->quant_fp; + const int16_t *scan = scan_order->scan; memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); diff --git a/vp9/encoder/vp9_tpl_model.c b/vp9/encoder/vp9_tpl_model.c index 8d203bbf4f..0a81175f73 100644 --- a/vp9/encoder/vp9_tpl_model.c +++ b/vp9/encoder/vp9_tpl_model.c @@ -497,18 +497,15 @@ static void get_quantize_error(MACROBLOCK *x, int plane, tran_low_t *coeff, #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - vp9_highbd_quantize_fp_32x32(coeff, pix_num, p->round_fp, p->quant_fp, - qcoeff, dqcoeff, pd->dequant, eob, - scan_order->scan, scan_order->iscan); + vp9_highbd_quantize_fp_32x32(coeff, pix_num, p, qcoeff, dqcoeff, + pd->dequant, eob, scan_order); } else { - vp9_quantize_fp_32x32(coeff, pix_num, p->round_fp, p->quant_fp, qcoeff, - dqcoeff, pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vp9_quantize_fp_32x32(coeff, pix_num, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); } #else - vp9_quantize_fp_32x32(coeff, pix_num, p->round_fp, p->quant_fp, qcoeff, - dqcoeff, pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vp9_quantize_fp_32x32(coeff, pix_num, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); #endif // CONFIG_VP9_HIGHBITDEPTH *recon_error = vp9_block_error(coeff, dqcoeff, pix_num, sse) >> shift; diff --git a/vp9/encoder/x86/vp9_quantize_avx2.c b/vp9/encoder/x86/vp9_quantize_avx2.c index e6aa71d58a..62af3a9212 100644 --- a/vp9/encoder/x86/vp9_quantize_avx2.c +++ b/vp9/encoder/x86/vp9_quantize_avx2.c @@ -16,6 +16,8 @@ #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_dsp/x86/bitdepth_conversion_avx2.h" #include "vpx_dsp/x86/quantize_sse2.h" +#include "vp9/common/vp9_scan.h" +#include "vp9/encoder/vp9_block.h" // Zero fill 8 positions in the output buffer. static VPX_FORCE_INLINE void store_zero_tran_low(tran_low_t *a) { @@ -29,11 +31,13 @@ static VPX_FORCE_INLINE void store_zero_tran_low(tran_low_t *a) { } static VPX_FORCE_INLINE void load_fp_values_avx2( - const int16_t *round_ptr, __m256i *round, const int16_t *quant_ptr, - __m256i *quant, const int16_t *dequant_ptr, __m256i *dequant) { - *round = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)round_ptr)); + const struct macroblock_plane *mb_plane, __m256i *round, __m256i *quant, + const int16_t *dequant_ptr, __m256i *dequant) { + *round = _mm256_castsi128_si256( + _mm_load_si128((const __m128i *)mb_plane->round_fp)); *round = _mm256_permute4x64_epi64(*round, 0x54); - *quant = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)quant_ptr)); + *quant = _mm256_castsi128_si256( + _mm_load_si128((const __m128i *)mb_plane->quant_fp)); *quant = _mm256_permute4x64_epi64(*quant, 0x54); *dequant = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)dequant_ptr)); @@ -98,13 +102,13 @@ static VPX_FORCE_INLINE void quantize_fp_16( } void vp9_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *round_ptr, const int16_t *quant_ptr, + const struct macroblock_plane *mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { __m256i round, quant, dequant, thr; __m256i eob_max = _mm256_setzero_si256(); - (void)scan; + const int16_t *iscan = scan_order->iscan; coeff_ptr += n_coeffs; iscan += n_coeffs; @@ -113,8 +117,7 @@ void vp9_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, n_coeffs = -n_coeffs; // Setup global values - load_fp_values_avx2(round_ptr, &round, quant_ptr, &quant, dequant_ptr, - &dequant); + load_fp_values_avx2(mb_plane, &round, &quant, dequant_ptr, &dequant); thr = _mm256_setzero_si256(); quantize_fp_16(&round, &quant, &dequant, &thr, coeff_ptr + n_coeffs, @@ -203,14 +206,13 @@ static VPX_FORCE_INLINE void quantize_fp_32x32_16( } void vp9_quantize_fp_32x32_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *round_ptr, - const int16_t *quant_ptr, + const struct macroblock_plane *mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { __m256i round, quant, dequant, thr; __m256i eob_max = _mm256_setzero_si256(); - (void)scan; + const int16_t *iscan = scan_order->iscan; coeff_ptr += n_coeffs; iscan += n_coeffs; @@ -219,8 +221,7 @@ void vp9_quantize_fp_32x32_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, n_coeffs = -n_coeffs; // Setup global values - load_fp_values_avx2(round_ptr, &round, quant_ptr, &quant, dequant_ptr, - &dequant); + load_fp_values_avx2(mb_plane, &round, &quant, dequant_ptr, &dequant); thr = _mm256_srli_epi16(dequant, 2); quant = _mm256_slli_epi16(quant, 1); { @@ -286,10 +287,10 @@ static VPX_FORCE_INLINE __m256i highbd_init_256(const int16_t *val_ptr) { } static VPX_FORCE_INLINE void highbd_load_fp_values( - const int16_t *round_ptr, __m256i *round, const int16_t *quant_ptr, - __m256i *quant, const int16_t *dequant_ptr, __m256i *dequant) { - *round = highbd_init_256(round_ptr); - *quant = highbd_init_256(quant_ptr); + const struct macroblock_plane *mb_plane, __m256i *round, __m256i *quant, + const int16_t *dequant_ptr, __m256i *dequant) { + *round = highbd_init_256(mb_plane->round_fp); + *quant = highbd_init_256(mb_plane->quant_fp); *dequant = highbd_init_256(dequant_ptr); } @@ -325,16 +326,15 @@ static VPX_FORCE_INLINE void highbd_quantize_fp( } void vp9_highbd_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *round_ptr, - const int16_t *quant_ptr, + const struct macroblock_plane *mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { const int step = 8; __m256i round, quant, dequant; __m256i eob_max = _mm256_setzero_si256(); - (void)scan; + const int16_t *iscan = scan_order->iscan; coeff_ptr += n_coeffs; iscan += n_coeffs; @@ -343,8 +343,7 @@ void vp9_highbd_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, n_coeffs = -n_coeffs; // Setup global values - highbd_load_fp_values(round_ptr, &round, quant_ptr, &quant, dequant_ptr, - &dequant); + highbd_load_fp_values(mb_plane, &round, &quant, dequant_ptr, &dequant); highbd_quantize_fp(&round, &quant, &dequant, coeff_ptr + n_coeffs, iscan + n_coeffs, qcoeff_ptr + n_coeffs, @@ -391,14 +390,14 @@ static VPX_FORCE_INLINE void highbd_quantize_fp_32x32( } void vp9_highbd_quantize_fp_32x32_avx2( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, - const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, - const int16_t *iscan) { + const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const struct macroblock_plane *mb_plane, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { const int step = 8; __m256i round, quant, dequant, thr; __m256i eob_max = _mm256_setzero_si256(); - (void)scan; + const int16_t *iscan = scan_order->iscan; coeff_ptr += n_coeffs; iscan += n_coeffs; @@ -407,8 +406,7 @@ void vp9_highbd_quantize_fp_32x32_avx2( n_coeffs = -n_coeffs; // Setup global values - highbd_load_fp_values(round_ptr, &round, quant_ptr, &quant, dequant_ptr, - &dequant); + highbd_load_fp_values(mb_plane, &round, &quant, dequant_ptr, &dequant); thr = _mm256_srli_epi32(dequant, 2); // Subtracting 1 here eliminates a _mm256_cmpeq_epi32() instruction when // calculating the zbin mask. diff --git a/vp9/encoder/x86/vp9_quantize_sse2.c b/vp9/encoder/x86/vp9_quantize_sse2.c index c877234436..67f03eb310 100644 --- a/vp9/encoder/x86/vp9_quantize_sse2.c +++ b/vp9/encoder/x86/vp9_quantize_sse2.c @@ -17,12 +17,14 @@ #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_dsp/x86/bitdepth_conversion_sse2.h" #include "vpx_dsp/x86/quantize_sse2.h" +#include "vp9/common/vp9_scan.h" +#include "vp9/encoder/vp9_block.h" void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *round_ptr, const int16_t *quant_ptr, + const struct macroblock_plane *mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { const __m128i zero = _mm_setzero_si128(); __m128i thr; int nzflag; @@ -31,11 +33,10 @@ void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, __m128i coeff0, coeff1, coeff0_sign, coeff1_sign; __m128i qcoeff0, qcoeff1; __m128i eob; - - (void)scan; + const int16_t *iscan = scan_order->iscan; // Setup global values. - load_fp_values(round_ptr, &round, quant_ptr, &quant, dequant_ptr, &dequant); + load_fp_values(mb_plane, &round, &quant, dequant_ptr, &dequant); // Do DC and first 15 AC. coeff0 = load_tran_low(coeff_ptr); diff --git a/vp9/encoder/x86/vp9_quantize_ssse3.c b/vp9/encoder/x86/vp9_quantize_ssse3.c index d35004e370..c94c8dbb4c 100644 --- a/vp9/encoder/x86/vp9_quantize_ssse3.c +++ b/vp9/encoder/x86/vp9_quantize_ssse3.c @@ -17,12 +17,14 @@ #include "vpx_dsp/x86/bitdepth_conversion_sse2.h" #include "vpx_dsp/x86/quantize_sse2.h" #include "vpx_dsp/x86/quantize_ssse3.h" +#include "vp9/common/vp9_scan.h" +#include "vp9/encoder/vp9_block.h" void vp9_quantize_fp_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *round_ptr, const int16_t *quant_ptr, + const struct macroblock_plane *mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { const __m128i zero = _mm_setzero_si128(); __m128i thr; int nzflag; @@ -31,11 +33,10 @@ void vp9_quantize_fp_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, __m128i coeff0, coeff1; __m128i qcoeff0, qcoeff1; __m128i eob; - - (void)scan; + const int16_t *iscan = scan_order->iscan; // Setup global values. - load_fp_values(round_ptr, &round, quant_ptr, &quant, dequant_ptr, &dequant); + load_fp_values(mb_plane, &round, &quant, dequant_ptr, &dequant); // Do DC and first 15 AC. coeff0 = load_tran_low(coeff_ptr); @@ -119,12 +120,11 @@ void vp9_quantize_fp_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, } void vp9_quantize_fp_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *round_ptr, - const int16_t *quant_ptr, + const struct macroblock_plane *mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { const __m128i zero = _mm_setzero_si128(); const __m128i one_s16 = _mm_set1_epi16(1); __m128i thr; @@ -134,11 +134,10 @@ void vp9_quantize_fp_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, __m128i coeff0, coeff1; __m128i qcoeff0, qcoeff1; __m128i eob; - - (void)scan; + const int16_t *iscan = scan_order->iscan; // Setup global values. - load_fp_values(round_ptr, &round, quant_ptr, &quant, dequant_ptr, &dequant); + load_fp_values(mb_plane, &round, &quant, dequant_ptr, &dequant); // The 32x32 halves round. round = _mm_add_epi16(round, one_s16); round = _mm_srli_epi16(round, 1); diff --git a/vpx_dsp/x86/quantize_sse2.h b/vpx_dsp/x86/quantize_sse2.h index 6de75e0568..82c755a0cf 100644 --- a/vpx_dsp/x86/quantize_sse2.h +++ b/vpx_dsp/x86/quantize_sse2.h @@ -56,12 +56,12 @@ static INLINE void load_b_values32x32( *shift = _mm_slli_epi16(*shift, 1); } -static INLINE void load_fp_values(const int16_t *round_ptr, __m128i *round, - const int16_t *quant_ptr, __m128i *quant, +static INLINE void load_fp_values(const struct macroblock_plane *mb_plane, + __m128i *round, __m128i *quant, const int16_t *dequant_ptr, __m128i *dequant) { - *round = _mm_load_si128((const __m128i *)round_ptr); - *quant = _mm_load_si128((const __m128i *)quant_ptr); + *round = _mm_load_si128((const __m128i *)mb_plane->round_fp); + *quant = _mm_load_si128((const __m128i *)mb_plane->quant_fp); *dequant = _mm_load_si128((const __m128i *)dequant_ptr); } From e7a4730fcc8f895bd08f8522fb6a72312e937c87 Mon Sep 17 00:00:00 2001 From: Johann Date: Fri, 28 Jul 2023 20:21:31 +0900 Subject: [PATCH 0851/1000] remove incorrect (void) n_coeffs is used in this function Change-Id: I5f5d2933304bb636a33e0fa294b4526edb65a08d --- vpx_dsp/arm/highbd_quantize_neon.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/vpx_dsp/arm/highbd_quantize_neon.c b/vpx_dsp/arm/highbd_quantize_neon.c index 0a0ab5e58a..c2ad34a695 100644 --- a/vpx_dsp/arm/highbd_quantize_neon.c +++ b/vpx_dsp/arm/highbd_quantize_neon.c @@ -176,9 +176,6 @@ void vpx_highbd_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, vst1_lane_u16(eob_ptr, eob_max_2, 0); } #endif // VPX_ARCH_AARCH64 - // Need these here, else the compiler complains about mixing declarations and - // code in C90 - (void)n_coeffs; } static VPX_FORCE_INLINE int32x4_t extract_sign_bit(int32x4_t a) { From 22818907d2597069ffc3400e80a6d5ad4df0097d Mon Sep 17 00:00:00 2001 From: Johann Date: Sat, 29 Jul 2023 05:44:56 +0900 Subject: [PATCH 0852/1000] normalize *const in rtcd Change-Id: Iece50143b43263c0c8f90299bedd7d2a5b9aa56b --- vp9/common/vp9_rtcd_defs.pl | 8 +-- vpx_dsp/vpx_dsp_rtcd_defs.pl | 112 +++++++++++++++++------------------ 2 files changed, 60 insertions(+), 60 deletions(-) diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index 8e00c4581d..980827b15a 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -131,10 +131,10 @@ () add_proto qw/int64_t vp9_block_error_fp/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size"; specialize qw/vp9_block_error_fp neon avx2 sse2/; -add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane * const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder * const scan_order"; +add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order"; specialize qw/vp9_quantize_fp neon sse2 ssse3 avx2 vsx/; -add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane * const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder * const scan_order"; +add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order"; specialize qw/vp9_quantize_fp_32x32 neon ssse3 avx2 vsx/; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { @@ -194,10 +194,10 @@ () # ENCODEMB INVOKE - add_proto qw/void vp9_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane * const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder * const scan_order"; + add_proto qw/void vp9_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order"; specialize qw/vp9_highbd_quantize_fp avx2 neon/; - add_proto qw/void vp9_highbd_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane * const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder * const scan_order" ; + add_proto qw/void vp9_highbd_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order" ; specialize qw/vp9_highbd_quantize_fp_32x32 avx2 neon/; # fdct functions diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 21ea1c8d5b..0f577398cf 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -722,17 +722,17 @@ () # Quantization # if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") { - add_proto qw/void vpx_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane * const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder * const scan_order"; + add_proto qw/void vpx_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order"; specialize qw/vpx_quantize_b neon sse2 ssse3 avx avx2 vsx lsx/; - add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, const struct macroblock_plane * const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder * const scan_order"; + add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order"; specialize qw/vpx_quantize_b_32x32 neon ssse3 avx avx2 vsx lsx/; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { - add_proto qw/void vpx_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane * const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder * const scan_order"; + add_proto qw/void vpx_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order"; specialize qw/vpx_highbd_quantize_b neon sse2 avx2/; - add_proto qw/void vpx_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, const struct macroblock_plane * const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder * const scan_order"; + add_proto qw/void vpx_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order"; specialize qw/vpx_highbd_quantize_b_32x32 neon sse2 avx2/; } # CONFIG_VP9_HIGHBITDEPTH } # CONFIG_VP9_ENCODER @@ -928,82 +928,82 @@ () # # Multi-block SAD, comparing a reference to N independent blocks # -add_proto qw/void vpx_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +add_proto qw/void vpx_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad64x64x4d avx512 avx2 neon msa sse2 vsx mmi lsx/; -add_proto qw/void vpx_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +add_proto qw/void vpx_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad64x32x4d neon msa sse2 vsx mmi lsx/; -add_proto qw/void vpx_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +add_proto qw/void vpx_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad32x64x4d neon msa sse2 vsx mmi lsx/; -add_proto qw/void vpx_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +add_proto qw/void vpx_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad32x32x4d avx2 neon msa sse2 vsx mmi lsx/; -add_proto qw/void vpx_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +add_proto qw/void vpx_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad32x16x4d neon msa sse2 vsx mmi/; -add_proto qw/void vpx_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +add_proto qw/void vpx_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad16x32x4d neon msa sse2 vsx mmi/; -add_proto qw/void vpx_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +add_proto qw/void vpx_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad16x16x4d neon msa sse2 vsx mmi lsx/; -add_proto qw/void vpx_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +add_proto qw/void vpx_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad16x8x4d neon msa sse2 vsx mmi/; -add_proto qw/void vpx_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +add_proto qw/void vpx_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad8x16x4d neon msa sse2 mmi/; -add_proto qw/void vpx_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +add_proto qw/void vpx_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad8x8x4d neon msa sse2 mmi lsx/; -add_proto qw/void vpx_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +add_proto qw/void vpx_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad8x4x4d neon msa sse2 mmi/; -add_proto qw/void vpx_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +add_proto qw/void vpx_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad4x8x4d neon msa sse2 mmi/; -add_proto qw/void vpx_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +add_proto qw/void vpx_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad4x4x4d neon msa sse2 mmi/; -add_proto qw/void vpx_sad_skip_64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +add_proto qw/void vpx_sad_skip_64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad_skip_64x64x4d neon avx2 sse2/; -add_proto qw/void vpx_sad_skip_64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +add_proto qw/void vpx_sad_skip_64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad_skip_64x32x4d neon avx2 sse2/; -add_proto qw/void vpx_sad_skip_32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +add_proto qw/void vpx_sad_skip_32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad_skip_32x64x4d neon avx2 sse2/; -add_proto qw/void vpx_sad_skip_32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +add_proto qw/void vpx_sad_skip_32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad_skip_32x32x4d neon avx2 sse2/; -add_proto qw/void vpx_sad_skip_32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +add_proto qw/void vpx_sad_skip_32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad_skip_32x16x4d neon avx2 sse2/; -add_proto qw/void vpx_sad_skip_16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +add_proto qw/void vpx_sad_skip_16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad_skip_16x32x4d neon sse2/; -add_proto qw/void vpx_sad_skip_16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +add_proto qw/void vpx_sad_skip_16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad_skip_16x16x4d neon sse2/; -add_proto qw/void vpx_sad_skip_16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +add_proto qw/void vpx_sad_skip_16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad_skip_16x8x4d neon sse2/; -add_proto qw/void vpx_sad_skip_8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +add_proto qw/void vpx_sad_skip_8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad_skip_8x16x4d neon sse2/; -add_proto qw/void vpx_sad_skip_8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +add_proto qw/void vpx_sad_skip_8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad_skip_8x8x4d neon sse2/; -add_proto qw/void vpx_sad_skip_8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +add_proto qw/void vpx_sad_skip_8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad_skip_8x4x4d neon/; -add_proto qw/void vpx_sad_skip_4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +add_proto qw/void vpx_sad_skip_4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad_skip_4x8x4d neon sse2/; -add_proto qw/void vpx_sad_skip_4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +add_proto qw/void vpx_sad_skip_4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad_skip_4x4x4d neon/; add_proto qw/uint64_t vpx_sum_squares_2d_i16/, "const int16_t *src, int stride, int size"; @@ -1162,82 +1162,82 @@ () # # Multi-block SAD, comparing a reference to N independent blocks # - add_proto qw/void vpx_highbd_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + add_proto qw/void vpx_highbd_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad64x64x4d sse2 neon avx2/; - add_proto qw/void vpx_highbd_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + add_proto qw/void vpx_highbd_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad64x32x4d sse2 neon avx2/; - add_proto qw/void vpx_highbd_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + add_proto qw/void vpx_highbd_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad32x64x4d sse2 neon avx2/; - add_proto qw/void vpx_highbd_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + add_proto qw/void vpx_highbd_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad32x32x4d sse2 neon avx2/; - add_proto qw/void vpx_highbd_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + add_proto qw/void vpx_highbd_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad32x16x4d sse2 neon avx2/; - add_proto qw/void vpx_highbd_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + add_proto qw/void vpx_highbd_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad16x32x4d sse2 neon avx2/; - add_proto qw/void vpx_highbd_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + add_proto qw/void vpx_highbd_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad16x16x4d sse2 neon avx2/; - add_proto qw/void vpx_highbd_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + add_proto qw/void vpx_highbd_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad16x8x4d sse2 neon avx2/; - add_proto qw/void vpx_highbd_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + add_proto qw/void vpx_highbd_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad8x16x4d sse2 neon/; - add_proto qw/void vpx_highbd_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + add_proto qw/void vpx_highbd_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad8x8x4d sse2 neon/; - add_proto qw/void vpx_highbd_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + add_proto qw/void vpx_highbd_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad8x4x4d sse2 neon/; - add_proto qw/void vpx_highbd_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + add_proto qw/void vpx_highbd_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad4x8x4d sse2 neon/; - add_proto qw/void vpx_highbd_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + add_proto qw/void vpx_highbd_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad4x4x4d sse2 neon/; - add_proto qw/void vpx_highbd_sad_skip_64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + add_proto qw/void vpx_highbd_sad_skip_64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad_skip_64x64x4d neon sse2 avx2/; - add_proto qw/void vpx_highbd_sad_skip_64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + add_proto qw/void vpx_highbd_sad_skip_64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad_skip_64x32x4d neon sse2 avx2/; - add_proto qw/void vpx_highbd_sad_skip_32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + add_proto qw/void vpx_highbd_sad_skip_32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad_skip_32x64x4d neon sse2 avx2/; - add_proto qw/void vpx_highbd_sad_skip_32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + add_proto qw/void vpx_highbd_sad_skip_32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad_skip_32x32x4d neon sse2 avx2/; - add_proto qw/void vpx_highbd_sad_skip_32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + add_proto qw/void vpx_highbd_sad_skip_32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad_skip_32x16x4d neon sse2 avx2/; - add_proto qw/void vpx_highbd_sad_skip_16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + add_proto qw/void vpx_highbd_sad_skip_16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad_skip_16x32x4d neon sse2 avx2/; - add_proto qw/void vpx_highbd_sad_skip_16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + add_proto qw/void vpx_highbd_sad_skip_16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad_skip_16x16x4d neon sse2 avx2/; - add_proto qw/void vpx_highbd_sad_skip_16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + add_proto qw/void vpx_highbd_sad_skip_16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad_skip_16x8x4d neon sse2 avx2/; - add_proto qw/void vpx_highbd_sad_skip_8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + add_proto qw/void vpx_highbd_sad_skip_8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad_skip_8x16x4d neon sse2/; - add_proto qw/void vpx_highbd_sad_skip_8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + add_proto qw/void vpx_highbd_sad_skip_8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad_skip_8x8x4d neon sse2/; - add_proto qw/void vpx_highbd_sad_skip_8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + add_proto qw/void vpx_highbd_sad_skip_8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad_skip_8x4x4d neon/; - add_proto qw/void vpx_highbd_sad_skip_4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + add_proto qw/void vpx_highbd_sad_skip_4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad_skip_4x8x4d neon sse2/; - add_proto qw/void vpx_highbd_sad_skip_4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + add_proto qw/void vpx_highbd_sad_skip_4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad_skip_4x4x4d neon/; # From 2b82efa769d9cfa70b856e2015bf82a1dc4cd8dc Mon Sep 17 00:00:00 2001 From: Yunqing Wang Date: Tue, 1 Aug 2023 20:58:18 -0700 Subject: [PATCH 0853/1000] Add a 10-bit test file Added a 10-bit test file for VP9 end-to-end c vs SIMD bit- exactness test. BUG=webm:1800 Change-Id: I4a864f1a740abee27049d68231adf2ec308f9a96 --- test/test-data.mk | 1 + test/test-data.sha1 | 1 + 2 files changed, 2 insertions(+) diff --git a/test/test-data.mk b/test/test-data.mk index 62a9d6ef14..9eabffae3e 100644 --- a/test/test-data.mk +++ b/test/test-data.mk @@ -29,6 +29,7 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += rush_hour_444.y4m LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += screendata.y4m LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += niklas_640_480_30.yuv LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += bus_352x288_420_f20_b8.yuv +LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += crowd_run_360p_10_150f.y4m # Test vectors LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-001.ivf diff --git a/test/test-data.sha1 b/test/test-data.sha1 index 55f92a25df..a9decc6b6b 100644 --- a/test/test-data.sha1 +++ b/test/test-data.sha1 @@ -870,3 +870,4 @@ bac455906360b45338a16dd626ac5f19bc36a307 *desktop_office1.1280_720-020.yuv d3964f9dad9f60363c81b688324d95b4ec7c8038 *invalid-bug-148271109.ivf.res ad18ca16f0a249fb3b7c38de0d9b327fed273f96 *hantro_collage_w352h288_nv12.yuv 8a0b2c350539859463d3546a67876c83ff6ff0ac *desktopqvga.320_240.yuv +ad9942a073e245585c93f764ea299382a65939a7 *crowd_run_360p_10_150f.y4m From 6075b1a36f8090b309a2620075ab832f63793106 Mon Sep 17 00:00:00 2001 From: Anupam Pandey Date: Tue, 13 Jun 2023 16:02:58 +0530 Subject: [PATCH 0854/1000] Add test to check bit exactness of C and SIMD in VP9 encoder This CL adds a shell script to test bit exactness of C and SIMD VP9 encoder for x86 platform. As C Vs NEON encoding outputs are not bit-exact (BUG=webm:1809), ARM tests are currently disabled. BUG=webm:1800 Change-Id: Iffcc70863e8cf83ccb5bc5be73e8866165697358 --- test/examples.sh | 2 +- test/vp9_c_vs_simd_encode.sh | 411 +++++++++++++++++++++++++++++++++++ 2 files changed, 412 insertions(+), 1 deletion(-) create mode 100644 test/vp9_c_vs_simd_encode.sh diff --git a/test/examples.sh b/test/examples.sh index 629f04239c..c15a367f3c 100755 --- a/test/examples.sh +++ b/test/examples.sh @@ -15,7 +15,7 @@ example_tests=$(ls $(dirname $0)/*.sh) # List of script names to exclude. -exclude_list="examples stress tools_common" +exclude_list="examples stress tools_common vp9_c_vs_simd_encode" # Filter out the scripts in $exclude_list. for word in ${exclude_list}; do diff --git a/test/vp9_c_vs_simd_encode.sh b/test/vp9_c_vs_simd_encode.sh new file mode 100644 index 0000000000..76df049d9a --- /dev/null +++ b/test/vp9_c_vs_simd_encode.sh @@ -0,0 +1,411 @@ +#!/bin/sh +## +## Copyright (c) 2023 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## +## This script checks the bit exactness between C and SIMD +## implementations of VP9 encoder. + +TEST_BITRATES="1600 6400" +PRESETS="good rt" +TEST_CLIPS="yuv_raw_input y4m_360p_10bit_input yuv_480p_raw_input y4m_720p_input" +OUT_FILE_SUFFIX=".ivf" +SCRIPT_DIR=$(dirname "$0") +LIBVPX_SOURCE_DIR=$(cd ${SCRIPT_DIR}/..; pwd) +devnull='> /dev/null 2>&1' + +# Clips used in test. +YUV_RAW_INPUT="${LIBVPX_TEST_DATA_PATH}/hantro_collage_w352h288.yuv" +YUV_480P_RAW_INPUT="${LIBVPX_TEST_DATA_PATH}/niklas_640_480_30.yuv" +Y4M_360P_10BIT_INPUT="${LIBVPX_TEST_DATA_PATH}/crowd_run_360p_10_150f.y4m" +Y4M_720P_INPUT="${LIBVPX_TEST_DATA_PATH}/niklas_1280_720_30.y4m" + +# Number of frames to test. +VP9_ENCODE_C_VS_SIMD_TEST_FRAME_LIMIT=20 + +# Create a temporary directory for output files. +if [ -n "${TMPDIR}" ]; then + VPX_TEST_TEMP_ROOT="${TMPDIR}" +elif [ -n "${TEMPDIR}" ]; then + VPX_TEST_TEMP_ROOT="${TEMPDIR}" +else + VPX_TEST_TEMP_ROOT=/tmp +fi + +VPX_TEST_OUTPUT_DIR="${VPX_TEST_TEMP_ROOT}/vpx_test_$$" + +if ! mkdir -p "${VPX_TEST_OUTPUT_DIR}" || \ + [ ! -d "${VPX_TEST_OUTPUT_DIR}" ]; then + echo "${0##*/}: Cannot create output directory, giving up." + echo "${0##*/}: VPX_TEST_OUTPUT_DIR=${VPX_TEST_OUTPUT_DIR}" + exit 1 +fi + +elog() { + echo "$@" 1>&2 +} + +# Echoes path to $1 when it's executable and exists in ${VPX_TEST_OUTPUT_DIR}, +# or an empty string. Caller is responsible for testing the string once the +# function returns. +vp9_enc_tool_path() { + local target="$1" + local tool_path="${VPX_TEST_OUTPUT_DIR}/build_target_${target}/vpxenc" + + if [ ! -x "${tool_path}" ]; then + tool_path="" + fi + echo "${tool_path}" +} + +# Environment check: Make sure input and source directories are available. +vp9_c_vs_simd_enc_verify_environment () { + if [ ! -e "${YUV_RAW_INPUT}" ]; then + elog "libvpx test data must exist in LIBVPX_TEST_DATA_PATH." + return 1 + fi + if [ ! -e "${YUV_480P_RAW_INPUT}" ]; then + elog "libvpx test data must exist in LIBVPX_TEST_DATA_PATH." + return 1 + fi + if [ ! -e "${Y4M_720P_INPUT}" ]; then + elog "libvpx test data must exist in LIBVPX_TEST_DATA_PATH." + return 1 + fi + if [ ! -e "${Y4M_360P_10BIT_INPUT}" ]; then + elog "libvpx test data must exist in LIBVPX_TEST_DATA_PATH." + return 1 + fi + if [ ! -d "$LIBVPX_SOURCE_DIR" ]; then + elog "LIBVPX_SOURCE_DIR does not exist." + return 1 + fi +} + +cleanup() { + rm -rf ${VPX_TEST_OUTPUT_DIR} +} + +# Echo VPX_SIMD_CAPS_MASK for different instruction set architecture. +avx512f() { + echo "0x1FF" +} + +avx2() { + echo "0x0FF" +} + +sse4_1() { + echo "0x03F" +} + +ssse3() { + echo "0x01F" +} + +sse2() { + echo "0x007" +} + +# Echo clip details to be used as input to vpxenc. +yuv_raw_input() { + echo ""${YUV_RAW_INPUT}" + --width=352 + --height=288 + --bit-depth=8 + --profile=0" +} + +yuv_480p_raw_input() { + echo ""${YUV_480P_RAW_INPUT}" + --width=640 + --height=480 + --bit-depth=8 + --profile=0" +} + +y4m_720p_input() { + echo ""${Y4M_720P_INPUT}" + --bit-depth=8 + --profile=0" +} + +y4m_360p_10bit_input() { + echo ""${Y4M_360P_10BIT_INPUT}" + --bit-depth=10 + --profile=2" +} + +has_x86_isa_extn() { + instruction_set=$1 + grep -q "$instruction_set" /proc/cpuinfo + if [ $? -eq 1 ]; then + return 1 + fi +} + +# Echo good encode params for use with VP9 encoder. +vp9_encode_good_params() { + echo "--codec=vp9 \ + --good \ + --test-decode=fatal \ + --ivf \ + --threads=1 \ + --static-thresh=0 \ + --tile-columns=0 \ + --end-usage=vbr \ + --kf-max-dist=160 \ + --kf-min-dist=0 \ + --lag-in-frames=19 \ + --max-q=63 \ + --min-q=0 \ + --passes=2 \ + --undershoot-pct=100 \ + --overshoot-pct=100 \ + --verbose \ + --auto-alt-ref=1 \ + --drop-frame=0 \ + --bias-pct=50 \ + --minsection-pct=0 \ + --maxsection-pct=2000 \ + --arnr-maxframes=7 \ + --arnr-strength=5 \ + --sharpness=0 \ + --frame-parallel=0" +} + +# Echo realtime encode params for use with VP9 encoder. +vp9_encode_rt_params() { + echo "--codec=vp9 \ + --rt \ + --test-decode=fatal \ + --ivf \ + --threads=1 \ + --static-thresh=0 \ + --tile-columns=0 \ + --tile-rows=0 \ + --end-usage=cbr \ + --kf-max-dist=90000 \ + --lag-in-frames=0 \ + --max-q=58 \ + --min-q=2 \ + --passes=1 \ + --undershoot-pct=50 \ + --overshoot-pct=50 \ + --verbose \ + --row-mt=0 \ + --buf-sz=1000 \ + --buf-initial-sz=500 \ + --buf-optimal-sz=600 \ + --max-intra-rate=300 \ + --resize-allowed=0 \ + --noise-sensitivity=0 \ + --aq-mode=3 \ + --error-resilient=0" +} + +# Configures for the given target in VPX_TEST_OUTPUT_DIR/build_target_${target} +# directory. +vp9_enc_build() { + local target=$1 + local configure="$2" + local tmp_build_dir=${VPX_TEST_OUTPUT_DIR}/build_target_${target} + mkdir -p $tmp_build_dir + cd $tmp_build_dir + + echo "Building target: ${target}" + local config_args="--disable-install-docs \ + --enable-unit-tests \ + --enable-debug \ + --enable-postproc \ + --enable-vp9-postproc \ + --enable-vp9-temporal-denoising \ + --enable-vp9-highbitdepth" + + eval "$configure" --target="${target}" "${config_args}" ${devnull} + eval make -j$(nproc) ${devnull} + echo "Done building target: ${target}" +} + +compare_enc_output() { + local target=$1 + local cpu=$2 + local clip=$3 + local bitrate=$4 + local preset=$5 + diff ${VPX_TEST_OUTPUT_DIR}/Out-generic-gnu-"${clip}"-${preset}-${bitrate}kbps-cpu${cpu}${OUT_FILE_SUFFIX} \ + ${VPX_TEST_OUTPUT_DIR}/Out-${target}-"${clip}"-${preset}-${bitrate}kbps-cpu${cpu}${OUT_FILE_SUFFIX} > /dev/null + if [ $? -eq 1 ]; then + elog "C vs ${target} encode mismatches for ${clip}, at ${bitrate} kbps, speed ${cpu}, ${preset} preset" + return 1 + fi +} + +vp9_enc_test() { + local encoder="$1" + local target=$2 + if [ -z "$(vp9_enc_tool_path "${target}")" ]; then + elog "vpxenc not found. It must exist in ${VPX_TEST_OUTPUT_DIR}/build_target_${target} path" + return 1 + fi + + for preset in ${PRESETS}; do + if [ "${preset}" = "good" ]; then + local max_cpu_used=5 + local test_params=vp9_encode_good_params + elif [ "${preset}" = "rt" ]; then + local max_cpu_used=9 + local test_params=vp9_encode_rt_params + else + elog "Invalid preset" + return 1 + fi + + for cpu in $(seq 0 $max_cpu_used); do + for clip in ${TEST_CLIPS}; do + for bitrate in ${TEST_BITRATES}; do + eval "${encoder}" $($clip) $($test_params) \ + "--limit=${VP9_ENCODE_C_VS_SIMD_TEST_FRAME_LIMIT}" \ + "--cpu-used=${cpu}" "--target-bitrate=${bitrate}" "-o" \ + ${VPX_TEST_OUTPUT_DIR}/Out-${target}-"${clip}"-${preset}-${bitrate}kbps-cpu${cpu}${OUT_FILE_SUFFIX} \ + ${devnull} + + if [ "${target}" != "generic-gnu" ]; then + compare_enc_output ${target} $cpu ${clip} $bitrate ${preset} + if [ $? -eq 1 ]; then + return 1 + fi + fi + done + done + done + done +} + +vp9_test_generic() { + local configure="$LIBVPX_SOURCE_DIR/configure" + local target="generic-gnu" + + echo "Build for: ${target}" + vp9_enc_build ${target} ${configure} + local encoder="$(vp9_enc_tool_path "${target}")" + vp9_enc_test $encoder "${target}" +} + +# This function encodes VP9 bitstream by enabling SSE2, SSSE3, SSE4_1, AVX2, AVX512f as there are +# no functions with MMX, SSE, SSE3 and AVX specialization. +# The value of environment variable 'VPX_SIMD_CAPS' controls enabling of different instruction +# set extension optimizations. The value of the flag 'VPX_SIMD_CAPS' and the corresponding +# instruction set extension optimization enabled are as follows: +# AVX512 AVX2 AVX SSE4_1 SSSE3 SSE3 SSE2 SSE MMX +# 1 1 1 1 1 1 1 1 1 -> 0x1FF -> Enable AVX512 and lower variants +# 0 1 1 1 1 1 1 1 1 -> 0x0FF -> Enable AVX2 and lower variants +# 0 0 1 1 1 1 1 1 1 -> 0x07F -> Enable AVX and lower variants +# 0 0 0 1 1 1 1 1 1 -> 0x03F -> Enable SSE4_1 and lower variants +# 0 0 0 0 1 1 1 1 1 -> 0x01F -> Enable SSSE3 and lower variants +# 0 0 0 0 0 1 1 1 1 -> 0x00F -> Enable SSE3 and lower variants +# 0 0 0 0 0 0 1 1 1 -> 0x007 -> Enable SSE2 and lower variants +# 0 0 0 0 0 0 0 1 1 -> 0x003 -> Enable SSE and lower variants +# 0 0 0 0 0 0 0 0 1 -> 0x001 -> Enable MMX +## NOTE: In x86_64 platform, it is not possible to enable sse/mmx/c using "VPX_SIMD_CAPS_MASK" as +# all x86_64 platforms implement sse2. +vp9_test_x86() { + local arch=$1 + + uname -m | grep -q "x86" + if [ $? -eq 1 ]; then + elog "Machine architecture is not x86 or x86_64" + return 0 + fi + + if [ $arch = "x86" ]; then + local target="x86-linux-gcc" + elif [ $arch = "x86_64" ]; then + local target="x86_64-linux-gcc" + fi + + local x86_isa_variants="avx512f avx2 sse4_1 ssse3 sse2" + local configure="$LIBVPX_SOURCE_DIR/configure" + + echo "Build for x86: ${target}" + vp9_enc_build ${target} ${configure} + local encoder="$(vp9_enc_tool_path "${target}")" + for isa in $x86_isa_variants; do + has_x86_isa_extn $isa + if [ $? -eq 1 ]; then + echo "${isa} is not supported in this machine" + continue + fi + export VPX_SIMD_CAPS_MASK=$($isa) + vp9_enc_test $encoder ${target} + if [ $? -eq 1 ]; then + return 1 + fi + unset VPX_SIMD_CAPS_MASK + done +} + +vp9_test_arm() { + local target="armv8-linux-gcc" + local configure="CROSS=aarch64-linux-gnu- $LIBVPX_SOURCE_DIR/configure --extra-cflags=-march=armv8.4-a \ + --extra-cxxflags=-march=armv8.4-a" + echo "Build for arm64: ${target}" + vp9_enc_build ${target} "${configure}" + + local encoder="$(vp9_enc_tool_path "${target}")" + vp9_enc_test "qemu-aarch64 -L /usr/aarch64-linux-gnu ${encoder}" ${target} + if [ $? -eq 1 ]; then + return 1 + fi +} + +vp9_c_vs_simd_enc_test () { + # Test Generic + vp9_test_generic + + # Test x86 (32 bit) + echo "vp9 test for x86 (32 bit): Started." + vp9_test_x86 "x86" + if [ $? -eq 1 ]; then + echo "vp9 test for x86 (32 bit): Done, test failed." + else + echo "vp9 test for x86 (32 bit): Done, all tests passed." + fi + + # Test x86_64 (64 bit) + if [ "$(eval uname -m)" = "x86_64" ]; then + echo "vp9 test for x86_64 (64 bit): Started." + vp9_test_x86 "x86_64" + if [ $? -eq 1 ]; then + echo "vp9 test for x86_64 (64 bit): Done, test failed." + else + echo "vp9 test for x86_64 (64 bit): Done, all tests passed." + fi + fi + + ##TODO(BUG=webm:1809): Enable testing for ARM after issues with NEON intrinsic + # are resolved. + # Test ARM + # echo "vp9_test_arm: Started." + # vp9_test_arm + # if [ $? -eq 1 ]; then + # echo "vp9 test for arm: Done, test failed." + # else + # echo "vp9 test for arm: Done, all tests passed." + # fi +} + +# Setup a trap function to clean up build, and output files after tests complete. +trap cleanup EXIT + +vp9_c_vs_simd_enc_verify_environment +if [ $? -eq 1 ]; then + echo "Environment check failed." + exit 1 +fi +vp9_c_vs_simd_enc_test From 2f2761c261a242ba696321d18976f6453a0ad822 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Tue, 1 Aug 2023 11:00:20 -0400 Subject: [PATCH 0855/1000] vp9 ext rc: Add callback for tpl stats Added test Bug: b/294049605 Change-Id: I3967a0f915e1a6e7a0d34d04732c33e1ca6f35e7 --- test/vp9_ext_ratectrl_test.cc | 21 +++++++++++++++++++++ vp9/encoder/vp9_ext_ratectrl.c | 15 +++++++++++++++ vp9/encoder/vp9_ext_ratectrl.h | 4 ++++ vp9/encoder/vp9_tpl_model.c | 9 +++++++++ vpx/vpx_ext_ratectrl.h | 19 ++++++++++++++++++- vpx/vpx_tpl.h | 1 + 6 files changed, 68 insertions(+), 1 deletion(-) diff --git a/test/vp9_ext_ratectrl_test.cc b/test/vp9_ext_ratectrl_test.cc index 739c0b7f8e..a7248bcec4 100644 --- a/test/vp9_ext_ratectrl_test.cc +++ b/test/vp9_ext_ratectrl_test.cc @@ -18,6 +18,7 @@ #include "third_party/googletest/src/include/gtest/gtest.h" #include "vp9/simple_encode.h" #include "vpx/vpx_ext_ratectrl.h" +#include "vpx/vpx_tpl.h" #include "vpx_dsp/vpx_dsp_common.h" namespace { @@ -151,6 +152,19 @@ vpx_rc_status_t rc_send_firstpass_stats_gop_short( return VPX_RC_OK; } +vpx_rc_status_t rc_send_tpl_gop_stats(vpx_rc_model_t rate_ctrl_model, + const VpxTplGopStats *tpl_gop_stats) { + const ToyRateCtrl *toy_rate_ctrl = + static_cast(rate_ctrl_model); + EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); + EXPECT_GT(tpl_gop_stats->size, 0); + + for (int i = 0; i < tpl_gop_stats->size; ++i) { + EXPECT_GT(tpl_gop_stats->frame_stats_list[i].num_blocks, 0); + } + return VPX_RC_OK; +} + vpx_rc_status_t rc_get_encodeframe_decision( vpx_rc_model_t rate_ctrl_model, const vpx_rc_encodeframe_info_t *encode_frame_info, @@ -679,6 +693,7 @@ class ExtRateCtrlTest : public ::libvpx_test::EncoderTest, ::libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { vpx_rc_funcs_t rc_funcs; + memset(&rc_funcs, 0, sizeof(rc_funcs)); rc_funcs.rc_type = VPX_RC_QP; rc_funcs.create_model = rc_create_model; rc_funcs.send_firstpass_stats = rc_send_firstpass_stats; @@ -722,9 +737,11 @@ class ExtRateCtrlTestGOP : public ::libvpx_test::EncoderTest, encoder->Control(VP9E_SET_MAX_GF_INTERVAL, kDefaultMaxGfInterval); vpx_rc_funcs_t rc_funcs; + memset(&rc_funcs, 0, sizeof(rc_funcs)); rc_funcs.rc_type = VPX_RC_GOP_QP; rc_funcs.create_model = rc_create_model_gop; rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop; + rc_funcs.send_tpl_gop_stats = rc_send_tpl_gop_stats; rc_funcs.get_encodeframe_decision = rc_get_encodeframe_decision_gop; rc_funcs.get_gop_decision = rc_get_gop_decision; rc_funcs.update_encodeframe_result = rc_update_encodeframe_result_gop; @@ -769,6 +786,7 @@ class ExtRateCtrlTestGOPShort : public ::libvpx_test::EncoderTest, encoder->Control(VP9E_SET_TARGET_LEVEL, vp9::LEVEL_AUTO); vpx_rc_funcs_t rc_funcs; + memset(&rc_funcs, 0, sizeof(rc_funcs)); rc_funcs.rc_type = VPX_RC_GOP_QP; rc_funcs.create_model = rc_create_model_gop_short; rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop_short; @@ -817,6 +835,7 @@ class ExtRateCtrlTestGOPShortOverlay encoder->Control(VP9E_SET_TARGET_LEVEL, vp9::LEVEL_AUTO); vpx_rc_funcs_t rc_funcs; + memset(&rc_funcs, 0, sizeof(rc_funcs)); rc_funcs.rc_type = VPX_RC_GOP_QP; rc_funcs.create_model = rc_create_model_gop_short; rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop_short; @@ -866,6 +885,7 @@ class ExtRateCtrlTestGOPShortNoARF encoder->Control(VP9E_SET_TARGET_LEVEL, vp9::LEVEL_AUTO); vpx_rc_funcs_t rc_funcs; + memset(&rc_funcs, 0, sizeof(rc_funcs)); rc_funcs.rc_type = VPX_RC_GOP_QP; rc_funcs.create_model = rc_create_model_gop_short; rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop_short; @@ -920,6 +940,7 @@ class ExtRateCtrlTestRdmult : public ::libvpx_test::EncoderTest, ::libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { vpx_rc_funcs_t rc_funcs; + memset(&rc_funcs, 0, sizeof(rc_funcs)); rc_funcs.rc_type = VPX_RC_GOP_QP_RDMULT; rc_funcs.create_model = rc_create_model_gop_short; rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop_short; diff --git a/vp9/encoder/vp9_ext_ratectrl.c b/vp9/encoder/vp9_ext_ratectrl.c index 09253403b8..6d8cf566d8 100644 --- a/vp9/encoder/vp9_ext_ratectrl.c +++ b/vp9/encoder/vp9_ext_ratectrl.c @@ -119,6 +119,21 @@ vpx_codec_err_t vp9_extrc_send_firstpass_stats( return VPX_CODEC_OK; } +vpx_codec_err_t vp9_extrc_send_tpl_stats(EXT_RATECTRL *ext_ratectrl, + const VpxTplGopStats *tpl_gop_stats) { + if (ext_ratectrl == NULL) { + return VPX_CODEC_INVALID_PARAM; + } + if (ext_ratectrl->ready && ext_ratectrl->funcs.send_tpl_gop_stats != NULL) { + vpx_rc_status_t rc_status = ext_ratectrl->funcs.send_tpl_gop_stats( + ext_ratectrl->model, tpl_gop_stats); + if (rc_status == VPX_RC_ERROR) { + return VPX_CODEC_ERROR; + } + } + return VPX_CODEC_OK; +} + static int extrc_get_frame_type(FRAME_UPDATE_TYPE update_type) { // TODO(angiebird): Add unit test to make sure this function behaves like // get_frame_type_from_update_type() diff --git a/vp9/encoder/vp9_ext_ratectrl.h b/vp9/encoder/vp9_ext_ratectrl.h index 7c38758833..b04580c1d4 100644 --- a/vp9/encoder/vp9_ext_ratectrl.h +++ b/vp9/encoder/vp9_ext_ratectrl.h @@ -12,6 +12,7 @@ #define VPX_VP9_ENCODER_VP9_EXT_RATECTRL_H_ #include "vpx/vpx_ext_ratectrl.h" +#include "vpx/vpx_tpl.h" #include "vp9/encoder/vp9_firstpass.h" typedef struct EXT_RATECTRL { @@ -34,6 +35,9 @@ vpx_codec_err_t vp9_extrc_delete(EXT_RATECTRL *ext_ratectrl); vpx_codec_err_t vp9_extrc_send_firstpass_stats( EXT_RATECTRL *ext_ratectrl, const FIRST_PASS_INFO *first_pass_info); +vpx_codec_err_t vp9_extrc_send_tpl_stats(EXT_RATECTRL *ext_ratectrl, + const VpxTplGopStats *tpl_gop_stats); + vpx_codec_err_t vp9_extrc_get_encodeframe_decision( EXT_RATECTRL *ext_ratectrl, int show_index, int coding_index, int gop_index, FRAME_UPDATE_TYPE update_type, int gop_size, int use_alt_ref, diff --git a/vp9/encoder/vp9_tpl_model.c b/vp9/encoder/vp9_tpl_model.c index 0a81175f73..c45404c256 100644 --- a/vp9/encoder/vp9_tpl_model.c +++ b/vp9/encoder/vp9_tpl_model.c @@ -1505,6 +1505,15 @@ void vp9_setup_tpl_stats(VP9_COMP *cpi) { // Qmode. trim_tpl_stats(&cpi->common.error, &cpi->tpl_gop_stats, extended_frame_count); + if (cpi->ext_ratectrl.ready) { + const vpx_codec_err_t codec_status = + vp9_extrc_send_tpl_stats(&cpi->ext_ratectrl, &cpi->tpl_gop_stats); + if (codec_status != VPX_CODEC_OK) { + vpx_internal_error(&cpi->common.error, codec_status, + "vp9_extrc_send_tpl_stats() failed"); + } + } + #if CONFIG_NON_GREEDY_MV cpi->tpl_ready = 1; #if DUMP_TPL_STATS diff --git a/vpx/vpx_ext_ratectrl.h b/vpx/vpx_ext_ratectrl.h index 2c312858b8..1c67c8deb4 100644 --- a/vpx/vpx_ext_ratectrl.h +++ b/vpx/vpx_ext_ratectrl.h @@ -16,6 +16,7 @@ extern "C" { #endif #include "./vpx_integer.h" +#include "vpx/vpx_tpl.h" /*!\brief Current ABI version number * @@ -25,7 +26,7 @@ extern "C" { * types, removing or reassigning enums, adding/removing/rearranging * fields to structures. */ -#define VPX_EXT_RATECTRL_ABI_VERSION (6) +#define VPX_EXT_RATECTRL_ABI_VERSION (7) /*!\brief The control type of the inference API. * In VPX_RC_QP mode, the external rate control model determines the @@ -410,6 +411,18 @@ typedef vpx_rc_status_t (*vpx_rc_send_firstpass_stats_cb_fn_t)( vpx_rc_model_t rate_ctrl_model, const vpx_rc_firstpass_stats_t *first_pass_stats); +/*!\brief Send TPL stats for the current GOP to the external rate control model + * callback prototype + * + * This callback is invoked by the encoder to send TPL stats for the GOP to the + * external rate control model. + * + * \param[in] rate_ctrl_model rate control model + * \param[in] tpl_gop_stats TPL stats for current GOP + */ +typedef vpx_rc_status_t (*vpx_rc_send_tpl_gop_stats_cb_fn_t)( + vpx_rc_model_t rate_ctrl_model, const VpxTplGopStats *tpl_gop_stats); + /*!\brief Receive encode frame decision callback prototype * * This callback is invoked by the encoder to receive encode frame decision from @@ -491,6 +504,10 @@ typedef struct vpx_rc_funcs { * Send first pass stats to the external rate control model. */ vpx_rc_send_firstpass_stats_cb_fn_t send_firstpass_stats; + /*! + * Send TPL stats for current GOP to the external rate control model. + */ + vpx_rc_send_tpl_gop_stats_cb_fn_t send_tpl_gop_stats; /*! * Get encodeframe decision from the external rate control model. */ diff --git a/vpx/vpx_tpl.h b/vpx/vpx_tpl.h index 50aec49eb6..61473168f4 100644 --- a/vpx/vpx_tpl.h +++ b/vpx/vpx_tpl.h @@ -18,6 +18,7 @@ #include #include "./vpx_integer.h" +#include "vpx/vpx_codec.h" #ifdef __cplusplus extern "C" { From 44f2819298fdffd50bc877ac55e575fbff9bd541 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Thu, 3 Aug 2023 14:07:55 -0400 Subject: [PATCH 0856/1000] vp9_quantize_fp_neon: Same params name as in decl Clear some clang-tidy warnings Change-Id: Iea4c4e77b3d515ec6384bd34875a0002ab13c14c --- vp9/common/vp9_rtcd_defs.pl | 2 +- vp9/encoder/arm/neon/vp9_quantize_neon.c | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index 980827b15a..3ecbd5417f 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -197,7 +197,7 @@ () add_proto qw/void vp9_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order"; specialize qw/vp9_highbd_quantize_fp avx2 neon/; - add_proto qw/void vp9_highbd_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order" ; + add_proto qw/void vp9_highbd_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order"; specialize qw/vp9_highbd_quantize_fp_32x32 avx2 neon/; # fdct functions diff --git a/vp9/encoder/arm/neon/vp9_quantize_neon.c b/vp9/encoder/arm/neon/vp9_quantize_neon.c index e8cb78dbfc..968cdc6d11 100644 --- a/vp9/encoder/arm/neon/vp9_quantize_neon.c +++ b/vp9/encoder/arm/neon/vp9_quantize_neon.c @@ -116,7 +116,7 @@ static VPX_FORCE_INLINE void quantize_fp_8( *v_eobmax = get_max_lane_eob(iscan_ptr, *v_eobmax, v_nz_mask); } -void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t count, +void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, @@ -135,7 +135,7 @@ void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t count, // now process the rest of the ac coeffs update_fp_values(&v_round, &v_quant, &v_dequant); - for (i = 8; i < count; i += 8) { + for (i = 8; i < n_coeffs; i += 8) { quantize_fp_8(&v_round, &v_quant, &v_dequant, coeff_ptr + i, iscan + i, qcoeff_ptr + i, dqcoeff_ptr + i, &v_eobmax); } @@ -184,7 +184,7 @@ static VPX_FORCE_INLINE void quantize_fp_32x32_8( *v_eobmax = get_max_lane_eob(iscan_ptr, *v_eobmax, v_nz_mask); } -void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t count, +void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, @@ -199,7 +199,7 @@ void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t count, int i; const int16_t *iscan = scan_order->iscan; - (void)count; + (void)n_coeffs; // Process dc and the first seven ac coeffs. quantize_fp_32x32_8(&round, &quant, &dequant, &dequant_thresh, coeff_ptr, From f6aaad370d0d02ddea7d5361dce17c69d679c202 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Fri, 4 Aug 2023 14:18:33 -0400 Subject: [PATCH 0857/1000] Fix include path fpr vpx_tpl.h,vpx_ext_ratectrl.h Bug: b/294049605 Change-Id: I6422fc4250c2192f985cce2e296a19a05934969b --- vpx/vpx_ext_ratectrl.h | 2 +- vpx/vpx_tpl.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vpx/vpx_ext_ratectrl.h b/vpx/vpx_ext_ratectrl.h index 1c67c8deb4..b93df11cd5 100644 --- a/vpx/vpx_ext_ratectrl.h +++ b/vpx/vpx_ext_ratectrl.h @@ -16,7 +16,7 @@ extern "C" { #endif #include "./vpx_integer.h" -#include "vpx/vpx_tpl.h" +#include "./vpx_tpl.h" /*!\brief Current ABI version number * diff --git a/vpx/vpx_tpl.h b/vpx/vpx_tpl.h index 61473168f4..30d28de50f 100644 --- a/vpx/vpx_tpl.h +++ b/vpx/vpx_tpl.h @@ -18,7 +18,7 @@ #include #include "./vpx_integer.h" -#include "vpx/vpx_codec.h" +#include "./vpx_codec.h" #ifdef __cplusplus extern "C" { From fc29b8533e7c678d78dc1eb87c26076dcaef15ef Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Fri, 4 Aug 2023 16:12:29 -0400 Subject: [PATCH 0858/1000] Fix some clang-tidy warnings - Use zero initializer instead of memset to avoid including - Include vpx_codec.h for vpx_codec_err_t and error codes - Include vpx_tpl.h for VpxTplGopStats Change-Id: Iac5837ce2173bd945bfe8eeb401ff4dfd04fd2e1 --- test/vp9_ext_ratectrl_test.cc | 18 ++++++------------ vp9/encoder/vp9_ext_ratectrl.c | 2 ++ vp9/encoder/vp9_tpl_model.c | 1 + 3 files changed, 9 insertions(+), 12 deletions(-) diff --git a/test/vp9_ext_ratectrl_test.cc b/test/vp9_ext_ratectrl_test.cc index a7248bcec4..e0107b2d26 100644 --- a/test/vp9_ext_ratectrl_test.cc +++ b/test/vp9_ext_ratectrl_test.cc @@ -692,8 +692,7 @@ class ExtRateCtrlTest : public ::libvpx_test::EncoderTest, void PreEncodeFrameHook(::libvpx_test::VideoSource *video, ::libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { - vpx_rc_funcs_t rc_funcs; - memset(&rc_funcs, 0, sizeof(rc_funcs)); + vpx_rc_funcs_t rc_funcs = {}; rc_funcs.rc_type = VPX_RC_QP; rc_funcs.create_model = rc_create_model; rc_funcs.send_firstpass_stats = rc_send_firstpass_stats; @@ -736,8 +735,7 @@ class ExtRateCtrlTestGOP : public ::libvpx_test::EncoderTest, encoder->Control(VP9E_SET_MIN_GF_INTERVAL, kDefaultMinGfInterval); encoder->Control(VP9E_SET_MAX_GF_INTERVAL, kDefaultMaxGfInterval); - vpx_rc_funcs_t rc_funcs; - memset(&rc_funcs, 0, sizeof(rc_funcs)); + vpx_rc_funcs_t rc_funcs = {}; rc_funcs.rc_type = VPX_RC_GOP_QP; rc_funcs.create_model = rc_create_model_gop; rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop; @@ -785,8 +783,7 @@ class ExtRateCtrlTestGOPShort : public ::libvpx_test::EncoderTest, encoder->Control(VP9E_SET_MAX_GF_INTERVAL, kDefaultMaxGfInterval); encoder->Control(VP9E_SET_TARGET_LEVEL, vp9::LEVEL_AUTO); - vpx_rc_funcs_t rc_funcs; - memset(&rc_funcs, 0, sizeof(rc_funcs)); + vpx_rc_funcs_t rc_funcs = {}; rc_funcs.rc_type = VPX_RC_GOP_QP; rc_funcs.create_model = rc_create_model_gop_short; rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop_short; @@ -834,8 +831,7 @@ class ExtRateCtrlTestGOPShortOverlay encoder->Control(VP9E_SET_MAX_GF_INTERVAL, kDefaultMaxGfInterval); encoder->Control(VP9E_SET_TARGET_LEVEL, vp9::LEVEL_AUTO); - vpx_rc_funcs_t rc_funcs; - memset(&rc_funcs, 0, sizeof(rc_funcs)); + vpx_rc_funcs_t rc_funcs = {}; rc_funcs.rc_type = VPX_RC_GOP_QP; rc_funcs.create_model = rc_create_model_gop_short; rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop_short; @@ -884,8 +880,7 @@ class ExtRateCtrlTestGOPShortNoARF encoder->Control(VP9E_SET_MAX_GF_INTERVAL, kDefaultMaxGfInterval); encoder->Control(VP9E_SET_TARGET_LEVEL, vp9::LEVEL_AUTO); - vpx_rc_funcs_t rc_funcs; - memset(&rc_funcs, 0, sizeof(rc_funcs)); + vpx_rc_funcs_t rc_funcs = {}; rc_funcs.rc_type = VPX_RC_GOP_QP; rc_funcs.create_model = rc_create_model_gop_short; rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop_short; @@ -939,8 +934,7 @@ class ExtRateCtrlTestRdmult : public ::libvpx_test::EncoderTest, void PreEncodeFrameHook(::libvpx_test::VideoSource *video, ::libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { - vpx_rc_funcs_t rc_funcs; - memset(&rc_funcs, 0, sizeof(rc_funcs)); + vpx_rc_funcs_t rc_funcs = {}; rc_funcs.rc_type = VPX_RC_GOP_QP_RDMULT; rc_funcs.create_model = rc_create_model_gop_short; rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop_short; diff --git a/vp9/encoder/vp9_ext_ratectrl.c b/vp9/encoder/vp9_ext_ratectrl.c index 6d8cf566d8..aa248f43f6 100644 --- a/vp9/encoder/vp9_ext_ratectrl.c +++ b/vp9/encoder/vp9_ext_ratectrl.c @@ -12,6 +12,8 @@ #include "vp9/encoder/vp9_encoder.h" #include "vp9/common/vp9_common.h" #include "vpx_dsp/psnr.h" +#include "vpx/vpx_codec.h" +#include "vpx/vpx_tpl.h" vpx_codec_err_t vp9_extrc_init(EXT_RATECTRL *ext_ratectrl) { if (ext_ratectrl == NULL) { diff --git a/vp9/encoder/vp9_tpl_model.c b/vp9/encoder/vp9_tpl_model.c index c45404c256..7b75815571 100644 --- a/vp9/encoder/vp9_tpl_model.c +++ b/vp9/encoder/vp9_tpl_model.c @@ -19,6 +19,7 @@ #include "vp9/common/vp9_scan.h" #include "vp9/encoder/vp9_encoder.h" #include "vp9/encoder/vp9_tpl_model.h" +#include "vpx/vpx_codec.h" static int init_gop_frames(VP9_COMP *cpi, GF_PICTURE *gf_picture, const GF_GROUP *gf_group, int *tpl_group_frames) { From d4b6132d2b9f65ba887a1a40029b1b5d61881470 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Mon, 7 Aug 2023 10:28:46 -0400 Subject: [PATCH 0859/1000] Fix more clang-tidy warnings - Include vpx/vpx_ext_ratectrl.h in vp9_ext_ratectrl.c - Include vpx/internal/vpx_codec_internal.h - Include for NULL Bug: b/294049605 Change-Id: Iedd8b3864da27fde1678bfa6606e6fc5630a7a09 --- vp9/encoder/vp9_ext_ratectrl.c | 3 +++ vp9/encoder/vp9_tpl_model.c | 1 + 2 files changed, 4 insertions(+) diff --git a/vp9/encoder/vp9_ext_ratectrl.c b/vp9/encoder/vp9_ext_ratectrl.c index aa248f43f6..4664e8c5e2 100644 --- a/vp9/encoder/vp9_ext_ratectrl.c +++ b/vp9/encoder/vp9_ext_ratectrl.c @@ -8,11 +8,14 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include + #include "vp9/encoder/vp9_ext_ratectrl.h" #include "vp9/encoder/vp9_encoder.h" #include "vp9/common/vp9_common.h" #include "vpx_dsp/psnr.h" #include "vpx/vpx_codec.h" +#include "vpx/vpx_ext_ratectrl.h" #include "vpx/vpx_tpl.h" vpx_codec_err_t vp9_extrc_init(EXT_RATECTRL *ext_ratectrl) { diff --git a/vp9/encoder/vp9_tpl_model.c b/vp9/encoder/vp9_tpl_model.c index 7b75815571..903ea8d753 100644 --- a/vp9/encoder/vp9_tpl_model.c +++ b/vp9/encoder/vp9_tpl_model.c @@ -19,6 +19,7 @@ #include "vp9/common/vp9_scan.h" #include "vp9/encoder/vp9_encoder.h" #include "vp9/encoder/vp9_tpl_model.h" +#include "vpx/internal/vpx_codec_internal.h" #include "vpx/vpx_codec.h" static int init_gop_frames(VP9_COMP *cpi, GF_PICTURE *gf_picture, From 242c7431700b75a640f2e94e876d4d7c02de8e16 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Mon, 7 Aug 2023 16:42:43 -0400 Subject: [PATCH 0860/1000] VP9 RC: Add pixel row/col of a TPL block Bug: b/294049605 Change-Id: I383a88a037a2a48a5fc1b9def6f991278c3665a8 --- vp9/encoder/vp9_tpl_model.c | 2 ++ vpx/vpx_tpl.h | 4 +++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/vp9/encoder/vp9_tpl_model.c b/vp9/encoder/vp9_tpl_model.c index 903ea8d753..909b05292b 100644 --- a/vp9/encoder/vp9_tpl_model.c +++ b/vp9/encoder/vp9_tpl_model.c @@ -400,6 +400,8 @@ static void tpl_store_before_propagation(VpxTplBlockStats *tpl_block_stats, for (idx = 0; idx < mi_width; ++idx) { VpxTplBlockStats *tpl_block_stats_ptr = &tpl_block_stats[(mi_row + idy) * stride + mi_col + idx]; + tpl_block_stats_ptr->row = mi_row * 8; + tpl_block_stats_ptr->col = mi_col * 8; tpl_block_stats_ptr->inter_cost = src_stats->inter_cost; tpl_block_stats_ptr->intra_cost = src_stats->intra_cost; tpl_block_stats_ptr->recrf_dist = recon_error << TPL_DEP_COST_SCALE_LOG2; diff --git a/vpx/vpx_tpl.h b/vpx/vpx_tpl.h index 30d28de50f..3828eb8f29 100644 --- a/vpx/vpx_tpl.h +++ b/vpx/vpx_tpl.h @@ -32,10 +32,12 @@ extern "C" { * types, removing or reassigning enums, adding/removing/rearranging * fields to structures */ -#define VPX_TPL_ABI_VERSION (1) /**<\hideinitializer*/ +#define VPX_TPL_ABI_VERSION (2) /**<\hideinitializer*/ /*!\brief Temporal dependency model stats for each block before propagation */ typedef struct VpxTplBlockStats { + int16_t row; /**< Pixel row of the top left corner */ + int16_t col; /**< Pixel col of the top left corner */ int64_t intra_cost; /**< Intra cost */ int64_t inter_cost; /**< Inter cost */ int16_t mv_r; /**< Motion vector row */ From 6e5fc000017f919de705d103fc4e6c1a41629c67 Mon Sep 17 00:00:00 2001 From: Yunqing Wang Date: Mon, 7 Aug 2023 13:10:41 -0700 Subject: [PATCH 0861/1000] Disable vpx_int_pro_row/col neon SIMD functions The vpx_int_pro_row/col neon SIMD version caused a mismatch between neon encoding vs c encoding. Disabled them for now to ensure the correctness of VP9 encoding on the arm platform. Since these 2 functions were not used much, so this wouldn't affect the overall encoder speed much. BUG=webm:1800 BUG=webm:1809 Change-Id: Id1a7d542fc03d4cf9fa1039a49832abf35fb722f --- test/avg_test.cc | 32 +++++---- vpx_dsp/arm/avg_neon.c | 130 +++++++++++++++++------------------ vpx_dsp/vpx_dsp_rtcd_defs.pl | 7 +- 3 files changed, 88 insertions(+), 81 deletions(-) diff --git a/test/avg_test.cc b/test/avg_test.cc index dbd3309ee4..b885c4de4f 100644 --- a/test/avg_test.cc +++ b/test/avg_test.cc @@ -681,19 +681,25 @@ INSTANTIATE_TEST_SUITE_P( make_tuple(16, 16, 5, 4, &vpx_avg_4x4_neon), make_tuple(32, 32, 15, 4, &vpx_avg_4x4_neon))); -INSTANTIATE_TEST_SUITE_P( - NEON, IntProRowTest, - ::testing::Values(make_tuple(16, &vpx_int_pro_row_neon, &vpx_int_pro_row_c), - make_tuple(32, &vpx_int_pro_row_neon, &vpx_int_pro_row_c), - make_tuple(64, &vpx_int_pro_row_neon, - &vpx_int_pro_row_c))); - -INSTANTIATE_TEST_SUITE_P( - NEON, IntProColTest, - ::testing::Values(make_tuple(16, &vpx_int_pro_col_neon, &vpx_int_pro_col_c), - make_tuple(32, &vpx_int_pro_col_neon, &vpx_int_pro_col_c), - make_tuple(64, &vpx_int_pro_col_neon, - &vpx_int_pro_col_c))); +// Disabled neon optimization since it caused mismatch. See details in: +// https://bugs.chromium.org/p/webm/issues/detail?id=1809 +// INSTANTIATE_TEST_SUITE_P( +// NEON, IntProRowTest, +// ::testing::Values(make_tuple(16, &vpx_int_pro_row_neon, +// &vpx_int_pro_row_c), +// make_tuple(32, &vpx_int_pro_row_neon, +// &vpx_int_pro_row_c), make_tuple(64, +// &vpx_int_pro_row_neon, +// &vpx_int_pro_row_c))); +// +// INSTANTIATE_TEST_SUITE_P( +// NEON, IntProColTest, +// ::testing::Values(make_tuple(16, &vpx_int_pro_col_neon, +// &vpx_int_pro_col_c), +// make_tuple(32, &vpx_int_pro_col_neon, +// &vpx_int_pro_col_c), make_tuple(64, +// &vpx_int_pro_col_neon, +// &vpx_int_pro_col_c))); INSTANTIATE_TEST_SUITE_P(NEON, SatdLowbdTest, ::testing::Values(make_tuple(16, &vpx_satd_neon), diff --git a/vpx_dsp/arm/avg_neon.c b/vpx_dsp/arm/avg_neon.c index 22164242c5..0cb102fdf6 100644 --- a/vpx_dsp/arm/avg_neon.c +++ b/vpx_dsp/arm/avg_neon.c @@ -67,71 +67,71 @@ int vpx_satd_neon(const tran_low_t *coeff, int length) { return horizontal_add_int32x4(vaddq_s32(sum_s32[0], sum_s32[1])); } -void vpx_int_pro_row_neon(int16_t hbuf[16], uint8_t const *ref, - const int ref_stride, const int height) { - int i; - uint8x16_t r0, r1, r2, r3; - uint16x8_t sum_lo[2], sum_hi[2]; - uint16x8_t tmp_lo[2], tmp_hi[2]; - int16x8_t avg_lo, avg_hi; - - const int norm_factor = (height >> 5) + 3; - const int16x8_t neg_norm_factor = vdupq_n_s16(-norm_factor); - - assert(height >= 4 && height % 4 == 0); - - r0 = vld1q_u8(ref + 0 * ref_stride); - r1 = vld1q_u8(ref + 1 * ref_stride); - r2 = vld1q_u8(ref + 2 * ref_stride); - r3 = vld1q_u8(ref + 3 * ref_stride); - - sum_lo[0] = vaddl_u8(vget_low_u8(r0), vget_low_u8(r1)); - sum_hi[0] = vaddl_u8(vget_high_u8(r0), vget_high_u8(r1)); - sum_lo[1] = vaddl_u8(vget_low_u8(r2), vget_low_u8(r3)); - sum_hi[1] = vaddl_u8(vget_high_u8(r2), vget_high_u8(r3)); - - for (i = 4; i < height; i += 4) { - r0 = vld1q_u8(ref + 0 * ref_stride); - r1 = vld1q_u8(ref + 1 * ref_stride); - r2 = vld1q_u8(ref + 2 * ref_stride); - r3 = vld1q_u8(ref + 3 * ref_stride); - - tmp_lo[0] = vaddl_u8(vget_low_u8(r0), vget_low_u8(r1)); - tmp_hi[0] = vaddl_u8(vget_high_u8(r0), vget_high_u8(r1)); - tmp_lo[1] = vaddl_u8(vget_low_u8(r2), vget_low_u8(r3)); - tmp_hi[1] = vaddl_u8(vget_high_u8(r2), vget_high_u8(r3)); - - sum_lo[0] = vaddq_u16(sum_lo[0], tmp_lo[0]); - sum_hi[0] = vaddq_u16(sum_hi[0], tmp_hi[0]); - sum_lo[1] = vaddq_u16(sum_lo[1], tmp_lo[1]); - sum_hi[1] = vaddq_u16(sum_hi[1], tmp_hi[1]); - - ref += 4 * ref_stride; - } - - sum_lo[0] = vaddq_u16(sum_lo[0], sum_lo[1]); - sum_hi[0] = vaddq_u16(sum_hi[0], sum_hi[1]); - - avg_lo = vshlq_s16(vreinterpretq_s16_u16(sum_lo[0]), neg_norm_factor); - avg_hi = vshlq_s16(vreinterpretq_s16_u16(sum_hi[0]), neg_norm_factor); - - vst1q_s16(hbuf, avg_lo); - vst1q_s16(hbuf + 8, avg_hi); -} - -int16_t vpx_int_pro_col_neon(uint8_t const *ref, const int width) { - uint16x8_t sum; - int i; - - assert(width >= 16 && width % 16 == 0); - - sum = vpaddlq_u8(vld1q_u8(ref)); - for (i = 16; i < width; i += 16) { - sum = vpadalq_u8(sum, vld1q_u8(ref + i)); - } - - return (int16_t)horizontal_add_uint16x8(sum); -} +// void vpx_int_pro_row_neon(int16_t hbuf[16], uint8_t const *ref, +// const int ref_stride, const int height) { +// int i; +// uint8x16_t r0, r1, r2, r3; +// uint16x8_t sum_lo[2], sum_hi[2]; +// uint16x8_t tmp_lo[2], tmp_hi[2]; +// int16x8_t avg_lo, avg_hi; +// +// const int norm_factor = (height >> 5) + 3; +// const int16x8_t neg_norm_factor = vdupq_n_s16(-norm_factor); +// +// assert(height >= 4 && height % 4 == 0); +// +// r0 = vld1q_u8(ref + 0 * ref_stride); +// r1 = vld1q_u8(ref + 1 * ref_stride); +// r2 = vld1q_u8(ref + 2 * ref_stride); +// r3 = vld1q_u8(ref + 3 * ref_stride); +// +// sum_lo[0] = vaddl_u8(vget_low_u8(r0), vget_low_u8(r1)); +// sum_hi[0] = vaddl_u8(vget_high_u8(r0), vget_high_u8(r1)); +// sum_lo[1] = vaddl_u8(vget_low_u8(r2), vget_low_u8(r3)); +// sum_hi[1] = vaddl_u8(vget_high_u8(r2), vget_high_u8(r3)); +// +// for (i = 4; i < height; i += 4) { +// r0 = vld1q_u8(ref + 0 * ref_stride); +// r1 = vld1q_u8(ref + 1 * ref_stride); +// r2 = vld1q_u8(ref + 2 * ref_stride); +// r3 = vld1q_u8(ref + 3 * ref_stride); +// +// tmp_lo[0] = vaddl_u8(vget_low_u8(r0), vget_low_u8(r1)); +// tmp_hi[0] = vaddl_u8(vget_high_u8(r0), vget_high_u8(r1)); +// tmp_lo[1] = vaddl_u8(vget_low_u8(r2), vget_low_u8(r3)); +// tmp_hi[1] = vaddl_u8(vget_high_u8(r2), vget_high_u8(r3)); +// +// sum_lo[0] = vaddq_u16(sum_lo[0], tmp_lo[0]); +// sum_hi[0] = vaddq_u16(sum_hi[0], tmp_hi[0]); +// sum_lo[1] = vaddq_u16(sum_lo[1], tmp_lo[1]); +// sum_hi[1] = vaddq_u16(sum_hi[1], tmp_hi[1]); +// +// ref += 4 * ref_stride; +// } +// +// sum_lo[0] = vaddq_u16(sum_lo[0], sum_lo[1]); +// sum_hi[0] = vaddq_u16(sum_hi[0], sum_hi[1]); +// +// avg_lo = vshlq_s16(vreinterpretq_s16_u16(sum_lo[0]), neg_norm_factor); +// avg_hi = vshlq_s16(vreinterpretq_s16_u16(sum_hi[0]), neg_norm_factor); +// +// vst1q_s16(hbuf, avg_lo); +// vst1q_s16(hbuf + 8, avg_hi); +// } + +// int16_t vpx_int_pro_col_neon(uint8_t const *ref, const int width) { +// uint16x8_t sum; +// int i; +// +// assert(width >= 16 && width % 16 == 0); +// +// sum = vpaddlq_u8(vld1q_u8(ref)); +// for (i = 16; i < width; i += 16) { +// sum = vpadalq_u8(sum, vld1q_u8(ref + i)); +// } +// +// return (int16_t)horizontal_add_uint16x8(sum); +// } // ref, src = [0, 510] - max diff = 16-bits // bwl = {2, 3, 4}, width = {16, 32, 64} diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 0f577398cf..798bd93890 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -876,11 +876,12 @@ () specialize qw/vpx_satd avx2 sse2 neon msa/; } + # Disabled neon optimization since it caused mismatch. See details in: + # https://bugs.chromium.org/p/webm/issues/detail?id=1809 add_proto qw/void vpx_int_pro_row/, "int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height"; - specialize qw/vpx_int_pro_row sse2 neon msa/; - + specialize qw/vpx_int_pro_row sse2 msa/; add_proto qw/int16_t vpx_int_pro_col/, "const uint8_t *ref, const int width"; - specialize qw/vpx_int_pro_col sse2 neon msa/; + specialize qw/vpx_int_pro_col sse2 msa/; add_proto qw/int vpx_vector_var/, "const int16_t *ref, const int16_t *src, const int bwl"; specialize qw/vpx_vector_var neon sse2 msa/; From 685715b698e635f1152646fa48711f418a4082f7 Mon Sep 17 00:00:00 2001 From: Yunqing Wang Date: Mon, 7 Aug 2023 14:54:30 -0700 Subject: [PATCH 0862/1000] Enable arm test in c vs SIMD bit-exactness test Arm SIMD testing was enabled in c vs SIMD bit-exactness test after arm SIMD mismatch was resolved. BUG=webm:1800 Change-Id: Id60127313a0955f4a5c8468281fd5a441668fddb --- test/vp9_c_vs_simd_encode.sh | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) mode change 100644 => 100755 test/vp9_c_vs_simd_encode.sh diff --git a/test/vp9_c_vs_simd_encode.sh b/test/vp9_c_vs_simd_encode.sh old mode 100644 new mode 100755 index 76df049d9a..e3d3624ed4 --- a/test/vp9_c_vs_simd_encode.sh +++ b/test/vp9_c_vs_simd_encode.sh @@ -266,6 +266,11 @@ vp9_enc_test() { return 1 fi + # Enable armv8 test for real-time only + if [ "${preset}" = "good" ] && [ "${target}" = "armv8-linux-gcc" ]; then + continue + fi + for cpu in $(seq 0 $max_cpu_used); do for clip in ${TEST_CLIPS}; do for bitrate in ${TEST_BITRATES}; do @@ -388,16 +393,14 @@ vp9_c_vs_simd_enc_test () { fi fi - ##TODO(BUG=webm:1809): Enable testing for ARM after issues with NEON intrinsic - # are resolved. # Test ARM - # echo "vp9_test_arm: Started." - # vp9_test_arm - # if [ $? -eq 1 ]; then - # echo "vp9 test for arm: Done, test failed." - # else - # echo "vp9 test for arm: Done, all tests passed." - # fi + echo "vp9_test_arm: Started." + vp9_test_arm + if [ $? -eq 1 ]; then + echo "vp9 test for arm: Done, test failed." + else + echo "vp9 test for arm: Done, all tests passed." + fi } # Setup a trap function to clean up build, and output files after tests complete. From c8610c266c7f1a098b304709936f3b0782d36fc6 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Thu, 10 Aug 2023 15:33:54 +0100 Subject: [PATCH 0863/1000] Fix bug and re-enable vpx_int_pro_row/col_neon Fix a bug in vpx_int_pro_row_neon (increment pointer after peeled first loop iteration) and re-enable both vpx_int_pro_row/col_neon paths. Also fix IntProRowTest to use width_ (instead of 0) as the src_stride for the input data block. The test's use of 0 for src_stride is the reason the tests passed with the buggy Neon implementation noted in the listed bugs. (The old buggy Neon implementation fails the adjusted unit tests.) BUG=webm:1800 BUG=webm:1809 Change-Id: I1f4572ee155653a7596fe2c10b5938ea7a3f63ae --- test/avg_test.cc | 37 +++++----- vpx_dsp/arm/avg_neon.c | 132 ++++++++++++++++++----------------- vpx_dsp/vpx_dsp_rtcd_defs.pl | 6 +- 3 files changed, 85 insertions(+), 90 deletions(-) diff --git a/test/avg_test.cc b/test/avg_test.cc index b885c4de4f..ede9c0ba8c 100644 --- a/test/avg_test.cc +++ b/test/avg_test.cc @@ -190,8 +190,9 @@ class IntProRowTest : public AverageTestBase, } void RunComparison() { - ASM_REGISTER_STATE_CHECK(c_func_(hbuf_c_, source_data_, 0, height_)); - ASM_REGISTER_STATE_CHECK(asm_func_(hbuf_asm_, source_data_, 0, height_)); + ASM_REGISTER_STATE_CHECK(c_func_(hbuf_c_, source_data_, width_, height_)); + ASM_REGISTER_STATE_CHECK( + asm_func_(hbuf_asm_, source_data_, width_, height_)); EXPECT_EQ(0, memcmp(hbuf_c_, hbuf_asm_, sizeof(*hbuf_c_) * 16)) << "Output mismatch"; } @@ -681,25 +682,19 @@ INSTANTIATE_TEST_SUITE_P( make_tuple(16, 16, 5, 4, &vpx_avg_4x4_neon), make_tuple(32, 32, 15, 4, &vpx_avg_4x4_neon))); -// Disabled neon optimization since it caused mismatch. See details in: -// https://bugs.chromium.org/p/webm/issues/detail?id=1809 -// INSTANTIATE_TEST_SUITE_P( -// NEON, IntProRowTest, -// ::testing::Values(make_tuple(16, &vpx_int_pro_row_neon, -// &vpx_int_pro_row_c), -// make_tuple(32, &vpx_int_pro_row_neon, -// &vpx_int_pro_row_c), make_tuple(64, -// &vpx_int_pro_row_neon, -// &vpx_int_pro_row_c))); -// -// INSTANTIATE_TEST_SUITE_P( -// NEON, IntProColTest, -// ::testing::Values(make_tuple(16, &vpx_int_pro_col_neon, -// &vpx_int_pro_col_c), -// make_tuple(32, &vpx_int_pro_col_neon, -// &vpx_int_pro_col_c), make_tuple(64, -// &vpx_int_pro_col_neon, -// &vpx_int_pro_col_c))); +INSTANTIATE_TEST_SUITE_P( + NEON, IntProRowTest, + ::testing::Values(make_tuple(16, &vpx_int_pro_row_neon, &vpx_int_pro_row_c), + make_tuple(32, &vpx_int_pro_row_neon, &vpx_int_pro_row_c), + make_tuple(64, &vpx_int_pro_row_neon, + &vpx_int_pro_row_c))); + +INSTANTIATE_TEST_SUITE_P( + NEON, IntProColTest, + ::testing::Values(make_tuple(16, &vpx_int_pro_col_neon, &vpx_int_pro_col_c), + make_tuple(32, &vpx_int_pro_col_neon, &vpx_int_pro_col_c), + make_tuple(64, &vpx_int_pro_col_neon, + &vpx_int_pro_col_c))); INSTANTIATE_TEST_SUITE_P(NEON, SatdLowbdTest, ::testing::Values(make_tuple(16, &vpx_satd_neon), diff --git a/vpx_dsp/arm/avg_neon.c b/vpx_dsp/arm/avg_neon.c index 0cb102fdf6..1b17a326b4 100644 --- a/vpx_dsp/arm/avg_neon.c +++ b/vpx_dsp/arm/avg_neon.c @@ -67,71 +67,73 @@ int vpx_satd_neon(const tran_low_t *coeff, int length) { return horizontal_add_int32x4(vaddq_s32(sum_s32[0], sum_s32[1])); } -// void vpx_int_pro_row_neon(int16_t hbuf[16], uint8_t const *ref, -// const int ref_stride, const int height) { -// int i; -// uint8x16_t r0, r1, r2, r3; -// uint16x8_t sum_lo[2], sum_hi[2]; -// uint16x8_t tmp_lo[2], tmp_hi[2]; -// int16x8_t avg_lo, avg_hi; -// -// const int norm_factor = (height >> 5) + 3; -// const int16x8_t neg_norm_factor = vdupq_n_s16(-norm_factor); -// -// assert(height >= 4 && height % 4 == 0); -// -// r0 = vld1q_u8(ref + 0 * ref_stride); -// r1 = vld1q_u8(ref + 1 * ref_stride); -// r2 = vld1q_u8(ref + 2 * ref_stride); -// r3 = vld1q_u8(ref + 3 * ref_stride); -// -// sum_lo[0] = vaddl_u8(vget_low_u8(r0), vget_low_u8(r1)); -// sum_hi[0] = vaddl_u8(vget_high_u8(r0), vget_high_u8(r1)); -// sum_lo[1] = vaddl_u8(vget_low_u8(r2), vget_low_u8(r3)); -// sum_hi[1] = vaddl_u8(vget_high_u8(r2), vget_high_u8(r3)); -// -// for (i = 4; i < height; i += 4) { -// r0 = vld1q_u8(ref + 0 * ref_stride); -// r1 = vld1q_u8(ref + 1 * ref_stride); -// r2 = vld1q_u8(ref + 2 * ref_stride); -// r3 = vld1q_u8(ref + 3 * ref_stride); -// -// tmp_lo[0] = vaddl_u8(vget_low_u8(r0), vget_low_u8(r1)); -// tmp_hi[0] = vaddl_u8(vget_high_u8(r0), vget_high_u8(r1)); -// tmp_lo[1] = vaddl_u8(vget_low_u8(r2), vget_low_u8(r3)); -// tmp_hi[1] = vaddl_u8(vget_high_u8(r2), vget_high_u8(r3)); -// -// sum_lo[0] = vaddq_u16(sum_lo[0], tmp_lo[0]); -// sum_hi[0] = vaddq_u16(sum_hi[0], tmp_hi[0]); -// sum_lo[1] = vaddq_u16(sum_lo[1], tmp_lo[1]); -// sum_hi[1] = vaddq_u16(sum_hi[1], tmp_hi[1]); -// -// ref += 4 * ref_stride; -// } -// -// sum_lo[0] = vaddq_u16(sum_lo[0], sum_lo[1]); -// sum_hi[0] = vaddq_u16(sum_hi[0], sum_hi[1]); -// -// avg_lo = vshlq_s16(vreinterpretq_s16_u16(sum_lo[0]), neg_norm_factor); -// avg_hi = vshlq_s16(vreinterpretq_s16_u16(sum_hi[0]), neg_norm_factor); -// -// vst1q_s16(hbuf, avg_lo); -// vst1q_s16(hbuf + 8, avg_hi); -// } - -// int16_t vpx_int_pro_col_neon(uint8_t const *ref, const int width) { -// uint16x8_t sum; -// int i; -// -// assert(width >= 16 && width % 16 == 0); -// -// sum = vpaddlq_u8(vld1q_u8(ref)); -// for (i = 16; i < width; i += 16) { -// sum = vpadalq_u8(sum, vld1q_u8(ref + i)); -// } -// -// return (int16_t)horizontal_add_uint16x8(sum); -// } +void vpx_int_pro_row_neon(int16_t hbuf[16], uint8_t const *ref, + const int ref_stride, const int height) { + int i; + uint8x16_t r0, r1, r2, r3; + uint16x8_t sum_lo[2], sum_hi[2]; + uint16x8_t tmp_lo[2], tmp_hi[2]; + int16x8_t avg_lo, avg_hi; + + const int norm_factor = (height >> 5) + 3; + const int16x8_t neg_norm_factor = vdupq_n_s16(-norm_factor); + + assert(height >= 4 && height % 4 == 0); + + r0 = vld1q_u8(ref + 0 * ref_stride); + r1 = vld1q_u8(ref + 1 * ref_stride); + r2 = vld1q_u8(ref + 2 * ref_stride); + r3 = vld1q_u8(ref + 3 * ref_stride); + + sum_lo[0] = vaddl_u8(vget_low_u8(r0), vget_low_u8(r1)); + sum_hi[0] = vaddl_u8(vget_high_u8(r0), vget_high_u8(r1)); + sum_lo[1] = vaddl_u8(vget_low_u8(r2), vget_low_u8(r3)); + sum_hi[1] = vaddl_u8(vget_high_u8(r2), vget_high_u8(r3)); + + ref += 4 * ref_stride; + + for (i = 4; i < height; i += 4) { + r0 = vld1q_u8(ref + 0 * ref_stride); + r1 = vld1q_u8(ref + 1 * ref_stride); + r2 = vld1q_u8(ref + 2 * ref_stride); + r3 = vld1q_u8(ref + 3 * ref_stride); + + tmp_lo[0] = vaddl_u8(vget_low_u8(r0), vget_low_u8(r1)); + tmp_hi[0] = vaddl_u8(vget_high_u8(r0), vget_high_u8(r1)); + tmp_lo[1] = vaddl_u8(vget_low_u8(r2), vget_low_u8(r3)); + tmp_hi[1] = vaddl_u8(vget_high_u8(r2), vget_high_u8(r3)); + + sum_lo[0] = vaddq_u16(sum_lo[0], tmp_lo[0]); + sum_hi[0] = vaddq_u16(sum_hi[0], tmp_hi[0]); + sum_lo[1] = vaddq_u16(sum_lo[1], tmp_lo[1]); + sum_hi[1] = vaddq_u16(sum_hi[1], tmp_hi[1]); + + ref += 4 * ref_stride; + } + + sum_lo[0] = vaddq_u16(sum_lo[0], sum_lo[1]); + sum_hi[0] = vaddq_u16(sum_hi[0], sum_hi[1]); + + avg_lo = vshlq_s16(vreinterpretq_s16_u16(sum_lo[0]), neg_norm_factor); + avg_hi = vshlq_s16(vreinterpretq_s16_u16(sum_hi[0]), neg_norm_factor); + + vst1q_s16(hbuf, avg_lo); + vst1q_s16(hbuf + 8, avg_hi); +} + +int16_t vpx_int_pro_col_neon(uint8_t const *ref, const int width) { + uint16x8_t sum; + int i; + + assert(width >= 16 && width % 16 == 0); + + sum = vpaddlq_u8(vld1q_u8(ref)); + for (i = 16; i < width; i += 16) { + sum = vpadalq_u8(sum, vld1q_u8(ref + i)); + } + + return (int16_t)horizontal_add_uint16x8(sum); +} // ref, src = [0, 510] - max diff = 16-bits // bwl = {2, 3, 4}, width = {16, 32, 64} diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 798bd93890..8033b4a81a 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -876,12 +876,10 @@ () specialize qw/vpx_satd avx2 sse2 neon msa/; } - # Disabled neon optimization since it caused mismatch. See details in: - # https://bugs.chromium.org/p/webm/issues/detail?id=1809 add_proto qw/void vpx_int_pro_row/, "int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height"; - specialize qw/vpx_int_pro_row sse2 msa/; + specialize qw/vpx_int_pro_row neon sse2 msa/; add_proto qw/int16_t vpx_int_pro_col/, "const uint8_t *ref, const int width"; - specialize qw/vpx_int_pro_col sse2 msa/; + specialize qw/vpx_int_pro_col neon sse2 msa/; add_proto qw/int vpx_vector_var/, "const int16_t *ref, const int16_t *src, const int bwl"; specialize qw/vpx_vector_var neon sse2 msa/; From 335728c987b3164ff25c58c06d29eb49e19e21d4 Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 11 Aug 2023 13:40:07 -0700 Subject: [PATCH 0864/1000] *quantize*.c: fix visual studio warnings after: 22818907d normalize *const in rtcd fixes warnings of the form: vpx_dsp\x86\quantize_avx.c(145): warning C4028: formal parameter 2 different from declaration Change-Id: I4dc423f11ec4a9171e18bdb6be2fa8dfb65ee61a --- vp9/encoder/x86/vp9_quantize_avx2.c | 8 ++++---- vp9/encoder/x86/vp9_quantize_sse2.c | 2 +- vp9/encoder/x86/vp9_quantize_ssse3.c | 4 ++-- vpx_dsp/quantize.c | 4 ++-- vpx_dsp/x86/quantize_avx.c | 4 ++-- vpx_dsp/x86/quantize_avx2.c | 4 ++-- vpx_dsp/x86/quantize_ssse3.c | 4 ++-- 7 files changed, 15 insertions(+), 15 deletions(-) diff --git a/vp9/encoder/x86/vp9_quantize_avx2.c b/vp9/encoder/x86/vp9_quantize_avx2.c index 62af3a9212..bf44b08674 100644 --- a/vp9/encoder/x86/vp9_quantize_avx2.c +++ b/vp9/encoder/x86/vp9_quantize_avx2.c @@ -102,7 +102,7 @@ static VPX_FORCE_INLINE void quantize_fp_16( } void vp9_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const struct macroblock_plane *mb_plane, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order) { @@ -206,7 +206,7 @@ static VPX_FORCE_INLINE void quantize_fp_32x32_16( } void vp9_quantize_fp_32x32_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const struct macroblock_plane *mb_plane, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order) { @@ -326,7 +326,7 @@ static VPX_FORCE_INLINE void highbd_quantize_fp( } void vp9_highbd_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const struct macroblock_plane *mb_plane, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, @@ -391,7 +391,7 @@ static VPX_FORCE_INLINE void highbd_quantize_fp_32x32( void vp9_highbd_quantize_fp_32x32_avx2( const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const struct macroblock_plane *mb_plane, tran_low_t *qcoeff_ptr, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order) { const int step = 8; diff --git a/vp9/encoder/x86/vp9_quantize_sse2.c b/vp9/encoder/x86/vp9_quantize_sse2.c index 67f03eb310..2481eb366e 100644 --- a/vp9/encoder/x86/vp9_quantize_sse2.c +++ b/vp9/encoder/x86/vp9_quantize_sse2.c @@ -21,7 +21,7 @@ #include "vp9/encoder/vp9_block.h" void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const struct macroblock_plane *mb_plane, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order) { diff --git a/vp9/encoder/x86/vp9_quantize_ssse3.c b/vp9/encoder/x86/vp9_quantize_ssse3.c index c94c8dbb4c..98decae749 100644 --- a/vp9/encoder/x86/vp9_quantize_ssse3.c +++ b/vp9/encoder/x86/vp9_quantize_ssse3.c @@ -21,7 +21,7 @@ #include "vp9/encoder/vp9_block.h" void vp9_quantize_fp_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const struct macroblock_plane *mb_plane, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order) { @@ -120,7 +120,7 @@ void vp9_quantize_fp_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, } void vp9_quantize_fp_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const struct macroblock_plane *mb_plane, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, diff --git a/vpx_dsp/quantize.c b/vpx_dsp/quantize.c index dee12bae76..fac9136f8c 100644 --- a/vpx_dsp/quantize.c +++ b/vpx_dsp/quantize.c @@ -166,7 +166,7 @@ void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, #if CONFIG_VP9_HIGHBITDEPTH void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const struct macroblock_plane *mb_plane, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order) { @@ -214,7 +214,7 @@ void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, #endif void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, - const struct macroblock_plane *mb_plane, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order) { diff --git a/vpx_dsp/x86/quantize_avx.c b/vpx_dsp/x86/quantize_avx.c index 98bf1686cb..5ff5abc110 100644 --- a/vpx_dsp/x86/quantize_avx.c +++ b/vpx_dsp/x86/quantize_avx.c @@ -23,7 +23,7 @@ #include "vp9/encoder/vp9_block.h" void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const struct macroblock_plane *mb_plane, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order) { @@ -139,7 +139,7 @@ void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, } void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, - const struct macroblock_plane *mb_plane, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order) { diff --git a/vpx_dsp/x86/quantize_avx2.c b/vpx_dsp/x86/quantize_avx2.c index 189b083f68..d4872f6bca 100644 --- a/vpx_dsp/x86/quantize_avx2.c +++ b/vpx_dsp/x86/quantize_avx2.c @@ -156,7 +156,7 @@ static VPX_FORCE_INLINE int16_t accumulate_eob256(__m256i eob256) { } void vpx_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const struct macroblock_plane *mb_plane, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order) { @@ -253,7 +253,7 @@ static VPX_FORCE_INLINE __m256i quantize_b_32x32_16( } void vpx_quantize_b_32x32_avx2(const tran_low_t *coeff_ptr, - const struct macroblock_plane *mb_plane, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order) { diff --git a/vpx_dsp/x86/quantize_ssse3.c b/vpx_dsp/x86/quantize_ssse3.c index 7f085566dd..2c6d851a16 100644 --- a/vpx_dsp/x86/quantize_ssse3.c +++ b/vpx_dsp/x86/quantize_ssse3.c @@ -20,7 +20,7 @@ #include "vp9/encoder/vp9_block.h" void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const struct macroblock_plane *mb_plane, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order) { @@ -106,7 +106,7 @@ void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, } void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, - const struct macroblock_plane *mb_plane, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order) { From 8d2c357eab390923657113ead4567f70a026daf0 Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 11 Aug 2023 15:48:19 -0700 Subject: [PATCH 0865/1000] fdct4x4_neon: fix compile w/cl Use an array for constant initialization rather than array syntax which assumes the underlying type is a vector. Fixes compile error with cl targeting Windows Arm64: vpx_dsp\arm\fdct4x4_neon.c(55,52): error C2078: too many initializers No change in assembly with gcc 12.2.0 & clang 14. Bug: b/277255390 Bug: webm:1810 Fixed: webm:1810 Change-Id: Ia30edcdbb45067dfe865b9958a5eecf1fd9ddfc8 --- vpx_dsp/arm/fdct4x4_neon.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vpx_dsp/arm/fdct4x4_neon.c b/vpx_dsp/arm/fdct4x4_neon.c index 3b9196fae9..4bc968ecba 100644 --- a/vpx_dsp/arm/fdct4x4_neon.c +++ b/vpx_dsp/arm/fdct4x4_neon.c @@ -52,7 +52,6 @@ void vpx_fdct4x4_neon(const int16_t *input, tran_low_t *final_output, void vpx_highbd_fdct4x4_neon(const int16_t *input, tran_low_t *final_output, int stride) { - static const int32x4_t const_1000 = { 1, 0, 0, 0 }; const int32x4_t const_one = vdupq_n_s32(1); // input[M * stride] * 16 @@ -64,7 +63,8 @@ void vpx_highbd_fdct4x4_neon(const int16_t *input, tran_low_t *final_output, // If the very first value != 0, then add 1. if (input[0] != 0) { - in[0] = vaddq_s32(in[0], const_1000); + static const int32_t k1000[4] = { 1, 0, 0, 0 }; + in[0] = vaddq_s32(in[0], vld1q_s32(k1000)); } vpx_highbd_fdct4x4_pass1_neon(in); From 6e2c3b9b3c529f7a6b8f3092cd39cea83f4220f0 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Tue, 15 Aug 2023 14:39:16 -0400 Subject: [PATCH 0866/1000] Add RC mode to vpx external RC interface Bug: b/295507002 Change-Id: Id2dd21482828ec64eef9abdf6a1cca83100d21ba --- vp9/vp9_cx_iface.c | 11 ++++++++++- vpx/vpx_ext_ratectrl.h | 14 ++++++++++++++ 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index cc2ae20d27..77d3fb7684 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -1949,7 +1949,7 @@ static vpx_codec_err_t ctrl_set_external_rate_control(vpx_codec_alg_priv_t *ctx, // TODO(angiebird): Check the possibility of this flag being set at pass == 1 if (oxcf->pass == 2) { const FRAME_INFO *frame_info = &cpi->frame_info; - vpx_rc_config_t ratectrl_config; + vpx_rc_config_t ratectrl_config = {}; vpx_codec_err_t codec_status; ratectrl_config.frame_width = frame_info->frame_width; @@ -1962,6 +1962,15 @@ static vpx_codec_err_t ctrl_set_external_rate_control(vpx_codec_alg_priv_t *ctx, ratectrl_config.frame_rate_num = oxcf->g_timebase.den; ratectrl_config.frame_rate_den = oxcf->g_timebase.num; + if (oxcf->rc_mode == VPX_VBR) { + ratectrl_config.rc_mode = VPX_RC_VBR; + ratectrl_config.overshoot_percent = oxcf->over_shoot_pct; + ratectrl_config.undershoot_percent = oxcf->under_shoot_pct; + } else if (oxcf->rc_mode == VPX_Q) { + ratectrl_config.rc_mode = VPX_RC_QMODE; + } else { + return VPX_CODEC_INVALID_PARAM; + } codec_status = vp9_extrc_create(funcs, ratectrl_config, ext_ratectrl); if (codec_status != VPX_CODEC_OK) { return codec_status; diff --git a/vpx/vpx_ext_ratectrl.h b/vpx/vpx_ext_ratectrl.h index b93df11cd5..d755cff47b 100644 --- a/vpx/vpx_ext_ratectrl.h +++ b/vpx/vpx_ext_ratectrl.h @@ -48,6 +48,13 @@ typedef enum vpx_rc_type { VPX_RC_GOP_QP_RDMULT = VPX_RC_QP | VPX_RC_GOP | VPX_RC_RDMULT } vpx_rc_type_t; +/*!\brief The rate control mode for the external rate control model. + */ +typedef enum vpx_ext_rc_mode { + VPX_RC_QMODE = 0, + VPX_RC_VBR = 1, +} vpx_ext_rc_mode_t; + /*!\brief Abstract rate control model handler * * The encoder will receive the model handler from create_model() defined in @@ -305,6 +312,13 @@ typedef struct vpx_rc_config { int target_bitrate_kbps; int frame_rate_num; /**< numerator of frame rate */ int frame_rate_den; /**< denominator of frame rate */ + /*! + * The following fields are only for external rate control models that support + * different rate control modes. + */ + vpx_ext_rc_mode_t rc_mode; /**< Q mode or VBR mode */ + int overshoot_percent; /**< for VBR mode only */ + int undershoot_percent; /**< for VBR mode only */ } vpx_rc_config_t; /*!\brief Information passed to the external rate control model to From 58eed626d8c36f595de416fa4defa189eec8f831 Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 16 Aug 2023 11:15:55 -0700 Subject: [PATCH 0867/1000] tools_common,die_codec(): output to stderr This function is used to report a failure, messages of this type should go to stderr. Change-Id: I0dee246dddc886a3278b247a770a356446658864 --- tools_common.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools_common.c b/tools_common.c index 0de15558dd..0fcab2cf29 100644 --- a/tools_common.c +++ b/tools_common.c @@ -77,8 +77,8 @@ void warn(const char *fmt, ...) { LOG_ERROR("Warning"); } void die_codec(vpx_codec_ctx_t *ctx, const char *s) { const char *detail = vpx_codec_error_detail(ctx); - printf("%s: %s\n", s, vpx_codec_error(ctx)); - if (detail) printf(" %s\n", detail); + fprintf(stderr, "%s: %s\n", s, vpx_codec_error(ctx)); + if (detail) fprintf(stderr, " %s\n", detail); exit(EXIT_FAILURE); } From 4b1ac3c23fba1d0eb38f0f2153017f4d13277039 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Wed, 16 Aug 2023 16:05:01 -0400 Subject: [PATCH 0868/1000] Extend ext RC mode to have CQ mode Also do not return error if it's not specified. Bug: b/295507002 Change-Id: Ib1f83551272bdde1bceff03554abc4c02d95ca09 --- vp9/vp9_cx_iface.c | 5 +++-- vpx/vpx_ext_ratectrl.h | 1 + 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index 77d3fb7684..a9f7431ba6 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -1968,9 +1968,10 @@ static vpx_codec_err_t ctrl_set_external_rate_control(vpx_codec_alg_priv_t *ctx, ratectrl_config.undershoot_percent = oxcf->under_shoot_pct; } else if (oxcf->rc_mode == VPX_Q) { ratectrl_config.rc_mode = VPX_RC_QMODE; - } else { - return VPX_CODEC_INVALID_PARAM; + } else if (oxcf->rc_mode == VPX_CQ) { + ratectrl_config.rc_mode = VPX_RC_CQ; } + codec_status = vp9_extrc_create(funcs, ratectrl_config, ext_ratectrl); if (codec_status != VPX_CODEC_OK) { return codec_status; diff --git a/vpx/vpx_ext_ratectrl.h b/vpx/vpx_ext_ratectrl.h index d755cff47b..ef96be6fff 100644 --- a/vpx/vpx_ext_ratectrl.h +++ b/vpx/vpx_ext_ratectrl.h @@ -53,6 +53,7 @@ typedef enum vpx_rc_type { typedef enum vpx_ext_rc_mode { VPX_RC_QMODE = 0, VPX_RC_VBR = 1, + VPX_RC_CQ = 2, } vpx_ext_rc_mode_t; /*!\brief Abstract rate control model handler From 87a467f35648581109394705032216915d9ed36e Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Thu, 17 Aug 2023 12:34:48 -0400 Subject: [PATCH 0869/1000] vp9 ext rc: Assign over/undershoot % for CQ mode Bug: b/295507002 Change-Id: Ie5b4dabc620f6d17c4039f186e0709d8e9479b47 --- vp9/vp9_cx_iface.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index a9f7431ba6..4c3169f86e 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -1970,6 +1970,8 @@ static vpx_codec_err_t ctrl_set_external_rate_control(vpx_codec_alg_priv_t *ctx, ratectrl_config.rc_mode = VPX_RC_QMODE; } else if (oxcf->rc_mode == VPX_CQ) { ratectrl_config.rc_mode = VPX_RC_CQ; + ratectrl_config.overshoot_percent = oxcf->over_shoot_pct; + ratectrl_config.undershoot_percent = oxcf->under_shoot_pct; } codec_status = vp9_extrc_create(funcs, ratectrl_config, ext_ratectrl); From 401d8f36beb3fa395457d01640407ab194d457d6 Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 18 Aug 2023 09:10:00 -0700 Subject: [PATCH 0870/1000] vp9_cx_iface: fix code compatibility Remove '= {}' (C23 [1]) and use memset to clear a vpx_rc_config_t instance. after: 6e2c3b9b3 Add RC mode to vpx external RC interface Fixes compile with -pedantic and Microsoft's cl compiler. [1] https://en.cppreference.com/w/c/language/initialization Change-Id: I2019cdf0c42103cfc80b1e58c68b7596e497007f --- vp9/vp9_cx_iface.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index 4c3169f86e..f5f246406e 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -1949,8 +1949,9 @@ static vpx_codec_err_t ctrl_set_external_rate_control(vpx_codec_alg_priv_t *ctx, // TODO(angiebird): Check the possibility of this flag being set at pass == 1 if (oxcf->pass == 2) { const FRAME_INFO *frame_info = &cpi->frame_info; - vpx_rc_config_t ratectrl_config = {}; + vpx_rc_config_t ratectrl_config; vpx_codec_err_t codec_status; + memset(&ratectrl_config, 0, sizeof(ratectrl_config)); ratectrl_config.frame_width = frame_info->frame_width; ratectrl_config.frame_height = frame_info->frame_height; From 80b1b5a7e99cd31d2962ba5b12d3d03d58ffe2ad Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 18 Aug 2023 12:36:45 -0700 Subject: [PATCH 0871/1000] vp8,ratectrl.c: fix integer overflow in calc_iframe_target_size(): vp8/encoder/ratectrl.c:349:31: runtime error: signed integer overflow: 38 * 343597280 cannot be represented in type 'int' Bug: chromium:1473473 Change-Id: Ie8f7b147efb27c92314df09837b66f7d97046883 --- vp8/encoder/ratectrl.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vp8/encoder/ratectrl.c b/vp8/encoder/ratectrl.c index 49ab4aa238..6f14322fdc 100644 --- a/vp8/encoder/ratectrl.c +++ b/vp8/encoder/ratectrl.c @@ -346,7 +346,8 @@ static void calc_iframe_target_size(VP8_COMP *cpi) { /* Minimal target size is |2* per_frame_bandwidth|. */ if (kf_boost < 16) kf_boost = 16; - target = ((16 + kf_boost) * cpi->per_frame_bandwidth) >> 4; + target = ((uint64_t)(16 + kf_boost) * cpi->per_frame_bandwidth) >> 4; + target = VPXMIN(INT_MAX, target); } if (cpi->oxcf.rc_max_intra_bitrate_pct) { From c7aa75ac5593cafe98073444cb29ffdc1ecba3e3 Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 18 Aug 2023 14:04:33 -0700 Subject: [PATCH 0872/1000] vp9_calc_pframe_target_size_one_pass_cbr: fix int overflow vp9/encoder/vp9_ratectrl.c:2171:23: runtime error: signed integer overflow: 103079280 * -22 cannot be represented in type 'int' Bug: chromium:1473268 Change-Id: Ic1de7d48e74d94c2a992e53ec4382b5b44dba7af --- vp9/encoder/vp9_ratectrl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index c32745b4f8..16c47525fb 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -2168,12 +2168,12 @@ int vp9_calc_pframe_target_size_one_pass_cbr(const VP9_COMP *cpi) { if (diff > 0) { // Lower the target bandwidth for this frame. const int pct_low = (int)VPXMIN(diff / one_pct_bits, oxcf->under_shoot_pct); - target -= (target * pct_low) / 200; + target -= (int)(((int64_t)target * pct_low) / 200); } else if (diff < 0) { // Increase the target bandwidth for this frame. const int pct_high = (int)VPXMIN(-diff / one_pct_bits, oxcf->over_shoot_pct); - target += (target * pct_high) / 200; + target += (int)(((int64_t)target * pct_high) / 200); } if (oxcf->rc_max_inter_bitrate_pct) { const int max_rate = From ade6905e3919322d6f324e8f66a85fc53029c545 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Thu, 17 Aug 2023 14:56:35 -0400 Subject: [PATCH 0873/1000] vp9 ext rc: copy under/overshoot% for all RC modes Bug: b/295507002 Change-Id: Ie4b302b82fa2d83e0be450cea60c59907b37f954 --- vp9/vp9_cx_iface.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index f5f246406e..dfc02deb2f 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -1962,17 +1962,15 @@ static vpx_codec_err_t ctrl_set_external_rate_control(vpx_codec_alg_priv_t *ctx, ratectrl_config.target_bitrate_kbps = (int)(oxcf->target_bandwidth / 1000); ratectrl_config.frame_rate_num = oxcf->g_timebase.den; ratectrl_config.frame_rate_den = oxcf->g_timebase.num; + ratectrl_config.overshoot_percent = oxcf->over_shoot_pct; + ratectrl_config.undershoot_percent = oxcf->under_shoot_pct; if (oxcf->rc_mode == VPX_VBR) { ratectrl_config.rc_mode = VPX_RC_VBR; - ratectrl_config.overshoot_percent = oxcf->over_shoot_pct; - ratectrl_config.undershoot_percent = oxcf->under_shoot_pct; } else if (oxcf->rc_mode == VPX_Q) { ratectrl_config.rc_mode = VPX_RC_QMODE; } else if (oxcf->rc_mode == VPX_CQ) { ratectrl_config.rc_mode = VPX_RC_CQ; - ratectrl_config.overshoot_percent = oxcf->over_shoot_pct; - ratectrl_config.undershoot_percent = oxcf->under_shoot_pct; } codec_status = vp9_extrc_create(funcs, ratectrl_config, ext_ratectrl); From e052ada7801c458f9fc0c2818f1be814f86e94a4 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Fri, 25 Aug 2023 10:56:23 -0400 Subject: [PATCH 0874/1000] Do not call ext rc functions when they're null Change-Id: Ie78afadd4ad5845e42bd4d5412703369f8d5e0f5 --- vp9/encoder/vp9_encoder.c | 9 ++++++--- vp9/encoder/vp9_firstpass.c | 6 ++++-- vp9/encoder/vp9_tpl_model.c | 3 ++- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index aaf42a2a3f..869d557dd3 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -4563,7 +4563,8 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest } #endif // CONFIG_RATE_CTRL if (cpi->ext_ratectrl.ready && !ext_rc_recode && - (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_QP) != 0) { + (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_QP) != 0 && + cpi->ext_ratectrl.funcs.get_encodeframe_decision != NULL) { vpx_codec_err_t codec_status; const GF_GROUP *gf_group = &cpi->twopass.gf_group; vpx_rc_encodeframe_decision_t encode_frame_decision; @@ -5575,7 +5576,8 @@ static void encode_frame_to_data_rate( // Backup to ensure consistency between recodes save_encode_params(cpi); if (cpi->ext_ratectrl.ready && - (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_RDMULT) != 0) { + (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_RDMULT) != 0 && + cpi->ext_ratectrl.funcs.get_frame_rdmult != NULL) { vpx_codec_err_t codec_status; const GF_GROUP *gf_group = &cpi->twopass.gf_group; FRAME_UPDATE_TYPE update_type = gf_group->update_type[gf_group->index]; @@ -5693,7 +5695,8 @@ static void encode_frame_to_data_rate( end_timing(cpi, vp9_pack_bitstream_time); #endif - if (cpi->ext_ratectrl.ready) { + if (cpi->ext_ratectrl.ready && + cpi->ext_ratectrl.funcs.update_encodeframe_result != NULL) { const RefCntBuffer *coded_frame_buf = get_ref_cnt_buffer(cm, cm->new_fb_idx); vpx_codec_err_t codec_status = vp9_extrc_update_encodeframe_result( diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index bd203f1e21..1e6f6f7b3b 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -2769,7 +2769,8 @@ static void define_gf_group(VP9_COMP *cpi, int gf_start_show_idx) { // are overwritten. Specifically, |gop_coding_frames| and |use_alt_ref| // will be overwritten. if (cpi->ext_ratectrl.ready && - (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_GOP) != 0) { + (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_GOP) != 0 && + cpi->ext_ratectrl.funcs.get_gop_decision != NULL) { vpx_codec_err_t codec_status; vpx_rc_gop_decision_t gop_decision; vpx_rc_gop_info_t gop_info; @@ -3506,7 +3507,8 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { FIRSTPASS_STATS this_frame; const int show_idx = cm->current_video_frame; - if (cpi->common.current_frame_coding_index == 0) { + if (cpi->common.current_frame_coding_index == 0 && + cpi->ext_ratectrl.funcs.send_firstpass_stats != NULL) { const vpx_codec_err_t codec_status = vp9_extrc_send_firstpass_stats( &cpi->ext_ratectrl, &cpi->twopass.first_pass_info); if (codec_status != VPX_CODEC_OK) { diff --git a/vp9/encoder/vp9_tpl_model.c b/vp9/encoder/vp9_tpl_model.c index 909b05292b..02318070c2 100644 --- a/vp9/encoder/vp9_tpl_model.c +++ b/vp9/encoder/vp9_tpl_model.c @@ -1509,7 +1509,8 @@ void vp9_setup_tpl_stats(VP9_COMP *cpi) { // Qmode. trim_tpl_stats(&cpi->common.error, &cpi->tpl_gop_stats, extended_frame_count); - if (cpi->ext_ratectrl.ready) { + if (cpi->ext_ratectrl.ready && + cpi->ext_ratectrl.funcs.send_tpl_gop_stats != NULL) { const vpx_codec_err_t codec_status = vp9_extrc_send_tpl_stats(&cpi->ext_ratectrl, &cpi->tpl_gop_stats); if (codec_status != VPX_CODEC_OK) { From 6da1bd01d64d3d246b633bf25c766dfe751345b7 Mon Sep 17 00:00:00 2001 From: Marco Paniconi Date: Mon, 28 Aug 2023 12:09:30 -0700 Subject: [PATCH 0875/1000] vp9 svc: fix interger overflow Overflow was happening in two places: one in set_encoder_config(), where the input layer_target_bitrates are converted from kbps to bps, the other in vp9_calc_pframe_target_size_one_pass_vbr(), where target is scaled by kf_ratio. vp9_ratectrl.c:2039: runtime error: signed integer overflow: -137438983 * 25 cannot be represented in type 'int' Bug: chromium:1475943 Change-Id: I1ab0980862548c8827fae461df9a7a74425209ff --- vp9/encoder/vp9_ratectrl.c | 6 +++++- vp9/vp9_cx_iface.c | 8 ++++++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index 16c47525fb..fe7414687a 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -2036,7 +2036,11 @@ int vp9_calc_pframe_target_size_one_pass_vbr(const VP9_COMP *cpi) { int vp9_calc_iframe_target_size_one_pass_vbr(const VP9_COMP *cpi) { static const int kf_ratio = 25; const RATE_CONTROL *rc = &cpi->rc; - const int target = rc->avg_frame_bandwidth * kf_ratio; + int target = rc->avg_frame_bandwidth; + if (target > INT_MAX / kf_ratio) + target = INT_MAX; + else + target = rc->avg_frame_bandwidth * kf_ratio; return vp9_rc_clamp_iframe_target_size(cpi, target); } diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index dfc02deb2f..06c0ac1bfb 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -639,8 +639,12 @@ static vpx_codec_err_t set_encoder_config( for (sl = 0; sl < oxcf->ss_number_layers; ++sl) { for (tl = 0; tl < oxcf->ts_number_layers; ++tl) { - oxcf->layer_target_bitrate[sl * oxcf->ts_number_layers + tl] = - 1000 * cfg->layer_target_bitrate[sl * oxcf->ts_number_layers + tl]; + const int layer = sl * oxcf->ts_number_layers + tl; + if (cfg->layer_target_bitrate[layer] > INT_MAX / 1000) + oxcf->layer_target_bitrate[layer] = INT_MAX; + else + oxcf->layer_target_bitrate[layer] = + 1000 * cfg->layer_target_bitrate[layer]; } } if (oxcf->ss_number_layers == 1 && oxcf->pass != 0) { From 7ee16bc1786674cfbd8982e43f4ee6932da464d1 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Thu, 31 Aug 2023 14:21:01 +0100 Subject: [PATCH 0876/1000] Simplify Neon MSE helper function params/return values Simplify the parameters and return values of the Neon MSE helper functions for both standard and high bitdepth - avoiding unused return values. Change-Id: I6f9208f9ce890fbe58346d9c7d9d701f28f2f90f --- vpx_dsp/arm/highbd_variance_neon.c | 86 +++++++++++++----------------- vpx_dsp/arm/variance_neon.c | 36 +++++-------- 2 files changed, 52 insertions(+), 70 deletions(-) diff --git a/vpx_dsp/arm/highbd_variance_neon.c b/vpx_dsp/arm/highbd_variance_neon.c index 75fde676a0..e361f6f6f1 100644 --- a/vpx_dsp/arm/highbd_variance_neon.c +++ b/vpx_dsp/arm/highbd_variance_neon.c @@ -357,8 +357,7 @@ HIGHBD_GET_VAR(16) static INLINE uint32_t highbd_mse_wxh_neon(const uint16_t *src_ptr, int src_stride, const uint16_t *ref_ptr, - int ref_stride, int w, int h, - unsigned int *sse) { + int ref_stride, int w, int h) { uint32x4_t sse_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; int i = h; @@ -382,8 +381,7 @@ static INLINE uint32_t highbd_mse_wxh_neon(const uint16_t *src_ptr, ref_ptr += ref_stride; } while (--i != 0); - *sse = horizontal_add_uint32x4(vaddq_u32(sse_u32[0], sse_u32[1])); - return *sse; + return horizontal_add_uint32x4(vaddq_u32(sse_u32[0], sse_u32[1])); } #if defined(__ARM_FEATURE_DOTPROD) @@ -391,8 +389,7 @@ static INLINE uint32_t highbd_mse_wxh_neon(const uint16_t *src_ptr, static INLINE uint32_t highbd_mse8_8xh_neon(const uint16_t *src_ptr, int src_stride, const uint16_t *ref_ptr, - int ref_stride, int h, - unsigned int *sse) { + int ref_stride, int h) { uint32x4_t sse_u32 = vdupq_n_u32(0); int i = h / 2; @@ -416,15 +413,13 @@ static INLINE uint32_t highbd_mse8_8xh_neon(const uint16_t *src_ptr, sse_u32 = vdotq_u32(sse_u32, diff, diff); } while (--i != 0); - *sse = horizontal_add_uint32x4(sse_u32); - return *sse; + return horizontal_add_uint32x4(sse_u32); } static INLINE uint32_t highbd_mse8_16xh_neon(const uint16_t *src_ptr, int src_stride, const uint16_t *ref_ptr, - int ref_stride, int h, - unsigned int *sse) { + int ref_stride, int h) { uint32x4_t sse_u32 = vdupq_n_u32(0); int i = h; @@ -447,8 +442,7 @@ static INLINE uint32_t highbd_mse8_16xh_neon(const uint16_t *src_ptr, ref_ptr += ref_stride; } while (--i != 0); - *sse = horizontal_add_uint32x4(sse_u32); - return *sse; + return horizontal_add_uint32x4(sse_u32); } #else // !defined(__ARM_FEATURE_DOTPROD) @@ -456,51 +450,47 @@ static INLINE uint32_t highbd_mse8_16xh_neon(const uint16_t *src_ptr, static INLINE uint32_t highbd_mse8_8xh_neon(const uint16_t *src_ptr, int src_stride, const uint16_t *ref_ptr, - int ref_stride, int h, - unsigned int *sse) { - return highbd_mse_wxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 8, h, - sse); + int ref_stride, int h) { + return highbd_mse_wxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 8, h); } static INLINE uint32_t highbd_mse8_16xh_neon(const uint16_t *src_ptr, int src_stride, const uint16_t *ref_ptr, - int ref_stride, int h, - unsigned int *sse) { - return highbd_mse_wxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 16, h, - sse); + int ref_stride, int h) { + return highbd_mse_wxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 16, h); } #endif // defined(__ARM_FEATURE_DOTPROD) -#define HIGHBD_MSE_WXH_NEON(w, h) \ - uint32_t vpx_highbd_8_mse##w##x##h##_neon( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride, uint32_t *sse) { \ - uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ - uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ - highbd_mse8_##w##xh_neon(src, src_stride, ref, ref_stride, h, sse); \ - return *sse; \ - } \ - \ - uint32_t vpx_highbd_10_mse##w##x##h##_neon( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride, uint32_t *sse) { \ - uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ - uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ - highbd_mse_wxh_neon(src, src_stride, ref, ref_stride, w, h, sse); \ - *sse = ROUND_POWER_OF_TWO(*sse, 4); \ - return *sse; \ - } \ - \ - uint32_t vpx_highbd_12_mse##w##x##h##_neon( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride, uint32_t *sse) { \ - uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ - uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ - highbd_mse_wxh_neon(src, src_stride, ref, ref_stride, w, h, sse); \ - *sse = ROUND_POWER_OF_TWO(*sse, 8); \ - return *sse; \ +#define HIGHBD_MSE_WXH_NEON(w, h) \ + uint32_t vpx_highbd_8_mse##w##x##h##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + *sse = highbd_mse8_##w##xh_neon(src, src_stride, ref, ref_stride, h); \ + return *sse; \ + } \ + \ + uint32_t vpx_highbd_10_mse##w##x##h##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + *sse = highbd_mse_wxh_neon(src, src_stride, ref, ref_stride, w, h); \ + *sse = ROUND_POWER_OF_TWO(*sse, 4); \ + return *sse; \ + } \ + \ + uint32_t vpx_highbd_12_mse##w##x##h##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + *sse = highbd_mse_wxh_neon(src, src_stride, ref, ref_stride, w, h); \ + *sse = ROUND_POWER_OF_TWO(*sse, 8); \ + return *sse; \ } HIGHBD_MSE_WXH_NEON(16, 16) diff --git a/vpx_dsp/arm/variance_neon.c b/vpx_dsp/arm/variance_neon.c index 69ff1cf153..f41249d4d5 100644 --- a/vpx_dsp/arm/variance_neon.c +++ b/vpx_dsp/arm/variance_neon.c @@ -374,8 +374,7 @@ VARIANCE_WXH_NEON(64, 64, 12) static INLINE unsigned int vpx_mse8xh_neon(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, - int ref_stride, int h, - unsigned int *sse) { + int ref_stride, int h) { uint32x2_t sse_u32[2] = { vdup_n_u32(0), vdup_n_u32(0) }; int i = h / 2; @@ -398,15 +397,13 @@ static INLINE unsigned int vpx_mse8xh_neon(const unsigned char *src_ptr, sse_u32[1] = vdot_u32(sse_u32[1], diff1, diff1); } while (--i != 0); - *sse = horizontal_add_uint32x2(vadd_u32(sse_u32[0], sse_u32[1])); - return *sse; + return horizontal_add_uint32x2(vadd_u32(sse_u32[0], sse_u32[1])); } static INLINE unsigned int vpx_mse16xh_neon(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, - int ref_stride, int h, - unsigned int *sse) { + int ref_stride, int h) { uint32x4_t sse_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; int i = h / 2; @@ -429,8 +426,7 @@ static INLINE unsigned int vpx_mse16xh_neon(const unsigned char *src_ptr, sse_u32[1] = vdotq_u32(sse_u32[1], diff1, diff1); } while (--i != 0); - *sse = horizontal_add_uint32x4(vaddq_u32(sse_u32[0], sse_u32[1])); - return *sse; + return horizontal_add_uint32x4(vaddq_u32(sse_u32[0], sse_u32[1])); } unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int src_stride, @@ -451,8 +447,7 @@ unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int src_stride, static INLINE unsigned int vpx_mse8xh_neon(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, - int ref_stride, int h, - unsigned int *sse) { + int ref_stride, int h) { uint32x4_t sse_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; int i = h / 2; @@ -478,15 +473,13 @@ static INLINE unsigned int vpx_mse8xh_neon(const unsigned char *src_ptr, sse_u32[1] = vpadalq_u16(sse_u32[1], sse1); } while (--i != 0); - *sse = horizontal_add_uint32x4(vaddq_u32(sse_u32[0], sse_u32[1])); - return *sse; + return horizontal_add_uint32x4(vaddq_u32(sse_u32[0], sse_u32[1])); } static INLINE unsigned int vpx_mse16xh_neon(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, - int ref_stride, int h, - unsigned int *sse) { + int ref_stride, int h) { uint32x4_t sse_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; int i = h; @@ -507,8 +500,7 @@ static INLINE unsigned int vpx_mse16xh_neon(const unsigned char *src_ptr, sse_u32[1] = vpadalq_u16(sse_u32[1], sse1); } while (--i != 0); - *sse = horizontal_add_uint32x4(vaddq_u32(sse_u32[0], sse_u32[1])); - return *sse; + return horizontal_add_uint32x4(vaddq_u32(sse_u32[0], sse_u32[1])); } unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int src_stride, @@ -538,12 +530,12 @@ unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int src_stride, #endif // defined(__ARM_FEATURE_DOTPROD) -#define VPX_MSE_WXH_NEON(w, h) \ - unsigned int vpx_mse##w##x##h##_neon( \ - const unsigned char *src_ptr, int src_stride, \ - const unsigned char *ref_ptr, int ref_stride, unsigned int *sse) { \ - return vpx_mse##w##xh_neon(src_ptr, src_stride, ref_ptr, ref_stride, h, \ - sse); \ +#define VPX_MSE_WXH_NEON(w, h) \ + unsigned int vpx_mse##w##x##h##_neon( \ + const unsigned char *src_ptr, int src_stride, \ + const unsigned char *ref_ptr, int ref_stride, unsigned int *sse) { \ + *sse = vpx_mse##w##xh_neon(src_ptr, src_stride, ref_ptr, ref_stride, h); \ + return *sse; \ } VPX_MSE_WXH_NEON(8, 8) From 148d1085f79c6b4dd07d552cb51b53bd2e87a3aa Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Sat, 19 Aug 2023 12:45:36 +0100 Subject: [PATCH 0877/1000] Refactor and extend run-time CPU feature detection on Arm 1) Overhaul the Arm CPU feature detection code, taking inspiration from similar recent changes in libaom. 2) Add neon_dotprod and neon_i8mm arch options in the configure, build and unit test files, adding appropriate conditional options where necessary. 3) Soft-enable run-time CPU feature detection by default for both 32- bit and 64-bit Arm platforms. Change-Id: I3f13317d88324acc5753394351188baa8d18a261 --- build/make/Makefile | 6 ++ build/make/configure.sh | 17 +++- build/make/rtcd.pl | 2 +- configure | 9 +- test/test_libvpx.cc | 24 ++++- vpx_ports/aarch32_cpudetect.c | 89 +++++++++++++++++ vpx_ports/aarch64_cpudetect.c | 173 ++++++++++++++++++++++++++++++++++ vpx_ports/arm.h | 12 +-- vpx_ports/arm_cpudetect.c | 154 ------------------------------ vpx_ports/arm_cpudetect.h | 52 ++++++++++ vpx_ports/vpx_ports.mk | 7 +- 11 files changed, 378 insertions(+), 167 deletions(-) create mode 100644 vpx_ports/aarch32_cpudetect.c create mode 100644 vpx_ports/aarch64_cpudetect.c delete mode 100644 vpx_ports/arm_cpudetect.c create mode 100644 vpx_ports/arm_cpudetect.h diff --git a/build/make/Makefile b/build/make/Makefile index 65ac2290c7..c2dc47ccff 100644 --- a/build/make/Makefile +++ b/build/make/Makefile @@ -143,6 +143,12 @@ $(BUILD_PFX)%_avx2.c.o: CFLAGS += -mavx2 $(BUILD_PFX)%_avx512.c.d: CFLAGS += -mavx512f -mavx512cd -mavx512bw -mavx512dq -mavx512vl $(BUILD_PFX)%_avx512.c.o: CFLAGS += -mavx512f -mavx512cd -mavx512bw -mavx512dq -mavx512vl +# AARCH64 +$(BUILD_PFX)%_neon_dotprod.c.d: CFLAGS += -march=armv8.2-a+dotprod +$(BUILD_PFX)%_neon_dotprod.c.o: CFLAGS += -march=armv8.2-a+dotprod +$(BUILD_PFX)%_neon_i8mm.c.d: CFLAGS += -march=armv8.2-a+dotprod+i8mm +$(BUILD_PFX)%_neon_i8mm.c.o: CFLAGS += -march=armv8.2-a+dotprod+i8mm + # POWER $(BUILD_PFX)%_vsx.c.d: CFLAGS += -maltivec -mvsx $(BUILD_PFX)%_vsx.c.o: CFLAGS += -maltivec -mvsx diff --git a/build/make/configure.sh b/build/make/configure.sh index 7b2da3c1a1..9d3cd80cb3 100644 --- a/build/make/configure.sh +++ b/build/make/configure.sh @@ -973,10 +973,23 @@ process_common_toolchain() { # Process architecture variants case ${toolchain} in arm*) - # on arm, isa versions are supersets + soft_enable runtime_cpu_detect + # Arm ISA extensions are treated as supersets. case ${tgt_isa} in arm64|armv8) - soft_enable neon + for ext in ${ARCH_EXT_LIST_AARCH64}; do + # Disable higher order extensions to simplify dependencies. + if [ "$disable_exts" = "yes" ]; then + if ! disabled $ext; then + RTCD_OPTIONS="${RTCD_OPTIONS}--disable-${ext} " + disable_feature $ext + fi + elif disabled $ext; then + disable_exts="yes" + else + soft_enable $ext + fi + done ;; armv7|armv7s) soft_enable neon diff --git a/build/make/rtcd.pl b/build/make/rtcd.pl index f4edeaad51..1a6b93d5ae 100755 --- a/build/make/rtcd.pl +++ b/build/make/rtcd.pl @@ -487,7 +487,7 @@ () @ALL_ARCHS = filter(qw/neon_asm neon/); arm; } elsif ($opts{arch} eq 'armv8' || $opts{arch} eq 'arm64' ) { - @ALL_ARCHS = filter(qw/neon/); + @ALL_ARCHS = filter(qw/neon neon_dotprod neon_i8mm/); @REQUIRES = filter(qw/neon/); &require(@REQUIRES); arm; diff --git a/configure b/configure index aef65a8505..2c638e5e5a 100755 --- a/configure +++ b/configure @@ -252,6 +252,13 @@ ARCH_LIST=" ppc loongarch " + +ARCH_EXT_LIST_AARCH64=" + neon + neon_dotprod + neon_i8mm +" + ARCH_EXT_LIST_X86=" mmx sse @@ -271,8 +278,8 @@ ARCH_EXT_LIST_LOONGSON=" " ARCH_EXT_LIST=" - neon neon_asm + ${ARCH_EXT_LIST_AARCH64} mips32 dspr2 diff --git a/test/test_libvpx.cc b/test/test_libvpx.cc index 222a83f8c7..caab2dbd01 100644 --- a/test/test_libvpx.cc +++ b/test/test_libvpx.cc @@ -12,6 +12,9 @@ #include "third_party/googletest/src/include/gtest/gtest.h" #include "./vpx_config.h" +#if VPX_ARCH_ARM +#include "vpx_ports/arm.h" +#endif #if VPX_ARCH_X86 || VPX_ARCH_X86_64 #include "vpx_ports/x86.h" #endif @@ -26,7 +29,7 @@ extern void vpx_dsp_rtcd(); extern void vpx_scale_rtcd(); } -#if VPX_ARCH_X86 || VPX_ARCH_X86_64 +#if (!CONFIG_SHARED && VPX_ARCH_ARM) || VPX_ARCH_X86 || VPX_ARCH_X86_64 static void append_negative_gtest_filter(const char *str) { std::string filter = ::testing::FLAGS_gtest_filter; // Negative patterns begin with one '-' followed by a ':' separated list. @@ -34,11 +37,28 @@ static void append_negative_gtest_filter(const char *str) { filter += str; ::testing::FLAGS_gtest_filter = filter; } -#endif // VPX_ARCH_X86 || VPX_ARCH_X86_64 +#endif // (!CONFIG_SHARED && VPX_ARCH_ARM) || VPX_ARCH_X86 || VPX_ARCH_X86_64 int main(int argc, char **argv) { ::testing::InitGoogleTest(&argc, argv); +#if !CONFIG_SHARED +#if VPX_ARCH_AARCH64 + const int caps = arm_cpu_caps(); + if (!(caps & HAS_NEON_DOTPROD)) { + append_negative_gtest_filter(":NEON_DOTPROD.*:NEON_DOTPROD/*"); + } + if (!(caps & HAS_NEON_I8MM)) { + append_negative_gtest_filter(":NEON_I8MM.*:NEON_I8MM/*"); + } +#elif VPX_ARCH_ARM + const int caps = arm_cpu_caps(); + if (!(caps & HAS_NEON)) { + append_negative_gtest_filter(":NEON.*:NEON/*"); + } +#endif // VPX_ARCH_ARM +#endif // !CONFIG_SHARED + #if VPX_ARCH_X86 || VPX_ARCH_X86_64 const int simd_caps = x86_simd_caps(); if (!(simd_caps & HAS_MMX)) append_negative_gtest_filter(":MMX.*:MMX/*"); diff --git a/vpx_ports/aarch32_cpudetect.c b/vpx_ports/aarch32_cpudetect.c new file mode 100644 index 0000000000..48bdc70f92 --- /dev/null +++ b/vpx_ports/aarch32_cpudetect.c @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +// Feature detection code for Armv7-A / AArch32. + +#include "arm_cpudetect.h" + +#if !CONFIG_RUNTIME_CPU_DETECT + +static int arm_get_cpu_caps(void) { + // This function should actually be a no-op. There is no way to adjust any of + // these because the RTCD tables do not exist: the functions are called + // statically. + int flags = 0; +#if HAVE_NEON + flags |= HAS_NEON; +#endif // HAVE_NEON + return flags; +} + +#elif defined(_MSC_VER) // end !CONFIG_RUNTIME_CPU_DETECT + +static int arm_get_cpu_caps(void) { + int flags = 0; +#if HAVE_NEON || HAVE_NEON_ASM + // MSVC has no inline __asm support for Arm, but it does let you __emit + // instructions via their assembled hex code. + // All of these instructions should be essentially nops. + __try { + // VORR q0,q0,q0 + __emit(0xF2200150); + flags |= HAS_NEON; + } __except (GetExceptionCode() == EXCEPTION_ILLEGAL_INSTRUCTION) { + // Ignore exception. + } +#endif // HAVE_NEON || HAVE_NEON_ASM + return flags; +} + +#elif defined(ANDROID_USE_CPU_FEATURES_LIB) + +static int arm_get_cpu_caps(void) { + int flags = 0; +#if HAVE_NEON || HAVE_NEON_ASM + uint64_t features = android_getCpuFeatures(); + if (features & ANDROID_CPU_ARM_FEATURE_NEON) { + flags |= HAS_NEON; + } +#endif // HAVE_NEON || HAVE_NEON_ASM + return flags; +} + +#elif defined(__linux__) // end defined(AOM_USE_ANDROID_CPU_FEATURES) + +#include + +// Define hwcap values ourselves: building with an old auxv header where these +// hwcap values are not defined should not prevent features from being enabled. +#define VPX_AARCH32_HWCAP_NEON (1 << 12) + +static int arm_get_cpu_caps(void) { + int flags = 0; + unsigned long hwcap = getauxval(AT_HWCAP); +#if HAVE_NEON || HAVE_NEON_ASM + if (hwcap & VPX_AARCH32_HWCAP_NEON) { + flags |= HAS_NEON; + } +#endif // HAVE_NEON || HAVE_NEON_ASM + return flags; +} +#else // end __linux__ +#error \ + "Runtime CPU detection selected, but no CPU detection method available" \ +"for your platform. Rerun configure with --disable-runtime-cpu-detect." +#endif + +int arm_cpu_caps(void) { + int flags = 0; + if (arm_cpu_env_flags(&flags)) { + return flags; + } + return arm_get_cpu_caps() & arm_cpu_env_mask(); +} diff --git a/vpx_ports/aarch64_cpudetect.c b/vpx_ports/aarch64_cpudetect.c new file mode 100644 index 0000000000..a3054ad717 --- /dev/null +++ b/vpx_ports/aarch64_cpudetect.c @@ -0,0 +1,173 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "arm_cpudetect.h" + +#if defined(__APPLE__) +#include +#endif + +#if !CONFIG_RUNTIME_CPU_DETECT + +static int arm_get_cpu_caps(void) { + // This function should actually be a no-op. There is no way to adjust any of + // these because the RTCD tables do not exist: the functions are called + // statically. + int flags = 0; +#if HAVE_NEON + flags |= HAS_NEON; +#endif // HAVE_NEON + return flags; +} + +#elif defined(__APPLE__) // end !CONFIG_RUNTIME_CPU_DETECT + +// sysctlbyname() parameter documentation for instruction set characteristics: +// https://developer.apple.com/documentation/kernel/1387446-sysctlbyname/determining_instruction_set_characteristics +static INLINE int64_t have_feature(const char *feature) { + int64_t feature_present = 0; + size_t size = sizeof(feature_present); + if (sysctlbyname(feature, &feature_present, &size, NULL, 0) != 0) { + return 0; + } + return feature_present; +} + +static int arm_get_cpu_caps(void) { + int flags = 0; +#if HAVE_NEON + flags |= HAS_NEON; +#endif // HAVE_NEON +#if HAVE_NEON_DOTPROD + if (have_feature("hw.optional.arm.FEAT_DotProd")) { + flags |= HAS_NEON_DOTPROD; + } +#endif // HAVE_NEON_DOTPROD +#if HAVE_NEON_I8MM + if (have_feature("hw.optional.arm.FEAT_I8MM")) { + flags |= HAS_NEON_I8MM; + } +#endif // HAVE_NEON_I8MM + return flags; +} + +#elif defined(_MSC_VER) // end __APPLE__ + +static int arm_get_cpu_caps(void) { + int flags = 0; +// IsProcessorFeaturePresent() parameter documentation: +// https://learn.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-isprocessorfeaturepresent#parameters +#if HAVE_NEON + flags |= HAS_NEON; // Neon is mandatory in Armv8.0-A. +#endif // HAVE_NEON +#if HAVE_NEON_DOTPROD +// Support for PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE was added in Windows SDK +// 20348, supported by Windows 11 and Windows Server 2022. +#if defined(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE) + if (IsProcessorFeaturePresent(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE)) { + flags |= HAS_NEON_DOTPROD; + } +#endif // defined(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE) +#endif // HAVE_NEON_DOTPROD + // No I8MM feature detection available on Windows at time of writing. + return flags; +} + +#elif defined(ANDROID_USE_CPU_FEATURES_LIB) + +static int arm_get_cpu_caps(void) { + int flags = 0; +#if HAVE_NEON + flags |= HAS_NEON; // Neon is mandatory in Armv8.0-A. +#endif // HAVE_NEON + return flags; +} + +#elif defined(__linux__) // end defined(VPX_USE_ANDROID_CPU_FEATURES) + +#include + +// Define hwcap values ourselves: building with an old auxv header where these +// hwcap values are not defined should not prevent features from being enabled. +#define VPX_AARCH64_HWCAP_ASIMDDP (1 << 20) +#define VPX_AARCH64_HWCAP2_I8MM (1 << 13) + +static int arm_get_cpu_caps(void) { + int flags = 0; + unsigned long hwcap = getauxval(AT_HWCAP); + unsigned long hwcap2 = getauxval(AT_HWCAP2); +#if HAVE_NEON + flags |= HAS_NEON; // Neon is mandatory in Armv8.0-A. +#endif // HAVE_NEON +#if HAVE_NEON_DOTPROD + if (hwcap & VPX_AARCH64_HWCAP_ASIMDDP) { + flags |= HAS_NEON_DOTPROD; + } +#endif // HAVE_NEON_DOTPROD +#if HAVE_NEON_I8MM + if (hwcap2 & VPX_AARCH64_HWCAP2_I8MM) { + flags |= HAS_NEON_I8MM; + } +#endif // HAVE_NEON_I8MM + return flags; +} + +#elif defined(__Fuchsia__) // end __linux__ + +#include +#include + +// Added in https://fuchsia-review.googlesource.com/c/fuchsia/+/894282. +#ifndef ZX_ARM64_FEATURE_ISA_I8MM +#define ZX_ARM64_FEATURE_ISA_I8MM ((uint32_t)(1u << 19)) +#endif + +static int arm_get_cpu_caps(void) { + int flags = 0; +#if HAVE_NEON + flags |= HAS_NEON; // Neon is mandatory in Armv8.0-A. +#endif // HAVE_NEON + uint32_t features; + zx_status_t status = zx_system_get_features(ZX_FEATURE_KIND_CPU, &features); + if (status != ZX_OK) { + return flags; + } +#if HAVE_NEON_DOTPROD + if (features & ZX_ARM64_FEATURE_ISA_DP) { + flags |= HAS_NEON_DOTPROD; + } +#endif // HAVE_NEON_DOTPROD +#if HAVE_NEON_I8MM + if (features & ZX_ARM64_FEATURE_ISA_I8MM) { + flags |= HAS_NEON_I8MM; + } +#endif // HAVE_NEON_I8MM + return flags; +} + +#else // end __Fuchsia__ +#error \ + "Runtime CPU detection selected, but no CPU detection method available" \ +"for your platform. Rerun configure with --disable-runtime-cpu-detect." +#endif + +int arm_cpu_caps(void) { + int flags = 0; + if (!arm_cpu_env_flags(&flags)) { + flags = arm_get_cpu_caps() & arm_cpu_env_mask(); + } + + // Restrict flags: FEAT_I8MM assumes that FEAT_DotProd is available. + if (!(flags & HAS_NEON_DOTPROD)) { + flags &= ~HAS_NEON_I8MM; + } + + return flags; +} diff --git a/vpx_ports/arm.h b/vpx_ports/arm.h index 6458a2c5b0..65909d8260 100644 --- a/vpx_ports/arm.h +++ b/vpx_ports/arm.h @@ -17,12 +17,12 @@ extern "C" { #endif -/*ARMv5TE "Enhanced DSP" instructions.*/ -#define HAS_EDSP 0x01 -/*ARMv6 "Parallel" or "Media" instructions.*/ -#define HAS_MEDIA 0x02 -/*ARMv7 optional NEON instructions.*/ -#define HAS_NEON 0x04 +// Armv7-A optional Neon instructions, mandatory from Armv8.0-A. +#define HAS_NEON (1 << 0) +// Armv8.2-A optional Neon dot-product instructions, mandatory from Armv8.4-A. +#define HAS_NEON_DOTPROD (1 << 1) +// Armv8.2-A optional Neon i8mm instructions, mandatory from Armv8.6-A. +#define HAS_NEON_I8MM (1 << 2) int arm_cpu_caps(void); diff --git a/vpx_ports/arm_cpudetect.c b/vpx_ports/arm_cpudetect.c deleted file mode 100644 index 4f9d480ade..0000000000 --- a/vpx_ports/arm_cpudetect.c +++ /dev/null @@ -1,154 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include -#include - -#include "./vpx_config.h" -#include "vpx_ports/arm.h" - -#ifdef WINAPI_FAMILY -#include -#if !WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP) -#define getenv(x) NULL -#endif -#endif - -static int arm_cpu_env_flags(int *flags) { - char *env; - env = getenv("VPX_SIMD_CAPS"); - if (env && *env) { - *flags = (int)strtol(env, NULL, 0); - return 0; - } - *flags = 0; - return -1; -} - -static int arm_cpu_env_mask(void) { - char *env; - env = getenv("VPX_SIMD_CAPS_MASK"); - return env && *env ? (int)strtol(env, NULL, 0) : ~0; -} - -#if !CONFIG_RUNTIME_CPU_DETECT - -int arm_cpu_caps(void) { - /* This function should actually be a no-op. There is no way to adjust any of - * these because the RTCD tables do not exist: the functions are called - * statically */ - int flags; - int mask; - if (!arm_cpu_env_flags(&flags)) { - return flags; - } - mask = arm_cpu_env_mask(); -#if HAVE_NEON || HAVE_NEON_ASM - flags |= HAS_NEON; -#endif /* HAVE_NEON || HAVE_NEON_ASM */ - return flags & mask; -} - -#elif defined(_MSC_VER) /* end !CONFIG_RUNTIME_CPU_DETECT */ -/*For GetExceptionCode() and EXCEPTION_ILLEGAL_INSTRUCTION.*/ -#ifndef WIN32_LEAN_AND_MEAN -#define WIN32_LEAN_AND_MEAN -#endif -#ifndef WIN32_EXTRA_LEAN -#define WIN32_EXTRA_LEAN -#endif -#include - -int arm_cpu_caps(void) { - int flags; - int mask; - if (!arm_cpu_env_flags(&flags)) { - return flags; - } - mask = arm_cpu_env_mask(); -/* MSVC has no inline __asm support for ARM, but it does let you __emit - * instructions via their assembled hex code. - * All of these instructions should be essentially nops. - */ -#if HAVE_NEON || HAVE_NEON_ASM - if (mask & HAS_NEON) { - __try { - /*VORR q0,q0,q0*/ - __emit(0xF2200150); - flags |= HAS_NEON; - } __except (GetExceptionCode() == EXCEPTION_ILLEGAL_INSTRUCTION) { - /*Ignore exception.*/ - } - } -#endif /* HAVE_NEON || HAVE_NEON_ASM */ - return flags & mask; -} - -#elif defined(__ANDROID__) /* end _MSC_VER */ -#include - -int arm_cpu_caps(void) { - int flags; - int mask; - uint64_t features; - if (!arm_cpu_env_flags(&flags)) { - return flags; - } - mask = arm_cpu_env_mask(); - features = android_getCpuFeatures(); - -#if HAVE_NEON || HAVE_NEON_ASM - if (features & ANDROID_CPU_ARM_FEATURE_NEON) flags |= HAS_NEON; -#endif /* HAVE_NEON || HAVE_NEON_ASM */ - return flags & mask; -} - -#elif defined(__linux__) /* end __ANDROID__ */ - -#include - -int arm_cpu_caps(void) { - FILE *fin; - int flags; - int mask; - if (!arm_cpu_env_flags(&flags)) { - return flags; - } - mask = arm_cpu_env_mask(); - /* Reading /proc/self/auxv would be easier, but that doesn't work reliably - * on Android. - * This also means that detection will fail in Scratchbox. - */ - fin = fopen("/proc/cpuinfo", "r"); - if (fin != NULL) { - /* 512 should be enough for anybody (it's even enough for all the flags - * that x86 has accumulated... so far). - */ - char buf[512]; - while (fgets(buf, 511, fin) != NULL) { -#if HAVE_NEON || HAVE_NEON_ASM - if (memcmp(buf, "Features", 8) == 0) { - char *p; - p = strstr(buf, " neon"); - if (p != NULL && (p[5] == ' ' || p[5] == '\n')) { - flags |= HAS_NEON; - } - } -#endif /* HAVE_NEON || HAVE_NEON_ASM */ - } - fclose(fin); - } - return flags & mask; -} -#else /* end __linux__ */ -#error \ - "--enable-runtime-cpu-detect selected, but no CPU detection method " \ -"available for your platform. Reconfigure with --disable-runtime-cpu-detect." -#endif diff --git a/vpx_ports/arm_cpudetect.h b/vpx_ports/arm_cpudetect.h new file mode 100644 index 0000000000..24095d1acf --- /dev/null +++ b/vpx_ports/arm_cpudetect.h @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "vpx_config.h" +#include "vpx_ports/arm.h" + +#if defined(_MSC_VER) +#undef WIN32_LEAN_AND_MEAN +#define WIN32_LEAN_AND_MEAN +#undef WIN32_EXTRA_LEAN +#define WIN32_EXTRA_LEAN +#include +#endif + +#ifdef WINAPI_FAMILY +#include +#if !WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP) +#define getenv(x) NULL +#endif +#endif + +#if defined(__ANDROID__) && (__ANDROID_API__ < 18) +#define ANDROID_USE_CPU_FEATURES_LIB 1 +// Use getauxval() when targeting (64-bit) Android with API level >= 18. +// getauxval() is supported since Android API level 18 (Android 4.3.) +// First Android version with 64-bit support was Android 5.x (API level 21). +#include +#endif + +static INLINE int arm_cpu_env_flags(int *flags) { + const char *env = getenv("VPX_SIMD_CAPS"); + if (env && *env) { + *flags = (int)strtol(env, NULL, 0); + return 1; + } + return 0; +} + +static INLINE int arm_cpu_env_mask(void) { + const char *env = getenv("VPX_SIMD_CAPS_MASK"); + return env && *env ? (int)strtol(env, NULL, 0) : ~0; +} diff --git a/vpx_ports/vpx_ports.mk b/vpx_ports/vpx_ports.mk index e30e87cefb..93279dbebc 100644 --- a/vpx_ports/vpx_ports.mk +++ b/vpx_ports/vpx_ports.mk @@ -36,7 +36,12 @@ PORTS_SRCS-yes += x86.h PORTS_SRCS-yes += x86_abi_support.asm endif -PORTS_SRCS-$(VPX_ARCH_ARM) += arm_cpudetect.c +ifeq ($(VPX_ARCH_AARCH64),yes) +PORTS_SRCS-yes += aarch64_cpudetect.c +else +PORTS_SRCS-$(VPX_ARCH_ARM) += aarch32_cpudetect.c +endif +PORTS_SRCS-$(VPX_ARCH_ARM) += arm_cpudetect.h PORTS_SRCS-$(VPX_ARCH_ARM) += arm.h PORTS_SRCS-$(VPX_ARCH_PPC) += ppc_cpudetect.c From 91158c99f7bc241bf70bca597f289c681c71956b Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Sat, 19 Aug 2023 15:24:12 +0100 Subject: [PATCH 0878/1000] Use run-time CPU feature detection for vpx_convolve8_neon Arm Neon DotProd and I8MM implementations of vpx_convolve8* currently need to be enabled at compile time since they're guarded by ifdef feature macros. Now that run-time feature detection has been enabled for Arm platforms, expose these implementations with distinct *neon_dotprod/*neon_i8mm names in separate files and wire them up to the build system and rtcd.pl. Also add new test cases for the new DotProd and I8MM functions. Change-Id: I3db3cd62e8596099d9fec7805ca3ee86b2a01c74 --- test/convolve_test.cc | 30 + vpx_dsp/arm/vpx_convolve8_neon.c | 1427 ---------------------- vpx_dsp/arm/vpx_convolve8_neon.h | 22 +- vpx_dsp/arm/vpx_convolve8_neon_dotprod.c | 777 ++++++++++++ vpx_dsp/arm/vpx_convolve8_neon_i8mm.c | 698 +++++++++++ vpx_dsp/arm/vpx_convolve_neon.c | 55 - vpx_dsp/arm/vpx_convolve_neon_dotprod.c | 60 + vpx_dsp/arm/vpx_convolve_neon_i8mm.c | 60 + vpx_dsp/vpx_dsp.mk | 4 + vpx_dsp/vpx_dsp_rtcd_defs.pl | 12 +- 10 files changed, 1648 insertions(+), 1497 deletions(-) create mode 100644 vpx_dsp/arm/vpx_convolve8_neon_dotprod.c create mode 100644 vpx_dsp/arm/vpx_convolve8_neon_i8mm.c create mode 100644 vpx_dsp/arm/vpx_convolve_neon_dotprod.c create mode 100644 vpx_dsp/arm/vpx_convolve_neon_i8mm.c diff --git a/test/convolve_test.cc b/test/convolve_test.cc index 4d27c5ffcf..ffd5c41c63 100644 --- a/test/convolve_test.cc +++ b/test/convolve_test.cc @@ -1423,6 +1423,36 @@ INSTANTIATE_TEST_SUITE_P(NEON, ConvolveTest, ::testing::ValuesIn(kArrayConvolve_neon)); #endif // HAVE_NEON +#if HAVE_NEON_DOTPROD +const ConvolveFunctions convolve8_neon_dotprod( + vpx_convolve_copy_c, vpx_convolve_avg_c, vpx_convolve8_horiz_neon_dotprod, + vpx_convolve8_avg_horiz_neon_dotprod, vpx_convolve8_vert_neon_dotprod, + vpx_convolve8_avg_vert_neon_dotprod, vpx_convolve8_neon_dotprod, + vpx_convolve8_avg_neon_dotprod, vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c, + vpx_scaled_vert_c, vpx_scaled_avg_vert_c, vpx_scaled_2d_c, + vpx_scaled_avg_2d_c, 0); + +const ConvolveParam kArrayConvolve_neon_dotprod[] = { ALL_SIZES( + convolve8_neon_dotprod) }; +INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, ConvolveTest, + ::testing::ValuesIn(kArrayConvolve_neon_dotprod)); +#endif // HAVE_NEON_DOTPROD + +#if HAVE_NEON_I8MM +const ConvolveFunctions convolve8_neon_i8mm( + vpx_convolve_copy_c, vpx_convolve_avg_c, vpx_convolve8_horiz_neon_i8mm, + vpx_convolve8_avg_horiz_neon_i8mm, vpx_convolve8_vert_neon_i8mm, + vpx_convolve8_avg_vert_neon_i8mm, vpx_convolve8_neon_i8mm, + vpx_convolve8_avg_neon_i8mm, vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c, + vpx_scaled_vert_c, vpx_scaled_avg_vert_c, vpx_scaled_2d_c, + vpx_scaled_avg_2d_c, 0); + +const ConvolveParam kArrayConvolve_neon_i8mm[] = { ALL_SIZES( + convolve8_neon_i8mm) }; +INSTANTIATE_TEST_SUITE_P(NEON_I8MM, ConvolveTest, + ::testing::ValuesIn(kArrayConvolve_neon_i8mm)); +#endif // HAVE_NEON_I8MM + #if HAVE_DSPR2 const ConvolveFunctions convolve8_dspr2( vpx_convolve_copy_dspr2, vpx_convolve_avg_dspr2, vpx_convolve8_horiz_dspr2, diff --git a/vpx_dsp/arm/vpx_convolve8_neon.c b/vpx_dsp/arm/vpx_convolve8_neon.c index 505d0672f0..8b89862ba9 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon.c +++ b/vpx_dsp/arm/vpx_convolve8_neon.c @@ -32,1429 +32,6 @@ // instructions. This optimization is much faster in speed unit test, but slowed // down the whole decoder by 5%. -#if VPX_ARCH_AARCH64 && \ - (defined(__ARM_FEATURE_DOTPROD) || defined(__ARM_FEATURE_MATMUL_INT8)) - -DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = { - 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, - 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, - 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 -}; - -DECLARE_ALIGNED(16, static const uint8_t, dot_prod_tran_concat_tbl[32]) = { - 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27, - 4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31 -}; - -DECLARE_ALIGNED(16, static const uint8_t, dot_prod_merge_block_tbl[48]) = { - /* Shift left and insert new last column in transposed 4x4 block. */ - 1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28, - /* Shift left and insert two new columns in transposed 4x4 block. */ - 2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29, - /* Shift left and insert three new columns in transposed 4x4 block. */ - 3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30 -}; - -#if defined(__ARM_FEATURE_MATMUL_INT8) - -void vpx_convolve8_2d_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *filter, int x0_q4, - int x_step_q4, int y0_q4, int y_step_q4, int w, - int h) { - const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4])); - uint8x16_t s0, s1, s2, s3; - - assert((intptr_t)dst % 4 == 0); - assert(dst_stride % 4 == 0); - assert(x_step_q4 == 16); - assert(h % 4 == 3); - - (void)x_step_q4; - (void)y0_q4; - (void)y_step_q4; - - src -= 3; - - if (w == 4) { - const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); - int16x4_t d0, d1, d2, d3; - uint8x8_t d01, d23; - - do { - load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); - - d0 = convolve8_4_usdot(s0, filters, perm_tbl); - d1 = convolve8_4_usdot(s1, filters, perm_tbl); - d2 = convolve8_4_usdot(s2, filters, perm_tbl); - d3 = convolve8_4_usdot(s3, filters, perm_tbl); - d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); - d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); - - store_u8(dst + 0 * dst_stride, dst_stride, d01); - store_u8(dst + 2 * dst_stride, dst_stride, d23); - - src += 4 * src_stride; - dst += 4 * dst_stride; - h -= 4; - } while (h > 3); - - /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for - * further details on possible values of block height. */ - load_u8_16x3(src, src_stride, &s0, &s1, &s2); - - d0 = convolve8_4_usdot(s0, filters, perm_tbl); - d1 = convolve8_4_usdot(s1, filters, perm_tbl); - d2 = convolve8_4_usdot(s2, filters, perm_tbl); - d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); - d23 = vqrshrun_n_s16(vcombine_s16(d2, vdup_n_s16(0)), FILTER_BITS); - - store_u8(dst + 0 * dst_stride, dst_stride, d01); - store_u8_4x1(dst + 2 * dst_stride, d23); - } else { - const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl); - const uint8_t *s; - uint8_t *d; - int width; - uint8x8_t d0, d1, d2, d3; - - do { - width = w; - s = src; - d = dst; - do { - load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); - - d0 = convolve8_8_usdot(s0, filters, perm_tbl); - d1 = convolve8_8_usdot(s1, filters, perm_tbl); - d2 = convolve8_8_usdot(s2, filters, perm_tbl); - d3 = convolve8_8_usdot(s3, filters, perm_tbl); - - store_u8_8x4(d, dst_stride, d0, d1, d2, d3); - - s += 8; - d += 8; - width -= 8; - } while (width > 0); - src += 4 * src_stride; - dst += 4 * dst_stride; - h -= 4; - } while (h > 3); - - /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for - * further details on possible values of block height. */ - width = w; - s = src; - d = dst; - do { - load_u8_16x3(s, src_stride, &s0, &s1, &s2); - - d0 = convolve8_8_usdot(s0, filters, perm_tbl); - d1 = convolve8_8_usdot(s1, filters, perm_tbl); - d2 = convolve8_8_usdot(s2, filters, perm_tbl); - - store_u8_8x3(d, dst_stride, d0, d1, d2); - - s += 8; - d += 8; - width -= 8; - } while (width > 0); - } -} - -void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *filter, int x0_q4, - int x_step_q4, int y0_q4, int y_step_q4, int w, - int h) { - const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4])); - uint8x16_t s0, s1, s2, s3; - - assert((intptr_t)dst % 4 == 0); - assert(dst_stride % 4 == 0); - assert(x_step_q4 == 16); - - (void)x_step_q4; - (void)y0_q4; - (void)y_step_q4; - - src -= 3; - - if (w == 4) { - const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); - do { - int16x4_t t0, t1, t2, t3; - uint8x8_t d01, d23; - - load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); - - t0 = convolve8_4_usdot(s0, filters, perm_tbl); - t1 = convolve8_4_usdot(s1, filters, perm_tbl); - t2 = convolve8_4_usdot(s2, filters, perm_tbl); - t3 = convolve8_4_usdot(s3, filters, perm_tbl); - d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS); - d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS); - - store_u8(dst + 0 * dst_stride, dst_stride, d01); - store_u8(dst + 2 * dst_stride, dst_stride, d23); - - src += 4 * src_stride; - dst += 4 * dst_stride; - h -= 4; - } while (h != 0); - } else { - const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl); - const uint8_t *s; - uint8_t *d; - int width; - uint8x8_t d0, d1, d2, d3; - - do { - width = w; - s = src; - d = dst; - do { - load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); - - d0 = convolve8_8_usdot(s0, filters, perm_tbl); - d1 = convolve8_8_usdot(s1, filters, perm_tbl); - d2 = convolve8_8_usdot(s2, filters, perm_tbl); - d3 = convolve8_8_usdot(s3, filters, perm_tbl); - - store_u8_8x4(d, dst_stride, d0, d1, d2, d3); - - s += 8; - d += 8; - width -= 8; - } while (width != 0); - src += 4 * src_stride; - dst += 4 * dst_stride; - h -= 4; - } while (h != 0); - } -} - -void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *filter, int x0_q4, - int x_step_q4, int y0_q4, int y_step_q4, - int w, int h) { - const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4])); - uint8x16_t s0, s1, s2, s3; - - assert((intptr_t)dst % 4 == 0); - assert(dst_stride % 4 == 0); - assert(x_step_q4 == 16); - - (void)x_step_q4; - (void)y0_q4; - (void)y_step_q4; - - src -= 3; - - if (w == 4) { - const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); - do { - int16x4_t t0, t1, t2, t3; - uint8x8_t d01, d23, dd01, dd23; - - load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); - - t0 = convolve8_4_usdot(s0, filters, perm_tbl); - t1 = convolve8_4_usdot(s1, filters, perm_tbl); - t2 = convolve8_4_usdot(s2, filters, perm_tbl); - t3 = convolve8_4_usdot(s3, filters, perm_tbl); - d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS); - d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS); - - dd01 = load_u8(dst + 0 * dst_stride, dst_stride); - dd23 = load_u8(dst + 2 * dst_stride, dst_stride); - - d01 = vrhadd_u8(d01, dd01); - d23 = vrhadd_u8(d23, dd23); - - store_u8(dst + 0 * dst_stride, dst_stride, d01); - store_u8(dst + 2 * dst_stride, dst_stride, d23); - - src += 4 * src_stride; - dst += 4 * dst_stride; - h -= 4; - } while (h != 0); - } else { - const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl); - const uint8_t *s; - uint8_t *d; - int width; - uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3; - - do { - width = w; - s = src; - d = dst; - do { - load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); - - d0 = convolve8_8_usdot(s0, filters, perm_tbl); - d1 = convolve8_8_usdot(s1, filters, perm_tbl); - d2 = convolve8_8_usdot(s2, filters, perm_tbl); - d3 = convolve8_8_usdot(s3, filters, perm_tbl); - - load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); - - d0 = vrhadd_u8(d0, dd0); - d1 = vrhadd_u8(d1, dd1); - d2 = vrhadd_u8(d2, dd2); - d3 = vrhadd_u8(d3, dd3); - - store_u8_8x4(d, dst_stride, d0, d1, d2, d3); - - s += 8; - d += 8; - width -= 8; - } while (width != 0); - src += 4 * src_stride; - dst += 4 * dst_stride; - h -= 4; - } while (h != 0); - } -} - -static INLINE void transpose_concat_4x4(uint8x8_t a0, uint8x8_t a1, - uint8x8_t a2, uint8x8_t a3, - uint8x16_t *b, - const uint8x16_t permute_tbl) { - /* Transpose 8-bit elements and concatenate result rows as follows: - * a0: 00, 01, 02, 03, XX, XX, XX, XX - * a1: 10, 11, 12, 13, XX, XX, XX, XX - * a2: 20, 21, 22, 23, XX, XX, XX, XX - * a3: 30, 31, 32, 33, XX, XX, XX, XX - * - * b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 - * - * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it - * as an argument is preferable to loading it directly from memory as this - * inline helper is called many times from the same parent function. - */ - - uint8x16x2_t samples = { { vcombine_u8(a0, a1), vcombine_u8(a2, a3) } }; - *b = vqtbl2q_u8(samples, permute_tbl); -} - -static INLINE void transpose_concat_8x4(uint8x8_t a0, uint8x8_t a1, - uint8x8_t a2, uint8x8_t a3, - uint8x16_t *b0, uint8x16_t *b1, - const uint8x16x2_t permute_tbl) { - /* Transpose 8-bit elements and concatenate result rows as follows: - * a0: 00, 01, 02, 03, 04, 05, 06, 07 - * a1: 10, 11, 12, 13, 14, 15, 16, 17 - * a2: 20, 21, 22, 23, 24, 25, 26, 27 - * a3: 30, 31, 32, 33, 34, 35, 36, 37 - * - * b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 - * b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37 - * - * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it - * as an argument is preferable to loading it directly from memory as this - * inline helper is called many times from the same parent function. - */ - - uint8x16x2_t samples = { { vcombine_u8(a0, a1), vcombine_u8(a2, a3) } }; - *b0 = vqtbl2q_u8(samples, permute_tbl.val[0]); - *b1 = vqtbl2q_u8(samples, permute_tbl.val[1]); -} - -void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *filter, int x0_q4, - int x_step_q4, int y0_q4, int y_step_q4, int w, - int h) { - const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4])); - const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); - uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; - uint8x16x2_t samples_LUT; - - assert((intptr_t)dst % 4 == 0); - assert(dst_stride % 4 == 0); - assert(y_step_q4 == 16); - - (void)x0_q4; - (void)x_step_q4; - (void)y_step_q4; - - src -= 3 * src_stride; - - if (w == 4) { - const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); - uint8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910; - int16x4_t d0, d1, d2, d3; - uint8x8_t d01, d23; - - load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); - src += 7 * src_stride; - - s7 = vdup_n_u8(0); - s8 = vdup_n_u8(0); - s9 = vdup_n_u8(0); - - /* This operation combines a conventional transpose and the sample permute - * (see horizontal case) required before computing the dot product. - */ - transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl); - transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl); - transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl); - transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl); - transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl); - transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl); - transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl); - - do { - load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10); - - transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl); - - /* Merge new data into block from previous iteration. */ - samples_LUT.val[0] = s3456; - samples_LUT.val[1] = s78910; - s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); - s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); - s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); - - d0 = convolve8_4_usdot_partial(s0123, s4567, filters); - d1 = convolve8_4_usdot_partial(s1234, s5678, filters); - d2 = convolve8_4_usdot_partial(s2345, s6789, filters); - d3 = convolve8_4_usdot_partial(s3456, s78910, filters); - d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); - d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); - - store_u8(dst + 0 * dst_stride, dst_stride, d01); - store_u8(dst + 2 * dst_stride, dst_stride, d23); - - /* Prepare block for next iteration - re-using as much as possible. */ - /* Shuffle everything up four rows. */ - s0123 = s4567; - s1234 = s5678; - s2345 = s6789; - s3456 = s78910; - - src += 4 * src_stride; - dst += 4 * dst_stride; - h -= 4; - } while (h != 0); - } else { - const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl); - uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, - s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, - s6789_hi, s78910_lo, s78910_hi; - uint8x8_t d0, d1, d2, d3; - const uint8_t *s; - uint8_t *d; - int height; - - do { - height = h; - s = src; - d = dst; - - load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); - s += 7 * src_stride; - - s7 = vdup_n_u8(0); - s8 = vdup_n_u8(0); - s9 = vdup_n_u8(0); - - /* This operation combines a conventional transpose and the sample permute - * (see horizontal case) required before computing the dot product. - */ - transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi, - tran_concat_tbl); - transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi, - tran_concat_tbl); - transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi, - tran_concat_tbl); - transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi, - tran_concat_tbl); - transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi, - tran_concat_tbl); - transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi, - tran_concat_tbl); - transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi, - tran_concat_tbl); - - do { - load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10); - - transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi, - tran_concat_tbl); - - /* Merge new data into block from previous iteration. */ - samples_LUT.val[0] = s3456_lo; - samples_LUT.val[1] = s78910_lo; - s4567_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); - s5678_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); - s6789_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); - - samples_LUT.val[0] = s3456_hi; - samples_LUT.val[1] = s78910_hi; - s4567_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); - s5678_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); - s6789_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); - - d0 = convolve8_8_usdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi, - filters); - d1 = convolve8_8_usdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi, - filters); - d2 = convolve8_8_usdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi, - filters); - d3 = convolve8_8_usdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi, - filters); - - store_u8_8x4(d, dst_stride, d0, d1, d2, d3); - - /* Prepare block for next iteration - re-using as much as possible. */ - /* Shuffle everything up four rows. */ - s0123_lo = s4567_lo; - s0123_hi = s4567_hi; - s1234_lo = s5678_lo; - s1234_hi = s5678_hi; - s2345_lo = s6789_lo; - s2345_hi = s6789_hi; - s3456_lo = s78910_lo; - s3456_hi = s78910_hi; - - s += 4 * src_stride; - d += 4 * dst_stride; - height -= 4; - } while (height != 0); - src += 8; - dst += 8; - w -= 8; - } while (w != 0); - } -} - -void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *filter, int x0_q4, - int x_step_q4, int y0_q4, int y_step_q4, int w, - int h) { - const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4])); - const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); - uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; - uint8x16x2_t samples_LUT; - - assert((intptr_t)dst % 4 == 0); - assert(dst_stride % 4 == 0); - assert(y_step_q4 == 16); - - (void)x0_q4; - (void)x_step_q4; - (void)y_step_q4; - - src -= 3 * src_stride; - - if (w == 4) { - const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); - uint8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910; - int16x4_t d0, d1, d2, d3; - uint8x8_t d01, d23, dd01, dd23; - - load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); - src += 7 * src_stride; - - s7 = vdup_n_u8(0); - s8 = vdup_n_u8(0); - s9 = vdup_n_u8(0); - - /* This operation combines a conventional transpose and the sample permute - * (see horizontal case) required before computing the dot product. - */ - transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl); - transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl); - transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl); - transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl); - transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl); - transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl); - transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl); - - do { - load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10); - - transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl); - - /* Merge new data into block from previous iteration. */ - samples_LUT.val[0] = s3456; - samples_LUT.val[1] = s78910; - s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); - s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); - s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); - - d0 = convolve8_4_usdot_partial(s0123, s4567, filters); - d1 = convolve8_4_usdot_partial(s1234, s5678, filters); - d2 = convolve8_4_usdot_partial(s2345, s6789, filters); - d3 = convolve8_4_usdot_partial(s3456, s78910, filters); - d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); - d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); - - dd01 = load_u8(dst + 0 * dst_stride, dst_stride); - dd23 = load_u8(dst + 2 * dst_stride, dst_stride); - - d01 = vrhadd_u8(d01, dd01); - d23 = vrhadd_u8(d23, dd23); - - store_u8(dst + 0 * dst_stride, dst_stride, d01); - store_u8(dst + 2 * dst_stride, dst_stride, d23); - - /* Prepare block for next iteration - re-using as much as possible. */ - /* Shuffle everything up four rows. */ - s0123 = s4567; - s1234 = s5678; - s2345 = s6789; - s3456 = s78910; - - src += 4 * src_stride; - dst += 4 * dst_stride; - h -= 4; - } while (h != 0); - } else { - const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl); - uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, - s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, - s6789_hi, s78910_lo, s78910_hi; - uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3; - const uint8_t *s; - uint8_t *d; - int height; - - do { - height = h; - s = src; - d = dst; - - load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); - s += 7 * src_stride; - - s7 = vdup_n_u8(0); - s8 = vdup_n_u8(0); - s9 = vdup_n_u8(0); - - /* This operation combines a conventional transpose and the sample permute - * (see horizontal case) required before computing the dot product. - */ - transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi, - tran_concat_tbl); - transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi, - tran_concat_tbl); - transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi, - tran_concat_tbl); - transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi, - tran_concat_tbl); - transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi, - tran_concat_tbl); - transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi, - tran_concat_tbl); - transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi, - tran_concat_tbl); - - do { - load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10); - - transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi, - tran_concat_tbl); - - /* Merge new data into block from previous iteration. */ - samples_LUT.val[0] = s3456_lo; - samples_LUT.val[1] = s78910_lo; - s4567_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); - s5678_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); - s6789_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); - - samples_LUT.val[0] = s3456_hi; - samples_LUT.val[1] = s78910_hi; - s4567_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); - s5678_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); - s6789_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); - - d0 = convolve8_8_usdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi, - filters); - d1 = convolve8_8_usdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi, - filters); - d2 = convolve8_8_usdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi, - filters); - d3 = convolve8_8_usdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi, - filters); - - load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); - - d0 = vrhadd_u8(d0, dd0); - d1 = vrhadd_u8(d1, dd1); - d2 = vrhadd_u8(d2, dd2); - d3 = vrhadd_u8(d3, dd3); - - store_u8_8x4(d, dst_stride, d0, d1, d2, d3); - - /* Prepare block for next iteration - re-using as much as possible. */ - /* Shuffle everything up four rows. */ - s0123_lo = s4567_lo; - s0123_hi = s4567_hi; - s1234_lo = s5678_lo; - s1234_hi = s5678_hi; - s2345_lo = s6789_lo; - s2345_hi = s6789_hi; - s3456_lo = s78910_lo; - s3456_hi = s78910_hi; - - s += 4 * src_stride; - d += 4 * dst_stride; - height -= 4; - } while (height != 0); - src += 8; - dst += 8; - w -= 8; - } while (w != 0); - } -} - -#else // !defined(__ARM_FEATURE_MATMUL_INT8) - -void vpx_convolve8_2d_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *filter, int x0_q4, - int x_step_q4, int y0_q4, int y_step_q4, int w, - int h) { - const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4])); - const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[x0_q4]), 128); - const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp)); - const uint8x16_t range_limit = vdupq_n_u8(128); - uint8x16_t s0, s1, s2, s3; - - assert((intptr_t)dst % 4 == 0); - assert(dst_stride % 4 == 0); - assert(x_step_q4 == 16); - assert(h % 4 == 3); - - (void)x_step_q4; - (void)y0_q4; - (void)y_step_q4; - - src -= 3; - - if (w == 4) { - const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); - int16x4_t d0, d1, d2, d3; - uint8x8_t d01, d23; - - do { - load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); - - d0 = convolve8_4_sdot(s0, filters, correction, range_limit, perm_tbl); - d1 = convolve8_4_sdot(s1, filters, correction, range_limit, perm_tbl); - d2 = convolve8_4_sdot(s2, filters, correction, range_limit, perm_tbl); - d3 = convolve8_4_sdot(s3, filters, correction, range_limit, perm_tbl); - d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); - d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); - - store_u8(dst + 0 * dst_stride, dst_stride, d01); - store_u8(dst + 2 * dst_stride, dst_stride, d23); - - src += 4 * src_stride; - dst += 4 * dst_stride; - h -= 4; - } while (h > 3); - - /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for - * further details on possible values of block height. */ - load_u8_16x3(src, src_stride, &s0, &s1, &s2); - - d0 = convolve8_4_sdot(s0, filters, correction, range_limit, perm_tbl); - d1 = convolve8_4_sdot(s1, filters, correction, range_limit, perm_tbl); - d2 = convolve8_4_sdot(s2, filters, correction, range_limit, perm_tbl); - d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); - d23 = vqrshrun_n_s16(vcombine_s16(d2, vdup_n_s16(0)), FILTER_BITS); - - store_u8(dst + 0 * dst_stride, dst_stride, d01); - store_u8_4x1(dst + 2 * dst_stride, d23); - } else { - const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl); - const uint8_t *s; - uint8_t *d; - int width; - uint8x8_t d0, d1, d2, d3; - - do { - width = w; - s = src; - d = dst; - do { - load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); - - d0 = convolve8_8_sdot(s0, filters, correction, range_limit, perm_tbl); - d1 = convolve8_8_sdot(s1, filters, correction, range_limit, perm_tbl); - d2 = convolve8_8_sdot(s2, filters, correction, range_limit, perm_tbl); - d3 = convolve8_8_sdot(s3, filters, correction, range_limit, perm_tbl); - - store_u8_8x4(d, dst_stride, d0, d1, d2, d3); - - s += 8; - d += 8; - width -= 8; - } while (width != 0); - src += 4 * src_stride; - dst += 4 * dst_stride; - h -= 4; - } while (h > 3); - - /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for - * further details on possible values of block height. */ - width = w; - s = src; - d = dst; - do { - load_u8_16x3(s, src_stride, &s0, &s1, &s2); - - d0 = convolve8_8_sdot(s0, filters, correction, range_limit, perm_tbl); - d1 = convolve8_8_sdot(s1, filters, correction, range_limit, perm_tbl); - d2 = convolve8_8_sdot(s2, filters, correction, range_limit, perm_tbl); - - store_u8_8x3(d, dst_stride, d0, d1, d2); - - s += 8; - d += 8; - width -= 8; - } while (width != 0); - } -} - -void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *filter, int x0_q4, - int x_step_q4, int y0_q4, int y_step_q4, int w, - int h) { - const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4])); - const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[x0_q4]), 128); - const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp)); - const uint8x16_t range_limit = vdupq_n_u8(128); - uint8x16_t s0, s1, s2, s3; - - assert((intptr_t)dst % 4 == 0); - assert(dst_stride % 4 == 0); - assert(x_step_q4 == 16); - - (void)x_step_q4; - (void)y0_q4; - (void)y_step_q4; - - src -= 3; - - if (w == 4) { - const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); - do { - int16x4_t t0, t1, t2, t3; - uint8x8_t d01, d23; - - load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); - - t0 = convolve8_4_sdot(s0, filters, correction, range_limit, perm_tbl); - t1 = convolve8_4_sdot(s1, filters, correction, range_limit, perm_tbl); - t2 = convolve8_4_sdot(s2, filters, correction, range_limit, perm_tbl); - t3 = convolve8_4_sdot(s3, filters, correction, range_limit, perm_tbl); - d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS); - d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS); - - store_u8(dst + 0 * dst_stride, dst_stride, d01); - store_u8(dst + 2 * dst_stride, dst_stride, d23); - - src += 4 * src_stride; - dst += 4 * dst_stride; - h -= 4; - } while (h != 0); - } else { - const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl); - const uint8_t *s; - uint8_t *d; - int width; - uint8x8_t d0, d1, d2, d3; - - do { - width = w; - s = src; - d = dst; - do { - load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); - - d0 = convolve8_8_sdot(s0, filters, correction, range_limit, perm_tbl); - d1 = convolve8_8_sdot(s1, filters, correction, range_limit, perm_tbl); - d2 = convolve8_8_sdot(s2, filters, correction, range_limit, perm_tbl); - d3 = convolve8_8_sdot(s3, filters, correction, range_limit, perm_tbl); - - store_u8_8x4(d, dst_stride, d0, d1, d2, d3); - - s += 8; - d += 8; - width -= 8; - } while (width != 0); - src += 4 * src_stride; - dst += 4 * dst_stride; - h -= 4; - } while (h != 0); - } -} - -void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *filter, int x0_q4, - int x_step_q4, int y0_q4, int y_step_q4, - int w, int h) { - const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4])); - const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[x0_q4]), 128); - const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp)); - const uint8x16_t range_limit = vdupq_n_u8(128); - uint8x16_t s0, s1, s2, s3; - - assert((intptr_t)dst % 4 == 0); - assert(dst_stride % 4 == 0); - assert(x_step_q4 == 16); - - (void)x_step_q4; - (void)y0_q4; - (void)y_step_q4; - - src -= 3; - - if (w == 4) { - const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); - do { - int16x4_t t0, t1, t2, t3; - uint8x8_t d01, d23, dd01, dd23; - - load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); - - t0 = convolve8_4_sdot(s0, filters, correction, range_limit, perm_tbl); - t1 = convolve8_4_sdot(s1, filters, correction, range_limit, perm_tbl); - t2 = convolve8_4_sdot(s2, filters, correction, range_limit, perm_tbl); - t3 = convolve8_4_sdot(s3, filters, correction, range_limit, perm_tbl); - d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS); - d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS); - - dd01 = load_u8(dst + 0 * dst_stride, dst_stride); - dd23 = load_u8(dst + 2 * dst_stride, dst_stride); - - d01 = vrhadd_u8(d01, dd01); - d23 = vrhadd_u8(d23, dd23); - - store_u8(dst + 0 * dst_stride, dst_stride, d01); - store_u8(dst + 2 * dst_stride, dst_stride, d23); - - src += 4 * src_stride; - dst += 4 * dst_stride; - h -= 4; - } while (h != 0); - } else { - const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl); - const uint8_t *s; - uint8_t *d; - int width; - uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3; - - do { - width = w; - s = src; - d = dst; - do { - load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); - - d0 = convolve8_8_sdot(s0, filters, correction, range_limit, perm_tbl); - d1 = convolve8_8_sdot(s1, filters, correction, range_limit, perm_tbl); - d2 = convolve8_8_sdot(s2, filters, correction, range_limit, perm_tbl); - d3 = convolve8_8_sdot(s3, filters, correction, range_limit, perm_tbl); - - load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); - - d0 = vrhadd_u8(d0, dd0); - d1 = vrhadd_u8(d1, dd1); - d2 = vrhadd_u8(d2, dd2); - d3 = vrhadd_u8(d3, dd3); - - store_u8_8x4(d, dst_stride, d0, d1, d2, d3); - - s += 8; - d += 8; - width -= 8; - } while (width != 0); - src += 4 * src_stride; - dst += 4 * dst_stride; - h -= 4; - } while (h != 0); - } -} - -static INLINE void transpose_concat_4x4(int8x8_t a0, int8x8_t a1, int8x8_t a2, - int8x8_t a3, int8x16_t *b, - const uint8x16_t permute_tbl) { - /* Transpose 8-bit elements and concatenate result rows as follows: - * a0: 00, 01, 02, 03, XX, XX, XX, XX - * a1: 10, 11, 12, 13, XX, XX, XX, XX - * a2: 20, 21, 22, 23, XX, XX, XX, XX - * a3: 30, 31, 32, 33, XX, XX, XX, XX - * - * b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 - * - * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it - * as an argument is preferable to loading it directly from memory as this - * inline helper is called many times from the same parent function. - */ - - int8x16x2_t samples = { { vcombine_s8(a0, a1), vcombine_s8(a2, a3) } }; - *b = vqtbl2q_s8(samples, permute_tbl); -} - -static INLINE void transpose_concat_8x4(int8x8_t a0, int8x8_t a1, int8x8_t a2, - int8x8_t a3, int8x16_t *b0, - int8x16_t *b1, - const uint8x16x2_t permute_tbl) { - /* Transpose 8-bit elements and concatenate result rows as follows: - * a0: 00, 01, 02, 03, 04, 05, 06, 07 - * a1: 10, 11, 12, 13, 14, 15, 16, 17 - * a2: 20, 21, 22, 23, 24, 25, 26, 27 - * a3: 30, 31, 32, 33, 34, 35, 36, 37 - * - * b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 - * b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37 - * - * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it - * as an argument is preferable to loading it directly from memory as this - * inline helper is called many times from the same parent function. - */ - - int8x16x2_t samples = { { vcombine_s8(a0, a1), vcombine_s8(a2, a3) } }; - *b0 = vqtbl2q_s8(samples, permute_tbl.val[0]); - *b1 = vqtbl2q_s8(samples, permute_tbl.val[1]); -} - -void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *filter, int x0_q4, - int x_step_q4, int y0_q4, int y_step_q4, int w, - int h) { - const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4])); - const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[y0_q4]), 128); - const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp)); - const uint8x8_t range_limit = vdup_n_u8(128); - const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); - uint8x8_t t0, t1, t2, t3, t4, t5, t6; - int8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; - int8x16x2_t samples_LUT; - - assert((intptr_t)dst % 4 == 0); - assert(dst_stride % 4 == 0); - assert(y_step_q4 == 16); - - (void)x0_q4; - (void)x_step_q4; - (void)y_step_q4; - - src -= 3 * src_stride; - - if (w == 4) { - const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); - int8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910; - int16x4_t d0, d1, d2, d3; - uint8x8_t d01, d23; - - load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); - src += 7 * src_stride; - - /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ - s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); - s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit)); - s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit)); - s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit)); - s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); - s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); - s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); - s7 = vdup_n_s8(0); - s8 = vdup_n_s8(0); - s9 = vdup_n_s8(0); - - /* This operation combines a conventional transpose and the sample permute - * (see horizontal case) required before computing the dot product. - */ - transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl); - transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl); - transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl); - transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl); - transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl); - transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl); - transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl); - - do { - uint8x8_t t7, t8, t9, t10; - - load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10); - - s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit)); - s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit)); - s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); - s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); - - transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl); - - /* Merge new data into block from previous iteration. */ - samples_LUT.val[0] = s3456; - samples_LUT.val[1] = s78910; - s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); - s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); - s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); - - d0 = convolve8_4_sdot_partial(s0123, s4567, correction, filters); - d1 = convolve8_4_sdot_partial(s1234, s5678, correction, filters); - d2 = convolve8_4_sdot_partial(s2345, s6789, correction, filters); - d3 = convolve8_4_sdot_partial(s3456, s78910, correction, filters); - d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); - d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); - - store_u8(dst + 0 * dst_stride, dst_stride, d01); - store_u8(dst + 2 * dst_stride, dst_stride, d23); - - /* Prepare block for next iteration - re-using as much as possible. */ - /* Shuffle everything up four rows. */ - s0123 = s4567; - s1234 = s5678; - s2345 = s6789; - s3456 = s78910; - - src += 4 * src_stride; - dst += 4 * dst_stride; - h -= 4; - } while (h != 0); - } else { - const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl); - int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, - s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, - s6789_hi, s78910_lo, s78910_hi; - uint8x8_t d0, d1, d2, d3; - const uint8_t *s; - uint8_t *d; - int height; - - do { - height = h; - s = src; - d = dst; - - load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); - s += 7 * src_stride; - - /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ - s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); - s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit)); - s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit)); - s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit)); - s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); - s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); - s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); - s7 = vdup_n_s8(0); - s8 = vdup_n_s8(0); - s9 = vdup_n_s8(0); - - /* This operation combines a conventional transpose and the sample permute - * (see horizontal case) required before computing the dot product. - */ - transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi, - tran_concat_tbl); - transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi, - tran_concat_tbl); - transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi, - tran_concat_tbl); - transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi, - tran_concat_tbl); - transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi, - tran_concat_tbl); - transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi, - tran_concat_tbl); - transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi, - tran_concat_tbl); - - do { - uint8x8_t t7, t8, t9, t10; - - load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10); - - s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit)); - s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit)); - s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); - s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); - - transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi, - tran_concat_tbl); - - /* Merge new data into block from previous iteration. */ - samples_LUT.val[0] = s3456_lo; - samples_LUT.val[1] = s78910_lo; - s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); - s5678_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); - s6789_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); - - samples_LUT.val[0] = s3456_hi; - samples_LUT.val[1] = s78910_hi; - s4567_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); - s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); - s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); - - d0 = convolve8_8_sdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi, - correction, filters); - d1 = convolve8_8_sdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi, - correction, filters); - d2 = convolve8_8_sdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi, - correction, filters); - d3 = convolve8_8_sdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi, - correction, filters); - - store_u8_8x4(d, dst_stride, d0, d1, d2, d3); - - /* Prepare block for next iteration - re-using as much as possible. */ - /* Shuffle everything up four rows. */ - s0123_lo = s4567_lo; - s0123_hi = s4567_hi; - s1234_lo = s5678_lo; - s1234_hi = s5678_hi; - s2345_lo = s6789_lo; - s2345_hi = s6789_hi; - s3456_lo = s78910_lo; - s3456_hi = s78910_hi; - - s += 4 * src_stride; - d += 4 * dst_stride; - height -= 4; - } while (height != 0); - src += 8; - dst += 8; - w -= 8; - } while (w != 0); - } -} - -void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *filter, int x0_q4, - int x_step_q4, int y0_q4, int y_step_q4, int w, - int h) { - const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4])); - const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[y0_q4]), 128); - const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp)); - const uint8x8_t range_limit = vdup_n_u8(128); - const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); - uint8x8_t t0, t1, t2, t3, t4, t5, t6; - int8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; - int8x16x2_t samples_LUT; - - assert((intptr_t)dst % 4 == 0); - assert(dst_stride % 4 == 0); - assert(y_step_q4 == 16); - - (void)x0_q4; - (void)x_step_q4; - (void)y_step_q4; - - src -= 3 * src_stride; - - if (w == 4) { - const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); - int8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910; - int16x4_t d0, d1, d2, d3; - uint8x8_t d01, d23, dd01, dd23; - - load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); - src += 7 * src_stride; - - /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ - s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); - s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit)); - s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit)); - s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit)); - s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); - s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); - s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); - s7 = vdup_n_s8(0); - s8 = vdup_n_s8(0); - s9 = vdup_n_s8(0); - - /* This operation combines a conventional transpose and the sample permute - * (see horizontal case) required before computing the dot product. - */ - transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl); - transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl); - transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl); - transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl); - transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl); - transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl); - transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl); - - do { - uint8x8_t t7, t8, t9, t10; - - load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10); - - s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit)); - s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit)); - s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); - s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); - - transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl); - - /* Merge new data into block from previous iteration. */ - samples_LUT.val[0] = s3456; - samples_LUT.val[1] = s78910; - s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); - s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); - s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); - - d0 = convolve8_4_sdot_partial(s0123, s4567, correction, filters); - d1 = convolve8_4_sdot_partial(s1234, s5678, correction, filters); - d2 = convolve8_4_sdot_partial(s2345, s6789, correction, filters); - d3 = convolve8_4_sdot_partial(s3456, s78910, correction, filters); - d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); - d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); - - dd01 = load_u8(dst + 0 * dst_stride, dst_stride); - dd23 = load_u8(dst + 2 * dst_stride, dst_stride); - - d01 = vrhadd_u8(d01, dd01); - d23 = vrhadd_u8(d23, dd23); - - store_u8(dst + 0 * dst_stride, dst_stride, d01); - store_u8(dst + 2 * dst_stride, dst_stride, d23); - - /* Prepare block for next iteration - re-using as much as possible. */ - /* Shuffle everything up four rows. */ - s0123 = s4567; - s1234 = s5678; - s2345 = s6789; - s3456 = s78910; - - src += 4 * src_stride; - dst += 4 * dst_stride; - h -= 4; - } while (h != 0); - } else { - const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl); - int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, - s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, - s6789_hi, s78910_lo, s78910_hi; - uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3; - const uint8_t *s; - uint8_t *d; - int height; - - do { - height = h; - s = src; - d = dst; - - load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); - s += 7 * src_stride; - - /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ - s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); - s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit)); - s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit)); - s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit)); - s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); - s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); - s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); - s7 = vdup_n_s8(0); - s8 = vdup_n_s8(0); - s9 = vdup_n_s8(0); - - /* This operation combines a conventional transpose and the sample permute - * (see horizontal case) required before computing the dot product. - */ - transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi, - tran_concat_tbl); - transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi, - tran_concat_tbl); - transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi, - tran_concat_tbl); - transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi, - tran_concat_tbl); - transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi, - tran_concat_tbl); - transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi, - tran_concat_tbl); - transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi, - tran_concat_tbl); - - do { - uint8x8_t t7, t8, t9, t10; - - load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10); - - s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit)); - s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit)); - s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); - s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); - - transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi, - tran_concat_tbl); - - /* Merge new data into block from previous iteration. */ - samples_LUT.val[0] = s3456_lo; - samples_LUT.val[1] = s78910_lo; - s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); - s5678_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); - s6789_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); - - samples_LUT.val[0] = s3456_hi; - samples_LUT.val[1] = s78910_hi; - s4567_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); - s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); - s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); - - d0 = convolve8_8_sdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi, - correction, filters); - d1 = convolve8_8_sdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi, - correction, filters); - d2 = convolve8_8_sdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi, - correction, filters); - d3 = convolve8_8_sdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi, - correction, filters); - - load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); - - d0 = vrhadd_u8(d0, dd0); - d1 = vrhadd_u8(d1, dd1); - d2 = vrhadd_u8(d2, dd2); - d3 = vrhadd_u8(d3, dd3); - - store_u8_8x4(d, dst_stride, d0, d1, d2, d3); - - /* Prepare block for next iteration - re-using as much as possible. */ - /* Shuffle everything up four rows. */ - s0123_lo = s4567_lo; - s0123_hi = s4567_hi; - s1234_lo = s5678_lo; - s1234_hi = s5678_hi; - s2345_lo = s6789_lo; - s2345_hi = s6789_hi; - s3456_lo = s78910_lo; - s3456_hi = s78910_hi; - - s += 4 * src_stride; - d += 4 * dst_stride; - height -= 4; - } while (height != 0); - src += 8; - dst += 8; - w -= 8; - } while (w != 0); - } -} - -#endif // defined(__ARM_FEATURE_MATMUL_INT8) - -#else // !(VPX_ARCH_AARCH64 && - // (defined(__ARM_FEATURE_DOTPROD) || - // defined(__ARM_FEATURE_MATMUL_INT8))) - void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, @@ -2193,7 +770,3 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, } while (w != 0); } } - -#endif // #if VPX_ARCH_AARCH64 && - // (defined(__ARM_FEATURE_DOTPROD) || - // defined(__ARM_FEATURE_MATMUL_INT8)) diff --git a/vpx_dsp/arm/vpx_convolve8_neon.h b/vpx_dsp/arm/vpx_convolve8_neon.h index 2f78583af3..025e943cc4 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon.h +++ b/vpx_dsp/arm/vpx_convolve8_neon.h @@ -17,17 +17,15 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/vpx_filter.h" -#if VPX_ARCH_AARCH64 && \ - (defined(__ARM_FEATURE_DOTPROD) || defined(__ARM_FEATURE_MATMUL_INT8)) -void vpx_convolve8_2d_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *filter, int x0_q4, - int x_step_q4, int y0_q4, int y_step_q4, int w, - int h); -#endif - #if VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD) +void vpx_convolve8_2d_horiz_neon_dotprod(const uint8_t *src, + ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, + int y_step_q4, int w, int h); + static INLINE int16x4_t convolve8_4_sdot_partial(const int8x16_t samples_lo, const int8x16_t samples_hi, const int32x4_t correction, @@ -128,6 +126,12 @@ static INLINE uint8x8_t convolve8_8_sdot(uint8x16_t samples, #if VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_MATMUL_INT8) +void vpx_convolve8_2d_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h); + static INLINE int16x4_t convolve8_4_usdot_partial(const uint8x16_t samples_lo, const uint8x16_t samples_hi, const int8x8_t filters) { diff --git a/vpx_dsp/arm/vpx_convolve8_neon_dotprod.c b/vpx_dsp/arm/vpx_convolve8_neon_dotprod.c new file mode 100644 index 0000000000..bf01364cf7 --- /dev/null +++ b/vpx_dsp/arm/vpx_convolve8_neon_dotprod.c @@ -0,0 +1,777 @@ +/* + * Copyright (c) 2021 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/arm/vpx_convolve8_neon.h" +#include "vpx_dsp/vpx_filter.h" +#include "vpx_ports/mem.h" + +DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = { + 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, + 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, + 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 +}; + +DECLARE_ALIGNED(16, static const uint8_t, dot_prod_tran_concat_tbl[32]) = { + 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27, + 4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31 +}; + +DECLARE_ALIGNED(16, static const uint8_t, dot_prod_merge_block_tbl[48]) = { + /* Shift left and insert new last column in transposed 4x4 block. */ + 1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28, + /* Shift left and insert two new columns in transposed 4x4 block. */ + 2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29, + /* Shift left and insert three new columns in transposed 4x4 block. */ + 3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30 +}; + +void vpx_convolve8_2d_horiz_neon_dotprod(const uint8_t *src, + ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, + int y_step_q4, int w, int h) { + const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4])); + const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[x0_q4]), 128); + const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp)); + const uint8x16_t range_limit = vdupq_n_u8(128); + uint8x16_t s0, s1, s2, s3; + + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + assert(x_step_q4 == 16); + assert(h % 4 == 3); + + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + + src -= 3; + + if (w == 4) { + const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + int16x4_t d0, d1, d2, d3; + uint8x8_t d01, d23; + + do { + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); + + d0 = convolve8_4_sdot(s0, filters, correction, range_limit, perm_tbl); + d1 = convolve8_4_sdot(s1, filters, correction, range_limit, perm_tbl); + d2 = convolve8_4_sdot(s2, filters, correction, range_limit, perm_tbl); + d3 = convolve8_4_sdot(s3, filters, correction, range_limit, perm_tbl); + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 3); + + /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for + * further details on possible values of block height. */ + load_u8_16x3(src, src_stride, &s0, &s1, &s2); + + d0 = convolve8_4_sdot(s0, filters, correction, range_limit, perm_tbl); + d1 = convolve8_4_sdot(s1, filters, correction, range_limit, perm_tbl); + d2 = convolve8_4_sdot(s2, filters, correction, range_limit, perm_tbl); + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(d2, vdup_n_s16(0)), FILTER_BITS); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8_4x1(dst + 2 * dst_stride, d23); + } else { + const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + const uint8_t *s; + uint8_t *d; + int width; + uint8x8_t d0, d1, d2, d3; + + do { + width = w; + s = src; + d = dst; + do { + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + d0 = convolve8_8_sdot(s0, filters, correction, range_limit, perm_tbl); + d1 = convolve8_8_sdot(s1, filters, correction, range_limit, perm_tbl); + d2 = convolve8_8_sdot(s2, filters, correction, range_limit, perm_tbl); + d3 = convolve8_8_sdot(s3, filters, correction, range_limit, perm_tbl); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 3); + + /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for + * further details on possible values of block height. */ + width = w; + s = src; + d = dst; + do { + load_u8_16x3(s, src_stride, &s0, &s1, &s2); + + d0 = convolve8_8_sdot(s0, filters, correction, range_limit, perm_tbl); + d1 = convolve8_8_sdot(s1, filters, correction, range_limit, perm_tbl); + d2 = convolve8_8_sdot(s2, filters, correction, range_limit, perm_tbl); + + store_u8_8x3(d, dst_stride, d0, d1, d2); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + } +} + +void vpx_convolve8_horiz_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4])); + const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[x0_q4]), 128); + const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp)); + const uint8x16_t range_limit = vdupq_n_u8(128); + uint8x16_t s0, s1, s2, s3; + + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + assert(x_step_q4 == 16); + + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + + src -= 3; + + if (w == 4) { + const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + do { + int16x4_t t0, t1, t2, t3; + uint8x8_t d01, d23; + + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); + + t0 = convolve8_4_sdot(s0, filters, correction, range_limit, perm_tbl); + t1 = convolve8_4_sdot(s1, filters, correction, range_limit, perm_tbl); + t2 = convolve8_4_sdot(s2, filters, correction, range_limit, perm_tbl); + t3 = convolve8_4_sdot(s3, filters, correction, range_limit, perm_tbl); + d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + const uint8_t *s; + uint8_t *d; + int width; + uint8x8_t d0, d1, d2, d3; + + do { + width = w; + s = src; + d = dst; + do { + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + d0 = convolve8_8_sdot(s0, filters, correction, range_limit, perm_tbl); + d1 = convolve8_8_sdot(s1, filters, correction, range_limit, perm_tbl); + d2 = convolve8_8_sdot(s2, filters, correction, range_limit, perm_tbl); + d3 = convolve8_8_sdot(s3, filters, correction, range_limit, perm_tbl); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } +} + +void vpx_convolve8_avg_horiz_neon_dotprod(const uint8_t *src, + ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, + int y_step_q4, int w, int h) { + const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4])); + const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[x0_q4]), 128); + const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp)); + const uint8x16_t range_limit = vdupq_n_u8(128); + uint8x16_t s0, s1, s2, s3; + + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + assert(x_step_q4 == 16); + + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + + src -= 3; + + if (w == 4) { + const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + do { + int16x4_t t0, t1, t2, t3; + uint8x8_t d01, d23, dd01, dd23; + + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); + + t0 = convolve8_4_sdot(s0, filters, correction, range_limit, perm_tbl); + t1 = convolve8_4_sdot(s1, filters, correction, range_limit, perm_tbl); + t2 = convolve8_4_sdot(s2, filters, correction, range_limit, perm_tbl); + t3 = convolve8_4_sdot(s3, filters, correction, range_limit, perm_tbl); + d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS); + + dd01 = load_u8(dst + 0 * dst_stride, dst_stride); + dd23 = load_u8(dst + 2 * dst_stride, dst_stride); + + d01 = vrhadd_u8(d01, dd01); + d23 = vrhadd_u8(d23, dd23); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + const uint8_t *s; + uint8_t *d; + int width; + uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3; + + do { + width = w; + s = src; + d = dst; + do { + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + d0 = convolve8_8_sdot(s0, filters, correction, range_limit, perm_tbl); + d1 = convolve8_8_sdot(s1, filters, correction, range_limit, perm_tbl); + d2 = convolve8_8_sdot(s2, filters, correction, range_limit, perm_tbl); + d3 = convolve8_8_sdot(s3, filters, correction, range_limit, perm_tbl); + + load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); + + d0 = vrhadd_u8(d0, dd0); + d1 = vrhadd_u8(d1, dd1); + d2 = vrhadd_u8(d2, dd2); + d3 = vrhadd_u8(d3, dd3); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } +} + +static INLINE void transpose_concat_4x4(int8x8_t a0, int8x8_t a1, int8x8_t a2, + int8x8_t a3, int8x16_t *b, + const uint8x16_t permute_tbl) { + /* Transpose 8-bit elements and concatenate result rows as follows: + * a0: 00, 01, 02, 03, XX, XX, XX, XX + * a1: 10, 11, 12, 13, XX, XX, XX, XX + * a2: 20, 21, 22, 23, XX, XX, XX, XX + * a3: 30, 31, 32, 33, XX, XX, XX, XX + * + * b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 + * + * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it + * as an argument is preferable to loading it directly from memory as this + * inline helper is called many times from the same parent function. + */ + + int8x16x2_t samples = { { vcombine_s8(a0, a1), vcombine_s8(a2, a3) } }; + *b = vqtbl2q_s8(samples, permute_tbl); +} + +static INLINE void transpose_concat_8x4(int8x8_t a0, int8x8_t a1, int8x8_t a2, + int8x8_t a3, int8x16_t *b0, + int8x16_t *b1, + const uint8x16x2_t permute_tbl) { + /* Transpose 8-bit elements and concatenate result rows as follows: + * a0: 00, 01, 02, 03, 04, 05, 06, 07 + * a1: 10, 11, 12, 13, 14, 15, 16, 17 + * a2: 20, 21, 22, 23, 24, 25, 26, 27 + * a3: 30, 31, 32, 33, 34, 35, 36, 37 + * + * b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 + * b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37 + * + * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it + * as an argument is preferable to loading it directly from memory as this + * inline helper is called many times from the same parent function. + */ + + int8x16x2_t samples = { { vcombine_s8(a0, a1), vcombine_s8(a2, a3) } }; + *b0 = vqtbl2q_s8(samples, permute_tbl.val[0]); + *b1 = vqtbl2q_s8(samples, permute_tbl.val[1]); +} + +void vpx_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4])); + const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[y0_q4]), 128); + const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp)); + const uint8x8_t range_limit = vdup_n_u8(128); + const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); + uint8x8_t t0, t1, t2, t3, t4, t5, t6; + int8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + int8x16x2_t samples_LUT; + + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + assert(y_step_q4 == 16); + + (void)x0_q4; + (void)x_step_q4; + (void)y_step_q4; + + src -= 3 * src_stride; + + if (w == 4) { + const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); + int8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910; + int16x4_t d0, d1, d2, d3; + uint8x8_t d01, d23; + + load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); + src += 7 * src_stride; + + /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ + s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); + s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit)); + s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit)); + s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit)); + s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); + s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); + s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); + s7 = vdup_n_s8(0); + s8 = vdup_n_s8(0); + s9 = vdup_n_s8(0); + + /* This operation combines a conventional transpose and the sample permute + * (see horizontal case) required before computing the dot product. + */ + transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl); + transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl); + transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl); + transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl); + transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl); + transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl); + transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl); + + do { + uint8x8_t t7, t8, t9, t10; + + load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10); + + s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit)); + s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit)); + s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); + s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); + + transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl); + + /* Merge new data into block from previous iteration. */ + samples_LUT.val[0] = s3456; + samples_LUT.val[1] = s78910; + s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); + s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); + s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); + + d0 = convolve8_4_sdot_partial(s0123, s4567, correction, filters); + d1 = convolve8_4_sdot_partial(s1234, s5678, correction, filters); + d2 = convolve8_4_sdot_partial(s2345, s6789, correction, filters); + d3 = convolve8_4_sdot_partial(s3456, s78910, correction, filters); + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + /* Prepare block for next iteration - re-using as much as possible. */ + /* Shuffle everything up four rows. */ + s0123 = s4567; + s1234 = s5678; + s2345 = s6789; + s3456 = s78910; + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl); + int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, + s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, + s6789_hi, s78910_lo, s78910_hi; + uint8x8_t d0, d1, d2, d3; + const uint8_t *s; + uint8_t *d; + int height; + + do { + height = h; + s = src; + d = dst; + + load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); + s += 7 * src_stride; + + /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ + s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); + s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit)); + s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit)); + s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit)); + s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); + s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); + s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); + s7 = vdup_n_s8(0); + s8 = vdup_n_s8(0); + s9 = vdup_n_s8(0); + + /* This operation combines a conventional transpose and the sample permute + * (see horizontal case) required before computing the dot product. + */ + transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi, + tran_concat_tbl); + transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi, + tran_concat_tbl); + transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi, + tran_concat_tbl); + transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi, + tran_concat_tbl); + transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi, + tran_concat_tbl); + transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi, + tran_concat_tbl); + transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi, + tran_concat_tbl); + + do { + uint8x8_t t7, t8, t9, t10; + + load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10); + + s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit)); + s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit)); + s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); + s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); + + transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi, + tran_concat_tbl); + + /* Merge new data into block from previous iteration. */ + samples_LUT.val[0] = s3456_lo; + samples_LUT.val[1] = s78910_lo; + s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); + s5678_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); + s6789_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); + + samples_LUT.val[0] = s3456_hi; + samples_LUT.val[1] = s78910_hi; + s4567_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); + s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); + s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); + + d0 = convolve8_8_sdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi, + correction, filters); + d1 = convolve8_8_sdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi, + correction, filters); + d2 = convolve8_8_sdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi, + correction, filters); + d3 = convolve8_8_sdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi, + correction, filters); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + /* Prepare block for next iteration - re-using as much as possible. */ + /* Shuffle everything up four rows. */ + s0123_lo = s4567_lo; + s0123_hi = s4567_hi; + s1234_lo = s5678_lo; + s1234_hi = s5678_hi; + s2345_lo = s6789_lo; + s2345_hi = s6789_hi; + s3456_lo = s78910_lo; + s3456_hi = s78910_hi; + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src += 8; + dst += 8; + w -= 8; + } while (w != 0); + } +} + +void vpx_convolve8_avg_vert_neon_dotprod(const uint8_t *src, + ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, + int y_step_q4, int w, int h) { + const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4])); + const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[y0_q4]), 128); + const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp)); + const uint8x8_t range_limit = vdup_n_u8(128); + const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); + uint8x8_t t0, t1, t2, t3, t4, t5, t6; + int8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + int8x16x2_t samples_LUT; + + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + assert(y_step_q4 == 16); + + (void)x0_q4; + (void)x_step_q4; + (void)y_step_q4; + + src -= 3 * src_stride; + + if (w == 4) { + const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); + int8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910; + int16x4_t d0, d1, d2, d3; + uint8x8_t d01, d23, dd01, dd23; + + load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); + src += 7 * src_stride; + + /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ + s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); + s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit)); + s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit)); + s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit)); + s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); + s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); + s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); + s7 = vdup_n_s8(0); + s8 = vdup_n_s8(0); + s9 = vdup_n_s8(0); + + /* This operation combines a conventional transpose and the sample permute + * (see horizontal case) required before computing the dot product. + */ + transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl); + transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl); + transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl); + transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl); + transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl); + transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl); + transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl); + + do { + uint8x8_t t7, t8, t9, t10; + + load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10); + + s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit)); + s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit)); + s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); + s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); + + transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl); + + /* Merge new data into block from previous iteration. */ + samples_LUT.val[0] = s3456; + samples_LUT.val[1] = s78910; + s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); + s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); + s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); + + d0 = convolve8_4_sdot_partial(s0123, s4567, correction, filters); + d1 = convolve8_4_sdot_partial(s1234, s5678, correction, filters); + d2 = convolve8_4_sdot_partial(s2345, s6789, correction, filters); + d3 = convolve8_4_sdot_partial(s3456, s78910, correction, filters); + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + + dd01 = load_u8(dst + 0 * dst_stride, dst_stride); + dd23 = load_u8(dst + 2 * dst_stride, dst_stride); + + d01 = vrhadd_u8(d01, dd01); + d23 = vrhadd_u8(d23, dd23); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + /* Prepare block for next iteration - re-using as much as possible. */ + /* Shuffle everything up four rows. */ + s0123 = s4567; + s1234 = s5678; + s2345 = s6789; + s3456 = s78910; + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl); + int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, + s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, + s6789_hi, s78910_lo, s78910_hi; + uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3; + const uint8_t *s; + uint8_t *d; + int height; + + do { + height = h; + s = src; + d = dst; + + load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); + s += 7 * src_stride; + + /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ + s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); + s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit)); + s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit)); + s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit)); + s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); + s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); + s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); + s7 = vdup_n_s8(0); + s8 = vdup_n_s8(0); + s9 = vdup_n_s8(0); + + /* This operation combines a conventional transpose and the sample permute + * (see horizontal case) required before computing the dot product. + */ + transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi, + tran_concat_tbl); + transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi, + tran_concat_tbl); + transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi, + tran_concat_tbl); + transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi, + tran_concat_tbl); + transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi, + tran_concat_tbl); + transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi, + tran_concat_tbl); + transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi, + tran_concat_tbl); + + do { + uint8x8_t t7, t8, t9, t10; + + load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10); + + s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit)); + s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit)); + s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); + s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); + + transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi, + tran_concat_tbl); + + /* Merge new data into block from previous iteration. */ + samples_LUT.val[0] = s3456_lo; + samples_LUT.val[1] = s78910_lo; + s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); + s5678_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); + s6789_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); + + samples_LUT.val[0] = s3456_hi; + samples_LUT.val[1] = s78910_hi; + s4567_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); + s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); + s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); + + d0 = convolve8_8_sdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi, + correction, filters); + d1 = convolve8_8_sdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi, + correction, filters); + d2 = convolve8_8_sdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi, + correction, filters); + d3 = convolve8_8_sdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi, + correction, filters); + + load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); + + d0 = vrhadd_u8(d0, dd0); + d1 = vrhadd_u8(d1, dd1); + d2 = vrhadd_u8(d2, dd2); + d3 = vrhadd_u8(d3, dd3); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + /* Prepare block for next iteration - re-using as much as possible. */ + /* Shuffle everything up four rows. */ + s0123_lo = s4567_lo; + s0123_hi = s4567_hi; + s1234_lo = s5678_lo; + s1234_hi = s5678_hi; + s2345_lo = s6789_lo; + s2345_hi = s6789_hi; + s3456_lo = s78910_lo; + s3456_hi = s78910_hi; + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src += 8; + dst += 8; + w -= 8; + } while (w != 0); + } +} diff --git a/vpx_dsp/arm/vpx_convolve8_neon_i8mm.c b/vpx_dsp/arm/vpx_convolve8_neon_i8mm.c new file mode 100644 index 0000000000..e0e482e3f5 --- /dev/null +++ b/vpx_dsp/arm/vpx_convolve8_neon_i8mm.c @@ -0,0 +1,698 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/arm/vpx_convolve8_neon.h" +#include "vpx_dsp/vpx_filter.h" +#include "vpx_ports/mem.h" + +DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = { + 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, + 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, + 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 +}; + +DECLARE_ALIGNED(16, static const uint8_t, dot_prod_tran_concat_tbl[32]) = { + 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27, + 4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31 +}; + +DECLARE_ALIGNED(16, static const uint8_t, dot_prod_merge_block_tbl[48]) = { + /* Shift left and insert new last column in transposed 4x4 block. */ + 1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28, + /* Shift left and insert two new columns in transposed 4x4 block. */ + 2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29, + /* Shift left and insert three new columns in transposed 4x4 block. */ + 3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30 +}; + +void vpx_convolve8_2d_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4])); + uint8x16_t s0, s1, s2, s3; + + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + assert(x_step_q4 == 16); + assert(h % 4 == 3); + + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + + src -= 3; + + if (w == 4) { + const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + int16x4_t d0, d1, d2, d3; + uint8x8_t d01, d23; + + do { + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); + + d0 = convolve8_4_usdot(s0, filters, perm_tbl); + d1 = convolve8_4_usdot(s1, filters, perm_tbl); + d2 = convolve8_4_usdot(s2, filters, perm_tbl); + d3 = convolve8_4_usdot(s3, filters, perm_tbl); + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 3); + + /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for + * further details on possible values of block height. */ + load_u8_16x3(src, src_stride, &s0, &s1, &s2); + + d0 = convolve8_4_usdot(s0, filters, perm_tbl); + d1 = convolve8_4_usdot(s1, filters, perm_tbl); + d2 = convolve8_4_usdot(s2, filters, perm_tbl); + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(d2, vdup_n_s16(0)), FILTER_BITS); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8_4x1(dst + 2 * dst_stride, d23); + } else { + const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + const uint8_t *s; + uint8_t *d; + int width; + uint8x8_t d0, d1, d2, d3; + + do { + width = w; + s = src; + d = dst; + do { + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + d0 = convolve8_8_usdot(s0, filters, perm_tbl); + d1 = convolve8_8_usdot(s1, filters, perm_tbl); + d2 = convolve8_8_usdot(s2, filters, perm_tbl); + d3 = convolve8_8_usdot(s3, filters, perm_tbl); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width > 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 3); + + /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for + * further details on possible values of block height. */ + width = w; + s = src; + d = dst; + do { + load_u8_16x3(s, src_stride, &s0, &s1, &s2); + + d0 = convolve8_8_usdot(s0, filters, perm_tbl); + d1 = convolve8_8_usdot(s1, filters, perm_tbl); + d2 = convolve8_8_usdot(s2, filters, perm_tbl); + + store_u8_8x3(d, dst_stride, d0, d1, d2); + + s += 8; + d += 8; + width -= 8; + } while (width > 0); + } +} + +void vpx_convolve8_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4])); + uint8x16_t s0, s1, s2, s3; + + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + assert(x_step_q4 == 16); + + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + + src -= 3; + + if (w == 4) { + const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + do { + int16x4_t t0, t1, t2, t3; + uint8x8_t d01, d23; + + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); + + t0 = convolve8_4_usdot(s0, filters, perm_tbl); + t1 = convolve8_4_usdot(s1, filters, perm_tbl); + t2 = convolve8_4_usdot(s2, filters, perm_tbl); + t3 = convolve8_4_usdot(s3, filters, perm_tbl); + d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + const uint8_t *s; + uint8_t *d; + int width; + uint8x8_t d0, d1, d2, d3; + + do { + width = w; + s = src; + d = dst; + do { + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + d0 = convolve8_8_usdot(s0, filters, perm_tbl); + d1 = convolve8_8_usdot(s1, filters, perm_tbl); + d2 = convolve8_8_usdot(s2, filters, perm_tbl); + d3 = convolve8_8_usdot(s3, filters, perm_tbl); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } +} + +void vpx_convolve8_avg_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4])); + uint8x16_t s0, s1, s2, s3; + + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + assert(x_step_q4 == 16); + + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + + src -= 3; + + if (w == 4) { + const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + do { + int16x4_t t0, t1, t2, t3; + uint8x8_t d01, d23, dd01, dd23; + + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); + + t0 = convolve8_4_usdot(s0, filters, perm_tbl); + t1 = convolve8_4_usdot(s1, filters, perm_tbl); + t2 = convolve8_4_usdot(s2, filters, perm_tbl); + t3 = convolve8_4_usdot(s3, filters, perm_tbl); + d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS); + + dd01 = load_u8(dst + 0 * dst_stride, dst_stride); + dd23 = load_u8(dst + 2 * dst_stride, dst_stride); + + d01 = vrhadd_u8(d01, dd01); + d23 = vrhadd_u8(d23, dd23); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + const uint8_t *s; + uint8_t *d; + int width; + uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3; + + do { + width = w; + s = src; + d = dst; + do { + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + d0 = convolve8_8_usdot(s0, filters, perm_tbl); + d1 = convolve8_8_usdot(s1, filters, perm_tbl); + d2 = convolve8_8_usdot(s2, filters, perm_tbl); + d3 = convolve8_8_usdot(s3, filters, perm_tbl); + + load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); + + d0 = vrhadd_u8(d0, dd0); + d1 = vrhadd_u8(d1, dd1); + d2 = vrhadd_u8(d2, dd2); + d3 = vrhadd_u8(d3, dd3); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } +} + +static INLINE void transpose_concat_4x4(uint8x8_t a0, uint8x8_t a1, + uint8x8_t a2, uint8x8_t a3, + uint8x16_t *b, + const uint8x16_t permute_tbl) { + /* Transpose 8-bit elements and concatenate result rows as follows: + * a0: 00, 01, 02, 03, XX, XX, XX, XX + * a1: 10, 11, 12, 13, XX, XX, XX, XX + * a2: 20, 21, 22, 23, XX, XX, XX, XX + * a3: 30, 31, 32, 33, XX, XX, XX, XX + * + * b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 + * + * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it + * as an argument is preferable to loading it directly from memory as this + * inline helper is called many times from the same parent function. + */ + + uint8x16x2_t samples = { { vcombine_u8(a0, a1), vcombine_u8(a2, a3) } }; + *b = vqtbl2q_u8(samples, permute_tbl); +} + +static INLINE void transpose_concat_8x4(uint8x8_t a0, uint8x8_t a1, + uint8x8_t a2, uint8x8_t a3, + uint8x16_t *b0, uint8x16_t *b1, + const uint8x16x2_t permute_tbl) { + /* Transpose 8-bit elements and concatenate result rows as follows: + * a0: 00, 01, 02, 03, 04, 05, 06, 07 + * a1: 10, 11, 12, 13, 14, 15, 16, 17 + * a2: 20, 21, 22, 23, 24, 25, 26, 27 + * a3: 30, 31, 32, 33, 34, 35, 36, 37 + * + * b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 + * b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37 + * + * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it + * as an argument is preferable to loading it directly from memory as this + * inline helper is called many times from the same parent function. + */ + + uint8x16x2_t samples = { { vcombine_u8(a0, a1), vcombine_u8(a2, a3) } }; + *b0 = vqtbl2q_u8(samples, permute_tbl.val[0]); + *b1 = vqtbl2q_u8(samples, permute_tbl.val[1]); +} + +void vpx_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4])); + const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); + uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + uint8x16x2_t samples_LUT; + + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + assert(y_step_q4 == 16); + + (void)x0_q4; + (void)x_step_q4; + (void)y_step_q4; + + src -= 3 * src_stride; + + if (w == 4) { + const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); + uint8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910; + int16x4_t d0, d1, d2, d3; + uint8x8_t d01, d23; + + load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + src += 7 * src_stride; + + s7 = vdup_n_u8(0); + s8 = vdup_n_u8(0); + s9 = vdup_n_u8(0); + + /* This operation combines a conventional transpose and the sample permute + * (see horizontal case) required before computing the dot product. + */ + transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl); + transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl); + transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl); + transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl); + transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl); + transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl); + transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl); + + do { + load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10); + + transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl); + + /* Merge new data into block from previous iteration. */ + samples_LUT.val[0] = s3456; + samples_LUT.val[1] = s78910; + s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); + s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); + s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + + d0 = convolve8_4_usdot_partial(s0123, s4567, filters); + d1 = convolve8_4_usdot_partial(s1234, s5678, filters); + d2 = convolve8_4_usdot_partial(s2345, s6789, filters); + d3 = convolve8_4_usdot_partial(s3456, s78910, filters); + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + /* Prepare block for next iteration - re-using as much as possible. */ + /* Shuffle everything up four rows. */ + s0123 = s4567; + s1234 = s5678; + s2345 = s6789; + s3456 = s78910; + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl); + uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, + s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, + s6789_hi, s78910_lo, s78910_hi; + uint8x8_t d0, d1, d2, d3; + const uint8_t *s; + uint8_t *d; + int height; + + do { + height = h; + s = src; + d = dst; + + load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; + + s7 = vdup_n_u8(0); + s8 = vdup_n_u8(0); + s9 = vdup_n_u8(0); + + /* This operation combines a conventional transpose and the sample permute + * (see horizontal case) required before computing the dot product. + */ + transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi, + tran_concat_tbl); + transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi, + tran_concat_tbl); + transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi, + tran_concat_tbl); + transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi, + tran_concat_tbl); + transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi, + tran_concat_tbl); + transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi, + tran_concat_tbl); + transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi, + tran_concat_tbl); + + do { + load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10); + + transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi, + tran_concat_tbl); + + /* Merge new data into block from previous iteration. */ + samples_LUT.val[0] = s3456_lo; + samples_LUT.val[1] = s78910_lo; + s4567_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); + s5678_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); + s6789_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + + samples_LUT.val[0] = s3456_hi; + samples_LUT.val[1] = s78910_hi; + s4567_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); + s5678_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); + s6789_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + + d0 = convolve8_8_usdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi, + filters); + d1 = convolve8_8_usdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi, + filters); + d2 = convolve8_8_usdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi, + filters); + d3 = convolve8_8_usdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi, + filters); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + /* Prepare block for next iteration - re-using as much as possible. */ + /* Shuffle everything up four rows. */ + s0123_lo = s4567_lo; + s0123_hi = s4567_hi; + s1234_lo = s5678_lo; + s1234_hi = s5678_hi; + s2345_lo = s6789_lo; + s2345_hi = s6789_hi; + s3456_lo = s78910_lo; + s3456_hi = s78910_hi; + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src += 8; + dst += 8; + w -= 8; + } while (w != 0); + } +} + +void vpx_convolve8_avg_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4])); + const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); + uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + uint8x16x2_t samples_LUT; + + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + assert(y_step_q4 == 16); + + (void)x0_q4; + (void)x_step_q4; + (void)y_step_q4; + + src -= 3 * src_stride; + + if (w == 4) { + const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); + uint8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910; + int16x4_t d0, d1, d2, d3; + uint8x8_t d01, d23, dd01, dd23; + + load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + src += 7 * src_stride; + + s7 = vdup_n_u8(0); + s8 = vdup_n_u8(0); + s9 = vdup_n_u8(0); + + /* This operation combines a conventional transpose and the sample permute + * (see horizontal case) required before computing the dot product. + */ + transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl); + transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl); + transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl); + transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl); + transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl); + transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl); + transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl); + + do { + load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10); + + transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl); + + /* Merge new data into block from previous iteration. */ + samples_LUT.val[0] = s3456; + samples_LUT.val[1] = s78910; + s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); + s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); + s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + + d0 = convolve8_4_usdot_partial(s0123, s4567, filters); + d1 = convolve8_4_usdot_partial(s1234, s5678, filters); + d2 = convolve8_4_usdot_partial(s2345, s6789, filters); + d3 = convolve8_4_usdot_partial(s3456, s78910, filters); + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + + dd01 = load_u8(dst + 0 * dst_stride, dst_stride); + dd23 = load_u8(dst + 2 * dst_stride, dst_stride); + + d01 = vrhadd_u8(d01, dd01); + d23 = vrhadd_u8(d23, dd23); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + /* Prepare block for next iteration - re-using as much as possible. */ + /* Shuffle everything up four rows. */ + s0123 = s4567; + s1234 = s5678; + s2345 = s6789; + s3456 = s78910; + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl); + uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, + s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, + s6789_hi, s78910_lo, s78910_hi; + uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3; + const uint8_t *s; + uint8_t *d; + int height; + + do { + height = h; + s = src; + d = dst; + + load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; + + s7 = vdup_n_u8(0); + s8 = vdup_n_u8(0); + s9 = vdup_n_u8(0); + + /* This operation combines a conventional transpose and the sample permute + * (see horizontal case) required before computing the dot product. + */ + transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi, + tran_concat_tbl); + transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi, + tran_concat_tbl); + transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi, + tran_concat_tbl); + transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi, + tran_concat_tbl); + transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi, + tran_concat_tbl); + transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi, + tran_concat_tbl); + transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi, + tran_concat_tbl); + + do { + load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10); + + transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi, + tran_concat_tbl); + + /* Merge new data into block from previous iteration. */ + samples_LUT.val[0] = s3456_lo; + samples_LUT.val[1] = s78910_lo; + s4567_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); + s5678_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); + s6789_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + + samples_LUT.val[0] = s3456_hi; + samples_LUT.val[1] = s78910_hi; + s4567_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); + s5678_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); + s6789_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + + d0 = convolve8_8_usdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi, + filters); + d1 = convolve8_8_usdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi, + filters); + d2 = convolve8_8_usdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi, + filters); + d3 = convolve8_8_usdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi, + filters); + + load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); + + d0 = vrhadd_u8(d0, dd0); + d1 = vrhadd_u8(d1, dd1); + d2 = vrhadd_u8(d2, dd2); + d3 = vrhadd_u8(d3, dd3); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + /* Prepare block for next iteration - re-using as much as possible. */ + /* Shuffle everything up four rows. */ + s0123_lo = s4567_lo; + s0123_hi = s4567_hi; + s1234_lo = s5678_lo; + s1234_hi = s5678_hi; + s2345_lo = s6789_lo; + s2345_hi = s6789_hi; + s3456_lo = s78910_lo; + s3456_hi = s78910_hi; + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src += 8; + dst += 8; + w -= 8; + } while (w != 0); + } +} diff --git a/vpx_dsp/arm/vpx_convolve_neon.c b/vpx_dsp/arm/vpx_convolve_neon.c index f7db3e6a9c..830f3176d7 100644 --- a/vpx_dsp/arm/vpx_convolve_neon.c +++ b/vpx_dsp/arm/vpx_convolve_neon.c @@ -14,57 +14,6 @@ #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_ports/mem.h" -#if VPX_ARCH_AARCH64 && \ - (defined(__ARM_FEATURE_DOTPROD) || defined(__ARM_FEATURE_MATMUL_INT8)) -#include "vpx_dsp/arm/vpx_convolve8_neon.h" - -void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const InterpKernel *filter, - int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, - int w, int h) { - /* Given our constraints: w <= 64, h <= 64, taps == 8 we can reduce the - * maximum buffer size to 64 * (64 + 7). */ - uint8_t temp[64 * 71]; - - /* Account for the vertical phase needing 3 lines prior and 4 lines post. */ - const int intermediate_height = h + 7; - - assert(y_step_q4 == 16); - assert(x_step_q4 == 16); - - /* Filter starting 3 lines back. */ - vpx_convolve8_2d_horiz_neon(src - src_stride * 3, src_stride, temp, w, filter, - x0_q4, x_step_q4, y0_q4, y_step_q4, w, - intermediate_height); - - /* Step into the temp buffer 3 lines to get the actual frame data */ - vpx_convolve8_vert_neon(temp + w * 3, w, dst, dst_stride, filter, x0_q4, - x_step_q4, y0_q4, y_step_q4, w, h); -} - -void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *filter, int x0_q4, - int x_step_q4, int y0_q4, int y_step_q4, int w, - int h) { - uint8_t temp[64 * 71]; - const int intermediate_height = h + 7; - - assert(y_step_q4 == 16); - assert(x_step_q4 == 16); - - vpx_convolve8_2d_horiz_neon(src - src_stride * 3, src_stride, temp, w, filter, - x0_q4, x_step_q4, y0_q4, y_step_q4, w, - intermediate_height); - - vpx_convolve8_avg_vert_neon(temp + w * 3, w, dst, dst_stride, filter, x0_q4, - x_step_q4, y0_q4, y_step_q4, w, h); -} - -#else // !(VPX_ARCH_AARCH64 && - // (defined(__ARM_FEATURE_DOTPROD) || - // defined(__ARM_FEATURE_MATMUL_INT8))) - void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, @@ -114,7 +63,3 @@ void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride, vpx_convolve8_avg_vert_neon(temp + w * 3, w, dst, dst_stride, filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); } - -#endif // #if VPX_ARCH_AARCH64 && - // (defined(__ARM_FEATURE_DOTPROD) || - // defined(__ARM_FEATURE_MATMUL_INT8)) diff --git a/vpx_dsp/arm/vpx_convolve_neon_dotprod.c b/vpx_dsp/arm/vpx_convolve_neon_dotprod.c new file mode 100644 index 0000000000..400e26b30a --- /dev/null +++ b/vpx_dsp/arm/vpx_convolve_neon_dotprod.c @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/vpx_convolve8_neon.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_ports/mem.h" + +void vpx_convolve8_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + /* Given our constraints: w <= 64, h <= 64, taps == 8 we can reduce the + * maximum buffer size to 64 * (64 + 7). */ + uint8_t temp[64 * 71]; + + /* Account for the vertical phase needing 3 lines prior and 4 lines post. */ + const int intermediate_height = h + 7; + + assert(y_step_q4 == 16); + assert(x_step_q4 == 16); + + /* Filter starting 3 lines back. */ + vpx_convolve8_2d_horiz_neon_dotprod(src - src_stride * 3, src_stride, temp, w, + filter, x0_q4, x_step_q4, y0_q4, + y_step_q4, w, intermediate_height); + + /* Step into the temp buffer 3 lines to get the actual frame data. */ + vpx_convolve8_vert_neon_dotprod(temp + w * 3, w, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); +} + +void vpx_convolve8_avg_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + uint8_t temp[64 * 71]; + const int intermediate_height = h + 7; + + assert(y_step_q4 == 16); + assert(x_step_q4 == 16); + + vpx_convolve8_2d_horiz_neon_dotprod(src - src_stride * 3, src_stride, temp, w, + filter, x0_q4, x_step_q4, y0_q4, + y_step_q4, w, intermediate_height); + + vpx_convolve8_avg_vert_neon_dotprod(temp + w * 3, w, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); +} diff --git a/vpx_dsp/arm/vpx_convolve_neon_i8mm.c b/vpx_dsp/arm/vpx_convolve_neon_i8mm.c new file mode 100644 index 0000000000..4d94bb79b7 --- /dev/null +++ b/vpx_dsp/arm/vpx_convolve_neon_i8mm.c @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/vpx_convolve8_neon.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_ports/mem.h" + +void vpx_convolve8_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + /* Given our constraints: w <= 64, h <= 64, taps == 8 we can reduce the + * maximum buffer size to 64 * (64 + 7). */ + uint8_t temp[64 * 71]; + + /* Account for the vertical phase needing 3 lines prior and 4 lines post. */ + const int intermediate_height = h + 7; + + assert(y_step_q4 == 16); + assert(x_step_q4 == 16); + + /* Filter starting 3 lines back. */ + vpx_convolve8_2d_horiz_neon_i8mm(src - src_stride * 3, src_stride, temp, w, + filter, x0_q4, x_step_q4, y0_q4, y_step_q4, + w, intermediate_height); + + /* Step into the temp buffer 3 lines to get the actual frame data. */ + vpx_convolve8_vert_neon_i8mm(temp + w * 3, w, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); +} + +void vpx_convolve8_avg_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + uint8_t temp[64 * 71]; + const int intermediate_height = h + 7; + + assert(y_step_q4 == 16); + assert(x_step_q4 == 16); + + vpx_convolve8_2d_horiz_neon_i8mm(src - src_stride * 3, src_stride, temp, w, + filter, x0_q4, x_step_q4, y0_q4, y_step_q4, + w, intermediate_height); + + vpx_convolve8_avg_vert_neon_i8mm(temp + w * 3, w, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); +} diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index 04969f37e1..8d2422b1de 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -133,6 +133,10 @@ DSP_SRCS-yes += arm/vpx_convolve_copy_neon.c DSP_SRCS-yes += arm/vpx_convolve8_neon.c DSP_SRCS-yes += arm/vpx_convolve_avg_neon.c DSP_SRCS-yes += arm/vpx_convolve_neon.c +DSP_SRCS-$(HAVE_NEON_DOTPROD) += arm/vpx_convolve8_neon_dotprod.c +DSP_SRCS-$(HAVE_NEON_DOTPROD) += arm/vpx_convolve_neon_dotprod.c +DSP_SRCS-$(HAVE_NEON_I8MM) += arm/vpx_convolve8_neon_i8mm.c +DSP_SRCS-$(HAVE_NEON_I8MM) += arm/vpx_convolve_neon_i8mm.c endif # HAVE_NEON endif # HAVE_NEON_ASM diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 8033b4a81a..0cd21c7997 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -382,22 +382,22 @@ () specialize qw/vpx_convolve_avg neon dspr2 msa sse2 vsx mmi lsx/; add_proto qw/void vpx_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; -specialize qw/vpx_convolve8 sse2 ssse3 avx2 neon dspr2 msa vsx mmi lsx/; +specialize qw/vpx_convolve8 sse2 ssse3 avx2 neon neon_dotprod neon_i8mm dspr2 msa vsx mmi lsx/; add_proto qw/void vpx_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; -specialize qw/vpx_convolve8_horiz sse2 ssse3 avx2 neon dspr2 msa vsx mmi lsx/; +specialize qw/vpx_convolve8_horiz sse2 ssse3 avx2 neon neon_dotprod neon_i8mm dspr2 msa vsx mmi lsx/; add_proto qw/void vpx_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; -specialize qw/vpx_convolve8_vert sse2 ssse3 avx2 neon dspr2 msa vsx mmi lsx/; +specialize qw/vpx_convolve8_vert sse2 ssse3 avx2 neon neon_dotprod neon_i8mm dspr2 msa vsx mmi lsx/; add_proto qw/void vpx_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; -specialize qw/vpx_convolve8_avg sse2 ssse3 avx2 neon dspr2 msa vsx mmi lsx/; +specialize qw/vpx_convolve8_avg sse2 ssse3 avx2 neon neon_dotprod neon_i8mm dspr2 msa vsx mmi lsx/; add_proto qw/void vpx_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; -specialize qw/vpx_convolve8_avg_horiz sse2 ssse3 avx2 neon dspr2 msa vsx mmi lsx/; +specialize qw/vpx_convolve8_avg_horiz sse2 ssse3 avx2 neon neon_dotprod neon_i8mm dspr2 msa vsx mmi lsx/; add_proto qw/void vpx_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; -specialize qw/vpx_convolve8_avg_vert sse2 ssse3 avx2 neon dspr2 msa vsx mmi lsx/; +specialize qw/vpx_convolve8_avg_vert sse2 ssse3 avx2 neon neon_dotprod neon_i8mm dspr2 msa vsx mmi lsx/; add_proto qw/void vpx_scaled_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; specialize qw/vpx_scaled_2d ssse3 neon msa/; From 02dc617f8cbd7a39ec1125b950a1d6f8ceba70f1 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Sat, 19 Aug 2023 20:21:24 +0100 Subject: [PATCH 0879/1000] Use run-time CPU feature detection for Neon DotProd SAD Arm Neon DotProd implementations of vpx_sad* currently need to be enabled at compile time since they're guarded by ifdef feature macros. Now that run-time feature detection has been enabled for Arm platforms, expose these implementations with distinct *neon_dotprod names in separate files and wire them up to the build system and rtcd.pl. Also add new test cases for the new DotProd functions. Change-Id: Ic6906c28240276ba89787eadbc9393a232374f95 --- test/sad_test.cc | 45 ++++++ vpx_dsp/arm/sad_neon.c | 183 +----------------------- vpx_dsp/arm/sad_neon_dotprod.c | 247 +++++++++++++++++++++++++++++++++ vpx_dsp/vpx_dsp.mk | 1 + vpx_dsp/vpx_dsp_rtcd_defs.pl | 48 +++---- 5 files changed, 319 insertions(+), 205 deletions(-) create mode 100644 vpx_dsp/arm/sad_neon_dotprod.c diff --git a/test/sad_test.cc b/test/sad_test.cc index 83c4fe0c36..3f9c020ee8 100644 --- a/test/sad_test.cc +++ b/test/sad_test.cc @@ -1129,6 +1129,21 @@ const SadMxNParam neon_tests[] = { }; INSTANTIATE_TEST_SUITE_P(NEON, SADTest, ::testing::ValuesIn(neon_tests)); +#if HAVE_NEON_DOTPROD +const SadMxNParam neon_dotprod_tests[] = { + SadMxNParam(64, 64, &vpx_sad64x64_neon_dotprod), + SadMxNParam(64, 32, &vpx_sad64x32_neon_dotprod), + SadMxNParam(32, 64, &vpx_sad32x64_neon_dotprod), + SadMxNParam(32, 32, &vpx_sad32x32_neon_dotprod), + SadMxNParam(32, 16, &vpx_sad32x16_neon_dotprod), + SadMxNParam(16, 32, &vpx_sad16x32_neon_dotprod), + SadMxNParam(16, 16, &vpx_sad16x16_neon_dotprod), + SadMxNParam(16, 8, &vpx_sad16x8_neon_dotprod), +}; +INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, SADTest, + ::testing::ValuesIn(neon_dotprod_tests)); +#endif // HAVE_NEON_DOTPROD + const SadSkipMxNParam skip_neon_tests[] = { SadSkipMxNParam(64, 64, &vpx_sad_skip_64x64_neon), SadSkipMxNParam(64, 32, &vpx_sad_skip_64x32_neon), @@ -1188,6 +1203,21 @@ const SadSkipMxNParam skip_neon_tests[] = { INSTANTIATE_TEST_SUITE_P(NEON, SADSkipTest, ::testing::ValuesIn(skip_neon_tests)); +#if HAVE_NEON_DOTPROD +const SadSkipMxNParam skip_neon_dotprod_tests[] = { + SadSkipMxNParam(64, 64, &vpx_sad_skip_64x64_neon_dotprod), + SadSkipMxNParam(64, 32, &vpx_sad_skip_64x32_neon_dotprod), + SadSkipMxNParam(32, 64, &vpx_sad_skip_32x64_neon_dotprod), + SadSkipMxNParam(32, 32, &vpx_sad_skip_32x32_neon_dotprod), + SadSkipMxNParam(32, 16, &vpx_sad_skip_32x16_neon_dotprod), + SadSkipMxNParam(16, 32, &vpx_sad_skip_16x32_neon_dotprod), + SadSkipMxNParam(16, 16, &vpx_sad_skip_16x16_neon_dotprod), + SadSkipMxNParam(16, 8, &vpx_sad_skip_16x8_neon_dotprod), +}; +INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, SADSkipTest, + ::testing::ValuesIn(skip_neon_dotprod_tests)); +#endif // HAVE_NEON_DOTPROD + const SadMxNAvgParam avg_neon_tests[] = { SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_neon), SadMxNAvgParam(64, 32, &vpx_sad64x32_avg_neon), @@ -1246,6 +1276,21 @@ const SadMxNAvgParam avg_neon_tests[] = { }; INSTANTIATE_TEST_SUITE_P(NEON, SADavgTest, ::testing::ValuesIn(avg_neon_tests)); +#if HAVE_NEON_DOTPROD +const SadMxNAvgParam avg_neon_dotprod_tests[] = { + SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_neon_dotprod), + SadMxNAvgParam(64, 32, &vpx_sad64x32_avg_neon_dotprod), + SadMxNAvgParam(32, 64, &vpx_sad32x64_avg_neon_dotprod), + SadMxNAvgParam(32, 32, &vpx_sad32x32_avg_neon_dotprod), + SadMxNAvgParam(32, 16, &vpx_sad32x16_avg_neon_dotprod), + SadMxNAvgParam(16, 32, &vpx_sad16x32_avg_neon_dotprod), + SadMxNAvgParam(16, 16, &vpx_sad16x16_avg_neon_dotprod), + SadMxNAvgParam(16, 8, &vpx_sad16x8_avg_neon_dotprod), +}; +INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, SADavgTest, + ::testing::ValuesIn(avg_neon_dotprod_tests)); +#endif // HAVE_NEON_DOTPROD + const SadMxNx4Param x4d_neon_tests[] = { SadMxNx4Param(64, 64, &vpx_sad64x64x4d_neon), SadMxNx4Param(64, 32, &vpx_sad64x32x4d_neon), diff --git a/vpx_dsp/arm/sad_neon.c b/vpx_dsp/arm/sad_neon.c index 566a1f81db..4dd87ddc0f 100644 --- a/vpx_dsp/arm/sad_neon.c +++ b/vpx_dsp/arm/sad_neon.c @@ -17,84 +17,6 @@ #include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/sum_neon.h" -#if defined(__ARM_FEATURE_DOTPROD) - -static INLINE unsigned int sadwxh_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - int w, int h) { - // Only two accumulators are required for optimal instruction throughput of - // the ABD, UDOT sequence on CPUs with either 2 or 4 Neon pipes. - uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; - - int i = h; - do { - int j = 0; - do { - uint8x16_t s0, s1, r0, r1, diff0, diff1; - - s0 = vld1q_u8(src_ptr + j); - r0 = vld1q_u8(ref_ptr + j); - diff0 = vabdq_u8(s0, r0); - sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1)); - - s1 = vld1q_u8(src_ptr + j + 16); - r1 = vld1q_u8(ref_ptr + j + 16); - diff1 = vabdq_u8(s1, r1); - sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1)); - - j += 32; - } while (j < w); - - src_ptr += src_stride; - ref_ptr += ref_stride; - } while (--i != 0); - - return horizontal_add_uint32x4(vaddq_u32(sum[0], sum[1])); -} - -static INLINE unsigned int sad64xh_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - int h) { - return sadwxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 64, h); -} - -static INLINE unsigned int sad32xh_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - int h) { - return sadwxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 32, h); -} - -static INLINE unsigned int sad16xh_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - int h) { - uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; - - int i = h / 2; - do { - uint8x16_t s0, s1, r0, r1, diff0, diff1; - - s0 = vld1q_u8(src_ptr); - r0 = vld1q_u8(ref_ptr); - diff0 = vabdq_u8(s0, r0); - sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1)); - - src_ptr += src_stride; - ref_ptr += ref_stride; - - s1 = vld1q_u8(src_ptr); - r1 = vld1q_u8(ref_ptr); - diff1 = vabdq_u8(s1, r1); - sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1)); - - src_ptr += src_stride; - ref_ptr += ref_stride; - } while (--i != 0); - - return horizontal_add_uint32x4(vaddq_u32(sum[0], sum[1])); -} - -#else // !defined(__ARM_FEATURE_DOTPROD) - static INLINE unsigned int sad64xh_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, int h) { @@ -186,8 +108,6 @@ static INLINE unsigned int sad16xh_neon(const uint8_t *src_ptr, int src_stride, return horizontal_add_uint16x8(sum); } -#endif // defined(__ARM_FEATURE_DOTPROD) - static INLINE unsigned int sad8xh_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, int h) { @@ -280,105 +200,6 @@ SAD_SKIP_WXH_NEON(64, 64) #undef SAD_SKIP_WXH_NEON -#if defined(__ARM_FEATURE_DOTPROD) - -static INLINE unsigned int sadwxh_avg_neon(const uint8_t *src_ptr, - int src_stride, - const uint8_t *ref_ptr, - int ref_stride, int w, int h, - const uint8_t *second_pred) { - // Only two accumulators are required for optimal instruction throughput of - // the ABD, UDOT sequence on CPUs with either 2 or 4 Neon pipes. - uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; - - int i = h; - do { - int j = 0; - do { - uint8x16_t s0, s1, r0, r1, p0, p1, avg0, avg1, diff0, diff1; - - s0 = vld1q_u8(src_ptr + j); - r0 = vld1q_u8(ref_ptr + j); - p0 = vld1q_u8(second_pred); - avg0 = vrhaddq_u8(r0, p0); - diff0 = vabdq_u8(s0, avg0); - sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1)); - - s1 = vld1q_u8(src_ptr + j + 16); - r1 = vld1q_u8(ref_ptr + j + 16); - p1 = vld1q_u8(second_pred + 16); - avg1 = vrhaddq_u8(r1, p1); - diff1 = vabdq_u8(s1, avg1); - sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1)); - - j += 32; - second_pred += 32; - } while (j < w); - - src_ptr += src_stride; - ref_ptr += ref_stride; - } while (--i != 0); - - return horizontal_add_uint32x4(vaddq_u32(sum[0], sum[1])); -} - -static INLINE unsigned int sad64xh_avg_neon(const uint8_t *src_ptr, - int src_stride, - const uint8_t *ref_ptr, - int ref_stride, int h, - const uint8_t *second_pred) { - return sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 64, h, - second_pred); -} - -static INLINE unsigned int sad32xh_avg_neon(const uint8_t *src_ptr, - int src_stride, - const uint8_t *ref_ptr, - int ref_stride, int h, - const uint8_t *second_pred) { - return sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 32, h, - second_pred); -} - -static INLINE unsigned int sad16xh_avg_neon(const uint8_t *src_ptr, - int src_stride, - const uint8_t *ref_ptr, - int ref_stride, int h, - const uint8_t *second_pred) { - uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; - - int i = h / 2; - do { - uint8x16_t s0, s1, r0, r1, p0, p1, avg0, avg1, diff0, diff1; - - s0 = vld1q_u8(src_ptr); - r0 = vld1q_u8(ref_ptr); - p0 = vld1q_u8(second_pred); - avg0 = vrhaddq_u8(r0, p0); - diff0 = vabdq_u8(s0, avg0); - sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1)); - - src_ptr += src_stride; - ref_ptr += ref_stride; - second_pred += 16; - - s1 = vld1q_u8(src_ptr); - r1 = vld1q_u8(ref_ptr); - p1 = vld1q_u8(second_pred); - avg1 = vrhaddq_u8(r1, p1); - diff1 = vabdq_u8(s1, avg1); - sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1)); - - src_ptr += src_stride; - ref_ptr += ref_stride; - second_pred += 16; - } while (--i != 0); - - return horizontal_add_uint32x4(vaddq_u32(sum[0], sum[1])); -} - -#else // !defined(__ARM_FEATURE_DOTPROD) - static INLINE unsigned int sad64xh_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, @@ -493,8 +314,6 @@ static INLINE unsigned int sad16xh_avg_neon(const uint8_t *src_ptr, return horizontal_add_uint16x8(sum); } -#endif // defined(__ARM_FEATURE_DOTPROD) - static INLINE unsigned int sad8xh_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, @@ -568,3 +387,5 @@ SAD_WXH_AVG_NEON(32, 64) SAD_WXH_AVG_NEON(64, 32) SAD_WXH_AVG_NEON(64, 64) + +#undef SAD_WXH_AVG_NEON diff --git a/vpx_dsp/arm/sad_neon_dotprod.c b/vpx_dsp/arm/sad_neon_dotprod.c new file mode 100644 index 0000000000..fbc0b8d75f --- /dev/null +++ b/vpx_dsp/arm/sad_neon_dotprod.c @@ -0,0 +1,247 @@ +/* + * Copyright (c) 2021 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" + +#include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" + +static INLINE unsigned int sadwxh_neon_dotprod(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int w, int h) { + // Only two accumulators are required for optimal instruction throughput of + // the ABD, UDOT sequence on CPUs with either 2 or 4 Neon pipes. + uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = h; + do { + int j = 0; + do { + uint8x16_t s0, s1, r0, r1, diff0, diff1; + + s0 = vld1q_u8(src_ptr + j); + r0 = vld1q_u8(ref_ptr + j); + diff0 = vabdq_u8(s0, r0); + sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1)); + + s1 = vld1q_u8(src_ptr + j + 16); + r1 = vld1q_u8(ref_ptr + j + 16); + diff1 = vabdq_u8(s1, r1); + sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1)); + + j += 32; + } while (j < w); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--i != 0); + + return horizontal_add_uint32x4(vaddq_u32(sum[0], sum[1])); +} + +static INLINE unsigned int sad64xh_neon_dotprod(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h) { + return sadwxh_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 64, h); +} + +static INLINE unsigned int sad32xh_neon_dotprod(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h) { + return sadwxh_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 32, h); +} + +static INLINE unsigned int sad16xh_neon_dotprod(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h) { + uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = h / 2; + do { + uint8x16_t s0, s1, r0, r1, diff0, diff1; + + s0 = vld1q_u8(src_ptr); + r0 = vld1q_u8(ref_ptr); + diff0 = vabdq_u8(s0, r0); + sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1)); + + src_ptr += src_stride; + ref_ptr += ref_stride; + + s1 = vld1q_u8(src_ptr); + r1 = vld1q_u8(ref_ptr); + diff1 = vabdq_u8(s1, r1); + sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1)); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--i != 0); + + return horizontal_add_uint32x4(vaddq_u32(sum[0], sum[1])); +} + +#define SAD_WXH_NEON_DOTPROD(w, h) \ + unsigned int vpx_sad##w##x##h##_neon_dotprod( \ + const uint8_t *src, int src_stride, const uint8_t *ref, \ + int ref_stride) { \ + return sad##w##xh_neon_dotprod(src, src_stride, ref, ref_stride, (h)); \ + } + +SAD_WXH_NEON_DOTPROD(16, 8) +SAD_WXH_NEON_DOTPROD(16, 16) +SAD_WXH_NEON_DOTPROD(16, 32) + +SAD_WXH_NEON_DOTPROD(32, 16) +SAD_WXH_NEON_DOTPROD(32, 32) +SAD_WXH_NEON_DOTPROD(32, 64) + +SAD_WXH_NEON_DOTPROD(64, 32) +SAD_WXH_NEON_DOTPROD(64, 64) + +#undef SAD_WXH_NEON_DOTPROD + +#define SAD_SKIP_WXH_NEON_DOTPROD(w, h) \ + unsigned int vpx_sad_skip_##w##x##h##_neon_dotprod( \ + const uint8_t *src, int src_stride, const uint8_t *ref, \ + int ref_stride) { \ + return 2 * sad##w##xh_neon_dotprod(src, 2 * src_stride, ref, \ + 2 * ref_stride, (h) / 2); \ + } + +SAD_SKIP_WXH_NEON_DOTPROD(16, 8) +SAD_SKIP_WXH_NEON_DOTPROD(16, 16) +SAD_SKIP_WXH_NEON_DOTPROD(16, 32) + +SAD_SKIP_WXH_NEON_DOTPROD(32, 16) +SAD_SKIP_WXH_NEON_DOTPROD(32, 32) +SAD_SKIP_WXH_NEON_DOTPROD(32, 64) + +SAD_SKIP_WXH_NEON_DOTPROD(64, 32) +SAD_SKIP_WXH_NEON_DOTPROD(64, 64) + +#undef SAD_SKIP_WXH_NEON_DOTPROD + +static INLINE unsigned int sadwxh_avg_neon_dotprod(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int w, int h, + const uint8_t *second_pred) { + // Only two accumulators are required for optimal instruction throughput of + // the ABD, UDOT sequence on CPUs with either 2 or 4 Neon pipes. + uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = h; + do { + int j = 0; + do { + uint8x16_t s0, s1, r0, r1, p0, p1, avg0, avg1, diff0, diff1; + + s0 = vld1q_u8(src_ptr + j); + r0 = vld1q_u8(ref_ptr + j); + p0 = vld1q_u8(second_pred); + avg0 = vrhaddq_u8(r0, p0); + diff0 = vabdq_u8(s0, avg0); + sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1)); + + s1 = vld1q_u8(src_ptr + j + 16); + r1 = vld1q_u8(ref_ptr + j + 16); + p1 = vld1q_u8(second_pred + 16); + avg1 = vrhaddq_u8(r1, p1); + diff1 = vabdq_u8(s1, avg1); + sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1)); + + j += 32; + second_pred += 32; + } while (j < w); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--i != 0); + + return horizontal_add_uint32x4(vaddq_u32(sum[0], sum[1])); +} + +static INLINE unsigned int sad64xh_avg_neon_dotprod( + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, + int ref_stride, int h, const uint8_t *second_pred) { + return sadwxh_avg_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 64, + h, second_pred); +} + +static INLINE unsigned int sad32xh_avg_neon_dotprod( + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, + int ref_stride, int h, const uint8_t *second_pred) { + return sadwxh_avg_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 32, + h, second_pred); +} + +static INLINE unsigned int sad16xh_avg_neon_dotprod( + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, + int ref_stride, int h, const uint8_t *second_pred) { + uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = h / 2; + do { + uint8x16_t s0, s1, r0, r1, p0, p1, avg0, avg1, diff0, diff1; + + s0 = vld1q_u8(src_ptr); + r0 = vld1q_u8(ref_ptr); + p0 = vld1q_u8(second_pred); + avg0 = vrhaddq_u8(r0, p0); + diff0 = vabdq_u8(s0, avg0); + sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1)); + + src_ptr += src_stride; + ref_ptr += ref_stride; + second_pred += 16; + + s1 = vld1q_u8(src_ptr); + r1 = vld1q_u8(ref_ptr); + p1 = vld1q_u8(second_pred); + avg1 = vrhaddq_u8(r1, p1); + diff1 = vabdq_u8(s1, avg1); + sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1)); + + src_ptr += src_stride; + ref_ptr += ref_stride; + second_pred += 16; + } while (--i != 0); + + return horizontal_add_uint32x4(vaddq_u32(sum[0], sum[1])); +} + +#define SAD_WXH_AVG_NEON_DOTPROD(w, h) \ + uint32_t vpx_sad##w##x##h##_avg_neon_dotprod( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + return sad##w##xh_avg_neon_dotprod(src, src_stride, ref, ref_stride, (h), \ + second_pred); \ + } + +SAD_WXH_AVG_NEON_DOTPROD(16, 8) +SAD_WXH_AVG_NEON_DOTPROD(16, 16) +SAD_WXH_AVG_NEON_DOTPROD(16, 32) + +SAD_WXH_AVG_NEON_DOTPROD(32, 16) +SAD_WXH_AVG_NEON_DOTPROD(32, 32) +SAD_WXH_AVG_NEON_DOTPROD(32, 64) + +SAD_WXH_AVG_NEON_DOTPROD(64, 32) +SAD_WXH_AVG_NEON_DOTPROD(64, 64) + +#undef SAD_WXH_AVG_NEON_DOTPROD diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index 8d2422b1de..d789353a1e 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -374,6 +374,7 @@ DSP_SRCS-$(HAVE_MSA) += mips/sum_squares_msa.c DSP_SRCS-$(HAVE_NEON) += arm/sad4d_neon.c DSP_SRCS-$(HAVE_NEON) += arm/sad_neon.c +DSP_SRCS-$(HAVE_NEON_DOTPROD) += arm/sad_neon_dotprod.c DSP_SRCS-$(HAVE_NEON) += arm/subtract_neon.c DSP_SRCS-$(HAVE_MSA) += mips/sad_msa.c diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 0cd21c7997..8383bdd4ca 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -748,28 +748,28 @@ () # Single block SAD # add_proto qw/unsigned int vpx_sad64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad64x64 neon avx2 msa sse2 vsx mmi lsx/; +specialize qw/vpx_sad64x64 neon neon_dotprod avx2 msa sse2 vsx mmi lsx/; add_proto qw/unsigned int vpx_sad64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad64x32 neon avx2 msa sse2 vsx mmi/; +specialize qw/vpx_sad64x32 neon neon_dotprod avx2 msa sse2 vsx mmi/; add_proto qw/unsigned int vpx_sad32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad32x64 neon avx2 msa sse2 vsx mmi/; +specialize qw/vpx_sad32x64 neon neon_dotprod avx2 msa sse2 vsx mmi/; add_proto qw/unsigned int vpx_sad32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad32x32 neon avx2 msa sse2 vsx mmi lsx/; +specialize qw/vpx_sad32x32 neon neon_dotprod avx2 msa sse2 vsx mmi lsx/; add_proto qw/unsigned int vpx_sad32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad32x16 neon avx2 msa sse2 vsx mmi/; +specialize qw/vpx_sad32x16 neon neon_dotprod avx2 msa sse2 vsx mmi/; add_proto qw/unsigned int vpx_sad16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad16x32 neon msa sse2 vsx mmi/; +specialize qw/vpx_sad16x32 neon neon_dotprod msa sse2 vsx mmi/; add_proto qw/unsigned int vpx_sad16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad16x16 neon msa sse2 vsx mmi lsx/; +specialize qw/vpx_sad16x16 neon neon_dotprod msa sse2 vsx mmi lsx/; add_proto qw/unsigned int vpx_sad16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad16x8 neon msa sse2 vsx mmi/; +specialize qw/vpx_sad16x8 neon neon_dotprod msa sse2 vsx mmi/; add_proto qw/unsigned int vpx_sad8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; specialize qw/vpx_sad8x16 neon msa sse2 vsx mmi/; @@ -787,28 +787,28 @@ () specialize qw/vpx_sad4x4 neon msa sse2 mmi/; add_proto qw/unsigned int vpx_sad_skip_64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad_skip_64x64 neon avx2 sse2/; +specialize qw/vpx_sad_skip_64x64 neon neon_dotprod avx2 sse2/; add_proto qw/unsigned int vpx_sad_skip_64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad_skip_64x32 neon avx2 sse2/; +specialize qw/vpx_sad_skip_64x32 neon neon_dotprod avx2 sse2/; add_proto qw/unsigned int vpx_sad_skip_32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad_skip_32x64 neon avx2 sse2/; +specialize qw/vpx_sad_skip_32x64 neon neon_dotprod avx2 sse2/; add_proto qw/unsigned int vpx_sad_skip_32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad_skip_32x32 neon avx2 sse2/; +specialize qw/vpx_sad_skip_32x32 neon neon_dotprod avx2 sse2/; add_proto qw/unsigned int vpx_sad_skip_32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad_skip_32x16 neon avx2 sse2/; +specialize qw/vpx_sad_skip_32x16 neon neon_dotprod avx2 sse2/; add_proto qw/unsigned int vpx_sad_skip_16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad_skip_16x32 neon sse2/; +specialize qw/vpx_sad_skip_16x32 neon neon_dotprod sse2/; add_proto qw/unsigned int vpx_sad_skip_16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad_skip_16x16 neon sse2/; +specialize qw/vpx_sad_skip_16x16 neon neon_dotprod sse2/; add_proto qw/unsigned int vpx_sad_skip_16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad_skip_16x8 neon sse2/; +specialize qw/vpx_sad_skip_16x8 neon neon_dotprod sse2/; add_proto qw/unsigned int vpx_sad_skip_8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; specialize qw/vpx_sad_skip_8x16 neon sse2/; @@ -886,28 +886,28 @@ () } # CONFIG_VP9_ENCODER add_proto qw/unsigned int vpx_sad64x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; -specialize qw/vpx_sad64x64_avg neon avx2 msa sse2 vsx mmi lsx/; +specialize qw/vpx_sad64x64_avg neon neon_dotprod avx2 msa sse2 vsx mmi lsx/; add_proto qw/unsigned int vpx_sad64x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; -specialize qw/vpx_sad64x32_avg neon avx2 msa sse2 vsx mmi/; +specialize qw/vpx_sad64x32_avg neon neon_dotprod avx2 msa sse2 vsx mmi/; add_proto qw/unsigned int vpx_sad32x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; -specialize qw/vpx_sad32x64_avg neon avx2 msa sse2 vsx mmi/; +specialize qw/vpx_sad32x64_avg neon neon_dotprod avx2 msa sse2 vsx mmi/; add_proto qw/unsigned int vpx_sad32x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; -specialize qw/vpx_sad32x32_avg neon avx2 msa sse2 vsx mmi lsx/; +specialize qw/vpx_sad32x32_avg neon neon_dotprod avx2 msa sse2 vsx mmi lsx/; add_proto qw/unsigned int vpx_sad32x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; -specialize qw/vpx_sad32x16_avg neon avx2 msa sse2 vsx mmi/; +specialize qw/vpx_sad32x16_avg neon neon_dotprod avx2 msa sse2 vsx mmi/; add_proto qw/unsigned int vpx_sad16x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; -specialize qw/vpx_sad16x32_avg neon msa sse2 vsx mmi/; +specialize qw/vpx_sad16x32_avg neon neon_dotprod msa sse2 vsx mmi/; add_proto qw/unsigned int vpx_sad16x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; -specialize qw/vpx_sad16x16_avg neon msa sse2 vsx mmi/; +specialize qw/vpx_sad16x16_avg neon neon_dotprod msa sse2 vsx mmi/; add_proto qw/unsigned int vpx_sad16x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; -specialize qw/vpx_sad16x8_avg neon msa sse2 vsx mmi/; +specialize qw/vpx_sad16x8_avg neon neon_dotprod msa sse2 vsx mmi/; add_proto qw/unsigned int vpx_sad8x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; specialize qw/vpx_sad8x16_avg neon msa sse2 mmi/; From 7009fe55a9a7aed3a3504c09c677de0326c8207b Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Sat, 19 Aug 2023 20:59:40 +0100 Subject: [PATCH 0880/1000] Use run-time CPU feature detection for Neon DotProd SAD4D Arm Neon DotProd implementations of vpx_sad*4d currently need to be enabled at compile time since they're guarded by ifdef feature macros. Now that run-time feature detection has been enabled for Arm platforms, expose these implementations with distinct *neon_dotprod names in separate files and wire them up to the build system and rtcd.pl. Also add new test cases for the new DotProd functions. Change-Id: Ie99ee0b03ec488626f52c3f13e4111fe26cc5619 --- test/sad_test.cc | 30 ++++++ vpx_dsp/arm/sad4d_neon.c | 116 -------------------- vpx_dsp/arm/sad4d_neon_dotprod.c | 176 +++++++++++++++++++++++++++++++ vpx_dsp/vpx_dsp.mk | 1 + vpx_dsp/vpx_dsp_rtcd_defs.pl | 32 +++--- 5 files changed, 223 insertions(+), 132 deletions(-) create mode 100644 vpx_dsp/arm/sad4d_neon_dotprod.c diff --git a/test/sad_test.cc b/test/sad_test.cc index 3f9c020ee8..3530e66050 100644 --- a/test/sad_test.cc +++ b/test/sad_test.cc @@ -1346,6 +1346,21 @@ const SadMxNx4Param x4d_neon_tests[] = { }; INSTANTIATE_TEST_SUITE_P(NEON, SADx4Test, ::testing::ValuesIn(x4d_neon_tests)); +#if HAVE_NEON_DOTPROD +const SadMxNx4Param x4d_neon_dotprod_tests[] = { + SadMxNx4Param(64, 64, &vpx_sad64x64x4d_neon_dotprod), + SadMxNx4Param(64, 32, &vpx_sad64x32x4d_neon_dotprod), + SadMxNx4Param(32, 64, &vpx_sad32x64x4d_neon_dotprod), + SadMxNx4Param(32, 32, &vpx_sad32x32x4d_neon_dotprod), + SadMxNx4Param(32, 16, &vpx_sad32x16x4d_neon_dotprod), + SadMxNx4Param(16, 32, &vpx_sad16x32x4d_neon_dotprod), + SadMxNx4Param(16, 16, &vpx_sad16x16x4d_neon_dotprod), + SadMxNx4Param(16, 8, &vpx_sad16x8x4d_neon_dotprod), +}; +INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, SADx4Test, + ::testing::ValuesIn(x4d_neon_dotprod_tests)); +#endif // HAVE_NEON_DOTPROD + const SadSkipMxNx4Param skip_x4d_neon_tests[] = { SadSkipMxNx4Param(64, 64, &vpx_sad_skip_64x64x4d_neon), SadSkipMxNx4Param(64, 32, &vpx_sad_skip_64x32x4d_neon), @@ -1401,6 +1416,21 @@ const SadSkipMxNx4Param skip_x4d_neon_tests[] = { }; INSTANTIATE_TEST_SUITE_P(NEON, SADSkipx4Test, ::testing::ValuesIn(skip_x4d_neon_tests)); + +#if HAVE_NEONE_DOTPROD +const SadSkipMxNx4Param skip_x4d_neon_dotprod_tests[] = { + SadSkipMxNx4Param(64, 64, &vpx_sad_skip_64x64x4d_neon_dotprod), + SadSkipMxNx4Param(64, 32, &vpx_sad_skip_64x32x4d_neon_dotprod), + SadSkipMxNx4Param(32, 64, &vpx_sad_skip_32x64x4d_neon_dotprod), + SadSkipMxNx4Param(32, 32, &vpx_sad_skip_32x32x4d_neon_dotprod), + SadSkipMxNx4Param(32, 16, &vpx_sad_skip_32x16x4d_neon_dotprod), + SadSkipMxNx4Param(16, 32, &vpx_sad_skip_16x32x4d_neon_dotprod), + SadSkipMxNx4Param(16, 16, &vpx_sad_skip_16x16x4d_neon_dotprod), + SadSkipMxNx4Param(16, 8, &vpx_sad_skip_16x8x4d_neon_dotprod), +}; +INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, SADSkipx4Test, + ::testing::ValuesIn(skip_x4d_neon_dotprod_tests)); +#endif // HAVE_NEON_DOTPROD #endif // HAVE_NEON //------------------------------------------------------------------------------ diff --git a/vpx_dsp/arm/sad4d_neon.c b/vpx_dsp/arm/sad4d_neon.c index 3a548d0f9f..713eec7a92 100644 --- a/vpx_dsp/arm/sad4d_neon.c +++ b/vpx_dsp/arm/sad4d_neon.c @@ -17,120 +17,6 @@ #include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/sum_neon.h" -#if defined(__ARM_FEATURE_DOTPROD) - -static INLINE void sad16_neon(uint8x16_t src, uint8x16_t ref, - uint32x4_t *const sad_sum) { - uint8x16_t abs_diff = vabdq_u8(src, ref); - *sad_sum = vdotq_u32(*sad_sum, abs_diff, vdupq_n_u8(1)); -} - -static INLINE void sad64xhx4d_neon(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], int ref_stride, - uint32_t res[4], int h) { - uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), - vdupq_n_u32(0) }; - uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), - vdupq_n_u32(0) }; - uint32x4_t sum[4]; - - int i = 0; - do { - uint8x16_t s0, s1, s2, s3; - - s0 = vld1q_u8(src + i * src_stride); - sad16_neon(s0, vld1q_u8(ref[0] + i * ref_stride), &sum_lo[0]); - sad16_neon(s0, vld1q_u8(ref[1] + i * ref_stride), &sum_lo[1]); - sad16_neon(s0, vld1q_u8(ref[2] + i * ref_stride), &sum_lo[2]); - sad16_neon(s0, vld1q_u8(ref[3] + i * ref_stride), &sum_lo[3]); - - s1 = vld1q_u8(src + i * src_stride + 16); - sad16_neon(s1, vld1q_u8(ref[0] + i * ref_stride + 16), &sum_hi[0]); - sad16_neon(s1, vld1q_u8(ref[1] + i * ref_stride + 16), &sum_hi[1]); - sad16_neon(s1, vld1q_u8(ref[2] + i * ref_stride + 16), &sum_hi[2]); - sad16_neon(s1, vld1q_u8(ref[3] + i * ref_stride + 16), &sum_hi[3]); - - s2 = vld1q_u8(src + i * src_stride + 32); - sad16_neon(s2, vld1q_u8(ref[0] + i * ref_stride + 32), &sum_lo[0]); - sad16_neon(s2, vld1q_u8(ref[1] + i * ref_stride + 32), &sum_lo[1]); - sad16_neon(s2, vld1q_u8(ref[2] + i * ref_stride + 32), &sum_lo[2]); - sad16_neon(s2, vld1q_u8(ref[3] + i * ref_stride + 32), &sum_lo[3]); - - s3 = vld1q_u8(src + i * src_stride + 48); - sad16_neon(s3, vld1q_u8(ref[0] + i * ref_stride + 48), &sum_hi[0]); - sad16_neon(s3, vld1q_u8(ref[1] + i * ref_stride + 48), &sum_hi[1]); - sad16_neon(s3, vld1q_u8(ref[2] + i * ref_stride + 48), &sum_hi[2]); - sad16_neon(s3, vld1q_u8(ref[3] + i * ref_stride + 48), &sum_hi[3]); - - i++; - } while (i < h); - - sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]); - sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]); - sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]); - sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]); - - vst1q_u32(res, horizontal_add_4d_uint32x4(sum)); -} - -static INLINE void sad32xhx4d_neon(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], int ref_stride, - uint32_t res[4], int h) { - uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), - vdupq_n_u32(0) }; - uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), - vdupq_n_u32(0) }; - uint32x4_t sum[4]; - - int i = 0; - do { - uint8x16_t s0, s1; - - s0 = vld1q_u8(src + i * src_stride); - sad16_neon(s0, vld1q_u8(ref[0] + i * ref_stride), &sum_lo[0]); - sad16_neon(s0, vld1q_u8(ref[1] + i * ref_stride), &sum_lo[1]); - sad16_neon(s0, vld1q_u8(ref[2] + i * ref_stride), &sum_lo[2]); - sad16_neon(s0, vld1q_u8(ref[3] + i * ref_stride), &sum_lo[3]); - - s1 = vld1q_u8(src + i * src_stride + 16); - sad16_neon(s1, vld1q_u8(ref[0] + i * ref_stride + 16), &sum_hi[0]); - sad16_neon(s1, vld1q_u8(ref[1] + i * ref_stride + 16), &sum_hi[1]); - sad16_neon(s1, vld1q_u8(ref[2] + i * ref_stride + 16), &sum_hi[2]); - sad16_neon(s1, vld1q_u8(ref[3] + i * ref_stride + 16), &sum_hi[3]); - - i++; - } while (i < h); - - sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]); - sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]); - sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]); - sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]); - - vst1q_u32(res, horizontal_add_4d_uint32x4(sum)); -} - -static INLINE void sad16xhx4d_neon(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], int ref_stride, - uint32_t res[4], int h) { - uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), - vdupq_n_u32(0) }; - - int i = 0; - do { - const uint8x16_t s = vld1q_u8(src + i * src_stride); - sad16_neon(s, vld1q_u8(ref[0] + i * ref_stride), &sum[0]); - sad16_neon(s, vld1q_u8(ref[1] + i * ref_stride), &sum[1]); - sad16_neon(s, vld1q_u8(ref[2] + i * ref_stride), &sum[2]); - sad16_neon(s, vld1q_u8(ref[3] + i * ref_stride), &sum[3]); - - i++; - } while (i < h); - - vst1q_u32(res, horizontal_add_4d_uint32x4(sum)); -} - -#else // !defined(__ARM_FEATURE_DOTPROD)) - static INLINE void sad16_neon(uint8x16_t src, uint8x16_t ref, uint16x8_t *const sad_sum) { uint8x16_t abs_diff = vabdq_u8(src, ref); @@ -229,8 +115,6 @@ static INLINE void sad16xhx4d_neon(const uint8_t *src, int src_stride, vst1q_u32(res, horizontal_add_4d_uint16x8(sum)); } -#endif // defined(__ARM_FEATURE_DOTPROD) - static INLINE void sad8_neon(uint8x8_t src, uint8x8_t ref, uint16x8_t *const sad_sum) { uint8x8_t abs_diff = vabd_u8(src, ref); diff --git a/vpx_dsp/arm/sad4d_neon_dotprod.c b/vpx_dsp/arm/sad4d_neon_dotprod.c new file mode 100644 index 0000000000..933fc48b8c --- /dev/null +++ b/vpx_dsp/arm/sad4d_neon_dotprod.c @@ -0,0 +1,176 @@ +/* + * Copyright (c) 2021 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" + +static INLINE void sad16_neon(uint8x16_t src, uint8x16_t ref, + uint32x4_t *const sad_sum) { + uint8x16_t abs_diff = vabdq_u8(src, ref); + *sad_sum = vdotq_u32(*sad_sum, abs_diff, vdupq_n_u8(1)); +} + +static INLINE void sad64xhx4d_neon_dotprod(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], + int ref_stride, uint32_t res[4], + int h) { + uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + uint32x4_t sum[4]; + + int i = 0; + do { + uint8x16_t s0, s1, s2, s3; + + s0 = vld1q_u8(src + i * src_stride); + sad16_neon(s0, vld1q_u8(ref[0] + i * ref_stride), &sum_lo[0]); + sad16_neon(s0, vld1q_u8(ref[1] + i * ref_stride), &sum_lo[1]); + sad16_neon(s0, vld1q_u8(ref[2] + i * ref_stride), &sum_lo[2]); + sad16_neon(s0, vld1q_u8(ref[3] + i * ref_stride), &sum_lo[3]); + + s1 = vld1q_u8(src + i * src_stride + 16); + sad16_neon(s1, vld1q_u8(ref[0] + i * ref_stride + 16), &sum_hi[0]); + sad16_neon(s1, vld1q_u8(ref[1] + i * ref_stride + 16), &sum_hi[1]); + sad16_neon(s1, vld1q_u8(ref[2] + i * ref_stride + 16), &sum_hi[2]); + sad16_neon(s1, vld1q_u8(ref[3] + i * ref_stride + 16), &sum_hi[3]); + + s2 = vld1q_u8(src + i * src_stride + 32); + sad16_neon(s2, vld1q_u8(ref[0] + i * ref_stride + 32), &sum_lo[0]); + sad16_neon(s2, vld1q_u8(ref[1] + i * ref_stride + 32), &sum_lo[1]); + sad16_neon(s2, vld1q_u8(ref[2] + i * ref_stride + 32), &sum_lo[2]); + sad16_neon(s2, vld1q_u8(ref[3] + i * ref_stride + 32), &sum_lo[3]); + + s3 = vld1q_u8(src + i * src_stride + 48); + sad16_neon(s3, vld1q_u8(ref[0] + i * ref_stride + 48), &sum_hi[0]); + sad16_neon(s3, vld1q_u8(ref[1] + i * ref_stride + 48), &sum_hi[1]); + sad16_neon(s3, vld1q_u8(ref[2] + i * ref_stride + 48), &sum_hi[2]); + sad16_neon(s3, vld1q_u8(ref[3] + i * ref_stride + 48), &sum_hi[3]); + + } while (++i < h); + + sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]); + sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]); + sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]); + sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]); + + vst1q_u32(res, horizontal_add_4d_uint32x4(sum)); +} + +static INLINE void sad32xhx4d_neon_dotprod(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], + int ref_stride, uint32_t res[4], + int h) { + uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + uint32x4_t sum[4]; + + int i = 0; + do { + uint8x16_t s0, s1; + + s0 = vld1q_u8(src + i * src_stride); + sad16_neon(s0, vld1q_u8(ref[0] + i * ref_stride), &sum_lo[0]); + sad16_neon(s0, vld1q_u8(ref[1] + i * ref_stride), &sum_lo[1]); + sad16_neon(s0, vld1q_u8(ref[2] + i * ref_stride), &sum_lo[2]); + sad16_neon(s0, vld1q_u8(ref[3] + i * ref_stride), &sum_lo[3]); + + s1 = vld1q_u8(src + i * src_stride + 16); + sad16_neon(s1, vld1q_u8(ref[0] + i * ref_stride + 16), &sum_hi[0]); + sad16_neon(s1, vld1q_u8(ref[1] + i * ref_stride + 16), &sum_hi[1]); + sad16_neon(s1, vld1q_u8(ref[2] + i * ref_stride + 16), &sum_hi[2]); + sad16_neon(s1, vld1q_u8(ref[3] + i * ref_stride + 16), &sum_hi[3]); + + } while (++i < h); + + sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]); + sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]); + sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]); + sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]); + + vst1q_u32(res, horizontal_add_4d_uint32x4(sum)); +} + +static INLINE void sad16xhx4d_neon_dotprod(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], + int ref_stride, uint32_t res[4], + int h) { + uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + + int i = 0; + do { + const uint8x16_t s = vld1q_u8(src + i * src_stride); + sad16_neon(s, vld1q_u8(ref[0] + i * ref_stride), &sum[0]); + sad16_neon(s, vld1q_u8(ref[1] + i * ref_stride), &sum[1]); + sad16_neon(s, vld1q_u8(ref[2] + i * ref_stride), &sum[2]); + sad16_neon(s, vld1q_u8(ref[3] + i * ref_stride), &sum[3]); + + } while (++i < h); + + vst1q_u32(res, horizontal_add_4d_uint32x4(sum)); +} + +#define SAD_WXH_4D_NEON_DOTPROD(w, h) \ + void vpx_sad##w##x##h##x4d_neon_dotprod( \ + const uint8_t *src_ptr, int src_stride, \ + const uint8_t *const ref_array[4], int ref_stride, \ + uint32_t sad_array[4]) { \ + sad##w##xhx4d_neon_dotprod(src_ptr, src_stride, ref_array, ref_stride, \ + sad_array, (h)); \ + } + +SAD_WXH_4D_NEON_DOTPROD(16, 8) +SAD_WXH_4D_NEON_DOTPROD(16, 16) +SAD_WXH_4D_NEON_DOTPROD(16, 32) + +SAD_WXH_4D_NEON_DOTPROD(32, 16) +SAD_WXH_4D_NEON_DOTPROD(32, 32) +SAD_WXH_4D_NEON_DOTPROD(32, 64) + +SAD_WXH_4D_NEON_DOTPROD(64, 32) +SAD_WXH_4D_NEON_DOTPROD(64, 64) + +#undef SAD_WXH_4D_NEON_DOTPROD + +#define SAD_SKIP_WXH_4D_NEON_DOTPROD(w, h) \ + void vpx_sad_skip_##w##x##h##x4d_neon_dotprod( \ + const uint8_t *src_ptr, int src_stride, \ + const uint8_t *const ref_array[4], int ref_stride, \ + uint32_t sad_array[4]) { \ + sad##w##xhx4d_neon_dotprod(src_ptr, 2 * src_stride, ref_array, \ + 2 * ref_stride, sad_array, ((h) >> 1)); \ + sad_array[0] <<= 1; \ + sad_array[1] <<= 1; \ + sad_array[2] <<= 1; \ + sad_array[3] <<= 1; \ + } + +SAD_SKIP_WXH_4D_NEON_DOTPROD(16, 8) +SAD_SKIP_WXH_4D_NEON_DOTPROD(16, 16) +SAD_SKIP_WXH_4D_NEON_DOTPROD(16, 32) + +SAD_SKIP_WXH_4D_NEON_DOTPROD(32, 16) +SAD_SKIP_WXH_4D_NEON_DOTPROD(32, 32) +SAD_SKIP_WXH_4D_NEON_DOTPROD(32, 64) + +SAD_SKIP_WXH_4D_NEON_DOTPROD(64, 32) +SAD_SKIP_WXH_4D_NEON_DOTPROD(64, 64) + +#undef SAD_SKIP_WXH_4D_NEON_DOTPROD diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index d789353a1e..feb48ee73c 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -373,6 +373,7 @@ DSP_SRCS-$(HAVE_SSE2) += x86/sum_squares_sse2.c DSP_SRCS-$(HAVE_MSA) += mips/sum_squares_msa.c DSP_SRCS-$(HAVE_NEON) += arm/sad4d_neon.c +DSP_SRCS-$(HAVE_NEON_DOTPROD) += arm/sad4d_neon_dotprod.c DSP_SRCS-$(HAVE_NEON) += arm/sad_neon.c DSP_SRCS-$(HAVE_NEON_DOTPROD) += arm/sad_neon_dotprod.c DSP_SRCS-$(HAVE_NEON) += arm/subtract_neon.c diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 8383bdd4ca..ff97e68d31 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -928,28 +928,28 @@ () # Multi-block SAD, comparing a reference to N independent blocks # add_proto qw/void vpx_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; -specialize qw/vpx_sad64x64x4d avx512 avx2 neon msa sse2 vsx mmi lsx/; +specialize qw/vpx_sad64x64x4d avx512 avx2 neon neon_dotprod msa sse2 vsx mmi lsx/; add_proto qw/void vpx_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; -specialize qw/vpx_sad64x32x4d neon msa sse2 vsx mmi lsx/; +specialize qw/vpx_sad64x32x4d neon neon_dotprod msa sse2 vsx mmi lsx/; add_proto qw/void vpx_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; -specialize qw/vpx_sad32x64x4d neon msa sse2 vsx mmi lsx/; +specialize qw/vpx_sad32x64x4d neon neon_dotprod msa sse2 vsx mmi lsx/; add_proto qw/void vpx_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; -specialize qw/vpx_sad32x32x4d avx2 neon msa sse2 vsx mmi lsx/; +specialize qw/vpx_sad32x32x4d avx2 neon neon_dotprod msa sse2 vsx mmi lsx/; add_proto qw/void vpx_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; -specialize qw/vpx_sad32x16x4d neon msa sse2 vsx mmi/; +specialize qw/vpx_sad32x16x4d neon neon_dotprod msa sse2 vsx mmi/; add_proto qw/void vpx_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; -specialize qw/vpx_sad16x32x4d neon msa sse2 vsx mmi/; +specialize qw/vpx_sad16x32x4d neon neon_dotprod msa sse2 vsx mmi/; add_proto qw/void vpx_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; -specialize qw/vpx_sad16x16x4d neon msa sse2 vsx mmi lsx/; +specialize qw/vpx_sad16x16x4d neon neon_dotprod msa sse2 vsx mmi lsx/; add_proto qw/void vpx_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; -specialize qw/vpx_sad16x8x4d neon msa sse2 vsx mmi/; +specialize qw/vpx_sad16x8x4d neon neon_dotprod msa sse2 vsx mmi/; add_proto qw/void vpx_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad8x16x4d neon msa sse2 mmi/; @@ -967,28 +967,28 @@ () specialize qw/vpx_sad4x4x4d neon msa sse2 mmi/; add_proto qw/void vpx_sad_skip_64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; -specialize qw/vpx_sad_skip_64x64x4d neon avx2 sse2/; +specialize qw/vpx_sad_skip_64x64x4d neon neon_dotprod avx2 sse2/; add_proto qw/void vpx_sad_skip_64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; -specialize qw/vpx_sad_skip_64x32x4d neon avx2 sse2/; +specialize qw/vpx_sad_skip_64x32x4d neon neon_dotprod avx2 sse2/; add_proto qw/void vpx_sad_skip_32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; -specialize qw/vpx_sad_skip_32x64x4d neon avx2 sse2/; +specialize qw/vpx_sad_skip_32x64x4d neon neon_dotprod avx2 sse2/; add_proto qw/void vpx_sad_skip_32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; -specialize qw/vpx_sad_skip_32x32x4d neon avx2 sse2/; +specialize qw/vpx_sad_skip_32x32x4d neon neon_dotprod avx2 sse2/; add_proto qw/void vpx_sad_skip_32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; -specialize qw/vpx_sad_skip_32x16x4d neon avx2 sse2/; +specialize qw/vpx_sad_skip_32x16x4d neon neon_dotprod avx2 sse2/; add_proto qw/void vpx_sad_skip_16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; -specialize qw/vpx_sad_skip_16x32x4d neon sse2/; +specialize qw/vpx_sad_skip_16x32x4d neon neon_dotprod sse2/; add_proto qw/void vpx_sad_skip_16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; -specialize qw/vpx_sad_skip_16x16x4d neon sse2/; +specialize qw/vpx_sad_skip_16x16x4d neon neon_dotprod sse2/; add_proto qw/void vpx_sad_skip_16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; -specialize qw/vpx_sad_skip_16x8x4d neon sse2/; +specialize qw/vpx_sad_skip_16x8x4d neon neon_dotprod sse2/; add_proto qw/void vpx_sad_skip_8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad_skip_8x16x4d neon sse2/; From ad4f28abaa7ba1ab0482ab8e844d98845961be63 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Sat, 19 Aug 2023 23:09:07 +0100 Subject: [PATCH 0881/1000] Use run-time feature detection for Neon DotProd variance Arm Neon DotProd implementations of vpx_variancex currently need to be enabled at compile time since they're guarded by #ifdef feature macros. Now that run-time feature detection has been enabled for Arm platforms, expose these implementations with distinct *neon_dotprod names in a separate file and wire them up to the build system and rtcd.pl. Also add new test cases for the new functions. Remove the _neon suffix in functions making reference to vpx_variancex_neon() (e.g. sub-pixel variance) - enabling use of the appropriate *neon or *neon_dotprod version at run time. Similar changes for the specialty variance and MSE functions will be made in a subsequent commit. Change-Id: I69a0ef0d622ecb2d15bd90b4ace53273a32ed22d --- test/variance_test.cc | 18 +++ vpx_dsp/arm/subpel_variance_neon.c | 123 ++++++++-------- vpx_dsp/arm/variance_neon.c | 141 +------------------ vpx_dsp/arm/variance_neon_dotprod.c | 211 ++++++++++++++++++++++++++++ vpx_dsp/vpx_dsp.mk | 1 + vpx_dsp/vpx_dsp_rtcd_defs.pl | 30 ++-- 6 files changed, 308 insertions(+), 216 deletions(-) create mode 100644 vpx_dsp/arm/variance_neon_dotprod.c diff --git a/test/variance_test.cc b/test/variance_test.cc index 6885252b82..5abbcb3647 100644 --- a/test/variance_test.cc +++ b/test/variance_test.cc @@ -1475,6 +1475,24 @@ INSTANTIATE_TEST_SUITE_P( VarianceParams(2, 3, &vpx_variance4x8_neon), VarianceParams(2, 2, &vpx_variance4x4_neon))); +#if HAVE_NEON_DOTPROD +INSTANTIATE_TEST_SUITE_P( + NEON_DOTPROD, VpxVarianceTest, + ::testing::Values(VarianceParams(6, 6, &vpx_variance64x64_neon_dotprod), + VarianceParams(6, 5, &vpx_variance64x32_neon_dotprod), + VarianceParams(5, 6, &vpx_variance32x64_neon_dotprod), + VarianceParams(5, 5, &vpx_variance32x32_neon_dotprod), + VarianceParams(5, 4, &vpx_variance32x16_neon_dotprod), + VarianceParams(4, 5, &vpx_variance16x32_neon_dotprod), + VarianceParams(4, 4, &vpx_variance16x16_neon_dotprod), + VarianceParams(4, 3, &vpx_variance16x8_neon_dotprod), + VarianceParams(3, 4, &vpx_variance8x16_neon_dotprod), + VarianceParams(3, 3, &vpx_variance8x8_neon_dotprod), + VarianceParams(3, 2, &vpx_variance8x4_neon_dotprod), + VarianceParams(2, 3, &vpx_variance4x8_neon_dotprod), + VarianceParams(2, 2, &vpx_variance4x4_neon_dotprod))); +#endif // HAVE_NEON_DOTPROD + INSTANTIATE_TEST_SUITE_P( NEON, VpxSubpelVarianceTest, ::testing::Values( diff --git a/vpx_dsp/arm/subpel_variance_neon.c b/vpx_dsp/arm/subpel_variance_neon.c index 9328c3ed89..d92f1615d7 100644 --- a/vpx_dsp/arm/subpel_variance_neon.c +++ b/vpx_dsp/arm/subpel_variance_neon.c @@ -143,59 +143,58 @@ static void var_filter_block2d_avg(const uint8_t *src_ptr, uint8_t *dst_ptr, return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ } -#define SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(w, h, padding) \ - unsigned int vpx_sub_pixel_variance##w##x##h##_neon( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *ref, int ref_stride, unsigned int *sse) { \ - if (xoffset == 0) { \ - if (yoffset == 0) { \ - return vpx_variance##w##x##h##_neon(src, src_stride, ref, ref_stride, \ - sse); \ - } else if (yoffset == 4) { \ - uint8_t tmp[w * h]; \ - var_filter_block2d_avg(src, tmp, src_stride, src_stride, w, h); \ - return vpx_variance##w##x##h##_neon(tmp, w, ref, ref_stride, sse); \ - } else { \ - uint8_t tmp[w * h]; \ - var_filter_block2d_bil_w##w(src, tmp, src_stride, src_stride, h, \ - yoffset); \ - return vpx_variance##w##x##h##_neon(tmp, w, ref, ref_stride, sse); \ - } \ - } else if (xoffset == 4) { \ - uint8_t tmp0[w * (h + padding)]; \ - if (yoffset == 0) { \ - var_filter_block2d_avg(src, tmp0, src_stride, 1, w, h); \ - return vpx_variance##w##x##h##_neon(tmp0, w, ref, ref_stride, sse); \ - } else if (yoffset == 4) { \ - uint8_t tmp1[w * (h + padding)]; \ - var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding)); \ - var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \ - return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \ - } else { \ - uint8_t tmp1[w * (h + padding)]; \ - var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding)); \ - var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ - return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \ - } \ - } else { \ - uint8_t tmp0[w * (h + padding)]; \ - if (yoffset == 0) { \ - var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, h, xoffset); \ - return vpx_variance##w##x##h##_neon(tmp0, w, ref, ref_stride, sse); \ - } else if (yoffset == 4) { \ - uint8_t tmp1[w * h]; \ - var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \ - xoffset); \ - var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \ - return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \ - } else { \ - uint8_t tmp1[w * h]; \ - var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \ - xoffset); \ - var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ - return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \ - } \ - } \ +#define SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(w, h, padding) \ + unsigned int vpx_sub_pixel_variance##w##x##h##_neon( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, unsigned int *sse) { \ + if (xoffset == 0) { \ + if (yoffset == 0) { \ + return vpx_variance##w##x##h(src, src_stride, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + uint8_t tmp[w * h]; \ + var_filter_block2d_avg(src, tmp, src_stride, src_stride, w, h); \ + return vpx_variance##w##x##h(tmp, w, ref, ref_stride, sse); \ + } else { \ + uint8_t tmp[w * h]; \ + var_filter_block2d_bil_w##w(src, tmp, src_stride, src_stride, h, \ + yoffset); \ + return vpx_variance##w##x##h(tmp, w, ref, ref_stride, sse); \ + } \ + } else if (xoffset == 4) { \ + uint8_t tmp0[w * (h + padding)]; \ + if (yoffset == 0) { \ + var_filter_block2d_avg(src, tmp0, src_stride, 1, w, h); \ + return vpx_variance##w##x##h(tmp0, w, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + uint8_t tmp1[w * (h + padding)]; \ + var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding)); \ + var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \ + return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ + } else { \ + uint8_t tmp1[w * (h + padding)]; \ + var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding)); \ + var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ + return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ + } \ + } else { \ + uint8_t tmp0[w * (h + padding)]; \ + if (yoffset == 0) { \ + var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, h, xoffset); \ + return vpx_variance##w##x##h(tmp0, w, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + uint8_t tmp1[w * h]; \ + var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \ + xoffset); \ + var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \ + return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ + } else { \ + uint8_t tmp1[w * h]; \ + var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \ + xoffset); \ + var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ + return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ + } \ + } \ } // 4x blocks are processed two rows at a time, so require an extra row of @@ -418,53 +417,53 @@ static void avg_pred(const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, uint8_t tmp[w * h]; \ if (yoffset == 0) { \ avg_pred(src, tmp, source_stride, w, h, second_pred); \ - return vpx_variance##w##x##h##_neon(tmp, w, ref, ref_stride, sse); \ + return vpx_variance##w##x##h(tmp, w, ref, ref_stride, sse); \ } else if (yoffset == 4) { \ avg_pred_var_filter_block2d_avg(src, tmp, source_stride, \ source_stride, w, h, second_pred); \ - return vpx_variance##w##x##h##_neon(tmp, w, ref, ref_stride, sse); \ + return vpx_variance##w##x##h(tmp, w, ref, ref_stride, sse); \ } else { \ avg_pred_var_filter_block2d_bil_w##w( \ src, tmp, source_stride, source_stride, h, yoffset, second_pred); \ - return vpx_variance##w##x##h##_neon(tmp, w, ref, ref_stride, sse); \ + return vpx_variance##w##x##h(tmp, w, ref, ref_stride, sse); \ } \ } else if (xoffset == 4) { \ uint8_t tmp0[w * (h + padding)]; \ if (yoffset == 0) { \ avg_pred_var_filter_block2d_avg(src, tmp0, source_stride, 1, w, h, \ second_pred); \ - return vpx_variance##w##x##h##_neon(tmp0, w, ref, ref_stride, sse); \ + return vpx_variance##w##x##h(tmp0, w, ref, ref_stride, sse); \ } else if (yoffset == 4) { \ uint8_t tmp1[w * (h + padding)]; \ var_filter_block2d_avg(src, tmp0, source_stride, 1, w, (h + padding)); \ avg_pred_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h, second_pred); \ - return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \ + return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ } else { \ uint8_t tmp1[w * (h + padding)]; \ var_filter_block2d_avg(src, tmp0, source_stride, 1, w, (h + padding)); \ avg_pred_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset, \ second_pred); \ - return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \ + return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ } \ } else { \ uint8_t tmp0[w * (h + padding)]; \ if (yoffset == 0) { \ avg_pred_var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, h, \ xoffset, second_pred); \ - return vpx_variance##w##x##h##_neon(tmp0, w, ref, ref_stride, sse); \ + return vpx_variance##w##x##h(tmp0, w, ref, ref_stride, sse); \ } else if (yoffset == 4) { \ uint8_t tmp1[w * h]; \ var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, \ (h + padding), xoffset); \ avg_pred_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h, second_pred); \ - return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \ + return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ } else { \ uint8_t tmp1[w * h]; \ var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, \ (h + padding), xoffset); \ avg_pred_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset, \ second_pred); \ - return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \ + return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ } \ } \ } diff --git a/vpx_dsp/arm/variance_neon.c b/vpx_dsp/arm/variance_neon.c index f41249d4d5..84a6a761f9 100644 --- a/vpx_dsp/arm/variance_neon.c +++ b/vpx_dsp/arm/variance_neon.c @@ -19,143 +19,6 @@ #include "vpx_dsp/arm/sum_neon.h" #include "vpx_ports/mem.h" -#if defined(__ARM_FEATURE_DOTPROD) - -// Process a block of width 4 four rows at a time. -static INLINE void variance_4xh_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - int h, uint32_t *sse, int *sum) { - uint32x4_t src_sum = vdupq_n_u32(0); - uint32x4_t ref_sum = vdupq_n_u32(0); - uint32x4_t sse_u32 = vdupq_n_u32(0); - - int i = h; - do { - const uint8x16_t s = load_unaligned_u8q(src_ptr, src_stride); - const uint8x16_t r = load_unaligned_u8q(ref_ptr, ref_stride); - - const uint8x16_t abs_diff = vabdq_u8(s, r); - sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff); - - src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1)); - ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1)); - - src_ptr += 4 * src_stride; - ref_ptr += 4 * ref_stride; - i -= 4; - } while (i != 0); - - *sum = horizontal_add_int32x4( - vreinterpretq_s32_u32(vsubq_u32(src_sum, ref_sum))); - *sse = horizontal_add_uint32x4(sse_u32); -} - -// Process a block of width 8 two rows at a time. -static INLINE void variance_8xh_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - int h, uint32_t *sse, int *sum) { - uint32x4_t src_sum = vdupq_n_u32(0); - uint32x4_t ref_sum = vdupq_n_u32(0); - uint32x4_t sse_u32 = vdupq_n_u32(0); - - int i = h; - do { - const uint8x16_t s = - vcombine_u8(vld1_u8(src_ptr), vld1_u8(src_ptr + src_stride)); - const uint8x16_t r = - vcombine_u8(vld1_u8(ref_ptr), vld1_u8(ref_ptr + ref_stride)); - - const uint8x16_t abs_diff = vabdq_u8(s, r); - sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff); - - src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1)); - ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1)); - - src_ptr += 2 * src_stride; - ref_ptr += 2 * ref_stride; - i -= 2; - } while (i != 0); - - *sum = horizontal_add_int32x4( - vreinterpretq_s32_u32(vsubq_u32(src_sum, ref_sum))); - *sse = horizontal_add_uint32x4(sse_u32); -} - -// Process a block of width 16 one row at a time. -static INLINE void variance_16xh_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - int h, uint32_t *sse, int *sum) { - uint32x4_t src_sum = vdupq_n_u32(0); - uint32x4_t ref_sum = vdupq_n_u32(0); - uint32x4_t sse_u32 = vdupq_n_u32(0); - - int i = h; - do { - const uint8x16_t s = vld1q_u8(src_ptr); - const uint8x16_t r = vld1q_u8(ref_ptr); - - const uint8x16_t abs_diff = vabdq_u8(s, r); - sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff); - - src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1)); - ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1)); - - src_ptr += src_stride; - ref_ptr += ref_stride; - } while (--i != 0); - - *sum = horizontal_add_int32x4( - vreinterpretq_s32_u32(vsubq_u32(src_sum, ref_sum))); - *sse = horizontal_add_uint32x4(sse_u32); -} - -// Process a block of any size where the width is divisible by 16. -static INLINE void variance_large_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - int w, int h, uint32_t *sse, int *sum) { - uint32x4_t src_sum = vdupq_n_u32(0); - uint32x4_t ref_sum = vdupq_n_u32(0); - uint32x4_t sse_u32 = vdupq_n_u32(0); - - int i = h; - do { - int j = 0; - do { - const uint8x16_t s = vld1q_u8(src_ptr + j); - const uint8x16_t r = vld1q_u8(ref_ptr + j); - - const uint8x16_t abs_diff = vabdq_u8(s, r); - sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff); - - src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1)); - ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1)); - - j += 16; - } while (j < w); - - src_ptr += src_stride; - ref_ptr += ref_stride; - } while (--i != 0); - - *sum = horizontal_add_int32x4( - vreinterpretq_s32_u32(vsubq_u32(src_sum, ref_sum))); - *sse = horizontal_add_uint32x4(sse_u32); -} - -static INLINE void variance_32xh_neon(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, int h, - uint32_t *sse, int *sum) { - variance_large_neon(src, src_stride, ref, ref_stride, 32, h, sse, sum); -} - -static INLINE void variance_64xh_neon(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, int h, - uint32_t *sse, int *sum) { - variance_large_neon(src, src_stride, ref, ref_stride, 64, h, sse, sum); -} - -#else // !defined(__ARM_FEATURE_DOTPROD) - // Process a block of width 4 two rows at a time. static INLINE void variance_4xh_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, @@ -328,8 +191,6 @@ static INLINE void variance_64xh_neon(const uint8_t *src, int src_stride, variance_large_neon(src, src_stride, ref, ref_stride, 64, h, 32, sse, sum); } -#endif // defined(__ARM_FEATURE_DOTPROD) - void vpx_get8x8var_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum) { @@ -369,6 +230,8 @@ VARIANCE_WXH_NEON(32, 64, 11) VARIANCE_WXH_NEON(64, 32, 11) VARIANCE_WXH_NEON(64, 64, 12) +#undef VARIANCE_WXH_NEON + #if defined(__ARM_FEATURE_DOTPROD) static INLINE unsigned int vpx_mse8xh_neon(const unsigned char *src_ptr, diff --git a/vpx_dsp/arm/variance_neon_dotprod.c b/vpx_dsp/arm/variance_neon_dotprod.c new file mode 100644 index 0000000000..a47c355636 --- /dev/null +++ b/vpx_dsp/arm/variance_neon_dotprod.c @@ -0,0 +1,211 @@ +/* + * Copyright (c) 2021 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_dsp_rtcd.h" +#include "./vpx_config.h" + +#include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" +#include "vpx_ports/mem.h" + +// Process a block of width 4 four rows at a time. +static INLINE void variance_4xh_neon_dotprod(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + uint32_t *sse, int *sum) { + uint32x4_t src_sum = vdupq_n_u32(0); + uint32x4_t ref_sum = vdupq_n_u32(0); + uint32x4_t sse_u32 = vdupq_n_u32(0); + + int i = h; + do { + const uint8x16_t s = load_unaligned_u8q(src_ptr, src_stride); + const uint8x16_t r = load_unaligned_u8q(ref_ptr, ref_stride); + + const uint8x16_t abs_diff = vabdq_u8(s, r); + sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff); + + src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1)); + ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1)); + + src_ptr += 4 * src_stride; + ref_ptr += 4 * ref_stride; + i -= 4; + } while (i != 0); + + *sum = horizontal_add_int32x4( + vreinterpretq_s32_u32(vsubq_u32(src_sum, ref_sum))); + *sse = horizontal_add_uint32x4(sse_u32); +} + +// Process a block of width 8 two rows at a time. +static INLINE void variance_8xh_neon_dotprod(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + uint32_t *sse, int *sum) { + uint32x4_t src_sum = vdupq_n_u32(0); + uint32x4_t ref_sum = vdupq_n_u32(0); + uint32x4_t sse_u32 = vdupq_n_u32(0); + + int i = h; + do { + const uint8x16_t s = + vcombine_u8(vld1_u8(src_ptr), vld1_u8(src_ptr + src_stride)); + const uint8x16_t r = + vcombine_u8(vld1_u8(ref_ptr), vld1_u8(ref_ptr + ref_stride)); + + const uint8x16_t abs_diff = vabdq_u8(s, r); + sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff); + + src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1)); + ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1)); + + src_ptr += 2 * src_stride; + ref_ptr += 2 * ref_stride; + i -= 2; + } while (i != 0); + + *sum = horizontal_add_int32x4( + vreinterpretq_s32_u32(vsubq_u32(src_sum, ref_sum))); + *sse = horizontal_add_uint32x4(sse_u32); +} + +// Process a block of width 16 one row at a time. +static INLINE void variance_16xh_neon_dotprod(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + uint32_t *sse, int *sum) { + uint32x4_t src_sum = vdupq_n_u32(0); + uint32x4_t ref_sum = vdupq_n_u32(0); + uint32x4_t sse_u32 = vdupq_n_u32(0); + + int i = h; + do { + const uint8x16_t s = vld1q_u8(src_ptr); + const uint8x16_t r = vld1q_u8(ref_ptr); + + const uint8x16_t abs_diff = vabdq_u8(s, r); + sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff); + + src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1)); + ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1)); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--i != 0); + + *sum = horizontal_add_int32x4( + vreinterpretq_s32_u32(vsubq_u32(src_sum, ref_sum))); + *sse = horizontal_add_uint32x4(sse_u32); +} + +// Process a block of any size where the width is divisible by 16. +static INLINE void variance_large_neon_dotprod(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int w, int h, + uint32_t *sse, int *sum) { + uint32x4_t src_sum = vdupq_n_u32(0); + uint32x4_t ref_sum = vdupq_n_u32(0); + uint32x4_t sse_u32 = vdupq_n_u32(0); + + int i = h; + do { + int j = 0; + do { + const uint8x16_t s = vld1q_u8(src_ptr + j); + const uint8x16_t r = vld1q_u8(ref_ptr + j); + + const uint8x16_t abs_diff = vabdq_u8(s, r); + sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff); + + src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1)); + ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1)); + + j += 16; + } while (j < w); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--i != 0); + + *sum = horizontal_add_int32x4( + vreinterpretq_s32_u32(vsubq_u32(src_sum, ref_sum))); + *sse = horizontal_add_uint32x4(sse_u32); +} + +static INLINE void variance_32xh_neon_dotprod(const uint8_t *src, + int src_stride, + const uint8_t *ref, + int ref_stride, int h, + uint32_t *sse, int *sum) { + variance_large_neon_dotprod(src, src_stride, ref, ref_stride, 32, h, sse, + sum); +} + +static INLINE void variance_64xh_neon_dotprod(const uint8_t *src, + int src_stride, + const uint8_t *ref, + int ref_stride, int h, + uint32_t *sse, int *sum) { + variance_large_neon_dotprod(src, src_stride, ref, ref_stride, 64, h, sse, + sum); +} + +void vpx_get8x8var_neon_dotprod(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse, int *sum) { + variance_8xh_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 8, sse, + sum); +} + +void vpx_get16x16var_neon_dotprod(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse, int *sum) { + variance_16xh_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 16, sse, + sum); +} + +#define VARIANCE_WXH_NEON_DOTPROD(w, h, shift) \ + unsigned int vpx_variance##w##x##h##_neon_dotprod( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + unsigned int *sse) { \ + int sum; \ + variance_##w##xh_neon_dotprod(src, src_stride, ref, ref_stride, h, sse, \ + &sum); \ + return *sse - (uint32_t)(((int64_t)sum * sum) >> shift); \ + } + +VARIANCE_WXH_NEON_DOTPROD(4, 4, 4) +VARIANCE_WXH_NEON_DOTPROD(4, 8, 5) + +VARIANCE_WXH_NEON_DOTPROD(8, 4, 5) +VARIANCE_WXH_NEON_DOTPROD(8, 8, 6) +VARIANCE_WXH_NEON_DOTPROD(8, 16, 7) + +VARIANCE_WXH_NEON_DOTPROD(16, 8, 7) +VARIANCE_WXH_NEON_DOTPROD(16, 16, 8) +VARIANCE_WXH_NEON_DOTPROD(16, 32, 9) + +VARIANCE_WXH_NEON_DOTPROD(32, 16, 9) +VARIANCE_WXH_NEON_DOTPROD(32, 32, 10) +VARIANCE_WXH_NEON_DOTPROD(32, 64, 11) + +VARIANCE_WXH_NEON_DOTPROD(64, 32, 11) +VARIANCE_WXH_NEON_DOTPROD(64, 64, 12) + +#undef VARIANCE_WXH_NEON_DOTPROD diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index feb48ee73c..84fd969daa 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -418,6 +418,7 @@ DSP_SRCS-yes += variance.h DSP_SRCS-$(HAVE_NEON) += arm/avg_pred_neon.c DSP_SRCS-$(HAVE_NEON) += arm/subpel_variance_neon.c DSP_SRCS-$(HAVE_NEON) += arm/variance_neon.c +DSP_SRCS-$(HAVE_NEON_DOTPROD) += arm/variance_neon_dotprod.c DSP_SRCS-$(HAVE_MSA) += mips/variance_msa.c DSP_SRCS-$(HAVE_MSA) += mips/sub_pixel_variance_msa.c diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index ff97e68d31..94a821371e 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -1254,52 +1254,52 @@ () # Variance # add_proto qw/unsigned int vpx_variance64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance64x64 sse2 avx2 neon msa mmi vsx lsx/; + specialize qw/vpx_variance64x64 sse2 avx2 neon neon_dotprod msa mmi vsx lsx/; add_proto qw/unsigned int vpx_variance64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance64x32 sse2 avx2 neon msa mmi vsx/; + specialize qw/vpx_variance64x32 sse2 avx2 neon neon_dotprod msa mmi vsx/; add_proto qw/unsigned int vpx_variance32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance32x64 sse2 avx2 neon msa mmi vsx/; + specialize qw/vpx_variance32x64 sse2 avx2 neon neon_dotprod msa mmi vsx/; add_proto qw/unsigned int vpx_variance32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance32x32 sse2 avx2 neon msa mmi vsx lsx/; + specialize qw/vpx_variance32x32 sse2 avx2 neon neon_dotprod msa mmi vsx lsx/; add_proto qw/unsigned int vpx_variance32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance32x16 sse2 avx2 neon msa mmi vsx/; + specialize qw/vpx_variance32x16 sse2 avx2 neon neon_dotprod msa mmi vsx/; add_proto qw/unsigned int vpx_variance16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance16x32 sse2 avx2 neon msa mmi vsx/; + specialize qw/vpx_variance16x32 sse2 avx2 neon neon_dotprod msa mmi vsx/; add_proto qw/unsigned int vpx_variance16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance16x16 sse2 avx2 neon msa mmi vsx lsx/; + specialize qw/vpx_variance16x16 sse2 avx2 neon neon_dotprod msa mmi vsx lsx/; add_proto qw/unsigned int vpx_variance16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance16x8 sse2 avx2 neon msa mmi vsx/; + specialize qw/vpx_variance16x8 sse2 avx2 neon neon_dotprod msa mmi vsx/; add_proto qw/unsigned int vpx_variance8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance8x16 sse2 avx2 neon msa mmi vsx/; + specialize qw/vpx_variance8x16 sse2 avx2 neon neon_dotprod msa mmi vsx/; add_proto qw/unsigned int vpx_variance8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance8x8 sse2 avx2 neon msa mmi vsx lsx/; + specialize qw/vpx_variance8x8 sse2 avx2 neon neon_dotprod msa mmi vsx lsx/; add_proto qw/unsigned int vpx_variance8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance8x4 sse2 avx2 neon msa mmi vsx/; + specialize qw/vpx_variance8x4 sse2 avx2 neon neon_dotprod msa mmi vsx/; add_proto qw/unsigned int vpx_variance4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance4x8 sse2 neon msa mmi vsx/; + specialize qw/vpx_variance4x8 sse2 neon neon_dotprod msa mmi vsx/; add_proto qw/unsigned int vpx_variance4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance4x4 sse2 neon msa mmi vsx/; + specialize qw/vpx_variance4x4 sse2 neon neon_dotprod msa mmi vsx/; # # Specialty Variance # add_proto qw/void vpx_get16x16var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - specialize qw/vpx_get16x16var sse2 avx2 neon msa vsx lsx/; + specialize qw/vpx_get16x16var sse2 avx2 neon neon_dotprod msa vsx lsx/; add_proto qw/void vpx_get8x8var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - specialize qw/vpx_get8x8var sse2 neon msa vsx/; + specialize qw/vpx_get8x8var sse2 neon neon_dotprod msa vsx/; add_proto qw/unsigned int vpx_mse16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_mse16x16 sse2 avx2 neon msa mmi vsx lsx/; From 15d621571665d8731fd980282913d9454d75c870 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Sat, 19 Aug 2023 23:25:01 +0100 Subject: [PATCH 0882/1000] Use run-time feature detection for Neon DotProd specialty var. Enable Arm Neon DotProd implementations of vpx_get_var_sse_sum* specialty variance functions via run-time feature detection, wiring up the new *neon_dotprod names to rtcd.pl. Also add new test cases. Change-Id: I04ac3db87d32ee7f94702b6c0360254e5688f713 --- test/variance_test.cc | 11 ++++ vpx_dsp/arm/variance_neon.c | 79 +------------------------- vpx_dsp/arm/variance_neon_dotprod.c | 87 +++++++++++++++++++++++++++++ vpx_dsp/vpx_dsp_rtcd_defs.pl | 10 ++-- 4 files changed, 105 insertions(+), 82 deletions(-) diff --git a/test/variance_test.cc b/test/variance_test.cc index 5abbcb3647..c32c919760 100644 --- a/test/variance_test.cc +++ b/test/variance_test.cc @@ -1476,6 +1476,17 @@ INSTANTIATE_TEST_SUITE_P( VarianceParams(2, 2, &vpx_variance4x4_neon))); #if HAVE_NEON_DOTPROD +INSTANTIATE_TEST_SUITE_P( + NEON_DOTPROD, VpxSseTest, + ::testing::Values(SseParams(2, 2, &vpx_get4x4sse_cs_neon_dotprod))); + +INSTANTIATE_TEST_SUITE_P( + NEON_DOTPROD, VpxMseTest, + ::testing::Values(MseParams(4, 4, &vpx_mse16x16_neon_dotprod), + MseParams(4, 3, &vpx_mse16x8_neon_dotprod), + MseParams(3, 4, &vpx_mse8x16_neon_dotprod), + MseParams(3, 3, &vpx_mse8x8_neon_dotprod))); + INSTANTIATE_TEST_SUITE_P( NEON_DOTPROD, VpxVarianceTest, ::testing::Values(VarianceParams(6, 6, &vpx_variance64x64_neon_dotprod), diff --git a/vpx_dsp/arm/variance_neon.c b/vpx_dsp/arm/variance_neon.c index 84a6a761f9..efb2c1d8da 100644 --- a/vpx_dsp/arm/variance_neon.c +++ b/vpx_dsp/arm/variance_neon.c @@ -232,81 +232,6 @@ VARIANCE_WXH_NEON(64, 64, 12) #undef VARIANCE_WXH_NEON -#if defined(__ARM_FEATURE_DOTPROD) - -static INLINE unsigned int vpx_mse8xh_neon(const unsigned char *src_ptr, - int src_stride, - const unsigned char *ref_ptr, - int ref_stride, int h) { - uint32x2_t sse_u32[2] = { vdup_n_u32(0), vdup_n_u32(0) }; - - int i = h / 2; - do { - uint8x8_t s0, s1, r0, r1, diff0, diff1; - - s0 = vld1_u8(src_ptr); - src_ptr += src_stride; - s1 = vld1_u8(src_ptr); - src_ptr += src_stride; - r0 = vld1_u8(ref_ptr); - ref_ptr += ref_stride; - r1 = vld1_u8(ref_ptr); - ref_ptr += ref_stride; - - diff0 = vabd_u8(s0, r0); - diff1 = vabd_u8(s1, r1); - - sse_u32[0] = vdot_u32(sse_u32[0], diff0, diff0); - sse_u32[1] = vdot_u32(sse_u32[1], diff1, diff1); - } while (--i != 0); - - return horizontal_add_uint32x2(vadd_u32(sse_u32[0], sse_u32[1])); -} - -static INLINE unsigned int vpx_mse16xh_neon(const unsigned char *src_ptr, - int src_stride, - const unsigned char *ref_ptr, - int ref_stride, int h) { - uint32x4_t sse_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; - - int i = h / 2; - do { - uint8x16_t s0, s1, r0, r1, diff0, diff1; - - s0 = vld1q_u8(src_ptr); - src_ptr += src_stride; - s1 = vld1q_u8(src_ptr); - src_ptr += src_stride; - r0 = vld1q_u8(ref_ptr); - ref_ptr += ref_stride; - r1 = vld1q_u8(ref_ptr); - ref_ptr += ref_stride; - - diff0 = vabdq_u8(s0, r0); - diff1 = vabdq_u8(s1, r1); - - sse_u32[0] = vdotq_u32(sse_u32[0], diff0, diff0); - sse_u32[1] = vdotq_u32(sse_u32[1], diff1, diff1); - } while (--i != 0); - - return horizontal_add_uint32x4(vaddq_u32(sse_u32[0], sse_u32[1])); -} - -unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int src_stride, - const unsigned char *ref_ptr, - int ref_stride) { - uint8x16_t s = load_unaligned_u8q(src_ptr, src_stride); - uint8x16_t r = load_unaligned_u8q(ref_ptr, ref_stride); - - uint8x16_t abs_diff = vabdq_u8(s, r); - - uint32x4_t sse = vdotq_u32(vdupq_n_u32(0), abs_diff, abs_diff); - - return horizontal_add_uint32x4(sse); -} - -#else // !defined(__ARM_FEATURE_DOTPROD) - static INLINE unsigned int vpx_mse8xh_neon(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, @@ -391,8 +316,6 @@ unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int src_stride, return horizontal_add_uint32x4(sse); } -#endif // defined(__ARM_FEATURE_DOTPROD) - #define VPX_MSE_WXH_NEON(w, h) \ unsigned int vpx_mse##w##x##h##_neon( \ const unsigned char *src_ptr, int src_stride, \ @@ -405,3 +328,5 @@ VPX_MSE_WXH_NEON(8, 8) VPX_MSE_WXH_NEON(8, 16) VPX_MSE_WXH_NEON(16, 8) VPX_MSE_WXH_NEON(16, 16) + +#undef VPX_MSE_WXH_NEON diff --git a/vpx_dsp/arm/variance_neon_dotprod.c b/vpx_dsp/arm/variance_neon_dotprod.c index a47c355636..ab843e9fca 100644 --- a/vpx_dsp/arm/variance_neon_dotprod.c +++ b/vpx_dsp/arm/variance_neon_dotprod.c @@ -209,3 +209,90 @@ VARIANCE_WXH_NEON_DOTPROD(64, 32, 11) VARIANCE_WXH_NEON_DOTPROD(64, 64, 12) #undef VARIANCE_WXH_NEON_DOTPROD + +static INLINE unsigned int vpx_mse8xh_neon_dotprod(const unsigned char *src_ptr, + int src_stride, + const unsigned char *ref_ptr, + int ref_stride, int h) { + uint32x2_t sse_u32[2] = { vdup_n_u32(0), vdup_n_u32(0) }; + + int i = h / 2; + do { + uint8x8_t s0, s1, r0, r1, diff0, diff1; + + s0 = vld1_u8(src_ptr); + src_ptr += src_stride; + s1 = vld1_u8(src_ptr); + src_ptr += src_stride; + r0 = vld1_u8(ref_ptr); + ref_ptr += ref_stride; + r1 = vld1_u8(ref_ptr); + ref_ptr += ref_stride; + + diff0 = vabd_u8(s0, r0); + diff1 = vabd_u8(s1, r1); + + sse_u32[0] = vdot_u32(sse_u32[0], diff0, diff0); + sse_u32[1] = vdot_u32(sse_u32[1], diff1, diff1); + } while (--i != 0); + + return horizontal_add_uint32x2(vadd_u32(sse_u32[0], sse_u32[1])); +} + +static INLINE unsigned int vpx_mse16xh_neon_dotprod( + const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, + int ref_stride, int h) { + uint32x4_t sse_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = h / 2; + do { + uint8x16_t s0, s1, r0, r1, diff0, diff1; + + s0 = vld1q_u8(src_ptr); + src_ptr += src_stride; + s1 = vld1q_u8(src_ptr); + src_ptr += src_stride; + r0 = vld1q_u8(ref_ptr); + ref_ptr += ref_stride; + r1 = vld1q_u8(ref_ptr); + ref_ptr += ref_stride; + + diff0 = vabdq_u8(s0, r0); + diff1 = vabdq_u8(s1, r1); + + sse_u32[0] = vdotq_u32(sse_u32[0], diff0, diff0); + sse_u32[1] = vdotq_u32(sse_u32[1], diff1, diff1); + } while (--i != 0); + + return horizontal_add_uint32x4(vaddq_u32(sse_u32[0], sse_u32[1])); +} + +unsigned int vpx_get4x4sse_cs_neon_dotprod(const unsigned char *src_ptr, + int src_stride, + const unsigned char *ref_ptr, + int ref_stride) { + uint8x16_t s = load_unaligned_u8q(src_ptr, src_stride); + uint8x16_t r = load_unaligned_u8q(ref_ptr, ref_stride); + + uint8x16_t abs_diff = vabdq_u8(s, r); + + uint32x4_t sse = vdotq_u32(vdupq_n_u32(0), abs_diff, abs_diff); + + return horizontal_add_uint32x4(sse); +} + +#define VPX_MSE_WXH_NEON_DOTPROD(w, h) \ + unsigned int vpx_mse##w##x##h##_neon_dotprod( \ + const unsigned char *src_ptr, int src_stride, \ + const unsigned char *ref_ptr, int ref_stride, unsigned int *sse) { \ + *sse = vpx_mse##w##xh_neon_dotprod(src_ptr, src_stride, ref_ptr, \ + ref_stride, h); \ + return *sse; \ + } + +VPX_MSE_WXH_NEON_DOTPROD(8, 8) +VPX_MSE_WXH_NEON_DOTPROD(8, 16) +VPX_MSE_WXH_NEON_DOTPROD(16, 8) +VPX_MSE_WXH_NEON_DOTPROD(16, 16) + +#undef VPX_MSE_WXH_NEON_DOTPROD diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 94a821371e..c9cdc285f2 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -1302,22 +1302,22 @@ () specialize qw/vpx_get8x8var sse2 neon neon_dotprod msa vsx/; add_proto qw/unsigned int vpx_mse16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_mse16x16 sse2 avx2 neon msa mmi vsx lsx/; + specialize qw/vpx_mse16x16 sse2 avx2 neon neon_dotprod msa mmi vsx lsx/; add_proto qw/unsigned int vpx_mse16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_mse16x8 sse2 avx2 neon msa mmi vsx/; + specialize qw/vpx_mse16x8 sse2 avx2 neon neon_dotprod msa mmi vsx/; add_proto qw/unsigned int vpx_mse8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_mse8x16 sse2 neon msa mmi vsx/; + specialize qw/vpx_mse8x16 sse2 neon neon_dotprod msa mmi vsx/; add_proto qw/unsigned int vpx_mse8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_mse8x8 sse2 neon msa mmi vsx/; + specialize qw/vpx_mse8x8 sse2 neon neon_dotprod msa mmi vsx/; add_proto qw/unsigned int vpx_get_mb_ss/, "const int16_t *"; specialize qw/vpx_get_mb_ss sse2 msa vsx/; add_proto qw/unsigned int vpx_get4x4sse_cs/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride"; - specialize qw/vpx_get4x4sse_cs neon msa vsx/; + specialize qw/vpx_get4x4sse_cs neon neon_dotprod msa vsx/; add_proto qw/void vpx_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride"; specialize qw/vpx_comp_avg_pred neon sse2 avx2 vsx lsx/; From 1a1f50a89d2ba6890024464742bf5a01e034fb45 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Sat, 19 Aug 2023 23:41:09 +0100 Subject: [PATCH 0883/1000] Use run-time feature detection for Neon DotProd HBD MSE Arm Neon DotProd implementations of vpx_highbd_8_msex currently need to be enabled at compile time since they're guarded by #ifdef feature macros. Now that run-time feature detection has been enabled for Arm platforms, expose these implementations with distinct *neon_dotprod names in a separate file and wire them up to the build system and rtcd.pl. Also add new test cases for the new functions. Change-Id: I26be6fb587258c8fa9fbf03509b7602358a001a8 --- test/variance_test.cc | 10 +++ vpx_dsp/arm/highbd_variance_neon.c | 67 +-------------- vpx_dsp/arm/highbd_variance_neon_dotprod.c | 96 ++++++++++++++++++++++ vpx_dsp/vpx_dsp.mk | 1 + vpx_dsp/vpx_dsp_rtcd_defs.pl | 8 +- 5 files changed, 113 insertions(+), 69 deletions(-) create mode 100644 vpx_dsp/arm/highbd_variance_neon_dotprod.c diff --git a/test/variance_test.cc b/test/variance_test.cc index c32c919760..e231df0c6b 100644 --- a/test/variance_test.cc +++ b/test/variance_test.cc @@ -1555,6 +1555,16 @@ INSTANTIATE_TEST_SUITE_P( MseParams(3, 4, &vpx_highbd_8_mse8x16_neon, VPX_BITS_8), MseParams(3, 3, &vpx_highbd_8_mse8x8_neon, VPX_BITS_8))); +#if HAVE_NEON_DOTPROD +INSTANTIATE_TEST_SUITE_P( + NEON_DOTPROD, VpxHBDMseTest, + ::testing::Values( + MseParams(4, 4, &vpx_highbd_8_mse16x16_neon_dotprod, VPX_BITS_8), + MseParams(4, 3, &vpx_highbd_8_mse16x8_neon_dotprod, VPX_BITS_8), + MseParams(3, 4, &vpx_highbd_8_mse8x16_neon_dotprod, VPX_BITS_8), + MseParams(3, 3, &vpx_highbd_8_mse8x8_neon_dotprod, VPX_BITS_8))); +#endif // HAVE_NEON_DOTPROD + INSTANTIATE_TEST_SUITE_P( NEON, VpxHBDVarianceTest, ::testing::Values( diff --git a/vpx_dsp/arm/highbd_variance_neon.c b/vpx_dsp/arm/highbd_variance_neon.c index e361f6f6f1..309ae7fd35 100644 --- a/vpx_dsp/arm/highbd_variance_neon.c +++ b/vpx_dsp/arm/highbd_variance_neon.c @@ -384,69 +384,6 @@ static INLINE uint32_t highbd_mse_wxh_neon(const uint16_t *src_ptr, return horizontal_add_uint32x4(vaddq_u32(sse_u32[0], sse_u32[1])); } -#if defined(__ARM_FEATURE_DOTPROD) - -static INLINE uint32_t highbd_mse8_8xh_neon(const uint16_t *src_ptr, - int src_stride, - const uint16_t *ref_ptr, - int ref_stride, int h) { - uint32x4_t sse_u32 = vdupq_n_u32(0); - - int i = h / 2; - do { - uint16x8_t s0, s1, r0, r1; - uint8x16_t s, r, diff; - - s0 = vld1q_u16(src_ptr); - src_ptr += src_stride; - s1 = vld1q_u16(src_ptr); - src_ptr += src_stride; - r0 = vld1q_u16(ref_ptr); - ref_ptr += ref_stride; - r1 = vld1q_u16(ref_ptr); - ref_ptr += ref_stride; - - s = vcombine_u8(vmovn_u16(s0), vmovn_u16(s1)); - r = vcombine_u8(vmovn_u16(r0), vmovn_u16(r1)); - - diff = vabdq_u8(s, r); - sse_u32 = vdotq_u32(sse_u32, diff, diff); - } while (--i != 0); - - return horizontal_add_uint32x4(sse_u32); -} - -static INLINE uint32_t highbd_mse8_16xh_neon(const uint16_t *src_ptr, - int src_stride, - const uint16_t *ref_ptr, - int ref_stride, int h) { - uint32x4_t sse_u32 = vdupq_n_u32(0); - - int i = h; - do { - uint16x8_t s0, s1, r0, r1; - uint8x16_t s, r, diff; - - s0 = vld1q_u16(src_ptr); - s1 = vld1q_u16(src_ptr + 8); - r0 = vld1q_u16(ref_ptr); - r1 = vld1q_u16(ref_ptr + 8); - - s = vcombine_u8(vmovn_u16(s0), vmovn_u16(s1)); - r = vcombine_u8(vmovn_u16(r0), vmovn_u16(r1)); - - diff = vabdq_u8(s, r); - sse_u32 = vdotq_u32(sse_u32, diff, diff); - - src_ptr += src_stride; - ref_ptr += ref_stride; - } while (--i != 0); - - return horizontal_add_uint32x4(sse_u32); -} - -#else // !defined(__ARM_FEATURE_DOTPROD) - static INLINE uint32_t highbd_mse8_8xh_neon(const uint16_t *src_ptr, int src_stride, const uint16_t *ref_ptr, @@ -461,8 +398,6 @@ static INLINE uint32_t highbd_mse8_16xh_neon(const uint16_t *src_ptr, return highbd_mse_wxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 16, h); } -#endif // defined(__ARM_FEATURE_DOTPROD) - #define HIGHBD_MSE_WXH_NEON(w, h) \ uint32_t vpx_highbd_8_mse##w##x##h##_neon( \ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ @@ -497,3 +432,5 @@ HIGHBD_MSE_WXH_NEON(16, 16) HIGHBD_MSE_WXH_NEON(16, 8) HIGHBD_MSE_WXH_NEON(8, 16) HIGHBD_MSE_WXH_NEON(8, 8) + +#undef HIGHBD_MSE_WXH_NEON diff --git a/vpx_dsp/arm/highbd_variance_neon_dotprod.c b/vpx_dsp/arm/highbd_variance_neon_dotprod.c new file mode 100644 index 0000000000..1a88720172 --- /dev/null +++ b/vpx_dsp/arm/highbd_variance_neon_dotprod.c @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_dsp_rtcd.h" +#include "./vpx_config.h" + +#include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" +#include "vpx_ports/mem.h" + +static INLINE uint32_t highbd_mse8_8xh_neon_dotprod(const uint16_t *src_ptr, + int src_stride, + const uint16_t *ref_ptr, + int ref_stride, int h) { + uint32x4_t sse_u32 = vdupq_n_u32(0); + + int i = h / 2; + do { + uint16x8_t s0, s1, r0, r1; + uint8x16_t s, r, diff; + + s0 = vld1q_u16(src_ptr); + src_ptr += src_stride; + s1 = vld1q_u16(src_ptr); + src_ptr += src_stride; + r0 = vld1q_u16(ref_ptr); + ref_ptr += ref_stride; + r1 = vld1q_u16(ref_ptr); + ref_ptr += ref_stride; + + s = vcombine_u8(vmovn_u16(s0), vmovn_u16(s1)); + r = vcombine_u8(vmovn_u16(r0), vmovn_u16(r1)); + + diff = vabdq_u8(s, r); + sse_u32 = vdotq_u32(sse_u32, diff, diff); + } while (--i != 0); + + return horizontal_add_uint32x4(sse_u32); +} + +static INLINE uint32_t highbd_mse8_16xh_neon_dotprod(const uint16_t *src_ptr, + int src_stride, + const uint16_t *ref_ptr, + int ref_stride, int h) { + uint32x4_t sse_u32 = vdupq_n_u32(0); + + int i = h; + do { + uint16x8_t s0, s1, r0, r1; + uint8x16_t s, r, diff; + + s0 = vld1q_u16(src_ptr); + s1 = vld1q_u16(src_ptr + 8); + r0 = vld1q_u16(ref_ptr); + r1 = vld1q_u16(ref_ptr + 8); + + s = vcombine_u8(vmovn_u16(s0), vmovn_u16(s1)); + r = vcombine_u8(vmovn_u16(r0), vmovn_u16(r1)); + + diff = vabdq_u8(s, r); + sse_u32 = vdotq_u32(sse_u32, diff, diff); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--i != 0); + + return horizontal_add_uint32x4(sse_u32); +} + +#define HIGHBD_MSE_WXH_NEON_DOTPROD(w, h) \ + uint32_t vpx_highbd_8_mse##w##x##h##_neon_dotprod( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + *sse = \ + highbd_mse8_##w##xh_neon_dotprod(src, src_stride, ref, ref_stride, h); \ + return *sse; \ + } + +HIGHBD_MSE_WXH_NEON_DOTPROD(16, 16) +HIGHBD_MSE_WXH_NEON_DOTPROD(16, 8) +HIGHBD_MSE_WXH_NEON_DOTPROD(8, 16) +HIGHBD_MSE_WXH_NEON_DOTPROD(8, 8) + +#undef HIGHBD_MSE_WXH_NEON_DOTPROD diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index 84fd969daa..5343088d1b 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -448,6 +448,7 @@ DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_impl_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/highbd_subpel_variance_impl_sse2.asm DSP_SRCS-$(HAVE_NEON) += arm/highbd_avg_pred_neon.c DSP_SRCS-$(HAVE_NEON) += arm/highbd_variance_neon.c +DSP_SRCS-$(HAVE_NEON_DOTPROD) += arm/highbd_variance_neon_dotprod.c DSP_SRCS-$(HAVE_NEON) += arm/highbd_subpel_variance_neon.c endif # CONFIG_VP9_HIGHBITDEPTH endif # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index c9cdc285f2..1012df11ec 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -1534,14 +1534,14 @@ () specialize qw/vpx_highbd_12_get8x8var sse2 neon/; add_proto qw/unsigned int vpx_highbd_8_mse16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_8_mse16x16 sse2 neon/; + specialize qw/vpx_highbd_8_mse16x16 sse2 neon neon_dotprod/; add_proto qw/unsigned int vpx_highbd_8_mse16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_8_mse16x8 neon/; + specialize qw/vpx_highbd_8_mse16x8 neon neon_dotprod/; add_proto qw/unsigned int vpx_highbd_8_mse8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_8_mse8x16 neon/; + specialize qw/vpx_highbd_8_mse8x16 neon neon_dotprod/; add_proto qw/unsigned int vpx_highbd_8_mse8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_8_mse8x8 sse2 neon/; + specialize qw/vpx_highbd_8_mse8x8 sse2 neon neon_dotprod/; add_proto qw/unsigned int vpx_highbd_10_mse16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_10_mse16x16 sse2 neon/; From d549cb74b9fc3f16aa3ed9f59459c791765cd723 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Tue, 12 Sep 2023 15:28:06 -0400 Subject: [PATCH 0884/1000] Add missing headers for clang-tidy warnings Change-Id: I97edec8ecffdcc79b8f3528deb60a3a0332ea0cc --- tools_common.c | 2 ++ vp9/encoder/arm/neon/vp9_quantize_neon.c | 1 + vp9/encoder/vp9_encoder.c | 2 ++ 3 files changed, 5 insertions(+) diff --git a/tools_common.c b/tools_common.c index 0fcab2cf29..5c13781513 100644 --- a/tools_common.c +++ b/tools_common.c @@ -24,6 +24,8 @@ #include "vpx/vp8dx.h" #endif +#include "vpx/vpx_codec.h" + #if defined(_WIN32) || defined(__OS2__) #include #include diff --git a/vp9/encoder/arm/neon/vp9_quantize_neon.c b/vp9/encoder/arm/neon/vp9_quantize_neon.c index 968cdc6d11..96d0614367 100644 --- a/vp9/encoder/arm/neon/vp9_quantize_neon.c +++ b/vp9/encoder/arm/neon/vp9_quantize_neon.c @@ -11,6 +11,7 @@ #include #include #include +#include #include "./vpx_config.h" #include "vpx_mem/vpx_mem.h" diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 869d557dd3..69a4e3c314 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -81,6 +81,8 @@ #include "vp9/encoder/vp9_tpl_model.h" #include "vp9/vp9_cx_iface.h" +#include "vpx/vpx_ext_ratectrl.h" + #define AM_SEGMENT_ID_INACTIVE 7 #define AM_SEGMENT_ID_ACTIVE 0 From 391bb5604b85195468e73d576766252f6ce8e427 Mon Sep 17 00:00:00 2001 From: yuanhecai Date: Sat, 15 Apr 2023 11:11:16 +0800 Subject: [PATCH 0885/1000] loongarch: simplify vpx_quantize_b/b_32x32_lsx args Bug: webm:1755 Change-Id: I42fdb1c34f959dd1204b343b8192e3d9b49821b4 --- test/vp9_quantize_test.cc | 14 +++++----- vpx_dsp/loongarch/quantize_lsx.c | 46 +++++++++++++++----------------- 2 files changed, 28 insertions(+), 32 deletions(-) diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc index f6984bd6fa..e00ab4022c 100644 --- a/test/vp9_quantize_test.cc +++ b/test/vp9_quantize_test.cc @@ -695,13 +695,13 @@ INSTANTIATE_TEST_SUITE_P( #endif // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH #if HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH -INSTANTIATE_TEST_SUITE_P(LSX, VP9QuantizeTest, - ::testing::Values(make_tuple(&vpx_quantize_b_lsx, - &vpx_quantize_b_c, - VPX_BITS_8, 16, false), - make_tuple(&vpx_quantize_b_32x32_lsx, - &vpx_quantize_b_32x32_c, - VPX_BITS_8, 32, false))); +INSTANTIATE_TEST_SUITE_P( + LSX, VP9QuantizeTest, + ::testing::Values(make_tuple(&vpx_quantize_b_lsx, &vpx_quantize_b_c, + VPX_BITS_8, 16, false), + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, + VPX_BITS_8, 32, false))); #endif // HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH // Only useful to compare "Speed" test results. diff --git a/vpx_dsp/loongarch/quantize_lsx.c b/vpx_dsp/loongarch/quantize_lsx.c index 77be0bb4fe..9bb1691e2e 100644 --- a/vpx_dsp/loongarch/quantize_lsx.c +++ b/vpx_dsp/loongarch/quantize_lsx.c @@ -11,6 +11,8 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "vpx_util/loongson_intrinsics.h" +#include "vp9/common/vp9_scan.h" +#include "vp9/encoder/vp9_block.h" static INLINE __m128i calculate_qcoeff(__m128i coeff, __m128i coeff_abs, __m128i round, __m128i quant, @@ -88,15 +90,15 @@ static INLINE int16_t accumulate_eob(__m128i eob) { } #if !CONFIG_VP9_HIGHBITDEPTH + void vpx_quantize_b_lsx(const int16_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, - int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan, - const int16_t *iscan) { + const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { __m128i zero = __lsx_vldi(0); int index = 16; + const int16_t *iscan = scan_order->iscan; __m128i zbin, round, quant, dequant, quant_shift; __m128i coeff0, coeff1; @@ -104,13 +106,11 @@ void vpx_quantize_b_lsx(const int16_t *coeff_ptr, intptr_t n_coeffs, __m128i cmp_mask0, cmp_mask1; __m128i eob, eob0; - (void)scan; - - zbin = __lsx_vld(zbin_ptr, 0); - round = __lsx_vld(round_ptr, 0); - quant = __lsx_vld(quant_ptr, 0); + zbin = __lsx_vld(mb_plane->zbin, 0); + round = __lsx_vld(mb_plane->round, 0); + quant = __lsx_vld(mb_plane->quant, 0); dequant = __lsx_vld(dequant_ptr, 0); - quant_shift = __lsx_vld(quant_shift_ptr, 0); + quant_shift = __lsx_vld(mb_plane->quant_shift, 0); // Handle one DC and first 15 AC. DUP2_ARG2(__lsx_vld, coeff_ptr, 0, coeff_ptr, 16, coeff0, coeff1); qcoeff0 = __lsx_vabsd_h(coeff0, zero); @@ -167,31 +167,27 @@ void vpx_quantize_b_lsx(const int16_t *coeff_ptr, intptr_t n_coeffs, *eob_ptr = accumulate_eob(eob); } -void vpx_quantize_b_32x32_lsx(const int16_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, - int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, +void vpx_quantize_b_32x32_lsx(const tran_low_t *coeff_ptr, + const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { __m128i zero = __lsx_vldi(0); int index; + const int16_t *iscan = scan_order->iscan; __m128i zbin, round, quant, dequant, quant_shift; __m128i coeff0, coeff1, qcoeff0, qcoeff1, cmp_mask0, cmp_mask1; __m128i eob = zero, eob0; - (void)scan; - (void)n_coeffs; - - zbin = __lsx_vld(zbin_ptr, 0); + zbin = __lsx_vld(mb_plane->zbin, 0); zbin = __lsx_vsrari_h(zbin, 1); - round = __lsx_vld(round_ptr, 0); + round = __lsx_vld(mb_plane->round, 0); round = __lsx_vsrari_h(round, 1); - quant = __lsx_vld(quant_ptr, 0); + quant = __lsx_vld(mb_plane->quant, 0); dequant = __lsx_vld(dequant_ptr, 0); - quant_shift = __lsx_vld(quant_shift_ptr, 0); + quant_shift = __lsx_vld(mb_plane->quant_shift, 0); quant_shift = __lsx_vslli_h(quant_shift, 1); // Handle one DC and first 15 AC. DUP2_ARG2(__lsx_vld, coeff_ptr, 0, coeff_ptr, 16, coeff0, coeff1); From eb232b662aeb9ccc76b6dde3f50c3f806b7b58fa Mon Sep 17 00:00:00 2001 From: yuanhecai Date: Tue, 12 Sep 2023 10:30:38 +0800 Subject: [PATCH 0886/1000] loongarch: Fix bugs from vp8_sixtap_predict4x4/16x16_lsx Bug: webm:1755 Change-Id: I7295e0f9a1551b8a418d5b65a2b7351df1fdc063 --- test/predict_test.cc | 8 ++++++++ test/test_intra_pred_speed.cc | 9 +++++++++ test/vp9_intrapred_test.cc | 9 +++++++++ vp8/common/loongarch/sixtap_filter_lsx.c | 23 ++++++++++++----------- 4 files changed, 38 insertions(+), 11 deletions(-) diff --git a/test/predict_test.cc b/test/predict_test.cc index fbf42077b3..474eab2cb5 100644 --- a/test/predict_test.cc +++ b/test/predict_test.cc @@ -350,6 +350,14 @@ INSTANTIATE_TEST_SUITE_P( make_tuple(4, 4, &vp8_sixtap_predict4x4_mmi))); #endif +#if HAVE_LSX +INSTANTIATE_TEST_SUITE_P( + LSX, SixtapPredictTest, + ::testing::Values(make_tuple(16, 16, &vp8_sixtap_predict16x16_lsx), + make_tuple(8, 8, &vp8_sixtap_predict8x8_lsx), + make_tuple(4, 4, &vp8_sixtap_predict4x4_lsx))); +#endif + class BilinearPredictTest : public PredictTestBase {}; TEST_P(BilinearPredictTest, TestWithRandomData) { diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc index 15303816b9..b013e0bd5d 100644 --- a/test/test_intra_pred_speed.cc +++ b/test/test_intra_pred_speed.cc @@ -348,6 +348,15 @@ INTRA_PRED_TEST(VSX, TestIntraPred32, vpx_dc_predictor_32x32_vsx, vpx_tm_predictor_32x32_vsx) #endif // HAVE_VSX +#if HAVE_LSX +INTRA_PRED_TEST(LSX, TestIntraPred8, vpx_dc_predictor_8x8_lsx, nullptr, nullptr, + nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, nullptr) +INTRA_PRED_TEST(LSX, TestIntraPred16, vpx_dc_predictor_16x16_lsx, nullptr, + nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, nullptr, nullptr) +#endif // HAVE_LSX + // ----------------------------------------------------------------------------- #if CONFIG_VP9_HIGHBITDEPTH diff --git a/test/vp9_intrapred_test.cc b/test/vp9_intrapred_test.cc index daaf768699..c69d43efbc 100644 --- a/test/vp9_intrapred_test.cc +++ b/test/vp9_intrapred_test.cc @@ -489,6 +489,15 @@ INSTANTIATE_TEST_SUITE_P( &vpx_v_predictor_32x32_c, 32, 8))); #endif // HAVE_VSX +#if HAVE_LSX +INSTANTIATE_TEST_SUITE_P( + LSX, VP9IntraPredTest, + ::testing::Values(IntraPredParam(&vpx_dc_predictor_8x8_lsx, + &vpx_dc_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_dc_predictor_16x16_lsx, + &vpx_dc_predictor_16x16_c, 16, 8))); +#endif // HAVE_LSX + #if CONFIG_VP9_HIGHBITDEPTH typedef void (*HighbdIntraPred)(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, diff --git a/vp8/common/loongarch/sixtap_filter_lsx.c b/vp8/common/loongarch/sixtap_filter_lsx.c index cd7ba54746..9867633415 100644 --- a/vp8/common/loongarch/sixtap_filter_lsx.c +++ b/vp8/common/loongarch/sixtap_filter_lsx.c @@ -1706,21 +1706,22 @@ void vp8_sixtap_predict4x4_lsx(uint8_t *RESTRICT src, int32_t src_stride, switch (xoffset) { case 0: { __m128i tp0; - tp0 = __lsx_vinsgr2vr_w(tp0, src, 0); - src += src_stride; - tp0 = __lsx_vinsgr2vr_w(tp0, src, 0); - src += src_stride; - tp0 = __lsx_vinsgr2vr_w(tp0, src, 0); - src += src_stride; - tp0 = __lsx_vinsgr2vr_w(tp0, src, 0); + tp0 = __lsx_vldrepl_w(src, 0); + src += src_stride; __lsx_vstelm_w(tp0, dst, 0, 0); dst += dst_stride; - __lsx_vstelm_w(tp0, dst, 0, 1); + tp0 = __lsx_vldrepl_w(src, 0); + src += src_stride; + __lsx_vstelm_w(tp0, dst, 0, 0); dst += dst_stride; - __lsx_vstelm_w(tp0, dst, 0, 2); + tp0 = __lsx_vldrepl_w(src, 0); + src += src_stride; + __lsx_vstelm_w(tp0, dst, 0, 0); dst += dst_stride; - __lsx_vstelm_w(tp0, dst, 0, 3); + tp0 = __lsx_vldrepl_w(src, 0); + __lsx_vstelm_w(tp0, dst, 0, 0); + break; } case 2: @@ -1865,7 +1866,7 @@ void vp8_sixtap_predict16x16_lsx(uint8_t *RESTRICT src, int32_t src_stride, case 1: Predict16x16Funcs1[3](src, src_stride, dst, dst_stride, - h_filter, v_filter + 1, 16); + h_filter + 1, v_filter + 1, 16); break; } break; From 9c2e33ff74a78a491c765478505e04c3c7be6849 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Thu, 14 Sep 2023 11:59:57 -0400 Subject: [PATCH 0887/1000] Set frame width height for 1st TPL GOP frame Change-Id: Ic92dfd232bf90e8cbe6c6233af523ed40d12097a --- vp9/encoder/vp9_tpl_model.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/vp9/encoder/vp9_tpl_model.c b/vp9/encoder/vp9_tpl_model.c index 02318070c2..e98ee459f0 100644 --- a/vp9/encoder/vp9_tpl_model.c +++ b/vp9/encoder/vp9_tpl_model.c @@ -175,7 +175,8 @@ static void free_tpl_frame_stats_list(VpxTplGopStats *tpl_gop_stats) { static void init_tpl_stats_before_propagation( struct vpx_internal_error_info *error_info, VpxTplGopStats *tpl_gop_stats, - TplDepFrame *tpl_stats, int tpl_gop_frames) { + TplDepFrame *tpl_stats, int tpl_gop_frames, int frame_width, + int frame_height) { int frame_idx; free_tpl_frame_stats_list(tpl_gop_stats); CHECK_MEM_ERROR( @@ -192,6 +193,8 @@ static void init_tpl_stats_before_propagation( sizeof( *tpl_gop_stats->frame_stats_list[frame_idx].block_stats_list))); tpl_gop_stats->frame_stats_list[frame_idx].num_blocks = mi_rows * mi_cols; + tpl_gop_stats->frame_stats_list[frame_idx].frame_width = frame_width; + tpl_gop_stats->frame_stats_list[frame_idx].frame_height = frame_height; } } @@ -1497,7 +1500,8 @@ void vp9_setup_tpl_stats(VP9_COMP *cpi) { init_tpl_stats(cpi); init_tpl_stats_before_propagation(&cpi->common.error, &cpi->tpl_gop_stats, - cpi->tpl_stats, tpl_group_frames); + cpi->tpl_stats, tpl_group_frames, + cpi->common.width, cpi->common.height); // Backward propagation from tpl_group_frames to 1. for (frame_idx = tpl_group_frames - 1; frame_idx > 0; --frame_idx) { From 8e61b3cd1bdd6db56cbd040fae8f7c318180f2a6 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Thu, 14 Sep 2023 13:27:39 -0400 Subject: [PATCH 0888/1000] Fix ref frame buffer in TPL stats for RC The original ref frame index was the index in the GF group; RC expects the index to be the one for ref frame buffer. Change-Id: I9a2b0e72b6332023fb2e8da131b557f82db02e39 --- vp9/encoder/vp9_tpl_model.c | 13 ++++++++----- vpx/vpx_tpl.h | 2 +- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/vp9/encoder/vp9_tpl_model.c b/vp9/encoder/vp9_tpl_model.c index e98ee459f0..b8910370e0 100644 --- a/vp9/encoder/vp9_tpl_model.c +++ b/vp9/encoder/vp9_tpl_model.c @@ -393,7 +393,7 @@ static void tpl_store_before_propagation(VpxTplBlockStats *tpl_block_stats, TplDepStats *tpl_stats, int mi_row, int mi_col, BLOCK_SIZE bsize, int stride, int64_t recon_error, - int64_t rate_cost) { + int64_t rate_cost, int ref_frame_idx) { const int mi_height = num_8x8_blocks_high_lookup[bsize]; const int mi_width = num_8x8_blocks_wide_lookup[bsize]; const TplDepStats *src_stats = &tpl_stats[mi_row * stride + mi_col]; @@ -411,7 +411,7 @@ static void tpl_store_before_propagation(VpxTplBlockStats *tpl_block_stats, tpl_block_stats_ptr->recrf_rate = rate_cost << TPL_DEP_COST_SCALE_LOG2; tpl_block_stats_ptr->mv_r = src_stats->mv.as_mv.row; tpl_block_stats_ptr->mv_c = src_stats->mv.as_mv.col; - tpl_block_stats_ptr->ref_frame_index = src_stats->ref_frame_index; + tpl_block_stats_ptr->ref_frame_index = ref_frame_idx; } } } @@ -576,7 +576,7 @@ static void mode_estimation(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, int mi_col, BLOCK_SIZE bsize, TX_SIZE tx_size, YV12_BUFFER_CONFIG *ref_frame[], uint8_t *predictor, int64_t *recon_error, int64_t *rate_cost, - int64_t *sse) { + int64_t *sse, int *ref_frame_idx) { VP9_COMMON *cm = &cpi->common; ThreadData *td = &cpi->td; @@ -723,6 +723,7 @@ static void mode_estimation(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, 1, (best_intra_cost << TPL_DEP_COST_SCALE_LOG2) / (mi_height * mi_width)); tpl_stats->ref_frame_index = gf_picture[frame_idx].ref_frame[best_rf_idx]; tpl_stats->mv.as_int = best_mv.as_int; + *ref_frame_idx = best_rf_idx; } #if CONFIG_NON_GREEDY_MV @@ -1232,10 +1233,12 @@ static void mc_flow_dispenser(VP9_COMP *cpi, GF_PICTURE *gf_picture, int64_t recon_error = 0; int64_t rate_cost = 0; int64_t sse = 0; + // Ref frame index in the ref frame buffer. + int ref_frame_idx = -1; mode_estimation(cpi, x, xd, &sf, gf_picture, frame_idx, tpl_frame, src_diff, coeff, qcoeff, dqcoeff, mi_row, mi_col, bsize, tx_size, ref_frame, predictor, &recon_error, &rate_cost, - &sse); + &sse, &ref_frame_idx); // Motion flow dependency dispenser. tpl_model_store(tpl_frame->tpl_stats_ptr, mi_row, mi_col, bsize, tpl_frame->stride); @@ -1243,7 +1246,7 @@ static void mc_flow_dispenser(VP9_COMP *cpi, GF_PICTURE *gf_picture, tpl_store_before_propagation( tpl_frame_stats_before_propagation->block_stats_list, tpl_frame->tpl_stats_ptr, mi_row, mi_col, bsize, tpl_frame->stride, - recon_error, rate_cost); + recon_error, rate_cost, ref_frame_idx); tpl_model_update(cpi->tpl_stats, tpl_frame->tpl_stats_ptr, mi_row, mi_col, bsize); diff --git a/vpx/vpx_tpl.h b/vpx/vpx_tpl.h index 3828eb8f29..a250aada60 100644 --- a/vpx/vpx_tpl.h +++ b/vpx/vpx_tpl.h @@ -44,7 +44,7 @@ typedef struct VpxTplBlockStats { int16_t mv_c; /**< Motion vector col */ int64_t recrf_rate; /**< Rate from reconstructed ref frame */ int64_t recrf_dist; /**< Distortion from reconstructed ref frame */ - int ref_frame_index; /**< Ref frame index */ + int ref_frame_index; /**< Ref frame index in the ref frame buffer */ } VpxTplBlockStats; /*!\brief Temporal dependency model stats for each frame before propagation */ From 8f8e7414684e97ea9b94710ac7853565c8a11c3a Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Thu, 14 Sep 2023 16:04:19 -0400 Subject: [PATCH 0889/1000] Add max/min_gf_interval to vpx_rc_config_t Bug: b/300499738 Change-Id: Id32cb5e3ce667539c0d1efe1ff5fcc7a49e35329 --- vp9/vp9_cx_iface.c | 3 ++- vpx/vpx_ext_ratectrl.h | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index 06c0ac1bfb..b1dfe992cf 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -1960,7 +1960,8 @@ static vpx_codec_err_t ctrl_set_external_rate_control(vpx_codec_alg_priv_t *ctx, ratectrl_config.frame_width = frame_info->frame_width; ratectrl_config.frame_height = frame_info->frame_height; ratectrl_config.show_frame_count = cpi->twopass.first_pass_info.num_frames; - + ratectrl_config.max_gf_interval = oxcf->max_gf_interval; + ratectrl_config.min_gf_interval = oxcf->min_gf_interval; // TODO(angiebird): Double check whether this is the proper way to set up // target_bitrate and frame_rate. ratectrl_config.target_bitrate_kbps = (int)(oxcf->target_bandwidth / 1000); diff --git a/vpx/vpx_ext_ratectrl.h b/vpx/vpx_ext_ratectrl.h index ef96be6fff..46d290dff4 100644 --- a/vpx/vpx_ext_ratectrl.h +++ b/vpx/vpx_ext_ratectrl.h @@ -307,6 +307,8 @@ typedef struct vpx_rc_config { int frame_width; /**< frame width */ int frame_height; /**< frame height */ int show_frame_count; /**< number of visible frames in the video */ + int max_gf_interval; /**< max GOP size in number of show frames */ + int min_gf_interval; /**< min GOP size in number of show frames */ /*! * Target bitrate in kilobytes per second */ From ad3301e6a3f70608b17fa8b61527a3bd2c711bbd Mon Sep 17 00:00:00 2001 From: Martin Storsjo Date: Fri, 15 Sep 2023 12:29:32 +0300 Subject: [PATCH 0890/1000] aarch64: Generalize Windows cpu detection to any Windows variant This cpu detection implementation doesn't do anything MSVC specific, it just calls the IsProcessorFeaturePresent function. This can be compiled with mingw compilers just as well. Change-Id: I55e607a47c8f5b70d9f707ef96b2fa7553f2f79f --- vpx_ports/aarch64_cpudetect.c | 2 +- vpx_ports/arm_cpudetect.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vpx_ports/aarch64_cpudetect.c b/vpx_ports/aarch64_cpudetect.c index a3054ad717..dad081c8ce 100644 --- a/vpx_ports/aarch64_cpudetect.c +++ b/vpx_ports/aarch64_cpudetect.c @@ -58,7 +58,7 @@ static int arm_get_cpu_caps(void) { return flags; } -#elif defined(_MSC_VER) // end __APPLE__ +#elif defined(_WIN32) // end __APPLE__ static int arm_get_cpu_caps(void) { int flags = 0; diff --git a/vpx_ports/arm_cpudetect.h b/vpx_ports/arm_cpudetect.h index 24095d1acf..881397abc2 100644 --- a/vpx_ports/arm_cpudetect.h +++ b/vpx_ports/arm_cpudetect.h @@ -14,7 +14,7 @@ #include "vpx_config.h" #include "vpx_ports/arm.h" -#if defined(_MSC_VER) +#if defined(_WIN32) #undef WIN32_LEAN_AND_MEAN #define WIN32_LEAN_AND_MEAN #undef WIN32_EXTRA_LEAN From 67bfb41ed8598edfb25bd6f245f9c39a68808548 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Thu, 21 Sep 2023 15:26:01 -0400 Subject: [PATCH 0891/1000] Do not call WebM RC for new GOP at the end of seq define_gf_group is called at the last frame of each GOP to get GOP size for next one, which means it'll also be called at the last GOP of the sequence, when calling WebM RC will be returned with error since WebM RC does not have any more GOP to return. When gop_coding_frames from the encoder is 1, it means it's running out of firstpass stats, which means end of sequence. Bug: b/299610956 Change-Id: I30e077a28fe41593ebabbc1dc0c2915a4bcbece3 --- test/vp9_ext_ratectrl_test.cc | 4 ++-- vp9/encoder/vp9_firstpass.c | 17 +++++++++++------ 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/test/vp9_ext_ratectrl_test.cc b/test/vp9_ext_ratectrl_test.cc index e0107b2d26..b43430586d 100644 --- a/test/vp9_ext_ratectrl_test.cc +++ b/test/vp9_ext_ratectrl_test.cc @@ -338,7 +338,7 @@ vpx_rc_status_t rc_get_encodeframe_decision_gop_short( EXPECT_EQ(encode_frame_info->show_index, 3); EXPECT_EQ(encode_frame_info->gop_index, 0); EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeGolden); - EXPECT_EQ(toy_rate_ctrl->gop_global_index, 2); + EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1); } // When the model recommends an invalid q, valid range [0, 255], @@ -398,7 +398,7 @@ vpx_rc_status_t rc_get_encodeframe_decision_gop_short_overlay( EXPECT_EQ(encode_frame_info->show_index, 3); EXPECT_EQ(encode_frame_info->gop_index, 0); EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeOverlay); - EXPECT_EQ(toy_rate_ctrl->gop_global_index, 2); + EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1); } // When the model recommends an invalid q, valid range [0, 255], diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index 1e6f6f7b3b..3ec7ba5400 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -2504,7 +2504,7 @@ static int get_gop_coding_frame_num( int *use_alt_ref, const FRAME_INFO *frame_info, const TWO_PASS *const twopass, const RATE_CONTROL *rc, int gf_start_show_idx, const RANGE *active_gf_interval, - double gop_intra_factor, int lag_in_frames) { + double gop_intra_factor, int lag_in_frames, int *end_of_sequence) { const FIRST_PASS_INFO *first_pass_info = &twopass->first_pass_info; double loop_decay_rate = 1.00; double mv_ratio_accumulator = 0.0; @@ -2530,6 +2530,7 @@ static int get_gop_coding_frame_num( next_frame = fps_get_frame_stats(first_pass_info, gf_start_show_idx + gop_coding_frames); if (next_frame == NULL) { + *end_of_sequence = (gop_coding_frames == 1); break; } @@ -2720,6 +2721,8 @@ static void define_gf_group(VP9_COMP *cpi, int gf_start_show_idx) { double gop_intra_factor; int gop_frames; RANGE active_gf_interval; + // Whether this is at the end of last GOP of this sequence. + int end_of_sequence = 0; // Reset the GF group data structures unless this is a key // frame in which case it will already have been done. @@ -2751,7 +2754,8 @@ static void define_gf_group(VP9_COMP *cpi, int gf_start_show_idx) { gop_coding_frames = get_gop_coding_frame_num( &use_alt_ref, frame_info, twopass, rc, gf_start_show_idx, - &active_gf_interval, gop_intra_factor, cpi->oxcf.lag_in_frames); + &active_gf_interval, gop_intra_factor, cpi->oxcf.lag_in_frames, + &end_of_sequence); use_alt_ref &= allow_alt_ref; #if CONFIG_RATE_CTRL // If the external gop_command is on, we will override the decisions @@ -2770,7 +2774,7 @@ static void define_gf_group(VP9_COMP *cpi, int gf_start_show_idx) { // will be overwritten. if (cpi->ext_ratectrl.ready && (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_GOP) != 0 && - cpi->ext_ratectrl.funcs.get_gop_decision != NULL) { + cpi->ext_ratectrl.funcs.get_gop_decision != NULL && !end_of_sequence) { vpx_codec_err_t codec_status; vpx_rc_gop_decision_t gop_decision; vpx_rc_gop_info_t gop_info; @@ -3805,6 +3809,7 @@ int vp9_get_gop_coding_frame_count(const VP9EncoderConfig *oxcf, const int arf_active_or_kf = last_gop_use_alt_ref || first_is_key_frame; RANGE active_gf_interval; int arf_layers; + int end_of_sequence = 0; if (oxcf->use_simple_encode_api) { active_gf_interval = get_active_gf_inverval_range_simple( rc->min_gf_interval, arf_active_or_kf, rc->frames_to_key); @@ -3822,9 +3827,9 @@ int vp9_get_gop_coding_frame_count(const VP9EncoderConfig *oxcf, gop_intra_factor = 1.0; } - frame_count = get_gop_coding_frame_num(use_alt_ref, frame_info, twopass, rc, - show_idx, &active_gf_interval, - gop_intra_factor, oxcf->lag_in_frames); + frame_count = get_gop_coding_frame_num( + use_alt_ref, frame_info, twopass, rc, show_idx, &active_gf_interval, + gop_intra_factor, oxcf->lag_in_frames, &end_of_sequence); *use_alt_ref &= allow_alt_ref; return frame_count; } From af6dedd715f4307669366944cca6e0417b290282 Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 25 Sep 2023 18:53:41 -0700 Subject: [PATCH 0892/1000] encode_api_test: add ConfigResizeChangeThreadCount Update thread counts and resolution to ensure allocations are updated correctly. VP8 is disabled to avoid a crash. Bug: chromium:1486441 Change-Id: Ie89776d9818d27dc351eff298a44c699e850761b --- test/encode_api_test.cc | 50 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 49 insertions(+), 1 deletion(-) diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc index c8bd7daa4a..a8a4df2ddf 100644 --- a/test/encode_api_test.cc +++ b/test/encode_api_test.cc @@ -308,7 +308,6 @@ TEST(EncodeAPI, SetRoi) { void InitCodec(const vpx_codec_iface_t &iface, int width, int height, vpx_codec_ctx_t *enc, vpx_codec_enc_cfg_t *cfg) { - ASSERT_EQ(vpx_codec_enc_config_default(&iface, cfg, 0), VPX_CODEC_OK); cfg->g_w = width; cfg->g_h = height; cfg->g_lag_in_frames = 0; @@ -346,6 +345,7 @@ TEST(EncodeAPI, ConfigChangeThreadCount) { vpx_codec_ctx_t ctx = {}; } enc; + ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK); EXPECT_NO_FATAL_FAILURE( InitCodec(*iface, kWidth, kHeight, &enc.ctx, &cfg)); if (IsVP9(iface)) { @@ -364,6 +364,54 @@ TEST(EncodeAPI, ConfigChangeThreadCount) { } } +TEST(EncodeAPI, ConfigResizeChangeThreadCount) { + constexpr int kInitWidth = 1024; + constexpr int kInitHeight = 1024; + + for (const auto *iface : kCodecIfaces) { + SCOPED_TRACE(vpx_codec_iface_name(iface)); + if (!IsVP9(iface)) { + GTEST_SKIP() << "TODO(https://crbug.com/1486441) remove this condition " + "after VP8 is fixed."; + } + for (int i = 0; i < (IsVP9(iface) ? 2 : 1); ++i) { + vpx_codec_enc_cfg_t cfg = {}; + struct Encoder { + ~Encoder() { EXPECT_EQ(vpx_codec_destroy(&ctx), VPX_CODEC_OK); } + vpx_codec_ctx_t ctx = {}; + } enc; + + ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK); + // Start in threaded mode to ensure resolution and thread related + // allocations are updated correctly across changes in resolution and + // thread counts. See https://crbug.com/1486441. + cfg.g_threads = 4; + EXPECT_NO_FATAL_FAILURE( + InitCodec(*iface, kInitWidth, kInitHeight, &enc.ctx, &cfg)); + if (IsVP9(iface)) { + EXPECT_EQ(vpx_codec_control_(&enc.ctx, VP9E_SET_TILE_COLUMNS, 6), + VPX_CODEC_OK); + EXPECT_EQ(vpx_codec_control_(&enc.ctx, VP9E_SET_ROW_MT, i), + VPX_CODEC_OK); + } + + cfg.g_w = 1000; + cfg.g_h = 608; + EXPECT_EQ(vpx_codec_enc_config_set(&enc.ctx, &cfg), VPX_CODEC_OK) + << vpx_codec_error_detail(&enc.ctx); + + cfg.g_w = 16; + cfg.g_h = 720; + + for (const auto threads : { 1, 4, 8, 6, 2, 1 }) { + cfg.g_threads = threads; + EXPECT_NO_FATAL_FAILURE(EncodeWithConfig(cfg, &enc.ctx)) + << "iteration: " << i << " threads: " << threads; + } + } + } +} + #if CONFIG_VP9_ENCODER class EncodeApiGetTplStatsTest : public ::libvpx_test::EncoderTest, From 3fbd1dca6a4d2dad332a2110d646e4ffef36d590 Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 25 Sep 2023 18:55:59 -0700 Subject: [PATCH 0893/1000] VP8: disallow thread count changes Currently allocations are done at encoder creation time. Going from threaded to non-threaded would cause a crash. Bug: chromium:1486441 Change-Id: Ie301c2a70847dff2f0daae408fbef1e4d42e73d4 --- test/encode_api_test.cc | 4 ---- vp8/encoder/onyx_if.c | 5 +++++ 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc index a8a4df2ddf..f1c98b2c71 100644 --- a/test/encode_api_test.cc +++ b/test/encode_api_test.cc @@ -370,10 +370,6 @@ TEST(EncodeAPI, ConfigResizeChangeThreadCount) { for (const auto *iface : kCodecIfaces) { SCOPED_TRACE(vpx_codec_iface_name(iface)); - if (!IsVP9(iface)) { - GTEST_SKIP() << "TODO(https://crbug.com/1486441) remove this condition " - "after VP8 is fixed."; - } for (int i = 0; i < (IsVP9(iface) ? 2 : 1); ++i) { vpx_codec_enc_cfg_t cfg = {}; struct Encoder { diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index c65afc643b..c5e9970c3c 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -1447,6 +1447,11 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) { last_h = cpi->oxcf.Height; prev_number_of_layers = cpi->oxcf.number_of_layers; + if (cpi->initial_width) { + // TODO(https://crbug.com/1486441): Allow changing thread counts; the + // allocation is done once in vp8_create_compressor(). + oxcf->multi_threaded = cpi->oxcf.multi_threaded; + } cpi->oxcf = *oxcf; switch (cpi->oxcf.Mode) { From 61f868bcf78d167df63ac9e792871f75b6b6e1f0 Mon Sep 17 00:00:00 2001 From: Yunqing Wang Date: Thu, 21 Sep 2023 13:49:09 -0700 Subject: [PATCH 0894/1000] Modify vp9_c_vs_simd_enc_test script Applied James' change to the script. Enabled the test: vp9_c_vs_simd_enc_test BUG=webm:1800 Change-Id: If1e33e5ccb6ca9315004f3e3f5b910f8a8255317 --- test/examples.sh | 2 +- test/vp9_c_vs_simd_encode.sh | 74 ++++++++++++++++++------------------ 2 files changed, 38 insertions(+), 38 deletions(-) diff --git a/test/examples.sh b/test/examples.sh index c15a367f3c..629f04239c 100755 --- a/test/examples.sh +++ b/test/examples.sh @@ -15,7 +15,7 @@ example_tests=$(ls $(dirname $0)/*.sh) # List of script names to exclude. -exclude_list="examples stress tools_common vp9_c_vs_simd_encode" +exclude_list="examples stress tools_common" # Filter out the scripts in $exclude_list. for word in ${exclude_list}; do diff --git a/test/vp9_c_vs_simd_encode.sh b/test/vp9_c_vs_simd_encode.sh index e3d3624ed4..dfd4b93a1f 100755 --- a/test/vp9_c_vs_simd_encode.sh +++ b/test/vp9_c_vs_simd_encode.sh @@ -10,6 +10,8 @@ ## ## This script checks the bit exactness between C and SIMD ## implementations of VP9 encoder. +## +. $(dirname $0)/tools_common.sh TEST_BITRATES="1600 6400" PRESETS="good rt" @@ -17,7 +19,6 @@ TEST_CLIPS="yuv_raw_input y4m_360p_10bit_input yuv_480p_raw_input y4m_720p_input OUT_FILE_SUFFIX=".ivf" SCRIPT_DIR=$(dirname "$0") LIBVPX_SOURCE_DIR=$(cd ${SCRIPT_DIR}/..; pwd) -devnull='> /dev/null 2>&1' # Clips used in test. YUV_RAW_INPUT="${LIBVPX_TEST_DATA_PATH}/hantro_collage_w352h288.yuv" @@ -87,9 +88,11 @@ vp9_c_vs_simd_enc_verify_environment () { fi } -cleanup() { - rm -rf ${VPX_TEST_OUTPUT_DIR} -} +# This is not needed since tools_common.sh does the same cleanup. +# Keep the code here for our reference. +# cleanup() { +# rm -rf ${VPX_TEST_OUTPUT_DIR} +# } # Echo VPX_SIMD_CAPS_MASK for different instruction set architecture. avx512f() { @@ -143,10 +146,12 @@ y4m_360p_10bit_input() { has_x86_isa_extn() { instruction_set=$1 - grep -q "$instruction_set" /proc/cpuinfo - if [ $? -eq 1 ]; then + if ! grep -q "$instruction_set" /proc/cpuinfo; then + # This instruction_set is not supported. return 1 fi + # This instruction_set is supported. + return 0 } # Echo good encode params for use with VP9 encoder. @@ -238,9 +243,8 @@ compare_enc_output() { local clip=$3 local bitrate=$4 local preset=$5 - diff ${VPX_TEST_OUTPUT_DIR}/Out-generic-gnu-"${clip}"-${preset}-${bitrate}kbps-cpu${cpu}${OUT_FILE_SUFFIX} \ - ${VPX_TEST_OUTPUT_DIR}/Out-${target}-"${clip}"-${preset}-${bitrate}kbps-cpu${cpu}${OUT_FILE_SUFFIX} > /dev/null - if [ $? -eq 1 ]; then + if ! diff -q ${VPX_TEST_OUTPUT_DIR}/Out-generic-gnu-"${clip}"-${preset}-${bitrate}kbps-cpu${cpu}${OUT_FILE_SUFFIX} \ + ${VPX_TEST_OUTPUT_DIR}/Out-${target}-"${clip}"-${preset}-${bitrate}kbps-cpu${cpu}${OUT_FILE_SUFFIX}; then elog "C vs ${target} encode mismatches for ${clip}, at ${bitrate} kbps, speed ${cpu}, ${preset} preset" return 1 fi @@ -281,8 +285,8 @@ vp9_enc_test() { ${devnull} if [ "${target}" != "generic-gnu" ]; then - compare_enc_output ${target} $cpu ${clip} $bitrate ${preset} - if [ $? -eq 1 ]; then + if ! compare_enc_output ${target} $cpu ${clip} $bitrate ${preset}; then + # Find the mismatch return 1 fi fi @@ -322,8 +326,7 @@ vp9_test_generic() { vp9_test_x86() { local arch=$1 - uname -m | grep -q "x86" - if [ $? -eq 1 ]; then + if ! uname -m | grep -q "x86"; then elog "Machine architecture is not x86 or x86_64" return 0 fi @@ -341,14 +344,14 @@ vp9_test_x86() { vp9_enc_build ${target} ${configure} local encoder="$(vp9_enc_tool_path "${target}")" for isa in $x86_isa_variants; do - has_x86_isa_extn $isa - if [ $? -eq 1 ]; then + # Note that if has_x86_isa_extn returns 1, it is false, and vice versa. + if ! has_x86_isa_extn $isa; then echo "${isa} is not supported in this machine" continue fi export VPX_SIMD_CAPS_MASK=$($isa) - vp9_enc_test $encoder ${target} - if [ $? -eq 1 ]; then + if ! vp9_enc_test $encoder ${target}; then + # Find the mismatch return 1 fi unset VPX_SIMD_CAPS_MASK @@ -363,8 +366,8 @@ vp9_test_arm() { vp9_enc_build ${target} "${configure}" local encoder="$(vp9_enc_tool_path "${target}")" - vp9_enc_test "qemu-aarch64 -L /usr/aarch64-linux-gnu ${encoder}" ${target} - if [ $? -eq 1 ]; then + if ! vp9_enc_test "qemu-aarch64 -L /usr/aarch64-linux-gnu ${encoder}" ${target}; then + # Find the mismatch return 1 fi } @@ -373,21 +376,23 @@ vp9_c_vs_simd_enc_test () { # Test Generic vp9_test_generic + # TODO(webm:1816): Enable x86 test once issue 1816 is fixed. + # Details: https://bugs.chromium.org/p/webm/issues/detail?id=1816 # Test x86 (32 bit) - echo "vp9 test for x86 (32 bit): Started." - vp9_test_x86 "x86" - if [ $? -eq 1 ]; then - echo "vp9 test for x86 (32 bit): Done, test failed." - else - echo "vp9 test for x86 (32 bit): Done, all tests passed." - fi + # echo "vp9 test for x86 (32 bit): Started." + # if ! vp9_test_x86 "x86"; then + # echo "vp9 test for x86 (32 bit): Done, test failed." + # return 1 + # else + # echo "vp9 test for x86 (32 bit): Done, all tests passed." + # fi # Test x86_64 (64 bit) if [ "$(eval uname -m)" = "x86_64" ]; then echo "vp9 test for x86_64 (64 bit): Started." - vp9_test_x86 "x86_64" - if [ $? -eq 1 ]; then + if ! vp9_test_x86 "x86_64"; then echo "vp9 test for x86_64 (64 bit): Done, test failed." + return 1 else echo "vp9 test for x86_64 (64 bit): Done, all tests passed." fi @@ -395,20 +400,15 @@ vp9_c_vs_simd_enc_test () { # Test ARM echo "vp9_test_arm: Started." - vp9_test_arm - if [ $? -eq 1 ]; then + if ! vp9_test_arm; then echo "vp9 test for arm: Done, test failed." + return 1 else echo "vp9 test for arm: Done, all tests passed." fi } # Setup a trap function to clean up build, and output files after tests complete. -trap cleanup EXIT +# trap cleanup EXIT -vp9_c_vs_simd_enc_verify_environment -if [ $? -eq 1 ]; then - echo "Environment check failed." - exit 1 -fi -vp9_c_vs_simd_enc_test +run_tests vp9_c_vs_simd_enc_verify_environment vp9_c_vs_simd_enc_test From 452199ca85a3c968d31115345109c6d00d2a485b Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 25 Sep 2023 18:53:41 -0700 Subject: [PATCH 0895/1000] encode_api_test: add ConfigResizeChangeThreadCount Update thread counts and resolution to ensure allocations are updated correctly. VP8 is disabled to avoid a crash. Bug: chromium:1486441 Change-Id: Ie89776d9818d27dc351eff298a44c699e850761b (cherry picked from commit af6dedd715f4307669366944cca6e0417b290282) --- test/encode_api_test.cc | 50 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 49 insertions(+), 1 deletion(-) diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc index ecdf928343..02aedc0575 100644 --- a/test/encode_api_test.cc +++ b/test/encode_api_test.cc @@ -304,7 +304,6 @@ TEST(EncodeAPI, SetRoi) { void InitCodec(const vpx_codec_iface_t &iface, int width, int height, vpx_codec_ctx_t *enc, vpx_codec_enc_cfg_t *cfg) { - ASSERT_EQ(vpx_codec_enc_config_default(&iface, cfg, 0), VPX_CODEC_OK); cfg->g_w = width; cfg->g_h = height; cfg->g_lag_in_frames = 0; @@ -342,6 +341,7 @@ TEST(EncodeAPI, ConfigChangeThreadCount) { vpx_codec_ctx_t ctx = {}; } enc; + ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK); EXPECT_NO_FATAL_FAILURE( InitCodec(*iface, kWidth, kHeight, &enc.ctx, &cfg)); if (IsVP9(iface)) { @@ -360,4 +360,52 @@ TEST(EncodeAPI, ConfigChangeThreadCount) { } } +TEST(EncodeAPI, ConfigResizeChangeThreadCount) { + constexpr int kInitWidth = 1024; + constexpr int kInitHeight = 1024; + + for (const auto *iface : kCodecIfaces) { + SCOPED_TRACE(vpx_codec_iface_name(iface)); + if (!IsVP9(iface)) { + GTEST_SKIP() << "TODO(https://crbug.com/1486441) remove this condition " + "after VP8 is fixed."; + } + for (int i = 0; i < (IsVP9(iface) ? 2 : 1); ++i) { + vpx_codec_enc_cfg_t cfg = {}; + struct Encoder { + ~Encoder() { EXPECT_EQ(vpx_codec_destroy(&ctx), VPX_CODEC_OK); } + vpx_codec_ctx_t ctx = {}; + } enc; + + ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK); + // Start in threaded mode to ensure resolution and thread related + // allocations are updated correctly across changes in resolution and + // thread counts. See https://crbug.com/1486441. + cfg.g_threads = 4; + EXPECT_NO_FATAL_FAILURE( + InitCodec(*iface, kInitWidth, kInitHeight, &enc.ctx, &cfg)); + if (IsVP9(iface)) { + EXPECT_EQ(vpx_codec_control_(&enc.ctx, VP9E_SET_TILE_COLUMNS, 6), + VPX_CODEC_OK); + EXPECT_EQ(vpx_codec_control_(&enc.ctx, VP9E_SET_ROW_MT, i), + VPX_CODEC_OK); + } + + cfg.g_w = 1000; + cfg.g_h = 608; + EXPECT_EQ(vpx_codec_enc_config_set(&enc.ctx, &cfg), VPX_CODEC_OK) + << vpx_codec_error_detail(&enc.ctx); + + cfg.g_w = 16; + cfg.g_h = 720; + + for (const auto threads : { 1, 4, 8, 6, 2, 1 }) { + cfg.g_threads = threads; + EXPECT_NO_FATAL_FAILURE(EncodeWithConfig(cfg, &enc.ctx)) + << "iteration: " << i << " threads: " << threads; + } + } + } +} + } // namespace From baed1218776fba096c05c1c683564ba4523d17e5 Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 25 Sep 2023 18:55:59 -0700 Subject: [PATCH 0896/1000] VP8: disallow thread count changes Currently allocations are done at encoder creation time. Going from threaded to non-threaded would cause a crash. Bug: chromium:1486441 Change-Id: Ie301c2a70847dff2f0daae408fbef1e4d42e73d4 (cherry picked from commit 3fbd1dca6a4d2dad332a2110d646e4ffef36d590) --- test/encode_api_test.cc | 4 ---- vp8/encoder/onyx_if.c | 5 +++++ 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc index 02aedc0575..e0e793b156 100644 --- a/test/encode_api_test.cc +++ b/test/encode_api_test.cc @@ -366,10 +366,6 @@ TEST(EncodeAPI, ConfigResizeChangeThreadCount) { for (const auto *iface : kCodecIfaces) { SCOPED_TRACE(vpx_codec_iface_name(iface)); - if (!IsVP9(iface)) { - GTEST_SKIP() << "TODO(https://crbug.com/1486441) remove this condition " - "after VP8 is fixed."; - } for (int i = 0; i < (IsVP9(iface) ? 2 : 1); ++i) { vpx_codec_enc_cfg_t cfg = {}; struct Encoder { diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index 4bbeadef01..148a16cc49 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -1443,6 +1443,11 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) { last_h = cpi->oxcf.Height; prev_number_of_layers = cpi->oxcf.number_of_layers; + if (cpi->initial_width) { + // TODO(https://crbug.com/1486441): Allow changing thread counts; the + // allocation is done once in vp8_create_compressor(). + oxcf->multi_threaded = cpi->oxcf.multi_threaded; + } cpi->oxcf = *oxcf; switch (cpi->oxcf.Mode) { From 263682c9a29395055f3b3afe2d97be1828a6223f Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Thu, 30 Jun 2022 13:48:56 -0400 Subject: [PATCH 0897/1000] Fix bug with smaller width bigger size Fixed previous patch that clusterfuzz failed on. Bug: webm:1642 Change-Id: If0e08e72abd2e042efe4dcfac21e4cc51afdfdb9 --- test/resize_test.cc | 11 +++-------- vp9/common/vp9_alloccommon.c | 13 ++++++------- vp9/encoder/vp9_encoder.c | 27 +++++++++++++++++++++++++-- 3 files changed, 34 insertions(+), 17 deletions(-) diff --git a/test/resize_test.cc b/test/resize_test.cc index fd1c2a92de..20ad2229b4 100644 --- a/test/resize_test.cc +++ b/test/resize_test.cc @@ -102,11 +102,8 @@ void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w, if (frame < 30) { return; } - if (frame < 100) { - *w = initial_w * 7 / 10; - *h = initial_h * 16 / 10; - return; - } + *w = initial_w * 7 / 10; + *h = initial_h * 16 / 10; return; } if (frame < 10) { @@ -559,9 +556,7 @@ TEST_P(ResizeRealtimeTest, TestExternalResizeWorks) { } } -// TODO(https://crbug.com/webm/1642): This causes a segfault in -// init_encode_frame_mb_context(). -TEST_P(ResizeRealtimeTest, DISABLED_TestExternalResizeSmallerWidthBiggerSize) { +TEST_P(ResizeRealtimeTest, TestExternalResizeSmallerWidthBiggerSize) { ResizingVideoSource video; video.flag_codec_ = true; video.smaller_width_larger_size_ = true; diff --git a/vp9/common/vp9_alloccommon.c b/vp9/common/vp9_alloccommon.c index e53883f621..9e73e40ea0 100644 --- a/vp9/common/vp9_alloccommon.c +++ b/vp9/common/vp9_alloccommon.c @@ -135,13 +135,6 @@ int vp9_alloc_context_buffers(VP9_COMMON *cm, int width, int height) { cm->free_mi(cm); if (cm->alloc_mi(cm, new_mi_size)) goto fail; } - - if (cm->seg_map_alloc_size < cm->mi_rows * cm->mi_cols) { - // Create the segmentation map structure and set to 0. - free_seg_map(cm); - if (alloc_seg_map(cm, cm->mi_rows * cm->mi_cols)) goto fail; - } - if (cm->above_context_alloc_cols < cm->mi_cols) { vpx_free(cm->above_context); cm->above_context = (ENTROPY_CONTEXT *)vpx_calloc( @@ -156,6 +149,12 @@ int vp9_alloc_context_buffers(VP9_COMMON *cm, int width, int height) { cm->above_context_alloc_cols = cm->mi_cols; } + if (cm->seg_map_alloc_size < cm->mi_rows * cm->mi_cols) { + // Create the segmentation map structure and set to 0. + free_seg_map(cm); + if (alloc_seg_map(cm, cm->mi_rows * cm->mi_cols)) goto fail; + } + if (vp9_alloc_loop_filter(cm)) goto fail; return 0; diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 69a4e3c314..e3ba294c32 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -2047,6 +2047,17 @@ static void alloc_copy_partition_data(VP9_COMP *cpi) { } } +static void free_copy_partition_data(VP9_COMP *cpi) { + vpx_free(cpi->prev_partition); + cpi->prev_partition = NULL; + vpx_free(cpi->prev_segment_id); + cpi->prev_segment_id = NULL; + vpx_free(cpi->prev_variance_low); + cpi->prev_variance_low = NULL; + vpx_free(cpi->copied_frame_cnt); + cpi->copied_frame_cnt = NULL; +} + void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { VP9_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; @@ -2126,6 +2137,8 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { new_mi_size = cm->mi_stride * calc_mi_size(cm->mi_rows); if (cm->mi_alloc_size < new_mi_size) { vp9_free_context_buffers(cm); + vp9_free_pc_tree(&cpi->td); + vpx_free(cpi->mbmi_ext_base); alloc_compressor_data(cpi); realloc_segmentation_maps(cpi); cpi->initial_width = cpi->initial_height = 0; @@ -2144,8 +2157,18 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { update_frame_size(cpi); if (last_w != cpi->oxcf.width || last_h != cpi->oxcf.height) { - memset(cpi->consec_zero_mv, 0, - cm->mi_rows * cm->mi_cols * sizeof(*cpi->consec_zero_mv)); + vpx_free(cpi->consec_zero_mv); + CHECK_MEM_ERROR( + &cm->error, cpi->consec_zero_mv, + vpx_calloc(cm->mi_rows * cm->mi_cols, sizeof(*cpi->consec_zero_mv))); + + vpx_free(cpi->skin_map); + CHECK_MEM_ERROR( + &cm->error, cpi->skin_map, + vpx_calloc(cm->mi_rows * cm->mi_cols, sizeof(cpi->skin_map[0]))); + + free_copy_partition_data(cpi); + alloc_copy_partition_data(cpi); if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) vp9_cyclic_refresh_reset_resize(cpi); rc->rc_1_frame = 0; From 58a854fb271f99a15f3d2687587f74a6d12c8512 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Wed, 27 Sep 2023 16:12:12 -0700 Subject: [PATCH 0898/1000] Skip the y4m_360p_10bit_input clip for armv8 It is a known mismatch. Bug: webm:1819 Change-Id: Ieb707a6f53ddf6c7b0d1202c6520599d3e45da76 --- test/vp9_c_vs_simd_encode.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/vp9_c_vs_simd_encode.sh b/test/vp9_c_vs_simd_encode.sh index dfd4b93a1f..22503545a1 100755 --- a/test/vp9_c_vs_simd_encode.sh +++ b/test/vp9_c_vs_simd_encode.sh @@ -277,6 +277,10 @@ vp9_enc_test() { for cpu in $(seq 0 $max_cpu_used); do for clip in ${TEST_CLIPS}; do + # TODO(webm:1819): Delete this if statement once issue 1819 is fixed. + if [ "${clip}" = "y4m_360p_10bit_input" ] && [ "${target}" = "armv8-linux-gcc" ]; then + continue + fi for bitrate in ${TEST_BITRATES}; do eval "${encoder}" $($clip) $($test_params) \ "--limit=${VP9_ENCODE_C_VS_SIMD_TEST_FRAME_LIMIT}" \ From e462a0de03fc4daca2ba32a5ba29c4182263e43c Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Thu, 28 Sep 2023 08:58:36 -0700 Subject: [PATCH 0899/1000] vp9_c_vs_simd_encode: Restore cwd at end of test Restore the original current directory at the end of vp9_c_vs_simd_enc_test(). Bug: webm:1800 Change-Id: Iad64848a231e3c900149cc2b248055b02dda80a6 --- test/vp9_c_vs_simd_encode.sh | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/test/vp9_c_vs_simd_encode.sh b/test/vp9_c_vs_simd_encode.sh index 22503545a1..82939470f6 100755 --- a/test/vp9_c_vs_simd_encode.sh +++ b/test/vp9_c_vs_simd_encode.sh @@ -65,7 +65,7 @@ vp9_enc_tool_path() { } # Environment check: Make sure input and source directories are available. -vp9_c_vs_simd_enc_verify_environment () { +vp9_c_vs_simd_enc_verify_environment() { if [ ! -e "${YUV_RAW_INPUT}" ]; then elog "libvpx test data must exist in LIBVPX_TEST_DATA_PATH." return 1 @@ -376,7 +376,9 @@ vp9_test_arm() { fi } -vp9_c_vs_simd_enc_test () { +vp9_c_vs_simd_enc_test() { + local save_dir=$(pwd) + # Test Generic vp9_test_generic @@ -410,6 +412,8 @@ vp9_c_vs_simd_enc_test () { else echo "vp9 test for arm: Done, all tests passed." fi + + cd ${save_dir} } # Setup a trap function to clean up build, and output files after tests complete. From fd2052d4c91ca9451563482817bbeab1da3f76e7 Mon Sep 17 00:00:00 2001 From: Cheng Chen Date: Thu, 28 Sep 2023 17:47:54 -0700 Subject: [PATCH 0900/1000] Properly determine end of sequence When the next frame is null and the current frame is an overlay frame, which is equivalent to there is an active alt ref frame, we call this an end of sequence. Change-Id: I49c2cf7a001df98aff8b62ba034317e408274bd4 --- test/vp9_ext_ratectrl_test.cc | 2 +- vp9/encoder/vp9_firstpass.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/test/vp9_ext_ratectrl_test.cc b/test/vp9_ext_ratectrl_test.cc index b43430586d..33fa05c65c 100644 --- a/test/vp9_ext_ratectrl_test.cc +++ b/test/vp9_ext_ratectrl_test.cc @@ -338,7 +338,7 @@ vpx_rc_status_t rc_get_encodeframe_decision_gop_short( EXPECT_EQ(encode_frame_info->show_index, 3); EXPECT_EQ(encode_frame_info->gop_index, 0); EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeGolden); - EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1); + EXPECT_EQ(toy_rate_ctrl->gop_global_index, 2); } // When the model recommends an invalid q, valid range [0, 255], diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index 3ec7ba5400..a9cdf5353f 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -2530,7 +2530,7 @@ static int get_gop_coding_frame_num( next_frame = fps_get_frame_stats(first_pass_info, gf_start_show_idx + gop_coding_frames); if (next_frame == NULL) { - *end_of_sequence = (gop_coding_frames == 1); + *end_of_sequence = gop_coding_frames == 1 && rc->source_alt_ref_active; break; } From 4697b110ac3c4da19bfbd742432548820f0f38de Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Fri, 29 Sep 2023 10:35:20 -0700 Subject: [PATCH 0901/1000] Update 32-bit version of horizontal_add_uint32x2 The code was originally added in https://aomedia-review.googlesource.com/c/aom/+/162267 by Jonathan Wright. Change-Id: Iafd9e31d82abe22387e0d68f02c7ab81e85367ed --- vpx_dsp/arm/sum_neon.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vpx_dsp/arm/sum_neon.h b/vpx_dsp/arm/sum_neon.h index 48a2fc05ca..75c170df60 100644 --- a/vpx_dsp/arm/sum_neon.h +++ b/vpx_dsp/arm/sum_neon.h @@ -154,7 +154,8 @@ static INLINE uint32_t horizontal_add_uint32x2(const uint32x2_t a) { #if VPX_ARCH_AARCH64 return vaddv_u32(a); #else - return vget_lane_u32(a, 0) + vget_lane_u32(a, 1); + const uint64x1_t b = vpaddl_u32(a); + return vget_lane_u32(vreinterpret_u32_u64(b), 0); #endif } From a53700e4a3820ad929e2b8c79d10e46abef62575 Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 26 Jun 2023 19:06:51 -0700 Subject: [PATCH 0902/1000] vp9_alloccommon: clear allocation sizes on free This fixes reallocations (and avoids potential crashes) if any allocations fails and the application continues to call vpx_codec_decode(). Found with vpx_dec_fuzzer_vp9 & Nallocfuzz (https://github.com/catenacyber/nallocfuzz). Bug: webm:1807 Change-Id: If5dc96b73c02efc94ec84c25eb50d10ad6b645a6 (cherry picked from commit 02ab555e992c191e5c509ed87b3cc48ed915b447) --- vp9/common/vp9_alloccommon.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vp9/common/vp9_alloccommon.c b/vp9/common/vp9_alloccommon.c index faad657a08..e53883f621 100644 --- a/vp9/common/vp9_alloccommon.c +++ b/vp9/common/vp9_alloccommon.c @@ -65,6 +65,7 @@ static void free_seg_map(VP9_COMMON *cm) { vpx_free(cm->seg_map_array[i]); cm->seg_map_array[i] = NULL; } + cm->seg_map_alloc_size = 0; cm->current_frame_seg_map = NULL; cm->last_frame_seg_map = NULL; @@ -108,6 +109,7 @@ void vp9_free_context_buffers(VP9_COMMON *cm) { cm->above_context = NULL; vpx_free(cm->above_seg_context); cm->above_seg_context = NULL; + cm->above_context_alloc_cols = 0; vpx_free(cm->lf.lfm); cm->lf.lfm = NULL; } From df9fd9d5b7325060b2b921558a1eb20ca7880937 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Thu, 30 Jun 2022 13:48:56 -0400 Subject: [PATCH 0903/1000] Fix bug with smaller width bigger size Fixed previous patch that clusterfuzz failed on. Local fuzzing passing overnight. Bug: webm:1642 Change-Id: If0e08e72abd2e042efe4dcfac21e4cc51afdfdb9 (cherry picked from commit 263682c9a29395055f3b3afe2d97be1828a6223f) --- test/resize_test.cc | 11 +++-------- vp9/common/vp9_alloccommon.c | 13 ++++++------- vp9/encoder/vp9_encoder.c | 27 +++++++++++++++++++++++++-- 3 files changed, 34 insertions(+), 17 deletions(-) diff --git a/test/resize_test.cc b/test/resize_test.cc index 715bb9d70f..d9420a4548 100644 --- a/test/resize_test.cc +++ b/test/resize_test.cc @@ -102,11 +102,8 @@ void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w, if (frame < 30) { return; } - if (frame < 100) { - *w = initial_w * 7 / 10; - *h = initial_h * 16 / 10; - return; - } + *w = initial_w * 7 / 10; + *h = initial_h * 16 / 10; return; } if (frame < 10) { @@ -559,9 +556,7 @@ TEST_P(ResizeRealtimeTest, TestExternalResizeWorks) { } } -// TODO(https://crbug.com/webm/1642): This causes a segfault in -// init_encode_frame_mb_context(). -TEST_P(ResizeRealtimeTest, DISABLED_TestExternalResizeSmallerWidthBiggerSize) { +TEST_P(ResizeRealtimeTest, TestExternalResizeSmallerWidthBiggerSize) { ResizingVideoSource video; video.flag_codec_ = true; video.smaller_width_larger_size_ = true; diff --git a/vp9/common/vp9_alloccommon.c b/vp9/common/vp9_alloccommon.c index e53883f621..9e73e40ea0 100644 --- a/vp9/common/vp9_alloccommon.c +++ b/vp9/common/vp9_alloccommon.c @@ -135,13 +135,6 @@ int vp9_alloc_context_buffers(VP9_COMMON *cm, int width, int height) { cm->free_mi(cm); if (cm->alloc_mi(cm, new_mi_size)) goto fail; } - - if (cm->seg_map_alloc_size < cm->mi_rows * cm->mi_cols) { - // Create the segmentation map structure and set to 0. - free_seg_map(cm); - if (alloc_seg_map(cm, cm->mi_rows * cm->mi_cols)) goto fail; - } - if (cm->above_context_alloc_cols < cm->mi_cols) { vpx_free(cm->above_context); cm->above_context = (ENTROPY_CONTEXT *)vpx_calloc( @@ -156,6 +149,12 @@ int vp9_alloc_context_buffers(VP9_COMMON *cm, int width, int height) { cm->above_context_alloc_cols = cm->mi_cols; } + if (cm->seg_map_alloc_size < cm->mi_rows * cm->mi_cols) { + // Create the segmentation map structure and set to 0. + free_seg_map(cm); + if (alloc_seg_map(cm, cm->mi_rows * cm->mi_cols)) goto fail; + } + if (vp9_alloc_loop_filter(cm)) goto fail; return 0; diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index b66fdc0bca..e385077545 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -1973,6 +1973,17 @@ static void alloc_copy_partition_data(VP9_COMP *cpi) { } } +static void free_copy_partition_data(VP9_COMP *cpi) { + vpx_free(cpi->prev_partition); + cpi->prev_partition = NULL; + vpx_free(cpi->prev_segment_id); + cpi->prev_segment_id = NULL; + vpx_free(cpi->prev_variance_low); + cpi->prev_variance_low = NULL; + vpx_free(cpi->copied_frame_cnt); + cpi->copied_frame_cnt = NULL; +} + void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { VP9_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; @@ -2052,6 +2063,8 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { new_mi_size = cm->mi_stride * calc_mi_size(cm->mi_rows); if (cm->mi_alloc_size < new_mi_size) { vp9_free_context_buffers(cm); + vp9_free_pc_tree(&cpi->td); + vpx_free(cpi->mbmi_ext_base); alloc_compressor_data(cpi); realloc_segmentation_maps(cpi); cpi->initial_width = cpi->initial_height = 0; @@ -2070,8 +2083,18 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { update_frame_size(cpi); if (last_w != cpi->oxcf.width || last_h != cpi->oxcf.height) { - memset(cpi->consec_zero_mv, 0, - cm->mi_rows * cm->mi_cols * sizeof(*cpi->consec_zero_mv)); + vpx_free(cpi->consec_zero_mv); + CHECK_MEM_ERROR( + cm, cpi->consec_zero_mv, + vpx_calloc(cm->mi_rows * cm->mi_cols, sizeof(*cpi->consec_zero_mv))); + + vpx_free(cpi->skin_map); + CHECK_MEM_ERROR( + cm, cpi->skin_map, + vpx_calloc(cm->mi_rows * cm->mi_cols, sizeof(cpi->skin_map[0]))); + + free_copy_partition_data(cpi); + alloc_copy_partition_data(cpi); if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) vp9_cyclic_refresh_reset_resize(cpi); rc->rc_1_frame = 0; From 490a7067e806092cb85a7f0b9534700c33b95199 Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 29 Sep 2023 11:21:35 -0700 Subject: [PATCH 0904/1000] update version to 1.13.1 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit SO_VERSION_MAJOR = 8 SO_VERSION_MINOR = 0 SO_VERSION_PATCH = 1 The increase of the patch number corresponds to the revision number in the libtool text. 3. If the library source code has changed at all since the last update, then increment revision (‘c:r:a’ becomes ‘c:r+1:a’). Bug: webm:1818 Change-Id: Ia114368e9fd7a908e7fcf6e4d3142f142770e3f4 --- README | 2 +- libs.mk | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README b/README index e360df05f6..444289a237 100644 --- a/README +++ b/README @@ -1,4 +1,4 @@ -v1.13.0 Ugly Duckling +v1.13.1 Ugly Duckling Welcome to the WebM VP8/VP9 Codec SDK! diff --git a/libs.mk b/libs.mk index 1f7f03aa38..13d878901b 100644 --- a/libs.mk +++ b/libs.mk @@ -314,7 +314,7 @@ $(BUILD_PFX)libvpx_g.a: $(LIBVPX_OBJS) # (c1, a1, r1) and set MAJOR to [c1-a1], MINOR to a1 and PATCH to r1 SO_VERSION_MAJOR := 8 SO_VERSION_MINOR := 0 -SO_VERSION_PATCH := 0 +SO_VERSION_PATCH := 1 ifeq ($(filter darwin%,$(TGT_OS)),$(TGT_OS)) LIBVPX_SO := libvpx.$(SO_VERSION_MAJOR).dylib SHARED_LIB_SUF := .dylib From 10b9492dcf05b652e2e4b370e205bd605d421972 Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 29 Sep 2023 11:32:27 -0700 Subject: [PATCH 0905/1000] update CHANGELOG Bug: webm:1818 Change-Id: Ic0a943b5d1c69a3621ad3f91012fb5308a0c11f1 --- CHANGELOG | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/CHANGELOG b/CHANGELOG index 3fb2d19bbe..f932f6bf4d 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,3 +1,13 @@ +2023-09-29 v1.13.1 "Ugly Duckling" + This release contains two security related fixes. One each for VP8 and VP9. + + - Upgrading: + This release is ABI compatible with the previous release. + + - Bug fixes: + https://crbug.com/1486441 (CVE-2023-5217) + Fix to a crash related to VP9 encoding (#1642) + 2023-01-31 v1.13.0 "Ugly Duckling" This release includes more Neon and AVX2 optimizations, adds a new codec control to set per frame QP, upgrades GoogleTest to v1.12.1, and includes From 7f568f98762547a22d4e82d0ddf64986c6407c34 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Fri, 22 Sep 2023 15:50:17 -0700 Subject: [PATCH 0906/1000] Fix a potential resource leak and add alloc checks Backport fixes from libaom: https://aomedia-review.googlesource.com/c/aom/+/109061 https://aomedia-review.googlesource.com/c/aom/+/158102 Change-Id: Ia9d42d474be2898f9ae2fdc28606273377da3e90 --- examples/resize_util.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/examples/resize_util.c b/examples/resize_util.c index 7e529b2e20..5fb63e1660 100644 --- a/examples/resize_util.c +++ b/examples/resize_util.c @@ -52,6 +52,7 @@ int main(int argc, char *argv[]) { uint8_t *inbuf_v, *outbuf_v; int f, frames; int width, height, target_width, target_height; + int failed = 0; exec_name = argv[0]; @@ -82,6 +83,7 @@ int main(int argc, char *argv[]) { } fpout = fopen(fout, "wb"); if (fpout == NULL) { + fclose(fpin); printf("Can't open file %s to write\n", fout); usage(); return 1; @@ -100,6 +102,11 @@ int main(int argc, char *argv[]) { inbuf = (uint8_t *)malloc(width * height * 3 / 2); outbuf = (uint8_t *)malloc(target_width * target_height * 3 / 2); + if (!(inbuf && outbuf)) { + printf("Failed to allocate buffers.\n"); + failed = 1; + goto Error; + } inbuf_u = inbuf + width * height; inbuf_v = inbuf_u + width * height / 4; outbuf_u = outbuf + target_width * target_height; @@ -114,10 +121,11 @@ int main(int argc, char *argv[]) { f++; } printf("%d frames processed\n", f); +Error: fclose(fpin); fclose(fpout); free(inbuf); free(outbuf); - return 0; + return failed; } From ec9e1ed41f49c4ca2d8b180ebdcdb249a479acdc Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 29 Sep 2023 15:16:46 -0700 Subject: [PATCH 0907/1000] vp9_encoder: normalize sizeof() calls use sizeof(var) instead of sizeof(type) and sizeof(*var) instead of sizeof(var[0]) for consistency in some places. Change-Id: Ibd9a783cfef5ce1d06131df3831a4093821a502f --- vp9/encoder/vp9_encoder.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index e3ba294c32..993e6310eb 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -881,10 +881,11 @@ static int vp9_enc_alloc_mi(VP9_COMMON *cm, int mi_size) { if (!cm->prev_mip) return 1; cm->mi_alloc_size = mi_size; - cm->mi_grid_base = (MODE_INFO **)vpx_calloc(mi_size, sizeof(MODE_INFO *)); + cm->mi_grid_base = + (MODE_INFO **)vpx_calloc(mi_size, sizeof(*cm->mi_grid_base)); if (!cm->mi_grid_base) return 1; cm->prev_mi_grid_base = - (MODE_INFO **)vpx_calloc(mi_size, sizeof(MODE_INFO *)); + (MODE_INFO **)vpx_calloc(mi_size, sizeof(*cm->prev_mi_grid_base)); if (!cm->prev_mi_grid_base) return 1; return 0; @@ -2165,7 +2166,7 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { vpx_free(cpi->skin_map); CHECK_MEM_ERROR( &cm->error, cpi->skin_map, - vpx_calloc(cm->mi_rows * cm->mi_cols, sizeof(cpi->skin_map[0]))); + vpx_calloc(cm->mi_rows * cm->mi_cols, sizeof(*cpi->skin_map))); free_copy_partition_data(cpi); alloc_copy_partition_data(cpi); @@ -2379,7 +2380,7 @@ void vp9_update_compressor_with_img_fmt(VP9_COMP *cpi, vpx_img_fmt_t img_fmt) { VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf, BufferPool *const pool) { unsigned int i; - VP9_COMP *volatile const cpi = vpx_memalign(32, sizeof(VP9_COMP)); + VP9_COMP *volatile const cpi = vpx_memalign(32, sizeof(*cpi)); VP9_COMMON *volatile const cm = cpi != NULL ? &cpi->common : NULL; if (!cm) return NULL; @@ -2428,7 +2429,7 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf, CHECK_MEM_ERROR( &cm->error, cpi->skin_map, - vpx_calloc(cm->mi_rows * cm->mi_cols, sizeof(cpi->skin_map[0]))); + vpx_calloc(cm->mi_rows * cm->mi_cols, sizeof(*cpi->skin_map))); #if !CONFIG_REALTIME_ONLY CHECK_MEM_ERROR(&cm->error, cpi->alt_ref_aq, vp9_alt_ref_aq_create()); From 6512f994da13e2f27e6a7bd449efee0a374b55b7 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Fri, 29 Sep 2023 16:58:26 -0700 Subject: [PATCH 0908/1000] Disable vpx_highbd_8_mse16x16_neon_dotprod, etc. These functions assume the uint16_t samples are <= 255 (bit depth 8), but vpx_highbd_8_mse16x16() is called for any bit depth, not just 8. A better fix is to port the libaom CL https://aomedia-review.googlesource.com/c/aom/+/175063 to libvpx, but that requires porting aom_sse() and aom_highbd_sse() to libvpx, which is quite involved. So disable vpx_highbd_8_mse16x16_neon_dotprod, etc. first. Bug: webm:1819 Change-Id: If495a5dedc58d9981317b9993c9fbb81ff3ab50c --- test/variance_test.cc | 4 ++++ test/vp9_c_vs_simd_encode.sh | 4 ---- vpx_dsp/vpx_dsp.mk | 1 - vpx_dsp/vpx_dsp_rtcd_defs.pl | 8 ++++---- 4 files changed, 8 insertions(+), 9 deletions(-) diff --git a/test/variance_test.cc b/test/variance_test.cc index e231df0c6b..b8320e9ceb 100644 --- a/test/variance_test.cc +++ b/test/variance_test.cc @@ -1555,6 +1555,9 @@ INSTANTIATE_TEST_SUITE_P( MseParams(3, 4, &vpx_highbd_8_mse8x16_neon, VPX_BITS_8), MseParams(3, 3, &vpx_highbd_8_mse8x8_neon, VPX_BITS_8))); +// TODO(webm:1819): Re-enable when vpx_highbd_8_mse16x16_neon_dotprod, etc. can +// be used again. +#if 0 #if HAVE_NEON_DOTPROD INSTANTIATE_TEST_SUITE_P( NEON_DOTPROD, VpxHBDMseTest, @@ -1564,6 +1567,7 @@ INSTANTIATE_TEST_SUITE_P( MseParams(3, 4, &vpx_highbd_8_mse8x16_neon_dotprod, VPX_BITS_8), MseParams(3, 3, &vpx_highbd_8_mse8x8_neon_dotprod, VPX_BITS_8))); #endif // HAVE_NEON_DOTPROD +#endif // 0 INSTANTIATE_TEST_SUITE_P( NEON, VpxHBDVarianceTest, diff --git a/test/vp9_c_vs_simd_encode.sh b/test/vp9_c_vs_simd_encode.sh index 82939470f6..42e5f3b589 100755 --- a/test/vp9_c_vs_simd_encode.sh +++ b/test/vp9_c_vs_simd_encode.sh @@ -277,10 +277,6 @@ vp9_enc_test() { for cpu in $(seq 0 $max_cpu_used); do for clip in ${TEST_CLIPS}; do - # TODO(webm:1819): Delete this if statement once issue 1819 is fixed. - if [ "${clip}" = "y4m_360p_10bit_input" ] && [ "${target}" = "armv8-linux-gcc" ]; then - continue - fi for bitrate in ${TEST_BITRATES}; do eval "${encoder}" $($clip) $($test_params) \ "--limit=${VP9_ENCODE_C_VS_SIMD_TEST_FRAME_LIMIT}" \ diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index 5343088d1b..84fd969daa 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -448,7 +448,6 @@ DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_impl_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/highbd_subpel_variance_impl_sse2.asm DSP_SRCS-$(HAVE_NEON) += arm/highbd_avg_pred_neon.c DSP_SRCS-$(HAVE_NEON) += arm/highbd_variance_neon.c -DSP_SRCS-$(HAVE_NEON_DOTPROD) += arm/highbd_variance_neon_dotprod.c DSP_SRCS-$(HAVE_NEON) += arm/highbd_subpel_variance_neon.c endif # CONFIG_VP9_HIGHBITDEPTH endif # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 1012df11ec..c9cdc285f2 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -1534,14 +1534,14 @@ () specialize qw/vpx_highbd_12_get8x8var sse2 neon/; add_proto qw/unsigned int vpx_highbd_8_mse16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_8_mse16x16 sse2 neon neon_dotprod/; + specialize qw/vpx_highbd_8_mse16x16 sse2 neon/; add_proto qw/unsigned int vpx_highbd_8_mse16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_8_mse16x8 neon neon_dotprod/; + specialize qw/vpx_highbd_8_mse16x8 neon/; add_proto qw/unsigned int vpx_highbd_8_mse8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_8_mse8x16 neon neon_dotprod/; + specialize qw/vpx_highbd_8_mse8x16 neon/; add_proto qw/unsigned int vpx_highbd_8_mse8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_8_mse8x8 sse2 neon neon_dotprod/; + specialize qw/vpx_highbd_8_mse8x8 sse2 neon/; add_proto qw/unsigned int vpx_highbd_10_mse16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_10_mse16x16 sse2 neon/; From b863d8bb47c2fd6bb3ad9d502532d34a13575b23 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Thu, 28 Sep 2023 16:35:54 -0700 Subject: [PATCH 0909/1000] Have vp9_enc_build and vp9_enc_test restore cwd Use $PWD to get the current directory. Quote directory pathnames. Suggested by James Zern. Bug: webm:1800 Change-Id: I51e922b24da0e89d936370f858eab55d193ebdcb --- test/vp9_c_vs_simd_encode.sh | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/test/vp9_c_vs_simd_encode.sh b/test/vp9_c_vs_simd_encode.sh index 82939470f6..5bb343bfaa 100755 --- a/test/vp9_c_vs_simd_encode.sh +++ b/test/vp9_c_vs_simd_encode.sh @@ -18,7 +18,7 @@ PRESETS="good rt" TEST_CLIPS="yuv_raw_input y4m_360p_10bit_input yuv_480p_raw_input y4m_720p_input" OUT_FILE_SUFFIX=".ivf" SCRIPT_DIR=$(dirname "$0") -LIBVPX_SOURCE_DIR=$(cd ${SCRIPT_DIR}/..; pwd) +LIBVPX_SOURCE_DIR=$(cd "${SCRIPT_DIR}/.."; pwd) # Clips used in test. YUV_RAW_INPUT="${LIBVPX_TEST_DATA_PATH}/hantro_collage_w352h288.yuv" @@ -214,14 +214,15 @@ vp9_encode_rt_params() { --error-resilient=0" } -# Configures for the given target in VPX_TEST_OUTPUT_DIR/build_target_${target} -# directory. +# Configures for the given target in the +# ${VPX_TEST_OUTPUT_DIR}/build_target_${target} directory. vp9_enc_build() { local target=$1 local configure="$2" local tmp_build_dir=${VPX_TEST_OUTPUT_DIR}/build_target_${target} - mkdir -p $tmp_build_dir - cd $tmp_build_dir + mkdir -p "$tmp_build_dir" + local save_dir="$PWD" + cd "$tmp_build_dir" echo "Building target: ${target}" local config_args="--disable-install-docs \ @@ -235,6 +236,7 @@ vp9_enc_build() { eval "$configure" --target="${target}" "${config_args}" ${devnull} eval make -j$(nproc) ${devnull} echo "Done building target: ${target}" + cd "${save_dir}" } compare_enc_output() { @@ -258,6 +260,9 @@ vp9_enc_test() { return 1 fi + local tmp_build_dir=${VPX_TEST_OUTPUT_DIR}/build_target_${target} + local save_dir="$PWD" + cd "$tmp_build_dir" for preset in ${PRESETS}; do if [ "${preset}" = "good" ]; then local max_cpu_used=5 @@ -267,6 +272,7 @@ vp9_enc_test() { local test_params=vp9_encode_rt_params else elog "Invalid preset" + cd "${save_dir}" return 1 fi @@ -291,6 +297,7 @@ vp9_enc_test() { if [ "${target}" != "generic-gnu" ]; then if ! compare_enc_output ${target} $cpu ${clip} $bitrate ${preset}; then # Find the mismatch + cd "${save_dir}" return 1 fi fi @@ -298,6 +305,7 @@ vp9_enc_test() { done done done + cd "${save_dir}" } vp9_test_generic() { @@ -377,8 +385,6 @@ vp9_test_arm() { } vp9_c_vs_simd_enc_test() { - local save_dir=$(pwd) - # Test Generic vp9_test_generic @@ -412,8 +418,6 @@ vp9_c_vs_simd_enc_test() { else echo "vp9 test for arm: Done, all tests passed." fi - - cd ${save_dir} } # Setup a trap function to clean up build, and output files after tests complete. From b729684b059d48b6bac0750045acbe6a4a9e9a6b Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Sat, 30 Sep 2023 19:45:35 -0700 Subject: [PATCH 0910/1000] Use big cfg.g_w in ConfigResizeChangeThreadCount vp8cx_create_encoder_threads() caps the thread count at (cm->mb_cols / cpi->mt_sync_range) - 1. If cfg.g_w is 16, cm->mb_cols is only 1 (see vp8_alloc_frame_buffers: mb_cols = width >> 4), so we won't be using multiple threads. To reproduce bug chromium:1486441, the test just needs to increase cfg.g_h sufficiently. Bug: chromium:1486441 Change-Id: Ie6b2da2e31cfa1717a481f55eebc8c875db94d87 --- test/encode_api_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc index f1c98b2c71..6b22febf6e 100644 --- a/test/encode_api_test.cc +++ b/test/encode_api_test.cc @@ -396,7 +396,7 @@ TEST(EncodeAPI, ConfigResizeChangeThreadCount) { EXPECT_EQ(vpx_codec_enc_config_set(&enc.ctx, &cfg), VPX_CODEC_OK) << vpx_codec_error_detail(&enc.ctx); - cfg.g_w = 16; + cfg.g_w = 1000; cfg.g_h = 720; for (const auto threads : { 1, 4, 8, 6, 2, 1 }) { From 0a3e2b4ca1540f4a52848b79fcfad62391fcdba8 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Tue, 3 Oct 2023 09:58:23 -0400 Subject: [PATCH 0911/1000] Factor out common code used in test binaries Bug: b/303112617 Change-Id: Icbe16e95ff334a9578a692cc51b4773393ad0005 --- test/init_vpx_test.cc | 74 ++++++++++++++++++++++++++++++++ test/init_vpx_test.h | 18 ++++++++ test/test.mk | 4 ++ test/test_intra_pred_speed.cc | 8 +++- test/test_libvpx.cc | 81 +---------------------------------- 5 files changed, 105 insertions(+), 80 deletions(-) create mode 100644 test/init_vpx_test.cc create mode 100644 test/init_vpx_test.h diff --git a/test/init_vpx_test.cc b/test/init_vpx_test.cc new file mode 100644 index 0000000000..5b40d9e4f7 --- /dev/null +++ b/test/init_vpx_test.cc @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "test/init_vpx_test.h" + +#include +#include "third_party/googletest/src/include/gtest/gtest.h" +#include "./vpx_config.h" +#if VPX_ARCH_X86 || VPX_ARCH_X86_64 +#include "vpx_ports/x86.h" +#endif +extern "C" { +#if CONFIG_VP8 +extern void vp8_rtcd(); +#endif // CONFIG_VP8 +#if CONFIG_VP9 +extern void vp9_rtcd(); +#endif // CONFIG_VP9 +extern void vpx_dsp_rtcd(); +extern void vpx_scale_rtcd(); +} + +#if VPX_ARCH_X86 || VPX_ARCH_X86_64 +static void append_negative_gtest_filter(const char *str) { + std::string filter = GTEST_FLAG_GET(filter); + // Negative patterns begin with one '-' followed by a ':' separated list. + if (filter.find('-') == std::string::npos) filter += '-'; + filter += str; + GTEST_FLAG_SET(filter, filter); +} +#endif // VPX_ARCH_X86 || VPX_ARCH_X86_64 + +namespace libvpx_test { +void init_vpx_test() { +#if VPX_ARCH_X86 || VPX_ARCH_X86_64 + const int simd_caps = x86_simd_caps(); + if (!(simd_caps & HAS_MMX)) append_negative_gtest_filter(":MMX.*:MMX/*"); + if (!(simd_caps & HAS_SSE)) append_negative_gtest_filter(":SSE.*:SSE/*"); + if (!(simd_caps & HAS_SSE2)) append_negative_gtest_filter(":SSE2.*:SSE2/*"); + if (!(simd_caps & HAS_SSE3)) append_negative_gtest_filter(":SSE3.*:SSE3/*"); + if (!(simd_caps & HAS_SSSE3)) { + append_negative_gtest_filter(":SSSE3.*:SSSE3/*"); + } + if (!(simd_caps & HAS_SSE4_1)) { + append_negative_gtest_filter(":SSE4_1.*:SSE4_1/*"); + } + if (!(simd_caps & HAS_AVX)) append_negative_gtest_filter(":AVX.*:AVX/*"); + if (!(simd_caps & HAS_AVX2)) append_negative_gtest_filter(":AVX2.*:AVX2/*"); + if (!(simd_caps & HAS_AVX512)) { + append_negative_gtest_filter(":AVX512.*:AVX512/*"); + } +#endif // VPX_ARCH_X86 || VPX_ARCH_X86_64 + +#if !CONFIG_SHARED +// Shared library builds don't support whitebox tests +// that exercise internal symbols. +#if CONFIG_VP8 + vp8_rtcd(); +#endif // CONFIG_VP8 +#if CONFIG_VP9 + vp9_rtcd(); +#endif // CONFIG_VP9 + vpx_dsp_rtcd(); + vpx_scale_rtcd(); +#endif // !CONFIG_SHARED +} +} // namespace libvpx_test diff --git a/test/init_vpx_test.h b/test/init_vpx_test.h new file mode 100644 index 0000000000..39ed6525b3 --- /dev/null +++ b/test/init_vpx_test.h @@ -0,0 +1,18 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef TEST_TEST_COMMON_MAIN_H_ +#define TEST_TEST_COMMON_MAIN_H_ + +namespace libvpx_test { +void init_vpx_test(); +} + +#endif diff --git a/test/test.mk b/test/test.mk index b64e89bb43..d4521f08bf 100644 --- a/test/test.mk +++ b/test/test.mk @@ -7,6 +7,8 @@ LIBVPX_TEST_SRCS-yes += codec_factory.h LIBVPX_TEST_SRCS-yes += md5_helper.h LIBVPX_TEST_SRCS-yes += register_state_check.h LIBVPX_TEST_SRCS-yes += test.mk +LIBVPX_TEST_SRCS-yes += init_vpx_test.cc +LIBVPX_TEST_SRCS-yes += init_vpx_test.h LIBVPX_TEST_SRCS-yes += test_libvpx.cc LIBVPX_TEST_SRCS-yes += test_vectors.cc LIBVPX_TEST_SRCS-yes += test_vectors.h @@ -215,6 +217,8 @@ endif TEST_INTRA_PRED_SPEED_SRCS-yes := test_intra_pred_speed.cc TEST_INTRA_PRED_SPEED_SRCS-yes += ../md5_utils.h ../md5_utils.c +TEST_INTRA_PRED_SPEED_SRCS-yes += init_vpx_test.cc +TEST_INTRA_PRED_SPEED_SRCS-yes += init_vpx_test.h RC_INTERFACE_TEST_SRCS-yes := test_rc_interface.cc RC_INTERFACE_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_ratectrl_rtc_test.cc diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc index b013e0bd5d..4c464a262f 100644 --- a/test/test_intra_pred_speed.cc +++ b/test/test_intra_pred_speed.cc @@ -14,9 +14,11 @@ #include "third_party/googletest/src/include/gtest/gtest.h" +#include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "test/acm_random.h" #include "test/clear_system_state.h" +#include "test/init_vpx_test.h" #include "test/md5_helper.h" #include "vpx/vpx_integer.h" #include "vpx_ports/mem.h" @@ -607,4 +609,8 @@ HIGHBD_INTRA_PRED_TEST( #endif // CONFIG_VP9_HIGHBITDEPTH -#include "test/test_libvpx.cc" +int main(int argc, char **argv) { + ::testing::InitGoogleTest(&argc, argv); + ::libvpx_test::init_vpx_test(); + return RUN_ALL_TESTS(); +} diff --git a/test/test_libvpx.cc b/test/test_libvpx.cc index caab2dbd01..c1798b8b8b 100644 --- a/test/test_libvpx.cc +++ b/test/test_libvpx.cc @@ -7,89 +7,12 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#include +#include "test/init_vpx_test.h" #include "third_party/googletest/src/include/gtest/gtest.h" -#include "./vpx_config.h" -#if VPX_ARCH_ARM -#include "vpx_ports/arm.h" -#endif -#if VPX_ARCH_X86 || VPX_ARCH_X86_64 -#include "vpx_ports/x86.h" -#endif -extern "C" { -#if CONFIG_VP8 -extern void vp8_rtcd(); -#endif // CONFIG_VP8 -#if CONFIG_VP9 -extern void vp9_rtcd(); -#endif // CONFIG_VP9 -extern void vpx_dsp_rtcd(); -extern void vpx_scale_rtcd(); -} - -#if (!CONFIG_SHARED && VPX_ARCH_ARM) || VPX_ARCH_X86 || VPX_ARCH_X86_64 -static void append_negative_gtest_filter(const char *str) { - std::string filter = ::testing::FLAGS_gtest_filter; - // Negative patterns begin with one '-' followed by a ':' separated list. - if (filter.find('-') == std::string::npos) filter += '-'; - filter += str; - ::testing::FLAGS_gtest_filter = filter; -} -#endif // (!CONFIG_SHARED && VPX_ARCH_ARM) || VPX_ARCH_X86 || VPX_ARCH_X86_64 - int main(int argc, char **argv) { ::testing::InitGoogleTest(&argc, argv); - -#if !CONFIG_SHARED -#if VPX_ARCH_AARCH64 - const int caps = arm_cpu_caps(); - if (!(caps & HAS_NEON_DOTPROD)) { - append_negative_gtest_filter(":NEON_DOTPROD.*:NEON_DOTPROD/*"); - } - if (!(caps & HAS_NEON_I8MM)) { - append_negative_gtest_filter(":NEON_I8MM.*:NEON_I8MM/*"); - } -#elif VPX_ARCH_ARM - const int caps = arm_cpu_caps(); - if (!(caps & HAS_NEON)) { - append_negative_gtest_filter(":NEON.*:NEON/*"); - } -#endif // VPX_ARCH_ARM -#endif // !CONFIG_SHARED - -#if VPX_ARCH_X86 || VPX_ARCH_X86_64 - const int simd_caps = x86_simd_caps(); - if (!(simd_caps & HAS_MMX)) append_negative_gtest_filter(":MMX.*:MMX/*"); - if (!(simd_caps & HAS_SSE)) append_negative_gtest_filter(":SSE.*:SSE/*"); - if (!(simd_caps & HAS_SSE2)) append_negative_gtest_filter(":SSE2.*:SSE2/*"); - if (!(simd_caps & HAS_SSE3)) append_negative_gtest_filter(":SSE3.*:SSE3/*"); - if (!(simd_caps & HAS_SSSE3)) { - append_negative_gtest_filter(":SSSE3.*:SSSE3/*"); - } - if (!(simd_caps & HAS_SSE4_1)) { - append_negative_gtest_filter(":SSE4_1.*:SSE4_1/*"); - } - if (!(simd_caps & HAS_AVX)) append_negative_gtest_filter(":AVX.*:AVX/*"); - if (!(simd_caps & HAS_AVX2)) append_negative_gtest_filter(":AVX2.*:AVX2/*"); - if (!(simd_caps & HAS_AVX512)) { - append_negative_gtest_filter(":AVX512.*:AVX512/*"); - } -#endif // VPX_ARCH_X86 || VPX_ARCH_X86_64 - -#if !CONFIG_SHARED -// Shared library builds don't support whitebox tests -// that exercise internal symbols. -#if CONFIG_VP8 - vp8_rtcd(); -#endif // CONFIG_VP8 -#if CONFIG_VP9 - vp9_rtcd(); -#endif // CONFIG_VP9 - vpx_dsp_rtcd(); - vpx_scale_rtcd(); -#endif // !CONFIG_SHARED - + ::libvpx_test::init_vpx_test(); return RUN_ALL_TESTS(); } From 5b6ceba996f08e2502737eec14f291ac8d46a5bc Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Tue, 3 Oct 2023 10:14:43 -0400 Subject: [PATCH 0912/1000] Include vpx_config.h for macros Clear some clang-tidy complaints Change-Id: I6690428d336c81709befd19a33e11c1367275df3 --- vpx_ports/aarch32_cpudetect.c | 1 + vpx_ports/aarch64_cpudetect.c | 1 + 2 files changed, 2 insertions(+) diff --git a/vpx_ports/aarch32_cpudetect.c b/vpx_ports/aarch32_cpudetect.c index 48bdc70f92..639f4ff8ea 100644 --- a/vpx_ports/aarch32_cpudetect.c +++ b/vpx_ports/aarch32_cpudetect.c @@ -9,6 +9,7 @@ */ // Feature detection code for Armv7-A / AArch32. +#include "./vpx_config.h" #include "arm_cpudetect.h" #if !CONFIG_RUNTIME_CPU_DETECT diff --git a/vpx_ports/aarch64_cpudetect.c b/vpx_ports/aarch64_cpudetect.c index dad081c8ce..ac68f44452 100644 --- a/vpx_ports/aarch64_cpudetect.c +++ b/vpx_ports/aarch64_cpudetect.c @@ -8,6 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include "./vpx_config.h" #include "arm_cpudetect.h" #if defined(__APPLE__) From f73026c2ccc276ee1493a3cd733149fe8ed7df33 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Tue, 3 Oct 2023 22:00:02 -0400 Subject: [PATCH 0913/1000] Use correct include guards for init_vpx_test.h Bug: b/303112617 Change-Id: Ie18df33b2bcab91c18e920825f4ed3a29e18373b --- test/init_vpx_test.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/init_vpx_test.h b/test/init_vpx_test.h index 39ed6525b3..5e0dbb0e7e 100644 --- a/test/init_vpx_test.h +++ b/test/init_vpx_test.h @@ -8,11 +8,11 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef TEST_TEST_COMMON_MAIN_H_ -#define TEST_TEST_COMMON_MAIN_H_ +#ifndef TEST_INIT_VPX_TEST_H_ +#define TEST_INIT_VPX_TEST_H_ namespace libvpx_test { void init_vpx_test(); } -#endif +#endif // TEST_INIT_VPX_TEST_H_ From f67f9ce3469fba7585c1a9fc4e5f1700b5433a39 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Sat, 30 Sep 2023 11:17:30 -0700 Subject: [PATCH 0914/1000] Declare cur_row inside #if CONFIG_MULTITHREAD Fix the following compiler warning when libvpx is configured with the --disable-multithread option: vp9/common/vp9_thread_common.c:391:7: warning: variable 'cur_row' set but not used [-Wunused-but-set-variable] int cur_row; ^ Change-Id: I53aa279152715083df40990eb7fdcaeb77a66777 --- vp9/common/vp9_thread_common.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/vp9/common/vp9_thread_common.c b/vp9/common/vp9_thread_common.c index 1c6ecc0fe6..8df18af3b8 100644 --- a/vp9/common/vp9_thread_common.c +++ b/vp9/common/vp9_thread_common.c @@ -388,10 +388,10 @@ void vp9_loop_filter_dealloc(VP9LfSync *lf_sync) { static int get_next_row(VP9_COMMON *cm, VP9LfSync *lf_sync) { int return_val = -1; - int cur_row; const int max_rows = cm->mi_rows; #if CONFIG_MULTITHREAD + int cur_row; const int tile_cols = 1 << cm->log2_tile_cols; pthread_mutex_lock(lf_sync->lf_mutex); @@ -428,14 +428,8 @@ static int get_next_row(VP9_COMMON *cm, VP9LfSync *lf_sync) { #else (void)lf_sync; if (cm->lf_row < max_rows) { - cur_row = cm->lf_row >> MI_BLOCK_SIZE_LOG2; return_val = cm->lf_row; cm->lf_row += MI_BLOCK_SIZE; - if (cm->lf_row < max_rows) { - /* If this is not the last row, make sure the next row is also decoded. - * This is because the intra predict has to happen before loop filter */ - cur_row += 1; - } } #endif // CONFIG_MULTITHREAD From ea67878f8c36f96c54947a72f759c955ce69cb80 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Tue, 3 Oct 2023 20:08:18 -0700 Subject: [PATCH 0915/1000] Clean up vp8cx_create/remove_encoder_threads() Make vp8cx_create_encoder_threads() undo everything cleanly before returning an error. Make vp8cx_remove_encoder_threads() reset pointer fields to NULL after freeing them, reset encoding_thread_count to 0, and reset b_lpf_running to 0 (false). This makes it safe to call vp8cx_create_encoder_threads() after calling vp8cx_remove_encoder_threads(). Change-Id: I586f06ce3d5b1c88ca46884bb4d6667ffc97e440 --- vp8/encoder/ethreading.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/vp8/encoder/ethreading.c b/vp8/encoder/ethreading.c index b7f1932c58..353496cc57 100644 --- a/vp8/encoder/ethreading.c +++ b/vp8/encoder/ethreading.c @@ -553,6 +553,8 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) { /* shutdown other threads */ vpx_atomic_store_release(&cpi->b_multi_threaded, 0); for (--ithread; ithread >= 0; ithread--) { + sem_post(&cpi->h_event_start_encoding[ithread]); + sem_post(&cpi->h_event_end_encoding[ithread]); pthread_join(cpi->h_encoding_thread[ithread], 0); sem_destroy(&cpi->h_event_start_encoding[ithread]); sem_destroy(&cpi->h_event_end_encoding[ithread]); @@ -560,10 +562,16 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) { /* free thread related resources */ vpx_free(cpi->h_event_start_encoding); + cpi->h_event_start_encoding = NULL; vpx_free(cpi->h_event_end_encoding); + cpi->h_event_end_encoding = NULL; vpx_free(cpi->h_encoding_thread); + cpi->h_encoding_thread = NULL; vpx_free(cpi->mb_row_ei); + cpi->mb_row_ei = NULL; vpx_free(cpi->en_thread_data); + cpi->en_thread_data = NULL; + cpi->encoding_thread_count = 0; return -1; } @@ -592,10 +600,16 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) { /* free thread related resources */ vpx_free(cpi->h_event_start_encoding); + cpi->h_event_start_encoding = NULL; vpx_free(cpi->h_event_end_encoding); + cpi->h_event_end_encoding = NULL; vpx_free(cpi->h_encoding_thread); + cpi->h_encoding_thread = NULL; vpx_free(cpi->mb_row_ei); + cpi->mb_row_ei = NULL; vpx_free(cpi->en_thread_data); + cpi->en_thread_data = NULL; + cpi->encoding_thread_count = 0; return -2; } @@ -627,13 +641,20 @@ void vp8cx_remove_encoder_threads(VP8_COMP *cpi) { sem_destroy(&cpi->h_event_end_lpf); sem_destroy(&cpi->h_event_start_lpf); + cpi->b_lpf_running = 0; /* free thread related resources */ vpx_free(cpi->h_event_start_encoding); + cpi->h_event_start_encoding = NULL; vpx_free(cpi->h_event_end_encoding); + cpi->h_event_end_encoding = NULL; vpx_free(cpi->h_encoding_thread); + cpi->h_encoding_thread = NULL; vpx_free(cpi->mb_row_ei); + cpi->mb_row_ei = NULL; vpx_free(cpi->en_thread_data); + cpi->en_thread_data = NULL; + cpi->encoding_thread_count = 0; } } #endif From 41caf8fef5dbfd56d1d79303964b91875d3925a3 Mon Sep 17 00:00:00 2001 From: Anupam Pandey Date: Tue, 3 Oct 2023 14:46:36 +0530 Subject: [PATCH 0916/1000] Add an emms instruction to vpx_subtract_block This CL adds an `emms` instruction at the end of the MMX assembly for the vpx_subtract_block function, to properly clear the register state. This resolves a mismatch between x86 build and C only build. BUG=webm:1816 Change-Id: I79d2947da7f587f3558a2ae17df214d2faf59e74 --- vpx_dsp/x86/subtract_sse2.asm | 1 + 1 file changed, 1 insertion(+) diff --git a/vpx_dsp/x86/subtract_sse2.asm b/vpx_dsp/x86/subtract_sse2.asm index 4273efb854..e3055ab292 100644 --- a/vpx_dsp/x86/subtract_sse2.asm +++ b/vpx_dsp/x86/subtract_sse2.asm @@ -124,4 +124,5 @@ INIT_MMX lea predq, [predq+pred_str*2] sub rowsd, 2 jg .loop_4 + emms RET From c23da380a386e2cae2c757f06a2aebfd72451413 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Tue, 3 Oct 2023 19:30:12 -0700 Subject: [PATCH 0917/1000] VP8: Allocate cpi->mt_current_mb_col array lazily Add the mt_current_mb_col_size field to VP8_COMP to record the size of the mt_current_mb_col array. Move the allocation of the mt_current_mb_col array from vp8_alloc_compressor_data() to vp8_encode_frame(), where the use of mt_current_mb_col starts. Allocate mt_current_mb_col right before use if mt_current_mb_col hasn't been allocated or if the current size is incorrect. Move the deallocation of the mt_current_mb_col array from dealloc_compressor_data() to vp8cx_remove_encoder_threads(). Move the TODO(https://crbug.com/1486441) comment from vp8/encoder/onyx_if.c to vp8/vp8_cx_iface.c. Change-Id: Ic5a0793278c2cc94876669aaa0dd732412876673 --- vp8/encoder/encodeframe.c | 9 +++++++++ vp8/encoder/ethreading.c | 3 +++ vp8/encoder/onyx_if.c | 21 --------------------- vp8/encoder/onyx_int.h | 1 + vp8/vp8_cx_iface.c | 2 ++ 5 files changed, 15 insertions(+), 21 deletions(-) diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c index dc29945729..3f0319a54b 100644 --- a/vp8/encoder/encodeframe.c +++ b/vp8/encoder/encodeframe.c @@ -750,6 +750,15 @@ void vp8_encode_frame(VP8_COMP *cpi) { vp8cx_init_mbrthread_data(cpi, x, cpi->mb_row_ei, cpi->encoding_thread_count); + if (cpi->mt_current_mb_col_size != cm->mb_rows) { + vpx_free(cpi->mt_current_mb_col); + cpi->mt_current_mb_col = NULL; + cpi->mt_current_mb_col_size = 0; + CHECK_MEM_ERROR( + &cpi->common.error, cpi->mt_current_mb_col, + vpx_malloc(sizeof(*cpi->mt_current_mb_col) * cm->mb_rows)); + cpi->mt_current_mb_col_size = cm->mb_rows; + } for (i = 0; i < cm->mb_rows; ++i) vpx_atomic_store_release(&cpi->mt_current_mb_col[i], -1); diff --git a/vp8/encoder/ethreading.c b/vp8/encoder/ethreading.c index 353496cc57..3362755094 100644 --- a/vp8/encoder/ethreading.c +++ b/vp8/encoder/ethreading.c @@ -644,6 +644,9 @@ void vp8cx_remove_encoder_threads(VP8_COMP *cpi) { cpi->b_lpf_running = 0; /* free thread related resources */ + vpx_free(cpi->mt_current_mb_col); + cpi->mt_current_mb_col = NULL; + cpi->mt_current_mb_col_size = 0; vpx_free(cpi->h_event_start_encoding); cpi->h_event_start_encoding = NULL; vpx_free(cpi->h_event_end_encoding); diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index c5e9970c3c..890237f7b2 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -442,11 +442,6 @@ static void dealloc_compressor_data(VP8_COMP *cpi) { vpx_free(cpi->mb.pip); cpi->mb.pip = 0; - -#if CONFIG_MULTITHREAD - vpx_free(cpi->mt_current_mb_col); - cpi->mt_current_mb_col = NULL; -#endif } static void enable_segmentation(VP8_COMP *cpi) { @@ -1224,17 +1219,6 @@ void vp8_alloc_compressor_data(VP8_COMP *cpi) { } else { cpi->mt_sync_range = 16; } - - if (cpi->oxcf.multi_threaded > 1) { - int i; - - vpx_free(cpi->mt_current_mb_col); - CHECK_MEM_ERROR(&cpi->common.error, cpi->mt_current_mb_col, - vpx_malloc(sizeof(*cpi->mt_current_mb_col) * cm->mb_rows)); - for (i = 0; i < cm->mb_rows; ++i) - vpx_atomic_init(&cpi->mt_current_mb_col[i], 0); - } - #endif vpx_free(cpi->tplist); @@ -1447,11 +1431,6 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) { last_h = cpi->oxcf.Height; prev_number_of_layers = cpi->oxcf.number_of_layers; - if (cpi->initial_width) { - // TODO(https://crbug.com/1486441): Allow changing thread counts; the - // allocation is done once in vp8_create_compressor(). - oxcf->multi_threaded = cpi->oxcf.multi_threaded; - } cpi->oxcf = *oxcf; switch (cpi->oxcf.Mode) { diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h index 4304f054ca..2c6a55a845 100644 --- a/vp8/encoder/onyx_int.h +++ b/vp8/encoder/onyx_int.h @@ -526,6 +526,7 @@ typedef struct VP8_COMP { #if CONFIG_MULTITHREAD /* multithread data */ vpx_atomic_int *mt_current_mb_col; + int mt_current_mb_col_size; int mt_sync_range; vpx_atomic_int b_multi_threaded; int encoding_thread_count; diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c index 8950de0d8a..470fe9e6df 100644 --- a/vp8/vp8_cx_iface.c +++ b/vp8/vp8_cx_iface.c @@ -488,6 +488,8 @@ static vpx_codec_err_t vp8e_set_config(vpx_codec_alg_priv_t *ctx, ctx->cfg = *cfg; set_vp8e_config(&ctx->oxcf, ctx->cfg, ctx->vp8_cfg, NULL); vp8_change_config(ctx->cpi, &ctx->oxcf); + // TODO(https://crbug.com/1486441): Change thread counts; + // vp8cx_create_encoder_threads() is called once in vp8_create_compressor(). ctx->cpi->common.error.setjmp = 0; return VPX_CODEC_OK; } From 8cb4544c21a221e04fe21222349431ba1779d884 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Sat, 30 Sep 2023 15:50:14 -0700 Subject: [PATCH 0918/1000] VP8: allow thread count changes Fix the TODO(https://crbug.com/1486441) comment in vp8/vp8_cx_iface.c. Make vp8cx_create_encoder_threads() work after it has been called before. If there are already the exact number of threads it needs to create, return immediately. Otherwise, shut down the existing threads (by calling vp8cx_remove_encoder_threads()) and create the required number of threads. Call vp8cx_create_encoder_threads() in vp8e_set_config() to respond to changes in g_threads or g_w (which also affects the number of threads through cm->mb_cols and cpi->mt_sync_range). Change-Id: I552eeca5b1f1f5313f59559eb1da396f270a2429 --- vp8/encoder/ethreading.c | 16 ++++++++-------- vp8/vp8_cx_iface.c | 8 ++++++-- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/vp8/encoder/ethreading.c b/vp8/encoder/ethreading.c index 3362755094..9993905567 100644 --- a/vp8/encoder/ethreading.c +++ b/vp8/encoder/ethreading.c @@ -487,15 +487,10 @@ void vp8cx_init_mbrthread_data(VP8_COMP *cpi, MACROBLOCK *x, int vp8cx_create_encoder_threads(VP8_COMP *cpi) { const VP8_COMMON *cm = &cpi->common; - - vpx_atomic_init(&cpi->b_multi_threaded, 0); - cpi->encoding_thread_count = 0; - cpi->b_lpf_running = 0; + int th_count = 0; if (cm->processor_core_count > 1 && cpi->oxcf.multi_threaded > 1) { - int ithread; - int th_count = cpi->oxcf.multi_threaded - 1; - int rc = 0; + th_count = cpi->oxcf.multi_threaded - 1; /* don't allocate more threads than cores available */ if (cpi->oxcf.multi_threaded > cm->processor_core_count) { @@ -507,8 +502,13 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) { if (th_count > ((cm->mb_cols / cpi->mt_sync_range) - 1)) { th_count = (cm->mb_cols / cpi->mt_sync_range) - 1; } + } + if (th_count == cpi->encoding_thread_count) return 0; - if (th_count == 0) return 0; + vp8cx_remove_encoder_threads(cpi); + if (th_count != 0) { + int ithread; + int rc = 0; CHECK_MEM_ERROR(&cpi->common.error, cpi->h_encoding_thread, vpx_malloc(sizeof(pthread_t) * th_count)); diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c index 470fe9e6df..20c44ff4e1 100644 --- a/vp8/vp8_cx_iface.c +++ b/vp8/vp8_cx_iface.c @@ -19,6 +19,9 @@ #include "vpx_ports/static_assert.h" #include "vpx_ports/system_state.h" #include "vpx_util/vpx_timestamp.h" +#if CONFIG_MULTITHREAD +#include "vp8/encoder/ethreading.h" +#endif #include "vp8/encoder/onyx_int.h" #include "vpx/vp8cx.h" #include "vp8/encoder/firstpass.h" @@ -488,8 +491,9 @@ static vpx_codec_err_t vp8e_set_config(vpx_codec_alg_priv_t *ctx, ctx->cfg = *cfg; set_vp8e_config(&ctx->oxcf, ctx->cfg, ctx->vp8_cfg, NULL); vp8_change_config(ctx->cpi, &ctx->oxcf); - // TODO(https://crbug.com/1486441): Change thread counts; - // vp8cx_create_encoder_threads() is called once in vp8_create_compressor(). +#if CONFIG_MULTITHREAD + if (vp8cx_create_encoder_threads(ctx->cpi)) return VPX_CODEC_ERROR; +#endif ctx->cpi->common.error.setjmp = 0; return VPX_CODEC_OK; } From 7c31749387e0297a16289ba851559df4fd2f935d Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Sat, 30 Sep 2023 17:31:09 -0700 Subject: [PATCH 0919/1000] Declare some "VP8_CONFIG *oxcf" params as const Change-Id: Ia5e8445098e18da5978aacf17281f16252413f17 --- vp8/common/onyx.h | 2 +- vp8/encoder/onyx_if.c | 6 +++--- vp8/encoder/onyx_int.h | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/vp8/common/onyx.h b/vp8/common/onyx.h index 8c35e433e7..7f7f567c6a 100644 --- a/vp8/common/onyx.h +++ b/vp8/common/onyx.h @@ -240,7 +240,7 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf); void vp8_remove_compressor(struct VP8_COMP **comp); void vp8_init_config(struct VP8_COMP *onyx, VP8_CONFIG *oxcf); -void vp8_change_config(struct VP8_COMP *cpi, VP8_CONFIG *oxcf); +void vp8_change_config(struct VP8_COMP *cpi, const VP8_CONFIG *oxcf); int vp8_receive_raw_frame(struct VP8_COMP *cpi, unsigned int frame_flags, YV12_BUFFER_CONFIG *sd, int64_t time_stamp, diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index 890237f7b2..12a99584f9 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -270,7 +270,7 @@ static int rescale(int val, int num, int denom) { return (int)(llval * llnum / llden); } -void vp8_init_temporal_layer_context(VP8_COMP *cpi, VP8_CONFIG *oxcf, +void vp8_init_temporal_layer_context(VP8_COMP *cpi, const VP8_CONFIG *oxcf, const int layer, double prev_layer_framerate) { LAYER_CONTEXT *lc = &cpi->layer_context[layer]; @@ -328,7 +328,7 @@ void vp8_init_temporal_layer_context(VP8_COMP *cpi, VP8_CONFIG *oxcf, // for any "new" layers. For "existing" layers, let them inherit the parameters // from the previous layer state (at the same layer #). In future we may want // to better map the previous layer state(s) to the "new" ones. -void vp8_reset_temporal_layer_change(VP8_COMP *cpi, VP8_CONFIG *oxcf, +void vp8_reset_temporal_layer_change(VP8_COMP *cpi, const VP8_CONFIG *oxcf, const int prev_num_layers) { int i; double prev_layer_framerate = 0; @@ -1412,7 +1412,7 @@ void vp8_update_layer_contexts(VP8_COMP *cpi) { } } -void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) { +void vp8_change_config(VP8_COMP *cpi, const VP8_CONFIG *oxcf) { VP8_COMMON *cm = &cpi->common; int last_w, last_h; unsigned int prev_number_of_layers; diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h index 2c6a55a845..2f06702a1d 100644 --- a/vp8/encoder/onyx_int.h +++ b/vp8/encoder/onyx_int.h @@ -714,9 +714,9 @@ void vp8_initialize_enc(void); void vp8_alloc_compressor_data(VP8_COMP *cpi); int vp8_reverse_trans(int x); -void vp8_reset_temporal_layer_change(VP8_COMP *cpi, VP8_CONFIG *oxcf, +void vp8_reset_temporal_layer_change(VP8_COMP *cpi, const VP8_CONFIG *oxcf, const int prev_num_layers); -void vp8_init_temporal_layer_context(VP8_COMP *cpi, VP8_CONFIG *oxcf, +void vp8_init_temporal_layer_context(VP8_COMP *cpi, const VP8_CONFIG *oxcf, const int layer, double prev_layer_framerate); void vp8_update_layer_contexts(VP8_COMP *cpi); From 9c377eafbedd3911e83ee80793b4fba80710d4e0 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Sat, 7 Oct 2023 07:06:36 -0700 Subject: [PATCH 0920/1000] Handle Arm/AArch64 runtime feature detection Port the following libaom CLs to libvpx: https://aomedia-review.googlesource.com/c/aom/+/178361 https://aomedia-review.googlesource.com/c/aom/+/180701 https://aomedia-review.googlesource.com/c/aom/+/181821 The tests themselves are not feature-gated in the same way that they are used in the rest of the codebase since they are not controlled by rtcd.pl. This means that tests that assume the existence of features not present on the target can cause SIGILL to be thrown. This commit extends init_vpx_test.cc to match the behaviour for other targets and automatically disable testing for features that are not available on the machine running the tests. Call arm_cpu_caps() and x86_simd_caps() inside #if !CONFIG_SHARED. All the SIMD-specialized functions (arm or x86) are internal functions, so they are not exported from the libvpx shared library. If CONFIG_SHARED is 1, it is not necessary to call arm_cpu_caps(), x86_simd_caps(), and append_negative_gtest_filter() either. Change-Id: I330631816bdb52842020c5aa2a1ad802865cc285 --- test/init_vpx_test.cc | 31 +++++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/test/init_vpx_test.cc b/test/init_vpx_test.cc index 5b40d9e4f7..e88c54f323 100644 --- a/test/init_vpx_test.cc +++ b/test/init_vpx_test.cc @@ -10,9 +10,14 @@ #include "test/init_vpx_test.h" +#include "./vpx_config.h" + +#if !CONFIG_SHARED #include #include "third_party/googletest/src/include/gtest/gtest.h" -#include "./vpx_config.h" +#if VPX_ARCH_ARM +#include "vpx_ports/arm.h" +#endif #if VPX_ARCH_X86 || VPX_ARCH_X86_64 #include "vpx_ports/x86.h" #endif @@ -27,7 +32,7 @@ extern void vpx_dsp_rtcd(); extern void vpx_scale_rtcd(); } -#if VPX_ARCH_X86 || VPX_ARCH_X86_64 +#if VPX_ARCH_ARM || VPX_ARCH_X86 || VPX_ARCH_X86_64 static void append_negative_gtest_filter(const char *str) { std::string filter = GTEST_FLAG_GET(filter); // Negative patterns begin with one '-' followed by a ':' separated list. @@ -35,10 +40,25 @@ static void append_negative_gtest_filter(const char *str) { filter += str; GTEST_FLAG_SET(filter, filter); } -#endif // VPX_ARCH_X86 || VPX_ARCH_X86_64 +#endif // VPX_ARCH_ARM || VPX_ARCH_X86 || VPX_ARCH_X86_64 +#endif // !CONFIG_SHARED namespace libvpx_test { void init_vpx_test() { +#if !CONFIG_SHARED +#if VPX_ARCH_AARCH64 + const int caps = arm_cpu_caps(); + if (!(caps & HAS_NEON_DOTPROD)) { + append_negative_gtest_filter(":NEON_DOTPROD.*:NEON_DOTPROD/*"); + } + if (!(caps & HAS_NEON_I8MM)) { + append_negative_gtest_filter(":NEON_I8MM.*:NEON_I8MM/*"); + } +#elif VPX_ARCH_ARM + const int caps = arm_cpu_caps(); + if (!(caps & HAS_NEON)) append_negative_gtest_filter(":NEON.*:NEON/*"); +#endif // VPX_ARCH_ARM + #if VPX_ARCH_X86 || VPX_ARCH_X86_64 const int simd_caps = x86_simd_caps(); if (!(simd_caps & HAS_MMX)) append_negative_gtest_filter(":MMX.*:MMX/*"); @@ -58,9 +78,8 @@ void init_vpx_test() { } #endif // VPX_ARCH_X86 || VPX_ARCH_X86_64 -#if !CONFIG_SHARED -// Shared library builds don't support whitebox tests -// that exercise internal symbols. + // Shared library builds don't support whitebox tests that exercise internal + // symbols. #if CONFIG_VP8 vp8_rtcd(); #endif // CONFIG_VP8 From 2ab7ba82511eceb24d45361a0ba81a1460f5dbfc Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Tue, 10 Oct 2023 17:43:16 -0400 Subject: [PATCH 0921/1000] Force mode search on 64x64 if no mode is selected A speed feature disable_split_mask (set to 63) could cause no mode and partition to be selected in rd_pick_partition because: -> thresh_mult_sub8x8 all INT_MAX -> All modes skipped for sub8x8 blocks -> found_best_rd is 0 -> break from the loop of 4 sub blocks -> sum_rdc is INT_MAX -> No rd update -> should_encode_sb is 0 -> Propagating to top of the tree -> No partition / mode is selected Bug: b/290499385 Change-Id: Ia655e262f3b32445347ae0aaf1a2d868cea997f3 --- vp9/encoder/vp9_encodeframe.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index 0d03d01c80..67869596b1 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -4419,6 +4419,19 @@ static int rd_pick_partition(VP9_COMP *cpi, ThreadData *td, restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize); } + if (bsize == BLOCK_64X64 && best_rdc.rdcost == INT64_MAX) { + vp9_rd_cost_reset(&this_rdc); + rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc, BLOCK_64X64, + ctx, INT_MAX, INT64_MAX); + ctx->rdcost = this_rdc.rdcost; + vp9_rd_cost_update(partition_mul, x->rddiv, &this_rdc); + if (this_rdc.rdcost < best_rdc.rdcost) { + best_rdc = this_rdc; + should_encode_sb = 1; + pc_tree->partitioning = PARTITION_NONE; + } + } + *rd_cost = best_rdc; if (should_encode_sb && pc_tree->index != 3) { From 0129e64a65594a2af81df5df2ad474dd168a1519 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Mon, 16 Oct 2023 11:22:48 -0400 Subject: [PATCH 0922/1000] Fix ubsan failure caused by left shift of negative Bug: b/305642441 Change-Id: Iddb1572c284161140da48f61b04cf600e5b57ecc --- vp8/encoder/mcomp.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vp8/encoder/mcomp.c b/vp8/encoder/mcomp.c index b92e2135e9..bc150e482b 100644 --- a/vp8/encoder/mcomp.c +++ b/vp8/encoder/mcomp.c @@ -1123,8 +1123,8 @@ int vp8_diamond_search_sad_c(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, } } - this_mv.as_mv.row = best_mv->as_mv.row << 3; - this_mv.as_mv.col = best_mv->as_mv.col << 3; + this_mv.as_mv.row = best_mv->as_mv.row * 8; + this_mv.as_mv.col = best_mv->as_mv.col * 8; return fn_ptr->vf(what, what_stride, best_address, in_what_stride, &thissad) + mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit); @@ -1441,8 +1441,8 @@ int vp8_refining_search_sad_c(MACROBLOCK *x, BLOCK *b, BLOCKD *d, } } - this_mv.as_mv.row = ref_mv->as_mv.row << 3; - this_mv.as_mv.col = ref_mv->as_mv.col << 3; + this_mv.as_mv.row = ref_mv->as_mv.row * 8; + this_mv.as_mv.col = ref_mv->as_mv.col * 8; return fn_ptr->vf(what, what_stride, best_address, in_what_stride, &thissad) + mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit); From e4db6c3aacb3fbcbb939f132915234988f8617c1 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Mon, 16 Oct 2023 16:26:18 -0400 Subject: [PATCH 0923/1000] Cap avg_frame_bandwidth at INT_MAX avg_frame_bandwidth = target_bandwidth / framerate If target_bandwidth is too big and/or framerate is too small (< 1), avg_frame_bandwidth could be overflow Bug: chromium:1492864 Change-Id: I32314da1414b472ae4bf2acdcd81b8a948286146 --- test/encode_api_test.cc | 23 +++++++++++++++++++++++ vp9/encoder/vp9_ratectrl.c | 3 ++- vp9/encoder/vp9_svc_layercontext.c | 12 ++++++++---- 3 files changed, 33 insertions(+), 5 deletions(-) diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc index 6b22febf6e..8e90af9113 100644 --- a/test/encode_api_test.cc +++ b/test/encode_api_test.cc @@ -409,6 +409,29 @@ TEST(EncodeAPI, ConfigResizeChangeThreadCount) { } #if CONFIG_VP9_ENCODER +TEST(EncodeAPI, ConfigLargeTargetBitrateVp9) { + constexpr int kWidth = 16383; + constexpr int kHeight = 16383; + constexpr auto *iface = &vpx_codec_vp9_cx_algo; + SCOPED_TRACE(vpx_codec_iface_name(iface)); + vpx_codec_enc_cfg_t cfg = {}; + struct Encoder { + ~Encoder() { EXPECT_EQ(vpx_codec_destroy(&ctx), VPX_CODEC_OK); } + vpx_codec_ctx_t ctx = {}; + } enc; + + ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK); + // The following setting will cause avg_frame_bandwidth in rate control to be + // larger than INT_MAX + cfg.rc_target_bitrate = INT_MAX; + cfg.g_timebase.den = 1; + cfg.g_timebase.num = 10; + EXPECT_NO_FATAL_FAILURE(InitCodec(*iface, kWidth, kHeight, &enc.ctx, &cfg)); + EXPECT_NO_FATAL_FAILURE(EncodeWithConfig(cfg, &enc.ctx)) + << "target bitrate: " << cfg.rc_target_bitrate << " framerate: " + << static_cast(cfg.g_timebase.den) / cfg.g_timebase.num; +} + class EncodeApiGetTplStatsTest : public ::libvpx_test::EncoderTest, public ::testing::TestWithParam { diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index fe7414687a..7f4761dfab 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -2643,7 +2643,8 @@ void vp9_rc_update_framerate(VP9_COMP *cpi) { RATE_CONTROL *const rc = &cpi->rc; int vbr_max_bits; - rc->avg_frame_bandwidth = (int)(oxcf->target_bandwidth / cpi->framerate); + rc->avg_frame_bandwidth = + (int)VPXMIN(oxcf->target_bandwidth / cpi->framerate, INT_MAX); rc->min_frame_bandwidth = (int)(rc->avg_frame_bandwidth * oxcf->two_pass_vbrmin_section / 100); diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c index 24fd818133..0df34bf459 100644 --- a/vp9/encoder/vp9_svc_layercontext.c +++ b/vp9/encoder/vp9_svc_layercontext.c @@ -232,7 +232,8 @@ void vp9_update_layer_context_change_config(VP9_COMP *const cpi, VPXMIN(lrc->bits_off_target, lrc->maximum_buffer_size); lrc->buffer_level = VPXMIN(lrc->buffer_level, lrc->maximum_buffer_size); lc->framerate = cpi->framerate / oxcf->ts_rate_decimator[tl]; - lrc->avg_frame_bandwidth = (int)(lc->target_bandwidth / lc->framerate); + lrc->avg_frame_bandwidth = + (int)VPXMIN(lc->target_bandwidth / lc->framerate, INT_MAX); lrc->max_frame_bandwidth = rc->max_frame_bandwidth; lrc->worst_quality = rc->worst_quality; lrc->best_quality = rc->best_quality; @@ -272,7 +273,8 @@ void vp9_update_layer_context_change_config(VP9_COMP *const cpi, } else { lc->framerate = cpi->framerate; } - lrc->avg_frame_bandwidth = (int)(lc->target_bandwidth / lc->framerate); + lrc->avg_frame_bandwidth = + (int)VPXMIN(lc->target_bandwidth / lc->framerate, INT_MAX); lrc->max_frame_bandwidth = rc->max_frame_bandwidth; // Update qp-related quantities. lrc->worst_quality = rc->worst_quality; @@ -314,7 +316,8 @@ void vp9_update_temporal_layer_framerate(VP9_COMP *const cpi) { const int tl = svc->temporal_layer_id; lc->framerate = cpi->framerate / oxcf->ts_rate_decimator[tl]; - lrc->avg_frame_bandwidth = (int)(lc->target_bandwidth / lc->framerate); + lrc->avg_frame_bandwidth = + (int)VPXMIN(lc->target_bandwidth / lc->framerate, INT_MAX); lrc->max_frame_bandwidth = cpi->rc.max_frame_bandwidth; // Update the average layer frame size (non-cumulative per-frame-bw). if (tl == 0) { @@ -336,7 +339,8 @@ void vp9_update_spatial_layer_framerate(VP9_COMP *const cpi, double framerate) { RATE_CONTROL *const lrc = &lc->rc; lc->framerate = framerate; - lrc->avg_frame_bandwidth = (int)(lc->target_bandwidth / lc->framerate); + lrc->avg_frame_bandwidth = + (int)VPXMIN(lc->target_bandwidth / lc->framerate, INT_MAX); lrc->min_frame_bandwidth = (int)(lrc->avg_frame_bandwidth * oxcf->two_pass_vbrmin_section / 100); lrc->max_frame_bandwidth = (int)(((int64_t)lrc->avg_frame_bandwidth * From 424723dc025ce451dab9568239a46b18d0919b4d Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Thu, 19 Oct 2023 10:06:50 -0400 Subject: [PATCH 0924/1000] Run bitrate overflow test only on 64bit systems Frame size caps the target bitrate internally, so the frame size needs to be large enough to reproduce the target bitrate overflow in the fuzzing test. However the frame size needed exceeds the max buffer allowed on 32bit system defined by VPX_MAX_ALLOCABLE_MEMORY Bug: chromium:1492864 Change-Id: Ia3a9a78cd35516373897039a7769b492e29e8450 --- test/encode_api_test.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc index 8e90af9113..cf89fd1f24 100644 --- a/test/encode_api_test.cc +++ b/test/encode_api_test.cc @@ -409,6 +409,7 @@ TEST(EncodeAPI, ConfigResizeChangeThreadCount) { } #if CONFIG_VP9_ENCODER +#if VPX_ARCH_X86_64 || VPX_ARCH_AARCH64 TEST(EncodeAPI, ConfigLargeTargetBitrateVp9) { constexpr int kWidth = 16383; constexpr int kHeight = 16383; @@ -431,6 +432,7 @@ TEST(EncodeAPI, ConfigLargeTargetBitrateVp9) { << "target bitrate: " << cfg.rc_target_bitrate << " framerate: " << static_cast(cfg.g_timebase.den) / cfg.g_timebase.num; } +#endif // VPX_ARCH_X86_64 || VPX_ARCH_AARCH64 class EncodeApiGetTplStatsTest : public ::libvpx_test::EncoderTest, From 9004ace97802e2e6bb3de67952fd140c95d43e6e Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Thu, 19 Oct 2023 18:28:48 -0700 Subject: [PATCH 0925/1000] Also test VPX_ARCH_AARCH64 for 64-bit platforms Change-Id: Ic11ccd791ff78801e0aba1d12ad2d99b9941ce9d --- test/realtime_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/realtime_test.cc b/test/realtime_test.cc index 88e510fd0d..a9870b3cbf 100644 --- a/test/realtime_test.cc +++ b/test/realtime_test.cc @@ -95,7 +95,7 @@ TEST_P(RealtimeTest, IntegerOverflow) { TestIntegerOverflow(2048, 2048); } TEST_P(RealtimeTest, IntegerOverflowLarge) { if (IsVP9()) { -#if VPX_ARCH_X86_64 +#if VPX_ARCH_AARCH64 || VPX_ARCH_X86_64 TestIntegerOverflow(16384, 16384); #else TestIntegerOverflow(4096, 4096); From 352f9f64df62c8673f54651a90cd6d3935c34c6f Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Fri, 20 Oct 2023 14:25:14 -0400 Subject: [PATCH 0926/1000] Reduce memory usage of test with large frame size - Use smaller frame size that still triggers the overflow - Do not run encoder as the encoder init also triggers the overflow Bug: chromium:1492864 Change-Id: I392549abf69f1cfb3754cc847a214513ec9bedc5 --- test/encode_api_test.cc | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc index cf89fd1f24..270be3679e 100644 --- a/test/encode_api_test.cc +++ b/test/encode_api_test.cc @@ -409,10 +409,12 @@ TEST(EncodeAPI, ConfigResizeChangeThreadCount) { } #if CONFIG_VP9_ENCODER +// Frame size needed to trigger the overflow exceeds the max buffer allowed on +// 32-bit systems defined by VPX_MAX_ALLOCABLE_MEMORY #if VPX_ARCH_X86_64 || VPX_ARCH_AARCH64 TEST(EncodeAPI, ConfigLargeTargetBitrateVp9) { - constexpr int kWidth = 16383; - constexpr int kHeight = 16383; + constexpr int kWidth = 12383; + constexpr int kHeight = 8192; constexpr auto *iface = &vpx_codec_vp9_cx_algo; SCOPED_TRACE(vpx_codec_iface_name(iface)); vpx_codec_enc_cfg_t cfg = {}; @@ -425,10 +427,11 @@ TEST(EncodeAPI, ConfigLargeTargetBitrateVp9) { // The following setting will cause avg_frame_bandwidth in rate control to be // larger than INT_MAX cfg.rc_target_bitrate = INT_MAX; + // Framerate 0.1 (equivalent to timebase 10) is the smallest framerate allowed + // by libvpx cfg.g_timebase.den = 1; cfg.g_timebase.num = 10; - EXPECT_NO_FATAL_FAILURE(InitCodec(*iface, kWidth, kHeight, &enc.ctx, &cfg)); - EXPECT_NO_FATAL_FAILURE(EncodeWithConfig(cfg, &enc.ctx)) + EXPECT_NO_FATAL_FAILURE(InitCodec(*iface, kWidth, kHeight, &enc.ctx, &cfg)) << "target bitrate: " << cfg.rc_target_bitrate << " framerate: " << static_cast(cfg.g_timebase.den) / cfg.g_timebase.num; } From 6457f065290f8114930204df33957388758c7a43 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Tue, 24 Oct 2023 08:37:27 +0100 Subject: [PATCH 0927/1000] Add Arm SVE build flags and run-time CPU feature detection Add 'sve' arch options to the configure, build and unit test files - adding appropriate conditional options where necessary. Arm SIMD extensions are treated as supersets in libvpx, so disable SVE if either Neon DotProd or I8MM are unavailable. Change-Id: I39dd24f2b209251084d1e28d7ac68099460309bb --- build/make/Makefile | 2 ++ build/make/rtcd.pl | 2 +- configure | 1 + test/init_vpx_test.cc | 3 +++ vpx_ports/aarch64_cpudetect.c | 25 ++++++++++++++++++++++++- vpx_ports/arm.h | 2 ++ 6 files changed, 33 insertions(+), 2 deletions(-) diff --git a/build/make/Makefile b/build/make/Makefile index c2dc47ccff..199ed78058 100644 --- a/build/make/Makefile +++ b/build/make/Makefile @@ -148,6 +148,8 @@ $(BUILD_PFX)%_neon_dotprod.c.d: CFLAGS += -march=armv8.2-a+dotprod $(BUILD_PFX)%_neon_dotprod.c.o: CFLAGS += -march=armv8.2-a+dotprod $(BUILD_PFX)%_neon_i8mm.c.d: CFLAGS += -march=armv8.2-a+dotprod+i8mm $(BUILD_PFX)%_neon_i8mm.c.o: CFLAGS += -march=armv8.2-a+dotprod+i8mm +$(BUILD_PFX)%_sve.c.d: CFLAGS += -march=armv8.2-a+dotprod+i8mm+sve +$(BUILD_PFX)%_sve.c.o: CFLAGS += -march=armv8.2-a+dotprod+i8mm+sve # POWER $(BUILD_PFX)%_vsx.c.d: CFLAGS += -maltivec -mvsx diff --git a/build/make/rtcd.pl b/build/make/rtcd.pl index 1a6b93d5ae..0b9e16738e 100755 --- a/build/make/rtcd.pl +++ b/build/make/rtcd.pl @@ -487,7 +487,7 @@ () @ALL_ARCHS = filter(qw/neon_asm neon/); arm; } elsif ($opts{arch} eq 'armv8' || $opts{arch} eq 'arm64' ) { - @ALL_ARCHS = filter(qw/neon neon_dotprod neon_i8mm/); + @ALL_ARCHS = filter(qw/neon neon_dotprod neon_i8mm sve/); @REQUIRES = filter(qw/neon/); &require(@REQUIRES); arm; diff --git a/configure b/configure index 2c638e5e5a..434c43792c 100755 --- a/configure +++ b/configure @@ -257,6 +257,7 @@ ARCH_EXT_LIST_AARCH64=" neon neon_dotprod neon_i8mm + sve " ARCH_EXT_LIST_X86=" diff --git a/test/init_vpx_test.cc b/test/init_vpx_test.cc index e88c54f323..f66f00b5c1 100644 --- a/test/init_vpx_test.cc +++ b/test/init_vpx_test.cc @@ -54,6 +54,9 @@ void init_vpx_test() { if (!(caps & HAS_NEON_I8MM)) { append_negative_gtest_filter(":NEON_I8MM.*:NEON_I8MM/*"); } + if (!(caps & HAS_SVE)) { + append_negative_gtest_filter(":SVE.*:SVE/*"); + } #elif VPX_ARCH_ARM const int caps = arm_cpu_caps(); if (!(caps & HAS_NEON)) append_negative_gtest_filter(":NEON.*:NEON/*"); diff --git a/vpx_ports/aarch64_cpudetect.c b/vpx_ports/aarch64_cpudetect.c index ac68f44452..f56d5888ba 100644 --- a/vpx_ports/aarch64_cpudetect.c +++ b/vpx_ports/aarch64_cpudetect.c @@ -77,7 +77,7 @@ static int arm_get_cpu_caps(void) { } #endif // defined(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE) #endif // HAVE_NEON_DOTPROD - // No I8MM feature detection available on Windows at time of writing. + // No I8MM or SVE feature detection available on Windows at time of writing. return flags; } @@ -98,6 +98,7 @@ static int arm_get_cpu_caps(void) { // Define hwcap values ourselves: building with an old auxv header where these // hwcap values are not defined should not prevent features from being enabled. #define VPX_AARCH64_HWCAP_ASIMDDP (1 << 20) +#define VPX_AARCH64_HWCAP_SVE (1 << 22) #define VPX_AARCH64_HWCAP2_I8MM (1 << 13) static int arm_get_cpu_caps(void) { @@ -117,6 +118,11 @@ static int arm_get_cpu_caps(void) { flags |= HAS_NEON_I8MM; } #endif // HAVE_NEON_I8MM +#if HAVE_SVE + if (hwcap & VPX_AARCH64_HWCAP_SVE) { + flags |= HAS_SVE; + } +#endif // HAVE_SVE return flags; } @@ -129,6 +135,10 @@ static int arm_get_cpu_caps(void) { #ifndef ZX_ARM64_FEATURE_ISA_I8MM #define ZX_ARM64_FEATURE_ISA_I8MM ((uint32_t)(1u << 19)) #endif +// Added in https://fuchsia-review.googlesource.com/c/fuchsia/+/895083. +#ifndef ZX_ARM64_FEATURE_ISA_SVE +#define ZX_ARM64_FEATURE_ISA_SVE ((uint32_t)(1u << 20)) +#endif static int arm_get_cpu_caps(void) { int flags = 0; @@ -150,6 +160,11 @@ static int arm_get_cpu_caps(void) { flags |= HAS_NEON_I8MM; } #endif // HAVE_NEON_I8MM +#if HAVE_SVE + if (features & ZX_ARM64_FEATURE_ISA_SVE) { + flags |= HAS_SVE; + } +#endif // HAVE_SVE return flags; } @@ -170,5 +185,13 @@ int arm_cpu_caps(void) { flags &= ~HAS_NEON_I8MM; } + // Restrict flags: FEAT_SVE assumes that FEAT_{DotProd,I8MM} are available. + if (!(flags & HAS_NEON_DOTPROD)) { + flags &= ~HAS_SVE; + } + if (!(flags & HAS_NEON_I8MM)) { + flags &= ~HAS_SVE; + } + return flags; } diff --git a/vpx_ports/arm.h b/vpx_ports/arm.h index 65909d8260..39365d18ee 100644 --- a/vpx_ports/arm.h +++ b/vpx_ports/arm.h @@ -23,6 +23,8 @@ extern "C" { #define HAS_NEON_DOTPROD (1 << 1) // Armv8.2-A optional Neon i8mm instructions, mandatory from Armv8.6-A. #define HAS_NEON_I8MM (1 << 2) +// Armv8.2-A optional SVE instructions, mandatory from Armv9.0-A. +#define HAS_SVE (1 << 3) int arm_cpu_caps(void); From b759032a0ed2b57ea3412f6820eda377a2dad480 Mon Sep 17 00:00:00 2001 From: Marco Paniconi Date: Fri, 27 Oct 2023 09:24:06 -0700 Subject: [PATCH 0928/1000] Clear some clang-tidy complaints on header includes Change-Id: Id6f54dc4643172f6a5576dc4846c47c8eda31c0f --- vp8/encoder/encodeframe.c | 6 ++++-- vp8/encoder/ethreading.c | 1 + vp9/encoder/vp9_encoder.c | 2 ++ 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c index 3f0319a54b..5c973940ec 100644 --- a/vp8/encoder/encodeframe.c +++ b/vp8/encoder/encodeframe.c @@ -7,6 +7,8 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ +#include +#include #include "vpx_config.h" #include "vp8_rtcd.h" @@ -29,9 +31,9 @@ #include "rdopt.h" #include "pickinter.h" #include "vp8/common/findnearmv.h" -#include -#include #include "vp8/common/invtrans.h" +#include "vpx/internal/vpx_codec_internal.h" +#include "vpx_mem/vpx_mem.h" #include "vpx_ports/vpx_timer.h" #if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING #include "bitstream.h" diff --git a/vp8/encoder/ethreading.c b/vp8/encoder/ethreading.c index 9993905567..e2f8b89d46 100644 --- a/vp8/encoder/ethreading.c +++ b/vp8/encoder/ethreading.c @@ -7,6 +7,7 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ +#include #include "onyx_int.h" #include "vp8/common/threading.h" diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 993e6310eb..cac35a97ef 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -24,6 +24,7 @@ #if CONFIG_INTERNAL_STATS #include "vpx_dsp/ssim.h" #endif +#include "vpx_mem/vpx_mem.h" #include "vpx_ports/mem.h" #include "vpx_ports/system_state.h" #include "vpx_ports/vpx_once.h" @@ -33,6 +34,7 @@ #endif // CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG #include "vp9/common/vp9_alloccommon.h" +#include "vp9/common/vp9_blockd.h" #include "vp9/common/vp9_filter.h" #include "vp9/common/vp9_idct.h" #if CONFIG_VP9_POSTPROC From 61c927a4ede8b43b29091e3ee9f993eea3b3156a Mon Sep 17 00:00:00 2001 From: Xiahong Bao Date: Sat, 28 Oct 2023 08:52:04 +0900 Subject: [PATCH 0929/1000] calc_pframe_target_size: fix integer overflow The intermediate value in the target bandwidth calculation may exceed integer bounds. Bug: 308007926 Change-Id: I8288c5820db06a550d88bf91fccc86106996deaa Signed-off-by: Xiahong Bao --- vp8/encoder/ratectrl.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vp8/encoder/ratectrl.c b/vp8/encoder/ratectrl.c index 6f14322fdc..fcd4eb04eb 100644 --- a/vp8/encoder/ratectrl.c +++ b/vp8/encoder/ratectrl.c @@ -719,7 +719,8 @@ static void calc_pframe_target_size(VP8_COMP *cpi) { } /* lower the target bandwidth for this frame. */ - cpi->this_frame_target -= (cpi->this_frame_target * percent_low) / 200; + cpi->this_frame_target -= + (int)(((int64_t)cpi->this_frame_target * percent_low) / 200); /* Are we using allowing control of active_worst_allowed_q * according to buffer level. From 3f3576098ffcc6cf5b44835e2fc1414c227de6cd Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Mon, 30 Oct 2023 15:24:12 +0000 Subject: [PATCH 0930/1000] Fix 'unused variable' warning when neon_i8mm is disabled Guard hwcap2 feature interrogation on HAVE_NEON_I8MM so that it gets disabled if neon_i8mm is disabled when configuring the build. Bug: webm:1825 Change-Id: Ic6ff71f17387b96219591928a583d43560bb7c7a --- vpx_ports/aarch64_cpudetect.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vpx_ports/aarch64_cpudetect.c b/vpx_ports/aarch64_cpudetect.c index f56d5888ba..539d09bb39 100644 --- a/vpx_ports/aarch64_cpudetect.c +++ b/vpx_ports/aarch64_cpudetect.c @@ -104,7 +104,9 @@ static int arm_get_cpu_caps(void) { static int arm_get_cpu_caps(void) { int flags = 0; unsigned long hwcap = getauxval(AT_HWCAP); +#if HAVE_NEON_I8MM unsigned long hwcap2 = getauxval(AT_HWCAP2); +#endif // HAVE_NEON_I8MM #if HAVE_NEON flags |= HAS_NEON; // Neon is mandatory in Armv8.0-A. #endif // HAVE_NEON From 0d3ef6ffd22bda0ba1ec1bf9c7a24852e4a1d111 Mon Sep 17 00:00:00 2001 From: Marco Paniconi Date: Tue, 24 Oct 2023 11:03:56 -0700 Subject: [PATCH 0931/1000] vp9-RC: Add drop_frame support to external RC Supports single layer and svc. For svc only the framedrop_mode = FULL_SUPERFRAME_DROP is allowed for now. Dropping frames due to overshoot is enabled by the oxcf->drop_frames_water_mark, which is zero as default. Note that this CL also allows for drop/skip encoding of enhancement layers if that layer bitrate is zero. max_consec_drop is also added, set to INT_MAX as default. Note that max_consec_drop is only used for svc mode. It has not been added yet for single layer in libvpx encoder. Tests added for single layer and svc case. Change-Id: Ic12f6a0eb3fbf07d8eb8456c46cec27b2e1930d3 --- test/vp9_ratectrl_rtc_test.cc | 104 ++++++++++++++++++++++++++--- vp9/encoder/vp9_encoder.c | 21 +----- vp9/encoder/vp9_ratectrl.c | 4 +- vp9/encoder/vp9_ratectrl.h | 2 + vp9/encoder/vp9_svc_layercontext.c | 30 ++++++++- vp9/encoder/vp9_svc_layercontext.h | 2 + vp9/ratectrl_rtc.cc | 52 ++++++++++++++- vp9/ratectrl_rtc.h | 14 +++- vpx/internal/vpx_ratectrl_rtc.h | 2 + 9 files changed, 192 insertions(+), 39 deletions(-) diff --git a/test/vp9_ratectrl_rtc_test.cc b/test/vp9_ratectrl_rtc_test.cc index b76fd3624c..ff718bbaa7 100644 --- a/test/vp9_ratectrl_rtc_test.cc +++ b/test/vp9_ratectrl_rtc_test.cc @@ -31,6 +31,7 @@ const int kTemporalId2Layer[2] = { 0, 1 }; const int kTemporalRateAllocation3Layer[3] = { 50, 70, 100 }; const int kTemporalRateAllocation2Layer[2] = { 60, 100 }; const int kSpatialLayerBitrate[3] = { 200, 400, 1000 }; +const int kSpatialLayerBitrateLow[3] = { 50, 100, 400 }; class RcInterfaceTest : public ::libvpx_test::EncoderTest, @@ -38,7 +39,7 @@ class RcInterfaceTest public: RcInterfaceTest() : EncoderTest(GET_PARAM(0)), aq_mode_(GET_PARAM(1)), key_interval_(3000), - encoder_exit_(false) {} + encoder_exit_(false), frame_drop_thresh_(0), num_drops_(0) {} ~RcInterfaceTest() override = default; @@ -76,9 +77,12 @@ class RcInterfaceTest int loopfilter_level, qp; encoder->Control(VP9E_GET_LOOPFILTER_LEVEL, &loopfilter_level); encoder->Control(VP8E_GET_LAST_QUANTIZER, &qp); - rc_api_->ComputeQP(frame_params_); - ASSERT_EQ(rc_api_->GetQP(), qp); - ASSERT_EQ(rc_api_->GetLoopfilterLevel(), loopfilter_level); + if (rc_api_->ComputeQP(frame_params_) == libvpx::FrameDropDecision::kOk) { + ASSERT_EQ(rc_api_->GetQP(), qp); + ASSERT_EQ(rc_api_->GetLoopfilterLevel(), loopfilter_level); + } else { + num_drops_++; + } } void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { @@ -97,6 +101,29 @@ class RcInterfaceTest ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); } + void RunOneLayerDropFramesCBR() { + if (GET_PARAM(2) != VPX_CBR) { + GTEST_SKIP() << "Frame dropping is only for CBR mode."; + } + frame_drop_thresh_ = 30; + SetConfig(GET_PARAM(2)); + // Use lower bitrate, lower max-q, and enable frame dropper. + rc_cfg_.target_bandwidth = 200; + cfg_.rc_target_bitrate = 200; + rc_cfg_.max_quantizer = 50; + cfg_.rc_max_quantizer = 50; + rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_); + frame_params_.spatial_layer_id = 0; + frame_params_.temporal_layer_id = 0; + + ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv", + 1280, 720, 30, 1, 0, kNumFrames); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + // Check that some frames were dropped, otherwise test has no value. + ASSERT_GE(num_drops_, 1); + } + void RunOneLayerVBRPeriodicKey() { if (GET_PARAM(2) != VPX_VBR) return; key_interval_ = 100; @@ -134,6 +161,7 @@ class RcInterfaceTest rc_cfg_.min_quantizers[0] = 2; rc_cfg_.rc_mode = rc_mode; rc_cfg_.aq_mode = aq_mode_; + rc_cfg_.frame_drop_thresh = frame_drop_thresh_; // Encoder settings for ground truth. cfg_.g_w = 1280; @@ -152,6 +180,7 @@ class RcInterfaceTest cfg_.rc_target_bitrate = 1000; cfg_.kf_min_dist = key_interval_; cfg_.kf_max_dist = key_interval_; + cfg_.rc_dropframe_thresh = frame_drop_thresh_; } std::unique_ptr rc_api_; @@ -160,6 +189,8 @@ class RcInterfaceTest int key_interval_; libvpx::VP9FrameParamsQpRTC frame_params_; bool encoder_exit_; + int frame_drop_thresh_; + int num_drops_; }; class RcInterfaceSvcTest @@ -169,7 +200,8 @@ class RcInterfaceSvcTest RcInterfaceSvcTest() : EncoderTest(GET_PARAM(0)), aq_mode_(GET_PARAM(1)), key_interval_(3000), dynamic_spatial_layers_(0), inter_layer_pred_off_(GET_PARAM(2)), - parallel_spatial_layers_(false) {} + parallel_spatial_layers_(false), frame_drop_thresh_(0), + max_consec_drop_(INT_MAX), num_drops_(0) {} ~RcInterfaceSvcTest() override = default; protected: @@ -181,6 +213,7 @@ class RcInterfaceSvcTest void PreEncodeFrameHook(libvpx_test::VideoSource *video, ::libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { + current_superframe_ = 0; encoder->Control(VP8E_SET_CPUUSED, 7); encoder->Control(VP9E_SET_AQ_MODE, aq_mode_); encoder->Control(VP9E_SET_TUNE_CONTENT, 0); @@ -192,12 +225,19 @@ class RcInterfaceSvcTest encoder->Control(VP9E_SET_SVC_INTER_LAYER_PRED, INTER_LAYER_PRED_OFF_NONKEY); } + if (frame_drop_thresh_ > 0) { + vpx_svc_frame_drop_t svc_drop_frame; + svc_drop_frame.framedrop_mode = FULL_SUPERFRAME_DROP; + for (int sl = 0; sl < rc_cfg_.ss_number_layers; ++sl) + svc_drop_frame.framedrop_thresh[sl] = frame_drop_thresh_; + svc_drop_frame.max_consec_drop = max_consec_drop_; + encoder->Control(VP9E_SET_SVC_FRAME_DROP_LAYER, &svc_drop_frame); + } } frame_params_.frame_type = video->frame() % key_interval_ == 0 ? libvpx::RcFrameType::kKeyFrame : libvpx::RcFrameType::kInterFrame; encoder_exit_ = video->frame() == kNumFrames; - current_superframe_ = video->frame(); if (dynamic_spatial_layers_ == 1) { if (video->frame() == 100) { // Go down to 2 spatial layers: set top SL to 0 bitrate. @@ -257,24 +297,38 @@ class RcInterfaceSvcTest } void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) override { + if (encoder_exit_) { + return; + } + int superframe_is_dropped = false; ::libvpx_test::CxDataIterator iter = encoder->GetCxData(); for (int sl = 0; sl < rc_cfg_.ss_number_layers; sl++) sizes_[sl] = 0; std::vector rc_qp; + // For FULL_SUPERFRAME_DROP: the full superframe drop decision is + // determined on the base spatial layer. + SetFrameParamsSvc(0); + if (rc_api_->ComputeQP(frame_params_) == libvpx::FrameDropDecision::kDrop) { + superframe_is_dropped = true; + num_drops_++; + } while (const vpx_codec_cx_pkt_t *pkt = iter.Next()) { + ASSERT_EQ(superframe_is_dropped, false); ParseSuperframeSizes(static_cast(pkt->data.frame.buf), pkt->data.frame.sz); if (!parallel_spatial_layers_ || current_superframe_ == 0) { for (int sl = 0; sl < rc_cfg_.ss_number_layers; sl++) { if (sizes_[sl] > 0) { SetFrameParamsSvc(sl); - rc_api_->ComputeQP(frame_params_); + // For sl=0 ComputeQP() is already called above (line 310). + if (sl > 0) rc_api_->ComputeQP(frame_params_); rc_api_->PostEncodeUpdate(sizes_[sl], frame_params_); rc_qp.push_back(rc_api_->GetQP()); } } } else { for (int sl = 0; sl < rc_cfg_.ss_number_layers; sl++) { - if (sizes_[sl] > 0) { + // For sl=0 ComputeQP() is already called above (line 310). + if (sizes_[sl] > 0 && sl > 0) { SetFrameParamsSvc(sl); rc_api_->ComputeQP(frame_params_); } @@ -288,7 +342,7 @@ class RcInterfaceSvcTest } } } - if (!encoder_exit_) { + if (!superframe_is_dropped) { int loopfilter_level; std::vector encoder_qp(VPX_SS_MAX_LAYERS, 0); encoder->Control(VP9E_GET_LOOPFILTER_LEVEL, &loopfilter_level); @@ -296,6 +350,7 @@ class RcInterfaceSvcTest encoder_qp.resize(rc_qp.size()); ASSERT_EQ(rc_qp, encoder_qp); ASSERT_EQ(rc_api_->GetLoopfilterLevel(), loopfilter_level); + current_superframe_++; } } // This method needs to be overridden because non-reference frames are @@ -315,6 +370,21 @@ class RcInterfaceSvcTest ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); } + void RunSvcDropFramesCBR() { + max_consec_drop_ = 10; + frame_drop_thresh_ = 30; + SetRCConfigSvc(3, 3); + rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_); + SetEncoderConfigSvc(3, 3); + + ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv", + 1280, 720, 30, 1, 0, kNumFrames); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + // Check that some frames were dropped, otherwise test has no value. + ASSERT_GE(num_drops_, 1); + } + void RunSvcPeriodicKey() { SetRCConfigSvc(3, 3); key_interval_ = 100; @@ -438,12 +508,14 @@ class RcInterfaceSvcTest cfg_.kf_max_dist = 9999; cfg_.rc_overshoot_pct = 50; cfg_.rc_undershoot_pct = 50; + cfg_.rc_dropframe_thresh = frame_drop_thresh_; cfg_.rc_target_bitrate = 0; for (int sl = 0; sl < number_spatial_layers; sl++) { int spatial_bitrate = 0; if (number_spatial_layers <= 3) - spatial_bitrate = kSpatialLayerBitrate[sl]; + spatial_bitrate = frame_drop_thresh_ > 0 ? kSpatialLayerBitrateLow[sl] + : kSpatialLayerBitrate[sl]; for (int tl = 0; tl < number_temporal_layers; tl++) { int layer = sl * number_temporal_layers + tl; if (number_temporal_layers == 3) @@ -478,6 +550,8 @@ class RcInterfaceSvcTest rc_cfg_.framerate = 30.0; rc_cfg_.rc_mode = VPX_CBR; rc_cfg_.aq_mode = aq_mode_; + rc_cfg_.frame_drop_thresh = frame_drop_thresh_; + rc_cfg_.max_consec_drop = max_consec_drop_; if (number_spatial_layers == 3) { rc_cfg_.scaling_factor_num[0] = 1; @@ -511,7 +585,8 @@ class RcInterfaceSvcTest for (int sl = 0; sl < number_spatial_layers; sl++) { int spatial_bitrate = 0; if (number_spatial_layers <= 3) - spatial_bitrate = kSpatialLayerBitrate[sl]; + spatial_bitrate = frame_drop_thresh_ > 0 ? kSpatialLayerBitrateLow[sl] + : kSpatialLayerBitrate[sl]; for (int tl = 0; tl < number_temporal_layers; tl++) { int layer = sl * number_temporal_layers + tl; if (number_temporal_layers == 3) @@ -548,14 +623,21 @@ class RcInterfaceSvcTest bool inter_layer_pred_off_; // ComputeQP() and PostEncodeUpdate() don't need to be sequential for KSVC. bool parallel_spatial_layers_; + int frame_drop_thresh_; + int max_consec_drop_; + int num_drops_; }; TEST_P(RcInterfaceTest, OneLayer) { RunOneLayer(); } +TEST_P(RcInterfaceTest, OneLayerDropFramesCBR) { RunOneLayerDropFramesCBR(); } + TEST_P(RcInterfaceTest, OneLayerVBRPeriodicKey) { RunOneLayerVBRPeriodicKey(); } TEST_P(RcInterfaceSvcTest, Svc) { RunSvc(); } +TEST_P(RcInterfaceSvcTest, SvcDropFramesCBR) { RunSvcDropFramesCBR(); } + TEST_P(RcInterfaceSvcTest, SvcParallelSpatialLayers) { RunSvcParallelSpatialLayers(); } diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index cac35a97ef..e1a4d986b7 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -5517,26 +5517,7 @@ static void encode_frame_to_data_rate( struct segmentation *const seg = &cm->seg; TX_SIZE t; - // SVC: skip encoding of enhancement layer if the layer target bandwidth = 0. - // No need to set svc.skip_enhancement_layer if whole superframe will be - // dropped. - if (cpi->use_svc && cpi->svc.spatial_layer_id > 0 && - cpi->oxcf.target_bandwidth == 0 && - !(cpi->svc.framedrop_mode != LAYER_DROP && - (cpi->svc.framedrop_mode != CONSTRAINED_FROM_ABOVE_DROP || - cpi->svc - .force_drop_constrained_from_above[cpi->svc.number_spatial_layers - - 1]) && - cpi->svc.drop_spatial_layer[0])) { - cpi->svc.skip_enhancement_layer = 1; - vp9_rc_postencode_update_drop_frame(cpi); - cpi->ext_refresh_frame_flags_pending = 0; - cpi->last_frame_dropped = 1; - cpi->svc.last_layer_dropped[cpi->svc.spatial_layer_id] = 1; - cpi->svc.drop_spatial_layer[cpi->svc.spatial_layer_id] = 1; - vp9_inc_frame_in_layer(cpi); - return; - } + if (vp9_svc_check_skip_enhancement_layer(cpi)) return; set_ext_overrides(cpi); vpx_clear_system_state(); diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index 7f4761dfab..e02b2892ac 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -260,7 +260,7 @@ void vp9_update_buffer_level_preencode(VP9_COMP *cpi) { // for the layered rate control which involves cumulative buffer levels for // the temporal layers. Allow for using the timestamp(pts) delta for the // framerate when the set_ref_frame_config is used. -static void update_buffer_level_svc_preencode(VP9_COMP *cpi) { +void vp9_update_buffer_level_svc_preencode(VP9_COMP *cpi) { SVC *const svc = &cpi->svc; int i; // Set this to 1 to use timestamp delta for "framerate" under @@ -2445,7 +2445,7 @@ void vp9_rc_get_svc_params(VP9_COMP *cpi) { vp9_cyclic_refresh_update_parameters(cpi); vp9_rc_set_frame_target(cpi, target); - if (cm->show_frame) update_buffer_level_svc_preencode(cpi); + if (cm->show_frame) vp9_update_buffer_level_svc_preencode(cpi); if (cpi->oxcf.resize_mode == RESIZE_DYNAMIC && svc->single_layer_svc == 1 && svc->spatial_layer_id == svc->first_spatial_layer_to_encode && diff --git a/vp9/encoder/vp9_ratectrl.h b/vp9/encoder/vp9_ratectrl.h index 96a8fd3f1d..48c49e937e 100644 --- a/vp9/encoder/vp9_ratectrl.h +++ b/vp9/encoder/vp9_ratectrl.h @@ -350,6 +350,8 @@ void vp9_estimate_qp_gop(struct VP9_COMP *cpi); void vp9_compute_frame_low_motion(struct VP9_COMP *const cpi); +void vp9_update_buffer_level_svc_preencode(struct VP9_COMP *cpi); + #ifdef __cplusplus } // extern "C" #endif diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c index 0df34bf459..fff6d25de0 100644 --- a/vp9/encoder/vp9_svc_layercontext.c +++ b/vp9/encoder/vp9_svc_layercontext.c @@ -223,11 +223,11 @@ void vp9_update_layer_context_change_config(VP9_COMP *const cpi, bitrate_alloc = (float)lc->target_bandwidth / target_bandwidth; } lrc->starting_buffer_level = - (int64_t)(rc->starting_buffer_level * bitrate_alloc); + (int64_t)(rc->starting_buffer_level * bitrate_alloc + 0.5); lrc->optimal_buffer_level = - (int64_t)(rc->optimal_buffer_level * bitrate_alloc); + (int64_t)(rc->optimal_buffer_level * bitrate_alloc + 0.5); lrc->maximum_buffer_size = - (int64_t)(rc->maximum_buffer_size * bitrate_alloc); + (int64_t)(rc->maximum_buffer_size * bitrate_alloc + 0.5); lrc->bits_off_target = VPXMIN(lrc->bits_off_target, lrc->maximum_buffer_size); lrc->buffer_level = VPXMIN(lrc->buffer_level, lrc->maximum_buffer_size); @@ -1350,3 +1350,27 @@ void vp9_svc_adjust_avg_frame_qindex(VP9_COMP *const cpi) { } } } + +// SVC: skip encoding of enhancement layer if the layer target bandwidth = 0. +// No need to set svc.skip_enhancement_layer if whole superframe will be +// dropped. +int vp9_svc_check_skip_enhancement_layer(VP9_COMP *const cpi) { + if (cpi->use_svc && cpi->svc.spatial_layer_id > 0 && + cpi->oxcf.target_bandwidth == 0 && + !(cpi->svc.framedrop_mode != LAYER_DROP && + (cpi->svc.framedrop_mode != CONSTRAINED_FROM_ABOVE_DROP || + cpi->svc + .force_drop_constrained_from_above[cpi->svc.number_spatial_layers - + 1]) && + cpi->svc.drop_spatial_layer[0])) { + cpi->svc.skip_enhancement_layer = 1; + vp9_rc_postencode_update_drop_frame(cpi); + cpi->ext_refresh_frame_flags_pending = 0; + cpi->last_frame_dropped = 1; + cpi->svc.last_layer_dropped[cpi->svc.spatial_layer_id] = 1; + cpi->svc.drop_spatial_layer[cpi->svc.spatial_layer_id] = 1; + vp9_inc_frame_in_layer(cpi); + return 1; + } + return 0; +} diff --git a/vp9/encoder/vp9_svc_layercontext.h b/vp9/encoder/vp9_svc_layercontext.h index 90dec5e20a..388a02789d 100644 --- a/vp9/encoder/vp9_svc_layercontext.h +++ b/vp9/encoder/vp9_svc_layercontext.h @@ -281,6 +281,8 @@ void vp9_svc_update_ref_frame(struct VP9_COMP *const cpi); void vp9_svc_adjust_frame_rate(struct VP9_COMP *const cpi); void vp9_svc_adjust_avg_frame_qindex(struct VP9_COMP *const cpi); + +int vp9_svc_check_skip_enhancement_layer(struct VP9_COMP *const cpi); #ifdef __cplusplus } // extern "C" #endif diff --git a/vp9/ratectrl_rtc.cc b/vp9/ratectrl_rtc.cc index d92b095714..d8239718a8 100644 --- a/vp9/ratectrl_rtc.cc +++ b/vp9/ratectrl_rtc.cc @@ -130,6 +130,7 @@ bool VP9RateControlRTC::UpdateRateControl( oxcf->maximum_buffer_size_ms = rc_cfg.buf_sz; oxcf->under_shoot_pct = rc_cfg.undershoot_pct; oxcf->over_shoot_pct = rc_cfg.overshoot_pct; + oxcf->drop_frames_water_mark = rc_cfg.frame_drop_thresh; oxcf->ss_number_layers = rc_cfg.ss_number_layers; oxcf->ts_number_layers = rc_cfg.ts_number_layers; oxcf->temporal_layering_mode = (VP9E_TEMPORAL_LAYERING_MODE)( @@ -172,9 +173,15 @@ bool VP9RateControlRTC::UpdateRateControl( vp9_new_framerate(cpi_, cpi_->framerate); if (cpi_->svc.number_temporal_layers > 1 || cpi_->svc.number_spatial_layers > 1) { - if (cm->current_video_frame == 0) vp9_init_layer_context(cpi_); + if (cm->current_video_frame == 0) { + vp9_init_layer_context(cpi_); + // svc->framedrop_mode is not currently exposed, so only allow for + // full superframe drop for now. + cpi_->svc.framedrop_mode = FULL_SUPERFRAME_DROP; + } vp9_update_layer_context_change_config(cpi_, (int)cpi_->oxcf.target_bandwidth); + cpi_->svc.max_consec_drop = rc_cfg.max_consec_drop; } vp9_check_reset_rc_flag(cpi_); @@ -182,7 +189,11 @@ bool VP9RateControlRTC::UpdateRateControl( return true; } -void VP9RateControlRTC::ComputeQP(const VP9FrameParamsQpRTC &frame_params) { +// Compute the QP for the frame. If the frame is dropped this function +// returns kDrop, and no QP is computed. If the frame is encoded (not dropped) +// the QP is computed and kOk is returned. +FrameDropDecision VP9RateControlRTC::ComputeQP( + const VP9FrameParamsQpRTC &frame_params) { VP9_COMMON *const cm = &cpi_->common; int width, height; cpi_->svc.spatial_layer_id = frame_params.spatial_layer_id; @@ -234,6 +245,36 @@ void VP9RateControlRTC::ComputeQP(const VP9FrameParamsQpRTC &frame_params) { vp9_restore_layer_context(cpi_); vp9_rc_get_svc_params(cpi_); } + if (cpi_->svc.spatial_layer_id == 0) vp9_zero(cpi_->svc.drop_spatial_layer); + // SVC: check for skip encoding of enhancement layer if the + // layer target bandwidth = 0. + if (vp9_svc_check_skip_enhancement_layer(cpi_)) + return FrameDropDecision::kDrop; + // Check for dropping this frame based on buffer level. + // Never drop on key frame, or if base layer is key for svc, + if (!frame_is_intra_only(cm) && + (!cpi_->use_svc || + !cpi_->svc.layer_context[cpi_->svc.temporal_layer_id].is_key_frame)) { + if (vp9_rc_drop_frame(cpi_)) { + // For FULL_SUPERFRAME_DROP mode (the only mode considered here): + // if the superframe drop is decided we need to save the layer context for + // all spatial layers, and call update_buffer_level and postencode_drop + // for all spatial layers. + if (cpi_->svc.number_spatial_layers > 1 || + cpi_->svc.number_temporal_layers > 1) { + vp9_save_layer_context(cpi_); + for (int sl = 1; sl < cpi_->svc.number_spatial_layers; sl++) { + cpi_->svc.spatial_layer_id = sl; + vp9_restore_layer_context(cpi_); + vp9_update_buffer_level_svc_preencode(cpi_); + vp9_rc_postencode_update_drop_frame(cpi_); + vp9_save_layer_context(cpi_); + } + } + return FrameDropDecision::kDrop; + } + } + // Compute the QP for the frame. int bottom_index, top_index; cpi_->common.base_qindex = vp9_rc_pick_q_and_bounds(cpi_, &bottom_index, &top_index); @@ -242,6 +283,13 @@ void VP9RateControlRTC::ComputeQP(const VP9FrameParamsQpRTC &frame_params) { if (cpi_->svc.number_spatial_layers > 1 || cpi_->svc.number_temporal_layers > 1) vp9_save_layer_context(cpi_); + + cpi_->last_frame_dropped = 0; + cpi_->svc.last_layer_dropped[cpi_->svc.spatial_layer_id] = 0; + if (cpi_->svc.spatial_layer_id == cpi_->svc.number_spatial_layers - 1) + cpi_->svc.num_encoded_top_layer++; + + return FrameDropDecision::kOk; } int VP9RateControlRTC::GetQP() const { return cpi_->common.base_qindex; } diff --git a/vp9/ratectrl_rtc.h b/vp9/ratectrl_rtc.h index d3876de875..a8dd5c42ff 100644 --- a/vp9/ratectrl_rtc.h +++ b/vp9/ratectrl_rtc.h @@ -38,6 +38,7 @@ struct VP9RateControlRtcConfig : public VpxRateControlRtcConfig { scaling_factor_den[0] = 1; max_quantizers[0] = max_quantizer; min_quantizers[0] = min_quantizer; + max_consec_drop = INT_MAX; } // Number of spatial layers @@ -46,6 +47,8 @@ struct VP9RateControlRtcConfig : public VpxRateControlRtcConfig { int min_quantizers[VPX_MAX_LAYERS]; int scaling_factor_num[VPX_SS_MAX_LAYERS]; int scaling_factor_den[VPX_SS_MAX_LAYERS]; + // This is only for SVC for now. + int max_consec_drop; }; struct VP9FrameParamsQpRTC { @@ -61,6 +64,11 @@ struct VP9SegmentationData { size_t delta_q_size; }; +enum class FrameDropDecision { + kOk, // Frame is encoded. + kDrop, // Frame is dropped. +}; + // This interface allows using VP9 real-time rate control without initializing // the encoder. To use this interface, you need to link with libvpxrc.a. // @@ -92,7 +100,11 @@ class VP9RateControlRTC { int GetQP() const; int GetLoopfilterLevel() const; bool GetSegmentationData(VP9SegmentationData *segmentation_data) const; - void ComputeQP(const VP9FrameParamsQpRTC &frame_params); + // ComputeQP returns the QP is the frame is not dropped (kOk return), + // otherwise it returns kDrop and subsequent GetQP and PostEncodeUpdate + // are not to be called (vp9_rc_postencode_update_drop_frame is already + // called via ComputeQP if drop is decided). + FrameDropDecision ComputeQP(const VP9FrameParamsQpRTC &frame_params); // Feedback to rate control with the size of current encoded frame void PostEncodeUpdate(uint64_t encoded_frame_size, const VP9FrameParamsQpRTC &frame_params); diff --git a/vpx/internal/vpx_ratectrl_rtc.h b/vpx/internal/vpx_ratectrl_rtc.h index 33c57e219b..eb90cd1d0c 100644 --- a/vpx/internal/vpx_ratectrl_rtc.h +++ b/vpx/internal/vpx_ratectrl_rtc.h @@ -37,6 +37,7 @@ struct VpxRateControlRtcConfig { aq_mode = 0; layer_target_bitrate[0] = static_cast(target_bandwidth); ts_rate_decimator[0] = 1; + frame_drop_thresh = 0; } int width; @@ -60,6 +61,7 @@ struct VpxRateControlRtcConfig { // vbr, cbr enum vpx_rc_mode rc_mode; int aq_mode; + int frame_drop_thresh; }; } // namespace libvpx #endif // VPX_VPX_INTERNAL_VPX_RATECTRL_RTC_H_ From 1464d7738a49b2b6e2c8e8e9f03b565f4f6c0860 Mon Sep 17 00:00:00 2001 From: Anupam Pandey Date: Thu, 5 Oct 2023 11:19:19 +0530 Subject: [PATCH 0932/1000] Modify C vs SIMD test script - Enable C vs SIMD test for x86 32-bit platform - Correct a print message in run_tests() BUG=webm:1800 Change-Id: Ib1ccd3a87a64b5ec6cde524a14d5d1b7e200abfb --- test/tools_common.sh | 7 ++++++- test/vp9_c_vs_simd_encode.sh | 16 +++++++--------- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/test/tools_common.sh b/test/tools_common.sh index 0e4a0a5c0e..d0dd24df36 100755 --- a/test/tools_common.sh +++ b/test/tools_common.sh @@ -280,7 +280,12 @@ run_tests() { test_end "${test}" done - local tested_config="$(test_configuration_target) @ $(current_hash)" + # C vs SIMD tests are run for x86 32-bit, 64-bit and ARM platform + if [ "${test_name}" = "vp9_c_vs_simd_encode" ]; then + local tested_config="$(current_hash)" + else + local tested_config="$(test_configuration_target) @ $(current_hash)" + fi echo "${test_name}: Done, all tests pass for ${tested_config}." } diff --git a/test/vp9_c_vs_simd_encode.sh b/test/vp9_c_vs_simd_encode.sh index 42e5f3b589..7cd60543cc 100755 --- a/test/vp9_c_vs_simd_encode.sh +++ b/test/vp9_c_vs_simd_encode.sh @@ -378,16 +378,14 @@ vp9_c_vs_simd_enc_test() { # Test Generic vp9_test_generic - # TODO(webm:1816): Enable x86 test once issue 1816 is fixed. - # Details: https://bugs.chromium.org/p/webm/issues/detail?id=1816 # Test x86 (32 bit) - # echo "vp9 test for x86 (32 bit): Started." - # if ! vp9_test_x86 "x86"; then - # echo "vp9 test for x86 (32 bit): Done, test failed." - # return 1 - # else - # echo "vp9 test for x86 (32 bit): Done, all tests passed." - # fi + echo "vp9 test for x86 (32 bit): Started." + if ! vp9_test_x86 "x86"; then + echo "vp9 test for x86 (32 bit): Done, test failed." + return 1 + else + echo "vp9 test for x86 (32 bit): Done, all tests passed." + fi # Test x86_64 (64 bit) if [ "$(eval uname -m)" = "x86_64" ]; then From 879c9bd9066527770d2999831501d9aeda0b79ac Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Fri, 3 Nov 2023 15:22:04 -0400 Subject: [PATCH 0933/1000] Check fragments count before use Bug: webm:1827 Change-Id: I8d603d5db92476222cbee1c61fece957ad03a49f --- vp8/vp8_dx_iface.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vp8/vp8_dx_iface.c b/vp8/vp8_dx_iface.c index 9e622e3b97..d4e06a7bce 100644 --- a/vp8/vp8_dx_iface.c +++ b/vp8/vp8_dx_iface.c @@ -249,14 +249,14 @@ static int update_fragments(vpx_codec_alg_priv_t *ctx, const uint8_t *data, /* Store a pointer to this fragment and return. We haven't * received the complete frame yet, so we will wait with decoding. */ - ctx->fragments.ptrs[ctx->fragments.count] = data; - ctx->fragments.sizes[ctx->fragments.count] = data_sz; - ctx->fragments.count++; - if (ctx->fragments.count > (1 << EIGHT_PARTITION) + 1) { + if (ctx->fragments.count >= MAX_PARTITIONS) { ctx->fragments.count = 0; *res = VPX_CODEC_INVALID_PARAM; return -1; } + ctx->fragments.ptrs[ctx->fragments.count] = data; + ctx->fragments.sizes[ctx->fragments.count] = data_sz; + ctx->fragments.count++; return 0; } From 5b8d24f678560edb545beeffee7668761ad5fa7e Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 6 Nov 2023 10:21:56 -0800 Subject: [PATCH 0934/1000] configure: detect PIE and enable PIC Fixes the creation of DT_TEXTREL entries in binaries built with PIE enabled: /usr/bin/ld: warning: creating DT_TEXTREL in a PIE This matches the changes made in libaom: 1df26009da aom_configure: only override CONFIG_PIC if not set on cmd line 7235e65746 aom_configure.cmake: detect PIE and set CONFIG_PIC Change-Id: I0a43e964af2d8eb8c5e7811ce14ad39285eec3a8 --- build/make/configure.sh | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/build/make/configure.sh b/build/make/configure.sh index 9d3cd80cb3..54fb1daf4d 100644 --- a/build/make/configure.sh +++ b/build/make/configure.sh @@ -863,8 +863,14 @@ process_common_toolchain() { ;; esac - # PIC is probably what we want when building shared libs + # Position independent code (PIC) is probably what we want when building + # shared libs or position independent executable (PIE) targets. enabled shared && soft_enable pic + check_cpp << EOF || soft_enable pic +#if !(__pie__ || __PIE__) +#error Neither __pie__ or __PIE__ are set +#endif +EOF # Minimum iOS version for all target platforms (darwin and iphonesimulator). # Shared library framework builds are only possible on iOS 8 and later. From c732fa70705a1563d1e92462d0157501efc85718 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Tue, 7 Nov 2023 15:47:43 -0800 Subject: [PATCH 0935/1000] Use symbolic constant VPX_CBR instead of 1 Change-Id: Idae94cfc6d7a882691deeb4fa3ce0015f80ed937 --- vpxenc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vpxenc.c b/vpxenc.c index 38d69a1923..d20bd3f967 100644 --- a/vpxenc.c +++ b/vpxenc.c @@ -883,7 +883,7 @@ static struct stream_state *new_stream(struct VpxEncoderConfig *global, /* Default lag_in_frames is 0 in realtime mode CBR mode*/ if (global->deadline == VPX_DL_REALTIME && - stream->config.cfg.rc_end_usage == 1) + stream->config.cfg.rc_end_usage == VPX_CBR) stream->config.cfg.g_lag_in_frames = 0; } From 7ab673a9f62a26035155bc4e26fc375fe483bb95 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Wed, 8 Nov 2023 15:16:13 -0800 Subject: [PATCH 0936/1000] Fix float-cast-overflow in vp8_change_config() Bug: b:309716574 Change-Id: I9c523d5e9211f895c7497a9e3674b55f6be6c742 --- test/encode_api_test.cc | 56 +++++++++++++++++++++++++++++++++++++++++ vp8/encoder/onyx_if.c | 8 +++--- 2 files changed, 60 insertions(+), 4 deletions(-) diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc index 270be3679e..012e54a33d 100644 --- a/test/encode_api_test.cc +++ b/test/encode_api_test.cc @@ -122,6 +122,62 @@ TEST(EncodeAPI, ImageSizeSetting) { vpx_codec_destroy(&enc); } + +// Verifies the fix for a float-cast-overflow in vp8_change_config(). +// +// Causes cpi->framerate to become the largest possible value (10,000,000) in +// VP8 by setting cfg.g_timebase to 1/10000000 and passing a duration of 1 to +// vpx_codec_encode(). +TEST(EncodeAPI, HugeFramerateVp8) { + vpx_codec_iface_t *const iface = vpx_codec_vp8_cx(); + vpx_codec_enc_cfg_t cfg; + ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK); + cfg.g_w = 271; + cfg.g_h = 1080; + cfg.g_timebase.num = 1; + // Largest value (VP8's TICKS_PER_SEC) such that frame duration is nonzero (1 + // tick). + cfg.g_timebase.den = 10000000; + cfg.g_pass = VPX_RC_ONE_PASS; + cfg.g_lag_in_frames = 0; + cfg.rc_end_usage = VPX_CBR; + + vpx_codec_ctx_t enc; + // Before we encode the first frame, cpi->framerate is set to a guess (the + // reciprocal of cfg.g_timebase). If this guess doesn't seem reasonable + // (> 180), cpi->framerate is set to 30. + ASSERT_EQ(vpx_codec_enc_init(&enc, iface, &cfg, 0), VPX_CODEC_OK); + + ASSERT_EQ(vpx_codec_control(&enc, VP8E_SET_CPUUSED, -12), VPX_CODEC_OK); + + vpx_image_t *const image = + vpx_img_alloc(nullptr, VPX_IMG_FMT_I420, cfg.g_w, cfg.g_h, 1); + ASSERT_NE(image, nullptr); + + for (unsigned int i = 0; i < image->d_h; ++i) { + memset(image->planes[0] + i * image->stride[0], 128, image->d_w); + } + const unsigned int uv_h = (image->d_h + 1) / 2; + const unsigned int uv_w = (image->d_w + 1) / 2; + for (unsigned int i = 0; i < uv_h; ++i) { + memset(image->planes[1] + i * image->stride[1], 128, uv_w); + memset(image->planes[2] + i * image->stride[2], 128, uv_w); + } + + // Encode a frame. + const unsigned long deadline = VPX_DL_REALTIME; + // Up to this point cpi->framerate is 30. Now pass a duration of only 1. This + // causes cpi->framerate to become 10,000,000. + ASSERT_EQ(vpx_codec_encode(&enc, image, 0, 1, 0, deadline), VPX_CODEC_OK); + + // Change to the same config. Since cpi->framerate is now huge, when it is + // used to calculate raw_target_rate (bit rate of uncompressed frames), the + // result is likely to overflow an unsigned int. + ASSERT_EQ(vpx_codec_enc_config_set(&enc, &cfg), VPX_CODEC_OK); + + vpx_img_free(image); + ASSERT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK); +} #endif // Set up 2 spatial streams with 2 temporal layers per stream, and generate diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index 12a99584f9..79140864e7 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -1416,7 +1416,7 @@ void vp8_change_config(VP8_COMP *cpi, const VP8_CONFIG *oxcf) { VP8_COMMON *cm = &cpi->common; int last_w, last_h; unsigned int prev_number_of_layers; - unsigned int raw_target_rate; + double raw_target_rate; if (!cpi) return; @@ -1557,10 +1557,10 @@ void vp8_change_config(VP8_COMP *cpi, const VP8_CONFIG *oxcf) { cpi->oxcf.maximum_buffer_size_in_ms = 240000; } - raw_target_rate = (unsigned int)((int64_t)cpi->oxcf.Width * cpi->oxcf.Height * - 8 * 3 * cpi->framerate / 1000); + raw_target_rate = ((int64_t)cpi->oxcf.Width * cpi->oxcf.Height * 8 * 3 * + cpi->framerate / 1000.0); if (cpi->oxcf.target_bandwidth > raw_target_rate) - cpi->oxcf.target_bandwidth = raw_target_rate; + cpi->oxcf.target_bandwidth = (unsigned int)raw_target_rate; /* Convert target bandwidth from Kbit/s to Bit/s */ cpi->oxcf.target_bandwidth *= 1000; From 4e05c38c85fd3f72e167bbf8bb82816bc45393a6 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Thu, 9 Nov 2023 10:19:47 -0800 Subject: [PATCH 0937/1000] Document the units of VP8 target_bandwidth/bitrate Change-Id: I6298a0acb4ef546ae198bb1f16dea50ed34b2dae --- vp8/common/onyx.h | 10 +++++++++- vp8/encoder/onyx_int.h | 4 ++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/vp8/common/onyx.h b/vp8/common/onyx.h index 7f7f567c6a..96cd2fe59e 100644 --- a/vp8/common/onyx.h +++ b/vp8/common/onyx.h @@ -83,7 +83,14 @@ typedef struct { int Width; int Height; struct vpx_rational timebase; - unsigned int target_bandwidth; /* kilobits per second */ + /* In either kilobits per second or bits per second, depending on which + * copy of oxcf this is in. + * - ctx->oxcf.target_bandwidth is in kilobits per second. See + * set_vp8e_config(). + * - ctx->cpi->oxcf.target_bandwidth in is bits per second. See + * vp8_change_config(). + */ + unsigned int target_bandwidth; /* Parameter used for applying denoiser. * For temporal denoiser: noise_sensitivity = 0 means off, @@ -214,6 +221,7 @@ typedef struct { /* Temporal scaling parameters */ unsigned int number_of_layers; + /* kilobits per second */ unsigned int target_bitrate[VPX_TS_MAX_PERIODICITY]; unsigned int rate_decimator[VPX_TS_MAX_PERIODICITY]; unsigned int periodicity; diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h index 2f06702a1d..cdf94f4f23 100644 --- a/vp8/encoder/onyx_int.h +++ b/vp8/encoder/onyx_int.h @@ -215,7 +215,7 @@ enum { typedef struct { /* Layer configuration */ double framerate; - int target_bandwidth; + int target_bandwidth; /* bits per second */ /* Layer specific coding parameters */ int64_t starting_buffer_level; @@ -438,7 +438,7 @@ typedef struct VP8_COMP { int kf_boost; int last_boost; - int target_bandwidth; + int target_bandwidth; /* bits per second */ struct vpx_codec_pkt_list *output_pkt_list; #if 0 From 296784c83afd2eacce61f6dc94a003b383c90d01 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Thu, 9 Nov 2023 12:42:58 -0800 Subject: [PATCH 0938/1000] Declare oxcf arg of vp8_create_compressor as const Declare the oxcf parameters of vp8_create_compressor() and init_config() as const. This helps code analysis. Change-Id: I344ef3e6afc3adced2b2865b7e0057c6d4b1d3c0 --- vp8/common/onyx.h | 2 +- vp8/encoder/onyx_if.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/vp8/common/onyx.h b/vp8/common/onyx.h index 7f7f567c6a..c363c1c81b 100644 --- a/vp8/common/onyx.h +++ b/vp8/common/onyx.h @@ -236,7 +236,7 @@ typedef struct { void vp8_initialize(); -struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf); +struct VP8_COMP *vp8_create_compressor(const VP8_CONFIG *oxcf); void vp8_remove_compressor(struct VP8_COMP **comp); void vp8_init_config(struct VP8_COMP *onyx, VP8_CONFIG *oxcf); diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index 12a99584f9..ed243ef951 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -1290,7 +1290,7 @@ void vp8_new_framerate(VP8_COMP *cpi, double framerate) { } } -static void init_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) { +static void init_config(VP8_COMP *cpi, const VP8_CONFIG *oxcf) { VP8_COMMON *cm = &cpi->common; cpi->oxcf = *oxcf; @@ -1739,7 +1739,7 @@ static void cal_mvsadcosts(int *mvsadcost[2]) { } while (++i <= mvfp_max); } -struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) { +struct VP8_COMP *vp8_create_compressor(const VP8_CONFIG *oxcf) { int i; VP8_COMP *cpi; From f05122d35cf9b11d309c412157e0f250426f6de4 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Thu, 9 Nov 2023 18:35:51 -0800 Subject: [PATCH 0939/1000] Fix ClangTidy warnings Most are related to include-what-you-use. One is to avoid using the unsigned long type explicitly (by passing VPX_DL_REALTIME directly to vpx_codec_encode). Change-Id: Ieaf3418382ad8516cb4b172f7678893286fcb8cf --- test/encode_api_test.cc | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc index 012e54a33d..770052c859 100644 --- a/test/encode_api_test.cc +++ b/test/encode_api_test.cc @@ -21,6 +21,9 @@ #include "./vpx_config.h" #include "vpx/vp8cx.h" +#include "vpx/vpx_codec.h" +#include "vpx/vpx_encoder.h" +#include "vpx/vpx_image.h" #include "vpx/vpx_tpl.h" namespace { @@ -165,10 +168,10 @@ TEST(EncodeAPI, HugeFramerateVp8) { } // Encode a frame. - const unsigned long deadline = VPX_DL_REALTIME; // Up to this point cpi->framerate is 30. Now pass a duration of only 1. This // causes cpi->framerate to become 10,000,000. - ASSERT_EQ(vpx_codec_encode(&enc, image, 0, 1, 0, deadline), VPX_CODEC_OK); + ASSERT_EQ(vpx_codec_encode(&enc, image, 0, 1, 0, VPX_DL_REALTIME), + VPX_CODEC_OK); // Change to the same config. Since cpi->framerate is now huge, when it is // used to calculate raw_target_rate (bit rate of uncompressed frames), the From d15a1970c153fe85761ccb88441832ba856aec1e Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Wed, 8 Nov 2023 21:05:15 -0800 Subject: [PATCH 0940/1000] Delete -Wdeclaration-after-statement Older versions of MSVC do not allow declarations after statements in C files. We don't need to support those versions of MSVC now. Use -std=gnu99 instead of -std=gnu89. Change-Id: I76ba962f5a2bca30d6a5b2b05c5786507398ad32 --- CHANGELOG | 5 +++++ configure | 6 ++---- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index f932f6bf4d..5a8605a73d 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,3 +1,8 @@ +20yy-mm-dd v1.14.0 "V Duck" + This release drops support for old C compilers, such as Visual Studio 2012 + and older, that disallow mixing variable declarations and statements (a C99 + feature). + 2023-09-29 v1.13.1 "Ugly Duckling" This release contains two security related fixes. One each for VP8 and VP9. diff --git a/configure b/configure index 434c43792c..6b910160a8 100755 --- a/configure +++ b/configure @@ -643,7 +643,6 @@ process_toolchain() { if enabled gcc; then enabled werror && check_add_cflags -Werror check_add_cflags -Wall - check_add_cflags -Wdeclaration-after-statement check_add_cflags -Wdisabled-optimization check_add_cflags -Wextra-semi check_add_cflags -Wextra-semi-stmt @@ -670,9 +669,8 @@ process_toolchain() { if enabled mips || [ -z "${INLINE}" ]; then enabled extra_warnings || check_add_cflags -Wno-unused-function fi - # Enforce c89 for c files. Don't be too strict about it though. Allow - # gnu extensions like "//" for comments. - check_cflags -std=gnu89 && add_cflags_only -std=gnu89 + # Enforce C99 for C files. Allow GNU extensions. + check_cflags -std=gnu99 && add_cflags_only -std=gnu99 # Avoid this warning for third_party C++ sources. Some reorganization # would be needed to apply this only to test/*.cc. check_cflags -Wshorten-64-to-32 && add_cflags_only -Wshorten-64-to-32 From e4127f591de35c2e5dd62704c345716a3e7e3706 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Fri, 10 Nov 2023 13:14:18 -0800 Subject: [PATCH 0941/1000] Document how VP9 treats a negative speed value Change-Id: I12948b08a7bb5beb5024b8676de9dafc239f8e89 --- vpx/vp8cx.h | 1 + 1 file changed, 1 insertion(+) diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h index 3c0278c848..d098c4c985 100644 --- a/vpx/vp8cx.h +++ b/vpx/vp8cx.h @@ -166,6 +166,7 @@ enum vp8e_enc_control_id { * * \note Valid range for VP8: -16..16 * \note Valid range for VP9: -9..9 + * \note A negative value (-n) is treated as its absolute value (n) in VP9. * * Supported in codecs: VP8, VP9 */ From 81aaa7f04b7644ac80960d17442199607195b24c Mon Sep 17 00:00:00 2001 From: Marco Paniconi Date: Sun, 12 Nov 2023 00:39:14 -0800 Subject: [PATCH 0942/1000] rtc: Add frame dropper to VP8 external RC Move some internal drop_frame code to separate function so the external RC can use. And add new flag setting under VP8E_SET_RTC_EXTERNAL_RATECTRL to disable vp8_drop_encodedframe_overshoot() for testing the external RC. Unittest added for single layer and 3 temporal layers. Bug: b/280363228 Change-Id: Ibea2f627cc54e7156ff35259a64dd111d42d146c --- test/vp8_ratectrl_rtc_test.cc | 75 ++++++++++-- vp8/encoder/onyx_if.c | 215 ++++++++++++++++++---------------- vp8/encoder/onyx_int.h | 6 + vp8/vp8_cx_iface.c | 3 +- vp8/vp8_ratectrl_rtc.cc | 21 +++- vp8/vp8_ratectrl_rtc.h | 10 +- 6 files changed, 212 insertions(+), 118 deletions(-) diff --git a/test/vp8_ratectrl_rtc_test.cc b/test/vp8_ratectrl_rtc_test.cc index 81f06d90ad..9fbc1d4d98 100644 --- a/test/vp8_ratectrl_rtc_test.cc +++ b/test/vp8_ratectrl_rtc_test.cc @@ -52,7 +52,8 @@ class Vp8RcInterfaceTest public ::libvpx_test::CodecTestWith2Params { public: Vp8RcInterfaceTest() - : EncoderTest(GET_PARAM(0)), key_interval_(3000), encoder_exit_(false) {} + : EncoderTest(GET_PARAM(0)), key_interval_(3000), encoder_exit_(false), + frame_drop_thresh_(0) {} ~Vp8RcInterfaceTest() override = default; protected: @@ -145,8 +146,11 @@ class Vp8RcInterfaceTest } int qp; encoder->Control(VP8E_GET_LAST_QUANTIZER, &qp); - rc_api_->ComputeQP(frame_params_); - ASSERT_EQ(rc_api_->GetQP(), qp); + if (rc_api_->ComputeQP(frame_params_) == libvpx::FrameDropDecision::kOk) { + ASSERT_EQ(rc_api_->GetQP(), qp); + } else { + num_drops_++; + } } void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { @@ -156,8 +160,6 @@ class Vp8RcInterfaceTest void RunOneLayer() { test_video_ = GET_PARAM(2); target_bitrate_ = GET_PARAM(1); - if (test_video_.width == 1280 && target_bitrate_ == 200) return; - if (test_video_.width == 640 && target_bitrate_ == 1000) return; SetConfig(); rc_api_ = libvpx::VP8RateControlRTC::Create(rc_cfg_); ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_)); @@ -169,12 +171,33 @@ class Vp8RcInterfaceTest ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); } + void RunOneLayerDropFrames() { + test_video_ = GET_PARAM(2); + target_bitrate_ = GET_PARAM(1); + frame_drop_thresh_ = 30; + num_drops_ = 0; + // Use lower target_bitrate and max_quantizer to trigger drops. + target_bitrate_ = target_bitrate_ >> 2; + SetConfig(); + rc_cfg_.max_quantizer = 56; + cfg_.rc_max_quantizer = 56; + rc_api_ = libvpx::VP8RateControlRTC::Create(rc_cfg_); + ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_)); + + ::libvpx_test::I420VideoSource video(test_video_.name, test_video_.width, + test_video_.height, 30, 1, 0, + test_video_.frames); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + // Check that some frames were dropped, otherwise test has no value. + ASSERT_GE(num_drops_, 1); + } + void RunPeriodicKey() { test_video_ = GET_PARAM(2); target_bitrate_ = GET_PARAM(1); - if (test_video_.width == 1280 && target_bitrate_ == 200) return; - if (test_video_.width == 640 && target_bitrate_ == 1000) return; key_interval_ = 100; + frame_drop_thresh_ = 30; SetConfig(); rc_api_ = libvpx::VP8RateControlRTC::Create(rc_cfg_); ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_)); @@ -189,8 +212,6 @@ class Vp8RcInterfaceTest void RunTemporalLayers2TL() { test_video_ = GET_PARAM(2); target_bitrate_ = GET_PARAM(1); - if (test_video_.width == 1280 && target_bitrate_ == 200) return; - if (test_video_.width == 640 && target_bitrate_ == 1000) return; SetConfigTemporalLayers(2); rc_api_ = libvpx::VP8RateControlRTC::Create(rc_cfg_); ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_)); @@ -205,8 +226,6 @@ class Vp8RcInterfaceTest void RunTemporalLayers3TL() { test_video_ = GET_PARAM(2); target_bitrate_ = GET_PARAM(1); - if (test_video_.width == 1280 && target_bitrate_ == 200) return; - if (test_video_.width == 640 && target_bitrate_ == 1000) return; SetConfigTemporalLayers(3); rc_api_ = libvpx::VP8RateControlRTC::Create(rc_cfg_); ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_)); @@ -218,6 +237,28 @@ class Vp8RcInterfaceTest ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); } + void RunTemporalLayers3TLDropFrames() { + test_video_ = GET_PARAM(2); + target_bitrate_ = GET_PARAM(1); + frame_drop_thresh_ = 30; + num_drops_ = 0; + // Use lower target_bitrate and max_quantizer to trigger drops. + target_bitrate_ = target_bitrate_ >> 2; + SetConfigTemporalLayers(3); + rc_cfg_.max_quantizer = 56; + cfg_.rc_max_quantizer = 56; + rc_api_ = libvpx::VP8RateControlRTC::Create(rc_cfg_); + ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_)); + + ::libvpx_test::I420VideoSource video(test_video_.name, test_video_.width, + test_video_.height, 30, 1, 0, + test_video_.frames); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + // Check that some frames were dropped, otherwise test has no value. + ASSERT_GE(num_drops_, 1); + } + private: void SetConfig() { rc_cfg_.width = test_video_.width; @@ -233,6 +274,7 @@ class Vp8RcInterfaceTest rc_cfg_.max_intra_bitrate_pct = 1000; rc_cfg_.framerate = 30.0; rc_cfg_.layer_target_bitrate[0] = target_bitrate_; + rc_cfg_.frame_drop_thresh = frame_drop_thresh_; // Encoder settings for ground truth. cfg_.g_w = test_video_.width; @@ -251,6 +293,7 @@ class Vp8RcInterfaceTest cfg_.rc_target_bitrate = target_bitrate_; cfg_.kf_min_dist = key_interval_; cfg_.kf_max_dist = key_interval_; + cfg_.rc_dropframe_thresh = frame_drop_thresh_; } void SetConfigTemporalLayers(int temporal_layers) { @@ -266,6 +309,7 @@ class Vp8RcInterfaceTest rc_cfg_.overshoot_pct = 50; rc_cfg_.max_intra_bitrate_pct = 1000; rc_cfg_.framerate = 30.0; + rc_cfg_.frame_drop_thresh = frame_drop_thresh_; if (temporal_layers == 2) { rc_cfg_.layer_target_bitrate[0] = 60 * target_bitrate_ / 100; rc_cfg_.layer_target_bitrate[1] = target_bitrate_; @@ -299,6 +343,7 @@ class Vp8RcInterfaceTest cfg_.rc_target_bitrate = target_bitrate_; cfg_.kf_min_dist = key_interval_; cfg_.kf_max_dist = key_interval_; + cfg_.rc_dropframe_thresh = frame_drop_thresh_; // 2 Temporal layers, no spatial layers, CBR mode. cfg_.ss_number_layers = 1; cfg_.ts_number_layers = temporal_layers; @@ -326,16 +371,24 @@ class Vp8RcInterfaceTest Vp8RCTestVideo test_video_; libvpx::VP8FrameParamsQpRTC frame_params_; bool encoder_exit_; + int frame_drop_thresh_; + int num_drops_; }; TEST_P(Vp8RcInterfaceTest, OneLayer) { RunOneLayer(); } +TEST_P(Vp8RcInterfaceTest, OneLayerDropFrames) { RunOneLayerDropFrames(); } + TEST_P(Vp8RcInterfaceTest, OneLayerPeriodicKey) { RunPeriodicKey(); } TEST_P(Vp8RcInterfaceTest, TemporalLayers2TL) { RunTemporalLayers2TL(); } TEST_P(Vp8RcInterfaceTest, TemporalLayers3TL) { RunTemporalLayers3TL(); } +TEST_P(Vp8RcInterfaceTest, TemporalLayers3TLDropFrames) { + RunTemporalLayers3TLDropFrames(); +} + VP8_INSTANTIATE_TEST_SUITE(Vp8RcInterfaceTest, ::testing::Values(200, 400, 1000), ::testing::ValuesIn(kVp8RCTestVectors)); diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index 15cef5ed33..4e128e3c49 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -1899,6 +1899,7 @@ struct VP8_COMP *vp8_create_compressor(const VP8_CONFIG *oxcf) { cpi->force_maxqp = 0; cpi->frames_since_last_drop_overshoot = 0; cpi->rt_always_update_correction_factor = 0; + cpi->rt_drop_recode_on_overshoot = 1; cpi->b_calculate_psnr = CONFIG_INTERNAL_STATS; #if CONFIG_INTERNAL_STATS @@ -3183,6 +3184,113 @@ void vp8_loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm) { vp8_yv12_extend_frame_borders(cm->frame_to_show); } +// Return 1 if frame is to be dropped. Update frame drop decimation +// counters. +int vp8_check_drop_buffer(VP8_COMP *cpi) { + VP8_COMMON *cm = &cpi->common; + int drop_mark = (int)(cpi->oxcf.drop_frames_water_mark * + cpi->oxcf.optimal_buffer_level / 100); + int drop_mark75 = drop_mark * 2 / 3; + int drop_mark50 = drop_mark / 4; + int drop_mark25 = drop_mark / 8; + if (cpi->drop_frames_allowed) { + /* The reset to decimation 0 is only done here for one pass. + * Once it is set two pass leaves decimation on till the next kf. + */ + if (cpi->buffer_level > drop_mark && cpi->decimation_factor > 0) { + cpi->decimation_factor--; + } + + if (cpi->buffer_level > drop_mark75 && cpi->decimation_factor > 0) { + cpi->decimation_factor = 1; + + } else if (cpi->buffer_level < drop_mark25 && + (cpi->decimation_factor == 2 || cpi->decimation_factor == 3)) { + cpi->decimation_factor = 3; + } else if (cpi->buffer_level < drop_mark50 && + (cpi->decimation_factor == 1 || cpi->decimation_factor == 2)) { + cpi->decimation_factor = 2; + } else if (cpi->buffer_level < drop_mark75 && + (cpi->decimation_factor == 0 || cpi->decimation_factor == 1)) { + cpi->decimation_factor = 1; + } + } + + /* The following decimates the frame rate according to a regular + * pattern (i.e. to 1/2 or 2/3 frame rate) This can be used to help + * prevent buffer under-run in CBR mode. Alternatively it might be + * desirable in some situations to drop frame rate but throw more bits + * at each frame. + * + * Note that dropping a key frame can be problematic if spatial + * resampling is also active + */ + if (cpi->decimation_factor > 0 && cpi->drop_frames_allowed) { + switch (cpi->decimation_factor) { + case 1: + cpi->per_frame_bandwidth = cpi->per_frame_bandwidth * 3 / 2; + break; + case 2: + cpi->per_frame_bandwidth = cpi->per_frame_bandwidth * 5 / 4; + break; + case 3: + cpi->per_frame_bandwidth = cpi->per_frame_bandwidth * 5 / 4; + break; + } + + /* Note that we should not throw out a key frame (especially when + * spatial resampling is enabled). + */ + if (cm->frame_type == KEY_FRAME) { + cpi->decimation_count = cpi->decimation_factor; + } else if (cpi->decimation_count > 0) { + cpi->decimation_count--; + + cpi->bits_off_target += cpi->av_per_frame_bandwidth; + if (cpi->bits_off_target > cpi->oxcf.maximum_buffer_size) { + cpi->bits_off_target = cpi->oxcf.maximum_buffer_size; + } + +#if CONFIG_MULTI_RES_ENCODING + vp8_store_drop_frame_info(cpi); +#endif + + cm->current_video_frame++; + cpi->frames_since_key++; + cpi->ext_refresh_frame_flags_pending = 0; + // We advance the temporal pattern for dropped frames. + cpi->temporal_pattern_counter++; + +#if CONFIG_INTERNAL_STATS + cpi->count++; +#endif + + cpi->buffer_level = cpi->bits_off_target; + + if (cpi->oxcf.number_of_layers > 1) { + unsigned int i; + + /* Propagate bits saved by dropping the frame to higher + * layers + */ + for (i = cpi->current_layer + 1; i < cpi->oxcf.number_of_layers; ++i) { + LAYER_CONTEXT *lc = &cpi->layer_context[i]; + lc->bits_off_target += (int)(lc->target_bandwidth / lc->framerate); + if (lc->bits_off_target > lc->maximum_buffer_size) { + lc->bits_off_target = lc->maximum_buffer_size; + } + lc->buffer_level = lc->bits_off_target; + } + } + return 1; + } else { + cpi->decimation_count = cpi->decimation_factor; + } + } else { + cpi->decimation_count = 0; + } + return 0; +} static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, unsigned char *dest, @@ -3208,12 +3316,6 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, int undershoot_seen = 0; #endif - int drop_mark = (int)(cpi->oxcf.drop_frames_water_mark * - cpi->oxcf.optimal_buffer_level / 100); - int drop_mark75 = drop_mark * 2 / 3; - int drop_mark50 = drop_mark / 4; - int drop_mark25 = drop_mark / 8; - /* Clear down mmx registers to allow floating point in what follows */ vpx_clear_system_state(); @@ -3427,102 +3529,8 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, update_rd_ref_frame_probs(cpi); - if (cpi->drop_frames_allowed) { - /* The reset to decimation 0 is only done here for one pass. - * Once it is set two pass leaves decimation on till the next kf. - */ - if ((cpi->buffer_level > drop_mark) && (cpi->decimation_factor > 0)) { - cpi->decimation_factor--; - } - - if (cpi->buffer_level > drop_mark75 && cpi->decimation_factor > 0) { - cpi->decimation_factor = 1; - - } else if (cpi->buffer_level < drop_mark25 && - (cpi->decimation_factor == 2 || cpi->decimation_factor == 3)) { - cpi->decimation_factor = 3; - } else if (cpi->buffer_level < drop_mark50 && - (cpi->decimation_factor == 1 || cpi->decimation_factor == 2)) { - cpi->decimation_factor = 2; - } else if (cpi->buffer_level < drop_mark75 && - (cpi->decimation_factor == 0 || cpi->decimation_factor == 1)) { - cpi->decimation_factor = 1; - } - } - - /* The following decimates the frame rate according to a regular - * pattern (i.e. to 1/2 or 2/3 frame rate) This can be used to help - * prevent buffer under-run in CBR mode. Alternatively it might be - * desirable in some situations to drop frame rate but throw more bits - * at each frame. - * - * Note that dropping a key frame can be problematic if spatial - * resampling is also active - */ - if (cpi->decimation_factor > 0 && cpi->drop_frames_allowed) { - switch (cpi->decimation_factor) { - case 1: - cpi->per_frame_bandwidth = cpi->per_frame_bandwidth * 3 / 2; - break; - case 2: - cpi->per_frame_bandwidth = cpi->per_frame_bandwidth * 5 / 4; - break; - case 3: - cpi->per_frame_bandwidth = cpi->per_frame_bandwidth * 5 / 4; - break; - } - - /* Note that we should not throw out a key frame (especially when - * spatial resampling is enabled). - */ - if (cm->frame_type == KEY_FRAME) { - cpi->decimation_count = cpi->decimation_factor; - } else if (cpi->decimation_count > 0) { - cpi->decimation_count--; - - cpi->bits_off_target += cpi->av_per_frame_bandwidth; - if (cpi->bits_off_target > cpi->oxcf.maximum_buffer_size) { - cpi->bits_off_target = cpi->oxcf.maximum_buffer_size; - } - -#if CONFIG_MULTI_RES_ENCODING - vp8_store_drop_frame_info(cpi); -#endif - - cm->current_video_frame++; - cpi->frames_since_key++; - cpi->ext_refresh_frame_flags_pending = 0; - // We advance the temporal pattern for dropped frames. - cpi->temporal_pattern_counter++; - -#if CONFIG_INTERNAL_STATS - cpi->count++; -#endif - - cpi->buffer_level = cpi->bits_off_target; - - if (cpi->oxcf.number_of_layers > 1) { - unsigned int i; - - /* Propagate bits saved by dropping the frame to higher - * layers - */ - for (i = cpi->current_layer + 1; i < cpi->oxcf.number_of_layers; ++i) { - LAYER_CONTEXT *lc = &cpi->layer_context[i]; - lc->bits_off_target += (int)(lc->target_bandwidth / lc->framerate); - if (lc->bits_off_target > lc->maximum_buffer_size) { - lc->bits_off_target = lc->maximum_buffer_size; - } - lc->buffer_level = lc->bits_off_target; - } - } - - return; - } else { - cpi->decimation_count = cpi->decimation_factor; - } - } else { - cpi->decimation_count = 0; + if (vp8_check_drop_buffer(cpi)) { + return; } /* Decide how big to make the frame */ @@ -3930,7 +3938,8 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, /* transform / motion compensation build reconstruction frame */ vp8_encode_frame(cpi); - if (cpi->pass == 0 && cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) { + if (cpi->pass == 0 && cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER && + cpi->rt_drop_recode_on_overshoot == 1) { if (vp8_drop_encodedframe_overshoot(cpi, Q)) { vpx_clear_system_state(); return; diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h index cdf94f4f23..1451a27812 100644 --- a/vp8/encoder/onyx_int.h +++ b/vp8/encoder/onyx_int.h @@ -708,6 +708,10 @@ typedef struct VP8_COMP { // Always update correction factor used for rate control after each frame for // realtime encoding. int rt_always_update_correction_factor; + + // Flag to indicate frame may be dropped due to large expected overshoot, + // and re-encoded on next frame at max_qp. + int rt_drop_recode_on_overshoot; } VP8_COMP; void vp8_initialize_enc(void); @@ -732,6 +736,8 @@ void vp8_tokenize_mb(VP8_COMP *, MACROBLOCK *, TOKENEXTRA **); void vp8_set_speed_features(VP8_COMP *cpi); +int vp8_check_drop_buffer(VP8_COMP *cpi); + #ifdef __cplusplus } // extern "C" #endif diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c index 20c44ff4e1..a6f0b4cbcf 100644 --- a/vp8/vp8_cx_iface.c +++ b/vp8/vp8_cx_iface.c @@ -624,10 +624,11 @@ static vpx_codec_err_t set_screen_content_mode(vpx_codec_alg_priv_t *ctx, static vpx_codec_err_t ctrl_set_rtc_external_ratectrl(vpx_codec_alg_priv_t *ctx, va_list args) { VP8_COMP *cpi = ctx->cpi; - const unsigned int data = CAST(VP8E_SET_GF_CBR_BOOST_PCT, args); + const unsigned int data = CAST(VP8E_SET_RTC_EXTERNAL_RATECTRL, args); if (data) { cpi->cyclic_refresh_mode_enabled = 0; cpi->rt_always_update_correction_factor = 1; + cpi->rt_drop_recode_on_overshoot = 0; } return VPX_CODEC_OK; } diff --git a/vp8/vp8_ratectrl_rtc.cc b/vp8/vp8_ratectrl_rtc.cc index 60bc258a6f..dd3c8e623b 100644 --- a/vp8/vp8_ratectrl_rtc.cc +++ b/vp8/vp8_ratectrl_rtc.cc @@ -133,6 +133,8 @@ bool VP8RateControlRTC::UpdateRateControl( cpi_->buffered_mode = oxcf->optimal_buffer_level > 0; oxcf->under_shoot_pct = rc_cfg.undershoot_pct; oxcf->over_shoot_pct = rc_cfg.overshoot_pct; + oxcf->drop_frames_water_mark = rc_cfg.frame_drop_thresh; + if (oxcf->drop_frames_water_mark > 0) cpi_->drop_frames_allowed = 1; cpi_->oxcf.rc_max_intra_bitrate_pct = rc_cfg.max_intra_bitrate_pct; cpi_->framerate = rc_cfg.framerate; for (int i = 0; i < KEY_FRAME_CONTEXT; ++i) { @@ -208,7 +210,8 @@ bool VP8RateControlRTC::UpdateRateControl( return true; } -void VP8RateControlRTC::ComputeQP(const VP8FrameParamsQpRTC &frame_params) { +FrameDropDecision VP8RateControlRTC::ComputeQP( + const VP8FrameParamsQpRTC &frame_params) { VP8_COMMON *const cm = &cpi_->common; vpx_clear_system_state(); if (cpi_->oxcf.number_of_layers > 1) { @@ -226,7 +229,20 @@ void VP8RateControlRTC::ComputeQP(const VP8FrameParamsQpRTC &frame_params) { cpi_->common.frame_flags |= FRAMEFLAGS_KEY; } - vp8_pick_frame_size(cpi_); + cpi_->per_frame_bandwidth = static_cast( + round(cpi_->oxcf.target_bandwidth / cpi_->output_framerate)); + if (vp8_check_drop_buffer(cpi_)) { + if (cpi_->oxcf.number_of_layers > 1) vp8_save_layer_context(cpi_); + return FrameDropDecision::kDrop; + } + + if (!vp8_pick_frame_size(cpi_)) { + cm->current_video_frame++; + cpi_->frames_since_key++; + cpi_->ext_refresh_frame_flags_pending = 0; + if (cpi_->oxcf.number_of_layers > 1) vp8_save_layer_context(cpi_); + return FrameDropDecision::kDrop; + } if (cpi_->buffer_level >= cpi_->oxcf.optimal_buffer_level && cpi_->buffered_mode) { @@ -290,6 +306,7 @@ void VP8RateControlRTC::ComputeQP(const VP8FrameParamsQpRTC &frame_params) { q_ = vp8_regulate_q(cpi_, cpi_->this_frame_target); vp8_set_quantizer(cpi_, q_); vpx_clear_system_state(); + return FrameDropDecision::kOk; } int VP8RateControlRTC::GetQP() const { return q_; } diff --git a/vp8/vp8_ratectrl_rtc.h b/vp8/vp8_ratectrl_rtc.h index 496ef9eaad..5ffe54c47c 100644 --- a/vp8/vp8_ratectrl_rtc.h +++ b/vp8/vp8_ratectrl_rtc.h @@ -33,6 +33,11 @@ struct VP8FrameParamsQpRTC { int temporal_layer_id; }; +enum class FrameDropDecision { + kOk, // Frame is encoded. + kDrop, // Frame is dropped. +}; + class VP8RateControlRTC { public: static std::unique_ptr Create( @@ -46,7 +51,10 @@ class VP8RateControlRTC { // level is calculated from frame qp. int GetLoopfilterLevel() const; // int GetLoopfilterLevel() const; - void ComputeQP(const VP8FrameParamsQpRTC &frame_params); + // ComputeQP returns the QP is the frame is not dropped (kOk return), + // otherwise it returns kDrop and subsequent GetQP and PostEncodeUpdate + // are not to be called. + FrameDropDecision ComputeQP(const VP8FrameParamsQpRTC &frame_params); // Feedback to rate control with the size of current encoded frame void PostEncodeUpdate(uint64_t encoded_frame_size); From 9142314c2cec2be364e6844d1630a056e7b0a3c8 Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 14 Nov 2023 17:57:04 -0800 Subject: [PATCH 0943/1000] ratectrl_rtc.h: fix a few typos is -> if returns -> computes in the documentation for ComputeQP(). Change-Id: If70706736b0dc2ae56e45e2489dc208c61fd557a --- vp9/ratectrl_rtc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vp9/ratectrl_rtc.h b/vp9/ratectrl_rtc.h index a8dd5c42ff..7f624a5fe3 100644 --- a/vp9/ratectrl_rtc.h +++ b/vp9/ratectrl_rtc.h @@ -100,7 +100,7 @@ class VP9RateControlRTC { int GetQP() const; int GetLoopfilterLevel() const; bool GetSegmentationData(VP9SegmentationData *segmentation_data) const; - // ComputeQP returns the QP is the frame is not dropped (kOk return), + // ComputeQP computes the QP if the frame is not dropped (kOk return), // otherwise it returns kDrop and subsequent GetQP and PostEncodeUpdate // are not to be called (vp9_rc_postencode_update_drop_frame is already // called via ComputeQP if drop is decided). From 9f8776ff4af45a412c247c3ebafee7d002c1094d Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 20 Nov 2023 11:49:14 -0800 Subject: [PATCH 0944/1000] vp8_ratectrl_rtc.h: fix a few typos is -> if returns -> computes in the documentation for ComputeQP(). This is the same as: 9142314c2 ratectrl_rtc.h: fix a few typos + remove a duplicate, commented out, version of GetLoopfilterLevel() Change-Id: I8832e628b63b0b7dac6236631072f36ad55d90e8 --- vp8/vp8_ratectrl_rtc.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vp8/vp8_ratectrl_rtc.h b/vp8/vp8_ratectrl_rtc.h index 5ffe54c47c..4c174b1315 100644 --- a/vp8/vp8_ratectrl_rtc.h +++ b/vp8/vp8_ratectrl_rtc.h @@ -50,8 +50,7 @@ class VP8RateControlRTC { // GetLoopfilterLevel() needs to be called after ComputeQP() since loopfilter // level is calculated from frame qp. int GetLoopfilterLevel() const; - // int GetLoopfilterLevel() const; - // ComputeQP returns the QP is the frame is not dropped (kOk return), + // ComputeQP computes the QP if the frame is not dropped (kOk return), // otherwise it returns kDrop and subsequent GetQP and PostEncodeUpdate // are not to be called. FrameDropDecision ComputeQP(const VP8FrameParamsQpRTC &frame_params); From 1231fce45ecee61b8fc97e8a61b729d11a562897 Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 20 Nov 2023 11:54:50 -0800 Subject: [PATCH 0945/1000] vp8_dx_iface.c: add include for MAX_PARTITIONS fixes clang-tidy warning: no header providing "MAX_PARTITIONS" is directly included Change-Id: Iba7a9d95df7f5bdee76e7975df764cd71461fc93 --- vp8/vp8_dx_iface.c | 1 + 1 file changed, 1 insertion(+) diff --git a/vp8/vp8_dx_iface.c b/vp8/vp8_dx_iface.c index d4e06a7bce..2e5d6dcfe8 100644 --- a/vp8/vp8_dx_iface.c +++ b/vp8/vp8_dx_iface.c @@ -20,6 +20,7 @@ #include "vpx_version.h" #include "common/alloccommon.h" #include "common/common.h" +#include "common/onyxc_int.h" #include "common/onyxd.h" #include "decoder/onyxd_int.h" #include "vpx_dsp/vpx_dsp_common.h" From a8db542b24a50060e996feda00b0538c7c334909 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Fri, 29 Sep 2023 10:45:32 -0700 Subject: [PATCH 0946/1000] Add vpx_sse and vpx_highbd_sse The code is ported from libaom's aom_sse and aom_highbd_sse at commit 1e20d2da96515524864b21010dbe23809cff2e9b. The vpx_sse and vpx_highbd_sse functions will be used by vpx_dsp/psnr.c. Bug: webm:1819 Change-Id: I4fbffa9000ab92755de5387b1ddd4370cb7020f7 --- test/sum_squares_test.cc | 211 +++++++++++++++++ vpx_dsp/arm/highbd_sse_neon.c | 288 +++++++++++++++++++++++ vpx_dsp/arm/sse_neon.c | 210 +++++++++++++++++ vpx_dsp/arm/sse_neon_dotprod.c | 223 ++++++++++++++++++ vpx_dsp/arm/sum_neon.h | 51 +++++ vpx_dsp/sse.c | 58 +++++ vpx_dsp/vpx_dsp.mk | 6 + vpx_dsp/vpx_dsp_rtcd_defs.pl | 6 + vpx_dsp/x86/sse_avx2.c | 401 +++++++++++++++++++++++++++++++++ vpx_dsp/x86/sse_sse4.c | 359 +++++++++++++++++++++++++++++ 10 files changed, 1813 insertions(+) create mode 100644 vpx_dsp/arm/highbd_sse_neon.c create mode 100644 vpx_dsp/arm/sse_neon.c create mode 100644 vpx_dsp/arm/sse_neon_dotprod.c create mode 100644 vpx_dsp/sse.c create mode 100644 vpx_dsp/x86/sse_avx2.c create mode 100644 vpx_dsp/x86/sse_sse4.c diff --git a/test/sum_squares_test.cc b/test/sum_squares_test.cc index 5abb464dc0..725d5eb853 100644 --- a/test/sum_squares_test.cc +++ b/test/sum_squares_test.cc @@ -21,9 +21,14 @@ #include "test/clear_system_state.h" #include "test/register_state_check.h" #include "test/util.h" +#include "vpx_mem/vpx_mem.h" #include "vpx_ports/mem.h" +#include "vpx_ports/vpx_timer.h" using libvpx_test::ACMRandom; +using ::testing::Combine; +using ::testing::Range; +using ::testing::ValuesIn; namespace { const int kNumIterations = 10000; @@ -126,4 +131,210 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(make_tuple(&vpx_sum_squares_2d_i16_c, &vpx_sum_squares_2d_i16_msa))); #endif // HAVE_MSA + +typedef int64_t (*SSEFunc)(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height); + +struct TestSSEFuncs { + TestSSEFuncs(SSEFunc ref = nullptr, SSEFunc tst = nullptr, int depth = 0) + : ref_func(ref), tst_func(tst), bit_depth(depth) {} + SSEFunc ref_func; // Pointer to reference function + SSEFunc tst_func; // Pointer to tested function + int bit_depth; +}; + +typedef std::tuple SSETestParam; + +class SSETest : public ::testing::TestWithParam { + public: + ~SSETest() override = default; + void SetUp() override { + params_ = GET_PARAM(0); + width_ = GET_PARAM(1); + is_hbd_ = +#if CONFIG_VP9_HIGHBITDEPTH + params_.ref_func == vpx_highbd_sse_c; +#else + false; +#endif + rnd_.Reset(ACMRandom::DeterministicSeed()); + src_ = reinterpret_cast(vpx_memalign(32, 256 * 256 * 2)); + ref_ = reinterpret_cast(vpx_memalign(32, 256 * 256 * 2)); + ASSERT_NE(src_, nullptr); + ASSERT_NE(ref_, nullptr); + } + + void TearDown() override { + vpx_free(src_); + vpx_free(ref_); + } + void RunTest(bool is_random, int width, int height, int run_times); + + void GenRandomData(int width, int height, int stride) { + uint16_t *src16 = reinterpret_cast(src_); + uint16_t *ref16 = reinterpret_cast(ref_); + const int msb = 11; // Up to 12 bit input + const int limit = 1 << (msb + 1); + for (int ii = 0; ii < height; ii++) { + for (int jj = 0; jj < width; jj++) { + if (!is_hbd_) { + src_[ii * stride + jj] = rnd_.Rand8(); + ref_[ii * stride + jj] = rnd_.Rand8(); + } else { + src16[ii * stride + jj] = rnd_(limit); + ref16[ii * stride + jj] = rnd_(limit); + } + } + } + } + + void GenExtremeData(int width, int height, int stride, uint8_t *data, + int16_t val) { + uint16_t *data16 = reinterpret_cast(data); + for (int ii = 0; ii < height; ii++) { + for (int jj = 0; jj < width; jj++) { + if (!is_hbd_) { + data[ii * stride + jj] = static_cast(val); + } else { + data16[ii * stride + jj] = val; + } + } + } + } + + protected: + bool is_hbd_; + int width_; + TestSSEFuncs params_; + uint8_t *src_; + uint8_t *ref_; + ACMRandom rnd_; +}; +GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(SSETest); + +void SSETest::RunTest(bool is_random, int width, int height, int run_times) { + int failed = 0; + vpx_usec_timer ref_timer, test_timer; + for (int k = 0; k < 3; k++) { + int stride = 4 << rnd_(7); // Up to 256 stride + while (stride < width) { // Make sure it's valid + stride = 4 << rnd_(7); + } + if (is_random) { + GenRandomData(width, height, stride); + } else { + const int msb = is_hbd_ ? 12 : 8; // Up to 12 bit input + const int limit = (1 << msb) - 1; + if (k == 0) { + GenExtremeData(width, height, stride, src_, 0); + GenExtremeData(width, height, stride, ref_, limit); + } else { + GenExtremeData(width, height, stride, src_, limit); + GenExtremeData(width, height, stride, ref_, 0); + } + } + int64_t res_ref, res_tst; + uint8_t *src = src_; + uint8_t *ref = ref_; +#if CONFIG_VP9_HIGHBITDEPTH + if (is_hbd_) { + src = CONVERT_TO_BYTEPTR(src_); + ref = CONVERT_TO_BYTEPTR(ref_); + } +#endif + res_ref = params_.ref_func(src, stride, ref, stride, width, height); + res_tst = params_.tst_func(src, stride, ref, stride, width, height); + if (run_times > 1) { + vpx_usec_timer_start(&ref_timer); + for (int j = 0; j < run_times; j++) { + params_.ref_func(src, stride, ref, stride, width, height); + } + vpx_usec_timer_mark(&ref_timer); + const int elapsed_time_c = + static_cast(vpx_usec_timer_elapsed(&ref_timer)); + + vpx_usec_timer_start(&test_timer); + for (int j = 0; j < run_times; j++) { + params_.tst_func(src, stride, ref, stride, width, height); + } + vpx_usec_timer_mark(&test_timer); + const int elapsed_time_simd = + static_cast(vpx_usec_timer_elapsed(&test_timer)); + + printf( + "c_time=%d \t simd_time=%d \t " + "gain=%d\n", + elapsed_time_c, elapsed_time_simd, + (elapsed_time_c / elapsed_time_simd)); + } else { + if (!failed) { + failed = res_ref != res_tst; + EXPECT_EQ(res_ref, res_tst) + << "Error:" << (is_hbd_ ? "hbd " : " ") << k << " SSE Test [" + << width << "x" << height + << "] C output does not match optimized output."; + } + } + } +} + +TEST_P(SSETest, OperationCheck) { + for (int height = 4; height <= 128; height += 4) { + RunTest(true, width_, height, 1); // GenRandomData + } +} + +TEST_P(SSETest, ExtremeValues) { + for (int height = 4; height <= 128; height += 4) { + RunTest(false, width_, height, 1); + } +} + +TEST_P(SSETest, DISABLED_Speed) { + for (int height = 4; height <= 128; height += 4) { + RunTest(true, width_, height, 100); + } +} + +#if HAVE_NEON +TestSSEFuncs sse_neon[] = { + TestSSEFuncs(&vpx_sse_c, &vpx_sse_neon), +#if CONFIG_VP9_HIGHBITDEPTH + TestSSEFuncs(&vpx_highbd_sse_c, &vpx_highbd_sse_neon) +#endif +}; +INSTANTIATE_TEST_SUITE_P(NEON, SSETest, + Combine(ValuesIn(sse_neon), Range(4, 129, 4))); +#endif // HAVE_NEON + +#if HAVE_NEON_DOTPROD +TestSSEFuncs sse_neon_dotprod[] = { + TestSSEFuncs(&vpx_sse_c, &vpx_sse_neon_dotprod), +}; +INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, SSETest, + Combine(ValuesIn(sse_neon_dotprod), Range(4, 129, 4))); +#endif // HAVE_NEON_DOTPROD + +#if HAVE_SSE4_1 +TestSSEFuncs sse_sse4[] = { + TestSSEFuncs(&vpx_sse_c, &vpx_sse_sse4_1), +#if CONFIG_VP9_HIGHBITDEPTH + TestSSEFuncs(&vpx_highbd_sse_c, &vpx_highbd_sse_sse4_1) +#endif +}; +INSTANTIATE_TEST_SUITE_P(SSE4_1, SSETest, + Combine(ValuesIn(sse_sse4), Range(4, 129, 4))); +#endif // HAVE_SSE4_1 + +#if HAVE_AVX2 + +TestSSEFuncs sse_avx2[] = { + TestSSEFuncs(&vpx_sse_c, &vpx_sse_avx2), +#if CONFIG_VP9_HIGHBITDEPTH + TestSSEFuncs(&vpx_highbd_sse_c, &vpx_highbd_sse_avx2) +#endif +}; +INSTANTIATE_TEST_SUITE_P(AVX2, SSETest, + Combine(ValuesIn(sse_avx2), Range(4, 129, 4))); +#endif // HAVE_AVX2 } // namespace diff --git a/vpx_dsp/arm/highbd_sse_neon.c b/vpx_dsp/arm/highbd_sse_neon.c new file mode 100644 index 0000000000..717ad6b19a --- /dev/null +++ b/vpx_dsp/arm/highbd_sse_neon.c @@ -0,0 +1,288 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/sum_neon.h" + +static INLINE void highbd_sse_8x1_init_neon(const uint16_t *src, + const uint16_t *ref, + uint32x4_t *sse_acc0, + uint32x4_t *sse_acc1) { + uint16x8_t s = vld1q_u16(src); + uint16x8_t r = vld1q_u16(ref); + + uint16x8_t abs_diff = vabdq_u16(s, r); + uint16x4_t abs_diff_lo = vget_low_u16(abs_diff); + uint16x4_t abs_diff_hi = vget_high_u16(abs_diff); + + *sse_acc0 = vmull_u16(abs_diff_lo, abs_diff_lo); + *sse_acc1 = vmull_u16(abs_diff_hi, abs_diff_hi); +} + +static INLINE void highbd_sse_8x1_neon(const uint16_t *src, const uint16_t *ref, + uint32x4_t *sse_acc0, + uint32x4_t *sse_acc1) { + uint16x8_t s = vld1q_u16(src); + uint16x8_t r = vld1q_u16(ref); + + uint16x8_t abs_diff = vabdq_u16(s, r); + uint16x4_t abs_diff_lo = vget_low_u16(abs_diff); + uint16x4_t abs_diff_hi = vget_high_u16(abs_diff); + + *sse_acc0 = vmlal_u16(*sse_acc0, abs_diff_lo, abs_diff_lo); + *sse_acc1 = vmlal_u16(*sse_acc1, abs_diff_hi, abs_diff_hi); +} + +static INLINE int64_t highbd_sse_128xh_neon(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int height) { + uint32x4_t sse[16]; + highbd_sse_8x1_init_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]); + highbd_sse_8x1_init_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]); + highbd_sse_8x1_init_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]); + highbd_sse_8x1_init_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]); + highbd_sse_8x1_init_neon(src + 4 * 8, ref + 4 * 8, &sse[8], &sse[9]); + highbd_sse_8x1_init_neon(src + 5 * 8, ref + 5 * 8, &sse[10], &sse[11]); + highbd_sse_8x1_init_neon(src + 6 * 8, ref + 6 * 8, &sse[12], &sse[13]); + highbd_sse_8x1_init_neon(src + 7 * 8, ref + 7 * 8, &sse[14], &sse[15]); + highbd_sse_8x1_neon(src + 8 * 8, ref + 8 * 8, &sse[0], &sse[1]); + highbd_sse_8x1_neon(src + 9 * 8, ref + 9 * 8, &sse[2], &sse[3]); + highbd_sse_8x1_neon(src + 10 * 8, ref + 10 * 8, &sse[4], &sse[5]); + highbd_sse_8x1_neon(src + 11 * 8, ref + 11 * 8, &sse[6], &sse[7]); + highbd_sse_8x1_neon(src + 12 * 8, ref + 12 * 8, &sse[8], &sse[9]); + highbd_sse_8x1_neon(src + 13 * 8, ref + 13 * 8, &sse[10], &sse[11]); + highbd_sse_8x1_neon(src + 14 * 8, ref + 14 * 8, &sse[12], &sse[13]); + highbd_sse_8x1_neon(src + 15 * 8, ref + 15 * 8, &sse[14], &sse[15]); + + src += src_stride; + ref += ref_stride; + + while (--height != 0) { + highbd_sse_8x1_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]); + highbd_sse_8x1_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]); + highbd_sse_8x1_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]); + highbd_sse_8x1_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]); + highbd_sse_8x1_neon(src + 4 * 8, ref + 4 * 8, &sse[8], &sse[9]); + highbd_sse_8x1_neon(src + 5 * 8, ref + 5 * 8, &sse[10], &sse[11]); + highbd_sse_8x1_neon(src + 6 * 8, ref + 6 * 8, &sse[12], &sse[13]); + highbd_sse_8x1_neon(src + 7 * 8, ref + 7 * 8, &sse[14], &sse[15]); + highbd_sse_8x1_neon(src + 8 * 8, ref + 8 * 8, &sse[0], &sse[1]); + highbd_sse_8x1_neon(src + 9 * 8, ref + 9 * 8, &sse[2], &sse[3]); + highbd_sse_8x1_neon(src + 10 * 8, ref + 10 * 8, &sse[4], &sse[5]); + highbd_sse_8x1_neon(src + 11 * 8, ref + 11 * 8, &sse[6], &sse[7]); + highbd_sse_8x1_neon(src + 12 * 8, ref + 12 * 8, &sse[8], &sse[9]); + highbd_sse_8x1_neon(src + 13 * 8, ref + 13 * 8, &sse[10], &sse[11]); + highbd_sse_8x1_neon(src + 14 * 8, ref + 14 * 8, &sse[12], &sse[13]); + highbd_sse_8x1_neon(src + 15 * 8, ref + 15 * 8, &sse[14], &sse[15]); + + src += src_stride; + ref += ref_stride; + } + + return horizontal_long_add_uint32x4_x16(sse); +} + +static INLINE int64_t highbd_sse_64xh_neon(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int height) { + uint32x4_t sse[8]; + highbd_sse_8x1_init_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]); + highbd_sse_8x1_init_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]); + highbd_sse_8x1_init_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]); + highbd_sse_8x1_init_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]); + highbd_sse_8x1_neon(src + 4 * 8, ref + 4 * 8, &sse[0], &sse[1]); + highbd_sse_8x1_neon(src + 5 * 8, ref + 5 * 8, &sse[2], &sse[3]); + highbd_sse_8x1_neon(src + 6 * 8, ref + 6 * 8, &sse[4], &sse[5]); + highbd_sse_8x1_neon(src + 7 * 8, ref + 7 * 8, &sse[6], &sse[7]); + + src += src_stride; + ref += ref_stride; + + while (--height != 0) { + highbd_sse_8x1_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]); + highbd_sse_8x1_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]); + highbd_sse_8x1_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]); + highbd_sse_8x1_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]); + highbd_sse_8x1_neon(src + 4 * 8, ref + 4 * 8, &sse[0], &sse[1]); + highbd_sse_8x1_neon(src + 5 * 8, ref + 5 * 8, &sse[2], &sse[3]); + highbd_sse_8x1_neon(src + 6 * 8, ref + 6 * 8, &sse[4], &sse[5]); + highbd_sse_8x1_neon(src + 7 * 8, ref + 7 * 8, &sse[6], &sse[7]); + + src += src_stride; + ref += ref_stride; + } + + return horizontal_long_add_uint32x4_x8(sse); +} + +static INLINE int64_t highbd_sse_32xh_neon(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int height) { + uint32x4_t sse[8]; + highbd_sse_8x1_init_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]); + highbd_sse_8x1_init_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]); + highbd_sse_8x1_init_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]); + highbd_sse_8x1_init_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]); + + src += src_stride; + ref += ref_stride; + + while (--height != 0) { + highbd_sse_8x1_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]); + highbd_sse_8x1_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]); + highbd_sse_8x1_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]); + highbd_sse_8x1_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]); + + src += src_stride; + ref += ref_stride; + } + + return horizontal_long_add_uint32x4_x8(sse); +} + +static INLINE int64_t highbd_sse_16xh_neon(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int height) { + uint32x4_t sse[4]; + highbd_sse_8x1_init_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]); + highbd_sse_8x1_init_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]); + + src += src_stride; + ref += ref_stride; + + while (--height != 0) { + highbd_sse_8x1_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]); + highbd_sse_8x1_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]); + + src += src_stride; + ref += ref_stride; + } + + return horizontal_long_add_uint32x4_x4(sse); +} + +static INLINE int64_t highbd_sse_8xh_neon(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int height) { + uint32x4_t sse[2]; + highbd_sse_8x1_init_neon(src, ref, &sse[0], &sse[1]); + + src += src_stride; + ref += ref_stride; + + while (--height != 0) { + highbd_sse_8x1_neon(src, ref, &sse[0], &sse[1]); + + src += src_stride; + ref += ref_stride; + } + + return horizontal_long_add_uint32x4_x2(sse); +} + +static INLINE int64_t highbd_sse_4xh_neon(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int height) { + // Peel the first loop iteration. + uint16x4_t s = vld1_u16(src); + uint16x4_t r = vld1_u16(ref); + + uint16x4_t abs_diff = vabd_u16(s, r); + uint32x4_t sse = vmull_u16(abs_diff, abs_diff); + + src += src_stride; + ref += ref_stride; + + while (--height != 0) { + s = vld1_u16(src); + r = vld1_u16(ref); + + abs_diff = vabd_u16(s, r); + sse = vmlal_u16(sse, abs_diff, abs_diff); + + src += src_stride; + ref += ref_stride; + } + + return horizontal_long_add_uint32x4(sse); +} + +static INLINE int64_t highbd_sse_wxh_neon(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int width, int height) { + // { 0, 1, 2, 3, 4, 5, 6, 7 } + uint16x8_t k01234567 = vmovl_u8(vcreate_u8(0x0706050403020100)); + uint16x8_t remainder_mask = vcltq_u16(k01234567, vdupq_n_u16(width & 7)); + uint64_t sse = 0; + + do { + int w = width; + int offset = 0; + + do { + uint16x8_t s = vld1q_u16(src + offset); + uint16x8_t r = vld1q_u16(ref + offset); + uint16x8_t abs_diff; + uint16x4_t abs_diff_lo; + uint16x4_t abs_diff_hi; + uint32x4_t sse_u32; + + if (w < 8) { + // Mask out-of-range elements. + s = vandq_u16(s, remainder_mask); + r = vandq_u16(r, remainder_mask); + } + + abs_diff = vabdq_u16(s, r); + abs_diff_lo = vget_low_u16(abs_diff); + abs_diff_hi = vget_high_u16(abs_diff); + + sse_u32 = vmull_u16(abs_diff_lo, abs_diff_lo); + sse_u32 = vmlal_u16(sse_u32, abs_diff_hi, abs_diff_hi); + + sse += horizontal_long_add_uint32x4(sse_u32); + + offset += 8; + w -= 8; + } while (w > 0); + + src += src_stride; + ref += ref_stride; + } while (--height != 0); + + return sse; +} + +int64_t vpx_highbd_sse_neon(const uint8_t *src8, int src_stride, + const uint8_t *ref8, int ref_stride, int width, + int height) { + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + + switch (width) { + case 4: + return highbd_sse_4xh_neon(src, src_stride, ref, ref_stride, height); + case 8: + return highbd_sse_8xh_neon(src, src_stride, ref, ref_stride, height); + case 16: + return highbd_sse_16xh_neon(src, src_stride, ref, ref_stride, height); + case 32: + return highbd_sse_32xh_neon(src, src_stride, ref, ref_stride, height); + case 64: + return highbd_sse_64xh_neon(src, src_stride, ref, ref_stride, height); + case 128: + return highbd_sse_128xh_neon(src, src_stride, ref, ref_stride, height); + default: + return highbd_sse_wxh_neon(src, src_stride, ref, ref_stride, width, + height); + } +} diff --git a/vpx_dsp/arm/sse_neon.c b/vpx_dsp/arm/sse_neon.c new file mode 100644 index 0000000000..0b4a6e504a --- /dev/null +++ b/vpx_dsp/arm/sse_neon.c @@ -0,0 +1,210 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" + +static INLINE void sse_16x1_neon(const uint8_t *src, const uint8_t *ref, + uint32x4_t *sse) { + uint8x16_t s = vld1q_u8(src); + uint8x16_t r = vld1q_u8(ref); + + uint8x16_t abs_diff = vabdq_u8(s, r); + uint8x8_t abs_diff_lo = vget_low_u8(abs_diff); + uint8x8_t abs_diff_hi = vget_high_u8(abs_diff); + + *sse = vpadalq_u16(*sse, vmull_u8(abs_diff_lo, abs_diff_lo)); + *sse = vpadalq_u16(*sse, vmull_u8(abs_diff_hi, abs_diff_hi)); +} + +static INLINE void sse_8x1_neon(const uint8_t *src, const uint8_t *ref, + uint32x4_t *sse) { + uint8x8_t s = vld1_u8(src); + uint8x8_t r = vld1_u8(ref); + + uint8x8_t abs_diff = vabd_u8(s, r); + + *sse = vpadalq_u16(*sse, vmull_u8(abs_diff, abs_diff)); +} + +static INLINE void sse_4x2_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + uint32x4_t *sse) { + uint8x8_t s = load_unaligned_u8(src, src_stride); + uint8x8_t r = load_unaligned_u8(ref, ref_stride); + + uint8x8_t abs_diff = vabd_u8(s, r); + + *sse = vpadalq_u16(*sse, vmull_u8(abs_diff, abs_diff)); +} + +static INLINE uint32_t sse_wxh_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int width, int height) { + uint32x4_t sse = vdupq_n_u32(0); + + if ((width & 0x07) && ((width & 0x07) < 5)) { + int i = height; + do { + int j = 0; + do { + sse_8x1_neon(src + j, ref + j, &sse); + sse_8x1_neon(src + j + src_stride, ref + j + ref_stride, &sse); + j += 8; + } while (j + 4 < width); + + sse_4x2_neon(src + j, src_stride, ref + j, ref_stride, &sse); + src += 2 * src_stride; + ref += 2 * ref_stride; + i -= 2; + } while (i != 0); + } else { + int i = height; + do { + int j = 0; + do { + sse_8x1_neon(src + j, ref + j, &sse); + j += 8; + } while (j < width); + + src += src_stride; + ref += ref_stride; + } while (--i != 0); + } + return horizontal_add_uint32x4(sse); +} + +static INLINE uint32_t sse_128xh_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int height) { + uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = height; + do { + sse_16x1_neon(src, ref, &sse[0]); + sse_16x1_neon(src + 16, ref + 16, &sse[1]); + sse_16x1_neon(src + 32, ref + 32, &sse[0]); + sse_16x1_neon(src + 48, ref + 48, &sse[1]); + sse_16x1_neon(src + 64, ref + 64, &sse[0]); + sse_16x1_neon(src + 80, ref + 80, &sse[1]); + sse_16x1_neon(src + 96, ref + 96, &sse[0]); + sse_16x1_neon(src + 112, ref + 112, &sse[1]); + + src += src_stride; + ref += ref_stride; + } while (--i != 0); + + return horizontal_add_uint32x4(vaddq_u32(sse[0], sse[1])); +} + +static INLINE uint32_t sse_64xh_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int height) { + uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = height; + do { + sse_16x1_neon(src, ref, &sse[0]); + sse_16x1_neon(src + 16, ref + 16, &sse[1]); + sse_16x1_neon(src + 32, ref + 32, &sse[0]); + sse_16x1_neon(src + 48, ref + 48, &sse[1]); + + src += src_stride; + ref += ref_stride; + } while (--i != 0); + + return horizontal_add_uint32x4(vaddq_u32(sse[0], sse[1])); +} + +static INLINE uint32_t sse_32xh_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int height) { + uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = height; + do { + sse_16x1_neon(src, ref, &sse[0]); + sse_16x1_neon(src + 16, ref + 16, &sse[1]); + + src += src_stride; + ref += ref_stride; + } while (--i != 0); + + return horizontal_add_uint32x4(vaddq_u32(sse[0], sse[1])); +} + +static INLINE uint32_t sse_16xh_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int height) { + uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = height; + do { + sse_16x1_neon(src, ref, &sse[0]); + src += src_stride; + ref += ref_stride; + sse_16x1_neon(src, ref, &sse[1]); + src += src_stride; + ref += ref_stride; + i -= 2; + } while (i != 0); + + return horizontal_add_uint32x4(vaddq_u32(sse[0], sse[1])); +} + +static INLINE uint32_t sse_8xh_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int height) { + uint32x4_t sse = vdupq_n_u32(0); + + int i = height; + do { + sse_8x1_neon(src, ref, &sse); + + src += src_stride; + ref += ref_stride; + } while (--i != 0); + + return horizontal_add_uint32x4(sse); +} + +static INLINE uint32_t sse_4xh_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int height) { + uint32x4_t sse = vdupq_n_u32(0); + + int i = height; + do { + sse_4x2_neon(src, src_stride, ref, ref_stride, &sse); + + src += 2 * src_stride; + ref += 2 * ref_stride; + i -= 2; + } while (i != 0); + + return horizontal_add_uint32x4(sse); +} + +int64_t vpx_sse_neon(const uint8_t *src, int src_stride, const uint8_t *ref, + int ref_stride, int width, int height) { + switch (width) { + case 4: return sse_4xh_neon(src, src_stride, ref, ref_stride, height); + case 8: return sse_8xh_neon(src, src_stride, ref, ref_stride, height); + case 16: return sse_16xh_neon(src, src_stride, ref, ref_stride, height); + case 32: return sse_32xh_neon(src, src_stride, ref, ref_stride, height); + case 64: return sse_64xh_neon(src, src_stride, ref, ref_stride, height); + case 128: return sse_128xh_neon(src, src_stride, ref, ref_stride, height); + default: + return sse_wxh_neon(src, src_stride, ref, ref_stride, width, height); + } +} diff --git a/vpx_dsp/arm/sse_neon_dotprod.c b/vpx_dsp/arm/sse_neon_dotprod.c new file mode 100644 index 0000000000..0f11b7cbb2 --- /dev/null +++ b/vpx_dsp/arm/sse_neon_dotprod.c @@ -0,0 +1,223 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" + +static INLINE void sse_16x1_neon_dotprod(const uint8_t *src, const uint8_t *ref, + uint32x4_t *sse) { + uint8x16_t s = vld1q_u8(src); + uint8x16_t r = vld1q_u8(ref); + + uint8x16_t abs_diff = vabdq_u8(s, r); + + *sse = vdotq_u32(*sse, abs_diff, abs_diff); +} + +static INLINE void sse_8x1_neon_dotprod(const uint8_t *src, const uint8_t *ref, + uint32x2_t *sse) { + uint8x8_t s = vld1_u8(src); + uint8x8_t r = vld1_u8(ref); + + uint8x8_t abs_diff = vabd_u8(s, r); + + *sse = vdot_u32(*sse, abs_diff, abs_diff); +} + +static INLINE void sse_4x2_neon_dotprod(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + uint32x2_t *sse) { + uint8x8_t s = load_unaligned_u8(src, src_stride); + uint8x8_t r = load_unaligned_u8(ref, ref_stride); + + uint8x8_t abs_diff = vabd_u8(s, r); + + *sse = vdot_u32(*sse, abs_diff, abs_diff); +} + +static INLINE uint32_t sse_wxh_neon_dotprod(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int width, int height) { + uint32x2_t sse[2] = { vdup_n_u32(0), vdup_n_u32(0) }; + + if ((width & 0x07) && ((width & 0x07) < 5)) { + int i = height; + do { + int j = 0; + do { + sse_8x1_neon_dotprod(src + j, ref + j, &sse[0]); + sse_8x1_neon_dotprod(src + j + src_stride, ref + j + ref_stride, + &sse[1]); + j += 8; + } while (j + 4 < width); + + sse_4x2_neon_dotprod(src + j, src_stride, ref + j, ref_stride, &sse[0]); + src += 2 * src_stride; + ref += 2 * ref_stride; + i -= 2; + } while (i != 0); + } else { + int i = height; + do { + int j = 0; + do { + sse_8x1_neon_dotprod(src + j, ref + j, &sse[0]); + sse_8x1_neon_dotprod(src + j + src_stride, ref + j + ref_stride, + &sse[1]); + j += 8; + } while (j < width); + + src += 2 * src_stride; + ref += 2 * ref_stride; + i -= 2; + } while (i != 0); + } + return horizontal_add_uint32x4(vcombine_u32(sse[0], sse[1])); +} + +static INLINE uint32_t sse_128xh_neon_dotprod(const uint8_t *src, + int src_stride, + const uint8_t *ref, + int ref_stride, int height) { + uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = height; + do { + sse_16x1_neon_dotprod(src, ref, &sse[0]); + sse_16x1_neon_dotprod(src + 16, ref + 16, &sse[1]); + sse_16x1_neon_dotprod(src + 32, ref + 32, &sse[0]); + sse_16x1_neon_dotprod(src + 48, ref + 48, &sse[1]); + sse_16x1_neon_dotprod(src + 64, ref + 64, &sse[0]); + sse_16x1_neon_dotprod(src + 80, ref + 80, &sse[1]); + sse_16x1_neon_dotprod(src + 96, ref + 96, &sse[0]); + sse_16x1_neon_dotprod(src + 112, ref + 112, &sse[1]); + + src += src_stride; + ref += ref_stride; + } while (--i != 0); + + return horizontal_add_uint32x4(vaddq_u32(sse[0], sse[1])); +} + +static INLINE uint32_t sse_64xh_neon_dotprod(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int height) { + uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = height; + do { + sse_16x1_neon_dotprod(src, ref, &sse[0]); + sse_16x1_neon_dotprod(src + 16, ref + 16, &sse[1]); + sse_16x1_neon_dotprod(src + 32, ref + 32, &sse[0]); + sse_16x1_neon_dotprod(src + 48, ref + 48, &sse[1]); + + src += src_stride; + ref += ref_stride; + } while (--i != 0); + + return horizontal_add_uint32x4(vaddq_u32(sse[0], sse[1])); +} + +static INLINE uint32_t sse_32xh_neon_dotprod(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int height) { + uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = height; + do { + sse_16x1_neon_dotprod(src, ref, &sse[0]); + sse_16x1_neon_dotprod(src + 16, ref + 16, &sse[1]); + + src += src_stride; + ref += ref_stride; + } while (--i != 0); + + return horizontal_add_uint32x4(vaddq_u32(sse[0], sse[1])); +} + +static INLINE uint32_t sse_16xh_neon_dotprod(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int height) { + uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = height; + do { + sse_16x1_neon_dotprod(src, ref, &sse[0]); + src += src_stride; + ref += ref_stride; + sse_16x1_neon_dotprod(src, ref, &sse[1]); + src += src_stride; + ref += ref_stride; + i -= 2; + } while (i != 0); + + return horizontal_add_uint32x4(vaddq_u32(sse[0], sse[1])); +} + +static INLINE uint32_t sse_8xh_neon_dotprod(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int height) { + uint32x2_t sse[2] = { vdup_n_u32(0), vdup_n_u32(0) }; + + int i = height; + do { + sse_8x1_neon_dotprod(src, ref, &sse[0]); + src += src_stride; + ref += ref_stride; + sse_8x1_neon_dotprod(src, ref, &sse[1]); + src += src_stride; + ref += ref_stride; + i -= 2; + } while (i != 0); + + return horizontal_add_uint32x4(vcombine_u32(sse[0], sse[1])); +} + +static INLINE uint32_t sse_4xh_neon_dotprod(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int height) { + uint32x2_t sse = vdup_n_u32(0); + + int i = height; + do { + sse_4x2_neon_dotprod(src, src_stride, ref, ref_stride, &sse); + + src += 2 * src_stride; + ref += 2 * ref_stride; + i -= 2; + } while (i != 0); + + return horizontal_add_uint32x2(sse); +} + +int64_t vpx_sse_neon_dotprod(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, int width, + int height) { + switch (width) { + case 4: + return sse_4xh_neon_dotprod(src, src_stride, ref, ref_stride, height); + case 8: + return sse_8xh_neon_dotprod(src, src_stride, ref, ref_stride, height); + case 16: + return sse_16xh_neon_dotprod(src, src_stride, ref, ref_stride, height); + case 32: + return sse_32xh_neon_dotprod(src, src_stride, ref, ref_stride, height); + case 64: + return sse_64xh_neon_dotprod(src, src_stride, ref, ref_stride, height); + case 128: + return sse_128xh_neon_dotprod(src, src_stride, ref, ref_stride, height); + default: + return sse_wxh_neon_dotprod(src, src_stride, ref, ref_stride, width, + height); + } +} diff --git a/vpx_dsp/arm/sum_neon.h b/vpx_dsp/arm/sum_neon.h index 75c170df60..11821dc10e 100644 --- a/vpx_dsp/arm/sum_neon.h +++ b/vpx_dsp/arm/sum_neon.h @@ -221,4 +221,55 @@ static INLINE uint64_t horizontal_add_uint64x2(const uint64x2_t a) { #endif } +static INLINE uint64_t horizontal_long_add_uint32x4_x2(const uint32x4_t a[2]) { + return horizontal_long_add_uint32x4(a[0]) + + horizontal_long_add_uint32x4(a[1]); +} + +static INLINE uint64_t horizontal_long_add_uint32x4_x4(const uint32x4_t a[4]) { + uint64x2_t sum = vpaddlq_u32(a[0]); + sum = vpadalq_u32(sum, a[1]); + sum = vpadalq_u32(sum, a[2]); + sum = vpadalq_u32(sum, a[3]); + + return horizontal_add_uint64x2(sum); +} + +static INLINE uint64_t horizontal_long_add_uint32x4_x8(const uint32x4_t a[8]) { + uint64x2_t sum[2]; + sum[0] = vpaddlq_u32(a[0]); + sum[1] = vpaddlq_u32(a[1]); + sum[0] = vpadalq_u32(sum[0], a[2]); + sum[1] = vpadalq_u32(sum[1], a[3]); + sum[0] = vpadalq_u32(sum[0], a[4]); + sum[1] = vpadalq_u32(sum[1], a[5]); + sum[0] = vpadalq_u32(sum[0], a[6]); + sum[1] = vpadalq_u32(sum[1], a[7]); + + return horizontal_add_uint64x2(vaddq_u64(sum[0], sum[1])); +} + +static INLINE uint64_t +horizontal_long_add_uint32x4_x16(const uint32x4_t a[16]) { + uint64x2_t sum[2]; + sum[0] = vpaddlq_u32(a[0]); + sum[1] = vpaddlq_u32(a[1]); + sum[0] = vpadalq_u32(sum[0], a[2]); + sum[1] = vpadalq_u32(sum[1], a[3]); + sum[0] = vpadalq_u32(sum[0], a[4]); + sum[1] = vpadalq_u32(sum[1], a[5]); + sum[0] = vpadalq_u32(sum[0], a[6]); + sum[1] = vpadalq_u32(sum[1], a[7]); + sum[0] = vpadalq_u32(sum[0], a[8]); + sum[1] = vpadalq_u32(sum[1], a[9]); + sum[0] = vpadalq_u32(sum[0], a[10]); + sum[1] = vpadalq_u32(sum[1], a[11]); + sum[0] = vpadalq_u32(sum[0], a[12]); + sum[1] = vpadalq_u32(sum[1], a[13]); + sum[0] = vpadalq_u32(sum[0], a[14]); + sum[1] = vpadalq_u32(sum[1], a[15]); + + return horizontal_add_uint64x2(vaddq_u64(sum[0], sum[1])); +} + #endif // VPX_VPX_DSP_ARM_SUM_NEON_H_ diff --git a/vpx_dsp/sse.c b/vpx_dsp/sse.c new file mode 100644 index 0000000000..6cb4b705f8 --- /dev/null +++ b/vpx_dsp/sse.c @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +/* + * Sum the square of the difference between every corresponding element of the + * buffers. + */ + +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" + +#include "vpx/vpx_integer.h" + +int64_t vpx_sse_c(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height) { + int y, x; + int64_t sse = 0; + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) { + const int32_t diff = abs(a[x] - b[x]); + sse += diff * diff; + } + + a += a_stride; + b += b_stride; + } + return sse; +} + +#if CONFIG_VP9_HIGHBITDEPTH +int64_t vpx_highbd_sse_c(const uint8_t *a8, int a_stride, const uint8_t *b8, + int b_stride, int width, int height) { + int y, x; + int64_t sse = 0; + uint16_t *a = CONVERT_TO_SHORTPTR(a8); + uint16_t *b = CONVERT_TO_SHORTPTR(b8); + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) { + const int32_t diff = (int32_t)(a[x]) - (int32_t)(b[x]); + sse += diff * diff; + } + + a += a_stride; + b += b_stride; + } + return sse; +} +#endif diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index 84fd969daa..93abf39ff6 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -31,10 +31,15 @@ DSP_SRCS-yes += bitwriter_buffer.c DSP_SRCS-yes += bitwriter_buffer.h DSP_SRCS-yes += psnr.c DSP_SRCS-yes += psnr.h +DSP_SRCS-yes += sse.c DSP_SRCS-$(CONFIG_INTERNAL_STATS) += ssim.c DSP_SRCS-$(CONFIG_INTERNAL_STATS) += ssim.h DSP_SRCS-$(CONFIG_INTERNAL_STATS) += psnrhvs.c DSP_SRCS-$(CONFIG_INTERNAL_STATS) += fastssim.c +DSP_SRCS-$(HAVE_NEON) += arm/sse_neon.c +DSP_SRCS-$(HAVE_NEON_DOTPROD) += arm/sse_neon_dotprod.c +DSP_SRCS-$(HAVE_SSE4_1) += x86/sse_sse4.c +DSP_SRCS-$(HAVE_AVX2) += x86/sse_avx2.c endif ifeq ($(CONFIG_DECODERS),yes) @@ -447,6 +452,7 @@ DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_sse2.c DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_impl_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/highbd_subpel_variance_impl_sse2.asm DSP_SRCS-$(HAVE_NEON) += arm/highbd_avg_pred_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/highbd_sse_neon.c DSP_SRCS-$(HAVE_NEON) += arm/highbd_variance_neon.c DSP_SRCS-$(HAVE_NEON) += arm/highbd_subpel_variance_neon.c endif # CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index c9cdc285f2..e9d63f6ef2 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -744,6 +744,9 @@ () add_proto qw/void vpx_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride"; specialize qw/vpx_subtract_block neon msa mmi sse2 avx2 vsx lsx/; +add_proto qw/int64_t/, "vpx_sse", "const uint8_t *a, int a_stride, const uint8_t *b,int b_stride, int width, int height"; +specialize qw/vpx_sse sse4_1 avx2 neon neon_dotprod/; + # # Single block SAD # @@ -1026,6 +1029,9 @@ () add_proto qw/void vpx_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src8_ptr, ptrdiff_t src_stride, const uint8_t *pred8_ptr, ptrdiff_t pred_stride, int bd"; specialize qw/vpx_highbd_subtract_block neon avx2/; + add_proto qw/int64_t/, "vpx_highbd_sse", "const uint8_t *a8, int a_stride, const uint8_t *b8,int b_stride, int width, int height"; + specialize qw/vpx_highbd_sse sse4_1 avx2 neon/; + # # Single block SAD # diff --git a/vpx_dsp/x86/sse_avx2.c b/vpx_dsp/x86/sse_avx2.c new file mode 100644 index 0000000000..975446775e --- /dev/null +++ b/vpx_dsp/x86/sse_avx2.c @@ -0,0 +1,401 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" + +#include "vpx_ports/mem.h" +#include "vpx_dsp/x86/mem_sse2.h" + +static INLINE void sse_w32_avx2(__m256i *sum, const uint8_t *a, + const uint8_t *b) { + const __m256i v_a0 = _mm256_loadu_si256((const __m256i *)a); + const __m256i v_b0 = _mm256_loadu_si256((const __m256i *)b); + const __m256i zero = _mm256_setzero_si256(); + const __m256i v_a00_w = _mm256_unpacklo_epi8(v_a0, zero); + const __m256i v_a01_w = _mm256_unpackhi_epi8(v_a0, zero); + const __m256i v_b00_w = _mm256_unpacklo_epi8(v_b0, zero); + const __m256i v_b01_w = _mm256_unpackhi_epi8(v_b0, zero); + const __m256i v_d00_w = _mm256_sub_epi16(v_a00_w, v_b00_w); + const __m256i v_d01_w = _mm256_sub_epi16(v_a01_w, v_b01_w); + *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d00_w, v_d00_w)); + *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d01_w, v_d01_w)); +} + +static INLINE int64_t summary_all_avx2(const __m256i *sum_all) { + int64_t sum; + __m256i zero = _mm256_setzero_si256(); + const __m256i sum0_4x64 = _mm256_unpacklo_epi32(*sum_all, zero); + const __m256i sum1_4x64 = _mm256_unpackhi_epi32(*sum_all, zero); + const __m256i sum_4x64 = _mm256_add_epi64(sum0_4x64, sum1_4x64); + const __m128i sum_2x64 = _mm_add_epi64(_mm256_castsi256_si128(sum_4x64), + _mm256_extracti128_si256(sum_4x64, 1)); + const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8)); + _mm_storel_epi64((__m128i *)&sum, sum_1x64); + return sum; +} + +#if CONFIG_VP9_HIGHBITDEPTH +static INLINE void summary_32_avx2(const __m256i *sum32, __m256i *sum) { + const __m256i sum0_4x64 = + _mm256_cvtepu32_epi64(_mm256_castsi256_si128(*sum32)); + const __m256i sum1_4x64 = + _mm256_cvtepu32_epi64(_mm256_extracti128_si256(*sum32, 1)); + const __m256i sum_4x64 = _mm256_add_epi64(sum0_4x64, sum1_4x64); + *sum = _mm256_add_epi64(*sum, sum_4x64); +} + +static INLINE int64_t summary_4x64_avx2(const __m256i sum_4x64) { + int64_t sum; + const __m128i sum_2x64 = _mm_add_epi64(_mm256_castsi256_si128(sum_4x64), + _mm256_extracti128_si256(sum_4x64, 1)); + const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8)); + + _mm_storel_epi64((__m128i *)&sum, sum_1x64); + return sum; +} +#endif + +static INLINE void sse_w4x4_avx2(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, __m256i *sum) { + const __m128i v_a0 = load_unaligned_u32(a); + const __m128i v_a1 = load_unaligned_u32(a + a_stride); + const __m128i v_a2 = load_unaligned_u32(a + a_stride * 2); + const __m128i v_a3 = load_unaligned_u32(a + a_stride * 3); + const __m128i v_b0 = load_unaligned_u32(b); + const __m128i v_b1 = load_unaligned_u32(b + b_stride); + const __m128i v_b2 = load_unaligned_u32(b + b_stride * 2); + const __m128i v_b3 = load_unaligned_u32(b + b_stride * 3); + const __m128i v_a0123 = _mm_unpacklo_epi64(_mm_unpacklo_epi32(v_a0, v_a1), + _mm_unpacklo_epi32(v_a2, v_a3)); + const __m128i v_b0123 = _mm_unpacklo_epi64(_mm_unpacklo_epi32(v_b0, v_b1), + _mm_unpacklo_epi32(v_b2, v_b3)); + const __m256i v_a_w = _mm256_cvtepu8_epi16(v_a0123); + const __m256i v_b_w = _mm256_cvtepu8_epi16(v_b0123); + const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); + *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w)); +} + +static INLINE void sse_w8x2_avx2(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, __m256i *sum) { + const __m128i v_a0 = _mm_loadl_epi64((const __m128i *)a); + const __m128i v_a1 = _mm_loadl_epi64((const __m128i *)(a + a_stride)); + const __m128i v_b0 = _mm_loadl_epi64((const __m128i *)b); + const __m128i v_b1 = _mm_loadl_epi64((const __m128i *)(b + b_stride)); + const __m256i v_a_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(v_a0, v_a1)); + const __m256i v_b_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(v_b0, v_b1)); + const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); + *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w)); +} + +int64_t vpx_sse_avx2(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height) { + int32_t y = 0; + int64_t sse = 0; + __m256i sum = _mm256_setzero_si256(); + __m256i zero = _mm256_setzero_si256(); + switch (width) { + case 4: + do { + sse_w4x4_avx2(a, a_stride, b, b_stride, &sum); + a += a_stride << 2; + b += b_stride << 2; + y += 4; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 8: + do { + sse_w8x2_avx2(a, a_stride, b, b_stride, &sum); + a += a_stride << 1; + b += b_stride << 1; + y += 2; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 16: + do { + const __m128i v_a0 = _mm_loadu_si128((const __m128i *)a); + const __m128i v_a1 = _mm_loadu_si128((const __m128i *)(a + a_stride)); + const __m128i v_b0 = _mm_loadu_si128((const __m128i *)b); + const __m128i v_b1 = _mm_loadu_si128((const __m128i *)(b + b_stride)); + const __m256i v_a = + _mm256_insertf128_si256(_mm256_castsi128_si256(v_a0), v_a1, 0x01); + const __m256i v_b = + _mm256_insertf128_si256(_mm256_castsi128_si256(v_b0), v_b1, 0x01); + const __m256i v_al = _mm256_unpacklo_epi8(v_a, zero); + const __m256i v_au = _mm256_unpackhi_epi8(v_a, zero); + const __m256i v_bl = _mm256_unpacklo_epi8(v_b, zero); + const __m256i v_bu = _mm256_unpackhi_epi8(v_b, zero); + const __m256i v_asub = _mm256_sub_epi16(v_al, v_bl); + const __m256i v_bsub = _mm256_sub_epi16(v_au, v_bu); + const __m256i temp = + _mm256_add_epi32(_mm256_madd_epi16(v_asub, v_asub), + _mm256_madd_epi16(v_bsub, v_bsub)); + sum = _mm256_add_epi32(sum, temp); + a += a_stride << 1; + b += b_stride << 1; + y += 2; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 32: + do { + sse_w32_avx2(&sum, a, b); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 64: + do { + sse_w32_avx2(&sum, a, b); + sse_w32_avx2(&sum, a + 32, b + 32); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 128: + do { + sse_w32_avx2(&sum, a, b); + sse_w32_avx2(&sum, a + 32, b + 32); + sse_w32_avx2(&sum, a + 64, b + 64); + sse_w32_avx2(&sum, a + 96, b + 96); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + default: + if ((width & 0x07) == 0) { + do { + int i = 0; + do { + sse_w8x2_avx2(a + i, a_stride, b + i, b_stride, &sum); + i += 8; + } while (i < width); + a += a_stride << 1; + b += b_stride << 1; + y += 2; + } while (y < height); + } else { + do { + int i = 0; + do { + const uint8_t *a2; + const uint8_t *b2; + sse_w8x2_avx2(a + i, a_stride, b + i, b_stride, &sum); + a2 = a + i + (a_stride << 1); + b2 = b + i + (b_stride << 1); + sse_w8x2_avx2(a2, a_stride, b2, b_stride, &sum); + i += 8; + } while (i + 4 < width); + sse_w4x4_avx2(a + i, a_stride, b + i, b_stride, &sum); + a += a_stride << 2; + b += b_stride << 2; + y += 4; + } while (y < height); + } + sse = summary_all_avx2(&sum); + break; + } + + return sse; +} + +#if CONFIG_VP9_HIGHBITDEPTH +static INLINE void highbd_sse_w16_avx2(__m256i *sum, const uint16_t *a, + const uint16_t *b) { + const __m256i v_a_w = _mm256_loadu_si256((const __m256i *)a); + const __m256i v_b_w = _mm256_loadu_si256((const __m256i *)b); + const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); + *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w)); +} + +static INLINE void highbd_sse_w4x4_avx2(__m256i *sum, const uint16_t *a, + int a_stride, const uint16_t *b, + int b_stride) { + const __m128i v_a0 = _mm_loadl_epi64((const __m128i *)a); + const __m128i v_a1 = _mm_loadl_epi64((const __m128i *)(a + a_stride)); + const __m128i v_a2 = _mm_loadl_epi64((const __m128i *)(a + a_stride * 2)); + const __m128i v_a3 = _mm_loadl_epi64((const __m128i *)(a + a_stride * 3)); + const __m128i v_b0 = _mm_loadl_epi64((const __m128i *)b); + const __m128i v_b1 = _mm_loadl_epi64((const __m128i *)(b + b_stride)); + const __m128i v_b2 = _mm_loadl_epi64((const __m128i *)(b + b_stride * 2)); + const __m128i v_b3 = _mm_loadl_epi64((const __m128i *)(b + b_stride * 3)); + const __m128i v_a_hi = _mm_unpacklo_epi64(v_a0, v_a1); + const __m128i v_a_lo = _mm_unpacklo_epi64(v_a2, v_a3); + const __m256i v_a_w = + _mm256_insertf128_si256(_mm256_castsi128_si256(v_a_lo), v_a_hi, 1); + const __m128i v_b_hi = _mm_unpacklo_epi64(v_b0, v_b1); + const __m128i v_b_lo = _mm_unpacklo_epi64(v_b2, v_b3); + const __m256i v_b_w = + _mm256_insertf128_si256(_mm256_castsi128_si256(v_b_lo), v_b_hi, 1); + const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); + *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w)); +} + +static INLINE void highbd_sse_w8x2_avx2(__m256i *sum, const uint16_t *a, + int a_stride, const uint16_t *b, + int b_stride) { + const __m128i v_a_hi = _mm_loadu_si128((const __m128i *)(a + a_stride)); + const __m128i v_a_lo = _mm_loadu_si128((const __m128i *)a); + const __m256i v_a_w = + _mm256_insertf128_si256(_mm256_castsi128_si256(v_a_lo), v_a_hi, 1); + const __m128i v_b_hi = _mm_loadu_si128((const __m128i *)(b + b_stride)); + const __m128i v_b_lo = _mm_loadu_si128((const __m128i *)b); + const __m256i v_b_w = + _mm256_insertf128_si256(_mm256_castsi128_si256(v_b_lo), v_b_hi, 1); + const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); + *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w)); +} + +int64_t vpx_highbd_sse_avx2(const uint8_t *a8, int a_stride, const uint8_t *b8, + int b_stride, int width, int height) { + int32_t y = 0; + int64_t sse = 0; + uint16_t *a = CONVERT_TO_SHORTPTR(a8); + uint16_t *b = CONVERT_TO_SHORTPTR(b8); + __m256i sum = _mm256_setzero_si256(); + switch (width) { + case 4: + do { + highbd_sse_w4x4_avx2(&sum, a, a_stride, b, b_stride); + a += a_stride << 2; + b += b_stride << 2; + y += 4; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 8: + do { + highbd_sse_w8x2_avx2(&sum, a, a_stride, b, b_stride); + a += a_stride << 1; + b += b_stride << 1; + y += 2; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 16: + do { + highbd_sse_w16_avx2(&sum, a, b); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 32: + do { + int l = 0; + __m256i sum32 = _mm256_setzero_si256(); + do { + highbd_sse_w16_avx2(&sum32, a, b); + highbd_sse_w16_avx2(&sum32, a + 16, b + 16); + a += a_stride; + b += b_stride; + l += 1; + } while (l < 64 && l < (height - y)); + summary_32_avx2(&sum32, &sum); + y += 64; + } while (y < height); + sse = summary_4x64_avx2(sum); + break; + case 64: + do { + int l = 0; + __m256i sum32 = _mm256_setzero_si256(); + do { + highbd_sse_w16_avx2(&sum32, a, b); + highbd_sse_w16_avx2(&sum32, a + 16 * 1, b + 16 * 1); + highbd_sse_w16_avx2(&sum32, a + 16 * 2, b + 16 * 2); + highbd_sse_w16_avx2(&sum32, a + 16 * 3, b + 16 * 3); + a += a_stride; + b += b_stride; + l += 1; + } while (l < 32 && l < (height - y)); + summary_32_avx2(&sum32, &sum); + y += 32; + } while (y < height); + sse = summary_4x64_avx2(sum); + break; + case 128: + do { + int l = 0; + __m256i sum32 = _mm256_setzero_si256(); + do { + highbd_sse_w16_avx2(&sum32, a, b); + highbd_sse_w16_avx2(&sum32, a + 16 * 1, b + 16 * 1); + highbd_sse_w16_avx2(&sum32, a + 16 * 2, b + 16 * 2); + highbd_sse_w16_avx2(&sum32, a + 16 * 3, b + 16 * 3); + highbd_sse_w16_avx2(&sum32, a + 16 * 4, b + 16 * 4); + highbd_sse_w16_avx2(&sum32, a + 16 * 5, b + 16 * 5); + highbd_sse_w16_avx2(&sum32, a + 16 * 6, b + 16 * 6); + highbd_sse_w16_avx2(&sum32, a + 16 * 7, b + 16 * 7); + a += a_stride; + b += b_stride; + l += 1; + } while (l < 16 && l < (height - y)); + summary_32_avx2(&sum32, &sum); + y += 16; + } while (y < height); + sse = summary_4x64_avx2(sum); + break; + default: + if (width & 0x7) { + do { + int i = 0; + __m256i sum32 = _mm256_setzero_si256(); + do { + const uint16_t *a2; + const uint16_t *b2; + highbd_sse_w8x2_avx2(&sum32, a + i, a_stride, b + i, b_stride); + a2 = a + i + (a_stride << 1); + b2 = b + i + (b_stride << 1); + highbd_sse_w8x2_avx2(&sum32, a2, a_stride, b2, b_stride); + i += 8; + } while (i + 4 < width); + highbd_sse_w4x4_avx2(&sum32, a + i, a_stride, b + i, b_stride); + summary_32_avx2(&sum32, &sum); + a += a_stride << 2; + b += b_stride << 2; + y += 4; + } while (y < height); + } else { + do { + int l = 0; + __m256i sum32 = _mm256_setzero_si256(); + do { + int i = 0; + do { + highbd_sse_w8x2_avx2(&sum32, a + i, a_stride, b + i, b_stride); + i += 8; + } while (i < width); + a += a_stride << 1; + b += b_stride << 1; + l += 2; + } while (l < 8 && l < (height - y)); + summary_32_avx2(&sum32, &sum); + y += 8; + } while (y < height); + } + sse = summary_4x64_avx2(sum); + break; + } + return sse; +} +#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/x86/sse_sse4.c b/vpx_dsp/x86/sse_sse4.c new file mode 100644 index 0000000000..1c2744e2fa --- /dev/null +++ b/vpx_dsp/x86/sse_sse4.c @@ -0,0 +1,359 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" + +#include "vpx_ports/mem.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/x86/mem_sse2.h" + +static INLINE int64_t summary_all_sse4(const __m128i *sum_all) { + int64_t sum; + const __m128i sum0 = _mm_cvtepu32_epi64(*sum_all); + const __m128i sum1 = _mm_cvtepu32_epi64(_mm_srli_si128(*sum_all, 8)); + const __m128i sum_2x64 = _mm_add_epi64(sum0, sum1); + const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8)); + _mm_storel_epi64((__m128i *)&sum, sum_1x64); + return sum; +} + +#if CONFIG_VP9_HIGHBITDEPTH +static INLINE void summary_32_sse4(const __m128i *sum32, __m128i *sum64) { + const __m128i sum0 = _mm_cvtepu32_epi64(*sum32); + const __m128i sum1 = _mm_cvtepu32_epi64(_mm_srli_si128(*sum32, 8)); + *sum64 = _mm_add_epi64(sum0, *sum64); + *sum64 = _mm_add_epi64(sum1, *sum64); +} +#endif + +static INLINE void sse_w16_sse4_1(__m128i *sum, const uint8_t *a, + const uint8_t *b) { + const __m128i v_a0 = _mm_loadu_si128((const __m128i *)a); + const __m128i v_b0 = _mm_loadu_si128((const __m128i *)b); + const __m128i v_a00_w = _mm_cvtepu8_epi16(v_a0); + const __m128i v_a01_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_a0, 8)); + const __m128i v_b00_w = _mm_cvtepu8_epi16(v_b0); + const __m128i v_b01_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_b0, 8)); + const __m128i v_d00_w = _mm_sub_epi16(v_a00_w, v_b00_w); + const __m128i v_d01_w = _mm_sub_epi16(v_a01_w, v_b01_w); + *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d00_w, v_d00_w)); + *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d01_w, v_d01_w)); +} + +static INLINE void sse4x2_sse4_1(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, __m128i *sum) { + const __m128i v_a0 = load_unaligned_u32(a); + const __m128i v_a1 = load_unaligned_u32(a + a_stride); + const __m128i v_b0 = load_unaligned_u32(b); + const __m128i v_b1 = load_unaligned_u32(b + b_stride); + const __m128i v_a_w = _mm_cvtepu8_epi16(_mm_unpacklo_epi32(v_a0, v_a1)); + const __m128i v_b_w = _mm_cvtepu8_epi16(_mm_unpacklo_epi32(v_b0, v_b1)); + const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w); + *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w)); +} + +static INLINE void sse8_sse4_1(const uint8_t *a, const uint8_t *b, + __m128i *sum) { + const __m128i v_a0 = _mm_loadl_epi64((const __m128i *)a); + const __m128i v_b0 = _mm_loadl_epi64((const __m128i *)b); + const __m128i v_a_w = _mm_cvtepu8_epi16(v_a0); + const __m128i v_b_w = _mm_cvtepu8_epi16(v_b0); + const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w); + *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w)); +} + +int64_t vpx_sse_sse4_1(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height) { + int y = 0; + int64_t sse = 0; + __m128i sum = _mm_setzero_si128(); + switch (width) { + case 4: + do { + sse4x2_sse4_1(a, a_stride, b, b_stride, &sum); + a += a_stride << 1; + b += b_stride << 1; + y += 2; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 8: + do { + sse8_sse4_1(a, b, &sum); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 16: + do { + sse_w16_sse4_1(&sum, a, b); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 32: + do { + sse_w16_sse4_1(&sum, a, b); + sse_w16_sse4_1(&sum, a + 16, b + 16); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 64: + do { + sse_w16_sse4_1(&sum, a, b); + sse_w16_sse4_1(&sum, a + 16 * 1, b + 16 * 1); + sse_w16_sse4_1(&sum, a + 16 * 2, b + 16 * 2); + sse_w16_sse4_1(&sum, a + 16 * 3, b + 16 * 3); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 128: + do { + sse_w16_sse4_1(&sum, a, b); + sse_w16_sse4_1(&sum, a + 16 * 1, b + 16 * 1); + sse_w16_sse4_1(&sum, a + 16 * 2, b + 16 * 2); + sse_w16_sse4_1(&sum, a + 16 * 3, b + 16 * 3); + sse_w16_sse4_1(&sum, a + 16 * 4, b + 16 * 4); + sse_w16_sse4_1(&sum, a + 16 * 5, b + 16 * 5); + sse_w16_sse4_1(&sum, a + 16 * 6, b + 16 * 6); + sse_w16_sse4_1(&sum, a + 16 * 7, b + 16 * 7); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + default: + if (width & 0x07) { + do { + int i = 0; + do { + sse8_sse4_1(a + i, b + i, &sum); + sse8_sse4_1(a + i + a_stride, b + i + b_stride, &sum); + i += 8; + } while (i + 4 < width); + sse4x2_sse4_1(a + i, a_stride, b + i, b_stride, &sum); + a += (a_stride << 1); + b += (b_stride << 1); + y += 2; + } while (y < height); + } else { + do { + int i = 0; + do { + sse8_sse4_1(a + i, b + i, &sum); + i += 8; + } while (i < width); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + } + sse = summary_all_sse4(&sum); + break; + } + + return sse; +} + +#if CONFIG_VP9_HIGHBITDEPTH +static INLINE void highbd_sse_w4x2_sse4_1(__m128i *sum, const uint16_t *a, + int a_stride, const uint16_t *b, + int b_stride) { + const __m128i v_a0 = _mm_loadl_epi64((const __m128i *)a); + const __m128i v_a1 = _mm_loadl_epi64((const __m128i *)(a + a_stride)); + const __m128i v_b0 = _mm_loadl_epi64((const __m128i *)b); + const __m128i v_b1 = _mm_loadl_epi64((const __m128i *)(b + b_stride)); + const __m128i v_a_w = _mm_unpacklo_epi64(v_a0, v_a1); + const __m128i v_b_w = _mm_unpacklo_epi64(v_b0, v_b1); + const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w); + *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w)); +} + +static INLINE void highbd_sse_w8_sse4_1(__m128i *sum, const uint16_t *a, + const uint16_t *b) { + const __m128i v_a_w = _mm_loadu_si128((const __m128i *)a); + const __m128i v_b_w = _mm_loadu_si128((const __m128i *)b); + const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w); + *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w)); +} + +int64_t vpx_highbd_sse_sse4_1(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, int width, + int height) { + int32_t y = 0; + int64_t sse = 0; + uint16_t *a = CONVERT_TO_SHORTPTR(a8); + uint16_t *b = CONVERT_TO_SHORTPTR(b8); + __m128i sum = _mm_setzero_si128(); + switch (width) { + case 4: + do { + highbd_sse_w4x2_sse4_1(&sum, a, a_stride, b, b_stride); + a += a_stride << 1; + b += b_stride << 1; + y += 2; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 8: + do { + highbd_sse_w8_sse4_1(&sum, a, b); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 16: + do { + int l = 0; + __m128i sum32 = _mm_setzero_si128(); + do { + highbd_sse_w8_sse4_1(&sum32, a, b); + highbd_sse_w8_sse4_1(&sum32, a + 8, b + 8); + a += a_stride; + b += b_stride; + l += 1; + } while (l < 64 && l < (height - y)); + summary_32_sse4(&sum32, &sum); + y += 64; + } while (y < height); + _mm_storel_epi64((__m128i *)&sse, + _mm_add_epi64(sum, _mm_srli_si128(sum, 8))); + break; + case 32: + do { + int l = 0; + __m128i sum32 = _mm_setzero_si128(); + do { + highbd_sse_w8_sse4_1(&sum32, a, b); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 1, b + 8 * 1); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 2, b + 8 * 2); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 3, b + 8 * 3); + a += a_stride; + b += b_stride; + l += 1; + } while (l < 32 && l < (height - y)); + summary_32_sse4(&sum32, &sum); + y += 32; + } while (y < height); + _mm_storel_epi64((__m128i *)&sse, + _mm_add_epi64(sum, _mm_srli_si128(sum, 8))); + break; + case 64: + do { + int l = 0; + __m128i sum32 = _mm_setzero_si128(); + do { + highbd_sse_w8_sse4_1(&sum32, a, b); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 1, b + 8 * 1); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 2, b + 8 * 2); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 3, b + 8 * 3); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 4, b + 8 * 4); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 5, b + 8 * 5); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 6, b + 8 * 6); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 7, b + 8 * 7); + a += a_stride; + b += b_stride; + l += 1; + } while (l < 16 && l < (height - y)); + summary_32_sse4(&sum32, &sum); + y += 16; + } while (y < height); + _mm_storel_epi64((__m128i *)&sse, + _mm_add_epi64(sum, _mm_srli_si128(sum, 8))); + break; + case 128: + do { + int l = 0; + __m128i sum32 = _mm_setzero_si128(); + do { + highbd_sse_w8_sse4_1(&sum32, a, b); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 1, b + 8 * 1); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 2, b + 8 * 2); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 3, b + 8 * 3); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 4, b + 8 * 4); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 5, b + 8 * 5); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 6, b + 8 * 6); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 7, b + 8 * 7); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 8, b + 8 * 8); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 9, b + 8 * 9); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 10, b + 8 * 10); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 11, b + 8 * 11); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 12, b + 8 * 12); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 13, b + 8 * 13); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 14, b + 8 * 14); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 15, b + 8 * 15); + a += a_stride; + b += b_stride; + l += 1; + } while (l < 8 && l < (height - y)); + summary_32_sse4(&sum32, &sum); + y += 8; + } while (y < height); + _mm_storel_epi64((__m128i *)&sse, + _mm_add_epi64(sum, _mm_srli_si128(sum, 8))); + break; + default: + if (width & 0x7) { + do { + __m128i sum32 = _mm_setzero_si128(); + int i = 0; + do { + highbd_sse_w8_sse4_1(&sum32, a + i, b + i); + highbd_sse_w8_sse4_1(&sum32, a + i + a_stride, b + i + b_stride); + i += 8; + } while (i + 4 < width); + highbd_sse_w4x2_sse4_1(&sum32, a + i, a_stride, b + i, b_stride); + a += (a_stride << 1); + b += (b_stride << 1); + y += 2; + summary_32_sse4(&sum32, &sum); + } while (y < height); + } else { + do { + int l = 0; + __m128i sum32 = _mm_setzero_si128(); + do { + int i = 0; + do { + highbd_sse_w8_sse4_1(&sum32, a + i, b + i); + i += 8; + } while (i < width); + a += a_stride; + b += b_stride; + l += 1; + } while (l < 8 && l < (height - y)); + summary_32_sse4(&sum32, &sum); + y += 8; + } while (y < height); + } + _mm_storel_epi64((__m128i *)&sse, + _mm_add_epi64(sum, _mm_srli_si128(sum, 8))); + break; + } + return sse; +} +#endif // CONFIG_VP9_HIGHBITDEPTH From 56c78a68b0a881b7f039c0de662cc758ab40d08c Mon Sep 17 00:00:00 2001 From: Hirokazu Honda Date: Tue, 21 Nov 2023 17:03:53 +0900 Subject: [PATCH 0947/1000] ratectrl_rtc: Remove duplicated DropFrameReason enum class DropFrameReason is declared in two places. This moves it to the common place. Change-Id: I04c16db4a49135588edff7e1746dcf9172750bb9 --- vp8/vp8_ratectrl_rtc.h | 5 ----- vp9/ratectrl_rtc.h | 5 ----- vpx/internal/vpx_ratectrl_rtc.h | 5 +++++ 3 files changed, 5 insertions(+), 10 deletions(-) diff --git a/vp8/vp8_ratectrl_rtc.h b/vp8/vp8_ratectrl_rtc.h index 4c174b1315..59fb607526 100644 --- a/vp8/vp8_ratectrl_rtc.h +++ b/vp8/vp8_ratectrl_rtc.h @@ -33,11 +33,6 @@ struct VP8FrameParamsQpRTC { int temporal_layer_id; }; -enum class FrameDropDecision { - kOk, // Frame is encoded. - kDrop, // Frame is dropped. -}; - class VP8RateControlRTC { public: static std::unique_ptr Create( diff --git a/vp9/ratectrl_rtc.h b/vp9/ratectrl_rtc.h index 7f624a5fe3..85005c5474 100644 --- a/vp9/ratectrl_rtc.h +++ b/vp9/ratectrl_rtc.h @@ -64,11 +64,6 @@ struct VP9SegmentationData { size_t delta_q_size; }; -enum class FrameDropDecision { - kOk, // Frame is encoded. - kDrop, // Frame is dropped. -}; - // This interface allows using VP9 real-time rate control without initializing // the encoder. To use this interface, you need to link with libvpxrc.a. // diff --git a/vpx/internal/vpx_ratectrl_rtc.h b/vpx/internal/vpx_ratectrl_rtc.h index eb90cd1d0c..6ffd798eb2 100644 --- a/vpx/internal/vpx_ratectrl_rtc.h +++ b/vpx/internal/vpx_ratectrl_rtc.h @@ -17,6 +17,11 @@ namespace libvpx { enum class RcFrameType { kKeyFrame = 0, kInterFrame = 1 }; +enum class FrameDropDecision { + kOk, // Frame is encoded. + kDrop, // Frame is dropped. +}; + struct VpxRateControlRtcConfig { public: VpxRateControlRtcConfig() { From 741b8f6228984e888c99849d7675ea4132eaf268 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Tue, 21 Nov 2023 11:18:54 -0500 Subject: [PATCH 0948/1000] Check null ptr before use prev_mi is a pointer to pointer Bug: b/310401647 Bug: b/310590556 Change-Id: Ic3c39a7eec14693357bd2485a5451d4b7f031b5e --- test/encode_api_test.cc | 85 +++++++++++++++++++++++++++++++++++ vp9/encoder/vp9_encodeframe.c | 16 +++---- 2 files changed, 92 insertions(+), 9 deletions(-) diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc index 770052c859..d2469572a6 100644 --- a/test/encode_api_test.cc +++ b/test/encode_api_test.cc @@ -496,6 +496,91 @@ TEST(EncodeAPI, ConfigLargeTargetBitrateVp9) { } #endif // VPX_ARCH_X86_64 || VPX_ARCH_AARCH64 +vpx_image_t *CreateImage(const unsigned int width, const unsigned int height) { + vpx_image_t *image = + vpx_img_alloc(nullptr, VPX_IMG_FMT_I420, width, height, 1); + if (!image) return image; + + for (unsigned int i = 0; i < image->d_h; ++i) { + memset(image->planes[0] + i * image->stride[0], 128, image->d_w); + } + const unsigned int uv_h = (image->d_h + 1) / 2; + const unsigned int uv_w = (image->d_w + 1) / 2; + for (unsigned int i = 0; i < uv_h; ++i) { + memset(image->planes[1] + i * image->stride[1], 128, uv_w); + memset(image->planes[2] + i * image->stride[2], 128, uv_w); + } + + return image; +} + +// This is a test case from clusterfuzz. +TEST(EncodeAPI, PrevMiCheckNullptr) { + vpx_codec_iface_t *const iface = vpx_codec_vp9_cx(); + vpx_codec_enc_cfg_t cfg; + + struct Config { + unsigned int thread; + unsigned int width; + unsigned int height; + vpx_rc_mode end_usage; + unsigned long deadline; + }; + struct Config init_config = { 0, 1554, 644, VPX_VBR, 1 }; + unsigned long deadline = init_config.deadline; + ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, /*usage=*/0), + VPX_CODEC_OK); + cfg.g_threads = init_config.thread; + cfg.g_w = init_config.width; + cfg.g_h = init_config.height; + cfg.g_timebase.num = 1; + cfg.g_timebase.den = 1000 * 1000; // microseconds + cfg.g_pass = VPX_RC_ONE_PASS; + cfg.g_lag_in_frames = 0; + cfg.rc_end_usage = init_config.end_usage; + cfg.rc_min_quantizer = 2; + cfg.rc_max_quantizer = 58; + + vpx_codec_ctx_t enc; + ASSERT_EQ(vpx_codec_enc_init(&enc, iface, &cfg, 0), VPX_CODEC_OK); + ASSERT_EQ(vpx_codec_control(&enc, VP8E_SET_CPUUSED, 0), VPX_CODEC_OK); + + const vpx_codec_cx_pkt_t *pkt; + + int frame_index = 0; + // First step: encode, without forcing KF. + vpx_image_t *image = CreateImage(cfg.g_w, cfg.g_h); + ASSERT_NE(image, nullptr); + ASSERT_EQ(vpx_codec_encode(&enc, image, frame_index, 1, 0, deadline), + VPX_CODEC_OK); + frame_index++; + vpx_codec_iter_t iter = nullptr; + while ((pkt = vpx_codec_get_cx_data(&enc, &iter)) != nullptr) { + ASSERT_EQ(pkt->kind, VPX_CODEC_CX_FRAME_PKT); + } + vpx_img_free(image); + // Second step: change config + struct Config encode_config = { 0, 1131, 644, VPX_CBR, 1000000 }; + cfg.g_threads = encode_config.thread; + cfg.g_w = encode_config.width; + cfg.g_h = encode_config.height; + cfg.rc_end_usage = encode_config.end_usage; + deadline = encode_config.deadline; + ASSERT_EQ(vpx_codec_enc_config_set(&enc, &cfg), VPX_CODEC_OK) + << vpx_codec_error_detail(&enc); + // Third step: encode, without forcing KF + image = CreateImage(cfg.g_w, cfg.g_h); + ASSERT_NE(image, nullptr); + ASSERT_EQ(vpx_codec_encode(&enc, image, frame_index, 1, 0, deadline), + VPX_CODEC_OK); + frame_index++; + while ((pkt = vpx_codec_get_cx_data(&enc, &iter)) != nullptr) { + ASSERT_EQ(pkt->kind, VPX_CODEC_CX_FRAME_PKT); + } + vpx_img_free(image); + ASSERT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK); +} + class EncodeApiGetTplStatsTest : public ::libvpx_test::EncoderTest, public ::testing::TestWithParam { diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index 67869596b1..63306ac381 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -3052,14 +3052,12 @@ static void set_partition_range(VP9_COMMON *cm, MACROBLOCKD *xd, int mi_row, min_size = BLOCK_64X64; max_size = BLOCK_4X4; - if (prev_mi) { - for (idy = 0; idy < mi_height; ++idy) { - for (idx = 0; idx < mi_width; ++idx) { - mi = prev_mi[idy * cm->mi_stride + idx]; - bs = mi ? mi->sb_type : bsize; - min_size = VPXMIN(min_size, bs); - max_size = VPXMAX(max_size, bs); - } + for (idy = 0; idy < mi_height; ++idy) { + for (idx = 0; idx < mi_width; ++idx) { + mi = prev_mi[idy * cm->mi_stride + idx]; + bs = mi ? mi->sb_type : bsize; + min_size = VPXMIN(min_size, bs); + max_size = VPXMAX(max_size, bs); } } @@ -3205,7 +3203,7 @@ static int ml_pruning_partition(VP9_COMMON *const cm, MACROBLOCKD *const xd, left_par = 1; } - if (prev_mi) { + if (prev_mi[0]) { context_size = prev_mi[0]->sb_type; if (context_size < bsize) last_par = 2; From 79257fd4595299415236d57fc750dcbb2c276ccb Mon Sep 17 00:00:00 2001 From: Jingning Han Date: Wed, 22 Nov 2023 15:06:08 -0800 Subject: [PATCH 0949/1000] Conditionally skip using inter frames in speed features When the reference frame's scaling factor is not in the supported range, skip using it for motion compensation prediction in the partition speed features. BUG=b/312517065 Change-Id: Ie3687186521ad2616be258e80d3e5b16e5f2d5e9 --- vp9/encoder/vp9_encodeframe.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index 63306ac381..d0863b723b 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -1301,6 +1301,13 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, (frame_is_intra_only(cm) || (is_one_pass_svc(cpi) && cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame)); + + if (!is_key_frame) { + if (cm->frame_refs[LAST_FRAME - 1].sf.x_scale_fp == REF_INVALID_SCALE || + cm->frame_refs[LAST_FRAME - 1].sf.y_scale_fp == REF_INVALID_SCALE) + is_key_frame = 1; + } + // Always use 4x4 partition for key frame. const int use_4x4_partition = frame_is_intra_only(cm); const int low_res = (cm->width <= 352 && cm->height <= 288); From b562fdd4e6833110b0c749c8aaf690e5fc16f1cd Mon Sep 17 00:00:00 2001 From: Jingning Han Date: Wed, 22 Nov 2023 15:07:04 -0800 Subject: [PATCH 0950/1000] Remove invalid reference frames Remove the reference frames whose scaling factor is not in the supported range. BUG=b/312517065 Change-Id: Iaf8610ff7a95cd4a433bf529f741459d820d4f8b --- vp9/encoder/vp9_encodeframe.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index d0863b723b..b98fd84579 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -6144,6 +6144,15 @@ static void encode_frame_internal(VP9_COMP *cpi) { cpi->rd.r0 = (double)intra_cost_base / mc_dep_cost_base; } + for (MV_REFERENCE_FRAME ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; + ++ref_frame) { + if (cpi->ref_frame_flags & ref_frame_to_flag(ref_frame)) { + if (cm->frame_refs[ref_frame - 1].sf.x_scale_fp == REF_INVALID_SCALE || + cm->frame_refs[ref_frame - 1].sf.y_scale_fp == REF_INVALID_SCALE) + cpi->ref_frame_flags &= ~ref_frame_to_flag(ref_frame); + } + } + // Frame segmentation if (cpi->oxcf.aq_mode == PERCEPTUAL_AQ) build_kmeans_segmentation(cpi); From 635eba3319cb6e20c2a72902b6fb3dd2fc8e93ef Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Wed, 22 Nov 2023 15:38:04 -0800 Subject: [PATCH 0951/1000] Adding "const" to vpx_codec_iface_t is redundant vpx_codec_iface_t is defined as follows: typedef const struct vpx_codec_iface vpx_codec_iface_t; Since vpx_codec_iface_t is already a const struct, it is redundant to add "const" to vpx_codec_iface_t. Note: I think vpx_codec_iface_t should not have been defined as a const struct, but it is too late to change that now. Change-Id: Ifbd3f8a63c1d48e9169ff77fa0b505ea1e65519d --- test/decode_api_test.cc | 6 +++--- test/encode_api_test.cc | 8 ++++---- test/level_test.cc | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/test/decode_api_test.cc b/test/decode_api_test.cc index 9e82ace1b8..44e4397726 100644 --- a/test/decode_api_test.cc +++ b/test/decode_api_test.cc @@ -20,7 +20,7 @@ namespace { #define NELEMENTS(x) static_cast(sizeof(x) / sizeof(x[0])) TEST(DecodeAPI, InvalidParams) { - static const vpx_codec_iface_t *kCodecs[] = { + static vpx_codec_iface_t *kCodecs[] = { #if CONFIG_VP8_DECODER &vpx_codec_vp8_dx_algo, #endif @@ -120,7 +120,7 @@ void TestVp9Controls(vpx_codec_ctx_t *dec) { } TEST(DecodeAPI, Vp9InvalidDecode) { - const vpx_codec_iface_t *const codec = &vpx_codec_vp9_dx_algo; + vpx_codec_iface_t *const codec = &vpx_codec_vp9_dx_algo; const char filename[] = "invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.v2.ivf"; libvpx_test::IVFVideoSource video(filename); @@ -147,7 +147,7 @@ TEST(DecodeAPI, Vp9InvalidDecode) { void TestPeekInfo(const uint8_t *const data, uint32_t data_sz, uint32_t peek_size) { - const vpx_codec_iface_t *const codec = &vpx_codec_vp9_dx_algo; + vpx_codec_iface_t *const codec = &vpx_codec_vp9_dx_algo; // Verify behavior of vpx_codec_decode. vpx_codec_decode doesn't even get // to decoder_peek_si_internal on frames of size < 8. if (data_sz >= 8) { diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc index d2469572a6..cbbbb4ddce 100644 --- a/test/encode_api_test.cc +++ b/test/encode_api_test.cc @@ -28,7 +28,7 @@ namespace { -const vpx_codec_iface_t *kCodecIfaces[] = { +vpx_codec_iface_t *kCodecIfaces[] = { #if CONFIG_VP8_ENCODER &vpx_codec_vp8_cx_algo, #endif @@ -37,7 +37,7 @@ const vpx_codec_iface_t *kCodecIfaces[] = { #endif }; -bool IsVP9(const vpx_codec_iface_t *iface) { +bool IsVP9(vpx_codec_iface_t *iface) { static const char kVP9Name[] = "WebM Project VP9"; return strncmp(kVP9Name, vpx_codec_iface_name(iface), sizeof(kVP9Name) - 1) == 0; @@ -259,7 +259,7 @@ TEST(EncodeAPI, MultiResEncode) { TEST(EncodeAPI, SetRoi) { static struct { - const vpx_codec_iface_t *iface; + vpx_codec_iface_t *iface; int ctrl_id; } kCodecs[] = { #if CONFIG_VP8_ENCODER @@ -365,7 +365,7 @@ TEST(EncodeAPI, SetRoi) { } } -void InitCodec(const vpx_codec_iface_t &iface, int width, int height, +void InitCodec(vpx_codec_iface_t &iface, int width, int height, vpx_codec_ctx_t *enc, vpx_codec_enc_cfg_t *cfg) { cfg->g_w = width; cfg->g_h = height; diff --git a/test/level_test.cc b/test/level_test.cc index 3f1cf9f1c5..36cfd645c9 100644 --- a/test/level_test.cc +++ b/test/level_test.cc @@ -120,7 +120,7 @@ TEST_P(LevelTest, TestTargetLevel255) { TEST_P(LevelTest, TestTargetLevelApi) { ::libvpx_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0, 1); - static const vpx_codec_iface_t *codec = &vpx_codec_vp9_cx_algo; + static vpx_codec_iface_t *codec = &vpx_codec_vp9_cx_algo; vpx_codec_ctx_t enc; vpx_codec_enc_cfg_t cfg; EXPECT_EQ(VPX_CODEC_OK, vpx_codec_enc_config_default(codec, &cfg, 0)); From 3bd54a37d0f820970352326941224afb618af808 Mon Sep 17 00:00:00 2001 From: Jingning Han Date: Wed, 22 Nov 2023 15:38:27 -0800 Subject: [PATCH 0952/1000] Disable intra mode search speed features conditionally When all the inter reference frames are invalid, disable the speed features that bypass intra mode search. BUG=b/312517065 Change-Id: I246c953fad3be61b9d307da11c752a21a36b90ff --- vp9/encoder/vp9_rdopt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index fc06967105..974e43c90f 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -3606,7 +3606,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, mode_skip_mask[GOLDEN_FRAME] |= INTER_ALL; } - if (bsize > sf->max_intra_bsize) { + if (bsize > sf->max_intra_bsize && cpi->ref_frame_flags != 0) { ref_frame_skip_mask[0] |= (1 << INTRA_FRAME); ref_frame_skip_mask[1] |= (1 << INTRA_FRAME); } From 366425079ba685bcd78511297dacb327ec363abe Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Wed, 22 Nov 2023 14:49:56 -0800 Subject: [PATCH 0953/1000] Tests kf_max_dist in one-pass zero-lag encoding The test shows that the comment for kf_max_dist in vpx/vpx_encoder.h differs from its behavior by one. We should modify the comment to match the encoding behavior. Bug: webm:1829 Change-Id: Icdc58b8f6b25353f10ce8ecc481c862bd3fe86df --- test/keyframe_test.cc | 107 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 107 insertions(+) diff --git a/test/keyframe_test.cc b/test/keyframe_test.cc index dabf88e415..5292bb188d 100644 --- a/test/keyframe_test.cc +++ b/test/keyframe_test.cc @@ -8,12 +8,18 @@ * be found in the AUTHORS file in the root of the source tree. */ #include +#include #include #include "third_party/googletest/src/include/gtest/gtest.h" #include "test/codec_factory.h" #include "test/encode_test_driver.h" #include "test/i420_video_source.h" #include "test/util.h" +#include "./vpx_config.h" +#include "vpx/vp8cx.h" +#include "vpx/vpx_codec.h" +#include "vpx/vpx_encoder.h" +#include "vpx/vpx_image.h" namespace { @@ -146,4 +152,105 @@ TEST_P(KeyframeTest, TestAutoKeyframe) { } VP8_INSTANTIATE_TEST_SUITE(KeyframeTest, ALL_TEST_MODES); + +bool IsVP9(vpx_codec_iface_t *iface) { + static const char kVP9Name[] = "WebM Project VP9"; + return strncmp(kVP9Name, vpx_codec_iface_name(iface), sizeof(kVP9Name) - 1) == + 0; +} + +vpx_image_t *CreateGrayImage(vpx_img_fmt_t fmt, unsigned int w, + unsigned int h) { + vpx_image_t *const image = vpx_img_alloc(nullptr, fmt, w, h, 1); + if (!image) return image; + + for (unsigned int i = 0; i < image->d_h; ++i) { + memset(image->planes[0] + i * image->stride[0], 128, image->d_w); + } + const unsigned int uv_h = (image->d_h + 1) / 2; + const unsigned int uv_w = (image->d_w + 1) / 2; + for (unsigned int i = 0; i < uv_h; ++i) { + memset(image->planes[1] + i * image->stride[1], 128, uv_w); + memset(image->planes[2] + i * image->stride[2], 128, uv_w); + } + return image; +} + +// Tests kf_max_dist in one-pass encoding with zero lag. +void TestKeyframeMaximumInterval(vpx_codec_iface_t *iface, + unsigned long deadline, + unsigned int kf_max_dist) { + vpx_codec_enc_cfg_t cfg; + ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, /*usage=*/0), + VPX_CODEC_OK); + cfg.g_w = 320; + cfg.g_h = 240; + cfg.g_pass = VPX_RC_ONE_PASS; + cfg.g_lag_in_frames = 0; + cfg.kf_mode = VPX_KF_AUTO; + cfg.kf_min_dist = 0; + cfg.kf_max_dist = kf_max_dist; + + vpx_codec_ctx_t enc; + ASSERT_EQ(vpx_codec_enc_init(&enc, iface, &cfg, 0), VPX_CODEC_OK); + + const int speed = IsVP9(iface) ? 9 : -12; + ASSERT_EQ(vpx_codec_control(&enc, VP8E_SET_CPUUSED, speed), VPX_CODEC_OK); + + vpx_image_t *image = CreateGrayImage(VPX_IMG_FMT_I420, cfg.g_w, cfg.g_h); + ASSERT_NE(image, nullptr); + + // Encode frames. + const vpx_codec_cx_pkt_t *pkt; + const unsigned int num_frames = kf_max_dist == 0 ? 4 : 3 * kf_max_dist + 1; + for (unsigned int i = 0; i < num_frames; ++i) { + ASSERT_EQ(vpx_codec_encode(&enc, image, i, 1, 0, deadline), VPX_CODEC_OK); + vpx_codec_iter_t iter = nullptr; + while ((pkt = vpx_codec_get_cx_data(&enc, &iter)) != nullptr) { + ASSERT_EQ(pkt->kind, VPX_CODEC_CX_FRAME_PKT); + if (kf_max_dist == 0 || i % kf_max_dist == 0) { + ASSERT_EQ(pkt->data.frame.flags & VPX_FRAME_IS_KEY, VPX_FRAME_IS_KEY); + } else { + ASSERT_EQ(pkt->data.frame.flags & VPX_FRAME_IS_KEY, 0u); + } + } + } + + // Flush the encoder. + bool got_data; + do { + ASSERT_EQ(vpx_codec_encode(&enc, nullptr, 0, 1, 0, deadline), VPX_CODEC_OK); + got_data = false; + vpx_codec_iter_t iter = nullptr; + while ((pkt = vpx_codec_get_cx_data(&enc, &iter)) != nullptr) { + ASSERT_EQ(pkt->kind, VPX_CODEC_CX_FRAME_PKT); + got_data = true; + } + } while (got_data); + + vpx_img_free(image); + ASSERT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK); +} + +TEST(KeyframeIntervalTest, KeyframeMaximumInterval) { + std::vector ifaces; +#if CONFIG_VP8_ENCODER + ifaces.push_back(vpx_codec_vp8_cx()); +#endif +#if CONFIG_VP9_ENCODER + ifaces.push_back(vpx_codec_vp9_cx()); +#endif + for (vpx_codec_iface_t *iface : ifaces) { + for (unsigned long deadline : + { VPX_DL_REALTIME, VPX_DL_GOOD_QUALITY, VPX_DL_BEST_QUALITY }) { + // Test 0 and 1 (both mean all intra), some powers of 2, some multiples + // of 10, and some prime numbers. + for (unsigned int kf_max_dist : + { 0, 1, 2, 3, 4, 7, 10, 13, 16, 20, 23, 29, 32 }) { + TestKeyframeMaximumInterval(iface, deadline, kf_max_dist); + } + } + } +} + } // namespace From 9b729500d5a257d4694b9d50237ebf0e77b5fff9 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Thu, 23 Nov 2023 10:13:46 +0000 Subject: [PATCH 0954/1000] Delete redundant code in Neon SDOT/USDOT vertical convolutions Delete redundant transpose/permute code in the Neon dot-product vertical convolution paths. Variable values were assigned but never used before subsequent assignment. Change-Id: I15b29d0c993f56599e0d18ac1d5787e6385d2a3a --- vpx_dsp/arm/vpx_convolve8_neon_dotprod.c | 30 --------------------- vpx_dsp/arm/vpx_convolve8_neon_i8mm.c | 34 ------------------------ 2 files changed, 64 deletions(-) diff --git a/vpx_dsp/arm/vpx_convolve8_neon_dotprod.c b/vpx_dsp/arm/vpx_convolve8_neon_dotprod.c index bf01364cf7..75e9d03929 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon_dotprod.c +++ b/vpx_dsp/arm/vpx_convolve8_neon_dotprod.c @@ -397,9 +397,6 @@ void vpx_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); - s7 = vdup_n_s8(0); - s8 = vdup_n_s8(0); - s9 = vdup_n_s8(0); /* This operation combines a conventional transpose and the sample permute * (see horizontal case) required before computing the dot product. @@ -408,9 +405,6 @@ void vpx_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl); transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl); transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl); - transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl); - transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl); - transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl); do { uint8x8_t t7, t8, t9, t10; @@ -478,9 +472,6 @@ void vpx_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); - s7 = vdup_n_s8(0); - s8 = vdup_n_s8(0); - s9 = vdup_n_s8(0); /* This operation combines a conventional transpose and the sample permute * (see horizontal case) required before computing the dot product. @@ -493,12 +484,6 @@ void vpx_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, tran_concat_tbl); transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi, tran_concat_tbl); - transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi, - tran_concat_tbl); - transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi, - tran_concat_tbl); - transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi, - tran_concat_tbl); do { uint8x8_t t7, t8, t9, t10; @@ -601,9 +586,6 @@ void vpx_convolve8_avg_vert_neon_dotprod(const uint8_t *src, s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); - s7 = vdup_n_s8(0); - s8 = vdup_n_s8(0); - s9 = vdup_n_s8(0); /* This operation combines a conventional transpose and the sample permute * (see horizontal case) required before computing the dot product. @@ -612,9 +594,6 @@ void vpx_convolve8_avg_vert_neon_dotprod(const uint8_t *src, transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl); transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl); transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl); - transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl); - transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl); - transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl); do { uint8x8_t t7, t8, t9, t10; @@ -688,9 +667,6 @@ void vpx_convolve8_avg_vert_neon_dotprod(const uint8_t *src, s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); - s7 = vdup_n_s8(0); - s8 = vdup_n_s8(0); - s9 = vdup_n_s8(0); /* This operation combines a conventional transpose and the sample permute * (see horizontal case) required before computing the dot product. @@ -703,12 +679,6 @@ void vpx_convolve8_avg_vert_neon_dotprod(const uint8_t *src, tran_concat_tbl); transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi, tran_concat_tbl); - transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi, - tran_concat_tbl); - transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi, - tran_concat_tbl); - transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi, - tran_concat_tbl); do { uint8x8_t t7, t8, t9, t10; diff --git a/vpx_dsp/arm/vpx_convolve8_neon_i8mm.c b/vpx_dsp/arm/vpx_convolve8_neon_i8mm.c index e0e482e3f5..2ea587e622 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon_i8mm.c +++ b/vpx_dsp/arm/vpx_convolve8_neon_i8mm.c @@ -375,10 +375,6 @@ void vpx_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); src += 7 * src_stride; - s7 = vdup_n_u8(0); - s8 = vdup_n_u8(0); - s9 = vdup_n_u8(0); - /* This operation combines a conventional transpose and the sample permute * (see horizontal case) required before computing the dot product. */ @@ -386,9 +382,6 @@ void vpx_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl); transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl); transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl); - transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl); - transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl); - transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl); do { load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10); @@ -441,10 +434,6 @@ void vpx_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); s += 7 * src_stride; - s7 = vdup_n_u8(0); - s8 = vdup_n_u8(0); - s9 = vdup_n_u8(0); - /* This operation combines a conventional transpose and the sample permute * (see horizontal case) required before computing the dot product. */ @@ -456,12 +445,6 @@ void vpx_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, tran_concat_tbl); transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi, tran_concat_tbl); - transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi, - tran_concat_tbl); - transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi, - tran_concat_tbl); - transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi, - tran_concat_tbl); do { load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10); @@ -544,10 +527,6 @@ void vpx_convolve8_avg_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); src += 7 * src_stride; - s7 = vdup_n_u8(0); - s8 = vdup_n_u8(0); - s9 = vdup_n_u8(0); - /* This operation combines a conventional transpose and the sample permute * (see horizontal case) required before computing the dot product. */ @@ -555,9 +534,6 @@ void vpx_convolve8_avg_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl); transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl); transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl); - transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl); - transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl); - transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl); do { load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10); @@ -616,10 +592,6 @@ void vpx_convolve8_avg_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); s += 7 * src_stride; - s7 = vdup_n_u8(0); - s8 = vdup_n_u8(0); - s9 = vdup_n_u8(0); - /* This operation combines a conventional transpose and the sample permute * (see horizontal case) required before computing the dot product. */ @@ -631,12 +603,6 @@ void vpx_convolve8_avg_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, tran_concat_tbl); transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi, tran_concat_tbl); - transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi, - tran_concat_tbl); - transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi, - tran_concat_tbl); - transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi, - tran_concat_tbl); do { load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10); From 1b3ec0676cfbb79cc1ceb17add1af543ad8d4f4c Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Thu, 23 Nov 2023 15:11:11 +0000 Subject: [PATCH 0955/1000] Make reporting of filter sizes more granular vpx_get_filter_taps() currently reports either 8-tap or 2-tap. However, many 8-tap filters are actually 0-padded, resulting in a lot of redundant work (multiplying by, and adding, 0) when processing using an 8-tap convolution function. In preparation for adding 2- and 4-tap SIMD implementations for the convolution paths, make the filter size reporting more granular, stripping any 0 padding. Filter sizes can now be reported as 2-, 4-, 6- or 8-tap. Change-Id: I100133aac7173134af34b918c9ad3007d98d6060 --- vpx_dsp/vpx_filter.h | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/vpx_dsp/vpx_filter.h b/vpx_dsp/vpx_filter.h index 54357ee6ca..0cddcb6991 100644 --- a/vpx_dsp/vpx_filter.h +++ b/vpx_dsp/vpx_filter.h @@ -29,10 +29,16 @@ typedef int16_t InterpKernel[SUBPEL_TAPS]; static INLINE int vpx_get_filter_taps(const int16_t *const filter) { assert(filter[3] != 128); - if (!filter[0] && !filter[1] && !filter[2]) - return 2; - else + if (filter[0] | filter[7]) { return 8; + } + if (filter[1] | filter[6]) { + return 6; + } + if (filter[2] | filter[5]) { + return 4; + } + return 2; } #ifdef __cplusplus From bdc9e1c9d4e8ac536e5e8e9a6c905c05fad8a03a Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Wed, 22 Nov 2023 17:38:32 +0000 Subject: [PATCH 0956/1000] Specialise Armv8.4 Neon vert convolution for 4-tap filters Add an Armv8.4 SDOT Neon implementation of vertical convolution specialised for executing with 4-tap filters (the most common filter size for settings --good --cpu-used=1.) This new path is also used when executing with bilinear (2-tap) filters. Change-Id: I3eb00b5a34f5676b68bda60a2a29be56e3d7d0cd --- vpx_dsp/arm/vpx_convolve8_neon.h | 27 +++ vpx_dsp/arm/vpx_convolve8_neon_dotprod.c | 221 ++++++++++++++++++++--- vpx_dsp/arm/vpx_convolve_neon_dotprod.c | 26 ++- 3 files changed, 240 insertions(+), 34 deletions(-) diff --git a/vpx_dsp/arm/vpx_convolve8_neon.h b/vpx_dsp/arm/vpx_convolve8_neon.h index 025e943cc4..52df15a121 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon.h +++ b/vpx_dsp/arm/vpx_convolve8_neon.h @@ -26,6 +26,33 @@ void vpx_convolve8_2d_horiz_neon_dotprod(const uint8_t *src, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +static INLINE int16x4_t convolve4_4_sdot_partial(const int8x16_t samples, + const int32x4_t correction, + const int8x8_t filters) { + /* Accumulate dot product into 'correction' to account for range clamp. */ + int32x4_t sum = vdotq_lane_s32(correction, samples, filters, 0); + + /* Further narrowing and packing is performed by the caller. */ + return vmovn_s32(sum); +} + +static INLINE uint8x8_t convolve4_8_sdot_partial(const int8x16_t samples_lo, + const int8x16_t samples_hi, + const int32x4_t correction, + const int8x8_t filters) { + /* Sample range-clamping and permutation are performed by the caller. */ + /* Accumulate dot product into 'correction' to account for range clamp. */ + /* First 4 output values. */ + int32x4_t sum0 = vdotq_lane_s32(correction, samples_lo, filters, 0); + /* Second 4 output values. */ + int32x4_t sum1 = vdotq_lane_s32(correction, samples_hi, filters, 0); + + /* Narrow and re-pack. */ + int16x8_t sum = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1)); + /* We halved the filter values so -1 from right shift. */ + return vqrshrun_n_s16(sum, FILTER_BITS - 1); +} + static INLINE int16x4_t convolve8_4_sdot_partial(const int8x16_t samples_lo, const int8x16_t samples_hi, const int32x4_t correction, diff --git a/vpx_dsp/arm/vpx_convolve8_neon_dotprod.c b/vpx_dsp/arm/vpx_convolve8_neon_dotprod.c index 75e9d03929..cdb410bd58 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon_dotprod.c +++ b/vpx_dsp/arm/vpx_convolve8_neon_dotprod.c @@ -356,29 +356,166 @@ static INLINE void transpose_concat_8x4(int8x8_t a0, int8x8_t a1, int8x8_t a2, *b1 = vqtbl2q_s8(samples, permute_tbl.val[1]); } -void vpx_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *filter, int x0_q4, - int x_step_q4, int y0_q4, int y_step_q4, - int w, int h) { - const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4])); - const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[y0_q4]), 128); - const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp)); - const uint8x8_t range_limit = vdup_n_u8(128); +static INLINE void vpx_convolve_4tap_vert_neon_dotprod( + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, int w, int h, const int8x8_t filter, + const int32x4_t correction, const uint8x8_t range_limit) { const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); uint8x8_t t0, t1, t2, t3, t4, t5, t6; int8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; int8x16x2_t samples_LUT; - assert((intptr_t)dst % 4 == 0); - assert(dst_stride % 4 == 0); - assert(y_step_q4 == 16); + if (w == 4) { + const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); + int8x16_t s0123, s1234, s2345, s3456, s78910; + int16x4_t d0, d1, d2, d3; + uint8x8_t d01, d23; - (void)x0_q4; - (void)x_step_q4; - (void)y_step_q4; + load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); + src += 7 * src_stride; - src -= 3 * src_stride; + /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ + s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); + s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit)); + s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit)); + s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit)); + s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); + s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); + s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); + + /* This operation combines a conventional transpose and the sample permute + * (see horizontal case) required before computing the dot product. + */ + transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl); + transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl); + transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl); + transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl); + + do { + uint8x8_t t7, t8, t9, t10; + load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10); + + s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit)); + s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit)); + s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); + s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); + + transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl); + + d0 = convolve4_4_sdot_partial(s0123, correction, filter); + d1 = convolve4_4_sdot_partial(s1234, correction, filter); + d2 = convolve4_4_sdot_partial(s2345, correction, filter); + d3 = convolve4_4_sdot_partial(s3456, correction, filter); + /* We halved the filter values so -1 from right shift. */ + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); + d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + /* Merge new data into block from previous iteration. */ + samples_LUT.val[0] = s3456; + samples_LUT.val[1] = s78910; + s0123 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); + s1234 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); + s2345 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); + s3456 = s78910; + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl); + int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, + s3456_lo, s3456_hi, s78910_lo, s78910_hi; + uint8x8_t d0, d1, d2, d3; + const uint8_t *s; + uint8_t *d; + int height; + + do { + height = h; + s = src; + d = dst; + + load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); + s += 7 * src_stride; + + /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ + s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); + s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit)); + s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit)); + s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit)); + s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); + s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); + s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); + + /* This operation combines a conventional transpose and the sample permute + * (see horizontal case) required before computing the dot product. + */ + transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi, + tran_concat_tbl); + transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi, + tran_concat_tbl); + transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi, + tran_concat_tbl); + transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi, + tran_concat_tbl); + + do { + uint8x8_t t7, t8, t9, t10; + load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10); + + s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit)); + s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit)); + s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); + s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); + + transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi, + tran_concat_tbl); + + d0 = convolve4_8_sdot_partial(s0123_lo, s0123_hi, correction, filter); + d1 = convolve4_8_sdot_partial(s1234_lo, s1234_hi, correction, filter); + d2 = convolve4_8_sdot_partial(s2345_lo, s2345_hi, correction, filter); + d3 = convolve4_8_sdot_partial(s3456_lo, s3456_hi, correction, filter); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + /* Merge new data into block from previous iteration. */ + samples_LUT.val[0] = s3456_lo; + samples_LUT.val[1] = s78910_lo; + s0123_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); + s1234_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); + s2345_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); + s3456_lo = s78910_lo; + + samples_LUT.val[0] = s3456_hi; + samples_LUT.val[1] = s78910_hi; + s0123_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); + s1234_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); + s2345_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); + s3456_hi = s78910_hi; + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src += 8; + dst += 8; + w -= 8; + } while (w != 0); + } +} + +static INLINE void vpx_convolve_8tap_vert_neon_dotprod( + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, int w, int h, const int8x8_t filter, + const int32x4_t correction, const uint8x8_t range_limit) { + const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); + uint8x8_t t0, t1, t2, t3, t4, t5, t6; + int8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + int8x16x2_t samples_LUT; if (w == 4) { const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); @@ -425,10 +562,10 @@ void vpx_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); - d0 = convolve8_4_sdot_partial(s0123, s4567, correction, filters); - d1 = convolve8_4_sdot_partial(s1234, s5678, correction, filters); - d2 = convolve8_4_sdot_partial(s2345, s6789, correction, filters); - d3 = convolve8_4_sdot_partial(s3456, s78910, correction, filters); + d0 = convolve8_4_sdot_partial(s0123, s4567, correction, filter); + d1 = convolve8_4_sdot_partial(s1234, s5678, correction, filter); + d2 = convolve8_4_sdot_partial(s2345, s6789, correction, filter); + d3 = convolve8_4_sdot_partial(s3456, s78910, correction, filter); d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); @@ -512,13 +649,13 @@ void vpx_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); d0 = convolve8_8_sdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi, - correction, filters); + correction, filter); d1 = convolve8_8_sdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi, - correction, filters); + correction, filter); d2 = convolve8_8_sdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi, - correction, filters); + correction, filter); d3 = convolve8_8_sdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi, - correction, filters); + correction, filter); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); @@ -544,6 +681,42 @@ void vpx_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, } } +void vpx_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + const int8x8_t y_filter_8tap = vmovn_s16(vld1q_s16(filter[y0_q4])); + const int32x4_t correction_8tap = + vdupq_n_s32(vaddlvq_s16(vshll_n_s8(y_filter_8tap, FILTER_BITS))); + const uint8x8_t range_limit = vdup_n_u8(128); + + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + assert(y_step_q4 == 16); + + (void)x0_q4; + (void)x_step_q4; + (void)y_step_q4; + + if (vpx_get_filter_taps(filter[y0_q4]) <= 4) { + /* All 4-tap and bilinear filter values are even, so halve them to reduce + * intermediate precision requirements. Also slide the filter values so the + * the 4 taps exist in the first 4 elements of the vector. + */ + const int8x8_t y_filter_4tap = + vext_s8(vshr_n_s8(y_filter_8tap, 1), vdup_n_s8(0), 2); + const int32x4_t correction_4tap = vshrq_n_s32(correction_8tap, 1); + vpx_convolve_4tap_vert_neon_dotprod(src - src_stride, src_stride, dst, + dst_stride, w, h, y_filter_4tap, + correction_4tap, range_limit); + } else { + vpx_convolve_8tap_vert_neon_dotprod(src - 3 * src_stride, src_stride, dst, + dst_stride, w, h, y_filter_8tap, + correction_8tap, range_limit); + } +} + void vpx_convolve8_avg_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, diff --git a/vpx_dsp/arm/vpx_convolve_neon_dotprod.c b/vpx_dsp/arm/vpx_convolve_neon_dotprod.c index 400e26b30a..9d754fde17 100644 --- a/vpx_dsp/arm/vpx_convolve_neon_dotprod.c +++ b/vpx_dsp/arm/vpx_convolve_neon_dotprod.c @@ -13,6 +13,7 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/arm/vpx_convolve8_neon.h" #include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/vpx_filter.h" #include "vpx_ports/mem.h" void vpx_convolve8_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, @@ -20,24 +21,26 @@ void vpx_convolve8_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { - /* Given our constraints: w <= 64, h <= 64, taps == 8 we can reduce the + /* Given our constraints: w <= 64, h <= 64, taps <= 8 we can reduce the * maximum buffer size to 64 * (64 + 7). */ uint8_t temp[64 * 71]; - /* Account for the vertical phase needing 3 lines prior and 4 lines post. */ - const int intermediate_height = h + 7; + const int vert_filter_taps = vpx_get_filter_taps(filter[y0_q4]) <= 4 ? 4 : 8; + /* Account for the vertical phase needing vert_filter_taps / 2 - 1 lines prior + * and vert_filter_taps / 2 lines post. */ + const int intermediate_height = h + vert_filter_taps - 1; + const ptrdiff_t border_offset = vert_filter_taps / 2 - 1; assert(y_step_q4 == 16); assert(x_step_q4 == 16); - /* Filter starting 3 lines back. */ - vpx_convolve8_2d_horiz_neon_dotprod(src - src_stride * 3, src_stride, temp, w, - filter, x0_q4, x_step_q4, y0_q4, - y_step_q4, w, intermediate_height); + vpx_convolve8_2d_horiz_neon_dotprod( + src - src_stride * border_offset, src_stride, temp, w, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, intermediate_height); - /* Step into the temp buffer 3 lines to get the actual frame data. */ - vpx_convolve8_vert_neon_dotprod(temp + w * 3, w, dst, dst_stride, filter, - x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); + vpx_convolve8_vert_neon_dotprod(temp + w * border_offset, w, dst, dst_stride, + filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, + h); } void vpx_convolve8_avg_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, @@ -46,6 +49,9 @@ void vpx_convolve8_avg_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { uint8_t temp[64 * 71]; + + /* Averaging convolution always uses an 8-tap filter. */ + /* Account for the vertical phase needing 3 lines prior and 4 lines post. */ const int intermediate_height = h + 7; assert(y_step_q4 == 16); From 2f8e94715df1beddcd02df35f55d043d4fdabafc Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Thu, 23 Nov 2023 17:23:13 +0000 Subject: [PATCH 0957/1000] Specialise Armv8.6 Neon vert convolution for 4-tap filters Add an Armv8.6 USDOT Neon implementation of vertical convolution specialised for executing with 4-tap filters (the most common filter size for settings --good --cpu-used=1.) This new path is also used when executing with bilinear (2-tap) filters. Change-Id: Ic893b25541e3317c5d5c270c338f868f080aed7c --- vpx_dsp/arm/vpx_convolve8_neon.h | 23 ++++ vpx_dsp/arm/vpx_convolve8_neon_i8mm.c | 179 +++++++++++++++++++++++--- vpx_dsp/arm/vpx_convolve_neon_i8mm.c | 26 ++-- 3 files changed, 197 insertions(+), 31 deletions(-) diff --git a/vpx_dsp/arm/vpx_convolve8_neon.h b/vpx_dsp/arm/vpx_convolve8_neon.h index 52df15a121..4f4fe4c81c 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon.h +++ b/vpx_dsp/arm/vpx_convolve8_neon.h @@ -159,6 +159,29 @@ void vpx_convolve8_2d_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +static INLINE int16x4_t convolve4_4_usdot_partial(const uint8x16_t samples, + const int8x8_t filters) { + int32x4_t sum = vusdotq_lane_s32(vdupq_n_s32(0), samples, filters, 0); + + /* Further narrowing and packing is performed by the caller. */ + return vmovn_s32(sum); +} + +static INLINE uint8x8_t convolve4_8_usdot_partial(const uint8x16_t samples_lo, + const uint8x16_t samples_hi, + const int8x8_t filters) { + /* Sample permutation is performed by the caller. */ + /* First 4 output values. */ + int32x4_t sum0 = vusdotq_lane_s32(vdupq_n_s32(0), samples_lo, filters, 0); + /* Second 4 output values. */ + int32x4_t sum1 = vusdotq_lane_s32(vdupq_n_s32(0), samples_hi, filters, 0); + + /* Narrow and re-pack. */ + int16x8_t sum = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1)); + /* We halved the filter values so -1 from right shift. */ + return vqrshrun_n_s16(sum, FILTER_BITS - 1); +} + static INLINE int16x4_t convolve8_4_usdot_partial(const uint8x16_t samples_lo, const uint8x16_t samples_hi, const int8x8_t filters) { diff --git a/vpx_dsp/arm/vpx_convolve8_neon_i8mm.c b/vpx_dsp/arm/vpx_convolve8_neon_i8mm.c index 2ea587e622..7c394a937b 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon_i8mm.c +++ b/vpx_dsp/arm/vpx_convolve8_neon_i8mm.c @@ -346,25 +346,132 @@ static INLINE void transpose_concat_8x4(uint8x8_t a0, uint8x8_t a1, *b1 = vqtbl2q_u8(samples, permute_tbl.val[1]); } -void vpx_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *filter, int x0_q4, - int x_step_q4, int y0_q4, int y_step_q4, - int w, int h) { - const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4])); +static INLINE void vpx_convolve_4tap_vert_neon_i8mm( + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) { const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; uint8x16x2_t samples_LUT; - assert((intptr_t)dst % 4 == 0); - assert(dst_stride % 4 == 0); - assert(y_step_q4 == 16); + if (w == 4) { + const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); + uint8x16_t s0123, s1234, s2345, s3456, s78910; + int16x4_t d0, d1, d2, d3; + uint8x8_t d01, d23; - (void)x0_q4; - (void)x_step_q4; - (void)y_step_q4; + load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + src += 7 * src_stride; - src -= 3 * src_stride; + /* This operation combines a conventional transpose and the sample permute + * (see horizontal case) required before computing the dot product. + */ + transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl); + transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl); + transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl); + transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl); + + do { + load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10); + + transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl); + + d0 = convolve4_4_usdot_partial(s0123, filter); + d1 = convolve4_4_usdot_partial(s1234, filter); + d2 = convolve4_4_usdot_partial(s2345, filter); + d3 = convolve4_4_usdot_partial(s3456, filter); + /* We halved the filter values so -1 from right shift. */ + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); + d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + /* Merge new data into block from previous iteration. */ + samples_LUT.val[0] = s3456; + samples_LUT.val[1] = s78910; + s0123 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); + s1234 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); + s2345 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + s3456 = s78910; + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl); + uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, + s3456_lo, s3456_hi, s78910_lo, s78910_hi; + uint8x8_t d0, d1, d2, d3; + const uint8_t *s; + uint8_t *d; + int height; + + do { + height = h; + s = src; + d = dst; + + load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; + + /* This operation combines a conventional transpose and the sample permute + * (see horizontal case) required before computing the dot product. + */ + transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi, + tran_concat_tbl); + transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi, + tran_concat_tbl); + transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi, + tran_concat_tbl); + transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi, + tran_concat_tbl); + + do { + load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10); + + transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi, + tran_concat_tbl); + + d0 = convolve4_8_usdot_partial(s0123_lo, s0123_hi, filter); + d1 = convolve4_8_usdot_partial(s1234_lo, s1234_hi, filter); + d2 = convolve4_8_usdot_partial(s2345_lo, s2345_hi, filter); + d3 = convolve4_8_usdot_partial(s3456_lo, s3456_hi, filter); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + /* Merge new data into block from previous iteration. */ + samples_LUT.val[0] = s3456_lo; + samples_LUT.val[1] = s78910_lo; + s0123_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); + s1234_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); + s2345_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + s3456_lo = s78910_lo; + + samples_LUT.val[0] = s3456_hi; + samples_LUT.val[1] = s78910_hi; + s0123_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); + s1234_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); + s2345_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + s3456_hi = s78910_hi; + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src += 8; + dst += 8; + w -= 8; + } while (w != 0); + } +} + +static INLINE void vpx_convolve_8tap_vert_neon_i8mm( + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) { + const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); + uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + uint8x16x2_t samples_LUT; if (w == 4) { const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); @@ -395,10 +502,10 @@ void vpx_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); - d0 = convolve8_4_usdot_partial(s0123, s4567, filters); - d1 = convolve8_4_usdot_partial(s1234, s5678, filters); - d2 = convolve8_4_usdot_partial(s2345, s6789, filters); - d3 = convolve8_4_usdot_partial(s3456, s78910, filters); + d0 = convolve8_4_usdot_partial(s0123, s4567, filter); + d1 = convolve8_4_usdot_partial(s1234, s5678, filter); + d2 = convolve8_4_usdot_partial(s2345, s6789, filter); + d3 = convolve8_4_usdot_partial(s3456, s78910, filter); d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); @@ -466,13 +573,13 @@ void vpx_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, s6789_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); d0 = convolve8_8_usdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi, - filters); + filter); d1 = convolve8_8_usdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi, - filters); + filter); d2 = convolve8_8_usdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi, - filters); + filter); d3 = convolve8_8_usdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi, - filters); + filter); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); @@ -498,6 +605,36 @@ void vpx_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, } } +void vpx_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + const int8x8_t y_filter_8tap = vmovn_s16(vld1q_s16(filter[y0_q4])); + + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + assert(y_step_q4 == 16); + + (void)x0_q4; + (void)x_step_q4; + (void)y_step_q4; + + if (vpx_get_filter_taps(filter[y0_q4]) <= 4) { + /* All 4-tap and bilinear filter values are even, so halve them to reduce + * intermediate precision requirements. Also slide the filter values so the + * the 4 taps exist in the first 4 elements of the vector. + */ + const int8x8_t y_filter_4tap = + vext_s8(vshr_n_s8(y_filter_8tap, 1), vdup_n_s8(0), 2); + vpx_convolve_4tap_vert_neon_i8mm(src - src_stride, src_stride, dst, + dst_stride, w, h, y_filter_4tap); + } else { + vpx_convolve_8tap_vert_neon_i8mm(src - 3 * src_stride, src_stride, dst, + dst_stride, w, h, y_filter_8tap); + } +} + void vpx_convolve8_avg_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, diff --git a/vpx_dsp/arm/vpx_convolve_neon_i8mm.c b/vpx_dsp/arm/vpx_convolve_neon_i8mm.c index 4d94bb79b7..d7cbb09ea6 100644 --- a/vpx_dsp/arm/vpx_convolve_neon_i8mm.c +++ b/vpx_dsp/arm/vpx_convolve_neon_i8mm.c @@ -13,6 +13,7 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/arm/vpx_convolve8_neon.h" #include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/vpx_filter.h" #include "vpx_ports/mem.h" void vpx_convolve8_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, @@ -20,24 +21,26 @@ void vpx_convolve8_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { - /* Given our constraints: w <= 64, h <= 64, taps == 8 we can reduce the + /* Given our constraints: w <= 64, h <= 64, taps <= 8 we can reduce the * maximum buffer size to 64 * (64 + 7). */ uint8_t temp[64 * 71]; - /* Account for the vertical phase needing 3 lines prior and 4 lines post. */ - const int intermediate_height = h + 7; + const int vert_filter_taps = vpx_get_filter_taps(filter[y0_q4]) <= 4 ? 4 : 8; + /* Account for the vertical phase needing vert_filter_taps / 2 - 1 lines prior + * and vert_filter_taps / 2 lines post. */ + const int intermediate_height = h + vert_filter_taps - 1; + const ptrdiff_t border_offset = vert_filter_taps / 2 - 1; assert(y_step_q4 == 16); assert(x_step_q4 == 16); - /* Filter starting 3 lines back. */ - vpx_convolve8_2d_horiz_neon_i8mm(src - src_stride * 3, src_stride, temp, w, - filter, x0_q4, x_step_q4, y0_q4, y_step_q4, - w, intermediate_height); + vpx_convolve8_2d_horiz_neon_i8mm(src - src_stride * border_offset, src_stride, + temp, w, filter, x0_q4, x_step_q4, y0_q4, + y_step_q4, w, intermediate_height); - /* Step into the temp buffer 3 lines to get the actual frame data. */ - vpx_convolve8_vert_neon_i8mm(temp + w * 3, w, dst, dst_stride, filter, x0_q4, - x_step_q4, y0_q4, y_step_q4, w, h); + vpx_convolve8_vert_neon_i8mm(temp + w * border_offset, w, dst, dst_stride, + filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, + h); } void vpx_convolve8_avg_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, @@ -46,6 +49,9 @@ void vpx_convolve8_avg_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { uint8_t temp[64 * 71]; + + /* Averaging convolution always uses an 8-tap filter. */ + /* Account for the vertical phase needing 3 lines prior and 4 lines post. */ const int intermediate_height = h + 7; assert(y_step_q4 == 16); From 68ef57f997e8505dbfceb1bc146373dfd035087a Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Tue, 21 Nov 2023 17:39:28 +0000 Subject: [PATCH 0958/1000] Specialise Armv8.4 Neon horiz convolution for 4-tap filters Add an Armv8.4 SDOT Neon implementation of horizontal convolution specialised for executing with 4-tap filters (the most common filter size for settings --good --cpu-used=1.) This new path is also used when executing with bilinear (2-tap) filters. Change-Id: Ib396681b3f7b8b0eeba94381fbe33a06cf7b4a13 --- vpx_dsp/arm/vpx_convolve8_neon.h | 48 +++++++++ vpx_dsp/arm/vpx_convolve8_neon_dotprod.c | 131 ++++++++++++++++++----- 2 files changed, 155 insertions(+), 24 deletions(-) diff --git a/vpx_dsp/arm/vpx_convolve8_neon.h b/vpx_dsp/arm/vpx_convolve8_neon.h index 4f4fe4c81c..ca9f816bc4 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon.h +++ b/vpx_dsp/arm/vpx_convolve8_neon.h @@ -36,6 +36,26 @@ static INLINE int16x4_t convolve4_4_sdot_partial(const int8x16_t samples, return vmovn_s32(sum); } +static INLINE int16x4_t convolve4_4_sdot(const uint8x16_t samples, + const int8x8_t filters, + const int32x4_t correction, + const uint8x16_t range_limit, + const uint8x16_t permute_tbl) { + /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ + int8x16_t clamped_samples = + vreinterpretq_s8_u8(vsubq_u8(samples, range_limit)); + + /* Permute samples ready for dot product. */ + /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */ + int8x16_t permuted_samples = vqtbl1q_s8(clamped_samples, permute_tbl); + + /* Accumulate dot product into 'correction' to account for range clamp. */ + int32x4_t sum = vdotq_lane_s32(correction, permuted_samples, filters, 0); + + /* Further narrowing and packing is performed by the caller. */ + return vmovn_s32(sum); +} + static INLINE uint8x8_t convolve4_8_sdot_partial(const int8x16_t samples_lo, const int8x16_t samples_hi, const int32x4_t correction, @@ -53,6 +73,34 @@ static INLINE uint8x8_t convolve4_8_sdot_partial(const int8x16_t samples_lo, return vqrshrun_n_s16(sum, FILTER_BITS - 1); } +static INLINE uint8x8_t convolve4_8_sdot(const uint8x16_t samples, + const int8x8_t filters, + const int32x4_t correction, + const uint8x16_t range_limit, + const uint8x16x2_t permute_tbl) { + int8x16_t clamped_samples, permuted_samples[2]; + + /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ + clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit)); + + /* Permute samples ready for dot product. */ + /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */ + permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]); + /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */ + permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]); + + /* Accumulate dot product into 'correction' to account for range clamp. */ + /* First 4 output values. */ + int32x4_t sum0 = vdotq_lane_s32(correction, permuted_samples[0], filters, 0); + /* Second 4 output values. */ + int32x4_t sum1 = vdotq_lane_s32(correction, permuted_samples[1], filters, 0); + + /* Narrow and re-pack. */ + int16x8_t sum = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1)); + /* We halved the filter values so -1 from right shift. */ + return vqrshrun_n_s16(sum, FILTER_BITS - 1); +} + static INLINE int16x4_t convolve8_4_sdot_partial(const int8x16_t samples_lo, const int8x16_t samples_hi, const int32x4_t correction, diff --git a/vpx_dsp/arm/vpx_convolve8_neon_dotprod.c b/vpx_dsp/arm/vpx_convolve8_neon_dotprod.c index cdb410bd58..6970bb1cd2 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon_dotprod.c +++ b/vpx_dsp/arm/vpx_convolve8_neon_dotprod.c @@ -149,26 +149,72 @@ void vpx_convolve8_2d_horiz_neon_dotprod(const uint8_t *src, } } -void vpx_convolve8_horiz_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *filter, int x0_q4, - int x_step_q4, int y0_q4, int y_step_q4, - int w, int h) { - const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4])); - const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[x0_q4]), 128); - const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp)); - const uint8x16_t range_limit = vdupq_n_u8(128); +static INLINE void vpx_convolve_4tap_horiz_neon_dotprod( + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, int w, int h, const int8x8_t filter, + const int32x4_t correction, const uint8x16_t range_limit) { uint8x16_t s0, s1, s2, s3; - assert((intptr_t)dst % 4 == 0); - assert(dst_stride % 4 == 0); - assert(x_step_q4 == 16); + if (w == 4) { + const uint8x16_t perm_tbl = vld1q_u8(dot_prod_permute_tbl); + do { + int16x4_t t0, t1, t2, t3; + uint8x8_t d01, d23; - (void)x_step_q4; - (void)y0_q4; - (void)y_step_q4; + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); - src -= 3; + t0 = convolve4_4_sdot(s0, filter, correction, range_limit, perm_tbl); + t1 = convolve4_4_sdot(s1, filter, correction, range_limit, perm_tbl); + t2 = convolve4_4_sdot(s2, filter, correction, range_limit, perm_tbl); + t3 = convolve4_4_sdot(s3, filter, correction, range_limit, perm_tbl); + /* We halved the filter values so -1 from right shift. */ + d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1); + d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + const uint8_t *s; + uint8_t *d; + int width; + uint8x8_t d0, d1, d2, d3; + + do { + width = w; + s = src; + d = dst; + do { + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + d0 = convolve4_8_sdot(s0, filter, correction, range_limit, perm_tbl); + d1 = convolve4_8_sdot(s1, filter, correction, range_limit, perm_tbl); + d2 = convolve4_8_sdot(s2, filter, correction, range_limit, perm_tbl); + d3 = convolve4_8_sdot(s3, filter, correction, range_limit, perm_tbl); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } +} + +static INLINE void vpx_convolve_8tap_horiz_neon_dotprod( + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, int w, int h, const int8x8_t filter, + const int32x4_t correction, const uint8x16_t range_limit) { + uint8x16_t s0, s1, s2, s3; if (w == 4) { const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); @@ -178,10 +224,10 @@ void vpx_convolve8_horiz_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); - t0 = convolve8_4_sdot(s0, filters, correction, range_limit, perm_tbl); - t1 = convolve8_4_sdot(s1, filters, correction, range_limit, perm_tbl); - t2 = convolve8_4_sdot(s2, filters, correction, range_limit, perm_tbl); - t3 = convolve8_4_sdot(s3, filters, correction, range_limit, perm_tbl); + t0 = convolve8_4_sdot(s0, filter, correction, range_limit, perm_tbl); + t1 = convolve8_4_sdot(s1, filter, correction, range_limit, perm_tbl); + t2 = convolve8_4_sdot(s2, filter, correction, range_limit, perm_tbl); + t3 = convolve8_4_sdot(s3, filter, correction, range_limit, perm_tbl); d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS); d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS); @@ -206,10 +252,10 @@ void vpx_convolve8_horiz_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, do { load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); - d0 = convolve8_8_sdot(s0, filters, correction, range_limit, perm_tbl); - d1 = convolve8_8_sdot(s1, filters, correction, range_limit, perm_tbl); - d2 = convolve8_8_sdot(s2, filters, correction, range_limit, perm_tbl); - d3 = convolve8_8_sdot(s3, filters, correction, range_limit, perm_tbl); + d0 = convolve8_8_sdot(s0, filter, correction, range_limit, perm_tbl); + d1 = convolve8_8_sdot(s1, filter, correction, range_limit, perm_tbl); + d2 = convolve8_8_sdot(s2, filter, correction, range_limit, perm_tbl); + d3 = convolve8_8_sdot(s3, filter, correction, range_limit, perm_tbl); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); @@ -224,6 +270,43 @@ void vpx_convolve8_horiz_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, } } +void vpx_convolve8_horiz_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4])); + const int32x4_t correction_8tap = + vdupq_n_s32(vaddlvq_s16(vshll_n_s8(x_filter_8tap, FILTER_BITS))); + const uint8x16_t range_limit = vdupq_n_u8(128); + + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + assert(x_step_q4 == 16); + + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + + if (vpx_get_filter_taps(filter[x0_q4]) <= 4) { + /* All 4-tap and bilinear filter values are even, so halve them to reduce + * intermediate precision requirements. Also slide the filter values so the + * the 4 taps exist in the first 4 elements of the vector. + */ + const int8x8_t x_filter_4tap = + vext_s8(vshr_n_s8(x_filter_8tap, 1), vdup_n_s8(0), 2); + const int32x4_t correction_4tap = vshrq_n_s32(correction_8tap, 1); + vpx_convolve_4tap_horiz_neon_dotprod(src - 1, src_stride, dst, dst_stride, + w, h, x_filter_4tap, correction_4tap, + range_limit); + + } else { + vpx_convolve_8tap_horiz_neon_dotprod(src - 3, src_stride, dst, dst_stride, + w, h, x_filter_8tap, correction_8tap, + range_limit); + } +} + void vpx_convolve8_avg_horiz_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, From 9cdb68891943a4cba72dcd0c2bf776a609cf7cb6 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Wed, 22 Nov 2023 11:33:07 +0000 Subject: [PATCH 0959/1000] Specialise Armv8.4 Neon 2D horiz convolution for 4-tap filters Add an Armv8.4 SDOT Neon path for the horizontal portion of 2D convolution, specialised for executing with 4-tap filters (the most common filter size for settings --good --cpu-used=1.) This new path is also used when executing with bilinear (2-tap) filters. Change-Id: I5116d10ddb371ac2cf302ef905d06f2140dc7600 --- vpx_dsp/arm/vpx_convolve8_neon_dotprod.c | 178 +++++++++++++++++++---- 1 file changed, 146 insertions(+), 32 deletions(-) diff --git a/vpx_dsp/arm/vpx_convolve8_neon_dotprod.c b/vpx_dsp/arm/vpx_convolve8_neon_dotprod.c index 6970bb1cd2..00bac3b9cf 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon_dotprod.c +++ b/vpx_dsp/arm/vpx_convolve8_neon_dotprod.c @@ -40,28 +40,104 @@ DECLARE_ALIGNED(16, static const uint8_t, dot_prod_merge_block_tbl[48]) = { 3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30 }; -void vpx_convolve8_2d_horiz_neon_dotprod(const uint8_t *src, - ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, - const InterpKernel *filter, int x0_q4, - int x_step_q4, int y0_q4, - int y_step_q4, int w, int h) { - const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4])); - const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[x0_q4]), 128); - const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp)); - const uint8x16_t range_limit = vdupq_n_u8(128); +static INLINE void vpx_convolve_4tap_2d_horiz_neon_dotprod( + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, int w, int h, const int8x8_t filter, + const int32x4_t correction, const uint8x16_t range_limit) { uint8x16_t s0, s1, s2, s3; - assert((intptr_t)dst % 4 == 0); - assert(dst_stride % 4 == 0); - assert(x_step_q4 == 16); - assert(h % 4 == 3); + if (w == 4) { + const uint8x16_t perm_tbl = vld1q_u8(dot_prod_permute_tbl); + int16x4_t d0, d1, d2, d3; + uint8x8_t d01, d23; - (void)x_step_q4; - (void)y0_q4; - (void)y_step_q4; + do { + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); - src -= 3; + d0 = convolve4_4_sdot(s0, filter, correction, range_limit, perm_tbl); + d1 = convolve4_4_sdot(s1, filter, correction, range_limit, perm_tbl); + d2 = convolve4_4_sdot(s2, filter, correction, range_limit, perm_tbl); + d3 = convolve4_4_sdot(s3, filter, correction, range_limit, perm_tbl); + /* We halved the filter values so -1 from right shift. */ + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); + d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 3); + + /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for + * further details on possible values of block height. */ + load_u8_16x3(src, src_stride, &s0, &s1, &s2); + + d0 = convolve4_4_sdot(s0, filter, correction, range_limit, perm_tbl); + d1 = convolve4_4_sdot(s1, filter, correction, range_limit, perm_tbl); + d2 = convolve4_4_sdot(s2, filter, correction, range_limit, perm_tbl); + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); + d23 = vqrshrun_n_s16(vcombine_s16(d2, vdup_n_s16(0)), FILTER_BITS - 1); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8_4x1(dst + 2 * dst_stride, d23); + } else { + const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + const uint8_t *s; + uint8_t *d; + int width; + uint8x8_t d0, d1, d2, d3; + + do { + width = w; + s = src; + d = dst; + do { + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + d0 = convolve4_8_sdot(s0, filter, correction, range_limit, perm_tbl); + d1 = convolve4_8_sdot(s1, filter, correction, range_limit, perm_tbl); + d2 = convolve4_8_sdot(s2, filter, correction, range_limit, perm_tbl); + d3 = convolve4_8_sdot(s3, filter, correction, range_limit, perm_tbl); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 3); + + /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for + * further details on possible values of block height. */ + width = w; + s = src; + d = dst; + do { + load_u8_16x3(s, src_stride, &s0, &s1, &s2); + + d0 = convolve4_8_sdot(s0, filter, correction, range_limit, perm_tbl); + d1 = convolve4_8_sdot(s1, filter, correction, range_limit, perm_tbl); + d2 = convolve4_8_sdot(s2, filter, correction, range_limit, perm_tbl); + + store_u8_8x3(d, dst_stride, d0, d1, d2); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + } +} + +static INLINE void vpx_convolve_8tap_2d_horiz_neon_dotprod( + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, int w, int h, const int8x8_t filter, + const int32x4_t correction, const uint8x16_t range_limit) { + uint8x16_t s0, s1, s2, s3; if (w == 4) { const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); @@ -71,10 +147,10 @@ void vpx_convolve8_2d_horiz_neon_dotprod(const uint8_t *src, do { load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); - d0 = convolve8_4_sdot(s0, filters, correction, range_limit, perm_tbl); - d1 = convolve8_4_sdot(s1, filters, correction, range_limit, perm_tbl); - d2 = convolve8_4_sdot(s2, filters, correction, range_limit, perm_tbl); - d3 = convolve8_4_sdot(s3, filters, correction, range_limit, perm_tbl); + d0 = convolve8_4_sdot(s0, filter, correction, range_limit, perm_tbl); + d1 = convolve8_4_sdot(s1, filter, correction, range_limit, perm_tbl); + d2 = convolve8_4_sdot(s2, filter, correction, range_limit, perm_tbl); + d3 = convolve8_4_sdot(s3, filter, correction, range_limit, perm_tbl); d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); @@ -90,9 +166,9 @@ void vpx_convolve8_2d_horiz_neon_dotprod(const uint8_t *src, * further details on possible values of block height. */ load_u8_16x3(src, src_stride, &s0, &s1, &s2); - d0 = convolve8_4_sdot(s0, filters, correction, range_limit, perm_tbl); - d1 = convolve8_4_sdot(s1, filters, correction, range_limit, perm_tbl); - d2 = convolve8_4_sdot(s2, filters, correction, range_limit, perm_tbl); + d0 = convolve8_4_sdot(s0, filter, correction, range_limit, perm_tbl); + d1 = convolve8_4_sdot(s1, filter, correction, range_limit, perm_tbl); + d2 = convolve8_4_sdot(s2, filter, correction, range_limit, perm_tbl); d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); d23 = vqrshrun_n_s16(vcombine_s16(d2, vdup_n_s16(0)), FILTER_BITS); @@ -112,10 +188,10 @@ void vpx_convolve8_2d_horiz_neon_dotprod(const uint8_t *src, do { load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); - d0 = convolve8_8_sdot(s0, filters, correction, range_limit, perm_tbl); - d1 = convolve8_8_sdot(s1, filters, correction, range_limit, perm_tbl); - d2 = convolve8_8_sdot(s2, filters, correction, range_limit, perm_tbl); - d3 = convolve8_8_sdot(s3, filters, correction, range_limit, perm_tbl); + d0 = convolve8_8_sdot(s0, filter, correction, range_limit, perm_tbl); + d1 = convolve8_8_sdot(s1, filter, correction, range_limit, perm_tbl); + d2 = convolve8_8_sdot(s2, filter, correction, range_limit, perm_tbl); + d3 = convolve8_8_sdot(s3, filter, correction, range_limit, perm_tbl); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); @@ -136,9 +212,9 @@ void vpx_convolve8_2d_horiz_neon_dotprod(const uint8_t *src, do { load_u8_16x3(s, src_stride, &s0, &s1, &s2); - d0 = convolve8_8_sdot(s0, filters, correction, range_limit, perm_tbl); - d1 = convolve8_8_sdot(s1, filters, correction, range_limit, perm_tbl); - d2 = convolve8_8_sdot(s2, filters, correction, range_limit, perm_tbl); + d0 = convolve8_8_sdot(s0, filter, correction, range_limit, perm_tbl); + d1 = convolve8_8_sdot(s1, filter, correction, range_limit, perm_tbl); + d2 = convolve8_8_sdot(s2, filter, correction, range_limit, perm_tbl); store_u8_8x3(d, dst_stride, d0, d1, d2); @@ -149,6 +225,44 @@ void vpx_convolve8_2d_horiz_neon_dotprod(const uint8_t *src, } } +void vpx_convolve8_2d_horiz_neon_dotprod(const uint8_t *src, + ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, + int y_step_q4, int w, int h) { + const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4])); + const int32x4_t correction_8tap = + vdupq_n_s32(vaddlvq_s16(vshll_n_s8(x_filter_8tap, FILTER_BITS))); + const uint8x16_t range_limit = vdupq_n_u8(128); + + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + assert(x_step_q4 == 16); + + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + + if (vpx_get_filter_taps(filter[x0_q4]) <= 4) { + /* All 4-tap and bilinear filter values are even, so halve them to reduce + * intermediate precision requirements. Also slide the filter values so the + * the 4 taps exist in the first 4 elements of the vector. + */ + const int8x8_t x_filter_4tap = + vext_s8(vshr_n_s8(x_filter_8tap, 1), vdup_n_s8(0), 2); + const int32x4_t correction_4tap = vshrq_n_s32(correction_8tap, 1); + vpx_convolve_4tap_2d_horiz_neon_dotprod(src - 1, src_stride, dst, + dst_stride, w, h, x_filter_4tap, + correction_4tap, range_limit); + + } else { + vpx_convolve_8tap_2d_horiz_neon_dotprod(src - 3, src_stride, dst, + dst_stride, w, h, x_filter_8tap, + correction_8tap, range_limit); + } +} + static INLINE void vpx_convolve_4tap_horiz_neon_dotprod( const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, int w, int h, const int8x8_t filter, From 0dc67ecf54e11c7fa68dad6e79916b0d24c02f3a Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Mon, 27 Nov 2023 14:47:15 +0000 Subject: [PATCH 0960/1000] Specialise Armv8.6 Neon horiz convolution for 4-tap filters Add an Armv8.6 USDOT Neon implementation of horizontal convolution specialised for executing with 4-tap filters (the most common filter size for settings --good --cpu-used=1.) This new path is also used when executing with bilinear (2-tap) filters. Change-Id: I8f7633d9852ebfe8feb9b4a055715f849cccf297 --- vpx_dsp/arm/vpx_convolve8_neon.h | 38 ++++++++ vpx_dsp/arm/vpx_convolve8_neon_i8mm.c | 120 +++++++++++++++++++++----- 2 files changed, 137 insertions(+), 21 deletions(-) diff --git a/vpx_dsp/arm/vpx_convolve8_neon.h b/vpx_dsp/arm/vpx_convolve8_neon.h index ca9f816bc4..031b9eb852 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon.h +++ b/vpx_dsp/arm/vpx_convolve8_neon.h @@ -215,6 +215,20 @@ static INLINE int16x4_t convolve4_4_usdot_partial(const uint8x16_t samples, return vmovn_s32(sum); } +static INLINE int16x4_t convolve4_4_usdot(const uint8x16_t samples, + const int8x8_t filters, + const uint8x16_t permute_tbl) { + /* Permute samples ready for dot product. */ + /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */ + uint8x16_t permuted_samples = vqtbl1q_u8(samples, permute_tbl); + + int32x4_t sum = + vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples, filters, 0); + + /* Further narrowing and packing is performed by the caller. */ + return vmovn_s32(sum); +} + static INLINE uint8x8_t convolve4_8_usdot_partial(const uint8x16_t samples_lo, const uint8x16_t samples_hi, const int8x8_t filters) { @@ -230,6 +244,30 @@ static INLINE uint8x8_t convolve4_8_usdot_partial(const uint8x16_t samples_lo, return vqrshrun_n_s16(sum, FILTER_BITS - 1); } +static INLINE uint8x8_t convolve4_8_usdot(const uint8x16_t samples, + const int8x8_t filters, + const uint8x16x2_t permute_tbl) { + uint8x16_t permuted_samples[2]; + + /* Permute samples ready for dot product. */ + /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */ + permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]); + /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */ + permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]); + + /* First 4 output values. */ + int32x4_t sum0 = + vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0); + /* Second 4 output values. */ + int32x4_t sum1 = + vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[1], filters, 0); + + /* Narrow and re-pack. */ + int16x8_t sum = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1)); + /* We halved the filter values so -1 from right shift. */ + return vqrshrun_n_s16(sum, FILTER_BITS - 1); +} + static INLINE int16x4_t convolve8_4_usdot_partial(const uint8x16_t samples_lo, const uint8x16_t samples_hi, const int8x8_t filters) { diff --git a/vpx_dsp/arm/vpx_convolve8_neon_i8mm.c b/vpx_dsp/arm/vpx_convolve8_neon_i8mm.c index 7c394a937b..dd60fa5169 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon_i8mm.c +++ b/vpx_dsp/arm/vpx_convolve8_neon_i8mm.c @@ -145,23 +145,70 @@ void vpx_convolve8_2d_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, } } -void vpx_convolve8_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *filter, int x0_q4, - int x_step_q4, int y0_q4, int y_step_q4, - int w, int h) { - const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4])); +static INLINE void vpx_convolve_4tap_horiz_neon_i8mm( + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) { uint8x16_t s0, s1, s2, s3; - assert((intptr_t)dst % 4 == 0); - assert(dst_stride % 4 == 0); - assert(x_step_q4 == 16); + if (w == 4) { + const uint8x16_t perm_tbl = vld1q_u8(dot_prod_permute_tbl); + do { + int16x4_t t0, t1, t2, t3; + uint8x8_t d01, d23; - (void)x_step_q4; - (void)y0_q4; - (void)y_step_q4; + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); - src -= 3; + t0 = convolve4_4_usdot(s0, filter, perm_tbl); + t1 = convolve4_4_usdot(s1, filter, perm_tbl); + t2 = convolve4_4_usdot(s2, filter, perm_tbl); + t3 = convolve4_4_usdot(s3, filter, perm_tbl); + /* We halved the filter values so -1 from right shift. */ + d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1); + d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + const uint8_t *s; + uint8_t *d; + int width; + uint8x8_t d0, d1, d2, d3; + + do { + width = w; + s = src; + d = dst; + do { + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + d0 = convolve4_8_usdot(s0, filter, perm_tbl); + d1 = convolve4_8_usdot(s1, filter, perm_tbl); + d2 = convolve4_8_usdot(s2, filter, perm_tbl); + d3 = convolve4_8_usdot(s3, filter, perm_tbl); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } +} + +static INLINE void vpx_convolve_8tap_horiz_neon_i8mm( + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) { + uint8x16_t s0, s1, s2, s3; if (w == 4) { const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); @@ -171,10 +218,10 @@ void vpx_convolve8_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); - t0 = convolve8_4_usdot(s0, filters, perm_tbl); - t1 = convolve8_4_usdot(s1, filters, perm_tbl); - t2 = convolve8_4_usdot(s2, filters, perm_tbl); - t3 = convolve8_4_usdot(s3, filters, perm_tbl); + t0 = convolve8_4_usdot(s0, filter, perm_tbl); + t1 = convolve8_4_usdot(s1, filter, perm_tbl); + t2 = convolve8_4_usdot(s2, filter, perm_tbl); + t3 = convolve8_4_usdot(s3, filter, perm_tbl); d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS); d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS); @@ -199,10 +246,10 @@ void vpx_convolve8_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, do { load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); - d0 = convolve8_8_usdot(s0, filters, perm_tbl); - d1 = convolve8_8_usdot(s1, filters, perm_tbl); - d2 = convolve8_8_usdot(s2, filters, perm_tbl); - d3 = convolve8_8_usdot(s3, filters, perm_tbl); + d0 = convolve8_8_usdot(s0, filter, perm_tbl); + d1 = convolve8_8_usdot(s1, filter, perm_tbl); + d2 = convolve8_8_usdot(s2, filter, perm_tbl); + d3 = convolve8_8_usdot(s3, filter, perm_tbl); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); @@ -217,6 +264,37 @@ void vpx_convolve8_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, } } +void vpx_convolve8_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4])); + + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + assert(x_step_q4 == 16); + + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + + if (vpx_get_filter_taps(filter[x0_q4]) <= 4) { + /* All 4-tap and bilinear filter values are even, so halve them to reduce + * intermediate precision requirements. Also slide the filter values so the + * the 4 taps exist in the first 4 elements of the vector. + */ + const int8x8_t x_filter_4tap = + vext_s8(vshr_n_s8(x_filter_8tap, 1), vdup_n_s8(0), 2); + vpx_convolve_4tap_horiz_neon_i8mm(src - 1, src_stride, dst, dst_stride, w, + h, x_filter_4tap); + + } else { + vpx_convolve_8tap_horiz_neon_i8mm(src - 3, src_stride, dst, dst_stride, w, + h, x_filter_8tap); + } +} + void vpx_convolve8_avg_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, From cc89450a4821dda0aef22ea36c18298b84de93f4 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Mon, 27 Nov 2023 14:55:55 +0000 Subject: [PATCH 0961/1000] Specialise Armv8.6 Neon 2D horiz convolution for 4-tap filters Add an Armv8.6 USDOT Neon path for the horizontal portion of 2D convolution, specialised for executing with 4-tap filters (the most common filter size for settings --good --cpu-used=1.) This new path is also used when executing with bilinear (2-tap) filters. Change-Id: I455e5a94bdcea1358025bd8e4d4c8c62e373aa5d --- vpx_dsp/arm/vpx_convolve8_neon_i8mm.c | 166 +++++++++++++++++++++----- 1 file changed, 138 insertions(+), 28 deletions(-) diff --git a/vpx_dsp/arm/vpx_convolve8_neon_i8mm.c b/vpx_dsp/arm/vpx_convolve8_neon_i8mm.c index dd60fa5169..bcad1dd121 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon_i8mm.c +++ b/vpx_dsp/arm/vpx_convolve8_neon_i8mm.c @@ -40,24 +40,103 @@ DECLARE_ALIGNED(16, static const uint8_t, dot_prod_merge_block_tbl[48]) = { 3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30 }; -void vpx_convolve8_2d_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *filter, int x0_q4, - int x_step_q4, int y0_q4, int y_step_q4, - int w, int h) { - const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4])); +static INLINE void vpx_convolve_4tap_2d_horiz_neon_i8mm( + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) { uint8x16_t s0, s1, s2, s3; - assert((intptr_t)dst % 4 == 0); - assert(dst_stride % 4 == 0); - assert(x_step_q4 == 16); - assert(h % 4 == 3); + if (w == 4) { + const uint8x16_t perm_tbl = vld1q_u8(dot_prod_permute_tbl); + int16x4_t d0, d1, d2, d3; + uint8x8_t d01, d23; - (void)x_step_q4; - (void)y0_q4; - (void)y_step_q4; + do { + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); - src -= 3; + d0 = convolve4_4_usdot(s0, filter, perm_tbl); + d1 = convolve4_4_usdot(s1, filter, perm_tbl); + d2 = convolve4_4_usdot(s2, filter, perm_tbl); + d3 = convolve4_4_usdot(s3, filter, perm_tbl); + /* We halved the filter values so -1 from right shift. */ + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); + d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 3); + + /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for + * further details on possible values of block height. */ + load_u8_16x3(src, src_stride, &s0, &s1, &s2); + + d0 = convolve4_4_usdot(s0, filter, perm_tbl); + d1 = convolve4_4_usdot(s1, filter, perm_tbl); + d2 = convolve4_4_usdot(s2, filter, perm_tbl); + /* We halved the filter values so -1 from right shift. */ + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); + d23 = vqrshrun_n_s16(vcombine_s16(d2, vdup_n_s16(0)), FILTER_BITS - 1); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8_4x1(dst + 2 * dst_stride, d23); + } else { + const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + const uint8_t *s; + uint8_t *d; + int width; + uint8x8_t d0, d1, d2, d3; + + do { + width = w; + s = src; + d = dst; + do { + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + d0 = convolve4_8_usdot(s0, filter, perm_tbl); + d1 = convolve4_8_usdot(s1, filter, perm_tbl); + d2 = convolve4_8_usdot(s2, filter, perm_tbl); + d3 = convolve4_8_usdot(s3, filter, perm_tbl); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width > 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 3); + + /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for + * further details on possible values of block height. */ + width = w; + s = src; + d = dst; + do { + load_u8_16x3(s, src_stride, &s0, &s1, &s2); + + d0 = convolve4_8_usdot(s0, filter, perm_tbl); + d1 = convolve4_8_usdot(s1, filter, perm_tbl); + d2 = convolve4_8_usdot(s2, filter, perm_tbl); + + store_u8_8x3(d, dst_stride, d0, d1, d2); + + s += 8; + d += 8; + width -= 8; + } while (width > 0); + } +} + +static INLINE void vpx_convolve_8tap_2d_horiz_neon_i8mm( + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) { + uint8x16_t s0, s1, s2, s3; if (w == 4) { const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); @@ -67,10 +146,10 @@ void vpx_convolve8_2d_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, do { load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); - d0 = convolve8_4_usdot(s0, filters, perm_tbl); - d1 = convolve8_4_usdot(s1, filters, perm_tbl); - d2 = convolve8_4_usdot(s2, filters, perm_tbl); - d3 = convolve8_4_usdot(s3, filters, perm_tbl); + d0 = convolve8_4_usdot(s0, filter, perm_tbl); + d1 = convolve8_4_usdot(s1, filter, perm_tbl); + d2 = convolve8_4_usdot(s2, filter, perm_tbl); + d3 = convolve8_4_usdot(s3, filter, perm_tbl); d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); @@ -86,9 +165,9 @@ void vpx_convolve8_2d_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, * further details on possible values of block height. */ load_u8_16x3(src, src_stride, &s0, &s1, &s2); - d0 = convolve8_4_usdot(s0, filters, perm_tbl); - d1 = convolve8_4_usdot(s1, filters, perm_tbl); - d2 = convolve8_4_usdot(s2, filters, perm_tbl); + d0 = convolve8_4_usdot(s0, filter, perm_tbl); + d1 = convolve8_4_usdot(s1, filter, perm_tbl); + d2 = convolve8_4_usdot(s2, filter, perm_tbl); d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); d23 = vqrshrun_n_s16(vcombine_s16(d2, vdup_n_s16(0)), FILTER_BITS); @@ -108,10 +187,10 @@ void vpx_convolve8_2d_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, do { load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); - d0 = convolve8_8_usdot(s0, filters, perm_tbl); - d1 = convolve8_8_usdot(s1, filters, perm_tbl); - d2 = convolve8_8_usdot(s2, filters, perm_tbl); - d3 = convolve8_8_usdot(s3, filters, perm_tbl); + d0 = convolve8_8_usdot(s0, filter, perm_tbl); + d1 = convolve8_8_usdot(s1, filter, perm_tbl); + d2 = convolve8_8_usdot(s2, filter, perm_tbl); + d3 = convolve8_8_usdot(s3, filter, perm_tbl); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); @@ -132,9 +211,9 @@ void vpx_convolve8_2d_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, do { load_u8_16x3(s, src_stride, &s0, &s1, &s2); - d0 = convolve8_8_usdot(s0, filters, perm_tbl); - d1 = convolve8_8_usdot(s1, filters, perm_tbl); - d2 = convolve8_8_usdot(s2, filters, perm_tbl); + d0 = convolve8_8_usdot(s0, filter, perm_tbl); + d1 = convolve8_8_usdot(s1, filter, perm_tbl); + d2 = convolve8_8_usdot(s2, filter, perm_tbl); store_u8_8x3(d, dst_stride, d0, d1, d2); @@ -145,6 +224,37 @@ void vpx_convolve8_2d_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, } } +void vpx_convolve8_2d_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4])); + + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + assert(x_step_q4 == 16); + + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + + if (vpx_get_filter_taps(filter[x0_q4]) <= 4) { + /* All 4-tap and bilinear filter values are even, so halve them to reduce + * intermediate precision requirements. Also slide the filter values so the + * the 4 taps exist in the first 4 elements of the vector. + */ + const int8x8_t x_filter_4tap = + vext_s8(vshr_n_s8(x_filter_8tap, 1), vdup_n_s8(0), 2); + vpx_convolve_4tap_2d_horiz_neon_i8mm(src - 1, src_stride, dst, dst_stride, + w, h, x_filter_4tap); + + } else { + vpx_convolve_8tap_2d_horiz_neon_i8mm(src - 3, src_stride, dst, dst_stride, + w, h, x_filter_8tap); + } +} + static INLINE void vpx_convolve_4tap_horiz_neon_i8mm( const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) { From d7358ed53a6b344f6f374b3a5e8fe3d207bcc720 Mon Sep 17 00:00:00 2001 From: Marco Paniconi Date: Sun, 26 Nov 2023 20:38:01 -0800 Subject: [PATCH 0962/1000] Unitest for issue: b/310477034 Fix is made here: https://chromium-review.googlesource.com/c/webm/libvpx/+/5055827 Bug: b/310477034 Change-Id: Id1cc7a6a95e1ea5d1a022f36d7971915c9918339 --- test/encode_api_test.cc | 86 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 86 insertions(+) diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc index d2469572a6..aa2d28b6da 100644 --- a/test/encode_api_test.cc +++ b/test/encode_api_test.cc @@ -514,6 +514,22 @@ vpx_image_t *CreateImage(const unsigned int width, const unsigned int height) { return image; } +void CodecEncodeFrame(vpx_codec_ctx_t *enc, const int width, const int height, + const int frame_index, unsigned int deadline, + const bool is_key) { + const vpx_codec_cx_pkt_t *pkt; + vpx_image_t *image = CreateImage(width, height); + vpx_enc_frame_flags_t frame_flags = is_key ? VPX_EFLAG_FORCE_KF : 0; + ASSERT_NE(image, nullptr); + ASSERT_EQ(vpx_codec_encode(enc, image, frame_index, 1, frame_flags, deadline), + VPX_CODEC_OK); + vpx_codec_iter_t iter = nullptr; + while ((pkt = vpx_codec_get_cx_data(enc, &iter)) != nullptr) { + ASSERT_EQ(pkt->kind, VPX_CODEC_CX_FRAME_PKT); + } + vpx_img_free(image); +} + // This is a test case from clusterfuzz. TEST(EncodeAPI, PrevMiCheckNullptr) { vpx_codec_iface_t *const iface = vpx_codec_vp9_cx(); @@ -581,6 +597,76 @@ TEST(EncodeAPI, PrevMiCheckNullptr) { ASSERT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK); } +// This is a test case from clusterfuzz: based on 310477034. +// Encode a few frames with multiple change config call +// with different frame size. +TEST(EncodeAPI, MultipleChangeConfigResize) { + vpx_codec_iface_t *const iface = vpx_codec_vp9_cx(); + vpx_codec_enc_cfg_t cfg; + + struct Config { + unsigned int thread; + unsigned int width; + unsigned int height; + vpx_rc_mode end_usage; + unsigned long deadline; + }; + + // Set initial config. + struct Config config = { 3, 41, 1, VPX_VBR, 1 }; + unsigned long deadline = config.deadline; + ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, /*usage=*/0), + VPX_CODEC_OK); + cfg.g_threads = config.thread; + cfg.g_w = config.width; + cfg.g_h = config.height; + cfg.g_timebase.num = 1; + cfg.g_timebase.den = 1000 * 1000; // microseconds + cfg.g_pass = VPX_RC_ONE_PASS; + cfg.g_lag_in_frames = 0; + cfg.rc_end_usage = config.end_usage; + cfg.rc_min_quantizer = 2; + cfg.rc_max_quantizer = 58; + + vpx_codec_ctx_t enc; + ASSERT_EQ(vpx_codec_enc_init(&enc, iface, &cfg, 0), VPX_CODEC_OK); + ASSERT_EQ(vpx_codec_control(&enc, VP8E_SET_CPUUSED, 3), VPX_CODEC_OK); + int frame_index = 0; + + // Encode first frame. + CodecEncodeFrame(&enc, cfg.g_w, cfg.g_h, frame_index, deadline, true); + frame_index++; + + // Change config. + config = { 16, 31, 1, VPX_VBR, 1000000 }; + cfg.g_threads = config.thread; + cfg.g_w = config.width; + cfg.g_h = config.height; + cfg.rc_end_usage = config.end_usage; + ASSERT_EQ(vpx_codec_enc_config_set(&enc, &cfg), VPX_CODEC_OK) + << vpx_codec_error_detail(&enc); + + // Change config again. + config = { 0, 17, 1, VPX_CBR, 1 }; + cfg.g_threads = config.thread; + cfg.g_w = config.width; + cfg.g_h = config.height; + cfg.rc_end_usage = config.end_usage; + deadline = config.deadline; + ASSERT_EQ(vpx_codec_enc_config_set(&enc, &cfg), VPX_CODEC_OK) + << vpx_codec_error_detail(&enc); + + // Encode 2nd frame with new config, set delta frame. + CodecEncodeFrame(&enc, cfg.g_w, cfg.g_h, frame_index, deadline, false); + frame_index++; + + // Encode 3rd frame with same config, set delta frame. + CodecEncodeFrame(&enc, cfg.g_w, cfg.g_h, frame_index, deadline, false); + frame_index++; + + ASSERT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK); +} + class EncodeApiGetTplStatsTest : public ::libvpx_test::EncoderTest, public ::testing::TestWithParam { From adebf364cb7ea098a366108acae8cf01595388fa Mon Sep 17 00:00:00 2001 From: Marco Paniconi Date: Tue, 21 Nov 2023 14:00:16 -0800 Subject: [PATCH 0963/1000] rtc: Set nonrd keyframe under dynamic change of deadline For realtime mode: if the deadline mode (good/best/realtime) is changed on the fly (via codec_encode() call), force a key frame and set the speed feature nonrd_keyframe = 1 to avoid entering the rd pickmode. nonrd_pickmode=0/off is the only feature in realtime mode that involves rd pickmode, so by forcing it on/1 we can cleanly separate nonrd (realtime) from rd (good/best), so we can avoid possible issues on this dynamic mode switch, such as in bug listed below. Dynamic change of deadline, in particular for realtime mode, involves a lot of coding/speed feature changes, so best to also force reset with keyframe. Added unitest that triggers the issue in the bug. Bug: b/310663186 Change-Id: Idf8fd7c9ee54b301968184be5481ee9faa06468d --- test/encode_api_test.cc | 94 ++++++++++++++++++++++++++++++++ vp9/encoder/vp9_encoder.c | 5 ++ vp9/encoder/vp9_encoder.h | 4 ++ vp9/encoder/vp9_ratectrl.c | 15 +++-- vp9/encoder/vp9_speed_features.c | 5 ++ 5 files changed, 118 insertions(+), 5 deletions(-) diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc index aa2d28b6da..f48c9a106f 100644 --- a/test/encode_api_test.cc +++ b/test/encode_api_test.cc @@ -667,6 +667,100 @@ TEST(EncodeAPI, MultipleChangeConfigResize) { ASSERT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK); } +// This is a test case from clusterfuzz: based on b/310663186. +// Encode set of frames while varying the deadline on the fly from +// good to realtime to best and back to realtime. +TEST(EncodeAPI, DynamicDeadlineChange) { + vpx_codec_iface_t *const iface = vpx_codec_vp9_cx(); + vpx_codec_enc_cfg_t cfg; + + struct Config { + unsigned int thread; + unsigned int width; + unsigned int height; + vpx_rc_mode end_usage; + unsigned long deadline; + }; + + // Set initial config, in particular set deadline to GOOD mode. + struct Config config = { 0, 1, 1, VPX_VBR, VPX_DL_GOOD_QUALITY }; + unsigned long deadline = config.deadline; + ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, /*usage=*/0), + VPX_CODEC_OK); + cfg.g_threads = config.thread; + cfg.g_w = config.width; + cfg.g_h = config.height; + cfg.g_timebase.num = 1; + cfg.g_timebase.den = 1000 * 1000; // microseconds + cfg.g_pass = VPX_RC_ONE_PASS; + cfg.g_lag_in_frames = 0; + cfg.rc_end_usage = config.end_usage; + cfg.rc_min_quantizer = 2; + cfg.rc_max_quantizer = 58; + + vpx_codec_ctx_t enc; + ASSERT_EQ(vpx_codec_enc_init(&enc, iface, &cfg, 0), VPX_CODEC_OK); + // Use realtime speed: 5 to 9. + ASSERT_EQ(vpx_codec_control(&enc, VP8E_SET_CPUUSED, 5), VPX_CODEC_OK); + int frame_index = 0; + + // Encode 1st frame. + CodecEncodeFrame(&enc, cfg.g_w, cfg.g_h, frame_index, deadline, true); + frame_index++; + + // Encode 2nd frame, delta frame. + CodecEncodeFrame(&enc, cfg.g_w, cfg.g_h, frame_index, deadline, false); + frame_index++; + + // Change config: change deadline to REALTIME. + config = { 0, 1, 1, VPX_VBR, VPX_DL_REALTIME }; + deadline = config.deadline; + ASSERT_EQ(vpx_codec_enc_config_set(&enc, &cfg), VPX_CODEC_OK) + << vpx_codec_error_detail(&enc); + + // Encode 3rd frame with new config, set key frame. + CodecEncodeFrame(&enc, cfg.g_w, cfg.g_h, frame_index, deadline, true); + frame_index++; + + // Encode 4th frame with same config, delta frame. + CodecEncodeFrame(&enc, cfg.g_w, cfg.g_h, frame_index, deadline, false); + frame_index++; + + // Encode 5th frame with same config, key frame. + CodecEncodeFrame(&enc, cfg.g_w, cfg.g_h, frame_index, deadline, true); + frame_index++; + + // Change config: change deadline to BEST. + config = { 0, 1, 1, VPX_VBR, VPX_DL_BEST_QUALITY }; + deadline = config.deadline; + ASSERT_EQ(vpx_codec_enc_config_set(&enc, &cfg), VPX_CODEC_OK) + << vpx_codec_error_detail(&enc); + + // Encode 6th frame with new config, set delta frame. + CodecEncodeFrame(&enc, cfg.g_w, cfg.g_h, frame_index, deadline, false); + frame_index++; + + // Change config: change deadline to REALTIME. + config = { 0, 1, 1, VPX_VBR, VPX_DL_REALTIME }; + deadline = config.deadline; + ASSERT_EQ(vpx_codec_enc_config_set(&enc, &cfg), VPX_CODEC_OK) + << vpx_codec_error_detail(&enc); + + // Encode 7th frame with new config, set delta frame. + CodecEncodeFrame(&enc, cfg.g_w, cfg.g_h, frame_index, deadline, false); + frame_index++; + + // Encode 8th frame with new config, set key frame. + CodecEncodeFrame(&enc, cfg.g_w, cfg.g_h, frame_index, deadline, true); + frame_index++; + + // Encode 9th frame with new config, set delta frame. + CodecEncodeFrame(&enc, cfg.g_w, cfg.g_h, frame_index, deadline, false); + frame_index++; + + ASSERT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK); +} + class EncodeApiGetTplStatsTest : public ::libvpx_test::EncoderTest, public ::testing::TestWithParam { diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index e1a4d986b7..e27a77e2e1 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -5535,6 +5535,11 @@ static void encode_frame_to_data_rate( set_ref_sign_bias(cpi); } + // On the very first frame set the deadline_mode_previous_frame to + // the current mode. + if (cpi->common.current_video_frame == 0) + cpi->deadline_mode_previous_frame = cpi->oxcf.mode; + // Set default state for segment based loop filter update flags. cm->lf.mode_ref_delta_update = 0; diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 7b02fe7f6b..171489358b 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -1037,6 +1037,10 @@ typedef struct VP9_COMP { int fixed_qp_onepass; + // Flag to keep track of dynamic change in deadline mode + // (good/best/realtime). + MODE deadline_mode_previous_frame; + #if CONFIG_COLLECT_COMPONENT_TIMING /*! * component_time[] are initialized to zero while encoder starts. diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index e02b2892ac..aa77b7cbad 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -1991,6 +1991,7 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) { rc->last_avg_frame_bandwidth = rc->avg_frame_bandwidth; if (cpi->use_svc && svc->spatial_layer_id < svc->number_spatial_layers - 1) svc->lower_layer_qindex = cm->base_qindex; + cpi->deadline_mode_previous_frame = cpi->oxcf.mode; } void vp9_rc_postencode_update_drop_frame(VP9_COMP *cpi) { @@ -2011,6 +2012,7 @@ void vp9_rc_postencode_update_drop_frame(VP9_COMP *cpi) { cpi->rc.buffer_level = cpi->rc.optimal_buffer_level; cpi->rc.bits_off_target = cpi->rc.optimal_buffer_level; } + cpi->deadline_mode_previous_frame = cpi->oxcf.mode; } int vp9_calc_pframe_target_size_one_pass_vbr(const VP9_COMP *cpi) { @@ -2118,7 +2120,8 @@ void vp9_rc_get_one_pass_vbr_params(VP9_COMP *cpi) { int target; if (!cpi->refresh_alt_ref_frame && (cm->current_video_frame == 0 || (cpi->frame_flags & FRAMEFLAGS_KEY) || - rc->frames_to_key == 0)) { + rc->frames_to_key == 0 || + (cpi->oxcf.mode != cpi->deadline_mode_previous_frame))) { cm->frame_type = KEY_FRAME; rc->this_key_frame_forced = cm->current_video_frame != 0 && rc->frames_to_key == 0; @@ -2284,14 +2287,15 @@ void vp9_rc_get_svc_params(VP9_COMP *cpi) { // Periodic key frames is based on the super-frame counter // (svc.current_superframe), also only base spatial layer is key frame. // Key frame is set for any of the following: very first frame, frame flags - // indicates key, superframe counter hits key frequency, or (non-intra) sync - // flag is set for spatial layer 0. + // indicates key, superframe counter hits key frequency,(non-intra) sync + // flag is set for spatial layer 0, or deadline mode changes. if ((cm->current_video_frame == 0 && !svc->previous_frame_is_intra_only) || (cpi->frame_flags & FRAMEFLAGS_KEY) || (cpi->oxcf.auto_key && (svc->current_superframe % cpi->oxcf.key_freq == 0) && !svc->previous_frame_is_intra_only && svc->spatial_layer_id == 0) || - (svc->spatial_layer_sync[0] == 1 && svc->spatial_layer_id == 0)) { + (svc->spatial_layer_sync[0] == 1 && svc->spatial_layer_id == 0) || + (cpi->oxcf.mode != cpi->deadline_mode_previous_frame)) { cm->frame_type = KEY_FRAME; rc->source_alt_ref_active = 0; if (is_one_pass_svc(cpi)) { @@ -2490,7 +2494,8 @@ void vp9_rc_get_one_pass_cbr_params(VP9_COMP *cpi) { RATE_CONTROL *const rc = &cpi->rc; int target; if ((cm->current_video_frame == 0) || (cpi->frame_flags & FRAMEFLAGS_KEY) || - (cpi->oxcf.auto_key && rc->frames_to_key == 0)) { + (cpi->oxcf.auto_key && rc->frames_to_key == 0) || + (cpi->oxcf.mode != cpi->deadline_mode_previous_frame)) { cm->frame_type = KEY_FRAME; rc->frames_to_key = cpi->oxcf.key_freq; rc->kf_boost = DEFAULT_KF_BOOST; diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c index 4a7172118c..56fb5f94f4 100644 --- a/vp9/encoder/vp9_speed_features.c +++ b/vp9/encoder/vp9_speed_features.c @@ -859,6 +859,11 @@ static void set_rt_speed_feature_framesize_independent( // off for now. if (speed <= 3 && cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) cpi->oxcf.aq_mode = 0; + // For all speeds for rt mode: if the deadline mode changed (was good/best + // quality on previous frame and now is realtime) set nonrd_keyframe to 1 to + // avoid entering rd pickmode. This causes issues, such as: b/310663186. + if (cpi->oxcf.mode != cpi->deadline_mode_previous_frame) + sf->nonrd_keyframe = 1; } void vp9_set_speed_features_framesize_dependent(VP9_COMP *cpi, int speed) { From 57b72fe80782ad294c04a7c3c278325f7b5bde54 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Wed, 29 Nov 2023 14:22:10 -0800 Subject: [PATCH 0964/1000] Add VP9Encoder class to simplify fuzz test cases Bug: b:306422625 Change-Id: I8344cb7fb4e1aee87d46f683746517cdcddf5c5d --- test/encode_api_test.cc | 274 ++++++++++++++-------------------------- 1 file changed, 93 insertions(+), 181 deletions(-) diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc index 739e61a6e6..da3676f6bf 100644 --- a/test/encode_api_test.cc +++ b/test/encode_api_test.cc @@ -514,17 +514,75 @@ vpx_image_t *CreateImage(const unsigned int width, const unsigned int height) { return image; } -void CodecEncodeFrame(vpx_codec_ctx_t *enc, const int width, const int height, - const int frame_index, unsigned int deadline, - const bool is_key) { +// Emulates the WebCodecs VideoEncoder interface. +class VP9Encoder { + public: + VP9Encoder(int speed) : speed_(speed) {} + ~VP9Encoder(); + + void Configure(unsigned int threads, unsigned int width, unsigned int height, + vpx_rc_mode end_usage, unsigned long deadline); + void Encode(bool key_frame); + + private: + const int speed_; + bool initialized_ = false; + vpx_codec_enc_cfg_t cfg_; + vpx_codec_ctx_t enc_; + int frame_index_ = 0; + unsigned long deadline_ = 0; +}; + +VP9Encoder::~VP9Encoder() { + if (initialized_) { + EXPECT_EQ(vpx_codec_destroy(&enc_), VPX_CODEC_OK); + } +} + +void VP9Encoder::Configure(unsigned int threads, unsigned int width, + unsigned int height, vpx_rc_mode end_usage, + unsigned long deadline) { + deadline_ = deadline; + + if (!initialized_) { + vpx_codec_iface_t *const iface = vpx_codec_vp9_cx(); + ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg_, /*usage=*/0), + VPX_CODEC_OK); + cfg_.g_threads = threads; + cfg_.g_w = width; + cfg_.g_h = height; + cfg_.g_timebase.num = 1; + cfg_.g_timebase.den = 1000 * 1000; // microseconds + cfg_.g_pass = VPX_RC_ONE_PASS; + cfg_.g_lag_in_frames = 0; + cfg_.rc_end_usage = end_usage; + cfg_.rc_min_quantizer = 2; + cfg_.rc_max_quantizer = 58; + ASSERT_EQ(vpx_codec_enc_init(&enc_, iface, &cfg_, 0), VPX_CODEC_OK); + ASSERT_EQ(vpx_codec_control(&enc_, VP8E_SET_CPUUSED, speed_), VPX_CODEC_OK); + initialized_ = true; + return; + } + + cfg_.g_threads = threads; + cfg_.g_w = width; + cfg_.g_h = height; + cfg_.rc_end_usage = end_usage; + ASSERT_EQ(vpx_codec_enc_config_set(&enc_, &cfg_), VPX_CODEC_OK) + << vpx_codec_error_detail(&enc_); +} + +void VP9Encoder::Encode(bool key_frame) { const vpx_codec_cx_pkt_t *pkt; - vpx_image_t *image = CreateImage(width, height); - vpx_enc_frame_flags_t frame_flags = is_key ? VPX_EFLAG_FORCE_KF : 0; + vpx_image_t *image = CreateImage(cfg_.g_w, cfg_.g_h); ASSERT_NE(image, nullptr); - ASSERT_EQ(vpx_codec_encode(enc, image, frame_index, 1, frame_flags, deadline), - VPX_CODEC_OK); + const vpx_enc_frame_flags_t frame_flags = key_frame ? VPX_EFLAG_FORCE_KF : 0; + ASSERT_EQ( + vpx_codec_encode(&enc_, image, frame_index_, 1, frame_flags, deadline_), + VPX_CODEC_OK); + frame_index_++; vpx_codec_iter_t iter = nullptr; - while ((pkt = vpx_codec_get_cx_data(enc, &iter)) != nullptr) { + while ((pkt = vpx_codec_get_cx_data(&enc_, &iter)) != nullptr) { ASSERT_EQ(pkt->kind, VPX_CODEC_CX_FRAME_PKT); } vpx_img_free(image); @@ -532,233 +590,87 @@ void CodecEncodeFrame(vpx_codec_ctx_t *enc, const int width, const int height, // This is a test case from clusterfuzz. TEST(EncodeAPI, PrevMiCheckNullptr) { - vpx_codec_iface_t *const iface = vpx_codec_vp9_cx(); - vpx_codec_enc_cfg_t cfg; + VP9Encoder encoder(0); + encoder.Configure(0, 1554, 644, VPX_VBR, VPX_DL_REALTIME); - struct Config { - unsigned int thread; - unsigned int width; - unsigned int height; - vpx_rc_mode end_usage; - unsigned long deadline; - }; - struct Config init_config = { 0, 1554, 644, VPX_VBR, 1 }; - unsigned long deadline = init_config.deadline; - ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, /*usage=*/0), - VPX_CODEC_OK); - cfg.g_threads = init_config.thread; - cfg.g_w = init_config.width; - cfg.g_h = init_config.height; - cfg.g_timebase.num = 1; - cfg.g_timebase.den = 1000 * 1000; // microseconds - cfg.g_pass = VPX_RC_ONE_PASS; - cfg.g_lag_in_frames = 0; - cfg.rc_end_usage = init_config.end_usage; - cfg.rc_min_quantizer = 2; - cfg.rc_max_quantizer = 58; - - vpx_codec_ctx_t enc; - ASSERT_EQ(vpx_codec_enc_init(&enc, iface, &cfg, 0), VPX_CODEC_OK); - ASSERT_EQ(vpx_codec_control(&enc, VP8E_SET_CPUUSED, 0), VPX_CODEC_OK); - - const vpx_codec_cx_pkt_t *pkt; - - int frame_index = 0; // First step: encode, without forcing KF. - vpx_image_t *image = CreateImage(cfg.g_w, cfg.g_h); - ASSERT_NE(image, nullptr); - ASSERT_EQ(vpx_codec_encode(&enc, image, frame_index, 1, 0, deadline), - VPX_CODEC_OK); - frame_index++; - vpx_codec_iter_t iter = nullptr; - while ((pkt = vpx_codec_get_cx_data(&enc, &iter)) != nullptr) { - ASSERT_EQ(pkt->kind, VPX_CODEC_CX_FRAME_PKT); - } - vpx_img_free(image); + encoder.Encode(false); // Second step: change config - struct Config encode_config = { 0, 1131, 644, VPX_CBR, 1000000 }; - cfg.g_threads = encode_config.thread; - cfg.g_w = encode_config.width; - cfg.g_h = encode_config.height; - cfg.rc_end_usage = encode_config.end_usage; - deadline = encode_config.deadline; - ASSERT_EQ(vpx_codec_enc_config_set(&enc, &cfg), VPX_CODEC_OK) - << vpx_codec_error_detail(&enc); + encoder.Configure(0, 1131, 644, VPX_CBR, VPX_DL_GOOD_QUALITY); // Third step: encode, without forcing KF - image = CreateImage(cfg.g_w, cfg.g_h); - ASSERT_NE(image, nullptr); - ASSERT_EQ(vpx_codec_encode(&enc, image, frame_index, 1, 0, deadline), - VPX_CODEC_OK); - frame_index++; - while ((pkt = vpx_codec_get_cx_data(&enc, &iter)) != nullptr) { - ASSERT_EQ(pkt->kind, VPX_CODEC_CX_FRAME_PKT); - } - vpx_img_free(image); - ASSERT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK); + encoder.Encode(false); } // This is a test case from clusterfuzz: based on 310477034. // Encode a few frames with multiple change config call // with different frame size. TEST(EncodeAPI, MultipleChangeConfigResize) { - vpx_codec_iface_t *const iface = vpx_codec_vp9_cx(); - vpx_codec_enc_cfg_t cfg; - - struct Config { - unsigned int thread; - unsigned int width; - unsigned int height; - vpx_rc_mode end_usage; - unsigned long deadline; - }; + VP9Encoder encoder(3); // Set initial config. - struct Config config = { 3, 41, 1, VPX_VBR, 1 }; - unsigned long deadline = config.deadline; - ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, /*usage=*/0), - VPX_CODEC_OK); - cfg.g_threads = config.thread; - cfg.g_w = config.width; - cfg.g_h = config.height; - cfg.g_timebase.num = 1; - cfg.g_timebase.den = 1000 * 1000; // microseconds - cfg.g_pass = VPX_RC_ONE_PASS; - cfg.g_lag_in_frames = 0; - cfg.rc_end_usage = config.end_usage; - cfg.rc_min_quantizer = 2; - cfg.rc_max_quantizer = 58; - - vpx_codec_ctx_t enc; - ASSERT_EQ(vpx_codec_enc_init(&enc, iface, &cfg, 0), VPX_CODEC_OK); - ASSERT_EQ(vpx_codec_control(&enc, VP8E_SET_CPUUSED, 3), VPX_CODEC_OK); - int frame_index = 0; + encoder.Configure(3, 41, 1, VPX_VBR, VPX_DL_REALTIME); // Encode first frame. - CodecEncodeFrame(&enc, cfg.g_w, cfg.g_h, frame_index, deadline, true); - frame_index++; + encoder.Encode(true); // Change config. - config = { 16, 31, 1, VPX_VBR, 1000000 }; - cfg.g_threads = config.thread; - cfg.g_w = config.width; - cfg.g_h = config.height; - cfg.rc_end_usage = config.end_usage; - ASSERT_EQ(vpx_codec_enc_config_set(&enc, &cfg), VPX_CODEC_OK) - << vpx_codec_error_detail(&enc); + encoder.Configure(16, 31, 1, VPX_VBR, VPX_DL_GOOD_QUALITY); // Change config again. - config = { 0, 17, 1, VPX_CBR, 1 }; - cfg.g_threads = config.thread; - cfg.g_w = config.width; - cfg.g_h = config.height; - cfg.rc_end_usage = config.end_usage; - deadline = config.deadline; - ASSERT_EQ(vpx_codec_enc_config_set(&enc, &cfg), VPX_CODEC_OK) - << vpx_codec_error_detail(&enc); + encoder.Configure(0, 17, 1, VPX_CBR, VPX_DL_REALTIME); // Encode 2nd frame with new config, set delta frame. - CodecEncodeFrame(&enc, cfg.g_w, cfg.g_h, frame_index, deadline, false); - frame_index++; + encoder.Encode(false); // Encode 3rd frame with same config, set delta frame. - CodecEncodeFrame(&enc, cfg.g_w, cfg.g_h, frame_index, deadline, false); - frame_index++; - - ASSERT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK); + encoder.Encode(false); } // This is a test case from clusterfuzz: based on b/310663186. // Encode set of frames while varying the deadline on the fly from // good to realtime to best and back to realtime. TEST(EncodeAPI, DynamicDeadlineChange) { - vpx_codec_iface_t *const iface = vpx_codec_vp9_cx(); - vpx_codec_enc_cfg_t cfg; - - struct Config { - unsigned int thread; - unsigned int width; - unsigned int height; - vpx_rc_mode end_usage; - unsigned long deadline; - }; + // Use realtime speed: 5 to 9. + VP9Encoder encoder(5); // Set initial config, in particular set deadline to GOOD mode. - struct Config config = { 0, 1, 1, VPX_VBR, VPX_DL_GOOD_QUALITY }; - unsigned long deadline = config.deadline; - ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, /*usage=*/0), - VPX_CODEC_OK); - cfg.g_threads = config.thread; - cfg.g_w = config.width; - cfg.g_h = config.height; - cfg.g_timebase.num = 1; - cfg.g_timebase.den = 1000 * 1000; // microseconds - cfg.g_pass = VPX_RC_ONE_PASS; - cfg.g_lag_in_frames = 0; - cfg.rc_end_usage = config.end_usage; - cfg.rc_min_quantizer = 2; - cfg.rc_max_quantizer = 58; - - vpx_codec_ctx_t enc; - ASSERT_EQ(vpx_codec_enc_init(&enc, iface, &cfg, 0), VPX_CODEC_OK); - // Use realtime speed: 5 to 9. - ASSERT_EQ(vpx_codec_control(&enc, VP8E_SET_CPUUSED, 5), VPX_CODEC_OK); - int frame_index = 0; + encoder.Configure(0, 1, 1, VPX_VBR, VPX_DL_GOOD_QUALITY); // Encode 1st frame. - CodecEncodeFrame(&enc, cfg.g_w, cfg.g_h, frame_index, deadline, true); - frame_index++; + encoder.Encode(true); // Encode 2nd frame, delta frame. - CodecEncodeFrame(&enc, cfg.g_w, cfg.g_h, frame_index, deadline, false); - frame_index++; + encoder.Encode(false); // Change config: change deadline to REALTIME. - config = { 0, 1, 1, VPX_VBR, VPX_DL_REALTIME }; - deadline = config.deadline; - ASSERT_EQ(vpx_codec_enc_config_set(&enc, &cfg), VPX_CODEC_OK) - << vpx_codec_error_detail(&enc); + encoder.Configure(0, 1, 1, VPX_VBR, VPX_DL_REALTIME); // Encode 3rd frame with new config, set key frame. - CodecEncodeFrame(&enc, cfg.g_w, cfg.g_h, frame_index, deadline, true); - frame_index++; + encoder.Encode(true); // Encode 4th frame with same config, delta frame. - CodecEncodeFrame(&enc, cfg.g_w, cfg.g_h, frame_index, deadline, false); - frame_index++; + encoder.Encode(false); // Encode 5th frame with same config, key frame. - CodecEncodeFrame(&enc, cfg.g_w, cfg.g_h, frame_index, deadline, true); - frame_index++; + encoder.Encode(true); // Change config: change deadline to BEST. - config = { 0, 1, 1, VPX_VBR, VPX_DL_BEST_QUALITY }; - deadline = config.deadline; - ASSERT_EQ(vpx_codec_enc_config_set(&enc, &cfg), VPX_CODEC_OK) - << vpx_codec_error_detail(&enc); + encoder.Configure(0, 1, 1, VPX_VBR, VPX_DL_BEST_QUALITY); // Encode 6th frame with new config, set delta frame. - CodecEncodeFrame(&enc, cfg.g_w, cfg.g_h, frame_index, deadline, false); - frame_index++; + encoder.Encode(false); // Change config: change deadline to REALTIME. - config = { 0, 1, 1, VPX_VBR, VPX_DL_REALTIME }; - deadline = config.deadline; - ASSERT_EQ(vpx_codec_enc_config_set(&enc, &cfg), VPX_CODEC_OK) - << vpx_codec_error_detail(&enc); + encoder.Configure(0, 1, 1, VPX_VBR, VPX_DL_REALTIME); // Encode 7th frame with new config, set delta frame. - CodecEncodeFrame(&enc, cfg.g_w, cfg.g_h, frame_index, deadline, false); - frame_index++; + encoder.Encode(false); // Encode 8th frame with new config, set key frame. - CodecEncodeFrame(&enc, cfg.g_w, cfg.g_h, frame_index, deadline, true); - frame_index++; + encoder.Encode(true); // Encode 9th frame with new config, set delta frame. - CodecEncodeFrame(&enc, cfg.g_w, cfg.g_h, frame_index, deadline, false); - frame_index++; - - ASSERT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK); + encoder.Encode(false); } class EncodeApiGetTplStatsTest From fe5dc9f7fc2f98709184dace2308d51265f2800d Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Fri, 10 Nov 2023 20:38:31 -0500 Subject: [PATCH 0965/1000] vp9 rc: support screen content Bug: b/281463780 Change-Id: I23668550257b28031bdca0537459f93ec63f1e2e --- test/vp9_ratectrl_rtc_test.cc | 21 ++++++++++++++++++++- vp9/encoder/vp9_encoder.c | 1 + vp9/encoder/vp9_encoder.h | 3 +++ vp9/encoder/vp9_ratectrl.c | 3 ++- vp9/ratectrl_rtc.cc | 1 + vp9/vp9_cx_iface.c | 1 + vpx/internal/vpx_ratectrl_rtc.h | 2 ++ 7 files changed, 30 insertions(+), 2 deletions(-) diff --git a/test/vp9_ratectrl_rtc_test.cc b/test/vp9_ratectrl_rtc_test.cc index ff718bbaa7..f7be47542c 100644 --- a/test/vp9_ratectrl_rtc_test.cc +++ b/test/vp9_ratectrl_rtc_test.cc @@ -54,7 +54,11 @@ class RcInterfaceTest if (video->frame() == 0) { encoder->Control(VP8E_SET_CPUUSED, 7); encoder->Control(VP9E_SET_AQ_MODE, aq_mode_); - encoder->Control(VP9E_SET_TUNE_CONTENT, 0); + if (rc_cfg_.is_screen) { + encoder->Control(VP9E_SET_TUNE_CONTENT, VP9E_CONTENT_SCREEN); + } else { + encoder->Control(VP9E_SET_TUNE_CONTENT, VP9E_CONTENT_DEFAULT); + } encoder->Control(VP8E_SET_MAX_INTRA_BITRATE_PCT, 1000); encoder->Control(VP9E_SET_RTC_EXTERNAL_RATECTRL, 1); } @@ -101,6 +105,19 @@ class RcInterfaceTest ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); } + void RunOneLayerScreen() { + SetConfig(GET_PARAM(2)); + rc_cfg_.is_screen = true; + rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_); + frame_params_.spatial_layer_id = 0; + frame_params_.temporal_layer_id = 0; + + ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv", + 1280, 720, 30, 1, 0, kNumFrames); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + } + void RunOneLayerDropFramesCBR() { if (GET_PARAM(2) != VPX_CBR) { GTEST_SKIP() << "Frame dropping is only for CBR mode."; @@ -632,6 +649,8 @@ TEST_P(RcInterfaceTest, OneLayer) { RunOneLayer(); } TEST_P(RcInterfaceTest, OneLayerDropFramesCBR) { RunOneLayerDropFramesCBR(); } +TEST_P(RcInterfaceTest, OneLayerScreen) { RunOneLayerScreen(); } + TEST_P(RcInterfaceTest, OneLayerVBRPeriodicKey) { RunOneLayerVBRPeriodicKey(); } TEST_P(RcInterfaceSvcTest, Svc) { RunSvc(); } diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index e27a77e2e1..be55150140 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -4065,6 +4065,7 @@ static int encode_without_recode_loop(VP9_COMP *cpi, size_t *size, cpi->rc.hybrid_intra_scene_change = 0; cpi->rc.re_encode_maxq_scene_change = 0; if (cm->show_frame && cpi->oxcf.mode == REALTIME && + !cpi->disable_scene_detection_rtc_ratectrl && (cpi->oxcf.rc_mode == VPX_VBR || cpi->oxcf.content == VP9E_CONTENT_SCREEN || (cpi->oxcf.speed >= 5 && cpi->oxcf.speed < 8))) diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 171489358b..160de0064f 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -1041,6 +1041,9 @@ typedef struct VP9_COMP { // (good/best/realtime). MODE deadline_mode_previous_frame; + // Flag to disable scene detection when rtc rate control library is used. + int disable_scene_detection_rtc_ratectrl; + #if CONFIG_COLLECT_COMPONENT_TIMING /*! * component_time[] are initialized to zero while encoder starts. diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index aa77b7cbad..6452e349df 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -680,7 +680,8 @@ static int adjust_q_cbr(const VP9_COMP *cpi, int q) { else q = qclamp; } - if (cpi->oxcf.content == VP9E_CONTENT_SCREEN) + if (cpi->oxcf.content == VP9E_CONTENT_SCREEN && + cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) vp9_cyclic_refresh_limit_q(cpi, &q); return VPXMAX(VPXMIN(q, cpi->rc.worst_quality), cpi->rc.best_quality); } diff --git a/vp9/ratectrl_rtc.cc b/vp9/ratectrl_rtc.cc index d8239718a8..fd81bce7b5 100644 --- a/vp9/ratectrl_rtc.cc +++ b/vp9/ratectrl_rtc.cc @@ -131,6 +131,7 @@ bool VP9RateControlRTC::UpdateRateControl( oxcf->under_shoot_pct = rc_cfg.undershoot_pct; oxcf->over_shoot_pct = rc_cfg.overshoot_pct; oxcf->drop_frames_water_mark = rc_cfg.frame_drop_thresh; + oxcf->content = rc_cfg.is_screen ? VP9E_CONTENT_SCREEN : VP9E_CONTENT_DEFAULT; oxcf->ss_number_layers = rc_cfg.ss_number_layers; oxcf->ts_number_layers = rc_cfg.ts_number_layers; oxcf->temporal_layering_mode = (VP9E_TEMPORAL_LAYERING_MODE)( diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index b1dfe992cf..b75fc18fd2 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -1077,6 +1077,7 @@ static vpx_codec_err_t ctrl_set_rtc_external_ratectrl(vpx_codec_alg_priv_t *ctx, cpi->compute_frame_low_motion_onepass = 0; cpi->rc.constrain_gf_key_freq_onepass_vbr = 0; cpi->cyclic_refresh->content_mode = 0; + cpi->disable_scene_detection_rtc_ratectrl = 1; } return VPX_CODEC_OK; } diff --git a/vpx/internal/vpx_ratectrl_rtc.h b/vpx/internal/vpx_ratectrl_rtc.h index 6ffd798eb2..01d64b14b7 100644 --- a/vpx/internal/vpx_ratectrl_rtc.h +++ b/vpx/internal/vpx_ratectrl_rtc.h @@ -43,6 +43,7 @@ struct VpxRateControlRtcConfig { layer_target_bitrate[0] = static_cast(target_bandwidth); ts_rate_decimator[0] = 1; frame_drop_thresh = 0; + is_screen = false; } int width; @@ -67,6 +68,7 @@ struct VpxRateControlRtcConfig { enum vpx_rc_mode rc_mode; int aq_mode; int frame_drop_thresh; + bool is_screen; }; } // namespace libvpx #endif // VPX_VPX_INTERNAL_VPX_RATECTRL_RTC_H_ From 97a0d139ce077d02a64a5bba1870104a43a0217f Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 29 Nov 2023 16:47:34 -0800 Subject: [PATCH 0966/1000] psnr.h,cosmetics: fix a typo (PNSR -> PSNR) Change-Id: I2adea9f150852c106acc57e5aeeac571d6bd15fb --- vpx_dsp/psnr.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vpx_dsp/psnr.h b/vpx_dsp/psnr.h index 9ebb64dd52..7c57aa429f 100644 --- a/vpx_dsp/psnr.h +++ b/vpx_dsp/psnr.h @@ -26,7 +26,7 @@ typedef struct vpx_psnr_pkt PSNR_STATS; /*!\brief Converts SSE to PSNR * - * Converts sum of squared errros (SSE) to peak signal-to-noise ratio (PNSR). + * Converts sum of squared errros (SSE) to peak signal-to-noise ratio (PSNR). * * \param[in] samples Number of samples * \param[in] peak Max sample value From b027590c30d9bd5dda3f72aff7993fd94fd28813 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Wed, 29 Nov 2023 12:18:08 -0800 Subject: [PATCH 0967/1000] Define vpx_enc_deadline_t type for encode deadline The deadline parameter of vpx_codec_encode() is of the unsigned long type. The cpplint runtime/int check and the clang-tidy google-runtime-int warn about the use of the unsigned long type. Adding a type alias works around this issue. Note: vpx_codec_decode() also has a deadline parameter, but it is of the long type. So unfortuntely this type alias cannot be simply named vpx_codec_deadline_t and the name must suggest it is encoder-specific. Change-Id: I27b6b25730b620b328422ec3f91e63fdc55b377a --- test/codec_factory.h | 10 +++++----- test/encode_api_test.cc | 6 +++--- test/encode_test_driver.h | 8 ++++---- test/frame_size_tests.cc | 3 +-- test/keyframe_test.cc | 4 ++-- vp8/vp8_cx_iface.c | 6 +++--- vp9/vp9_cx_iface.c | 4 ++-- vpx/internal/vpx_codec_internal.h | 2 +- vpx/src/vpx_encoder.c | 2 +- vpx/vpx_encoder.h | 8 +++++++- 10 files changed, 29 insertions(+), 24 deletions(-) diff --git a/test/codec_factory.h b/test/codec_factory.h index d00563df1c..c7e8f54847 100644 --- a/test/codec_factory.h +++ b/test/codec_factory.h @@ -40,7 +40,7 @@ class CodecFactory { const vpx_codec_flags_t flags) const = 0; virtual Encoder *CreateEncoder(vpx_codec_enc_cfg_t cfg, - unsigned long deadline, + vpx_enc_deadline_t deadline, const unsigned long init_flags, TwopassStatsStore *stats) const = 0; @@ -95,7 +95,7 @@ class VP8Decoder : public Decoder { class VP8Encoder : public Encoder { public: - VP8Encoder(vpx_codec_enc_cfg_t cfg, unsigned long deadline, + VP8Encoder(vpx_codec_enc_cfg_t cfg, vpx_enc_deadline_t deadline, const unsigned long init_flags, TwopassStatsStore *stats) : Encoder(cfg, deadline, init_flags, stats) {} @@ -128,7 +128,7 @@ class VP8CodecFactory : public CodecFactory { #endif } - Encoder *CreateEncoder(vpx_codec_enc_cfg_t cfg, unsigned long deadline, + Encoder *CreateEncoder(vpx_codec_enc_cfg_t cfg, vpx_enc_deadline_t deadline, const unsigned long init_flags, TwopassStatsStore *stats) const override { #if CONFIG_VP8_ENCODER @@ -190,7 +190,7 @@ class VP9Decoder : public Decoder { class VP9Encoder : public Encoder { public: - VP9Encoder(vpx_codec_enc_cfg_t cfg, unsigned long deadline, + VP9Encoder(vpx_codec_enc_cfg_t cfg, vpx_enc_deadline_t deadline, const unsigned long init_flags, TwopassStatsStore *stats) : Encoder(cfg, deadline, init_flags, stats) {} @@ -223,7 +223,7 @@ class VP9CodecFactory : public CodecFactory { #endif } - Encoder *CreateEncoder(vpx_codec_enc_cfg_t cfg, unsigned long deadline, + Encoder *CreateEncoder(vpx_codec_enc_cfg_t cfg, vpx_enc_deadline_t deadline, const unsigned long init_flags, TwopassStatsStore *stats) const override { #if CONFIG_VP9_ENCODER diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc index da3676f6bf..462427fe15 100644 --- a/test/encode_api_test.cc +++ b/test/encode_api_test.cc @@ -521,7 +521,7 @@ class VP9Encoder { ~VP9Encoder(); void Configure(unsigned int threads, unsigned int width, unsigned int height, - vpx_rc_mode end_usage, unsigned long deadline); + vpx_rc_mode end_usage, vpx_enc_deadline_t deadline); void Encode(bool key_frame); private: @@ -530,7 +530,7 @@ class VP9Encoder { vpx_codec_enc_cfg_t cfg_; vpx_codec_ctx_t enc_; int frame_index_ = 0; - unsigned long deadline_ = 0; + vpx_enc_deadline_t deadline_ = 0; }; VP9Encoder::~VP9Encoder() { @@ -541,7 +541,7 @@ VP9Encoder::~VP9Encoder() { void VP9Encoder::Configure(unsigned int threads, unsigned int width, unsigned int height, vpx_rc_mode end_usage, - unsigned long deadline) { + vpx_enc_deadline_t deadline) { deadline_ = deadline; if (!initialized_) { diff --git a/test/encode_test_driver.h b/test/encode_test_driver.h index c7974894c7..7dd80d6664 100644 --- a/test/encode_test_driver.h +++ b/test/encode_test_driver.h @@ -86,7 +86,7 @@ class TwopassStatsStore { // level of abstraction will be fleshed out as more tests are written. class Encoder { public: - Encoder(vpx_codec_enc_cfg_t cfg, unsigned long deadline, + Encoder(vpx_codec_enc_cfg_t cfg, vpx_enc_deadline_t deadline, const unsigned long init_flags, TwopassStatsStore *stats) : cfg_(cfg), deadline_(deadline), init_flags_(init_flags), stats_(stats) { memset(&encoder_, 0, sizeof(encoder_)); @@ -177,7 +177,7 @@ class Encoder { cfg_ = *cfg; } - void set_deadline(unsigned long deadline) { deadline_ = deadline; } + void set_deadline(vpx_enc_deadline_t deadline) { deadline_ = deadline; } protected: virtual vpx_codec_iface_t *CodecInterface() const = 0; @@ -196,7 +196,7 @@ class Encoder { vpx_codec_ctx_t encoder_; vpx_codec_enc_cfg_t cfg_; - unsigned long deadline_; + vpx_enc_deadline_t deadline_; unsigned long init_flags_; TwopassStatsStore *stats_; }; @@ -291,7 +291,7 @@ class EncoderTest { vpx_codec_enc_cfg_t cfg_; vpx_codec_dec_cfg_t dec_cfg_; unsigned int passes_; - unsigned long deadline_; + vpx_enc_deadline_t deadline_; TwopassStatsStore stats_; unsigned long init_flags_; vpx_enc_frame_flags_t frame_flags_; diff --git a/test/frame_size_tests.cc b/test/frame_size_tests.cc index 7b6c29a88f..eea5647a78 100644 --- a/test/frame_size_tests.cc +++ b/test/frame_size_tests.cc @@ -18,8 +18,7 @@ namespace { class EncoderWithExpectedError : public ::libvpx_test::Encoder { public: - EncoderWithExpectedError(vpx_codec_enc_cfg_t cfg, - unsigned long deadline, // NOLINT + EncoderWithExpectedError(vpx_codec_enc_cfg_t cfg, vpx_enc_deadline_t deadline, const unsigned long init_flags, // NOLINT ::libvpx_test::TwopassStatsStore *stats) : ::libvpx_test::Encoder(cfg, deadline, init_flags, stats) {} diff --git a/test/keyframe_test.cc b/test/keyframe_test.cc index 5292bb188d..6a1c99cbe2 100644 --- a/test/keyframe_test.cc +++ b/test/keyframe_test.cc @@ -178,7 +178,7 @@ vpx_image_t *CreateGrayImage(vpx_img_fmt_t fmt, unsigned int w, // Tests kf_max_dist in one-pass encoding with zero lag. void TestKeyframeMaximumInterval(vpx_codec_iface_t *iface, - unsigned long deadline, + vpx_enc_deadline_t deadline, unsigned int kf_max_dist) { vpx_codec_enc_cfg_t cfg; ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, /*usage=*/0), @@ -241,7 +241,7 @@ TEST(KeyframeIntervalTest, KeyframeMaximumInterval) { ifaces.push_back(vpx_codec_vp9_cx()); #endif for (vpx_codec_iface_t *iface : ifaces) { - for (unsigned long deadline : + for (vpx_enc_deadline_t deadline : { VPX_DL_REALTIME, VPX_DL_GOOD_QUALITY, VPX_DL_BEST_QUALITY }) { // Test 0 and 1 (both mean all intra), some powers of 2, some multiples // of 10, and some prime numbers. diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c index a6f0b4cbcf..c42a837ebe 100644 --- a/vp8/vp8_cx_iface.c +++ b/vp8/vp8_cx_iface.c @@ -12,7 +12,7 @@ #include "./vp8_rtcd.h" #include "./vpx_dsp_rtcd.h" #include "./vpx_scale_rtcd.h" -#include "vpx/vpx_codec.h" +#include "vpx/vpx_encoder.h" #include "vpx/internal/vpx_codec_internal.h" #include "vpx_version.h" #include "vpx_mem/vpx_mem.h" @@ -776,7 +776,7 @@ static vpx_codec_err_t image2yuvconfig(const vpx_image_t *img, static void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx, unsigned long duration, - unsigned long deadline) { + vpx_enc_deadline_t deadline) { int new_qc; #if !(CONFIG_REALTIME_ONLY) @@ -866,7 +866,7 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx, const vpx_image_t *img, vpx_codec_pts_t pts, unsigned long duration, vpx_enc_frame_flags_t enc_flags, - unsigned long deadline) { + vpx_enc_deadline_t deadline) { volatile vpx_codec_err_t res = VPX_CODEC_OK; // Make a copy as volatile to avoid -Wclobbered with longjmp. volatile vpx_enc_frame_flags_t flags = enc_flags; diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index b75fc18fd2..568a21c2a1 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -1153,7 +1153,7 @@ static vpx_codec_err_t encoder_destroy(vpx_codec_alg_priv_t *ctx) { static void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx, unsigned long duration, - unsigned long deadline) { + vpx_enc_deadline_t deadline) { MODE new_mode = BEST; #if CONFIG_REALTIME_ONLY @@ -1298,7 +1298,7 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, vpx_codec_pts_t pts_val, unsigned long duration, vpx_enc_frame_flags_t enc_flags, - unsigned long deadline) { + vpx_enc_deadline_t deadline) { volatile vpx_codec_err_t res = VPX_CODEC_OK; volatile vpx_enc_frame_flags_t flags = enc_flags; volatile vpx_codec_pts_t pts = pts_val; diff --git a/vpx/internal/vpx_codec_internal.h b/vpx/internal/vpx_codec_internal.h index aae3218738..60f90aa111 100644 --- a/vpx/internal/vpx_codec_internal.h +++ b/vpx/internal/vpx_codec_internal.h @@ -255,7 +255,7 @@ typedef vpx_codec_err_t (*vpx_codec_encode_fn_t)(vpx_codec_alg_priv_t *ctx, vpx_codec_pts_t pts, unsigned long duration, vpx_enc_frame_flags_t flags, - unsigned long deadline); + vpx_enc_deadline_t deadline); typedef const vpx_codec_cx_pkt_t *(*vpx_codec_get_cx_data_fn_t)( vpx_codec_alg_priv_t *ctx, vpx_codec_iter_t *iter); diff --git a/vpx/src/vpx_encoder.c b/vpx/src/vpx_encoder.c index 0d6e48015a..017525aeee 100644 --- a/vpx/src/vpx_encoder.c +++ b/vpx/src/vpx_encoder.c @@ -191,7 +191,7 @@ static void FLOATING_POINT_RESTORE() {} vpx_codec_err_t vpx_codec_encode(vpx_codec_ctx_t *ctx, const vpx_image_t *img, vpx_codec_pts_t pts, unsigned long duration, vpx_enc_frame_flags_t flags, - unsigned long deadline) { + vpx_enc_deadline_t deadline) { vpx_codec_err_t res = VPX_CODEC_OK; if (!ctx || (img && !duration)) diff --git a/vpx/vpx_encoder.h b/vpx/vpx_encoder.h index c45d1a2ba5..efb1be6f12 100644 --- a/vpx/vpx_encoder.h +++ b/vpx/vpx_encoder.h @@ -979,6 +979,12 @@ vpx_codec_err_t vpx_codec_enc_config_set(vpx_codec_ctx_t *ctx, */ vpx_fixed_buf_t *vpx_codec_get_global_headers(vpx_codec_ctx_t *ctx); +/*!\brief Encode Deadline + * + * This type indicates a deadline, in microseconds, to be passed to + * vpx_codec_encode(). + */ +typedef unsigned long vpx_enc_deadline_t; /*!\brief deadline parameter analogous to VPx REALTIME mode. */ #define VPX_DL_REALTIME (1) /*!\brief deadline parameter analogous to VPx GOOD QUALITY mode. */ @@ -1024,7 +1030,7 @@ vpx_fixed_buf_t *vpx_codec_get_global_headers(vpx_codec_ctx_t *ctx); vpx_codec_err_t vpx_codec_encode(vpx_codec_ctx_t *ctx, const vpx_image_t *img, vpx_codec_pts_t pts, unsigned long duration, vpx_enc_frame_flags_t flags, - unsigned long deadline); + vpx_enc_deadline_t deadline); /*!\brief Set compressed data output buffer * From a33ac12dc0e6d99ee47f30edd37c4ca5e71b5b1d Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Wed, 29 Nov 2023 15:50:56 +0000 Subject: [PATCH 0968/1000] Specialise Armv8.0 Neon vert convolution for 4-tap filters Add an Armv8.0 MLA Neon implementation of vertical convolution specialised for executing with 4-tap filters (the most common filter size for settings --good --cpu-used=1.) This new path is also used when executing with bilinear (2-tap) filters. Change-Id: I027eaf2d1bb9711c2217cc8aa6b1e379d3e66b26 --- vpx_dsp/arm/mem_neon.h | 10 ++ vpx_dsp/arm/vpx_convolve8_neon.c | 161 +++++++++++++++++++++++++++---- vpx_dsp/arm/vpx_convolve8_neon.h | 21 ++++ vpx_dsp/arm/vpx_convolve_neon.c | 29 +++--- 4 files changed, 187 insertions(+), 34 deletions(-) diff --git a/vpx_dsp/arm/mem_neon.h b/vpx_dsp/arm/mem_neon.h index 586bfb85af..38b0b6c1a9 100644 --- a/vpx_dsp/arm/mem_neon.h +++ b/vpx_dsp/arm/mem_neon.h @@ -273,6 +273,16 @@ static INLINE void store_u8_8x3(uint8_t *s, const ptrdiff_t p, vst1_u8(s, s2); } +static INLINE void load_u8_8x3(const uint8_t *s, const ptrdiff_t p, + uint8x8_t *const s0, uint8x8_t *const s1, + uint8x8_t *const s2) { + *s0 = vld1_u8(s); + s += p; + *s1 = vld1_u8(s); + s += p; + *s2 = vld1_u8(s); +} + static INLINE void load_u8_8x4(const uint8_t *s, const ptrdiff_t p, uint8x8_t *const s0, uint8x8_t *const s1, uint8x8_t *const s2, uint8x8_t *const s3) { diff --git a/vpx_dsp/arm/vpx_convolve8_neon.c b/vpx_dsp/arm/vpx_convolve8_neon.c index 8b89862ba9..790c8d8352 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon.c +++ b/vpx_dsp/arm/vpx_convolve8_neon.c @@ -482,23 +482,115 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, } } -void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *filter, int x0_q4, - int x_step_q4, int y0_q4, int y_step_q4, int w, - int h) { - const int16x8_t filters = vld1q_s16(filter[y0_q4]); +static INLINE void vpx_convolve_4tap_vert_neon(const uint8_t *src, + ptrdiff_t src_stride, + uint8_t *dst, + ptrdiff_t dst_stride, int w, + int h, const int16x4_t filter) { + if (w == 4) { + uint8x8_t t0, t1, t2, t3, d01, d23; + int16x4_t s0, s1, s2, s3, s4, s5, s6, d0, d1, d2, d3; - assert((intptr_t)dst % 4 == 0); - assert(dst_stride % 4 == 0); - assert(y_step_q4 == 16); + load_u8_8x3(src, src_stride, &t0, &t1, &t2); + s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); + s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1))); + s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2))); - (void)x0_q4; - (void)x_step_q4; - (void)y_step_q4; + src += 3 * src_stride; - src -= 3 * src_stride; + do { + load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); + s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); + s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1))); + s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2))); + s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3))); + + __builtin_prefetch(dst + 0 * dst_stride); + __builtin_prefetch(dst + 1 * dst_stride); + __builtin_prefetch(dst + 2 * dst_stride); + __builtin_prefetch(dst + 3 * dst_stride); + __builtin_prefetch(src + 0 * src_stride); + __builtin_prefetch(src + 1 * src_stride); + __builtin_prefetch(src + 2 * src_stride); + __builtin_prefetch(src + 3 * src_stride); + + d0 = convolve4_4(s0, s1, s2, s3, filter); + d1 = convolve4_4(s1, s2, s3, s4, filter); + d2 = convolve4_4(s2, s3, s4, s5, filter); + d3 = convolve4_4(s3, s4, s5, s6, filter); + /* We halved the filter values so -1 from right shift. */ + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); + d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + s0 = s4; + s1 = s5; + s2 = s6; + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + int height; + const uint8_t *s; + uint8_t *d; + uint8x8_t t0, t1, t2, t3, d0, d1, d2, d3; + int16x8_t s0, s1, s2, s3, s4, s5, s6; + + do { + load_u8_8x3(src, src_stride, &t0, &t1, &t2); + s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + + s = src + 3 * src_stride; + d = dst; + height = h; + + do { + load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); + s3 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s4 = vreinterpretq_s16_u16(vmovl_u8(t1)); + s5 = vreinterpretq_s16_u16(vmovl_u8(t2)); + s6 = vreinterpretq_s16_u16(vmovl_u8(t3)); + + __builtin_prefetch(d + 0 * dst_stride); + __builtin_prefetch(d + 1 * dst_stride); + __builtin_prefetch(d + 2 * dst_stride); + __builtin_prefetch(d + 3 * dst_stride); + __builtin_prefetch(s + 0 * src_stride); + __builtin_prefetch(s + 1 * src_stride); + __builtin_prefetch(s + 2 * src_stride); + __builtin_prefetch(s + 3 * src_stride); + + d0 = convolve4_8(s0, s1, s2, s3, filter); + d1 = convolve4_8(s1, s2, s3, s4, filter); + d2 = convolve4_8(s2, s3, s4, s5, filter); + d3 = convolve4_8(s3, s4, s5, s6, filter); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + s0 = s4; + s1 = s5; + s2 = s6; + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src += 8; + dst += 8; + w -= 8; + } while (w != 0); + } +} + +static INLINE void vpx_convolve_8tap_vert_neon(const uint8_t *src, + ptrdiff_t src_stride, + uint8_t *dst, + ptrdiff_t dst_stride, int w, + int h, const int16x8_t filter) { if (w == 4) { uint8x8_t t0, t1, t2, t3, t4, t5, t6, d01, d23; int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; @@ -530,10 +622,10 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, __builtin_prefetch(src + 2 * src_stride); __builtin_prefetch(src + 3 * src_stride); - d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters); - d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters); - d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters); - d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters); + d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter); + d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filter); + d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filter); + d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filter); d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); @@ -596,10 +688,10 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, __builtin_prefetch(s + 2 * src_stride); __builtin_prefetch(s + 3 * src_stride); - d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); - d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); - d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters); - d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); + d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter); + d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter); + d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter); + d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); @@ -621,6 +713,33 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, } } +void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + assert(y_step_q4 == 16); + + (void)x0_q4; + (void)x_step_q4; + (void)y_step_q4; + + if (vpx_get_filter_taps(filter[y0_q4]) <= 4) { + /* All 4-tap and bilinear filter values are even, so halve them to reduce + * intermediate precision requirements. + */ + const int16x4_t y_filter_4tap = vshr_n_s16(vld1_s16(filter[y0_q4] + 2), 1); + vpx_convolve_4tap_vert_neon(src - src_stride, src_stride, dst, dst_stride, + w, h, y_filter_4tap); + } else { + const int16x8_t y_filter_8tap = vld1q_s16(filter[y0_q4]); + vpx_convolve_8tap_vert_neon(src - 3 * src_stride, src_stride, dst, + dst_stride, w, h, y_filter_8tap); + } +} + void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, diff --git a/vpx_dsp/arm/vpx_convolve8_neon.h b/vpx_dsp/arm/vpx_convolve8_neon.h index 031b9eb852..f01d4f6a42 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon.h +++ b/vpx_dsp/arm/vpx_convolve8_neon.h @@ -351,6 +351,27 @@ static INLINE uint8x8_t convolve8_8_usdot(uint8x16_t samples, #endif // VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_MATMUL_INT8) +static INLINE int16x4_t convolve4_4(const int16x4_t s0, const int16x4_t s1, + const int16x4_t s2, const int16x4_t s3, + const int16x4_t filters) { + int16x4_t sum = vmul_lane_s16(s0, filters, 0); + sum = vmla_lane_s16(sum, s1, filters, 1); + sum = vmla_lane_s16(sum, s2, filters, 2); + sum = vmla_lane_s16(sum, s3, filters, 3); + return sum; +} + +static INLINE uint8x8_t convolve4_8(const int16x8_t s0, const int16x8_t s1, + const int16x8_t s2, const int16x8_t s3, + const int16x4_t filters) { + int16x8_t sum = vmulq_lane_s16(s0, filters, 0); + sum = vmlaq_lane_s16(sum, s1, filters, 1); + sum = vmlaq_lane_s16(sum, s2, filters, 2); + sum = vmlaq_lane_s16(sum, s3, filters, 3); + /* We halved the filter values so -1 from right shift. */ + return vqrshrun_n_s16(sum, FILTER_BITS - 1); +} + static INLINE int16x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, diff --git a/vpx_dsp/arm/vpx_convolve_neon.c b/vpx_dsp/arm/vpx_convolve_neon.c index 830f3176d7..57772ea668 100644 --- a/vpx_dsp/arm/vpx_convolve_neon.c +++ b/vpx_dsp/arm/vpx_convolve_neon.c @@ -12,35 +12,38 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/vpx_filter.h" #include "vpx_ports/mem.h" void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { - /* Given our constraints: w <= 64, h <= 64, taps == 8 we can reduce the + /* Given our constraints: w <= 64, h <= 64, taps <= 8 we can reduce the * maximum buffer size to 64 * 64 + 7 (+ 1 to make it divisible by 4). */ uint8_t temp[64 * 72]; - // Account for the vertical phase needing 3 lines prior and 4 lines post - // (+ 1 to make it divisible by 4). - const int intermediate_height = h + 8; + const int vert_filter_taps = vpx_get_filter_taps(filter[y0_q4]) <= 4 ? 4 : 8; + /* Account for the vertical phase needing vert_filter_taps / 2 - 1 lines prior + * and vert_filter_taps / 2 lines post. (+1 to make total divisible by 4.) */ + const int intermediate_height = h + vert_filter_taps; + const ptrdiff_t border_offset = vert_filter_taps / 2 - 1; assert(y_step_q4 == 16); assert(x_step_q4 == 16); - /* Filter starting 3 lines back. The neon implementation will ignore the given - * height and filter a multiple of 4 lines. Since this goes in to the temp - * buffer which has lots of extra room and is subsequently discarded this is - * safe if somewhat less than ideal. */ - vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w, filter, - x0_q4, x_step_q4, y0_q4, y_step_q4, w, + /* Filter starting border_offset lines back. The Neon implementation will + * ignore the given height and filter a multiple of 4 lines. Since this goes + * in to the temp buffer which has lots of extra room and is subsequently + * discarded this is safe if somewhat less than ideal. */ + vpx_convolve8_horiz_neon(src - src_stride * border_offset, src_stride, temp, + w, filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, intermediate_height); - /* Step into the temp buffer 3 lines to get the actual frame data */ - vpx_convolve8_vert_neon(temp + w * 3, w, dst, dst_stride, filter, x0_q4, - x_step_q4, y0_q4, y_step_q4, w, h); + /* Step into the temp buffer border_offset lines to get actual frame data. */ + vpx_convolve8_vert_neon(temp + w * border_offset, w, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); } void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride, From 15c2a9a02fa9049430c010c9dea2339440a03add Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Thu, 30 Nov 2023 15:37:12 -0800 Subject: [PATCH 0969/1000] Add a test for b/312517065 Bug: b/312517065 Change-Id: I6b5529a8e034fb0468f110e420fafb4944a19d0f --- test/encode_api_test.cc | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc index da3676f6bf..7ca2b5941f 100644 --- a/test/encode_api_test.cc +++ b/test/encode_api_test.cc @@ -601,7 +601,7 @@ TEST(EncodeAPI, PrevMiCheckNullptr) { encoder.Encode(false); } -// This is a test case from clusterfuzz: based on 310477034. +// This is a test case from clusterfuzz: based on b/310477034. // Encode a few frames with multiple change config call // with different frame size. TEST(EncodeAPI, MultipleChangeConfigResize) { @@ -673,6 +673,18 @@ TEST(EncodeAPI, DynamicDeadlineChange) { encoder.Encode(false); } +// This is a test case from clusterfuzz: based on b/312517065. +TEST(EncodeAPI, Buganizer312517065) { + VP9Encoder encoder(4); + encoder.Configure(0, 1060, 437, VPX_CBR, VPX_DL_REALTIME); + encoder.Encode(true); + encoder.Configure(10, 33, 437, VPX_VBR, VPX_DL_GOOD_QUALITY); + encoder.Encode(false); + encoder.Configure(6, 327, 269, VPX_VBR, VPX_DL_GOOD_QUALITY); + encoder.Configure(15, 1060, 437, VPX_CBR, VPX_DL_REALTIME); + encoder.Encode(false); +} + class EncodeApiGetTplStatsTest : public ::libvpx_test::EncoderTest, public ::testing::TestWithParam { From d144e6e95d7cb11a0161ddf2b141a0813f0b665e Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Thu, 30 Nov 2023 15:02:47 +0000 Subject: [PATCH 0970/1000] Specialise Armv8.0 Neon horiz convolution for 4-tap filters Add an Armv8.0 MLA Neon implementation of horizontal convolution specialised for executing with 4-tap filters (the most common filter size for settings --good --cpu-used=1.) This new path is also used when executing with bilinear (2-tap) filters. Change-Id: Ic2c3cb307b95964cd0ba86f1c42eece3a8ab7cf4 --- vpx_dsp/arm/vpx_convolve8_neon.c | 162 ++++++++++++++++++++++--------- 1 file changed, 118 insertions(+), 44 deletions(-) diff --git a/vpx_dsp/arm/vpx_convolve8_neon.c b/vpx_dsp/arm/vpx_convolve8_neon.c index 790c8d8352..65fb67c984 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon.c +++ b/vpx_dsp/arm/vpx_convolve8_neon.c @@ -32,33 +32,88 @@ // instructions. This optimization is much faster in speed unit test, but slowed // down the whole decoder by 5%. -void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *filter, int x0_q4, - int x_step_q4, int y0_q4, int y_step_q4, int w, - int h) { - const int16x8_t filters = vld1q_s16(filter[x0_q4]); - uint8x8_t t0, t1, t2, t3; - - assert((intptr_t)dst % 4 == 0); - assert(dst_stride % 4 == 0); - assert(x_step_q4 == 16); +static INLINE void vpx_convolve_4tap_horiz_neon(const uint8_t *src, + ptrdiff_t src_stride, + uint8_t *dst, + ptrdiff_t dst_stride, int w, + int h, const int16x4_t filter) { + if (w == 4) { + do { + int16x4_t s0[4], s1[4]; + + int16x8_t t0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src))); + s0[0] = vget_low_s16(vextq_s16(t0, t0, 0)); + s0[1] = vget_low_s16(vextq_s16(t0, t0, 1)); + s0[2] = vget_low_s16(vextq_s16(t0, t0, 2)); + s0[3] = vget_low_s16(vextq_s16(t0, t0, 3)); + + int16x8_t t1 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + src_stride))); + s1[0] = vget_low_s16(vextq_s16(t1, t1, 0)); + s1[1] = vget_low_s16(vextq_s16(t1, t1, 1)); + s1[2] = vget_low_s16(vextq_s16(t1, t1, 2)); + s1[3] = vget_low_s16(vextq_s16(t1, t1, 3)); + + int16x4_t d0 = convolve4_4(s0[0], s0[1], s0[2], s0[3], filter); + int16x4_t d1 = convolve4_4(s1[0], s1[1], s1[2], s1[3], filter); + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); + + store_u8(dst, dst_stride, d01); + + src += 2 * src_stride; + dst += 2 * dst_stride; + h -= 2; + } while (h > 0); + } else { + do { + const uint8_t *s = src; + uint8_t *d = dst; + int width = w; - (void)x_step_q4; - (void)y0_q4; - (void)y_step_q4; + do { + int16x8_t t0[2], t1[2]; + int16x8_t s0[4], s1[4]; + + t0[0] = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); + t0[1] = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s + 8))); + s0[0] = vextq_s16(t0[0], t0[1], 0); + s0[1] = vextq_s16(t0[0], t0[1], 1); + s0[2] = vextq_s16(t0[0], t0[1], 2); + s0[3] = vextq_s16(t0[0], t0[1], 3); + + t1[0] = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s + src_stride))); + t1[1] = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s + src_stride + 8))); + s1[0] = vextq_s16(t1[0], t1[1], 0); + s1[1] = vextq_s16(t1[0], t1[1], 1); + s1[2] = vextq_s16(t1[0], t1[1], 2); + s1[3] = vextq_s16(t1[0], t1[1], 3); + + uint8x8_t d0 = convolve4_8(s0[0], s0[1], s0[2], s0[3], filter); + uint8x8_t d1 = convolve4_8(s1[0], s1[1], s1[2], s1[3], filter); + + vst1_u8(d, d0); + vst1_u8(d + dst_stride, d1); + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 2 * src_stride; + dst += 2 * dst_stride; + h -= 2; + } while (h > 0); + } +} - src -= 3; +static INLINE void vpx_convolve_8tap_horiz_neon(const uint8_t *src, + ptrdiff_t src_stride, + uint8_t *dst, + ptrdiff_t dst_stride, int w, + int h, const int16x8_t filter) { + uint8x8_t t0, t1, t2, t3; if (h == 4) { uint8x8_t d01, d23; int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; - __builtin_prefetch(src + 0 * src_stride); - __builtin_prefetch(src + 1 * src_stride); - __builtin_prefetch(src + 2 * src_stride); - __builtin_prefetch(src + 3 * src_stride); - load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); transpose_u8_8x4(&t0, &t1, &t2, &t3); s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); @@ -83,10 +138,10 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); - d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters); - d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters); - d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters); - d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters); + d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter); + d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filter); + d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filter); + d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filter); d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); @@ -149,10 +204,10 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, __builtin_prefetch(src + 5 * src_stride); __builtin_prefetch(src + 6 * src_stride); __builtin_prefetch(src + 7 * src_stride); - d04 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); - d15 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); - d26 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters); - d37 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); + d04 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter); + d15 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter); + d26 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter); + d37 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter); transpose_u8_8x4(&d04, &d15, &d26, &d37); @@ -170,14 +225,6 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, int16x8_t s11, s12, s13, s14; do { - __builtin_prefetch(src + 0 * src_stride); - __builtin_prefetch(src + 1 * src_stride); - __builtin_prefetch(src + 2 * src_stride); - __builtin_prefetch(src + 3 * src_stride); - __builtin_prefetch(src + 4 * src_stride); - __builtin_prefetch(src + 5 * src_stride); - __builtin_prefetch(src + 6 * src_stride); - __builtin_prefetch(src + 7 * src_stride); load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); @@ -212,14 +259,14 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, s13 = vreinterpretq_s16_u16(vmovl_u8(t6)); s14 = vreinterpretq_s16_u16(vmovl_u8(t7)); - d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); - d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); - d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters); - d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); - d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters); - d5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters); - d6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters); - d7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters); + d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter); + d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter); + d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter); + d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter); + d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filter); + d5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filter); + d6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filter); + d7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filter); transpose_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); @@ -244,6 +291,33 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, } } +void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + assert(x_step_q4 == 16); + + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + + if (vpx_get_filter_taps(filter[x0_q4]) <= 4) { + /* All 4-tap and bilinear filter values are even, so halve them to reduce + * intermediate precision requirements. + */ + const int16x4_t x_filter_4tap = vshr_n_s16(vld1_s16(filter[x0_q4] + 2), 1); + vpx_convolve_4tap_horiz_neon(src - 1, src_stride, dst, dst_stride, w, h, + x_filter_4tap); + } else { + const int16x8_t x_filter_8tap = vld1q_s16(filter[x0_q4]); + vpx_convolve_8tap_horiz_neon(src - 3, src_stride, dst, dst_stride, w, h, + x_filter_8tap); + } +} + void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, From 845a817c056c05e8fe7ae9298be47b949d8aceee Mon Sep 17 00:00:00 2001 From: Bohan Li Date: Thu, 30 Nov 2023 15:49:38 -0800 Subject: [PATCH 0971/1000] Fix scaled reference offsets. Since the reference frame is already scaled, do not scale the offsets. BUG: b/311489136, b/312656387 Change-Id: Ib346242e7ec8c4d3ed26668fa4094271218278ed --- test/encode_api_test.cc | 59 +++++++++++++++++++++++++++++++++++ vp9/encoder/vp9_encodeframe.c | 3 +- 2 files changed, 60 insertions(+), 2 deletions(-) diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc index 7ca2b5941f..6205a56ce0 100644 --- a/test/encode_api_test.cc +++ b/test/encode_api_test.cc @@ -685,6 +685,65 @@ TEST(EncodeAPI, Buganizer312517065) { encoder.Encode(false); } +// This is a test case from clusterfuzz: based on b/311489136. +// Encode a few frames with multiple change config call +// with different frame size. +TEST(EncodeAPI, Buganizer311489136) { + VP9Encoder encoder(1); + + // Set initial config. + encoder.Configure(12, 1678, 620, VPX_VBR, VPX_DL_GOOD_QUALITY); + + // Encode first frame. + encoder.Encode(true); + + // Change config. + encoder.Configure(3, 1678, 202, VPX_CBR, VPX_DL_GOOD_QUALITY); + + // Encode 2nd frame with new config, set delta frame. + encoder.Encode(false); + + // Change config again. + encoder.Configure(8, 1037, 476, VPX_CBR, VPX_DL_REALTIME); + + // Encode 3rd frame with new config, set delta frame. + encoder.Encode(false); + + // Change config again. + encoder.Configure(0, 580, 620, VPX_CBR, VPX_DL_GOOD_QUALITY); + + // Encode 4th frame with same config, set delta frame. + encoder.Encode(false); +} + +// This is a test case from clusterfuzz: based on b/312656387. +// Encode a few frames with multiple change config call +// with different frame size. +TEST(EncodeAPI, Buganizer312656387) { + VP9Encoder encoder(1); + + // Set initial config. + encoder.Configure(16, 1, 1024, VPX_CBR, VPX_DL_REALTIME); + + // Change config. + encoder.Configure(15, 1, 1024, VPX_VBR, VPX_DL_REALTIME); + + // Encode first frame. + encoder.Encode(true); + + // Change config again. + encoder.Configure(14, 1, 595, VPX_VBR, VPX_DL_GOOD_QUALITY); + + // Encode 2nd frame with new config. + encoder.Encode(true); + + // Change config again. + encoder.Configure(2, 1, 1024, VPX_VBR, VPX_DL_GOOD_QUALITY); + + // Encode 3rd frame with new config, set delta frame. + encoder.Encode(false); +} + class EncodeApiGetTplStatsTest : public ::libvpx_test::EncoderTest, public ::testing::TestWithParam { diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index b98fd84579..ddf47c1288 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -3451,8 +3451,7 @@ static void simple_motion_search(const VP9_COMP *const cpi, MACROBLOCK *const x, assert(yv12 != NULL); if (!yv12) return; - vp9_setup_pre_planes(xd, 0, yv12, mi_row, mi_col, - &cm->frame_refs[ref - 1].sf); + vp9_setup_pre_planes(xd, 0, yv12, mi_row, mi_col, NULL); mi->ref_frame[0] = ref; mi->ref_frame[1] = NO_REF_FRAME; mi->sb_type = bsize; From 070d7e5cf339845b0c280687697f7e8dd0444098 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Thu, 30 Nov 2023 10:08:18 -0800 Subject: [PATCH 0972/1000] Document vpx_codec_decode() ignores deadline param The changes in this CL show that both the VP8 and VP9 implementations of the decode function eventually discard the deadline parameter. Change the code to ignore the deadline parameter in vpx_codec_decode() without passing it to the decode function, and document that the deadline parameter is ignored and 0 should be passed. Change-Id: Ia977e16cdbdf97901207aa2d749887980137c4c0 --- vp8/common/onyxd.h | 3 +-- vp8/decoder/onyxd_if.c | 6 +----- vp8/decoder/onyxd_int.h | 1 - vp8/vp8_dx_iface.c | 8 +++----- vp9/vp9_dx_iface.c | 10 ++++------ vpx/internal/vpx_codec_internal.h | 3 +-- vpx/src/vpx_decoder.c | 7 +++---- vpx/vpx_decoder.h | 2 ++ 8 files changed, 15 insertions(+), 25 deletions(-) diff --git a/vp8/common/onyxd.h b/vp8/common/onyxd.h index e4e81aaac5..217a598de7 100644 --- a/vp8/common/onyxd.h +++ b/vp8/common/onyxd.h @@ -41,9 +41,8 @@ void vp8dx_set_setting(struct VP8D_COMP *comp, VP8D_SETTING oxst, int x); int vp8dx_get_setting(struct VP8D_COMP *comp, VP8D_SETTING oxst); -int vp8dx_receive_compressed_data(struct VP8D_COMP *pbi, int64_t time_stamp); +int vp8dx_receive_compressed_data(struct VP8D_COMP *pbi); int vp8dx_get_raw_frame(struct VP8D_COMP *pbi, YV12_BUFFER_CONFIG *sd, - int64_t *time_stamp, int64_t *time_end_stamp, vp8_ppflags_t *flags); int vp8dx_references_buffer(struct VP8Common *oci, int ref_frame); diff --git a/vp8/decoder/onyxd_if.c b/vp8/decoder/onyxd_if.c index 765d2ec83e..2248345ba2 100644 --- a/vp8/decoder/onyxd_if.c +++ b/vp8/decoder/onyxd_if.c @@ -302,7 +302,7 @@ static int check_fragments_for_errors(VP8D_COMP *pbi) { return 1; } -int vp8dx_receive_compressed_data(VP8D_COMP *pbi, int64_t time_stamp) { +int vp8dx_receive_compressed_data(VP8D_COMP *pbi) { VP8_COMMON *cm = &pbi->common; int retcode = -1; @@ -368,14 +368,12 @@ int vp8dx_receive_compressed_data(VP8D_COMP *pbi, int64_t time_stamp) { #endif pbi->ready_for_new_data = 0; - pbi->last_time_stamp = time_stamp; decode_exit: vpx_clear_system_state(); return retcode; } int vp8dx_get_raw_frame(VP8D_COMP *pbi, YV12_BUFFER_CONFIG *sd, - int64_t *time_stamp, int64_t *time_end_stamp, vp8_ppflags_t *flags) { int ret = -1; @@ -385,8 +383,6 @@ int vp8dx_get_raw_frame(VP8D_COMP *pbi, YV12_BUFFER_CONFIG *sd, if (pbi->common.show_frame == 0) return ret; pbi->ready_for_new_data = 1; - *time_stamp = pbi->last_time_stamp; - *time_end_stamp = 0; #if CONFIG_POSTPROC ret = vp8_post_proc_frame(&pbi->common, sd, flags); diff --git a/vp8/decoder/onyxd_int.h b/vp8/decoder/onyxd_int.h index 56500a8506..1070849620 100644 --- a/vp8/decoder/onyxd_int.h +++ b/vp8/decoder/onyxd_int.h @@ -99,7 +99,6 @@ typedef struct VP8D_COMP { /* end of threading data */ #endif - int64_t last_time_stamp; int ready_for_new_data; vp8_prob prob_intra; diff --git a/vp8/vp8_dx_iface.c b/vp8/vp8_dx_iface.c index 2e5d6dcfe8..e81deaf4ea 100644 --- a/vp8/vp8_dx_iface.c +++ b/vp8/vp8_dx_iface.c @@ -276,7 +276,7 @@ static int update_fragments(vpx_codec_alg_priv_t *ctx, const uint8_t *data, static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx, const uint8_t *data, unsigned int data_sz, - void *user_priv, long deadline) { + void *user_priv) { volatile vpx_codec_err_t res; volatile unsigned int resolution_change = 0; volatile unsigned int w, h; @@ -508,7 +508,7 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx, pbi->restart_threads = 0; #endif ctx->user_priv = user_priv; - if (vp8dx_receive_compressed_data(pbi, deadline)) { + if (vp8dx_receive_compressed_data(pbi)) { res = update_error_state(ctx, &pbi->common.error); } @@ -529,7 +529,6 @@ static vpx_image_t *vp8_get_frame(vpx_codec_alg_priv_t *ctx, */ if (!(*iter) && ctx->yv12_frame_buffers.pbi[0]) { YV12_BUFFER_CONFIG sd; - int64_t time_stamp = 0, time_end_stamp = 0; vp8_ppflags_t flags; vp8_zero(flags); @@ -539,8 +538,7 @@ static vpx_image_t *vp8_get_frame(vpx_codec_alg_priv_t *ctx, flags.noise_level = ctx->postproc_cfg.noise_level; } - if (0 == vp8dx_get_raw_frame(ctx->yv12_frame_buffers.pbi[0], &sd, - &time_stamp, &time_end_stamp, &flags)) { + if (0 == vp8dx_get_raw_frame(ctx->yv12_frame_buffers.pbi[0], &sd, &flags)) { yuvconfig2image(&ctx->img, &sd, ctx->user_priv); img = &ctx->img; diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c index a242c776cd..860f721dc5 100644 --- a/vp9/vp9_dx_iface.c +++ b/vp9/vp9_dx_iface.c @@ -305,9 +305,7 @@ static INLINE void check_resync(vpx_codec_alg_priv_t *const ctx, static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t *ctx, const uint8_t **data, unsigned int data_sz, - void *user_priv, int64_t deadline) { - (void)deadline; - + void *user_priv) { // Determine the stream parameters. Note that we rely on peek_si to // validate that we have a buffer that does not wrap around the top // of the heap. @@ -342,7 +340,7 @@ static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t *ctx, static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx, const uint8_t *data, unsigned int data_sz, - void *user_priv, long deadline) { + void *user_priv) { const uint8_t *data_start = data; vpx_codec_err_t res; uint32_t frame_sizes[8]; @@ -382,7 +380,7 @@ static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx, return VPX_CODEC_CORRUPT_FRAME; } - res = decode_one(ctx, &data_start_copy, frame_size, user_priv, deadline); + res = decode_one(ctx, &data_start_copy, frame_size, user_priv); if (res != VPX_CODEC_OK) return res; data_start += frame_size; @@ -391,7 +389,7 @@ static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx, const uint8_t *const data_end = data + data_sz; while (data_start < data_end) { const uint32_t frame_size = (uint32_t)(data_end - data_start); - res = decode_one(ctx, &data_start, frame_size, user_priv, deadline); + res = decode_one(ctx, &data_start, frame_size, user_priv); if (res != VPX_CODEC_OK) return res; // Account for suboptimal termination by the encoder. diff --git a/vpx/internal/vpx_codec_internal.h b/vpx/internal/vpx_codec_internal.h index aae3218738..99c1b3d110 100644 --- a/vpx/internal/vpx_codec_internal.h +++ b/vpx/internal/vpx_codec_internal.h @@ -198,8 +198,7 @@ typedef const struct vpx_codec_ctrl_fn_map { typedef vpx_codec_err_t (*vpx_codec_decode_fn_t)(vpx_codec_alg_priv_t *ctx, const uint8_t *data, unsigned int data_sz, - void *user_priv, - long deadline); + void *user_priv); /*!\brief Decoded frames iterator * diff --git a/vpx/src/vpx_decoder.c b/vpx/src/vpx_decoder.c index 427cd1bf43..c79cc708cd 100644 --- a/vpx/src/vpx_decoder.c +++ b/vpx/src/vpx_decoder.c @@ -105,6 +105,7 @@ vpx_codec_err_t vpx_codec_decode(vpx_codec_ctx_t *ctx, const uint8_t *data, unsigned int data_sz, void *user_priv, long deadline) { vpx_codec_err_t res; + (void)deadline; /* Sanity checks */ /* NULL data ptr allowed if data_sz is 0 too */ @@ -112,10 +113,8 @@ vpx_codec_err_t vpx_codec_decode(vpx_codec_ctx_t *ctx, const uint8_t *data, res = VPX_CODEC_INVALID_PARAM; else if (!ctx->iface || !ctx->priv) res = VPX_CODEC_ERROR; - else { - res = ctx->iface->dec.decode(get_alg_priv(ctx), data, data_sz, user_priv, - deadline); - } + else + res = ctx->iface->dec.decode(get_alg_priv(ctx), data, data_sz, user_priv); return SAVE_STATUS(ctx, res); } diff --git a/vpx/vpx_decoder.h b/vpx/vpx_decoder.h index 99dd8cf694..3242692e7e 100644 --- a/vpx/vpx_decoder.h +++ b/vpx/vpx_decoder.h @@ -205,6 +205,8 @@ vpx_codec_err_t vpx_codec_get_stream_info(vpx_codec_ctx_t *ctx, * this frame. * \param[in] deadline Soft deadline the decoder should attempt to meet, * in us. Set to zero for unlimited. + * NOTE: The deadline parameter is ignored. Always + * pass 0. * * \return Returns #VPX_CODEC_OK if the coded data was processed completely * and future pictures can be decoded without error. Otherwise, From 5cad6fdc9280ee656e91905250e7675e694a1c24 Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 1 Dec 2023 12:13:01 -0800 Subject: [PATCH 0973/1000] CHANGELOG: add CVE for issue #1642 CVE-2023-6349 was reserved for this issue. It's not yet published. Bug: webm:1642, b:302710624 Change-Id: Iaab2a0bcae449a45e35678f5c049413fe0a4d2a4 --- CHANGELOG | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG b/CHANGELOG index 5a8605a73d..21070785ed 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -11,7 +11,7 @@ - Bug fixes: https://crbug.com/1486441 (CVE-2023-5217) - Fix to a crash related to VP9 encoding (#1642) + Fix to a crash related to VP9 encoding (#1642, CVE-2023-6349) 2023-01-31 v1.13.0 "Ugly Duckling" This release includes more Neon and AVX2 optimizations, adds a new codec From a9f1bfdb8e93a742da9a14d4a9d3b1d847edd70d Mon Sep 17 00:00:00 2001 From: Bohan Li Date: Thu, 30 Nov 2023 16:18:25 -0800 Subject: [PATCH 0974/1000] Fix edge case when downsizing to one. BUG: b/310329177 Change-Id: I2ebf4165adbc7351d6cc73554827812dedc4d362 --- test/encode_api_test.cc | 19 +++++++++++++++++++ vp9/encoder/vp9_resize.c | 6 ++++++ 2 files changed, 25 insertions(+) diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc index 6205a56ce0..030b376949 100644 --- a/test/encode_api_test.cc +++ b/test/encode_api_test.cc @@ -744,6 +744,25 @@ TEST(EncodeAPI, Buganizer312656387) { encoder.Encode(false); } +// This is a test case from clusterfuzz: based on b/310329177. +// Encode a few frames with multiple change config call +// with different frame size. +TEST(EncodeAPI, Buganizer310329177) { + VP9Encoder encoder(6); + + // Set initial config. + encoder.Configure(10, 41, 1, VPX_VBR, VPX_DL_REALTIME); + + // Encode first frame. + encoder.Encode(true); + + // Change config. + encoder.Configure(16, 1, 1, VPX_VBR, VPX_DL_REALTIME); + + // Encode 2nd frame with new config, set delta frame. + encoder.Encode(false); +} + class EncodeApiGetTplStatsTest : public ::libvpx_test::EncoderTest, public ::testing::TestWithParam { diff --git a/vp9/encoder/vp9_resize.c b/vp9/encoder/vp9_resize.c index 7486dee25b..ca55ec9886 100644 --- a/vp9/encoder/vp9_resize.c +++ b/vp9/encoder/vp9_resize.c @@ -360,6 +360,12 @@ static int get_down2_steps(int in_length, int out_length) { while ((proj_in_length = get_down2_length(in_length, 1)) >= out_length) { ++steps; in_length = proj_in_length; + if (in_length == 1) { + // Special case: we break because any further calls to get_down2_length() + // with be with length == 1, which return 1, resulting in an infinite + // loop. + break; + } } return steps; } From bf0755418357237f6ea4794dfab3c474d06a0937 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Thu, 30 Nov 2023 14:27:06 -0800 Subject: [PATCH 0975/1000] Add the needed Android API level predicates. fseeko and ftello are available on Android only from API level 24. Add the needed guards for these functions. Suggested by Yifan Yang. Change-Id: I3a6721d31e1d961ab10b434ea6e92959bd5a70ab --- tools_common.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tools_common.h b/tools_common.h index 9850907c15..e2942d04b8 100644 --- a/tools_common.h +++ b/tools_common.h @@ -32,7 +32,12 @@ typedef int64_t FileOffset; #define fseeko fseeko64 #define ftello ftello64 typedef off64_t FileOffset; -#elif CONFIG_OS_SUPPORT +#elif CONFIG_OS_SUPPORT && \ + !(defined(__ANDROID__) && __ANDROID_API__ < 24 && !defined(__LP64__) && \ + defined(_FILE_OFFSET_BITS) && _FILE_OFFSET_BITS == 64) +/* POSIX.1 has fseeko and ftello. fseeko and ftello are not available before + * Android API level 24. See + * https://android.googlesource.com/platform/bionic/+/main/docs/32-bit-abi.md */ #include /* NOLINT */ typedef off_t FileOffset; /* Use 32-bit file operations in WebM file format when building ARM From 5dcb4c17402ddae13afe5cb115bad09935fdd3d1 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Wed, 29 Nov 2023 12:42:38 -0800 Subject: [PATCH 0976/1000] Define VPX_DL_* macros as unsigned long constants Define the VPX_DL_REALTIME, VPX_DL_GOOD_QUALITY, and VPX_DL_BEST_QUALITY macros as unsigned long, because the deadline parameter of vpx_codec_encode() is of the unsigned long type. This enables C++ templates to deduce the unsigned long type from these macros. Change-Id: I2173e3bbf5e15c84c11843790df93a497a35ed7d --- vpx/vpx_encoder.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vpx/vpx_encoder.h b/vpx/vpx_encoder.h index efb1be6f12..f3799d674c 100644 --- a/vpx/vpx_encoder.h +++ b/vpx/vpx_encoder.h @@ -986,11 +986,11 @@ vpx_fixed_buf_t *vpx_codec_get_global_headers(vpx_codec_ctx_t *ctx); */ typedef unsigned long vpx_enc_deadline_t; /*!\brief deadline parameter analogous to VPx REALTIME mode. */ -#define VPX_DL_REALTIME (1) +#define VPX_DL_REALTIME 1ul /*!\brief deadline parameter analogous to VPx GOOD QUALITY mode. */ -#define VPX_DL_GOOD_QUALITY (1000000) +#define VPX_DL_GOOD_QUALITY 1000000ul /*!\brief deadline parameter analogous to VPx BEST QUALITY mode. */ -#define VPX_DL_BEST_QUALITY (0) +#define VPX_DL_BEST_QUALITY 0ul /*!\brief Encode a frame * * Encodes a video frame at the given "presentation time." The presentation From f10481dc0a49b5d53428560de2a2eee43f9ed60f Mon Sep 17 00:00:00 2001 From: Angie Chiang Date: Wed, 22 Nov 2023 17:09:14 -0800 Subject: [PATCH 0977/1000] Set skip_recode=0 in nonrd_pick_sb_modes Need to set skip_recode properly so that vp9_encode_block_intra() can work properly when it is called by block_rd_txfm(). We can not skip "recode" because it is still at the rd search stage. Bug: b/310340241 Change-Id: I7d7600ef72addd341636549c2dad1868ad90e1cb --- test/encode_api_test.cc | 19 +++++++++++++++++++ vp9/encoder/vp9_encodeframe.c | 2 ++ 2 files changed, 21 insertions(+) diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc index 785875c229..4ef83d75b3 100644 --- a/test/encode_api_test.cc +++ b/test/encode_api_test.cc @@ -673,6 +673,25 @@ TEST(EncodeAPI, DynamicDeadlineChange) { encoder.Encode(false); } +TEST(EncodeAPI, Buganizer310340241) { + VP9Encoder encoder(-6); + + // Set initial config, in particular set deadline to GOOD mode. + encoder.Configure(0, 1, 1, VPX_VBR, VPX_DL_GOOD_QUALITY); + + // Encode 1st frame. + encoder.Encode(true); + + // Encode 2nd frame, delta frame. + encoder.Encode(false); + + // Change config: change deadline to REALTIME. + encoder.Configure(0, 1, 1, VPX_VBR, VPX_DL_REALTIME); + + // Encode 3rd frame with new config, set key frame. + encoder.Encode(true); +} + // This is a test case from clusterfuzz: based on b/312517065. TEST(EncodeAPI, Buganizer312517065) { VP9Encoder encoder(4); diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index ddf47c1288..d6ac04400d 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -4714,6 +4714,8 @@ static void nonrd_pick_sb_modes(VP9_COMP *cpi, TileDataEnc *tile_data, set_segment_index(cpi, x, mi_row, mi_col, bsize, 0); + x->skip_recode = 0; + mi = xd->mi[0]; mi->sb_type = bsize; From db83435afbeee1a31a9e6cdeca38407ebc724bc8 Mon Sep 17 00:00:00 2001 From: Gerda Zsejke More Date: Tue, 28 Nov 2023 11:32:42 +0100 Subject: [PATCH 0978/1000] configure: Add darwin23 support Add target arm64-darwin23-gcc, x86_64-darwin23-gcc for MacOS 14. Change-Id: I6b68a6a61d51aaa78ec11a5055bb95ce77a81d9c --- build/make/configure.sh | 2 +- configure | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/build/make/configure.sh b/build/make/configure.sh index 54fb1daf4d..869793a296 100644 --- a/build/make/configure.sh +++ b/build/make/configure.sh @@ -792,7 +792,7 @@ process_common_toolchain() { tgt_isa=x86_64 tgt_os=`echo $gcctarget | sed 's/.*\(darwin1[0-9]\).*/\1/'` ;; - *darwin2[0-2]*) + *darwin2[0-3]*) tgt_isa=`uname -m` tgt_os=`echo $gcctarget | sed 's/.*\(darwin2[0-9]\).*/\1/'` ;; diff --git a/configure b/configure index 6b910160a8..b212e0709d 100755 --- a/configure +++ b/configure @@ -102,6 +102,7 @@ all_platforms="${all_platforms} arm64-darwin-gcc" all_platforms="${all_platforms} arm64-darwin20-gcc" all_platforms="${all_platforms} arm64-darwin21-gcc" all_platforms="${all_platforms} arm64-darwin22-gcc" +all_platforms="${all_platforms} arm64-darwin23-gcc" all_platforms="${all_platforms} arm64-linux-gcc" all_platforms="${all_platforms} arm64-win64-gcc" all_platforms="${all_platforms} arm64-win64-vs15" @@ -165,6 +166,7 @@ all_platforms="${all_platforms} x86_64-darwin19-gcc" all_platforms="${all_platforms} x86_64-darwin20-gcc" all_platforms="${all_platforms} x86_64-darwin21-gcc" all_platforms="${all_platforms} x86_64-darwin22-gcc" +all_platforms="${all_platforms} x86_64-darwin23-gcc" all_platforms="${all_platforms} x86_64-iphonesimulator-gcc" all_platforms="${all_platforms} x86_64-linux-gcc" all_platforms="${all_platforms} x86_64-linux-icc" From 9ad598f249ee5af0ad211797063fba6de8d1ff80 Mon Sep 17 00:00:00 2001 From: Bohan Li Date: Mon, 4 Dec 2023 13:12:46 -0800 Subject: [PATCH 0979/1000] Improve test comments. Change-Id: I42dddb946193e30cf07e39b43eaad051c5da479a --- test/encode_api_test.cc | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc index 4ef83d75b3..65501ee8ca 100644 --- a/test/encode_api_test.cc +++ b/test/encode_api_test.cc @@ -602,8 +602,8 @@ TEST(EncodeAPI, PrevMiCheckNullptr) { } // This is a test case from clusterfuzz: based on b/310477034. -// Encode a few frames with multiple change config call -// with different frame size. +// Encode a few frames with multiple change config calls +// with different frame sizes. TEST(EncodeAPI, MultipleChangeConfigResize) { VP9Encoder encoder(3); @@ -705,8 +705,8 @@ TEST(EncodeAPI, Buganizer312517065) { } // This is a test case from clusterfuzz: based on b/311489136. -// Encode a few frames with multiple change config call -// with different frame size. +// Encode a few frames with multiple change config calls +// with different frame sizes. TEST(EncodeAPI, Buganizer311489136) { VP9Encoder encoder(1); @@ -736,8 +736,8 @@ TEST(EncodeAPI, Buganizer311489136) { } // This is a test case from clusterfuzz: based on b/312656387. -// Encode a few frames with multiple change config call -// with different frame size. +// Encode a few frames with multiple change config calls +// with different frame sizes. TEST(EncodeAPI, Buganizer312656387) { VP9Encoder encoder(1); @@ -764,8 +764,8 @@ TEST(EncodeAPI, Buganizer312656387) { } // This is a test case from clusterfuzz: based on b/310329177. -// Encode a few frames with multiple change config call -// with different frame size. +// Encode a few frames with multiple change config calls +// with different frame sizes. TEST(EncodeAPI, Buganizer310329177) { VP9Encoder encoder(6); From 8bf3649d410cd68076e532e697f34dcec3f87ce7 Mon Sep 17 00:00:00 2001 From: Yunqing Wang Date: Mon, 4 Dec 2023 17:32:51 -0800 Subject: [PATCH 0980/1000] Fix a bug in frame scaling This change fixed a corner case bug reealed by b/311394513. During the frame scaling, vpx_highbd_convolve8() and vpx_scaled_2d() requires both x_step_q4 and y_step_q4 are less than or equal to a defined value. Otherwise, it needs to call vp9_scale_and_extend_ frame_nonnormative() that supports arbitrary scaling. The fix was done in LBD and HBD funnctions. Bug: b/311394513 Change-Id: Id0d34e7910ec98859030ef968ac19331488046d4 --- test/encode_api_test.cc | 19 +++++++++++++++++++ vp9/encoder/vp9_encoder.c | 31 ++++++++++++++++++++++++------- vp9/encoder/vp9_encoder.h | 8 ++++++++ vp9/encoder/vp9_frame_scale.c | 18 ++++++++++++++++++ 4 files changed, 69 insertions(+), 7 deletions(-) diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc index 65501ee8ca..f046a9db39 100644 --- a/test/encode_api_test.cc +++ b/test/encode_api_test.cc @@ -782,6 +782,25 @@ TEST(EncodeAPI, Buganizer310329177) { encoder.Encode(false); } +// This is a test case from clusterfuzz: based on b/311394513. +// Encode a few frames with multiple change config calls +// with different frame sizes. +TEST(EncodeAPI, Buganizer311394513) { + VP9Encoder encoder(-7); + + // Set initial config. + encoder.Configure(0, 5, 9, VPX_VBR, VPX_DL_REALTIME); + + // Encode first frame. + encoder.Encode(false); + + // Change config. + encoder.Configure(5, 2, 1, VPX_VBR, VPX_DL_REALTIME); + + // Encode 2nd frame with new config. + encoder.Encode(true); +} + class EncodeApiGetTplStatsTest : public ::libvpx_test::EncoderTest, public ::testing::TestWithParam { diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index be55150140..20e35077cd 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -3079,12 +3079,11 @@ void vp9_write_yuv_rec_frame(VP9_COMMON *cm) { #endif #if CONFIG_VP9_HIGHBITDEPTH -static void scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src, - YV12_BUFFER_CONFIG *dst, - int bd) { +void vp9_scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst, int bd) { #else -static void scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src, - YV12_BUFFER_CONFIG *dst) { +void vp9_scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst) { #endif // CONFIG_VP9_HIGHBITDEPTH // TODO(dkovalev): replace YV12_BUFFER_CONFIG with vpx_image_t int i; @@ -3129,6 +3128,23 @@ static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src, const int src_h = src->y_crop_height; const int dst_w = dst->y_crop_width; const int dst_h = dst->y_crop_height; + + // The issue b/311394513 reveals a corner case bug. + // For bd = 8, vpx_scaled_2d() requires both x_step_q4 and y_step_q4 are less + // than or equal to 64. For bd >= 10, vpx_highbd_convolve8() requires both + // x_step_q4 and y_step_q4 are less than or equal to 32. If this condition + // isn't met, it needs to call vp9_scale_and_extend_frame_nonnormative() that + // supports arbitrary scaling. + const int x_step_q4 = 16 * src_w / dst_w; + const int y_step_q4 = 16 * src_h / dst_h; + const int is_arbitrary_scaling = + (bd == 8 && (x_step_q4 > 64 || y_step_q4 > 64)) || + (bd >= 10 && (x_step_q4 > 32 || y_step_q4 > 32)); + if (is_arbitrary_scaling) { + vp9_scale_and_extend_frame_nonnormative(src, dst, bd); + return; + } + const uint8_t *const srcs[3] = { src->y_buffer, src->u_buffer, src->v_buffer }; const int src_strides[3] = { src->y_stride, src->uv_stride, src->uv_stride }; @@ -4993,13 +5009,14 @@ YV12_BUFFER_CONFIG *vp9_scale_if_required( scale_and_extend_frame(unscaled, scaled, (int)cm->bit_depth, filter_type, phase_scaler); else - scale_and_extend_frame_nonnormative(unscaled, scaled, (int)cm->bit_depth); + vp9_scale_and_extend_frame_nonnormative(unscaled, scaled, + (int)cm->bit_depth); #else if (use_normative_scaler && unscaled->y_width <= (scaled->y_width << 1) && unscaled->y_height <= (scaled->y_height << 1)) vp9_scale_and_extend_frame(unscaled, scaled, filter_type, phase_scaler); else - scale_and_extend_frame_nonnormative(unscaled, scaled); + vp9_scale_and_extend_frame_nonnormative(unscaled, scaled); #endif // CONFIG_VP9_HIGHBITDEPTH return scaled; } else { diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 160de0064f..83b7081e7b 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -1382,6 +1382,14 @@ void vp9_get_ref_frame_info(FRAME_UPDATE_TYPE update_type, int ref_frame_flags, void vp9_set_high_precision_mv(VP9_COMP *cpi, int allow_high_precision_mv); +#if CONFIG_VP9_HIGHBITDEPTH +void vp9_scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst, int bd); +#else +void vp9_scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst); +#endif // CONFIG_VP9_HIGHBITDEPTH + YV12_BUFFER_CONFIG *vp9_svc_twostage_scale( VP9_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, YV12_BUFFER_CONFIG *scaled, YV12_BUFFER_CONFIG *scaled_temp, INTERP_FILTER filter_type, diff --git a/vp9/encoder/vp9_frame_scale.c b/vp9/encoder/vp9_frame_scale.c index a410d0407f..22b3f05579 100644 --- a/vp9/encoder/vp9_frame_scale.c +++ b/vp9/encoder/vp9_frame_scale.c @@ -12,6 +12,7 @@ #include "./vpx_dsp_rtcd.h" #include "./vpx_scale_rtcd.h" #include "vp9/common/vp9_blockd.h" +#include "vp9/encoder/vp9_encoder.h" #include "vpx_dsp/vpx_filter.h" #include "vpx_scale/yv12config.h" @@ -91,6 +92,23 @@ void vp9_scale_and_extend_frame_c(const YV12_BUFFER_CONFIG *src, { const int dst_w = dst->y_crop_width; const int dst_h = dst->y_crop_height; + + // The issue b/311394513 reveals a corner case bug. vpx_scaled_2d() requires + // both x_step_q4 and y_step_q4 are less than or equal to 64. Otherwise, it + // needs to call vp9_scale_and_extend_frame_nonnormative() that supports + // arbitrary scaling. + const int x_step_q4 = 16 * src_w / dst_w; + const int y_step_q4 = 16 * src_h / dst_h; + if (x_step_q4 > 64 || y_step_q4 > 64) { + // This funnction is only called while cm->bit_depth is VPX_BITS_8. +#if CONFIG_VP9_HIGHBITDEPTH + vp9_scale_and_extend_frame_nonnormative(src, dst, (int)VPX_BITS_8); +#else + vp9_scale_and_extend_frame_nonnormative(src, dst); +#endif // CONFIG_VP9_HIGHBITDEPTH + return; + } + for (i = 0; i < MAX_MB_PLANE; ++i) { const int factor = (i == 0 || i == 3 ? 1 : 2); const int src_stride = src_strides[i]; From 7cfc58de4894153b20a144ed142956ac2f2e7aa6 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Tue, 5 Dec 2023 13:45:05 -0500 Subject: [PATCH 0981/1000] RTC RC: add screen content support for vp8 Bug: b/281463780 Change-Id: I446c00bf8d794aa9134e4fe37960dd8a465448a4 --- test/vp8_ratectrl_rtc_test.cc | 21 +++++++++++++++++++++ vp8/vp8_ratectrl_rtc.cc | 2 +- 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/test/vp8_ratectrl_rtc_test.cc b/test/vp8_ratectrl_rtc_test.cc index 9fbc1d4d98..50478f7635 100644 --- a/test/vp8_ratectrl_rtc_test.cc +++ b/test/vp8_ratectrl_rtc_test.cc @@ -45,6 +45,7 @@ struct Vp8RCTestVideo { const Vp8RCTestVideo kVp8RCTestVectors[] = { Vp8RCTestVideo("niklas_640_480_30.yuv", 640, 480, 470), Vp8RCTestVideo("desktop_office1.1280_720-020.yuv", 1280, 720, 300), + Vp8RCTestVideo("hantro_collage_w352h288.yuv", 352, 288, 100), }; class Vp8RcInterfaceTest @@ -128,6 +129,9 @@ class Vp8RcInterfaceTest encoder->Control(VP8E_SET_CPUUSED, -6); encoder->Control(VP8E_SET_RTC_EXTERNAL_RATECTRL, 1); encoder->Control(VP8E_SET_MAX_INTRA_BITRATE_PCT, 1000); + if (rc_cfg_.is_screen) { + encoder->Control(VP8E_SET_SCREEN_CONTENT_MODE, 1); + } } else if (frame_params_.frame_type == libvpx::RcFrameType::kInterFrame) { // Disable golden frame update. frame_flags_ |= VP8_EFLAG_NO_UPD_GF; @@ -171,6 +175,21 @@ class Vp8RcInterfaceTest ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); } + void RunOneLayerScreen() { + test_video_ = GET_PARAM(2); + target_bitrate_ = GET_PARAM(1); + SetConfig(); + rc_cfg_.is_screen = true; + rc_api_ = libvpx::VP8RateControlRTC::Create(rc_cfg_); + ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_)); + + ::libvpx_test::I420VideoSource video(test_video_.name, test_video_.width, + test_video_.height, 30, 1, 0, + test_video_.frames); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + } + void RunOneLayerDropFrames() { test_video_ = GET_PARAM(2); target_bitrate_ = GET_PARAM(1); @@ -377,6 +396,8 @@ class Vp8RcInterfaceTest TEST_P(Vp8RcInterfaceTest, OneLayer) { RunOneLayer(); } +TEST_P(Vp8RcInterfaceTest, OneLayerScreen) { RunOneLayerScreen(); } + TEST_P(Vp8RcInterfaceTest, OneLayerDropFrames) { RunOneLayerDropFrames(); } TEST_P(Vp8RcInterfaceTest, OneLayerPeriodicKey) { RunPeriodicKey(); } diff --git a/vp8/vp8_ratectrl_rtc.cc b/vp8/vp8_ratectrl_rtc.cc index dd3c8e623b..261c316fd1 100644 --- a/vp8/vp8_ratectrl_rtc.cc +++ b/vp8/vp8_ratectrl_rtc.cc @@ -141,7 +141,7 @@ bool VP8RateControlRTC::UpdateRateControl( cpi_->prior_key_frame_distance[i] = static_cast(cpi_->output_framerate); } - + oxcf->screen_content_mode = rc_cfg.is_screen; if (oxcf->number_of_layers > 1 || prev_number_of_layers > 1) { memcpy(oxcf->target_bitrate, rc_cfg.layer_target_bitrate, sizeof(rc_cfg.layer_target_bitrate)); From 97184161d5c93e5e69d0de6b064b005e5c82d342 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Tue, 5 Dec 2023 10:55:16 -0800 Subject: [PATCH 0982/1000] Add "IWYU pragma: export" to some public headers vpx/vpx_integer.h is clearly intended as the facade header for the Standard C Library headers , , and . It is reasonable to expect that vpx/vpx_decoder.h and vpx/vpx_encoder.h should provide the symbols from vpx/vpx_codec.h. Change-Id: I220797e63b2efc3dd9e2ac197fe2f918bf80d247 --- vpx/vpx_decoder.h | 2 +- vpx/vpx_encoder.h | 2 +- vpx/vpx_integer.h | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/vpx/vpx_decoder.h b/vpx/vpx_decoder.h index 3242692e7e..0e9611e31f 100644 --- a/vpx/vpx_decoder.h +++ b/vpx/vpx_decoder.h @@ -29,7 +29,7 @@ extern "C" { #endif -#include "./vpx_codec.h" +#include "./vpx_codec.h" // IWYU pragma: export #include "./vpx_frame_buffer.h" /*!\brief Current ABI version number diff --git a/vpx/vpx_encoder.h b/vpx/vpx_encoder.h index f3799d674c..18e3862bd7 100644 --- a/vpx/vpx_encoder.h +++ b/vpx/vpx_encoder.h @@ -29,7 +29,7 @@ extern "C" { #endif -#include "./vpx_codec.h" +#include "./vpx_codec.h" // IWYU pragma: export #include "./vpx_ext_ratectrl.h" #include "./vpx_tpl.h" diff --git a/vpx/vpx_integer.h b/vpx/vpx_integer.h index 4129d156f8..34e3796411 100644 --- a/vpx/vpx_integer.h +++ b/vpx/vpx_integer.h @@ -12,7 +12,7 @@ #define VPX_VPX_VPX_INTEGER_H_ /* get ptrdiff_t, size_t, wchar_t, NULL */ -#include +#include // IWYU pragma: export #if defined(_MSC_VER) #define VPX_FORCE_INLINE __forceinline @@ -34,7 +34,7 @@ #endif #endif // __cplusplus -#include -#include +#include // IWYU pragma: export +#include // IWYU pragma: export #endif // VPX_VPX_VPX_INTEGER_H_ From 12e928cb342d1d4cc8ac3c71b26da45e4488cb88 Mon Sep 17 00:00:00 2001 From: Marco Paniconi Date: Tue, 5 Dec 2023 11:59:15 -0800 Subject: [PATCH 0983/1000] Add unittest for issue b/314857577 Bug: b/314857577 Change-Id: I591036c1ad3362023686d395adb4783c51baa62d --- test/encode_api_test.cc | 49 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc index f046a9db39..08e7539272 100644 --- a/test/encode_api_test.cc +++ b/test/encode_api_test.cc @@ -801,6 +801,55 @@ TEST(EncodeAPI, Buganizer311394513) { encoder.Encode(true); } +// This is a test case from clusterfuzz: based on b/314857577. +// Encode a few frames with multiple change config calls +// with different frame sizes. +TEST(EncodeAPI, Buganizer314857577) { + VP9Encoder encoder(4); + + // Set initial config. + encoder.Configure(12, 1060, 437, VPX_VBR, VPX_DL_REALTIME); + + // Encode first frame. + encoder.Encode(false); + + // Change config. + encoder.Configure(16, 1060, 1, VPX_CBR, VPX_DL_REALTIME); + + // Encode 2nd frame with new config. + encoder.Encode(false); + + // Encode 3rd frame with new config. + encoder.Encode(true); + + // Change config. + encoder.Configure(15, 33, 437, VPX_VBR, VPX_DL_GOOD_QUALITY); + + // Encode 4th frame with new config. + encoder.Encode(true); + + // Encode 5th frame with new config. + encoder.Encode(false); + + // Change config. + encoder.Configure(5, 327, 269, VPX_VBR, VPX_DL_REALTIME); + + // Change config. + encoder.Configure(15, 1060, 437, VPX_CBR, VPX_DL_REALTIME); + + // Encode 6th frame with new config. + encoder.Encode(false); + + // Encode 7th frame with new config. + encoder.Encode(false); + + // Change config. + encoder.Configure(4, 1060, 437, VPX_VBR, VPX_DL_REALTIME); + + // Encode 8th frame with new config. + encoder.Encode(false); +} + class EncodeApiGetTplStatsTest : public ::libvpx_test::EncoderTest, public ::testing::TestWithParam { From 4c2435c33e12a72640e96262f982a9f5f5c513cd Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Tue, 5 Dec 2023 10:46:52 -0500 Subject: [PATCH 0984/1000] Fix several clang-tidy complaints Change-Id: I78721d6b7ed692ad9363b5cac4e3324a3136d5b6 --- test/encode_api_test.cc | 2 +- test/sum_squares_test.cc | 1 + vp9/encoder/vp9_encodeframe.c | 1 + vpx_dsp/arm/highbd_sse_neon.c | 1 + vpx_dsp/arm/sse_neon.c | 2 ++ vpx_dsp/sse.c | 1 + vpx_dsp/vpx_dsp_rtcd_defs.pl | 2 +- vpx_dsp/x86/sse_avx2.c | 3 ++- 8 files changed, 10 insertions(+), 3 deletions(-) diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc index 08e7539272..99f38c3af9 100644 --- a/test/encode_api_test.cc +++ b/test/encode_api_test.cc @@ -517,7 +517,7 @@ vpx_image_t *CreateImage(const unsigned int width, const unsigned int height) { // Emulates the WebCodecs VideoEncoder interface. class VP9Encoder { public: - VP9Encoder(int speed) : speed_(speed) {} + explicit VP9Encoder(int speed) : speed_(speed) {} ~VP9Encoder(); void Configure(unsigned int threads, unsigned int width, unsigned int height, diff --git a/test/sum_squares_test.cc b/test/sum_squares_test.cc index 725d5eb853..d3c76a34d2 100644 --- a/test/sum_squares_test.cc +++ b/test/sum_squares_test.cc @@ -9,6 +9,7 @@ */ #include +#include #include #include #include diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index d6ac04400d..7ab183e28a 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -48,6 +48,7 @@ #include "vp9/encoder/vp9_encodeframe.h" #include "vp9/encoder/vp9_encodemb.h" #include "vp9/encoder/vp9_encodemv.h" +#include "vp9/encoder/vp9_encoder.h" #include "vp9/encoder/vp9_ethread.h" #include "vp9/encoder/vp9_extend.h" #include "vp9/encoder/vp9_multi_thread.h" diff --git a/vpx_dsp/arm/highbd_sse_neon.c b/vpx_dsp/arm/highbd_sse_neon.c index 717ad6b19a..8e389df8b8 100644 --- a/vpx_dsp/arm/highbd_sse_neon.c +++ b/vpx_dsp/arm/highbd_sse_neon.c @@ -9,6 +9,7 @@ */ #include +#include #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/arm/sum_neon.h" diff --git a/vpx_dsp/arm/sse_neon.c b/vpx_dsp/arm/sse_neon.c index 0b4a6e504a..75ee7e7816 100644 --- a/vpx_dsp/arm/sse_neon.c +++ b/vpx_dsp/arm/sse_neon.c @@ -9,7 +9,9 @@ */ #include +#include +#include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/sum_neon.h" diff --git a/vpx_dsp/sse.c b/vpx_dsp/sse.c index 6cb4b705f8..c9d751859d 100644 --- a/vpx_dsp/sse.c +++ b/vpx_dsp/sse.c @@ -19,6 +19,7 @@ #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" +#include "vpx_ports/mem.h" int64_t vpx_sse_c(const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int width, int height) { diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index e9d63f6ef2..18087e25d9 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -744,7 +744,7 @@ () add_proto qw/void vpx_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride"; specialize qw/vpx_subtract_block neon msa mmi sse2 avx2 vsx lsx/; -add_proto qw/int64_t/, "vpx_sse", "const uint8_t *a, int a_stride, const uint8_t *b,int b_stride, int width, int height"; +add_proto qw/int64_t/, "vpx_sse", "const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height"; specialize qw/vpx_sse sse4_1 avx2 neon neon_dotprod/; # diff --git a/vpx_dsp/x86/sse_avx2.c b/vpx_dsp/x86/sse_avx2.c index 975446775e..381c1a1e6c 100644 --- a/vpx_dsp/x86/sse_avx2.c +++ b/vpx_dsp/x86/sse_avx2.c @@ -8,8 +8,9 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include #include +#include +#include #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" From 7dfe343199381bddddc5eaa648e947876979b61b Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Thu, 28 Sep 2023 09:26:58 -0700 Subject: [PATCH 0985/1000] Use vpx_sse instead of vpx_mse to compute SSE Use vpx_sse and vpx_highbd_sse instead of vpx_mse16x16 and vpx_highbd_8_mse16x16 respectively to compute SSE for PSNR calculations. This solves an issue whereby vpx_highbd_8_mse16x16 was being used to calculate SSE for 10- and 12-bit input. This is a port of the libaom CL https://aomedia-review.googlesource.com/c/aom/+/175063 by Jonathan Wright . Bug: webm:1819 Change-Id: I37e3ac72835e67ccb44ac89a4ed16df62c2169a7 --- vpx_dsp/psnr.c | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/vpx_dsp/psnr.c b/vpx_dsp/psnr.c index f0d4e927ae..4ee4130a21 100644 --- a/vpx_dsp/psnr.c +++ b/vpx_dsp/psnr.c @@ -45,14 +45,14 @@ static int64_t encoder_sse(const uint8_t *a, int a_stride, const uint8_t *b, } #if CONFIG_VP9_HIGHBITDEPTH -static int64_t encoder_highbd_8_sse(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, int w, - int h) { +static int64_t encoder_highbd_sse(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, int w, + int h) { int i, j; int64_t sse = 0; - uint16_t *a = CONVERT_TO_SHORTPTR(a8); - uint16_t *b = CONVERT_TO_SHORTPTR(b8); + const uint16_t *a = CONVERT_TO_SHORTPTR(a8); + const uint16_t *b = CONVERT_TO_SHORTPTR(b8); for (i = 0; i < h; i++) { for (j = 0; j < w; j++) { @@ -88,10 +88,8 @@ static int64_t get_sse(const uint8_t *a, int a_stride, const uint8_t *b, for (y = 0; y < height / 16; ++y) { const uint8_t *pa = a; const uint8_t *pb = b; - unsigned int sse; for (x = 0; x < width / 16; ++x) { - vpx_mse16x16(pa, a_stride, pb, b_stride, &sse); - total_sse += sse; + total_sse += vpx_sse(pa, a_stride, pb, b_stride, 16, 16); pa += 16; pb += 16; @@ -131,21 +129,19 @@ static int64_t highbd_get_sse(const uint8_t *a, int a_stride, const uint8_t *b, const int dw = width % 16; const int dh = height % 16; if (dw > 0) { - total_sse += encoder_highbd_8_sse(&a[width - dw], a_stride, &b[width - dw], - b_stride, dw, height); + total_sse += encoder_highbd_sse(&a[width - dw], a_stride, &b[width - dw], + b_stride, dw, height); } if (dh > 0) { - total_sse += encoder_highbd_8_sse(&a[(height - dh) * a_stride], a_stride, - &b[(height - dh) * b_stride], b_stride, - width - dw, dh); + total_sse += encoder_highbd_sse(&a[(height - dh) * a_stride], a_stride, + &b[(height - dh) * b_stride], b_stride, + width - dw, dh); } for (y = 0; y < height / 16; ++y) { const uint8_t *pa = a; const uint8_t *pb = b; - unsigned int sse; for (x = 0; x < width / 16; ++x) { - vpx_highbd_8_mse16x16(pa, a_stride, pb, b_stride, &sse); - total_sse += sse; + total_sse += vpx_highbd_sse(pa, a_stride, pb, b_stride, 16, 16); pa += 16; pb += 16; } From c4c92080545970899488ab27944792a95c7131a2 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Tue, 5 Dec 2023 14:29:37 -0800 Subject: [PATCH 0986/1000] Remove SSE code for 128x* blocks The maximum block size is 64x64 in VP9. Bug: webm:1819 Change-Id: If9802be9f81b51dbcdbc8a68d5afe48ca6d3d0e7 --- vpx_dsp/arm/highbd_sse_neon.c | 51 ---------------------------------- vpx_dsp/arm/sse_neon.c | 24 ---------------- vpx_dsp/arm/sse_neon_dotprod.c | 26 ----------------- vpx_dsp/x86/sse_avx2.c | 34 ----------------------- vpx_dsp/x86/sse_sse4.c | 47 ------------------------------- 5 files changed, 182 deletions(-) diff --git a/vpx_dsp/arm/highbd_sse_neon.c b/vpx_dsp/arm/highbd_sse_neon.c index 8e389df8b8..91dfebf900 100644 --- a/vpx_dsp/arm/highbd_sse_neon.c +++ b/vpx_dsp/arm/highbd_sse_neon.c @@ -43,55 +43,6 @@ static INLINE void highbd_sse_8x1_neon(const uint16_t *src, const uint16_t *ref, *sse_acc1 = vmlal_u16(*sse_acc1, abs_diff_hi, abs_diff_hi); } -static INLINE int64_t highbd_sse_128xh_neon(const uint16_t *src, int src_stride, - const uint16_t *ref, int ref_stride, - int height) { - uint32x4_t sse[16]; - highbd_sse_8x1_init_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]); - highbd_sse_8x1_init_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]); - highbd_sse_8x1_init_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]); - highbd_sse_8x1_init_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]); - highbd_sse_8x1_init_neon(src + 4 * 8, ref + 4 * 8, &sse[8], &sse[9]); - highbd_sse_8x1_init_neon(src + 5 * 8, ref + 5 * 8, &sse[10], &sse[11]); - highbd_sse_8x1_init_neon(src + 6 * 8, ref + 6 * 8, &sse[12], &sse[13]); - highbd_sse_8x1_init_neon(src + 7 * 8, ref + 7 * 8, &sse[14], &sse[15]); - highbd_sse_8x1_neon(src + 8 * 8, ref + 8 * 8, &sse[0], &sse[1]); - highbd_sse_8x1_neon(src + 9 * 8, ref + 9 * 8, &sse[2], &sse[3]); - highbd_sse_8x1_neon(src + 10 * 8, ref + 10 * 8, &sse[4], &sse[5]); - highbd_sse_8x1_neon(src + 11 * 8, ref + 11 * 8, &sse[6], &sse[7]); - highbd_sse_8x1_neon(src + 12 * 8, ref + 12 * 8, &sse[8], &sse[9]); - highbd_sse_8x1_neon(src + 13 * 8, ref + 13 * 8, &sse[10], &sse[11]); - highbd_sse_8x1_neon(src + 14 * 8, ref + 14 * 8, &sse[12], &sse[13]); - highbd_sse_8x1_neon(src + 15 * 8, ref + 15 * 8, &sse[14], &sse[15]); - - src += src_stride; - ref += ref_stride; - - while (--height != 0) { - highbd_sse_8x1_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]); - highbd_sse_8x1_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]); - highbd_sse_8x1_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]); - highbd_sse_8x1_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]); - highbd_sse_8x1_neon(src + 4 * 8, ref + 4 * 8, &sse[8], &sse[9]); - highbd_sse_8x1_neon(src + 5 * 8, ref + 5 * 8, &sse[10], &sse[11]); - highbd_sse_8x1_neon(src + 6 * 8, ref + 6 * 8, &sse[12], &sse[13]); - highbd_sse_8x1_neon(src + 7 * 8, ref + 7 * 8, &sse[14], &sse[15]); - highbd_sse_8x1_neon(src + 8 * 8, ref + 8 * 8, &sse[0], &sse[1]); - highbd_sse_8x1_neon(src + 9 * 8, ref + 9 * 8, &sse[2], &sse[3]); - highbd_sse_8x1_neon(src + 10 * 8, ref + 10 * 8, &sse[4], &sse[5]); - highbd_sse_8x1_neon(src + 11 * 8, ref + 11 * 8, &sse[6], &sse[7]); - highbd_sse_8x1_neon(src + 12 * 8, ref + 12 * 8, &sse[8], &sse[9]); - highbd_sse_8x1_neon(src + 13 * 8, ref + 13 * 8, &sse[10], &sse[11]); - highbd_sse_8x1_neon(src + 14 * 8, ref + 14 * 8, &sse[12], &sse[13]); - highbd_sse_8x1_neon(src + 15 * 8, ref + 15 * 8, &sse[14], &sse[15]); - - src += src_stride; - ref += ref_stride; - } - - return horizontal_long_add_uint32x4_x16(sse); -} - static INLINE int64_t highbd_sse_64xh_neon(const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride, int height) { @@ -280,8 +231,6 @@ int64_t vpx_highbd_sse_neon(const uint8_t *src8, int src_stride, return highbd_sse_32xh_neon(src, src_stride, ref, ref_stride, height); case 64: return highbd_sse_64xh_neon(src, src_stride, ref, ref_stride, height); - case 128: - return highbd_sse_128xh_neon(src, src_stride, ref, ref_stride, height); default: return highbd_sse_wxh_neon(src, src_stride, ref, ref_stride, width, height); diff --git a/vpx_dsp/arm/sse_neon.c b/vpx_dsp/arm/sse_neon.c index 75ee7e7816..2dd57e596c 100644 --- a/vpx_dsp/arm/sse_neon.c +++ b/vpx_dsp/arm/sse_neon.c @@ -86,29 +86,6 @@ static INLINE uint32_t sse_wxh_neon(const uint8_t *src, int src_stride, return horizontal_add_uint32x4(sse); } -static INLINE uint32_t sse_128xh_neon(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - int height) { - uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; - - int i = height; - do { - sse_16x1_neon(src, ref, &sse[0]); - sse_16x1_neon(src + 16, ref + 16, &sse[1]); - sse_16x1_neon(src + 32, ref + 32, &sse[0]); - sse_16x1_neon(src + 48, ref + 48, &sse[1]); - sse_16x1_neon(src + 64, ref + 64, &sse[0]); - sse_16x1_neon(src + 80, ref + 80, &sse[1]); - sse_16x1_neon(src + 96, ref + 96, &sse[0]); - sse_16x1_neon(src + 112, ref + 112, &sse[1]); - - src += src_stride; - ref += ref_stride; - } while (--i != 0); - - return horizontal_add_uint32x4(vaddq_u32(sse[0], sse[1])); -} - static INLINE uint32_t sse_64xh_neon(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int height) { @@ -205,7 +182,6 @@ int64_t vpx_sse_neon(const uint8_t *src, int src_stride, const uint8_t *ref, case 16: return sse_16xh_neon(src, src_stride, ref, ref_stride, height); case 32: return sse_32xh_neon(src, src_stride, ref, ref_stride, height); case 64: return sse_64xh_neon(src, src_stride, ref, ref_stride, height); - case 128: return sse_128xh_neon(src, src_stride, ref, ref_stride, height); default: return sse_wxh_neon(src, src_stride, ref, ref_stride, width, height); } diff --git a/vpx_dsp/arm/sse_neon_dotprod.c b/vpx_dsp/arm/sse_neon_dotprod.c index 0f11b7cbb2..8777773918 100644 --- a/vpx_dsp/arm/sse_neon_dotprod.c +++ b/vpx_dsp/arm/sse_neon_dotprod.c @@ -85,30 +85,6 @@ static INLINE uint32_t sse_wxh_neon_dotprod(const uint8_t *src, int src_stride, return horizontal_add_uint32x4(vcombine_u32(sse[0], sse[1])); } -static INLINE uint32_t sse_128xh_neon_dotprod(const uint8_t *src, - int src_stride, - const uint8_t *ref, - int ref_stride, int height) { - uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; - - int i = height; - do { - sse_16x1_neon_dotprod(src, ref, &sse[0]); - sse_16x1_neon_dotprod(src + 16, ref + 16, &sse[1]); - sse_16x1_neon_dotprod(src + 32, ref + 32, &sse[0]); - sse_16x1_neon_dotprod(src + 48, ref + 48, &sse[1]); - sse_16x1_neon_dotprod(src + 64, ref + 64, &sse[0]); - sse_16x1_neon_dotprod(src + 80, ref + 80, &sse[1]); - sse_16x1_neon_dotprod(src + 96, ref + 96, &sse[0]); - sse_16x1_neon_dotprod(src + 112, ref + 112, &sse[1]); - - src += src_stride; - ref += ref_stride; - } while (--i != 0); - - return horizontal_add_uint32x4(vaddq_u32(sse[0], sse[1])); -} - static INLINE uint32_t sse_64xh_neon_dotprod(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int height) { @@ -214,8 +190,6 @@ int64_t vpx_sse_neon_dotprod(const uint8_t *src, int src_stride, return sse_32xh_neon_dotprod(src, src_stride, ref, ref_stride, height); case 64: return sse_64xh_neon_dotprod(src, src_stride, ref, ref_stride, height); - case 128: - return sse_128xh_neon_dotprod(src, src_stride, ref, ref_stride, height); default: return sse_wxh_neon_dotprod(src, src_stride, ref, ref_stride, width, height); diff --git a/vpx_dsp/x86/sse_avx2.c b/vpx_dsp/x86/sse_avx2.c index 381c1a1e6c..dfe45b6115 100644 --- a/vpx_dsp/x86/sse_avx2.c +++ b/vpx_dsp/x86/sse_avx2.c @@ -169,18 +169,6 @@ int64_t vpx_sse_avx2(const uint8_t *a, int a_stride, const uint8_t *b, } while (y < height); sse = summary_all_avx2(&sum); break; - case 128: - do { - sse_w32_avx2(&sum, a, b); - sse_w32_avx2(&sum, a + 32, b + 32); - sse_w32_avx2(&sum, a + 64, b + 64); - sse_w32_avx2(&sum, a + 96, b + 96); - a += a_stride; - b += b_stride; - y += 1; - } while (y < height); - sse = summary_all_avx2(&sum); - break; default: if ((width & 0x07) == 0) { do { @@ -334,28 +322,6 @@ int64_t vpx_highbd_sse_avx2(const uint8_t *a8, int a_stride, const uint8_t *b8, } while (y < height); sse = summary_4x64_avx2(sum); break; - case 128: - do { - int l = 0; - __m256i sum32 = _mm256_setzero_si256(); - do { - highbd_sse_w16_avx2(&sum32, a, b); - highbd_sse_w16_avx2(&sum32, a + 16 * 1, b + 16 * 1); - highbd_sse_w16_avx2(&sum32, a + 16 * 2, b + 16 * 2); - highbd_sse_w16_avx2(&sum32, a + 16 * 3, b + 16 * 3); - highbd_sse_w16_avx2(&sum32, a + 16 * 4, b + 16 * 4); - highbd_sse_w16_avx2(&sum32, a + 16 * 5, b + 16 * 5); - highbd_sse_w16_avx2(&sum32, a + 16 * 6, b + 16 * 6); - highbd_sse_w16_avx2(&sum32, a + 16 * 7, b + 16 * 7); - a += a_stride; - b += b_stride; - l += 1; - } while (l < 16 && l < (height - y)); - summary_32_avx2(&sum32, &sum); - y += 16; - } while (y < height); - sse = summary_4x64_avx2(sum); - break; default: if (width & 0x7) { do { diff --git a/vpx_dsp/x86/sse_sse4.c b/vpx_dsp/x86/sse_sse4.c index 1c2744e2fa..4a7585c57e 100644 --- a/vpx_dsp/x86/sse_sse4.c +++ b/vpx_dsp/x86/sse_sse4.c @@ -128,22 +128,6 @@ int64_t vpx_sse_sse4_1(const uint8_t *a, int a_stride, const uint8_t *b, } while (y < height); sse = summary_all_sse4(&sum); break; - case 128: - do { - sse_w16_sse4_1(&sum, a, b); - sse_w16_sse4_1(&sum, a + 16 * 1, b + 16 * 1); - sse_w16_sse4_1(&sum, a + 16 * 2, b + 16 * 2); - sse_w16_sse4_1(&sum, a + 16 * 3, b + 16 * 3); - sse_w16_sse4_1(&sum, a + 16 * 4, b + 16 * 4); - sse_w16_sse4_1(&sum, a + 16 * 5, b + 16 * 5); - sse_w16_sse4_1(&sum, a + 16 * 6, b + 16 * 6); - sse_w16_sse4_1(&sum, a + 16 * 7, b + 16 * 7); - a += a_stride; - b += b_stride; - y += 1; - } while (y < height); - sse = summary_all_sse4(&sum); - break; default: if (width & 0x07) { do { @@ -285,37 +269,6 @@ int64_t vpx_highbd_sse_sse4_1(const uint8_t *a8, int a_stride, _mm_storel_epi64((__m128i *)&sse, _mm_add_epi64(sum, _mm_srli_si128(sum, 8))); break; - case 128: - do { - int l = 0; - __m128i sum32 = _mm_setzero_si128(); - do { - highbd_sse_w8_sse4_1(&sum32, a, b); - highbd_sse_w8_sse4_1(&sum32, a + 8 * 1, b + 8 * 1); - highbd_sse_w8_sse4_1(&sum32, a + 8 * 2, b + 8 * 2); - highbd_sse_w8_sse4_1(&sum32, a + 8 * 3, b + 8 * 3); - highbd_sse_w8_sse4_1(&sum32, a + 8 * 4, b + 8 * 4); - highbd_sse_w8_sse4_1(&sum32, a + 8 * 5, b + 8 * 5); - highbd_sse_w8_sse4_1(&sum32, a + 8 * 6, b + 8 * 6); - highbd_sse_w8_sse4_1(&sum32, a + 8 * 7, b + 8 * 7); - highbd_sse_w8_sse4_1(&sum32, a + 8 * 8, b + 8 * 8); - highbd_sse_w8_sse4_1(&sum32, a + 8 * 9, b + 8 * 9); - highbd_sse_w8_sse4_1(&sum32, a + 8 * 10, b + 8 * 10); - highbd_sse_w8_sse4_1(&sum32, a + 8 * 11, b + 8 * 11); - highbd_sse_w8_sse4_1(&sum32, a + 8 * 12, b + 8 * 12); - highbd_sse_w8_sse4_1(&sum32, a + 8 * 13, b + 8 * 13); - highbd_sse_w8_sse4_1(&sum32, a + 8 * 14, b + 8 * 14); - highbd_sse_w8_sse4_1(&sum32, a + 8 * 15, b + 8 * 15); - a += a_stride; - b += b_stride; - l += 1; - } while (l < 8 && l < (height - y)); - summary_32_sse4(&sum32, &sum); - y += 8; - } while (y < height); - _mm_storel_epi64((__m128i *)&sse, - _mm_add_epi64(sum, _mm_srli_si128(sum, 8))); - break; default: if (width & 0x7) { do { From f9b7c857683cb1d033b9d6a13e92843e6d8740a3 Mon Sep 17 00:00:00 2001 From: James Zern Date: Mon, 4 Dec 2023 13:25:01 -0800 Subject: [PATCH 0987/1000] README: update target list Change-Id: I001179ce34b2bf2350dce5f0197b6be175ab1c37 --- README | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README b/README index 5ccc9c3fce..4c25b15d81 100644 --- a/README +++ b/README @@ -66,6 +66,7 @@ COMPILING THE APPLICATIONS/LIBRARIES: arm64-darwin20-gcc arm64-darwin21-gcc arm64-darwin22-gcc + arm64-darwin23-gcc arm64-linux-gcc arm64-win64-gcc arm64-win64-vs15 @@ -81,6 +82,8 @@ COMPILING THE APPLICATIONS/LIBRARIES: armv7-win32-gcc armv7-win32-vs14 armv7-win32-vs15 + armv7-win32-vs16 + armv7-win32-vs17 armv7s-darwin-gcc armv8-linux-gcc loongarch32-linux-gcc @@ -127,6 +130,7 @@ COMPILING THE APPLICATIONS/LIBRARIES: x86_64-darwin20-gcc x86_64-darwin21-gcc x86_64-darwin22-gcc + x86_64-darwin23-gcc x86_64-iphonesimulator-gcc x86_64-linux-gcc x86_64-linux-icc From 476d02a2d206f959613afc0832da8656e26c8602 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Wed, 6 Dec 2023 10:03:51 -0800 Subject: [PATCH 0988/1000] Fix two clang-tidy misc-include-cleaner warnings no header providing "CONFIG_VP9_HIGHBITDEPTH" is directly included no header providing "VPX_BITS_8" is directly included Change-Id: Ie6d78c79ab462501417f2b451bbe808a1fdce931 --- vp9/encoder/vp9_frame_scale.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vp9/encoder/vp9_frame_scale.c b/vp9/encoder/vp9_frame_scale.c index 22b3f05579..e6c375413d 100644 --- a/vp9/encoder/vp9_frame_scale.c +++ b/vp9/encoder/vp9_frame_scale.c @@ -9,10 +9,12 @@ */ #include "./vp9_rtcd.h" +#include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "./vpx_scale_rtcd.h" #include "vp9/common/vp9_blockd.h" #include "vp9/encoder/vp9_encoder.h" +#include "vpx/vpx_codec.h" #include "vpx_dsp/vpx_filter.h" #include "vpx_scale/yv12config.h" From 2f258fdee1b2dc276d973cde6bd2f81c63f13155 Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 6 Dec 2023 10:54:21 -0800 Subject: [PATCH 0989/1000] vp9_frame_scale.c,cosmetics: funnction -> function Change-Id: I8ecbd52037ff096f5c84c834b193b0a34c55a8b7 --- vp9/encoder/vp9_frame_scale.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vp9/encoder/vp9_frame_scale.c b/vp9/encoder/vp9_frame_scale.c index e6c375413d..c74d523246 100644 --- a/vp9/encoder/vp9_frame_scale.c +++ b/vp9/encoder/vp9_frame_scale.c @@ -102,7 +102,7 @@ void vp9_scale_and_extend_frame_c(const YV12_BUFFER_CONFIG *src, const int x_step_q4 = 16 * src_w / dst_w; const int y_step_q4 = 16 * src_h / dst_h; if (x_step_q4 > 64 || y_step_q4 > 64) { - // This funnction is only called while cm->bit_depth is VPX_BITS_8. + // This function is only called while cm->bit_depth is VPX_BITS_8. #if CONFIG_VP9_HIGHBITDEPTH vp9_scale_and_extend_frame_nonnormative(src, dst, (int)VPX_BITS_8); #else From 585798f756d60bef3761d76700f3a14e8d5d46d9 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Wed, 6 Dec 2023 10:35:18 -0500 Subject: [PATCH 0990/1000] Set pred buffer stride correctly Bug: b/312875957 Change-Id: I2eb5ab86d5fe30079b3ed1cbdb8b45bb2dc72a1d --- test/encode_api_test.cc | 13 +++++++++++++ vp9/common/vp9_reconinter.c | 13 +++++++------ 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc index 99f38c3af9..596edb229c 100644 --- a/test/encode_api_test.cc +++ b/test/encode_api_test.cc @@ -850,6 +850,19 @@ TEST(EncodeAPI, Buganizer314857577) { encoder.Encode(false); } +TEST(EncodeAPI, Buganizer312875957PredBufferStride) { + VP9Encoder encoder(-1); + + encoder.Configure(12, 1678, 620, VPX_VBR, VPX_DL_REALTIME); + encoder.Encode(true); + encoder.Encode(false); + encoder.Configure(0, 456, 486, VPX_VBR, VPX_DL_REALTIME); + encoder.Encode(true); + encoder.Configure(0, 1678, 620, VPX_CBR, 1000000); + encoder.Encode(false); + encoder.Encode(false); +} + class EncodeApiGetTplStatsTest : public ::libvpx_test::EncoderTest, public ::testing::TestWithParam { diff --git a/vp9/common/vp9_reconinter.c b/vp9/common/vp9_reconinter.c index ff59ff5042..4878dc15ee 100644 --- a/vp9/common/vp9_reconinter.c +++ b/vp9/common/vp9_reconinter.c @@ -158,18 +158,19 @@ static void build_inter_predictors(MACROBLOCKD *xd, int plane, int block, // Co-ordinate of containing block to pixel precision. const int x_start = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x)); const int y_start = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y)); + const YV12_BUFFER_CONFIG *ref_buf = xd->block_refs[ref]->buf; + uint8_t *buf_array[] = { ref_buf->y_buffer, ref_buf->u_buffer, + ref_buf->v_buffer }; + const int stride_array[] = { ref_buf->y_stride, ref_buf->uv_stride, + ref_buf->uv_stride }; #if 0 // CONFIG_BETTER_HW_COMPATIBILITY assert(xd->mi[0]->sb_type != BLOCK_4X8 && xd->mi[0]->sb_type != BLOCK_8X4); assert(mv_q4.row == mv.row * (1 << (1 - pd->subsampling_y)) && mv_q4.col == mv.col * (1 << (1 - pd->subsampling_x))); #endif - if (plane == 0) - pre_buf->buf = xd->block_refs[ref]->buf->y_buffer; - else if (plane == 1) - pre_buf->buf = xd->block_refs[ref]->buf->u_buffer; - else - pre_buf->buf = xd->block_refs[ref]->buf->v_buffer; + pre_buf->buf = buf_array[plane]; + pre_buf->stride = stride_array[plane]; pre_buf->buf += scaled_buffer_offset(x_start + x, y_start + y, pre_buf->stride, sf); From 50ed636e49db2b8fa2436413480f11ab1f2a2d1a Mon Sep 17 00:00:00 2001 From: Yunqing Wang Date: Wed, 6 Dec 2023 14:24:07 -0800 Subject: [PATCH 0991/1000] Fix a bug in simple motion search This change fixed a bug revealed by b/311294795. In simple motion search, the reference buffer pointer needs to be restored after the search. Otherwise, it causes problems while the reference frame scaling happens. This CL fixes the bug. Bug: b/311294795 Change-Id: I093722d5888de3cc6a6542de82a6ec9d601f897d --- test/encode_api_test.cc | 31 +++++++++++++++++++++++++++++++ vp9/encoder/vp9_encodeframe.c | 15 +++++++++++++-- 2 files changed, 44 insertions(+), 2 deletions(-) diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc index 596edb229c..5c421e3d70 100644 --- a/test/encode_api_test.cc +++ b/test/encode_api_test.cc @@ -863,6 +863,37 @@ TEST(EncodeAPI, Buganizer312875957PredBufferStride) { encoder.Encode(false); } +// This is a test case from clusterfuzz: based on b/311294795 +// Encode a few frames with multiple change config calls +// with different frame sizes. +TEST(EncodeAPI, Buganizer311294795) { + VP9Encoder encoder(1); + + // Set initial config. + encoder.Configure(12, 1678, 620, VPX_VBR, VPX_DL_REALTIME); + + // Encode first frame. + encoder.Encode(false); + + // Change config. + encoder.Configure(16, 632, 620, VPX_VBR, VPX_DL_GOOD_QUALITY); + + // Encode 2nd frame with new config + encoder.Encode(true); + + // Change config. + encoder.Configure(16, 1678, 342, VPX_VBR, VPX_DL_GOOD_QUALITY); + + // Encode 3rd frame with new config. + encoder.Encode(false); + + // Change config. + encoder.Configure(0, 1574, 618, VPX_VBR, VPX_DL_REALTIME); + // Encode more frames with new config. + encoder.Encode(false); + encoder.Encode(false); +} + class EncodeApiGetTplStatsTest : public ::libvpx_test::EncoderTest, public ::testing::TestWithParam { diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index 7ab183e28a..46291f4868 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -3444,11 +3444,17 @@ static void simple_motion_search(const VP9_COMP *const cpi, MACROBLOCK *const x, MV ref_mv_full = { ref_mv.row >> 3, ref_mv.col >> 3 }; MV best_mv = { 0, 0 }; int cost_list[5]; + struct buf_2d backup_pre[MAX_MB_PLANE] = { { 0, 0 } }; - if (scaled_ref_frame) + if (scaled_ref_frame) { yv12 = scaled_ref_frame; - else + // As reported in b/311294795, the reference buffer pointer needs to be + // saved and restored after the search. Otherwise, it causes problems while + // the reference frame scaling happens. + for (int i = 0; i < MAX_MB_PLANE; i++) backup_pre[i] = xd->plane[i].pre[0]; + } else { yv12 = get_ref_frame_buffer(cpi, ref); + } assert(yv12 != NULL); if (!yv12) return; @@ -3465,6 +3471,11 @@ static void simple_motion_search(const VP9_COMP *const cpi, MACROBLOCK *const x, x->mv_limits = tmp_mv_limits; mi->mv[0].as_mv = best_mv; + // Restore reference buffer pointer. + if (scaled_ref_frame) { + for (int i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[0] = backup_pre[i]; + } + set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]); xd->plane[0].dst.buf = pred_buf; xd->plane[0].dst.stride = 64; From 1ed56a46b3f6b18e1fb89a091e60d80ae20eec01 Mon Sep 17 00:00:00 2001 From: Cheng Chen Date: Mon, 4 Dec 2023 15:34:50 -0800 Subject: [PATCH 0992/1000] Update frame size in actual encoding Issue explanation: The unit test calls set_config function twice after encoding the first frame. The first call of set_config reduces frame width, but is still within half of the first frame. The second call reduces frame width even more, making is less than half of the first frame, which according to the encoder logic, there is no valid ref frames, and this frame should be set as a forced keyframe. This leads to null pointer access in scale_factors later. Solution: To make sure the correct detection of a forced key frame, we need to update the frame width and height only when the actual encoding is performed. Bug: b/311985118 Change-Id: Ie2cd3b760d4a4b399845693d7421c4eb11a12775 --- test/encode_api_test.cc | 19 +++++++++++++++++++ vp9/encoder/vp9_encoder.c | 7 +++++++ vp9/encoder/vp9_encoder.h | 3 +++ vp9/vp9_cx_iface.c | 19 +++++++++++++++++-- 4 files changed, 46 insertions(+), 2 deletions(-) diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc index 5c421e3d70..eb8c456de7 100644 --- a/test/encode_api_test.cc +++ b/test/encode_api_test.cc @@ -801,6 +801,25 @@ TEST(EncodeAPI, Buganizer311394513) { encoder.Encode(true); } +TEST(EncodeAPI, Buganizer311985118) { + VP9Encoder encoder(0); + + // Set initial config, in particular set deadline to GOOD mode. + encoder.Configure(12, 1678, 620, VPX_VBR, VPX_DL_GOOD_QUALITY); + + // Encode 1st frame. + encoder.Encode(false); + + // Change config: change threads and width. + encoder.Configure(0, 1574, 620, VPX_VBR, VPX_DL_GOOD_QUALITY); + + // Change config: change threads, width and height. + encoder.Configure(16, 837, 432, VPX_VBR, VPX_DL_GOOD_QUALITY); + + // Encode 2nd frame. + encoder.Encode(false); +} + // This is a test case from clusterfuzz: based on b/314857577. // Encode a few frames with multiple change config calls // with different frame sizes. diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 20e35077cd..152d42bc9a 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -3888,6 +3888,7 @@ static void set_frame_size(VP9_COMP *cpi) { alloc_util_frame_buffers(cpi); init_motion_estimation(cpi); + int has_valid_ref_frame = 0; for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { RefBuffer *const ref_buf = &cm->frame_refs[ref_frame - 1]; const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame); @@ -3906,11 +3907,17 @@ static void set_frame_size(VP9_COMP *cpi) { buf->y_crop_height, cm->width, cm->height); #endif // CONFIG_VP9_HIGHBITDEPTH + has_valid_ref_frame |= vp9_is_valid_scale(&ref_buf->sf); if (vp9_is_scaled(&ref_buf->sf)) vpx_extend_frame_borders(buf); } else { ref_buf->buf = NULL; } } + if (!frame_is_intra_only(cm) && !has_valid_ref_frame) { + vpx_internal_error( + &cm->error, VPX_CODEC_CORRUPT_FRAME, + "Can't find at least one reference frame with valid size"); + } set_ref_ptrs(cm, xd, LAST_FRAME, LAST_FRAME); } diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 83b7081e7b..7136f7faa3 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -921,6 +921,9 @@ typedef struct VP9_COMP { // number of MBs in the current frame when the frame is // scaled. + int last_coded_width; + int last_coded_height; + int use_svc; SVC svc; diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index 568a21c2a1..e611a6e863 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -797,10 +797,22 @@ static vpx_codec_err_t encoder_set_config(vpx_codec_alg_priv_t *ctx, if (cfg->g_w != ctx->cfg.g_w || cfg->g_h != ctx->cfg.g_h) { if (cfg->g_lag_in_frames > 1 || cfg->g_pass != VPX_RC_ONE_PASS) ERROR("Cannot change width or height after initialization"); - if (!valid_ref_frame_size(ctx->cfg.g_w, ctx->cfg.g_h, cfg->g_w, cfg->g_h) || + // Note: function encoder_set_config() is allowed to be called multiple + // times. However, when the original frame width or height is less than two + // times of the new frame width or height, a forced key frame should be + // used. To make sure the correct detection of a forced key frame, we need + // to update the frame width and height only when the actual encoding is + // performed. cpi->last_coded_width and cpi->last_coded_height are used to + // track the actual coded frame size. + if ((ctx->cpi->last_coded_width && ctx->cpi->last_coded_height && + !valid_ref_frame_size(ctx->cpi->last_coded_width, + ctx->cpi->last_coded_height, cfg->g_w, + cfg->g_h)) || (ctx->cpi->initial_width && (int)cfg->g_w > ctx->cpi->initial_width) || - (ctx->cpi->initial_height && (int)cfg->g_h > ctx->cpi->initial_height)) + (ctx->cpi->initial_height && + (int)cfg->g_h > ctx->cpi->initial_height)) { force_key = 1; + } } // Prevent increasing lag_in_frames. This check is stricter than it needs @@ -1310,6 +1322,9 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, if (cpi == NULL) return VPX_CODEC_INVALID_PARAM; + cpi->last_coded_width = ctx->oxcf.width; + cpi->last_coded_height = ctx->oxcf.height; + if (img != NULL) { res = validate_img(ctx, img); if (res == VPX_CODEC_OK) { From fa60c7d9c16f0e7ce1daa2030e9920e1a64525d8 Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Thu, 7 Dec 2023 12:20:59 -0500 Subject: [PATCH 0993/1000] IWYU: include yv12config.h for YV12_BUFFER_CONFIG Fix clang-tiday warning Change-Id: Ic4d6739cb933a37168176f6b481afdfd2562acfc --- vp9/common/vp9_reconinter.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vp9/common/vp9_reconinter.c b/vp9/common/vp9_reconinter.c index 4878dc15ee..0a60b853d8 100644 --- a/vp9/common/vp9_reconinter.c +++ b/vp9/common/vp9_reconinter.c @@ -13,12 +13,13 @@ #include "./vpx_scale_rtcd.h" #include "./vpx_config.h" -#include "vpx/vpx_integer.h" - #include "vp9/common/vp9_blockd.h" #include "vp9/common/vp9_reconinter.h" #include "vp9/common/vp9_reconintra.h" +#include "vpx/vpx_integer.h" +#include "vpx_scale/yv12config.h" + #if CONFIG_VP9_HIGHBITDEPTH void vp9_highbd_build_inter_predictor( const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, From 3a88c0c2046870e73f51bbd75d590e735da1f661 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Fri, 8 Dec 2023 14:39:18 -0800 Subject: [PATCH 0994/1000] Avoid dangling pointers in vp9_encode_free_mt_data Set cpi->tile_thr_data and cpi->workers to NULL after freeing them. Change-Id: I46fec5f08a6dd034c8d76828f4d546630442f216 --- vp9/encoder/vp9_ethread.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vp9/encoder/vp9_ethread.c b/vp9/encoder/vp9_ethread.c index 681996d334..a8d1cb7a7a 100644 --- a/vp9/encoder/vp9_ethread.c +++ b/vp9/encoder/vp9_ethread.c @@ -187,7 +187,9 @@ void vp9_encode_free_mt_data(struct VP9_COMP *cpi) { } } vpx_free(cpi->tile_thr_data); + cpi->tile_thr_data = NULL; vpx_free(cpi->workers); + cpi->workers = NULL; cpi->num_workers = 0; } From 7e735cdf4328361169596f4ddab7bce32930c87f Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Fri, 8 Dec 2023 14:49:47 -0800 Subject: [PATCH 0995/1000] IWYU: include vp9_scale.h and vpx_codec.h Fix the following clang-tidy misc-include-cleaner warnings: vp9/encoder/vp9_encoder.c: no header providing "vp9_is_valid_scale" is directly included no header providing "VPX_CODEC_CORRUPT_FRAME" is directly included vp9/vp9_cx_iface.c: no header providing "valid_ref_frame_size" is directly included Change-Id: I20e846f5b14c42c72aaefec0718b4ae9c7eea44a --- vp9/encoder/vp9_encoder.c | 5 +++-- vp9/vp9_cx_iface.c | 1 + 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 152d42bc9a..fd213f1e6b 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -18,6 +18,8 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "./vpx_scale_rtcd.h" +#include "vpx/vpx_codec.h" +#include "vpx/vpx_ext_ratectrl.h" #include "vpx_dsp/psnr.h" #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_dsp/vpx_filter.h" @@ -42,6 +44,7 @@ #endif #include "vp9/common/vp9_reconinter.h" #include "vp9/common/vp9_reconintra.h" +#include "vp9/common/vp9_scale.h" #include "vp9/common/vp9_tile_common.h" #if !CONFIG_REALTIME_ONLY @@ -83,8 +86,6 @@ #include "vp9/encoder/vp9_tpl_model.h" #include "vp9/vp9_cx_iface.h" -#include "vpx/vpx_ext_ratectrl.h" - #define AM_SEGMENT_ID_INACTIVE 7 #define AM_SEGMENT_ID_ACTIVE 0 diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index e611a6e863..5fa2d7c196 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -23,6 +23,7 @@ #include "vp9/encoder/vp9_encoder.h" #include "vpx/vp8cx.h" #include "vp9/common/vp9_alloccommon.h" +#include "vp9/common/vp9_scale.h" #include "vp9/vp9_cx_iface.h" #include "vp9/encoder/vp9_firstpass.h" #include "vp9/encoder/vp9_lookahead.h" From a75859c439cb4d4af92e4e894f7e2c43758d6cd5 Mon Sep 17 00:00:00 2001 From: Hari Limaye Date: Mon, 27 Nov 2023 17:55:25 +0000 Subject: [PATCH 0996/1000] Remove redundant comment in convolve8_4_usdot The function convolve8_4_usdot contains a comment relating to the SDOT implementation of convolve8, which requires addition of a correction constant to account for range clamp of the input values. This is not performed in the i8mm USDOT implementation - so remove the comment. Also add some const qualifiers to function arguments. Change-Id: I10aff560d20403897f708ee293bf873be9c35761 --- vpx_dsp/arm/vpx_convolve8_neon.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/vpx_dsp/arm/vpx_convolve8_neon.h b/vpx_dsp/arm/vpx_convolve8_neon.h index f01d4f6a42..4ecaee0f99 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon.h +++ b/vpx_dsp/arm/vpx_convolve8_neon.h @@ -116,7 +116,7 @@ static INLINE int16x4_t convolve8_4_sdot_partial(const int8x16_t samples_lo, return vqmovn_s32(sum); } -static INLINE int16x4_t convolve8_4_sdot(uint8x16_t samples, +static INLINE int16x4_t convolve8_4_sdot(const uint8x16_t samples, const int8x8_t filters, const int32x4_t correction, const uint8x16_t range_limit, @@ -164,7 +164,7 @@ static INLINE uint8x8_t convolve8_8_sdot_partial(const int8x16_t samples0_lo, return vqrshrun_n_s16(sum, FILTER_BITS); } -static INLINE uint8x8_t convolve8_8_sdot(uint8x16_t samples, +static INLINE uint8x8_t convolve8_8_sdot(const uint8x16_t samples, const int8x8_t filters, const int32x4_t correction, const uint8x16_t range_limit, @@ -281,7 +281,7 @@ static INLINE int16x4_t convolve8_4_usdot_partial(const uint8x16_t samples_lo, return vqmovn_s32(sum); } -static INLINE int16x4_t convolve8_4_usdot(uint8x16_t samples, +static INLINE int16x4_t convolve8_4_usdot(const uint8x16_t samples, const int8x8_t filters, const uint8x16x2_t permute_tbl) { uint8x16_t permuted_samples[2]; @@ -293,7 +293,6 @@ static INLINE int16x4_t convolve8_4_usdot(uint8x16_t samples, /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */ permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]); - /* Accumulate dot product into 'correction' to account for range clamp. */ sum = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0); sum = vusdotq_lane_s32(sum, permuted_samples[1], filters, 1); @@ -322,7 +321,7 @@ static INLINE uint8x8_t convolve8_8_usdot_partial(const uint8x16_t samples0_lo, return vqrshrun_n_s16(sum, FILTER_BITS); } -static INLINE uint8x8_t convolve8_8_usdot(uint8x16_t samples, +static INLINE uint8x8_t convolve8_8_usdot(const uint8x16_t samples, const int8x8_t filters, const uint8x16x3_t permute_tbl) { uint8x16_t permuted_samples[3]; From 193b1511956f1732a8d54041a26ca9633a92abf9 Mon Sep 17 00:00:00 2001 From: Marco Paniconi Date: Mon, 11 Dec 2023 21:02:36 -0800 Subject: [PATCH 0997/1000] Fix to integer overflow in vp8 encodeframe.c Unit test added. Bug:webm:1831 Change-Id: Ib85f4f0fbdbebc0b49555f206a36376cea687df6 --- test/encode_api_test.cc | 32 ++++++++++++++++++++++++++++++++ vp8/encoder/encodeframe.c | 12 ++++++++++-- 2 files changed, 42 insertions(+), 2 deletions(-) diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc index eb8c456de7..928f1ede17 100644 --- a/test/encode_api_test.cc +++ b/test/encode_api_test.cc @@ -14,6 +14,7 @@ #include #include "third_party/googletest/src/include/gtest/gtest.h" +#include "test/acm_random.h" #include "test/codec_factory.h" #include "test/encode_test_driver.h" #include "test/i420_video_source.h" @@ -181,6 +182,37 @@ TEST(EncodeAPI, HugeFramerateVp8) { vpx_img_free(image); ASSERT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK); } + +// A test that reproduces https://crbug.com/webm/1831. +TEST(EncodeAPI, RandomPixelsVp8) { + // Initialize libvpx encoder + vpx_codec_iface_t *const iface = vpx_codec_vp8_cx(); + vpx_codec_enc_cfg_t cfg; + ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK); + + cfg.rc_target_bitrate = 2000; + cfg.g_w = 1280; + cfg.g_h = 720; + + vpx_codec_ctx_t enc; + ASSERT_EQ(vpx_codec_enc_init(&enc, iface, &cfg, 0), VPX_CODEC_OK); + + // Generate random frame data and encode + uint8_t img[1280 * 720 * 3 / 2]; + libvpx_test::ACMRandom rng; + for (size_t i = 0; i < sizeof(img); ++i) { + img[i] = rng.Rand8(); + } + vpx_image_t img_wrapper; + ASSERT_EQ( + vpx_img_wrap(&img_wrapper, VPX_IMG_FMT_I420, cfg.g_w, cfg.g_h, 1, img), + &img_wrapper); + ASSERT_EQ(vpx_codec_encode(&enc, &img_wrapper, 0, 1, 0, VPX_DL_BEST_QUALITY), + VPX_CODEC_OK); + + // Destroy libvpx encoder + vpx_codec_destroy(&enc); +} #endif // Set up 2 spatial streams with 2 temporal layers per stream, and generate diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c index 5c973940ec..82c48b13a7 100644 --- a/vp8/encoder/encodeframe.c +++ b/vp8/encoder/encodeframe.c @@ -447,13 +447,21 @@ static void encode_mb_row(VP8_COMP *cpi, VP8_COMMON *cm, int mb_row, x->active_ptr = cpi->active_map + map_index + mb_col; if (cm->frame_type == KEY_FRAME) { - *totalrate += vp8cx_encode_intra_macroblock(cpi, x, tp); + const int intra_rate_cost = vp8cx_encode_intra_macroblock(cpi, x, tp); + if (INT_MAX - *totalrate > intra_rate_cost) + *totalrate += intra_rate_cost; + else + *totalrate = INT_MAX; #ifdef MODE_STATS y_modes[xd->mbmi.mode]++; #endif } else { - *totalrate += vp8cx_encode_inter_macroblock( + const int inter_rate_cost = vp8cx_encode_inter_macroblock( cpi, x, tp, recon_yoffset, recon_uvoffset, mb_row, mb_col); + if (INT_MAX - *totalrate > inter_rate_cost) + *totalrate += inter_rate_cost; + else + *totalrate = INT_MAX; #ifdef MODE_STATS inter_y_modes[xd->mbmi.mode]++; From 4fe07a0c411d8ed0722c979f13248fd63ff69a73 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Wed, 13 Dec 2023 10:56:28 -0800 Subject: [PATCH 0998/1000] Return correct error after longjmp in vp8e_encode After a longjmp() call in vp8e_encode(), call update_error_state() so that we return the error code and error detail set by the vpx_internal_error() call. Change-Id: I1f2428eb1b1f61e46c02604e16a5d44dcf162479 --- vp8/vp8_cx_iface.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c index c42a837ebe..e7da15d16b 100644 --- a/vp8/vp8_cx_iface.c +++ b/vp8/vp8_cx_iface.c @@ -930,8 +930,9 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx, if (setjmp(ctx->cpi->common.error.jmp)) { ctx->cpi->common.error.setjmp = 0; + res = update_error_state(ctx, &ctx->cpi->common.error); vpx_clear_system_state(); - return VPX_CODEC_CORRUPT_FRAME; + return res; } ctx->cpi->common.error.setjmp = 1; From df655cf4fb6c2a23b964544acd015cc715752830 Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Tue, 12 Dec 2023 16:32:29 -0800 Subject: [PATCH 0999/1000] Clarify the comment for update_error_state() Explain why the encoder init functions cannot call update_error_state(). In vp8/vp8_cx_iface.c, this comment should have been added in https://chromium-review.googlesource.com/c/webm/libvpx/+/4506609. Rewrite update_error_state() in vp8/vp8_cx_iface.c to look like the versions in vp9/vp9_cx_iface.c and av1/av1_cx_iface.c (in libaom). Change-Id: I3f153d67b8c549ca5ac8ea0cfbcaad4ae705c8e6 --- vp8/vp8_cx_iface.c | 9 ++++++--- vp9/vp9_cx_iface.c | 4 +++- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c index e7da15d16b..a10e08975c 100644 --- a/vp8/vp8_cx_iface.c +++ b/vp8/vp8_cx_iface.c @@ -95,13 +95,16 @@ struct vpx_codec_alg_priv { vpx_enc_frame_flags_t control_frame_flags; }; +// Called by vp8e_set_config() and vp8e_encode() only. Must not be called +// by vp8e_init() because the `error` paramerer (cpi->common.error) will be +// destroyed by vpx_codec_enc_init_ver() after vp8e_init() returns an error. +// See the "IMPORTANT" comment in vpx_codec_enc_init_ver(). static vpx_codec_err_t update_error_state( vpx_codec_alg_priv_t *ctx, const struct vpx_internal_error_info *error) { - vpx_codec_err_t res; + const vpx_codec_err_t res = error->error_code; - if ((res = error->error_code)) { + if (res != VPX_CODEC_OK) ctx->base.err_detail = error->has_detail ? error->detail : NULL; - } return res; } diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index 5fa2d7c196..4899b1ed12 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -133,7 +133,9 @@ struct vpx_codec_alg_priv { }; // Called by encoder_set_config() and encoder_encode() only. Must not be called -// by encoder_init(). +// by encoder_init() because the `error` paramerer (cpi->common.error) will be +// destroyed by vpx_codec_enc_init_ver() after encoder_init() returns an error. +// See the "IMPORTANT" comment in vpx_codec_enc_init_ver(). static vpx_codec_err_t update_error_state( vpx_codec_alg_priv_t *ctx, const struct vpx_internal_error_info *error) { const vpx_codec_err_t res = error->error_code; From 41ced868a69625372c95ff0b2bd5f90987516c3b Mon Sep 17 00:00:00 2001 From: Jerome Jiang Date: Fri, 15 Dec 2023 15:49:01 -0500 Subject: [PATCH 1000/1000] Remove VP9E_GET_TPL_STATS This is never used. A callback in external rc func was added and used instead. Change-Id: Iade6f361072f0c28af98904baf457d2f0e9ca904 --- test/encode_api_test.cc | 132 ---------------------------------------- vp9/vp9_cx_iface.c | 19 ------ vpx/vp8cx.h | 14 ----- 3 files changed, 165 deletions(-) diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc index 928f1ede17..508083673a 100644 --- a/test/encode_api_test.cc +++ b/test/encode_api_test.cc @@ -944,138 +944,6 @@ TEST(EncodeAPI, Buganizer311294795) { encoder.Encode(false); encoder.Encode(false); } - -class EncodeApiGetTplStatsTest - : public ::libvpx_test::EncoderTest, - public ::testing::TestWithParam { - public: - EncodeApiGetTplStatsTest() : EncoderTest(GetParam()), test_io_(false) {} - ~EncodeApiGetTplStatsTest() override = default; - - protected: - void SetUp() override { - InitializeConfig(); - SetMode(::libvpx_test::kTwoPassGood); - } - - void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) override { - if (video->frame() == 0) { - encoder->Control(VP9E_SET_TPL, 1); - } - } - - vpx_codec_err_t AllocateTplList(VpxTplGopStats *data) { - // Allocate MAX_ARF_GOP_SIZE (50) * sizeof(VpxTplFrameStats) that will be - // filled by VP9E_GET_TPL_STATS. - // MAX_ARF_GOP_SIZE is used here because the test doesn't know the size of - // each GOP before getting TPL stats from the encoder. - data->size = 50; - data->frame_stats_list = - static_cast(calloc(50, sizeof(VpxTplFrameStats))); - if (data->frame_stats_list == nullptr) return VPX_CODEC_MEM_ERROR; - return VPX_CODEC_OK; - } - - void CompareTplGopStats(const VpxTplGopStats &ref_gop_stats, - const VpxTplGopStats &test_gop_stats) { - ASSERT_EQ(ref_gop_stats.size, test_gop_stats.size); - for (int frame = 0; frame < ref_gop_stats.size; frame++) { - const VpxTplFrameStats &ref_frame_stats = - ref_gop_stats.frame_stats_list[frame]; - const VpxTplFrameStats &test_frame_stats = - test_gop_stats.frame_stats_list[frame]; - ASSERT_EQ(ref_frame_stats.num_blocks, test_frame_stats.num_blocks); - ASSERT_EQ(ref_frame_stats.frame_width, test_frame_stats.frame_width); - ASSERT_EQ(ref_frame_stats.frame_height, test_frame_stats.frame_height); - for (int block = 0; block < ref_frame_stats.num_blocks; block++) { - const VpxTplBlockStats &ref_block_stats = - ref_frame_stats.block_stats_list[block]; - const VpxTplBlockStats &test_block_stats = - test_frame_stats.block_stats_list[block]; - ASSERT_EQ(ref_block_stats.inter_cost, test_block_stats.inter_cost); - ASSERT_EQ(ref_block_stats.intra_cost, test_block_stats.intra_cost); - ASSERT_EQ(ref_block_stats.mv_c, test_block_stats.mv_c); - ASSERT_EQ(ref_block_stats.mv_r, test_block_stats.mv_r); - ASSERT_EQ(ref_block_stats.recrf_dist, test_block_stats.recrf_dist); - ASSERT_EQ(ref_block_stats.recrf_rate, test_block_stats.recrf_rate); - ASSERT_EQ(ref_block_stats.ref_frame_index, - test_block_stats.ref_frame_index); - } - } - } - - void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) override { - ::libvpx_test::CxDataIterator iter = encoder->GetCxData(); - while (const vpx_codec_cx_pkt_t *pkt = iter.Next()) { - switch (pkt->kind) { - case VPX_CODEC_CX_FRAME_PKT: { - VpxTplGopStats tpl_stats; - EXPECT_EQ(AllocateTplList(&tpl_stats), VPX_CODEC_OK); - encoder->Control(VP9E_GET_TPL_STATS, &tpl_stats); - bool stats_not_all_zero = false; - for (int i = 0; i < tpl_stats.size; i++) { - VpxTplFrameStats *frame_stats_list = tpl_stats.frame_stats_list; - if (frame_stats_list[i].frame_width != 0) { - ASSERT_EQ(frame_stats_list[i].frame_width, width_); - ASSERT_EQ(frame_stats_list[i].frame_height, height_); - ASSERT_GT(frame_stats_list[i].num_blocks, 0); - ASSERT_NE(frame_stats_list[i].block_stats_list, nullptr); - stats_not_all_zero = true; - } - } - ASSERT_TRUE(stats_not_all_zero); - if (test_io_ && tpl_stats.size > 0) { - libvpx_test::TempOutFile *temp_out_file = - new (std::nothrow) libvpx_test::TempOutFile("w+"); - ASSERT_NE(temp_out_file, nullptr); - ASSERT_NE(temp_out_file->file(), nullptr); - vpx_write_tpl_gop_stats(temp_out_file->file(), &tpl_stats); - rewind(temp_out_file->file()); - VpxTplGopStats gop_stats_io; - ASSERT_EQ( - vpx_read_tpl_gop_stats(temp_out_file->file(), &gop_stats_io), - VPX_CODEC_OK); - CompareTplGopStats(gop_stats_io, tpl_stats); - vpx_free_tpl_gop_stats(&gop_stats_io); - delete temp_out_file; - } - free(tpl_stats.frame_stats_list); - break; - } - default: break; - } - } - } - - int width_; - int height_; - bool test_io_; -}; - -TEST_P(EncodeApiGetTplStatsTest, GetTplStats) { - cfg_.g_lag_in_frames = 25; - width_ = 352; - height_ = 288; - ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", width_, - height_, 30, 1, 0, 50); - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); -} - -TEST_P(EncodeApiGetTplStatsTest, GetTplStatsIO) { - cfg_.g_lag_in_frames = 25; - width_ = 352; - height_ = 288; - test_io_ = true; - ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", width_, - height_, 30, 1, 0, 50); - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); -} - -INSTANTIATE_TEST_SUITE_P( - VP9, EncodeApiGetTplStatsTest, - ::testing::Values( - static_cast(&libvpx_test::kVP9))); #endif // CONFIG_VP9_ENCODER } // namespace diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index 4899b1ed12..2e2c8176d4 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -1812,24 +1812,6 @@ static vpx_codec_err_t ctrl_get_svc_ref_frame_config(vpx_codec_alg_priv_t *ctx, return VPX_CODEC_OK; } -static vpx_codec_err_t ctrl_get_tpl_stats(vpx_codec_alg_priv_t *ctx, - va_list args) { - VP9_COMP *const cpi = ctx->cpi; - VpxTplGopStats *data = va_arg(args, VpxTplGopStats *); - VpxTplFrameStats *frame_stats_list = cpi->tpl_gop_stats.frame_stats_list; - int i; - if (data == NULL) { - return VPX_CODEC_INVALID_PARAM; - } - data->size = cpi->tpl_gop_stats.size; - - for (i = 0; i < data->size; i++) { - data->frame_stats_list[i] = frame_stats_list[i]; - } - - return VPX_CODEC_OK; -} - static vpx_codec_err_t ctrl_set_svc_ref_frame_config(vpx_codec_alg_priv_t *ctx, va_list args) { VP9_COMP *const cpi = ctx->cpi; @@ -2089,7 +2071,6 @@ static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = { { VP9E_GET_ACTIVEMAP, ctrl_get_active_map }, { VP9E_GET_LEVEL, ctrl_get_level }, { VP9E_GET_SVC_REF_FRAME_CONFIG, ctrl_get_svc_ref_frame_config }, - { VP9E_GET_TPL_STATS, ctrl_get_tpl_stats }, { -1, NULL }, }; diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h index d098c4c985..2875e185e6 100644 --- a/vpx/vp8cx.h +++ b/vpx/vp8cx.h @@ -768,18 +768,6 @@ enum vp8e_enc_control_id { * */ VP9E_SET_QUANTIZER_ONE_PASS, - - /*!\brief Codec control to get TPL stats for the current GOP. - * - * Allocation and free of memory of size MAX_ARF_GOP_SIZE (50) * - * sizeof(VpxTplFrameStats) should be done by applications. - * - * VPX_CODEC_INVALID_PARAM will be returned if the pointer passed in is NULL. - * - * Supported in codecs: VP9 - * - */ - VP9E_GET_TPL_STATS, }; /*!\brief vpx 1-D scaling mode @@ -1110,8 +1098,6 @@ VPX_CTRL_USE_TYPE(VP8E_SET_RTC_EXTERNAL_RATECTRL, int) #define VPX_CTRL_VP8E_SET_RTC_EXTERNAL_RATECTRL VPX_CTRL_USE_TYPE(VP9E_SET_QUANTIZER_ONE_PASS, int) #define VPX_CTRL_VP9E_SET_QUANTIZER_ONE_PASS -VPX_CTRL_USE_TYPE(VP9E_GET_TPL_STATS, void *) -#define VPX_CTRL_VP9E_GET_TPL_STATS /*!\endcond */ /*! @} - end defgroup vp8_encoder */